diff --git a/.circleci/build.sh b/.circleci/build.sh index 909a047987..f502e1223b 100755 --- a/.circleci/build.sh +++ b/.circleci/build.sh @@ -140,7 +140,7 @@ elif [[ "$CIRCLE_JOB" == "PYTORCH" ]]; then cd build elif [[ "$CIRCLE_JOB" == "OPENCL" ]]; then install_pocl - CMAKE_ARGS+=("-DGLOW_WITH_OPENCL=ON" "-DGLOW_OPENCL_ALIGN=128") + CMAKE_ARGS+=("-DGLOW_WITH_OPENCL=ON") else CMAKE_ARGS+=("-DCMAKE_BUILD_TYPE=Debug") if [[ "${CIRCLE_JOB}" == "SHARED" ]]; then diff --git a/lib/Backends/OpenCL/OpenCL.cpp b/lib/Backends/OpenCL/OpenCL.cpp index 63b3dbb695..63fbe6a9a3 100644 --- a/lib/Backends/OpenCL/OpenCL.cpp +++ b/lib/Backends/OpenCL/OpenCL.cpp @@ -109,10 +109,6 @@ static void addStringOption(std::vector &options, options.push_back("-D" + name + "=" + value); } -cl_mem glow::runtime::OpenCLDeviceBindings::getBuffer(glow::Value *v) { - return weightBuffers[v->getName()]; -} - OpenCLFunction::OpenCLFunction(std::unique_ptr F, runtime::RuntimeBundle &&bundle, TraceInfo traceInfo) @@ -145,15 +141,15 @@ static std::string getKernelName(const char *baseName, ElemKind elemTy) { std::string name = baseName; switch (elemTy) { case ElemKind::FloatTy: - return name + "K"; + return name + "W"; case ElemKind::Int8QTy: - return name + "_i8K"; + return name + "_i8W"; case ElemKind::Int32QTy: - return name + "_i32K"; + return name + "_i32W"; case ElemKind::Int64ITy: - return name + "_uK"; + return name + "_uW"; case ElemKind::BoolTy: - return name + "_bK"; + return name + "_bW"; default: LOG(FATAL) << "Unsupported data type: " << Type::getElementName(elemTy).str(); @@ -223,14 +219,14 @@ static void setKernelArg(cl_kernel kernel, unsigned argIdx, T value) { /// /// \returns the index of the last set OpenCL kernel argument. static size_t setKernelArgsForBuffers(cl_kernel kernel, const Instruction &I, - runtime::OpenCLDeviceBindings *clBindings, + size_t nextKernelArgIdx, runtime::RuntimeBundle &bundle) { // Number of instruction operands. auto numArgs = I.getNumOperands(); // The predicate of the instruction if available. Value *predicate = I.hasPredicate() ? I.getPredicate() : nullptr; // The index of the kernel argument to be set. - unsigned kernelArgIdx = 0; + unsigned kernelArgIdx = nextKernelArgIdx; // Go over all operands and pass buffer operands to the kernel. for (unsigned arg = 0; arg < numArgs; arg++) { auto *value = I.getOperand(arg).first; @@ -239,20 +235,12 @@ static size_t setKernelArgsForBuffers(cl_kernel kernel, const Instruction &I, if (value == predicate) continue; // The value is a buffer that should be passed as a kernel argument. - setKernelArg(kernel, kernelArgIdx, clBindings->getBuffer(value)); + setKernelArg(kernel, kernelArgIdx, bundle.getValueOffset(value)); kernelArgIdx++; } return kernelArgIdx - 1; } -static size_t setQuantizationParams(cl_kernel kernel, unsigned kernelArgIdx, - QuantizationTransform32To8 ¶ms) { - setKernelArg(kernel, kernelArgIdx++, params.pre); - setKernelArg(kernel, kernelArgIdx++, params.post); - setKernelArg(kernel, kernelArgIdx, params.scale); - return kernelArgIdx; -} - /// \returns the preferred (intra) vector width for the given OpenCL \p device, /// and the given \p elementType. static unsigned getPreferredVectorWidth(cl_device_id device, @@ -281,13 +269,14 @@ static unsigned getPreferredVectorWidth(cl_device_id device, return width; } -void OpenCLFunction::fillBuffer(cl_mem buffer, uint64_t len, float value, - ElemKind elemKind, +void OpenCLFunction::fillBuffer(cl_mem buffer, uint64_t start, uint64_t len, + float value, ElemKind elemKind, runtime::OpenCLDeviceBindings *devBindings) { auto kernel = createKernel(getKernelName("splat", elemKind), devBindings->program); setKernelArg(kernel, 0, buffer); - setKernelArg(kernel, 1, value); + setKernelArg(kernel, 1, start); + setKernelArg(kernel, 2, value); enqueueKernel("splat", devBindings->commandQueue, kernel, devBindings->deviceId, {(size_t)len}, devBindings->kernelLaunches); @@ -507,10 +496,11 @@ void OpenCLFunction::executeNCHWConvolution( auto kernelName = isQuantized ? "conv_forward_mem_i8" : "conv_forward_mem"; auto kernel = createKernel(kernelName, prog); - setKernelArg(kernel, 0, devBindings->getBuffer(input)); - setKernelArg(kernel, 1, devBindings->getBuffer(weights)); - setKernelArg(kernel, 2, devBindings->getBuffer(bias)); - setKernelArg(kernel, 3, devBindings->getBuffer(output)); + setKernelArg(kernel, 0, devBindings->deviceBuffer); + setKernelArg(kernel, 1, runtimeBundle_.getValueOffset(input)); + setKernelArg(kernel, 2, runtimeBundle_.getValueOffset(weights)); + setKernelArg(kernel, 3, runtimeBundle_.getValueOffset(bias)); + setKernelArg(kernel, 4, runtimeBundle_.getValueOffset(output)); // Extra options for quantized kernel if (isQuantized) { @@ -518,14 +508,14 @@ void OpenCLFunction::executeNCHWConvolution( auto outputTy = CC->getDest()->getType(); auto biasTy = CC->getBias()->getType(); auto weightsTy = CC->getFilter()->getType(); - setKernelArg(kernel, 4, weightsTy->getOffset()); - setKernelArg(kernel, 5, weightsTy->getScale()); - setKernelArg(kernel, 6, inputTy->getOffset()); - setKernelArg(kernel, 7, inputTy->getScale()); - setKernelArg(kernel, 8, outputTy->getOffset()); - setKernelArg(kernel, 9, outputTy->getScale()); - setKernelArg(kernel, 10, biasTy->getOffset()); - setKernelArg(kernel, 11, biasTy->getScale()); + setKernelArg(kernel, 5, weightsTy->getOffset()); + setKernelArg(kernel, 6, weightsTy->getScale()); + setKernelArg(kernel, 7, inputTy->getOffset()); + setKernelArg(kernel, 8, inputTy->getScale()); + setKernelArg(kernel, 9, outputTy->getOffset()); + setKernelArg(kernel, 10, outputTy->getScale()); + setKernelArg(kernel, 11, biasTy->getOffset()); + setKernelArg(kernel, 12, biasTy->getScale()); } // Compute proper parameters for global work and workgroups. @@ -609,7 +599,7 @@ Error OpenCLFunction::execute(ExecutionContext *context) { isa(I)) { continue; } - // The kernels are named after the name of the instruction, plus the "K" + // The kernels are named after the name of the instruction, plus the "W" // suffix to prevent name colissions for functions like 'tanh' that are also // a part of the OpenCL runtime. auto elemTy = I.getNumOperands() ? I.getOperand(0).first->getElementType() @@ -659,8 +649,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { } cl_kernel kernel = createKernel(kernelName, program); - auto numArgs = - setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); + setKernelArg(kernel, 0, deviceBuffer); + auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); auto numMandatoryArgs = numArgs; (void)numMandatoryArgs; @@ -674,7 +664,7 @@ Error OpenCLFunction::execute(ExecutionContext *context) { destTy->getOffset()}; float val = SI->getValue(); int8_t int8Val = quantization::quantize(val, destQ); - setKernelArg(kernel, ++numArgs, int8Val); + setKernelArg(kernel, ++numArgs, static_cast(int8Val)); } } @@ -693,8 +683,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { auto rhsScaleParams = quantization::quantizeScaleOffset32To8( RHSTy->getScale() / destScale, RHSTy->getOffset()); setKernelArg(kernel, ++numArgs, destOffset); - setKernelArg(kernel, ++numArgs, lhsScaleParams.offset); - setKernelArg(kernel, ++numArgs, rhsScaleParams.offset); + setKernelArg(kernel, ++numArgs, lhsScaleParams); + setKernelArg(kernel, ++numArgs, rhsScaleParams); if (isa(I) || isa(I)) { float resultScale = isa(I) @@ -702,14 +692,7 @@ Error OpenCLFunction::execute(ExecutionContext *context) { : LHSTy->getScale() / (RHSTy->getScale() * destScale); auto resultScaleParams = quantization::quantizeScaleOffset32To8(resultScale, 0); - - numArgs = - setQuantizationParams(kernel, numArgs + 1, resultScaleParams); - } else { - numArgs = - setQuantizationParams(kernel, numArgs + 1, lhsScaleParams); - numArgs = - setQuantizationParams(kernel, numArgs + 1, rhsScaleParams); + setKernelArg(kernel, ++numArgs, resultScaleParams); } } // Quantize floating point tensor. Scale and Offset are based on return @@ -734,7 +717,7 @@ Error OpenCLFunction::execute(ExecutionContext *context) { setKernelArg(kernel, ++numArgs, destType->getOffset()); setKernelArg(kernel, ++numArgs, srcType->getOffset()); - numArgs = setQuantizationParams(kernel, numArgs + 1, rescaleParams); + setKernelArg(kernel, ++numArgs, rescaleParams); } // Dequantize integer tensor. Scale and Offset are based // on the source tensor type. @@ -760,8 +743,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { // Implement Softmax by parallelizing the batch dimension. Each sample in // the batch is processed by a different parallel 'thread'. cl_kernel kernel = createKernel(kernelName, program); - auto numArgs = - setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); + setKernelArg(kernel, 0, deviceBuffer); + auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); // This is the number of elements for each slice. There are N slices in // our batch. @@ -780,8 +763,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { // Implement Softmax by parallelizing the batch dimension. Each sample in // the batch is processed by a different parallel 'thread'. cl_kernel kernel = createKernel(kernelName, program); - auto numArgs = - setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); + setKernelArg(kernel, 0, deviceBuffer); + auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); // This is the number of elements for each slice. There are N slices in // our batch. @@ -798,8 +781,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { if (auto *ET = dyn_cast(&I)) { cl_kernel kernel = createKernel(kernelName, program); - auto numArgs = - setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); + setKernelArg(kernel, 0, deviceBuffer); + auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); // Currently support tensors up to 4 dimensions. // TODO: Handle other dimensions. @@ -838,8 +821,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { if (auto *IT = dyn_cast(&I)) { cl_kernel kernel = createKernel(kernelName, program); - auto numArgs = - setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); + setKernelArg(kernel, 0, deviceBuffer); + auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); // Currently support tensors of up to 4 dimensions. // TODO: Handle other dimensions. @@ -892,8 +875,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { auto tiledKernelName = isQuantized ? "matmul_tiled_i8" : "matmul_tiled"; cl_kernel kernel = createKernel(useTiledMatMul ? tiledKernelName : kernelName, program); - auto numArgs = - setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); + setKernelArg(kernel, 0, deviceBuffer); + auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); auto ddim = ShapeNHWC::fromXY(BMM->getDest()->getType()->dims()); auto ldim = ShapeNHWC::fromXY(BMM->getLHS()->getType()->dims()); @@ -911,11 +894,7 @@ Error OpenCLFunction::execute(ExecutionContext *context) { setKernelArg(kernel, numArgs + 4, lhsTy->getOffset()); setKernelArg(kernel, numArgs + 5, rhsTy->getOffset()); setKernelArg(kernel, numArgs + 6, destTy->getOffset()); - if (useTiledMatMul) { - setKernelArg(kernel, numArgs + 7, destScaleParams); - } else { - setQuantizationParams(kernel, numArgs + 7, destScaleParams); - } + setKernelArg(kernel, numArgs + 7, destScaleParams); } if (useTiledMatMul) { @@ -939,8 +918,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { kernelName += "_32"; } cl_kernel kernel = createKernel(kernelName, program); - auto numArgs = - setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); + setKernelArg(kernel, 0, deviceBuffer); + auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); auto bdim = flattenCdr(BA->getBatch()->dims()); setKernelArg(kernel, numArgs + 1, bdim.first); @@ -993,8 +972,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { // Create kernel and set arguments. cl_kernel kernel = createKernel(kernelName, program); - auto numArgs = - setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); + setKernelArg(kernel, 0, deviceBuffer); + auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); setKernelArg(kernel, numArgs + 1, batchDims[axis]); setKernelArg(kernel, numArgs + 2, axisSrcSliceSize); @@ -1014,18 +993,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { // This is a naive implementation that parallelizes using three dims: // the X and the Y in the output filter. cl_kernel kernel = createKernel(kernelName, program); - unsigned numArgs = 0; - - // Okay, to be honest I have no idea why this is, but: if you don't pass - // in a void* for the first arg this specific kernel doesn't work. It's - // super weird and I've tried everything I can think of. Conceding defeat. - setKernelArg(kernel, numArgs++, nullptr); - - setKernelArg(kernel, numArgs++, clBindings->getBuffer(CC->getDest())); - setKernelArg(kernel, numArgs++, clBindings->getBuffer(CC->getSrc())); - setKernelArg(kernel, numArgs++, clBindings->getBuffer(CC->getFilter())); - setKernelArg(kernel, numArgs, clBindings->getBuffer(CC->getBias())); - + setKernelArg(kernel, 0, deviceBuffer); + auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); auto odim = ShapeNHWC(CC->getDest()->getType()->dims()); auto idim = ShapeNHWC(CC->getSrc()->getType()->dims()); auto pads = PaddingTLBR(CC->getPads()); @@ -1069,8 +1038,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { auto *filterGrad = CG->getFilterGrad(); auto *biasGrad = CG->getBiasGrad(); cl_kernel kernel = createKernel(kernelName, program); - auto numArgs = - setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); + setKernelArg(kernel, 0, deviceBuffer); + auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); auto destGradDim = ShapeNHWC(destGrad->dims()); auto srcDim = ShapeNHWC(src->dims()); @@ -1087,12 +1056,13 @@ Error OpenCLFunction::execute(ExecutionContext *context) { setKernelArg(kernel, numArgs + 7, destGradDim); setKernelArg(kernel, numArgs + 8, filterGradDim); // Zero memory for the output buffers. - fillBuffer(clBindings->getBuffer(srcGrad), srcGrad->size(), 0, - srcGrad->getElementType(), clBindings); - fillBuffer(clBindings->getBuffer(filterGrad), filterGrad->size(), 0, - filterGrad->getElementType(), clBindings); - fillBuffer(clBindings->getBuffer(biasGrad), biasGrad->size(), 0, - biasGrad->getElementType(), clBindings); + fillBuffer(deviceBuffer, runtimeBundle_.getValueOffset(srcGrad), + srcGrad->size(), 0, srcGrad->getElementType(), clBindings); + fillBuffer(deviceBuffer, runtimeBundle_.getValueOffset(filterGrad), + filterGrad->size(), 0, filterGrad->getElementType(), + clBindings); + fillBuffer(deviceBuffer, runtimeBundle_.getValueOffset(biasGrad), + biasGrad->size(), 0, biasGrad->getElementType(), clBindings); enqueueKernel(I.getName(), commands, kernel, deviceId, {destGradDim.h, destGradDim.w, destGradDim.c}, @@ -1108,8 +1078,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { } cl_kernel kernel = createKernel(kernelName, program); - auto numArgs = - setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); + setKernelArg(kernel, 0, deviceBuffer); + auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); ShapeHW kdim(PM->getKernels()); ShapeHW sdim(PM->getStrides()); @@ -1143,8 +1113,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { // This is a naive implementation that parallelizes using three dims: // the X and the Y in the output filter. cl_kernel kernel = createKernel(kernelName, program); - auto numArgs = - setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); + setKernelArg(kernel, 0, deviceBuffer); + auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); auto odim = ShapeNHWC(PM->getDest()->getType()->dims()); auto idim = ShapeNHWC(PM->getSrc()->getType()->dims()); @@ -1164,8 +1134,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { if (auto *PMG = dyn_cast(&I)) { cl_kernel kernel = createKernel(kernelName, program); - auto numArgs = - setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); + setKernelArg(kernel, 0, deviceBuffer); + auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); auto destGradDim = ShapeNHWC(PMG->getDestGrad()->dims()); auto srcGradDim = ShapeNHWC(PMG->getSrcGrad()->dims()); @@ -1191,8 +1161,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { } cl_kernel kernel = createKernel(kernelName, program); - auto numArgs = - setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); + setKernelArg(kernel, 0, deviceBuffer); + auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); ShapeHW kdim(PA->getKernels()); ShapeHW sdim(PA->getStrides()); @@ -1225,8 +1195,7 @@ Error OpenCLFunction::execute(ExecutionContext *context) { (PA->getKernels()[0] * PA->getKernels()[0]), destTy->getOffset()); setKernelArg(kernel, numArgs + 6, srcTy->getOffset()); - setKernelArg(kernel, numArgs + 7, destScaleParam.offset); - numArgs = setQuantizationParams(kernel, numArgs + 8, destScaleParam); + setKernelArg(kernel, numArgs + 7, destScaleParam); } enqueueKernel(I.getName(), commands, kernel, deviceId, global, @@ -1241,8 +1210,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { << "This code supports only 4 and lower dimensional transposes"; cl_kernel kernel = createKernel(kernelName, program); - auto numArgs = - setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); + setKernelArg(kernel, 0, deviceBuffer); + auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); // Temporary hack to support 3-dim transposes. // TODO: support any dimensional transposes. @@ -1276,14 +1245,13 @@ Error OpenCLFunction::execute(ExecutionContext *context) { if (src == dest) { continue; } - - cl_mem destBuf = clBindings->getBuffer(dest); - cl_mem srcBuf = clBindings->getBuffer(src); + size_t destOff = runtimeBundle_.getValueOffset(dest); + size_t srcOff = runtimeBundle_.getValueOffset(src); size_t sizeInBytes = dest->getSizeInBytes(); cl_event event{nullptr}; - cl_int err = - clEnqueueCopyBuffer(commands, srcBuf, destBuf, 0, 0, sizeInBytes, 0, - nullptr, kernelProfiling_ ? &event : nullptr); + cl_int err = clEnqueueCopyBuffer(commands, deviceBuffer, deviceBuffer, + srcOff, destOff, sizeInBytes, 0, nullptr, + kernelProfiling_ ? &event : nullptr); if (kernelProfiling_) { kernelLaunches.emplace_back(KernelLaunch(I.getName(), "copy", event)); } @@ -1293,8 +1261,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { if (auto *GI = dyn_cast(&I)) { cl_kernel kernel = createKernel(kernelName, program); - auto numArgs = - setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); + setKernelArg(kernel, 0, deviceBuffer); + auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); unsigned_t batchDims = GI->getBatchDims(); auto *data = GI->getData(); @@ -1326,8 +1294,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { if (auto *SDI = dyn_cast(&I)) { cl_kernel kernel = createKernel(kernelName, program); - auto numArgs = - setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); + setKernelArg(kernel, 0, deviceBuffer); + auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); auto *data = SDI->getData(); size_t dataSliceSize = data->size() / data->dims()[0]; @@ -1341,8 +1309,11 @@ Error OpenCLFunction::execute(ExecutionContext *context) { if (auto *SLWS = dyn_cast(&I)) { cl_kernel kernel = createKernel(kernelName, program); - auto numArgs = - setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); + // Set the device buffer as the first argument. + setKernelArg(kernel, 0, deviceBuffer); + // Set all buffer arguments from the instruction (data, dest, weights, + // indices, lengths) as subsequent arguments. + auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); // Set the size of one slice of data as the last argument. auto *data = SLWS->getData(); @@ -1352,8 +1323,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { // Zero the destination buffer so that the kernel can accumulate (+=) into // it. auto *dest = SLWS->getDest(); - fillBuffer(clBindings->getBuffer(dest), dest->size(), 0, - dest->getElementType(), clBindings); + fillBuffer(deviceBuffer, runtimeBundle_.getValueOffset(dest), + dest->size(), 0, dest->getElementType(), clBindings); // Get the number of segments. The output for each segment will be // computed in parallel by setting the global size equal to the number of @@ -1368,8 +1339,11 @@ Error OpenCLFunction::execute(ExecutionContext *context) { if (auto *SLWSG = dyn_cast(&I)) { cl_kernel kernel = createKernel(kernelName, program); - auto numArgs = - setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); + // Set the device buffer as the first argument. + setKernelArg(kernel, 0, deviceBuffer); + // Set all buffer arguments from the instruction (dataGrad, destGrad, + // weights, indices, lengths) as subsequent arguments. + auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); // Set the number of segments as the second last argument. auto *lengths = SLWSG->getLengths(); @@ -1384,8 +1358,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { // Zero the data gradient buffer so that the kernel can accumulate (+=) // into it. auto *dataGrad = SLWSG->getDataGrad(); - fillBuffer(clBindings->getBuffer(dataGrad), dataGrad->size(), 0, - dataGrad->getElementType(), clBindings); + fillBuffer(deviceBuffer, runtimeBundle_.getValueOffset(dataGrad), + dataGrad->size(), 0, dataGrad->getElementType(), clBindings); // Enqueue the kernel. Set the global size to 1 so that all segments are // processed sequentially to avoid two kernel instances accumulating into @@ -1543,11 +1517,7 @@ OCLBackend::compileIR(std::unique_ptr IR) const { auto *module = IR->getGraph()->getParent(); TraceInfo traceInfo; -#ifdef GLOW_OPENCL_ALIGN - MemoryAllocator allocator("GPU", 0xFFFFFFFF, GLOW_OPENCL_ALIGN); -#else MemoryAllocator allocator("GPU", 0xFFFFFFFF); -#endif runtime::RuntimeBundle bundle = runtime::RuntimeBundle::create(*IR, allocator); std::unique_ptr function = @@ -1568,11 +1538,7 @@ OCLBackend::compile(Function *F, const BackendOptions &opts) const { autoInstrument(traceInfo, IR.get()); } -#ifdef GLOW_OPENCL_ALIGN - MemoryAllocator allocator("GPU", 0xFFFFFFFF, GLOW_OPENCL_ALIGN); -#else MemoryAllocator allocator("GPU", 0xFFFFFFFF); -#endif runtime::RuntimeBundle bundle = runtime::RuntimeBundle::create(*IR, allocator); diff --git a/lib/Backends/OpenCL/OpenCL.h b/lib/Backends/OpenCL/OpenCL.h index 722aa0573c..6f1437f4ef 100644 --- a/lib/Backends/OpenCL/OpenCL.h +++ b/lib/Backends/OpenCL/OpenCL.h @@ -145,7 +145,8 @@ class OpenCLFunction final : public CompiledFunction { /// Fill the device \p buffer with a given \p value. /// \param len number of buffer elements to be filled by the \p value. /// Elements are considered to be of the type described by \p elemKind. - void fillBuffer(cl_mem buffer, uint64_t len, float value, ElemKind elemKind, + void fillBuffer(cl_mem buffer, uint64_t start, uint64_t len, float value, + ElemKind elemKind, runtime::OpenCLDeviceBindings *devBindings); /// Execution a convolution instruction which uses NCHW format. @@ -243,13 +244,10 @@ namespace runtime { /// device specific information used to run a compiled function on a specific /// device. struct OpenCLDeviceBindings : DeviceBindings { - OpenCLDeviceBindings( - cl_mem buffer, cl_command_queue commands, cl_device_id device, - cl_context ctx, cl_program prog, - const std::unordered_map &subBuffers) + OpenCLDeviceBindings(cl_mem buffer, cl_command_queue commands, + cl_device_id device, cl_context ctx, cl_program prog) : DeviceBindings(OCLBackend::getName()), deviceBuffer{buffer}, - commandQueue{commands}, deviceId{device}, context{ctx}, program{prog}, - weightBuffers(subBuffers) {} + commandQueue{commands}, deviceId{device}, context{ctx}, program{prog} {} /// CL memory buffer. Currently this contains both mutable and immutable /// weights, the buffer is allocated once when the network is added. @@ -273,12 +271,6 @@ struct OpenCLDeviceBindings : DeviceBindings { /// A list of kernels and their associated events. std::vector kernelLaunches; - - /// Buffers or subBuffers associated with symbols. - std::unordered_map weightBuffers; - - /// /returns the subBufffer assciated with a Value. - cl_mem getBuffer(glow::Value *v); }; } // namespace runtime } // namespace glow diff --git a/lib/Backends/OpenCL/OpenCLDeviceManager.cpp b/lib/Backends/OpenCL/OpenCLDeviceManager.cpp index 4d51453573..af49fda744 100644 --- a/lib/Backends/OpenCL/OpenCLDeviceManager.cpp +++ b/lib/Backends/OpenCL/OpenCLDeviceManager.cpp @@ -55,29 +55,7 @@ DeviceManager *createOCLDeviceManager(const DeviceConfig &config) { return new OpenCLDeviceManager(config); } -OpenCLBuffer::~OpenCLBuffer() { - for (auto buf : subBuffers_) { - clReleaseMemObject(buf.second); - } - subBuffers_.clear(); - - clReleaseMemObject(buffer_); -} - -/// Add a mapping from a Symbol name to an offset into buffer_; -bool OpenCLBuffer::addSubBuffer(std::string name, size_t offset, size_t size) { - cl_buffer_region region({offset, size}); - cl_int err; - auto buf = clCreateSubBuffer(buffer_, CL_MEM_READ_WRITE, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); - auto res = subBuffers_.emplace(name, buf); - if (!res.second) { - llvm::dbgs() << "OpenCLBuffer: failed to add subBuffer for symbol " << name - << "\n"; - return false; - } - return true; -} +OpenCLBuffer::~OpenCLBuffer() { clReleaseMemObject(buffer_); } } // namespace runtime } // namespace glow @@ -378,15 +356,6 @@ void OpenCLDeviceManager::addNetworkImpl(const Module *module, clFinish(commands); } usedMemoryBytes_ += sizeInBytes; - - // Add a sub-buffer for each symbol in the symbol table. OpenCL sub-buffers - // are essentially TensorViews in Glow. - for (auto &pair : bundle.getSymbolTable()) { - bool success = buffer->addSubBuffer(pair.first, pair.second.offset, - pair.second.size); - DCHECK(success); - } - // Compile the CL program. // Add to the function name lookup map. // Add shared pointer to the buffer to buffers. This way the buffer will @@ -407,7 +376,6 @@ void OpenCLDeviceManager::addNetworkImpl(const Module *module, programs_.emplace(func.first, program); functions_.emplace(func.first, func.second); buffers_.emplace(func.first, buffer); - buffer->incrementUsers(); DCHECK_LE(usedMemoryBytes_, maxMemoryBytes_); @@ -698,7 +666,7 @@ void OpenCLDeviceManager::runFunctionImpl( auto program = programs_[function]; auto clBindings = glow::make_unique( buffers_[function]->getBuffer(), queue.backingQueue, deviceId_, context_, - program, buffers_[function]->getSubBuffers()); + program); // Copy inputs to the device. copyInputsToDevice(func->getRuntimeBundle(), context.get(), clBindings.get()); diff --git a/lib/Backends/OpenCL/OpenCLDeviceManager.h b/lib/Backends/OpenCL/OpenCLDeviceManager.h index a0a585c0ca..a7f4b84feb 100644 --- a/lib/Backends/OpenCL/OpenCLDeviceManager.h +++ b/lib/Backends/OpenCL/OpenCLDeviceManager.h @@ -98,9 +98,6 @@ class OpenCLBuffer { /// The OpenCL buffer being stored. cl_mem buffer_; - /// Subbuffers for symbols. - std::unordered_map subBuffers_; - /// Count of functions using this buffer. unsigned int users_{0}; @@ -123,14 +120,6 @@ class OpenCLBuffer { /// Get size of buffer in bytes. size_t getSize() { return size_; } - - /// Return the mapping from Symbol name to subBuffer for this Buffer. - const std::unordered_map &getSubBuffers() { - return subBuffers_; - } - - /// Add a mapping from a Symbol name to an offset into buffer_; - bool addSubBuffer(std::string name, size_t offset, size_t size); }; /// A class controlling a single OpenCL device. Many OpenCLFunctions may be diff --git a/lib/Backends/OpenCL/kernels.cl b/lib/Backends/OpenCL/kernels.cl index 420a0e61b1..ac5a0d8496 100644 --- a/lib/Backends/OpenCL/kernels.cl +++ b/lib/Backends/OpenCL/kernels.cl @@ -120,13 +120,6 @@ cl_int32_t scale_i32i8(cl_int32_t input, cl_int32_t pre, cl_int32_t post, return ((((input >> pre) * scale) + rtn) >> post) + offset; } -/// Scales a 32-bit integer using the integer shift-mult-shift method. - -cl_int32_t scale_i32i8p(cl_int32_t input, QuantizationTransform32To8 params, - cl_int32_t offset) { - return scale_i32i8(input, params.pre, params.post, params.scale, offset); -} - /// Clips int32_t into int8_t. cl_int8_t clip(cl_int32_t val) { return (cl_int8_t)min(max(val, -128), 127); } @@ -159,6 +152,16 @@ __kernel void quantize_i32K(__global cl_int32_t *dest, __global float *src, dest[i] = quantize_i32(src[i], scale, offset); } +__kernel void quantize_i8W(__global void *mem, cl_uint32_t dest, + cl_uint32_t src, float scale, cl_int32_t offset) { + quantize_i8K(&mem[dest], &mem[src], scale, offset); +} + +__kernel void quantize_i32W(__global void *mem, cl_uint32_t dest, + cl_uint32_t src, float scale, cl_int32_t offset) { + quantize_i32K(&mem[dest], &mem[src], scale, offset); +} + __kernel void rescalequantized_i8K(__global cl_int8_t *dest, __global cl_int8_t *src, cl_int32_t destOffset, cl_int32_t srcOffset, @@ -171,12 +174,26 @@ __kernel void rescalequantized_i8K(__global cl_int8_t *dest, dest[i] = clip(s); } +__kernel void rescalequantized_i8W(__global void *mem, cl_uint32_t dest, + cl_uint32_t src, cl_int32_t destOffset, + cl_int32_t srcOffset, + QuantizationTransform32To8 rescaleParams) { + rescalequantized_i8K(&mem[dest], &mem[src], destOffset, srcOffset, + rescaleParams.pre, rescaleParams.post, + rescaleParams.scale); +} + __kernel void dequantizeK(__global float *dest, __global cl_int8_t *src, float scale, cl_int32_t offset) { size_t i = get_global_id(0); dest[i] = dequantize(src[i], scale, offset); } +__kernel void dequantizeW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, + float scale, cl_int32_t offset) { + dequantizeK(&mem[dest], &mem[src], scale, offset); +} + /// Macro to define a kernel for data-parallel ternay operations. The body of /// the kernel is auto-generated by the macro. /// Defines vectorized kernels for vector sizes 1, 8 and 16. @@ -211,6 +228,11 @@ __kernel void dequantizeK(__global float *dest, __global cl_int8_t *src, vstore8(VAL, i * 2 + 1, dest); \ } \ } \ + __kernel void name##W##16(__global void *mem, cl_uint32_t dest, \ + cl_uint32_t cond, cl_uint32_t lhs, \ + cl_uint32_t rhs) { \ + name##K##16(&mem[dest], &mem[cond], &mem[lhs], &mem[rhs]); \ + } \ __kernel void name##K##8(__global type * dest, __global type * cond, \ __global type * lhs, __global type * rhs) { \ typedef float8 vtype; \ @@ -221,6 +243,11 @@ __kernel void dequantizeK(__global float *dest, __global cl_int8_t *src, vtype VAL = body; \ vstore8(VAL, i, dest); \ } \ + __kernel void name##W##8(__global void *mem, cl_uint32_t dest, \ + cl_uint32_t cond, cl_uint32_t lhs, \ + cl_uint32_t rhs) { \ + name##K##8(&mem[dest], &mem[cond], &mem[lhs], &mem[rhs]); \ + } \ __kernel void name##K(__global type *dest, __global type *cond, \ __global type *lhs, __global type *rhs) { \ typedef float vtype; \ @@ -229,6 +256,10 @@ __kernel void dequantizeK(__global float *dest, __global cl_int8_t *src, vtype RHS = rhs[i]; \ vtype LHS = lhs[i]; \ dest[i] = body; \ + } \ + __kernel void name##W(__global void *mem, cl_uint32_t dest, \ + cl_uint32_t cond, cl_uint32_t lhs, cl_uint32_t rhs) { \ + name##K(&mem[dest], &mem[cond], &mem[lhs], &mem[rhs]); \ } /// Macro to define a kernel for data-parallel binary operations. The body of @@ -263,6 +294,10 @@ __kernel void dequantizeK(__global float *dest, __global cl_int8_t *src, vstore8(VAL, i * 2 + 1, dest); \ } \ } \ + __kernel void name##W##16(__global void *mem, cl_uint32_t dest, \ + cl_uint32_t lhs, cl_uint32_t rhs) { \ + name##K##16(&mem[dest], &mem[lhs], &mem[rhs]); \ + } \ __kernel void name##K##8(__global type * dest, __global type * lhs, \ __global type * rhs) { \ typedef float8 vtype; \ @@ -272,6 +307,10 @@ __kernel void dequantizeK(__global float *dest, __global cl_int8_t *src, vtype VAL = body; \ vstore8(VAL, i, dest); \ } \ + __kernel void name##W##8(__global void *mem, cl_uint32_t dest, \ + cl_uint32_t lhs, cl_uint32_t rhs) { \ + name##K##8(&mem[dest], &mem[lhs], &mem[rhs]); \ + } \ __kernel void name##K(__global type *dest, __global type *lhs, \ __global type *rhs) { \ typedef float vtype; \ @@ -279,6 +318,10 @@ __kernel void dequantizeK(__global float *dest, __global cl_int8_t *src, vtype RHS = rhs[i]; \ vtype LHS = lhs[i]; \ dest[i] = body; \ + } \ + __kernel void name##W(__global void *mem, cl_uint32_t dest, cl_uint32_t lhs, \ + cl_uint32_t rhs) { \ + name##K(&mem[dest], &mem[lhs], &mem[rhs]); \ } /// Macro to define a kernel for data-parallel binary quantized operations. The @@ -300,6 +343,15 @@ __kernel void dequantizeK(__global float *dest, __global cl_int8_t *src, cl_int32_t RHS = \ scale_i32i8(rhs[i] - rhsOffset, rhsPre, rhsPost, rhsScale, 0); \ dest[i] = clip((body) + destOffset); \ + } \ + __kernel void name##_i8W( \ + __global void *mem, cl_uint32_t dest, cl_uint32_t lhs, cl_uint32_t rhs, \ + cl_int32_t destOffset, QuantizationTransform32To8 lhsScaleParams, \ + QuantizationTransform32To8 rhsScaleParams) { \ + name##_i8K(&mem[dest], &mem[lhs], &mem[rhs], destOffset, \ + lhsScaleParams.offset, rhsScaleParams.offset, \ + lhsScaleParams.pre, lhsScaleParams.post, lhsScaleParams.scale, \ + rhsScaleParams.pre, rhsScaleParams.post, rhsScaleParams.scale); \ } /// Macro to define a mini-kernel for data-parallel multiplicative quantized @@ -318,6 +370,16 @@ __kernel void dequantizeK(__global float *dest, __global cl_int8_t *src, cl_int32_t LHS = lhs[i] - lhsOffset; \ cl_int32_t RHS = rhs[i] - rhsOffset; \ dest[i] = clip(scale_i32i8((body), pre, post, scale, destOffset)); \ + } \ + __kernel void name##_i8W( \ + __global void *mem, cl_uint32_t dest, cl_uint32_t lhs, cl_uint32_t rhs, \ + cl_int32_t destOffset, QuantizationTransform32To8 lhsScaleParams, \ + QuantizationTransform32To8 rhsScaleParams, \ + QuantizationTransform32To8 resultScaleParams) { \ + name##_i8K(&mem[dest], &mem[lhs], &mem[rhs], destOffset, \ + lhsScaleParams.offset, rhsScaleParams.offset, \ + resultScaleParams.pre, resultScaleParams.post, \ + resultScaleParams.scale); \ } /// Macro to define a kernel for data-parallel unary operations. The body of @@ -349,6 +411,10 @@ __kernel void dequantizeK(__global float *dest, __global cl_int8_t *src, vstore8(VAL, i * 2 + 1, dest); \ } \ } \ + __kernel void name##W##16(__global void *mem, cl_uint32_t dest, \ + cl_uint32_t src) { \ + name##K##16(&mem[dest], &mem[src]); \ + } \ __kernel void name##K##8(__global type * dest, __global type * src) { \ typedef float8 vtype; \ size_t i = get_global_id(0); \ @@ -356,11 +422,19 @@ __kernel void dequantizeK(__global float *dest, __global cl_int8_t *src, vtype VAL = body; \ vstore8(VAL, i, dest); \ } \ + __kernel void name##W##8(__global void *mem, cl_uint32_t dest, \ + cl_uint32_t src) { \ + name##K##8(&mem[dest], &mem[src]); \ + } \ __kernel void name##K(__global type *dest, __global type *src) { \ typedef float vtype; \ size_t i = get_global_id(0); \ vtype SRC = src[i]; \ dest[i] = body; \ + } \ + __kernel void name##W(__global void *mem, cl_uint32_t dest, \ + cl_uint32_t src) { \ + name##K(&mem[dest], &mem[src]); \ } /// Macro to define a kernel for data-parallel unary operations with an @@ -393,6 +467,9 @@ __kernel void dequantizeK(__global float *dest, __global cl_int8_t *src, vstore8(VAL, i * 2 + 1, dest); \ } \ } \ + __kernel void name##W##16(__global void *mem, cl_uint32_t dest, float val) { \ + name##K##16(&mem[dest], (type)val); \ + } \ __kernel void name##K##8(__global type * dest, type val) { \ typedef type##8 vtype; \ size_t i = get_global_id(0); \ @@ -400,11 +477,17 @@ __kernel void dequantizeK(__global float *dest, __global cl_int8_t *src, vtype VAL = body; \ vstore8(VAL, i, dest); \ } \ + __kernel void name##W##8(__global void *mem, cl_uint32_t dest, float val) { \ + name##K##8(&mem[dest], (type)val); \ + } \ __kernel void name##K(__global type *dest, type val) { \ typedef type vtype; \ size_t i = get_global_id(0); \ vtype SRC = (vtype)val; \ dest[i] = body; \ + } \ + __kernel void name##W(__global void *mem, cl_uint32_t dest, float val) { \ + name##K(&mem[dest], (type)val); \ } DEFINE_OPENCL_BINARY_DATA_PARALLEL_KERNEL(elementadd, float, LHS + RHS) @@ -456,6 +539,12 @@ __kernel void elementselectK16(__global float *dest, __global cl_int8_t *cond, i + 1, dest); } +__kernel void elementselectW16(__global void *mem, cl_uint32_t dest, + cl_uint32_t cond, cl_uint32_t lhs, + cl_uint32_t rhs) { + elementselectK16(&mem[dest], &mem[cond], &mem[lhs], &mem[rhs]); +} + __kernel void elementselectK8(__global float *dest, __global cl_int8_t *cond, __global float *lhs, __global float *rhs) { size_t i = get_global_id(0); @@ -466,6 +555,12 @@ __kernel void elementselectK8(__global float *dest, __global cl_int8_t *cond, i, dest); } +__kernel void elementselectW8(__global void *mem, cl_uint32_t dest, + cl_uint32_t cond, cl_uint32_t lhs, + cl_uint32_t rhs) { + elementselectK8(&mem[dest], &mem[cond], &mem[lhs], &mem[rhs]); +} + __kernel void elementselectK(__global float *dest, __global cl_int8_t *cond, __global float *lhs, __global float *rhs) { size_t i = get_global_id(0); @@ -475,6 +570,12 @@ __kernel void elementselectK(__global float *dest, __global cl_int8_t *cond, dest[i] = (c != 0) ? LHS : RHS; } +__kernel void elementselectW(__global void *mem, cl_uint32_t dest, + cl_uint32_t cond, cl_uint32_t lhs, + cl_uint32_t rhs) { + elementselectK(&mem[dest], &mem[cond], &mem[lhs], &mem[rhs]); +} + __kernel void elementcmplteK16(__global cl_int8_t *dest, __global float *LHS, __global float *RHS) { // This kernel uses 8-element vector primitives on two contiguous 8-element @@ -490,18 +591,33 @@ __kernel void elementcmplteK16(__global cl_int8_t *dest, __global float *LHS, i + 1, dest); } +__kernel void elementcmplteW16(__global void *mem, cl_uint32_t dest, + cl_uint32_t LHS, cl_uint32_t RHS) { + elementcmplteK16(&mem[dest], &mem[LHS], &mem[RHS]); +} + __kernel void elementcmplteK8(__global cl_int8_t *dest, __global float *LHS, __global float *RHS) { size_t i = get_global_id(0); vstore8(convert_char8(islessequal(vload8(i, LHS), vload8(i, RHS))), i, dest); } +__kernel void elementcmplteW8(__global void *mem, cl_uint32_t dest, + cl_uint32_t LHS, cl_uint32_t RHS) { + elementcmplteK8(&mem[dest], &mem[LHS], &mem[RHS]); +} + __kernel void elementcmplteK(__global cl_int8_t *dest, __global float *LHS, __global float *RHS) { size_t i = get_global_id(0); dest[i] = LHS[i] <= RHS[i]; } +__kernel void elementcmplteW(__global void *mem, cl_uint32_t dest, + cl_uint32_t LHS, cl_uint32_t RHS) { + elementcmplteK(&mem[dest], &mem[LHS], &mem[RHS]); +} + __kernel void oclbatchedreduceaddK(__global float *dest, __global float *batch, __global cl_int32_t *destSliceSizes, __global cl_int32_t *batchSliceSizes, @@ -543,6 +659,14 @@ __kernel void oclbatchedreduceaddK(__global float *dest, __global float *batch, } } +__kernel void +oclbatchedreduceaddW(__global void *mem, cl_uint32_t dest, cl_uint32_t batch, + cl_uint32_t destSliceSizes, cl_uint32_t batchSliceSizes, + cl_uint32_t numSlices, cl_uint32_t axisSliceSize) { + oclbatchedreduceaddK(&mem[dest], &mem[batch], &mem[destSliceSizes], + &mem[batchSliceSizes], numSlices, axisSliceSize); +} + __kernel void batchedaddK(__global float *dest, __global float *batch, __global float *slice, cl_uint32_t numSlice, cl_uint32_t sliceSize) { @@ -552,37 +676,70 @@ __kernel void batchedaddK(__global float *dest, __global float *batch, } } +__kernel void batchedaddW(__global void *mem, cl_uint32_t dest, + cl_uint32_t batch, cl_uint32_t slice, + cl_uint32_t numSlice, cl_uint32_t sliceSize) { + batchedaddK(&mem[dest], &mem[batch], &mem[slice], numSlice, sliceSize); +} + __kernel void batchedadd_i8K(__global cl_int8_t *dest, __global cl_int8_t *batch, __global cl_int8_t *slice, cl_uint32_t numSlice, cl_uint32_t sliceSize, cl_int32_t destOffset, + cl_int32_t batchOffset, cl_int32_t sliceOffset, + cl_int32_t batchPre, cl_int32_t batchPost, + cl_int32_t batchScale, cl_int32_t slicePre, + cl_int32_t slicePost, cl_int32_t sliceScale) { + size_t s = get_global_id(0); + for (size_t n = 0; n < numSlice; n++) { + cl_int32_t batchVal = batch[n * sliceSize + s] - batchOffset; + cl_int32_t sliceVal = slice[s] - sliceOffset; + cl_int32_t x = scale_i32i8(batchVal, batchPre, batchPost, batchScale, 0); + cl_int32_t y = scale_i32i8(sliceVal, slicePre, slicePost, sliceScale, 0); + dest[n * sliceSize + s] = clip(x + y + destOffset); + } +} + +__kernel void batchedadd_i8W(__global void *mem, cl_uint32_t dest, + cl_uint32_t batch, cl_uint32_t slice, + cl_uint32_t numSlice, cl_uint32_t sliceSize, + cl_int32_t destOffset, QuantizationTransform32To8 batchScaleParams, QuantizationTransform32To8 sliceScaleParams) { + batchedadd_i8K(&mem[dest], &mem[batch], &mem[slice], numSlice, sliceSize, + destOffset, batchScaleParams.offset, sliceScaleParams.offset, + batchScaleParams.pre, batchScaleParams.post, + batchScaleParams.scale, sliceScaleParams.pre, + sliceScaleParams.post, sliceScaleParams.scale); +} + +__kernel void batchedadd_i8K_32( + __global cl_int8_t *dest, __global cl_int8_t *batch, + __global cl_int32_t *slice, cl_uint32_t numSlice, cl_uint32_t sliceSize, + cl_int32_t destOffset, cl_int32_t batchOffset, cl_int32_t sliceOffset, + cl_int32_t batchPre, cl_int32_t batchPost, cl_int32_t batchScale, + cl_int32_t slicePre, cl_int32_t slicePost, cl_int32_t sliceScale) { size_t s = get_global_id(0); for (size_t n = 0; n < numSlice; n++) { - cl_int32_t batchVal = batch[n * sliceSize + s] - batchScaleParams.offset; - cl_int32_t sliceVal = slice[s] - sliceScaleParams.offset; - cl_int32_t x = scale_i32i8p(batchVal, batchScaleParams, 0); - cl_int32_t y = scale_i32i8p(sliceVal, sliceScaleParams, 0); + cl_int32_t batchVal = batch[n * sliceSize + s] - batchOffset; + cl_int32_t sliceVal = slice[s] - sliceOffset; + cl_int32_t x = scale_i32i8(batchVal, batchPre, batchPost, batchScale, 0); + cl_int32_t y = scale_i32i8(sliceVal, slicePre, slicePost, sliceScale, 0); dest[n * sliceSize + s] = clip(x + y + destOffset); } } -__kernel void batchedadd_i8K_32(__global cl_int8_t *dest, - __global cl_int8_t *batch, - __global cl_int32_t *slice, +__kernel void batchedadd_i8W_32(__global void *mem, cl_uint32_t dest, + cl_uint32_t batch, cl_uint32_t slice, cl_uint32_t numSlice, cl_uint32_t sliceSize, cl_int32_t destOffset, QuantizationTransform32To8 batchScaleParams, QuantizationTransform32To8 sliceScaleParams) { - size_t s = get_global_id(0); - for (size_t n = 0; n < numSlice; n++) { - cl_int32_t batchVal = batch[n * sliceSize + s] - batchScaleParams.offset; - cl_int32_t sliceVal = slice[s] - sliceScaleParams.offset; - cl_int32_t x = scale_i32i8p(batchVal, batchScaleParams, 0); - cl_int32_t y = scale_i32i8p(sliceVal, sliceScaleParams, 0); - dest[n * sliceSize + s] = clip(x + y + destOffset); - } + batchedadd_i8K_32( + &mem[dest], &mem[batch], &mem[slice], numSlice, sliceSize, destOffset, + batchScaleParams.offset, sliceScaleParams.offset, batchScaleParams.pre, + batchScaleParams.post, batchScaleParams.scale, sliceScaleParams.pre, + sliceScaleParams.post, sliceScaleParams.scale); } /// Size of the tile to be used for matrix multiplication. @@ -590,9 +747,13 @@ __kernel void batchedadd_i8K_32(__global cl_int8_t *dest, /// workgroups with sizes which are at least as big as a tile. #define TILE_SIZE 8 -__kernel void matmul_tiled(__global float *C, __global float *A, - __global float *B, ShapeNHWC ddim, ShapeNHWC ldim, - ShapeNHWC rdim) { +__kernel void matmul_tiled(__global void *mem, cl_uint32_t C_off, + cl_uint32_t A_off, cl_uint32_t B_off, ShapeNHWC ddim, + ShapeNHWC ldim, ShapeNHWC rdim) { + __global float *C = &mem[C_off]; + __global float *A = &mem[A_off]; + __global float *B = &mem[B_off]; + int M = ldim.n; int N = rdim.h; int K = ldim.h; @@ -636,12 +797,16 @@ __kernel void matmul_tiled(__global float *C, __global float *A, } } -__kernel void matmul_tiled_i8(__global cl_int8_t *C, __global cl_int8_t *A, - __global cl_int8_t *B, ShapeNHWC ddim, - ShapeNHWC ldim, ShapeNHWC rdim, +__kernel void matmul_tiled_i8(__global void *mem, cl_uint32_t C_off, + cl_uint32_t A_off, cl_uint32_t B_off, + ShapeNHWC ddim, ShapeNHWC ldim, ShapeNHWC rdim, cl_int32_t aOffset, cl_int32_t bOffset, cl_int32_t cOffset, QuantizationTransform32To8 destScaleParams) { + __global cl_int8_t *C = &mem[C_off]; + __global cl_int8_t *A = &mem[A_off]; + __global cl_int8_t *B = &mem[B_off]; + int M = ldim.n; int N = rdim.h; int K = ldim.h; @@ -705,6 +870,12 @@ __kernel void matmulK(__global float *dest, __global float *lhs, dest[getNHWC(ddim, x, y, 0, 0)] = sum; } +__kernel void matmulW(__global void *mem, cl_uint32_t dest, cl_uint32_t lhs, + cl_uint32_t rhs, ShapeNHWC ddim, ShapeNHWC ldim, + ShapeNHWC rdim) { + matmulK(&mem[dest], &mem[lhs], &mem[rhs], ddim, ldim, rdim); +} + __kernel void matmul_i8K(__global cl_int8_t *dest, __global cl_int8_t *lhs, __global cl_int8_t *rhs, ShapeNHWC ddim, ShapeNHWC ldim, ShapeNHWC rdim, cl_int32_t lhsOffset, @@ -727,8 +898,18 @@ __kernel void matmul_i8K(__global cl_int8_t *dest, __global cl_int8_t *lhs, clip(scale_i32i8(sum, destPre, destPost, destScale, destOffset)); } +__kernel void matmul_i8W(__global void *mem, cl_uint32_t dest, cl_uint32_t lhs, + cl_uint32_t rhs, ShapeNHWC ddim, ShapeNHWC ldim, + ShapeNHWC rdim, cl_int32_t lhsOffset, + cl_int32_t rhsOffset, cl_int32_t destOffset, + QuantizationTransform32To8 destScaleParams) { + matmul_i8K(&mem[dest], &mem[lhs], &mem[rhs], ddim, ldim, rdim, lhsOffset, + rhsOffset, destOffset, destScaleParams.pre, destScaleParams.post, + destScaleParams.scale); +} + __kernel void softmaxK(__global float *dest, __global float *src, - cl_uint32_t sliceSize) { + __global float *e_cache, cl_uint32_t sliceSize) { size_t i = get_global_id(0); float max_ = src[i * sliceSize]; for (size_t j = 0; j < sliceSize; j++) { @@ -742,11 +923,18 @@ __kernel void softmaxK(__global float *dest, __global float *src, } for (size_t j = 0; j < sliceSize; j++) { dest[i * sliceSize + j] /= sum; + if (e_cache) + e_cache[i * sliceSize + j] = dest[i * sliceSize + j]; } } -__kernel void softmaxgradK(__global float *outW, __global float *origSrc, - __global cl_uint64_t *selectedW, __global float *inG, +__kernel void softmaxW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, + cl_uint32_t sliceSize) { + softmaxK(&mem[dest], &mem[src], (__global float *)0, sliceSize); +} + +__kernel void softmaxgradK(__global float *inG, __global float *outW, + __global cl_uint64_t *selectedW, cl_uint32_t sliceSize) { size_t i = get_global_id(0); for (size_t j = 0; j < sliceSize; j++) { @@ -755,10 +943,16 @@ __kernel void softmaxgradK(__global float *outW, __global float *origSrc, } } -__kernel void convolutionK(__global void *unused, __global float *dest, - __global float *src, __global float *filter, - __global float *bias, ShapeHW kernelSizes, - ShapeHW strides, PaddingTLBR pads, cl_uint32_t group, +__kernel void softmaxgradW(__global void *mem, cl_uint32_t origDest, + cl_uint32_t origSrc, cl_uint32_t selected, + cl_uint32_t srcGrad, cl_uint32_t sliceSize) { + softmaxgradK(&mem[srcGrad], &mem[origDest], &mem[selected], sliceSize); +} + +__kernel void convolutionK(__global float *dest, __global float *src, + __global float *filter, __global float *bias, + ShapeHW kernelSizes, ShapeHW strides, + PaddingTLBR pads, cl_uint32_t group, cl_uint32_t dilation, ShapeNHWC odim, ShapeNHWC idim, ShapeNHWC filterDim) { size_t ax = get_global_id(0); @@ -802,6 +996,16 @@ __kernel void convolutionK(__global void *unused, __global float *dest, } // N } +__kernel void convolutionW(__global void *mem, cl_uint32_t dest, + cl_uint32_t src, cl_uint32_t filter, + cl_uint32_t bias, ShapeHW kernelSizes, + ShapeHW strides, PaddingTLBR pads, cl_uint32_t group, + cl_uint32_t dilation, ShapeNHWC odim, ShapeNHWC idim, + ShapeNHWC filterDim) { + convolutionK(&mem[dest], &mem[src], &mem[filter], &mem[bias], kernelSizes, + strides, pads, group, dilation, odim, idim, filterDim); +} + __kernel void convolution_i8K(__global cl_int8_t *dest, __global cl_int8_t *src, __global cl_int8_t *filter, __global cl_int32_t *bias, @@ -856,6 +1060,20 @@ convolution_i8K(__global cl_int8_t *dest, __global cl_int8_t *src, } } +__kernel void +convolution_i8W(__global void *mem, cl_uint32_t dest, cl_uint32_t src, + cl_uint32_t filter, cl_uint32_t bias, ShapeHW kernelSizes, + ShapeHW strides, PaddingTLBR pads, cl_uint32_t group, + cl_uint32_t dilation, ShapeNHWC odim, ShapeNHWC idim, + ShapeNHWC filterDim, cl_int32_t destOffset, float destScale, + cl_int32_t srcOffset, float srcScale, cl_int32_t filterOffset, + float filterScale, cl_int32_t biasOffset, float biasScale) { + convolution_i8K(&mem[dest], &mem[src], &mem[filter], &mem[bias], kernelSizes, + strides, destOffset, destScale, srcOffset, srcScale, + filterOffset, filterScale, biasOffset, biasScale, pads, group, + dilation, odim, idim, filterDim); +} + __kernel void convolutiongradK(const __global float *inW, const __global float *filterW, const __global float *outG, __global float *inG, @@ -909,6 +1127,19 @@ __kernel void convolutiongradK(const __global float *inW, } // N } +__kernel void convolutiongradW(__global void *mem, cl_uint32_t src, + cl_uint32_t filter, cl_uint32_t destGrad, + cl_uint32_t srcGrad, cl_uint32_t filterGrad, + cl_uint32_t biasGrad, ShapeHW kernelSizes, + ShapeHW strides, PaddingTLBR pads, + cl_uint32_t group, cl_uint32_t dilation, + ShapeNHWC srcDim, ShapeNHWC destGradDim, + ShapeNHWC filterGradDim) { + convolutiongradK(&mem[src], &mem[filter], &mem[destGrad], &mem[srcGrad], + &mem[filterGrad], &mem[biasGrad], kernelSizes, strides, pads, + group, dilation, srcDim, destGradDim, filterGradDim); +} + __kernel void maxpoolK(__global float *dest, __global float *src, cl_uint32_t kernelSize, cl_uint32_t stride, PaddingTLBR pads, ShapeNHWC odim, ShapeNHWC idim) { @@ -950,6 +1181,12 @@ __kernel void maxpoolK(__global float *dest, __global float *src, } // N } +__kernel void maxpoolW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, + cl_uint32_t kernelSize, cl_uint32_t stride, + PaddingTLBR pads, ShapeNHWC odim, ShapeNHWC idim) { + maxpoolK(&mem[dest], &mem[src], kernelSize, stride, pads, odim, idim); +} + /// Macro to define a kernel for oclmaxpool. The body of /// the kernel is auto-generated by the macro. /// \p name the name of this kernel @@ -988,6 +1225,11 @@ __kernel void maxpoolK(__global float *dest, __global float *src, } \ dest[getNCHW(odim, n, d, ax, ay)] = maxVal; \ } \ + } \ + __kernel void name##W(__global void *mem, cl_uint32_t dest, cl_uint32_t src, \ + cl_uint32_t kernelSize, cl_uint32_t stride, \ + PaddingTLBR pads, ShapeNCHW odim, ShapeNCHW idim) { \ + name##K(&mem[dest], &mem[src], kernelSize, stride, pads, odim, idim); \ } DEFINE_OPENCL_MAXPOOL_KERNEL(oclmaxpool, float) DEFINE_OPENCL_MAXPOOL_KERNEL(oclmaxpool_i8, char) @@ -1041,7 +1283,16 @@ __kernel void maxpoolwithargmaxK(__global float *dest, __global float *src, } // N } -__kernel void maxpoolwithargmaxgradK(__global float *dest, __global float *src, +__kernel void maxpoolwithargmaxW(__global void *mem, cl_uint32_t dest, + cl_uint32_t src, cl_uint32_t argmax, + cl_uint32_t kernelSize, cl_uint32_t stride, + PaddingTLBR pads, ShapeNHWC odim, + ShapeNHWC idim) { + maxpoolwithargmaxK(&mem[dest], &mem[src], &mem[argmax], kernelSize, stride, + pads, odim, idim); +} + +__kernel void maxpoolwithargmaxgradK(__global float *dest, __global cl_uint64_t *argmax, __global float *destGrad, __global float *srcGrad, @@ -1070,6 +1321,19 @@ __kernel void maxpoolwithargmaxgradK(__global float *dest, __global float *src, } // C } +__kernel void maxpoolwithargmaxgradW(__global void *mem, cl_uint32_t dest, + cl_uint32_t src, cl_uint32_t argmax, + cl_uint32_t destGrad, cl_uint32_t srcGrad, + cl_uint32_t kernelSize, cl_uint32_t stride, + PaddingTLBR pads, ShapeNHWC srcGradDim, + ShapeNHWC destDim) { + // src operand is present on the instruction but not needed by the OpenCL + // kernel. + maxpoolwithargmaxgradK(&mem[dest], &mem[argmax], &mem[destGrad], + &mem[srcGrad], kernelSize, stride, pads, srcGradDim, + destDim); +} + __kernel void avgpoolK(__global float *dest, __global float *src, cl_uint32_t kernelSize, cl_uint32_t stride, PaddingTLBR pads, ShapeNHWC odim, ShapeNHWC idim) { @@ -1106,6 +1370,12 @@ __kernel void avgpoolK(__global float *dest, __global float *src, } // N } +__kernel void avgpoolW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, + cl_uint32_t kernelSize, cl_uint32_t stride, + PaddingTLBR pads, ShapeNHWC odim, ShapeNHWC idim) { + avgpoolK(&mem[dest], &mem[src], kernelSize, stride, pads, odim, idim); +} + __kernel void oclavgpoolK(__global float *dest, __global float *src, cl_uint32_t kernelSize, cl_uint32_t stride, PaddingTLBR pads, ShapeNCHW odim, ShapeNCHW idim) { @@ -1142,6 +1412,12 @@ __kernel void oclavgpoolK(__global float *dest, __global float *src, } // N } +__kernel void oclavgpoolW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, + cl_uint32_t kernelSize, cl_uint32_t stride, + PaddingTLBR pads, ShapeNCHW odim, ShapeNCHW idim) { + oclavgpoolK(&mem[dest], &mem[src], kernelSize, stride, pads, odim, idim); +} + __kernel void oclavgpool_i8K(__global cl_int8_t *dest, __global cl_int8_t *src, cl_uint32_t kernelSize, cl_uint32_t stride, PaddingTLBR pads, ShapeNCHW odim, ShapeNCHW idim, @@ -1180,6 +1456,17 @@ __kernel void oclavgpool_i8K(__global cl_int8_t *dest, __global cl_int8_t *src, } } +__kernel void oclavgpool_i8W(__global void *mem, cl_uint32_t dest, + cl_uint32_t src, cl_uint32_t kernelSize, + cl_uint32_t stride, PaddingTLBR pads, + ShapeNCHW odim, ShapeNCHW idim, + cl_int32_t srcOffset, + QuantizationTransform32To8 destScaleParams) { + oclavgpool_i8K(&mem[dest], &mem[src], kernelSize, stride, pads, odim, idim, + srcOffset, destScaleParams.offset, destScaleParams.pre, + destScaleParams.post, destScaleParams.scale); +} + /// Macro to define a kernel for transpose operations. The body of /// the kernel is auto-generated by the macro. /// \p type the type of the tensor elements and of the return value @@ -1201,6 +1488,10 @@ __kernel void oclavgpool_i8K(__global cl_int8_t *dest, __global cl_int8_t *src, dest[dstIdx] = src[srcIdx]; \ } \ } \ + } \ + __kernel void name##W(__global void *mem, cl_uint32_t dest, cl_uint32_t src, \ + ShapeNHWC odim, ShapeNHWC idim, ShapeNHWC shuffle) { \ + name##K(&mem[dest], &mem[src], odim, idim, shuffle); \ } DEFINE_OPENCL_TRANSPOSE_KERNEL(transpose_i8, cl_int8_t) @@ -1240,6 +1531,11 @@ DEFINE_OPENCL_TRANSPOSE_KERNEL(transpose, float) } \ } \ } \ + } \ + __kernel void name##W(__global void *mem, cl_uint32_t dest, cl_uint32_t src, \ + ShapeNHWC odim, ShapeNHWC idim, ShapeNHWC offset, \ + cl_uint32_t count, cl_uint32_t axis) { \ + name##K(&mem[dest], &mem[src], odim, idim, offset, count, axis); \ } DEFINE_OPENCL_INSERT_TENSOR_KERNEL(inserttensor, float) DEFINE_OPENCL_INSERT_TENSOR_KERNEL(inserttensor_i8, char) @@ -1267,6 +1563,10 @@ DEFINE_OPENCL_INSERT_TENSOR_KERNEL(inserttensor_i8, char) dest[destIdx] = src[srcIdx]; \ } \ } \ + } \ + __kernel void name##W(__global void *mem, cl_uint32_t dest, cl_uint32_t src, \ + ShapeNHWC odim, ShapeNHWC idim, ShapeNHWC offset) { \ + name##K(&mem[dest], &mem[src], odim, idim, offset); \ } DEFINE_OPENCL_EXTRACT_TENSOR_KERNEL(extracttensor, float) DEFINE_OPENCL_EXTRACT_TENSOR_KERNEL(extracttensor_i8, char) @@ -1293,6 +1593,14 @@ __kernel void gatherK(__global float *dest, __global const float *src, } } +__kernel void gatherW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, + cl_uint32_t indices, cl_uint32_t numIndices, + cl_uint32_t sliceSize, cl_uint32_t numSamples, + cl_uint32_t destSampleSize, cl_uint32_t srcSampleSize) { + gatherK(&mem[dest], &mem[src], &mem[indices], numIndices, sliceSize, + numSamples, destSampleSize, srcSampleSize); +} + __kernel void scatterdataK(__global float *data, __global cl_uint64_t *indices, __global const float *slices, cl_uint32_t sliceSize) { @@ -1302,6 +1610,12 @@ __kernel void scatterdataK(__global float *data, __global cl_uint64_t *indices, sliceSize); } +__kernel void scatterdataW(__global void *mem, cl_uint32_t data, + cl_uint32_t indices, cl_uint32_t slices, + cl_uint32_t sliceSize) { + scatterdataK(&mem[data], &mem[indices], &mem[slices], sliceSize); +} + __kernel void sparselengthsweightedsumK(__global float *dest, __global float *data, __global float *weights, @@ -1340,11 +1654,20 @@ __kernel void sparselengthsweightedsumK(__global float *dest, } } +__kernel void sparselengthsweightedsumW(__global void *mem, cl_uint32_t dest, + cl_uint32_t data, cl_uint32_t weights, + cl_uint32_t indices, + cl_uint32_t lengths, + cl_uint32_t dataSliceSize) { + sparselengthsweightedsumK(&mem[dest], &mem[data], &mem[weights], + &mem[indices], &mem[lengths], dataSliceSize); +} + __kernel void sparselengthsweightedsumgradK( - __global float *data, __global float *weights, - __global cl_uint64_t *indices, __global cl_int32_t *lengths, __global float *destGrad, __global float *dataGrad, - __global float *weightsGrad, cl_uint32_t segments, cl_uint32_t sliceSize) { + __global float *weightsGrad, __global float *data, __global float *weights, + __global cl_uint64_t *indices, __global cl_int32_t *lengths, + cl_uint32_t segments, cl_uint32_t sliceSize) { // For each segment: for (cl_uint32_t i = 0, curIdx = 0; i < segments; ++i) { @@ -1375,5 +1698,16 @@ __kernel void sparselengthsweightedsumgradK( } } +__kernel void +sparselengthsweightedsumgradW(__global void *mem, cl_uint32_t data, + cl_uint32_t weights, cl_uint32_t indices, + cl_uint32_t lengths, cl_uint32_t destGrad, + cl_uint32_t dataGrad, cl_uint32_t weightsGrad, + cl_uint32_t segments, cl_uint32_t sliceSize) { + sparselengthsweightedsumgradK( + &mem[destGrad], &mem[dataGrad], &mem[weightsGrad], &mem[data], + &mem[weights], &mem[indices], &mem[lengths], segments, sliceSize); +} + /// An empty kernel used as a checkpoint for TraceEvents. __kernel void checkpoint(__global void *mem) {} diff --git a/lib/Backends/OpenCL/kernels_fwd_conv.cl b/lib/Backends/OpenCL/kernels_fwd_conv.cl index 9f347ab863..3dc01364ba 100644 --- a/lib/Backends/OpenCL/kernels_fwd_conv.cl +++ b/lib/Backends/OpenCL/kernels_fwd_conv.cl @@ -128,8 +128,13 @@ __kernel __attribute__((reqd_work_group_size(workgroup_size_0, workgroup_size_1, 1))) __attribute__((vec_type_hint(Dtype4))) void - conv_forward_mem(__global const Dtype *im_in, __global const Dtype *wg, - __global const Dtype *bias, __global Dtype *im_out) { + conv_forward_mem(__global void *mem, unsigned im_in_offset, + unsigned wg_offset, unsigned bias_offset, + unsigned im_out_offset) { + __global const Dtype *im_in = &mem[im_in_offset]; + __global const Dtype *wg = &mem[wg_offset]; + __global const Dtype *bias = &mem[bias_offset]; + __global Dtype *im_out = &mem[im_out_offset]; // Thread identifiers. // Local row ID (max: RTSM=TSM/WPTM). const int_tp tidn = get_local_id(0); diff --git a/lib/Backends/OpenCL/kernels_fwd_quantized_conv.cl b/lib/Backends/OpenCL/kernels_fwd_quantized_conv.cl index 6437b3f233..094fdfb2c8 100644 --- a/lib/Backends/OpenCL/kernels_fwd_quantized_conv.cl +++ b/lib/Backends/OpenCL/kernels_fwd_quantized_conv.cl @@ -129,11 +129,15 @@ char clip(int val) { return (char)min(max(val, -128), 127); } __kernel __attribute__((reqd_work_group_size(workgroup_size_0, workgroup_size_1, 1))) __attribute__((vec_type_hint(Dtype4))) void - conv_forward_mem_i8(__global const Dtype *im_in, __global const Dtype *wg, - __global const int *bias, __global Dtype *im_out, - int a_offset, float a_scale, int b_offset, - float b_scale, int c_offset, float c_scale, - int d_offset, float d_scale) { + conv_forward_mem_i8(__global void *mem, unsigned im_in_offset, + unsigned wg_offset, unsigned bias_offset, + unsigned im_out_offset, int a_offset, float a_scale, + int b_offset, float b_scale, int c_offset, + float c_scale, int d_offset, float d_scale) { + __global const Dtype *im_in = &mem[im_in_offset]; + __global const Dtype *wg = &mem[wg_offset]; + __global const int *bias = &mem[bias_offset]; + __global Dtype *im_out = &mem[im_out_offset]; // Thread identifiers. // Local row ID (max: RTSM=TSM/WPTM). const int_tp tidn = get_local_id(0);