diff --git a/.circleci/build.sh b/.circleci/build.sh index f502e1223b..909a047987 100755 --- a/.circleci/build.sh +++ b/.circleci/build.sh @@ -140,7 +140,7 @@ elif [[ "$CIRCLE_JOB" == "PYTORCH" ]]; then cd build elif [[ "$CIRCLE_JOB" == "OPENCL" ]]; then install_pocl - CMAKE_ARGS+=("-DGLOW_WITH_OPENCL=ON") + CMAKE_ARGS+=("-DGLOW_WITH_OPENCL=ON" "-DGLOW_OPENCL_ALIGN=128") else CMAKE_ARGS+=("-DCMAKE_BUILD_TYPE=Debug") if [[ "${CIRCLE_JOB}" == "SHARED" ]]; then diff --git a/lib/Backends/OpenCL/OpenCL.cpp b/lib/Backends/OpenCL/OpenCL.cpp index 46c388a447..a0670b86ee 100644 --- a/lib/Backends/OpenCL/OpenCL.cpp +++ b/lib/Backends/OpenCL/OpenCL.cpp @@ -108,6 +108,10 @@ static void addStringOption(std::vector &options, options.push_back("-D" + name + "=" + value); } +cl_mem glow::runtime::OpenCLDeviceBindings::getBuffer(glow::Value *v) { + return weightBuffers[v->getName()]; +} + OpenCLFunction::OpenCLFunction(std::unique_ptr F, runtime::RuntimeBundle &&bundle, TraceInfo traceInfo) @@ -140,15 +144,15 @@ static std::string getKernelName(const char *baseName, ElemKind elemTy) { std::string name = baseName; switch (elemTy) { case ElemKind::FloatTy: - return name + "W"; + return name + "K"; case ElemKind::Int8QTy: - return name + "_i8W"; + return name + "_i8K"; case ElemKind::Int32QTy: - return name + "_i32W"; + return name + "_i32K"; case ElemKind::Int64ITy: - return name + "_uW"; + return name + "_uK"; case ElemKind::BoolTy: - return name + "_bW"; + return name + "_bK"; default: LOG(FATAL) << "Unsupported data type: " << Type::getElementName(elemTy).str(); @@ -218,14 +222,14 @@ static void setKernelArg(cl_kernel kernel, unsigned argIdx, T value) { /// /// \returns the index of the last set OpenCL kernel argument. static size_t setKernelArgsForBuffers(cl_kernel kernel, const Instruction &I, - size_t nextKernelArgIdx, + runtime::OpenCLDeviceBindings *clBindings, runtime::RuntimeBundle &bundle) { // Number of instruction operands. 
auto numArgs = I.getNumOperands(); // The predicate of the instruction if available. Value *predicate = I.hasPredicate() ? I.getPredicate() : nullptr; // The index of the kernel argument to be set. - unsigned kernelArgIdx = nextKernelArgIdx; + unsigned kernelArgIdx = 0; // Go over all operands and pass buffer operands to the kernel. for (unsigned arg = 0; arg < numArgs; arg++) { auto *value = I.getOperand(arg).first; @@ -234,12 +238,20 @@ static size_t setKernelArgsForBuffers(cl_kernel kernel, const Instruction &I, if (value == predicate) continue; // The value is a buffer that should be passed as a kernel argument. - setKernelArg(kernel, kernelArgIdx, bundle.getValueOffset(value)); + setKernelArg(kernel, kernelArgIdx, clBindings->getBuffer(value)); kernelArgIdx++; } return kernelArgIdx - 1; } +static size_t setQuantizationParams(cl_kernel kernel, unsigned kernelArgIdx, + QuantizationTransform32To8 ¶ms) { + setKernelArg(kernel, kernelArgIdx++, params.pre); + setKernelArg(kernel, kernelArgIdx++, params.post); + setKernelArg(kernel, kernelArgIdx, params.scale); + return kernelArgIdx; +} + /// \returns the preferred (intra) vector width for the given OpenCL \p device, /// and the given \p elementType. 
static unsigned getPreferredVectorWidth(cl_device_id device, @@ -268,14 +280,13 @@ static unsigned getPreferredVectorWidth(cl_device_id device, return width; } -void OpenCLFunction::fillBuffer(cl_mem buffer, uint64_t start, uint64_t len, - float value, ElemKind elemKind, +void OpenCLFunction::fillBuffer(cl_mem buffer, uint64_t len, float value, + ElemKind elemKind, runtime::OpenCLDeviceBindings *devBindings) { auto kernel = createKernel(getKernelName("splat", elemKind), devBindings->program); setKernelArg(kernel, 0, buffer); - setKernelArg(kernel, 1, start); - setKernelArg(kernel, 2, value); + setKernelArg(kernel, 1, value); enqueueKernel("splat", devBindings->commandQueue, kernel, devBindings->deviceId, {(size_t)len}, devBindings->kernelLaunches); @@ -495,11 +506,10 @@ void OpenCLFunction::executeNCHWConvolution( auto kernelName = isQuantized ? "conv_forward_mem_i8" : "conv_forward_mem"; auto kernel = createKernel(kernelName, prog); - setKernelArg(kernel, 0, devBindings->deviceBuffer); - setKernelArg(kernel, 1, runtimeBundle_.getValueOffset(input)); - setKernelArg(kernel, 2, runtimeBundle_.getValueOffset(weights)); - setKernelArg(kernel, 3, runtimeBundle_.getValueOffset(bias)); - setKernelArg(kernel, 4, runtimeBundle_.getValueOffset(output)); + setKernelArg(kernel, 0, devBindings->getBuffer(input)); + setKernelArg(kernel, 1, devBindings->getBuffer(weights)); + setKernelArg(kernel, 2, devBindings->getBuffer(bias)); + setKernelArg(kernel, 3, devBindings->getBuffer(output)); // Extra options for quantized kernel if (isQuantized) { @@ -507,14 +517,14 @@ void OpenCLFunction::executeNCHWConvolution( auto outputTy = CC->getDest()->getType(); auto biasTy = CC->getBias()->getType(); auto weightsTy = CC->getFilter()->getType(); - setKernelArg(kernel, 5, weightsTy->getOffset()); - setKernelArg(kernel, 6, weightsTy->getScale()); - setKernelArg(kernel, 7, inputTy->getOffset()); - setKernelArg(kernel, 8, inputTy->getScale()); - setKernelArg(kernel, 9, outputTy->getOffset()); - 
setKernelArg(kernel, 10, outputTy->getScale()); - setKernelArg(kernel, 11, biasTy->getOffset()); - setKernelArg(kernel, 12, biasTy->getScale()); + setKernelArg(kernel, 4, weightsTy->getOffset()); + setKernelArg(kernel, 5, weightsTy->getScale()); + setKernelArg(kernel, 6, inputTy->getOffset()); + setKernelArg(kernel, 7, inputTy->getScale()); + setKernelArg(kernel, 8, outputTy->getOffset()); + setKernelArg(kernel, 9, outputTy->getScale()); + setKernelArg(kernel, 10, biasTy->getOffset()); + setKernelArg(kernel, 11, biasTy->getScale()); } // Compute proper parameters for global work and workgroups. @@ -598,7 +608,7 @@ Error OpenCLFunction::execute(ExecutionContext *context) { isa(I)) { continue; } - // The kernels are named after the name of the instruction, plus the "W" + // The kernels are named after the name of the instruction, plus the "K" // suffix to prevent name colissions for functions like 'tanh' that are also // a part of the OpenCL runtime. auto elemTy = I.getNumOperands() ? I.getOperand(0).first->getElementType() @@ -648,8 +658,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { } cl_kernel kernel = createKernel(kernelName, program); - setKernelArg(kernel, 0, deviceBuffer); - auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); + auto numArgs = + setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); auto numMandatoryArgs = numArgs; (void)numMandatoryArgs; @@ -663,7 +673,7 @@ Error OpenCLFunction::execute(ExecutionContext *context) { destTy->getOffset()}; float val = SI->getValue(); int8_t int8Val = quantization::quantize(val, destQ); - setKernelArg(kernel, ++numArgs, static_cast(int8Val)); + setKernelArg(kernel, ++numArgs, int8Val); } } @@ -682,8 +692,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { auto rhsScaleParams = quantization::quantizeScaleOffset32To8( RHSTy->getScale() / destScale, RHSTy->getOffset()); setKernelArg(kernel, ++numArgs, destOffset); - setKernelArg(kernel, ++numArgs, 
lhsScaleParams); - setKernelArg(kernel, ++numArgs, rhsScaleParams); + setKernelArg(kernel, ++numArgs, lhsScaleParams.offset); + setKernelArg(kernel, ++numArgs, rhsScaleParams.offset); if (isa(I) || isa(I)) { float resultScale = isa(I) @@ -691,7 +701,14 @@ Error OpenCLFunction::execute(ExecutionContext *context) { : LHSTy->getScale() / (RHSTy->getScale() * destScale); auto resultScaleParams = quantization::quantizeScaleOffset32To8(resultScale, 0); - setKernelArg(kernel, ++numArgs, resultScaleParams); + + numArgs = + setQuantizationParams(kernel, numArgs + 1, resultScaleParams); + } else { + numArgs = + setQuantizationParams(kernel, numArgs + 1, lhsScaleParams); + numArgs = + setQuantizationParams(kernel, numArgs + 1, rhsScaleParams); } } // Quantize floating point tensor. Scale and Offset are based on return @@ -716,7 +733,7 @@ Error OpenCLFunction::execute(ExecutionContext *context) { setKernelArg(kernel, ++numArgs, destType->getOffset()); setKernelArg(kernel, ++numArgs, srcType->getOffset()); - setKernelArg(kernel, ++numArgs, rescaleParams); + numArgs = setQuantizationParams(kernel, numArgs + 1, rescaleParams); } // Dequantize integer tensor. Scale and Offset are based // on the source tensor type. @@ -742,8 +759,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { // Implement Softmax by parallelizing the batch dimension. Each sample in // the batch is processed by a different parallel 'thread'. cl_kernel kernel = createKernel(kernelName, program); - setKernelArg(kernel, 0, deviceBuffer); - auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); + auto numArgs = + setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); // This is the number of elements for each slice. There are N slices in // our batch. @@ -762,8 +779,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { // Implement Softmax by parallelizing the batch dimension. Each sample in // the batch is processed by a different parallel 'thread'. 
cl_kernel kernel = createKernel(kernelName, program); - setKernelArg(kernel, 0, deviceBuffer); - auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); + auto numArgs = + setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); // This is the number of elements for each slice. There are N slices in // our batch. @@ -780,8 +797,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { if (auto *ET = dyn_cast(&I)) { cl_kernel kernel = createKernel(kernelName, program); - setKernelArg(kernel, 0, deviceBuffer); - auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); + auto numArgs = + setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); // Currently support tensors up to 4 dimensions. // TODO: Handle other dimensions. @@ -820,8 +837,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { if (auto *IT = dyn_cast(&I)) { cl_kernel kernel = createKernel(kernelName, program); - setKernelArg(kernel, 0, deviceBuffer); - auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); + auto numArgs = + setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); // Currently support tensors of up to 4 dimensions. // TODO: Handle other dimensions. @@ -874,8 +891,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { auto tiledKernelName = isQuantized ? "matmul_tiled_i8" : "matmul_tiled"; cl_kernel kernel = createKernel(useTiledMatMul ? 
tiledKernelName : kernelName, program); - setKernelArg(kernel, 0, deviceBuffer); - auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); + auto numArgs = + setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); auto ddim = ShapeNHWC::fromXY(BMM->getDest()->getType()->dims()); auto ldim = ShapeNHWC::fromXY(BMM->getLHS()->getType()->dims()); @@ -893,7 +910,11 @@ Error OpenCLFunction::execute(ExecutionContext *context) { setKernelArg(kernel, numArgs + 4, lhsTy->getOffset()); setKernelArg(kernel, numArgs + 5, rhsTy->getOffset()); setKernelArg(kernel, numArgs + 6, destTy->getOffset()); - setKernelArg(kernel, numArgs + 7, destScaleParams); + if (useTiledMatMul) { + setKernelArg(kernel, numArgs + 7, destScaleParams); + } else { + setQuantizationParams(kernel, numArgs + 7, destScaleParams); + } } if (useTiledMatMul) { @@ -917,8 +938,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { kernelName += "_32"; } cl_kernel kernel = createKernel(kernelName, program); - setKernelArg(kernel, 0, deviceBuffer); - auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); + auto numArgs = + setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); auto bdim = flattenCdr(BA->getBatch()->dims()); setKernelArg(kernel, numArgs + 1, bdim.first); @@ -971,8 +992,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { // Create kernel and set arguments. cl_kernel kernel = createKernel(kernelName, program); - setKernelArg(kernel, 0, deviceBuffer); - auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); + auto numArgs = + setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); setKernelArg(kernel, numArgs + 1, batchDims[axis]); setKernelArg(kernel, numArgs + 2, axisSrcSliceSize); @@ -992,8 +1013,18 @@ Error OpenCLFunction::execute(ExecutionContext *context) { // This is a naive implementation that parallelizes using three dims: // the X and the Y in the output filter. 
cl_kernel kernel = createKernel(kernelName, program); - setKernelArg(kernel, 0, deviceBuffer); - auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); + unsigned numArgs = 0; + + // Okay, to be honest I have no idea why this is, but: if you don't pass + // in a void* for the first arg this specific kernel doesn't work. It's + // super weird and I've tried everything I can think of. Conceding defeat. + setKernelArg(kernel, numArgs++, nullptr); + + setKernelArg(kernel, numArgs++, clBindings->getBuffer(CC->getDest())); + setKernelArg(kernel, numArgs++, clBindings->getBuffer(CC->getSrc())); + setKernelArg(kernel, numArgs++, clBindings->getBuffer(CC->getFilter())); + setKernelArg(kernel, numArgs, clBindings->getBuffer(CC->getBias())); + auto odim = ShapeNHWC(CC->getDest()->getType()->dims()); auto idim = ShapeNHWC(CC->getSrc()->getType()->dims()); auto pads = PaddingTLBR(CC->getPads()); @@ -1037,8 +1068,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { auto *filterGrad = CG->getFilterGrad(); auto *biasGrad = CG->getBiasGrad(); cl_kernel kernel = createKernel(kernelName, program); - setKernelArg(kernel, 0, deviceBuffer); - auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); + auto numArgs = + setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); auto destGradDim = ShapeNHWC(destGrad->dims()); auto srcDim = ShapeNHWC(src->dims()); @@ -1055,13 +1086,12 @@ Error OpenCLFunction::execute(ExecutionContext *context) { setKernelArg(kernel, numArgs + 7, destGradDim); setKernelArg(kernel, numArgs + 8, filterGradDim); // Zero memory for the output buffers. 
- fillBuffer(deviceBuffer, runtimeBundle_.getValueOffset(srcGrad), - srcGrad->size(), 0, srcGrad->getElementType(), clBindings); - fillBuffer(deviceBuffer, runtimeBundle_.getValueOffset(filterGrad), - filterGrad->size(), 0, filterGrad->getElementType(), - clBindings); - fillBuffer(deviceBuffer, runtimeBundle_.getValueOffset(biasGrad), - biasGrad->size(), 0, biasGrad->getElementType(), clBindings); + fillBuffer(clBindings->getBuffer(srcGrad), srcGrad->size(), 0, + srcGrad->getElementType(), clBindings); + fillBuffer(clBindings->getBuffer(filterGrad), filterGrad->size(), 0, + filterGrad->getElementType(), clBindings); + fillBuffer(clBindings->getBuffer(biasGrad), biasGrad->size(), 0, + biasGrad->getElementType(), clBindings); enqueueKernel(I.getName(), commands, kernel, deviceId, {destGradDim.h, destGradDim.w, destGradDim.c}, @@ -1077,8 +1107,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { } cl_kernel kernel = createKernel(kernelName, program); - setKernelArg(kernel, 0, deviceBuffer); - auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); + auto numArgs = + setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); ShapeHW kdim(PM->getKernels()); ShapeHW sdim(PM->getStrides()); @@ -1112,8 +1142,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { // This is a naive implementation that parallelizes using three dims: // the X and the Y in the output filter. 
cl_kernel kernel = createKernel(kernelName, program); - setKernelArg(kernel, 0, deviceBuffer); - auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); + auto numArgs = + setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); auto odim = ShapeNHWC(PM->getDest()->getType()->dims()); auto idim = ShapeNHWC(PM->getSrc()->getType()->dims()); @@ -1133,8 +1163,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { if (auto *PMG = dyn_cast(&I)) { cl_kernel kernel = createKernel(kernelName, program); - setKernelArg(kernel, 0, deviceBuffer); - auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); + auto numArgs = + setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); auto destGradDim = ShapeNHWC(PMG->getDestGrad()->dims()); auto srcGradDim = ShapeNHWC(PMG->getSrcGrad()->dims()); @@ -1160,8 +1190,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { } cl_kernel kernel = createKernel(kernelName, program); - setKernelArg(kernel, 0, deviceBuffer); - auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); + auto numArgs = + setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); ShapeHW kdim(PA->getKernels()); ShapeHW sdim(PA->getStrides()); @@ -1194,7 +1224,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { (PA->getKernels()[0] * PA->getKernels()[0]), destTy->getOffset()); setKernelArg(kernel, numArgs + 6, srcTy->getOffset()); - setKernelArg(kernel, numArgs + 7, destScaleParam); + setKernelArg(kernel, numArgs + 7, destScaleParam.offset); + numArgs = setQuantizationParams(kernel, numArgs + 8, destScaleParam); } enqueueKernel(I.getName(), commands, kernel, deviceId, global, @@ -1209,8 +1240,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { << "This code supports only 4 and lower dimensional transposes"; cl_kernel kernel = createKernel(kernelName, program); - setKernelArg(kernel, 0, deviceBuffer); - auto numArgs = setKernelArgsForBuffers(kernel, I, 1, 
runtimeBundle_); + auto numArgs = + setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); // Temporary hack to support 3-dim transposes. // TODO: support any dimensional transposes. @@ -1244,13 +1275,14 @@ Error OpenCLFunction::execute(ExecutionContext *context) { if (src == dest) { continue; } - size_t destOff = runtimeBundle_.getValueOffset(dest); - size_t srcOff = runtimeBundle_.getValueOffset(src); + + cl_mem destBuf = clBindings->getBuffer(dest); + cl_mem srcBuf = clBindings->getBuffer(src); size_t sizeInBytes = dest->getSizeInBytes(); cl_event event{nullptr}; - cl_int err = clEnqueueCopyBuffer(commands, deviceBuffer, deviceBuffer, - srcOff, destOff, sizeInBytes, 0, nullptr, - kernelProfiling_ ? &event : nullptr); + cl_int err = + clEnqueueCopyBuffer(commands, srcBuf, destBuf, 0, 0, sizeInBytes, 0, + nullptr, kernelProfiling_ ? &event : nullptr); if (kernelProfiling_) { kernelLaunches.emplace_back(KernelLaunch(I.getName(), "copy", event)); } @@ -1260,8 +1292,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { if (auto *GI = dyn_cast(&I)) { cl_kernel kernel = createKernel(kernelName, program); - setKernelArg(kernel, 0, deviceBuffer); - auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); + auto numArgs = + setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); unsigned_t batchDims = GI->getBatchDims(); auto *data = GI->getData(); @@ -1293,8 +1325,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { if (auto *SDI = dyn_cast(&I)) { cl_kernel kernel = createKernel(kernelName, program); - setKernelArg(kernel, 0, deviceBuffer); - auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); + auto numArgs = + setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); auto *data = SDI->getData(); size_t dataSliceSize = data->size() / data->dims()[0]; @@ -1308,11 +1340,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { if (auto *SLWS = dyn_cast(&I)) { cl_kernel kernel = 
createKernel(kernelName, program); - // Set the device buffer as the first argument. - setKernelArg(kernel, 0, deviceBuffer); - // Set all buffer arguments from the instruction (data, dest, weights, - // indices, lengths) as subsequent arguments. - auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); + auto numArgs = + setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); // Set the size of one slice of data as the last argument. auto *data = SLWS->getData(); @@ -1322,8 +1351,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { // Zero the destination buffer so that the kernel can accumulate (+=) into // it. auto *dest = SLWS->getDest(); - fillBuffer(deviceBuffer, runtimeBundle_.getValueOffset(dest), - dest->size(), 0, dest->getElementType(), clBindings); + fillBuffer(clBindings->getBuffer(dest), dest->size(), 0, + dest->getElementType(), clBindings); // Get the number of segments. The output for each segment will be // computed in parallel by setting the global size equal to the number of @@ -1338,11 +1367,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { if (auto *SLWSG = dyn_cast(&I)) { cl_kernel kernel = createKernel(kernelName, program); - // Set the device buffer as the first argument. - setKernelArg(kernel, 0, deviceBuffer); - // Set all buffer arguments from the instruction (dataGrad, destGrad, - // weights, indices, lengths) as subsequent arguments. - auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); + auto numArgs = + setKernelArgsForBuffers(kernel, I, clBindings, runtimeBundle_); // Set the number of segments as the second last argument. auto *lengths = SLWSG->getLengths(); @@ -1357,8 +1383,8 @@ Error OpenCLFunction::execute(ExecutionContext *context) { // Zero the data gradient buffer so that the kernel can accumulate (+=) // into it. 
auto *dataGrad = SLWSG->getDataGrad(); - fillBuffer(deviceBuffer, runtimeBundle_.getValueOffset(dataGrad), - dataGrad->size(), 0, dataGrad->getElementType(), clBindings); + fillBuffer(clBindings->getBuffer(dataGrad), dataGrad->size(), 0, + dataGrad->getElementType(), clBindings); // Enqueue the kernel. Set the global size to 1 so that all segments are // processed sequentially to avoid two kernel instances accumulating into @@ -1516,7 +1542,11 @@ OCLBackend::compileIR(std::unique_ptr IR) const { auto *module = IR->getGraph()->getParent(); TraceInfo traceInfo; +#ifdef GLOW_OPENCL_ALIGN + MemoryAllocator allocator("GPU", 0xFFFFFFFF, GLOW_OPENCL_ALIGN); +#else MemoryAllocator allocator("GPU", 0xFFFFFFFF); +#endif runtime::RuntimeBundle bundle = runtime::RuntimeBundle::create(*IR, allocator); std::unique_ptr function = @@ -1537,7 +1567,11 @@ OCLBackend::compile(Function *F, const BackendOptions &opts) const { autoInstrument(traceInfo, IR.get()); } +#ifdef GLOW_OPENCL_ALIGN + MemoryAllocator allocator("GPU", 0xFFFFFFFF, GLOW_OPENCL_ALIGN); +#else MemoryAllocator allocator("GPU", 0xFFFFFFFF); +#endif runtime::RuntimeBundle bundle = runtime::RuntimeBundle::create(*IR, allocator); diff --git a/lib/Backends/OpenCL/OpenCL.h b/lib/Backends/OpenCL/OpenCL.h index 418e25f90f..59fe401dfb 100644 --- a/lib/Backends/OpenCL/OpenCL.h +++ b/lib/Backends/OpenCL/OpenCL.h @@ -145,8 +145,7 @@ class OpenCLFunction final : public CompiledFunction { /// Fill the device \p buffer with a given \p value. /// \param len number of buffer elements to be filled by the \p value. /// Elements are considered to be of the type described by \p elemKind. - void fillBuffer(cl_mem buffer, uint64_t start, uint64_t len, float value, - ElemKind elemKind, + void fillBuffer(cl_mem buffer, uint64_t len, float value, ElemKind elemKind, runtime::OpenCLDeviceBindings *devBindings); /// Execution a convolution instruction which uses NCHW format. 
@@ -242,10 +241,13 @@ namespace runtime { /// device specific information used to run a compiled function on a specific /// device. struct OpenCLDeviceBindings : DeviceBindings { - OpenCLDeviceBindings(cl_mem buffer, cl_command_queue commands, - cl_device_id device, cl_context ctx, cl_program prog) + OpenCLDeviceBindings( + cl_mem buffer, cl_command_queue commands, cl_device_id device, + cl_context ctx, cl_program prog, + const std::unordered_map &subBuffers) : DeviceBindings(OCLBackend::getName()), deviceBuffer{buffer}, - commandQueue{commands}, deviceId{device}, context{ctx}, program{prog} {} + commandQueue{commands}, deviceId{device}, context{ctx}, program{prog}, + weightBuffers(subBuffers) {} /// CL memory buffer. Currently this contains both mutable and immutable /// weights, the buffer is allocated once when the network is added. @@ -269,6 +271,12 @@ struct OpenCLDeviceBindings : DeviceBindings { /// A list of kernels and their associated events. std::vector kernelLaunches; + + /// Buffers or subBuffers associated with symbols. + std::unordered_map weightBuffers; + + /// \returns the subBuffer associated with a Value. 
+ cl_mem getBuffer(glow::Value *v); }; } // namespace runtime } // namespace glow diff --git a/lib/Backends/OpenCL/OpenCLDeviceManager.cpp b/lib/Backends/OpenCL/OpenCLDeviceManager.cpp index af49fda744..4d51453573 100644 --- a/lib/Backends/OpenCL/OpenCLDeviceManager.cpp +++ b/lib/Backends/OpenCL/OpenCLDeviceManager.cpp @@ -55,7 +55,29 @@ DeviceManager *createOCLDeviceManager(const DeviceConfig &config) { return new OpenCLDeviceManager(config); } -OpenCLBuffer::~OpenCLBuffer() { clReleaseMemObject(buffer_); } +OpenCLBuffer::~OpenCLBuffer() { + for (auto buf : subBuffers_) { + clReleaseMemObject(buf.second); + } + subBuffers_.clear(); + + clReleaseMemObject(buffer_); +} + +/// Add a mapping from a Symbol name to an offset into buffer_; +bool OpenCLBuffer::addSubBuffer(std::string name, size_t offset, size_t size) { + cl_buffer_region region({offset, size}); + cl_int err; + auto buf = clCreateSubBuffer(buffer_, CL_MEM_READ_WRITE, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); + auto res = subBuffers_.emplace(name, buf); + if (!res.second) { + llvm::dbgs() << "OpenCLBuffer: failed to add subBuffer for symbol " << name + << "\n"; + return false; + } + return true; +} } // namespace runtime } // namespace glow @@ -356,6 +378,15 @@ void OpenCLDeviceManager::addNetworkImpl(const Module *module, clFinish(commands); } usedMemoryBytes_ += sizeInBytes; + + // Add a sub-buffer for each symbol in the symbol table. OpenCL sub-buffers + // are essentially TensorViews in Glow. + for (auto &pair : bundle.getSymbolTable()) { + bool success = buffer->addSubBuffer(pair.first, pair.second.offset, + pair.second.size); + DCHECK(success); + } + // Compile the CL program. // Add to the function name lookup map. // Add shared pointer to the buffer to buffers. 
This way the buffer will @@ -376,6 +407,7 @@ void OpenCLDeviceManager::addNetworkImpl(const Module *module, programs_.emplace(func.first, program); functions_.emplace(func.first, func.second); buffers_.emplace(func.first, buffer); + buffer->incrementUsers(); DCHECK_LE(usedMemoryBytes_, maxMemoryBytes_); @@ -666,7 +698,7 @@ void OpenCLDeviceManager::runFunctionImpl( auto program = programs_[function]; auto clBindings = glow::make_unique( buffers_[function]->getBuffer(), queue.backingQueue, deviceId_, context_, - program); + program, buffers_[function]->getSubBuffers()); // Copy inputs to the device. copyInputsToDevice(func->getRuntimeBundle(), context.get(), clBindings.get()); diff --git a/lib/Backends/OpenCL/OpenCLDeviceManager.h b/lib/Backends/OpenCL/OpenCLDeviceManager.h index a7f4b84feb..a0a585c0ca 100644 --- a/lib/Backends/OpenCL/OpenCLDeviceManager.h +++ b/lib/Backends/OpenCL/OpenCLDeviceManager.h @@ -98,6 +98,9 @@ class OpenCLBuffer { /// The OpenCL buffer being stored. cl_mem buffer_; + /// Subbuffers for symbols. + std::unordered_map subBuffers_; + /// Count of functions using this buffer. unsigned int users_{0}; @@ -120,6 +123,14 @@ /// Get size of buffer in bytes. size_t getSize() { return size_; } + + /// Return the mapping from Symbol name to subBuffer for this Buffer. + const std::unordered_map &getSubBuffers() { + return subBuffers_; + } + + /// Create a subBuffer of buffer_ covering \p size bytes at \p offset for the given symbol \p name. + bool addSubBuffer(std::string name, size_t offset, size_t size); }; /// A class controlling a single OpenCL device. 
Many OpenCLFunctions may be diff --git a/lib/Backends/OpenCL/kernels.cl b/lib/Backends/OpenCL/kernels.cl index ac5a0d8496..420a0e61b1 100644 --- a/lib/Backends/OpenCL/kernels.cl +++ b/lib/Backends/OpenCL/kernels.cl @@ -120,6 +120,13 @@ cl_int32_t scale_i32i8(cl_int32_t input, cl_int32_t pre, cl_int32_t post, return ((((input >> pre) * scale) + rtn) >> post) + offset; } +/// Scales a 32-bit integer using the integer shift-mult-shift method. + +cl_int32_t scale_i32i8p(cl_int32_t input, QuantizationTransform32To8 params, + cl_int32_t offset) { + return scale_i32i8(input, params.pre, params.post, params.scale, offset); +} + /// Clips int32_t into int8_t. cl_int8_t clip(cl_int32_t val) { return (cl_int8_t)min(max(val, -128), 127); } @@ -152,16 +159,6 @@ __kernel void quantize_i32K(__global cl_int32_t *dest, __global float *src, dest[i] = quantize_i32(src[i], scale, offset); } -__kernel void quantize_i8W(__global void *mem, cl_uint32_t dest, - cl_uint32_t src, float scale, cl_int32_t offset) { - quantize_i8K(&mem[dest], &mem[src], scale, offset); -} - -__kernel void quantize_i32W(__global void *mem, cl_uint32_t dest, - cl_uint32_t src, float scale, cl_int32_t offset) { - quantize_i32K(&mem[dest], &mem[src], scale, offset); -} - __kernel void rescalequantized_i8K(__global cl_int8_t *dest, __global cl_int8_t *src, cl_int32_t destOffset, cl_int32_t srcOffset, @@ -174,26 +171,12 @@ __kernel void rescalequantized_i8K(__global cl_int8_t *dest, dest[i] = clip(s); } -__kernel void rescalequantized_i8W(__global void *mem, cl_uint32_t dest, - cl_uint32_t src, cl_int32_t destOffset, - cl_int32_t srcOffset, - QuantizationTransform32To8 rescaleParams) { - rescalequantized_i8K(&mem[dest], &mem[src], destOffset, srcOffset, - rescaleParams.pre, rescaleParams.post, - rescaleParams.scale); -} - __kernel void dequantizeK(__global float *dest, __global cl_int8_t *src, float scale, cl_int32_t offset) { size_t i = get_global_id(0); dest[i] = dequantize(src[i], scale, offset); } -__kernel void 
dequantizeW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, - float scale, cl_int32_t offset) { - dequantizeK(&mem[dest], &mem[src], scale, offset); -} - /// Macro to define a kernel for data-parallel ternay operations. The body of /// the kernel is auto-generated by the macro. /// Defines vectorized kernels for vector sizes 1, 8 and 16. @@ -228,11 +211,6 @@ __kernel void dequantizeW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, vstore8(VAL, i * 2 + 1, dest); \ } \ } \ - __kernel void name##W##16(__global void *mem, cl_uint32_t dest, \ - cl_uint32_t cond, cl_uint32_t lhs, \ - cl_uint32_t rhs) { \ - name##K##16(&mem[dest], &mem[cond], &mem[lhs], &mem[rhs]); \ - } \ __kernel void name##K##8(__global type * dest, __global type * cond, \ __global type * lhs, __global type * rhs) { \ typedef float8 vtype; \ @@ -243,11 +221,6 @@ __kernel void dequantizeW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, vtype VAL = body; \ vstore8(VAL, i, dest); \ } \ - __kernel void name##W##8(__global void *mem, cl_uint32_t dest, \ - cl_uint32_t cond, cl_uint32_t lhs, \ - cl_uint32_t rhs) { \ - name##K##8(&mem[dest], &mem[cond], &mem[lhs], &mem[rhs]); \ - } \ __kernel void name##K(__global type *dest, __global type *cond, \ __global type *lhs, __global type *rhs) { \ typedef float vtype; \ @@ -256,10 +229,6 @@ __kernel void dequantizeW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, vtype RHS = rhs[i]; \ vtype LHS = lhs[i]; \ dest[i] = body; \ - } \ - __kernel void name##W(__global void *mem, cl_uint32_t dest, \ - cl_uint32_t cond, cl_uint32_t lhs, cl_uint32_t rhs) { \ - name##K(&mem[dest], &mem[cond], &mem[lhs], &mem[rhs]); \ } /// Macro to define a kernel for data-parallel binary operations. 
The body of @@ -294,10 +263,6 @@ __kernel void dequantizeW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, vstore8(VAL, i * 2 + 1, dest); \ } \ } \ - __kernel void name##W##16(__global void *mem, cl_uint32_t dest, \ - cl_uint32_t lhs, cl_uint32_t rhs) { \ - name##K##16(&mem[dest], &mem[lhs], &mem[rhs]); \ - } \ __kernel void name##K##8(__global type * dest, __global type * lhs, \ __global type * rhs) { \ typedef float8 vtype; \ @@ -307,10 +272,6 @@ __kernel void dequantizeW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, vtype VAL = body; \ vstore8(VAL, i, dest); \ } \ - __kernel void name##W##8(__global void *mem, cl_uint32_t dest, \ - cl_uint32_t lhs, cl_uint32_t rhs) { \ - name##K##8(&mem[dest], &mem[lhs], &mem[rhs]); \ - } \ __kernel void name##K(__global type *dest, __global type *lhs, \ __global type *rhs) { \ typedef float vtype; \ @@ -318,10 +279,6 @@ __kernel void dequantizeW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, vtype RHS = rhs[i]; \ vtype LHS = lhs[i]; \ dest[i] = body; \ - } \ - __kernel void name##W(__global void *mem, cl_uint32_t dest, cl_uint32_t lhs, \ - cl_uint32_t rhs) { \ - name##K(&mem[dest], &mem[lhs], &mem[rhs]); \ } /// Macro to define a kernel for data-parallel binary quantized operations. 
The @@ -343,15 +300,6 @@ __kernel void dequantizeW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, cl_int32_t RHS = \ scale_i32i8(rhs[i] - rhsOffset, rhsPre, rhsPost, rhsScale, 0); \ dest[i] = clip((body) + destOffset); \ - } \ - __kernel void name##_i8W( \ - __global void *mem, cl_uint32_t dest, cl_uint32_t lhs, cl_uint32_t rhs, \ - cl_int32_t destOffset, QuantizationTransform32To8 lhsScaleParams, \ - QuantizationTransform32To8 rhsScaleParams) { \ - name##_i8K(&mem[dest], &mem[lhs], &mem[rhs], destOffset, \ - lhsScaleParams.offset, rhsScaleParams.offset, \ - lhsScaleParams.pre, lhsScaleParams.post, lhsScaleParams.scale, \ - rhsScaleParams.pre, rhsScaleParams.post, rhsScaleParams.scale); \ } /// Macro to define a mini-kernel for data-parallel multiplicative quantized @@ -370,16 +318,6 @@ __kernel void dequantizeW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, cl_int32_t LHS = lhs[i] - lhsOffset; \ cl_int32_t RHS = rhs[i] - rhsOffset; \ dest[i] = clip(scale_i32i8((body), pre, post, scale, destOffset)); \ - } \ - __kernel void name##_i8W( \ - __global void *mem, cl_uint32_t dest, cl_uint32_t lhs, cl_uint32_t rhs, \ - cl_int32_t destOffset, QuantizationTransform32To8 lhsScaleParams, \ - QuantizationTransform32To8 rhsScaleParams, \ - QuantizationTransform32To8 resultScaleParams) { \ - name##_i8K(&mem[dest], &mem[lhs], &mem[rhs], destOffset, \ - lhsScaleParams.offset, rhsScaleParams.offset, \ - resultScaleParams.pre, resultScaleParams.post, \ - resultScaleParams.scale); \ } /// Macro to define a kernel for data-parallel unary operations. 
The body of @@ -411,10 +349,6 @@ __kernel void dequantizeW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, vstore8(VAL, i * 2 + 1, dest); \ } \ } \ - __kernel void name##W##16(__global void *mem, cl_uint32_t dest, \ - cl_uint32_t src) { \ - name##K##16(&mem[dest], &mem[src]); \ - } \ __kernel void name##K##8(__global type * dest, __global type * src) { \ typedef float8 vtype; \ size_t i = get_global_id(0); \ @@ -422,19 +356,11 @@ __kernel void dequantizeW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, vtype VAL = body; \ vstore8(VAL, i, dest); \ } \ - __kernel void name##W##8(__global void *mem, cl_uint32_t dest, \ - cl_uint32_t src) { \ - name##K##8(&mem[dest], &mem[src]); \ - } \ __kernel void name##K(__global type *dest, __global type *src) { \ typedef float vtype; \ size_t i = get_global_id(0); \ vtype SRC = src[i]; \ dest[i] = body; \ - } \ - __kernel void name##W(__global void *mem, cl_uint32_t dest, \ - cl_uint32_t src) { \ - name##K(&mem[dest], &mem[src]); \ } /// Macro to define a kernel for data-parallel unary operations with an @@ -467,9 +393,6 @@ __kernel void dequantizeW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, vstore8(VAL, i * 2 + 1, dest); \ } \ } \ - __kernel void name##W##16(__global void *mem, cl_uint32_t dest, float val) { \ - name##K##16(&mem[dest], (type)val); \ - } \ __kernel void name##K##8(__global type * dest, type val) { \ typedef type##8 vtype; \ size_t i = get_global_id(0); \ @@ -477,17 +400,11 @@ __kernel void dequantizeW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, vtype VAL = body; \ vstore8(VAL, i, dest); \ } \ - __kernel void name##W##8(__global void *mem, cl_uint32_t dest, float val) { \ - name##K##8(&mem[dest], (type)val); \ - } \ __kernel void name##K(__global type *dest, type val) { \ typedef type vtype; \ size_t i = get_global_id(0); \ vtype SRC = (vtype)val; \ dest[i] = body; \ - } \ - __kernel void name##W(__global void *mem, cl_uint32_t dest, float val) { \ - name##K(&mem[dest], 
(type)val); \ } DEFINE_OPENCL_BINARY_DATA_PARALLEL_KERNEL(elementadd, float, LHS + RHS) @@ -539,12 +456,6 @@ __kernel void elementselectK16(__global float *dest, __global cl_int8_t *cond, i + 1, dest); } -__kernel void elementselectW16(__global void *mem, cl_uint32_t dest, - cl_uint32_t cond, cl_uint32_t lhs, - cl_uint32_t rhs) { - elementselectK16(&mem[dest], &mem[cond], &mem[lhs], &mem[rhs]); -} - __kernel void elementselectK8(__global float *dest, __global cl_int8_t *cond, __global float *lhs, __global float *rhs) { size_t i = get_global_id(0); @@ -555,12 +466,6 @@ __kernel void elementselectK8(__global float *dest, __global cl_int8_t *cond, i, dest); } -__kernel void elementselectW8(__global void *mem, cl_uint32_t dest, - cl_uint32_t cond, cl_uint32_t lhs, - cl_uint32_t rhs) { - elementselectK8(&mem[dest], &mem[cond], &mem[lhs], &mem[rhs]); -} - __kernel void elementselectK(__global float *dest, __global cl_int8_t *cond, __global float *lhs, __global float *rhs) { size_t i = get_global_id(0); @@ -570,12 +475,6 @@ __kernel void elementselectK(__global float *dest, __global cl_int8_t *cond, dest[i] = (c != 0) ? 
LHS : RHS; } -__kernel void elementselectW(__global void *mem, cl_uint32_t dest, - cl_uint32_t cond, cl_uint32_t lhs, - cl_uint32_t rhs) { - elementselectK(&mem[dest], &mem[cond], &mem[lhs], &mem[rhs]); -} - __kernel void elementcmplteK16(__global cl_int8_t *dest, __global float *LHS, __global float *RHS) { // This kernel uses 8-element vector primitives on two contiguous 8-element @@ -591,33 +490,18 @@ __kernel void elementcmplteK16(__global cl_int8_t *dest, __global float *LHS, i + 1, dest); } -__kernel void elementcmplteW16(__global void *mem, cl_uint32_t dest, - cl_uint32_t LHS, cl_uint32_t RHS) { - elementcmplteK16(&mem[dest], &mem[LHS], &mem[RHS]); -} - __kernel void elementcmplteK8(__global cl_int8_t *dest, __global float *LHS, __global float *RHS) { size_t i = get_global_id(0); vstore8(convert_char8(islessequal(vload8(i, LHS), vload8(i, RHS))), i, dest); } -__kernel void elementcmplteW8(__global void *mem, cl_uint32_t dest, - cl_uint32_t LHS, cl_uint32_t RHS) { - elementcmplteK8(&mem[dest], &mem[LHS], &mem[RHS]); -} - __kernel void elementcmplteK(__global cl_int8_t *dest, __global float *LHS, __global float *RHS) { size_t i = get_global_id(0); dest[i] = LHS[i] <= RHS[i]; } -__kernel void elementcmplteW(__global void *mem, cl_uint32_t dest, - cl_uint32_t LHS, cl_uint32_t RHS) { - elementcmplteK(&mem[dest], &mem[LHS], &mem[RHS]); -} - __kernel void oclbatchedreduceaddK(__global float *dest, __global float *batch, __global cl_int32_t *destSliceSizes, __global cl_int32_t *batchSliceSizes, @@ -659,14 +543,6 @@ __kernel void oclbatchedreduceaddK(__global float *dest, __global float *batch, } } -__kernel void -oclbatchedreduceaddW(__global void *mem, cl_uint32_t dest, cl_uint32_t batch, - cl_uint32_t destSliceSizes, cl_uint32_t batchSliceSizes, - cl_uint32_t numSlices, cl_uint32_t axisSliceSize) { - oclbatchedreduceaddK(&mem[dest], &mem[batch], &mem[destSliceSizes], - &mem[batchSliceSizes], numSlices, axisSliceSize); -} - __kernel void batchedaddK(__global float 
*dest, __global float *batch, __global float *slice, cl_uint32_t numSlice, cl_uint32_t sliceSize) { @@ -676,70 +552,37 @@ __kernel void batchedaddK(__global float *dest, __global float *batch, } } -__kernel void batchedaddW(__global void *mem, cl_uint32_t dest, - cl_uint32_t batch, cl_uint32_t slice, - cl_uint32_t numSlice, cl_uint32_t sliceSize) { - batchedaddK(&mem[dest], &mem[batch], &mem[slice], numSlice, sliceSize); -} - __kernel void batchedadd_i8K(__global cl_int8_t *dest, __global cl_int8_t *batch, __global cl_int8_t *slice, cl_uint32_t numSlice, cl_uint32_t sliceSize, cl_int32_t destOffset, - cl_int32_t batchOffset, cl_int32_t sliceOffset, - cl_int32_t batchPre, cl_int32_t batchPost, - cl_int32_t batchScale, cl_int32_t slicePre, - cl_int32_t slicePost, cl_int32_t sliceScale) { - size_t s = get_global_id(0); - for (size_t n = 0; n < numSlice; n++) { - cl_int32_t batchVal = batch[n * sliceSize + s] - batchOffset; - cl_int32_t sliceVal = slice[s] - sliceOffset; - cl_int32_t x = scale_i32i8(batchVal, batchPre, batchPost, batchScale, 0); - cl_int32_t y = scale_i32i8(sliceVal, slicePre, slicePost, sliceScale, 0); - dest[n * sliceSize + s] = clip(x + y + destOffset); - } -} - -__kernel void batchedadd_i8W(__global void *mem, cl_uint32_t dest, - cl_uint32_t batch, cl_uint32_t slice, - cl_uint32_t numSlice, cl_uint32_t sliceSize, - cl_int32_t destOffset, QuantizationTransform32To8 batchScaleParams, QuantizationTransform32To8 sliceScaleParams) { - batchedadd_i8K(&mem[dest], &mem[batch], &mem[slice], numSlice, sliceSize, - destOffset, batchScaleParams.offset, sliceScaleParams.offset, - batchScaleParams.pre, batchScaleParams.post, - batchScaleParams.scale, sliceScaleParams.pre, - sliceScaleParams.post, sliceScaleParams.scale); -} - -__kernel void batchedadd_i8K_32( - __global cl_int8_t *dest, __global cl_int8_t *batch, - __global cl_int32_t *slice, cl_uint32_t numSlice, cl_uint32_t sliceSize, - cl_int32_t destOffset, cl_int32_t batchOffset, cl_int32_t sliceOffset, - 
cl_int32_t batchPre, cl_int32_t batchPost, cl_int32_t batchScale, - cl_int32_t slicePre, cl_int32_t slicePost, cl_int32_t sliceScale) { size_t s = get_global_id(0); for (size_t n = 0; n < numSlice; n++) { - cl_int32_t batchVal = batch[n * sliceSize + s] - batchOffset; - cl_int32_t sliceVal = slice[s] - sliceOffset; - cl_int32_t x = scale_i32i8(batchVal, batchPre, batchPost, batchScale, 0); - cl_int32_t y = scale_i32i8(sliceVal, slicePre, slicePost, sliceScale, 0); + cl_int32_t batchVal = batch[n * sliceSize + s] - batchScaleParams.offset; + cl_int32_t sliceVal = slice[s] - sliceScaleParams.offset; + cl_int32_t x = scale_i32i8p(batchVal, batchScaleParams, 0); + cl_int32_t y = scale_i32i8p(sliceVal, sliceScaleParams, 0); dest[n * sliceSize + s] = clip(x + y + destOffset); } } -__kernel void batchedadd_i8W_32(__global void *mem, cl_uint32_t dest, - cl_uint32_t batch, cl_uint32_t slice, +__kernel void batchedadd_i8K_32(__global cl_int8_t *dest, + __global cl_int8_t *batch, + __global cl_int32_t *slice, cl_uint32_t numSlice, cl_uint32_t sliceSize, cl_int32_t destOffset, QuantizationTransform32To8 batchScaleParams, QuantizationTransform32To8 sliceScaleParams) { - batchedadd_i8K_32( - &mem[dest], &mem[batch], &mem[slice], numSlice, sliceSize, destOffset, - batchScaleParams.offset, sliceScaleParams.offset, batchScaleParams.pre, - batchScaleParams.post, batchScaleParams.scale, sliceScaleParams.pre, - sliceScaleParams.post, sliceScaleParams.scale); + size_t s = get_global_id(0); + for (size_t n = 0; n < numSlice; n++) { + cl_int32_t batchVal = batch[n * sliceSize + s] - batchScaleParams.offset; + cl_int32_t sliceVal = slice[s] - sliceScaleParams.offset; + cl_int32_t x = scale_i32i8p(batchVal, batchScaleParams, 0); + cl_int32_t y = scale_i32i8p(sliceVal, sliceScaleParams, 0); + dest[n * sliceSize + s] = clip(x + y + destOffset); + } } /// Size of the tile to be used for matrix multiplication. 
@@ -747,13 +590,9 @@ __kernel void batchedadd_i8W_32(__global void *mem, cl_uint32_t dest, /// workgroups with sizes which are at least as big as a tile. #define TILE_SIZE 8 -__kernel void matmul_tiled(__global void *mem, cl_uint32_t C_off, - cl_uint32_t A_off, cl_uint32_t B_off, ShapeNHWC ddim, - ShapeNHWC ldim, ShapeNHWC rdim) { - __global float *C = &mem[C_off]; - __global float *A = &mem[A_off]; - __global float *B = &mem[B_off]; - +__kernel void matmul_tiled(__global float *C, __global float *A, + __global float *B, ShapeNHWC ddim, ShapeNHWC ldim, + ShapeNHWC rdim) { int M = ldim.n; int N = rdim.h; int K = ldim.h; @@ -797,16 +636,12 @@ __kernel void matmul_tiled(__global void *mem, cl_uint32_t C_off, } } -__kernel void matmul_tiled_i8(__global void *mem, cl_uint32_t C_off, - cl_uint32_t A_off, cl_uint32_t B_off, - ShapeNHWC ddim, ShapeNHWC ldim, ShapeNHWC rdim, +__kernel void matmul_tiled_i8(__global cl_int8_t *C, __global cl_int8_t *A, + __global cl_int8_t *B, ShapeNHWC ddim, + ShapeNHWC ldim, ShapeNHWC rdim, cl_int32_t aOffset, cl_int32_t bOffset, cl_int32_t cOffset, QuantizationTransform32To8 destScaleParams) { - __global cl_int8_t *C = &mem[C_off]; - __global cl_int8_t *A = &mem[A_off]; - __global cl_int8_t *B = &mem[B_off]; - int M = ldim.n; int N = rdim.h; int K = ldim.h; @@ -870,12 +705,6 @@ __kernel void matmulK(__global float *dest, __global float *lhs, dest[getNHWC(ddim, x, y, 0, 0)] = sum; } -__kernel void matmulW(__global void *mem, cl_uint32_t dest, cl_uint32_t lhs, - cl_uint32_t rhs, ShapeNHWC ddim, ShapeNHWC ldim, - ShapeNHWC rdim) { - matmulK(&mem[dest], &mem[lhs], &mem[rhs], ddim, ldim, rdim); -} - __kernel void matmul_i8K(__global cl_int8_t *dest, __global cl_int8_t *lhs, __global cl_int8_t *rhs, ShapeNHWC ddim, ShapeNHWC ldim, ShapeNHWC rdim, cl_int32_t lhsOffset, @@ -898,18 +727,8 @@ __kernel void matmul_i8K(__global cl_int8_t *dest, __global cl_int8_t *lhs, clip(scale_i32i8(sum, destPre, destPost, destScale, destOffset)); } -__kernel void 
matmul_i8W(__global void *mem, cl_uint32_t dest, cl_uint32_t lhs, - cl_uint32_t rhs, ShapeNHWC ddim, ShapeNHWC ldim, - ShapeNHWC rdim, cl_int32_t lhsOffset, - cl_int32_t rhsOffset, cl_int32_t destOffset, - QuantizationTransform32To8 destScaleParams) { - matmul_i8K(&mem[dest], &mem[lhs], &mem[rhs], ddim, ldim, rdim, lhsOffset, - rhsOffset, destOffset, destScaleParams.pre, destScaleParams.post, - destScaleParams.scale); -} - __kernel void softmaxK(__global float *dest, __global float *src, - __global float *e_cache, cl_uint32_t sliceSize) { + cl_uint32_t sliceSize) { size_t i = get_global_id(0); float max_ = src[i * sliceSize]; for (size_t j = 0; j < sliceSize; j++) { @@ -923,18 +742,11 @@ __kernel void softmaxK(__global float *dest, __global float *src, } for (size_t j = 0; j < sliceSize; j++) { dest[i * sliceSize + j] /= sum; - if (e_cache) - e_cache[i * sliceSize + j] = dest[i * sliceSize + j]; } } -__kernel void softmaxW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, - cl_uint32_t sliceSize) { - softmaxK(&mem[dest], &mem[src], (__global float *)0, sliceSize); -} - -__kernel void softmaxgradK(__global float *inG, __global float *outW, - __global cl_uint64_t *selectedW, +__kernel void softmaxgradK(__global float *outW, __global float *origSrc, + __global cl_uint64_t *selectedW, __global float *inG, cl_uint32_t sliceSize) { size_t i = get_global_id(0); for (size_t j = 0; j < sliceSize; j++) { @@ -943,16 +755,10 @@ __kernel void softmaxgradK(__global float *inG, __global float *outW, } } -__kernel void softmaxgradW(__global void *mem, cl_uint32_t origDest, - cl_uint32_t origSrc, cl_uint32_t selected, - cl_uint32_t srcGrad, cl_uint32_t sliceSize) { - softmaxgradK(&mem[srcGrad], &mem[origDest], &mem[selected], sliceSize); -} - -__kernel void convolutionK(__global float *dest, __global float *src, - __global float *filter, __global float *bias, - ShapeHW kernelSizes, ShapeHW strides, - PaddingTLBR pads, cl_uint32_t group, +__kernel void convolutionK(__global 
void *unused, __global float *dest, + __global float *src, __global float *filter, + __global float *bias, ShapeHW kernelSizes, + ShapeHW strides, PaddingTLBR pads, cl_uint32_t group, cl_uint32_t dilation, ShapeNHWC odim, ShapeNHWC idim, ShapeNHWC filterDim) { size_t ax = get_global_id(0); @@ -996,16 +802,6 @@ __kernel void convolutionK(__global float *dest, __global float *src, } // N } -__kernel void convolutionW(__global void *mem, cl_uint32_t dest, - cl_uint32_t src, cl_uint32_t filter, - cl_uint32_t bias, ShapeHW kernelSizes, - ShapeHW strides, PaddingTLBR pads, cl_uint32_t group, - cl_uint32_t dilation, ShapeNHWC odim, ShapeNHWC idim, - ShapeNHWC filterDim) { - convolutionK(&mem[dest], &mem[src], &mem[filter], &mem[bias], kernelSizes, - strides, pads, group, dilation, odim, idim, filterDim); -} - __kernel void convolution_i8K(__global cl_int8_t *dest, __global cl_int8_t *src, __global cl_int8_t *filter, __global cl_int32_t *bias, @@ -1060,20 +856,6 @@ convolution_i8K(__global cl_int8_t *dest, __global cl_int8_t *src, } } -__kernel void -convolution_i8W(__global void *mem, cl_uint32_t dest, cl_uint32_t src, - cl_uint32_t filter, cl_uint32_t bias, ShapeHW kernelSizes, - ShapeHW strides, PaddingTLBR pads, cl_uint32_t group, - cl_uint32_t dilation, ShapeNHWC odim, ShapeNHWC idim, - ShapeNHWC filterDim, cl_int32_t destOffset, float destScale, - cl_int32_t srcOffset, float srcScale, cl_int32_t filterOffset, - float filterScale, cl_int32_t biasOffset, float biasScale) { - convolution_i8K(&mem[dest], &mem[src], &mem[filter], &mem[bias], kernelSizes, - strides, destOffset, destScale, srcOffset, srcScale, - filterOffset, filterScale, biasOffset, biasScale, pads, group, - dilation, odim, idim, filterDim); -} - __kernel void convolutiongradK(const __global float *inW, const __global float *filterW, const __global float *outG, __global float *inG, @@ -1127,19 +909,6 @@ __kernel void convolutiongradK(const __global float *inW, } // N } -__kernel void 
convolutiongradW(__global void *mem, cl_uint32_t src, - cl_uint32_t filter, cl_uint32_t destGrad, - cl_uint32_t srcGrad, cl_uint32_t filterGrad, - cl_uint32_t biasGrad, ShapeHW kernelSizes, - ShapeHW strides, PaddingTLBR pads, - cl_uint32_t group, cl_uint32_t dilation, - ShapeNHWC srcDim, ShapeNHWC destGradDim, - ShapeNHWC filterGradDim) { - convolutiongradK(&mem[src], &mem[filter], &mem[destGrad], &mem[srcGrad], - &mem[filterGrad], &mem[biasGrad], kernelSizes, strides, pads, - group, dilation, srcDim, destGradDim, filterGradDim); -} - __kernel void maxpoolK(__global float *dest, __global float *src, cl_uint32_t kernelSize, cl_uint32_t stride, PaddingTLBR pads, ShapeNHWC odim, ShapeNHWC idim) { @@ -1181,12 +950,6 @@ __kernel void maxpoolK(__global float *dest, __global float *src, } // N } -__kernel void maxpoolW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, - cl_uint32_t kernelSize, cl_uint32_t stride, - PaddingTLBR pads, ShapeNHWC odim, ShapeNHWC idim) { - maxpoolK(&mem[dest], &mem[src], kernelSize, stride, pads, odim, idim); -} - /// Macro to define a kernel for oclmaxpool. The body of /// the kernel is auto-generated by the macro. 
/// \p name the name of this kernel @@ -1225,11 +988,6 @@ __kernel void maxpoolW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, } \ dest[getNCHW(odim, n, d, ax, ay)] = maxVal; \ } \ - } \ - __kernel void name##W(__global void *mem, cl_uint32_t dest, cl_uint32_t src, \ - cl_uint32_t kernelSize, cl_uint32_t stride, \ - PaddingTLBR pads, ShapeNCHW odim, ShapeNCHW idim) { \ - name##K(&mem[dest], &mem[src], kernelSize, stride, pads, odim, idim); \ } DEFINE_OPENCL_MAXPOOL_KERNEL(oclmaxpool, float) DEFINE_OPENCL_MAXPOOL_KERNEL(oclmaxpool_i8, char) @@ -1283,16 +1041,7 @@ __kernel void maxpoolwithargmaxK(__global float *dest, __global float *src, } // N } -__kernel void maxpoolwithargmaxW(__global void *mem, cl_uint32_t dest, - cl_uint32_t src, cl_uint32_t argmax, - cl_uint32_t kernelSize, cl_uint32_t stride, - PaddingTLBR pads, ShapeNHWC odim, - ShapeNHWC idim) { - maxpoolwithargmaxK(&mem[dest], &mem[src], &mem[argmax], kernelSize, stride, - pads, odim, idim); -} - -__kernel void maxpoolwithargmaxgradK(__global float *dest, +__kernel void maxpoolwithargmaxgradK(__global float *dest, __global float *src, __global cl_uint64_t *argmax, __global float *destGrad, __global float *srcGrad, @@ -1321,19 +1070,6 @@ __kernel void maxpoolwithargmaxgradK(__global float *dest, } // C } -__kernel void maxpoolwithargmaxgradW(__global void *mem, cl_uint32_t dest, - cl_uint32_t src, cl_uint32_t argmax, - cl_uint32_t destGrad, cl_uint32_t srcGrad, - cl_uint32_t kernelSize, cl_uint32_t stride, - PaddingTLBR pads, ShapeNHWC srcGradDim, - ShapeNHWC destDim) { - // src operand is present on the instruction but not needed by the OpenCL - // kernel. 
- maxpoolwithargmaxgradK(&mem[dest], &mem[argmax], &mem[destGrad], - &mem[srcGrad], kernelSize, stride, pads, srcGradDim, - destDim); -} - __kernel void avgpoolK(__global float *dest, __global float *src, cl_uint32_t kernelSize, cl_uint32_t stride, PaddingTLBR pads, ShapeNHWC odim, ShapeNHWC idim) { @@ -1370,12 +1106,6 @@ __kernel void avgpoolK(__global float *dest, __global float *src, } // N } -__kernel void avgpoolW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, - cl_uint32_t kernelSize, cl_uint32_t stride, - PaddingTLBR pads, ShapeNHWC odim, ShapeNHWC idim) { - avgpoolK(&mem[dest], &mem[src], kernelSize, stride, pads, odim, idim); -} - __kernel void oclavgpoolK(__global float *dest, __global float *src, cl_uint32_t kernelSize, cl_uint32_t stride, PaddingTLBR pads, ShapeNCHW odim, ShapeNCHW idim) { @@ -1412,12 +1142,6 @@ __kernel void oclavgpoolK(__global float *dest, __global float *src, } // N } -__kernel void oclavgpoolW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, - cl_uint32_t kernelSize, cl_uint32_t stride, - PaddingTLBR pads, ShapeNCHW odim, ShapeNCHW idim) { - oclavgpoolK(&mem[dest], &mem[src], kernelSize, stride, pads, odim, idim); -} - __kernel void oclavgpool_i8K(__global cl_int8_t *dest, __global cl_int8_t *src, cl_uint32_t kernelSize, cl_uint32_t stride, PaddingTLBR pads, ShapeNCHW odim, ShapeNCHW idim, @@ -1456,17 +1180,6 @@ __kernel void oclavgpool_i8K(__global cl_int8_t *dest, __global cl_int8_t *src, } } -__kernel void oclavgpool_i8W(__global void *mem, cl_uint32_t dest, - cl_uint32_t src, cl_uint32_t kernelSize, - cl_uint32_t stride, PaddingTLBR pads, - ShapeNCHW odim, ShapeNCHW idim, - cl_int32_t srcOffset, - QuantizationTransform32To8 destScaleParams) { - oclavgpool_i8K(&mem[dest], &mem[src], kernelSize, stride, pads, odim, idim, - srcOffset, destScaleParams.offset, destScaleParams.pre, - destScaleParams.post, destScaleParams.scale); -} - /// Macro to define a kernel for transpose operations. 
The body of /// the kernel is auto-generated by the macro. /// \p type the type of the tensor elements and of the return value @@ -1488,10 +1201,6 @@ __kernel void oclavgpool_i8W(__global void *mem, cl_uint32_t dest, dest[dstIdx] = src[srcIdx]; \ } \ } \ - } \ - __kernel void name##W(__global void *mem, cl_uint32_t dest, cl_uint32_t src, \ - ShapeNHWC odim, ShapeNHWC idim, ShapeNHWC shuffle) { \ - name##K(&mem[dest], &mem[src], odim, idim, shuffle); \ } DEFINE_OPENCL_TRANSPOSE_KERNEL(transpose_i8, cl_int8_t) @@ -1531,11 +1240,6 @@ DEFINE_OPENCL_TRANSPOSE_KERNEL(transpose, float) } \ } \ } \ - } \ - __kernel void name##W(__global void *mem, cl_uint32_t dest, cl_uint32_t src, \ - ShapeNHWC odim, ShapeNHWC idim, ShapeNHWC offset, \ - cl_uint32_t count, cl_uint32_t axis) { \ - name##K(&mem[dest], &mem[src], odim, idim, offset, count, axis); \ } DEFINE_OPENCL_INSERT_TENSOR_KERNEL(inserttensor, float) DEFINE_OPENCL_INSERT_TENSOR_KERNEL(inserttensor_i8, char) @@ -1563,10 +1267,6 @@ DEFINE_OPENCL_INSERT_TENSOR_KERNEL(inserttensor_i8, char) dest[destIdx] = src[srcIdx]; \ } \ } \ - } \ - __kernel void name##W(__global void *mem, cl_uint32_t dest, cl_uint32_t src, \ - ShapeNHWC odim, ShapeNHWC idim, ShapeNHWC offset) { \ - name##K(&mem[dest], &mem[src], odim, idim, offset); \ } DEFINE_OPENCL_EXTRACT_TENSOR_KERNEL(extracttensor, float) DEFINE_OPENCL_EXTRACT_TENSOR_KERNEL(extracttensor_i8, char) @@ -1593,14 +1293,6 @@ __kernel void gatherK(__global float *dest, __global const float *src, } } -__kernel void gatherW(__global void *mem, cl_uint32_t dest, cl_uint32_t src, - cl_uint32_t indices, cl_uint32_t numIndices, - cl_uint32_t sliceSize, cl_uint32_t numSamples, - cl_uint32_t destSampleSize, cl_uint32_t srcSampleSize) { - gatherK(&mem[dest], &mem[src], &mem[indices], numIndices, sliceSize, - numSamples, destSampleSize, srcSampleSize); -} - __kernel void scatterdataK(__global float *data, __global cl_uint64_t *indices, __global const float *slices, cl_uint32_t sliceSize) { @@ 
-1610,12 +1302,6 @@ __kernel void scatterdataK(__global float *data, __global cl_uint64_t *indices, sliceSize); } -__kernel void scatterdataW(__global void *mem, cl_uint32_t data, - cl_uint32_t indices, cl_uint32_t slices, - cl_uint32_t sliceSize) { - scatterdataK(&mem[data], &mem[indices], &mem[slices], sliceSize); -} - __kernel void sparselengthsweightedsumK(__global float *dest, __global float *data, __global float *weights, @@ -1654,20 +1340,11 @@ __kernel void sparselengthsweightedsumK(__global float *dest, } } -__kernel void sparselengthsweightedsumW(__global void *mem, cl_uint32_t dest, - cl_uint32_t data, cl_uint32_t weights, - cl_uint32_t indices, - cl_uint32_t lengths, - cl_uint32_t dataSliceSize) { - sparselengthsweightedsumK(&mem[dest], &mem[data], &mem[weights], - &mem[indices], &mem[lengths], dataSliceSize); -} - __kernel void sparselengthsweightedsumgradK( - __global float *destGrad, __global float *dataGrad, - __global float *weightsGrad, __global float *data, __global float *weights, + __global float *data, __global float *weights, __global cl_uint64_t *indices, __global cl_int32_t *lengths, - cl_uint32_t segments, cl_uint32_t sliceSize) { + __global float *destGrad, __global float *dataGrad, + __global float *weightsGrad, cl_uint32_t segments, cl_uint32_t sliceSize) { // For each segment: for (cl_uint32_t i = 0, curIdx = 0; i < segments; ++i) { @@ -1698,16 +1375,5 @@ __kernel void sparselengthsweightedsumgradK( } } -__kernel void -sparselengthsweightedsumgradW(__global void *mem, cl_uint32_t data, - cl_uint32_t weights, cl_uint32_t indices, - cl_uint32_t lengths, cl_uint32_t destGrad, - cl_uint32_t dataGrad, cl_uint32_t weightsGrad, - cl_uint32_t segments, cl_uint32_t sliceSize) { - sparselengthsweightedsumgradK( - &mem[destGrad], &mem[dataGrad], &mem[weightsGrad], &mem[data], - &mem[weights], &mem[indices], &mem[lengths], segments, sliceSize); -} - /// An empty kernel used as a checkpoint for TraceEvents. 
__kernel void checkpoint(__global void *mem) {} diff --git a/lib/Backends/OpenCL/kernels_fwd_conv.cl b/lib/Backends/OpenCL/kernels_fwd_conv.cl index 3dc01364ba..9f347ab863 100644 --- a/lib/Backends/OpenCL/kernels_fwd_conv.cl +++ b/lib/Backends/OpenCL/kernels_fwd_conv.cl @@ -128,13 +128,8 @@ __kernel __attribute__((reqd_work_group_size(workgroup_size_0, workgroup_size_1, 1))) __attribute__((vec_type_hint(Dtype4))) void - conv_forward_mem(__global void *mem, unsigned im_in_offset, - unsigned wg_offset, unsigned bias_offset, - unsigned im_out_offset) { - __global const Dtype *im_in = &mem[im_in_offset]; - __global const Dtype *wg = &mem[wg_offset]; - __global const Dtype *bias = &mem[bias_offset]; - __global Dtype *im_out = &mem[im_out_offset]; + conv_forward_mem(__global const Dtype *im_in, __global const Dtype *wg, + __global const Dtype *bias, __global Dtype *im_out) { // Thread identifiers. // Local row ID (max: RTSM=TSM/WPTM). const int_tp tidn = get_local_id(0); diff --git a/lib/Backends/OpenCL/kernels_fwd_quantized_conv.cl b/lib/Backends/OpenCL/kernels_fwd_quantized_conv.cl index 094fdfb2c8..6437b3f233 100644 --- a/lib/Backends/OpenCL/kernels_fwd_quantized_conv.cl +++ b/lib/Backends/OpenCL/kernels_fwd_quantized_conv.cl @@ -129,15 +129,11 @@ char clip(int val) { return (char)min(max(val, -128), 127); } __kernel __attribute__((reqd_work_group_size(workgroup_size_0, workgroup_size_1, 1))) __attribute__((vec_type_hint(Dtype4))) void - conv_forward_mem_i8(__global void *mem, unsigned im_in_offset, - unsigned wg_offset, unsigned bias_offset, - unsigned im_out_offset, int a_offset, float a_scale, - int b_offset, float b_scale, int c_offset, - float c_scale, int d_offset, float d_scale) { - __global const Dtype *im_in = &mem[im_in_offset]; - __global const Dtype *wg = &mem[wg_offset]; - __global const int *bias = &mem[bias_offset]; - __global Dtype *im_out = &mem[im_out_offset]; + conv_forward_mem_i8(__global const Dtype *im_in, __global const Dtype *wg, + __global 
const int *bias, __global Dtype *im_out, + int a_offset, float a_scale, int b_offset, + float b_scale, int c_offset, float c_scale, + int d_offset, float d_scale) { // Thread identifiers. // Local row ID (max: RTSM=TSM/WPTM). const int_tp tidn = get_local_id(0);