From 855eb38997854fd5649b5575a0b78004c621a8c7 Mon Sep 17 00:00:00 2001 From: Andrew Lamontagne Date: Wed, 22 Jan 2025 22:17:45 -0700 Subject: [PATCH 1/3] Fix compressed to compressed texture conversion leaving leaked memory/threads --- cmp_compressonatorlib/compressonator.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cmp_compressonatorlib/compressonator.cpp b/cmp_compressonatorlib/compressonator.cpp index f9f068f19..9e732c7a0 100644 --- a/cmp_compressonatorlib/compressonator.cpp +++ b/cmp_compressonatorlib/compressonator.cpp @@ -465,6 +465,12 @@ CMP_ERROR CMP_API CMP_ConvertTexture(CMP_Texture* pSourceTexture, } RESTORE_FP_EXCEPTIONS; + SAFE_DELETE(pCodecIn); + SAFE_DELETE(pCodecOut); + SAFE_DELETE(pSrcBuffer); + SAFE_DELETE(pTempBuffer); + SAFE_DELETE(pDestBuffer); + return GetError(err2); } } From f464fd8d16022f4702cfbeeb374abd118e79c49f Mon Sep 17 00:00:00 2001 From: Andrew Lamontagne Date: Fri, 25 Jul 2025 21:04:08 -0700 Subject: [PATCH 2/3] Disallow SSE and AVX when compiling for non-x86 targets --- CMakeLists.txt | 51 ++++++++++ build/sdk/cmp_core/CMakeLists.txt | 162 +++++++++++++++--------------- cmp_core/CMakeLists.txt | 82 ++++++++------- cmp_core/shaders/bc1_cmp.h | 5 + 4 files changed, 182 insertions(+), 118 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4cb219698..de348cb58 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,6 +40,57 @@ message("Building Compressonator version ${PROJECT_VERSION_MAJOR}.${PROJECT_VERS # ------------------------------ include(cmake/helperfunctions.cmake) +# ---------------------------------- +# Check the target architecture +# ---------------------------------- +################################################################################ +# Figure out build type and target platform +################################################################################ + +include(CheckCCompilerFlag) + +# See what CPU we appear to be targeting... 
+macro(check_cpu _CDEFS _NAME _VALUE) + check_c_source_compiles( + " + #if ( ${_CDEFS} ) + int main(int argc, char **argv) { int yup = 1; return 0; } + #else + #error Not targeting this CPU architecture. + #endif + " + ${_VALUE} + ) + + if(${_VALUE}) + if(AMD_COMPRESSONATOR_CHOSE_CPU) + message(STATUS "We appear to see two different CPU architectures!") + message(STATUS "We saw '${AMD_COMPRESSONATOR_CPU}' and '${_NAME}'.") + message(FATAL_ERROR "Please fix this before continuing.") + endif() + set(AMD_COMPRESSONATOR_CHOSE_CPU TRUE) + set(AMD_COMPRESSONATOR_CPU ${_NAME}) + add_compile_definitions(${_VALUE}=1) + endif() +endmacro(check_cpu) + +check_cpu( + "defined(__i386__) || defined(__i686__) || defined(_M_IX86) || defined(i386)" + "x86" AMD_COMPRESSONATOR_X86 +) + +check_cpu("defined(__x86_64__) || defined(_M_X64)" "amd64" AMD_COMPRESSONATOR_AMD64) + +check_cpu("defined(__EMSCRIPTEN__)" "emscripten" AMD_COMPRESSONATOR_EMSCRIPTEN) + +check_cpu("defined(__arm__)" "arm" AMD_COMPRESSONATOR_ARM) + +check_cpu("defined(__arm64__) || defined(__aarch64__)" "arm64" AMD_COMPRESSONATOR_ARM64) + +if (NOT AMD_COMPRESSONATOR_CHOSE_CPU) + message(FATAL_ERROR "We don't support this architecture yet") +endif() + # ------------------------------ # Common compiler options diff --git a/build/sdk/cmp_core/CMakeLists.txt b/build/sdk/cmp_core/CMakeLists.txt index 680616af8..06ce095c2 100644 --- a/build/sdk/cmp_core/CMakeLists.txt +++ b/build/sdk/cmp_core/CMakeLists.txt @@ -72,82 +72,86 @@ # Core SIMD options -# SSE -add_library(CMP_Core_SSE OBJECT) -target_sources( - CMP_Core_SSE - PRIVATE - ${COMPRESSONATOR_ROOT_PATH}/cmp_core/source/core_simd_sse.cpp -) - -target_include_directories( - CMP_Core_SSE - PRIVATE - ${COMPRESSONATOR_ROOT_PATH}/cmp_core/source - ${COMPRESSONATOR_ROOT_PATH}/cmp_core/shaders -) - -if (UNIX) - target_compile_options(CMP_Core_SSE PRIVATE -march=nehalem) -endif() - -set_target_properties(CMP_Core_SSE PROPERTIES - FOLDER ${PROJECT_FOLDER_SDK_LIBS} - 
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin" - ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin" -) - -# AVX -add_library(CMP_Core_AVX OBJECT) -target_sources( - CMP_Core_AVX - PRIVATE - ${COMPRESSONATOR_ROOT_PATH}/cmp_core/source/core_simd_avx.cpp -) -target_include_directories( - CMP_Core_AVX - PRIVATE - ${COMPRESSONATOR_ROOT_PATH}/cmp_core/source - ${COMPRESSONATOR_ROOT_PATH}/cmp_core/shaders -) - -if (WIN32) - target_compile_options(CMP_Core_AVX PRIVATE /arch:AVX2) -else() - target_compile_options(CMP_Core_AVX PRIVATE -march=haswell) -endif() - -set_target_properties(CMP_Core_AVX PROPERTIES - FOLDER ${PROJECT_FOLDER_SDK_LIBS} - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin" - ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin" -) - -# AVX-512 -add_library(CMP_Core_AVX512 OBJECT) -target_sources( - CMP_Core_AVX512 - PRIVATE - ${COMPRESSONATOR_ROOT_PATH}/cmp_core/source/core_simd_avx512.cpp -) -target_include_directories( - CMP_Core_AVX512 - PRIVATE - ${COMPRESSONATOR_ROOT_PATH}/cmp_core/source - ${COMPRESSONATOR_ROOT_PATH}/cmp_core/shaders -) - -if (WIN32) - target_compile_options(CMP_Core_AVX512 PRIVATE /arch:AVX-512) -else() - target_compile_options(CMP_Core_AVX512 PRIVATE -march=knl) -endif() - -set_target_properties(CMP_Core_AVX512 PROPERTIES - FOLDER ${PROJECT_FOLDER_SDK_LIBS} - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin" - ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin" -) - -# Link SIMD libraries to CMP_Core -target_link_libraries(CMP_Core PRIVATE CMP_Core_SSE CMP_Core_AVX CMP_Core_AVX512) \ No newline at end of file +# Metallicafan212: Actually check for support before linking it +# This fixes ARM builds +if(AMD_COMPRESSONATOR_AMD64 OR AMD_COMPRESSONATOR_X86) + # SSE + add_library(CMP_Core_SSE OBJECT) + target_sources( + CMP_Core_SSE + PRIVATE + ${COMPRESSONATOR_ROOT_PATH}/cmp_core/source/core_simd_sse.cpp + ) + + target_include_directories( + CMP_Core_SSE + PRIVATE + ${COMPRESSONATOR_ROOT_PATH}/cmp_core/source + 
${COMPRESSONATOR_ROOT_PATH}/cmp_core/shaders + ) + + if (UNIX) + target_compile_options(CMP_Core_SSE PRIVATE -march=nehalem) + endif() + + set_target_properties(CMP_Core_SSE PROPERTIES + FOLDER ${PROJECT_FOLDER_SDK_LIBS} + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin" + ) + + # AVX + add_library(CMP_Core_AVX OBJECT) + target_sources( + CMP_Core_AVX + PRIVATE + ${COMPRESSONATOR_ROOT_PATH}/cmp_core/source/core_simd_avx.cpp + ) + target_include_directories( + CMP_Core_AVX + PRIVATE + ${COMPRESSONATOR_ROOT_PATH}/cmp_core/source + ${COMPRESSONATOR_ROOT_PATH}/cmp_core/shaders + ) + + if (WIN32) + target_compile_options(CMP_Core_AVX PRIVATE /arch:AVX2) + else() + target_compile_options(CMP_Core_AVX PRIVATE -march=haswell) + endif() + + set_target_properties(CMP_Core_AVX PROPERTIES + FOLDER ${PROJECT_FOLDER_SDK_LIBS} + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin" + ) + + # AVX-512 + add_library(CMP_Core_AVX512 OBJECT) + target_sources( + CMP_Core_AVX512 + PRIVATE + ${COMPRESSONATOR_ROOT_PATH}/cmp_core/source/core_simd_avx512.cpp + ) + target_include_directories( + CMP_Core_AVX512 + PRIVATE + ${COMPRESSONATOR_ROOT_PATH}/cmp_core/source + ${COMPRESSONATOR_ROOT_PATH}/cmp_core/shaders + ) + + if (WIN32) + target_compile_options(CMP_Core_AVX512 PRIVATE /arch:AVX-512) + else() + target_compile_options(CMP_Core_AVX512 PRIVATE -march=knl) + endif() + + set_target_properties(CMP_Core_AVX512 PROPERTIES + FOLDER ${PROJECT_FOLDER_SDK_LIBS} + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin" + ) + + # Link SIMD libraries to CMP_Core + target_link_libraries(CMP_Core PRIVATE CMP_Core_SSE CMP_Core_AVX CMP_Core_AVX512) +endif() \ No newline at end of file diff --git a/cmp_core/CMakeLists.txt b/cmp_core/CMakeLists.txt index 478737708..a3b462dc8 100644 --- a/cmp_core/CMakeLists.txt +++ b/cmp_core/CMakeLists.txt @@ -66,42 
+66,46 @@ set_target_properties(CMP_Core PROPERTIES FOLDER ${PROJECT_FOLDER_SDK_LIBS}) # Core SIMD options -# SSE -add_library(CMP_Core_SSE STATIC) -target_sources(CMP_Core_SSE PRIVATE source/core_simd_sse.cpp) -target_include_directories(CMP_Core_SSE PRIVATE source shaders) - -if (UNIX) - target_compile_options(CMP_Core_SSE PRIVATE -march=nehalem) -endif() - -set_target_properties(CMP_Core_SSE PROPERTIES FOLDER ${PROJECT_FOLDER_SDK_LIBS}) - -# AVX -add_library(CMP_Core_AVX STATIC) -target_sources(CMP_Core_AVX PRIVATE source/core_simd_avx.cpp) -target_include_directories(CMP_Core_AVX PRIVATE source shaders) - -if (WIN32) - target_compile_options(CMP_Core_AVX PRIVATE /arch:AVX2) -else() - target_compile_options(CMP_Core_AVX PRIVATE -march=haswell) -endif() - -set_target_properties(CMP_Core_AVX PROPERTIES FOLDER ${PROJECT_FOLDER_SDK_LIBS}) - -# AVX-512 -add_library(CMP_Core_AVX512 STATIC) -target_sources(CMP_Core_AVX512 PRIVATE source/core_simd_avx512.cpp) -target_include_directories(CMP_Core_AVX512 PRIVATE source shaders) - -if (WIN32) - target_compile_options(CMP_Core_AVX512 PRIVATE /arch:AVX-512) -else() - target_compile_options(CMP_Core_AVX512 PRIVATE -march=knl) -endif() - -set_target_properties(CMP_Core_AVX512 PROPERTIES FOLDER ${PROJECT_FOLDER_SDK_LIBS}) - -# Link SIMD libraries to CMP_Core -target_link_libraries(CMP_Core PRIVATE CMP_Core_SSE CMP_Core_AVX CMP_Core_AVX512) \ No newline at end of file +# Metallicafan212: Actually check for support before linking it +# This fixes ARM builds +if(AMD_COMPRESSONATOR_AMD64 OR AMD_COMPRESSONATOR_X86) + # SSE + add_library(CMP_Core_SSE STATIC) + target_sources(CMP_Core_SSE PRIVATE source/core_simd_sse.cpp) + target_include_directories(CMP_Core_SSE PRIVATE source shaders) + + if (UNIX) + target_compile_options(CMP_Core_SSE PRIVATE -march=nehalem) + endif() + + set_target_properties(CMP_Core_SSE PROPERTIES FOLDER ${PROJECT_FOLDER_SDK_LIBS}) + + # AVX + add_library(CMP_Core_AVX STATIC) + target_sources(CMP_Core_AVX 
PRIVATE source/core_simd_avx.cpp) + target_include_directories(CMP_Core_AVX PRIVATE source shaders) + + if (WIN32) + target_compile_options(CMP_Core_AVX PRIVATE /arch:AVX2) + else() + target_compile_options(CMP_Core_AVX PRIVATE -march=haswell) + endif() + + set_target_properties(CMP_Core_AVX PROPERTIES FOLDER ${PROJECT_FOLDER_SDK_LIBS}) + + # AVX-512 + add_library(CMP_Core_AVX512 STATIC) + target_sources(CMP_Core_AVX512 PRIVATE source/core_simd_avx512.cpp) + target_include_directories(CMP_Core_AVX512 PRIVATE source shaders) + + if (WIN32) + target_compile_options(CMP_Core_AVX512 PRIVATE /arch:AVX-512) + else() + target_compile_options(CMP_Core_AVX512 PRIVATE -march=knl) + endif() + + set_target_properties(CMP_Core_AVX512 PROPERTIES FOLDER ${PROJECT_FOLDER_SDK_LIBS}) + + # Link SIMD libraries to CMP_Core + target_link_libraries(CMP_Core PRIVATE CMP_Core_SSE CMP_Core_AVX CMP_Core_AVX512) +endif() \ No newline at end of file diff --git a/cmp_core/shaders/bc1_cmp.h b/cmp_core/shaders/bc1_cmp.h index c97315dd5..70e1054be 100644 --- a/cmp_core/shaders/bc1_cmp.h +++ b/cmp_core/shaders/bc1_cmp.h @@ -96,6 +96,8 @@ CMP_STATIC CGU_FLOAT (*cpu_bc1ComputeBestEndpoints)(CGU_FLOAT*, CGU_FLOAT*, CGU_ // NOTE: The requested extension will only be enabled if it is supported by the current CPU. 
CMP_STATIC bool bc1ToggleSIMD(CGU_INT newExtension) { + // Metallicafan212: Don't evaluate on non-X86 platforms +#if AMD_COMPRESSONATOR_AMD64 || AMD_COMPRESSONATOR_X86 CGU_BOOL useAVX512 = true; CGU_BOOL useAVX2 = true; CGU_BOOL useSSE42 = true; @@ -125,6 +127,9 @@ CMP_STATIC bool bc1ToggleSIMD(CGU_INT newExtension) { cpu_bc1ComputeBestEndpoints = _cpu_bc1ComputeBestEndpoints; } +#else + cpu_bc1ComputeBestEndpoints = _cpu_bc1ComputeBestEndpoints; +#endif g_bc1FunctionPointersSet = true; From 4b723803e2eaf579a09fc925a6c9c1affda75672 Mon Sep 17 00:00:00 2001 From: metallicafan212 Date: Fri, 25 Jul 2025 23:46:08 -0700 Subject: [PATCH 3/3] Fix variable scope when compiling in non-x86 --- cmp_core/shaders/bc1_cmp.h | 6943 ++++++++++++++++++------------------ 1 file changed, 3473 insertions(+), 3470 deletions(-) diff --git a/cmp_core/shaders/bc1_cmp.h b/cmp_core/shaders/bc1_cmp.h index 70e1054be..892af2e30 100644 --- a/cmp_core/shaders/bc1_cmp.h +++ b/cmp_core/shaders/bc1_cmp.h @@ -1,3470 +1,3473 @@ -//===================================================================== -// Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files(the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions : -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. 
-// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. -// -// File: bc1_cmp.h -//-------------------------------------------------------------------------------------- -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. -//-------------------------------------------------------------------------------------- - -#define USE_CMP - -#include "common_def.h" -#include "bcn_common_kernel.h" -#include "bcn_common_api.h" - -#ifndef ASPM_GPU -#include "cpu_extensions.h" -#include "core_simd.h" -#endif - -//----------------------------------------------------------------------- -// When build is for CPU, we have some missing API calls common to GPU -// Use CPU CMP_Core replacements -//----------------------------------------------------------------------- -#if defined(ASPM_GPU) || defined(ASPM_HLSL) || defined(ASPM_OPENCL) -#define ALIGN_16 -#define ALIGN_32 -#define ALIGN_64 -#else -#include INC_cmp_math_func -#if defined(_WIN32) || defined(_WIN64) -#define ALIGN_16 __declspec(align(16)) -#define ALIGN_32 __declspec(align(32)) -#define ALIGN_64 __declspec(align(64)) -#else // !WIN32 && !_WIN64 -#define ALIGN_16 __attribute__((aligned(16))) -#define ALIGN_32 __attribute__((aligned(32))) -#define ALIGN_64 __attribute__((aligned(64))) -#endif // !WIN32 && !_WIN64 -#endif - -#define USE_REFINE3D -#define USE_REFINE - -#ifndef MAX_ERROR -#define MAX_ERROR 128000.f -#endif - -#define NUM_CHANNELS 4 -#define NUM_ENDPOINTS 2 - -#ifndef CMP_QUALITY0 -#define CMP_QUALITY0 0.25f -#endif - -#ifndef CMP_QUALITY1 
-#define CMP_QUALITY1 0.50f -#endif - -#ifndef CMP_QUALITY2 -#define CMP_QUALITY2 0.75f -#endif - -#define EPS (2.f / 255.f) * (2.f / 255.f) -#define EPS2 3.f * (2.f / 255.f) * (2.f / 255.f) - -// Disable SIMD code during GPU builds -#if !defined(ASPM_GPU) -CMP_STATIC CGU_BOOL g_bc1FunctionPointersSet = false; - -// declarations for SIMD function variations -CMP_STATIC CGU_FLOAT _cpu_bc1ComputeBestEndpoints(CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, int, int); - -// function pointers -CMP_STATIC CGU_FLOAT (*cpu_bc1ComputeBestEndpoints)(CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, int, int) = 0; - -// Toggle which SIMD instruction set extensions to use. Setting this to EXTENSION_COUNT will enable auto-detection of supported extensions. -// NOTE: The requested extension will only be enabled if it is supported by the current CPU. -CMP_STATIC bool bc1ToggleSIMD(CGU_INT newExtension) -{ - // Metallicafan212: Don't evaluate on non-X86 platforms -#if AMD_COMPRESSONATOR_AMD64 || AMD_COMPRESSONATOR_X86 - CGU_BOOL useAVX512 = true; - CGU_BOOL useAVX2 = true; - CGU_BOOL useSSE42 = true; - - CPUExtensions extensions = GetCPUExtensions(); - - if (newExtension < EXTENSION_COUNT) // user requested a specific instruction set extension - { - useAVX512 = newExtension == EXTENSION_AVX512_F; - useAVX2 = newExtension == EXTENSION_AVX2; - useSSE42 = newExtension == EXTENSION_SSE42; - } - - if (useAVX512 && IsAvailableAVX512(extensions)) - { - cpu_bc1ComputeBestEndpoints = avx512_bc1ComputeBestEndpoints; - } - else if (useAVX2 && IsAvailableAVX2(extensions)) - { - cpu_bc1ComputeBestEndpoints = avx_bc1ComputeBestEndpoints; - } - else if (useSSE42 && IsAvailableSSE4(extensions)) - { - cpu_bc1ComputeBestEndpoints = sse_bc1ComputeBestEndpoints; - } - else - { - cpu_bc1ComputeBestEndpoints = _cpu_bc1ComputeBestEndpoints; - } -#else - cpu_bc1ComputeBestEndpoints = _cpu_bc1ComputeBestEndpoints; -#endif - - g_bc1FunctionPointersSet = true; - - bool result = true; 
- - if (newExtension != EXTENSION_COUNT && (useAVX512 && !IsAvailableAVX512(extensions)) || (useAVX2 && !IsAvailableAVX2(extensions)) || - (useSSE42 && !IsAvailableSSE4(extensions))) - result = false; - - return result; -} -#endif - -static CGU_FLOAT cgu_getRampErr(CGU_FLOAT Prj[BLOCK_SIZE_4X4], - CGU_FLOAT PrjErr[BLOCK_SIZE_4X4], - CGU_FLOAT PreMRep[BLOCK_SIZE_4X4], - CGU_FLOAT StepErr, - CGU_FLOAT lowPosStep, - CGU_FLOAT highPosStep, - CGU_UINT32 dwUniqueColors) -{ - CGU_FLOAT error = 0; - CGU_FLOAT step = (highPosStep - lowPosStep) / 3; // using (dwNumChannels=4 - 1); - CGU_FLOAT step_h = step * (CGU_FLOAT)0.5; - CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step; - - for (CGU_UINT32 i = 0; i < dwUniqueColors; i++) - { - CGU_FLOAT v; - // Work out which value in the block this select - CGU_FLOAT del; - - if ((del = Prj[i] - lowPosStep) <= 0) - v = lowPosStep; - else if (Prj[i] - highPosStep >= 0) - v = highPosStep; - else - v = cmp_floor((del + step_h) * rstep) * step + lowPosStep; - - // And accumulate the error - CGU_FLOAT d = (Prj[i] - v); - d *= d; - CGU_FLOAT err = PreMRep[i] * d + PrjErr[i]; - error += err; - if (StepErr < error) - { - error = StepErr; - break; - } - } - return error; -} - -CMP_STATIC CMP_EndPoints cgu_CompressRGBBlockX(CMP_IN CGU_Vec3f BlkInBGRf_UV[BLOCK_SIZE_4X4], - CMP_IN CGU_FLOAT Rpt[BLOCK_SIZE_4X4], - CMP_IN CGU_UINT32 dwUniqueColors, - CMP_IN CGU_Vec3f channelWeightsBGR, - CMP_IN CGU_BOOL b3DRefinement) -{ - CMP_UNUSED(channelWeightsBGR); - CMP_UNUSED(b3DRefinement); - CGU_FLOAT ALIGN_16 Prj0[BLOCK_SIZE_4X4]; - CGU_FLOAT ALIGN_16 Prj[BLOCK_SIZE_4X4]; - CGU_FLOAT ALIGN_16 PrjErr[BLOCK_SIZE_4X4]; - CGU_FLOAT ALIGN_16 RmpIndxs[BLOCK_SIZE_4X4]; - - CGU_Vec3f LineDirG; - CGU_Vec3f LineDir; - CGU_FLOAT LineDir0[NUM_CHANNELS]; - CGU_Vec3f BlkUV[BLOCK_SIZE_4X4]; - CGU_Vec3f BlkSh[BLOCK_SIZE_4X4]; - CGU_Vec3f Mdl; - - CGU_Vec3f rsltC0; - CGU_Vec3f rsltC1; - CGU_Vec3f PosG0 = {0.0f, 0.0f, 0.0f}; - CGU_Vec3f PosG1 = {0.0f, 0.0f, 0.0f}; - CGU_UINT32 i; - 
- for (i = 0; i < dwUniqueColors; i++) - { - BlkUV[i] = BlkInBGRf_UV[i]; - } - - // if not more then 2 different colors, we've done - if (dwUniqueColors <= 2) - { - rsltC0 = BlkInBGRf_UV[0] * 255.0f; - rsltC1 = BlkInBGRf_UV[dwUniqueColors - 1] * 255.0f; - } - else - { - // This is our first attempt to find an axis we will go along. - // The cumulation is done to find a line minimizing the MSE from the - // input 3D points. - - // While trying to find the axis we found that the diameter of the input - // set is quite small. Do not bother. - - // FindAxisIsSmall(BlkSh, LineDir0, Mdl, Blk, Rpt,dwUniqueColors); - { - CGU_UINT32 ii; - CGU_UINT32 jj; - CGU_UINT32 kk; - - // These vars cannot be Vec3 as index to them are varying - CGU_FLOAT Crrl[NUM_CHANNELS]; - CGU_FLOAT RGB2[NUM_CHANNELS]; - - LineDir0[0] = LineDir0[1] = LineDir0[2] = RGB2[0] = RGB2[1] = RGB2[2] = Crrl[0] = Crrl[1] = Crrl[2] = Mdl.x = Mdl.y = Mdl.z = 0.f; - - // sum position of all points - CGU_FLOAT fNumPoints = 0.0f; - for (ii = 0; ii < dwUniqueColors; ii++) - { - Mdl.x += BlkUV[ii].x * Rpt[ii]; - Mdl.y += BlkUV[ii].y * Rpt[ii]; - Mdl.z += BlkUV[ii].z * Rpt[ii]; - fNumPoints += Rpt[ii]; - } - - // and then average to calculate center coordinate of block - Mdl /= fNumPoints; - - for (ii = 0; ii < dwUniqueColors; ii++) - { - // calculate output block as offsets around block center - BlkSh[ii] = BlkUV[ii] - Mdl; - - // compute correlation matrix - // RGB2 = sum of ((distance from point from center) squared) - RGB2[0] += BlkSh[ii].x * BlkSh[ii].x * Rpt[ii]; - RGB2[1] += BlkSh[ii].y * BlkSh[ii].y * Rpt[ii]; - RGB2[2] += BlkSh[ii].z * BlkSh[ii].z * Rpt[ii]; - - Crrl[0] += BlkSh[ii].x * BlkSh[ii].y * Rpt[ii]; - Crrl[1] += BlkSh[ii].y * BlkSh[ii].z * Rpt[ii]; - Crrl[2] += BlkSh[ii].z * BlkSh[ii].x * Rpt[ii]; - } - - // if set's diameter is small - CGU_UINT32 i0 = 0, i1 = 1; - CGU_FLOAT mxRGB2 = 0.0f; - - CGU_FLOAT fEPS = fNumPoints * EPS; - for (kk = 0, jj = 0; jj < 3; jj++) - { - if (RGB2[jj] >= fEPS) - 
kk++; - else - RGB2[jj] = 0.0f; - - if (mxRGB2 < RGB2[jj]) - { - mxRGB2 = RGB2[jj]; - i0 = jj; - } - } - - CGU_FLOAT fEPS2 = fNumPoints * EPS2; - CGU_BOOL AxisIsSmall; - - AxisIsSmall = (RGB2[0] < fEPS2); - AxisIsSmall = AxisIsSmall && (RGB2[1] < fEPS2); - AxisIsSmall = AxisIsSmall && (RGB2[2] < fEPS2); - - // all are very small to avoid division on the small determinant - if (AxisIsSmall) - { - rsltC0 = BlkInBGRf_UV[0] * 255.0f; - rsltC1 = BlkInBGRf_UV[dwUniqueColors - 1] * 255.0f; - } - else - { - // !AxisIsSmall - if (kk == 1) // really only 1 dimension - LineDir0[i0] = 1.; - else if (kk == 2) - { // really only 2 dimensions - i1 = (RGB2[(i0 + 1) % 3] > 0.f) ? (i0 + 1) % 3 : (i0 + 2) % 3; - CGU_FLOAT Crl = (i1 == (i0 + 1) % 3) ? Crrl[i0] : Crrl[(i0 + 2) % 3]; - LineDir0[i1] = Crl / RGB2[i0]; - LineDir0[i0] = 1.; - } - else - { - CGU_FLOAT maxDet = 100000.f; - CGU_FLOAT Cs[3]; - // select max det for precision - for (jj = 0; jj < 3; jj++) - { - // 3 = nDimensions - CGU_FLOAT Det = RGB2[jj] * RGB2[(jj + 1) % 3] - Crrl[jj] * Crrl[jj]; - Cs[jj] = cmp_fabs(Crrl[jj] / sqrt(RGB2[jj] * RGB2[(jj + 1) % 3])); - if (maxDet < Det) - { - maxDet = Det; - i0 = jj; - } - } - - // inverse correl matrix - // -- -- -- -- - // | A B | | C -B | - // | B C | => | -B A | - // -- -- -- -- - CGU_FLOAT mtrx1[2][2]; - CGU_FLOAT vc1[2]; - CGU_FLOAT vc[2]; - vc1[0] = Crrl[(i0 + 2) % 3]; - vc1[1] = Crrl[(i0 + 1) % 3]; - // C - mtrx1[0][0] = RGB2[(i0 + 1) % 3]; - // A - mtrx1[1][1] = RGB2[i0]; - // -B - mtrx1[1][0] = mtrx1[0][1] = -Crrl[i0]; - // find a solution - vc[0] = mtrx1[0][0] * vc1[0] + mtrx1[0][1] * vc1[1]; - vc[1] = mtrx1[1][0] * vc1[0] + mtrx1[1][1] * vc1[1]; - // normalize - vc[0] /= maxDet; - vc[1] /= maxDet; - // find a line direction vector - LineDir0[i0] = 1.; - LineDir0[(i0 + 1) % 3] = 1.; - LineDir0[(i0 + 2) % 3] = vc[0] + vc[1]; - } - - // normalize direction vector - CGU_FLOAT Len = LineDir0[0] * LineDir0[0] + LineDir0[1] * LineDir0[1] + LineDir0[2] * LineDir0[2]; - Len = 
sqrt(Len); - - LineDir0[0] = (Len > 0.f) ? LineDir0[0] / Len : 0.0f; - LineDir0[1] = (Len > 0.f) ? LineDir0[1] / Len : 0.0f; - LineDir0[2] = (Len > 0.f) ? LineDir0[2] / Len : 0.0f; - } - } // FindAxisIsSmall - - // GCC is being an awful being when it comes to goto-jumps. - // So please bear with this. - CGU_FLOAT ErrG = 10000000.f; - CGU_FLOAT PrjBnd0; - CGU_FLOAT PrjBnd1; - CGU_FLOAT ALIGN_16 PreMRep[BLOCK_SIZE_4X4]; - - LineDir.x = LineDir0[0]; - LineDir.y = LineDir0[1]; - LineDir.z = LineDir0[2]; - - // Here is the main loop. - // 1. Project input set on the axis in consideration. - // 2. Run 1 dimensional search (see scalar case) to find an (sub) optimal pair of end points. - // 3. Compute the vector of indexes (or clusters) for the current approximate ramp. - // 4. Present our color channels as 3 16DIM vectors. - // 5. Find closest approximation of each of 16DIM color vector with the projection of the 16DIM index vector. - // 6. Plug the projections as a new directional vector for the axis. - // 7. Goto 1. - // D - is 16 dim "index" vector (or 16 DIM vector of indexes - {0, 1/3,2/3, 0, ...,}, but shifted and normalized). - // Ci - is a 16 dim vector of color i. for each Ci find a scalar Ai such that (Ai * D - Ci) (Ai * D - Ci) -> min , - // i.e distance between vector AiD and C is min. You can think of D as a unit interval(vector) "clusterizer", and Ai is a scale - // you need to apply to the clusterizer to approximate the Ci vector instead of the unit vector. - // Solution is - // Ai = (D . Ci) / (D . D); . - is a dot product. - // in 3 dim space Ai(s) represent a line direction, along which - // we again try to find (sub)optimal quantizer. - // That's what our for(;;) loop is about. - for (;;) - { - // 1. Project input set on the axis in consideration. 
- // From Foley & Van Dam: Closest point of approach of a line (P + v) to a - // point (R) is - // P + ((R-P).v) / (v.v))v - // The distance along v is therefore (R-P).v / (v.v) - // (v.v) is 1 if v is a unit vector. - // - PrjBnd0 = 1000.0f; - PrjBnd1 = -1000.0f; - for (i = 0; i < BLOCK_SIZE_4X4; i++) - Prj0[i] = Prj[i] = PrjErr[i] = PreMRep[i] = 0.f; - - for (i = 0; i < dwUniqueColors; i++) - { - Prj0[i] = Prj[i] = dot(BlkSh[i], LineDir); - PrjErr[i] = dot(BlkSh[i] - LineDir * Prj[i], BlkSh[i] - LineDir * Prj[i]); - PrjBnd0 = min(PrjBnd0, Prj[i]); - PrjBnd1 = max(PrjBnd1, Prj[i]); - } - - // 2. Run 1 dimensional search (see scalar case) to find an (sub) optimal - // pair of end points. - - // min and max of the search interval - CGU_FLOAT Scl0; - CGU_FLOAT Scl1; - Scl0 = PrjBnd0 - (PrjBnd1 - PrjBnd0) * 0.125f; - Scl1 = PrjBnd1 + (PrjBnd1 - PrjBnd0) * 0.125f; - - // compute scaling factor to scale down the search interval to [0.,1] - const CGU_FLOAT Scl2 = (Scl1 - Scl0) * (Scl1 - Scl0); - const CGU_FLOAT overScl = 1.f / (Scl1 - Scl0); - - for (i = 0; i < dwUniqueColors; i++) - { - // scale them - Prj[i] = (Prj[i] - Scl0) * overScl; - // premultiply the scale square to plug into error computation later - PreMRep[i] = Rpt[i] * Scl2; - } - - // scale first approximation of end points - PrjBnd0 = (PrjBnd0 - Scl0) * overScl; - PrjBnd1 = (PrjBnd1 - Scl0) * overScl; - - CGU_FLOAT StepErr = MAX_ERROR; - - // search step - CGU_FLOAT searchStep = 0.025f; - - // low Start/End; high Start/End - const CGU_FLOAT lowStartEnd = (PrjBnd0 - 2.f * searchStep > 0.f) ? PrjBnd0 - 2.f * searchStep : 0.f; - const CGU_FLOAT highStartEnd = (PrjBnd1 + 2.f * searchStep < 1.f) ? 
PrjBnd1 + 2.f * searchStep : 1.f; - - // find the best endpoints - CGU_FLOAT Pos0 = 0; - CGU_FLOAT Pos1 = 0; - CGU_FLOAT lowPosStep, highPosStep; - CGU_FLOAT err; - - int l, h; - for (l = 0, lowPosStep = lowStartEnd; l < 8; l++, lowPosStep += searchStep) - { - for (h = 0, highPosStep = highStartEnd; h < 8; h++, highPosStep -= searchStep) - { - // compute an error for the current pair of end points. - err = cgu_getRampErr(Prj, PrjErr, PreMRep, StepErr, lowPosStep, highPosStep, dwUniqueColors); - - if (err < StepErr) - { - // save better result - StepErr = err; - Pos0 = lowPosStep; - Pos1 = highPosStep; - } - } - } - - // inverse the scaling - Pos0 = Pos0 * (Scl1 - Scl0) + Scl0; - Pos1 = Pos1 * (Scl1 - Scl0) + Scl0; - - // did we find somthing better from the previous run? - if (StepErr + 0.001 < ErrG) - { - // yes, remember it - ErrG = StepErr; - LineDirG = LineDir; - - PosG0.x = Pos0; - PosG0.y = Pos0; - PosG0.z = Pos0; - PosG1.x = Pos1; - PosG1.y = Pos1; - PosG1.z = Pos1; - - // 3. Compute the vector of indexes (or clusters) for the current - // approximate ramp. - // indexes - const CGU_FLOAT step = (Pos1 - Pos0) / 3.0f; // (dwNumChannels=4 - 1); - const CGU_FLOAT step_h = step * (CGU_FLOAT)0.5; - const CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step; - const CGU_FLOAT overBlkTp = 1.f / 3.0f; // (dwNumChannels=4 - 1); - - // here the index vector is computed, - // shifted and normalized - CGU_FLOAT indxAvrg = 3.0f / 2.0f; // (dwNumChannels=4 - 1); - - for (i = 0; i < dwUniqueColors; i++) - { - CGU_FLOAT del; - // CGU_UINT32 n = (CGU_UINT32)((b - _min_ex + (step*0.5f)) * rstep); - if ((del = Prj0[i] - Pos0) <= 0) - RmpIndxs[i] = 0.f; - else if (Prj0[i] - Pos1 >= 0) - RmpIndxs[i] = 3.0f; // (dwNumChannels=4 - 1); - else - RmpIndxs[i] = cmp_floor((del + step_h) * rstep); - // shift and normalization - RmpIndxs[i] = (RmpIndxs[i] - indxAvrg) * overBlkTp; - } - - // 4. Present our color channels as 3 16 DIM vectors. - // 5. 
Find closest aproximation of each of 16DIM color vector with the - // pojection of the 16DIM index vector. - CGU_Vec3f Crs = {0.0f, 0.0f, 0.0f}; - CGU_FLOAT Len = 0.0f; - - for (i = 0; i < dwUniqueColors; i++) - { - const CGU_FLOAT PreMlt = RmpIndxs[i] * Rpt[i]; - Len += RmpIndxs[i] * PreMlt; - Crs.x += BlkSh[i].x * PreMlt; - Crs.y += BlkSh[i].y * PreMlt; - Crs.z += BlkSh[i].z * PreMlt; - } - - LineDir.x = LineDir.y = LineDir.z = 0.0f; - if (Len > 0.0f) - { - CGU_FLOAT Len2; - LineDir = Crs / Len; - // 6. Plug the projections as a new directional vector for the axis. - // 7. Goto 1. - Len2 = dot(LineDir, LineDir); // LineDir.x * LineDir.x + LineDir.y * LineDir.y + LineDir.z * LineDir.z; - Len2 = sqrt(Len2); - LineDir /= Len2; - } - } - else // We was not able to find anything better. Drop out. - break; - } - - // inverse transform to find end-points of 3-color ramp - rsltC0 = (PosG0 * LineDirG + Mdl) * 255.f; - rsltC1 = (PosG1 * LineDirG + Mdl) * 255.f; - } // !isDone - - // We've dealt with (almost) unrestricted full precision realm. - // Now back digital world. 
- - // round the end points to make them look like compressed ones - CGU_Vec3f inpRmpEndPts0 = {0.0f, 255.0f, 0.0f}; - CGU_Vec3f inpRmpEndPts1 = {0.0f, 255.0f, 0.0f}; - CGU_Vec3f Fctrs0 = {8.0f, 4.0f, 8.0f}; //(1 << (PIX_GRID - BG)); x (1 << (PIX_GRID - GG)); y (1 << (PIX_GRID - RG)); z - CGU_Vec3f Fctrs1 = {32.0f, 64.0f, 32.0f}; //(CGU_FLOAT)(1 << RG); z (CGU_FLOAT)(1 << GG); y (CGU_FLOAT)(1 << BG); x - CGU_FLOAT _Min = 0.0f; - CGU_FLOAT _Max = 255.0f; - - { - // MkRmpOnGrid(inpRmpEndPts, rsltC, _Min, _Max); - - inpRmpEndPts0 = cmp_floorVec3f(rsltC0); - - if (inpRmpEndPts0.x <= _Min) - inpRmpEndPts0.x = _Min; - else - { - inpRmpEndPts0.x += cmp_floor(128.f / Fctrs1.x) - cmp_floor(inpRmpEndPts0.x / Fctrs1.x); - inpRmpEndPts0.x = min(inpRmpEndPts0.x, _Max); - } - if (inpRmpEndPts0.y <= _Min) - inpRmpEndPts0.y = _Min; - else - { - inpRmpEndPts0.y += cmp_floor(128.f / Fctrs1.y) - cmp_floor(inpRmpEndPts0.y / Fctrs1.y); - inpRmpEndPts0.y = min(inpRmpEndPts0.y, _Max); - } - if (inpRmpEndPts0.z <= _Min) - inpRmpEndPts0.z = _Min; - else - { - inpRmpEndPts0.z += cmp_floor(128.f / Fctrs1.z) - cmp_floor(inpRmpEndPts0.z / Fctrs1.z); - inpRmpEndPts0.z = min(inpRmpEndPts0.z, _Max); - } - - inpRmpEndPts0 = cmp_floorVec3f(inpRmpEndPts0 / Fctrs0) * Fctrs0; - - inpRmpEndPts1 = cmp_floorVec3f(rsltC1); - if (inpRmpEndPts1.x <= _Min) - inpRmpEndPts1.x = _Min; - else - { - inpRmpEndPts1.x += cmp_floor(128.f / Fctrs1.x) - cmp_floor(inpRmpEndPts1.x / Fctrs1.x); - inpRmpEndPts1.x = min(inpRmpEndPts1.x, _Max); - } - if (inpRmpEndPts1.y <= _Min) - inpRmpEndPts1.y = _Min; - else - { - inpRmpEndPts1.y += cmp_floor(128.f / Fctrs1.y) - cmp_floor(inpRmpEndPts1.y / Fctrs1.y); - inpRmpEndPts1.y = min(inpRmpEndPts1.y, _Max); - } - if (inpRmpEndPts1.z <= _Min) - inpRmpEndPts1.z = _Min; - else - { - inpRmpEndPts1.z += cmp_floor(128.f / Fctrs1.z) - cmp_floor(inpRmpEndPts1.z / Fctrs1.z); - inpRmpEndPts1.z = min(inpRmpEndPts1.z, _Max); - } - - inpRmpEndPts1 = cmp_floorVec3f(inpRmpEndPts1 / Fctrs0) * 
Fctrs0; - } // MkRmpOnGrid - - CMP_EndPoints EndPoints; - EndPoints.Color0 = inpRmpEndPts0; - EndPoints.Color1 = inpRmpEndPts1; - - return EndPoints; -} - -CMP_STATIC CMP_EndPoints -cgu_MkRmpOnGridBGR(CMP_IN CGU_Vec3f rsltC0, CMP_IN CGU_Vec3f rsltC1, CMP_IN CGU_UINT32 nRedBits, CMP_IN CGU_UINT32 nGreenBits, CMP_IN CGU_UINT32 nBlueBits) -{ - CGU_Vec3f inpRmpEndPts0 = {0.0f, 255.0f, 0.0f}; - CGU_Vec3f inpRmpEndPts1 = {0.0f, 255.0f, 0.0f}; - CGU_Vec3f Fctrs0 = {8.0f, 4.0f, 8.0f}; - CGU_Vec3f Fctrs1 = {32.0f, 64.0f, 32.0f}; - CGU_FLOAT _Min = 0.0f; - CGU_FLOAT _Max = 255.0f; - - // user override 565 default setting - if ((nRedBits != 5) || (nGreenBits != 6) || (nBlueBits != 5)) - { - Fctrs1[RC] = (CGU_FLOAT)(1 << nRedBits); - Fctrs1[GC] = (CGU_FLOAT)(1 << nGreenBits); - Fctrs1[BC] = (CGU_FLOAT)(1 << nBlueBits); - Fctrs0[RC] = (CGU_FLOAT)(1 << (PIX_GRID - nRedBits)); - Fctrs0[GC] = (CGU_FLOAT)(1 << (PIX_GRID - nGreenBits)); - Fctrs0[BC] = (CGU_FLOAT)(1 << (PIX_GRID - nBlueBits)); - } - - inpRmpEndPts0 = cmp_floorVec3f(rsltC0); - - if (inpRmpEndPts0.x <= _Min) - inpRmpEndPts0.x = _Min; - else - { - inpRmpEndPts0.x += cmp_floor(128.f / Fctrs1.x) - cmp_floor(inpRmpEndPts0.x / Fctrs1.x); - inpRmpEndPts0.x = cmp_minf(inpRmpEndPts0.x, _Max); - } - if (inpRmpEndPts0.y <= _Min) - inpRmpEndPts0.y = _Min; - else - { - inpRmpEndPts0.y += cmp_floor(128.f / Fctrs1.y) - cmp_floor(inpRmpEndPts0.y / Fctrs1.y); - inpRmpEndPts0.y = cmp_minf(inpRmpEndPts0.y, _Max); - } - if (inpRmpEndPts0.z <= _Min) - inpRmpEndPts0.z = _Min; - else - { - inpRmpEndPts0.z += cmp_floor(128.f / Fctrs1.z) - cmp_floor(inpRmpEndPts0.z / Fctrs1.z); - inpRmpEndPts0.z = cmp_minf(inpRmpEndPts0.z, _Max); - } - - inpRmpEndPts0 = cmp_floorVec3f(inpRmpEndPts0 / Fctrs0) * Fctrs0; - - inpRmpEndPts1 = cmp_floorVec3f(rsltC1); - if (inpRmpEndPts1.x <= _Min) - inpRmpEndPts1.x = _Min; - else - { - inpRmpEndPts1.x += cmp_floor(128.f / Fctrs1.x) - cmp_floor(inpRmpEndPts1.x / Fctrs1.x); - inpRmpEndPts1.x = 
cmp_minf(inpRmpEndPts1.x, _Max); - } - if (inpRmpEndPts1.y <= _Min) - inpRmpEndPts1.y = _Min; - else - { - inpRmpEndPts1.y += cmp_floor(128.f / Fctrs1.y) - cmp_floor(inpRmpEndPts1.y / Fctrs1.y); - inpRmpEndPts1.y = cmp_minf(inpRmpEndPts1.y, _Max); - } - if (inpRmpEndPts1.z <= _Min) - inpRmpEndPts1.z = _Min; - else - { - inpRmpEndPts1.z += cmp_floor(128.f / Fctrs1.z) - cmp_floor(inpRmpEndPts1.z / Fctrs1.z); - inpRmpEndPts1.z = cmp_minf(inpRmpEndPts1.z, _Max); - } - - inpRmpEndPts1 = cmp_floorVec3f(inpRmpEndPts1 / Fctrs0) * Fctrs0; - - CMP_EndPoints EndPoints; - EndPoints.Color0 = inpRmpEndPts0; - EndPoints.Color1 = inpRmpEndPts1; - - return EndPoints; - -} // MkRmpOnGrid - -//=================================================================== -// Replaces CompressBlockBC1_RGBA_Internal() -// if ((errLQ > 0.0f) && (fquality > CMP_QUALITY2)) code block -//=================================================================== -CMP_STATIC CGU_Vec2ui cgu_CompRGBBlock(CMP_IN CGU_Vec4f src_imageNorm[BLOCK_SIZE_4X4], CMP_IN CMP_BC15Options BC15Options) -{ - //CGU_FLOAT errLQ = 1e6f; - CGU_UINT32 m_nRefinementSteps = BC15Options.m_nRefinementSteps; - CGU_UINT32 dwAlphaThreshold = BC15Options.m_nAlphaThreshold; - CGU_Vec3f channelWeights = {BC15Options.m_fChannelWeights[0], BC15Options.m_fChannelWeights[1], BC15Options.m_fChannelWeights[2]}; - CGU_BOOL isSRGB = BC15Options.m_bIsSRGB; - - CGU_Vec3f rgbBlock_normal[BLOCK_SIZE_4X4]; - CGU_UINT32 nCmpIndices = 0; - CGU_UINT32 c0, c1; - // High Quality - CMP_EndPoints EndPoints = {{0, 0, 0xFF}, {0, 0, 0xFF}}; - CGU_UINT32 i; - - CGU_FLOAT ALIGN_16 Rpt[BLOCK_SIZE_4X4]; - CGU_UINT32 pcIndices = 0; - - m_nRefinementSteps = 0; - - CGU_Vec3f BlkInBGRf_UV[BLOCK_SIZE_4X4]; // Normalized Block Input (0..1) in BGR channel format - // Default inidices & endpoints for Transparent Block - CGU_Vec3ui nEndpoints0 = {0, 0, 0}; // Endpoints are stored BGR as x,y,z - CGU_Vec3ui nEndpoints1 = {0xFF, 0xFF, 0xFF}; // Endpoints are stored BGR as x,y,z - 
- for (i = 0; i < BLOCK_SIZE_4X4; i++) - { - Rpt[i] = 0.0f; - } - - //=============================================================== - // Check if we have more then 2 colors and process Alpha block - CGU_UINT32 dwColors = 0; - CGU_UINT32 dwBlk[BLOCK_SIZE_4X4]; - CGU_UINT32 R, G, B, A; - for (i = 0; i < BLOCK_SIZE_4X4; i++) - { - // Do any color conversion prior to processing the block - rgbBlock_normal[i] = isSRGB ? cmp_linearToSrgb(src_imageNorm[i].rgb) : src_imageNorm[i].rgb; - - R = (CGU_UINT32)(rgbBlock_normal[i].x * 255.0f); - G = (CGU_UINT32)(rgbBlock_normal[i].y * 255.0f); - B = (CGU_UINT32)(rgbBlock_normal[i].z * 255.0f); - - //if (dwAlphaThreshold > 0) - // A = (CGU_UINT32)src_imageNorm[i].w * 255.0f; - //else - A = 255; - - // Punch Through Alpha in BC1 Codec (1 bit alpha) - //if ((dwAlphaThreshold == 0) || (A >= dwAlphaThreshold)) - //{ - // copy to local RGB data and have alpha set to 0xFF - dwBlk[dwColors++] = A << 24 | R << 16 | G << 8 | B; - //} - } - - if (!dwColors) - { - // All are colors transparent - EndPoints.Color0.x = EndPoints.Color0.y = EndPoints.Color0.z = 0.0f; - EndPoints.Color1.x = EndPoints.Color1.y = EndPoints.Color0.z = 255.0f; - nCmpIndices = 0xFFFFFFFF; - } - else - { - // We have colors to process - nCmpIndices = 0; - // Punch Through Alpha Support ToDo - // CGU_BOOL bHasAlpha = (dwColors != BLOCK_SIZE_4X4); - // bHasAlpha = bHasAlpha && (dwAlphaThreshold > 0); // valid for (dwNumChannels=4); - // if (bHasAlpha) { - // CGU_Vec2ui compBlock = {0xf800f800,0}; - // return compBlock; - // } - - // Here we are computing an unique number of sorted colors. - // For each unique value we compute the number of it appearences. 
- // qsort((void *)dwBlk, (size_t)dwColors, sizeof(CGU_UINT32), QSortIntCmp); - { - CGU_UINT32 j; - CMP_di what[BLOCK_SIZE_4X4]; - - for (i = 0; i < dwColors; i++) - { - what[i].index = i; - what[i].data = dwBlk[i]; - } - - CGU_UINT32 tmp_index; - CGU_UINT32 tmp_data; - - for (i = 1; i < dwColors; i++) - { - for (j = i; j > 0; j--) - { - if (what[j - 1].data > what[j].data) - { - tmp_index = what[j].index; - tmp_data = what[j].data; - what[j].index = what[j - 1].index; - what[j].data = what[j - 1].data; - what[j - 1].index = tmp_index; - what[j - 1].data = tmp_data; - } - } - } - for (i = 0; i < dwColors; i++) - dwBlk[i] = what[i].data; - } - CGU_UINT32 new_p; - CGU_UINT32 dwBlkU[BLOCK_SIZE_4X4]; - CGU_UINT32 dwUniqueColors = 0; - new_p = dwBlkU[0] = dwBlk[0]; - Rpt[dwUniqueColors] = 1.f; - for (i = 1; i < dwColors; i++) - { - if (new_p != dwBlk[i]) - { - dwUniqueColors++; - new_p = dwBlkU[dwUniqueColors] = dwBlk[i]; - Rpt[dwUniqueColors] = 1.f; - } - else - Rpt[dwUniqueColors] += 1.f; - } - dwUniqueColors++; - - // Simple case of only 2 colors to process - // no need for futher processing as lowest quality methods work best for this case - if (dwUniqueColors <= 2) - { - CGU_Vec3f rsltC0; - CGU_Vec3f rsltC1; - rsltC0.r = rgbBlock_normal[0].b * 255.0f; - rsltC0.g = rgbBlock_normal[0].g * 255.0f; - rsltC0.b = rgbBlock_normal[0].r * 255.0f; - rsltC1.r = rgbBlock_normal[dwUniqueColors - 1].b * 255.0f; - rsltC1.g = rgbBlock_normal[dwUniqueColors - 1].g * 255.0f; - rsltC1.b = rgbBlock_normal[dwUniqueColors - 1].r * 255.0f; - EndPoints = cgu_MkRmpOnGridBGR(rsltC0, rsltC1, 5, 6, 5); - } - else - { - // switch from int range back to UV floats - for (i = 0; i < dwUniqueColors; i++) - { - R = (dwBlkU[i] >> 16) & 0xff; - G = (dwBlkU[i] >> 8) & 0xff; - B = (dwBlkU[i] >> 0) & 0xff; - BlkInBGRf_UV[i].z = (CGU_FLOAT)R / 255.0f; - BlkInBGRf_UV[i].y = (CGU_FLOAT)G / 255.0f; - BlkInBGRf_UV[i].x = (CGU_FLOAT)B / 255.0f; - } - - CGU_Vec3f channelWeightsBGR; - channelWeightsBGR.x = 
channelWeights.z; - channelWeightsBGR.y = channelWeights.y; - channelWeightsBGR.z = channelWeights.x; - - EndPoints = cgu_CompressRGBBlockX(BlkInBGRf_UV, Rpt, dwUniqueColors, channelWeightsBGR, m_nRefinementSteps); - } - } // colors - - //=================================================================== - // Process Cluster INPUT is constant EndPointsf OUTPUT is pcIndices - //=================================================================== - if (nCmpIndices == 0) - { - R = (CGU_UINT32)(EndPoints.Color0.z); - G = (CGU_UINT32)(EndPoints.Color0.y); - B = (CGU_UINT32)(EndPoints.Color0.x); - CGU_INT32 cluster0 = cmp_constructColor(R, G, B); - - R = (CGU_UINT32)(EndPoints.Color1.z); - G = (CGU_UINT32)(EndPoints.Color1.y); - B = (CGU_UINT32)(EndPoints.Color1.x); - CGU_INT32 cluster1 = cmp_constructColor(R, G, B); - - CGU_Vec3f InpRmp[NUM_ENDPOINTS]; - if ((cluster0 <= cluster1) // valid for 4 channels - // || (cluster0 > cluster1) // valid for 3 channels - ) - { - // inverse endpoints - InpRmp[0] = EndPoints.Color1; - InpRmp[1] = EndPoints.Color0; - } - else - { - InpRmp[0] = EndPoints.Color0; - InpRmp[1] = EndPoints.Color1; - } - - CGU_Vec3f srcblockBGR[BLOCK_SIZE_4X4]; - CGU_FLOAT srcblockA[BLOCK_SIZE_4X4]; - - // Swizzle the source RGB to BGR for processing - for (i = 0; i < BLOCK_SIZE_4X4; i++) - { - srcblockBGR[i].z = rgbBlock_normal[i].x * 255.0f; - srcblockBGR[i].y = rgbBlock_normal[i].y * 255.0f; - srcblockBGR[i].x = rgbBlock_normal[i].z * 255.0f; - srcblockA[i] = 255.0f; - if (dwAlphaThreshold > 0) - { - CGU_UINT32 alpha = (CGU_UINT32)src_imageNorm[i].w * 255.0f; - if (alpha >= dwAlphaThreshold) - srcblockA[i] = alpha; - } - } - - // input ramp is on the coarse grid - // make ramp endpoints the way they'll going to be decompressed - CGU_Vec3f InpRmpL[NUM_ENDPOINTS]; - CGU_Vec3f Fctrs = {32.0F, 64.0F, 32.0F}; // 1 << RG,1 << GG,1 << BG - - { - // ConstantRamp = MkWkRmpPts(InpRmpL, InpRmp); - InpRmpL[0] = InpRmp[0] + cmp_floorVec3f(InpRmp[0] / Fctrs); - 
InpRmpL[0] = cmp_clampVec3f(InpRmpL[0], 0.0f, 255.0f); - InpRmpL[1] = InpRmp[1] + cmp_floorVec3f(InpRmp[1] / Fctrs); - InpRmpL[1] = cmp_clampVec3f(InpRmpL[1], 0.0f, 255.0f); - } // MkWkRmpPts - - // build ramp - CGU_Vec3f LerpRmp[4]; - CGU_Vec3f offset = {1.0f, 1.0f, 1.0f}; - { - //BldRmp(Rmp, InpRmpL, dwNumChannels); - // linear interpolate end points to get the ramp - LerpRmp[0] = InpRmpL[0]; - LerpRmp[3] = InpRmpL[1]; - LerpRmp[1] = cmp_floorVec3f((InpRmpL[0] * 2.0f + LerpRmp[3] + offset) / 3.0f); - LerpRmp[2] = cmp_floorVec3f((InpRmpL[0] + LerpRmp[3] * 2.0f + offset) / 3.0f); - } // BldRmp - - //========================================================================= - // Clusterize, Compute error and find DXTC indexes for the current cluster - //========================================================================= - { - // Clusterize - CGU_UINT32 alpha; - - // For each colour in the original block assign it - // to the closest cluster and compute the cumulative error - for (i = 0; i < BLOCK_SIZE_4X4; i++) - { - alpha = (CGU_UINT32)srcblockA[i]; - if ((dwAlphaThreshold > 0) && alpha == 0) - { //*((CGU_DWORD *)&_Blk[i][AC]) == 0) - pcIndices |= cmp_set2Bit32(4, i); // dwNumChannels 3 or 4 (default is 4) - } - else - { - CGU_FLOAT shortest = 99999999999.f; - CGU_UINT8 shortestIndex = 0; - - CGU_Vec3f channelWeightsBGR; - channelWeightsBGR.x = channelWeights.z; - channelWeightsBGR.y = channelWeights.y; - channelWeightsBGR.z = channelWeights.x; - - for (CGU_UINT8 rampindex = 0; rampindex < 4; rampindex++) - { - // r is either 1 or 4 - // calculate the distance for each component - CGU_FLOAT distance = - dot(((srcblockBGR[i] - LerpRmp[rampindex]) * channelWeightsBGR), ((srcblockBGR[i] - LerpRmp[rampindex]) * channelWeightsBGR)); - if (distance < shortest) - { - shortest = distance; - shortestIndex = rampindex; - } - } - - // The total is a sum of (error += shortest) - // We have the index of the best cluster, so assign this in the block - // Reorder indices to 
match correct DXTC ordering - if (shortestIndex == 3) // dwNumChannels - 1 - shortestIndex = 1; - else if (shortestIndex) - shortestIndex++; - pcIndices |= cmp_set2Bit32(shortestIndex, i); - } - } // BLOCK_SIZE_4X4 - } // Clusterize - } // Process Cluster - - //============================================================== - // Generate Compressed Result from nEndpoints & pcIndices - //============================================================== - c0 = cmp_constructColorBGR(EndPoints.Color0); - c1 = cmp_constructColorBGR(EndPoints.Color1); - - // Get Processed indices if not set - if (nCmpIndices == 0) - nCmpIndices = pcIndices; - - CGU_Vec2ui cmpBlock; - if (c0 <= c1) - { - cmpBlock.x = c1 | (c0 << 16); - } - else - cmpBlock.x = c0 | (c1 << 16); - - cmpBlock.y = nCmpIndices; - - return cmpBlock; -} - -CMP_STATIC void cgu_ProcessColors(CMP_INOUT CGU_Vec3f CMP_PTRINOUT colorMin, - CMP_INOUT CGU_Vec3f CMP_PTRINOUT colorMax, - CMP_INOUT CGU_UINT32 CMP_PTRINOUT c0, - CMP_INOUT CGU_UINT32 CMP_PTRINOUT c1, - CMP_IN CGU_INT setopt, - CMP_IN CGU_BOOL isSRGB) -{ - // CGU_UINT32 srbMap[32] = {0,5,8,11,12,13,14,15,16,17,18,19,20,21,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31}; - // CGU_UINT32 sgMap[64] = {0,10,14,16,19,20,22,24,25,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,42,43,43,44,45,45, - // 46,47,47,48,48,49,50,50,51,52,52,53,53,54,54,55,55,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63}; - CGU_INT32 x, y, z; - CGU_Vec3f scale = {31.0f, 63.0f, 31.0f}; - CGU_Vec3f MinColorScaled; - CGU_Vec3f MaxColorScaled; - - // Clamp or Transform is needed, the transforms have built in clamps - if (isSRGB) - { - MinColorScaled = cmp_linearToSrgb(CMP_PTRINOUT colorMin); - MaxColorScaled = cmp_linearToSrgb(CMP_PTRINOUT colorMax); - } - else - { - MinColorScaled = cmp_clampVec3f(CMP_PTRINOUT colorMin, 0.0f, 1.0f); - MaxColorScaled = cmp_clampVec3f(CMP_PTRINOUT colorMax, 0.0f, 1.0f); - } - - switch (setopt) - { - case 0: // Use Min Max processing - MinColorScaled = 
cmp_floorVec3f(MinColorScaled * scale); - MaxColorScaled = cmp_ceilVec3f(MaxColorScaled * scale); - CMP_PTRINOUT colorMin = MinColorScaled / scale; - CMP_PTRINOUT colorMax = MaxColorScaled / scale; - break; - default: // Use round processing - MinColorScaled = round(MinColorScaled * scale); - MaxColorScaled = round(MaxColorScaled * scale); - break; - } - - x = (CGU_UINT32)(MinColorScaled.x); - y = (CGU_UINT32)(MinColorScaled.y); - z = (CGU_UINT32)(MinColorScaled.z); - - //if (isSRGB) { - // // scale RB - // x = srbMap[x]; // &0x1F]; - // y = sgMap [y]; // &0x3F]; - // z = srbMap[z]; // &0x1F]; - // // scale G - //} - CMP_PTRINOUT c0 = (x << 11) | (y << 5) | z; - - x = (CGU_UINT32)(MaxColorScaled.x); - y = (CGU_UINT32)(MaxColorScaled.y); - z = (CGU_UINT32)(MaxColorScaled.z); - CMP_PTRINOUT c1 = (x << 11) | (y << 5) | z; -} - -CMP_STATIC CGU_FLOAT cgu_getIndicesRGB(CMP_INOUT CGU_UINT32 CMP_PTRINOUT cmpindex, - CMP_IN const CGU_Vec3f block[16], - CMP_IN CGU_Vec3f minColor, - CMP_IN CGU_Vec3f maxColor, - CMP_IN CGU_BOOL getErr) -{ - CGU_UINT32 PackedIndices = 0; - CGU_FLOAT err = 0.0f; - CGU_Vec3f cn[4]; - CGU_FLOAT minDistance; - - if (getErr) - { - // remap to BC1 spec for decoding offsets, - // where cn[0] > cn[1] Max Color = index 0, 2/3 offset =index 2, 1/3 offset = index 3, Min Color = index 1 - cn[0] = maxColor; - cn[1] = minColor; - cn[2] = cn[0] * 2.0f / 3.0f + cn[1] * 1.0f / 3.0f; - cn[3] = cn[0] * 1.0f / 3.0f + cn[1] * 2.0f / 3.0f; - } - - CGU_FLOAT Scale = 3.f / cmp_dotVec3f(minColor - maxColor, minColor - maxColor); - CGU_Vec3f ScaledRange = (minColor - maxColor) * Scale; - CGU_FLOAT Bias = (cmp_dotVec3f(maxColor, maxColor) - cmp_dotVec3f(maxColor, minColor)) * Scale; - CGU_INT indexMap[4] = {0, 2, 3, 1}; // mapping based on BC1 Spec for color0 > color1 - CGU_UINT32 index; - CGU_FLOAT diff; - - for (CGU_UINT32 i = 0; i < 16; i++) - { - // Get offset from base scale - diff = cmp_dotVec3f(block[i], ScaledRange) + Bias; - index = ((CGU_UINT32)round(diff)) & 
0x3; - - // remap linear offset to spec offset - index = indexMap[index]; - - // use err calc for use in higher quality code - if (getErr) - { - minDistance = cmp_dotVec3f(block[i] - cn[index], block[i] - cn[index]); - err += minDistance; - } - - // Map the 2 bit index into compress 32 bit block - if (index) - PackedIndices |= (index << (2 * i)); - } - - if (getErr) - err = err * 0.0208333f; - - CMP_PTRINOUT cmpindex = PackedIndices; - return err; -} - -//-------------------------------------------------------------------------------------------------------- -// Decompress is RGB (0.0f..255.0f) -//-------------------------------------------------------------------------------------------------------- -CMP_STATIC void cgu_decompressRGBBlock(CMP_INOUT CGU_Vec3f rgbBlock[BLOCK_SIZE_4X4], const CGU_Vec2ui compressedBlock) -{ - CGU_UINT32 n0 = compressedBlock.x & 0xffff; - CGU_UINT32 n1 = compressedBlock.x >> 16; - CGU_UINT32 index; - - //------------------------------------------------------- - // Decode the compressed block 0..255 color range - //------------------------------------------------------- - CGU_Vec3f c0 = cmp_565ToLinear(n0); // max color - CGU_Vec3f c1 = cmp_565ToLinear(n1); // min color - CGU_Vec3f c2; - CGU_Vec3f c3; - - if (n0 > n1) - { - c2 = (c0 * 2.0f + c1) / 3.0f; - c3 = (c1 * 2.0f + c0) / 3.0f; - - for (CGU_UINT32 i = 0; i < 16; i++) - { - index = (compressedBlock.y >> (2 * i)) & 3; - switch (index) - { - case 0: - rgbBlock[i] = c0; - break; - case 1: - rgbBlock[i] = c1; - break; - case 2: - rgbBlock[i] = c2; - break; - case 3: - rgbBlock[i] = c3; - break; - } - } - } - else - { - // Transparent decode - c2 = (c0 + c1) / 2.0f; - - for (CGU_UINT32 i = 0; i < 16; i++) - { - index = (compressedBlock.y >> (2 * i)) & 3; - switch (index) - { - case 0: - rgbBlock[i] = c0; - break; - case 1: - rgbBlock[i] = c1; - break; - case 2: - rgbBlock[i] = c2; - break; - case 3: - rgbBlock[i] = 0.0f; - break; - } - } - } -} - -// The source is 0..255 -CMP_STATIC 
float cgu_RGBABlockErrorLinear(const CGU_Vec4uc src_rgbBlock[BLOCK_SIZE_4X4], const CGU_Vec2ui compressedBlock) -{ - CGU_Vec3f rgbBlock[BLOCK_SIZE_4X4]; - - // Decompressed block channels are 0..255 - cgu_decompressRGBBlock(rgbBlock, compressedBlock); - - //------------------------------------------------------------------ - // Calculate MSE of the block - // Note : pow is used as Float type for the code to be usable on CPU - //------------------------------------------------------------------ - CGU_Vec3f serr; - serr = 0.0f; - - float sR, sG, sB, R, G, B; - - for (int j = 0; j < 16; j++) - { - sR = src_rgbBlock[j].x; - sG = src_rgbBlock[j].y; - sB = src_rgbBlock[j].z; - - R = rgbBlock[j].x; - G = rgbBlock[j].y; - B = rgbBlock[j].z; - - // Norm colors - serr.x += pow(sR - R, 2.0f); - serr.y += pow(sG - G, 2.0f); - serr.z += pow(sB - B, 2.0f); - } - - // MSE for 16 texels - return (serr.x + serr.y + serr.z) / 48.0f; -} - -// The source is 0..1, decompressed data using cmp_decompressRGBBlock2 is 0..255 which is converted down to 0..1 -CMP_STATIC float cgu_RGBBlockError(const CGU_Vec3f src_rgbBlock[BLOCK_SIZE_4X4], const CGU_Vec2ui compressedBlock, CGU_BOOL isSRGB) -{ - CGU_Vec3f rgbBlock[BLOCK_SIZE_4X4]; - - // Decompressed block channels are 0..255 - cgu_decompressRGBBlock(rgbBlock, compressedBlock); - - //------------------------------------------------------------------ - // Calculate MSE of the block - // Note : pow is used as Float type for the code to be usable on CPU - //------------------------------------------------------------------ - CGU_Vec3f serr; - serr = 0.0f; - - float sR, sG, sB, R, G, B; - - for (int j = 0; j < 16; j++) - { - if (isSRGB) - { - sR = round(cmp_linearToSrgbf(src_rgbBlock[j].x) * 255.0f); - sG = round(cmp_linearToSrgbf(src_rgbBlock[j].y) * 255.0f); - sB = round(cmp_linearToSrgbf(src_rgbBlock[j].z) * 255.0f); - } - else - { - sR = round(src_rgbBlock[j].x * 255.0f); - sG = round(src_rgbBlock[j].y * 255.0f); - sB = round(src_rgbBlock[j].z 
* 255.0f); - } - - R = rgbBlock[j].x; - G = rgbBlock[j].y; - B = rgbBlock[j].z; - - // Norm colors - serr.x += pow(sR - R, 2.0f); - serr.y += pow(sG - G, 2.0f); - serr.z += pow(sB - B, 2.0f); - } - - // MSE for 16 texels - return (serr.x + serr.y + serr.z) / 48.0f; -} - -CMP_STATIC CGU_Vec2ui cgu_CompressRGBBlock_MinMax(CMP_IN const CGU_Vec3f src_imageRGB[16], - CMP_IN CGU_FLOAT fquality, - CMP_IN CGU_BOOL isSRGB, - CMP_INOUT CGU_Vec3f srcRGB[16], // The list of source colors with blue channel altered - CMP_INOUT CGU_Vec3f CMP_REFINOUT average_rgb, // The centrepoint of the axis - CMP_INOUT CGU_FLOAT CMP_REFINOUT errout) -{ - CGU_Vec2ui Q1CompData = {0, 0}; - CGU_Vec3f rgb = {0, 0, 0}; - - // ------------------------------------------------------------------------------------- - // (1) Find the array of unique pixel values and sum them to find their average position - // ------------------------------------------------------------------------------------- - CGU_FLOAT errLQ = 0.0f; - CGU_BOOL fastProcess = (fquality <= CMP_QUALITY0); // Min Max only - CGU_Vec3f srcMin = 1.0f; // Min source color - CGU_Vec3f srcMax = 0.0f; // Max source color - CGU_Vec2ui Q1compressedBlock = {0, 0}; - CGU_UINT32 c0 = 0; - CGU_UINT32 c1 = 0; - - average_rgb = 0.0f; - // Get average and modifed src - // find average position and save list of pixels as 0F..255F range for processing - // Note: z (blue) is average of blue+green channels - for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++) - { - srcMin = cmp_minVec3f(srcMin, src_imageRGB[i]); - srcMax = cmp_maxVec3f(srcMax, src_imageRGB[i]); - if (!fastProcess) - { - rgb = isSRGB ? cmp_linearToSrgb(src_imageRGB[i]) : cmp_saturate(src_imageRGB[i]); - rgb.z = (rgb.y + rgb.z) * 0.5F; // Z-axiz => (R+G)/2 - srcRGB[i] = rgb; - average_rgb = average_rgb + rgb; - } - } - - // Process two colors for saving in 565 format as C0 and C1 - cgu_ProcessColors(CMP_REFINOUT srcMin, CMP_REFINOUT srcMax, CMP_REFINOUT c0, CMP_REFINOUT c1, isSRGB ? 
1 : 0, isSRGB); - - // Save simple min-max encoding - if (c0 < c1) - { - Q1CompData.x = (c0 << 16) | c1; - CGU_UINT32 index = 0; - errLQ = cgu_getIndicesRGB(CMP_REFINOUT index, src_imageRGB, srcMin, srcMax, false); - Q1CompData.y = index; - errout = cgu_RGBBlockError(src_imageRGB, Q1CompData, isSRGB); - } - else - { - // Most simple case all colors are equal or 0.0f - Q1compressedBlock.x = (c1 << 16) | c0; - Q1compressedBlock.y = 0; - errout = 0.0f; - return Q1compressedBlock; - } - // 0.0625F is (1/BLOCK_SIZE_4X4) - average_rgb = average_rgb * 0.0625F; - - return Q1CompData; -} - -CMP_STATIC CGU_Vec2ui cgu_CompressRGBBlock_Fast(CMP_IN const CGU_Vec3f src_imageRGB[16], - CMP_IN CGU_FLOAT fquality, - CMP_IN CGU_BOOL isSRGB, - CMP_IN CGU_Vec3f srcRGB[16], - CMP_IN CGU_Vec3f CMP_REFINOUT average_rgb, - CMP_INOUT CGU_FLOAT CMP_REFINOUT errout) -{ - CMP_UNUSED(fquality); - - CGU_Vec3f axisVectorRGB = {0.0f, 0.0f, 0.0f}; // The axis vector for index projection - CGU_FLOAT pos_on_axis[16]; // The distance each unique falls along the compression axis - CGU_FLOAT axisleft = 0; // The extremities and centre (average of left/right) of srcRGB along the compression axis - CGU_FLOAT axisright = 0; // The extremities and centre (average of left/right) of srcRGB along the compression axis - CGU_FLOAT axiscentre = 0; // The extremities and centre (average of left/right) of srcRGB along the compression axis - CGU_INT32 swap = 0; // Indicator if the RGB values need swapping to generate an opaque result - CGU_Vec3f srcBlock[16]; // The list of source colors with any color space transforms and clipping - CGU_UINT32 c0 = 0; - CGU_UINT32 c1 = 0; - CGU_Vec2ui compressedBlock = {0, 0}; - CGU_FLOAT Q1CompErr; - CGU_Vec2ui Q1CompData = {0, 0}; - - CGU_Vec3f rgb = {0, 0, 0}; - - // ------------------------------------------------------------------------------------- - // (4) For each component, reflect points about the average so all lie on the same side - // of the average, and compute the 
new average - this gives a second point that defines the axis - // To compute the sign of the axis sum the positive differences of G for each of R and B (the - // G axis is always positive in this implementation - // ------------------------------------------------------------------------------------- - // An interesting situation occurs if the G axis contains no information, in which case the RB - // axis is also compared. I am not entirely sure if this is the correct implementation - should - // the priority axis be determined by magnitude? - { - CGU_FLOAT rg_pos = 0.0f; - CGU_FLOAT bg_pos = 0.0f; - CGU_FLOAT rb_pos = 0.0f; - - for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++) - { - rgb = srcRGB[i] - average_rgb; - axisVectorRGB = axisVectorRGB + cmp_fabsVec3f(rgb); - if (rgb.x > 0) - { - rg_pos += rgb.y; - rb_pos += rgb.z; - } - if (rgb.z > 0) - bg_pos += rgb.y; - } - - // Average over BLOCK_SIZE_4X4 - axisVectorRGB = axisVectorRGB * 0.0625F; - - // New average position - if (rg_pos < 0) - axisVectorRGB.x = -axisVectorRGB.x; - if (bg_pos < 0) - axisVectorRGB.z = -axisVectorRGB.z; - if ((rg_pos == bg_pos) && (rg_pos == 0)) - { - if (rb_pos < 0) - axisVectorRGB.z = -axisVectorRGB.z; - } - } - - // ------------------------------------------------------------------------------------- - // (5) Axis projection and remapping - // ------------------------------------------------------------------------------------- - { - CGU_FLOAT v2_recip; - // Normalize the axis for simplicity of future calculation - v2_recip = cmp_dotVec3f(axisVectorRGB, axisVectorRGB); - if (v2_recip > 0) - v2_recip = 1.0f / (CGU_FLOAT)cmp_sqrt(v2_recip); - else - v2_recip = 1.0f; - axisVectorRGB = axisVectorRGB * v2_recip; - } - - // ------------------------------------------------------------------------------------- - // (6) Map the axis - // ------------------------------------------------------------------------------------- - // the line joining (and extended on either side of) average and axis - 
// defines the axis onto which the points will be projected - // Project all the points onto the axis, calculate the distance along - // the axis from the centre of the axis (average) - // From Foley & Van Dam: Closest point of approach of a line (P + v) to a point (R) is - // P + ((R-P).v) / (v.v))v - // The distance along v is therefore (R-P).v / (v.v) where (v.v) is 1 if v is a unit vector. - // - // Calculate the extremities at the same time - these need to be reasonably accurately - // represented in all cases - { - axisleft = CMP_FLOAT_MAX; - axisright = -CMP_FLOAT_MAX; - for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++) - { - // Compute the distance along the axis of the point of closest approach - CGU_Vec3f temp = (srcRGB[i] - average_rgb); - pos_on_axis[i] = cmp_dotVec3f(temp, axisVectorRGB); - - // Work out the extremities - if (pos_on_axis[i] < axisleft) - axisleft = pos_on_axis[i]; - if (pos_on_axis[i] > axisright) - axisright = pos_on_axis[i]; - } - } - - // --------------------------------------------------------------------------------------------- - // (7) Now we have a good axis and the basic information about how the points are mapped to it - // Our initial guess is to represent the endpoints accurately, by moving the average - // to the centre and recalculating the point positions along the line - // --------------------------------------------------------------------------------------------- - { - axiscentre = (axisleft + axisright) * 0.5F; - average_rgb = average_rgb + (axisVectorRGB * axiscentre); - for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++) - pos_on_axis[i] -= axiscentre; - axisright -= axiscentre; - axisleft -= axiscentre; - } - - // ------------------------------------------------------------------------------------- - // (8) Calculate the high and low output colour values - // Involved in this is a rounding procedure which is undoubtedly slightly twitchy. 
A - // straight rounded average is not correct, as the decompressor 'unrounds' by replicating - // the top bits to the bottom. - // In order to take account of this process, we don't just apply a straight rounding correction, - // but base our rounding on the input value (a straight rounding is actually pretty good in terms of - // error measure, but creates a visual colour and/or brightness shift relative to the original image) - // The method used here is to apply a centre-biased rounding dependent on the input value, which was - // (mostly by experiment) found to give minimum MSE while preserving the visual characteristics of - // the image. - // rgb = (average_rgb + (left|right)*axisVectorRGB); - // ------------------------------------------------------------------------------------- - { - CGU_Vec3f MinColor, MaxColor; - - MinColor = average_rgb + (axisVectorRGB * axisleft); - MaxColor = average_rgb + (axisVectorRGB * axisright); - MinColor.z = (MinColor.z * 2) - MinColor.y; - MaxColor.z = (MaxColor.z * 2) - MaxColor.y; - - cgu_ProcessColors(CMP_REFINOUT MinColor, CMP_REFINOUT MaxColor, CMP_REFINOUT c0, CMP_REFINOUT c1, 1, false); - - // Force to be a 4-colour opaque block - in which case, c0 is greater than c1 - swap = 0; - if (c0 < c1) - { - CGU_UINT32 t; - t = c0; - c0 = c1; - c1 = t; - swap = 1; - } - else if (c0 == c1) - { - // This block will always be encoded in 3-colour mode - // Need to ensure that only one of the two points gets used, - // avoiding accidentally setting some transparent pixels into the block - for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++) - pos_on_axis[i] = axisleft; - } - - compressedBlock.x = c0 | (c1 << 16); - - // ------------------------------------------------------------------------------------- - // (9) Final clustering, creating the 2-bit values that define the output - // ------------------------------------------------------------------------------------- - - CGU_UINT32 index; - CGU_FLOAT division; - { - compressedBlock.y = 
0; - division = axisright * 2.0f / 3.0f; - axiscentre = (axisleft + axisright) / 2; // Actually, this code only works if centre is 0 or approximately so - - CGU_FLOAT CompMinErr; - - // This feature is work in progress - // remap to BC1 spec for decoding offsets, - // where cn[0] > cn[1] Max Color = index 0, 2/3 offset =index 2, 1/3 offset = index 3, Min Color = index 1 - // CGU_Vec3f cn[4]; - // cn[0] = MaxColor; - // cn[1] = MinColor; - // cn[2] = cn[0]*2.0f/3.0f + cn[1]*1.0f/3.0f; - // cn[3] = cn[0]*1.0f/3.0f + cn[1]*2.0f/3.0f; - - for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++) - { - // Endpoints (indicated by block > average) are 0 and 1, while - // interpolants are 2 and 3 - if (cmp_fabs(pos_on_axis[i]) >= division) - index = 0; - else - index = 2; - // Positive is in the latter half of the block - if (pos_on_axis[i] >= axiscentre) - index += 1; - - index = index ^ swap; - // Set the output, taking swapping into account - compressedBlock.y |= (index << (2 * i)); - - // use err calc for use in higher quality code - //CompMinErr += cmp_dotVec3f(srcRGBRef[i] - cn[index],srcRGBRef[i] - cn[index]); - } - - //CompMinErr = CompMinErr * 0.0208333f; - - CompMinErr = cgu_RGBBlockError(src_imageRGB, compressedBlock, isSRGB); - Q1CompErr = cgu_RGBBlockError(src_imageRGB, Q1CompData, isSRGB); - - if (CompMinErr > Q1CompErr) - { - compressedBlock = Q1CompData; - errout = Q1CompErr; - } - else - errout = CompMinErr; - } - } - // done - - return compressedBlock; -} - -CMP_STATIC CGU_UINT8 g_Match5Bit[256][2] = { - {0, 0}, {0, 0}, {1, 0}, {1, 0}, {0, 1}, {0, 1}, {0, 1}, {1, 1}, {1, 1}, {1, 1}, {0, 2}, {4, 0}, {1, 2}, {1, 2}, {1, 2}, - {2, 2}, {2, 2}, {2, 2}, {1, 3}, {5, 1}, {2, 3}, {2, 3}, {0, 4}, {3, 3}, {3, 3}, {3, 3}, {2, 4}, {2, 4}, {2, 4}, {5, 3}, - {1, 5}, {1, 5}, {2, 5}, {4, 4}, {4, 4}, {3, 5}, {3, 5}, {2, 6}, {2, 6}, {2, 6}, {3, 6}, {5, 5}, {5, 5}, {4, 6}, {8, 4}, - {3, 7}, {3, 7}, {3, 7}, {6, 6}, {6, 6}, {6, 6}, {5, 7}, {9, 5}, {6, 7}, {6, 7}, {4, 8}, {7, 7}, {7, 7}, 
{7, 7}, {6, 8}, - {6, 8}, {6, 8}, {9, 7}, {5, 9}, {5, 9}, {6, 9}, {8, 8}, {8, 8}, {7, 9}, {7, 9}, {6, 10}, {6, 10}, {6, 10}, {7, 10}, {9, 9}, - {9, 9}, {8, 10}, {12, 8}, {7, 11}, {7, 11}, {7, 11}, {10, 10}, {10, 10}, {10, 10}, {9, 11}, {13, 9}, {10, 11}, {10, 11}, {8, 12}, {11, 11}, - {11, 11}, {11, 11}, {10, 12}, {10, 12}, {10, 12}, {13, 11}, {9, 13}, {9, 13}, {10, 13}, {12, 12}, {12, 12}, {11, 13}, {11, 13}, {10, 14}, {10, 14}, - {10, 14}, {11, 14}, {13, 13}, {13, 13}, {12, 14}, {16, 12}, {11, 15}, {11, 15}, {11, 15}, {14, 14}, {14, 14}, {14, 14}, {13, 15}, {17, 13}, {14, 15}, - {14, 15}, {12, 16}, {15, 15}, {15, 15}, {15, 15}, {14, 16}, {14, 16}, {14, 16}, {17, 15}, {13, 17}, {13, 17}, {14, 17}, {16, 16}, {16, 16}, {15, 17}, - {15, 17}, {14, 18}, {14, 18}, {14, 18}, {15, 18}, {17, 17}, {17, 17}, {16, 18}, {20, 16}, {15, 19}, {15, 19}, {15, 19}, {18, 18}, {18, 18}, {18, 18}, - {17, 19}, {21, 17}, {18, 19}, {18, 19}, {16, 20}, {19, 19}, {19, 19}, {19, 19}, {18, 20}, {18, 20}, {18, 20}, {21, 19}, {17, 21}, {17, 21}, {18, 21}, - {20, 20}, {20, 20}, {19, 21}, {19, 21}, {18, 22}, {18, 22}, {18, 22}, {19, 22}, {21, 21}, {21, 21}, {20, 22}, {24, 20}, {19, 23}, {19, 23}, {19, 23}, - {22, 22}, {22, 22}, {22, 22}, {21, 23}, {25, 21}, {22, 23}, {22, 23}, {20, 24}, {23, 23}, {23, 23}, {23, 23}, {22, 24}, {22, 24}, {22, 24}, {25, 23}, - {21, 25}, {21, 25}, {22, 25}, {24, 24}, {24, 24}, {23, 25}, {23, 25}, {22, 26}, {22, 26}, {22, 26}, {23, 26}, {25, 25}, {25, 25}, {24, 26}, {28, 24}, - {23, 27}, {23, 27}, {23, 27}, {26, 26}, {26, 26}, {26, 26}, {25, 27}, {29, 25}, {26, 27}, {26, 27}, {24, 28}, {27, 27}, {27, 27}, {27, 27}, {26, 28}, - {26, 28}, {26, 28}, {29, 27}, {25, 29}, {25, 29}, {26, 29}, {28, 28}, {28, 28}, {27, 29}, {27, 29}, {26, 30}, {26, 30}, {26, 30}, {27, 30}, {29, 29}, - {29, 29}, {28, 30}, {28, 30}, {27, 31}, {27, 31}, {27, 31}, {30, 30}, {30, 30}, {30, 30}, {29, 31}, {29, 31}, {30, 31}, {30, 31}, {30, 31}, {31, 31}, - {31, 31}}; - -CMP_STATIC CGU_UINT8 
g_Match6Bit[256][2] = { - {0, 0}, {1, 0}, {0, 1}, {1, 1}, {1, 1}, {0, 2}, {1, 2}, {2, 2}, {2, 2}, {1, 3}, {0, 4}, {3, 3}, {3, 3}, {0, 5}, {1, 5}, - {4, 4}, {4, 4}, {1, 6}, {0, 7}, {5, 5}, {5, 5}, {0, 8}, {1, 8}, {6, 6}, {6, 6}, {1, 9}, {2, 9}, {7, 7}, {7, 7}, {2, 10}, - {3, 10}, {8, 8}, {8, 8}, {3, 11}, {4, 11}, {9, 9}, {9, 9}, {4, 12}, {5, 12}, {10, 10}, {10, 10}, {5, 13}, {6, 13}, {16, 8}, {11, 11}, - {6, 14}, {7, 14}, {17, 9}, {12, 12}, {7, 15}, {8, 15}, {16, 11}, {13, 13}, {10, 15}, {8, 16}, {9, 16}, {14, 14}, {13, 15}, {9, 17}, {10, 17}, - {15, 15}, {16, 15}, {10, 18}, {11, 18}, {12, 18}, {16, 16}, {11, 19}, {12, 19}, {13, 19}, {17, 17}, {12, 20}, {13, 20}, {14, 20}, {18, 18}, {13, 21}, - {14, 21}, {15, 21}, {19, 19}, {14, 22}, {15, 22}, {20, 20}, {20, 20}, {15, 23}, {16, 23}, {21, 21}, {21, 21}, {16, 24}, {17, 24}, {22, 22}, {22, 22}, - {17, 25}, {18, 25}, {23, 23}, {23, 23}, {18, 26}, {19, 26}, {24, 24}, {24, 24}, {19, 27}, {20, 27}, {25, 25}, {25, 25}, {20, 28}, {21, 28}, {26, 26}, - {26, 26}, {21, 29}, {22, 29}, {32, 24}, {27, 27}, {22, 30}, {23, 30}, {33, 25}, {28, 28}, {23, 31}, {24, 31}, {32, 27}, {29, 29}, {26, 31}, {24, 32}, - {25, 32}, {30, 30}, {29, 31}, {25, 33}, {26, 33}, {31, 31}, {32, 31}, {26, 34}, {27, 34}, {28, 34}, {32, 32}, {27, 35}, {28, 35}, {29, 35}, {33, 33}, - {28, 36}, {29, 36}, {30, 36}, {34, 34}, {29, 37}, {30, 37}, {31, 37}, {35, 35}, {30, 38}, {31, 38}, {36, 36}, {36, 36}, {31, 39}, {32, 39}, {37, 37}, - {37, 37}, {32, 40}, {33, 40}, {38, 38}, {38, 38}, {33, 41}, {34, 41}, {39, 39}, {39, 39}, {34, 42}, {35, 42}, {40, 40}, {40, 40}, {35, 43}, {36, 43}, - {41, 41}, {41, 41}, {36, 44}, {37, 44}, {42, 42}, {42, 42}, {37, 45}, {38, 45}, {48, 40}, {43, 43}, {38, 46}, {39, 46}, {49, 41}, {44, 44}, {39, 47}, - {40, 47}, {48, 43}, {45, 45}, {42, 47}, {40, 48}, {41, 48}, {46, 46}, {45, 47}, {41, 49}, {42, 49}, {47, 47}, {48, 47}, {42, 50}, {43, 50}, {44, 50}, - {48, 48}, {43, 51}, {44, 51}, {45, 51}, {49, 49}, {44, 52}, {45, 52}, {46, 52}, 
{50, 50}, {45, 53}, {46, 53}, {47, 53}, {51, 51}, {46, 54}, {47, 54}, - {52, 52}, {52, 52}, {47, 55}, {48, 55}, {53, 53}, {53, 53}, {48, 56}, {49, 56}, {54, 54}, {54, 54}, {49, 57}, {50, 57}, {55, 55}, {55, 55}, {50, 58}, - {51, 58}, {56, 56}, {56, 56}, {51, 59}, {52, 59}, {57, 57}, {57, 57}, {52, 60}, {53, 60}, {58, 58}, {58, 58}, {53, 61}, {54, 61}, {59, 59}, {59, 59}, - {54, 62}, {55, 62}, {60, 60}, {60, 60}, {55, 63}, {56, 63}, {61, 61}, {61, 61}, {58, 63}, {59, 63}, {62, 62}, {62, 62}, {61, 63}, {62, 63}, {63, 63}, - {63, 63}}; - -CMP_STATIC CGU_Vec2ui cgu_solidColorBlock(CMP_IN CGU_UINT8 Red, CMP_IN CGU_UINT8 Green, CMP_IN CGU_UINT8 Blue) -{ - CGU_UINT32 maxEndp16; - CGU_UINT32 minEndp16; - - CGU_UINT32 mask = 0xAAAAAAAAu; - - minEndp16 = g_Match5Bit[Red][0] * 2048U + g_Match6Bit[Green][0] * 32U + g_Match5Bit[Blue][0]; - maxEndp16 = g_Match5Bit[Red][1] * 2048U + g_Match6Bit[Green][1] * 32U + g_Match5Bit[Blue][1]; - - // write the color block - if (maxEndp16 < minEndp16) - { - CGU_UINT32 tmpValue = minEndp16; - minEndp16 = maxEndp16; - maxEndp16 = tmpValue; - mask ^= 0x55555555u; - } - - CGU_Vec2ui outputBytes; - outputBytes.x = CGU_UINT32(maxEndp16) | (CGU_UINT32(minEndp16) << 16u); - outputBytes.y = mask; - - return outputBytes; -} - -CMP_STATIC void cmp_get_encode_data(CMP_IN CMP_EncodeData CMP_REFINOUT edata, CMP_IN CMP_CONSTANT CGU_Vec4uc src_image[16]) -{ - CMP_CONSTANT CGU_UINT32 fr = src_image[0].r, fg = src_image[0].g, fb = src_image[0].b; - - edata.all_colors_equal = false; - - edata.total.r = fr; - edata.total.g = fg; - edata.total.b = fb; - edata.max.r = fr; - edata.max.g = fg; - edata.max.b = fb; - edata.min.r = fr; - edata.min.g = fg; - edata.min.b = fb; - - edata.grayscale_flag = (fr == fg) && (fr == fb); - edata.any_black_pixels = (fr | fg | fb) < 4; - - for (CGU_UINT32 i = 1; i < 16; i++) - { - CMP_CONSTANT CGU_INT r = src_image[i].r, g = src_image[i].g, b = src_image[i].b; - - edata.grayscale_flag &= ((r == g) && (r == b)); - 
edata.any_black_pixels |= ((r | g | b) < 4); - - edata.max.r = CMP_MAX(edata.max.r, r); - edata.max.g = CMP_MAX(edata.max.g, g); - edata.max.b = CMP_MAX(edata.max.b, b); - edata.min.r = CMP_MIN(edata.min.r, r); - edata.min.g = CMP_MIN(edata.min.g, g); - edata.min.b = CMP_MIN(edata.min.b, b); - edata.total.r += r; - edata.total.g += g; - edata.total.b += b; - } - - edata.avg.r = (edata.total.r + 8) >> 4; - edata.avg.g = (edata.total.g + 8) >> 4; - edata.avg.b = (edata.total.b + 8) >> 4; -} - -#ifndef ASPM_GPU -/*------------------------------------------------------------------------------------------------ -1 DIM ramp -------------------------------------------------------------------------------------------------*/ -CMP_STATIC inline void cpu_BldClrRmp(CGU_FLOAT _Rmp[MAX_POINTS], CGU_FLOAT _InpRmp[NUM_ENDPOINTS], CGU_UINT32 dwNumPoints) -{ - CGU_UINT32 dwRndAmount[9] = {0, 0, 0, 0, 1, 1, 2, 2, 3}; - - // linear interpolate end points to get the ramp - _Rmp[0] = _InpRmp[0]; - _Rmp[dwNumPoints - 1] = _InpRmp[1]; - if (dwNumPoints % 2) - _Rmp[dwNumPoints] = 1000000.f; // for 3 point ramp; not to select the 4th point as min - for (CGU_UINT32 e = 1; e < dwNumPoints - 1; e++) - _Rmp[e] = cmp_floor((_Rmp[0] * (dwNumPoints - 1 - e) + _Rmp[dwNumPoints - 1] * e + dwRndAmount[dwNumPoints]) / (CGU_FLOAT)(dwNumPoints - 1)); -} - -/*------------------------------------------------------------------------------------------------ -// build 3D ramp -------------------------------------------------------------------------------------------------*/ -CMP_STATIC inline void cpu_BldRmp(CGU_FLOAT _Rmp[NUM_CHANNELS][MAX_POINTS], CGU_FLOAT _InpRmp[NUM_CHANNELS][NUM_ENDPOINTS], CGU_UINT32 dwNumPoints) -{ - for (CGU_UINT32 j = 0; j < 3; j++) - cpu_BldClrRmp(_Rmp[j], _InpRmp[j], dwNumPoints); -} - -/*------------------------------------------------------------------------------------------------ -// this is how the end points is going to be look like when decompressed 
-------------------------------------------------------------------------------------------------*/ -CMP_STATIC inline void cpu_MkWkRmpPts(CMP_INOUT CGU_UINT8 CMP_REFINOUT _bEq, - CGU_FLOAT _OutRmpPts[NUM_CHANNELS][NUM_ENDPOINTS], - CGU_FLOAT _InpRmpPts[NUM_CHANNELS][NUM_ENDPOINTS], - CGU_UINT8 nRedBits, - CGU_UINT8 nGreenBits, - CGU_UINT8 nBlueBits) -{ - CGU_FLOAT Fctrs[3]; - Fctrs[RC] = (CGU_FLOAT)(1 << nRedBits); - Fctrs[GC] = (CGU_FLOAT)(1 << nGreenBits); - Fctrs[BC] = (CGU_FLOAT)(1 << nBlueBits); - - CGU_BOOL bEq = true; - // find whether input ramp is flat - for (CGU_UINT32 j = 0; j < 3; j++) - bEq &= (_InpRmpPts[j][0] == _InpRmpPts[j][1]); - - _bEq = bEq ? 1 : 0; - - // end points on the integer grid - for (CGU_UINT32 j = 0; j < 3; j++) - { - for (CGU_UINT32 k = 0; k < 2; k++) - { - // Apply the lower bit replication to give full dynamic range - _OutRmpPts[j][k] = _InpRmpPts[j][k] + cmp_floor(_InpRmpPts[j][k] / Fctrs[j]); - _OutRmpPts[j][k] = cmp_max(_OutRmpPts[j][k], 0.f); - _OutRmpPts[j][k] = cmp_min(_OutRmpPts[j][k], 255.f); - } - } -} - -// Compute error and find DXTC indexes for the current cluster -CMP_STATIC CGU_FLOAT cpu_ClstrIntnl(CGU_FLOAT _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS], - CGU_UINT8 pcIndices[BLOCK_SIZE_4X4], - CGU_FLOAT _Rmp[NUM_CHANNELS][MAX_POINTS], - int dwBlockSize, - CGU_UINT8 dwNumPoints, - bool _ConstRamp, - CGU_FLOAT _pfWeights[3], - bool _bUseAlpha) -{ - CGU_FLOAT Err = 0.f; - CGU_UINT8 rmp_l = (_ConstRamp) ? 
1 : dwNumPoints; - - // For each colour in the original block assign it - // to the closest cluster and compute the cumulative error - for (int i = 0; i < dwBlockSize; i++) - { - if (_bUseAlpha && *((CGU_UINT32*)&_Blk[i][AC]) == 0) - pcIndices[i] = dwNumPoints; - else - { - CGU_FLOAT shortest = 99999999999.f; - CGU_UINT8 shortestIndex = 0; - CGU_UINT8 r; - if ((_pfWeights[0] != 1.0f) || (_pfWeights[1] != 1.0f) || (_pfWeights[2] != 1.0f)) - for (r = 0; r < rmp_l; r++) - { - // calculate the distance for each component - CGU_FLOAT distance = (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) * _pfWeights[0] + - (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) * _pfWeights[1] + - (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]) * _pfWeights[2]; - - if (distance < shortest) - { - shortest = distance; - shortestIndex = r; - } - } - else - for (r = 0; r < rmp_l; r++) - { - // calculate the distance for each component - CGU_FLOAT distance = (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) + (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) + - (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]); - - if (distance < shortest) - { - shortest = distance; - shortestIndex = r; - } - } - - Err += shortest; - - // We have the index of the best cluster, so assign this in the block - // Reorder indices to match correct DXTC ordering - if (shortestIndex == dwNumPoints - 1) - shortestIndex = 1; - else if (shortestIndex) - shortestIndex++; - pcIndices[i] = shortestIndex; - } - } - - return Err; -} - -/*------------------------------------------------------------------------------------------------ -// input ramp is on the coarse grid -------------------------------------------------------------------------------------------------*/ -CMP_STATIC CGU_FLOAT cpu_ClstrBas(CGU_UINT8 pcIndices[BLOCK_SIZE_4X4], - CGU_FLOAT _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS], - CGU_FLOAT _InpRmp[NUM_CHANNELS][NUM_ENDPOINTS], - int dwBlockSize, - CGU_UINT8 
dwNumPoints, - CGU_FLOAT _pfWeights[3], - bool _bUseAlpha, - CGU_UINT8 nRedBits, - CGU_UINT8 nGreenBits, - CGU_UINT8 nBlueBits) -{ - // make ramp endpoints the way they'll going to be decompressed - CGU_UINT8 Eq = 1; - CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS]; - cpu_MkWkRmpPts(Eq, InpRmp, _InpRmp, nRedBits, nGreenBits, nBlueBits); - - // build ramp as it would be built by decompressor - CGU_FLOAT Rmp[NUM_CHANNELS][MAX_POINTS]; - cpu_BldRmp(Rmp, InpRmp, dwNumPoints); - - // clusterize and find a cumulative error - return cpu_ClstrIntnl(_Blk, pcIndices, Rmp, dwBlockSize, dwNumPoints, Eq, _pfWeights, _bUseAlpha); -} - -CMP_STATIC CGU_UINT8 nByteBitsMask2[9] = {0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; - -CMP_STATIC CGU_UINT32 cpu_ConstructColor2(CGU_UINT8 R, CGU_UINT8 nRedBits, CGU_UINT8 G, CGU_UINT8 nGreenBits, CGU_UINT8 B, CGU_UINT8 nBlueBits) -{ - return (((R & nByteBitsMask2[nRedBits]) << (nGreenBits + nBlueBits - (PIX_GRID - nRedBits))) | - ((G & nByteBitsMask2[nGreenBits]) << (nBlueBits - (PIX_GRID - nGreenBits))) | ((B & nByteBitsMask2[nBlueBits]) >> ((PIX_GRID - nBlueBits)))); -} - -CMP_STATIC CGU_FLOAT cpu_Clstr(CGU_UINT32 block_32[BLOCK_SIZE_4X4], - CGU_UINT32 dwBlockSize, - CGU_UINT8 nEndpoints[3][NUM_ENDPOINTS], - CGU_UINT8 pcIndices[BLOCK_SIZE_4X4], - CGU_UINT8 dwNumPoints, - CGU_FLOAT _pfWeights[3], - bool _bUseAlpha, - CGU_UINT8 _nAlphaThreshold, - CGU_UINT8 nRedBits, - CGU_UINT8 nGreenBits, - CGU_UINT8 nBlueBits) -{ - CGU_UINT32 c0 = cpu_ConstructColor2(nEndpoints[RC][0], nRedBits, nEndpoints[GC][0], nGreenBits, nEndpoints[BC][0], nBlueBits); - CGU_UINT32 c1 = cpu_ConstructColor2(nEndpoints[RC][1], nRedBits, nEndpoints[GC][1], nGreenBits, nEndpoints[BC][1], nBlueBits); - CGU_UINT32 nEndpointIndex0 = 0; - CGU_UINT32 nEndpointIndex1 = 1; - if ((!(dwNumPoints & 0x1) && c0 <= c1) || ((dwNumPoints & 0x1) && c0 > c1)) - { - nEndpointIndex0 = 1; - nEndpointIndex1 = 0; - } - - CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS]; - InpRmp[RC][0] = 
(CGU_FLOAT)nEndpoints[RC][nEndpointIndex0]; - InpRmp[RC][1] = (CGU_FLOAT)nEndpoints[RC][nEndpointIndex1]; - InpRmp[GC][0] = (CGU_FLOAT)nEndpoints[GC][nEndpointIndex0]; - InpRmp[GC][1] = (CGU_FLOAT)nEndpoints[GC][nEndpointIndex1]; - InpRmp[BC][0] = (CGU_FLOAT)nEndpoints[BC][nEndpointIndex0]; - InpRmp[BC][1] = (CGU_FLOAT)nEndpoints[BC][nEndpointIndex1]; - - CGU_UINT32 dwAlphaThreshold = _nAlphaThreshold << 24; - CGU_FLOAT Blk[BLOCK_SIZE_4X4][NUM_CHANNELS]; - for (CGU_UINT32 i = 0; i < dwBlockSize; i++) - { - Blk[i][RC] = (CGU_FLOAT)((block_32[i] & 0xff0000) >> 16); - Blk[i][GC] = (CGU_FLOAT)((block_32[i] & 0xff00) >> 8); - Blk[i][BC] = (CGU_FLOAT)(block_32[i] & 0xff); - if (_bUseAlpha) - Blk[i][AC] = ((block_32[i] & 0xff000000) >= dwAlphaThreshold) ? 1.f : 0.f; - } - - return cpu_ClstrBas(pcIndices, Blk, InpRmp, dwBlockSize, dwNumPoints, _pfWeights, _bUseAlpha, nRedBits, nGreenBits, nBlueBits); -} - -/*------------------------------------------------------------------------------------------------ -Compute cumulative error for the current cluster -------------------------------------------------------------------------------------------------*/ -CMP_STATIC CGU_FLOAT cpu_ClstrErr(CGU_FLOAT _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS], - CGU_FLOAT _Rpt[BLOCK_SIZE_4X4], - CGU_FLOAT _Rmp[NUM_CHANNELS][MAX_POINTS], - CGU_UINT32 _NmbClrs, - CGU_UINT32 _blcktp, - bool _ConstRamp, - CGU_Vec3f channelWeights) -{ - CGU_FLOAT fError = 0.f; - CGU_UINT32 rmp_l = (_ConstRamp) ? 
1 : _blcktp; - - CGU_BOOL useWeights = ((channelWeights[0] != 1.0f) || (channelWeights[1] != 1.0f) || (channelWeights[2] != 1.0f)); - - // For each colour in the original block, find the closest cluster - // and compute the comulative error - for (CGU_UINT32 i = 0; i < _NmbClrs; i++) - { - CGU_FLOAT fShortest = 99999999999.f; - - if (useWeights) - for (CGU_UINT32 r = 0; r < rmp_l; r++) - { - // calculate the distance for each component - CGU_FLOAT fDistance = (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) * channelWeights[0] + - (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) * channelWeights[1] + - (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]) * channelWeights[2]; - - if (fDistance < fShortest) - fShortest = fDistance; - } - else - for (CGU_UINT32 r = 0; r < rmp_l; r++) - { - // calculate the distance for each component - CGU_FLOAT fDistance = (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) + (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) + - (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]); - - if (fDistance < fShortest) - fShortest = fDistance; - } - - // accumulate the error - fError += fShortest * _Rpt[i]; - } - - return fError; -} - -#if defined(USE_REFINE3D) - -CMP_STATIC CGU_FLOAT cmp_Refine3D(CGU_FLOAT _OutRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS], - CGU_FLOAT _InpRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS], - CGU_FLOAT _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS], - CGU_FLOAT _Rpt[BLOCK_SIZE_4X4], - CGU_UINT32 _NmrClrs, - CGU_UINT32 dwNumPoints, - CGU_Vec3f channelWeights, - CGU_UINT8 nRedBits, - CGU_UINT8 nGreenBits, - CGU_UINT8 nBlueBits, - CGU_UINT32 nRefineSteps) -{ - CGU_FLOAT ALIGN_16 Rmp[NUM_CHANNELS][MAX_POINTS]; - - CGU_FLOAT Blk[BLOCK_SIZE_4X4][NUM_CHANNELS]; - for (CGU_UINT32 i = 0; i < _NmrClrs; i++) - for (CGU_UINT32 j = 0; j < 3; j++) - Blk[i][j] = _Blk[i][j]; - - CGU_FLOAT fWeightRed = channelWeights.r; - CGU_FLOAT fWeightGreen = channelWeights.g; - CGU_FLOAT fWeightBlue = channelWeights.b; - - // 
here is our grid - CGU_FLOAT Fctrs[3]; - Fctrs[RC] = (CGU_FLOAT)(1 << (PIX_GRID - nRedBits)); - Fctrs[GC] = (CGU_FLOAT)(1 << (PIX_GRID - nGreenBits)); - Fctrs[BC] = (CGU_FLOAT)(1 << (PIX_GRID - nBlueBits)); - - CGU_FLOAT InpRmp0[NUM_CHANNELS][NUM_ENDPOINTS]; - CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS]; - for (CGU_UINT32 k = 0; k < 2; k++) - for (CGU_UINT32 j = 0; j < 3; j++) - InpRmp0[j][k] = InpRmp[j][k] = _OutRmpPnts[j][k] = _InpRmpPnts[j][k]; - - // make ramp endpoints the way they'll going to be decompressed - // plus check whether the ramp is flat - CGU_UINT8 Eq; - CGU_FLOAT WkRmpPts[NUM_CHANNELS][NUM_ENDPOINTS]; - cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); - - // build ramp for all 3 colors - cpu_BldRmp(Rmp, WkRmpPts, dwNumPoints); - - // clusterize for the current ramp - CGU_FLOAT bestE = cpu_ClstrErr(Blk, _Rpt, Rmp, _NmrClrs, dwNumPoints, Eq, channelWeights); - if (bestE == 0.f) // if exact, we've done - return bestE; - - // Jitter endpoints in each direction - CGU_INT nRefineStart = 0 - (cmp_min(nRefineSteps, (CGU_UINT8)8)); - CGU_INT nRefineEnd = cmp_min(nRefineSteps, (CGU_UINT8)8); - for (CGU_INT nJitterG0 = nRefineStart; nJitterG0 <= nRefineEnd; nJitterG0++) - { - InpRmp[GC][0] = cmp_min(cmp_max(InpRmp0[GC][0] + nJitterG0 * Fctrs[GC], 0.f), 255.f); - for (CGU_INT nJitterG1 = nRefineStart; nJitterG1 <= nRefineEnd; nJitterG1++) - { - InpRmp[GC][1] = cmp_min(cmp_max(InpRmp0[GC][1] + nJitterG1 * Fctrs[GC], 0.f), 255.f); - cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); - cpu_BldClrRmp(Rmp[GC], WkRmpPts[GC], dwNumPoints); - - CGU_FLOAT RmpErrG[MAX_POINTS][BLOCK_SIZE_4X4]; - for (CGU_UINT32 i = 0; i < _NmrClrs; i++) - { - for (CGU_UINT32 r = 0; r < dwNumPoints; r++) - { - CGU_FLOAT DistG = (Rmp[GC][r] - Blk[i][GC]); - RmpErrG[r][i] = DistG * DistG * fWeightGreen; - } - } - - for (CGU_INT nJitterB0 = nRefineStart; nJitterB0 <= nRefineEnd; nJitterB0++) - { - InpRmp[BC][0] = cmp_min(cmp_max(InpRmp0[BC][0] + 
nJitterB0 * Fctrs[BC], 0.f), 255.f); - for (CGU_INT nJitterB1 = nRefineStart; nJitterB1 <= nRefineEnd; nJitterB1++) - { - InpRmp[BC][1] = cmp_min(cmp_max(InpRmp0[BC][1] + nJitterB1 * Fctrs[BC], 0.f), 255.f); - cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); - cpu_BldClrRmp(Rmp[BC], WkRmpPts[BC], dwNumPoints); - - CGU_FLOAT RmpErr[MAX_POINTS][BLOCK_SIZE_4X4]; - for (CGU_UINT32 i = 0; i < _NmrClrs; i++) - { - for (CGU_UINT32 r = 0; r < dwNumPoints; r++) - { - CGU_FLOAT DistB = (Rmp[BC][r] - Blk[i][BC]); - RmpErr[r][i] = RmpErrG[r][i] + DistB * DistB * fWeightBlue; - } - } - - for (CGU_INT nJitterR0 = nRefineStart; nJitterR0 <= nRefineEnd; nJitterR0++) - { - InpRmp[RC][0] = cmp_min(cmp_max(InpRmp0[RC][0] + nJitterR0 * Fctrs[RC], 0.f), 255.f); - for (CGU_INT nJitterR1 = nRefineStart; nJitterR1 <= nRefineEnd; nJitterR1++) - { - InpRmp[RC][1] = cmp_min(cmp_max(InpRmp0[RC][1] + nJitterR1 * Fctrs[RC], 0.f), 255.f); - cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); - cpu_BldClrRmp(Rmp[RC], WkRmpPts[RC], dwNumPoints); - - // compute cumulative error - CGU_FLOAT mse = 0.f; - CGU_INT rmp_l = (Eq > 0) ? 
1 : dwNumPoints; - for (CGU_UINT32 k = 0; k < _NmrClrs; k++) - { - CGU_FLOAT MinErr = 10000000.f; - for (CGU_INT r = 0; r < rmp_l; r++) - { - CGU_FLOAT Dist = (Rmp[RC][r] - Blk[k][RC]); - CGU_FLOAT Err = RmpErr[r][k] + Dist * Dist * fWeightRed; - MinErr = cmp_min(MinErr, Err); - } - mse += MinErr * _Rpt[k]; - } - - // save if we achieve better result - if (mse < bestE) - { - bestE = mse; - for (CGU_UINT32 k = 0; k < 2; k++) - for (CGU_UINT32 j = 0; j < 3; j++) - _OutRmpPnts[j][k] = InpRmp[j][k]; - } - } - } - } - } - } - } - - return bestE; -} -#endif - -#if defined(USE_REFINE) -CMP_STATIC CGU_FLOAT cmp_Refine(CGU_FLOAT _OutRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS], - CGU_FLOAT _InpRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS], - CGU_FLOAT _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS], - CGU_FLOAT _Rpt[BLOCK_SIZE_4X4], - CGU_INT _NmrClrs, - CGU_UINT8 dwNumPoints, - CGU_Vec3f channelWeights, - CGU_UINT32 nRedBits, - CGU_UINT32 nGreenBits, - CGU_UINT32 nBlueBits, - CGU_UINT32 nRefineSteps) -{ - CGU_FLOAT ALIGN_16 Rmp[NUM_CHANNELS][MAX_POINTS]; - - if (nRefineSteps == 0) - nRefineSteps = 1; - - CGU_FLOAT Blk[BLOCK_SIZE_4X4][NUM_CHANNELS]; - for (CGU_INT i = 0; i < _NmrClrs; i++) - for (CGU_INT j = 0; j < 3; j++) - Blk[i][j] = _Blk[i][j]; - - CGU_FLOAT fWeightRed = channelWeights.r; - CGU_FLOAT fWeightGreen = channelWeights.g; - CGU_FLOAT fWeightBlue = channelWeights.b; - - // here is our grid - CGU_FLOAT Fctrs[3]; - Fctrs[RC] = (CGU_FLOAT)(1 << (PIX_GRID - nRedBits)); - Fctrs[GC] = (CGU_FLOAT)(1 << (PIX_GRID - nGreenBits)); - Fctrs[BC] = (CGU_FLOAT)(1 << (PIX_GRID - nBlueBits)); - - CGU_FLOAT InpRmp0[NUM_CHANNELS][NUM_ENDPOINTS]; - CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS]; - for (CGU_INT k = 0; k < 2; k++) - for (CGU_INT j = 0; j < 3; j++) - InpRmp0[j][k] = InpRmp[j][k] = _OutRmpPnts[j][k] = _InpRmpPnts[j][k]; - - // make ramp endpoints the way they'll going to be decompressed - // plus check whether the ramp is flat - CGU_UINT8 Eq; - CGU_FLOAT WkRmpPts[NUM_CHANNELS][NUM_ENDPOINTS]; - 
cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); - - // build ramp for all 3 colors - cpu_BldRmp(Rmp, WkRmpPts, dwNumPoints); - - // clusterize for the current ramp - CGU_FLOAT bestE = cpu_ClstrErr(Blk, _Rpt, Rmp, _NmrClrs, dwNumPoints, Eq, channelWeights); - if (bestE == 0.f) // || !nRefineSteps) // if exact, we've done - return bestE; - - // Tweak each component in isolation and get the best values - - // precompute ramp errors for Green and Blue - CGU_FLOAT RmpErr[MAX_POINTS][BLOCK_SIZE_4X4]; - for (CGU_INT i = 0; i < _NmrClrs; i++) - { - for (CGU_INT r = 0; r < dwNumPoints; r++) - { - CGU_FLOAT DistG = (Rmp[GC][r] - Blk[i][GC]); - CGU_FLOAT DistB = (Rmp[BC][r] - Blk[i][BC]); - RmpErr[r][i] = DistG * DistG * fWeightGreen + DistB * DistB * fWeightBlue; - } - } - - // First Red - CGU_FLOAT bstC0 = InpRmp0[RC][0]; - CGU_FLOAT bstC1 = InpRmp0[RC][1]; - CGU_INT nRefineStart = 0 - (cmp_min(nRefineSteps, (CGU_UINT8)8)); - CGU_INT nRefineEnd = cmp_min(nRefineSteps, (CGU_UINT8)8); - for (CGU_INT i = nRefineStart; i <= nRefineEnd; i++) - { - for (CGU_INT j = nRefineStart; j <= nRefineEnd; j++) - { - // make a move; both sides of interval. - InpRmp[RC][0] = cmp_min(cmp_max(InpRmp0[RC][0] + i * Fctrs[RC], 0.f), 255.f); - InpRmp[RC][1] = cmp_min(cmp_max(InpRmp0[RC][1] + j * Fctrs[RC], 0.f), 255.f); - - // make ramp endpoints the way they'll going to be decompressed - // plus check whether the ramp is flat - cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); - - // build ramp only for red - cpu_BldClrRmp(Rmp[RC], WkRmpPts[RC], dwNumPoints); - - // compute cumulative error - CGU_FLOAT mse = 0.f; - CGU_INT rmp_l = (Eq > 0) ? 
1 : dwNumPoints; - for (CGU_INT k = 0; k < _NmrClrs; k++) - { - CGU_FLOAT MinErr = 10000000.f; - for (CGU_INT r = 0; r < rmp_l; r++) - { - CGU_FLOAT Dist = (Rmp[RC][r] - Blk[k][RC]); - CGU_FLOAT Err = RmpErr[r][k] + Dist * Dist * fWeightRed; - MinErr = cmp_minf(MinErr, Err); - } - mse += MinErr * _Rpt[k]; - } - - // save if we achieve better result - if (mse < bestE) - { - bstC0 = InpRmp[RC][0]; - bstC1 = InpRmp[RC][1]; - bestE = mse; - } - } - } - - // our best REDs - InpRmp[RC][0] = bstC0; - InpRmp[RC][1] = bstC1; - - // make ramp endpoints the way they'll going to be decompressed - // plus check whether the ramp is flat - cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); - - // build ramp only for green - cpu_BldRmp(Rmp, WkRmpPts, dwNumPoints); - - // precompute ramp errors for Red and Blue - for (CGU_INT i = 0; i < _NmrClrs; i++) - { - for (CGU_INT r = 0; r < dwNumPoints; r++) - { - CGU_FLOAT DistR = (Rmp[RC][r] - Blk[i][RC]); - CGU_FLOAT DistB = (Rmp[BC][r] - Blk[i][BC]); - RmpErr[r][i] = DistR * DistR * fWeightRed + DistB * DistB * fWeightBlue; - } - } - - // Now green - bstC0 = InpRmp0[GC][0]; - bstC1 = InpRmp0[GC][1]; - for (CGU_INT i = nRefineStart; i <= nRefineEnd; i++) - { - for (CGU_INT j = nRefineStart; j <= nRefineEnd; j++) - { - InpRmp[GC][0] = cmp_minf(cmp_maxf(InpRmp0[GC][0] + i * Fctrs[GC], 0.f), 255.f); - InpRmp[GC][1] = cmp_minf(cmp_maxf(InpRmp0[GC][1] + j * Fctrs[GC], 0.f), 255.f); - - cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); - cpu_BldClrRmp(Rmp[GC], WkRmpPts[GC], dwNumPoints); - - CGU_FLOAT mse = 0.f; - CGU_INT rmp_l = (Eq > 0) ? 
1 : dwNumPoints; - for (CGU_INT k = 0; k < _NmrClrs; k++) - { - CGU_FLOAT MinErr = 10000000.f; - for (CGU_INT r = 0; r < rmp_l; r++) - { - CGU_FLOAT Dist = (Rmp[GC][r] - Blk[k][GC]); - CGU_FLOAT Err = RmpErr[r][k] + Dist * Dist * fWeightGreen; - MinErr = cmp_minf(MinErr, Err); - } - mse += MinErr * _Rpt[k]; - } - - if (mse < bestE) - { - bstC0 = InpRmp[GC][0]; - bstC1 = InpRmp[GC][1]; - bestE = mse; - } - } - } - - // our best GREENs - InpRmp[GC][0] = bstC0; - InpRmp[GC][1] = bstC1; - - cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); - cpu_BldRmp(Rmp, WkRmpPts, dwNumPoints); - - // ramp err for Red and Green - for (CGU_INT i = 0; i < _NmrClrs; i++) - { - for (CGU_INT r = 0; r < dwNumPoints; r++) - { - CGU_FLOAT DistR = (Rmp[RC][r] - Blk[i][RC]); - CGU_FLOAT DistG = (Rmp[GC][r] - Blk[i][GC]); - RmpErr[r][i] = DistR * DistR * fWeightRed + DistG * DistG * fWeightGreen; - } - } - - bstC0 = InpRmp0[BC][0]; - bstC1 = InpRmp0[BC][1]; - // Now blue - for (CGU_INT i = nRefineStart; i <= nRefineEnd; i++) - { - for (CGU_INT j = nRefineStart; j <= nRefineEnd; j++) - { - InpRmp[BC][0] = min(max(InpRmp0[BC][0] + i * Fctrs[BC], 0.f), 255.f); - InpRmp[BC][1] = min(max(InpRmp0[BC][1] + j * Fctrs[BC], 0.f), 255.f); - - cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); - cpu_BldClrRmp(Rmp[BC], WkRmpPts[BC], dwNumPoints); - - CGU_FLOAT mse = 0.f; - CGU_INT rmp_l = (Eq > 0) ? 
1 : dwNumPoints; - for (CGU_INT k = 0; k < _NmrClrs; k++) - { - CGU_FLOAT MinErr = 10000000.f; - for (CGU_INT r = 0; r < rmp_l; r++) - { - CGU_FLOAT Dist = (Rmp[BC][r] - Blk[k][BC]); - CGU_FLOAT Err = RmpErr[r][k] + Dist * Dist * fWeightBlue; - MinErr = min(MinErr, Err); - } - mse += MinErr * _Rpt[k]; - } - - if (mse < bestE) - { - bstC0 = InpRmp[BC][0]; - bstC1 = InpRmp[BC][1]; - bestE = mse; - } - } - } - - // our best BLUEs - InpRmp[BC][0] = bstC0; - InpRmp[BC][1] = bstC1; - - // return our best choice - for (CGU_INT j = 0; j < 3; j++) - for (CGU_INT k = 0; k < 2; k++) - _OutRmpPnts[j][k] = InpRmp[j][k]; - - return bestE; -} - -#endif - -//====================================================================================== -// Codec from CompressonatorLib -//====================================================================================== -#define BLOCK_SIZE_4X4 16 -#define RG 5 -#define GG 6 -#define BG 5 - -/*------------------------------------------------------------------------------------------------ -// this is how the end points is going to be rounded in compressed format -------------------------------------------------------------------------------------------------*/ -CMP_STATIC void cpu_MkRmpOnGrid(CGU_FLOAT _RmpF[NUM_CHANNELS][NUM_ENDPOINTS], - CGU_FLOAT _MnMx[NUM_CHANNELS][NUM_ENDPOINTS], - CGU_FLOAT _Min, - CGU_FLOAT _Max, - CGU_UINT8 nRedBits, - CGU_UINT8 nGreenBits, - CGU_UINT8 nBlueBits) -{ - CGU_FLOAT Fctrs0[3]; - CGU_FLOAT Fctrs1[3]; - - Fctrs1[RC] = (CGU_FLOAT)(1 << nRedBits); - Fctrs1[GC] = (CGU_FLOAT)(1 << nGreenBits); - Fctrs1[BC] = (CGU_FLOAT)(1 << nBlueBits); - Fctrs0[RC] = (CGU_FLOAT)(1 << (PIX_GRID - nRedBits)); - Fctrs0[GC] = (CGU_FLOAT)(1 << (PIX_GRID - nGreenBits)); - Fctrs0[BC] = (CGU_FLOAT)(1 << (PIX_GRID - nBlueBits)); - - for (int j = 0; j < 3; j++) - { - for (int k = 0; k < 2; k++) - { - _RmpF[j][k] = cmp_floor(_MnMx[j][k]); - if (_RmpF[j][k] <= _Min) - _RmpF[j][k] = _Min; - else - { - _RmpF[j][k] += cmp_floor(128.f / 
Fctrs1[j]) - cmp_floor(_RmpF[j][k] / Fctrs1[j]); - _RmpF[j][k] = cmp_minf(_RmpF[j][k], _Max); - } - - _RmpF[j][k] = cmp_floor(_RmpF[j][k] / Fctrs0[j]) * Fctrs0[j]; - } - } -} - -// Find the first approximation of the line -// Assume there is a linear relation -// Z = a * X_In -// Z = b * Y_In -// Find a,b to minimize MSE between Z and Z_In -CMP_STATIC void cpu_FindAxis(CMP_OUT CGU_FLOAT BlkSh[BLOCK_SIZE_4X4][NUM_CHANNELS], - CMP_IN CGU_FLOAT LineDir0[NUM_CHANNELS], - CMP_IN CGU_FLOAT fBlockCenter[NUM_CHANNELS], - CMP_OUT CGU_UINT8 CMP_REFINOUT AxisIsSmall, - CMP_IN CGU_FLOAT BlkUV[BLOCK_SIZE_4X4][NUM_CHANNELS], - CMP_IN CGU_FLOAT _inpRpt[BLOCK_SIZE_4X4], - CMP_IN int nDimensions, - CMP_IN int dwUniqueColors) -{ - CGU_FLOAT Crrl[NUM_CHANNELS]; - CGU_FLOAT RGB2[NUM_CHANNELS]; - CGU_INT i; - - LineDir0[0] = LineDir0[1] = LineDir0[2] = RGB2[0] = RGB2[1] = RGB2[2] = Crrl[0] = Crrl[1] = Crrl[2] = fBlockCenter[0] = fBlockCenter[1] = fBlockCenter[2] = - 0.f; - - // sum position of all points - CGU_FLOAT fNumPoints = 0.f; - for (i = 0; i < dwUniqueColors; i++) - { - fBlockCenter[0] += BlkUV[i][0] * _inpRpt[i]; - fBlockCenter[1] += BlkUV[i][1] * _inpRpt[i]; - fBlockCenter[2] += BlkUV[i][2] * _inpRpt[i]; - fNumPoints += _inpRpt[i]; - } - - // and then average to calculate center coordinate of block - fBlockCenter[0] /= fNumPoints; - fBlockCenter[1] /= fNumPoints; - fBlockCenter[2] /= fNumPoints; - - for (i = 0; i < dwUniqueColors; i++) - { - // calculate output block as offsets around block center - BlkSh[i][0] = BlkUV[i][0] - fBlockCenter[0]; - BlkSh[i][1] = BlkUV[i][1] - fBlockCenter[1]; - BlkSh[i][2] = BlkUV[i][2] - fBlockCenter[2]; - - // compute correlation matrix - // RGB2 = sum of ((distance from point from center) squared) - // Crrl = ???????. 
Seems to be be some calculation based on distance from point center in two dimensions - for (int j = 0; j < nDimensions; j++) - { - RGB2[j] += BlkSh[i][j] * BlkSh[i][j] * _inpRpt[i]; - Crrl[j] += BlkSh[i][j] * BlkSh[i][(j + 1) % 3] * _inpRpt[i]; - } - } - - // if set's diameter is small - int i0 = 0, i1 = 1; - CGU_FLOAT mxRGB2 = 0.f; - int k = 0, j = 0; - CGU_FLOAT fEPS = fNumPoints * EPS; - for (k = 0, j = 0; j < 3; j++) - { - if (RGB2[j] >= fEPS) - k++; - else - RGB2[j] = 0.f; - - if (mxRGB2 < RGB2[j]) - { - mxRGB2 = RGB2[j]; - i0 = j; - } - } - - CGU_FLOAT fEPS2 = fNumPoints * EPS2; - AxisIsSmall = 1; - for (j = 0; j < 3; j++) - { - AxisIsSmall &= (RGB2[j] < fEPS2); - } - - if (AxisIsSmall) // all are very small to avoid division on the small determinant - return; - - if (k == 1) // really only 1 dimension - LineDir0[i0] = 1.; - else if (k == 2) - { // really only 2 dimensions - i1 = (RGB2[(i0 + 1) % 3] > 0.f) ? (i0 + 1) % 3 : (i0 + 2) % 3; - CGU_FLOAT Crl = (i1 == (i0 + 1) % 3) ? Crrl[i0] : Crrl[(i0 + 2) % 3]; - LineDir0[i1] = Crl / RGB2[i0]; - LineDir0[i0] = 1.; - } - else - { - CGU_FLOAT maxDet = 100000.f; - CGU_FLOAT Cs[3]; - // select max det for precision - for (j = 0; j < nDimensions; j++) - { - CGU_FLOAT Det = RGB2[j] * RGB2[(j + 1) % 3] - Crrl[j] * Crrl[j]; - Cs[j] = abs(Crrl[j] / sqrt(RGB2[j] * RGB2[(j + 1) % 3])); - if (maxDet < Det) - { - maxDet = Det; - i0 = j; - } - } - - // inverse correl matrix - // -- -- -- -- - // | A B | | C -B | - // | B C | => | -B A | - // -- -- -- -- - CGU_FLOAT mtrx1[2][2]; - CGU_FLOAT vc1[2]; - CGU_FLOAT vc[2]; - vc1[0] = Crrl[(i0 + 2) % 3]; - vc1[1] = Crrl[(i0 + 1) % 3]; - // C - mtrx1[0][0] = RGB2[(i0 + 1) % 3]; - // A - mtrx1[1][1] = RGB2[i0]; - // -B - mtrx1[1][0] = mtrx1[0][1] = -Crrl[i0]; - // find a solution - vc[0] = mtrx1[0][0] * vc1[0] + mtrx1[0][1] * vc1[1]; - vc[1] = mtrx1[1][0] * vc1[0] + mtrx1[1][1] * vc1[1]; - // normalize - vc[0] /= maxDet; - vc[1] /= maxDet; - // find a line direction vector - 
LineDir0[i0] = 1.; - LineDir0[(i0 + 1) % 3] = 1.; - LineDir0[(i0 + 2) % 3] = vc[0] + vc[1]; - } - - // normalize direction vector - CGU_FLOAT Len = LineDir0[0] * LineDir0[0] + LineDir0[1] * LineDir0[1] + LineDir0[2] * LineDir0[2]; - Len = sqrt(Len); - - for (j = 0; j < 3; j++) - LineDir0[j] = (Len > 0.f) ? LineDir0[j] / Len : 0.f; -} - -CMP_STATIC CGU_FLOAT cpu_RampSrchW(CGU_FLOAT Prj[BLOCK_SIZE_4X4], - CGU_FLOAT PrjErr[BLOCK_SIZE_4X4], - CGU_FLOAT PreMRep[BLOCK_SIZE_4X4], - CGU_FLOAT StepErr, - CGU_FLOAT lowPosStep, - CGU_FLOAT highPosStep, - int dwUniqueColors, - int dwNumPoints) -{ - CGU_FLOAT error = 0.0f; - CGU_FLOAT step = (highPosStep - lowPosStep) / (dwNumPoints - 1); - CGU_FLOAT step_h = step * 0.5f; - CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step; - CGU_INT i; - - for (i = 0; i < dwUniqueColors; i++) - { - // Work out which value in the block this select - CGU_FLOAT del = Prj[i] - lowPosStep; - - CGU_FLOAT v; - - if (del <= 0) - v = lowPosStep; - else if (Prj[i] - highPosStep >= 0) - v = highPosStep; - else - v = cmp_floor((del + step_h) * rstep) * step + lowPosStep; - - // And accumulate the error - CGU_FLOAT d = (Prj[i] - v); - d *= d; - CGU_FLOAT err = PreMRep[i] * d + PrjErr[i]; - error += err; - if (StepErr < error) - { - error = StepErr; - break; - } - } - return error; -} - -CMP_STATIC CGU_FLOAT _cpu_bc1ComputeBestEndpoints(CGU_FLOAT endpointsOut[NUM_ENDPOINTS], - CGU_FLOAT endpointsIn[NUM_ENDPOINTS], - CGU_FLOAT prj[BLOCK_SIZE_4X4], - CGU_FLOAT prjError[BLOCK_SIZE_4X4], - CGU_FLOAT preMRep[BLOCK_SIZE_4X4], - int numColours, - int numPoints) -{ - CGU_FLOAT minError = MAX_ERROR; - - static const CGU_FLOAT searchStep = 0.025f; - - const CGU_FLOAT lowStart = (endpointsIn[0] - 2.0f * searchStep > 0.0f) ? endpointsIn[0] - 2.0f * searchStep : 0.0f; - const CGU_FLOAT highStart = (endpointsIn[1] + 2.0f * searchStep < 1.0f) ? 
endpointsIn[1] + 2.0f * searchStep : 1.0f; - - CGU_FLOAT lowStep = lowStart; - CGU_FLOAT highStep = highStart; - - for (int low = 0; low < 8; ++low) - { - for (int high = 0; high < 8; ++high) - { - // compute an error for the current pair of end points. - CGU_FLOAT error = cpu_RampSrchW(prj, prjError, preMRep, minError, lowStep, highStep, numColours, numPoints); - - if (error < minError) - { - // save better result - minError = error; - endpointsOut[0] = lowStep; - endpointsOut[1] = highStep; - } - - highStep -= searchStep; - } - - lowStep += searchStep; - } - - return minError; -} - -// This is a float point-based compression -// it assumes that the number of unique colors is already known; input is in [0., 255.] range. -// This is C version. -CMP_STATIC bool cpu_CompressRGBBlockX(CMP_OUT CGU_FLOAT _RsltRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS], - CMP_IN CGU_FLOAT src_image[BLOCK_SIZE_4X4][NUM_CHANNELS], - CMP_IN CGU_FLOAT Rpt[BLOCK_SIZE_4X4], - CMP_IN int dwUniqueColors, - CMP_IN CGU_UINT8 dwNumPoints, - CMP_IN bool b3DRefinement, - CMP_IN CGU_UINT8 nRefinementSteps, - CMP_IN CGU_FLOAT pfWeights[3], - CMP_IN CGU_UINT8 nRedBits, - CMP_IN CGU_UINT8 nGreenBits, - CMP_IN CGU_UINT8 nBlueBits, - CMP_IN CGU_FLOAT fquality) -{ -#if !defined(ASPM_GPU) - if (!g_bc1FunctionPointersSet) - { - bc1ToggleSIMD(EXTENSION_COUNT); - } -#endif - - CGU_FLOAT ALIGN_16 Prj0[BLOCK_SIZE_4X4]; - CGU_FLOAT ALIGN_16 Prj[BLOCK_SIZE_4X4]; - CGU_FLOAT ALIGN_16 PrjErr[BLOCK_SIZE_4X4]; - CGU_FLOAT ALIGN_16 LineDir[NUM_CHANNELS]; - CGU_FLOAT ALIGN_16 RmpIndxs[BLOCK_SIZE_4X4]; - - CMP_UNUSED(fquality); - CMP_UNUSED(b3DRefinement) - - CGU_FLOAT LineDirG[NUM_CHANNELS]; - CGU_FLOAT PosG[NUM_ENDPOINTS]; - CGU_FLOAT BlkUV[BLOCK_SIZE_4X4][NUM_CHANNELS]; - CGU_FLOAT BlkSh[BLOCK_SIZE_4X4][NUM_CHANNELS]; - CGU_FLOAT LineDir0[NUM_CHANNELS]; - CGU_FLOAT Mdl[NUM_CHANNELS]; - - CGU_FLOAT rsltC[NUM_CHANNELS][NUM_ENDPOINTS]; - int i, j, k; - - // down to [0., 1.] 
- for (i = 0; i < dwUniqueColors; i++) - for (j = 0; j < 3; j++) - BlkUV[i][j] = src_image[i][j] / 255.f; - - bool isDONE = false; - - // as usual if not more then 2 different colors, we've done - if (dwUniqueColors <= 2) - { - for (j = 0; j < 3; j++) - { - rsltC[j][0] = src_image[0][j]; - rsltC[j][1] = src_image[dwUniqueColors - 1][j]; - } - isDONE = true; - } - - if (!isDONE) - { - // This is our first attempt to find an axis we will go along. - // The cumulation is done to find a line minimizing the MSE from the input 3D points. - CGU_UINT8 bSmall; - cpu_FindAxis(BlkSh, LineDir0, Mdl, bSmall, BlkUV, Rpt, 3, dwUniqueColors); - - // While trying to find the axis we found that the diameter of the input set is quite small. - // Do not bother. - if (bSmall) - { - for (j = 0; j < 3; j++) - { - rsltC[j][0] = src_image[0][j]; - rsltC[j][1] = src_image[dwUniqueColors - 1][j]; - } - isDONE = true; - } - } - - // GCC is being an awful being when it comes to goto-jumps. - // So please bear with this. - if (!isDONE) - { - CGU_FLOAT ErrG = 10000000.f; - CGU_FLOAT PrjBnd[NUM_ENDPOINTS]; - CGU_FLOAT ALIGN_16 PreMRep[BLOCK_SIZE_4X4]; - for (j = 0; j < 3; j++) - LineDir[j] = LineDir0[j]; - - // Here is the main loop. - // 1. Project input set on the axis in consideration. - // 2. Run 1 dimensional search (see scalar case) to find an (sub) optimal pair of end points. - // 3. Compute the vector of indexes (or clusters) for the current approximate ramp. - // 4. Present our color channels as 3 16DIM vectors. - // 5. Find closest approximation of each of 16DIM color vector with the projection of the 16DIM index vector. - // 6. Plug the projections as a new directional vector for the axis. - // 7. Goto 1. - // D - is 16 dim "index" vector (or 16 DIM vector of indexes - {0, 1/3, 2/3, 0, ...,}, but shifted and normalized). - // Ci - is a 16 dim vector of color i. 
- // for each Ci find a scalar Ai such that - // (Ai * D - Ci) (Ai * D - Ci) -> min , i.e distance between vector AiD and C is min. - // You can think of D as a unit interval(vector) "clusterizer", - // and Ai is a scale you need to apply to the clusterizer to - // approximate the Ci vector instead of the unit vector. - // Solution is - // Ai = (D . Ci) / (D . D); . - is a dot product. - // in 3 dim space Ai(s) represent a line direction, along which - // we again try to find (sub)optimal quantizer. - - // That's what our for(;;) loop is about. - for (;;) - { - // 1. Project input set on the axis in consideration. - // From Foley & Van Dam: Closest point of approach of a line (P + v) to a point (R) is - // P + ((R-P).v) / (v.v))v - // The distance along v is therefore (R-P).v / (v.v) - // (v.v) is 1 if v is a unit vector. - // - PrjBnd[0] = 1000.; - PrjBnd[1] = -1000.; - for (i = 0; i < BLOCK_SIZE_4X4; i++) - Prj0[i] = Prj[i] = PrjErr[i] = PreMRep[i] = 0.f; - - for (i = 0; i < dwUniqueColors; i++) - { - Prj0[i] = Prj[i] = BlkSh[i][0] * LineDir[0] + BlkSh[i][1] * LineDir[1] + BlkSh[i][2] * LineDir[2]; - - PrjErr[i] = (BlkSh[i][0] - LineDir[0] * Prj[i]) * (BlkSh[i][0] - LineDir[0] * Prj[i]) + - (BlkSh[i][1] - LineDir[1] * Prj[i]) * (BlkSh[i][1] - LineDir[1] * Prj[i]) + - (BlkSh[i][2] - LineDir[2] * Prj[i]) * (BlkSh[i][2] - LineDir[2] * Prj[i]); - - PrjBnd[0] = min(PrjBnd[0], Prj[i]); - PrjBnd[1] = max(PrjBnd[1], Prj[i]); - } - - // 2. Run 1 dimensional search (see scalar case) to find an (sub) optimal pair of end points. 
- - // min and max of the search interval - CGU_FLOAT stepf = 0.125f; - - CGU_FLOAT Scl[NUM_ENDPOINTS]; - Scl[0] = PrjBnd[0] - (PrjBnd[1] - PrjBnd[0]) * stepf; - Scl[1] = PrjBnd[1] + (PrjBnd[1] - PrjBnd[0]) * stepf; - - // No range found exit - if (Scl[0] == Scl[1]) - { - return false; - } - - // compute scaling factor to scale down the search interval to [0.,1] - const CGU_FLOAT Scl2 = (Scl[1] - Scl[0]) * (Scl[1] - Scl[0]); - const CGU_FLOAT overScl = 1.f / (Scl[1] - Scl[0]); - - for (i = 0; i < dwUniqueColors; i++) - { - // scale them - Prj[i] = (Prj[i] - Scl[0]) * overScl; - // premultiply the scale squire to plug into error computation later - PreMRep[i] = Rpt[i] * Scl2; - } - - // scale first approximation of end points - PrjBnd[0] = (PrjBnd[0] - Scl[0]) * overScl; - PrjBnd[1] = (PrjBnd[1] - Scl[0]) * overScl; - - // find the best endpoints - CGU_FLOAT Pos[NUM_ENDPOINTS]; -#if defined(ASPM_GPU) - CGU_FLOAT StepErr = _cpu_bc1ComputeBestEndpoints(Pos, PrjBnd, Prj, PrjErr, PreMRep, dwUniqueColors, dwNumPoints); -#else - CGU_FLOAT StepErr = cpu_bc1ComputeBestEndpoints(Pos, PrjBnd, Prj, PrjErr, PreMRep, dwUniqueColors, dwNumPoints); -#endif - - // inverse the scaling - Pos[0] = Pos[0] * (Scl[1] - Scl[0]) + Scl[0]; - Pos[1] = Pos[1] * (Scl[1] - Scl[0]) + Scl[0]; - - // did we find somthing better from the previous run? - if (StepErr + 0.001 < ErrG) - { - // yes, remember it - ErrG = StepErr; - LineDirG[0] = LineDir[0]; - LineDirG[1] = LineDir[1]; - LineDirG[2] = LineDir[2]; - PosG[0] = Pos[0]; - PosG[1] = Pos[1]; - // 3. Compute the vector of indexes (or clusters) for the current approximate ramp. 
- // indexes - const CGU_FLOAT step = (Pos[1] - Pos[0]) / (CGU_FLOAT)(dwNumPoints - 1); - const CGU_FLOAT step_h = step * (CGU_FLOAT)0.5; - const CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step; - const CGU_FLOAT overBlkTp = 1.f / (CGU_FLOAT)(dwNumPoints - 1); - - // here the index vector is computed, - // shifted and normalized - CGU_FLOAT indxAvrg = (CGU_FLOAT)(dwNumPoints - 1) / 2.f; - - for (i = 0; i < dwUniqueColors; i++) - { - CGU_FLOAT del; - //int n = (int)((b - _min_ex + (step*0.5f)) * rstep); - if ((del = Prj0[i] - Pos[0]) <= 0) - RmpIndxs[i] = 0.f; - else if (Prj0[i] - Pos[1] >= 0) - RmpIndxs[i] = (CGU_FLOAT)(dwNumPoints - 1); - else - RmpIndxs[i] = cmp_floor((del + step_h) * rstep); - // shift and normalization - RmpIndxs[i] = (RmpIndxs[i] - indxAvrg) * overBlkTp; - } - - // 4. Present our color channels as 3 16DIM vectors. - // 5. Find closest aproximation of each of 16DIM color vector with the pojection of the 16DIM index vector. - CGU_FLOAT Crs[3], Len, Len2; - for (i = 0, Crs[0] = Crs[1] = Crs[2] = Len = 0.f; i < dwUniqueColors; i++) - { - const CGU_FLOAT PreMlt = RmpIndxs[i] * Rpt[i]; - Len += RmpIndxs[i] * PreMlt; - for (j = 0; j < 3; j++) - Crs[j] += BlkSh[i][j] * PreMlt; - } - - LineDir[0] = LineDir[1] = LineDir[2] = 0.f; - if (Len > 0.f) - { - LineDir[0] = Crs[0] / Len; - LineDir[1] = Crs[1] / Len; - LineDir[2] = Crs[2] / Len; - - // 6. Plug the projections as a new directional vector for the axis. - // 7. Goto 1. - Len2 = LineDir[0] * LineDir[0] + LineDir[1] * LineDir[1] + LineDir[2] * LineDir[2]; - Len2 = sqrt(Len2); - - LineDir[0] /= Len2; - LineDir[1] /= Len2; - LineDir[2] /= Len2; - } - } - else // We was not able to find anything better. Drop dead. - break; - } - - // inverse transform to find end-points of 3-color ramp - for (k = 0; k < 2; k++) - for (j = 0; j < 3; j++) - rsltC[j][k] = (PosG[k] * LineDirG[j] + Mdl[j]) * 255.f; - } - - // We've dealt with (almost) unrestricted full precision realm. - // Now back to the dirty digital world. 
- - // round the end points to make them look like compressed ones - CGU_FLOAT inpRmpEndPts[NUM_CHANNELS][NUM_ENDPOINTS]; - cpu_MkRmpOnGrid(inpRmpEndPts, rsltC, 0.f, 255.f, nRedBits, nGreenBits, nBlueBits); - - // Try using this on 3 channels - // static CGU_Vec2i cmp_getLinearEndPoints(CGU_FLOAT _Blk[BLOCK_SIZE_4X4], CMP_IN CGU_FLOAT fquality, CMP_IN CGU_BOOL isSigned); - - // This not a small procedure squeezes and stretches the ramp along each axis (R,G,B) separately while other 2 are fixed. - // It does it only over coarse grid - 565 that is. It tries to squeeze more precision for the real world ramp. -#if defined(USE_REFINE) || defined(USE_REFINE3D) - switch (nRefinementSteps) - { - case 1: - cmp_Refine(_RsltRmpPnts, inpRmpEndPts, src_image, Rpt, dwUniqueColors, dwNumPoints, pfWeights, nRedBits, nGreenBits, nBlueBits, 3); - break; - case 2: - if (dwUniqueColors > 2) - cmp_Refine3D(_RsltRmpPnts, inpRmpEndPts, src_image, Rpt, dwUniqueColors, dwNumPoints, pfWeights, nRedBits, nGreenBits, nBlueBits, 1); - else - cmp_Refine(_RsltRmpPnts, inpRmpEndPts, src_image, Rpt, dwUniqueColors, dwNumPoints, pfWeights, nRedBits, nGreenBits, nBlueBits, 3); - break; - default: - cmp_Refine(_RsltRmpPnts, inpRmpEndPts, src_image, Rpt, dwUniqueColors, dwNumPoints, pfWeights, nRedBits, nGreenBits, nBlueBits, 1); - break; - } -#endif - return true; -} - -// CPU: CompRGBBlock() -CMP_STATIC CGU_FLOAT cpu_CompRGBBlock32(CGU_UINT32 block_32[16], - CGU_UINT32 compressedBlock[2], - CGU_UINT32 dwBlockSize, - CGU_UINT8 nRedBits, - CGU_UINT8 nGreenBits, - CGU_UINT8 nBlueBits, - CGU_UINT8 nEndpoints[3][NUM_ENDPOINTS], - CGU_UINT8 pcIndices[BLOCK_SIZE_4X4], - CGU_UINT8 dwNumPoints, - bool b3DRefinement, - CGU_UINT8 m_nRefinementSteps, - CGU_FLOAT _pfChannelWeights[3], - bool _bUseAlpha, - CGU_UINT8 _nAlphaThreshold) -{ - CGU_FLOAT ALIGN_16 Rpt[BLOCK_SIZE_4X4]; - CGU_FLOAT ALIGN_16 BlkIn[BLOCK_SIZE_4X4][NUM_CHANNELS]; - CGU_UINT32 mx; - for (mx = 0; mx < BLOCK_SIZE_4X4; mx++) - { - Rpt[mx] = 0; - 
BlkIn[mx][0] = 0; - BlkIn[mx][1] = 0; - BlkIn[mx][2] = 0; - BlkIn[mx][3] = 0; - } - - compressedBlock[0] = 0; - - CGU_UINT32 dwAlphaThreshold = _nAlphaThreshold << 24; - CGU_UINT32 dwColors = 0; - CGU_UINT32 dwBlk[BLOCK_SIZE]; - for (CGU_UINT32 i = 0; i < dwBlockSize; i++) - if (!_bUseAlpha || (block_32[i] & 0xff000000) >= dwAlphaThreshold) - dwBlk[dwColors++] = block_32[i] | 0xff000000; - - // Do we have any colors ? - static int id = 0; - if (dwColors) - { - bool bHasAlpha = (dwColors != dwBlockSize); - if (bHasAlpha && _bUseAlpha && !(dwNumPoints & 0x1)) - return CMP_FLT_MAX; - - // Here we are computing an unique number of colors. - // For each unique value we compute the number of it appearences. - //qsort((void *)dwBlk, (size_t)dwColors, sizeof(CGU_UINT32), QSortIntCmp); -#ifndef ASPM_GPU // this is here for reminder when code moves to GPU - std::sort(dwBlk, dwBlk + 15); -#else - { - CGU_UINT32 j; - CMP_di what[BLOCK_SIZE_4X4]; - - for (i = 0; i < dwColors; i++) - { - what[i].index = i; - what[i].data = dwBlk[i]; - } - - CGU_UINT32 tmp_index; - CGU_UINT32 tmp_data; - - for (i = 1; i < dwColors; i++) - { - for (j = i; j > 0; j--) - { - if (what[j - 1].data > what[j].data) - { - tmp_index = what[j].index; - tmp_data = what[j].data; - what[j].index = what[j - 1].index; - what[j].data = what[j - 1].data; - what[j - 1].index = tmp_index; - what[j - 1].data = tmp_data; - } - } - } - for (i = 0; i < dwColors; i++) - dwBlk[i] = what[i].data; - } -#endif - - CGU_UINT32 new_p; - CGU_UINT32 dwBlkU[BLOCK_SIZE_4X4]; - CGU_UINT32 dwUniqueColors = 0; - new_p = dwBlkU[0] = dwBlk[0]; - Rpt[dwUniqueColors] = 1.f; - CGU_UINT32 i; - for (i = 1; i < dwColors; i++) - { - if (new_p != dwBlk[i]) - { - dwUniqueColors++; - new_p = dwBlkU[dwUniqueColors] = dwBlk[i]; - Rpt[dwUniqueColors] = 1.f; - } - else - Rpt[dwUniqueColors] += 1.f; - } - dwUniqueColors++; - - // switch to float - for (i = 0; i < dwUniqueColors; i++) - { - BlkIn[i][RC] = (CGU_FLOAT)((dwBlkU[i] >> 16) & 0xff); // R - 
BlkIn[i][GC] = (CGU_FLOAT)((dwBlkU[i] >> 8) & 0xff); // G - BlkIn[i][BC] = (CGU_FLOAT)((dwBlkU[i] >> 0) & 0xff); // B - BlkIn[i][AC] = 255.0f; - } - - CGU_FLOAT rsltC[NUM_CHANNELS][NUM_ENDPOINTS]; - if (cpu_CompressRGBBlockX(rsltC, // CMP_EndPoints = CompressRGBBlock_Slow2 ( - BlkIn, // CGU_Vec3f src_imageNorm[BLOCK_SIZE_4X4] - Rpt, // CGU_FLOAT Rpt[BLOCK_SIZE_4X4], - dwUniqueColors, // CGU_UINT32 dwUniqueColors, - dwNumPoints, // CGU_UINT32 dwNumPoints, - b3DRefinement, // - m_nRefinementSteps, // CGU_UINT32 m_nRefinementSteps, - _pfChannelWeights, // CGU_Vec3f channelWeightsBGR, - nRedBits, // ); - nGreenBits, - nBlueBits, - 1.0f)) - { - // return to integer realm - for (int ch = 0; ch < 3; ch++) - for (int j = 0; j < 2; j++) - nEndpoints[ch][j] = (CGU_UINT8)rsltC[ch][j]; - //printf("Endpoints {%3d,%3d,%3d} {%3d,%3d,%3d} ", nEndpoints[0][0],nEndpoints[1][0],nEndpoints[2][0], - // nEndpoints[0][1],nEndpoints[1][1],nEndpoints[2][1]); - - // Now get the indices using the new end points - return cpu_Clstr( - block_32, dwBlockSize, nEndpoints, pcIndices, dwNumPoints, _pfChannelWeights, _bUseAlpha, _nAlphaThreshold, nRedBits, nGreenBits, nBlueBits); - } - else - { - CGU_FLOAT CompErr = CMP_FLT_MAX; - if (dwNumPoints < 4) - { - CGU_Vec3f src_imageNorm[BLOCK_SIZE_4X4]; - - for (CGU_UINT32 px = 0; px < 16; px++) - { - src_imageNorm[px].r = (CGU_FLOAT)((block_32[px] >> 16) & 0xff) / 255.0f; - src_imageNorm[px].g = (CGU_FLOAT)((block_32[px] >> 8) & 0xff) / 255.0f; - src_imageNorm[px].b = (CGU_FLOAT)((block_32[px] >> 0) & 0xff) / 255.0f; - } - - // Do a quick compression test - CGU_Vec3f srcRGB[16]; // The list of source colors with blue channel altered - CGU_Vec3f average_rgb; // The centrepoint of the axis - CGU_FLOAT errLQ = CMP_FLT_MAX; - cgu_CompressRGBBlock_MinMax(src_imageNorm, 1.0f, false, srcRGB, average_rgb, errLQ); - CGU_Vec2ui cmp = cgu_CompressRGBBlock_Fast(src_imageNorm, 1.0f, false, srcRGB, average_rgb, CompErr); - - compressedBlock[0] = cmp.x; - 
compressedBlock[1] = cmp.y; - } - return CompErr; - } - } - else - { - // All colors transparent - nEndpoints[0][0] = nEndpoints[1][0] = nEndpoints[2][0] = 0; - nEndpoints[0][1] = nEndpoints[1][1] = nEndpoints[2][1] = 0xff; - for (CGU_UINT32 ms = 0; ms < dwBlockSize; ms++) - pcIndices[ms] = 0xff; - return 0.0; - } -} - -CMP_STATIC CGU_Vec2ui cpu_CompRGBBlock(CMP_IN CGU_Vec4uc bgraBlock[BLOCK_SIZE_4X4], CMP_IN CMP_BC15Options BC15Options, CMP_INOUT CGU_FLOAT CMP_REFINOUT err) -{ - CGU_Vec2ui cmpBlock = {0U, 0U}; - CGU_FLOAT pfChannelWeights[3] = {1.0f, 1.0f, 1.0f}; - CGU_UINT8 nEndpoints[2][3][2]; - CGU_UINT8 nIndices[2][BLOCK_SIZE_4X4]; - CGU_UINT32 compressedBlock[2] = {0, 0}; - - CGU_FLOAT fError3 = CMP_FLT_MAX; - - fError3 = cpu_CompRGBBlock32((CGU_UINT32*)bgraBlock, - compressedBlock, - BLOCK_SIZE_4X4, - RG, - GG, - BG, - nEndpoints[0], - nIndices[0], - 3, - BC15Options.m_b3DRefinement, - BC15Options.m_nRefinementSteps, - pfChannelWeights, - BC15Options.m_bUseAlpha, - BC15Options.m_nAlphaThreshold); - // use case of small min max ranges - if (compressedBlock[0] > 0) - { - //return cmpBlockBlue; - cmpBlock.x = compressedBlock[0]; - cmpBlock.y = compressedBlock[1]; - err = fError3; - } - else - { - CGU_FLOAT fError4 = CMP_FLT_MAX; - fError4 = (fError3 == 0.0) ? 
CMP_FLT_MAX - : cpu_CompRGBBlock32((CGU_UINT32*)bgraBlock, - compressedBlock, - BLOCK_SIZE_4X4, - RG, - GG, - BG, - nEndpoints[1], - nIndices[1], - 4, - BC15Options.m_b3DRefinement, - BC15Options.m_nRefinementSteps, - pfChannelWeights, - BC15Options.m_bUseAlpha, - BC15Options.m_nAlphaThreshold); - - CGU_UINT32 nMethod; - if (fError3 <= fError4) - { - err = fError3; - nMethod = 0; - } - else - { - err = fError4; - nMethod = 1; - } - - CGU_UINT32 c0 = - BC1ConstructColour((nEndpoints[nMethod][RC][0] >> (8 - RG)), (nEndpoints[nMethod][GC][0] >> (8 - GG)), (nEndpoints[nMethod][BC][0] >> (8 - BG))); - CGU_UINT32 c1 = - BC1ConstructColour((nEndpoints[nMethod][RC][1] >> (8 - RG)), (nEndpoints[nMethod][GC][1] >> (8 - GG)), (nEndpoints[nMethod][BC][1] >> (8 - BG))); - if (nMethod == 1 && c0 <= c1 || nMethod == 0 && c0 > c1) - compressedBlock[0] = c1 | (c0 << 16); - else - compressedBlock[0] = c0 | (c1 << 16); - - compressedBlock[1] = 0; - for (CGU_UINT32 i = 0; i < 16; i++) - compressedBlock[1] |= (nIndices[nMethod][i] << (2 * i)); - - cmpBlock.x = compressedBlock[0]; - cmpBlock.y = compressedBlock[1]; - } - - return cmpBlock; -} - -#endif - -#ifdef ENABLE_NEW_CODE - -//---------------------------------------- Common Utility Code ------------------------------------------------------- -// 1 - Dim error -CMP_STATIC CGU_FLOAT cgu_RampSrchW(CGU_FLOAT Prj[BLOCK_SIZE_4X4], - CGU_FLOAT PrjErr[BLOCK_SIZE_4X4], - CGU_FLOAT PreMRep[BLOCK_SIZE_4X4], - CGU_FLOAT StepErr, - CGU_FLOAT lowPosStep, - CGU_FLOAT highPosStep, - CGU_UINT32 dwUniqueColors, - CGU_UINT32 dwNumPoints) -{ - CGU_FLOAT error = 0; - CGU_FLOAT step = (highPosStep - lowPosStep) / (dwNumPoints - 1); - CGU_FLOAT step_h = step * (CGU_FLOAT)0.5; - CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step; - - for (CGU_UINT32 i = 0; i < dwUniqueColors; i++) - { - CGU_FLOAT v; - // Work out which value in the block this select - CGU_FLOAT del; - - if ((del = Prj[i] - lowPosStep) <= 0) - v = lowPosStep; - else if (Prj[i] - highPosStep >= 0) - 
v = highPosStep; - else - v = cmp_floor((del + step_h) * rstep) * step + lowPosStep; - - // And accumulate the error - CGU_FLOAT d = (Prj[i] - v); - d *= d; - CGU_FLOAT err = PreMRep[i] * d + PrjErr[i]; - error += err; - if (StepErr < error) - { - error = StepErr; - break; - } - } - return error; -} - -CMP_STATIC CGU_UINT32 cgu_processCluster(CMP_IN CMP_EndPoints EndPoints, - CMP_IN CGU_Vec4f rgbBlock_normal[BLOCK_SIZE_4X4], - CMP_IN CGU_UINT32 dwAlphaThreshold, - CMP_IN CGU_Vec3f channelWeights, - CMP_IN CGU_UINT8 indices[BLOCK_SIZE_4X4], - CMP_OUT CGU_FLOAT CMP_REFINOUT Err) -{ - Err = 0.f; - CGU_UINT32 pcIndices = 0; - CGU_UINT32 R, G, B; - - R = (CGU_UINT32)(EndPoints.Color0.z); - G = (CGU_UINT32)(EndPoints.Color0.y); - B = (CGU_UINT32)(EndPoints.Color0.x); - CGU_INT32 cluster0 = cmp_constructColor(R, G, B); - - R = (CGU_UINT32)(EndPoints.Color1.z); - G = (CGU_UINT32)(EndPoints.Color1.y); - B = (CGU_UINT32)(EndPoints.Color1.x); - CGU_INT32 cluster1 = cmp_constructColor(R, G, B); - - CGU_Vec3f InpRmp[NUM_ENDPOINTS]; - if ((cluster0 <= cluster1) // valid for 4 channels - // || (cluster0 > cluster1) // valid for 3 channels - ) - { - // inverse endpoints - InpRmp[0] = EndPoints.Color1; - InpRmp[1] = EndPoints.Color0; - } - else - { - InpRmp[0] = EndPoints.Color0; - InpRmp[1] = EndPoints.Color1; - } - - CGU_Vec3f srcblockLinear[BLOCK_SIZE_4X4]; - CGU_FLOAT srcblockA[BLOCK_SIZE_4X4]; - - // Swizzle the source RGB to BGR for processing - for (CGU_UINT32 i = 0; i < BLOCK_SIZE_4X4; i++) - { - srcblockLinear[i].z = rgbBlock_normal[i].x * 255.0f; - srcblockLinear[i].y = rgbBlock_normal[i].y * 255.0f; - srcblockLinear[i].x = rgbBlock_normal[i].z * 255.0f; - srcblockA[i] = 0.0f; - //if (dwAlphaThreshold > 0) - //{ - // CGU_UINT32 alpha = (CGU_UINT32)BlockA[i]; - // if (alpha >= dwAlphaThreshold) - // srcblockA[i] = BlockA[i]; - //} - } - - // cmp_ClstrBas2() - // input ramp is on the coarse grid - // make ramp endpoints the way they'll going to be decompressed - CGU_Vec3f 
InpRmpL[NUM_ENDPOINTS]; - CGU_Vec3f Fctrs = {32.0F, 64.0F, 32.0F}; // 1 << RG,1 << GG,1 << BG - - { - // ConstantRamp = MkWkRmpPts(InpRmpL, InpRmp); - InpRmpL[0] = InpRmp[0] + cmp_floorVec3f(InpRmp[0] / Fctrs); - InpRmpL[0] = cmp_clampVec3f(InpRmpL[0], 0.0f, 255.0f); - InpRmpL[1] = InpRmp[1] + cmp_floorVec3f(InpRmp[1] / Fctrs); - InpRmpL[1] = cmp_clampVec3f(InpRmpL[1], 0.0f, 255.0f); - } // MkWkRmpPts - - // build ramp - CGU_Vec3f LerpRmp[4]; - CGU_Vec3f offset = {1.0f, 1.0f, 1.0f}; - { - //BldRmp(Rmp, InpRmpL, dwNumChannels); - // linear interpolate end points to get the ramp - LerpRmp[0] = InpRmpL[0]; - LerpRmp[3] = InpRmpL[1]; - LerpRmp[1] = cmp_floorVec3f((InpRmpL[0] * 2.0f + LerpRmp[3] + offset) / 3.0f); - LerpRmp[2] = cmp_floorVec3f((InpRmpL[0] + LerpRmp[3] * 2.0f + offset) / 3.0f); - } // BldRmp - - //========================================================================= - // Clusterize, Compute error and find DXTC indexes for the current cluster - //========================================================================= - { - // Clusterize - CGU_UINT32 alpha; - - // For each colour in the original block assign it - // to the closest cluster and compute the cumulative error - for (CGU_UINT32 i = 0; i < BLOCK_SIZE_4X4; i++) - { - alpha = (CGU_UINT32)srcblockA[i]; - if ((dwAlphaThreshold > 0) && alpha == 0) - { //*((CGU_UINT32 *)&_Blk[i][AC]) == 0) - pcIndices |= cmp_set2Bit32(4, i); // dwNumChannels 3 or 4 (default is 4) - indices[i] = 4; - } - else - { - CGU_FLOAT shortest = 99999999999.f; - CGU_UINT8 shortestIndex = 0; - - CGU_Vec3f channelWeightsBGR; - channelWeightsBGR.x = channelWeights.z; - channelWeightsBGR.y = channelWeights.y; - channelWeightsBGR.z = channelWeights.x; - - for (CGU_UINT8 rampindex = 0; rampindex < 4; rampindex++) - { - // r is either 1 or 4 - // calculate the distance for each component - CGU_FLOAT distance = cmp_dotVec3f(((srcblockLinear[i] - LerpRmp[rampindex]) * channelWeightsBGR), - ((srcblockLinear[i] - LerpRmp[rampindex]) * 
channelWeightsBGR)); - if (distance < shortest) - { - shortest = distance; - shortestIndex = rampindex; - } - } - - Err += shortest; - - // The total is a sum of (error += shortest) - // We have the index of the best cluster, so assign this in the block - // Reorder indices to match correct DXTC ordering - if (shortestIndex == 3) // dwNumChannels - 1 - shortestIndex = 1; - else if (shortestIndex) - shortestIndex++; - pcIndices |= cmp_set2Bit32(shortestIndex, i); - indices[i] = shortestIndex; - } - } // BLOCK_SIZE_4X4 - } // Clusterize - - return pcIndices; -} -#endif - -// Process a rgbBlock which is normalized (0.0f ... 1.0f), signed normal is not implemented -CMP_STATIC CGU_Vec2ui CompressBlockBC1_NORMALIZED(CMP_IN CGU_Vec4f src_imageNorm[BLOCK_SIZE_4X4], CMP_IN CMP_BC15Options BC15Options) -{ - bool usingMaxQualityOnly = false; - -#ifndef ASPM_GPU - if (BC15Options.m_fquality > 0.75) - usingMaxQualityOnly = true; -#endif - - CGU_FLOAT CompErr = CMP_FLT_MAX; - CGU_Vec2ui cmpBlock = {0U, 0U}; - CGU_Vec2ui cmpBlockTemp = {0U, 0U}; - CGU_FLOAT CompErrTemp; - - // Transfer to RGB Norm from RGBA Norm - CGU_Vec3f src_imageRGBNorm[16]; - CGU_Vec4uc pixels[16]; - CGU_Vec4uc pixelsBGRA[16]; - - for (CGU_UINT32 sr = 0; sr < 16; sr++) - { - src_imageRGBNorm[sr] = src_imageNorm[sr].rgb; - pixelsBGRA[sr].b = pixels[sr].r = src_imageNorm[sr].r * 255.0f; - pixelsBGRA[sr].g = pixels[sr].g = src_imageNorm[sr].g * 255.0f; - pixelsBGRA[sr].r = pixels[sr].b = src_imageNorm[sr].b * 255.0f; - pixelsBGRA[sr].a = pixels[sr].a = src_imageNorm[sr].a * 255.0f; - } - - // check for a punch through transparent alpha setting - if ((BC15Options.m_fquality < 0.75) && (BC15Options.m_bUseAlpha)) - { - CGU_Vec2ui cmpBlockAlpha = {0xffff0000, 0xffffffffU}; - for (CGU_UINT32 sr = 0; sr < 16; sr++) - if (pixels[sr].a < BC15Options.m_nAlphaThreshold) - { - return cmpBlockAlpha; - } - } - - //================ - // extern codec - //================ - // For debugging - // CGU_Vec2ui cmpBlockRed = 
{0xF800F800,0x00000000}; - // CGU_Vec2ui cmpBlockGreen = {0x7E007E00,0x00000000}; - // CGU_Vec2ui cmpBlockBlue = {0x1F001F00,0x00000000}; - - if (!BC15Options.m_bUseAlpha) - { - //========================================== - // Gain +0.3 dB for images with soild blocks - //========================================== - bool bAllColoursEqual = true; - - // Load the whole 4x4 block - for (CGU_UINT32 i = 0u; (i < 16u) && bAllColoursEqual; ++i) - { - for (CGU_INT c = 0; c < 3; c++) - bAllColoursEqual = bAllColoursEqual && (pixels[0][c] == pixels[i][c]); - } - - if (bAllColoursEqual) - { - cmpBlock = cgu_solidColorBlock(pixels[0].x, pixels[0].y, pixels[0].z); - CompErr = cgu_RGBABlockErrorLinear(pixels, cmpBlock); - if (BC15Options.m_nRefinementSteps < 1) - return cmpBlock; - } - } - - if (!usingMaxQualityOnly) - { - //==================================== - // Get src image data, min,max... - //===================================== - //CMP_EncodeData edata; - //cmp_get_encode_data(edata,pixels); - - if (!BC15Options.m_bUseAlpha) - { - //==================================== - // Fast Compression, low quality - //===================================== - CGU_Vec3f srcRGB[16]; // The list of source colors with blue channel altered - CGU_Vec3f average_rgb; // The centrepoint of the axis - CGU_FLOAT errLQ = CMP_FLT_MAX; - cmpBlockTemp = cgu_CompressRGBBlock_MinMax(src_imageRGBNorm, BC15Options.m_fquality, BC15Options.m_bIsSRGB, srcRGB, average_rgb, errLQ); - if ((BC15Options.m_fquality < CMP_QUALITY0) || (errLQ == 0.0f)) - return cmpBlockTemp; - - if (CompErr > errLQ) - { - CompErr = errLQ; - cmpBlock = cmpBlockTemp; - } - - cmpBlockTemp = cgu_CompressRGBBlock_Fast(src_imageRGBNorm, BC15Options.m_fquality, BC15Options.m_bIsSRGB, srcRGB, average_rgb, errLQ); - if (CompErr > errLQ) - { - CompErr = errLQ; - cmpBlock = cmpBlockTemp; - } - if (BC15Options.m_fquality < CMP_QUALITY1) - return cmpBlock; - } - - //======================================== - // use GPU codec lower quality 
then CPU - //======================================== - cmpBlockTemp = cgu_CompRGBBlock(src_imageNorm, BC15Options); - CompErrTemp = cgu_RGBABlockErrorLinear(pixels, cmpBlockTemp); - if (CompErr > CompErrTemp) - { - CompErr = CompErrTemp; - cmpBlock = cmpBlockTemp; - } - - if (BC15Options.m_fquality < CMP_QUALITY2) - return cmpBlock; - } // if useCGUCodecs - - //==================================== - // High Quality Codec CPU only - //===================================== -#ifndef ASPM_GPU - cmpBlockTemp = cpu_CompRGBBlock(pixelsBGRA, BC15Options, CompErrTemp); - - CompErrTemp = cgu_RGBABlockErrorLinear(pixels, cmpBlockTemp); - - if (CompErr > CompErrTemp) - { - CompErr = CompErrTemp; - cmpBlock = cmpBlockTemp; - } -#endif - - return cmpBlock; -} +//===================================================================== +// Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +// File: bc1_cmp.h +//-------------------------------------------------------------------------------------- +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +//-------------------------------------------------------------------------------------- + +#define USE_CMP + +#include "common_def.h" +#include "bcn_common_kernel.h" +#include "bcn_common_api.h" + +#ifndef ASPM_GPU +#include "cpu_extensions.h" +#include "core_simd.h" +#endif + +//----------------------------------------------------------------------- +// When build is for CPU, we have some missing API calls common to GPU +// Use CPU CMP_Core replacements +//----------------------------------------------------------------------- +#if defined(ASPM_GPU) || defined(ASPM_HLSL) || defined(ASPM_OPENCL) +#define ALIGN_16 +#define ALIGN_32 +#define ALIGN_64 +#else +#include INC_cmp_math_func +#if defined(_WIN32) || defined(_WIN64) +#define ALIGN_16 __declspec(align(16)) +#define ALIGN_32 __declspec(align(32)) +#define ALIGN_64 __declspec(align(64)) +#else // !WIN32 && !_WIN64 +#define ALIGN_16 __attribute__((aligned(16))) +#define ALIGN_32 __attribute__((aligned(32))) +#define ALIGN_64 __attribute__((aligned(64))) +#endif // !WIN32 && !_WIN64 +#endif + +#define USE_REFINE3D +#define USE_REFINE + +#ifndef MAX_ERROR +#define MAX_ERROR 128000.f +#endif + +#define NUM_CHANNELS 4 +#define NUM_ENDPOINTS 2 + +#ifndef CMP_QUALITY0 +#define CMP_QUALITY0 0.25f +#endif + +#ifndef CMP_QUALITY1 
+#define CMP_QUALITY1 0.50f
+#endif
+
+#ifndef CMP_QUALITY2
+#define CMP_QUALITY2 0.75f
+#endif
+
+#define EPS (2.f / 255.f) * (2.f / 255.f)
+#define EPS2 3.f * (2.f / 255.f) * (2.f / 255.f)
+
+// Disable SIMD code during GPU builds
+#if !defined(ASPM_GPU)
+CMP_STATIC CGU_BOOL g_bc1FunctionPointersSet = false;
+
+// declarations for SIMD function variations
+CMP_STATIC CGU_FLOAT _cpu_bc1ComputeBestEndpoints(CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, int, int);
+
+// function pointers
+CMP_STATIC CGU_FLOAT (*cpu_bc1ComputeBestEndpoints)(CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, int, int) = 0;
+
+// Toggle which SIMD instruction set extensions to use. Setting this to EXTENSION_COUNT will enable auto-detection of supported extensions.
+// NOTE: The requested extension will only be enabled if it is supported by the current CPU.
+//
+// Side effects: assigns cpu_bc1ComputeBestEndpoints (falling back to the scalar _cpu_bc1ComputeBestEndpoints when no
+// selected extension is usable) and sets g_bc1FunctionPointersSet.
+// Returns false only when newExtension is not EXTENSION_COUNT and a selected extension is not available on this CPU;
+// auto-detection (EXTENSION_COUNT) always returns true because falling back is its expected outcome.
+CMP_STATIC bool bc1ToggleSIMD(CGU_INT newExtension)
+{
+    CPUExtensions extensions = GetCPUExtensions();
+
+    // Metallicafan212: Don't evaluate on non-X86 platforms
+#if AMD_COMPRESSONATOR_AMD64 || AMD_COMPRESSONATOR_X86
+    CGU_BOOL useAVX512 = true;
+    CGU_BOOL useAVX2 = true;
+    CGU_BOOL useSSE42 = true;
+
+    if (newExtension < EXTENSION_COUNT)  // user requested a specific instruction set extension
+    {
+        useAVX512 = newExtension == EXTENSION_AVX512_F;
+        useAVX2 = newExtension == EXTENSION_AVX2;
+        useSSE42 = newExtension == EXTENSION_SSE42;
+    }
+
+    if (useAVX512 && IsAvailableAVX512(extensions))
+    {
+        cpu_bc1ComputeBestEndpoints = avx512_bc1ComputeBestEndpoints;
+    }
+    else if (useAVX2 && IsAvailableAVX2(extensions))
+    {
+        cpu_bc1ComputeBestEndpoints = avx_bc1ComputeBestEndpoints;
+    }
+    else if (useSSE42 && IsAvailableSSE4(extensions))
+    {
+        cpu_bc1ComputeBestEndpoints = sse_bc1ComputeBestEndpoints;
+    }
+    else
+    {
+        cpu_bc1ComputeBestEndpoints = _cpu_bc1ComputeBestEndpoints;
+    }
+#else
+    CGU_BOOL useAVX512 = false;
+    CGU_BOOL useAVX2 = false;
+    CGU_BOOL useSSE42 = false;
+    cpu_bc1ComputeBestEndpoints = _cpu_bc1ComputeBestEndpoints;
+#endif
+
+    g_bc1FunctionPointersSet = true;
+
+    bool result = true;
+
+    // The availability terms must be grouped: '&&' binds tighter than '||', so without the grouping the
+    // "specific extension requested" guard would apply to the AVX-512 term only and auto-detection would
+    // report failure on any CPU lacking AVX2 or SSE4.2.
+    const bool selectedUnavailable = (useAVX512 && !IsAvailableAVX512(extensions)) || (useAVX2 && !IsAvailableAVX2(extensions)) ||
+                                     (useSSE42 && !IsAvailableSSE4(extensions));
+
+    if (newExtension != EXTENSION_COUNT && selectedUnavailable)
+        result = false;
+
+    return result;
+}
+#endif
+
+// Accumulates, over the first dwUniqueColors entries, PreMRep[i] * (Prj[i] - v)^2 + PrjErr[i], where v is Prj[i]
+// snapped to one of the 4 evenly spaced ramp positions between lowPosStep and highPosStep (clamped at both ends).
+// Stops early and returns StepErr as soon as the running total exceeds StepErr.
+static CGU_FLOAT cgu_getRampErr(CGU_FLOAT  Prj[BLOCK_SIZE_4X4],
+                                CGU_FLOAT  PrjErr[BLOCK_SIZE_4X4],
+                                CGU_FLOAT  PreMRep[BLOCK_SIZE_4X4],
+                                CGU_FLOAT  StepErr,
+                                CGU_FLOAT  lowPosStep,
+                                CGU_FLOAT  highPosStep,
+                                CGU_UINT32 dwUniqueColors)
+{
+    CGU_FLOAT error = 0;
+    CGU_FLOAT step = (highPosStep - lowPosStep) / 3;  // using (dwNumChannels=4 - 1);
+    CGU_FLOAT step_h = step * (CGU_FLOAT)0.5;
+    CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step;
+
+    for (CGU_UINT32 i = 0; i < dwUniqueColors; i++)
+    {
+        CGU_FLOAT v;
+        // Work out which value in the block this select
+        CGU_FLOAT del;
+
+        if ((del = Prj[i] - lowPosStep) <= 0)
+            v = lowPosStep;
+        else if (Prj[i] - highPosStep >= 0)
+            v = highPosStep;
+        else
+            v = cmp_floor((del + step_h) * rstep) * step + lowPosStep;
+
+        // And accumulate the error
+        CGU_FLOAT d = (Prj[i] - v);
+        d *= d;
+        CGU_FLOAT err = PreMRep[i] * d + PrjErr[i];
+        error += err;
+        if (StepErr < error)
+        {
+            error = StepErr;
+            break;
+        }
+    }
+    return error;
+}
+
+CMP_STATIC CMP_EndPoints cgu_CompressRGBBlockX(CMP_IN CGU_Vec3f BlkInBGRf_UV[BLOCK_SIZE_4X4],
+                                               CMP_IN CGU_FLOAT Rpt[BLOCK_SIZE_4X4],
+                                               CMP_IN CGU_UINT32 dwUniqueColors,
+                                               CMP_IN CGU_Vec3f channelWeightsBGR,
+                                               CMP_IN CGU_BOOL b3DRefinement)
+{
+    CMP_UNUSED(channelWeightsBGR);
+    CMP_UNUSED(b3DRefinement);
+    CGU_FLOAT ALIGN_16 Prj0[BLOCK_SIZE_4X4];
+    CGU_FLOAT ALIGN_16 Prj[BLOCK_SIZE_4X4];
+    CGU_FLOAT ALIGN_16 PrjErr[BLOCK_SIZE_4X4];
+    CGU_FLOAT ALIGN_16 RmpIndxs[BLOCK_SIZE_4X4];
+
+    CGU_Vec3f LineDirG;
+    CGU_Vec3f LineDir;
+    CGU_FLOAT LineDir0[NUM_CHANNELS];
+    CGU_Vec3f BlkUV[BLOCK_SIZE_4X4];
+    CGU_Vec3f BlkSh[BLOCK_SIZE_4X4];
+    CGU_Vec3f Mdl;
+
+    CGU_Vec3f rsltC0;
+    CGU_Vec3f
rsltC1; + CGU_Vec3f PosG0 = {0.0f, 0.0f, 0.0f}; + CGU_Vec3f PosG1 = {0.0f, 0.0f, 0.0f}; + CGU_UINT32 i; + + for (i = 0; i < dwUniqueColors; i++) + { + BlkUV[i] = BlkInBGRf_UV[i]; + } + + // if not more then 2 different colors, we've done + if (dwUniqueColors <= 2) + { + rsltC0 = BlkInBGRf_UV[0] * 255.0f; + rsltC1 = BlkInBGRf_UV[dwUniqueColors - 1] * 255.0f; + } + else + { + // This is our first attempt to find an axis we will go along. + // The cumulation is done to find a line minimizing the MSE from the + // input 3D points. + + // While trying to find the axis we found that the diameter of the input + // set is quite small. Do not bother. + + // FindAxisIsSmall(BlkSh, LineDir0, Mdl, Blk, Rpt,dwUniqueColors); + { + CGU_UINT32 ii; + CGU_UINT32 jj; + CGU_UINT32 kk; + + // These vars cannot be Vec3 as index to them are varying + CGU_FLOAT Crrl[NUM_CHANNELS]; + CGU_FLOAT RGB2[NUM_CHANNELS]; + + LineDir0[0] = LineDir0[1] = LineDir0[2] = RGB2[0] = RGB2[1] = RGB2[2] = Crrl[0] = Crrl[1] = Crrl[2] = Mdl.x = Mdl.y = Mdl.z = 0.f; + + // sum position of all points + CGU_FLOAT fNumPoints = 0.0f; + for (ii = 0; ii < dwUniqueColors; ii++) + { + Mdl.x += BlkUV[ii].x * Rpt[ii]; + Mdl.y += BlkUV[ii].y * Rpt[ii]; + Mdl.z += BlkUV[ii].z * Rpt[ii]; + fNumPoints += Rpt[ii]; + } + + // and then average to calculate center coordinate of block + Mdl /= fNumPoints; + + for (ii = 0; ii < dwUniqueColors; ii++) + { + // calculate output block as offsets around block center + BlkSh[ii] = BlkUV[ii] - Mdl; + + // compute correlation matrix + // RGB2 = sum of ((distance from point from center) squared) + RGB2[0] += BlkSh[ii].x * BlkSh[ii].x * Rpt[ii]; + RGB2[1] += BlkSh[ii].y * BlkSh[ii].y * Rpt[ii]; + RGB2[2] += BlkSh[ii].z * BlkSh[ii].z * Rpt[ii]; + + Crrl[0] += BlkSh[ii].x * BlkSh[ii].y * Rpt[ii]; + Crrl[1] += BlkSh[ii].y * BlkSh[ii].z * Rpt[ii]; + Crrl[2] += BlkSh[ii].z * BlkSh[ii].x * Rpt[ii]; + } + + // if set's diameter is small + CGU_UINT32 i0 = 0, i1 = 1; + CGU_FLOAT mxRGB2 = 0.0f; + + 
CGU_FLOAT fEPS = fNumPoints * EPS; + for (kk = 0, jj = 0; jj < 3; jj++) + { + if (RGB2[jj] >= fEPS) + kk++; + else + RGB2[jj] = 0.0f; + + if (mxRGB2 < RGB2[jj]) + { + mxRGB2 = RGB2[jj]; + i0 = jj; + } + } + + CGU_FLOAT fEPS2 = fNumPoints * EPS2; + CGU_BOOL AxisIsSmall; + + AxisIsSmall = (RGB2[0] < fEPS2); + AxisIsSmall = AxisIsSmall && (RGB2[1] < fEPS2); + AxisIsSmall = AxisIsSmall && (RGB2[2] < fEPS2); + + // all are very small to avoid division on the small determinant + if (AxisIsSmall) + { + rsltC0 = BlkInBGRf_UV[0] * 255.0f; + rsltC1 = BlkInBGRf_UV[dwUniqueColors - 1] * 255.0f; + } + else + { + // !AxisIsSmall + if (kk == 1) // really only 1 dimension + LineDir0[i0] = 1.; + else if (kk == 2) + { // really only 2 dimensions + i1 = (RGB2[(i0 + 1) % 3] > 0.f) ? (i0 + 1) % 3 : (i0 + 2) % 3; + CGU_FLOAT Crl = (i1 == (i0 + 1) % 3) ? Crrl[i0] : Crrl[(i0 + 2) % 3]; + LineDir0[i1] = Crl / RGB2[i0]; + LineDir0[i0] = 1.; + } + else + { + CGU_FLOAT maxDet = 100000.f; + CGU_FLOAT Cs[3]; + // select max det for precision + for (jj = 0; jj < 3; jj++) + { + // 3 = nDimensions + CGU_FLOAT Det = RGB2[jj] * RGB2[(jj + 1) % 3] - Crrl[jj] * Crrl[jj]; + Cs[jj] = cmp_fabs(Crrl[jj] / sqrt(RGB2[jj] * RGB2[(jj + 1) % 3])); + if (maxDet < Det) + { + maxDet = Det; + i0 = jj; + } + } + + // inverse correl matrix + // -- -- -- -- + // | A B | | C -B | + // | B C | => | -B A | + // -- -- -- -- + CGU_FLOAT mtrx1[2][2]; + CGU_FLOAT vc1[2]; + CGU_FLOAT vc[2]; + vc1[0] = Crrl[(i0 + 2) % 3]; + vc1[1] = Crrl[(i0 + 1) % 3]; + // C + mtrx1[0][0] = RGB2[(i0 + 1) % 3]; + // A + mtrx1[1][1] = RGB2[i0]; + // -B + mtrx1[1][0] = mtrx1[0][1] = -Crrl[i0]; + // find a solution + vc[0] = mtrx1[0][0] * vc1[0] + mtrx1[0][1] * vc1[1]; + vc[1] = mtrx1[1][0] * vc1[0] + mtrx1[1][1] * vc1[1]; + // normalize + vc[0] /= maxDet; + vc[1] /= maxDet; + // find a line direction vector + LineDir0[i0] = 1.; + LineDir0[(i0 + 1) % 3] = 1.; + LineDir0[(i0 + 2) % 3] = vc[0] + vc[1]; + } + + // normalize direction vector + 
CGU_FLOAT Len = LineDir0[0] * LineDir0[0] + LineDir0[1] * LineDir0[1] + LineDir0[2] * LineDir0[2]; + Len = sqrt(Len); + + LineDir0[0] = (Len > 0.f) ? LineDir0[0] / Len : 0.0f; + LineDir0[1] = (Len > 0.f) ? LineDir0[1] / Len : 0.0f; + LineDir0[2] = (Len > 0.f) ? LineDir0[2] / Len : 0.0f; + } + } // FindAxisIsSmall + + // GCC is being an awful being when it comes to goto-jumps. + // So please bear with this. + CGU_FLOAT ErrG = 10000000.f; + CGU_FLOAT PrjBnd0; + CGU_FLOAT PrjBnd1; + CGU_FLOAT ALIGN_16 PreMRep[BLOCK_SIZE_4X4]; + + LineDir.x = LineDir0[0]; + LineDir.y = LineDir0[1]; + LineDir.z = LineDir0[2]; + + // Here is the main loop. + // 1. Project input set on the axis in consideration. + // 2. Run 1 dimensional search (see scalar case) to find an (sub) optimal pair of end points. + // 3. Compute the vector of indexes (or clusters) for the current approximate ramp. + // 4. Present our color channels as 3 16DIM vectors. + // 5. Find closest approximation of each of 16DIM color vector with the projection of the 16DIM index vector. + // 6. Plug the projections as a new directional vector for the axis. + // 7. Goto 1. + // D - is 16 dim "index" vector (or 16 DIM vector of indexes - {0, 1/3,2/3, 0, ...,}, but shifted and normalized). + // Ci - is a 16 dim vector of color i. for each Ci find a scalar Ai such that (Ai * D - Ci) (Ai * D - Ci) -> min , + // i.e distance between vector AiD and C is min. You can think of D as a unit interval(vector) "clusterizer", and Ai is a scale + // you need to apply to the clusterizer to approximate the Ci vector instead of the unit vector. + // Solution is + // Ai = (D . Ci) / (D . D); . - is a dot product. + // in 3 dim space Ai(s) represent a line direction, along which + // we again try to find (sub)optimal quantizer. + // That's what our for(;;) loop is about. + for (;;) + { + // 1. Project input set on the axis in consideration. 
+ // From Foley & Van Dam: Closest point of approach of a line (P + v) to a + // point (R) is + // P + ((R-P).v) / (v.v))v + // The distance along v is therefore (R-P).v / (v.v) + // (v.v) is 1 if v is a unit vector. + // + PrjBnd0 = 1000.0f; + PrjBnd1 = -1000.0f; + for (i = 0; i < BLOCK_SIZE_4X4; i++) + Prj0[i] = Prj[i] = PrjErr[i] = PreMRep[i] = 0.f; + + for (i = 0; i < dwUniqueColors; i++) + { + Prj0[i] = Prj[i] = dot(BlkSh[i], LineDir); + PrjErr[i] = dot(BlkSh[i] - LineDir * Prj[i], BlkSh[i] - LineDir * Prj[i]); + PrjBnd0 = min(PrjBnd0, Prj[i]); + PrjBnd1 = max(PrjBnd1, Prj[i]); + } + + // 2. Run 1 dimensional search (see scalar case) to find an (sub) optimal + // pair of end points. + + // min and max of the search interval + CGU_FLOAT Scl0; + CGU_FLOAT Scl1; + Scl0 = PrjBnd0 - (PrjBnd1 - PrjBnd0) * 0.125f; + Scl1 = PrjBnd1 + (PrjBnd1 - PrjBnd0) * 0.125f; + + // compute scaling factor to scale down the search interval to [0.,1] + const CGU_FLOAT Scl2 = (Scl1 - Scl0) * (Scl1 - Scl0); + const CGU_FLOAT overScl = 1.f / (Scl1 - Scl0); + + for (i = 0; i < dwUniqueColors; i++) + { + // scale them + Prj[i] = (Prj[i] - Scl0) * overScl; + // premultiply the scale square to plug into error computation later + PreMRep[i] = Rpt[i] * Scl2; + } + + // scale first approximation of end points + PrjBnd0 = (PrjBnd0 - Scl0) * overScl; + PrjBnd1 = (PrjBnd1 - Scl0) * overScl; + + CGU_FLOAT StepErr = MAX_ERROR; + + // search step + CGU_FLOAT searchStep = 0.025f; + + // low Start/End; high Start/End + const CGU_FLOAT lowStartEnd = (PrjBnd0 - 2.f * searchStep > 0.f) ? PrjBnd0 - 2.f * searchStep : 0.f; + const CGU_FLOAT highStartEnd = (PrjBnd1 + 2.f * searchStep < 1.f) ? 
PrjBnd1 + 2.f * searchStep : 1.f; + + // find the best endpoints + CGU_FLOAT Pos0 = 0; + CGU_FLOAT Pos1 = 0; + CGU_FLOAT lowPosStep, highPosStep; + CGU_FLOAT err; + + int l, h; + for (l = 0, lowPosStep = lowStartEnd; l < 8; l++, lowPosStep += searchStep) + { + for (h = 0, highPosStep = highStartEnd; h < 8; h++, highPosStep -= searchStep) + { + // compute an error for the current pair of end points. + err = cgu_getRampErr(Prj, PrjErr, PreMRep, StepErr, lowPosStep, highPosStep, dwUniqueColors); + + if (err < StepErr) + { + // save better result + StepErr = err; + Pos0 = lowPosStep; + Pos1 = highPosStep; + } + } + } + + // inverse the scaling + Pos0 = Pos0 * (Scl1 - Scl0) + Scl0; + Pos1 = Pos1 * (Scl1 - Scl0) + Scl0; + + // did we find somthing better from the previous run? + if (StepErr + 0.001 < ErrG) + { + // yes, remember it + ErrG = StepErr; + LineDirG = LineDir; + + PosG0.x = Pos0; + PosG0.y = Pos0; + PosG0.z = Pos0; + PosG1.x = Pos1; + PosG1.y = Pos1; + PosG1.z = Pos1; + + // 3. Compute the vector of indexes (or clusters) for the current + // approximate ramp. + // indexes + const CGU_FLOAT step = (Pos1 - Pos0) / 3.0f; // (dwNumChannels=4 - 1); + const CGU_FLOAT step_h = step * (CGU_FLOAT)0.5; + const CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step; + const CGU_FLOAT overBlkTp = 1.f / 3.0f; // (dwNumChannels=4 - 1); + + // here the index vector is computed, + // shifted and normalized + CGU_FLOAT indxAvrg = 3.0f / 2.0f; // (dwNumChannels=4 - 1); + + for (i = 0; i < dwUniqueColors; i++) + { + CGU_FLOAT del; + // CGU_UINT32 n = (CGU_UINT32)((b - _min_ex + (step*0.5f)) * rstep); + if ((del = Prj0[i] - Pos0) <= 0) + RmpIndxs[i] = 0.f; + else if (Prj0[i] - Pos1 >= 0) + RmpIndxs[i] = 3.0f; // (dwNumChannels=4 - 1); + else + RmpIndxs[i] = cmp_floor((del + step_h) * rstep); + // shift and normalization + RmpIndxs[i] = (RmpIndxs[i] - indxAvrg) * overBlkTp; + } + + // 4. Present our color channels as 3 16 DIM vectors. + // 5. 
Find closest aproximation of each of 16DIM color vector with the + // pojection of the 16DIM index vector. + CGU_Vec3f Crs = {0.0f, 0.0f, 0.0f}; + CGU_FLOAT Len = 0.0f; + + for (i = 0; i < dwUniqueColors; i++) + { + const CGU_FLOAT PreMlt = RmpIndxs[i] * Rpt[i]; + Len += RmpIndxs[i] * PreMlt; + Crs.x += BlkSh[i].x * PreMlt; + Crs.y += BlkSh[i].y * PreMlt; + Crs.z += BlkSh[i].z * PreMlt; + } + + LineDir.x = LineDir.y = LineDir.z = 0.0f; + if (Len > 0.0f) + { + CGU_FLOAT Len2; + LineDir = Crs / Len; + // 6. Plug the projections as a new directional vector for the axis. + // 7. Goto 1. + Len2 = dot(LineDir, LineDir); // LineDir.x * LineDir.x + LineDir.y * LineDir.y + LineDir.z * LineDir.z; + Len2 = sqrt(Len2); + LineDir /= Len2; + } + } + else // We was not able to find anything better. Drop out. + break; + } + + // inverse transform to find end-points of 3-color ramp + rsltC0 = (PosG0 * LineDirG + Mdl) * 255.f; + rsltC1 = (PosG1 * LineDirG + Mdl) * 255.f; + } // !isDone + + // We've dealt with (almost) unrestricted full precision realm. + // Now back digital world. 
+ + // round the end points to make them look like compressed ones + CGU_Vec3f inpRmpEndPts0 = {0.0f, 255.0f, 0.0f}; + CGU_Vec3f inpRmpEndPts1 = {0.0f, 255.0f, 0.0f}; + CGU_Vec3f Fctrs0 = {8.0f, 4.0f, 8.0f}; //(1 << (PIX_GRID - BG)); x (1 << (PIX_GRID - GG)); y (1 << (PIX_GRID - RG)); z + CGU_Vec3f Fctrs1 = {32.0f, 64.0f, 32.0f}; //(CGU_FLOAT)(1 << RG); z (CGU_FLOAT)(1 << GG); y (CGU_FLOAT)(1 << BG); x + CGU_FLOAT _Min = 0.0f; + CGU_FLOAT _Max = 255.0f; + + { + // MkRmpOnGrid(inpRmpEndPts, rsltC, _Min, _Max); + + inpRmpEndPts0 = cmp_floorVec3f(rsltC0); + + if (inpRmpEndPts0.x <= _Min) + inpRmpEndPts0.x = _Min; + else + { + inpRmpEndPts0.x += cmp_floor(128.f / Fctrs1.x) - cmp_floor(inpRmpEndPts0.x / Fctrs1.x); + inpRmpEndPts0.x = min(inpRmpEndPts0.x, _Max); + } + if (inpRmpEndPts0.y <= _Min) + inpRmpEndPts0.y = _Min; + else + { + inpRmpEndPts0.y += cmp_floor(128.f / Fctrs1.y) - cmp_floor(inpRmpEndPts0.y / Fctrs1.y); + inpRmpEndPts0.y = min(inpRmpEndPts0.y, _Max); + } + if (inpRmpEndPts0.z <= _Min) + inpRmpEndPts0.z = _Min; + else + { + inpRmpEndPts0.z += cmp_floor(128.f / Fctrs1.z) - cmp_floor(inpRmpEndPts0.z / Fctrs1.z); + inpRmpEndPts0.z = min(inpRmpEndPts0.z, _Max); + } + + inpRmpEndPts0 = cmp_floorVec3f(inpRmpEndPts0 / Fctrs0) * Fctrs0; + + inpRmpEndPts1 = cmp_floorVec3f(rsltC1); + if (inpRmpEndPts1.x <= _Min) + inpRmpEndPts1.x = _Min; + else + { + inpRmpEndPts1.x += cmp_floor(128.f / Fctrs1.x) - cmp_floor(inpRmpEndPts1.x / Fctrs1.x); + inpRmpEndPts1.x = min(inpRmpEndPts1.x, _Max); + } + if (inpRmpEndPts1.y <= _Min) + inpRmpEndPts1.y = _Min; + else + { + inpRmpEndPts1.y += cmp_floor(128.f / Fctrs1.y) - cmp_floor(inpRmpEndPts1.y / Fctrs1.y); + inpRmpEndPts1.y = min(inpRmpEndPts1.y, _Max); + } + if (inpRmpEndPts1.z <= _Min) + inpRmpEndPts1.z = _Min; + else + { + inpRmpEndPts1.z += cmp_floor(128.f / Fctrs1.z) - cmp_floor(inpRmpEndPts1.z / Fctrs1.z); + inpRmpEndPts1.z = min(inpRmpEndPts1.z, _Max); + } + + inpRmpEndPts1 = cmp_floorVec3f(inpRmpEndPts1 / Fctrs0) * 
Fctrs0; + } // MkRmpOnGrid + + CMP_EndPoints EndPoints; + EndPoints.Color0 = inpRmpEndPts0; + EndPoints.Color1 = inpRmpEndPts1; + + return EndPoints; +} + +CMP_STATIC CMP_EndPoints +cgu_MkRmpOnGridBGR(CMP_IN CGU_Vec3f rsltC0, CMP_IN CGU_Vec3f rsltC1, CMP_IN CGU_UINT32 nRedBits, CMP_IN CGU_UINT32 nGreenBits, CMP_IN CGU_UINT32 nBlueBits) +{ + CGU_Vec3f inpRmpEndPts0 = {0.0f, 255.0f, 0.0f}; + CGU_Vec3f inpRmpEndPts1 = {0.0f, 255.0f, 0.0f}; + CGU_Vec3f Fctrs0 = {8.0f, 4.0f, 8.0f}; + CGU_Vec3f Fctrs1 = {32.0f, 64.0f, 32.0f}; + CGU_FLOAT _Min = 0.0f; + CGU_FLOAT _Max = 255.0f; + + // user override 565 default setting + if ((nRedBits != 5) || (nGreenBits != 6) || (nBlueBits != 5)) + { + Fctrs1[RC] = (CGU_FLOAT)(1 << nRedBits); + Fctrs1[GC] = (CGU_FLOAT)(1 << nGreenBits); + Fctrs1[BC] = (CGU_FLOAT)(1 << nBlueBits); + Fctrs0[RC] = (CGU_FLOAT)(1 << (PIX_GRID - nRedBits)); + Fctrs0[GC] = (CGU_FLOAT)(1 << (PIX_GRID - nGreenBits)); + Fctrs0[BC] = (CGU_FLOAT)(1 << (PIX_GRID - nBlueBits)); + } + + inpRmpEndPts0 = cmp_floorVec3f(rsltC0); + + if (inpRmpEndPts0.x <= _Min) + inpRmpEndPts0.x = _Min; + else + { + inpRmpEndPts0.x += cmp_floor(128.f / Fctrs1.x) - cmp_floor(inpRmpEndPts0.x / Fctrs1.x); + inpRmpEndPts0.x = cmp_minf(inpRmpEndPts0.x, _Max); + } + if (inpRmpEndPts0.y <= _Min) + inpRmpEndPts0.y = _Min; + else + { + inpRmpEndPts0.y += cmp_floor(128.f / Fctrs1.y) - cmp_floor(inpRmpEndPts0.y / Fctrs1.y); + inpRmpEndPts0.y = cmp_minf(inpRmpEndPts0.y, _Max); + } + if (inpRmpEndPts0.z <= _Min) + inpRmpEndPts0.z = _Min; + else + { + inpRmpEndPts0.z += cmp_floor(128.f / Fctrs1.z) - cmp_floor(inpRmpEndPts0.z / Fctrs1.z); + inpRmpEndPts0.z = cmp_minf(inpRmpEndPts0.z, _Max); + } + + inpRmpEndPts0 = cmp_floorVec3f(inpRmpEndPts0 / Fctrs0) * Fctrs0; + + inpRmpEndPts1 = cmp_floorVec3f(rsltC1); + if (inpRmpEndPts1.x <= _Min) + inpRmpEndPts1.x = _Min; + else + { + inpRmpEndPts1.x += cmp_floor(128.f / Fctrs1.x) - cmp_floor(inpRmpEndPts1.x / Fctrs1.x); + inpRmpEndPts1.x = 
cmp_minf(inpRmpEndPts1.x, _Max);
+    }
+    if (inpRmpEndPts1.y <= _Min)
+        inpRmpEndPts1.y = _Min;
+    else
+    {
+        inpRmpEndPts1.y += cmp_floor(128.f / Fctrs1.y) - cmp_floor(inpRmpEndPts1.y / Fctrs1.y);
+        inpRmpEndPts1.y = cmp_minf(inpRmpEndPts1.y, _Max);
+    }
+    if (inpRmpEndPts1.z <= _Min)
+        inpRmpEndPts1.z = _Min;
+    else
+    {
+        inpRmpEndPts1.z += cmp_floor(128.f / Fctrs1.z) - cmp_floor(inpRmpEndPts1.z / Fctrs1.z);
+        inpRmpEndPts1.z = cmp_minf(inpRmpEndPts1.z, _Max);
+    }
+
+    inpRmpEndPts1 = cmp_floorVec3f(inpRmpEndPts1 / Fctrs0) * Fctrs0;
+
+    CMP_EndPoints EndPoints;
+    EndPoints.Color0 = inpRmpEndPts0;
+    EndPoints.Color1 = inpRmpEndPts1;
+
+    return EndPoints;
+
+}  // MkRmpOnGrid
+
+//===================================================================
+// Replaces CompressBlockBC1_RGBA_Internal()
+// if ((errLQ > 0.0f) && (fquality > CMP_QUALITY2)) code block
+//
+// Encodes one 4x4 block of texels into a BC1 colour block.
+//   src_imageNorm : the 16 source texels. The .rgb channels are multiplied by
+//                   255 below, so they are presumably normalized to 0..1
+//                   (confirm against callers).
+//   BC15Options   : supplies the channel weights, the alpha threshold and the
+//                   sRGB flag. m_nRefinementSteps is read but forced to 0.
+//   returns       : x = both packed endpoint colours, the numerically larger
+//                   one in the low 16 bits; y = the packed per-texel indices.
+//===================================================================
+CMP_STATIC CGU_Vec2ui cgu_CompRGBBlock(CMP_IN CGU_Vec4f src_imageNorm[BLOCK_SIZE_4X4], CMP_IN CMP_BC15Options BC15Options)
+{
+    //CGU_FLOAT errLQ = 1e6f;
+    CGU_UINT32 m_nRefinementSteps = BC15Options.m_nRefinementSteps;
+    CGU_UINT32 dwAlphaThreshold   = BC15Options.m_nAlphaThreshold;
+    CGU_Vec3f  channelWeights     = {BC15Options.m_fChannelWeights[0], BC15Options.m_fChannelWeights[1], BC15Options.m_fChannelWeights[2]};
+    CGU_BOOL   isSRGB             = BC15Options.m_bIsSRGB;
+
+    CGU_Vec3f  rgbBlock_normal[BLOCK_SIZE_4X4];
+    CGU_UINT32 nCmpIndices = 0;
+    CGU_UINT32 c0, c1;
+    // High Quality
+    CMP_EndPoints EndPoints = {{0, 0, 0xFF}, {0, 0, 0xFF}};
+    CGU_UINT32    i;
+
+    CGU_FLOAT ALIGN_16 Rpt[BLOCK_SIZE_4X4];
+    CGU_UINT32         pcIndices = 0;
+
+    // The refinement step count from the options is overridden here.
+    m_nRefinementSteps = 0;
+
+    CGU_Vec3f BlkInBGRf_UV[BLOCK_SIZE_4X4];  // Normalized Block Input (0..1) in BGR channel format
+    // Default indices & endpoints for Transparent Block
+    CGU_Vec3ui nEndpoints0 = {0, 0, 0};           // Endpoints are stored BGR as x,y,z
+    CGU_Vec3ui nEndpoints1 = {0xFF, 0xFF, 0xFF};  // Endpoints are stored BGR as x,y,z
+
+    for (i = 0; i < BLOCK_SIZE_4X4; i++)
+    {
+        Rpt[i] = 0.0f;
+    }
+
+    //===============================================================
+    // Check if we have more than 2 colors and process Alpha block
+    CGU_UINT32 dwColors = 0;
+    CGU_UINT32 dwBlk[BLOCK_SIZE_4X4];
+    CGU_UINT32 R, G, B, A;
+    for (i = 0; i < BLOCK_SIZE_4X4; i++)
+    {
+        // Do any color conversion prior to processing the block
+        rgbBlock_normal[i] = isSRGB ? cmp_linearToSrgb(src_imageNorm[i].rgb) : src_imageNorm[i].rgb;
+
+        R = (CGU_UINT32)(rgbBlock_normal[i].x * 255.0f);
+        G = (CGU_UINT32)(rgbBlock_normal[i].y * 255.0f);
+        B = (CGU_UINT32)(rgbBlock_normal[i].z * 255.0f);
+
+        //if (dwAlphaThreshold > 0)
+        //    A = (CGU_UINT32)(src_imageNorm[i].w * 255.0f);
+        //else
+        A = 255;
+
+        // Punch Through Alpha in BC1 Codec (1 bit alpha)
+        //if ((dwAlphaThreshold == 0) || (A >= dwAlphaThreshold))
+        //{
+        // copy to local RGB data and have alpha set to 0xFF
+        dwBlk[dwColors++] = A << 24 | R << 16 | G << 8 | B;
+        //}
+    }
+
+    if (!dwColors)
+    {
+        // All colors are transparent.
+        // Color1.z is the component assigned here; the earlier code wrote
+        // Color0.z a second time and left Color1.z at its initial value.
+        EndPoints.Color0.x = EndPoints.Color0.y = EndPoints.Color0.z = 0.0f;
+        EndPoints.Color1.x = EndPoints.Color1.y = EndPoints.Color1.z = 255.0f;
+        nCmpIndices                                                  = 0xFFFFFFFF;
+    }
+    else
+    {
+        // We have colors to process
+        nCmpIndices = 0;
+        // Punch Through Alpha Support ToDo
+        // CGU_BOOL bHasAlpha = (dwColors != BLOCK_SIZE_4X4);
+        // bHasAlpha = bHasAlpha && (dwAlphaThreshold > 0); // valid for (dwNumChannels=4);
+        // if (bHasAlpha) {
+        //     CGU_Vec2ui compBlock = {0xf800f800,0};
+        //     return compBlock;
+        // }
+
+        // Here we compute the sorted list of unique colors.
+        // For each unique value we count its number of appearances.
+        // qsort((void *)dwBlk, (size_t)dwColors, sizeof(CGU_UINT32), QSortIntCmp);
+        {
+            CGU_UINT32 j;
+            CMP_di     what[BLOCK_SIZE_4X4];
+
+            for (i = 0; i < dwColors; i++)
+            {
+                what[i].index = i;
+                what[i].data  = dwBlk[i];
+            }
+
+            CGU_UINT32 tmp_index;
+            CGU_UINT32 tmp_data;
+
+            // Insertion sort, ascending by packed color value
+            for (i = 1; i < dwColors; i++)
+            {
+                for (j = i; j > 0; j--)
+                {
+                    if (what[j - 1].data > what[j].data)
+                    {
+                        tmp_index         = what[j].index;
+                        tmp_data          = what[j].data;
+                        what[j].index     = what[j - 1].index;
+                        what[j].data      = what[j - 1].data;
+                        what[j - 1].index = tmp_index;
+                        what[j - 1].data  = tmp_data;
+                    }
+                }
+            }
+            for (i = 0; i < dwColors; i++)
+                dwBlk[i] = what[i].data;
+        }
+        // Collapse the sorted list into unique colors (dwBlkU) and their
+        // repeat counts (Rpt).
+        CGU_UINT32 new_p;
+        CGU_UINT32 dwBlkU[BLOCK_SIZE_4X4];
+        CGU_UINT32 dwUniqueColors = 0;
+        new_p = dwBlkU[0] = dwBlk[0];
+        Rpt[dwUniqueColors] = 1.f;
+        for (i = 1; i < dwColors; i++)
+        {
+            if (new_p != dwBlk[i])
+            {
+                dwUniqueColors++;
+                new_p = dwBlkU[dwUniqueColors] = dwBlk[i];
+                Rpt[dwUniqueColors]            = 1.f;
+            }
+            else
+                Rpt[dwUniqueColors] += 1.f;
+        }
+        dwUniqueColors++;
+
+        // Simple case of only 2 colors to process
+        // no need for further processing as lowest quality methods work best for this case
+        if (dwUniqueColors <= 2)
+        {
+            // Take the endpoints from the unique-color list. Indexing the raw
+            // texel block with [0] and [dwUniqueColors - 1] reads texels 0 and 1,
+            // which may hold the same color even though the block has two.
+            CGU_UINT32 firstColor = dwBlkU[0];
+            CGU_UINT32 lastColor  = dwBlkU[dwUniqueColors - 1];
+            CGU_Vec3f  rsltC0;
+            CGU_Vec3f  rsltC1;
+            // Packed value is A<<24 | R<<16 | G<<8 | B; the result is stored BGR.
+            rsltC0.r  = (CGU_FLOAT)((firstColor >> 0) & 0xff);
+            rsltC0.g  = (CGU_FLOAT)((firstColor >> 8) & 0xff);
+            rsltC0.b  = (CGU_FLOAT)((firstColor >> 16) & 0xff);
+            rsltC1.r  = (CGU_FLOAT)((lastColor >> 0) & 0xff);
+            rsltC1.g  = (CGU_FLOAT)((lastColor >> 8) & 0xff);
+            rsltC1.b  = (CGU_FLOAT)((lastColor >> 16) & 0xff);
+            EndPoints = cgu_MkRmpOnGridBGR(rsltC0, rsltC1, 5, 6, 5);
+        }
+        else
+        {
+            // switch from int range back to UV floats
+            for (i = 0; i < dwUniqueColors; i++)
+            {
+                R                 = (dwBlkU[i] >> 16) & 0xff;
+                G                 = (dwBlkU[i] >> 8) & 0xff;
+                B                 = (dwBlkU[i] >> 0) & 0xff;
+                BlkInBGRf_UV[i].z = (CGU_FLOAT)R / 255.0f;
+                BlkInBGRf_UV[i].y = (CGU_FLOAT)G / 255.0f;
+                BlkInBGRf_UV[i].x = (CGU_FLOAT)B / 255.0f;
+            }
+
+            CGU_Vec3f channelWeightsBGR;
+            channelWeightsBGR.x = channelWeights.z;
+            channelWeightsBGR.y = channelWeights.y;
+            channelWeightsBGR.z = channelWeights.x;
+
+            EndPoints = cgu_CompressRGBBlockX(BlkInBGRf_UV, Rpt, dwUniqueColors, channelWeightsBGR, m_nRefinementSteps);
+        }
+    }  // colors
+
+    //===================================================================
+    // Process Cluster INPUT is constant EndPointsf OUTPUT is pcIndices
+    //===================================================================
+    if (nCmpIndices == 0)
+    {
+        R                   = (CGU_UINT32)(EndPoints.Color0.z);
+        G                   = (CGU_UINT32)(EndPoints.Color0.y);
+        B                   = (CGU_UINT32)(EndPoints.Color0.x);
+        CGU_INT32 cluster0 = cmp_constructColor(R, G, B);
+
+        R                   = (CGU_UINT32)(EndPoints.Color1.z);
+        G                   = (CGU_UINT32)(EndPoints.Color1.y);
+        B                   = (CGU_UINT32)(EndPoints.Color1.x);
+        CGU_INT32 cluster1 = cmp_constructColor(R, G, B);
+
+        CGU_Vec3f InpRmp[NUM_ENDPOINTS];
+        if ((cluster0 <= cluster1)  // valid for 4 channels
+                                    // || (cluster0 > cluster1)    // valid for 3 channels
+        )
+        {
+            // inverse endpoints
+            InpRmp[0] = EndPoints.Color1;
+            InpRmp[1] = EndPoints.Color0;
+        }
+        else
+        {
+            InpRmp[0] = EndPoints.Color0;
+            InpRmp[1] = EndPoints.Color1;
+        }
+
+        CGU_Vec3f srcblockBGR[BLOCK_SIZE_4X4];
+        CGU_FLOAT srcblockA[BLOCK_SIZE_4X4];
+
+        // Swizzle the source RGB to BGR for processing
+        for (i = 0; i < BLOCK_SIZE_4X4; i++)
+        {
+            srcblockBGR[i].z = rgbBlock_normal[i].x * 255.0f;
+            srcblockBGR[i].y = rgbBlock_normal[i].y * 255.0f;
+            srcblockBGR[i].x = rgbBlock_normal[i].z * 255.0f;
+            srcblockA[i]     = 255.0f;
+            if (dwAlphaThreshold > 0)
+            {
+                // Scale to 0..255 before truncating. A cast binds tighter than
+                // the multiply, so "(CGU_UINT32)w * 255.0f" truncated w first
+                // and could only yield 0 or 255.
+                CGU_UINT32 alpha = (CGU_UINT32)(src_imageNorm[i].w * 255.0f);
+                if (alpha >= dwAlphaThreshold)
+                    srcblockA[i] = alpha;
+            }
+        }
+
+        // input ramp is on the coarse grid
+        // make ramp endpoints the way they are going to be decompressed
+        CGU_Vec3f InpRmpL[NUM_ENDPOINTS];
+        CGU_Vec3f Fctrs = {32.0F, 64.0F, 32.0F};  // 1 << RG,1 << GG,1 << BG
+
+        {
+            // ConstantRamp = MkWkRmpPts(InpRmpL, InpRmp);
+            InpRmpL[0] = InpRmp[0] + cmp_floorVec3f(InpRmp[0] / Fctrs);
+            InpRmpL[0] = cmp_clampVec3f(InpRmpL[0], 0.0f, 255.0f);
+            InpRmpL[1] = InpRmp[1] + cmp_floorVec3f(InpRmp[1] / Fctrs);
+            InpRmpL[1] = cmp_clampVec3f(InpRmpL[1], 0.0f, 255.0f);
+        }  // MkWkRmpPts
+
+        // build ramp
+        CGU_Vec3f LerpRmp[4];
+        CGU_Vec3f offset = {1.0f, 1.0f, 1.0f};
+        {
+            //BldRmp(Rmp, InpRmpL, dwNumChannels);
+            // linear interpolate end points to get the ramp
+            LerpRmp[0] = InpRmpL[0];
+            LerpRmp[3] = InpRmpL[1];
+            LerpRmp[1] = cmp_floorVec3f((InpRmpL[0] * 2.0f + LerpRmp[3] + offset) / 3.0f);
+            LerpRmp[2] = cmp_floorVec3f((InpRmpL[0] + LerpRmp[3] * 2.0f + offset) / 3.0f);
+        }  // BldRmp
+
+        //=========================================================================
+        // Clusterize, Compute error and find DXTC indexes for the current cluster
+        //=========================================================================
+        {
+            // Clusterize
+            CGU_UINT32 alpha;
+
+            // For each colour in the original block assign it
+            // to the closest cluster and compute the cumulative error
+            for (i = 0; i < BLOCK_SIZE_4X4; i++)
+            {
+                alpha = (CGU_UINT32)srcblockA[i];
+                // NOTE(review): srcblockA[i] is either 255 or a value that is
+                // >= dwAlphaThreshold, so with a non-zero threshold alpha is
+                // never 0 here and this branch is not taken. Punch-through
+                // alpha is marked ToDo above.
+                if ((dwAlphaThreshold > 0) && alpha == 0)
+                {                                         //*((CGU_DWORD *)&_Blk[i][AC]) == 0)
+                    pcIndices |= cmp_set2Bit32(4, i);  // dwNumChannels 3 or 4 (default is 4)
+                }
+                else
+                {
+                    CGU_FLOAT shortest      = 99999999999.f;
+                    CGU_UINT8 shortestIndex = 0;
+
+                    CGU_Vec3f channelWeightsBGR;
+                    channelWeightsBGR.x = channelWeights.z;
+                    channelWeightsBGR.y = channelWeights.y;
+                    channelWeightsBGR.z = channelWeights.x;
+
+                    for (CGU_UINT8 rampindex = 0; rampindex < 4; rampindex++)
+                    {
+                        // r is either 1 or 4
+                        // calculate the distance for each component
+                        CGU_FLOAT distance =
+                            dot(((srcblockBGR[i] - LerpRmp[rampindex]) * channelWeightsBGR), ((srcblockBGR[i] - LerpRmp[rampindex]) * channelWeightsBGR));
+                        if (distance < shortest)
+                        {
+                            shortest      = distance;
+                            shortestIndex = rampindex;
+                        }
+                    }
+
+                    // The total is a sum of (error += shortest)
+                    // We have the index of the best cluster, so assign this in the block
+                    // Reorder indices to match correct DXTC ordering
+                    // (ramp position 0,1,2,3 -> stored index 0,2,3,1)
+                    if (shortestIndex == 3)  // dwNumChannels - 1
+                        shortestIndex = 1;
+                    else if (shortestIndex)
+                        shortestIndex++;
+                    pcIndices |= cmp_set2Bit32(shortestIndex, i);
+                }
+            }  // BLOCK_SIZE_4X4
+        }      // Clusterize
+    }          // Process Cluster
+
+    //==============================================================
+    // Generate Compressed Result from nEndpoints & pcIndices
+    //==============================================================
+    c0 = cmp_constructColorBGR(EndPoints.Color0);
+    c1 = cmp_constructColorBGR(EndPoints.Color1);
+
+    // Get Processed indices if not set
+    if (nCmpIndices == 0)
+        nCmpIndices = pcIndices;
+
+    // The numerically larger packed color always goes in the low 16 bits.
+    CGU_Vec2ui cmpBlock;
+    if (c0 <= c1)
+    {
+        cmpBlock.x = c1 | (c0 << 16);
+    }
+    else
+        cmpBlock.x = c0 | (c1 << 16);
+
+    cmpBlock.y = nCmpIndices;
+
+    return cmpBlock;
+}
+
+CMP_STATIC void cgu_ProcessColors(CMP_INOUT CGU_Vec3f CMP_PTRINOUT colorMin,
+                                  CMP_INOUT CGU_Vec3f CMP_PTRINOUT colorMax,
+                                  CMP_INOUT CGU_UINT32 CMP_PTRINOUT c0,
+                                  CMP_INOUT CGU_UINT32 CMP_PTRINOUT c1,
+                                  CMP_IN CGU_INT setopt,
+                                  CMP_IN CGU_BOOL isSRGB)
+{
+    // CGU_UINT32 srbMap[32] = {0,5,8,11,12,13,14,15,16,17,18,19,20,21,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31};
+    // CGU_UINT32 sgMap[64] = {0,10,14,16,19,20,22,24,25,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,42,43,43,44,45,45,
+    // 46,47,47,48,48,49,50,50,51,52,52,53,53,54,54,55,55,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63};
+    CGU_INT32 x, y, z;
+    CGU_Vec3f scale = {31.0f, 63.0f, 31.0f};
+    CGU_Vec3f MinColorScaled;
+    CGU_Vec3f MaxColorScaled;
+
+    // Clamp or Transform is needed, the transforms have built in clamps
+    if (isSRGB)
+    {
+        MinColorScaled = cmp_linearToSrgb(CMP_PTRINOUT colorMin);
+        MaxColorScaled = cmp_linearToSrgb(CMP_PTRINOUT colorMax);
+    }
+    else
+    {
+        MinColorScaled = cmp_clampVec3f(CMP_PTRINOUT colorMin, 0.0f, 1.0f);
+        MaxColorScaled = cmp_clampVec3f(CMP_PTRINOUT colorMax, 0.0f, 1.0f);
+    }
+
+    switch (setopt)
+    {
+    case 0:  // Use Min Max processing
+        MinColorScaled =
cmp_floorVec3f(MinColorScaled * scale);
+        MaxColorScaled        = cmp_ceilVec3f(MaxColorScaled * scale);
+        CMP_PTRINOUT colorMin = MinColorScaled / scale;
+        CMP_PTRINOUT colorMax = MaxColorScaled / scale;
+        break;
+    default:  // Use round processing
+        MinColorScaled = round(MinColorScaled * scale);
+        MaxColorScaled = round(MaxColorScaled * scale);
+        break;
+    }
+
+    x = (CGU_UINT32)(MinColorScaled.x);
+    y = (CGU_UINT32)(MinColorScaled.y);
+    z = (CGU_UINT32)(MinColorScaled.z);
+
+    //if (isSRGB) {
+    //    // scale RB
+    //    x = srbMap[x]; // &0x1F];
+    //    y = sgMap [y]; // &0x3F];
+    //    z = srbMap[z]; // &0x1F];
+    //    // scale G
+    //}
+    CMP_PTRINOUT c0 = (x << 11) | (y << 5) | z;
+
+    x               = (CGU_UINT32)(MaxColorScaled.x);
+    y               = (CGU_UINT32)(MaxColorScaled.y);
+    z               = (CGU_UINT32)(MaxColorScaled.z);
+    CMP_PTRINOUT c1 = (x << 11) | (y << 5) | z;
+}
+
+//--------------------------------------------------------------------------------------------------------
+// Picks, for each of the 16 texels in block[], one of the four ramp points
+// between maxColor and minColor by projecting the texel onto the
+// maxColor -> minColor axis, and packs the 2-bit results into cmpindex
+// (texel i occupies bits 2*i and 2*i+1).
+//
+// Returns 0 unless getErr is set; with getErr the return value is the summed
+// squared distance between each texel and its chosen ramp color, multiplied
+// by 0.0208333 (approximately 1/48).
+//
+// If minColor equals maxColor every texel receives index 0.
+//--------------------------------------------------------------------------------------------------------
+CMP_STATIC CGU_FLOAT cgu_getIndicesRGB(CMP_INOUT CGU_UINT32 CMP_PTRINOUT cmpindex,
+                                       CMP_IN const CGU_Vec3f block[16],
+                                       CMP_IN CGU_Vec3f minColor,
+                                       CMP_IN CGU_Vec3f maxColor,
+                                       CMP_IN CGU_BOOL getErr)
+{
+    CGU_UINT32 PackedIndices = 0;
+    CGU_FLOAT  err           = 0.0f;
+    CGU_Vec3f  cn[4];
+    CGU_FLOAT  minDistance;
+
+    if (getErr)
+    {
+        // remap to BC1 spec for decoding offsets,
+        // where cn[0] > cn[1] Max Color = index 0, 2/3 offset =index 2, 1/3 offset = index 3, Min Color = index 1
+        cn[0] = maxColor;
+        cn[1] = minColor;
+        cn[2] = cn[0] * 2.0f / 3.0f + cn[1] * 1.0f / 3.0f;
+        cn[3] = cn[0] * 1.0f / 3.0f + cn[1] * 2.0f / 3.0f;
+    }
+
+    // Guard the reciprocal: identical endpoints give a zero-length axis, and
+    // dividing by it would turn every projection into NaN. A zero scale makes
+    // every offset 0 instead, which selects maxColor for all texels.
+    CGU_Vec3f Range       = minColor - maxColor;
+    CGU_FLOAT RangeSq     = cmp_dotVec3f(Range, Range);
+    CGU_FLOAT Scale       = (RangeSq > 0.0f) ? 3.f / RangeSq : 0.0f;
+    CGU_Vec3f ScaledRange = Range * Scale;
+    CGU_FLOAT Bias        = (cmp_dotVec3f(maxColor, maxColor) - cmp_dotVec3f(maxColor, minColor)) * Scale;
+    CGU_INT   indexMap[4] = {0, 2, 3, 1};  // mapping based on BC1 Spec for color0 > color1
+    CGU_UINT32 index;
+    CGU_FLOAT  diff;
+
+    for (CGU_UINT32 i = 0; i < 16; i++)
+    {
+        // Get offset from base scale: 0 at maxColor, 3 at minColor
+        diff = cmp_dotVec3f(block[i], ScaledRange) + Bias;
+
+        // Keep the offset on the ramp. Without this a texel projecting past
+        // minColor could round to 4 and wrap to index 0 through the mask
+        // below, and a negative offset would be cast to an unsigned value.
+        if (diff < 0.0f)
+            diff = 0.0f;
+        else if (diff > 3.0f)
+            diff = 3.0f;
+
+        index = ((CGU_UINT32)round(diff)) & 0x3;
+
+        // remap linear offset to spec offset
+        index = indexMap[index];
+
+        // use err calc for use in higher quality code
+        if (getErr)
+        {
+            minDistance = cmp_dotVec3f(block[i] - cn[index], block[i] - cn[index]);
+            err += minDistance;
+        }
+
+        // Map the 2 bit index into compress 32 bit block
+        if (index)
+            PackedIndices |= (index << (2 * i));
+    }
+
+    if (getErr)
+        err = err * 0.0208333f;
+
+    CMP_PTRINOUT cmpindex = PackedIndices;
+    return err;
+}
+
+//--------------------------------------------------------------------------------------------------------
+// Decompress is RGB (0.0f..255.0f)
+//
+// Decodes one BC1 color block into rgbBlock[0..15].
+//   compressedBlock.x : two 16-bit endpoint colors, n0 in the low half and
+//                       n1 in the high half
+//   compressedBlock.y : sixteen 2-bit selectors, texel i at bits 2*i..2*i+1
+// n0 > n1 decodes with a four-color ramp. Otherwise selector 2 is the
+// midpoint of the endpoints and selector 3 decodes to 0.
+//--------------------------------------------------------------------------------------------------------
+CMP_STATIC void cgu_decompressRGBBlock(CMP_INOUT CGU_Vec3f rgbBlock[BLOCK_SIZE_4X4], const CGU_Vec2ui compressedBlock)
+{
+    CGU_UINT32 n0 = compressedBlock.x & 0xffff;
+    CGU_UINT32 n1 = compressedBlock.x >> 16;
+    CGU_UINT32 index;
+
+    //-------------------------------------------------------
+    // Decode the compressed block 0..255 color range
+    //-------------------------------------------------------
+    CGU_Vec3f c0 = cmp_565ToLinear(n0);  // max color
+    CGU_Vec3f c1 = cmp_565ToLinear(n1);  // min color
+    CGU_Vec3f c2;
+    CGU_Vec3f c3;
+
+    if (n0 > n1)
+    {
+        // Four-color ramp: c2 and c3 sit one third and two thirds of the way
+        // from c0 to c1.
+        c2 = (c0 * 2.0f + c1) / 3.0f;
+        c3 = (c1 * 2.0f + c0) / 3.0f;
+
+        for (CGU_UINT32 i = 0; i < 16; i++)
+        {
+            index = (compressedBlock.y >> (2 * i)) & 3;
+            switch (index)
+            {
+            case 0:
+                rgbBlock[i] = c0;
+                break;
+            case 1:
+                rgbBlock[i] = c1;
+                break;
+            case 2:
+                rgbBlock[i] = c2;
+                break;
+            case 3:
+                rgbBlock[i] = c3;
+                break;
+            }
+        }
+    }
+    else
+    {
+        // Transparent decode
+        c2 = (c0 + c1) / 2.0f;
+
+        for (CGU_UINT32 i = 0; i < 16; i++)
+        {
+            index = (compressedBlock.y >> (2 * i)) & 3;
+            switch (index)
+            {
+            case 0:
+                rgbBlock[i] = c0;
+                break;
+            case 1:
+                rgbBlock[i] = c1;
+                break;
+            case 2:
+                rgbBlock[i] = c2;
+                break;
+            case 3:
+                rgbBlock[i] = 0.0f;
+                break;
+            }
+        }
+    }
+}
+
+// The source is 0..255
+CMP_STATIC
float cgu_RGBABlockErrorLinear(const CGU_Vec4uc src_rgbBlock[BLOCK_SIZE_4X4], const CGU_Vec2ui compressedBlock) +{ + CGU_Vec3f rgbBlock[BLOCK_SIZE_4X4]; + + // Decompressed block channels are 0..255 + cgu_decompressRGBBlock(rgbBlock, compressedBlock); + + //------------------------------------------------------------------ + // Calculate MSE of the block + // Note : pow is used as Float type for the code to be usable on CPU + //------------------------------------------------------------------ + CGU_Vec3f serr; + serr = 0.0f; + + float sR, sG, sB, R, G, B; + + for (int j = 0; j < 16; j++) + { + sR = src_rgbBlock[j].x; + sG = src_rgbBlock[j].y; + sB = src_rgbBlock[j].z; + + R = rgbBlock[j].x; + G = rgbBlock[j].y; + B = rgbBlock[j].z; + + // Norm colors + serr.x += pow(sR - R, 2.0f); + serr.y += pow(sG - G, 2.0f); + serr.z += pow(sB - B, 2.0f); + } + + // MSE for 16 texels + return (serr.x + serr.y + serr.z) / 48.0f; +} + +// The source is 0..1, decompressed data using cmp_decompressRGBBlock2 is 0..255 which is converted down to 0..1 +CMP_STATIC float cgu_RGBBlockError(const CGU_Vec3f src_rgbBlock[BLOCK_SIZE_4X4], const CGU_Vec2ui compressedBlock, CGU_BOOL isSRGB) +{ + CGU_Vec3f rgbBlock[BLOCK_SIZE_4X4]; + + // Decompressed block channels are 0..255 + cgu_decompressRGBBlock(rgbBlock, compressedBlock); + + //------------------------------------------------------------------ + // Calculate MSE of the block + // Note : pow is used as Float type for the code to be usable on CPU + //------------------------------------------------------------------ + CGU_Vec3f serr; + serr = 0.0f; + + float sR, sG, sB, R, G, B; + + for (int j = 0; j < 16; j++) + { + if (isSRGB) + { + sR = round(cmp_linearToSrgbf(src_rgbBlock[j].x) * 255.0f); + sG = round(cmp_linearToSrgbf(src_rgbBlock[j].y) * 255.0f); + sB = round(cmp_linearToSrgbf(src_rgbBlock[j].z) * 255.0f); + } + else + { + sR = round(src_rgbBlock[j].x * 255.0f); + sG = round(src_rgbBlock[j].y * 255.0f); + sB = round(src_rgbBlock[j].z 
* 255.0f); + } + + R = rgbBlock[j].x; + G = rgbBlock[j].y; + B = rgbBlock[j].z; + + // Norm colors + serr.x += pow(sR - R, 2.0f); + serr.y += pow(sG - G, 2.0f); + serr.z += pow(sB - B, 2.0f); + } + + // MSE for 16 texels + return (serr.x + serr.y + serr.z) / 48.0f; +} + +CMP_STATIC CGU_Vec2ui cgu_CompressRGBBlock_MinMax(CMP_IN const CGU_Vec3f src_imageRGB[16], + CMP_IN CGU_FLOAT fquality, + CMP_IN CGU_BOOL isSRGB, + CMP_INOUT CGU_Vec3f srcRGB[16], // The list of source colors with blue channel altered + CMP_INOUT CGU_Vec3f CMP_REFINOUT average_rgb, // The centrepoint of the axis + CMP_INOUT CGU_FLOAT CMP_REFINOUT errout) +{ + CGU_Vec2ui Q1CompData = {0, 0}; + CGU_Vec3f rgb = {0, 0, 0}; + + // ------------------------------------------------------------------------------------- + // (1) Find the array of unique pixel values and sum them to find their average position + // ------------------------------------------------------------------------------------- + CGU_FLOAT errLQ = 0.0f; + CGU_BOOL fastProcess = (fquality <= CMP_QUALITY0); // Min Max only + CGU_Vec3f srcMin = 1.0f; // Min source color + CGU_Vec3f srcMax = 0.0f; // Max source color + CGU_Vec2ui Q1compressedBlock = {0, 0}; + CGU_UINT32 c0 = 0; + CGU_UINT32 c1 = 0; + + average_rgb = 0.0f; + // Get average and modifed src + // find average position and save list of pixels as 0F..255F range for processing + // Note: z (blue) is average of blue+green channels + for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++) + { + srcMin = cmp_minVec3f(srcMin, src_imageRGB[i]); + srcMax = cmp_maxVec3f(srcMax, src_imageRGB[i]); + if (!fastProcess) + { + rgb = isSRGB ? cmp_linearToSrgb(src_imageRGB[i]) : cmp_saturate(src_imageRGB[i]); + rgb.z = (rgb.y + rgb.z) * 0.5F; // Z-axiz => (R+G)/2 + srcRGB[i] = rgb; + average_rgb = average_rgb + rgb; + } + } + + // Process two colors for saving in 565 format as C0 and C1 + cgu_ProcessColors(CMP_REFINOUT srcMin, CMP_REFINOUT srcMax, CMP_REFINOUT c0, CMP_REFINOUT c1, isSRGB ? 
1 : 0, isSRGB); + + // Save simple min-max encoding + if (c0 < c1) + { + Q1CompData.x = (c0 << 16) | c1; + CGU_UINT32 index = 0; + errLQ = cgu_getIndicesRGB(CMP_REFINOUT index, src_imageRGB, srcMin, srcMax, false); + Q1CompData.y = index; + errout = cgu_RGBBlockError(src_imageRGB, Q1CompData, isSRGB); + } + else + { + // Most simple case all colors are equal or 0.0f + Q1compressedBlock.x = (c1 << 16) | c0; + Q1compressedBlock.y = 0; + errout = 0.0f; + return Q1compressedBlock; + } + // 0.0625F is (1/BLOCK_SIZE_4X4) + average_rgb = average_rgb * 0.0625F; + + return Q1CompData; +} + +CMP_STATIC CGU_Vec2ui cgu_CompressRGBBlock_Fast(CMP_IN const CGU_Vec3f src_imageRGB[16], + CMP_IN CGU_FLOAT fquality, + CMP_IN CGU_BOOL isSRGB, + CMP_IN CGU_Vec3f srcRGB[16], + CMP_IN CGU_Vec3f CMP_REFINOUT average_rgb, + CMP_INOUT CGU_FLOAT CMP_REFINOUT errout) +{ + CMP_UNUSED(fquality); + + CGU_Vec3f axisVectorRGB = {0.0f, 0.0f, 0.0f}; // The axis vector for index projection + CGU_FLOAT pos_on_axis[16]; // The distance each unique falls along the compression axis + CGU_FLOAT axisleft = 0; // The extremities and centre (average of left/right) of srcRGB along the compression axis + CGU_FLOAT axisright = 0; // The extremities and centre (average of left/right) of srcRGB along the compression axis + CGU_FLOAT axiscentre = 0; // The extremities and centre (average of left/right) of srcRGB along the compression axis + CGU_INT32 swap = 0; // Indicator if the RGB values need swapping to generate an opaque result + CGU_Vec3f srcBlock[16]; // The list of source colors with any color space transforms and clipping + CGU_UINT32 c0 = 0; + CGU_UINT32 c1 = 0; + CGU_Vec2ui compressedBlock = {0, 0}; + CGU_FLOAT Q1CompErr; + CGU_Vec2ui Q1CompData = {0, 0}; + + CGU_Vec3f rgb = {0, 0, 0}; + + // ------------------------------------------------------------------------------------- + // (4) For each component, reflect points about the average so all lie on the same side + // of the average, and compute the 
new average - this gives a second point that defines the axis + // To compute the sign of the axis sum the positive differences of G for each of R and B (the + // G axis is always positive in this implementation + // ------------------------------------------------------------------------------------- + // An interesting situation occurs if the G axis contains no information, in which case the RB + // axis is also compared. I am not entirely sure if this is the correct implementation - should + // the priority axis be determined by magnitude? + { + CGU_FLOAT rg_pos = 0.0f; + CGU_FLOAT bg_pos = 0.0f; + CGU_FLOAT rb_pos = 0.0f; + + for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++) + { + rgb = srcRGB[i] - average_rgb; + axisVectorRGB = axisVectorRGB + cmp_fabsVec3f(rgb); + if (rgb.x > 0) + { + rg_pos += rgb.y; + rb_pos += rgb.z; + } + if (rgb.z > 0) + bg_pos += rgb.y; + } + + // Average over BLOCK_SIZE_4X4 + axisVectorRGB = axisVectorRGB * 0.0625F; + + // New average position + if (rg_pos < 0) + axisVectorRGB.x = -axisVectorRGB.x; + if (bg_pos < 0) + axisVectorRGB.z = -axisVectorRGB.z; + if ((rg_pos == bg_pos) && (rg_pos == 0)) + { + if (rb_pos < 0) + axisVectorRGB.z = -axisVectorRGB.z; + } + } + + // ------------------------------------------------------------------------------------- + // (5) Axis projection and remapping + // ------------------------------------------------------------------------------------- + { + CGU_FLOAT v2_recip; + // Normalize the axis for simplicity of future calculation + v2_recip = cmp_dotVec3f(axisVectorRGB, axisVectorRGB); + if (v2_recip > 0) + v2_recip = 1.0f / (CGU_FLOAT)cmp_sqrt(v2_recip); + else + v2_recip = 1.0f; + axisVectorRGB = axisVectorRGB * v2_recip; + } + + // ------------------------------------------------------------------------------------- + // (6) Map the axis + // ------------------------------------------------------------------------------------- + // the line joining (and extended on either side of) average and axis + 
// defines the axis onto which the points will be projected + // Project all the points onto the axis, calculate the distance along + // the axis from the centre of the axis (average) + // From Foley & Van Dam: Closest point of approach of a line (P + v) to a point (R) is + // P + ((R-P).v) / (v.v))v + // The distance along v is therefore (R-P).v / (v.v) where (v.v) is 1 if v is a unit vector. + // + // Calculate the extremities at the same time - these need to be reasonably accurately + // represented in all cases + { + axisleft = CMP_FLOAT_MAX; + axisright = -CMP_FLOAT_MAX; + for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++) + { + // Compute the distance along the axis of the point of closest approach + CGU_Vec3f temp = (srcRGB[i] - average_rgb); + pos_on_axis[i] = cmp_dotVec3f(temp, axisVectorRGB); + + // Work out the extremities + if (pos_on_axis[i] < axisleft) + axisleft = pos_on_axis[i]; + if (pos_on_axis[i] > axisright) + axisright = pos_on_axis[i]; + } + } + + // --------------------------------------------------------------------------------------------- + // (7) Now we have a good axis and the basic information about how the points are mapped to it + // Our initial guess is to represent the endpoints accurately, by moving the average + // to the centre and recalculating the point positions along the line + // --------------------------------------------------------------------------------------------- + { + axiscentre = (axisleft + axisright) * 0.5F; + average_rgb = average_rgb + (axisVectorRGB * axiscentre); + for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++) + pos_on_axis[i] -= axiscentre; + axisright -= axiscentre; + axisleft -= axiscentre; + } + + // ------------------------------------------------------------------------------------- + // (8) Calculate the high and low output colour values + // Involved in this is a rounding procedure which is undoubtedly slightly twitchy. 
A + // straight rounded average is not correct, as the decompressor 'unrounds' by replicating + // the top bits to the bottom. + // In order to take account of this process, we don't just apply a straight rounding correction, + // but base our rounding on the input value (a straight rounding is actually pretty good in terms of + // error measure, but creates a visual colour and/or brightness shift relative to the original image) + // The method used here is to apply a centre-biased rounding dependent on the input value, which was + // (mostly by experiment) found to give minimum MSE while preserving the visual characteristics of + // the image. + // rgb = (average_rgb + (left|right)*axisVectorRGB); + // ------------------------------------------------------------------------------------- + { + CGU_Vec3f MinColor, MaxColor; + + MinColor = average_rgb + (axisVectorRGB * axisleft); + MaxColor = average_rgb + (axisVectorRGB * axisright); + MinColor.z = (MinColor.z * 2) - MinColor.y; + MaxColor.z = (MaxColor.z * 2) - MaxColor.y; + + cgu_ProcessColors(CMP_REFINOUT MinColor, CMP_REFINOUT MaxColor, CMP_REFINOUT c0, CMP_REFINOUT c1, 1, false); + + // Force to be a 4-colour opaque block - in which case, c0 is greater than c1 + swap = 0; + if (c0 < c1) + { + CGU_UINT32 t; + t = c0; + c0 = c1; + c1 = t; + swap = 1; + } + else if (c0 == c1) + { + // This block will always be encoded in 3-colour mode + // Need to ensure that only one of the two points gets used, + // avoiding accidentally setting some transparent pixels into the block + for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++) + pos_on_axis[i] = axisleft; + } + + compressedBlock.x = c0 | (c1 << 16); + + // ------------------------------------------------------------------------------------- + // (9) Final clustering, creating the 2-bit values that define the output + // ------------------------------------------------------------------------------------- + + CGU_UINT32 index; + CGU_FLOAT division; + { + compressedBlock.y = 
0; + division = axisright * 2.0f / 3.0f; + axiscentre = (axisleft + axisright) / 2; // Actually, this code only works if centre is 0 or approximately so + + CGU_FLOAT CompMinErr; + + // This feature is work in progress + // remap to BC1 spec for decoding offsets, + // where cn[0] > cn[1] Max Color = index 0, 2/3 offset =index 2, 1/3 offset = index 3, Min Color = index 1 + // CGU_Vec3f cn[4]; + // cn[0] = MaxColor; + // cn[1] = MinColor; + // cn[2] = cn[0]*2.0f/3.0f + cn[1]*1.0f/3.0f; + // cn[3] = cn[0]*1.0f/3.0f + cn[1]*2.0f/3.0f; + + for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++) + { + // Endpoints (indicated by block > average) are 0 and 1, while + // interpolants are 2 and 3 + if (cmp_fabs(pos_on_axis[i]) >= division) + index = 0; + else + index = 2; + // Positive is in the latter half of the block + if (pos_on_axis[i] >= axiscentre) + index += 1; + + index = index ^ swap; + // Set the output, taking swapping into account + compressedBlock.y |= (index << (2 * i)); + + // use err calc for use in higher quality code + //CompMinErr += cmp_dotVec3f(srcRGBRef[i] - cn[index],srcRGBRef[i] - cn[index]); + } + + //CompMinErr = CompMinErr * 0.0208333f; + + CompMinErr = cgu_RGBBlockError(src_imageRGB, compressedBlock, isSRGB); + Q1CompErr = cgu_RGBBlockError(src_imageRGB, Q1CompData, isSRGB); + + if (CompMinErr > Q1CompErr) + { + compressedBlock = Q1CompData; + errout = Q1CompErr; + } + else + errout = CompMinErr; + } + } + // done + + return compressedBlock; +} + +CMP_STATIC CGU_UINT8 g_Match5Bit[256][2] = { + {0, 0}, {0, 0}, {1, 0}, {1, 0}, {0, 1}, {0, 1}, {0, 1}, {1, 1}, {1, 1}, {1, 1}, {0, 2}, {4, 0}, {1, 2}, {1, 2}, {1, 2}, + {2, 2}, {2, 2}, {2, 2}, {1, 3}, {5, 1}, {2, 3}, {2, 3}, {0, 4}, {3, 3}, {3, 3}, {3, 3}, {2, 4}, {2, 4}, {2, 4}, {5, 3}, + {1, 5}, {1, 5}, {2, 5}, {4, 4}, {4, 4}, {3, 5}, {3, 5}, {2, 6}, {2, 6}, {2, 6}, {3, 6}, {5, 5}, {5, 5}, {4, 6}, {8, 4}, + {3, 7}, {3, 7}, {3, 7}, {6, 6}, {6, 6}, {6, 6}, {5, 7}, {9, 5}, {6, 7}, {6, 7}, {4, 8}, {7, 7}, {7, 7}, 
{7, 7}, {6, 8}, + {6, 8}, {6, 8}, {9, 7}, {5, 9}, {5, 9}, {6, 9}, {8, 8}, {8, 8}, {7, 9}, {7, 9}, {6, 10}, {6, 10}, {6, 10}, {7, 10}, {9, 9}, + {9, 9}, {8, 10}, {12, 8}, {7, 11}, {7, 11}, {7, 11}, {10, 10}, {10, 10}, {10, 10}, {9, 11}, {13, 9}, {10, 11}, {10, 11}, {8, 12}, {11, 11}, + {11, 11}, {11, 11}, {10, 12}, {10, 12}, {10, 12}, {13, 11}, {9, 13}, {9, 13}, {10, 13}, {12, 12}, {12, 12}, {11, 13}, {11, 13}, {10, 14}, {10, 14}, + {10, 14}, {11, 14}, {13, 13}, {13, 13}, {12, 14}, {16, 12}, {11, 15}, {11, 15}, {11, 15}, {14, 14}, {14, 14}, {14, 14}, {13, 15}, {17, 13}, {14, 15}, + {14, 15}, {12, 16}, {15, 15}, {15, 15}, {15, 15}, {14, 16}, {14, 16}, {14, 16}, {17, 15}, {13, 17}, {13, 17}, {14, 17}, {16, 16}, {16, 16}, {15, 17}, + {15, 17}, {14, 18}, {14, 18}, {14, 18}, {15, 18}, {17, 17}, {17, 17}, {16, 18}, {20, 16}, {15, 19}, {15, 19}, {15, 19}, {18, 18}, {18, 18}, {18, 18}, + {17, 19}, {21, 17}, {18, 19}, {18, 19}, {16, 20}, {19, 19}, {19, 19}, {19, 19}, {18, 20}, {18, 20}, {18, 20}, {21, 19}, {17, 21}, {17, 21}, {18, 21}, + {20, 20}, {20, 20}, {19, 21}, {19, 21}, {18, 22}, {18, 22}, {18, 22}, {19, 22}, {21, 21}, {21, 21}, {20, 22}, {24, 20}, {19, 23}, {19, 23}, {19, 23}, + {22, 22}, {22, 22}, {22, 22}, {21, 23}, {25, 21}, {22, 23}, {22, 23}, {20, 24}, {23, 23}, {23, 23}, {23, 23}, {22, 24}, {22, 24}, {22, 24}, {25, 23}, + {21, 25}, {21, 25}, {22, 25}, {24, 24}, {24, 24}, {23, 25}, {23, 25}, {22, 26}, {22, 26}, {22, 26}, {23, 26}, {25, 25}, {25, 25}, {24, 26}, {28, 24}, + {23, 27}, {23, 27}, {23, 27}, {26, 26}, {26, 26}, {26, 26}, {25, 27}, {29, 25}, {26, 27}, {26, 27}, {24, 28}, {27, 27}, {27, 27}, {27, 27}, {26, 28}, + {26, 28}, {26, 28}, {29, 27}, {25, 29}, {25, 29}, {26, 29}, {28, 28}, {28, 28}, {27, 29}, {27, 29}, {26, 30}, {26, 30}, {26, 30}, {27, 30}, {29, 29}, + {29, 29}, {28, 30}, {28, 30}, {27, 31}, {27, 31}, {27, 31}, {30, 30}, {30, 30}, {30, 30}, {29, 31}, {29, 31}, {30, 31}, {30, 31}, {30, 31}, {31, 31}, + {31, 31}}; + +CMP_STATIC CGU_UINT8 
g_Match6Bit[256][2] = { + {0, 0}, {1, 0}, {0, 1}, {1, 1}, {1, 1}, {0, 2}, {1, 2}, {2, 2}, {2, 2}, {1, 3}, {0, 4}, {3, 3}, {3, 3}, {0, 5}, {1, 5}, + {4, 4}, {4, 4}, {1, 6}, {0, 7}, {5, 5}, {5, 5}, {0, 8}, {1, 8}, {6, 6}, {6, 6}, {1, 9}, {2, 9}, {7, 7}, {7, 7}, {2, 10}, + {3, 10}, {8, 8}, {8, 8}, {3, 11}, {4, 11}, {9, 9}, {9, 9}, {4, 12}, {5, 12}, {10, 10}, {10, 10}, {5, 13}, {6, 13}, {16, 8}, {11, 11}, + {6, 14}, {7, 14}, {17, 9}, {12, 12}, {7, 15}, {8, 15}, {16, 11}, {13, 13}, {10, 15}, {8, 16}, {9, 16}, {14, 14}, {13, 15}, {9, 17}, {10, 17}, + {15, 15}, {16, 15}, {10, 18}, {11, 18}, {12, 18}, {16, 16}, {11, 19}, {12, 19}, {13, 19}, {17, 17}, {12, 20}, {13, 20}, {14, 20}, {18, 18}, {13, 21}, + {14, 21}, {15, 21}, {19, 19}, {14, 22}, {15, 22}, {20, 20}, {20, 20}, {15, 23}, {16, 23}, {21, 21}, {21, 21}, {16, 24}, {17, 24}, {22, 22}, {22, 22}, + {17, 25}, {18, 25}, {23, 23}, {23, 23}, {18, 26}, {19, 26}, {24, 24}, {24, 24}, {19, 27}, {20, 27}, {25, 25}, {25, 25}, {20, 28}, {21, 28}, {26, 26}, + {26, 26}, {21, 29}, {22, 29}, {32, 24}, {27, 27}, {22, 30}, {23, 30}, {33, 25}, {28, 28}, {23, 31}, {24, 31}, {32, 27}, {29, 29}, {26, 31}, {24, 32}, + {25, 32}, {30, 30}, {29, 31}, {25, 33}, {26, 33}, {31, 31}, {32, 31}, {26, 34}, {27, 34}, {28, 34}, {32, 32}, {27, 35}, {28, 35}, {29, 35}, {33, 33}, + {28, 36}, {29, 36}, {30, 36}, {34, 34}, {29, 37}, {30, 37}, {31, 37}, {35, 35}, {30, 38}, {31, 38}, {36, 36}, {36, 36}, {31, 39}, {32, 39}, {37, 37}, + {37, 37}, {32, 40}, {33, 40}, {38, 38}, {38, 38}, {33, 41}, {34, 41}, {39, 39}, {39, 39}, {34, 42}, {35, 42}, {40, 40}, {40, 40}, {35, 43}, {36, 43}, + {41, 41}, {41, 41}, {36, 44}, {37, 44}, {42, 42}, {42, 42}, {37, 45}, {38, 45}, {48, 40}, {43, 43}, {38, 46}, {39, 46}, {49, 41}, {44, 44}, {39, 47}, + {40, 47}, {48, 43}, {45, 45}, {42, 47}, {40, 48}, {41, 48}, {46, 46}, {45, 47}, {41, 49}, {42, 49}, {47, 47}, {48, 47}, {42, 50}, {43, 50}, {44, 50}, + {48, 48}, {43, 51}, {44, 51}, {45, 51}, {49, 49}, {44, 52}, {45, 52}, {46, 52}, 
{50, 50}, {45, 53}, {46, 53}, {47, 53}, {51, 51}, {46, 54}, {47, 54}, + {52, 52}, {52, 52}, {47, 55}, {48, 55}, {53, 53}, {53, 53}, {48, 56}, {49, 56}, {54, 54}, {54, 54}, {49, 57}, {50, 57}, {55, 55}, {55, 55}, {50, 58}, + {51, 58}, {56, 56}, {56, 56}, {51, 59}, {52, 59}, {57, 57}, {57, 57}, {52, 60}, {53, 60}, {58, 58}, {58, 58}, {53, 61}, {54, 61}, {59, 59}, {59, 59}, + {54, 62}, {55, 62}, {60, 60}, {60, 60}, {55, 63}, {56, 63}, {61, 61}, {61, 61}, {58, 63}, {59, 63}, {62, 62}, {62, 62}, {61, 63}, {62, 63}, {63, 63}, + {63, 63}}; + +CMP_STATIC CGU_Vec2ui cgu_solidColorBlock(CMP_IN CGU_UINT8 Red, CMP_IN CGU_UINT8 Green, CMP_IN CGU_UINT8 Blue) +{ + CGU_UINT32 maxEndp16; + CGU_UINT32 minEndp16; + + CGU_UINT32 mask = 0xAAAAAAAAu; + + minEndp16 = g_Match5Bit[Red][0] * 2048U + g_Match6Bit[Green][0] * 32U + g_Match5Bit[Blue][0]; + maxEndp16 = g_Match5Bit[Red][1] * 2048U + g_Match6Bit[Green][1] * 32U + g_Match5Bit[Blue][1]; + + // write the color block + if (maxEndp16 < minEndp16) + { + CGU_UINT32 tmpValue = minEndp16; + minEndp16 = maxEndp16; + maxEndp16 = tmpValue; + mask ^= 0x55555555u; + } + + CGU_Vec2ui outputBytes; + outputBytes.x = CGU_UINT32(maxEndp16) | (CGU_UINT32(minEndp16) << 16u); + outputBytes.y = mask; + + return outputBytes; +} + +CMP_STATIC void cmp_get_encode_data(CMP_IN CMP_EncodeData CMP_REFINOUT edata, CMP_IN CMP_CONSTANT CGU_Vec4uc src_image[16]) +{ + CMP_CONSTANT CGU_UINT32 fr = src_image[0].r, fg = src_image[0].g, fb = src_image[0].b; + + edata.all_colors_equal = false; + + edata.total.r = fr; + edata.total.g = fg; + edata.total.b = fb; + edata.max.r = fr; + edata.max.g = fg; + edata.max.b = fb; + edata.min.r = fr; + edata.min.g = fg; + edata.min.b = fb; + + edata.grayscale_flag = (fr == fg) && (fr == fb); + edata.any_black_pixels = (fr | fg | fb) < 4; + + for (CGU_UINT32 i = 1; i < 16; i++) + { + CMP_CONSTANT CGU_INT r = src_image[i].r, g = src_image[i].g, b = src_image[i].b; + + edata.grayscale_flag &= ((r == g) && (r == b)); + 
edata.any_black_pixels |= ((r | g | b) < 4); + + edata.max.r = CMP_MAX(edata.max.r, r); + edata.max.g = CMP_MAX(edata.max.g, g); + edata.max.b = CMP_MAX(edata.max.b, b); + edata.min.r = CMP_MIN(edata.min.r, r); + edata.min.g = CMP_MIN(edata.min.g, g); + edata.min.b = CMP_MIN(edata.min.b, b); + edata.total.r += r; + edata.total.g += g; + edata.total.b += b; + } + + edata.avg.r = (edata.total.r + 8) >> 4; + edata.avg.g = (edata.total.g + 8) >> 4; + edata.avg.b = (edata.total.b + 8) >> 4; +} + +#ifndef ASPM_GPU +/*------------------------------------------------------------------------------------------------ +1 DIM ramp +------------------------------------------------------------------------------------------------*/ +CMP_STATIC inline void cpu_BldClrRmp(CGU_FLOAT _Rmp[MAX_POINTS], CGU_FLOAT _InpRmp[NUM_ENDPOINTS], CGU_UINT32 dwNumPoints) +{ + CGU_UINT32 dwRndAmount[9] = {0, 0, 0, 0, 1, 1, 2, 2, 3}; + + // linear interpolate end points to get the ramp + _Rmp[0] = _InpRmp[0]; + _Rmp[dwNumPoints - 1] = _InpRmp[1]; + if (dwNumPoints % 2) + _Rmp[dwNumPoints] = 1000000.f; // for 3 point ramp; not to select the 4th point as min + for (CGU_UINT32 e = 1; e < dwNumPoints - 1; e++) + _Rmp[e] = cmp_floor((_Rmp[0] * (dwNumPoints - 1 - e) + _Rmp[dwNumPoints - 1] * e + dwRndAmount[dwNumPoints]) / (CGU_FLOAT)(dwNumPoints - 1)); +} + +/*------------------------------------------------------------------------------------------------ +// build 3D ramp +------------------------------------------------------------------------------------------------*/ +CMP_STATIC inline void cpu_BldRmp(CGU_FLOAT _Rmp[NUM_CHANNELS][MAX_POINTS], CGU_FLOAT _InpRmp[NUM_CHANNELS][NUM_ENDPOINTS], CGU_UINT32 dwNumPoints) +{ + for (CGU_UINT32 j = 0; j < 3; j++) + cpu_BldClrRmp(_Rmp[j], _InpRmp[j], dwNumPoints); +} + +/*------------------------------------------------------------------------------------------------ +// this is how the end points is going to be look like when decompressed 
+------------------------------------------------------------------------------------------------*/ +CMP_STATIC inline void cpu_MkWkRmpPts(CMP_INOUT CGU_UINT8 CMP_REFINOUT _bEq, + CGU_FLOAT _OutRmpPts[NUM_CHANNELS][NUM_ENDPOINTS], + CGU_FLOAT _InpRmpPts[NUM_CHANNELS][NUM_ENDPOINTS], + CGU_UINT8 nRedBits, + CGU_UINT8 nGreenBits, + CGU_UINT8 nBlueBits) +{ + CGU_FLOAT Fctrs[3]; + Fctrs[RC] = (CGU_FLOAT)(1 << nRedBits); + Fctrs[GC] = (CGU_FLOAT)(1 << nGreenBits); + Fctrs[BC] = (CGU_FLOAT)(1 << nBlueBits); + + CGU_BOOL bEq = true; + // find whether input ramp is flat + for (CGU_UINT32 j = 0; j < 3; j++) + bEq &= (_InpRmpPts[j][0] == _InpRmpPts[j][1]); + + _bEq = bEq ? 1 : 0; + + // end points on the integer grid + for (CGU_UINT32 j = 0; j < 3; j++) + { + for (CGU_UINT32 k = 0; k < 2; k++) + { + // Apply the lower bit replication to give full dynamic range + _OutRmpPts[j][k] = _InpRmpPts[j][k] + cmp_floor(_InpRmpPts[j][k] / Fctrs[j]); + _OutRmpPts[j][k] = cmp_max(_OutRmpPts[j][k], 0.f); + _OutRmpPts[j][k] = cmp_min(_OutRmpPts[j][k], 255.f); + } + } +} + +// Compute error and find DXTC indexes for the current cluster +CMP_STATIC CGU_FLOAT cpu_ClstrIntnl(CGU_FLOAT _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS], + CGU_UINT8 pcIndices[BLOCK_SIZE_4X4], + CGU_FLOAT _Rmp[NUM_CHANNELS][MAX_POINTS], + int dwBlockSize, + CGU_UINT8 dwNumPoints, + bool _ConstRamp, + CGU_FLOAT _pfWeights[3], + bool _bUseAlpha) +{ + CGU_FLOAT Err = 0.f; + CGU_UINT8 rmp_l = (_ConstRamp) ? 
1 : dwNumPoints; + + // For each colour in the original block assign it + // to the closest cluster and compute the cumulative error + for (int i = 0; i < dwBlockSize; i++) + { + if (_bUseAlpha && *((CGU_UINT32*)&_Blk[i][AC]) == 0) + pcIndices[i] = dwNumPoints; + else + { + CGU_FLOAT shortest = 99999999999.f; + CGU_UINT8 shortestIndex = 0; + CGU_UINT8 r; + if ((_pfWeights[0] != 1.0f) || (_pfWeights[1] != 1.0f) || (_pfWeights[2] != 1.0f)) + for (r = 0; r < rmp_l; r++) + { + // calculate the distance for each component + CGU_FLOAT distance = (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) * _pfWeights[0] + + (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) * _pfWeights[1] + + (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]) * _pfWeights[2]; + + if (distance < shortest) + { + shortest = distance; + shortestIndex = r; + } + } + else + for (r = 0; r < rmp_l; r++) + { + // calculate the distance for each component + CGU_FLOAT distance = (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) + (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) + + (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]); + + if (distance < shortest) + { + shortest = distance; + shortestIndex = r; + } + } + + Err += shortest; + + // We have the index of the best cluster, so assign this in the block + // Reorder indices to match correct DXTC ordering + if (shortestIndex == dwNumPoints - 1) + shortestIndex = 1; + else if (shortestIndex) + shortestIndex++; + pcIndices[i] = shortestIndex; + } + } + + return Err; +} + +/*------------------------------------------------------------------------------------------------ +// input ramp is on the coarse grid +------------------------------------------------------------------------------------------------*/ +CMP_STATIC CGU_FLOAT cpu_ClstrBas(CGU_UINT8 pcIndices[BLOCK_SIZE_4X4], + CGU_FLOAT _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS], + CGU_FLOAT _InpRmp[NUM_CHANNELS][NUM_ENDPOINTS], + int dwBlockSize, + CGU_UINT8 
dwNumPoints, + CGU_FLOAT _pfWeights[3], + bool _bUseAlpha, + CGU_UINT8 nRedBits, + CGU_UINT8 nGreenBits, + CGU_UINT8 nBlueBits) +{ + // make ramp endpoints the way they'll going to be decompressed + CGU_UINT8 Eq = 1; + CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS]; + cpu_MkWkRmpPts(Eq, InpRmp, _InpRmp, nRedBits, nGreenBits, nBlueBits); + + // build ramp as it would be built by decompressor + CGU_FLOAT Rmp[NUM_CHANNELS][MAX_POINTS]; + cpu_BldRmp(Rmp, InpRmp, dwNumPoints); + + // clusterize and find a cumulative error + return cpu_ClstrIntnl(_Blk, pcIndices, Rmp, dwBlockSize, dwNumPoints, Eq, _pfWeights, _bUseAlpha); +} + +CMP_STATIC CGU_UINT8 nByteBitsMask2[9] = {0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; + +CMP_STATIC CGU_UINT32 cpu_ConstructColor2(CGU_UINT8 R, CGU_UINT8 nRedBits, CGU_UINT8 G, CGU_UINT8 nGreenBits, CGU_UINT8 B, CGU_UINT8 nBlueBits) +{ + return (((R & nByteBitsMask2[nRedBits]) << (nGreenBits + nBlueBits - (PIX_GRID - nRedBits))) | + ((G & nByteBitsMask2[nGreenBits]) << (nBlueBits - (PIX_GRID - nGreenBits))) | ((B & nByteBitsMask2[nBlueBits]) >> ((PIX_GRID - nBlueBits)))); +} + +CMP_STATIC CGU_FLOAT cpu_Clstr(CGU_UINT32 block_32[BLOCK_SIZE_4X4], + CGU_UINT32 dwBlockSize, + CGU_UINT8 nEndpoints[3][NUM_ENDPOINTS], + CGU_UINT8 pcIndices[BLOCK_SIZE_4X4], + CGU_UINT8 dwNumPoints, + CGU_FLOAT _pfWeights[3], + bool _bUseAlpha, + CGU_UINT8 _nAlphaThreshold, + CGU_UINT8 nRedBits, + CGU_UINT8 nGreenBits, + CGU_UINT8 nBlueBits) +{ + CGU_UINT32 c0 = cpu_ConstructColor2(nEndpoints[RC][0], nRedBits, nEndpoints[GC][0], nGreenBits, nEndpoints[BC][0], nBlueBits); + CGU_UINT32 c1 = cpu_ConstructColor2(nEndpoints[RC][1], nRedBits, nEndpoints[GC][1], nGreenBits, nEndpoints[BC][1], nBlueBits); + CGU_UINT32 nEndpointIndex0 = 0; + CGU_UINT32 nEndpointIndex1 = 1; + if ((!(dwNumPoints & 0x1) && c0 <= c1) || ((dwNumPoints & 0x1) && c0 > c1)) + { + nEndpointIndex0 = 1; + nEndpointIndex1 = 0; + } + + CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS]; + InpRmp[RC][0] = 
(CGU_FLOAT)nEndpoints[RC][nEndpointIndex0]; + InpRmp[RC][1] = (CGU_FLOAT)nEndpoints[RC][nEndpointIndex1]; + InpRmp[GC][0] = (CGU_FLOAT)nEndpoints[GC][nEndpointIndex0]; + InpRmp[GC][1] = (CGU_FLOAT)nEndpoints[GC][nEndpointIndex1]; + InpRmp[BC][0] = (CGU_FLOAT)nEndpoints[BC][nEndpointIndex0]; + InpRmp[BC][1] = (CGU_FLOAT)nEndpoints[BC][nEndpointIndex1]; + + CGU_UINT32 dwAlphaThreshold = _nAlphaThreshold << 24; + CGU_FLOAT Blk[BLOCK_SIZE_4X4][NUM_CHANNELS]; + for (CGU_UINT32 i = 0; i < dwBlockSize; i++) + { + Blk[i][RC] = (CGU_FLOAT)((block_32[i] & 0xff0000) >> 16); + Blk[i][GC] = (CGU_FLOAT)((block_32[i] & 0xff00) >> 8); + Blk[i][BC] = (CGU_FLOAT)(block_32[i] & 0xff); + if (_bUseAlpha) + Blk[i][AC] = ((block_32[i] & 0xff000000) >= dwAlphaThreshold) ? 1.f : 0.f; + } + + return cpu_ClstrBas(pcIndices, Blk, InpRmp, dwBlockSize, dwNumPoints, _pfWeights, _bUseAlpha, nRedBits, nGreenBits, nBlueBits); +} + +/*------------------------------------------------------------------------------------------------ +Compute cumulative error for the current cluster +------------------------------------------------------------------------------------------------*/ +CMP_STATIC CGU_FLOAT cpu_ClstrErr(CGU_FLOAT _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS], + CGU_FLOAT _Rpt[BLOCK_SIZE_4X4], + CGU_FLOAT _Rmp[NUM_CHANNELS][MAX_POINTS], + CGU_UINT32 _NmbClrs, + CGU_UINT32 _blcktp, + bool _ConstRamp, + CGU_Vec3f channelWeights) +{ + CGU_FLOAT fError = 0.f; + CGU_UINT32 rmp_l = (_ConstRamp) ? 
1 : _blcktp; + + CGU_BOOL useWeights = ((channelWeights[0] != 1.0f) || (channelWeights[1] != 1.0f) || (channelWeights[2] != 1.0f)); + + // For each colour in the original block, find the closest cluster + // and compute the comulative error + for (CGU_UINT32 i = 0; i < _NmbClrs; i++) + { + CGU_FLOAT fShortest = 99999999999.f; + + if (useWeights) + for (CGU_UINT32 r = 0; r < rmp_l; r++) + { + // calculate the distance for each component + CGU_FLOAT fDistance = (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) * channelWeights[0] + + (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) * channelWeights[1] + + (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]) * channelWeights[2]; + + if (fDistance < fShortest) + fShortest = fDistance; + } + else + for (CGU_UINT32 r = 0; r < rmp_l; r++) + { + // calculate the distance for each component + CGU_FLOAT fDistance = (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) + (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) + + (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]); + + if (fDistance < fShortest) + fShortest = fDistance; + } + + // accumulate the error + fError += fShortest * _Rpt[i]; + } + + return fError; +} + +#if defined(USE_REFINE3D) + +CMP_STATIC CGU_FLOAT cmp_Refine3D(CGU_FLOAT _OutRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS], + CGU_FLOAT _InpRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS], + CGU_FLOAT _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS], + CGU_FLOAT _Rpt[BLOCK_SIZE_4X4], + CGU_UINT32 _NmrClrs, + CGU_UINT32 dwNumPoints, + CGU_Vec3f channelWeights, + CGU_UINT8 nRedBits, + CGU_UINT8 nGreenBits, + CGU_UINT8 nBlueBits, + CGU_UINT32 nRefineSteps) +{ + CGU_FLOAT ALIGN_16 Rmp[NUM_CHANNELS][MAX_POINTS]; + + CGU_FLOAT Blk[BLOCK_SIZE_4X4][NUM_CHANNELS]; + for (CGU_UINT32 i = 0; i < _NmrClrs; i++) + for (CGU_UINT32 j = 0; j < 3; j++) + Blk[i][j] = _Blk[i][j]; + + CGU_FLOAT fWeightRed = channelWeights.r; + CGU_FLOAT fWeightGreen = channelWeights.g; + CGU_FLOAT fWeightBlue = channelWeights.b; + + // 
here is our grid + CGU_FLOAT Fctrs[3]; + Fctrs[RC] = (CGU_FLOAT)(1 << (PIX_GRID - nRedBits)); + Fctrs[GC] = (CGU_FLOAT)(1 << (PIX_GRID - nGreenBits)); + Fctrs[BC] = (CGU_FLOAT)(1 << (PIX_GRID - nBlueBits)); + + CGU_FLOAT InpRmp0[NUM_CHANNELS][NUM_ENDPOINTS]; + CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS]; + for (CGU_UINT32 k = 0; k < 2; k++) + for (CGU_UINT32 j = 0; j < 3; j++) + InpRmp0[j][k] = InpRmp[j][k] = _OutRmpPnts[j][k] = _InpRmpPnts[j][k]; + + // make ramp endpoints the way they'll going to be decompressed + // plus check whether the ramp is flat + CGU_UINT8 Eq; + CGU_FLOAT WkRmpPts[NUM_CHANNELS][NUM_ENDPOINTS]; + cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); + + // build ramp for all 3 colors + cpu_BldRmp(Rmp, WkRmpPts, dwNumPoints); + + // clusterize for the current ramp + CGU_FLOAT bestE = cpu_ClstrErr(Blk, _Rpt, Rmp, _NmrClrs, dwNumPoints, Eq, channelWeights); + if (bestE == 0.f) // if exact, we've done + return bestE; + + // Jitter endpoints in each direction + CGU_INT nRefineStart = 0 - (cmp_min(nRefineSteps, (CGU_UINT8)8)); + CGU_INT nRefineEnd = cmp_min(nRefineSteps, (CGU_UINT8)8); + for (CGU_INT nJitterG0 = nRefineStart; nJitterG0 <= nRefineEnd; nJitterG0++) + { + InpRmp[GC][0] = cmp_min(cmp_max(InpRmp0[GC][0] + nJitterG0 * Fctrs[GC], 0.f), 255.f); + for (CGU_INT nJitterG1 = nRefineStart; nJitterG1 <= nRefineEnd; nJitterG1++) + { + InpRmp[GC][1] = cmp_min(cmp_max(InpRmp0[GC][1] + nJitterG1 * Fctrs[GC], 0.f), 255.f); + cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); + cpu_BldClrRmp(Rmp[GC], WkRmpPts[GC], dwNumPoints); + + CGU_FLOAT RmpErrG[MAX_POINTS][BLOCK_SIZE_4X4]; + for (CGU_UINT32 i = 0; i < _NmrClrs; i++) + { + for (CGU_UINT32 r = 0; r < dwNumPoints; r++) + { + CGU_FLOAT DistG = (Rmp[GC][r] - Blk[i][GC]); + RmpErrG[r][i] = DistG * DistG * fWeightGreen; + } + } + + for (CGU_INT nJitterB0 = nRefineStart; nJitterB0 <= nRefineEnd; nJitterB0++) + { + InpRmp[BC][0] = cmp_min(cmp_max(InpRmp0[BC][0] + 
nJitterB0 * Fctrs[BC], 0.f), 255.f); + for (CGU_INT nJitterB1 = nRefineStart; nJitterB1 <= nRefineEnd; nJitterB1++) + { + InpRmp[BC][1] = cmp_min(cmp_max(InpRmp0[BC][1] + nJitterB1 * Fctrs[BC], 0.f), 255.f); + cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); + cpu_BldClrRmp(Rmp[BC], WkRmpPts[BC], dwNumPoints); + + CGU_FLOAT RmpErr[MAX_POINTS][BLOCK_SIZE_4X4]; + for (CGU_UINT32 i = 0; i < _NmrClrs; i++) + { + for (CGU_UINT32 r = 0; r < dwNumPoints; r++) + { + CGU_FLOAT DistB = (Rmp[BC][r] - Blk[i][BC]); + RmpErr[r][i] = RmpErrG[r][i] + DistB * DistB * fWeightBlue; + } + } + + for (CGU_INT nJitterR0 = nRefineStart; nJitterR0 <= nRefineEnd; nJitterR0++) + { + InpRmp[RC][0] = cmp_min(cmp_max(InpRmp0[RC][0] + nJitterR0 * Fctrs[RC], 0.f), 255.f); + for (CGU_INT nJitterR1 = nRefineStart; nJitterR1 <= nRefineEnd; nJitterR1++) + { + InpRmp[RC][1] = cmp_min(cmp_max(InpRmp0[RC][1] + nJitterR1 * Fctrs[RC], 0.f), 255.f); + cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); + cpu_BldClrRmp(Rmp[RC], WkRmpPts[RC], dwNumPoints); + + // compute cumulative error + CGU_FLOAT mse = 0.f; + CGU_INT rmp_l = (Eq > 0) ? 
1 : dwNumPoints; + for (CGU_UINT32 k = 0; k < _NmrClrs; k++) + { + CGU_FLOAT MinErr = 10000000.f; + for (CGU_INT r = 0; r < rmp_l; r++) + { + CGU_FLOAT Dist = (Rmp[RC][r] - Blk[k][RC]); + CGU_FLOAT Err = RmpErr[r][k] + Dist * Dist * fWeightRed; + MinErr = cmp_min(MinErr, Err); + } + mse += MinErr * _Rpt[k]; + } + + // save if we achieve better result + if (mse < bestE) + { + bestE = mse; + for (CGU_UINT32 k = 0; k < 2; k++) + for (CGU_UINT32 j = 0; j < 3; j++) + _OutRmpPnts[j][k] = InpRmp[j][k]; + } + } + } + } + } + } + } + + return bestE; +} +#endif + +#if defined(USE_REFINE) +CMP_STATIC CGU_FLOAT cmp_Refine(CGU_FLOAT _OutRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS], + CGU_FLOAT _InpRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS], + CGU_FLOAT _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS], + CGU_FLOAT _Rpt[BLOCK_SIZE_4X4], + CGU_INT _NmrClrs, + CGU_UINT8 dwNumPoints, + CGU_Vec3f channelWeights, + CGU_UINT32 nRedBits, + CGU_UINT32 nGreenBits, + CGU_UINT32 nBlueBits, + CGU_UINT32 nRefineSteps) +{ + CGU_FLOAT ALIGN_16 Rmp[NUM_CHANNELS][MAX_POINTS]; + + if (nRefineSteps == 0) + nRefineSteps = 1; + + CGU_FLOAT Blk[BLOCK_SIZE_4X4][NUM_CHANNELS]; + for (CGU_INT i = 0; i < _NmrClrs; i++) + for (CGU_INT j = 0; j < 3; j++) + Blk[i][j] = _Blk[i][j]; + + CGU_FLOAT fWeightRed = channelWeights.r; + CGU_FLOAT fWeightGreen = channelWeights.g; + CGU_FLOAT fWeightBlue = channelWeights.b; + + // here is our grid + CGU_FLOAT Fctrs[3]; + Fctrs[RC] = (CGU_FLOAT)(1 << (PIX_GRID - nRedBits)); + Fctrs[GC] = (CGU_FLOAT)(1 << (PIX_GRID - nGreenBits)); + Fctrs[BC] = (CGU_FLOAT)(1 << (PIX_GRID - nBlueBits)); + + CGU_FLOAT InpRmp0[NUM_CHANNELS][NUM_ENDPOINTS]; + CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS]; + for (CGU_INT k = 0; k < 2; k++) + for (CGU_INT j = 0; j < 3; j++) + InpRmp0[j][k] = InpRmp[j][k] = _OutRmpPnts[j][k] = _InpRmpPnts[j][k]; + + // make ramp endpoints the way they'll going to be decompressed + // plus check whether the ramp is flat + CGU_UINT8 Eq; + CGU_FLOAT WkRmpPts[NUM_CHANNELS][NUM_ENDPOINTS]; + 
cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); + + // build ramp for all 3 colors + cpu_BldRmp(Rmp, WkRmpPts, dwNumPoints); + + // clusterize for the current ramp + CGU_FLOAT bestE = cpu_ClstrErr(Blk, _Rpt, Rmp, _NmrClrs, dwNumPoints, Eq, channelWeights); + if (bestE == 0.f) // || !nRefineSteps) // if exact, we've done + return bestE; + + // Tweak each component in isolation and get the best values + + // precompute ramp errors for Green and Blue + CGU_FLOAT RmpErr[MAX_POINTS][BLOCK_SIZE_4X4]; + for (CGU_INT i = 0; i < _NmrClrs; i++) + { + for (CGU_INT r = 0; r < dwNumPoints; r++) + { + CGU_FLOAT DistG = (Rmp[GC][r] - Blk[i][GC]); + CGU_FLOAT DistB = (Rmp[BC][r] - Blk[i][BC]); + RmpErr[r][i] = DistG * DistG * fWeightGreen + DistB * DistB * fWeightBlue; + } + } + + // First Red + CGU_FLOAT bstC0 = InpRmp0[RC][0]; + CGU_FLOAT bstC1 = InpRmp0[RC][1]; + CGU_INT nRefineStart = 0 - (cmp_min(nRefineSteps, (CGU_UINT8)8)); + CGU_INT nRefineEnd = cmp_min(nRefineSteps, (CGU_UINT8)8); + for (CGU_INT i = nRefineStart; i <= nRefineEnd; i++) + { + for (CGU_INT j = nRefineStart; j <= nRefineEnd; j++) + { + // make a move; both sides of interval. + InpRmp[RC][0] = cmp_min(cmp_max(InpRmp0[RC][0] + i * Fctrs[RC], 0.f), 255.f); + InpRmp[RC][1] = cmp_min(cmp_max(InpRmp0[RC][1] + j * Fctrs[RC], 0.f), 255.f); + + // make ramp endpoints the way they'll going to be decompressed + // plus check whether the ramp is flat + cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); + + // build ramp only for red + cpu_BldClrRmp(Rmp[RC], WkRmpPts[RC], dwNumPoints); + + // compute cumulative error + CGU_FLOAT mse = 0.f; + CGU_INT rmp_l = (Eq > 0) ? 
1 : dwNumPoints; + for (CGU_INT k = 0; k < _NmrClrs; k++) + { + CGU_FLOAT MinErr = 10000000.f; + for (CGU_INT r = 0; r < rmp_l; r++) + { + CGU_FLOAT Dist = (Rmp[RC][r] - Blk[k][RC]); + CGU_FLOAT Err = RmpErr[r][k] + Dist * Dist * fWeightRed; + MinErr = cmp_minf(MinErr, Err); + } + mse += MinErr * _Rpt[k]; + } + + // save if we achieve better result + if (mse < bestE) + { + bstC0 = InpRmp[RC][0]; + bstC1 = InpRmp[RC][1]; + bestE = mse; + } + } + } + + // our best REDs + InpRmp[RC][0] = bstC0; + InpRmp[RC][1] = bstC1; + + // make ramp endpoints the way they'll going to be decompressed + // plus check whether the ramp is flat + cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); + + // build ramp only for green + cpu_BldRmp(Rmp, WkRmpPts, dwNumPoints); + + // precompute ramp errors for Red and Blue + for (CGU_INT i = 0; i < _NmrClrs; i++) + { + for (CGU_INT r = 0; r < dwNumPoints; r++) + { + CGU_FLOAT DistR = (Rmp[RC][r] - Blk[i][RC]); + CGU_FLOAT DistB = (Rmp[BC][r] - Blk[i][BC]); + RmpErr[r][i] = DistR * DistR * fWeightRed + DistB * DistB * fWeightBlue; + } + } + + // Now green + bstC0 = InpRmp0[GC][0]; + bstC1 = InpRmp0[GC][1]; + for (CGU_INT i = nRefineStart; i <= nRefineEnd; i++) + { + for (CGU_INT j = nRefineStart; j <= nRefineEnd; j++) + { + InpRmp[GC][0] = cmp_minf(cmp_maxf(InpRmp0[GC][0] + i * Fctrs[GC], 0.f), 255.f); + InpRmp[GC][1] = cmp_minf(cmp_maxf(InpRmp0[GC][1] + j * Fctrs[GC], 0.f), 255.f); + + cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); + cpu_BldClrRmp(Rmp[GC], WkRmpPts[GC], dwNumPoints); + + CGU_FLOAT mse = 0.f; + CGU_INT rmp_l = (Eq > 0) ? 
1 : dwNumPoints; + for (CGU_INT k = 0; k < _NmrClrs; k++) + { + CGU_FLOAT MinErr = 10000000.f; + for (CGU_INT r = 0; r < rmp_l; r++) + { + CGU_FLOAT Dist = (Rmp[GC][r] - Blk[k][GC]); + CGU_FLOAT Err = RmpErr[r][k] + Dist * Dist * fWeightGreen; + MinErr = cmp_minf(MinErr, Err); + } + mse += MinErr * _Rpt[k]; + } + + if (mse < bestE) + { + bstC0 = InpRmp[GC][0]; + bstC1 = InpRmp[GC][1]; + bestE = mse; + } + } + } + + // our best GREENs + InpRmp[GC][0] = bstC0; + InpRmp[GC][1] = bstC1; + + cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); + cpu_BldRmp(Rmp, WkRmpPts, dwNumPoints); + + // ramp err for Red and Green + for (CGU_INT i = 0; i < _NmrClrs; i++) + { + for (CGU_INT r = 0; r < dwNumPoints; r++) + { + CGU_FLOAT DistR = (Rmp[RC][r] - Blk[i][RC]); + CGU_FLOAT DistG = (Rmp[GC][r] - Blk[i][GC]); + RmpErr[r][i] = DistR * DistR * fWeightRed + DistG * DistG * fWeightGreen; + } + } + + bstC0 = InpRmp0[BC][0]; + bstC1 = InpRmp0[BC][1]; + // Now blue + for (CGU_INT i = nRefineStart; i <= nRefineEnd; i++) + { + for (CGU_INT j = nRefineStart; j <= nRefineEnd; j++) + { + InpRmp[BC][0] = min(max(InpRmp0[BC][0] + i * Fctrs[BC], 0.f), 255.f); + InpRmp[BC][1] = min(max(InpRmp0[BC][1] + j * Fctrs[BC], 0.f), 255.f); + + cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); + cpu_BldClrRmp(Rmp[BC], WkRmpPts[BC], dwNumPoints); + + CGU_FLOAT mse = 0.f; + CGU_INT rmp_l = (Eq > 0) ? 
1 : dwNumPoints; + for (CGU_INT k = 0; k < _NmrClrs; k++) + { + CGU_FLOAT MinErr = 10000000.f; + for (CGU_INT r = 0; r < rmp_l; r++) + { + CGU_FLOAT Dist = (Rmp[BC][r] - Blk[k][BC]); + CGU_FLOAT Err = RmpErr[r][k] + Dist * Dist * fWeightBlue; + MinErr = min(MinErr, Err); + } + mse += MinErr * _Rpt[k]; + } + + if (mse < bestE) + { + bstC0 = InpRmp[BC][0]; + bstC1 = InpRmp[BC][1]; + bestE = mse; + } + } + } + + // our best BLUEs + InpRmp[BC][0] = bstC0; + InpRmp[BC][1] = bstC1; + + // return our best choice + for (CGU_INT j = 0; j < 3; j++) + for (CGU_INT k = 0; k < 2; k++) + _OutRmpPnts[j][k] = InpRmp[j][k]; + + return bestE; +} + +#endif + +//====================================================================================== +// Codec from CompressonatorLib +//====================================================================================== +#define BLOCK_SIZE_4X4 16 +#define RG 5 +#define GG 6 +#define BG 5 + +/*------------------------------------------------------------------------------------------------ +// this is how the end points is going to be rounded in compressed format +------------------------------------------------------------------------------------------------*/ +CMP_STATIC void cpu_MkRmpOnGrid(CGU_FLOAT _RmpF[NUM_CHANNELS][NUM_ENDPOINTS], + CGU_FLOAT _MnMx[NUM_CHANNELS][NUM_ENDPOINTS], + CGU_FLOAT _Min, + CGU_FLOAT _Max, + CGU_UINT8 nRedBits, + CGU_UINT8 nGreenBits, + CGU_UINT8 nBlueBits) +{ + CGU_FLOAT Fctrs0[3]; + CGU_FLOAT Fctrs1[3]; + + Fctrs1[RC] = (CGU_FLOAT)(1 << nRedBits); + Fctrs1[GC] = (CGU_FLOAT)(1 << nGreenBits); + Fctrs1[BC] = (CGU_FLOAT)(1 << nBlueBits); + Fctrs0[RC] = (CGU_FLOAT)(1 << (PIX_GRID - nRedBits)); + Fctrs0[GC] = (CGU_FLOAT)(1 << (PIX_GRID - nGreenBits)); + Fctrs0[BC] = (CGU_FLOAT)(1 << (PIX_GRID - nBlueBits)); + + for (int j = 0; j < 3; j++) + { + for (int k = 0; k < 2; k++) + { + _RmpF[j][k] = cmp_floor(_MnMx[j][k]); + if (_RmpF[j][k] <= _Min) + _RmpF[j][k] = _Min; + else + { + _RmpF[j][k] += cmp_floor(128.f / 
Fctrs1[j]) - cmp_floor(_RmpF[j][k] / Fctrs1[j]); + _RmpF[j][k] = cmp_minf(_RmpF[j][k], _Max); + } + + _RmpF[j][k] = cmp_floor(_RmpF[j][k] / Fctrs0[j]) * Fctrs0[j]; + } + } +} + +// Find the first approximation of the line +// Assume there is a linear relation +// Z = a * X_In +// Z = b * Y_In +// Find a,b to minimize MSE between Z and Z_In +CMP_STATIC void cpu_FindAxis(CMP_OUT CGU_FLOAT BlkSh[BLOCK_SIZE_4X4][NUM_CHANNELS], + CMP_IN CGU_FLOAT LineDir0[NUM_CHANNELS], + CMP_IN CGU_FLOAT fBlockCenter[NUM_CHANNELS], + CMP_OUT CGU_UINT8 CMP_REFINOUT AxisIsSmall, + CMP_IN CGU_FLOAT BlkUV[BLOCK_SIZE_4X4][NUM_CHANNELS], + CMP_IN CGU_FLOAT _inpRpt[BLOCK_SIZE_4X4], + CMP_IN int nDimensions, + CMP_IN int dwUniqueColors) +{ + CGU_FLOAT Crrl[NUM_CHANNELS]; + CGU_FLOAT RGB2[NUM_CHANNELS]; + CGU_INT i; + + LineDir0[0] = LineDir0[1] = LineDir0[2] = RGB2[0] = RGB2[1] = RGB2[2] = Crrl[0] = Crrl[1] = Crrl[2] = fBlockCenter[0] = fBlockCenter[1] = fBlockCenter[2] = + 0.f; + + // sum position of all points + CGU_FLOAT fNumPoints = 0.f; + for (i = 0; i < dwUniqueColors; i++) + { + fBlockCenter[0] += BlkUV[i][0] * _inpRpt[i]; + fBlockCenter[1] += BlkUV[i][1] * _inpRpt[i]; + fBlockCenter[2] += BlkUV[i][2] * _inpRpt[i]; + fNumPoints += _inpRpt[i]; + } + + // and then average to calculate center coordinate of block + fBlockCenter[0] /= fNumPoints; + fBlockCenter[1] /= fNumPoints; + fBlockCenter[2] /= fNumPoints; + + for (i = 0; i < dwUniqueColors; i++) + { + // calculate output block as offsets around block center + BlkSh[i][0] = BlkUV[i][0] - fBlockCenter[0]; + BlkSh[i][1] = BlkUV[i][1] - fBlockCenter[1]; + BlkSh[i][2] = BlkUV[i][2] - fBlockCenter[2]; + + // compute correlation matrix + // RGB2 = sum of ((distance from point from center) squared) + // Crrl = ???????. 
Seems to be be some calculation based on distance from point center in two dimensions + for (int j = 0; j < nDimensions; j++) + { + RGB2[j] += BlkSh[i][j] * BlkSh[i][j] * _inpRpt[i]; + Crrl[j] += BlkSh[i][j] * BlkSh[i][(j + 1) % 3] * _inpRpt[i]; + } + } + + // if set's diameter is small + int i0 = 0, i1 = 1; + CGU_FLOAT mxRGB2 = 0.f; + int k = 0, j = 0; + CGU_FLOAT fEPS = fNumPoints * EPS; + for (k = 0, j = 0; j < 3; j++) + { + if (RGB2[j] >= fEPS) + k++; + else + RGB2[j] = 0.f; + + if (mxRGB2 < RGB2[j]) + { + mxRGB2 = RGB2[j]; + i0 = j; + } + } + + CGU_FLOAT fEPS2 = fNumPoints * EPS2; + AxisIsSmall = 1; + for (j = 0; j < 3; j++) + { + AxisIsSmall &= (RGB2[j] < fEPS2); + } + + if (AxisIsSmall) // all are very small to avoid division on the small determinant + return; + + if (k == 1) // really only 1 dimension + LineDir0[i0] = 1.; + else if (k == 2) + { // really only 2 dimensions + i1 = (RGB2[(i0 + 1) % 3] > 0.f) ? (i0 + 1) % 3 : (i0 + 2) % 3; + CGU_FLOAT Crl = (i1 == (i0 + 1) % 3) ? Crrl[i0] : Crrl[(i0 + 2) % 3]; + LineDir0[i1] = Crl / RGB2[i0]; + LineDir0[i0] = 1.; + } + else + { + CGU_FLOAT maxDet = 100000.f; + CGU_FLOAT Cs[3]; + // select max det for precision + for (j = 0; j < nDimensions; j++) + { + CGU_FLOAT Det = RGB2[j] * RGB2[(j + 1) % 3] - Crrl[j] * Crrl[j]; + Cs[j] = abs(Crrl[j] / sqrt(RGB2[j] * RGB2[(j + 1) % 3])); + if (maxDet < Det) + { + maxDet = Det; + i0 = j; + } + } + + // inverse correl matrix + // -- -- -- -- + // | A B | | C -B | + // | B C | => | -B A | + // -- -- -- -- + CGU_FLOAT mtrx1[2][2]; + CGU_FLOAT vc1[2]; + CGU_FLOAT vc[2]; + vc1[0] = Crrl[(i0 + 2) % 3]; + vc1[1] = Crrl[(i0 + 1) % 3]; + // C + mtrx1[0][0] = RGB2[(i0 + 1) % 3]; + // A + mtrx1[1][1] = RGB2[i0]; + // -B + mtrx1[1][0] = mtrx1[0][1] = -Crrl[i0]; + // find a solution + vc[0] = mtrx1[0][0] * vc1[0] + mtrx1[0][1] * vc1[1]; + vc[1] = mtrx1[1][0] * vc1[0] + mtrx1[1][1] * vc1[1]; + // normalize + vc[0] /= maxDet; + vc[1] /= maxDet; + // find a line direction vector + 
LineDir0[i0] = 1.; + LineDir0[(i0 + 1) % 3] = 1.; + LineDir0[(i0 + 2) % 3] = vc[0] + vc[1]; + } + + // normalize direction vector + CGU_FLOAT Len = LineDir0[0] * LineDir0[0] + LineDir0[1] * LineDir0[1] + LineDir0[2] * LineDir0[2]; + Len = sqrt(Len); + + for (j = 0; j < 3; j++) + LineDir0[j] = (Len > 0.f) ? LineDir0[j] / Len : 0.f; +} + +CMP_STATIC CGU_FLOAT cpu_RampSrchW(CGU_FLOAT Prj[BLOCK_SIZE_4X4], + CGU_FLOAT PrjErr[BLOCK_SIZE_4X4], + CGU_FLOAT PreMRep[BLOCK_SIZE_4X4], + CGU_FLOAT StepErr, + CGU_FLOAT lowPosStep, + CGU_FLOAT highPosStep, + int dwUniqueColors, + int dwNumPoints) +{ + CGU_FLOAT error = 0.0f; + CGU_FLOAT step = (highPosStep - lowPosStep) / (dwNumPoints - 1); + CGU_FLOAT step_h = step * 0.5f; + CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step; + CGU_INT i; + + for (i = 0; i < dwUniqueColors; i++) + { + // Work out which value in the block this select + CGU_FLOAT del = Prj[i] - lowPosStep; + + CGU_FLOAT v; + + if (del <= 0) + v = lowPosStep; + else if (Prj[i] - highPosStep >= 0) + v = highPosStep; + else + v = cmp_floor((del + step_h) * rstep) * step + lowPosStep; + + // And accumulate the error + CGU_FLOAT d = (Prj[i] - v); + d *= d; + CGU_FLOAT err = PreMRep[i] * d + PrjErr[i]; + error += err; + if (StepErr < error) + { + error = StepErr; + break; + } + } + return error; +} + +CMP_STATIC CGU_FLOAT _cpu_bc1ComputeBestEndpoints(CGU_FLOAT endpointsOut[NUM_ENDPOINTS], + CGU_FLOAT endpointsIn[NUM_ENDPOINTS], + CGU_FLOAT prj[BLOCK_SIZE_4X4], + CGU_FLOAT prjError[BLOCK_SIZE_4X4], + CGU_FLOAT preMRep[BLOCK_SIZE_4X4], + int numColours, + int numPoints) +{ + CGU_FLOAT minError = MAX_ERROR; + + static const CGU_FLOAT searchStep = 0.025f; + + const CGU_FLOAT lowStart = (endpointsIn[0] - 2.0f * searchStep > 0.0f) ? endpointsIn[0] - 2.0f * searchStep : 0.0f; + const CGU_FLOAT highStart = (endpointsIn[1] + 2.0f * searchStep < 1.0f) ? 
endpointsIn[1] + 2.0f * searchStep : 1.0f; + + CGU_FLOAT lowStep = lowStart; + CGU_FLOAT highStep = highStart; + + for (int low = 0; low < 8; ++low) + { + for (int high = 0; high < 8; ++high) + { + // compute an error for the current pair of end points. + CGU_FLOAT error = cpu_RampSrchW(prj, prjError, preMRep, minError, lowStep, highStep, numColours, numPoints); + + if (error < minError) + { + // save better result + minError = error; + endpointsOut[0] = lowStep; + endpointsOut[1] = highStep; + } + + highStep -= searchStep; + } + + lowStep += searchStep; + } + + return minError; +} + +// This is a float point-based compression +// it assumes that the number of unique colors is already known; input is in [0., 255.] range. +// This is C version. +CMP_STATIC bool cpu_CompressRGBBlockX(CMP_OUT CGU_FLOAT _RsltRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS], + CMP_IN CGU_FLOAT src_image[BLOCK_SIZE_4X4][NUM_CHANNELS], + CMP_IN CGU_FLOAT Rpt[BLOCK_SIZE_4X4], + CMP_IN int dwUniqueColors, + CMP_IN CGU_UINT8 dwNumPoints, + CMP_IN bool b3DRefinement, + CMP_IN CGU_UINT8 nRefinementSteps, + CMP_IN CGU_FLOAT pfWeights[3], + CMP_IN CGU_UINT8 nRedBits, + CMP_IN CGU_UINT8 nGreenBits, + CMP_IN CGU_UINT8 nBlueBits, + CMP_IN CGU_FLOAT fquality) +{ +#if !defined(ASPM_GPU) + if (!g_bc1FunctionPointersSet) + { + bc1ToggleSIMD(EXTENSION_COUNT); + } +#endif + + CGU_FLOAT ALIGN_16 Prj0[BLOCK_SIZE_4X4]; + CGU_FLOAT ALIGN_16 Prj[BLOCK_SIZE_4X4]; + CGU_FLOAT ALIGN_16 PrjErr[BLOCK_SIZE_4X4]; + CGU_FLOAT ALIGN_16 LineDir[NUM_CHANNELS]; + CGU_FLOAT ALIGN_16 RmpIndxs[BLOCK_SIZE_4X4]; + + CMP_UNUSED(fquality); + CMP_UNUSED(b3DRefinement) + + CGU_FLOAT LineDirG[NUM_CHANNELS]; + CGU_FLOAT PosG[NUM_ENDPOINTS]; + CGU_FLOAT BlkUV[BLOCK_SIZE_4X4][NUM_CHANNELS]; + CGU_FLOAT BlkSh[BLOCK_SIZE_4X4][NUM_CHANNELS]; + CGU_FLOAT LineDir0[NUM_CHANNELS]; + CGU_FLOAT Mdl[NUM_CHANNELS]; + + CGU_FLOAT rsltC[NUM_CHANNELS][NUM_ENDPOINTS]; + int i, j, k; + + // down to [0., 1.] 
+ for (i = 0; i < dwUniqueColors; i++) + for (j = 0; j < 3; j++) + BlkUV[i][j] = src_image[i][j] / 255.f; + + bool isDONE = false; + + // as usual if not more then 2 different colors, we've done + if (dwUniqueColors <= 2) + { + for (j = 0; j < 3; j++) + { + rsltC[j][0] = src_image[0][j]; + rsltC[j][1] = src_image[dwUniqueColors - 1][j]; + } + isDONE = true; + } + + if (!isDONE) + { + // This is our first attempt to find an axis we will go along. + // The cumulation is done to find a line minimizing the MSE from the input 3D points. + CGU_UINT8 bSmall; + cpu_FindAxis(BlkSh, LineDir0, Mdl, bSmall, BlkUV, Rpt, 3, dwUniqueColors); + + // While trying to find the axis we found that the diameter of the input set is quite small. + // Do not bother. + if (bSmall) + { + for (j = 0; j < 3; j++) + { + rsltC[j][0] = src_image[0][j]; + rsltC[j][1] = src_image[dwUniqueColors - 1][j]; + } + isDONE = true; + } + } + + // GCC is being an awful being when it comes to goto-jumps. + // So please bear with this. + if (!isDONE) + { + CGU_FLOAT ErrG = 10000000.f; + CGU_FLOAT PrjBnd[NUM_ENDPOINTS]; + CGU_FLOAT ALIGN_16 PreMRep[BLOCK_SIZE_4X4]; + for (j = 0; j < 3; j++) + LineDir[j] = LineDir0[j]; + + // Here is the main loop. + // 1. Project input set on the axis in consideration. + // 2. Run 1 dimensional search (see scalar case) to find an (sub) optimal pair of end points. + // 3. Compute the vector of indexes (or clusters) for the current approximate ramp. + // 4. Present our color channels as 3 16DIM vectors. + // 5. Find closest approximation of each of 16DIM color vector with the projection of the 16DIM index vector. + // 6. Plug the projections as a new directional vector for the axis. + // 7. Goto 1. + // D - is 16 dim "index" vector (or 16 DIM vector of indexes - {0, 1/3, 2/3, 0, ...,}, but shifted and normalized). + // Ci - is a 16 dim vector of color i. 
+ // for each Ci find a scalar Ai such that + // (Ai * D - Ci) (Ai * D - Ci) -> min , i.e distance between vector AiD and C is min. + // You can think of D as a unit interval(vector) "clusterizer", + // and Ai is a scale you need to apply to the clusterizer to + // approximate the Ci vector instead of the unit vector. + // Solution is + // Ai = (D . Ci) / (D . D); . - is a dot product. + // in 3 dim space Ai(s) represent a line direction, along which + // we again try to find (sub)optimal quantizer. + + // That's what our for(;;) loop is about. + for (;;) + { + // 1. Project input set on the axis in consideration. + // From Foley & Van Dam: Closest point of approach of a line (P + v) to a point (R) is + // P + ((R-P).v) / (v.v))v + // The distance along v is therefore (R-P).v / (v.v) + // (v.v) is 1 if v is a unit vector. + // + PrjBnd[0] = 1000.; + PrjBnd[1] = -1000.; + for (i = 0; i < BLOCK_SIZE_4X4; i++) + Prj0[i] = Prj[i] = PrjErr[i] = PreMRep[i] = 0.f; + + for (i = 0; i < dwUniqueColors; i++) + { + Prj0[i] = Prj[i] = BlkSh[i][0] * LineDir[0] + BlkSh[i][1] * LineDir[1] + BlkSh[i][2] * LineDir[2]; + + PrjErr[i] = (BlkSh[i][0] - LineDir[0] * Prj[i]) * (BlkSh[i][0] - LineDir[0] * Prj[i]) + + (BlkSh[i][1] - LineDir[1] * Prj[i]) * (BlkSh[i][1] - LineDir[1] * Prj[i]) + + (BlkSh[i][2] - LineDir[2] * Prj[i]) * (BlkSh[i][2] - LineDir[2] * Prj[i]); + + PrjBnd[0] = min(PrjBnd[0], Prj[i]); + PrjBnd[1] = max(PrjBnd[1], Prj[i]); + } + + // 2. Run 1 dimensional search (see scalar case) to find an (sub) optimal pair of end points. 
+ + // min and max of the search interval + CGU_FLOAT stepf = 0.125f; + + CGU_FLOAT Scl[NUM_ENDPOINTS]; + Scl[0] = PrjBnd[0] - (PrjBnd[1] - PrjBnd[0]) * stepf; + Scl[1] = PrjBnd[1] + (PrjBnd[1] - PrjBnd[0]) * stepf; + + // No range found exit + if (Scl[0] == Scl[1]) + { + return false; + } + + // compute scaling factor to scale down the search interval to [0.,1] + const CGU_FLOAT Scl2 = (Scl[1] - Scl[0]) * (Scl[1] - Scl[0]); + const CGU_FLOAT overScl = 1.f / (Scl[1] - Scl[0]); + + for (i = 0; i < dwUniqueColors; i++) + { + // scale them + Prj[i] = (Prj[i] - Scl[0]) * overScl; + // premultiply the scale squire to plug into error computation later + PreMRep[i] = Rpt[i] * Scl2; + } + + // scale first approximation of end points + PrjBnd[0] = (PrjBnd[0] - Scl[0]) * overScl; + PrjBnd[1] = (PrjBnd[1] - Scl[0]) * overScl; + + // find the best endpoints + CGU_FLOAT Pos[NUM_ENDPOINTS]; +#if defined(ASPM_GPU) + CGU_FLOAT StepErr = _cpu_bc1ComputeBestEndpoints(Pos, PrjBnd, Prj, PrjErr, PreMRep, dwUniqueColors, dwNumPoints); +#else + CGU_FLOAT StepErr = cpu_bc1ComputeBestEndpoints(Pos, PrjBnd, Prj, PrjErr, PreMRep, dwUniqueColors, dwNumPoints); +#endif + + // inverse the scaling + Pos[0] = Pos[0] * (Scl[1] - Scl[0]) + Scl[0]; + Pos[1] = Pos[1] * (Scl[1] - Scl[0]) + Scl[0]; + + // did we find somthing better from the previous run? + if (StepErr + 0.001 < ErrG) + { + // yes, remember it + ErrG = StepErr; + LineDirG[0] = LineDir[0]; + LineDirG[1] = LineDir[1]; + LineDirG[2] = LineDir[2]; + PosG[0] = Pos[0]; + PosG[1] = Pos[1]; + // 3. Compute the vector of indexes (or clusters) for the current approximate ramp. 
+ // indexes + const CGU_FLOAT step = (Pos[1] - Pos[0]) / (CGU_FLOAT)(dwNumPoints - 1); + const CGU_FLOAT step_h = step * (CGU_FLOAT)0.5; + const CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step; + const CGU_FLOAT overBlkTp = 1.f / (CGU_FLOAT)(dwNumPoints - 1); + + // here the index vector is computed, + // shifted and normalized + CGU_FLOAT indxAvrg = (CGU_FLOAT)(dwNumPoints - 1) / 2.f; + + for (i = 0; i < dwUniqueColors; i++) + { + CGU_FLOAT del; + //int n = (int)((b - _min_ex + (step*0.5f)) * rstep); + if ((del = Prj0[i] - Pos[0]) <= 0) + RmpIndxs[i] = 0.f; + else if (Prj0[i] - Pos[1] >= 0) + RmpIndxs[i] = (CGU_FLOAT)(dwNumPoints - 1); + else + RmpIndxs[i] = cmp_floor((del + step_h) * rstep); + // shift and normalization + RmpIndxs[i] = (RmpIndxs[i] - indxAvrg) * overBlkTp; + } + + // 4. Present our color channels as 3 16DIM vectors. + // 5. Find closest aproximation of each of 16DIM color vector with the pojection of the 16DIM index vector. + CGU_FLOAT Crs[3], Len, Len2; + for (i = 0, Crs[0] = Crs[1] = Crs[2] = Len = 0.f; i < dwUniqueColors; i++) + { + const CGU_FLOAT PreMlt = RmpIndxs[i] * Rpt[i]; + Len += RmpIndxs[i] * PreMlt; + for (j = 0; j < 3; j++) + Crs[j] += BlkSh[i][j] * PreMlt; + } + + LineDir[0] = LineDir[1] = LineDir[2] = 0.f; + if (Len > 0.f) + { + LineDir[0] = Crs[0] / Len; + LineDir[1] = Crs[1] / Len; + LineDir[2] = Crs[2] / Len; + + // 6. Plug the projections as a new directional vector for the axis. + // 7. Goto 1. + Len2 = LineDir[0] * LineDir[0] + LineDir[1] * LineDir[1] + LineDir[2] * LineDir[2]; + Len2 = sqrt(Len2); + + LineDir[0] /= Len2; + LineDir[1] /= Len2; + LineDir[2] /= Len2; + } + } + else // We was not able to find anything better. Drop dead. + break; + } + + // inverse transform to find end-points of 3-color ramp + for (k = 0; k < 2; k++) + for (j = 0; j < 3; j++) + rsltC[j][k] = (PosG[k] * LineDirG[j] + Mdl[j]) * 255.f; + } + + // We've dealt with (almost) unrestricted full precision realm. + // Now back to the dirty digital world. 
+ + // round the end points to make them look like compressed ones + CGU_FLOAT inpRmpEndPts[NUM_CHANNELS][NUM_ENDPOINTS]; + cpu_MkRmpOnGrid(inpRmpEndPts, rsltC, 0.f, 255.f, nRedBits, nGreenBits, nBlueBits); + + // Try using this on 3 channels + // static CGU_Vec2i cmp_getLinearEndPoints(CGU_FLOAT _Blk[BLOCK_SIZE_4X4], CMP_IN CGU_FLOAT fquality, CMP_IN CGU_BOOL isSigned); + + // This not a small procedure squeezes and stretches the ramp along each axis (R,G,B) separately while other 2 are fixed. + // It does it only over coarse grid - 565 that is. It tries to squeeze more precision for the real world ramp. +#if defined(USE_REFINE) || defined(USE_REFINE3D) + switch (nRefinementSteps) + { + case 1: + cmp_Refine(_RsltRmpPnts, inpRmpEndPts, src_image, Rpt, dwUniqueColors, dwNumPoints, pfWeights, nRedBits, nGreenBits, nBlueBits, 3); + break; + case 2: + if (dwUniqueColors > 2) + cmp_Refine3D(_RsltRmpPnts, inpRmpEndPts, src_image, Rpt, dwUniqueColors, dwNumPoints, pfWeights, nRedBits, nGreenBits, nBlueBits, 1); + else + cmp_Refine(_RsltRmpPnts, inpRmpEndPts, src_image, Rpt, dwUniqueColors, dwNumPoints, pfWeights, nRedBits, nGreenBits, nBlueBits, 3); + break; + default: + cmp_Refine(_RsltRmpPnts, inpRmpEndPts, src_image, Rpt, dwUniqueColors, dwNumPoints, pfWeights, nRedBits, nGreenBits, nBlueBits, 1); + break; + } +#endif + return true; +} + +// CPU: CompRGBBlock() +CMP_STATIC CGU_FLOAT cpu_CompRGBBlock32(CGU_UINT32 block_32[16], + CGU_UINT32 compressedBlock[2], + CGU_UINT32 dwBlockSize, + CGU_UINT8 nRedBits, + CGU_UINT8 nGreenBits, + CGU_UINT8 nBlueBits, + CGU_UINT8 nEndpoints[3][NUM_ENDPOINTS], + CGU_UINT8 pcIndices[BLOCK_SIZE_4X4], + CGU_UINT8 dwNumPoints, + bool b3DRefinement, + CGU_UINT8 m_nRefinementSteps, + CGU_FLOAT _pfChannelWeights[3], + bool _bUseAlpha, + CGU_UINT8 _nAlphaThreshold) +{ + CGU_FLOAT ALIGN_16 Rpt[BLOCK_SIZE_4X4]; + CGU_FLOAT ALIGN_16 BlkIn[BLOCK_SIZE_4X4][NUM_CHANNELS]; + CGU_UINT32 mx; + for (mx = 0; mx < BLOCK_SIZE_4X4; mx++) + { + Rpt[mx] = 0; + 
BlkIn[mx][0] = 0; + BlkIn[mx][1] = 0; + BlkIn[mx][2] = 0; + BlkIn[mx][3] = 0; + } + + compressedBlock[0] = 0; + + CGU_UINT32 dwAlphaThreshold = _nAlphaThreshold << 24; + CGU_UINT32 dwColors = 0; + CGU_UINT32 dwBlk[BLOCK_SIZE]; + for (CGU_UINT32 i = 0; i < dwBlockSize; i++) + if (!_bUseAlpha || (block_32[i] & 0xff000000) >= dwAlphaThreshold) + dwBlk[dwColors++] = block_32[i] | 0xff000000; + + // Do we have any colors ? + static int id = 0; + if (dwColors) + { + bool bHasAlpha = (dwColors != dwBlockSize); + if (bHasAlpha && _bUseAlpha && !(dwNumPoints & 0x1)) + return CMP_FLT_MAX; + + // Here we are computing an unique number of colors. + // For each unique value we compute the number of it appearences. + //qsort((void *)dwBlk, (size_t)dwColors, sizeof(CGU_UINT32), QSortIntCmp); +#ifndef ASPM_GPU // this is here for reminder when code moves to GPU + std::sort(dwBlk, dwBlk + 15); +#else + { + CGU_UINT32 j; + CMP_di what[BLOCK_SIZE_4X4]; + + for (i = 0; i < dwColors; i++) + { + what[i].index = i; + what[i].data = dwBlk[i]; + } + + CGU_UINT32 tmp_index; + CGU_UINT32 tmp_data; + + for (i = 1; i < dwColors; i++) + { + for (j = i; j > 0; j--) + { + if (what[j - 1].data > what[j].data) + { + tmp_index = what[j].index; + tmp_data = what[j].data; + what[j].index = what[j - 1].index; + what[j].data = what[j - 1].data; + what[j - 1].index = tmp_index; + what[j - 1].data = tmp_data; + } + } + } + for (i = 0; i < dwColors; i++) + dwBlk[i] = what[i].data; + } +#endif + + CGU_UINT32 new_p; + CGU_UINT32 dwBlkU[BLOCK_SIZE_4X4]; + CGU_UINT32 dwUniqueColors = 0; + new_p = dwBlkU[0] = dwBlk[0]; + Rpt[dwUniqueColors] = 1.f; + CGU_UINT32 i; + for (i = 1; i < dwColors; i++) + { + if (new_p != dwBlk[i]) + { + dwUniqueColors++; + new_p = dwBlkU[dwUniqueColors] = dwBlk[i]; + Rpt[dwUniqueColors] = 1.f; + } + else + Rpt[dwUniqueColors] += 1.f; + } + dwUniqueColors++; + + // switch to float + for (i = 0; i < dwUniqueColors; i++) + { + BlkIn[i][RC] = (CGU_FLOAT)((dwBlkU[i] >> 16) & 0xff); // R + 
BlkIn[i][GC] = (CGU_FLOAT)((dwBlkU[i] >> 8) & 0xff); // G + BlkIn[i][BC] = (CGU_FLOAT)((dwBlkU[i] >> 0) & 0xff); // B + BlkIn[i][AC] = 255.0f; + } + + CGU_FLOAT rsltC[NUM_CHANNELS][NUM_ENDPOINTS]; + if (cpu_CompressRGBBlockX(rsltC, // CMP_EndPoints = CompressRGBBlock_Slow2 ( + BlkIn, // CGU_Vec3f src_imageNorm[BLOCK_SIZE_4X4] + Rpt, // CGU_FLOAT Rpt[BLOCK_SIZE_4X4], + dwUniqueColors, // CGU_UINT32 dwUniqueColors, + dwNumPoints, // CGU_UINT32 dwNumPoints, + b3DRefinement, // + m_nRefinementSteps, // CGU_UINT32 m_nRefinementSteps, + _pfChannelWeights, // CGU_Vec3f channelWeightsBGR, + nRedBits, // ); + nGreenBits, + nBlueBits, + 1.0f)) + { + // return to integer realm + for (int ch = 0; ch < 3; ch++) + for (int j = 0; j < 2; j++) + nEndpoints[ch][j] = (CGU_UINT8)rsltC[ch][j]; + //printf("Endpoints {%3d,%3d,%3d} {%3d,%3d,%3d} ", nEndpoints[0][0],nEndpoints[1][0],nEndpoints[2][0], + // nEndpoints[0][1],nEndpoints[1][1],nEndpoints[2][1]); + + // Now get the indices using the new end points + return cpu_Clstr( + block_32, dwBlockSize, nEndpoints, pcIndices, dwNumPoints, _pfChannelWeights, _bUseAlpha, _nAlphaThreshold, nRedBits, nGreenBits, nBlueBits); + } + else + { + CGU_FLOAT CompErr = CMP_FLT_MAX; + if (dwNumPoints < 4) + { + CGU_Vec3f src_imageNorm[BLOCK_SIZE_4X4]; + + for (CGU_UINT32 px = 0; px < 16; px++) + { + src_imageNorm[px].r = (CGU_FLOAT)((block_32[px] >> 16) & 0xff) / 255.0f; + src_imageNorm[px].g = (CGU_FLOAT)((block_32[px] >> 8) & 0xff) / 255.0f; + src_imageNorm[px].b = (CGU_FLOAT)((block_32[px] >> 0) & 0xff) / 255.0f; + } + + // Do a quick compression test + CGU_Vec3f srcRGB[16]; // The list of source colors with blue channel altered + CGU_Vec3f average_rgb; // The centrepoint of the axis + CGU_FLOAT errLQ = CMP_FLT_MAX; + cgu_CompressRGBBlock_MinMax(src_imageNorm, 1.0f, false, srcRGB, average_rgb, errLQ); + CGU_Vec2ui cmp = cgu_CompressRGBBlock_Fast(src_imageNorm, 1.0f, false, srcRGB, average_rgb, CompErr); + + compressedBlock[0] = cmp.x; + 
compressedBlock[1] = cmp.y; + } + return CompErr; + } + } + else + { + // All colors transparent + nEndpoints[0][0] = nEndpoints[1][0] = nEndpoints[2][0] = 0; + nEndpoints[0][1] = nEndpoints[1][1] = nEndpoints[2][1] = 0xff; + for (CGU_UINT32 ms = 0; ms < dwBlockSize; ms++) + pcIndices[ms] = 0xff; + return 0.0; + } +} + +CMP_STATIC CGU_Vec2ui cpu_CompRGBBlock(CMP_IN CGU_Vec4uc bgraBlock[BLOCK_SIZE_4X4], CMP_IN CMP_BC15Options BC15Options, CMP_INOUT CGU_FLOAT CMP_REFINOUT err) +{ + CGU_Vec2ui cmpBlock = {0U, 0U}; + CGU_FLOAT pfChannelWeights[3] = {1.0f, 1.0f, 1.0f}; + CGU_UINT8 nEndpoints[2][3][2]; + CGU_UINT8 nIndices[2][BLOCK_SIZE_4X4]; + CGU_UINT32 compressedBlock[2] = {0, 0}; + + CGU_FLOAT fError3 = CMP_FLT_MAX; + + fError3 = cpu_CompRGBBlock32((CGU_UINT32*)bgraBlock, + compressedBlock, + BLOCK_SIZE_4X4, + RG, + GG, + BG, + nEndpoints[0], + nIndices[0], + 3, + BC15Options.m_b3DRefinement, + BC15Options.m_nRefinementSteps, + pfChannelWeights, + BC15Options.m_bUseAlpha, + BC15Options.m_nAlphaThreshold); + // use case of small min max ranges + if (compressedBlock[0] > 0) + { + //return cmpBlockBlue; + cmpBlock.x = compressedBlock[0]; + cmpBlock.y = compressedBlock[1]; + err = fError3; + } + else + { + CGU_FLOAT fError4 = CMP_FLT_MAX; + fError4 = (fError3 == 0.0) ? 
CMP_FLT_MAX + : cpu_CompRGBBlock32((CGU_UINT32*)bgraBlock, + compressedBlock, + BLOCK_SIZE_4X4, + RG, + GG, + BG, + nEndpoints[1], + nIndices[1], + 4, + BC15Options.m_b3DRefinement, + BC15Options.m_nRefinementSteps, + pfChannelWeights, + BC15Options.m_bUseAlpha, + BC15Options.m_nAlphaThreshold); + + CGU_UINT32 nMethod; + if (fError3 <= fError4) + { + err = fError3; + nMethod = 0; + } + else + { + err = fError4; + nMethod = 1; + } + + CGU_UINT32 c0 = + BC1ConstructColour((nEndpoints[nMethod][RC][0] >> (8 - RG)), (nEndpoints[nMethod][GC][0] >> (8 - GG)), (nEndpoints[nMethod][BC][0] >> (8 - BG))); + CGU_UINT32 c1 = + BC1ConstructColour((nEndpoints[nMethod][RC][1] >> (8 - RG)), (nEndpoints[nMethod][GC][1] >> (8 - GG)), (nEndpoints[nMethod][BC][1] >> (8 - BG))); + if (nMethod == 1 && c0 <= c1 || nMethod == 0 && c0 > c1) + compressedBlock[0] = c1 | (c0 << 16); + else + compressedBlock[0] = c0 | (c1 << 16); + + compressedBlock[1] = 0; + for (CGU_UINT32 i = 0; i < 16; i++) + compressedBlock[1] |= (nIndices[nMethod][i] << (2 * i)); + + cmpBlock.x = compressedBlock[0]; + cmpBlock.y = compressedBlock[1]; + } + + return cmpBlock; +} + +#endif + +#ifdef ENABLE_NEW_CODE + +//---------------------------------------- Common Utility Code ------------------------------------------------------- +// 1 - Dim error +CMP_STATIC CGU_FLOAT cgu_RampSrchW(CGU_FLOAT Prj[BLOCK_SIZE_4X4], + CGU_FLOAT PrjErr[BLOCK_SIZE_4X4], + CGU_FLOAT PreMRep[BLOCK_SIZE_4X4], + CGU_FLOAT StepErr, + CGU_FLOAT lowPosStep, + CGU_FLOAT highPosStep, + CGU_UINT32 dwUniqueColors, + CGU_UINT32 dwNumPoints) +{ + CGU_FLOAT error = 0; + CGU_FLOAT step = (highPosStep - lowPosStep) / (dwNumPoints - 1); + CGU_FLOAT step_h = step * (CGU_FLOAT)0.5; + CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step; + + for (CGU_UINT32 i = 0; i < dwUniqueColors; i++) + { + CGU_FLOAT v; + // Work out which value in the block this select + CGU_FLOAT del; + + if ((del = Prj[i] - lowPosStep) <= 0) + v = lowPosStep; + else if (Prj[i] - highPosStep >= 0) + 
v = highPosStep; + else + v = cmp_floor((del + step_h) * rstep) * step + lowPosStep; + + // And accumulate the error + CGU_FLOAT d = (Prj[i] - v); + d *= d; + CGU_FLOAT err = PreMRep[i] * d + PrjErr[i]; + error += err; + if (StepErr < error) + { + error = StepErr; + break; + } + } + return error; +} + +CMP_STATIC CGU_UINT32 cgu_processCluster(CMP_IN CMP_EndPoints EndPoints, + CMP_IN CGU_Vec4f rgbBlock_normal[BLOCK_SIZE_4X4], + CMP_IN CGU_UINT32 dwAlphaThreshold, + CMP_IN CGU_Vec3f channelWeights, + CMP_IN CGU_UINT8 indices[BLOCK_SIZE_4X4], + CMP_OUT CGU_FLOAT CMP_REFINOUT Err) +{ + Err = 0.f; + CGU_UINT32 pcIndices = 0; + CGU_UINT32 R, G, B; + + R = (CGU_UINT32)(EndPoints.Color0.z); + G = (CGU_UINT32)(EndPoints.Color0.y); + B = (CGU_UINT32)(EndPoints.Color0.x); + CGU_INT32 cluster0 = cmp_constructColor(R, G, B); + + R = (CGU_UINT32)(EndPoints.Color1.z); + G = (CGU_UINT32)(EndPoints.Color1.y); + B = (CGU_UINT32)(EndPoints.Color1.x); + CGU_INT32 cluster1 = cmp_constructColor(R, G, B); + + CGU_Vec3f InpRmp[NUM_ENDPOINTS]; + if ((cluster0 <= cluster1) // valid for 4 channels + // || (cluster0 > cluster1) // valid for 3 channels + ) + { + // inverse endpoints + InpRmp[0] = EndPoints.Color1; + InpRmp[1] = EndPoints.Color0; + } + else + { + InpRmp[0] = EndPoints.Color0; + InpRmp[1] = EndPoints.Color1; + } + + CGU_Vec3f srcblockLinear[BLOCK_SIZE_4X4]; + CGU_FLOAT srcblockA[BLOCK_SIZE_4X4]; + + // Swizzle the source RGB to BGR for processing + for (CGU_UINT32 i = 0; i < BLOCK_SIZE_4X4; i++) + { + srcblockLinear[i].z = rgbBlock_normal[i].x * 255.0f; + srcblockLinear[i].y = rgbBlock_normal[i].y * 255.0f; + srcblockLinear[i].x = rgbBlock_normal[i].z * 255.0f; + srcblockA[i] = 0.0f; + //if (dwAlphaThreshold > 0) + //{ + // CGU_UINT32 alpha = (CGU_UINT32)BlockA[i]; + // if (alpha >= dwAlphaThreshold) + // srcblockA[i] = BlockA[i]; + //} + } + + // cmp_ClstrBas2() + // input ramp is on the coarse grid + // make ramp endpoints the way they'll going to be decompressed + CGU_Vec3f 
InpRmpL[NUM_ENDPOINTS]; + CGU_Vec3f Fctrs = {32.0F, 64.0F, 32.0F}; // 1 << RG,1 << GG,1 << BG + + { + // ConstantRamp = MkWkRmpPts(InpRmpL, InpRmp); + InpRmpL[0] = InpRmp[0] + cmp_floorVec3f(InpRmp[0] / Fctrs); + InpRmpL[0] = cmp_clampVec3f(InpRmpL[0], 0.0f, 255.0f); + InpRmpL[1] = InpRmp[1] + cmp_floorVec3f(InpRmp[1] / Fctrs); + InpRmpL[1] = cmp_clampVec3f(InpRmpL[1], 0.0f, 255.0f); + } // MkWkRmpPts + + // build ramp + CGU_Vec3f LerpRmp[4]; + CGU_Vec3f offset = {1.0f, 1.0f, 1.0f}; + { + //BldRmp(Rmp, InpRmpL, dwNumChannels); + // linear interpolate end points to get the ramp + LerpRmp[0] = InpRmpL[0]; + LerpRmp[3] = InpRmpL[1]; + LerpRmp[1] = cmp_floorVec3f((InpRmpL[0] * 2.0f + LerpRmp[3] + offset) / 3.0f); + LerpRmp[2] = cmp_floorVec3f((InpRmpL[0] + LerpRmp[3] * 2.0f + offset) / 3.0f); + } // BldRmp + + //========================================================================= + // Clusterize, Compute error and find DXTC indexes for the current cluster + //========================================================================= + { + // Clusterize + CGU_UINT32 alpha; + + // For each colour in the original block assign it + // to the closest cluster and compute the cumulative error + for (CGU_UINT32 i = 0; i < BLOCK_SIZE_4X4; i++) + { + alpha = (CGU_UINT32)srcblockA[i]; + if ((dwAlphaThreshold > 0) && alpha == 0) + { //*((CGU_UINT32 *)&_Blk[i][AC]) == 0) + pcIndices |= cmp_set2Bit32(4, i); // dwNumChannels 3 or 4 (default is 4) + indices[i] = 4; + } + else + { + CGU_FLOAT shortest = 99999999999.f; + CGU_UINT8 shortestIndex = 0; + + CGU_Vec3f channelWeightsBGR; + channelWeightsBGR.x = channelWeights.z; + channelWeightsBGR.y = channelWeights.y; + channelWeightsBGR.z = channelWeights.x; + + for (CGU_UINT8 rampindex = 0; rampindex < 4; rampindex++) + { + // r is either 1 or 4 + // calculate the distance for each component + CGU_FLOAT distance = cmp_dotVec3f(((srcblockLinear[i] - LerpRmp[rampindex]) * channelWeightsBGR), + ((srcblockLinear[i] - LerpRmp[rampindex]) * 
channelWeightsBGR)); + if (distance < shortest) + { + shortest = distance; + shortestIndex = rampindex; + } + } + + Err += shortest; + + // The total is a sum of (error += shortest) + // We have the index of the best cluster, so assign this in the block + // Reorder indices to match correct DXTC ordering + if (shortestIndex == 3) // dwNumChannels - 1 + shortestIndex = 1; + else if (shortestIndex) + shortestIndex++; + pcIndices |= cmp_set2Bit32(shortestIndex, i); + indices[i] = shortestIndex; + } + } // BLOCK_SIZE_4X4 + } // Clusterize + + return pcIndices; +} +#endif + +// Process a rgbBlock which is normalized (0.0f ... 1.0f), signed normal is not implemented +CMP_STATIC CGU_Vec2ui CompressBlockBC1_NORMALIZED(CMP_IN CGU_Vec4f src_imageNorm[BLOCK_SIZE_4X4], CMP_IN CMP_BC15Options BC15Options) +{ + bool usingMaxQualityOnly = false; + +#ifndef ASPM_GPU + if (BC15Options.m_fquality > 0.75) + usingMaxQualityOnly = true; +#endif + + CGU_FLOAT CompErr = CMP_FLT_MAX; + CGU_Vec2ui cmpBlock = {0U, 0U}; + CGU_Vec2ui cmpBlockTemp = {0U, 0U}; + CGU_FLOAT CompErrTemp; + + // Transfer to RGB Norm from RGBA Norm + CGU_Vec3f src_imageRGBNorm[16]; + CGU_Vec4uc pixels[16]; + CGU_Vec4uc pixelsBGRA[16]; + + for (CGU_UINT32 sr = 0; sr < 16; sr++) + { + src_imageRGBNorm[sr] = src_imageNorm[sr].rgb; + pixelsBGRA[sr].b = pixels[sr].r = src_imageNorm[sr].r * 255.0f; + pixelsBGRA[sr].g = pixels[sr].g = src_imageNorm[sr].g * 255.0f; + pixelsBGRA[sr].r = pixels[sr].b = src_imageNorm[sr].b * 255.0f; + pixelsBGRA[sr].a = pixels[sr].a = src_imageNorm[sr].a * 255.0f; + } + + // check for a punch through transparent alpha setting + if ((BC15Options.m_fquality < 0.75) && (BC15Options.m_bUseAlpha)) + { + CGU_Vec2ui cmpBlockAlpha = {0xffff0000, 0xffffffffU}; + for (CGU_UINT32 sr = 0; sr < 16; sr++) + if (pixels[sr].a < BC15Options.m_nAlphaThreshold) + { + return cmpBlockAlpha; + } + } + + //================ + // extern codec + //================ + // For debugging + // CGU_Vec2ui cmpBlockRed = 
{0xF800F800,0x00000000}; + // CGU_Vec2ui cmpBlockGreen = {0x7E007E00,0x00000000}; + // CGU_Vec2ui cmpBlockBlue = {0x1F001F00,0x00000000}; + + if (!BC15Options.m_bUseAlpha) + { + //========================================== + // Gain +0.3 dB for images with soild blocks + //========================================== + bool bAllColoursEqual = true; + + // Load the whole 4x4 block + for (CGU_UINT32 i = 0u; (i < 16u) && bAllColoursEqual; ++i) + { + for (CGU_INT c = 0; c < 3; c++) + bAllColoursEqual = bAllColoursEqual && (pixels[0][c] == pixels[i][c]); + } + + if (bAllColoursEqual) + { + cmpBlock = cgu_solidColorBlock(pixels[0].x, pixels[0].y, pixels[0].z); + CompErr = cgu_RGBABlockErrorLinear(pixels, cmpBlock); + if (BC15Options.m_nRefinementSteps < 1) + return cmpBlock; + } + } + + if (!usingMaxQualityOnly) + { + //==================================== + // Get src image data, min,max... + //===================================== + //CMP_EncodeData edata; + //cmp_get_encode_data(edata,pixels); + + if (!BC15Options.m_bUseAlpha) + { + //==================================== + // Fast Compression, low quality + //===================================== + CGU_Vec3f srcRGB[16]; // The list of source colors with blue channel altered + CGU_Vec3f average_rgb; // The centrepoint of the axis + CGU_FLOAT errLQ = CMP_FLT_MAX; + cmpBlockTemp = cgu_CompressRGBBlock_MinMax(src_imageRGBNorm, BC15Options.m_fquality, BC15Options.m_bIsSRGB, srcRGB, average_rgb, errLQ); + if ((BC15Options.m_fquality < CMP_QUALITY0) || (errLQ == 0.0f)) + return cmpBlockTemp; + + if (CompErr > errLQ) + { + CompErr = errLQ; + cmpBlock = cmpBlockTemp; + } + + cmpBlockTemp = cgu_CompressRGBBlock_Fast(src_imageRGBNorm, BC15Options.m_fquality, BC15Options.m_bIsSRGB, srcRGB, average_rgb, errLQ); + if (CompErr > errLQ) + { + CompErr = errLQ; + cmpBlock = cmpBlockTemp; + } + if (BC15Options.m_fquality < CMP_QUALITY1) + return cmpBlock; + } + + //======================================== + // use GPU codec lower quality 
then CPU + //======================================== + cmpBlockTemp = cgu_CompRGBBlock(src_imageNorm, BC15Options); + CompErrTemp = cgu_RGBABlockErrorLinear(pixels, cmpBlockTemp); + if (CompErr > CompErrTemp) + { + CompErr = CompErrTemp; + cmpBlock = cmpBlockTemp; + } + + if (BC15Options.m_fquality < CMP_QUALITY2) + return cmpBlock; + } // if useCGUCodecs + + //==================================== + // High Quality Codec CPU only + //===================================== +#ifndef ASPM_GPU + cmpBlockTemp = cpu_CompRGBBlock(pixelsBGRA, BC15Options, CompErrTemp); + + CompErrTemp = cgu_RGBABlockErrorLinear(pixels, cmpBlockTemp); + + if (CompErr > CompErrTemp) + { + CompErr = CompErrTemp; + cmpBlock = cmpBlockTemp; + } +#endif + + return cmpBlock; +}