From 855eb38997854fd5649b5575a0b78004c621a8c7 Mon Sep 17 00:00:00 2001
From: Andrew Lamontagne <metallicafan212@gmail.com>
Date: Wed, 22 Jan 2025 22:17:45 -0700
Subject: [PATCH 1/3] Fix memory/thread leak in compressed-to-compressed
 texture conversion

---
 cmp_compressonatorlib/compressonator.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/cmp_compressonatorlib/compressonator.cpp b/cmp_compressonatorlib/compressonator.cpp
index f9f068f19..9e732c7a0 100644
--- a/cmp_compressonatorlib/compressonator.cpp
+++ b/cmp_compressonatorlib/compressonator.cpp
@@ -465,6 +465,12 @@ CMP_ERROR CMP_API CMP_ConvertTexture(CMP_Texture*               pSourceTexture,
         }
         RESTORE_FP_EXCEPTIONS;
 
+        SAFE_DELETE(pCodecIn);
+        SAFE_DELETE(pCodecOut);
+        SAFE_DELETE(pSrcBuffer);
+        SAFE_DELETE(pTempBuffer);
+        SAFE_DELETE(pDestBuffer);
+
         return GetError(err2);
     }
 }

From f464fd8d16022f4702cfbeeb374abd118e79c49f Mon Sep 17 00:00:00 2001
From: Andrew Lamontagne <metallicafan212@gmail.com>
Date: Fri, 25 Jul 2025 21:04:08 -0700
Subject: [PATCH 2/3] Disallow SSE and AVX when compiling for non-x86 targets

---
 CMakeLists.txt                    |  51 ++++++++++
 build/sdk/cmp_core/CMakeLists.txt | 162 +++++++++++++++---------------
 cmp_core/CMakeLists.txt           |  82 ++++++++-------
 cmp_core/shaders/bc1_cmp.h        |   5 +
 4 files changed, 182 insertions(+), 118 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4cb219698..de348cb58 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -40,6 +40,57 @@ message("Building Compressonator version ${PROJECT_VERSION_MAJOR}.${PROJECT_VERS
 # ------------------------------
 include(cmake/helperfunctions.cmake)
 
+# ----------------------------------
+# Check the target architecture
+# ----------------------------------
+# check_cpu(<cdefs> <name> <value>) try-compiles a small C program that only
+# builds when the preprocessor expression <cdefs> is true. On success <name>
+# is stored in AMD_COMPRESSONATOR_CPU and <value>=1 becomes a compile
+# definition; a second successful probe aborts with a FATAL_ERROR.
+# check_c_source_compiles() is provided by the CheckCSourceCompiles module.
+include(CheckCSourceCompiles)
+
+macro(check_cpu _CDEFS _NAME _VALUE)
+    check_c_source_compiles(
+        "
+        #if ( ${_CDEFS} )
+        int main(int argc, char **argv) { int yup = 1; return 0; }
+        #else
+        #error Not targeting this CPU architecture.
+        #endif
+        "
+        ${_VALUE}
+    )
+    # Only one probe may succeed; a second match means the macros are ambiguous.
+    if(${_VALUE})
+        if(AMD_COMPRESSONATOR_CHOSE_CPU)
+            message(STATUS "We appear to see two different CPU architectures!")
+            message(STATUS "We saw '${AMD_COMPRESSONATOR_CPU}' and '${_NAME}'.")
+            message(FATAL_ERROR "Please fix this before continuing.")
+        endif()
+        set(AMD_COMPRESSONATOR_CHOSE_CPU TRUE)
+        set(AMD_COMPRESSONATOR_CPU ${_NAME})
+        add_compile_definitions(${_VALUE}=1)
+    endif()
+endmacro()
+
+check_cpu(
+    "defined(__i386__) || defined(__i686__) || defined(_M_IX86) || defined(i386)"
+    "x86" AMD_COMPRESSONATOR_X86
+)
+
+check_cpu("defined(__x86_64__) || defined(_M_X64)" "amd64" AMD_COMPRESSONATOR_AMD64)
+
+check_cpu("defined(__EMSCRIPTEN__)" "emscripten" AMD_COMPRESSONATOR_EMSCRIPTEN)
+# MSVC spells the ARM macros _M_ARM / _M_ARM64; GCC and Clang use the __arm*__ forms.
+check_cpu("defined(__arm__) || defined(_M_ARM)" "arm" AMD_COMPRESSONATOR_ARM)
+check_cpu("defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM64)" "arm64" AMD_COMPRESSONATOR_ARM64)
+
+# Stop configuring when none of the probes above matched the compiler's target.
+if (NOT AMD_COMPRESSONATOR_CHOSE_CPU)
+    message(FATAL_ERROR "We don't support this architecture yet")
+endif()
+
 
 # ------------------------------
 # Common compiler options
diff --git a/build/sdk/cmp_core/CMakeLists.txt b/build/sdk/cmp_core/CMakeLists.txt
index 680616af8..06ce095c2 100644
--- a/build/sdk/cmp_core/CMakeLists.txt
+++ b/build/sdk/cmp_core/CMakeLists.txt
@@ -72,82 +72,86 @@
 
 # Core SIMD options
 
-# SSE
-add_library(CMP_Core_SSE OBJECT)
-target_sources(
-    CMP_Core_SSE 
-    PRIVATE 
-    ${COMPRESSONATOR_ROOT_PATH}/cmp_core/source/core_simd_sse.cpp
-)
-
-target_include_directories(
-    CMP_Core_SSE 
-    PRIVATE 
-    ${COMPRESSONATOR_ROOT_PATH}/cmp_core/source 
-    ${COMPRESSONATOR_ROOT_PATH}/cmp_core/shaders
-)
-
-if (UNIX)
-    target_compile_options(CMP_Core_SSE PRIVATE -march=nehalem)
-endif()
-
-set_target_properties(CMP_Core_SSE PROPERTIES 
-    FOLDER ${PROJECT_FOLDER_SDK_LIBS}
-    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
-    ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
-)
-
-# AVX
-add_library(CMP_Core_AVX OBJECT)
-target_sources(
-    CMP_Core_AVX 
-    PRIVATE 
-    ${COMPRESSONATOR_ROOT_PATH}/cmp_core/source/core_simd_avx.cpp
-)
-target_include_directories(
-    CMP_Core_AVX 
-    PRIVATE 
-    ${COMPRESSONATOR_ROOT_PATH}/cmp_core/source 
-    ${COMPRESSONATOR_ROOT_PATH}/cmp_core/shaders
-)
-
-if (WIN32)
-    target_compile_options(CMP_Core_AVX PRIVATE /arch:AVX2)
-else()
-    target_compile_options(CMP_Core_AVX PRIVATE -march=haswell)
-endif()
-
-set_target_properties(CMP_Core_AVX PROPERTIES 
-    FOLDER ${PROJECT_FOLDER_SDK_LIBS}
-    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
-    ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
-)
-
-# AVX-512
-add_library(CMP_Core_AVX512 OBJECT)
-target_sources(
-    CMP_Core_AVX512 
-    PRIVATE 
-    ${COMPRESSONATOR_ROOT_PATH}/cmp_core/source/core_simd_avx512.cpp
-)
-target_include_directories(
-    CMP_Core_AVX512 
-    PRIVATE 
-    ${COMPRESSONATOR_ROOT_PATH}/cmp_core/source 
-    ${COMPRESSONATOR_ROOT_PATH}/cmp_core/shaders
-)
-
-if (WIN32)
-    target_compile_options(CMP_Core_AVX512 PRIVATE /arch:AVX-512)
-else()
-    target_compile_options(CMP_Core_AVX512 PRIVATE -march=knl)
-endif()
-
-set_target_properties(CMP_Core_AVX512 PROPERTIES 
-    FOLDER ${PROJECT_FOLDER_SDK_LIBS}
-    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
-    ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
-)
-
-# Link SIMD libraries to CMP_Core
-target_link_libraries(CMP_Core PRIVATE CMP_Core_SSE CMP_Core_AVX CMP_Core_AVX512)
\ No newline at end of file
+# Metallicafan212: Only create and link the SSE/AVX/AVX-512 object libraries
+#                  when the architecture probe detected x86 or amd64.
+#                  This fixes ARM builds.
+if(AMD_COMPRESSONATOR_X86 OR AMD_COMPRESSONATOR_AMD64)
+
+    # ---- SSE ----
+    add_library(CMP_Core_SSE OBJECT)
+    target_sources(CMP_Core_SSE
+        PRIVATE
+            ${COMPRESSONATOR_ROOT_PATH}/cmp_core/source/core_simd_sse.cpp
+    )
+    target_include_directories(CMP_Core_SSE
+        PRIVATE
+            ${COMPRESSONATOR_ROOT_PATH}/cmp_core/source
+            ${COMPRESSONATOR_ROOT_PATH}/cmp_core/shaders
+    )
+    # Only UNIX builds receive an explicit -march for the SSE unit.
+    if(UNIX)
+        target_compile_options(CMP_Core_SSE PRIVATE -march=nehalem)
+    endif()
+    set_target_properties(CMP_Core_SSE
+        PROPERTIES
+            FOLDER ${PROJECT_FOLDER_SDK_LIBS}
+            RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
+            ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
+    )
+
+    # ---- AVX ----
+    add_library(CMP_Core_AVX OBJECT)
+    target_sources(CMP_Core_AVX
+        PRIVATE
+            ${COMPRESSONATOR_ROOT_PATH}/cmp_core/source/core_simd_avx.cpp
+    )
+    target_include_directories(CMP_Core_AVX
+        PRIVATE
+            ${COMPRESSONATOR_ROOT_PATH}/cmp_core/source
+            ${COMPRESSONATOR_ROOT_PATH}/cmp_core/shaders
+    )
+    # WIN32 gets the /arch switch; every other platform gets -march.
+    if(WIN32)
+        target_compile_options(CMP_Core_AVX PRIVATE /arch:AVX2)
+    else()
+        target_compile_options(CMP_Core_AVX PRIVATE -march=haswell)
+    endif()
+    set_target_properties(CMP_Core_AVX
+        PROPERTIES
+            FOLDER ${PROJECT_FOLDER_SDK_LIBS}
+            RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
+            ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
+    )
+
+    # ---- AVX-512 ----
+    add_library(CMP_Core_AVX512 OBJECT)
+    target_sources(CMP_Core_AVX512
+        PRIVATE
+            ${COMPRESSONATOR_ROOT_PATH}/cmp_core/source/core_simd_avx512.cpp
+    )
+    target_include_directories(CMP_Core_AVX512
+        PRIVATE
+            ${COMPRESSONATOR_ROOT_PATH}/cmp_core/source
+            ${COMPRESSONATOR_ROOT_PATH}/cmp_core/shaders
+    )
+    if(WIN32)
+        target_compile_options(CMP_Core_AVX512 PRIVATE /arch:AVX-512)
+    else()
+        target_compile_options(CMP_Core_AVX512 PRIVATE -march=knl)
+    endif()
+    set_target_properties(CMP_Core_AVX512
+        PROPERTIES
+            FOLDER ${PROJECT_FOLDER_SDK_LIBS}
+            RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
+            ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
+    )
+
+    # Attach all three SIMD object libraries to CMP_Core.
+    target_link_libraries(CMP_Core
+        PRIVATE
+            CMP_Core_SSE
+            CMP_Core_AVX
+            CMP_Core_AVX512
+    )
+
+endif()
\ No newline at end of file
diff --git a/cmp_core/CMakeLists.txt b/cmp_core/CMakeLists.txt
index 478737708..a3b462dc8 100644
--- a/cmp_core/CMakeLists.txt
+++ b/cmp_core/CMakeLists.txt
@@ -66,42 +66,46 @@ set_target_properties(CMP_Core PROPERTIES FOLDER ${PROJECT_FOLDER_SDK_LIBS})
 
 # Core SIMD options
 
-# SSE
-add_library(CMP_Core_SSE STATIC)
-target_sources(CMP_Core_SSE PRIVATE source/core_simd_sse.cpp)
-target_include_directories(CMP_Core_SSE PRIVATE source shaders)
-
-if (UNIX)
-    target_compile_options(CMP_Core_SSE PRIVATE -march=nehalem)
-endif()
-
-set_target_properties(CMP_Core_SSE PROPERTIES FOLDER ${PROJECT_FOLDER_SDK_LIBS})
-
-# AVX
-add_library(CMP_Core_AVX STATIC)
-target_sources(CMP_Core_AVX PRIVATE source/core_simd_avx.cpp)
-target_include_directories(CMP_Core_AVX PRIVATE source shaders)
-
-if (WIN32)
-    target_compile_options(CMP_Core_AVX PRIVATE /arch:AVX2)
-else()
-    target_compile_options(CMP_Core_AVX PRIVATE -march=haswell)
-endif()
-
-set_target_properties(CMP_Core_AVX PROPERTIES FOLDER ${PROJECT_FOLDER_SDK_LIBS})
-
-# AVX-512
-add_library(CMP_Core_AVX512 STATIC)
-target_sources(CMP_Core_AVX512 PRIVATE source/core_simd_avx512.cpp)
-target_include_directories(CMP_Core_AVX512 PRIVATE source shaders)
-
-if (WIN32)
-    target_compile_options(CMP_Core_AVX512 PRIVATE /arch:AVX-512)
-else()
-    target_compile_options(CMP_Core_AVX512 PRIVATE -march=knl)
-endif()
-
-set_target_properties(CMP_Core_AVX512 PROPERTIES FOLDER ${PROJECT_FOLDER_SDK_LIBS})
-
-# Link SIMD libraries to CMP_Core
-target_link_libraries(CMP_Core PRIVATE CMP_Core_SSE CMP_Core_AVX CMP_Core_AVX512)
\ No newline at end of file
+# Metallicafan212: Only create and link the SSE/AVX/AVX-512 static libraries
+#                  when x86 or amd64 was detected; this fixes ARM builds.
+if(AMD_COMPRESSONATOR_X86 OR AMD_COMPRESSONATOR_AMD64)
+    # ---- SSE ----
+    add_library(CMP_Core_SSE STATIC)
+    target_sources(CMP_Core_SSE PRIVATE source/core_simd_sse.cpp)
+    target_include_directories(CMP_Core_SSE PRIVATE source shaders)
+    set_target_properties(CMP_Core_SSE PROPERTIES FOLDER ${PROJECT_FOLDER_SDK_LIBS})
+    # Only UNIX builds receive an explicit -march for the SSE unit.
+    if(UNIX)
+        target_compile_options(CMP_Core_SSE PRIVATE -march=nehalem)
+    endif()
+
+    # ---- AVX ----
+    add_library(CMP_Core_AVX STATIC)
+    target_sources(CMP_Core_AVX PRIVATE source/core_simd_avx.cpp)
+    target_include_directories(CMP_Core_AVX PRIVATE source shaders)
+    set_target_properties(CMP_Core_AVX PROPERTIES FOLDER ${PROJECT_FOLDER_SDK_LIBS})
+    # WIN32 gets the /arch switch; every other platform gets -march.
+    if(WIN32)
+        target_compile_options(CMP_Core_AVX PRIVATE /arch:AVX2)
+    else()
+        target_compile_options(CMP_Core_AVX PRIVATE -march=haswell)
+    endif()
+
+    # ---- AVX-512 ----
+    add_library(CMP_Core_AVX512 STATIC)
+    target_sources(CMP_Core_AVX512 PRIVATE source/core_simd_avx512.cpp)
+    target_include_directories(CMP_Core_AVX512 PRIVATE source shaders)
+    set_target_properties(CMP_Core_AVX512 PROPERTIES FOLDER ${PROJECT_FOLDER_SDK_LIBS})
+    # Same WIN32 / non-WIN32 split as the AVX library.
+    if(WIN32)
+        target_compile_options(CMP_Core_AVX512 PRIVATE /arch:AVX-512)
+    else()
+        target_compile_options(CMP_Core_AVX512 PRIVATE -march=knl)
+    endif()
+
+    # Attach all three SIMD libraries to CMP_Core.
+    target_link_libraries(CMP_Core
+        PRIVATE CMP_Core_SSE CMP_Core_AVX CMP_Core_AVX512
+    )
+
+endif()
\ No newline at end of file
diff --git a/cmp_core/shaders/bc1_cmp.h b/cmp_core/shaders/bc1_cmp.h
index c97315dd5..70e1054be 100644
--- a/cmp_core/shaders/bc1_cmp.h
+++ b/cmp_core/shaders/bc1_cmp.h
@@ -96,6 +96,8 @@ CMP_STATIC CGU_FLOAT (*cpu_bc1ComputeBestEndpoints)(CGU_FLOAT*, CGU_FLOAT*, CGU_
 // NOTE: The requested extension will only be enabled if it is supported by the current CPU.
 CMP_STATIC bool bc1ToggleSIMD(CGU_INT newExtension)
 {
+	// Metallicafan212:	Don't evaluate on non-X86 platforms
+#if AMD_COMPRESSONATOR_AMD64 || AMD_COMPRESSONATOR_X86
     CGU_BOOL useAVX512 = true;
     CGU_BOOL useAVX2   = true;
     CGU_BOOL useSSE42  = true;
@@ -125,6 +127,9 @@ CMP_STATIC bool bc1ToggleSIMD(CGU_INT newExtension)
     {
         cpu_bc1ComputeBestEndpoints = _cpu_bc1ComputeBestEndpoints;
     }
+#else 
+	cpu_bc1ComputeBestEndpoints = _cpu_bc1ComputeBestEndpoints;
+#endif
 
     g_bc1FunctionPointersSet = true;
 

From 4b723803e2eaf579a09fc925a6c9c1affda75672 Mon Sep 17 00:00:00 2001
From: metallicafan212 <metallicafan212@gmail.com>
Date: Fri, 25 Jul 2025 23:46:08 -0700
Subject: [PATCH 3/3] Fix variable scope when compiling in non-x86

---
 cmp_core/shaders/bc1_cmp.h | 6943 ++++++++++++++++++------------------
 1 file changed, 3473 insertions(+), 3470 deletions(-)

diff --git a/cmp_core/shaders/bc1_cmp.h b/cmp_core/shaders/bc1_cmp.h
index 70e1054be..892af2e30 100644
--- a/cmp_core/shaders/bc1_cmp.h
+++ b/cmp_core/shaders/bc1_cmp.h
@@ -1,3470 +1,3473 @@
-//=====================================================================
-// Copyright (c) 2020-2024    Advanced Micro Devices, Inc. All rights reserved.
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files(the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions :
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-// THE SOFTWARE.
-//
-// File: bc1_cmp.h
-//--------------------------------------------------------------------------------------
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-//--------------------------------------------------------------------------------------
-
-#define USE_CMP
-
-#include "common_def.h"
-#include "bcn_common_kernel.h"
-#include "bcn_common_api.h"
-
-#ifndef ASPM_GPU
-#include "cpu_extensions.h"
-#include "core_simd.h"
-#endif
-
-//-----------------------------------------------------------------------
-// When build is for CPU, we have some missing API calls common to GPU
-// Use CPU CMP_Core replacements
-//-----------------------------------------------------------------------
-#if defined(ASPM_GPU) || defined(ASPM_HLSL) || defined(ASPM_OPENCL)
-#define ALIGN_16
-#define ALIGN_32
-#define ALIGN_64
-#else
-#include INC_cmp_math_func
-#if defined(_WIN32) || defined(_WIN64)
-#define ALIGN_16 __declspec(align(16))
-#define ALIGN_32 __declspec(align(32))
-#define ALIGN_64 __declspec(align(64))
-#else  // !WIN32 && !_WIN64
-#define ALIGN_16 __attribute__((aligned(16)))
-#define ALIGN_32 __attribute__((aligned(32)))
-#define ALIGN_64 __attribute__((aligned(64)))
-#endif  // !WIN32 && !_WIN64
-#endif
-
-#define USE_REFINE3D
-#define USE_REFINE
-
-#ifndef MAX_ERROR
-#define MAX_ERROR 128000.f
-#endif
-
-#define NUM_CHANNELS 4
-#define NUM_ENDPOINTS 2
-
-#ifndef CMP_QUALITY0
-#define CMP_QUALITY0 0.25f
-#endif
-
-#ifndef CMP_QUALITY1
-#define CMP_QUALITY1 0.50f
-#endif
-
-#ifndef CMP_QUALITY2
-#define CMP_QUALITY2 0.75f
-#endif
-
-#define EPS (2.f / 255.f) * (2.f / 255.f)
-#define EPS2 3.f * (2.f / 255.f) * (2.f / 255.f)
-
-// Disable SIMD code during GPU builds
-#if !defined(ASPM_GPU)
-CMP_STATIC CGU_BOOL g_bc1FunctionPointersSet = false;
-
-// declarations for SIMD function variations
-CMP_STATIC CGU_FLOAT _cpu_bc1ComputeBestEndpoints(CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, int, int);
-
-// function pointers
-CMP_STATIC CGU_FLOAT (*cpu_bc1ComputeBestEndpoints)(CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, int, int) = 0;
-
-// Toggle which SIMD instruction set extensions to use. Setting this to EXTENSION_COUNT will enable auto-detection of supported extensions.
-// NOTE: The requested extension will only be enabled if it is supported by the current CPU.
-CMP_STATIC bool bc1ToggleSIMD(CGU_INT newExtension)
-{
-	// Metallicafan212:	Don't evaluate on non-X86 platforms
-#if AMD_COMPRESSONATOR_AMD64 || AMD_COMPRESSONATOR_X86
-    CGU_BOOL useAVX512 = true;
-    CGU_BOOL useAVX2   = true;
-    CGU_BOOL useSSE42  = true;
-
-    CPUExtensions extensions = GetCPUExtensions();
-
-    if (newExtension < EXTENSION_COUNT)  // user requested a specific instruction set extension
-    {
-        useAVX512 = newExtension == EXTENSION_AVX512_F;
-        useAVX2   = newExtension == EXTENSION_AVX2;
-        useSSE42  = newExtension == EXTENSION_SSE42;
-    }
-
-    if (useAVX512 && IsAvailableAVX512(extensions))
-    {
-        cpu_bc1ComputeBestEndpoints = avx512_bc1ComputeBestEndpoints;
-    }
-    else if (useAVX2 && IsAvailableAVX2(extensions))
-    {
-        cpu_bc1ComputeBestEndpoints = avx_bc1ComputeBestEndpoints;
-    }
-    else if (useSSE42 && IsAvailableSSE4(extensions))
-    {
-        cpu_bc1ComputeBestEndpoints = sse_bc1ComputeBestEndpoints;
-    }
-    else
-    {
-        cpu_bc1ComputeBestEndpoints = _cpu_bc1ComputeBestEndpoints;
-    }
-#else 
-	cpu_bc1ComputeBestEndpoints = _cpu_bc1ComputeBestEndpoints;
-#endif
-
-    g_bc1FunctionPointersSet = true;
-
-    bool result = true;
-
-    if (newExtension != EXTENSION_COUNT && (useAVX512 && !IsAvailableAVX512(extensions)) || (useAVX2 && !IsAvailableAVX2(extensions)) ||
-        (useSSE42 && !IsAvailableSSE4(extensions)))
-        result = false;
-
-    return result;
-}
-#endif
-
-static CGU_FLOAT cgu_getRampErr(CGU_FLOAT  Prj[BLOCK_SIZE_4X4],
-                                CGU_FLOAT  PrjErr[BLOCK_SIZE_4X4],
-                                CGU_FLOAT  PreMRep[BLOCK_SIZE_4X4],
-                                CGU_FLOAT  StepErr,
-                                CGU_FLOAT  lowPosStep,
-                                CGU_FLOAT  highPosStep,
-                                CGU_UINT32 dwUniqueColors)
-{
-    CGU_FLOAT error  = 0;
-    CGU_FLOAT step   = (highPosStep - lowPosStep) / 3;  // using (dwNumChannels=4 - 1);
-    CGU_FLOAT step_h = step * (CGU_FLOAT)0.5;
-    CGU_FLOAT rstep  = (CGU_FLOAT)1.0f / step;
-
-    for (CGU_UINT32 i = 0; i < dwUniqueColors; i++)
-    {
-        CGU_FLOAT v;
-        // Work out which value in the block this select
-        CGU_FLOAT del;
-
-        if ((del = Prj[i] - lowPosStep) <= 0)
-            v = lowPosStep;
-        else if (Prj[i] - highPosStep >= 0)
-            v = highPosStep;
-        else
-            v = cmp_floor((del + step_h) * rstep) * step + lowPosStep;
-
-        // And accumulate the error
-        CGU_FLOAT d = (Prj[i] - v);
-        d *= d;
-        CGU_FLOAT err = PreMRep[i] * d + PrjErr[i];
-        error += err;
-        if (StepErr < error)
-        {
-            error = StepErr;
-            break;
-        }
-    }
-    return error;
-}
-
-CMP_STATIC CMP_EndPoints cgu_CompressRGBBlockX(CMP_IN CGU_Vec3f  BlkInBGRf_UV[BLOCK_SIZE_4X4],
-                                               CMP_IN CGU_FLOAT  Rpt[BLOCK_SIZE_4X4],
-                                               CMP_IN CGU_UINT32 dwUniqueColors,
-                                               CMP_IN CGU_Vec3f  channelWeightsBGR,
-                                               CMP_IN CGU_BOOL   b3DRefinement)
-{
-    CMP_UNUSED(channelWeightsBGR);
-    CMP_UNUSED(b3DRefinement);
-    CGU_FLOAT ALIGN_16 Prj0[BLOCK_SIZE_4X4];
-    CGU_FLOAT ALIGN_16 Prj[BLOCK_SIZE_4X4];
-    CGU_FLOAT ALIGN_16 PrjErr[BLOCK_SIZE_4X4];
-    CGU_FLOAT ALIGN_16 RmpIndxs[BLOCK_SIZE_4X4];
-
-    CGU_Vec3f LineDirG;
-    CGU_Vec3f LineDir;
-    CGU_FLOAT LineDir0[NUM_CHANNELS];
-    CGU_Vec3f BlkUV[BLOCK_SIZE_4X4];
-    CGU_Vec3f BlkSh[BLOCK_SIZE_4X4];
-    CGU_Vec3f Mdl;
-
-    CGU_Vec3f  rsltC0;
-    CGU_Vec3f  rsltC1;
-    CGU_Vec3f  PosG0 = {0.0f, 0.0f, 0.0f};
-    CGU_Vec3f  PosG1 = {0.0f, 0.0f, 0.0f};
-    CGU_UINT32 i;
-
-    for (i = 0; i < dwUniqueColors; i++)
-    {
-        BlkUV[i] = BlkInBGRf_UV[i];
-    }
-
-    // if not more then 2 different colors, we've done
-    if (dwUniqueColors <= 2)
-    {
-        rsltC0 = BlkInBGRf_UV[0] * 255.0f;
-        rsltC1 = BlkInBGRf_UV[dwUniqueColors - 1] * 255.0f;
-    }
-    else
-    {
-        //    This is our first attempt to find an axis we will go along.
-        //    The cumulation is done to find a line minimizing the MSE from the
-        //    input 3D points.
-
-        //    While trying to find the axis we found that the diameter of the input
-        //    set is quite small. Do not bother.
-
-        // FindAxisIsSmall(BlkSh, LineDir0, Mdl, Blk, Rpt,dwUniqueColors);
-        {
-            CGU_UINT32 ii;
-            CGU_UINT32 jj;
-            CGU_UINT32 kk;
-
-            // These vars cannot be Vec3 as index to them are varying
-            CGU_FLOAT Crrl[NUM_CHANNELS];
-            CGU_FLOAT RGB2[NUM_CHANNELS];
-
-            LineDir0[0] = LineDir0[1] = LineDir0[2] = RGB2[0] = RGB2[1] = RGB2[2] = Crrl[0] = Crrl[1] = Crrl[2] = Mdl.x = Mdl.y = Mdl.z = 0.f;
-
-            // sum position of all points
-            CGU_FLOAT fNumPoints = 0.0f;
-            for (ii = 0; ii < dwUniqueColors; ii++)
-            {
-                Mdl.x += BlkUV[ii].x * Rpt[ii];
-                Mdl.y += BlkUV[ii].y * Rpt[ii];
-                Mdl.z += BlkUV[ii].z * Rpt[ii];
-                fNumPoints += Rpt[ii];
-            }
-
-            // and then average to calculate center coordinate of block
-            Mdl /= fNumPoints;
-
-            for (ii = 0; ii < dwUniqueColors; ii++)
-            {
-                // calculate output block as offsets around block center
-                BlkSh[ii] = BlkUV[ii] - Mdl;
-
-                // compute correlation matrix
-                // RGB2 = sum of ((distance from point from center) squared)
-                RGB2[0] += BlkSh[ii].x * BlkSh[ii].x * Rpt[ii];
-                RGB2[1] += BlkSh[ii].y * BlkSh[ii].y * Rpt[ii];
-                RGB2[2] += BlkSh[ii].z * BlkSh[ii].z * Rpt[ii];
-
-                Crrl[0] += BlkSh[ii].x * BlkSh[ii].y * Rpt[ii];
-                Crrl[1] += BlkSh[ii].y * BlkSh[ii].z * Rpt[ii];
-                Crrl[2] += BlkSh[ii].z * BlkSh[ii].x * Rpt[ii];
-            }
-
-            // if set's diameter is small
-            CGU_UINT32 i0 = 0, i1 = 1;
-            CGU_FLOAT  mxRGB2 = 0.0f;
-
-            CGU_FLOAT fEPS = fNumPoints * EPS;
-            for (kk = 0, jj = 0; jj < 3; jj++)
-            {
-                if (RGB2[jj] >= fEPS)
-                    kk++;
-                else
-                    RGB2[jj] = 0.0f;
-
-                if (mxRGB2 < RGB2[jj])
-                {
-                    mxRGB2 = RGB2[jj];
-                    i0     = jj;
-                }
-            }
-
-            CGU_FLOAT fEPS2 = fNumPoints * EPS2;
-            CGU_BOOL  AxisIsSmall;
-
-            AxisIsSmall = (RGB2[0] < fEPS2);
-            AxisIsSmall = AxisIsSmall && (RGB2[1] < fEPS2);
-            AxisIsSmall = AxisIsSmall && (RGB2[2] < fEPS2);
-
-            // all are very small to avoid division on the small determinant
-            if (AxisIsSmall)
-            {
-                rsltC0 = BlkInBGRf_UV[0] * 255.0f;
-                rsltC1 = BlkInBGRf_UV[dwUniqueColors - 1] * 255.0f;
-            }
-            else
-            {
-                // !AxisIsSmall
-                if (kk == 1)  // really only 1 dimension
-                    LineDir0[i0] = 1.;
-                else if (kk == 2)
-                {  // really only 2 dimensions
-                    i1            = (RGB2[(i0 + 1) % 3] > 0.f) ? (i0 + 1) % 3 : (i0 + 2) % 3;
-                    CGU_FLOAT Crl = (i1 == (i0 + 1) % 3) ? Crrl[i0] : Crrl[(i0 + 2) % 3];
-                    LineDir0[i1]  = Crl / RGB2[i0];
-                    LineDir0[i0]  = 1.;
-                }
-                else
-                {
-                    CGU_FLOAT maxDet = 100000.f;
-                    CGU_FLOAT Cs[3];
-                    // select max det for precision
-                    for (jj = 0; jj < 3; jj++)
-                    {
-                        // 3 = nDimensions
-                        CGU_FLOAT Det = RGB2[jj] * RGB2[(jj + 1) % 3] - Crrl[jj] * Crrl[jj];
-                        Cs[jj]        = cmp_fabs(Crrl[jj] / sqrt(RGB2[jj] * RGB2[(jj + 1) % 3]));
-                        if (maxDet < Det)
-                        {
-                            maxDet = Det;
-                            i0     = jj;
-                        }
-                    }
-
-                    // inverse correl matrix
-                    //  --      --       --      --
-                    //  |  A   B |       |  C  -B |
-                    //  |  B   C |  =>   | -B   A |
-                    //  --      --       --     --
-                    CGU_FLOAT mtrx1[2][2];
-                    CGU_FLOAT vc1[2];
-                    CGU_FLOAT vc[2];
-                    vc1[0] = Crrl[(i0 + 2) % 3];
-                    vc1[1] = Crrl[(i0 + 1) % 3];
-                    // C
-                    mtrx1[0][0] = RGB2[(i0 + 1) % 3];
-                    // A
-                    mtrx1[1][1] = RGB2[i0];
-                    // -B
-                    mtrx1[1][0] = mtrx1[0][1] = -Crrl[i0];
-                    // find a solution
-                    vc[0] = mtrx1[0][0] * vc1[0] + mtrx1[0][1] * vc1[1];
-                    vc[1] = mtrx1[1][0] * vc1[0] + mtrx1[1][1] * vc1[1];
-                    // normalize
-                    vc[0] /= maxDet;
-                    vc[1] /= maxDet;
-                    // find a line direction vector
-                    LineDir0[i0]           = 1.;
-                    LineDir0[(i0 + 1) % 3] = 1.;
-                    LineDir0[(i0 + 2) % 3] = vc[0] + vc[1];
-                }
-
-                // normalize direction vector
-                CGU_FLOAT Len = LineDir0[0] * LineDir0[0] + LineDir0[1] * LineDir0[1] + LineDir0[2] * LineDir0[2];
-                Len           = sqrt(Len);
-
-                LineDir0[0] = (Len > 0.f) ? LineDir0[0] / Len : 0.0f;
-                LineDir0[1] = (Len > 0.f) ? LineDir0[1] / Len : 0.0f;
-                LineDir0[2] = (Len > 0.f) ? LineDir0[2] / Len : 0.0f;
-            }
-        }  // FindAxisIsSmall
-
-        // GCC is being an awful being when it comes to goto-jumps.
-        // So please bear with this.
-        CGU_FLOAT          ErrG = 10000000.f;
-        CGU_FLOAT          PrjBnd0;
-        CGU_FLOAT          PrjBnd1;
-        CGU_FLOAT ALIGN_16 PreMRep[BLOCK_SIZE_4X4];
-
-        LineDir.x = LineDir0[0];
-        LineDir.y = LineDir0[1];
-        LineDir.z = LineDir0[2];
-
-        //    Here is the main loop.
-        //    1. Project input set on the axis in consideration.
-        //    2. Run 1 dimensional search (see scalar case) to find an (sub) optimal pair of end points.
-        //    3. Compute the vector of indexes (or clusters) for the current approximate ramp.
-        //    4. Present our color channels as 3 16DIM vectors.
-        //    5. Find closest approximation of each of 16DIM color vector with the projection of the 16DIM index vector.
-        //    6. Plug the projections as a new directional vector for the axis.
-        //    7. Goto 1.
-        //    D - is 16 dim "index" vector (or 16 DIM vector of indexes - {0, 1/3,2/3, 0, ...,}, but shifted and normalized).
-        //    Ci - is a 16 dim vector of color i. for each Ci find a scalar Ai such that (Ai * D - Ci) (Ai * D - Ci) -> min ,
-        //         i.e distance between vector AiD and C is min. You can think of D as a unit interval(vector) "clusterizer", and Ai is a scale
-        //         you need to apply to the clusterizer to approximate the Ci vector instead of the unit vector.
-        //    Solution is
-        //    Ai = (D . Ci) / (D . D); . - is a dot product.
-        //    in 3 dim space Ai(s) represent a line direction, along which
-        //    we again try to find (sub)optimal quantizer.
-        //    That's what our for(;;) loop is about.
-        for (;;)
-        {
-            //  1. Project input set on the axis in consideration.
-            // From Foley & Van Dam: Closest point of approach of a line (P + v) to a
-            // point (R) is
-            //                            P + ((R-P).v) / (v.v))v
-            // The distance along v is therefore (R-P).v / (v.v)
-            // (v.v) is 1 if v is a unit vector.
-            //
-            PrjBnd0 = 1000.0f;
-            PrjBnd1 = -1000.0f;
-            for (i = 0; i < BLOCK_SIZE_4X4; i++)
-                Prj0[i] = Prj[i] = PrjErr[i] = PreMRep[i] = 0.f;
-
-            for (i = 0; i < dwUniqueColors; i++)
-            {
-                Prj0[i] = Prj[i] = dot(BlkSh[i], LineDir);
-                PrjErr[i]        = dot(BlkSh[i] - LineDir * Prj[i], BlkSh[i] - LineDir * Prj[i]);
-                PrjBnd0          = min(PrjBnd0, Prj[i]);
-                PrjBnd1          = max(PrjBnd1, Prj[i]);
-            }
-
-            //  2. Run 1 dimensional search (see scalar case) to find an (sub) optimal
-            //  pair of end points.
-
-            // min and max of the search interval
-            CGU_FLOAT Scl0;
-            CGU_FLOAT Scl1;
-            Scl0 = PrjBnd0 - (PrjBnd1 - PrjBnd0) * 0.125f;
-            Scl1 = PrjBnd1 + (PrjBnd1 - PrjBnd0) * 0.125f;
-
-            // compute scaling factor to scale down the search interval to [0.,1]
-            const CGU_FLOAT Scl2    = (Scl1 - Scl0) * (Scl1 - Scl0);
-            const CGU_FLOAT overScl = 1.f / (Scl1 - Scl0);
-
-            for (i = 0; i < dwUniqueColors; i++)
-            {
-                // scale them
-                Prj[i] = (Prj[i] - Scl0) * overScl;
-                // premultiply the scale square to plug into error computation later
-                PreMRep[i] = Rpt[i] * Scl2;
-            }
-
-            // scale first approximation of end points
-            PrjBnd0 = (PrjBnd0 - Scl0) * overScl;
-            PrjBnd1 = (PrjBnd1 - Scl0) * overScl;
-
-            CGU_FLOAT StepErr = MAX_ERROR;
-
-            // search step
-            CGU_FLOAT searchStep = 0.025f;
-
-            // low Start/End; high Start/End
-            const CGU_FLOAT lowStartEnd  = (PrjBnd0 - 2.f * searchStep > 0.f) ? PrjBnd0 - 2.f * searchStep : 0.f;
-            const CGU_FLOAT highStartEnd = (PrjBnd1 + 2.f * searchStep < 1.f) ? PrjBnd1 + 2.f * searchStep : 1.f;
-
-            // find the best endpoints
-            CGU_FLOAT Pos0 = 0;
-            CGU_FLOAT Pos1 = 0;
-            CGU_FLOAT lowPosStep, highPosStep;
-            CGU_FLOAT err;
-
-            int l, h;
-            for (l = 0, lowPosStep = lowStartEnd; l < 8; l++, lowPosStep += searchStep)
-            {
-                for (h = 0, highPosStep = highStartEnd; h < 8; h++, highPosStep -= searchStep)
-                {
-                    // compute an error for the current pair of end points.
-                    err = cgu_getRampErr(Prj, PrjErr, PreMRep, StepErr, lowPosStep, highPosStep, dwUniqueColors);
-
-                    if (err < StepErr)
-                    {
-                        // save better result
-                        StepErr = err;
-                        Pos0    = lowPosStep;
-                        Pos1    = highPosStep;
-                    }
-                }
-            }
-
-            // inverse the scaling
-            Pos0 = Pos0 * (Scl1 - Scl0) + Scl0;
-            Pos1 = Pos1 * (Scl1 - Scl0) + Scl0;
-
-            // did we find somthing better from the previous run?
-            if (StepErr + 0.001 < ErrG)
-            {
-                // yes, remember it
-                ErrG     = StepErr;
-                LineDirG = LineDir;
-
-                PosG0.x = Pos0;
-                PosG0.y = Pos0;
-                PosG0.z = Pos0;
-                PosG1.x = Pos1;
-                PosG1.y = Pos1;
-                PosG1.z = Pos1;
-
-                //  3. Compute the vector of indexes (or clusters) for the current
-                //  approximate ramp.
-                // indexes
-                const CGU_FLOAT step      = (Pos1 - Pos0) / 3.0f;  // (dwNumChannels=4 - 1);
-                const CGU_FLOAT step_h    = step * (CGU_FLOAT)0.5;
-                const CGU_FLOAT rstep     = (CGU_FLOAT)1.0f / step;
-                const CGU_FLOAT overBlkTp = 1.f / 3.0f;  // (dwNumChannels=4 - 1);
-
-                // here the index vector is computed,
-                // shifted and normalized
-                CGU_FLOAT indxAvrg = 3.0f / 2.0f;  // (dwNumChannels=4 - 1);
-
-                for (i = 0; i < dwUniqueColors; i++)
-                {
-                    CGU_FLOAT del;
-                    // CGU_UINT32 n = (CGU_UINT32)((b - _min_ex + (step*0.5f)) * rstep);
-                    if ((del = Prj0[i] - Pos0) <= 0)
-                        RmpIndxs[i] = 0.f;
-                    else if (Prj0[i] - Pos1 >= 0)
-                        RmpIndxs[i] = 3.0f;  // (dwNumChannels=4 - 1);
-                    else
-                        RmpIndxs[i] = cmp_floor((del + step_h) * rstep);
-                    // shift and normalization
-                    RmpIndxs[i] = (RmpIndxs[i] - indxAvrg) * overBlkTp;
-                }
-
-                //  4. Present our color channels as 3 16 DIM vectors.
-                //  5. Find closest aproximation of each of 16DIM color vector with the
-                //  pojection of the 16DIM index vector.
-                CGU_Vec3f Crs = {0.0f, 0.0f, 0.0f};
-                CGU_FLOAT Len = 0.0f;
-
-                for (i = 0; i < dwUniqueColors; i++)
-                {
-                    const CGU_FLOAT PreMlt = RmpIndxs[i] * Rpt[i];
-                    Len += RmpIndxs[i] * PreMlt;
-                    Crs.x += BlkSh[i].x * PreMlt;
-                    Crs.y += BlkSh[i].y * PreMlt;
-                    Crs.z += BlkSh[i].z * PreMlt;
-                }
-
-                LineDir.x = LineDir.y = LineDir.z = 0.0f;
-                if (Len > 0.0f)
-                {
-                    CGU_FLOAT Len2;
-                    LineDir = Crs / Len;
-                    //  6. Plug the projections as a new directional vector for the axis.
-                    //  7. Goto 1.
-                    Len2 = dot(LineDir, LineDir);  // LineDir.x * LineDir.x + LineDir.y * LineDir.y + LineDir.z * LineDir.z;
-                    Len2 = sqrt(Len2);
-                    LineDir /= Len2;
-                }
-            }
-            else  // We was not able to find anything better.  Drop out.
-                break;
-        }
-
-        // inverse transform to find end-points of 3-color ramp
-        rsltC0 = (PosG0 * LineDirG + Mdl) * 255.f;
-        rsltC1 = (PosG1 * LineDirG + Mdl) * 255.f;
-    }  // !isDone
-
-    // We've dealt with (almost) unrestricted full precision realm.
-    // Now back digital world.
-
-    // round the end points to make them look like compressed ones
-    CGU_Vec3f inpRmpEndPts0 = {0.0f, 255.0f, 0.0f};
-    CGU_Vec3f inpRmpEndPts1 = {0.0f, 255.0f, 0.0f};
-    CGU_Vec3f Fctrs0        = {8.0f, 4.0f, 8.0f};     //(1 << (PIX_GRID - BG)); x (1 << (PIX_GRID - GG)); y (1 << (PIX_GRID - RG)); z
-    CGU_Vec3f Fctrs1        = {32.0f, 64.0f, 32.0f};  //(CGU_FLOAT)(1 << RG); z (CGU_FLOAT)(1 << GG); y (CGU_FLOAT)(1 << BG); x
-    CGU_FLOAT _Min          = 0.0f;
-    CGU_FLOAT _Max          = 255.0f;
-
-    {
-        // MkRmpOnGrid(inpRmpEndPts, rsltC, _Min, _Max);
-
-        inpRmpEndPts0 = cmp_floorVec3f(rsltC0);
-
-        if (inpRmpEndPts0.x <= _Min)
-            inpRmpEndPts0.x = _Min;
-        else
-        {
-            inpRmpEndPts0.x += cmp_floor(128.f / Fctrs1.x) - cmp_floor(inpRmpEndPts0.x / Fctrs1.x);
-            inpRmpEndPts0.x = min(inpRmpEndPts0.x, _Max);
-        }
-        if (inpRmpEndPts0.y <= _Min)
-            inpRmpEndPts0.y = _Min;
-        else
-        {
-            inpRmpEndPts0.y += cmp_floor(128.f / Fctrs1.y) - cmp_floor(inpRmpEndPts0.y / Fctrs1.y);
-            inpRmpEndPts0.y = min(inpRmpEndPts0.y, _Max);
-        }
-        if (inpRmpEndPts0.z <= _Min)
-            inpRmpEndPts0.z = _Min;
-        else
-        {
-            inpRmpEndPts0.z += cmp_floor(128.f / Fctrs1.z) - cmp_floor(inpRmpEndPts0.z / Fctrs1.z);
-            inpRmpEndPts0.z = min(inpRmpEndPts0.z, _Max);
-        }
-
-        inpRmpEndPts0 = cmp_floorVec3f(inpRmpEndPts0 / Fctrs0) * Fctrs0;
-
-        inpRmpEndPts1 = cmp_floorVec3f(rsltC1);
-        if (inpRmpEndPts1.x <= _Min)
-            inpRmpEndPts1.x = _Min;
-        else
-        {
-            inpRmpEndPts1.x += cmp_floor(128.f / Fctrs1.x) - cmp_floor(inpRmpEndPts1.x / Fctrs1.x);
-            inpRmpEndPts1.x = min(inpRmpEndPts1.x, _Max);
-        }
-        if (inpRmpEndPts1.y <= _Min)
-            inpRmpEndPts1.y = _Min;
-        else
-        {
-            inpRmpEndPts1.y += cmp_floor(128.f / Fctrs1.y) - cmp_floor(inpRmpEndPts1.y / Fctrs1.y);
-            inpRmpEndPts1.y = min(inpRmpEndPts1.y, _Max);
-        }
-        if (inpRmpEndPts1.z <= _Min)
-            inpRmpEndPts1.z = _Min;
-        else
-        {
-            inpRmpEndPts1.z += cmp_floor(128.f / Fctrs1.z) - cmp_floor(inpRmpEndPts1.z / Fctrs1.z);
-            inpRmpEndPts1.z = min(inpRmpEndPts1.z, _Max);
-        }
-
-        inpRmpEndPts1 = cmp_floorVec3f(inpRmpEndPts1 / Fctrs0) * Fctrs0;
-    }  // MkRmpOnGrid
-
-    CMP_EndPoints EndPoints;
-    EndPoints.Color0 = inpRmpEndPts0;
-    EndPoints.Color1 = inpRmpEndPts1;
-
-    return EndPoints;
-}
-
-CMP_STATIC CMP_EndPoints
-cgu_MkRmpOnGridBGR(CMP_IN CGU_Vec3f rsltC0, CMP_IN CGU_Vec3f rsltC1, CMP_IN CGU_UINT32 nRedBits, CMP_IN CGU_UINT32 nGreenBits, CMP_IN CGU_UINT32 nBlueBits)
-{
-    CGU_Vec3f inpRmpEndPts0 = {0.0f, 255.0f, 0.0f};
-    CGU_Vec3f inpRmpEndPts1 = {0.0f, 255.0f, 0.0f};
-    CGU_Vec3f Fctrs0        = {8.0f, 4.0f, 8.0f};
-    CGU_Vec3f Fctrs1        = {32.0f, 64.0f, 32.0f};
-    CGU_FLOAT _Min          = 0.0f;
-    CGU_FLOAT _Max          = 255.0f;
-
-    // user override 565 default setting
-    if ((nRedBits != 5) || (nGreenBits != 6) || (nBlueBits != 5))
-    {
-        Fctrs1[RC] = (CGU_FLOAT)(1 << nRedBits);
-        Fctrs1[GC] = (CGU_FLOAT)(1 << nGreenBits);
-        Fctrs1[BC] = (CGU_FLOAT)(1 << nBlueBits);
-        Fctrs0[RC] = (CGU_FLOAT)(1 << (PIX_GRID - nRedBits));
-        Fctrs0[GC] = (CGU_FLOAT)(1 << (PIX_GRID - nGreenBits));
-        Fctrs0[BC] = (CGU_FLOAT)(1 << (PIX_GRID - nBlueBits));
-    }
-
-    inpRmpEndPts0 = cmp_floorVec3f(rsltC0);
-
-    if (inpRmpEndPts0.x <= _Min)
-        inpRmpEndPts0.x = _Min;
-    else
-    {
-        inpRmpEndPts0.x += cmp_floor(128.f / Fctrs1.x) - cmp_floor(inpRmpEndPts0.x / Fctrs1.x);
-        inpRmpEndPts0.x = cmp_minf(inpRmpEndPts0.x, _Max);
-    }
-    if (inpRmpEndPts0.y <= _Min)
-        inpRmpEndPts0.y = _Min;
-    else
-    {
-        inpRmpEndPts0.y += cmp_floor(128.f / Fctrs1.y) - cmp_floor(inpRmpEndPts0.y / Fctrs1.y);
-        inpRmpEndPts0.y = cmp_minf(inpRmpEndPts0.y, _Max);
-    }
-    if (inpRmpEndPts0.z <= _Min)
-        inpRmpEndPts0.z = _Min;
-    else
-    {
-        inpRmpEndPts0.z += cmp_floor(128.f / Fctrs1.z) - cmp_floor(inpRmpEndPts0.z / Fctrs1.z);
-        inpRmpEndPts0.z = cmp_minf(inpRmpEndPts0.z, _Max);
-    }
-
-    inpRmpEndPts0 = cmp_floorVec3f(inpRmpEndPts0 / Fctrs0) * Fctrs0;
-
-    inpRmpEndPts1 = cmp_floorVec3f(rsltC1);
-    if (inpRmpEndPts1.x <= _Min)
-        inpRmpEndPts1.x = _Min;
-    else
-    {
-        inpRmpEndPts1.x += cmp_floor(128.f / Fctrs1.x) - cmp_floor(inpRmpEndPts1.x / Fctrs1.x);
-        inpRmpEndPts1.x = cmp_minf(inpRmpEndPts1.x, _Max);
-    }
-    if (inpRmpEndPts1.y <= _Min)
-        inpRmpEndPts1.y = _Min;
-    else
-    {
-        inpRmpEndPts1.y += cmp_floor(128.f / Fctrs1.y) - cmp_floor(inpRmpEndPts1.y / Fctrs1.y);
-        inpRmpEndPts1.y = cmp_minf(inpRmpEndPts1.y, _Max);
-    }
-    if (inpRmpEndPts1.z <= _Min)
-        inpRmpEndPts1.z = _Min;
-    else
-    {
-        inpRmpEndPts1.z += cmp_floor(128.f / Fctrs1.z) - cmp_floor(inpRmpEndPts1.z / Fctrs1.z);
-        inpRmpEndPts1.z = cmp_minf(inpRmpEndPts1.z, _Max);
-    }
-
-    inpRmpEndPts1 = cmp_floorVec3f(inpRmpEndPts1 / Fctrs0) * Fctrs0;
-
-    CMP_EndPoints EndPoints;
-    EndPoints.Color0 = inpRmpEndPts0;
-    EndPoints.Color1 = inpRmpEndPts1;
-
-    return EndPoints;
-
-}  // MkRmpOnGrid
-
-//===================================================================
-// Replaces CompressBlockBC1_RGBA_Internal()
-// if ((errLQ > 0.0f) && (fquality > CMP_QUALITY2)) code block
-//===================================================================
-CMP_STATIC CGU_Vec2ui cgu_CompRGBBlock(CMP_IN CGU_Vec4f src_imageNorm[BLOCK_SIZE_4X4], CMP_IN CMP_BC15Options BC15Options)
-{
-    //CGU_FLOAT  errLQ    = 1e6f;
-    CGU_UINT32 m_nRefinementSteps = BC15Options.m_nRefinementSteps;
-    CGU_UINT32 dwAlphaThreshold   = BC15Options.m_nAlphaThreshold;
-    CGU_Vec3f  channelWeights     = {BC15Options.m_fChannelWeights[0], BC15Options.m_fChannelWeights[1], BC15Options.m_fChannelWeights[2]};
-    CGU_BOOL   isSRGB             = BC15Options.m_bIsSRGB;
-
-    CGU_Vec3f  rgbBlock_normal[BLOCK_SIZE_4X4];
-    CGU_UINT32 nCmpIndices = 0;
-    CGU_UINT32 c0, c1;
-    // High Quality
-    CMP_EndPoints EndPoints = {{0, 0, 0xFF}, {0, 0, 0xFF}};
-    CGU_UINT32    i;
-
-    CGU_FLOAT ALIGN_16 Rpt[BLOCK_SIZE_4X4];
-    CGU_UINT32         pcIndices = 0;
-
-    m_nRefinementSteps = 0;
-
-    CGU_Vec3f BlkInBGRf_UV[BLOCK_SIZE_4X4];  // Normalized Block Input (0..1) in BGR channel format
-    // Default inidices & endpoints for Transparent Block
-    CGU_Vec3ui nEndpoints0 = {0, 0, 0};           // Endpoints are stored BGR as x,y,z
-    CGU_Vec3ui nEndpoints1 = {0xFF, 0xFF, 0xFF};  // Endpoints are stored BGR as x,y,z
-
-    for (i = 0; i < BLOCK_SIZE_4X4; i++)
-    {
-        Rpt[i] = 0.0f;
-    }
-
-    //===============================================================
-    // Check if we have more then 2 colors and process Alpha block
-    CGU_UINT32 dwColors = 0;
-    CGU_UINT32 dwBlk[BLOCK_SIZE_4X4];
-    CGU_UINT32 R, G, B, A;
-    for (i = 0; i < BLOCK_SIZE_4X4; i++)
-    {
-        // Do any color conversion prior to processing the block
-        rgbBlock_normal[i] = isSRGB ? cmp_linearToSrgb(src_imageNorm[i].rgb) : src_imageNorm[i].rgb;
-
-        R = (CGU_UINT32)(rgbBlock_normal[i].x * 255.0f);
-        G = (CGU_UINT32)(rgbBlock_normal[i].y * 255.0f);
-        B = (CGU_UINT32)(rgbBlock_normal[i].z * 255.0f);
-
-        //if (dwAlphaThreshold > 0)
-        //    A = (CGU_UINT32)src_imageNorm[i].w * 255.0f;
-        //else
-        A = 255;
-
-        // Punch Through Alpha in BC1 Codec (1 bit alpha)
-        //if ((dwAlphaThreshold == 0) || (A >= dwAlphaThreshold))
-        //{
-        // copy to local RGB data and have alpha set to 0xFF
-        dwBlk[dwColors++] = A << 24 | R << 16 | G << 8 | B;
-        //}
-    }
-
-    if (!dwColors)
-    {
-        // All are colors transparent
-        EndPoints.Color0.x = EndPoints.Color0.y = EndPoints.Color0.z = 0.0f;
-        EndPoints.Color1.x = EndPoints.Color1.y = EndPoints.Color0.z = 255.0f;
-        nCmpIndices                                                  = 0xFFFFFFFF;
-    }
-    else
-    {
-        // We have colors to process
-        nCmpIndices = 0;
-        // Punch Through Alpha Support ToDo
-        // CGU_BOOL bHasAlpha = (dwColors != BLOCK_SIZE_4X4);
-        // bHasAlpha = bHasAlpha && (dwAlphaThreshold > 0); // valid for  (dwNumChannels=4);
-        // if (bHasAlpha) {
-        //      CGU_Vec2ui  compBlock = {0xf800f800,0};
-        //     return compBlock;
-        // }
-
-        // Here we are computing an unique number of sorted colors.
-        // For each unique value we compute the number of it appearences.
-        // qsort((void *)dwBlk, (size_t)dwColors, sizeof(CGU_UINT32), QSortIntCmp);
-        {
-            CGU_UINT32 j;
-            CMP_di     what[BLOCK_SIZE_4X4];
-
-            for (i = 0; i < dwColors; i++)
-            {
-                what[i].index = i;
-                what[i].data  = dwBlk[i];
-            }
-
-            CGU_UINT32 tmp_index;
-            CGU_UINT32 tmp_data;
-
-            for (i = 1; i < dwColors; i++)
-            {
-                for (j = i; j > 0; j--)
-                {
-                    if (what[j - 1].data > what[j].data)
-                    {
-                        tmp_index         = what[j].index;
-                        tmp_data          = what[j].data;
-                        what[j].index     = what[j - 1].index;
-                        what[j].data      = what[j - 1].data;
-                        what[j - 1].index = tmp_index;
-                        what[j - 1].data  = tmp_data;
-                    }
-                }
-            }
-            for (i = 0; i < dwColors; i++)
-                dwBlk[i] = what[i].data;
-        }
-        CGU_UINT32 new_p;
-        CGU_UINT32 dwBlkU[BLOCK_SIZE_4X4];
-        CGU_UINT32 dwUniqueColors = 0;
-        new_p = dwBlkU[0]   = dwBlk[0];
-        Rpt[dwUniqueColors] = 1.f;
-        for (i = 1; i < dwColors; i++)
-        {
-            if (new_p != dwBlk[i])
-            {
-                dwUniqueColors++;
-                new_p = dwBlkU[dwUniqueColors] = dwBlk[i];
-                Rpt[dwUniqueColors]            = 1.f;
-            }
-            else
-                Rpt[dwUniqueColors] += 1.f;
-        }
-        dwUniqueColors++;
-
-        // Simple case of only 2 colors to process
-        // no need for futher processing as lowest quality methods work best for this case
-        if (dwUniqueColors <= 2)
-        {
-            CGU_Vec3f rsltC0;
-            CGU_Vec3f rsltC1;
-            rsltC0.r  = rgbBlock_normal[0].b * 255.0f;
-            rsltC0.g  = rgbBlock_normal[0].g * 255.0f;
-            rsltC0.b  = rgbBlock_normal[0].r * 255.0f;
-            rsltC1.r  = rgbBlock_normal[dwUniqueColors - 1].b * 255.0f;
-            rsltC1.g  = rgbBlock_normal[dwUniqueColors - 1].g * 255.0f;
-            rsltC1.b  = rgbBlock_normal[dwUniqueColors - 1].r * 255.0f;
-            EndPoints = cgu_MkRmpOnGridBGR(rsltC0, rsltC1, 5, 6, 5);
-        }
-        else
-        {
-            // switch from int range back to UV floats
-            for (i = 0; i < dwUniqueColors; i++)
-            {
-                R                 = (dwBlkU[i] >> 16) & 0xff;
-                G                 = (dwBlkU[i] >> 8) & 0xff;
-                B                 = (dwBlkU[i] >> 0) & 0xff;
-                BlkInBGRf_UV[i].z = (CGU_FLOAT)R / 255.0f;
-                BlkInBGRf_UV[i].y = (CGU_FLOAT)G / 255.0f;
-                BlkInBGRf_UV[i].x = (CGU_FLOAT)B / 255.0f;
-            }
-
-            CGU_Vec3f channelWeightsBGR;
-            channelWeightsBGR.x = channelWeights.z;
-            channelWeightsBGR.y = channelWeights.y;
-            channelWeightsBGR.z = channelWeights.x;
-
-            EndPoints = cgu_CompressRGBBlockX(BlkInBGRf_UV, Rpt, dwUniqueColors, channelWeightsBGR, m_nRefinementSteps);
-        }
-    }  // colors
-
-    //===================================================================
-    // Process Cluster INPUT is constant EndPointsf OUTPUT is pcIndices
-    //===================================================================
-    if (nCmpIndices == 0)
-    {
-        R                  = (CGU_UINT32)(EndPoints.Color0.z);
-        G                  = (CGU_UINT32)(EndPoints.Color0.y);
-        B                  = (CGU_UINT32)(EndPoints.Color0.x);
-        CGU_INT32 cluster0 = cmp_constructColor(R, G, B);
-
-        R                  = (CGU_UINT32)(EndPoints.Color1.z);
-        G                  = (CGU_UINT32)(EndPoints.Color1.y);
-        B                  = (CGU_UINT32)(EndPoints.Color1.x);
-        CGU_INT32 cluster1 = cmp_constructColor(R, G, B);
-
-        CGU_Vec3f InpRmp[NUM_ENDPOINTS];
-        if ((cluster0 <= cluster1)  // valid for 4 channels
-                                    // || (cluster0 > cluster1)    // valid for 3 channels
-        )
-        {
-            // inverse endpoints
-            InpRmp[0] = EndPoints.Color1;
-            InpRmp[1] = EndPoints.Color0;
-        }
-        else
-        {
-            InpRmp[0] = EndPoints.Color0;
-            InpRmp[1] = EndPoints.Color1;
-        }
-
-        CGU_Vec3f srcblockBGR[BLOCK_SIZE_4X4];
-        CGU_FLOAT srcblockA[BLOCK_SIZE_4X4];
-
-        // Swizzle the source RGB to BGR for processing
-        for (i = 0; i < BLOCK_SIZE_4X4; i++)
-        {
-            srcblockBGR[i].z = rgbBlock_normal[i].x * 255.0f;
-            srcblockBGR[i].y = rgbBlock_normal[i].y * 255.0f;
-            srcblockBGR[i].x = rgbBlock_normal[i].z * 255.0f;
-            srcblockA[i]     = 255.0f;
-            if (dwAlphaThreshold > 0)
-            {
-                CGU_UINT32 alpha = (CGU_UINT32)src_imageNorm[i].w * 255.0f;
-                if (alpha >= dwAlphaThreshold)
-                    srcblockA[i] = alpha;
-            }
-        }
-
-        // input ramp is on the coarse grid
-        // make ramp endpoints the way they'll going to be decompressed
-        CGU_Vec3f InpRmpL[NUM_ENDPOINTS];
-        CGU_Vec3f Fctrs = {32.0F, 64.0F, 32.0F};  // 1 << RG,1 << GG,1 << BG
-
-        {
-            //   ConstantRamp = MkWkRmpPts(InpRmpL, InpRmp);
-            InpRmpL[0] = InpRmp[0] + cmp_floorVec3f(InpRmp[0] / Fctrs);
-            InpRmpL[0] = cmp_clampVec3f(InpRmpL[0], 0.0f, 255.0f);
-            InpRmpL[1] = InpRmp[1] + cmp_floorVec3f(InpRmp[1] / Fctrs);
-            InpRmpL[1] = cmp_clampVec3f(InpRmpL[1], 0.0f, 255.0f);
-        }  // MkWkRmpPts
-
-        // build ramp
-        CGU_Vec3f LerpRmp[4];
-        CGU_Vec3f offset = {1.0f, 1.0f, 1.0f};
-        {
-            //BldRmp(Rmp, InpRmpL, dwNumChannels);
-            // linear interpolate end points to get the ramp
-            LerpRmp[0] = InpRmpL[0];
-            LerpRmp[3] = InpRmpL[1];
-            LerpRmp[1] = cmp_floorVec3f((InpRmpL[0] * 2.0f + LerpRmp[3] + offset) / 3.0f);
-            LerpRmp[2] = cmp_floorVec3f((InpRmpL[0] + LerpRmp[3] * 2.0f + offset) / 3.0f);
-        }  // BldRmp
-
-        //=========================================================================
-        // Clusterize, Compute error and find DXTC indexes for the current cluster
-        //=========================================================================
-        {
-            // Clusterize
-            CGU_UINT32 alpha;
-
-            // For each colour in the original block assign it
-            // to the closest cluster and compute the cumulative error
-            for (i = 0; i < BLOCK_SIZE_4X4; i++)
-            {
-                alpha = (CGU_UINT32)srcblockA[i];
-                if ((dwAlphaThreshold > 0) && alpha == 0)
-                {                                      //*((CGU_DWORD *)&_Blk[i][AC]) == 0)
-                    pcIndices |= cmp_set2Bit32(4, i);  // dwNumChannels 3 or 4 (default is 4)
-                }
-                else
-                {
-                    CGU_FLOAT shortest      = 99999999999.f;
-                    CGU_UINT8 shortestIndex = 0;
-
-                    CGU_Vec3f channelWeightsBGR;
-                    channelWeightsBGR.x = channelWeights.z;
-                    channelWeightsBGR.y = channelWeights.y;
-                    channelWeightsBGR.z = channelWeights.x;
-
-                    for (CGU_UINT8 rampindex = 0; rampindex < 4; rampindex++)
-                    {
-                        // r is either 1 or 4
-                        // calculate the distance for each component
-                        CGU_FLOAT distance =
-                            dot(((srcblockBGR[i] - LerpRmp[rampindex]) * channelWeightsBGR), ((srcblockBGR[i] - LerpRmp[rampindex]) * channelWeightsBGR));
-                        if (distance < shortest)
-                        {
-                            shortest      = distance;
-                            shortestIndex = rampindex;
-                        }
-                    }
-
-                    // The total is a sum of (error += shortest)
-                    // We have the index of the best cluster, so assign this in the block
-                    // Reorder indices to match correct DXTC ordering
-                    if (shortestIndex == 3)  // dwNumChannels - 1
-                        shortestIndex = 1;
-                    else if (shortestIndex)
-                        shortestIndex++;
-                    pcIndices |= cmp_set2Bit32(shortestIndex, i);
-                }
-            }  // BLOCK_SIZE_4X4
-        }      // Clusterize
-    }          // Process Cluster
-
-    //==============================================================
-    // Generate Compressed Result from nEndpoints & pcIndices
-    //==============================================================
-    c0 = cmp_constructColorBGR(EndPoints.Color0);
-    c1 = cmp_constructColorBGR(EndPoints.Color1);
-
-    // Get Processed indices if not set
-    if (nCmpIndices == 0)
-        nCmpIndices = pcIndices;
-
-    CGU_Vec2ui cmpBlock;
-    if (c0 <= c1)
-    {
-        cmpBlock.x = c1 | (c0 << 16);
-    }
-    else
-        cmpBlock.x = c0 | (c1 << 16);
-
-    cmpBlock.y = nCmpIndices;
-
-    return cmpBlock;
-}
-
-CMP_STATIC void cgu_ProcessColors(CMP_INOUT CGU_Vec3f CMP_PTRINOUT  colorMin,
-                                  CMP_INOUT CGU_Vec3f CMP_PTRINOUT  colorMax,
-                                  CMP_INOUT CGU_UINT32 CMP_PTRINOUT c0,
-                                  CMP_INOUT CGU_UINT32 CMP_PTRINOUT c1,
-                                  CMP_IN CGU_INT                    setopt,
-                                  CMP_IN CGU_BOOL                   isSRGB)
-{
-    // CGU_UINT32 srbMap[32] = {0,5,8,11,12,13,14,15,16,17,18,19,20,21,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31};
-    // CGU_UINT32 sgMap[64]  = {0,10,14,16,19,20,22,24,25,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,42,43,43,44,45,45,
-    //                          46,47,47,48,48,49,50,50,51,52,52,53,53,54,54,55,55,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63};
-    CGU_INT32 x, y, z;
-    CGU_Vec3f scale = {31.0f, 63.0f, 31.0f};
-    CGU_Vec3f MinColorScaled;
-    CGU_Vec3f MaxColorScaled;
-
-    // Clamp or Transform is needed, the transforms have built in clamps
-    if (isSRGB)
-    {
-        MinColorScaled = cmp_linearToSrgb(CMP_PTRINOUT colorMin);
-        MaxColorScaled = cmp_linearToSrgb(CMP_PTRINOUT colorMax);
-    }
-    else
-    {
-        MinColorScaled = cmp_clampVec3f(CMP_PTRINOUT colorMin, 0.0f, 1.0f);
-        MaxColorScaled = cmp_clampVec3f(CMP_PTRINOUT colorMax, 0.0f, 1.0f);
-    }
-
-    switch (setopt)
-    {
-    case 0:  // Use Min Max processing
-        MinColorScaled        = cmp_floorVec3f(MinColorScaled * scale);
-        MaxColorScaled        = cmp_ceilVec3f(MaxColorScaled * scale);
-        CMP_PTRINOUT colorMin = MinColorScaled / scale;
-        CMP_PTRINOUT colorMax = MaxColorScaled / scale;
-        break;
-    default:  // Use round processing
-        MinColorScaled = round(MinColorScaled * scale);
-        MaxColorScaled = round(MaxColorScaled * scale);
-        break;
-    }
-
-    x = (CGU_UINT32)(MinColorScaled.x);
-    y = (CGU_UINT32)(MinColorScaled.y);
-    z = (CGU_UINT32)(MinColorScaled.z);
-
-    //if (isSRGB) {
-    //    // scale RB
-    //    x = srbMap[x]; // &0x1F];
-    //    y = sgMap [y]; // &0x3F];
-    //    z = srbMap[z]; // &0x1F];
-    //    // scale G
-    //}
-    CMP_PTRINOUT c0 = (x << 11) | (y << 5) | z;
-
-    x               = (CGU_UINT32)(MaxColorScaled.x);
-    y               = (CGU_UINT32)(MaxColorScaled.y);
-    z               = (CGU_UINT32)(MaxColorScaled.z);
-    CMP_PTRINOUT c1 = (x << 11) | (y << 5) | z;
-}
-
-CMP_STATIC CGU_FLOAT cgu_getIndicesRGB(CMP_INOUT CGU_UINT32 CMP_PTRINOUT cmpindex,
-                                       CMP_IN const CGU_Vec3f            block[16],
-                                       CMP_IN CGU_Vec3f                  minColor,
-                                       CMP_IN CGU_Vec3f                  maxColor,
-                                       CMP_IN CGU_BOOL                   getErr)
-{
-    CGU_UINT32 PackedIndices = 0;
-    CGU_FLOAT  err           = 0.0f;
-    CGU_Vec3f  cn[4];
-    CGU_FLOAT  minDistance;
-
-    if (getErr)
-    {
-        // remap to BC1 spec for decoding offsets,
-        // where cn[0] > cn[1] Max Color = index 0, 2/3 offset =index 2, 1/3 offset = index 3, Min Color = index 1
-        cn[0] = maxColor;
-        cn[1] = minColor;
-        cn[2] = cn[0] * 2.0f / 3.0f + cn[1] * 1.0f / 3.0f;
-        cn[3] = cn[0] * 1.0f / 3.0f + cn[1] * 2.0f / 3.0f;
-    }
-
-    CGU_FLOAT  Scale       = 3.f / cmp_dotVec3f(minColor - maxColor, minColor - maxColor);
-    CGU_Vec3f  ScaledRange = (minColor - maxColor) * Scale;
-    CGU_FLOAT  Bias        = (cmp_dotVec3f(maxColor, maxColor) - cmp_dotVec3f(maxColor, minColor)) * Scale;
-    CGU_INT    indexMap[4] = {0, 2, 3, 1};  // mapping based on BC1 Spec for color0 > color1
-    CGU_UINT32 index;
-    CGU_FLOAT  diff;
-
-    for (CGU_UINT32 i = 0; i < 16; i++)
-    {
-        // Get offset from base scale
-        diff  = cmp_dotVec3f(block[i], ScaledRange) + Bias;
-        index = ((CGU_UINT32)round(diff)) & 0x3;
-
-        // remap linear offset to spec offset
-        index = indexMap[index];
-
-        // use err calc for use in higher quality code
-        if (getErr)
-        {
-            minDistance = cmp_dotVec3f(block[i] - cn[index], block[i] - cn[index]);
-            err += minDistance;
-        }
-
-        // Map the 2 bit index into compress 32 bit block
-        if (index)
-            PackedIndices |= (index << (2 * i));
-    }
-
-    if (getErr)
-        err = err * 0.0208333f;
-
-    CMP_PTRINOUT cmpindex = PackedIndices;
-    return err;
-}
-
-//--------------------------------------------------------------------------------------------------------
-// Decompress is RGB (0.0f..255.0f)
-//--------------------------------------------------------------------------------------------------------
-CMP_STATIC void cgu_decompressRGBBlock(CMP_INOUT CGU_Vec3f rgbBlock[BLOCK_SIZE_4X4], const CGU_Vec2ui compressedBlock)
-{
-    CGU_UINT32 n0 = compressedBlock.x & 0xffff;
-    CGU_UINT32 n1 = compressedBlock.x >> 16;
-    CGU_UINT32 index;
-
-    //-------------------------------------------------------
-    // Decode the compressed block 0..255 color range
-    //-------------------------------------------------------
-    CGU_Vec3f c0 = cmp_565ToLinear(n0);  // max color
-    CGU_Vec3f c1 = cmp_565ToLinear(n1);  // min color
-    CGU_Vec3f c2;
-    CGU_Vec3f c3;
-
-    if (n0 > n1)
-    {
-        c2 = (c0 * 2.0f + c1) / 3.0f;
-        c3 = (c1 * 2.0f + c0) / 3.0f;
-
-        for (CGU_UINT32 i = 0; i < 16; i++)
-        {
-            index = (compressedBlock.y >> (2 * i)) & 3;
-            switch (index)
-            {
-            case 0:
-                rgbBlock[i] = c0;
-                break;
-            case 1:
-                rgbBlock[i] = c1;
-                break;
-            case 2:
-                rgbBlock[i] = c2;
-                break;
-            case 3:
-                rgbBlock[i] = c3;
-                break;
-            }
-        }
-    }
-    else
-    {
-        // Transparent decode
-        c2 = (c0 + c1) / 2.0f;
-
-        for (CGU_UINT32 i = 0; i < 16; i++)
-        {
-            index = (compressedBlock.y >> (2 * i)) & 3;
-            switch (index)
-            {
-            case 0:
-                rgbBlock[i] = c0;
-                break;
-            case 1:
-                rgbBlock[i] = c1;
-                break;
-            case 2:
-                rgbBlock[i] = c2;
-                break;
-            case 3:
-                rgbBlock[i] = 0.0f;
-                break;
-            }
-        }
-    }
-}
-
-// The source is 0..255
-CMP_STATIC float cgu_RGBABlockErrorLinear(const CGU_Vec4uc src_rgbBlock[BLOCK_SIZE_4X4], const CGU_Vec2ui compressedBlock)
-{
-    CGU_Vec3f rgbBlock[BLOCK_SIZE_4X4];
-
-    // Decompressed block channels are 0..255
-    cgu_decompressRGBBlock(rgbBlock, compressedBlock);
-
-    //------------------------------------------------------------------
-    // Calculate MSE of the block
-    // Note : pow is used as Float type for the code to be usable on CPU
-    //------------------------------------------------------------------
-    CGU_Vec3f serr;
-    serr = 0.0f;
-
-    float sR, sG, sB, R, G, B;
-
-    for (int j = 0; j < 16; j++)
-    {
-        sR = src_rgbBlock[j].x;
-        sG = src_rgbBlock[j].y;
-        sB = src_rgbBlock[j].z;
-
-        R = rgbBlock[j].x;
-        G = rgbBlock[j].y;
-        B = rgbBlock[j].z;
-
-        // Norm colors
-        serr.x += pow(sR - R, 2.0f);
-        serr.y += pow(sG - G, 2.0f);
-        serr.z += pow(sB - B, 2.0f);
-    }
-
-    // MSE for 16 texels
-    return (serr.x + serr.y + serr.z) / 48.0f;
-}
-
-// The source is 0..1, decompressed data using cmp_decompressRGBBlock2 is 0..255 which is converted down to 0..1
-CMP_STATIC float cgu_RGBBlockError(const CGU_Vec3f src_rgbBlock[BLOCK_SIZE_4X4], const CGU_Vec2ui compressedBlock, CGU_BOOL isSRGB)
-{
-    CGU_Vec3f rgbBlock[BLOCK_SIZE_4X4];
-
-    // Decompressed block channels are 0..255
-    cgu_decompressRGBBlock(rgbBlock, compressedBlock);
-
-    //------------------------------------------------------------------
-    // Calculate MSE of the block
-    // Note : pow is used as Float type for the code to be usable on CPU
-    //------------------------------------------------------------------
-    CGU_Vec3f serr;
-    serr = 0.0f;
-
-    float sR, sG, sB, R, G, B;
-
-    for (int j = 0; j < 16; j++)
-    {
-        if (isSRGB)
-        {
-            sR = round(cmp_linearToSrgbf(src_rgbBlock[j].x) * 255.0f);
-            sG = round(cmp_linearToSrgbf(src_rgbBlock[j].y) * 255.0f);
-            sB = round(cmp_linearToSrgbf(src_rgbBlock[j].z) * 255.0f);
-        }
-        else
-        {
-            sR = round(src_rgbBlock[j].x * 255.0f);
-            sG = round(src_rgbBlock[j].y * 255.0f);
-            sB = round(src_rgbBlock[j].z * 255.0f);
-        }
-
-        R = rgbBlock[j].x;
-        G = rgbBlock[j].y;
-        B = rgbBlock[j].z;
-
-        // Norm colors
-        serr.x += pow(sR - R, 2.0f);
-        serr.y += pow(sG - G, 2.0f);
-        serr.z += pow(sB - B, 2.0f);
-    }
-
-    // MSE for 16 texels
-    return (serr.x + serr.y + serr.z) / 48.0f;
-}
-
-CMP_STATIC CGU_Vec2ui cgu_CompressRGBBlock_MinMax(CMP_IN const CGU_Vec3f           src_imageRGB[16],
-                                                  CMP_IN CGU_FLOAT                 fquality,
-                                                  CMP_IN CGU_BOOL                  isSRGB,
-                                                  CMP_INOUT CGU_Vec3f              srcRGB[16],   // The list of source colors with blue channel altered
-                                                  CMP_INOUT CGU_Vec3f CMP_REFINOUT average_rgb,  // The centrepoint of the axis
-                                                  CMP_INOUT CGU_FLOAT CMP_REFINOUT errout)
-{
-    CGU_Vec2ui Q1CompData = {0, 0};
-    CGU_Vec3f  rgb        = {0, 0, 0};
-
-    // -------------------------------------------------------------------------------------
-    // (1) Find the array of unique pixel values and sum them to find their average position
-    // -------------------------------------------------------------------------------------
-    CGU_FLOAT  errLQ             = 0.0f;
-    CGU_BOOL   fastProcess       = (fquality <= CMP_QUALITY0);  // Min Max only
-    CGU_Vec3f  srcMin            = 1.0f;                        // Min source color
-    CGU_Vec3f  srcMax            = 0.0f;                        // Max source color
-    CGU_Vec2ui Q1compressedBlock = {0, 0};
-    CGU_UINT32 c0                = 0;
-    CGU_UINT32 c1                = 0;
-
-    average_rgb = 0.0f;
-    // Get average and modifed src
-    // find average position and save list of pixels as 0F..255F range for processing
-    // Note: z (blue) is average of blue+green channels
-    for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++)
-    {
-        srcMin = cmp_minVec3f(srcMin, src_imageRGB[i]);
-        srcMax = cmp_maxVec3f(srcMax, src_imageRGB[i]);
-        if (!fastProcess)
-        {
-            rgb         = isSRGB ? cmp_linearToSrgb(src_imageRGB[i]) : cmp_saturate(src_imageRGB[i]);
-            rgb.z       = (rgb.y + rgb.z) * 0.5F;  // Z-axiz => (R+G)/2
-            srcRGB[i]   = rgb;
-            average_rgb = average_rgb + rgb;
-        }
-    }
-
-    // Process two colors for saving in 565 format as C0 and C1
-    cgu_ProcessColors(CMP_REFINOUT srcMin, CMP_REFINOUT srcMax, CMP_REFINOUT c0, CMP_REFINOUT c1, isSRGB ? 1 : 0, isSRGB);
-
-    // Save simple min-max encoding
-    if (c0 < c1)
-    {
-        Q1CompData.x     = (c0 << 16) | c1;
-        CGU_UINT32 index = 0;
-        errLQ            = cgu_getIndicesRGB(CMP_REFINOUT index, src_imageRGB, srcMin, srcMax, false);
-        Q1CompData.y     = index;
-        errout           = cgu_RGBBlockError(src_imageRGB, Q1CompData, isSRGB);
-    }
-    else
-    {
-        // Most simple case all colors are equal or 0.0f
-        Q1compressedBlock.x = (c1 << 16) | c0;
-        Q1compressedBlock.y = 0;
-        errout              = 0.0f;
-        return Q1compressedBlock;
-    }
-    // 0.0625F is (1/BLOCK_SIZE_4X4)
-    average_rgb = average_rgb * 0.0625F;
-
-    return Q1CompData;
-}
-
-CMP_STATIC CGU_Vec2ui cgu_CompressRGBBlock_Fast(CMP_IN const CGU_Vec3f           src_imageRGB[16],
-                                                CMP_IN CGU_FLOAT                 fquality,
-                                                CMP_IN CGU_BOOL                  isSRGB,
-                                                CMP_IN CGU_Vec3f                 srcRGB[16],
-                                                CMP_IN CGU_Vec3f CMP_REFINOUT    average_rgb,
-                                                CMP_INOUT CGU_FLOAT CMP_REFINOUT errout)
-{
-    CMP_UNUSED(fquality);
-
-    CGU_Vec3f  axisVectorRGB = {0.0f, 0.0f, 0.0f};  // The axis vector for index projection
-    CGU_FLOAT  pos_on_axis[16];                     // The distance each unique falls along the compression axis
-    CGU_FLOAT  axisleft   = 0;                      // The extremities and centre (average of left/right) of srcRGB along the compression axis
-    CGU_FLOAT  axisright  = 0;                      // The extremities and centre (average of left/right) of srcRGB along the compression axis
-    CGU_FLOAT  axiscentre = 0;                      // The extremities and centre (average of left/right) of srcRGB along the compression axis
-    CGU_INT32  swap       = 0;                      // Indicator if the RGB values need swapping to generate an opaque result
-    CGU_Vec3f  srcBlock[16];                        // The list of source colors with any color space transforms and clipping
-    CGU_UINT32 c0              = 0;
-    CGU_UINT32 c1              = 0;
-    CGU_Vec2ui compressedBlock = {0, 0};
-    CGU_FLOAT  Q1CompErr;
-    CGU_Vec2ui Q1CompData = {0, 0};
-
-    CGU_Vec3f rgb = {0, 0, 0};
-
-    // -------------------------------------------------------------------------------------
-    // (4) For each component, reflect points about the average so all lie on the same side
-    // of the average, and compute the new average - this gives a second point that defines the axis
-    // To compute the sign of the axis sum the positive differences of G for each of R and B (the
-    // G axis is always positive in this implementation
-    // -------------------------------------------------------------------------------------
-    // An interesting situation occurs if the G axis contains no information, in which case the RB
-    // axis is also compared. I am not entirely sure if this is the correct implementation - should
-    // the priority axis be determined by magnitude?
-    {
-        CGU_FLOAT rg_pos = 0.0f;
-        CGU_FLOAT bg_pos = 0.0f;
-        CGU_FLOAT rb_pos = 0.0f;
-
-        for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++)
-        {
-            rgb           = srcRGB[i] - average_rgb;
-            axisVectorRGB = axisVectorRGB + cmp_fabsVec3f(rgb);
-            if (rgb.x > 0)
-            {
-                rg_pos += rgb.y;
-                rb_pos += rgb.z;
-            }
-            if (rgb.z > 0)
-                bg_pos += rgb.y;
-        }
-
-        // Average over BLOCK_SIZE_4X4
-        axisVectorRGB = axisVectorRGB * 0.0625F;
-
-        // New average position
-        if (rg_pos < 0)
-            axisVectorRGB.x = -axisVectorRGB.x;
-        if (bg_pos < 0)
-            axisVectorRGB.z = -axisVectorRGB.z;
-        if ((rg_pos == bg_pos) && (rg_pos == 0))
-        {
-            if (rb_pos < 0)
-                axisVectorRGB.z = -axisVectorRGB.z;
-        }
-    }
-
-    // -------------------------------------------------------------------------------------
-    // (5) Axis projection and remapping
-    // -------------------------------------------------------------------------------------
-    {
-        CGU_FLOAT v2_recip;
-        // Normalize the axis for simplicity of future calculation
-        v2_recip = cmp_dotVec3f(axisVectorRGB, axisVectorRGB);
-        if (v2_recip > 0)
-            v2_recip = 1.0f / (CGU_FLOAT)cmp_sqrt(v2_recip);
-        else
-            v2_recip = 1.0f;
-        axisVectorRGB = axisVectorRGB * v2_recip;
-    }
-
-    // -------------------------------------------------------------------------------------
-    // (6) Map the axis
-    // -------------------------------------------------------------------------------------
-    // the line joining (and extended on either side of) average and axis
-    // defines the axis onto which the points will be projected
-    // Project all the points onto the axis, calculate the distance along
-    // the axis from the centre of the axis (average)
-    // From Foley & Van Dam: Closest point of approach of a line (P + v) to a point (R) is
-    //     P + ((R-P).v) / (v.v))v
-    // The distance along v is therefore (R-P).v / (v.v) where (v.v) is 1 if v is a unit vector.
-    //
-    // Calculate the extremities at the same time - these need to be reasonably accurately
-    // represented in all cases
-    {
-        axisleft  = CMP_FLOAT_MAX;
-        axisright = -CMP_FLOAT_MAX;
-        for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++)
-        {
-            // Compute the distance along the axis of the point of closest approach
-            CGU_Vec3f temp = (srcRGB[i] - average_rgb);
-            pos_on_axis[i] = cmp_dotVec3f(temp, axisVectorRGB);
-
-            // Work out the extremities
-            if (pos_on_axis[i] < axisleft)
-                axisleft = pos_on_axis[i];
-            if (pos_on_axis[i] > axisright)
-                axisright = pos_on_axis[i];
-        }
-    }
-
-    // ---------------------------------------------------------------------------------------------
-    // (7) Now we have a good axis and the basic information about how the points are mapped to it
-    // Our initial guess is to represent the endpoints accurately, by moving the average
-    // to the centre and recalculating the point positions along the line
-    // ---------------------------------------------------------------------------------------------
-    {
-        axiscentre  = (axisleft + axisright) * 0.5F;
-        average_rgb = average_rgb + (axisVectorRGB * axiscentre);
-        for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++)
-            pos_on_axis[i] -= axiscentre;
-        axisright -= axiscentre;
-        axisleft -= axiscentre;
-    }
-
-    // -------------------------------------------------------------------------------------
-    // (8) Calculate the high and low output colour values
-    // Involved in this is a rounding procedure which is undoubtedly slightly twitchy. A
-    // straight rounded average is not correct, as the decompressor 'unrounds' by replicating
-    // the top bits to the bottom.
-    // In order to take account of this process, we don't just apply a straight rounding correction,
-    // but base our rounding on the input value (a straight rounding is actually pretty good in terms of
-    // error measure, but creates a visual colour and/or brightness shift relative to the original image)
-    // The method used here is to apply a centre-biased rounding dependent on the input value, which was
-    // (mostly by experiment) found to give minimum MSE while preserving the visual characteristics of
-    // the image.
-    // rgb = (average_rgb + (left|right)*axisVectorRGB);
-    // -------------------------------------------------------------------------------------
-    {
-        CGU_Vec3f MinColor, MaxColor;
-
-        MinColor   = average_rgb + (axisVectorRGB * axisleft);
-        MaxColor   = average_rgb + (axisVectorRGB * axisright);
-        MinColor.z = (MinColor.z * 2) - MinColor.y;
-        MaxColor.z = (MaxColor.z * 2) - MaxColor.y;
-
-        cgu_ProcessColors(CMP_REFINOUT MinColor, CMP_REFINOUT MaxColor, CMP_REFINOUT c0, CMP_REFINOUT c1, 1, false);
-
-        // Force to be a 4-colour opaque block - in which case, c0 is greater than c1
-        swap = 0;
-        if (c0 < c1)
-        {
-            CGU_UINT32 t;
-            t    = c0;
-            c0   = c1;
-            c1   = t;
-            swap = 1;
-        }
-        else if (c0 == c1)
-        {
-            // This block will always be encoded in 3-colour mode
-            // Need to ensure that only one of the two points gets used,
-            // avoiding accidentally setting some transparent pixels into the block
-            for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++)
-                pos_on_axis[i] = axisleft;
-        }
-
-        compressedBlock.x = c0 | (c1 << 16);
-
-        // -------------------------------------------------------------------------------------
-        // (9) Final clustering, creating the 2-bit values that define the output
-        // -------------------------------------------------------------------------------------
-
-        CGU_UINT32 index;
-        CGU_FLOAT  division;
-        {
-            compressedBlock.y = 0;
-            division          = axisright * 2.0f / 3.0f;
-            axiscentre        = (axisleft + axisright) / 2;  // Actually, this code only works if centre is 0 or approximately so
-
-            CGU_FLOAT CompMinErr;
-
-            // This feature is work in progress
-            // remap to BC1 spec for decoding offsets,
-            // where cn[0] > cn[1] Max Color = index 0, 2/3 offset =index 2, 1/3 offset = index 3, Min Color = index 1
-            // CGU_Vec3f   cn[4];
-            // cn[0] = MaxColor;
-            // cn[1] = MinColor;
-            // cn[2] = cn[0]*2.0f/3.0f + cn[1]*1.0f/3.0f;
-            // cn[3] = cn[0]*1.0f/3.0f + cn[1]*2.0f/3.0f;
-
-            for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++)
-            {
-                // Endpoints (indicated by block > average) are 0 and 1, while
-                // interpolants are 2 and 3
-                if (cmp_fabs(pos_on_axis[i]) >= division)
-                    index = 0;
-                else
-                    index = 2;
-                // Positive is in the latter half of the block
-                if (pos_on_axis[i] >= axiscentre)
-                    index += 1;
-
-                index = index ^ swap;
-                // Set the output, taking swapping into account
-                compressedBlock.y |= (index << (2 * i));
-
-                // use err calc for use in higher quality code
-                //CompMinErr += cmp_dotVec3f(srcRGBRef[i] - cn[index],srcRGBRef[i] - cn[index]);
-            }
-
-            //CompMinErr = CompMinErr * 0.0208333f;
-
-            CompMinErr = cgu_RGBBlockError(src_imageRGB, compressedBlock, isSRGB);
-            Q1CompErr  = cgu_RGBBlockError(src_imageRGB, Q1CompData, isSRGB);
-
-            if (CompMinErr > Q1CompErr)
-            {
-                compressedBlock = Q1CompData;
-                errout          = Q1CompErr;
-            }
-            else
-                errout = CompMinErr;
-        }
-    }
-    // done
-
-    return compressedBlock;
-}
-
-CMP_STATIC CGU_UINT8 g_Match5Bit[256][2] = {
-    {0, 0},   {0, 0},   {1, 0},   {1, 0},   {0, 1},   {0, 1},   {0, 1},   {1, 1},   {1, 1},   {1, 1},   {0, 2},   {4, 0},   {1, 2},   {1, 2},   {1, 2},
-    {2, 2},   {2, 2},   {2, 2},   {1, 3},   {5, 1},   {2, 3},   {2, 3},   {0, 4},   {3, 3},   {3, 3},   {3, 3},   {2, 4},   {2, 4},   {2, 4},   {5, 3},
-    {1, 5},   {1, 5},   {2, 5},   {4, 4},   {4, 4},   {3, 5},   {3, 5},   {2, 6},   {2, 6},   {2, 6},   {3, 6},   {5, 5},   {5, 5},   {4, 6},   {8, 4},
-    {3, 7},   {3, 7},   {3, 7},   {6, 6},   {6, 6},   {6, 6},   {5, 7},   {9, 5},   {6, 7},   {6, 7},   {4, 8},   {7, 7},   {7, 7},   {7, 7},   {6, 8},
-    {6, 8},   {6, 8},   {9, 7},   {5, 9},   {5, 9},   {6, 9},   {8, 8},   {8, 8},   {7, 9},   {7, 9},   {6, 10},  {6, 10},  {6, 10},  {7, 10},  {9, 9},
-    {9, 9},   {8, 10},  {12, 8},  {7, 11},  {7, 11},  {7, 11},  {10, 10}, {10, 10}, {10, 10}, {9, 11},  {13, 9},  {10, 11}, {10, 11}, {8, 12},  {11, 11},
-    {11, 11}, {11, 11}, {10, 12}, {10, 12}, {10, 12}, {13, 11}, {9, 13},  {9, 13},  {10, 13}, {12, 12}, {12, 12}, {11, 13}, {11, 13}, {10, 14}, {10, 14},
-    {10, 14}, {11, 14}, {13, 13}, {13, 13}, {12, 14}, {16, 12}, {11, 15}, {11, 15}, {11, 15}, {14, 14}, {14, 14}, {14, 14}, {13, 15}, {17, 13}, {14, 15},
-    {14, 15}, {12, 16}, {15, 15}, {15, 15}, {15, 15}, {14, 16}, {14, 16}, {14, 16}, {17, 15}, {13, 17}, {13, 17}, {14, 17}, {16, 16}, {16, 16}, {15, 17},
-    {15, 17}, {14, 18}, {14, 18}, {14, 18}, {15, 18}, {17, 17}, {17, 17}, {16, 18}, {20, 16}, {15, 19}, {15, 19}, {15, 19}, {18, 18}, {18, 18}, {18, 18},
-    {17, 19}, {21, 17}, {18, 19}, {18, 19}, {16, 20}, {19, 19}, {19, 19}, {19, 19}, {18, 20}, {18, 20}, {18, 20}, {21, 19}, {17, 21}, {17, 21}, {18, 21},
-    {20, 20}, {20, 20}, {19, 21}, {19, 21}, {18, 22}, {18, 22}, {18, 22}, {19, 22}, {21, 21}, {21, 21}, {20, 22}, {24, 20}, {19, 23}, {19, 23}, {19, 23},
-    {22, 22}, {22, 22}, {22, 22}, {21, 23}, {25, 21}, {22, 23}, {22, 23}, {20, 24}, {23, 23}, {23, 23}, {23, 23}, {22, 24}, {22, 24}, {22, 24}, {25, 23},
-    {21, 25}, {21, 25}, {22, 25}, {24, 24}, {24, 24}, {23, 25}, {23, 25}, {22, 26}, {22, 26}, {22, 26}, {23, 26}, {25, 25}, {25, 25}, {24, 26}, {28, 24},
-    {23, 27}, {23, 27}, {23, 27}, {26, 26}, {26, 26}, {26, 26}, {25, 27}, {29, 25}, {26, 27}, {26, 27}, {24, 28}, {27, 27}, {27, 27}, {27, 27}, {26, 28},
-    {26, 28}, {26, 28}, {29, 27}, {25, 29}, {25, 29}, {26, 29}, {28, 28}, {28, 28}, {27, 29}, {27, 29}, {26, 30}, {26, 30}, {26, 30}, {27, 30}, {29, 29},
-    {29, 29}, {28, 30}, {28, 30}, {27, 31}, {27, 31}, {27, 31}, {30, 30}, {30, 30}, {30, 30}, {29, 31}, {29, 31}, {30, 31}, {30, 31}, {30, 31}, {31, 31},
-    {31, 31}};
-
-CMP_STATIC CGU_UINT8 g_Match6Bit[256][2] = {
-    {0, 0},   {1, 0},   {0, 1},   {1, 1},   {1, 1},   {0, 2},   {1, 2},   {2, 2},   {2, 2},   {1, 3},   {0, 4},   {3, 3},   {3, 3},   {0, 5},   {1, 5},
-    {4, 4},   {4, 4},   {1, 6},   {0, 7},   {5, 5},   {5, 5},   {0, 8},   {1, 8},   {6, 6},   {6, 6},   {1, 9},   {2, 9},   {7, 7},   {7, 7},   {2, 10},
-    {3, 10},  {8, 8},   {8, 8},   {3, 11},  {4, 11},  {9, 9},   {9, 9},   {4, 12},  {5, 12},  {10, 10}, {10, 10}, {5, 13},  {6, 13},  {16, 8},  {11, 11},
-    {6, 14},  {7, 14},  {17, 9},  {12, 12}, {7, 15},  {8, 15},  {16, 11}, {13, 13}, {10, 15}, {8, 16},  {9, 16},  {14, 14}, {13, 15}, {9, 17},  {10, 17},
-    {15, 15}, {16, 15}, {10, 18}, {11, 18}, {12, 18}, {16, 16}, {11, 19}, {12, 19}, {13, 19}, {17, 17}, {12, 20}, {13, 20}, {14, 20}, {18, 18}, {13, 21},
-    {14, 21}, {15, 21}, {19, 19}, {14, 22}, {15, 22}, {20, 20}, {20, 20}, {15, 23}, {16, 23}, {21, 21}, {21, 21}, {16, 24}, {17, 24}, {22, 22}, {22, 22},
-    {17, 25}, {18, 25}, {23, 23}, {23, 23}, {18, 26}, {19, 26}, {24, 24}, {24, 24}, {19, 27}, {20, 27}, {25, 25}, {25, 25}, {20, 28}, {21, 28}, {26, 26},
-    {26, 26}, {21, 29}, {22, 29}, {32, 24}, {27, 27}, {22, 30}, {23, 30}, {33, 25}, {28, 28}, {23, 31}, {24, 31}, {32, 27}, {29, 29}, {26, 31}, {24, 32},
-    {25, 32}, {30, 30}, {29, 31}, {25, 33}, {26, 33}, {31, 31}, {32, 31}, {26, 34}, {27, 34}, {28, 34}, {32, 32}, {27, 35}, {28, 35}, {29, 35}, {33, 33},
-    {28, 36}, {29, 36}, {30, 36}, {34, 34}, {29, 37}, {30, 37}, {31, 37}, {35, 35}, {30, 38}, {31, 38}, {36, 36}, {36, 36}, {31, 39}, {32, 39}, {37, 37},
-    {37, 37}, {32, 40}, {33, 40}, {38, 38}, {38, 38}, {33, 41}, {34, 41}, {39, 39}, {39, 39}, {34, 42}, {35, 42}, {40, 40}, {40, 40}, {35, 43}, {36, 43},
-    {41, 41}, {41, 41}, {36, 44}, {37, 44}, {42, 42}, {42, 42}, {37, 45}, {38, 45}, {48, 40}, {43, 43}, {38, 46}, {39, 46}, {49, 41}, {44, 44}, {39, 47},
-    {40, 47}, {48, 43}, {45, 45}, {42, 47}, {40, 48}, {41, 48}, {46, 46}, {45, 47}, {41, 49}, {42, 49}, {47, 47}, {48, 47}, {42, 50}, {43, 50}, {44, 50},
-    {48, 48}, {43, 51}, {44, 51}, {45, 51}, {49, 49}, {44, 52}, {45, 52}, {46, 52}, {50, 50}, {45, 53}, {46, 53}, {47, 53}, {51, 51}, {46, 54}, {47, 54},
-    {52, 52}, {52, 52}, {47, 55}, {48, 55}, {53, 53}, {53, 53}, {48, 56}, {49, 56}, {54, 54}, {54, 54}, {49, 57}, {50, 57}, {55, 55}, {55, 55}, {50, 58},
-    {51, 58}, {56, 56}, {56, 56}, {51, 59}, {52, 59}, {57, 57}, {57, 57}, {52, 60}, {53, 60}, {58, 58}, {58, 58}, {53, 61}, {54, 61}, {59, 59}, {59, 59},
-    {54, 62}, {55, 62}, {60, 60}, {60, 60}, {55, 63}, {56, 63}, {61, 61}, {61, 61}, {58, 63}, {59, 63}, {62, 62}, {62, 62}, {61, 63}, {62, 63}, {63, 63},
-    {63, 63}};
-
-CMP_STATIC CGU_Vec2ui cgu_solidColorBlock(CMP_IN CGU_UINT8 Red, CMP_IN CGU_UINT8 Green, CMP_IN CGU_UINT8 Blue)
-{
-    CGU_UINT32 maxEndp16;
-    CGU_UINT32 minEndp16;
-
-    CGU_UINT32 mask = 0xAAAAAAAAu;
-
-    minEndp16 = g_Match5Bit[Red][0] * 2048U + g_Match6Bit[Green][0] * 32U + g_Match5Bit[Blue][0];
-    maxEndp16 = g_Match5Bit[Red][1] * 2048U + g_Match6Bit[Green][1] * 32U + g_Match5Bit[Blue][1];
-
-    // write the color block
-    if (maxEndp16 < minEndp16)
-    {
-        CGU_UINT32 tmpValue = minEndp16;
-        minEndp16           = maxEndp16;
-        maxEndp16           = tmpValue;
-        mask ^= 0x55555555u;
-    }
-
-    CGU_Vec2ui outputBytes;
-    outputBytes.x = CGU_UINT32(maxEndp16) | (CGU_UINT32(minEndp16) << 16u);
-    outputBytes.y = mask;
-
-    return outputBytes;
-}
-
-CMP_STATIC void cmp_get_encode_data(CMP_IN CMP_EncodeData CMP_REFINOUT edata, CMP_IN CMP_CONSTANT CGU_Vec4uc src_image[16])
-{
-    CMP_CONSTANT CGU_UINT32 fr = src_image[0].r, fg = src_image[0].g, fb = src_image[0].b;
-
-    edata.all_colors_equal = false;
-
-    edata.total.r = fr;
-    edata.total.g = fg;
-    edata.total.b = fb;
-    edata.max.r   = fr;
-    edata.max.g   = fg;
-    edata.max.b   = fb;
-    edata.min.r   = fr;
-    edata.min.g   = fg;
-    edata.min.b   = fb;
-
-    edata.grayscale_flag   = (fr == fg) && (fr == fb);
-    edata.any_black_pixels = (fr | fg | fb) < 4;
-
-    for (CGU_UINT32 i = 1; i < 16; i++)
-    {
-        CMP_CONSTANT CGU_INT r = src_image[i].r, g = src_image[i].g, b = src_image[i].b;
-
-        edata.grayscale_flag &= ((r == g) && (r == b));
-        edata.any_black_pixels |= ((r | g | b) < 4);
-
-        edata.max.r = CMP_MAX(edata.max.r, r);
-        edata.max.g = CMP_MAX(edata.max.g, g);
-        edata.max.b = CMP_MAX(edata.max.b, b);
-        edata.min.r = CMP_MIN(edata.min.r, r);
-        edata.min.g = CMP_MIN(edata.min.g, g);
-        edata.min.b = CMP_MIN(edata.min.b, b);
-        edata.total.r += r;
-        edata.total.g += g;
-        edata.total.b += b;
-    }
-
-    edata.avg.r = (edata.total.r + 8) >> 4;
-    edata.avg.g = (edata.total.g + 8) >> 4;
-    edata.avg.b = (edata.total.b + 8) >> 4;
-}
-
-#ifndef ASPM_GPU
-/*------------------------------------------------------------------------------------------------
-1 DIM ramp
-------------------------------------------------------------------------------------------------*/
-CMP_STATIC inline void cpu_BldClrRmp(CGU_FLOAT _Rmp[MAX_POINTS], CGU_FLOAT _InpRmp[NUM_ENDPOINTS], CGU_UINT32 dwNumPoints)
-{
-    CGU_UINT32 dwRndAmount[9] = {0, 0, 0, 0, 1, 1, 2, 2, 3};
-
-    // linear interpolate end points to get the ramp
-    _Rmp[0]               = _InpRmp[0];
-    _Rmp[dwNumPoints - 1] = _InpRmp[1];
-    if (dwNumPoints % 2)
-        _Rmp[dwNumPoints] = 1000000.f;  // for 3 point ramp; not to select the 4th point as min
-    for (CGU_UINT32 e = 1; e < dwNumPoints - 1; e++)
-        _Rmp[e] = cmp_floor((_Rmp[0] * (dwNumPoints - 1 - e) + _Rmp[dwNumPoints - 1] * e + dwRndAmount[dwNumPoints]) / (CGU_FLOAT)(dwNumPoints - 1));
-}
-
-/*------------------------------------------------------------------------------------------------
-// build 3D ramp
-------------------------------------------------------------------------------------------------*/
-CMP_STATIC inline void cpu_BldRmp(CGU_FLOAT _Rmp[NUM_CHANNELS][MAX_POINTS], CGU_FLOAT _InpRmp[NUM_CHANNELS][NUM_ENDPOINTS], CGU_UINT32 dwNumPoints)
-{
-    for (CGU_UINT32 j = 0; j < 3; j++)
-        cpu_BldClrRmp(_Rmp[j], _InpRmp[j], dwNumPoints);
-}
-
-/*------------------------------------------------------------------------------------------------
-// this is how the end points is going to be look like when decompressed
-------------------------------------------------------------------------------------------------*/
-CMP_STATIC inline void cpu_MkWkRmpPts(CMP_INOUT CGU_UINT8 CMP_REFINOUT _bEq,
-                                      CGU_FLOAT                        _OutRmpPts[NUM_CHANNELS][NUM_ENDPOINTS],
-                                      CGU_FLOAT                        _InpRmpPts[NUM_CHANNELS][NUM_ENDPOINTS],
-                                      CGU_UINT8                        nRedBits,
-                                      CGU_UINT8                        nGreenBits,
-                                      CGU_UINT8                        nBlueBits)
-{
-    CGU_FLOAT Fctrs[3];
-    Fctrs[RC] = (CGU_FLOAT)(1 << nRedBits);
-    Fctrs[GC] = (CGU_FLOAT)(1 << nGreenBits);
-    Fctrs[BC] = (CGU_FLOAT)(1 << nBlueBits);
-
-    CGU_BOOL bEq = true;
-    // find whether input ramp is flat
-    for (CGU_UINT32 j = 0; j < 3; j++)
-        bEq &= (_InpRmpPts[j][0] == _InpRmpPts[j][1]);
-
-    _bEq = bEq ? 1 : 0;
-
-    // end points on the integer grid
-    for (CGU_UINT32 j = 0; j < 3; j++)
-    {
-        for (CGU_UINT32 k = 0; k < 2; k++)
-        {
-            // Apply the lower bit replication to give full dynamic range
-            _OutRmpPts[j][k] = _InpRmpPts[j][k] + cmp_floor(_InpRmpPts[j][k] / Fctrs[j]);
-            _OutRmpPts[j][k] = cmp_max(_OutRmpPts[j][k], 0.f);
-            _OutRmpPts[j][k] = cmp_min(_OutRmpPts[j][k], 255.f);
-        }
-    }
-}
-
-// Compute error and find DXTC indexes for the current cluster
-CMP_STATIC CGU_FLOAT cpu_ClstrIntnl(CGU_FLOAT _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS],
-                                    CGU_UINT8 pcIndices[BLOCK_SIZE_4X4],
-                                    CGU_FLOAT _Rmp[NUM_CHANNELS][MAX_POINTS],
-                                    int       dwBlockSize,
-                                    CGU_UINT8 dwNumPoints,
-                                    bool      _ConstRamp,
-                                    CGU_FLOAT _pfWeights[3],
-                                    bool      _bUseAlpha)
-{
-    CGU_FLOAT Err   = 0.f;
-    CGU_UINT8 rmp_l = (_ConstRamp) ? 1 : dwNumPoints;
-
-    // For each colour in the original block assign it
-    // to the closest cluster and compute the cumulative error
-    for (int i = 0; i < dwBlockSize; i++)
-    {
-        if (_bUseAlpha && *((CGU_UINT32*)&_Blk[i][AC]) == 0)
-            pcIndices[i] = dwNumPoints;
-        else
-        {
-            CGU_FLOAT shortest      = 99999999999.f;
-            CGU_UINT8 shortestIndex = 0;
-            CGU_UINT8 r;
-            if ((_pfWeights[0] != 1.0f) || (_pfWeights[1] != 1.0f) || (_pfWeights[2] != 1.0f))
-                for (r = 0; r < rmp_l; r++)
-                {
-                    // calculate the distance for each component
-                    CGU_FLOAT distance = (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) * _pfWeights[0] +
-                                         (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) * _pfWeights[1] +
-                                         (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]) * _pfWeights[2];
-
-                    if (distance < shortest)
-                    {
-                        shortest      = distance;
-                        shortestIndex = r;
-                    }
-                }
-            else
-                for (r = 0; r < rmp_l; r++)
-                {
-                    // calculate the distance for each component
-                    CGU_FLOAT distance = (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) + (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) +
-                                         (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]);
-
-                    if (distance < shortest)
-                    {
-                        shortest      = distance;
-                        shortestIndex = r;
-                    }
-                }
-
-            Err += shortest;
-
-            // We have the index of the best cluster, so assign this in the block
-            // Reorder indices to match correct DXTC ordering
-            if (shortestIndex == dwNumPoints - 1)
-                shortestIndex = 1;
-            else if (shortestIndex)
-                shortestIndex++;
-            pcIndices[i] = shortestIndex;
-        }
-    }
-
-    return Err;
-}
-
-/*------------------------------------------------------------------------------------------------
-// input ramp is on the coarse grid
-------------------------------------------------------------------------------------------------*/
-CMP_STATIC CGU_FLOAT cpu_ClstrBas(CGU_UINT8 pcIndices[BLOCK_SIZE_4X4],
-                                  CGU_FLOAT _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS],
-                                  CGU_FLOAT _InpRmp[NUM_CHANNELS][NUM_ENDPOINTS],
-                                  int       dwBlockSize,
-                                  CGU_UINT8 dwNumPoints,
-                                  CGU_FLOAT _pfWeights[3],
-                                  bool      _bUseAlpha,
-                                  CGU_UINT8 nRedBits,
-                                  CGU_UINT8 nGreenBits,
-                                  CGU_UINT8 nBlueBits)
-{
-    // make ramp endpoints the way they'll going to be decompressed
-    CGU_UINT8 Eq = 1;
-    CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS];
-    cpu_MkWkRmpPts(Eq, InpRmp, _InpRmp, nRedBits, nGreenBits, nBlueBits);
-
-    // build ramp as it would be built by decompressor
-    CGU_FLOAT Rmp[NUM_CHANNELS][MAX_POINTS];
-    cpu_BldRmp(Rmp, InpRmp, dwNumPoints);
-
-    // clusterize and find a cumulative error
-    return cpu_ClstrIntnl(_Blk, pcIndices, Rmp, dwBlockSize, dwNumPoints, Eq, _pfWeights, _bUseAlpha);
-}
-
-CMP_STATIC CGU_UINT8 nByteBitsMask2[9] = {0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff};
-
-CMP_STATIC CGU_UINT32 cpu_ConstructColor2(CGU_UINT8 R, CGU_UINT8 nRedBits, CGU_UINT8 G, CGU_UINT8 nGreenBits, CGU_UINT8 B, CGU_UINT8 nBlueBits)
-{
-    return (((R & nByteBitsMask2[nRedBits]) << (nGreenBits + nBlueBits - (PIX_GRID - nRedBits))) |
-            ((G & nByteBitsMask2[nGreenBits]) << (nBlueBits - (PIX_GRID - nGreenBits))) | ((B & nByteBitsMask2[nBlueBits]) >> ((PIX_GRID - nBlueBits))));
-}
-
-CMP_STATIC CGU_FLOAT cpu_Clstr(CGU_UINT32 block_32[BLOCK_SIZE_4X4],
-                               CGU_UINT32 dwBlockSize,
-                               CGU_UINT8  nEndpoints[3][NUM_ENDPOINTS],
-                               CGU_UINT8  pcIndices[BLOCK_SIZE_4X4],
-                               CGU_UINT8  dwNumPoints,
-                               CGU_FLOAT  _pfWeights[3],
-                               bool       _bUseAlpha,
-                               CGU_UINT8  _nAlphaThreshold,
-                               CGU_UINT8  nRedBits,
-                               CGU_UINT8  nGreenBits,
-                               CGU_UINT8  nBlueBits)
-{
-    CGU_UINT32 c0              = cpu_ConstructColor2(nEndpoints[RC][0], nRedBits, nEndpoints[GC][0], nGreenBits, nEndpoints[BC][0], nBlueBits);
-    CGU_UINT32 c1              = cpu_ConstructColor2(nEndpoints[RC][1], nRedBits, nEndpoints[GC][1], nGreenBits, nEndpoints[BC][1], nBlueBits);
-    CGU_UINT32 nEndpointIndex0 = 0;
-    CGU_UINT32 nEndpointIndex1 = 1;
-    if ((!(dwNumPoints & 0x1) && c0 <= c1) || ((dwNumPoints & 0x1) && c0 > c1))
-    {
-        nEndpointIndex0 = 1;
-        nEndpointIndex1 = 0;
-    }
-
-    CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS];
-    InpRmp[RC][0] = (CGU_FLOAT)nEndpoints[RC][nEndpointIndex0];
-    InpRmp[RC][1] = (CGU_FLOAT)nEndpoints[RC][nEndpointIndex1];
-    InpRmp[GC][0] = (CGU_FLOAT)nEndpoints[GC][nEndpointIndex0];
-    InpRmp[GC][1] = (CGU_FLOAT)nEndpoints[GC][nEndpointIndex1];
-    InpRmp[BC][0] = (CGU_FLOAT)nEndpoints[BC][nEndpointIndex0];
-    InpRmp[BC][1] = (CGU_FLOAT)nEndpoints[BC][nEndpointIndex1];
-
-    CGU_UINT32 dwAlphaThreshold = _nAlphaThreshold << 24;
-    CGU_FLOAT  Blk[BLOCK_SIZE_4X4][NUM_CHANNELS];
-    for (CGU_UINT32 i = 0; i < dwBlockSize; i++)
-    {
-        Blk[i][RC] = (CGU_FLOAT)((block_32[i] & 0xff0000) >> 16);
-        Blk[i][GC] = (CGU_FLOAT)((block_32[i] & 0xff00) >> 8);
-        Blk[i][BC] = (CGU_FLOAT)(block_32[i] & 0xff);
-        if (_bUseAlpha)
-            Blk[i][AC] = ((block_32[i] & 0xff000000) >= dwAlphaThreshold) ? 1.f : 0.f;
-    }
-
-    return cpu_ClstrBas(pcIndices, Blk, InpRmp, dwBlockSize, dwNumPoints, _pfWeights, _bUseAlpha, nRedBits, nGreenBits, nBlueBits);
-}
-
-/*------------------------------------------------------------------------------------------------
-Compute cumulative error for the current cluster
-------------------------------------------------------------------------------------------------*/
-CMP_STATIC CGU_FLOAT cpu_ClstrErr(CGU_FLOAT  _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS],
-                                  CGU_FLOAT  _Rpt[BLOCK_SIZE_4X4],
-                                  CGU_FLOAT  _Rmp[NUM_CHANNELS][MAX_POINTS],
-                                  CGU_UINT32 _NmbClrs,
-                                  CGU_UINT32 _blcktp,
-                                  bool       _ConstRamp,
-                                  CGU_Vec3f  channelWeights)
-{
-    CGU_FLOAT  fError = 0.f;
-    CGU_UINT32 rmp_l  = (_ConstRamp) ? 1 : _blcktp;
-
-    CGU_BOOL useWeights = ((channelWeights[0] != 1.0f) || (channelWeights[1] != 1.0f) || (channelWeights[2] != 1.0f));
-
-    // For each colour in the original block, find the closest cluster
-    // and compute the comulative error
-    for (CGU_UINT32 i = 0; i < _NmbClrs; i++)
-    {
-        CGU_FLOAT fShortest = 99999999999.f;
-
-        if (useWeights)
-            for (CGU_UINT32 r = 0; r < rmp_l; r++)
-            {
-                // calculate the distance for each component
-                CGU_FLOAT fDistance = (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) * channelWeights[0] +
-                                      (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) * channelWeights[1] +
-                                      (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]) * channelWeights[2];
-
-                if (fDistance < fShortest)
-                    fShortest = fDistance;
-            }
-        else
-            for (CGU_UINT32 r = 0; r < rmp_l; r++)
-            {
-                // calculate the distance for each component
-                CGU_FLOAT fDistance = (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) + (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) +
-                                      (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]);
-
-                if (fDistance < fShortest)
-                    fShortest = fDistance;
-            }
-
-        // accumulate the error
-        fError += fShortest * _Rpt[i];
-    }
-
-    return fError;
-}
-
-#if defined(USE_REFINE3D)
-
-CMP_STATIC CGU_FLOAT cmp_Refine3D(CGU_FLOAT  _OutRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS],
-                                  CGU_FLOAT  _InpRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS],
-                                  CGU_FLOAT  _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS],
-                                  CGU_FLOAT  _Rpt[BLOCK_SIZE_4X4],
-                                  CGU_UINT32 _NmrClrs,
-                                  CGU_UINT32 dwNumPoints,
-                                  CGU_Vec3f  channelWeights,
-                                  CGU_UINT8  nRedBits,
-                                  CGU_UINT8  nGreenBits,
-                                  CGU_UINT8  nBlueBits,
-                                  CGU_UINT32 nRefineSteps)
-{
-    CGU_FLOAT ALIGN_16 Rmp[NUM_CHANNELS][MAX_POINTS];
-
-    CGU_FLOAT Blk[BLOCK_SIZE_4X4][NUM_CHANNELS];
-    for (CGU_UINT32 i = 0; i < _NmrClrs; i++)
-        for (CGU_UINT32 j = 0; j < 3; j++)
-            Blk[i][j] = _Blk[i][j];
-
-    CGU_FLOAT fWeightRed   = channelWeights.r;
-    CGU_FLOAT fWeightGreen = channelWeights.g;
-    CGU_FLOAT fWeightBlue  = channelWeights.b;
-
-    // here is our grid
-    CGU_FLOAT Fctrs[3];
-    Fctrs[RC] = (CGU_FLOAT)(1 << (PIX_GRID - nRedBits));
-    Fctrs[GC] = (CGU_FLOAT)(1 << (PIX_GRID - nGreenBits));
-    Fctrs[BC] = (CGU_FLOAT)(1 << (PIX_GRID - nBlueBits));
-
-    CGU_FLOAT InpRmp0[NUM_CHANNELS][NUM_ENDPOINTS];
-    CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS];
-    for (CGU_UINT32 k = 0; k < 2; k++)
-        for (CGU_UINT32 j = 0; j < 3; j++)
-            InpRmp0[j][k] = InpRmp[j][k] = _OutRmpPnts[j][k] = _InpRmpPnts[j][k];
-
-    // make ramp endpoints the way they'll going to be decompressed
-    // plus check whether the ramp is flat
-    CGU_UINT8 Eq;
-    CGU_FLOAT WkRmpPts[NUM_CHANNELS][NUM_ENDPOINTS];
-    cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
-
-    // build ramp for all 3 colors
-    cpu_BldRmp(Rmp, WkRmpPts, dwNumPoints);
-
-    // clusterize for the current ramp
-    CGU_FLOAT bestE = cpu_ClstrErr(Blk, _Rpt, Rmp, _NmrClrs, dwNumPoints, Eq, channelWeights);
-    if (bestE == 0.f)  // if exact, we've done
-        return bestE;
-
-    // Jitter endpoints in each direction
-    CGU_INT nRefineStart = 0 - (cmp_min(nRefineSteps, (CGU_UINT8)8));
-    CGU_INT nRefineEnd   = cmp_min(nRefineSteps, (CGU_UINT8)8);
-    for (CGU_INT nJitterG0 = nRefineStart; nJitterG0 <= nRefineEnd; nJitterG0++)
-    {
-        InpRmp[GC][0] = cmp_min(cmp_max(InpRmp0[GC][0] + nJitterG0 * Fctrs[GC], 0.f), 255.f);
-        for (CGU_INT nJitterG1 = nRefineStart; nJitterG1 <= nRefineEnd; nJitterG1++)
-        {
-            InpRmp[GC][1] = cmp_min(cmp_max(InpRmp0[GC][1] + nJitterG1 * Fctrs[GC], 0.f), 255.f);
-            cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
-            cpu_BldClrRmp(Rmp[GC], WkRmpPts[GC], dwNumPoints);
-
-            CGU_FLOAT RmpErrG[MAX_POINTS][BLOCK_SIZE_4X4];
-            for (CGU_UINT32 i = 0; i < _NmrClrs; i++)
-            {
-                for (CGU_UINT32 r = 0; r < dwNumPoints; r++)
-                {
-                    CGU_FLOAT DistG = (Rmp[GC][r] - Blk[i][GC]);
-                    RmpErrG[r][i]   = DistG * DistG * fWeightGreen;
-                }
-            }
-
-            for (CGU_INT nJitterB0 = nRefineStart; nJitterB0 <= nRefineEnd; nJitterB0++)
-            {
-                InpRmp[BC][0] = cmp_min(cmp_max(InpRmp0[BC][0] + nJitterB0 * Fctrs[BC], 0.f), 255.f);
-                for (CGU_INT nJitterB1 = nRefineStart; nJitterB1 <= nRefineEnd; nJitterB1++)
-                {
-                    InpRmp[BC][1] = cmp_min(cmp_max(InpRmp0[BC][1] + nJitterB1 * Fctrs[BC], 0.f), 255.f);
-                    cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
-                    cpu_BldClrRmp(Rmp[BC], WkRmpPts[BC], dwNumPoints);
-
-                    CGU_FLOAT RmpErr[MAX_POINTS][BLOCK_SIZE_4X4];
-                    for (CGU_UINT32 i = 0; i < _NmrClrs; i++)
-                    {
-                        for (CGU_UINT32 r = 0; r < dwNumPoints; r++)
-                        {
-                            CGU_FLOAT DistB = (Rmp[BC][r] - Blk[i][BC]);
-                            RmpErr[r][i]    = RmpErrG[r][i] + DistB * DistB * fWeightBlue;
-                        }
-                    }
-
-                    for (CGU_INT nJitterR0 = nRefineStart; nJitterR0 <= nRefineEnd; nJitterR0++)
-                    {
-                        InpRmp[RC][0] = cmp_min(cmp_max(InpRmp0[RC][0] + nJitterR0 * Fctrs[RC], 0.f), 255.f);
-                        for (CGU_INT nJitterR1 = nRefineStart; nJitterR1 <= nRefineEnd; nJitterR1++)
-                        {
-                            InpRmp[RC][1] = cmp_min(cmp_max(InpRmp0[RC][1] + nJitterR1 * Fctrs[RC], 0.f), 255.f);
-                            cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
-                            cpu_BldClrRmp(Rmp[RC], WkRmpPts[RC], dwNumPoints);
-
-                            // compute cumulative error
-                            CGU_FLOAT mse   = 0.f;
-                            CGU_INT   rmp_l = (Eq > 0) ? 1 : dwNumPoints;
-                            for (CGU_UINT32 k = 0; k < _NmrClrs; k++)
-                            {
-                                CGU_FLOAT MinErr = 10000000.f;
-                                for (CGU_INT r = 0; r < rmp_l; r++)
-                                {
-                                    CGU_FLOAT Dist = (Rmp[RC][r] - Blk[k][RC]);
-                                    CGU_FLOAT Err  = RmpErr[r][k] + Dist * Dist * fWeightRed;
-                                    MinErr         = cmp_min(MinErr, Err);
-                                }
-                                mse += MinErr * _Rpt[k];
-                            }
-
-                            // save if we achieve better result
-                            if (mse < bestE)
-                            {
-                                bestE = mse;
-                                for (CGU_UINT32 k = 0; k < 2; k++)
-                                    for (CGU_UINT32 j = 0; j < 3; j++)
-                                        _OutRmpPnts[j][k] = InpRmp[j][k];
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    return bestE;
-}
-#endif
-
-#if defined(USE_REFINE)
-CMP_STATIC CGU_FLOAT cmp_Refine(CGU_FLOAT  _OutRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS],
-                                CGU_FLOAT  _InpRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS],
-                                CGU_FLOAT  _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS],
-                                CGU_FLOAT  _Rpt[BLOCK_SIZE_4X4],
-                                CGU_INT    _NmrClrs,
-                                CGU_UINT8  dwNumPoints,
-                                CGU_Vec3f  channelWeights,
-                                CGU_UINT32 nRedBits,
-                                CGU_UINT32 nGreenBits,
-                                CGU_UINT32 nBlueBits,
-                                CGU_UINT32 nRefineSteps)
-{
-    CGU_FLOAT ALIGN_16 Rmp[NUM_CHANNELS][MAX_POINTS];
-
-    if (nRefineSteps == 0)
-        nRefineSteps = 1;
-
-    CGU_FLOAT Blk[BLOCK_SIZE_4X4][NUM_CHANNELS];
-    for (CGU_INT i = 0; i < _NmrClrs; i++)
-        for (CGU_INT j = 0; j < 3; j++)
-            Blk[i][j] = _Blk[i][j];
-
-    CGU_FLOAT fWeightRed   = channelWeights.r;
-    CGU_FLOAT fWeightGreen = channelWeights.g;
-    CGU_FLOAT fWeightBlue  = channelWeights.b;
-
-    // here is our grid
-    CGU_FLOAT Fctrs[3];
-    Fctrs[RC] = (CGU_FLOAT)(1 << (PIX_GRID - nRedBits));
-    Fctrs[GC] = (CGU_FLOAT)(1 << (PIX_GRID - nGreenBits));
-    Fctrs[BC] = (CGU_FLOAT)(1 << (PIX_GRID - nBlueBits));
-
-    CGU_FLOAT InpRmp0[NUM_CHANNELS][NUM_ENDPOINTS];
-    CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS];
-    for (CGU_INT k = 0; k < 2; k++)
-        for (CGU_INT j = 0; j < 3; j++)
-            InpRmp0[j][k] = InpRmp[j][k] = _OutRmpPnts[j][k] = _InpRmpPnts[j][k];
-
-    // make ramp endpoints the way they'll going to be decompressed
-    // plus check whether the ramp is flat
-    CGU_UINT8 Eq;
-    CGU_FLOAT WkRmpPts[NUM_CHANNELS][NUM_ENDPOINTS];
-    cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
-
-    // build ramp for all 3 colors
-    cpu_BldRmp(Rmp, WkRmpPts, dwNumPoints);
-
-    // clusterize for the current ramp
-    CGU_FLOAT bestE = cpu_ClstrErr(Blk, _Rpt, Rmp, _NmrClrs, dwNumPoints, Eq, channelWeights);
-    if (bestE == 0.f)  //  || !nRefineSteps)    // if exact, we've done
-        return bestE;
-
-    // Tweak each component in isolation and get the best values
-
-    // precompute ramp errors for Green and Blue
-    CGU_FLOAT RmpErr[MAX_POINTS][BLOCK_SIZE_4X4];
-    for (CGU_INT i = 0; i < _NmrClrs; i++)
-    {
-        for (CGU_INT r = 0; r < dwNumPoints; r++)
-        {
-            CGU_FLOAT DistG = (Rmp[GC][r] - Blk[i][GC]);
-            CGU_FLOAT DistB = (Rmp[BC][r] - Blk[i][BC]);
-            RmpErr[r][i]    = DistG * DistG * fWeightGreen + DistB * DistB * fWeightBlue;
-        }
-    }
-
-    // First Red
-    CGU_FLOAT bstC0        = InpRmp0[RC][0];
-    CGU_FLOAT bstC1        = InpRmp0[RC][1];
-    CGU_INT   nRefineStart = 0 - (cmp_min(nRefineSteps, (CGU_UINT8)8));
-    CGU_INT   nRefineEnd   = cmp_min(nRefineSteps, (CGU_UINT8)8);
-    for (CGU_INT i = nRefineStart; i <= nRefineEnd; i++)
-    {
-        for (CGU_INT j = nRefineStart; j <= nRefineEnd; j++)
-        {
-            // make a move; both sides of interval.
-            InpRmp[RC][0] = cmp_min(cmp_max(InpRmp0[RC][0] + i * Fctrs[RC], 0.f), 255.f);
-            InpRmp[RC][1] = cmp_min(cmp_max(InpRmp0[RC][1] + j * Fctrs[RC], 0.f), 255.f);
-
-            // make ramp endpoints the way they'll going to be decompressed
-            // plus check whether the ramp is flat
-            cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
-
-            // build ramp only for red
-            cpu_BldClrRmp(Rmp[RC], WkRmpPts[RC], dwNumPoints);
-
-            // compute cumulative error
-            CGU_FLOAT mse   = 0.f;
-            CGU_INT   rmp_l = (Eq > 0) ? 1 : dwNumPoints;
-            for (CGU_INT k = 0; k < _NmrClrs; k++)
-            {
-                CGU_FLOAT MinErr = 10000000.f;
-                for (CGU_INT r = 0; r < rmp_l; r++)
-                {
-                    CGU_FLOAT Dist = (Rmp[RC][r] - Blk[k][RC]);
-                    CGU_FLOAT Err  = RmpErr[r][k] + Dist * Dist * fWeightRed;
-                    MinErr         = cmp_minf(MinErr, Err);
-                }
-                mse += MinErr * _Rpt[k];
-            }
-
-            // save if we achieve better result
-            if (mse < bestE)
-            {
-                bstC0 = InpRmp[RC][0];
-                bstC1 = InpRmp[RC][1];
-                bestE = mse;
-            }
-        }
-    }
-
-    // our best REDs
-    InpRmp[RC][0] = bstC0;
-    InpRmp[RC][1] = bstC1;
-
-    // make ramp endpoints the way they'll going to be decompressed
-    // plus check whether the ramp is flat
-    cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
-
-    // build ramp only for green
-    cpu_BldRmp(Rmp, WkRmpPts, dwNumPoints);
-
-    // precompute ramp errors for Red and Blue
-    for (CGU_INT i = 0; i < _NmrClrs; i++)
-    {
-        for (CGU_INT r = 0; r < dwNumPoints; r++)
-        {
-            CGU_FLOAT DistR = (Rmp[RC][r] - Blk[i][RC]);
-            CGU_FLOAT DistB = (Rmp[BC][r] - Blk[i][BC]);
-            RmpErr[r][i]    = DistR * DistR * fWeightRed + DistB * DistB * fWeightBlue;
-        }
-    }
-
-    // Now green
-    bstC0 = InpRmp0[GC][0];
-    bstC1 = InpRmp0[GC][1];
-    for (CGU_INT i = nRefineStart; i <= nRefineEnd; i++)
-    {
-        for (CGU_INT j = nRefineStart; j <= nRefineEnd; j++)
-        {
-            InpRmp[GC][0] = cmp_minf(cmp_maxf(InpRmp0[GC][0] + i * Fctrs[GC], 0.f), 255.f);
-            InpRmp[GC][1] = cmp_minf(cmp_maxf(InpRmp0[GC][1] + j * Fctrs[GC], 0.f), 255.f);
-
-            cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
-            cpu_BldClrRmp(Rmp[GC], WkRmpPts[GC], dwNumPoints);
-
-            CGU_FLOAT mse   = 0.f;
-            CGU_INT   rmp_l = (Eq > 0) ? 1 : dwNumPoints;
-            for (CGU_INT k = 0; k < _NmrClrs; k++)
-            {
-                CGU_FLOAT MinErr = 10000000.f;
-                for (CGU_INT r = 0; r < rmp_l; r++)
-                {
-                    CGU_FLOAT Dist = (Rmp[GC][r] - Blk[k][GC]);
-                    CGU_FLOAT Err  = RmpErr[r][k] + Dist * Dist * fWeightGreen;
-                    MinErr         = cmp_minf(MinErr, Err);
-                }
-                mse += MinErr * _Rpt[k];
-            }
-
-            if (mse < bestE)
-            {
-                bstC0 = InpRmp[GC][0];
-                bstC1 = InpRmp[GC][1];
-                bestE = mse;
-            }
-        }
-    }
-
-    // our best GREENs
-    InpRmp[GC][0] = bstC0;
-    InpRmp[GC][1] = bstC1;
-
-    cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
-    cpu_BldRmp(Rmp, WkRmpPts, dwNumPoints);
-
-    // ramp err for Red and Green
-    for (CGU_INT i = 0; i < _NmrClrs; i++)
-    {
-        for (CGU_INT r = 0; r < dwNumPoints; r++)
-        {
-            CGU_FLOAT DistR = (Rmp[RC][r] - Blk[i][RC]);
-            CGU_FLOAT DistG = (Rmp[GC][r] - Blk[i][GC]);
-            RmpErr[r][i]    = DistR * DistR * fWeightRed + DistG * DistG * fWeightGreen;
-        }
-    }
-
-    bstC0 = InpRmp0[BC][0];
-    bstC1 = InpRmp0[BC][1];
-    // Now blue
-    for (CGU_INT i = nRefineStart; i <= nRefineEnd; i++)
-    {
-        for (CGU_INT j = nRefineStart; j <= nRefineEnd; j++)
-        {
-            InpRmp[BC][0] = min(max(InpRmp0[BC][0] + i * Fctrs[BC], 0.f), 255.f);
-            InpRmp[BC][1] = min(max(InpRmp0[BC][1] + j * Fctrs[BC], 0.f), 255.f);
-
-            cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
-            cpu_BldClrRmp(Rmp[BC], WkRmpPts[BC], dwNumPoints);
-
-            CGU_FLOAT mse   = 0.f;
-            CGU_INT   rmp_l = (Eq > 0) ? 1 : dwNumPoints;
-            for (CGU_INT k = 0; k < _NmrClrs; k++)
-            {
-                CGU_FLOAT MinErr = 10000000.f;
-                for (CGU_INT r = 0; r < rmp_l; r++)
-                {
-                    CGU_FLOAT Dist = (Rmp[BC][r] - Blk[k][BC]);
-                    CGU_FLOAT Err  = RmpErr[r][k] + Dist * Dist * fWeightBlue;
-                    MinErr         = min(MinErr, Err);
-                }
-                mse += MinErr * _Rpt[k];
-            }
-
-            if (mse < bestE)
-            {
-                bstC0 = InpRmp[BC][0];
-                bstC1 = InpRmp[BC][1];
-                bestE = mse;
-            }
-        }
-    }
-
-    // our best BLUEs
-    InpRmp[BC][0] = bstC0;
-    InpRmp[BC][1] = bstC1;
-
-    // return our best choice
-    for (CGU_INT j = 0; j < 3; j++)
-        for (CGU_INT k = 0; k < 2; k++)
-            _OutRmpPnts[j][k] = InpRmp[j][k];
-
-    return bestE;
-}
-
-#endif
-
-//======================================================================================
-// Codec from CompressonatorLib
-//======================================================================================
-#define BLOCK_SIZE_4X4 16
-#define RG 5
-#define GG 6
-#define BG 5
-
-/*------------------------------------------------------------------------------------------------
-// this is how the end points is going to be rounded in compressed format
-------------------------------------------------------------------------------------------------*/
-CMP_STATIC void cpu_MkRmpOnGrid(CGU_FLOAT _RmpF[NUM_CHANNELS][NUM_ENDPOINTS],
-                                CGU_FLOAT _MnMx[NUM_CHANNELS][NUM_ENDPOINTS],
-                                CGU_FLOAT _Min,
-                                CGU_FLOAT _Max,
-                                CGU_UINT8 nRedBits,
-                                CGU_UINT8 nGreenBits,
-                                CGU_UINT8 nBlueBits)
-{
-    CGU_FLOAT Fctrs0[3];
-    CGU_FLOAT Fctrs1[3];
-
-    Fctrs1[RC] = (CGU_FLOAT)(1 << nRedBits);
-    Fctrs1[GC] = (CGU_FLOAT)(1 << nGreenBits);
-    Fctrs1[BC] = (CGU_FLOAT)(1 << nBlueBits);
-    Fctrs0[RC] = (CGU_FLOAT)(1 << (PIX_GRID - nRedBits));
-    Fctrs0[GC] = (CGU_FLOAT)(1 << (PIX_GRID - nGreenBits));
-    Fctrs0[BC] = (CGU_FLOAT)(1 << (PIX_GRID - nBlueBits));
-
-    for (int j = 0; j < 3; j++)
-    {
-        for (int k = 0; k < 2; k++)
-        {
-            _RmpF[j][k] = cmp_floor(_MnMx[j][k]);
-            if (_RmpF[j][k] <= _Min)
-                _RmpF[j][k] = _Min;
-            else
-            {
-                _RmpF[j][k] += cmp_floor(128.f / Fctrs1[j]) - cmp_floor(_RmpF[j][k] / Fctrs1[j]);
-                _RmpF[j][k] = cmp_minf(_RmpF[j][k], _Max);
-            }
-
-            _RmpF[j][k] = cmp_floor(_RmpF[j][k] / Fctrs0[j]) * Fctrs0[j];
-        }
-    }
-}
-
-// Find the first approximation of the line
-// Assume there is a linear relation
-//   Z = a * X_In
-//   Z = b * Y_In
-// Find a,b to minimize MSE between Z and Z_In
-CMP_STATIC void cpu_FindAxis(CMP_OUT CGU_FLOAT              BlkSh[BLOCK_SIZE_4X4][NUM_CHANNELS],
-                             CMP_IN CGU_FLOAT               LineDir0[NUM_CHANNELS],
-                             CMP_IN CGU_FLOAT               fBlockCenter[NUM_CHANNELS],
-                             CMP_OUT CGU_UINT8 CMP_REFINOUT AxisIsSmall,
-                             CMP_IN CGU_FLOAT               BlkUV[BLOCK_SIZE_4X4][NUM_CHANNELS],
-                             CMP_IN CGU_FLOAT               _inpRpt[BLOCK_SIZE_4X4],
-                             CMP_IN int                     nDimensions,
-                             CMP_IN int                     dwUniqueColors)
-{
-    CGU_FLOAT Crrl[NUM_CHANNELS];
-    CGU_FLOAT RGB2[NUM_CHANNELS];
-    CGU_INT   i;
-
-    LineDir0[0] = LineDir0[1] = LineDir0[2] = RGB2[0] = RGB2[1] = RGB2[2] = Crrl[0] = Crrl[1] = Crrl[2] = fBlockCenter[0] = fBlockCenter[1] = fBlockCenter[2] =
-        0.f;
-
-    // sum position of all points
-    CGU_FLOAT fNumPoints = 0.f;
-    for (i = 0; i < dwUniqueColors; i++)
-    {
-        fBlockCenter[0] += BlkUV[i][0] * _inpRpt[i];
-        fBlockCenter[1] += BlkUV[i][1] * _inpRpt[i];
-        fBlockCenter[2] += BlkUV[i][2] * _inpRpt[i];
-        fNumPoints += _inpRpt[i];
-    }
-
-    // and then average to calculate center coordinate of block
-    fBlockCenter[0] /= fNumPoints;
-    fBlockCenter[1] /= fNumPoints;
-    fBlockCenter[2] /= fNumPoints;
-
-    for (i = 0; i < dwUniqueColors; i++)
-    {
-        // calculate output block as offsets around block center
-        BlkSh[i][0] = BlkUV[i][0] - fBlockCenter[0];
-        BlkSh[i][1] = BlkUV[i][1] - fBlockCenter[1];
-        BlkSh[i][2] = BlkUV[i][2] - fBlockCenter[2];
-
-        // compute correlation matrix
-        // RGB2 = sum of ((distance from point from center) squared)
-        // Crrl = ???????. Seems to be be some calculation based on distance from point center in two dimensions
-        for (int j = 0; j < nDimensions; j++)
-        {
-            RGB2[j] += BlkSh[i][j] * BlkSh[i][j] * _inpRpt[i];
-            Crrl[j] += BlkSh[i][j] * BlkSh[i][(j + 1) % 3] * _inpRpt[i];
-        }
-    }
-
-    // if set's diameter is small
-    int       i0 = 0, i1 = 1;
-    CGU_FLOAT mxRGB2 = 0.f;
-    int       k = 0, j = 0;
-    CGU_FLOAT fEPS = fNumPoints * EPS;
-    for (k = 0, j = 0; j < 3; j++)
-    {
-        if (RGB2[j] >= fEPS)
-            k++;
-        else
-            RGB2[j] = 0.f;
-
-        if (mxRGB2 < RGB2[j])
-        {
-            mxRGB2 = RGB2[j];
-            i0     = j;
-        }
-    }
-
-    CGU_FLOAT fEPS2 = fNumPoints * EPS2;
-    AxisIsSmall     = 1;
-    for (j = 0; j < 3; j++)
-    {
-        AxisIsSmall &= (RGB2[j] < fEPS2);
-    }
-
-    if (AxisIsSmall)  // all are very small to avoid division on the small determinant
-        return;
-
-    if (k == 1)  // really only 1 dimension
-        LineDir0[i0] = 1.;
-    else if (k == 2)
-    {  // really only 2 dimensions
-        i1            = (RGB2[(i0 + 1) % 3] > 0.f) ? (i0 + 1) % 3 : (i0 + 2) % 3;
-        CGU_FLOAT Crl = (i1 == (i0 + 1) % 3) ? Crrl[i0] : Crrl[(i0 + 2) % 3];
-        LineDir0[i1]  = Crl / RGB2[i0];
-        LineDir0[i0]  = 1.;
-    }
-    else
-    {
-        CGU_FLOAT maxDet = 100000.f;
-        CGU_FLOAT Cs[3];
-        // select max det for precision
-        for (j = 0; j < nDimensions; j++)
-        {
-            CGU_FLOAT Det = RGB2[j] * RGB2[(j + 1) % 3] - Crrl[j] * Crrl[j];
-            Cs[j]         = abs(Crrl[j] / sqrt(RGB2[j] * RGB2[(j + 1) % 3]));
-            if (maxDet < Det)
-            {
-                maxDet = Det;
-                i0     = j;
-            }
-        }
-
-        // inverse correl matrix
-        //  --      --       --      --
-        //  |  A   B |       |  C  -B |
-        //  |  B   C |  =>   | -B   A |
-        //  --      --       --     --
-        CGU_FLOAT mtrx1[2][2];
-        CGU_FLOAT vc1[2];
-        CGU_FLOAT vc[2];
-        vc1[0] = Crrl[(i0 + 2) % 3];
-        vc1[1] = Crrl[(i0 + 1) % 3];
-        // C
-        mtrx1[0][0] = RGB2[(i0 + 1) % 3];
-        // A
-        mtrx1[1][1] = RGB2[i0];
-        // -B
-        mtrx1[1][0] = mtrx1[0][1] = -Crrl[i0];
-        // find a solution
-        vc[0] = mtrx1[0][0] * vc1[0] + mtrx1[0][1] * vc1[1];
-        vc[1] = mtrx1[1][0] * vc1[0] + mtrx1[1][1] * vc1[1];
-        // normalize
-        vc[0] /= maxDet;
-        vc[1] /= maxDet;
-        // find a line direction vector
-        LineDir0[i0]           = 1.;
-        LineDir0[(i0 + 1) % 3] = 1.;
-        LineDir0[(i0 + 2) % 3] = vc[0] + vc[1];
-    }
-
-    // normalize direction vector
-    CGU_FLOAT Len = LineDir0[0] * LineDir0[0] + LineDir0[1] * LineDir0[1] + LineDir0[2] * LineDir0[2];
-    Len           = sqrt(Len);
-
-    for (j = 0; j < 3; j++)
-        LineDir0[j] = (Len > 0.f) ? LineDir0[j] / Len : 0.f;
-}
-
-CMP_STATIC CGU_FLOAT cpu_RampSrchW(CGU_FLOAT Prj[BLOCK_SIZE_4X4],
-                                   CGU_FLOAT PrjErr[BLOCK_SIZE_4X4],
-                                   CGU_FLOAT PreMRep[BLOCK_SIZE_4X4],
-                                   CGU_FLOAT StepErr,
-                                   CGU_FLOAT lowPosStep,
-                                   CGU_FLOAT highPosStep,
-                                   int       dwUniqueColors,
-                                   int       dwNumPoints)
-{
-    CGU_FLOAT error  = 0.0f;
-    CGU_FLOAT step   = (highPosStep - lowPosStep) / (dwNumPoints - 1);
-    CGU_FLOAT step_h = step * 0.5f;
-    CGU_FLOAT rstep  = (CGU_FLOAT)1.0f / step;
-    CGU_INT   i;
-
-    for (i = 0; i < dwUniqueColors; i++)
-    {
-        // Work out which value in the block this select
-        CGU_FLOAT del = Prj[i] - lowPosStep;
-
-        CGU_FLOAT v;
-
-        if (del <= 0)
-            v = lowPosStep;
-        else if (Prj[i] - highPosStep >= 0)
-            v = highPosStep;
-        else
-            v = cmp_floor((del + step_h) * rstep) * step + lowPosStep;
-
-        // And accumulate the error
-        CGU_FLOAT d = (Prj[i] - v);
-        d *= d;
-        CGU_FLOAT err = PreMRep[i] * d + PrjErr[i];
-        error += err;
-        if (StepErr < error)
-        {
-            error = StepErr;
-            break;
-        }
-    }
-    return error;
-}
-
-CMP_STATIC CGU_FLOAT _cpu_bc1ComputeBestEndpoints(CGU_FLOAT endpointsOut[NUM_ENDPOINTS],
-                                                  CGU_FLOAT endpointsIn[NUM_ENDPOINTS],
-                                                  CGU_FLOAT prj[BLOCK_SIZE_4X4],
-                                                  CGU_FLOAT prjError[BLOCK_SIZE_4X4],
-                                                  CGU_FLOAT preMRep[BLOCK_SIZE_4X4],
-                                                  int       numColours,
-                                                  int       numPoints)
-{
-    CGU_FLOAT minError = MAX_ERROR;
-
-    static const CGU_FLOAT searchStep = 0.025f;
-
-    const CGU_FLOAT lowStart  = (endpointsIn[0] - 2.0f * searchStep > 0.0f) ? endpointsIn[0] - 2.0f * searchStep : 0.0f;
-    const CGU_FLOAT highStart = (endpointsIn[1] + 2.0f * searchStep < 1.0f) ? endpointsIn[1] + 2.0f * searchStep : 1.0f;
-
-    CGU_FLOAT lowStep  = lowStart;
-    CGU_FLOAT highStep = highStart;
-
-    for (int low = 0; low < 8; ++low)
-    {
-        for (int high = 0; high < 8; ++high)
-        {
-            // compute an error for the current pair of end points.
-            CGU_FLOAT error = cpu_RampSrchW(prj, prjError, preMRep, minError, lowStep, highStep, numColours, numPoints);
-
-            if (error < minError)
-            {
-                // save better result
-                minError        = error;
-                endpointsOut[0] = lowStep;
-                endpointsOut[1] = highStep;
-            }
-
-            highStep -= searchStep;
-        }
-
-        lowStep += searchStep;
-    }
-
-    return minError;
-}
-
-//    This is a float point-based compression
-//    it assumes that the number of unique colors is already known; input is in [0., 255.] range.
-//    This is C version.
-CMP_STATIC bool cpu_CompressRGBBlockX(CMP_OUT CGU_FLOAT _RsltRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS],
-                                      CMP_IN CGU_FLOAT  src_image[BLOCK_SIZE_4X4][NUM_CHANNELS],
-                                      CMP_IN CGU_FLOAT  Rpt[BLOCK_SIZE_4X4],
-                                      CMP_IN int        dwUniqueColors,
-                                      CMP_IN CGU_UINT8  dwNumPoints,
-                                      CMP_IN bool       b3DRefinement,
-                                      CMP_IN CGU_UINT8  nRefinementSteps,
-                                      CMP_IN CGU_FLOAT  pfWeights[3],
-                                      CMP_IN CGU_UINT8  nRedBits,
-                                      CMP_IN CGU_UINT8  nGreenBits,
-                                      CMP_IN CGU_UINT8  nBlueBits,
-                                      CMP_IN CGU_FLOAT  fquality)
-{
-#if !defined(ASPM_GPU)
-    if (!g_bc1FunctionPointersSet)
-    {
-        bc1ToggleSIMD(EXTENSION_COUNT);
-    }
-#endif
-
-    CGU_FLOAT ALIGN_16 Prj0[BLOCK_SIZE_4X4];
-    CGU_FLOAT ALIGN_16 Prj[BLOCK_SIZE_4X4];
-    CGU_FLOAT ALIGN_16 PrjErr[BLOCK_SIZE_4X4];
-    CGU_FLOAT ALIGN_16 LineDir[NUM_CHANNELS];
-    CGU_FLOAT ALIGN_16 RmpIndxs[BLOCK_SIZE_4X4];
-
-    CMP_UNUSED(fquality);
-    CMP_UNUSED(b3DRefinement)
-
-    CGU_FLOAT LineDirG[NUM_CHANNELS];
-    CGU_FLOAT PosG[NUM_ENDPOINTS];
-    CGU_FLOAT BlkUV[BLOCK_SIZE_4X4][NUM_CHANNELS];
-    CGU_FLOAT BlkSh[BLOCK_SIZE_4X4][NUM_CHANNELS];
-    CGU_FLOAT LineDir0[NUM_CHANNELS];
-    CGU_FLOAT Mdl[NUM_CHANNELS];
-
-    CGU_FLOAT rsltC[NUM_CHANNELS][NUM_ENDPOINTS];
-    int       i, j, k;
-
-    // down to [0., 1.]
-    for (i = 0; i < dwUniqueColors; i++)
-        for (j = 0; j < 3; j++)
-            BlkUV[i][j] = src_image[i][j] / 255.f;
-
-    bool isDONE = false;
-
-    // as usual if not more then 2 different colors, we've done
-    if (dwUniqueColors <= 2)
-    {
-        for (j = 0; j < 3; j++)
-        {
-            rsltC[j][0] = src_image[0][j];
-            rsltC[j][1] = src_image[dwUniqueColors - 1][j];
-        }
-        isDONE = true;
-    }
-
-    if (!isDONE)
-    {
-        //    This is our first attempt to find an axis we will go along.
-        //    The cumulation is done to find a line minimizing the MSE from the input 3D points.
-        CGU_UINT8 bSmall;
-        cpu_FindAxis(BlkSh, LineDir0, Mdl, bSmall, BlkUV, Rpt, 3, dwUniqueColors);
-
-        //    While trying to find the axis we found that the diameter of the input set is quite small.
-        //    Do not bother.
-        if (bSmall)
-        {
-            for (j = 0; j < 3; j++)
-            {
-                rsltC[j][0] = src_image[0][j];
-                rsltC[j][1] = src_image[dwUniqueColors - 1][j];
-            }
-            isDONE = true;
-        }
-    }
-
-    // GCC is being an awful being when it comes to goto-jumps.
-    // So please bear with this.
-    if (!isDONE)
-    {
-        CGU_FLOAT          ErrG = 10000000.f;
-        CGU_FLOAT          PrjBnd[NUM_ENDPOINTS];
-        CGU_FLOAT ALIGN_16 PreMRep[BLOCK_SIZE_4X4];
-        for (j = 0; j < 3; j++)
-            LineDir[j] = LineDir0[j];
-
-        //    Here is the main loop.
-        //    1. Project input set on the axis in consideration.
-        //    2. Run 1 dimensional search (see scalar case) to find an (sub) optimal pair of end points.
-        //    3. Compute the vector of indexes (or clusters) for the current approximate ramp.
-        //    4. Present our color channels as 3 16DIM vectors.
-        //    5. Find closest approximation of each of 16DIM color vector with the projection of the 16DIM index vector.
-        //    6. Plug the projections as a new directional vector for the axis.
-        //    7. Goto 1.
-        //    D - is 16 dim "index" vector (or 16 DIM vector of indexes - {0, 1/3, 2/3, 0, ...,}, but shifted and normalized).
-        //    Ci - is a 16 dim vector of color i.
-        //    for each Ci find a scalar Ai such that
-        //    (Ai * D - Ci) (Ai * D - Ci) -> min , i.e distance between vector AiD and C is min.
-        //    You can think of D as a unit interval(vector) "clusterizer",
-        //    and Ai is a scale you need to apply to the clusterizer to
-        //    approximate the Ci vector instead of the unit vector.
-        //    Solution is
-        //    Ai = (D . Ci) / (D . D); . - is a dot product.
-        //    in 3 dim space Ai(s) represent a line direction, along which
-        //    we again try to find (sub)optimal quantizer.
-
-        //    That's what our for(;;) loop is about.
-        for (;;)
-        {
-            //  1. Project input set on the axis in consideration.
-            // From Foley & Van Dam: Closest point of approach of a line (P + v) to a point (R) is
-            //                            P + ((R-P).v) / (v.v))v
-            // The distance along v is therefore (R-P).v / (v.v)
-            // (v.v) is 1 if v is a unit vector.
-            //
-            PrjBnd[0] = 1000.;
-            PrjBnd[1] = -1000.;
-            for (i = 0; i < BLOCK_SIZE_4X4; i++)
-                Prj0[i] = Prj[i] = PrjErr[i] = PreMRep[i] = 0.f;
-
-            for (i = 0; i < dwUniqueColors; i++)
-            {
-                Prj0[i] = Prj[i] = BlkSh[i][0] * LineDir[0] + BlkSh[i][1] * LineDir[1] + BlkSh[i][2] * LineDir[2];
-
-                PrjErr[i] = (BlkSh[i][0] - LineDir[0] * Prj[i]) * (BlkSh[i][0] - LineDir[0] * Prj[i]) +
-                            (BlkSh[i][1] - LineDir[1] * Prj[i]) * (BlkSh[i][1] - LineDir[1] * Prj[i]) +
-                            (BlkSh[i][2] - LineDir[2] * Prj[i]) * (BlkSh[i][2] - LineDir[2] * Prj[i]);
-
-                PrjBnd[0] = min(PrjBnd[0], Prj[i]);
-                PrjBnd[1] = max(PrjBnd[1], Prj[i]);
-            }
-
-            //  2. Run 1 dimensional search (see scalar case) to find an (sub) optimal pair of end points.
-
-            // min and max of the search interval
-            CGU_FLOAT stepf = 0.125f;
-
-            CGU_FLOAT Scl[NUM_ENDPOINTS];
-            Scl[0] = PrjBnd[0] - (PrjBnd[1] - PrjBnd[0]) * stepf;
-            Scl[1] = PrjBnd[1] + (PrjBnd[1] - PrjBnd[0]) * stepf;
-
-            // No range found exit
-            if (Scl[0] == Scl[1])
-            {
-                return false;
-            }
-
-            // compute scaling factor to scale down the search interval to [0.,1]
-            const CGU_FLOAT Scl2    = (Scl[1] - Scl[0]) * (Scl[1] - Scl[0]);
-            const CGU_FLOAT overScl = 1.f / (Scl[1] - Scl[0]);
-
-            for (i = 0; i < dwUniqueColors; i++)
-            {
-                // scale them
-                Prj[i] = (Prj[i] - Scl[0]) * overScl;
-                // premultiply the scale squire to plug into error computation later
-                PreMRep[i] = Rpt[i] * Scl2;
-            }
-
-            // scale first approximation of end points
-            PrjBnd[0] = (PrjBnd[0] - Scl[0]) * overScl;
-            PrjBnd[1] = (PrjBnd[1] - Scl[0]) * overScl;
-
-            // find the best endpoints
-            CGU_FLOAT Pos[NUM_ENDPOINTS];
-#if defined(ASPM_GPU)
-            CGU_FLOAT StepErr = _cpu_bc1ComputeBestEndpoints(Pos, PrjBnd, Prj, PrjErr, PreMRep, dwUniqueColors, dwNumPoints);
-#else
-            CGU_FLOAT StepErr = cpu_bc1ComputeBestEndpoints(Pos, PrjBnd, Prj, PrjErr, PreMRep, dwUniqueColors, dwNumPoints);
-#endif
-
-            // inverse the scaling
-            Pos[0] = Pos[0] * (Scl[1] - Scl[0]) + Scl[0];
-            Pos[1] = Pos[1] * (Scl[1] - Scl[0]) + Scl[0];
-
-            // did we find somthing better from the previous run?
-            if (StepErr + 0.001 < ErrG)
-            {
-                // yes, remember it
-                ErrG        = StepErr;
-                LineDirG[0] = LineDir[0];
-                LineDirG[1] = LineDir[1];
-                LineDirG[2] = LineDir[2];
-                PosG[0]     = Pos[0];
-                PosG[1]     = Pos[1];
-                //  3. Compute the vector of indexes (or clusters) for the current approximate ramp.
-                // indexes
-                const CGU_FLOAT step      = (Pos[1] - Pos[0]) / (CGU_FLOAT)(dwNumPoints - 1);
-                const CGU_FLOAT step_h    = step * (CGU_FLOAT)0.5;
-                const CGU_FLOAT rstep     = (CGU_FLOAT)1.0f / step;
-                const CGU_FLOAT overBlkTp = 1.f / (CGU_FLOAT)(dwNumPoints - 1);
-
-                // here the index vector is computed,
-                // shifted and normalized
-                CGU_FLOAT indxAvrg = (CGU_FLOAT)(dwNumPoints - 1) / 2.f;
-
-                for (i = 0; i < dwUniqueColors; i++)
-                {
-                    CGU_FLOAT del;
-                    //int n = (int)((b - _min_ex + (step*0.5f)) * rstep);
-                    if ((del = Prj0[i] - Pos[0]) <= 0)
-                        RmpIndxs[i] = 0.f;
-                    else if (Prj0[i] - Pos[1] >= 0)
-                        RmpIndxs[i] = (CGU_FLOAT)(dwNumPoints - 1);
-                    else
-                        RmpIndxs[i] = cmp_floor((del + step_h) * rstep);
-                    // shift and normalization
-                    RmpIndxs[i] = (RmpIndxs[i] - indxAvrg) * overBlkTp;
-                }
-
-                //  4. Present our color channels as 3 16DIM vectors.
-                //  5. Find closest aproximation of each of 16DIM color vector with the pojection of the 16DIM index vector.
-                CGU_FLOAT Crs[3], Len, Len2;
-                for (i = 0, Crs[0] = Crs[1] = Crs[2] = Len = 0.f; i < dwUniqueColors; i++)
-                {
-                    const CGU_FLOAT PreMlt = RmpIndxs[i] * Rpt[i];
-                    Len += RmpIndxs[i] * PreMlt;
-                    for (j = 0; j < 3; j++)
-                        Crs[j] += BlkSh[i][j] * PreMlt;
-                }
-
-                LineDir[0] = LineDir[1] = LineDir[2] = 0.f;
-                if (Len > 0.f)
-                {
-                    LineDir[0] = Crs[0] / Len;
-                    LineDir[1] = Crs[1] / Len;
-                    LineDir[2] = Crs[2] / Len;
-
-                    //  6. Plug the projections as a new directional vector for the axis.
-                    //  7. Goto 1.
-                    Len2 = LineDir[0] * LineDir[0] + LineDir[1] * LineDir[1] + LineDir[2] * LineDir[2];
-                    Len2 = sqrt(Len2);
-
-                    LineDir[0] /= Len2;
-                    LineDir[1] /= Len2;
-                    LineDir[2] /= Len2;
-                }
-            }
-            else  // We was not able to find anything better.  Drop dead.
-                break;
-        }
-
-        // inverse transform to find end-points of 3-color ramp
-        for (k = 0; k < 2; k++)
-            for (j = 0; j < 3; j++)
-                rsltC[j][k] = (PosG[k] * LineDirG[j] + Mdl[j]) * 255.f;
-    }
-
-    // We've dealt with (almost) unrestricted full precision realm.
-    // Now back to the dirty digital world.
-
-    // round the end points to make them look like compressed ones
-    CGU_FLOAT inpRmpEndPts[NUM_CHANNELS][NUM_ENDPOINTS];
-    cpu_MkRmpOnGrid(inpRmpEndPts, rsltC, 0.f, 255.f, nRedBits, nGreenBits, nBlueBits);
-
-    // Try using this on 3 channels
-    // static CGU_Vec2i cmp_getLinearEndPoints(CGU_FLOAT _Blk[BLOCK_SIZE_4X4], CMP_IN CGU_FLOAT fquality, CMP_IN CGU_BOOL isSigned);
-
-    // This not a small procedure squeezes and stretches the ramp along each axis (R,G,B) separately while other 2 are fixed.
-    // It does it only over coarse grid - 565 that is. It tries to squeeze more precision for the real world ramp.
-#if defined(USE_REFINE) || defined(USE_REFINE3D)
-    switch (nRefinementSteps)
-    {
-    case 1:
-        cmp_Refine(_RsltRmpPnts, inpRmpEndPts, src_image, Rpt, dwUniqueColors, dwNumPoints, pfWeights, nRedBits, nGreenBits, nBlueBits, 3);
-        break;
-    case 2:
-        if (dwUniqueColors > 2)
-            cmp_Refine3D(_RsltRmpPnts, inpRmpEndPts, src_image, Rpt, dwUniqueColors, dwNumPoints, pfWeights, nRedBits, nGreenBits, nBlueBits, 1);
-        else
-            cmp_Refine(_RsltRmpPnts, inpRmpEndPts, src_image, Rpt, dwUniqueColors, dwNumPoints, pfWeights, nRedBits, nGreenBits, nBlueBits, 3);
-        break;
-    default:
-        cmp_Refine(_RsltRmpPnts, inpRmpEndPts, src_image, Rpt, dwUniqueColors, dwNumPoints, pfWeights, nRedBits, nGreenBits, nBlueBits, 1);
-        break;
-    }
-#endif
-    return true;
-}
-
-// CPU: CompRGBBlock()
-CMP_STATIC CGU_FLOAT cpu_CompRGBBlock32(CGU_UINT32 block_32[16],
-                                        CGU_UINT32 compressedBlock[2],
-                                        CGU_UINT32 dwBlockSize,
-                                        CGU_UINT8  nRedBits,
-                                        CGU_UINT8  nGreenBits,
-                                        CGU_UINT8  nBlueBits,
-                                        CGU_UINT8  nEndpoints[3][NUM_ENDPOINTS],
-                                        CGU_UINT8  pcIndices[BLOCK_SIZE_4X4],
-                                        CGU_UINT8  dwNumPoints,
-                                        bool       b3DRefinement,
-                                        CGU_UINT8  m_nRefinementSteps,
-                                        CGU_FLOAT  _pfChannelWeights[3],
-                                        bool       _bUseAlpha,
-                                        CGU_UINT8  _nAlphaThreshold)
-{
-    CGU_FLOAT ALIGN_16 Rpt[BLOCK_SIZE_4X4];
-    CGU_FLOAT ALIGN_16 BlkIn[BLOCK_SIZE_4X4][NUM_CHANNELS];
-    CGU_UINT32         mx;
-    for (mx = 0; mx < BLOCK_SIZE_4X4; mx++)
-    {
-        Rpt[mx]      = 0;
-        BlkIn[mx][0] = 0;
-        BlkIn[mx][1] = 0;
-        BlkIn[mx][2] = 0;
-        BlkIn[mx][3] = 0;
-    }
-
-    compressedBlock[0] = 0;
-
-    CGU_UINT32 dwAlphaThreshold = _nAlphaThreshold << 24;
-    CGU_UINT32 dwColors         = 0;
-    CGU_UINT32 dwBlk[BLOCK_SIZE];
-    for (CGU_UINT32 i = 0; i < dwBlockSize; i++)
-        if (!_bUseAlpha || (block_32[i] & 0xff000000) >= dwAlphaThreshold)
-            dwBlk[dwColors++] = block_32[i] | 0xff000000;
-
-    // Do we have any colors ?
-    static int id = 0;
-    if (dwColors)
-    {
-        bool bHasAlpha = (dwColors != dwBlockSize);
-        if (bHasAlpha && _bUseAlpha && !(dwNumPoints & 0x1))
-            return CMP_FLT_MAX;
-
-            // Here we are computing an unique number of colors.
-            // For each unique value we compute the number of it appearences.
-            //qsort((void *)dwBlk, (size_t)dwColors, sizeof(CGU_UINT32), QSortIntCmp);
-#ifndef ASPM_GPU  // this is here for reminder when code moves to GPU
-        std::sort(dwBlk, dwBlk + 15);
-#else
-        {
-            CGU_UINT32 j;
-            CMP_di     what[BLOCK_SIZE_4X4];
-
-            for (i = 0; i < dwColors; i++)
-            {
-                what[i].index = i;
-                what[i].data  = dwBlk[i];
-            }
-
-            CGU_UINT32 tmp_index;
-            CGU_UINT32 tmp_data;
-
-            for (i = 1; i < dwColors; i++)
-            {
-                for (j = i; j > 0; j--)
-                {
-                    if (what[j - 1].data > what[j].data)
-                    {
-                        tmp_index         = what[j].index;
-                        tmp_data          = what[j].data;
-                        what[j].index     = what[j - 1].index;
-                        what[j].data      = what[j - 1].data;
-                        what[j - 1].index = tmp_index;
-                        what[j - 1].data  = tmp_data;
-                    }
-                }
-            }
-            for (i = 0; i < dwColors; i++)
-                dwBlk[i] = what[i].data;
-        }
-#endif
-
-        CGU_UINT32 new_p;
-        CGU_UINT32 dwBlkU[BLOCK_SIZE_4X4];
-        CGU_UINT32 dwUniqueColors = 0;
-        new_p = dwBlkU[0]   = dwBlk[0];
-        Rpt[dwUniqueColors] = 1.f;
-        CGU_UINT32 i;
-        for (i = 1; i < dwColors; i++)
-        {
-            if (new_p != dwBlk[i])
-            {
-                dwUniqueColors++;
-                new_p = dwBlkU[dwUniqueColors] = dwBlk[i];
-                Rpt[dwUniqueColors]            = 1.f;
-            }
-            else
-                Rpt[dwUniqueColors] += 1.f;
-        }
-        dwUniqueColors++;
-
-        // switch to float
-        for (i = 0; i < dwUniqueColors; i++)
-        {
-            BlkIn[i][RC] = (CGU_FLOAT)((dwBlkU[i] >> 16) & 0xff);  // R
-            BlkIn[i][GC] = (CGU_FLOAT)((dwBlkU[i] >> 8) & 0xff);   // G
-            BlkIn[i][BC] = (CGU_FLOAT)((dwBlkU[i] >> 0) & 0xff);   // B
-            BlkIn[i][AC] = 255.0f;
-        }
-
-        CGU_FLOAT rsltC[NUM_CHANNELS][NUM_ENDPOINTS];
-        if (cpu_CompressRGBBlockX(rsltC,               //  CMP_EndPoints = CompressRGBBlock_Slow2 (
-                                  BlkIn,               //  CGU_Vec3f  src_imageNorm[BLOCK_SIZE_4X4]
-                                  Rpt,                 //  CGU_FLOAT  Rpt[BLOCK_SIZE_4X4],
-                                  dwUniqueColors,      //  CGU_UINT32 dwUniqueColors,
-                                  dwNumPoints,         //  CGU_UINT32 dwNumPoints,
-                                  b3DRefinement,       //
-                                  m_nRefinementSteps,  //  CGU_UINT32 m_nRefinementSteps,
-                                  _pfChannelWeights,   //  CGU_Vec3f  channelWeightsBGR,
-                                  nRedBits,            //  );
-                                  nGreenBits,
-                                  nBlueBits,
-                                  1.0f))
-        {
-            // return to integer realm
-            for (int ch = 0; ch < 3; ch++)
-                for (int j = 0; j < 2; j++)
-                    nEndpoints[ch][j] = (CGU_UINT8)rsltC[ch][j];
-            //printf("Endpoints {%3d,%3d,%3d} {%3d,%3d,%3d} ", nEndpoints[0][0],nEndpoints[1][0],nEndpoints[2][0],
-            //                                                  nEndpoints[0][1],nEndpoints[1][1],nEndpoints[2][1]);
-
-            // Now get the indices using the new end points
-            return cpu_Clstr(
-                block_32, dwBlockSize, nEndpoints, pcIndices, dwNumPoints, _pfChannelWeights, _bUseAlpha, _nAlphaThreshold, nRedBits, nGreenBits, nBlueBits);
-        }
-        else
-        {
-            CGU_FLOAT CompErr = CMP_FLT_MAX;
-            if (dwNumPoints < 4)
-            {
-                CGU_Vec3f src_imageNorm[BLOCK_SIZE_4X4];
-
-                for (CGU_UINT32 px = 0; px < 16; px++)
-                {
-                    src_imageNorm[px].r = (CGU_FLOAT)((block_32[px] >> 16) & 0xff) / 255.0f;
-                    src_imageNorm[px].g = (CGU_FLOAT)((block_32[px] >> 8) & 0xff) / 255.0f;
-                    src_imageNorm[px].b = (CGU_FLOAT)((block_32[px] >> 0) & 0xff) / 255.0f;
-                }
-
-                // Do a quick compression test
-                CGU_Vec3f srcRGB[16];   // The list of source colors with blue channel altered
-                CGU_Vec3f average_rgb;  // The centrepoint of the axis
-                CGU_FLOAT errLQ = CMP_FLT_MAX;
-                cgu_CompressRGBBlock_MinMax(src_imageNorm, 1.0f, false, srcRGB, average_rgb, errLQ);
-                CGU_Vec2ui cmp = cgu_CompressRGBBlock_Fast(src_imageNorm, 1.0f, false, srcRGB, average_rgb, CompErr);
-
-                compressedBlock[0] = cmp.x;
-                compressedBlock[1] = cmp.y;
-            }
-            return CompErr;
-        }
-    }
-    else
-    {
-        // All colors transparent
-        nEndpoints[0][0] = nEndpoints[1][0] = nEndpoints[2][0] = 0;
-        nEndpoints[0][1] = nEndpoints[1][1] = nEndpoints[2][1] = 0xff;
-        for (CGU_UINT32 ms = 0; ms < dwBlockSize; ms++)
-            pcIndices[ms] = 0xff;
-        return 0.0;
-    }
-}
-
-CMP_STATIC CGU_Vec2ui cpu_CompRGBBlock(CMP_IN CGU_Vec4uc bgraBlock[BLOCK_SIZE_4X4], CMP_IN CMP_BC15Options BC15Options, CMP_INOUT CGU_FLOAT CMP_REFINOUT err)
-{
-    CGU_Vec2ui cmpBlock            = {0U, 0U};
-    CGU_FLOAT  pfChannelWeights[3] = {1.0f, 1.0f, 1.0f};
-    CGU_UINT8  nEndpoints[2][3][2];
-    CGU_UINT8  nIndices[2][BLOCK_SIZE_4X4];
-    CGU_UINT32 compressedBlock[2] = {0, 0};
-
-    CGU_FLOAT fError3 = CMP_FLT_MAX;
-
-    fError3 = cpu_CompRGBBlock32((CGU_UINT32*)bgraBlock,
-                                 compressedBlock,
-                                 BLOCK_SIZE_4X4,
-                                 RG,
-                                 GG,
-                                 BG,
-                                 nEndpoints[0],
-                                 nIndices[0],
-                                 3,
-                                 BC15Options.m_b3DRefinement,
-                                 BC15Options.m_nRefinementSteps,
-                                 pfChannelWeights,
-                                 BC15Options.m_bUseAlpha,
-                                 BC15Options.m_nAlphaThreshold);
-    // use case of small min max ranges
-    if (compressedBlock[0] > 0)
-    {
-        //return cmpBlockBlue;
-        cmpBlock.x = compressedBlock[0];
-        cmpBlock.y = compressedBlock[1];
-        err        = fError3;
-    }
-    else
-    {
-        CGU_FLOAT fError4 = CMP_FLT_MAX;
-        fError4           = (fError3 == 0.0) ? CMP_FLT_MAX
-                                             : cpu_CompRGBBlock32((CGU_UINT32*)bgraBlock,
-                                                        compressedBlock,
-                                                        BLOCK_SIZE_4X4,
-                                                        RG,
-                                                        GG,
-                                                        BG,
-                                                        nEndpoints[1],
-                                                        nIndices[1],
-                                                        4,
-                                                        BC15Options.m_b3DRefinement,
-                                                        BC15Options.m_nRefinementSteps,
-                                                        pfChannelWeights,
-                                                        BC15Options.m_bUseAlpha,
-                                                        BC15Options.m_nAlphaThreshold);
-
-        CGU_UINT32 nMethod;
-        if (fError3 <= fError4)
-        {
-            err     = fError3;
-            nMethod = 0;
-        }
-        else
-        {
-            err     = fError4;
-            nMethod = 1;
-        }
-
-        CGU_UINT32 c0 =
-            BC1ConstructColour((nEndpoints[nMethod][RC][0] >> (8 - RG)), (nEndpoints[nMethod][GC][0] >> (8 - GG)), (nEndpoints[nMethod][BC][0] >> (8 - BG)));
-        CGU_UINT32 c1 =
-            BC1ConstructColour((nEndpoints[nMethod][RC][1] >> (8 - RG)), (nEndpoints[nMethod][GC][1] >> (8 - GG)), (nEndpoints[nMethod][BC][1] >> (8 - BG)));
-        if (nMethod == 1 && c0 <= c1 || nMethod == 0 && c0 > c1)
-            compressedBlock[0] = c1 | (c0 << 16);
-        else
-            compressedBlock[0] = c0 | (c1 << 16);
-
-        compressedBlock[1] = 0;
-        for (CGU_UINT32 i = 0; i < 16; i++)
-            compressedBlock[1] |= (nIndices[nMethod][i] << (2 * i));
-
-        cmpBlock.x = compressedBlock[0];
-        cmpBlock.y = compressedBlock[1];
-    }
-
-    return cmpBlock;
-}
-
-#endif
-
-#ifdef ENABLE_NEW_CODE
-
-//---------------------------------------- Common Utility Code -------------------------------------------------------
-// 1 - Dim error
-CMP_STATIC CGU_FLOAT cgu_RampSrchW(CGU_FLOAT  Prj[BLOCK_SIZE_4X4],
-                                   CGU_FLOAT  PrjErr[BLOCK_SIZE_4X4],
-                                   CGU_FLOAT  PreMRep[BLOCK_SIZE_4X4],
-                                   CGU_FLOAT  StepErr,
-                                   CGU_FLOAT  lowPosStep,
-                                   CGU_FLOAT  highPosStep,
-                                   CGU_UINT32 dwUniqueColors,
-                                   CGU_UINT32 dwNumPoints)
-{
-    CGU_FLOAT error  = 0;
-    CGU_FLOAT step   = (highPosStep - lowPosStep) / (dwNumPoints - 1);
-    CGU_FLOAT step_h = step * (CGU_FLOAT)0.5;
-    CGU_FLOAT rstep  = (CGU_FLOAT)1.0f / step;
-
-    for (CGU_UINT32 i = 0; i < dwUniqueColors; i++)
-    {
-        CGU_FLOAT v;
-        // Work out which value in the block this select
-        CGU_FLOAT del;
-
-        if ((del = Prj[i] - lowPosStep) <= 0)
-            v = lowPosStep;
-        else if (Prj[i] - highPosStep >= 0)
-            v = highPosStep;
-        else
-            v = cmp_floor((del + step_h) * rstep) * step + lowPosStep;
-
-        // And accumulate the error
-        CGU_FLOAT d = (Prj[i] - v);
-        d *= d;
-        CGU_FLOAT err = PreMRep[i] * d + PrjErr[i];
-        error += err;
-        if (StepErr < error)
-        {
-            error = StepErr;
-            break;
-        }
-    }
-    return error;
-}
-
-CMP_STATIC CGU_UINT32 cgu_processCluster(CMP_IN CMP_EndPoints           EndPoints,
-                                         CMP_IN CGU_Vec4f               rgbBlock_normal[BLOCK_SIZE_4X4],
-                                         CMP_IN CGU_UINT32              dwAlphaThreshold,
-                                         CMP_IN CGU_Vec3f               channelWeights,
-                                         CMP_IN CGU_UINT8               indices[BLOCK_SIZE_4X4],
-                                         CMP_OUT CGU_FLOAT CMP_REFINOUT Err)
-{
-    Err                  = 0.f;
-    CGU_UINT32 pcIndices = 0;
-    CGU_UINT32 R, G, B;
-
-    R                  = (CGU_UINT32)(EndPoints.Color0.z);
-    G                  = (CGU_UINT32)(EndPoints.Color0.y);
-    B                  = (CGU_UINT32)(EndPoints.Color0.x);
-    CGU_INT32 cluster0 = cmp_constructColor(R, G, B);
-
-    R                  = (CGU_UINT32)(EndPoints.Color1.z);
-    G                  = (CGU_UINT32)(EndPoints.Color1.y);
-    B                  = (CGU_UINT32)(EndPoints.Color1.x);
-    CGU_INT32 cluster1 = cmp_constructColor(R, G, B);
-
-    CGU_Vec3f InpRmp[NUM_ENDPOINTS];
-    if ((cluster0 <= cluster1)  // valid for 4 channels
-                                // || (cluster0 > cluster1)    // valid for 3 channels
-    )
-    {
-        // inverse endpoints
-        InpRmp[0] = EndPoints.Color1;
-        InpRmp[1] = EndPoints.Color0;
-    }
-    else
-    {
-        InpRmp[0] = EndPoints.Color0;
-        InpRmp[1] = EndPoints.Color1;
-    }
-
-    CGU_Vec3f srcblockLinear[BLOCK_SIZE_4X4];
-    CGU_FLOAT srcblockA[BLOCK_SIZE_4X4];
-
-    // Swizzle the source RGB to BGR for processing
-    for (CGU_UINT32 i = 0; i < BLOCK_SIZE_4X4; i++)
-    {
-        srcblockLinear[i].z = rgbBlock_normal[i].x * 255.0f;
-        srcblockLinear[i].y = rgbBlock_normal[i].y * 255.0f;
-        srcblockLinear[i].x = rgbBlock_normal[i].z * 255.0f;
-        srcblockA[i]        = 0.0f;
-        //if (dwAlphaThreshold > 0)
-        //{
-        //    CGU_UINT32 alpha = (CGU_UINT32)BlockA[i];
-        //    if (alpha >= dwAlphaThreshold)
-        //        srcblockA[i] = BlockA[i];
-        //}
-    }
-
-    // cmp_ClstrBas2()
-    // input ramp is on the coarse grid
-    // make ramp endpoints the way they'll going to be decompressed
-    CGU_Vec3f InpRmpL[NUM_ENDPOINTS];
-    CGU_Vec3f Fctrs = {32.0F, 64.0F, 32.0F};  // 1 << RG,1 << GG,1 << BG
-
-    {
-        //   ConstantRamp = MkWkRmpPts(InpRmpL, InpRmp);
-        InpRmpL[0] = InpRmp[0] + cmp_floorVec3f(InpRmp[0] / Fctrs);
-        InpRmpL[0] = cmp_clampVec3f(InpRmpL[0], 0.0f, 255.0f);
-        InpRmpL[1] = InpRmp[1] + cmp_floorVec3f(InpRmp[1] / Fctrs);
-        InpRmpL[1] = cmp_clampVec3f(InpRmpL[1], 0.0f, 255.0f);
-    }  // MkWkRmpPts
-
-    // build ramp
-    CGU_Vec3f LerpRmp[4];
-    CGU_Vec3f offset = {1.0f, 1.0f, 1.0f};
-    {
-        //BldRmp(Rmp, InpRmpL, dwNumChannels);
-        // linear interpolate end points to get the ramp
-        LerpRmp[0] = InpRmpL[0];
-        LerpRmp[3] = InpRmpL[1];
-        LerpRmp[1] = cmp_floorVec3f((InpRmpL[0] * 2.0f + LerpRmp[3] + offset) / 3.0f);
-        LerpRmp[2] = cmp_floorVec3f((InpRmpL[0] + LerpRmp[3] * 2.0f + offset) / 3.0f);
-    }  // BldRmp
-
-    //=========================================================================
-    // Clusterize, Compute error and find DXTC indexes for the current cluster
-    //=========================================================================
-    {
-        // Clusterize
-        CGU_UINT32 alpha;
-
-        // For each colour in the original block assign it
-        // to the closest cluster and compute the cumulative error
-        for (CGU_UINT32 i = 0; i < BLOCK_SIZE_4X4; i++)
-        {
-            alpha = (CGU_UINT32)srcblockA[i];
-            if ((dwAlphaThreshold > 0) && alpha == 0)
-            {                                      //*((CGU_UINT32 *)&_Blk[i][AC]) == 0)
-                pcIndices |= cmp_set2Bit32(4, i);  // dwNumChannels 3 or 4 (default is 4)
-                indices[i] = 4;
-            }
-            else
-            {
-                CGU_FLOAT shortest      = 99999999999.f;
-                CGU_UINT8 shortestIndex = 0;
-
-                CGU_Vec3f channelWeightsBGR;
-                channelWeightsBGR.x = channelWeights.z;
-                channelWeightsBGR.y = channelWeights.y;
-                channelWeightsBGR.z = channelWeights.x;
-
-                for (CGU_UINT8 rampindex = 0; rampindex < 4; rampindex++)
-                {
-                    // r is either 1 or 4
-                    // calculate the distance for each component
-                    CGU_FLOAT distance = cmp_dotVec3f(((srcblockLinear[i] - LerpRmp[rampindex]) * channelWeightsBGR),
-                                                      ((srcblockLinear[i] - LerpRmp[rampindex]) * channelWeightsBGR));
-                    if (distance < shortest)
-                    {
-                        shortest      = distance;
-                        shortestIndex = rampindex;
-                    }
-                }
-
-                Err += shortest;
-
-                // The total is a sum of (error += shortest)
-                // We have the index of the best cluster, so assign this in the block
-                // Reorder indices to match correct DXTC ordering
-                if (shortestIndex == 3)  // dwNumChannels - 1
-                    shortestIndex = 1;
-                else if (shortestIndex)
-                    shortestIndex++;
-                pcIndices |= cmp_set2Bit32(shortestIndex, i);
-                indices[i] = shortestIndex;
-            }
-        }  // BLOCK_SIZE_4X4
-    }      // Clusterize
-
-    return pcIndices;
-}
-#endif
-
-// Process a rgbBlock which is normalized (0.0f ... 1.0f), signed normal is not implemented
-CMP_STATIC CGU_Vec2ui CompressBlockBC1_NORMALIZED(CMP_IN CGU_Vec4f src_imageNorm[BLOCK_SIZE_4X4], CMP_IN CMP_BC15Options BC15Options)
-{
-    bool usingMaxQualityOnly = false;
-
-#ifndef ASPM_GPU
-    if (BC15Options.m_fquality > 0.75)
-        usingMaxQualityOnly = true;
-#endif
-
-    CGU_FLOAT  CompErr      = CMP_FLT_MAX;
-    CGU_Vec2ui cmpBlock     = {0U, 0U};
-    CGU_Vec2ui cmpBlockTemp = {0U, 0U};
-    CGU_FLOAT  CompErrTemp;
-
-    // Transfer to RGB Norm from RGBA Norm
-    CGU_Vec3f  src_imageRGBNorm[16];
-    CGU_Vec4uc pixels[16];
-    CGU_Vec4uc pixelsBGRA[16];
-
-    for (CGU_UINT32 sr = 0; sr < 16; sr++)
-    {
-        src_imageRGBNorm[sr] = src_imageNorm[sr].rgb;
-        pixelsBGRA[sr].b = pixels[sr].r = src_imageNorm[sr].r * 255.0f;
-        pixelsBGRA[sr].g = pixels[sr].g = src_imageNorm[sr].g * 255.0f;
-        pixelsBGRA[sr].r = pixels[sr].b = src_imageNorm[sr].b * 255.0f;
-        pixelsBGRA[sr].a = pixels[sr].a = src_imageNorm[sr].a * 255.0f;
-    }
-
-    // check for a punch through transparent alpha setting
-    if ((BC15Options.m_fquality < 0.75) && (BC15Options.m_bUseAlpha))
-    {
-        CGU_Vec2ui cmpBlockAlpha = {0xffff0000, 0xffffffffU};
-        for (CGU_UINT32 sr = 0; sr < 16; sr++)
-            if (pixels[sr].a < BC15Options.m_nAlphaThreshold)
-            {
-                return cmpBlockAlpha;
-            }
-    }
-
-    //================
-    // extern codec
-    //================
-    // For debugging
-    // CGU_Vec2ui cmpBlockRed   = {0xF800F800,0x00000000};
-    // CGU_Vec2ui cmpBlockGreen = {0x7E007E00,0x00000000};
-    // CGU_Vec2ui cmpBlockBlue  = {0x1F001F00,0x00000000};
-
-    if (!BC15Options.m_bUseAlpha)
-    {
-        //==========================================
-        // Gain +0.3 dB for images with soild blocks
-        //==========================================
-        bool bAllColoursEqual = true;
-
-        // Load the whole 4x4 block
-        for (CGU_UINT32 i = 0u; (i < 16u) && bAllColoursEqual; ++i)
-        {
-            for (CGU_INT c = 0; c < 3; c++)
-                bAllColoursEqual = bAllColoursEqual && (pixels[0][c] == pixels[i][c]);
-        }
-
-        if (bAllColoursEqual)
-        {
-            cmpBlock = cgu_solidColorBlock(pixels[0].x, pixels[0].y, pixels[0].z);
-            CompErr  = cgu_RGBABlockErrorLinear(pixels, cmpBlock);
-            if (BC15Options.m_nRefinementSteps < 1)
-                return cmpBlock;
-        }
-    }
-
-    if (!usingMaxQualityOnly)
-    {
-        //====================================
-        // Get src image data, min,max...
-        //=====================================
-        //CMP_EncodeData edata;
-        //cmp_get_encode_data(edata,pixels);
-
-        if (!BC15Options.m_bUseAlpha)
-        {
-            //====================================
-            // Fast Compression, low quality
-            //=====================================
-            CGU_Vec3f srcRGB[16];   // The list of source colors with blue channel altered
-            CGU_Vec3f average_rgb;  // The centrepoint of the axis
-            CGU_FLOAT errLQ = CMP_FLT_MAX;
-            cmpBlockTemp    = cgu_CompressRGBBlock_MinMax(src_imageRGBNorm, BC15Options.m_fquality, BC15Options.m_bIsSRGB, srcRGB, average_rgb, errLQ);
-            if ((BC15Options.m_fquality < CMP_QUALITY0) || (errLQ == 0.0f))
-                return cmpBlockTemp;
-
-            if (CompErr > errLQ)
-            {
-                CompErr  = errLQ;
-                cmpBlock = cmpBlockTemp;
-            }
-
-            cmpBlockTemp = cgu_CompressRGBBlock_Fast(src_imageRGBNorm, BC15Options.m_fquality, BC15Options.m_bIsSRGB, srcRGB, average_rgb, errLQ);
-            if (CompErr > errLQ)
-            {
-                CompErr  = errLQ;
-                cmpBlock = cmpBlockTemp;
-            }
-            if (BC15Options.m_fquality < CMP_QUALITY1)
-                return cmpBlock;
-        }
-
-        //========================================
-        // use GPU codec lower quality then CPU
-        //========================================
-        cmpBlockTemp = cgu_CompRGBBlock(src_imageNorm, BC15Options);
-        CompErrTemp  = cgu_RGBABlockErrorLinear(pixels, cmpBlockTemp);
-        if (CompErr > CompErrTemp)
-        {
-            CompErr  = CompErrTemp;
-            cmpBlock = cmpBlockTemp;
-        }
-
-        if (BC15Options.m_fquality < CMP_QUALITY2)
-            return cmpBlock;
-    }  // if useCGUCodecs
-
-    //====================================
-    // High Quality Codec CPU only
-    //=====================================
-#ifndef ASPM_GPU
-    cmpBlockTemp = cpu_CompRGBBlock(pixelsBGRA, BC15Options, CompErrTemp);
-
-    CompErrTemp = cgu_RGBABlockErrorLinear(pixels, cmpBlockTemp);
-
-    if (CompErr > CompErrTemp)
-    {
-        CompErr  = CompErrTemp;
-        cmpBlock = cmpBlockTemp;
-    }
-#endif
-
-    return cmpBlock;
-}
+//=====================================================================
+// Copyright (c) 2020-2024    Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files(the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions :
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//
+// File: bc1_cmp.h
+//--------------------------------------------------------------------------------------
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+//--------------------------------------------------------------------------------------
+
+#define USE_CMP
+
+#include "common_def.h"
+#include "bcn_common_kernel.h"
+#include "bcn_common_api.h"
+
+#ifndef ASPM_GPU
+#include "cpu_extensions.h"
+#include "core_simd.h"
+#endif
+
+//-----------------------------------------------------------------------
+// GPU/shader builds define ALIGN_* as empty. CPU builds lack some API calls common to GPU,
+// so they include the CPU CMP_Core replacements and map ALIGN_* to alignment specifiers.
+//-----------------------------------------------------------------------
+#if defined(ASPM_GPU) || defined(ASPM_HLSL) || defined(ASPM_OPENCL)
+#define ALIGN_16
+#define ALIGN_32
+#define ALIGN_64
+#else
+#include INC_cmp_math_func
+#if defined(_WIN32) || defined(_WIN64)
+#define ALIGN_16 __declspec(align(16))
+#define ALIGN_32 __declspec(align(32))
+#define ALIGN_64 __declspec(align(64))
+#else  // !_WIN32 && !_WIN64
+#define ALIGN_16 __attribute__((aligned(16)))
+#define ALIGN_32 __attribute__((aligned(32)))
+#define ALIGN_64 __attribute__((aligned(64)))
+#endif  // !_WIN32 && !_WIN64
+#endif  // ASPM_GPU || ASPM_HLSL || ASPM_OPENCL
+
+#define USE_REFINE3D
+#define USE_REFINE
+
+#ifndef MAX_ERROR
+#define MAX_ERROR 128000.f
+#endif
+
+#define NUM_CHANNELS 4
+#define NUM_ENDPOINTS 2
+
+#ifndef CMP_QUALITY0
+#define CMP_QUALITY0 0.25f
+#endif
+
+#ifndef CMP_QUALITY1
+#define CMP_QUALITY1 0.50f
+#endif
+
+#ifndef CMP_QUALITY2
+#define CMP_QUALITY2 0.75f
+#endif
+// NOTE: EPS and EPS2 expand without enclosing parentheses; use them only as a trailing multiplicative factor.
+#define EPS (2.f / 255.f) * (2.f / 255.f)
+#define EPS2 3.f * (2.f / 255.f) * (2.f / 255.f)
+
+// SIMD dispatch is only compiled for non-GPU builds
+#if !defined(ASPM_GPU)
+CMP_STATIC CGU_BOOL g_bc1FunctionPointersSet = false;  // becomes true once bc1ToggleSIMD() has run
+
+// fallback implementation, installed when no SIMD variation is selected
+CMP_STATIC CGU_FLOAT _cpu_bc1ComputeBestEndpoints(CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, int, int);
+
+// implementation currently selected; initially null, assigned by bc1ToggleSIMD()
+CMP_STATIC CGU_FLOAT (*cpu_bc1ComputeBestEndpoints)(CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, int, int) = 0;
+
+// Selects the bc1ComputeBestEndpoints implementation. Pass a specific EXTENSION_* value to request that instruction set, or EXTENSION_COUNT to auto-detect.
+// Returns false only when a specifically requested extension is not supported by the current CPU (the fallback implementation is installed in that case).
+CMP_STATIC bool bc1ToggleSIMD(CGU_INT newExtension)
+{
+    CPUExtensions extensions = GetCPUExtensions();
+
+    // Metallicafan212: SSE/AVX variations are only selected on x86 targets; every other target always uses the fallback
+#if AMD_COMPRESSONATOR_AMD64 || AMD_COMPRESSONATOR_X86
+    CGU_BOOL useAVX512 = true;
+    CGU_BOOL useAVX2   = true;
+    CGU_BOOL useSSE42  = true;
+
+    if (newExtension < EXTENSION_COUNT)  // user requested a specific instruction set extension
+    {
+        useAVX512 = newExtension == EXTENSION_AVX512_F;
+        useAVX2   = newExtension == EXTENSION_AVX2;
+        useSSE42  = newExtension == EXTENSION_SSE42;
+    }
+
+    if (useAVX512 && IsAvailableAVX512(extensions))
+    {
+        cpu_bc1ComputeBestEndpoints = avx512_bc1ComputeBestEndpoints;
+    }
+    else if (useAVX2 && IsAvailableAVX2(extensions))
+    {
+        cpu_bc1ComputeBestEndpoints = avx_bc1ComputeBestEndpoints;
+    }
+    else if (useSSE42 && IsAvailableSSE4(extensions))
+    {
+        cpu_bc1ComputeBestEndpoints = sse_bc1ComputeBestEndpoints;
+    }
+    else
+    {
+        cpu_bc1ComputeBestEndpoints = _cpu_bc1ComputeBestEndpoints;
+    }
+#else
+    CGU_BOOL useAVX512 = false;
+    CGU_BOOL useAVX2   = false;
+    CGU_BOOL useSSE42  = false;
+    cpu_bc1ComputeBestEndpoints = _cpu_bc1ComputeBestEndpoints;
+#endif
+
+    g_bc1FunctionPointersSet = true;
+
+    bool result = true;
+    // Only a specific request can fail; the availability tests are grouped so auto-detection (EXTENSION_COUNT) never reports failure.
+    if (newExtension != EXTENSION_COUNT &&
+        ((useAVX512 && !IsAvailableAVX512(extensions)) || (useAVX2 && !IsAvailableAVX2(extensions)) || (useSSE42 && !IsAvailableSSE4(extensions))))
+        result = false;
+
+    return result;
+}
+#endif
+
+// Sums, over the unique colors, PreMRep[i] * (Prj[i] - nearest ramp value)^2 + PrjErr[i] for the four-value ramp
+// between lowPosStep and highPosStep. Returns StepErr as soon as the running sum exceeds it.
+static CGU_FLOAT cgu_getRampErr(CGU_FLOAT  Prj[BLOCK_SIZE_4X4],
+                                CGU_FLOAT  PrjErr[BLOCK_SIZE_4X4],
+                                CGU_FLOAT  PreMRep[BLOCK_SIZE_4X4],
+                                CGU_FLOAT  StepErr,
+                                CGU_FLOAT  lowPosStep,
+                                CGU_FLOAT  highPosStep,
+                                CGU_UINT32 dwUniqueColors)
+{
+    CGU_FLOAT rampStep     = (highPosStep - lowPosStep) / 3;  // three intervals between the four ramp values
+    CGU_FLOAT halfRampStep = rampStep * (CGU_FLOAT)0.5;
+    CGU_FLOAT invRampStep  = (CGU_FLOAT)1.0f / rampStep;
+    CGU_FLOAT total        = 0;
+
+    for (CGU_UINT32 n = 0; n < dwUniqueColors; n++)
+    {
+        // Snap the projection to the ramp, clamping to its two ends.
+        CGU_FLOAT fromLow = Prj[n] - lowPosStep;
+        CGU_FLOAT snapped;
+        if (fromLow <= 0)
+            snapped = lowPosStep;
+        else if (Prj[n] - highPosStep >= 0)
+            snapped = highPosStep;
+        else
+            snapped = cmp_floor((fromLow + halfRampStep) * invRampStep) * rampStep + lowPosStep;
+
+        // Accumulate this point's contribution.
+        CGU_FLOAT delta = (Prj[n] - snapped);
+        delta *= delta;
+        CGU_FLOAT pointErr = PreMRep[n] * delta + PrjErr[n];
+        total += pointErr;
+
+        // This candidate already exceeds the bound: report the bound itself and stop.
+        if (StepErr < total)
+            return StepErr;
+    }
+    return total;
+}
+// Finds two end points for a block of unique colors: picks an axis through them, iteratively refines end-point positions along it, then rounds both onto the Fctrs0/Fctrs1 grid. channelWeightsBGR and b3DRefinement are unused.
+CMP_STATIC CMP_EndPoints cgu_CompressRGBBlockX(CMP_IN CGU_Vec3f  BlkInBGRf_UV[BLOCK_SIZE_4X4],
+                                               CMP_IN CGU_FLOAT  Rpt[BLOCK_SIZE_4X4],
+                                               CMP_IN CGU_UINT32 dwUniqueColors,
+                                               CMP_IN CGU_Vec3f  channelWeightsBGR,
+                                               CMP_IN CGU_BOOL   b3DRefinement)
+{
+    CMP_UNUSED(channelWeightsBGR);
+    CMP_UNUSED(b3DRefinement);
+    CGU_FLOAT ALIGN_16 Prj0[BLOCK_SIZE_4X4];
+    CGU_FLOAT ALIGN_16 Prj[BLOCK_SIZE_4X4];
+    CGU_FLOAT ALIGN_16 PrjErr[BLOCK_SIZE_4X4];
+    CGU_FLOAT ALIGN_16 RmpIndxs[BLOCK_SIZE_4X4];
+
+    CGU_Vec3f LineDirG;
+    CGU_Vec3f LineDir;
+    CGU_FLOAT LineDir0[NUM_CHANNELS];
+    CGU_Vec3f BlkUV[BLOCK_SIZE_4X4];
+    CGU_Vec3f BlkSh[BLOCK_SIZE_4X4];
+    CGU_Vec3f Mdl;
+
+    CGU_Vec3f  rsltC0;
+    CGU_Vec3f  rsltC1;
+    CGU_Vec3f  PosG0 = {0.0f, 0.0f, 0.0f};
+    CGU_Vec3f  PosG1 = {0.0f, 0.0f, 0.0f};
+    CGU_UINT32 i;
+
+    for (i = 0; i < dwUniqueColors; i++)
+    {
+        BlkUV[i] = BlkInBGRf_UV[i];
+    }
+
+    // with no more than 2 different colors, we are done
+    if (dwUniqueColors <= 2)
+    {
+        rsltC0 = BlkInBGRf_UV[0] * 255.0f;
+        rsltC1 = BlkInBGRf_UV[dwUniqueColors - 1] * 255.0f;
+    }
+    else
+    {
+        //    This is our first attempt to find an axis we will go along.
+        //    The cumulation is done to find a line minimizing the MSE from the
+        //    input 3D points.
+
+        //    While trying to find the axis we found that the diameter of the input
+        //    set is quite small. Do not bother.
+
+        // FindAxisIsSmall(BlkSh, LineDir0, Mdl, Blk, Rpt,dwUniqueColors);
+        {
+            CGU_UINT32 ii;
+            CGU_UINT32 jj;
+            CGU_UINT32 kk;
+
+            // These vars cannot be Vec3 as index to them are varying
+            CGU_FLOAT Crrl[NUM_CHANNELS];
+            CGU_FLOAT RGB2[NUM_CHANNELS];
+
+            LineDir0[0] = LineDir0[1] = LineDir0[2] = RGB2[0] = RGB2[1] = RGB2[2] = Crrl[0] = Crrl[1] = Crrl[2] = Mdl.x = Mdl.y = Mdl.z = 0.f;
+
+            // sum position of all points
+            CGU_FLOAT fNumPoints = 0.0f;
+            for (ii = 0; ii < dwUniqueColors; ii++)
+            {
+                Mdl.x += BlkUV[ii].x * Rpt[ii];
+                Mdl.y += BlkUV[ii].y * Rpt[ii];
+                Mdl.z += BlkUV[ii].z * Rpt[ii];
+                fNumPoints += Rpt[ii];
+            }
+
+            // and then average to calculate center coordinate of block
+            Mdl /= fNumPoints;
+
+            for (ii = 0; ii < dwUniqueColors; ii++)
+            {
+                // calculate output block as offsets around block center
+                BlkSh[ii] = BlkUV[ii] - Mdl;
+
+                // compute correlation matrix
+                // RGB2 = sum of ((distance from point from center) squared)
+                RGB2[0] += BlkSh[ii].x * BlkSh[ii].x * Rpt[ii];
+                RGB2[1] += BlkSh[ii].y * BlkSh[ii].y * Rpt[ii];
+                RGB2[2] += BlkSh[ii].z * BlkSh[ii].z * Rpt[ii];
+
+                Crrl[0] += BlkSh[ii].x * BlkSh[ii].y * Rpt[ii];
+                Crrl[1] += BlkSh[ii].y * BlkSh[ii].z * Rpt[ii];
+                Crrl[2] += BlkSh[ii].z * BlkSh[ii].x * Rpt[ii];
+            }
+
+            // if set's diameter is small
+            CGU_UINT32 i0 = 0, i1 = 1;
+            CGU_FLOAT  mxRGB2 = 0.0f;
+
+            CGU_FLOAT fEPS = fNumPoints * EPS;
+            for (kk = 0, jj = 0; jj < 3; jj++)
+            {
+                if (RGB2[jj] >= fEPS)
+                    kk++;
+                else
+                    RGB2[jj] = 0.0f;
+
+                if (mxRGB2 < RGB2[jj])
+                {
+                    mxRGB2 = RGB2[jj];
+                    i0     = jj;
+                }
+            }
+
+            CGU_FLOAT fEPS2 = fNumPoints * EPS2;
+            CGU_BOOL  AxisIsSmall;
+
+            AxisIsSmall = (RGB2[0] < fEPS2);
+            AxisIsSmall = AxisIsSmall && (RGB2[1] < fEPS2);
+            AxisIsSmall = AxisIsSmall && (RGB2[2] < fEPS2);
+
+            // all are very small to avoid division on the small determinant
+            if (AxisIsSmall)
+            {
+                rsltC0 = BlkInBGRf_UV[0] * 255.0f;
+                rsltC1 = BlkInBGRf_UV[dwUniqueColors - 1] * 255.0f;
+            }
+            else
+            {
+                // !AxisIsSmall
+                if (kk == 1)  // really only 1 dimension
+                    LineDir0[i0] = 1.;
+                else if (kk == 2)
+                {  // really only 2 dimensions
+                    i1            = (RGB2[(i0 + 1) % 3] > 0.f) ? (i0 + 1) % 3 : (i0 + 2) % 3;
+                    CGU_FLOAT Crl = (i1 == (i0 + 1) % 3) ? Crrl[i0] : Crrl[(i0 + 2) % 3];
+                    LineDir0[i1]  = Crl / RGB2[i0];
+                    LineDir0[i0]  = 1.;
+                }
+                else
+                {
+                    CGU_FLOAT maxDet = 100000.f;
+                    CGU_FLOAT Cs[3];  // written in the loop below but never read in this function
+                    // select max det for precision. NOTE(review): maxDet starts at 100000.f, so i0 only changes when a Det exceeds that - confirm intended
+                    for (jj = 0; jj < 3; jj++)
+                    {
+                        // 3 = nDimensions
+                        CGU_FLOAT Det = RGB2[jj] * RGB2[(jj + 1) % 3] - Crrl[jj] * Crrl[jj];
+                        Cs[jj]        = cmp_fabs(Crrl[jj] / sqrt(RGB2[jj] * RGB2[(jj + 1) % 3]));
+                        if (maxDet < Det)
+                        {
+                            maxDet = Det;
+                            i0     = jj;
+                        }
+                    }
+
+                    // inverse correl matrix
+                    //  --      --       --      --
+                    //  |  A   B |       |  C  -B |
+                    //  |  B   C |  =>   | -B   A |
+                    //  --      --       --     --
+                    CGU_FLOAT mtrx1[2][2];
+                    CGU_FLOAT vc1[2];
+                    CGU_FLOAT vc[2];
+                    vc1[0] = Crrl[(i0 + 2) % 3];
+                    vc1[1] = Crrl[(i0 + 1) % 3];
+                    // C
+                    mtrx1[0][0] = RGB2[(i0 + 1) % 3];
+                    // A
+                    mtrx1[1][1] = RGB2[i0];
+                    // -B
+                    mtrx1[1][0] = mtrx1[0][1] = -Crrl[i0];
+                    // find a solution
+                    vc[0] = mtrx1[0][0] * vc1[0] + mtrx1[0][1] * vc1[1];
+                    vc[1] = mtrx1[1][0] * vc1[0] + mtrx1[1][1] * vc1[1];
+                    // normalize
+                    vc[0] /= maxDet;
+                    vc[1] /= maxDet;
+                    // find a line direction vector
+                    LineDir0[i0]           = 1.;
+                    LineDir0[(i0 + 1) % 3] = 1.;
+                    LineDir0[(i0 + 2) % 3] = vc[0] + vc[1];
+                }
+
+                // normalize direction vector
+                CGU_FLOAT Len = LineDir0[0] * LineDir0[0] + LineDir0[1] * LineDir0[1] + LineDir0[2] * LineDir0[2];
+                Len           = sqrt(Len);
+
+                LineDir0[0] = (Len > 0.f) ? LineDir0[0] / Len : 0.0f;
+                LineDir0[1] = (Len > 0.f) ? LineDir0[1] / Len : 0.0f;
+                LineDir0[2] = (Len > 0.f) ? LineDir0[2] / Len : 0.0f;
+            }
+        }  // FindAxisIsSmall
+
+        // GCC is being an awful being when it comes to goto-jumps.
+        // So please bear with this.
+        CGU_FLOAT          ErrG = 10000000.f;
+        CGU_FLOAT          PrjBnd0;
+        CGU_FLOAT          PrjBnd1;
+        CGU_FLOAT ALIGN_16 PreMRep[BLOCK_SIZE_4X4];
+
+        LineDir.x = LineDir0[0];
+        LineDir.y = LineDir0[1];
+        LineDir.z = LineDir0[2];
+
+        //    Here is the main loop.
+        //    1. Project input set on the axis in consideration.
+        //    2. Run 1 dimensional search (see scalar case) to find an (sub) optimal pair of end points.
+        //    3. Compute the vector of indexes (or clusters) for the current approximate ramp.
+        //    4. Present our color channels as 3 16DIM vectors.
+        //    5. Find closest approximation of each of 16DIM color vector with the projection of the 16DIM index vector.
+        //    6. Plug the projections as a new directional vector for the axis.
+        //    7. Goto 1.
+        //    D - is 16 dim "index" vector (or 16 DIM vector of indexes - {0, 1/3,2/3, 0, ...,}, but shifted and normalized).
+        //    Ci - is a 16 dim vector of color i. for each Ci find a scalar Ai such that (Ai * D - Ci) (Ai * D - Ci) -> min ,
+        //         i.e distance between vector AiD and C is min. You can think of D as a unit interval(vector) "clusterizer", and Ai is a scale
+        //         you need to apply to the clusterizer to approximate the Ci vector instead of the unit vector.
+        //    Solution is
+        //    Ai = (D . Ci) / (D . D); . - is a dot product.
+        //    in 3 dim space Ai(s) represent a line direction, along which
+        //    we again try to find (sub)optimal quantizer.
+        //    That's what our for(;;) loop is about.
+        for (;;)
+        {
+            //  1. Project input set on the axis in consideration.
+            // From Foley & Van Dam: Closest point of approach of a line (P + v) to a
+            // point (R) is
+            //                            P + ((R-P).v) / (v.v))v
+            // The distance along v is therefore (R-P).v / (v.v)
+            // (v.v) is 1 if v is a unit vector.
+            //
+            PrjBnd0 = 1000.0f;
+            PrjBnd1 = -1000.0f;
+            for (i = 0; i < BLOCK_SIZE_4X4; i++)
+                Prj0[i] = Prj[i] = PrjErr[i] = PreMRep[i] = 0.f;
+
+            for (i = 0; i < dwUniqueColors; i++)
+            {
+                Prj0[i] = Prj[i] = dot(BlkSh[i], LineDir);
+                PrjErr[i]        = dot(BlkSh[i] - LineDir * Prj[i], BlkSh[i] - LineDir * Prj[i]);
+                PrjBnd0          = min(PrjBnd0, Prj[i]);
+                PrjBnd1          = max(PrjBnd1, Prj[i]);
+            }
+
+            //  2. Run 1 dimensional search (see scalar case) to find an (sub) optimal
+            //  pair of end points.
+
+            // min and max of the search interval
+            CGU_FLOAT Scl0;
+            CGU_FLOAT Scl1;
+            Scl0 = PrjBnd0 - (PrjBnd1 - PrjBnd0) * 0.125f;
+            Scl1 = PrjBnd1 + (PrjBnd1 - PrjBnd0) * 0.125f;
+
+            // compute scaling factor to scale down the search interval to [0.,1]
+            const CGU_FLOAT Scl2    = (Scl1 - Scl0) * (Scl1 - Scl0);
+            const CGU_FLOAT overScl = 1.f / (Scl1 - Scl0);
+
+            for (i = 0; i < dwUniqueColors; i++)
+            {
+                // scale them
+                Prj[i] = (Prj[i] - Scl0) * overScl;
+                // premultiply the scale square to plug into error computation later
+                PreMRep[i] = Rpt[i] * Scl2;
+            }
+
+            // scale first approximation of end points
+            PrjBnd0 = (PrjBnd0 - Scl0) * overScl;
+            PrjBnd1 = (PrjBnd1 - Scl0) * overScl;
+
+            CGU_FLOAT StepErr = MAX_ERROR;
+
+            // search step
+            CGU_FLOAT searchStep = 0.025f;
+
+            // low Start/End; high Start/End
+            const CGU_FLOAT lowStartEnd  = (PrjBnd0 - 2.f * searchStep > 0.f) ? PrjBnd0 - 2.f * searchStep : 0.f;
+            const CGU_FLOAT highStartEnd = (PrjBnd1 + 2.f * searchStep < 1.f) ? PrjBnd1 + 2.f * searchStep : 1.f;
+
+            // find the best endpoints
+            CGU_FLOAT Pos0 = 0;
+            CGU_FLOAT Pos1 = 0;
+            CGU_FLOAT lowPosStep, highPosStep;
+            CGU_FLOAT err;
+
+            int l, h;
+            for (l = 0, lowPosStep = lowStartEnd; l < 8; l++, lowPosStep += searchStep)
+            {
+                for (h = 0, highPosStep = highStartEnd; h < 8; h++, highPosStep -= searchStep)
+                {
+                    // compute an error for the current pair of end points.
+                    err = cgu_getRampErr(Prj, PrjErr, PreMRep, StepErr, lowPosStep, highPosStep, dwUniqueColors);
+
+                    if (err < StepErr)
+                    {
+                        // save better result
+                        StepErr = err;
+                        Pos0    = lowPosStep;
+                        Pos1    = highPosStep;
+                    }
+                }
+            }
+
+            // inverse the scaling
+            Pos0 = Pos0 * (Scl1 - Scl0) + Scl0;
+            Pos1 = Pos1 * (Scl1 - Scl0) + Scl0;
+
+            // did we find something better than the previous run?
+            if (StepErr + 0.001 < ErrG)
+            {
+                // yes, remember it
+                ErrG     = StepErr;
+                LineDirG = LineDir;
+
+                PosG0.x = Pos0;
+                PosG0.y = Pos0;
+                PosG0.z = Pos0;
+                PosG1.x = Pos1;
+                PosG1.y = Pos1;
+                PosG1.z = Pos1;
+
+                //  3. Compute the vector of indexes (or clusters) for the current
+                //  approximate ramp.
+                // indexes
+                const CGU_FLOAT step      = (Pos1 - Pos0) / 3.0f;  // (dwNumChannels=4 - 1);
+                const CGU_FLOAT step_h    = step * (CGU_FLOAT)0.5;
+                const CGU_FLOAT rstep     = (CGU_FLOAT)1.0f / step;
+                const CGU_FLOAT overBlkTp = 1.f / 3.0f;  // (dwNumChannels=4 - 1);
+
+                // here the index vector is computed,
+                // shifted and normalized
+                CGU_FLOAT indxAvrg = 3.0f / 2.0f;  // (dwNumChannels=4 - 1);
+
+                for (i = 0; i < dwUniqueColors; i++)
+                {
+                    CGU_FLOAT del;
+                    // CGU_UINT32 n = (CGU_UINT32)((b - _min_ex + (step*0.5f)) * rstep);
+                    if ((del = Prj0[i] - Pos0) <= 0)
+                        RmpIndxs[i] = 0.f;
+                    else if (Prj0[i] - Pos1 >= 0)
+                        RmpIndxs[i] = 3.0f;  // (dwNumChannels=4 - 1);
+                    else
+                        RmpIndxs[i] = cmp_floor((del + step_h) * rstep);
+                    // shift and normalization
+                    RmpIndxs[i] = (RmpIndxs[i] - indxAvrg) * overBlkTp;
+                }
+
+                //  4. Present our color channels as 3 16 DIM vectors.
+                //  5. Find closest approximation of each of 16DIM color vector with the
+                //  projection of the 16DIM index vector.
+                CGU_Vec3f Crs = {0.0f, 0.0f, 0.0f};
+                CGU_FLOAT Len = 0.0f;
+
+                for (i = 0; i < dwUniqueColors; i++)
+                {
+                    const CGU_FLOAT PreMlt = RmpIndxs[i] * Rpt[i];
+                    Len += RmpIndxs[i] * PreMlt;
+                    Crs.x += BlkSh[i].x * PreMlt;
+                    Crs.y += BlkSh[i].y * PreMlt;
+                    Crs.z += BlkSh[i].z * PreMlt;
+                }
+
+                LineDir.x = LineDir.y = LineDir.z = 0.0f;
+                if (Len > 0.0f)
+                {
+                    CGU_FLOAT Len2;
+                    LineDir = Crs / Len;
+                    //  6. Plug the projections as a new directional vector for the axis.
+                    //  7. Goto 1.
+                    Len2 = dot(LineDir, LineDir);  // LineDir.x * LineDir.x + LineDir.y * LineDir.y + LineDir.z * LineDir.z;
+                    Len2 = sqrt(Len2);
+                    LineDir /= Len2;
+                }
+            }
+            else  // We were not able to find anything better.  Drop out.
+                break;
+        }
+
+        // inverse transform to find end-points of 3-color ramp
+        rsltC0 = (PosG0 * LineDirG + Mdl) * 255.f;
+        rsltC1 = (PosG1 * LineDirG + Mdl) * 255.f;
+    }  // !isDone
+
+    // We've dealt with (almost) unrestricted full precision realm.
+    // Now back digital world.
+
+    // round the end points to make them look like compressed ones
+    CGU_Vec3f inpRmpEndPts0 = {0.0f, 255.0f, 0.0f};
+    CGU_Vec3f inpRmpEndPts1 = {0.0f, 255.0f, 0.0f};
+    CGU_Vec3f Fctrs0        = {8.0f, 4.0f, 8.0f};     //(1 << (PIX_GRID - BG)); x (1 << (PIX_GRID - GG)); y (1 << (PIX_GRID - RG)); z
+    CGU_Vec3f Fctrs1        = {32.0f, 64.0f, 32.0f};  //(CGU_FLOAT)(1 << RG); z (CGU_FLOAT)(1 << GG); y (CGU_FLOAT)(1 << BG); x
+    CGU_FLOAT _Min          = 0.0f;
+    CGU_FLOAT _Max          = 255.0f;
+
+    {
+        // MkRmpOnGrid(inpRmpEndPts, rsltC, _Min, _Max);
+
+        inpRmpEndPts0 = cmp_floorVec3f(rsltC0);
+
+        if (inpRmpEndPts0.x <= _Min)
+            inpRmpEndPts0.x = _Min;
+        else
+        {
+            inpRmpEndPts0.x += cmp_floor(128.f / Fctrs1.x) - cmp_floor(inpRmpEndPts0.x / Fctrs1.x);
+            inpRmpEndPts0.x = min(inpRmpEndPts0.x, _Max);
+        }
+        if (inpRmpEndPts0.y <= _Min)
+            inpRmpEndPts0.y = _Min;
+        else
+        {
+            inpRmpEndPts0.y += cmp_floor(128.f / Fctrs1.y) - cmp_floor(inpRmpEndPts0.y / Fctrs1.y);
+            inpRmpEndPts0.y = min(inpRmpEndPts0.y, _Max);
+        }
+        if (inpRmpEndPts0.z <= _Min)
+            inpRmpEndPts0.z = _Min;
+        else
+        {
+            inpRmpEndPts0.z += cmp_floor(128.f / Fctrs1.z) - cmp_floor(inpRmpEndPts0.z / Fctrs1.z);
+            inpRmpEndPts0.z = min(inpRmpEndPts0.z, _Max);
+        }
+
+        inpRmpEndPts0 = cmp_floorVec3f(inpRmpEndPts0 / Fctrs0) * Fctrs0;
+
+        inpRmpEndPts1 = cmp_floorVec3f(rsltC1);
+        if (inpRmpEndPts1.x <= _Min)
+            inpRmpEndPts1.x = _Min;
+        else
+        {
+            inpRmpEndPts1.x += cmp_floor(128.f / Fctrs1.x) - cmp_floor(inpRmpEndPts1.x / Fctrs1.x);
+            inpRmpEndPts1.x = min(inpRmpEndPts1.x, _Max);
+        }
+        if (inpRmpEndPts1.y <= _Min)
+            inpRmpEndPts1.y = _Min;
+        else
+        {
+            inpRmpEndPts1.y += cmp_floor(128.f / Fctrs1.y) - cmp_floor(inpRmpEndPts1.y / Fctrs1.y);
+            inpRmpEndPts1.y = min(inpRmpEndPts1.y, _Max);
+        }
+        if (inpRmpEndPts1.z <= _Min)
+            inpRmpEndPts1.z = _Min;
+        else
+        {
+            inpRmpEndPts1.z += cmp_floor(128.f / Fctrs1.z) - cmp_floor(inpRmpEndPts1.z / Fctrs1.z);
+            inpRmpEndPts1.z = min(inpRmpEndPts1.z, _Max);
+        }
+
+        inpRmpEndPts1 = cmp_floorVec3f(inpRmpEndPts1 / Fctrs0) * Fctrs0;
+    }  // MkRmpOnGrid
+
+    CMP_EndPoints EndPoints;
+    EndPoints.Color0 = inpRmpEndPts0;
+    EndPoints.Color1 = inpRmpEndPts1;
+
+    return EndPoints;
+}
+
+// Rounds two floating-point end points onto the color grid implied by the per-channel bit depths.
+//   rsltC0, rsltC1                  : end points to quantize; each channel ends up within 0..255
+//   nRedBits, nGreenBits, nBlueBits : bits per channel; 5, 6, 5 keeps the built-in factor tables
+// Returns the quantized end points as Color0 (from rsltC0) and Color1 (from rsltC1).
+// Both end points go through the same per-channel procedure.
+CMP_STATIC CMP_EndPoints
+cgu_MkRmpOnGridBGR(CMP_IN CGU_Vec3f rsltC0, CMP_IN CGU_Vec3f rsltC1, CMP_IN CGU_UINT32 nRedBits, CMP_IN CGU_UINT32 nGreenBits, CMP_IN CGU_UINT32 nBlueBits)
+{
+    CGU_FLOAT lowerLimit = 0.0f;
+    CGU_FLOAT upperLimit = 255.0f;
+    CGU_Vec3f gridStep   = {8.0f, 4.0f, 8.0f};     // built-in table used for 5:6:5
+    CGU_Vec3f gridLevels = {32.0f, 64.0f, 32.0f};  // built-in table used for 5:6:5
+
+    // Any other bit depth replaces both tables.
+    // NOTE(review): the tables are indexed with RC/GC/BC while the function name says BGR - confirm the channel order.
+    if (!((nRedBits == 5) && (nGreenBits == 6) && (nBlueBits == 5)))
+    {
+        gridStep[RC]   = (CGU_FLOAT)(1 << (PIX_GRID - nRedBits));
+        gridStep[GC]   = (CGU_FLOAT)(1 << (PIX_GRID - nGreenBits));
+        gridStep[BC]   = (CGU_FLOAT)(1 << (PIX_GRID - nBlueBits));
+        gridLevels[RC] = (CGU_FLOAT)(1 << nRedBits);
+        gridLevels[GC] = (CGU_FLOAT)(1 << nGreenBits);
+        gridLevels[BC] = (CGU_FLOAT)(1 << nBlueBits);
+    }
+
+    // First end point: floor, then per channel either pin to the lower limit or apply the rounding
+    // adjustment floor(128 / levels) - floor(value / levels) and cap at the upper limit.
+    CGU_Vec3f snapped0 = cmp_floorVec3f(rsltC0);
+
+    // x channel
+    if (snapped0.x <= lowerLimit)
+        snapped0.x = lowerLimit;
+    else
+        snapped0.x = cmp_minf(snapped0.x + (cmp_floor(128.f / gridLevels.x) - cmp_floor(snapped0.x / gridLevels.x)), upperLimit);
+
+    // y channel
+    if (snapped0.y <= lowerLimit)
+        snapped0.y = lowerLimit;
+    else
+        snapped0.y = cmp_minf(snapped0.y + (cmp_floor(128.f / gridLevels.y) - cmp_floor(snapped0.y / gridLevels.y)), upperLimit);
+
+    // z channel
+    if (snapped0.z <= lowerLimit)
+        snapped0.z = lowerLimit;
+    else
+        snapped0.z = cmp_minf(snapped0.z + (cmp_floor(128.f / gridLevels.z) - cmp_floor(snapped0.z / gridLevels.z)), upperLimit);
+
+    // Truncate every channel down to a multiple of its grid step.
+    snapped0 = cmp_floorVec3f(snapped0 / gridStep) * gridStep;
+
+    // Second end point: identical procedure.
+    CGU_Vec3f snapped1 = cmp_floorVec3f(rsltC1);
+
+    // x channel
+    if (snapped1.x <= lowerLimit)
+        snapped1.x = lowerLimit;
+    else
+        snapped1.x = cmp_minf(snapped1.x + (cmp_floor(128.f / gridLevels.x) - cmp_floor(snapped1.x / gridLevels.x)), upperLimit);
+
+    // y channel
+    if (snapped1.y <= lowerLimit)
+        snapped1.y = lowerLimit;
+    else
+        snapped1.y = cmp_minf(snapped1.y + (cmp_floor(128.f / gridLevels.y) - cmp_floor(snapped1.y / gridLevels.y)), upperLimit);
+
+    // z channel
+    if (snapped1.z <= lowerLimit)
+        snapped1.z = lowerLimit;
+    else
+        snapped1.z = cmp_minf(snapped1.z + (cmp_floor(128.f / gridLevels.z) - cmp_floor(snapped1.z / gridLevels.z)), upperLimit);
+
+    snapped1 = cmp_floorVec3f(snapped1 / gridStep) * gridStep;
+
+    CMP_EndPoints onGrid;
+    onGrid.Color0 = snapped0;
+    onGrid.Color1 = snapped1;
+
+    return onGrid;
+}  // MkRmpOnGrid
+
+//===================================================================
+// Replaces CompressBlockBC1_RGBA_Internal()
+// if ((errLQ > 0.0f) && (fquality > CMP_QUALITY2)) code block
+//===================================================================
+CMP_STATIC CGU_Vec2ui cgu_CompRGBBlock(CMP_IN CGU_Vec4f src_imageNorm[BLOCK_SIZE_4X4], CMP_IN CMP_BC15Options BC15Options)
+{
+    //CGU_FLOAT  errLQ    = 1e6f;
+    CGU_UINT32 m_nRefinementSteps = BC15Options.m_nRefinementSteps;
+    CGU_UINT32 dwAlphaThreshold   = BC15Options.m_nAlphaThreshold;
+    CGU_Vec3f  channelWeights     = {BC15Options.m_fChannelWeights[0], BC15Options.m_fChannelWeights[1], BC15Options.m_fChannelWeights[2]};
+    CGU_BOOL   isSRGB             = BC15Options.m_bIsSRGB;
+
+    CGU_Vec3f  rgbBlock_normal[BLOCK_SIZE_4X4];
+    CGU_UINT32 nCmpIndices = 0;
+    CGU_UINT32 c0, c1;
+    // High Quality
+    CMP_EndPoints EndPoints = {{0, 0, 0xFF}, {0, 0, 0xFF}};
+    CGU_UINT32    i;
+
+    CGU_FLOAT ALIGN_16 Rpt[BLOCK_SIZE_4X4];
+    CGU_UINT32         pcIndices = 0;
+
+    m_nRefinementSteps = 0;
+
+    CGU_Vec3f BlkInBGRf_UV[BLOCK_SIZE_4X4];  // Normalized Block Input (0..1) in BGR channel format
+    // Default inidices & endpoints for Transparent Block
+    CGU_Vec3ui nEndpoints0 = {0, 0, 0};           // Endpoints are stored BGR as x,y,z
+    CGU_Vec3ui nEndpoints1 = {0xFF, 0xFF, 0xFF};  // Endpoints are stored BGR as x,y,z
+
+    for (i = 0; i < BLOCK_SIZE_4X4; i++)
+    {
+        Rpt[i] = 0.0f;
+    }
+
+    //===============================================================
+    // Check if we have more then 2 colors and process Alpha block
+    CGU_UINT32 dwColors = 0;
+    CGU_UINT32 dwBlk[BLOCK_SIZE_4X4];
+    CGU_UINT32 R, G, B, A;
+    for (i = 0; i < BLOCK_SIZE_4X4; i++)
+    {
+        // Do any color conversion prior to processing the block
+        rgbBlock_normal[i] = isSRGB ? cmp_linearToSrgb(src_imageNorm[i].rgb) : src_imageNorm[i].rgb;
+
+        R = (CGU_UINT32)(rgbBlock_normal[i].x * 255.0f);
+        G = (CGU_UINT32)(rgbBlock_normal[i].y * 255.0f);
+        B = (CGU_UINT32)(rgbBlock_normal[i].z * 255.0f);
+
+        //if (dwAlphaThreshold > 0)
+        //    A = (CGU_UINT32)src_imageNorm[i].w * 255.0f;
+        //else
+        A = 255;
+
+        // Punch Through Alpha in BC1 Codec (1 bit alpha)
+        //if ((dwAlphaThreshold == 0) || (A >= dwAlphaThreshold))
+        //{
+        // copy to local RGB data and have alpha set to 0xFF
+        dwBlk[dwColors++] = A << 24 | R << 16 | G << 8 | B;
+        //}
+    }
+
+    if (!dwColors)
+    {
+        // All are colors transparent
+        EndPoints.Color0.x = EndPoints.Color0.y = EndPoints.Color0.z = 0.0f;
+        EndPoints.Color1.x = EndPoints.Color1.y = EndPoints.Color0.z = 255.0f;
+        nCmpIndices                                                  = 0xFFFFFFFF;
+    }
+    else
+    {
+        // We have colors to process
+        nCmpIndices = 0;
+        // Punch Through Alpha Support ToDo
+        // CGU_BOOL bHasAlpha = (dwColors != BLOCK_SIZE_4X4);
+        // bHasAlpha = bHasAlpha && (dwAlphaThreshold > 0); // valid for  (dwNumChannels=4);
+        // if (bHasAlpha) {
+        //      CGU_Vec2ui  compBlock = {0xf800f800,0};
+        //     return compBlock;
+        // }
+
+        // Here we are computing an unique number of sorted colors.
+        // For each unique value we compute the number of it appearences.
+        // qsort((void *)dwBlk, (size_t)dwColors, sizeof(CGU_UINT32), QSortIntCmp);
+        {
+            CGU_UINT32 j;
+            CMP_di     what[BLOCK_SIZE_4X4];
+
+            for (i = 0; i < dwColors; i++)
+            {
+                what[i].index = i;
+                what[i].data  = dwBlk[i];
+            }
+
+            CGU_UINT32 tmp_index;
+            CGU_UINT32 tmp_data;
+
+            for (i = 1; i < dwColors; i++)
+            {
+                for (j = i; j > 0; j--)
+                {
+                    if (what[j - 1].data > what[j].data)
+                    {
+                        tmp_index         = what[j].index;
+                        tmp_data          = what[j].data;
+                        what[j].index     = what[j - 1].index;
+                        what[j].data      = what[j - 1].data;
+                        what[j - 1].index = tmp_index;
+                        what[j - 1].data  = tmp_data;
+                    }
+                }
+            }
+            for (i = 0; i < dwColors; i++)
+                dwBlk[i] = what[i].data;
+        }
+        CGU_UINT32 new_p;
+        CGU_UINT32 dwBlkU[BLOCK_SIZE_4X4];
+        CGU_UINT32 dwUniqueColors = 0;
+        new_p = dwBlkU[0]   = dwBlk[0];
+        Rpt[dwUniqueColors] = 1.f;
+        for (i = 1; i < dwColors; i++)
+        {
+            if (new_p != dwBlk[i])
+            {
+                dwUniqueColors++;
+                new_p = dwBlkU[dwUniqueColors] = dwBlk[i];
+                Rpt[dwUniqueColors]            = 1.f;
+            }
+            else
+                Rpt[dwUniqueColors] += 1.f;
+        }
+        dwUniqueColors++;
+
+        // Simple case of only 2 colors to process
+        // no need for futher processing as lowest quality methods work best for this case
+        if (dwUniqueColors <= 2)
+        {
+            CGU_Vec3f rsltC0;
+            CGU_Vec3f rsltC1;
+            rsltC0.r  = rgbBlock_normal[0].b * 255.0f;
+            rsltC0.g  = rgbBlock_normal[0].g * 255.0f;
+            rsltC0.b  = rgbBlock_normal[0].r * 255.0f;
+            rsltC1.r  = rgbBlock_normal[dwUniqueColors - 1].b * 255.0f;
+            rsltC1.g  = rgbBlock_normal[dwUniqueColors - 1].g * 255.0f;
+            rsltC1.b  = rgbBlock_normal[dwUniqueColors - 1].r * 255.0f;
+            EndPoints = cgu_MkRmpOnGridBGR(rsltC0, rsltC1, 5, 6, 5);
+        }
+        else
+        {
+            // switch from int range back to UV floats
+            for (i = 0; i < dwUniqueColors; i++)
+            {
+                R                 = (dwBlkU[i] >> 16) & 0xff;
+                G                 = (dwBlkU[i] >> 8) & 0xff;
+                B                 = (dwBlkU[i] >> 0) & 0xff;
+                BlkInBGRf_UV[i].z = (CGU_FLOAT)R / 255.0f;
+                BlkInBGRf_UV[i].y = (CGU_FLOAT)G / 255.0f;
+                BlkInBGRf_UV[i].x = (CGU_FLOAT)B / 255.0f;
+            }
+
+            CGU_Vec3f channelWeightsBGR;
+            channelWeightsBGR.x = channelWeights.z;
+            channelWeightsBGR.y = channelWeights.y;
+            channelWeightsBGR.z = channelWeights.x;
+
+            EndPoints = cgu_CompressRGBBlockX(BlkInBGRf_UV, Rpt, dwUniqueColors, channelWeightsBGR, m_nRefinementSteps);
+        }
+    }  // colors
+
+    //===================================================================
+    // Process Cluster INPUT is constant EndPointsf OUTPUT is pcIndices
+    //===================================================================
+    if (nCmpIndices == 0)
+    {
+        R                  = (CGU_UINT32)(EndPoints.Color0.z);
+        G                  = (CGU_UINT32)(EndPoints.Color0.y);
+        B                  = (CGU_UINT32)(EndPoints.Color0.x);
+        CGU_INT32 cluster0 = cmp_constructColor(R, G, B);
+
+        R                  = (CGU_UINT32)(EndPoints.Color1.z);
+        G                  = (CGU_UINT32)(EndPoints.Color1.y);
+        B                  = (CGU_UINT32)(EndPoints.Color1.x);
+        CGU_INT32 cluster1 = cmp_constructColor(R, G, B);
+
+        CGU_Vec3f InpRmp[NUM_ENDPOINTS];
+        if ((cluster0 <= cluster1)  // valid for 4 channels
+                                    // || (cluster0 > cluster1)    // valid for 3 channels
+        )
+        {
+            // inverse endpoints
+            InpRmp[0] = EndPoints.Color1;
+            InpRmp[1] = EndPoints.Color0;
+        }
+        else
+        {
+            InpRmp[0] = EndPoints.Color0;
+            InpRmp[1] = EndPoints.Color1;
+        }
+
+        CGU_Vec3f srcblockBGR[BLOCK_SIZE_4X4];
+        CGU_FLOAT srcblockA[BLOCK_SIZE_4X4];
+
+        // Swizzle the source RGB to BGR for processing
+        for (i = 0; i < BLOCK_SIZE_4X4; i++)
+        {
+            srcblockBGR[i].z = rgbBlock_normal[i].x * 255.0f;
+            srcblockBGR[i].y = rgbBlock_normal[i].y * 255.0f;
+            srcblockBGR[i].x = rgbBlock_normal[i].z * 255.0f;
+            srcblockA[i]     = 255.0f;
+            if (dwAlphaThreshold > 0)
+            {
+                CGU_UINT32 alpha = (CGU_UINT32)src_imageNorm[i].w * 255.0f;
+                if (alpha >= dwAlphaThreshold)
+                    srcblockA[i] = alpha;
+            }
+        }
+
+        // input ramp is on the coarse grid
+        // make ramp endpoints the way they are going to be decompressed
+        CGU_Vec3f InpRmpL[NUM_ENDPOINTS];
+        CGU_Vec3f Fctrs = {32.0F, 64.0F, 32.0F};  // 1 << RG,1 << GG,1 << BG
+
+        {
+            //   ConstantRamp = MkWkRmpPts(InpRmpL, InpRmp);
+            InpRmpL[0] = InpRmp[0] + cmp_floorVec3f(InpRmp[0] / Fctrs);
+            InpRmpL[0] = cmp_clampVec3f(InpRmpL[0], 0.0f, 255.0f);
+            InpRmpL[1] = InpRmp[1] + cmp_floorVec3f(InpRmp[1] / Fctrs);
+            InpRmpL[1] = cmp_clampVec3f(InpRmpL[1], 0.0f, 255.0f);
+        }  // MkWkRmpPts
+
+        // build ramp
+        CGU_Vec3f LerpRmp[4];
+        CGU_Vec3f offset = {1.0f, 1.0f, 1.0f};
+        {
+            //BldRmp(Rmp, InpRmpL, dwNumChannels);
+            // linear interpolate end points to get the ramp
+            LerpRmp[0] = InpRmpL[0];
+            LerpRmp[3] = InpRmpL[1];
+            LerpRmp[1] = cmp_floorVec3f((InpRmpL[0] * 2.0f + LerpRmp[3] + offset) / 3.0f);
+            LerpRmp[2] = cmp_floorVec3f((InpRmpL[0] + LerpRmp[3] * 2.0f + offset) / 3.0f);
+        }  // BldRmp
+
+        //=========================================================================
+        // Clusterize, Compute error and find DXTC indexes for the current cluster
+        //=========================================================================
+        {
+            // Clusterize
+            CGU_UINT32 alpha;
+
+            // For each colour in the original block assign it
+            // to the closest cluster and compute the cumulative error
+            for (i = 0; i < BLOCK_SIZE_4X4; i++)
+            {
+                alpha = (CGU_UINT32)srcblockA[i];
+                if ((dwAlphaThreshold > 0) && alpha == 0)
+                {                                      //*((CGU_DWORD *)&_Blk[i][AC]) == 0)
+                    pcIndices |= cmp_set2Bit32(4, i);  // dwNumChannels 3 or 4 (default is 4)
+                }
+                else
+                {
+                    CGU_FLOAT shortest      = 99999999999.f;
+                    CGU_UINT8 shortestIndex = 0;
+
+                    CGU_Vec3f channelWeightsBGR;
+                    channelWeightsBGR.x = channelWeights.z;
+                    channelWeightsBGR.y = channelWeights.y;
+                    channelWeightsBGR.z = channelWeights.x;
+
+                    for (CGU_UINT8 rampindex = 0; rampindex < 4; rampindex++)
+                    {
+                        // r is either 1 or 4
+                        // calculate the distance for each component
+                        CGU_FLOAT distance =
+                            dot(((srcblockBGR[i] - LerpRmp[rampindex]) * channelWeightsBGR), ((srcblockBGR[i] - LerpRmp[rampindex]) * channelWeightsBGR));
+                        if (distance < shortest)
+                        {
+                            shortest      = distance;
+                            shortestIndex = rampindex;
+                        }
+                    }
+
+                    // The total is a sum of (error += shortest)
+                    // We have the index of the best cluster, so assign this in the block
+                    // Reorder indices to match correct DXTC ordering
+                    if (shortestIndex == 3)  // dwNumChannels - 1
+                        shortestIndex = 1;
+                    else if (shortestIndex)
+                        shortestIndex++;
+                    pcIndices |= cmp_set2Bit32(shortestIndex, i);
+                }
+            }  // BLOCK_SIZE_4X4
+        }      // Clusterize
+    }          // Process Cluster
+
+    //==============================================================
+    // Generate Compressed Result from nEndpoints & pcIndices
+    //==============================================================
+    c0 = cmp_constructColorBGR(EndPoints.Color0);
+    c1 = cmp_constructColorBGR(EndPoints.Color1);
+
+    // Get Processed indices if not set
+    if (nCmpIndices == 0)
+        nCmpIndices = pcIndices;
+
+    CGU_Vec2ui cmpBlock;
+    if (c0 <= c1)
+    {
+        cmpBlock.x = c1 | (c0 << 16);
+    }
+    else
+        cmpBlock.x = c0 | (c1 << 16);
+
+    cmpBlock.y = nCmpIndices;
+
+    return cmpBlock;
+}
+
+CMP_STATIC void cgu_ProcessColors(CMP_INOUT CGU_Vec3f CMP_PTRINOUT  colorMin,  // low colour; overwritten only on the setopt == 0 path
+                                  CMP_INOUT CGU_Vec3f CMP_PTRINOUT  colorMax,  // high colour; overwritten only on the setopt == 0 path
+                                  CMP_INOUT CGU_UINT32 CMP_PTRINOUT c0,        // receives colorMin packed as (x << 11) | (y << 5) | z
+                                  CMP_INOUT CGU_UINT32 CMP_PTRINOUT c1,        // receives colorMax packed the same way
+                                  CMP_IN CGU_INT                    setopt,    // 0 = floor/ceil quantisation, any other value = round
+                                  CMP_IN CGU_BOOL                   isSRGB)    // true = apply cmp_linearToSrgb, false = clamp to 0..1
+{
+    // CGU_UINT32 srbMap[32] = {0,5,8,11,12,13,14,15,16,17,18,19,20,21,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31};
+    // CGU_UINT32 sgMap[64]  = {0,10,14,16,19,20,22,24,25,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,42,43,43,44,45,45,
+    //                          46,47,47,48,48,49,50,50,51,52,52,53,53,54,54,55,55,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63};
+    CGU_INT32 x, y, z;                        // integer channel values used to build c0 and c1
+    CGU_Vec3f scale = {31.0f, 63.0f, 31.0f};  // per-channel multipliers: 31 for x and z, 63 for y
+    CGU_Vec3f MinColorScaled;
+    CGU_Vec3f MaxColorScaled;
+
+    // Clamp or Transform is needed, the transforms have built in clamps
+    if (isSRGB)
+    {
+        MinColorScaled = cmp_linearToSrgb(CMP_PTRINOUT colorMin);
+        MaxColorScaled = cmp_linearToSrgb(CMP_PTRINOUT colorMax);
+    }
+    else
+    {
+        MinColorScaled = cmp_clampVec3f(CMP_PTRINOUT colorMin, 0.0f, 1.0f);
+        MaxColorScaled = cmp_clampVec3f(CMP_PTRINOUT colorMax, 0.0f, 1.0f);
+    }
+
+    switch (setopt)
+    {
+    case 0:  // Use Min Max processing
+        MinColorScaled        = cmp_floorVec3f(MinColorScaled * scale);  // floor of the scaled low colour
+        MaxColorScaled        = cmp_ceilVec3f(MaxColorScaled * scale);   // ceil of the scaled high colour
+        CMP_PTRINOUT colorMin = MinColorScaled / scale;                  // hand the quantised colours back to the caller
+        CMP_PTRINOUT colorMax = MaxColorScaled / scale;
+        break;
+    default:  // Use round processing; colorMin and colorMax are left untouched on this path
+        MinColorScaled = round(MinColorScaled * scale);
+        MaxColorScaled = round(MaxColorScaled * scale);
+        break;
+    }
+
+    x = (CGU_UINT32)(MinColorScaled.x);  // truncate the already quantised channels to integers
+    y = (CGU_UINT32)(MinColorScaled.y);
+    z = (CGU_UINT32)(MinColorScaled.z);
+
+    //if (isSRGB) {
+    //    // scale RB
+    //    x = srbMap[x]; // &0x1F];
+    //    y = sgMap [y]; // &0x3F];
+    //    z = srbMap[z]; // &0x1F];
+    //    // scale G
+    //}
+    CMP_PTRINOUT c0 = (x << 11) | (y << 5) | z;  // x in bits 11 and up, y from bit 5, z from bit 0
+
+    x               = (CGU_UINT32)(MaxColorScaled.x);
+    y               = (CGU_UINT32)(MaxColorScaled.y);
+    z               = (CGU_UINT32)(MaxColorScaled.z);
+    CMP_PTRINOUT c1 = (x << 11) | (y << 5) | z;  // same packing for the high colour
+}
+
+CMP_STATIC CGU_FLOAT cgu_getIndicesRGB(CMP_INOUT CGU_UINT32 CMP_PTRINOUT cmpindex,  // receives the 16 packed 2-bit indices
+                                       CMP_IN const CGU_Vec3f            block[16],  // the 16 source colours to classify
+                                       CMP_IN CGU_Vec3f                  minColor,   // ramp end that is assigned spec index 1
+                                       CMP_IN CGU_Vec3f                  maxColor,   // ramp end that is assigned spec index 0
+                                       CMP_IN CGU_BOOL                   getErr)     // true = also accumulate and return the error
+{
+    CGU_UINT32 PackedIndices = 0;
+    CGU_FLOAT  err           = 0.0f;  // stays 0.0f and is returned as such when getErr is false
+    CGU_Vec3f  cn[4];                 // ramp colours by spec index; filled in (and read) only when getErr is true
+    CGU_FLOAT  minDistance;
+
+    if (getErr)
+    {
+        // remap to BC1 spec for decoding offsets,
+        // where cn[0] > cn[1] Max Color = index 0, 2/3 offset =index 2, 1/3 offset = index 3, Min Color = index 1
+        cn[0] = maxColor;
+        cn[1] = minColor;
+        cn[2] = cn[0] * 2.0f / 3.0f + cn[1] * 1.0f / 3.0f;
+        cn[3] = cn[0] * 1.0f / 3.0f + cn[1] * 2.0f / 3.0f;
+    }
+
+    // NOTE(review): Scale divides by zero when minColor == maxColor; the caller visible in this file only calls with c0 < c1 - confirm others
+    CGU_FLOAT  Scale       = 3.f / cmp_dotVec3f(minColor - maxColor, minColor - maxColor);
+    CGU_Vec3f  ScaledRange = (minColor - maxColor) * Scale;
+    CGU_FLOAT  Bias        = (cmp_dotVec3f(maxColor, maxColor) - cmp_dotVec3f(maxColor, minColor)) * Scale;
+    CGU_INT    indexMap[4] = {0, 2, 3, 1};  // mapping based on BC1 Spec for color0 > color1
+    CGU_UINT32 index;
+    CGU_FLOAT  diff;
+
+    for (CGU_UINT32 i = 0; i < 16; i++)
+    {
+        // Get offset from base scale: 0 at maxColor, 3 at minColor
+        diff  = cmp_dotVec3f(block[i], ScaledRange) + Bias;
+        index = ((CGU_UINT32)round(diff)) & 0x3;  // nearest ramp step, masked to two bits
+
+        // remap linear offset to spec offset
+        index = indexMap[index];
+
+        // use err calc for use in higher quality code
+        if (getErr)
+        {
+            minDistance = cmp_dotVec3f(block[i] - cn[index], block[i] - cn[index]);  // squared distance to the chosen ramp colour
+            err += minDistance;
+        }
+
+        // Map the 2 bit index into compress 32 bit block
+        if (index)
+            PackedIndices |= (index << (2 * i));  // texel i occupies bits 2*i and 2*i+1
+    }
+
+    if (getErr)
+        err = err * 0.0208333f;  // 0.0208333f is approximately 1 / 48
+
+    CMP_PTRINOUT cmpindex = PackedIndices;
+    return err;
+}
+
+//--------------------------------------------------------------------------------------------------------
+// Decompress is RGB (0.0f..255.0f)
+//--------------------------------------------------------------------------------------------------------
+CMP_STATIC void cgu_decompressRGBBlock(CMP_INOUT CGU_Vec3f rgbBlock[BLOCK_SIZE_4X4], const CGU_Vec2ui compressedBlock)
+{
+    //-------------------------------------------------------
+    // Purpose
+    //   Expands one compressed block into its 16 texel colours.
+    //
+    // Parameters
+    //   rgbBlock         receives one colour per texel
+    //   compressedBlock  .x = two packed 16-bit endpoints, the first
+    //                         one in the low half of the word
+    //                    .y = sixteen 2-bit selectors, texel i in
+    //                         bits 2*i and 2*i+1
+    //-------------------------------------------------------
+
+    // Pull the two packed endpoints out of the first word
+    CGU_UINT32 packedLo = compressedBlock.x & 0xffff;
+    CGU_UINT32 packedHi = compressedBlock.x >> 16;
+
+    //-------------------------------------------------------
+    // Colours addressed by selectors 0..3.
+    // Selectors 0 and 1 always give the two decoded endpoints;
+    // selectors 2 and 3 depend on how the packed endpoints compare.
+    //-------------------------------------------------------
+    CGU_Vec3f endpoint0 = cmp_565ToLinear(packedLo);
+    CGU_Vec3f endpoint1 = cmp_565ToLinear(packedHi);
+    CGU_Vec3f blendA;
+    CGU_Vec3f blendB;
+
+    if (packedLo > packedHi)
+    {
+        // Four colour decode: both blends lie between the endpoints
+        blendA = (endpoint0 * 2.0f + endpoint1) / 3.0f;
+        blendB = (endpoint1 * 2.0f + endpoint0) / 3.0f;
+    }
+    else
+    {
+        // Transparent decode: one midpoint blend, selector 3 yields 0.0f
+        blendA = (endpoint0 + endpoint1) / 2.0f;
+        blendB = 0.0f;
+    }
+
+    //-------------------------------------------------------
+    // Resolve every texel through its 2-bit selector
+    // (texel 0 is in the lowest two bits of compressedBlock.y)
+    //-------------------------------------------------------
+    for (CGU_UINT32 texel = 0; texel < 16; texel++)
+    {
+        CGU_UINT32 selector = (compressedBlock.y >> (2 * texel)) & 3;
+        switch (selector)
+        {
+        case 0:
+            rgbBlock[texel] = endpoint0;
+            break;
+        case 1:
+            rgbBlock[texel] = endpoint1;
+            break;
+        case 2:
+            rgbBlock[texel] = blendA;
+            break;
+        case 3:
+            rgbBlock[texel] = blendB;
+            break;
+        }
+    }
+}
+
+// The source is 0..255
+CMP_STATIC float cgu_RGBABlockErrorLinear(const CGU_Vec4uc src_rgbBlock[BLOCK_SIZE_4X4], const CGU_Vec2ui compressedBlock)
+{
+    // Mean squared error between the x, y and z channels of the 16 source
+    // texels and the texels decoded from compressedBlock.
+    // Both sides are compared as stored; nothing is rescaled here.
+    CGU_Vec3f decoded[BLOCK_SIZE_4X4];
+    cgu_decompressRGBBlock(decoded, compressedBlock);
+
+    //------------------------------------------------------------------
+    // Per channel running sum of squared differences
+    // Note : pow is used as Float type for the code to be usable on CPU
+    //------------------------------------------------------------------
+    CGU_Vec3f sqErr;
+    sqErr = 0.0f;
+
+    for (int texel = 0; texel < 16; texel++)
+    {
+        // Source texel channels, copied into floats
+        float srcX = src_rgbBlock[texel].x;
+        float srcY = src_rgbBlock[texel].y;
+        float srcZ = src_rgbBlock[texel].z;
+
+        // Decoded texel channels
+        float dstX = decoded[texel].x;
+        float dstY = decoded[texel].y;
+        float dstZ = decoded[texel].z;
+
+        sqErr.x += pow(srcX - dstX, 2.0f);
+        sqErr.y += pow(srcY - dstY, 2.0f);
+        sqErr.z += pow(srcZ - dstZ, 2.0f);
+    }
+
+    // 48 = 16 texels x 3 channels
+    return (sqErr.x + sqErr.y + sqErr.z) / 48.0f;
+}
+
+// The source is 0..1 and is scaled up to 0..255 here; the block decoded by cgu_decompressRGBBlock is compared as 0..255
+CMP_STATIC float cgu_RGBBlockError(const CGU_Vec3f src_rgbBlock[BLOCK_SIZE_4X4], const CGU_Vec2ui compressedBlock, CGU_BOOL isSRGB)
+{
+    // Mean squared error between the 16 source texels (scaled by 255 and
+    // rounded) and the texels decoded from compressedBlock.
+    CGU_Vec3f decoded[BLOCK_SIZE_4X4];
+    cgu_decompressRGBBlock(decoded, compressedBlock);
+
+    //------------------------------------------------------------------
+    // Per channel running sum of squared differences
+    // Note : pow is used as Float type for the code to be usable on CPU
+    //------------------------------------------------------------------
+    CGU_Vec3f sqErr;
+    sqErr = 0.0f;
+
+    for (int texel = 0; texel < 16; texel++)
+    {
+        float srcX, srcY, srcZ;
+        if (!isSRGB)
+        {
+            // Linear path: scale and round only
+            srcX = round(src_rgbBlock[texel].x * 255.0f);
+            srcY = round(src_rgbBlock[texel].y * 255.0f);
+            srcZ = round(src_rgbBlock[texel].z * 255.0f);
+        }
+        else
+        {
+            // sRGB path: cmp_linearToSrgbf first, then scale and round
+            srcX = round(cmp_linearToSrgbf(src_rgbBlock[texel].x) * 255.0f);
+            srcY = round(cmp_linearToSrgbf(src_rgbBlock[texel].y) * 255.0f);
+            srcZ = round(cmp_linearToSrgbf(src_rgbBlock[texel].z) * 255.0f);
+        }
+
+        float dstX = decoded[texel].x;
+        float dstY = decoded[texel].y;
+        float dstZ = decoded[texel].z;
+
+        sqErr.x += pow(srcX - dstX, 2.0f);
+        sqErr.y += pow(srcY - dstY, 2.0f);
+        sqErr.z += pow(srcZ - dstZ, 2.0f);
+    }
+
+    // 48 = 16 texels x 3 channels
+    return (sqErr.x + sqErr.y + sqErr.z) / 48.0f;
+}
+
+CMP_STATIC CGU_Vec2ui cgu_CompressRGBBlock_MinMax(CMP_IN const CGU_Vec3f           src_imageRGB[16],  // the 16 source colours
+                                                  CMP_IN CGU_FLOAT                 fquality,          // <= CMP_QUALITY0 skips filling srcRGB / average_rgb
+                                                  CMP_IN CGU_BOOL                  isSRGB,            // selects cmp_linearToSrgb instead of cmp_saturate
+                                                  CMP_INOUT CGU_Vec3f              srcRGB[16],   // Out: source colors with z replaced by the mean of y and z
+                                                  CMP_INOUT CGU_Vec3f CMP_REFINOUT average_rgb,  // Out: mean of srcRGB (left at 0 in fast mode or on early return)
+                                                  CMP_INOUT CGU_FLOAT CMP_REFINOUT errout)       // Out: block error of the returned encoding
+{
+    CGU_Vec2ui Q1CompData = {0, 0};
+    CGU_Vec3f  rgb        = {0, 0, 0};
+
+    // -------------------------------------------------------------------------------------
+    // (1) Track the per-channel min and max of the block; unless in fast mode also sum the modified colours
+    // -------------------------------------------------------------------------------------
+    CGU_FLOAT  errLQ             = 0.0f;                        // NOTE(review): assigned below but never read
+    CGU_BOOL   fastProcess       = (fquality <= CMP_QUALITY0);  // Min Max only
+    CGU_Vec3f  srcMin            = 1.0f;                        // Min source color
+    CGU_Vec3f  srcMax            = 0.0f;                        // Max source color
+    CGU_Vec2ui Q1compressedBlock = {0, 0};
+    CGU_UINT32 c0                = 0;
+    CGU_UINT32 c1                = 0;
+
+    average_rgb = 0.0f;
+    // Get average and modified src
+    // sum the modified colours and save them in srcRGB for later processing
+    // Note: z (blue) is average of blue+green channels
+    for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++)
+    {
+        srcMin = cmp_minVec3f(srcMin, src_imageRGB[i]);
+        srcMax = cmp_maxVec3f(srcMax, src_imageRGB[i]);
+        if (!fastProcess)
+        {
+            rgb         = isSRGB ? cmp_linearToSrgb(src_imageRGB[i]) : cmp_saturate(src_imageRGB[i]);
+            rgb.z       = (rgb.y + rgb.z) * 0.5F;  // z becomes the mean of the y and z channels
+            srcRGB[i]   = rgb;
+            average_rgb = average_rgb + rgb;
+        }
+    }
+
+    // Process two colors for saving in 565 format as C0 and C1 (setopt is 1 for sRGB input, 0 otherwise)
+    cgu_ProcessColors(CMP_REFINOUT srcMin, CMP_REFINOUT srcMax, CMP_REFINOUT c0, CMP_REFINOUT c1, isSRGB ? 1 : 0, isSRGB);
+
+    // Save simple min-max encoding
+    if (c0 < c1)
+    {
+        Q1CompData.x     = (c0 << 16) | c1;  // c1 (from srcMax) goes in the low half, c0 (from srcMin) in the high half
+        CGU_UINT32 index = 0;
+        errLQ            = cgu_getIndicesRGB(CMP_REFINOUT index, src_imageRGB, srcMin, srcMax, false);  // getErr is false, so only index matters
+        Q1CompData.y     = index;
+        errout           = cgu_RGBBlockError(src_imageRGB, Q1CompData, isSRGB);
+    }
+    else
+    {
+        // Most simple case all colors are equal or 0.0f
+        Q1compressedBlock.x = (c1 << 16) | c0;
+        Q1compressedBlock.y = 0;
+        errout              = 0.0f;
+        return Q1compressedBlock;  // early return: average_rgb is not scaled on this path
+    }
+    // 0.0625F is (1/BLOCK_SIZE_4X4)
+    average_rgb = average_rgb * 0.0625F;
+
+    return Q1CompData;
+}
+
+CMP_STATIC CGU_Vec2ui cgu_CompressRGBBlock_Fast(CMP_IN const CGU_Vec3f           src_imageRGB[16],  // source colours, used only for the final error measure
+                                                CMP_IN CGU_FLOAT                 fquality,          // unused (see CMP_UNUSED below)
+                                                CMP_IN CGU_BOOL                  isSRGB,            // forwarded to cgu_RGBBlockError
+                                                CMP_IN CGU_Vec3f                 srcRGB[16],        // colours that are projected onto the axis
+                                                CMP_IN CGU_Vec3f CMP_REFINOUT    average_rgb,       // axis origin; NOTE: updated in place at step (7)
+                                                CMP_INOUT CGU_FLOAT CMP_REFINOUT errout)            // receives the error of the returned block
+{
+    CMP_UNUSED(fquality);
+
+    CGU_Vec3f  axisVectorRGB = {0.0f, 0.0f, 0.0f};  // The axis vector for index projection
+    CGU_FLOAT  pos_on_axis[16];                     // The distance each unique falls along the compression axis
+    CGU_FLOAT  axisleft   = 0;                      // Smallest position of srcRGB along the compression axis
+    CGU_FLOAT  axisright  = 0;                      // Largest position of srcRGB along the compression axis
+    CGU_FLOAT  axiscentre = 0;                      // Average of axisleft and axisright
+    CGU_INT32  swap       = 0;                      // Indicator if the RGB values need swapping to generate an opaque result
+    CGU_Vec3f  srcBlock[16];                        // NOTE(review): declared but never referenced in this function
+    CGU_UINT32 c0              = 0;
+    CGU_UINT32 c1              = 0;
+    CGU_Vec2ui compressedBlock = {0, 0};
+    CGU_FLOAT  Q1CompErr;
+    CGU_Vec2ui Q1CompData = {0, 0};  // NOTE(review): never assigned after this initialisation
+
+    CGU_Vec3f rgb = {0, 0, 0};
+
+    // -------------------------------------------------------------------------------------
+    // (4) For each component, reflect points about the average so all lie on the same side
+    // of the average, and compute the new average - this gives a second point that defines the axis
+    // To compute the sign of the axis sum the positive differences of G for each of R and B (the
+    // G axis is always positive in this implementation)
+    // -------------------------------------------------------------------------------------
+    // An interesting situation occurs if the G axis contains no information, in which case the RB
+    // axis is also compared. I am not entirely sure if this is the correct implementation - should
+    // the priority axis be determined by magnitude?
+    {
+        CGU_FLOAT rg_pos = 0.0f;
+        CGU_FLOAT bg_pos = 0.0f;
+        CGU_FLOAT rb_pos = 0.0f;
+
+        for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++)
+        {
+            rgb           = srcRGB[i] - average_rgb;
+            axisVectorRGB = axisVectorRGB + cmp_fabsVec3f(rgb);
+            if (rgb.x > 0)
+            {
+                rg_pos += rgb.y;
+                rb_pos += rgb.z;
+            }
+            if (rgb.z > 0)
+                bg_pos += rgb.y;
+        }
+
+        // Average over BLOCK_SIZE_4X4
+        axisVectorRGB = axisVectorRGB * 0.0625F;
+
+        // New average position
+        if (rg_pos < 0)
+            axisVectorRGB.x = -axisVectorRGB.x;
+        if (bg_pos < 0)
+            axisVectorRGB.z = -axisVectorRGB.z;
+        if ((rg_pos == bg_pos) && (rg_pos == 0))
+        {
+            if (rb_pos < 0)
+                axisVectorRGB.z = -axisVectorRGB.z;
+        }
+    }
+
+    // -------------------------------------------------------------------------------------
+    // (5) Axis projection and remapping
+    // -------------------------------------------------------------------------------------
+    {
+        CGU_FLOAT v2_recip;
+        // Normalize the axis for simplicity of future calculation
+        v2_recip = cmp_dotVec3f(axisVectorRGB, axisVectorRGB);
+        if (v2_recip > 0)
+            v2_recip = 1.0f / (CGU_FLOAT)cmp_sqrt(v2_recip);
+        else
+            v2_recip = 1.0f;  // zero-length axis: leave it unscaled
+        axisVectorRGB = axisVectorRGB * v2_recip;
+    }
+
+    // -------------------------------------------------------------------------------------
+    // (6) Map the axis
+    // -------------------------------------------------------------------------------------
+    // the line joining (and extended on either side of) average and axis
+    // defines the axis onto which the points will be projected
+    // Project all the points onto the axis, calculate the distance along
+    // the axis from the centre of the axis (average)
+    // From Foley & Van Dam: Closest point of approach of a line (P + v) to a point (R) is
+    //     P + ((R-P).v) / (v.v))v
+    // The distance along v is therefore (R-P).v / (v.v) where (v.v) is 1 if v is a unit vector.
+    //
+    // Calculate the extremities at the same time - these need to be reasonably accurately
+    // represented in all cases
+    {
+        axisleft  = CMP_FLOAT_MAX;
+        axisright = -CMP_FLOAT_MAX;
+        for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++)
+        {
+            // Compute the distance along the axis of the point of closest approach
+            CGU_Vec3f temp = (srcRGB[i] - average_rgb);
+            pos_on_axis[i] = cmp_dotVec3f(temp, axisVectorRGB);
+
+            // Work out the extremities
+            if (pos_on_axis[i] < axisleft)
+                axisleft = pos_on_axis[i];
+            if (pos_on_axis[i] > axisright)
+                axisright = pos_on_axis[i];
+        }
+    }
+
+    // ---------------------------------------------------------------------------------------------
+    // (7) Now we have a good axis and the basic information about how the points are mapped to it
+    // Our initial guess is to represent the endpoints accurately, by moving the average
+    // to the centre and recalculating the point positions along the line
+    // ---------------------------------------------------------------------------------------------
+    {
+        axiscentre  = (axisleft + axisright) * 0.5F;
+        average_rgb = average_rgb + (axisVectorRGB * axiscentre);
+        for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++)
+            pos_on_axis[i] -= axiscentre;
+        axisright -= axiscentre;
+        axisleft -= axiscentre;
+    }
+
+    // -------------------------------------------------------------------------------------
+    // (8) Calculate the high and low output colour values
+    // Involved in this is a rounding procedure which is undoubtedly slightly twitchy. A
+    // straight rounded average is not correct, as the decompressor 'unrounds' by replicating
+    // the top bits to the bottom.
+    // In order to take account of this process, we don't just apply a straight rounding correction,
+    // but base our rounding on the input value (a straight rounding is actually pretty good in terms of
+    // error measure, but creates a visual colour and/or brightness shift relative to the original image)
+    // The method used here is to apply a centre-biased rounding dependent on the input value, which was
+    // (mostly by experiment) found to give minimum MSE while preserving the visual characteristics of
+    // the image.
+    // rgb = (average_rgb + (left|right)*axisVectorRGB);
+    // -------------------------------------------------------------------------------------
+    {
+        CGU_Vec3f MinColor, MaxColor;
+
+        MinColor   = average_rgb + (axisVectorRGB * axisleft);
+        MaxColor   = average_rgb + (axisVectorRGB * axisright);
+        MinColor.z = (MinColor.z * 2) - MinColor.y;  // z = 2 * z - y
+        MaxColor.z = (MaxColor.z * 2) - MaxColor.y;
+
+        cgu_ProcessColors(CMP_REFINOUT MinColor, CMP_REFINOUT MaxColor, CMP_REFINOUT c0, CMP_REFINOUT c1, 1, false);
+
+        // Force to be a 4-colour opaque block - in which case, c0 is greater than c1
+        swap = 0;
+        if (c0 < c1)
+        {
+            CGU_UINT32 t;
+            t    = c0;
+            c0   = c1;
+            c1   = t;
+            swap = 1;
+        }
+        else if (c0 == c1)
+        {
+            // This block will always be encoded in 3-colour mode
+            // Need to ensure that only one of the two points gets used,
+            // avoiding accidentally setting some transparent pixels into the block
+            for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++)
+                pos_on_axis[i] = axisleft;
+        }
+
+        compressedBlock.x = c0 | (c1 << 16);
+
+        // -------------------------------------------------------------------------------------
+        // (9) Final clustering, creating the 2-bit values that define the output
+        // -------------------------------------------------------------------------------------
+
+        CGU_UINT32 index;
+        CGU_FLOAT  division;
+        {
+            compressedBlock.y = 0;
+            division          = axisright * 2.0f / 3.0f;
+            axiscentre        = (axisleft + axisright) / 2;  // Actually, this code only works if centre is 0 or approximately so
+
+            CGU_FLOAT CompMinErr;
+
+            // This feature is work in progress
+            // remap to BC1 spec for decoding offsets,
+            // where cn[0] > cn[1] Max Color = index 0, 2/3 offset =index 2, 1/3 offset = index 3, Min Color = index 1
+            // CGU_Vec3f   cn[4];
+            // cn[0] = MaxColor;
+            // cn[1] = MinColor;
+            // cn[2] = cn[0]*2.0f/3.0f + cn[1]*1.0f/3.0f;
+            // cn[3] = cn[0]*1.0f/3.0f + cn[1]*2.0f/3.0f;
+
+            for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++)
+            {
+                // Endpoints (indicated by block > average) are 0 and 1, while
+                // interpolants are 2 and 3
+                if (cmp_fabs(pos_on_axis[i]) >= division)
+                    index = 0;
+                else
+                    index = 2;
+                // Positive is in the latter half of the block
+                if (pos_on_axis[i] >= axiscentre)
+                    index += 1;
+
+                index = index ^ swap;  // flips the low bit of the index when c0 and c1 were exchanged
+                // Set the output, taking swapping into account
+                compressedBlock.y |= (index << (2 * i));
+
+                // use err calc for use in higher quality code
+                //CompMinErr += cmp_dotVec3f(srcRGBRef[i] - cn[index],srcRGBRef[i] - cn[index]);
+            }
+
+            //CompMinErr = CompMinErr * 0.0208333f;
+
+            CompMinErr = cgu_RGBBlockError(src_imageRGB, compressedBlock, isSRGB);
+            Q1CompErr  = cgu_RGBBlockError(src_imageRGB, Q1CompData, isSRGB);  // NOTE(review): Q1CompData is still {0, 0} here - confirm intended
+
+            if (CompMinErr > Q1CompErr)
+            {
+                compressedBlock = Q1CompData;
+                errout          = Q1CompErr;
+            }
+            else
+                errout = CompMinErr;
+        }
+    }
+    // done
+
+    return compressedBlock;
+}
+
+CMP_STATIC CGU_UINT8 g_Match5Bit[256][2] = {  // 256 pairs of values in 0..31; NOTE(review): presumably 5-bit endpoint pairs per 8-bit value - confirm
+    {0, 0},   {0, 0},   {1, 0},   {1, 0},   {0, 1},   {0, 1},   {0, 1},   {1, 1},   {1, 1},   {1, 1},   {0, 2},   {4, 0},   {1, 2},   {1, 2},   {1, 2},
+    {2, 2},   {2, 2},   {2, 2},   {1, 3},   {5, 1},   {2, 3},   {2, 3},   {0, 4},   {3, 3},   {3, 3},   {3, 3},   {2, 4},   {2, 4},   {2, 4},   {5, 3},
+    {1, 5},   {1, 5},   {2, 5},   {4, 4},   {4, 4},   {3, 5},   {3, 5},   {2, 6},   {2, 6},   {2, 6},   {3, 6},   {5, 5},   {5, 5},   {4, 6},   {8, 4},
+    {3, 7},   {3, 7},   {3, 7},   {6, 6},   {6, 6},   {6, 6},   {5, 7},   {9, 5},   {6, 7},   {6, 7},   {4, 8},   {7, 7},   {7, 7},   {7, 7},   {6, 8},
+    {6, 8},   {6, 8},   {9, 7},   {5, 9},   {5, 9},   {6, 9},   {8, 8},   {8, 8},   {7, 9},   {7, 9},   {6, 10},  {6, 10},  {6, 10},  {7, 10},  {9, 9},
+    {9, 9},   {8, 10},  {12, 8},  {7, 11},  {7, 11},  {7, 11},  {10, 10}, {10, 10}, {10, 10}, {9, 11},  {13, 9},  {10, 11}, {10, 11}, {8, 12},  {11, 11},
+    {11, 11}, {11, 11}, {10, 12}, {10, 12}, {10, 12}, {13, 11}, {9, 13},  {9, 13},  {10, 13}, {12, 12}, {12, 12}, {11, 13}, {11, 13}, {10, 14}, {10, 14},
+    {10, 14}, {11, 14}, {13, 13}, {13, 13}, {12, 14}, {16, 12}, {11, 15}, {11, 15}, {11, 15}, {14, 14}, {14, 14}, {14, 14}, {13, 15}, {17, 13}, {14, 15},
+    {14, 15}, {12, 16}, {15, 15}, {15, 15}, {15, 15}, {14, 16}, {14, 16}, {14, 16}, {17, 15}, {13, 17}, {13, 17}, {14, 17}, {16, 16}, {16, 16}, {15, 17},
+    {15, 17}, {14, 18}, {14, 18}, {14, 18}, {15, 18}, {17, 17}, {17, 17}, {16, 18}, {20, 16}, {15, 19}, {15, 19}, {15, 19}, {18, 18}, {18, 18}, {18, 18},
+    {17, 19}, {21, 17}, {18, 19}, {18, 19}, {16, 20}, {19, 19}, {19, 19}, {19, 19}, {18, 20}, {18, 20}, {18, 20}, {21, 19}, {17, 21}, {17, 21}, {18, 21},
+    {20, 20}, {20, 20}, {19, 21}, {19, 21}, {18, 22}, {18, 22}, {18, 22}, {19, 22}, {21, 21}, {21, 21}, {20, 22}, {24, 20}, {19, 23}, {19, 23}, {19, 23},
+    {22, 22}, {22, 22}, {22, 22}, {21, 23}, {25, 21}, {22, 23}, {22, 23}, {20, 24}, {23, 23}, {23, 23}, {23, 23}, {22, 24}, {22, 24}, {22, 24}, {25, 23},
+    {21, 25}, {21, 25}, {22, 25}, {24, 24}, {24, 24}, {23, 25}, {23, 25}, {22, 26}, {22, 26}, {22, 26}, {23, 26}, {25, 25}, {25, 25}, {24, 26}, {28, 24},
+    {23, 27}, {23, 27}, {23, 27}, {26, 26}, {26, 26}, {26, 26}, {25, 27}, {29, 25}, {26, 27}, {26, 27}, {24, 28}, {27, 27}, {27, 27}, {27, 27}, {26, 28},
+    {26, 28}, {26, 28}, {29, 27}, {25, 29}, {25, 29}, {26, 29}, {28, 28}, {28, 28}, {27, 29}, {27, 29}, {26, 30}, {26, 30}, {26, 30}, {27, 30}, {29, 29},
+    {29, 29}, {28, 30}, {28, 30}, {27, 31}, {27, 31}, {27, 31}, {30, 30}, {30, 30}, {30, 30}, {29, 31}, {29, 31}, {30, 31}, {30, 31}, {30, 31}, {31, 31},
+    {31, 31}};
+
+CMP_STATIC CGU_UINT8 g_Match6Bit[256][2] = {  // row = 8-bit channel value; [0]/[1] = the two 6-bit endpoint codes cgu_solidColorBlock packs for green (presumably the best-matching pair - confirm against the table's generator)
+    {0, 0},   {1, 0},   {0, 1},   {1, 1},   {1, 1},   {0, 2},   {1, 2},   {2, 2},   {2, 2},   {1, 3},   {0, 4},   {3, 3},   {3, 3},   {0, 5},   {1, 5},
+    {4, 4},   {4, 4},   {1, 6},   {0, 7},   {5, 5},   {5, 5},   {0, 8},   {1, 8},   {6, 6},   {6, 6},   {1, 9},   {2, 9},   {7, 7},   {7, 7},   {2, 10},
+    {3, 10},  {8, 8},   {8, 8},   {3, 11},  {4, 11},  {9, 9},   {9, 9},   {4, 12},  {5, 12},  {10, 10}, {10, 10}, {5, 13},  {6, 13},  {16, 8},  {11, 11},
+    {6, 14},  {7, 14},  {17, 9},  {12, 12}, {7, 15},  {8, 15},  {16, 11}, {13, 13}, {10, 15}, {8, 16},  {9, 16},  {14, 14}, {13, 15}, {9, 17},  {10, 17},
+    {15, 15}, {16, 15}, {10, 18}, {11, 18}, {12, 18}, {16, 16}, {11, 19}, {12, 19}, {13, 19}, {17, 17}, {12, 20}, {13, 20}, {14, 20}, {18, 18}, {13, 21},
+    {14, 21}, {15, 21}, {19, 19}, {14, 22}, {15, 22}, {20, 20}, {20, 20}, {15, 23}, {16, 23}, {21, 21}, {21, 21}, {16, 24}, {17, 24}, {22, 22}, {22, 22},
+    {17, 25}, {18, 25}, {23, 23}, {23, 23}, {18, 26}, {19, 26}, {24, 24}, {24, 24}, {19, 27}, {20, 27}, {25, 25}, {25, 25}, {20, 28}, {21, 28}, {26, 26},
+    {26, 26}, {21, 29}, {22, 29}, {32, 24}, {27, 27}, {22, 30}, {23, 30}, {33, 25}, {28, 28}, {23, 31}, {24, 31}, {32, 27}, {29, 29}, {26, 31}, {24, 32},
+    {25, 32}, {30, 30}, {29, 31}, {25, 33}, {26, 33}, {31, 31}, {32, 31}, {26, 34}, {27, 34}, {28, 34}, {32, 32}, {27, 35}, {28, 35}, {29, 35}, {33, 33},
+    {28, 36}, {29, 36}, {30, 36}, {34, 34}, {29, 37}, {30, 37}, {31, 37}, {35, 35}, {30, 38}, {31, 38}, {36, 36}, {36, 36}, {31, 39}, {32, 39}, {37, 37},
+    {37, 37}, {32, 40}, {33, 40}, {38, 38}, {38, 38}, {33, 41}, {34, 41}, {39, 39}, {39, 39}, {34, 42}, {35, 42}, {40, 40}, {40, 40}, {35, 43}, {36, 43},
+    {41, 41}, {41, 41}, {36, 44}, {37, 44}, {42, 42}, {42, 42}, {37, 45}, {38, 45}, {48, 40}, {43, 43}, {38, 46}, {39, 46}, {49, 41}, {44, 44}, {39, 47},
+    {40, 47}, {48, 43}, {45, 45}, {42, 47}, {40, 48}, {41, 48}, {46, 46}, {45, 47}, {41, 49}, {42, 49}, {47, 47}, {48, 47}, {42, 50}, {43, 50}, {44, 50},
+    {48, 48}, {43, 51}, {44, 51}, {45, 51}, {49, 49}, {44, 52}, {45, 52}, {46, 52}, {50, 50}, {45, 53}, {46, 53}, {47, 53}, {51, 51}, {46, 54}, {47, 54},
+    {52, 52}, {52, 52}, {47, 55}, {48, 55}, {53, 53}, {53, 53}, {48, 56}, {49, 56}, {54, 54}, {54, 54}, {49, 57}, {50, 57}, {55, 55}, {55, 55}, {50, 58},
+    {51, 58}, {56, 56}, {56, 56}, {51, 59}, {52, 59}, {57, 57}, {57, 57}, {52, 60}, {53, 60}, {58, 58}, {58, 58}, {53, 61}, {54, 61}, {59, 59}, {59, 59},
+    {54, 62}, {55, 62}, {60, 60}, {60, 60}, {55, 63}, {56, 63}, {61, 61}, {61, 61}, {58, 63}, {59, 63}, {62, 62}, {62, 62}, {61, 63}, {62, 63}, {63, 63},
+    {63, 63}};
+
+CMP_STATIC CGU_Vec2ui cgu_solidColorBlock(CMP_IN CGU_UINT8 Red, CMP_IN CGU_UINT8 Green, CMP_IN CGU_UINT8 Blue)
+{
+    // Encode a block in which every pixel has the same colour.
+    // Column 0 and column 1 of the match tables each supply one endpoint code per channel;
+    // an endpoint is packed as red * 2048 + green * 32 + blue.
+    CGU_UINT32 endpointA = g_Match5Bit[Red][0] * 2048U + g_Match6Bit[Green][0] * 32U + g_Match5Bit[Blue][0];
+    CGU_UINT32 endpointB = g_Match5Bit[Red][1] * 2048U + g_Match6Bit[Green][1] * 32U + g_Match5Bit[Blue][1];
+
+    CGU_Vec2ui solidBlock;
+
+    if (endpointB < endpointA)
+    {
+        // The larger endpoint always lands in the low 16 bits; with the pair reversed every
+        // 2-bit selector becomes 3 (0xAAAAAAAA ^ 0x55555555).
+        solidBlock.x = endpointA | (endpointB << 16u);
+        solidBlock.y = 0xFFFFFFFFu;
+    }
+    else
+    {
+        // Pair already ordered: every 2-bit selector is 2.
+        solidBlock.x = endpointB | (endpointA << 16u);
+        solidBlock.y = 0xAAAAAAAAu;
+    }
+    return solidBlock;
+}
+
+CMP_STATIC void cmp_get_encode_data(CMP_IN CMP_EncodeData CMP_REFINOUT edata, CMP_IN CMP_CONSTANT CGU_Vec4uc src_image[16])
+{
+    // Gather the block's colour statistics (sum, min, max, rounded mean and two flags) into edata.
+    // Everything is seeded from pixel 0; the loop then folds in pixels 1..15.
+    CMP_CONSTANT CGU_UINT32 firstR = src_image[0].r, firstG = src_image[0].g, firstB = src_image[0].b;
+
+    edata.all_colors_equal = false;
+    edata.grayscale_flag   = (firstR == firstG) && (firstR == firstB);
+    edata.any_black_pixels = (firstR | firstG | firstB) < 4;
+
+    edata.min.r   = firstR;
+    edata.max.r   = firstR;
+    edata.total.r = firstR;
+    edata.min.g   = firstG;
+    edata.max.g   = firstG;
+    edata.total.g = firstG;
+    edata.min.b   = firstB;
+    edata.max.b   = firstB;
+    edata.total.b = firstB;
+
+    for (CGU_UINT32 px = 1; px < 16; px++)
+    {
+        CMP_CONSTANT CGU_INT r = src_image[px].r, g = src_image[px].g, b = src_image[px].b;
+        edata.grayscale_flag &= ((r == g) && (r == b));
+        edata.any_black_pixels |= ((r | g | b) < 4);
+
+        edata.min.r = CMP_MIN(edata.min.r, r);
+        edata.max.r = CMP_MAX(edata.max.r, r);
+        edata.total.r += r;
+        edata.min.g = CMP_MIN(edata.min.g, g);
+        edata.max.g = CMP_MAX(edata.max.g, g);
+        edata.total.g += g;
+        edata.min.b = CMP_MIN(edata.min.b, b);
+        edata.max.b = CMP_MAX(edata.max.b, b);
+        edata.total.b += b;
+    }
+    // mean over the 16 pixels: add 8 before the shift by 4 so the result rounds instead of truncating
+    edata.avg.r = (edata.total.r + 8) >> 4;
+    edata.avg.g = (edata.total.g + 8) >> 4;
+    edata.avg.b = (edata.total.b + 8) >> 4;
+}
+
+#ifndef ASPM_GPU
+/*------------------------------------------------------------------------------------------------
+Single-channel ramp: _Rmp[0] and _Rmp[dwNumPoints - 1] are the endpoints, the points between are interpolated
+------------------------------------------------------------------------------------------------*/
+CMP_STATIC inline void cpu_BldClrRmp(CGU_FLOAT _Rmp[MAX_POINTS], CGU_FLOAT _InpRmp[NUM_ENDPOINTS], CGU_UINT32 dwNumPoints)
+{
+    CGU_UINT32 roundingBias[9] = {0, 0, 0, 0, 1, 1, 2, 2, 3};  // looked up by dwNumPoints
+    CGU_UINT32 lastPoint       = dwNumPoints - 1;
+    _Rmp[0]         = _InpRmp[0];
+    _Rmp[lastPoint] = _InpRmp[1];
+    if (dwNumPoints % 2)
+        _Rmp[dwNumPoints] = 1000000.f;  // odd point count (3-point ramp): huge value in the next slot so it is never picked as the minimum
+    // interior points: endpoint blend plus the rounding bias, divided by the span and floored
+    for (CGU_UINT32 step = 1; step < lastPoint; step++)
+        _Rmp[step] = cmp_floor((_Rmp[0] * (lastPoint - step) + _Rmp[lastPoint] * step + roundingBias[dwNumPoints]) / (CGU_FLOAT)lastPoint);
+}
+
+/*------------------------------------------------------------------------------------------------
+// Build the ramps of the three colour channels (rows 0..2): one cpu_BldClrRmp call per row
+------------------------------------------------------------------------------------------------*/
+CMP_STATIC inline void cpu_BldRmp(CGU_FLOAT _Rmp[NUM_CHANNELS][MAX_POINTS], CGU_FLOAT _InpRmp[NUM_CHANNELS][NUM_ENDPOINTS], CGU_UINT32 dwNumPoints)
+{
+    for (CGU_UINT32 channel = 0; channel < 3; ++channel)
+        cpu_BldClrRmp(_Rmp[channel], _InpRmp[channel], dwNumPoints);
+}
+
+/*------------------------------------------------------------------------------------------------
+// Produce the endpoints as they will look once decompressed, and report via _bEq whether the ramp is flat
+------------------------------------------------------------------------------------------------*/
+CMP_STATIC inline void cpu_MkWkRmpPts(CMP_INOUT CGU_UINT8 CMP_REFINOUT _bEq,
+                                      CGU_FLOAT                        _OutRmpPts[NUM_CHANNELS][NUM_ENDPOINTS],
+                                      CGU_FLOAT                        _InpRmpPts[NUM_CHANNELS][NUM_ENDPOINTS],
+                                      CGU_UINT8                        nRedBits,
+                                      CGU_UINT8                        nGreenBits,
+                                      CGU_UINT8                        nBlueBits)
+{
+    CGU_FLOAT replicateDiv[3];
+    replicateDiv[RC] = (CGU_FLOAT)(1 << nRedBits);
+    replicateDiv[GC] = (CGU_FLOAT)(1 << nGreenBits);
+    replicateDiv[BC] = (CGU_FLOAT)(1 << nBlueBits);
+
+    // the ramp is flat when both endpoints agree in each of the three channels
+    CGU_BOOL isFlat = true;
+    for (CGU_UINT32 ch = 0; ch < 3; ch++)
+        if (_InpRmpPts[ch][0] != _InpRmpPts[ch][1])
+            isFlat = false;
+    _bEq = isFlat ? 1 : 0;
+
+    // move both endpoints of every channel onto the integer grid
+    for (CGU_UINT32 ch = 0; ch < 3; ch++)
+    {
+        for (CGU_UINT32 ep = 0; ep < 2; ep++)
+        {
+            // lower-bit replication: add floor(value / 2^bits), then clamp to [0, 255]
+            CGU_FLOAT expanded = _InpRmpPts[ch][ep] + cmp_floor(_InpRmpPts[ch][ep] / replicateDiv[ch]);
+            expanded           = cmp_max(expanded, 0.f);
+            _OutRmpPts[ch][ep] = cmp_min(expanded, 255.f);
+        }
+    }
+}
+
+// Assign every pixel of the block its nearest ramp entry (stored in DXTC index order) and return the summed squared distance
+CMP_STATIC CGU_FLOAT cpu_ClstrIntnl(CGU_FLOAT _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS],  // in: one colour per pixel, read through RC/GC/BC (and AC when _bUseAlpha)
+                                    CGU_UINT8 pcIndices[BLOCK_SIZE_4X4],  // out: index chosen for each pixel
+                                    CGU_FLOAT _Rmp[NUM_CHANNELS][MAX_POINTS],  // in: candidate ramp, one row per channel
+                                    int       dwBlockSize,  // number of pixels to process
+                                    CGU_UINT8 dwNumPoints,  // number of ramp entries
+                                    bool      _ConstRamp,  // true: only ramp entry 0 is tested
+                                    CGU_FLOAT _pfWeights[3],  // per-channel weights applied to the R, G, B terms in that order
+                                    bool      _bUseAlpha)  // true: pixels whose AC value is zero get index dwNumPoints
+{
+    CGU_FLOAT Err   = 0.f;
+    CGU_UINT8 rmp_l = (_ConstRamp) ? 1 : dwNumPoints;  // number of ramp entries actually searched
+
+    // For each colour in the original block assign it
+    // to the closest cluster and compute the cumulative error
+    for (int i = 0; i < dwBlockSize; i++)
+    {
+        if (_bUseAlpha && *((CGU_UINT32*)&_Blk[i][AC]) == 0)  // NOTE(review): tests the float's bit pattern through a CGU_UINT32 pointer, so only +0.0f matches (-0.0f does not); this cast also sidesteps strict aliasing
+            pcIndices[i] = dwNumPoints;  // such pixels add nothing to Err
+        else
+        {
+            CGU_FLOAT shortest      = 99999999999.f;
+            CGU_UINT8 shortestIndex = 0;
+            CGU_UINT8 r;
+            if ((_pfWeights[0] != 1.0f) || (_pfWeights[1] != 1.0f) || (_pfWeights[2] != 1.0f))  // weighted search only when some weight differs from 1
+                for (r = 0; r < rmp_l; r++)
+                {
+                    // weighted squared distance between the pixel and ramp entry r
+                    CGU_FLOAT distance = (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) * _pfWeights[0] +
+                                         (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) * _pfWeights[1] +
+                                         (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]) * _pfWeights[2];
+
+                    if (distance < shortest)  // strict '<': the first entry wins ties
+                    {
+                        shortest      = distance;
+                        shortestIndex = r;
+                    }
+                }
+            else
+                for (r = 0; r < rmp_l; r++)
+                {
+                    // unweighted squared distance between the pixel and ramp entry r
+                    CGU_FLOAT distance = (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) + (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) +
+                                         (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]);
+
+                    if (distance < shortest)  // strict '<': the first entry wins ties
+                    {
+                        shortest      = distance;
+                        shortestIndex = r;
+                    }
+                }
+
+            Err += shortest;
+
+            // We have the index of the best cluster, so assign this in the block
+            // Reorder indices to match correct DXTC ordering: ramp entry 0 stays 0, the last entry becomes 1, entries between shift up by one
+            if (shortestIndex == dwNumPoints - 1)
+                shortestIndex = 1;
+            else if (shortestIndex)
+                shortestIndex++;
+            pcIndices[i] = shortestIndex;
+        }
+    }
+
+    return Err;
+}
+
+/*------------------------------------------------------------------------------------------------
+// Clusterize a block against a ramp given by endpoints on the coarse grid; returns the summed error
+------------------------------------------------------------------------------------------------*/
+CMP_STATIC CGU_FLOAT cpu_ClstrBas(CGU_UINT8 pcIndices[BLOCK_SIZE_4X4],
+                                  CGU_FLOAT _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS],
+                                  CGU_FLOAT _InpRmp[NUM_CHANNELS][NUM_ENDPOINTS],
+                                  int       dwBlockSize,
+                                  CGU_UINT8 dwNumPoints,
+                                  CGU_FLOAT _pfWeights[3],
+                                  bool      _bUseAlpha,
+                                  CGU_UINT8 nRedBits,
+                                  CGU_UINT8 nGreenBits,
+                                  CGU_UINT8 nBlueBits)
+{
+    CGU_FLOAT workEndpoints[NUM_CHANNELS][NUM_ENDPOINTS];
+    CGU_FLOAT workRamp[NUM_CHANNELS][MAX_POINTS];
+    CGU_UINT8 rampIsFlat = 1;
+
+    // step 1: endpoints the way the decompressor will see them (also reports whether the ramp is flat)
+    cpu_MkWkRmpPts(rampIsFlat, workEndpoints, _InpRmp, nRedBits, nGreenBits, nBlueBits);
+    // step 2: the full ramp between those endpoints, as the decompressor would build it
+    cpu_BldRmp(workRamp, workEndpoints, dwNumPoints);
+    // step 3: give every pixel its nearest ramp entry; the cumulative error is the result
+    CGU_FLOAT totalError = cpu_ClstrIntnl(_Blk, pcIndices, workRamp, dwBlockSize, dwNumPoints, rampIsFlat, _pfWeights, _bUseAlpha);
+    return totalError;
+}
+
+CMP_STATIC CGU_UINT8 nByteBitsMask2[9] = {0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff};
+// nByteBitsMask2[n] keeps the top n bits of a byte; cpu_ConstructColor2 packs the masked fields with R shifted highest and B lowest
+CMP_STATIC CGU_UINT32 cpu_ConstructColor2(CGU_UINT8 R, CGU_UINT8 nRedBits, CGU_UINT8 G, CGU_UINT8 nGreenBits, CGU_UINT8 B, CGU_UINT8 nBlueBits)
+{
+    CGU_UINT32 redAndGreen = ((R & nByteBitsMask2[nRedBits]) << (nGreenBits + nBlueBits - (PIX_GRID - nRedBits))) | ((G & nByteBitsMask2[nGreenBits]) << (nBlueBits - (PIX_GRID - nGreenBits)));
+    return redAndGreen | ((B & nByteBitsMask2[nBlueBits]) >> ((PIX_GRID - nBlueBits)));
+}
+
+// Clusterize a block of packed 32-bit pixels against the ramp spanned by nEndpoints: fills pcIndices and
+// returns the summed error from cpu_ClstrBas. The two endpoints are used in swapped order when
+// dwNumPoints is even and c0 <= c1, or dwNumPoints is odd and c0 > c1 (c0/c1 = packed endpoint colours).
+CMP_STATIC CGU_FLOAT cpu_Clstr(CGU_UINT32 block_32[BLOCK_SIZE_4X4],
+                               CGU_UINT32 dwBlockSize,
+                               CGU_UINT8  nEndpoints[3][NUM_ENDPOINTS],
+                               CGU_UINT8  pcIndices[BLOCK_SIZE_4X4],
+                               CGU_UINT8  dwNumPoints,
+                               CGU_FLOAT  _pfWeights[3],
+                               bool       _bUseAlpha,
+                               CGU_UINT8  _nAlphaThreshold,
+                               CGU_UINT8  nRedBits,
+                               CGU_UINT8  nGreenBits,
+                               CGU_UINT8  nBlueBits)
+{
+    CGU_UINT32 c0              = cpu_ConstructColor2(nEndpoints[RC][0], nRedBits, nEndpoints[GC][0], nGreenBits, nEndpoints[BC][0], nBlueBits);
+    CGU_UINT32 c1              = cpu_ConstructColor2(nEndpoints[RC][1], nRedBits, nEndpoints[GC][1], nGreenBits, nEndpoints[BC][1], nBlueBits);
+    bool       swapEndpoints   = (!(dwNumPoints & 0x1) && c0 <= c1) || ((dwNumPoints & 0x1) && c0 > c1);
+    CGU_UINT32 nEndpointIndex0 = swapEndpoints ? 1 : 0;
+    CGU_UINT32 nEndpointIndex1 = swapEndpoints ? 0 : 1;
+
+    CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS];
+    InpRmp[RC][0] = (CGU_FLOAT)nEndpoints[RC][nEndpointIndex0];
+    InpRmp[RC][1] = (CGU_FLOAT)nEndpoints[RC][nEndpointIndex1];
+    InpRmp[GC][0] = (CGU_FLOAT)nEndpoints[GC][nEndpointIndex0];
+    InpRmp[GC][1] = (CGU_FLOAT)nEndpoints[GC][nEndpointIndex1];
+    InpRmp[BC][0] = (CGU_FLOAT)nEndpoints[BC][nEndpointIndex0];
+    InpRmp[BC][1] = (CGU_FLOAT)nEndpoints[BC][nEndpointIndex1];
+
+    // Widen before shifting: the 8-bit threshold would otherwise promote to int, and values >= 128 would shift into the sign bit.
+    CGU_UINT32 dwAlphaThreshold = (CGU_UINT32)_nAlphaThreshold << 24;
+    CGU_FLOAT  Blk[BLOCK_SIZE_4X4][NUM_CHANNELS];
+    for (CGU_UINT32 i = 0; i < dwBlockSize; i++)
+    {
+        Blk[i][RC] = (CGU_FLOAT)((block_32[i] & 0xff0000) >> 16);  // red: bits 16..23
+        Blk[i][GC] = (CGU_FLOAT)((block_32[i] & 0xff00) >> 8);     // green: bits 8..15
+        Blk[i][BC] = (CGU_FLOAT)(block_32[i] & 0xff);              // blue: bits 0..7
+        if (_bUseAlpha)  // AC is written (and later read) only when alpha is in use
+            Blk[i][AC] = ((block_32[i] & 0xff000000) >= dwAlphaThreshold) ? 1.f : 0.f;
+    }
+
+    return cpu_ClstrBas(pcIndices, Blk, InpRmp, dwBlockSize, dwNumPoints, _pfWeights, _bUseAlpha, nRedBits, nGreenBits, nBlueBits);
+}
+
+/*------------------------------------------------------------------------------------------------
+Sum, over all block colours, the squared distance to the nearest ramp entry scaled by _Rpt
+------------------------------------------------------------------------------------------------*/
+CMP_STATIC CGU_FLOAT cpu_ClstrErr(CGU_FLOAT  _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS],
+                                  CGU_FLOAT  _Rpt[BLOCK_SIZE_4X4],
+                                  CGU_FLOAT  _Rmp[NUM_CHANNELS][MAX_POINTS],
+                                  CGU_UINT32 _NmbClrs,
+                                  CGU_UINT32 _blcktp,
+                                  bool       _ConstRamp,
+                                  CGU_Vec3f  channelWeights)
+{
+    // With a constant ramp only entry 0 is examined.
+    CGU_UINT32 searchLen = (_ConstRamp) ? 1 : _blcktp;
+
+    // Unit weights select the multiplication-free distance below.
+    CGU_BOOL isWeighted = ((channelWeights[0] != 1.0f) || (channelWeights[1] != 1.0f) || (channelWeights[2] != 1.0f));
+
+    CGU_FLOAT totalError = 0.f;
+
+    // For each colour in the original block, find the closest ramp entry
+    // and accumulate the cumulative error
+    for (CGU_UINT32 clr = 0; clr < _NmbClrs; clr++)
+    {
+        CGU_FLOAT nearest = 99999999999.f;
+
+        for (CGU_UINT32 pt = 0; pt < searchLen; pt++)
+        {
+            // per-channel offset of this colour from ramp entry pt
+            CGU_FLOAT dR = _Blk[clr][RC] - _Rmp[RC][pt];
+            CGU_FLOAT dG = _Blk[clr][GC] - _Rmp[GC][pt];
+            CGU_FLOAT dB = _Blk[clr][BC] - _Rmp[BC][pt];
+
+            // squared distance, channel-weighted only when some weight differs from 1
+            CGU_FLOAT candidate;
+            if (isWeighted)
+                candidate = dR * dR * channelWeights[0] + dG * dG * channelWeights[1] + dB * dB * channelWeights[2];
+            else
+                candidate = dR * dR + dG * dG + dB * dB;
+
+            // keep the closest entry seen so far
+            if (candidate < nearest)
+                nearest = candidate;
+        }
+
+        // the colour's contribution is scaled by its _Rpt factor
+        totalError += nearest * _Rpt[clr];
+    }
+
+    return totalError;
+}
+
+#if defined(USE_REFINE3D)
+// Joint endpoint refinement: jitters both endpoints of G, B and R together (six nested loops), writes the best pair found to _OutRmpPnts and returns its error
+CMP_STATIC CGU_FLOAT cmp_Refine3D(CGU_FLOAT  _OutRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS],
+                                  CGU_FLOAT  _InpRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS],
+                                  CGU_FLOAT  _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS],
+                                  CGU_FLOAT  _Rpt[BLOCK_SIZE_4X4],
+                                  CGU_UINT32 _NmrClrs,
+                                  CGU_UINT32 dwNumPoints,
+                                  CGU_Vec3f  channelWeights,
+                                  CGU_UINT8  nRedBits,
+                                  CGU_UINT8  nGreenBits,
+                                  CGU_UINT8  nBlueBits,
+                                  CGU_UINT32 nRefineSteps)
+{
+    CGU_FLOAT ALIGN_16 Rmp[NUM_CHANNELS][MAX_POINTS];  // working ramp, rebuilt one channel at a time inside the loops
+
+    CGU_FLOAT Blk[BLOCK_SIZE_4X4][NUM_CHANNELS];  // local copy of the first three channels of each input colour
+    for (CGU_UINT32 i = 0; i < _NmrClrs; i++)
+        for (CGU_UINT32 j = 0; j < 3; j++)
+            Blk[i][j] = _Blk[i][j];
+
+    CGU_FLOAT fWeightRed   = channelWeights.r;
+    CGU_FLOAT fWeightGreen = channelWeights.g;
+    CGU_FLOAT fWeightBlue  = channelWeights.b;
+
+    // here is our grid: one jitter step per channel is 2^(PIX_GRID - bits)
+    CGU_FLOAT Fctrs[3];
+    Fctrs[RC] = (CGU_FLOAT)(1 << (PIX_GRID - nRedBits));
+    Fctrs[GC] = (CGU_FLOAT)(1 << (PIX_GRID - nGreenBits));
+    Fctrs[BC] = (CGU_FLOAT)(1 << (PIX_GRID - nBlueBits));
+
+    CGU_FLOAT InpRmp0[NUM_CHANNELS][NUM_ENDPOINTS];  // unmodified starting endpoints; every jitter is an offset from these
+    CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS];   // endpoints currently being tried
+    for (CGU_UINT32 k = 0; k < 2; k++)
+        for (CGU_UINT32 j = 0; j < 3; j++)
+            InpRmp0[j][k] = InpRmp[j][k] = _OutRmpPnts[j][k] = _InpRmpPnts[j][k];  // output defaults to the input endpoints
+
+    // make ramp endpoints the way they'll going to be decompressed
+    // plus check whether the ramp is flat
+    CGU_UINT8 Eq;
+    CGU_FLOAT WkRmpPts[NUM_CHANNELS][NUM_ENDPOINTS];
+    cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
+
+    // build ramp for all 3 colors
+    cpu_BldRmp(Rmp, WkRmpPts, dwNumPoints);
+
+    // clusterize for the current ramp
+    CGU_FLOAT bestE = cpu_ClstrErr(Blk, _Rpt, Rmp, _NmrClrs, dwNumPoints, Eq, channelWeights);
+    if (bestE == 0.f)  // if exact, we've done
+        return bestE;
+
+    // Jitter endpoints in each direction; NOTE(review): cmp_min mixes a CGU_UINT32 and a CGU_UINT8 argument - presumably caps the step count at 8, confirm against its definition
+    CGU_INT nRefineStart = 0 - (cmp_min(nRefineSteps, (CGU_UINT8)8));
+    CGU_INT nRefineEnd   = cmp_min(nRefineSteps, (CGU_UINT8)8);
+    for (CGU_INT nJitterG0 = nRefineStart; nJitterG0 <= nRefineEnd; nJitterG0++)
+    {
+        InpRmp[GC][0] = cmp_min(cmp_max(InpRmp0[GC][0] + nJitterG0 * Fctrs[GC], 0.f), 255.f);
+        for (CGU_INT nJitterG1 = nRefineStart; nJitterG1 <= nRefineEnd; nJitterG1++)
+        {
+            InpRmp[GC][1] = cmp_min(cmp_max(InpRmp0[GC][1] + nJitterG1 * Fctrs[GC], 0.f), 255.f);
+            cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
+            cpu_BldClrRmp(Rmp[GC], WkRmpPts[GC], dwNumPoints);  // only the green ramp changes at this level
+
+            CGU_FLOAT RmpErrG[MAX_POINTS][BLOCK_SIZE_4X4];  // weighted green error per ramp point and colour, reused by the inner loops
+            for (CGU_UINT32 i = 0; i < _NmrClrs; i++)
+            {
+                for (CGU_UINT32 r = 0; r < dwNumPoints; r++)
+                {
+                    CGU_FLOAT DistG = (Rmp[GC][r] - Blk[i][GC]);
+                    RmpErrG[r][i]   = DistG * DistG * fWeightGreen;
+                }
+            }
+
+            for (CGU_INT nJitterB0 = nRefineStart; nJitterB0 <= nRefineEnd; nJitterB0++)
+            {
+                InpRmp[BC][0] = cmp_min(cmp_max(InpRmp0[BC][0] + nJitterB0 * Fctrs[BC], 0.f), 255.f);
+                for (CGU_INT nJitterB1 = nRefineStart; nJitterB1 <= nRefineEnd; nJitterB1++)
+                {
+                    InpRmp[BC][1] = cmp_min(cmp_max(InpRmp0[BC][1] + nJitterB1 * Fctrs[BC], 0.f), 255.f);
+                    cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
+                    cpu_BldClrRmp(Rmp[BC], WkRmpPts[BC], dwNumPoints);  // only the blue ramp changes at this level
+
+                    CGU_FLOAT RmpErr[MAX_POINTS][BLOCK_SIZE_4X4];  // green + blue error per ramp point and colour
+                    for (CGU_UINT32 i = 0; i < _NmrClrs; i++)
+                    {
+                        for (CGU_UINT32 r = 0; r < dwNumPoints; r++)
+                        {
+                            CGU_FLOAT DistB = (Rmp[BC][r] - Blk[i][BC]);
+                            RmpErr[r][i]    = RmpErrG[r][i] + DistB * DistB * fWeightBlue;
+                        }
+                    }
+
+                    for (CGU_INT nJitterR0 = nRefineStart; nJitterR0 <= nRefineEnd; nJitterR0++)
+                    {
+                        InpRmp[RC][0] = cmp_min(cmp_max(InpRmp0[RC][0] + nJitterR0 * Fctrs[RC], 0.f), 255.f);
+                        for (CGU_INT nJitterR1 = nRefineStart; nJitterR1 <= nRefineEnd; nJitterR1++)
+                        {
+                            InpRmp[RC][1] = cmp_min(cmp_max(InpRmp0[RC][1] + nJitterR1 * Fctrs[RC], 0.f), 255.f);
+                            cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
+                            cpu_BldClrRmp(Rmp[RC], WkRmpPts[RC], dwNumPoints);  // only the red ramp changes at this level
+
+                            // compute cumulative error
+                            CGU_FLOAT mse   = 0.f;
+                            CGU_INT   rmp_l = (Eq > 0) ? 1 : dwNumPoints;  // flat ramp: only entry 0 is tested
+                            for (CGU_UINT32 k = 0; k < _NmrClrs; k++)
+                            {
+                                CGU_FLOAT MinErr = 10000000.f;
+                                for (CGU_INT r = 0; r < rmp_l; r++)
+                                {
+                                    CGU_FLOAT Dist = (Rmp[RC][r] - Blk[k][RC]);
+                                    CGU_FLOAT Err  = RmpErr[r][k] + Dist * Dist * fWeightRed;  // cached green + blue error plus the red term
+                                    MinErr         = cmp_min(MinErr, Err);
+                                }
+                                mse += MinErr * _Rpt[k];
+                            }
+
+                            // save if we achieve better result
+                            if (mse < bestE)
+                            {
+                                bestE = mse;
+                                for (CGU_UINT32 k = 0; k < 2; k++)
+                                    for (CGU_UINT32 j = 0; j < 3; j++)
+                                        _OutRmpPnts[j][k] = InpRmp[j][k];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return bestE;
+}
+#endif
+
+#if defined(USE_REFINE)
+CMP_STATIC CGU_FLOAT cmp_Refine(CGU_FLOAT  _OutRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS],
+                                CGU_FLOAT  _InpRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS],
+                                CGU_FLOAT  _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS],
+                                CGU_FLOAT  _Rpt[BLOCK_SIZE_4X4],
+                                CGU_INT    _NmrClrs,
+                                CGU_UINT8  dwNumPoints,
+                                CGU_Vec3f  channelWeights,
+                                CGU_UINT32 nRedBits,
+                                CGU_UINT32 nGreenBits,
+                                CGU_UINT32 nBlueBits,
+                                CGU_UINT32 nRefineSteps)
+{
+    CGU_FLOAT ALIGN_16 Rmp[NUM_CHANNELS][MAX_POINTS];
+
+    if (nRefineSteps == 0)
+        nRefineSteps = 1;
+
+    CGU_FLOAT Blk[BLOCK_SIZE_4X4][NUM_CHANNELS];
+    for (CGU_INT i = 0; i < _NmrClrs; i++)
+        for (CGU_INT j = 0; j < 3; j++)
+            Blk[i][j] = _Blk[i][j];
+
+    CGU_FLOAT fWeightRed   = channelWeights.r;
+    CGU_FLOAT fWeightGreen = channelWeights.g;
+    CGU_FLOAT fWeightBlue  = channelWeights.b;
+
+    // here is our grid
+    CGU_FLOAT Fctrs[3];
+    Fctrs[RC] = (CGU_FLOAT)(1 << (PIX_GRID - nRedBits));
+    Fctrs[GC] = (CGU_FLOAT)(1 << (PIX_GRID - nGreenBits));
+    Fctrs[BC] = (CGU_FLOAT)(1 << (PIX_GRID - nBlueBits));
+
+    CGU_FLOAT InpRmp0[NUM_CHANNELS][NUM_ENDPOINTS];
+    CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS];
+    for (CGU_INT k = 0; k < 2; k++)
+        for (CGU_INT j = 0; j < 3; j++)
+            InpRmp0[j][k] = InpRmp[j][k] = _OutRmpPnts[j][k] = _InpRmpPnts[j][k];
+
+    // make ramp endpoints the way they'll going to be decompressed
+    // plus check whether the ramp is flat
+    CGU_UINT8 Eq;
+    CGU_FLOAT WkRmpPts[NUM_CHANNELS][NUM_ENDPOINTS];
+    cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
+
+    // build ramp for all 3 colors
+    cpu_BldRmp(Rmp, WkRmpPts, dwNumPoints);
+
+    // clusterize for the current ramp
+    CGU_FLOAT bestE = cpu_ClstrErr(Blk, _Rpt, Rmp, _NmrClrs, dwNumPoints, Eq, channelWeights);
+    if (bestE == 0.f)  //  || !nRefineSteps)    // if exact, we've done
+        return bestE;
+
+    // Tweak each component in isolation and get the best values
+
+    // precompute ramp errors for Green and Blue
+    CGU_FLOAT RmpErr[MAX_POINTS][BLOCK_SIZE_4X4];
+    for (CGU_INT i = 0; i < _NmrClrs; i++)
+    {
+        for (CGU_INT r = 0; r < dwNumPoints; r++)
+        {
+            CGU_FLOAT DistG = (Rmp[GC][r] - Blk[i][GC]);
+            CGU_FLOAT DistB = (Rmp[BC][r] - Blk[i][BC]);
+            RmpErr[r][i]    = DistG * DistG * fWeightGreen + DistB * DistB * fWeightBlue;
+        }
+    }
+
+    // First Red
+    CGU_FLOAT bstC0        = InpRmp0[RC][0];
+    CGU_FLOAT bstC1        = InpRmp0[RC][1];
+    CGU_INT   nRefineStart = 0 - (cmp_min(nRefineSteps, (CGU_UINT8)8));
+    CGU_INT   nRefineEnd   = cmp_min(nRefineSteps, (CGU_UINT8)8);
+    for (CGU_INT i = nRefineStart; i <= nRefineEnd; i++)
+    {
+        for (CGU_INT j = nRefineStart; j <= nRefineEnd; j++)
+        {
+            // make a move; both sides of interval.
+            InpRmp[RC][0] = cmp_min(cmp_max(InpRmp0[RC][0] + i * Fctrs[RC], 0.f), 255.f);
+            InpRmp[RC][1] = cmp_min(cmp_max(InpRmp0[RC][1] + j * Fctrs[RC], 0.f), 255.f);
+
+            // make ramp endpoints the way they'll going to be decompressed
+            // plus check whether the ramp is flat
+            cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
+
+            // build ramp only for red
+            cpu_BldClrRmp(Rmp[RC], WkRmpPts[RC], dwNumPoints);
+
+            // compute cumulative error
+            CGU_FLOAT mse   = 0.f;
+            CGU_INT   rmp_l = (Eq > 0) ? 1 : dwNumPoints;
+            for (CGU_INT k = 0; k < _NmrClrs; k++)
+            {
+                CGU_FLOAT MinErr = 10000000.f;
+                for (CGU_INT r = 0; r < rmp_l; r++)
+                {
+                    CGU_FLOAT Dist = (Rmp[RC][r] - Blk[k][RC]);
+                    CGU_FLOAT Err  = RmpErr[r][k] + Dist * Dist * fWeightRed;
+                    MinErr         = cmp_minf(MinErr, Err);
+                }
+                mse += MinErr * _Rpt[k];
+            }
+
+            // save if we achieve better result
+            if (mse < bestE)
+            {
+                bstC0 = InpRmp[RC][0];
+                bstC1 = InpRmp[RC][1];
+                bestE = mse;
+            }
+        }
+    }
+
+    // our best REDs
+    InpRmp[RC][0] = bstC0;
+    InpRmp[RC][1] = bstC1;
+
+    // make ramp endpoints the way they'll going to be decompressed
+    // plus check whether the ramp is flat
+    cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
+
+    // build ramp only for green
+    cpu_BldRmp(Rmp, WkRmpPts, dwNumPoints);
+
+    // precompute ramp errors for Red and Blue
+    for (CGU_INT i = 0; i < _NmrClrs; i++)
+    {
+        for (CGU_INT r = 0; r < dwNumPoints; r++)
+        {
+            CGU_FLOAT DistR = (Rmp[RC][r] - Blk[i][RC]);
+            CGU_FLOAT DistB = (Rmp[BC][r] - Blk[i][BC]);
+            RmpErr[r][i]    = DistR * DistR * fWeightRed + DistB * DistB * fWeightBlue;
+        }
+    }
+
+    // Now green
+    bstC0 = InpRmp0[GC][0];
+    bstC1 = InpRmp0[GC][1];
+    for (CGU_INT i = nRefineStart; i <= nRefineEnd; i++)
+    {
+        for (CGU_INT j = nRefineStart; j <= nRefineEnd; j++)
+        {
+            InpRmp[GC][0] = cmp_minf(cmp_maxf(InpRmp0[GC][0] + i * Fctrs[GC], 0.f), 255.f);
+            InpRmp[GC][1] = cmp_minf(cmp_maxf(InpRmp0[GC][1] + j * Fctrs[GC], 0.f), 255.f);
+
+            cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
+            cpu_BldClrRmp(Rmp[GC], WkRmpPts[GC], dwNumPoints);
+
+            CGU_FLOAT mse   = 0.f;
+            CGU_INT   rmp_l = (Eq > 0) ? 1 : dwNumPoints;
+            for (CGU_INT k = 0; k < _NmrClrs; k++)
+            {
+                CGU_FLOAT MinErr = 10000000.f;
+                for (CGU_INT r = 0; r < rmp_l; r++)
+                {
+                    CGU_FLOAT Dist = (Rmp[GC][r] - Blk[k][GC]);
+                    CGU_FLOAT Err  = RmpErr[r][k] + Dist * Dist * fWeightGreen;
+                    MinErr         = cmp_minf(MinErr, Err);
+                }
+                mse += MinErr * _Rpt[k];
+            }
+
+            if (mse < bestE)
+            {
+                bstC0 = InpRmp[GC][0];
+                bstC1 = InpRmp[GC][1];
+                bestE = mse;
+            }
+        }
+    }
+
+    // our best GREENs
+    InpRmp[GC][0] = bstC0;
+    InpRmp[GC][1] = bstC1;
+
+    cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
+    cpu_BldRmp(Rmp, WkRmpPts, dwNumPoints);
+
+    // ramp err for Red and Green
+    for (CGU_INT i = 0; i < _NmrClrs; i++)
+    {
+        for (CGU_INT r = 0; r < dwNumPoints; r++)
+        {
+            CGU_FLOAT DistR = (Rmp[RC][r] - Blk[i][RC]);
+            CGU_FLOAT DistG = (Rmp[GC][r] - Blk[i][GC]);
+            RmpErr[r][i]    = DistR * DistR * fWeightRed + DistG * DistG * fWeightGreen;
+        }
+    }
+
+    bstC0 = InpRmp0[BC][0];
+    bstC1 = InpRmp0[BC][1];
+    // Now blue
+    for (CGU_INT i = nRefineStart; i <= nRefineEnd; i++)
+    {
+        for (CGU_INT j = nRefineStart; j <= nRefineEnd; j++)
+        {
+            InpRmp[BC][0] = min(max(InpRmp0[BC][0] + i * Fctrs[BC], 0.f), 255.f);
+            InpRmp[BC][1] = min(max(InpRmp0[BC][1] + j * Fctrs[BC], 0.f), 255.f);
+
+            cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
+            cpu_BldClrRmp(Rmp[BC], WkRmpPts[BC], dwNumPoints);
+
+            CGU_FLOAT mse   = 0.f;
+            CGU_INT   rmp_l = (Eq > 0) ? 1 : dwNumPoints;
+            for (CGU_INT k = 0; k < _NmrClrs; k++)
+            {
+                CGU_FLOAT MinErr = 10000000.f;
+                for (CGU_INT r = 0; r < rmp_l; r++)
+                {
+                    CGU_FLOAT Dist = (Rmp[BC][r] - Blk[k][BC]);
+                    CGU_FLOAT Err  = RmpErr[r][k] + Dist * Dist * fWeightBlue;
+                    MinErr         = min(MinErr, Err);
+                }
+                mse += MinErr * _Rpt[k];
+            }
+
+            if (mse < bestE)
+            {
+                bstC0 = InpRmp[BC][0];
+                bstC1 = InpRmp[BC][1];
+                bestE = mse;
+            }
+        }
+    }
+
+    // our best BLUEs
+    InpRmp[BC][0] = bstC0;
+    InpRmp[BC][1] = bstC1;
+
+    // return our best choice
+    for (CGU_INT j = 0; j < 3; j++)
+        for (CGU_INT k = 0; k < 2; k++)
+            _OutRmpPnts[j][k] = InpRmp[j][k];
+
+    return bestE;
+}
+
+#endif
+
+//======================================================================================
+// Codec from CompressonatorLib
+//======================================================================================
+#define BLOCK_SIZE_4X4 16  // number of texels in one 4x4 block (used below as the per-block array length)
+#define RG 5               // presumably the red bit count of a 5:6:5 end point -- TODO confirm
+#define GG 6               // presumably the green bit count of a 5:6:5 end point -- TODO confirm
+#define BG 5               // presumably the blue bit count of a 5:6:5 end point -- TODO confirm
+
+/*------------------------------------------------------------------------------------------------
+// Snap floating-point ramp end points onto the grid the compressed format will round them to
+------------------------------------------------------------------------------------------------*/
+CMP_STATIC void cpu_MkRmpOnGrid(CGU_FLOAT _RmpF[NUM_CHANNELS][NUM_ENDPOINTS],
+                                CGU_FLOAT _MnMx[NUM_CHANNELS][NUM_ENDPOINTS],
+                                CGU_FLOAT _Min,
+                                CGU_FLOAT _Max,
+                                CGU_UINT8 nRedBits,
+                                CGU_UINT8 nGreenBits,
+                                CGU_UINT8 nBlueBits)
+{
+    CGU_FLOAT levels[3];    // 1 << bits, per channel
+    CGU_FLOAT gridStep[3];  // 1 << (PIX_GRID - bits), per channel
+
+    levels[RC]   = (CGU_FLOAT)(1 << nRedBits);
+    gridStep[RC] = (CGU_FLOAT)(1 << (PIX_GRID - nRedBits));
+    levels[GC]   = (CGU_FLOAT)(1 << nGreenBits);
+    gridStep[GC] = (CGU_FLOAT)(1 << (PIX_GRID - nGreenBits));
+    levels[BC]   = (CGU_FLOAT)(1 << nBlueBits);
+    gridStep[BC] = (CGU_FLOAT)(1 << (PIX_GRID - nBlueBits));
+
+    for (int ch = 0; ch < 3; ch++)
+    {
+        for (int ep = 0; ep < 2; ep++)
+        {
+            CGU_FLOAT v = cmp_floor(_MnMx[ch][ep]);
+            if (v <= _Min)
+                v = _Min;
+            else
+            {
+                v += cmp_floor(128.f / levels[ch]) - cmp_floor(v / levels[ch]);
+                v = cmp_minf(v, _Max);
+            }
+            // truncate down to a multiple of the channel's grid step
+            _RmpF[ch][ep] = cmp_floor(v / gridStep[ch]) * gridStep[ch];
+        }
+    }
+}
+
+// Find the first approximation of the line (axis) through the block's colours.
+// Assumes a linear relation Z = a * X_In, Z = b * Y_In and finds a, b minimising the MSE between Z and Z_In.
+// Outputs: BlkSh = colours as offsets from the weighted centre fBlockCenter; LineDir0 = normalised
+//   direction (left all-zero when AxisIsSmall is set); AxisIsSmall = 1 when every channel's weighted
+//   squared-offset sum is below fNumPoints * EPS2. _inpRpt[i] weights unique colour BlkUV[i].
+CMP_STATIC void cpu_FindAxis(CMP_OUT CGU_FLOAT              BlkSh[BLOCK_SIZE_4X4][NUM_CHANNELS],
+                             CMP_IN CGU_FLOAT               LineDir0[NUM_CHANNELS],
+                             CMP_IN CGU_FLOAT               fBlockCenter[NUM_CHANNELS],
+                             CMP_OUT CGU_UINT8 CMP_REFINOUT AxisIsSmall,
+                             CMP_IN CGU_FLOAT               BlkUV[BLOCK_SIZE_4X4][NUM_CHANNELS],
+                             CMP_IN CGU_FLOAT               _inpRpt[BLOCK_SIZE_4X4],
+                             CMP_IN int                     nDimensions,
+                             CMP_IN int                     dwUniqueColors)
+{
+    CGU_FLOAT Crrl[NUM_CHANNELS];
+    CGU_FLOAT RGB2[NUM_CHANNELS];
+    CGU_INT   i;
+
+    LineDir0[0] = LineDir0[1] = LineDir0[2] = RGB2[0] = RGB2[1] = RGB2[2] = Crrl[0] = Crrl[1] = Crrl[2] = fBlockCenter[0] = fBlockCenter[1] = fBlockCenter[2] =
+        0.f;
+
+    // sum position of all points
+    CGU_FLOAT fNumPoints = 0.f;
+    for (i = 0; i < dwUniqueColors; i++)
+    {
+        fBlockCenter[0] += BlkUV[i][0] * _inpRpt[i];
+        fBlockCenter[1] += BlkUV[i][1] * _inpRpt[i];
+        fBlockCenter[2] += BlkUV[i][2] * _inpRpt[i];
+        fNumPoints += _inpRpt[i];
+    }
+
+    // and then average to calculate center coordinate of block
+    fBlockCenter[0] /= fNumPoints;
+    fBlockCenter[1] /= fNumPoints;
+    fBlockCenter[2] /= fNumPoints;
+
+    for (i = 0; i < dwUniqueColors; i++)
+    {
+        // calculate output block as offsets around block center
+        BlkSh[i][0] = BlkUV[i][0] - fBlockCenter[0];
+        BlkSh[i][1] = BlkUV[i][1] - fBlockCenter[1];
+        BlkSh[i][2] = BlkUV[i][2] - fBlockCenter[2];
+
+        // compute correlation matrix
+        // RGB2 = sum of ((distance from point from center) squared)
+        // Crrl[j] = weighted sum of (offset in channel j) * (offset in channel (j + 1) % 3), i.e. the cross terms
+        for (int j = 0; j < nDimensions; j++)
+        {
+            RGB2[j] += BlkSh[i][j] * BlkSh[i][j] * _inpRpt[i];
+            Crrl[j] += BlkSh[i][j] * BlkSh[i][(j + 1) % 3] * _inpRpt[i];
+        }
+    }
+
+    // count the channels with significant spread (k), zero the others, and remember the widest one (i0)
+    int       i0 = 0, i1 = 1;
+    CGU_FLOAT mxRGB2 = 0.f;
+    int       k = 0, j = 0;
+    CGU_FLOAT fEPS = fNumPoints * EPS;
+    for (k = 0, j = 0; j < 3; j++)
+    {
+        if (RGB2[j] >= fEPS)
+            k++;
+        else
+            RGB2[j] = 0.f;
+
+        if (mxRGB2 < RGB2[j])
+        {
+            mxRGB2 = RGB2[j];
+            i0     = j;
+        }
+    }
+
+    CGU_FLOAT fEPS2 = fNumPoints * EPS2;
+    AxisIsSmall     = 1;
+    for (j = 0; j < 3; j++)
+    {
+        AxisIsSmall &= (RGB2[j] < fEPS2);
+    }
+
+    if (AxisIsSmall)  // all are very small to avoid division on the small determinant
+        return;
+
+    if (k == 1)  // really only 1 dimension
+        LineDir0[i0] = 1.;
+    else if (k == 2)
+    {  // really only 2 dimensions
+        i1            = (RGB2[(i0 + 1) % 3] > 0.f) ? (i0 + 1) % 3 : (i0 + 2) % 3;
+        CGU_FLOAT Crl = (i1 == (i0 + 1) % 3) ? Crrl[i0] : Crrl[(i0 + 2) % 3];
+        LineDir0[i1]  = Crl / RGB2[i0];
+        LineDir0[i0]  = 1.;
+    }
+    else
+    {
+        CGU_FLOAT maxDet = 100000.f;  // NOTE(review): i0 below only changes if a determinant exceeds this start value -- confirm intended
+        CGU_FLOAT Cs[3];
+        // select max det for precision
+        for (j = 0; j < nDimensions; j++)
+        {
+            CGU_FLOAT Det = RGB2[j] * RGB2[(j + 1) % 3] - Crrl[j] * Crrl[j];
+            Cs[j]         = abs(Crrl[j] / sqrt(RGB2[j] * RGB2[(j + 1) % 3]));  // NOTE(review): Cs is never read in this function
+            if (maxDet < Det)
+            {
+                maxDet = Det;
+                i0     = j;
+            }
+        }
+
+        // inverse correl matrix
+        //  --      --       --      --
+        //  |  A   B |       |  C  -B |
+        //  |  B   C |  =>   | -B   A |
+        //  --      --       --     --
+        CGU_FLOAT mtrx1[2][2];
+        CGU_FLOAT vc1[2];
+        CGU_FLOAT vc[2];
+        vc1[0] = Crrl[(i0 + 2) % 3];
+        vc1[1] = Crrl[(i0 + 1) % 3];
+        // C
+        mtrx1[0][0] = RGB2[(i0 + 1) % 3];
+        // A
+        mtrx1[1][1] = RGB2[i0];
+        // -B
+        mtrx1[1][0] = mtrx1[0][1] = -Crrl[i0];
+        // find a solution
+        vc[0] = mtrx1[0][0] * vc1[0] + mtrx1[0][1] * vc1[1];
+        vc[1] = mtrx1[1][0] * vc1[0] + mtrx1[1][1] * vc1[1];
+        // normalize
+        vc[0] /= maxDet;
+        vc[1] /= maxDet;
+        // find a line direction vector
+        LineDir0[i0]           = 1.;
+        LineDir0[(i0 + 1) % 3] = 1.;
+        LineDir0[(i0 + 2) % 3] = vc[0] + vc[1];
+    }
+
+    // normalize direction vector
+    CGU_FLOAT Len = LineDir0[0] * LineDir0[0] + LineDir0[1] * LineDir0[1] + LineDir0[2] * LineDir0[2];
+    Len           = sqrt(Len);
+
+    for (j = 0; j < 3; j++)
+        LineDir0[j] = (Len > 0.f) ? LineDir0[j] / Len : 0.f;
+}
+
+CMP_STATIC CGU_FLOAT cpu_RampSrchW(CGU_FLOAT Prj[BLOCK_SIZE_4X4],
+                                   CGU_FLOAT PrjErr[BLOCK_SIZE_4X4],
+                                   CGU_FLOAT PreMRep[BLOCK_SIZE_4X4],
+                                   CGU_FLOAT StepErr,
+                                   CGU_FLOAT lowPosStep,
+                                   CGU_FLOAT highPosStep,
+                                   int       dwUniqueColors,
+                                   int       dwNumPoints)
+{
+    CGU_FLOAT error  = 0.0f;
+    CGU_FLOAT step   = (highPosStep - lowPosStep) / (dwNumPoints - 1);
+    CGU_FLOAT step_h = step * 0.5f;
+    CGU_FLOAT rstep  = (CGU_FLOAT)1.0f / step;
+    CGU_INT   i;
+
+    for (i = 0; i < dwUniqueColors; i++)
+    {
+        // Work out which value in the block this select
+        CGU_FLOAT del = Prj[i] - lowPosStep;
+
+        CGU_FLOAT v;
+
+        if (del <= 0)
+            v = lowPosStep;
+        else if (Prj[i] - highPosStep >= 0)
+            v = highPosStep;
+        else
+            v = cmp_floor((del + step_h) * rstep) * step + lowPosStep;
+
+        // And accumulate the error
+        CGU_FLOAT d = (Prj[i] - v);
+        d *= d;
+        CGU_FLOAT err = PreMRep[i] * d + PrjErr[i];
+        error += err;
+        if (StepErr < error)
+        {
+            error = StepErr;
+            break;
+        }
+    }
+    return error;
+}
+
+// Scalar 8x8 grid search for the ramp end-point pair (low, high) that minimises cpu_RampSrchW().
+// endpointsIn holds the initial pair; the best pair goes to endpointsOut and its error is returned.
+CMP_STATIC CGU_FLOAT _cpu_bc1ComputeBestEndpoints(CGU_FLOAT endpointsOut[NUM_ENDPOINTS],
+                                                  CGU_FLOAT endpointsIn[NUM_ENDPOINTS],
+                                                  CGU_FLOAT prj[BLOCK_SIZE_4X4],
+                                                  CGU_FLOAT prjError[BLOCK_SIZE_4X4],
+                                                  CGU_FLOAT preMRep[BLOCK_SIZE_4X4],
+                                                  int       numColours,
+                                                  int       numPoints)
+{
+    static const CGU_FLOAT searchStep = 0.025f;
+
+    // Widen the initial interval by two search steps on each side, clamped to [0, 1].
+    const CGU_FLOAT lowStart  = (endpointsIn[0] - 2.0f * searchStep > 0.0f) ? endpointsIn[0] - 2.0f * searchStep : 0.0f;
+    const CGU_FLOAT highStart = (endpointsIn[1] + 2.0f * searchStep < 1.0f) ? endpointsIn[1] + 2.0f * searchStep : 1.0f;
+
+    CGU_FLOAT minError = MAX_ERROR;
+    CGU_FLOAT lowStep  = lowStart;
+    for (int low = 0; low < 8; ++low)
+    {
+        // Restart the high sweep for every low candidate so the whole 8x8 grid is searched.
+        CGU_FLOAT highStep = highStart;
+        for (int high = 0; high < 8; ++high)
+        {
+            // compute an error for the current pair of end points (minError doubles as the early-out bound).
+            CGU_FLOAT error = cpu_RampSrchW(prj, prjError, preMRep, minError, lowStep, highStep, numColours, numPoints);
+
+            if (error < minError)
+            {
+                // save better result
+                minError        = error;
+                endpointsOut[0] = lowStep;
+                endpointsOut[1] = highStep;
+            }
+            highStep -= searchStep;
+        }
+        lowStep += searchStep;
+    }
+
+    return minError;
+}
+
+//    Floating-point search (C version) for the two RGB ramp end points of a block. Input colours are in the
+//    [0., 255.] range; the unique colours and their repeat counts (Rpt) must already be known. Returns false when the
+//    projected range is degenerate, else true. _RsltRmpPnts is only touched by the refinement calls under USE_REFINE / USE_REFINE3D.
+CMP_STATIC bool cpu_CompressRGBBlockX(CMP_OUT CGU_FLOAT _RsltRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS],
+                                      CMP_IN CGU_FLOAT  src_image[BLOCK_SIZE_4X4][NUM_CHANNELS],
+                                      CMP_IN CGU_FLOAT  Rpt[BLOCK_SIZE_4X4],
+                                      CMP_IN int        dwUniqueColors,
+                                      CMP_IN CGU_UINT8  dwNumPoints,
+                                      CMP_IN bool       b3DRefinement,
+                                      CMP_IN CGU_UINT8  nRefinementSteps,
+                                      CMP_IN CGU_FLOAT  pfWeights[3],
+                                      CMP_IN CGU_UINT8  nRedBits,
+                                      CMP_IN CGU_UINT8  nGreenBits,
+                                      CMP_IN CGU_UINT8  nBlueBits,
+                                      CMP_IN CGU_FLOAT  fquality)
+{
+#if !defined(ASPM_GPU)
+    if (!g_bc1FunctionPointersSet)
+    {
+        bc1ToggleSIMD(EXTENSION_COUNT);
+    }
+#endif
+
+    CGU_FLOAT ALIGN_16 Prj0[BLOCK_SIZE_4X4];
+    CGU_FLOAT ALIGN_16 Prj[BLOCK_SIZE_4X4];
+    CGU_FLOAT ALIGN_16 PrjErr[BLOCK_SIZE_4X4];
+    CGU_FLOAT ALIGN_16 LineDir[NUM_CHANNELS];
+    CGU_FLOAT ALIGN_16 RmpIndxs[BLOCK_SIZE_4X4];
+
+    CMP_UNUSED(fquality);
+    CMP_UNUSED(b3DRefinement)
+
+    CGU_FLOAT LineDirG[NUM_CHANNELS];
+    CGU_FLOAT PosG[NUM_ENDPOINTS];
+    CGU_FLOAT BlkUV[BLOCK_SIZE_4X4][NUM_CHANNELS];
+    CGU_FLOAT BlkSh[BLOCK_SIZE_4X4][NUM_CHANNELS];
+    CGU_FLOAT LineDir0[NUM_CHANNELS];
+    CGU_FLOAT Mdl[NUM_CHANNELS];
+
+    CGU_FLOAT rsltC[NUM_CHANNELS][NUM_ENDPOINTS];
+    int       i, j, k;
+
+    // down to [0., 1.]
+    for (i = 0; i < dwUniqueColors; i++)
+        for (j = 0; j < 3; j++)
+            BlkUV[i][j] = src_image[i][j] / 255.f;
+
+    bool isDONE = false;
+
+    // as usual, if there are no more than 2 different colors, we are done
+    if (dwUniqueColors <= 2)
+    {
+        for (j = 0; j < 3; j++)
+        {
+            rsltC[j][0] = src_image[0][j];
+            rsltC[j][1] = src_image[dwUniqueColors - 1][j];
+        }
+        isDONE = true;
+    }
+
+    if (!isDONE)
+    {
+        //    This is our first attempt to find an axis we will go along.
+        //    The cumulation is done to find a line minimizing the MSE from the input 3D points.
+        CGU_UINT8 bSmall;
+        cpu_FindAxis(BlkSh, LineDir0, Mdl, bSmall, BlkUV, Rpt, 3, dwUniqueColors);
+
+        //    While trying to find the axis we found that the diameter of the input set is quite small.
+        //    Do not bother.
+        if (bSmall)
+        {
+            for (j = 0; j < 3; j++)
+            {
+                rsltC[j][0] = src_image[0][j];
+                rsltC[j][1] = src_image[dwUniqueColors - 1][j];
+            }
+            isDONE = true;
+        }
+    }
+
+    // GCC is being an awful being when it comes to goto-jumps.
+    // So please bear with this.
+    if (!isDONE)
+    {
+        CGU_FLOAT          ErrG = 10000000.f;
+        CGU_FLOAT          PrjBnd[NUM_ENDPOINTS];
+        CGU_FLOAT ALIGN_16 PreMRep[BLOCK_SIZE_4X4];
+        for (j = 0; j < 3; j++)
+            LineDir[j] = LineDir0[j];
+
+        //    Here is the main loop.
+        //    1. Project input set on the axis in consideration.
+        //    2. Run 1 dimensional search (see scalar case) to find an (sub) optimal pair of end points.
+        //    3. Compute the vector of indexes (or clusters) for the current approximate ramp.
+        //    4. Present our color channels as 3 16DIM vectors.
+        //    5. Find closest approximation of each of 16DIM color vector with the projection of the 16DIM index vector.
+        //    6. Plug the projections as a new directional vector for the axis.
+        //    7. Goto 1.
+        //    D - is 16 dim "index" vector (or 16 DIM vector of indexes - {0, 1/3, 2/3, 0, ...,}, but shifted and normalized).
+        //    Ci - is a 16 dim vector of color i.
+        //    for each Ci find a scalar Ai such that
+        //    (Ai * D - Ci) (Ai * D - Ci) -> min , i.e distance between vector AiD and C is min.
+        //    You can think of D as a unit interval(vector) "clusterizer",
+        //    and Ai is a scale you need to apply to the clusterizer to
+        //    approximate the Ci vector instead of the unit vector.
+        //    Solution is
+        //    Ai = (D . Ci) / (D . D); . - is a dot product.
+        //    in 3 dim space Ai(s) represent a line direction, along which
+        //    we again try to find (sub)optimal quantizer.
+
+        //    That's what our for(;;) loop is about.
+        for (;;)
+        {
+            //  1. Project input set on the axis in consideration.
+            // From Foley & Van Dam: Closest point of approach of a line (P + v) to a point (R) is
+            //                            P + ((R-P).v) / (v.v))v
+            // The distance along v is therefore (R-P).v / (v.v)
+            // (v.v) is 1 if v is a unit vector.
+            //
+            PrjBnd[0] = 1000.;
+            PrjBnd[1] = -1000.;
+            for (i = 0; i < BLOCK_SIZE_4X4; i++)
+                Prj0[i] = Prj[i] = PrjErr[i] = PreMRep[i] = 0.f;
+
+            for (i = 0; i < dwUniqueColors; i++)
+            {
+                Prj0[i] = Prj[i] = BlkSh[i][0] * LineDir[0] + BlkSh[i][1] * LineDir[1] + BlkSh[i][2] * LineDir[2];
+
+                PrjErr[i] = (BlkSh[i][0] - LineDir[0] * Prj[i]) * (BlkSh[i][0] - LineDir[0] * Prj[i]) +
+                            (BlkSh[i][1] - LineDir[1] * Prj[i]) * (BlkSh[i][1] - LineDir[1] * Prj[i]) +
+                            (BlkSh[i][2] - LineDir[2] * Prj[i]) * (BlkSh[i][2] - LineDir[2] * Prj[i]);
+
+                PrjBnd[0] = min(PrjBnd[0], Prj[i]);  // NOTE(review): plain min/max here; other code in this file uses cmp_minf/cmp_maxf
+                PrjBnd[1] = max(PrjBnd[1], Prj[i]);
+            }
+
+            //  2. Run 1 dimensional search (see scalar case) to find an (sub) optimal pair of end points.
+
+            // min and max of the search interval
+            CGU_FLOAT stepf = 0.125f;
+
+            CGU_FLOAT Scl[NUM_ENDPOINTS];
+            Scl[0] = PrjBnd[0] - (PrjBnd[1] - PrjBnd[0]) * stepf;
+            Scl[1] = PrjBnd[1] + (PrjBnd[1] - PrjBnd[0]) * stepf;
+
+            // No range found exit
+            if (Scl[0] == Scl[1])
+            {
+                return false;
+            }
+
+            // compute scaling factor to scale down the search interval to [0.,1]
+            const CGU_FLOAT Scl2    = (Scl[1] - Scl[0]) * (Scl[1] - Scl[0]);
+            const CGU_FLOAT overScl = 1.f / (Scl[1] - Scl[0]);
+
+            for (i = 0; i < dwUniqueColors; i++)
+            {
+                // scale them
+                Prj[i] = (Prj[i] - Scl[0]) * overScl;
+                // premultiply the scale square to plug into error computation later
+                PreMRep[i] = Rpt[i] * Scl2;
+            }
+
+            // scale first approximation of end points
+            PrjBnd[0] = (PrjBnd[0] - Scl[0]) * overScl;
+            PrjBnd[1] = (PrjBnd[1] - Scl[0]) * overScl;
+
+            // find the best endpoints
+            CGU_FLOAT Pos[NUM_ENDPOINTS];
+#if defined(ASPM_GPU)
+            CGU_FLOAT StepErr = _cpu_bc1ComputeBestEndpoints(Pos, PrjBnd, Prj, PrjErr, PreMRep, dwUniqueColors, dwNumPoints);
+#else
+            CGU_FLOAT StepErr = cpu_bc1ComputeBestEndpoints(Pos, PrjBnd, Prj, PrjErr, PreMRep, dwUniqueColors, dwNumPoints);
+#endif
+
+            // inverse the scaling
+            Pos[0] = Pos[0] * (Scl[1] - Scl[0]) + Scl[0];
+            Pos[1] = Pos[1] * (Scl[1] - Scl[0]) + Scl[0];
+
+            // did we find something better than the previous run?
+            if (StepErr + 0.001 < ErrG)
+            {
+                // yes, remember it
+                ErrG        = StepErr;
+                LineDirG[0] = LineDir[0];
+                LineDirG[1] = LineDir[1];
+                LineDirG[2] = LineDir[2];
+                PosG[0]     = Pos[0];
+                PosG[1]     = Pos[1];
+                //  3. Compute the vector of indexes (or clusters) for the current approximate ramp.
+                // indexes
+                const CGU_FLOAT step      = (Pos[1] - Pos[0]) / (CGU_FLOAT)(dwNumPoints - 1);
+                const CGU_FLOAT step_h    = step * (CGU_FLOAT)0.5;
+                const CGU_FLOAT rstep     = (CGU_FLOAT)1.0f / step;
+                const CGU_FLOAT overBlkTp = 1.f / (CGU_FLOAT)(dwNumPoints - 1);
+
+                // here the index vector is computed,
+                // shifted and normalized
+                CGU_FLOAT indxAvrg = (CGU_FLOAT)(dwNumPoints - 1) / 2.f;
+
+                for (i = 0; i < dwUniqueColors; i++)
+                {
+                    CGU_FLOAT del;
+                    //int n = (int)((b - _min_ex + (step*0.5f)) * rstep);
+                    if ((del = Prj0[i] - Pos[0]) <= 0)
+                        RmpIndxs[i] = 0.f;
+                    else if (Prj0[i] - Pos[1] >= 0)
+                        RmpIndxs[i] = (CGU_FLOAT)(dwNumPoints - 1);
+                    else
+                        RmpIndxs[i] = cmp_floor((del + step_h) * rstep);
+                    // shift and normalization
+                    RmpIndxs[i] = (RmpIndxs[i] - indxAvrg) * overBlkTp;
+                }
+
+                //  4. Present our color channels as 3 16DIM vectors.
+                //  5. Find closest approximation of each of 16DIM color vector with the projection of the 16DIM index vector.
+                CGU_FLOAT Crs[3], Len, Len2;
+                for (i = 0, Crs[0] = Crs[1] = Crs[2] = Len = 0.f; i < dwUniqueColors; i++)
+                {
+                    const CGU_FLOAT PreMlt = RmpIndxs[i] * Rpt[i];
+                    Len += RmpIndxs[i] * PreMlt;
+                    for (j = 0; j < 3; j++)
+                        Crs[j] += BlkSh[i][j] * PreMlt;
+                }
+
+                LineDir[0] = LineDir[1] = LineDir[2] = 0.f;
+                if (Len > 0.f)
+                {
+                    LineDir[0] = Crs[0] / Len;
+                    LineDir[1] = Crs[1] / Len;
+                    LineDir[2] = Crs[2] / Len;
+
+                    //  6. Plug the projections as a new directional vector for the axis.
+                    //  7. Goto 1.
+                    Len2 = LineDir[0] * LineDir[0] + LineDir[1] * LineDir[1] + LineDir[2] * LineDir[2];
+                    Len2 = sqrt(Len2);
+
+                    LineDir[0] /= Len2;
+                    LineDir[1] /= Len2;
+                    LineDir[2] /= Len2;
+                }
+            }
+            else  // We were not able to find anything better.  Drop dead.
+                break;
+        }
+
+        // inverse transform to find end-points of 3-color ramp
+        for (k = 0; k < 2; k++)
+            for (j = 0; j < 3; j++)
+                rsltC[j][k] = (PosG[k] * LineDirG[j] + Mdl[j]) * 255.f;
+    }
+
+    // We've dealt with (almost) unrestricted full precision realm.
+    // Now back to the dirty digital world.
+
+    // round the end points to make them look like compressed ones
+    CGU_FLOAT inpRmpEndPts[NUM_CHANNELS][NUM_ENDPOINTS];
+    cpu_MkRmpOnGrid(inpRmpEndPts, rsltC, 0.f, 255.f, nRedBits, nGreenBits, nBlueBits);
+
+    // Try using this on 3 channels
+    // static CGU_Vec2i cmp_getLinearEndPoints(CGU_FLOAT _Blk[BLOCK_SIZE_4X4], CMP_IN CGU_FLOAT fquality, CMP_IN CGU_BOOL isSigned);
+
+    // This not-so-small procedure squeezes and stretches the ramp along each axis (R,G,B) separately while the other 2 are fixed.
+    // It does it only over coarse grid - 565 that is. It tries to squeeze more precision for the real world ramp.
+#if defined(USE_REFINE) || defined(USE_REFINE3D)
+    switch (nRefinementSteps)
+    {
+    case 1:
+        cmp_Refine(_RsltRmpPnts, inpRmpEndPts, src_image, Rpt, dwUniqueColors, dwNumPoints, pfWeights, nRedBits, nGreenBits, nBlueBits, 3);
+        break;
+    case 2:
+        if (dwUniqueColors > 2)
+            cmp_Refine3D(_RsltRmpPnts, inpRmpEndPts, src_image, Rpt, dwUniqueColors, dwNumPoints, pfWeights, nRedBits, nGreenBits, nBlueBits, 1);
+        else
+            cmp_Refine(_RsltRmpPnts, inpRmpEndPts, src_image, Rpt, dwUniqueColors, dwNumPoints, pfWeights, nRedBits, nGreenBits, nBlueBits, 3);
+        break;
+    default:
+        cmp_Refine(_RsltRmpPnts, inpRmpEndPts, src_image, Rpt, dwUniqueColors, dwNumPoints, pfWeights, nRedBits, nGreenBits, nBlueBits, 1);
+        break;
+    }
+#endif
+    return true;
+}
+
+// CPU: CompRGBBlock()
+// Gathers the pixels of block_32 that pass the alpha test, collapses them into unique colours with repeat
+// counts, runs cpu_CompressRGBBlockX() and, on success, fills nEndpoints and returns the cpu_Clstr() error.
+// Returns CMP_FLT_MAX when the block cannot be coded here and 0 when no pixel passes the alpha test.
+CMP_STATIC CGU_FLOAT cpu_CompRGBBlock32(CGU_UINT32 block_32[16],
+                                        CGU_UINT32 compressedBlock[2],
+                                        CGU_UINT32 dwBlockSize,
+                                        CGU_UINT8  nRedBits,
+                                        CGU_UINT8  nGreenBits,
+                                        CGU_UINT8  nBlueBits,
+                                        CGU_UINT8  nEndpoints[3][NUM_ENDPOINTS],
+                                        CGU_UINT8  pcIndices[BLOCK_SIZE_4X4],
+                                        CGU_UINT8  dwNumPoints,
+                                        bool       b3DRefinement,
+                                        CGU_UINT8  m_nRefinementSteps,
+                                        CGU_FLOAT  _pfChannelWeights[3],
+                                        bool       _bUseAlpha,
+                                        CGU_UINT8  _nAlphaThreshold)
+{
+    CGU_FLOAT ALIGN_16 Rpt[BLOCK_SIZE_4X4];
+    CGU_FLOAT ALIGN_16 BlkIn[BLOCK_SIZE_4X4][NUM_CHANNELS];
+    CGU_UINT32         mx;
+    for (mx = 0; mx < BLOCK_SIZE_4X4; mx++)
+    {
+        Rpt[mx]      = 0;
+        BlkIn[mx][0] = 0;
+        BlkIn[mx][1] = 0;
+        BlkIn[mx][2] = 0;
+        BlkIn[mx][3] = 0;
+    }
+
+    compressedBlock[0] = 0;
+
+    CGU_UINT32 dwAlphaThreshold = _nAlphaThreshold << 24;
+    CGU_UINT32 dwColors         = 0;
+    CGU_UINT32 dwBlk[BLOCK_SIZE];
+    for (CGU_UINT32 i = 0; i < dwBlockSize; i++)
+        if (!_bUseAlpha || (block_32[i] & 0xff000000) >= dwAlphaThreshold)
+            dwBlk[dwColors++] = block_32[i] | 0xff000000;  // force alpha to 0xff so equal RGB values compare equal
+
+    // Do we have any colors ?
+    if (dwColors)
+    {
+        bool bHasAlpha = (dwColors != dwBlockSize);
+        if (bHasAlpha && _bUseAlpha && !(dwNumPoints & 0x1))
+            return CMP_FLT_MAX;
+
+        // Here we are computing an unique number of colors.
+        // For each unique value we compute the number of it appearences.
+        // Sort exactly the dwColors entries written above; the rest of dwBlk is uninitialised.
+#ifndef ASPM_GPU  // this is here for reminder when code moves to GPU
+        std::sort(dwBlk, dwBlk + dwColors);
+#else
+        {
+            CGU_UINT32 i, j;
+            CMP_di     what[BLOCK_SIZE_4X4];
+
+            for (i = 0; i < dwColors; i++)
+            {
+                what[i].index = i;
+                what[i].data  = dwBlk[i];
+            }
+
+            CGU_UINT32 tmp_index;
+            CGU_UINT32 tmp_data;
+
+            for (i = 1; i < dwColors; i++)
+            {
+                for (j = i; j > 0; j--)
+                {
+                    if (what[j - 1].data > what[j].data)
+                    {
+                        tmp_index         = what[j].index;
+                        tmp_data          = what[j].data;
+                        what[j].index     = what[j - 1].index;
+                        what[j].data      = what[j - 1].data;
+                        what[j - 1].index = tmp_index;
+                        what[j - 1].data  = tmp_data;
+                    }
+                }
+            }
+            for (i = 0; i < dwColors; i++)
+                dwBlk[i] = what[i].data;
+        }
+#endif
+
+        CGU_UINT32 new_p;
+        CGU_UINT32 dwBlkU[BLOCK_SIZE_4X4];
+        CGU_UINT32 dwUniqueColors = 0;
+        new_p = dwBlkU[0]   = dwBlk[0];
+        Rpt[dwUniqueColors] = 1.f;
+        CGU_UINT32 i;
+        for (i = 1; i < dwColors; i++)
+        {
+            if (new_p != dwBlk[i])
+            {
+                dwUniqueColors++;
+                new_p = dwBlkU[dwUniqueColors] = dwBlk[i];
+                Rpt[dwUniqueColors]            = 1.f;
+            }
+            else
+                Rpt[dwUniqueColors] += 1.f;
+        }
+        dwUniqueColors++;
+
+        // switch to float
+        for (i = 0; i < dwUniqueColors; i++)
+        {
+            BlkIn[i][RC] = (CGU_FLOAT)((dwBlkU[i] >> 16) & 0xff);  // R
+            BlkIn[i][GC] = (CGU_FLOAT)((dwBlkU[i] >> 8) & 0xff);   // G
+            BlkIn[i][BC] = (CGU_FLOAT)((dwBlkU[i] >> 0) & 0xff);   // B
+            BlkIn[i][AC] = 255.0f;
+        }
+
+        CGU_FLOAT rsltC[NUM_CHANNELS][NUM_ENDPOINTS];
+        if (cpu_CompressRGBBlockX(rsltC,               //  CMP_EndPoints = CompressRGBBlock_Slow2 (
+                                  BlkIn,               //  CGU_Vec3f  src_imageNorm[BLOCK_SIZE_4X4]
+                                  Rpt,                 //  CGU_FLOAT  Rpt[BLOCK_SIZE_4X4],
+                                  dwUniqueColors,      //  CGU_UINT32 dwUniqueColors,
+                                  dwNumPoints,         //  CGU_UINT32 dwNumPoints,
+                                  b3DRefinement,       //
+                                  m_nRefinementSteps,  //  CGU_UINT32 m_nRefinementSteps,
+                                  _pfChannelWeights,   //  CGU_Vec3f  channelWeightsBGR,
+                                  nRedBits,            //  );
+                                  nGreenBits,
+                                  nBlueBits,
+                                  1.0f))
+        {
+            // return to integer realm
+            for (int ch = 0; ch < 3; ch++)
+                for (int j = 0; j < 2; j++)
+                    nEndpoints[ch][j] = (CGU_UINT8)rsltC[ch][j];
+
+            // Now get the indices using the new end points
+            return cpu_Clstr(
+                block_32, dwBlockSize, nEndpoints, pcIndices, dwNumPoints, _pfChannelWeights, _bUseAlpha, _nAlphaThreshold, nRedBits, nGreenBits, nBlueBits);
+        }
+        else
+        {
+            CGU_FLOAT CompErr = CMP_FLT_MAX;
+            if (dwNumPoints < 4)
+            {
+                CGU_Vec3f src_imageNorm[BLOCK_SIZE_4X4];
+
+                for (CGU_UINT32 px = 0; px < 16; px++)
+                {
+                    src_imageNorm[px].r = (CGU_FLOAT)((block_32[px] >> 16) & 0xff) / 255.0f;
+                    src_imageNorm[px].g = (CGU_FLOAT)((block_32[px] >> 8) & 0xff) / 255.0f;
+                    src_imageNorm[px].b = (CGU_FLOAT)((block_32[px] >> 0) & 0xff) / 255.0f;
+                }
+
+                // Do a quick compression test
+                CGU_Vec3f srcRGB[16];   // The list of source colors with blue channel altered
+                CGU_Vec3f average_rgb;  // The centrepoint of the axis
+                CGU_FLOAT errLQ = CMP_FLT_MAX;
+                cgu_CompressRGBBlock_MinMax(src_imageNorm, 1.0f, false, srcRGB, average_rgb, errLQ);
+                CGU_Vec2ui cmp = cgu_CompressRGBBlock_Fast(src_imageNorm, 1.0f, false, srcRGB, average_rgb, CompErr);
+
+                compressedBlock[0] = cmp.x;
+                compressedBlock[1] = cmp.y;
+            }
+            return CompErr;
+        }
+    }
+    else
+    {
+        // All colors transparent
+        nEndpoints[0][0] = nEndpoints[1][0] = nEndpoints[2][0] = 0;
+        nEndpoints[0][1] = nEndpoints[1][1] = nEndpoints[2][1] = 0xff;
+        for (CGU_UINT32 ms = 0; ms < dwBlockSize; ms++)
+            pcIndices[ms] = 0xff;
+        return 0.0;
+    }
+}
+
+// High-quality CPU BC1 colour encoder.
+// Runs cpu_CompRGBBlock32 with a 3-point ramp; unless that call already produced a finished block or
+// reported zero error, runs it again with a 4-point ramp and packs whichever result has the lower error.
+//   bgraBlock   : 16 source pixels, reinterpreted as packed 32-bit words for cpu_CompRGBBlock32
+//   BC15Options : supplies m_b3DRefinement, m_nRefinementSteps, m_bUseAlpha and m_nAlphaThreshold
+//   err         : receives the error value reported by cpu_CompRGBBlock32 for the result that is used
+// Returns the packed block: x holds the two endpoint colours, y the sixteen 2-bit indices.
+CMP_STATIC CGU_Vec2ui cpu_CompRGBBlock(CMP_IN CGU_Vec4uc bgraBlock[BLOCK_SIZE_4X4], CMP_IN CMP_BC15Options BC15Options, CMP_INOUT CGU_FLOAT CMP_REFINOUT err)
+{
+    CGU_Vec2ui cmpBlock            = {0U, 0U};
+    CGU_FLOAT  pfChannelWeights[3] = {1.0f, 1.0f, 1.0f};
+    CGU_UINT8  nEndpoints[2][3][2];
+    CGU_UINT8  nIndices[2][BLOCK_SIZE_4X4];
+    CGU_UINT32 compressedBlock[2] = {0, 0};
+
+    CGU_FLOAT fError3 = cpu_CompRGBBlock32((CGU_UINT32*)bgraBlock,
+                                           compressedBlock,
+                                           BLOCK_SIZE_4X4,
+                                           RG,
+                                           GG,
+                                           BG,
+                                           nEndpoints[0],
+                                           nIndices[0],
+                                           3,
+                                           BC15Options.m_b3DRefinement,
+                                           BC15Options.m_nRefinementSteps,
+                                           pfChannelWeights,
+                                           BC15Options.m_bUseAlpha,
+                                           BC15Options.m_nAlphaThreshold);
+    // Small min/max range case: the 3-point call already wrote a complete block, so use it unchanged.
+    if (compressedBlock[0] > 0)
+    {
+        cmpBlock.x = compressedBlock[0];
+        cmpBlock.y = compressedBlock[1];
+        err        = fError3;
+    }
+    else
+    {
+        // A 3-point error of exactly zero cannot be beaten, so the 4-point search is skipped.
+        CGU_FLOAT fError4 = (fError3 == 0.0) ? CMP_FLT_MAX
+                                             : cpu_CompRGBBlock32((CGU_UINT32*)bgraBlock,
+                                                                  compressedBlock,
+                                                                  BLOCK_SIZE_4X4,
+                                                                  RG,
+                                                                  GG,
+                                                                  BG,
+                                                                  nEndpoints[1],
+                                                                  nIndices[1],
+                                                                  4,
+                                                                  BC15Options.m_b3DRefinement,
+                                                                  BC15Options.m_nRefinementSteps,
+                                                                  pfChannelWeights,
+                                                                  BC15Options.m_bUseAlpha,
+                                                                  BC15Options.m_nAlphaThreshold);
+
+        // nMethod selects the nEndpoints/nIndices slot: 0 = 3-point result, 1 = 4-point result.
+        CGU_UINT32 nMethod = (fError3 <= fError4) ? 0 : 1;
+        err                = (nMethod == 0) ? fError3 : fError4;
+
+        CGU_UINT32 c0 =
+            BC1ConstructColour((nEndpoints[nMethod][RC][0] >> (8 - RG)), (nEndpoints[nMethod][GC][0] >> (8 - GG)), (nEndpoints[nMethod][BC][0] >> (8 - BG)));
+        CGU_UINT32 c1 =
+            BC1ConstructColour((nEndpoints[nMethod][RC][1] >> (8 - RG)), (nEndpoints[nMethod][GC][1] >> (8 - GG)), (nEndpoints[nMethod][BC][1] >> (8 - BG)));
+        // BC1 decodes colour0 > colour1 as the 4-colour mode and colour0 <= colour1 as the 3-colour mode,
+        // so the two endpoint words are swapped whenever their order does not match the chosen method.
+        if ((nMethod == 1 && c0 <= c1) || (nMethod == 0 && c0 > c1))
+            compressedBlock[0] = c1 | (c0 << 16);
+        else
+            compressedBlock[0] = c0 | (c1 << 16);
+
+        // Widen before shifting: a CGU_UINT8 promotes to int, and shifting it by up to 30 bits can
+        // overflow a signed int (undefined behaviour). The unsigned shift simply drops the high bits.
+        compressedBlock[1] = 0;
+        for (CGU_UINT32 i = 0; i < 16; i++)
+            compressedBlock[1] |= ((CGU_UINT32)nIndices[nMethod][i] << (2 * i));
+
+        cmpBlock.x = compressedBlock[0];
+        cmpBlock.y = compressedBlock[1];
+    }
+
+    return cmpBlock;
+}
+
+#endif
+
+#ifdef ENABLE_NEW_CODE
+
+//---------------------------------------- Common Utility Code -------------------------------------------------------
+// 1 - Dim error
+// Sums the quantisation error of the projected colours Prj[0 .. dwUniqueColors) against a ramp of
+// dwNumPoints evenly spaced positions spanning [lowPosStep, highPosStep]. Colour i contributes
+// PreMRep[i] * (Prj[i] - nearest ramp position)^2 + PrjErr[i].
+// StepErr acts as an upper bound: as soon as the running sum exceeds it, StepErr itself is returned.
+// NOTE(review): dwNumPoints == 1 yields a floating-point division by zero for the step - confirm callers pass >= 2.
+CMP_STATIC CGU_FLOAT cgu_RampSrchW(CGU_FLOAT  Prj[BLOCK_SIZE_4X4],
+                                   CGU_FLOAT  PrjErr[BLOCK_SIZE_4X4],
+                                   CGU_FLOAT  PreMRep[BLOCK_SIZE_4X4],
+                                   CGU_FLOAT  StepErr,
+                                   CGU_FLOAT  lowPosStep,
+                                   CGU_FLOAT  highPosStep,
+                                   CGU_UINT32 dwUniqueColors,
+                                   CGU_UINT32 dwNumPoints)
+{
+    CGU_FLOAT rampStep = (highPosStep - lowPosStep) / (dwNumPoints - 1);
+    CGU_FLOAT halfStep = rampStep * (CGU_FLOAT)0.5;
+    CGU_FLOAT invStep  = (CGU_FLOAT)1.0f / rampStep;
+    CGU_FLOAT total    = 0;
+
+    for (CGU_UINT32 c = 0; c < dwUniqueColors; c++)
+    {
+        CGU_FLOAT proj    = Prj[c];
+        CGU_FLOAT fromLow = proj - lowPosStep;
+        CGU_FLOAT snapped = (fromLow <= 0)              ? lowPosStep
+                            : (proj - highPosStep >= 0) ? highPosStep
+                                                        : cmp_floor((fromLow + halfStep) * invStep) * rampStep + lowPosStep;
+
+        // Weighted squared snap distance plus the colour's fixed projection error
+        CGU_FLOAT dist = (proj - snapped);
+        dist *= dist;
+        CGU_FLOAT contribution = PreMRep[c] * dist + PrjErr[c];
+        total += contribution;
+        if (StepErr < total)
+        {
+            total = StepErr;
+            break;
+        }
+    }
+    return total;
+}
+
+// Maps each of the 16 block pixels to the closest entry of a 4-entry colour ramp derived from EndPoints.
+// rgbBlock_normal is scaled by 255 and swizzled RGB -> BGR before comparison; channelWeights is swizzled the
+// same way. On return indices[] holds the per-pixel index, Err the accumulated distance of the chosen ramp
+// entries, and the return value carries the same indices packed with cmp_set2Bit32.
+// NOTE(review): srcblockA is always 0, so any dwAlphaThreshold > 0 routes every pixel into the alpha branch.
+CMP_STATIC CGU_UINT32 cgu_processCluster(CMP_IN CMP_EndPoints           EndPoints,
+                                         CMP_IN CGU_Vec4f               rgbBlock_normal[BLOCK_SIZE_4X4],
+                                         CMP_IN CGU_UINT32              dwAlphaThreshold,
+                                         CMP_IN CGU_Vec3f               channelWeights,
+                                         CMP_IN CGU_UINT8               indices[BLOCK_SIZE_4X4],
+                                         CMP_OUT CGU_FLOAT CMP_REFINOUT Err)
+{
+    Err                  = 0.f;
+    CGU_UINT32 pcIndices = 0;
+    CGU_UINT32 R, G, B;
+
+    R                  = (CGU_UINT32)(EndPoints.Color0.z);
+    G                  = (CGU_UINT32)(EndPoints.Color0.y);
+    B                  = (CGU_UINT32)(EndPoints.Color0.x);
+    CGU_INT32 cluster0 = cmp_constructColor(R, G, B);
+
+    R                  = (CGU_UINT32)(EndPoints.Color1.z);
+    G                  = (CGU_UINT32)(EndPoints.Color1.y);
+    B                  = (CGU_UINT32)(EndPoints.Color1.x);
+    CGU_INT32 cluster1 = cmp_constructColor(R, G, B);
+
+    CGU_Vec3f InpRmp[NUM_ENDPOINTS];
+    if ((cluster0 <= cluster1)  // valid for 4 channels
+                                // || (cluster0 > cluster1)    // valid for 3 channels
+    )
+    {
+        // inverse endpoints
+        InpRmp[0] = EndPoints.Color1;
+        InpRmp[1] = EndPoints.Color0;
+    }
+    else
+    {
+        InpRmp[0] = EndPoints.Color0;
+        InpRmp[1] = EndPoints.Color1;
+    }
+
+    CGU_Vec3f srcblockLinear[BLOCK_SIZE_4X4];
+    CGU_FLOAT srcblockA[BLOCK_SIZE_4X4];
+
+    // Swizzle the source RGB to BGR for processing
+    for (CGU_UINT32 i = 0; i < BLOCK_SIZE_4X4; i++)
+    {
+        srcblockLinear[i].z = rgbBlock_normal[i].x * 255.0f;
+        srcblockLinear[i].y = rgbBlock_normal[i].y * 255.0f;
+        srcblockLinear[i].x = rgbBlock_normal[i].z * 255.0f;
+        // Source alpha is not read here; every pixel is treated as alpha 0
+        srcblockA[i]        = 0.0f;
+    }
+
+    // cmp_ClstrBas2()
+    // input ramp is on the coarse grid
+    // make ramp endpoints the way they are going to be decompressed
+    CGU_Vec3f InpRmpL[NUM_ENDPOINTS];
+    CGU_Vec3f Fctrs = {32.0F, 64.0F, 32.0F};  // 1 << RG,1 << GG,1 << BG
+
+    {
+        //   ConstantRamp = MkWkRmpPts(InpRmpL, InpRmp);
+        InpRmpL[0] = InpRmp[0] + cmp_floorVec3f(InpRmp[0] / Fctrs);
+        InpRmpL[0] = cmp_clampVec3f(InpRmpL[0], 0.0f, 255.0f);
+        InpRmpL[1] = InpRmp[1] + cmp_floorVec3f(InpRmp[1] / Fctrs);
+        InpRmpL[1] = cmp_clampVec3f(InpRmpL[1], 0.0f, 255.0f);
+    }  // MkWkRmpPts
+
+    // build ramp
+    CGU_Vec3f LerpRmp[4];
+    CGU_Vec3f offset = {1.0f, 1.0f, 1.0f};
+    {
+        //BldRmp(Rmp, InpRmpL, dwNumChannels);
+        // linear interpolate end points to get the ramp
+        LerpRmp[0] = InpRmpL[0];
+        LerpRmp[3] = InpRmpL[1];
+        LerpRmp[1] = cmp_floorVec3f((InpRmpL[0] * 2.0f + LerpRmp[3] + offset) / 3.0f);
+        LerpRmp[2] = cmp_floorVec3f((InpRmpL[0] + LerpRmp[3] * 2.0f + offset) / 3.0f);
+    }  // BldRmp
+
+    //=========================================================================
+    // Clusterize, Compute error and find DXTC indexes for the current cluster
+    //=========================================================================
+    {
+        // Clusterize
+        CGU_UINT32 alpha;
+
+        // For each colour in the original block assign it
+        // to the closest cluster and compute the cumulative error
+        for (CGU_UINT32 i = 0; i < BLOCK_SIZE_4X4; i++)
+        {
+            alpha = (CGU_UINT32)srcblockA[i];
+            if ((dwAlphaThreshold > 0) && alpha == 0)
+            {                                      //*((CGU_UINT32 *)&_Blk[i][AC]) == 0)
+                pcIndices |= cmp_set2Bit32(4, i);  // dwNumChannels 3 or 4 (default is 4)
+                indices[i] = 4;
+            }
+            else
+            {
+                CGU_FLOAT shortest      = 99999999999.f;
+                CGU_UINT8 shortestIndex = 0;
+
+                CGU_Vec3f channelWeightsBGR;
+                channelWeightsBGR.x = channelWeights.z;
+                channelWeightsBGR.y = channelWeights.y;
+                channelWeightsBGR.z = channelWeights.x;
+
+                for (CGU_UINT8 rampindex = 0; rampindex < 4; rampindex++)
+                {
+                    // distance = cmp_dotVec3f of the channel-weighted difference with itself;
+                    // the first ramp entry with the smallest distance wins (strict '<' below)
+                    CGU_FLOAT distance = cmp_dotVec3f(((srcblockLinear[i] - LerpRmp[rampindex]) * channelWeightsBGR),
+                                                      ((srcblockLinear[i] - LerpRmp[rampindex]) * channelWeightsBGR));
+                    if (distance < shortest)
+                    {
+                        shortest      = distance;
+                        shortestIndex = rampindex;
+                    }
+                }
+
+                Err += shortest;
+
+                // The total is a sum of (error += shortest)
+                // We have the index of the best cluster, so assign this in the block
+                // Reorder indices to match correct DXTC ordering
+                if (shortestIndex == 3)  // dwNumChannels - 1
+                    shortestIndex = 1;
+                else if (shortestIndex)
+                    shortestIndex++;
+                pcIndices |= cmp_set2Bit32(shortestIndex, i);
+                indices[i] = shortestIndex;
+            }
+        }  // BLOCK_SIZE_4X4
+    }      // Clusterize
+
+    return pcIndices;
+}
+#endif
+
+// Compress one 4x4 block of normalized (0.0f ... 1.0f) RGBA pixels to BC1; signed normal is not implemented.
+// Candidate encoders run in sequence and the lowest-error block seen so far is kept:
+//   solid colour, MinMax, Fast (these three only when m_bUseAlpha is false), cgu_CompRGBBlock, cpu_CompRGBBlock.
+// m_fquality thresholds (CMP_QUALITY0/1/2) end the search early; when ASPM_GPU is not defined, m_fquality > 0.75
+// skips the MinMax/Fast/cgu_CompRGBBlock stages and cpu_CompRGBBlock runs last. With m_bUseAlpha and
+// m_fquality < 0.75, any pixel with alpha below m_nAlphaThreshold returns the fixed block {0xffff0000, 0xffffffff}.
+CMP_STATIC CGU_Vec2ui CompressBlockBC1_NORMALIZED(CMP_IN CGU_Vec4f src_imageNorm[BLOCK_SIZE_4X4], CMP_IN CMP_BC15Options BC15Options)
+{
+    bool usingMaxQualityOnly = false;
+
+#ifndef ASPM_GPU
+    if (BC15Options.m_fquality > 0.75)
+        usingMaxQualityOnly = true;
+#endif
+
+    CGU_FLOAT  CompErr      = CMP_FLT_MAX;
+    CGU_Vec2ui cmpBlock     = {0U, 0U};
+    CGU_Vec2ui cmpBlockTemp = {0U, 0U};
+    CGU_FLOAT  CompErrTemp;
+
+    // Transfer to RGB Norm from RGBA Norm
+    CGU_Vec3f  src_imageRGBNorm[16];
+    CGU_Vec4uc pixels[16];
+    CGU_Vec4uc pixelsBGRA[16];
+
+    // NOTE(review): implicit float -> CGU_Vec4uc component conversion; no explicit rounding or clamping here
+    for (CGU_UINT32 sr = 0; sr < 16; sr++)
+    {
+        src_imageRGBNorm[sr] = src_imageNorm[sr].rgb;
+        pixelsBGRA[sr].b = pixels[sr].r = src_imageNorm[sr].r * 255.0f;
+        pixelsBGRA[sr].g = pixels[sr].g = src_imageNorm[sr].g * 255.0f;
+        pixelsBGRA[sr].r = pixels[sr].b = src_imageNorm[sr].b * 255.0f;
+        pixelsBGRA[sr].a = pixels[sr].a = src_imageNorm[sr].a * 255.0f;
+    }
+
+    // check for a punch through transparent alpha setting
+    if ((BC15Options.m_fquality < 0.75) && (BC15Options.m_bUseAlpha))
+    {
+        CGU_Vec2ui cmpBlockAlpha = {0xffff0000, 0xffffffffU};
+        for (CGU_UINT32 sr = 0; sr < 16; sr++)
+            if (pixels[sr].a < BC15Options.m_nAlphaThreshold)
+            {
+                return cmpBlockAlpha;
+            }
+    }
+
+    //================
+    // Candidate encoders
+    //================
+
+    if (!BC15Options.m_bUseAlpha)
+    {
+        //==========================================
+        // Gain +0.3 dB for images with solid blocks
+        //==========================================
+        bool bAllColoursEqual = true;
+
+        // Load the whole 4x4 block
+        for (CGU_UINT32 i = 0u; (i < 16u) && bAllColoursEqual; ++i)
+        {
+            for (CGU_INT c = 0; c < 3; c++)
+                bAllColoursEqual = bAllColoursEqual && (pixels[0][c] == pixels[i][c]);
+        }
+
+        if (bAllColoursEqual)
+        {
+            cmpBlock = cgu_solidColorBlock(pixels[0].x, pixels[0].y, pixels[0].z);
+            CompErr  = cgu_RGBABlockErrorLinear(pixels, cmpBlock);
+            if (BC15Options.m_nRefinementSteps < 1)
+                return cmpBlock;
+        }
+    }
+
+    if (!usingMaxQualityOnly)
+    {
+        //====================================
+        // Fast and GPU-capable encoders
+        //=====================================
+
+        if (!BC15Options.m_bUseAlpha)
+        {
+            //====================================
+            // Fast Compression, low quality
+            //=====================================
+            CGU_Vec3f srcRGB[16];   // The list of source colors with blue channel altered
+            CGU_Vec3f average_rgb;  // The centrepoint of the axis
+            CGU_FLOAT errLQ = CMP_FLT_MAX;
+            cmpBlockTemp    = cgu_CompressRGBBlock_MinMax(src_imageRGBNorm, BC15Options.m_fquality, BC15Options.m_bIsSRGB, srcRGB, average_rgb, errLQ);
+            if ((BC15Options.m_fquality < CMP_QUALITY0) || (errLQ == 0.0f))
+                return cmpBlockTemp;
+
+            if (CompErr > errLQ)
+            {
+                CompErr  = errLQ;
+                cmpBlock = cmpBlockTemp;
+            }
+
+            cmpBlockTemp = cgu_CompressRGBBlock_Fast(src_imageRGBNorm, BC15Options.m_fquality, BC15Options.m_bIsSRGB, srcRGB, average_rgb, errLQ);
+            if (CompErr > errLQ)
+            {
+                CompErr  = errLQ;
+                cmpBlock = cmpBlockTemp;
+            }
+            if (BC15Options.m_fquality < CMP_QUALITY1)
+                return cmpBlock;
+        }
+
+        //========================================
+        // use GPU codec, lower quality than CPU
+        //========================================
+        cmpBlockTemp = cgu_CompRGBBlock(src_imageNorm, BC15Options);
+        CompErrTemp  = cgu_RGBABlockErrorLinear(pixels, cmpBlockTemp);
+        if (CompErr > CompErrTemp)
+        {
+            CompErr  = CompErrTemp;
+            cmpBlock = cmpBlockTemp;
+        }
+
+        if (BC15Options.m_fquality < CMP_QUALITY2)
+            return cmpBlock;
+    }  // if (!usingMaxQualityOnly)
+
+    //====================================
+    // High Quality Codec CPU only
+    //=====================================
+#ifndef ASPM_GPU
+    cmpBlockTemp = cpu_CompRGBBlock(pixelsBGRA, BC15Options, CompErrTemp);
+
+    // The error reported through the out-parameter is replaced by the common linear RGBA metric
+    CompErrTemp = cgu_RGBABlockErrorLinear(pixels, cmpBlockTemp);
+
+    if (CompErr > CompErrTemp)
+    {
+        CompErr  = CompErrTemp;
+        cmpBlock = cmpBlockTemp;
+    }
+#endif
+
+    return cmpBlock;
+}