From 2557c842f87756f31dd1f4270bf83c425f9073be Mon Sep 17 00:00:00 2001 From: Kamil Kisiela Date: Wed, 19 Feb 2025 16:20:20 +0100 Subject: [PATCH 1/6] Adds bloom filter index to coordinates agg views Improves the performance of the queries that filter the rows by the type's name. For example, when looking for Member.* coordinates we elimiate the need to scan the whole table, by laveraging the idx_typename index. We filter rows by the first part of the coordinate field (substringIndex(coordinate, '.', 1)). --- .changeset/three-gifts-battle.md | 8 +++ .../012-coordinates-typename-index.ts | 52 +++++++++++++++++++ packages/migrations/src/clickhouse.ts | 1 + 3 files changed, 61 insertions(+) create mode 100644 .changeset/three-gifts-battle.md create mode 100644 packages/migrations/src/clickhouse-actions/012-coordinates-typename-index.ts diff --git a/.changeset/three-gifts-battle.md b/.changeset/three-gifts-battle.md new file mode 100644 index 0000000000..2aff5d5b82 --- /dev/null +++ b/.changeset/three-gifts-battle.md @@ -0,0 +1,8 @@ +--- +'hive': patch +--- + +Adds an index to coordinates\_(daily,hourly,minutely) tables to speedup the +get_top_operations_for_types ClickHoue query. + +Reading of type and fields usage statisticts should be noticeably faster now on big datasets. diff --git a/packages/migrations/src/clickhouse-actions/012-coordinates-typename-index.ts b/packages/migrations/src/clickhouse-actions/012-coordinates-typename-index.ts new file mode 100644 index 0000000000..095f0a6044 --- /dev/null +++ b/packages/migrations/src/clickhouse-actions/012-coordinates-typename-index.ts @@ -0,0 +1,52 @@ +import { z } from 'zod'; +import type { Action } from '../clickhouse'; + +const SystemTablesModel = z.array( + z.object({ + name: z.string(), + uuid: z.string(), + }), +); + +const DataSkippingIndicesModel = z.array( + z.object({ + name: z.string(), + }), +); + +// This migration adds an index for the `coordinate` field. +// Improve the performance of the queries that filter the rows by the type's name. +// +// For example, when looking for `Member.*` coordinates we elimiate the need to scan the whole table, +// by laveraging the idx_typename index. +// We filter rows by the first part of the `coordinate` field (substringIndex(coordinate, '.', 1)). +export const action: Action = async (exec, query, hiveCloudEnvironment) => { + const tables = await query(` + SELECT uuid, name FROM system.tables WHERE name IN ( + 'coordinates_daily', + 'coordinates_hourly', + 'coordinates_minutely' + ); + `).then(async r => SystemTablesModel.parse(r.data)); + + if (tables.length !== 3) { + throw new Error('Expected 3 tables'); + } + + for (const { uuid, name } of tables) { + console.log(`Creating idx_typename for table ${name}`); + await exec( + `ALTER TABLE ".inner_id.${uuid}" ADD INDEX idx_typename (substringIndex(coordinate, '.', 1)) TYPE ngrambf_v1(4, 1024, 2, 0) GRANULARITY 1`, + ); + const indexes = await query(` + SELECT name FROM system.data_skipping_indices WHERE table = ${'.inner_id.' + uuid} AND name = 'idx_typename' + `).then(async r => DataSkippingIndicesModel.parse(r.data)); + + if (indexes.some(i => i.name)) { + console.log(`Materializing the idx_typename for table ${name}`); + await exec(`ALTER TABLE ".inner_id.${uuid}" MATERIALIZE INDEX idx_typename`); + } else { + console.error(`Failed to find idx_typename for table ${name}`); + } + } +}; diff --git a/packages/migrations/src/clickhouse.ts b/packages/migrations/src/clickhouse.ts index f635c462ee..1fbedee312 100644 --- a/packages/migrations/src/clickhouse.ts +++ b/packages/migrations/src/clickhouse.ts @@ -169,6 +169,7 @@ export async function migrateClickHouse( import('./clickhouse-actions/009-ttl-1-year'), import('./clickhouse-actions/010-app-deployment-operations'), import('./clickhouse-actions/011-audit-logs'), + import('./clickhouse-actions/012-coordinates-typename-index'), ]); async function actionRunner(action: Action, index: number) { From b7d96b2f1f4c8b7eaf22c11f3a95196172939099 Mon Sep 17 00:00:00 2001 From: Kamil Kisiela Date: Wed, 19 Feb 2025 16:24:16 +0100 Subject: [PATCH 2/6] ok --- .../src/clickhouse-actions/012-coordinates-typename-index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/migrations/src/clickhouse-actions/012-coordinates-typename-index.ts b/packages/migrations/src/clickhouse-actions/012-coordinates-typename-index.ts index 095f0a6044..824ea55d56 100644 --- a/packages/migrations/src/clickhouse-actions/012-coordinates-typename-index.ts +++ b/packages/migrations/src/clickhouse-actions/012-coordinates-typename-index.ts @@ -20,7 +20,7 @@ const DataSkippingIndicesModel = z.array( // For example, when looking for `Member.*` coordinates we elimiate the need to scan the whole table, // by laveraging the idx_typename index. // We filter rows by the first part of the `coordinate` field (substringIndex(coordinate, '.', 1)). -export const action: Action = async (exec, query, hiveCloudEnvironment) => { +export const action: Action = async (exec, query) => { const tables = await query(` SELECT uuid, name FROM system.tables WHERE name IN ( 'coordinates_daily', From b038514768ccd9e1396f22723d3181a3f59390a3 Mon Sep 17 00:00:00 2001 From: Kamil Kisiela Date: Wed, 19 Feb 2025 17:11:43 +0100 Subject: [PATCH 3/6] asd --- .../src/clickhouse-actions/012-coordinates-typename-index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/migrations/src/clickhouse-actions/012-coordinates-typename-index.ts b/packages/migrations/src/clickhouse-actions/012-coordinates-typename-index.ts index 824ea55d56..051fed5b89 100644 --- a/packages/migrations/src/clickhouse-actions/012-coordinates-typename-index.ts +++ b/packages/migrations/src/clickhouse-actions/012-coordinates-typename-index.ts @@ -39,7 +39,7 @@ export const action: Action = async (exec, query) => { `ALTER TABLE ".inner_id.${uuid}" ADD INDEX idx_typename (substringIndex(coordinate, '.', 1)) TYPE ngrambf_v1(4, 1024, 2, 0) GRANULARITY 1`, ); const indexes = await query(` - SELECT name FROM system.data_skipping_indices WHERE table = ${'.inner_id.' + uuid} AND name = 'idx_typename' + SELECT name FROM system.data_skipping_indices WHERE table = '${'.inner_id.' + uuid}' AND name = 'idx_typename' `).then(async r => DataSkippingIndicesModel.parse(r.data)); if (indexes.some(i => i.name)) { From 1768dfa94f777240d8de6795f0d02d0ee1c3e807 Mon Sep 17 00:00:00 2001 From: Kamil Kisiela Date: Wed, 19 Feb 2025 21:39:10 +0100 Subject: [PATCH 4/6] Best I can do and it's an overkill for 3 statements We could run them manually if something goes south --- .../012-coordinates-typename-index.ts | 66 +++++++++++++++---- 1 file changed, 53 insertions(+), 13 deletions(-) diff --git a/packages/migrations/src/clickhouse-actions/012-coordinates-typename-index.ts b/packages/migrations/src/clickhouse-actions/012-coordinates-typename-index.ts index 051fed5b89..21efac951d 100644 --- a/packages/migrations/src/clickhouse-actions/012-coordinates-typename-index.ts +++ b/packages/migrations/src/clickhouse-actions/012-coordinates-typename-index.ts @@ -8,9 +8,11 @@ const SystemTablesModel = z.array( }), ); -const DataSkippingIndicesModel = z.array( +const StateTableModel = z.array( z.object({ - name: z.string(), + table: z.string(), + idx_created: z.boolean(), + idx_materialized: z.boolean(), }), ); @@ -21,6 +23,15 @@ const DataSkippingIndicesModel = z.array( // by laveraging the idx_typename index. // We filter rows by the first part of the `coordinate` field (substringIndex(coordinate, '.', 1)). export const action: Action = async (exec, query) => { + // Create a table to store the state of the migration + await exec(` + CREATE TABLE IF NOT EXISTS default.migration_coordinates_typename_index ( + table String, + idx_created Bool DEFAULT false, + idx_materialized Bool DEFAULT false + ) ENGINE = MergeTree() + `); + const tables = await query(` SELECT uuid, name FROM system.tables WHERE name IN ( 'coordinates_daily', @@ -33,20 +44,49 @@ export const action: Action = async (exec, query) => { throw new Error('Expected 3 tables'); } + const tableStates = await query(` + SELECT table, idx_created, idx_materialized FROM default.migration_coordinates_typename_index + `).then(async r => StateTableModel.parse(r.data)); + for (const { uuid, name } of tables) { - console.log(`Creating idx_typename for table ${name}`); - await exec( - `ALTER TABLE ".inner_id.${uuid}" ADD INDEX idx_typename (substringIndex(coordinate, '.', 1)) TYPE ngrambf_v1(4, 1024, 2, 0) GRANULARITY 1`, - ); - const indexes = await query(` - SELECT name FROM system.data_skipping_indices WHERE table = '${'.inner_id.' + uuid}' AND name = 'idx_typename' - `).then(async r => DataSkippingIndicesModel.parse(r.data)); + let state = tableStates.find(s => s.table === name); + + if (!state) { + console.log(`Creating state for table ${name}`); + await exec(` + INSERT INTO default.migration_coordinates_typename_index (table) VALUES ('${name}') + `); + + state = { table: name, idx_created: false, idx_materialized: false }; + } + + const innerTable = `.inner_id.${uuid}`; + + if (state.idx_created) { + console.log(`Skipping idx_typename for table ${name}`); + } else { + console.log(`Creating idx_typename for table ${name}`); + await exec( + `ALTER TABLE "${innerTable}" ADD INDEX idx_typename (substringIndex(coordinate, '.', 1)) TYPE ngrambf_v1(4, 1024, 2, 0) GRANULARITY 1`, + ); + await exec( + `UPDATE default.migration_coordinates_typename_index SET idx_created = true WHERE table = '${name}'`, + ); + } - if (indexes.some(i => i.name)) { - console.log(`Materializing the idx_typename for table ${name}`); - await exec(`ALTER TABLE ".inner_id.${uuid}" MATERIALIZE INDEX idx_typename`); + if (state.idx_materialized) { + console.log(`Skipping materializing idx_typename for table ${name}`); } else { - console.error(`Failed to find idx_typename for table ${name}`); + console.log(`Materializing idx_typename for table ${name}`); + await exec(`ALTER TABLE "${innerTable}" MATERIALIZE INDEX idx_typename`); + await exec( + `UPDATE default.migration_coordinates_typename_index SET idx_materialized = true WHERE table = '${name}'`, + ); } } + + // Drop the state table + await exec(` + DROP TABLE default.migration_coordinates_typename_index + `); }; From 4e0ccc9af204ad1d0b153f6847e8d1be67c8ea63 Mon Sep 17 00:00:00 2001 From: Kamil Kisiela Date: Thu, 20 Feb 2025 09:26:31 +0100 Subject: [PATCH 5/6] Update 012-coordinates-typename-index.ts --- .../src/clickhouse-actions/012-coordinates-typename-index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/migrations/src/clickhouse-actions/012-coordinates-typename-index.ts b/packages/migrations/src/clickhouse-actions/012-coordinates-typename-index.ts index 21efac951d..63de80e857 100644 --- a/packages/migrations/src/clickhouse-actions/012-coordinates-typename-index.ts +++ b/packages/migrations/src/clickhouse-actions/012-coordinates-typename-index.ts @@ -29,7 +29,7 @@ export const action: Action = async (exec, query) => { table String, idx_created Bool DEFAULT false, idx_materialized Bool DEFAULT false - ) ENGINE = MergeTree() + ) ENGINE = MergeTree() ORDER BY tuple() `); const tables = await query(` From 076e5177330790e216cd43e31c8f99f493518601 Mon Sep 17 00:00:00 2001 From: Kamil Kisiela Date: Thu, 20 Feb 2025 10:01:23 +0100 Subject: [PATCH 6/6] o --- .../012-coordinates-typename-index.ts | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/packages/migrations/src/clickhouse-actions/012-coordinates-typename-index.ts b/packages/migrations/src/clickhouse-actions/012-coordinates-typename-index.ts index 63de80e857..258a4f3141 100644 --- a/packages/migrations/src/clickhouse-actions/012-coordinates-typename-index.ts +++ b/packages/migrations/src/clickhouse-actions/012-coordinates-typename-index.ts @@ -67,10 +67,13 @@ export const action: Action = async (exec, query) => { } else { console.log(`Creating idx_typename for table ${name}`); await exec( - `ALTER TABLE "${innerTable}" ADD INDEX idx_typename (substringIndex(coordinate, '.', 1)) TYPE ngrambf_v1(4, 1024, 2, 0) GRANULARITY 1`, + `ALTER TABLE "${innerTable}" ADD INDEX IF NOT EXISTS idx_typename (substringIndex(coordinate, '.', 1)) TYPE ngrambf_v1(4, 1024, 2, 0) GRANULARITY 1`, ); await exec( - `UPDATE default.migration_coordinates_typename_index SET idx_created = true WHERE table = '${name}'`, + `ALTER TABLE default.migration_coordinates_typename_index UPDATE idx_created = true WHERE table = '${name}'`, + { + mutations_sync: '2', + }, ); } @@ -80,12 +83,15 @@ export const action: Action = async (exec, query) => { console.log(`Materializing idx_typename for table ${name}`); await exec(`ALTER TABLE "${innerTable}" MATERIALIZE INDEX idx_typename`); await exec( - `UPDATE default.migration_coordinates_typename_index SET idx_materialized = true WHERE table = '${name}'`, + `ALTER TABLE default.migration_coordinates_typename_index UPDATE idx_materialized = true WHERE table = '${name}'`, + { + mutations_sync: '2', + }, ); } } - // Drop the state table + console.log('Dropping migration state table'); await exec(` DROP TABLE default.migration_coordinates_typename_index `);