Agent Skills: ClickHouse Debug Bundle

|

UncategorizedID: jeremylongshore/claude-code-plugins-plus-skills/clickhouse-debug-bundle

Install this agent skill to your local

pnpm dlx add-skill https://github.com/jeremylongshore/claude-code-plugins-plus-skills/tree/HEAD/plugins/saas-packs/clickhouse-pack/skills/clickhouse-debug-bundle

Skill Files

Browse the full folder contents for clickhouse-debug-bundle.

Download Skill

Loading file tree…

plugins/saas-packs/clickhouse-pack/skills/clickhouse-debug-bundle/SKILL.md

Skill Metadata

Name
clickhouse-debug-bundle
Description
"Collect ClickHouse diagnostic data \u2014 system tables, query logs,\

ClickHouse Debug Bundle

Overview

Collect comprehensive diagnostic data from ClickHouse system tables for troubleshooting performance issues, merge problems, or support escalation.

Prerequisites

  • Access to ClickHouse with system.* table read permissions
  • curl or clickhouse-client available

Instructions

Step 1: Server Health Overview

-- Server version and uptime
SELECT
    version()                       AS version,
    uptime()                        AS uptime_seconds,
    formatReadableTimeDelta(uptime()) AS uptime_human,
    currentDatabase()               AS current_db;

-- Global metrics snapshot
SELECT metric, value, description
FROM system.metrics
WHERE metric IN (
    'Query', 'Merge', 'PartMutation', 'ReplicatedFetch',
    'TCPConnection', 'HTTPConnection', 'MemoryTracking',
    'BackgroundMergesAndMutationsPoolTask'
);

Step 2: Disk and Table Health

-- Disk usage by table (top 20)
SELECT
    database,
    table,
    formatReadableSize(sum(bytes_on_disk))  AS disk_size,
    sum(rows)                               AS total_rows,
    count()                                 AS active_parts,
    max(modification_time)                  AS last_modified
FROM system.parts
WHERE active
GROUP BY database, table
ORDER BY sum(bytes_on_disk) DESC
LIMIT 20;

-- Tables with too many parts (merge pressure)
SELECT database, table, count() AS parts
FROM system.parts WHERE active
GROUP BY database, table
HAVING parts > 100
ORDER BY parts DESC;

-- Disk space per disk
SELECT
    name,
    path,
    formatReadableSize(total_space)     AS total,
    formatReadableSize(free_space)      AS free,
    round(free_space / total_space * 100, 1) AS free_pct
FROM system.disks;

Step 3: Query Performance Analysis

-- Slowest queries in the last 24 hours
SELECT
    event_time,
    query_duration_ms,
    read_rows,
    read_bytes,
    result_rows,
    memory_usage,
    substring(query, 1, 200) AS query_preview
FROM system.query_log
WHERE type = 'QueryFinish'
  AND event_time >= now() - INTERVAL 24 HOUR
ORDER BY query_duration_ms DESC
LIMIT 20;

-- Failed queries (last 24h)
SELECT
    event_time,
    exception_code,
    exception,
    substring(query, 1, 200) AS query_preview
FROM system.query_log
WHERE type = 'ExceptionWhileProcessing'
  AND event_time >= now() - INTERVAL 24 HOUR
ORDER BY event_time DESC
LIMIT 20;

-- Query patterns (group by normalized query)
SELECT
    normalized_query_hash,
    count()                          AS executions,
    avg(query_duration_ms)           AS avg_ms,
    max(query_duration_ms)           AS max_ms,
    sum(read_rows)                   AS total_rows_read,
    formatReadableSize(sum(read_bytes)) AS total_read,
    any(substring(query, 1, 150))    AS sample_query
FROM system.query_log
WHERE type = 'QueryFinish'
  AND event_time >= now() - INTERVAL 24 HOUR
GROUP BY normalized_query_hash
ORDER BY sum(query_duration_ms) DESC
LIMIT 20;

Step 4: Merge and Mutation Status

-- Active merges
SELECT
    database, table, elapsed, progress,
    num_parts, result_part_name,
    formatReadableSize(total_size_bytes_compressed) AS size
FROM system.merges;

-- Pending mutations
SELECT database, table, mutation_id, command, create_time, is_done
FROM system.mutations
WHERE NOT is_done
ORDER BY create_time DESC;

-- Replication health (if using ReplicatedMergeTree)
SELECT
    database, table,
    is_leader, total_replicas, active_replicas,
    queue_size, inserts_in_queue, merges_in_queue
FROM system.replicas
WHERE active_replicas < total_replicas OR queue_size > 0;

Step 5: Automated Debug Script

#!/bin/bash
# clickhouse-debug-bundle.sh
set -euo pipefail

CH_HOST="${CLICKHOUSE_HOST:-http://localhost:8123}"
CH_USER="${CLICKHOUSE_USER:-default}"
CH_PASS="${CLICKHOUSE_PASSWORD:-}"
BUNDLE="ch-debug-$(date +%Y%m%d-%H%M%S)"
mkdir -p "$BUNDLE"

ch_query() {
  curl -sS "${CH_HOST}" \
    --user "${CH_USER}:${CH_PASS}" \
    --data-binary "$1" 2>&1
}

echo "Collecting ClickHouse diagnostics..."

ch_query "SELECT version(), uptime(), currentDatabase()" > "$BUNDLE/version.txt"
ch_query "SELECT * FROM system.metrics FORMAT TabSeparatedWithNames" > "$BUNDLE/metrics.tsv"
ch_query "SELECT * FROM system.events FORMAT TabSeparatedWithNames" > "$BUNDLE/events.tsv"
ch_query "SELECT database, table, count() AS parts, sum(rows) AS rows, \
  formatReadableSize(sum(bytes_on_disk)) AS size FROM system.parts \
  WHERE active GROUP BY database, table ORDER BY sum(bytes_on_disk) DESC \
  FORMAT TabSeparatedWithNames" > "$BUNDLE/tables.tsv"
ch_query "SELECT * FROM system.merges FORMAT TabSeparatedWithNames" > "$BUNDLE/merges.tsv"
ch_query "SELECT * FROM system.query_log WHERE type IN ('ExceptionWhileProcessing') \
  AND event_time >= now() - INTERVAL 1 HOUR ORDER BY event_time DESC LIMIT 50 \
  FORMAT TabSeparatedWithNames" > "$BUNDLE/errors.tsv"
ch_query "SELECT * FROM system.replicas FORMAT TabSeparatedWithNames" > "$BUNDLE/replicas.tsv" 2>/dev/null || true

tar -czf "${BUNDLE}.tar.gz" "$BUNDLE"
rm -rf "$BUNDLE"
echo "Bundle created: ${BUNDLE}.tar.gz"

Step 6: Node.js Debug Collector

import { createClient } from '@clickhouse/client';

async function collectDebugBundle(client: ReturnType<typeof createClient>) {
  const queries = {
    version: 'SELECT version() AS ver, uptime() AS up',
    tables: `SELECT database, table, count() AS parts, sum(rows) AS rows
             FROM system.parts WHERE active GROUP BY database, table
             ORDER BY sum(bytes_on_disk) DESC LIMIT 20`,
    slow: `SELECT query_duration_ms, substring(query,1,200) AS q
           FROM system.query_log WHERE type='QueryFinish'
           AND event_time >= now() - INTERVAL 1 HOUR
           ORDER BY query_duration_ms DESC LIMIT 10`,
    errors: `SELECT exception_code, exception, substring(query,1,200) AS q
             FROM system.query_log WHERE type='ExceptionWhileProcessing'
             AND event_time >= now() - INTERVAL 1 HOUR LIMIT 10`,
    merges: 'SELECT * FROM system.merges',
  };

  const bundle: Record<string, unknown> = {};
  for (const [key, sql] of Object.entries(queries)) {
    try {
      const rs = await client.query({ query: sql, format: 'JSONEachRow' });
      bundle[key] = await rs.json();
    } catch (e) {
      bundle[key] = { error: (e as Error).message };
    }
  }

  return bundle;
}

Key System Tables

| Table | Purpose | |-------|---------| | system.parts | Data parts per table (size, rows, merge status) | | system.query_log | Query history with timing and errors | | system.metrics | Real-time server metrics (gauges) | | system.events | Cumulative server counters | | system.merges | Currently running merges | | system.mutations | ALTER TABLE mutations (UPDATE/DELETE) | | system.replicas | Replication status per table | | system.processes | Currently executing queries | | system.disks | Disk space and health |

Error Handling

| Issue | Cause | Solution | |-------|-------|----------| | system.query_log empty | Logging disabled | Set log_queries = 1 | | Permission denied on system tables | Restricted user | Grant SELECT ON system.* | | Bundle too large | Too much history | Narrow time window |

Resources

Next Steps

For connection and concurrency issues, see clickhouse-rate-limits.