ClickHouse Data Handling
Overview
Manage the full data lifecycle in ClickHouse: TTL-based expiration, GDPR/CCPA deletion, data masking, partition management, and audit trails.
Prerequisites
- ClickHouse tables with data (see
clickhouse-core-workflow-a) - Understanding of your data retention requirements
Instructions
Step 1: TTL-Based Data Expiration
-- Add TTL to expire data automatically
CREATE TABLE analytics.events (
event_id UUID DEFAULT generateUUIDv4(),
event_type LowCardinality(String),
user_id UInt64,
properties String CODEC(ZSTD(3)),
created_at DateTime DEFAULT now()
)
ENGINE = MergeTree()
ORDER BY (event_type, created_at)
PARTITION BY toYYYYMM(created_at)
TTL created_at + INTERVAL 90 DAY; -- Auto-delete after 90 days
-- Add TTL to existing table
ALTER TABLE analytics.events
MODIFY TTL created_at + INTERVAL 90 DAY;
-- Tiered storage TTL (hot → cold → delete)
ALTER TABLE analytics.events
MODIFY TTL
created_at + INTERVAL 7 DAY TO VOLUME 'hot',
created_at + INTERVAL 30 DAY TO VOLUME 'cold',
created_at + INTERVAL 365 DAY DELETE;
-- Column-level TTL (null out PII after 30 days, keep the row)
ALTER TABLE analytics.events
MODIFY COLUMN email String DEFAULT ''
TTL created_at + INTERVAL 30 DAY;
-- Force TTL cleanup now (normally runs during merges)
OPTIMIZE TABLE analytics.events FINAL;
Step 2: Data Deletion for GDPR/CCPA
-- Option A: Lightweight DELETE (ClickHouse 23.3+)
-- Marks rows as deleted without rewriting parts immediately
DELETE FROM analytics.events WHERE user_id = 42;
-- Option B: ALTER TABLE DELETE (mutation — rewrites parts in background)
ALTER TABLE analytics.events DELETE WHERE user_id = 42;
-- Check mutation progress
SELECT
database, table, mutation_id, command,
is_done, parts_to_do, create_time
FROM system.mutations
WHERE NOT is_done
ORDER BY create_time DESC;
-- Option C: Drop entire partitions (fastest for bulk deletion)
-- First, check what partitions exist
SELECT partition, count() AS parts, sum(rows) AS rows,
min(min_time) AS from_time, max(max_time) AS to_time
FROM system.parts
WHERE database = 'analytics' AND table = 'events' AND active
GROUP BY partition ORDER BY partition;
ALTER TABLE analytics.events DROP PARTITION '202401';
Important notes on ClickHouse deletions:
DELETE FROMis lightweight but still creates mutations internally- Mutations rewrite data parts in the background — not instant
- For GDPR compliance, use
ALTER TABLE DELETEand verify viasystem.mutations - Partitioned data is fastest to bulk-delete via
DROP PARTITION
Step 3: Data Masking and Anonymization
-- Create a view that masks PII for analyst access
CREATE VIEW analytics.events_masked AS
SELECT
event_id,
event_type,
sipHash64(user_id) AS user_id_hash, -- One-way hash
JSONExtractString(properties, 'url') AS url, -- Extract safe fields only
-- Mask email: show domain only
concat('***@', substringAfter(email, '@')) AS masked_email,
created_at
FROM analytics.events;
-- Row-level masking with dictionaries
CREATE DICTIONARY analytics.pii_allowlist (
user_id UInt64,
can_see_pii UInt8
)
PRIMARY KEY user_id
SOURCE(CLICKHOUSE(TABLE 'pii_allowlist'))
LIFETIME(MIN 300 MAX 600)
LAYOUT(FLAT());
Step 4: User Data Export (DSAR)
import { createClient } from '@clickhouse/client';
async function exportUserData(userId: number): Promise<Record<string, unknown[]>> {
const client = createClient({ url: process.env.CLICKHOUSE_HOST! });
// Export all user data from all tables
const tables = ['events', 'sessions', 'purchases'];
const result: Record<string, unknown[]> = {};
for (const table of tables) {
const rs = await client.query({
query: `SELECT * FROM analytics.${table} WHERE user_id = {uid:UInt64}`,
query_params: { uid: userId },
format: 'JSONEachRow',
});
result[table] = await rs.json();
}
return result;
}
// GDPR: Delete all user data
async function deleteUserData(userId: number): Promise<void> {
const client = createClient({ url: process.env.CLICKHOUSE_HOST! });
const tables = ['events', 'sessions', 'purchases'];
for (const table of tables) {
await client.command({
query: `ALTER TABLE analytics.${table} DELETE WHERE user_id = {uid:UInt64}`,
query_params: { uid: userId },
});
}
// Log the deletion for compliance audit trail
await client.insert({
table: 'analytics.gdpr_audit_log',
values: [{
user_id: userId,
action: 'DELETE_ALL',
tables_affected: tables.join(','),
requested_at: new Date().toISOString().replace('T', ' ').slice(0, 19),
}],
format: 'JSONEachRow',
});
}
Step 5: Audit Trail Table
-- Immutable audit log (no deletes, no TTL)
CREATE TABLE analytics.audit_log (
log_id UUID DEFAULT generateUUIDv4(),
action LowCardinality(String), -- 'query', 'delete', 'export', 'schema_change'
actor String, -- User or service name
target String, -- Table or resource
details String CODEC(ZSTD(3)), -- JSON details
ip_address IPv4,
logged_at DateTime DEFAULT now()
)
ENGINE = MergeTree()
ORDER BY (action, logged_at)
PARTITION BY toYYYYMM(logged_at);
-- No TTL — audit logs must be retained
-- Query audit trail
SELECT logged_at, actor, action, target, details
FROM analytics.audit_log
WHERE action = 'DELETE_ALL'
ORDER BY logged_at DESC
LIMIT 50;
Step 6: Retention Monitoring
-- Data retention overview
SELECT
database, table,
result_ttl_expression AS ttl,
formatReadableSize(sum(bytes_on_disk)) AS size,
min(p.min_time) AS oldest_data,
max(p.max_time) AS newest_data,
dateDiff('day', min(p.min_time), max(p.max_time)) AS days_span
FROM system.tables t
LEFT JOIN system.parts p ON t.database = p.database AND t.name = p.table AND p.active
WHERE t.database = 'analytics'
GROUP BY database, table, result_ttl_expression
ORDER BY sum(bytes_on_disk) DESC;
-- Find tables missing TTL
SELECT database, name AS table, engine
FROM system.tables
WHERE database = 'analytics'
AND engine LIKE '%MergeTree%'
AND result_ttl_expression = '';
Data Classification
| Category | Examples | Handling in ClickHouse | |----------|----------|------------------------| | PII | Email, name, IP | Column-level TTL, masking views, deletion support | | Sensitive | API keys, tokens | Never store in ClickHouse — use secret managers | | Business | Event counts, metrics | Standard TTL, aggregate for long-term retention | | Audit | Access logs | No TTL, immutable, partitioned by month |
Error Handling
| Issue | Cause | Solution |
|-------|-------|----------|
| Mutation stuck | Large table rewrite | Check system.mutations, cancel if needed |
| TTL not expiring | No merges running | OPTIMIZE TABLE ... FINAL to force |
| DELETE not working | Old ClickHouse version | Use ALTER TABLE DELETE (mutation) |
| Export timeout | Too much user data | Add LIMIT or export in batches |
Resources
Next Steps
For role-based access control, see clickhouse-enterprise-rbac.