Data Design Skill | Agent Skills

Data Design

Overview

Principles for designing data structures, schemas, and data flows that are efficient, maintainable, and scalable.

Data Modeling

Entity-Relationship Diagrams

┌─────────────┐       ┌─────────────┐       ┌─────────────┐
│    User     │       │    Order    │       │   Product   │
├─────────────┤       ├─────────────┤       ├─────────────┤
│ id (PK)     │──┐    │ id (PK)     │    ┌──│ id (PK)     │
│ email       │  │    │ user_id(FK) │←───┘  │ name        │
│ name        │  └───→│ status      │       │ price       │
│ created_at  │       │ total       │       │ stock       │
└─────────────┘       │ created_at  │       └─────────────┘
                      └─────────────┘              │
                             │                     │
                      ┌──────┴──────┐              │
                      ↓             ↓              │
               ┌─────────────┐                     │
               │ OrderItem   │                     │
               ├─────────────┤                     │
               │ id (PK)     │                     │
               │ order_id(FK)│                     │
               │ product_id  │─────────────────────┘
               │ quantity    │
               │ price       │
               └─────────────┘

Relationship Types

| Type | Description | Example | |------|-------------|---------| | 1:1 | One to one | User ↔ Profile | | 1:N | One to many | User → Orders | | M:N | Many to many | Students ↔ Courses |

-- 1:1 (profile extends user)
CREATE TABLE users (
  id SERIAL PRIMARY KEY,
  email VARCHAR(255) UNIQUE
);

CREATE TABLE profiles (
  user_id INTEGER PRIMARY KEY REFERENCES users(id),
  bio TEXT,
  avatar_url VARCHAR(255)
);

-- 1:N (user has many orders)
CREATE TABLE orders (
  id SERIAL PRIMARY KEY,
  user_id INTEGER REFERENCES users(id),
  total DECIMAL(10,2)
);

-- M:N (students ↔ courses via junction table)
CREATE TABLE enrollments (
  student_id INTEGER REFERENCES students(id),
  course_id INTEGER REFERENCES courses(id),
  enrolled_at TIMESTAMP DEFAULT NOW(),
  PRIMARY KEY (student_id, course_id)
);

Normalization

Normal Forms

| Form | Rule | Example Violation | |------|------|-------------------| | 1NF | Atomic values, no repeating groups | tags: "a,b,c" | | 2NF | 1NF + no partial dependencies | Non-key depends on part of composite key | | 3NF | 2NF + no transitive dependencies | zip → city in orders table | | BCNF | Every determinant is a candidate key | Rare edge cases |

-- ❌ Violates 1NF (non-atomic)
CREATE TABLE products (
  id SERIAL PRIMARY KEY,
  name VARCHAR(255),
  tags VARCHAR(255)  -- "electronics,sale,featured"
);

-- ✅ 1NF compliant
CREATE TABLE products (
  id SERIAL PRIMARY KEY,
  name VARCHAR(255)
);

CREATE TABLE product_tags (
  product_id INTEGER REFERENCES products(id),
  tag VARCHAR(50),
  PRIMARY KEY (product_id, tag)
);

-- ❌ Violates 3NF (transitive dependency)
CREATE TABLE orders (
  id SERIAL PRIMARY KEY,
  customer_zip VARCHAR(10),
  customer_city VARCHAR(100)  -- Depends on zip, not order
);

-- ✅ 3NF compliant
CREATE TABLE customers (
  id SERIAL PRIMARY KEY,
  zip VARCHAR(10),
  city VARCHAR(100)
);

CREATE TABLE orders (
  id SERIAL PRIMARY KEY,
  customer_id INTEGER REFERENCES customers(id)
);

Denormalization

When to Denormalize

Normalize for:
✅ Write-heavy workloads
✅ Data integrity requirements
✅ Storage efficiency
✅ Flexibility in queries

Denormalize for:
✅ Read-heavy workloads
✅ Complex joins hurting performance
✅ Reporting/analytics
✅ Known access patterns

Denormalization Patterns

-- Computed columns
CREATE TABLE orders (
  id SERIAL PRIMARY KEY,
  items JSONB,
  item_count INTEGER GENERATED ALWAYS AS (jsonb_array_length(items)) STORED,
  total DECIMAL(10,2)
);

-- Duplicated data for read performance
CREATE TABLE posts (
  id SERIAL PRIMARY KEY,
  author_id INTEGER REFERENCES users(id),
  author_name VARCHAR(100),  -- Duplicated from users
  author_avatar VARCHAR(255), -- Duplicated from users
  content TEXT
);

-- Materialized view for complex queries
CREATE MATERIALIZED VIEW monthly_sales AS
SELECT
  DATE_TRUNC('month', created_at) as month,
  product_id,
  SUM(quantity) as units_sold,
  SUM(total) as revenue
FROM order_items
GROUP BY 1, 2;

-- Refresh periodically
REFRESH MATERIALIZED VIEW CONCURRENTLY monthly_sales;

Schema Design Patterns

Soft Deletes

CREATE TABLE users (
  id SERIAL PRIMARY KEY,
  email VARCHAR(255),
  deleted_at TIMESTAMP NULL,
  -- Partial unique index
  CONSTRAINT unique_active_email UNIQUE (email) WHERE deleted_at IS NULL
);

-- Query active users only
SELECT * FROM users WHERE deleted_at IS NULL;

Audit Trail

CREATE TABLE audit_log (
  id SERIAL PRIMARY KEY,
  table_name VARCHAR(100),
  record_id INTEGER,
  action VARCHAR(10),  -- INSERT, UPDATE, DELETE
  old_data JSONB,
  new_data JSONB,
  changed_by INTEGER REFERENCES users(id),
  changed_at TIMESTAMP DEFAULT NOW()
);

-- Trigger for automatic auditing
CREATE OR REPLACE FUNCTION audit_trigger()
RETURNS TRIGGER AS $$
BEGIN
  INSERT INTO audit_log (table_name, record_id, action, old_data, new_data, changed_by)
  VALUES (
    TG_TABLE_NAME,
    COALESCE(NEW.id, OLD.id),
    TG_OP,
    CASE WHEN TG_OP != 'INSERT' THEN to_jsonb(OLD) END,
    CASE WHEN TG_OP != 'DELETE' THEN to_jsonb(NEW) END,
    current_setting('app.user_id', true)::INTEGER
  );
  RETURN COALESCE(NEW, OLD);
END;
$$ LANGUAGE plpgsql;

Multi-Tenancy

-- Row-level security
CREATE TABLE organizations (
  id SERIAL PRIMARY KEY,
  name VARCHAR(255)
);

CREATE TABLE projects (
  id SERIAL PRIMARY KEY,
  org_id INTEGER REFERENCES organizations(id),
  name VARCHAR(255)
);

-- Enable RLS
ALTER TABLE projects ENABLE ROW LEVEL SECURITY;

CREATE POLICY org_isolation ON projects
  USING (org_id = current_setting('app.org_id')::INTEGER);

-- Set org context per request
SET app.org_id = 123;
SELECT * FROM projects; -- Only sees org 123's projects

Versioning / History

-- Version table pattern
CREATE TABLE documents (
  id SERIAL PRIMARY KEY,
  current_version_id INTEGER
);

CREATE TABLE document_versions (
  id SERIAL PRIMARY KEY,
  document_id INTEGER REFERENCES documents(id),
  version INTEGER,
  content TEXT,
  created_at TIMESTAMP DEFAULT NOW(),
  created_by INTEGER REFERENCES users(id),
  UNIQUE (document_id, version)
);

-- Temporal tables (PostgreSQL)
CREATE TABLE products (
  id SERIAL PRIMARY KEY,
  name VARCHAR(255),
  price DECIMAL(10,2),
  valid_from TIMESTAMP DEFAULT NOW(),
  valid_to TIMESTAMP DEFAULT 'infinity'
);

-- Query historical state
SELECT * FROM products
WHERE valid_from <= '2024-01-01' AND valid_to > '2024-01-01';

NoSQL Schema Design

Document Store (MongoDB)

// Embedded vs Referenced

// ✅ Embed when: data is accessed together, 1:few relationship
{
  _id: ObjectId("..."),
  title: "Blog Post",
  author: {
    name: "John",
    email: "john@example.com"
  },
  comments: [
    { user: "Jane", text: "Great post!", date: ISODate("...") }
  ]
}

// ✅ Reference when: data is accessed independently, 1:many or M:N
{
  _id: ObjectId("..."),
  title: "Blog Post",
  authorId: ObjectId("..."),  // Reference to users collection
  commentIds: [ObjectId("..."), ObjectId("...")]
}

// ❌ Anti-pattern: Unbounded arrays
{
  _id: ObjectId("..."),
  logs: [...] // Can grow to millions, hits 16MB limit
}

// ✅ Better: Bucket pattern
{
  _id: ObjectId("..."),
  sensorId: "sensor-123",
  date: ISODate("2024-01-15"),
  readings: [...] // Max ~1000 per document
}

Key-Value Store (Redis)

# Naming conventions
user:123              # User object
user:123:sessions     # User's sessions (set)
user:123:orders       # User's orders (list)
order:456             # Order object
orders:pending        # Queue of pending orders (list)
products:category:electronics  # Products in category (set)

# Expiration patterns
session:{token}       # Expires after 30 min
rate_limit:ip:1.2.3.4 # Expires after 1 min
cache:api:/users/123  # Expires after 5 min

Data Pipeline Design

ETL vs ELT

ETL (Extract, Transform, Load):
Source → Transform (external) → Data Warehouse
Use: Traditional, when transformation is complex

ELT (Extract, Load, Transform):
Source → Data Lake/Warehouse → Transform (in-place)
Use: Modern, leverages warehouse compute power

Event Sourcing

// Events are the source of truth
interface Event {
  id: string;
  aggregateId: string;
  type: string;
  payload: unknown;
  timestamp: Date;
  version: number;
}

// Event store
class EventStore {
  async append(aggregateId: string, events: Event[]) {
    await db.events.insertMany(events);
  }

  async getEvents(aggregateId: string): Promise<Event[]> {
    return db.events
      .find({ aggregateId })
      .sort({ version: 1 })
      .toArray();
  }
}

// Rebuild state from events
function rebuildAccount(events: Event[]): Account {
  return events.reduce((account, event) => {
    switch (event.type) {
      case 'AccountOpened':
        return { balance: 0, ...event.payload };
      case 'MoneyDeposited':
        return { ...account, balance: account.balance + event.payload.amount };
      case 'MoneyWithdrawn':
        return { ...account, balance: account.balance - event.payload.amount };
      default:
        return account;
    }
  }, {} as Account);
}

Data Governance

Data Quality Dimensions

| Dimension | Description | Example Check | |-----------|-------------|---------------| | Accuracy | Correct values | Email format validation | | Completeness | No missing data | Required fields present | | Consistency | Same across systems | User ID matches in all tables | | Timeliness | Up to date | Last updated within 24h | | Uniqueness | No duplicates | Unique email per user |

Schema Evolution

-- Safe migrations

-- ✅ Adding nullable column
ALTER TABLE users ADD COLUMN phone VARCHAR(20) NULL;

-- ✅ Adding column with default
ALTER TABLE users ADD COLUMN status VARCHAR(20) DEFAULT 'active';

-- ⚠️ Making column non-null (multi-step)
-- Step 1: Add with default
ALTER TABLE users ADD COLUMN verified BOOLEAN DEFAULT false;
-- Step 2: Backfill data
UPDATE users SET verified = true WHERE email_verified_at IS NOT NULL;
-- Step 3: Add constraint
ALTER TABLE users ALTER COLUMN verified SET NOT NULL;

-- ❌ Dangerous: Renaming column
-- Instead: Add new, migrate data, remove old (over multiple deploys)

Related Skills

[[database]] - Database implementation
[[architecture-patterns]] - Data architecture patterns
[[api-design]] - Data in APIs

Agent Skills: Data Design

Install this agent skill to your local

Skill Files