#!/usr/bin/env python3
"""
Created by: Claude Code
Session ID: 2025-12-03
Date: 2025-12-03
Purpose: Fast table extraction from Notion pages

Usage:
    python 01_extract_table.py <notion_url>
    python 01_extract_table.py <notion_url> --format json
    python 01_extract_table.py <notion_url> --format csv
    python 01_extract_table.py <notion_url> --format markdown
    python 01_extract_table.py <notion_url> --format raw
    python 01_extract_table.py <notion_url> --quiet

Examples:
    python 01_extract_table.py "https://notion.so/page#block_id"
    python 01_extract_table.py "https://notion.so/page#block_id" --format json
"""

import os
import sys
import json
import argparse
from typing import Optional

# Add project root to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))))

from data_sources.notion.notion_client import NotionClient


def _hyphenate_id(raw: str) -> str:
    """Return a 32-char compact ID in 8-4-4-4-12 form; any other string unchanged."""
    if len(raw) != 32:
        return raw
    return f"{raw[:8]}-{raw[8:12]}-{raw[12:16]}-{raw[16:20]}-{raw[20:]}"


def extract_ids_from_url(url: str) -> tuple[str, Optional[str]]:
    """
    Extract page_id and block_id from Notion URL.

    Handles a query string placed before the anchor
    (``.../Page-<id>?pvs=4#<block>``), slugs without any dash
    (``.../<id>``) and IDs that are already hyphenated UUIDs.

    Args:
        url: Notion URL like https://notion.so/page-name-abc123#def456

    Returns:
        Tuple of (page_id, block_id) where block_id is None when the URL
        has no (or an empty) anchor. 32-character IDs are converted to the
        36-character hyphenated UUID form; other values are returned as-is.
    """
    import re  # local import, same style as format_csv in this file

    # In a URL the query string precedes the fragment, so the anchor must be
    # split off BEFORE the query is dropped; otherwise "?x=1#block" loses it.
    base, _, anchor = url.partition('#')
    base = base.split('?', 1)[0]
    # Tolerate a stray "?..." written after the anchor as well.
    anchor = anchor.split('?', 1)[0]

    block_id = _hyphenate_id(anchor) if anchor else None

    # Only the last path segment carries the page ID; splitting the whole URL
    # on '-' would return the scheme/host when the slug itself has no dash.
    segment = base.rstrip('/').rsplit('/', 1)[-1]

    # An already-hyphenated UUID at the end of the slug must be kept whole,
    # not cut down to its last dash-separated group.
    hyphenated = re.search(
        r'[0-9a-fA-F]{8}(?:-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12}$', segment
    )
    if hyphenated:
        page_id = hyphenated.group(0)
    else:
        page_id = _hyphenate_id(segment.split('-')[-1])

    return page_id, block_id


def find_table_in_block(client: NotionClient, block_id: str) -> Optional[dict]:
    """
    Locate the first table block beneath a parent block.

    Children are visited in the order returned by
    ``client.get_block_children``; a child that reports ``has_children``
    is searched recursively before its next sibling is examined.

    Args:
        client: NotionClient instance
        block_id: ID of the block whose descendants are searched

    Returns:
        The first block dict whose ``type`` is ``'table'``, or None when
        no such block is found
    """
    for node in client.get_block_children(block_id):
        # Direct hit: this child is itself a table
        if node.get('type') == 'table':
            return node

        # Leaf blocks cannot contain a table
        if not node.get('has_children'):
            continue

        # Descend into containers (toggles, columns, ...)
        found = find_table_in_block(client, node['id'])
        if found:
            return found

    return None


def extract_table_data(client: NotionClient, table_id: str) -> list[list[str]]:
    """
    Read every row of a Notion table as plain text.

    Only children whose ``type`` is ``'table_row'`` are kept. Each cell is
    treated as a sequence of rich-text fragments whose ``plain_text``
    values (empty string when absent) are concatenated.

    Args:
        client: NotionClient instance
        table_id: Table block ID

    Returns:
        List of rows, where each row is a list of cell strings
    """
    def plain_text(fragments) -> str:
        # Join the plain-text part of every rich-text fragment in one cell
        return ''.join(piece.get('plain_text', '') for piece in fragments)

    return [
        [plain_text(cell) for cell in block.get('table_row', {}).get('cells', [])]
        for block in client.get_block_children(table_id)
        if block.get('type') == 'table_row'
    ]


def format_markdown(data: list[list[str]]) -> str:
    """
    Format table data as a Markdown table.

    The first row is used as the header. Rows shorter than the header are
    padded with empty cells in the output only; the input lists are never
    modified. Pipe characters in cell text are backslash-escaped and line
    breaks are replaced with ``<br>`` so a cell cannot break the table
    structure.

    Args:
        data: List of rows, each a list of cell strings

    Returns:
        The Markdown table, or the string "Empty table" when data is empty
    """
    if not data:
        return "Empty table"

    def clean(text: str) -> str:
        # A raw '|' would start a new column and a newline would end the row
        return (
            text.replace("|", "\\|")
            .replace("\r\n", "<br>")
            .replace("\r", "<br>")
            .replace("\n", "<br>")
        )

    def render(cells: list[str]) -> str:
        return "| " + " | ".join(clean(cell) for cell in cells) + " |"

    header = data[0]
    width = len(header)

    lines = [
        render(header),
        "|" + "|".join(["---"] * width) + "|",
    ]
    for row in data[1:]:
        # Pad a copy of the row; multiplying by a negative count yields []
        lines.append(render(list(row) + [""] * (width - len(row))))

    return "\n".join(lines)


def format_csv(data: list[list[str]]) -> str:
    """
    Format table data as CSV text.

    Rows are written with the default ``csv.writer`` dialect, so every
    row, including the last, ends with a line terminator.
    """
    import csv
    import io

    with io.StringIO() as buffer:
        csv.writer(buffer).writerows(data)
        return buffer.getvalue()


def format_json(data: list[list[str]]) -> str:
    """
    Format table data as JSON.

    With at least two rows the first row supplies the keys and every later
    row becomes one object; cells missing from a short row become "" and
    cells beyond the header are ignored. With fewer than two rows the data
    is dumped unchanged as a list of lists.
    """
    has_body = bool(data) and len(data) >= 2
    if not has_body:
        # Nothing to key on (empty or header-only): dump the raw structure
        return json.dumps(data, ensure_ascii=False, indent=2)

    columns = data[0]
    records = [
        {
            name: (cells[idx] if idx < len(cells) else "")
            for idx, name in enumerate(columns)
        }
        for cells in data[1:]
    ]
    return json.dumps(records, ensure_ascii=False, indent=2)


def main():
    """
    Command-line entry point: find a table in a Notion page and print it.

    Parses the URL and options, resolves the table (the anchored block
    itself, a table nested inside it, or else the first table found on the
    page), extracts its rows and prints them to stdout in the requested
    format. Status messages go to stderr and are suppressed by --quiet.
    Exits with status 1 when no table is found.
    """
    parser = argparse.ArgumentParser(
        description="Extract table from Notion page",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s "https://notion.so/page#block_id"
  %(prog)s "https://notion.so/page#block_id" --format json
  %(prog)s "https://notion.so/page#block_id" --format csv
        """
    )
    parser.add_argument("url", help="Notion URL (with optional #block_id anchor)")
    parser.add_argument(
        "--format", "-f",
        choices=["markdown", "json", "csv", "raw"],
        default="markdown",
        help="Output format (default: markdown)"
    )
    parser.add_argument(
        "--quiet", "-q",
        action="store_true",
        help="Suppress status messages"
    )

    args = parser.parse_args()

    def status(message: str) -> None:
        """Write a progress message to stderr unless --quiet was given."""
        if not args.quiet:
            print(message, file=sys.stderr)

    # Extract IDs from URL
    page_id, block_id = extract_ids_from_url(args.url)

    status(f"📄 Page ID: {page_id}")
    if block_id:
        status(f"📍 Block ID: {block_id}")

    # Initialize client
    client = NotionClient()

    # Find table
    table_block = None

    if block_id:
        # First check if block_id is the table itself
        try:
            block = client.get_block(block_id)
            if block.get('type') == 'table':
                table_block = block
            else:
                # Search within the block (e.g., toggle)
                status(f"🔍 Searching in {block.get('type')} block...")
                table_block = find_table_in_block(client, block_id)
        except Exception as e:
            # Best-effort: fall back to the page search below, but report the
            # real cause; not every failure here means the block is missing.
            status(f"⚠️ Block lookup failed ({type(e).__name__}: {e}), searching page...")

    if not table_block:
        # Search entire page for first table
        status("🔍 Searching page for tables...")
        table_block = find_table_in_block(client, page_id)

    if not table_block:
        # Reported even under --quiet: this is the error result, not status
        print("❌ No table found", file=sys.stderr)
        sys.exit(1)

    status(f"✅ Found table: {table_block['id']}")
    status("")

    # Extract data
    data = extract_table_data(client, table_block['id'])

    status(f"📊 {len(data)} rows extracted")
    status("")

    # Format output
    if args.format == "markdown":
        print(format_markdown(data))
    elif args.format == "json":
        print(format_json(data))
    elif args.format == "csv":
        # CSV text already ends with a line terminator; a second newline
        # from print() would add a spurious blank line at the end.
        print(format_csv(data), end="")
    elif args.format == "raw":
        print(json.dumps(data, ensure_ascii=False, indent=2))


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
