#!/usr/bin/env python3
"""
Extract Client Emails from ClickHouse
Part of: client-emails-extractor skill
Version: 2.6.0
Date: 2025-12-01

CHANGELOG v2.6.0:
- CRITICAL FIX: Two-level deduplication
- Level 1: gmail_msg_id (Revenue vs CS for same mailbox)
- Level 2: subject+from+date (same email to multiple recipients = different IDs)
- Fixes: Email sent to N recipients had N copies, only 1 with body

CHANGELOG v2.5.0:
- NEW: Email body cleaning pipeline (html2text + email-reply-parser)
- Removes HTML tags, quoted replies, and signatures
- Produces clean, readable text output
- Dependencies: html2text, email-reply-parser

CHANGELOG v2.4.0:
- CRITICAL FIX: Dedup by gmail_msg_id (same ID in both tables)
- FIXED: Date-based dedup failed when 7sec difference crossed minute boundary
- Revenue 11:59:53 vs CS 12:00:00 → different dedup keys → CS record survived
- Now: same message ID = same email → correct dedup, Revenue body preferred

CHANGELOG v2.3.0:
- CRITICAL FIX: Switched from fct_gmail_revenue_messages to src_mv_google_gmail_revenue_messages
- fct_ table truncates body to ~129 chars due to ETL bug
- src_ table has FULL body (6000+ chars)
- Added calendar sync email filters (synced invitation, canceled event)

CHANGELOG v2.2.0:
- MAJOR: Now combines TWO sources for maximum body coverage:
  1. src_mv_google_gmail_revenue_messages (FULL body) - Revenue/Sales team
  2. src_mv_google_gmail_cs_messages (57% body) - CS team
- Deduplicates by composite key (subject+from+date), preferring source with body
- Expected body coverage: ~95%+ combined

CHANGELOG v2.1.0:
- FIXED: Now uses src_mv_google_gmail_cs_messages (RAW source) instead of stg view
- RAW table contains FULL email body (not just snippets)
- Body available for ~57% of emails vs only snippet in stg view

CHANGELOG v2.0.0:
- Added context validation to ensure emails actually relate to the client
- Primary domains (e.g., kp.org) are trusted without additional validation
- Associated domains (e.g., omc.com for agencies) require context keywords
- Added --keywords parameter for custom context keywords

Usage:
    python extract_client_emails.py "Client Name" --primary kp.org --associated omc.com,hearts-science.com
"""

import os
import sys
import json
import re
import argparse
from datetime import datetime, timedelta
from pathlib import Path
from collections import defaultdict

# Add project root to path
# PROJECT_ROOT is three directory levels above the directory containing this
# script. It is appended to sys.path so the project-local `analytics` package
# imported below can be resolved when this file is run directly; it is also the
# base of the default output directory used by extract_client_emails().
SCRIPT_DIR = Path(__file__).parent
PROJECT_ROOT = SCRIPT_DIR.parent.parent.parent
sys.path.append(str(PROJECT_ROOT))

# Must come after the sys.path tweak above.
from analytics.clickhouse.db_utils import get_client

# Email cleaning libraries
# These are optional: when either import fails the script keeps working,
# EMAIL_CLEANING_AVAILABLE is set to False, and clean_email_body() falls back to
# regex tag stripping plus whitespace collapsing. The warning below is printed
# once, at import time.
try:
    import html2text
    from email_reply_parser import EmailReplyParser
    EMAIL_CLEANING_AVAILABLE = True
except ImportError:
    EMAIL_CLEANING_AVAILABLE = False
    print("⚠️ Email cleaning libraries not installed. Run: pip install html2text email-reply-parser")

# ============================================================================
# CONFIGURATION
# ============================================================================

# Default look-back window in months. extract_client_emails() converts it to
# days as months * 30, so it is an approximation rather than calendar months.
DEFAULT_MONTHS_BACK = 6

# Noise filter patterns
# Each entry is a regular expression that should_exclude() applies with
# re.search(..., re.IGNORECASE) to the lower-cased subject. The lists are scanned
# in order and the FIRST matching pattern is reported as the exclusion reason,
# so reordering entries changes the reasons recorded in the run statistics.
EXCLUDE_SUBJECT_PATTERNS = [
    # Meeting pre-reads / recaps generated by meeting assistants
    r'pre-read for your upcoming',
    r'please read this message in its entirety',
    r'your meeting recap',
    r'meeting recap -',
    r'read meeting report',
    r'\| read meeting report',
    # Out-of-office / automatic replies
    r'^ooo\b',
    r'out of office',
    r'\booo\b.*re:',
    r'automatic reply',
    # Calendar invitations and responses
    r'^invitation\b',
    r'^updated invitation',
    r'^canceled:',
    r'^cancelled:',
    r'^accepted:',
    r'^declined:',
    r'^tentative:',
    # Billing / tooling notifications
    r'invoice.*is due',
    r'payment reminder',
    r'you have a new invoice',
    r'^\[jira\]',
    r'fred from fireflies',
    r'early payment available',         # Taulia payment notifications
    r'a new opportunity has been created', # Salesforce auto-notifications
    r'^synced invitation',               # Google Calendar sync - no body available
    r'^canceled event with note',        # Google Calendar cancellation - no body available
    r'^updated event with note',         # Google Calendar update - no body available
]

# Regular expressions applied by should_exclude() with re.search(...,
# re.IGNORECASE) to the lower-cased sender address (gmail_msg_from_email).
# As above, the first matching pattern becomes the reported reason.
EXCLUDE_FROM_PATTERNS = [
    r'noreply',
    r'no-reply',
    r'donotreply',
    r'notifications@',
    r'calendar-notification',
    r'fred@fireflies',
    r'billing@improvado',
    r'@e\.read\.ai',
    r'@read\.ai',
    r'@taulia\.com',                    # Payment portal auto-notifications
    r'portal@',                         # Generic portal notifications
]


def should_exclude(email: dict) -> tuple[bool, str]:
    """Decide whether an email is automated noise that should be dropped.

    The subject is tested against EXCLUDE_SUBJECT_PATTERNS first, then the
    sender address against EXCLUDE_FROM_PATTERNS, and finally the raw From
    header against an out-of-office / automatic-reply pattern. Missing or
    None fields are treated as empty strings.

    Returns:
        (True, reason) for the first rule that matches, where reason names the
        matching pattern; (False, None) when no rule matches.
    """
    def _lowered(key: str) -> str:
        # Absent and None values both collapse to an empty string.
        return (email.get(key) or '').lower()

    subject_text = _lowered('gmail_msg_subject')
    sender_address = _lowered('gmail_msg_from_email')
    sender_header = _lowered('gmail_msg_from_field')

    # First subject pattern that matches, if any.
    subject_hit = next(
        (rule for rule in EXCLUDE_SUBJECT_PATTERNS
         if re.search(rule, subject_text, re.IGNORECASE)),
        None,
    )
    if subject_hit is not None:
        return True, f"Subject: {subject_hit}"

    # First sender-address pattern that matches, if any.
    sender_hit = next(
        (rule for rule in EXCLUDE_FROM_PATTERNS
         if re.search(rule, sender_address, re.IGNORECASE)),
        None,
    )
    if sender_hit is not None:
        return True, f"From: {sender_hit}"

    # OOO in from field
    is_auto_reply = re.search(r'ooo.*re:|automatic reply', sender_header, re.IGNORECASE)
    return (True, "OOO auto-reply") if is_auto_reply else (False, None)


# Signature / sign-off patterns, compiled once instead of on every call.
# Each pattern removes everything from its first match to the end of the text.
# They are applied one after another (NOT merged into a single alternation),
# because an earlier truncation can change what a later pattern matches.
_SIGNATURE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE | re.DOTALL)
    for pattern in (
        r'\r?\n--\s*\r?\n.*',        # Standard signature delimiter
        r'\r?\nBest regards,.*',     # Common sign-offs
        r'\r?\nKind regards,.*',
        r'\r?\nWarm regards,.*',
        r'\r?\nThanks,\r?\n.*',
        r'\r?\nThank you,\r?\n.*',
        r'\r?\nSincerely,.*',
        r'\r?\nCheers,.*',
        r'\r?\n_{10,}.*',            # Long underscore line
        r'\r?\nSent from my iPhone.*',
        r'\r?\nSent from my iPad.*',
        r'\r?\nGet Outlook for.*',
    )
)


def clean_email_body(body: str) -> str:
    """
    Clean an email body by removing HTML, quoted replies, and signatures.

    The stored body is often the plain-text part followed by the HTML part,
    with the HTML starting at a `<div` tag (usually at the start of a line).

    Pipeline:
    1. Locate the first `<div` and look at the text in front of it.
    2. If that prefix contains real text, use it as the plain-text part.
       If the prefix is empty, whitespace, or nothing but markup (for example
       `<html><body>`), the body is HTML-only and is converted to text instead.
    3. Remove quoted replies (email-reply-parser).
    4. Remove email signatures (regex patterns).
    5. Clean up whitespace.

    When the optional cleaning libraries are unavailable, HTML tags are
    stripped with a regex, all whitespace runs are collapsed to single spaces,
    and steps 3-4 are skipped.

    Args:
        body: Raw body text; None, empty and whitespace-only values are accepted.

    Returns:
        Clean, readable text ('' for empty input).
    """
    if not body or not body.strip():
        return ''

    # Gmail often concatenates plain text + HTML in body
    # Prefer a `<div` at the start of a line, else the first `<div` anywhere.
    html_start = body.find('\n<div')
    if html_start == -1:
        html_start = body.find('<div')

    prefix = body[:html_start] if html_start > 0 else ''
    # A prefix made only of tags/whitespace is not a plain-text part. Treating
    # it as one would return markup such as '<html><body>' and lose the content.
    has_plain_part = bool(re.sub(r'<[^>]+>', '', prefix).strip())

    if has_plain_part:
        # Has both plain text and HTML - use plain text part
        plain = prefix
    elif '<' in body and '>' in body:
        # Only HTML - convert to text
        if EMAIL_CLEANING_AVAILABLE:
            converter = html2text.HTML2Text()
            converter.ignore_links = True
            converter.ignore_images = True
            converter.ignore_emphasis = True
            converter.body_width = 0  # do not hard-wrap lines
            plain = converter.handle(body)
        else:
            plain = re.sub(r'<[^>]+>', '', body)
    else:
        # Plain text only
        plain = body

    if not EMAIL_CLEANING_AVAILABLE:
        # Degraded mode: no reply/signature removal, just flatten whitespace.
        plain = re.sub(r'\s+', ' ', plain)
        return plain.strip()

    # Remove quoted replies (From: ..., On ... wrote:, > quoted lines)
    plain = EmailReplyParser.parse_reply(plain)

    # Remove signatures (after -- or common sign-offs)
    for signature_re in _SIGNATURE_PATTERNS:
        plain = signature_re.sub('', plain)

    # Clean up whitespace
    plain = plain.replace('\r\n', '\n')        # Normalize line endings
    plain = re.sub(r'\n{3,}', '\n\n', plain)   # Max 2 newlines
    plain = re.sub(r'[ \t]+', ' ', plain)      # Multiple spaces → single
    return plain.strip()


def validate_client_context(
    email: dict,
    primary_domains: list[str],
    associated_domains: list[str],
    context_keywords: list[str]
) -> tuple[bool, str]:
    """
    Validate that an email actually relates to the client.

    Checks, in order:
    1. A PRIMARY domain (e.g., kp.org) appears in the sender address or the
       To field → VALID, no further check.
    2. A context keyword appears in the subject, body or snippet → VALID.
    3. Otherwise → INVALID. The reason distinguishes emails that involve an
       ASSOCIATED domain (e.g., omc.com) without any context keyword from
       emails with no client signal at all.

    All comparisons are case-insensitive substring tests. Domains are stripped
    of surrounding whitespace; empty or whitespace-only domains and keywords
    are ignored, because an empty string is a substring of every string and
    would otherwise mark every email as valid. None is accepted in place of
    any of the three lists.

    NOTE(review): substring matching means 'kp.org' also matches an address
    like 'user@notkp.org', and a short keyword such as 'kp' matches inside
    longer words. Confirm whether stricter matching is wanted.

    Args:
        email: Record with the gmail_msg_* fields; missing/None fields count as ''.
        primary_domains: Client-owned domains, always trusted.
        associated_domains: Agency/partner domains that need a context keyword.
        context_keywords: Keywords that prove the email is about the client.

    Returns:
        (is_valid, reason)
    """
    def _domains(values) -> list[str]:
        # Trim and lower-case; skip blanks so '' can never match everything.
        return [v.strip().lower() for v in (values or []) if v and v.strip()]

    def _keywords(values) -> list[str]:
        # Keywords are only lower-cased (inner/outer spacing is kept as given),
        # but blank entries are dropped for the same reason as above.
        return [v.lower() for v in (values or []) if v and v.strip()]

    from_email = (email.get('gmail_msg_from_email') or '').lower()
    to_field = (email.get('gmail_msg_to_field') or '').lower()
    subject = (email.get('gmail_msg_subject') or '').lower()
    body = (email.get('gmail_msg_body') or '').lower()
    snippet = (email.get('gmail_msg_snippet') or '').lower()

    full_text = f"{subject} {body} {snippet}"

    def _involves(domain: str) -> bool:
        return domain in from_email or domain in to_field

    # Check 1: Primary domain involvement → always valid
    for domain in _domains(primary_domains):
        if _involves(domain):
            return True, f"primary_domain:{domain}"

    # Check 2: Context keywords in subject/body/snippet
    for keyword in _keywords(context_keywords):
        if keyword in full_text:
            return True, f"context_keyword:{keyword}"

    # Check 3: Only an associated domain, without context → INVALID
    if any(_involves(domain) for domain in _domains(associated_domains)):
        return False, "associated_domain_without_context"

    # Neither a trusted domain nor any context keyword was found.
    return False, "no_client_context"


def safe_filename(s: str, max_len: int = 60) -> str:
    """Turn arbitrary text into a lower-case, underscore-separated file stem.

    Characters other than word characters, whitespace and hyphens are removed,
    whitespace runs become single underscores, the result is cut to max_len
    characters and stripped of leading/trailing underscores. 'untitled' is
    returned when nothing is left.
    """
    lowered = s.lower()
    without_punctuation = re.sub(r'[^\w\s-]', '', lowered)
    underscored = re.sub(r'\s+', '_', without_punctuation)
    stem = underscored[:max_len].strip('_')
    if stem:
        return stem
    return 'untitled'


def _sql_like_contains(value: str) -> str:
    """Return a quoted ClickHouse string literal that matches value as a substring.

    The value is escaped twice: first its LIKE metacharacters (backslash,
    percent, underscore) so it is matched literally, then backslashes and
    single quotes so it is a well-formed SQL string literal. Without this a
    client name containing an apostrophe breaks (or injects into) the query.
    """
    like_safe = value.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
    literal = like_safe.replace('\\', '\\\\').replace("'", "\\'")
    return f"'%{literal}%'"


def _build_where(client_name: str, domains: list[str],
                 subject_col: str, from_col: str, to_col: str) -> str:
    """Build the OR-joined filter: subject contains the client name, or From/To contains a domain."""
    conditions = [f"{subject_col} ILIKE {_sql_like_contains(client_name)}"]
    for domain in domains:
        pattern = _sql_like_contains(domain)
        conditions.append(f"{from_col} ILIKE {pattern}")
        conditions.append(f"{to_col} ILIKE {pattern}")
    return ' OR '.join(conditions)


def _combine_and_dedup(df_revenue, df_cs):
    """Concatenate the Revenue and CS result frames and apply two-level deduplication.

    Level 1 keeps one row per gmail_msg_id (the same message can be present in
    both tables). Level 2 keeps one row per subject + sender + minute (the same
    email delivered to several mailboxes gets a different message id in each).
    Rows are sorted first so the kept row is the one with the longest body,
    with Revenue preferred over CS on ties.
    """
    import pandas as pd

    df_combined = pd.concat([df_revenue, df_cs], ignore_index=True)
    if df_combined.empty:
        # Nothing to deduplicate; also avoids column lookups on an empty frame.
        return df_combined

    # Calculate body length for better dedup (prefer longer body)
    df_combined['body_len'] = df_combined['gmail_msg_body'].apply(
        lambda x: len(str(x).strip()) if x and str(x).strip() else 0
    )
    df_combined['source_priority'] = df_combined['source'].apply(lambda x: 1 if x == 'revenue' else 2)

    # Sort by: body_len DESC (prefer longer body), source_priority ASC
    df_combined = df_combined.sort_values(
        ['body_len', 'source_priority'],
        ascending=[False, True]
    )

    # Level 1: Dedup by message ID (Revenue vs CS for same mailbox)
    # .copy() so the column assignment below works on an independent frame.
    df = df_combined.drop_duplicates(subset=['gmail_msg_id'], keep='first').copy()
    print(f"   After Level 1 dedup (by msg_id): {len(df)} emails")

    # Level 2: Dedup by content key (same email sent to multiple recipients)
    df['content_key'] = (
        df['gmail_msg_subject'].astype(str).str.lower().str.strip() + '|' +
        df['gmail_msg_from_email'].astype(str).str.lower().str.strip() + '|' +
        df['gmail_msg_date'].astype(str).str[:16]  # Date to minute precision
    )
    df = df.drop_duplicates(subset=['content_key'], keep='first')
    df = df.drop(columns=['content_key'])
    print(f"   After Level 2 dedup (by content): {len(df)} emails")

    # Count how many have body after dedup (guarded against an empty frame)
    with_body_count = (df['body_len'] > 0).sum()
    total = len(df)
    with_body_pct = round(with_body_count * 100 / total, 1) if total else 0.0
    print(f"   With body: {with_body_count}/{total} ({with_body_pct}%)")

    df = df.drop(columns=['body_len', 'source_priority', 'source'])
    print(f"   After dedup: {len(df)} unique emails")
    return df


def _render_thread_markdown(thread_emails: list[dict]) -> str:
    """Render one chronologically sorted thread as a markdown document."""
    first_email = thread_emails[0]
    last_email = thread_emails[-1]

    # Collect participants (sender addresses only)
    participants = {
        e['gmail_msg_from_email'] for e in thread_emails if e.get('gmail_msg_from_email')
    }
    thread_subject = first_email['gmail_msg_subject'] or 'No Subject'

    parts = [f"""# {thread_subject}

**Thread:** {len(thread_emails)} messages
**Period:** {str(first_email['gmail_msg_date'])[:10]} → {str(last_email['gmail_msg_date'])[:10]}
**Participants:** {', '.join(sorted(participants))}

---

"""]

    for idx, email in enumerate(thread_emails, 1):
        from_field = email.get('gmail_msg_from_field') or email.get('gmail_msg_from_email', '')
        date = str(email.get('gmail_msg_date', ''))[:19]

        # Use body if available, otherwise snippet
        raw_body = email.get('gmail_msg_body') or ''
        if raw_body.strip():
            # Clean the body (remove HTML, quotes, signatures)
            body = clean_email_body(raw_body)
            if not body.strip():
                # Cleaning removed everything - use snippet as fallback
                body = email.get('gmail_msg_snippet') or '[Empty after cleaning]'
        else:
            body = email.get('gmail_msg_snippet') or ''
            if body:
                body = f"[Snippet only] {body}"

        parts.append(f"""## Message {idx}

**From:** {from_field}
**Date:** {date}

{body}

---

""")

    return ''.join(parts)


def extract_client_emails(
    client_name: str,
    primary_domains: list[str] = None,
    associated_domains: list[str] = None,
    context_keywords: list[str] = None,
    output_dir: Path = None,
    months_back: int = DEFAULT_MONTHS_BACK
) -> dict:
    """
    Extract client emails from the ClickHouse Revenue and CS Gmail source tables.

    Steps: query both tables for emails whose subject contains the client name
    or whose From/To contains one of the domains, deduplicate, drop automated
    noise, keep only emails that pass client-context validation, group the
    remainder by thread, and write one markdown file per thread plus a
    `_metadata.json` file into the output directory.

    Existing `*.md` files in the output directory are deleted before writing.

    Args:
        client_name: Name of the client to search
        primary_domains: Client's own domains (e.g., ['kp.org']) - always trusted
        associated_domains: Agency/partner domains (e.g., ['omc.com']) - require context
        context_keywords: Keywords to validate context (e.g., ['kaiser', 'kp', 'hearts']);
            defaults to the words of client_name longer than two characters
        output_dir: Where to save output files; defaults to
            PROJECT_ROOT/client_cases/emails_<client slug>
        months_back: How many months back to search (counted as 30 days each)

    Returns:
        dict with extraction stats and metadata. On a query/dedup failure the
        dict is {'error': <message>}; when nothing is found it only contains
        zero counters.
    """

    print("=" * 80)
    print(f"CLIENT EMAIL EXTRACTION: {client_name}")
    print("=" * 80)

    # Defaults. Blank entries are dropped: an empty domain would turn into
    # ILIKE '%%' (matches every row) and an empty keyword matches every text.
    primary_domains = [d.strip() for d in (primary_domains or []) if d and d.strip()]
    associated_domains = [d.strip() for d in (associated_domains or []) if d and d.strip()]
    if context_keywords is None:
        # Default: client name parts
        context_keywords = [word.lower() for word in client_name.split() if len(word) > 2]
    else:
        context_keywords = [k for k in context_keywords if k and k.strip()]

    all_domains = primary_domains + associated_domains

    # Calculate date range
    start_date = (datetime.now() - timedelta(days=months_back * 30)).strftime('%Y-%m-%d')

    print(f"\n📋 Parameters:")
    print(f"   Client: {client_name}")
    print(f"   Primary domains: {primary_domains}")
    print(f"   Associated domains: {associated_domains}")
    print(f"   Context keywords: {context_keywords}")
    print(f"   Period: {start_date} → {datetime.now().strftime('%Y-%m-%d')}")

    # Build search conditions for BOTH tables
    # Revenue SOURCE table uses message_* columns, CS table uses from/to/subject
    cs_where = _build_where(client_name, all_domains, 'subject', '`from`', '`to`')
    revenue_where = _build_where(
        client_name, all_domains, 'message_subject', 'message_from', 'message_to'
    )

    # Query BOTH tables for maximum body coverage
    print(f"\n🔍 Querying ClickHouse (combining Revenue + CS tables for max body coverage)...")

    ch_client = get_client(is_default=False)  # Palantir shard

    # NOTE(review): extractAll(..., '<([^>]+)>')[1] looks like it yields '' when
    # the From header has no angle brackets (a bare address). Such rows would
    # bypass the sender noise filter and the sender-domain check - confirm
    # against the data whether bare addresses occur.

    # Query 1: Revenue SOURCE table (has FULL body, not truncated like fct_)
    # NOTE: fct_gmail_revenue_messages truncates body to ~129 chars due to ETL bug
    # Using src_mv_google_gmail_revenue_messages which has full body (6000+ chars)
    query_revenue = f"""
    SELECT
        message_id as gmail_msg_id,
        toString(message_internal_date) as gmail_msg_date,
        message_thread_id as gmail_thread_id,
        message_account_email as gmail_account_email,
        lower(trim(BOTH '<>' FROM extractAll(message_from, '<([^>]+)>')[1])) as gmail_msg_from_email,
        message_from as gmail_msg_from_field,
        message_to as gmail_msg_to_field,
        message_subject as gmail_msg_subject,
        message_body as gmail_msg_body,
        message_snippet as gmail_msg_snippet,
        'revenue' as source
    FROM internal_analytics_src.src_mv_google_gmail_revenue_messages
    WHERE message_internal_date >= '{start_date}'
      AND ({revenue_where})
    """

    # Query 2: CS RAW table (57% body coverage)
    # NOTE: CS has body=0 for many emails but Revenue may have body - must dedup correctly
    query_cs = f"""
    SELECT
        id as gmail_msg_id,
        toString(date) as gmail_msg_date,
        threadId as gmail_thread_id,
        accountEmail as gmail_account_email,
        lower(trim(BOTH '<>' FROM extractAll(`from`, '<([^>]+)>')[1])) as gmail_msg_from_email,
        `from` as gmail_msg_from_field,
        `to` as gmail_msg_to_field,
        subject as gmail_msg_subject,
        body as gmail_msg_body,
        snippet as gmail_msg_snippet,
        'cs' as source
    FROM internal_analytics_src.src_mv_google_gmail_cs_messages
    WHERE date >= '{start_date}'
      AND ({cs_where})
    """

    try:
        df_revenue = ch_client.query_df(query_revenue)
        print(f"   Revenue table: {len(df_revenue)} emails")

        df_cs = ch_client.query_df(query_cs)
        print(f"   CS table: {len(df_cs)} emails")

        df = _combine_and_dedup(df_revenue, df_cs)

    except Exception as e:
        # Failures are reported to the caller as an error dict, not raised.
        print(f"❌ Query failed: {e}")
        return {'error': str(e)}
    finally:
        # Single close on every path (success, failure, early return).
        ch_client.close()

    if df.empty:
        print("⚠️ No emails found")
        # 'threads' is kept for existing callers; 'total_threads' matches the
        # key used in the full metadata dict.
        return {'total_raw': 0, 'total_clean': 0, 'threads': 0, 'total_threads': 0}

    emails = df.to_dict('records')

    # Step 1: Filter noise
    print(f"\n🧹 Step 1: Filtering noise...")
    noise_filtered = []
    excluded_noise = {'count': 0, 'reasons': defaultdict(int)}

    for email in emails:
        skip, reason = should_exclude(email)
        if skip:
            excluded_noise['count'] += 1
            excluded_noise['reasons'][reason] += 1
        else:
            noise_filtered.append(email)

    print(f"   After noise filter: {len(noise_filtered)} emails")
    print(f"   Excluded (noise): {excluded_noise['count']}")

    # Step 2: Validate client context
    print(f"\n🎯 Step 2: Validating client context...")
    clean_emails = []
    excluded_context = {'count': 0, 'reasons': defaultdict(int)}

    for email in noise_filtered:
        is_valid, reason = validate_client_context(
            email, primary_domains, associated_domains, context_keywords
        )
        if is_valid:
            clean_emails.append(email)
        else:
            excluded_context['count'] += 1
            excluded_context['reasons'][reason] += 1
            # Log only the first few rejected emails for debugging
            if excluded_context['count'] <= 5:
                subj = (email.get('gmail_msg_subject') or '')[:50]
                from_e = email.get('gmail_msg_from_email', '')
                print(f"   ❌ Rejected: {subj}... (from: {from_e}) - {reason}")

    print(f"   After context validation: {len(clean_emails)} emails")
    print(f"   Excluded (wrong context): {excluded_context['count']}")

    # Group by thread (fall back to the message id when there is no thread id)
    threads = defaultdict(list)
    for email in clean_emails:
        thread_id = email.get('gmail_thread_id') or email['gmail_msg_id']
        threads[thread_id].append(email)

    # Sort each thread chronologically (dates are strings from toString())
    for thread_emails in threads.values():
        thread_emails.sort(key=lambda x: str(x.get('gmail_msg_date', '')))

    print(f"   Grouped into {len(threads)} threads")

    # Determine output directory
    if output_dir is None:
        # Auto-detect from client_cases
        client_slug = safe_filename(client_name)
        output_dir = PROJECT_ROOT / 'client_cases' / f'emails_{client_slug}'

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Clear existing .md files (but keep metadata)
    for stale_file in output_dir.glob('*.md'):
        stale_file.unlink()

    # Save each thread as markdown
    print(f"\n📁 Saving to: {output_dir}")

    for thread_emails in threads.values():
        first_email = thread_emails[0]

        date_str = str(first_email['gmail_msg_date'])[:10].replace('-', '_')
        subject = safe_filename(first_email['gmail_msg_subject'] or 'no_subject')

        filepath = output_dir / f"{date_str}_{subject}.md"

        # Handle duplicates: append _1, _2, ... until the name is free
        i = 1
        while filepath.exists():
            filepath = output_dir / f"{date_str}_{subject}_{i}.md"
            i += 1

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(_render_thread_markdown(thread_emails))

    # Save metadata JSON
    metadata = {
        'client': client_name,
        'primary_domains': primary_domains,
        'associated_domains': associated_domains,
        'context_keywords': context_keywords,
        'sources': [
            # Names the tables that are actually queried above.
            'internal_analytics_src.src_mv_google_gmail_revenue_messages (full body)',
            'internal_analytics_src.src_mv_google_gmail_cs_messages (57% body)'
        ],
        'source_note': 'Combined Revenue + CS tables with deduplication for max body coverage',
        'period_start': start_date,
        'period_end': datetime.now().isoformat(),
        'total_raw': len(emails),
        'after_noise_filter': len(noise_filtered),
        'total_clean': len(clean_emails),
        'total_threads': len(threads),
        'excluded_noise': excluded_noise['count'],
        'excluded_context': excluded_context['count'],
        'excluded_reasons': {
            'noise': dict(excluded_noise['reasons']),
            'context': dict(excluded_context['reasons'])
        },
        'output_dir': str(output_dir),
        'extracted_at': datetime.now().isoformat(),
        'version': '2.6.0'
    }

    meta_file = output_dir / '_metadata.json'
    with open(meta_file, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)

    # Summary
    print(f"\n" + "=" * 80)
    print("📊 EXTRACTION COMPLETE")
    print("=" * 80)
    print(f"   Raw emails:           {len(emails)}")
    print(f"   After noise filter:   {len(noise_filtered)}")
    print(f"   After context check:  {len(clean_emails)}")
    print(f"   Threads:              {len(threads)}")
    print(f"   Output:               {output_dir}")

    return metadata


def main():
    """Parse command-line arguments and run extract_client_emails().

    --primary, --associated and --keywords take comma-separated lists. Items
    are stripped of surrounding whitespace and empty items are dropped, so
    "kp.org, omc.com" and a trailing comma behave as expected. An option that
    ends up with no items is treated as not given.
    """
    parser = argparse.ArgumentParser(description='Extract client emails from ClickHouse')
    parser.add_argument('client_name', help='Client name to search')
    parser.add_argument('--primary', '-p', help='Primary client domains (comma-separated)')
    parser.add_argument('--associated', '-a', help='Associated agency domains (comma-separated)')
    parser.add_argument('--keywords', '-k', help='Context keywords (comma-separated)')
    parser.add_argument('--output', '-o', help='Output directory')
    parser.add_argument('--months', '-m', type=int, default=DEFAULT_MONTHS_BACK,
                        help='Months back to search')

    args = parser.parse_args()

    def _split_csv(raw):
        # None / '' → None; otherwise trimmed non-empty items, or None if none remain.
        if not raw:
            return None
        items = [item.strip() for item in raw.split(',')]
        return [item for item in items if item] or None

    extract_client_emails(
        client_name=args.client_name,
        primary_domains=_split_csv(args.primary),
        associated_domains=_split_csv(args.associated),
        context_keywords=_split_csv(args.keywords),
        output_dir=Path(args.output) if args.output else None,
        months_back=args.months
    )


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
