Agent Skills: Firecrawl Reliability Patterns

ID: jeremylongshore/claude-code-plugins-plus-skills/firecrawl-reliability-patterns

Install this agent skill to your local environment:

pnpm dlx add-skill https://github.com/jeremylongshore/claude-code-plugins-plus-skills/tree/HEAD/plugins/saas-packs/firecrawl-pack/skills/firecrawl-reliability-patterns

Skill Files

Browse the full folder contents for firecrawl-reliability-patterns.

plugins/saas-packs/firecrawl-pack/skills/firecrawl-reliability-patterns/SKILL.md

Skill Metadata

Name
firecrawl-reliability-patterns
Description
Production reliability patterns for Firecrawl scraping pipelines.

Firecrawl Reliability Patterns

Overview

Production reliability patterns for Firecrawl scraping pipelines. Firecrawl's async crawl model, JS rendering, and credit-based pricing create specific reliability challenges: crawl jobs may time out, scraped content may be empty (bot detection, JS failures), and credits can be burned by runaway crawls. This skill covers battle-tested patterns for each.

Instructions

Step 1: Robust Crawl with Timeout and Backoff

import FirecrawlApp from "@mendable/firecrawl-js";

const firecrawl = new FirecrawlApp({
  apiKey: process.env.FIRECRAWL_API_KEY!,
});

async function reliableCrawl(
  url: string,
  opts: { limit: number; paths?: string[] },
  timeoutMs = 600000
) {
  const job = await firecrawl.asyncCrawlUrl(url, {
    limit: opts.limit,
    includePaths: opts.paths,
    scrapeOptions: { formats: ["markdown"], onlyMainContent: true },
  });

  const deadline = Date.now() + timeoutMs;
  let pollInterval = 2000;

  while (Date.now() < deadline) {
    const status = await firecrawl.checkCrawlStatus(job.id);

    if (status.status === "completed") return status;
    if (status.status === "failed") {
      throw new Error(`Crawl failed: ${status.error}`);
    }

    await new Promise(r => setTimeout(r, pollInterval));
    pollInterval = Math.min(pollInterval * 1.5, 30000); // back off to 30s max
  }

  throw new Error(`Crawl timed out after ${timeoutMs}ms (job: ${job.id})`);
}
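
reliableCrawl throws on timeout or hard failure. If those failures are transient (network blips, 5xx responses from the API), a thin retry wrapper can be layered on top. This is a minimal sketch; the attempt count and backoff schedule are assumptions, not Firecrawl recommendations.

async function crawlWithRetries(
  url: string,
  opts: { limit: number; paths?: string[] },
  attempts = 3
) {
  let lastError: Error | undefined;
  for (let i = 0; i < attempts; i++) {
    try {
      return await reliableCrawl(url, opts);
    } catch (e) {
      lastError = e as Error;
      // Exponential backoff between whole-crawl attempts: 5s, 10s, 20s
      await new Promise(r => setTimeout(r, 5000 * 2 ** i));
    }
  }
  throw lastError ?? new Error("crawlWithRetries: no attempts made");
}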

Step 2: Content Quality Validation

interface ScrapedPage {
  url: string;
  markdown: string;
  metadata: { title?: string; statusCode?: number };
}

function validateContent(page: ScrapedPage): {
  valid: boolean;
  reason?: string;
} {
  if (!page.markdown || page.markdown.length < 100) {
    return { valid: false, reason: "Content too short" };
  }

  if (page.metadata.statusCode && page.metadata.statusCode >= 400) {
    return { valid: false, reason: `HTTP ${page.metadata.statusCode}` };
  }

  const errorPatterns = [
    "access denied", "403 forbidden", "page not found",
    "captcha", "please verify", "enable javascript",
  ];
  const lower = page.markdown.toLowerCase();
  for (const pattern of errorPatterns) {
    if (lower.includes(pattern)) {
      return { valid: false, reason: `Error page detected: "${pattern}"` };
    }
  }

  return { valid: true };
}
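
validateContent slots in between a completed crawl and downstream processing: keep the valid pages and collect the rest for a targeted retry. A small helper sketch, assuming pages have already been mapped into the ScrapedPage shape above:

function partitionCrawl(pages: ScrapedPage[]) {
  const valid: ScrapedPage[] = [];
  const retry: { url: string; reason: string }[] = [];
  for (const page of pages) {
    const check = validateContent(page);
    if (check.valid) valid.push(page);
    else retry.push({ url: page.url, reason: check.reason ?? "unknown" });
  }
  return { valid, retry };
}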

Step 3: Crawl-to-Scrape Fallback

// If a full crawl fails, fall back to scraping critical pages individually
async function resilientFetch(urls: string[]): Promise<any[]> {
  // Try batch scrape first (most efficient)
  try {
    const batch = await firecrawl.batchScrapeUrls(urls, {
      formats: ["markdown"],
      onlyMainContent: true,
    });

    const results = (batch.data || []).filter(page => {
      const { valid } = validateContent({
        url: page.metadata?.sourceURL || "",
        markdown: page.markdown || "",
        metadata: page.metadata || {},
      });
      return valid;
    });

    if (results.length >= urls.length * 0.5) {
      return results; // batch succeeded (>50% valid)
    }
  } catch (batchError) {
    console.warn("Batch scrape failed, falling back to individual scrapes");
  }

  // Fallback: scrape individually with retries
  const results: any[] = [];
  for (const url of urls) {
    try {
      const result = await firecrawl.scrapeUrl(url, {
        formats: ["markdown"],
        onlyMainContent: true,
        waitFor: 5000,
      });
      if (validateContent({ url, markdown: result.markdown || "", metadata: result.metadata || {} }).valid) {
        results.push(result);
      }
    } catch (e) {
      console.error(`Failed to scrape ${url}: ${(e as Error).message}`);
    }
    // Delay between individual scrapes to avoid rate limits
    await new Promise(r => setTimeout(r, 1000));
  }

  return results;
}
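
Pages that fail validation because of slow JS rendering can sometimes be recovered by re-scraping with a longer waitFor. A sketch of that escalation; the specific wait values are assumptions, not Firecrawl guidance:

async function scrapeWithEscalation(url: string) {
  // Re-scrape with progressively longer JS-render waits before giving up.
  for (const waitFor of [2000, 5000, 10000]) {
    const result = await firecrawl.scrapeUrl(url, {
      formats: ["markdown"],
      onlyMainContent: true,
      waitFor,
    });
    const check = validateContent({
      url,
      markdown: result.markdown || "",
      metadata: result.metadata || {},
    });
    if (check.valid) return result;
  }
  return null; // still invalid after all attempts
}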

Step 4: Circuit Breaker for Firecrawl

class FirecrawlCircuitBreaker {
  private failures = 0;
  private lastFailure = 0;
  private state: "closed" | "open" | "half-open" = "closed";
  private threshold: number;
  private resetTimeMs: number;

  constructor(threshold = 5, resetTimeMs = 60000) {
    this.threshold = threshold;
    this.resetTimeMs = resetTimeMs;
  }

  async execute<T>(operation: () => Promise<T>, fallback?: () => T): Promise<T> {
    // Check if circuit should reset
    if (this.state === "open" && Date.now() - this.lastFailure > this.resetTimeMs) {
      this.state = "half-open";
    }

    if (this.state === "open") {
      console.warn("Circuit breaker OPEN — using fallback");
      if (fallback) return fallback();
      throw new Error("Firecrawl circuit breaker is open");
    }

    try {
      const result = await operation();
      if (this.state === "half-open") {
        this.state = "closed";
        this.failures = 0;
      }
      return result;
    } catch (error) {
      this.failures++;
      this.lastFailure = Date.now();
      if (this.failures >= this.threshold) {
        this.state = "open";
        console.error(`Circuit breaker OPENED after ${this.failures} failures`);
      }
      throw error;
    }
  }
}

const breaker = new FirecrawlCircuitBreaker(5, 60000);

async function protectedScrape(url: string) {
  return breaker.execute(
    () => firecrawl.scrapeUrl(url, { formats: ["markdown"] }),
    () => ({ markdown: getCachedContent(url), metadata: { fromCache: true } })
  );
}
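
The fallback above calls getCachedContent, which is left undefined. A minimal in-memory stand-in is sketched below; production code would likely back it with Redis or disk and populate it after each successful scrape.

// Hypothetical in-memory cache backing the circuit-breaker fallback.
const contentCache = new Map<string, string>();

function getCachedContent(url: string): string {
  return contentCache.get(url) ?? "";
}

// Call this after every successful scrape so the fallback has data.
function setCachedContent(url: string, markdown: string) {
  contentCache.set(url, markdown);
}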

Step 5: Credit-Aware Processing

class CreditGuard {
  private dailyUsage = new Map<string, number>();
  private dailyLimit: number;

  constructor(dailyLimit = 5000) {
    this.dailyLimit = dailyLimit;
  }

  canAfford(credits: number): boolean {
    const today = new Date().toISOString().split("T")[0];
    return (this.dailyUsage.get(today) || 0) + credits <= this.dailyLimit;
  }

  record(credits: number) {
    const today = new Date().toISOString().split("T")[0];
    this.dailyUsage.set(today, (this.dailyUsage.get(today) || 0) + credits);
  }

  remaining(): number {
    const today = new Date().toISOString().split("T")[0];
    return this.dailyLimit - (this.dailyUsage.get(today) || 0);
  }
}

const creditGuard = new CreditGuard(5000);

async function budgetedCrawl(url: string, limit: number) {
  if (!creditGuard.canAfford(limit)) {
    throw new Error(`Budget exceeded: ${creditGuard.remaining()} credits remaining`);
  }

  const result = await reliableCrawl(url, { limit });
  creditGuard.record(result.data?.length || 0);
  return result;
}
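
CreditGuard keeps its counters in memory, so a process restart silently resets the daily budget. Below is a sketch of simple file-based persistence; the file path is an assumption, and any durable store (Redis, a database) works the same way.

import { existsSync, readFileSync, writeFileSync } from "node:fs";

const USAGE_FILE = "/tmp/firecrawl-credit-usage.json"; // illustrative path

function loadUsage(): Record<string, number> {
  return existsSync(USAGE_FILE)
    ? JSON.parse(readFileSync(USAGE_FILE, "utf8"))
    : {};
}

function saveUsage(usage: Record<string, number>) {
  writeFileSync(USAGE_FILE, JSON.stringify(usage));
}

Wiring these into CreditGuard (seeding its map in the constructor, saving inside record()) is enough to survive restarts.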

Error Handling

| Issue | Cause | Solution |
|-------|-------|----------|
| Crawl timeout | Large site, slow rendering | Set timeout, reduce limit |
| Empty markdown | Bot detection or JS failure | Increase waitFor, use actions |
| Credit overrun | No budget tracking | Implement credit guard |
| Cascade failures | Single scrape failure crashes pipeline | Circuit breaker + fallback |
| Partial crawl results | Some pages blocked | Validate content, retry failed URLs |

Examples

Full Resilient Pipeline

async function resilientPipeline(url: string) {
  const map = await firecrawl.mapUrl(url);
  const urls = (map.links || []).filter(u => u.includes("/docs/")).slice(0, 50);

  if (!creditGuard.canAfford(urls.length)) {
    console.warn("Budget tight — reducing scope");
    urls.splice(20); // trim to 20
  }

  const pages = await resilientFetch(urls);
  // Re-check defensively; resilientFetch returns raw SDK objects, so map
  // them into the ScrapedPage shape validateContent expects.
  const valid = pages.filter(p =>
    validateContent({
      url: p.metadata?.sourceURL || "",
      markdown: p.markdown || "",
      metadata: p.metadata || {},
    }).valid
  );
  creditGuard.record(urls.length);

  return { scraped: urls.length, valid: valid.length, remaining: creditGuard.remaining() };
}
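
Putting it together (the URL is illustrative):

const report = await resilientPipeline("https://example.com");
console.log(
  `scraped=${report.scraped} valid=${report.valid} creditsLeft=${report.remaining}`
);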

Next Steps

For policy enforcement, see firecrawl-policy-guardrails.