# Apify SDK Patterns

## Overview
Production patterns for both the `apify` SDK (building Actors) and `apify-client` (calling Actors remotely). Covers Crawlee crawler selection, data storage, proxy configuration, and typed client wrappers.
## Prerequisites

- `apify-client` and/or `apify` + `crawlee` installed
- `APIFY_TOKEN` configured
- TypeScript recommended
## Pattern 1: Typed Client Singleton

```typescript
// src/apify/client.ts
import { ApifyClient } from 'apify-client';

let instance: ApifyClient | null = null;

export function getApifyClient(): ApifyClient {
  if (!instance) {
    const token = process.env.APIFY_TOKEN;
    if (!token) throw new Error('APIFY_TOKEN is required');
    instance = new ApifyClient({ token });
  }
  return instance;
}

// Reset the singleton between tests
export function resetClient(): void {
  instance = null;
}
```
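A minimal usage sketch; `user('me').get()` is a cheap way to verify the token works:

```typescript
import { getApifyClient } from './apify/client';

// Every caller shares the same authenticated client
const client = getApifyClient();
const me = await client.user('me').get();
console.log(`Authenticated as ${me?.username}`);
```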
## Pattern 2: Crawlee Crawler Selection

Choose the right crawler for the job:

```typescript
import { Actor } from 'apify';
import { CheerioCrawler, PlaywrightCrawler, PuppeteerCrawler } from 'crawlee';

// CHEERIO — Fast, lightweight, no JavaScript rendering
// Use for: static HTML, server-rendered pages, APIs
const cheerioCrawler = new CheerioCrawler({
  async requestHandler({ request, $, enqueueLinks }) {
    const title = $('title').text();
    await Actor.pushData({ url: request.url, title });
    await enqueueLinks({ strategy: 'same-domain' });
  },
});

// PLAYWRIGHT — Full browser, all engines, modern API
// Use for: SPAs, JavaScript-heavy pages, complex interactions
const playwrightCrawler = new PlaywrightCrawler({
  launchContext: { launchOptions: { headless: true } },
  async requestHandler({ page, request, enqueueLinks }) {
    await page.waitForSelector('h1');
    const title = await page.title();
    const content = await page.$eval('main', (el) => el.textContent);
    await Actor.pushData({ url: request.url, title, content });
    await enqueueLinks({ strategy: 'same-domain' });
  },
});

// PUPPETEER — Chromium-only browser automation
// Use for: jobs that need Chromium specifically, or legacy Puppeteer code
const puppeteerCrawler = new PuppeteerCrawler({
  async requestHandler({ page, request }) {
    const title = await page.title();
    await Actor.pushData({ url: request.url, title });
  },
});
```
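When it is unclear whether a site needs a browser, one pragmatic approach is to probe with Cheerio first and collect pages whose static HTML lacks the expected content. A sketch; the `.product-card` selector is an assumption:

```typescript
import { CheerioCrawler } from 'crawlee';

// Pages where the target selector never appears in static HTML
// likely require JavaScript rendering.
const needsBrowser: string[] = [];

const probe = new CheerioCrawler({
  async requestHandler({ request, $ }) {
    if ($('.product-card').length === 0) {
      needsBrowser.push(request.url);
    }
  },
});

await probe.run(['https://example.com/products']);
// If non-empty, rerun `needsBrowser` with a PlaywrightCrawler
```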
## Pattern 3: Actor Lifecycle with Error Handling

```typescript
import { Actor } from 'apify';
import { CheerioCrawler, log } from 'crawlee';

// Actor.main() wraps init + exit + error handling
await Actor.main(async () => {
  const input = await Actor.getInput<{
    startUrls: { url: string }[];
    maxPages?: number;
    proxyConfig?: { useApifyProxy: boolean; groups?: string[] };
  }>();

  if (!input?.startUrls?.length) {
    throw new Error('Input must include at least one startUrl');
  }

  // Configure proxy if requested
  const proxyConfiguration = input.proxyConfig?.useApifyProxy
    ? await Actor.createProxyConfiguration({
        groups: input.proxyConfig.groups,
      })
    : undefined;

  const crawler = new CheerioCrawler({
    proxyConfiguration,
    maxRequestsPerCrawl: input.maxPages ?? 50,
    maxConcurrency: 10,
    async requestHandler({ request, $, enqueueLinks }) {
      log.info(`Processing ${request.url}`);
      await Actor.pushData({
        url: request.url,
        title: $('title').text().trim(),
        h1: $('h1').first().text().trim(),
        paragraphs: $('p').map((_, el) => $(el).text().trim()).get(),
      });
      await enqueueLinks({ strategy: 'same-domain' });
    },
    async failedRequestHandler({ request }, error) {
      log.error(`Request failed: ${request.url}`, { error: error.message });
      await Actor.pushData({
        url: request.url,
        error: error.message,
        '#isFailed': true, // '#'-prefixed fields are treated as hidden in Apify datasets
      });
    },
  });

  await crawler.run(input.startUrls.map((s) => s.url));
  log.info(`Crawler finished. ${crawler.stats.state.requestsFinished} pages processed.`);
});
```
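The `#isFailed` marker lets a consumer split successes from failures after the run. A minimal sketch with `apify-client` (the dataset ID is a placeholder):

```typescript
import { ApifyClient } from 'apify-client';

const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

const { items } = await client.dataset('DATASET_ID').listItems();

// Separate the bookkeeping records written by failedRequestHandler
const failed = items.filter((item) => item['#isFailed'] === true);
const succeeded = items.filter((item) => !item['#isFailed']);
console.log(`${succeeded.length} ok, ${failed.length} failed`);
```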
## Pattern 4: Dataset Operations

```typescript
import { Actor } from 'apify';
import { ApifyClient } from 'apify-client';

// --- Inside an Actor (apify SDK) ---

// Push a single item
await Actor.pushData({ url: 'https://example.com', title: 'Example' });

// Push a batch
await Actor.pushData([
  { url: 'https://a.com', price: 10 },
  { url: 'https://b.com', price: 20 },
]);

// Store named output in the key-value store
await Actor.setValue('SUMMARY', {
  totalItems: 100,
  avgPrice: 15.5,
  crawledAt: new Date().toISOString(),
});

// Get the value back
const summary = await Actor.getValue('SUMMARY');

// --- From an external app (apify-client) ---
const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

// List dataset items with pagination
const { items, total } = await client
  .dataset('DATASET_ID')
  .listItems({ limit: 1000, offset: 0 });

// Push items to a named dataset
const dataset = await client.datasets().getOrCreate('my-results');
await client.dataset(dataset.id).pushItems([
  { url: 'https://example.com', data: 'scraped content' },
]);

// Download the entire dataset
const csv = await client.dataset(dataset.id).downloadItems('csv');
const json = await client.dataset(dataset.id).downloadItems('json');
```
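For datasets larger than one page, keep requesting until a page comes back short. A sketch under the same placeholder dataset ID:

```typescript
import { ApifyClient } from 'apify-client';

const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

// Page through a large dataset 1000 items at a time
const all: unknown[] = [];
const limit = 1000;
let offset = 0;

for (;;) {
  const { items } = await client
    .dataset('DATASET_ID')
    .listItems({ limit, offset });
  all.push(...items);
  if (items.length < limit) break; // short page = last page
  offset += limit;
}

console.log(`Fetched ${all.length} items total`);
```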
## Pattern 5: Key-Value Store Operations

```typescript
import { ApifyClient } from 'apify-client';

const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

// Create or get a named store
const store = await client.keyValueStores().getOrCreate('my-config');
const storeClient = client.keyValueStore(store.id);

// Set a record (any content type)
await storeClient.setRecord({
  key: 'CONFIG',
  value: { retries: 3, timeout: 30000 },
  contentType: 'application/json',
});

// Get a record
const record = await storeClient.getRecord('CONFIG');
console.log(record?.value); // { retries: 3, timeout: 30000 }

// Store binary data (screenshots, PDFs)
// `screenshotBuffer` is a Buffer obtained elsewhere, e.g. from page.screenshot()
await storeClient.setRecord({
  key: 'screenshot.png',
  value: screenshotBuffer,
  contentType: 'image/png',
});

// List all keys
const { items: keys } = await storeClient.listKeys();
```
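Inside an Actor, the same store is reachable through `Actor.setValue` / `Actor.getValue`; binary payloads take an explicit content type. A short sketch (the buffer is a stand-in for real output such as `page.screenshot()`):

```typescript
import { Actor } from 'apify';

await Actor.main(async () => {
  // Stand-in buffer; in practice this comes from page.screenshot(), page.pdf(), etc.
  const pdfBuffer = Buffer.from('%PDF-1.4 ...');

  // Binary values need an explicit content type
  await Actor.setValue('report.pdf', pdfBuffer, { contentType: 'application/pdf' });

  // JSON round-trips without one
  await Actor.setValue('STATE', { processed: 42 });
  const state = await Actor.getValue<{ processed: number }>('STATE');
  console.log(state?.processed); // 42
});
```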
## Pattern 6: Proxy Configuration

```typescript
import { Actor } from 'apify';
import { CheerioCrawler } from 'crawlee';

// Datacenter proxy (included in subscription, fast)
const dcProxy = await Actor.createProxyConfiguration({
  groups: ['BUYPROXIES94952'],
});

// Residential proxy (pay per GB, high success rate)
const resProxy = await Actor.createProxyConfiguration({
  groups: ['RESIDENTIAL'],
  countryCode: 'US',
});

// Google SERP proxy (specialized for Google)
const serpProxy = await Actor.createProxyConfiguration({
  groups: ['GOOGLE_SERP'],
});

// Use with any crawler
const crawler = new CheerioCrawler({
  proxyConfiguration: dcProxy,
  // ...
});
```
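A proxy configuration is also usable outside crawlers: `newUrl()` hands out a proxy URL, and passing a session ID keeps the same IP across calls. A short sketch:

```typescript
import { Actor } from 'apify';

const proxyConfiguration = await Actor.createProxyConfiguration({
  groups: ['RESIDENTIAL'],
});

// Rotating URL: a fresh proxy on each call
const rotatingUrl = await proxyConfiguration?.newUrl();

// Sticky session: the same IP for every call with this ID
const stickyUrl = await proxyConfiguration?.newUrl('session_42');

// Feed either into any HTTP client that accepts a proxy URL
console.log(rotatingUrl, stickyUrl);
```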
## Pattern 7: Router for Multi-Page Actors

```typescript
import { Actor } from 'apify';
import { CheerioCrawler, createCheerioRouter } from 'crawlee';

const router = createCheerioRouter();

// Default route — listing pages
router.addDefaultHandler(async ({ $, enqueueLinks }) => {
  // Extract links to detail pages
  const detailLinks = $('a.product-link')
    .map((_, el) => $(el).attr('href'))
    .get();
  await enqueueLinks({
    urls: detailLinks,
    label: 'DETAIL',
  });
});

// Detail route — individual item pages
router.addHandler('DETAIL', async ({ request, $ }) => {
  await Actor.pushData({
    url: request.url,
    name: $('h1.product-name').text().trim(),
    price: parseFloat($('.price').text().replace('$', '')),
    description: $('div.description').text().trim(),
  });
});

await Actor.main(async () => {
  const crawler = new CheerioCrawler({
    requestHandler: router,
  });
  await crawler.run(['https://example-store.com/products']);
});
```
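Start requests can carry labels too, so a run can begin directly on detail pages (reusing the `crawler` from the example above; the URLs are placeholders):

```typescript
// Plain strings go to the default (listing) handler; labeled
// requests are routed straight to the matching handler.
await crawler.run([
  'https://example-store.com/products',
  { url: 'https://example-store.com/products/widget-1', label: 'DETAIL' },
]);
```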
## Pattern 8: Safe Result Wrapper

```typescript
import { ApifyClient } from 'apify-client';

type Result<T> = { data: T; error: null } | { data: null; error: Error };

async function safeActorCall<T>(
  client: ApifyClient,
  actorId: string,
  input: Record<string, unknown>,
): Promise<Result<T[]>> {
  try {
    const run = await client.actor(actorId).call(input, { timeout: 300 });
    if (run.status !== 'SUCCEEDED') {
      return { data: null, error: new Error(`Run ${run.status}: ${run.statusMessage}`) };
    }
    const { items } = await client.dataset(run.defaultDatasetId).listItems();
    return { data: items as T[], error: null };
  } catch (err) {
    return { data: null, error: err as Error };
  }
}

// Usage
const client = new ApifyClient({ token: process.env.APIFY_TOKEN });
const result = await safeActorCall<{ url: string; title: string }>(
  client, 'apify/web-scraper', { startUrls: [{ url: 'https://example.com' }] }
);
if (result.error) {
  console.error('Actor call failed:', result.error.message);
} else {
  console.log(`Got ${result.data.length} items`);
}
```
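For transient failures such as rate limits or timeouts, the wrapper composes with a retry loop. A sketch building on `safeActorCall` above (the attempt count and backoff delays are assumptions):

```typescript
async function safeActorCallWithRetry<T>(
  client: ApifyClient,
  actorId: string,
  input: Record<string, unknown>,
  maxAttempts = 3,
): Promise<Result<T[]>> {
  let last: Result<T[]> = { data: null, error: new Error('not attempted') };
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    last = await safeActorCall<T>(client, actorId, input);
    if (!last.error) return last;
    // Exponential backoff: 1s, 2s, 4s, ...
    await new Promise((r) => setTimeout(r, 1000 * 2 ** (attempt - 1)));
  }
  return last;
}
```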
## Error Handling

| Pattern | Use Case | Benefit |
|---------|----------|---------|
| `Actor.main()` | Actor entry point | Auto init/exit + error reporting |
| `failedRequestHandler` | Per-request failures | Log failures without stopping the crawl |
| Safe wrapper | External calls | Prevents uncaught exceptions |
| Router | Multi-page scrapes | Clean separation of page types |
| Proxy rotation | Anti-bot sites | Higher success rate |
## Next Steps

Apply these patterns in `apify-core-workflow-a` for a complete web scraping workflow.