Apify Reference Architecture
Overview
Production-ready architecture patterns for applications built on Apify. Covers standalone Actor projects, multi-Actor pipelines, and full-stack applications that integrate Apify as a data source.
Architecture Pattern 1: Standalone Actor
For a single scraper deployed to the Apify platform.
my-scraper/
├── .actor/
│ ├── actor.json # Actor metadata
│ ├── INPUT_SCHEMA.json # Input definition (generates UI)
│ └── Dockerfile # Build configuration
├── src/
│ ├── main.ts # Entry point (Actor.main)
│ ├── routes/
│ │ ├── listing.ts # Router handler: listing pages
│ │ └── detail.ts # Router handler: detail pages
│ ├── types.ts # Input/output TypeScript types
│ └── utils/
│ ├── extractors.ts # Data extraction functions
│ └── validators.ts # Input/output validation
├── tests/
│ ├── extractors.test.ts # Unit tests for extraction logic
│ └── integration.test.ts # Integration tests (live API)
├── storage/ # Local storage (git-ignored)
├── package.json
├── tsconfig.json
└── .gitignore
Key Files
// src/main.ts — Actor entry point
import { Actor } from 'apify';
import { CheerioCrawler } from 'crawlee';
import { router } from './routes/listing';
import { validateInput, ScraperInput } from './types';
// Actor entry point: validate the input, set up the optional proxy, then
// crawl every start URL with the shared router as the request handler.
await Actor.main(async () => {
  // validateInput parses the raw input, so invalid input throws before any crawling starts.
  const input = validateInput(await Actor.getInput<ScraperInput>());

  // A proxy configuration is only created when the input opts into Apify Proxy;
  // otherwise the crawler receives `undefined`.
  let proxyConfiguration;
  if (input.proxyConfig?.useApifyProxy) {
    proxyConfiguration = await Actor.createProxyConfiguration({
      groups: input.proxyConfig.groups,
    });
  }

  // The crawler is started with plain URL strings, not `{ url }` objects.
  const startUrls = input.startUrls.map(({ url }) => url);

  const crawler = new CheerioCrawler({
    requestHandler: router,
    proxyConfiguration,
    maxRequestsPerCrawl: input.maxItems ?? 100,
    maxConcurrency: input.concurrency ?? 10,
  });

  await crawler.run(startUrls);
});
// src/types.ts — Shared types and validation
import { z } from 'zod';
/** A single start URL entry; `url` must pass zod's URL check. */
const StartUrlSchema = z.object({ url: z.string().url() });

/** Proxy settings: the on/off flag is required, the group list is optional. */
const ProxyConfigSchema = z.object({
  useApifyProxy: z.boolean(),
  groups: z.array(z.string()).optional(),
});

/**
 * Runtime schema for the Actor input.
 *
 * - `startUrls`: at least one entry.
 * - `maxItems`: positive integer, defaults to 100.
 * - `concurrency`: integer from 1 to 50, defaults to 10.
 * - `proxyConfig`: may be omitted entirely.
 */
export const InputSchema = z.object({
  startUrls: z.array(StartUrlSchema).min(1),
  maxItems: z.number().int().positive().optional().default(100),
  concurrency: z.number().int().min(1).max(50).optional().default(10),
  proxyConfig: ProxyConfigSchema.optional(),
});

/** Input type inferred from `InputSchema`. */
export type ScraperInput = z.infer<typeof InputSchema>;
/**
 * Checks an untrusted input value against `InputSchema`.
 *
 * @param raw Value of unknown shape, e.g. the raw Actor input.
 * @returns The parsed input as produced by `InputSchema.parse`.
 * @throws Whatever `InputSchema.parse` throws when `raw` does not match the schema.
 */
export function validateInput(raw: unknown): ScraperInput {
  const parsed = InputSchema.parse(raw);
  return parsed;
}
/** Shape of one scraped product record. */
export interface ProductOutput {
// Address of the product — presumably the page it was scraped from; confirm against the route handlers.
url: string;
// Product name.
name: string;
// Price as a number, or null when none is available.
price: number | null;
// Currency for `price` — presumably a currency code; format is not enforced here, confirm.
currency: string;
// Whether the product is in stock.
inStock: boolean;
// Scrape time as a string — presumably an ISO-8601 timestamp; TODO confirm.
scrapedAt: string;
}
Architecture Pattern 2: Multi-Actor Pipeline
For complex scraping workflows with multiple stages.
┌──────────────┐ ┌──────────────┐ ┌──────────────┐
│ Discover │────▶│ Scrape │────▶│ Transform │
│ Actor │ │ Actor │ │ Actor │
│ │ │ │ │ │
│ Finds URLs │ │ Extracts │ │ Dedup, │
│ to scrape │ │ raw data │ │ clean, │
│ │ │ │ │ enrich │
└──────┬───────┘ └──────┬───────┘ └──────┬───────┘
│ │ │
▼ ▼ ▼
Request Queue Dataset A Dataset B
(URLs to scrape) (raw data) (clean data)
Pipeline Orchestrator
// pipeline/orchestrator.ts
import { ApifyClient } from 'apify-client';
// Module-wide Apify API client, authenticated with the APIFY_TOKEN environment variable.
const apifyToken = process.env.APIFY_TOKEN;
const client = new ApifyClient({ token: apifyToken });
/** Settings consumed by runPipeline. */
interface PipelineConfig {
// Actor started in stage 1 (URL discovery).
discoverActorId: string;
// Actor started in stage 2 (scraping the discovered URLs).
scrapeActorId: string;
// Actor started in stage 3 (transform / dedup of the scrape dataset).
transformActorId: string;
// Passed to the discover Actor as its `seedUrls` input.
seedUrls: string[];
// Passed to the scrape Actor as its `maxItems` input.
maxItems: number;
}
/**
 * Runs the discover → scrape → transform pipeline and copies the cleaned
 * items into the named dataset `pipeline-output`.
 *
 * Every stage is started with `call()`, which resolves once the run has
 * finished regardless of its final status. The status is therefore checked
 * explicitly, so a failed, aborted or timed-out stage stops the pipeline
 * instead of handing partial data to the next stage.
 *
 * @param config Actor IDs for the three stages, the seed URLs for discovery
 *   and the item cap forwarded to the scrape Actor.
 * @returns Per-stage run ID, item count and cost (0 when the run reports no
 *   `usageTotalUsd`). When discovery yields no usable URL the later stages are
 *   skipped and their entries keep their zero values.
 * @throws Error when a stage finishes with a status other than `SUCCEEDED`.
 */
async function runPipeline(config: PipelineConfig) {
  const results = {
    discover: { runId: '', items: 0, cost: 0 },
    scrape: { runId: '', items: 0, cost: 0 },
    transform: { runId: '', items: 0, cost: 0 },
  };

  // Fail fast: a finished run is not necessarily a successful one.
  const assertSucceeded = (stage: string, run: { id: string; status: string }): void => {
    if (run.status !== 'SUCCEEDED') {
      throw new Error(
        `Pipeline ${stage} stage did not succeed: run ${run.id} finished with status ${run.status}`,
      );
    }
  };

  // Item counts come from the dataset metadata; the run's `stats` object does
  // not carry a dataset item count, so reading it there always yielded 0.
  const countItems = async (datasetId: string): Promise<number> => {
    const dataset = await client.dataset(datasetId).get();
    return dataset?.itemCount ?? 0;
  };

  // Stage 1: Discover URLs
  console.log('Stage 1: Discovering URLs...');
  const discoverRun = await client.actor(config.discoverActorId).call({
    seedUrls: config.seedUrls,
    maxPages: 50,
  });
  assertSucceeded('discover', discoverRun);

  const { items: discovered } = await client
    .dataset(discoverRun.defaultDatasetId)
    .listItems();

  // Keep only records that actually carry a string `url`; anything else would
  // reach the scrape Actor as `{ url: undefined }`.
  const startUrls = discovered.flatMap((item) => {
    const url = item.url;
    return typeof url === 'string' ? [{ url }] : [];
  });

  results.discover = {
    runId: discoverRun.id,
    items: startUrls.length,
    cost: discoverRun.usageTotalUsd ?? 0,
  };

  // Nothing to scrape: do not start (and pay for) runs that have no work.
  if (startUrls.length === 0) {
    console.log('No URLs discovered; skipping scrape and transform stages.');
    return results;
  }

  // Stage 2: Scrape each discovered URL
  console.log(`Stage 2: Scraping ${startUrls.length} URLs...`);
  const scrapeRun = await client.actor(config.scrapeActorId).call({
    startUrls,
    maxItems: config.maxItems,
  });
  assertSucceeded('scrape', scrapeRun);

  results.scrape = {
    runId: scrapeRun.id,
    items: await countItems(scrapeRun.defaultDatasetId),
    cost: scrapeRun.usageTotalUsd ?? 0,
  };

  // Stage 3: Transform and deduplicate
  console.log('Stage 3: Transforming...');
  const transformRun = await client.actor(config.transformActorId).call({
    sourceDatasetId: scrapeRun.defaultDatasetId,
    dedupField: 'url',
    filterEmpty: true,
  });
  assertSucceeded('transform', transformRun);

  results.transform = {
    runId: transformRun.id,
    items: await countItems(transformRun.defaultDatasetId),
    cost: transformRun.usageTotalUsd ?? 0,
  };

  // Store final results in named dataset
  const finalDs = await client.datasets().getOrCreate('pipeline-output');
  const { items: cleanData } = await client
    .dataset(transformRun.defaultDatasetId)
    .listItems();
  await client.dataset(finalDs.id).pushItems(cleanData);

  // Summary
  const totalCost = Object.values(results).reduce((sum, stage) => sum + stage.cost, 0);
  console.log('\n=== Pipeline Summary ===');
  console.log(`Discovered: ${results.discover.items} URLs`);
  console.log(`Scraped: ${results.scrape.items} items`);
  console.log(`Clean: ${results.transform.items} items`);
  console.log(`Total cost: $${totalCost.toFixed(4)}`);

  return results;
}
Architecture Pattern 3: Full-Stack Integration
For an application that uses Apify as a data source.
┌─────────────────────────────────────────────────────────┐
│ Your Application │
│ │
│ ┌─────────┐ ┌──────────────┐ ┌─────────────────┐ │
│ │ Frontend │──▶│ API Server │──▶│ Apify Service │ │
│ │ (React) │ │ (Express/ │ │ (apify-client) │ │
│ │ │◀──│ Next.js) │◀──│ │ │
│ └─────────┘ └──────┬───────┘ └────────┬─────────┘ │
│ │ │ │
│ ▼ ▼ │
│ ┌──────────┐ ┌────────────┐ │
│ │ Your DB │ │ Apify │ │
│ │ (results)│ │ Platform │ │
│ └──────────┘ └────────────┘ │
│ │
│ ┌──────────────────────────────────────────────────┐ │
│ │ Webhook Handler │ │
│ │ Receives run completion → saves results to DB │ │
│ └──────────────────────────────────────────────────┘ │
└─────────────────────────────────────────────────────────┘
Service Layer
// src/services/apify-service.ts
import { ApifyClient } from 'apify-client';
/**
 * Application-facing wrapper around `ApifyClient`: start a scrape, poll its
 * status, fetch its results and probe API availability.
 */
export class ApifyService {
  private client: ApifyClient;

  /** @param token Apify API token used to build the underlying client. */
  constructor(token: string) {
    this.client = new ApifyClient({ token });
  }

  /**
   * Starts the scraper Actor without waiting for it to finish.
   *
   * @param urls URLs handed to the Actor as `startUrls` (`{ url }` objects).
   * @returns The ID of the started run.
   */
  async startScrape(urls: string[]): Promise<{ runId: string }> {
    const run = await this.client.actor('username/scraper').start({
      startUrls: urls.map(url => ({ url })),
    });
    return { runId: run.id };
  }

  /**
   * Reports the current status of a run.
   *
   * @throws Error when no run with `runId` exists.
   */
  async getRunStatus(runId: string): Promise<{
    status: string;
    progress?: { finished: number; failed: number };
  }> {
    const run = await this.requireRun(runId);
    return {
      status: run.status,
      // NOTE(review): `requestsFinished` / `requestsFailed` do not appear to be
      // among the fields the client declares on run stats — confirm where these
      // counters come from; when absent both values fall back to 0.
      progress: {
        finished: run.stats?.requestsFinished ?? 0,
        failed: run.stats?.requestsFailed ?? 0,
      },
    };
  }

  /**
   * Returns the items of a run's default dataset.
   *
   * @throws Error when no run with `runId` exists, or when the run's status is
   *   anything other than `SUCCEEDED`.
   */
  async getResults<T>(runId: string): Promise<T[]> {
    const run = await this.requireRun(runId);
    if (run.status !== 'SUCCEEDED') {
      throw new Error(`Run not ready: ${run.status}`);
    }
    const { items } = await this.client
      .dataset(run.defaultDatasetId)
      .listItems();
    return items as T[];
  }

  /**
   * Best-effort availability probe: true when the current user can be fetched
   * and has a username; any error is reported as `false` rather than thrown.
   */
  async checkHealth(): Promise<boolean> {
    try {
      const user = await this.client.user().get();
      return !!user.username;
    } catch {
      return false;
    }
  }

  /**
   * Fetches a run and turns "not found" into an explicit error, so callers do
   * not dereference `undefined` when given an unknown or mistyped run ID.
   */
  private async requireRun(runId: string) {
    const run = await this.client.run(runId).get();
    if (!run) {
      throw new Error(`Run not found: ${runId}`);
    }
    return run;
  }
}
Configuration Management
// src/config/apify.ts
/** Apify settings produced by loadConfig. */
interface ApifyConfig {
// API token; loadConfig reads it from APIFY_TOKEN.
token: string;
// Actor identifier; loadConfig reads it from APIFY_ACTOR_ID.
actorId: string;
// Default memory for runs — presumably megabytes (values used are 256 / 512 / 1024); TODO confirm unit.
defaultMemory: number;
// Default run timeout — presumably seconds (values used are 300 / 3600); TODO confirm unit.
defaultTimeout: number;
// Optional webhook target; loadConfig only sets it in the production override, from APIFY_WEBHOOK_URL.
webhookUrl?: string;
}
/**
 * Builds the Apify configuration from environment variables and applies the
 * per-environment overrides selected by `NODE_ENV` (default: `development`).
 *
 * Required variables are validated here so a misconfigured deployment fails at
 * startup with a clear message instead of carrying an `undefined` token into
 * the first API call.
 *
 * @returns Base settings merged with the overrides for the current
 *   environment; an unrecognised `NODE_ENV` gets the base settings unchanged.
 * @throws Error naming every required variable (`APIFY_TOKEN`,
 *   `APIFY_ACTOR_ID`) that is missing or empty.
 */
export function loadConfig(): ApifyConfig {
  // `||` on purpose: an empty NODE_ENV is treated the same as an unset one.
  const env = process.env.NODE_ENV || 'development';

  const token = process.env.APIFY_TOKEN;
  const actorId = process.env.APIFY_ACTOR_ID;
  if (!token || !actorId) {
    const missing = [!token && 'APIFY_TOKEN', !actorId && 'APIFY_ACTOR_ID']
      .filter(Boolean)
      .join(', ');
    throw new Error(`Missing required environment variable(s): ${missing}`);
  }

  const base: ApifyConfig = {
    token,
    actorId,
    defaultMemory: 1024,
    defaultTimeout: 3600,
  };

  const overrides: Record<string, Partial<ApifyConfig>> = {
    development: { defaultMemory: 256, defaultTimeout: 300 },
    staging: { defaultMemory: 512 },
    production: { webhookUrl: process.env.APIFY_WEBHOOK_URL },
  };

  // Spreading `undefined` (unknown environment) is a no-op, leaving `base` as is.
  return { ...base, ...overrides[env] };
}
Health Check
// src/health.ts
/**
 * Probes the Apify service once and reports the outcome.
 *
 * @param apifyService Service whose `checkHealth()` result decides the status.
 * @returns `service` (always `'apify'`), `status` (`'healthy'` or
 *   `'unhealthy'`), the time the probe took in milliseconds, and an ISO
 *   timestamp taken after the probe completed.
 */
export async function healthCheck(apifyService: ApifyService) {
  const startedAt = Date.now();
  const isHealthy = await apifyService.checkHealth();

  // Measure latency right after the probe, before building the timestamp.
  const latencyMs = Date.now() - startedAt;
  const status = isHealthy ? 'healthy' : 'unhealthy';

  return {
    service: 'apify',
    status,
    latencyMs,
    timestamp: new Date().toISOString(),
  };
}
Error Handling
| Issue | Cause | Solution |
|-------|-------|----------|
| Circular dependencies | Service imports service | Use dependency injection |
| Missing config | Env var not set | Validate at startup with loadConfig() |
| Pipeline stage failure | Actor crash mid-pipeline | Add retry logic per stage |
| State management | Tracking run status | Use webhook handler + database |
Resources
Flagship Skills
For multi-environment setup, see apify-deploy-integration.