Perplexity Data Handling
Overview
Manage data flowing through the Perplexity Sonar API. Critical concern: queries are sent to Perplexity for web search, so any PII in a query is exposed to external infrastructure. Responses contain citations (third-party URLs) that must be validated before they are displayed to users.
Data Flow
User Input → Query Sanitization → Perplexity API → Response Parsing
                                                          │
                                            ┌─────────────┼──────────────┐
                                            │             │              │
                                       Answer Text    Citations    Search Results
                                            │             │              │
                                        Format &      Validate &     Store for
                                        Display       Deduplicate    Analytics
Prerequisites
- Perplexity API key configured
- Understanding of PII regulations (GDPR/CCPA)
- Cache storage (Redis or in-memory)
Instructions
Step 1: Query Sanitization
/**
 * Masks common PII and secret shapes in a query before it leaves the process.
 *
 * Each rule is applied in order to the output of the previous rule, so earlier
 * placeholders are never re-matched by later patterns.
 *
 * @param query - Raw user input.
 * @returns The masked text, plus `redacted: true` when at least one rule matched.
 */
function sanitizeQuery(query: string): { clean: string; redacted: boolean } {
  const rules: Array<[RegExp, string]> = [
    [/\b[\w.+-]+@[\w-]+\.[\w.]+\b/g, "[email]"],
    [/\b\d{3}[-.]?\d{3}[-.]?\d{4}\b/g, "[phone]"],
    [/\b\d{3}-\d{2}-\d{4}\b/g, "[ssn]"],
    [/\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b/g, "[card]"],
    [/\b(pplx-|sk-|pk_|sk_live_)\w{20,}\b/g, "[token]"],
    [/\b(user|customer|account)\s*#?\s*\d+\b/gi, "[id]"],
  ];

  let redacted = false;
  const clean = rules.reduce(
    (text, [matcher, placeholder]) =>
      // The callback only runs when the rule actually matches, which is what
      // drives the `redacted` flag.
      text.replace(matcher, () => {
        redacted = true;
        return placeholder;
      }),
    query
  );

  return { clean, redacted };
}
/**
 * Sends a query to the Perplexity "sonar" model after masking PII.
 *
 * Only the sanitized text is transmitted. A warning is logged (without the
 * query content) whenever something was redacted.
 *
 * @param rawQuery - Unsanitized user input.
 * @returns The promise returned by the chat-completions call.
 */
async function safeSearch(rawQuery: string) {
  const sanitized = sanitizeQuery(rawQuery);

  if (sanitized.redacted) {
    console.warn("[Data] PII redacted from Perplexity query");
  }

  return perplexity.chat.completions.create({
    model: "sonar",
    messages: [{ role: "user", content: sanitized.clean }],
  });
}
Step 2: Citation Validation
// A citation URL annotated with the outcome of validation.
interface ValidatedCitation {
// The citation URL (trailing punctuation removed when it parsed successfully).
url: string;
// Hostname of the parsed URL, or "unknown" when the URL could not be parsed.
domain: string;
// True only when the URL parsed and uses the http: or https: protocol.
valid: boolean;
// 1-based position of the citation in the list it came from.
index: number;
}
/**
 * Parses each citation URL and records whether it is safe to render as a link.
 *
 * Surrounding whitespace and trailing punctuation (`.,;:`) are removed BEFORE
 * parsing, so `domain` always describes the cleaned `url` (for example
 * "https://example.com." yields domain "example.com", not "example.com.").
 *
 * @param citations - Citation URLs in the order they were returned.
 * @returns One entry per input, in the same order. `index` is the 1-based
 *   input position. Entries that cannot be parsed keep their original text,
 *   get domain "unknown" and `valid: false`; only http/https URLs are valid.
 */
function validateCitations(citations: string[]): ValidatedCitation[] {
  return citations.map((url, i) => {
    const index = i + 1;

    // The list ultimately comes from an untyped API field, so guard against
    // non-string entries instead of relying on an exception.
    if (typeof url !== "string") {
      return { url: String(url), domain: "unknown", valid: false, index };
    }

    const cleaned = url.trim().replace(/[.,;:]+$/, "");
    try {
      const parsed = new URL(cleaned);
      return {
        url: cleaned,
        domain: parsed.hostname,
        valid: parsed.protocol === "http:" || parsed.protocol === "https:",
        index,
      };
    } catch {
      // Not a parseable absolute URL: keep the original text for diagnostics.
      return { url, domain: "unknown", valid: false, index };
    }
  });
}
/**
 * Removes citations that point at the same page, keeping the first occurrence.
 *
 * Two URLs are considered the same page when they match after:
 *  - dropping the query string and the `#fragment`,
 *  - dropping one trailing slash,
 *  - lower-casing the scheme and authority (both are case-insensitive).
 *
 * Note that because the query string is ignored, URLs that differ only in
 * their query parameters collapse into a single entry.
 *
 * @param citations - Validated citations in display order.
 * @returns A new array without later duplicates; entries are not modified.
 */
function deduplicateCitations(citations: ValidatedCitation[]): ValidatedCitation[] {
  const seen = new Set<string>();
  return citations.filter((c) => {
    const key = c.url
      .replace(/[?#][\s\S]*$/, "")
      .replace(/\/$/, "")
      .replace(/^[a-z][a-z0-9+.-]*:\/\/[^/]+/i, (origin) => origin.toLowerCase());
    if (seen.has(key)) return false;
    seen.add(key);
    return true;
  });
}
/**
 * Replaces `[n]` markers in the answer with Markdown links to the matching
 * valid citation, e.g. `[1]` becomes `[1](https://example.com)`.
 *
 * The answer is rewritten in a single pass, so text inside an inserted URL
 * (such as `?ids[2]=x`) is never re-scanned and `$` sequences in URLs are
 * inserted literally. Parentheses in URLs are percent-encoded because a bare
 * `)` would terminate the Markdown link early. Markers that are already
 * followed by `(` are treated as existing links and left untouched.
 *
 * @param answer - Answer text containing `[n]` citation markers.
 * @param citations - Validated citations; only entries with `valid: true` are
 *   linked. If several share an index, the first one wins.
 * @returns The answer with linkable markers expanded; other markers unchanged.
 */
function renderCitations(answer: string, citations: ValidatedCitation[]): string {
  const targets = new Map<string, string>();
  for (const c of citations) {
    const key = String(c.index);
    if (c.valid && !targets.has(key)) {
      targets.set(key, c.url.replace(/\(/g, "%28").replace(/\)/g, "%29"));
    }
  }

  return answer.replace(/\[(\d+)\](?!\()/g, (marker: string, digits: string) => {
    const target = targets.get(digits);
    return target === undefined ? marker : `${marker}(${target})`;
  });
}
Step 3: Result Caching with Freshness Policy
import { LRUCache } from "lru-cache";
import { createHash } from "crypto";
// Shape of one cached search result.
interface CachedResult {
// Text content of the first completion choice ("" when it had none).
answer: string;
// Citations after validation and deduplication.
citations: ValidatedCitation[];
// Date.now() timestamp (ms) taken when the entry was built.
cachedAt: number;
// Model name reported by the API response.
model: string;
}
/** Cache lifetime in milliseconds for each query category. */
const CACHE_TTL: Record<string, number> = {
  // Breaking/current events go stale quickly: 30 minutes.
  news: 30 * 60 * 1000,
  // Research topics: 4 hours.
  research: 4 * 60 * 60 * 1000,
  // Stable facts: 24 hours.
  factual: 24 * 60 * 60 * 1000,
  // Anything unclassified: 1 hour.
  default: 60 * 60 * 1000,
};
// Process-local cache of search results, constructed with max: 500.
// cachedSearch keys entries by a SHA-256 hex digest and passes a per-entry
// ttl on set(); no cache-wide ttl is configured here.
const resultCache = new LRUCache<string, CachedResult>({ max: 500 });
/**
 * Classifies a query by keyword so a suitable cache lifetime can be chosen.
 *
 * Rules are checked in priority order and the first match wins, so a query
 * that mentions both "what is" and "latest" is treated as news.
 *
 * @param query - Query text to classify (matching is case-insensitive).
 * @returns "news", "research", "factual", or "default" when nothing matches.
 */
function detectQueryType(query: string): keyof typeof CACHE_TTL {
  const rules: Array<[RegExp, keyof typeof CACHE_TTL]> = [
    [/\b(latest|today|breaking|recent|this week)\b/i, "news"],
    [/\b(research|study|paper|analysis|compare)\b/i, "research"],
    [/\b(what is|define|how does|who is)\b/i, "factual"],
  ];

  const match = rules.find(([keywords]) => keywords.test(query));
  return match ? match[1] : "default";
}
/**
 * Runs a search through the result cache.
 *
 * The cache key is the SHA-256 hex digest of `model` plus the lower-cased,
 * trimmed query. On a miss the query is sent via `safeSearch`, citations are
 * validated and deduplicated, and the result is stored with a TTL chosen from
 * the detected query type.
 *
 * NOTE(review): `model` is only used in the cache key; it is not forwarded to
 * `safeSearch` here, so it does not select the model that is actually queried.
 *
 * @param query - User query (sanitized inside `safeSearch` before sending).
 * @param model - Model name used to partition the cache.
 * @returns The cached-result fields plus `fromCache` (true on a cache hit).
 */
async function cachedSearch(query: string, model = "sonar") {
  const hash = createHash("sha256")
    .update(`${model}:${query.toLowerCase().trim()}`)
    .digest("hex");

  const cached = resultCache.get(hash);
  if (cached) return { ...cached, fromCache: true };

  const response = await safeSearch(query);

  // `citations` is a Perplexity extension that is not part of the SDK's
  // response type, so read it as unknown and verify the shape. Entries are
  // deliberately not filtered: each citation's position is the number used by
  // the `[n]` markers, and validateCitations flags unusable entries itself.
  const citationsField: unknown = (response as { citations?: unknown }).citations;
  const rawCitations: string[] = Array.isArray(citationsField) ? citationsField : [];
  const citations = deduplicateCitations(validateCitations(rawCitations));

  const queryType = detectQueryType(query);
  const entry: CachedResult = {
    // A response with no choices yields an empty answer instead of throwing.
    answer: response.choices[0]?.message?.content ?? "",
    citations,
    cachedAt: Date.now(),
    model: response.model,
  };
  resultCache.set(hash, entry, { ttl: CACHE_TTL[queryType] });
  return { ...entry, fromCache: false };
}
Step 4: Conversation Context Management
import OpenAI from "openai";
// Local shorthand for the OpenAI SDK's chat message parameter type.
type Message = OpenAI.ChatCompletionMessageParam;
/**
 * Rolling conversation history for multi-turn searches.
 *
 * Keeps an optional system prompt at position 0 and evicts the oldest turns
 * once the history exceeds `maxMessages` entries or the rough token estimate
 * exceeds `maxEstimatedTokens`.
 */
class SearchContext {
  private messages: Message[] = [];
  private readonly maxMessages = 10;
  private readonly maxEstimatedTokens = 8000;

  /**
   * @param systemPrompt - Optional system message pinned at the start of the
   *   history; it is never evicted by trimming and survives `clear()`.
   */
  constructor(systemPrompt?: string) {
    if (systemPrompt) {
      this.messages.push({ role: "system", content: systemPrompt });
    }
  }

  /** Appends a user turn, then trims the history. */
  addUserMessage(content: string) {
    this.messages.push({ role: "user", content });
    this.trim();
  }

  /** Appends an assistant turn, then trims the history. */
  addAssistantMessage(content: string) {
    this.messages.push({ role: "assistant", content });
    this.trim();
  }

  /** Returns a shallow copy of the history (the array is new, messages are shared). */
  getMessages(): Message[] {
    return [...this.messages];
  }

  /**
   * Evicts the oldest non-system messages until both limits are met.
   *
   * Messages are removed one at a time, which can strand an assistant reply
   * whose user question was just evicted. Such orphaned leading assistant
   * messages are dropped too, so the retained history begins with a user turn
   * and user/assistant roles keep alternating.
   */
  private trim() {
    // Index of the oldest evictable message: skip the pinned system prompt.
    const firstTurn = this.messages[0]?.role === "system" ? 1 : 0;
    let evicted = false;

    // Keep system prompt + last N messages
    while (this.messages.length > this.maxMessages) {
      this.messages.splice(firstTurn, 1);
      evicted = true;
    }

    // Trim if estimated tokens too high (always keep at least two entries)
    while (this.estimateTokens() > this.maxEstimatedTokens && this.messages.length > 2) {
      this.messages.splice(firstTurn, 1);
      evicted = true;
    }

    // Only repair ordering when something was actually evicted, so a history
    // the caller deliberately started with an assistant message is left alone.
    while (evicted && this.messages[firstTurn]?.role === "assistant") {
      this.messages.splice(firstTurn, 1);
    }
  }

  /** Rough size estimate: about one token per four characters of text. */
  private estimateTokens(): number {
    return this.messages.reduce(
      (sum, m) => sum + Math.ceil(SearchContext.contentLength(m.content) / 4),
      0
    );
  }

  /**
   * Counts the text characters in a message's content. Strings count
   * directly; arrays of content parts contribute the length of each part's
   * string `text` field; anything else (null, undefined) counts as zero
   * rather than being stringified to "null" or "[object Object]".
   */
  private static contentLength(content: unknown): number {
    if (typeof content === "string") return content.length;
    if (Array.isArray(content)) {
      return content.reduce((total: number, part: unknown) => {
        const text = (part as { text?: unknown } | null)?.text;
        return total + (typeof text === "string" ? text.length : 0);
      }, 0);
    }
    return 0;
  }

  /** Drops all turns, keeping only the system prompt if one exists. */
  clear() {
    const system = this.messages.find((m) => m.role === "system");
    this.messages = system ? [system] : [];
  }
}
Error Handling
| Issue | Cause | Solution |
|-------|-------|----------|
| PII in search query | User entered personal data | Apply sanitizeQuery before API call |
| Broken citation URLs | Malformed URL or non-HTTP(S) scheme (moved/deleted pages are not detected) | Validate URLs with validateCitations, filter out entries where valid is false |
| Stale cached results | TTL too long for news | Use query-type-aware TTL |
| Context overflow | Too many conversation turns | Automatic trimming in SearchContext |
| Duplicate citations | Same source cited multiple times | Deduplicate by normalized URL |
Output
- Query sanitization stripping PII before API calls
- Citation validation and deduplication
- Cache with query-type-aware TTL
- Conversation context with automatic trimming
Resources
Next Steps
For access control, see perplexity-enterprise-rbac.