Audiobook Creation Guide Skill

Audiobook Creation Guide

Create audiobooks from web articles, essays, or text files. This skill covers the full pipeline: content fetching, text processing, and audio generation.

Quick Start

import os

# 1. Check which TTS API is available
def get_tts_provider():
    if os.environ.get("ELEVENLABS_API_KEY"):
        return "elevenlabs"
    elif os.environ.get("OPENAI_API_KEY"):
        return "openai"
    else:
        return "gtts"  # Free, no API key needed

provider = get_tts_provider()
print(f"Using TTS provider: {provider}")

Step 1: Fetching Web Content

IMPORTANT: Verify fetched content is complete

WebFetch and similar tools may return summaries instead of full text. Always verify:

import subprocess

def fetch_article_content(url):
    """Fetch article content using curl for reliability."""
    # Use curl to get raw HTML - more reliable than web fetch tools
    result = subprocess.run(
        ["curl", "-s", url],
        capture_output=True,
        text=True
    )
    html = result.stdout

    # Strip HTML tags (basic approach)
    import re
    text = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL)
    text = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL)
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

Content verification checklist

Before converting to audio, verify:

[ ] Text length is reasonable for the source (articles typically 1,000-10,000+ words)
[ ] Content includes actual article text, not just navigation/headers
[ ] No "summary" or "key points" headers that indicate truncation

def verify_content(text, expected_min_chars=1000):
    """Basic verification that content is complete."""
    if len(text) < expected_min_chars:
        print(f"WARNING: Content may be truncated ({len(text)} chars)")
        return False
    if "summary" in text.lower()[:500] or "key points" in text.lower()[:500]:
        print("WARNING: Content appears to be a summary, not full text")
        return False
    return True

Step 2: Text Processing

Clean and prepare text for TTS

import re

def clean_text_for_tts(text):
    """Clean text for better TTS output."""
    # Remove URLs
    text = re.sub(r'http[s]?://\S+', '', text)

    # Remove footnote markers like [1], [2]
    text = re.sub(r'\[\d+\]', '', text)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)

    # Remove special characters that confuse TTS
    text = re.sub(r'[^\w\s.,!?;:\'"()-]', '', text)

    return text.strip()

def chunk_text(text, max_chars=4000):
    """Split text into chunks at sentence boundaries."""
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) < max_chars:
            current_chunk += sentence + " "
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + " "

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

Step 3: TTS Conversion with Fallback

Automatic provider selection

import os
import subprocess

def create_audiobook(text, output_path):
    """Convert text to audiobook with automatic TTS provider selection."""

    # Check available providers
    has_elevenlabs = bool(os.environ.get("ELEVENLABS_API_KEY"))
    has_openai = bool(os.environ.get("OPENAI_API_KEY"))

    if has_elevenlabs:
        print("Using ElevenLabs TTS (highest quality)")
        return create_with_elevenlabs(text, output_path)
    elif has_openai:
        print("Using OpenAI TTS (high quality)")
        return create_with_openai(text, output_path)
    else:
        print("Using gTTS (free, no API key required)")
        return create_with_gtts(text, output_path)

ElevenLabs implementation

import requests

def create_with_elevenlabs(text, output_path):
    """Generate audiobook using ElevenLabs API."""
    api_key = os.environ.get("ELEVENLABS_API_KEY")
    voice_id = "21m00Tcm4TlvDq8ikWAM"  # Rachel - calm female voice

    chunks = chunk_text(text, max_chars=4500)
    audio_files = []

    for i, chunk in enumerate(chunks):
        chunk_file = f"/tmp/chunk_{i:03d}.mp3"

        response = requests.post(
            f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}",
            headers={
                "xi-api-key": api_key,
                "Content-Type": "application/json"
            },
            json={
                "text": chunk,
                "model_id": "eleven_turbo_v2_5",
                "voice_settings": {"stability": 0.5, "similarity_boost": 0.75}
            }
        )

        if response.status_code == 200:
            with open(chunk_file, "wb") as f:
                f.write(response.content)
            audio_files.append(chunk_file)
        else:
            print(f"Error: {response.status_code} - {response.text}")
            return False

    return concatenate_audio(audio_files, output_path)

OpenAI TTS implementation

def create_with_openai(text, output_path):
    """Generate audiobook using OpenAI TTS API."""
    api_key = os.environ.get("OPENAI_API_KEY")

    chunks = chunk_text(text, max_chars=4000)
    audio_files = []

    for i, chunk in enumerate(chunks):
        chunk_file = f"/tmp/chunk_{i:03d}.mp3"

        response = requests.post(
            "https://api.openai.com/v1/audio/speech",
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json"
            },
            json={
                "model": "tts-1",
                "input": chunk,
                "voice": "onyx",  # Deep male voice, good for essays
                "response_format": "mp3"
            }
        )

        if response.status_code == 200:
            with open(chunk_file, "wb") as f:
                f.write(response.content)
            audio_files.append(chunk_file)
        else:
            print(f"Error: {response.status_code} - {response.text}")
            return False

    return concatenate_audio(audio_files, output_path)

gTTS implementation (free fallback)

def create_with_gtts(text, output_path):
    """Generate audiobook using gTTS (free, no API key)."""
    from gtts import gTTS
    from pydub import AudioSegment

    chunks = chunk_text(text, max_chars=4500)
    audio_files = []

    for i, chunk in enumerate(chunks):
        chunk_file = f"/tmp/chunk_{i:03d}.mp3"

        tts = gTTS(text=chunk, lang='en', slow=False)
        tts.save(chunk_file)
        audio_files.append(chunk_file)

    return concatenate_audio(audio_files, output_path)

Audio concatenation

def concatenate_audio(audio_files, output_path):
    """Concatenate multiple audio files using ffmpeg."""
    if not audio_files:
        return False

    # Create file list for ffmpeg
    list_file = "/tmp/audio_list.txt"
    with open(list_file, "w") as f:
        for audio_file in audio_files:
            f.write(f"file '{audio_file}'\n")

    # Concatenate with ffmpeg
    result = subprocess.run([
        "ffmpeg", "-y", "-f", "concat", "-safe", "0",
        "-i", list_file, "-c", "copy", output_path
    ], capture_output=True)

    # Cleanup temp files
    import os
    for f in audio_files:
        os.unlink(f)
    os.unlink(list_file)

    return result.returncode == 0

Complete Example

#!/usr/bin/env python3
"""Create audiobook from web articles."""

import os
import re
import subprocess
import requests

# ... include all helper functions above ...

def main():
    # Fetch articles
    urls = [
        "https://example.com/article1",
        "https://example.com/article2"
    ]

    all_text = ""
    for url in urls:
        print(f"Fetching: {url}")
        text = fetch_article_content(url)

        if not verify_content(text):
            print(f"WARNING: Content from {url} may be incomplete")

        all_text += f"\n\n{text}"

    # Clean and convert
    clean_text = clean_text_for_tts(all_text)
    print(f"Total text: {len(clean_text)} characters")

    # Create audiobook
    success = create_audiobook(clean_text, "/root/audiobook.mp3")

    if success:
        print("Audiobook created successfully!")
    else:
        print("Failed to create audiobook")

if __name__ == "__main__":
    main()

TTS Provider Comparison

| Provider | Quality | Cost | API Key Required | Best For | |----------|---------|------|------------------|----------| | ElevenLabs | Excellent | Paid | Yes | Professional audiobooks | | OpenAI TTS | Very Good | Paid | Yes | General purpose | | gTTS | Good | Free | No | Testing, budget projects |

Troubleshooting

"Content appears to be a summary"

Use curl directly instead of web fetch tools
Verify the URL is correct and accessible
Check if the site requires JavaScript rendering

"API key not found"

Check environment variables: echo $OPENAI_API_KEY
Ensure keys are exported in the shell
Fall back to gTTS if no paid API keys available

"Audio chunks don't sound continuous"

Ensure chunking happens at sentence boundaries
Consider adding small pauses between sections
Use consistent voice settings across all chunks

Agent Skills: Audiobook Creation Guide

Install this agent skill to your local

Skill Files