Audiobook Creation Guide
Create audiobooks from web articles, essays, or text files. This skill covers the full pipeline: content fetching, text processing, and audio generation.
Quick Start
import os
# 1. Check which TTS API is available
def get_tts_provider():
    """Pick the TTS backend the current environment has credentials for.

    Returns:
        "elevenlabs" if ELEVENLABS_API_KEY is set to a non-empty value,
        otherwise "openai" if OPENAI_API_KEY is set to a non-empty value,
        otherwise "gtts".
    """
    # Listed in order of preference; the first non-empty variable wins.
    keyed_providers = (
        ("ELEVENLABS_API_KEY", "elevenlabs"),
        ("OPENAI_API_KEY", "openai"),
    )
    for env_var, provider_name in keyed_providers:
        if os.environ.get(env_var):
            return provider_name
    return "gtts"  # Free, no API key needed
# Resolve the provider once and report which backend was selected.
provider = get_tts_provider()
print(f"Using TTS provider: {provider}")
Step 1: Fetching Web Content
IMPORTANT: Verify fetched content is complete
WebFetch and similar tools may return summaries instead of full text. Always verify:
import subprocess
def fetch_article_content(url):
    """Fetch a web page with curl and reduce it to plain text.

    Args:
        url: Address of the page to download.

    Returns:
        The page text with script/style blocks and all tags removed, HTML
        entities decoded and whitespace collapsed. An empty string when
        curl produced no output.
    """
    import re
    from html import unescape

    # Use curl to get raw HTML - more reliable than web fetch tools.
    # -L follows redirects (e.g. http -> https); "--" stops a URL that
    # starts with "-" from being parsed as a curl option.
    result = subprocess.run(
        ["curl", "-s", "-L", "--", url],
        capture_output=True,
        text=True,
        # Decode explicitly so the result does not depend on the locale,
        # and never fail on a stray invalid byte.
        encoding="utf-8",
        errors="replace",
    )
    page = result.stdout

    # Strip HTML tags (basic approach). Each substitution works on the
    # result of the previous one so that script bodies stay removed.
    block_flags = re.DOTALL | re.IGNORECASE
    text = re.sub(r'<script[^>]*>.*?</script>', '', page, flags=block_flags)
    text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=block_flags)
    text = re.sub(r'<[^>]+>', ' ', text)
    # Decode entities only after tags are gone, so "&lt;b&gt;" survives as
    # literal text instead of being stripped as a tag.
    text = unescape(text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
Content verification checklist
Before converting to audio, verify:
- [ ] Text length is reasonable for the source (articles typically 1,000-10,000+ words)
- [ ] Content includes actual article text, not just navigation/headers
- [ ] No "summary" or "key points" headers that indicate truncation
def verify_content(text, expected_min_chars=1000):
    """Run quick sanity checks on fetched text before converting it.

    Args:
        text: The fetched content.
        expected_min_chars: Shortest length accepted as complete.

    Returns:
        False (after printing a warning) when the text is shorter than
        expected_min_chars or its opening mentions "summary" or
        "key points"; True otherwise.
    """
    too_short = len(text) < expected_min_chars
    if too_short:
        print(f"WARNING: Content may be truncated ({len(text)} chars)")
        return False

    # Only the opening of the lowercased text is inspected for markers.
    opening = text.lower()[:500]
    if any(marker in opening for marker in ("summary", "key points")):
        print("WARNING: Content appears to be a summary, not full text")
        return False

    return True
Step 2: Text Processing
Clean and prepare text for TTS
import re
def clean_text_for_tts(text):
    """Clean text for better TTS output.

    Args:
        text: Raw article text.

    Returns:
        The text with typographic punctuation mapped to ASCII, URLs and
        numeric footnote markers removed, unsupported symbols dropped and
        whitespace collapsed to single spaces.
    """
    # Map typographic punctuation to ASCII first. The symbol filter below
    # would otherwise delete it outright, turning "don’t" into "dont" and
    # gluing together the words on either side of a dash.
    typographic = str.maketrans({
        "\u2018": "'",
        "\u2019": "'",
        "\u201c": '"',
        "\u201d": '"',
        "\u2013": " - ",
        "\u2014": " - ",
        "\u2026": "...",
    })
    text = text.translate(typographic)
    # Remove URLs
    text = re.sub(r'http[s]?://\S+', '', text)
    # Remove footnote markers like [1], [2]
    text = re.sub(r'\[\d+\]', '', text)
    # Remove special characters that confuse TTS
    text = re.sub(r'[^\w\s.,!?;:\'"()-]', '', text)
    # Normalize whitespace last, so gaps left by the removals above are
    # collapsed as well.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def chunk_text(text, max_chars=4000):
    """Split text into chunks at sentence boundaries.

    Sentences are packed greedily into chunks. A sentence that is longer
    than max_chars on its own is broken at word boundaries (and a single
    over-long word is hard-split), so no chunk exceeds max_chars.

    Args:
        text: The text to split.
        max_chars: Upper bound on the length of each chunk; must be positive.

    Returns:
        A list of non-empty chunk strings; an empty list when text is
        empty or only whitespace.

    Raises:
        ValueError: If max_chars is not positive.
    """
    import textwrap

    if max_chars <= 0:
        raise ValueError("max_chars must be positive")

    def pieces():
        # Yield sentences, sub-dividing any that cannot fit in one chunk.
        for sentence in re.split(r'(?<=[.!?])\s+', text.strip()):
            if len(sentence) <= max_chars:
                if sentence:
                    yield sentence
            else:
                yield from textwrap.wrap(
                    sentence,
                    width=max_chars,
                    break_long_words=True,
                    break_on_hyphens=False,
                )

    chunks = []
    current = []     # pieces of the chunk being built
    current_len = 0  # length of " ".join(current)
    for piece in pieces():
        joined_len = current_len + 1 + len(piece) if current else len(piece)
        if current and joined_len >= max_chars:
            # Adding this piece would reach the limit: close the chunk.
            chunks.append(" ".join(current))
            current, current_len = [piece], len(piece)
        else:
            current.append(piece)
            current_len = joined_len
    if current:
        chunks.append(" ".join(current))
    return chunks
Step 3: TTS Conversion with Fallback
Automatic provider selection
import os
import subprocess
def create_audiobook(text, output_path):
    """Convert text to an audiobook, choosing the TTS provider automatically.

    ElevenLabs is used when ELEVENLABS_API_KEY is set, otherwise OpenAI when
    OPENAI_API_KEY is set, otherwise gTTS.

    Args:
        text: The text to narrate.
        output_path: Where the finished audio file is written.

    Returns:
        Whatever the selected provider function returns.
    """
    # Guard clauses, most preferred provider first.
    if os.environ.get("ELEVENLABS_API_KEY"):
        print("Using ElevenLabs TTS (highest quality)")
        return create_with_elevenlabs(text, output_path)

    if os.environ.get("OPENAI_API_KEY"):
        print("Using OpenAI TTS (high quality)")
        return create_with_openai(text, output_path)

    print("Using gTTS (free, no API key required)")
    return create_with_gtts(text, output_path)
ElevenLabs implementation
import requests
def create_with_elevenlabs(text, output_path):
    """Generate audiobook using ElevenLabs API.

    The text is split into chunks, each chunk is synthesized with one HTTP
    request, and the resulting MP3 files are joined with concatenate_audio.

    Args:
        text: The text to narrate.
        output_path: Where the finished audio file is written.

    Returns:
        The result of concatenate_audio on success; False as soon as any
        request fails or returns a non-200 status.
    """
    import shutil
    import tempfile

    api_key = os.environ.get("ELEVENLABS_API_KEY")
    voice_id = "21m00Tcm4TlvDq8ikWAM"  # Rachel - calm female voice
    chunks = chunk_text(text, max_chars=4500)

    # A private directory keeps concurrent runs from overwriting each
    # other's chunk files and makes cleanup on failure trivial.
    work_dir = tempfile.mkdtemp(prefix="elevenlabs_tts_")
    audio_files = []
    try:
        for i, chunk in enumerate(chunks):
            chunk_file = os.path.join(work_dir, f"chunk_{i:03d}.mp3")
            try:
                response = requests.post(
                    f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}",
                    headers={
                        "xi-api-key": api_key,
                        "Content-Type": "application/json",
                    },
                    json={
                        "text": chunk,
                        "model_id": "eleven_turbo_v2_5",
                        "voice_settings": {"stability": 0.5, "similarity_boost": 0.75},
                    },
                    # requests waits forever by default; bound both the
                    # connect and the read phase.
                    timeout=(10, 300),
                )
            except requests.RequestException as exc:
                print(f"Error: request failed - {exc}")
                return False
            if response.status_code != 200:
                print(f"Error: {response.status_code} - {response.text}")
                return False
            with open(chunk_file, "wb") as f:
                f.write(response.content)
            audio_files.append(chunk_file)
        return concatenate_audio(audio_files, output_path)
    finally:
        # Removes chunks left behind by an early return or an exception.
        shutil.rmtree(work_dir, ignore_errors=True)
OpenAI TTS implementation
def create_with_openai(text, output_path):
    """Generate audiobook using OpenAI TTS API.

    The text is split into chunks, each chunk is synthesized with one HTTP
    request, and the resulting MP3 files are joined with concatenate_audio.

    Args:
        text: The text to narrate.
        output_path: Where the finished audio file is written.

    Returns:
        The result of concatenate_audio on success; False as soon as any
        request fails or returns a non-200 status.
    """
    import shutil
    import tempfile

    api_key = os.environ.get("OPENAI_API_KEY")
    chunks = chunk_text(text, max_chars=4000)

    # A private directory keeps concurrent runs from overwriting each
    # other's chunk files and makes cleanup on failure trivial.
    work_dir = tempfile.mkdtemp(prefix="openai_tts_")
    audio_files = []
    try:
        for i, chunk in enumerate(chunks):
            chunk_file = os.path.join(work_dir, f"chunk_{i:03d}.mp3")
            try:
                response = requests.post(
                    "https://api.openai.com/v1/audio/speech",
                    headers={
                        "Authorization": f"Bearer {api_key}",
                        "Content-Type": "application/json",
                    },
                    json={
                        "model": "tts-1",
                        "input": chunk,
                        "voice": "onyx",  # Deep male voice, good for essays
                        "response_format": "mp3",
                    },
                    # requests waits forever by default; bound both the
                    # connect and the read phase.
                    timeout=(10, 300),
                )
            except requests.RequestException as exc:
                print(f"Error: request failed - {exc}")
                return False
            if response.status_code != 200:
                print(f"Error: {response.status_code} - {response.text}")
                return False
            with open(chunk_file, "wb") as f:
                f.write(response.content)
            audio_files.append(chunk_file)
        return concatenate_audio(audio_files, output_path)
    finally:
        # Removes chunks left behind by an early return or an exception.
        shutil.rmtree(work_dir, ignore_errors=True)
gTTS implementation (free fallback)
def create_with_gtts(text, output_path):
    """Generate audiobook using gTTS (free, no API key).

    The text is split into chunks, each chunk is saved as an MP3 by gTTS,
    and the files are joined with concatenate_audio.

    Args:
        text: The text to narrate.
        output_path: Where the finished audio file is written.

    Returns:
        The result of concatenate_audio.
    """
    import shutil
    import tempfile

    # Only gTTS is needed here; joining the chunks is done by ffmpeg via
    # concatenate_audio, so no other audio library has to be installed.
    from gtts import gTTS

    chunks = chunk_text(text, max_chars=4500)

    # A private directory keeps concurrent runs from overwriting each
    # other's chunk files and makes cleanup on failure trivial.
    work_dir = tempfile.mkdtemp(prefix="gtts_")
    audio_files = []
    try:
        for i, chunk in enumerate(chunks):
            chunk_file = os.path.join(work_dir, f"chunk_{i:03d}.mp3")
            tts = gTTS(text=chunk, lang='en', slow=False)
            tts.save(chunk_file)
            audio_files.append(chunk_file)
        return concatenate_audio(audio_files, output_path)
    finally:
        # Removes chunks left behind if synthesis raised part-way through.
        shutil.rmtree(work_dir, ignore_errors=True)
Audio concatenation
def concatenate_audio(audio_files, output_path):
    """Concatenate multiple audio files using ffmpeg.

    The input files and the temporary list file are deleted afterwards,
    whether or not ffmpeg succeeded.

    Args:
        audio_files: Paths of the audio files to join, in playback order.
        output_path: Where the combined file is written (overwritten if
            it exists).

    Returns:
        True when ffmpeg exits with status 0; False when audio_files is
        empty, ffmpeg is not installed, or ffmpeg reports an error.
    """
    import os
    import tempfile

    if not audio_files:
        return False

    list_file = None
    try:
        # Create file list for ffmpeg under a unique name so parallel
        # runs do not overwrite each other's list.
        with tempfile.NamedTemporaryFile(
            "w", prefix="audio_list_", suffix=".txt",
            delete=False, encoding="utf-8",
        ) as f:
            list_file = f.name
            for audio_file in audio_files:
                # ffmpeg resolves relative entries against the list file's
                # directory, so write absolute paths; a single quote inside
                # a quoted entry must be written as '\''.
                entry = os.path.abspath(audio_file).replace("'", "'\\''")
                f.write(f"file '{entry}'\n")

        # Concatenate with ffmpeg
        try:
            result = subprocess.run([
                "ffmpeg", "-y", "-f", "concat", "-safe", "0",
                "-i", list_file, "-c", "copy", output_path
            ], capture_output=True)
        except FileNotFoundError:
            print("Error: ffmpeg executable not found")
            return False
        return result.returncode == 0
    finally:
        # Cleanup temp files even when ffmpeg is missing or fails.
        for path in (*audio_files, list_file):
            if path is None:
                continue
            try:
                os.unlink(path)
            except FileNotFoundError:
                pass
Complete Example
#!/usr/bin/env python3
"""Create audiobook from web articles."""
import os
import re
import subprocess
import requests
# ... include all helper functions above ...
def main():
    """Fetch the configured articles and narrate them into one audiobook."""
    # Fetch articles
    urls = [
        "https://example.com/article1",
        "https://example.com/article2",
    ]

    sections = []
    for url in urls:
        print(f"Fetching: {url}")
        article = fetch_article_content(url)
        # Suspect content is reported but still included in the audiobook.
        if not verify_content(article):
            print(f"WARNING: Content from {url} may be incomplete")
        sections.append(f"\n\n{article}")
    all_text = "".join(sections)

    # Clean and convert
    clean_text = clean_text_for_tts(all_text)
    print(f"Total text: {len(clean_text)} characters")

    # Create audiobook
    if create_audiobook(clean_text, "/root/audiobook.mp3"):
        print("Audiobook created successfully!")
    else:
        print("Failed to create audiobook")


if __name__ == "__main__":
    main()
TTS Provider Comparison
| Provider | Quality | Cost | API Key Required | Best For |
|----------|---------|------|------------------|----------|
| ElevenLabs | Excellent | Paid | Yes | Professional audiobooks |
| OpenAI TTS | Very Good | Paid | Yes | General purpose |
| gTTS | Good | Free | No | Testing, budget projects |
Troubleshooting
"Content appears to be a summary"
- Use `curl` directly instead of web fetch tools
- Verify the URL is correct and accessible
- Check if the site requires JavaScript rendering
"API key not found"
- Check environment variables: `echo $OPENAI_API_KEY`
- Ensure keys are exported in the shell
- Fall back to gTTS if no paid API keys available
"Audio chunks don't sound continuous"
- Ensure chunking happens at sentence boundaries
- Consider adding small pauses between sections
- Use consistent voice settings across all chunks