Computer Vision Skill Skill

Computer Vision Skill

Build visual AI systems from classification to detection.

Quick Start

import torch
import timm
from PIL import Image
from torchvision import transforms

# Load pretrained model
model = timm.create_model('efficientnet_b0', pretrained=True, num_classes=10)
model.eval()

# Preprocessing
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Inference
image = Image.open('image.jpg').convert('RGB')
input_tensor = transform(image).unsqueeze(0)

with torch.no_grad():
    output = model(input_tensor)
    predicted_class = output.argmax(dim=1).item()

Key Topics

1. Data Augmentation

import albumentations as A
from albumentations.pytorch import ToTensorV2

train_transform = A.Compose([
    A.RandomResizedCrop(224, 224, scale=(0.8, 1.0)),
    A.HorizontalFlip(p=0.5),
    A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=15),
    A.ColorJitter(brightness=0.2, contrast=0.2),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2()
])

val_transform = A.Compose([
    A.Resize(256, 256),
    A.CenterCrop(224, 224),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2()
])

2. Transfer Learning

| Model | Params | ImageNet Acc | Speed | |-------|--------|--------------|-------| | EfficientNet-B0 | 5.3M | 77% | Fast | | ResNet-50 | 25.6M | 76% | Fast | | ViT-B/16 | 86M | 84% | Slow |

import timm

class TransferClassifier(torch.nn.Module):
    def __init__(self, backbone='efficientnet_b0', num_classes=10):
        super().__init__()
        self.backbone = timm.create_model(backbone, pretrained=True, num_classes=0)
        self.classifier = torch.nn.Linear(self.backbone.num_features, num_classes)

        # Freeze backbone
        for param in self.backbone.parameters():
            param.requires_grad = False

    def unfreeze(self):
        for param in self.backbone.parameters():
            param.requires_grad = True

    def forward(self, x):
        features = self.backbone(x)
        return self.classifier(features)

3. Object Detection (YOLOv8)

from ultralytics import YOLO

# Load model
model = YOLO('yolov8n.pt')

# Train
results = model.train(
    data='dataset.yaml',
    epochs=100,
    imgsz=640,
    batch=16
)

# Inference
results = model('image.jpg')
for r in results:
    boxes = r.boxes
    for box in boxes:
        print(f"Class: {r.names[int(box.cls)]}, Conf: {box.conf:.2f}")

4. Image Segmentation

import segmentation_models_pytorch as smp

# Create U-Net model
model = smp.Unet(
    encoder_name='resnet50',
    encoder_weights='imagenet',
    in_channels=3,
    classes=21
)

# Loss function
loss_fn = smp.losses.DiceLoss(mode='multiclass')

5. Model Evaluation

from sklearn.metrics import classification_report, confusion_matrix

def evaluate_classifier(model, dataloader, device):
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for images, labels in dataloader:
            outputs = model(images.to(device))
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.numpy())

    return {
        'report': classification_report(all_labels, all_preds),
        'confusion_matrix': confusion_matrix(all_labels, all_preds)
    }

Best Practices

DO

Use pretrained models
Apply consistent augmentation
Use mixed precision training
Normalize with ImageNet stats
Visualize predictions

DON'T

Don't train from scratch on small data
Don't use same augmentations for val
Don't ignore class imbalance
Don't skip visual error analysis

Exercises

Exercise 1: Transfer Learning

# TODO: Fine-tune EfficientNet on CIFAR-10
# Freeze backbone first, then unfreeze

Exercise 2: Object Detection

# TODO: Train YOLOv8 on custom dataset
# Create dataset.yaml and train

Unit Test Template

import pytest
import torch

def test_model_output_shape():
    """Test model output dimensions."""
    model = TransferClassifier(num_classes=10)
    x = torch.randn(4, 3, 224, 224)

    output = model(x)

    assert output.shape == (4, 10)

def test_augmentation_preserves_shape():
    """Test augmentation output shape."""
    import numpy as np
    image = np.random.randint(0, 255, (256, 256, 3), dtype=np.uint8)

    augmented = train_transform(image=image)['image']

    assert augmented.shape == (3, 224, 224)

Troubleshooting

| Problem | Cause | Solution | |---------|-------|----------| | Overfitting | Small dataset | More augmentation | | Slow training | Large images | Resize, use AMP | | Poor detection | Wrong anchors | Adjust anchor sizes | | Memory error | Batch too large | Reduce batch size |

Related Resources

Agent: 06-computer-vision
Previous: nlp-basics
Next: ml-deployment
Docs: timm

Version: 1.4.0 | Status: Production Ready

Agent Skills: Computer Vision Skill

Install this agent skill to your local

Skill Files