Agent Skills: Computer Vision Skill

Build computer vision solutions - image classification, object detection, and transfer learning

computer-vision · image-classification · object-detection · transfer-learning · deep-learning
ml-development
ID: pluginagentmarketplace/custom-plugin-machine-learning/computer-vision

Skill Files

Browse the full folder contents for computer-vision.

Download Skill

Loading file tree…

skills/computer-vision/SKILL.md

Skill Metadata

Name
computer-vision
Description
Build computer vision solutions - image classification, object detection, and transfer learning

Computer Vision Skill

Build visual AI systems from classification to detection.

Quick Start

import torch
import timm
from PIL import Image
from torchvision import transforms

# Load pretrained model.
# NOTE: asking timm for num_classes=10 on ImageNet-pretrained weights yields a
# newly initialised 10-way classifier head on top of the pretrained backbone,
# so the prediction below is only meaningful once the model has been
# fine-tuned on the 10-class task.
model = timm.create_model('efficientnet_b0', pretrained=True, num_classes=10)
model.eval()  # inference mode: disables dropout, uses BatchNorm running stats

# Preprocessing: resize the short side to 256, take the central 224x224 crop,
# convert to a float tensor and normalise with the ImageNet mean / std.
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Inference
# Open the file in a context manager so the underlying handle is released
# as soon as convert() has produced an in-memory RGB copy.
with Image.open('image.jpg') as img:
    image = img.convert('RGB')
input_tensor = transform(image).unsqueeze(0)  # add the batch dimension

with torch.no_grad():
    output = model(input_tensor)
    predicted_class = output.argmax(dim=1).item()

Key Topics

1. Data Augmentation

import albumentations as A
from albumentations.pytorch import ToTensorV2

# ImageNet channel statistics, shared by the training and validation pipelines.
_IMAGENET_MEAN = [0.485, 0.456, 0.406]
_IMAGENET_STD = [0.229, 0.224, 0.225]

# Training pipeline: random geometric / colour perturbations, then
# normalisation and conversion to a CHW torch tensor.
# NOTE(review): RandomResizedCrop(224, 224, ...) is the positional
# (height, width) form; newer Albumentations releases appear to expect
# size=(224, 224) instead - confirm against the pinned version.
# NOTE(review): ShiftScaleRotate may be deprecated in favour of A.Affine in
# recent releases - confirm before upgrading.
train_transform = A.Compose(
    [
        A.RandomResizedCrop(224, 224, scale=(0.8, 1.0)),
        A.HorizontalFlip(p=0.5),
        A.ShiftScaleRotate(
            shift_limit=0.1,
            scale_limit=0.1,
            rotate_limit=15,
        ),
        A.ColorJitter(brightness=0.2, contrast=0.2),
        A.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
        ToTensorV2(),
    ]
)

# Validation pipeline: fixed resize + centre crop only, followed by the same
# normalisation and tensor conversion as training.
val_transform = A.Compose(
    [
        A.Resize(256, 256),
        A.CenterCrop(224, 224),
        A.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
        ToTensorV2(),
    ]
)

2. Transfer Learning

| Model | Params | ImageNet Acc | Speed |
|-------|--------|--------------|-------|
| EfficientNet-B0 | 5.3M | 77% | Fast |
| ResNet-50 | 25.6M | 76% | Fast |
| ViT-B/16 | 86M | 84% | Slow |

import timm

class TransferClassifier(torch.nn.Module):
    """Pretrained timm backbone followed by a new linear classification head.

    The backbone is created with ``num_classes=0`` so that it has no
    classifier of its own; its output is fed to a ``torch.nn.Linear`` layer
    sized from ``backbone.num_features``.

    While the backbone is frozen its parameters do not receive gradients AND
    it is kept in eval mode, even when the wrapper is switched to train mode.
    Without the second part, BatchNorm running statistics (and dropout) inside
    the "frozen" backbone would still change during head-only training.

    Args:
        backbone: timm model name passed to ``timm.create_model``.
        num_classes: Number of outputs of the linear head.
        freeze_backbone: If True (default, matching the previous behaviour),
            the backbone starts frozen; call ``unfreeze()`` to fine-tune it.
    """

    def __init__(self, backbone='efficientnet_b0', num_classes=10,
                 freeze_backbone=True):
        super().__init__()
        self.backbone = timm.create_model(backbone, pretrained=True, num_classes=0)
        self.classifier = torch.nn.Linear(self.backbone.num_features, num_classes)

        # Tracks whether train() must keep the backbone in eval mode.
        self._backbone_frozen = False
        if freeze_backbone:
            self.freeze()

    def freeze(self):
        """Stop gradient updates to the backbone and pin it to eval mode."""
        self.backbone.requires_grad_(False)
        self._backbone_frozen = True
        self.backbone.eval()

    def unfreeze(self):
        """Re-enable gradients for the backbone so it can be fine-tuned."""
        self.backbone.requires_grad_(True)
        self._backbone_frozen = False
        # Bring the backbone back in line with the wrapper's current mode.
        self.backbone.train(self.training)

    def train(self, mode=True):
        """Set train/eval mode, keeping a frozen backbone in eval mode.

        Returns:
            self, as ``torch.nn.Module.train`` does.
        """
        super().train(mode)
        # getattr: tolerate instances whose __init__ did not set the flag.
        if getattr(self, '_backbone_frozen', False):
            self.backbone.eval()
        return self

    def forward(self, x):
        """Run the backbone on ``x`` and apply the linear head to its output."""
        features = self.backbone(x)
        return self.classifier(features)

3. Object Detection (YOLOv8)

from ultralytics import YOLO

# Load model
model = YOLO('yolov8n.pt')

# Train
results = model.train(
    data='dataset.yaml',
    epochs=100,
    imgsz=640,
    batch=16
)

# Inference
results = model('image.jpg')
for r in results:
    for box in r.boxes:
        # box.cls and box.conf are one-element tensors, not Python scalars.
        # Formatting such a tensor with ":.2f" raises TypeError, so pull the
        # scalar values out with .item() before indexing / formatting.
        class_id = int(box.cls.item())
        confidence = box.conf.item()
        print(f"Class: {r.names[class_id]}, Conf: {confidence:.2f}")

4. Image Segmentation

import segmentation_models_pytorch as smp

# U-Net with a ResNet-50 encoder initialised from ImageNet weights,
# taking 3-channel input and producing 21 output classes.
model = smp.Unet(
    encoder_name='resnet50', encoder_weights='imagenet',
    in_channels=3, classes=21,
)

# Dice loss in multiclass mode.
# NOTE(review): presumably expects integer class-index masks - confirm
# against the segmentation_models_pytorch loss documentation.
loss_fn = smp.losses.DiceLoss(mode='multiclass')

5. Model Evaluation

from sklearn.metrics import classification_report, confusion_matrix

def evaluate_classifier(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and summarise its predictions.

    Args:
        model: Classifier whose output has the class scores along dim 1.
        dataloader: Iterable yielding ``(images, labels)`` tensor pairs.
        device: Device the images are moved to before the forward pass.
            The model is not moved; it must already be on ``device``.

    Returns:
        Dict with ``'report'`` (output of sklearn's ``classification_report``)
        and ``'confusion_matrix'`` (output of ``confusion_matrix``), both
        computed over every batch.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for images, labels in dataloader:
            outputs = model(images.to(device))
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            # .cpu() first: .numpy() raises if the loader already placed the
            # labels on an accelerator; it is a no-op for CPU tensors.
            all_labels.extend(labels.cpu().numpy())

    return {
        'report': classification_report(all_labels, all_preds),
        'confusion_matrix': confusion_matrix(all_labels, all_preds)
    }

Best Practices

DO

  • Use pretrained models
  • Apply consistent augmentation
  • Use mixed precision training
  • Normalize with ImageNet stats
  • Visualize predictions

DON'T

  • Don't train from scratch on small data
  • Don't apply training-time random augmentations to validation data
  • Don't ignore class imbalance
  • Don't skip visual error analysis

Exercises

Exercise 1: Transfer Learning

# TODO: Fine-tune EfficientNet on CIFAR-10
# Freeze backbone first, then unfreeze

Exercise 2: Object Detection

# TODO: Train YOLOv8 on custom dataset
# Create dataset.yaml and train

Unit Test Template

import pytest
import torch

def test_model_output_shape():
    """Check that a batch of 4 inputs yields a (4, 10) output."""
    batch_size, num_classes = 4, 10
    classifier = TransferClassifier(num_classes=num_classes)
    batch = torch.randn(batch_size, 3, 224, 224)

    result = classifier(batch)

    assert result.shape == (batch_size, num_classes)

def test_augmentation_preserves_shape():
    """Test that the training pipeline outputs a (3, 224, 224) tensor.

    The input is a random 256x256 RGB uint8 image, so despite the test's
    name the pipeline is expected to crop to 224x224 and move channels first.
    """
    import numpy as np
    # randint's upper bound is exclusive: 256 is required for the sample to
    # cover the full uint8 range, including the value 255.
    image = np.random.randint(0, 256, (256, 256, 3), dtype=np.uint8)

    augmented = train_transform(image=image)['image']

    assert augmented.shape == (3, 224, 224)

Troubleshooting

| Problem | Cause | Solution |
|---------|-------|----------|
| Overfitting | Small dataset | More augmentation |
| Slow training | Large images | Resize, use AMP |
| Poor detection | Wrong anchors | Adjust anchor sizes |
| Memory error | Batch too large | Reduce batch size |

Related Resources

  • Agent: 06-computer-vision
  • Previous: nlp-basics
  • Next: ml-deployment
  • Docs: timm

Version: 1.4.0 | Status: Production Ready