Computer Vision Skill | Agent Skills

Computer Vision

Build models to analyze and understand visual data.

Quick Start

Image Classification

import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image

# Load a pre-trained ResNet-50 and switch it to inference mode.
# NOTE(review): newer torchvision releases deprecate `pretrained=True` in
# favour of a `weights=` argument — confirm against the installed version.
model = models.resnet50(pretrained=True)
model.eval()

# Preprocess image: resize, take the central 224x224 crop, convert to a
# tensor, then normalize each of the three channels.
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

# Force three RGB channels. Grayscale, palette, or RGBA files would otherwise
# yield a tensor whose channel count does not match the three-value mean/std
# given to Normalize above.
img = Image.open('image.jpg').convert('RGB')
img_tensor = transform(img).unsqueeze(0)  # add a leading batch dimension

# Predict
with torch.no_grad():  # no gradients are needed for inference
    output = model(img_tensor)
    # output[0] selects the single image in the batch; softmax turns its
    # scores into probabilities.
    probabilities = torch.nn.functional.softmax(output[0], dim=0)
    top5 = torch.topk(probabilities, 5)

# top5 holds the five largest probabilities together with their indices
print(top5)

Custom CNN

import torch.nn as nn

class SimpleCNN(nn.Module):
    """Small three-stage convolutional classifier for 3-channel images.

    Each stage is ``Conv2d(3x3, padding=1) -> ReLU -> MaxPool2d(2, 2)``, so
    every stage halves the spatial size. The resulting feature map is pooled
    to a fixed 4x4 grid before the fully connected head, which lets the same
    network accept inputs of different sizes (at least 8x8 pixels, so that the
    feature map is non-empty). For 32x32 inputs the feature map is already 4x4
    and the pooling step leaves it unchanged.

    Args:
        num_classes: Number of output scores produced per image.
    """

    def __init__(self, num_classes: int = 10) -> None:
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2)
        )
        # Parameter-free pooling to a fixed 4x4 grid: keeps the Linear layer's
        # expected 128 * 4 * 4 inputs valid regardless of the image size and
        # adds nothing to the state dict.
        self.pool = nn.AdaptiveAvgPool2d((4, 4))
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * 4 * 4, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        """Compute class scores for a batch of images.

        Args:
            x: Tensor of shape (batch, 3, height, width).

        Returns:
            Tensor of shape (batch, num_classes) with unnormalized scores.
        """
        x = self.features(x)
        x = self.pool(x)
        x = self.classifier(x)
        return x

Data Augmentation

from torchvision import transforms

# Training-time augmentation pipeline, assembled from three groups that are
# applied in order: random geometric changes, random colour changes, and
# finally tensor conversion plus per-channel normalization.
geometric_steps = [
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
]

color_steps = [
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
]

tensor_steps = [
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]

train_transform = transforms.Compose(geometric_steps + color_steps + tensor_steps)

Object Detection with YOLO

from ultralytics import YOLO

# Load model from the 'yolov8n.pt' checkpoint
model = YOLO('yolov8n.pt')

# Predict
results = model('image.jpg')

# Process results
for result in results:
    for box in result.boxes:
        # Convert the tensor values to plain Python numbers so the class id
        # prints as an integer (not "0.0") and the coordinates can be used
        # directly by code that expects ordinary floats.
        x1, y1, x2, y2 = box.xyxy[0].tolist()
        confidence = float(box.conf[0])
        class_id = int(box.cls[0])
        print(f"Class: {class_id}, Confidence: {confidence:.2f}")
        print(f"Box: ({x1}, {y1}, {x2}, {y2})")

# Save results for the first entry in `results`
results[0].save('output.jpg')

Image Segmentation

# Semantic segmentation with DeepLab
# Fetch the 'deeplabv3_resnet50' entry point from the pytorch/vision hub
# repository (pinned to tag v0.10.0) and put it straight into inference mode;
# eval() returns the module itself, so the calls can be chained.
model = torch.hub.load(
    'pytorch/vision:v0.10.0', 'deeplabv3_resnet50', pretrained=True
).eval()

# Preprocess: tensor conversion followed by per-channel normalization
to_tensor = transforms.ToTensor()
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
preprocess = transforms.Compose([to_tensor, normalize])

# `img` must already be loaded; unsqueeze adds the leading batch dimension
input_tensor = torch.unsqueeze(preprocess(img), 0)

# Predict
with torch.no_grad():
    model_outputs = model(input_tensor)
    # Take the 'out' entry for the single image in the batch
    output = model_outputs['out'][0]
    # Index of the highest score along the first axis
    # (presumably the class axis, giving one label per pixel — TODO confirm)
    output_predictions = torch.argmax(output, dim=0)

Transfer Learning

import torch.nn as nn
import torch.optim as optim
from torchvision import models

# Number of classes in the new task; set this to match your dataset
num_classes = 10

# Load pre-trained ResNet
model = models.resnet50(pretrained=True)

# Freeze all layers so the pre-trained weights receive no gradient updates
for param in model.parameters():
    param.requires_grad = False

# Replace final layer. The new Linear layer is created after the freeze loop,
# so its parameters are still trainable.
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, num_classes)

# Train only final layer: the optimizer is given just the new head's parameters
optimizer = optim.Adam(model.fc.parameters(), lr=0.001)

Image Processing with OpenCV

import cv2

# Read image
img = cv2.imread('image.jpg')
# cv2.imread reports failure by returning None instead of raising, so stop
# here with a clear message rather than failing later inside cvtColor.
if img is None:
    raise FileNotFoundError("Could not read image: 'image.jpg'")

# Convert to grayscale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Edge detection (100 and 200 are the two hysteresis thresholds)
edges = cv2.Canny(gray, 100, 200)

# Blur with a 5x5 Gaussian kernel
blurred = cv2.GaussianBlur(img, (5, 5), 0)

# Resize
resized = cv2.resize(img, (224, 224))

# Draw rectangle (modifies img in place). The corners are integer pixel
# coordinates; these are example values — replace them with your own box.
x1, y1, x2, y2 = 50, 50, 200, 200
cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)

# Save; imwrite returns False when the image could not be written
if not cv2.imwrite('output.jpg', img):
    raise OSError("Could not write image: 'output.jpg'")

Face Detection

# Haar Cascade
# Build the path to the frontal-face cascade file shipped in OpenCV's data
# directory, then load the classifier from it.
cascade_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
face_cascade = cv2.CascadeClassifier(cascade_path)

# Detection runs on a grayscale copy of `img` (which must already be loaded)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=4)

# Each detection is (x, y, width, height); outline it on the original image
for left, top, width, height in faces:
    top_left = (left, top)
    bottom_right = (left + width, top + height)
    cv2.rectangle(img, top_left, bottom_right, (255, 0, 0), 2)

Common Architectures

Image Classification:

ResNet: Skip connections, deep networks
EfficientNet: Compound scaling, efficient
Vision Transformer (ViT): Attention-based

Object Detection:

YOLO: Real-time, one-stage
Faster R-CNN: Two-stage, accurate
RetinaNet: Focal loss, handles class imbalance

Segmentation:

U-Net: Encoder-decoder, medical imaging
DeepLab: Atrous convolution, semantic segmentation
Mask R-CNN: Instance segmentation

Tips

Use pre-trained models for transfer learning
Apply data augmentation to prevent overfitting
Normalize images with the ImageNet mean and standard deviation when using ImageNet pre-trained models
Use appropriate loss functions (CrossEntropy, Focal Loss)
Monitor training with visualization
Test on diverse images

Agent Skills: Computer Vision

Install this agent skill on your local machine

Skill Files

Computer Vision

Quick Start

Image Classification

Custom CNN

Data Augmentation

Object Detection with YOLO

Image Segmentation

Transfer Learning

Image Processing with OpenCV

Face Detection

Common Architectures

Tips