卷积神经网络（CNN）

图像识别、目标检测、医疗影像分析——这些任务背后几乎都是卷积神经网络。CNN 的核心思想是：用局部感知替代全连接，让网络自动学习图像的空间特征。本文从卷积运算的直觉出发，一步步搭建一个完整的图像分类模型。

为什么全连接网络处理不了图像

一张 224×224 的彩色图片有 224 × 224 × 3 = 150,528 个像素值。如果用全连接网络，第一层隐藏节点（假设 1024 个）就需要 150,528 × 1024 ≈ 1.5 亿个参数——参数爆炸，而且完全没利用像素的空间相关性（相邻像素往往属于同一物体）。

CNN 的解决思路：

局部感知：每个神经元只看图像的一小块区域（感受野）
权重共享：同一个卷积核扫过整张图，参数量大幅减少
层次特征：浅层学边缘/颜色，深层学纹理/部件/物体

卷积运算：核心直觉

卷积核（滤波器）是一个小矩阵（如 3×3），在图像上滑动扫描，每次对覆盖区域做点积，输出一个值：

输入图像（5×5）      卷积核（3×3）      输出特征图（3×3）
┌─────────────┐    ┌─────────┐        ┌─────────┐
│ 1  1  1  0  0│   │ 1  0  1 │        │ 4  3  4 │
│ 0  1  1  1  0│ × │ 0  1  0 │   =    │ 2  4  3 │
│ 0  0  1  1  1│   │ 1  0  1 │        │ 2  3  4 │
│ 0  0  1  1  0│   └─────────┘        └─────────┘
│ 0  1  1  0  0│
└─────────────┘

不同的卷积核学到不同的特征：有的学水平边缘，有的学垂直边缘，有的学斜线。这些卷积核的参数完全由反向传播自动学习，不需要人工设计。

池化层：降采样

池化（Pooling）把特征图缩小，减少计算量并提高平移不变性：

最大池化（2×2，步长 2）：
┌────────┐        ┌────┐
│ 1  3   │  →     │ 3  │    取每个 2×2 区域的最大值
│ 2  4   │        └────┘
└────────┘

import torch
import torch.nn as nn

# Convolution layer: 3 input channels (RGB) -> 32 output channels
# (i.e. 32 kernels), each kernel 3x3. With a 3x3 kernel, padding=1 keeps
# the output height/width equal to the input's.
conv = nn.Conv2d(3, 32, kernel_size=3, padding=1)

# Max pooling with a 2x2 window and stride 2: halves height and width.
pool = nn.MaxPool2d(2, stride=2)

# Show how the tensor shape changes at each step.
x = torch.randn(1, 3, 224, 224)     # [batch, channels, H, W]

x = conv(x)
print(x.shape)                      # [1, 32, 224, 224]

x = pool(x)
print(x.shape)                      # [1, 32, 112, 112]

搭建一个完整的 CNN：CIFAR-10 图像分类

CIFAR-10 包含 60,000 张 32×32 的彩色图片，10 个类别（飞机、汽车、鸟……）。

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import time


# 1. Data loading and data augmentation
# Per-channel mean / std used to normalise CIFAR-10 images; shared by the
# training and test pipelines so both see identically scaled inputs.
CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR10_STD = (0.2023, 0.1994, 0.2010)

# Training pipeline: random flip + random crop (augmentation), then
# tensor conversion and normalisation.
transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])

# Test pipeline: no augmentation, only tensor conversion and normalisation.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])

# Both splits are downloaded into ./data if not already present.
train_set = datasets.CIFAR10(
    root="./data", train=True, download=True, transform=transform_train,
)
test_set = datasets.CIFAR10(
    root="./data", train=False, download=True, transform=transform_test,
)

# Training batches are shuffled; evaluation uses larger, unshuffled batches.
train_loader = DataLoader(
    train_set, batch_size=128, shuffle=True, num_workers=4, pin_memory=True,
)
test_loader = DataLoader(
    test_set, batch_size=256, shuffle=False, num_workers=4,
)


# 2. 定义 CNN 结构
class SimpleCNN(nn.Module):
    """Small CNN classifier: three conv stages followed by a two-layer head.

    Each stage ends with a 2x2 max-pool that halves the spatial size. The
    head's first linear layer expects ``128 * 4 * 4`` features, i.e. a 4x4
    feature map after three poolings, which corresponds to 32x32 inputs.

    Args:
        num_classes: Number of output logits (default 10).
    """

    @staticmethod
    def _conv_bn_relu(in_channels, out_channels):
        """Return ``[3x3 conv (padding=1), BatchNorm2d, in-place ReLU]``."""
        return [
            nn.Conv2d(in_channels, out_channels, 3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        ]

    def __init__(self, num_classes=10):
        super().__init__()
        unit = self._conv_bn_relu

        # Feature extractor. Layers are listed in forward order so the
        # indices inside the Sequential (and thus state_dict keys) are fixed.
        feature_layers = [
            # Stage 1: 32x32 -> 16x16
            *unit(3, 32),
            *unit(32, 32),
            nn.MaxPool2d(2, 2),
            nn.Dropout2d(0.25),
            # Stage 2: 16x16 -> 8x8
            *unit(32, 64),
            *unit(64, 64),
            nn.MaxPool2d(2, 2),
            nn.Dropout2d(0.25),
            # Stage 3: 8x8 -> 4x4
            *unit(64, 128),
            nn.MaxPool2d(2, 2),
        ]
        self.features = nn.Sequential(*feature_layers)

        # Classification head: flatten, hidden layer with dropout, logits.
        head_layers = [
            nn.Flatten(),
            nn.Linear(128 * 4 * 4, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Map an image batch to class logits of shape ``[batch, num_classes]``."""
        feature_maps = self.features(x)
        logits = self.classifier(feature_maps)
        return logits


# 3. Training configuration
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = SimpleCNN()
model = model.to(device)

# Adam with weight decay; the learning rate follows a cosine schedule whose
# period (T_max) is 50 scheduler steps.
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)
criterion = nn.CrossEntropyLoss()

# Report the total number of parameters in the model.
print(f"参数总量: {sum(p.numel() for p in model.parameters()):,}")


# 4. 训练 & 评估循环
def train_epoch(model, loader, optimizer, criterion, device):
    """Run one training pass over ``loader`` and return averaged metrics.

    Args:
        model: Module to train; switched to training mode.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss callable invoked as ``criterion(outputs, targets)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy)`` over all samples seen in this pass.
    """
    model.train()

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for inputs, targets in loader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        batch_size = len(targets)
        # Weight the batch loss by batch size so the final figure is a
        # per-sample average (assumes criterion returns a batch mean).
        loss_sum += batch_loss.item() * batch_size
        n_correct += (logits.argmax(1) == targets).sum().item()
        n_seen += batch_size

    return loss_sum / n_seen, n_correct / n_seen


@torch.no_grad()
def eval_epoch(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Args:
        model: Module to evaluate; switched to evaluation mode.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, targets)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy)`` over all samples seen in this pass.
    """
    model.eval()

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for inputs, targets in loader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        logits = model(inputs)
        batch_loss = criterion(logits, targets)

        batch_size = len(targets)
        # Weight the batch loss by batch size so the final figure is a
        # per-sample average (assumes criterion returns a batch mean).
        loss_sum += batch_loss.item() * batch_size
        n_correct += (logits.argmax(1) == targets).sum().item()
        n_seen += batch_size

    return loss_sum / n_seen, n_correct / n_seen


# Full training run: each epoch trains on the training loader, evaluates on
# the test loader, then advances the learning-rate scheduler by one step.
NUM_EPOCHS = 50
for epoch in range(1, NUM_EPOCHS + 1):
    t0 = time.time()
    train_loss, train_acc = train_epoch(
        model, train_loader, optimizer, criterion, device,
    )
    val_loss, val_acc = eval_epoch(model, test_loader, criterion, device)
    scheduler.step()

    # Only report every fifth epoch to keep the log short.
    if epoch % 5 != 0:
        continue

    print(f"Epoch {epoch:3d} | "
          f"Train Loss {train_loss:.3f} Acc {train_acc:.3f} | "
          f"Val Loss {val_loss:.3f} Acc {val_acc:.3f} | "
          f"{time.time()-t0:.1f}s")

训练 50 轮后，这个简单 CNN 在 CIFAR-10 测试集上可达约 84% 的准确率。

经典 CNN 架构演进

模型	年份	创新点	ImageNet Top-1
AlexNet	2012	ReLU + Dropout + GPU	63.3%
VGG-16	2014	全用 3×3 小卷积核堆叠	74.5%
GoogLeNet	2014	Inception 模块，多尺度并行	74.8%
ResNet-50	2015	残差连接，首次训练 100+ 层	79.3%
EfficientNet-B7	2019	复合缩放（宽/深/分辨率）	88.4%
ConvNeXt	2022	用 Transformer 思路改造 CNN	87.8%

残差连接（ResNet）

深层网络（50 层以上）训练时容易出现梯度消失和性能退化，ResNet 用跳跃连接（残差连接）解决：

class ResidualBlock(nn.Module):
    """Two 3x3 conv + batch-norm layers wrapped by an identity shortcut.

    Both convolutions map ``channels`` to ``channels`` with padding 1, so the
    branch output has the same shape as the input and can be added to it.

    Args:
        channels: Number of input (and output) channels.
    """

    def __init__(self, channels):
        super().__init__()
        # bias=False: each conv is immediately followed by a BatchNorm2d,
        # which carries its own learnable shift.
        conv_cfg = dict(kernel_size=3, padding=1, bias=False)
        self.conv1 = nn.Conv2d(channels, channels, **conv_cfg)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, **conv_cfg)
        self.bn2 = nn.BatchNorm2d(channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return ``relu(F(x) + x)`` where ``F`` is conv-bn-relu-conv-bn."""
        shortcut = x  # keep the input for the skip connection

        branch = self.conv1(x)
        branch = self.bn1(branch)
        branch = self.relu(branch)
        branch = self.conv2(branch)
        branch = self.bn2(branch)

        # Residual connection: add the untouched input back onto F(x).
        merged = branch + shortcut
        return self.relu(merged)

直觉：即使 conv1 和 conv2 都学成了零，网络至少还有恒等映射（直接传递 x），不会退化。

实用技巧

Grad-CAM：可视化模型「看哪里」

import torch.nn.functional as F

def grad_cam(model, image, target_class, target_layer):
    """Compute a Grad-CAM heat map for ``target_class`` at ``target_layer``.

    The model is run on ``image.unsqueeze(0)``, the score of ``target_class``
    is back-propagated, and the gradient arriving at ``target_layer``'s output
    is averaged over the two spatial axes to weight that layer's activation
    channels.

    Args:
        model: Network to explain. Its train/eval mode is not changed here;
            call ``model.eval()`` first if dropout / batch-norm should run in
            inference mode. ``model.zero_grad()`` is called as a side effect
            and parameter gradients are populated by the backward pass.
        image: A single un-batched input tensor. ``image.shape[1:]`` is used
            as the heat-map size, so a ``[C, H, W]`` layout is expected.
        target_class: Index into the second axis of the model output.
        target_layer: Sub-module of ``model`` whose output is 4-D
            (gradients are averaged over dims 2 and 3).

    Returns:
        A NumPy array holding the ReLU-ed, bilinearly upsampled map with all
        singleton dimensions squeezed out (``[H, W]`` for a ``[C, H, W]``
        image with H, W > 1). Values are not rescaled to [0, 1].

    Note:
        PyTorch documents that modules with backward hooks must not have
        their inputs/outputs modified in place, so a ``target_layer`` that is
        directly followed by e.g. ``nn.ReLU(inplace=True)`` may raise.
    """
    activations, gradients = {}, {}

    def forward_hook(module, inputs, output):
        activations["value"] = output

    def backward_hook(module, grad_input, grad_output):
        gradients["value"] = grad_output[0]

    handles = [target_layer.register_forward_hook(forward_hook)]
    try:
        handles.append(
            target_layer.register_full_backward_hook(backward_hook)
        )
        out = model(image.unsqueeze(0))
        model.zero_grad()
        out[0, target_class].backward()
    finally:
        # Always detach the hooks, even if the forward/backward pass raised,
        # so repeated calls never leave stale hooks on the layer.
        for handle in handles:
            handle.remove()

    # The CAM arithmetic needs values only; detach so it builds no graph.
    layer_output = activations["value"].detach()
    layer_grad = gradients["value"].detach()

    # Global-average-pool the gradients to get one weight per channel.
    weights = layer_grad.mean(dim=[2, 3], keepdim=True)
    cam = (weights * layer_output).sum(dim=1, keepdim=True)
    cam = F.relu(cam)
    cam = F.interpolate(
        cam, size=image.shape[1:], mode="bilinear", align_corners=False,
    )

    # .cpu() makes the NumPy conversion work for CUDA tensors as well.
    return cam.squeeze().detach().cpu().numpy()

一句话小结

CNN = 卷积（局部感知 + 权重共享）+ 池化（降采样）+ 全连接（分类）。浅层学边缘，深层学语义；残差连接让百层网络得以训练；数据增强和 BatchNorm 是提升精度的关键。做计算机视觉任务时，优先尝试 ResNet 或 EfficientNet 等成熟架构，不要自己从头设计。

最后更新于 2026-06-18

Transformer 与注意力机制 | 序列模型：RNN 与 LSTM