Chapter 02: Micrograd — Backpropagation from Scratch

Modern deep learning frameworks like PyTorch hide the mechanics of backpropagation behind a clean API. To truly understand how neural networks learn, it helps to build the machinery yourself. In this chapter we implement micrograd: a miniature scalar-valued automatic differentiation engine that computes gradients via reverse-mode autodiff (backpropagation).

Every operation we support (+, *, relu, exp, log) records its inputs and a backward function that knows how to propagate gradients back through that operation. After a forward pass we call .backward() on the final scalar loss and the engine walks the computation graph in reverse topological order, accumulating ∂loss/∂node for every node.

This is exactly what PyTorch’s autograd does — just over tensors instead of scalars. Once you understand the scalar version the tensor version is conceptually identical; the shapes just add an extra dimension.

We then stack these scalar operations into a tiny multi-layer perceptron (MLP) and train it on XOR, the canonical non-linearly-separable problem. XOR requires at least one hidden layer, so it’s the simplest problem that cannot be solved with a linear model — making it the perfect test for backprop correctness.

1. The `Value` Class

import math

class Value:
    """A scalar value that tracks its computation graph for backprop."""

    def __init__(self, data: float, _children=(), _op="", label=""):
        self.data = float(data)
        self.grad = 0.0           # ∂loss/∂self, initialised to 0
        self._backward = lambda: None   # filled in by each operation
        self._prev = set(_children)
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(data={self.data:.4f}, grad={self.grad:.4f})"

    # ------------------------------------------------------------------ #
    # Forward operations                                                   #
    # ------------------------------------------------------------------ #

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), "+")

        def _backward():
            # d(out)/d(self) = 1, d(out)/d(other) = 1
            self.grad  += out.grad
            other.grad += out.grad
        out._backward = _backward
        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), "*")

        def _backward():
            # d(out)/d(self) = other.data, d(out)/d(other) = self.data
            self.grad  += other.data * out.grad
            other.grad += self.data  * out.grad
        out._backward = _backward
        return out

    def __pow__(self, exponent: float):
        assert isinstance(exponent, (int, float))
        out = Value(self.data ** exponent, (self,), f"**{exponent}")

        def _backward():
            self.grad += (exponent * self.data ** (exponent - 1)) * out.grad
        out._backward = _backward
        return out

    def exp(self):
        e = math.exp(self.data)
        out = Value(e, (self,), "exp")

        def _backward():
            self.grad += e * out.grad   # d(e^x)/dx = e^x
        out._backward = _backward
        return out

    def log(self):
        assert self.data > 0, "log of non-positive number"
        out = Value(math.log(self.data), (self,), "log")

        def _backward():
            self.grad += (1.0 / self.data) * out.grad
        out._backward = _backward
        return out

    def relu(self):
        out = Value(max(0.0, self.data), (self,), "ReLU")

        def _backward():
            self.grad += (1.0 if self.data > 0 else 0.0) * out.grad
        out._backward = _backward
        return out

    # ------------------------------------------------------------------ #
    # Convenience wrappers so Python operators work symmetrically          #
    # ------------------------------------------------------------------ #

    def __radd__(self, other): return self + other
    def __rmul__(self, other): return self * other
    def __sub__(self, other):  return self + (-1 * other)
    def __rsub__(self, other): return other + (-1 * self)
    def __truediv__(self, other): return self * other**-1
    def __neg__(self): return self * -1

    # ------------------------------------------------------------------ #
    # Backpropagation                                                      #
    # ------------------------------------------------------------------ #

    def backward(self):
        """Compute gradients for all ancestors via reverse-mode autodiff."""
        topo = []
        visited = set()

        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)

        build_topo(self)
        self.grad = 1.0          # d(loss)/d(loss) = 1
        for node in reversed(topo):
            node._backward()

2. Verify Gradient Correctness

# Manual check: f(x, y) = (x + y) * x
x = Value(2.0, label="x")
y = Value(3.0, label="y")
z = (x + y) * x      # z = x² + xy = 4 + 6 = 10
z.backward()

# Analytically: dz/dx = 2x + y = 7, dz/dy = x = 2
print(f"z  = {z.data}")          # 10.0
print(f"dz/dx = {x.grad}")       # 7.0 ✓
print(f"dz/dy = {y.grad}")       # 2.0 ✓
assert abs(x.grad - 7.0) < 1e-9
assert abs(y.grad - 2.0) < 1e-9
print("Gradient check: PASSED")

3. Build a Neuron, Layer, and MLP

import random

class Neuron:
    def __init__(self, n_in: int):
        # Weights and bias initialised from U(-1, 1)
        self.w = [Value(random.uniform(-1, 1)) for _ in range(n_in)]
        self.b = Value(0.0)

    def __call__(self, x):
        # Weighted sum + bias, then ReLU activation
        act = sum((wi * xi for wi, xi in zip(self.w, x)), self.b)
        return act.relu()

    def parameters(self):
        return self.w + [self.b]


class Layer:
    def __init__(self, n_in: int, n_out: int):
        self.neurons = [Neuron(n_in) for _ in range(n_out)]

    def __call__(self, x):
        return [n(x) for n in self.neurons]

    def parameters(self):
        return [p for n in self.neurons for p in n.parameters()]


class MLP:
    def __init__(self, n_in: int, layer_sizes: list[int]):
        sizes = [n_in] + layer_sizes
        self.layers = [Layer(sizes[i], sizes[i + 1]) for i in range(len(layer_sizes))]

    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        # Return scalar if last layer has one neuron
        return x[0] if len(x) == 1 else x

    def parameters(self):
        return [p for layer in self.layers for p in layer.parameters()]

    def zero_grad(self):
        for p in self.parameters():
            p.grad = 0.0

4. Train on XOR

# XOR dataset: inputs → target
XOR_X = [[0, 0], [0, 1], [1, 0], [1, 1]]
XOR_Y = [0.0,    1.0,    1.0,    0.0]   # XOR labels

random.seed(42)
model = MLP(n_in=2, layer_sizes=[4, 4, 1])  # 2 hidden layers
print(f"Parameter count: {len(model.parameters())}")

learning_rate = 0.1
losses = []

for step in range(200):
    # ----- Forward pass -----
    preds = [model([Value(xi) for xi in x]) for x in XOR_X]

    # Mean squared error loss
    loss = sum((pred - y) ** 2 for pred, y in zip(preds, XOR_Y))
    loss = loss * (1.0 / len(XOR_Y))   # average

    # ----- Backward pass -----
    model.zero_grad()
    loss.backward()

    # ----- Gradient descent update -----
    for p in model.parameters():
        p.data -= learning_rate * p.grad

    losses.append(loss.data)
    if step % 40 == 0:
        print(f"Step {step:3d}  Loss: {loss.data:.6f}")

print("\nFinal predictions:")
for x, y in zip(XOR_X, XOR_Y):
    pred = model([Value(xi) for xi in x])
    print(f"  XOR{x} = {y:.0f}  →  model: {pred.data:.4f}")

5. Visualise the Loss Curve

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 4))
plt.plot(losses)
plt.xlabel("Training step")
plt.ylabel("MSE Loss")
plt.title("Micrograd MLP training on XOR")
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig("data/micrograd_loss.png", dpi=100)
print("Saved → data/micrograd_loss.png")

6. Gradient Flow Visualisation

def print_graph(v, indent=0, visited=None):
    """Print the computation graph recursively."""
    if visited is None:
        visited = set()
    if id(v) in visited:
        return
    visited.add(id(v))
    label = f"[{v._op}] " if v._op else ""
    name  = v.label if v.label else ""
    print(" " * indent + f"{label}{name} data={v.data:.3f} grad={v.grad:.3f}")
    for child in v._prev:
        print_graph(child, indent + 2, visited)

# Re-run forward/backward on a tiny example for visualisation
a = Value(2.0, label="a")
b = Value(-3.0, label="b")
c = Value(5.0, label="c")
e = a * b; e.label = "e"
d = e + c; d.label = "d"
f = d.relu(); f.label = "f"
f.backward()
print("Computation graph (with gradients):")
print_graph(f)

7. Summary

Concept	Implementation
Forward pass	Each op stores `data` and a `_backward` closure
Reverse topological sort	Ensures each node’s gradient is fully accumulated before it propagates
Chain rule	`self.grad += local_grad * out.grad` in every `_backward`
Parameter update	`p.data -= lr * p.grad` — the fundamental SGD step

Chapter 3 will replace this scalar engine with PyTorch tensors and build a proper N-gram neural language model with embeddings and matrix multiplications.