Chapter 06: Tokenization — Byte Pair Encoding (BPE)

Character-level models from earlier chapters are intuitive but inefficient: every letter is a separate token, so a single sentence of 50 characters requires 50 forward passes during generation. Modern LLMs use subword tokenisation to find a middle ground between characters (vocabulary too small, sequences too long) and words (vocabulary too large, out-of-vocabulary problems).

Byte Pair Encoding (BPE) starts with a character-level vocabulary and iteratively merges the most frequent adjacent pair of tokens into a new token. After N merges, common words become single tokens while rare words are split into recognisable pieces. GPT-2 and GPT-4 both use BPE on the byte representation of UTF-8 text, which means every possible string can be encoded without any unknown-token issues.

In this chapter we implement BPE from scratch, then compare it to the character-level tokeniser to see how much sequence compression we achieve. We also show how to use the HuggingFace tokenizers library for a production-quality implementation.

The key insight is that tokenisation is a separate, pre-trained step: we train the tokeniser on raw text once, then use the resulting vocabulary and merge rules for all subsequent model training and inference.

1. Load a Text Corpus

import os
import re
from collections import defaultdict

DATA_DIR   = "data"
TRAIN_FILE = os.path.join(DATA_DIR, "tinystories_train.txt")
os.makedirs(DATA_DIR, exist_ok=True)

# Download TinyStories if not present
if not os.path.exists(TRAIN_FILE):
    from datasets import load_dataset
    ds = load_dataset("roneneldan/TinyStories", split="train", streaming=True)
    with open(TRAIN_FILE, "w", encoding="utf-8") as f:
        for i, ex in enumerate(ds):
            if i >= 50_000: break
            f.write(ex["text"].strip() + "\n")

with open(TRAIN_FILE, encoding="utf-8") as f:
    corpus = f.read()

# Use a smaller slice for BPE training (faster to demonstrate)
corpus_small = corpus[:500_000]
print(f"Corpus size: {len(corpus_small):,} chars")

2. BPE Tokeniser from Scratch

class BPETokenizer:
    """
    Minimal Byte Pair Encoding tokeniser.
    Operates on characters (not bytes) for clarity.
    """

    def __init__(self):
        self.vocab  = {}          # token_id → string
        self.merges = {}          # (a, b) → merged token string
        self.stoi   = {}          # string → token_id

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------

    def train(self, text: str, vocab_size: int, verbose: bool = True):
        """
        Train BPE until the vocabulary reaches `vocab_size`.
        `vocab_size` must be > number of unique characters in text.
        """
        # Start with all unique characters as base vocabulary
        chars = sorted(set(text))
        self.vocab = {i: c for i, c in enumerate(chars)}
        self.stoi  = {c: i for i, c in self.vocab.items()}

        # Split corpus into words separated by whitespace
        # Each word is represented as a list of character tokens
        word_freq = defaultdict(int)
        for word in text.split():
            word_freq[" ".join(list(word)) + " </w>"] += 1

        print(f"Initial vocab size: {len(self.vocab)}")

        while len(self.vocab) < vocab_size:
            # Count all adjacent symbol pairs weighted by word frequency
            pair_counts = defaultdict(int)
            for word, freq in word_freq.items():
                symbols = word.split()
                for a, b in zip(symbols, symbols[1:]):
                    pair_counts[(a, b)] += freq

            if not pair_counts:
                break

            # Find the most frequent pair
            best_pair = max(pair_counts, key=pair_counts.get)
            best_count = pair_counts[best_pair]
            merged = "".join(best_pair)

            if verbose and len(self.merges) % 100 == 0:
                print(f"  Merge #{len(self.merges):4d}: {best_pair} → '{merged}'"
                      f"  (freq={best_count:,})")

            # Record the merge rule
            self.merges[best_pair] = merged
            new_id = len(self.vocab)
            self.vocab[new_id] = merged
            self.stoi[merged] = new_id

            # Apply the merge to all words in the corpus
            bigram = re.escape(" ".join(best_pair))
            pattern = re.compile(r"(?<!\S)" + bigram + r"(?!\S)")
            word_freq = {
                pattern.sub(merged, word): freq
                for word, freq in word_freq.items()
            }

        print(f"Final vocab size: {len(self.vocab)}")

    # ------------------------------------------------------------------
    # Encoding / Decoding
    # ------------------------------------------------------------------

    def _apply_merges(self, word: str) -> list[str]:
        """Apply all learned merge rules to a single word."""
        symbols = list(word) + ["</w>"]
        for (a, b), merged in self.merges.items():
            i = 0
            while i < len(symbols) - 1:
                if symbols[i] == a and symbols[i + 1] == b:
                    symbols = symbols[:i] + [merged] + symbols[i + 2:]
                else:
                    i += 1
        return symbols

    def encode(self, text: str) -> list[int]:
        """Convert a string to a list of token IDs."""
        ids = []
        for word in text.split():
            for sym in self._apply_merges(word):
                ids.append(self.stoi.get(sym, 0))
        return ids

    def decode(self, ids: list[int]) -> str:
        """Convert token IDs back to a string."""
        tokens = [self.vocab.get(i, "?") for i in ids]
        text   = " ".join(tokens)
        text   = text.replace(" </w>", " ").replace("</w>", "")
        # Remove spaces within merged subwords
        text   = re.sub(r" (?=[^<\s])", "", text)
        return text.strip()

3. Train the BPE Tokeniser

tokenizer = BPETokenizer()
tokenizer.train(corpus_small, vocab_size=500, verbose=True)

print(f"\nVocabulary sample (last 20 tokens):")
ids = sorted(tokenizer.vocab.keys())
for i in ids[-20:]:
    print(f"  {i:4d}: {repr(tokenizer.vocab[i])}")

4. Encode / Decode Examples

sample_texts = [
    "Once upon a time there was a little girl.",
    "The dog ran quickly through the forest.",
    "She loved to read stories about dragons.",
]

for text in sample_texts:
    ids = tokenizer.encode(text)
    reconstructed = tokenizer.decode(ids)
    print(f"Original : {text}")
    print(f"Token IDs: {ids}")
    print(f"Decoded  : {reconstructed}")
    print()

5. Compare Compression Ratios

test_text = corpus_small[:10_000]

# Character-level tokenisation
char_tokens = len(test_text)

# BPE tokenisation
bpe_tokens = len(tokenizer.encode(test_text))

print(f"Test text: {char_tokens:,} characters")
print(f"Char-level tokens: {char_tokens:,}")
print(f"BPE tokens (v=500): {bpe_tokens:,}")
print(f"Compression ratio: {char_tokens / bpe_tokens:.2f}×")

6. Production BPE with HuggingFace tokenizers

from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders

# Build a BPE tokenizer using the fast Rust-based HuggingFace library
hf_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
hf_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
hf_tokenizer.decoder       = decoders.ByteLevel()

trainer = trainers.BpeTrainer(
    vocab_size     = 2_000,
    min_frequency  = 2,
    special_tokens = ["[UNK]", "[PAD]", "[BOS]", "[EOS]"],
    show_progress  = True,
)

# Train on the TinyStories file directly
hf_tokenizer.train([TRAIN_FILE], trainer)

# Save tokenizer
hf_tokenizer.save("data/tinystories_bpe_tokenizer.json")
print("Tokenizer saved → data/tinystories_bpe_tokenizer.json")

# Encode / decode test
sample = "Once upon a time, there was a brave little rabbit."
encoded = hf_tokenizer.encode(sample)
print(f"\nOriginal : {sample}")
print(f"Token IDs: {encoded.ids}")
print(f"Tokens   : {encoded.tokens}")
print(f"Decoded  : {hf_tokenizer.decode(encoded.ids)}")

7. Load a Pre-Trained GPT-2 Tokeniser

from transformers import AutoTokenizer

# GPT-2 uses a 50 257-token BPE vocabulary trained on WebText
gpt2_tok = AutoTokenizer.from_pretrained("gpt2")

text = "Once upon a time there was a tiny dragon."
ids  = gpt2_tok.encode(text)
print(f"GPT-2 tokenisation of: {repr(text)}")
print(f"  Tokens  : {gpt2_tok.convert_ids_to_tokens(ids)}")
print(f"  IDs     : {ids}")
print(f"  Chars   : {len(text)}")
print(f"  Tokens  : {len(ids)}")
print(f"  Ratio   : {len(text)/len(ids):.1f}×")

8. Summary

Method Vocab size Compression OOV handling
Character-level ~65 None
BPE (small) 500 ~3× Falls back to chars
BPE (GPT-2) 50 257 ~4× Byte fallback

Chapter 7 dives into the optimisation algorithms (SGD, Adam, AdamW) that train our models, including the crucial details of weight initialisation.