MNIST Classification with SimpleChains

SimpleChains.jl is an excellent framework for training small neural networks. In this tutorial we will demonstrate how to use the same API as Lux.jl to train a model using SimpleChains.jl. We will use the tutorial from SimpleChains.jl as a reference.

Package Imports

julia

using Lux, MLUtils, Optimisers, Zygote, OneHotArrays, Random, Statistics, Printf, Reactant
using MLDatasets: MNIST
using SimpleChains: SimpleChains

# Make "cpu" the default Reactant backend for everything that follows.
Reactant.set_default_backend("cpu")

2025-08-05 23:26:04.218954: I external/xla/xla/service/service.cc:163] XLA service 0xf461ca0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-08-05 23:26:04.219074: I external/xla/xla/service/service.cc:171]   StreamExecutor device (0): Quadro RTX 5000, Compute Capability 7.5
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
I0000 00:00:1754436364.220125  318776 se_gpu_pjrt_client.cc:1373] Using BFC allocator.
I0000 00:00:1754436364.220311  318776 gpu_helpers.cc:136] XLA backend allocating 12528893952 bytes on device 0 for BFCAllocator.
I0000 00:00:1754436364.220432  318776 gpu_helpers.cc:177] XLA backend will use up to 4176297984 bytes on device 0 for CollectiveBFCAllocator.
2025-08-05 23:26:04.234863: I external/xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 90800

Loading MNIST

julia

"""
    loadmnist(batchsize, train_split)

Build the MNIST training and evaluation `DataLoader`s.

The `:train` split of MNIST is loaded (only its first 1500 samples when the `CI`
environment variable parses to `true`). Images are converted to a `Float32` array with a
singleton channel axis, labels are one-hot encoded over `0:9`, and the observations are
divided at the fraction `train_split`.

Returns the tuple `(train_loader, test_loader)`. Both loaders use `batchsize` and drop the
final incomplete batch; only the training loader shuffles.
"""
function loadmnist(batchsize, train_split)
    # `nothing` means "use every sample"; CI runs are restricted to a small subset.
    nkeep = parse(Bool, get(ENV, "CI", "false")) ? 1500 : nothing
    dataset = MNIST(; split=:train)
    imgs, labels_raw = if isnothing(nkeep)
        dataset.features, dataset.targets
    else
        dataset.features[:, :, 1:nkeep], dataset.targets[1:nkeep]
    end

    # Insert a singleton channel axis so the images form (H, W, C, BS) batches.
    height, width, nobs = size(imgs, 1), size(imgs, 2), size(imgs, 3)
    x_data = Float32.(reshape(imgs, height, width, 1, nobs))
    y_data = onehotbatch(labels_raw, 0:9)

    (x_train, y_train), (x_test, y_test) = splitobs((x_data, y_data); at=train_split)

    # Materialise each split before handing it to the loader; shuffle only for training.
    train_loader = DataLoader(
        map(collect, (x_train, y_train)); batchsize, shuffle=true, partial=false
    )
    test_loader = DataLoader(
        map(collect, (x_test, y_test)); batchsize, shuffle=false, partial=false
    )
    return (train_loader, test_loader)
end

Define the Model

julia

# Convolutional classifier: two Conv/MaxPool stages followed by a three-layer dense head.
# The shape comments assume a 28×28×1 input image.
lux_model = let
    # Dense head mapping the 256 flattened features to 10 class scores.
    classifier = Chain(Dense(256 => 128, relu), Dense(128 => 84, relu), Dense(84 => 10))

    Chain(
        Conv((5, 5), 1 => 6, relu),   # 28×28×1 -> 24×24×6
        MaxPool((2, 2)),              # -> 12×12×6
        Conv((5, 5), 6 => 16, relu),  # -> 8×8×16
        MaxPool((2, 2)),              # -> 4×4×16
        FlattenLayer(3),              # -> 256 features per sample
        classifier,
    )
end

Chain(
    layer_1 = Conv((5, 5), 1 => 6, relu),  # 156 parameters
    layer_2 = MaxPool((2, 2)),
    layer_3 = Conv((5, 5), 6 => 16, relu),  # 2_416 parameters
    layer_4 = MaxPool((2, 2)),
    layer_5 = Lux.FlattenLayer{Static.StaticInt{3}}(static(3)),
    layer_6 = Chain(
        layer_1 = Dense(256 => 128, relu),  # 32_896 parameters
        layer_2 = Dense(128 => 84, relu),  # 10_836 parameters
        layer_3 = Dense(84 => 10),      # 850 parameters
    ),
)         # Total: 47_154 parameters,
          #        plus 0 states.

We now need to convert the lux_model to SimpleChains.jl. We do this by constructing a ToSimpleChainsAdaptor with the input dimensions and applying it to the model.

julia

# The adaptor is constructed with the per-sample input dimensions (no batch axis).
adaptor = ToSimpleChainsAdaptor((28, 28, 1))
# Applying the adaptor to the Lux model yields its SimpleChains-backed counterpart.
simple_chains_model = lux_model |> adaptor

SimpleChainsLayer(
    Chain(
        layer_1 = Conv((5, 5), 1 => 6, relu),  # 156 parameters
        layer_2 = MaxPool((2, 2)),
        layer_3 = Conv((5, 5), 6 => 16, relu),  # 2_416 parameters
        layer_4 = MaxPool((2, 2)),
        layer_5 = Lux.FlattenLayer{Static.StaticInt{3}}(static(3)),
        layer_6 = Chain(
            layer_1 = Dense(256 => 128, relu),  # 32_896 parameters
            layer_2 = Dense(128 => 84, relu),  # 10_836 parameters
            layer_3 = Dense(84 => 10),  # 850 parameters
        ),
    ),
)         # Total: 47_154 parameters,
          #        plus 0 states.

Helper Functions

julia

# Loss used for training: cross-entropy configured with `logits=Val(true)`, i.e. it is
# applied to the raw model outputs.
const lossfn = CrossEntropyLoss(; logits=Val(true))

"""
    accuracy(model, ps, st, dataloader)

Return the fraction of observations in `dataloader` whose predicted class equals the
target class.

`model` is called as `model(x, ps, st)` with `st` switched to test mode, and the first
element of its result is taken as the prediction. Predictions and targets are copied
into `Array`s before being decoded with `onecold`.
"""
function accuracy(model, ps, st, dataloader)
    eval_st = Lux.testmode(st)
    n_correct = 0
    n_seen = 0
    for (x, y) in dataloader
        truth = onecold(Array(y))
        output = first(model(x, ps, eval_st))
        guess = onecold(Array(output))
        # Count the positions where prediction and target agree.
        n_correct += count(truth .== guess)
        n_seen += length(truth)
    end
    return n_correct / n_seen
end

accuracy (generic function with 1 method)

Define the Training Loop

julia

"""
    train(model, dev=cpu_device(); rng=Random.default_rng(), kwargs...)

Train `model` on MNIST for 10 epochs on device `dev`, printing the epoch time and the
training/test accuracy after every epoch.

Data comes from `loadmnist(128, 0.9)`, parameters and states from
`Lux.setup(rng, model)`, and optimisation uses `Adam(3.0f-4)` with `lossfn`. On a
`ReactantDevice`, gradients use `AutoEnzyme()` and accuracy is evaluated through a
compiled forward pass; otherwise `AutoZygote()` and `model` itself are used. Additional
`kwargs` are accepted but not used.

Returns `(tr_acc, te_acc)`: training and test accuracy in percent after the last epoch.
"""
function train(model, dev=cpu_device(); rng=Random.default_rng(), kwargs...)
    train_dataloader, test_dataloader = dev(loadmnist(128, 0.9))
    ps, st = dev(Lux.setup(rng, model))

    on_reactant = dev isa ReactantDevice
    ad_backend = on_reactant ? AutoEnzyme() : AutoZygote()

    train_state = Training.TrainState(model, ps, st, Adam(3.0f-4))

    # Callable used for evaluation: a compiled forward pass on Reactant, else the model.
    evaluator = if on_reactant
        # Compile against the inputs of the first test batch.
        sample_x = first(first(test_dataloader))
        Reactant.with_config(;
            dot_general_precision=PrecisionConfig.HIGH,
            convolution_precision=PrecisionConfig.HIGH,
        ) do
            @compile model(sample_x, ps, Lux.testmode(st))
        end
    else
        model
    end

    nepochs = 10
    train_pct, test_pct = 0.0, 0.0
    for epoch in 1:nepochs
        # Only the optimisation steps are timed; evaluation happens afterwards.
        epoch_start = time()
        for batch in train_dataloader
            # The updated training state is the last element of the step result.
            train_state = last(
                Training.single_train_step!(ad_backend, lossfn, batch, train_state)
            )
        end
        elapsed = time() - epoch_start

        params, states = train_state.parameters, train_state.states
        train_pct = accuracy(evaluator, params, states, train_dataloader) * 100
        test_pct = accuracy(evaluator, params, states, test_dataloader) * 100

        @printf "[%2d/%2d] \t Time %.2fs \t Training Accuracy: %.2f%% \t Test Accuracy: \
                 %.2f%%\n" epoch nepochs elapsed train_pct test_pct
    end

    return train_pct, test_pct
end

Finally Training the Model

First we will train the Lux model

julia

# Train the Lux model on the device returned by `reactant_device()`; the result is the
# final training and test accuracy.
tr_acc, te_acc = train(lux_model, reactant_device())

[ 1/10] 	 Time 460.88s 	 Training Accuracy: 14.92% 	 Test Accuracy: 10.94%
[ 2/10] 	 Time 0.11s 	 Training Accuracy: 30.00% 	 Test Accuracy: 23.44%
[ 3/10] 	 Time 0.17s 	 Training Accuracy: 45.86% 	 Test Accuracy: 36.72%
[ 4/10] 	 Time 0.09s 	 Training Accuracy: 55.78% 	 Test Accuracy: 51.56%
[ 5/10] 	 Time 0.09s 	 Training Accuracy: 65.16% 	 Test Accuracy: 58.59%
[ 6/10] 	 Time 0.10s 	 Training Accuracy: 70.70% 	 Test Accuracy: 67.19%
[ 7/10] 	 Time 0.10s 	 Training Accuracy: 74.92% 	 Test Accuracy: 66.41%
[ 8/10] 	 Time 0.10s 	 Training Accuracy: 78.67% 	 Test Accuracy: 71.09%
[ 9/10] 	 Time 0.16s 	 Training Accuracy: 82.27% 	 Test Accuracy: 72.66%
[10/10] 	 Time 0.14s 	 Training Accuracy: 83.52% 	 Test Accuracy: 74.22%

Now we will train the SimpleChains model

julia

# Train the converted model; with no device argument `train` falls back to `cpu_device()`.
tr_acc, te_acc = train(simple_chains_model)

[ 1/10] 	 Time 1034.72s 	 Training Accuracy: 27.03% 	 Test Accuracy: 26.56%
[ 2/10] 	 Time 12.42s 	 Training Accuracy: 50.31% 	 Test Accuracy: 51.56%
[ 3/10] 	 Time 12.39s 	 Training Accuracy: 62.27% 	 Test Accuracy: 59.38%
[ 4/10] 	 Time 12.39s 	 Training Accuracy: 70.00% 	 Test Accuracy: 66.41%
[ 5/10] 	 Time 12.32s 	 Training Accuracy: 79.22% 	 Test Accuracy: 69.53%
[ 6/10] 	 Time 12.29s 	 Training Accuracy: 80.78% 	 Test Accuracy: 71.88%
[ 7/10] 	 Time 12.28s 	 Training Accuracy: 83.83% 	 Test Accuracy: 78.12%
[ 8/10] 	 Time 12.30s 	 Training Accuracy: 84.69% 	 Test Accuracy: 79.69%
[ 9/10] 	 Time 12.28s 	 Training Accuracy: 86.17% 	 Test Accuracy: 82.81%
[10/10] 	 Time 12.29s 	 Training Accuracy: 87.66% 	 Test Accuracy: 82.03%

On my local machine we see a 3-4x speedup when using SimpleChains.jl. The server this documentation is built on is not ideal for CPU benchmarking; hence, the speedup may not be as significant, and there might even be regressions.

Appendix

julia

# Print the Julia/platform details, followed by GPU toolkit details for each GPU package
# that is loaded in this session and reported functional by MLDataDevices.
using InteractiveUtils
InteractiveUtils.versioninfo()

# Short-circuiting keeps names from packages that are not loaded from being evaluated.
if @isdefined(MLDataDevices) && @isdefined(CUDA) && MLDataDevices.functional(CUDADevice)
    println()
    CUDA.versioninfo()
end

if @isdefined(MLDataDevices) && @isdefined(AMDGPU) && MLDataDevices.functional(AMDGPUDevice)
    println()
    AMDGPU.versioninfo()
end

Julia Version 1.11.6
Commit 9615af0f269 (2025-07-09 12:58 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 48 × AMD EPYC 7402 24-Core Processor
  WORD_SIZE: 64
  LLVM: libLLVM-16.0.6 (ORCJIT, znver2)
Threads: 48 default, 0 interactive, 24 GC (on 2 virtual cores)
Environment:
  JULIA_CPU_THREADS = 2
  LD_LIBRARY_PATH = /usr/local/nvidia/lib:/usr/local/nvidia/lib64
  JULIA_PKG_SERVER = 
  JULIA_NUM_THREADS = 48
  JULIA_CUDA_HARD_MEMORY_LIMIT = 100%
  JULIA_PKG_PRECOMPILE_AUTO = 0
  JULIA_DEBUG = Literate
  JULIA_DEPOT_PATH = /root/.cache/julia-buildkite-plugin/depots/01872db4-8c79-43af-ab7d-12abac4f24f6

This page was generated using Literate.jl.

Trusted by

MNIST Classification with SimpleChains ​

Package Imports ​

Loading MNIST ​

Define the Model ​

Helper Functions ​

Define the Training Loop ​

Finally Training the Model ​

Appendix ​

MNIST Classification with SimpleChains

Package Imports

Loading MNIST

Define the Model

Helper Functions

Define the Training Loop

Finally Training the Model

Appendix