Skip to content

Convolutional VAE for MNIST using Reactant

Convolutional variational autoencoder (CVAE) implementation in Lux (compiled with Reactant) using MNIST. This is based on the CVAE implementation in MLX.

julia
using Lux, Reactant, MLDatasets, Random, Statistics, Enzyme, MLUtils, DataAugmentation,
    ConcreteStructs, OneHotArrays, ImageShow, Images, Printf, Optimisers

# Device handles: `xdev` transfers data to the Reactant device and `cdev` to the CPU.
# NOTE(review): `force = true` presumably makes `reactant_device` error instead of
# silently falling back when Reactant is not functional — confirm against MLDataDevices.
const xdev = reactant_device(; force = true)
const cdev = cpu_device()
(::MLDataDevices.CPUDevice) (generic function with 1 method)

Model Definition

First we will define the encoder. It maps the input to a normal distribution in latent space and samples a latent vector from that distribution.

julia
"""
    cvae_encoder([rng]; num_latent_dims, image_shape, max_num_filters)

Build the CVAE encoder as a `@compact` layer.

The layer embeds the input with three stride-2 convolution + batch-norm stages, flattens
the result, and projects it to a mean `μ` and a log-variance `logσ²` of size
`num_latent_dims`. A latent sample `z = μ + σ ⊙ ϵ` is then drawn with the
reparameterization trick.

# Arguments
- `rng`: random number generator stored in the layer and replicated on every forward
  pass to draw `ϵ` (defaults to `Random.default_rng()`).

# Keywords
- `num_latent_dims`: dimensionality of the latent space.
- `image_shape`: `(height, width, channels)` of the input images.
- `max_num_filters`: number of channels produced by the last convolution; the earlier
  stages use `max_num_filters ÷ 4` and `max_num_filters ÷ 2`.

# Returns
A layer whose forward pass returns `(z, μ, logσ²)`, where `logσ²` has been clamped to
`[-20, 10]`.
"""
function cvae_encoder(
        rng = Random.default_rng(); num_latent_dims::Int,
        image_shape::Dims{3}, max_num_filters::Int
    )
    # Three stride-2 convolutions shrink each spatial dimension by a factor of 8, so the
    # flattened feature size is (H ÷ 8) * (W ÷ 8) * max_num_filters.
    # NOTE(review): this assumes H and W are multiples of 8 — confirm for other sizes.
    flattened_dim = prod(image_shape[1:2] .÷ 8) * max_num_filters
    return @compact(;
        embed = Chain(
            Chain(
                Conv((3, 3), image_shape[3] => max_num_filters ÷ 4; stride = 2, pad = 1),
                BatchNorm(max_num_filters ÷ 4, leakyrelu)
            ),
            Chain(
                Conv((3, 3), max_num_filters ÷ 4 => max_num_filters ÷ 2; stride = 2, pad = 1),
                BatchNorm(max_num_filters ÷ 2, leakyrelu)
            ),
            Chain(
                Conv((3, 3), max_num_filters ÷ 2 => max_num_filters; stride = 2, pad = 1),
                BatchNorm(max_num_filters, leakyrelu)
            ),
            FlattenLayer()
        ),
        proj_mu = Dense(flattened_dim, num_latent_dims; init_bias = zeros32),
        proj_log_var = Dense(flattened_dim, num_latent_dims; init_bias = zeros32),
        rng
    ) do x
        y = embed(x)

        μ = proj_mu(y)
        logσ² = proj_log_var(y)

        # Clamp the log-variance before exponentiating to keep σ in a finite range
        T = eltype(logσ²)
        logσ² = clamp.(logσ², -T(20.0f0), T(10.0f0))
        σ = exp.(logσ² .* T(0.5))

        # Generate a tensor of random values from a normal distribution
        rng = Lux.replicate(rng)
        ϵ = randn_like(rng, σ)

        # Reparameterization trick to backpropagate through sampling
        z = ϵ .* σ .+ μ

        @return z, μ, logσ²
    end
end
cvae_encoder (generic function with 2 methods)

Similarly we define the decoder.

julia
"""
    cvae_decoder(; num_latent_dims, image_shape, max_num_filters)

Build the CVAE decoder as a `@compact` layer.

A latent vector is projected by a dense layer, reshaped to a
`(H ÷ 8, W ÷ 8, max_num_filters, batch)` feature map, and passed through three
upsample-by-2 + convolution stages. The final convolution maps to `image_shape[3]`
channels with a `sigmoid` activation.

# Keywords
- `num_latent_dims`: dimensionality of the latent input.
- `image_shape`: `(height, width, channels)` of the images to reconstruct.
- `max_num_filters`: number of channels of the feature map fed to the first stage.

# Returns
A layer whose forward pass returns the output of the upsampling chain.
"""
function cvae_decoder(; num_latent_dims::Int, image_shape::Dims{3}, max_num_filters::Int)
    # Must agree with the reshape below: (H ÷ 8) * (W ÷ 8) * max_num_filters features.
    flattened_dim = prod(image_shape[1:2] .÷ 8) * max_num_filters
    return @compact(;
        linear = Dense(num_latent_dims, flattened_dim),
        upchain = Chain(
            Chain(
                Upsample(2),
                Conv((3, 3), max_num_filters => max_num_filters ÷ 2; stride = 1, pad = 1),
                BatchNorm(max_num_filters ÷ 2, leakyrelu)
            ),
            Chain(
                Upsample(2),
                Conv((3, 3), max_num_filters ÷ 2 => max_num_filters ÷ 4; stride = 1, pad = 1),
                BatchNorm(max_num_filters ÷ 4, leakyrelu)
            ),
            Chain(
                Upsample(2),
                Conv(
                    (3, 3), max_num_filters ÷ 4 => image_shape[3],
                    sigmoid; stride = 1, pad = 1
                )
            )
        ),
        max_num_filters
    ) do x
        y = linear(x)
        # Restore the spatial layout expected by the convolutional stages
        img = reshape(y, image_shape[1] ÷ 8, image_shape[2] ÷ 8, max_num_filters, :)
        @return upchain(img)
    end
end

# Container layer pairing an encoder with a decoder. The `(:encoder, :decoder)` type
# parameter of `AbstractLuxContainerLayer` names the two fields that hold sub-layers;
# the methods defined on `CVAE` index parameters and states by those same names.
@concrete struct CVAE <: Lux.AbstractLuxContainerLayer{(:encoder, :decoder)}
    # Layer producing `(z, μ, logσ²)` from an input batch
    encoder <: Lux.AbstractLuxLayer
    # Layer producing a reconstruction from a latent batch
    decoder <: Lux.AbstractLuxLayer
end

"""
    CVAE([rng]; num_latent_dims, image_shape, max_num_filters)

Construct a `CVAE` by building a decoder and an encoder from the same hyperparameters.
`rng` is forwarded to the encoder only.
"""
function CVAE(
        rng = Random.default_rng(); num_latent_dims::Int,
        image_shape::Dims{3}, max_num_filters::Int
    )
    # Both halves share the same hyperparameters, so bundle them once
    hyperparams = (; num_latent_dims, image_shape, max_num_filters)
    dec = cvae_decoder(; hyperparams...)
    enc = cvae_encoder(rng; hyperparams...)
    return CVAE(enc, dec)
end

"""
    (cvae::CVAE)(x, ps, st)

Run the full autoencoder: encode `x`, then decode the sampled latent.

Returns `((x_rec, μ, logσ²), st_new)` where `st_new` is a named tuple with the updated
`encoder` and `decoder` states.
"""
function (cvae::CVAE)(x, ps, st)
    enc_out, enc_state = cvae.encoder(x, ps.encoder, st.encoder)
    latent, mu, logvar = enc_out
    recon, dec_state = cvae.decoder(latent, ps.decoder, st.decoder)
    new_state = (; encoder = enc_state, decoder = dec_state)
    return (recon, mu, logvar), new_state
end

"""
    encode(cvae::CVAE, x, ps, st)

Run only the encoder on `x` and return the sampled latent `z` together with a state
named tuple in which the `encoder` entry is updated and the `decoder` entry is carried
over from `st`.
"""
function encode(cvae::CVAE, x, ps, st)
    enc_out, enc_state = cvae.encoder(x, ps.encoder, st.encoder)
    latent, _, _ = enc_out
    new_state = (; encoder = enc_state, decoder = st.decoder)
    return latent, new_state
end

"""
    decode(cvae::CVAE, z, ps, st)

Run only the decoder on the latent batch `z` and return the reconstruction together
with a state named tuple in which the `decoder` entry is updated and the `encoder`
entry is carried over from `st`.
"""
function decode(cvae::CVAE, z, ps, st)
    recon, dec_state = cvae.decoder(z, ps.decoder, st.decoder)
    # Field order (decoder first) is kept as in the original definition
    new_state = (; decoder = dec_state, encoder = st.encoder)
    return recon, new_state
end
decode (generic function with 1 method)

Loading MNIST

julia
# Thin dataset wrapper that applies a transform to images on access and reports a
# user-chosen number of samples as its length.
@concrete struct TensorDataset
    # Underlying dataset; indexed through `convert2image` in `getindex`
    dataset
    # Transform applied to every image in `getindex`
    transform
    # Value returned by `length`; may be smaller than the underlying dataset
    total_samples::Int
end

"""
    length(ds::TensorDataset)

Return the number of samples the dataset exposes (`ds.total_samples`).
"""
function Base.length(ds::TensorDataset)
    return ds.total_samples
end

"""
    getindex(ds::TensorDataset, idxs)

Load the images at `idxs`, apply `ds.transform` to each one, and stack the results into
a single array.

`idxs` must be a vector of integers or a range. Each slice along dimension 3 of
`convert2image(ds.dataset, idxs)` is wrapped in an `Image` item before the transform
is applied.
"""
function Base.getindex(ds::TensorDataset, idxs::Union{Vector{<:Integer}, AbstractRange})
    img = Image.(eachslice(convert2image(ds.dataset, idxs); dims = 3))
    # Per image: apply the transform, unwrap the item data, then take its parent array.
    # NOTE(review): `parent` presumably strips a wrapper array type — confirm.
    return stack(parent ∘ itemdata ∘ Base.Fix1(apply, ds.transform), img)
end

"""
    loadmnist(batchsize, image_size::Dims{2})

Return a shuffled `DataLoader` over the MNIST training split, with images rescaled to
`image_size` and converted to tensors. Incomplete trailing batches are dropped
(`partial = false`).

When the `CI` environment variable parses to `true`, only 1500 samples are exposed.
"""
function loadmnist(batchsize, image_size::Dims{2})
    mnist_train = MNIST(; split = :train)

    # Only 1500 samples for demonstration purposes on CI
    running_on_ci = parse(Bool, get(ENV, "CI", "false"))
    num_samples = running_on_ci ? 1500 : length(mnist_train)

    resize_to_tensor = ScaleKeepAspect(image_size) |> ImageToTensor()
    dataset = TensorDataset(mnist_train, resize_to_tensor, num_samples)

    return DataLoader(dataset; batchsize, shuffle = true, partial = false)
end
loadmnist (generic function with 1 method)

Helper Functions

Generate an Image Grid from a list of images

julia
"""
    create_image_grid(imgs::AbstractArray, grid_rows::Int, grid_cols::Int)

Convert the first `grid_rows * grid_cols` samples of a 4-D batch (samples along
dimension 4) into color views and tile them into a single image.

Samples with one channel become `Gray` images; otherwise the channel dimension is
moved to the front and viewed as `RGB`. Each image is transposed (via `'`) before
tiling.
"""
function create_image_grid(imgs::AbstractArray, grid_rows::Int, grid_cols::Int)
    num_tiles = grid_rows * grid_cols
    selected = imgs[:, :, :, 1:num_tiles]
    tiles = map(eachslice(selected; dims = 4)) do sample
        if size(sample, 3) == 1
            plane = colorview(Gray, view(sample, :, :, 1))
        else
            plane = colorview(RGB, permutedims(sample, (3, 1, 2)))
        end
        return plane'
    end
    # `tiles` is a vector, so this dispatches to the `Vector` method
    return create_image_grid(tiles, grid_rows, grid_cols)
end

"""
    create_image_grid(images::Vector, grid_rows::Int, grid_cols::Int)

Tile `images` row by row into a `grid_rows × grid_cols` canvas and return it.

`length(images)` must equal `grid_rows * grid_cols`. All images are assumed to have the
size of `images[1]`; the canvas is allocated with `similar(images[1], ...)`.
"""
function create_image_grid(images::Vector, grid_rows::Int, grid_cols::Int)
    # The number of images has to fill the grid exactly
    total_images = grid_rows * grid_cols
    @assert length(images) == total_images

    # Tile size is taken from the first image
    tile_h, tile_w = size(images[1])
    canvas = similar(images[1], tile_h * grid_rows, tile_w * grid_cols)

    for (k, tile) in enumerate(images)
        # Zero-based grid coordinates of the k-th tile (row-major order)
        r = div(k - 1, grid_cols)
        c = mod(k - 1, grid_cols)

        row_span = (r * tile_h + 1):(r * tile_h + tile_h)
        col_span = (c * tile_w + 1):(c * tile_w + tile_w)
        canvas[row_span, col_span] .= tile
    end

    return canvas
end

"""
    loss_function(model, ps, st, X)

Compute the VAE training loss for batch `X`: summed squared reconstruction error plus
the KL-divergence term `-sum(1 + logσ² - μ² - exp(logσ²)) / 2`.

Returns `(loss, st_new, stats)` where `stats` is a named tuple with the entries
`y`, `μ`, `logσ²`, `reconstruction_loss` and `kldiv_loss`.
"""
function loss_function(model, ps, st, X)
    outputs, new_st = model(X, ps, st)
    x_hat, mu, logvar = outputs

    sum_squared_error = MSELoss(; agg = sum)
    rec = sum_squared_error(x_hat, X)
    kl = -sum(1 .+ logvar .- mu .^ 2 .- exp.(logvar)) / 2

    stats = (;
        y = x_hat, μ = mu, logσ² = logvar,
        reconstruction_loss = rec, kldiv_loss = kl,
    )
    return rec + kl, new_st, stats
end

"""
    generate_images(model, ps, st; num_samples = 128, num_latent_dims, decode_compiled = nothing)

Sample `num_samples` standard-normal latent vectors, decode them in test mode, and
return the results tiled into a grid with 8 rows and `num_samples ÷ 8` columns.

When `decode_compiled` is given it is called instead of `decode`, and its output is
moved to the CPU before tiling.
"""
function generate_images(
        model, ps, st; num_samples::Int = 128, num_latent_dims::Int, decode_compiled = nothing
    )
    # Latents are placed on the same device as the parameters and states
    latents = randn(Float32, num_latent_dims, num_samples) |> get_device((ps, st))
    eval_state = Lux.testmode(st)

    if isnothing(decode_compiled)
        images, _ = decode(model, latents, ps, eval_state)
    else
        decoded, _ = decode_compiled(model, latents, ps, eval_state)
        images = cpu_device()(decoded)
    end

    return create_image_grid(images, 8, num_samples ÷ 8)
end

"""
    reconstruct_images(model, ps, st, X)

Run `model` on batch `X` in test mode, move the reconstructions to the CPU, and return
them tiled into a grid with 8 rows and `batch ÷ 8` columns, where `batch` is the size
of the last dimension of `X`.
"""
function reconstruct_images(model, ps, st, X)
    outputs, _ = model(X, ps, Lux.testmode(st))
    recon, _, _ = outputs
    recon_cpu = cpu_device()(recon)

    batch = size(X, ndims(X))
    return create_image_grid(recon_cpu, 8, batch ÷ 8)
end
reconstruct_images (generic function with 1 method)

Training the Model

julia
"""
    main(; batchsize, image_size, num_latent_dims, max_num_filters, seed, epochs,
           weight_decay, learning_rate, num_samples)

Train the CVAE on MNIST and return an image that stacks a grid of reconstructions on
top of a grid of freshly generated samples (separated by a blank row).

# Keywords
- `batchsize`: mini-batch size of the training loader and of the compiled forward pass.
- `image_size`: `(height, width)` the MNIST images are rescaled to.
- `num_latent_dims`, `max_num_filters`: model hyperparameters forwarded to `CVAE`.
- `seed`: seed for the `Xoshiro` generator used for model construction and setup.
- `epochs`: number of passes over the training loader.
- `weight_decay`, `learning_rate`: passed to `AdamW` as `lambda` and `eta`.
- `num_samples`: number of latent samples decoded for the generated grid
  (defaults to `batchsize`).

# Returns
The combined image built in the last epoch, or `nothing` if no epoch ran.
"""
function main(;
        batchsize = 128, image_size = (64, 64), num_latent_dims = 8, max_num_filters = 64,
        seed = 0, epochs = 50, weight_decay = 1.0e-5, learning_rate = 1.0e-3, num_samples = batchsize
    )
    # Dedicated, explicitly seeded generator for model construction and setup
    rng = Xoshiro()
    Random.seed!(rng, seed)

    # Single-channel images: image_shape is (height, width, 1)
    cvae = CVAE(rng; num_latent_dims, image_shape = (image_size..., 1), max_num_filters)
    ps, st = Lux.setup(rng, cvae) |> xdev

    # Compile the inference paths once, in test mode, using random inputs that have the
    # same shapes as the inputs used later for visualization
    z = randn(Float32, num_latent_dims, num_samples) |> xdev
    decode_compiled = @compile decode(cvae, z, ps, Lux.testmode(st))
    x = randn(Float32, image_size..., 1, batchsize) |> xdev
    cvae_compiled = @compile cvae(x, ps, Lux.testmode(st))

    train_dataloader = loadmnist(batchsize, image_size) |> xdev

    opt = AdamW(; eta = learning_rate, lambda = weight_decay)

    train_state = Training.TrainState(cvae, ps, st, opt)

    @printf "Total Trainable Parameters: %0.4f M\n" (Lux.parameterlength(ps) / 1.0e6)

    # Inside VS Code the image is rendered after every epoch, otherwise only at the end
    is_vscode = isdefined(Main, :VSCodeServer)
    empty_row, model_img_full = nothing, nothing

    for epoch in 1:epochs
        loss_total = 0.0f0
        total_samples = 0

        start_time = time()
        for (i, X) in enumerate(train_dataloader)
            # One optimization step; gradients are not returned
            (_, loss, _, train_state) = Training.single_train_step!(
                AutoEnzyme(), loss_function, X, train_state; return_gradients = Val(false)
            )

            loss_total += loss
            total_samples += size(X, ndims(X))

            # Progress line every 250 iterations and on the last iteration of the epoch
            if i % 250 == 0 || i == length(train_dataloader)
                throughput = total_samples / (time() - start_time)
                @printf "Epoch %d, Iter %d, Loss: %.7f, Throughput: %.6f im/s\n" epoch i loss throughput
            end
        end
        total_time = time() - start_time

        # Mean of the per-batch (summed) losses over the number of batches
        train_loss = loss_total / length(train_dataloader)
        throughput = total_samples / total_time
        @printf "Epoch %d, Train Loss: %.7f, Time: %.4fs, Throughput: %.6f im/s\n" epoch train_loss total_time throughput

        if is_vscode || epoch == epochs
            # Reconstructions of the first batch yielded by the loader
            recon_images = reconstruct_images(
                cvae_compiled, train_state.parameters, train_state.states,
                first(train_dataloader)
            )
            gen_images = generate_images(
                cvae, train_state.parameters, train_state.states;
                num_samples, num_latent_dims, decode_compiled
            )
            # Blank separator strip, allocated once and reused across epochs
            if empty_row === nothing
                empty_row = similar(gen_images, image_size[1], size(gen_images, 2))
                fill!(empty_row, 0)
            end
            # Reconstructions on top, separator in the middle, generated samples below
            model_img_full = vcat(recon_images, empty_row, gen_images)
            is_vscode && display(model_img_full)
        end
    end

    return model_img_full
end

# Run training with the default settings and keep the final visualization image
img = main()
2025-03-08 00:08:41.847601: I external/xla/xla/service/service.cc:152] XLA service 0x5576b30 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-03-08 00:08:41.847636: I external/xla/xla/service/service.cc:160]   StreamExecutor device (0): NVIDIA A100-PCIE-40GB MIG 1g.5gb, Compute Capability 8.0
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
I0000 00:00:1741392521.848387 4176427 se_gpu_pjrt_client.cc:951] Using BFC allocator.
I0000 00:00:1741392521.848444 4176427 gpu_helpers.cc:136] XLA backend allocating 3825205248 bytes on device 0 for BFCAllocator.
I0000 00:00:1741392521.848475 4176427 gpu_helpers.cc:177] XLA backend will use up to 1275068416 bytes on device 0 for CollectiveBFCAllocator.
I0000 00:00:1741392521.870534 4176427 cuda_dnn.cc:529] Loaded cuDNN version 90400
┌ Warning: `training` is set to `Val{false}()` but is being used within an autodiff call (gradient, jacobian, etc...). This might lead to incorrect results. If you are using a `Lux.jl` model, set it to training mode using `LuxCore.trainmode`.
└ @ LuxLib.Utils /var/lib/buildkite-agent/builds/gpuci-9/julialang/lux-dot-jl/lib/LuxLib/src/utils.jl:340
Total Trainable Parameters: 0.1493 M
Epoch 1, Iter 11, Loss: 64386.7617188, Throughput: 23.434239 im/s
Epoch 1, Train Loss: 77072.9687500, Time: 60.5095s, Throughput: 23.269063 im/s
Epoch 2, Iter 11, Loss: 52420.5234375, Throughput: 1594.318367 im/s
Epoch 2, Train Loss: 56733.8164062, Time: 0.8839s, Throughput: 1593.013986 im/s
Epoch 3, Iter 11, Loss: 49821.8554688, Throughput: 1608.497708 im/s
Epoch 3, Train Loss: 50596.9531250, Time: 0.8760s, Throughput: 1607.289446 im/s
Epoch 4, Iter 11, Loss: 47659.7031250, Throughput: 1514.517302 im/s
Epoch 4, Train Loss: 48978.7460938, Time: 0.9300s, Throughput: 1514.058345 im/s
Epoch 5, Iter 11, Loss: 51648.8515625, Throughput: 1581.370323 im/s
Epoch 5, Train Loss: 49399.5468750, Time: 0.8906s, Throughput: 1580.951217 im/s
Epoch 6, Iter 11, Loss: 52250.5117188, Throughput: 1632.745943 im/s
Epoch 6, Train Loss: 50262.3125000, Time: 0.8626s, Throughput: 1632.333004 im/s
Epoch 7, Iter 11, Loss: 51206.6054688, Throughput: 1613.372336 im/s
Epoch 7, Train Loss: 50919.3515625, Time: 0.8730s, Throughput: 1612.870021 im/s
Epoch 8, Iter 11, Loss: 49951.3085938, Throughput: 1704.600535 im/s
Epoch 8, Train Loss: 51013.2460938, Time: 0.8263s, Throughput: 1703.925751 im/s
Epoch 9, Iter 11, Loss: 51987.7500000, Throughput: 1688.599636 im/s
Epoch 9, Train Loss: 50359.8750000, Time: 0.8340s, Throughput: 1688.174373 im/s
Epoch 10, Iter 11, Loss: 49664.2656250, Throughput: 1849.385561 im/s
Epoch 10, Train Loss: 51119.7148438, Time: 0.7615s, Throughput: 1848.887624 im/s
Epoch 11, Iter 11, Loss: 50988.8828125, Throughput: 1845.255607 im/s
Epoch 11, Train Loss: 51947.9648438, Time: 0.7632s, Throughput: 1844.801383 im/s
Epoch 12, Iter 11, Loss: 50156.6132812, Throughput: 1800.465798 im/s
Epoch 12, Train Loss: 51636.4101562, Time: 0.7822s, Throughput: 1800.021285 im/s
Epoch 13, Iter 11, Loss: 51740.4335938, Throughput: 1718.916008 im/s
Epoch 13, Train Loss: 50858.4843750, Time: 0.8193s, Throughput: 1718.494343 im/s
Epoch 14, Iter 11, Loss: 49820.1093750, Throughput: 1704.183405 im/s
Epoch 14, Train Loss: 51009.9023438, Time: 0.8264s, Throughput: 1703.868233 im/s
Epoch 15, Iter 11, Loss: 52141.6914062, Throughput: 1761.900396 im/s
Epoch 15, Train Loss: 51267.0039062, Time: 0.7994s, Throughput: 1761.292424 im/s
Epoch 16, Iter 11, Loss: 55204.2695312, Throughput: 1735.441293 im/s
Epoch 16, Train Loss: 52438.0898438, Time: 0.8115s, Throughput: 1735.058379 im/s
Epoch 17, Iter 11, Loss: 54870.1406250, Throughput: 1734.788758 im/s
Epoch 17, Train Loss: 54244.1289062, Time: 0.8118s, Throughput: 1734.404094 im/s
Epoch 18, Iter 11, Loss: 55982.9140625, Throughput: 1730.833606 im/s
Epoch 18, Train Loss: 56001.7148438, Time: 0.8138s, Throughput: 1730.148541 im/s
Epoch 19, Iter 11, Loss: 56693.6835938, Throughput: 1760.466528 im/s
Epoch 19, Train Loss: 55950.4648438, Time: 0.8000s, Throughput: 1760.015316 im/s
Epoch 20, Iter 11, Loss: 58107.2500000, Throughput: 1767.327621 im/s
Epoch 20, Train Loss: 55843.8984375, Time: 0.7969s, Throughput: 1766.813153 im/s
Epoch 21, Iter 11, Loss: 55566.5000000, Throughput: 1840.171639 im/s
Epoch 21, Train Loss: 54761.5585938, Time: 0.7653s, Throughput: 1839.731376 im/s
Epoch 22, Iter 11, Loss: 56382.8984375, Throughput: 1833.581781 im/s
Epoch 22, Train Loss: 54614.9726562, Time: 0.7681s, Throughput: 1833.085490 im/s
Epoch 23, Iter 11, Loss: 56681.7890625, Throughput: 1805.268982 im/s
Epoch 23, Train Loss: 55683.2500000, Time: 0.7802s, Throughput: 1804.757562 im/s
Epoch 24, Iter 11, Loss: 56432.2343750, Throughput: 1754.335734 im/s
Epoch 24, Train Loss: 55764.7382812, Time: 0.8027s, Throughput: 1754.016848 im/s
Epoch 25, Iter 11, Loss: 53247.5898438, Throughput: 1703.998024 im/s
Epoch 25, Train Loss: 54181.7773438, Time: 0.8265s, Throughput: 1703.645568 im/s
Epoch 26, Iter 11, Loss: 47469.6796875, Throughput: 1765.722817 im/s
Epoch 26, Train Loss: 51152.9218750, Time: 0.7976s, Throughput: 1765.360199 im/s
Epoch 27, Iter 11, Loss: 48327.2148438, Throughput: 1727.719950 im/s
Epoch 27, Train Loss: 49432.1992188, Time: 0.8152s, Throughput: 1727.226260 im/s
Epoch 28, Iter 11, Loss: 49721.9570312, Throughput: 1758.092394 im/s
Epoch 28, Train Loss: 48417.6601562, Time: 0.8012s, Throughput: 1757.451482 im/s
Epoch 29, Iter 11, Loss: 46331.5351562, Throughput: 1745.317447 im/s
Epoch 29, Train Loss: 48186.9726562, Time: 0.8069s, Throughput: 1744.846128 im/s
Epoch 30, Iter 11, Loss: 47147.0937500, Throughput: 1749.585838 im/s
Epoch 30, Train Loss: 46883.9375000, Time: 0.8050s, Throughput: 1748.996693 im/s
Epoch 31, Iter 11, Loss: 49736.9101562, Throughput: 1745.257100 im/s
Epoch 31, Train Loss: 47081.7460938, Time: 0.8070s, Throughput: 1744.774988 im/s
Epoch 32, Iter 11, Loss: 48305.0273438, Throughput: 1760.350030 im/s
Epoch 32, Train Loss: 47621.1835938, Time: 0.8000s, Throughput: 1760.033150 im/s
Epoch 33, Iter 11, Loss: 49714.8984375, Throughput: 1736.280624 im/s
Epoch 33, Train Loss: 48431.6718750, Time: 0.8112s, Throughput: 1735.627457 im/s
Epoch 34, Iter 11, Loss: 46347.0312500, Throughput: 1722.136579 im/s
Epoch 34, Train Loss: 48329.2265625, Time: 0.8179s, Throughput: 1721.563764 im/s
Epoch 35, Iter 11, Loss: 47263.9765625, Throughput: 1744.173624 im/s
Epoch 35, Train Loss: 47460.2812500, Time: 0.8075s, Throughput: 1743.748230 im/s
Epoch 36, Iter 11, Loss: 46274.1132812, Throughput: 1722.199858 im/s
Epoch 36, Train Loss: 46490.9335938, Time: 0.8178s, Throughput: 1721.589360 im/s
Epoch 37, Iter 11, Loss: 45948.6289062, Throughput: 1729.336902 im/s
Epoch 37, Train Loss: 46343.1093750, Time: 0.8144s, Throughput: 1728.807873 im/s
Epoch 38, Iter 11, Loss: 46470.3164062, Throughput: 1772.821912 im/s
Epoch 38, Train Loss: 46693.6562500, Time: 0.7945s, Throughput: 1772.281902 im/s
Epoch 39, Iter 11, Loss: 45581.2890625, Throughput: 1771.974005 im/s
Epoch 39, Train Loss: 46991.7343750, Time: 0.7948s, Throughput: 1771.619444 im/s
Epoch 40, Iter 11, Loss: 46127.8867188, Throughput: 1708.839470 im/s
Epoch 40, Train Loss: 46248.5898438, Time: 0.8241s, Throughput: 1708.445468 im/s
Epoch 41, Iter 11, Loss: 47099.3125000, Throughput: 1743.556202 im/s
Epoch 41, Train Loss: 46182.4140625, Time: 0.8077s, Throughput: 1743.171757 im/s
Epoch 42, Iter 11, Loss: 44627.6875000, Throughput: 1710.458909 im/s
Epoch 42, Train Loss: 46270.8281250, Time: 0.8234s, Throughput: 1710.080501 im/s
Epoch 43, Iter 11, Loss: 44242.1718750, Throughput: 1780.275052 im/s
Epoch 43, Train Loss: 45986.6640625, Time: 0.7911s, Throughput: 1779.712258 im/s
Epoch 44, Iter 11, Loss: 44870.5000000, Throughput: 1756.217539 im/s
Epoch 44, Train Loss: 45815.6718750, Time: 0.8020s, Throughput: 1755.654714 im/s
Epoch 45, Iter 11, Loss: 47405.1289062, Throughput: 1772.652159 im/s
Epoch 45, Train Loss: 45554.2539062, Time: 0.7945s, Throughput: 1772.268606 im/s
Epoch 46, Iter 11, Loss: 46983.6406250, Throughput: 1718.762924 im/s
Epoch 46, Train Loss: 46297.9882812, Time: 0.8194s, Throughput: 1718.267840 im/s
Epoch 47, Iter 11, Loss: 46006.6328125, Throughput: 1798.763569 im/s
Epoch 47, Train Loss: 46349.9843750, Time: 0.7830s, Throughput: 1798.221331 im/s
Epoch 48, Iter 11, Loss: 48529.2773438, Throughput: 1726.463288 im/s
Epoch 48, Train Loss: 47938.6875000, Time: 0.8158s, Throughput: 1726.016725 im/s
Epoch 49, Iter 11, Loss: 51657.4531250, Throughput: 1683.280730 im/s
Epoch 49, Train Loss: 49743.6210938, Time: 0.8367s, Throughput: 1682.873967 im/s
Epoch 50, Iter 11, Loss: 46286.2031250, Throughput: 1801.709406 im/s
Epoch 50, Train Loss: 49036.3164062, Time: 0.7816s, Throughput: 1801.342847 im/s

Appendix

julia
# Print Julia and platform details for reproducibility
using InteractiveUtils
InteractiveUtils.versioninfo()

# Backend details are printed only when MLDataDevices is in scope
if @isdefined(MLDataDevices)
    # CUDA details, only if CUDA is loaded and reported functional
    if @isdefined(CUDA) && MLDataDevices.functional(CUDADevice)
        println()
        CUDA.versioninfo()
    end

    # AMDGPU details, only if AMDGPU is loaded and reported functional
    if @isdefined(AMDGPU) && MLDataDevices.functional(AMDGPUDevice)
        println()
        AMDGPU.versioninfo()
    end
end
Julia Version 1.11.3
Commit d63adeda50d (2025-01-21 19:42 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 48 × AMD EPYC 7402 24-Core Processor
  WORD_SIZE: 64
  LLVM: libLLVM-16.0.6 (ORCJIT, znver2)
Threads: 48 default, 0 interactive, 24 GC (on 2 virtual cores)
Environment:
  JULIA_CPU_THREADS = 2
  JULIA_DEPOT_PATH = /root/.cache/julia-buildkite-plugin/depots/01872db4-8c79-43af-ab7d-12abac4f24f6
  LD_LIBRARY_PATH = /usr/local/nvidia/lib:/usr/local/nvidia/lib64
  JULIA_PKG_SERVER = 
  JULIA_NUM_THREADS = 48
  JULIA_CUDA_HARD_MEMORY_LIMIT = 100%
  JULIA_PKG_PRECOMPILE_AUTO = 0
  JULIA_DEBUG = Literate

This page was generated using Literate.jl.