Convolutional VAE for MNIST
Convolutional variational autoencoder (CVAE) implementation in Lux.jl, trained on MNIST. This is based on the CVAE implementation in MLX.
julia
using Lux,
Reactant,
MLDatasets,
Random,
Statistics,
Enzyme,
MLUtils,
DataAugmentation,
ConcreteStructs,
OneHotArrays,
ImageShow,
Images,
Printf,
Optimisers
const xdev = reactant_device(; force=true)
const cdev = cpu_device()
const IN_VSCODE = isdefined(Main, :VSCodeServer)
false
Model Definition
First, we define the encoder. It maps the input to a normal distribution in latent space and samples a latent vector from that distribution.
julia
function cvae_encoder(
rng=Random.default_rng();
num_latent_dims::Int,
image_shape::Dims{3},
max_num_filters::Int,
)
flattened_dim = prod(image_shape[1:2] .÷ 8) * max_num_filters
return @compact(;
embed=Chain(
Chain(
Conv((3, 3), image_shape[3] => max_num_filters ÷ 4; stride=2, pad=1),
BatchNorm(max_num_filters ÷ 4, leakyrelu),
),
Chain(
Conv((3, 3), max_num_filters ÷ 4 => max_num_filters ÷ 2; stride=2, pad=1),
BatchNorm(max_num_filters ÷ 2, leakyrelu),
),
Chain(
Conv((3, 3), max_num_filters ÷ 2 => max_num_filters; stride=2, pad=1),
BatchNorm(max_num_filters, leakyrelu),
),
FlattenLayer(),
),
proj_mu=Dense(flattened_dim, num_latent_dims; init_bias=zeros32),
proj_log_var=Dense(flattened_dim, num_latent_dims; init_bias=zeros32),
rng
) do x
y = embed(x)
μ = proj_mu(y)
logσ² = proj_log_var(y)
T = eltype(logσ²)
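# Clamp the log-variance for numerical stability before exponentiating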
logσ² = clamp.(logσ², -T(20.0f0), T(10.0f0))
σ = exp.(logσ² .* T(0.5))
# Generate a tensor of random values from a normal distribution
ϵ = randn_like(Lux.replicate(rng), σ)
# Reparameterization trick to backpropagate through sampling
z = ϵ .* σ .+ μ
@return z, μ, logσ²
end
end
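The `z = ϵ .* σ .+ μ` line is the reparameterization trick: instead of sampling `z` directly from `N(μ, σ²)`, we sample parameter-free noise `ϵ ~ N(0, I)` and shift and scale it, so gradients can flow through `μ` and `σ`. A minimal standalone sketch with illustrative values:
julia
using Random

rng = Xoshiro(0)
μ = [0.5f0, -1.0f0]         # mean produced by the encoder
logσ² = [-0.2f0, 0.4f0]     # log-variance produced by the encoder
σ = exp.(logσ² .* 0.5f0)    # standard deviation
ϵ = randn(rng, Float32, 2)  # noise from N(0, I); carries no parameters
z = μ .+ σ .* ϵ             # differentiable with respect to μ and σ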
Similarly, we define the decoder, which maps a latent vector back to image space.
julia
function cvae_decoder(; num_latent_dims::Int, image_shape::Dims{3}, max_num_filters::Int)
flattened_dim = prod(image_shape[1:2] .÷ 8) * max_num_filters
return @compact(;
linear=Dense(num_latent_dims, flattened_dim),
upchain=Chain(
Chain(
Upsample(2),
Conv((3, 3), max_num_filters => max_num_filters ÷ 2; stride=1, pad=1),
BatchNorm(max_num_filters ÷ 2, leakyrelu),
),
Chain(
Upsample(2),
Conv((3, 3), max_num_filters ÷ 2 => max_num_filters ÷ 4; stride=1, pad=1),
BatchNorm(max_num_filters ÷ 4, leakyrelu),
),
Chain(
Upsample(2),
Conv(
(3, 3), max_num_filters ÷ 4 => image_shape[3], sigmoid; stride=1, pad=1
),
),
),
max_num_filters
) do x
y = linear(x)
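# Project the latent vector to a (W/8, H/8, max_num_filters) feature map, then upsample back to full resolution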
img = reshape(y, image_shape[1] ÷ 8, image_shape[2] ÷ 8, max_num_filters, :)
@return upchain(img)
end
end
@concrete struct CVAE <: AbstractLuxContainerLayer{(:encoder, :decoder)}
encoder <: AbstractLuxLayer
decoder <: AbstractLuxLayer
end
function CVAE(
rng=Random.default_rng();
num_latent_dims::Int,
image_shape::Dims{3},
max_num_filters::Int,
)
decoder = cvae_decoder(; num_latent_dims, image_shape, max_num_filters)
encoder = cvae_encoder(rng; num_latent_dims, image_shape, max_num_filters)
return CVAE(encoder, decoder)
end
function (cvae::CVAE)(x, ps, st)
(z, μ, logσ²), st_enc = cvae.encoder(x, ps.encoder, st.encoder)
x_rec, st_dec = cvae.decoder(z, ps.decoder, st.decoder)
return (x_rec, μ, logσ²), (; encoder=st_enc, decoder=st_dec)
end
function encode(cvae::CVAE, x, ps, st)
(z, _, _), st_enc = cvae.encoder(x, ps.encoder, st.encoder)
return z, (; encoder=st_enc, st.decoder)
end
function decode(cvae::CVAE, z, ps, st)
x_rec, st_dec = cvae.decoder(z, ps.decoder, st.decoder)
return x_rec, (; decoder=st_dec, st.encoder)
end
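Before loading data, a quick sanity check of the full model on the CPU. This is a hedged sketch; shapes assume 64×64 grayscale inputs and the constructor defined above:
julia
using Lux, Random

rng = Xoshiro(0)
model = CVAE(rng; num_latent_dims=8, image_shape=(64, 64, 1), max_num_filters=64)
ps, st = Lux.setup(rng, model)
x = randn(Float32, 64, 64, 1, 4)  # batch of 4 dummy images
(x_rec, μ, logσ²), _ = model(x, ps, st)
size(x_rec)  # (64, 64, 1, 4): reconstruction matches the input shape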
Loading MNIST
julia
@concrete struct TensorDataset
dataset
transform
total_samples::Int
end
Base.length(ds::TensorDataset) = ds.total_samples
function Base.getindex(ds::TensorDataset, idxs::Union{Vector{<:Integer},AbstractRange})
img = Image.(eachslice(convert2image(ds.dataset, idxs); dims=3))
return stack(parent ∘ itemdata ∘ Base.Fix1(apply, ds.transform), img)
end
function loadmnist(batchsize, image_size::Dims{2})
# Load MNIST: use only 5000 images on CI for demonstration purposes
train_dataset = MNIST(; split=:train)
N = parse(Bool, get(ENV, "CI", "false")) ? 5000 : length(train_dataset)
train_transform = ScaleKeepAspect(image_size) |> ImageToTensor()
trainset = TensorDataset(train_dataset, train_transform, N)
trainloader = DataLoader(trainset; batchsize, shuffle=true, partial=false)
return trainloader
end
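A hedged usage sketch of the loader; outside CI this iterates the full training split, and each batch follows the WHCN layout Lux expects:
julia
dl = loadmnist(128, (64, 64))
X = first(dl)
size(X)  # expected (64, 64, 1, 128): two spatial dims, one channel, batch of 128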
Helper Functions
Generate an image grid from a list of images:
julia
function create_image_grid(imgs::AbstractArray, grid_rows::Int, grid_cols::Int)
total_images = grid_rows * grid_cols
imgs = map(eachslice(imgs[:, :, :, 1:total_images]; dims=4)) do img
cimg = if size(img, 3) == 1
colorview(Gray, view(img, :, :, 1))
else
colorview(RGB, permutedims(img, (3, 1, 2)))
end
return cimg'
end
return create_image_grid(imgs, grid_rows, grid_cols)
end
function create_image_grid(images::Vector, grid_rows::Int, grid_cols::Int)
# Check if the number of images matches the grid
total_images = grid_rows * grid_cols
@assert length(images) == total_images
# Get the size of a single image (assuming all images are the same size)
img_height, img_width = size(images[1])
# Create a blank grid canvas
grid_height = img_height * grid_rows
grid_width = img_width * grid_cols
grid_canvas = similar(images[1], grid_height, grid_width)
# Place each image in the correct position on the canvas
for idx in 1:total_images
row = div(idx - 1, grid_cols) + 1
col = mod(idx - 1, grid_cols) + 1
start_row = (row - 1) * img_height + 1
start_col = (col - 1) * img_width + 1
grid_canvas[start_row:(start_row + img_height - 1), start_col:(start_col + img_width - 1)] .= images[idx]
end
return grid_canvas
end
function loss_function(model, ps, st, X)
(y, μ, logσ²), st = model(X, ps, st)
reconstruction_loss = MSELoss(; agg=sum)(y, X)
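# Closed-form KL divergence between N(μ, diag(σ²)) and the standard normal prior: -(1/2) Σ (1 + logσ² - μ² - σ²)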
kldiv_loss = -sum(1 .+ logσ² .- μ .^ 2 .- exp.(logσ²)) / 2
loss = reconstruction_loss + kldiv_loss
return loss, st, (; y, μ, logσ², reconstruction_loss, kldiv_loss)
end
function generate_images(
model, ps, st; num_samples::Int=128, num_latent_dims::Int, decode_compiled=nothing
)
z = get_device((ps, st))(randn(Float32, num_latent_dims, num_samples))
if decode_compiled === nothing
images, _ = decode(model, z, ps, Lux.testmode(st))
else
images, _ = decode_compiled(model, z, ps, Lux.testmode(st))
images = cpu_device()(images)
end
return create_image_grid(images, 8, num_samples ÷ 8)
end
function reconstruct_images(model, ps, st, X)
(recon, _, _), _ = model(X, ps, Lux.testmode(st))
recon = cpu_device()(recon)
return create_image_grid(recon, 8, size(X, ndims(X)) ÷ 8)
end
reconstruct_images (generic function with 1 method)
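As a hedged end-to-end check of these helpers, the following sketch samples a small grid from the prior on the CPU. It reuses the `CVAE` constructor from above; `num_samples` must be a multiple of 8:
julia
using Lux, Random

rng = Xoshiro(0)
model = CVAE(rng; num_latent_dims=8, image_shape=(64, 64, 1), max_num_filters=64)
ps, st = Lux.setup(rng, model)
grid = generate_images(model, ps, st; num_samples=16, num_latent_dims=8)
size(grid)  # (512, 128): an 8 × 2 grid of 64 × 64 samples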
Training the Model
julia
function main(;
batchsize=128,
image_size=(64, 64),
num_latent_dims=8,
max_num_filters=64,
seed=0,
epochs=50,
weight_decay=1.0e-5,
learning_rate=1.0e-3,
num_samples=batchsize,
)
rng = Xoshiro()
Random.seed!(rng, seed)
cvae = CVAE(rng; num_latent_dims, image_shape=(image_size..., 1), max_num_filters)
ps, st = xdev(Lux.setup(rng, cvae))
z = xdev(randn(Float32, num_latent_dims, num_samples))
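# Ahead-of-time compile `decode` with Reactant; PrecisionConfig.HIGH requests full-precision matmuls/convolutions (e.g. no TF32 downcasting)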
decode_compiled = Reactant.with_config(;
dot_general_precision=PrecisionConfig.HIGH,
convolution_precision=PrecisionConfig.HIGH,
) do
@compile decode(cvae, z, ps, Lux.testmode(st))
end
x = xdev(randn(Float32, image_size..., 1, batchsize))
cvae_compiled = Reactant.with_config(;
dot_general_precision=PrecisionConfig.HIGH,
convolution_precision=PrecisionConfig.HIGH,
) do
@compile cvae(x, ps, Lux.testmode(st))
end
train_dataloader = xdev(loadmnist(batchsize, image_size))
opt = AdamW(; eta=learning_rate, lambda=weight_decay)
train_state = Training.TrainState(cvae, ps, st, opt)
@printf "Total Trainable Parameters: %0.4f M\n" (Lux.parameterlength(ps) / 1.0e6)
empty_row, model_img_full = nothing, nothing
for epoch in 1:epochs
loss_total = 0.0f0
total_samples = 0
start_time = time()
for (i, X) in enumerate(train_dataloader)
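# One optimization step: Enzyme computes the gradients and the optimizer updates in place; return_gradients=Val(false) avoids materializing them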
(_, loss, _, train_state) = Training.single_train_step!(
AutoEnzyme(), loss_function, X, train_state; return_gradients=Val(false)
)
loss_total += loss
total_samples += size(X, ndims(X))
if i % 250 == 0 || i == length(train_dataloader)
throughput = total_samples / (time() - start_time)
@printf "Epoch %d, Iter %d, Loss: %.7f, Throughput: %.6f im/s\n" epoch i loss throughput
end
end
total_time = time() - start_time
train_loss = loss_total / length(train_dataloader)
throughput = total_samples / total_time
@printf "Epoch %d, Train Loss: %.7f, Time: %.4fs, Throughput: %.6f im/s\n" epoch train_loss total_time throughput
if IN_VSCODE || epoch == epochs
recon_images = reconstruct_images(
cvae_compiled,
train_state.parameters,
train_state.states,
first(train_dataloader),
)
gen_images = generate_images(
cvae,
train_state.parameters,
train_state.states;
num_samples,
num_latent_dims,
decode_compiled,
)
if empty_row === nothing
empty_row = similar(gen_images, image_size[1], size(gen_images, 2))
fill!(empty_row, 0)
end
model_img_full = vcat(recon_images, empty_row, gen_images)
IN_VSCODE && display(model_img_full)
end
end
return model_img_full
end
img = main()
AssertionError("Could not find registered platform with name: \"cuda\". Available platform names are: ")
┌ Warning: `training` is set to `Val{false}()` but is being used within an autodiff call (gradient, jacobian, etc...). This might lead to incorrect results. If you are using a `Lux.jl` model, set it to training mode using `LuxCore.trainmode`.
└ @ LuxLib.Utils /var/lib/buildkite-agent/builds/gpuci-12/julialang/lux-dot-jl/lib/LuxLib/src/utils.jl:344
Total Trainable Parameters: 0.1493 M
Epoch 1, Iter 39, Loss: 23915.1093750, Throughput: 44.067011 im/s
Epoch 1, Train Loss: 39871.0273438, Time: 113.7450s, Throughput: 43.887658 im/s
Epoch 2, Iter 39, Loss: 17550.3808594, Throughput: 371.533179 im/s
Epoch 2, Train Loss: 20061.7617188, Time: 13.4366s, Throughput: 371.522282 im/s
Epoch 3, Iter 39, Loss: 15722.3710938, Throughput: 394.577250 im/s
Epoch 3, Train Loss: 16542.9570312, Time: 12.6520s, Throughput: 394.562595 im/s
Epoch 4, Iter 39, Loss: 14781.2968750, Throughput: 393.271215 im/s
Epoch 4, Train Loss: 15091.8798828, Time: 12.6938s, Throughput: 393.262447 im/s
Epoch 5, Iter 39, Loss: 13806.8847656, Throughput: 387.441429 im/s
Epoch 5, Train Loss: 14026.0625000, Time: 12.8849s, Throughput: 387.431055 im/s
Epoch 6, Iter 39, Loss: 13807.2373047, Throughput: 412.344493 im/s
Epoch 6, Train Loss: 13424.1728516, Time: 12.1069s, Throughput: 412.327838 im/s
Epoch 7, Iter 39, Loss: 12251.4707031, Throughput: 403.792181 im/s
Epoch 7, Train Loss: 12885.7919922, Time: 12.3631s, Throughput: 403.782151 im/s
Epoch 8, Iter 39, Loss: 12963.1953125, Throughput: 375.296669 im/s
Epoch 8, Train Loss: 12745.5263672, Time: 13.3018s, Throughput: 375.286768 im/s
Epoch 9, Iter 39, Loss: 12629.7060547, Throughput: 354.422894 im/s
Epoch 9, Train Loss: 12348.6455078, Time: 14.0852s, Throughput: 354.414135 im/s
Epoch 10, Iter 39, Loss: 11848.5732422, Throughput: 353.795071 im/s
Epoch 10, Train Loss: 11977.1464844, Time: 14.1102s, Throughput: 353.787623 im/s
Epoch 11, Iter 39, Loss: 11422.3164062, Throughput: 352.388152 im/s
Epoch 11, Train Loss: 11740.1845703, Time: 14.1665s, Throughput: 352.380614 im/s
Epoch 12, Iter 39, Loss: 11388.6113281, Throughput: 354.586105 im/s
Epoch 12, Train Loss: 11576.1083984, Time: 14.0786s, Throughput: 354.579806 im/s
Epoch 13, Iter 39, Loss: 11147.5156250, Throughput: 362.797311 im/s
Epoch 13, Train Loss: 11415.6269531, Time: 13.7600s, Throughput: 362.789931 im/s
Epoch 14, Iter 39, Loss: 11376.8916016, Throughput: 362.379178 im/s
Epoch 14, Train Loss: 11255.0576172, Time: 13.7759s, Throughput: 362.371049 im/s
Epoch 15, Iter 39, Loss: 11148.1025391, Throughput: 361.750613 im/s
Epoch 15, Train Loss: 11142.6884766, Time: 13.7999s, Throughput: 361.742513 im/s
Epoch 16, Iter 39, Loss: 11242.5888672, Throughput: 361.209169 im/s
Epoch 16, Train Loss: 11024.5156250, Time: 13.8205s, Throughput: 361.201411 im/s
Epoch 17, Iter 39, Loss: 10905.4921875, Throughput: 362.380018 im/s
Epoch 17, Train Loss: 11020.8945312, Time: 13.7759s, Throughput: 362.371413 im/s
Epoch 18, Iter 39, Loss: 10679.6113281, Throughput: 343.420465 im/s
Epoch 18, Train Loss: 10799.8457031, Time: 14.5364s, Throughput: 343.412619 im/s
Epoch 19, Iter 39, Loss: 10396.4902344, Throughput: 358.622389 im/s
Epoch 19, Train Loss: 10673.1132812, Time: 13.9202s, Throughput: 358.615897 im/s
Epoch 20, Iter 39, Loss: 10924.0566406, Throughput: 358.110737 im/s
Epoch 20, Train Loss: 10603.5078125, Time: 13.9401s, Throughput: 358.102879 im/s
Epoch 21, Iter 39, Loss: 9915.4101562, Throughput: 352.238393 im/s
Epoch 21, Train Loss: 10539.0429688, Time: 14.1725s, Throughput: 352.231034 im/s
Epoch 22, Iter 39, Loss: 10744.7138672, Throughput: 351.606989 im/s
Epoch 22, Train Loss: 10466.8046875, Time: 14.1980s, Throughput: 351.597826 im/s
Epoch 23, Iter 39, Loss: 9903.0117188, Throughput: 345.774473 im/s
Epoch 23, Train Loss: 10359.5058594, Time: 14.4374s, Throughput: 345.767981 im/s
Epoch 24, Iter 39, Loss: 10299.8925781, Throughput: 362.224381 im/s
Epoch 24, Train Loss: 10368.6015625, Time: 13.7818s, Throughput: 362.215996 im/s
Epoch 25, Iter 39, Loss: 10385.0810547, Throughput: 355.467293 im/s
Epoch 25, Train Loss: 10324.5068359, Time: 14.0438s, Throughput: 355.459828 im/s
Epoch 26, Iter 39, Loss: 10327.2050781, Throughput: 355.705841 im/s
Epoch 26, Train Loss: 10187.0966797, Time: 14.0343s, Throughput: 355.699200 im/s
Epoch 27, Iter 39, Loss: 10393.1474609, Throughput: 357.046422 im/s
Epoch 27, Train Loss: 10119.4736328, Time: 13.9816s, Throughput: 357.039481 im/s
Epoch 28, Iter 39, Loss: 10210.0771484, Throughput: 354.247720 im/s
Epoch 28, Train Loss: 10020.3144531, Time: 14.0926s, Throughput: 354.228560 im/s
Epoch 29, Iter 39, Loss: 10357.9521484, Throughput: 354.284673 im/s
Epoch 29, Train Loss: 10129.2294922, Time: 14.0907s, Throughput: 354.277180 im/s
Epoch 30, Iter 39, Loss: 9800.4775391, Throughput: 352.405292 im/s
Epoch 30, Train Loss: 10013.2080078, Time: 14.1658s, Throughput: 352.397956 im/s
Epoch 31, Iter 39, Loss: 10175.2050781, Throughput: 353.275965 im/s
Epoch 31, Train Loss: 9948.7207031, Time: 14.1309s, Throughput: 353.267394 im/s
Epoch 32, Iter 39, Loss: 9972.2363281, Throughput: 336.388229 im/s
Epoch 32, Train Loss: 9889.9082031, Time: 14.8403s, Throughput: 336.380863 im/s
Epoch 33, Iter 39, Loss: 9972.9804688, Throughput: 339.464251 im/s
Epoch 33, Train Loss: 9967.7255859, Time: 14.7058s, Throughput: 339.458065 im/s
Epoch 34, Iter 39, Loss: 10264.1347656, Throughput: 347.229033 im/s
Epoch 34, Train Loss: 9888.2519531, Time: 14.3770s, Throughput: 347.221208 im/s
Epoch 35, Iter 39, Loss: 10051.7685547, Throughput: 351.591804 im/s
Epoch 35, Train Loss: 9901.2812500, Time: 14.1986s, Throughput: 351.584625 im/s
Epoch 36, Iter 39, Loss: 10326.6474609, Throughput: 352.847950 im/s
Epoch 36, Train Loss: 9700.7792969, Time: 14.1480s, Throughput: 352.840915 im/s
Epoch 37, Iter 39, Loss: 9933.6210938, Throughput: 356.120022 im/s
Epoch 37, Train Loss: 9673.5693359, Time: 14.0181s, Throughput: 356.111082 im/s
Epoch 38, Iter 39, Loss: 9656.7900391, Throughput: 354.039969 im/s
Epoch 38, Train Loss: 9644.3769531, Time: 14.1005s, Throughput: 354.031205 im/s
Epoch 39, Iter 39, Loss: 9576.4814453, Throughput: 355.147994 im/s
Epoch 39, Train Loss: 9624.7353516, Time: 14.0564s, Throughput: 355.140037 im/s
Epoch 40, Iter 39, Loss: 9044.5156250, Throughput: 352.539486 im/s
Epoch 40, Train Loss: 9560.3720703, Time: 14.1604s, Throughput: 352.532043 im/s
Epoch 41, Iter 39, Loss: 9580.3486328, Throughput: 354.345027 im/s
Epoch 41, Train Loss: 9536.5634766, Time: 14.0882s, Throughput: 354.338892 im/s
Epoch 42, Iter 39, Loss: 9540.9677734, Throughput: 355.857045 im/s
Epoch 42, Train Loss: 9526.1591797, Time: 14.0284s, Throughput: 355.848421 im/s
Epoch 43, Iter 39, Loss: 8775.1640625, Throughput: 346.257528 im/s
Epoch 43, Train Loss: 9610.4794922, Time: 14.4174s, Throughput: 346.249483 im/s
Epoch 44, Iter 39, Loss: 9698.1445312, Throughput: 347.185609 im/s
Epoch 44, Train Loss: 9444.7275391, Time: 14.3791s, Throughput: 347.170204 im/s
Epoch 45, Iter 39, Loss: 9706.8105469, Throughput: 351.471157 im/s
Epoch 45, Train Loss: 9409.7734375, Time: 14.2035s, Throughput: 351.463410 im/s
Epoch 46, Iter 39, Loss: 9618.8105469, Throughput: 344.354424 im/s
Epoch 46, Train Loss: 9469.3857422, Time: 14.4969s, Throughput: 344.348319 im/s
Epoch 47, Iter 39, Loss: 9530.3007812, Throughput: 345.775935 im/s
Epoch 47, Train Loss: 9453.2812500, Time: 14.4374s, Throughput: 345.767787 im/s
Epoch 48, Iter 39, Loss: 9254.7382812, Throughput: 347.601444 im/s
Epoch 48, Train Loss: 9344.7333984, Time: 14.3626s, Throughput: 347.569114 im/s
Epoch 49, Iter 39, Loss: 9703.0449219, Throughput: 356.170563 im/s
Epoch 49, Train Loss: 9339.4833984, Time: 14.0161s, Throughput: 356.162050 im/s
Epoch 50, Iter 39, Loss: 9485.5742188, Throughput: 351.615929 im/s
Epoch 50, Train Loss: 9365.2187500, Time: 14.1976s, Throughput: 351.607881 im/s
Appendix
julia
using InteractiveUtils
InteractiveUtils.versioninfo()
if @isdefined(MLDataDevices)
if @isdefined(CUDA) && MLDataDevices.functional(CUDADevice)
println()
CUDA.versioninfo()
end
if @isdefined(AMDGPU) && MLDataDevices.functional(AMDGPUDevice)
println()
AMDGPU.versioninfo()
end
end
Julia Version 1.11.6
Commit 9615af0f269 (2025-07-09 12:58 UTC)
Build Info:
Official https://julialang.org/ release
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 48 × AMD EPYC 7402 24-Core Processor
WORD_SIZE: 64
LLVM: libLLVM-16.0.6 (ORCJIT, znver2)
Threads: 48 default, 0 interactive, 24 GC (on 2 virtual cores)
Environment:
JULIA_CPU_THREADS = 2
LD_LIBRARY_PATH = /usr/local/nvidia/lib:/usr/local/nvidia/lib64
JULIA_PKG_SERVER =
JULIA_NUM_THREADS = 48
JULIA_CUDA_HARD_MEMORY_LIMIT = 100%
JULIA_PKG_PRECOMPILE_AUTO = 0
JULIA_DEBUG = Literate
JULIA_DEPOT_PATH = /root/.cache/julia-buildkite-plugin/depots/01872db4-8c79-43af-ab7d-12abac4f24f6
This page was generated using Literate.jl.