Convolutional VAE for MNIST
Convolutional variational autoencoder (CVAE) implementation in Lux using MNIST. This is based on the CVAE implementation in MLX.
julia
using Lux,
Reactant,
MLDatasets,
Random,
Statistics,
Enzyme,
MLUtils,
DataAugmentation,
ConcreteStructs,
OneHotArrays,
ImageShow,
Images,
Printf,
Optimisers
# Device used below to move parameters, states, sample inputs, and the data loader
# before compilation and training.
# NOTE(review): `force=true` presumably makes this fail when Reactant is unavailable
# instead of falling back to another device — confirm against the device docs.
const xdev = reactant_device(; force=true)
# Host (CPU) device handle.
const cdev = cpu_device()
const IN_VSCODE = isdefined(Main, :VSCodeServer)
false
Model Definition
First we will define the encoder. It maps the input to a normal distribution in latent space and samples a latent vector from that distribution.
julia
"""
    cvae_encoder([rng]; num_latent_dims, image_shape, max_num_filters)

Build the encoder half of the convolutional VAE.

The returned layer embeds an input batch with three stride-2 convolution blocks,
projects the flattened features to a mean `μ` and a log-variance `logσ²` (clamped to
`[-20, 10]`), and draws a latent sample `z = μ + σ ⋅ ϵ` with the reparameterization
trick. Calling the layer yields `(z, μ, logσ²)`.

# Arguments
- `rng`: random number generator stored in the layer and used to draw `ϵ`.

# Keywords
- `num_latent_dims::Int`: size of the latent vector.
- `image_shape::Dims{3}`: input shape; the third entry is used as the channel count.
  The first two entries must be multiples of 8.
- `max_num_filters::Int`: channel count of the last convolution; the earlier
  convolutions use a quarter and a half of it.

# Throws
- `ArgumentError` if either of the first two entries of `image_shape` is not a
  multiple of 8.
"""
function cvae_encoder(
    rng=Random.default_rng();
    num_latent_dims::Int,
    image_shape::Dims{3},
    max_num_filters::Int,
)
    # Each 3×3, stride-2, pad-1 convolution maps an extent n to cld(n, 2), so the stack
    # yields cld(n, 8) while `flattened_dim` below uses n ÷ 8. The two only agree for
    # multiples of 8; reject other sizes here rather than failing inside the Dense layers.
    all(iszero, image_shape[1:2] .% 8) || throw(
        ArgumentError(
            "the first two entries of image_shape must be multiples of 8, " *
            "got $(image_shape[1:2])",
        ),
    )
    flattened_dim = prod(image_shape[1:2] .÷ 8) * max_num_filters
    return @compact(;
        embed=Chain(
            Chain(
                Conv((3, 3), image_shape[3] => max_num_filters ÷ 4; stride=2, pad=1),
                BatchNorm(max_num_filters ÷ 4, leakyrelu),
            ),
            Chain(
                Conv((3, 3), max_num_filters ÷ 4 => max_num_filters ÷ 2; stride=2, pad=1),
                BatchNorm(max_num_filters ÷ 2, leakyrelu),
            ),
            Chain(
                Conv((3, 3), max_num_filters ÷ 2 => max_num_filters; stride=2, pad=1),
                BatchNorm(max_num_filters, leakyrelu),
            ),
            FlattenLayer(),
        ),
        proj_mu=Dense(flattened_dim, num_latent_dims; init_bias=zeros32),
        proj_log_var=Dense(flattened_dim, num_latent_dims; init_bias=zeros32),
        rng
    ) do x
        y = embed(x)
        μ = proj_mu(y)
        logσ² = proj_log_var(y)
        T = eltype(logσ²)
        # Bound the log-variance so that exp() below stays finite.
        logσ² = clamp.(logσ², -T(20.0f0), T(10.0f0))
        σ = exp.(logσ² .* T(0.5))
        # Generate a tensor of random values from a normal distribution.
        # NOTE(review): the stored `rng` is replicated on every call; whether successive
        # calls therefore draw identical noise depends on how the stored rng is updated
        # — confirm.
        ϵ = randn_like(Lux.replicate(rng), σ)
        # Reparameterization trick to backpropagate through sampling
        z = ϵ .* σ .+ μ
        @return z, μ, logσ²
    end
end
Similarly we define the decoder.
julia
"""
    cvae_decoder(; num_latent_dims, image_shape, max_num_filters)

Build the decoder half of the convolutional VAE.

The returned layer projects a latent vector with a dense layer, reshapes it to a
feature map of size `(image_shape[1] ÷ 8, image_shape[2] ÷ 8, max_num_filters, batch)`,
and applies three upsample-by-2 + convolution stages. The final convolution has
`image_shape[3]` output channels and a `sigmoid` activation.
"""
function cvae_decoder(; num_latent_dims::Int, image_shape::Dims{3}, max_num_filters::Int)
    latent_h = image_shape[1] ÷ 8
    latent_w = image_shape[2] ÷ 8
    hidden_dim = latent_h * latent_w * max_num_filters
    half_filters = max_num_filters ÷ 2
    quarter_filters = max_num_filters ÷ 4
    return @compact(;
        linear=Dense(num_latent_dims, hidden_dim),
        upchain=Chain(
            Chain(
                Upsample(2),
                Conv((3, 3), max_num_filters => half_filters; stride=1, pad=1),
                BatchNorm(half_filters, leakyrelu),
            ),
            Chain(
                Upsample(2),
                Conv((3, 3), half_filters => quarter_filters; stride=1, pad=1),
                BatchNorm(quarter_filters, leakyrelu),
            ),
            Chain(
                Upsample(2),
                Conv(
                    (3, 3), quarter_filters => image_shape[3], sigmoid; stride=1, pad=1
                ),
            ),
        ),
        max_num_filters
    ) do x
        features = linear(x)
        # Turn the flat feature vector back into a (height, width, channels, batch) map.
        feature_map = reshape(features, latent_h, latent_w, max_num_filters, :)
        @return upchain(feature_map)
    end
end
# Container layer pairing the VAE encoder and decoder. The `(:encoder, :decoder)` tuple
# names the fields that hold sub-layers, so parameters and states are accessed as
# `ps.encoder` / `ps.decoder` and `st.encoder` / `st.decoder` in the methods below.
@concrete struct CVAE <: AbstractLuxContainerLayer{(:encoder, :decoder)}
# Layer whose output is destructured as `(z, μ, logσ²)` by the methods below.
encoder <: AbstractLuxLayer
# Layer applied to the latent sample `z` to produce the reconstruction.
decoder <: AbstractLuxLayer
end
# Keyword constructor: builds the encoder (which stores `rng` for sampling) and the
# decoder from one shared set of hyperparameters and wraps them in a `CVAE`.
function CVAE(
    rng=Random.default_rng();
    num_latent_dims::Int,
    image_shape::Dims{3},
    max_num_filters::Int,
)
    hyperparams = (; num_latent_dims, image_shape, max_num_filters)
    dec = cvae_decoder(; hyperparams...)
    enc = cvae_encoder(rng; hyperparams...)
    return CVAE(enc, dec)
end
# Full forward pass: encode `x`, decode the sampled latent, and return
# `(reconstruction, μ, logσ²)` together with the updated encoder/decoder states.
function (cvae::CVAE)(x, ps, st)
    enc_out, enc_state = cvae.encoder(x, ps.encoder, st.encoder)
    latent, mean, logvar = enc_out
    recon, dec_state = cvae.decoder(latent, ps.decoder, st.decoder)
    updated_state = (; encoder=enc_state, decoder=dec_state)
    return (recon, mean, logvar), updated_state
end
# Run only the encoder and return the sampled latent `z`. The decoder state is passed
# through untouched.
function encode(cvae::CVAE, x, ps, st)
    enc_out, enc_state = cvae.encoder(x, ps.encoder, st.encoder)
    latent = first(enc_out)
    return latent, (; encoder=enc_state, decoder=st.decoder)
end
# Run only the decoder on latent vectors `z` and return the reconstruction. The encoder
# state is passed through untouched. The returned state keeps the `(encoder, decoder)`
# field order used by `encode` and by the full forward pass, so the state layout does
# not depend on which entry point produced it.
function decode(cvae::CVAE, z, ps, st)
    x_rec, st_dec = cvae.decoder(z, ps.decoder, st.decoder)
    return x_rec, (; st.encoder, decoder=st_dec)
end
Loading MNIST
julia
# Lazily transformed view of an image dataset: batches are converted to images and run
# through `transform` when indexed (see `Base.getindex` for this type).
@concrete struct TensorDataset
# Underlying dataset; passed to `convert2image` together with the requested indices.
dataset
# Transform applied to every image via `apply`.
transform
# Number of samples reported by `length`; may be smaller than the underlying dataset.
total_samples::Int
end
# The dataset length is the configured sample count, not the size of the wrapped data.
function Base.length(ds::TensorDataset)
    return ds.total_samples
end
# Fetch the samples at `idxs`, wrap each one as an `Image` item, apply the dataset
# transform, and stack the resulting arrays along a new trailing dimension.
function Base.getindex(ds::TensorDataset, idxs::Union{Vector{<:Integer},AbstractRange})
    raw_images = convert2image(ds.dataset, idxs)
    items = map(Image, eachslice(raw_images; dims=3))
    transformed(item) = parent(itemdata(apply(ds.transform, item)))
    return stack(transformed, items)
end
# Build a shuffled training `DataLoader` over MNIST with images scaled by
# `ScaleKeepAspect(image_size)` and converted to tensors. Incomplete trailing batches
# are dropped (`partial=false`).
function loadmnist(batchsize, image_size::Dims{2})
    mnist_train = MNIST(; split=:train)
    # When the CI environment variable parses to `true`, cap the dataset length at 5000
    # samples for demonstration purposes; otherwise use the whole training split.
    running_on_ci = parse(Bool, get(ENV, "CI", "false"))
    num_used = running_on_ci ? 5000 : length(mnist_train)
    pipeline = ScaleKeepAspect(image_size) |> ImageToTensor()
    dataset = TensorDataset(mnist_train, pipeline, num_used)
    return DataLoader(dataset; batchsize, shuffle=true, partial=false)
end
Helper Functions
Generate an Image Grid from a list of images
julia
# Array front end for the grid builder: takes the first `grid_rows * grid_cols` slices
# along dimension 4, views each one as a `Gray` image (one entry in dimension 3) or an
# `RGB` image (otherwise), transposes it, and hands the tiles to the vector method.
function create_image_grid(imgs::AbstractArray, grid_rows::Int, grid_cols::Int)
    num_tiles = grid_rows * grid_cols
    selected = imgs[:, :, :, 1:num_tiles]
    function as_tile(img)
        pixels = if size(img, 3) == 1
            colorview(Gray, view(img, :, :, 1))
        else
            colorview(RGB, permutedims(img, (3, 1, 2)))
        end
        return pixels'
    end
    tiles = map(as_tile, eachslice(selected; dims=4))
    return create_image_grid(tiles, grid_rows, grid_cols)
end
"""
    create_image_grid(images::Vector, grid_rows::Int, grid_cols::Int)

Tile `images` into a single `grid_rows × grid_cols` canvas.

Images are placed in row-major order: the first `grid_cols` images form the top row,
the next `grid_cols` the second row, and so on. Every image is assumed to have the same
size as `images[1]`; the canvas is allocated with `similar(images[1], ...)`.

# Throws
- `ArgumentError` if `length(images) != grid_rows * grid_cols`.
"""
function create_image_grid(images::Vector, grid_rows::Int, grid_cols::Int)
    # Validate with an explicit exception: `@assert` may be disabled and must not guard
    # input, because a mismatched count would leave parts of the canvas uninitialized.
    total_images = grid_rows * grid_cols
    length(images) == total_images || throw(
        ArgumentError(
            "expected $(total_images) images for a $(grid_rows)×$(grid_cols) grid, " *
            "got $(length(images))",
        ),
    )
    # Get the size of a single image (assuming all images are the same size)
    img_height, img_width = size(first(images))
    # Create a blank grid canvas; every cell is overwritten by the loop below.
    grid_canvas = similar(first(images), img_height * grid_rows, img_width * grid_cols)
    # Place each image in the correct position on the canvas
    for (idx, image) in enumerate(images)
        # 1-based (row, col) of this tile in row-major order.
        row, col = fldmod1(idx, grid_cols)
        row_range = ((row - 1) * img_height + 1):(row * img_height)
        col_range = ((col - 1) * img_width + 1):(col * img_width)
        grid_canvas[row_range, col_range] .= image
    end
    return grid_canvas
end
# VAE training objective: summed squared reconstruction error plus the KL divergence
# term computed from `μ` and `logσ²`. Returns `(loss, updated_state, stats)` where
# `stats` carries the model outputs and both loss components.
function loss_function(model, ps, st, X)
    outputs, new_st = model(X, ps, st)
    y, μ, logσ² = outputs
    reconstruction_loss = MSELoss(; agg=sum)(y, X)
    kl_terms = @. 1 + logσ² - μ^2 - exp(logσ²)
    kldiv_loss = -sum(kl_terms) / 2
    total_loss = reconstruction_loss + kldiv_loss
    stats = (; y, μ, logσ², reconstruction_loss, kldiv_loss)
    return total_loss, new_st, stats
end
# Sample `num_samples` latent vectors from a standard normal distribution, decode them
# in test mode, and arrange the results in a grid with 8 rows.
#
# Keywords:
# - `num_samples`: number of latent vectors drawn; `num_samples ÷ 8` grid columns are
#   produced, so any remainder beyond a multiple of 8 is not shown.
# - `num_latent_dims`: length of each latent vector.
# - `decode_compiled`: optional precompiled stand-in for `decode`; when `nothing`, the
#   plain `decode` is called.
function generate_images(
    model, ps, st; num_samples::Int=128, num_latent_dims::Int, decode_compiled=nothing
)
    # Draw the latents on the host, then move them to wherever the parameters live.
    z = get_device((ps, st))(randn(Float32, num_latent_dims, num_samples))
    test_state = Lux.testmode(st)
    images, _ = if decode_compiled === nothing
        decode(model, z, ps, test_state)
    else
        decode_compiled(model, z, ps, test_state)
    end
    # Always bring the result back to the host before building the grid, in both
    # branches, matching `reconstruct_images`; the grid builder slices and indexes the
    # array element-wise.
    images = cpu_device()(images)
    return create_image_grid(images, 8, num_samples ÷ 8)
end
# Run `model` on the batch `X` in test mode, move the reconstruction to the host, and
# arrange it in a grid with 8 rows and `batch ÷ 8` columns, where `batch` is the size
# of the last dimension of `X`.
function reconstruct_images(model, ps, st, X)
    outputs, _ = model(X, ps, Lux.testmode(st))
    recon_host = cpu_device()(first(outputs))
    batch = size(X, ndims(X))
    return create_image_grid(recon_host, 8, batch ÷ 8)
end
reconstruct_images (generic function with 1 method)
Training the Model
julia
"""
    main(; kwargs...)

Train the convolutional VAE on MNIST and return a preview image.

The model, its parameters, and the data loader are moved to `xdev`; the decoder and the
full forward pass are compiled once up front in test mode and reused when rendering.

# Keywords
- `batchsize=128`: batch size of the training loader and of the compiled forward pass.
- `image_size=(64, 64)`: spatial size images are scaled to.
- `num_latent_dims=8`: size of the latent vector.
- `max_num_filters=64`: widest convolution in the encoder/decoder.
- `seed=0`: seed for the `Xoshiro` generator used to build and initialize the model.
- `epochs=50`: number of passes over the training loader.
- `weight_decay=1.0e-5`: passed to `AdamW` as `lambda`.
- `learning_rate=1.0e-3`: passed to `AdamW` as `eta`.
- `num_samples=batchsize`: number of images generated from random latents.

# Returns
The vertical concatenation of reconstructed images, a blank spacer row, and generated
images from the last epoch that rendered them, or `nothing` if no epoch ran.
"""
function main(;
    batchsize=128,
    image_size=(64, 64),
    num_latent_dims=8,
    max_num_filters=64,
    seed=0,
    epochs=50,
    weight_decay=1.0e-5,
    learning_rate=1.0e-3,
    num_samples=batchsize,
)
    rng = Xoshiro()
    Random.seed!(rng, seed)

    cvae = CVAE(rng; num_latent_dims, image_shape=(image_size..., 1), max_num_filters)
    ps, st = xdev(Lux.setup(rng, cvae))

    # Compile the inference entry points once, using dummy inputs of the right shape.
    z_example = xdev(randn(Float32, num_latent_dims, num_samples))
    decode_compiled = @compile decode(cvae, z_example, ps, Lux.testmode(st))
    x_example = xdev(randn(Float32, image_size..., 1, batchsize))
    cvae_compiled = @compile cvae(x_example, ps, Lux.testmode(st))

    train_dataloader = xdev(loadmnist(batchsize, image_size))
    optimizer = AdamW(; eta=learning_rate, lambda=weight_decay)
    train_state = Training.TrainState(cvae, ps, st, optimizer)

    @printf("Total Trainable Parameters: %0.4f M\n", Lux.parameterlength(ps) / 1.0e6)

    spacer = nothing
    model_img_full = nothing

    for epoch in 1:epochs
        epoch_loss = 0.0f0
        samples_seen = 0
        epoch_start = time()
        num_batches = length(train_dataloader)

        for (iter, batch) in enumerate(train_dataloader)
            _, loss, _, train_state = Training.single_train_step!(
                AutoEnzyme(), loss_function, batch, train_state; return_gradients=Val(false)
            )
            epoch_loss += loss
            samples_seen += size(batch, ndims(batch))

            # Progress line every 250 iterations and on the last batch of the epoch.
            if iter % 250 == 0 || iter == num_batches
                running_rate = samples_seen / (time() - epoch_start)
                @printf(
                    "Epoch %d, Iter %d, Loss: %.7f, Throughput: %.6f im/s\n",
                    epoch,
                    iter,
                    loss,
                    running_rate
                )
            end
        end

        elapsed = time() - epoch_start
        mean_loss = epoch_loss / num_batches
        throughput = samples_seen / elapsed
        @printf(
            "Epoch %d, Train Loss: %.7f, Time: %.4fs, Throughput: %.6f im/s\n",
            epoch,
            mean_loss,
            elapsed,
            throughput
        )

        # Render previews every epoch inside VS Code, otherwise only after the last one.
        (IN_VSCODE || epoch == epochs) || continue

        recon_images = reconstruct_images(
            cvae_compiled,
            train_state.parameters,
            train_state.states,
            first(train_dataloader),
        )
        gen_images = generate_images(
            cvae,
            train_state.parameters,
            train_state.states;
            num_samples,
            num_latent_dims,
            decode_compiled,
        )

        # Blank band separating reconstructions from generated samples; built once.
        if isnothing(spacer)
            spacer = fill!(similar(gen_images, image_size[1], size(gen_images, 2)), 0)
        end

        model_img_full = vcat(recon_images, spacer, gen_images)
        if IN_VSCODE
            display(model_img_full)
        end
    end

    return model_img_full
end
img = main()
Total Trainable Parameters: 0.1493 M
Epoch 1, Iter 39, Loss: 23580.5664062, Throughput: 5.191230 im/s
Epoch 1, Train Loss: 39646.6875000, Time: 961.9297s, Throughput: 5.189569 im/s
Epoch 2, Iter 39, Loss: 16947.2929688, Throughput: 84.391311 im/s
Epoch 2, Train Loss: 20315.6464844, Time: 59.1532s, Throughput: 84.391005 im/s
Epoch 3, Iter 39, Loss: 16068.3222656, Throughput: 84.532179 im/s
Epoch 3, Train Loss: 16682.0390625, Time: 59.0546s, Throughput: 84.531917 im/s
Epoch 4, Iter 39, Loss: 14498.4794922, Throughput: 85.804170 im/s
Epoch 4, Train Loss: 15026.4130859, Time: 58.1792s, Throughput: 85.803903 im/s
Epoch 5, Iter 39, Loss: 13397.9208984, Throughput: 84.987472 im/s
Epoch 5, Train Loss: 14048.5400391, Time: 58.7383s, Throughput: 84.987207 im/s
Epoch 6, Iter 39, Loss: 12758.6347656, Throughput: 85.815470 im/s
Epoch 6, Train Loss: 13509.0126953, Time: 58.1715s, Throughput: 85.815183 im/s
Epoch 7, Iter 39, Loss: 13207.2763672, Throughput: 85.199384 im/s
Epoch 7, Train Loss: 12994.4414062, Time: 58.5922s, Throughput: 85.199093 im/s
Epoch 8, Iter 39, Loss: 13123.3212891, Throughput: 84.442288 im/s
Epoch 8, Train Loss: 12492.6093750, Time: 59.1175s, Throughput: 84.442007 im/s
Epoch 9, Iter 39, Loss: 13128.2460938, Throughput: 83.773611 im/s
Epoch 9, Train Loss: 12281.6816406, Time: 59.5894s, Throughput: 83.773318 im/s
Epoch 10, Iter 39, Loss: 11021.0830078, Throughput: 84.397939 im/s
Epoch 10, Train Loss: 11980.7978516, Time: 59.1486s, Throughput: 84.397643 im/s
Epoch 11, Iter 39, Loss: 11356.9062500, Throughput: 83.807375 im/s
Epoch 11, Train Loss: 11703.3720703, Time: 59.5653s, Throughput: 83.807120 im/s
Epoch 12, Iter 39, Loss: 11677.4609375, Throughput: 82.962554 im/s
Epoch 12, Train Loss: 11600.3193359, Time: 60.1719s, Throughput: 82.962287 im/s
Epoch 13, Iter 39, Loss: 11433.3583984, Throughput: 84.526207 im/s
Epoch 13, Train Loss: 11390.8593750, Time: 59.0588s, Throughput: 84.525950 im/s
Epoch 14, Iter 39, Loss: 10662.7060547, Throughput: 83.740000 im/s
Epoch 14, Train Loss: 11209.8759766, Time: 59.6133s, Throughput: 83.739725 im/s
Epoch 15, Iter 39, Loss: 11059.0244141, Throughput: 85.665222 im/s
Epoch 15, Train Loss: 11074.5849609, Time: 58.2735s, Throughput: 85.664950 im/s
Epoch 16, Iter 39, Loss: 11043.1679688, Throughput: 85.947571 im/s
Epoch 16, Train Loss: 10991.9980469, Time: 58.0821s, Throughput: 85.947303 im/s
Epoch 17, Iter 39, Loss: 11595.5771484, Throughput: 85.410708 im/s
Epoch 17, Train Loss: 10917.2480469, Time: 58.4472s, Throughput: 85.410400 im/s
Epoch 18, Iter 39, Loss: 11368.8691406, Throughput: 85.713191 im/s
Epoch 18, Train Loss: 10722.2265625, Time: 58.2409s, Throughput: 85.712892 im/s
Epoch 19, Iter 39, Loss: 10695.8779297, Throughput: 85.109969 im/s
Epoch 19, Train Loss: 10692.6044922, Time: 58.6537s, Throughput: 85.109697 im/s
Epoch 20, Iter 39, Loss: 10607.6513672, Throughput: 85.711069 im/s
Epoch 20, Train Loss: 10572.3173828, Time: 58.2424s, Throughput: 85.710805 im/s
Epoch 21, Iter 39, Loss: 11003.8183594, Throughput: 85.646845 im/s
Epoch 21, Train Loss: 10479.9462891, Time: 58.2860s, Throughput: 85.646566 im/s
Epoch 22, Iter 39, Loss: 10131.0146484, Throughput: 86.032574 im/s
Epoch 22, Train Loss: 10388.8476562, Time: 58.0248s, Throughput: 86.032239 im/s
Epoch 23, Iter 39, Loss: 10646.8359375, Throughput: 85.614168 im/s
Epoch 23, Train Loss: 10334.3828125, Time: 58.3083s, Throughput: 85.613913 im/s
Epoch 24, Iter 39, Loss: 9776.9560547, Throughput: 85.838777 im/s
Epoch 24, Train Loss: 10299.3056641, Time: 58.1558s, Throughput: 85.838452 im/s
Epoch 25, Iter 39, Loss: 10127.6777344, Throughput: 85.860199 im/s
Epoch 25, Train Loss: 10164.5126953, Time: 58.1412s, Throughput: 85.859936 im/s
Epoch 26, Iter 39, Loss: 10167.2773438, Throughput: 85.153270 im/s
Epoch 26, Train Loss: 10249.1855469, Time: 58.6239s, Throughput: 85.153023 im/s
Epoch 27, Iter 39, Loss: 9915.9970703, Throughput: 84.878928 im/s
Epoch 27, Train Loss: 10143.7451172, Time: 58.8134s, Throughput: 84.878678 im/s
Epoch 28, Iter 39, Loss: 10024.7851562, Throughput: 85.460664 im/s
Epoch 28, Train Loss: 10099.4042969, Time: 58.4130s, Throughput: 85.460380 im/s
Epoch 29, Iter 39, Loss: 9819.0781250, Throughput: 85.193111 im/s
Epoch 29, Train Loss: 10049.7568359, Time: 58.5965s, Throughput: 85.192851 im/s
Epoch 30, Iter 39, Loss: 9764.6464844, Throughput: 84.894058 im/s
Epoch 30, Train Loss: 9918.1445312, Time: 58.8029s, Throughput: 84.893800 im/s
Epoch 31, Iter 39, Loss: 9319.4843750, Throughput: 85.055332 im/s
Epoch 31, Train Loss: 9845.6582031, Time: 58.6914s, Throughput: 85.055064 im/s
Epoch 32, Iter 39, Loss: 10244.7998047, Throughput: 85.784590 im/s
Epoch 32, Train Loss: 9919.8095703, Time: 58.1924s, Throughput: 85.784339 im/s
Epoch 33, Iter 39, Loss: 9330.6445312, Throughput: 85.447664 im/s
Epoch 33, Train Loss: 9818.6093750, Time: 58.4219s, Throughput: 85.447377 im/s
Epoch 34, Iter 39, Loss: 9743.9140625, Throughput: 85.922613 im/s
Epoch 34, Train Loss: 9832.1777344, Time: 58.0990s, Throughput: 85.922332 im/s
Epoch 35, Iter 39, Loss: 9626.2656250, Throughput: 86.236367 im/s
Epoch 35, Train Loss: 9710.0195312, Time: 57.8876s, Throughput: 86.236090 im/s
Epoch 36, Iter 39, Loss: 9015.6660156, Throughput: 85.656520 im/s
Epoch 36, Train Loss: 9671.4433594, Time: 58.2794s, Throughput: 85.656269 im/s
Epoch 37, Iter 39, Loss: 9330.9306641, Throughput: 85.372005 im/s
Epoch 37, Train Loss: 9692.7187500, Time: 58.4737s, Throughput: 85.371752 im/s
Epoch 38, Iter 39, Loss: 10622.1601562, Throughput: 85.704825 im/s
Epoch 38, Train Loss: 9637.9365234, Time: 58.2466s, Throughput: 85.704567 im/s
Epoch 39, Iter 39, Loss: 10266.1777344, Throughput: 85.565011 im/s
Epoch 39, Train Loss: 9553.8417969, Time: 58.3418s, Throughput: 85.564734 im/s
Epoch 40, Iter 39, Loss: 9361.8515625, Throughput: 85.186481 im/s
Epoch 40, Train Loss: 9538.7246094, Time: 58.6010s, Throughput: 85.186235 im/s
Epoch 41, Iter 39, Loss: 10219.7402344, Throughput: 85.406814 im/s
Epoch 41, Train Loss: 9539.0517578, Time: 58.4498s, Throughput: 85.406568 im/s
Epoch 42, Iter 39, Loss: 9825.5507812, Throughput: 85.105672 im/s
Epoch 42, Train Loss: 9530.7294922, Time: 58.6567s, Throughput: 85.105373 im/s
Epoch 43, Iter 39, Loss: 9977.4863281, Throughput: 85.188596 im/s
Epoch 43, Train Loss: 9535.4912109, Time: 58.5996s, Throughput: 85.188349 im/s
Epoch 44, Iter 39, Loss: 9230.9472656, Throughput: 86.193022 im/s
Epoch 44, Train Loss: 9495.9550781, Time: 57.9167s, Throughput: 86.192751 im/s
Epoch 45, Iter 39, Loss: 9255.7480469, Throughput: 85.307881 im/s
Epoch 45, Train Loss: 9376.3574219, Time: 58.5176s, Throughput: 85.307597 im/s
Epoch 46, Iter 39, Loss: 9634.3281250, Throughput: 85.836257 im/s
Epoch 46, Train Loss: 9351.8017578, Time: 58.1574s, Throughput: 85.835989 im/s
Epoch 47, Iter 39, Loss: 8818.4697266, Throughput: 85.273892 im/s
Epoch 47, Train Loss: 9332.3994141, Time: 58.5410s, Throughput: 85.273630 im/s
Epoch 48, Iter 39, Loss: 9293.5585938, Throughput: 85.388429 im/s
Epoch 48, Train Loss: 9320.0683594, Time: 58.4625s, Throughput: 85.388137 im/s
Epoch 49, Iter 39, Loss: 9132.1181641, Throughput: 85.825214 im/s
Epoch 49, Train Loss: 9304.1044922, Time: 58.1649s, Throughput: 85.824922 im/s
Epoch 50, Iter 39, Loss: 9634.7763672, Throughput: 85.961179 im/s
Epoch 50, Train Loss: 9267.6445312, Time: 58.0729s, Throughput: 85.960907 im/s
Appendix
julia
# Report the Julia and platform configuration used to build this page.
using InteractiveUtils
InteractiveUtils.versioninfo()

# Print GPU backend details only when the device package and the backend are both
# loaded and the backend reports itself as functional.
if @isdefined(MLDataDevices) &&
    @isdefined(CUDA) &&
    MLDataDevices.functional(CUDADevice)
    println()
    CUDA.versioninfo()
end
if @isdefined(MLDataDevices) &&
    @isdefined(AMDGPU) &&
    MLDataDevices.functional(AMDGPUDevice)
    println()
    AMDGPU.versioninfo()
end
Julia Version 1.12.5
Commit 5fe89b8ddc1 (2026-02-09 16:05 UTC)
Build Info:
Official https://julialang.org release
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 4 × AMD EPYC 9V74 80-Core Processor
WORD_SIZE: 64
LLVM: libLLVM-18.1.7 (ORCJIT, znver4)
GC: Built with stock GC
Threads: 4 default, 1 interactive, 4 GC (on 4 virtual cores)
Environment:
JULIA_DEBUG = Literate
LD_LIBRARY_PATH =
JULIA_NUM_THREADS = 4
JULIA_CPU_HARD_MEMORY_LIMIT = 100%
JULIA_PKG_PRECOMPILE_AUTO = 0
This page was generated using Literate.jl.