Convolutional VAE for MNIST
Convolutional variational autoencoder (CVAE) implementation in Lux using MNIST. This is based on the CVAE implementation in MLX.
julia
using Lux,
Reactant,
MLDatasets,
Random,
Statistics,
Enzyme,
MLUtils,
DataAugmentation,
ConcreteStructs,
OneHotArrays,
ImageShow,
Images,
Printf,
Optimisers
# Device handles shared by the rest of the tutorial.
# `xdev`: device returned by `reactant_device`. NOTE(review): `force=true` presumably makes
# this error instead of falling back when Reactant is not functional — confirm against the
# MLDataDevices documentation.
const xdev = reactant_device(; force=true)
# `cdev`: CPU device handle.
const cdev = cpu_device()
# `true` when `Main.VSCodeServer` is defined, i.e. when running inside the VS Code Julia
# extension; `main` uses it to decide whether to display images after every epoch.
const IN_VSCODE = isdefined(Main, :VSCodeServer)
false
Model Definition
First we will define the encoder. It maps the input to a normal distribution in latent space and samples a latent vector from that distribution.
julia
"""
    cvae_encoder([rng]; num_latent_dims, image_shape, max_num_filters)

Build the CVAE encoder: three stride-2 convolution stages followed by two dense heads that
produce the mean and log-variance of a diagonal Gaussian over the latent space.

# Arguments
- `rng`: random number generator captured by the layer; a `Lux.replicate` copy of it is
  used to draw the reparameterization noise inside the forward pass.

# Keywords
- `num_latent_dims`: length of the latent vector.
- `image_shape`: two spatial sizes followed by the number of input channels.
- `max_num_filters`: output channels of the last convolution; the earlier stages use
  `max_num_filters ÷ 4` and `max_num_filters ÷ 2`.

# Returns
A layer mapping a batch `x` to `(z, μ, logσ²)` with `z = ϵ .* σ .+ μ`, where
`σ = exp.(logσ² ./ 2)` and `logσ²` is clamped to `[-20, 10]`.
"""
function cvae_encoder(
    rng=Random.default_rng();
    num_latent_dims::Int,
    image_shape::Dims{3},
    max_num_filters::Int,
)
    # One `Conv((3, 3); stride=2, pad=1)` maps a spatial length `n` to
    # `(n + 2 * 1 - 3) ÷ 2 + 1 == (n - 1) ÷ 2 + 1`. Using the real output size (instead of
    # `n ÷ 8`) keeps the dense heads consistent with `embed` when a spatial size is not a
    # multiple of 8; for multiples of 8 both formulas give the same value.
    conv_out(n) = (n - 1) ÷ 2 + 1
    spatial_dims = map(conv_out ∘ conv_out ∘ conv_out, image_shape[1:2])
    flattened_dim = prod(spatial_dims) * max_num_filters
    return @compact(;
        embed=Chain(
            Chain(
                Conv((3, 3), image_shape[3] => max_num_filters ÷ 4; stride=2, pad=1),
                BatchNorm(max_num_filters ÷ 4, leakyrelu),
            ),
            Chain(
                Conv((3, 3), max_num_filters ÷ 4 => max_num_filters ÷ 2; stride=2, pad=1),
                BatchNorm(max_num_filters ÷ 2, leakyrelu),
            ),
            Chain(
                Conv((3, 3), max_num_filters ÷ 2 => max_num_filters; stride=2, pad=1),
                BatchNorm(max_num_filters, leakyrelu),
            ),
            FlattenLayer(),
        ),
        proj_mu=Dense(flattened_dim, num_latent_dims; init_bias=zeros32),
        proj_log_var=Dense(flattened_dim, num_latent_dims; init_bias=zeros32),
        rng
    ) do x
        y = embed(x)

        μ = proj_mu(y)
        logσ² = proj_log_var(y)

        # Clamp the log-variance so that `exp` below can neither overflow nor vanish.
        T = eltype(logσ²)
        logσ² = clamp.(logσ², -T(20.0f0), T(10.0f0))
        σ = exp.(logσ² .* T(0.5))

        # Generate a tensor of random values from a normal distribution
        ϵ = randn_like(Lux.replicate(rng), σ)

        # Reparameterization trick to backpropagate through sampling
        z = ϵ .* σ .+ μ

        @return z, μ, logσ²
    end
end
Similarly we define the decoder.
julia
"""
    cvae_decoder(; num_latent_dims, image_shape, max_num_filters)

Build the CVAE decoder. A dense layer expands the latent vector, the result is reshaped to
a feature map of size `image_shape[1] ÷ 8` by `image_shape[2] ÷ 8` with `max_num_filters`
channels, and three `Upsample(2)` + convolution stages produce the output image. The final
convolution uses a `sigmoid` activation and `image_shape[3]` output channels.
"""
function cvae_decoder(; num_latent_dims::Int, image_shape::Dims{3}, max_num_filters::Int)
    # One intermediate stage: double the spatial size, convolve, then normalize/activate.
    upstage(channels::Pair{Int,Int}) = Chain(
        Upsample(2),
        Conv((3, 3), channels; stride=1, pad=1),
        BatchNorm(last(channels), leakyrelu),
    )

    half_filters = max_num_filters ÷ 2
    quarter_filters = max_num_filters ÷ 4
    coarse_dims = image_shape[1:2] .÷ 8
    flattened_dim = prod(coarse_dims) * max_num_filters

    return @compact(;
        linear=Dense(num_latent_dims, flattened_dim),
        upchain=Chain(
            upstage(max_num_filters => half_filters),
            upstage(half_filters => quarter_filters),
            Chain(
                Upsample(2),
                Conv((3, 3), quarter_filters => image_shape[3], sigmoid; stride=1, pad=1),
            ),
        ),
        max_num_filters
    ) do latent
        flat = linear(latent)
        # Restore the spatial layout expected by the convolutional stages.
        feature_map = reshape(
            flat, image_shape[1] ÷ 8, image_shape[2] ÷ 8, max_num_filters, :
        )
        @return upchain(feature_map)
    end
end
# Container layer bundling the encoder and decoder of the convolutional VAE. The
# `AbstractLuxContainerLayer{(:encoder, :decoder)}` supertype names the two fields holding
# sub-layers; the methods defined for `CVAE` read the matching entries `ps.encoder` /
# `ps.decoder` and `st.encoder` / `st.decoder`.
@concrete struct CVAE <: AbstractLuxContainerLayer{(:encoder, :decoder)}
# Layer called on an input batch; its output is destructured as `(z, μ, logσ²)`.
encoder <: AbstractLuxLayer
# Layer called on latent vectors `z`; its output is used as the reconstruction.
decoder <: AbstractLuxLayer
end
"""
    CVAE([rng]; num_latent_dims, image_shape, max_num_filters)

Assemble a `CVAE` from `cvae_encoder` and `cvae_decoder`, forwarding the same
hyper-parameters to both. `rng` is passed on to the encoder only.
"""
function CVAE(
    rng=Random.default_rng();
    num_latent_dims::Int,
    image_shape::Dims{3},
    max_num_filters::Int,
)
    # Both halves share exactly the same keyword arguments.
    hyperparams = (; num_latent_dims, image_shape, max_num_filters)
    decoder = cvae_decoder(; hyperparams...)
    encoder = cvae_encoder(rng; hyperparams...)
    return CVAE(encoder, decoder)
end
# Full forward pass: encode `x`, decode the sampled latent vector, and return
# `(reconstruction, μ, logσ²)` together with the updated encoder/decoder states.
function (cvae::CVAE)(x, ps, st)
    encoder_out, encoder_state = cvae.encoder(x, ps.encoder, st.encoder)
    latent, mean, log_var = encoder_out
    reconstruction, decoder_state = cvae.decoder(latent, ps.decoder, st.decoder)
    new_state = (; encoder=encoder_state, decoder=decoder_state)
    return (reconstruction, mean, log_var), new_state
end
"""
    encode(cvae::CVAE, x, ps, st)

Run only the encoder on `x` and return the sampled latent vector `z` together with the
model state in which the encoder entry has been replaced by its updated state.
"""
function encode(cvae::CVAE, x, ps, st)
    encoder_out, encoder_state = cvae.encoder(x, ps.encoder, st.encoder)
    # The encoder yields `(z, μ, logσ²)`; only the sample is needed here.
    latent, _, _ = encoder_out
    return latent, (; encoder=encoder_state, decoder=st.decoder)
end
"""
    decode(cvae::CVAE, z, ps, st)

Run only the decoder on the latent vectors `z`.

# Returns
`(x_rec, st_new)`, where `x_rec` is the decoder output and `st_new` is the model state
with the decoder entry replaced by its updated state. The returned state keeps the
`(encoder, decoder)` field order used by the forward pass and by `encode`, so it has the
same layout as the state that was passed in.
"""
function decode(cvae::CVAE, z, ps, st)
    x_rec, st_dec = cvae.decoder(z, ps.decoder, st.decoder)
    # Keep `encoder` first: a NamedTuple's field order is part of its type.
    return x_rec, (; encoder=st.encoder, decoder=st_dec)
end
Loading MNIST
julia
# Thin dataset wrapper: indexing converts the selected samples of `dataset` to images,
# applies `transform` to each of them and stacks the results (see `Base.getindex`).
@concrete struct TensorDataset
# Underlying dataset; passed to `convert2image` together with the requested indices.
dataset
# Transformation applied to every image via `apply`.
transform
# Number of samples reported by `length`; the caller chooses it and it may be smaller than
# the size of `dataset`.
total_samples::Int
end
# Number of samples exposed by the dataset (the stored `total_samples`).
function Base.length(ds::TensorDataset)
    return ds.total_samples
end
# Batched indexing: convert the requested samples to images, apply the dataset transform to
# each one, and stack the resulting arrays along a new trailing dimension.
function Base.getindex(ds::TensorDataset, idxs::Union{Vector{<:Integer},AbstractRange})
    raw_images = convert2image(ds.dataset, idxs)
    items = Image.(eachslice(raw_images; dims=3))
    return stack(items) do item
        transformed = apply(ds.transform, item)
        parent(itemdata(transformed))
    end
end
"""
    loadmnist(batchsize, image_size::Dims{2})

Create a shuffled `DataLoader` over the MNIST training split, with every image rescaled
via `ScaleKeepAspect(image_size)` and converted with `ImageToTensor()`.

When the environment variable `CI` parses as `true`, only the first 5000 samples are used
to keep the demonstration fast; otherwise the whole training split is used. An unset,
empty or unparsable `CI` value is treated as "not on CI".

`partial=false` drops a trailing incomplete batch, so every batch holds exactly
`batchsize` samples.
"""
function loadmnist(batchsize, image_size::Dims{2})
    train_dataset = MNIST(; split=:train)

    # `tryparse` returns `nothing` for values such as "" or "yes"; fall back to `false`
    # instead of throwing while the data is being loaded.
    on_ci = something(tryparse(Bool, get(ENV, "CI", "false")), false)
    # Load MNIST: only 5000 samples for demonstration purposes on CI
    N = on_ci ? 5000 : length(train_dataset)

    train_transform = ScaleKeepAspect(image_size) |> ImageToTensor()
    trainset = TensorDataset(train_dataset, train_transform, N)
    trainloader = DataLoader(trainset; batchsize, shuffle=true, partial=false)

    return trainloader
end
Helper Functions
Generate an Image Grid from a list of images
julia
"""
    create_image_grid(imgs::AbstractArray, grid_rows::Int, grid_cols::Int)

Convert the first `grid_rows * grid_cols` slices along the fourth dimension of `imgs` into
color images and arrange them with the `Vector` method of `create_image_grid`. Slices with
a single channel (third dimension of size 1) become `Gray` images, all others `RGB`; every
image is transposed (`'`) before being placed.
"""
function create_image_grid(imgs::AbstractArray, grid_rows::Int, grid_cols::Int)
    # Wrap one image array as a color image and transpose it.
    function as_tile(img)
        colored = size(img, 3) == 1 ? colorview(Gray, view(img, :, :, 1)) :
                  colorview(RGB, permutedims(img, (3, 1, 2)))
        return colored'
    end

    total_images = grid_rows * grid_cols
    selected = imgs[:, :, :, 1:total_images]
    tiles = map(as_tile, eachslice(selected; dims=4))
    return create_image_grid(tiles, grid_rows, grid_cols)
end
"""
    create_image_grid(images::Vector, grid_rows::Int, grid_cols::Int)

Tile `images` row by row onto a single canvas of `grid_rows` by `grid_cols` cells and
return the canvas. `images` must contain exactly `grid_rows * grid_cols` entries; the cell
size is taken from the first image and the canvas is allocated with `similar` on it.
"""
function create_image_grid(images::Vector, grid_rows::Int, grid_cols::Int)
    # The grid must be filled completely.
    total_images = grid_rows * grid_cols
    @assert length(images) == total_images

    # Every cell has the size of the first image (all images are assumed to match it).
    tile_height, tile_width = size(images[1])
    canvas = similar(images[1], tile_height * grid_rows, tile_width * grid_cols)

    for (idx, tile) in enumerate(images)
        # One-based (row, column) of the cell that receives image number `idx`.
        row, col = fldmod1(idx, grid_cols)
        top = (row - 1) * tile_height + 1
        left = (col - 1) * tile_width + 1
        canvas[top:(top + tile_height - 1), left:(left + tile_width - 1)] .= tile
    end

    return canvas
end
"""
    loss_function(model, ps, st, X)

Loss for one batch `X`: the sum-aggregated squared reconstruction error plus the KL term
`-sum(1 + logσ² - μ² - exp(logσ²)) / 2`.

Returns `(loss, st, stats)`, where `st` is the state returned by `model` and `stats` is a
NamedTuple with the fields `y`, `μ`, `logσ²`, `reconstruction_loss` and `kldiv_loss`.
"""
function loss_function(model, ps, st, X)
    (recon, μ, logσ²), new_state = model(X, ps, st)

    # Summed (not averaged) squared error between the reconstruction and the input.
    squared_error = MSELoss(; agg=sum)
    reconstruction_loss = squared_error(recon, X)
    kldiv_loss = -sum(1 .+ logσ² .- μ .^ 2 .- exp.(logσ²)) / 2

    stats = (; y=recon, μ, logσ², reconstruction_loss, kldiv_loss)
    return reconstruction_loss + kldiv_loss, new_state, stats
end
"""
    generate_images(model, ps, st; num_samples=128, num_latent_dims, decode_compiled=nothing)

Sample `num_samples` latent vectors from a standard normal distribution, decode them in
test mode and arrange the results in an image grid with 8 rows and `num_samples ÷ 8`
columns.

# Keywords
- `num_samples`: number of latent vectors to draw.
- `num_latent_dims`: length of each latent vector.
- `decode_compiled`: optional replacement for `decode` with the same call signature (for
  example a compiled version); `decode` itself is used when it is `nothing`.
"""
function generate_images(
    model, ps, st; num_samples::Int=128, num_latent_dims::Int, decode_compiled=nothing
)
    # Draw the latent vectors on the host, then move them to the device of `ps`/`st`.
    z = get_device((ps, st))(randn(Float32, num_latent_dims, num_samples))

    decode_fn = decode_compiled === nothing ? decode : decode_compiled
    images, _ = decode_fn(model, z, ps, Lux.testmode(st))

    # Always bring the result back to the CPU before building the grid, as
    # `reconstruct_images` does; previously only the compiled branch did the transfer.
    images = cpu_device()(images)
    return create_image_grid(images, 8, num_samples ÷ 8)
end
"""
    reconstruct_images(model, ps, st, X)

Run `model` on the batch `X` in test mode, move the reconstruction to the CPU and arrange
it in an image grid with 8 rows and `size(X, ndims(X)) ÷ 8` columns.
"""
function reconstruct_images(model, ps, st, X)
    outputs, _ = model(X, ps, Lux.testmode(st))
    # The model returns `(reconstruction, μ, logσ²)`; only the first entry is displayed.
    reconstruction, _, _ = outputs
    reconstruction_cpu = cpu_device()(reconstruction)
    batch_length = size(X, ndims(X))
    return create_image_grid(reconstruction_cpu, 8, batch_length ÷ 8)
end
reconstruct_images (generic function with 1 method)
Training the Model
julia
# Train the CVAE on MNIST and return the last image grid that was built: reconstructions of
# one training batch stacked above freshly generated samples, separated by a blank row.
# Returns `nothing` if no grid was built (e.g. `epochs < 1`).
#
# Keywords:
#   batchsize        - samples per training batch (also the batch size of the compiled model)
#   image_size       - two spatial sizes the MNIST images are rescaled to
#   num_latent_dims  - length of the latent vector
#   max_num_filters  - filter count forwarded to the encoder/decoder constructors
#   seed             - seed for the `Xoshiro` RNG used to build and initialize the model
#   epochs           - number of passes over the training data
#   weight_decay     - `lambda` of the `AdamW` optimizer
#   learning_rate    - `eta` of the `AdamW` optimizer
#   num_samples      - number of images to generate for the visualization
# NOTE(review): the final `vcat` needs both grids to have the same width, which looks like
# it requires `num_samples ÷ 8 == batchsize ÷ 8` — confirm before changing the default.
function main(;
batchsize=128,
image_size=(64, 64),
num_latent_dims=8,
max_num_filters=64,
seed=0,
epochs=50,
weight_decay=1.0e-5,
learning_rate=1.0e-3,
num_samples=batchsize,
)
# Seeded RNG used for model construction and parameter/state initialization.
rng = Xoshiro()
Random.seed!(rng, seed)
# Single-channel images: the channel count is fixed to 1.
cvae = CVAE(rng; num_latent_dims, image_shape=(image_size..., 1), max_num_filters)
# Initialize parameters and states and move them to the Reactant device.
ps, st = xdev(Lux.setup(rng, cvae))
# Compile `decode` in test mode on a dummy latent batch of the shape later used by
# `generate_images`, requesting HIGH precision for dot-general and convolution ops.
z = xdev(randn(Float32, num_latent_dims, num_samples))
decode_compiled = Reactant.with_config(;
dot_general_precision=PrecisionConfig.HIGH,
convolution_precision=PrecisionConfig.HIGH,
) do
@compile decode(cvae, z, ps, Lux.testmode(st))
end
# Compile the full forward pass in test mode on a dummy image batch of training-batch shape.
x = xdev(randn(Float32, image_size..., 1, batchsize))
cvae_compiled = Reactant.with_config(;
dot_general_precision=PrecisionConfig.HIGH,
convolution_precision=PrecisionConfig.HIGH,
) do
@compile cvae(x, ps, Lux.testmode(st))
end
# NOTE(review): wrapping the loader in `xdev` presumably moves each batch to the device as
# it is iterated — confirm against the MLDataDevices documentation.
train_dataloader = xdev(loadmnist(batchsize, image_size))
opt = AdamW(; eta=learning_rate, lambda=weight_decay)
train_state = Training.TrainState(cvae, ps, st, opt)
@printf "Total Trainable Parameters: %0.4f M\n" (Lux.parameterlength(ps) / 1.0e6)
# `empty_row` is the blank separator (created on first use); `model_img_full` the result.
empty_row, model_img_full = nothing, nothing
for epoch in 1:epochs
# Per-epoch accumulators for the mean loss and the throughput report.
loss_total = 0.0f0
total_samples = 0
start_time = time()
for (i, X) in enumerate(train_dataloader)
# One optimization step using Enzyme for differentiation; gradients are not returned.
(_, loss, _, train_state) = Training.single_train_step!(
AutoEnzyme(), loss_function, X, train_state; return_gradients=Val(false)
)
loss_total += loss
total_samples += size(X, ndims(X))
# Progress report every 250 iterations and on the last iteration of the epoch.
if i % 250 == 0 || i == length(train_dataloader)
throughput = total_samples / (time() - start_time)
@printf "Epoch %d, Iter %d, Loss: %.7f, Throughput: %.6f im/s\n" epoch i loss throughput
end
end
total_time = time() - start_time
# Mean of the per-batch losses (each batch loss is itself a sum, see `loss_function`).
train_loss = loss_total / length(train_dataloader)
throughput = total_samples / total_time
@printf "Epoch %d, Train Loss: %.7f, Time: %.4fs, Throughput: %.6f im/s\n" epoch train_loss total_time throughput
# Build the visualization after every epoch inside VS Code, otherwise only after the last.
if IN_VSCODE || epoch == epochs
recon_images = reconstruct_images(
cvae_compiled,
train_state.parameters,
train_state.states,
first(train_dataloader),
)
gen_images = generate_images(
cvae,
train_state.parameters,
train_state.states;
num_samples,
num_latent_dims,
decode_compiled,
)
# Zero-filled separator: `image_size[1]` rows, as wide as the generated grid.
if empty_row === nothing
empty_row = similar(gen_images, image_size[1], size(gen_images, 2))
fill!(empty_row, 0)
end
model_img_full = vcat(recon_images, empty_row, gen_images)
IN_VSCODE && display(model_img_full)
end
end
return model_img_full
end
# Run the training loop with the default settings and keep the resulting image grid.
img = main()
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
I0000 00:00:1758410879.715772 2880388 service.cc:158] XLA service 0x1cfcf670 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1758410879.715842 2880388 service.cc:166] StreamExecutor device (0): NVIDIA A100-PCIE-40GB MIG 1g.5gb, Compute Capability 8.0
I0000 00:00:1758410879.716597 2880388 se_gpu_pjrt_client.cc:1338] Using BFC allocator.
I0000 00:00:1758410879.716634 2880388 gpu_helpers.cc:136] XLA backend allocating 3825205248 bytes on device 0 for BFCAllocator.
I0000 00:00:1758410879.716678 2880388 gpu_helpers.cc:177] XLA backend will use up to 1275068416 bytes on device 0 for CollectiveBFCAllocator.
I0000 00:00:1758410879.727673 2880388 cuda_dnn.cc:463] Loaded cuDNN version 91200
Total Trainable Parameters: 0.1493 M
┌ Warning: `training` is set to `Val{true}()` but is not being used within an autodiff call (gradient, jacobian, etc...). This will be slow. If you are using a `Lux.jl` model, set it to inference (test) mode using `LuxCore.testmode`. Reliance on this behavior is discouraged, and is not guaranteed by Semantic Versioning, and might be removed without a deprecation cycle. It is recommended to fix this issue in your code.
└ @ LuxLib.Utils /var/lib/buildkite-agent/builds/gpuci-2/julialang/lux-dot-jl/lib/LuxLib/src/utils.jl:334
Epoch 1, Iter 39, Loss: 24104.1601562, Throughput: 47.980822 im/s
Epoch 1, Train Loss: 39593.5625000, Time: 104.4461s, Throughput: 47.795007 im/s
Epoch 2, Iter 39, Loss: 17491.6796875, Throughput: 1856.405213 im/s
Epoch 2, Train Loss: 20196.1230469, Time: 2.6895s, Throughput: 1856.104880 im/s
Epoch 3, Iter 39, Loss: 16008.7412109, Throughput: 1894.584930 im/s
Epoch 3, Train Loss: 16441.6347656, Time: 2.6353s, Throughput: 1894.302451 im/s
Epoch 4, Iter 39, Loss: 15207.7216797, Throughput: 1892.121734 im/s
Epoch 4, Train Loss: 14935.9755859, Time: 2.6387s, Throughput: 1891.830588 im/s
Epoch 5, Iter 39, Loss: 13629.2490234, Throughput: 1899.070777 im/s
Epoch 5, Train Loss: 13963.3857422, Time: 2.6291s, Throughput: 1898.782654 im/s
Epoch 6, Iter 39, Loss: 13297.1845703, Throughput: 1896.192446 im/s
Epoch 6, Train Loss: 13290.6357422, Time: 2.6330s, Throughput: 1895.925281 im/s
Epoch 7, Iter 39, Loss: 12183.3437500, Throughput: 1898.037522 im/s
Epoch 7, Train Loss: 12871.8935547, Time: 2.6304s, Throughput: 1897.784974 im/s
Epoch 8, Iter 39, Loss: 12439.2714844, Throughput: 1900.766324 im/s
Epoch 8, Train Loss: 12421.9775391, Time: 2.6267s, Throughput: 1900.515982 im/s
Epoch 9, Iter 39, Loss: 12354.8769531, Throughput: 1895.994469 im/s
Epoch 9, Train Loss: 12344.2656250, Time: 2.6333s, Throughput: 1895.744696 im/s
Epoch 10, Iter 39, Loss: 12412.8203125, Throughput: 1896.599690 im/s
Epoch 10, Train Loss: 11955.5107422, Time: 2.6324s, Throughput: 1896.354739 im/s
Epoch 11, Iter 39, Loss: 11820.3300781, Throughput: 1899.990151 im/s
Epoch 11, Train Loss: 11757.1093750, Time: 2.6278s, Throughput: 1899.707437 im/s
Epoch 12, Iter 39, Loss: 11808.2578125, Throughput: 1896.143506 im/s
Epoch 12, Train Loss: 11506.7744141, Time: 2.6331s, Throughput: 1895.853352 im/s
Epoch 13, Iter 39, Loss: 11154.9941406, Throughput: 1895.787951 im/s
Epoch 13, Train Loss: 11386.1201172, Time: 2.6335s, Throughput: 1895.549558 im/s
Epoch 14, Iter 39, Loss: 11037.3408203, Throughput: 1899.676413 im/s
Epoch 14, Train Loss: 11185.4687500, Time: 2.6281s, Throughput: 1899.435663 im/s
Epoch 15, Iter 39, Loss: 10789.7119141, Throughput: 1897.982637 im/s
Epoch 15, Train Loss: 11095.8828125, Time: 2.6305s, Throughput: 1897.740940 im/s
Epoch 16, Iter 39, Loss: 11899.7529297, Throughput: 1905.254132 im/s
Epoch 16, Train Loss: 10994.0332031, Time: 2.6205s, Throughput: 1904.944372 im/s
Epoch 17, Iter 39, Loss: 10160.9023438, Throughput: 1897.435854 im/s
Epoch 17, Train Loss: 10844.6328125, Time: 2.6313s, Throughput: 1897.173496 im/s
Epoch 18, Iter 39, Loss: 10253.7050781, Throughput: 1905.902926 im/s
Epoch 18, Train Loss: 10754.7568359, Time: 2.6196s, Throughput: 1905.605269 im/s
Epoch 19, Iter 39, Loss: 10266.9238281, Throughput: 1899.185672 im/s
Epoch 19, Train Loss: 10669.4970703, Time: 2.6288s, Throughput: 1898.941429 im/s
Epoch 20, Iter 39, Loss: 11241.9667969, Throughput: 1899.357609 im/s
Epoch 20, Train Loss: 10650.4980469, Time: 2.6286s, Throughput: 1899.111944 im/s
Epoch 21, Iter 39, Loss: 10402.2597656, Throughput: 1903.577917 im/s
Epoch 21, Train Loss: 10614.6855469, Time: 2.6228s, Throughput: 1903.332544 im/s
Epoch 22, Iter 39, Loss: 10133.6230469, Throughput: 1900.033600 im/s
Epoch 22, Train Loss: 10492.8017578, Time: 2.6277s, Throughput: 1899.778970 im/s
Epoch 23, Iter 39, Loss: 10275.3896484, Throughput: 1904.593306 im/s
Epoch 23, Train Loss: 10426.1650391, Time: 2.6214s, Throughput: 1904.347671 im/s
Epoch 24, Iter 39, Loss: 10957.4716797, Throughput: 1900.351769 im/s
Epoch 24, Train Loss: 10377.8691406, Time: 2.6273s, Throughput: 1900.061015 im/s
Epoch 25, Iter 39, Loss: 10527.3056641, Throughput: 1892.163456 im/s
Epoch 25, Train Loss: 10272.5751953, Time: 2.6386s, Throughput: 1891.910247 im/s
Epoch 26, Iter 39, Loss: 10543.8212891, Throughput: 1901.803416 im/s
Epoch 26, Train Loss: 10169.3632812, Time: 2.6252s, Throughput: 1901.553492 im/s
Epoch 27, Iter 39, Loss: 9660.7343750, Throughput: 1899.097475 im/s
Epoch 27, Train Loss: 10144.2324219, Time: 2.6291s, Throughput: 1898.762335 im/s
Epoch 28, Iter 39, Loss: 9772.7597656, Throughput: 1904.270771 im/s
Epoch 28, Train Loss: 10070.5859375, Time: 2.6219s, Throughput: 1903.965659 im/s
Epoch 29, Iter 39, Loss: 9855.8828125, Throughput: 1895.384143 im/s
Epoch 29, Train Loss: 10141.4912109, Time: 2.6342s, Throughput: 1895.108459 im/s
Epoch 30, Iter 39, Loss: 10570.7451172, Throughput: 1887.006497 im/s
Epoch 30, Train Loss: 10058.4375000, Time: 2.6459s, Throughput: 1886.687681 im/s
Epoch 31, Iter 39, Loss: 9784.6894531, Throughput: 1901.418799 im/s
Epoch 31, Train Loss: 10003.3310547, Time: 2.6258s, Throughput: 1901.133415 im/s
Epoch 32, Iter 39, Loss: 9250.6972656, Throughput: 1897.166104 im/s
Epoch 32, Train Loss: 9994.3437500, Time: 2.6317s, Throughput: 1896.857593 im/s
Epoch 33, Iter 39, Loss: 10186.8691406, Throughput: 1902.249021 im/s
Epoch 33, Train Loss: 9880.8652344, Time: 2.6246s, Throughput: 1901.970817 im/s
Epoch 34, Iter 39, Loss: 9397.9609375, Throughput: 1897.212862 im/s
Epoch 34, Train Loss: 9785.4375000, Time: 2.6316s, Throughput: 1896.923068 im/s
Epoch 35, Iter 39, Loss: 10727.7675781, Throughput: 1903.132381 im/s
Epoch 35, Train Loss: 9794.9101562, Time: 2.6234s, Throughput: 1902.882799 im/s
Epoch 36, Iter 39, Loss: 10081.8261719, Throughput: 1897.972486 im/s
Epoch 36, Train Loss: 9836.0371094, Time: 2.6305s, Throughput: 1897.738704 im/s
Epoch 37, Iter 39, Loss: 9600.6416016, Throughput: 1898.672801 im/s
Epoch 37, Train Loss: 9737.6093750, Time: 2.6296s, Throughput: 1898.404077 im/s
Epoch 38, Iter 39, Loss: 10548.1875000, Throughput: 1903.498830 im/s
Epoch 38, Train Loss: 9654.7968750, Time: 2.6229s, Throughput: 1903.223029 im/s
Epoch 39, Iter 39, Loss: 10332.8535156, Throughput: 1891.843408 im/s
Epoch 39, Train Loss: 9644.2734375, Time: 2.6390s, Throughput: 1891.627710 im/s
Epoch 40, Iter 39, Loss: 9197.0117188, Throughput: 1902.649362 im/s
Epoch 40, Train Loss: 9706.4296875, Time: 2.6241s, Throughput: 1902.395586 im/s
Epoch 41, Iter 39, Loss: 9683.7275391, Throughput: 1896.666694 im/s
Epoch 41, Train Loss: 9591.5205078, Time: 2.6324s, Throughput: 1896.382048 im/s
Epoch 42, Iter 39, Loss: 10255.5205078, Throughput: 1900.690403 im/s
Epoch 42, Train Loss: 9573.6347656, Time: 2.6268s, Throughput: 1900.393854 im/s
Epoch 43, Iter 39, Loss: 9754.3710938, Throughput: 1900.193447 im/s
Epoch 43, Train Loss: 9586.4394531, Time: 2.6274s, Throughput: 1899.988772 im/s
Epoch 44, Iter 39, Loss: 9217.6054688, Throughput: 1895.843911 im/s
Epoch 44, Train Loss: 9525.3554688, Time: 2.6335s, Throughput: 1895.597095 im/s
Epoch 45, Iter 39, Loss: 9316.2041016, Throughput: 1900.017738 im/s
Epoch 45, Train Loss: 9446.2207031, Time: 2.6277s, Throughput: 1899.789140 im/s
Epoch 46, Iter 39, Loss: 9843.9531250, Throughput: 1892.862912 im/s
Epoch 46, Train Loss: 9457.7294922, Time: 2.6377s, Throughput: 1892.578723 im/s
Epoch 47, Iter 39, Loss: 9868.5722656, Throughput: 1900.577569 im/s
Epoch 47, Train Loss: 9429.8271484, Time: 2.6270s, Throughput: 1900.283815 im/s
Epoch 48, Iter 39, Loss: 9085.4609375, Throughput: 1895.905882 im/s
Epoch 48, Train Loss: 9347.3271484, Time: 2.6334s, Throughput: 1895.664027 im/s
Epoch 49, Iter 39, Loss: 9334.4365234, Throughput: 1894.954611 im/s
Epoch 49, Train Loss: 9330.2333984, Time: 2.6347s, Throughput: 1894.715742 im/s
Epoch 50, Iter 39, Loss: 8783.9609375, Throughput: 1897.651502 im/s
Epoch 50, Train Loss: 9397.8701172, Time: 2.6310s, Throughput: 1897.363810 im/s
Appendix
julia
using InteractiveUtils
# Print the Julia version and platform information.
InteractiveUtils.versioninfo()
# GPU backend details are printed only when `MLDataDevices` is defined in this scope, the
# respective backend package (`CUDA` / `AMDGPU`) is defined as well, and
# `MLDataDevices.functional` returns `true` for that backend's device type.
if @isdefined(MLDataDevices)
if @isdefined(CUDA) && MLDataDevices.functional(CUDADevice)
# Blank line to separate this report from the previous output.
println()
CUDA.versioninfo()
end
if @isdefined(AMDGPU) && MLDataDevices.functional(AMDGPUDevice)
# Blank line to separate this report from the previous output.
println()
AMDGPU.versioninfo()
end
end
Julia Version 1.11.7
Commit f2b3dbda30a (2025-09-08 12:10 UTC)
Build Info:
Official https://julialang.org/ release
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 48 × AMD EPYC 7402 24-Core Processor
WORD_SIZE: 64
LLVM: libLLVM-16.0.6 (ORCJIT, znver2)
Threads: 48 default, 0 interactive, 24 GC (on 2 virtual cores)
Environment:
JULIA_CPU_THREADS = 2
JULIA_DEPOT_PATH = /root/.cache/julia-buildkite-plugin/depots/01872db4-8c79-43af-ab7d-12abac4f24f6
LD_LIBRARY_PATH = /usr/local/nvidia/lib:/usr/local/nvidia/lib64
JULIA_PKG_SERVER =
JULIA_NUM_THREADS = 48
JULIA_CUDA_HARD_MEMORY_LIMIT = 100%
JULIA_PKG_PRECOMPILE_AUTO = 0
JULIA_DEBUG = Literate
This page was generated using Literate.jl.