Convolutional VAE for MNIST using Reactant
Convolutional variational autoencoder (CVAE) implementation with Lux and Reactant, trained on MNIST. This is based on the CVAE implementation in MLX.
julia
using Lux, Reactant, MLDatasets, Random, Statistics, Enzyme, MLUtils, DataAugmentation,
ConcreteStructs, OneHotArrays, ImageShow, Images, Printf, Optimisers
const xdev = reactant_device(; force = true)
const cdev = cpu_device()
(::MLDataDevices.CPUDevice) (generic function with 1 method)
Model Definition
First we will define the encoder. It maps the input to a normal distribution in latent space and samples a latent vector from that distribution.
julia
function cvae_encoder(
rng = Random.default_rng(); num_latent_dims::Int,
image_shape::Dims{3}, max_num_filters::Int
)
flattened_dim = prod(image_shape[1:2] .÷ 8) * max_num_filters
return @compact(;
embed = Chain(
Chain(
Conv((3, 3), image_shape[3] => max_num_filters ÷ 4; stride = 2, pad = 1),
BatchNorm(max_num_filters ÷ 4, leakyrelu)
),
Chain(
Conv((3, 3), max_num_filters ÷ 4 => max_num_filters ÷ 2; stride = 2, pad = 1),
BatchNorm(max_num_filters ÷ 2, leakyrelu)
),
Chain(
Conv((3, 3), max_num_filters ÷ 2 => max_num_filters; stride = 2, pad = 1),
BatchNorm(max_num_filters, leakyrelu)
),
FlattenLayer()
),
proj_mu = Dense(flattened_dim, num_latent_dims; init_bias = zeros32),
proj_log_var = Dense(flattened_dim, num_latent_dims; init_bias = zeros32),
rng
) do x
y = embed(x)
μ = proj_mu(y)
logσ² = proj_log_var(y)
T = eltype(logσ²)
logσ² = clamp.(logσ², -T(20.0f0), T(10.0f0))
σ = exp.(logσ² .* T(0.5))
# Generate a tensor of random values from a normal distribution
rng = Lux.replicate(rng)
ϵ = randn_like(rng, σ)
# Reparameterization trick to backpropagate through sampling
z = ϵ .* σ .+ μ
@return z, μ, logσ²
end
end
cvae_encoder (generic function with 2 methods)
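The last few lines of the encoder implement the reparameterization trick: instead of sampling z ~ N(μ, σ²) directly, we sample ϵ ~ N(0, I) and compute z = μ + σ ⊙ ϵ, so gradients can flow through μ and σ while the randomness stays in ϵ. Here is a minimal standalone sketch of the idea (illustrative only; these variables are not part of the model):
julia
# Reparameterized sampling for a single 8-dimensional latent vector.
using Random

μ = randn(Float32, 8)        # mean predicted by an encoder
logσ² = randn(Float32, 8)    # log-variance predicted by an encoder
σ = exp.(logσ² .* 0.5f0)     # standard deviation recovered from the log-variance
ϵ = randn(Float32, 8)        # noise, drawn independently of the parameters
z = μ .+ σ .* ϵ              # differentiable with respect to μ and σ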
Similarly, we define the decoder.
julia
function cvae_decoder(; num_latent_dims::Int, image_shape::Dims{3}, max_num_filters::Int)
flattened_dim = prod(image_shape[1:2] .÷ 8) * max_num_filters
return @compact(;
linear = Dense(num_latent_dims, flattened_dim),
upchain = Chain(
Chain(
Upsample(2),
Conv((3, 3), max_num_filters => max_num_filters ÷ 2; stride = 1, pad = 1),
BatchNorm(max_num_filters ÷ 2, leakyrelu)
),
Chain(
Upsample(2),
Conv((3, 3), max_num_filters ÷ 2 => max_num_filters ÷ 4; stride = 1, pad = 1),
BatchNorm(max_num_filters ÷ 4, leakyrelu)
),
Chain(
Upsample(2),
Conv(
(3, 3), max_num_filters ÷ 4 => image_shape[3],
sigmoid; stride = 1, pad = 1
)
)
),
max_num_filters
) do x
y = linear(x)
img = reshape(y, image_shape[1] ÷ 8, image_shape[2] ÷ 8, max_num_filters, :)
@return upchain(img)
end
end
@concrete struct CVAE <: Lux.AbstractLuxContainerLayer{(:encoder, :decoder)}
encoder <: Lux.AbstractLuxLayer
decoder <: Lux.AbstractLuxLayer
end
function CVAE(
rng = Random.default_rng(); num_latent_dims::Int,
image_shape::Dims{3}, max_num_filters::Int
)
decoder = cvae_decoder(; num_latent_dims, image_shape, max_num_filters)
encoder = cvae_encoder(rng; num_latent_dims, image_shape, max_num_filters)
return CVAE(encoder, decoder)
end
function (cvae::CVAE)(x, ps, st)
(z, μ, logσ²), st_enc = cvae.encoder(x, ps.encoder, st.encoder)
x_rec, st_dec = cvae.decoder(z, ps.decoder, st.decoder)
return (x_rec, μ, logσ²), (; encoder = st_enc, decoder = st_dec)
end
function encode(cvae::CVAE, x, ps, st)
(z, _, _), st_enc = cvae.encoder(x, ps.encoder, st.encoder)
return z, (; encoder = st_enc, st.decoder)
end
function decode(cvae::CVAE, z, ps, st)
x_rec, st_dec = cvae.decoder(z, ps.decoder, st.decoder)
return x_rec, (; decoder = st_dec, st.encoder)
end
decode (generic function with 1 method)
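As a quick sanity check (not part of the original training loop), the container layer can be instantiated and run on the CPU; the shapes below assume the 64×64 grayscale configuration used in main further down:
julia
# Hypothetical CPU smoke test; shapes follow the (64, 64, 1) image setup used later.
rng = Random.default_rng()
model = CVAE(rng; num_latent_dims = 8, image_shape = (64, 64, 1), max_num_filters = 64)
ps, st = Lux.setup(rng, model)
x = randn(Float32, 64, 64, 1, 4)
(x_rec, μ, logσ²), _ = model(x, ps, st)
@assert size(x_rec) == (64, 64, 1, 4)  # three stride-2 convs down, three 2× upsamples back
@assert size(μ) == (8, 4)              # one latent mean per sample in the batch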
Loading MNIST
julia
@concrete struct TensorDataset
dataset
transform
total_samples::Int
end
Base.length(ds::TensorDataset) = ds.total_samples
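# getindex converts the requested digits to images, applies the transform
# pipeline, and stacks the results into a 4-D array for the DataLoader.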
function Base.getindex(ds::TensorDataset, idxs::Union{Vector{<:Integer}, AbstractRange})
img = Image.(eachslice(convert2image(ds.dataset, idxs); dims = 3))
return stack(parent ∘ itemdata ∘ Base.Fix1(apply, ds.transform), img)
end
function loadmnist(batchsize, image_size::Dims{2})
# Load MNIST: use only 1500 images on CI for demonstration purposes
train_dataset = MNIST(; split = :train)
N = parse(Bool, get(ENV, "CI", "false")) ? 1500 : length(train_dataset)
train_transform = ScaleKeepAspect(image_size) |> ImageToTensor()
trainset = TensorDataset(train_dataset, train_transform, N)
trainloader = DataLoader(trainset; batchsize, shuffle = true, partial = false)
return trainloader
end
loadmnist (generic function with 1 method)
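A quick, hypothetical way to inspect the loader on the CPU (the batch shape assumes the channels-last tensor layout produced by ImageToTensor):
julia
loader = loadmnist(8, (64, 64))
X = first(loader)
size(X)  # expected: (64, 64, 1, 8)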
Helper Functions
Generate an Image Grid from a List of Images
julia
function create_image_grid(imgs::AbstractArray, grid_rows::Int, grid_cols::Int)
total_images = grid_rows * grid_cols
imgs = map(eachslice(imgs[:, :, :, 1:total_images]; dims = 4)) do img
cimg = size(img, 3) == 1 ? colorview(Gray, view(img, :, :, 1)) :
colorview(RGB, permutedims(img, (3, 1, 2)))
return cimg'
end
return create_image_grid(imgs, grid_rows, grid_cols)
end
function create_image_grid(images::Vector, grid_rows::Int, grid_cols::Int)
# Check if the number of images matches the grid
total_images = grid_rows * grid_cols
@assert length(images) == total_images
# Get the size of a single image (assuming all images are the same size)
img_height, img_width = size(images[1])
# Create a blank grid canvas
grid_height = img_height * grid_rows
grid_width = img_width * grid_cols
grid_canvas = similar(images[1], grid_height, grid_width)
# Place each image in the correct position on the canvas
for idx in 1:total_images
row = div(idx - 1, grid_cols) + 1
col = mod(idx - 1, grid_cols) + 1
start_row = (row - 1) * img_height + 1
start_col = (col - 1) * img_width + 1
grid_canvas[start_row:(start_row + img_height - 1), start_col:(start_col + img_width - 1)] .= images[idx]
end
return grid_canvas
end
function loss_function(model, ps, st, X)
(y, μ, logσ²), st = model(X, ps, st)
reconstruction_loss = MSELoss(; agg = sum)(y, X)
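# Closed-form KL divergence between the posterior N(μ, σ²) and the standard
# normal prior: KL = -1/2 * Σ(1 + logσ² - μ² - σ²)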
kldiv_loss = -sum(1 .+ logσ² .- μ .^ 2 .- exp.(logσ²)) / 2
loss = reconstruction_loss + kldiv_loss
return loss, st, (; y, μ, logσ², reconstruction_loss, kldiv_loss)
end
function generate_images(
model, ps, st; num_samples::Int = 128, num_latent_dims::Int, decode_compiled = nothing
)
z = randn(Float32, num_latent_dims, num_samples) |> get_device((ps, st))
if decode_compiled === nothing
images, _ = decode(model, z, ps, Lux.testmode(st))
else
images, _ = decode_compiled(model, z, ps, Lux.testmode(st))
images = images |> cpu_device()
end
return create_image_grid(images, 8, num_samples ÷ 8)
end
function reconstruct_images(model, ps, st, X)
(recon, _, _), _ = model(X, ps, Lux.testmode(st))
recon = recon |> cpu_device()
return create_image_grid(recon, 8, size(X, ndims(X)) ÷ 8)
end
reconstruct_images (generic function with 1 method)
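Together, loss_function implements the negative evidence lower bound (ELBO): the sum-aggregated MSE term plays the role of the reconstruction (negative log-likelihood) loss, while the KL term regularizes the approximate posterior toward the standard normal prior.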
Training the Model
julia
function main(;
batchsize = 128, image_size = (64, 64), num_latent_dims = 8, max_num_filters = 64,
seed = 0, epochs = 50, weight_decay = 1.0e-5, learning_rate = 1.0e-3, num_samples = batchsize
)
rng = Xoshiro()
Random.seed!(rng, seed)
cvae = CVAE(rng; num_latent_dims, image_shape = (image_size..., 1), max_num_filters)
ps, st = Lux.setup(rng, cvae) |> xdev
z = randn(Float32, num_latent_dims, num_samples) |> xdev
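# @compile traces each call with these example inputs and returns an
# XLA-compiled version specialized to their shapes; the compiled functions
# are reused for inference during training.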
decode_compiled = @compile decode(cvae, z, ps, Lux.testmode(st))
x = randn(Float32, image_size..., 1, batchsize) |> xdev
cvae_compiled = @compile cvae(x, ps, Lux.testmode(st))
train_dataloader = loadmnist(batchsize, image_size) |> xdev
opt = AdamW(; eta = learning_rate, lambda = weight_decay)
train_state = Training.TrainState(cvae, ps, st, opt)
@printf "Total Trainable Parameters: %0.4f M\n" (Lux.parameterlength(ps) / 1.0e6)
is_vscode = isdefined(Main, :VSCodeServer)
empty_row, model_img_full = nothing, nothing
for epoch in 1:epochs
loss_total = 0.0f0
total_samples = 0
start_time = time()
for (i, X) in enumerate(train_dataloader)
(_, loss, _, train_state) = Training.single_train_step!(
AutoEnzyme(), loss_function, X, train_state; return_gradients = Val(false)
)
loss_total += loss
total_samples += size(X, ndims(X))
if i % 250 == 0 || i == length(train_dataloader)
throughput = total_samples / (time() - start_time)
@printf "Epoch %d, Iter %d, Loss: %.7f, Throughput: %.6f im/s\n" epoch i loss throughput
end
end
total_time = time() - start_time
train_loss = loss_total / length(train_dataloader)
throughput = total_samples / total_time
@printf "Epoch %d, Train Loss: %.7f, Time: %.4fs, Throughput: %.6f im/s\n" epoch train_loss total_time throughput
if is_vscode || epoch == epochs
recon_images = reconstruct_images(
cvae_compiled, train_state.parameters, train_state.states,
first(train_dataloader)
)
gen_images = generate_images(
cvae, train_state.parameters, train_state.states;
num_samples, num_latent_dims, decode_compiled
)
if empty_row === nothing
empty_row = similar(gen_images, image_size[1], size(gen_images, 2))
fill!(empty_row, 0)
end
model_img_full = vcat(recon_images, empty_row, gen_images)
is_vscode && display(model_img_full)
end
end
return model_img_full
end
img = main()
2025-03-11 23:16:35.416356: I external/xla/xla/service/service.cc:152] XLA service 0x9df14f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-03-11 23:16:35.416393: I external/xla/xla/service/service.cc:160] StreamExecutor device (0): NVIDIA A100-PCIE-40GB MIG 1g.5gb, Compute Capability 8.0
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
I0000 00:00:1741734995.417174 1305528 se_gpu_pjrt_client.cc:951] Using BFC allocator.
I0000 00:00:1741734995.417237 1305528 gpu_helpers.cc:136] XLA backend allocating 3825205248 bytes on device 0 for BFCAllocator.
I0000 00:00:1741734995.417290 1305528 gpu_helpers.cc:177] XLA backend will use up to 1275068416 bytes on device 0 for CollectiveBFCAllocator.
I0000 00:00:1741734995.432973 1305528 cuda_dnn.cc:529] Loaded cuDNN version 90400
┌ Warning: `training` is set to `Val{false}()` but is being used within an autodiff call (gradient, jacobian, etc...). This might lead to incorrect results. If you are using a `Lux.jl` model, set it to training mode using `LuxCore.trainmode`.
└ @ LuxLib.Utils /var/lib/buildkite-agent/builds/gpuci-7/julialang/lux-dot-jl/lib/LuxLib/src/utils.jl:340
Total Trainable Parameters: 0.1493 M
Epoch 1, Iter 11, Loss: 63860.7343750, Throughput: 20.753901 im/s
Epoch 1, Train Loss: 76687.9843750, Time: 68.2567s, Throughput: 20.628008 im/s
Epoch 2, Iter 11, Loss: 50712.7187500, Throughput: 1693.280539 im/s
Epoch 2, Train Loss: 54648.9765625, Time: 0.8319s, Throughput: 1692.568115 im/s
Epoch 3, Iter 11, Loss: 49819.5859375, Throughput: 1796.883567 im/s
Epoch 3, Train Loss: 50235.8007812, Time: 0.7840s, Throughput: 1796.012493 im/s
Epoch 4, Iter 11, Loss: 51618.7695312, Throughput: 1816.652695 im/s
Epoch 4, Train Loss: 50766.0859375, Time: 0.7754s, Throughput: 1815.905286 im/s
Epoch 5, Iter 11, Loss: 49920.3750000, Throughput: 1816.544288 im/s
Epoch 5, Train Loss: 51354.7968750, Time: 0.7755s, Throughput: 1815.647354 im/s
Epoch 6, Iter 11, Loss: 49371.0781250, Throughput: 1816.045448 im/s
Epoch 6, Train Loss: 50771.1640625, Time: 0.7756s, Throughput: 1815.349875 im/s
Epoch 7, Iter 11, Loss: 51831.1562500, Throughput: 1803.668932 im/s
Epoch 7, Train Loss: 51772.7148438, Time: 0.7809s, Throughput: 1802.983356 im/s
Epoch 8, Iter 11, Loss: 49106.3476562, Throughput: 1801.562105 im/s
Epoch 8, Train Loss: 50532.3906250, Time: 0.7818s, Throughput: 1800.891859 im/s
Epoch 9, Iter 11, Loss: 48753.0195312, Throughput: 1821.538255 im/s
Epoch 9, Train Loss: 49127.0390625, Time: 0.7733s, Throughput: 1820.850259 im/s
Epoch 10, Iter 11, Loss: 50092.2656250, Throughput: 1823.133033 im/s
Epoch 10, Train Loss: 49258.7382812, Time: 0.7726s, Throughput: 1822.420212 im/s
Epoch 11, Iter 11, Loss: 51411.0546875, Throughput: 1821.724806 im/s
Epoch 11, Train Loss: 50068.2500000, Time: 0.7733s, Throughput: 1820.791873 im/s
Epoch 12, Iter 11, Loss: 51336.9843750, Throughput: 1821.198966 im/s
Epoch 12, Train Loss: 50676.2656250, Time: 0.7734s, Throughput: 1820.438833 im/s
Epoch 13, Iter 11, Loss: 49087.9765625, Throughput: 1783.003391 im/s
Epoch 13, Train Loss: 50930.9609375, Time: 0.7899s, Throughput: 1782.452321 im/s
Epoch 14, Iter 11, Loss: 53517.5703125, Throughput: 1814.992248 im/s
Epoch 14, Train Loss: 51167.8710938, Time: 0.7760s, Throughput: 1814.358239 im/s
Epoch 15, Iter 11, Loss: 52977.5859375, Throughput: 1818.637038 im/s
Epoch 15, Train Loss: 51445.3789062, Time: 0.7745s, Throughput: 1817.982011 im/s
Epoch 16, Iter 11, Loss: 49285.3281250, Throughput: 1813.832743 im/s
Epoch 16, Train Loss: 52015.2500000, Time: 0.7765s, Throughput: 1813.217914 im/s
Epoch 17, Iter 11, Loss: 49648.7460938, Throughput: 1821.107424 im/s
Epoch 17, Train Loss: 51780.6367188, Time: 0.7734s, Throughput: 1820.434343 im/s
Epoch 18, Iter 11, Loss: 51579.3476562, Throughput: 1789.704206 im/s
Epoch 18, Train Loss: 51411.8398438, Time: 0.7870s, Throughput: 1789.024325 im/s
Epoch 19, Iter 11, Loss: 50840.2109375, Throughput: 1819.116570 im/s
Epoch 19, Train Loss: 50885.6406250, Time: 0.7743s, Throughput: 1818.524474 im/s
Epoch 20, Iter 11, Loss: 50992.2304688, Throughput: 1802.643790 im/s
Epoch 20, Train Loss: 50648.8710938, Time: 0.7815s, Throughput: 1801.762726 im/s
Epoch 21, Iter 11, Loss: 51737.2968750, Throughput: 1818.414164 im/s
Epoch 21, Train Loss: 50757.0507812, Time: 0.7746s, Throughput: 1817.728525 im/s
Epoch 22, Iter 11, Loss: 51831.1093750, Throughput: 1822.830846 im/s
Epoch 22, Train Loss: 49888.0625000, Time: 0.7727s, Throughput: 1822.163237 im/s
Epoch 23, Iter 11, Loss: 50836.8242188, Throughput: 1810.492415 im/s
Epoch 23, Train Loss: 49248.8398438, Time: 0.7780s, Throughput: 1809.752283 im/s
Epoch 24, Iter 11, Loss: 48332.6132812, Throughput: 1803.835862 im/s
Epoch 24, Train Loss: 48788.5234375, Time: 0.7809s, Throughput: 1803.075837 im/s
Epoch 25, Iter 11, Loss: 51076.3515625, Throughput: 1822.377471 im/s
Epoch 25, Train Loss: 49380.5000000, Time: 0.7729s, Throughput: 1821.774259 im/s
Epoch 26, Iter 11, Loss: 48839.1367188, Throughput: 1780.599263 im/s
Epoch 26, Train Loss: 49971.9335938, Time: 0.7911s, Throughput: 1779.863518 im/s
Epoch 27, Iter 11, Loss: 51517.9453125, Throughput: 1750.956344 im/s
Epoch 27, Train Loss: 49365.1484375, Time: 0.8045s, Throughput: 1750.159820 im/s
Epoch 28, Iter 11, Loss: 50281.6406250, Throughput: 1769.222045 im/s
Epoch 28, Train Loss: 49344.8281250, Time: 0.7961s, Throughput: 1768.637613 im/s
Epoch 29, Iter 11, Loss: 48107.8046875, Throughput: 1817.860015 im/s
Epoch 29, Train Loss: 49317.1289062, Time: 0.7750s, Throughput: 1816.891348 im/s
Epoch 30, Iter 11, Loss: 50180.8398438, Throughput: 1798.758638 im/s
Epoch 30, Train Loss: 49567.7460938, Time: 0.7832s, Throughput: 1797.851263 im/s
Epoch 31, Iter 11, Loss: 50080.4609375, Throughput: 1821.208514 im/s
Epoch 31, Train Loss: 50245.8476562, Time: 0.7735s, Throughput: 1820.262083 im/s
Epoch 32, Iter 11, Loss: 49447.4843750, Throughput: 1820.845767 im/s
Epoch 32, Train Loss: 50399.0585938, Time: 0.7736s, Throughput: 1820.120709 im/s
Epoch 33, Iter 11, Loss: 49561.9335938, Throughput: 1781.142205 im/s
Epoch 33, Train Loss: 50634.0664062, Time: 0.7909s, Throughput: 1780.304033 im/s
Epoch 34, Iter 11, Loss: 52728.8515625, Throughput: 1822.009762 im/s
Epoch 34, Train Loss: 50756.5859375, Time: 0.7731s, Throughput: 1821.147297 im/s
Epoch 35, Iter 11, Loss: 52424.9414062, Throughput: 1822.901741 im/s
Epoch 35, Train Loss: 51304.1640625, Time: 0.7727s, Throughput: 1822.224522 im/s
Epoch 36, Iter 11, Loss: 49819.3750000, Throughput: 1822.630568 im/s
Epoch 36, Train Loss: 51487.1835938, Time: 0.7729s, Throughput: 1821.795615 im/s
Epoch 37, Iter 11, Loss: 50790.7187500, Throughput: 1806.693880 im/s
Epoch 37, Train Loss: 51998.2148438, Time: 0.7797s, Throughput: 1805.929237 im/s
Epoch 38, Iter 11, Loss: 49208.7265625, Throughput: 1793.306305 im/s
Epoch 38, Train Loss: 51508.5273438, Time: 0.7854s, Throughput: 1792.767352 im/s
Epoch 39, Iter 11, Loss: 51422.6679688, Throughput: 1808.520813 im/s
Epoch 39, Train Loss: 51812.0117188, Time: 0.7788s, Throughput: 1807.852021 im/s
Epoch 40, Iter 11, Loss: 53149.1484375, Throughput: 1823.373392 im/s
Epoch 40, Train Loss: 52292.9726562, Time: 0.7725s, Throughput: 1822.646319 im/s
Epoch 41, Iter 11, Loss: 53032.9960938, Throughput: 1805.021787 im/s
Epoch 41, Train Loss: 53116.9843750, Time: 0.7804s, Throughput: 1804.295492 im/s
Epoch 42, Iter 11, Loss: 52812.6601562, Throughput: 1813.921326 im/s
Epoch 42, Train Loss: 52715.4023438, Time: 0.7765s, Throughput: 1813.236843 im/s
Epoch 43, Iter 11, Loss: 48705.9062500, Throughput: 1806.839810 im/s
Epoch 43, Train Loss: 51291.0351562, Time: 0.7795s, Throughput: 1806.225846 im/s
Epoch 44, Iter 11, Loss: 49732.6640625, Throughput: 1804.253597 im/s
Epoch 44, Train Loss: 50392.5468750, Time: 0.7807s, Throughput: 1803.601728 im/s
Epoch 45, Iter 11, Loss: 49704.3203125, Throughput: 1824.510182 im/s
Epoch 45, Train Loss: 50869.4726562, Time: 0.7720s, Throughput: 1823.831768 im/s
Epoch 46, Iter 11, Loss: 51301.3007812, Throughput: 1820.003474 im/s
Epoch 46, Train Loss: 51399.8398438, Time: 0.7739s, Throughput: 1819.255547 im/s
Epoch 47, Iter 11, Loss: 50735.7226562, Throughput: 1821.943995 im/s
Epoch 47, Train Loss: 50576.2500000, Time: 0.7731s, Throughput: 1821.302874 im/s
Epoch 48, Iter 11, Loss: 46476.7578125, Throughput: 1807.830991 im/s
Epoch 48, Train Loss: 48956.6406250, Time: 0.7792s, Throughput: 1807.046585 im/s
Epoch 49, Iter 11, Loss: 49552.1835938, Throughput: 1809.480018 im/s
Epoch 49, Train Loss: 49457.6914062, Time: 0.7784s, Throughput: 1808.792235 im/s
Epoch 50, Iter 11, Loss: 50881.5742188, Throughput: 1810.520723 im/s
Epoch 50, Train Loss: 49874.0117188, Time: 0.7780s, Throughput: 1809.813291 im/s
Appendix
julia
using InteractiveUtils
InteractiveUtils.versioninfo()
if @isdefined(MLDataDevices)
if @isdefined(CUDA) && MLDataDevices.functional(CUDADevice)
println()
CUDA.versioninfo()
end
if @isdefined(AMDGPU) && MLDataDevices.functional(AMDGPUDevice)
println()
AMDGPU.versioninfo()
end
end
Julia Version 1.11.4
Commit 8561cc3d68d (2025-03-10 11:36 UTC)
Build Info:
Official https://julialang.org/ release
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 48 × AMD EPYC 7402 24-Core Processor
WORD_SIZE: 64
LLVM: libLLVM-16.0.6 (ORCJIT, znver2)
Threads: 48 default, 0 interactive, 24 GC (on 2 virtual cores)
Environment:
JULIA_CPU_THREADS = 2
JULIA_DEPOT_PATH = /root/.cache/julia-buildkite-plugin/depots/01872db4-8c79-43af-ab7d-12abac4f24f6
LD_LIBRARY_PATH = /usr/local/nvidia/lib:/usr/local/nvidia/lib64
JULIA_PKG_SERVER =
JULIA_NUM_THREADS = 48
JULIA_CUDA_HARD_MEMORY_LIMIT = 100%
JULIA_PKG_PRECOMPILE_AUTO = 0
JULIA_DEBUG = Literate
This page was generated using Literate.jl.