Convolutional VAE for MNIST using Reactant
Convolutional variational autoencoder (CVAE) implementation in Lux using MNIST. This is based on the CVAE implementation in MLX.
julia
using Lux,
Reactant,
MLDatasets,
Random,
Statistics,
Enzyme,
MLUtils,
DataAugmentation,
ConcreteStructs,
OneHotArrays,
ImageShow,
Images,
Printf,
Optimisers
# Device handles used throughout the tutorial: `xdev` is the Reactant device (requested
# with `force=true`) and `cdev` is the CPU device.
const xdev = reactant_device(; force=true)
const cdev = cpu_device()
# `true` when `VSCodeServer` is defined in `Main`; `main` uses this flag to decide whether
# to build and display the sample grids after every epoch instead of only the last one.
const IN_VSCODE = isdefined(Main, :VSCodeServer)
false
Model Definition
First we will define the encoder. It maps the input to a normal distribution in latent space and samples a latent vector from that distribution.
julia
"""
    cvae_encoder([rng]; num_latent_dims, image_shape, max_num_filters)

Build the encoder half of the CVAE as a `@compact` layer.

The input passes through three stride-2 3×3 convolution stages (each followed by a
`BatchNorm` with `leakyrelu`), is flattened, and is then projected by two `Dense` heads
into the mean `μ` and the log-variance `logσ²` of the latent distribution. The
log-variance is clamped to `[-20, 10]` before it is exponentiated.

# Keywords
- `num_latent_dims`: size of the latent vector.
- `image_shape`: `(height, width, channels)` of the input images.
- `max_num_filters`: channel count of the last convolution stage; the earlier stages use
  a quarter and a half of it.

# Returns
A layer whose forward pass returns `(z, μ, logσ²)`, where `z = ϵ .* σ .+ μ` is a sample
drawn with noise `ϵ` generated from a replica of `rng`.
"""
function cvae_encoder(
    rng=Random.default_rng();
    num_latent_dims::Int,
    image_shape::Dims{3},
    max_num_filters::Int,
)
    # One down-sampling stage: stride-2 convolution followed by batch normalisation.
    down_block(cin, cout) = Chain(
        Conv((3, 3), cin => cout; stride=2, pad=1), BatchNorm(cout, leakyrelu)
    )

    filters_quarter = max_num_filters ÷ 4
    filters_half = max_num_filters ÷ 2
    flattened_dim = prod(image_shape[1:2] .÷ 8) * max_num_filters

    return @compact(;
        embed=Chain(
            down_block(image_shape[3], filters_quarter),
            down_block(filters_quarter, filters_half),
            down_block(filters_half, max_num_filters),
            FlattenLayer(),
        ),
        proj_mu=Dense(flattened_dim, num_latent_dims; init_bias=zeros32),
        proj_log_var=Dense(flattened_dim, num_latent_dims; init_bias=zeros32),
        rng
    ) do x
        features = embed(x)
        μ = proj_mu(features)
        raw_logσ² = proj_log_var(features)

        # Keep the log-variance in a numerically safe range before exponentiating.
        T = eltype(raw_logσ²)
        logσ² = clamp.(raw_logσ², -T(20.0f0), T(10.0f0))
        σ = exp.(logσ² .* T(0.5))

        # Draw standard-normal noise with the same shape as σ.
        ϵ = randn_like(Lux.replicate(rng), σ)

        # Reparameterization trick to backpropagate through sampling.
        z = ϵ .* σ .+ μ
        @return z, μ, logσ²
    end
end
cvae_encoder (generic function with 2 methods)
Similarly we define the decoder.
julia
"""
    cvae_decoder(; num_latent_dims, image_shape, max_num_filters)

Build the decoder half of the CVAE as a `@compact` layer.

A `Dense` layer expands the latent vector to `prod(image_shape[1:2] .÷ 8) *
max_num_filters` features, which are reshaped to
`(image_shape[1] ÷ 8, image_shape[2] ÷ 8, max_num_filters, :)`. Three `Upsample(2)` +
3×3 convolution stages follow; the first two use `BatchNorm` with `leakyrelu`, the last
one applies `sigmoid` and outputs `image_shape[3]` channels.

# Keywords
- `num_latent_dims`: size of the latent vector fed to the decoder.
- `image_shape`: `(height, width, channels)` of the images to produce.
- `max_num_filters`: channel count of the reshaped feature map.
"""
function cvae_decoder(; num_latent_dims::Int, image_shape::Dims{3}, max_num_filters::Int)
    # One up-sampling stage: upsample, stride-1 convolution, batch normalisation.
    up_block(cin, cout) = Chain(
        Upsample(2),
        Conv((3, 3), cin => cout; stride=1, pad=1),
        BatchNorm(cout, leakyrelu),
    )

    filters_half = max_num_filters ÷ 2
    filters_quarter = max_num_filters ÷ 4
    latent_hw = image_shape[1:2] .÷ 8
    flattened_dim = prod(latent_hw) * max_num_filters

    return @compact(;
        linear=Dense(num_latent_dims, flattened_dim),
        upchain=Chain(
            up_block(max_num_filters, filters_half),
            up_block(filters_half, filters_quarter),
            Chain(
                Upsample(2),
                Conv((3, 3), filters_quarter => image_shape[3], sigmoid; stride=1, pad=1),
            ),
        ),
        max_num_filters
    ) do x
        features = linear(x)
        # Restore the spatial layout expected by the convolutional stack.
        feature_map = reshape(features, latent_hw..., max_num_filters, :)
        @return upchain(feature_map)
    end
end
# Container layer that pairs an encoder with a decoder; parameters and states are stored
# under the `encoder` and `decoder` fields, in that order.
@concrete struct CVAE <: AbstractLuxContainerLayer{(:encoder, :decoder)}
    encoder <: AbstractLuxLayer
    decoder <: AbstractLuxLayer
end

"""
    CVAE([rng]; num_latent_dims, image_shape, max_num_filters)

Construct a convolutional VAE from `cvae_encoder` and `cvae_decoder`, passing the same
hyperparameters to both. `rng` is forwarded to the encoder only.
"""
function CVAE(
    rng=Random.default_rng();
    num_latent_dims::Int,
    image_shape::Dims{3},
    max_num_filters::Int,
)
    hyperparams = (; num_latent_dims, image_shape, max_num_filters)
    return CVAE(cvae_encoder(rng; hyperparams...), cvae_decoder(; hyperparams...))
end
"""
    (cvae::CVAE)(x, ps, st)

Run the full autoencoder: encode `x`, decode the sampled latent vector, and return
`((x_rec, μ, logσ²), st_new)` where `st_new` holds the updated encoder and decoder states.
"""
function (cvae::CVAE)(x, ps, st)
    encoded, st_enc = cvae.encoder(x, ps.encoder, st.encoder)
    z, μ, logσ² = encoded
    x_rec, st_dec = cvae.decoder(z, ps.decoder, st.decoder)
    st_new = (; encoder=st_enc, decoder=st_dec)
    return (x_rec, μ, logσ²), st_new
end
"""
    encode(cvae::CVAE, x, ps, st)

Run only the encoder and return `(z, st_new)`, where `z` is the sampled latent vector.
The decoder state is carried over from `st` unchanged.
"""
function encode(cvae::CVAE, x, ps, st)
    encoded, st_enc = cvae.encoder(x, ps.encoder, st.encoder)
    z = first(encoded)
    return z, (; encoder=st_enc, decoder=st.decoder)
end
"""
    decode(cvae::CVAE, z, ps, st)

Run only the decoder on the latent batch `z` and return `(x_rec, st_new)`.

The encoder state is carried over from `st` unchanged. The returned state keeps the
`(encoder, decoder)` field order used by the container, the forward pass and `encode`, so
it has the same layout as the state that was passed in.
"""
function decode(cvae::CVAE, z, ps, st)
    x_rec, st_dec = cvae.decoder(z, ps.decoder, st.decoder)
    # Field order matters for NamedTuple types: keep `encoder` first, like everywhere else.
    return x_rec, (; encoder=st.encoder, decoder=st_dec)
end
decode (generic function with 1 method)
Loading MNIST
julia
# Dataset wrapper that converts raw samples to images, applies `transform` to each of
# them, and reports `total_samples` as its length.
@concrete struct TensorDataset
    dataset
    transform
    total_samples::Int
end

# The length is the stored sample count, not necessarily that of the wrapped dataset.
function Base.length(ds::TensorDataset)
    return ds.total_samples
end

"""
    getindex(ds::TensorDataset, idxs)

Fetch the samples at `idxs`, transform each one with `ds.transform`, and stack the
resulting arrays along a new trailing dimension.
"""
function Base.getindex(ds::TensorDataset, idxs::Union{Vector{<:Integer},AbstractRange})
    raw_images = convert2image(ds.dataset, idxs)
    items = map(Image, eachslice(raw_images; dims=3))
    # Apply the transform, then unwrap the item down to its underlying array.
    to_tensor(item) = parent(itemdata(apply(ds.transform, item)))
    return stack(to_tensor, items)
end
"""
    loadmnist(batchsize, image_size; ci_samples=5000)

Create a shuffled training `DataLoader` over the MNIST training split.

Each image is resized with `ScaleKeepAspect(image_size)` and converted with
`ImageToTensor()`. Incomplete final batches are dropped (`partial=false`).

When the `CI` environment variable parses as `true`, the dataset length is limited to
`ci_samples` (capped at the real dataset length) to keep demonstration runs short. An
unset or unparsable `CI` value is treated as "not on CI".

# Arguments
- `batchsize`: number of samples per batch.
- `image_size`: target `(height, width)` passed to `ScaleKeepAspect`.

# Keywords
- `ci_samples`: dataset length used on CI (default `5000`).
"""
function loadmnist(batchsize, image_size::Dims{2}; ci_samples::Int=5000)
    train_dataset = MNIST(; split=:train)

    # `tryparse` returns `nothing` for values such as "yes" or "" instead of throwing.
    on_ci = something(tryparse(Bool, get(ENV, "CI", "false")), false)

    # Load MNIST: only `ci_samples` samples for demonstration purposes on CI.
    full_length = length(train_dataset)
    N = on_ci ? min(ci_samples, full_length) : full_length

    train_transform = ScaleKeepAspect(image_size) |> ImageToTensor()
    trainset = TensorDataset(train_dataset, train_transform, N)
    return DataLoader(trainset; batchsize, shuffle=true, partial=false)
end
loadmnist (generic function with 1 method)
Helper Functions
Generate an Image Grid from a list of images
julia
"""
    create_image_grid(imgs::AbstractArray, grid_rows, grid_cols)

Turn the first `grid_rows * grid_cols` images of a 4-D batch (channels on the third
dimension, samples on the fourth) into colour views and tile them into a single grid.

Single-channel images become `Gray` views; otherwise the channel dimension is moved to
the front and viewed as `RGB`. Every tile is transposed (`'`) before it is placed.
"""
function create_image_grid(imgs::AbstractArray, grid_rows::Int, grid_cols::Int)
    num_cells = grid_rows * grid_cols
    batch = imgs[:, :, :, 1:num_cells]

    # Convert one (H, W, C) slice into a transposed colour image.
    function as_color_image(img)
        single_channel = size(img, 3) == 1
        colored = if single_channel
            colorview(Gray, view(img, :, :, 1))
        else
            colorview(RGB, permutedims(img, (3, 1, 2)))
        end
        return colored'
    end

    tiles = map(as_color_image, eachslice(batch; dims=4))
    return create_image_grid(tiles, grid_rows, grid_cols)
end
"""
    create_image_grid(images::Vector, grid_rows, grid_cols)

Tile equally sized 2-D images into one `grid_rows × grid_cols` canvas, filling the grid
row by row (image 1 is top-left, image `grid_cols` is top-right).

# Arguments
- `images`: vector of 2-D arrays; all are assumed to have the size of the first one.
- `grid_rows`, `grid_cols`: positive grid dimensions whose product must equal
  `length(images)`.

# Returns
An array created with `similar(first(images), …)` of size
`(grid_rows * height, grid_cols * width)`.

# Throws
- `ArgumentError` if a grid dimension is not positive or the number of images does not
  match the grid.
"""
function create_image_grid(images::Vector, grid_rows::Int, grid_cols::Int)
    # Validate with real exceptions: `@assert` may be disabled and is not meant for inputs.
    (grid_rows > 0 && grid_cols > 0) || throw(
        ArgumentError("grid dimensions must be positive, got $(grid_rows) × $(grid_cols)")
    )
    total_images = grid_rows * grid_cols
    length(images) == total_images || throw(
        ArgumentError(
            "expected $(total_images) images for a $(grid_rows) × $(grid_cols) grid, " *
            "got $(length(images))",
        ),
    )

    # Size of a single tile (all images are assumed to share it).
    img_height, img_width = size(first(images))

    # Blank canvas; every cell is overwritten below, so `similar` is sufficient.
    grid_canvas = similar(first(images), img_height * grid_rows, img_width * grid_cols)

    for (idx, image) in enumerate(images)
        # 1-based (row, col) of this tile when filling row by row.
        row, col = fldmod1(idx, grid_cols)
        row_span = ((row - 1) * img_height + 1):(row * img_height)
        col_span = ((col - 1) * img_width + 1):(col * img_width)
        grid_canvas[row_span, col_span] .= image
    end
    return grid_canvas
end
"""
    loss_function(model, ps, st, X)

Compute the VAE training objective for the batch `X`.

The loss is the sum-aggregated mean-squared reconstruction error plus the KL-divergence
term `-sum(1 + logσ² - μ² - exp(logσ²)) / 2`.

# Returns
`(loss, st_new, stats)` where `stats` is a NamedTuple with the reconstruction `y`, `μ`,
`logσ²`, and the two individual loss terms.
"""
function loss_function(model, ps, st, X)
    outputs, st_new = model(X, ps, st)
    y, μ, logσ² = outputs

    sum_squared_error = MSELoss(; agg=sum)
    reconstruction_loss = sum_squared_error(y, X)

    kl_terms = @. 1 + logσ² - μ^2 - exp(logσ²)
    kldiv_loss = -sum(kl_terms) / 2

    total = reconstruction_loss + kldiv_loss
    return total, st_new, (; y, μ, logσ², reconstruction_loss, kldiv_loss)
end
"""
    generate_images(model, ps, st; num_samples=128, num_latent_dims, decode_compiled=nothing)

Sample `num_samples` latent vectors from a standard normal distribution, decode them in
test mode, and arrange the first `8 * (num_samples ÷ 8)` results in a grid with 8 rows.

# Keywords
- `num_samples`: number of latent vectors to draw; must be at least 8.
- `num_latent_dims`: size of each latent vector.
- `decode_compiled`: optional replacement for `decode` with the same call signature
  (for example a compiled version); `decode` is used when it is `nothing`.

# Throws
- `ArgumentError` if `num_samples < 8`, since the 8-row grid would have no columns.
"""
function generate_images(
    model, ps, st; num_samples::Int=128, num_latent_dims::Int, decode_compiled=nothing
)
    num_samples >= 8 || throw(
        ArgumentError("num_samples must be at least 8 to fill the grid, got $(num_samples)")
    )

    # Put the latent batch on the same device as the parameters and states.
    z = get_device((ps, st))(randn(Float32, num_latent_dims, num_samples))

    decode_fn = something(decode_compiled, decode)
    images, _ = decode_fn(model, z, ps, Lux.testmode(st))

    # Always bring the result to the CPU before tiling, as `reconstruct_images` does;
    # previously this happened only on the compiled path.
    images = cpu_device()(images)
    return create_image_grid(images, 8, num_samples ÷ 8)
end
"""
    reconstruct_images(model, ps, st, X)

Run `model` on the batch `X` in test mode, move the reconstruction to the CPU, and tile
it into a grid with 8 rows and `size(X, ndims(X)) ÷ 8` columns.
"""
function reconstruct_images(model, ps, st, X)
    outputs, _ = model(X, ps, Lux.testmode(st))
    recon = cpu_device()(first(outputs))
    batch_len = size(X, ndims(X))
    return create_image_grid(recon, 8, batch_len ÷ 8)
end
reconstruct_images (generic function with 1 method)
Training the Model
julia
# Train the CVAE on MNIST and return an image grid of reconstructions and generated samples.
#
# Keywords:
# - `batchsize`: samples per training batch; also the batch size the forward pass is compiled for.
# - `image_size`: (height, width) the MNIST images are resized to.
# - `num_latent_dims`: size of the latent vector.
# - `max_num_filters`: largest convolution channel count of the encoder/decoder.
# - `seed`: seed for the `Xoshiro` RNG used to build and initialise the model.
# - `epochs`: number of passes over the training data.
# - `weight_decay`, `learning_rate`: passed to `AdamW` as `lambda` and `eta`.
# - `num_samples`: number of images to generate; also the batch size `decode` is compiled for.
#
# Returns the last image grid that was built (reconstructions on top, a blank spacer, then
# generated samples), or `nothing` if the epoch loop never reaches the branch that builds it
# (e.g. `epochs < 1`).
function main(;
batchsize=128,
image_size=(64, 64),
num_latent_dims=8,
max_num_filters=64,
seed=0,
epochs=50,
weight_decay=1.0e-5,
learning_rate=1.0e-3,
num_samples=batchsize,
)
# Seeded RNG for model construction and parameter/state initialisation.
rng = Xoshiro()
Random.seed!(rng, seed)
# Single-channel model; parameters and states are moved to the Reactant device.
cvae = CVAE(rng; num_latent_dims, image_shape=(image_size..., 1), max_num_filters)
ps, st = xdev(Lux.setup(rng, cvae))
# Example latent batch used only to compile `decode` (test mode, HIGH precision for
# dot-general and convolution ops).
z = xdev(randn(Float32, num_latent_dims, num_samples))
decode_compiled = Reactant.with_config(;
dot_general_precision=PrecisionConfig.HIGH,
convolution_precision=PrecisionConfig.HIGH,
) do
@compile decode(cvae, z, ps, Lux.testmode(st))
end
# Example image batch used only to compile the full forward pass with the same settings.
x = xdev(randn(Float32, image_size..., 1, batchsize))
cvae_compiled = Reactant.with_config(;
dot_general_precision=PrecisionConfig.HIGH,
convolution_precision=PrecisionConfig.HIGH,
) do
@compile cvae(x, ps, Lux.testmode(st))
end
# Wrap the data loader with the device so the batches it yields are transferred to it.
train_dataloader = xdev(loadmnist(batchsize, image_size))
opt = AdamW(; eta=learning_rate, lambda=weight_decay)
train_state = Training.TrainState(cvae, ps, st, opt)
@printf "Total Trainable Parameters: %0.4f M\n" (Lux.parameterlength(ps) / 1.0e6)
# `empty_row` is the blank spacer between the two grids; it is created lazily on first use.
empty_row, model_img_full = nothing, nothing
for epoch in 1:epochs
loss_total = 0.0f0
total_samples = 0
start_time = time()
for (i, X) in enumerate(train_dataloader)
# One optimisation step with Enzyme gradients; the gradients themselves are not returned.
(_, loss, _, train_state) = Training.single_train_step!(
AutoEnzyme(), loss_function, X, train_state; return_gradients=Val(false)
)
loss_total += loss
total_samples += size(X, ndims(X))
# Log every 250 iterations and on the last iteration of the epoch.
if i % 250 == 0 || i == length(train_dataloader)
throughput = total_samples / (time() - start_time)
@printf "Epoch %d, Iter %d, Loss: %.7f, Throughput: %.6f im/s\n" epoch i loss throughput
end
end
total_time = time() - start_time
# Mean of the per-batch losses (each batch loss is itself a sum over the batch).
train_loss = loss_total / length(train_dataloader)
throughput = total_samples / total_time
@printf "Epoch %d, Train Loss: %.7f, Time: %.4fs, Throughput: %.6f im/s\n" epoch train_loss total_time throughput
# Build the sample grids after every epoch inside VS Code, otherwise only after the last one.
if IN_VSCODE || epoch == epochs
# Reconstruct the first batch of a fresh pass over the (shuffled) data loader.
recon_images = reconstruct_images(
cvae_compiled,
train_state.parameters,
train_state.states,
first(train_dataloader),
)
gen_images = generate_images(
cvae,
train_state.parameters,
train_state.states;
num_samples,
num_latent_dims,
decode_compiled,
)
if empty_row === nothing
empty_row = similar(gen_images, image_size[1], size(gen_images, 2))
fill!(empty_row, 0)
end
# NOTE(review): `vcat` needs both grids to have the same width, which appears to require
# `num_samples ÷ 8 == batchsize ÷ 8` — true for the default `num_samples=batchsize`; confirm
# before calling with a different `num_samples`.
model_img_full = vcat(recon_images, empty_row, gen_images)
IN_VSCODE && display(model_img_full)
end
end
return model_img_full
end
# Run training with the default hyperparameters; `img` holds the grid returned by `main`.
img = main()
2025-07-03 16:32:21.712096: I external/xla/xla/service/service.cc:153] XLA service 0x12d07bd0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-07-03 16:32:21.712139: I external/xla/xla/service/service.cc:161] StreamExecutor device (0): NVIDIA A100-PCIE-40GB MIG 1g.5gb, Compute Capability 8.0
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
I0000 00:00:1751560341.712813 322209 se_gpu_pjrt_client.cc:1370] Using BFC allocator.
I0000 00:00:1751560341.712863 322209 gpu_helpers.cc:136] XLA backend allocating 3825205248 bytes on device 0 for BFCAllocator.
I0000 00:00:1751560341.712900 322209 gpu_helpers.cc:177] XLA backend will use up to 1275068416 bytes on device 0 for CollectiveBFCAllocator.
I0000 00:00:1751560341.725234 322209 cuda_dnn.cc:471] Loaded cuDNN version 90800
┌ Warning: `training` is set to `Val{false}()` but is being used within an autodiff call (gradient, jacobian, etc...). This might lead to incorrect results. If you are using a `Lux.jl` model, set it to training mode using `LuxCore.trainmode`.
└ @ LuxLib.Utils /var/lib/buildkite-agent/builds/gpuci-4/julialang/lux-dot-jl/lib/LuxLib/src/utils.jl:344
Total Trainable Parameters: 0.1493 M
Epoch 1, Iter 39, Loss: 24620.8476562, Throughput: 47.416060 im/s
Epoch 1, Train Loss: 39480.7734375, Time: 105.6730s, Throughput: 47.240058 im/s
Epoch 2, Iter 39, Loss: 18118.3476562, Throughput: 1856.073289 im/s
Epoch 2, Train Loss: 20187.6425781, Time: 2.6898s, Throughput: 1855.884916 im/s
Epoch 3, Iter 39, Loss: 14817.0312500, Throughput: 1900.770120 im/s
Epoch 3, Train Loss: 16585.4355469, Time: 2.6268s, Throughput: 1900.377123 im/s
Epoch 4, Iter 39, Loss: 14749.6738281, Throughput: 1900.325035 im/s
Epoch 4, Train Loss: 15016.8193359, Time: 2.6273s, Throughput: 1900.068257 im/s
Epoch 5, Iter 39, Loss: 13114.7050781, Throughput: 1906.645393 im/s
Epoch 5, Train Loss: 14135.9853516, Time: 2.6186s, Throughput: 1906.390550 im/s
Epoch 6, Iter 39, Loss: 12859.2128906, Throughput: 1899.765352 im/s
Epoch 6, Train Loss: 13425.0644531, Time: 2.6280s, Throughput: 1899.540434 im/s
Epoch 7, Iter 39, Loss: 12559.5000000, Throughput: 1902.723364 im/s
Epoch 7, Train Loss: 13115.7812500, Time: 2.6240s, Throughput: 1902.447096 im/s
Epoch 8, Iter 39, Loss: 12654.2500000, Throughput: 1899.239937 im/s
Epoch 8, Train Loss: 12619.7500000, Time: 2.6287s, Throughput: 1899.015143 im/s
Epoch 9, Iter 39, Loss: 12070.7597656, Throughput: 1907.276198 im/s
Epoch 9, Train Loss: 12316.6494141, Time: 2.6177s, Throughput: 1907.044460 im/s
Epoch 10, Iter 39, Loss: 11965.8710938, Throughput: 1902.176611 im/s
Epoch 10, Train Loss: 11968.3408203, Time: 2.6248s, Throughput: 1901.847639 im/s
Epoch 11, Iter 39, Loss: 11404.3457031, Throughput: 1878.981103 im/s
Epoch 11, Train Loss: 11793.1591797, Time: 2.6571s, Throughput: 1878.760404 im/s
Epoch 12, Iter 39, Loss: 11897.5839844, Throughput: 1903.185315 im/s
Epoch 12, Train Loss: 11670.8574219, Time: 2.6232s, Throughput: 1903.033786 im/s
Epoch 13, Iter 39, Loss: 12168.1894531, Throughput: 1908.245796 im/s
Epoch 13, Train Loss: 11437.8330078, Time: 2.6163s, Throughput: 1908.046686 im/s
Epoch 14, Iter 39, Loss: 10289.2558594, Throughput: 1903.047450 im/s
Epoch 14, Train Loss: 11424.1689453, Time: 2.6235s, Throughput: 1902.813800 im/s
Epoch 15, Iter 39, Loss: 11661.3613281, Throughput: 1907.806243 im/s
Epoch 15, Train Loss: 11158.3378906, Time: 2.6171s, Throughput: 1907.464374 im/s
Epoch 16, Iter 39, Loss: 11387.5136719, Throughput: 1904.835365 im/s
Epoch 16, Train Loss: 11075.5146484, Time: 2.6210s, Throughput: 1904.621026 im/s
Epoch 17, Iter 39, Loss: 10879.1650391, Throughput: 1905.725292 im/s
Epoch 17, Train Loss: 10862.3486328, Time: 2.6199s, Throughput: 1905.425609 im/s
Epoch 18, Iter 39, Loss: 10023.6914062, Throughput: 1903.299151 im/s
Epoch 18, Train Loss: 10840.8730469, Time: 2.6232s, Throughput: 1902.998848 im/s
Epoch 19, Iter 39, Loss: 10473.0498047, Throughput: 1907.584631 im/s
Epoch 19, Train Loss: 10756.0000000, Time: 2.6173s, Throughput: 1907.344826 im/s
Epoch 20, Iter 39, Loss: 10758.8671875, Throughput: 1901.197287 im/s
Epoch 20, Train Loss: 10601.0537109, Time: 2.6260s, Throughput: 1900.959775 im/s
Epoch 21, Iter 39, Loss: 10598.9101562, Throughput: 1906.059251 im/s
Epoch 21, Train Loss: 10601.7275391, Time: 2.6193s, Throughput: 1905.824340 im/s
Epoch 22, Iter 39, Loss: 11254.6494141, Throughput: 1908.504963 im/s
Epoch 22, Train Loss: 10581.7216797, Time: 2.6161s, Throughput: 1908.220405 im/s
Epoch 23, Iter 39, Loss: 10032.0830078, Throughput: 1905.646027 im/s
Epoch 23, Train Loss: 10420.7597656, Time: 2.6200s, Throughput: 1905.357292 im/s
Epoch 24, Iter 39, Loss: 10212.0419922, Throughput: 1908.753064 im/s
Epoch 24, Train Loss: 10319.0234375, Time: 2.6156s, Throughput: 1908.538538 im/s
Epoch 25, Iter 39, Loss: 10086.4677734, Throughput: 1904.487283 im/s
Epoch 25, Train Loss: 10373.6523438, Time: 2.6216s, Throughput: 1904.165477 im/s
Epoch 26, Iter 39, Loss: 9747.2783203, Throughput: 1906.659283 im/s
Epoch 26, Train Loss: 10314.9550781, Time: 2.6185s, Throughput: 1906.413809 im/s
Epoch 27, Iter 39, Loss: 9836.6992188, Throughput: 1900.458884 im/s
Epoch 27, Train Loss: 10191.0917969, Time: 2.6270s, Throughput: 1900.236043 im/s
Epoch 28, Iter 39, Loss: 10363.9375000, Throughput: 1909.320319 im/s
Epoch 28, Train Loss: 10061.3710938, Time: 2.6148s, Throughput: 1909.111410 im/s
Epoch 29, Iter 39, Loss: 10047.7353516, Throughput: 1903.039667 im/s
Epoch 29, Train Loss: 10056.3242188, Time: 2.6235s, Throughput: 1902.807401 im/s
Epoch 30, Iter 39, Loss: 9712.5595703, Throughput: 1908.567591 im/s
Epoch 30, Train Loss: 9990.4179688, Time: 2.6158s, Throughput: 1908.377980 im/s
Epoch 31, Iter 39, Loss: 9655.0742188, Throughput: 1905.538326 im/s
Epoch 31, Train Loss: 10032.8496094, Time: 2.6200s, Throughput: 1905.357986 im/s
Epoch 32, Iter 39, Loss: 8734.4423828, Throughput: 1909.398497 im/s
Epoch 32, Train Loss: 9966.3300781, Time: 2.6148s, Throughput: 1909.126206 im/s
Epoch 33, Iter 39, Loss: 10012.9072266, Throughput: 1902.942292 im/s
Epoch 33, Train Loss: 9888.8388672, Time: 2.6236s, Throughput: 1902.730453 im/s
Epoch 34, Iter 39, Loss: 8890.3261719, Throughput: 1910.308729 im/s
Epoch 34, Train Loss: 9932.3115234, Time: 2.6135s, Throughput: 1910.101172 im/s
Epoch 35, Iter 39, Loss: 9535.2675781, Throughput: 1905.041779 im/s
Epoch 35, Train Loss: 9834.5000000, Time: 2.6207s, Throughput: 1904.820808 im/s
Epoch 36, Iter 39, Loss: 9366.0644531, Throughput: 1907.284363 im/s
Epoch 36, Train Loss: 9783.6777344, Time: 2.6177s, Throughput: 1907.016843 im/s
Epoch 37, Iter 39, Loss: 9803.2089844, Throughput: 1909.197579 im/s
Epoch 37, Train Loss: 9732.1406250, Time: 2.6150s, Throughput: 1908.971989 im/s
Epoch 38, Iter 39, Loss: 9764.2392578, Throughput: 1905.807513 im/s
Epoch 38, Train Loss: 9665.0625000, Time: 2.6196s, Throughput: 1905.600239 im/s
Epoch 39, Iter 39, Loss: 9317.4023438, Throughput: 1908.688162 im/s
Epoch 39, Train Loss: 9653.9892578, Time: 2.6157s, Throughput: 1908.477304 im/s
Epoch 40, Iter 39, Loss: 9995.7187500, Throughput: 1902.667343 im/s
Epoch 40, Train Loss: 9585.0097656, Time: 2.6241s, Throughput: 1902.354103 im/s
Epoch 41, Iter 39, Loss: 10247.4882812, Throughput: 1906.742279 im/s
Epoch 41, Train Loss: 9600.0087891, Time: 2.6184s, Throughput: 1906.537059 im/s
Epoch 42, Iter 39, Loss: 9416.2275391, Throughput: 1899.992393 im/s
Epoch 42, Train Loss: 9500.5791016, Time: 2.6277s, Throughput: 1899.766042 im/s
Epoch 43, Iter 39, Loss: 9467.0966797, Throughput: 1906.876339 im/s
Epoch 43, Train Loss: 9528.2744141, Time: 2.6181s, Throughput: 1906.731340 im/s
Epoch 44, Iter 39, Loss: 8925.8964844, Throughput: 1907.753573 im/s
Epoch 44, Train Loss: 9459.8730469, Time: 2.6169s, Throughput: 1907.584457 im/s
Epoch 45, Iter 39, Loss: 9469.2236328, Throughput: 1897.661650 im/s
Epoch 45, Train Loss: 9470.8759766, Time: 2.6310s, Throughput: 1897.360199 im/s
Epoch 46, Iter 39, Loss: 10006.0898438, Throughput: 1906.321990 im/s
Epoch 46, Train Loss: 9467.0830078, Time: 2.6189s, Throughput: 1906.140807 im/s
Epoch 47, Iter 39, Loss: 10224.7744141, Throughput: 1904.686345 im/s
Epoch 47, Train Loss: 9420.1416016, Time: 2.6211s, Throughput: 1904.511882 im/s
Epoch 48, Iter 39, Loss: 9173.8945312, Throughput: 1907.212786 im/s
Epoch 48, Train Loss: 9428.5917969, Time: 2.6178s, Throughput: 1906.965780 im/s
Epoch 49, Iter 39, Loss: 8963.1298828, Throughput: 1903.353652 im/s
Epoch 49, Train Loss: 9356.3339844, Time: 2.6231s, Throughput: 1903.112834 im/s
Epoch 50, Iter 39, Loss: 9135.7890625, Throughput: 1909.386657 im/s
Epoch 50, Train Loss: 9318.3906250, Time: 2.6147s, Throughput: 1909.189571 im/s
Appendix
julia
using InteractiveUtils
# Print the Julia version and platform information.
InteractiveUtils.versioninfo()
# If MLDataDevices is defined, also print the version info of each GPU backend (CUDA,
# AMDGPU) that is both defined in this session and reported functional.
if @isdefined(MLDataDevices)
if @isdefined(CUDA) && MLDataDevices.functional(CUDADevice)
println()
CUDA.versioninfo()
end
if @isdefined(AMDGPU) && MLDataDevices.functional(AMDGPUDevice)
println()
AMDGPU.versioninfo()
end
end
Julia Version 1.11.5
Commit 760b2e5b739 (2025-04-14 06:53 UTC)
Build Info:
Official https://julialang.org/ release
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 48 × AMD EPYC 7402 24-Core Processor
WORD_SIZE: 64
LLVM: libLLVM-16.0.6 (ORCJIT, znver2)
Threads: 48 default, 0 interactive, 24 GC (on 2 virtual cores)
Environment:
JULIA_CPU_THREADS = 2
LD_LIBRARY_PATH = /usr/local/nvidia/lib:/usr/local/nvidia/lib64
JULIA_PKG_SERVER =
JULIA_NUM_THREADS = 48
JULIA_CUDA_HARD_MEMORY_LIMIT = 100%
JULIA_PKG_PRECOMPILE_AUTO = 0
JULIA_DEBUG = Literate
JULIA_DEPOT_PATH = /root/.cache/julia-buildkite-plugin/depots/01872db4-8c79-43af-ab7d-12abac4f24f6
This page was generated using Literate.jl.