Training a HyperNetwork on MNIST and FashionMNIST

Package Imports

julia

using Lux, ADTypes, ComponentArrays, LuxCUDA, MLDatasets, MLUtils, OneHotArrays, Optimisers,
      Printf, Random, Setfield, Statistics, Zygote

# Turn off scalar indexing of GPU arrays — presumably so that accidental element-by-element
# access raises an error instead of silently running slowly (see the CUDA.jl docs).
CUDA.allowscalar(false)

Loading Datasets

julia

"""
    load_dataset(dset, n_train, n_eval, batchsize)

Build a `(train_loader, test_loader)` pair of `DataLoader`s for the dataset type `dset`.

The first `n_train` observations of the `:train` split and the first `n_eval` observations
of the `:test` split are used. Images are reshaped to `28 × 28 × 1 × N` and labels are
one-hot encoded over the classes `0:9`. Only the training loader shuffles; each loader's
batch size is capped at the number of observations it holds.
"""
function load_dataset(::Type{dset}, n_train::Int, n_eval::Int, batchsize::Int) where {dset}
    train_images, train_labels = dset(:train)[1:n_train]
    x_train = reshape(train_images, 28, 28, 1, n_train)
    y_train = onehotbatch(train_labels, 0:9)

    test_images, test_labels = dset(:test)[1:n_eval]
    x_test = reshape(test_images, 28, 28, 1, n_eval)
    y_test = onehotbatch(test_labels, 0:9)

    train_loader = DataLoader(
        (x_train, y_train); batchsize=min(batchsize, n_train), shuffle=true)
    test_loader = DataLoader(
        (x_test, y_test); batchsize=min(batchsize, n_eval), shuffle=false)
    return (train_loader, test_loader)
end

"""
    load_datasets(n_train=1024, n_eval=32, batchsize=256)

Call `load_dataset` for `MNIST` and `FashionMNIST` (in that order) and return the two
results as a tuple.
"""
function load_datasets(n_train=1024, n_eval=32, batchsize=256)
    dataset_types = (MNIST, FashionMNIST)
    return broadcast(load_dataset, dataset_types, n_train, n_eval, batchsize)
end

load_datasets (generic function with 4 methods)

Implement a HyperNet Layer

julia

"""
    HyperNet(weight_generator, core_network)

Build a layer in which `core_network` runs with parameters produced by `weight_generator`.

The layer takes a tuple `(x, y)`: the output of `weight_generator(x)` is flattened and
given the axes of `core_network`'s parameter `ComponentArray`, and `core_network` is then
applied to `y` with those generated parameters.
"""
function HyperNet(
        weight_generator::Lux.AbstractLuxLayer, core_network::Lux.AbstractLuxLayer)
    # Only the axes (the parameter layout) of the core network are kept; the parameter
    # values initialized here are discarded.
    core_ps = Lux.initialparameters(Random.default_rng(), core_network)
    ca_axes = getaxes(ComponentArray(core_ps))
    return @compact(; ca_axes, weight_generator, core_network, dispatch=:HyperNet) do (x, y)
        # Flatten the generator output and give it the core network's parameter layout
        generated_ps = ComponentArray(vec(weight_generator(x)), ca_axes)
        @return core_network(y, generated_ps)
    end
end

HyperNet (generic function with 1 method)

Defining functions on the CompactLuxLayer requires some understanding of how the layer is structured; as such, we don't recommend doing it unless you are familiar with the internals. In this case, we simply overload Lux.initialparameters so that it skips the initialization of the core_network parameters.

julia

# Parameter initialization for the `:HyperNet` compact layer: only the weight generator
# gets parameters. The core network's parameters are not initialized here because they are
# produced by the generator in the forward pass.
function Lux.initialparameters(rng::AbstractRNG, hn::CompactLuxLayer{:HyperNet})
    generator_ps = Lux.initialparameters(rng, hn.layers.weight_generator)
    return (; weight_generator=generator_ps)
end

Create and Initialize the HyperNet

julia

"""
    create_model()

Construct the `HyperNet` used in this example: an MLP core network for 28×28 inputs with
10 outputs, plus a weight generator whose final layer emits one value per core-network
parameter.
"""
function create_model()
    # The core network is an MLP here, but any Lux layer would work.
    core_network = Chain(FlattenLayer(), Dense(784, 256, relu), Dense(256, 10))

    # The generator embeds one of two indices and expands the embedding into a flat
    # vector holding every parameter of the core network.
    n_core_params = Lux.parameterlength(core_network)
    weight_generator = Chain(
        Embedding(2 => 32), Dense(32, 64, relu), Dense(64, n_core_params))

    return HyperNet(weight_generator, core_network)
end

create_model (generic function with 1 method)

Define Utility Functions

julia

# Loss used for every training step. `logits=Val(true)` is passed because the core network
# ends in a plain `Dense(256, 10)` with no softmax — presumably this makes the loss treat
# the model output as unnormalized logits (confirm against the Lux `CrossEntropyLoss` docs).
const loss = CrossEntropyLoss(; logits=Val(true))

"""
    accuracy(model, ps, st, dataloader, data_idx)

Return the fraction of observations in `dataloader` for which the class predicted by
`model` matches the target class.

The model is called on `(data_idx, x)` with the states switched to test mode. Both the
targets and the model outputs are decoded with `onecold` before being compared.
"""
function accuracy(model, ps, st, dataloader, data_idx)
    eval_st = Lux.testmode(st)
    n_correct = 0
    n_seen = 0
    for (x, y) in dataloader
        truth = onecold(y)
        # The model returns `(output, state)`; only the output is needed here.
        output = first(model((data_idx, x), ps, eval_st))
        guess = onecold(output)
        n_correct += sum(truth .== guess)
        n_seen += length(truth)
    end
    return n_correct / n_seen
end

accuracy (generic function with 1 method)

Training

julia

# Accuracy of `model` on `dataloader` as a percentage rounded to two decimal places,
# evaluated with the parameters and states currently held in `train_state`.
function _accuracy_percent(model, train_state, dataloader, data_idx)
    acc = accuracy(
        model, train_state.parameters, train_state.states, dataloader, data_idx)
    return round(acc * 100; digits=2)
end

"""
    train()

Train the HyperNet on both datasets and return the final test accuracies.

For each of 25 epochs the model is trained on every dataset in turn, with the dataset
index passed to the model alongside the images. After each pass the training time and the
train/test accuracies are printed. Once training has finished, both datasets are evaluated
again and a `[FINAL]` line is printed for each.

# Returns
A `Vector{Float64}` holding the final test accuracy (in percent, rounded to two decimal
places) of each dataset, in the order returned by `load_datasets`.
"""
function train()
    model = create_model()
    dataloaders = load_datasets()
    # Display names, in the same order as the loaders returned by `load_datasets`.
    dataset_names = ("MNIST", "FashionMNIST")

    dev = gpu_device()
    rng = Xoshiro(0)
    ps, st = Lux.setup(rng, model) |> dev

    train_state = Training.TrainState(model, ps, st, Adam(3.0f-4))

    # Wrap every loader for the device once, instead of on each epoch/dataset iteration.
    device_loaders = map(loaders -> loaders .|> dev, dataloaders)

    ### Let's train the model
    nepochs = 25
    for epoch in 1:nepochs, data_idx in eachindex(device_loaders)
        train_dataloader, test_dataloader = device_loaders[data_idx]

        # Only the optimization steps are timed; evaluation below is excluded.
        stime = time()
        for (x, y) in train_dataloader
            (_, _, _, train_state) = Training.single_train_step!(
                AutoZygote(), loss, ((data_idx, x), y), train_state)
        end
        ttime = time() - stime

        train_acc = _accuracy_percent(model, train_state, train_dataloader, data_idx)
        test_acc = _accuracy_percent(model, train_state, test_dataloader, data_idx)

        data_name = dataset_names[data_idx]

        @printf "[%3d/%3d] \t %12s \t Time %.5fs \t Training Accuracy: %.2f%% \t Test \
                 Accuracy: %.2f%%\n" epoch nepochs data_name ttime train_acc test_acc
    end

    println()

    test_acc_list = zeros(length(device_loaders))
    for data_idx in eachindex(device_loaders)
        train_dataloader, test_dataloader = device_loaders[data_idx]
        train_acc = _accuracy_percent(model, train_state, train_dataloader, data_idx)
        test_acc = _accuracy_percent(model, train_state, test_dataloader, data_idx)

        data_name = dataset_names[data_idx]

        @printf "[FINAL] \t %12s \t Training Accuracy: %.2f%% \t Test Accuracy: \
                 %.2f%%\n" data_name train_acc test_acc
        test_acc_list[data_idx] = test_acc
    end
    return test_acc_list
end

# Run the whole training loop; `train` returns the final test accuracies (in percent) for
# MNIST and FashionMNIST, in that order.
test_acc_list = train()

[  1/ 25] 	        MNIST 	 Time 73.46126s 	 Training Accuracy: 25.98% 	 Test Accuracy: 25.00%
[  1/ 25] 	 FashionMNIST 	 Time 0.03004s 	 Training Accuracy: 31.05% 	 Test Accuracy: 28.12%
[  2/ 25] 	        MNIST 	 Time 0.02960s 	 Training Accuracy: 52.25% 	 Test Accuracy: 37.50%
[  2/ 25] 	 FashionMNIST 	 Time 0.02894s 	 Training Accuracy: 55.66% 	 Test Accuracy: 46.88%
[  3/ 25] 	        MNIST 	 Time 0.02974s 	 Training Accuracy: 61.82% 	 Test Accuracy: 56.25%
[  3/ 25] 	 FashionMNIST 	 Time 0.02865s 	 Training Accuracy: 62.30% 	 Test Accuracy: 56.25%
[  4/ 25] 	        MNIST 	 Time 0.02896s 	 Training Accuracy: 68.07% 	 Test Accuracy: 50.00%
[  4/ 25] 	 FashionMNIST 	 Time 0.02530s 	 Training Accuracy: 67.48% 	 Test Accuracy: 53.12%
[  5/ 25] 	        MNIST 	 Time 0.02270s 	 Training Accuracy: 75.10% 	 Test Accuracy: 56.25%
[  5/ 25] 	 FashionMNIST 	 Time 0.05660s 	 Training Accuracy: 73.24% 	 Test Accuracy: 56.25%
[  6/ 25] 	        MNIST 	 Time 0.02165s 	 Training Accuracy: 82.13% 	 Test Accuracy: 65.62%
[  6/ 25] 	 FashionMNIST 	 Time 0.02176s 	 Training Accuracy: 73.54% 	 Test Accuracy: 68.75%
[  7/ 25] 	        MNIST 	 Time 0.02139s 	 Training Accuracy: 83.01% 	 Test Accuracy: 68.75%
[  7/ 25] 	 FashionMNIST 	 Time 0.02163s 	 Training Accuracy: 78.32% 	 Test Accuracy: 65.62%
[  8/ 25] 	        MNIST 	 Time 0.02226s 	 Training Accuracy: 86.23% 	 Test Accuracy: 71.88%
[  8/ 25] 	 FashionMNIST 	 Time 0.02188s 	 Training Accuracy: 79.30% 	 Test Accuracy: 65.62%
[  9/ 25] 	        MNIST 	 Time 0.02153s 	 Training Accuracy: 89.16% 	 Test Accuracy: 65.62%
[  9/ 25] 	 FashionMNIST 	 Time 0.02143s 	 Training Accuracy: 79.79% 	 Test Accuracy: 68.75%
[ 10/ 25] 	        MNIST 	 Time 0.03653s 	 Training Accuracy: 91.70% 	 Test Accuracy: 71.88%
[ 10/ 25] 	 FashionMNIST 	 Time 0.02218s 	 Training Accuracy: 81.25% 	 Test Accuracy: 62.50%
[ 11/ 25] 	        MNIST 	 Time 0.02127s 	 Training Accuracy: 92.97% 	 Test Accuracy: 65.62%
[ 11/ 25] 	 FashionMNIST 	 Time 0.02109s 	 Training Accuracy: 81.45% 	 Test Accuracy: 71.88%
[ 12/ 25] 	        MNIST 	 Time 0.02109s 	 Training Accuracy: 93.85% 	 Test Accuracy: 65.62%
[ 12/ 25] 	 FashionMNIST 	 Time 0.02139s 	 Training Accuracy: 86.62% 	 Test Accuracy: 71.88%
[ 13/ 25] 	        MNIST 	 Time 0.02078s 	 Training Accuracy: 96.00% 	 Test Accuracy: 62.50%
[ 13/ 25] 	 FashionMNIST 	 Time 0.02164s 	 Training Accuracy: 86.72% 	 Test Accuracy: 65.62%
[ 14/ 25] 	        MNIST 	 Time 0.02141s 	 Training Accuracy: 97.27% 	 Test Accuracy: 65.62%
[ 14/ 25] 	 FashionMNIST 	 Time 0.02868s 	 Training Accuracy: 88.77% 	 Test Accuracy: 65.62%
[ 15/ 25] 	        MNIST 	 Time 0.02187s 	 Training Accuracy: 97.46% 	 Test Accuracy: 65.62%
[ 15/ 25] 	 FashionMNIST 	 Time 0.02208s 	 Training Accuracy: 88.96% 	 Test Accuracy: 71.88%
[ 16/ 25] 	        MNIST 	 Time 0.02120s 	 Training Accuracy: 97.85% 	 Test Accuracy: 65.62%
[ 16/ 25] 	 FashionMNIST 	 Time 0.02080s 	 Training Accuracy: 87.30% 	 Test Accuracy: 59.38%
[ 17/ 25] 	        MNIST 	 Time 0.02089s 	 Training Accuracy: 97.17% 	 Test Accuracy: 65.62%
[ 17/ 25] 	 FashionMNIST 	 Time 0.02192s 	 Training Accuracy: 85.06% 	 Test Accuracy: 62.50%
[ 18/ 25] 	        MNIST 	 Time 0.02144s 	 Training Accuracy: 98.44% 	 Test Accuracy: 65.62%
[ 18/ 25] 	 FashionMNIST 	 Time 0.02161s 	 Training Accuracy: 87.99% 	 Test Accuracy: 68.75%
[ 19/ 25] 	        MNIST 	 Time 0.02190s 	 Training Accuracy: 98.44% 	 Test Accuracy: 68.75%
[ 19/ 25] 	 FashionMNIST 	 Time 0.02246s 	 Training Accuracy: 90.43% 	 Test Accuracy: 62.50%
[ 20/ 25] 	        MNIST 	 Time 0.02190s 	 Training Accuracy: 98.73% 	 Test Accuracy: 65.62%
[ 20/ 25] 	 FashionMNIST 	 Time 0.02253s 	 Training Accuracy: 90.62% 	 Test Accuracy: 65.62%
[ 21/ 25] 	        MNIST 	 Time 0.02302s 	 Training Accuracy: 99.80% 	 Test Accuracy: 68.75%
[ 21/ 25] 	 FashionMNIST 	 Time 0.02214s 	 Training Accuracy: 90.72% 	 Test Accuracy: 65.62%
[ 22/ 25] 	        MNIST 	 Time 0.02141s 	 Training Accuracy: 99.80% 	 Test Accuracy: 65.62%
[ 22/ 25] 	 FashionMNIST 	 Time 0.02235s 	 Training Accuracy: 92.09% 	 Test Accuracy: 62.50%
[ 23/ 25] 	        MNIST 	 Time 0.02981s 	 Training Accuracy: 100.00% 	 Test Accuracy: 68.75%
[ 23/ 25] 	 FashionMNIST 	 Time 0.02152s 	 Training Accuracy: 90.14% 	 Test Accuracy: 62.50%
[ 24/ 25] 	        MNIST 	 Time 0.02182s 	 Training Accuracy: 99.90% 	 Test Accuracy: 65.62%
[ 24/ 25] 	 FashionMNIST 	 Time 0.02109s 	 Training Accuracy: 90.62% 	 Test Accuracy: 59.38%
[ 25/ 25] 	        MNIST 	 Time 0.02092s 	 Training Accuracy: 100.00% 	 Test Accuracy: 68.75%
[ 25/ 25] 	 FashionMNIST 	 Time 0.02086s 	 Training Accuracy: 92.19% 	 Test Accuracy: 62.50%

[FINAL] 	        MNIST 	 Training Accuracy: 100.00% 	 Test Accuracy: 68.75%
[FINAL] 	 FashionMNIST 	 Training Accuracy: 92.19% 	 Test Accuracy: 62.50%

Appendix

julia

using InteractiveUtils
InteractiveUtils.versioninfo()

# Backend details are printed only when MLDataDevices and the backend package are both
# defined in this scope and MLDataDevices reports the backend as functional.
if @isdefined(MLDataDevices) && @isdefined(CUDA) && MLDataDevices.functional(CUDADevice)
    println()
    CUDA.versioninfo()
end

if @isdefined(MLDataDevices) && @isdefined(AMDGPU) &&
   MLDataDevices.functional(AMDGPUDevice)
    println()
    AMDGPU.versioninfo()
end

Julia Version 1.10.5
Commit 6f3fdf7b362 (2024-08-27 14:19 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 48 × AMD EPYC 7402 24-Core Processor
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-15.0.7 (ORCJIT, znver2)
Threads: 48 default, 0 interactive, 24 GC (on 2 virtual cores)
Environment:
  JULIA_CPU_THREADS = 2
  JULIA_DEPOT_PATH = /root/.cache/julia-buildkite-plugin/depots/01872db4-8c79-43af-ab7d-12abac4f24f6
  LD_LIBRARY_PATH = /usr/local/nvidia/lib:/usr/local/nvidia/lib64
  JULIA_PKG_SERVER = 
  JULIA_NUM_THREADS = 48
  JULIA_CUDA_HARD_MEMORY_LIMIT = 100%
  JULIA_PKG_PRECOMPILE_AUTO = 0
  JULIA_DEBUG = Literate

CUDA runtime 12.5, artifact installation
CUDA driver 12.5
NVIDIA driver 555.42.6

CUDA libraries: 
- CUBLAS: 12.5.3
- CURAND: 10.3.6
- CUFFT: 11.2.3
- CUSOLVER: 11.6.3
- CUSPARSE: 12.5.1
- CUPTI: 2024.2.1 (API 23.0.0)
- NVML: 12.0.0+555.42.6

Julia packages: 
- CUDA: 5.4.3
- CUDA_Driver_jll: 0.9.2+0
- CUDA_Runtime_jll: 0.14.1+0

Toolchain:
- Julia: 1.10.5
- LLVM: 15.0.7

Environment:
- JULIA_CUDA_HARD_MEMORY_LIMIT: 100%

1 device:
  0: NVIDIA A100-PCIE-40GB MIG 1g.5gb (sm_80, 2.170 GiB / 4.750 GiB available)

This page was generated using Literate.jl.

Trusted by

Training a HyperNetwork on MNIST and FashionMNIST ​

Package Imports ​

Loading Datasets ​

Implement a HyperNet Layer ​

Create and Initialize the HyperNet ​

Define Utility Functions ​

Training ​

Appendix ​

Training a HyperNetwork on MNIST and FashionMNIST

Package Imports

Loading Datasets

Implement a HyperNet Layer

Create and Initialize the HyperNet

Define Utility Functions

Training

Appendix