module dataManager

using MAT
using Base.Iterators: repeated, partition
using Statistics
using Flux.Data.MNIST
using Flux:onehotbatch

# Dimension of the coordinates (labels): (x, y).
# Labels are shifted by `lbls_offset` and divided by `lbls_dims` in `make_batch`.
# Both are left as plain (non-const) globals so that code rebinding them keeps working.
lbls_dims = (1080, 980)
lbls_offset = (0, 699)

"""
    make_minibatch(X, Y, idxset)

Bundle the samples selected by `idxset` into one `Float32` mini-batch.

# Arguments
- `X`: data indexed as `width x height x channels x setsize`.
- `Y`: labels indexed as `rows x setsize` (`make_batch` passes 2 rows: x and y).
- `idxset`: iterable of sample indices (along the last dimension of `X` and `Y`).

# Returns
A tuple `(X_batch, Y_batch)` where `X_batch` has size
`width x height x channels x length(idxset)` and `Y_batch` has size
`rows x length(idxset)`; both are freshly allocated `Float32` arrays.
"""
function make_minibatch(X, Y, idxset)
    n = length(idxset)
    # Size the batch from the input instead of assuming 1 channel / 2 label rows.
    X_batch = Array{Float32}(undef, size(X, 1), size(X, 2), size(X, 3), n)
    Y_batch = Array{Float32}(undef, size(Y, 1), n)
    for (k, idx) in enumerate(idxset)
        # Views avoid a temporary copy per sample; `.=` converts to Float32.
        X_batch[:, :, :, k] .= view(X, :, :, :, idx)
        Y_batch[:, k] .= view(Y, :, idx)
    end
    return (X_batch, Y_batch)
end

# Read the "data" and "labels" variables of one .mat file; the handle is closed
# even when reading throws.
function _read_matfile(file)
    return matopen(file) do matfile
        (read(matfile, "data"), read(matfile, "labels"))
    end
end

"""
    make_batch(filepath, filenames...; batch_size=100, normalize_data=true, truncate_data=false)

Load the given .mat files and split their content into mini-batches.

Each file name is appended to `filepath` by plain string concatenation, so
`filepath` must end with a path separator when it names a directory.

Structure expected in every .mat file:

    fieldname | size
    ----------------------------------
    data      | window x channels x N
    labels    | 2 x N   (1: x, 2: y)

where `N` is the number of samples. (Earlier notes for this function mention
6 channels and a window size of both 50 and 60 - confirm against the data files.)

# Keywords
- `batch_size`: number of samples per batch; `-1` puts the whole data set into a
  single batch. The last batch may be smaller.
- `normalize_data`: if `true`, the data is normalized with `normalize!`.
- `truncate_data`: forwarded to `normalize!` as its `truncate` argument.

# Returns
A vector of `(X_batch, Y_batch)` tuples as produced by `make_minibatch`, where
`X_batch` is `window x channels x 1 x batchsize` and `Y_batch` is `2 x batchsize`.
Labels are rescaled as `(labels .- lbls_offset) ./ lbls_dims`.

# Throws
- `ArgumentError` if no file name is given.
"""
function make_batch(filepath, filenames...; batch_size=100, normalize_data=true, truncate_data=false)
    isempty(filenames) && throw(ArgumentError("make_batch needs at least one file name"))

    parts = map(enumerate(filenames)) do (i, filename)
        file = "$filepath$filename"
        @debug("Reading $(i) of $(length(filenames)) from $(file)")
        _read_matfile(file)
    end

    # Concatenate once: samples run along dim 3 of the data and dim 2 of the labels.
    data = cat(first.(parts)...; dims=3)
    labels = cat(last.(parts)...; dims=2)

    # normalize the labels
    labels = (labels .- lbls_offset) ./ lbls_dims

    # Add a singleton dimension and move the sample axis last so the layout matches
    # the Flux convention width x height x channels x setsize, here (W, C, 1, N).
    data = permutedims(cat(data; dims=4), (1, 2, 4, 3))

    @debug("Dimension of data $(size(data))")
    @debug("Dimension of binary targets $(size(labels))")

    if normalize_data
        normalize!(data, truncate_data)
    end

    # Convert to Float32 (normalization above is done in the loaded precision).
    labels = convert(Array{Float32}, labels)
    data = convert(Array{Float32}, data)

    if batch_size == -1
        batch_size = size(data, 4)
    end

    idxsets = partition(1:size(data, 4), batch_size)
    data_set = [make_minibatch(data, labels, idxset) for idxset in idxsets]

    return data_set
end # function make_batch

"""
    normalize!(data, truncate)

Normalize `data` in place to zero mean and unit standard deviation, where the
statistics are computed along the fourth (sample) dimension, i.e. separately
for every `width x height x channel` position.

`data` should have the standard Flux order: width x height x channels x setsize.

Positions whose standard deviation is zero or not finite (for example when there
is only a single sample) are only centered, not scaled.

If `truncate` is `true`, normalized values beyond 2.576 sigma (roughly the outer
1 % of a normal distribution) are clipped to +/- 2.576.

# Returns
The tuple `(mean_data, std_data)` computed before normalization; `std_data` is
returned unmodified (it may contain zeros or NaN).
"""
function normalize!(data, truncate)
    mean_data = mean(data, dims=4)
    std_data = std(data, mean=mean_data, dims=4)

    @debug("normalize dataset")
    # Guard the division: a zero or NaN/Inf std would otherwise poison the data.
    safe_std = map(s -> (iszero(s) || !isfinite(s)) ? one(s) : s, std_data)
    data .= (data .- mean_data) ./ safe_std

    if truncate
        # truncate the last 1% beyond 2.576 sigma
        limit = 2.576
        clamp!(data, -limit, limit)
    end

    return (mean_data, std_data)
end

end # module dataManager