123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124 |
- module dataManager
- using MAT
- using Base.Iterators: repeated, partition
- using Statistics
- using Flux.Data.MNIST
- using Flux:onehotbatch
# Label (coordinate) geometry used by `make_batch` to map raw (x, y) labels
# into [0, 1]: label_norm = (label - lbls_offset) / lbls_dims, per coordinate.
# NOTE(review): offset (0, 699) suggests the y coordinate starts at 699 in the
# raw data — confirm against the recording setup.
# `const` so the module-level globals are type-stable when read in functions.
const lbls_dims = (1080, 980)
const lbls_offset = (0, 699)
- """
- make_minibatch(X, Y, idxset)
-
- loads and bundles training data and labels into batches
- X should be of size Width x Height x channels x batchsize
- Y should be of size 2 x batchsize
- """
- function make_minibatch(X, Y, idxset)
- X_batch = Array{Float32}(undef, size(X, 1), size(X, 2), 1, length(idxset))
- Y_batch = Array{Float32}(undef, 2, length(idxset))
- for i in 1:length(idxset)
-
- X_batch[:, :, :, i] = Float32.(X[:, :, :, idxset[i]])
- Y_batch[:, i] = Float32.(Y[:, idxset[i]])
- end
- return (X_batch, Y_batch)
- end
- """
- make_batch(filepath, filenames...; batch_size=100, normalize_data=true, truncate_data=false)
-
- Creates batches with size batch_size(default 100) from filenames at given filepath. Images will be normalized if normalize is set (default true).
- If batch_size equals -1 the batch size will be the size of the dataset
- Structure of the .mat file:
- fieldname | size
- ----------------
- data | 60 x 6 x N
- bin_targets | 2 x N (1: x, 2: y)
- where N denotes the number of samples, 50 is the window size and 6 are the number of channels
- """
- function make_batch(filepath, filenames...; batch_size=100, normalize_data=true, truncate_data=false)
- data = nothing # Array{Float64}(undef, 0)
- labels = nothing # Array{Float64}(undef, 0)
- for (i, filename) in enumerate(filenames)
- # load the data from the mat file
- file = "$filepath$filename"
- @debug("Reading $(i) of $(length(filenames)) from $(file)")
- matfile = matopen(file)
- # size(images) = (N, width, height, 1)
- dataPart = read(matfile, "data")
- # size(bin_targets) = (N, 10)
- labelsPart = read(matfile, "labels")
- close(matfile)
- if (isnothing(data)) data = dataPart; labels = labelsPart;
- else
- data = cat(dims=3, data, dataPart)
- labels = cat(dims=2, labels, labelsPart)
- end
- end
-
- # add singleton dimension and permute dims so it matches the convention of Flux width x height x channels x batchsize(Setsize)
- data = cat(dims=4, data)
-
- # normalize the labels
- labels = (labels .- lbls_offset) ./ lbls_dims
- # rearrange the data array
- # size(data) = (50, 6, 1, N)
- data = permutedims(data, (1, 2, 4, 3))
- @debug("Dimension of data $(size(data))")
- @debug("Dimension of binary targets $(size(labels))")
-
-
- if(normalize_data)
- normalize!(data, truncate_data)
- end
-
- # Convert to Float32
- labels = convert(Array{Float32}, labels)
- data = convert(Array{Float32}, data)
-
- # display one sample of the images depends on PyPlot!
- # matshow(dropdims(images[:,:,:,10], dims=3), cmap=PyPlot.cm.gray, vmin=0, vmax=255)
-
- if ( batch_size == -1 )
- batch_size = size(data, 4)
- end
- idxsets = partition(1:size(data, 4), batch_size)
- data_set = [make_minibatch(data, labels, i) for i in idxsets];
-
- return data_set
- end # function make_batch
- """
- normalize input images along the batch and channel dimension
- input should have standart flux order: Widht x height x channels x batchsize
- if truncate is set to true the last 1% beyond 2.576 sigma will be clipped to 2.576 sigma
- """
- function normalize!(data, truncate)
- mean_data = mean(data, dims=4)
- std_data = std(data, mean=mean_data, dims=4)
-
- setsize = size(data, 4)
-
- @debug("normalize dataset")
- std_data_tmp = copy(std_data)
- std_data_tmp[std_data_tmp .== 0] .= 1
- for i in 1:setsize
- data[:, :, :, i] = (data[:, :, :, i] - mean_data) ./ std_data_tmp
- end
- if(truncate)
- # truncate the last 1% beyond 2.576 sigma
- data[data .> 2.576] .= 2.576
- data[data .< -2.576] .= -2.576
- end
- return (mean_data, std_data)
- end
- end # module dataManager
|