Converting MNIST and Fashion-MNIST IDX format to NumPy

MNIST and the newer Fashion-MNIST are among the most well-known datasets for testing machine learning models. Although the original MNIST dataset is considered solved as an ML problem, it seems it will be with us for a long time.

These datasets are distributed in a binary format. There are 4 gzipped files, 2 for the training set and 2 for the testing set: the image files are in IDX3 format and the label files are in IDX1 format. Although the format is simple, it is non-standard and requires writing custom code.

For a professional project, I wrote the following two functions to get the contents of these files in NumPy format.


import struct
import gzip
import numpy as np

def mnist_images_idx_to_array(images_filename):
    """Load a gzipped IDX3 image file into a NumPy array.

    The file starts with four big-endian unsigned 32-bit integers (magic
    number, image count, row count, column count) followed by one unsigned
    byte per pixel.

    Args:
        images_filename: Path of the gzipped IDX3 file to read.

    Returns:
        A writable ``uint8`` array of shape ``(n_images, n_row, n_col)``.

    Raises:
        Exception: If the magic number is not ``0x00000803``.
        struct.error: If the file ends before the header is complete.
        ValueError: If the file holds fewer pixel bytes than the header
            announces.
    """
    # The context manager closes the file even when parsing fails.
    with gzip.open(images_filename, mode="rb") as images_f:
        magic = struct.unpack('>I', images_f.read(4))[0]
        if magic != 0x00000803:
            raise Exception(
                f"Format error: Need an IDX3 file: {images_filename}")
        n_images, n_row, n_col = struct.unpack('>III', images_f.read(12))
        n_bytes = n_images * n_row * n_col  # each pixel is 1 byte
        pixels = images_f.read(n_bytes)

    if len(pixels) != n_bytes:
        raise ValueError(
            f"Format error: expected {n_bytes} pixel bytes but found "
            f"{len(pixels)}: {images_filename}")

    # frombuffer reinterprets the raw bytes directly instead of creating one
    # Python int per pixel; copy() detaches the result from the read-only
    # bytes object so the returned array is writable.
    images_array = np.frombuffer(pixels, dtype=np.uint8)
    return images_array.reshape(n_images, n_row, n_col).copy()


def mnist_labels_idx_to_array(labels_filename):
    """Load a gzipped IDX1 label file into a NumPy array.

    The file starts with two big-endian unsigned 32-bit integers (magic
    number, label count) followed by one unsigned byte per label.

    Args:
        labels_filename: Path of the gzipped IDX1 file to read.

    Returns:
        A writable one-dimensional ``uint8`` array holding ``n_labels``
        values.

    Raises:
        Exception: If the magic number is not ``0x00000801``.
        struct.error: If the file ends before the header is complete.
        ValueError: If the file holds fewer label bytes than the header
            announces.
    """
    # The context manager closes the file even when parsing fails.
    with gzip.open(labels_filename, mode="rb") as labels_f:
        magic = struct.unpack('>I', labels_f.read(4))[0]
        if magic != 0x00000801:
            raise Exception(
                f"Format error: Need an IDX file: {labels_filename}")
        n_labels = struct.unpack('>I', labels_f.read(4))[0]
        raw_labels = labels_f.read(n_labels)  # each label is 1 byte

    if len(raw_labels) != n_labels:
        raise ValueError(
            f"Format error: expected {n_labels} label bytes but found "
            f"{len(raw_labels)}: {labels_filename}")

    # frombuffer reinterprets the raw bytes directly instead of creating one
    # Python int per label; copy() detaches the result from the read-only
    # bytes object so the returned array is writable.
    return np.frombuffer(raw_labels, dtype=np.uint8).copy()


You can use these functions by passing the appropriate filenames to them, like this:



import os  # needed for os.path.join below

# `input_dir` must already be set to the directory that holds the four
# gzipped dataset files named below.
training_images = mnist_images_idx_to_array(
    os.path.join(input_dir, "train-images-idx3-ubyte.gz"))
training_labels = mnist_labels_idx_to_array(
    os.path.join(input_dir, "train-labels-idx1-ubyte.gz"))
testing_images = mnist_images_idx_to_array(
    os.path.join(input_dir, "t10k-images-idx3-ubyte.gz"))
testing_labels = mnist_labels_idx_to_array(
    os.path.join(input_dir, "t10k-labels-idx1-ubyte.gz"))