Converting MNIST and Fashion-MNIST IDX format to NumPy
MNIST and the newer Fashion-MNIST datasets are among the most well-known datasets for testing Machine Learning models. Although the original MNIST dataset is essentially a solved ML problem, it looks like it will be with us for a long time.
These datasets are distributed in a binary format. There are 4 files, 2 for the training set and 2 for the testing set, in gzipped IDX1 (labels) and IDX3 (images) formats. Although the format is simple, it is non-standard and requires writing custom code.
For a professional project, I wrote the following two functions to get the contents of these files in NumPy format.
import struct
import gzip
import numpy as np
def mnist_images_idx_to_array(images_filename):
    """Load a gzip-compressed IDX3 image file into a NumPy array.

    The file starts with four big-endian unsigned 32-bit integers (the
    magic number 0x00000803, the image count, the row count and the column
    count), followed by one unsigned byte per pixel.

    Args:
        images_filename: Path to the gzipped IDX3 file.

    Returns:
        A writable ``uint8`` array of shape ``(n_images, n_row, n_col)``.

    Raises:
        Exception: If the magic number is not 0x00000803, or if the file
            holds fewer pixel bytes than its header announces.
        struct.error: If the file ends before the header is complete.
    """
    # The context manager closes the gzip handle even when parsing fails.
    with gzip.open(images_filename, mode="rb") as images_f:
        magic = struct.unpack('>I', images_f.read(4))[0]
        if magic != 0x00000803:
            raise Exception(f"Format error: Need an IDX3 file: {images_filename}")
        n_images, n_row, n_col = struct.unpack('>III', images_f.read(12))
        n_bytes = n_images * n_row * n_col  # each pixel is 1 byte
        pixel_bytes = images_f.read(n_bytes)

    if len(pixel_bytes) != n_bytes:
        raise Exception(
            f"Format error: Expected {n_bytes} pixel bytes but found "
            f"{len(pixel_bytes)}: {images_filename}")

    # frombuffer reinterprets the raw bytes directly instead of building a
    # tuple of Python ints; it yields a read-only view, so copy to hand the
    # caller a writable array.
    images_array = np.frombuffer(pixel_bytes, dtype=np.uint8).copy()
    images_array.shape = (n_images, n_row, n_col)
    return images_array
def mnist_labels_idx_to_array(labels_filename):
    """Load a gzip-compressed IDX label file into a NumPy array.

    The file starts with two big-endian unsigned 32-bit integers (the magic
    number 0x00000801 and the label count), followed by one unsigned byte
    per label.

    Args:
        labels_filename: Path to the gzipped IDX label file.

    Returns:
        A writable one-dimensional ``uint8`` array with ``n_labels`` entries.

    Raises:
        Exception: If the magic number is not 0x00000801, or if the file
            holds fewer label bytes than its header announces.
        struct.error: If the file ends before the header is complete.
    """
    # The context manager closes the gzip handle even when parsing fails.
    with gzip.open(labels_filename, mode="rb") as labels_f:
        magic = struct.unpack('>I', labels_f.read(4))[0]
        if magic != 0x00000801:
            raise Exception(f"Format error: Need an IDX file: {labels_filename}")
        n_labels = struct.unpack('>I', labels_f.read(4))[0]
        label_bytes = labels_f.read(n_labels)

    if len(label_bytes) != n_labels:
        raise Exception(
            f"Format error: Expected {n_labels} label bytes but found "
            f"{len(label_bytes)}: {labels_filename}")

    # frombuffer reinterprets the raw bytes directly instead of building a
    # tuple of Python ints; copy so the caller receives a writable array.
    labels_array = np.frombuffer(label_bytes, dtype=np.uint8).copy()
    return labels_array
You can use these functions by passing the appropriate filenames to them, for example:
import os  # needed for os.path.join below; not among the imports shown earlier

# NOTE: input_dir is not defined in this snippet; it must already name the
# directory that holds the four gzipped dataset files.
training_images = mnist_images_idx_to_array(
    os.path.join(input_dir, "train-images-idx3-ubyte.gz"))
training_labels = mnist_labels_idx_to_array(
    os.path.join(input_dir, "train-labels-idx1-ubyte.gz"))
testing_images = mnist_images_idx_to_array(
    os.path.join(input_dir, "t10k-images-idx3-ubyte.gz"))
testing_labels = mnist_labels_idx_to_array(
    os.path.join(input_dir, "t10k-labels-idx1-ubyte.gz"))