# Source code for neodroidvision.data.synthesis.conversion.mnist.h5_mnist_data
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
__author__ = "Christian Heider Nielsen"
__doc__ = r"""
Created on 22/03/2020
"""
from pathlib import Path
from draugr.visualisation import progress_bar
from warg import Triple
"""Get the binarized MNIST dataset and convert to hdf5.
From https://github.com/yburda/iwae/blob/master/datasets.py
"""
import urllib.request
import h5py
import numpy
from neodroidvision import PROJECT_APP_PATH
__all__ = []
# __all__ = ["download_binary_mnist", "parse_binary_mnist"]
def parse_binary_mnist(data_dir: Path) -> Triple:
    """Parse the three binarized MNIST ``.amat`` text files found in ``data_dir``.

    Every non-blank line of a file is split on whitespace, each token is
    converted with ``int`` and the resulting rows are stacked into one
    ``float32`` array per file.

    Args:
        data_dir: Directory holding ``binarized_mnist_train.amat``,
            ``binarized_mnist_valid.amat`` and ``binarized_mnist_test.amat``.

    Returns:
        The train, validation and test arrays, in that order.

    Raises:
        OSError: If one of the three files cannot be opened.
        ValueError: If a token in a file is not an integer literal.
    """

    def read_split(split: str) -> numpy.ndarray:
        """Read ``binarized_mnist_<split>.amat`` into a ``float32`` array."""
        with open(
            data_dir / f"binarized_mnist_{split}.amat", encoding="utf-8"
        ) as amat_file:
            # Stream the file line by line instead of readlines(); blank
            # lines are skipped so a trailing empty line cannot produce a
            # ragged (and therefore un-castable) array.
            rows = [
                [int(token) for token in line.split()]
                for line in amat_file
                if line.strip()
            ]
        return numpy.array(rows).astype("float32")

    train_data = read_split("train")
    validation_data = read_split("valid")
    test_data = read_split("test")
    return train_data, validation_data, test_data
def download_binary_mnist(
    file_path: str = "binary_mnist.h5",
    data_dir: Path = (PROJECT_APP_PATH.user_data / "vanilla_vae" / "data"),
):
    """Download the binarized MNIST ``.amat`` files and pack them into one HDF5 file.

    The train, valid and test files are fetched into ``data_dir``, parsed with
    ``parse_binary_mnist`` and written to ``file_path`` as the datasets
    ``"train"``, ``"valid"`` and ``"test"``.

    Args:
        file_path: Destination of the HDF5 file; it is opened in ``"w"`` mode.
        data_dir: Directory the downloaded ``.amat`` files are stored in. It is
            created (including parents) when missing.
    """
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the directory in between.
    data_dir.mkdir(parents=True, exist_ok=True)

    subdatasets = ["train", "valid", "test"]
    for subdataset in progress_bar(subdatasets):
        filename = f"binarized_mnist_{subdataset}.amat"
        url = (
            f"http://www.cs.toronto.edu/~larocheh/public/datasets/binarized_mnist"
            f"/binarized_mnist_{subdataset}.amat"
        )
        urllib.request.urlretrieve(url, str(data_dir / filename))

    # parse_binary_mnist returns the arrays in train, valid, test order,
    # which is the same order as ``subdatasets``.
    arrays = parse_binary_mnist(data_dir)

    # The context manager closes the HDF5 handle even if a write fails.
    with h5py.File(file_path, "w") as h5_file:
        for dataset_name, array in zip(subdatasets, arrays):
            h5_file.create_dataset(dataset_name, data=array)

    print(f"Saved binary MNIST data to: {file_path}")
if __name__ == "__main__":
    # Script entry point: run the download and conversion with the default
    # output file name and data directory.
    download_binary_mnist()