# Source code for neodroidvision.data.classification.mnist

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

__author__ = "Christian Heider Nielsen"
__doc__ = r"""

           Created on 25/03/2020
           """

from pathlib import Path
from typing import Sequence, Tuple

import numpy
import torch
from draugr.numpy_utilities import SplitEnum
from draugr.torch_utilities import SupervisedDataset, global_pin_memory
from matplotlib import pyplot
from torch.utils.data import Subset
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import datasets, transforms

__all__ = ["MNISTDataset"]


class MNISTDataset(SupervisedDataset):
    """MNIST handwritten digits wrapped in the ``SupervisedDataset`` interface.

    Wraps ``torchvision.datasets.MNIST`` (downloading it on demand) and
    provides convenience constructors for train/validation/test data
    loaders plus small plotting helpers.
    """

    # Canonical MNIST normalisation constants (mean, std) for the single channel.
    trans = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
    )
    inverse_transform = transforms.Compose(
        [
            # transforms.Normalize((-mean / std).tolist(), (1.0 / std).tolist()),  # TODO: imp
            transforms.ToPILImage()
        ]
    )

    def __init__(self, data_dir: Path, split: SplitEnum = SplitEnum.training):
        """Load (and download if necessary) the requested MNIST split.

        :param data_dir: directory where the dataset is stored/downloaded.
        :param split: ``SplitEnum.training`` loads the training split; any
            other value loads the test split.
        """
        super().__init__()
        self._dataset = datasets.MNIST(
            str(data_dir),
            train=split == SplitEnum.training,
            download=True,
            transform=self.trans,
        )

    def __getitem__(self, index):
        """Return the (image, label) pair at ``index``."""
        return self._dataset[index]

    def __len__(self) -> int:
        """Return the number of samples in the selected split."""
        return len(self._dataset)

    @property
    def predictor_shape(self) -> Tuple[int, ...]:
        """Shape of one input image as (channels, height, width)."""
        return 1, 28, 28

    @property
    def response_shape(self) -> Tuple[int, ...]:
        """Shape of the target space: ten digit classes."""
        return (10,)

    @staticmethod
    def get_train_valid_loader(
        data_dir: Path,
        *,
        batch_size: int,
        random_seed: int,
        valid_size: float = 0.1,
        shuffle: bool = True,
        num_workers: int = 0,
        pin_memory: bool = False,
        using_cuda: bool = True,
    ) -> Tuple[torch.utils.data.DataLoader, torch.utils.data.DataLoader]:
        """Build train and validation loaders over the MNIST training split.

        If using CUDA, ``num_workers`` should be set to 1 and
        ``pin_memory`` to True.

        :param data_dir: path to the dataset directory.
        :param batch_size: how many samples per batch to load.
        :param random_seed: seed fixing the index shuffle for reproducibility.
        :param valid_size: fraction of the training set held out for
            validation; must lie in [0, 1] (0.1 in the original paper).
        :param shuffle: whether to shuffle the indices before splitting.
        :param num_workers: number of subprocesses used when loading.
        :param pin_memory: whether to copy tensors into CUDA pinned memory;
            set to True when using a GPU.
        :param using_cuda: CUDA hint; currently a no-op hook.
        :return: ``(train_loader, valid_loader)``.
        """
        error_msg = "[!] valid_size should be in the range [0, 1]."
        assert (valid_size >= 0) and (valid_size <= 1), error_msg

        if using_cuda:  # kept as a no-op hook for the commented checks below
            pass
            # assert num_workers == 1
            # assert pin_memory == True

        dataset = MNISTDataset(data_dir)
        num_train = len(dataset)
        indices = list(range(num_train))
        split = int(numpy.floor(valid_size * num_train))

        if shuffle:
            # A seeded local RandomState yields the exact same permutation as
            # numpy.random.seed + numpy.random.shuffle, without mutating the
            # process-global RNG as a side effect.
            numpy.random.RandomState(random_seed).shuffle(indices)

        train_idx, valid_idx = indices[split:], indices[:split]
        train_sampler = SubsetRandomSampler(train_idx)
        valid_sampler = SubsetRandomSampler(valid_idx)

        train_loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=batch_size,
            sampler=train_sampler,
            num_workers=num_workers,
            pin_memory=pin_memory,
        )
        valid_loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=batch_size,
            sampler=valid_sampler,
            num_workers=num_workers,
            pin_memory=pin_memory,
        )

        return train_loader, valid_loader

    @staticmethod
    def get_test_loader(
        data_dir: Path,
        batch_size: int,
        *,
        num_workers: int = 0,
        pin_memory: bool = False,
        using_cuda: bool = True,
    ) -> torch.utils.data.DataLoader:
        """Build a data loader over the MNIST test split.

        If using CUDA, ``num_workers`` should be set to 1 and
        ``pin_memory`` to True.

        :param data_dir: path to the dataset directory.
        :param batch_size: how many samples per batch to load.
        :param num_workers: number of subprocesses used when loading.
        :param pin_memory: whether to copy tensors into CUDA pinned memory;
            set to True when using a GPU.
        :param using_cuda: CUDA hint; currently a no-op hook.
        :return: a non-shuffling loader over the test split.
        """
        if using_cuda:  # kept as a no-op hook for the commented checks below
            pass
            # assert num_workers == 1
            # assert pin_memory == True

        dataset = MNISTDataset(data_dir, split=SplitEnum.testing)

        return torch.utils.data.DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=num_workers,
            pin_memory=pin_memory,
        )

    def sample(self) -> None:
        """Draw one random batch of nine samples and plot them in a 3x3 grid."""
        images, labels = next(
            iter(
                torch.utils.data.DataLoader(
                    self,
                    batch_size=9,
                    shuffle=True,
                    num_workers=0,
                    pin_memory=global_pin_memory(0),
                )
            )
        )
        # (N, C, H, W) -> (N, H, W, C) for matplotlib.
        X = numpy.transpose(images.numpy(), [0, 2, 3, 1])
        MNISTDataset.plot_images(X, labels)

    @staticmethod
    def plot_images(images: numpy.ndarray, label: Sequence) -> None:
        """Plot exactly nine grayscale images in a 3x3 grid, labelled below.

        :param images: array of nine images, channel-last or squeezable to 2-D.
        :param label: the nine labels, one per image.
        """
        images = images.squeeze()
        assert len(images) == len(label) == 9
        fig, axes = pyplot.subplots(3, 3)
        for i, ax in enumerate(axes.flat):
            ax.imshow(images[i], cmap="Greys_r")
            ax.set_xlabel(f"{label[i]}")
            ax.set_xticks([])
            ax.set_yticks([])
        pyplot.show()
if __name__ == "__main__":

    def main():
        """Smoke-test: plot a random 3x3 sample grid from a local MNIST copy."""
        MNISTDataset(Path.home() / "Data" / "MNIST").sample()
        pyplot.show()

    main()