#!/usr/bin/env python3
# -*- coding: utf-8 -*-
__author__ = "Christian Heider Nielsen"
__doc__ = r"""
Foveal attention, moves around gaze and yields glimpses
"""
from typing import Tuple
import torch
from torch import nn
from . import ram_modules
__all__ = ["RecurrentAttention"]
class RecurrentAttention(nn.Module):
    """A Recurrent Model of Visual Attention (RAM) [1].

    RAM is a recurrent neural network that processes inputs
    sequentially, attending to a different location within the image
    at each step and incrementally combining information from these
    fixations to build up a dynamic internal representation of the
    image.

    References:
      [1]: Mnih et al., https://arxiv.org/abs/1406.6247"""
    def __init__(
        self,
        size_glimpse,
        num_patches_per_glimpse,
        scale_factor_suc,
        num_channels,
        hidden_size_glimpse,
        hidden_size_locator,
        std_policy,
        hidden_size_rnn,
        num_classes,
    ):
"""Constructor.
Args:
size_glimpse: size of the square patches in the glimpses extracted by the retina.
num_patches_per_glimpse: number of patches to extract per glimpse.
scale_factor_suc: scaling factor that controls the size of successive patches.
num_channels: number of channels in each image.
hidden_size_glimpse: hidden layer size of the fc layer for `phi`.
hidden_size_locator: hidden layer size of the fc layer for `l`.
std_policy: standard deviation of the Gaussian policy.
hidden_size_rnn: hidden size of the rnn.
num_classes: number of classes in the dataset.
num_glimpses: number of glimpses to take per image,
i.e. number of BPTT steps."""
        super().__init__()
        # Glimpse network: encodes the retina-like patches around the
        # previous location together with the location itself.
        self._sensor = ram_modules.GlimpseSensor(
            hidden_size_glimpse,
            hidden_size_locator,
            size_glimpse,
            num_patches_per_glimpse,
            scale_factor_suc,
            num_channels,
        )
        # Core network: maintains the internal state across glimpses.
        self._rnn = ram_modules.CoreRNN(hidden_size_rnn, hidden_size_rnn)
        # Location network: Gaussian policy over the next 2D fixation.
        self._locator_policy = ram_modules.Locator(hidden_size_rnn, 2, std_policy)
        # Action network: classifies the image from the hidden state.
        self.classifier = ram_modules.Actor(hidden_size_rnn, num_classes)
        # Baseline network: variance reduction for the REINFORCE gradient.
        self._signal_baseline = ram_modules.SignalBaseline(hidden_size_rnn, 1)
    def forward(
        self,
        x: torch.Tensor,
        l_t_prev: torch.Tensor,
        h_t_prev: torch.Tensor,
        last: bool = False,
    ) -> Tuple[torch.Tensor, ...]:
"""Run RAM for one step on a minibatch of images.
Args:
x: a 4D Tensor of shape (B, H, W, C). The minibatch
of images.
l_t_prev: a 2D tensor of shape (B, 2). The location vector
containing the glimpse coordinates [x, y] for the previous
step `t-1`.
h_t_prev: a 2D tensor of shape (B, hidden_size). The hidden
state vector for the previous step `t-1`.
last: a bool indicating whether this is the last step.
If True, the action network returns an output probability
vector over the classes and the baseline `b_t` for the
current step `t`. Else, the core network returns the
hidden state vector for the next step `t+1` and the
location vector for the next step `t+1`.
Returns:
h_t: a 2D tensor of shape (B, hidden_size). The hidden
state vector for the current step `t`.
mu: a 2D tensor of shape (B, 2). The mean that parametrizes
the Gaussian policy.
l_t: a 2D tensor of shape (B, 2). The location vector
containing the glimpse coordinates [x, y] for the
current step `t`.
b_t: a vector of length (B,). The baseline for the
current time step `t`.
log_probas: a 2D tensor of shape (B, num_classes). The
output log probability vector over the classes.
log_pi: a vector of length (B,)."""
        # Encode the glimpse at l_t_prev and update the recurrent state.
        h_t = self._rnn(self._sensor(x, l_t_prev), h_t_prev)
        # Sample the next fixation from the Gaussian policy.
        log_pi, l_t = self._locator_policy(h_t)
        # squeeze(-1) rather than squeeze() so a batch of size 1 keeps
        # its batch dimension.
        b_t = self._signal_baseline(h_t).squeeze(-1)
        if last:
            return h_t, l_t, b_t, self.classifier(h_t), log_pi  # log_probas
        return h_t, l_t, b_t, log_pi
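

# ----------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the original module):
# drives the attention loop for a fixed number of glimpses and reads out
# class log-probabilities on the last step. All hyperparameter values,
# the input shape (B, H, W, C), and the [-1, 1] location range are
# assumptions based on the docstrings above; the behaviour of the
# `ram_modules` sub-modules is not shown here.
if __name__ == "__main__":
    batch_size, num_glimpses, hidden_size_rnn = 4, 6, 256  # assumed values
    model = RecurrentAttention(
        size_glimpse=8,
        num_patches_per_glimpse=3,
        scale_factor_suc=2,
        num_channels=1,
        hidden_size_glimpse=128,
        hidden_size_locator=128,
        std_policy=0.05,
        hidden_size_rnn=hidden_size_rnn,
        num_classes=10,
    )
    x = torch.rand(batch_size, 64, 64, 1)  # minibatch of (B, H, W, C) images
    l_t = torch.empty(batch_size, 2).uniform_(-1.0, 1.0)  # initial gaze location
    h_t = torch.zeros(batch_size, hidden_size_rnn)  # initial hidden state
    for _ in range(num_glimpses - 1):  # intermediate glimpses
        h_t, l_t, b_t, log_pi = model(x, l_t, h_t)
    # Last glimpse: the action network additionally returns log_probas.
    h_t, l_t, b_t, log_probas, log_pi = model(x, l_t, h_t, last=True)
    print(log_probas.shape)  # expected: (batch_size, num_classes)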