"""
Image normalization
"""
from dataclasses import replace
from typing import Callable, Optional, Tuple

import numpy as np
import torch as ch

from ..pipeline.allocation_query import AllocationQuery
from ..pipeline.operation import Operation
from ..pipeline.state import State
from ..pipeline.compiler import Compiler


def ch_dtype_from_numpy(dtype):
    # Round-trip a zero-dimensional array through torch.from_numpy to obtain
    # the torch dtype equivalent to the given numpy dtype.
    return ch.from_numpy(np.zeros((), dtype=dtype)).dtype
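
# For instance (illustrative): ch_dtype_from_numpy(np.dtype('float16'))
# returns ch.float16. The helper is used below to translate the requested
# numpy output dtype for the GPU code path and memory allocation.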


class NormalizeImage(Operation):
    """Fast implementation of normalization and type conversion of uint8
    images to any floating-point dtype.

    Works on both GPU and CPU tensors.

    Parameters
    ----------
    mean: np.ndarray
        The mean vector.
    std: np.ndarray
        The standard deviation vector.
    type: np.dtype
        The desired output type for the result as a numpy type.
        If the transform is applied to a GPU tensor it will be converted
        to the equivalent torch dtype.
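
    Examples
    --------
    A minimal sketch; the mean/std values below are the usual ImageNet
    statistics scaled to the 0-255 range, given purely for illustration::

        mean = np.array([0.485, 0.456, 0.406]) * 255
        std = np.array([0.229, 0.224, 0.225]) * 255
        norm = NormalizeImage(mean, std, np.float16)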
"""

    def __init__(self, mean: np.ndarray, std: np.ndarray,
                 type: np.dtype):
        super().__init__()
        # Precompute a 256 x num_channels lookup table mapping every possible
        # uint8 value v to (v - mean[c]) / std[c] for each channel c.
        table = (np.arange(256)[:, None] - mean[None, :]) / std[None, :]
        self.original_dtype = type
        table = table.astype(type)
        if type == np.float16:
            # The JIT-compiled CPU path cannot operate on float16 directly, so
            # store the table bit-cast to int16; the bit pattern is identical
            # and is reinterpreted as float16 on the way out.
            type = np.int16
        self.dtype = type
        table = table.view(type)
        self.lookup_table = table
        self.previous_shape = None
        self.mode = 'cpu'
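        # Worked example (illustrative): with mean = [128, 128, 128] and
        # std = [64, 64, 64], a pixel value of 200 in channel c maps to
        # lookup_table[200, c] == (200 - 128) / 64 == 1.125.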

    def generate_code(self) -> Callable:
        # declare_state_and_memory flips self.mode to 'gpu' when the previous
        # pipeline stage runs on a CUDA device.
        if self.mode == 'cpu':
            return self.generate_code_cpu()
        return self.generate_code_gpu()

    def generate_code_gpu(self) -> Callable:
        # We only import cupy if it's truly needed
        import cupy as cp
        import pytorch_pfn_extras as ppe

        tn = np.zeros((), dtype=self.dtype).dtype.name
        kernel = cp.ElementwiseKernel(f'uint8 input, raw {tn} table',
                                      f'{tn} output',
                                      'output = table[input * 3 + i % 3];')
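        # For reference (illustrative, not executed): for the flattened
        # channels-last input prepared below, the kernel computes
        #     output[i] = table_flat[input[i] * 3 + i % 3]
        # i.e. each byte selects a row of the 256 x 3 table, and i % 3 selects
        # the column for its channel.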
        final_type = ch_dtype_from_numpy(self.original_dtype)
        s = self

        def normalize_convert(images, result):
            B, C, H, W = images.shape
            table = s.lookup_table.view(-1)
            assert images.is_contiguous(memory_format=ch.channels_last), \
                'Images need to be in channels-last format'
            result = result[:B]
            result_c = result.view(-1)
            # Flatten to channels-last order so index % 3 is the channel index
            images = images.permute(0, 2, 3, 1).view(-1)
            current_stream = ch.cuda.current_stream()
            with ppe.cuda.stream(current_stream):
                kernel(images, table, result_c)
            # Mark the result as channels-last
            final_result = result.reshape(B, H, W, C).permute(0, 3, 1, 2)
            assert final_result.is_contiguous(memory_format=ch.channels_last), \
                'Images need to be in channels-last format'
            # Reinterpret the bit pattern as the originally requested dtype
            # (undoes the float16 -> int16 view from __init__).
            return final_result.view(final_type)
        return normalize_convert

    def generate_code_cpu(self) -> Callable:
        table = self.lookup_table.view(dtype=self.dtype)
        my_range = Compiler.get_iterator()

        def normalize_convert(images, result, indices):
            result_flat = result.reshape(result.shape[0], -1, 3)
            num_pixels = result_flat.shape[1]
            for i in my_range(len(indices)):
                image = images[i].reshape(num_pixels, 3)
                for px in range(num_pixels):
                    # Just in case llvm forgets to unroll this one
                    result_flat[i, px, 0] = table[image[px, 0], 0]
                    result_flat[i, px, 1] = table[image[px, 1], 1]
                    result_flat[i, px, 2] = table[image[px, 2], 2]
            return result

        # Ask the compiler to parallelize over the batch and to pass the
        # sample indices along to the kernel.
        normalize_convert.is_parallel = True
        normalize_convert.with_indices = True
        return normalize_convert

    def declare_state_and_memory(self, previous_state: State) -> Tuple[State, Optional[AllocationQuery]]:
        if previous_state.device == ch.device('cpu'):
            new_state = replace(previous_state, jit_mode=True, dtype=self.dtype)
            return new_state, AllocationQuery(
                shape=previous_state.shape,
                dtype=self.dtype,
                device=previous_state.device
            )
        else:
            self.mode = 'gpu'
            new_state = replace(previous_state, dtype=self.dtype)
            gpu_type = ch_dtype_from_numpy(self.dtype)

            # Copy the lookup table onto the proper device
            try:
                self.lookup_table = ch.from_numpy(self.lookup_table)
            except TypeError:
                pass  # This is already a tensor
            self.lookup_table = self.lookup_table.to(previous_state.device)

            return new_state, AllocationQuery(
                shape=previous_state.shape,
                device=previous_state.device,
                dtype=gpu_type
            )
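

if __name__ == '__main__':
    # Minimal standalone sketch of the CPU path (illustrative only): in normal
    # use the pipeline calls generate_code and allocates `result` itself, per
    # declare_state_and_memory above. The mean/std values here are arbitrary.
    mean = np.array([128.0, 128.0, 128.0])
    std = np.array([64.0, 64.0, 64.0])
    op = NormalizeImage(mean, std, np.float32)
    fn = op.generate_code_cpu()
    # Fake batch of two 8x8 RGB images in channels-last (HWC) layout
    images = np.random.randint(0, 256, size=(2, 8, 8, 3), dtype=np.uint8)
    result = np.zeros((2, 8, 8, 3), dtype=np.float32)
    out = fn(images, result, np.arange(2))
    # Every output value should be (pixel - mean[c]) / std[c]
    assert np.allclose(out, (images.astype(np.float32) - mean) / std)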