"""
Image normalization
"""
from dataclasses import replace
from typing import Callable, Optional, Tuple

import numpy as np
import torch as ch

from ..pipeline.allocation_query import AllocationQuery
from ..pipeline.operation import Operation
from ..pipeline.state import State
from ..pipeline.compiler import Compiler


def ch_dtype_from_numpy(dtype):
    # Round-trip a zero-dimensional array through torch.from_numpy to obtain
    # the torch dtype equivalent to the given numpy dtype.
    return ch.from_numpy(np.zeros((), dtype=dtype)).dtype
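
# For instance (illustrative): ch_dtype_from_numpy(np.dtype('float16'))
# returns ch.float16. The helper is used below to translate the requested
# numpy output dtype for the GPU code path and memory allocation.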


class NormalizeImage(Operation):
    """Fast implementation of normalization and type conversion of uint8
    images to any floating-point dtype.

    Works on both GPU and CPU tensors.

    Parameters
    ----------
    mean: np.ndarray
        The mean vector.
    std: np.ndarray
        The standard deviation vector.
    type: np.dtype
        The desired output type for the result as a numpy type.
        If the transform is applied to a GPU tensor it will be converted
        to the equivalent torch dtype.
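
    Examples
    --------
    A minimal sketch; the mean/std values below are the usual ImageNet
    statistics scaled to the 0-255 range, given purely for illustration::

        mean = np.array([0.485, 0.456, 0.406]) * 255
        std = np.array([0.229, 0.224, 0.225]) * 255
        norm = NormalizeImage(mean, std, np.float16)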
"""

    def __init__(self, mean: np.ndarray, std: np.ndarray,
                 type: np.dtype):
        super().__init__()
        # Precompute a 256 x num_channels lookup table mapping every possible
        # uint8 value v to (v - mean[c]) / std[c] for each channel c.
        table = (np.arange(256)[:, None] - mean[None, :]) / std[None, :]
        self.original_dtype = type
        table = table.astype(type)
        if type == np.float16:
            # The JIT-compiled CPU path cannot operate on float16 directly, so
            # store the table bit-cast to int16; the bit pattern is identical
            # and is reinterpreted as float16 on the way out.
            type = np.int16
        self.dtype = type
        table = table.view(type)
        self.lookup_table = table
        self.previous_shape = None
        self.mode = 'cpu'
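        # Worked example (illustrative): with mean = [128, 128, 128] and
        # std = [64, 64, 64], a pixel value of 200 in channel c maps to
        # lookup_table[200, c] == (200 - 128) / 64 == 1.125.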

    def generate_code(self) -> Callable:
        # declare_state_and_memory flips self.mode to 'gpu' when the previous
        # pipeline stage runs on a CUDA device.
        if self.mode == 'cpu':
            return self.generate_code_cpu()
        return self.generate_code_gpu()

    def generate_code_gpu(self) -> Callable:
        # We only import cupy if it's truly needed
        import cupy as cp
        import pytorch_pfn_extras as ppe

        tn = np.zeros((), dtype=self.dtype).dtype.name
        kernel = cp.ElementwiseKernel(f'uint8 input, raw {tn} table',
                                      f'{tn} output',
                                      'output = table[input * 3 + i % 3];')
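        # For reference (illustrative, not executed): for the flattened
        # channels-last input prepared below, the kernel computes
        #     output[i] = table_flat[input[i] * 3 + i % 3]
        # i.e. each byte selects a row of the 256 x 3 table, and i % 3 selects
        # the column for its channel.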
        final_type = ch_dtype_from_numpy(self.original_dtype)
        s = self

        def normalize_convert(images, result):
            B, C, H, W = images.shape
            table = s.lookup_table.view(-1)
            assert images.is_contiguous(memory_format=ch.channels_last), \
                'Images need to be in channels-last format'
            result = result[:B]
            result_c = result.view(-1)
            # Flatten to channels-last order so index % 3 is the channel index
            images = images.permute(0, 2, 3, 1).view(-1)
            current_stream = ch.cuda.current_stream()
            with ppe.cuda.stream(current_stream):
                kernel(images, table, result_c)
            # Mark the result as channels-last
            final_result = result.reshape(B, H, W, C).permute(0, 3, 1, 2)
            assert final_result.is_contiguous(memory_format=ch.channels_last), \
                'Images need to be in channels-last format'
            # Reinterpret the bit pattern as the originally requested dtype
            # (undoes the float16 -> int16 view from __init__).
            return final_result.view(final_type)
        return normalize_convert

    def generate_code_cpu(self) -> Callable:
        table = self.lookup_table.view(dtype=self.dtype)
        my_range = Compiler.get_iterator()

        def normalize_convert(images, result, indices):
            result_flat = result.reshape(result.shape[0], -1, 3)
            num_pixels = result_flat.shape[1]
            for i in my_range(len(indices)):
                image = images[i].reshape(num_pixels, 3)
                for px in range(num_pixels):
                    # Just in case llvm forgets to unroll this one
                    result_flat[i, px, 0] = table[image[px, 0], 0]
                    result_flat[i, px, 1] = table[image[px, 1], 1]
                    result_flat[i, px, 2] = table[image[px, 2], 2]
            return result

        # Ask the compiler to parallelize over the batch and to pass the
        # sample indices along to the kernel.
        normalize_convert.is_parallel = True
        normalize_convert.with_indices = True
        return normalize_convert

    def declare_state_and_memory(self, previous_state: State) -> Tuple[State, Optional[AllocationQuery]]:
        if previous_state.device == ch.device('cpu'):
            new_state = replace(previous_state, jit_mode=True, dtype=self.dtype)
            return new_state, AllocationQuery(
                shape=previous_state.shape,
                dtype=self.dtype,
                device=previous_state.device
            )
        else:
            self.mode = 'gpu'
            new_state = replace(previous_state, dtype=self.dtype)
            gpu_type = ch_dtype_from_numpy(self.dtype)

            # Copy the lookup table onto the proper device
            try:
                self.lookup_table = ch.from_numpy(self.lookup_table)
            except TypeError:
                pass  # This is already a tensor
            self.lookup_table = self.lookup_table.to(previous_state.device)

            return new_state, AllocationQuery(
                shape=previous_state.shape,
                device=previous_state.device,
                dtype=gpu_type
            )
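

if __name__ == '__main__':
    # Minimal standalone sketch of the CPU path (illustrative only): in normal
    # use the pipeline calls generate_code and allocates `result` itself, per
    # declare_state_and_memory above. The mean/std values here are arbitrary.
    mean = np.array([128.0, 128.0, 128.0])
    std = np.array([64.0, 64.0, 64.0])
    op = NormalizeImage(mean, std, np.float32)
    fn = op.generate_code_cpu()
    # Fake batch of two 8x8 RGB images in channels-last (HWC) layout
    images = np.random.randint(0, 256, size=(2, 8, 8, 3), dtype=np.uint8)
    result = np.zeros((2, 8, 8, 3), dtype=np.float32)
    out = fn(images, result, np.arange(2))
    # Every output value should be (pixel - mean[c]) / std[c]
    assert np.allclose(out, (images.astype(np.float32) - mean) / std)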