Source code for bfgn.data_management.scalers

import logging
import os
import sys
from typing import List

import joblib
import numpy as np
import sklearn.preprocessing


_logger = logging.getLogger(__name__)


def get_available_scalers() -> List[str]:
    """Gets list of available scaler names.

    Returns:
        List of available scaler names.
    """
    return sorted([attr for attr in sys.modules[__name__].__dict__.keys()
                   if not attr.startswith('Base') and attr.endswith('Scaler')])
def get_scaler(scaler_name: str, scaler_options: dict) -> 'BaseGlobalScaler':
    """Gets scaler matching the provided name.

    Args:
        scaler_name: Scaler name from available scalers.
        scaler_options: Configuration options for the requested scaler.

    Returns:
        Scaler matching the provided name.
    """
    available_scalers = get_available_scalers()
    assert scaler_name in available_scalers, \
        'Scaler {} not in available scalers: {}'.format(scaler_name, available_scalers)
    return getattr(sys.modules[__name__], scaler_name)(**scaler_options)
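
# Illustrative usage sketch (not part of the original module): listing the available scalers and
# constructing one by name. The save prefix '/tmp/scalers/' is a hypothetical example path.
def _example_get_scaler():
    available = get_available_scalers()  # e.g. ['ConstantScaler', 'MinMaxScaler', ...]
    _logger.debug('Available scalers: {}'.format(available))
    # scaler_options is passed straight through to the scaler constructor, so it must include
    # savename_base plus any scaler-specific keyword arguments (e.g. feature_range for MinMaxScaler).
    return get_scaler('MinMaxScaler', {'savename_base': '/tmp/scalers/', 'feature_range': (0, 1)})
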
class BaseGlobalScaler(object):
    """
    Scalers handle transforming data before the neural network is fit or used for prediction, as well as
    inverse transforming the data for review or downstream applications afterwards. Readily available
    scalers from the scikit-learn package handle the details of the transform and inverse transform, while
    this class hierarchy handles reshaping and otherwise managing the image arrays.
    """
    savename = None
    scaler_name = None

    def __init__(self, savename_base=None):
        """
        :param savename_base: the directory and, optionally, a filename prefix for saving the fitted scaler
        """
        if savename_base is not None:
            self.savename = savename_base + self.scaler_name
        self.is_fitted = False
        # TODO: In all of the below, handle (i.e., ignore) transformations on categorical data
    def fit(self, image_array):
        assert self.is_fitted is False, 'Scaler has already been fit to data'
        self._fit(image_array)
        self.is_fitted = True

    def _fit(self, image_array):
        raise NotImplementedError

    def inverse_transform(self, image_array):
        raise NotImplementedError

    def transform(self, image_array):
        raise NotImplementedError

    def fit_transform(self, image_array):
        self.fit(image_array)
        return self.transform(image_array)

    def save(self):
        raise NotImplementedError

    def load(self):
        raise NotImplementedError
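
# Illustrative sketch (not part of the original module): the intended fit / transform / inverse_transform
# round trip shared by all concrete scalers. The (16, 16, 3) array stands in for a hypothetical
# (height, width, num_channels) image window, and '/tmp/example_' is a hypothetical save prefix.
def _example_round_trip():
    scaler = StandardScaler(savename_base='/tmp/example_')
    image_array = np.random.random((16, 16, 3))
    transformed = scaler.fit_transform(image_array)
    recovered = scaler.inverse_transform(transformed)
    assert np.allclose(image_array, recovered)
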
class BaseSklearnScaler(BaseGlobalScaler):
    scaler = None

    def __init__(self, savename_base):
        self.scaler_name = 'sklearn_' + self.scaler.__class__.__name__
        super().__init__(savename_base)

    def _fit(self, image_array):
        # Reshape to (num_samples, num_features) for sklearn
        image_array = self._reshape_image_array(image_array)
        self.scaler.fit(image_array)

    def inverse_transform(self, image_array):
        # Reshape to (num_samples, num_features) for sklearn
        shape = image_array.shape
        image_array = self._reshape_image_array(image_array)
        image_array = self.scaler.inverse_transform(image_array)
        return image_array.reshape(shape)

    def transform(self, image_array):
        # Reshape to (num_samples, num_features) for sklearn
        shape = image_array.shape
        image_array = self._reshape_image_array(image_array)
        image_array = self.scaler.transform(image_array)
        return image_array.reshape(shape)
    def _reshape_image_array(self, image_array):
        # The last dimension, image_array.shape[-1], is num_channels, so all leading dimensions
        # (e.g. image width x image height) collapse into a single samples dimension
        if len(image_array.shape) > 2:
            return image_array.reshape(-1, image_array.shape[-1])
        else:
            return image_array
    def save(self):
        joblib.dump(self.scaler, self.savename)

    def load(self):
        if os.path.isfile(self.savename):
            self.scaler = joblib.load(self.savename)
            self.is_fitted = True
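
# Illustrative sketch (not part of the original module): how the reshape helper presents image data to
# scikit-learn. A (height, width, num_channels) array is flattened to (num_samples, num_features) so the
# wrapped sklearn scaler fits per-channel statistics, and transform restores the original image shape.
def _example_reshape_behavior():
    scaler = MinMaxScaler(savename_base=None)  # None: nothing is saved in this sketch
    image_array = np.arange(2 * 3 * 4, dtype=float).reshape(2, 3, 4)
    assert scaler._reshape_image_array(image_array).shape == (6, 4)  # 2 x 3 pixels, 4 channels each
    assert scaler.fit_transform(image_array).shape == image_array.shape
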
class NullScaler(BaseGlobalScaler):

    def __init__(self, savename_base):
        self.scaler_name = 'NullScaler'
        super().__init__(savename_base)

    def _fit(self, image_array):
        return image_array

    def inverse_transform(self, image_array):
        return image_array

    def transform(self, image_array):
        return image_array

    def save(self):
        pass

    def load(self):
        self.is_fitted = True
class ConstantScaler(BaseGlobalScaler):
    constant_scaler = None
    constant_offset = None

    def __init__(self, savename_base, constant_scaler=None, constant_offset=None):
        self.constant_scaler = constant_scaler
        self.constant_offset = constant_offset
        self.scaler_name = 'ConstantScaler'
        super().__init__(savename_base)

    def _fit(self, image_array):
        pass

    def inverse_transform(self, image_array):
        image_array = (image_array - self.constant_offset) * self.constant_scaler
        return image_array

    def transform(self, image_array):
        image_array = image_array / self.constant_scaler + self.constant_offset
        return image_array

    def save(self):
        np.savez(self.savename + '.npz', constant_scaler=self.constant_scaler,
                 constant_offset=self.constant_offset)

    def load(self):
        if os.path.isfile(self.savename + '.npz'):
            npzf = np.load(self.savename + '.npz')
            self.constant_scaler = npzf['constant_scaler']
            self.constant_offset = npzf['constant_offset']
            self.is_fitted = True
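
# Illustrative sketch (not part of the original module): ConstantScaler divides by a fixed factor and adds
# a fixed offset, which fits cases where the scaling is known in advance. The factor 10000.0 and offset 0.0
# below are hypothetical values, e.g. for reflectance data stored as scaled integers.
def _example_constant_scaler():
    scaler = ConstantScaler(savename_base=None, constant_scaler=10000.0, constant_offset=0.0)
    image_array = np.array([[5000.0, 2500.0], [10000.0, 100.0]])
    transformed = scaler.fit_transform(image_array)    # image_array / 10000.0 + 0.0
    recovered = scaler.inverse_transform(transformed)  # (transformed - 0.0) * 10000.0
    assert np.allclose(image_array, recovered)
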
class StandardScaler(BaseSklearnScaler):

    def __init__(self, savename_base):
        self.scaler = sklearn.preprocessing.StandardScaler(copy=True)
        super().__init__(savename_base)


class MinMaxScaler(BaseSklearnScaler):

    def __init__(self, savename_base, feature_range=(0, 1)):
        self.scaler = sklearn.preprocessing.MinMaxScaler(feature_range=feature_range, copy=True)
        super().__init__(savename_base)


class RobustScaler(BaseSklearnScaler):

    def __init__(self, savename_base, quantile_range=(10.0, 90.0)):
        self.scaler = sklearn.preprocessing.RobustScaler(quantile_range=quantile_range, copy=True)
        super().__init__(savename_base)


class PowerScaler(BaseSklearnScaler):

    def __init__(self, savename_base, method='box-cox'):
        self.scaler = sklearn.preprocessing.PowerTransformer(method=method, copy=True)
        super().__init__(savename_base)


class QuantileUniformScaler(BaseSklearnScaler):

    def __init__(self, savename_base, output_distribution='uniform'):
        self.scaler = sklearn.preprocessing.QuantileTransformer(output_distribution=output_distribution, copy=True)
        super().__init__(savename_base)
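
# Illustrative sketch (not part of the original module): persisting a fitted sklearn-backed scaler and
# restoring it later. '/tmp/scalers/' is a hypothetical directory assumed to already exist; save() writes
# the wrapped sklearn object via joblib, and load() marks the scaler as fitted when that file exists.
def _example_save_and_load():
    scaler = RobustScaler(savename_base='/tmp/scalers/')
    scaler.fit(np.random.random((8, 8, 2)))
    scaler.save()  # writes /tmp/scalers/sklearn_RobustScaler

    restored = RobustScaler(savename_base='/tmp/scalers/')
    restored.load()
    assert restored.is_fitted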