Skip to content

Commit f4477b8

Browse files
jfsantos-ds (Francisco Santos)
authored and committed
feat: Gumbel Softmax and Activation Interface
1 parent ef3d5e2 commit f4477b8

4 files changed

Lines changed: 219 additions & 11 deletions

File tree

src/ydata_synthetic/preprocessing/base_processor.py

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,38 @@
1-
"Implements a BaseProcessor Class, not meant to be directly instantiated."
1+
"Base class of Data Preprocessors, do not instantiate this class directly."
22
from __future__ import annotations
33

44
from abc import ABC, abstractmethod
5+
from collections import namedtuple
56
from typing import List, Optional
67

7-
from numpy import ndarray
8-
from pandas import DataFrame, Series
8+
from numpy import concatenate, ndarray, split, zeros
9+
from pandas import DataFrame, Series, concat
910
from sklearn.base import BaseEstimator, TransformerMixin
1011
from sklearn.exceptions import NotFittedError
1112
from typeguard import typechecked
1213

14+
ProcessorInfo = namedtuple("ProcessorInfo", ["numerical", "categorical"])
15+
PipelineInfo = namedtuple("PipelineInfo", ["feat_names_in", "feat_names_out"])
1316

17+
# pylint: disable=R0902
1418
@typechecked
1519
class BaseProcessor(ABC, BaseEstimator, TransformerMixin):
1620
"""
17-
Base class for Data Preprocessing.
18-
It works like any other transformer in scikit learn with the methods fit, transform and inverse transform.
21+
This data processor works like a scikit learn transformer in with the methods fit, transform and inverse transform.
1922
Args:
2023
num_cols (list of strings):
2124
List of names of numerical columns.
2225
cat_cols (list of strings):
2326
List of names of categorical columns.
2427
"""
2528
def __init__(self, num_cols: Optional[List[str]] = None, cat_cols: Optional[List[str]] = None):
    # Default to empty lists so downstream truthiness checks work uniformly.
    self.num_cols = num_cols if num_cols is not None else []
    self.cat_cols = cat_cols if cat_cols is not None else []

    # Child processors are expected to override these with concrete pipelines.
    self._num_pipeline = None
    self._cat_pipeline = None

    # Metadata object mapping inputs/outputs of each pipeline
    self._col_transform_info = None
3436

3537
@property
3638
def num_pipeline(self) -> BaseEstimator:
@@ -47,6 +49,25 @@ def types(self) -> Series:
4749
"""Returns a Series with the dtypes of each column in the fitted DataFrame."""
4850
return self._types
4951

52+
@property
def col_transform_info(self) -> ProcessorInfo:
    """Returns a ProcessorInfo object specifying input/output feature mappings of this processor's pipelines."""
    self._check_is_fitted()
    # Built lazily on first access and cached afterwards.
    if self._col_transform_info is None:
        self._col_transform_info = self.__create_metadata_synth()
    return self._col_transform_info
59+
60+
def __create_metadata_synth(self):
    "Builds the ProcessorInfo namedtuple mapping each fitted pipeline's input features to its output features."
    def pipeline_info(pipeline):
        # Input/output feature names as exposed by the fitted sklearn pipeline.
        return PipelineInfo(pipeline.feature_names_in_, pipeline.get_feature_names_out())

    # Numerical and categorical entries are filled only when the respective columns exist.
    num_info = pipeline_info(self.num_pipeline) if self.num_cols else None
    cat_info = pipeline_info(self.cat_pipeline) if self.cat_cols else None
    return ProcessorInfo(num_info, cat_info)
70+
5071
def _check_is_fitted(self):
5172
"""Checks if the processor is fitted by testing the numerical pipeline.
5273
Raises NotFittedError if not."""
@@ -86,8 +107,7 @@ def transform(self, X: DataFrame) -> ndarray:
86107
DataFrame used to fit the processor parameters.
87108
Should be aligned with the columns types defined in initialization.
88109
Returns:
89-
transformed (ndarray):
90-
Processed version of the passed DataFrame.
110+
transformed (ndarray): Processed version of the passed DataFrame.
91111
"""
92112
raise NotImplementedError
93113

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
"Activation Interface layer test suite."
2+
from numpy import cumsum, isin, split
3+
from numpy import sum as npsum
4+
from numpy.random import normal
5+
from pytest import fixture
6+
from tensorflow.keras import Model
7+
from tensorflow.keras.layers import Dense, Input
8+
9+
from ydata_synthetic.utils.gumbel_softmax import ActivationInterface
10+
11+
12+
@fixture(name='noise_batch')
def fixture_noise_batch():
    "Sample noise for mock output generation."
    batch_size, noise_dim = 10, 16
    return normal(size=(batch_size, noise_dim))
16+
17+
@fixture(name='mock_col_map')
def fixture_mock_col_map():
    "Mock data processing column map (var blocks i/o names)."
    # Numerical pipeline: 6 features in, 6 features out (identity mapping).
    num_names = [f'nfeat{n}' for n in range(6)]
    # Categorical pipeline: 2 features in, expanded to 4 + 2 one-hot columns out.
    cat_in = [f'cfeat{n}' for n in range(2)]
    cat_out = [f'cfeat0_{i}' for i in range(4)] + [f'cfeat1_{i}' for i in range(2)]
    return {'numerical': [list(num_names), list(num_names)],
            'categorical': [cat_in, cat_out]}
26+
27+
# pylint: disable=C0103
@fixture(name='mock_generator')
def fixture_mock_generator(noise_batch, mock_col_map):
    "A mock generator with the Activation Interface as final layer."
    dim = 15
    data_dim = 12
    inputs = Input(shape=noise_batch.shape[1], batch_size=noise_batch.shape[0])
    hidden = Dense(dim, activation='relu')(inputs)
    hidden = Dense(dim * 2, activation='relu')(hidden)
    hidden = Dense(dim * 4, activation='relu')(hidden)
    logits = Dense(data_dim)(hidden)
    outputs = ActivationInterface(mock_col_map, name='act_itf')(logits)
    return Model(inputs=inputs, outputs=outputs)
40+
41+
@fixture(name='mock_output')
def fixture_mock_output(noise_batch, mock_generator):
    "Returns mock output of the model as a numpy object."
    batch_output = mock_generator(noise_batch)
    return batch_output.numpy()
45+
46+
# pylint: disable=W0632
def test_io(noise_batch, mock_col_map, mock_output):
    "Tests the output format of the activation interface for a known input."
    n_num_out = len(mock_col_map.get('numerical')[1])
    n_cat_out = len(mock_col_map.get('categorical')[1])
    assert mock_output.shape == (len(noise_batch), n_num_out + n_cat_out), "The output has wrong shape."
    num_part, cat_part = split(mock_output, [n_num_out], 1)
    # TanH output should be continuous; Gumbel-Softmax hard output should be one-hot.
    assert not isin(num_part, [0, 1]).all(), "The numerical block is not expected to contain 0 or 1."
    assert isin(cat_part, [0, 1]).all(), "The categorical block is expected to contain only 0 or 1."
    cat_i, cat_o = mock_col_map.get('categorical')
    # Recover each categorical feature's one-hot block width from the output names.
    widths = [len([col for col in cat_o if ''.join(col.split('_')[:-1]) == feat]) for feat in cat_i]
    cat_blocks = split(cat_part, cumsum(widths)[:-1], 1)
    # Each one-hot block must carry exactly one unit of mass per record.
    assert all(npsum(abs(block)) == noise_batch.shape[0] for block in cat_blocks), \
        "There are non one-hot encoded categorical blocks."
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
"Test suite for the Gumbel-Softmax layer implementation."
2+
import tensorflow as tf
3+
from numpy import amax, amin, isclose, ones
4+
from numpy import sum as npsum
5+
from pytest import fixture
6+
from tensorflow.keras import layers
7+
8+
from ydata_synthetic.utils.gumbel_softmax import GumbelSoftmaxLayer
9+
10+
11+
# pylint:disable=W0613
def custom_initializer(shape_list, dtype):
    "A constant weight initializer ensuring test reproducibility."
    weights = ones((5, 5))
    return tf.constant(weights, dtype=tf.dtypes.float32)
15+
16+
@fixture(name='rand_input')
def fixture_rand_input():
    "A random, reproducible, input for the mock model."
    sample = tf.random.normal([4, 5], seed=42)
    return tf.constant(sample)
20+
21+
def test_hard_sample_output_format(rand_input):
    """Tests that the hard output samples are in the expected formats.
    The hard sample should be returned as a one-hot tensor."""
    logits = layers.Dense(5, use_bias=False, kernel_initializer=custom_initializer)(rand_input)
    hard_sample, _ = GumbelSoftmaxLayer()(logits)
    # One-hot rows: total mass equals the number of records, and each row has all zeros but one.
    assert npsum(hard_sample) == hard_sample.shape[0], "The sum of the hard samples should equal the number."
    assert all(npsum(hard_sample == 0, 1) == hard_sample.shape[1] - 1), "The hard samples is not a one-hot tensor."
28+
29+
def test_soft_sample_output_format(rand_input):
    """Tests that the soft output samples are in the expected formats.
    The soft sample should be returned as a probabilities tensor."""
    logits = layers.Dense(5, use_bias=False, kernel_initializer=custom_initializer)(rand_input)
    _, soft_sample = GumbelSoftmaxLayer(tau=0.5)(logits)
    # Each row is a probability distribution: sums to ~1 and every entry lies in [0, 1].
    assert isclose(npsum(soft_sample), soft_sample.shape[0]), \
        "The sum of the soft samples should be close to the number of records."
    assert amax(soft_sample) <= 1, "Invalid probability values found."
    assert amin(soft_sample) >= 0, "Invalid probability values found."
38+
39+
def test_gradients(rand_input):
    "Performs basic numerical assertions on the gradients of the soft/hard samples."
    def sampler(tensor):
        # Affine projection followed by the Gumbel-Softmax layer under test.
        dense = layers.Dense(5, use_bias=False, kernel_initializer=custom_initializer)
        return GumbelSoftmaxLayer()(dense(tensor))

    with tf.GradientTape() as hard_tape:
        hard_tape.watch(rand_input)
        hard_sample, _ = sampler(rand_input)
    with tf.GradientTape() as soft_tape:
        soft_tape.watch(rand_input)
        _, soft_sample = sampler(rand_input)
    hard_grads = hard_tape.gradient(hard_sample, rand_input)
    soft_grads = soft_tape.gradient(soft_sample, rand_input)

    # The hard sample sits behind stop_gradient; only the soft sample should be differentiable.
    assert hard_grads is None, "The hard sample must not compute gradients."
    assert soft_grads is not None, "The soft sample is expected to compute gradients."
    assert npsum(abs(soft_grads)) != 0, "The soft sample is expected to have non-zero gradients."
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
"""Gumbel-Softmax layer implementation.
2+
Reference: https://arxiv.org/pdf/1611.04051.pdf"""
3+
from typing import Dict, List, Optional
4+
5+
# pylint: disable=E0401
6+
from tensorflow import (Tensor, TensorShape, concat, one_hot, split, squeeze,
7+
stop_gradient)
8+
from tensorflow.keras.layers import Activation, Layer
9+
from tensorflow.math import log
10+
from tensorflow.nn import softmax
11+
from tensorflow.random import categorical, uniform
12+
13+
TOL = 1e-20
14+
15+
16+
def gumbel_noise(shape: TensorShape) -> Tensor:
    """Create a single sample from the standard (loc = 0, scale = 1) Gumbel distribution.

    Uses the double negative-log transform of a uniform sample; TOL guards against log(0)."""
    uniform_sample = uniform(shape, seed=0)
    inner = -log(uniform_sample + TOL)
    return -log(inner + TOL)
20+
21+
22+
class GumbelSoftmaxLayer(Layer):
    "A Gumbel-Softmax layer implementation that should be stacked on top of a categorical feature logits."

    def __init__(self, tau: float = 0.2, name: Optional[str] = None):
        """Arguments:
            tau (float): Softmax temperature; lower values sharpen the soft sample.
            name (Optional[str]): Name of the layer."""
        super().__init__(name=name)
        self.tau = tau

    # pylint: disable=W0221, E1120
    def call(self, _input):
        """Computes Gumbel-Softmax for the logits output of a particular categorical feature."""
        # Perturb the logits with Gumbel noise, then apply a temperature-scaled softmax.
        perturbed = _input + gumbel_noise(_input.shape)
        soft_sample = softmax(perturbed / self.tau, -1)
        # Draw a category per record and one-hot encode it; stop_gradient keeps the
        # discrete sample out of the backward pass.
        drawn = categorical(log(soft_sample), 1)
        hard_sample = stop_gradient(squeeze(one_hot(drawn, _input.shape[-1]), 1))
        return hard_sample, soft_sample
36+
37+
38+
class ActivationInterface(Layer):
    """An interface layer connecting different parts of an incoming tensor to adequate activation functions.
    The tensor parts are qualified according to the passed processor object.
    Processed categorical features are sent to specific Gumbel-Softmax layers.
    Processed features of different kind are sent to a TanH activation.
    Finally all output parts are concatenated and returned in the same order.

    The parts of an incoming tensor are qualified by leveraging a data processor's in/out feature map.

    Example of how to get a col_map from a Data Processor ProcessorInfo attribute:
    >>> col_map = {k: [v.feat_names_in, v.feat_names_out] for k, v in ProcessorInfo._asdict().items() if v}"""

    def __init__(self, col_map: Dict[str, List[List[str]]], name: Optional[str] = None):
        """Arguments:
            col_map (Dict[str, List[List[str]]]): A map defining the processor pipelines input/output features.
            name (Optional[str]): Name of the layer"""
        # Pass the name as a keyword: Layer.__init__'s first positional parameter is
        # `trainable`, so `super().__init__(name)` would set trainable=name and drop the name.
        super().__init__(name=name)

        self.cat_names_i, cat_names_o = col_map.get("categorical", [[], []])
        num_names_i, num_names_o = col_map.get("numerical", [[], []])

        self._cat_lens = None
        self._num_lens = None

        if self.cat_names_i:  # Get the length of each processed categorical feature's output block
            # Output columns are named '<feature>_<category>'; strip the trailing category with
            # '_'.join (not ''.join) so feature names that themselves contain '_' still match.
            self._cat_lens = [len([col for col in cat_names_o
                                   if '_'.join(col.split('_')[:-1]) == cat_feat]) for cat_feat in self.cat_names_i]
        if num_names_i:  # Get the length of the numerical features output block
            self._num_lens = len(num_names_o)

    def call(self, _input):  # pylint: disable=W0221
        """Splits the incoming tensor into its numerical and categorical parts, activates each
        part adequately (TanH / Gumbel-Softmax) and concatenates the results in the same order."""
        num_cols, cat_cols = split(_input, [self._num_lens if self._num_lens else 0, -1], 1, name='split_num_cats')
        cat_cols = split(cat_cols, self._cat_lens if self._cat_lens else 1, 1, name='split_cats')

        num_cols = [Activation('tanh', name='num_cols_activation')(num_cols)] if self._num_lens else []
        # Invoke the Gumbel-Softmax layers (not .call) so Keras builds and tracks them;
        # keep only the hard (one-hot) sample for each categorical block.
        cat_cols = [GumbelSoftmaxLayer(name=name)(col)[0] for name, col in zip(self.cat_names_i, cat_cols)] \
            if self._cat_lens else []
        return concat(num_cols + cat_cols, 1)

0 commit comments

Comments
 (0)