feat: Add new regular data model synthesizer CWGANGP (#153)

jfsantos-ds · fabiana · commit 8d1311d58e3c · 2022-04-19T10:28:44.000+01:00
* CWGANGP

* fix column order in cgan sample
diff --git a/examples/regular/cgan_example.py b/examples/regular/cgan_example.py
@@ -77,4 +77,4 @@
 #Sampling from the synthesizer
 cond_array = np.array([0])
 # Synthesizer samples are returned in the original format (inverse_transform of internal processing already took place)
-synthesizer = synthesizer.sample(cond_array, 1000)
+sample = synthesizer.sample(cond_array, 1000)
diff --git a/examples/regular/cwgangp_example.py b/examples/regular/cwgangp_example.py
@@ -0,0 +1,80 @@
+from ydata_synthetic.synthesizers.regular import CWGANGP
+from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
+
+import pandas as pd
+import numpy as np
+from sklearn import cluster
+
+model = CWGANGP
+
+#Read the original data and have it preprocessed
+data = pd.read_csv('data/creditcard.csv', index_col=[0])
+
+#List of columns different from the Class column
+num_cols = list(data.columns[ data.columns != 'Class' ])
+cat_cols = []  # Condition features are not preprocessed and therefore not listed here
+
+print('Dataset columns: {}'.format(num_cols))
+sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class']
+data = data[ sorted_cols ].copy()
+
+#For the purpose of this example we will only synthesize the minority class
+train_data = data.loc[ data['Class']==1 ].copy()
+
+#Create a new class column using KMeans - This will mainly be useful if we want to leverage conditional WGANGP
+print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1]))
+algorithm = cluster.KMeans
+args, kwds = (), {'n_clusters':2, 'random_state':0}
+labels = algorithm(*args, **kwds).fit_predict(train_data[ num_cols ])
+
+print( pd.DataFrame( [ [np.sum(labels==i)] for i in np.unique(labels) ], columns=['count'], index=np.unique(labels) ) )
+
+fraud_w_classes = train_data.copy()
+fraud_w_classes['Class'] = labels
+
+#----------------------------
+#    GAN Training
+#----------------------------
+
+#Define the Conditional WGANGP and training parameters
+noise_dim = 32
+dim = 128
+batch_size = 128
+beta_1 = 0.5
+beta_2 = 0.9
+
+log_step = 100
+epochs = 300 + 1
+learning_rate = 5e-4
+models_dir = './cache'
+
+#Bundle the hyperparameters defined above into the model and training parameter objects
+gan_args = ModelParameters(batch_size=batch_size,
+                           lr=learning_rate,
+                           betas=(beta_1, beta_2),
+                           noise_dim=noise_dim,
+                           layers_dim=dim)
+
+train_args = TrainParameters(epochs=epochs,
+                             cache_prefix='',
+                             sample_interval=log_step,
+                             label_dim=-1,
+                             labels=(0,1))
+
+#Init the Conditional WGANGP providing the number of classes and the n_critic setting as arguments
+synthesizer = model(model_parameters=gan_args, num_classes=2, n_critic=3)
+
+#Training the Conditional WGANGP
+synthesizer.train(data=fraud_w_classes, label_col="Class", train_arguments=train_args,
+                  num_cols=num_cols, cat_cols=cat_cols)
+
+#Saving the synthesizer
+synthesizer.save('cwgangp_synthtrained.pkl')
+
+#Loading the synthesizer
+synthesizer = model.load('cwgangp_synthtrained.pkl')
+
+#Sampling from the synthesizer
+cond_array = np.array([0])
+# Synthesizer samples are returned in the original format (inverse_transform of internal processing already took place)
+sample = synthesizer.sample(cond_array, 1000)
diff --git a/src/ydata_synthetic/preprocessing/base_processor.py b/src/ydata_synthetic/preprocessing/base_processor.py
@@ -2,17 +2,15 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from collections import namedtuple
+from types import SimpleNamespace
 from typing import List, Optional
 
-from numpy import concatenate, ndarray, split, zeros
-from pandas import DataFrame, Series, concat
+from numpy import ndarray
+from pandas import DataFrame, Series
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.exceptions import NotFittedError
 from typeguard import typechecked
 
-ProcessorInfo = namedtuple("ProcessorInfo", ["numerical", "categorical"])
-PipelineInfo = namedtuple("PipelineInfo", ["feat_names_in", "feat_names_out"])
 
 # pylint: disable=R0902
 @typechecked
@@ -50,23 +48,25 @@ def types(self) -> Series:
         return self._types
 
     @property
-    def col_transform_info(self) -> ProcessorInfo:
+    def col_transform_info(self) -> SimpleNamespace:
         """Returns a ProcessorInfo object specifying input/output feature mappings of this processor's pipelines."""
         self._check_is_fitted()
         if self._col_transform_info is None:
             self._col_transform_info = self.__create_metadata_synth()
         return self._col_transform_info
 
-    def __create_metadata_synth(self):
-        num_info = PipelineInfo([], [])
-        cat_info = PipelineInfo([], [])
-        # Numerical ls named tuple
+    def __create_metadata_synth(self) -> SimpleNamespace:
+        def new_pipeline_info(feat_in, feat_out):
+            return SimpleNamespace(feat_names_in = feat_in, feat_names_out = feat_out)
         if self.num_cols:
-            num_info = PipelineInfo(self.num_pipeline.feature_names_in_, self.num_pipeline.get_feature_names_out())
-        # Categorical ls named tuple
+            num_info = new_pipeline_info(self.num_pipeline.feature_names_in_, self.num_pipeline.get_feature_names_out())
+        else:
+            num_info = new_pipeline_info([], [])
         if self.cat_cols:
-            cat_info = PipelineInfo(self.cat_pipeline.feature_names_in_, self.cat_pipeline.get_feature_names_out())
-        return ProcessorInfo(num_info, cat_info)
+            cat_info = new_pipeline_info(self.cat_pipeline.feature_names_in_, self.cat_pipeline.get_feature_names_out())
+        else:
+            cat_info = new_pipeline_info([], [])
+        return SimpleNamespace(numerical=num_info, categorical=cat_info)
 
     def _check_is_fitted(self):
         """Checks if the processor is fitted by testing the numerical pipeline.
diff --git a/src/ydata_synthetic/preprocessing/regular/processor.py b/src/ydata_synthetic/preprocessing/regular/processor.py
@@ -21,6 +21,7 @@ class RegularModels(Enum):
     GAN = 'VanillaGAN'
     WGAN = 'WGAN'
     WGAN_GP = 'WGAN_GP'
+    CWGAN_GP = 'CWGAN_GP'
 
 
 @typechecked
diff --git a/src/ydata_synthetic/synthesizers/gan.py b/src/ydata_synthetic/synthesizers/gan.py
@@ -131,7 +131,7 @@ def sample(self, n_samples: int):
     def save(self, path):
         "Saves the pickled synthesizer instance in the given path."
         #Save only the generator?
-        if self.__MODEL__=='WGAN' or self.__MODEL__=='WGAN_GP':
+        if self.__MODEL__=='WGAN' or self.__MODEL__=='WGAN_GP' or self.__MODEL__=='CWGAN_GP':
             del self.critic
         make_keras_picklable()
         dump(self, path)
diff --git a/src/ydata_synthetic/synthesizers/regular/__init__.py b/src/ydata_synthetic/synthesizers/regular/__init__.py
@@ -4,12 +4,14 @@
 from ydata_synthetic.synthesizers.regular.wgangp.model import WGAN_GP
 from ydata_synthetic.synthesizers.regular.dragan.model import DRAGAN
 from ydata_synthetic.synthesizers.regular.cramergan.model import CRAMERGAN
+from ydata_synthetic.synthesizers.regular.cwgangp.model import CWGANGP
 
 __all__ = [
     "VanilllaGAN",
     "CGAN",
     "WGAN",
     "WGAN_GP",
     "DRAGAN",
-    "CRAMERGAN"
+    "CRAMERGAN",
+    "CWGANGP"
 ]
diff --git a/src/ydata_synthetic/synthesizers/regular/cgan/model.py b/src/ydata_synthetic/synthesizers/regular/cgan/model.py
@@ -1,7 +1,7 @@
 """CGAN implementation"""
 import os
 from os import path
-from typing import List, Tuple, Union, Optional, NamedTuple
+from typing import List, Optional, NamedTuple
 
 import numpy as np
 from numpy import array, empty, hstack, ndarray, vstack, save
@@ -30,6 +30,7 @@ class CGAN(BaseModel):
     def __init__(self, model_parameters, num_classes):
         self.num_classes = num_classes
         self._label_col = None
+        self._col_order = None
         super().__init__(model_parameters)
 
     @property
@@ -38,18 +39,9 @@ def label_col(self) -> str:
         return self._label_col
 
     @label_col.setter
-    def label_col(self, data_label: Tuple[Union[DataFrame, array], str]):
-        "Validates the label_col format, raises ValueError if invalid."
-        data, label_col = data_label
-        assert label_col in data.columns, f"The column {label_col} could not be found on the provided dataset and \
-            cannot be used as condition."
-        assert data[label_col].isna().sum() == 0, "The label column contains NaN values, please impute or drop the \
-            respective records before proceeding."
-        assert is_float_dtype(data[label_col]) or is_integer_dtype(data[label_col]), "The label column is expected to be an \
-            integer or a float dtype to ensure the function of the embedding layer."
-        unique_frac = data[label_col].nunique()/len(data.index)
-        assert unique_frac < 1, "The provided column {label_col} is constituted by unique values and is not suitable \
-            to be used as condition."
+    def label_col(self, label_col: str):
+        """Set the label_col property."""
+        self._label_col = label_col
 
     def define_gan(self, activation_info: Optional[NamedTuple] = None):
         self.generator = Generator(self.batch_size, self.num_classes). \
@@ -103,18 +95,20 @@ def get_data_batch(self, data, batch_size, seed=0):
         data_ix = np.random.choice(data.shape[0], replace=False, size=len(data))  # wasteful to shuffle every time
         return data[data_ix[start_i: stop_i]]
 
-    def train(self, data: Union[DataFrame, array], label_col: str, train_arguments: TrainParameters, num_cols: List[str],
+    def train(self, data: DataFrame, label_col: str, train_arguments: TrainParameters, num_cols: List[str],
               cat_cols: List[str]):
         """
         Args:
-            data: A pandas DataFrame or a Numpy array with the data to be synthesized
+            data: A pandas DataFrame with the data to be synthesized
             label: The name of the column to be used as a label and condition for the training
             train_arguments: GAN training arguments.
             num_cols: List of columns of the data object to be handled as numerical
             cat_cols: List of columns of the data object to be handled as categorical
         """
         # Validating the label column
-        self.label_col = (data, label_col)
+        self._validate_label_col(data, label_col)
+        self._col_order = data.columns
+        self.label_col = label_col
 
         # Separating labels from the rest of the data to fit the data processor
         data, label = data.loc[:, data.columns != label_col], expand_dims(data[label_col], 1)
@@ -182,15 +176,28 @@ def sample(self, condition: ndarray, n_samples: int,) -> ndarray:
         steps = n_samples // self.batch_size + 1
         data = []
         z_dist = self.get_batch_noise()
-        condition = expand_dims(convert_to_tensor(condition, dtypes.float32), axis=0)
-        cond_seq = tile(condition, multiples=[self.batch_size, 1])
+        cond_seq = expand_dims(convert_to_tensor(condition, dtypes.float32), axis=0)
+        cond_seq = tile(cond_seq, multiples=[self.batch_size, 1])
         for _ in trange(steps, desc='Synthetic data generation'):
             records = empty(shape=(self.batch_size, self.data_dim))
             records = self.generator([next(z_dist), cond_seq], training=False)
             data.append(records)
         data = self.processor.inverse_transform(array(vstack(data)))
-        data[self.label_col] = tile(condition, multiples=[data.shape[0], 1])
-        return data
+        data[self.label_col] = condition[0]
+        return data[self._col_order]
+
+    @staticmethod
+    def _validate_label_col(data: DataFrame, label_col: str):
+        "Checks that label_col can serve as the condition column of data; raises AssertionError on the first failed check."
+        assert label_col in data.columns, f"The column {label_col} could not be found on the provided dataset and \
+            cannot be used as condition."
+        assert data[label_col].isna().sum() == 0, "The label column contains NaN values, please impute or drop the \
+            respective records before proceeding."
+        assert is_float_dtype(data[label_col]) or is_integer_dtype(data[label_col]), "The label column is expected to be an \
+            integer or a float dtype to ensure the function of the embedding layer."
+        unique_frac = data[label_col].nunique()/len(data.index)  # share of distinct label values among the rows
+        assert unique_frac < 1, f"The provided column {label_col} is constituted by unique values and is not suitable \
+            to be used as condition."
 
 
 # pylint: disable=R0903
diff --git a/src/ydata_synthetic/synthesizers/regular/cwgangp/__init__.py b/src/ydata_synthetic/synthesizers/regular/cwgangp/__init__.py
diff --git a/src/ydata_synthetic/synthesizers/regular/cwgangp/model.py b/src/ydata_synthetic/synthesizers/regular/cwgangp/model.py
diff --git a/src/ydata_synthetic/synthesizers/regular/wgangp/model.py b/src/ydata_synthetic/synthesizers/regular/wgangp/model.py