Skip to content

Commit c618d90

Browse files
authored
feat: Add inverse transformations to supported datasets (#104)
1 parent a9de1ab commit c618d90

6 files changed

Lines changed: 75 additions & 25 deletions

File tree

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
pandas==1.2.*
22
numpy==1.19.*
3-
scikit-learn==0.22.*
3+
scikit-learn==1.0.*
44
matplotlib==3.3.2
55
seaborn==0.11.*
66
tensorflow==2.4.*
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
# Inverts all preprocessing pipelines provided in the preprocessing examples
from typing import Union

import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PowerTransformer, OneHotEncoder, StandardScaler


def inverse_transform(data: pd.DataFrame,
                      processor: Union[Pipeline, ColumnTransformer, PowerTransformer,
                                       OneHotEncoder, StandardScaler]) -> Union[pd.DataFrame, None]:
    """Inverts data transformations taking place in a standard sklearn processor.

    Supported processors are sklearn Pipelines, ColumnTransformers or base
    estimators like StandardScaler, PowerTransformer and OneHotEncoder.

    Args:
        data (pd.DataFrame): The data object that needs inversion of preprocessing.
        processor (Union[Pipeline, ColumnTransformer, PowerTransformer, OneHotEncoder, StandardScaler]):
            The (already fitted) processor that was applied on the original data.

    Returns:
        inv_data (pd.DataFrame): The data after inverting preprocessing, reordered
            to the processor's original input columns (``feature_names_in_``);
            ``None`` when the processor type is not supported.

    Raises:
        TypeError: If a ColumnTransformer is given non-DataFrame data.
    """
    inv_data = data.copy()
    if isinstance(processor, (PowerTransformer, OneHotEncoder, StandardScaler, Pipeline)):
        # Single estimator or whole pipeline: sklearn can invert it in one call.
        inv_data = pd.DataFrame(processor.inverse_transform(data),
                                columns=processor.feature_names_in_)
    elif isinstance(processor, ColumnTransformer):
        output_indices = processor.output_indices_
        # Explicit raise instead of `assert`: asserts are stripped under `python -O`,
        # which would silently skip this validation.
        if not isinstance(data, pd.DataFrame):
            raise TypeError("The data to be inverted from a ColumnTransformer has to be a Pandas DataFrame.")
        # Walk the fitted transformers in reverse order, inverting each column group.
        for t_name, t, t_cols in processor.transformers_[::-1]:
            slice_ = output_indices[t_name]
            t_indices = list(range(slice_.start, slice_.stop,
                                   1 if slice_.step is None else slice_.step))
            if t == 'drop':
                # Dropped columns cannot be recovered.
                continue
            elif t == 'passthrough':
                # Columns were forwarded untouched; copy them back as-is.
                inv_cols = pd.DataFrame(data.iloc[:, t_indices].values,
                                        columns=t_cols, index=data.index)
                inv_col_names = inv_cols.columns
            else:
                inv_cols = pd.DataFrame(t.inverse_transform(data.iloc[:, t_indices].values),
                                        columns=t_cols, index=data.index)
                inv_col_names = inv_cols.columns
            if set(inv_col_names).issubset(set(inv_data.columns)):
                # Columns already exist in the output frame: overwrite in place.
                inv_data[inv_col_names] = inv_cols[inv_col_names]
            else:
                inv_data = pd.concat([inv_data, inv_cols], axis=1)
    else:
        print('The provided data processor is not supported and cannot be inverted with this method.')
        return None
    # Restore the column order the processor was originally fitted on.
    return inv_data[processor.feature_names_in_]

src/ydata_synthetic/preprocessing/regular/adult.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,26 +9,28 @@
def transformations():
    """Fetch the PMLB 'adult' dataset and preprocess it with a ColumnTransformer.

    Numerical features are standard-scaled and categorical features one-hot
    encoded; the remaining columns are passed through unchanged so the whole
    transformation can later be inverted column-by-column.

    Returns:
        data (pd.DataFrame): the raw dataset as fetched from PMLB.
        processed_data (pd.DataFrame): the transformed dataset (sparse-backed).
        preprocessor (ColumnTransformer): the fitted processor, kept for inversion.
    """
    data = fetch_data('adult')

    numerical_features = ['age', 'fnlwgt',
                          'capital-gain', 'capital-loss',
                          'hours-per-week']
    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())])

    categorical_features = ['workclass', 'education', 'marital-status',
                            'occupation', 'relationship',
                            'race', 'sex']
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    # Untouched columns are declared explicitly (instead of being dropped) so
    # the inverse transformation can reconstruct the full original frame.
    remaining_features = ['education-num', 'native-country', 'target']
    remaining_transformer = 'passthrough'

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features),
            ('remaining', remaining_transformer, remaining_features)])

    # One-hot output is sparse; keep it sparse-backed in the DataFrame.
    processed_data = pd.DataFrame.sparse.from_spmatrix(preprocessor.fit_transform(data))

    return data, processed_data, preprocessor
src/ydata_synthetic/preprocessing/regular/breast_cancer_wisconsin.py

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,22 +6,18 @@
from pmlb import fetch_data


def transformations():
    """Fetch the PMLB 'breast_cancer_wisconsin' dataset and standard-scale it.

    Returns:
        data (pd.DataFrame): the raw dataset as fetched from PMLB.
        processed_data (pd.DataFrame): the standard-scaled dataset.
        scaler (StandardScaler): the fitted scaler, kept so the
            transformation can be inverted later.
    """
    data = fetch_data('breast_cancer_wisconsin')

    scaler = StandardScaler()
    processed_data = scaler.fit_transform(data)
    processed_data = pd.DataFrame(processed_data)

    return data, processed_data, scaler


if __name__ == '__main__':
    data = transformations()
    print(data)
src/ydata_synthetic/preprocessing/regular/cardiovascular.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def transformations(data):
1919
transformers=[
2020
('num', numerical_transformer, numerical_features),
2121
('cat', categorical_transformer, categorical_features)])
22-
22+
2323
processed_data = preprocessor.fit_transform(data)
2424
processed_data = pd.DataFrame.sparse.from_spmatrix(preprocessor.fit_transform(processed_data))
25-
return processed_data, preprocessor
25+
return data, processed_data, preprocessor
Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,23 @@
# Data transformations to be applied
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

from sklearn.preprocessing import PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


def transformations(data):
    """Apply a Yeo-Johnson power transformation to every non-label column.

    All columns except the 'Class' label are transformed (the previous comment
    claiming only 'Amount' is log-transformed was stale).

    Args:
        data (pd.DataFrame): the raw dataset; the 'Class' column is left untouched.

    Returns:
        data (pd.DataFrame): the unmodified input dataset.
        processed_data (pd.DataFrame): a copy with the feature columns power-transformed.
        preprocessor (ColumnTransformer): the fitted processor, kept for inversion.
    """
    # Work on a copy so the raw input frame is returned unchanged.
    processed_data = data.copy()
    data_cols = list(data.columns[data.columns != 'Class'])

    data_transformer = Pipeline(steps=[
        ('PowerTransformer', PowerTransformer(method='yeo-johnson', standardize=True, copy=True))])

    preprocessor = ColumnTransformer(
        transformers=[('power', data_transformer, data_cols)])
    processed_data[data_cols] = preprocessor.fit_transform(data[data_cols])

    return data, processed_data, preprocessor

0 commit comments

Comments
 (0)