|
| 1 | +# Inverts all preprocessing pipelines provided in the preprocessing examples |
| 2 | +from typing import Union |
| 3 | + |
| 4 | +import pandas as pd |
| 5 | + |
| 6 | +from sklearn.pipeline import Pipeline |
| 7 | +from sklearn.compose import ColumnTransformer |
| 8 | +from sklearn.preprocessing import PowerTransformer, OneHotEncoder, StandardScaler |
| 9 | + |
| 10 | + |
def _resolve_input_columns(columns, feature_names: list) -> list:
    """Translate one ColumnTransformer column specifier into input column names.

    Args:
        columns: The column specifier stored in ``ColumnTransformer.transformers_``:
            a single name or position, a slice, a boolean mask, or a list of names/positions.
        feature_names (list): The input column names seen at fit time, in order.

    Returns:
        list: The names of the input columns selected by ``columns``."""
    if isinstance(columns, str):
        return [columns]
    if pd.api.types.is_integer(columns):
        return [feature_names[columns]]
    if isinstance(columns, slice):
        if isinstance(columns.start, str) or isinstance(columns.stop, str):
            # Label slices are converted to the equivalent positional slice first.
            columns = pd.Index(feature_names).slice_indexer(columns.start, columns.stop, columns.step)
        return feature_names[columns]
    selected = list(columns)
    if all(pd.api.types.is_bool(c) for c in selected):
        return [name for name, keep in zip(feature_names, selected) if keep]
    if all(pd.api.types.is_integer(c) for c in selected):
        return [feature_names[i] for i in selected]
    return selected


def inverse_transform(data: pd.DataFrame, processor: Union[Pipeline, ColumnTransformer, PowerTransformer, OneHotEncoder, StandardScaler]) -> Union[pd.DataFrame, None]:
    """Inverts data transformations taking place in a standard sklearn processor.
    Supported processors are sklearn pipelines, column transformers and the base
    estimators PowerTransformer, OneHotEncoder and StandardScaler.

    Args:
        data (pd.DataFrame): The transformed data that needs inversion of preprocessing.
            Its columns must be positioned as the processor emitted them.
        processor (Union[Pipeline, ColumnTransformer, PowerTransformer, OneHotEncoder, StandardScaler]):
            The fitted processor that was applied on the original data.

    Returns:
        inv_data (pd.DataFrame): The data after inverting preprocessing, with columns in the
            order of ``processor.feature_names_in_`` and the row index of ``data``. For a
            ColumnTransformer, columns that were dropped cannot be recovered and are omitted.
            ``None`` is returned (after printing a message) when the processor type is not supported.

    Raises:
        AssertionError: If ``processor`` is a ColumnTransformer and ``data`` is not a DataFrame."""
    if isinstance(processor, (PowerTransformer, OneHotEncoder, StandardScaler, Pipeline)):
        # Keep the caller's row index when there is one; array-like input falls back to a RangeIndex.
        return pd.DataFrame(
            processor.inverse_transform(data),
            columns=processor.feature_names_in_,
            index=getattr(data, "index", None),
        )

    if not isinstance(processor, ColumnTransformer):
        print('The provided data processor is not supported and cannot be inverted with this method.')
        return None

    if not isinstance(data, pd.DataFrame):
        # Raised explicitly (instead of `assert`) so the check survives `python -O`,
        # while callers still see the same exception type as before.
        raise AssertionError("The data to be inverted from a ColumnTransformer has to be a Pandas DataFrame.")

    feature_names = list(processor.feature_names_in_)
    output_indices = processor.output_indices_
    restored = {}
    for t_name, transformer, t_cols in processor.transformers_:
        out_slice = output_indices[t_name]
        if isinstance(transformer, str) and transformer == 'drop':
            continue
        if out_slice.start == out_slice.stop:
            # No output columns: nothing to invert. Transformers with an empty
            # selection are not fitted, so calling them would fail.
            continue
        transformed_values = data.iloc[:, out_slice].values
        if isinstance(transformer, str) and transformer == 'passthrough':
            inv_values = transformed_values
        else:
            inv_values = transformer.inverse_transform(transformed_values)
        inv_cols = pd.DataFrame(inv_values, columns=_resolve_input_columns(t_cols, feature_names), index=data.index)
        for col in inv_cols.columns:
            # When several transformers consumed the same input column, the first listed one wins.
            if col not in restored:
                restored[col] = inv_cols[col].values

    recovered = [name for name in feature_names if name in restored]
    return pd.DataFrame(restored, index=data.index, columns=recovered)
0 commit comments