feat: update to python 3.10, update examples (#223)

aquemy · web-flow · commit bd20953a865b · 2023-01-16T14:17:15.000+01:00
* feat: update to python 3.10, update examples

* feat: add CWGANGP example

* chore: remove unused imports

* chore: remove trailing whitespace
diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 ![](https://img.shields.io/github/workflow/status/ydataai/ydata-synthetic/prerelease)
 ![](https://img.shields.io/pypi/status/ydata-synthetic)
 [![](https://pepy.tech/badge/ydata-synthetic)](https://pypi.org/project/ydata-synthetic/)
-![](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-blue)
+![](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10-blue)
 [![](https://img.shields.io/pypi/v/ydata-synthetic)](https://pypi.org/project/ydata-synthetic/)
 ![](https://img.shields.io/github/license/ydataai/ydata-synthetic)
 
diff --git a/examples/regular/models/adult_dragan.py b/examples/regular/models/adult_dragan.py
@@ -11,7 +11,6 @@
 
 # DRAGAN training
 #Defining the training parameters of DRAGAN
-
 noise_dim = 128
 dim = 128
 batch_size = 500
@@ -35,10 +34,10 @@
 synth = RegularSynthesizer(modelname='dragan', model_parameters=gan_args, n_discriminator=3)
 synth.fit(data = data, train_arguments = train_args, num_cols = num_cols, cat_cols = cat_cols)
 
-synth.save('adult_synth.pkl')
+synth.save('adult_dragan_model.pkl')
 
 #########################################################
 #    Loading and sampling from a trained synthesizer    #
 #########################################################
-synthesizer = RegularSynthesizer.load('adult_synth.pkl')
+synthesizer = RegularSynthesizer.load('adult_dragan_model.pkl')
 synthesizer.sample(1000)
diff --git a/examples/regular/models/adult_wgangp.py b/examples/regular/models/adult_wgangp.py
@@ -1,18 +1,15 @@
 from pmlb import fetch_data
 
-from ydata_synthetic.synthesizers.regular import WGAN_GP
+from ydata_synthetic.synthesizers.regular import RegularSynthesizer
 from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
 
-model = WGAN_GP
-
 #Load data and define the data processor parameters
 data = fetch_data('adult')
 num_cols = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
 cat_cols = ['workclass','education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
             'native-country', 'target']
 
 #Defining the training parameters
-
 noise_dim = 128
 dim = 128
 batch_size = 500
@@ -33,10 +30,13 @@
 train_args = TrainParameters(epochs=epochs,
                              sample_interval=log_step)
 
-synthesizer = model(gan_args, n_critic=2)
-synthesizer.train(data, train_args, num_cols, cat_cols)
+synth = RegularSynthesizer(modelname='wgangp', model_parameters=gan_args, n_critic=2)
+synth.fit(data, train_args, num_cols, cat_cols)
 
-synthesizer.save('test.pkl')
+synth.save('adult_wgangp_model.pkl')
 
-synthesizer = model.load('test.pkl')
-synth_data = synthesizer.sample(1000)
+#########################################################
+#    Loading and sampling from a trained synthesizer    #
+#########################################################
+synth = RegularSynthesizer.load('adult_wgangp_model.pkl')
+synth_data = synth.sample(1000)
diff --git a/examples/regular/models/creditcard_cgan.py b/examples/regular/models/creditcard_cgan.py
@@ -1,35 +1,37 @@
 """
     CGAN architecture example file
 """
-from ydata_synthetic.synthesizers.regular import RegularSynthesizer
-from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
-
 import pandas as pd
-import numpy as np
 from sklearn import cluster
 
+from ydata_synthetic.utils.cache import cache_file
+from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
+from ydata_synthetic.synthesizers.regular import RegularSynthesizer
+
 #Read the original data and have it preprocessed
-data = pd.read_csv('../../data/creditcard.csv', index_col=[0])
+data_path = cache_file('creditcard.csv', 'https://datahub.io/machine-learning/creditcard/r/creditcard.csv')
+data = pd.read_csv(data_path, index_col=[0])
 
-#List of columns different from the Class column
+#Data processing and analysis
 num_cols = list(data.columns[ data.columns != 'Class' ])
-cat_cols = []  # Condition features are not preprocessed and therefore not listed here
+cat_cols = []
 
 print('Dataset columns: {}'.format(num_cols))
-sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class']
-data = data[ sorted_cols ].copy()
+sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19',
+                'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15',
+                'V9', 'V23', 'Class']
+processed_data = data[ sorted_cols ].copy()
+processed_data['Class'] = processed_data['Class'].apply(lambda x: 1 if x == "'1'" else 0)
 
 #For the purpose of this example we will only synthesize the minority class
-train_data = data.loc[ data['Class']==1 ].copy()
+train_data = processed_data.loc[processed_data['Class'] == 1].copy()
 
 #Create a new class column using KMeans - This will mainly be useful if we want to leverage conditional GAN
 print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1]))
 algorithm = cluster.KMeans
 args, kwds = (), {'n_clusters':2, 'random_state':0}
 labels = algorithm(*args, **kwds).fit_predict(train_data[ num_cols ])
 
-print( pd.DataFrame( [ [np.sum(labels==i)] for i in np.unique(labels) ], columns=['count'], index=np.unique(labels) ) )
-
 fraud_w_classes = train_data.copy()
 fraud_w_classes['Class'] = labels
 
@@ -72,10 +74,10 @@
 synth.fit(data=fraud_w_classes, label_cols=["Class"], train_arguments=train_args, num_cols=num_cols, cat_cols=cat_cols)
 
 #Saving the synthesizer
-synth.save('cgan_synthtrained.pkl')
+synth.save('creditcard_cgan_model.pkl')
 
 #Loading the synthesizer
-synthesizer = RegularSynthesizer.load('cgan_synthtrained.pkl')
+synthesizer = RegularSynthesizer.load('creditcard_cgan_model.pkl')
 
 #Sampling from the synthesizer
 cond_array = pd.DataFrame(100*[1], columns=['Class'])
diff --git a/examples/regular/models/creditcard_cramergan.py b/examples/regular/models/creditcard_cramergan.py
@@ -7,22 +7,25 @@
 import numpy as np
 import pandas as pd
 
+from ydata_synthetic.utils.cache import cache_file
 from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
 from ydata_synthetic.synthesizers.regular import RegularSynthesizer
 
 #Read the original data and have it preprocessed
-data = pd.read_csv('../../../data/creditcard.csv', index_col=[0])
+data_path = cache_file('creditcard.csv', 'https://datahub.io/machine-learning/creditcard/r/creditcard.csv')
+data = pd.read_csv(data_path, index_col=[0])
 
-#List of columns different from the Class column
+#Data processing and analysis
 num_cols = list(data.columns[ data.columns != 'Class' ])
 cat_cols = ['Class']
 
 print('Dataset columns: {}'.format(num_cols))
 sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class']
-data = data[ sorted_cols ].copy()
+processed_data = data[ sorted_cols ].copy()
+processed_data['Class'] = processed_data['Class'].apply(lambda x: 1 if x == "'1'" else 0)
 
 #For the purpose of this example we will only synthesize the minority class
-train_data = data.loc[ data['Class']==1 ].copy()
+train_data = processed_data.loc[processed_data['Class'] == 1].copy()
 
 #Create a new class column using KMeans - This will mainly be useful if we want to leverage conditional GAN
 print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1]))
@@ -62,12 +65,12 @@
 synth.fit(data=train_data, train_arguments = train_args, num_cols = num_cols, cat_cols = cat_cols)
 
 #Saving the synthesizer to later generate new events
-synth.save(path='cramergan_creditcard.pkl')
+synth.save(path='creditcard_cramergan_model.pkl')
 
 #########################################################
 #    Loading and sampling from a trained synthesizer    #
 #########################################################
-synth = RegularSynthesizer.load(path='cramergan_creditcard.pkl')
+synth = RegularSynthesizer.load(path='creditcard_cramergan_model.pkl')
 #Sampling the data
 #Note that the data returned it is not inverse processed.
 data_sample = synth.sample(100000)
diff --git a/examples/regular/models/creditcard_cwgangp.py b/examples/regular/models/creditcard_cwgangp.py
@@ -1,32 +1,33 @@
-from ydata_synthetic.synthesizers.regular import RegularSynthesizer
-from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
-
 import pandas as pd
 import numpy as np
 from sklearn import cluster
 
+from ydata_synthetic.utils.cache import cache_file
+from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
+from ydata_synthetic.synthesizers.regular import RegularSynthesizer
+
 #Read the original data and have it preprocessed
-data = pd.read_csv('../../data/creditcard.csv', index_col=[0])
+data_path = cache_file('creditcard.csv', 'https://datahub.io/machine-learning/creditcard/r/creditcard.csv')
+data = pd.read_csv(data_path, index_col=[0])
 
-#List of columns different from the Class column
-num_cols = list(data.columns[~data.columns.isin(['Class', 'Amount'])])
-cat_cols = []  # Condition features are not preprocessed and therefore not listed here
+#Data processing and analysis
+num_cols = list(data.columns[ data.columns != 'Class' ])
+cat_cols = []  # 'Class' is passed to fit() as a label (condition) column, so it is not listed here
 
 print('Dataset columns: {}'.format(num_cols))
 sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class']
-data = data[ sorted_cols ].copy()
+processed_data = data[ sorted_cols ].copy()
+processed_data['Class'] = processed_data['Class'].apply(lambda x: 1 if x == "'1'" else 0)
 
 #For the purpose of this example we will only synthesize the minority class
-train_data = data.loc[ data['Class']==1 ].copy()
+train_data = processed_data.loc[processed_data['Class'] == 1].copy()
 
 #Create a new class column using KMeans - This will mainly be useful if we want to leverage conditional WGANGP
 print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1]))
 algorithm = cluster.KMeans
 args, kwds = (), {'n_clusters':2, 'random_state':0}
 labels = algorithm(*args, **kwds).fit_predict(train_data[ num_cols ])
 
-print( pd.DataFrame( [ [np.sum(labels==i)] for i in np.unique(labels) ], columns=['count'], index=np.unique(labels) ) )
-
 fraud_w_classes = train_data.copy()
 fraud_w_classes['Class'] = labels
 
@@ -66,16 +67,16 @@
 synth = RegularSynthesizer(modelname='cwgangp', model_parameters=gan_args, n_critic=5)
 
 #Fitting the synthesizer
-synth.fit(data=fraud_w_classes, label_cols=["Class", "Amount"], train_arguments=train_args,
+synth.fit(data=fraud_w_classes, label_cols=["Class"], train_arguments=train_args,
                   num_cols=num_cols, cat_cols=cat_cols)
 
-synth.save('.model.pkl')
+synth.save('creditcard_cwgangp_model.pkl')
 
 #########################################################
 #    Loading and sampling from a trained synthesizer    #
 #########################################################
-new_synth = RegularSynthesizer.load('.model.pkl')
+new_synth = RegularSynthesizer.load('creditcard_cwgangp_model.pkl')
 
 sample_len = 2000
-cond_array = fraud_w_classes[["Class", "Amount"]]
+cond_array = fraud_w_classes[["Class"]]
 new_synth.sample(cond_array)
diff --git a/examples/regular/models/creditcard_wgan.py b/examples/regular/models/creditcard_wgan.py
@@ -4,11 +4,13 @@
 import pandas as pd
 import numpy as np
 
+from ydata_synthetic.utils.cache import cache_file
 from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
 from ydata_synthetic.synthesizers.regular import RegularSynthesizer
 
 #Read the original data and have it preprocessed
-data = pd.read_csv('../../../data/creditcard.csv', index_col=[0])
+data_path = cache_file('creditcard.csv', 'https://datahub.io/machine-learning/creditcard/r/creditcard.csv')
+data = pd.read_csv(data_path, index_col=[0])
 
 #Data processing and analysis
 num_cols = list(data.columns[ data.columns != 'Class' ])
@@ -17,9 +19,10 @@
 print('Dataset columns: {}'.format(num_cols))
 sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class']
 processed_data = data[ sorted_cols ].copy()
+processed_data['Class'] = processed_data['Class'].apply(lambda x: 1 if x == "'1'" else 0)
 
 #For the purpose of this example we will only synthesize the minority class
-train_data = data.loc[ data['Class']==1 ].copy()
+train_data = processed_data.loc[processed_data['Class'] == 1].copy()
 
 print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1]))
 algorithm = cluster.KMeans
@@ -61,12 +64,12 @@
 synth.fit(data=train_data, train_arguments = train_args, num_cols = num_cols, cat_cols = cat_cols)
 
 #Saving the synthesizer to later generate new events
-synth.save(path='models/wgan_creditcard.pkl')
+synth.save(path='creditcard_wgan_model.pkl')
 
 #########################################################
 #    Loading and sampling from a trained synthesizer    #
 #########################################################
-synth = RegularSynthesizer.load(path='models/wgan_creditcard.pkl')
+synth = RegularSynthesizer.load(path='creditcard_wgan_model.pkl')
 
 #Sampling the data
 data_sample = synth.sample(100000)
diff --git a/requirements.txt b/requirements.txt
@@ -1,10 +1,10 @@
 requests>=2.24.0, <2.29
-pandas==1.4.*
+pandas==1.5.*
 numpy==1.23.*
-scikit-learn==1.1.*
-matplotlib==3.5.*
-tensorflow==2.9.0
-easydict==1.9
+scikit-learn==1.2.*
+matplotlib==3.6.*
+tensorflow==2.11.0
+easydict==1.10
 pmlb==1.0.*
 tqdm<5.0
 typeguard==2.13.*
diff --git a/setup.py b/setup.py
@@ -42,7 +42,7 @@
       keywords='data science ydata',
       url='https://github.com/ydataai/ydata-synthetic',
       license="https://github.com/ydataai/ydata-synthetic/blob/master/LICENSE",
-      python_requires=">=3.6, <3.9",
+      python_requires=">=3.6, <3.11",
       packages=find_namespace_packages('src'),
       package_dir={'':'src'},
       include_package_data=True,