|
1 | 1 | """ |
2 | 2 | CGAN architecture example file |
3 | 3 | """ |
4 | | -from ydata_synthetic.synthesizers.regular import RegularSynthesizer |
5 | | -from ydata_synthetic.synthesizers import ModelParameters, TrainParameters |
6 | | - |
7 | 4 | import pandas as pd |
8 | | -import numpy as np |
9 | 5 | from sklearn import cluster |
10 | 6 |
|
| 7 | +from ydata_synthetic.utils.cache import cache_file |
| 8 | +from ydata_synthetic.synthesizers import ModelParameters, TrainParameters |
| 9 | +from ydata_synthetic.synthesizers.regular import RegularSynthesizer |
| 10 | + |
11 | 11 | #Read the original data and preprocess it
12 | | -data = pd.read_csv('../../data/creditcard.csv', index_col=[0]) |
| 12 | +data_path = cache_file('creditcard.csv', 'https://datahub.io/machine-learning/creditcard/r/creditcard.csv') |
| 13 | +data = pd.read_csv(data_path, index_col=[0]) |
13 | 14 |
|
14 | | -#List of columns different from the Class column |
| 15 | +#Data processing and analysis |
15 | 16 | num_cols = list(data.columns[ data.columns != 'Class' ]) |
16 | | -cat_cols = [] # Condition features are not preprocessed and therefore not listed here |
| 17 | +cat_cols = [] |
17 | 18 |
|
18 | 19 | print('Dataset columns: {}'.format(num_cols)) |
19 | | -sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class'] |
20 | | -data = data[ sorted_cols ].copy() |
| 20 | +sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', |
| 21 | + 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', |
| 22 | + 'V9', 'V23', 'Class'] |
| 23 | +processed_data = data[ sorted_cols ].copy() |
| 24 | +processed_data['Class'] = processed_data['Class'].apply(lambda x: 1 if x == "'1'" else 0) |
21 | 25 |
|
22 | 26 | #For the purpose of this example we will only synthesize the minority class |
23 | | -train_data = data.loc[ data['Class']==1 ].copy() |
| 27 | +train_data = processed_data.loc[processed_data['Class'] == 1].copy() |
24 | 28 |
|
25 | 29 | #Create a new class column using KMeans - This will mainly be useful if we want to leverage a conditional GAN
26 | 30 | print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1])) |
27 | 31 | algorithm = cluster.KMeans |
28 | 32 | args, kwds = (), {'n_clusters':2, 'random_state':0} |
29 | 33 | labels = algorithm(*args, **kwds).fit_predict(train_data[ num_cols ]) |
30 | 34 |
|
31 | | -print( pd.DataFrame( [ [np.sum(labels==i)] for i in np.unique(labels) ], columns=['count'], index=np.unique(labels) ) ) |
32 | | - |
33 | 35 | fraud_w_classes = train_data.copy() |
34 | 36 | fraud_w_classes['Class'] = labels |
35 | 37 |
|
|
72 | 74 | synth.fit(data=fraud_w_classes, label_cols=["Class"], train_arguments=train_args, num_cols=num_cols, cat_cols=cat_cols) |
73 | 75 |
|
74 | 76 | #Saving the synthesizer |
75 | | -synth.save('cgan_synthtrained.pkl') |
| 77 | +synth.save('creditcard_cgan_model.pkl') |
76 | 78 |
|
77 | 79 | #Loading the synthesizer |
78 | | -synthesizer = RegularSynthesizer.load('cgan_synthtrained.pkl') |
| 80 | +synthesizer = RegularSynthesizer.load('creditcard_cgan_model.pkl') |
79 | 81 |
|
80 | 82 | #Sampling from the synthesizer |
81 | 83 | cond_array = pd.DataFrame(100*[1], columns=['Class']) |
|
0 commit comments