From ca53aaa1f7ce283242f017bdd0a1fc8ec5c51895 Mon Sep 17 00:00:00 2001 From: Francisco Santos Date: Mon, 8 Nov 2021 16:50:54 +0000 Subject: [PATCH 1/3] PATEGAN base implementation Remove duplicate test files after renaming Use BaseModel variables --- requirements.txt | 1 + .../synthesizers/regular/__init__.py | 2 + .../synthesizers/regular/pategan/__init__.py | 0 .../synthesizers/regular/pategan/model.py | 256 ++++++++++++++++++ 4 files changed, 259 insertions(+) create mode 100644 src/ydata_synthetic/synthesizers/regular/pategan/__init__.py create mode 100644 src/ydata_synthetic/synthesizers/regular/pategan/model.py diff --git a/requirements.txt b/requirements.txt index 3f806133..b140d5ea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ pmlb==1.0.* tqdm<5.0 typeguard==2.13.* pytest==6.2.* +tensorflow_probability==0.12.* diff --git a/src/ydata_synthetic/synthesizers/regular/__init__.py b/src/ydata_synthetic/synthesizers/regular/__init__.py index ee1497bd..7acda5d9 100644 --- a/src/ydata_synthetic/synthesizers/regular/__init__.py +++ b/src/ydata_synthetic/synthesizers/regular/__init__.py @@ -5,6 +5,7 @@ from ydata_synthetic.synthesizers.regular.dragan.model import DRAGAN from ydata_synthetic.synthesizers.regular.cramergan.model import CRAMERGAN from ydata_synthetic.synthesizers.regular.cwgangp.model import CWGANGP +from ydata_synthetic.synthesizers.regular.pategan.model import PATEGAN __all__ = [ "VanilllaGAN", @@ -14,4 +15,5 @@ "DRAGAN", "CRAMERGAN", "CWGANGP" + "PATEGAN" ] diff --git a/src/ydata_synthetic/synthesizers/regular/pategan/__init__.py b/src/ydata_synthetic/synthesizers/regular/pategan/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/ydata_synthetic/synthesizers/regular/pategan/model.py b/src/ydata_synthetic/synthesizers/regular/pategan/model.py new file mode 100644 index 00000000..28d9ca25 --- /dev/null +++ b/src/ydata_synthetic/synthesizers/regular/pategan/model.py @@ -0,0 +1,256 @@ +"PATEGAN implementation supporting Differential Privacy budget specification." +# pylint: disable = W0622, E0401 +from math import log +from typing import List, NamedTuple, Optional + +import tqdm +from tensorflow import (GradientTape, clip_by_value, concat, constant, + expand_dims, ones_like, tensor_scatter_nd_update, + transpose, zeros, zeros_like) +from tensorflow.data import Dataset +from tensorflow.dtypes import cast, float64, int64 +from tensorflow.keras import Model +from tensorflow.keras.layers import Dense, Input, ReLU +from tensorflow.keras.losses import BinaryCrossentropy +from tensorflow.keras.optimizers import Adam +from tensorflow.math import abs, exp, pow, reduce_sum, square +from tensorflow.random import uniform +from tensorflow_probability import distributions + +from ydata_synthetic.synthesizers import TrainParameters +from ydata_synthetic.synthesizers.gan import BaseModel +from ydata_synthetic.utils.gumbel_softmax import ActivationInterface + + +# pylint: disable=R0902 +class PATEGAN(BaseModel): + "A basic PATEGAN synthesizer implementation with configurable differential privacy budget." 
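+    # How it works (PATE-GAN, Jordon et al., ICLR 2019): the real data is split into
+    # disjoint partitions, one per teacher discriminator; a student discriminator is
+    # trained only on noisy majority votes of the teachers; and the generator is
+    # trained against the student. Real records therefore never reach the generator's
+    # gradients directly, and the (target_epsilon, target_delta) DP budget is tracked
+    # with the moments accountant.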
+ + __MODEL__='PATEGAN' + + def __init__(self, model_parameters, n_teachers: int, target_delta: float, target_epsilon: float): + super().__init__(model_parameters) + self.n_teachers = n_teachers + self.target_epsilon = target_epsilon + self.target_delta = target_delta + + # pylint: disable=W0201 + def define_gan(self, processor_info: Optional[NamedTuple] = None): + def discriminator(): + return Discriminator(self.batch_size).build_model((self.data_dim,), self.layers_dim) + + self.generator = Generator(self.batch_size). \ + build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim, + processor_info=processor_info) + self.s_discriminator = discriminator() + self.t_discriminators = [discriminator() for i in range(self.n_teachers)] + + generator_optimizer = Adam(learning_rate=self.g_lr) + discriminator_optimizer = Adam(learning_rate=self.d_lr) + + loss_fn = BinaryCrossentropy(from_logits=True) + self.generator.compile(loss=loss_fn, optimizer=generator_optimizer) + self.s_discriminator.compile(loss=loss_fn, optimizer=discriminator_optimizer) + for teacher in self.t_discriminators: + teacher.compile(loss=loss_fn, optimizer=discriminator_optimizer) + + # pylint: disable = C0103 + @staticmethod + def _moments_acc(n_teachers, votes, lap_scale, l_list): + q = (2 + lap_scale * abs(2 * votes - n_teachers))/(4 * exp(lap_scale * abs(2 * votes - n_teachers))) + + update = [] + for l in l_list: + clip = 2 * square(lap_scale) * l * (l + 1) + t = (1 - q) * pow((1 - q) / (1 - exp(2 * lap_scale) * q), l) + q * exp(2 * lap_scale * l) + update.append(reduce_sum(clip_by_value(t, clip_value_min=-clip, clip_value_max=clip))) + return cast(update, dtype=float64) + + def get_data_loader(self, data) -> List[Dataset]: + "Obtain a List of TF Datasets corresponding to partitions for each teacher in n_teachers." + loader = [] + SHUFFLE_BUFFER_SIZE = 100 + + for teacher_id in range(self.n_teachers): + start_id = int(teacher_id * len(data) / self.n_teachers) + end_id = int((teacher_id + 1) * len(data) / self.n_teachers if \ + teacher_id != (self.n_teachers - 1) else len(data)) + loader.append(Dataset.from_tensor_slices(data[start_id:end_id:])\ + .batch(self.batch_size).shuffle(SHUFFLE_BUFFER_SIZE)) + return loader + + # pylint:disable=R0913 + def train(self, data, class_ratios, train_arguments: TrainParameters, num_cols: List[str], cat_cols: List[str]): + """ + Args: + data: A pandas DataFrame or a Numpy array with the data to be synthesized + class_ratios: + train_arguments: GAN training arguments. 
+ num_cols: List of columns of the data object to be handled as numerical + cat_cols: List of columns of the data object to be handled as categorical + """ + super().train(data, num_cols, cat_cols) + + data = self.processor.transform(data) + self.data_dim = data.shape[1] + self.define_gan(self.processor.col_transform_info) + + self.class_ratios = class_ratios + + alpha = cast([0.0 for _ in range(train_arguments.num_moments)], float64) + l_list = 1 + cast(range(train_arguments.num_moments), float64) + + # print("initial alpha", l_list.shape) + + cross_entropy = BinaryCrossentropy(from_logits=True) + + generator_optimizer = Adam(learning_rate=train_arguments.lr) + disc_opt_stu = Adam(learning_rate=train_arguments.lr) + disc_opt_t = [Adam(learning_rate=train_arguments.lr) for i in range(self.n_teachers)] + + train_loader = self.get_data_loader(data, self.batch_size) + + steps = 0 + epsilon = 0 + + category_samples = distributions.Categorical(probs=self.class_ratios, dtype=float64) + + while epsilon < self.target_epsilon: + # train the teacher descriminator + for t_2 in range(train_arguments.num_teacher_iters): + for i in range(self.n_teachers): + inputs, categories = None, None + for b, data_ in enumerate(train_loader[i]): + inputs, categories = data_, b # categories = 0, data_ holds the first batch, why do we do this? + #categories will give zero value in each loop as the loop break after running the first time + #inputs will have only the first batch of data + break + + with GradientTape() as disc_tape: + # train with real + dis_data = concat([inputs, zeros((self.batch_size, 1), dtype=float64)], 1) # Why do we append a column of zeros instead of categories? + # print("1st batch data", dis_data.shape) + real_output = self.t_discriminators[i](dis_data, training=True) + # print(real_output.shape, tf.ones.shape) + + # train with fake + z = uniform([self.batch_size, self.noise_dim], dtype=float64) + # print("uniformly distributed noise", z.shape) + + sample = expand_dims(category_samples.sample(self.batch_size), axis=1) + # print("category", sample.shape) + + fake = self.generator(concat([z, sample], 1)) + # print('fake', fake.shape) + + fake_output = self.t_discriminators[i](concat([fake, sample], 1), training=True) + # print('fake_output_dis', fake_output.shape) + + # print("watch", disc_tape.watch(self.teacher_disc[i].trainable_variables) + real_loss_disc = cross_entropy(ones_like(real_output), real_output) + fake_loss_disc = cross_entropy(zeros_like(fake_output), fake_output) + + disc_loss = real_loss_disc + fake_loss_disc + # print(disc_loss, real_loss_disc, fake_loss_disc) + + disc_grad = disc_tape.gradient(disc_loss, self.t_discriminators[i].trainable_variables) + # print(gradients_of_discriminator) + + disc_opt_t[i].apply_gradients(zip(disc_grad, self.t_discriminators[i].trainable_variables)) + + # train the student discriminator + for t_3 in range(train_arguments.num_student_iters): + z = uniform([self.batch_size, self.noise_dim], dtype=float64) + + sample = expand_dims(category_samples.sample(self.batch_size), axis=1) + # print("category_stu", sample.shape) + + with GradientTape() as stu_tape: + fake = self.generator(concat([z, sample], 1)) + # print('fake_stu', fake.shape) + + predictions, clean_votes = self._pate_voting( + concat([fake, sample], 1), self.t_discriminators, train_arguments.lap_scale) + # print("noisy_labels", predictions.shape, "clean_votes", clean_votes.shape) + outputs = self.s_discriminator(concat([fake, sample], 1)) + + # update the moments + alpha = alpha + 
self._moments_acc(self.n_teachers, clean_votes, train_arguments.lap_scale, l_list)
+                    # print("final_alpha", alpha)
+
+                    stu_loss = cross_entropy(predictions, outputs)
+                    gradients_of_stu = stu_tape.gradient(stu_loss, self.s_discriminator.trainable_variables)
+                    # print(gradients_of_stu)
+
+                    disc_opt_stu.apply_gradients(zip(gradients_of_stu, self.s_discriminator.trainable_variables))
+
+            # train the generator
+            z = uniform([self.batch_size, self.noise_dim], dtype=float64)
+
+            sample_g = expand_dims(category_samples.sample(self.batch_size), axis=1)
+
+            with GradientTape() as gen_tape:
+                fake = self.generator(concat([z, sample_g], 1))
+                output = self.s_discriminator(concat([fake, sample_g], 1))
+
+                loss_gen = cross_entropy(ones_like(output), output)
+            gradients_of_generator = gen_tape.gradient(loss_gen, self.generator.trainable_variables)
+            generator_optimizer.apply_gradients(zip(gradients_of_generator, self.generator.trainable_variables))
+
+            # Calculate the current privacy cost
+            epsilon = min((alpha - log(train_arguments.delta)) / l_list)
+            if steps % train_arguments.sample_interval == 0:
+                print("Step : ", steps, "Loss SD : ", stu_loss, "Loss G : ", loss_gen, "Epsilon : ", epsilon)
+
+            steps += 1
+            # self.generator.summary()
+
+    def _pate_voting(self, data, netTD, lap_scale):
+        # TODO: Validate the logic against original article
+        ## Collect the teachers' votes (1/0) from netTD for each record in data and store them in results
+        results = zeros([len(netTD), self.batch_size], dtype=int64)
+        # print(results)
+        for i in range(len(netTD)):
+            output = netTD[i](data, training=True)
+            pred = transpose(cast((output > 0.5), int64))
+            # print(pred)
+            results = tensor_scatter_nd_update(results, constant([[i]]), pred)
+            # print(results)
+
+        #store the sum of the votes attributed by each discriminator to each record (values between 0 and len(netTD))
+        clean_votes = expand_dims(cast(reduce_sum(results, 0), dtype=float64), 1)
+        # print("clean_votes",clean_votes)
+        noise_sample = distributions.Laplace(loc=0, scale=1/lap_scale).sample(clean_votes.shape)
+        # print("noise_sample", noise_sample)
+        noisy_results = clean_votes + cast(noise_sample, float64)
+        noisy_labels = cast((noisy_results > len(netTD)/2), float64)
+
+        return noisy_labels, clean_votes
+
+
+class Discriminator(Model):
+    def __init__(self, batch_size):
+        self.batch_size = batch_size
+
+    def build_model(self, input_shape, dim):
+        input = Input(shape=input_shape, batch_size=self.batch_size)
+        x = Dense(dim * 4)(input)
+        x = ReLU()(x)
+        x = Dense(dim * 2)(x)
+        x = Dense(1)(x)
+        return Model(inputs=input, outputs=x)
+
+
+class Generator(Model):
+    def __init__(self, batch_size):
+        self.batch_size = batch_size
+
+    def build_model(self, input_shape, dim, data_dim, processor_info: Optional[NamedTuple] = None):
+        input = Input(shape=input_shape, batch_size = self.batch_size)
+        x = Dense(dim)(input)
+        x = ReLU()(x)
+        x = Dense(dim * 2)(x)
+        x = Dense(data_dim)(x)
+        if processor_info:
+            x = ActivationInterface(processor_info, 'ActivationInterface')(x)
+        return Model(inputs=input, outputs=x)

From 48f21e9476540b55571410bfc34745b87163d49b Mon Sep 17 00:00:00 2001
From: Francisco Santos
Date: Thu, 21 Apr 2022 17:34:44 +0100
Subject: [PATCH 2/3] feat: Integrate PATEGAN

---
 examples/regular/pategan_example.py       |  52 ++++++
 .../preprocessing/regular/processor.py    |   1 +
 .../synthesizers/regular/pategan/model.py | 153 ++++++------------
 3 files changed, 104 insertions(+), 102 deletions(-)
 create mode 100644 examples/regular/pategan_example.py

diff --git 
a/examples/regular/pategan_example.py b/examples/regular/pategan_example.py new file mode 100644 index 00000000..9fbf885a --- /dev/null +++ b/examples/regular/pategan_example.py @@ -0,0 +1,52 @@ +from pmlb import fetch_data + +from ydata_synthetic.synthesizers.regular import PATEGAN +from ydata_synthetic.synthesizers import ModelParameters, TrainParameters + +model = PATEGAN + +#Load data and define the data processor parameters +data = fetch_data('adult') +num_cols = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week'] +cat_cols = ['workclass','education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', + 'native-country', 'target'] + +print(data.head()) + +#Defining the training parameters + +noise_dim = 128 +dim = 128 +batch_size = 50 + +log_step = 100 +epochs = 300+1 +learning_rate = [5e-4, 3e-3] +beta_1 = 0.5 +beta_2 = 0.9 +models_dir = './cache' + +gan_args = ModelParameters(batch_size=batch_size, + lr=learning_rate, + betas=(beta_1, beta_2), + noise_dim=noise_dim, + layers_dim=dim) + +# PATEGAN specific arguments +n_moments = 100 +n_teacher_iters = 5 +n_student_iters = 5 +n_teachers = min(int(len(data)/1e3), 100) +## Privacy/utility tradeoff specification +target_delta = 1e-3 +target_epsilon = 1e-1 +lap_scale = 1e-4 + +synthesizer = model(gan_args, n_teachers, target_delta, target_epsilon) +synthesizer.train(data, num_cols, cat_cols, + n_teacher_iters, n_student_iters, n_moments, lap_scale) + +synthesizer.save('pate_test.pkl') + +synthesizer = model.load('pate_test.pkl') +synth_data = synthesizer.sample(1000) diff --git a/src/ydata_synthetic/preprocessing/regular/processor.py b/src/ydata_synthetic/preprocessing/regular/processor.py index cdbabb97..7154ab2a 100644 --- a/src/ydata_synthetic/preprocessing/regular/processor.py +++ b/src/ydata_synthetic/preprocessing/regular/processor.py @@ -22,6 +22,7 @@ class RegularModels(Enum): WGAN = 'WGAN' WGAN_GP = 'WGAN_GP' CWGAN_GP = 'CWGAN_GP' + PATEGAN = 'PATEGAN' @typechecked diff --git a/src/ydata_synthetic/synthesizers/regular/pategan/model.py b/src/ydata_synthetic/synthesizers/regular/pategan/model.py index 28d9ca25..00116954 100644 --- a/src/ydata_synthetic/synthesizers/regular/pategan/model.py +++ b/src/ydata_synthetic/synthesizers/regular/pategan/model.py @@ -3,9 +3,7 @@ from math import log from typing import List, NamedTuple, Optional -import tqdm -from tensorflow import (GradientTape, clip_by_value, concat, constant, - expand_dims, ones_like, tensor_scatter_nd_update, +from tensorflow import (GradientTape, clip_by_value, constant, expand_dims, ones_like, tensor_scatter_nd_update, transpose, zeros, zeros_like) from tensorflow.data import Dataset from tensorflow.dtypes import cast, float64, int64 @@ -19,7 +17,7 @@ from ydata_synthetic.synthesizers import TrainParameters from ydata_synthetic.synthesizers.gan import BaseModel -from ydata_synthetic.utils.gumbel_softmax import ActivationInterface +from ydata_synthetic.utils.gumbel_softmax import GumbelSoftmaxActivation # pylint: disable=R0902 @@ -28,20 +26,20 @@ class PATEGAN(BaseModel): __MODEL__='PATEGAN' - def __init__(self, model_parameters, n_teachers: int, target_delta: float, target_epsilon: float): + def __init__(self, model_parameters, n_teachers: int, target_delta: float = 1e-5, target_epsilon: float = 5e-2): super().__init__(model_parameters) self.n_teachers = n_teachers self.target_epsilon = target_epsilon self.target_delta = target_delta # pylint: disable=W0201 - def define_gan(self, processor_info: Optional[NamedTuple] = None): + 
def define_gan(self, activation_info: Optional[NamedTuple] = None): def discriminator(): return Discriminator(self.batch_size).build_model((self.data_dim,), self.layers_dim) self.generator = Generator(self.batch_size). \ build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim, - processor_info=processor_info) + activation_info=activation_info) self.s_discriminator = discriminator() self.t_discriminators = [discriminator() for i in range(self.n_teachers)] @@ -76,18 +74,21 @@ def get_data_loader(self, data) -> List[Dataset]: end_id = int((teacher_id + 1) * len(data) / self.n_teachers if \ teacher_id != (self.n_teachers - 1) else len(data)) loader.append(Dataset.from_tensor_slices(data[start_id:end_id:])\ - .batch(self.batch_size).shuffle(SHUFFLE_BUFFER_SIZE)) + .batch(self.batch_size).shuffle(SHUFFLE_BUFFER_SIZE).repeat().as_numpy_iterator()) return loader # pylint:disable=R0913 - def train(self, data, class_ratios, train_arguments: TrainParameters, num_cols: List[str], cat_cols: List[str]): + def train(self, data, num_cols: List[str], cat_cols: List[str], n_teacher_iters: int = 5, n_student_iters: int = 5, + n_moments: int = 100, lap_scale: float = 1e-4): """ Args: data: A pandas DataFrame or a Numpy array with the data to be synthesized - class_ratios: - train_arguments: GAN training arguments. num_cols: List of columns of the data object to be handled as numerical cat_cols: List of columns of the data object to be handled as categorical + n_teacher_iters: Number of train steps of each teacher discriminator per global step + n_student_iters: Number of train steps of the student discriminator per global step + n_moments: Number of moments accounted in the privacy budget computations + lap_scale: Inverse laplace noise scale multiplier """ super().train(data, num_cols, cat_cols) @@ -95,133 +96,81 @@ def train(self, data, class_ratios, train_arguments: TrainParameters, num_cols: self.data_dim = data.shape[1] self.define_gan(self.processor.col_transform_info) - self.class_ratios = class_ratios + alpha = cast([0.0 for _ in range(n_moments)], float64) + l_list = 1 + cast(range(n_moments), float64) + lap_scale = cast(lap_scale, float64) - alpha = cast([0.0 for _ in range(train_arguments.num_moments)], float64) - l_list = 1 + cast(range(train_arguments.num_moments), float64) - - # print("initial alpha", l_list.shape) - - cross_entropy = BinaryCrossentropy(from_logits=True) - - generator_optimizer = Adam(learning_rate=train_arguments.lr) - disc_opt_stu = Adam(learning_rate=train_arguments.lr) - disc_opt_t = [Adam(learning_rate=train_arguments.lr) for i in range(self.n_teachers)] - - train_loader = self.get_data_loader(data, self.batch_size) + train_loaders = self.get_data_loader(data) steps = 0 epsilon = 0 - category_samples = distributions.Categorical(probs=self.class_ratios, dtype=float64) - while epsilon < self.target_epsilon: # train the teacher descriminator - for t_2 in range(train_arguments.num_teacher_iters): - for i in range(self.n_teachers): - inputs, categories = None, None - for b, data_ in enumerate(train_loader[i]): - inputs, categories = data_, b # categories = 0, data_ holds the first batch, why do we do this? 
- #categories will give zero value in each loop as the loop break after running the first time - #inputs will have only the first batch of data - break + for t_2 in range(n_teacher_iters): + for train_loader, t_discriminator in zip(train_loaders, self.t_discriminators): + z = uniform([self.batch_size, self.noise_dim], dtype=float64) with GradientTape() as disc_tape: - # train with real - dis_data = concat([inputs, zeros((self.batch_size, 1), dtype=float64)], 1) # Why do we append a column of zeros instead of categories? - # print("1st batch data", dis_data.shape) - real_output = self.t_discriminators[i](dis_data, training=True) - # print(real_output.shape, tf.ones.shape) - - # train with fake - z = uniform([self.batch_size, self.noise_dim], dtype=float64) - # print("uniformly distributed noise", z.shape) - - sample = expand_dims(category_samples.sample(self.batch_size), axis=1) - # print("category", sample.shape) - - fake = self.generator(concat([z, sample], 1)) - # print('fake', fake.shape) - - fake_output = self.t_discriminators[i](concat([fake, sample], 1), training=True) - # print('fake_output_dis', fake_output.shape) + # loss on real data + real_batch=train_loader.next() + real_output = t_discriminator(real_batch, training=True) + real_loss_disc = t_discriminator.loss(ones_like(real_output), real_output) - # print("watch", disc_tape.watch(self.teacher_disc[i].trainable_variables) - real_loss_disc = cross_entropy(ones_like(real_output), real_output) - fake_loss_disc = cross_entropy(zeros_like(fake_output), fake_output) + # loss on fake data + fake = self.generator(z) + fake_output = t_discriminator(fake, training=True) + fake_loss_disc = t_discriminator.loss(zeros_like(fake_output), fake_output) + # compute and apply gradients disc_loss = real_loss_disc + fake_loss_disc - # print(disc_loss, real_loss_disc, fake_loss_disc) - - disc_grad = disc_tape.gradient(disc_loss, self.t_discriminators[i].trainable_variables) - # print(gradients_of_discriminator) - - disc_opt_t[i].apply_gradients(zip(disc_grad, self.t_discriminators[i].trainable_variables)) + disc_grad = disc_tape.gradient(disc_loss, t_discriminator.trainable_variables) + t_discriminator.optimizer.apply_gradients(zip(disc_grad, t_discriminator.trainable_variables)) # train the student discriminator - for t_3 in range(train_arguments.num_student_iters): + for t_3 in range(n_student_iters): z = uniform([self.batch_size, self.noise_dim], dtype=float64) - sample = expand_dims(category_samples.sample(self.batch_size), axis=1) - # print("category_stu", sample.shape) - with GradientTape() as stu_tape: - fake = self.generator(concat([z, sample], 1)) - # print('fake_stu', fake.shape) + # student discriminator loss + fake = self.generator(z) + predictions, clean_votes = self._pate_voting(fake, self.t_discriminators, lap_scale) + outputs = self.s_discriminator(fake) + stu_loss = self.s_discriminator.loss(predictions, outputs) - predictions, clean_votes = self._pate_voting( - concat([fake, sample], 1), self.t_discriminators, train_arguments.lap_scale) - # print("noisy_labels", predictions.shape, "clean_votes", clean_votes.shape) - outputs = self.s_discriminator(concat([fake, sample], 1)) - - # update the moments - alpha = alpha + self._moments_acc(self.n_teachers, clean_votes, train_arguments.lap_scale, l_list) - # print("final_alpha", alpha) - - stu_loss = cross_entropy(predictions, outputs) + # compute and apply gradients gradients_of_stu = stu_tape.gradient(stu_loss, self.s_discriminator.trainable_variables) - # print(gradients_of_stu) + 
self.s_discriminator.optimizer.apply_gradients(zip(gradients_of_stu, self.s_discriminator.trainable_variables))
 
-                    disc_opt_stu.apply_gradients(zip(gradients_of_stu, self.s_discriminator.trainable_variables))
+                # update the moments
+                alpha = alpha + self._moments_acc(self.n_teachers, clean_votes, lap_scale, l_list)
 
             # train the generator
             z = uniform([self.batch_size, self.noise_dim], dtype=float64)
-
-            sample_g = expand_dims(category_samples.sample(self.batch_size), axis=1)
-
             with GradientTape() as gen_tape:
-                fake = self.generator(concat([z, sample_g], 1))
-                output = self.s_discriminator(concat([fake, sample_g], 1))
+                fake = self.generator(z)
+                output = self.s_discriminator(fake)
+                loss_gen = self.generator.loss(ones_like(output), output)
 
-                loss_gen = cross_entropy(ones_like(output), output)
+            # compute and apply gradients
             gradients_of_generator = gen_tape.gradient(loss_gen, self.generator.trainable_variables)
-            generator_optimizer.apply_gradients(zip(gradients_of_generator, self.generator.trainable_variables))
+            self.generator.optimizer.apply_gradients(zip(gradients_of_generator, self.generator.trainable_variables))
 
             # Calculate the current privacy cost
-            epsilon = min((alpha - log(train_arguments.delta)) / l_list)
-            if steps % train_arguments.sample_interval == 0:
-                print("Step : ", steps, "Loss SD : ", stu_loss, "Loss G : ", loss_gen, "Epsilon : ", epsilon)
+            epsilon = min((alpha - log(self.target_delta)) / l_list).numpy()
+            print(f"Step : {steps} Loss SD : {stu_loss:.2e} Loss G : {loss_gen:.2e} Epsilon : {epsilon:.2e}")
 
             steps += 1
-            # self.generator.summary()
 
     def _pate_voting(self, data, netTD, lap_scale):
-        # TODO: Validate the logic against original article
-        ## Collect the teachers' votes (1/0) from netTD for each record in data and store them in results
         results = zeros([len(netTD), self.batch_size], dtype=int64)
-        # print(results)
         for i in range(len(netTD)):
             output = netTD[i](data, training=True)
             pred = transpose(cast((output > 0.5), int64))
-            # print(pred)
             results = tensor_scatter_nd_update(results, constant([[i]]), pred)
-            # print(results)
 
-        #store the sum of the votes attributed by each discriminator to each record (values between 0 and len(netTD))
         clean_votes = expand_dims(cast(reduce_sum(results, 0), dtype=float64), 1)
-        # print("clean_votes",clean_votes)
         noise_sample = distributions.Laplace(loc=0, scale=1/lap_scale).sample(clean_votes.shape)
-        # print("noise_sample", noise_sample)
         noisy_results = clean_votes + cast(noise_sample, float64)
         noisy_labels = cast((noisy_results > len(netTD)/2), float64)
 
         return noisy_labels, clean_votes
@@ -245,12 +194,12 @@ class Generator(Model):
     def __init__(self, batch_size):
         self.batch_size = batch_size
 
-    def build_model(self, input_shape, dim, data_dim, processor_info: Optional[NamedTuple] = None):
+    def build_model(self, input_shape, dim, data_dim, activation_info: Optional[NamedTuple] = None, tau: Optional[float] = None):
         input = Input(shape=input_shape, batch_size = self.batch_size)
         x = Dense(dim)(input)
         x = ReLU()(x)
         x = Dense(dim * 2)(x)
         x = Dense(data_dim)(x)
-        if processor_info:
-            x = ActivationInterface(processor_info, 'ActivationInterface')(x)
+        if activation_info:
+            x = GumbelSoftmaxActivation(activation_info, tau=tau)(x)
         return Model(inputs=input, outputs=x)

From da061dfbd86932453caaad822f31379eb725e99f Mon Sep 17 00:00:00 2001
From: Francisco Santos
Date: Mon, 16 May 2022 20:14:49 +0100
Subject: [PATCH 3/3] update and add comparison example

---
 examples/regular/pategan_comparison.ipynb | 438 ++++++++++++++++++
 examples/regular/pategan_example.py       | 
4 +- .../synthesizers/regular/__init__.py | 2 +- .../synthesizers/regular/pategan/model.py | 11 +- 4 files changed, 446 insertions(+), 9 deletions(-) create mode 100644 examples/regular/pategan_comparison.ipynb diff --git a/examples/regular/pategan_comparison.ipynb b/examples/regular/pategan_comparison.ipynb new file mode 100644 index 00000000..80527ac6 --- /dev/null +++ b/examples/regular/pategan_comparison.ipynb @@ -0,0 +1,438 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PATEGAN example" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example we will be comparing the YData-Synthetic PATE-GAN implementation to the one from the original authors in the [mlforhealthlabpub](https://github.com/vanderschaarlab/mlforhealthlabpub/tree/main/alg/pategan) package. Since this package has a lot of dependencies and uses TensorFlow 1, we recommend that you create a new environment and follow their setup instructions available [here](https://github.com/vanderschaarlab/mlforhealthlabpub/blob/main/doc/install.md).\n", + "\n", + "## Introduction\n", + "To run this comparison we have executed `mlforhealthlabpub`'s implementation via the main script, together with their fake dataset script used for random dataset generation. With this utility script we have produced a train dataset used to train both synthesizers. Both synthesizers are defined according to the same set of parameters. After producing two versions of synthetic datasets we will use [Pandas Profiling](https://github.com/ydataai/pandas-profiling) to compare the outputs regarding fidelity.\n", + "\n", + "### Import the required packages" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-05-16 18:19:02.584078: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import pandas_profiling as pp\n", + "\n", + "from ydata_synthetic.synthesizers.regular import PATEGAN" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load the train dataset and the synthetic dataset from the original dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "dir = '../../data/'\n", + "train_data = pd.read_csv(dir+'train_dataset.csv', index_col=0)\n", + "orig_synth = pd.read_csv(dir+'orig_synth.csv', index_col=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train the YData-Synthetic synthesizer" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-05-16 18:19:03.613787: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set\n", + "2022-05-16 18:19:03.614447: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1\n", + "2022-05-16 18:19:03.643719: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: \n", + "pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 2070 Super computeCapability: 7.5\n", + "coreClock: 1.38GHz coreCount: 40 deviceMemorySize: 7.79GiB deviceMemoryBandwidth: 417.29GiB/s\n", + "2022-05-16 18:19:03.643755: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] 
Successfully opened dynamic library libcudart.so.11.0\n", + "2022-05-16 18:19:03.646305: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11\n", + "2022-05-16 18:19:03.646372: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11\n", + "2022-05-16 18:19:03.647028: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcufft.so.10\n", + "2022-05-16 18:19:03.647195: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcurand.so.10\n", + "2022-05-16 18:19:03.648253: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusolver.so.10\n", + "2022-05-16 18:19:03.648816: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusparse.so.11\n", + "2022-05-16 18:19:03.648911: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8\n", + "2022-05-16 18:19:03.649488: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1862] Adding visible gpu devices: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:AutoGraph could not transform > and will run it as-is.\n", + "Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.\n", + "Cause: module 'gast' has no attribute 'Index'\n", + "To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert\n", + "WARNING: AutoGraph could not transform > and will run it as-is.\n", + "Please report this to the TensorFlow team. 
When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.\n", + "Cause: module 'gast' has no attribute 'Index'\n", + "To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-05-16 18:19:03.666574: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2022-05-16 18:19:03.667386: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set\n", + "2022-05-16 18:19:03.667959: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: \n", + "pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 2070 Super computeCapability: 7.5\n", + "coreClock: 1.38GHz coreCount: 40 deviceMemorySize: 7.79GiB deviceMemoryBandwidth: 417.29GiB/s\n", + "2022-05-16 18:19:03.668011: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0\n", + "2022-05-16 18:19:03.668032: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11\n", + "2022-05-16 18:19:03.668045: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11\n", + "2022-05-16 18:19:03.668059: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcufft.so.10\n", + "2022-05-16 18:19:03.668072: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcurand.so.10\n", + "2022-05-16 18:19:03.668085: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusolver.so.10\n", + "2022-05-16 18:19:03.668098: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusparse.so.11\n", + "2022-05-16 18:19:03.668111: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8\n", + "2022-05-16 18:19:03.668726: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1862] Adding visible gpu devices: 0\n", + "2022-05-16 18:19:03.668769: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0\n", + "2022-05-16 18:19:04.046932: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1261] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2022-05-16 18:19:04.046951: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1267] 0 \n", + "2022-05-16 18:19:04.046955: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1280] 0: N \n", + "2022-05-16 18:19:04.047834: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1406] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 6713 MB memory) -> physical GPU (device: 0, name: NVIDIA GeForce RTX 2070 Super, pci bus id: 0000:01:00.0, compute capability: 7.5)\n", + "2022-05-16 18:19:04.312378: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11\n", + "2022-05-16 18:19:04.741514: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] 
Successfully opened dynamic library libcublasLt.so.11\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Step : 0 Loss SD : 6.87e-01 Loss G : 7.65e-01 Epsilon : 8.44e-01\n",
+      "Step : 1 Loss SD : 7.01e-01 Loss G : 7.10e-01 Epsilon : 1.11e+00\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Synthetic data generation: 100%|██████████| 157/157 [00:00<00:00, 734.24it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from ydata_synthetic.synthesizers import ModelParameters\n",
+    "\n",
+    "num_cols = train_data.columns.to_list()\n",
+    "cat_cols = []\n",
+    "\n",
+    "#Defining the training parameters\n",
+    "noise_dim = 128\n",
+    "dim = 4*len(train_data.columns)\n",
+    "batch_size = 64\n",
+    "\n",
+    "log_step = 100\n",
+    "learning_rate = [5e-4, 3e-3]\n",
+    "beta_1 = 0.5\n",
+    "beta_2 = 0.9\n",
+    "models_dir = './cache'\n",
+    "\n",
+    "gan_args = ModelParameters(batch_size=batch_size,\n",
+    "                           lr=learning_rate,\n",
+    "                           betas=(beta_1, beta_2),\n",
+    "                           noise_dim=noise_dim,\n",
+    "                           layers_dim=dim)\n",
+    "\n",
+    "# PATEGAN specific arguments\n",
+    "n_moments = 20\n",
+    "n_teacher_iters = 1\n",
+    "n_student_iters = 1\n",
+    "n_teachers = 10\n",
+    "## Privacy/utility tradeoff specification\n",
+    "target_delta = 1e-5\n",
+    "target_epsilon = 1\n",
+    "lap_scale = 1e-2\n",
+    "\n",
+    "model = PATEGAN\n",
+    "\n",
+    "synthesizer = model(gan_args, n_teachers, target_delta, target_epsilon)\n",
+    "synthesizer.train(train_data, num_cols, cat_cols,\n",
+    "                  n_teacher_iters, n_student_iters, n_moments, lap_scale)\n",
+    "\n",
+    "synthesizer.save('pate_test.pkl')\n",
+    "\n",
+    "synthesizer = model.load('pate_test.pkl')\n",
+    "ydata_synth = synthesizer.sample(train_data.shape[0])\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Profiling the synthetic samples"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_prof = pp.ProfileReport(train_data, title='Train data')\n",
+    "\n",
+    "orig_prof = pp.ProfileReport(orig_synth, title='Original PATEGAN implementation synthetic samples')\n",
+    "ydata_prof = pp.ProfileReport(ydata_synth, title='YData-Synthetic PATEGAN implementation synthetic samples')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "74199f13341e4415aefcbdfa82a9df6a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Summarize dataset:   0%|          | 0/5 [00:00

[... remainder of the notebook JSON, the pategan_example.py and regular/__init__.py hunks, and the start of the model.py hunk are truncated in the source ...]

             pred = transpose(cast((output > 0.5), int64))
             results = tensor_scatter_nd_update(results, constant([[i]]), pred)
@@ -184,8 +183,7 @@ def __init__(self, batch_size):
     def build_model(self, input_shape, dim):
         input = Input(shape=input_shape, batch_size=self.batch_size)
         x = Dense(dim * 4)(input)
-        x = ReLU()(x)
-        x = Dense(dim * 2)(x)
+        x = Activation('relu')(x)
         x = Dense(1)(x)
         return Model(inputs=input, outputs=x)
@@ -197,8 +195,9 @@ def __init__(self, batch_size):
     def build_model(self, input_shape, dim, data_dim, activation_info: Optional[NamedTuple] = None, tau: Optional[float] = None):
         input = Input(shape=input_shape, batch_size = self.batch_size)
         x = Dense(dim)(input)
-        x = ReLU()(x)
+        x = Activation('tanh')(x)
         x = Dense(dim * 2)(x)
+        x = Activation('tanh')(x)
         x = Dense(data_dim)(x)
         if activation_info:
             x = GumbelSoftmaxActivation(activation_info, tau=tau)(x)
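For reference, the privacy accounting that this series implements in _pate_voting, _moments_acc and the epsilon check of train() can be followed end-to-end in the minimal NumPy sketch below. The helper names and the random stand-in teacher votes are illustrative only, not part of the ydata-synthetic API; the formulas mirror the TensorFlow code above.

import numpy as np

rng = np.random.default_rng(0)
n_teachers, batch_size = 10, 64
lap_scale, target_delta = 1e-2, 1e-5
l_list = 1 + np.arange(20, dtype=np.float64)   # moments 1..n_moments
alpha = np.zeros_like(l_list)                  # accumulated log moments

def noisy_majority(teacher_votes, lap_scale):
    # Mirrors _pate_voting: sum the binary teacher votes per record, add
    # Laplace(1/lap_scale) noise, and threshold at half the ensemble size.
    clean_votes = teacher_votes.sum(axis=0).astype(np.float64)
    noisy = clean_votes + rng.laplace(scale=1.0 / lap_scale, size=clean_votes.shape)
    labels = (noisy > teacher_votes.shape[0] / 2).astype(np.float64)
    return labels, clean_votes

def moments_update(n_teachers, clean_votes, lap_scale, l_list):
    # Mirrors _moments_acc: per-moment privacy-loss increment for one batch.
    x = lap_scale * np.abs(2 * clean_votes - n_teachers)
    q = (2 + x) / (4 * np.exp(x))
    update = np.empty_like(l_list)
    for k, l in enumerate(l_list):
        clip = 2 * lap_scale ** 2 * l * (l + 1)
        t = (1 - q) * ((1 - q) / (1 - np.exp(2 * lap_scale) * q)) ** l \
            + q * np.exp(2 * lap_scale * l)
        update[k] = np.clip(t, -clip, clip).sum()
    return update

# One simulated student step on random stand-in votes for a generated batch.
teacher_votes = rng.integers(0, 2, size=(n_teachers, batch_size))
labels, clean_votes = noisy_majority(teacher_votes, lap_scale)
alpha += moments_update(n_teachers, clean_votes, lap_scale, l_list)

# Privacy cost so far; train() loops while this stays below target_epsilon.
epsilon = ((alpha - np.log(target_delta)) / l_list).min()
print(f"epsilon after one student step: {epsilon:.3f}")

Because each per-record moment increment is clipped at 2*lap_scale^2*l*(l+1), a smaller lap_scale (i.e. more Laplace noise per vote) slows the growth of alpha and therefore buys more training steps before epsilon crosses target_epsilon, which is exactly the privacy/utility trade-off the example scripts expose through lap_scale and target_epsilon.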