diff --git a/examples/regular/pategan_comparison.ipynb b/examples/regular/pategan_comparison.ipynb new file mode 100644 index 00000000..80527ac6 --- /dev/null +++ b/examples/regular/pategan_comparison.ipynb @@ -0,0 +1,438 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PATEGAN example" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example we will be comparing the YData-Synthetic PATE-GAN implementation to the one from the original authors in the [mlforhealthlabpub](https://github.com/vanderschaarlab/mlforhealthlabpub/tree/main/alg/pategan) package. Since this package has a lot of dependencies and uses TensorFlow 1, we recommend that you create a new environment and follow their setup instructions available [here](https://github.com/vanderschaarlab/mlforhealthlabpub/blob/main/doc/install.md).\n", + "\n", + "## Introduction\n", + "To run this comparison we have executed `mlforhealthlabpub`'s implementation via the main script, together with their fake dataset script used for random dataset generation. With this utility script we have produced a train dataset used to train both synthesizers. Both synthesizers are defined according to the same set of parameters. 
After producing two versions of synthetic datasets we will use [Pandas Profiling](https://github.com/ydataai/pandas-profiling) to compare the outputs regarding fidelity.\n", + "\n", + "### Import the required packages" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-05-16 18:19:02.584078: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import pandas_profiling as pp\n", + "\n", + "from ydata_synthetic.synthesizers.regular import PATEGAN" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load the train dataset and the synthetic dataset from the original implementation" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "dir = '../../data/'\n", + "train_data = pd.read_csv(dir+'train_dataset.csv', index_col=0)\n", + "orig_synth = pd.read_csv(dir+'orig_synth.csv', index_col=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train the YData-Synthetic synthesizer" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-05-16 18:19:03.613787: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set\n", + "2022-05-16 18:19:03.614447: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1\n", + "2022-05-16 18:19:03.643719: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: \n", + "pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 2070 Super computeCapability: 7.5\n", + "coreClock: 1.38GHz coreCount: 40 deviceMemorySize: 7.79GiB deviceMemoryBandwidth: 417.29GiB/s\n", + 
"2022-05-16 18:19:03.643755: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0\n", + "2022-05-16 18:19:03.646305: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11\n", + "2022-05-16 18:19:03.646372: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11\n", + "2022-05-16 18:19:03.647028: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcufft.so.10\n", + "2022-05-16 18:19:03.647195: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcurand.so.10\n", + "2022-05-16 18:19:03.648253: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusolver.so.10\n", + "2022-05-16 18:19:03.648816: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusparse.so.11\n", + "2022-05-16 18:19:03.648911: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8\n", + "2022-05-16 18:19:03.649488: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1862] Adding visible gpu devices: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:AutoGraph could not transform > and will run it as-is.\n", + "Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.\n", + "Cause: module 'gast' has no attribute 'Index'\n", + "To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert\n", + "WARNING: AutoGraph could not transform > and will run it as-is.\n", + "Please report this to the TensorFlow team. 
When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.\n", + "Cause: module 'gast' has no attribute 'Index'\n", + "To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-05-16 18:19:03.666574: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2022-05-16 18:19:03.667386: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set\n", + "2022-05-16 18:19:03.667959: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: \n", + "pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 2070 Super computeCapability: 7.5\n", + "coreClock: 1.38GHz coreCount: 40 deviceMemorySize: 7.79GiB deviceMemoryBandwidth: 417.29GiB/s\n", + "2022-05-16 18:19:03.668011: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0\n", + "2022-05-16 18:19:03.668032: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11\n", + "2022-05-16 18:19:03.668045: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11\n", + "2022-05-16 18:19:03.668059: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcufft.so.10\n", + "2022-05-16 18:19:03.668072: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcurand.so.10\n", + "2022-05-16 18:19:03.668085: I 
tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusolver.so.10\n", + "2022-05-16 18:19:03.668098: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusparse.so.11\n", + "2022-05-16 18:19:03.668111: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8\n", + "2022-05-16 18:19:03.668726: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1862] Adding visible gpu devices: 0\n", + "2022-05-16 18:19:03.668769: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0\n", + "2022-05-16 18:19:04.046932: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1261] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2022-05-16 18:19:04.046951: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1267] 0 \n", + "2022-05-16 18:19:04.046955: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1280] 0: N \n", + "2022-05-16 18:19:04.047834: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1406] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 6713 MB memory) -> physical GPU (device: 0, name: NVIDIA GeForce RTX 2070 Super, pci bus id: 0000:01:00.0, compute capability: 7.5)\n", + "2022-05-16 18:19:04.312378: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11\n", + "2022-05-16 18:19:04.741514: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step : 0 Loss SD : 6.87e-01 Loss G : 7.65e-01 Epsilon : 8.44e-01\n", + "Step : 1 Loss SD : 7.01e-01 Loss G : 7.10e-01 Epsilon : 1.11e+00\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Synthetic data generation: 100%|██████████| 157/157 
[00:00<00:00, 734.24it/s]\n" + ] + } + ], + "source": [ + "from ydata_synthetic.synthesizers import ModelParameters\n", + "\n", + "num_cols = train_data.columns.to_list()\n", + "cat_cols = []\n", + "\n", + "#Defining the training parameters\n", + "noise_dim = 128\n", + "dim = 4*len(train_data.columns)\n", + "batch_size = 64\n", + "\n", + "log_step = 100\n", + "learning_rate = [5e-4, 3e-3]\n", + "beta_1 = 0.5\n", + "beta_2 = 0.9\n", + "models_dir = './cache'\n", + "\n", + "gan_args = ModelParameters(batch_size=batch_size,\n", + " lr=learning_rate,\n", + " betas=(beta_1, beta_2),\n", + " noise_dim=noise_dim,\n", + " layers_dim=dim)\n", + "\n", + "# PATEGAN specific arguments\n", + "n_moments = 20\n", + "n_teacher_iters = 1\n", + "n_student_iters = 1\n", + "n_teachers = 10\n", + "## Privacy/utility tradeoff specification\n", + "target_delta = 1e-5\n", + "target_epsilon = 1\n", + "lap_scale = 1e-2\n", + "\n", + "model = PATEGAN\n", + "\n", + "synthesizer = model(gan_args, n_teachers, target_delta, target_epsilon)\n", + "synthesizer.train(train_data, num_cols, cat_cols,\n", + " n_teacher_iters, n_student_iters, n_moments, lap_scale)\n", + "\n", + "synthesizer.save('pate_test.pkl')\n", + "\n", + "synthesizer = model.load('pate_test.pkl')\n", + "ydata_synth = synthesizer.sample(train_data.shape[0])\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Profiling the synthetic samples" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "train_prof = pp.ProfileReport(train_data, title='Train data')\n", + "\n", + "orig_prof = pp.ProfileReport(orig_synth, title='Original PATEGAN implementation synthetic samples')\n", + "ydata_prof = pp.ProfileReport(ydata_synth, title='YData-Synthetic PATEGAN implementation synthetic samples')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + 
"model_id": "74199f13341e4415aefcbdfa82a9df6a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Summarize dataset: 0%| | 0/5 [00:00 List[Dataset]: + "Obtain a List of TF Datasets corresponding to partitions for each teacher in n_teachers." + loader = [] + SHUFFLE_BUFFER_SIZE = 100 + + for teacher_id in range(self.n_teachers): + start_id = int(teacher_id * len(data) / self.n_teachers) + end_id = int((teacher_id + 1) * len(data) / self.n_teachers if \ + teacher_id != (self.n_teachers - 1) else len(data)) + loader.append(Dataset.from_tensor_slices(data[start_id:end_id:])\ + .batch(self.batch_size).shuffle(SHUFFLE_BUFFER_SIZE).repeat().as_numpy_iterator()) + return loader + + # pylint:disable=R0913 + def train(self, data, num_cols: List[str], cat_cols: List[str], n_teacher_iters: int = 5, n_student_iters: int = 5, + n_moments: int = 100, lap_scale: float = 1e-4): + """ + Args: + data: A pandas DataFrame or a Numpy array with the data to be synthesized + num_cols: List of columns of the data object to be handled as numerical + cat_cols: List of columns of the data object to be handled as categorical + n_teacher_iters: Number of train steps of each teacher discriminator per global step + n_student_iters: Number of train steps of the student discriminator per global step + n_moments: Number of moments accounted in the privacy budget computations + lap_scale: Inverse laplace noise scale multiplier + """ + super().train(data, num_cols, cat_cols) + + data = self.processor.transform(data) + self.data_dim = data.shape[1] + self.define_gan(self.processor.col_transform_info) + + alpha = cast([0.0 for _ in range(n_moments)], float64) + l_list = 1 + cast(range(n_moments), float64) + lap_scale = cast(lap_scale, float64) + + train_loaders = self.get_data_loader(data) + + steps = 0 + epsilon = 0 + + while epsilon < self.target_epsilon: + # train the teacher descriminator + for t_2 in range(n_teacher_iters): + for train_loader, t_discriminator in 
zip(train_loaders, self.t_discriminators): + z = uniform([self.batch_size, self.noise_dim], dtype=float64) + + with GradientTape() as disc_tape: + # loss on real data + real_batch=train_loader.next() + real_output = t_discriminator(real_batch, training=True) + real_loss_disc = t_discriminator.loss(ones_like(real_output), real_output) + + # loss on fake data + fake = self.generator(z) + fake_output = t_discriminator(fake, training=True) + fake_loss_disc = t_discriminator.loss(zeros_like(fake_output), fake_output) + + # compute and apply gradients + disc_loss = real_loss_disc + fake_loss_disc + disc_grad = disc_tape.gradient(disc_loss, t_discriminator.trainable_variables) + t_discriminator.optimizer.apply_gradients(zip(disc_grad, t_discriminator.trainable_variables)) + + # train the student discriminator + for t_3 in range(n_student_iters): + z = uniform([self.batch_size, self.noise_dim], dtype=float64) + + with GradientTape() as stu_tape: + # student discriminator loss + fake = self.generator(z) + predictions, clean_votes = self._pate_voting(fake, self.t_discriminators, lap_scale) + outputs = self.s_discriminator(fake) + stu_loss = self.s_discriminator.loss(predictions, outputs) + + # compute and apply gradients + gradients_of_stu = stu_tape.gradient(stu_loss, self.s_discriminator.trainable_variables) + self.s_discriminator.optimizer.apply_gradients(zip(gradients_of_stu, self.s_discriminator.trainable_variables)) + + # update the moments + alpha = alpha + self._moments_acc(self.n_teachers, clean_votes, lap_scale, l_list) + + # train the generator + z = uniform([self.batch_size, self.noise_dim], dtype=float64) + with GradientTape() as gen_tape: + fake = self.generator(z) + output = self.s_discriminator(fake) + loss_gen = self.generator.loss(ones_like(output), output) + + # compute and apply gradients + gradients_of_generator = gen_tape.gradient(loss_gen, self.generator.trainable_variables) + self.generator.optimizer.apply_gradients(zip(gradients_of_generator, 
self.generator.trainable_variables)) + + # Calculate the current privacy cost + epsilon = min((alpha - log(self.target_delta)) / l_list).numpy() + print(f"Step : {steps} Loss SD : {stu_loss:.2e} Loss G : {loss_gen:.2e} Epsilon : {epsilon:.2e}") + + steps += 1 + + def _pate_voting(self, data, netTD, lap_scale): + results = zeros([len(netTD), self.batch_size], dtype=int64) + for i in range(len(netTD)): + output = netTD[i](data, training=True) + pred = transpose(cast((output > 0.5), int64)) + results = tensor_scatter_nd_update(results, constant([[i]]), pred) + + clean_votes = expand_dims(cast(reduce_sum(results, 0), dtype=float64), 1) + noise_sample = distributions.Laplace(loc=0, scale=1/lap_scale).sample(clean_votes.shape) + noisy_results = clean_votes + cast(noise_sample, float64) + noisy_labels = cast((noisy_results > len(netTD)/2), float64) + + return noisy_labels, clean_votes + + +class Discriminator(Model): + def __init__(self, batch_size): + self.batch_size = batch_size + + def build_model(self, input_shape, dim): + input = Input(shape=input_shape, batch_size=self.batch_size) + x = Dense(dim * 4)(input) + x = Activation('relu')(x) + x = Dense(1)(x) + return Model(inputs=input, outputs=x) + + +class Generator(Model): + def __init__(self, batch_size): + self.batch_size = batch_size + + def build_model(self, input_shape, dim, data_dim, activation_info: Optional[NamedTuple] = None, tau: Optional[float] = None): + input = Input(shape=input_shape, batch_size = self.batch_size) + x = Dense(dim)(input) + x = Activation('tanh')(x) + x = Dense(dim * 2)(x) + x = Activation('tanh')(x) + x = Dense(data_dim)(x) + if activation_info: + x = GumbelSoftmaxActivation(activation_info, tau=tau)(x) + return Model(inputs=input, outputs=x)