chore: Streamlit demo app to generate synthetic dataset using ydata-synthetic on tabular data (#166)

rajeshai · web-flow · commit d43b5f31d69a · 2023-02-25T22:38:46.000Z
* Add files via upload

* Delete examples/regular/ydata-synthetic-streamlit directory

* Demo app with Streamlit

* Update app.py

* Update app.py

* Update app.py

* Add files via upload

* Create app.gif

* Create README.md

* Create requirements.txt

* Update README.md

* Update README.md

* Update requirements.txt

* Update README.md

* Update README.md

* Update README.md
diff --git a/examples/regular/streamlit app/.streamlit/config.toml b/examples/regular/streamlit app/.streamlit/config.toml
@@ -0,0 +1,5 @@
+[theme]
+primaryColor="#040000"
+backgroundColor="#770303"
+secondaryBackgroundColor="#000000"
+textColor="#f2f2f3"
diff --git a/examples/regular/streamlit app/README.md b/examples/regular/streamlit app/README.md
@@ -0,0 +1,23 @@
+# Streamlit application to generate synthetic data using ydata-synthetic
+
+<img src="https://github.com/rajeshai/ydata-synthetic/blob/dev/examples/regular/streamlit%20app/app.JPG" alt="streamlit app to generate synthetic data">
+
+This application takes a pre-processed dataset as input and outputs a synthetic dataset based on the given input parameters. It is built with the open-source libraries streamlit and ydata-synthetic, and is deployed on the streamlit cloud.
+
+## How to use
+
+1.  Upload a pre-processed dataset.
+2.  Choose the numerical features and categorical features.
+3.  Choose all the training parameters appropriately.
+4.  Click the 'Click here to start the training process' button.
+
+<img src="https://github.com/rajeshai/ydata-synthetic/blob/dev/examples/regular/streamlit%20app/app.gif" alt="streamlit app to generate synthetic data">
+
+Wait for the training to end. You will see a graph comparing the original data and synthetic data after training.
+Please use a small number of epochs so that the training process completes quickly: this application is deployed on the community cloud of streamlit, which has computational limits.
+
+## Contributing
+
+You can find the deployed application at this link: [![Open in Streamlit](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://share.streamlit.io/rajeshai/ydata-synthetic-streamlit/main/app.py)
+
+Feel free to contribute to this app by adding more features and optimizing its performance further.
diff --git a/examples/regular/streamlit app/YData_logo.svg b/examples/regular/streamlit app/YData_logo.svg
@@ -0,0 +1,15 @@
+<svg width="199" height="64" viewBox="0 0 199 64" fill="none" xmlns="http://www.w3.org/2000/svg">
+<g clip-path="url(#clip0)">
+<path d="M89.3872 40.4979L78.1299 23.1573H82.2961L91.4262 37.3855L100.467 23.1573H104.456L93.1987 40.4979V52.0584H89.3872V40.4979Z" fill="white"/>
+<path d="M107.553 23.1573H116.586C127.969 23.1573 132.846 29.6689 132.846 37.6078C132.846 45.5466 127.878 52.0584 116.586 52.0584H107.553V23.1573ZM116.677 48.7581C126.162 48.7581 128.872 43.4952 128.872 37.697C128.872 31.899 126.162 26.6361 116.677 26.6361H111.437V48.7581H116.677Z" fill="white"/>
+<path d="M136.461 41.9497C136.461 35.8402 140.689 30.8986 146.446 30.8986C149.865 30.8986 152.924 32.6057 154.723 35.3011L154.903 31.5276H158.141V52.4618H154.903L154.723 48.6883C153.014 51.3833 149.955 53.0907 146.446 53.0907C140.689 53.0007 136.461 48.059 136.461 41.9497ZM154.633 41.9497C154.633 36.9183 151.394 33.8636 147.436 33.8636C143.478 33.8636 140.239 36.9183 140.239 41.9497C140.239 46.981 143.478 50.0356 147.436 50.0356C151.394 50.0356 154.633 46.981 154.633 41.9497Z" fill="white"/>
+<path d="M166.534 33.8909H161.754V31.4095H166.534V25.7377H170.303V31.4095H175.175V33.8909H170.303V52.0585H166.534V33.8909Z" fill="white"/>
+<path d="M177.24 41.9497C177.24 35.8402 181.368 30.8986 186.988 30.8986C190.325 30.8986 193.311 32.6057 195.067 35.3011L195.243 31.5276H198.404V52.4618H195.243L195.067 48.6883C193.399 51.3833 190.413 53.0907 186.988 53.0907C181.28 53.0007 177.24 48.059 177.24 41.9497ZM194.979 41.9497C194.979 36.9183 191.818 33.8636 187.954 33.8636C184.09 33.8636 180.928 36.9183 180.928 41.9497C180.928 46.981 184.09 50.0356 187.954 50.0356C191.73 50.0356 194.979 46.981 194.979 41.9497Z" fill="white"/>
+<path fill-rule="evenodd" clip-rule="evenodd" d="M18.2823 0.377791L19.2846 0.24353L44.6751 14.7804L70.0469 3.42956L71.0368 4.70757L56.3235 23.1429L55.9733 23.3999L45.9248 27.2843V44.7475L45.6815 45.3351L27.0457 64L25.625 63.4124V34.4005L0.285813 19.893L0.110352 18.5837L18.2823 0.377791ZM27.2889 34.4885V61.4016L44.2609 44.4036V27.9275L27.2889 34.4885ZM45.9248 25.5007L55.163 21.9294L67.5999 6.34676L45.9248 16.0439V25.5007ZM43.5451 16.0504L19.0103 2.00348L2.05598 18.9895L26.3248 32.8841L43.5451 16.0504ZM29.9344 31.6822L44.2609 26.1439V17.6775L29.9344 31.6822Z" fill="#E32212"/>
+</g>
+<defs>
+<clipPath id="clip0">
+<rect width="198.649" height="64" fill="white"/>
+</clipPath>
+</defs>
+</svg>
diff --git a/examples/regular/streamlit app/app.JPG b/examples/regular/streamlit app/app.JPG
diff --git a/examples/regular/streamlit app/app.gif b/examples/regular/streamlit app/app.gif
diff --git a/examples/regular/streamlit app/app.py b/examples/regular/streamlit app/app.py
@@ -0,0 +1,113 @@
+import os
+import streamlit as st
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from ydata_synthetic.synthesizers.regular import DRAGAN, CGAN, CRAMERGAN, WGAN_GP
+from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
+
+st.set_page_config(layout="wide",initial_sidebar_state="auto")
+os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices'
+def run():
+    #global data_synn
+    st.sidebar.image('YData_logo.svg')
+    st.title('Generate synthetic data for a tabular classification dataset using [ydata-synthetic](https://github.com/ydataai/ydata-synthetic)')
+    st.markdown('This streamlit application can generate synthetic data for your dataset. Please read all the instructions in the sidebar before you start the process.')
+    data = st.file_uploader('Upload a preprocessed dataset in csv format')
+    st.sidebar.title('About')
+    st.sidebar.markdown('[ydata-synthetic](https://github.com/ydataai/ydata-synthetic) is an open-source library and is used to generate synthetic data mimicking the real world data.')
+    st.sidebar.header('What is synthetic data?')
+    st.sidebar.markdown('Synthetic data is artificially generated data that is not collected from real world events. It replicates the statistical components of real data without containing any identifiable information, ensuring individuals privacy.')
+    st.sidebar.header('Why Synthetic Data?')
+    st.sidebar.markdown('''Synthetic data can be used for many applications:
+- Privacy
+- Remove bias
+- Balance datasets
+- Augment datasets''')
+
+
+    st.sidebar.header('Steps to follow')
+    st.sidebar.markdown('''
+- Upload any preprocessed tabular classification dataset.
+- Choose the parameters in the adjacent window appropriately.
+- Since this is a demo, please choose less number of epochs for quick completion of training.
+- After choosing all parameters, Click the button under the parameters to start training.
+- After the training is complete, you will see a graph comparing both real data set and synthetic dataset. Categorical columns are used to compare.
+- You will also see a button to download your synthetic dataset. Click that button to download your dataset.''')
+
+    st.sidebar.markdown('''[![Repo](https://badgen.net/badge/icon/GitHub?icon=github&label)](https://github.com/ydataai/ydata-synthetic)''',unsafe_allow_html=True)
+
+    @st.cache
+    def train(df):
+        #models_dir = './cache'
+        gan_args = ModelParameters(batch_size=batch_size,
+                           lr=learning_rate*0.001,
+                           betas=(beta_1, beta_2),
+                           noise_dim=noise_dim,
+                           layers_dim=layer_dim)
+
+        train_args = TrainParameters(epochs=epochs,
+                             sample_interval=log_step)
+        synthesizer = model(gan_args, n_discriminator=3)
+        synthesizer.train(data, train_args, num_cols, cat_cols)
+        synthesizer.save('data_synth.pkl')
+        synthesizer = model.load('data_synth.pkl')
+        data_syn = synthesizer.sample(samples)
+        return data_syn
+    @st.cache
+    def convert_df(df):
+        return df.to_csv().encode('utf-8')
+    if data is not None:
+        data = pd.read_csv(data)
+        data.dropna(inplace=True)
+        st.header('Choose the parameters!!')
+        col1, col2, col3,col4 = st.columns(4)
+        with col1:
+            model = st.selectbox('Choose the GAN model', ['DRAGAN','CGAN','CRAMEGAN','WGAN_GP'],key=1)
+            if model=='DRAGAN':
+                model = DRAGAN
+            elif model=='CGAN':
+                model=CGAN
+            elif model=='CRAMEGAN':
+                model = CRAMERGAN
+            else:
+                model = WGAN_GP
+            num_cols = st.multiselect('Choose the numerical columns', data.columns,key=1)
+            cat_cols = st.multiselect('Choose categorical columns', [x for x in data.columns if x not in num_cols], key=2)
+
+        with col2:
+            noise_dim = st.number_input('Select noise dimension', 0,200,128,1)
+            layer_dim = st.number_input('Select the layer dimension', 0,200,128,1)
+            batch_size = st.number_input('Select batch size', 0,500, 500,1)
+
+        with col3:
+            log_step = st.number_input('Select sample interval', 0,200,100,1)
+            epochs = st.number_input('Select the number of epochs',0,50,2,1)
+            learning_rate = st.number_input('Select learning rate(x1e-3', 0.01, 0.1, 0.05, 0.01)
+
+        with col4:
+            beta_1 = st.slider('Select first beta co-efficient', 0.0, 1.0, 0.5)
+            beta_2 = st.slider('Select second beta co-efficient', 0.0, 1.0, 0.9)
+            samples = st.number_input('Select the number of synthetic samples to be generated', 0, 400000, step=1000)
+    if st.button('Click here to start the training process'):
+        if data is not None:
+            st.write('Model Training is in progress. It may take a few minutes. Please wait for a while.')
+            data_synn = train(data)
+            st.success('Synthetic dataset with the given number of samples is generated!!')
+            st.subheader('Real Data vs Synthetic Data')
+            f , axes =  plt.subplots(len(cat_cols),2, figsize=(20,25))
+            f.suptitle('Real data vs Synthetic data')
+            for i, j in enumerate(cat_cols):
+                sns.countplot(x=j, data=data, ax = axes[i,0])
+                sns.countplot(x=j, data=data_synn, ax = axes[i,1])
+            st.pyplot(f)
+            st.download_button(
+            label="Download data as CSV",
+            data=convert_df(data_synn),
+            file_name='data_syn.csv',
+            mime='text/csv')
+            st.balloons()
+        else:
+            st.write('Upload a dataset to train!!')
+if __name__== '__main__':
+    run()
diff --git a/examples/regular/streamlit app/requirements.txt b/examples/regular/streamlit app/requirements.txt
@@ -0,0 +1,6 @@
+pandas
+matplotlib
+numpy
+seaborn
+streamlit
+ydata-synthetic