Data-Centric-AI-Community
diff --git a/‎.github/workflows/docs.yaml‎
Lines changed: 85 additions & 0 deletions b/‎.github/workflows/docs.yaml‎
Lines changed: 85 additions & 0 deletions
diff --git a/‎.github/workflows/pull_request.yml‎
Lines changed: 33 additions & 1 deletion b/‎.github/workflows/pull_request.yml‎
Lines changed: 33 additions & 1 deletion
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 1 deletion b/‎.gitignore‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎Makefile‎
Lines changed: 4 additions & 2 deletions b/‎Makefile‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎docs/README.md‎
Lines changed: 16 additions & 0 deletions b/‎docs/README.md‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎docs/getting-started/examples.md‎ b/‎docs/getting-started/examples.md‎
diff --git a/‎docs/getting-started/installation.md‎
Lines changed: 44 additions & 0 deletions b/‎docs/getting-started/installation.md‎
Lines changed: 44 additions & 0 deletions
diff --git a/‎docs/getting-started/quickstart.md‎
Lines changed: 69 additions & 0 deletions b/‎docs/getting-started/quickstart.md‎
Lines changed: 69 additions & 0 deletions
diff --git a/‎docs/index.md‎
Lines changed: 55 additions & 0 deletions b/‎docs/index.md‎
Lines changed: 55 additions & 0 deletions
diff --git a/‎docs/reference/api/index.md‎ b/‎docs/reference/api/index.md‎
@@ -0,0 +1,85 @@
+name: Publish Documentation
+
+
+
+on:
+  push:  # runs for pushes to the branches below that touch one of the listed paths
+    paths:
+    - .github/workflows/docs.yaml
+    - docs/**
+    - mkdocs.yml
+    - requirements-docs.txt
+    branches:
+    - main  # NOTE(review): the pull_request workflow in this change targets `master`; confirm `main` exists
+    - dev
+  release:  # also runs for the `released` and `prereleased` release activity types
+    types:
+    - released
+    - prereleased
+
+
+
+jobs:
+  # Resolves the latest `X.Y.Z` tag of this repository and exposes its
+  # `X.Y` prefix as the job output `version`.
+  prepare:
+    name: Get Current version
+    runs-on: ubuntu-22.04
+
+    outputs:
+      version: ${{ steps.version.outputs.value }}
+
+    steps:
+    - uses: actions/checkout@v3
+      with:
+        token: ${{ secrets.ACCESS_TOKEN }}
+
+    # Only tags made of exactly three dot-separated numbers are considered.
+    - name: Find Latest Tag
+      id: latest_tag
+      uses: oprypin/find-latest-tag@v1.1.1
+      with:
+        repository: ${{ github.repository }}
+        regex: '^\d+\.\d+\.\d+$'
+
+    # The tag is handed to the script through an environment variable rather
+    # than being expanded into the script text, so its value can never be
+    # interpreted as shell syntax.
+    - name: Extract major and minor version
+      id: version
+      env:
+        LATEST_TAG: ${{ steps.latest_tag.outputs.tag }}
+      run: |
+        # Keep only the leading "<major>.<minor>"; the dot is escaped so it
+        # matches a literal "." instead of any character.
+        version="$(printf '%s\n' "$LATEST_TAG" | sed -E 's|^([0-9]+\.[0-9]+).*|\1|')"
+        echo "value=${version}" >> "$GITHUB_OUTPUT"
+
+
+  # Installs the documentation dependencies and publishes the docs through
+  # the `publish-docs` Makefile target.
+  publish-docs:
+    name: Publish Docs
+    runs-on: ubuntu-22.04
+
+    # Consumes the `version` output computed by the `prepare` job.
+    needs:
+    - prepare
+
+    steps:
+    # fetch-depth 0 checks out the full history instead of a shallow clone.
+    - uses: actions/checkout@v3
+      with:
+        fetch-depth: 0
+        token: ${{ secrets.ACCESS_TOKEN }}
+
+    # Commit identity for any commits created later in this job.
+    - name: Configurating Git
+      run: |
+        git config user.email "azory@ydata.ai"
+        git config user.name "Azory YData Bot"
+        git config core.autocrlf false
+
+    - name: Setup Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: "3.10"
+
+    # The key hashes the file that is actually installed below, so the cache
+    # is refreshed whenever the documentation requirements change.
+    - name: Cache pip dependencies
+      id: cache
+      uses: actions/cache@v3
+      with:
+        path: ~/.cache/pip
+        key: ${{ runner.os }}-pip-${{ hashFiles('requirements-docs.txt') }}
+
+    - name: Install doc dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements-docs.txt
+
+    - name: Publish
+      run: make publish-docs version=${{ needs.prepare.outputs.version }}
@@ -5,7 +5,9 @@ on:
     branches:
     - renovate/**
   pull_request:
-    branches: [ master ]
+    branches: 
+    - master
+    - dev
 
 jobs:
   validate:
@@ -41,3 +43,33 @@ jobs:
 
     - name: Tests
       run: make test || exit 0
+
+  validate-docs:  # builds the documentation with mkdocs; no publish step is run here
+    name: Validate Docs
+    runs-on: ubuntu-22.04
+
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Setup Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: "3.10"
+
+    - name: Cache pip dependencies
+      id: cache
+      uses: actions/cache@v3
+      with:
+        path: ~/.cache/pip
+        key: ${{ runner.os }}-pip-${{ hashFiles('requirements-docs.txt') }}  # key changes whenever the docs requirements change
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements-docs.txt
+
+    - name: Build docs  # writes a placeholder VERSION file, installs the package, then runs `mkdocs build`
+      run: |
+        echo "0.0dev0" > VERSION
+        pip install .
+        mkdocs build
@@ -255,6 +255,7 @@ pythonenv*
 
 # mkdocs documentation
 /site
+/static/docs
 
 # mypy
 .mypy_cache/
@@ -373,4 +374,4 @@ DerivedData/
 
 # User created
 VERSION
-version.py
+version.py
@@ -31,5 +31,7 @@ clean: ### Removes build binaries
 install: ### Installs required dependencies
 	$(PIP) install dist/ydata-synthetic-$(version).tar.gz
 
-
-
+# Usage: make publish-docs version=<major.minor>
+# Fails fast when `version` is empty; otherwise `mike deploy` would receive
+# `latest` as the version name and push it.
+publish-docs: ### Publishes the documentation
+	@test -n "$(version)" || { echo "publish-docs: version is required (e.g. make publish-docs version=1.2)" >&2; exit 1; }
+	echo "$(version)" > VERSION
+	$(PIP) install .
+	mike deploy --push --update-aliases $(version) latest
@@ -0,0 +1,16 @@
+# ydata-synthetic documentation
+
+Installing the doc dependencies (one time step):
+```
+pip install -r requirements-docs.txt
+```
+
+Build the doc for deployment:
+```
+mkdocs build
+```
+
+To build and serve locally:
+```
+mkdocs serve
+```
@@ -0,0 +1,44 @@
+
+`ydata-synthetic` is available through PyPI, allowing an easy process of installation and integration with the data science programming environments (Google Colab, Jupyter Notebooks, Visual Studio Code, PyCharm) and stack (`pandas`, `numpy`, `scikit-learn`).
+
+## Installing the package
+Currently, the package supports **Python versions 3.9 and 3.10**, and can be installed on Windows, Linux, or macOS operating systems.
+
+Before installing the package, we recommend creating a virtual or `conda` environment:
+
+=== "conda"
+    ``` commandline
+    conda create -n synth-env python=3.10
+    conda activate synth-env
+    ```
+
+The above command creates and activates a new environment called "synth-env" with Python version 3.10.X. In the new environment, you can then install `ydata-synthetic`:
+
+=== "pypi"
+    ``` commandline
+    pip install ydata-synthetic==1.1.0
+    ```
+
+:fontawesome-brands-youtube:{ style="color: #EE0F0F" }
+[Installing ydata-synthetic](https://www.youtube.com/watch?v=aESmGcxtBdU) – :octicons-clock-24:
+5min – Step-by-step installation guide
+
+## Using Google Colab
+To install inside a Google Colab notebook, you can use the following:
+
+``` commandline
+!pip install ydata-synthetic==1.1.0
+```
+
+Make sure your Google Colab is running Python versions `>=3.9, <3.11`. Learn how to configure Python versions on Google Colab [here](https://stackoverflow.com/questions/68657341/how-can-i-update-google-colabs-python-version/68658479#68658479).
+
+
+## Installing the Streamlit App
+Since version 1.0.0, `ydata-synthetic` includes a GUI experience provided by a Streamlit app. The UI supports the data synthesization process from reading the data to profiling the synthetic data generation, and can be installed as follows:
+
+``` commandline
+pip install "ydata-synthetic[streamlit]"
+```
+
+Note that Jupyter or Colab Notebooks are not yet supported, so use it in your Python environment.
+
@@ -0,0 +1,69 @@
+# Quickstart
+
+`ydata-synthetic` is equipped to handle both **tabular** (comprising numeric and categorical features) and sequential, **time-series** data. In this section we explain how you can **quickstart the synthesization** of tabular and time-series datasets.
+
+## Synthesizing a Tabular Dataset
+The following example showcases how to synthesize the [Adult Census Income](https://www.kaggle.com/datasets/uciml/adult-census-income) dataset with CTGAN:
+=== "Tabular Data"
+    ```python
+        # Import the necessary modules
+        from pmlb import fetch_data
+        from ydata_synthetic.synthesizers.regular import RegularSynthesizer
+        from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
+
+        # Load data
+        data = fetch_data('adult')
+        num_cols = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
+        cat_cols = ['workclass','education', 'education-num', 'marital-status',
+                    'occupation', 'relationship', 'race', 'sex', 'native-country', 'target']
+       
+        # Define model and training parameters
+        ctgan_args = ModelParameters(batch_size=500, lr=2e-4, betas=(0.5, 0.9))
+        train_args = TrainParameters(epochs=501)
+       
+        # Train the generator model
+        synth = RegularSynthesizer(modelname='ctgan', model_parameters=ctgan_args)
+        synth.fit(data=data, train_arguments=train_args, num_cols=num_cols, cat_cols=cat_cols)
+
+        # Generate 1000 new synthetic samples
+        synth_data = synth.sample(1000) 
+    ```
+
+## Synthesizing a Time-Series Dataset
+The following example showcases how to synthesize the [Yahoo Stock Price](https://www.kaggle.com/datasets/arashnic/time-series-forecasting-with-yahoo-stock-price) dataset with TimeGAN:
+=== "Time-Series Data"
+    ```python
+        # Import the necessary modules
+        import pandas as pd
+        from ydata_synthetic.synthesizers import ModelParameters
+        from ydata_synthetic.synthesizers.timeseries import TimeGAN
+        from ydata_synthetic.preprocessing.timeseries.utils import real_data_loading
+
+        # Load and preprocess data
+        stock_data_df = pd.read_csv("stock_data.csv")
+        processed_data = real_data_loading(stock_data_df.values, seq_len=24)
+       
+        # Define model and training parameters
+        gan_args = ModelParameters(batch_size=128, lr=5e-4, noise_dim=128, layers_dim=128)
+        synth = TimeGAN(model_parameters=gan_args, hidden_dim=24, seq_len=24, n_seq=6, gamma=1)
+
+        # Train the generator model
+        synth.train(data=processed_data, train_steps=50000)
+
+        # Generate new synthetic data
+        synth_data = synth.sample(len(stock_data_df))
+    ```
+
+## Running the Streamlit App
+Once the package is [installed](installation.md) with the "streamlit" extra, the app can be launched as:
+
+=== "Streamlit App"
+    ```python
+        from ydata_synthetic import streamlit_app
+
+        streamlit_app.run()
+    ```
+
+The console will then output the URL from which the app can be accessed.
+
+:fontawesome-brands-youtube:{ style="color: #EE0F0F" } Here's a [quick example](https://www.youtube.com/watch?v=6Lzi26szKNo&t=4s) of how to synthesize data with the Streamlit App  – :octicons-clock-24: 5min
@@ -0,0 +1,55 @@
+<p></p>
+<p align="center"><img width="250" src="https://user-images.githubusercontent.com/3348134/177604157-11181f6c-57e5-44b1-8f6c-774edbba5512.png" alt="YData Logo"></p>
+<p></p>
+
+[![pypi](https://img.shields.io/pypi/v/ydata-synthetic)](https://pypi.org/project/ydata-synthetic)
+![Pythonversion](https://img.shields.io/badge/python-3.9%20%7C%203.10-blue)
+[![downloads](https://static.pepy.tech/badge/ydata-synthetic/month)](https://pepy.tech/project/ydata-synthetic)
+![](https://img.shields.io/github/license/ydataai/ydata-synthetic)
+![](https://img.shields.io/pypi/status/ydata-synthetic)
+[![Build Status](https://github.com/ydataai/ydata-synthetic/actions/workflows/tests.yml/badge.svg?branch=master)](https://github.com/ydataai/ydata-synthetic/actions/workflows/tests.yml)
+[![Code Coverage](https://codecov.io/gh/ydataai/ydata-synthetic/branch/master/graph/badge.svg?token=gMptB4YUnF)](https://codecov.io/gh/ydataai/ydata-synthetic)
+[![GitHub stars](https://img.shields.io/github/stars/ydataai/ydata-synthetic?style=social)](https://github.com/ydataai/ydata-synthetic)
+[![Discord](https://img.shields.io/discord/1037720091376238592?label=Discord&logo=Discord)](https://discord.com/invite/mw7xjJ7b7s)
+ 
+
+
+## Overview
+`ydata-synthetic` is the go-to Python package for **synthetic data generation for tabular and time-series data**. It uses the latest Generative AI models to learn the properties of real data and create realistic synthetic data. This project was created to educate the community about synthetic data and its applications in real-world domains, such as data augmentation, bias mitigation, data sharing, and privacy engineering. To learn more about Synthetic Data and its applications, [check this article](https://ydata.ai/resources/10-most-frequently-asked-questions-about-synthetic-data).
+
+## Current Functionality
+- 🤖 **Create Realistic Synthetic Data using Generative AI Models:** `ydata-synthetic` supports the state-of-the-art generative adversarial networks for data generation, namely Vanilla GAN, CGAN, WGAN, WGAN-GP, DRAGAN, Cramer GAN, CWGAN-GP, CTGAN, and TimeGAN. Learn more about the use of [GANs for Synthetic Data generation](https://medium.com/ydata-ai/generating-synthetic-tabular-data-with-gans-part-1-866705a77302). 
+
+- 📀 **Synthetic Data Generation for Tabular and Time-Series Data:** The package supports the synthesization of tabular and time-series data, covering a wide range of real-world applications. Learn how to leverage `ydata-synthetic` for [tabular](https://ydata.ai/resources/gans-for-synthetic-data-generation) and [time-series](https://towardsdatascience.com/synthetic-time-series-data-a-gan-approach-869a984f2239) data.
+
+- 💻 **Best Generation Experience in Open Source:** Including a guided UI experience for the generation of synthetic data, from reading the data to visualization of synthetic data. All served by a slick Streamlit app. 
+:fontawesome-brands-youtube:{ style="color: #EE0F0F" } Here's a [quick overview](https://www.youtube.com/watch?v=ep0PhwsFx0A) – :octicons-clock-24: 1min
+
+
+## Supported Data Types
+    
+=== "Tabular Data"
+    **Tabular data** does not have a temporal dependence, and can be structured and organized in a table-like format, where **features are represented in columns**, whereas **observations correspond to the rows**. 
+
+    Additionally, tabular data usually comprises both *numeric* and *categorical* features. **Numeric** features are those that encode **quantitative** values, whereas **categorical** represent **qualitative** measurements. Categorical features can be further divided into *ordinal*, *binary* or *boolean*, and *nominal* features.
+    
+    Learn more about synthesizing tabular data in this [article](https://ydata.ai/resources/gans-for-synthetic-data-generation), or check the [quickstart guide](getting-started/quickstart.md#synthesizing-a-tabular-dataset) to get started with the synthesization of tabular datasets.
+
+=== "Time-Series Data"
+    **Time-series data** exhibit a sequential, **temporal dependency** between records, and may present a wide range of patterns and trends, including **seasonality** (patterns that repeat at calendar periods -- days, weeks, months -- such as holiday sales, for instance) or **periodicity** (patterns that repeat over time).
+
+    Read more about generating time-series data in this [article](https://ydata.ai/resources/synthetic-time-series-data-a-gan-approach) and check this [quickstart guide](getting-started/quickstart.md#synthesizing-a-time-series-dataset) to get started with time-series data synthesization.
+   
+
+## Supported Generative AI Models
+The following architectures are currently supported:
+
+- [GAN](https://arxiv.org/abs/1406.2661)
+- [CGAN](https://arxiv.org/abs/1411.1784) (Conditional GAN)
+- [WGAN](https://arxiv.org/abs/1701.07875) (Wasserstein GAN)
+- [WGAN-GP](https://arxiv.org/abs/1704.00028) (Wasserstein GAN with Gradient Penalty)
+- [DRAGAN](https://arxiv.org/pdf/1705.07215.pdf) (Deep Regret Analytic GAN)
+- [Cramer GAN](https://arxiv.org/abs/1705.10743) (Cramer Distance Solution to Biased Wasserstein Gradients)
+- [CWGAN-GP](https://cameronfabbri.github.io/papers/conditionalWGAN.pdf) (Conditional Wasserstein GAN with Gradient Penalty)
+- [CTGAN](https://arxiv.org/pdf/1907.00503.pdf) (Conditional Tabular GAN)
+- [TimeGAN](https://papers.nips.cc/paper/2019/file/c9efe5f26cd17ba6216bbe2a7d26d490-Paper.pdf) (specifically for *time-series* data)