Here’s an example of how to generate synthetic data based on real data using SDV.

Synthetic data can be used to train ML models, while making sure that no actual (sensitive) data is used.

Note that the synthetic data has the same statistical properties as the real data; for example, the age of the synthetic contacts is also below 30.

# Open the 'dw_123' connection and fetch a small sample of real contacts
# (first name, email, age). df=True asks for the result as a dataframe,
# which is what the SDV metadata detection below expects.
dbconn = pq.dbconnect('dw_123')
# The closing quote was missing here, which made the whole script unparsable.
my_query = 'select first_name, email, age from contacts where age<30 limit 100'
real_data = dbconn.fetch('dw_123', query=my_query, df=True)

from sdv.metadata import SingleTableMetadata
# Start from an empty single-table description and let SDV infer the column
# types from the real dataframe.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=real_data)

# Override the detected types for the personal columns and flag them as PII.
for pii_column, pii_sdtype in (('first_name', 'name'), ('email', 'email')):
    metadata.update_column(column_name=pii_column, sdtype=pii_sdtype, pii=True)

from sdv.single_table import GaussianCopulaSynthesizer
# Build a Gaussian copula synthesizer from the table metadata, train it on
# the real rows, then draw 10 synthetic rows from the fitted model.
synthesizer = GaussianCopulaSynthesizer(metadata=metadata)
synthesizer.fit(data=real_data)
synthetic_data = synthesizer.sample(num_rows=10)

# Render the report: page title, the metadata, then the real and synthetic
# tables one after the other.
st.title("Synthetic data from real data")

st.header("Metadata")
st.text(metadata)

for heading, frame in (("Real data", real_data), ("Synthetic data", synthetic_data)):
    st.header(heading)
    st.dataframe(frame)

# Persist the synthetic rows to the 'synthetic_contacts' table.
# The original call put a positional argument after a keyword argument, which
# is a SyntaxError; both arguments are now passed positionally in the same
# order.
# NOTE(review): assumes write_records takes (table_name, records) in that
# order - confirm against the platform's API reference.
pq.write_records('synthetic_contacts', synthetic_data)

Result:

synthetic data.png