Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Create PARSynthesizer #1068

Merged
merged 4 commits into from
Oct 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions sdv/sequential/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Synthesizers for sequential data."""
93 changes: 93 additions & 0 deletions sdv/sequential/par.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
"""PAR Synthesizer class."""

import inspect

from sdv.data_processing import DataProcessor
from sdv.metadata.single_table import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer


class PARSynthesizer:
    """Synthesizer for sequential data.

    This synthesizer uses the ``deepecho.models.par.PARModel`` class as the core model.
    Additionally, it uses a separate synthesizer to model and sample the context columns
    to be passed into PAR.

    Args:
        metadata (sdv.metadata.SingleTableMetadata):
            Single table metadata representing the data that this synthesizer will be used for.
        enforce_min_max_values (bool):
            Specify whether or not to clip the data returned by ``reverse_transform`` of
            the numerical transformer, ``FloatFormatter``, to the min and max values seen
            during ``fit``. Defaults to ``True``.
        enforce_rounding (bool):
            Define rounding scheme for ``numerical`` columns. If ``True``, the data returned
            by ``reverse_transform`` will be rounded as in the original data. Defaults to ``True``.
        context_columns (list[str]):
            A list of strings, representing the columns that do not vary in a sequence.
            Defaults to ``None``.
        segment_size (int):
            If specified, cut each training sequence in several segments of
            the indicated size. The size can be passed as an integer
            value, which will be interpreted as the number of data points to
            put on each segment. Defaults to ``None``.
        epochs (int):
            The number of epochs to train for. Defaults to 128.
        sample_size (int):
            The number of times to sample (before choosing and
            returning the sample which maximizes the likelihood).
            Defaults to 1.
        cuda (bool):
            Whether to attempt to use cuda for GPU computation.
            If this is False or CUDA is not available, CPU will be used.
            Defaults to ``True``.
        verbose (bool):
            Whether to print progress to console or not. Defaults to ``False``.
    """

    def __init__(self, metadata, enforce_min_max_values=True, enforce_rounding=True,
                 context_columns=None, segment_size=None, epochs=128, sample_size=1,
                 cuda=True, verbose=False):
        # NOTE: the docstring documents ``True`` defaults for the two ``enforce_*``
        # flags, so the signature provides them as actual defaults.
        self.metadata = metadata
        self.enforce_min_max_values = enforce_min_max_values
        self.enforce_rounding = enforce_rounding
        self._data_processor = DataProcessor(metadata)
        self.context_columns = context_columns
        self.segment_size = segment_size
        # PAR-specific keyword arguments are not stored as individual attributes;
        # ``get_parameters`` reads them back from this dict.
        self._model_kwargs = {
            'epochs': epochs,
            'sample_size': sample_size,
            'cuda': cuda,
            'verbose': verbose,
        }
        # The context columns are modeled by a dedicated single-table synthesizer,
        # configured with metadata restricted to just those columns.
        context_metadata = self._get_context_metadata()
        self._context_synthesizer = GaussianCopulaSynthesizer(
            metadata=context_metadata,
            enforce_min_max_values=enforce_min_max_values,
            enforce_rounding=enforce_rounding
        )

    def _get_context_metadata(self):
        """Build a ``SingleTableMetadata`` containing only the context columns.

        Returns:
            sdv.metadata.SingleTableMetadata:
                Metadata restricted to ``self.context_columns`` (empty when there
                are no context columns).
        """
        context_columns = self.context_columns or []
        context_columns_dict = {
            column: self.metadata._columns[column] for column in context_columns
        }
        context_metadata_dict = {'columns': context_columns_dict}
        return SingleTableMetadata._load_from_dict(context_metadata_dict)

    def get_parameters(self):
        """Return the parameters used to instantiate the synthesizer.

        Returns:
            dict:
                Every ``__init__`` parameter (except ``metadata``) mapped to the
                value this instance was created with. Model keyword arguments
                (``epochs``, ``sample_size``, ``cuda``, ``verbose``) come from
                ``self._model_kwargs``; the rest are instance attributes.
        """
        parameters = inspect.signature(self.__init__).parameters
        instantiated_parameters = {}
        for parameter_name in parameters:
            if parameter_name == 'metadata':
                continue

            if parameter_name in self._model_kwargs:
                instantiated_parameters[parameter_name] = self._model_kwargs[parameter_name]
            else:
                instantiated_parameters[parameter_name] = getattr(self, parameter_name, None)

        return instantiated_parameters

    def get_metadata(self):
        """Return the ``SingleTableMetadata`` for this synthesizer."""
        return self.metadata
2 changes: 1 addition & 1 deletion sdv/single_table/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Models for Single Table data."""
"""Synthesizers for Single Table data."""

from sdv.single_table.copulagan import CopulaGANSynthesizer
from sdv.single_table.copulas import GaussianCopulaSynthesizer
Expand Down
1 change: 1 addition & 0 deletions tests/unit/sequential/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Tests for synthesizers for sequential data."""
106 changes: 106 additions & 0 deletions tests/unit/sequential/test_par.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from sdv.data_processing.data_processor import DataProcessor
from sdv.metadata.single_table import SingleTableMetadata
from sdv.sequential.par import PARSynthesizer
from sdv.single_table.copulas import GaussianCopulaSynthesizer


class TestPARSynthesizer:

    def test___init__(self):
        """Test that ``__init__`` stores its parameters on the instance.

        Every parameter passed to ``__init__`` should end up on the instance, and a
        ``GaussianCopulaSynthesizer`` should be built for the context columns using
        metadata restricted to those columns.
        """
        # Setup
        metadata = SingleTableMetadata()
        for column_name, column_sdtype in [
            ('time', 'datetime'),
            ('gender', 'categorical'),
            ('name', 'text'),
            ('measurement', 'numerical'),
        ]:
            metadata.add_column(column_name, sdtype=column_sdtype)

        # Run
        instance = PARSynthesizer(
            metadata=metadata,
            enforce_min_max_values=True,
            enforce_rounding=True,
            context_columns=['gender', 'name'],
            segment_size=10,
            epochs=10,
            sample_size=5,
            cuda=False,
            verbose=False
        )

        # Assert
        assert instance.enforce_min_max_values is True
        assert instance.enforce_rounding is True
        assert instance.context_columns == ['gender', 'name']
        assert instance.segment_size == 10
        expected_model_kwargs = {
            'epochs': 10,
            'sample_size': 5,
            'cuda': False,
            'verbose': False
        }
        assert instance._model_kwargs == expected_model_kwargs
        assert isinstance(instance._data_processor, DataProcessor)
        assert instance._data_processor.metadata == metadata
        assert isinstance(instance._context_synthesizer, GaussianCopulaSynthesizer)
        expected_context_columns = {
            'gender': {'sdtype': 'categorical'},
            'name': {'sdtype': 'text'}
        }
        assert instance._context_synthesizer.metadata._columns == expected_context_columns

    def test_get_parameters(self):
        """Test that every ``__init__`` parameter except ``metadata`` is returned."""
        # Setup
        instance = PARSynthesizer(
            metadata=SingleTableMetadata(),
            enforce_min_max_values=True,
            enforce_rounding=True,
            context_columns=None,
            segment_size=10,
            epochs=10,
            sample_size=5,
            cuda=False,
            verbose=False
        )

        # Run
        returned_parameters = instance.get_parameters()

        # Assert
        assert 'metadata' not in returned_parameters
        expected_parameters = {
            'enforce_min_max_values': True,
            'enforce_rounding': True,
            'context_columns': None,
            'segment_size': 10,
            'epochs': 10,
            'sample_size': 5,
            'cuda': False,
            'verbose': False
        }
        assert returned_parameters == expected_parameters

    def test_get_metadata(self):
        """Test that the ``metadata`` passed to ``__init__`` is returned as-is."""
        # Setup
        metadata = SingleTableMetadata()
        instance = PARSynthesizer(
            metadata=metadata,
            enforce_min_max_values=True,
            enforce_rounding=True,
            context_columns=None,
            segment_size=10,
            epochs=10,
            sample_size=5,
            cuda=False,
            verbose=False
        )

        # Run and Assert
        assert instance.get_metadata() == metadata