Source code for ray.data.preprocessors.transformer
from typing import Any, Dict, List, Optional
import numpy as np
import pandas as pd
from ray.data.preprocessor import SerializablePreprocessorBase
from ray.data.preprocessors.utils import _Computed, _PublicField, migrate_private_fields
from ray.data.preprocessors.version_support import SerializablePreprocessor
from ray.util.annotations import PublicAPI
[docs]
@PublicAPI(stability="alpha")
@SerializablePreprocessor(
version=1, identifier="io.ray.preprocessors.power_transformer"
)
class PowerTransformer(SerializablePreprocessorBase):
"""Apply a `power transform <https://en.wikipedia.org/wiki/Power_transform>`_ to
make your data more normally distributed.
Some models expect data to be normally distributed. By making your data more
Gaussian-like, you might be able to improve your model's performance.
This preprocessor supports the following transformations:
* `Yeo-Johnson <https://en.wikipedia.org/wiki/Power_transform#Yeo%E2%80%93Johnson_transformation>`_
* `Box-Cox <https://en.wikipedia.org/wiki/Power_transform#Box%E2%80%93Cox_transformation>`_
Box-Cox requires all data to be positive.
.. warning::
You need to manually specify the transform's power parameter. If you
choose a bad value, the transformation might not work well.
Args:
columns: The columns to separately transform.
power: A parameter that determines how your data is transformed. Practioners
typically set ``power`` between :math:`-2.5` and :math:`2.5`, although you
may need to try different values to find one that works well.
method: A string representing which transformation to apply. Supports
``"yeo-johnson"`` and ``"box-cox"``. If you choose ``"box-cox"``, your data
needs to be positive. Defaults to ``"yeo-johnson"``.
output_columns: The names of the transformed columns. If None, the transformed
columns will be the same as the input columns. If not None, the length of
``output_columns`` must match the length of ``columns``, othwerwise an error
will be raised.
""" # noqa: E501
_valid_methods = ["yeo-johnson", "box-cox"]
_is_fittable = False
def __init__(
self,
columns: List[str],
power: float,
method: str = "yeo-johnson",
*,
output_columns: Optional[List[str]] = None,
):
super().__init__()
self._columns = columns
self._method = method
self._power = power
self._output_columns = (
SerializablePreprocessorBase._derive_and_validate_output_columns(
columns, output_columns
)
)
if method not in self._valid_methods:
raise ValueError(
f"Method {method} is not supported."
f"Supported values are: {self._valid_methods}"
)
@property
def columns(self) -> List[str]:
return self._columns
@property
def method(self) -> str:
return self._method
@property
def power(self) -> float:
return self._power
@property
def output_columns(self) -> List[str]:
return self._output_columns
def _transform_pandas(self, df: pd.DataFrame):
def column_power_transformer(s: pd.Series):
if self._method == "yeo-johnson":
result = np.zeros_like(s, dtype=np.float64)
pos = s >= 0 # binary mask
if self._power != 0:
result[pos] = (np.power(s[pos] + 1, self._power) - 1) / self._power
else:
result[pos] = np.log(s[pos] + 1)
if self._power != 2:
result[~pos] = -(np.power(-s[~pos] + 1, 2 - self._power) - 1) / (
2 - self._power
)
else:
result[~pos] = -np.log(-s[~pos] + 1)
return result
else: # box-cox
if self._power != 0:
return (np.power(s, self._power) - 1) / self._power
else:
return np.log(s)
df[self._output_columns] = df[self._columns].transform(column_power_transformer)
return df
def __repr__(self):
return (
f"{self.__class__.__name__}(columns={self._columns!r}, "
f"power={self._power!r}, method={self._method!r}, "
f"output_columns={self._output_columns!r})"
)
def _get_serializable_fields(self) -> Dict[str, Any]:
return {
"columns": self._columns,
"power": self._power,
"method": self._method,
"output_columns": self._output_columns,
}
def _set_serializable_fields(self, fields: Dict[str, Any], version: int):
# required fields
self._columns = fields["columns"]
self._power = fields["power"]
self._method = fields["method"]
self._output_columns = fields["output_columns"]
def __setstate__(self, state: Dict[str, Any]) -> None:
super().__setstate__(state)
migrate_private_fields(
self,
fields={
"_columns": _PublicField(public_field="columns"),
"_power": _PublicField(public_field="power"),
"_method": _PublicField(public_field="method", default="yeo-johnson"),
"_output_columns": _PublicField(
public_field="output_columns",
default=_Computed(lambda obj: obj._columns),
),
},
)