Source code for ray.data.preprocessors.concatenator

import logging
from typing import List, Optional

import numpy as np
import pandas as pd

from ray.data.preprocessor import Preprocessor
from ray.util.annotations import PublicAPI

logger = logging.getLogger(__name__)


@PublicAPI(stability="alpha")
class Concatenator(Preprocessor):
    """Combine numeric columns into a column of type
    :class:`~ray.air.util.tensor_extensions.pandas.TensorDtype`. Only columns
    specified in ``columns`` are concatenated.

    This preprocessor concatenates numeric columns and stores the result in a new
    column. The new column contains
    :class:`~ray.air.util.tensor_extensions.pandas.TensorArrayElement` objects of
    shape :math:`(m,)`, where :math:`m` is the number of columns concatenated.
    The :math:`m` concatenated columns are dropped after concatenation.
    The preprocessor preserves the order of the columns provided in the
    ``columns`` argument and uses that order when calling ``transform()`` and
    ``transform_batch()``.

    Examples:
        >>> import numpy as np
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import Concatenator

        :py:class:`Concatenator` combines numeric columns into a column of
        :py:class:`~ray.air.util.tensor_extensions.pandas.TensorDtype`.

        >>> df = pd.DataFrame({"X0": [0, 3, 1], "X1": [0.5, 0.2, 0.9]})
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> concatenator = Concatenator(columns=["X0", "X1"])
        >>> concatenator.transform(ds).to_pandas()  # doctest: +SKIP
           concat_out
        0  [0.0, 0.5]
        1  [3.0, 0.2]
        2  [1.0, 0.9]

        By default, the created column is called ``"concat_out"``, but you can
        specify a different name.

        >>> concatenator = Concatenator(columns=["X0", "X1"], output_column_name="tensor")
        >>> concatenator.transform(ds).to_pandas()  # doctest: +SKIP
               tensor
        0  [0.0, 0.5]
        1  [3.0, 0.2]
        2  [1.0, 0.9]

        >>> concatenator = Concatenator(columns=["X0", "X1"], dtype=np.float32)
        >>> concatenator.transform(ds)  # doctest: +SKIP
        Dataset(num_rows=3, schema={concat_out: TensorDtype(shape=(2,), dtype=float32)})

    Args:
        output_column_name: The desired name for the new column.
            Defaults to ``"concat_out"``.
        columns: A list of columns to concatenate. The provided order of the
            columns is retained during concatenation.
        dtype: The ``dtype`` to convert the output tensors to. If unspecified,
            the ``dtype`` is determined by standard coercion rules.
        raise_if_missing: If ``True``, an error is raised if any of the columns
            in ``columns`` don't exist. Defaults to ``False``.

    Raises:
        ValueError: if ``raise_if_missing`` is ``True`` and a column in
            ``columns`` doesn't exist in the dataset.
    """  # noqa: E501

    _is_fittable = False

    def __init__(
        self,
        columns: List[str],
        output_column_name: str = "concat_out",
        dtype: Optional[np.dtype] = None,
        raise_if_missing: bool = False,
    ):
        self.columns = columns
        self.output_column_name = output_column_name
        self.dtype = dtype
        self.raise_if_missing = raise_if_missing

    def _validate(self, df: pd.DataFrame) -> None:
        missing_columns = set(self.columns) - set(df)
        if missing_columns:
            message = (
                f"Missing columns specified in '{self.columns}': {missing_columns}"
            )
            if self.raise_if_missing:
                raise ValueError(message)
            else:
                logger.warning(message)

    def _transform_pandas(self, df: pd.DataFrame):
        self._validate(df)

        concatenated = df[self.columns].to_numpy(dtype=self.dtype)
        df = df.drop(columns=self.columns)
        # Use a Pandas Series for column assignment to get more consistent
        # behavior across Pandas versions.
        df.loc[:, self.output_column_name] = pd.Series(list(concatenated))
        return df

    def __repr__(self):
        default_values = {
            "output_column_name": "concat_out",
            "columns": None,
            "dtype": None,
            "raise_if_missing": False,
        }

        non_default_arguments = []
        for parameter, default_value in default_values.items():
            value = getattr(self, parameter)
            if value != default_value:
                non_default_arguments.append(f"{parameter}={value}")

        return f"{self.__class__.__name__}({', '.join(non_default_arguments)})"
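

A minimal usage sketch of the class above, assuming a working Ray installation; the toy column names and the "features" output name are illustrative, not part of the module:

import numpy as np
import pandas as pd
import ray
from ray.data.preprocessors import Concatenator

# Toy dataset with two numeric feature columns and a label column.
df = pd.DataFrame({"X0": [0, 3, 1], "X1": [0.5, 0.2, 0.9], "Y": [1, 0, 1]})
ds = ray.data.from_pandas(df)

# Concatenate the feature columns into a single float32 tensor column named
# "features". "Y" is not listed in `columns`, so it is left untouched.
concatenator = Concatenator(
    columns=["X0", "X1"],
    output_column_name="features",
    dtype=np.float32,
)
transformed = concatenator.transform(ds)

# __repr__ reports only the arguments that differ from their defaults.
print(concatenator)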