Source code for ray.data.namespace_expressions.struct_namespace

"""Struct namespace for expression operations on struct-typed columns."""

from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING

import pyarrow
import pyarrow.compute as pc

from ray.data.datatype import DataType
from ray.data.expressions import pyarrow_udf

if TYPE_CHECKING:
    from ray.data.expressions import Expr, UDFExpr


@dataclass
class _StructNamespace:
    """Expression namespace exposing operations on struct-typed columns.

    Field extraction is delegated to PyArrow compute functions that are
    wrapped as expression UDFs.

    Example:
        >>> from ray.data.expressions import col
        >>> # Access a field using method
        >>> expr = col("user_record").struct.field("age")
        >>> # Access a field using bracket notation
        >>> expr = col("user_record").struct["age"]
        >>> # Access nested field
        >>> expr = col("user_record").struct["address"].struct["city"]
    """

    # Expression whose values the struct operations are applied to.
    _expr: Expr

    def __getitem__(self, field_name: str) -> "UDFExpr":
        """Bracket-notation shorthand for :meth:`field`.

        Args:
            field_name: The name of the field to extract.

        Returns:
            The same UDFExpr that ``field(field_name)`` produces.

        Example:
            >>> col("user").struct["age"]  # Get age field # doctest: +SKIP
            >>> col("user").struct["address"].struct["city"]  # Get nested city field # doctest: +SKIP
        """
        return self.field(field_name)

    def field(self, field_name: str) -> "UDFExpr":
        """Build an expression that pulls one field out of each struct.

        The result dtype is read from the struct's Arrow schema when the
        wrapped expression has an Arrow struct type containing
        ``field_name``; in every other case it is ``DataType(object)``.

        Args:
            field_name: The name of the field to extract.

        Returns:
            UDFExpr that applies ``pyarrow.compute.struct_field`` to the
            wrapped expression for the requested field.
        """
        result_dtype = self._infer_field_dtype(field_name)

        @pyarrow_udf(return_dtype=result_dtype)
        def _struct_field(arr: pyarrow.Array) -> pyarrow.Array:
            return pc.struct_field(arr, field_name)

        return _struct_field(self._expr)

    def _infer_field_dtype(self, field_name: str) -> DataType:
        """Return the dtype of ``field_name``, or an object dtype if unknown."""
        fallback = DataType(object)

        source_dtype = self._expr.data_type
        if not source_dtype.is_arrow_type():
            return fallback

        struct_type = source_dtype.to_arrow_dtype()
        if not pyarrow.types.is_struct(struct_type):
            return fallback

        try:
            return DataType.from_arrow(struct_type.field(field_name).type)
        except KeyError:
            # The schema has no such field, so the dtype stays generic.
            return fallback