Source code for ray.data.preprocessors.hasher

import collections
from typing import List

import pandas as pd

from ray.data.preprocessor import Preprocessor
from ray.data.preprocessors.utils import simple_hash
from ray.util.annotations import PublicAPI


@PublicAPI(stability="alpha")
class FeatureHasher(Preprocessor):
    r"""Apply the `hashing trick <https://en.wikipedia.org/wiki/Feature_hashing>`_ to a
    table that describes token frequencies.

    :class:`FeatureHasher` creates ``num_features`` columns named ``hash_{index}``,
    where ``index`` ranges from :math:`0` to ``num_features``:math:`- 1`. The column
    ``hash_{index}`` describes the frequency of tokens that hash to ``index``.

    Distinct tokens can correspond to the same index. However, if ``num_features`` is
    large enough, then each column probably corresponds to a unique token.

    This preprocessor is memory efficient and quick to pickle. However, given a
    transformed column, you can't know which tokens correspond to it. This might make
    it hard to determine which tokens are important to your model.

    .. warning::
        Sparse matrices aren't supported. If you use a large ``num_features``, this
        preprocessor might behave poorly.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import FeatureHasher

        The data below describes the frequencies of tokens in ``"I like Python"`` and
        ``"I dislike Python"``.

        >>> df = pd.DataFrame({
        ...     "I": [1, 1],
        ...     "like": [1, 0],
        ...     "dislike": [0, 1],
        ...     "Python": [1, 1]
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP

        :class:`FeatureHasher` hashes each token to determine its index. For example,
        the index of ``"I"`` is :math:`hash(\texttt{"I"}) \pmod 8 = 5`.

        >>> hasher = FeatureHasher(
        ...     columns=["I", "like", "dislike", "Python"],
        ...     num_features=8,
        ...     output_column="hashed",
        ... )
        >>> hasher.fit_transform(ds)["hashed"].to_pandas().to_numpy()  # doctest: +SKIP
        array([[0, 0, 0, 2, 0, 1, 0, 0],
               [0, 0, 0, 1, 0, 1, 1, 0]])

        Notice the hash collision: both ``"like"`` and ``"Python"`` correspond to
        index :math:`3`. You can avoid hash collisions like these by increasing
        ``num_features``.

    Args:
        columns: The columns to apply the hashing trick to. Each column should
            describe the frequency of a token.
        num_features: The number of features used to represent the vocabulary. You
            should choose a value large enough to prevent hash collisions between
            distinct tokens.
        output_column: The name of the column that contains the hashed features.

    .. seealso::
        :class:`~ray.data.preprocessors.CountVectorizer`
            Use this preprocessor to generate inputs for :class:`FeatureHasher`.

        :class:`~ray.data.preprocessors.HashingVectorizer`
            If your input data describes documents rather than token frequencies,
            use :class:`~ray.data.preprocessors.HashingVectorizer`.
    """  # noqa: E501

    _is_fittable = False

    def __init__(
        self,
        columns: List[str],
        num_features: int,
        output_column: str,
    ):
        self.columns = columns
        # TODO(matt): Set default number of features.
        # This likely requires sparse matrix support to avoid explosion of columns.
        self.num_features = num_features
        self.output_column = output_column

    def _transform_pandas(self, df: pd.DataFrame):
        # TODO(matt): Use sparse matrix for efficiency.
        def row_feature_hasher(row):
            # Accumulate each token's frequency in the bucket its name hashes to.
            # Colliding tokens simply add their frequencies together.
            hash_counts = collections.defaultdict(int)
            for column in self.columns:
                hashed_value = simple_hash(column, self.num_features)
                hash_counts[hashed_value] += row[column]
            return {f"hash_{i}": hash_counts[i] for i in range(self.num_features)}

        feature_columns = df.loc[:, self.columns].apply(
            row_feature_hasher, axis=1, result_type="expand"
        )

        # Concatenate the hash columns into a single array-valued output column.
        hash_columns = [f"hash_{i}" for i in range(self.num_features)]
        concatenated = feature_columns[hash_columns].to_numpy()
        # Use a Pandas Series for column assignment to get more consistent
        # behavior across Pandas versions.
        df.loc[:, self.output_column] = pd.Series(list(concatenated))
        return df

    def __repr__(self):
        return (
            f"{self.__class__.__name__}(columns={self.columns!r}, "
            f"num_features={self.num_features!r}, "
            f"output_column={self.output_column!r})"
        )
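
The row-wise logic in ``_transform_pandas`` can be illustrated without Ray. The sketch below is a minimal, standalone reimplementation over a plain DataFrame; ``bucketed_hash`` is a hypothetical stand-in for ``simple_hash`` (whose implementation isn't shown on this page), assuming only that the real helper deterministically maps a value to an integer in ``[0, num_features)``.

# Standalone sketch of the hashing trick, for illustration only; not part of
# this module. `bucketed_hash` is a hypothetical stand-in for `simple_hash`.
import collections
import hashlib

import pandas as pd


def bucketed_hash(value: object, num_features: int) -> int:
    # Assumption: a deterministic hash of the value's string form, reduced
    # modulo the number of buckets. Any stable hash would work here.
    digest = hashlib.sha1(str(value).encode()).hexdigest()
    return int(digest, 16) % num_features


def hash_row(row: pd.Series, columns: list, num_features: int) -> dict:
    # Mirrors `row_feature_hasher`: add each token's frequency to the bucket
    # its column name hashes to. Colliding tokens share a bucket.
    counts = collections.defaultdict(int)
    for column in columns:
        counts[bucketed_hash(column, num_features)] += row[column]
    return {f"hash_{i}": counts[i] for i in range(num_features)}


df = pd.DataFrame({"I": [1, 1], "like": [1, 0], "dislike": [0, 1], "Python": [1, 1]})
hashed = df.apply(hash_row, axis=1, result_type="expand", args=(list(df.columns), 8))
print(hashed)  # 8 hash_{i} columns per row; colliding tokens sum together.

Because ``bucketed_hash`` may place tokens in different buckets than ``simple_hash`` does, the printed bucket indices won't necessarily match the docstring example, but the structure of the output and the collision behavior are the same.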