Source code for ray.data.preprocessors.vectorizer

from collections import Counter
from typing import List, Callable, Optional

import pandas as pd

from ray.data import Dataset
from ray.data.preprocessor import Preprocessor
from ray.data.preprocessors.utils import simple_split_tokenizer, simple_hash
from ray.util.annotations import PublicAPI


[docs]@PublicAPI(stability="alpha") class HashingVectorizer(Preprocessor): """Count the frequency of tokens using the `hashing trick <https://en.wikipedia.org/wiki/Feature_hashing>`_. This preprocessors creates ``num_features`` columns named like ``hash_{column_name}_{index}``. If ``num_features`` is large enough relative to the size of your vocabulary, then each column approximately corresponds to the frequency of a unique token. :class:`HashingVectorizer` is memory efficient and quick to pickle. However, given a transformed column, you can't know which tokens correspond to it. This might make it hard to determine which tokens are important to your model. .. note:: This preprocessor transforms each input column to a `document-term matrix <https://en.wikipedia.org/wiki/Document-term_matrix>`_. A document-term matrix is a table that describes the frequency of tokens in a collection of documents. For example, the strings `"I like Python"` and `"I dislike Python"` might have the document-term matrix below: .. code-block:: corpus_I corpus_Python corpus_dislike corpus_like 0 1 1 1 0 1 1 1 0 1 To generate the matrix, you typically map each token to a unique index. For example: .. code-block:: token index 0 I 0 1 Python 1 2 dislike 2 3 like 3 The problem with this approach is that memory use scales linearly with the size of your vocabulary. :class:`HashingVectorizer` circumvents this problem by computing indices with a hash function: :math:`\\texttt{index} = hash(\\texttt{token})`. .. warning:: Sparse matrices aren't currently supported. If you use a large ``num_features``, this preprocessor might behave poorly. Examples: >>> import pandas as pd >>> import ray >>> from ray.data.preprocessors import HashingVectorizer >>> >>> df = pd.DataFrame({ ... "corpus": [ ... "Jimmy likes volleyball", ... "Bob likes volleyball too", ... "Bob also likes fruit jerky" ... ] ... }) >>> ds = ray.data.from_pandas(df) # doctest: +SKIP >>> >>> vectorizer = HashingVectorizer(["corpus"], num_features=8) >>> vectorizer.fit_transform(ds).to_pandas() # doctest: +SKIP hash_corpus_0 hash_corpus_1 hash_corpus_2 hash_corpus_3 hash_corpus_4 hash_corpus_5 hash_corpus_6 hash_corpus_7 0 1 0 1 0 0 0 0 1 1 1 0 1 0 0 0 1 1 2 0 0 1 1 0 2 1 0 Args: columns: The columns to separately tokenize and count. num_features: The number of features used to represent the vocabulary. You should choose a value large enough to prevent hash collisions between distinct tokens. tokenization_fn: The function used to generate tokens. This function should accept a string as input and return a list of tokens as output. If unspecified, the tokenizer uses a function equivalent to ``lambda s: s.split(" ")``. .. seealso:: :class:`CountVectorizer` Another method for counting token frequencies. Unlike :class:`HashingVectorizer`, :class:`CountVectorizer` creates a feature for each unique token. This enables you to compute the inverse transformation. :class:`FeatureHasher` This preprocessor is similar to :class:`HashingVectorizer`, except it expects a table describing token frequencies. In contrast, :class:`FeatureHasher` expects a column containing documents. """ # noqa: E501 _is_fittable = False def __init__( self, columns: List[str], num_features: int, tokenization_fn: Optional[Callable[[str], List[str]]] = None, ): self.columns = columns # TODO(matt): Set default number of features. # This likely requires sparse matrix support to avoid explosion of columns. self.num_features = num_features # TODO(matt): Add a more robust default tokenizer. 
self.tokenization_fn = tokenization_fn or simple_split_tokenizer def _transform_pandas(self, df: pd.DataFrame): # TODO(matt): Use sparse matrix for efficiency. def hash_count(tokens: List[str]) -> Counter: hashed_tokens = [simple_hash(token, self.num_features) for token in tokens] return Counter(hashed_tokens) for col in self.columns: tokenized = df[col].map(self.tokenization_fn) hashed = tokenized.map(hash_count) for i in range(self.num_features): df[f"hash_{col}_{i}"] = hashed.map(lambda counts: counts[i]) # Drop original columns. df.drop(columns=self.columns, inplace=True) return df def __repr__(self): fn_name = getattr(self.tokenization_fn, "__name__", self.tokenization_fn) return ( f"{self.__class__.__name__}(columns={self.columns!r}, " f"num_features={self.num_features!r}, tokenization_fn={fn_name})" )
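
# Illustrative sketch (not part of the original module): the hashing trick that
# HashingVectorizer applies above, reduced to a single document. The function
# name and example arguments are made up for illustration; the real preprocessor
# performs the same steps row by row inside ``_transform_pandas``.
def _example_hashed_counts(document: str, num_features: int) -> List[int]:
    # Tokenize with the default whitespace tokenizer, hash each token into one
    # of ``num_features`` buckets, and return the per-bucket counts -- one row
    # of the ``hash_{column}_{i}`` features.
    tokens = simple_split_tokenizer(document)
    counts = Counter(simple_hash(token, num_features) for token in tokens)
    return [counts[i] for i in range(num_features)]


# For example, ``_example_hashed_counts("Bob likes volleyball too", 8)`` yields a
# list of eight counts; distinct tokens can land in the same bucket if
# ``num_features`` is too small.
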
[docs]@PublicAPI(stability="alpha") class CountVectorizer(Preprocessor): """Count the frequency of tokens in a column of strings. :class:`CountVectorizer` operates on columns that contain strings. For example: .. code-block:: corpus 0 I dislike Python 1 I like Python This preprocessors creates a column named like ``{column}_{token}`` for each unique token. These columns represent the frequency of token ``{token}`` in column ``{column}``. For example: .. code-block:: corpus_I corpus_Python corpus_dislike corpus_like 0 1 1 1 0 1 1 1 0 1 Examples: >>> import pandas as pd >>> import ray >>> from ray.data.preprocessors import CountVectorizer >>> >>> df = pd.DataFrame({ ... "corpus": [ ... "Jimmy likes volleyball", ... "Bob likes volleyball too", ... "Bob also likes fruit jerky" ... ] ... }) >>> ds = ray.data.from_pandas(df) # doctest: +SKIP >>> >>> vectorizer = CountVectorizer(["corpus"]) >>> vectorizer.fit_transform(ds).to_pandas() # doctest: +SKIP corpus_likes corpus_volleyball corpus_Bob corpus_Jimmy corpus_too corpus_also corpus_fruit corpus_jerky 0 1 1 0 1 0 0 0 0 1 1 1 1 0 1 0 0 0 2 1 0 1 0 0 1 1 1 You can limit the number of tokens in the vocabulary with ``max_features``. >>> vectorizer = CountVectorizer(["corpus"], max_features=3) >>> vectorizer.fit_transform(ds).to_pandas() # doctest: +SKIP corpus_likes corpus_volleyball corpus_Bob 0 1 1 0 1 1 1 1 2 1 0 1 Args: columns: The columns to separately tokenize and count. tokenization_fn: The function used to generate tokens. This function should accept a string as input and return a list of tokens as output. If unspecified, the tokenizer uses a function equivalent to ``lambda s: s.split(" ")``. max_features: The maximum number of tokens to encode in the transformed dataset. If specified, only the most frequent tokens are encoded. """ # noqa: E501 def __init__( self, columns: List[str], tokenization_fn: Optional[Callable[[str], List[str]]] = None, max_features: Optional[int] = None, ): # TODO(matt): Add fit_transform to avoid recomputing tokenization step. self.columns = columns # TODO(matt): Add a more robust default tokenizer. self.tokenization_fn = tokenization_fn or simple_split_tokenizer self.max_features = max_features def _fit(self, dataset: Dataset) -> Preprocessor: def get_pd_value_counts(df: pd.DataFrame) -> List[Counter]: def get_token_counts(col): token_series = df[col].apply(self.tokenization_fn) tokens = token_series.sum() return Counter(tokens) return [get_token_counts(col) for col in self.columns] value_counts = dataset.map_batches(get_pd_value_counts, batch_format="pandas") total_counts = [Counter() for _ in self.columns] for batch in value_counts.iter_batches(batch_size=None): for i, col_value_counts in enumerate(batch): total_counts[i].update(col_value_counts) def most_common(counter: Counter, n: int): return Counter(dict(counter.most_common(n))) top_counts = [ most_common(counter, self.max_features) for counter in total_counts ] self.stats_ = { f"token_counts({col})": counts for (col, counts) in zip(self.columns, top_counts) } return self def _transform_pandas(self, df: pd.DataFrame): for col in self.columns: token_counts = self.stats_[f"token_counts({col})"] sorted_tokens = [token for (token, count) in token_counts.most_common()] tokenized = df[col].map(self.tokenization_fn).map(Counter) for token in sorted_tokens: df[f"{col}_{token}"] = tokenized.map(lambda val: val[token]) # Drop original columns. 
df.drop(columns=self.columns, inplace=True) return df def __repr__(self): fn_name = getattr(self.tokenization_fn, "__name__", self.tokenization_fn) return ( f"{self.__class__.__name__}(columns={self.columns!r}, " f"tokenization_fn={fn_name}, max_features={self.max_features!r})" )
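
# Illustrative sketch (not part of the original module): the vocabulary-building
# step that ``CountVectorizer._fit`` performs, shown on a plain list of strings
# instead of a Dataset. The function name and example values are made up; the
# real preprocessor does the same counting per batch via ``Dataset.map_batches``
# and then merges the per-batch counters.
def _example_vocabulary(
    documents: List[str], max_features: Optional[int] = None
) -> Counter:
    # Tokenize each document and accumulate token frequencies across the corpus.
    total = Counter()
    for document in documents:
        total.update(simple_split_tokenizer(document))
    # Keep only the ``max_features`` most frequent tokens, mirroring the
    # ``most_common`` truncation in ``_fit``.
    if max_features is not None:
        total = Counter(dict(total.most_common(max_features)))
    return total


# For example, ``_example_vocabulary(["Jimmy likes volleyball",
# "Bob likes volleyball too"], max_features=2)`` keeps "likes" and "volleyball",
# which would then become the ``corpus_likes`` and ``corpus_volleyball`` columns.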