Source code for ray.serve.llm.deployments
from ray.llm._internal.serve.deployments.llm.vllm.vllm_deployment import (
    VLLMService as _VLLMService,
)
from ray.llm._internal.serve.deployments.routers.router import (
    LLMRouter as _LLMRouter,
)
from ray.util.annotations import PublicAPI

@PublicAPI(stability="alpha")
class VLLMService(_VLLMService):
    """The implementation of the vLLM engine deployment.

    To build a VLLMDeployment object, use the `build_vllm_deployment` function.
    We also expose a lower-level API for more control over the deployment class
    through the `as_deployment` method.

    Examples:
        .. testcode::
            :skipif: True

            from ray import serve
            from ray.serve.config import AutoscalingConfig
            from ray.serve.llm.configs import LLMConfig, ModelLoadingConfig, DeploymentConfig
            from ray.serve.llm.deployments import VLLMService
            from ray.serve.llm.openai_api_models import ChatCompletionRequest

            # Configure the model
            llm_config = LLMConfig(
                model_loading_config=ModelLoadingConfig(
                    served_model_name="llama-3.1-8b",
                    model_source="meta-llama/Llama-3.1-8b-instruct",
                ),
                deployment_config=DeploymentConfig(
                    autoscaling_config=AutoscalingConfig(
                        min_replicas=1,
                        max_replicas=8,
                    )
                ),
            )

            # Build the deployment directly
            VLLMDeployment = VLLMService.as_deployment(llm_config.get_serve_options())
            vllm_app = VLLMDeployment.bind(llm_config)
            model_handle = serve.run(vllm_app)

            # Query the model via the `chat` API
            request = ChatCompletionRequest(
                model="llama-3.1-8b",
                messages=[
                    {
                        "role": "user",
                        "content": "Hello, world!",
                    }
                ],
            )
            response = model_handle.chat.remote(request).result()
            print(response)
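
        The `build_vllm_deployment` helper mentioned above wraps the same steps.
        A minimal sketch, assuming the helper is importable from
        `ray.serve.llm.builders` and accepts an `LLMConfig`:

        .. testcode::
            :skipif: True

            from ray.serve.llm.builders import build_vllm_deployment

            # Build and deploy the same application from the LLMConfig above.
            vllm_app = build_vllm_deployment(llm_config)
            model_handle = serve.run(vllm_app)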
"""
pass

@PublicAPI(stability="alpha")
class LLMRouter(_LLMRouter):
    """The implementation of the OpenAI-compatible model router.

    This deployment creates the following endpoints:

    - /v1/chat/completions: Chat interface (OpenAI-style)
    - /v1/completions: Text completion
    - /v1/models: List available models
    - /v1/models/{model}: Model information

    A sketch of querying these endpoints with an OpenAI-compatible client
    follows the deployment example below.
    Examples:
        .. testcode::
            :skipif: True

            from ray import serve
            from ray.serve.config import AutoscalingConfig
            from ray.serve.llm.configs import LLMConfig, ModelLoadingConfig, DeploymentConfig
            from ray.serve.llm.deployments import VLLMService, LLMRouter

            llm_config1 = LLMConfig(
                model_loading_config=ModelLoadingConfig(
                    served_model_name="llama-3.1-8b",  # Name shown in /v1/models
                    model_source="meta-llama/Llama-3.1-8b-instruct",
                ),
                deployment_config=DeploymentConfig(
                    autoscaling_config=AutoscalingConfig(
                        min_replicas=1, max_replicas=8,
                    )
                ),
            )

            llm_config2 = LLMConfig(
                model_loading_config=ModelLoadingConfig(
                    served_model_name="llama-3.2-3b",  # Name shown in /v1/models
                    model_source="meta-llama/Llama-3.2-3b-instruct",
                ),
                deployment_config=DeploymentConfig(
                    autoscaling_config=AutoscalingConfig(
                        min_replicas=1, max_replicas=8,
                    )
                ),
            )

            # Deploy the application
            vllm_deployment1 = VLLMService.as_deployment(
                llm_config1.get_serve_options()
            ).bind(llm_config1)
            vllm_deployment2 = VLLMService.as_deployment(
                llm_config2.get_serve_options()
            ).bind(llm_config2)
            llm_app = LLMRouter.as_deployment().bind(
                [vllm_deployment1, vllm_deployment2]
            )
            serve.run(llm_app)
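
        Once the application is running, the endpoints listed above can be
        queried over HTTP. A minimal sketch using the `openai` client,
        assuming Serve's default HTTP address of `http://localhost:8000`
        (any OpenAI-compatible client works the same way):

        .. testcode::
            :skipif: True

            from openai import OpenAI

            # Point the client at the Serve HTTP endpoint. The API key is not
            # checked by the router but is required by the client.
            client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

            response = client.chat.completions.create(
                model="llama-3.1-8b",
                messages=[{"role": "user", "content": "Hello, world!"}],
            )
            print(response.choices[0].message.content)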
"""
pass