Source code for ray.serve.llm.deployments
from ray.llm._internal.serve.deployments.llm.vllm.vllm_deployment import (
    VLLMService as _VLLMService,
)
from ray.llm._internal.serve.deployments.routers.router import (
    LLMRouter as _LLMRouter,
)
from ray.util.annotations import PublicAPI

@PublicAPI(stability="alpha")
class VLLMService(_VLLMService):
    """The implementation of the vLLM engine deployment.

    To build a VLLMDeployment object, use the `build_vllm_deployment` function.
    We also expose a lower-level API for more control over the deployment class
    through the `as_deployment` method.

    Examples:
        .. testcode::
            :skipif: True

            from ray import serve
            from ray.serve.config import AutoscalingConfig
            from ray.serve.llm.configs import LLMConfig, ModelLoadingConfig, DeploymentConfig
            from ray.serve.llm.deployments import VLLMService
            from ray.serve.llm.openai_api_models import ChatCompletionRequest

            # Configure the model
            llm_config = LLMConfig(
                model_loading_config=ModelLoadingConfig(
                    served_model_name="llama-3.1-8b",
                    model_source="meta-llama/Llama-3.1-8b-instruct",
                ),
                deployment_config=DeploymentConfig(
                    autoscaling_config=AutoscalingConfig(
                        min_replicas=1,
                        max_replicas=8,
                    )
                ),
            )

            # Build the deployment directly
            VLLMDeployment = VLLMService.as_deployment(llm_config.get_serve_options())
            vllm_app = VLLMDeployment.bind(llm_config)
            model_handle = serve.run(vllm_app)

            # Query the model via the `chat` API
            request = ChatCompletionRequest(
                model="llama-3.1-8b",
                messages=[
                    {
                        "role": "user",
                        "content": "Hello, world!",
                    }
                ],
            )
            response = model_handle.chat.remote(request).result()
            print(response)
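
        The `build_vllm_deployment` helper mentioned above wraps the same steps.
        A minimal sketch, assuming the helper is importable from
        `ray.serve.llm.builders` and accepts an `LLMConfig`:

        .. testcode::
            :skipif: True

            from ray.serve.llm.builders import build_vllm_deployment

            # Build and deploy the same application from the LLMConfig above.
            vllm_app = build_vllm_deployment(llm_config)
            model_handle = serve.run(vllm_app)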
"""
pass

@PublicAPI(stability="alpha")
class LLMRouter(_LLMRouter):
    """The implementation of the OpenAI-compatible model router.

    This deployment creates the following endpoints:

    - /v1/chat/completions: Chat interface (OpenAI-style)
    - /v1/completions: Text completion
    - /v1/models: List available models
    - /v1/models/{model}: Model information

    A sketch of querying these endpoints with an OpenAI-compatible client
    follows the deployment example below.
    Examples:
        .. testcode::
            :skipif: True

            from ray import serve
            from ray.serve.config import AutoscalingConfig
            from ray.serve.llm.configs import LLMConfig, ModelLoadingConfig, DeploymentConfig
            from ray.serve.llm.deployments import VLLMService, LLMRouter

            llm_config1 = LLMConfig(
                model_loading_config=ModelLoadingConfig(
                    served_model_name="llama-3.1-8b",  # Name shown in /v1/models
                    model_source="meta-llama/Llama-3.1-8b-instruct",
                ),
                deployment_config=DeploymentConfig(
                    autoscaling_config=AutoscalingConfig(
                        min_replicas=1, max_replicas=8,
                    )
                ),
            )

            llm_config2 = LLMConfig(
                model_loading_config=ModelLoadingConfig(
                    served_model_name="llama-3.2-3b",  # Name shown in /v1/models
                    model_source="meta-llama/Llama-3.2-3b-instruct",
                ),
                deployment_config=DeploymentConfig(
                    autoscaling_config=AutoscalingConfig(
                        min_replicas=1, max_replicas=8,
                    )
                ),
            )

            # Deploy the application
            vllm_deployment1 = VLLMService.as_deployment(
                llm_config1.get_serve_options()
            ).bind(llm_config1)
            vllm_deployment2 = VLLMService.as_deployment(
                llm_config2.get_serve_options()
            ).bind(llm_config2)
            llm_app = LLMRouter.as_deployment().bind(
                [vllm_deployment1, vllm_deployment2]
            )
            serve.run(llm_app)
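
        Once the application is running, the endpoints listed above can be
        queried over HTTP. A minimal sketch using the `openai` client,
        assuming Serve's default HTTP address of `http://localhost:8000`
        (any OpenAI-compatible client works the same way):

        .. testcode::
            :skipif: True

            from openai import OpenAI

            # Point the client at the Serve HTTP endpoint. The API key is not
            # checked by the router but is required by the client.
            client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

            response = client.chat.completions.create(
                model="llama-3.1-8b",
                messages=[{"role": "user", "content": "Hello, world!"}],
            )
            print(response.choices[0].message.content)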
"""
pass