Source code for ray.serve.llm.builders

from typing import TYPE_CHECKING

from ray.util.annotations import PublicAPI

if TYPE_CHECKING:
    from ray.serve.deployment import Application
    from ray.serve.llm.configs import LLMConfig, LLMServingArgs


@PublicAPI(stability="alpha")
def build_vllm_deployment(llm_config: "LLMConfig") -> "Application":
    """Helper to build a single vLLM deployment from the given LLM config.

    Examples:
        .. testcode::
            :skipif: True

            from ray import serve
            from ray.serve.llm.configs import LLMConfig
            from ray.serve.llm.builders import build_vllm_deployment

            # Configure the model
            llm_config = LLMConfig(
                model_loading_config=dict(
                    model_id="llama-3.1-8b",
                    model_source="meta-llama/Llama-3.1-8b-instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1,
                        max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            # Build the deployment
            vllm_app = build_vllm_deployment(llm_config)

            # Deploy the application
            model_handle = serve.run(vllm_app)

            # Query the model handle
            import asyncio

            model_handle = model_handle.options(stream=True)

            async def query_model(model_handle):
                from ray.serve.llm.openai_api_models import ChatCompletionRequest

                request = ChatCompletionRequest(
                    model="llama-3.1-8b",
                    messages=[
                        {
                            "role": "user",
                            "content": "Hello, world!"
                        }
                    ]
                )

                resp = model_handle.chat.remote(request)
                async for message in resp:
                    print("message: ", message)

            asyncio.run(query_model(model_handle))
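
            # Optional cleanup step (not part of the original example):
            # tear down the running Serve application when finished.
            serve.shutdown()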

    Args:
        llm_config: The LLM config used to build the vLLM deployment.

    Returns:
        The configured Ray Serve Application for the vLLM deployment.
"""
from ray.llm._internal.serve.builders import build_vllm_deployment
return build_vllm_deployment(llm_config=llm_config)


@PublicAPI(stability="alpha")
def build_openai_app(llm_serving_args: "LLMServingArgs") -> "Application":
    """Helper to build an OpenAI-compatible app with the LLM deployments set up
    from the given LLM serving args. This is the main entry point for users to
    create a Serve application that serves LLMs.

    Examples:
        .. testcode::
            :skipif: True

            from ray import serve
            from ray.serve.llm.configs import LLMConfig
            from ray.serve.llm.deployments import VLLMService, LLMRouter

            llm_config1 = LLMConfig(
                model_loading_config=dict(
                    model_id="qwen-0.5b",
                    model_source="Qwen/Qwen2.5-0.5B-Instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1, max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            llm_config2 = LLMConfig(
                model_loading_config=dict(
                    model_id="qwen-1.5b",
                    model_source="Qwen/Qwen2.5-1.5B-Instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1, max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            # Deploy the application
            deployment1 = VLLMService.as_deployment().bind(llm_config1)
            deployment2 = VLLMService.as_deployment().bind(llm_config2)
            llm_app = LLMRouter.as_deployment().bind([deployment1, deployment2])
            serve.run(llm_app)
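
            # The same application can also be built with ``build_openai_app``
            # itself. A minimal sketch, assuming ``LLMServingArgs`` accepts the
            # configs through an ``llm_configs`` list:
            #
            #     from ray.serve.llm.builders import build_openai_app
            #     from ray.serve.llm.configs import LLMServingArgs
            #
            #     llm_app = build_openai_app(
            #         LLMServingArgs(llm_configs=[llm_config1, llm_config2])
            #     )
            #     serve.run(llm_app)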

            # Query the model via the OpenAI client
            from openai import OpenAI

            # Initialize the client
            client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

            # Basic chat completion
            response = client.chat.completions.create(
                model="qwen-0.5b",
                messages=[{"role": "user", "content": "Hello!"}]
            )
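
            # Inspect the reply; the generated text lives on the standard
            # OpenAI client response object.
            print(response.choices[0].message.content)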

    Args:
        llm_serving_args: The list of LLM configs, or paths to LLM config files,
            used to build the app.

    Returns:
        The configured Ray Serve Application router.
    """
    from ray.llm._internal.serve.builders import build_openai_app

    return build_openai_app(llm_serving_args=llm_serving_args)