From 45379fc8152a0171b07178acd7a8e8d9cd917c10 Mon Sep 17 00:00:00 2001
From: Bihan  Rana <bihan@Bihans-MacBook-Pro.local>
Date: Tue, 26 May 2026 13:51:46 +0545
Subject: [PATCH 1/5] [Docs]Add Miles Example

---
 mkdocs.yml                             |   3 +
 mkdocs/docs/concepts/tasks.md          |   2 +-
 mkdocs/docs/examples.md                |  11 +
 mkdocs/docs/examples/training/miles.md | 270 +++++++++++++++++++++++++
 4 files changed, 285 insertions(+), 1 deletion(-)
 create mode 100644 mkdocs/docs/examples/training/miles.md
diff --git a/mkdocs.yml b/mkdocs.yml
index 5de85a37d..debf86167 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -171,12 +171,14 @@ plugins:
         "examples/distributed-training/trl.md": "docs/examples/training/trl.md"
         "examples/distributed-training/axolotl.md": "docs/examples/training/axolotl.md"
         "examples/distributed-training/ray-ragen.md": "docs/examples/training/ray-ragen.md"
+        "examples/distributed-training/miles.md": "docs/examples/training/miles.md"
         # Single-node and distributed training merged under Training
         "docs/examples/single-node-training/trl.md": "docs/examples/training/trl.md"
         "docs/examples/single-node-training/axolotl.md": "docs/examples/training/axolotl.md"
         "docs/examples/distributed-training/trl.md": "docs/examples/training/trl.md"
         "docs/examples/distributed-training/axolotl.md": "docs/examples/training/axolotl.md"
         "docs/examples/distributed-training/ray-ragen.md": "docs/examples/training/ray-ragen.md"
+        "docs/examples/distributed-training/miles.md": "docs/examples/training/miles.md"
         "examples/clusters/aws.md": "docs/examples/clusters/aws.md"
         "examples/clusters/gcp.md": "docs/examples/clusters/gcp.md"
         "examples/clusters/lambda.md": "docs/examples/clusters/lambda.md"
@@ -312,6 +314,7 @@ nav:
             - TRL: docs/examples/training/trl.md
             - Axolotl: docs/examples/training/axolotl.md
             - Ray+RAGEN: docs/examples/training/ray-ragen.md
+            - Miles: docs/examples/training/miles.md
         - Clusters:
             - AWS: docs/examples/clusters/aws.md
             - GCP: docs/examples/clusters/gcp.md
diff --git a/mkdocs/docs/concepts/tasks.md b/mkdocs/docs/concepts/tasks.md
index 43eb8e80c..d3a3515b8 100644
--- a/mkdocs/docs/concepts/tasks.md
+++ b/mkdocs/docs/concepts/tasks.md
@@ -151,7 +151,7 @@ Jobs on each node communicate using their private IP addresses. Use `DSTACK_MAST
 `dstack` is easy to use with `accelerate`, `torchrun`, Ray, Spark, and any other distributed frameworks.
     
 !!! info "Examples"
-    See the training examples for [TRL](../examples/training/trl.md#distributed-training), [Axolotl](../examples/training/axolotl.md#distributed-training), and [Ray+RAGEN](../examples/training/ray-ragen.md).
+    See the training examples for [TRL](../examples/training/trl.md#distributed-training), [Axolotl](../examples/training/axolotl.md#distributed-training), [Ray+RAGEN](../examples/training/ray-ragen.md), and [Miles](../examples/training/miles.md).
 
     See the cluster examples for [AWS](../examples/clusters/aws.md), [GCP](../examples/clusters/gcp.md), [Lambda](../examples/clusters/lambda.md), [Crusoe](../examples/clusters/crusoe.md), [Nebius](../examples/clusters/nebius.md), and [NCCL/RCCL tests](../examples/clusters/nccl-rccl-tests.md).
 
diff --git a/mkdocs/docs/examples.md b/mkdocs/docs/examples.md
index 59203cdf8..ad8bf4839 100644
--- a/mkdocs/docs/examples.md
+++ b/mkdocs/docs/examples.md
@@ -49,6 +49,17 @@ hide:
             Fine-tune an agent on multiple nodes with RAGEN, verl, and Ray.
         </p>
     </a>
+
+    <a href="/docs/examples/training/miles"
+       class="feature-cell">
+        <h3>
+            Miles
+        </h3>
+
+        <p>
+            RL-fine-tune Qwen2.5-32B with Miles.
+        </p>
+    </a>
 </div>
 
 
diff --git a/mkdocs/docs/examples/training/miles.md b/mkdocs/docs/examples/training/miles.md
new file mode 100644
index 000000000..9f2c9df33
--- /dev/null
+++ b/mkdocs/docs/examples/training/miles.md
@@ -0,0 +1,270 @@
+---
+title: Miles
+description: RL-fine-tune Qwen2.5-32B with Miles, SGLang, Megatron-LM, and Ray across two 8xH100 nodes
+---
+
+# Miles
+
+This example shows how to use `dstack` and [Miles](https://github.com/radixark/miles)
+to RL-fine-tune a 32B language model with [GRPO](https://arxiv.org/abs/2402.03300) across two 8xH100 nodes. Under the hood Miles uses [SGLang](https://github.com/sgl-project/sglang) for high-throughput rollout, [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) for training, and [Ray](https://docs.ray.io/en/latest/) to coordinate the trainer
+and rollout actors across nodes.
+
+Here we fine-tune `Qwen/Qwen2.5-32B-Instruct` on [GSM8K](https://huggingface.co/datasets/openai/gsm8k) dataset.
+
+!!! info "Prerequisites"
+    Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](../../concepts/fleets.md#cluster-placement) or an [SSH fleet](../../concepts/fleets.md#ssh-placement)).
+
+## Run a Ray cluster
+
+The task below starts a Ray cluster across both nodes and performs the one-time setup on each node. The setup includes: downloading the model, downloading the dataset, and converting the checkpoint to Megatron's `torch_dist` format.
+
+<div editor-title="miles-qwen32b-h100.dstack.yml">
+
+```yaml
+type: task
+name: miles-qwen32b-h100
+nodes: 2
+image: radixark/miles:sglang-miles-v0.5.12
+env:
+  - WANDB_API_KEY
+  - PYTHONPATH=/root/Megatron-LM
+  - NCCL_DEBUG=INFO
+  - MODEL_ID=Qwen/Qwen2.5-32B-Instruct
+commands:
+  # 1. Download Model + dataset
+  - pip install -U "huggingface_hub[cli]"
+  - hf download "$MODEL_ID" --local-dir "/root/$(basename "$MODEL_ID")"
+  - hf download --repo-type dataset openai/gsm8k --local-dir /root/gsm8k
+  # 2. Convert HF -> Megatron torch_dist
+  - |
+    MODEL_NAME="$(basename "$MODEL_ID")"
+    cd /root/miles && python tools/convert_hf_to_torch_dist.py \
+      --swiglu \
+      --num-layers 64 \
+      --hidden-size 5120 \
+      --ffn-hidden-size 27648 \
+      --num-attention-heads 40 \
+      --use-rotary-position-embeddings \
+      --disable-bias-linear \
+      --add-qkv-bias \
+      --normalization RMSNorm \
+      --norm-epsilon 1e-5 \
+      --rotary-base 1000000 \
+      --group-query-attention \
+      --num-query-groups 8 \
+      --vocab-size 152064 \
+      --untie-embeddings-and-output-weights \
+      --hf-checkpoint "/root/$MODEL_NAME" \
+      --save "/root/${MODEL_NAME}_torch_dist"
+  # 3. Start the Ray cluster.
+  - |
+    if [ $DSTACK_NODE_RANK = 0 ]; then
+      ray start --head --port=6379
+    else
+      ray start --address=$DSTACK_MASTER_NODE_IP:6379
+    fi
+ports:
+  - 8265
+privileged: true
+resources:
+  gpu: H100:8
+  shm_size: 32GB
+  disk: 1000GB..
+volumes:
+  - /checkpoints:/checkpoints
+```
+
+</div>
+
+Now, if you run this task via `dstack apply`, it will automatically forward the Ray's dashboard port to `localhost:8265`.
+
+<div class="termy">
+
+```shell
+$ export WANDB_API_KEY=...
+$ dstack apply -f miles-qwen32b-h100.dstack.yml
+```
+
+</div>
+
+As long as the `dstack apply` is attached, you can use `localhost:8265` to submit Ray jobs for execution. If `dstack apply` is detached, you can use `dstack attach` to re-attach.
+
+## Submit Ray jobs
+
+Before you can submit Ray jobs, ensure `ray` is installed locally:
+
+<div class="termy">
+
+```shell
+$ pip install ray
+```
+
+</div>
+
+The submit script below runs the Miles training job on the Ray cluster. The
+model is sharded across all 8 GPUs per node via tensor parallelism, and SGLang
+uses the same 8 GPUs per node for rollout.
+
+<div editor-title="submit-miles-train.sh">
+
+```bash
+#!/bin/bash
+set -euo pipefail
+
+export RAY_ADDRESS=http://localhost:8265
+
+: "${NUM_NODES:?NUM_NODES is not set}"
+: "${GPUS_PER_NODE:?GPUS_PER_NODE is not set}"
+TOTAL_GPUS=$((NUM_NODES * GPUS_PER_NODE))
+
+MODEL_ID="Qwen/Qwen2.5-32B-Instruct"
+MODEL_NAME="$(basename "$MODEL_ID")"
+HF_CHECKPOINT="/root/$MODEL_NAME"
+REF_LOAD="/root/${MODEL_NAME}_torch_dist"
+PROMPT_DATA="/root/gsm8k/main/train-00000-of-00001.parquet"
+EVAL_PROMPT_DATA="/root/gsm8k/main/test-00000-of-00001.parquet"
+INPUT_KEY="question"
+LABEL_KEY="answer"
+EVAL_DATASET_NAME="gsm8k"
+CHECKPOINT_DIR="/checkpoints/${MODEL_NAME}-${EVAL_DATASET_NAME}"
+SAVE_INTERVAL=10
+WANDB_PROJECT="dstack-miles-RL"
+WANDB_GROUP="${MODEL_NAME}-gsm8k-${NUM_NODES}node-${GPUS_PER_NODE}gpu"
+WANDB_NAME="rollout-$(date +%Y%m%d-%H%M%S)"
+ROLLOUT_GPUS_PER_ENGINE=8
+
+echo "===== submit-miles env ====="
+echo "NUM_NODES=${NUM_NODES}"
+echo "GPUS_PER_NODE=${GPUS_PER_NODE}"
+echo "TOTAL_GPUS=${TOTAL_GPUS}"
+echo "MODEL_ID=${MODEL_ID}"
+echo "HF_CHECKPOINT=${HF_CHECKPOINT}"
+echo "CHECKPOINT_DIR=${CHECKPOINT_DIR}"
+echo "============================"
+
+
+
+CMD='cd /root/miles && python3 train.py \
+  --actor-num-nodes '"$NUM_NODES"' \
+  --actor-num-gpus-per-node '"$GPUS_PER_NODE"' \
+  --num-gpus-per-node '"$GPUS_PER_NODE"' \
+  --rollout-num-gpus-per-engine '"$ROLLOUT_GPUS_PER_ENGINE"' \
+  --sglang-server-concurrency 128 \
+  --colocate \
+  --calculate-per-token-loss \
+  --use-miles-router \
+  --swiglu \
+  --num-layers 64 \
+  --hidden-size 5120 \
+  --ffn-hidden-size 27648 \
+  --num-attention-heads 40 \
+  --use-rotary-position-embeddings \
+  --disable-bias-linear \
+  --add-qkv-bias \
+  --normalization RMSNorm \
+  --norm-epsilon 1e-5 \
+  --rotary-base 1000000 \
+  --group-query-attention \
+  --num-query-groups 8 \
+  --vocab-size 152064 \
+  --untie-embeddings-and-output-weights \
+  --hf-checkpoint '"$HF_CHECKPOINT"' \
+  --ref-load '"$REF_LOAD"' \
+  --prompt-data '"$PROMPT_DATA"' \
+  --input-key '"$INPUT_KEY"' \
+  --label-key '"$LABEL_KEY"' \
+  --apply-chat-template \
+  --rollout-shuffle \
+  --rm-type math \
+  --num-rollout 20 \
+  --rollout-batch-size 8 \
+  --n-samples-per-prompt 8 \
+  --rollout-max-response-len 512 \
+  --rollout-temperature 1 \
+  --global-batch-size 64 \
+  --eval-interval 5 \
+  --eval-prompt-data '"$EVAL_DATASET_NAME"' '"$EVAL_PROMPT_DATA"' \
+  --n-samples-per-eval-prompt 1 \
+  --eval-max-response-len 512 \
+  --eval-top-k 1 \
+  --tensor-model-parallel-size 8 \
+  --sequence-parallel \
+  --pipeline-model-parallel-size 1 \
+  --context-parallel-size 1 \
+  --expert-model-parallel-size 1 \
+  --expert-tensor-parallel-size 1 \
+  --use-dynamic-batch-size \
+  --max-tokens-per-gpu 9216 \
+  --advantage-estimator grpo \
+  --use-kl-loss \
+  --kl-loss-coef 0.00 \
+  --kl-loss-type low_var_kl \
+  --kl-coef 0.00 \
+  --entropy-coef 0.00 \
+  --eps-clip 0.2 \
+  --eps-clip-high 0.28 \
+  --optimizer adam \
+  --lr 1e-6 \
+  --lr-decay-style constant \
+  --weight-decay 0.1 \
+  --adam-beta1 0.9 \
+  --adam-beta2 0.98 \
+  --sglang-mem-fraction-static 0.7 \
+  --use-wandb \
+  --wandb-host https://wandb.ai/ \
+  --wandb-project '"$WANDB_PROJECT"' \
+  --wandb-group '"$WANDB_GROUP"' \
+  --wandb-exp-name '"$WANDB_NAME"' \
+  --attention-dropout 0.0 \
+  --hidden-dropout 0.0 \
+  --accumulate-allreduce-grads-in-fp32 \
+  --attention-softmax-in-fp32 \
+  --attention-backend flash \
+  --save '"$CHECKPOINT_DIR"' \
+  --save-interval '"$SAVE_INTERVAL"''
+
+# GLOO_SOCKET_IFNAME=eth0 is required for multi-node Gloo process group init.
+# Without it, Gloo resolves to a loopback address (127.0.1.1) instead of the
+# inter-node interface, causing `init_gloo_group()` to timeout.
+RUNTIME_ENV_JSON=$(cat <<EOF
+{
+  "env_vars": {
+    "PYTHONPATH": "/root/Megatron-LM",
+    "CUDA_DEVICE_MAX_CONNECTIONS": "1",
+    "NCCL_DEBUG": "INFO",
+    "GLOO_SOCKET_IFNAME": "eth0"
+  }
+}
+EOF
+)
+
+ray job submit \
+  --address="$RAY_ADDRESS" \
+  --runtime-env-json="$RUNTIME_ENV_JSON" \
+  -- bash -lc "$CMD"
+```
+
+</div>
+
+Then run it:
+
+<div class="termy">
+
+```shell
+$ NUM_NODES=2 GPUS_PER_NODE=8 bash submit-miles-train.sh
+```
+
+</div>
+
+!!! info "Important parameters"
+    1. `--tensor-model-parallel-size 8` shards the 32B model across all 8 GPUs
+       per node.
+    2. `--rollout-num-gpus-per-engine 8` starts SGLang engine with TP-8 on each node.
+    3. `--sglang-server-concurrency` sets how many requests SGLang processes concurrently.
+    4. `--max-tokens-per-gpu 9216` sets number of tokens each GPU processes. Lower this if Megatron OOM's during training.
+    5. `--sglang-mem-fraction-static 0.7` sets fraction of GPU memory SGLang pre-allocates for its KV cache. Lower this if Megatron OOM's at startup.
+
+!!! info "What's next"
+    1. Read about [distributed tasks](../../concepts/tasks.md#distributed-tasks)
+       and [fleets](../../concepts/fleets.md) for larger-scale setups.
+    2. Browse Miles' [examples](https://github.com/radixark/miles/tree/main/examples)

From 8ce6f07abeedd7e3d88d7fcd8fbe7360f2212183 Mon Sep 17 00:00:00 2001
From: Andrey Cheptsov <andrey.cheptsov@github.com>
Date: Tue, 26 May 2026 15:45:31 +0300
Subject: [PATCH 2/5] [Docs] Minor Miles training example

---
 mkdocs/docs/examples/training/miles.md | 77 +++++++++++++++-----------
 1 file changed, 44 insertions(+), 33 deletions(-)

diff --git a/mkdocs/docs/examples/training/miles.md b/mkdocs/docs/examples/training/miles.md
index 9f2c9df33..d1c52db01 100644
--- a/mkdocs/docs/examples/training/miles.md
+++ b/mkdocs/docs/examples/training/miles.md
@@ -1,22 +1,32 @@
 ---
 title: Miles
-description: RL-fine-tune Qwen2.5-32B with Miles, SGLang, Megatron-LM, and Ray across two 8xH100 nodes
+description: RL fine-tuning Qwen2.5-32B with Miles, SGLang, Megatron-LM, and Ray across two 8xH100 nodes
 ---
 
 # Miles
 
 This example shows how to use `dstack` and [Miles](https://github.com/radixark/miles)
-to RL-fine-tune a 32B language model with [GRPO](https://arxiv.org/abs/2402.03300) across two 8xH100 nodes. Under the hood Miles uses [SGLang](https://github.com/sgl-project/sglang) for high-throughput rollout, [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) for training, and [Ray](https://docs.ray.io/en/latest/) to coordinate the trainer
-and rollout actors across nodes.
+to fine-tune a 32B language model with [GRPO](https://arxiv.org/abs/2402.03300)
+across a multi-node cluster.
+Miles uses [SGLang](https://github.com/sgl-project/sglang) for high-throughput
+rollouts, [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) for training,
+and [Ray](https://docs.ray.io/en/latest/) to coordinate the trainer and rollout
+actors across nodes.
 
-Here we fine-tune `Qwen/Qwen2.5-32B-Instruct` on [GSM8K](https://huggingface.co/datasets/openai/gsm8k) dataset.
+Here we fine-tune `Qwen/Qwen2.5-32B-Instruct` on the
+[GSM8K](https://huggingface.co/datasets/openai/gsm8k) dataset.
 
 !!! info "Prerequisites"
-    Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](../../concepts/fleets.md#cluster-placement) or an [SSH fleet](../../concepts/fleets.md#ssh-placement)).
+    Before running a distributed task, make sure to create a [fleet](../../concepts/fleets.md)
+    with `placement` set to [`cluster`](../../concepts/fleets.md#cluster-placement).
 
 ## Run a Ray cluster
 
-The task below starts a Ray cluster across both nodes and performs the one-time setup on each node. The setup includes: downloading the model, downloading the dataset, and converting the checkpoint to Megatron's `torch_dist` format.
+### Define a configuration
+
+The [task](../../concepts/tasks.md) below starts Ray on two nodes and prepares
+each node by downloading the model and dataset, then converting the checkpoint
+to Megatron's `torch_dist` format.
 
 <div editor-title="miles-qwen32b-h100.dstack.yml">
 
@@ -31,11 +41,11 @@ env:
   - NCCL_DEBUG=INFO
   - MODEL_ID=Qwen/Qwen2.5-32B-Instruct
 commands:
-  # 1. Download Model + dataset
+  # 1. Download the model and dataset.
   - pip install -U "huggingface_hub[cli]"
   - hf download "$MODEL_ID" --local-dir "/root/$(basename "$MODEL_ID")"
   - hf download --repo-type dataset openai/gsm8k --local-dir /root/gsm8k
-  # 2. Convert HF -> Megatron torch_dist
+  # 2. Convert the Hugging Face checkpoint to Megatron torch_dist.
   - |
     MODEL_NAME="$(basename "$MODEL_ID")"
     cd /root/miles && python tools/convert_hf_to_torch_dist.py \
@@ -56,7 +66,7 @@ commands:
       --untie-embeddings-and-output-weights \
       --hf-checkpoint "/root/$MODEL_NAME" \
       --save "/root/${MODEL_NAME}_torch_dist"
-  # 3. Start the Ray cluster.
+  # 3. Start Ray.
   - |
     if [ $DSTACK_NODE_RANK = 0 ]; then
       ray start --head --port=6379
@@ -76,7 +86,10 @@ volumes:
 
 </div>
 
-Now, if you run this task via `dstack apply`, it will automatically forward the Ray's dashboard port to `localhost:8265`.
+### Run the configuration
+
+Run the task with [`dstack apply`](../../reference/cli/dstack/apply.md). By
+default, `dstack apply` forwards the Ray dashboard port to `localhost:8265`.
 
 <div class="termy">
 
@@ -87,11 +100,14 @@ $ dstack apply -f miles-qwen32b-h100.dstack.yml
 
 </div>
 
-As long as the `dstack apply` is attached, you can use `localhost:8265` to submit Ray jobs for execution. If `dstack apply` is detached, you can use `dstack attach` to re-attach.
+While `dstack apply` is attached, you can submit Ray jobs through
+`localhost:8265`. If you detach or run from another machine, use
+[`dstack attach`](../../reference/cli/dstack/attach.md) to re-attach and make
+the dashboard port accessible on `localhost`.
 
 ## Submit Ray jobs
 
-Before you can submit Ray jobs, ensure `ray` is installed locally:
+Install `ray` locally before submitting jobs:
 
 <div class="termy">
 
@@ -102,7 +118,7 @@ $ pip install ray
 </div>
 
 The submit script below runs the Miles training job on the Ray cluster. The
-model is sharded across all 8 GPUs per node via tensor parallelism, and SGLang
+model is sharded across all 8 GPUs per node with tensor parallelism, and SGLang
 uses the same 8 GPUs per node for rollout.
 
 <div editor-title="submit-miles-train.sh">
@@ -115,7 +131,6 @@ export RAY_ADDRESS=http://localhost:8265
 
 : "${NUM_NODES:?NUM_NODES is not set}"
 : "${GPUS_PER_NODE:?GPUS_PER_NODE is not set}"
-TOTAL_GPUS=$((NUM_NODES * GPUS_PER_NODE))
 
 MODEL_ID="Qwen/Qwen2.5-32B-Instruct"
 MODEL_NAME="$(basename "$MODEL_ID")"
@@ -133,17 +148,6 @@ WANDB_GROUP="${MODEL_NAME}-gsm8k-${NUM_NODES}node-${GPUS_PER_NODE}gpu"
 WANDB_NAME="rollout-$(date +%Y%m%d-%H%M%S)"
 ROLLOUT_GPUS_PER_ENGINE=8
 
-echo "===== submit-miles env ====="
-echo "NUM_NODES=${NUM_NODES}"
-echo "GPUS_PER_NODE=${GPUS_PER_NODE}"
-echo "TOTAL_GPUS=${TOTAL_GPUS}"
-echo "MODEL_ID=${MODEL_ID}"
-echo "HF_CHECKPOINT=${HF_CHECKPOINT}"
-echo "CHECKPOINT_DIR=${CHECKPOINT_DIR}"
-echo "============================"
-
-
-
 CMD='cd /root/miles && python3 train.py \
   --actor-num-nodes '"$NUM_NODES"' \
   --actor-num-gpus-per-node '"$GPUS_PER_NODE"' \
@@ -246,7 +250,7 @@ ray job submit \
 
 </div>
 
-Then run it:
+Submit the job with the same cluster shape as the task:
 
 <div class="termy">
 
@@ -256,15 +260,22 @@ $ NUM_NODES=2 GPUS_PER_NODE=8 bash submit-miles-train.sh
 
 </div>
 
-!!! info "Important parameters"
+!!! info "Training parameters"
     1. `--tensor-model-parallel-size 8` shards the 32B model across all 8 GPUs
        per node.
-    2. `--rollout-num-gpus-per-engine 8` starts SGLang engine with TP-8 on each node.
-    3. `--sglang-server-concurrency` sets how many requests SGLang processes concurrently.
-    4. `--max-tokens-per-gpu 9216` sets number of tokens each GPU processes. Lower this if Megatron OOM's during training.
-    5. `--sglang-mem-fraction-static 0.7` sets fraction of GPU memory SGLang pre-allocates for its KV cache. Lower this if Megatron OOM's at startup.
+    2. `--rollout-num-gpus-per-engine 8` starts SGLang with TP-8 on each node.
+    3. `--sglang-server-concurrency` sets how many requests SGLang processes
+       concurrently.
+    4. `--max-tokens-per-gpu 9216` sets the per-GPU token budget. Lower this if
+       Megatron OOMs during training.
+    5. `--sglang-mem-fraction-static 0.7` sets the SGLang KV cache memory
+       fraction. Lower this if Megatron OOMs at startup.
+
+Using Ray via `dstack` gives you access to the Ray ecosystem while benefiting
+from `dstack`'s provisioning capabilities.
 
 !!! info "What's next"
     1. Read about [distributed tasks](../../concepts/tasks.md#distributed-tasks)
-       and [fleets](../../concepts/fleets.md) for larger-scale setups.
-    2. Browse Miles' [examples](https://github.com/radixark/miles/tree/main/examples)
+       and [fleets](../../concepts/fleets.md)
+    2. See the [SGLang inference](../inference/sglang.md) example
+    3. Browse Miles' [examples](https://github.com/radixark/miles/tree/main/examples)

From 54fc412f2eecad21ef80e2cdf3322f2463a31a2f Mon Sep 17 00:00:00 2001
From: Andrey Cheptsov <andrey.cheptsov@github.com>
Date: Tue, 26 May 2026 15:46:32 +0300
Subject: [PATCH 3/5] [Docs] Minor Miles training example changes

---
 mkdocs/docs/examples/training/miles.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mkdocs/docs/examples/training/miles.md b/mkdocs/docs/examples/training/miles.md
index d1c52db01..1b94b8615 100644
--- a/mkdocs/docs/examples/training/miles.md
+++ b/mkdocs/docs/examples/training/miles.md
@@ -75,7 +75,6 @@ commands:
     fi
 ports:
   - 8265
-privileged: true
 resources:
   gpu: H100:8
   shm_size: 32GB

From 51a6edb30f8f64f9ff56f70b1c9f87ff3fc66388 Mon Sep 17 00:00:00 2001
From: Andrey Cheptsov <andrey.cheptsov@github.com>
Date: Tue, 26 May 2026 15:59:41 +0300
Subject: [PATCH 4/5] [Docs] Minor Miles training example changes

---
 mkdocs/docs/examples/training/miles.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/mkdocs/docs/examples/training/miles.md b/mkdocs/docs/examples/training/miles.md
index 1b94b8615..e9f462d6b 100644
--- a/mkdocs/docs/examples/training/miles.md
+++ b/mkdocs/docs/examples/training/miles.md
@@ -1,17 +1,17 @@
 ---
 title: Miles
-description: RL fine-tuning Qwen2.5-32B with Miles, SGLang, Megatron-LM, and Ray across two 8xH100 nodes
+description: RL post-training Qwen2.5-32B with Miles, SGLang, Megatron-LM, and Ray across two 8xH100 nodes
 ---
 
 # Miles
 
 This example shows how to use `dstack` and [Miles](https://github.com/radixark/miles)
-to fine-tune a 32B language model with [GRPO](https://arxiv.org/abs/2402.03300)
-across a multi-node cluster.
-Miles uses [SGLang](https://github.com/sgl-project/sglang) for high-throughput
-rollouts, [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) for training,
-and [Ray](https://docs.ray.io/en/latest/) to coordinate the trainer and rollout
-actors across nodes.
+for reinforcement learning (RL) post-training of a 32B language model with
+[GRPO](https://arxiv.org/abs/2402.03300) across a multi-node cluster.
+Miles integrates [SGLang](https://github.com/sgl-project/sglang) for
+high-throughput rollouts, [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
+for training, and [Ray](https://docs.ray.io/en/latest/) to coordinate the
+trainer and rollout actors across nodes.
 
 Here we fine-tune `Qwen/Qwen2.5-32B-Instruct` on the
 [GSM8K](https://huggingface.co/datasets/openai/gsm8k) dataset.

From fd107f67eed35ed07af8c7cc8bab81c8397f6cc6 Mon Sep 17 00:00:00 2001
From: Andrey Cheptsov <andrey.cheptsov@github.com>
Date: Tue, 26 May 2026 17:05:11 +0300
Subject: [PATCH 5/5] [Docs] Minor Miles training example changes

---
 mkdocs/docs/examples/training/miles.md | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/mkdocs/docs/examples/training/miles.md b/mkdocs/docs/examples/training/miles.md
index e9f462d6b..59451d150 100644
--- a/mkdocs/docs/examples/training/miles.md
+++ b/mkdocs/docs/examples/training/miles.md
@@ -17,8 +17,8 @@ Here we fine-tune `Qwen/Qwen2.5-32B-Instruct` on the
 [GSM8K](https://huggingface.co/datasets/openai/gsm8k) dataset.
 
 !!! info "Prerequisites"
-    Before running a distributed task, make sure to create a [fleet](../../concepts/fleets.md)
-    with `placement` set to [`cluster`](../../concepts/fleets.md#cluster-placement).
+    Multi-node tasks require a [fleet](../../concepts/fleets.md) with
+    `placement` set to [`cluster`](../../concepts/fleets.md#cluster-placement).
 
 ## Run a Ray cluster
 
@@ -104,6 +104,9 @@ While `dstack apply` is attached, you can submit Ray jobs through
 [`dstack attach`](../../reference/cli/dstack/attach.md) to re-attach and make
 the dashboard port accessible on `localhost`.
 
+> To run on a single node, remove `nodes` or set it to `1`, then submit the job
+> with `NUM_NODES=1`. In this case, `placement: cluster` is not required.
+
 ## Submit Ray jobs
 
 Install `ray` locally before submitting jobs: