Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions architecture/compute-runtimes.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,14 @@ template resource limits. Docker and Podman apply them as runtime limits.
Kubernetes mirrors each limit into the matching request. VM accepts the fields
but currently ignores them.

GPU requests enter the driver layer through
`SandboxSpec.resource_requirements.gpu`. The compact interim shape supports a
default GPU request and GPU count. Exact driver-native device selection is
passed through the selected runtime's `driver_config` block; the gateway
selects that block but does not interpret the nested driver schema. Drivers
that support exact selection validate that the number of unique
`gpu_device_ids` entries matches the portable GPU count.

VM runtime state paths are derived only from driver-validated sandbox IDs
matching `[A-Za-z0-9._-]{1,128}`. The gateway-owned VM driver socket uses a
private `run/` directory plus Unix peer UID/PID checks. Standalone
Expand Down
82 changes: 80 additions & 2 deletions crates/openshell-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1215,10 +1215,14 @@ enum SandboxCommands {

/// Target a driver-specific GPU device. Docker and Podman use CDI device IDs
/// (for example "nvidia.com/gpu=0"); VM uses a PCI BDF or index.
/// Only valid with --gpu. When omitted with --gpu, the driver uses its default GPU selection.
#[arg(long, requires = "gpu")]
/// When omitted with --gpu, the driver uses its default GPU selection.
#[arg(long, conflicts_with = "gpu_count")]
gpu_device: Option<String>,

/// Request a specific number of GPUs. Mutually exclusive with --gpu-device.
#[arg(long, value_parser = clap::value_parser!(u32).range(1..), conflicts_with = "gpu_device")]
gpu_count: Option<u32>,

/// CPU limit for the sandbox (for example: 500m, 1, 2.5).
#[arg(long)]
cpu: Option<String>,
Expand Down Expand Up @@ -2539,6 +2543,7 @@ async fn main() -> Result<()> {
editor,
gpu,
gpu_device,
gpu_count,
cpu,
memory,
providers,
Expand Down Expand Up @@ -2608,6 +2613,7 @@ async fn main() -> Result<()> {
keep,
gpu,
gpu_device.as_deref(),
gpu_count,
cpu.as_deref(),
memory.as_deref(),
editor,
Expand Down Expand Up @@ -4287,6 +4293,78 @@ mod tests {
}
}

/// `--gpu-count` on its own must parse, and must leave the `--gpu` flag unset.
#[test]
fn sandbox_create_gpu_count_parses_without_gpu_flag() {
    let args = ["openshell", "sandbox", "create", "--gpu-count", "2"];
    let cli = Cli::try_parse_from(args).expect("sandbox create --gpu-count should parse");

    // Pull the fields of interest out first, then assert on them.
    let (gpu, gpu_count) = match cli.command {
        Some(Commands::Sandbox {
            command: Some(SandboxCommands::Create { gpu, gpu_count, .. }),
            ..
        }) => (gpu, gpu_count),
        other => panic!("expected SandboxCommands::Create, got: {other:?}"),
    };

    assert!(!gpu);
    assert_eq!(gpu_count, Some(2));
}

/// A GPU count of zero is outside the accepted range and must fail parsing.
#[test]
fn sandbox_create_gpu_count_rejects_zero() {
    let args = ["openshell", "sandbox", "create", "--gpu-count", "0"];
    let outcome = Cli::try_parse_from(args);

    assert!(
        outcome.is_err(),
        "sandbox create --gpu-count 0 should be rejected"
    );
}

/// `--gpu-device` on its own must parse, and must leave the `--gpu` flag unset.
#[test]
fn sandbox_create_gpu_device_parses_without_gpu_flag() {
    let args = [
        "openshell",
        "sandbox",
        "create",
        "--gpu-device",
        "nvidia.com/gpu=0",
    ];
    let cli =
        Cli::try_parse_from(args).expect("sandbox create --gpu-device should parse without --gpu");

    // Pull the fields of interest out first, then assert on them.
    let (gpu, gpu_device) = match cli.command {
        Some(Commands::Sandbox {
            command: Some(SandboxCommands::Create {
                gpu, gpu_device, ..
            }),
            ..
        }) => (gpu, gpu_device),
        other => panic!("expected SandboxCommands::Create, got: {other:?}"),
    };

    assert!(!gpu);
    assert_eq!(gpu_device.as_deref(), Some("nvidia.com/gpu=0"));
}

/// Supplying both `--gpu-device` and `--gpu-count` must be a parse error.
#[test]
fn sandbox_create_gpu_count_conflicts_with_gpu_device() {
    let args = [
        "openshell",
        "sandbox",
        "create",
        "--gpu",
        "--gpu-device",
        "nvidia.com/gpu=0",
        "--gpu-count",
        "2",
    ];
    let outcome = Cli::try_parse_from(args);

    assert!(
        outcome.is_err(),
        "sandbox create should reject --gpu-count with --gpu-device"
    );
}

#[test]
fn service_expose_accepts_positional_target_port_and_service() {
let cli = Cli::try_parse_from([
Expand Down
175 changes: 157 additions & 18 deletions crates/openshell-cli/src/run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,18 @@ use openshell_core::proto::{
GetClusterInferenceRequest, GetDraftHistoryRequest, GetDraftPolicyRequest,
GetGatewayConfigRequest, GetProviderProfileRequest, GetProviderRefreshStatusRequest,
GetProviderRequest, GetSandboxConfigRequest, GetSandboxLogsRequest,
GetSandboxPolicyStatusRequest, GetSandboxRequest, GetServiceRequest, HealthRequest,
ImportProviderProfilesRequest, LintProviderProfilesRequest, ListProviderProfilesRequest,
ListProvidersRequest, ListSandboxPoliciesRequest, ListSandboxProvidersRequest,
ListSandboxesRequest, ListServicesRequest, PlatformEvent, PolicySource, PolicyStatus, Provider,
ProviderCredentialRefreshStatus, ProviderCredentialRefreshStrategy, ProviderProfile,
ProviderProfileDiagnostic, ProviderProfileImportItem, RejectDraftChunkRequest,
RevokeSshSessionRequest, RotateProviderCredentialRequest, Sandbox, SandboxPhase, SandboxPolicy,
SandboxSpec, SandboxTemplate, ServiceEndpointResponse, SetClusterInferenceRequest,
SettingScope, SettingValue, TcpForwardFrame, TcpForwardInit, TcpRelayTarget,
UpdateConfigRequest, UpdateProviderRequest, WatchSandboxRequest, exec_sandbox_event,
setting_value, tcp_forward_init,
GetSandboxPolicyStatusRequest, GetSandboxRequest, GetServiceRequest, GpuResourceRequirement,
HealthRequest, ImportProviderProfilesRequest, LintProviderProfilesRequest,
ListProviderProfilesRequest, ListProvidersRequest, ListSandboxPoliciesRequest,
ListSandboxProvidersRequest, ListSandboxesRequest, ListServicesRequest, PlatformEvent,
PolicySource, PolicyStatus, Provider, ProviderCredentialRefreshStatus,
ProviderCredentialRefreshStrategy, ProviderProfile, ProviderProfileDiagnostic,
ProviderProfileImportItem, RejectDraftChunkRequest, RevokeSshSessionRequest,
RotateProviderCredentialRequest, Sandbox, SandboxPhase, SandboxPolicy,
SandboxResourceRequirements, SandboxSpec, SandboxTemplate, ServiceEndpointResponse,
SetClusterInferenceRequest, SettingScope, SettingValue, TcpForwardFrame, TcpForwardInit,
TcpRelayTarget, UpdateConfigRequest, UpdateProviderRequest, WatchSandboxRequest,
exec_sandbox_event, setting_value, tcp_forward_init,
};
use openshell_core::settings::{self, SettingValueKind};
use openshell_core::{ObjectId, ObjectName};
Expand Down Expand Up @@ -1679,6 +1680,7 @@ pub async fn sandbox_create(
keep: bool,
gpu: bool,
gpu_device: Option<&str>,
gpu_count: Option<u32>,
cpu: Option<&str>,
memory: Option<&str>,
editor: Option<Editor>,
Expand Down Expand Up @@ -1732,7 +1734,10 @@ pub async fn sandbox_create(
}
None => None,
};
let requested_gpu = gpu || image.as_deref().is_some_and(image_requests_gpu);
let gpu_device_ids = gpu_device_ids_from_cli(gpu_device);
let effective_gpu_count = gpu_count_from_cli(gpu_count, &gpu_device_ids);
let requested_gpu =
gpu || effective_gpu_count.is_some() || image.as_deref().is_some_and(image_requests_gpu);

let providers_v2_enabled = gateway_providers_v2_enabled(&mut client).await?;
let inferred_types: Vec<String> = if providers_v2_enabled {
Expand All @@ -1750,11 +1755,13 @@ pub async fn sandbox_create(

let policy = load_sandbox_policy(policy)?;
let resource_limits = build_sandbox_resource_limits(cpu, memory)?;
let driver_config = gpu_driver_config_from_cli(&gpu_device_ids);

let template = if image.is_some() || resource_limits.is_some() {
let template = if image.is_some() || resource_limits.is_some() || driver_config.is_some() {
Some(SandboxTemplate {
image: image.unwrap_or_default(),
resources: resource_limits,
driver_config,
..SandboxTemplate::default()
})
} else {
Expand All @@ -1763,8 +1770,10 @@ pub async fn sandbox_create(

let request = CreateSandboxRequest {
spec: Some(SandboxSpec {
gpu: requested_gpu,
gpu_device: gpu_device.unwrap_or_default().to_string(),
resource_requirements: resource_requirements_from_cli(
requested_gpu,
effective_gpu_count,
),
policy,
providers: configured_providers,
template,
Expand Down Expand Up @@ -2189,6 +2198,74 @@ pub async fn sandbox_create(
}
}

/// Builds the resource requirements sent with a sandbox create request.
///
/// Returns `None` when no GPU was requested. Otherwise returns requirements
/// carrying a GPU entry whose `count` is `gpu_count`; a `None` count leaves
/// the number of GPUs unspecified.
fn resource_requirements_from_cli(
    requested_gpu: bool,
    gpu_count: Option<u32>,
) -> Option<SandboxResourceRequirements> {
    if !requested_gpu {
        return None;
    }

    let gpu_requirement = GpuResourceRequirement { count: gpu_count };
    Some(SandboxResourceRequirements {
        gpu: Some(gpu_requirement),
    })
}

/// Converts the optional `--gpu-device` value into a list of device IDs.
///
/// The value is trimmed of surrounding whitespace. A missing, empty, or
/// whitespace-only value yields an empty list; otherwise the list holds the
/// single trimmed ID.
fn gpu_device_ids_from_cli(gpu_device: Option<&str>) -> Vec<String> {
    match gpu_device.map(str::trim) {
        Some(trimmed) if !trimmed.is_empty() => vec![trimmed.to_owned()],
        _ => Vec::new(),
    }
}

/// Resolves the GPU count to request.
///
/// When explicit device IDs are present, their number takes precedence over
/// `gpu_count` (yielding `None` only if the length does not fit in a `u32`).
/// With no device IDs, `gpu_count` is returned unchanged.
fn gpu_count_from_cli(gpu_count: Option<u32>, gpu_device_ids: &[String]) -> Option<u32> {
    match gpu_device_ids.len() {
        0 => gpu_count,
        selected => u32::try_from(selected).ok(),
    }
}

/// Builds the `driver_config` struct carrying exact GPU device selection.
///
/// Returns `None` when `gpu_device_ids` is empty. Otherwise the result has
/// one entry each for the `docker`, `podman`, and `vm` keys, and every entry
/// is a struct with a single `gpu_device_ids` field listing the IDs as
/// string values.
fn gpu_driver_config_from_cli(gpu_device_ids: &[String]) -> Option<prost_types::Struct> {
    use prost_types::{ListValue, Struct, Value, value::Kind};

    if gpu_device_ids.is_empty() {
        return None;
    }

    // The device list is identical for every driver, so build it once.
    let device_list = Value {
        kind: Some(Kind::ListValue(ListValue {
            values: gpu_device_ids
                .iter()
                .map(|id| Value {
                    kind: Some(Kind::StringValue(id.clone())),
                })
                .collect(),
        })),
    };
    let block = Value {
        kind: Some(Kind::StructValue(Struct {
            fields: std::iter::once(("gpu_device_ids".to_string(), device_list)).collect(),
        })),
    };

    let fields = ["docker", "podman", "vm"]
        .into_iter()
        .map(|driver| (driver.to_string(), block.clone()))
        .collect();

    Some(Struct { fields })
}

/// Resolved source for the `--from` flag on `sandbox create`.
#[derive(Debug)]
enum ResolvedSource {
Expand Down Expand Up @@ -7438,14 +7515,15 @@ mod tests {
dockerfile_sources_supported_for_gateway, format_endpoint, format_gateway_select_header,
format_gateway_select_items, format_provider_attachment_table, gateway_add,
gateway_auth_label, gateway_env_override_warning, gateway_select_with, gateway_type_label,
git_sync_files, http_health_check, image_requests_gpu, import_local_package_mtls_bundle,
git_sync_files, gpu_count_from_cli, gpu_device_ids_from_cli, gpu_driver_config_from_cli,
http_health_check, image_requests_gpu, import_local_package_mtls_bundle,
inferred_provider_type, package_managed_tls_dirs, parse_cli_setting_value,
parse_credential_expiry_cli_value, parse_credential_expiry_pairs, parse_credential_pairs,
plaintext_gateway_is_remote, progress_step_from_metadata,
provider_profile_allows_refresh_bootstrap, provisioning_timeout_message,
ready_false_condition_message, refresh_status_header, refresh_status_row, resolve_from,
sandbox_should_persist, sandbox_upload_plan, service_expose_status_error,
service_url_for_gateway,
resource_requirements_from_cli, sandbox_should_persist, sandbox_upload_plan,
service_expose_status_error, service_url_for_gateway,
};
use crate::TEST_ENV_LOCK;
use hyper::StatusCode;
Expand Down Expand Up @@ -7924,6 +8002,67 @@ mod tests {
}
}

/// Surrounding whitespace in the device value is stripped.
#[test]
fn gpu_device_ids_from_cli_trims_gpu_device() {
    let actual = gpu_device_ids_from_cli(Some(" nvidia.com/gpu=0 "));
    let expected = vec![String::from("nvidia.com/gpu=0")];

    assert_eq!(actual, expected);
}

/// Whitespace-only and missing device values both produce no device IDs.
#[test]
fn gpu_device_ids_from_cli_omits_empty_device() {
    let blank = gpu_device_ids_from_cli(Some(" "));
    let absent = gpu_device_ids_from_cli(None);

    assert!(blank.is_empty());
    assert!(absent.is_empty());
}

/// With a device ID present, the count is the number of IDs regardless of
/// any explicitly requested count.
#[test]
fn gpu_count_from_cli_uses_gpu_device_id_count() {
    let ids = gpu_device_ids_from_cli(Some("nvidia.com/gpu=0"));

    let without_count = gpu_count_from_cli(None, &ids);
    let with_count = gpu_count_from_cli(Some(2), &ids);

    assert_eq!(without_count, Some(1));
    assert_eq!(with_count, Some(1));
}

/// A GPU request without a count yields a GPU requirement with no count set.
#[test]
fn resource_requirements_from_cli_uses_presence_for_default_gpu() {
    let gpu = resource_requirements_from_cli(true, None)
        .expect("resource requirements should be present")
        .gpu
        .expect("GPU requirement should be present");

    assert_eq!(gpu.count, None);
}

/// A selected device produces a config entry for each of the three drivers.
#[test]
fn gpu_driver_config_from_cli_maps_gpu_device_to_driver_blocks() {
    let ids = gpu_device_ids_from_cli(Some("nvidia.com/gpu=0"));
    let config = gpu_driver_config_from_cli(&ids).expect("driver config should be present");

    for driver in ["docker", "podman", "vm"] {
        assert!(config.fields.contains_key(driver));
    }
}

/// An explicit GPU count is carried through into the GPU requirement.
#[test]
fn resource_requirements_from_cli_maps_gpu_count() {
    let gpu = resource_requirements_from_cli(true, Some(2))
        .expect("requirements should exist")
        .gpu
        .expect("GPU requirement should be present");

    assert_eq!(gpu.count, Some(2));
}

/// No device IDs means no driver config is produced.
#[test]
fn gpu_driver_config_from_cli_omits_empty_device() {
    let config = gpu_driver_config_from_cli(&[]);

    assert!(config.is_none());
}

/// Without a GPU request, no resource requirements are produced.
#[test]
fn resource_requirements_from_cli_omits_gpu_request_when_not_requested() {
    let requirements = resource_requirements_from_cli(false, None);

    assert!(requirements.is_none());
}

#[test]
fn resolve_from_classifies_existing_dockerfile_path() {
let temp = tempfile::tempdir().expect("failed to create tempdir");
Expand Down
Loading
Loading