Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions infra/environments/prod/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,47 @@ resource "cloudflare_dns_record" "gr_mqtt" {
proxied = false
}

# ClickHouse on EC2 — analytics store for CAN telemetry + Epic Shelter
# ingest. Sentinel + transactional mapache state stay on gr-postgres;
# anything column-store-shaped (signals, aggregates, gr25_message
# successor tables) lands here.
#
# r8g.xlarge (Graviton4, 4 vCPU, 32 GiB) — RAM-optimized for the mark
# cache + per-query memory budget that columnar scans depend on, and
# dedicated CPU (no t-series credit accounting) so a big aggregation
# doesn't tip into throttling. Steeper hourly than t4g.xlarge but
# avoiding another OOM is worth ~$55/mo.
#
# Read the admin password via:
# terraform output -raw clickhouse_admin_password
module "clickhouse" {
  source = "../../modules/clickhouse-ec2"

  name          = "gr-clickhouse"
  instance_type = "r8g.xlarge"

  # Placement. The module hands availability_zone to both the instance and
  # the data EBS volume, while the subnet is picked by index.
  # NOTE(review): confirm public_subnet_ids[0] really lives in us-west-2a.
  vpc_id            = module.vpc.vpc_id
  subnet_id         = module.vpc.public_subnet_ids[0]
  availability_zone = "us-west-2a"

  data_volume_size_gb = 200

  # Exposure: an EIP plus admin ingress from any IPv4 address.
  # NOTE(review): 0.0.0.0/0 opens 8123/9000 to the whole internet; narrow
  # this once the admin source ranges are known.
  associate_public_ip = true
  admin_cidr_blocks   = ["0.0.0.0/0"]

  # Security groups allowed to reach ClickHouse from inside the VPC.
  allowed_security_group_ids = [module.eks.node_security_group_id]
}

# DNS-only (unproxied) A record that points the gr-clickhouse hostname at
# the public IP exported by the clickhouse module.
resource "cloudflare_dns_record" "gr_clickhouse" {
  zone_id = data.cloudflare_zone.gauchoracing.id

  type    = "A"
  name    = "gr-clickhouse"
  content = module.clickhouse.public_ip

  proxied = false
  ttl     = 300
}

# Per-hostname SSL/TLS override. The zone defaults to "Flexible" (CF
# talks HTTP to origin), but our ALB-backed Ingresses run HTTPS-only
# with the imported Origin CA cert — Flexible there causes a redirect
Expand Down
16 changes: 16 additions & 0 deletions infra/environments/prod/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,19 @@ output "mqtt_password_tcm26" {
value = module.mqtt.mqtt_password_tcm26
sensitive = true
}

# Re-exports the module's private address at the environment level.
output "clickhouse_private_ip" {
  value       = module.clickhouse.private_ip
  description = "Private IP of the ClickHouse EC2. In-cluster pods connect to this on 8123 (HTTP) / 9000 (native)."
}

# Re-exports the module's public (EIP) address at the environment level.
output "clickhouse_public_ip" {
  value       = module.clickhouse.public_ip
  description = "Public IP of the ClickHouse EC2 (EIP). External admins connect to this on 8123/9000, or use gr-clickhouse.gauchoracing.com."
}

# Re-exports the generated admin password. Marked sensitive so Terraform
# redacts it in plan/apply output.
output "clickhouse_admin_password" {
  sensitive   = true
  value       = module.clickhouse.admin_password
  description = "Generated ClickHouse admin password. Read with `terraform output -raw clickhouse_admin_password`."
}
162 changes: 162 additions & 0 deletions infra/modules/clickhouse-ec2/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
# ClickHouse on a single EC2 in the EKS VPC. Mirrors the postgres-ec2
# module shape — pods reach it via private IP through the AWS split-horizon
# DNS trick, external admin clients via the EIP + gr-clickhouse hostname.
#
# Columnar OLAP store for telemetry: CAN signals from gr26, Epic Shelter
# ingest output, future analytics queries. Postgres keeps the small
# transactional state (users, vehicles, jobs, sessions).
#
# Backups are NOT included — add a snapshot policy before this holds
# unrecoverable data. ClickHouse data lives on a dedicated EBS volume so
# the instance can be replaced without losing the database.

# Provider requirements for this module: AWS for the infrastructure and
# random for the generated admin password.
terraform {
  required_providers {
    aws    = { source = "hashicorp/aws", version = "~> 6.0" }
    random = { source = "hashicorp/random", version = "~> 3.0" }
  }
}

# Newest Amazon-owned AMI whose name matches the Amazon Linux 2023 arm64
# pattern below, restricted to HVM virtualization.
data "aws_ami" "al2023_arm64" {
  owners      = ["amazon"]
  most_recent = true

  filter {
    name   = "virtualization-type"
    values = ["hvm"]
  }

  filter {
    name   = "name"
    values = ["al2023-ami-2023.*-arm64"]
  }
}

# Admin password: 32 characters with special characters disabled. The
# value is kept in Terraform state.
resource "random_password" "admin" {
  special = false
  length  = 32
}

# Security group attached to the ClickHouse instance. Only egress is
# declared inline; ingress is managed by the separate
# aws_security_group_rule resources in this module.
resource "aws_security_group" "this" {
  name        = var.name
  description = "ClickHouse for ${var.name}"
  vpc_id      = var.vpc_id

  # Unrestricted outbound: every protocol and port, to any IPv4 address.
  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  tags = {
    # Direct reference; wrapping a lone variable in a string template is
    # the deprecated interpolation-only form.
    Name = var.name
  }
}

# Ingress on the HTTP + native protocol ports from each allowed SG
# (typically the EKS node SG). 8123 for HTTP clients (query service,
# clickhouse-client --http); 9000 for the native binary protocol
# (clickhouse-client, the Go and Python drivers).
locals {
  # Ports opened by both ingress rule resources in this module.
  ingress_ports = [
    8123, # HTTP interface
    9000, # native binary protocol
  ]
}

resource "aws_security_group_rule" "ingress_sg" {
  # One rule per (source security group, port) combination, keyed as
  # "<sg-id>-<port>".
  # NOTE(review): the keys embed the security group IDs, so those IDs have
  # to be known at plan time — confirm callers never pass an SG that is
  # created in the same apply.
  for_each = {
    for combo in setproduct(var.allowed_security_group_ids, local.ingress_ports) :
    "${combo[0]}-${combo[1]}" => { sg = combo[0], port = combo[1] }
  }

  security_group_id        = aws_security_group.this.id
  source_security_group_id = each.value.sg

  type      = "ingress"
  protocol  = "tcp"
  from_port = each.value.port
  to_port   = each.value.port

  description = "ClickHouse :${each.value.port} from ${each.value.sg}"
}

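# Ingress from arbitrary CIDR blocks (admin laptops, the public internet,
# etc.). The 32-char random admin password is the only gate: it is stored
# as a SHA-256 hash in users.d, but 8123/9000 are ClickHouse's non-TLS
# ports, so the password itself crosses the network unencrypted. Tighten
# the CIDR list once a known set of admin IPs exists.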
resource "aws_security_group_rule" "ingress_cidr" {
  # One rule per port, keyed by the port rendered as a string. The filter
  # leaves the set empty (no rules at all) when no admin CIDRs are given.
  for_each = toset([
    for port in local.ingress_ports : tostring(port)
    if length(var.admin_cidr_blocks) > 0
  ])

  security_group_id = aws_security_group.this.id
  cidr_blocks       = var.admin_cidr_blocks

  type      = "ingress"
  protocol  = "tcp"
  from_port = tonumber(each.value)
  to_port   = tonumber(each.value)

  description = "ClickHouse :${each.value} from admin CIDRs"
}

# Dedicated, encrypted gp3 volume that holds the ClickHouse data
# directory. It is created in the same availability zone variable that
# the instance uses.
resource "aws_ebs_volume" "data" {
  type              = "gp3"
  size              = var.data_volume_size_gb
  encrypted         = true
  availability_zone = var.availability_zone

  # Guard rail: Terraform rejects any plan that would destroy this volume,
  # so the database outlives instance replacements and accidental
  # teardown.
  lifecycle {
    prevent_destroy = true
  }

  tags = {
    Name = "${var.name}-data"
  }
}

# The ClickHouse host. Boots the AL2023 arm64 AMI and provisions itself
# through the rendered user-data script; database state lives on the
# separately attached data volume, not on the 20 GiB root disk.
resource "aws_instance" "this" {
  ami                         = data.aws_ami.al2023_arm64.id
  instance_type               = var.instance_type
  subnet_id                   = var.subnet_id
  availability_zone           = var.availability_zone
  vpc_security_group_ids      = [aws_security_group.this.id]
  associate_public_ip_address = var.associate_public_ip

  root_block_device {
    volume_size = 20
    volume_type = "gp3"
    encrypted   = true
  }

  # Only the SHA-256 hex digest of the admin password is passed to the
  # instance; the plaintext stays in Terraform state.
  user_data = templatefile("${path.module}/user-data.sh.tftpl", {
    clickhouse_version    = var.clickhouse_version
    admin_user            = var.admin_user
    admin_password_sha256 = sha256(random_password.admin.result)
  })

  # Re-rendering user-data shouldn't recreate the instance — the data
  # volume preserves state, and admin credentials are generated once
  # and persist in TF state. AMI bumps similarly ignored.
  # Consequence: a later change to the password, admin user or ClickHouse
  # version will NOT reach a running instance through this resource.
  lifecycle {
    ignore_changes = [user_data, ami]
  }

  tags = {
    # Direct reference; wrapping a lone variable in a string template is
    # the deprecated interpolation-only form.
    Name = var.name
    Role = "clickhouse"
  }
}

# Attaches the data volume to the instance. The user-data script expects
# the device to show up as /dev/nvme1n1.
resource "aws_volume_attachment" "data" {
  instance_id = aws_instance.this.id
  volume_id   = aws_ebs_volume.data.id
  device_name = "/dev/sdf" # appears as /dev/nvme1n1 inside the instance
}

# Elastic IP for the instance, created only when associate_public_ip is
# true (count is 0 otherwise, so consumers must index with [0]).
resource "aws_eip" "this" {
  count    = var.associate_public_ip ? 1 : 0
  domain   = "vpc"
  instance = aws_instance.this.id

  tags = {
    # Direct reference; wrapping a lone variable in a string template is
    # the deprecated interpolation-only form.
    Name = var.name
  }
}
35 changes: 35 additions & 0 deletions infra/modules/clickhouse-ec2/outputs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# ID of the ClickHouse EC2 instance.
output "instance_id" {
  value       = aws_instance.this.id
  description = "EC2 instance ID. Useful for SSM/console access."
}

# Private address of the instance inside the VPC.
output "private_ip" {
  value       = aws_instance.this.private_ip
  description = "Private IP address. In-cluster pods can connect to this on 8123/9000."
}

# Address of the Elastic IP. try() turns the index error raised when the
# EIP has count = 0 into null.
output "public_ip" {
  value       = try(aws_eip.this[0].public_ip, null)
  description = "EIP-assigned public IP, if associate_public_ip = true; null otherwise. External admin clients dial this."
}

# Public DNS hostname of the Elastic IP, or null when no EIP is created
# (try() absorbs the index error raised when count = 0).
output "public_dns" {
  description = "EC2 public DNS hostname. AWS split-horizon DNS resolves it to the private IP from inside the VPC and the EIP from outside, so pods + admins can both use this single name."

  # Take the hostname the provider reports for the EIP instead of
  # hand-assembling it with a hard-coded us-west-2 domain, which would be
  # wrong as soon as the module is used in another region.
  value = try(aws_eip.this[0].public_dns, null)
}

# ID of the security group attached to the instance.
output "security_group_id" {
  value       = aws_security_group.this.id
  description = "Security group ID. Add ingress rules for any additional callers."
}

# Echoes the admin_user input so callers can read it next to the password.
output "admin_user" {
  value       = var.admin_user
  description = "Admin username created on first boot. Pair with admin_password."
}

# Plaintext admin password from random_password.admin. Marked sensitive so
# Terraform redacts it in plan/apply output.
output "admin_password" {
  sensitive   = true
  value       = random_password.admin.result
  description = "Generated admin password (32-char random). Read via `terraform output -raw clickhouse_admin_password` and put into the k8s Secret + clickhouse-client config."
}
74 changes: 74 additions & 0 deletions infra/modules/clickhouse-ec2/user-data.sh.tftpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/usr/bin/env bash
# First-boot provisioning for the ClickHouse EC2 instance.
#
# This file is rendered by Terraform's templatefile(): clickhouse_version,
# admin_user and admin_password_sha256 are substituted before the shell
# ever sees the script. Shell variables must therefore be written in the
# plain $NAME form; a dollar sign followed by an opening brace would be
# parsed as a template interpolation.
#
# Steps: wait for and mount the data volume, install Docker, write the
# users.d overrides, start the clickhouse-server container.
set -euxo pipefail

# uid/gid the official clickhouse-server image runs the server as.
CLICKHOUSE_UID=101
CLICKHOUSE_GID=101

# Wait for the data volume to attach. AWS asynchronously attaches secondary
# volumes; cloud-init can start before /dev/nvme1n1 exists.
DATA_DEVICE="/dev/nvme1n1"
for _ in $(seq 1 30); do
  [ -b "$DATA_DEVICE" ] && break
  sleep 2
done

# Stop with a clear message if the volume never showed up, instead of
# letting a later command fail on a missing device.
if [ ! -b "$DATA_DEVICE" ]; then
  echo "data volume $DATA_DEVICE did not appear after 60s; aborting" >&2
  exit 1
fi

# Format only if not already a filesystem — prevents wipe on instance
# replacement when the EBS volume is preserved.
if ! blkid "$DATA_DEVICE" >/dev/null 2>&1; then
  mkfs.ext4 -L clickhouse-data "$DATA_DEVICE"
fi

mkdir -p /var/lib/clickhouse

# Append the fstab entry only once so a re-run cannot stack duplicates.
FSTAB_ENTRY="LABEL=clickhouse-data /var/lib/clickhouse ext4 defaults,nofail 0 2"
grep -qxF "$FSTAB_ENTRY" /etc/fstab || echo "$FSTAB_ENTRY" >> /etc/fstab
mount /var/lib/clickhouse

# Hand the data dir to the server's uid before the container starts;
# otherwise ClickHouse fails on permission errors when it initializes the
# data directory.
chown "$CLICKHOUSE_UID:$CLICKHOUSE_GID" /var/lib/clickhouse

dnf install -y docker
systemctl enable --now docker

# users.d overrides — merged with the image's default users.xml at startup.
# Drops the default user (no password, full access) and adds a sha256-hashed
# admin with access_management so further users can be created via SQL.
mkdir -p /etc/clickhouse-server/users.d

# Quoted heredoc: Terraform has already filled in the values, so the shell
# must copy the text verbatim and not expand any dollar signs in it.
cat >/etc/clickhouse-server/users.d/admin.xml <<'EOF'
<?xml version="1.0"?>
<clickhouse>
  <users>
    <${admin_user}>
      <password_sha256_hex>${admin_password_sha256}</password_sha256_hex>
      <networks>
        <ip>::/0</ip>
      </networks>
      <profile>default</profile>
      <quota>default</quota>
      <access_management>1</access_management>
    </${admin_user}>
  </users>
</clickhouse>
EOF

cat >/etc/clickhouse-server/users.d/remove-default-user.xml <<'EOF'
<?xml version="1.0"?>
<clickhouse>
  <users>
    <default remove="1"/>
  </users>
</clickhouse>
EOF

# The server process reads these files as uid 101, so a root-owned mode-600
# admin.xml would be unreadable to it. Give the overrides to that uid first,
# then restrict the file that carries the password hash.
chown -R "$CLICKHOUSE_UID:$CLICKHOUSE_GID" /etc/clickhouse-server/users.d
chmod 600 /etc/clickhouse-server/users.d/admin.xml

# --network=host: nothing else on this box uses 8123 or 9000, so skip the
# docker-proxy NAT layer. --restart=always: container survives reboots
# (docker.service is enabled above). --ulimit nofile=262144: ClickHouse
# wants high fd limits for many concurrent table parts; the official docs
# call this out as a hard requirement.
docker run -d \
  --name clickhouse \
  --restart=always \
  --network=host \
  --ulimit nofile=262144:262144 \
  -v /var/lib/clickhouse:/var/lib/clickhouse \
  -v /etc/clickhouse-server/users.d:/etc/clickhouse-server/users.d:ro \
  clickhouse/clickhouse-server:${clickhouse_version}
Loading
Loading