Gaucho-Racing · BK1031 · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026
@@ -153,6 +153,47 @@ resource "cloudflare_dns_record" "gr_mqtt" {
   proxied = false
 }
 
+# ClickHouse on EC2 — analytics store for CAN telemetry + Epic Shelter
+# ingest. Sentinel + transactional mapache state stay on gr-postgres;
+# anything column-store-shaped (signals, aggregates, gr25_message
+# successor tables) lands here.
+#
+# r8g.xlarge (Graviton4, 4 vCPU, 32 GiB) — RAM-optimized for the mark
+# cache + per-query memory budget that columnar scans depend on, and
+# dedicated CPU (no t-series credit accounting) so a big aggregation
+# doesn't tip into throttling. Steeper hourly than t4g.xlarge but
+# avoiding another OOM is worth ~$55/mo.
+#
+# Read the admin password via:
+#   terraform output -raw clickhouse_admin_password
+module "clickhouse" {
+  source = "../../modules/clickhouse-ec2"
+  name   = "gr-clickhouse"
+
+  # Placement. The module applies availability_zone to both the instance
+  # and the data volume, so it should be the AZ the chosen subnet is in.
+  # NOTE(review): confirm public_subnet_ids[0] lives in us-west-2a.
+  vpc_id            = module.vpc.vpc_id
+  subnet_id         = module.vpc.public_subnet_ids[0]
+  availability_zone = "us-west-2a"
+
+  # Sizing.
+  instance_type       = "r8g.xlarge"
+  data_volume_size_gb = 200
+
+  # Network exposure. EKS nodes are admitted by security group; everything
+  # else by CIDR. 0.0.0.0/0 opens the module's ClickHouse ports to the
+  # whole internet; narrow it once the admin source IPs are known.
+  allowed_security_group_ids = [module.eks.node_security_group_id]
+  admin_cidr_blocks          = ["0.0.0.0/0"]
+  associate_public_ip        = true
+}
+
+# Public A record for the ClickHouse instance's Elastic IP. DNS-only
+# (proxied = false), so the name resolves directly to that address.
+resource "cloudflare_dns_record" "gr_clickhouse" {
+  zone_id = data.cloudflare_zone.gauchoracing.id
+
+  type    = "A"
+  name    = "gr-clickhouse"
+  content = module.clickhouse.public_ip
+
+  proxied = false
+  ttl     = 300
+}
+
 # Per-hostname SSL/TLS override. The zone defaults to "Flexible" (CF
 # talks HTTP to origin), but our ALB-backed Ingresses run HTTPS-only
 # with the imported Origin CA cert — Flexible there causes a redirect

@@ -64,3 +64,19 @@ output "mqtt_password_tcm26" {
   value       = module.mqtt.mqtt_password_tcm26
   sensitive   = true
 }
+
+# Re-exported from the clickhouse module for in-cluster consumers.
+output "clickhouse_private_ip" {
+  value       = module.clickhouse.private_ip
+  description = "Private IP of the ClickHouse EC2. In-cluster pods connect to this on 8123 (HTTP) / 9000 (native)."
+}
+
+# Re-exported from the clickhouse module; null when the module allocates
+# no Elastic IP.
+output "clickhouse_public_ip" {
+  value       = module.clickhouse.public_ip
+  description = "Public IP of the ClickHouse EC2 (EIP). External admins connect to this on 8123/9000, or use gr-clickhouse.gauchoracing.com."
+}
+
+# Marked sensitive so the value is redacted from plan/apply output.
+output "clickhouse_admin_password" {
+  sensitive   = true
+  value       = module.clickhouse.admin_password
+  description = "Generated ClickHouse admin password. Read with `terraform output -raw clickhouse_admin_password`."
+}
@@ -0,0 +1,162 @@
+# ClickHouse on a single EC2 in the EKS VPC. Mirrors the postgres-ec2
+# module shape — pods reach it via private IP through the AWS split-horizon
+# DNS trick, external admin clients via the EIP + gr-clickhouse hostname.
+#
+# Columnar OLAP store for telemetry: CAN signals from gr26, Epic Shelter
+# ingest output, future analytics queries. Postgres keeps the small
+# transactional state (users, vehicles, jobs, sessions).
+#
+# Backups are NOT included — add a snapshot policy before this holds
+# unrecoverable data. ClickHouse data lives on a dedicated EBS volume so
+# the instance can be replaced without losing the database.
+
+# Provider requirements for this module: aws for the EC2/EBS/EIP/SG
+# resources, random for the generated admin password.
+terraform {
+  required_providers {
+    random = { source = "hashicorp/random", version = "~> 3.0" }
+    aws    = { source = "hashicorp/aws", version = "~> 6.0" }
+  }
+}
+
+# Newest Amazon-owned AMI whose name matches the al2023 arm64 pattern.
+data "aws_ami" "al2023_arm64" {
+  owners      = ["amazon"]
+  most_recent = true
+
+  filter {
+    name   = "virtualization-type"
+    values = ["hvm"]
+  }
+
+  filter {
+    name   = "name"
+    values = ["al2023-ami-2023.*-arm64"]
+  }
+}
+
+# Admin credential: 32 characters with special characters disabled. Only
+# its SHA-256 digest is handed to the instance's user-data template.
+resource "random_password" "admin" {
+  special = false
+  length  = 32
+}
+
+# Security group for the ClickHouse instance. Only egress is declared
+# inline (all protocols, all destinations); ingress is managed by the
+# separate aws_security_group_rule resources in this module.
+resource "aws_security_group" "this" {
+  vpc_id      = var.vpc_id
+  name        = var.name
+  description = "ClickHouse for ${var.name}"
+
+  egress {
+    protocol    = "-1"
+    from_port   = 0
+    to_port     = 0
+    cidr_blocks = ["0.0.0.0/0"]
+  }
+
+  tags = {
+    Name = var.name
+  }
+}
+
+# Ingress on the HTTP + native protocol ports from each allowed SG
+# (typically the EKS node SG). 8123 for HTTP clients (query service,
+# clickhouse-client --http); 9000 for the native binary protocol
+# (clickhouse-client, the Go and Python drivers).
+locals {
+  # Ports opened by both ingress rule sets in this module.
+  ingress_ports = [
+    8123, # HTTP interface
+    9000, # native protocol
+  ]
+}
+
+# One ingress rule per (allowed security group, ClickHouse port) pair.
+# NOTE(review): the for_each keys embed the SG IDs, so those IDs have to
+# be known at plan time — confirm callers pass IDs of existing groups.
+resource "aws_security_group_rule" "ingress_sg" {
+  for_each = {
+    for combo in setproduct(var.allowed_security_group_ids, local.ingress_ports) :
+    "${combo[0]}-${combo[1]}" => {
+      sg   = combo[0]
+      port = combo[1]
+    }
+  }
+
+  security_group_id        = aws_security_group.this.id
+  source_security_group_id = each.value.sg
+
+  type        = "ingress"
+  protocol    = "tcp"
+  from_port   = each.value.port
+  to_port     = each.value.port
+  description = "ClickHouse :${each.value.port} from ${each.value.sg}"
+}
+
+# Ingress from arbitrary CIDR blocks (admin laptops, the public internet,
+# etc.). The 32-char random admin password is the only gate; the server
+# keeps just its SHA-256 hash. Tighten the CIDR list once a known set of
+# admin IPs exists.
+# NOTE(review): 8123/9000 are opened here as the plain HTTP/native ports —
+# confirm whether credentials cross the network unencrypted.
+resource "aws_security_group_rule" "ingress_cidr" {
+  # One rule per port, keyed by the port rendered as a string; the set is
+  # empty (no rules) when admin_cidr_blocks is empty.
+  for_each = toset([
+    for port in local.ingress_ports : tostring(port)
+    if length(var.admin_cidr_blocks) > 0
+  ])
+
+  security_group_id = aws_security_group.this.id
+  cidr_blocks       = var.admin_cidr_blocks
+
+  type        = "ingress"
+  protocol    = "tcp"
+  from_port   = tonumber(each.value)
+  to_port     = tonumber(each.value)
+  description = "ClickHouse :${each.value} from admin CIDRs"
+}
+
+# Dedicated, encrypted gp3 volume that holds the ClickHouse data
+# directory, kept separate from the instance's root disk.
+resource "aws_ebs_volume" "data" {
+  type              = "gp3"
+  size              = var.data_volume_size_gb
+  encrypted         = true
+  availability_zone = var.availability_zone
+
+  # Guard rail: Terraform refuses any plan that would destroy this volume,
+  # so the data outlives instance replacements.
+  lifecycle {
+    prevent_destroy = true
+  }
+
+  tags = {
+    Name = "${var.name}-data"
+  }
+}
+
+# The ClickHouse host. Provisioning happens through the rendered
+# user-data script; the data volume is attached by a separate resource.
+resource "aws_instance" "this" {
+  instance_type = var.instance_type
+  ami           = data.aws_ami.al2023_arm64.id
+
+  availability_zone           = var.availability_zone
+  subnet_id                   = var.subnet_id
+  associate_public_ip_address = var.associate_public_ip
+  vpc_security_group_ids      = [aws_security_group.this.id]
+
+  root_block_device {
+    volume_type = "gp3"
+    volume_size = 20
+    encrypted   = true
+  }
+
+  # Only the SHA-256 digest of the generated password is passed into the
+  # template, never the plaintext.
+  user_data = templatefile("${path.module}/user-data.sh.tftpl", {
+    admin_user            = var.admin_user
+    admin_password_sha256 = sha256(random_password.admin.result)
+    clickhouse_version    = var.clickhouse_version
+  })
+
+  # Changes to the rendered user-data or to the resolved AMI are ignored,
+  # so neither one forces a replacement of a running instance.
+  lifecycle {
+    ignore_changes = [user_data, ami]
+  }
+
+  tags = {
+    Role = "clickhouse"
+    Name = var.name
+  }
+}
+
+# Attaches the data volume to the instance. Requested as /dev/sdf; the
+# user-data script looks for it as /dev/nvme1n1 inside the instance.
+resource "aws_volume_attachment" "data" {
+  instance_id = aws_instance.this.id
+  volume_id   = aws_ebs_volume.data.id
+  device_name = "/dev/sdf"
+}
+
+# Optional Elastic IP: exactly one when associate_public_ip is true,
+# none otherwise.
+resource "aws_eip" "this" {
+  count = var.associate_public_ip ? 1 : 0
+
+  instance = aws_instance.this.id
+  domain   = "vpc"
+
+  tags = {
+    Name = var.name
+  }
+}
@@ -0,0 +1,35 @@
+# ID of the ClickHouse EC2 instance.
+output "instance_id" {
+  value       = aws_instance.this.id
+  description = "EC2 instance ID. Useful for SSM/console access."
+}
+
+# VPC-internal address of the instance.
+output "private_ip" {
+  value       = aws_instance.this.private_ip
+  description = "Private IP address. In-cluster pods can connect to this on 8123/9000."
+}
+
+# aws_eip.this has zero or one instances; one() yields that instance's
+# address, or null when no EIP was created.
+output "public_ip" {
+  value       = one(aws_eip.this[*].public_ip)
+  description = "EIP-assigned public IP, if associate_public_ip = true; null otherwise. External admin clients dial this."
+}
+
+# Public DNS hostname of the Elastic IP, or null when no EIP is created.
+# Taken from the EIP's own public_dns attribute instead of being assembled
+# by hand, so it no longer depends on a hard-coded region in the name.
+output "public_dns" {
+  description = "EC2 public DNS hostname. AWS split-horizon DNS resolves it to the private IP from inside the VPC and the EIP from outside, so pods + admins can both use this single name."
+  value       = try(aws_eip.this[0].public_dns, null)
+}
+
+# ID of the security group attached to the ClickHouse instance.
+output "security_group_id" {
+  value       = aws_security_group.this.id
+  description = "Security group ID. Add ingress rules for any additional callers."
+}
+
+# Echoes the admin_user input that the user-data template writes into
+# the ClickHouse users.d override.
+output "admin_user" {
+  value       = var.admin_user
+  description = "Admin username created on first boot. Pair with admin_password."
+}
+
+# Plaintext of the generated admin password; sensitive, so Terraform
+# redacts it from normal CLI output.
+output "admin_password" {
+  sensitive   = true
+  value       = random_password.admin.result
+  description = "Generated admin password (32-char random). Read via `terraform output -raw clickhouse_admin_password` and put into the k8s Secret + clickhouse-client config."
+}
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+set -euxo pipefail
+
+# Wait for the data volume to attach. AWS asynchronously attaches secondary
+# volumes; cloud-init can start before /dev/nvme1n1 exists.
+DATA_DEVICE="/dev/nvme1n1"
+for _ in $(seq 1 30); do
+  [ -b "$DATA_DEVICE" ] && break
+  sleep 2
+done
+
+# Format only if not already a filesystem — prevents wipe on instance
+# replacement when the EBS volume is preserved.
+if ! blkid "$DATA_DEVICE" >/dev/null 2>&1; then
+  mkfs.ext4 -L clickhouse-data "$DATA_DEVICE"
+fi
+
+mkdir -p /var/lib/clickhouse
+echo "LABEL=clickhouse-data /var/lib/clickhouse ext4 defaults,nofail 0 2" >> /etc/fstab
+mount /var/lib/clickhouse
+
+# Official clickhouse-server image runs as uid 101. Hand the data dir over
+# before the container starts; otherwise initdb fails on permission errors.
+chown 101:101 /var/lib/clickhouse
+
+dnf install -y docker
+systemctl enable --now docker
+
+# users.d overrides — merged with the image's default users.xml at startup.
+# Drops the default user (no password, full access) and adds a sha256-hashed
+# admin with access_management so further users can be created via SQL.
+mkdir -p /etc/clickhouse-server/users.d
+
+cat >/etc/clickhouse-server/users.d/admin.xml <<EOF
+<?xml version="1.0"?>
+<clickhouse>
+  <users>
+    <${admin_user}>
+      <password_sha256_hex>${admin_password_sha256}</password_sha256_hex>
+      <networks>
+        <ip>::/0</ip>
+      </networks>
+      <profile>default</profile>
+      <quota>default</quota>
+      <access_management>1</access_management>
+    </${admin_user}>
+  </users>
+</clickhouse>
+EOF
+
+cat >/etc/clickhouse-server/users.d/remove-default-user.xml <<'EOF'
+<?xml version="1.0"?>
+<clickhouse>
+  <users>
+    <default remove="1"/>
+  </users>
+</clickhouse>
+EOF
+
+chmod 600 /etc/clickhouse-server/users.d/admin.xml
+
+# --network=host: nothing else on this box uses 8123 or 9000, so skip the
+# docker-proxy NAT layer. --restart=always: container survives reboots
+# (docker.service is enabled above). --ulimit nofile=262144: ClickHouse
+# wants high fd limits for many concurrent table parts; the official docs
+# call this out as a hard requirement.
+docker run -d \
+  --name clickhouse \
+  --restart=always \
+  --network=host \
+  --ulimit nofile=262144:262144 \
+  -v /var/lib/clickhouse:/var/lib/clickhouse \
+  -v /etc/clickhouse-server/users.d:/etc/clickhouse-server/users.d:ro \
+  clickhouse/clickhouse-server:${clickhouse_version}