┌──────────────────────────────┐
│  Developer Portal / API      │ ← Users interact here
├──────────────────────────────┤
│  Orchestration Layer         │ ← Kubernetes, Nomad, etc.
├──────────────────────────────┤
│  Networking Layer            │ ← SDN, load balancers, DNS
├──────────────────────────────┤
│  Storage Layer               │ ← Block, object, file
├──────────────────────────────┤
│  Compute Layer               │ ← Hypervisors, bare metal
├──────────────────────────────┤
│  Physical / Bare Metal       │ ← The actual servers
└──────────────────────────────┘
┌──────────────────────────────┐
│  Developer Portal / API      │ ← Users interact here
├──────────────────────────────┤
│  Orchestration Layer         │ ← Kubernetes, Nomad, etc.
├──────────────────────────────┤
│  Networking Layer            │ ← SDN, load balancers, DNS
├──────────────────────────────┤
│  Storage Layer               │ ← Block, object, file
├──────────────────────────────┤
│  Compute Layer               │ ← Hypervisors, bare metal
├──────────────────────────────┤
│  Physical / Bare Metal       │ ← The actual servers
└──────────────────────────────┘
┌──────────────────────────────┐
│  Developer Portal / API      │ ← Users interact here
├──────────────────────────────┤
│  Orchestration Layer         │ ← Kubernetes, Nomad, etc.
├──────────────────────────────┤
│  Networking Layer            │ ← SDN, load balancers, DNS
├──────────────────────────────┤
│  Storage Layer               │ ← Block, object, file
├──────────────────────────────┤
│  Compute Layer               │ ← Hypervisors, bare metal
├──────────────────────────────┤
│  Physical / Bare Metal       │ ← The actual servers
└──────────────────────────────┘
# Create VM 101 via the Proxmox API. The collapsed one-line form left
# literal "\ " sequences mid-command, which the shell parses as escaped
# spaces (broken arguments); restored as proper line continuations.
curl -s -k -b "PVEAuthCookie=${TICKET}" \
  -H "CSRFPreventionToken: ${CSRF}" \
  -X POST \
  "https://proxmox-host:8006/api2/json/nodes/pve/qemu" \
  -d 'vmid=101&name=my-vm&memory=2048&cores=2&net0=virtio,bridge=vmbr0&ide2=local:iso/ubuntu-22.04.iso,media=cdrom&scsihw=virtio-scsi-pci&scsi0=local-lvm:20'
# Create VM 101 via the Proxmox API. The collapsed one-line form left
# literal "\ " sequences mid-command, which the shell parses as escaped
# spaces (broken arguments); restored as proper line continuations.
curl -s -k -b "PVEAuthCookie=${TICKET}" \
  -H "CSRFPreventionToken: ${CSRF}" \
  -X POST \
  "https://proxmox-host:8006/api2/json/nodes/pve/qemu" \
  -d 'vmid=101&name=my-vm&memory=2048&cores=2&net0=virtio,bridge=vmbr0&ide2=local:iso/ubuntu-22.04.iso,media=cdrom&scsihw=virtio-scsi-pci&scsi0=local-lvm:20'
# Create VM 101 via the Proxmox API. The collapsed one-line form left
# literal "\ " sequences mid-command, which the shell parses as escaped
# spaces (broken arguments); restored as proper line continuations.
curl -s -k -b "PVEAuthCookie=${TICKET}" \
  -H "CSRFPreventionToken: ${CSRF}" \
  -X POST \
  "https://proxmox-host:8006/api2/json/nodes/pve/qemu" \
  -d 'vmid=101&name=my-vm&memory=2048&cores=2&net0=virtio,bridge=vmbr0&ide2=local:iso/ubuntu-22.04.iso,media=cdrom&scsihw=virtio-scsi-pci&scsi0=local-lvm:20'
# Create a VXLAN tunnel between two hypervisor nodes.
# The collapsed one-line add-port command contained literal "\ "
# sequences (escaped spaces), breaking word splitting; restored as
# proper line continuations.
ovs-vsctl add-br br-overlay
ovs-vsctl add-port br-overlay vxlan0 -- \
  set interface vxlan0 type=vxlan \
  options:remote_ip=10.0.0.2 \
  options:key=1001
# Create a VXLAN tunnel between two hypervisor nodes.
# The collapsed one-line add-port command contained literal "\ "
# sequences (escaped spaces), breaking word splitting; restored as
# proper line continuations.
ovs-vsctl add-br br-overlay
ovs-vsctl add-port br-overlay vxlan0 -- \
  set interface vxlan0 type=vxlan \
  options:remote_ip=10.0.0.2 \
  options:key=1001
# Create a VXLAN tunnel between two hypervisor nodes.
# The collapsed one-line add-port command contained literal "\ "
# sequences (escaped spaces), breaking word splitting; restored as
# proper line continuations.
ovs-vsctl add-br br-overlay
ovs-vsctl add-port br-overlay vxlan0 -- \
  set interface vxlan0 type=vxlan \
  options:remote_ip=10.0.0.2 \
  options:key=1001
# Create a Ceph RADOS pool named "my-cloud-vms" with 128 placement groups,
# then initialize it for RBD so it can serve VM block devices.
ceph osd pool create my-cloud-vms 128
rbd pool init my-cloud-vms
# Create a Ceph RADOS pool named "my-cloud-vms" with 128 placement groups,
# then initialize it for RBD so it can serve VM block devices.
ceph osd pool create my-cloud-vms 128
rbd pool init my-cloud-vms
# Create a Ceph RADOS pool named "my-cloud-vms" with 128 placement groups,
# then initialize it for RBD so it can serve VM block devices.
ceph osd pool create my-cloud-vms 128
rbd pool init my-cloud-vms
# Install K3s on a fresh VM: init the cluster, disable the bundled
# Traefik ingress, and name the node. The collapsed one-line form left
# literal "\ " sequences (escaped spaces) that corrupt the installer's
# arguments; restored as proper line continuations.
curl -sfL https://get.k3s.io | sh -s - \
  --cluster-init \
  --disable traefik \
  --node-name cloud-control-01
# Install K3s on a fresh VM: init the cluster, disable the bundled
# Traefik ingress, and name the node. The collapsed one-line form left
# literal "\ " sequences (escaped spaces) that corrupt the installer's
# arguments; restored as proper line continuations.
curl -sfL https://get.k3s.io | sh -s - \
  --cluster-init \
  --disable traefik \
  --node-name cloud-control-01
# Install K3s on a fresh VM: init the cluster, disable the bundled
# Traefik ingress, and name the node. The collapsed one-line form left
# literal "\ " sequences (escaped spaces) that corrupt the installer's
# arguments; restored as proper line continuations.
curl -sfL https://get.k3s.io | sh -s - \
  --cluster-init \
  --disable traefik \
  --node-name cloud-control-01
// CreateVM provisions a new virtual machine for the authenticated caller.
//
// Flow: authenticate → quota check → schedule → provision → network
// setup → persist. Once the VM exists on a hypervisor, any later
// failure rolls back the partially-created resources so no orphaned VM
// is left consuming capacity.
func (s *Server) CreateVM(ctx context.Context, req *CreateVMRequest) (*VM, error) {
	// 1. Validate and authenticate.
	user, err := s.auth.Validate(ctx, req.Token)
	if err != nil {
		return nil, ErrUnauthorized
	}

	// 2. Check quota before consuming any capacity.
	if err := s.quota.Check(ctx, user.ID, req.Resources); err != nil {
		return nil, ErrQuotaExceeded
	}

	// 3. Schedule: pick a hypervisor node with room for the request.
	node, err := s.scheduler.Select(ctx, req.Resources)
	if err != nil {
		return nil, ErrNoCapacity
	}

	// 4. Provision the VM on the chosen node.
	vmID, err := s.proxmox.CreateVM(ctx, node, req)
	if err != nil {
		return nil, fmt.Errorf("provisioning failed: %w", err)
	}

	// 5. Configure networking; destroy the just-created VM on failure.
	ip, err := s.network.Allocate(ctx, vmID, user.ProjectID)
	if err != nil {
		_ = s.proxmox.DeleteVM(ctx, node, vmID) // rollback
		return nil, fmt.Errorf("network allocation failed: %w", err)
	}

	// 6. Persist state. Bug fix: the original returned the bare error
	// here with no cleanup, leaving an orphaned VM (and its IP
	// allocation) whenever the database write failed.
	vm := &VM{ID: vmID, NodeID: node.ID, IP: ip, OwnerID: user.ID}
	if err := s.db.SaveVM(ctx, vm); err != nil {
		// rollback — NOTE(review): also release the IP if the network
		// service exposes a deallocation call; confirm its API.
		_ = s.proxmox.DeleteVM(ctx, node, vmID)
		return nil, fmt.Errorf("persisting VM state: %w", err)
	}

	return vm, nil
}
// CreateVM provisions a new virtual machine for the authenticated caller.
//
// Flow: authenticate → quota check → schedule → provision → network
// setup → persist. Once the VM exists on a hypervisor, any later
// failure rolls back the partially-created resources so no orphaned VM
// is left consuming capacity.
func (s *Server) CreateVM(ctx context.Context, req *CreateVMRequest) (*VM, error) {
	// 1. Validate and authenticate.
	user, err := s.auth.Validate(ctx, req.Token)
	if err != nil {
		return nil, ErrUnauthorized
	}

	// 2. Check quota before consuming any capacity.
	if err := s.quota.Check(ctx, user.ID, req.Resources); err != nil {
		return nil, ErrQuotaExceeded
	}

	// 3. Schedule: pick a hypervisor node with room for the request.
	node, err := s.scheduler.Select(ctx, req.Resources)
	if err != nil {
		return nil, ErrNoCapacity
	}

	// 4. Provision the VM on the chosen node.
	vmID, err := s.proxmox.CreateVM(ctx, node, req)
	if err != nil {
		return nil, fmt.Errorf("provisioning failed: %w", err)
	}

	// 5. Configure networking; destroy the just-created VM on failure.
	ip, err := s.network.Allocate(ctx, vmID, user.ProjectID)
	if err != nil {
		_ = s.proxmox.DeleteVM(ctx, node, vmID) // rollback
		return nil, fmt.Errorf("network allocation failed: %w", err)
	}

	// 6. Persist state. Bug fix: the original returned the bare error
	// here with no cleanup, leaving an orphaned VM (and its IP
	// allocation) whenever the database write failed.
	vm := &VM{ID: vmID, NodeID: node.ID, IP: ip, OwnerID: user.ID}
	if err := s.db.SaveVM(ctx, vm); err != nil {
		// rollback — NOTE(review): also release the IP if the network
		// service exposes a deallocation call; confirm its API.
		_ = s.proxmox.DeleteVM(ctx, node, vmID)
		return nil, fmt.Errorf("persisting VM state: %w", err)
	}

	return vm, nil
}
// CreateVM provisions a new virtual machine for the authenticated caller.
//
// Flow: authenticate → quota check → schedule → provision → network
// setup → persist. Once the VM exists on a hypervisor, any later
// failure rolls back the partially-created resources so no orphaned VM
// is left consuming capacity.
func (s *Server) CreateVM(ctx context.Context, req *CreateVMRequest) (*VM, error) {
	// 1. Validate and authenticate.
	user, err := s.auth.Validate(ctx, req.Token)
	if err != nil {
		return nil, ErrUnauthorized
	}

	// 2. Check quota before consuming any capacity.
	if err := s.quota.Check(ctx, user.ID, req.Resources); err != nil {
		return nil, ErrQuotaExceeded
	}

	// 3. Schedule: pick a hypervisor node with room for the request.
	node, err := s.scheduler.Select(ctx, req.Resources)
	if err != nil {
		return nil, ErrNoCapacity
	}

	// 4. Provision the VM on the chosen node.
	vmID, err := s.proxmox.CreateVM(ctx, node, req)
	if err != nil {
		return nil, fmt.Errorf("provisioning failed: %w", err)
	}

	// 5. Configure networking; destroy the just-created VM on failure.
	ip, err := s.network.Allocate(ctx, vmID, user.ProjectID)
	if err != nil {
		_ = s.proxmox.DeleteVM(ctx, node, vmID) // rollback
		return nil, fmt.Errorf("network allocation failed: %w", err)
	}

	// 6. Persist state. Bug fix: the original returned the bare error
	// here with no cleanup, leaving an orphaned VM (and its IP
	// allocation) whenever the database write failed.
	vm := &VM{ID: vmID, NodeID: node.ID, IP: ip, OwnerID: user.ID}
	if err := s.db.SaveVM(ctx, vm); err != nil {
		// rollback — NOTE(review): also release the IP if the network
		// service exposes a deallocation call; confirm its API.
		_ = s.proxmox.DeleteVM(ctx, node, vmID)
		return nil, fmt.Errorf("persisting VM state: %w", err)
	}

	return vm, nil
}
// Instrument your handlers from the start
func (s *Server) CreateVM(ctx context.Context, req *CreateVMRequest) (*VM, error) { timer := prometheus.NewTimer(vmCreationDuration) defer timer.ObserveDuration() vmCreationTotal.Inc() // ... rest of the logic
}
// Instrument your handlers from the start
func (s *Server) CreateVM(ctx context.Context, req *CreateVMRequest) (*VM, error) { timer := prometheus.NewTimer(vmCreationDuration) defer timer.ObserveDuration() vmCreationTotal.Inc() // ... rest of the logic
}
// Instrument your handlers from the start
func (s *Server) CreateVM(ctx context.Context, req *CreateVMRequest) (*VM, error) { timer := prometheus.NewTimer(vmCreationDuration) defer timer.ObserveDuration() vmCreationTotal.Inc() // ... rest of the logic
} - Cost at scale. Managed services are convenient but punishing at volume. At a certain number of VMs or data transfer gigabytes, the math tilts hard toward owning iron.
- Control and compliance. Some industries (healthcare, finance, government) need data sovereignty that public clouds make complicated.
- Learning. Nothing teaches you how Kubernetes actually works like building the thing that Kubernetes runs on.
- The itch. Sometimes you just want to know if you can.

- Authenticate the request
- Check quota and billing
- Select the right hypervisor node (scheduling)
- Call the Proxmox API
- Configure networking for the new VM
- Register the VM in a state database
- Return an IP address and credentials to the user

- ✅ Provision and destroy VMs via API
- ✅ Allocate isolated project networks automatically
- ✅ Serve block storage from Ceph
- ✅ Run Kubernetes workloads across the VM fleet
- ✅ Track basic resource usage per user/project

- 🔧 Live VM migration between hypervisor nodes
- 🔧 A proper billing and quota system
- 🔧 A usable developer portal (the API is functional but ugly)
- 🔧 Automated certificate management for tenant workloads

- Designing Data-Intensive Applications by Martin Kleppmann — essential for understanding the state management challenges
- The Proxmox API documentation (surprisingly good)
- The Ceph documentation (less good, but comprehensive)
- [[Cloud Native Patterns and architecture guides]] — for thinking about multi-tenancy correctly
- The OpenStack source code — not to run it, but to read how they solved problems