user@container-security:~$ stat -fc %T /sys/fs/cgroup/
cgroup2fs # Note: If the output reads cgroup2fs, you are ready to go. If the output is tmpfs or cgroupfs, your system is still using the legacy cgroups v1 hierarchy.
user@container-security:~$ stat -fc %T /sys/fs/cgroup/
cgroup2fs # Note: If the output reads cgroup2fs, you are ready to go. If the output is tmpfs or cgroupfs, your system is still using the legacy cgroups v1 hierarchy.
user@container-security:~$ stat -fc %T /sys/fs/cgroup/
cgroup2fs # Note: If the output reads cgroup2fs, you are ready to go. If the output is tmpfs or cgroupfs, your system is still using the legacy cgroups v1 hierarchy.
/sys/fs/cgroup (root)
│
├── init.scope
│
├── system.slice
│ ├── docker.service
│ │ └── docker-container.scope
│ │
│ └── ssh.service
│
└── user.slice
    └── user-1000.slice
/sys/fs/cgroup (root)
│
├── init.scope
│
├── system.slice
│ ├── docker.service
│ │ └── docker-container.scope
│ │
│ └── ssh.service
│
└── user.slice
    └── user-1000.slice
/sys/fs/cgroup (root)
│
├── init.scope
│
├── system.slice
│ ├── docker.service
│ │ └── docker-container.scope
│ │
│ └── ssh.service
│
└── user.slice
    └── user-1000.slice
sudo su
apt update && apt -y install cgroup-tools
sudo su
apt update && apt -y install cgroup-tools
sudo su
apt update && apt -y install cgroup-tools
export PARENT_CGROUP="scripts"
export CHILD_CGROUP="production"
export PARENT_CGROUP="scripts"
export CHILD_CGROUP="production"
export PARENT_CGROUP="scripts"
export CHILD_CGROUP="production"
/sys/fs/cgroup/
└── scripts (parent cgroup)
    └── production (child cgroup)
/sys/fs/cgroup/
└── scripts (parent cgroup)
    └── production (child cgroup)
/sys/fs/cgroup/
└── scripts (parent cgroup)
    └── production (child cgroup)
cgcreate -g memory,cpu:/${PARENT_CGROUP}
cgcreate -g memory,cpu:/${PARENT_CGROUP}
cgcreate -g memory,cpu:/${PARENT_CGROUP}
root@container-security:/home/user# cat /sys/fs/cgroup/${PARENT_CGROUP}/cgroup.controllers
cpu memory pids
root@container-security:/home/user# cat /sys/fs/cgroup/${PARENT_CGROUP}/cgroup.controllers
cpu memory pids
root@container-security:/home/user# cat /sys/fs/cgroup/${PARENT_CGROUP}/cgroup.controllers
cpu memory pids
root@container-security:~# cat /sys/fs/cgroup/cgroup.subtree_control
cpu memory pids
root@container-security:~# cat /sys/fs/cgroup/cgroup.subtree_control
cpu memory pids
root@container-security:~# cat /sys/fs/cgroup/cgroup.subtree_control
cpu memory pids
cgcreate -g memory,cpu:/${PARENT_CGROUP}/${CHILD_CGROUP}
cgcreate -g memory,cpu:/${PARENT_CGROUP}/${CHILD_CGROUP}
cgcreate -g memory,cpu:/${PARENT_CGROUP}/${CHILD_CGROUP}
root@container-security:/home/user# cat /sys/fs/cgroup/${PARENT_CGROUP}/cgroup.subtree_control
cpu memory
root@container-security:/home/user# cat /sys/fs/cgroup/${PARENT_CGROUP}/cgroup.subtree_control
cpu memory
root@container-security:/home/user# cat /sys/fs/cgroup/${PARENT_CGROUP}/cgroup.subtree_control
cpu memory
cgset -r memory.max=200000000 ${PARENT_CGROUP}/${CHILD_CGROUP} # (Note: Memory values here are in bytes, but you could also use suffixes like 100M or 1G.)
cgset -r memory.swap.max=200000000 ${PARENT_CGROUP}/${CHILD_CGROUP}
cgset -r cpu.max="150000 1000000" ${PARENT_CGROUP}/${CHILD_CGROUP}
cgset -r memory.max=200000000 ${PARENT_CGROUP}/${CHILD_CGROUP} # (Note: Memory values here are in bytes, but you could also use suffixes like 100M or 1G.)
cgset -r memory.swap.max=200000000 ${PARENT_CGROUP}/${CHILD_CGROUP}
cgset -r cpu.max="150000 1000000" ${PARENT_CGROUP}/${CHILD_CGROUP}
cgset -r memory.max=200000000 ${PARENT_CGROUP}/${CHILD_CGROUP} # (Note: Memory values here are in bytes, but you could also use suffixes like 100M or 1G.)
cgset -r memory.swap.max=200000000 ${PARENT_CGROUP}/${CHILD_CGROUP}
cgset -r cpu.max="150000 1000000" ${PARENT_CGROUP}/${CHILD_CGROUP}
cat /sys/fs/cgroup/${PARENT_CGROUP}/${CHILD_CGROUP}/{memory,cpu,memory.swap}.max
cat /sys/fs/cgroup/${PARENT_CGROUP}/${CHILD_CGROUP}/{memory,cpu,memory.swap}.max
cat /sys/fs/cgroup/${PARENT_CGROUP}/${CHILD_CGROUP}/{memory,cpu,memory.swap}.max
199999488
150000 1000000
199999488
199999488
150000 1000000
199999488
199999488
150000 1000000
199999488
dd if=/dev/zero of=/dev/null &
sleep 2
ps -p $! -o %cpu
dd if=/dev/zero of=/dev/null &
sleep 2
ps -p $! -o %cpu
dd if=/dev/zero of=/dev/null &
sleep 2
ps -p $! -o %cpu
cgexec -g memory,cpu:${PARENT_CGROUP}/${CHILD_CGROUP} dd if=/dev/zero of=/dev/null &
sleep 2
ps -p $! -o %cpu
cgexec -g memory,cpu:${PARENT_CGROUP}/${CHILD_CGROUP} dd if=/dev/zero of=/dev/null &
sleep 2
ps -p $! -o %cpu
cgexec -g memory,cpu:${PARENT_CGROUP}/${CHILD_CGROUP} dd if=/dev/zero of=/dev/null &
sleep 2
ps -p $! -o %cpu
cgexec -g memory,cpu:${PARENT_CGROUP}/${CHILD_CGROUP} \
bash -c 'a=(); while true; do a+=("$(head -c 10M /dev/zero | tr "\0" "A")"); sleep 1; done' &
cgexec -g memory,cpu:${PARENT_CGROUP}/${CHILD_CGROUP} \
bash -c 'a=(); while true; do a+=("$(head -c 10M /dev/zero | tr "\0" "A")"); sleep 1; done' &
cgexec -g memory,cpu:${PARENT_CGROUP}/${CHILD_CGROUP} \
bash -c 'a=(); while true; do a+=("$(head -c 10M /dev/zero | tr "\0" "A")"); sleep 1; done' &
watch ps -p $! -o rss,sz
watch ps -p $! -o rss,sz
watch ps -p $! -o rss,sz
[1]+ Killed cgexec -g memory,cpu:${PARENT_CGROUP}/${CHILD_CGROUP} bash -c 'a=(); while true; do a+=("$(head -c 10M /dev/zero | tr "\0" "A")"); sleep 1; done'
[1]+ Killed cgexec -g memory,cpu:${PARENT_CGROUP}/${CHILD_CGROUP} bash -c 'a=(); while true; do a+=("$(head -c 10M /dev/zero | tr "\0" "A")"); sleep 1; done'
[1]+ Killed cgexec -g memory,cpu:${PARENT_CGROUP}/${CHILD_CGROUP} bash -c 'a=(); while true; do a+=("$(head -c 10M /dev/zero | tr "\0" "A")"); sleep 1; done'
for p in {1..5} ; do cgexec -g memory,cpu:${PARENT_CGROUP}/${CHILD_CGROUP} sleep 2000 & done
cgexec -g memory,cpu:${PARENT_CGROUP}/${CHILD_CGROUP} dd if=/dev/zero of=/dev/null &
for p in {1..5} ; do cgexec -g memory,cpu:${PARENT_CGROUP}/${CHILD_CGROUP} sleep 2000 & done
cgexec -g memory,cpu:${PARENT_CGROUP}/${CHILD_CGROUP} dd if=/dev/zero of=/dev/null &
for p in {1..5} ; do cgexec -g memory,cpu:${PARENT_CGROUP}/${CHILD_CGROUP} sleep 2000 & done
cgexec -g memory,cpu:${PARENT_CGROUP}/${CHILD_CGROUP} dd if=/dev/zero of=/dev/null &
root@container-security:/home/user# systemd-cgls /scripts
CGroup /scripts:
└─production
  ├─2142 sleep 2000
  ├─2143 sleep 2000
  ├─2144 sleep 2000
  ├─2145 sleep 2000
  ├─2146 sleep 2000
  └─2147 dd if=/dev/zero of=/dev/null
root@container-security:/home/user# systemd-cgls /scripts
CGroup /scripts:
└─production
  ├─2142 sleep 2000
  ├─2143 sleep 2000
  ├─2144 sleep 2000
  ├─2145 sleep 2000
  ├─2146 sleep 2000
  └─2147 dd if=/dev/zero of=/dev/null
root@container-security:/home/user# systemd-cgls /scripts
CGroup /scripts:
└─production
  ├─2142 sleep 2000
  ├─2143 sleep 2000
  ├─2144 sleep 2000
  ├─2145 sleep 2000
  ├─2146 sleep 2000
  └─2147 dd if=/dev/zero of=/dev/null
echo 1 > /sys/fs/cgroup/${PARENT_CGROUP}/${CHILD_CGROUP}/cgroup.kill
echo 1 > /sys/fs/cgroup/${PARENT_CGROUP}/${CHILD_CGROUP}/cgroup.kill
echo 1 > /sys/fs/cgroup/${PARENT_CGROUP}/${CHILD_CGROUP}/cgroup.kill
dd if=/dev/zero of=/dev/null &
dd if=/dev/zero of=/dev/null &
dd if=/dev/zero of=/dev/null &
cgclassify -g cpu,memory:${PARENT_CGROUP}/${CHILD_CGROUP} $!
cgclassify -g cpu,memory:${PARENT_CGROUP}/${CHILD_CGROUP} $!
cgclassify -g cpu,memory:${PARENT_CGROUP}/${CHILD_CGROUP} $!
cgget ${PARENT_CGROUP}/${CHILD_CGROUP}
cgget ${PARENT_CGROUP}/${CHILD_CGROUP}
cgget ${PARENT_CGROUP}/${CHILD_CGROUP}
cgdelete -r -g cpu:/${PARENT_CGROUP}
cgdelete -r -g cpu:/${PARENT_CGROUP}
cgdelete -r -g cpu:/${PARENT_CGROUP}
- Safer Sub-tree Delegation: It safely allows delegating cgroup management to less-privileged users. This is a crucial feature that makes rootless containers possible, allowing resource limits to be applied without requiring root privileges.
- Unified Memory Accounting: It properly accounts for different types of memory usage that v1 missed or handled poorly, including network memory, kernel memory, and non-immediate resource changes like page cache write-backs.
- Pressure Stall Information (PSI): A newer feature that provides rich, real-time metrics on system resource pressure, allowing systems to proactively detect and respond to resource shortages before a crash occurs.
- Enhanced Isolation: Better cross-resource allocation management prevents edge-case scenarios where high usage of one resource unexpectedly impacts another.
- Core Files (cgroup.*): Files prefixed with cgroup. manage the mechanics of the cgroup hierarchy itself, rather than specific hardware resources.
  - cgroup.procs: The most important file. It contains a list of Process IDs (PIDs) that belong to this group. To move a process into a cgroup, you simply echo its PID into this file.
  - cgroup.controllers: A read-only file showing which resource controllers (cpu, memory, io) are currently available to this specific group.
  - cgroup.kill: A v2 feature that lets you instantly kill all processes within the cgroup by writing 1 to it.
- cgroup.procs: The most important file. It contains a list of Process IDs (PIDs) that belong to this group. To move a process into a cgroup, you simply echo its PID into this file.
- cgroup.controllers: A read-only file showing which resource controllers (cpu, memory, io) are currently available to this specific group.
- cgroup.kill: A v2 feature that lets you instantly kill all processes within the cgroup by writing 1 to it.
- Controller Files (cpu.*, memory.*, pids.*, etc.): Controllers are the actual engines that distribute and limit system resources. Files prefixed with a controller name dictate how that specific resource is managed. Furthermore, these files generally fall into two types:
  - Configuration (Read-Write): Files you modify to set limits. (e.g., memory.max)
  - Status (Read-Only): Files you read to get live metrics. (e.g., memory.stat). For example, watch cat /sys/fs/cgroup/memory.stat will show you real-time memory usage stats for that cgroup.
- Configuration (Read-Write): Files you modify to set limits. (e.g., memory.max)
- Status (Read-Only): Files you read to get live metrics. (e.g., memory.stat). For example, watch cat /sys/fs/cgroup/memory.stat will show you real-time memory usage stats for that cgroup.
- cgroup.procs: The most important file. It contains a list of Process IDs (PIDs) that belong to this group. To move a process into a cgroup, you simply echo its PID into this file.
- cgroup.controllers: A read-only file showing which resource controllers (cpu, memory, io) are currently available to this specific group.
- cgroup.kill: A v2 feature that lets you instantly kill all processes within the cgroup by writing 1 to it.
- Configuration (Read-Write): Files you modify to set limits. (e.g., memory.max)
- Status (Read-Only): Files you read to get live metrics. (e.g., memory.stat). For example, watch cat /sys/fs/cgroup/memory.stat will show you real-time memory usage stats for that cgroup.
- Memory (memory.*): Regulates RAM usage.
  - memory.max sets an absolute hard limit. If the processes in the cgroup try to use more memory than this, the kernel's Out-Of-Memory (OOM) killer will step in and terminate them.
  - memory.high is a softer throttle limit. If breached, the kernel heavily throttles the processes and forces them to reclaim memory, but avoids outright killing them.
- memory.max sets an absolute hard limit. If the processes in the cgroup try to use more memory than this, the kernel's Out-Of-Memory (OOM) killer will step in and terminate them.
- memory.high is a softer throttle limit. If breached, the kernel heavily throttles the processes and forces them to reclaim memory, but avoids outright killing them.
- CPU (cpu.*): Regulates processor time. cpu.max limits the absolute maximum amount of CPU time the group can use (bandwidth). cpu.weight dictates proportional share. If the system is busy, a cgroup with a higher weight gets priority over one with a lower weight.
- cpu.max limits the absolute maximum amount of CPU time the group can use (bandwidth).
- cpu.weight dictates proportional share. If the system is busy, a cgroup with a higher weight gets priority over one with a lower weight.
- PIDs (pids.*): Regulates process creation. pids.max sets a hard limit on how many processes can exist inside the cgroup. From a security standpoint, this is your primary defense against a Fork Bomb attack, where a malicious script rapidly clones itself to crash the host.
- pids.max sets a hard limit on how many processes can exist inside the cgroup. From a security standpoint, this is your primary defense against a Fork Bomb attack, where a malicious script rapidly clones itself to crash the host.
- Block I/O (io.*): Regulates disk read/write bandwidth. io.max can prevent a compromised container from thrashing the host's storage drives and starving other containers of database reads or log writes.
- io.max can prevent a compromised container from thrashing the host's storage drives and starving other containers of database reads or log writes.
- memory.max sets an absolute hard limit. If the processes in the cgroup try to use more memory than this, the kernel's Out-Of-Memory (OOM) killer will step in and terminate them.
- memory.high is a softer throttle limit. If breached, the kernel heavily throttles the processes and forces them to reclaim memory, but avoids outright killing them.
- cpu.max limits the absolute maximum amount of CPU time the group can use (bandwidth).
- cpu.weight dictates proportional share. If the system is busy, a cgroup with a higher weight gets priority over one with a lower weight.
- pids.max sets a hard limit on how many processes can exist inside the cgroup. From a security standpoint, this is your primary defense against a Fork Bomb attack, where a malicious script rapidly clones itself to crash the host.
- io.max can prevent a compromised container from thrashing the host's storage drives and starving other containers of database reads or log writes.
- Cpuset (cpuset.*): Pins tasks to specific CPU cores and Memory Nodes. This is crucial for high-performance computing on NUMA architectures where memory access latency matters.
- Devices: Controls which device nodes (like /dev/sda or /dev/random) a cgroup can access. In v2, this is actually implemented using eBPF programs rather than standard text files.
- HugeTLB (hugetlb.*): Limits the usage of Huge Pages (large blocks of memory) to prevent a single group from exhausting them.
- RDMA (rdma.*): Manages Remote Direct Memory Access resources, often used in high-speed clustered networking.
- a parent cgroup called scripts (A parent cgroup is the higher-level group that can contain one or more subgroups. It usually defines the overall resource limits that apply to everything inside it.)
- a child cgroup called production (A child cgroup is a subgroup created inside the parent group. Processes can be placed into the child group, and it can have its own additional limits, but it can never exceed the limits set by its parent.)
- CPU Throttling (cpu.max): In cgroups v2, CPU limits use a simple quota-based model formatted as $MAX $PERIOD. If you set the value to 100000 1000000, you are telling the kernel: For every 1,000,000 microseconds (1 second) of time, this group is allowed to use the CPU for 100,000 microseconds (a tenth of a second). This effectively limits the cgroup to 10% of a single CPU core. Security Note: Unlike memory limits, CPU limits act as a throttle. If a process hits its CPU limit, the kernel simply pauses it until the next period begins. CPU throttling slows applications down, but it never outright kills them.
- Memory Limits (memory.max & memory.swap.max): Memory limits set an absolute ceiling on RAM usage. If a cgroup exceeds the value in memory.max, the kernel initiates heavy throttling. It will aggressively try to reclaim memory by dropping cached data or swapping memory pages out to disk. However, if the process continues demanding memory and the kernel cannot reclaim enough (or if swap is also exhausted), the kernel triggers the Out-Of-Memory (OOM) killer. It calculates an OOM score and terminates the most offending process within that cgroup to protect the rest of the host system.