$ sudo apt-get install -y python3.12-venv
python3 --version   # expect Python 3.12.x
python3 -m venv ~/gpufl-venv
source ~/gpufl-venv/bin/activate
pip install --upgrade pip
pip install gpufl "numba-cuda[cu13]"
python -c "import gpufl; print('gpufl', gpufl.__version__)"
sudo apt-get install -y python3.12-venv
python3 --version   # expect Python 3.12.x
python3 -m venv ~/gpufl-venv
source ~/gpufl-venv/bin/activate
pip install --upgrade pip
pip install gpufl "numba-cuda[cu13]"
python -c "import gpufl; print('gpufl', gpufl.__version__)"
sudo apt-get install -y python3.12-venv
python3 --version   # expect Python 3.12.x
python3 -m venv ~/gpufl-venv
source ~/gpufl-venv/bin/activate
pip install --upgrade pip
pip install gpufl "numba-cuda[cu13]"
python -c "import gpufl; print('gpufl', gpufl.__version__)"
gpufl 1.x.x
gpufl 1.x.x
gpufl 1.x.x
python -c "from numba import cuda; print('cuda available:', cuda.is_available()); cuda.detect()"
python -c "from numba import cuda; print('cuda available:', cuda.is_available()); cuda.detect()"
python -c "from numba import cuda; print('cuda available:', cuda.is_available()); cuda.detect()"
import gpufl as gfl
from gpufl.report import generate_report
from numba import cuda
import numpy as np
import math
import os


@cuda.jit
def matmul_kernel(A, B, C):
    """Naive matrix multiply: each thread computes one element of C.

    The thread's 2D grid position is taken as ``(row, col)``. Threads whose
    position falls outside the bounds of ``C`` return without writing.
    """
    row, col = cuda.grid(2)
    if row >= C.shape[0] or col >= C.shape[1]:
        return

    # NOTE(review): a bare 0.0 literal is presumably typed as float64 by
    # Numba, so this accumulation may run in double precision even though
    # the inputs are float32 -- confirm before comparing kernel timings.
    acc = 0.0
    for k in range(A.shape[1]):
        acc += A[row, k] * B[k, col]
    C[row, col] = acc


LOG_PATH = "./gfl_logs"

# Initialise gpufl for this run with the settings below.
gfl.init(
    app_name="matmul_sample",
    log_path=LOG_PATH,
    sampling_auto_start=True,
    system_sample_rate_ms=100,
    profiling_engine=gfl.ProfilingEngine.PcSamplingWithSass,
)

try:
    N = 2048
    # Two random float32 inputs and a zeroed output, all copied to the device.
    A = cuda.to_device(np.random.rand(N, N).astype(np.float32))
    B = cuda.to_device(np.random.rand(N, N).astype(np.float32))
    C = cuda.to_device(np.zeros((N, N), dtype=np.float32))

    # 16x16 threads per block; enough blocks to cover all N x N outputs.
    threads_per_block = (16, 16)
    blocks_per_grid = tuple(math.ceil(N / dim) for dim in threads_per_block)

    # The ten launches are grouped under one named scope.
    with gfl.Scope("matrix_mul_compute", "math"):
        for _ in range(10):
            matmul_kernel[blocks_per_grid, threads_per_block](A, B, C)

    # Copy the result back to the host (the value itself is not used).
    _ = C.copy_to_host()
    print("[OK] compute finished")
finally:
    # Shut gpufl down whether or not the compute section succeeded.
    gfl.shutdown()

# Build the text report from the directory and file prefix of LOG_PATH.
print(
    generate_report(
        os.path.dirname(LOG_PATH) or ".",
        log_prefix=os.path.basename(LOG_PATH),
        top_n=10,
    )
)
import gpufl as gfl
from gpufl.report import generate_report
from numba import cuda
import numpy as np
import math
import os


@cuda.jit
def matmul_kernel(A, B, C):
    """Naive matrix multiply: each thread computes one element of C.

    The thread's 2D grid position is taken as ``(row, col)``. Threads whose
    position falls outside the bounds of ``C`` return without writing.
    """
    row, col = cuda.grid(2)
    if row >= C.shape[0] or col >= C.shape[1]:
        return

    # NOTE(review): a bare 0.0 literal is presumably typed as float64 by
    # Numba, so this accumulation may run in double precision even though
    # the inputs are float32 -- confirm before comparing kernel timings.
    acc = 0.0
    for k in range(A.shape[1]):
        acc += A[row, k] * B[k, col]
    C[row, col] = acc


LOG_PATH = "./gfl_logs"

# Initialise gpufl for this run with the settings below.
gfl.init(
    app_name="matmul_sample",
    log_path=LOG_PATH,
    sampling_auto_start=True,
    system_sample_rate_ms=100,
    profiling_engine=gfl.ProfilingEngine.PcSamplingWithSass,
)

try:
    N = 2048
    # Two random float32 inputs and a zeroed output, all copied to the device.
    A = cuda.to_device(np.random.rand(N, N).astype(np.float32))
    B = cuda.to_device(np.random.rand(N, N).astype(np.float32))
    C = cuda.to_device(np.zeros((N, N), dtype=np.float32))

    # 16x16 threads per block; enough blocks to cover all N x N outputs.
    threads_per_block = (16, 16)
    blocks_per_grid = tuple(math.ceil(N / dim) for dim in threads_per_block)

    # The ten launches are grouped under one named scope.
    with gfl.Scope("matrix_mul_compute", "math"):
        for _ in range(10):
            matmul_kernel[blocks_per_grid, threads_per_block](A, B, C)

    # Copy the result back to the host (the value itself is not used).
    _ = C.copy_to_host()
    print("[OK] compute finished")
finally:
    # Shut gpufl down whether or not the compute section succeeded.
    gfl.shutdown()

# Build the text report from the directory and file prefix of LOG_PATH.
print(
    generate_report(
        os.path.dirname(LOG_PATH) or ".",
        log_prefix=os.path.basename(LOG_PATH),
        top_n=10,
    )
)
import gpufl as gfl
from gpufl.report import generate_report
from numba import cuda
import numpy as np
import math
import os


@cuda.jit
def matmul_kernel(A, B, C):
    """Naive matrix multiply: each thread computes one element of C.

    The thread's 2D grid position is taken as ``(row, col)``. Threads whose
    position falls outside the bounds of ``C`` return without writing.
    """
    row, col = cuda.grid(2)
    if row >= C.shape[0] or col >= C.shape[1]:
        return

    # NOTE(review): a bare 0.0 literal is presumably typed as float64 by
    # Numba, so this accumulation may run in double precision even though
    # the inputs are float32 -- confirm before comparing kernel timings.
    acc = 0.0
    for k in range(A.shape[1]):
        acc += A[row, k] * B[k, col]
    C[row, col] = acc


LOG_PATH = "./gfl_logs"

# Initialise gpufl for this run with the settings below.
gfl.init(
    app_name="matmul_sample",
    log_path=LOG_PATH,
    sampling_auto_start=True,
    system_sample_rate_ms=100,
    profiling_engine=gfl.ProfilingEngine.PcSamplingWithSass,
)

try:
    N = 2048
    # Two random float32 inputs and a zeroed output, all copied to the device.
    A = cuda.to_device(np.random.rand(N, N).astype(np.float32))
    B = cuda.to_device(np.random.rand(N, N).astype(np.float32))
    C = cuda.to_device(np.zeros((N, N), dtype=np.float32))

    # 16x16 threads per block; enough blocks to cover all N x N outputs.
    threads_per_block = (16, 16)
    blocks_per_grid = tuple(math.ceil(N / dim) for dim in threads_per_block)

    # The ten launches are grouped under one named scope.
    with gfl.Scope("matrix_mul_compute", "math"):
        for _ in range(10):
            matmul_kernel[blocks_per_grid, threads_per_block](A, B, C)

    # Copy the result back to the host (the value itself is not used).
    _ = C.copy_to_host()
    print("[OK] compute finished")
finally:
    # Shut gpufl down whether or not the compute section succeeded.
    gfl.shutdown()

# Build the text report from the directory and file prefix of LOG_PATH.
print(
    generate_report(
        os.path.dirname(LOG_PATH) or ".",
        log_prefix=os.path.basename(LOG_PATH),
        top_n=10,
    )
)
=============================================================================== GPU Flight Session Report Generated: 2026-05-22 05:05:33 UTC
=============================================================================== =============================================================================== Session Summary
=============================================================================== Application: matmul_sample Session ID: 565d3c32-86cc-415d-8642-9c140f856f2b Duration: 17.91 s GPU Device: NVIDIA GeForce RTX 5060 Laptop GPU SMs: 26 Registers/Block: 65536 =============================================================================== Kernel Execution Summary
=============================================================================== Total Kernels: 10 Unique Kernels: 1 Total GPU Time: 17.40 s GPU Busy: 97.2% Avg Duration: 1.74 s Median Duration: 1.74 s Min Duration: 1.71 s Max Duration: 1.78 s =============================================================================== Top 10 Kernels by Total GPU Time
=============================================================================== # Kernel Calls Total Avg Max -------------------------------------------------------------------------------------- 1 __main__::matmul_kernel 10 17.40 s 1.74 s 1.78 s =============================================================================== Kernel Details (Top 10)
=============================================================================== __main__::matmul_kernel ======================= Grid: (128,128,1) Block: (16,16,1) Occupancy: 100.0% Reg Occupancy: 100.0% SMem Occupancy: 100.0% Warp Occupancy: 100.0% Block Occupancy: 100.0% Limiting Resource: warps Registers/Thread: 40 Shared Memory: 0 B dyn + 0 B static =============================================================================== Memory Transfer Summary
=============================================================================== Total Transfers: 4 Total Bytes: 64.0 MB Direction Count Total Bytes Avg Throughput ------------------------------------------------------ HtoD 3 48.0 MB 11.68 GB/s DtoH 1 16.0 MB 4.40 GB/s =============================================================================== System Metrics
=============================================================================== GPU Metrics: Utilization: avg 96.6% peak 100% min 0% Temperature: avg 53.4 C peak 58 C Power: avg 71.0 W peak 75.6 W VRAM Usage: peak 1105 MiB SM Clock: avg 2631 MHz peak 2790 MHz Host Metrics: CPU Utilization: avg 8.6% peak 29.1% RAM Usage: peak 27593 / 32189 MiB (85.7%) =============================================================================== Scope Summary
=============================================================================== Scope Timing: Scope Calls Total Avg Max ------------------------------------------------------------------------ matrix_mul_compute 1 195.21 ms 195.21 ms 195.21 ms GPU Time by Scope: Scope Kernels GPU Time Avg ---------------------------------------------------------------- matrix_mul_compute 10 17.40 s 1.74 s =============================================================================== Profile / SASS Analysis
=============================================================================== SASS Metrics Summary: Metric Total -------------------------------------------------------------- smsp__sass_thread_inst_executed 2235815690240 smsp__sass_inst_executed 69869240320 smsp__sass_sectors_mem_global 45654999040 smsp__sass_sectors_mem_global_ideal 13427015680 Thread Divergence Analysis: Warp Instructions: 69869240320 Thread Instructions: 2235815690240 Avg Threads/Warp: 32.0 / 32 Warp Efficiency: 100.0%
=============================================================================== GPU Flight Session Report Generated: 2026-05-22 05:05:33 UTC
=============================================================================== =============================================================================== Session Summary
=============================================================================== Application: matmul_sample Session ID: 565d3c32-86cc-415d-8642-9c140f856f2b Duration: 17.91 s GPU Device: NVIDIA GeForce RTX 5060 Laptop GPU SMs: 26 Registers/Block: 65536 =============================================================================== Kernel Execution Summary
=============================================================================== Total Kernels: 10 Unique Kernels: 1 Total GPU Time: 17.40 s GPU Busy: 97.2% Avg Duration: 1.74 s Median Duration: 1.74 s Min Duration: 1.71 s Max Duration: 1.78 s =============================================================================== Top 10 Kernels by Total GPU Time
=============================================================================== # Kernel Calls Total Avg Max -------------------------------------------------------------------------------------- 1 __main__::matmul_kernel 10 17.40 s 1.74 s 1.78 s =============================================================================== Kernel Details (Top 10)
=============================================================================== __main__::matmul_kernel ======================= Grid: (128,128,1) Block: (16,16,1) Occupancy: 100.0% Reg Occupancy: 100.0% SMem Occupancy: 100.0% Warp Occupancy: 100.0% Block Occupancy: 100.0% Limiting Resource: warps Registers/Thread: 40 Shared Memory: 0 B dyn + 0 B static =============================================================================== Memory Transfer Summary
=============================================================================== Total Transfers: 4 Total Bytes: 64.0 MB Direction Count Total Bytes Avg Throughput ------------------------------------------------------ HtoD 3 48.0 MB 11.68 GB/s DtoH 1 16.0 MB 4.40 GB/s =============================================================================== System Metrics
=============================================================================== GPU Metrics: Utilization: avg 96.6% peak 100% min 0% Temperature: avg 53.4 C peak 58 C Power: avg 71.0 W peak 75.6 W VRAM Usage: peak 1105 MiB SM Clock: avg 2631 MHz peak 2790 MHz Host Metrics: CPU Utilization: avg 8.6% peak 29.1% RAM Usage: peak 27593 / 32189 MiB (85.7%) =============================================================================== Scope Summary
=============================================================================== Scope Timing: Scope Calls Total Avg Max ------------------------------------------------------------------------ matrix_mul_compute 1 195.21 ms 195.21 ms 195.21 ms GPU Time by Scope: Scope Kernels GPU Time Avg ---------------------------------------------------------------- matrix_mul_compute 10 17.40 s 1.74 s =============================================================================== Profile / SASS Analysis
=============================================================================== SASS Metrics Summary: Metric Total -------------------------------------------------------------- smsp__sass_thread_inst_executed 2235815690240 smsp__sass_inst_executed 69869240320 smsp__sass_sectors_mem_global 45654999040 smsp__sass_sectors_mem_global_ideal 13427015680 Thread Divergence Analysis: Warp Instructions: 69869240320 Thread Instructions: 2235815690240 Avg Threads/Warp: 32.0 / 32 Warp Efficiency: 100.0%
=============================================================================== GPU Flight Session Report Generated: 2026-05-22 05:05:33 UTC
=============================================================================== =============================================================================== Session Summary
=============================================================================== Application: matmul_sample Session ID: 565d3c32-86cc-415d-8642-9c140f856f2b Duration: 17.91 s GPU Device: NVIDIA GeForce RTX 5060 Laptop GPU SMs: 26 Registers/Block: 65536 =============================================================================== Kernel Execution Summary
=============================================================================== Total Kernels: 10 Unique Kernels: 1 Total GPU Time: 17.40 s GPU Busy: 97.2% Avg Duration: 1.74 s Median Duration: 1.74 s Min Duration: 1.71 s Max Duration: 1.78 s =============================================================================== Top 10 Kernels by Total GPU Time
=============================================================================== # Kernel Calls Total Avg Max -------------------------------------------------------------------------------------- 1 __main__::matmul_kernel 10 17.40 s 1.74 s 1.78 s =============================================================================== Kernel Details (Top 10)
=============================================================================== __main__::matmul_kernel ======================= Grid: (128,128,1) Block: (16,16,1) Occupancy: 100.0% Reg Occupancy: 100.0% SMem Occupancy: 100.0% Warp Occupancy: 100.0% Block Occupancy: 100.0% Limiting Resource: warps Registers/Thread: 40 Shared Memory: 0 B dyn + 0 B static =============================================================================== Memory Transfer Summary
=============================================================================== Total Transfers: 4 Total Bytes: 64.0 MB Direction Count Total Bytes Avg Throughput ------------------------------------------------------ HtoD 3 48.0 MB 11.68 GB/s DtoH 1 16.0 MB 4.40 GB/s =============================================================================== System Metrics
=============================================================================== GPU Metrics: Utilization: avg 96.6% peak 100% min 0% Temperature: avg 53.4 C peak 58 C Power: avg 71.0 W peak 75.6 W VRAM Usage: peak 1105 MiB SM Clock: avg 2631 MHz peak 2790 MHz Host Metrics: CPU Utilization: avg 8.6% peak 29.1% RAM Usage: peak 27593 / 32189 MiB (85.7%) =============================================================================== Scope Summary
=============================================================================== Scope Timing: Scope Calls Total Avg Max ------------------------------------------------------------------------ matrix_mul_compute 1 195.21 ms 195.21 ms 195.21 ms GPU Time by Scope: Scope Kernels GPU Time Avg ---------------------------------------------------------------- matrix_mul_compute 10 17.40 s 1.74 s =============================================================================== Profile / SASS Analysis
=============================================================================== SASS Metrics Summary: Metric Total -------------------------------------------------------------- smsp__sass_thread_inst_executed 2235815690240 smsp__sass_inst_executed 69869240320 smsp__sass_sectors_mem_global 45654999040 smsp__sass_sectors_mem_global_ideal 13427015680 Thread Divergence Analysis: Warp Instructions: 69869240320 Thread Instructions: 2235815690240 Avg Threads/Warp: 32.0 / 32 Warp Efficiency: 100.0%
GPU Busy: 97.2%
GPU Util avg: 96.6%
Total GPU Time: 17.40 s
Duration: 17.91 s
GPU Busy: 97.2%
GPU Util avg: 96.6%
Total GPU Time: 17.40 s
Duration: 17.91 s
GPU Busy: 97.2%
GPU Util avg: 96.6%
Total GPU Time: 17.40 s
Duration: 17.91 s
Avg Duration: 1.74 s
Median Duration: 1.74 s
Min Duration: 1.71 s
Max Duration: 1.78 s
Avg Duration: 1.74 s
Median Duration: 1.74 s
Min Duration: 1.71 s
Max Duration: 1.78 s
Avg Duration: 1.74 s
Median Duration: 1.74 s
Min Duration: 1.71 s
Max Duration: 1.78 s
Occupancy: 100.0%
Reg Occupancy: 100.0%
SMem Occupancy: 100.0%
Warp Occupancy: 100.0%
Block Occupancy: 100.0%
Limiting Resource: warps
Occupancy: 100.0%
Reg Occupancy: 100.0%
SMem Occupancy: 100.0%
Warp Occupancy: 100.0%
Block Occupancy: 100.0%
Limiting Resource: warps
Occupancy: 100.0%
Reg Occupancy: 100.0%
SMem Occupancy: 100.0%
Warp Occupancy: 100.0%
Block Occupancy: 100.0%
Limiting Resource: warps
Avg Threads/Warp: 32.0 / 32
Warp Efficiency: 100.0%
Avg Threads/Warp: 32.0 / 32
Warp Efficiency: 100.0%
Avg Threads/Warp: 32.0 / 32
Warp Efficiency: 100.0%
SASS Metrics Summary:
Metric Total
--------------------------------------------------------------
smsp__sass_thread_inst_executed 2235815690240
smsp__sass_inst_executed 69869240320
smsp__sass_sectors_mem_global 45654999040
smsp__sass_sectors_mem_global_ideal 13427015680
SASS Metrics Summary:
Metric Total
--------------------------------------------------------------
smsp__sass_thread_inst_executed 2235815690240
smsp__sass_inst_executed 69869240320
smsp__sass_sectors_mem_global 45654999040
smsp__sass_sectors_mem_global_ideal 13427015680
SASS Metrics Summary:
Metric Total
--------------------------------------------------------------
smsp__sass_thread_inst_executed 2235815690240
smsp__sass_inst_executed 69869240320
smsp__sass_sectors_mem_global 45654999040
smsp__sass_sectors_mem_global_ideal 13427015680
smsp__sass_sectors_mem_global 45,654,999,040
smsp__sass_sectors_mem_global_ideal 13,427,015,680
smsp__sass_sectors_mem_global 45,654,999,040
smsp__sass_sectors_mem_global_ideal 13,427,015,680
smsp__sass_sectors_mem_global 45,654,999,040
smsp__sass_sectors_mem_global_ideal 13,427,015,680
45.7 / 13.4 ≈ 3.4x
45.7 / 13.4 ≈ 3.4x
45.7 / 13.4 ≈ 3.4x
13.4 / 45.7 ≈ 29%
13.4 / 45.7 ≈ 29%
13.4 / 45.7 ≈ 29%
from numba import cuda, float32

TPB = 16


@cuda.jit
def matmul_kernel_perf(A, B, C):
    """Tiled matrix multiply using two TPB x TPB shared-memory tiles.

    Each thread accumulates one element ``C[y, x]``. For every tile along
    the inner dimension the block stages one tile of ``A`` and one of ``B``
    in shared arrays, synchronises, and then sums the TPB partial products.
    Tile entries that fall outside the inputs are left at zero.
    """
    tile_a = cuda.shared.array((TPB, TPB), dtype=float32)
    tile_b = cuda.shared.array((TPB, TPB), dtype=float32)

    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y

    acc = float32(0.0)
    n_tiles = (A.shape[1] + TPB - 1) // TPB  # ceiling division

    for tile in range(n_tiles):
        a_col = tx + tile * TPB
        b_row = ty + tile * TPB

        # Zero first so out-of-range slots contribute nothing to the sum.
        tile_a[ty, tx] = 0.0
        tile_b[ty, tx] = 0.0
        if y < A.shape[0] and a_col < A.shape[1]:
            tile_a[ty, tx] = A[y, a_col]
        if x < B.shape[1] and b_row < B.shape[0]:
            tile_b[ty, tx] = B[b_row, x]

        # Every thread must finish loading before any thread reads the tiles.
        cuda.syncthreads()

        for k in range(TPB):
            acc += tile_a[ty, k] * tile_b[k, tx]

        # Every thread must finish reading before the tiles are overwritten.
        cuda.syncthreads()

    if y < C.shape[0] and x < C.shape[1]:
        C[y, x] = acc
from numba import cuda, float32

TPB = 16


@cuda.jit
def matmul_kernel_perf(A, B, C):
    """Tiled matrix multiply using two TPB x TPB shared-memory tiles.

    Each thread accumulates one element ``C[y, x]``. For every tile along
    the inner dimension the block stages one tile of ``A`` and one of ``B``
    in shared arrays, synchronises, and then sums the TPB partial products.
    Tile entries that fall outside the inputs are left at zero.
    """
    tile_a = cuda.shared.array((TPB, TPB), dtype=float32)
    tile_b = cuda.shared.array((TPB, TPB), dtype=float32)

    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y

    acc = float32(0.0)
    n_tiles = (A.shape[1] + TPB - 1) // TPB  # ceiling division

    for tile in range(n_tiles):
        a_col = tx + tile * TPB
        b_row = ty + tile * TPB

        # Zero first so out-of-range slots contribute nothing to the sum.
        tile_a[ty, tx] = 0.0
        tile_b[ty, tx] = 0.0
        if y < A.shape[0] and a_col < A.shape[1]:
            tile_a[ty, tx] = A[y, a_col]
        if x < B.shape[1] and b_row < B.shape[0]:
            tile_b[ty, tx] = B[b_row, x]

        # Every thread must finish loading before any thread reads the tiles.
        cuda.syncthreads()

        for k in range(TPB):
            acc += tile_a[ty, k] * tile_b[k, tx]

        # Every thread must finish reading before the tiles are overwritten.
        cuda.syncthreads()

    if y < C.shape[0] and x < C.shape[1]:
        C[y, x] = acc
from numba import cuda, float32

TPB = 16


@cuda.jit
def matmul_kernel_perf(A, B, C):
    """Tiled matrix multiply using two TPB x TPB shared-memory tiles.

    Each thread accumulates one element ``C[y, x]``. For every tile along
    the inner dimension the block stages one tile of ``A`` and one of ``B``
    in shared arrays, synchronises, and then sums the TPB partial products.
    Tile entries that fall outside the inputs are left at zero.
    """
    tile_a = cuda.shared.array((TPB, TPB), dtype=float32)
    tile_b = cuda.shared.array((TPB, TPB), dtype=float32)

    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y

    acc = float32(0.0)
    n_tiles = (A.shape[1] + TPB - 1) // TPB  # ceiling division

    for tile in range(n_tiles):
        a_col = tx + tile * TPB
        b_row = ty + tile * TPB

        # Zero first so out-of-range slots contribute nothing to the sum.
        tile_a[ty, tx] = 0.0
        tile_b[ty, tx] = 0.0
        if y < A.shape[0] and a_col < A.shape[1]:
            tile_a[ty, tx] = A[y, a_col]
        if x < B.shape[1] and b_row < B.shape[0]:
            tile_b[ty, tx] = B[b_row, x]

        # Every thread must finish loading before any thread reads the tiles.
        cuda.syncthreads()

        for k in range(TPB):
            acc += tile_a[ty, k] * tile_b[k, tx]

        # Every thread must finish reading before the tiles are overwritten.
        cuda.syncthreads()

    if y < C.shape[0] and x < C.shape[1]:
        C[y, x] = acc
=============================================================================== GPU Flight Session Report Generated: 2026-05-22 05:20:40 UTC
=============================================================================== =============================================================================== Session Summary
=============================================================================== Application: matmul_sample_perf Session ID: d44e5478-ba19-4cd1-b3cf-f6d31ab8b0ca Duration: 2.90 s GPU Device: NVIDIA GeForce RTX 5060 Laptop GPU SMs: 26 Registers/Block: 65536 =============================================================================== Kernel Execution Summary
=============================================================================== Total Kernels: 10 Unique Kernels: 1 Total GPU Time: 2.22 s GPU Busy: 76.4% Avg Duration: 221.64 ms Median Duration: 216.89 ms Min Duration: 215.38 ms Max Duration: 250.06 ms =============================================================================== Top 10 Kernels by Total GPU Time
=============================================================================== # Kernel Calls Total Avg Max -------------------------------------------------------------------------------------- 1 __main__::matmul_kernel_perf 10 2.22 s 221.64 ms 250.06 ms =============================================================================== Kernel Details (Top 10)
=============================================================================== __main__::matmul_kernel_perf ============================ Grid: (128,128,1) Block: (16,16,1) Occupancy: 100.0% Reg Occupancy: 100.0% SMem Occupancy: 100.0% Warp Occupancy: 100.0% Block Occupancy: 100.0% Limiting Resource: warps Registers/Thread: 37 Shared Memory: 0 B dyn + 2.0 KB static =============================================================================== Memory Transfer Summary
=============================================================================== Total Transfers: 4 Total Bytes: 64.0 MB Direction Count Total Bytes Avg Throughput ------------------------------------------------------ HtoD 3 48.0 MB 9.87 GB/s DtoH 1 16.0 MB 4.45 GB/s =============================================================================== System Metrics
=============================================================================== GPU Metrics: Utilization: avg 74.9% peak 100% min 0% Temperature: avg 43.0 C peak 48 C Power: avg 51.0 W peak 76.1 W VRAM Usage: peak 958 MiB SM Clock: avg 2180 MHz peak 2812 MHz Host Metrics: CPU Utilization: avg 16.0% peak 46.0% RAM Usage: peak 27019 / 32189 MiB (83.9%) =============================================================================== Scope Summary
=============================================================================== Scope Timing: Scope Calls Total Avg Max ------------------------------------------------------------------------ matrix_mul_compute_perf 1 330.58 ms 330.58 ms 330.58 ms GPU Time by Scope: Scope Kernels GPU Time Avg ---------------------------------------------------------------- matrix_mul_compute_perf 10 2.22 s 221.64 ms =============================================================================== Profile / SASS Analysis
=============================================================================== SASS Metrics Summary: Metric Total -------------------------------------------------------------- smsp__sass_thread_inst_executed 298005299200 smsp__sass_inst_executed 9312665600 smsp__sass_sectors_mem_global 1347420160 smsp__sass_sectors_mem_global_ideal 1347420160 Thread Divergence Analysis: Warp Instructions: 9312665600 Thread Instructions: 298005299200 Avg Threads/Warp: 32.0 / 32 Warp Efficiency: 100.0%
=============================================================================== GPU Flight Session Report Generated: 2026-05-22 05:20:40 UTC
=============================================================================== =============================================================================== Session Summary
=============================================================================== Application: matmul_sample_perf Session ID: d44e5478-ba19-4cd1-b3cf-f6d31ab8b0ca Duration: 2.90 s GPU Device: NVIDIA GeForce RTX 5060 Laptop GPU SMs: 26 Registers/Block: 65536 =============================================================================== Kernel Execution Summary
=============================================================================== Total Kernels: 10 Unique Kernels: 1 Total GPU Time: 2.22 s GPU Busy: 76.4% Avg Duration: 221.64 ms Median Duration: 216.89 ms Min Duration: 215.38 ms Max Duration: 250.06 ms =============================================================================== Top 10 Kernels by Total GPU Time
=============================================================================== # Kernel Calls Total Avg Max -------------------------------------------------------------------------------------- 1 __main__::matmul_kernel_perf 10 2.22 s 221.64 ms 250.06 ms =============================================================================== Kernel Details (Top 10)
=============================================================================== __main__::matmul_kernel_perf ============================ Grid: (128,128,1) Block: (16,16,1) Occupancy: 100.0% Reg Occupancy: 100.0% SMem Occupancy: 100.0% Warp Occupancy: 100.0% Block Occupancy: 100.0% Limiting Resource: warps Registers/Thread: 37 Shared Memory: 0 B dyn + 2.0 KB static =============================================================================== Memory Transfer Summary
=============================================================================== Total Transfers: 4 Total Bytes: 64.0 MB Direction Count Total Bytes Avg Throughput ------------------------------------------------------ HtoD 3 48.0 MB 9.87 GB/s DtoH 1 16.0 MB 4.45 GB/s =============================================================================== System Metrics
=============================================================================== GPU Metrics: Utilization: avg 74.9% peak 100% min 0% Temperature: avg 43.0 C peak 48 C Power: avg 51.0 W peak 76.1 W VRAM Usage: peak 958 MiB SM Clock: avg 2180 MHz peak 2812 MHz Host Metrics: CPU Utilization: avg 16.0% peak 46.0% RAM Usage: peak 27019 / 32189 MiB (83.9%) =============================================================================== Scope Summary
=============================================================================== Scope Timing: Scope Calls Total Avg Max ------------------------------------------------------------------------ matrix_mul_compute_perf 1 330.58 ms 330.58 ms 330.58 ms GPU Time by Scope: Scope Kernels GPU Time Avg ---------------------------------------------------------------- matrix_mul_compute_perf 10 2.22 s 221.64 ms =============================================================================== Profile / SASS Analysis
=============================================================================== SASS Metrics Summary: Metric Total -------------------------------------------------------------- smsp__sass_thread_inst_executed 298005299200 smsp__sass_inst_executed 9312665600 smsp__sass_sectors_mem_global 1347420160 smsp__sass_sectors_mem_global_ideal 1347420160 Thread Divergence Analysis: Warp Instructions: 9312665600 Thread Instructions: 298005299200 Avg Threads/Warp: 32.0 / 32 Warp Efficiency: 100.0%
=============================================================================== GPU Flight Session Report Generated: 2026-05-22 05:20:40 UTC
=============================================================================== =============================================================================== Session Summary
=============================================================================== Application: matmul_sample_perf Session ID: d44e5478-ba19-4cd1-b3cf-f6d31ab8b0ca Duration: 2.90 s GPU Device: NVIDIA GeForce RTX 5060 Laptop GPU SMs: 26 Registers/Block: 65536 =============================================================================== Kernel Execution Summary
=============================================================================== Total Kernels: 10 Unique Kernels: 1 Total GPU Time: 2.22 s GPU Busy: 76.4% Avg Duration: 221.64 ms Median Duration: 216.89 ms Min Duration: 215.38 ms Max Duration: 250.06 ms =============================================================================== Top 10 Kernels by Total GPU Time
=============================================================================== # Kernel Calls Total Avg Max -------------------------------------------------------------------------------------- 1 __main__::matmul_kernel_perf 10 2.22 s 221.64 ms 250.06 ms =============================================================================== Kernel Details (Top 10)
=============================================================================== __main__::matmul_kernel_perf ============================ Grid: (128,128,1) Block: (16,16,1) Occupancy: 100.0% Reg Occupancy: 100.0% SMem Occupancy: 100.0% Warp Occupancy: 100.0% Block Occupancy: 100.0% Limiting Resource: warps Registers/Thread: 37 Shared Memory: 0 B dyn + 2.0 KB static =============================================================================== Memory Transfer Summary
=============================================================================== Total Transfers: 4 Total Bytes: 64.0 MB Direction Count Total Bytes Avg Throughput ------------------------------------------------------ HtoD 3 48.0 MB 9.87 GB/s DtoH 1 16.0 MB 4.45 GB/s =============================================================================== System Metrics
=============================================================================== GPU Metrics: Utilization: avg 74.9% peak 100% min 0% Temperature: avg 43.0 C peak 48 C Power: avg 51.0 W peak 76.1 W VRAM Usage: peak 958 MiB SM Clock: avg 2180 MHz peak 2812 MHz Host Metrics: CPU Utilization: avg 16.0% peak 46.0% RAM Usage: peak 27019 / 32189 MiB (83.9%) =============================================================================== Scope Summary
=============================================================================== Scope Timing: Scope Calls Total Avg Max ------------------------------------------------------------------------ matrix_mul_compute_perf 1 330.58 ms 330.58 ms 330.58 ms GPU Time by Scope: Scope Kernels GPU Time Avg ---------------------------------------------------------------- matrix_mul_compute_perf 10 2.22 s 221.64 ms =============================================================================== Profile / SASS Analysis
=============================================================================== SASS Metrics Summary: Metric Total -------------------------------------------------------------- smsp__sass_thread_inst_executed 298005299200 smsp__sass_inst_executed 9312665600 smsp__sass_sectors_mem_global 1347420160 smsp__sass_sectors_mem_global_ideal 1347420160 Thread Divergence Analysis: Warp Instructions: 9312665600 Thread Instructions: 298005299200 Avg Threads/Warp: 32.0 / 32 Warp Efficiency: 100.0%
smsp__sass_sectors_mem_global 45,654,999,040
smsp__sass_sectors_mem_global_ideal 13,427,015,680
smsp__sass_sectors_mem_global 45,654,999,040
smsp__sass_sectors_mem_global_ideal 13,427,015,680
smsp__sass_sectors_mem_global 45,654,999,040
smsp__sass_sectors_mem_global_ideal 13,427,015,680
smsp__sass_sectors_mem_global 1,347,420,160
smsp__sass_sectors_mem_global_ideal 1,347,420,160
smsp__sass_sectors_mem_global 1,347,420,160
smsp__sass_sectors_mem_global_ideal 1,347,420,160
smsp__sass_sectors_mem_global 1,347,420,160
smsp__sass_sectors_mem_global_ideal 1,347,420,160
Shared Memory: 0 B dyn + 2.0 KB static
Shared Memory: 0 B dyn + 2.0 KB static
Shared Memory: 0 B dyn + 2.0 KB static
Naive thread instructions: 2,235,815,690,240
Tiled thread instructions: 298,005,299,200
Naive thread instructions: 2,235,815,690,240
Tiled thread instructions: 298,005,299,200
Naive thread instructions: 2,235,815,690,240
Tiled thread instructions: 298,005,299,200

- The GPU is busy.
- Occupancy is high.
- Warp efficiency is perfect.

- high GPU utilization,
- 100% occupancy,
- 100% warp efficiency,
- but very inefficient global memory access.

- total profiled GPU time dropped from 17.40 s to 2.22 s,
- average profiled kernel time dropped from 1.74 s to 221.64 ms,
- global memory sectors dropped from 45.65B to 1.35B,
- and actual global memory sectors matched the ideal number.