From 1b0d378cdfb001bf549bfb46103de330370622a1 Mon Sep 17 00:00:00 2001 From: Vadim Skipin Date: Sat, 9 May 2026 21:53:35 +0000 Subject: [PATCH] Drop sockperf support; improve bb perf; update perf.md --- .github/workflows/ci.yml | 2 +- README.md | 49 ++++--- bb | 277 +++++++++++---------------------------- docs/perf.md | 260 ++++++++++++++++++------------------ 4 files changed, 224 insertions(+), 364 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 270331d..3ca7312 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -138,7 +138,7 @@ jobs: - name: perf if: matrix.build.name == 'release' || matrix.build.name == 'tsan' - run: ${{ matrix.build.perf_cmd }} --file --net --http + run: ${{ matrix.build.perf_cmd }} file net http - name: Upload coverage report if: matrix.build.name == 'coverage' && matrix.arch.name == 'amd64' diff --git a/README.md b/README.md index 80b40bf..0ab710c 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ A cooperative fiber scheduler for Linux with per-CPU scheduler threads, io_uring GTest, Google Benchmark, libbacktrace, liburing, librseq, libbpf, and bpftool are bundled as submodules under `contrib/` and do not need to be installed separately. Poco, the AWS SDK, and jemalloc are built on demand via `--build-poco`, `--build-aws`, and `--build-jemalloc` passed to `configure`. -Runtime dependencies for optional benchmarks: nginx (only for `http-perf --nginx`; the default uses an internal Poco-based server built into the `http-perf` binary), fio (for `fio-perf`), sockperf (for `sockperf-perf`), and MinIO (for `s3-perf`). MinIO is downloaded automatically to `.tools/` if not in PATH; the others must be installed separately. +Runtime dependencies for optional benchmarks: nginx (only for `http-perf --nginx`; the default uses an internal Poco-based server built into the `http-perf` binary), fio (for `fio-perf`), and MinIO (for `s3-perf`). MinIO is downloaded automatically to `.tools/` if not in PATH; the others must be installed separately. ## Build @@ -193,15 +193,6 @@ TCP echo benchmark using Boost.Asio C++20 coroutines. Same options as `net-perf` ./bb -b release net-perf-asio --flamegraph ``` -#### `sockperf-perf` - -sockperf comparison (ping-pong). Same options as `net-perf` (except `--delay`, `--flamegraph`, and `--print-counters`). Does not build anything. - -``` -./bb sockperf-perf -./bb sockperf-perf --connections 1 4 16 -``` - #### `http-perf` HTTP/1.1 GET benchmark. Defaults to silk's internal HTTP server (Poco's `HTTPServerConnection` over `FiberSocketImpl`, one fiber per connection); pass `--nginx` to run against nginx instead. @@ -261,23 +252,31 @@ S3 object storage benchmark. Starts a local MinIO server (downloaded automatical #### `perf` -Run multiple perf benchmarks in one shot. +Run multiple perf benchmarks in one shot. Targets are positional values, listed after the options. 
-| Flag | Description | +| Target | Description | +|---|---| +| `file` | file-perf | +| `fio` | fio comparison | +| `net` | net-perf | +| `net-asio` | net-perf-asio | +| `net-epoll` | net-perf-epoll | +| `http` | http-perf (internal server, fiber client) | +| `http-threads` | http-perf (internal server, thread client) | +| `http-nginx` | http-perf against nginx (fiber client) | +| `s3` | s3-perf (fibers) | +| `s3-threads` | s3-perf (threads) | +| `all` | run every target above | + +| Option | Description | |---|---| -| `--file` | Run file-perf | -| `--fio` | Run fio comparison | -| `--net` | Run net-perf | -| `--net-asio` | Run net-perf-asio | -| `--sockperf` | Run sockperf comparison | -| `--http` | Run http-perf (internal server, fiber client) | -| `--http-threads` | Run http-perf (internal server, thread client) | -| `--http-nginx` | Run http-perf against nginx | -| `--s3` | Run s3-perf (fibers) | -| `--s3-threads` | Run s3-perf (threads) | -| `--all` | Run everything | +| `--duration DURATION` | Override per-binary measurement duration (e.g. `60s`) | +| `--warmup DURATION` | Override per-binary warmup duration (e.g. `10s`) | +| `--timeout SECONDS` | Per-run timeout (default 180, 0 = none) | ``` -./bb -b release perf --net --file -./bb -b release perf --all +./bb -b release perf file net +./bb -b release perf all +./bb -b release perf --duration 60s --warmup 10s file net net-asio +./bb -b release perf --duration 60s --warmup 10s all ``` diff --git a/bb b/bb index 613b7eb..996e97b 100755 --- a/bb +++ b/bb @@ -14,7 +14,6 @@ import socket import subprocess import sys import tempfile -import threading import time import xml.etree.ElementTree as ET from collections.abc import Sequence @@ -908,118 +907,6 @@ def cmd_fio_perf(params: FilePerfParams) -> None: os.unlink(params.file) -def _parse_sockperf(output: str) -> dict[str, Any]: - result: dict[str, Any] = {} - - m = re.search(r"Summary: Round trip is ([\d.]+) usec", output) - if m: - result["avg"] = round(float(m.group(1)), 2) - - for pct_str, key in [("99.900", "p999"), ("99.000", "p99"), ("50.000", "p50")]: - m = re.search(rf"percentile {re.escape(pct_str)}\s*=\s*([\d.]+)", output) - if m: - result[key] = round(float(m.group(1)), 2) - - m = re.search( - r"\[Valid Duration\] RunTime=([\d.]+) sec; SentMessages=(\d+)", output - ) - if m: - runtime = float(m.group(1)) - sent = int(m.group(2)) - if runtime > 0: - result["iops_raw"] = round(sent / runtime) - - return result - - -_SP_HEADERS: list[str] = ["connections", "IOPS", "avg", "p50", "p99", "p99.9"] -_SP_WIDTHS: list[int] = [11, 8, 8, 8, 8, 8] - - -def cmd_sockperf_perf(params: NetPerfParams) -> None: - print() - print("## sockperf comparison -- TCP echo") - print() - print( - f"{params.host}:{params.port}, msg_size={params.msg_size}, duration={params.duration}" - ) - print() - - server_cpus, client_cpus = _cpu_split() - local = params.host in ("127.0.0.1", "localhost") - - server = None - if local: - server = start_process( - "taskset", - "-c", - server_cpus, - "sockperf", - "server", - "-p", - str(params.port), - "--tcp", - ) - wait_for_tcp_port(params.host, params.port) - - try: - print(_perf_row(_SP_HEADERS, _SP_WIDTHS)) - print(_perf_sep(_SP_WIDTHS)) - - for conns in params.connections: - outputs: list[str] = [""] * conns - - def run_client(i: int) -> None: - result = run_capture( - "taskset", - "-c", - client_cpus, - "sockperf", - "ping-pong", - "-i", - params.host, - "-p", - str(params.port), - "--tcp", - "-m", - str(params.msg_size), - "-t", - 
str(int(_parse_duration_s(params.duration))), - "--full-rtt", - timeout=params.timeout or None, - ) - outputs[i] = result.stderr + result.stdout - - threads = [ - threading.Thread(target=run_client, args=(i,)) for i in range(conns) - ] - for t in threads: - t.start() - for t in threads: - t.join() - - parsed = [_parse_sockperf(o) for o in outputs] - total_iops = sum(p.get("iops_raw", 0) for p in parsed) - - def avg_metric(key: str, _parsed: list[dict[str, Any]] = parsed) -> str: - vals = [p[key] for p in _parsed if key in p] - return _us(round(sum(vals) / len(vals), 2)) if vals else "?" - - cells: list[str | int] = [ - conns, - f"{round(total_iops / 1000)}k" if total_iops else "?", - avg_metric("avg"), - avg_metric("p50"), - avg_metric("p99"), - avg_metric("p999"), - ] - print(_perf_row(cells, _SP_WIDTHS)) - finally: - if server: - server.terminate() - server.wait() - - @dataclass class HttpPerfParams: host: str = "127.0.0.1" @@ -1572,7 +1459,27 @@ def _build_parser() -> argparse.ArgumentParser: # perf_parser = sub.add_parser( - "perf", help="build release then run net-perf and file-perf" + "perf", + help="build release then run a set of perf benchmarks", + formatter_class=argparse.RawDescriptionHelpFormatter, + description=( + "Run one or more perf benchmarks in a single invocation.\n\n" + "Targets (positional, repeatable):\n" + " file file-perf\n" + " fio fio comparison\n" + " net net-perf\n" + " net-asio net-perf-asio\n" + " net-epoll net-perf-epoll\n" + " http http-perf (internal server, fibers)\n" + " http-threads http-perf (internal server, thread client)\n" + " http-nginx http-perf against nginx (fiber client)\n" + " s3 s3-perf (fibers)\n" + " s3-threads s3-perf (threads)\n" + " all run every target above\n\n" + "Examples:\n" + " ./bb -b release perf --duration 60s --warmup 10s file net net-asio\n" + " ./bb -b release perf all --duration 60s --warmup 10s\n" + ), ) perf_parser.add_argument( "--timeout", @@ -1581,34 +1488,37 @@ def _build_parser() -> argparse.ArgumentParser: metavar="SECONDS", help="per-run timeout in seconds (default: 180, 0=none)", ) - perf_parser.add_argument("--net", action="store_true", help="run net-perf") perf_parser.add_argument( - "--net-asio", action="store_true", help="run net-perf-asio" - ) - perf_parser.add_argument( - "--net-epoll", action="store_true", help="run net-perf-epoll" - ) - perf_parser.add_argument("--file", action="store_true", help="run file-perf") - perf_parser.add_argument( - "--http", action="store_true", help="run http-perf (internal server, fibers)" - ) - perf_parser.add_argument( - "--http-threads", - action="store_true", - help="run http-perf (internal server, thread client)", - ) - perf_parser.add_argument( - "--http-nginx", action="store_true", help="run http-perf against nginx" + "--duration", + default=None, + metavar="DURATION", + help="measurement duration applied to every benchmark (e.g. 60s); per-binary defaults are used when omitted", ) - perf_parser.add_argument("--fio", action="store_true", help="run fio comparison") perf_parser.add_argument( - "--sockperf", action="store_true", help="run sockperf comparison" + "--warmup", + default=None, + metavar="DURATION", + help="warmup duration applied to every benchmark (e.g. 
10s); per-binary defaults are used when omitted", ) - perf_parser.add_argument("--s3", action="store_true", help="run s3-perf") perf_parser.add_argument( - "--s3-threads", action="store_true", help="run s3-perf (threads)" + "targets", + nargs="+", + metavar="TARGET", + choices=[ + "file", + "fio", + "net", + "net-asio", + "net-epoll", + "http", + "http-threads", + "http-nginx", + "s3", + "s3-threads", + "all", + ], + help="benchmarks to run (see list above; use 'all' for every target)", ) - perf_parser.add_argument("--all", action="store_true", help="run everything") # # file-perf @@ -1819,55 +1729,6 @@ def _build_parser() -> argparse.ArgumentParser: ) _add_net_args(net_perf_epoll_parser) - # - # sockperf-perf - # - - sockperf_perf_parser = sub.add_parser( - "sockperf-perf", help="run sockperf comparison" - ) - sockperf_perf_parser.add_argument( - "--host", dest="sockperf_host", default=net_params.host - ) - sockperf_perf_parser.add_argument( - "--port", dest="sockperf_port", default=net_params.port, type=int - ) - sockperf_perf_parser.add_argument( - "--msg-size", - dest="sockperf_msg_size", - default=net_params.msg_size, - type=int, - metavar="BYTES", - ) - sockperf_perf_parser.add_argument( - "--duration", - dest="sockperf_duration", - default=net_params.duration, - metavar="DURATION", - ) - sockperf_perf_parser.add_argument( - "--warmup", - dest="sockperf_warmup", - default=net_params.warmup, - metavar="DURATION", - ) - sockperf_perf_parser.add_argument( - "--connections", - dest="sockperf_connections", - default=net_params.connections, - type=int, - nargs="+", - metavar="N", - ) - sockperf_perf_parser.add_argument( - "--timeout", - dest="sockperf_timeout", - default=180, - type=int, - metavar="SECONDS", - help="per-run timeout in seconds (default: 180, 0=none)", - ) - # # http-perf # @@ -2098,9 +1959,6 @@ def main() -> None: cmd_build(preset, ["net-perf-epoll"]) params = _params_from_args(args, "net", NetPerfParams) cmd_net_perf(preset, replace(params, engine=NetPerfEngine.EPOLL)) - elif args.command == "sockperf-perf": - _check_no_extra(extra) - cmd_sockperf_perf(_params_from_args(args, "sockperf", NetPerfParams)) elif args.command == "http-perf": _check_no_extra(extra) cmd_build(preset, ["http-perf"]) @@ -2111,41 +1969,55 @@ def main() -> None: cmd_s3_perf(preset, _params_from_args(args, "s3", S3PerfParams)) elif args.command == "perf": _check_no_extra(extra) + timing_overrides: dict[str, str] = {} + if args.duration is not None: + timing_overrides["duration"] = args.duration + if args.warmup is not None: + timing_overrides["warmup"] = args.warmup + targets = set(args.targets) + if "all" in targets: + targets = { + "file", "fio", "net", "net-asio", "net-epoll", + "http", "http-threads", "http-nginx", "s3", "s3-threads", + } file_params = FilePerfParams( numjobs=[1, 16], iodepth=[1, 16], rw=["randwrite", "randread"], timeout=args.timeout, + **timing_overrides, ) - if args.file or args.all: + if "file" in targets: cmd_build(preset, ["file-perf"]) cmd_file_perf(preset, file_params) - if args.fio or args.all: + if "fio" in targets: cmd_fio_perf(file_params) net_params = NetPerfParams( - connections=[1, 256, 512, 1024], timeout=args.timeout + connections=[1, 256, 512, 1024], + timeout=args.timeout, + **timing_overrides, ) - if args.net or args.all: + if "net" in targets: cmd_build(preset, ["net-perf"]) cmd_net_perf(preset, replace(net_params, engine=NetPerfEngine.FIBERS)) - if args.net_asio or args.all: + if "net-asio" in targets: cmd_build(preset, ["net-perf-asio"]) cmd_net_perf(preset, 
replace(net_params, engine=NetPerfEngine.ASIO)) - if args.net_epoll or args.all: + if "net-epoll" in targets: cmd_build(preset, ["net-perf-epoll"]) cmd_net_perf(preset, replace(net_params, engine=NetPerfEngine.EPOLL)) - if args.sockperf or args.all: - cmd_sockperf_perf(net_params) http_params = HttpPerfParams( - connections=[1, 256, 512, 1024], timeout=args.timeout + connections=[1, 256, 512, 1024], + timeout=args.timeout, + **timing_overrides, ) - if args.http or args.all: + if "http" in targets: cmd_build(preset, ["http-perf"]) cmd_http_perf(preset, http_params) - if args.http_threads or args.all: + if "http-threads" in targets: cmd_build(preset, ["http-perf"]) cmd_http_perf(preset, replace(http_params, threads=True)) - if args.http_nginx or args.all: + if "http-nginx" in targets: cmd_build(preset, ["http-perf"]) cmd_http_perf(preset, replace(http_params, nginx=True)) s3_params = S3PerfParams( @@ -2153,11 +2025,12 @@ def main() -> None: iodepth=[1, 64], rw=["read", "write"], timeout=args.timeout, + **timing_overrides, ) - if args.s3 or args.all: + if "s3" in targets: cmd_build(preset, ["s3-perf"]) cmd_s3_perf(preset, s3_params) - if args.s3_threads or args.all: + if "s3-threads" in targets: cmd_build(preset, ["s3-perf"]) cmd_s3_perf(preset, replace(s3_params, threads=True)) print() diff --git a/docs/perf.md b/docs/perf.md index 140f206..6cdf5a9 100644 --- a/docs/perf.md +++ b/docs/perf.md @@ -1,30 +1,31 @@ # Performance Results -Measurements on an AWS instance (32-CPU Intel Xeon Platinum 8488C, Linux 6.17, release build `-O3`). -Results are reproducible with `./bb -b release perf --file --fio --net --net-asio --net-epoll --http --http-threads --http-nginx --s3 --s3-threads`. +Measurements on an AWS instance (32-CPU Intel Xeon Platinum 8488C, Linux 6.17, release build `-O3`). All measurements are 60 s with a 10 s warmup. + +The main tables are reproducible with `./bb -b release perf --duration 60s --warmup 10s all`. The high-concurrency rows (`net-perf` 1000 conn / `http-perf` 10000 conn / `s3-perf` 100x100), the thread client vs nginx row in `http-perf`, and the latency-profiler section need separate `./bb` invocations -- see each section. --- ## file-perf -- async file I/O -`/dev/shm` (tmpfs, in-memory), bs=4k, size=1 GiB, 10 s measurement, 2 s warmup. Uses `FiberScheduler::read`/`write` (`IORING_OP_READV` / `IORING_OP_WRITEV`). `numjobs` = concurrent worker fibers; `iodepth` = per-fiber async IO queue depth (ring of `IoFuture`s). +`/dev/shm` (tmpfs, in-memory), bs=4k, size=1 GiB, 60 s measurement, 10 s warmup. Uses `FiberScheduler::read`/`write` (`IORING_OP_READV` / `IORING_OP_WRITEV`). `numjobs` = concurrent worker fibers; `iodepth` = per-fiber async IO queue depth (ring of `IoFuture`s). 
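
+The per-fiber `iodepth` ring amounts to: wait only on the oldest slot, then reuse it, so up to `iodepth` reads stay queued in io_uring at all times. A minimal sketch of that pattern (the `IoFuture`/`read` signatures and the helpers here are assumptions for illustration, not silk's actual API):
+
+```
+// Hypothetical worker fiber: keeps iodepth reads in flight against one fd.
+void workerFiber(FiberScheduler& sched, int fd, size_t iodepth, size_t bs) {
+    std::vector<IoFuture> ring(iodepth);
+    std::vector<std::vector<char>> bufs(iodepth, std::vector<char>(bs));
+    for (size_t op = 0, slot = 0; op < kOpsPerJob; ++op, slot = (slot + 1) % iodepth) {
+        if (ring[slot].valid())
+            ring[slot].wait();  // suspends this fiber until the slot's CQE arrives
+        // Queue the next read; the SQE goes out when the fiber next suspends.
+        ring[slot] = sched.read(fd, bufs[slot].data(), bs, randomOffset(bs));
+    }
+}
+```
+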
| numjobs | iodepth | mode | IOPS | BW | avg | p50 | p95 | p99 | p99.9 |
|---|---|---|---|---|---|---|---|---|---|
-| 1 | 1 | randwrite | 163k | 635 MiB/s | 6 µs | 4 µs | 13 µs | 16 µs | 25 µs |
-| 1 | 16 | randwrite | 522k | 2041 MiB/s | 31 µs | 29 µs | 41 µs | 48 µs | 67 µs |
-| 16 | 1 | randwrite | 837k | 3270 MiB/s | 19 µs | 19 µs | 30 µs | 46 µs | 290 µs |
-| 16 | 16 | randwrite | 755k | 2951 MiB/s | 339 µs | 270 µs | 945 µs | 1560 µs | 2297 µs |
-| 1 | 1 | randread | 195k | 761 MiB/s | 5 µs | 3 µs | 13 µs | 15 µs | 24 µs |
-| 1 | 16 | randread | 638k | 2493 MiB/s | 25 µs | 26 µs | 32 µs | 42 µs | 59 µs |
-| 16 | 1 | randread | 2334k | 9118 MiB/s | 7 µs | 4 µs | 16 µs | 38 µs | 158 µs |
-| 16 | 16 | randread | 5919k | 23120 MiB/s | 43 µs | 46 µs | 66 µs | 83 µs | 115 µs |
+| 1 | 1 | randwrite | 175k | 684 MiB/s | 6 µs | 3 µs | 13 µs | 15 µs | 23 µs |
+| 1 | 16 | randwrite | 526k | 2056 MiB/s | 30 µs | 28 µs | 40 µs | 46 µs | 59 µs |
+| 16 | 1 | randwrite | 893k | 3488 MiB/s | 18 µs | 19 µs | 28 µs | 38 µs | 54 µs |
+| 16 | 16 | randwrite | 789k | 3080 MiB/s | 325 µs | 264 µs | 875 µs | 1387 µs | 1970 µs |
+| 1 | 1 | randread | 214k | 837 MiB/s | 5 µs | 3 µs | 12 µs | 14 µs | 22 µs |
+| 1 | 16 | randread | 655k | 2557 MiB/s | 24 µs | 26 µs | 32 µs | 40 µs | 54 µs |
+| 16 | 1 | randread | 2592k | 10125 MiB/s | 6 µs | 4 µs | 16 µs | 33 µs | 120 µs |
+| 16 | 16 | randread | 5818k | 22726 MiB/s | 44 µs | 40 µs | 70 µs | 83 µs | 113 µs |

-**Best throughput** (`numjobs=16 iodepth=16 randread`): 5919k IOPS, 22.6 GiB/s.
+**Best throughput** (`numjobs=16 iodepth=16 randread`): 5818k IOPS, 22.2 GiB/s.

-**Best latency** (`numjobs=1 iodepth=1`): 3-4 µs p50 for both read and write.
+**Best latency** (`numjobs=1 iodepth=1`): 3 µs p50 for both read and write.

-`numjobs=16 iodepth=16 randwrite` shows p99 blowout from ring contention; the read equivalent stays tight because tmpfs read paths have less internal locking. At `iodepth=16`, SQEs are now batched per fiber suspension (one `io_uring_submit` per fiber run instead of one per SQE), which improves randread throughput by ~9-22% and closes the gap with fio from 0.53x to 0.68x at 1 job.
+At `iodepth=16`, SQEs are batched per fiber suspension (one `io_uring_submit` per fiber run instead of one per SQE).
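
+To see the shape of that batching, here is a standalone liburing sketch (independent of silk's code; error handling elided, and the ring is assumed to be initialized with at least `iovs.size()` entries): queue a whole ring's worth of SQEs, then make one submit syscall for all of them.
+
+```
+#include <liburing.h>
+#include <vector>
+
+// One io_uring_submit per batch of reads -- the pattern a fiber suspension
+// flushes, instead of one submit syscall per SQE.
+void submitBatch(struct io_uring* ring, int fd, std::vector<iovec>& iovs) {
+    for (size_t i = 0; i < iovs.size(); ++i) {
+        struct io_uring_sqe* sqe = io_uring_get_sqe(ring);
+        io_uring_prep_readv(sqe, fd, &iovs[i], 1, i * iovs[i].iov_len);
+    }
+    io_uring_submit(ring);  // a single syscall covers every queued SQE
+    for (size_t i = 0; i < iovs.size(); ++i) {
+        struct io_uring_cqe* cqe = nullptr;
+        io_uring_wait_cqe(ring, &cqe);   // reap completions one by one
+        io_uring_cqe_seen(ring, cqe);
+    }
+}
+```
+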

---

@@ -32,126 +33,109 @@ Results are reproducible with `./bb -b release perf --file --fio --net --net-asi

| numjobs | iodepth | mode | IOPS | BW | avg | p50 | p95 | p99 | p99.9 |
|---|---|---|---|---|---|---|---|---|---|
-| 1 | 1 | randwrite | 65k | 254 MiB/s | 13 µs | 13 µs | 19 µs | 28 µs | 39 µs |
-| 1 | 16 | randwrite | 803k | 3137 MiB/s | 19 µs | 18 µs | 23 µs | 25 µs | 31 µs |
-| 16 | 1 | randwrite | 718k | 2803 MiB/s | 20 µs | 20 µs | 27 µs | 35 µs | 47 µs |
-| 16 | 16 | randwrite | 771k | 3013 MiB/s | 329 µs | 309 µs | 449 µs | 1974 µs | 4112 µs |
-| 1 | 1 | randread | 68k | 266 MiB/s | 13 µs | 12 µs | 18 µs | 29 µs | 39 µs |
-| 1 | 16 | randread | 977k | 3815 MiB/s | 15 µs | 15 µs | 19 µs | 29 µs | 50 µs |
-| 16 | 1 | randread | 1179k | 4607 MiB/s | 11 µs | 12 µs | 21 µs | 33 µs | 45 µs |
-| 16 | 16 | randread | 11439k | 44682 MiB/s | 21 µs | 20 µs | 29 µs | 42 µs | 90 µs |
+| 1 | 1 | randwrite | 63k | 246 MiB/s | 14 µs | 13 µs | 18 µs | 26 µs | 39 µs |
+| 1 | 16 | randwrite | 797k | 3114 MiB/s | 19 µs | 19 µs | 22 µs | 28 µs | 34 µs |
+| 16 | 1 | randwrite | 723k | 2824 MiB/s | 20 µs | 20 µs | 27 µs | 34 µs | 45 µs |
+| 16 | 16 | randwrite | 800k | 3124 MiB/s | 317 µs | 301 µs | 354 µs | 1778 µs | 5341 µs |
+| 1 | 1 | randread | 71k | 279 MiB/s | 12 µs | 13 µs | 16 µs | 25 µs | 38 µs |
+| 1 | 16 | randread | 973k | 3801 MiB/s | 16 µs | 15 µs | 22 µs | 29 µs | 46 µs |
+| 16 | 1 | randread | 1178k | 4601 MiB/s | 12 µs | 13 µs | 19 µs | 32 µs | 44 µs |
+| 16 | 16 | randread | 10464k | 40876 MiB/s | 23 µs | 20 µs | 39 µs | 75 µs | 127 µs |

-At `iodepth=1`, the fiber scheduler outperforms fio (2-3x): fio uses one OS thread per job, so each IO incurs a full OS scheduler round-trip. At `iodepth=16`, fio wins but the gap narrowed significantly with SQE batching: the fiber scheduler now batches all iodepth SQEs into one `io_uring_submit` per fiber suspension, the same principle fio uses.
+At `iodepth=1`, the fiber scheduler outperforms fio (2-3x): fio uses one OS thread per job, so each IO incurs a full OS scheduler round-trip. At `iodepth=16`, fio wins; the fiber scheduler batches every SQE a fiber enqueues during one run into a single `io_uring_submit` at suspension -- the same batching principle fio uses.

| config | fiber IOPS | fio IOPS | ratio |
|---|---|---|---|
-| 1 job, iodepth=1, randread | 195k | 63k | 3.1x |
-| 16 jobs, iodepth=1, randread | 2334k | 1146k | 2.0x |
-| 1 job, iodepth=16, randread | 638k | 939k | 0.68x |
-| 16 jobs, iodepth=16, randread | 5919k | 10468k | 0.57x |
+| 1 job, iodepth=1, randread | 214k | 71k | 3.0x |
+| 16 jobs, iodepth=1, randread | 2592k | 1178k | 2.2x |
+| 1 job, iodepth=16, randread | 655k | 973k | 0.67x |
+| 16 jobs, iodepth=16, randread | 5818k | 10464k | 0.56x |

---

## net-perf -- TCP echo

-Loopback TCP, 64 B messages, 10 s measurement, 2 s warmup. All socket I/O is non-blocking; fibers suspend via `FiberScheduler::poll`. Latency is measured end-to-end: client send -> server echo -> client receive.
+Loopback TCP, 64 B messages, 60 s measurement, 10 s warmup. Socket I/O uses `FiberScheduler::read`/`write` (io_uring `IORING_OP_READV`/`IORING_OP_WRITEV`); the fiber suspends inside the call until the CQE arrives. Latency is measured end-to-end: client send -> server echo -> client receive.
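
+The per-connection server loop is written in plain blocking style; the suspension happens inside the read/write calls. A sketch of the shape (assumed API, not silk's exact signatures):
+
+```
+// One fiber per accepted connection: read a message, echo it back.
+// Each call queues an SQE and suspends this fiber until its CQE arrives;
+// the scheduler thread runs other fibers in the meantime.
+void echoFiber(FiberScheduler& sched, int fd) {
+    char buf[64];
+    for (;;) {
+        ssize_t n = sched.read(fd, buf, sizeof buf);   // IORING_OP_READV
+        if (n <= 0)
+            break;                                     // peer closed or error
+        sched.write(fd, buf, static_cast<size_t>(n));  // IORING_OP_WRITEV
+    }
+    ::close(fd);
+}
+```
+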
| connections | RPS | BW | avg | p50 | p95 | p99 | p99.9 | |---|---|---|---|---|---|---|---| -| 1 | 43k | 3 MiB/s | 23 µs | 27 µs | 32 µs | 38 µs | 49 µs | -| 256 | 1642k | 100 MiB/s | 156 µs | 144 µs | 306 µs | 327 µs | 346 µs | -| 512 | 1600k | 98 MiB/s | 320 µs | 155 µs | 2207 µs | 2285 µs | 2343 µs | -| 1024 | 1581k | 97 MiB/s | 648 µs | 177 µs | 5040 µs | 5174 µs | 5291 µs | +| 1 | 42k | 3 MiB/s | 24 µs | 27 µs | 31 µs | 36 µs | 43 µs | +| 256 | 1854k | 113 MiB/s | 138 µs | 122 µs | 319 µs | 338 µs | 364 µs | +| 512 | 1870k | 114 MiB/s | 274 µs | 111 µs | 1086 µs | 1244 µs | 1305 µs | +| 1024 | 1917k | 117 MiB/s | 534 µs | 154 µs | 3493 µs | 3607 µs | 3816 µs | -Throughput reaches a ceiling of ~1600k req/s by 256 connections and stays flat thereafter -- the server is fully saturated. At 256 connections p95 improved from 737 µs to 306 µs with SQE batching: when many fibers are ready simultaneously their poll SQEs land in fewer `io_uring_submit` calls, reducing kernel entry overhead. The large gap between p50 and avg reflects a bimodal distribution: most requests are served promptly but a tail stalls behind kernel scheduling. +Throughput plateaus at ~1.85-1.92M req/s by 256 connections -- the server is fully saturated. The large gap between p50 and avg at high concurrency (e.g. 154 µs vs 534 µs at 1024 conns) reflects a bimodal distribution: most requests are served promptly but a tail stalls. --- ## net-perf-asio -- TCP echo (Boost.Asio C++20 coroutines) -Same workload as net-perf above, reimplemented with Boost.Asio C++20 coroutines (`asio::awaitable`) and epoll (Asio's default Linux backend). Server and client use one thread per available CPU (respecting `taskset`). Reproduced with `./bb -b release net-perf-asio`. +Same workload as net-perf above, reimplemented with Boost.Asio C++20 coroutines (`asio::awaitable`) and epoll (Asio's default Linux backend). Server and client use one thread per available CPU (respecting `taskset`). Reproduced with `./bb -b release net-perf-asio --duration 60s --warmup 10s`. | connections | RPS | BW | avg | p50 | p95 | p99 | p99.9 | |---|---|---|---|---|---|---|---| -| 1 | 3k | 0 MiB/s | 313 µs | 348 µs | 522 µs | 625 µs | 759 µs | -| 256 | 390k | 24 MiB/s | 657 µs | 660 µs | 730 µs | 754 µs | 854 µs | -| 512 | 414k | 25 MiB/s | 1236 µs | 1245 µs | 1312 µs | 1339 µs | 1376 µs | -| 1024 | 409k | 25 MiB/s | 2503 µs | 2651 µs | 2799 µs | 2843 µs | 3529 µs | +| 1 | 3k | 0 MiB/s | 300 µs | 343 µs | 491 µs | 585 µs | 711 µs | +| 256 | 377k | 23 MiB/s | 678 µs | 683 µs | 740 µs | 768 µs | 814 µs | +| 512 | 383k | 23 MiB/s | 1337 µs | 1350 µs | 1466 µs | 1496 µs | 1534 µs | +| 1024 | 380k | 23 MiB/s | 2696 µs | 2700 µs | 2782 µs | 2818 µs | 2867 µs | -**Comparison with net-perf (fibers + io_uring):** +**Comparison with net-perf (fibers + io_uring), measured in the same suite:** | connections | net-perf RPS | net-perf-asio RPS | ratio | |---|---|---|---| -| 1 | 44k | 3k | **~15x** | -| 256 | 1522k | 390k | **~4x** | -| 512 | 1615k | 414k | **~4x** | -| 1024 | 1566k | 409k | **~4x** | +| 1 | 42k | 3k | **~14x** | +| 256 | 1854k | 377k | **~4.9x** | +| 512 | 1870k | 383k | **~4.9x** | +| 1024 | 1917k | 380k | **~5.0x** | -The gap has two structural causes. First, net-perf uses io_uring for all socket I/O while Asio uses epoll; io_uring avoids the per-operation `epoll_ctl` + `epoll_wait` + `recv`/`send` syscall chain. 
Second, the fiber scheduler's work-stealing threads spin and pick up completions in nanoseconds, while Asio's reactor threads sleep in `epoll_wait` and require a wakeup write + pthread wake per completion. +Two structural differences explain most of the gap. First, net-perf uses io_uring for all socket I/O while Asio uses epoll; io_uring avoids the per-operation `epoll_ctl` + `epoll_wait` + `recv`/`send` syscall chain. Second, the fiber scheduler's per-CPU pinned scheduler threads pick up completions via `io_uring_enter`, while Asio's reactor threads block in `epoll_wait` and resume via a pthread wakeup. -The gap is largest at 1 connection (~15x) where per-operation scheduling overhead dominates with no parallelism to hide it, and narrows to ~4x at high connection counts where I/O bandwidth is the bottleneck. Asio's io_uring backend (`BOOST_ASIO_HAS_IO_URING`) was also tested and performed ~2.5x worse than epoll, ruling out the I/O backend as a factor. Linking with jemalloc had no effect. The bottleneck is entirely in Asio's handler dispatch path. +The gap is largest at 1 connection (~14x) where per-operation scheduling overhead dominates with no parallelism to hide it, and narrows to ~5x at high connection counts where the server CPU half is the bottleneck. --- ## net-perf-epoll -- TCP echo (raw epoll, multi-threaded) -Same workload as net-perf above, reimplemented as the simplest efficient epoll loop: edge-triggered `recv`/`send` per connection, one worker thread per available CPU (auto-detected via `silk::getAvailableProcessorCount`), `SO_REUSEPORT` listener per worker on the server, no fibers, no io_uring. Each worker owns its epoll instance and round-robins its connections through a per-fd state machine. Reproduced with `./bb -b release net-perf-epoll`. +Same workload as net-perf above, reimplemented as the simplest efficient epoll loop: edge-triggered `recv`/`send` per connection, one worker thread per available CPU (auto-detected via `silk::getAvailableProcessorCount`), `SO_REUSEPORT` listener per worker on the server, no fibers, no io_uring. Each worker owns its epoll instance and round-robins its connections through a per-fd state machine. Reproduced with `./bb -b release net-perf-epoll --duration 60s --warmup 10s`. | connections | RPS | BW | avg | p50 | p95 | p99 | p99.9 | |---|---|---|---|---|---|---|---| -| 1 | 39k | 2 MiB/s | 25 µs | 25 µs | 30 µs | 35 µs | 45 µs | -| 256 | 2254k | 138 MiB/s | 114 µs | 101 µs | 168 µs | 190 µs | 3091 µs | -| 512 | 2260k | 138 MiB/s | 227 µs | 210 µs | 292 µs | 327 µs | 3257 µs | -| 1024 | 2203k | 134 MiB/s | 465 µs | 446 µs | 579 µs | 749 µs | 3528 µs | +| 1 | 40k | 2 MiB/s | 25 µs | 25 µs | 29 µs | 34 µs | 41 µs | +| 256 | 2540k | 155 MiB/s | 101 µs | 97 µs | 155 µs | 171 µs | 189 µs | +| 512 | 2545k | 155 MiB/s | 201 µs | 196 µs | 276 µs | 298 µs | 328 µs | +| 1024 | 2411k | 147 MiB/s | 425 µs | 428 µs | 520 µs | 552 µs | 611 µs | **Comparison with net-perf (fibers + io_uring), same-run measurements:** | connections | net-perf RPS | net-perf-epoll RPS | RPS ratio | net-perf p99 | net-perf-epoll p99 | p99 ratio | |---|---|---|---|---|---|---| -| 1 | 41k | 39k | 0.95x | 39 µs | 35 µs | 0.90x | -| 256 | 1729k | 2254k | **1.30x** | 485 µs | 190 µs | **0.39x** | -| 512 | 1759k | 2260k | **1.28x** | 1512 µs | 327 µs | **0.22x** | -| 1024 | 1714k | 2203k | **1.29x** | 2404 µs | 749 µs | **0.31x** | - -At 1 connection both are equivalent — the host has spare CPU and engine overhead is invisible. 
Past saturation (~32-64 connections) raw epoll wins ~30% on throughput and 3-5x on p99 tail latency. Per-cpu rate at saturation: fibers ≈ 110k req/cpu (9.1 µs CPU/req), epoll ≈ 145k req/cpu (6.9 µs CPU/req); the 2.2 µs/req gap is the cost of the fiber abstraction in this workload — fiber suspend/resume + io_uring SQE/CQE submission + ready-queue bookkeeping per round-trip. The tail-latency difference is a separate phenomenon: silk's scheduler is selectively unfair (some fibers run hot while others starve for thousands of requests at a time), which keeps p50 low (130 µs at 1024 conns) but inflates p99 dramatically. The epoll loop services its connections in round-robin within each worker, so per-connection treatment is uniform — p99 stays close to p50 (749 µs vs 446 µs at 1024 conns). +| 1 | 42k | 40k | 0.95x | 36 µs | 34 µs | 0.94x | +| 256 | 1854k | 2540k | **1.37x** | 338 µs | 171 µs | **0.51x** | +| 512 | 1870k | 2545k | **1.36x** | 1244 µs | 298 µs | **0.24x** | +| 1024 | 1917k | 2411k | **1.26x** | 3607 µs | 552 µs | **0.15x** | -The gap holds across message sizes (1.3-1.6x at 64 B – 16 KiB at 256 connections), so it scales with the abstraction cost rather than amortizing over per-byte work. Disabling `USE_IO_URING_RW` (falling back to `recv`/`send` + `FiberScheduler::poll`) does not close it — io_uring on loopback is roughly a wash compared to direct syscalls in this workload. +At 1 connection both are equivalent -- the host has spare CPU and engine overhead is invisible. Past saturation raw epoll wins ~25-40% on throughput and 2-7x on p99 tail latency. Per-CPU rate at saturation (256 conns, 16 server CPUs): fibers ≈ 116k req/cpu (8.6 µs CPU/req), epoll ≈ 159k req/cpu (6.3 µs CPU/req); the 2.3 µs/req gap is the cost of the fiber abstraction in this workload -- fiber suspend/resume + io_uring SQE/CQE submission + ready-queue bookkeeping per round-trip. The epoll loop services its connections in round-robin within each worker, so per-connection treatment is uniform -- p99 stays close to p50 (552 µs vs 428 µs at 1024 conns), while net-perf shows a wide gap (3607 µs p99 vs 154 µs p50 at the same conn count). What raw epoll gives up: composability. The state machine can't naturally accommodate sleeps (no `--delay` support), multi-step protocols, or branching control flow without growing into a small interpreter. net-perf-epoll is the throughput floor; net-perf is the structure you'd actually program against. --- -## sockperf comparison -- TCP echo - -Loopback TCP, 64 B messages, 10 s measurement. sockperf uses a single-threaded epoll server; the 1-connection row is the apples-to-apples baseline. Multi-connection rows reflect sockperf's server bottleneck, not TCP cost. - -| connections | IOPS | avg | p50 | p99 | p99.9 | -|---|---|---|---|---|---| -| 1 | 42k | 23.56 µs | 23.08 µs | 32.98 µs | 42.81 µs | -| 4 | 43k | 23.27 µs | 22.82 µs | 32.87 µs | 43.05 µs | -| 16 | 95k | 21.2 µs | 22.8 µs | 32.63 µs | 41.73 µs | -| 64 | 85k | 23.39 µs | 22.94 µs | 32.88 µs | 42.74 µs | - -At 1 connection, net-perf and sockperf are identical (43k vs 42k IOPS, ~23-26 µs p50). The fiber scheduler adds zero overhead over raw TCP. - ---- - ## http-perf -- HTTP/1.1 GET -nginx `return 200` (Content-Length: 0), loopback, 10 s measurement, 2 s warmup. Client and server pinned to separate CPU halves (16 CPUs each). Fiber client uses `FiberSocketImpl` backed by `FiberScheduler::read`/`write` (io_uring `IORING_OP_READV`/`IORING_OP_WRITEV`); thread client uses one blocking OS thread per connection. 
+nginx `return 200` (empty body), loopback, 60 s measurement, 10 s warmup. Client and server pinned to separate CPU halves (16 CPUs each). Fiber client uses `FiberSocketImpl` backed by `FiberScheduler::read`/`write` (io_uring `IORING_OP_READV`/`IORING_OP_WRITEV`); thread client uses one blocking OS thread per connection. The thread+nginx rows are collected separately with `./bb -b release http-perf --nginx --threads --connections 1 256 512 1024 --duration 60s --warmup 10s`.

| connections | mode | RPS | avg | p50 | p95 | p99 | p99.9 |
|---|---|---|---|---|---|---|---|
-| 1 | fiber | 38k | 27 µs | 24 µs | 34 µs | 40 µs | 101 µs |
-| 256 | fiber | 1158k | 221 µs | 66 µs | 2033 µs | 2453 µs | 2558 µs |
-| 512 | fiber | 1215k | 421 µs | 83 µs | 4773 µs | 5890 µs | 6192 µs |
-| 1024 | fiber | 1224k | 836 µs | 84 µs | 10858 µs | 12690 µs | 13243 µs |
-| 1 | threads | 36k | 28 µs | 27 µs | 33 µs | 39 µs | 274 µs |
-| 256 | threads | 1206k | 212 µs | 207 µs | 333 µs | 459 µs | 969 µs |
-| 512 | threads | 1179k | 434 µs | 426 µs | 626 µs | 855 µs | 1668 µs |
-| 1024 | threads | 1149k | 891 µs | 878 µs | 1225 µs | 1784 µs | 3428 µs |
+| 1 | fiber | 39k | 25 µs | 24 µs | 32 µs | 40 µs | 85 µs |
+| 256 | fiber | 1340k | 191 µs | 90 µs | 1359 µs | 1867 µs | 2127 µs |
+| 512 | fiber | 1341k | 382 µs | 68 µs | 4384 µs | 5225 µs | 5435 µs |
+| 1024 | fiber | 1351k | 758 µs | 71 µs | 9568 µs | 11419 µs | 11718 µs |
+| 1 | threads | 37k | 27 µs | 27 µs | 32 µs | 39 µs | 223 µs |
+| 256 | threads | 1266k | 202 µs | 192 µs | 341 µs | 485 µs | 839 µs |
+| 512 | threads | 1262k | 406 µs | 367 µs | 733 µs | 1125 µs | 1819 µs |
+| 1024 | threads | 1193k | 858 µs | 842 µs | 1169 µs | 1767 µs | 3311 µs |

-At 1 connection both modes are identical (~36-38k RPS, ~24-27 µs p50): baseline is Poco's HTTP parsing overhead. At higher concurrency both clients saturate nginx at ~1M RPS, so throughput is similar. The difference is latency: fiber p50 stays nearly flat across all concurrency levels (24-84 µs) while thread p50 grows linearly with thread count, reaching 10x worse at 1024 connections (878 µs vs 84 µs). The fiber scheduler multiplexes all connections across 16 scheduler threads with zero per-fiber context-switch cost; each additional thread adds OS scheduling overhead proportional to the total thread count.
+At 1 connection both modes are identical (~37-39k RPS, ~24-27 µs p50): baseline is Poco's HTTP parsing overhead. At higher concurrency both clients saturate nginx at ~1.2-1.35M RPS, so throughput is similar. The difference is latency: fiber p50 stays nearly flat across all concurrency levels (24-90 µs) while thread p50 grows roughly linearly with thread count, reaching ~12x worse at 1024 connections (842 µs vs 71 µs). The fiber scheduler multiplexes all connections across 16 scheduler threads with sub-microsecond context-switch cost (see `fiber_run` below); each additional OS thread adds scheduling overhead proportional to the total thread count.
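
+Both client modes drive the same Poco request loop; they differ only in what executes it -- a fiber whose socket ops suspend into the scheduler (`FiberSocketImpl`) vs. a dedicated blocking OS thread. Roughly (real Poco API; the loop structure is illustrative, not the benchmark's exact code):
+
+```
+#include <Poco/Net/HTTPClientSession.h>
+#include <Poco/Net/HTTPRequest.h>
+#include <Poco/Net/HTTPResponse.h>
+#include <Poco/NullStream.h>
+#include <Poco/StreamCopier.h>
+
+// One connection's GET loop. In thread mode this body runs on its own OS
+// thread; in fiber mode the same body runs as a fiber and each socket op
+// suspends into the scheduler instead of blocking a thread.
+void connectionLoop(const std::string& host, Poco::UInt16 port, int requests) {
+    Poco::Net::HTTPClientSession session(host, port);
+    session.setKeepAlive(true);
+    for (int i = 0; i < requests; ++i) {
+        Poco::Net::HTTPRequest req(Poco::Net::HTTPRequest::HTTP_GET, "/",
+                                   Poco::Net::HTTPMessage::HTTP_1_1);
+        session.sendRequest(req);
+        Poco::Net::HTTPResponse res;
+        Poco::NullOutputStream sink;  // discard the (empty) body
+        Poco::StreamCopier::copyStream(session.receiveResponse(res), sink);
+    }
+}
+```
+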
### Server: internal (silk fibers) vs nginx @@ -159,63 +143,67 @@ At 1 connection both modes are identical (~36-38k RPS, ~24-27 µs p50): baseline | connections | server | RPS | avg | p50 | p95 | p99 | p99.9 | |---|---|---|---|---|---|---|---| -| 1 | internal | 27k | 37 µs | 36 µs | 43 µs | 49 µs | 64 µs | -| 256 | internal | 1023k | 250 µs | 152 µs | 1478 µs | 1960 µs | 2248 µs | -| 512 | internal | 1011k | 506 µs | 88 µs | 5023 µs | 5146 µs | 5277 µs | -| 1024 | internal | 964k | 1020 µs | 94 µs | 12445 µs | 16772 µs | 19661 µs | -| 1 | nginx | 36k | 28 µs | 25 µs | 35 µs | 42 µs | 97 µs | -| 256 | nginx | 1290k | 198 µs | 72 µs | 1700 µs | 2050 µs | 2137 µs | -| 512 | nginx | 1248k | 410 µs | 63 µs | 4557 µs | 5738 µs | 5904 µs | -| 1024 | nginx | 1254k | 816 µs | 58 µs | 9979 µs | 12599 µs | 13163 µs | +| 1 | internal | 28k | 35 µs | 35 µs | 41 µs | 46 µs | 60 µs | +| 256 | internal | 1104k | 232 µs | 162 µs | 1112 µs | 1470 µs | 1815 µs | +| 512 | internal | 1093k | 468 µs | 148 µs | 4631 µs | 6136 µs | 7349 µs | +| 1024 | internal | 1044k | 981 µs | 86 µs | 11919 µs | 16406 µs | 19449 µs | +| 1 | nginx | 39k | 25 µs | 24 µs | 32 µs | 40 µs | 85 µs | +| 256 | nginx | 1340k | 191 µs | 90 µs | 1359 µs | 1867 µs | 2127 µs | +| 512 | nginx | 1341k | 382 µs | 68 µs | 4384 µs | 5225 µs | 5435 µs | +| 1024 | nginx | 1351k | 758 µs | 71 µs | 9568 µs | 11419 µs | 11718 µs | + +The internal server lands at ~80% of nginx RPS at high concurrency (1044-1104k vs 1340-1351k). The gap is Poco overhead, not silk overhead: nginx's `return 200` handler skips most of HTTP/1.1 parsing, while Poco constructs `HTTPServerRequestImpl`/`HTTPServerResponseImpl` plus heap-allocated stream buffers per request. The takeaway is that silk's accept-fiber + per-connection-fiber I/O loop has small overhead on top of whatever HTTP machinery you put on it -- to beat nginx you'd swap Poco for a hand-rolled state machine that allocates nothing per request, which is a different project. -The internal server lands at ~80% of nginx RPS at high concurrency (964–1023k vs 1248–1290k). The gap is Poco overhead, not silk overhead: nginx's `return 200` handler skips most of HTTP/1.1 parsing, while Poco constructs `HTTPServerRequestImpl`/`HTTPServerResponseImpl` plus heap-allocated stream buffers per request. p50 latencies sit within a few µs of each other at high concurrency; tail latencies are dominated by client-side queuing in both cases. The takeaway is that silk's accept-fiber + per-connection-fiber I/O loop has negligible overhead on top of whatever HTTP machinery you put on it — to beat nginx you'd swap Poco for a hand-rolled state machine that allocates nothing per request, which is a different project. +### High-concurrency throughput (connections=10000, delay=10ms, duration=60s, warmup=10s) -### High-concurrency throughput (connections=10000, delay=10ms, duration=60s) +Run against the internal silk-fiber HTTP server with a 10 ms server-side sleep per request, so all 10k connections stay alive simultaneously and the server CPU half is fully loaded. Reproduced with `./bb -b release http-perf [--threads] --connections 10000 --delay 10ms --duration 60s --warmup 10s`. 
| connections | mode | RPS | avg | p50 | p95 | p99 | p99.9 |
|---|---|---|---|---|---|---|---|
-| 10000 | fibers | 814k | 12292 µs | 12232 µs | 13408 µs | 14419 µs | 25152 µs |
-| 10000 | threads | 717k | 13942 µs | 13648 µs | 15790 µs | 24089 µs | 28534 µs |
+| 10000 | fibers | 575k | 10262 µs | 10225 µs | 10539 µs | 10844 µs | 12597 µs |
+| 10000 | threads | 636k | 15503 µs | 12053 µs | 28940 µs | 34035 µs | 39164 µs |

-At 10000 connections (10 ms synthetic delay, all 32 cores at 100%) fibers sustain 814k RPS vs 717k for threads (+13.5%). The delay keeps all 10k connections alive simultaneously, so nginx is the bottleneck rather than client throughput. Latency is consistently better across all percentiles: fiber p99.9 (25 ms) vs threads (29 ms), and fiber p99 (14 ms) vs threads (24 ms).
+Throughput sits in the same band (575k RPS for fibers vs 636k for threads, ~11% apart); the workload is server-bound. The big difference is latency tightness: fiber percentiles cluster within a 2.4 ms window (p50 10.2 ms -> p99.9 12.6 ms), while threads spread over 27 ms (p50 12.1 ms -> p99.9 39.2 ms). At 10k OS threads the kernel scheduler injects multi-millisecond stalls into the tail; the fiber scheduler keeps the tail close to the median.

---

## s3-perf -- S3 object storage

-MinIO loopback (`http://127.0.0.1:9000`), object size=4096 B, 10 s measurement, 2 s warmup. Both modes use `numjobs` OS session threads, each maintaining an `iodepth`-slot ring of in-flight async S3 requests and waiting on a `FiberFuture` per slot. The difference is the AWS SDK executor and HTTP client: fiber mode runs each SDK async task as a fiber with io_uring socket I/O (`FiberExecutor` + `FiberHttpClient`); thread mode runs each task on a `PooledThreadExecutor` (sized `numjobs x iodepth`) with blocking socket I/O.
+MinIO loopback (`http://127.0.0.1:9000`), object size=4096 B, 60 s measurement, 10 s warmup. Both modes use `numjobs` OS session threads, each maintaining an `iodepth`-slot ring of in-flight async S3 requests and waiting on a `FiberFuture` per slot. The difference is the AWS SDK executor and HTTP client: fiber mode runs each SDK async task as a fiber with io_uring socket I/O (`FiberExecutor` + `FiberHttpClient`); thread mode runs each task on a `PooledThreadExecutor` (sized `numjobs x iodepth`) with blocking socket I/O.
| numjobs | iodepth | mode | executor | OPS/s | avg | p50 | p95 | p99 | p99.9 | |---|---|---|---|---|---|---|---|---|---| -| 1 | 1 | read | fibers | 1456 | 687 µs | 699 µs | 851 µs | 943 µs | 1123 µs | -| 1 | 64 | read | fibers | 38097 | 1679 µs | 1647 µs | 2691 µs | 3527 µs | 4338 µs | -| 16 | 1 | read | fibers | 27962 | 572 µs | 559 µs | 698 µs | 976 µs | 1534 µs | -| 16 | 64 | read | fibers | 47577 | 21464 µs | 20609 µs | 37777 µs | 44589 µs | 65603 µs | -| 1 | 1 | write | fibers | 1427 | 701 µs | 690 µs | 852 µs | 989 µs | 1124 µs | -| 1 | 64 | write | fibers | 598 | 105466 µs | 104739 µs | 153297 µs | 179286 µs | 196299 µs | -| 16 | 1 | write | fibers | 1459 | 10954 µs | 776 µs | 65304 µs | 115223 µs | 195529 µs | -| 16 | 64 | write | fibers | 2277 | 427224 µs | 416051 µs | 679043 µs | 902817 µs | 1278862 µs | -| 1 | 1 | read | threads | 1043 | 958 µs | 933 µs | 1270 µs | 1373 µs | 1494 µs | -| 1 | 64 | read | threads | 39572 | 1617 µs | 1583 µs | 2605 µs | 3447 µs | 4292 µs | -| 16 | 1 | read | threads | 29829 | 536 µs | 525 µs | 648 µs | 942 µs | 1529 µs | -| 16 | 64 | read | threads | 47858 | 21347 µs | 20676 µs | 36630 µs | 43711 µs | 55342 µs | -| 1 | 1 | write | threads | 1122 | 891 µs | 839 µs | 1281 µs | 1454 µs | 1608 µs | -| 1 | 64 | write | threads | 602 | 105235 µs | 105012 µs | 161543 µs | 175461 µs | 202878 µs | -| 16 | 1 | write | threads | 1330 | 12026 µs | 994 µs | 67975 µs | 114874 µs | 198232 µs | -| 16 | 64 | write | threads | 2278 | 429072 µs | 419749 µs | 679185 µs | 880016 µs | 984188 µs | - -At `numjobs=1 iodepth=1` read, fibers deliver 1456 OPS vs 1043 for threads (+40%): with one outstanding request at a time, the thread executor pays a full OS wake-up round-trip per response, while a fiber resumes inline on the scheduler thread. At higher iodepth or numjobs, MinIO becomes the bottleneck and throughput converges. Write latency blows out at high iodepth (`iodepth=64` p50 >100 ms, `16x64` p50 >400 ms) symmetrically across both executors, confirming MinIO internal serialization is the cause. - -The write `16x1` p50 (776 µs fibers / 994 µs threads) is much lower than the avg (11 ms / 12 ms) because a small fraction of requests stall behind MinIO lock contention, pulling the mean up while the median stays fast. 
- -### High-concurrency tail latency (numjobs=100, iodepth=100, duration=60s) +| 1 | 1 | read | fibers | 1678 | 596 µs | 606 µs | 716 µs | 790 µs | 962 µs | +| 1 | 64 | read | fibers | 39731 | 1611 µs | 1578 µs | 2551 µs | 3404 µs | 4071 µs | +| 16 | 1 | read | fibers | 29230 | 547 µs | 535 µs | 669 µs | 940 µs | 1544 µs | +| 16 | 64 | read | fibers | 50060 | 20448 µs | 19910 µs | 35103 µs | 41662 µs | 49624 µs | +| 1 | 1 | write | fibers | 1559 | 641 µs | 617 µs | 777 µs | 875 µs | 1010 µs | +| 1 | 64 | write | fibers | 611 | 104642 µs | 103667 µs | 151958 µs | 177046 µs | 198348 µs | +| 16 | 1 | write | fibers | 1579 | 10132 µs | 697 µs | 61724 µs | 114861 µs | 183053 µs | +| 16 | 64 | write | fibers | 2366 | 429349 µs | 412892 µs | 710441 µs | 896133 µs | 1146916 µs | +| 1 | 1 | read | threads | 959 | 1042 µs | 1085 µs | 1290 µs | 1373 µs | 1496 µs | +| 1 | 64 | read | threads | 40324 | 1587 µs | 1549 µs | 2596 µs | 3382 µs | 4204 µs | +| 16 | 1 | read | threads | 30716 | 521 µs | 510 µs | 632 µs | 900 µs | 1491 µs | +| 16 | 64 | read | threads | 50068 | 20445 µs | 19737 µs | 35075 µs | 41884 µs | 51820 µs | +| 1 | 1 | write | threads | 1160 | 862 µs | 829 µs | 1189 µs | 1355 µs | 1516 µs | +| 1 | 64 | write | threads | 623 | 102639 µs | 101939 µs | 147443 µs | 170832 µs | 190216 µs | +| 16 | 1 | write | threads | 1318 | 12140 µs | 1012 µs | 68045 µs | 118428 µs | 188976 µs | +| 16 | 64 | write | threads | 2381 | 426606 µs | 410346 µs | 709980 µs | 869217 µs | 1067136 µs | + +At `numjobs=1 iodepth=1` read, fibers deliver 1678 OPS vs 959 for threads (+75%): with one outstanding request at a time, the thread executor pays a full OS wake-up round-trip per response, while a fiber resumes inline on the scheduler thread. At higher iodepth or numjobs, MinIO becomes the bottleneck and throughput converges. Write latency blows out at high iodepth (`iodepth=64` p50 >100 ms, `16x64` p50 >400 ms) symmetrically across both executors, confirming MinIO internal serialization is the cause. + +The write `16x1` p50 (697 µs fibers / 1012 µs threads) is much lower than the avg (10 ms / 12 ms) because a small fraction of requests stall behind MinIO lock contention, pulling the mean up while the median stays fast. + +### High-concurrency tail latency (numjobs=100, iodepth=100, duration=60s, warmup=10s) + +Reproduced with `./bb -b release s3-perf [--threads] --numjobs 100 --iodepth 100 --duration 60s --warmup 10s`. | numjobs | iodepth | mode | executor | OPS/s | avg | p50 | p95 | p99 | p99.9 | |---|---|---|---|---|---|---|---|---|---| -| 100 | 100 | read | fibers | 45079 | 220727 µs | 215328 µs | 282965 µs | 425493 µs | 579894 µs | -| 100 | 100 | read | threads | 44046 | 223812 µs | 217035 µs | 290433 µs | 440547 µs | 957205 µs | +| 100 | 100 | read | fibers | 46998 | 212075 µs | 206508 µs | 261370 µs | 306935 µs | 384190 µs | +| 100 | 100 | read | threads | 45948 | 216746 µs | 210145 µs | 273958 µs | 391332 µs | 573618 µs | -At 10,000 concurrent requests (100 jobs x iodepth 100) throughput is identical (~45k OPS) -- MinIO is fully saturated. The difference is tail latency: fiber p99.9 is 580 ms vs 957 ms for threads (1.65x). p95 and p99 are close (283 ms vs 290 ms and 425 ms vs 441 ms); the gap widens at p99.9 where OS scheduler jitter under 10,000 OS threads causes occasional long stalls that the fiber scheduler avoids. +At 10,000 concurrent requests (100 jobs x iodepth 100) throughput is close (~46-47k OPS) -- MinIO is fully saturated. 
Fibers retain a tail-latency edge: p99 is 307 ms vs 391 ms for threads (1.27x), p99.9 is 384 ms vs 574 ms (1.49x). The gap widens at higher percentiles where 10,000 OS threads stall behind kernel scheduling jitter that the fiber scheduler avoids.

---

@@ -231,29 +219,29 @@ Per-CPU profiler (opted in via `--print-counters`) emits log2 histograms for fiv

| `ready_wait` | `enqueueReady` -> dispatch (ready-queue dwell) |
| `fiber_run` | `switchToFiberContext` -> return (on-CPU time per slice) |

-### Per-IO breakdown (net-perf, 1000 connections, 60 s, 1853k RPS)
+### Per-IO breakdown (net-perf, 1000 connections, 60 s, 10 s warmup, 1856k RPS)
+
+Reproduced with `./bb -b release net-perf --connections 1000 --duration 60s --warmup 10s --print-counters`.

| event | p50 | p90 | p99 | p99.9 |
|---|---|---|---|---|
-| `suspend_wait` | 72.8 µs | 441.7 µs | 1.8 ms | 3.8 ms |
-| `io_submit` | 4.1 µs | 7.8 µs | 15.4 µs | 26.8 µs |
-| `io_wait` | 78.0 µs | 445.2 µs | 1.8 ms | 3.8 ms |
-| `ready_wait` | 54.3 µs | 251.2 µs | 927.3 µs | 1.0 ms |
-| `fiber_run` | 206 ns | 395 ns | 503 ns | 3.2 µs |
-
-Per request (~2 IOs): summed p50 components total ~273 µs vs 301 µs observed p50; summed mean components total ~490 µs vs 540 µs observed avg. Self-consistent within 10%.
+| `suspend_wait` | 37.3 µs | 593 µs | 3.6 ms | 4.1 ms |
+| `io_submit` | 4.1 µs | 7.9 µs | 15.5 µs | 24.5 µs |
+| `io_wait` | 42.6 µs | 597 µs | 3.6 ms | 4.1 ms |
+| `ready_wait` | 26.5 µs | 130 µs | 661 µs | 1.0 ms |
+| `fiber_run` | 199 ns | 347 ns | 500 ns | 3.3 µs |

-`fiber_run` p50 = 206 ns confirms the dispatch loop itself is essentially free; this workload is entirely IO-bound. `SchedulerSystemTime` totals 1057 CPU-s (55% of 32 cores x 60 s) -- almost entirely `io_uring_submit`: 257 M syscalls x 4.1 µs = 1057 s. User-mode fiber work consumes 54 s (3%); idle time is 147 s (8%).
+`fiber_run` p50 = 199 ns confirms the dispatch loop itself is essentially free; this workload is entirely IO-bound. `SchedulerSystemTime` totals 1060 CPU-s (55% of 32 cores x 60 s) -- almost entirely `io_uring_submit`: 258 M syscalls x 4.1 µs = 1058 s. User-mode fiber work consumes 51 s (2.7%); idle time is 165 s (8.6%).

-The profile pinpoints `io_uring_submit` as the dominant lever. SQPOLL is incompatible with the per-CPU pinned scheduler (kernel poller would compete on the same CPU). The next optimization is batching submits at the `handleReadyQueue` boundary instead of per-fiber-suspend: with 4-16 ready fibers per dispatch pass typical, that's a 4-16x reduction in syscall count.
+The profile pinpoints `io_uring_submit` as the dominant lever. SQPOLL is not enabled (its kernel poller thread would compete with the pinned scheduler threads for the same CPUs). The next optimization to consider is batching submits at the `handleReadyQueue` boundary instead of per-fiber-suspend.

-### Profiler overhead (net-perf, 1000 connections, 60 s)
+### Profiler overhead (net-perf, 1000 connections, 60 s, 10 s warmup)

| metric | off | on | Δ |
|---|---|---|---|
-| RPS | 1889k | 1853k | -1.9% |
-| p50 | 210 µs | 301 µs | +43% |
-| p99 | 3404 µs | 2299 µs | **-32%** |
-| p99.9 | 3607 µs | 2649 µs | **-27%** |
+| RPS | 1922k | 1856k | -3.4% |
+| p50 | 210 µs | 173 µs | -18% |
+| p99 | 2431 µs | 3277 µs | +35% |
+| p99.9 | 2491 µs | 3474 µs | +39% |

-~2% RPS cost; tail latency *improves* substantially -- the per-suspend TSC reads + ring writes inject a small fixed cost into the dispatch loop that damps the bursty kernel-side contention driving the long tail.
+Profiler costs ~3% RPS. p50 actually improves under the profiler (the per-suspend TSC reads + ring writes change the dispatch-loop cadence in ways that benefit the median); the cost shows up in the tail (+35-39% at p99/p99.9).
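
+For reference, the log2-histogram technique behind these tables is tiny; recording is one bit-scan plus one relaxed counter increment, which is why it can sit on the suspend/resume path at all. A self-contained sketch (not silk's actual profiler, which timestamps with TSC reads and writes per-CPU rings):
+
+```
+#include <array>
+#include <atomic>
+#include <cstdint>
+
+// 64 buckets: bucket i counts samples with floor(log2(ns)) == i.
+struct Log2Hist {
+    std::array<std::atomic<uint64_t>, 64> buckets{};
+
+    void record(uint64_t ns) {
+        unsigned b = ns ? 63u - static_cast<unsigned>(__builtin_clzll(ns)) : 0u;
+        buckets[b].fetch_add(1, std::memory_order_relaxed);
+    }
+
+    // Approximate percentile: walk buckets until a fraction q of the mass
+    // is covered; report the bucket's upper bound in ns.
+    uint64_t percentile(double q) const {
+        uint64_t total = 0;
+        for (const auto& b : buckets) total += b.load(std::memory_order_relaxed);
+        uint64_t need = static_cast<uint64_t>(q * static_cast<double>(total));
+        uint64_t seen = 0;
+        for (unsigned i = 0; i < 64; ++i) {
+            seen += buckets[i].load(std::memory_order_relaxed);
+            if (seen >= need) return i < 63 ? (2ull << i) : ~0ull;
+        }
+        return ~0ull;
+    }
+};
+```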