From 7696718031ae3ba39989a86eaae18684b1532d52 Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Wed, 25 Mar 2026 17:27:18 +0000 Subject: [PATCH 1/5] feat(memory): use fallocate(PUNCH_HOLE) for guest_memfd discard MADV_DONTNEED does not release the backing pages of a MAP_SHARED file mapping (it only drops the caller's page-table entries; the pages remain in the page cache), which means discard_range() previously freed nothing for guest_memfd-backed memory. This prevented virtio-mem unplug and balloon inflate from actually freeing physical pages back to the host when secret_free is enabled. Add a fallocate(FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE) path for MAP_SHARED file-backed regions, which punches holes in the guest_memfd backing file and releases the pages from the page cache. Signed-off-by: Jack Thomson --- .../seccomp/aarch64-unknown-linux-musl.json | 6 +++- .../seccomp/x86_64-unknown-linux-musl.json | 4 +++ src/vmm/src/vstate/memory.rs | 32 +++++++++++++++---- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json index 5cfec5f518a..b5c3e602d93 100644 --- a/resources/seccomp/aarch64-unknown-linux-musl.json +++ b/resources/seccomp/aarch64-unknown-linux-musl.json @@ -217,7 +217,11 @@ }, { "syscall": "madvise", - "comment": "Used by the VirtIO balloon device and by musl for some customer workloads. It is also used by aws-lc during random number generation. They setup a memory page that mark with MADV_WIPEONFORK to be able to detect forks. They also call it with -1 to see if madvise is supported in certain platforms." + "comment": "Used by the VirtIO balloon device and by musl for some customer workloads. It is also used by aws-lc during random number generation. They setup a memory page that mark with MADV_WIPEONFORK to be able to detect forks. They also call it with -1 to see if madvise is supported in certain platforms." + }, + { + "syscall": "fallocate", + "comment": "Used to punch holes in guest_memfd (MAP_SHARED) when discarding memory ranges, e.g. 
during virtio-mem unplug or balloon inflate with secret_free." }, { "syscall": "msync", diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index c1eef63a6e3..80384bc18a2 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -219,6 +219,10 @@ "syscall": "madvise", "comment": "Used by the VirtIO balloon device and by musl for some customer workloads. It is also used by aws-lc during random number generation. They setup a memory page that mark with MADV_WIPEONFORK to be able to detect forks. They also call it with -1 to see if madvise is supported in certain platforms." }, + { + "syscall": "fallocate", + "comment": "Used to punch holes in guest_memfd (MAP_SHARED) when discarding memory ranges, e.g. during virtio-mem unplug or balloon inflate with secret_free." + }, { "syscall": "msync", "comment": "Used by the VirtIO pmem device to sync the file content with the backing file.", diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs index 5c7eeeb3d99..890a746d118 100644 --- a/src/vmm/src/vstate/memory.rs +++ b/src/vmm/src/vstate/memory.rs @@ -475,14 +475,32 @@ impl GuestRegionMmapExt { Ok(()) } } - // Match either the case of an anonymous mapping, or the case - // of a shared file mapping. - // TODO: madvise(MADV_DONTNEED) doesn't actually work with memfd - // (or in general MAP_SHARED of a fd). In those cases we should use - // fallocate64(FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE). - // We keep falling to the madvise branch to keep the previous behaviour. + // Guest_memfd (MAP_SHARED): use fallocate(PUNCH_HOLE) to free pages. + (Some(fo), flags) if flags & libc::MAP_SHARED != 0 => { + let file_off = fo.start() + caddr.raw_value(); + let len_i64 = i64::try_from(len).expect("discard length exceeds i64"); + // SAFETY: fd and offset are valid, len is within the mapped region. 
+ let ret = unsafe { + libc::fallocate( + fo.file().as_raw_fd(), + libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE, + file_off.cast_signed(), + len_i64, + ) + }; + if ret < 0 { + let os_error = std::io::Error::last_os_error(); + error!( + "discard_range: fallocate(PUNCH_HOLE) failed: {:?}", + os_error + ); + Err(GuestMemoryError::IOError(os_error)) + } else { + Ok(()) + } + } + // Anonymous memory: MADV_DONTNEED releases pages back to the kernel. _ => { - // Madvise the region in order to mark it as not used. // SAFETY: The address and length are known to be valid. let ret = unsafe { libc::madvise(phys_address.cast(), len, libc::MADV_DONTNEED) }; if ret < 0 { From 17560a549fb68a4ed5ec20e0edad2a190196a5d5 Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Wed, 25 Mar 2026 17:27:48 +0000 Subject: [PATCH 2/5] feat(balloon): allow balloon device with secret_free Now that discard_range() uses fallocate(PUNCH_HOLE) for MAP_SHARED guest_memfd regions, the balloon can properly reclaim memory when secret_free is enabled. Remove the restriction that prevented combining balloon with secret_free. 
Signed-off-by: Jack Thomson --- src/vmm/src/resources.rs | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 1e0f692b58f..92fe93bcd37 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -268,12 +268,6 @@ impl VmResources { return Err(MachineConfigError::IncompatibleBalloonSize); } - if self.balloon.get().is_some() && updated.secret_free { - return Err(MachineConfigError::Incompatible( - "balloon device", - "secret freedom", - )); - } if updated.secret_free { if self.vhost_user_devices_used() { return Err(MachineConfigError::Incompatible( @@ -347,10 +341,6 @@ impl VmResources { return Err(BalloonConfigError::TooManyPagesRequested); } - if self.machine_config.secret_free { - return Err(BalloonConfigError::IncompatibleWith("secret freedom")); - } - self.balloon.set(config) } From 160e3742c8e7ecd8f12d1243add73b872d5fab47 Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Wed, 25 Mar 2026 18:15:37 +0000 Subject: [PATCH 3/5] feat(utils): use guest meminfo for secret_free memory readings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When secret_free is enabled, guest memory is backed by guest_memfd (MAP_SHARED). Host RSS doesn't reflect pages freed by fallocate(PUNCH_HOLE) — they're removed from the page cache, not RSS. Make get_resident_memory(), and therefore get_stable_rss_mem() which polls it, return guest-side memory usage (total - available) when uvm.secret_free is set. This is a closed system unaffected by host noise and correctly reflects balloon and virtio-mem operations. 
Signed-off-by: Jack Thomson --- tests/framework/utils.py | 22 ++++++++++++++----- .../performance/test_hotplug_memory.py | 4 ++-- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/tests/framework/utils.py b/tests/framework/utils.py index 8c343e80980..a06d602cb5b 100644 --- a/tests/framework/utils.py +++ b/tests/framework/utils.py @@ -28,6 +28,8 @@ wait_fixed, ) +from framework.guest_stats import MeminfoGuest + FLUSH_CMD = 'screen -S {session} -X colon "logfile flush 0^M"' CommandReturn = namedtuple("CommandReturn", "returncode stdout stderr") CMDLOG = logging.getLogger("commands") @@ -131,9 +133,14 @@ def track_cpu_utilization( return cpu_utilization -def get_resident_memory(process: psutil.Process): +def get_resident_memory(uvm): """Returns current memory utilization in KiB, including used HugeTLBFS""" + if uvm is not None and uvm.secret_free: + stats = MeminfoGuest(uvm).get() + return stats.mem_total.kib() - stats.mem_available.kib() + + process: psutil.Process = uvm.ps proc_status = Path("/proc", str(process.pid), "status").read_text("utf-8") for line in proc_status.splitlines(): if line.startswith("HugetlbPages:"): # entry is in KiB @@ -257,18 +264,23 @@ def search_output_from_cmd(cmd: str, find_regex: typing.Pattern) -> typing.Match def get_stable_rss_mem(uvm, percentage_delta=1): """ - Get the RSS memory that a guest uses, given the pid of the guest. + Get a stable memory usage reading for the VM. + + For regular memory: returns host RSS of the FC process (KiB). + For secret_free (guest_memfd): returns guest-side memory usage + (total - available) since host RSS doesn't track page cache pages + freed by fallocate(PUNCH_HOLE). - Wait till the fluctuations in RSS drop below percentage_delta. + Wait till the fluctuations drop below percentage_delta. Or print a warning if this does not happen. 
""" first_rss = 0 second_rss = 0 for _ in range(5): - first_rss = get_resident_memory(uvm.ps) + first_rss = get_resident_memory(uvm) time.sleep(1) - second_rss = get_resident_memory(uvm.ps) + second_rss = get_resident_memory(uvm) abs_diff = abs(first_rss - second_rss) abs_delta = abs_diff / first_rss * 100 print( diff --git a/tests/integration_tests/performance/test_hotplug_memory.py b/tests/integration_tests/performance/test_hotplug_memory.py index 2b0dd3bf743..59d5e2828f4 100644 --- a/tests/integration_tests/performance/test_hotplug_memory.py +++ b/tests/integration_tests/performance/test_hotplug_memory.py @@ -278,11 +278,11 @@ def check_hotplug(uvm, requested_size_mib): def check_hotunplug(uvm, requested_size_mib): """Verifies memory can be hotunplugged and gets released""" - rss_before = get_resident_memory(uvm.ps) + rss_before = get_resident_memory(uvm) check_hotplug(uvm, requested_size_mib) - rss_after = get_resident_memory(uvm.ps) + rss_after = get_resident_memory(uvm) print(f"RSS before: {rss_before}, after: {rss_after}") From 81612135bddffa70c4d1af95826ae2de35b2954f Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Wed, 25 Mar 2026 18:15:37 +0000 Subject: [PATCH 4/5] test(balloon): enable secret_free on all balloon tests Add the secret_free fixture parameter to all balloon functional tests, so they run with both SF_OFF and SF_ON variants. This exercises the fallocate(PUNCH_HOLE) discard path for guest_memfd-backed memory during balloon inflate/deflate. 
Signed-off-by: Jack Thomson --- .../functional/test_balloon.py | 66 +++++++++---------- 1 file changed, 31 insertions(+), 35 deletions(-) diff --git a/tests/integration_tests/functional/test_balloon.py b/tests/integration_tests/functional/test_balloon.py index 44d9b9b6f37..d28082e3cf3 100644 --- a/tests/integration_tests/functional/test_balloon.py +++ b/tests/integration_tests/functional/test_balloon.py @@ -93,13 +93,13 @@ def _test_rss_memory_lower(test_microvm): # pylint: disable=C0103 -def test_rss_memory_lower(uvm_plain_any): +def test_rss_memory_lower(uvm_plain_any, secret_free): """ Test that inflating the balloon makes guest use less rss memory. """ test_microvm = uvm_plain_any test_microvm.spawn() - test_microvm.basic_config() + test_microvm.basic_config(secret_free=secret_free) test_microvm.add_net_iface() # Add a memory balloon. @@ -114,13 +114,13 @@ def test_rss_memory_lower(uvm_plain_any): # pylint: disable=C0103 -def test_inflate_reduces_free(uvm_plain_any): +def test_inflate_reduces_free(uvm_plain_any, secret_free): """ Check that the output of free in guest changes with inflate. """ test_microvm = uvm_plain_any test_microvm.spawn() - test_microvm.basic_config() + test_microvm.basic_config(secret_free=secret_free) test_microvm.add_net_iface() # Install deflated balloon. @@ -150,7 +150,7 @@ def test_inflate_reduces_free(uvm_plain_any): # pylint: disable=C0103 @pytest.mark.parametrize("deflate_on_oom", [True, False]) -def test_deflate_on_oom(uvm_plain_any, deflate_on_oom): +def test_deflate_on_oom(uvm_plain_any, secret_free, deflate_on_oom): """ Verify that setting the `deflate_on_oom` option works correctly. @@ -167,7 +167,7 @@ def test_deflate_on_oom(uvm_plain_any, deflate_on_oom): test_microvm = uvm_plain_any test_microvm.spawn() - test_microvm.basic_config() + test_microvm.basic_config(secret_free=secret_free) test_microvm.add_net_iface() # Add a deflated memory balloon. 
@@ -215,13 +215,13 @@ def test_deflate_on_oom(uvm_plain_any, deflate_on_oom): # pylint: disable=C0103 -def test_reinflate_balloon(uvm_plain_any): +def test_reinflate_balloon(uvm_plain_any, secret_free): """ Verify that repeatedly inflating and deflating the balloon works. """ test_microvm = uvm_plain_any test_microvm.spawn() - test_microvm.basic_config() + test_microvm.basic_config(secret_free=secret_free) test_microvm.add_net_iface() # Add a deflated memory balloon. @@ -280,13 +280,13 @@ def test_reinflate_balloon(uvm_plain_any): # pylint: disable=C0103 -def test_stats(uvm_plain_any): +def test_stats(uvm_plain_any, secret_free): """ Verify that balloon stats work as expected. """ test_microvm = uvm_plain_any test_microvm.spawn() - test_microvm.basic_config() + test_microvm.basic_config(secret_free=secret_free) test_microvm.add_net_iface() # Add a memory balloon with stats enabled. @@ -351,13 +351,13 @@ def test_stats(uvm_plain_any): check_guest_dmesg_for_stalls(test_microvm.ssh) -def test_stats_update(uvm_plain_any): +def test_stats_update(uvm_plain_any, secret_free): """ Verify that balloon stats update correctly. """ test_microvm = uvm_plain_any test_microvm.spawn() - test_microvm.basic_config() + test_microvm.basic_config(secret_free=secret_free) test_microvm.add_net_iface() # Add a memory balloon with stats enabled. @@ -403,7 +403,7 @@ def test_stats_update(uvm_plain_any): check_guest_dmesg_for_stalls(test_microvm.ssh) -def test_balloon_snapshot(uvm_plain_any, microvm_factory): +def test_balloon_snapshot(uvm_plain_any, secret_free, microvm_factory): """ Test that the balloon works after pause/resume. """ @@ -415,6 +415,7 @@ def test_balloon_snapshot(uvm_plain_any, microvm_factory): vm.basic_config( vcpu_count=2, mem_size_mib=256, + secret_free=secret_free, ) vm.add_net_iface() @@ -437,15 +438,14 @@ def test_balloon_snapshot(uvm_plain_any, microvm_factory): # Now inflate the balloon with 20MB of pages. 
vm.api.balloon.patch(amount_mib=20) - # Check memory usage again. + # Check memory usage again — should decrease (balloon reclaimed pages). second_reading = get_stable_rss_mem(vm) - - # There should be a reduction in RSS, but it's inconsistent. - # We only test that the reduction happens. assert first_reading > second_reading snapshot = vm.snapshot_full() - microvm = microvm_factory.build_from_snapshot(snapshot) + # secret_free requires UFFD backend (file backend can't mmap as guest_memfd) + uffd = "on_demand" if secret_free else None + microvm = microvm_factory.build_from_snapshot(snapshot, uffd_handler_name=uffd) # Free page reporting and hinting fragment guest memory VMAs # making it harder to identify them in the memory monitor. @@ -461,18 +461,15 @@ def test_balloon_snapshot(uvm_plain_any, microvm_factory): # Dirty 60MB of pages. make_guest_dirty_memory(microvm.ssh, amount_mib=60) - # Check memory usage. + # Check memory usage — should increase (guest used more memory). fourth_reading = get_stable_rss_mem(microvm) - assert fourth_reading > third_reading # Inflate the balloon with another 20MB of pages. microvm.api.balloon.patch(amount_mib=40) + # Should decrease again (balloon reclaimed pages). fifth_reading = get_stable_rss_mem(microvm) - - # There should be a reduction in RSS, but it's inconsistent. - # We only test that the reduction happens. assert fourth_reading > fifth_reading # Get the stats after we take a snapshot and dirty some memory, @@ -488,7 +485,9 @@ def test_balloon_snapshot(uvm_plain_any, microvm_factory): @pytest.mark.parametrize("method", ["reporting", "hinting"]) -def test_hinting_reporting_snapshot(uvm_plain_any, microvm_factory, method): +def test_hinting_reporting_snapshot( + uvm_plain_any, secret_free, microvm_factory, method +): """ Test that the balloon hinting and reporting works after pause/resume. 
""" @@ -500,6 +499,7 @@ def test_hinting_reporting_snapshot(uvm_plain_any, microvm_factory, method): vm.basic_config( vcpu_count=2, mem_size_mib=256, + secret_free=secret_free, ) vm.add_net_iface() @@ -534,15 +534,14 @@ def test_hinting_reporting_snapshot(uvm_plain_any, microvm_factory, method): if free_page_hinting: vm.api.balloon_hinting_start.patch() - # Check memory usage again. + # Check memory usage again — should decrease (pages freed + hinted/reported). second_reading = get_stable_rss_mem(vm) - - # There should be a reduction in RSS, but it's inconsistent. - # We only test that the reduction happens. assert first_reading > second_reading snapshot = vm.snapshot_full() - microvm = microvm_factory.build_from_snapshot(snapshot) + # secret_free requires UFFD backend (file backend can't mmap as guest_memfd) + uffd = "on_demand" if secret_free else None + microvm = microvm_factory.build_from_snapshot(snapshot, uffd_handler_name=uffd) # Free page reporting and hinting fragment guest memory VMAs # making it harder to identify them in the memory monitor. @@ -565,23 +564,20 @@ def test_hinting_reporting_snapshot(uvm_plain_any, microvm_factory, method): if free_page_hinting: microvm.api.balloon_hinting_start.patch() - # Check memory usage again. + # Check memory usage again — should decrease. fourth_reading = get_stable_rss_mem(microvm) - - # There should be a reduction in RSS, but it's inconsistent. - # We only test that the reduction happens. assert third_reading > fourth_reading check_guest_dmesg_for_stalls(microvm.ssh) @pytest.mark.parametrize("method", ["traditional", "hinting", "reporting"]) -def test_memory_scrub(uvm_plain_any, method): +def test_memory_scrub(uvm_plain_any, secret_free, method): """ Test that the memory is zeroed after deflate. 
""" microvm = uvm_plain_any microvm.spawn() - microvm.basic_config(vcpu_count=2, mem_size_mib=256) + microvm.basic_config(vcpu_count=2, mem_size_mib=256, secret_free=secret_free) microvm.add_net_iface() free_page_reporting = method == "reporting" From eab0a1305803c8aa190b6b36e45fb547c5af62e9 Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Thu, 26 Mar 2026 12:14:54 +0000 Subject: [PATCH 5/5] test(hotplug): enable RSS assertion for secret_free and fix VM leak Now that discard_range() uses fallocate(PUNCH_HOLE) for guest_memfd, and get_resident_memory() uses guest meminfo for secret_free VMs, the RSS decrease assertion in check_hotunplug works correctly for secret_free. Remove the skip. Signed-off-by: Jack Thomson --- tests/integration_tests/performance/test_hotplug_memory.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/integration_tests/performance/test_hotplug_memory.py b/tests/integration_tests/performance/test_hotplug_memory.py index 59d5e2828f4..96e7fa80371 100644 --- a/tests/integration_tests/performance/test_hotplug_memory.py +++ b/tests/integration_tests/performance/test_hotplug_memory.py @@ -289,8 +289,10 @@ def check_hotunplug(uvm, requested_size_mib): machine_config = uvm.api.machine_config.get().json() huge_pages = HugePagesConfig(machine_config["huge_pages"]) secret_free = machine_config.get("secret_free", False) - if not secret_free and ( - huge_pages == HugePagesConfig.NONE or supports_hugetlbfs_discard() + if ( + secret_free + or huge_pages == HugePagesConfig.NONE + or supports_hugetlbfs_discard() ): assert rss_after < rss_before, "RSS didn't decrease"