From 7696718031ae3ba39989a86eaae18684b1532d52 Mon Sep 17 00:00:00 2001
From: Jack Thomson <jackabt@amazon.com>
Date: Wed, 25 Mar 2026 17:27:18 +0000
Subject: [PATCH 1/5] feat(memory): use fallocate(PUNCH_HOLE) for guest_memfd
 discard

MADV_DONTNEED does not free the backing pages of a MAP_SHARED file
mapping (it only drops the process's page-table entries), which means
discard_range() previously released nothing for guest_memfd-backed
memory. This prevented virtio-mem unplug and balloon inflate from
actually freeing physical pages back to the host when secret_free is
enabled.

Add a fallocate(FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE) path for
MAP_SHARED file-backed regions, which punches holes in the guest_memfd
backing file and releases the pages from the page cache.

Signed-off-by: Jack Thomson <jackabt@amazon.com>
---
 .../seccomp/aarch64-unknown-linux-musl.json   |  6 +++-
 .../seccomp/x86_64-unknown-linux-musl.json    |  4 +++
 src/vmm/src/vstate/memory.rs                  | 32 +++++++++++++++----
 3 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json
index 5cfec5f518a..b5c3e602d93 100644
--- a/resources/seccomp/aarch64-unknown-linux-musl.json
+++ b/resources/seccomp/aarch64-unknown-linux-musl.json
@@ -217,7 +217,11 @@
             },
             {
                 "syscall": "madvise",
-                "comment": "Used by the VirtIO balloon device and by musl for some customer workloads. It is also used by aws-lc during random number generation. They setup a memory page that mark with MADV_WIPEONFORK to be able to detect forks. They also call it with -1 to see if madvise is supported in certain platforms." 
+                "comment": "Used by the VirtIO balloon device and by musl for some customer workloads. It is also used by aws-lc during random number generation. They setup a memory page that mark with MADV_WIPEONFORK to be able to detect forks. They also call it with -1 to see if madvise is supported in certain platforms."
+            },
+            {
+                "syscall": "fallocate",
+                "comment": "Used to punch holes in guest_memfd (MAP_SHARED) when discarding memory ranges, e.g. during virtio-mem unplug or balloon inflate with secret_free."
             },
             {
                 "syscall": "msync",
diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json
index c1eef63a6e3..80384bc18a2 100644
--- a/resources/seccomp/x86_64-unknown-linux-musl.json
+++ b/resources/seccomp/x86_64-unknown-linux-musl.json
@@ -219,6 +219,10 @@
                 "syscall": "madvise",
                 "comment": "Used by the VirtIO balloon device and by musl for some customer workloads. It is also used by aws-lc during random number generation. They setup a memory page that mark with MADV_WIPEONFORK to be able to detect forks. They also call it with -1 to see if madvise is supported in certain platforms."
             },
+            {
+                "syscall": "fallocate",
+                "comment": "Used to punch holes in guest_memfd (MAP_SHARED) when discarding memory ranges, e.g. during virtio-mem unplug or balloon inflate with secret_free."
+            },
             {
                 "syscall": "msync",
                 "comment": "Used by the VirtIO pmem device to sync the file content with the backing file.",
diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs
index 5c7eeeb3d99..890a746d118 100644
--- a/src/vmm/src/vstate/memory.rs
+++ b/src/vmm/src/vstate/memory.rs
@@ -475,14 +475,32 @@ impl GuestRegionMmapExt {
                     Ok(())
                 }
             }
-            // Match either the case of an anonymous mapping, or the case
-            // of a shared file mapping.
-            // TODO: madvise(MADV_DONTNEED) doesn't actually work with memfd
-            // (or in general MAP_SHARED of a fd). In those cases we should use
-            // fallocate64(FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE).
-            // We keep falling to the madvise branch to keep the previous behaviour.
+            // Guest_memfd (MAP_SHARED): use fallocate(PUNCH_HOLE) to free pages.
+            (Some(fo), flags) if flags & libc::MAP_SHARED != 0 => {
+                let file_off = fo.start() + caddr.raw_value();
+                let len_i64 = i64::try_from(len).expect("discard length exceeds i64");
+                // SAFETY: fd and offset are valid, len is within the mapped region.
+                let ret = unsafe {
+                    libc::fallocate(
+                        fo.file().as_raw_fd(),
+                        libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE,
+                        file_off.cast_signed(),
+                        len_i64,
+                    )
+                };
+                if ret < 0 {
+                    let os_error = std::io::Error::last_os_error();
+                    error!(
+                        "discard_range: fallocate(PUNCH_HOLE) failed: {:?}",
+                        os_error
+                    );
+                    Err(GuestMemoryError::IOError(os_error))
+                } else {
+                    Ok(())
+                }
+            }
+            // Anonymous memory: MADV_DONTNEED releases pages back to the kernel.
             _ => {
-                // Madvise the region in order to mark it as not used.
                 // SAFETY: The address and length are known to be valid.
                 let ret = unsafe { libc::madvise(phys_address.cast(), len, libc::MADV_DONTNEED) };
                 if ret < 0 {

From 17560a549fb68a4ed5ec20e0edad2a190196a5d5 Mon Sep 17 00:00:00 2001
From: Jack Thomson <jackabt@amazon.com>
Date: Wed, 25 Mar 2026 17:27:48 +0000
Subject: [PATCH 2/5] feat(balloon): allow balloon device with secret_free

Now that discard_range() uses fallocate(PUNCH_HOLE) for MAP_SHARED
guest_memfd regions, the balloon can properly reclaim memory when
secret_free is enabled. Remove the restriction that prevented
combining balloon with secret_free.

Signed-off-by: Jack Thomson <jackabt@amazon.com>
---
 src/vmm/src/resources.rs | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs
index 1e0f692b58f..92fe93bcd37 100644
--- a/src/vmm/src/resources.rs
+++ b/src/vmm/src/resources.rs
@@ -268,12 +268,6 @@ impl VmResources {
             return Err(MachineConfigError::IncompatibleBalloonSize);
         }
 
-        if self.balloon.get().is_some() && updated.secret_free {
-            return Err(MachineConfigError::Incompatible(
-                "balloon device",
-                "secret freedom",
-            ));
-        }
         if updated.secret_free {
             if self.vhost_user_devices_used() {
                 return Err(MachineConfigError::Incompatible(
@@ -347,10 +341,6 @@ impl VmResources {
             return Err(BalloonConfigError::TooManyPagesRequested);
         }
 
-        if self.machine_config.secret_free {
-            return Err(BalloonConfigError::IncompatibleWith("secret freedom"));
-        }
-
         self.balloon.set(config)
     }
 

From 160e3742c8e7ecd8f12d1243add73b872d5fab47 Mon Sep 17 00:00:00 2001
From: Jack Thomson <jackabt@amazon.com>
Date: Wed, 25 Mar 2026 18:15:37 +0000
Subject: [PATCH 3/5] feat(utils): use guest meminfo for secret_free memory
 readings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When secret_free is enabled, guest memory is backed by guest_memfd
(MAP_SHARED). Host RSS doesn't reflect pages freed by
fallocate(PUNCH_HOLE) — they're removed from the page cache, not RSS.

Make get_resident_memory(), and therefore get_stable_rss_mem(), return
guest-side memory usage (total - available) when uvm.secret_free is
set; it now takes the microvm instead of a psutil.Process. The guest
view is a closed system unaffected by host noise and correctly reflects
balloon and virtio-mem operations.

Signed-off-by: Jack Thomson <jackabt@amazon.com>
---
 tests/framework/utils.py                      | 22 ++++++++++++++-----
 .../performance/test_hotplug_memory.py        |  4 ++--
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/tests/framework/utils.py b/tests/framework/utils.py
index 8c343e80980..a06d602cb5b 100644
--- a/tests/framework/utils.py
+++ b/tests/framework/utils.py
@@ -28,6 +28,8 @@
     wait_fixed,
 )
 
+from framework.guest_stats import MeminfoGuest
+
 FLUSH_CMD = 'screen -S {session} -X colon "logfile flush 0^M"'
 CommandReturn = namedtuple("CommandReturn", "returncode stdout stderr")
 CMDLOG = logging.getLogger("commands")
@@ -131,9 +133,14 @@ def track_cpu_utilization(
     return cpu_utilization
 
 
-def get_resident_memory(process: psutil.Process):
+def get_resident_memory(uvm):
     """Returns current memory utilization in KiB, including used HugeTLBFS"""
 
+    if uvm is not None and uvm.secret_free:
+        stats = MeminfoGuest(uvm).get()
+        return stats.mem_total.kib() - stats.mem_available.kib()
+
+    process: psutil.Process = uvm.ps
     proc_status = Path("/proc", str(process.pid), "status").read_text("utf-8")
     for line in proc_status.splitlines():
         if line.startswith("HugetlbPages:"):  # entry is in KiB
@@ -257,18 +264,23 @@ def search_output_from_cmd(cmd: str, find_regex: typing.Pattern) -> typing.Match
 
 def get_stable_rss_mem(uvm, percentage_delta=1):
     """
-    Get the RSS memory that a guest uses, given the pid of the guest.
+    Get a stable memory usage reading for the VM.
+
+    For regular memory: returns host RSS of the FC process (KiB).
+    For secret_free (guest_memfd): returns guest-side memory usage
+    (total - available) since host RSS doesn't track page cache pages
+    freed by fallocate(PUNCH_HOLE).
 
-    Wait till the fluctuations in RSS drop below percentage_delta.
+    Wait till the fluctuations drop below percentage_delta.
     Or print a warning if this does not happen.
     """
 
     first_rss = 0
     second_rss = 0
     for _ in range(5):
-        first_rss = get_resident_memory(uvm.ps)
+        first_rss = get_resident_memory(uvm)
         time.sleep(1)
-        second_rss = get_resident_memory(uvm.ps)
+        second_rss = get_resident_memory(uvm)
         abs_diff = abs(first_rss - second_rss)
         abs_delta = abs_diff / first_rss * 100
         print(
diff --git a/tests/integration_tests/performance/test_hotplug_memory.py b/tests/integration_tests/performance/test_hotplug_memory.py
index 2b0dd3bf743..59d5e2828f4 100644
--- a/tests/integration_tests/performance/test_hotplug_memory.py
+++ b/tests/integration_tests/performance/test_hotplug_memory.py
@@ -278,11 +278,11 @@ def check_hotplug(uvm, requested_size_mib):
 def check_hotunplug(uvm, requested_size_mib):
     """Verifies memory can be hotunplugged and gets released"""
 
-    rss_before = get_resident_memory(uvm.ps)
+    rss_before = get_resident_memory(uvm)
 
     check_hotplug(uvm, requested_size_mib)
 
-    rss_after = get_resident_memory(uvm.ps)
+    rss_after = get_resident_memory(uvm)
 
     print(f"RSS before: {rss_before}, after: {rss_after}")
 

From 81612135bddffa70c4d1af95826ae2de35b2954f Mon Sep 17 00:00:00 2001
From: Jack Thomson <jackabt@amazon.com>
Date: Wed, 25 Mar 2026 18:15:37 +0000
Subject: [PATCH 4/5] test(balloon): enable secret_free on all balloon tests

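Add the secret_free fixture parameter to all balloon functional tests,
so they run with both SF_OFF and SF_ON variants. This exercises the
fallocate(PUNCH_HOLE) discard path for guest_memfd-backed memory during
balloon inflate/deflate. When secret_free is set, the snapshot tests
restore through the on_demand UFFD handler, because the file backend
cannot be mapped as guest_memfd.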

Signed-off-by: Jack Thomson <jackabt@amazon.com>
---
 .../functional/test_balloon.py                | 66 +++++++++----------
 1 file changed, 31 insertions(+), 35 deletions(-)

diff --git a/tests/integration_tests/functional/test_balloon.py b/tests/integration_tests/functional/test_balloon.py
index 44d9b9b6f37..d28082e3cf3 100644
--- a/tests/integration_tests/functional/test_balloon.py
+++ b/tests/integration_tests/functional/test_balloon.py
@@ -93,13 +93,13 @@ def _test_rss_memory_lower(test_microvm):
 
 
 # pylint: disable=C0103
-def test_rss_memory_lower(uvm_plain_any):
+def test_rss_memory_lower(uvm_plain_any, secret_free):
     """
     Test that inflating the balloon makes guest use less rss memory.
     """
     test_microvm = uvm_plain_any
     test_microvm.spawn()
-    test_microvm.basic_config()
+    test_microvm.basic_config(secret_free=secret_free)
     test_microvm.add_net_iface()
 
     # Add a memory balloon.
@@ -114,13 +114,13 @@ def test_rss_memory_lower(uvm_plain_any):
 
 
 # pylint: disable=C0103
-def test_inflate_reduces_free(uvm_plain_any):
+def test_inflate_reduces_free(uvm_plain_any, secret_free):
     """
     Check that the output of free in guest changes with inflate.
     """
     test_microvm = uvm_plain_any
     test_microvm.spawn()
-    test_microvm.basic_config()
+    test_microvm.basic_config(secret_free=secret_free)
     test_microvm.add_net_iface()
 
     # Install deflated balloon.
@@ -150,7 +150,7 @@ def test_inflate_reduces_free(uvm_plain_any):
 
 # pylint: disable=C0103
 @pytest.mark.parametrize("deflate_on_oom", [True, False])
-def test_deflate_on_oom(uvm_plain_any, deflate_on_oom):
+def test_deflate_on_oom(uvm_plain_any, secret_free, deflate_on_oom):
     """
     Verify that setting the `deflate_on_oom` option works correctly.
 
@@ -167,7 +167,7 @@ def test_deflate_on_oom(uvm_plain_any, deflate_on_oom):
 
     test_microvm = uvm_plain_any
     test_microvm.spawn()
-    test_microvm.basic_config()
+    test_microvm.basic_config(secret_free=secret_free)
     test_microvm.add_net_iface()
 
     # Add a deflated memory balloon.
@@ -215,13 +215,13 @@ def test_deflate_on_oom(uvm_plain_any, deflate_on_oom):
 
 
 # pylint: disable=C0103
-def test_reinflate_balloon(uvm_plain_any):
+def test_reinflate_balloon(uvm_plain_any, secret_free):
     """
     Verify that repeatedly inflating and deflating the balloon works.
     """
     test_microvm = uvm_plain_any
     test_microvm.spawn()
-    test_microvm.basic_config()
+    test_microvm.basic_config(secret_free=secret_free)
     test_microvm.add_net_iface()
 
     # Add a deflated memory balloon.
@@ -280,13 +280,13 @@ def test_reinflate_balloon(uvm_plain_any):
 
 
 # pylint: disable=C0103
-def test_stats(uvm_plain_any):
+def test_stats(uvm_plain_any, secret_free):
     """
     Verify that balloon stats work as expected.
     """
     test_microvm = uvm_plain_any
     test_microvm.spawn()
-    test_microvm.basic_config()
+    test_microvm.basic_config(secret_free=secret_free)
     test_microvm.add_net_iface()
 
     # Add a memory balloon with stats enabled.
@@ -351,13 +351,13 @@ def test_stats(uvm_plain_any):
     check_guest_dmesg_for_stalls(test_microvm.ssh)
 
 
-def test_stats_update(uvm_plain_any):
+def test_stats_update(uvm_plain_any, secret_free):
     """
     Verify that balloon stats update correctly.
     """
     test_microvm = uvm_plain_any
     test_microvm.spawn()
-    test_microvm.basic_config()
+    test_microvm.basic_config(secret_free=secret_free)
     test_microvm.add_net_iface()
 
     # Add a memory balloon with stats enabled.
@@ -403,7 +403,7 @@ def test_stats_update(uvm_plain_any):
     check_guest_dmesg_for_stalls(test_microvm.ssh)
 
 
-def test_balloon_snapshot(uvm_plain_any, microvm_factory):
+def test_balloon_snapshot(uvm_plain_any, secret_free, microvm_factory):
     """
     Test that the balloon works after pause/resume.
     """
@@ -415,6 +415,7 @@ def test_balloon_snapshot(uvm_plain_any, microvm_factory):
     vm.basic_config(
         vcpu_count=2,
         mem_size_mib=256,
+        secret_free=secret_free,
     )
     vm.add_net_iface()
 
@@ -437,15 +438,14 @@ def test_balloon_snapshot(uvm_plain_any, microvm_factory):
     # Now inflate the balloon with 20MB of pages.
     vm.api.balloon.patch(amount_mib=20)
 
-    # Check memory usage again.
+    # Check memory usage again — should decrease (balloon reclaimed pages).
     second_reading = get_stable_rss_mem(vm)
-
-    # There should be a reduction in RSS, but it's inconsistent.
-    # We only test that the reduction happens.
     assert first_reading > second_reading
 
     snapshot = vm.snapshot_full()
-    microvm = microvm_factory.build_from_snapshot(snapshot)
+    # secret_free requires UFFD backend (file backend can't mmap as guest_memfd)
+    uffd = "on_demand" if secret_free else None
+    microvm = microvm_factory.build_from_snapshot(snapshot, uffd_handler_name=uffd)
 
     # Free page reporting and hinting fragment guest memory VMAs
     # making it harder to identify them in the memory monitor.
@@ -461,18 +461,15 @@ def test_balloon_snapshot(uvm_plain_any, microvm_factory):
     # Dirty 60MB of pages.
     make_guest_dirty_memory(microvm.ssh, amount_mib=60)
 
-    # Check memory usage.
+    # Check memory usage — should increase (guest used more memory).
     fourth_reading = get_stable_rss_mem(microvm)
-
     assert fourth_reading > third_reading
 
     # Inflate the balloon with another 20MB of pages.
     microvm.api.balloon.patch(amount_mib=40)
 
+    # Should decrease again (balloon reclaimed pages).
     fifth_reading = get_stable_rss_mem(microvm)
-
-    # There should be a reduction in RSS, but it's inconsistent.
-    # We only test that the reduction happens.
     assert fourth_reading > fifth_reading
 
     # Get the stats after we take a snapshot and dirty some memory,
@@ -488,7 +485,9 @@ def test_balloon_snapshot(uvm_plain_any, microvm_factory):
 
 
 @pytest.mark.parametrize("method", ["reporting", "hinting"])
-def test_hinting_reporting_snapshot(uvm_plain_any, microvm_factory, method):
+def test_hinting_reporting_snapshot(
+    uvm_plain_any, secret_free, microvm_factory, method
+):
     """
     Test that the balloon hinting and reporting works after pause/resume.
     """
@@ -500,6 +499,7 @@ def test_hinting_reporting_snapshot(uvm_plain_any, microvm_factory, method):
     vm.basic_config(
         vcpu_count=2,
         mem_size_mib=256,
+        secret_free=secret_free,
     )
     vm.add_net_iface()
 
@@ -534,15 +534,14 @@ def test_hinting_reporting_snapshot(uvm_plain_any, microvm_factory, method):
     if free_page_hinting:
         vm.api.balloon_hinting_start.patch()
 
-    # Check memory usage again.
+    # Check memory usage again — should decrease (pages freed + hinted/reported).
     second_reading = get_stable_rss_mem(vm)
-
-    # There should be a reduction in RSS, but it's inconsistent.
-    # We only test that the reduction happens.
     assert first_reading > second_reading
 
     snapshot = vm.snapshot_full()
-    microvm = microvm_factory.build_from_snapshot(snapshot)
+    # secret_free requires UFFD backend (file backend can't mmap as guest_memfd)
+    uffd = "on_demand" if secret_free else None
+    microvm = microvm_factory.build_from_snapshot(snapshot, uffd_handler_name=uffd)
 
     # Free page reporting and hinting fragment guest memory VMAs
     # making it harder to identify them in the memory monitor.
@@ -565,23 +564,20 @@ def test_hinting_reporting_snapshot(uvm_plain_any, microvm_factory, method):
     if free_page_hinting:
         microvm.api.balloon_hinting_start.patch()
 
-    # Check memory usage again.
+    # Check memory usage again — should decrease.
     fourth_reading = get_stable_rss_mem(microvm)
-
-    # There should be a reduction in RSS, but it's inconsistent.
-    # We only test that the reduction happens.
     assert third_reading > fourth_reading
     check_guest_dmesg_for_stalls(microvm.ssh)
 
 
 @pytest.mark.parametrize("method", ["traditional", "hinting", "reporting"])
-def test_memory_scrub(uvm_plain_any, method):
+def test_memory_scrub(uvm_plain_any, secret_free, method):
     """
     Test that the memory is zeroed after deflate.
     """
     microvm = uvm_plain_any
     microvm.spawn()
-    microvm.basic_config(vcpu_count=2, mem_size_mib=256)
+    microvm.basic_config(vcpu_count=2, mem_size_mib=256, secret_free=secret_free)
     microvm.add_net_iface()
 
     free_page_reporting = method == "reporting"

From eab0a1305803c8aa190b6b36e45fb547c5af62e9 Mon Sep 17 00:00:00 2001
From: Jack Thomson <jackabt@amazon.com>
Date: Thu, 26 Mar 2026 12:14:54 +0000
Subject: [PATCH 5/5] test(hotplug): enable RSS assertion for secret_free and
 fix VM leak

Now that discard_range() uses fallocate(PUNCH_HOLE) for guest_memfd,
and get_resident_memory() uses guest meminfo for secret_free VMs,
the RSS decrease assertion in check_hotunplug works correctly for
secret_free. Remove the skip.

Signed-off-by: Jack Thomson <jackabt@amazon.com>
---
 tests/integration_tests/performance/test_hotplug_memory.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/integration_tests/performance/test_hotplug_memory.py b/tests/integration_tests/performance/test_hotplug_memory.py
index 59d5e2828f4..96e7fa80371 100644
--- a/tests/integration_tests/performance/test_hotplug_memory.py
+++ b/tests/integration_tests/performance/test_hotplug_memory.py
@@ -289,8 +289,10 @@ def check_hotunplug(uvm, requested_size_mib):
     machine_config = uvm.api.machine_config.get().json()
     huge_pages = HugePagesConfig(machine_config["huge_pages"])
     secret_free = machine_config.get("secret_free", False)
-    if not secret_free and (
-        huge_pages == HugePagesConfig.NONE or supports_hugetlbfs_discard()
+    if (
+        secret_free
+        or huge_pages == HugePagesConfig.NONE
+        or supports_hugetlbfs_discard()
     ):
         assert rss_after < rss_before, "RSS didn't decrease"