diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json index 0d8d4e51224..71dfc353a0f 100644 --- a/resources/seccomp/aarch64-unknown-linux-musl.json +++ b/resources/seccomp/aarch64-unknown-linux-musl.json @@ -284,6 +284,75 @@ } ] }, + { + "syscall": "mmap", + "comment": "Used by pmem device to mmap the backing file", + "args": [ + { + "index": 3, + "type": "dword", + "op": "eq", + "val": 16385, + "comment": "libc::MAP_SHARED | libc::MAP_NORESERVE" + } + ] + }, + { + "syscall": "mmap", + "comment": "Used by pmem device for aligned anonymous mapping", + "args": [ + { + "index": 3, + "type": "dword", + "op": "eq", + "val": 16418, + "comment": "libc::MAP_PRIVATE | libc::MAP_NORESERVE | libc::MAP_ANONYMOUS" + } + ] + }, + { + "syscall": "mmap", + "comment": "Used by pmem device to overlay file mapping on anonymous region", + "args": [ + { + "index": 3, + "type": "dword", + "op": "eq", + "val": 16401, + "comment": "libc::MAP_SHARED | libc::MAP_NORESERVE | libc::MAP_FIXED" + } + ] + }, + { + "syscall": "mmap", + "comment": "Used by IovDeque ring buffer for net device hotplug", + "args": [ + { + "index": 3, + "type": "dword", + "op": "eq", + "val": 17, + "comment": "libc::MAP_SHARED | libc::MAP_FIXED" + } + ] + }, + { + "syscall": "memfd_create", + "comment": "Used by IovDeque ring buffer for net device hotplug" + }, + { + "syscall": "fcntl", + "comment": "Used by IovDeque to seal memfd during net device hotplug", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1033, + "comment": "F_ADD_SEALS" + } + ] + }, { "syscall": "rt_sigaction", "comment": "rt_sigaction is used by libc::abort during a panic to install the default handler for SIGABRT", @@ -353,6 +422,26 @@ } ] }, + { + "syscall": "timerfd_create", + "comment": "Needed for creating rate limiters during device hotplug", + "args": [ + { + "index": 0, + "type": "dword", + "op": "eq", + "val": 1, + "comment": "CLOCK_MONOTONIC" + }, + { + 
"index": 1, + "type": "dword", + "op": "eq", + "val": 526336, + "comment": "TFD_NONBLOCK | TFD_CLOEXEC" + } + ] + }, { "syscall": "timerfd_settime", "comment": "Needed for rate limiting and metrics", @@ -465,6 +554,58 @@ } ] }, + { + "syscall": "ioctl", + "comment": "Needed for registering ioeventfds during device hotplug", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1077980793, + "comment": "KVM_IOEVENTFD" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Needed for opening tap device during net hotplug", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1074025674, + "comment": "TUNSETIFF" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Needed for configuring tap offload during net hotplug", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1074025680, + "comment": "TUNSETOFFLOAD" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Needed for setting tap vnet header size during net hotplug", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1074025688, + "comment": "TUNSETVNETHDRSZ" + } + ] + }, { "syscall": "sched_yield", "comment": "Used by the rust standard library in std::sync::mpmc. 
Firecracker uses mpsc channels from this module for inter-thread communication" diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index 4ccbfbd8e50..a3708da86bb 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -284,6 +284,75 @@ } ] }, + { + "syscall": "mmap", + "comment": "Used by pmem device to mmap the backing file", + "args": [ + { + "index": 3, + "type": "dword", + "op": "eq", + "val": 16385, + "comment": "libc::MAP_SHARED | libc::MAP_NORESERVE" + } + ] + }, + { + "syscall": "mmap", + "comment": "Used by pmem device for aligned anonymous mapping", + "args": [ + { + "index": 3, + "type": "dword", + "op": "eq", + "val": 16418, + "comment": "libc::MAP_PRIVATE | libc::MAP_NORESERVE | libc::MAP_ANONYMOUS" + } + ] + }, + { + "syscall": "mmap", + "comment": "Used by pmem device to overlay file mapping on anonymous region", + "args": [ + { + "index": 3, + "type": "dword", + "op": "eq", + "val": 16401, + "comment": "libc::MAP_SHARED | libc::MAP_NORESERVE | libc::MAP_FIXED" + } + ] + }, + { + "syscall": "mmap", + "comment": "Used by IovDeque ring buffer for net device hotplug", + "args": [ + { + "index": 3, + "type": "dword", + "op": "eq", + "val": 17, + "comment": "libc::MAP_SHARED | libc::MAP_FIXED" + } + ] + }, + { + "syscall": "memfd_create", + "comment": "Used by IovDeque ring buffer for net device hotplug" + }, + { + "syscall": "fcntl", + "comment": "Used by IovDeque to seal memfd during net device hotplug", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1033, + "comment": "F_ADD_SEALS" + } + ] + }, { "syscall": "rt_sigaction", "comment": "rt_sigaction is used by libc::abort during a panic to install the default handler for SIGABRT", @@ -353,6 +422,26 @@ } ] }, + { + "syscall": "timerfd_create", + "comment": "Needed for creating rate limiters during device hotplug", + "args": [ + { + "index": 0, + "type": 
"dword", + "op": "eq", + "val": 1, + "comment": "CLOCK_MONOTONIC" + }, + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 526336, + "comment": "TFD_NONBLOCK | TFD_CLOEXEC" + } + ] + }, { "syscall": "timerfd_settime", "comment": "Needed for rate limiting and metrics", @@ -477,6 +566,58 @@ } ] }, + { + "syscall": "ioctl", + "comment": "Needed for registering ioeventfds during device hotplug", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1077980793, + "comment": "KVM_IOEVENTFD" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Needed for opening tap device during net hotplug", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1074025674, + "comment": "TUNSETIFF" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Needed for configuring tap offload during net hotplug", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1074025680, + "comment": "TUNSETOFFLOAD" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Needed for setting tap vnet header size during net hotplug", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1074025688, + "comment": "TUNSETVNETHDRSZ" + } + ] + }, { "syscall": "sched_yield", "comment": "Used by the rust standard library in std::sync::mpmc. 
Firecracker uses mpsc channels from this module for inter-thread communication" diff --git a/src/firecracker/src/api_server_adapter.rs b/src/firecracker/src/api_server_adapter.rs index a5036b4e29a..ed59c3a92f5 100644 --- a/src/firecracker/src/api_server_adapter.rs +++ b/src/firecracker/src/api_server_adapter.rs @@ -41,6 +41,7 @@ struct ApiServerAdapter { from_api: Receiver, to_api: Sender, controller: RuntimeApiController, + request: Option, } impl ApiServerAdapter { @@ -58,12 +59,17 @@ impl ApiServerAdapter { from_api, to_api, controller: RuntimeApiController::new(vmm.clone()), + request: None, })); - event_manager.add_subscriber(api_adapter); + event_manager.add_subscriber(api_adapter.clone()); loop { event_manager .run() .expect("EventManager events driver fatal error"); + api_adapter + .lock() + .expect("Poisoned lock") + .handle_request(event_manager); match vmm.lock().unwrap().shutdown_exit_code() { Some(FcExitCode::Ok) => break, @@ -74,14 +80,39 @@ impl ApiServerAdapter { Ok(()) } - fn handle_request(&mut self, req_action: VmmAction) { - let response = self.controller.handle_request(req_action); + fn _handle_request(&mut self, req_action: VmmAction, event_manager: &mut EventManager) { + let response = self.controller.handle_request(req_action, event_manager); // Send back the result. self.to_api .send(Box::new(response)) .map_err(|_| ()) .expect("one-shot channel closed"); } + + fn handle_request(&mut self, event_manager: &mut EventManager) { + if let Some(api_request) = self.request.take() { + let request_is_pause = *api_request == VmmAction::Pause; + self._handle_request(*api_request, event_manager); + + // If the latest req is a pause request, temporarily switch to a mode where we + // do blocking `recv`s on the `from_api` receiver in a loop, until we get + // unpaused. The device emulation is implicitly paused since we do not + // relinquish control to the event manager because we're not returning from + // `process`. 
+ if request_is_pause { + // This loop only attempts to process API requests, so things like the + // metric flush timerfd handling are frozen as well. + loop { + let req = self.from_api.recv().expect("Error receiving API request."); + let req_is_resume = *req == VmmAction::Resume; + self._handle_request(*req, event_manager); + if req_is_resume { + break; + } + } + } + } + } } impl MutEventSubscriber for ApiServerAdapter { /// Handle a read event (EPOLLIN). @@ -93,26 +124,7 @@ impl MutEventSubscriber for ApiServerAdapter { let _ = self.api_event_fd.read(); match self.from_api.try_recv() { Ok(api_request) => { - let request_is_pause = *api_request == VmmAction::Pause; - self.handle_request(*api_request); - - // If the latest req is a pause request, temporarily switch to a mode where we - // do blocking `recv`s on the `from_api` receiver in a loop, until we get - // unpaused. The device emulation is implicitly paused since we do not - // relinquish control to the event manager because we're not returning from - // `process`. - if request_is_pause { - // This loop only attempts to process API requests, so things like the - // metric flush timerfd handling are frozen as well. 
- loop { - let req = self.from_api.recv().expect("Error receiving API request."); - let req_is_resume = *req == VmmAction::Resume; - self.handle_request(*req); - if req_is_resume { - break; - } - } - } + self.request = Some(api_request); } Err(TryRecvError::Empty) => { warn!("Got a spurious notification from api thread"); diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 3c94d244db7..0cd7cf27cbe 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -733,7 +733,7 @@ fn attach_pmem_devices<'a, I: Iterator>> + Debug>( false => cmdline.insert_str("rw")?, } } - locked_dev.alloc_region(vm.as_ref()); + locked_dev.alloc_region(vm.as_ref())?; locked_dev.set_mem_region(vm.as_ref())?; locked_dev.config.id.to_string() }; diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index 902828aa6b6..7d2e41e72cf 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -35,8 +35,13 @@ use crate::devices::virtio::vsock::persist::{ use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend}; use crate::pci::bus::PciRootError; use crate::resources::VmResources; +use crate::rpc_interface::VmmActionError; use crate::snapshot::Persist; +use crate::vmm_config::HotplugDeviceConfig; +use crate::vmm_config::drive::{BlockDeviceConfig, DriveError}; use crate::vmm_config::memory_hotplug::MemoryHotplugConfig; +use crate::vmm_config::net::{NetBuilder, NetworkInterfaceConfig, NetworkInterfaceError}; +use crate::vmm_config::pmem::{PmemConfig, PmemConfigError}; use crate::vstate::bus::BusError; use crate::vstate::interrupts::InterruptError; use crate::vstate::memory::GuestMemoryMmap; @@ -136,13 +141,11 @@ impl PciDevices { Ok(()) } - pub(crate) fn attach_pci_virtio_device< - T: 'static + VirtioDevice + MutEventSubscriber + Debug, - >( + pub(crate) fn attach_pci_virtio_device( &mut self, vm: &Arc, id: String, - device: Arc>, + device: Arc>, event_manager: &mut EventManager, ) -> Result<(), 
PciManagerError> { // We should only be reaching this point if PCI is enabled @@ -232,6 +235,84 @@ impl PciDevices { f(*device_type, &*device); } } + + /// Attaches a device after VM start + pub fn hotplug_device( + &mut self, + vm: &Arc, + config: HotplugDeviceConfig, + event_manager: &mut EventManager, + ) -> Result<(), VmmActionError> { + if self.pci_segment.is_none() { + return Err(VmmActionError::PciNotEnabled); + } + + let dev_type = config.device_type(); + let dev_id = config.device_id().to_string(); + + if self + .virtio_devices + .contains_key(&(dev_type, dev_id.clone())) + { + return Err(VmmActionError::DeviceIdInUse); + } + + let device = match config { + HotplugDeviceConfig::Block(cfg) => Self::hotplug_make_block(cfg)?, + HotplugDeviceConfig::Pmem(cfg) => Self::hotplug_make_pmem(vm, cfg)?, + HotplugDeviceConfig::Net(cfg) => self.hotplug_make_net(cfg)?, + }; + + self.attach_pci_virtio_device(vm, dev_id, device, event_manager)?; + Ok(()) + } + + fn hotplug_make_block( + config: BlockDeviceConfig, + ) -> Result>, VmmActionError> { + if config.is_root_device { + return Err(DriveError::RootBlockDeviceAlreadyAdded.into()); + } + + let block = Block::new(config).map_err(DriveError::CreateBlockDevice)?; + Ok(Arc::new(Mutex::new(block))) + } + + fn hotplug_make_pmem( + vm: &Arc, + config: PmemConfig, + ) -> Result>, VmmActionError> { + if config.root_device { + return Err(PmemConfigError::AddingSecondRootDevice.into()); + } + + let mut pmem = Pmem::new(config).map_err(PmemConfigError::from)?; + pmem.alloc_region(vm.as_ref()).map_err(PmemConfigError::from)?; + pmem.set_mem_region(vm.as_ref()) + .map_err(PmemConfigError::from)?; + + Ok(Arc::new(Mutex::new(pmem))) + } + + fn hotplug_make_net( + &self, + config: NetworkInterfaceConfig, + ) -> Result>, VmmActionError> { + if let Some(mac) = config.guest_mac { + for device in self.virtio_devices.values() { + let device = device.lock().expect("Poisoned lock").virtio_device(); + let device = device.lock().expect("Poisoned lock"); + if let 
Some(net) = device.as_any().downcast_ref::() + && net.guest_mac() == Some(&mac) + { + return Err(NetworkInterfaceError::GuestMacAddressInUse(mac.to_string()).into()); + } + } + } + + let net = NetBuilder::create_net(config)?; + Ok(Arc::new(Mutex::new(net))) + } } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -857,4 +938,163 @@ mod tests { serde_json::to_string_pretty(&VmmConfig::from(&*vm_resources)).unwrap() ); } + + fn make_hotplug_block_cfg(drive_id: &str, f: &TempFile, is_root: bool) -> HotplugDeviceConfig { + HotplugDeviceConfig::Block(BlockDeviceConfig { + drive_id: drive_id.to_string(), + partuuid: None, + is_root_device: is_root, + cache_type: CacheType::Unsafe, + is_read_only: Some(false), + path_on_host: Some(f.as_path().to_str().unwrap().to_string()), + rate_limiter: None, + file_engine_type: None, + socket: None, + }) + } + + #[test] + fn test_hotplug_block() { + let mut evt_manager = EventManager::new().unwrap(); + let mut vmm = default_vmm(); + vmm.device_manager.enable_pci(&vmm.vm).unwrap(); + let f = TempFile::new().unwrap(); + + // Successful case + let cfg = make_hotplug_block_cfg("block0", &f, false); + vmm.device_manager + .pci_devices + .hotplug_device(&vmm.vm, cfg, &mut evt_manager) + .unwrap(); + assert!( + vmm.device_manager + .pci_devices + .virtio_devices + .contains_key(&(VirtioDeviceType::Block, "block0".to_string())) + ); + + // Duplicate device ID is rejected + let cfg2 = make_hotplug_block_cfg("block0", &f, false); + assert!(matches!( + vmm.device_manager + .pci_devices + .hotplug_device(&vmm.vm, cfg2, &mut evt_manager), + Err(VmmActionError::DeviceIdInUse) + )); + + // Root block device is rejected + let cfg3 = make_hotplug_block_cfg("block1", &f, true); + assert!(matches!( + vmm.device_manager + .pci_devices + .hotplug_device(&vmm.vm, cfg3, &mut evt_manager), + Err(VmmActionError::DriveConfig( + DriveError::RootBlockDeviceAlreadyAdded + )) + )); + } + + #[test] + fn test_hotplug_pci_not_enabled() { + let mut vmm = 
default_vmm(); + let mut evt_manager = EventManager::new().unwrap(); + let f = TempFile::new().unwrap(); + + let cfg = make_hotplug_block_cfg("block0", &f, false); + assert!(matches!( + vmm.device_manager + .pci_devices + .hotplug_device(&vmm.vm, cfg, &mut evt_manager), + Err(VmmActionError::PciNotEnabled) + )); + } + + #[test] + fn test_hotplug_pmem() { + let mut vmm = default_vmm(); + vmm.device_manager.enable_pci(&vmm.vm).unwrap(); + let mut evt_manager = EventManager::new().unwrap(); + let f = TempFile::new().unwrap(); + f.as_file().set_len(0x1000).unwrap(); + + // Successful case + let cfg = HotplugDeviceConfig::Pmem(PmemConfig { + id: "pmem0".to_string(), + path_on_host: f.as_path().to_str().unwrap().to_string(), + root_device: false, + read_only: false, + }); + vmm.device_manager + .pci_devices + .hotplug_device(&vmm.vm, cfg, &mut evt_manager) + .unwrap(); + assert!( + vmm.device_manager + .pci_devices + .virtio_devices + .contains_key(&(VirtioDeviceType::Pmem, "pmem0".to_string())) + ); + + // Root pmem device is rejected + let f2 = TempFile::new().unwrap(); + let cfg2 = HotplugDeviceConfig::Pmem(PmemConfig { + id: "pmem1".to_string(), + path_on_host: f2.as_path().to_str().unwrap().to_string(), + root_device: true, + read_only: false, + }); + assert!(matches!( + vmm.device_manager + .pci_devices + .hotplug_device(&vmm.vm, cfg2, &mut evt_manager), + Err(VmmActionError::PmemConfig( + PmemConfigError::AddingSecondRootDevice + )) + )); + } + + #[test] + fn test_hotplug_net() { + let mut vmm = default_vmm(); + vmm.device_manager.enable_pci(&vmm.vm).unwrap(); + let mut evt_manager = EventManager::new().unwrap(); + + let mac = "AA:FC:00:00:00:01"; + + // Successful case + let cfg = HotplugDeviceConfig::Net(NetworkInterfaceConfig { + iface_id: "eth0".to_string(), + host_dev_name: "hostname".to_string(), + guest_mac: Some(mac.parse().unwrap()), + rx_rate_limiter: None, + tx_rate_limiter: None, + }); + vmm.device_manager + .pci_devices + .hotplug_device(&vmm.vm, cfg, 
&mut evt_manager) + .unwrap(); + assert!( + vmm.device_manager + .pci_devices + .virtio_devices + .contains_key(&(VirtioDeviceType::Net, "eth0".to_string())) + ); + + // Duplicate MAC is rejected + let cfg2 = HotplugDeviceConfig::Net(NetworkInterfaceConfig { + iface_id: "eth1".to_string(), + host_dev_name: "hostname2".to_string(), + guest_mac: Some(mac.parse().unwrap()), + rx_rate_limiter: None, + tx_rate_limiter: None, + }); + assert!(matches!( + vmm.device_manager + .pci_devices + .hotplug_device(&vmm.vm, cfg2, &mut evt_manager), + Err(VmmActionError::NetworkConfig( + NetworkInterfaceError::GuestMacAddressInUse(_) + )) + )); + } } diff --git a/src/vmm/src/devices/virtio/pmem/device.rs b/src/vmm/src/devices/virtio/pmem/device.rs index e05a3291c9d..78edffd19f9 100644 --- a/src/vmm/src/devices/virtio/pmem/device.rs +++ b/src/vmm/src/devices/virtio/pmem/device.rs @@ -30,6 +30,8 @@ use crate::{Vm, impl_device_type}; #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum PmemError { + /// Failed to allocate memory region + AllocationFailed, /// Cannot set the memory regions: {0} SetUserMemoryRegion(VmError), /// Unablet to allocate a KVM slot for the device @@ -213,18 +215,20 @@ impl Pmem { } /// Allocate memory in past_mmio64 memory region - pub fn alloc_region(&mut self, vm: &Vm) { + pub fn alloc_region(&mut self, vm: &Vm) -> Result<(), PmemError> { let mut resource_allocator_lock = vm.resource_allocator(); let resource_allocator = resource_allocator_lock.deref_mut(); - let addr = resource_allocator - .past_mmio64_memory - .allocate( - self.config_space.size, - Pmem::ALIGNMENT, - AllocPolicy::FirstMatch, - ) - .unwrap(); - self.config_space.start = addr.start(); + let addr = resource_allocator.past_mmio64_memory.allocate( + self.config_space.size, + Pmem::ALIGNMENT, + AllocPolicy::FirstMatch, + ); + if let Ok(addr) = addr { + self.config_space.start = addr.start(); + } else { + return Err(PmemError::AllocationFailed); + } + Ok(()) } /// Set user memory 
region in KVM diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index fa8a5b7b667..932ba8e9a67 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -153,6 +153,8 @@ use crate::mmds::data_store::Mmds; use crate::persist::{MicrovmState, MicrovmStateError, VmInfo}; use crate::rate_limiter::BucketUpdate; use crate::resources::VmmConfig; +use crate::rpc_interface::VmmActionError; +use crate::vmm_config::HotplugDeviceConfig; use crate::vmm_config::balloon::BalloonDeviceConfig; use crate::vmm_config::boot_source::BootSourceConfig; use crate::vmm_config::entropy::EntropyDeviceConfig; @@ -782,6 +784,18 @@ impl Vmm { pub fn vm(&self) -> &Vm { &self.vm } + + /// Attaches a device after VM start + #[inline] + pub fn hotplug_device( + &mut self, + config: HotplugDeviceConfig, + event_manager: &mut EventManager, + ) -> Result<(), VmmActionError> { + self.device_manager + .pci_devices + .hotplug_device(&self.vm, config, event_manager) + } } /// Process the content of the MPIDR_EL1 register in order to be able to pass it to KVM diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index cc45aafe16f..bcca41ca1ad 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -62,9 +62,9 @@ pub enum ResourcesError { /// Vsock device error: {0} VsockDevice(#[from] VsockConfigError), /// Entropy device error: {0} - EntropyDevice(#[from] EntropyDeviceError), + EntropyConfig(#[from] EntropyDeviceError), /// Pmem device error: {0} - PmemDevice(#[from] PmemConfigError), + PmemConfig(#[from] PmemConfigError), /// Memory hotplug config error: {0} MemoryHotplugConfig(#[from] MemoryHotplugConfigError), } diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index 4617890a0e4..4ede424d2eb 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -14,6 +14,7 @@ use super::{Vmm, VmmError}; use crate::EventManager; use crate::builder::StartMicrovmError; use crate::cpu_config::templates::{CustomCpuTemplate, GuestConfigError}; 
+use crate::device_manager::pci_mngr::PciManagerError; use crate::devices::virtio::balloon::device::{HintingStatus, StartHintingCmd}; use crate::devices::virtio::mem::VirtioMemStatus; use crate::logger::{LoggerConfig, info, warn, *}; @@ -21,6 +22,7 @@ use crate::mmds::data_store::{self, Mmds, MmdsDatastoreError}; use crate::persist::{CreateSnapshotError, RestoreFromSnapshotError, VmInfo}; use crate::resources::VmmConfig; use crate::seccomp::BpfThreadMap; +use crate::vmm_config::HotplugDeviceConfig; use crate::vmm_config::balloon::{ BalloonConfigError, BalloonDeviceConfig, BalloonStats, BalloonUpdateConfig, BalloonUpdateStatsConfig, @@ -163,10 +165,10 @@ pub enum VmmActionError { ConfigureCpu(#[from] GuestConfigError), /// Drive config error: {0} DriveConfig(#[from] DriveError), - /// Entropy device error: {0} - EntropyDevice(#[from] EntropyDeviceError), - /// Pmem device error: {0} - PmemDevice(#[from] PmemConfigError), + /// Entropy config error: {0} + EntropyConfig(#[from] EntropyDeviceError), + /// Pmem config error: {0} + PmemConfig(#[from] PmemConfigError), /// Memory hotplug config error: {0} MemoryHotplugConfig(#[from] MemoryHotplugConfigError), /// Memory hotplug update error: {0} @@ -201,6 +203,12 @@ pub enum VmmActionError { StartMicrovm(#[from] StartMicrovmError), /// Vsock config error: {0} VsockConfig(#[from] VsockConfigError), + /// Device ID in use + DeviceIdInUse, + /// PCI is not enabled + PciNotEnabled, + /// PCI manager error: {0} + PciManager(#[from] PciManagerError), } /// The enum represents the response sent by the VMM in case of success. 
The response is either @@ -533,7 +541,7 @@ impl<'a> PrebootApiController<'a> { self.vm_resources .build_pmem_device(cfg) .map(|()| VmmData::Empty) - .map_err(VmmActionError::PmemDevice) + .map_err(VmmActionError::PmemConfig) } fn set_balloon_device(&mut self, cfg: BalloonDeviceConfig) -> Result { @@ -674,7 +682,11 @@ pub struct RuntimeApiController { impl RuntimeApiController { /// Handles the incoming runtime `VmmAction` request and provides a response for it. - pub fn handle_request(&mut self, request: VmmAction) -> Result { + pub fn handle_request( + &mut self, + request: VmmAction, + event_manager: &mut EventManager, + ) -> Result { use self::VmmAction::*; match request { // Supported operations allowed post-boot. @@ -737,6 +749,24 @@ impl RuntimeApiController { .expect("Poisoned lock"), value, ), + InsertBlockDevice(config) => self + .vmm + .lock() + .expect("Poisoned lock") + .hotplug_device(HotplugDeviceConfig::Block(config), event_manager) + .map(|()| VmmData::Empty), + InsertPmemDevice(config) => self + .vmm + .lock() + .expect("Poisoned lock") + .hotplug_device(HotplugDeviceConfig::Pmem(config), event_manager) + .map(|()| VmmData::Empty), + InsertNetworkDevice(config) => self + .vmm + .lock() + .expect("Poisoned lock") + .hotplug_device(HotplugDeviceConfig::Net(config), event_manager) + .map(|()| VmmData::Empty), Pause => self.pause(), PutMMDS(value) => mmds_put_data( self.vmm @@ -800,9 +830,6 @@ impl RuntimeApiController { | ConfigureLogger(_) | ConfigureMetrics(_) | ConfigureSerial(_) - | InsertBlockDevice(_) - | InsertPmemDevice(_) - | InsertNetworkDevice(_) | LoadSnapshot(_) | PutCpuConfiguration(_) | SetBalloonDevice(_) @@ -970,7 +997,6 @@ mod tests { use super::*; use crate::HTTP_MAX_PAYLOAD_SIZE; use crate::builder::tests::default_vmm; - use crate::devices::virtio::block::CacheType; use crate::mmds::data_store::MmdsVersion; use crate::seccomp::BpfThreadMap; use crate::vmm_config::snapshot::{MemBackendConfig, MemBackendType}; @@ -1187,7 +1213,8 @@ 
mod tests { fn runtime_request(request: VmmAction) -> Result { let vmm = Arc::new(Mutex::new(default_vmm())); let mut runtime = RuntimeApiController::new(vmm.clone()); - runtime.handle_request(request) + let mut event_manager = EventManager::new().unwrap(); + runtime.handle_request(request, &mut event_manager) } #[test] @@ -1223,30 +1250,6 @@ mod tests { metrics_path: PathBuf::new(), }, ))); - check_unsupported(runtime_request(VmmAction::InsertBlockDevice( - BlockDeviceConfig { - drive_id: String::new(), - partuuid: None, - is_root_device: false, - cache_type: CacheType::Unsafe, - - is_read_only: Some(false), - path_on_host: Some(String::new()), - rate_limiter: None, - file_engine_type: None, - - socket: None, - }, - ))); - check_unsupported(runtime_request(VmmAction::InsertNetworkDevice( - NetworkInterfaceConfig { - iface_id: String::new(), - host_dev_name: String::new(), - guest_mac: None, - rx_rate_limiter: None, - tx_rate_limiter: None, - }, - ))); check_unsupported(runtime_request(VmmAction::SetVsockDevice( VsockDeviceConfig { vsock_id: Some(String::new()), @@ -1291,12 +1294,6 @@ mod tests { check_unsupported(runtime_request(VmmAction::SetEntropyDevice( EntropyDeviceConfig::default(), ))); - check_unsupported(runtime_request(VmmAction::InsertPmemDevice(PmemConfig { - id: String::new(), - path_on_host: String::new(), - root_device: false, - read_only: false, - }))); check_unsupported(runtime_request(VmmAction::SetMemoryHotplugDevice( MemoryHotplugConfig::default(), ))); diff --git a/src/vmm/src/vmm_config/mod.rs b/src/vmm/src/vmm_config/mod.rs index 9a4c104ce3a..f8fd0411d99 100644 --- a/src/vmm/src/vmm_config/mod.rs +++ b/src/vmm/src/vmm_config/mod.rs @@ -6,7 +6,11 @@ use std::io; use serde::{Deserialize, Serialize}; +use crate::devices::virtio::device::VirtioDeviceType; use crate::rate_limiter::{BucketUpdate, RateLimiter, TokenBucket}; +use crate::vmm_config::drive::BlockDeviceConfig; +use crate::vmm_config::net::NetworkInterfaceConfig; +use 
crate::vmm_config::pmem::PmemConfig; /// Wrapper for configuring the balloon device. pub mod balloon; @@ -36,6 +40,32 @@ pub mod snapshot; /// Wrapper for configuring the vsock devices attached to the microVM. pub mod vsock; +#[allow(missing_docs)] +#[derive(Debug)] +pub enum HotplugDeviceConfig { + Block(BlockDeviceConfig), + Pmem(PmemConfig), + Net(NetworkInterfaceConfig), +} + +impl HotplugDeviceConfig { + pub(crate) fn device_id(&self) -> &str { + match self { + Self::Block(cfg) => &cfg.drive_id, + Self::Pmem(cfg) => &cfg.id, + Self::Net(cfg) => &cfg.iface_id, + } + } + + pub(crate) fn device_type(&self) -> VirtioDeviceType { + match self { + Self::Block(_) => VirtioDeviceType::Block, + Self::Pmem(_) => VirtioDeviceType::Pmem, + Self::Net(_) => VirtioDeviceType::Net, + } + } +} + // TODO: Migrate the VMM public-facing code (i.e. interface) to use stateless structures, // for receiving data/args, such as the below `RateLimiterConfig` and `TokenBucketConfig`. // Also todo: find a better suffix than `Config`; it should illustrate the static nature diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 4d58b95a426..73426681e8a 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -103,14 +103,21 @@ fn test_build_microvm() { fn pause_resume_microvm(vmm: Arc>) { let mut api_controller = RuntimeApiController::new(vmm.clone()); + let mut event_manager = EventManager::new().unwrap(); // There's a race between this thread and the vcpu thread, but this thread // should be able to pause vcpu thread before it finishes running its test-binary. - api_controller.handle_request(VmmAction::Pause).unwrap(); + api_controller + .handle_request(VmmAction::Pause, &mut event_manager) + .unwrap(); // Pausing again the microVM should not fail (microVM remains in the // `Paused` state). 
- api_controller.handle_request(VmmAction::Pause).unwrap(); - api_controller.handle_request(VmmAction::Resume).unwrap(); + api_controller + .handle_request(VmmAction::Pause, &mut event_manager) + .unwrap(); + api_controller + .handle_request(VmmAction::Resume, &mut event_manager) + .unwrap(); vmm.lock().unwrap().stop(FcExitCode::Ok); } @@ -213,12 +220,15 @@ fn verify_create_snapshot( let vm_info = VmInfo::from(&*vmm.lock().unwrap()); let mut controller = RuntimeApiController::new(vmm.clone()); + let mut event_manager = EventManager::new().unwrap(); // Be sure that the microVM is running. thread::sleep(Duration::from_millis(200)); // Pause microVM. - controller.handle_request(VmmAction::Pause).unwrap(); + controller + .handle_request(VmmAction::Pause, &mut event_manager) + .unwrap(); // Create snapshot. let snapshot_type = match is_diff { @@ -232,7 +242,10 @@ fn verify_create_snapshot( }; controller - .handle_request(VmmAction::CreateSnapshot(snapshot_params)) + .handle_request( + VmmAction::CreateSnapshot(snapshot_params), + &mut event_manager, + ) .unwrap(); vmm.lock().unwrap().stop(FcExitCode::Ok); diff --git a/tests/integration_tests/functional/test_api.py b/tests/integration_tests/functional/test_api.py index aa3c8b050d1..f4514eb1789 100644 --- a/tests/integration_tests/functional/test_api.py +++ b/tests/integration_tests/functional/test_api.py @@ -452,7 +452,7 @@ def test_api_cpu_config(uvm_plain, custom_cpu_template): test_microvm.api.cpu_config.put(**custom_cpu_template["template"]) -def test_api_put_update_post_boot(uvm_plain, io_engine): +def test_api_put_update_post_boot(uvm_plain): """ Test that PUT updates are rejected after the microvm boots. """ @@ -486,22 +486,6 @@ def test_api_put_update_post_boot(uvm_plain, io_engine): with pytest.raises(RuntimeError, match=NOT_SUPPORTED_AFTER_START): test_microvm.api.machine_config.put(vcpu_count=4, mem_size_mib=128) - # Network interface update is not allowed after boot. 
- with pytest.raises(RuntimeError, match=NOT_SUPPORTED_AFTER_START): - test_microvm.api.network.put( - iface_id="1", host_dev_name=tap1.name, guest_mac="06:00:00:00:00:02" - ) - - # Block device update is not allowed after boot. - with pytest.raises(RuntimeError, match=NOT_SUPPORTED_AFTER_START): - test_microvm.api.drive.put( - drive_id="rootfs", - path_on_host=test_microvm.jailer.jailed_path(test_microvm.rootfs_file), - is_read_only=False, - is_root_device=True, - io_engine=io_engine, - ) - # MMDS config is not allowed post-boot. mmds_config = { "version": "V2", diff --git a/tests/integration_tests/functional/test_hotplug.py b/tests/integration_tests/functional/test_hotplug.py new file mode 100644 index 00000000000..6cbb909077e --- /dev/null +++ b/tests/integration_tests/functional/test_hotplug.py @@ -0,0 +1,283 @@ +# Copyright 2026 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Tests for PCI device hotplug""" + +import os + +import pytest + +import host_tools.drive as drive_tools +import host_tools.network as net_tools + +VIRTIO_PCI_VENDOR_ID = 0x1AF4 +VIRTIO_PCI_DEVICE_ID_NET = 0x1041 +VIRTIO_PCI_DEVICE_ID_BLOCK = 0x1042 +VIRTIO_PCI_DEVICE_ID_PMEM = 0x105B + + +def test_hotplug_block(microvm_factory, guest_kernel_acpi, rootfs): + """ + Test hotplugging a block device after VM start. + Test that the device appears in lspci and is usable. + Test that invalid hotplug request are rejected. 
+ """ + vm = microvm_factory.build(guest_kernel_acpi, rootfs, pci=True) + vm.spawn() + vm.basic_config() + vm.add_net_iface() + vm.start() + + # Snapshot lspci output before hotplug + _, lspci_before, _ = vm.ssh.check_output("lspci -n") + + # Hotplug a block device + host_file = drive_tools.FilesystemFile(os.path.join(vm.fsfiles, "block0"), size=4) + vm.api.drive.put( + drive_id="block0", + path_on_host=vm.create_jailed_resource(host_file.path), + is_root_device=False, + is_read_only=False, + ) + + # Rescan PCI bus since no hotplug notification mechanism exists yet + vm.ssh.check_output("echo 1 > /sys/bus/pci/rescan") + + # Verify a new virtio-block device entry appeared in lspci + _, lspci_after, _ = vm.ssh.check_output("lspci -n") + new_entries = set(lspci_after.splitlines()) - set(lspci_before.splitlines()) + assert len(new_entries) == 1 + entry = new_entries.pop() + assert f"{VIRTIO_PCI_VENDOR_ID:04x}:{VIRTIO_PCI_DEVICE_ID_BLOCK:04x}" in entry + + # Discover the block device node from the PCI BDF via sysfs + bdf = entry.split()[0] + _, dev_name, _ = vm.ssh.check_output( + f"ls /sys/bus/pci/devices/0000:{bdf}/virtio*/block/" + ) + dev_path = f"/dev/{dev_name.strip()}" + + # Ensure the device is usable by writing a file to it and reading it back + vm.ssh.check_output("mkdir -p /tmp/block0_mnt") + vm.ssh.check_output(f"mount {dev_path} /tmp/block0_mnt") + vm.ssh.check_output("echo hotplug_test > /tmp/block0_mnt/test") + _, stdout, _ = vm.ssh.check_output("cat /tmp/block0_mnt/test") + assert stdout.strip() == "hotplug_test" + + # Hotplugging a device with a duplicate ID must be rejected + with pytest.raises(RuntimeError, match="Device ID in use"): + vm.api.drive.put( + drive_id="block0", + path_on_host=vm.create_jailed_resource(host_file.path), + is_root_device=False, + is_read_only=False, + ) + + # Hotplugging a root device must be rejected + with pytest.raises(RuntimeError, match="A root block device already exists"): + vm.api.drive.put( + drive_id="block_root", 
+ path_on_host=vm.create_jailed_resource(host_file.path), + is_root_device=True, + is_read_only=False, + ) + + # Verify no further devices appeared after the rejected requests + vm.ssh.check_output("echo 1 > /sys/bus/pci/rescan") + _, lspci_final, _ = vm.ssh.check_output("lspci -n") + assert lspci_final == lspci_after + + +def test_hotplug_pmem(microvm_factory, guest_kernel_acpi, rootfs): + """ + Test hotplugging a pmem device after VM start. + Test that the device appears in lspci and is usable. + Test that invalid hotplug requests are rejected. + """ + vm = microvm_factory.build(guest_kernel_acpi, rootfs, pci=True) + vm.spawn() + vm.basic_config() + vm.add_net_iface() + vm.start() + + # Snapshot lspci output before hotplug + _, lspci_before, _ = vm.ssh.check_output("lspci -n") + + # Hotplug a pmem device + host_file = drive_tools.FilesystemFile(os.path.join(vm.fsfiles, "pmem0"), size=4) + vm.api.pmem.put( + id="pmem0", + path_on_host=vm.create_jailed_resource(host_file.path), + root_device=False, + read_only=False, + ) + + # Rescan PCI bus since no hotplug notification mechanism exists yet + vm.ssh.check_output("echo 1 > /sys/bus/pci/rescan") + + # Verify a new virtio-pmem device entry appeared in lspci + _, lspci_after, _ = vm.ssh.check_output("lspci -n") + new_entries = set(lspci_after.splitlines()) - set(lspci_before.splitlines()) + assert len(new_entries) == 1 + entry = new_entries.pop() + assert f"{VIRTIO_PCI_VENDOR_ID:04x}:{VIRTIO_PCI_DEVICE_ID_PMEM:04x}" in entry + + # Discover the pmem device node from the PCI BDF via sysfs. + # The NVDIMM subsystem in the guest creates the ndbus/region/namespace/block + # hierarchy asynchronously after driver probe, so we need to wait for it.
+ vm.ssh.check_output("sleep 1") + bdf = entry.split()[0] + _, dev_name, _ = vm.ssh.check_output( + f"ls /sys/bus/pci/devices/0000:{bdf}/virtio*/ndbus*/region*/namespace*/block/" + ) + dev_path = f"/dev/{dev_name.strip()}" + + # Ensure the device is usable by writing a file to it and reading it back + vm.ssh.check_output("mkdir -p /tmp/pmem0_mnt") + vm.ssh.check_output(f"mount {dev_path} /tmp/pmem0_mnt") + vm.ssh.check_output("echo hotplug_test > /tmp/pmem0_mnt/test") + _, stdout, _ = vm.ssh.check_output("cat /tmp/pmem0_mnt/test") + assert stdout.strip() == "hotplug_test" + + # Hotplugging a root pmem device must be rejected + with pytest.raises(RuntimeError, match="Attempt to add pmem as a root device"): + vm.api.pmem.put( + id="pmem_root", + path_on_host=vm.create_jailed_resource(host_file.path), + root_device=True, + read_only=False, + ) + + # Hotplugging a device with a duplicate ID must be rejected + with pytest.raises(RuntimeError, match="Device ID in use"): + vm.api.pmem.put( + id="pmem0", + path_on_host=vm.create_jailed_resource(host_file.path), + root_device=False, + read_only=False, + ) + + # Verify no further devices appeared after the rejected requests + vm.ssh.check_output("echo 1 > /sys/bus/pci/rescan") + _, lspci_final, _ = vm.ssh.check_output("lspci -n") + assert lspci_final == lspci_after + + +def test_hotplug_net(microvm_factory, guest_kernel_acpi, rootfs): + """ + Test hotplugging a net device after VM start. + Test that the device appears in lspci and is usable. + Test that invalid hotplug requests are rejected.
+ """ + vm = microvm_factory.build(guest_kernel_acpi, rootfs, pci=True) + vm.spawn() + vm.basic_config() + vm.add_net_iface() + vm.start() + + # Snapshot lspci output before hotplug + _, lspci_before, _ = vm.ssh.check_output("lspci -n") + + # Hotplug a network device + iface1 = net_tools.NetIfaceConfig.with_id(1) + vm.netns.add_tap(iface1.tap_name, ip=f"{iface1.host_ip}/{iface1.netmask_len}") + vm.api.network.put( + iface_id=iface1.dev_name, + host_dev_name=iface1.tap_name, + guest_mac=iface1.guest_mac, + ) + + # Rescan PCI bus since no hotplug notification mechanism exists yet + vm.ssh.check_output("echo 1 > /sys/bus/pci/rescan") + + # Verify a new net device entry appeared in lspci + _, lspci_after, _ = vm.ssh.check_output("lspci -n") + new_entries = set(lspci_after.splitlines()) - set(lspci_before.splitlines()) + assert len(new_entries) == 1 + entry = new_entries.pop() + assert f"{VIRTIO_PCI_VENDOR_ID:04x}:{VIRTIO_PCI_DEVICE_ID_NET:04x}" in entry + + # Discover the net interface name from the PCI BDF via sysfs + bdf = entry.split()[0] + _, iface_name, _ = vm.ssh.check_output( + f"ls /sys/bus/pci/devices/0000:{bdf}/virtio*/net/" + ) + iface_name = iface_name.strip() + + # Verify the hotplugged interface is usable + vm.ssh.check_output(f"ip link show {iface_name}") + vm.ssh.check_output( + f"ip addr add {iface1.guest_ip}/{iface1.netmask_len} dev {iface_name}" + ) + vm.ssh.check_output(f"ip link set {iface_name} up") + + # Ping the host from the guest through the hotplugged interface + _, stdout, _ = vm.ssh.check_output(f"ping -c 3 -W 3 {iface1.host_ip}") + assert "3 packets transmitted, 3 received" in stdout + + # Hotplugging a device with a duplicate ID must be rejected + iface2 = net_tools.NetIfaceConfig.with_id(2) + with pytest.raises(RuntimeError, match="Device ID in use"): + vm.api.network.put( + iface_id=iface1.dev_name, + host_dev_name=iface2.tap_name, + guest_mac=iface2.guest_mac, + ) + + # Hotplugging a device with a duplicate MAC must be rejected + with 
pytest.raises(RuntimeError, match="The MAC address is already in use"): + vm.api.network.put( + iface_id=iface2.dev_name, + host_dev_name=iface2.tap_name, + guest_mac=iface1.guest_mac, + ) + + # Hotplugging a device that reuses the same TAP must be rejected + with pytest.raises(RuntimeError, match="Resource busy"): + vm.api.network.put( + iface_id=iface2.dev_name, + host_dev_name=iface1.tap_name, + guest_mac=iface2.guest_mac, + ) + + # Verify no further devices appeared after the rejected requests + vm.ssh.check_output("echo 1 > /sys/bus/pci/rescan") + _, lspci_final, _ = vm.ssh.check_output("lspci -n") + assert lspci_final == lspci_after + + +def test_hotplug_no_pci(microvm_factory, guest_kernel_acpi, rootfs): + """ + Hotplugging any device type must be rejected when PCI is not enabled. + """ + vm = microvm_factory.build(guest_kernel_acpi, rootfs, pci=False) + vm.spawn() + vm.basic_config() + vm.add_net_iface() + vm.start() + + host_file = drive_tools.FilesystemFile(os.path.join(vm.fsfiles, "disk"), size=4) + + with pytest.raises(RuntimeError, match="PCI is not enabled"): + vm.api.drive.put( + drive_id="block0", + path_on_host=vm.create_jailed_resource(host_file.path), + is_root_device=False, + is_read_only=False, + ) + + with pytest.raises(RuntimeError, match="PCI is not enabled"): + vm.api.pmem.put( + id="pmem0", + path_on_host=vm.create_jailed_resource(host_file.path), + root_device=False, + read_only=False, + ) + + iface1 = net_tools.NetIfaceConfig.with_id(1) + vm.netns.add_tap(iface1.tap_name, ip=f"{iface1.host_ip}/{iface1.netmask_len}") + with pytest.raises(RuntimeError, match="PCI is not enabled"): + vm.api.network.put( + iface_id=iface1.dev_name, + host_dev_name=iface1.tap_name, + guest_mac=iface1.guest_mac, + )