diff --git a/CHANGELOG.md b/CHANGELOG.md index 790b81eb52d..821e516fd79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,18 @@ piggy-backing on `--http-api-max-payload-size`. If left unconfigured it defaults to the value of `--http-api-max-payload-size`, to provide backwards compatibility. +- Added optional `mem_backend` body field in `PUT` requests on `/snapshot/load`. + This new parameter is an object that defines the configuration of the backend + responsible for handling memory loading during snapshot restore. The + `mem_backend` parameter contains `backend_type` and `backend_path` required + fields. `backend_type` is an enum that can take either `File` or `Uffd` as + value. Interpretation of `backend_path` field depends on the value of + `backend_type`. If `File`, then the user must provide the path to file that + contains the guest memory to be loaded. Otherwise, if `backend_type` is `Uffd`, + then `backend_path` is the path to a unix domain socket where a custom page + fault handler process is listening and expecting a UFFD to be sent by + Firecracker. The UFFD is used to handle the guest memory page faults in the + separate process. ### Changed @@ -25,6 +37,7 @@ - MmdsV2 is now Generally Available. - MmdsV1 is now deprecated and will be removed in Firecracker v2.0.0. Use MmdsV2 instead. +- Deprecated `mem_file_path` body field in `PUT` on `/snapshot/load` request. 
### Fixed diff --git a/Cargo.lock b/Cargo.lock index 7588d046f0d..121a99469dc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,7 +17,7 @@ version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e8b47f52ea9bae42228d07ec09eb676433d7c4ed1ebdf0f1d1c29ed446f1ab8" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "cipher", "cpufeatures", "opaque-debug", @@ -107,6 +107,25 @@ dependencies = [ "serde", ] +[[package]] +name = "bindgen" +version = "0.59.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bd2a9a458e8f4304c52c43ebb0cfbd520289f8379a52e329a38afda99bf8eb8" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "lazy_static", + "lazycell", + "peeking_take_while", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -146,6 +165,27 @@ dependencies = [ "rustc_version", ] +[[package]] +name = "cc" +version = "1.0.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" + +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + [[package]] name = "cfg-if" version = "1.0.0" @@ -161,6 +201,17 @@ dependencies = [ "generic-array", ] +[[package]] +name = "clang-sys" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cc00842eed744b858222c4c9faf7243aafc6d33f92f96935263ef4d8a41ce21" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "clap" version = "2.34.0" @@ -238,7 +289,7 @@ version = "0.5.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "5aaa7bd5fb665c6864b5f963dd9097905c54125909c7aa94c9e18507cdbe6c53" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "crossbeam-utils", ] @@ -248,7 +299,7 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "crossbeam-epoch", "crossbeam-utils", ] @@ -259,7 +310,7 @@ version = "0.9.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c00d6d2ea26e8b151d99093005cb442fb9a37aeaca582a03ec70946f49ab5ed9" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "crossbeam-utils", "lazy_static", "memoffset", @@ -272,7 +323,7 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e5bed1f1c269533fa816a0a5492b3545209a205ca1a54842be180eb63a16a6" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "lazy_static", ] @@ -397,7 +448,7 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "418d37c8b1d42553c93648be529cb70f920d3baf8ef469b74b9638df426e0b4c" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "libc", "wasi", ] @@ -412,6 +463,12 @@ dependencies = [ "polyval", ] +[[package]] +name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + [[package]] name = "half" version = "1.8.2" @@ -503,12 +560,28 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + [[package]] name = "libc" version = "0.2.117" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "e74d72e0f9b65b5b4ca49a346af3976df0f9c61d550727f349ecd559f251a26c" +[[package]] +name = "libloading" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efbc0f03f9a775e9f6aed295c6a1ba2253c5757a9e03d55c6caa46a681abcddd" +dependencies = [ + "cfg-if 1.0.0", + "winapi", +] + [[package]] name = "linux-loader" version = "0.4.0" @@ -524,7 +597,7 @@ version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", ] [[package]] @@ -564,6 +637,12 @@ dependencies = [ "vmm-sys-util", ] +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "mmds" version = "0.1.0" @@ -586,6 +665,30 @@ dependencies = [ name = "net_gen" version = "0.1.0" +[[package]] +name = "nix" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f866317acbd3a240710c63f065ffb1e4fd466259045ccb504130b7f668f35c6" +dependencies = [ + "bitflags", + "cc", + "cfg-if 1.0.0", + "libc", + "memoffset", +] + +[[package]] +name = "nom" +version = "7.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b1d11e1ef389c76fe5b81bcaf2ea32cf88b62bc494e19f493d0b30e7a930109" +dependencies = [ + "memchr", + "minimal-lexical", + "version_check", +] + [[package]] name = "num-traits" version = "0.2.14" @@ -617,6 +720,12 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" +[[package]] +name = "peeking_take_while" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" + [[package]] name = "plotters" version = "0.3.1" @@ -651,7 +760,7 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8419d2b623c7c0896ff2d5d96e2cb4ede590fed28fcc34934f4c33c036e620a1" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "cpufeatures", "opaque-debug", "universal-hash", @@ -809,6 +918,12 @@ version = "0.6.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustc_version" version = "0.4.0" @@ -897,6 +1012,12 @@ dependencies = [ "serde", ] +[[package]] +name = "shlex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" + [[package]] name = "snapshot" version = "0.1.0" @@ -933,6 +1054,26 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "thiserror" +version = "1.0.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "854babe52e4df1653706b98fcfc05843010039b406875930a70e4d9644e5c417" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa32fd3f627f367fe16f893e2597ae3c05020f8bba2666a4e6ea73d377e5714b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "timerfd" version = "1.2.0" @@ -980,6 +1121,31 @@ dependencies = [ "subtle", ] +[[package]] +name = "userfaultfd" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b738009e099b4ded1ecf19dfb7631f69c24f16e0af6d29fd9b3f54a092aca46" +dependencies = [ 
+ "bitflags", + "cfg-if 1.0.0", + "libc", + "nix", + "thiserror", + "userfaultfd-sys", +] + +[[package]] +name = "userfaultfd-sys" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a4be003c705d2c8dc1234d473856945e291bb998ac2e2d83e70328d964d7458" +dependencies = [ + "bindgen", + "cc", + "cfg-if 0.1.10", +] + [[package]] name = "utils" version = "0.1.0" @@ -1081,6 +1247,7 @@ dependencies = [ "serde", "serde_json", "snapshot", + "userfaultfd", "utils", "versionize", "versionize_derive", @@ -1122,7 +1289,7 @@ version = "0.2.79" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25f1af7423d8588a3d840681122e72e6a24ddbcb3f0ec385cac0d12d24256c06" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "wasm-bindgen-macro", ] diff --git a/deny.toml b/deny.toml index 031ac2d4e44..fbd8a421886 100644 --- a/deny.toml +++ b/deny.toml @@ -2,5 +2,6 @@ allow = [ "MIT", "Apache-2.0", - "BSD-3-Clause" + "BSD-3-Clause", + "ISC" ] diff --git a/docs/device-api.md b/docs/device-api.md index 295b26ada2e..a95c9bbc98c 100644 --- a/docs/device-api.md +++ b/docs/device-api.md @@ -59,7 +59,9 @@ specification: [firecracker.yaml](./../src/api_server/swagger/firecracker.yaml). 
| `InstanceActionInfo` | action_type | O | O | O | O | O | | `LoadSnapshotParams` | enable_diff_snapshots | O | O | O | O | O | | | mem_file_path | O | O | O | O | O | +| | mem_backend | O | O | O | O | O | | | snapshot_path | O | O | O | O | O | +| | resume_vm | O | O | O | O | O | | `Logger` | level | O | O | O | O | O | | | log_path | O | O | O | O | O | | | show_level | O | O | O | O | O | diff --git a/docs/images/uffd_flow1.png b/docs/images/uffd_flow1.png new file mode 100644 index 00000000000..a8150769a50 Binary files /dev/null and b/docs/images/uffd_flow1.png differ diff --git a/docs/images/uffd_flow2.png b/docs/images/uffd_flow2.png new file mode 100644 index 00000000000..9486db63546 Binary files /dev/null and b/docs/images/uffd_flow2.png differ diff --git a/docs/images/uffd_flow3.png b/docs/images/uffd_flow3.png new file mode 100644 index 00000000000..a4c401b8845 Binary files /dev/null and b/docs/images/uffd_flow3.png differ diff --git a/docs/images/uffd_flow4.png b/docs/images/uffd_flow4.png new file mode 100644 index 00000000000..3400dfe8f7e Binary files /dev/null and b/docs/images/uffd_flow4.png differ diff --git a/docs/snapshotting/handling-page-faults-on-snapshot-resume.md b/docs/snapshotting/handling-page-faults-on-snapshot-resume.md new file mode 100644 index 00000000000..0372a3d02c7 --- /dev/null +++ b/docs/snapshotting/handling-page-faults-on-snapshot-resume.md @@ -0,0 +1,133 @@ +# Handling snapshot memory loading + +Firecracker allows for a better management of the microVM's memory loading +by letting users choose between relying on host OS to handle the page faults +when resuming from a snapshot, or having a dedicated userspace process for +dealing with page faults, with the help of +[Userfaultfd](https://www.kernel.org/doc/html/v4.18/admin-guide/mm/userfaultfd.html). 
+ +## Kernel + +When resuming a microVM from a snapshot, loading the snapshotted guest's memory +(which is file-backed) into RAM is usually the kernel's responsibility and is handled +on a per-page-fault basis. Each time the guest touches a page that is not already +in Firecracker's process memory, a page fault occurs, which triggers a context +switch and IO operation in order to bring that page into RAM. Depending on the +use case, doing this for every page can be time-consuming. + +## Userfaultfd + +Userfaultfd is a mechanism that passes the responsibility of handling page +fault events from kernel space to user space. In order to be able to interact +with this mechanism, userspace first needs to obtain a userfault object +(i.e. a file descriptor) by calling into [`userfaultfd` +syscall](https://man7.org/linux/man-pages/man2/userfaultfd.2.html). +Next, the memory address range must be registered with the userfault file +descriptor so that the userfault object can monitor page faults occurring for +those addresses. After this, the user space process can start reading and serving +events via the userfault file descriptor. These events will contain the address +that triggered the fault. The fault-handling thread can choose to handle these +events using these [operations](https://www.kernel.org/doc/html/latest/admin-guide/mm/userfaultfd.html#resolving-userfaults). + +In the flow described above, there are two userspace processes that interact +with each other in order to handle page faults: the Firecracker process and the +page fault handler. Please note that users are responsible for writing the page +fault handler process to monitor userfaultfd events and handle those events. + +Below is the interaction flow between Firecracker and the page fault handler +(designed by the users): + +- Page fault handler binds and listens on a unix domain socket in order + to be able to communicate with the Firecracker process. 
+ +![](../images/uffd_flow1.png) + +Please note that when using the Jailer, the page fault handler process, UDS and +memory file must reside inside the jail. The UDS must only be accessible to +Firecracker and the page fault handler. + +- PUT snapshot/load API call is issued towards Firecracker's API thread. + The request encapsulates in its body the path to the unix domain socket that + page fault handler listens to in order to communicate with Firecracker. +- Firecracker process creates the userfault object and obtains the userfault + file descriptor. +- The page fault handler privately mmaps the contents of the guest memory file. + +![](../images/uffd_flow2.png) + +- Firecracker anonymously mmaps memory based on the memory description found + in the microVM state file and registers the memory regions with the userfault + object in order for the userfaultfd to be aware of page fault events on these + addresses. Firecracker then connects to the socket previously opened by the page + fault process. + +![](../images/uffd_flow3.png) + +- Firecracker passes the userfault file descriptor and the guest memory layout + to the page fault handler process through the socket. + +![](../images/uffd_flow4.png) + +- After sending the necessary information to the page fault handler, Firecracker + continues with the normal cycle to restore from snapshot. It reads from the microVM + state file the relevant serialized components and loads them into memory. + +- Page faults that occur while Firecracker is touching guest memory are handled + by the page fault handler process, which listens for events on the userfault file + descriptor that Firecracker previously sent. When a page fault event happens, + the page fault handler issues `UFFDIO_COPY` to load the previously mmaped file + contents into the correspondent memory region. 
+ +After Firecracker sends the payload (i.e. memory mappings and file descriptor), no +other communication happens on the UDS (or otherwise) between Firecracker +and the page fault handler process. + +### Userfaultfd interaction with balloon + +The balloon device allows the host to reclaim memory from a microVM. For more +details on balloon, please refer to [this doc](../ballooning.md). + +When the balloon device asks for removal of a memory range, Firecracker calls +`madvise` with the `MADV_DONTNEED` flag in order to let the kernel know that it +can free up memory found in that specific area. On such a system call, the +userfaultfd interface sends `UFFD_EVENT_REMOVE`. + +When implementing the logic for the page fault handler, users must identify events +of type `UFFD_EVENT_REMOVE` and handle them by zeroing out those pages. This is +because the memory is removed, but the area still remains monitored by userfaultfd. +After a cycle of inflation and deflation, page faults might happen again for memory +ranges that have been removed by balloon (and subsequently zeroed out by the page +fault handler). In such a case, the page fault handler process must zero out the +faulted page (instead of bringing it from file), as recommended by [the userfaultfd +documentation](https://www.kernel.org/doc/html/latest/admin-guide/mm/userfaultfd.html#non-cooperative-userfaultfd). + +In case of a compromised balloon driver, the page fault handler can get flooded with +`UFFD_EVENT_REMOVE`. We recommend using the jailer's built-in cgroup functionality +as defense in depth, in order to limit resource usage of the Firecracker process. + +### Caveats + +If the handler process crashes while Firecracker is resuming the snapshot, Firecracker +will hang when a page fault occurs. This is because Firecracker is designed to +wait for the requested page to be made available. If the page fault handler process +is no longer around when this happens, Firecracker will wait forever.
Users are +expected to monitor the page fault handler's status or gather metrics of hung +Firecracker processes and implement a recycle mechanism if necessary. + +It is the page fault handler process's responsibility to handle any errors that +might occur and also send signals to the Firecracker process to inform it of any +crashes/exits. The page fault handler can fetch Firecracker's PID through `getsockopt` +call with `SO_PEERCRED` option, which fetches credentials of the peer process that +is connected to the socket. The returned credentials contain: PID, GID and UID of +the peer process (Firecracker in the page fault handler's case). + +We recommend that the page fault handler includes timeouts for waiting on Firecracker +to connect to the UDS or send information over the UDS, in order to account for +unexpected cases when Firecracker crashes before being able to connect/send data. + +### Example + +An example of a handler process can be found [here](../../tests/host_tools/uffd/src/bin/valid_handler.rs). +The process is designed to tackle faults on a certain address by loading into +memory the entire region that the address belongs to, but users can choose any +other behavior that suits their use case best. 
\ No newline at end of file diff --git a/docs/snapshotting/snapshot-support.md b/docs/snapshotting/snapshot-support.md index 95ae3d668ff..9cf769a2e85 100644 --- a/docs/snapshotting/snapshot-support.md +++ b/docs/snapshotting/snapshot-support.md @@ -244,7 +244,7 @@ curl --unix-socket /tmp/firecracker.socket -i \ "snapshot_type": "Full", "snapshot_path": "./snapshot_file", "mem_file_path": "./mem_file", - "version": "0.23.0" + "version": "1.0.0" }' ``` @@ -299,7 +299,7 @@ curl --unix-socket /tmp/firecracker.socket -i \ "snapshot_type": "Diff", "snapshot_path": "./snapshot_file", "mem_file_path": "./mem_file", - "version": "0.23.0" + "version": "1.0.0" }' ``` @@ -376,6 +376,44 @@ If you want to load a snapshot, you can do that only **before** the microVM is c (the only resources that can be configured prior are the Logger and the Metrics systems) by sending the following API command: +```bash +curl --unix-socket /tmp/firecracker.socket -i \ + -X PUT 'http://localhost/snapshot/load' \ + -H 'Accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "snapshot_path": "./snapshot_file", + "mem_backend": { + "backend_path": "./mem_file", + "backend_type": "File" + }, + "enable_diff_snapshots": true, + "resume_vm": false + }' +``` + +The `backend_type` field represents the memory backend type used for loading the +snapshot. Accepted values are: + +- `File` - rely on the kernel to handle page faults when loading the contents of + the guest memory file into memory. +- `Uffd` - use a dedicated user space process to handle page faults that occur + for the guest memory range. Please refer to [this](handling-page-faults-on-snapshot-resume.md) + for more details on handling page faults in the user space. + +The meaning of `backend_path` depends on the `backend_type` chosen: + +- if using `File`, then `backend_path` should contain the path to the snapshot's + memory file to be loaded. 
+- when using `Uffd`, `backend_path` refers to the path of the unix domain socket + used for communication between Firecracker and the user space process that handles + page faults. + +When relying on the OS to handle page faults, the command below is also accepted. +Note that `mem_file_path` field is currently under the deprecation policy. +`mem_file_path` and `mem_backend` are mutually exclusive, therefore specifying them +both at the same time will return an error. + ```bash curl --unix-socket /tmp/firecracker.socket -i \ -X PUT 'http://localhost/snapshot/load' \ @@ -409,10 +447,11 @@ as they were to the original one. diff snapshot point of view). - The loaded microVM is now in the `Paused` state, so it needs to be resumed for it to run. - - The memory file pointed by `mem_file_path` **must** be considered immutable - from Firecracker and host point of view. It backs the guest OS memory for - read access through the page cache. External modification to this file - corrupts the guest memory and leads to undefined behavior. + - The memory file (pointed by `backend_path` when using `File` backend type, + or pointed by `mem_file_path`) **must** be considered immutable from Firecracker + and host point of view. It backs the guest OS memory for read access through + the page cache. External modification to this file corrupts the guest memory + and leads to undefined behavior. - The file indicated by `snapshot_path`, that is used to load from, is released and no longer used by this process. - If `enable_diff_snapshots` is set, then diff snapshots can be taken @@ -463,7 +502,7 @@ function abnormally. ## Ensure continued network connectivity for clones -For recomandations related to continued network connectivity for multiple +For recommendations related to continued network connectivity for multiple clones created from a single Firecracker microVM snapshot please see [this doc](network-for-clones.md). 
## Snapshot security and uniqueness diff --git a/src/api_server/src/parsed_request.rs b/src/api_server/src/parsed_request.rs index a52ea4c5d2e..17a0660ff68 100644 --- a/src/api_server/src/parsed_request.rs +++ b/src/api_server/src/parsed_request.rs @@ -919,9 +919,13 @@ pub(crate) mod tests { assert!(connection.try_read().is_ok()); let req = connection.pop_parsed_request().unwrap(); assert!(ParsedRequest::try_from_request(&req).is_ok()); + let body = "{ \ \"snapshot_path\": \"foo\", \ - \"mem_file_path\": \"bar\", \ + \"mem_backend\": { \ + \"backend_type\": \"File\", \ + \"backend_path\": \"bar\" \ + }, \ \"enable_diff_snapshots\": true \ }"; sender @@ -931,6 +935,19 @@ pub(crate) mod tests { assert!(connection.try_read().is_ok()); let req = connection.pop_parsed_request().unwrap(); assert!(ParsedRequest::try_from_request(&req).is_ok()); + + let body = "{ \ + \"snapshot_path\": \"foo\", \ + \"mem_file_path\": \"bar\", \ + \"resume_vm\": true \ + }"; + sender + .write_all(http_request("PUT", "/snapshot/load", Some(&body)).as_bytes()) + .unwrap(); + + assert!(connection.try_read().is_ok()); + let req = connection.pop_parsed_request().unwrap(); + assert!(ParsedRequest::try_from_request(&req).is_ok()); } #[test] diff --git a/src/api_server/src/request/snapshot.rs b/src/api_server/src/request/snapshot.rs index 7124c0c0d09..62c581ad744 100644 --- a/src/api_server/src/request/snapshot.rs +++ b/src/api_server/src/request/snapshot.rs @@ -5,9 +5,23 @@ use super::super::VmmAction; use crate::parsed_request::{Error, ParsedRequest}; use crate::request::Body; use crate::request::{Method, StatusCode}; -use vmm::vmm_config::snapshot::{CreateSnapshotParams, LoadSnapshotParams}; +use logger::{IncMetric, METRICS}; +use serde::de::Error as DeserializeError; +use vmm::vmm_config::snapshot::{ + CreateSnapshotParams, LoadSnapshotConfig, LoadSnapshotParams, MemBackendConfig, MemBackendType, +}; use vmm::vmm_config::snapshot::{Vm, VmState}; +/// Deprecation message for the `mem_file_path` 
field. +const LOAD_DEPRECATION_MESSAGE: &str = "PUT /snapshot/load: mem_file_path field is deprecated."; +/// None of the `mem_backend` or `mem_file_path` fields has been specified. +pub const MISSING_FIELD: &str = + "missing field: either `mem_backend` or `mem_file_path` is required"; +/// Both the `mem_backend` and `mem_file_path` fields have been specified. +/// Only specifying one of them is allowed. +pub const TOO_MANY_FIELDS: &str = + "too many fields: either `mem_backend` or `mem_file_path` exclusively is required"; + pub(crate) fn parse_put_snapshot( body: &Body, request_type_from_path: Option<&&str>, @@ -18,10 +32,7 @@ pub(crate) fn parse_put_snapshot( serde_json::from_slice::(body.raw()) .map_err(Error::SerdeJson)?, ))), - "load" => Ok(ParsedRequest::new_sync(VmmAction::LoadSnapshot( - serde_json::from_slice::(body.raw()) - .map_err(Error::SerdeJson)?, - ))), + "load" => parse_put_snapshot_load(body), _ => Err(Error::InvalidPathMethod( format!("/snapshot/{}", request_type), Method::Put, @@ -43,10 +54,66 @@ pub(crate) fn parse_patch_vm_state(body: &Body) -> Result } } +fn parse_put_snapshot_load(body: &Body) -> Result { + let snapshot_config = + serde_json::from_slice::(body.raw()).map_err(Error::SerdeJson)?; + + match (&snapshot_config.mem_backend, &snapshot_config.mem_file_path) { + // Ensure `mem_file_path` and `mem_backend` fields are not present at the same time. + (Some(_), Some(_)) => { + return Err(Error::SerdeJson(serde_json::Error::custom(TOO_MANY_FIELDS))) + } + // Ensure that one of `mem_file_path` or `mem_backend` fields is always specified. + (None, None) => return Err(Error::SerdeJson(serde_json::Error::custom(MISSING_FIELD))), + _ => {} + } + + // Check for the presence of deprecated `mem_file_path` field and create + // deprecation message if found. + let mut deprecation_message = None; + if snapshot_config.mem_file_path.is_some() { + // `mem_file_path` field in request is deprecated. 
+ METRICS.deprecated_api.deprecated_http_api_calls.inc(); + deprecation_message = Some(LOAD_DEPRECATION_MESSAGE); + } + + // If `mem_file_path` is specified instead of `mem_backend`, we construct the + // `MemBackendConfig` object from the path specified, with `File` as backend type. + let mem_backend = match snapshot_config.mem_backend { + Some(backend_cfg) => backend_cfg, + None => { + MemBackendConfig { + // This is safe to unwrap() because we ensure above that one of the two: + // either `mem_file_path` or `mem_backend` field is always specified. + backend_path: snapshot_config.mem_file_path.unwrap(), + backend_type: MemBackendType::File, + } + } + }; + + let snapshot_params = LoadSnapshotParams { + snapshot_path: snapshot_config.snapshot_path, + mem_backend, + enable_diff_snapshots: snapshot_config.enable_diff_snapshots, + resume_vm: snapshot_config.resume_vm, + }; + + // Construct the `ParsedRequest` object. + let mut parsed_req = ParsedRequest::new_sync(VmmAction::LoadSnapshot(snapshot_params)); + + // If `mem_file_path` was present, set the deprecation message in `parsing_info`. 
+ if let Some(msg) = deprecation_message { + parsed_req.parsing_info().append_deprecation_message(msg); + } + + Ok(parsed_req) +} + #[cfg(test)] mod tests { use super::*; - use crate::parsed_request::tests::vmm_action_from_request; + use crate::parsed_request::tests::{depr_action_from_req, vmm_action_from_request}; + use vmm::vmm_config::snapshot::{MemBackendConfig, MemBackendType}; #[test] fn test_parse_put_snapshot() { @@ -102,36 +169,87 @@ mod tests { body = r#"{ "snapshot_path": "foo", - "mem_file_path": "bar" + "mem_backend": { + "backend_path": "bar", + "backend_type": "File" + } }"#; let mut expected_cfg = LoadSnapshotParams { snapshot_path: PathBuf::from("foo"), - mem_file_path: PathBuf::from("bar"), + mem_backend: MemBackendConfig { + backend_path: PathBuf::from("bar"), + backend_type: MemBackendType::File, + }, enable_diff_snapshots: false, resume_vm: false, }; - match vmm_action_from_request(parse_put_snapshot(&Body::new(body), Some(&"load")).unwrap()) - { + + let mut parsed_request = parse_put_snapshot(&Body::new(body), Some(&"load")).unwrap(); + assert!(parsed_request + .parsing_info() + .take_deprecation_message() + .is_none()); + + match vmm_action_from_request(parsed_request) { VmmAction::LoadSnapshot(cfg) => assert_eq!(cfg, expected_cfg), _ => panic!("Test failed."), } body = r#"{ "snapshot_path": "foo", - "mem_file_path": "bar", + "mem_backend": { + "backend_path": "bar", + "backend_type": "File" + }, "enable_diff_snapshots": true }"#; expected_cfg = LoadSnapshotParams { snapshot_path: PathBuf::from("foo"), - mem_file_path: PathBuf::from("bar"), + mem_backend: MemBackendConfig { + backend_path: PathBuf::from("bar"), + backend_type: MemBackendType::File, + }, enable_diff_snapshots: true, resume_vm: false, }; - match vmm_action_from_request(parse_put_snapshot(&Body::new(body), Some(&"load")).unwrap()) - { + let mut parsed_request = parse_put_snapshot(&Body::new(body), Some(&"load")).unwrap(); + assert!(parsed_request + .parsing_info() + 
.take_deprecation_message() + .is_none()); + match vmm_action_from_request(parsed_request) { + VmmAction::LoadSnapshot(cfg) => assert_eq!(cfg, expected_cfg), + _ => panic!("Test failed."), + } + + body = r#"{ + "snapshot_path": "foo", + "mem_backend": { + "backend_path": "bar", + "backend_type": "Uffd" + }, + "resume_vm": true + }"#; + + expected_cfg = LoadSnapshotParams { + snapshot_path: PathBuf::from("foo"), + mem_backend: MemBackendConfig { + backend_path: PathBuf::from("bar"), + backend_type: MemBackendType::Uffd, + }, + enable_diff_snapshots: false, + resume_vm: true, + }; + + let mut parsed_request = parse_put_snapshot(&Body::new(body), Some(&"load")).unwrap(); + assert!(parsed_request + .parsing_info() + .take_deprecation_message() + .is_none()); + match vmm_action_from_request(parsed_request) { VmmAction::LoadSnapshot(cfg) => assert_eq!(cfg, expected_cfg), _ => panic!("Test failed."), } @@ -144,17 +262,87 @@ mod tests { expected_cfg = LoadSnapshotParams { snapshot_path: PathBuf::from("foo"), - mem_file_path: PathBuf::from("bar"), + mem_backend: MemBackendConfig { + backend_path: PathBuf::from("bar"), + backend_type: MemBackendType::File, + }, enable_diff_snapshots: false, resume_vm: true, }; - match vmm_action_from_request(parse_put_snapshot(&Body::new(body), Some(&"load")).unwrap()) - { + let parsed_request = parse_put_snapshot(&Body::new(body), Some(&"load")).unwrap(); + match depr_action_from_req(parsed_request, Some(LOAD_DEPRECATION_MESSAGE.to_string())) { VmmAction::LoadSnapshot(cfg) => assert_eq!(cfg, expected_cfg), _ => panic!("Test failed."), } + body = r#"{ + "snapshot_path": "foo", + "mem_backend": { + "backend_path": "bar" + } + }"#; + + assert_eq!( + parse_put_snapshot(&Body::new(body), Some(&"load")).err().unwrap().to_string(), + "An error occurred when deserializing the json body of a request: missing field `backend_type` at line 5 column 17." 
+ ); + + body = r#"{ + "snapshot_path": "foo", + "mem_backend": { + "backend_type": "File", + } + }"#; + + assert_eq!( + parse_put_snapshot(&Body::new(body), Some(&"load")) + .err().unwrap().to_string(), + "An error occurred when deserializing the json body of a request: trailing comma at line 5 column 17." + ); + + body = r#"{ + "snapshot_path": "foo", + "mem_file_path": "bar", + "mem_backend": { + "backend_path": "bar", + "backend_type": "Uffd" + } + }"#; + + assert_eq!( + parse_put_snapshot(&Body::new(body), Some(&"load")) + .err() + .unwrap() + .to_string(), + Error::SerdeJson(serde_json::Error::custom(TOO_MANY_FIELDS.to_string())).to_string() + ); + + body = r#"{ + "snapshot_path": "foo" + }"#; + + assert_eq!( + parse_put_snapshot(&Body::new(body), Some(&"load")) + .err() + .unwrap() + .to_string(), + Error::SerdeJson(serde_json::Error::custom(MISSING_FIELD.to_string())).to_string() + ); + + body = r#"{ + "mem_backend": { + "backend_path": "bar", + "backend_type": "Uffd" + } + }"#; + + assert_eq!( + parse_put_snapshot(&Body::new(body), Some(&"load")) + .err().unwrap().to_string(), + "An error occurred when deserializing the json body of a request: missing field `snapshot_path` at line 6 column 15." 
+ ); + assert!(parse_put_snapshot(&Body::new(body), Some(&"invalid")).is_err()); assert!(parse_put_snapshot(&Body::new(body), None).is_err()); } diff --git a/src/api_server/swagger/firecracker.yaml b/src/api_server/swagger/firecracker.yaml index 5bb7a852155..07d242532e6 100644 --- a/src/api_server/swagger/firecracker.yaml +++ b/src/api_server/swagger/firecracker.yaml @@ -960,6 +960,25 @@ definitions: maximum: 32 description: Number of vCPUs (either 1 or an even number) + MemoryBackend: + type: object + required: + - backend_type + - backend_path + properties: + backend_type: + type: string + enum: + - File + - Uffd + backend_path: + type: string + description: Based on 'backend_type' it is either + 1) Path to the file that contains the guest memory to be loaded + 2) Path to the UDS where a process is listening for a UFFD initialization + control payload and open file descriptor that it can use to serve this + process's guest memory page faults + Metrics: type: object description: @@ -1097,8 +1116,10 @@ definitions: SnapshotLoadParams: type: object + description: + Defines the configuration used for handling snapshot resume. Exactly one of + the two `mem_*` fields must be present in the body of the request. required: - - mem_file_path - snapshot_path properties: enable_diff_snapshots: @@ -1107,7 +1128,16 @@ definitions: Enable support for incremental (diff) snapshots by tracking dirty guest pages. mem_file_path: type: string - description: Path to the file that contains the guest memory to be loaded. + description: + Path to the file that contains the guest memory to be loaded. + This parameter has been deprecated and is only allowed if + `mem_backend` is not present. + mem_backend: + $ref: "#/definitions/MemoryBackend" + description: + Configuration for the backend that handles memory load. If this field + is specified, `mem_file_path` is forbidden. Either `mem_backend` or + `mem_file_path` must be present at a time. 
snapshot_path: type: string description: Path to the file that contains the microVM state to be loaded. diff --git a/src/utils/src/lib.rs b/src/utils/src/lib.rs index 64d2a40e6f9..5bfa7f74a87 100644 --- a/src/utils/src/lib.rs +++ b/src/utils/src/lib.rs @@ -6,8 +6,8 @@ // More specifically, we are re-exporting modules from `vmm_sys_util` as part // of the `utils` crate. pub use vmm_sys_util::{ - epoll, errno, eventfd, fam, generate_fam_struct_impl, ioctl, rand, seek_hole, syscall, tempdir, - tempfile, terminal, + epoll, errno, eventfd, fam, generate_fam_struct_impl, ioctl, rand, seek_hole, sock_ctrl_msg, + syscall, tempdir, tempfile, terminal, }; pub use vmm_sys_util::{ioctl_expr, ioctl_ioc_nr, ioctl_iow_nr}; diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 6f37220c6e9..c1dfb8ed379 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -14,6 +14,7 @@ libc = ">=0.2.39" linux-loader = ">=0.4.0" serde = { version = ">=1.0.27", features = ["derive"] } serde_json = ">=1.0.9" +userfaultfd = ">=0.4.0" versionize = ">=0.1.6" versionize_derive = ">=0.1.3" vm-superio = ">=0.4.0" diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index bfebbd96324..f7b275c4246 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -53,6 +53,7 @@ use linux_loader::loader::KernelLoader; use logger::{error, warn}; use seccompiler::BpfThreadMap; use snapshot::Persist; +use userfaultfd::Uffd; use utils::eventfd::EventFd; use utils::terminal::Terminal; use utils::time::TimestampUs; @@ -237,6 +238,7 @@ fn create_vmm_and_vcpus( instance_info: &InstanceInfo, event_manager: &mut EventManager, guest_memory: GuestMemoryMmap, + uffd: Option, track_dirty_pages: bool, vcpu_count: u8, ) -> std::result::Result<(Vmm, Vec), StartMicrovmError> { @@ -298,6 +300,7 @@ fn create_vmm_and_vcpus( shutdown_exit_code: None, vm, guest_memory, + uffd, vcpus_handles: Vec::new(), vcpus_exit_evt, mmio_device_manager, @@ -362,6 +365,7 @@ pub fn build_microvm_for_boot( instance_info, 
event_manager, guest_memory, + None, track_dirty_pages, vcpu_config.vcpu_count, )?; @@ -444,11 +448,13 @@ pub fn build_microvm_for_boot( /// /// An `Arc` reference of the built `Vmm` is also plugged in the `EventManager`, while another /// is returned. +#[allow(clippy::too_many_arguments)] pub fn build_microvm_from_snapshot( instance_info: &InstanceInfo, event_manager: &mut EventManager, microvm_state: MicrovmState, guest_memory: GuestMemoryMmap, + uffd: Option, track_dirty_pages: bool, seccomp_filters: &BpfThreadMap, vm_resources: &mut VmResources, @@ -463,6 +469,7 @@ pub fn build_microvm_from_snapshot( instance_info, event_manager, guest_memory.clone(), + uffd, track_dirty_pages, vcpu_count, )?; @@ -1088,6 +1095,7 @@ pub mod tests { shutdown_exit_code: None, vm, guest_memory, + uffd: None, vcpus_handles: Vec::new(), vcpus_exit_evt, mmio_device_manager, diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 3ad2db8ecfc..5d28f556d88 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -64,6 +64,7 @@ use logger::{error, info, warn, LoggerError, MetricsError, METRICS}; use rate_limiter::BucketUpdate; use seccompiler::BpfProgram; use snapshot::Persist; +use userfaultfd::Uffd; use utils::epoll::EventSet; use utils::eventfd::EventFd; use vm_memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; @@ -251,6 +252,10 @@ pub struct Vmm { // Guest VM core resources. vm: Vm, guest_memory: GuestMemoryMmap, + // Save UFFD in order to keep it open in the Firecracker process, as well. + // Since this field is never read again, we need to allow `dead_code`. + #[allow(dead_code)] + uffd: Option, vcpus_handles: Vec, // Used by Vcpus and devices to initiate teardown; Vmm should never write here. 
vcpus_exit_evt: EventFd, diff --git a/src/vmm/src/memory_snapshot.rs b/src/vmm/src/memory_snapshot.rs index 64a212a06dc..b219c424eb2 100644 --- a/src/vmm/src/memory_snapshot.rs +++ b/src/vmm/src/memory_snapshot.rs @@ -7,6 +7,8 @@ use std::fmt::{Display, Formatter}; use std::fs::File; use std::io::SeekFrom; +use crate::DirtyBitmap; +use utils::{errno, get_page_size}; use versionize::{VersionMap, Versionize, VersionizeResult}; use versionize_derive::Versionize; use vm_memory::{ @@ -14,14 +16,13 @@ use vm_memory::{ GuestMemoryRegion, MemoryRegionAddress, }; -use crate::DirtyBitmap; -use utils::{errno, get_page_size}; - /// State of a guest memory region saved to file/buffer. #[derive(Debug, PartialEq, Versionize)] // NOTICE: Any changes to this structure require a snapshot version bump. pub struct GuestMemoryRegionState { - /// Base address. + // This should have been named `base_guest_addr` since it's _guest_ addr, but for + // backward compatibility we have to keep this name. At least this comment should help. + /// Base GuestAddress. pub base_address: u64, /// Region size. pub size: usize, @@ -29,7 +30,7 @@ pub struct GuestMemoryRegionState { pub offset: u64, } -/// Guest memory state. +/// Describes guest memory regions and their snapshot file mappings. #[derive(Debug, Default, PartialEq, Versionize)] // NOTICE: Any changes to this structure require a snapshot version bump. pub struct GuestMemoryState { @@ -55,7 +56,7 @@ where /// Creates a GuestMemoryMmap given a `file` containing the data /// and a `state` containing mapping information. fn restore( - file: &File, + file: Option<&File>, state: &GuestMemoryState, track_dirty_pages: bool, ) -> std::result::Result; @@ -176,28 +177,27 @@ impl SnapshotMemory for GuestMemoryMmap { .map_err(Error::WriteMemory) } - /// Creates a GuestMemoryMmap given a `file` containing the data - /// and a `state` containing mapping information. 
+ /// Creates a GuestMemoryMmap backed by a `file` if present, otherwise backed + /// by anonymous memory. Memory layout and ranges are described in `state` param. fn restore( - file: &File, + file: Option<&File>, state: &GuestMemoryState, track_dirty_pages: bool, ) -> std::result::Result { - vm_memory::create_guest_memory( - &state - .regions - .iter() - .map(|r| { - ( - Some(FileOffset::new(file.try_clone().unwrap(), r.offset)), - GuestAddress(r.base_address), - r.size, - ) - }) - .collect::>(), - track_dirty_pages, - ) - .map_err(Error::CreateMemory) + let mut regions = vec![]; + for region in state.regions.iter() { + let f = match file { + Some(f) => Some(FileOffset::new( + f.try_clone().map_err(Error::FileHandle)?, + region.offset, + )), + None => None, + }; + + regions.push((f, GuestAddress(region.base_address), region.size)); + } + + vm_memory::create_guest_memory(&regions, track_dirty_pages).map_err(Error::CreateMemory) } } @@ -302,7 +302,8 @@ mod tests { guest_memory.dump(&mut memory_file.as_file()).unwrap(); let restored_guest_memory = - GuestMemoryMmap::restore(&memory_file.as_file(), &memory_state, false).unwrap(); + GuestMemoryMmap::restore(Some(memory_file.as_file()), &memory_state, false) + .unwrap(); // Check that the region contents are the same. let mut actual_region = vec![0u8; page_size * 2]; @@ -336,7 +337,7 @@ mod tests { // We can restore from this because this is the first dirty dump. let restored_guest_memory = - GuestMemoryMmap::restore(&file.as_file(), &memory_state, false).unwrap(); + GuestMemoryMmap::restore(Some(file.as_file()), &memory_state, false).unwrap(); // Check that the region contents are the same. 
let mut actual_region = vec![0u8; page_size * 2]; diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 75fa29494b1..e7e271e5ee1 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -6,6 +6,7 @@ use std::fmt::{Display, Formatter}; use std::fs::{File, OpenOptions}; use std::io::{self, Write}; +use std::os::unix::{io::AsRawFd, net::UnixStream}; use std::path::Path; use std::sync::{Arc, Mutex}; @@ -13,7 +14,9 @@ use crate::builder::{self, StartMicrovmError}; use crate::device_manager::persist::Error as DevicePersistError; use crate::mem_size_mib; use crate::vmm_config::machine_config::MAX_SUPPORTED_VCPUS; -use crate::vmm_config::snapshot::{CreateSnapshotParams, LoadSnapshotParams, SnapshotType}; +use crate::vmm_config::snapshot::{ + CreateSnapshotParams, LoadSnapshotParams, MemBackendType, SnapshotType, +}; use crate::vstate::{self, vcpu::VcpuState, vm::VmState}; use crate::device_manager::persist::DeviceStates; @@ -32,11 +35,14 @@ use crate::vmm_config::instance_info::InstanceInfo; use arch::regs::{get_manufacturer_id_from_host, get_manufacturer_id_from_state}; use logger::{error, info}; use seccompiler::BpfThreadMap; +use serde::Serialize; use snapshot::Snapshot; +use userfaultfd::{FeatureFlags, Uffd, UffdBuilder}; +use utils::sock_ctrl_msg::ScmSocket; use versionize::{VersionMap, Versionize, VersionizeResult}; use versionize_derive::Versionize; use virtio_gen::virtio_ring::VIRTIO_RING_F_EVENT_IDX; -use vm_memory::GuestMemoryMmap; +use vm_memory::{GuestMemory, GuestMemoryMmap}; #[cfg(target_arch = "x86_64")] const FC_V0_23_MAX_DEVICES: u32 = 11; @@ -65,6 +71,23 @@ pub struct MicrovmState { pub device_states: DeviceStates, } +/// This describes the mapping between Firecracker base virtual address and offset in the +/// buffer or file backend for a guest memory region. It is used to tell an external +/// process/thread where to populate the guest memory data for this range. +/// +/// E.g. 
Guest memory contents for a region of `size` bytes can be found in the backend +/// at `offset` bytes from the beginning, and should be copied/populated into `base_host_address`. +#[derive(Clone, Debug, Serialize)] +pub struct GuestRegionUffdMapping { + /// Base host virtual address where the guest memory contents for this region + /// should be copied/populated. + pub base_host_virt_addr: u64, + /// Region size. + pub size: usize, + /// Offset in the backend file/buffer where the region contents are. + pub offset: u64, +} + /// Errors related to saving and restoring Microvm state. #[derive(Debug)] pub enum MicrovmStateError { @@ -181,20 +204,31 @@ impl Display for CreateSnapshotError { pub enum LoadSnapshotError { /// Failed to build a microVM from snapshot. BuildMicroVm(StartMicrovmError), + /// Snapshot cpu vendor differs than host cpu vendor. + CpuVendorCheck(String), + /// Failed to create an UFFD Builder. + CreateUffdBuilder(userfaultfd::Error), /// Failed to deserialize memory. DeserializeMemory(memory_snapshot::Error), /// Failed to deserialize microVM state. DeserializeMicrovmState(snapshot::Error), + /// Snapshot failed sanity checks. + InvalidSnapshot(String), /// Failed to open memory backing file. MemoryBackingFile(io::Error), /// Failed to resume Vm after loading snapshot. ResumeMicroVm(VmmError), /// Failed to open the snapshot backing file. SnapshotBackingFile(&'static str, io::Error), - /// Snapshot cpu vendor differs than host cpu vendor. - CpuVendorCheck(String), - /// Snapshot failed sanity checks. - InvalidSnapshot(String), + /// Unable to connect to UDS in order to send information regarding + /// handling guest memory page-fault events. + UdsConnection(io::Error), + /// Failed to register guest memory regions to UFFD. + UffdMemoryRegionsRegister(userfaultfd::Error), + /// Failed to send guest memory layout and path to user fault FD used to handle + /// guest memory page faults. 
This information is sent to a UDS where a custom + /// page-fault handler process is listening. + UffdSend(kvm_ioctls::Error), } impl Display for LoadSnapshotError { @@ -202,10 +236,13 @@ impl Display for LoadSnapshotError { use self::LoadSnapshotError::*; match self { BuildMicroVm(err) => write!(f, "Cannot build a microVM from snapshot: {}", err), + CreateUffdBuilder(err) => write!(f, "Cannot create UFFD builder: {:?}", err), + CpuVendorCheck(err) => write!(f, "CPU vendor check failed: {}", err), DeserializeMemory(err) => write!(f, "Cannot deserialize memory: {}", err), DeserializeMicrovmState(err) => { write!(f, "Cannot deserialize the microVM state: {:?}", err) } + InvalidSnapshot(err) => write!(f, "Snapshot sanity check failed: {}", err), MemoryBackingFile(err) => write!(f, "Cannot open the memory file: {}", err), ResumeMicroVm(err) => write!( f, @@ -217,8 +254,16 @@ impl Display for LoadSnapshotError { "Cannot perform {} on the snapshot backing file: {}", action, err ), - CpuVendorCheck(err) => write!(f, "CPU vendor check failed: {}", err), - InvalidSnapshot(err) => write!(f, "Snapshot sanity check failed: {}", err), + UdsConnection(err) => write!( + f, + "Cannot connect to UDS in order to send information on \ + handling guest memory page-faults due to: {}", + err + ), + UffdMemoryRegionsRegister(err) => { + write!(f, "Cannot register memory regions to UFFD: {:?}.", err) + } + UffdSend(err) => write!(f, "Cannot send FD and memory layout to UFFD: {}", err), } } } @@ -443,23 +488,34 @@ pub fn restore_from_snapshot( vm_resources: &mut VmResources, ) -> std::result::Result>, LoadSnapshotError> { use self::LoadSnapshotError::*; - let track_dirty_pages = params.enable_diff_snapshots; let microvm_state = snapshot_state_from_file(&params.snapshot_path, version_map)?; // Some sanity checks before building the microvm. 
snapshot_state_sanity_check(&microvm_state)?; - let guest_memory = guest_memory_from_file( - &params.mem_file_path, - &microvm_state.memory_state, - track_dirty_pages, - )?; - + let mem_backend_path = &params.mem_backend.backend_path; + let mem_state = &microvm_state.memory_state; + let track_dirty_pages = params.enable_diff_snapshots; + let (guest_memory, uffd) = match params.mem_backend.backend_type { + MemBackendType::File => ( + guest_memory_from_file(mem_backend_path, mem_state, track_dirty_pages)?, + None, + ), + MemBackendType::Uffd => guest_memory_from_uffd( + mem_backend_path, + mem_state, + track_dirty_pages, + // We enable the UFFD_FEATURE_EVENT_REMOVE feature only if a balloon device + // is present in the microVM state. + microvm_state.device_states.balloon_device.is_some(), + )?, + }; builder::build_microvm_from_snapshot( instance_info, event_manager, microvm_state, guest_memory, + uffd, track_dirty_pages, seccomp_filters, vm_resources, @@ -487,7 +543,94 @@ fn guest_memory_from_file( ) -> std::result::Result { use self::LoadSnapshotError::{DeserializeMemory, MemoryBackingFile}; let mem_file = File::open(mem_file_path).map_err(MemoryBackingFile)?; - GuestMemoryMmap::restore(&mem_file, mem_state, track_dirty_pages).map_err(DeserializeMemory) + GuestMemoryMmap::restore(Some(&mem_file), mem_state, track_dirty_pages) + .map_err(DeserializeMemory) +} + +fn guest_memory_from_uffd( + mem_uds_path: &Path, + mem_state: &GuestMemoryState, + track_dirty_pages: bool, + enable_balloon: bool, +) -> std::result::Result<(GuestMemoryMmap, Option<Uffd>), LoadSnapshotError> { + use self::LoadSnapshotError::{ + CreateUffdBuilder, DeserializeMemory, UdsConnection, UffdMemoryRegionsRegister, UffdSend, + }; + + let guest_memory = + GuestMemoryMmap::restore(None, mem_state, track_dirty_pages).map_err(DeserializeMemory)?; + + let mut uffd_builder = UffdBuilder::new(); + + if enable_balloon { + // We enable this so that the page fault handler can add logic + // for treating madvise(MADV_DONTNEED) events 
triggered by balloon inflation. + uffd_builder.require_features(FeatureFlags::EVENT_REMOVE); + } + + let uffd = uffd_builder + .close_on_exec(true) + .non_blocking(true) + .create() + .map_err(CreateUffdBuilder)?; + + let mut backend_mappings = Vec::with_capacity(guest_memory.num_regions()); + for (mem_region, state_region) in guest_memory.iter().zip(mem_state.regions.iter()) { + let host_base_addr = mem_region.as_ptr(); + let size = mem_region.size(); + + uffd.register(host_base_addr as _, size as _) + .map_err(UffdMemoryRegionsRegister)?; + backend_mappings.push(GuestRegionUffdMapping { + base_host_virt_addr: host_base_addr as u64, + size, + offset: state_region.offset, + }); + } + + // This is safe to unwrap() because we control the contents of the vector + // (i.e. GuestRegionUffdMapping entries). + let backend_mappings = serde_json::to_string(&backend_mappings).unwrap(); + + let socket = UnixStream::connect(mem_uds_path).map_err(UdsConnection)?; + socket + .send_with_fd( + backend_mappings.as_bytes(), + // In the happy case we can close the fd since the other process has it open and is + // using it to serve us pages. + // + // The problem is that if other process crashes/exits, firecracker guest memory + // will simply revert to anon-mem behavior which would lead to silent errors and + // undefined behavior. + // + // To tackle this scenario, the page fault handler can notify Firecracker of any + // crashes/exits. There is no need for Firecracker to explicitly send its process ID. 
+ // The external process can obtain Firecracker's PID by calling `getsockopt` with + // `libc::SO_PEERCRED` option like so: + // + // let mut val = libc::ucred { pid: 0, gid: 0, uid: 0 }; + // let mut ucred_size: u32 = mem::size_of::() as u32; + // libc::getsockopt( + // socket.as_raw_fd(), + // libc::SOL_SOCKET, + // libc::SO_PEERCRED, + // &mut val as *mut _ as *mut _, + // &mut ucred_size as *mut libc::socklen_t, + // ); + // + // Per this linux man page: https://man7.org/linux/man-pages/man7/unix.7.html, + // `SO_PEERCRED` returns the credentials (PID, UID and GID) of the peer process + // connected to this socket. The returned credentials are those that were in effect + // at the time of the `connect` call. + // + // Moreover, Firecracker holds a copy of the UFFD fd as well, so that even if the + // page fault handler process does not tear down Firecracker when necessary, the + // uffd will still be alive but with no one to serve faults, leading to guest freeze. + uffd.as_raw_fd(), + ) + .map_err(UffdSend)?; + + Ok((guest_memory, Some(uffd))) } #[cfg(target_arch = "x86_64")] diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index d008b7e4024..72cde8d4a6b 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -803,6 +803,7 @@ mod tests { use devices::virtio::VsockError; use seccompiler::BpfThreadMap; + use crate::vmm_config::snapshot::{MemBackendConfig, MemBackendType}; use mmds::data_store::MmdsVersion; use std::path::PathBuf; @@ -1604,7 +1605,10 @@ mod tests { // Without resume. let req = VmmAction::LoadSnapshot(LoadSnapshotParams { snapshot_path: PathBuf::new(), - mem_file_path: PathBuf::new(), + mem_backend: MemBackendConfig { + backend_type: MemBackendType::File, + backend_path: PathBuf::new(), + }, enable_diff_snapshots: false, resume_vm: false, }); @@ -1617,7 +1621,10 @@ mod tests { // With resume. 
let req = VmmAction::LoadSnapshot(LoadSnapshotParams { snapshot_path: PathBuf::new(), - mem_file_path: PathBuf::new(), + mem_backend: MemBackendConfig { + backend_type: MemBackendType::File, + backend_path: PathBuf::new(), + }, enable_diff_snapshots: false, resume_vm: true, }); @@ -2025,7 +2032,10 @@ mod tests { check_runtime_request_err( VmmAction::LoadSnapshot(LoadSnapshotParams { snapshot_path: PathBuf::new(), - mem_file_path: PathBuf::new(), + mem_backend: MemBackendConfig { + backend_type: MemBackendType::File, + backend_path: PathBuf::new(), + }, enable_diff_snapshots: false, resume_vm: false, }), @@ -2044,7 +2054,10 @@ mod tests { // Load snapshot should no longer be allowed. let req = VmmAction::LoadSnapshot(LoadSnapshotParams { snapshot_path: PathBuf::new(), - mem_file_path: PathBuf::new(), + mem_backend: MemBackendConfig { + backend_type: MemBackendType::File, + backend_path: PathBuf::new(), + }, enable_diff_snapshots: false, resume_vm: false, }); diff --git a/src/vmm/src/vmm_config/snapshot.rs b/src/vmm/src/vmm_config/snapshot.rs index 9662de08cd3..c8634b66980 100644 --- a/src/vmm/src/vmm_config/snapshot.rs +++ b/src/vmm/src/vmm_config/snapshot.rs @@ -18,11 +18,24 @@ pub enum SnapshotType { } impl Default for SnapshotType { - fn default() -> SnapshotType { + fn default() -> Self { SnapshotType::Full } } +/// Specifies the method through which guest memory will get populated when +/// resuming from a snapshot: +/// 1) A file that contains the guest memory to be loaded, +/// 2) An UDS where a custom page-fault handler process is listening for +/// the UFFD set up by Firecracker to handle its guest memory page faults. +#[derive(Debug, Deserialize, PartialEq)] +pub enum MemBackendType { + /// Guest memory contents will be loaded from a file. + File, + /// Guest memory will be served through UFFD by a separate process. + Uffd, +} + /// Stores the configuration that will be used for creating a snapshot. 
#[derive(Debug, Deserialize, PartialEq, Serialize)] #[serde(deny_unknown_fields)] @@ -41,23 +54,52 @@ pub struct CreateSnapshotParams { } /// Stores the configuration that will be used for loading a snapshot. -#[derive(Debug, Deserialize, PartialEq, Serialize)] -#[serde(deny_unknown_fields)] +#[derive(Debug, PartialEq)] pub struct LoadSnapshotParams { /// Path to the file that contains the microVM state to be loaded. pub snapshot_path: PathBuf, - /// Path to the file that contains the guest memory to be loaded. - pub mem_file_path: PathBuf, + /// Specifies guest memory backend configuration. + pub mem_backend: MemBackendConfig, /// Setting this flag will enable KVM dirty page tracking and will /// allow taking subsequent incremental snapshots. - #[serde(default)] pub enable_diff_snapshots: bool, /// When set to true, the vm is also resumed if the snapshot load /// is successful. + pub resume_vm: bool, +} + +/// Stores the configuration for loading a snapshot that is provided by the user. +#[derive(Deserialize)] +#[serde(deny_unknown_fields)] +pub struct LoadSnapshotConfig { + /// Path to the file that contains the microVM state to be loaded. + pub snapshot_path: PathBuf, + /// Path to the file that contains the guest memory to be loaded. To be used only if + /// `mem_backend` is not specified. + #[serde(skip_serializing_if = "Option::is_none")] + pub mem_file_path: Option, + /// Guest memory backend configuration. Is not to be used in conjunction with `mem_file_path`. + /// None value is allowed only if `mem_file_path` is present. + #[serde(skip_serializing_if = "Option::is_none")] + pub mem_backend: Option, + /// Whether or not to enable KVM dirty page tracking. + #[serde(default)] + pub enable_diff_snapshots: bool, + /// Whether or not to resume the vm post snapshot load. #[serde(default)] pub resume_vm: bool, } +/// Stores the configuration used for managing snapshot memory. 
+#[derive(Debug, Deserialize, PartialEq)] +#[serde(deny_unknown_fields)] +pub struct MemBackendConfig { + /// Path to the backend used to handle the guest memory. + pub backend_path: PathBuf, + /// Specifies the guest memory backend type. + pub backend_type: MemBackendType, +} + /// The microVM state options. #[derive(Debug, Deserialize, Serialize)] pub enum VmState { diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 413018aa93c..e5a3162bd2e 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -217,8 +217,12 @@ fn verify_load_snapshot(snapshot_file: TempFile, memory_file: TempFile) { VERSION_MAP.clone(), ) .unwrap(); - let mem = GuestMemoryMmap::restore(memory_file.as_file(), &microvm_state.memory_state, false) - .unwrap(); + let mem = GuestMemoryMmap::restore( + Some(memory_file.as_file()), + &microvm_state.memory_state, + false, + ) + .unwrap(); let vm_resources = &mut VmResources::default(); @@ -228,6 +232,7 @@ fn verify_load_snapshot(snapshot_file: TempFile, memory_file: TempFile) { &mut event_manager, microvm_state, mem, + None, false, &mut empty_seccomp_filters, vm_resources, diff --git a/tests/conftest.py b/tests/conftest.py index c07a208c45c..86788f3a8a9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -394,6 +394,48 @@ def bin_seccomp_paths(test_fc_session_root_path): } +@pytest.fixture(scope='session') +def uffd_handler_paths(test_fc_session_root_path): + """Build UFFD handler binaries.""" + # pylint: disable=redefined-outer-name + # The fixture pattern causes a pylint false positive for that rule. 
+ uffd_build_path = os.path.join( + test_fc_session_root_path, + build_tools.CARGO_RELEASE_REL_PATH + ) + + extra_args = '--release --target {}-unknown-linux-musl' + extra_args = extra_args.format(platform.machine()) + build_tools.cargo_build(uffd_build_path, + extra_args=extra_args, + src_dir='host_tools/uffd') + + release_binaries_path = os.path.join( + test_fc_session_root_path, + build_tools.CARGO_RELEASE_REL_PATH, + build_tools.RELEASE_BINARIES_REL_PATH + ) + + valid_handler = os.path.normpath( + os.path.join( + release_binaries_path, + 'valid_handler' + ) + ) + + malicious_handler = os.path.normpath( + os.path.join( + release_binaries_path, + 'malicious_handler' + ) + ) + + yield { + 'valid_handler': valid_handler, + 'malicious_handler': malicious_handler, + } + + @pytest.fixture() def microvm(test_fc_session_root_path, bin_cloner_path): """Instantiate a microvm.""" diff --git a/tests/framework/artifacts.py b/tests/framework/artifacts.py index c88c4704be0..b2933b7711a 100644 --- a/tests/framework/artifacts.py +++ b/tests/framework/artifacts.py @@ -479,6 +479,19 @@ class SnapshotType(Enum): DIFF = 1 +class SnapshotMemBackendType(Enum): + """ + Supported guest memory backend types used for snapshot load. + + - `FILE`: establishes if the guest memory is backed by a file. + - `UFFD`: indicates that the guest memory page faults are handled by + a dedicated UFFD page-fault handler process. 
+ """ + + FILE = 'File' + UFFD = 'Uffd' + + class Snapshot: """Manages Firecracker snapshots.""" diff --git a/tests/framework/builder.py b/tests/framework/builder.py index 60f6ae8cbdf..cc2d08e0d6f 100644 --- a/tests/framework/builder.py +++ b/tests/framework/builder.py @@ -10,8 +10,8 @@ from conftest import init_microvm, _test_images_s3_bucket from framework.defs import DEFAULT_TEST_SESSION_ROOT_PATH from framework.artifacts import ( - ArtifactCollection, Artifact, DiskArtifact, Snapshot, - SnapshotType, NetIfaceConfig + ArtifactCollection, Artifact, DiskArtifact, NetIfaceConfig, + Snapshot, SnapshotMemBackendType, SnapshotType ) from framework import utils import host_tools.logging as log_tools @@ -170,18 +170,27 @@ def build(self, # so we do not need to move it around polluting the code. def build_from_snapshot(self, snapshot: Snapshot, + vm=None, resume=False, # Enable incremental snapshot capability. diff_snapshots=False, use_ramdisk=False, fc_binary=None, jailer_binary=None, - daemonize=True): + daemonize=True, + # If None, it means that the guest memory is + # backed by a file. + # If specified, establishes that page-faults + # resulted when loading the guest memory + # are handled by a dedicated UFFD PF handler. 
+ uffd_path=None, + timeout=None): """Build a microvm from a snapshot artifact.""" - vm = init_microvm(self.root_path, self.bin_cloner_path, - fc_binary, jailer_binary,) - vm.jailer.daemonize = daemonize - vm.spawn(log_level='Error', use_ramdisk=use_ramdisk) - vm.api_session.untime() + if vm is None: + vm = init_microvm(self.root_path, self.bin_cloner_path, + fc_binary, jailer_binary,) + vm.jailer.daemonize = daemonize + vm.spawn(log_level='Error', use_ramdisk=use_ramdisk) + vm.api_session.untime() metrics_file_path = os.path.join(vm.path, 'metrics.log') metrics_fifo = log_tools.Fifo(metrics_file_path) @@ -210,10 +219,31 @@ def build_from_snapshot(self, guest_ip=iface.guest_ip, netmask_len=iface.netmask, tapname=iface.tap_name) - response = vm.snapshot.load(mem_file_path=jailed_mem, - snapshot_path=jailed_vmstate, - diff=diff_snapshots, - resume=resume) + + full_fc_version = \ + vm.version.get_from_api().json()['firecracker_version'] + if utils.compare_dirty_versions(full_fc_version, '1.0.0') > 0: + if uffd_path: + mem_backend = { + 'type': SnapshotMemBackendType.UFFD, + 'path': uffd_path + } + else: + mem_backend = { + 'type': SnapshotMemBackendType.FILE, + 'path': jailed_mem + } + response = vm.snapshot.load(mem_backend=mem_backend, + snapshot_path=jailed_vmstate, + diff=diff_snapshots, + resume=resume, + timeout=timeout) + else: + response = vm.snapshot.load(mem_file_path=jailed_mem, + snapshot_path=jailed_vmstate, + diff=diff_snapshots, + resume=resume, + timeout=timeout) status_ok = vm.api_session.is_status_no_content(response.status_code) # Verify response status and cleanup if needed before assert. 
diff --git a/tests/framework/dependencies.txt b/tests/framework/dependencies.txt index e6ff40a8e5f..64ebeda576e 100644 --- a/tests/framework/dependencies.txt +++ b/tests/framework/dependencies.txt @@ -1 +1 @@ -{'jailer v1.0.0 (/firecracker/src/jailer)', 'logger v0.1.0 (/firecracker/src/logger)', 'version_check v0.9.4', 'devices v0.1.0 (/firecracker/src/devices)', 'typenum v1.15.0', 'arch v0.1.0 (/firecracker/src/arch)', 'versionize v0.1.6', 'arch_gen v0.1.0 (/firecracker/src/arch_gen)', 'libc v0.2.117', 'serde_json v1.0.78', 'ryu v1.0.9', 'syn v1.0.86', 'versionize_derive v0.1.4 (proc-macro)', 'log v0.4.14', 'ctr v0.8.0', 'kvm-ioctls v0.11.0', 'vmm-sys-util v0.9.0', 'vm-superio v0.5.0', 'quote v1.0.15', 'vm-fdt v0.1.0', 'aead v0.4.3', 'serde v1.0.136', 'serde_derive v1.0.136 (proc-macro)', 'utils v0.1.0 (/firecracker/src/utils)', 'vm-memory v0.1.0 (/firecracker/src/vm-memory)', 'dumbo v0.1.0 (/firecracker/src/dumbo)', 'cfg-if v1.0.0', 'seccompiler v1.0.0 (/firecracker/src/seccompiler)', 'aes v0.7.5', 'snapshot v0.1.0 (/firecracker/src/snapshot)', 'itoa v1.0.1', 'event-manager v0.2.1', 'unicode-xid v0.2.2', 'cipher v0.3.0', 'net_gen v0.1.0 (/firecracker/src/net_gen)', 'micro_http v0.1.0 (https://github.com/firecracker-microvm/micro-http?rev=0a58eb1#0a58eb1e)', 'subtle v2.4.1', 'api_server v0.1.0 (/firecracker/src/api_server)', 'generic-array v0.14.5', 'crc64 v1.0.0', 'universal-hash v0.4.1', 'cpufeatures v0.2.1', 'polyval v0.5.3', 'io_uring v0.1.0 (/firecracker/src/io_uring)', 'proc-macro2 v1.0.36', 'rate_limiter v0.1.0 (/firecracker/src/rate_limiter)', 'timerfd v1.2.0', 'regex-syntax v0.6.25', 'regex v1.5.4', 'opaque-debug v0.3.0', 'linux-loader v0.4.0', 'bitflags v1.3.2', 'aes-gcm v0.9.4', 'bincode v1.3.3', 'firecracker v1.0.0 (/firecracker/src/firecracker)', 'ghash v0.4.4', 'mmds v0.1.0 (/firecracker/src/mmds)', 'kvm-bindings v0.5.0 (https://github.com/firecracker-microvm/kvm-bindings?tag=v0.5.0-1#4569d3f5)', 'cpuid v0.1.0 (/firecracker/src/cpuid)', 'rebase-snap 
v0.1.0 (/firecracker/src/rebase-snap)', 'vm-memory v0.7.0', 'base64 v0.13.0', 'virtio_gen v0.1.0 (/firecracker/src/virtio_gen)', 'lazy_static v1.4.0', 'vmm v0.1.0 (/firecracker/src/vmm)'} \ No newline at end of file +{'serde v1.0.136', 'lazy_static v1.4.0', 'syn v1.0.86', 'micro_http v0.1.0 (https://github.com/firecracker-microvm/micro-http?rev=0a58eb1#0a58eb1e)', 'ghash v0.4.4', 'serde_derive v1.0.136 (proc-macro)', 'bincode v1.3.3', 'serde_json v1.0.78', 'lazycell v1.3.0', 'thiserror-impl v1.0.30 (proc-macro)', 'minimal-lexical v0.2.1', 'glob v0.3.0', 'unicode-xid v0.2.2', 'ctr v0.8.0', 'vm-memory v0.1.0 (/firecracker/src/vm-memory)', 'kvm-ioctls v0.11.0', 'itoa v1.0.1', 'libc v0.2.117', 'bitflags v1.3.2', 'vm-memory v0.7.0', 'vmm v0.1.0 (/firecracker/src/vmm)', 'virtio_gen v0.1.0 (/firecracker/src/virtio_gen)', 'proc-macro2 v1.0.36', 'ryu v1.0.9', 'generic-array v0.14.5', 'subtle v2.4.1', 'timerfd v1.2.0', 'libloading v0.7.3', 'io_uring v0.1.0 (/firecracker/src/io_uring)', 'rate_limiter v0.1.0 (/firecracker/src/rate_limiter)', 'universal-hash v0.4.1', 'peeking_take_while v0.1.2', 'shlex v1.1.0', 'memchr v2.4.1', 'regex v1.5.4', 'bindgen v0.59.2', 'vm-fdt v0.1.0', 'vm-superio v0.5.0', 'linux-loader v0.4.0', 'nix v0.23.1', 'clang-sys v1.3.1', 'version_check v0.9.4', 'thiserror v1.0.30', 'autocfg v1.0.1', 'mmds v0.1.0 (/firecracker/src/mmds)', 'api_server v0.1.0 (/firecracker/src/api_server)', 'rebase-snap v0.1.0 (/firecracker/src/rebase-snap)', 'seccompiler v1.0.0 (/firecracker/src/seccompiler)', 'cc v1.0.73', 'typenum v1.15.0', 'kvm-bindings v0.5.0 (https://github.com/firecracker-microvm/kvm-bindings?tag=v0.5.0-1#4569d3f5)', 'cfg-if v0.1.10', 'vmm-sys-util v0.9.0', 'opaque-debug v0.3.0', 'quote v1.0.15', 'dumbo v0.1.0 (/firecracker/src/dumbo)', 'versionize v0.1.6', 'event-manager v0.2.1', 'regex-syntax v0.6.25', 'userfaultfd-sys v0.4.1', 'polyval v0.5.3', 'userfaultfd v0.4.2', 'cpufeatures v0.2.1', 'logger v0.1.0 (/firecracker/src/logger)', 'cfg-if v1.0.0', 
'snapshot v0.1.0 (/firecracker/src/snapshot)', 'utils v0.1.0 (/firecracker/src/utils)', 'aes-gcm v0.9.4', 'log v0.4.14', 'cpuid v0.1.0 (/firecracker/src/cpuid)', 'versionize_derive v0.1.4 (proc-macro)', 'memoffset v0.6.5', 'jailer v1.0.0 (/firecracker/src/jailer)', 'firecracker v1.0.0 (/firecracker/src/firecracker)', 'crc64 v1.0.0', 'nom v7.1.0', 'base64 v0.13.0', 'aead v0.4.3', 'devices v0.1.0 (/firecracker/src/devices)', 'cipher v0.3.0', 'arch v0.1.0 (/firecracker/src/arch)', 'rustc-hash v1.1.0', 'cexpr v0.6.0', 'net_gen v0.1.0 (/firecracker/src/net_gen)', 'arch_gen v0.1.0 (/firecracker/src/arch_gen)', 'aes v0.7.5'} \ No newline at end of file diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index 6b25f5e9964..14779d1cf43 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -21,7 +21,6 @@ from threading import Lock from retry import retry -from retry.api import retry_call import host_tools.logging as log_tools import host_tools.cpu_load as cpu_tools @@ -552,7 +551,7 @@ def spawn(self, create_logger=True, # to be run by customers (together with CLONE_NEWPID flag). # # We have to use an external tool for CLONE_NEWPID, because - # 1) Python doesn't provide a os.clone() interface, and + # 1) Python doesn't provide os.clone() interface, and # 2) Python's ctypes libc interface appears to be broken, causing # our clone / exec to deadlock at some point. if self._jailer.daemonize: @@ -560,44 +559,13 @@ def spawn(self, create_logger=True, else: # This file will collect any output from 'screen'ed Firecracker. 
self._screen_log = self.SCREEN_LOGFILE.format(self._session_name) - start_cmd = 'screen -L -Logfile {logfile} '\ - '-dmS {session} {binary} {params}' - start_cmd = start_cmd.format( - logfile=self.screen_log, - session=self._session_name, - binary=self._jailer_binary_path, - params=' '.join(jailer_param_list) + screen_pid, binary_pid = utils.start_screen_process( + self._screen_log, self._session_name, + self._jailer_binary_path, + jailer_param_list ) - - utils.run_cmd(start_cmd) - - # Build a regex object to match (number).session_name - regex_object = re.compile( - r'([0-9]+)\.{}'.format(self._session_name)) - - # Run 'screen -ls' in a retry_call loop, 30 times with a one - # second delay between calls. - # If the output of 'screen -ls' matches the regex object, it will - # return the PID. Otherwise a RuntimeError will be raised. - screen_pid = retry_call( - utils.search_output_from_cmd, - fkwargs={ - "cmd": 'screen -ls', - "find_regex": regex_object - }, - exceptions=RuntimeError, - tries=30, - delay=1).group(1) - self._screen_pid = screen_pid - - self.jailer_clone_pid = int(open('/proc/{0}/task/{0}/children' - .format(screen_pid), - encoding='utf-8').read().strip()) - - # Configure screen to flush stdout to file. - flush_cmd = 'screen -S {session} -X colon "logfile flush 0^M"' - utils.run_cmd(flush_cmd.format(session=self._session_name)) + self.jailer_clone_pid = binary_pid # Wait for the jailer to create resources needed, and Firecracker to # create its API socket. 
diff --git a/tests/framework/resources.py b/tests/framework/resources.py index afe1fd5bd89..67a7af70a54 100644 --- a/tests/framework/resources.py +++ b/tests/framework/resources.py @@ -454,21 +454,37 @@ def __init__(self, api_usocket_full_name, api_session): self._snapshot_cfg_url = api_url + self.SNAPSHOT_LOAD_URL self._api_session = api_session - def put(self, **args): + def put(self, timeout=None, **args): """Load a snapshot of the microvm.""" datax = self.create_json(**args) return self._api_session.put( "{}".format(self._snapshot_cfg_url), - json=datax + json=datax, + timeout=timeout ) @staticmethod - def create_json(mem_file_path, snapshot_path, diff=False, resume=False): + def create_json( + snapshot_path, + diff=False, + resume=False, + mem_backend=None, + mem_file_path=None + ): """Compose the json associated to this type of API request.""" - datax = { - 'mem_file_path': mem_file_path, - 'snapshot_path': snapshot_path, - } + if mem_file_path: + datax = { + 'mem_file_path': mem_file_path, + 'snapshot_path': snapshot_path, + } + else: + datax = { + 'mem_backend': { + 'backend_type': str(mem_backend['type'].value), + 'backend_path': mem_backend['path'] + }, + 'snapshot_path': snapshot_path, + } if diff: datax['enable_diff_snapshots'] = True if resume: @@ -494,13 +510,23 @@ def create(self, mem_file_path, snapshot_path, diff=False, version=None): version=version ) - def load(self, mem_file_path, snapshot_path, diff=False, resume=False): + def load( + self, + snapshot_path, + diff=False, + resume=False, + mem_file_path=None, + mem_backend=None, + timeout=None + ): """Load a snapshot of the microvm.""" response = self._load.put( - mem_file_path=mem_file_path, snapshot_path=snapshot_path, diff=diff, - resume=resume + resume=resume, + mem_file_path=mem_file_path, + mem_backend=mem_backend, + timeout=timeout ) if resume and "unknown field `resume_vm`" in response.text: @@ -548,9 +574,7 @@ def patch(self, **args): ) @staticmethod - def create_json( - 
metrics_path=None, - ): + def create_json(metrics_path=None): """Compose the json associated to this type of API request.""" datax = {} if metrics_path is not None: diff --git a/tests/framework/utils.py b/tests/framework/utils.py index ad9065f800d..158e5c97ed1 100644 --- a/tests/framework/utils.py +++ b/tests/framework/utils.py @@ -16,7 +16,7 @@ from collections import namedtuple, defaultdict import psutil from retry import retry - +from retry.api import retry_call from framework.defs import MIN_KERNEL_VERSION_FOR_IO_URING CommandReturn = namedtuple("CommandReturn", "returncode stdout stderr") @@ -78,6 +78,33 @@ def get_cpu_percent(pid: int) -> float: return cpu_percentages +class UffdHandler: + """Describe the UFFD page fault handler process.""" + + def __init__(self, name, args): + """Instantiate the handler process with arguments.""" + self._proc = None + self._args = [f"/{name}"] + self._args.extend(args) + + def spawn(self): + """Spawn handler process using arguments provided.""" + self._proc = subprocess.Popen( + self._args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + bufsize=1 + ) + + def proc(self): + """Return UFFD handler process.""" + return self._proc + + def __del__(self): + """Tear down the UFFD handler process.""" + self._proc.kill() + + # pylint: disable=R0903 class CpuMap: """Cpu map from real cpu cores to containers visible cores. @@ -611,6 +638,49 @@ def compare_versions(first, second): return 0 +def sanitize_version(version): + """ + Get rid of dirty version information. + + Transform version from format `vX.Y.Z-W` to `X.Y.Z`. + """ + if version[0].isalpha(): + version = version[1:] + + return version.split("-", 1)[0] + + +def compare_dirty_versions(first, second): + """ + Compare two versions out of which one is dirty. + + We do not allow both versions to be dirty, because dirty info + does not reveal any ordering information. 
+ + :param first: first version string + :param second: second version string + :returns: 0 if equal, <0 if first < second, >0 if second < first + """ + is_first_dirty = "-" in first + first = sanitize_version(first) + + is_second_dirty = "-" in second + second = sanitize_version(second) + + if is_first_dirty and is_second_dirty: + raise ValueError + + diff = compare_versions(first, second) + if diff != 0: + return diff + if is_first_dirty: + return 1 + if is_second_dirty: + return -1 + + return diff + + def get_kernel_version(level=2): """Return the current kernel version in format `major.minor.patch`.""" linux_version = platform.release() @@ -686,3 +756,46 @@ def configure_mmds(test_microvm, iface_ids, version=None, ipv4_address=None, assert test_microvm.api_session.is_status_no_content(response.status_code) return response + + +def start_screen_process(screen_log, session_name, binary_path, binary_params): + """Start binary process into a screen session.""" + start_cmd = 'screen -L -Logfile {logfile} ' \ + '-dmS {session} {binary} {params}' + start_cmd = start_cmd.format( + logfile=screen_log, + session=session_name, + binary=binary_path, + params=' '.join(binary_params) + ) + + run_cmd(start_cmd) + + # Build a regex object to match (number).session_name + regex_object = re.compile( + r'([0-9]+)\.{}'.format(session_name)) + + # Run 'screen -ls' in a retry_call loop, 30 times with a 1s + # delay between calls. + # If the output of 'screen -ls' matches the regex object, it will + # return the PID. Otherwise, a RuntimeError will be raised. + screen_pid = retry_call( + search_output_from_cmd, + fkwargs={ + "cmd": 'screen -ls', + "find_regex": regex_object + }, + exceptions=RuntimeError, + tries=30, + delay=1).group(1) + + binary_clone_pid = int(open( + '/proc/{0}/task/{0}/children'.format(screen_pid), + encoding='utf-8' + ).read().strip()) + + # Configure screen to flush stdout to file. 
+ flush_cmd = 'screen -S {session} -X colon "logfile flush 0^M"' + run_cmd(flush_cmd.format(session=session_name)) + + return screen_pid, binary_clone_pid diff --git a/tests/host_tools/uffd/Cargo.lock b/tests/host_tools/uffd/Cargo.lock new file mode 100644 index 00000000000..0dbdc9f612a --- /dev/null +++ b/tests/host_tools/uffd/Cargo.lock @@ -0,0 +1,362 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "bindgen" +version = "0.59.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bd2a9a458e8f4304c52c43ebb0cfbd520289f8379a52e329a38afda99bf8eb8" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "lazy_static", + "lazycell", + "peeking_take_while", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "cc" +version = "1.0.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" + +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clang-sys" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cc00842eed744b858222c4c9faf7243aafc6d33f92f96935263ef4d8a41ce21" +dependencies = [ + "glob", + "libc", + "libloading", +] + +[[package]] +name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + +[[package]] +name = "itoa" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + +[[package]] +name = "libc" +version = "0.2.121" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efaa7b300f3b5fe8eb6bf21ce3895e1751d9665086af2d64b42f19701015ff4f" + +[[package]] +name = "libloading" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efbc0f03f9a775e9f6aed295c6a1ba2253c5757a9e03d55c6caa46a681abcddd" +dependencies = [ + "cfg-if 1.0.0", + "winapi", +] + +[[package]] +name = "memchr" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" + +[[package]] +name = "memoffset" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" +dependencies = [ + "autocfg", +] + 
+[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "net_gen" +version = "0.1.0" + +[[package]] +name = "nix" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f866317acbd3a240710c63f065ffb1e4fd466259045ccb504130b7f668f35c6" +dependencies = [ + "bitflags", + "cc", + "cfg-if 1.0.0", + "libc", + "memoffset", +] + +[[package]] +name = "nom" +version = "7.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8903e5a29a317527874d0402f867152a3d21c908bb0b933e416c65e301d4c36" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "peeking_take_while" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" + +[[package]] +name = "proc-macro2" +version = "1.0.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7342d5883fbccae1cc37a2353b09c87c9b0f3afd73f5fb9bba687a1f733b029" +dependencies = [ + "unicode-xid", +] + +[[package]] +name = "quote" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "632d02bff7f874a36f33ea8bb416cd484b90cc66c1194b1a1110d067a7013f58" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regex" +version = "1.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286" +dependencies = [ + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.6.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "ryu" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73b4b750c782965c211b42f022f59af1fbceabdd026623714f104152f1ec149f" + +[[package]] +name = "serde" +version = "1.0.136" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce31e24b01e1e524df96f1c2fdd054405f8d7376249a5110886fb4b658484789" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.136" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08597e7152fcd306f41838ed3e37be9eaeed2b61c42e2117266a554fab4662f9" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e8d9fa5c3b304765ce1fd9c4c8a3de2c8db365a5b91be52f186efc675681d95" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "shlex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" + +[[package]] +name = "syn" +version = "1.0.90" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704df27628939572cd88d33f171cd6f896f4eaca85252c6e0a72d8d8287ee86f" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + +[[package]] +name = "thiserror" +version = "1.0.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "854babe52e4df1653706b98fcfc05843010039b406875930a70e4d9644e5c417" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa32fd3f627f367fe16f893e2597ae3c05020f8bba2666a4e6ea73d377e5714b" +dependencies = [ + 
"proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "uffd" +version = "1.1.0" +dependencies = [ + "libc", + "nix", + "serde", + "serde_json", + "userfaultfd", + "utils", +] + +[[package]] +name = "unicode-xid" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" + +[[package]] +name = "userfaultfd" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b738009e099b4ded1ecf19dfb7631f69c24f16e0af6d29fd9b3f54a092aca46" +dependencies = [ + "bitflags", + "cfg-if 1.0.0", + "libc", + "nix", + "thiserror", + "userfaultfd-sys", +] + +[[package]] +name = "userfaultfd-sys" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a4be003c705d2c8dc1234d473856945e291bb998ac2e2d83e70328d964d7458" +dependencies = [ + "bindgen", + "cc", + "cfg-if 0.1.10", +] + +[[package]] +name = "utils" +version = "0.1.0" +dependencies = [ + "libc", + "net_gen", + "serde", + "vmm-sys-util", +] + +[[package]] +name = "vmm-sys-util" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "733537bded03aaa93543f785ae997727b30d1d9f4a03b7861d23290474242e11" +dependencies = [ + "bitflags", + "libc", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/tests/host_tools/uffd/Cargo.toml b/tests/host_tools/uffd/Cargo.toml new file mode 100644 index 00000000000..f83ed0f9c3f --- /dev/null +++ b/tests/host_tools/uffd/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "uffd" +version = "1.1.0" +authors = ["Amazon Firecracker team "] +edition = "2018" + +[dependencies] +utils = { path = "../../../src/utils" } + +libc = ">=0.2.39" +nix = "0.23.0" +serde = { version = ">=1.0.27", features = ["derive"] } +serde_json = ">=1.0.9" +userfaultfd = ">=0.4.0" + +[workspace] + +[profile.dev] +panic = "abort" + +[profile.release] +panic = "abort" diff --git a/tests/host_tools/uffd/src/bin/malicious_handler.rs b/tests/host_tools/uffd/src/bin/malicious_handler.rs new file mode 100644 index 00000000000..0cc3790ad4d --- /dev/null +++ b/tests/host_tools/uffd/src/bin/malicious_handler.rs @@ -0,0 +1,32 @@ +// Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Provides functionality for a malicious page fault handler +//! which panics when a page fault occurs. + +use nix::poll::{poll, PollFd, PollFlags}; +use std::os::unix::io::AsRawFd; +use uffd::uffd_utils::create_pf_handler; + +fn main() { + let uffd_handler = create_pf_handler(); + let pollfd = PollFd::new(uffd_handler.uffd.as_raw_fd(), PollFlags::POLLIN); + + // Loop, handling incoming events on the userfaultfd file descriptor. + loop { + let _ = poll(&mut [pollfd], -1).expect("Failed to poll"); + + // Read an event from the userfaultfd. + let event = uffd_handler + .uffd + .read_event() + .expect("Failed to read uffd_msg") + .expect("uffd_msg not ready"); + + // We expect to receive either a Page Fault or Removed + // event (if the balloon device is enabled). + if let userfaultfd::Event::Pagefault { .. } = event { + panic!("Fear me! 
I am the malicious page fault handler.") + } + } +} diff --git a/tests/host_tools/uffd/src/bin/valid_handler.rs b/tests/host_tools/uffd/src/bin/valid_handler.rs new file mode 100644 index 00000000000..fc11d10c28c --- /dev/null +++ b/tests/host_tools/uffd/src/bin/valid_handler.rs @@ -0,0 +1,50 @@ +// Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Provides functionality for a userspace page fault handler +//! which loads the whole region from the backing memory file +//! when a page fault occurs. + +use nix::poll::{poll, PollFd, PollFlags}; +use std::os::unix::io::AsRawFd; + +use uffd::uffd_utils::{create_pf_handler, MemPageState}; + +fn main() { + let mut uffd_handler = create_pf_handler(); + + let pollfd = PollFd::new(uffd_handler.uffd.as_raw_fd(), PollFlags::POLLIN); + + // Loop, handling incoming events on the userfaultfd file descriptor. + loop { + // See what poll() tells us about the userfaultfd. + let nready = poll(&mut [pollfd], -1).expect("Failed to poll"); + + let revents = pollfd.revents().unwrap(); + println!( + "poll() returns: nready = {}; POLLIN = {}; POLLERR = {}", + nready, + revents.contains(PollFlags::POLLIN), + revents.contains(PollFlags::POLLERR), + ); + + // Read an event from the userfaultfd. + let event = uffd_handler + .uffd + .read_event() + .expect("Failed to read uffd_msg") + .expect("uffd_msg not ready"); + + // We expect to receive either a Page Fault or Removed + // event (if the balloon device is enabled). + match event { + userfaultfd::Event::Pagefault { addr, .. 
} => uffd_handler.serve_pf(addr as *mut u8), + userfaultfd::Event::Remove { start, end } => uffd_handler.update_mem_state_mappings( + start as *mut u8 as u64, + end as *mut u8 as u64, + &MemPageState::Removed, + ), + _ => panic!("Unexpected event on userfaultfd"), + } + } +} diff --git a/tests/host_tools/uffd/src/lib.rs b/tests/host_tools/uffd/src/lib.rs new file mode 100644 index 00000000000..a27e955e4c8 --- /dev/null +++ b/tests/host_tools/uffd/src/lib.rs @@ -0,0 +1,4 @@ +// Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +pub mod uffd_utils; diff --git a/tests/host_tools/uffd/src/uffd_utils.rs b/tests/host_tools/uffd/src/uffd_utils.rs new file mode 100644 index 00000000000..12795c8eb1c --- /dev/null +++ b/tests/host_tools/uffd/src/uffd_utils.rs @@ -0,0 +1,247 @@ +// Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::collections::HashMap; +use std::fs::File; +use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd}; +use std::os::unix::net::{UnixListener, UnixStream}; +use std::{mem, ptr}; + +use libc::c_void; +use nix::sys::mman::{mmap, MapFlags, ProtFlags}; +use serde::Deserialize; +use userfaultfd::Uffd; +use utils::get_page_size; + +use utils::sock_ctrl_msg::ScmSocket; + +// This is the same with the one used in src/vmm. +/// This describes the mapping between Firecracker base virtual address and offset in the +/// buffer or file backend for a guest memory region. It is used to tell an external +/// process/thread where to populate the guest memory data for this range. +/// +/// E.g. Guest memory contents for a region of `size` bytes can be found in the backend +/// at `offset` bytes from the beginning, and should be copied/populated into `base_host_address`. 
+#[derive(Clone, Debug, Deserialize)] +pub struct GuestRegionUffdMapping { + /// Base host virtual address where the guest memory contents for this region + /// should be copied/populated. + pub base_host_virt_addr: u64, + /// Region size. + pub size: usize, + /// Offset in the backend file/buffer where the region contents are. + pub offset: u64, +} + +struct MemRegion { + mapping: GuestRegionUffdMapping, + page_states: HashMap, +} + +pub struct UffdPfHandler { + mem_regions: Vec, + backing_buffer: *const u8, + pub uffd: Uffd, + // Not currently used but included to demonstrate how a page fault handler can + // fetch Firecracker's PID in order to make it aware of any crashes/exits. + _firecracker_pid: u32, +} + +#[derive(Clone)] +pub enum MemPageState { + Uninitialized, + FromFile, + Removed, + Anonymous, +} + +impl UffdPfHandler { + pub fn from_unix_stream(stream: UnixStream, data: *const u8, size: usize) -> Self { + let mut message_buf = vec![0u8; 1024]; + let (bytes_read, file) = stream + .recv_with_fd(&mut message_buf[..]) + .expect("Cannot recv_with_fd"); + message_buf.resize(bytes_read, 0); + + let body = String::from_utf8(message_buf).unwrap(); + let file = file.expect("Uffd not passed through UDS!"); + + let mappings = serde_json::from_str::>(&body) + .expect("Cannot deserialize memory mappings."); + let memsize: usize = mappings.iter().map(|r| r.size).sum(); + + // Make sure memory size matches backing data size. 
+ assert_eq!(memsize, size); + + let uffd = unsafe { Uffd::from_raw_fd(file.into_raw_fd()) }; + + let creds: libc::ucred = get_peer_process_credentials(stream); + + let mem_regions = create_mem_regions(&mappings); + + Self { + mem_regions, + backing_buffer: data, + uffd, + _firecracker_pid: creds.pid as u32, + } + } + + pub fn update_mem_state_mappings(&mut self, start: u64, end: u64, state: &MemPageState) { + for region in self.mem_regions.iter_mut() { + for (key, value) in region.page_states.iter_mut() { + if key >= &start && key < &end { + *value = state.clone(); + } + } + } + } + + fn populate_from_file(&self, region: &MemRegion) -> (u64, u64) { + let src = self.backing_buffer as u64 + region.mapping.offset; + let start_addr = region.mapping.base_host_virt_addr; + let len = region.mapping.size; + // Populate whole region from backing mem-file. + // This offers an example of how memory can be loaded in RAM, + // however this can be adjusted to accommodate use case needs. + let ret = unsafe { + self.uffd + .copy(src as *const _, start_addr as *mut _, len, true) + .expect("Uffd copy failed") + }; + + // Make sure the UFFD copied some bytes. + assert!(ret > 0); + + return (start_addr, start_addr + len as u64); + } + + fn zero_out(&mut self, addr: u64) -> (u64, u64) { + let page_size = get_page_size().unwrap(); + + let ret = unsafe { + self.uffd + .zeropage(addr as *mut _, page_size, true) + .expect("Uffd zeropage failed") + }; + // Make sure the UFFD zeroed out some bytes. + assert!(ret > 0); + + return (addr, addr + page_size as u64); + } + + pub fn serve_pf(&mut self, addr: *mut u8) { + let page_size = get_page_size().unwrap(); + + // Find the start of the page that the current faulting address belongs to. + let dst = (addr as usize & !(page_size as usize - 1)) as *mut c_void; + let fault_page_addr = dst as u64; + + // Get the state of the current faulting page. 
+ for region in self.mem_regions.iter() { + match region.page_states.get(&fault_page_addr) { + // Our simple PF handler has a simple strategy: + // There exist 4 states in which a memory page can be in: + // 1. Uninitialized - page was never touched + // 2. FromFile - the page is populated with content from snapshotted memory file + // 3. Removed - MADV_DONTNEED was called due to balloon inflation + // 4. Anonymous - page was zeroed out -> this implies that more than one page fault + // event was received. This can be a consequence of guest reclaiming back its + // memory from the host (through balloon device) + Some(MemPageState::Uninitialized) | Some(MemPageState::FromFile) => { + let (start, end) = self.populate_from_file(region); + self.update_mem_state_mappings(start, end, &MemPageState::FromFile); + return; + } + Some(MemPageState::Removed) | Some(MemPageState::Anonymous) => { + let (start, end) = self.zero_out(fault_page_addr); + self.update_mem_state_mappings(start, end, &MemPageState::Anonymous); + return; + } + None => { + (); + } + } + } + + panic!( + "Could not find addr: {:?} within guest region mappings.", + addr + ); + } +} + +fn get_peer_process_credentials(stream: UnixStream) -> libc::ucred { + let mut creds: libc::ucred = libc::ucred { + pid: 0, + gid: 0, + uid: 0, + }; + let mut creds_size = mem::size_of::() as u32; + + let ret = unsafe { + libc::getsockopt( + stream.as_raw_fd(), + libc::SOL_SOCKET, + libc::SO_PEERCRED, + &mut creds as *mut _ as *mut _, + &mut creds_size as *mut libc::socklen_t, + ) + }; + if ret != 0 { + panic!("Failed to get peer process credentials"); + } + + creds +} + +fn create_mem_regions(mappings: &Vec) -> Vec { + let page_size = get_page_size().unwrap(); + let mut mem_regions: Vec = Vec::with_capacity(mappings.len()); + + for r in mappings.iter() { + let mapping = r.clone(); + let mut addr = r.base_host_virt_addr; + let end_addr = r.base_host_virt_addr + r.size as u64; + let mut page_states = HashMap::new(); + + while 
addr < end_addr { + page_states.insert(addr, MemPageState::Uninitialized); + addr += page_size as u64; + } + mem_regions.push(MemRegion { + mapping, + page_states, + }); + } + + mem_regions +} + +pub fn create_pf_handler() -> UffdPfHandler { + let uffd_sock_path = std::env::args().nth(1).expect("No socket path given"); + let mem_file_path = std::env::args().nth(2).expect("No memory file given"); + + let file = File::open(mem_file_path).expect("Cannot open memfile"); + let size = file.metadata().unwrap().len() as usize; + + // mmap a memory area used to bring in the faulting regions. + let memfile_buffer = unsafe { + mmap( + ptr::null_mut(), + size, + ProtFlags::PROT_READ, + MapFlags::MAP_PRIVATE, + file.as_raw_fd(), + 0, + ) + .expect("mmap failed") + } as *const u8; + + // Get Uffd from UDS. We'll use the uffd to handle PFs for Firecracker. + let listener = UnixListener::bind(&uffd_sock_path).expect("Cannot bind to socket path"); + + let (stream, _) = listener.accept().expect("Cannot listen on UDS socket"); + + UffdPfHandler::from_unix_stream(stream, memfile_buffer, size) +} diff --git a/tests/integration_tests/build/test_coverage.py b/tests/integration_tests/build/test_coverage.py index 54693cabbf5..ef466670a25 100644 --- a/tests/integration_tests/build/test_coverage.py +++ b/tests/integration_tests/build/test_coverage.py @@ -29,9 +29,9 @@ # Checkout the cpuid crate. In the future other # differences may appear. 
if utils.is_io_uring_supported(): - COVERAGE_DICT = {"Intel": 85.12, "AMD": 84.60, "ARM": 84.17} + COVERAGE_DICT = {"Intel": 84.89, "AMD": 84.38, "ARM": 83.96} else: - COVERAGE_DICT = {"Intel": 82.14, "AMD": 81.62, "ARM": 81.17} + COVERAGE_DICT = {"Intel": 81.94, "AMD": 81.43, "ARM": 80.96} PROC_MODEL = proc.proc_type() diff --git a/tests/integration_tests/functional/test_api.py b/tests/integration_tests/functional/test_api.py index b190502e724..6a7f89cb597 100644 --- a/tests/integration_tests/functional/test_api.py +++ b/tests/integration_tests/functional/test_api.py @@ -16,11 +16,11 @@ import host_tools.drive as drive_tools import host_tools.network as net_tools -from conftest import _test_images_s3_bucket +from conftest import _test_images_s3_bucket, init_microvm from framework.utils import is_io_uring_supported -from framework.artifacts import ArtifactCollection, SnapshotType, \ - NetIfaceConfig, DEFAULT_DEV_NAME, DEFAULT_TAP_NAME +from framework.artifacts import ArtifactCollection, NetIfaceConfig, \ + DEFAULT_DEV_NAME, DEFAULT_TAP_NAME, SnapshotType from framework.builder import MicrovmBuilder, SnapshotBuilder MEM_LIMIT = 1000000000 @@ -1467,7 +1467,11 @@ def test_get_full_config_after_restoring_snapshot(bin_cloner_path): ssh_key, SnapshotType.FULL) - microvm, _ = microvm_builder.build_from_snapshot(snapshot, True, False) + microvm, _ = microvm_builder.build_from_snapshot( + snapshot, + resume=True, + diff_snapshots=False + ) expected_cfg = setup_cfg.copy() @@ -1640,3 +1644,110 @@ def test_map_private_seccomp_regression(test_microvm_with_ssh): response = test_microvm.mmds.put(json=data_store) assert test_microvm.api_session.is_status_no_content(response.status_code) + + +# pylint: disable=protected-access +def test_negative_snapshot_load_api(bin_cloner_path): + """ + Test snapshot load API. 
+ + @type: negative + """ + vm_builder = MicrovmBuilder(bin_cloner_path) + vm = init_microvm(vm_builder.root_path, vm_builder.bin_cloner_path) + vm.spawn() + + # Specifying both `mem_backend` and 'mem_file_path` should fail. + datax = { + 'snapshot_path': 'foo', + 'mem_backend': { + 'backend_type': 'File', + 'backend_path': 'bar' + }, + 'mem_file_path': 'bar', + } + response = vm.snapshot._load._api_session.put( + "{}".format(vm.snapshot._load._snapshot_cfg_url), + json=datax + ) + err_msg = "too many fields: either `mem_backend` or " \ + "`mem_file_path` exclusively is required." + assert err_msg in response.text, response.text + + # API request with `mem_backend` but no `backend_type` should fail. + datax = { + 'snapshot_path': 'foo', + 'mem_backend': { + 'backend_path': 'bar' + } + } + response = vm.snapshot._load._api_session.put( + "{}".format(vm.snapshot._load._snapshot_cfg_url), + json=datax + ) + err_msg = "missing field `backend_type`" + assert err_msg in response.text, response.text + + # API request with `mem_backend` but no `backend_path` should fail. + datax = { + 'snapshot_path': 'foo', + 'mem_backend': { + 'backend_type': 'File' + } + } + response = vm.snapshot._load._api_session.put( + "{}".format(vm.snapshot._load._snapshot_cfg_url), + json=datax + ) + err_msg = "missing field `backend_path`" + assert err_msg in response.text, response.text + + # API request with invalid `backend_type` should fail. + datax = { + 'snapshot_path': 'foo', + 'mem_backend': { + 'backend_type': 'foo', + 'backend_path': 'foo' + } + } + response = vm.snapshot._load._api_session.put( + "{}".format(vm.snapshot._load._snapshot_cfg_url), + json=datax + ) + err_msg = "unknown variant `foo`, expected `File` or `Uffd`" + assert err_msg in response.text, response.text + + # API request without `snapshot_path` should fail. 
+ datax = { + 'mem_backend': { + 'backend_type': 'File', + 'backend_path': 'foo' + } + } + response = vm.snapshot._load._api_session.put( + "{}".format(vm.snapshot._load._snapshot_cfg_url), + json=datax + ) + err_msg = "missing field `snapshot_path`" + assert err_msg in response.text, response.text + + # API request without `mem_backend` or `mem_file_path` should fail. + datax = {'snapshot_path': 'foo'} + response = vm.snapshot._load._api_session.put( + "{}".format(vm.snapshot._load._snapshot_cfg_url), + json=datax + ) + err_msg = "missing field: either `mem_backend` or " \ + "`mem_file_path` is required" + assert err_msg in response.text, response.text + + # Deprecated API should return deprecation response header. + datax = { + 'snapshot_path': 'foo', + 'mem_file_path': 'bar' + } + response = vm.snapshot._load._api_session.put( + "{}".format(vm.snapshot._load._snapshot_cfg_url), + json=datax + ) + assert response.headers['deprecation'] diff --git a/tests/integration_tests/functional/test_balloon.py b/tests/integration_tests/functional/test_balloon.py index 81afa8c910d..58d4bec8931 100644 --- a/tests/integration_tests/functional/test_balloon.py +++ b/tests/integration_tests/functional/test_balloon.py @@ -646,9 +646,11 @@ def _test_balloon_snapshot(context): basevm.kill() logger.info("Load snapshot #{}, mem {}".format(1, snapshot.mem)) - microvm, _ = vm_builder.build_from_snapshot(snapshot, - True, - diff_snapshots) + microvm, _ = vm_builder.build_from_snapshot( + snapshot, + resume=True, + diff_snapshots=diff_snapshots + ) # Attempt to connect to resumed microvm. 
ssh_connection = net_tools.SSHConnection(microvm.ssh_config) diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py index 04a8c6bc679..8cd5b2d4933 100644 --- a/tests/integration_tests/functional/test_snapshot_basic.py +++ b/tests/integration_tests/functional/test_snapshot_basic.py @@ -102,9 +102,11 @@ def _test_seq_snapshots(context): for i in range(seq_len): logger.info("Load snapshot #{}, mem {}".format(i, snapshot.mem)) - microvm, _ = vm_builder.build_from_snapshot(snapshot, - True, - diff_snapshots) + microvm, _ = vm_builder.build_from_snapshot( + snapshot, + resume=True, + diff_snapshots=diff_snapshots + ) # Attempt to connect to resumed microvm. ssh_connection = net_tools.SSHConnection(microvm.ssh_config) @@ -236,9 +238,11 @@ def test_patch_drive_snapshot(bin_cloner_path): # Load snapshot in a new Firecracker microVM. logger.info("Load snapshot, mem %s", snapshot.mem) - microvm, _ = vm_builder.build_from_snapshot(snapshot, - True, - diff_snapshots) + microvm, _ = vm_builder.build_from_snapshot( + snapshot, + resume=True, + diff_snapshots=diff_snapshots + ) # Attempt to connect to resumed microvm. ssh_connection = net_tools.SSHConnection(microvm.ssh_config) @@ -446,9 +450,11 @@ def test_negative_postload_api(bin_cloner_path): logger.info("Load snapshot, mem %s", snapshot.mem) # Do not resume, just load, so we can still call APIs that work. 
- microvm, _ = vm_builder.build_from_snapshot(snapshot, - False, - True) + microvm, _ = vm_builder.build_from_snapshot( + snapshot, + resume=False, + diff_snapshots=True + ) fail_msg = "The requested operation is not supported after starting " \ "the microVM" @@ -517,7 +523,11 @@ def test_negative_snapshot_permissions(bin_cloner_path): os.chmod(snapshot.mem, 0o000) try: - _, _ = vm_builder.build_from_snapshot(snapshot, True, True) + _, _ = vm_builder.build_from_snapshot( + snapshot, + resume=True, + diff_snapshots=True + ) except AssertionError as error: # Check if proper error is returned. assert "Cannot open the memory file: Permission denied" in str(error) @@ -528,7 +538,11 @@ def test_negative_snapshot_permissions(bin_cloner_path): os.chmod(snapshot.vmstate, 0o000) try: - _, _ = vm_builder.build_from_snapshot(snapshot, True, True) + _, _ = vm_builder.build_from_snapshot( + snapshot, + resume=True, + diff_snapshots=True + ) except AssertionError as error: # Check if proper error is returned. assert "Cannot perform open on the snapshot backing file:" \ @@ -544,7 +558,11 @@ def test_negative_snapshot_permissions(bin_cloner_path): os.chmod(snapshot.disks[0], 0o000) try: - _, _ = vm_builder.build_from_snapshot(snapshot, True, True) + _, _ = vm_builder.build_from_snapshot( + snapshot, + resume=True, + diff_snapshots=True + ) except AssertionError as error: # Check if proper error is returned. assert "Block(BackingFile(Os { code: 13, kind: PermissionDenied" \ diff --git a/tests/integration_tests/functional/test_uffd.py b/tests/integration_tests/functional/test_uffd.py new file mode 100644 index 00000000000..482fffe12e7 --- /dev/null +++ b/tests/integration_tests/functional/test_uffd.py @@ -0,0 +1,252 @@ +# Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 +"""Test UFFD related functionality when resuming from snapshot.""" + +import logging +import os +import socket +from subprocess import TimeoutExpired + +import stat + +import requests +import urllib3 + +from framework.artifacts import SnapshotMemBackendType +from framework.builder import MicrovmBuilder, SnapshotBuilder +from framework.utils import run_cmd, UffdHandler + +import host_tools.network as net_tools + +SOCKET_PATH = "/firecracker-uffd.sock" + + +def create_snapshot(bin_cloner_path): + """Create a snapshot of a microVM.""" + vm_builder = MicrovmBuilder(bin_cloner_path) + vm_instance = vm_builder.build_vm_nano() + basevm = vm_instance.vm + root_disk = vm_instance.disks[0] + ssh_key = vm_instance.ssh_key + + # Add a memory balloon. + response = basevm.balloon.put( + amount_mib=0, + deflate_on_oom=True, + stats_polling_interval_s=0 + ) + assert basevm.api_session.is_status_no_content(response.status_code) + + basevm.start() + ssh_connection = net_tools.SSHConnection(basevm.ssh_config) + + # Verify if guest can run commands. + exit_code, _, _ = ssh_connection.execute_command("sync") + assert exit_code == 0 + + # Create a snapshot builder from a microvm. + snapshot_builder = SnapshotBuilder(basevm) + + # Create base snapshot. + snapshot = snapshot_builder.create([root_disk.local_path()], + ssh_key) + + basevm.kill() + + return snapshot + + +def spawn_pf_handler(vm, handler_path, mem_path): + """Spawn page fault handler process.""" + # Copy snapshot memory file into chroot of microVM. + jailed_mem = vm.create_jailed_resource(mem_path) + # Copy the valid page fault binary into chroot of microVM. 
+ jailed_handler = vm.create_jailed_resource(handler_path) + + handler_name = os.path.basename(jailed_handler) + args = [SOCKET_PATH, jailed_mem] + + uffd_handler = UffdHandler(handler_name, args) + real_root = os.open("/", os.O_RDONLY) + + os.chroot(vm.chroot()) + os.chdir('/') + st = os.stat(handler_name) + os.chmod(handler_name, st.st_mode | stat.S_IEXEC) + + uffd_handler.spawn() + try: + outs, errs = uffd_handler.proc().communicate(timeout=1) + print(outs) + print(errs) + assert False, "Could not start PF handler!" + except TimeoutExpired: + print("This is the good case!") + + # The page fault handler will create the socket path with root rights. + # Change rights to the jailer's. + os.chown(SOCKET_PATH, vm.jailer.uid, vm.jailer.gid) + + os.fchdir(real_root) + os.chroot(".") + + return uffd_handler + + +def test_bad_socket_path(bin_cloner_path, test_microvm_with_api): + """ + Test error scenario when socket path does not exist. + + @type: negative + """ + logger = logging.getLogger("uffd_bad_socket_path") + + logger.info("Create snapshot") + snapshot = create_snapshot(bin_cloner_path) + + logger.info("Load snapshot, mem %s", snapshot.mem) + vm = test_microvm_with_api + vm.spawn() + jailed_vmstate = vm.create_jailed_resource(snapshot.vmstate) + + response = vm.snapshot.load( + mem_backend={ + 'type': SnapshotMemBackendType.UFFD, + 'path': 'inexsistent' + }, + snapshot_path=jailed_vmstate + ) + + assert vm.api_session.is_status_bad_request(response.status_code) + assert "Load microVM snapshot error: Cannot connect to UDS in order to " \ + "send information on handling guest memory page-faults due to: " \ + "No such file or directory (os error 2)" in response.text + + +def test_unbinded_socket(bin_cloner_path, test_microvm_with_api): + """ + Test error scenario when PF handler has not yet called bind on socket. 
+ + @type: negative + """ + logger = logging.getLogger("uffd_unbinded_socket") + + logger.info("Create snapshot") + snapshot = create_snapshot(bin_cloner_path) + + logger.info("Load snapshot, mem %s", snapshot.mem) + vm = test_microvm_with_api + vm.spawn() + jailed_vmstate = vm.create_jailed_resource(snapshot.vmstate) + + socket_path = os.path.join(vm.path, "firecracker-uffd.sock") + run_cmd("touch {}".format(socket_path)) + jailed_sock_path = vm.create_jailed_resource(socket_path) + + response = vm.snapshot.load( + mem_backend={ + 'type': SnapshotMemBackendType.UFFD, + 'path': jailed_sock_path + }, + snapshot_path=jailed_vmstate + ) + + assert vm.api_session.is_status_bad_request(response.status_code) + assert "Load microVM snapshot error: Cannot connect to UDS in order to" \ + " send information on handling guest memory page-faults due to: " \ + "Connection refused (os error 111)" in response.text + + +def test_valid_handler(bin_cloner_path, + test_microvm_with_api, + uffd_handler_paths): + """ + Test valid uffd handler scenario. + + @type: functional + """ + logger = logging.getLogger("uffd_unbinded_socket") + + logger.info("Create snapshot") + snapshot = create_snapshot(bin_cloner_path) + + logger.info("Load snapshot, mem %s", snapshot.mem) + vm_builder = MicrovmBuilder(bin_cloner_path) + vm = test_microvm_with_api + vm.spawn() + + # Spawn page fault handler process. + _pf_handler = spawn_pf_handler( + vm, + uffd_handler_paths['valid_handler'], + snapshot.mem + ) + + vm, _ = vm_builder.build_from_snapshot(snapshot, vm=vm, + resume=True, + uffd_path=SOCKET_PATH) + + # Inflate balloon. + response = vm.balloon.patch(amount_mib=200) + assert vm.api_session.is_status_no_content( + response.status_code + ) + + # Deflate balloon. + response = vm.balloon.patch(amount_mib=0) + assert vm.api_session.is_status_no_content( + response.status_code + ) + + # Verify if guest can run commands. 
+ ssh_connection = net_tools.SSHConnection(vm.ssh_config) + exit_code, _, _ = ssh_connection.execute_command("sync") + assert exit_code == 0 + + +def test_malicious_handler(bin_cloner_path, + test_microvm_with_api, + uffd_handler_paths): + """ + Test malicious uffd handler scenario. + + The page fault handler panics when receiving a page fault, + so no events are handled and snapshot memory regions cannot be + loaded into memory. In this case, Firecracker is designed to freeze, + instead of silently switching to having the kernel handle page + faults, so that it becomes obvious that something went wrong. + + @type: negative + """ + logger = logging.getLogger("uffd_unbinded_socket") + + logger.info("Create snapshot") + snapshot = create_snapshot(bin_cloner_path) + + logger.info("Load snapshot, mem %s", snapshot.mem) + vm_builder = MicrovmBuilder(bin_cloner_path) + vm = test_microvm_with_api + vm.spawn() + + # Spawn page fault handler process. + _pf_handler = spawn_pf_handler( + vm, + uffd_handler_paths['malicious_handler'], + snapshot.mem + ) + + # We expect Firecracker to freeze while resuming from a snapshot + # due to the malicious handler's unavailability. + try: + vm_builder.build_from_snapshot( + snapshot, vm=vm, + resume=True, + uffd_path=SOCKET_PATH, + timeout=30 + ) + assert False + except (socket.timeout, + urllib3.exceptions.ReadTimeoutError, + requests.exceptions.ReadTimeout) \ + as _err: + assert True, _err diff --git a/tests/integration_tests/functional/test_vsock.py b/tests/integration_tests/functional/test_vsock.py index 2984ae065ca..b7642154679 100644 --- a/tests/integration_tests/functional/test_vsock.py +++ b/tests/integration_tests/functional/test_vsock.py @@ -246,9 +246,11 @@ def test_vsock_transport_reset( test_vm.kill() # Load snapshot. 
- test_vm, _ = vm_builder.build_from_snapshot(snapshot, - True, - False) + test_vm, _ = vm_builder.build_from_snapshot( + snapshot, + resume=True, + diff_snapshots=False + ) # Check that vsock device still works. # Test guest-initiated connections. diff --git a/tests/integration_tests/performance/test_snapshot_perf.py b/tests/integration_tests/performance/test_snapshot_perf.py index f2860139f2b..49c9f23ace0 100644 --- a/tests/integration_tests/performance/test_snapshot_perf.py +++ b/tests/integration_tests/performance/test_snapshot_perf.py @@ -172,8 +172,8 @@ def snapshot_resume_producer( """Produce results for snapshot resume tests.""" microvm, metrics_fifo = vm_builder.build_from_snapshot( snapshot, - True, - snapshot_type == SnapshotType.DIFF, + resume=True, + diff_snapshots=snapshot_type == SnapshotType.DIFF, use_ramdisk=use_ramdisk) # Attempt to connect to resumed microvm. diff --git a/tools/devctr/Dockerfile.aarch64 b/tools/devctr/Dockerfile.aarch64 index 94fcf5fabde..80b8256fc70 100644 --- a/tools/devctr/Dockerfile.aarch64 +++ b/tools/devctr/Dockerfile.aarch64 @@ -26,6 +26,8 @@ ENV LC_ALL=C.UTF-8 RUN apt-get update \ && apt-get -y install --no-install-recommends \ binutils-dev \ + # Needed in order to be able to compile `userfaultfd-sys`. 
+ clang \ cmake \ curl \ file \ @@ -43,6 +45,7 @@ RUN apt-get update \ libssl-dev \ lsof \ make \ + musl-tools \ net-tools \ openssh-client \ pkgconf \ @@ -87,6 +90,14 @@ RUN mkdir "$TMP_BUILD_DIR" \ && cd / \ && rm -rf "$TMP_BUILD_DIR" +RUN ln -s /usr/bin/musl-gcc /usr/bin/aarch64-linux-musl-gcc + +# help musl-gcc find linux headers +RUN cd /usr/include/aarch64-linux-musl \ + && ln -s ../aarch64-linux-gnu/asm asm \ + && ln -s ../linux linux \ + && ln -s ../asm-generic asm-generic + # Build iperf3-vsock RUN mkdir "$TMP_BUILD_DIR" && cd "$TMP_BUILD_DIR" \ && git clone https://github.com/stefano-garzarella/iperf-vsock \ diff --git a/tools/devctr/Dockerfile.x86_64 b/tools/devctr/Dockerfile.x86_64 index 1e58b406681..74240787c4c 100644 --- a/tools/devctr/Dockerfile.x86_64 +++ b/tools/devctr/Dockerfile.x86_64 @@ -26,6 +26,8 @@ ENV LC_ALL=C.UTF-8 RUN apt-get update \ && apt-get -y install --no-install-recommends \ binutils-dev \ + # Needed in order to be able to compile `userfaultfd-sys`. + clang \ cmake \ curl \ file \ @@ -42,6 +44,7 @@ RUN apt-get update \ libcurl4-openssl-dev \ lsof \ make \ + musl-tools \ net-tools \ openssh-client \ pkgconf \ @@ -91,7 +94,7 @@ RUN mkdir "$TMP_BUILD_DIR" \ && cargo install cargo-kcov \ && cargo +"stable" install cargo-audit \ # Fix a version that does not require cargo edition 2021. 
- && cargo install cargo-deny --version '^0.9.1' \ + && cargo install --locked cargo-deny --version '^0.9.1' \ && cargo kcov --print-install-kcov-sh | sh \ && rm -rf "$CARGO_HOME/registry" \ && ln -s "$CARGO_REGISTRY_DIR" "$CARGO_HOME/registry" \ @@ -100,6 +103,12 @@ RUN mkdir "$TMP_BUILD_DIR" \ && cd / \ && rm -rf "$TMP_BUILD_DIR" +# help musl-gcc find linux headers +RUN cd /usr/include/x86_64-linux-musl \ + && ln -s ../x86_64-linux-gnu/asm asm \ + && ln -s ../linux linux \ + && ln -s ../asm-generic asm-generic + # Build iperf3-vsock RUN mkdir "$TMP_BUILD_DIR" && cd "$TMP_BUILD_DIR" \ && git clone https://github.com/stefano-garzarella/iperf-vsock \ diff --git a/tools/devtool b/tools/devtool index cd19d5dac28..6e5269d022a 100755 --- a/tools/devtool +++ b/tools/devtool @@ -72,7 +72,7 @@ DEVCTR_IMAGE_NO_TAG="public.ecr.aws/firecracker/fcuvm" # Development container tag -DEVCTR_IMAGE_TAG="v34" +DEVCTR_IMAGE_TAG="v35" # Development container image (name:tag) # This should be updated whenever we upgrade the development container.