diff --git a/example.nix b/example.nix deleted file mode 100755 index f04cc7a..0000000 --- a/example.nix +++ /dev/null @@ -1,265 +0,0 @@ -# This is someone else's network config, however they use VLANs. I am not using VLANs. I'm just connecting my microVMs to the general LAN for now. - -{ - enable = true; - netdevs = { - "vlan-mgmt" = { - netdevConfig = { - Name = "vlan-mgmt"; - Kind = "vlan"; - }; - vlanConfig.Id = 50; - }; - "br-mgmt" = { - netdevConfig = { - Name = "br-mgmt"; - Kind = "bridge"; - }; - }; - - "vlan-minio" = { - netdevConfig = { - Name = "vlan-minio"; - Kind = "vlan"; - }; - vlanConfig.Id = 51; - }; - "br-minio" = { - netdevConfig = { - Name = "br-minio"; - Kind = "bridge"; - }; - bridgeConfig = { - STP = true; - }; - }; - - "vlan-forgejo" = { - netdevConfig = { - Name = "vlan-forgejo"; - Kind = "vlan"; - }; - vlanConfig.Id = 52; - }; - "br-forgejo" = { - netdevConfig = { - Name = "br-forgejo"; - Kind = "bridge"; - }; - bridgeConfig = { - STP = true; - }; - }; - - "vlan-nexus" = { - netdevConfig = { - Name = "vlan-nexus"; - Kind = "vlan"; - }; - vlanConfig.Id = 53; - }; - "br-nexus" = { - netdevConfig = { - Name = "br-nexus"; - Kind = "bridge"; - }; - bridgeConfig = { - STP = true; - }; - }; - - "vlan-cloud" = { - netdevConfig = { - Name = "vlan-cloud"; - Kind = "vlan"; - }; - vlanConfig.Id = 54; - }; - "br-cloud" = { - netdevConfig = { - Name = "br-cloud"; - Kind = "bridge"; - }; - bridgeConfig = { - STP = true; - }; - }; - - "vlan-caddy" = { - netdevConfig = { - Name = "vlan-caddy"; - Kind = "vlan"; - }; - vlanConfig.Id = 55; - }; - "br-caddy" = { - netdevConfig = { - Name = "br-caddy"; - Kind = "bridge"; - }; - bridgeConfig = { - STP = true; - }; - }; - - "vlan-website" = { - netdevConfig = { - Name = "vlan-website"; - Kind = "vlan"; - }; - vlanConfig.Id = 56; - }; - "br-website" = { - netdevConfig = { - Name = "br-website"; - Kind = "bridge"; - }; - bridgeConfig = { - STP = true; - }; - }; - }; - - networks = { - "trunk-nic" = { - 
matchConfig.Name = "enp3s0f0"; - vlan = [ - "vlan-mgmt" - "vlan-minio" - "vlan-forgejo" - "vlan-nexus" - "vlan-cloud" - "vlan-caddy" - "vlan-website" - ]; - }; - - "vlan-mgmt-net" = { - matchConfig.Name = "vlan-mgmt"; - bridge = [ "br-mgmt" ]; - }; - "host-on-mgmt" = { - matchConfig.Name = "br-mgmt"; - networkConfig = { - Address = [ "10.0.50.2/24" ]; - Gateway = "10.0.50.1"; - DNS = [ "10.0.50.1" ]; - }; - }; - - "vlan-minio-net" = { - matchConfig.Name = "vlan-minio"; - bridge = [ "br-minio" ]; - }; - "host-on-minio" = { - matchConfig.Name = "br-minio"; - }; - - "vlan-forgejo-net" = { - matchConfig.Name = "vlan-forgejo"; - bridge = [ "br-forgejo" ]; - }; - "host-on-forgejo" = { - matchConfig.Name = "br-forgejo"; - }; - - "vlan-nexus-net" = { - matchConfig.Name = "vlan-nexus"; - bridge = [ "br-nexus" ]; - }; - "host-on-nexus" = { - matchConfig.Name = "br-nexus"; - }; - - "vlan-cloud-net" = { - matchConfig.Name = "vlan-cloud"; - bridge = [ "br-cloud" ]; - }; - "host-on-cloud" = { - matchConfig.Name = "br-cloud"; - }; - - "vlan-caddy-net" = { - matchConfig.Name = "vlan-caddy"; - bridge = [ "br-caddy" ]; - }; - "host-on-caddy" = { - matchConfig.Name = "br-caddy"; - }; - - "vlan-website-net" = { - matchConfig.Name = "vlan-website"; - bridge = [ "br-website" ]; - }; - "host-on-website" = { - matchConfig.Name = "br-website"; - }; - - "vm-taps-minio" = { - matchConfig = { - Name = "vm-minio"; - }; - networkConfig = { - Description = "minio VM tap interface"; - Bridge = "br-minio"; - ConfigureWithoutCarrier = true; - }; - }; - - "vm-taps-forgejo" = { - matchConfig = { - Name = "vm-forgejo"; - }; - networkConfig = { - Description = "forgejo VM tap interface"; - Bridge = "br-forgejo"; - ConfigureWithoutCarrier = true; - }; - }; - - # VM tap interfaces for Nexus - "vm-taps-nexus" = { - matchConfig = { - Name = "vm-nexus"; - }; - networkConfig = { - Description = "nexus VM tap interface"; - Bridge = "br-nexus"; - ConfigureWithoutCarrier = true; - }; - }; - - "vm-taps-cloud" = 
{ - matchConfig = { - Name = "vm-cloud"; - }; - networkConfig = { - Description = "cloud VM tap interface"; - Bridge = "br-cloud"; - ConfigureWithoutCarrier = true; - }; - }; - - "vm-taps-caddy" = { - matchConfig = { - Name = "vm-caddy"; - }; - networkConfig = { - Description = "caddy VM tap interface"; - Bridge = "br-caddy"; - ConfigureWithoutCarrier = true; - }; - }; - - "vm-taps-website" = { - matchConfig = { - Name = "vm-website"; - }; - networkConfig = { - Description = "website VM tap interface"; - Bridge = "br-website"; - ConfigureWithoutCarrier = true; - }; - }; - }; -} diff --git a/example/doc/.gitignore b/example/doc/.gitignore new file mode 100755 index 0000000..7585238 --- /dev/null +++ b/example/doc/.gitignore @@ -0,0 +1 @@ +book diff --git a/example/doc/book.toml b/example/doc/book.toml new file mode 100755 index 0000000..4240264 --- /dev/null +++ b/example/doc/book.toml @@ -0,0 +1,9 @@ +[book] +authors = ["Astro"] +language = "en" +multilingual = false +src = "src" +title = "microvm.nix" + +[output.html] +git-repository-url = "https://github.com/microvm-nix/microvm.nix" diff --git a/example/doc/src/SUMMARY.md b/example/doc/src/SUMMARY.md new file mode 100755 index 0000000..4f9e117 --- /dev/null +++ b/example/doc/src/SUMMARY.md @@ -0,0 +1,23 @@ +# Table of Contents + +- [Intro](./intro.md) +- [Declaring MicroVMs](./declaring.md) + - [Configuration options](./options.md) + - [Network interfaces](./interfaces.md) + - [Shared directories](./shares.md) + - [Device pass-through](./devices.md) + - [CPU emulation](./cpu-emulation.md) + - [Output options](./output-options.md) + - [MicroVM options reference ⚙️](./microvm-options.md) +- [Running a MicroVM as a package](./packages.md) +- [Preparing a host for declarative MicroVMs](./host.md) + - [A simple network setup](./simple-network.md) + - [Advanced network setup](./advanced-network.md) + - [Routed network setup](./routed-network.md) + - [Host systemd services](./host-systemd.md) + - [Host options reference 
⚙️](./host-options.md) +- [Declarative MicroVMs](./declarative.md) +- [Imperative MicroVM management](./microvm-command.md) + - [Deploy via SSH](./ssh-deploy.md) +- [Conventions](./conventions.md) +- [Frequently Asked Questions](./faq.md) diff --git a/example/doc/src/advanced-network.md b/example/doc/src/advanced-network.md new file mode 100755 index 0000000..4cc844c --- /dev/null +++ b/example/doc/src/advanced-network.md @@ -0,0 +1,109 @@ +# Advanced network setup + +Renting a server in a datacenter usually gets you one IP address. You +must not bridge your local VM traffic together with the physical +Ethernet uplink port. Instead, setup a host-internal bridge for the +Virtual Machines, and provide them with Internet through NAT just like +your plastic ADSL router at home. + +## A bridge to link TAP interfaces + +Instead of placing MicroVMs directly on a LAN, one can also use a TAP +interface to get a virtual Ethernet interface on the host. Although it +is possible to [assign individual IP +configuration](./routed-network.md) to these individual interfaces, +let us avoid the additional configuration effort and create a bridge +instead: + +```nix +systemd.network.netdevs."10-microvm".netdevConfig = { + Kind = "bridge"; + Name = "microvm"; +}; +systemd.network.networks."10-microvm" = { + matchConfig.Name = "microvm"; + networkConfig = { + DHCPServer = true; + IPv6SendRA = true; + }; + addresses = [ { + addressConfig.Address = "10.0.0.1/24"; + } { + addressConfig.Address = "fd12:3456:789a::1/64"; + } ]; + ipv6Prefixes = [ { + ipv6PrefixConfig.Prefix = "fd12:3456:789a::/64"; + } ]; +}; + +# Allow inbound traffic for the DHCP server +networking.firewall.allowedUDPPorts = [ 67 ]; +``` + +This configuration will hand out IP addresses to clients on the +bridge. In practise, better leave out the DHCP server and its state by +opting for declarative, versioned configuration instead. + +Last, the TAP interfaces of MicroVMs shall be attached to this central +bridge. 
Make sure your `matchConfig` matches just the interfaces you +want! +```nix +systemd.network.networks."11-microvm" = { + matchConfig.Name = "vm-*"; + # Attach to the bridge that was configured above + networkConfig.Bridge = "microvm"; +}; +``` + +## Provide Internet Access with NAT + +IPv4 addresses are exhausted. It is a very common case that you get +one public IPv4 address for your machine. The solution is to route +your internal virtual machines with *Network Address Translation*. + +You might not get a dedicated /64 IPv6 prefix to route to your +MicroVMs. NAT works for this address family, too! + +```nix +networking.nat = { + enable = true; + # NAT66 exists and works. But if you have a proper subnet in + # 2000::/3 you should route that and remove this setting: + enableIPv6 = true; + + # Change this to the interface with upstream Internet access + externalInterface = "eth0"; + # The bridge where you want to provide Internet access + internalInterfaces = [ "microvm" ]; +}; +``` + +Check out +[`networking.nat.forwardPorts`](https://search.nixos.org/options?channel=unstable&show=networking.nat.forwardPorts&query=networking.nat.forwardPorts) +to make your MicroVM's services available to networks outside your +host! + +## Port forwarding + +Isolating your public Internet services is a great use-case for +virtualization. But how does traffic get to you when your MicroVMs +have private IP addresses behind NAT? + +NixOS has got you covered with the `networking.nat.forwardPorts` +option! 
This example forwards TCP ports 80 (HTTP) and 443 (HTTPS) to +other hosts: + +```nix +networking.nat = { + enable = true; + forwardPorts = [ { + proto = "tcp"; + sourcePort = 80; + destination = my-addresses.http-reverse-proxy.ip4; + } { + proto = "tcp"; + sourcePort = 443; + destination = my-addresses.https-reverse-proxy.ip4; + } ]; +}; +``` diff --git a/example/doc/src/conventions.md b/example/doc/src/conventions.md new file mode 100755 index 0000000..653ac26 --- /dev/null +++ b/example/doc/src/conventions.md @@ -0,0 +1,35 @@ +# Conventions between MicroVM packages and the host + +This section describes the interface that is used to run MicroVM +packages with the flake's `host` module. While the **microvm.nix** +flake was designed for single-server usage, you can build different +MicroVM deployments using the information on this page. + + +| `nixosModule.microvm` option | MicroVM package file | `nixosModules.host` systemd service | Description | +|------------------------------|----------------------------------------|-------------------------------------|-----------------------------------------------------------------------------------------------| +| `microvm.hypervisor` | `bin/microvm-run` | `microvm@.service` | Start script for the main MicroVM process | +| `microvm.hypervisor` | `bin/microvm-shutdown` | `microvm@.service` | Script for graceful shutdown of the MicroVM (i.e. 
triggering the power button) | +| `microvm.interfaces.*.id` | `share/microvm/tap-interfaces` | `microvm-tap-interfaces@.service` | Names of the tap network interfaces to setup for the proper user | +| `microvm.devices.*.path` | `share/microvm/pci-devices` | `microvm-pci-devices@.service` | PCI devices that must be bound to the **vfio-pci** driver on the host | +| `microvm.shares.*.source` | `share/microvm/virtiofs/${tag}/source` | `microvm-virtiofsd@.service` | Source directory of a **virtiofs** instance by tag | +| `microvm.shares.*.socket` | `share/microvm/virtiofs/${tag}/socket` | `microvm-virtiofsd@.service` | **virtiofsd** socket path by tag | +| `microvm.systemSymlink` | `share/microvm/system` | | `config.system.build.toplevel` symlink, used for comparing versions when running `microvm -l` | + + +## Generating custom operating system hypervisor packages + +Because a microvm.nix runner package completely defines how to run the +Hypervisor, it is possible to define independent packages that +virtualize other operating systems than NixOS. + +- Your NixOS configurations should export their runner package as + `config.microvm.declaredRunner` so that it can be picked up either + as [declarative MicroVMs](declarative.md) or by [the microvm + command](microvm-command.md). + +- The runner package must have a file layout as described in the table + above. + +[microvm-solo5-spt](https://github.com/microvm-nix/microvm-solo5-spt) is an +example of a Flake that can run on a microvm.nix host. diff --git a/example/doc/src/cpu-emulation.md b/example/doc/src/cpu-emulation.md new file mode 100755 index 0000000..d5f268a --- /dev/null +++ b/example/doc/src/cpu-emulation.md @@ -0,0 +1,71 @@ +# CPU emulation + +It's possible to emulate a CPU if desired. This feature is only +supported by the qemu hypervisor. + +**Note:** this feature has a significant performance impact. 
+ +## Defining an emulated NixOS system + +You can call to `nixpkgs.lib.nixosSystem`, with the following key +settings: + +- Set the `system` attribute to the host system. + +- A module that sets `nixpkgs.crossSystem.config` to the guest + system. This lets `microvm.nix` know that it's a cross-system + environment. + +- Set `microvm.hypervisor` to `qemu`, given this is the only + hypervisor that supports this feature. + +- Set `microvm.cpu` to the desired emulated CPU. You can find a [list + of the available systems + here](https://www.qemu.org/docs/master/system/targets.html). + +```nix +# Example flake.nix +{ + inputs = { + nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable"; + microvm = { + url = "github:microvm-nix/microvm.nix"; + inputs.nixpkgs.follows = "nixpkgs"; + }; + }; + + outputs = { self, nixpkgs, microvm }: { + nixosConfigurations.emulated-dev = nixpkgs.lib.nixosSystem { + # host system + system = "x86_64-linux"; + modules = let + guestSystem = "aarch64-unknown-linux-gnu"; + # you can use packages in the guest machine with cross system configuration + pkgs = import nixpkgs { + system = "x86_64-linux"; + crossSystem.config = guestSystem; + }; + in [ + microvm.nixosModules.microvm + { + nixpkgs.crossSystem.config = guestSystem; + microvm = { + # you can choose what CPU will be emulated by qemu + cpu = "cortex-a53"; + hypervisor = "qemu"; + }; + environment.systemPackages = with pkgs; [ cowsay htop ]; + services.getty.autologinUser = "root"; + system.stateVersion = "24.11"; + } + ]; + }; + }; +} +``` + +You can run the example with `nix run +.#nixosConfigurations.emulated-dev.config.microvm.declaredRunner`. + +As shown in this example, you can use system packages on the guest +system by using nixpkgs with a proper `crossSystem` configuration. 
diff --git a/example/doc/src/declarative.md b/example/doc/src/declarative.md new file mode 100755 index 0000000..1eafa9f --- /dev/null +++ b/example/doc/src/declarative.md @@ -0,0 +1,73 @@ +# Declarative MicroVMs + +Provided your NixOS host [includes the host nixosModule](./host.md), +options are declared to build a MicroVM together with the host. +You can choose whether your MicroVMs should be managed in a fully-declarative +way, or whether your only want the initial deployment be declarative (with subsequent +imperative updates using the [microvm command](./microvm-command.md)). + +microvm.nix distinguishes between fully-declarative configurations +and declarative deployment by allowing you to specify either +a full `config` or just a `flake` respectively. + +## Fully declarative + +You can create fully declarative VMs by directly defining their +nixos system configuration in-place. This is very similar to how +nixos-containers work if you are familiar with those. + +```nix +# microvm refers to microvm.nixosModules +{ microvm, ... }: { + imports = [ microvm.host ]; + microvm.vms = { + my-microvm = { + # The package set to use for the microvm. This also determines the microvm's architecture. + # Defaults to the host system's package set if not given. + pkgs = import nixpkgs { system = "x86_64-linux"; }; + + # (Optional) A set of special arguments to be passed to the MicroVM's NixOS modules. + #specialArgs = {}; + + # The configuration for the MicroVM. + # Multiple definitions will be merged as expected. + config = { + # It is highly recommended to share the host's nix-store + # with the VMs to prevent building huge images. + microvm.shares = [{ + source = "/nix/store"; + mountPoint = "/nix/.ro-store"; + tag = "ro-store"; + proto = "virtiofs"; + }]; + + # Any other configuration for your MicroVM + # [...] + }; + }; + }; +} +``` + +## Declarative deployment + +Why *deployed*? 
The per-MicroVM subdirectory under `/var/lib/microvms` +is only created if it did not exist before. This behavior is +intended to ensure existence of MicroVMs that are critical to +operation. To update them later you will have to use the [imperative microvm +command](./microvm-command.md). + +```nix +microvm.vms = { + my-microvm = { + # Host build-time reference to where the MicroVM NixOS is defined + # under nixosConfigurations + flake = self; + # Specify from where to let `microvm -u` update later on + updateFlake = "git+file:///etc/nixos"; + }; +}; +``` + +Note that building MicroVMs with the host increases build time and +closure size of the host's system. diff --git a/example/doc/src/declaring.md b/example/doc/src/declaring.md new file mode 100755 index 0000000..bff7659 --- /dev/null +++ b/example/doc/src/declaring.md @@ -0,0 +1,40 @@ +# Declaring NixOS MicroVMs + +![Demo](demo.gif) + +microvm.nix creates virtual machine disk images and runner script +packages for the entries of the `nixosConfigurations` section of a +`flake.nix` file. + +## The `microvm` module + +To add MicroVM functionality, a NixOS system configuration is +augmented by importing this flake's `nixosModule.microvm`: + +```nix +# Example flake.nix +{ + inputs.nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable"; + inputs.microvm.url = "github:microvm-nix/microvm.nix"; + inputs.microvm.inputs.nixpkgs.follows = "nixpkgs"; + + outputs = { self, nixpkgs, microvm }: { + # Example nixosConfigurations entry + nixosConfigurations.my-microvm = nixpkgs.lib.nixosSystem { + system = "x86_64-linux"; + modules = [ + # Include the microvm module + microvm.nixosModules.microvm + # Add more modules here + { + networking.hostName = "my-microvm"; + microvm.hypervisor = "cloud-hypervisor"; + } + ]; + }; + }; +} +``` + +To get you started quickly, a Flake template is included. Run `nix +flake init -t github:microvm-nix/microvm.nix` in a new project directory. 
diff --git a/example/doc/src/demo.gif b/example/doc/src/demo.gif new file mode 100755 index 0000000..700d3c2 Binary files /dev/null and b/example/doc/src/demo.gif differ diff --git a/example/doc/src/devices.md b/example/doc/src/devices.md new file mode 100755 index 0000000..2881e4e --- /dev/null +++ b/example/doc/src/devices.md @@ -0,0 +1,53 @@ +# Device pass-through + +PCI and USB pass-through is supported on some hypervisors. Permission +setup is automatic for declared `"pci"` devices, but manual for +`"usb"` devices. + +## Example PCI pass-through + +Guest example: + +```nix +microvm.devices = [ { + bus = "pci"; + path = "0000:06:00.1"; +} { + bus = "pci"; + path = "0000:06:10.4"; +} ]; +``` + +Permission setup on the host is provided by systemd template unit +`microvm-pci-devices@.service`. + +## Example USB pass-through + +### In the guest + +```nix +microvm.devices = [ + # RTL2838UHIDIR + # Realtek Semiconductor Corp. RTL2838 DVB-T + { bus = "usb"; path = "vendorid=0x0bda,productid=0x2838"; } + # Sonoff Zigbee 3.0 USB Dongle Plus + # Silicon Labs CP210x UART Bridge + { bus = "usb"; path = "vendorid=0x10c4,productid=0xea60"; } +]; +``` + +### On the host + +USB device paths are not directly translatable to udev rules. Setup +permissions yourself: + +```nix +services.udev.extraRules = '' + # RTL2838UHIDIR + # Realtek Semiconductor Corp. RTL2838 DVB-T + SUBSYSTEM=="usb", ATTR{idVendor}=="0bda", ATTR{idProduct}=="2838", GROUP="kvm" + # Sonoff Zigbee 3.0 USB Dongle Plus + # Silicon Labs CP210x UART Bridge + SUBSYSTEM=="usb", ATTR{idVendor}=="10c4", ATTR{idProduct}=="ea60", GROUP="kvm" +''; +``` diff --git a/example/doc/src/faq.md b/example/doc/src/faq.md new file mode 100755 index 0000000..fb72efa --- /dev/null +++ b/example/doc/src/faq.md @@ -0,0 +1,153 @@ +# Frequently Asked Questions + +## Are there elaborate example setups? 
+ +microvm.nix is used in these open-source infrastructure projects: + +- [C3D2 services](https://gitea.c3d2.de/c3d2/nix-config) +- [DD-IX services](https://github.com/dd-ix/nix-config) + +Let us know if you know more! + +## Can I support the development and maintenance of this project? + +[❤ Sponsor](https://github.com/sponsors/astro) + +## How to centralize logging with journald? + +That is possible without even requiring a network transport by just +making the journals available to the host as a share. Because journald +identifies hosts by their `/etc/machine-id`, we propose to use static +content for that file. Add a NixOS module like the following to your +MicroVM configuration: + +```nix +environment.etc."machine-id" = { + mode = "0644"; + text = + # change this to suit your flake's interface + self.lib.addresses.machineId.${config.networking.hostName} + "\n"; +}; + +microvm.shares = [ { + # On the host + source = "/var/lib/microvms/${config.networking.hostName}/journal"; + # In the MicroVM + mountPoint = "/var/log/journal"; + tag = "journal"; + proto = "virtiofs"; + socket = "journal.sock"; +} ]; +``` + +Last, make the MicroVM journals available to your host. The +`machine-id` must be available. + +```nix +systemd.tmpfiles.rules = map (vmHost: + let + machineId = self.lib.addresses.machineId.${vmHost}; + in + # creates a symlink of each MicroVM's journal under the host's /var/log/journal + "L+ /var/log/journal/${machineId} - - - - /var/lib/microvms/${vmHost}/journal/${machineId}" +) (builtins.attrNames self.lib.addresses.machineId); +``` + +Once your MicroVM's journal data is visible in the +`/var/log/journal/$machineId/` directories, `journalctl` can pick it +up using the `-m`/`--merge` switch. + +## Can I build with hypervisors from the host's nixpkgs instead of the MicroVM's? + +Yes. This scenario is enabled through the flake's `lib.buildRunner` +function. 
See the [`nix run +microvm#build-microvm`](https://github.com/microvm-nix/microvm.nix/blob/main/pkgs/build-microvm.nix) +script that you will need to customize to fit your deployment scenario. + +## How can I deploy imperatively from Continuous Integration? + +Do this by integrating into your automation what the `microvm` command +does. + +```nix +environment.systemPackages = [ ( + # Provide a manual updating script that fetches the latest + # updated+built system from Hydra + pkgs.writeShellScriptBin "update-microvm" '' + if [ $# -lt 1 ]; then + NAMES="$(ls -1 /var/lib/microvms)" + else + NAMES="$@" + fi + + for NAME in $NAMES; do + echo MicroVM $NAME + cd /var/lib/microvms/$NAME + # Is this truly the flake that is being built on Hydra? + if [ "$(cat flake)" = "git+https://gitea.example.org/org/nix-config?ref=flake-update" ]; then + NEW=$(curl -sLH "Accept: application/json" https://hydra.example.org/job/org/nix-config/$NAME/latest | ${pkgs.jq}/bin/jq -er .buildoutputs.out.path) + nix copy --from https://nix-cache.example.org $NEW + + if [ -e booted ]; then + nix store diff-closures $(readlink booted) $NEW + elif [ -e current ]; then + echo "NOT BOOTED! Diffing to old current:" + nix store diff-closures $(readlink current) $NEW + else + echo "NOT BOOTED?" + fi + + CHANGED=no + if ! [ -e current ]; then + ln -s $NEW current + CHANGED=yes + elif [ "$(readlink current)" != $NEW ]; then + rm -f old + cp --no-dereference current old + rm -f current + ln -s $NEW current + CHANGED=yes + fi + fi + + if [ "$CHANGED" = "yes" ]; then + systemctl restart microvm@$NAME + fi + echo + done + '' +) ]; +``` + +## Can I include my host's `` channel when building the VM? + +Use the following configuration if you build your MicroVM with +`--impure` from channels, not Flakes: + +```nix +nix.nixPath = [ + "nixpkgs=${builtins.storePath }" +]; +``` + +## How do I let the `microvm` user access block devices? 
+ +You can re-add the following line to your host's NixOS configuration +which was removed from microvm.nix: + +```nix +users.users.microvm.extraGroups = [ "disk" ]; +``` + +The more secure solution would be writing custom +`services.udev.extraRules` that assign ownership/permissions to the +individually used block devices. + +## My virtiofs-shared sops-nix /run/secrets disappears when the host is updated! + +A workaround may be setting `sops.keepGenerations = 0;`, effectively +stopping sops-nix from ever removing old generations in +`/run/secrets.d/`. + +That means that you still must reboot all MicroVMs to adapt any +updated secrets. diff --git a/example/doc/src/host-systemd.md b/example/doc/src/host-systemd.md new file mode 100755 index 0000000..cefa314 --- /dev/null +++ b/example/doc/src/host-systemd.md @@ -0,0 +1,47 @@ +# systemd services on a MicroVM host + +The `host` nixosModule provides a few systemd services for additional +bringup which is not available when running a MicroVM interactively +from a package. + +## `install-microvm-${name}.service` + +Creates and prepares a subdirectory under `/var/lib/microvms` for +[declarative MicroVMs](./declarative.md) according to the +`microvm.vms` option. + +If the MicroVM subdirectory under `/var/lib/microvms` already exists, +**and** the MicroVM is configured to be built from a flake's +`nixosConfigurations`, this systemd unit will be skipped. The reason +for this behavior is that it is easier to update with the [`microvm` +command](./microvm-command.md) instead of restarting all virtual +machines on a host when doing `nixos-rebuild switch`. + +## `microvm-tap-interfaces@.service` + +Creates TAP virtual network interfaces for the user that will run MicroVMs. + +## `microvm-macvtap-interfaces@.service` + +Creates MACVTAP virtual network interfaces for the user that will run MicroVMs. 
+ +## `microvm-pci-devices@.service` + +Prepares PCI devices for passthrough +([VFIO](https://www.kernel.org/doc/html/latest/driver-api/vfio.html)). + +## `microvm-virtiofsd@.service` + +Starts a fleet of virtiofsd servers, one for each `virtiofs` +mountpoint in `microvm.shares`. + +## `microvm@.service` + +Runs the actual MicroVM through +`/var/lib/microvms/%i/current/bin/microvm-run` where `%i` is the +MicroVM name. + +## `microvms.target` + +Depends on the `microvm@.service` instance for all configured +`microvm.autostart`. diff --git a/example/doc/src/host.md b/example/doc/src/host.md new file mode 100755 index 0000000..78c15d2 --- /dev/null +++ b/example/doc/src/host.md @@ -0,0 +1,56 @@ +# Preparing a NixOS host for declarative MicroVMs + +**microvm.nix** adds the following configuration for servers to +host MicroVMs reliably: + +- a `/var/lib/microvms` state directory with one subdirectory per MicroVM +- systemd services `microvm-tap-interfaces@` to setup TAP network interfaces +- systemd services `microvm-virtiofsd@` to start virtiofsd instances +- systemd services `microvm@` to start a MicroVM +- configuration options to [declaratively build MicroVMs with the host + system](./declarative.md) +- tools to [manage MicroVMs imperatively](./microvm-command.md) + +Prepare your host by including the microvm.nix `host` nixosModule: + +```nix +# Your server's flake.nix +{ + inputs.nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable"; + inputs.microvm.url = "github:microvm-nix/microvm.nix"; + inputs.microvm.inputs.nixpkgs.follows = "nixpkgs"; + + outputs = { self, nixpkgs, microvm }: { + # Example nixosConfigurations entry + nixosConfigurations.server1 = nixpkgs.lib.nixosSystem { + system = "x86_64-linux"; + modules = [ + # Include the microvm host module + microvm.nixosModules.host + # Add more modules here + { + networking.hostName = "server1"; + + # try to automatically start these MicroVMs on bootup + microvm.autostart = [ + "my-microvm" + "your-microvm" + 
"their-microvm" + ]; + } + ]; + }; + }; +} +``` + +# Preparing a non-Flakes host + +If you really cannot migrate to Flakes easily, just import the `host` +module directly in your NixOS configuration: + +```nix +imports = [ (builtins.fetchGit { + url = "https://github.com/microvm-nix/microvm.nix"; +} + "/nixos-modules/host") ]; +``` diff --git a/example/doc/src/interfaces.md b/example/doc/src/interfaces.md new file mode 100755 index 0000000..9c494ab --- /dev/null +++ b/example/doc/src/interfaces.md @@ -0,0 +1,89 @@ +# Network interfaces + +Declare a MicroVM's virtual network interfaces like this in its NixOS +configuration: +```nix +{ + microvm.interfaces = [ { + type = "tap"; + + # interface name on the host + id = "vm-a1"; + + # Ethernet address of the MicroVM's interface, not the host's + # + # Locally administered have one of 2/6/A/E in the second nibble. + mac = "02:00:00:00:00:01"; + } ]; +} +``` + +## `type = "user"` + +User-mode networking is only provided by qemu and kvmtool, providing +outgoing connectivity to your MicroVM without any further setup. + +As kvmtool seems to lack a built-in DHCP server, additional static IP +configuration is necessary inside the MicroVM. + +## `type = "tap"` + +Use a virtual tuntap Ethernet interface. Its name is the value of +`id`. + +Some Hypervisors may be able to automatically create these interfaces +when running as root, which we advise against. Instead, create the +interfaces before starting a MicroVM: + +```bash +sudo ip tuntap add $IFACE_NAME mode tap user $USER +``` + +**Note:** add `multi_queue` to this command line if the VM is configured +with more than one CPU core. + +When running MicroVMs through the `host` module, the tap network +interfaces are created through a systemd service dependency. 
+ +Extend the generated script in the guest configuration like this: + +```nix +microvm.binScripts.tap-up = lib.mkAfter '' + ${lib.getExe' pkgs.iproute2 "ip"} link set dev 'vm-ixp-as11201p' master 'ixp-peering' +''; +``` + +## `type = "macvtap"` + +*MACVTAP* interfaces attach to a host's physical network interface, +joining the same Ethernet segment with a separate MAC address. + +Before running a MicroVM interactively from a package, do the +following steps manually: + +```bash +# Parent interface: +LINK=eth0 +# MACVTAP interface, as specified under microvm.interfaces.*.id: +ID=microvm1 +# Create the interface +sudo ip l add link $LINK name $ID type macvtap mode bridge +# Obtain the interface index number +IFINDEX=$(cat /sys/class/net/$ID/ifindex) +# Grant yourself permission +sudo chown $USER /dev/tap$IFINDEX +``` + +When running MicroVMs through the `host` module, the macvtap network +interfaces are created through a systemd service dependency. Per +interface with `type = "macvtap"`, a `link` attribute with the parent +interface, and `mode` attribute for the MACVTAP filtering mode must be +specified. + +## `type = "bridge"` + +This mode lets qemu create a tap interface and attach it to a bridge. + +The `qemu-bridge-helper` binary needs to be setup with the proper +permissions. See the `host` module for that. qemu will be run +*without* `-sandbox on` in order for this contraption to work. diff --git a/example/doc/src/intro.md b/example/doc/src/intro.md new file mode 100755 index 0000000..dbce17d --- /dev/null +++ b/example/doc/src/intro.md @@ -0,0 +1,50 @@ +# Intro + +**microvm.nix** is a Flake to run lightweight NixOS virtual machines +on NixOS. Starting with the reasons why for the remainder of this +chapter, this handbook guides you through the provisioning of MicroVMs +on your NixOS machine. + +## Compartmentalization + +NixOS makes running services a breeze. Being able to quickly rollback +configuration is a life-saver. 
Not so much however on systems that are +shared by multiple services where maintenance of one affects others. + +Increase stability by partitioning services into virtual NixOS systems +that can be updated individually. + +**microvm.nix** can isolate your /nix/store into exactly what is +required for the guest's NixOS: the root filesystem is a read-only +erofs/squashfs file-systems that include only the binaries of your +configuration. Of course, that holds only true until you mount the +host's /nix/store as a share for faster build times, or mount the +store with a writable overlay for Nix builds inside the VM. + +## The Case Against Containers + +Linux containers are not a single technology but a plethora of kernel +features that serve to isolate various system resources so that the +running system appears as one. It is still one shared Linux kernel +with a huge attack surface. + +Virtual machines on the other hand run their own OS kernel, reducing +the attack surface to the hypervisor and its device drivers. The +resource usage however incurs some overhead when compared with +containers, with memory allocation being especially inflexible. + +**microvm.nix** is a tool that helps you building the guest's OS and +running it in ways that are easier than writing a `Dockerfile`, once +you know how to put a NixOS config into a `flake.nix` file. + +## Just Virtual Machines? + +Full virtualization has been available for a long time with QEMU and +VirtualBox. The *MicroVM* machine type highlights that virtualization +overhead has been reduced a lot by replacing emulated devices with +*virtio* interfaces that have been optimized for this environment. + +This Flake offers you to run your MicroVMs not only on QEMU but with +other Hypervisors that have been explicitly authored for +*virtio*. Some of them are written in Rust, a programming language +that is renowned for being safer than C. 
diff --git a/example/doc/src/microvm-command.md b/example/doc/src/microvm-command.md new file mode 100755 index 0000000..11a85f0 --- /dev/null +++ b/example/doc/src/microvm-command.md @@ -0,0 +1,80 @@ +# Imperative MicroVM management with the `microvm` command + +Compartmentalizing services in an infrastructure landscape allows us to +conduct maintenance individually and without affecting unrelated +MicroVMs. The `microvm` command helps with that. + +## Create a MicroVM + +You can specify this MicroVM's source flake with `-f`. If omitted, the +tool will assume `git+file:///etc/nixos`. The source flakeref will be +kept in `/var/lib/microvms/*/flake` for future updating the MicroVM. + +```bash +microvm -f git+https://... -c my-microvm +``` + +### Enabling MicroVM autostart + +Extension of the host's systemd units must happen declaratively in the +host's NixOS configuration: + +```nix +microvm.autostart = [ + "myvm1" + "myvm2" + "myvm3" +]; +``` + +## Update a MicroVM + +*Updating* does not refresh your packages but simply rebuilds the +MicroVM. Use `nix flake update` to get new package versions. + +```bash +microvm -u my-microvm +``` + +Until ways have been found to safely transfer the profile into the +target /nix/store, and subsequently activate it, you must restart the +MicroVM for the update to take effect. + +Use the `-R` flag to automatically restart if an update was built. + +## List MicroVMs + +Listing your MicroVMs is as trivial as `ls -1 /var/lib/microvms` + +For more insight, the following command will read the current system +version of all MicroVMs and compare them to what the corresponding +flake evaluates. It is therefore quite slow to run, yet very useful +for an updatable VM overview. 
+
+```bash
+microvm -l
+```
+
+If you want a faster overview of booted and current versions, run
+this instead:
+
+```bash
+ls -l /var/lib/microvms/*/{current,booted}/share/microvm/system
+```
+
+## Removing MicroVMs
+
+First, stop the MicroVM:
+
+```bash
+systemctl stop microvm@$NAME
+```
+
+If you don't use absolute filesystem paths for sockets, volumes, or
+shares, all MicroVM state is kept under `/var/lib/microvms/$NAME/`.
+The `microvm@.service` systemd service template depends on existence
+of this directory.
+
+```bash
+rm -rf /var/lib/microvms/$NAME
+```
diff --git a/example/doc/src/options.md b/example/doc/src/options.md
new file mode 100755
index 0000000..71b5852
--- /dev/null
+++ b/example/doc/src/options.md
@@ -0,0 +1,24 @@
+# Configuration options
+
+By including the `microvm` module a set of NixOS options is made
+available for customization. These are the most important ones:
+
+| Option | Purpose |
+|--------------------------------|-----------------------------------------------------------------------------------------------------|
+| `microvm.hypervisor` | Hypervisor to use by default in `microvm.declaredRunner` |
+| `microvm.vcpu` | Number of Virtual CPU cores |
+| `microvm.mem` | RAM allocation in MB |
+| `microvm.interfaces` | Network interfaces |
+| `microvm.volumes` | Block device images |
+| `microvm.shares` | Shared filesystem directories |
+| `microvm.devices` | PCI/USB devices for host-to-vm passthrough |
+| `microvm.socket` | Control socket for the Hypervisor so that a MicroVM can be shut down cleanly |
+| `microvm.user` | (qemu only) User account which Qemu will switch to when started as root |
+| `microvm.forwardPorts` | (qemu user-networking only) TCP/UDP port forwarding |
+| `microvm.kernelParams` | Like `boot.kernelParams` but will not end up in `system.build.toplevel`, saving you rebuilds |
+| `microvm.storeOnDisk` | Enables the store on the boot squashfs even in the presence of a share with the host's `/nix/store` |
+| 
`microvm.writableStoreOverlay` | Optional string of the path where all writes to `/nix/store` should go to. | + +See [the options declarations]( +https://github.com/microvm-nix/microvm.nix/blob/main/nixos-modules/microvm/options.nix) +for a full reference. diff --git a/example/doc/src/output-options.md b/example/doc/src/output-options.md new file mode 100755 index 0000000..ad27010 --- /dev/null +++ b/example/doc/src/output-options.md @@ -0,0 +1,35 @@ +# MicroVM output options + +Hypervisor runners are provided in the `config` generated by a +nixosSystem for you to use inside and outside your configuration. + +| Option | Purpose | +|--------------------------|-----------------------------------------------------------| +| `microvm.declaredRunner` | Runner package selected according to `microvm.hypervisor` | +| `microvm.runners` | Attribute set of runner packages per known Hypervisor. | + +The `microvm.declaredRunner` selects the hypervisor according to the +configured `microvm.hypervisor`. + +```bash +nix run .#nixosConfigurations.my-microvm.config.microvm.declaredRunner +``` + +The `microvm.runners` option provides a runner for each known +Hypervisor regardless of the `microvm.hypervisor` config setting. To +build *my-microvm* for Firecracker for example: + +```bash +nix run .#nixosConfigurations.my-microvm.config.microvm.runners.firecracker +``` + +## Configure `microvm.hypervisor`, use `microvm.declaredRunner`! + +One of the `microvm.runners` is picked by `microvm.declaredRunner` by +evaluating `microvm.hypervisor`. + +You may switch the Hypervisor quickly, but use `declaredRunner` in +production. Any other NixOS configuration that evaluates the +`microvm.hypervisor` option can be wrong when you pick from +`microvm.runners` directly. One example would be the defaults set by +`microvm.optimize`. 
diff --git a/example/doc/src/packages.md b/example/doc/src/packages.md new file mode 100755 index 0000000..9aab7e2 --- /dev/null +++ b/example/doc/src/packages.md @@ -0,0 +1,25 @@ +# Running a MicroVM as a package + +Quickly running a MicroVM interactively is great for testing. You get +to interact with its console. + +There are drawbacks: no preparation for TAP network interfaces is done +and no virtiofsd is started. These can be worked around by relying on +9p shares and using qemu's `host` network interfaces. + +## Immediately running a nixosConfiguration + +To run a `nixosConfiguration` off your Flake directly use: +```bash +nix run .#nixosConfigurations.my-microvm.config.microvm.declaredRunner +``` + +## Add a runner package to your Flake + +To add this runner permanently add a package like this to the outputs +of your `flake.nix`: +```nix +packages.x86_64-linux.my-microvm = self.nixosConfigurations.my-microvm.config.microvm.declaredRunner; +``` + +You can then run the MicroVM with a simple `nix run .#my-microvm` diff --git a/example/doc/src/routed-network.md b/example/doc/src/routed-network.md new file mode 100755 index 0000000..04944bc --- /dev/null +++ b/example/doc/src/routed-network.md @@ -0,0 +1,153 @@ +# Routed network setup + +## Motivation + +In bridged setups the Virtual Machines share the same Ethernet +segment. A compromised VM still has raw network access, allowing it to +send a lot of funny packets that cause problems for other +VMs. Examples: + +- Forging MAC addresses +- Running rogue DHCP servers +- ARP/NDP spoofing +- Meddling with link-local multicast + +This can be avoided by unsharing the Ethernet segments, ie. removing +the bridge. + +## Addressing + +Compared to one Ethernet where we assign a large subnet like +`10.0.0.0/24`, we will now only deal with *Host Routes* where the +prefix length is `/32` for IPv4 and `/128` for IPv6. Note that by +doing this we no longer lose precious space to a subnet's network and +broadcast addresses. 
+ +## Host configuration + +Using systemd-networkd, a VM's tap interface is configured with static +addresses and the corresponding host routes. We do this for up to +`maxVMs`. Increasing this number will create as many `.network` +configuration files, so it's relatively cheap. + +```nix +{ lib, ... }: + +let + maxVMs = 64; + +in +{ + networking.useNetworkd = true; + + systemd.network.networks = builtins.listToAttrs ( + map (index: { + name = "30-vm${toString index}"; + value = { + matchConfig.Name = "vm${toString index}"; + # Host's addresses + address = [ + "10.0.0.0/32" + "fec0::/128" + ]; + # Setup routes to the VM + routes = [ { + Destination = "10.0.0.${toString index}/32"; + } { + Destination = "fec0::${lib.toHexString index}/128"; + } ]; + # Enable routing + networkConfig = { + IPv4Forwarding = true; + IPv6Forwarding = true; + }; + }; + }) (lib.genList (i: i + 1) maxVMs) + ); +} +``` + +## NAT + +For NAT configuration on the host we're not going to specify each +potential tap interface. That would create a lot of firewall rules. To +avoid this additional complexity, use a single subnet that matches all +your VMs' addresses: + +```nix +{ + networking.nat = { + enable = true; + internalIPs = [ "10.0.0.0/24" ]; + # Change this to the interface with upstream Internet access + externalInterface = "enp0s3"; + }; +} +``` + +# Virtual Machine configuration + +We no longer rely on DHCP for this non-standard setup. To produce IPv4 +and IPv6 addresses let's assign a number `index` to each MicroVM. Make +sure that this number is **not reused** by two VMs! + +We suggest creating some sort of central configuration file that +contains each VM's network `index` in one place. That should make +reuses obvious. If that list becomes too long, write a NixOS +assertion! + +```nix +{ lib, ... }: + +let + # Change this by VM! 
+ index = 5; + + mac = "00:00:00:00:00:01"; + +in +{ + microvm.interfaces = [ { + id = "vm${toString index}"; + type = "tap"; + inherit mac; + } ]; + + networking.useNetworkd = true; + + systemd.network.networks."10-eth" = { + matchConfig.MACAddress = mac; + # Static IP configuration + address = [ + "10.0.0.${toString index}/32" + "fec0::${lib.toHexString index}/128" + ]; + routes = [ { + # A route to the host + Destination = "10.0.0.0/32"; + GatewayOnLink = true; + } { + # Default route + Destination = "0.0.0.0/0"; + Gateway = "10.0.0.0"; + GatewayOnLink = true; + } { + # Default route + Destination = "::/0"; + Gateway = "fec0::"; + GatewayOnLink = true; + } ]; + networkConfig = { + # DNS servers no longer come from DHCP nor Router + # Advertisements. Perhaps you want to change the defaults: + DNS = [ + # Quad9.net + "9.9.9.9" + "149.112.112.112" + "2620:fe::fe" + "2620:fe::9" + ]; + }; + }; +} +``` diff --git a/example/doc/src/shares.md b/example/doc/src/shares.md new file mode 100755 index 0000000..290d71d --- /dev/null +++ b/example/doc/src/shares.md @@ -0,0 +1,84 @@ +# Shares + +Persistent file-systems are provided by both volumes and +shares. Volumes are block devices inside the virtual machine, yielding +fast performance but mounted file-systems require exclusive +access. Shares allow mounting an arbitrary directory tree from the +host. + +In `microvm.shares` elements the `proto` field allows either of two +values: + +- `9p` (default) is built into many hypervisors, allowing you to + quickly share a directory tree + +- `virtiofs` requires a separate virtiofsd service which is started as + a prerequisite when you start MicroVMs through a systemd service + that comes with the `microvm.nixosModules.host` module. + + If you want to run from the command-line, start `bin/virtiofsd-run` + separately. + + Expect `virtiofs` to yield better performance over `9p`. 
+ +```nix +microvm.shares = [ { + proto = "virtiofs"; + tag = "home"; + # Source path can be absolute or relative + # to /var/lib/microvms/$hostName + source = "home"; + mountPoint = "/home"; +} ]; +``` + +
+When sharing a path that is on ZFS with virtiofs, the dataset must
+have the options `-o xattr=sa -o acltype=posixacl` set.
+
+ + +## Sharing a host's `/nix/store` + +If a share with `source = "/nix/store"` is defined, size and build +time of the stage1 squashfs for `/dev/vda` will be reduced +drastically. + +```nix +microvm.shares = [ { + tag = "ro-store"; + source = "/nix/store"; + mountPoint = "/nix/.ro-store"; +} ]; +``` + +## Writable `/nix/store` overlay + +An optional writable layer will be mounted if the path +`microvm.writableStoreOverlay` is set. Make sure that the path is +located on a writable filesystem. + +**Caveat:** The Linux overlay filesystem is very picky about the +filesystems that can be the upper (writable) layer. 9p/virtiofs shares +don't work currently, so resort to using a volume for that: + +``` +{ config, ... }: +{ + microvm.writableStoreOverlay = "/nix/.rw-store"; + + microvm.volumes = [ { + image = "nix-store-overlay.img"; + mountPoint = config.microvm.writableStoreOverlay; + size = 2048; + } ]; +} +``` + +
+The Nix database will forget all built packages after a
+reboot, retaining only what is needed for the VM's NixOS
+system. Until this has been solved, it is recommended to just delete
+and recreate the overlay after MicroVM shutdown or before startup.
+
diff --git a/example/doc/src/simple-network.md b/example/doc/src/simple-network.md new file mode 100755 index 0000000..12849c2 --- /dev/null +++ b/example/doc/src/simple-network.md @@ -0,0 +1,113 @@ +# A simple network setup + +While networking infrastructure is out of scope for the **microvm.nix** +flake, here is some guidance for providing the MicroVMs on your NixOS +machine with internet access. + +Use this for your local LAN where IP addresses are free and +plentiful. If not, head over to the +[advanced networking](./advanced-network.md) page. + +Because we already use systemd for MicroVM startup, let's pick +`systemd-networkd`: +```nix +networking.useNetworkd = true; +``` + +## A bridge to link TAP interfaces + +To make your MicroVM reachable, the host will place its Ethernet port (`eno1`) +on a bridge (`br0`). This bridge will have the MicroVM's TAP interface attached +to it - directly placing the MicroVM on your local network. + +Note that the addresses provided below are examples and you must adjust these +for your network settings. Also note that the `eno1` must be attached to the +bridge with the `vm-*` TAP interfaces that you will specify in the MicroVM +definition. 
+ +```nix +systemd.network.enable = true; + +systemd.network.networks."10-lan" = { + matchConfig.Name = ["eno1" "vm-*"]; + networkConfig = { + Bridge = "br0"; + }; +}; + +systemd.network.netdevs."br0" = { + netdevConfig = { + Name = "br0"; + Kind = "bridge"; + }; +}; + +systemd.network.networks."10-lan-bridge" = { + matchConfig.Name = "br0"; + networkConfig = { + Address = ["192.168.1.2/24" "2001:db8::a/64"]; + Gateway = "192.168.1.1"; + DNS = ["192.168.1.1"]; + IPv6AcceptRA = true; + }; + linkConfig.RequiredForOnline = "routable"; +}; +``` + +Now that the host is configured, you can define a MicroVM to have a static IP +address with: + +```nix +microvm = { + #...add additional MicroVM configuration here + interfaces = [ + { + type = "tap"; + id = "vm-test1"; + mac = "02:00:00:00:00:01"; + } + ]; +}; + +systemd.network.enable = true; + +systemd.network.networks."20-lan" = { + matchConfig.Type = "ether"; + networkConfig = { + Address = ["192.168.1.3/24" "2001:db8::b/64"]; + Gateway = "192.168.1.1"; + DNS = ["192.168.1.1"]; + IPv6AcceptRA = true; + DHCP = "no"; + }; +}; +``` + +## Docker and systemd-network + +If you use the above `systemd.network` bridge config and wish to run +Docker containers inside your microvms using `virtualisation.docker`, +you may need to add the following snippet to stop `systemd-networkd` from +managing the bridged `veth*` interfaces Docker creates for each container. +Without this, network access inside the containers will be broken. + +```nix +systemd.network.networks."19-docker" = { + matchConfig.Name = "veth*"; + linkConfig = { + Unmanaged = true; + }; +}; +``` + +## Advanced: Improving Performance + +If you prioritize network performance over inter-VM communication on +the virtual bridge, have a look into these alternatives: + +- Network interfaces with `type = "macvtap"` are supported in + microvm.nix. While they're technically tap devices, they attach to + an external Ethernet port, eliminating the `br0` bridge. 
+ +- Server Ethernet cards support SR-IOV: setup Virtual Function devices + for PCI passthru into MicroVMs. diff --git a/example/doc/src/ssh-deploy.md b/example/doc/src/ssh-deploy.md new file mode 100755 index 0000000..208ee96 --- /dev/null +++ b/example/doc/src/ssh-deploy.md @@ -0,0 +1,66 @@ +# Deploying via SSH + +By running either from packages or through systemd services +microvm.nix tries to support a wholesome Nix workflow: develop and +test on your local laptop, then deploy to staging and later to +production. + +Let's explore alternative ways before detailing our elaboration: + +- You could build + `.#nixosConfiguration.my-microvm.config.microvm.declaredRunner` + locally, then `nix copy` it to the target host for + installation. This comes at the expense of your laptop's battery + time and it can also become quite network-heavy. +- You may transfer each change to the remote host to build entirely + remote. There you're going to have a repository state that is going + to confuse fellow operators. Also, your local `--override-input` + parameters will become meaningless on the remote filesystem. + +## microvm.deploy.rebuild + +The *easy* interface that is named after `nixos-rebuild` combines the +two scripts that are described below: + +- First, we evaluate locally and build remotely with + `microvm.deploy.installOnHost` +- Depending on whether the host's `/nix/store` is mounted and SSH is + running in the MicroVM: + - We either run `microvm.deploy.sshSwitch` as described below + - Alternatively, we restart the MicroVM's systemd service on the + host + +Because it needs to know about both the host and the MicroVM, these +ssh addresses must come before the actual `switch` argument: + +``` +nix run .#nixosConfigurations.my-microvm.config.microvm.deploy.rebuild root@example.com root@my-microvm.example.com switch +``` + +## microvm.deploy.installOnHost + +This script will evaluate only the system's derivations locally. 
It +then transfers these and their dependencies to the remote system so +the actual build can be performed there. + +Just like [the microvm command](microvm-command.md), it then installs +the MicroVM under `/var/lib/microvms/$NAME` so that the systemd +services of the `host` module can pick it up. + +It is irrelevant whether you create a new MicroVM or update an +existing one. + +## microvm.deploy.sshSwitch + +Once the host has an updated MicroVM in its `/nix/store` (see above) +the new system must be activated. For a proper state, this script does +a bit more in the MicroVM than just `switch-to-configuration`: + +- First, the `config.networking.hostName` is compared to the running + system for safety reasons. +- The Nix database registration will be imported which is important if + you build packages into a `microvm.writableStoreOverlay`. +- The new system is installed into `/nix/var/nix/profiles/system` + which is optional but expected by some Nix tooling. +- Finally, run `switch-to-configuration` with the provided parameter + (eg. `switch`). diff --git a/example/host.nix b/example/host.nix new file mode 100755 index 0000000..787a197 --- /dev/null +++ b/example/host.nix @@ -0,0 +1,12 @@ +# TODO: remove this file after 2024 + +{ lib, ... }: + +lib.warn + '' + microvm.nix/nixos-modules/host.nix has moved to + microvm.nix/nixos-modules/host -- please update. + '' + { + imports = [ ./host ]; + } diff --git a/example/host/default.nix b/example/host/default.nix new file mode 100755 index 0000000..d0884c0 --- /dev/null +++ b/example/host/default.nix @@ -0,0 +1,335 @@ +{ + pkgs, + config, + lib, + ... 
+}: +let + inherit (config.microvm) stateDir; + microvmCommand = import ../../pkgs/microvm-command.nix { + inherit pkgs; + }; + user = "microvm"; + group = "kvm"; +in +{ + imports = [ ./options.nix ]; + + config = lib.mkIf config.microvm.host.enable { + assertions = lib.concatMap (vmName: [ + { + assertion = + config.microvm.vms.${vmName}.config != null -> config.microvm.vms.${vmName}.flake == null; + message = "vm ${vmName}: Fully-declarative VMs cannot also set a flake!"; + } + { + assertion = + config.microvm.vms.${vmName}.config != null -> config.microvm.vms.${vmName}.updateFlake == null; + message = "vm ${vmName}: Fully-declarative VMs cannot set a updateFlake!"; + } + ]) (builtins.attrNames config.microvm.vms); + + boot.kernelModules = [ "tun" ]; + + system.activationScripts.microvm-host = '' + mkdir -p ${stateDir} + chown ${user}:${group} ${stateDir} + chmod g+w ${stateDir} + ''; + + environment.systemPackages = [ + microvmCommand + ]; + + users.users.${user} = { + isSystemUser = true; + inherit group; + }; + + security.pam.loginLimits = [ + { + domain = user; + item = "memlock"; + type = "hard"; + value = "infinity"; + } + { + domain = user; + item = "memlock"; + type = "soft"; + value = "infinity"; + } + ]; + + systemd.services = + builtins.foldl' + ( + result: name: + result + // ( + let + microvmConfig = config.microvm.vms.${name}; + inherit (microvmConfig) flake updateFlake; + isFlake = flake != null; + guestConfig = + if isFlake then flake.nixosConfigurations.${name}.config else microvmConfig.config.config; + runner = guestConfig.microvm.declaredRunner; + in + { + "install-microvm-${name}" = { + description = "Install MicroVM '${name}'"; + before = [ + "microvm@${name}.service" + "microvm-tap-interfaces@${name}.service" + "microvm-pci-devices@${name}.service" + "microvm-virtiofsd@${name}.service" + ]; + partOf = [ "microvm@${name}.service" ]; + wantedBy = [ "microvms.target" ]; + # Run on every rebuild for fully-declarative MicroVMs and flake-based 
MicroVMs without updateFlake. + # For MicroVMs with updateFlake set, only run on initial installation. + unitConfig.ConditionPathExists = lib.mkIf (isFlake && updateFlake != null) "!${stateDir}/${name}"; + serviceConfig.Type = "oneshot"; + script = '' + mkdir -p ${stateDir}/${name} + cd ${stateDir}/${name} + + ln -sTf ${runner} current + chown -h ${user}:${group} . current + '' + # Including the toplevel here is crucial to have the service definition + # change when the host is rebuilt and the vm definition changed. + + lib.optionalString (!isFlake) '' + ln -sTf ${guestConfig.system.build.toplevel} toplevel + '' + # Declarative deployment requires storing just the flake + + lib.optionalString isFlake '' + echo '${if updateFlake != null then updateFlake else flake}' > flake + chown -h ${user}:${group} flake + ''; + serviceConfig.SyslogIdentifier = "install-microvm-${name}"; + }; + "microvm@${name}" = { + # restartIfChanged is opt-out, so we have to include the definition unconditionally + serviceConfig.X-RestartIfChanged = [ + "" + microvmConfig.restartIfChanged + ]; + path = lib.mkForce [ ]; + # If the given declarative microvm wants to be restarted on change, + # We have to make sure this service group is restarted. To make sure + # that this service is also changed when the microvm configuration changes, + # we also have to include a trigger here. 
+ restartTriggers = [ guestConfig.system.build.toplevel ]; + overrideStrategy = "asDropin"; + serviceConfig.Type = + if guestConfig.microvm.declaredRunner.supportsNotifySocket then "notify" else "simple"; + }; + "microvm-tap-interfaces@${name}" = { + serviceConfig.X-RestartIfChanged = [ + "" + microvmConfig.restartIfChanged + ]; + path = lib.mkForce [ ]; + overrideStrategy = "asDropin"; + }; + "microvm-pci-devices@${name}" = { + serviceConfig.X-RestartIfChanged = [ + "" + microvmConfig.restartIfChanged + ]; + path = lib.mkForce [ ]; + overrideStrategy = "asDropin"; + }; + "microvm-virtiofsd@${name}" = { + serviceConfig.X-RestartIfChanged = [ + "" + microvmConfig.restartIfChanged + ]; + path = lib.mkForce [ ]; + overrideStrategy = "asDropin"; + }; + } + ) + ) + { + "microvm-tap-interfaces@" = { + description = "Setup MicroVM '%i' TAP interfaces"; + before = [ "microvm@%i.service" ]; + partOf = [ "microvm@%i.service" ]; + after = [ "network.target" ]; + unitConfig.ConditionPathExists = "${stateDir}/%i/current/bin/tap-up"; + restartIfChanged = false; + serviceConfig = { + Type = "oneshot"; + RemainAfterExit = true; + SyslogIdentifier = "microvm-tap-interfaces@%i"; + ExecStart = "${stateDir}/%i/current/bin/tap-up"; + ExecStop = "${stateDir}/%i/booted/bin/tap-down"; + }; + }; + + "microvm-macvtap-interfaces@" = { + description = "Setup MicroVM '%i' MACVTAP interfaces"; + before = [ "microvm@%i.service" ]; + partOf = [ "microvm@%i.service" ]; + unitConfig.ConditionPathExists = "${stateDir}/%i/current/bin/macvtap-up"; + restartIfChanged = false; + serviceConfig = { + Type = "oneshot"; + RemainAfterExit = true; + SyslogIdentifier = "microvm-macvtap-interfaces@%i"; + ExecStart = "${stateDir}/%i/current/bin/macvtap-up"; + ExecStop = "${stateDir}/%i/booted/bin/macvtap-down"; + }; + }; + + "microvm-pci-devices@" = { + description = "Setup MicroVM '%i' devices for passthrough"; + before = [ "microvm@%i.service" ]; + partOf = [ "microvm@%i.service" ]; + 
unitConfig.ConditionPathExists = "${stateDir}/%i/current/bin/pci-setup"; + restartIfChanged = false; + serviceConfig = { + Type = "oneshot"; + RemainAfterExit = true; + SyslogIdentifier = "microvm-pci-devices@%i"; + ExecStart = "${stateDir}/%i/current/bin/pci-setup"; + }; + }; + + "microvm-virtiofsd@" = + let + runFromBootedOrCurrent = pkgs.writeShellScript "microvm-runFromBootedOrCurrent" '' + NAME="$1" + VM="$2" + cd "${stateDir}/$VM" + + if [ -e booted ]; then + exec booted/bin/$NAME + else + exec current/bin/$NAME + fi + ''; + + in + { + description = "VirtioFS daemons for MicroVM '%i'"; + before = [ "microvm@%i.service" ]; + after = [ "local-fs.target" ]; + partOf = [ "microvm@%i.service" ]; + unitConfig.ConditionPathExists = "${stateDir}/%i/current/bin/virtiofsd-run"; + restartIfChanged = false; + serviceConfig = { + WorkingDirectory = "${stateDir}/%i"; + ExecStart = "${stateDir}/%i/current/bin/virtiofsd-run"; + ExecStop = "${runFromBootedOrCurrent} virtiofsd-shutdown %i"; + LimitNOFILE = 1048576; + NotifyAccess = "all"; + PrivateTmp = "yes"; + Restart = "always"; + RestartSec = "5s"; + SyslogIdentifier = "microvm-virtiofsd@%i"; + Type = "notify"; + }; + }; + + "microvm@" = { + description = "MicroVM '%i'"; + requires = [ + "microvm-tap-interfaces@%i.service" + "microvm-macvtap-interfaces@%i.service" + "microvm-pci-devices@%i.service" + "microvm-virtiofsd@%i.service" + ]; + after = [ + "network.target" + "microvm-tap-interfaces@%i.service" + "microvm-macvtap-interfaces@%i.service" + "microvm-pci-devices@%i.service" + "microvm-virtiofsd@%i.service" + ]; + unitConfig.ConditionPathExists = "${stateDir}/%i/current/bin/microvm-run"; + restartIfChanged = false; + preStart = '' + rm -f booted + ln -s $(readlink current) booted + ''; + postStop = '' + rm booted + ''; + serviceConfig = { + Type = if config.microvm.host.useNotifySockets then "notify" else "simple"; + WorkingDirectory = "${stateDir}/%i"; + ExecStart = "${stateDir}/%i/current/bin/microvm-run"; + ExecStop 
= "${stateDir}/%i/booted/bin/microvm-shutdown"; + TimeoutSec = config.microvm.host.startupTimeout; + Restart = "always"; + RestartSec = "5s"; + User = user; + Group = group; + SyslogIdentifier = "microvm@%i"; + LimitNOFILE = 1048576; + NotifyAccess = "all"; + LimitMEMLOCK = "infinity"; + }; + }; + } + (builtins.attrNames config.microvm.vms); + + microvm.autostart = builtins.filter (vmName: config.microvm.vms.${vmName}.autostart) ( + builtins.attrNames config.microvm.vms + ); + # Starts all the containers after boot + systemd.targets.microvms = { + wantedBy = [ "multi-user.target" ]; + wants = map (name: "microvm@${name}.service") config.microvm.autostart; + }; + + # This helper creates tap interfaces and attaches them to a bridge + # for qemu regardless if it is run as root or not. + security.wrappers.qemu-bridge-helper = lib.mkIf (!config.virtualisation.libvirtd.enable) { + source = "${pkgs.qemu-utils}/libexec/qemu-bridge-helper"; + owner = "root"; + group = "root"; + capabilities = "cap_net_admin+ep"; + }; + + # You must define this file with your bridge interfaces if you + # intend to use qemu-bridge-helper through a `type = "bridge"` + # interface. + environment.etc."qemu/bridge.conf".text = lib.mkDefault '' + allow all + ''; + + # Enable Kernel Same-Page Merging + hardware.ksm.enable = lib.mkDefault true; + + # TODO: remove in 2026 + system.activationScripts.microvm-update-check = '' + if [ -d ${stateDir} ]; then + _outdated_microvms="" + + for dir in ${stateDir}/*; do + if [ -e $dir/current/share/microvm/virtiofs ] && + [ ! -e $dir/current/bin/virtiofsd-run ]; then + _outdated_microvms="$_outdated_microvms $(basename $dir)" + elif [ -e $dir/current/share/microvm/tap-interfaces ] && + [ ! -e $dir/current/bin/tap-up ]; then + _outdated_microvms="$_outdated_microvms $(basename $dir)" + elif [ -e $dir/current/share/microvm/macvtap-interfaces ] && + [ ! 
-e $dir/current/bin/macvtap-up ]; then + _outdated_microvms="$_outdated_microvms $(basename $dir)" + elif [ -e $dir/current/share/microvm/pci-devices ] && + [ ! -e $dir/current/bin/pci-setup ]; then + _outdated_microvms="$_outdated_microvms $(basename $dir)" + fi + done + + if [ "$_outdated_microvms" != "" ]; then + echo "The following MicroVMs must be updated to follow the new virtiofsd/tap/macvtap/pci setup scheme: $_outdated_microvms" + fi + fi + ''; + }; +} diff --git a/example/host/options.nix b/example/host/options.nix new file mode 100755 index 0000000..b7a1197 --- /dev/null +++ b/example/host/options.nix @@ -0,0 +1,182 @@ +{ pkgs, lib, ... }: + +{ + options.microvm = with lib; { + host.enable = mkOption { + type = types.bool; + default = true; + description = '' + Whether to enable the microvm.nix host module. + ''; + }; + host.startupTimeout = mkOption { + description = "Start up timeout for the VMs in seconds"; + type = types.ints.positive; + default = 150; + }; + + host.useNotifySockets = mkOption { + type = types.bool; + default = false; + description = '' + Enable if all your MicroVMs run with a Hypervisor that sends + readiness notification over a VSOCK. + + **Danger!** If one of your MicroVMs doesn't do this, its + systemd service will not start up successfully! + ''; + }; + + vms = mkOption { + type = + with types; + attrsOf ( + submodule ( + { config, name, ... }: + { + options = { + config = mkOption { + description = '' + A specification of the desired configuration of this MicroVM, + as a NixOS module, for building **without** a flake. + ''; + default = null; + type = nullOr ( + lib.mkOptionType { + name = "Toplevel NixOS config"; + merge = + loc: defs: + (import "${config.nixpkgs}/nixos/lib/eval-config.nix" { + modules = + let + extraConfig = ( + { lib, ... 
}: + { + _file = "module at ${__curPos.file}:${toString __curPos.line}"; + config = { + networking.hostName = lib.mkDefault name; + }; + } + ); + in + [ + extraConfig + ../microvm + ] + ++ (map (x: x.value) defs); + prefix = [ + "microvm" + "vms" + name + "config" + ]; + inherit (config) specialArgs pkgs; + system = + if config.pkgs != null then + config.pkgs.stdenv.hostPlatform.system + else + pkgs.stdenv.hostPlatform.system; + }); + } + ); + }; + + nixpkgs = mkOption { + type = types.path; + default = if config.pkgs != null then config.pkgs.path else pkgs.path; + defaultText = literalExpression "pkgs.path"; + description = '' + This option is only respected when `config` is + specified. + + The nixpkgs path to use for the MicroVM. Defaults to the + host's nixpkgs. + ''; + }; + + pkgs = mkOption { + type = types.nullOr types.unspecified; + default = pkgs; + defaultText = literalExpression "pkgs"; + description = '' + This option is only respected when `config` is specified. + + The package set to use for the MicroVM. Must be a + nixpkgs package set with the microvm overlay. Determines + the system of the MicroVM. + + If set to null, a new package set will be instantiated. + ''; + }; + + specialArgs = mkOption { + type = types.attrsOf types.unspecified; + default = { }; + description = '' + This option is only respected when `config` is specified. + + A set of special arguments to be passed to NixOS modules. + This will be merged into the `specialArgs` used to evaluate + the NixOS configurations. 
+ ''; + }; + + flake = mkOption { + description = "Source flake for declarative build"; + type = nullOr path; + default = null; + defaultText = literalExpression ''flakeInputs.my-infra''; + }; + + updateFlake = mkOption { + description = "Source flakeref to store for later imperative update"; + type = nullOr str; + default = null; + defaultText = literalExpression ''"git+file:///home/user/my-infra"''; + }; + + autostart = mkOption { + description = "Add this MicroVM to config.microvm.autostart?"; + type = bool; + default = true; + }; + + restartIfChanged = mkOption { + type = types.bool; + default = config.config != null; + description = '' + Restart this MicroVM's services if the systemd units are changed, + i.e. if it has been updated by rebuilding the host. + + Defaults to true for fully-declarative MicroVMs. + ''; + }; + }; + } + ) + ); + default = { }; + description = '' + The MicroVMs that shall be built declaratively with the host NixOS. + ''; + }; + + stateDir = mkOption { + type = types.path; + default = "/var/lib/microvms"; + description = '' + Directory that contains the MicroVMs + ''; + }; + + autostart = mkOption { + type = with types; listOf str; + default = [ ]; + description = '' + MicroVMs to start by default. + + This includes declarative `config.microvm.vms` as well as MicroVMs that are managed through the `microvm` command. + ''; + }; + }; +} diff --git a/example/microvm/asserts.nix b/example/microvm/asserts.nix new file mode 100755 index 0000000..793572b --- /dev/null +++ b/example/microvm/asserts.nix @@ -0,0 +1,128 @@ +{ config, lib, ... }: +let + inherit (config.networking) hostName; + +in +lib.mkIf config.microvm.guest.enable { + assertions = + # check for duplicate volume images + map (volumes: { + assertion = builtins.length volumes == 1; + message = '' + MicroVM ${hostName}: volume image "${(builtins.head volumes).image}" is used ${toString (builtins.length volumes)} > 1 times. 
+ ''; + }) (builtins.attrValues (builtins.groupBy ({ image, ... }: image) config.microvm.volumes)) + ++ + # check for duplicate interface ids + map (interfaces: { + assertion = builtins.length interfaces == 1; + message = '' + MicroVM ${hostName}: interface id "${(builtins.head interfaces).id}" is used ${toString (builtins.length interfaces)} > 1 times. + ''; + }) (builtins.attrValues (builtins.groupBy ({ id, ... }: id) config.microvm.interfaces)) + ++ + # check for bridge interfaces + map ( + { + id, + type, + bridge, + ... + }: + if type == "bridge" then + { + assertion = bridge != null; + message = '' + MicroVM ${hostName}: interface ${id} is of type "bridge" + but doesn't have a bridge to attach to defined. + ''; + } + else + { + assertion = bridge == null; + message = '' + MicroVM ${hostName}: interface ${id} is not of type "bridge" + and therefore shouldn't have a "bridge" option defined. + ''; + } + ) config.microvm.interfaces + ++ + # check for interface name length + map ( + { id, ... }: + { + assertion = builtins.stringLength id <= 15; + message = '' + MicroVM ${hostName}: interface name ${id} is longer than the + the maximum length of 15 characters on Linux. + ''; + } + ) config.microvm.interfaces + ++ + # check for duplicate share tags + map (shares: { + assertion = builtins.length shares == 1; + message = '' + MicroVM ${hostName}: share tag "${(builtins.head shares).tag}" is used ${toString (builtins.length shares)} > 1 times. + ''; + }) (builtins.attrValues (builtins.groupBy ({ tag, ... }: tag) config.microvm.shares)) + ++ + # check for duplicate share sockets + map + (shares: { + assertion = builtins.length shares == 1; + message = '' + MicroVM ${hostName}: share socket "${(builtins.head shares).socket}" is used ${toString (builtins.length shares)} > 1 times. + ''; + }) + ( + builtins.attrValues ( + builtins.groupBy ({ socket, ... }: toString socket) ( + builtins.filter ({ proto, ... 
}: proto == "virtiofs") config.microvm.shares + ) + ) + ) + ++ + # check for virtiofs shares without socket + map ( + { tag, socket, ... }: + { + assertion = socket != null; + message = '' + MicroVM ${hostName}: virtiofs share with tag "${tag}" is missing a `socket` path. + ''; + } + ) (builtins.filter ({ proto, ... }: proto == "virtiofs") config.microvm.shares) + ++ + # blacklist forwardPorts + [ + { + assertion = + config.microvm.forwardPorts != [ ] + -> ( + config.microvm.hypervisor == "qemu" + && builtins.any ({ type, ... }: type == "user") config.microvm.interfaces + ); + message = '' + MicroVM ${hostName}: `config.microvm.forwardPorts` works only with qemu and one network interface with `type = "user"` + ''; + } + ] + ++ + # cloud-hypervisor specific asserts + lib.optionals (config.microvm.hypervisor == "cloud-hypervisor") [ + { + assertion = + !(lib.any (str: lib.hasInfix "oem_strings" str) config.microvm.cloud-hypervisor.platformOEMStrings); + message = '' + MicroVM ${hostName}: `config.microvm.cloud-hypervisor.platformOEMStrings` items must not contain `oem_strings` + ''; + } + ]; + + warnings = + # 32 MB is just an optimistic guess, not based on experience + lib.optional (config.microvm.mem < 32) '' + MicroVM ${hostName}: ${toString config.microvm.mem} MB of RAM is uncomfortably narrow. + ''; +} diff --git a/example/microvm/boot-disk.nix b/example/microvm/boot-disk.nix new file mode 100755 index 0000000..44788d2 --- /dev/null +++ b/example/microvm/boot-disk.nix @@ -0,0 +1,69 @@ +{ + config, + lib, + pkgs, + ... +}: + +let + inherit (config.system.boot.loader) kernelFile; + inherit (config.microvm) initrdPath; + + kernelPath = "${config.microvm.kernel}/${kernelFile}"; + +in +{ + options.microvm = with lib; { + bootDisk = mkOption { + type = types.path; + description = '' + Generated. + + Required for Hypervisors that do not support direct + kernel+initrd loading. 
+ ''; + }; + }; + + config = lib.mkIf config.microvm.guest.enable { + microvm.bootDisk = + pkgs.runCommandLocal "microvm-bootdisk.img" + { + nativeBuildInputs = with pkgs; [ + parted + libguestfs + ]; + LIBGUESTFS_PATH = pkgs.libguestfs-appliance; + } + '' + # kernel + initrd + slack, in sectors + EFI_SIZE=$(( ( ( $(stat -c %s ${kernelPath}) + $(stat -c %s ${initrdPath}) + 16 * 4096 ) / ( 2048 * 512 ) + 1 ) * 2048 )) + + truncate -s $(( ( $EFI_SIZE + 2048 + 33 ) * 512 )) $out + echo Creating partition table + parted --script $out -- \ + mklabel gpt \ + mkpart ESP fat32 2048s $(( $EFI_SIZE + 2048 - 1 ))"s" \ + set 1 boot on + + echo Creating EFI partition + export HOME=`pwd` + guestfish --add $out run \: mkfs fat /dev/sda1 + guestfs() { + guestfish --add $out --mount /dev/sda1:/ $@ + } + guestfs mkdir /loader + echo 'default *.conf' > loader.conf + guestfs copy-in loader.conf /loader/ + guestfs mkdir /loader/entries + cat > entry.conf < "/proc/sys/net/ipv6/conf/${id}/disable_ipv6" + fi + ${lib.getExe' pkgs.iproute2 "ip"} link set '${id}' up + ${pkgs.coreutils-full}/bin/chown '${user}:${group}' /dev/tap$(< "/sys/class/net/${id}/ifindex") + '' + ) macvtapInterfaces; + + macvtap-down = '' + set -ou pipefail + '' + + lib.concatMapStrings ( + { id, ... }: + '' + ${lib.getExe' pkgs.iproute2 "ip"} link delete '${id}' + '' + ) macvtapInterfaces; + }) + ]; +} diff --git a/example/microvm/mounts.nix b/example/microvm/mounts.nix new file mode 100755 index 0000000..078aa8a --- /dev/null +++ b/example/microvm/mounts.nix @@ -0,0 +1,207 @@ +{ config, lib, ... }: + +let + inherit (config.microvm) storeDiskType storeOnDisk writableStoreOverlay; + + inherit + (import ../../lib { + inherit lib; + }) + defaultFsType + withDriveLetters + ; + + hostStore = builtins.head ( + builtins.filter ({ source, ... 
}: source == "/nix/store") config.microvm.shares + ); + + roStore = if storeOnDisk then "/nix/.ro-store" else hostStore.mountPoint; + + roStoreDisk = + if storeOnDisk then + if + storeDiskType == "erofs" + # erofs supports filesystem labels + then + "/dev/disk/by-label/nix-store" + else + "/dev/vda" + else + throw "No disk letter when /nix/store is not in disk"; + +in +lib.mkIf config.microvm.guest.enable { + fileSystems = lib.mkMerge [ + ( + # built-in read-only store without overlay + lib.optionalAttrs (storeOnDisk && writableStoreOverlay == null) { + "/nix/store" = { + device = roStoreDisk; + fsType = storeDiskType; + options = [ "x-systemd.requires=systemd-modules-load.service" ]; + neededForBoot = true; + noCheck = true; + }; + } + ) + ( + # host store is mounted somewhere else, + # bind-mount to the proper place + lib.optionalAttrs + ( + !storeOnDisk && config.microvm.writableStoreOverlay == null && hostStore.mountPoint != "/nix/store" + ) + { + "/nix/store" = { + device = hostStore.mountPoint; + options = [ "bind" ]; + neededForBoot = true; + }; + } + ) + ( + # built-in read-only store for the overlay + lib.optionalAttrs (storeOnDisk && writableStoreOverlay != null) { + "/nix/.ro-store" = { + device = roStoreDisk; + fsType = storeDiskType; + options = [ "x-systemd.requires=systemd-modules-load.service" ]; + neededForBoot = true; + noCheck = true; + }; + } + ) + ( + # mount store with writable overlay + lib.optionalAttrs (writableStoreOverlay != null) { + "/nix/store" = { + device = "overlay"; + fsType = "overlay"; + neededForBoot = true; + options = [ + "lowerdir=${roStore}" + "upperdir=${writableStoreOverlay}/store" + "workdir=${writableStoreOverlay}/work" + ]; + depends = [ + roStore + writableStoreOverlay + ]; + }; + } + ) + { + # a tmpfs / by default. can be overwritten. 
+ "/" = lib.mkDefault { + device = "rootfs"; + fsType = "tmpfs"; + options = [ "size=50%,mode=0755" ]; + neededForBoot = true; + }; + } + ( + # Volumes + builtins.foldl' ( + result: + { + label, + mountPoint, + letter, + fsType ? defaultFsType, + ... + }: + result + // lib.optionalAttrs (mountPoint != null) { + "${mountPoint}" = { + inherit fsType; + # Prioritize identifying a device by label if provided. This + # minimizes the risk of misidentifying a device. + device = if label != null then "/dev/disk/by-label/${label}" else "/dev/vd${letter}"; + } + // lib.optionalAttrs (mountPoint == config.microvm.writableStoreOverlay) { + neededForBoot = true; + }; + } + ) { } (withDriveLetters config.microvm) + ) + ( + # 9p/virtiofs Shares + builtins.foldl' ( + result: + { + mountPoint, + tag, + proto, + source, + ... + }: + result + // { + "${mountPoint}" = { + device = tag; + fsType = proto; + options = + { + "virtiofs" = [ + "defaults" + "x-systemd.requires=systemd-modules-load.service" + ]; + "9p" = [ + "trans=virtio" + "version=9p2000.L" + "msize=65536" + "x-systemd.requires=systemd-modules-load.service" + ]; + } + .${proto}; + } + // lib.optionalAttrs (source == "/nix/store" || mountPoint == config.microvm.writableStoreOverlay) { + neededForBoot = true; + }; + } + ) { } config.microvm.shares + ) + ]; + + # boot.initrd.systemd patchups copied from + boot.initrd.systemd = lib.mkIf (config.boot.initrd.systemd.enable && writableStoreOverlay != null) { + mounts = [ + { + where = "/sysroot/nix/store"; + what = "overlay"; + type = "overlay"; + options = builtins.concatStringsSep "," [ + "lowerdir=/sysroot${roStore}" + "upperdir=/sysroot${writableStoreOverlay}/store" + "workdir=/sysroot${writableStoreOverlay}/work" + ]; + wantedBy = [ "initrd-fs.target" ]; + before = [ "initrd-fs.target" ]; + requires = [ "rw-store.service" ]; + after = [ "rw-store.service" ]; + unitConfig.RequiresMountsFor = "/sysroot/${roStore}"; + } + ]; + services.rw-store = { + unitConfig = { + 
DefaultDependencies = false; + RequiresMountsFor = "/sysroot${writableStoreOverlay}"; + }; + serviceConfig = { + Type = "oneshot"; + ExecStart = "/bin/mkdir -p -m 0755 /sysroot${writableStoreOverlay}/store /sysroot${writableStoreOverlay}/work /sysroot/nix/store"; + }; + }; + }; + + # Fix for hanging shutdown + systemd.mounts = lib.mkIf config.boot.initrd.systemd.enable [ + { + what = "store"; + where = "/nix/store"; + # Generate a `nix-store.mount.d/overrides.conf` + overrideStrategy = "asDropin"; + unitConfig.DefaultDependencies = false; + } + ]; +} diff --git a/example/microvm/optimization.nix b/example/microvm/optimization.nix new file mode 100755 index 0000000..b3dc846 --- /dev/null +++ b/example/microvm/optimization.nix @@ -0,0 +1,65 @@ +# Closure size and startup time optimization for disposable use-cases +{ + config, + options, + lib, + ... +}: + +let + cfg = config.microvm; + + canSwitchViaSsh = + config.services.openssh.enable + && + # Is the /nix/store mounted from the host? + builtins.any ({ source, ... }: source == "/nix/store") config.microvm.shares; + +in +lib.mkIf (cfg.guest.enable && cfg.optimize.enable) { + # The docs are pretty chonky + documentation.enable = lib.mkDefault false; + + boot = { + initrd.systemd = { + # Use systemd initrd for startup speed. 
+ # TODO: error mounting /nix/store on crosvm, kvmtool + enable = lib.mkDefault ( + builtins.elem cfg.hypervisor [ + "qemu" + "cloud-hypervisor" + "firecracker" + "stratovirt" + ] + ); + tpm2.enable = lib.mkDefault false; + }; + kernelParams = [ + # we only need one serial console + "8250.nr_uarts=1" + ]; + swraid.enable = lib.mkDefault false; + }; + + nixpkgs.overlays = [ + (final: prev: { + stratovirt = prev.stratovirt.override { gtk3 = null; }; + }) + ]; + + # networkd is used due to some strange startup time issues with nixos's + # homegrown dhcp implementation + networking.useNetworkd = lib.mkDefault true; + + systemd = { + # Due to a bug in systemd-networkd: https://github.com/systemd/systemd/issues/29388 + # we cannot use systemd-networkd-wait-online. + network.wait-online.enable = lib.mkDefault false; + tpm2.enable = lib.mkDefault false; + }; + + # Exclude switch-to-configuration.pl from toplevel. + system = lib.optionalAttrs (options.system ? switch && !canSwitchViaSsh) { + switch.enable = lib.mkDefault false; + }; +} diff --git a/example/microvm/options.nix b/example/microvm/options.nix new file mode 100755 index 0000000..4f5ab62 --- /dev/null +++ b/example/microvm/options.nix @@ -0,0 +1,806 @@ +{ + config, + lib, + pkgs, + ... +}: +let + self-lib = import ../../lib { + inherit lib; + }; + + cfg = config.microvm; + hostName = config.networking.hostName or "$HOSTNAME"; + kernelAtLeast = lib.versionAtLeast config.boot.kernelPackages.kernel.version; +in +{ + options.microvm = with lib; { + guest.enable = mkOption { + type = types.bool; + default = true; + description = '' + Whether to enable the microvm.nix guest module at all. 
+ ''; + }; + + optimize.enable = lib.mkOption { + description = '' + Enables some optimizations by default to closure size and startup time: + - defaults documentation to off + - defaults to using systemd in initrd + - use systemd-networkd + - disables systemd-network-wait-online + - disables NixOS system switching if the host store is not mounted + + This takes a few hundred MB off the closure size, including qemu, + allowing for putting MicroVMs inside Docker containers. + ''; + + type = lib.types.bool; + default = true; + }; + + cpu = mkOption { + type = with types; nullOr str; + default = null; + description = '' + What CPU to emulate, if any. If different from the host + architecture, it will have a serious performance hit. + + ::: {.note} + Only supported with qemu. + ::: + ''; + }; + + hypervisor = mkOption { + type = types.enum self-lib.hypervisors; + default = "qemu"; + description = '' + Which hypervisor to use for this MicroVM + + Choose one of: ${lib.concatStringsSep ", " self-lib.hypervisors} + ''; + }; + + preStart = mkOption { + description = "Commands to run before starting the hypervisor"; + default = ""; + type = types.lines; + }; + + socket = mkOption { + description = "Hypervisor control socket path"; + default = "${hostName}.sock"; + defaultText = literalExpression ''"''${hostName}.sock"''; + type = with types; nullOr str; + }; + + user = mkOption { + description = "User to switch to when started as root"; + default = null; + type = with types; nullOr str; + }; + + kernel = mkOption { + description = "Kernel package to use for MicroVM runners. 
Better set `boot.kernelPackages` instead."; + default = config.boot.kernelPackages.kernel; + defaultText = literalExpression ''"''${config.boot.kernelPackages.kernel}"''; + type = types.package; + }; + + initrdPath = mkOption { + description = "Path to the initrd file in the initrd package"; + default = "${config.system.build.initialRamdisk}/${config.system.boot.loader.initrdFile}"; + defaultText = literalExpression ''"''${config.system.build.initialRamdisk}/''${config.system.boot.loader.initrdFile}"''; + type = types.path; + }; + + vcpu = mkOption { + description = "Number of virtual CPU cores"; + default = 1; + type = types.ints.positive; + }; + + mem = mkOption { + description = "Amount of RAM in megabytes"; + default = 512; + type = types.ints.positive; + }; + + hugepageMem = mkOption { + type = types.bool; + default = false; + description = '' + Whether to use hugepages as memory backend. + (Currently only respected if using cloud-hypervisor) + ''; + }; + + hotplugMem = mkOption { + description = '' + Amount of hotplug memory in megabytes. + + This describes the maximum amount of memory that can be dynamically added to the VM with virtio-mem. + ''; + default = 0; + type = types.ints.unsigned; + }; + + hotpluggedMem = mkOption { + description = '' + Amount of hotplugged memory in megabytes. + + This basically describes the amount of hotplug memory the VM starts with. + ''; + default = config.microvm.hotplugMem; + type = types.ints.unsigned; + }; + + balloon = mkOption { + description = '' + Whether to enable ballooning. + + By "inflating" or increasing the balloon the host can reduce the VMs + memory amount and reclaim it for itself. + When "deflating" or decreasing the balloon the host can give the memory + back to the VM. + + virtio-mem is recommended over ballooning if supported by the hypervisor. + ''; + default = false; + type = types.bool; + }; + + initialBalloonMem = mkOption { + description = '' + Amount of initial balloon memory in megabytes. 
+ ''; + default = 0; + type = types.ints.unsigned; + }; + + deflateOnOOM = mkOption { + type = types.bool; + default = true; + description = '' + Whether to enable automatic balloon deflation on out-of-memory. + ''; + }; + + forwardPorts = mkOption { + type = types.listOf ( + types.submodule { + options.from = mkOption { + type = types.enum [ + "host" + "guest" + ]; + default = "host"; + description = '' + Controls the direction in which the ports are mapped: + + - "host" means traffic from the host ports + is forwarded to the given guest port. + + - "guest" means traffic from the guest ports + is forwarded to the given host port. + ''; + }; + options.proto = mkOption { + type = types.enum [ + "tcp" + "udp" + ]; + default = "tcp"; + description = "The protocol to forward."; + }; + options.host.address = mkOption { + type = types.str; + default = ""; + description = "The IPv4 address of the host."; + }; + options.host.port = mkOption { + type = types.port; + description = "The host port to be mapped."; + }; + options.guest.address = mkOption { + type = types.str; + default = ""; + description = "The IPv4 address on the guest VLAN."; + }; + options.guest.port = mkOption { + type = types.port; + description = "The guest port to be mapped."; + }; + } + ); + default = [ ]; + example = lib.literalExpression /* nix */ '' + [ # forward local port 2222 -> 22, to ssh into the VM + { from = "host"; host.port = 2222; guest.port = 22; } + + # forward local port 80 -> 10.0.2.10:80 in the VLAN + { from = "guest"; + guest.address = "10.0.2.10"; guest.port = 80; + host.address = "127.0.0.1"; host.port = 80; + } + ] + ''; + description = '' + When using the SLiRP user networking (default), this option allows to + forward ports to/from the host/guest. + + ::: {.warning} + If the NixOS firewall on the virtual machine is enabled, you + also have to open the guest ports to enable the traffic + between host and guest. + ::: + + ::: {.note} + Currently QEMU supports only IPv4 forwarding. 
+ ::: + ''; + }; + + volumes = mkOption { + description = "Disk images"; + default = [ ]; + type = + with types; + listOf (submodule { + options = { + image = mkOption { + type = str; + description = "Path to disk image on the host"; + }; + serial = mkOption { + type = nullOr str; + default = null; + description = "User-configured serial number for the disk"; + }; + direct = mkOption { + type = bool; + default = false; + description = "Whether to set O_DIRECT on the disk."; + }; + readOnly = mkOption { + type = bool; + default = false; + description = "Turn off write access"; + }; + label = mkOption { + type = nullOr str; + default = null; + description = "Label of the volume, if any. Only applicable if `autoCreate` is true; otherwise labeling of the volume must be done manually"; + }; + mountPoint = mkOption { + type = nullOr path; + description = "If and where to mount the volume inside the container"; + }; + size = mkOption { + type = int; + description = "Volume size (in MiB) if created automatically"; + }; + autoCreate = mkOption { + type = bool; + default = true; + description = "Created image on host automatically before start?"; + }; + mkfsExtraArgs = mkOption { + type = listOf str; + default = [ ]; + description = "Set extra Filesystem creation parameters"; + }; + fsType = mkOption { + type = str; + default = "ext4"; + description = "Filesystem for automatic creation and mounting"; + }; + }; + }); + }; + + interfaces = mkOption { + description = "Network interfaces"; + default = [ ]; + type = + with types; + listOf (submodule { + options = { + type = mkOption { + type = enum [ + "user" + "tap" + "macvtap" + "bridge" + ]; + description = '' + Interface type + ''; + }; + id = mkOption { + type = str; + description = '' + Interface name on the host + ''; + }; + macvtap.link = mkOption { + type = str; + description = '' + Attach network interface to host interface for type = "macvlan" + ''; + }; + macvtap.mode = mkOption { + type = enum [ + "private" + "vepa" 
+ "bridge" + "passthru" + "source" + ]; + description = '' + The MACVLAN mode to use + ''; + }; + bridge = mkOption { + type = nullOr str; + default = null; + description = '' + Attach network interface to host bridge interface for type = "bridge" + ''; + }; + mac = mkOption { + type = str; + description = '' + MAC address of the guest's network interface + ''; + }; + }; + }); + }; + + shares = mkOption { + description = "Shared directory trees"; + default = [ ]; + type = + with types; + listOf ( + submodule ( + { config, ... }: + { + options = { + tag = mkOption { + type = str; + description = "Unique virtiofs daemon tag"; + }; + socket = mkOption { + type = nullOr str; + default = if config.proto == "virtiofs" then "${hostName}-virtiofs-${config.tag}.sock" else null; + description = "Socket for communication with virtiofs daemon"; + }; + source = mkOption { + type = nonEmptyStr; + description = "Path to shared directory tree"; + }; + securityModel = mkOption { + type = enum [ + "passthrough" + "none" + "mapped" + "mapped-file" + ]; + default = "none"; + description = "What security model to use for the shared directory"; + }; + mountPoint = mkOption { + type = path; + description = "Where to mount the share inside the container"; + }; + proto = mkOption { + type = enum [ + "9p" + "virtiofs" + ]; + description = "Protocol for this share"; + default = "9p"; + }; + readOnly = mkOption { + type = bool; + description = "Turn off write access"; + default = false; + }; + }; + } + ) + ); + }; + + devices = mkOption { + description = "PCI/USB devices that are passed from the host to the MicroVM"; + default = [ ]; + example = literalExpression /* nix */ '' + [ { + bus = "pci"; + path = "0000:01:00.0"; + } { + bus = "pci"; + path = "0000:01:01.0"; + deviceExtraArgs = "id=hostId,x-igd-opregion=on"; + } { + # QEMU only + bus = "usb"; + path = "vendorid=0xabcd,productid=0x0123"; + } ] + ''; + type = + with types; + listOf (submodule { + options = { + bus = mkOption { + type = 
enum [ + "pci" + "usb" + ]; + description = '' + Device is either on the `pci` or the `usb` bus + ''; + }; + path = mkOption { + type = str; + description = '' + Identification of the device on its bus + ''; + }; + qemu.deviceExtraArgs = mkOption { + type = with types; nullOr str; + default = null; + description = '' + Device additional arguments (optional) + ''; + }; + }; + }); + }; + + vsock.cid = mkOption { + default = null; + type = with types; nullOr int; + description = '' + Virtual Machine address; + setting it enables AF_VSOCK + + The following are reserved: + - 0: Hypervisor + - 1: Loopback + - 2: Host + ''; + }; + + kernelParams = mkOption { + type = with types; listOf str; + description = "Includes boot.kernelParams but doesn't end up in toplevel, thereby allowing references to toplevel"; + }; + + storeOnDisk = mkOption { + type = types.bool; + default = !lib.any ({ source, ... }: source == "/nix/store") config.microvm.shares; + description = "Whether to boot with the storeDisk, that is, unless the host's /nix/store is a microvm.share."; + }; + + registerClosure = + lib.mkEnableOption '' + Register system closure's store paths in Nix db. + + While enabled by default, this option may be incompatible with a persistent writable store overlay. + '' + // { + default = config.microvm.guest.enable; + }; + + writableStoreOverlay = mkOption { + type = with types; nullOr str; + default = null; + example = "/nix/.rw-store"; + description = '' + Path to the writable /nix/store overlay. + + If set to a filesystem path, the initrd will mount /nix/store + as an overlay filesystem consisting of the read-only part as a + host share or from the built storeDisk, and this configuration + option as the writable overlay part. This allows you to build + nix derivations *inside* the VM. + + Make sure that the path points to a writable filesystem + (tmpfs, volume, or share). 
+ ''; + }; + + graphics = { + enable = mkOption { + type = types.bool; + default = false; + description = '' + Enable GUI support. + + MicroVMs with graphics are intended for the interactive + use-case. They cannot be started through systemd jobs. + + The display backend is chosen by `microvm.graphics.backend`. + ''; + }; + + backend = mkOption { + type = types.enum [ + "gtk" + "cocoa" + ]; + default = if pkgs.stdenv.hostPlatform.isDarwin then "cocoa" else "gtk"; + defaultText = lib.literalExpression ''if pkgs.stdenv.hostPlatform.isDarwin then "cocoa" else "gtk"''; + description = '' + QEMU display backend to use when `graphics.enable` is true. + + Defaults to `cocoa` on Darwin hosts and `gtk` otherwise. + ''; + }; + + socket = mkOption { + type = types.str; + default = "${hostName}-gpu.sock"; + description = '' + Path of vhost-user socket + ''; + }; + }; + + vmHostPackages = mkOption { + description = "If set, overrides the default host package."; + example = "nixpkgs.legacyPackages.aarch64-darwin.pkgs"; + type = types.nullOr types.pkgs; + default = if cfg.cpu == null then pkgs else pkgs.buildPackages; + defaultText = lib.literalExpression "if config.microvm.cpu == null then pkgs else pkgs.buildPackages"; + }; + + qemu.machine = mkOption { + type = types.str; + description = '' + QEMU machine model, eg. `microvm`, or `q35` + + Get a full list with `qemu-system-x86_64 -M help` + + This has a default declared with `lib.mkDefault` because it + depends on ''${pkgs.system}. + ''; + }; + + qemu.machineOpts = mkOption { + type = with types; nullOr (attrsOf str); + default = null; + description = "Overwrite the default machine model options."; + }; + + qemu.extraArgs = mkOption { + type = with types; listOf str; + default = [ ]; + description = "Extra arguments to pass to qemu."; + }; + + qemu.serialConsole = mkOption { + type = types.bool; + default = true; + description = '' + Whether to enable the virtual serial console on qemu. 
+ ''; + }; + + cloud-hypervisor.platformOEMStrings = mkOption { + type = with types; listOf str; + default = [ ]; + description = '' + Extra arguments to pass to cloud-hypervisor's --platform oem_strings=[] argument. + + All the oem strings will be concatenated with a comma (,) and wrapped in oem_string=[]. + + Do not include oem_string= or the [] brackets in the value. + + The resulting string will be combined with any --platform options in + `config.microvm.cloud-hypervisor.extraArgs` and passed as a single + --platform option to cloud-hypervisor + ''; + example = lib.literalExpression /* nix */ ''[ "io.systemd.credential:APIKEY=supersecret" ]''; + }; + + cloud-hypervisor.extraArgs = mkOption { + type = with types; listOf str; + default = [ ]; + description = "Extra arguments to pass to cloud-hypervisor."; + }; + + crosvm.extraArgs = mkOption { + type = with types; listOf str; + default = [ ]; + description = "Extra arguments to pass to crosvm."; + }; + + crosvm.pivotRoot = mkOption { + type = with types; nullOr str; + default = null; + description = "A Hypervisor's sandbox directory"; + }; + + firecracker.cpu = mkOption { + type = with types; nullOr attrs; + default = null; + description = "Custom CPU template passed to firecracker."; + }; + + prettyProcnames = mkOption { + type = types.bool; + default = true; + description = '' + Set a recognizable process name right before executing the Hyperisor. + ''; + }; + + virtiofsd.inodeFileHandles = mkOption { + type = + with types; + nullOr (enum [ + "never" + "prefer" + "mandatory" + ]); + default = null; + description = '' + When to use file handles to reference inodes instead of O_PATH file descriptors + (never, prefer, mandatory) + + Allows you to overwrite default behavior in case you hit "too + many open files" on eg. ZFS. 
+ + ''; + }; + + virtiofsd.threadPoolSize = mkOption { + type = + with types; + oneOf [ + str + ints.unsigned + ]; + default = "`nproc`"; + description = '' + The amounts of threads virtiofsd should spawn. This option also takes the special + string `\`nproc\`` which spawns as many threads as the host has cores. + ''; + }; + + virtiofsd.group = mkOption { + type = with types; nullOr str; + default = "kvm"; + description = '' + The name of the group that will own the Unix domain socket file that virtiofsd creates for communication with the hypervisor. + If null, the socket will have group ownership of the user running the hypervisor. + ''; + }; + + virtiofsd.extraArgs = mkOption { + type = with types; listOf str; + default = [ ]; + description = '' + Extra command-line switch to pass to virtiofsd. + ''; + }; + + runner = mkOption { + description = "Generated Hypervisor runner for this NixOS"; + type = with types; attrsOf package; + }; + + declaredRunner = mkOption { + description = "Generated Hypervisor declared by `config.microvm.hypervisor`"; + type = types.package; + default = config.microvm.runner.${config.microvm.hypervisor}; + defaultText = literalExpression ''"config.microvm.runner.''${config.microvm.hypervisor}"''; + }; + + binScripts = mkOption { + description = '' + Script snippets that end up in the runner package's bin/ directory + ''; + default = { }; + type = with types; attrsOf lines; + }; + + storeDiskType = mkOption { + type = types.enum [ + "squashfs" + "erofs" + ]; + description = '' + Boot disk file system type: squashfs is smaller, erofs is supposed to be faster. + + Defaults to erofs, unless the NixOS hardened profile is detected. + ''; + }; + + storeDiskErofsFlags = mkOption { + type = with types; listOf str; + description = '' + Flags to pass to mkfs.erofs + + Omit `"-Efragments"` and `"-Ededupe"` to enable multi-threading. 
+ ''; + default = [ + "-zlz4hc" + ] + ++ lib.optional (kernelAtLeast "5.16") "-Eztailpacking" + ++ lib.optionals (kernelAtLeast "6.1") [ + # not implemented with multi-threading + "-Efragments" + "-Ededupe" + ]; + defaultText = lib.literalExpression '' + [ "-zlz4hc" ] + ++ lib.optional (kernelAtLeast "5.16") "-Eztailpacking" + ++ lib.optionals (kernelAtLeast "6.1") [ + "-Efragments" + "-Ededupe" + ] + ''; + }; + + storeDiskSquashfsFlags = mkOption { + type = with types; listOf str; + description = "Flags to pass to gensquashfs"; + default = [ + "-c" + "zstd" + "-j" + "$NIX_BUILD_CORES" + ]; + }; + + systemSymlink = mkOption { + type = types.bool; + default = !config.microvm.storeOnDisk; + description = '' + Whether to inclcude a symlink of `config.system.build.toplevel` to `share/microvm/system`. + This is required for commands like `microvm -l` to function but removes reference to the uncompressed store content when using a disk image for the nix store. + ''; + }; + + credentialFiles = mkOption { + type = with types; attrsOf path; + default = { }; + description = '' + Key-value pairs of credential files that will be loaded into the vm using systemd's io.systemd.credential feature. 
+ ''; + example = literalExpression /* nix */ '' + { + SOPS_AGE_KEY = "/run/secrets/guest_microvm_age_key"; + } + ''; + }; + }; + + imports = [ + (lib.mkRemovedOptionModule [ + "microvm" + "balloonMem" + ] "The balloonMem option has been removed and replaced by the boolean option balloon") + ]; + + config = lib.mkMerge [ + { + microvm.qemu.machine = lib.mkIf (pkgs.stdenv.hostPlatform.system == "x86_64-linux") ( + lib.mkDefault "microvm" + ); + } + { + microvm.qemu.machine = lib.mkIf (pkgs.stdenv.hostPlatform.system == "aarch64-linux") ( + lib.mkDefault "virt" + ); + } + ]; +} diff --git a/example/microvm/pci-devices.nix b/example/microvm/pci-devices.nix new file mode 100755 index 0000000..4485879 --- /dev/null +++ b/example/microvm/pci-devices.nix @@ -0,0 +1,47 @@ +{ + config, + lib, + pkgs, + ... +}: + +let + pciDevices = builtins.filter ({ bus, ... }: bus == "pci") config.microvm.devices; + + # TODO: don't hardcode but obtain from host config + user = "microvm"; + group = "kvm"; + +in +{ + microvm.binScripts.pci-setup = lib.mkIf (pciDevices != [ ]) ( + '' + set -eou pipefail + ${pkgs.kmod}/bin/modprobe vfio-pci + '' + + lib.concatMapStrings ( + { path, ... }: + '' + cd /sys/bus/pci/devices/${path} + if [ -e driver ]; then + echo ${path} > driver/unbind + fi + echo vfio-pci > driver_override + echo ${path} > /sys/bus/pci/drivers_probe + '' + + + # In order to access the vfio dev the permissions must be set + # for the user/group running the VMM later. 
+ # + # Insprired by https://www.kernel.org/doc/html/next/driver-api/vfio.html#vfio-usage-example + # + # assert we could get the IOMMU group number (=: name of VFIO dev) + '' + [[ -e iommu_group ]] || exit 1 + VFIO_DEV=$(basename $(readlink iommu_group)) + echo "Making VFIO device $VFIO_DEV accessible for user" + chown ${user}:${group} /dev/vfio/$VFIO_DEV + '' + ) pciDevices + ); +} diff --git a/example/microvm/ssh-deploy.nix b/example/microvm/ssh-deploy.nix new file mode 100755 index 0000000..76c8432 --- /dev/null +++ b/example/microvm/ssh-deploy.nix @@ -0,0 +1,252 @@ +{ + config, + lib, + pkgs, + ... +}: + +let + hostName = config.networking.hostName or "$HOSTNAME"; + inherit (config.system.build) toplevel; + inherit (config.microvm) declaredRunner; + inherit (config) nix; + + closureInfo = pkgs.closureInfo { + rootPaths = [ config.system.build.toplevel ]; + }; + + # Don't build these but get the derivation paths for building on a + # remote host, and for switching via SSH. + paths = builtins.mapAttrs (_: builtins.unsafeDiscardStringContext) { + closureInfoOut = closureInfo.outPath; + closureInfoDrv = closureInfo.drvPath; + toplevelOut = toplevel.outPath; + toplevelDrv = toplevel.drvPath; + nixOut = nix.package.outPath; + nixDrv = nix.package.drvPath; + runnerDrv = declaredRunner.drvPath; + }; + + canSwitchViaSsh = + config.system.switch.enable + && + # MicroVM must be reachable through SSH + config.services.openssh.enable + && + # Is the /nix/store mounted from the host? + builtins.any ({ source, ... 
}: source == "/nix/store") config.microvm.shares; + +in +{ + # Declarations with documentation + options.microvm.deploy = { + installOnHost = lib.mkOption { + description = '' + Use this script to deploy the working state of your local + Flake on a target host that imports + `microvm.nixosModules.host`: + + ``` + nix run .#nixosConfigurations.${hostName}.config.microvm.deploy.installOnHost root@example.com + ssh root@example.com systemctl restart microvm@${hostName} + ``` + + - Evaluate this MicroVM to a derivation + - Copy the derivation to the target host + - Build the MicroVM runner on the target host + - Install/update the MicroVM on the target host + + Can be followed by either: + - `systemctl restart microvm@${hostName}.service` on the + target host, or + - `config.microvm.deploy.sshSwitch` + ''; + type = lib.types.package; + }; + + sshSwitch = lib.mkOption { + description = '' + Instead of restarting a MicroVM for an update, perform it via + SSH. + + The host's /nix/store must be mounted, and the built + `config.microvm.declaredRunner` must exist in it. Use + `microvm.deploy.installOnHost` like this: + + ``` + nix run .#nixosConfigurations.${hostName}.config.microvm.deploy.installOnHost root@example.com + nix run .#nixosConfigurations.${hostName}.config.microvm.deploy.sshSwitch root@my-microvm.example.com switch + ``` + ''; + type = with lib.types; nullOr package; + default = null; + }; + + rebuild = lib.mkOption { + description = '' + `config.microvm.deploy.installOnHost` and `.sshSwitch` in one + script. Akin to what nixos-rebuild does but for a remote + MicroVM. 
+ + ``` + nix run .#nixosConfigurations.${hostName}.config.microvm.deploy.rebuild root@example.com root@my-microvm.example.com switch + ``` + ''; + type = with lib.types; nullOr package; + default = null; + }; + }; + + # Implementations + config.microvm.deploy = { + installOnHost = pkgs.writeShellScriptBin "microvm-install-on-host" '' + set -eou pipefail + + USAGE="Usage: $0 root@ [--use-remote-sudo]" + + HOST="$1" + if [[ -z "$HOST" ]]; then + echo $USAGE + exit 1 + fi + shift + SSH_CMD="bash" + if [ $# -gt 0 ]; then + if [ "$1" == "--use-remote-sudo" ]; then + SSH_CMD="sudo bash" + shift + else + echo "$USAGE" + exit 1 + fi + fi + + + echo "Copying derivations to $HOST" + nix copy --no-check-sigs --to "ssh-ng://$HOST" \ + --derivation \ + "${paths.closureInfoDrv}^out" \ + "${paths.runnerDrv}^out" + + ssh "$HOST" -- $SSH_CMD -e <<__SSH__ + set -eou pipefail + + echo "Initializing MicroVM ${hostName} if necessary" + mkdir -p /nix/var/nix/gcroots/microvm + mkdir -p /var/lib/microvms/${hostName} + cd /var/lib/microvms/${hostName} + chown microvm:kvm . + chmod 0755 . + ln -sfT \$PWD/current /nix/var/nix/gcroots/microvm/${hostName} + ln -sfT \$PWD/booted /nix/var/nix/gcroots/microvm/booted-${hostName} + ln -sfT \$PWD/old /nix/var/nix/gcroots/microvm/old-${hostName} + + echo "Building toplevel ${paths.toplevelOut}" + nix build -L --accept-flake-config --no-link \ + ${ + with paths; + lib.concatMapStringsSep " " (drv: "'${drv}^out'") [ + nixDrv + closureInfoDrv + toplevelDrv + ] + } + echo "Building MicroVM runner for ${hostName}" + nix build -L --accept-flake-config -o new \ + "${paths.runnerDrv}^out" + + if [[ $(realpath ./current) != $(realpath ./new) ]]; then + echo "Installing MicroVM ${hostName}" + rm -f old + if [ -e current ]; then + mv current old + fi + mv new current + + if [ -e old ]; then + echo "Success. Diff:" + nix --extra-experimental-features nix-command \ + store diff-closures ./old ./current \ + || true + else + echo "Success." 
+ fi + else + echo "MicroVM ${hostName} is already installed" + fi + __SSH__ + ''; + + sshSwitch = lib.mkIf canSwitchViaSsh ( + pkgs.writeShellScriptBin "microvm-switch" '' + set -eou pipefail + + USAGE="Usage: $0 root@ [--use-remote-sudo]" + + TARGET="$1" + if [[ -z "$TARGET" ]]; then + echo "$USAGE" + exit 1 + fi + shift + SSH_CMD="bash" + if [ $# -gt 0 ]; then + if [ "$1" == "--use-remote-sudo" ]; then + SSH_CMD="sudo bash" + shift + else + echo "$USAGE" + exit 1 + fi + fi + + ssh "$TARGET" $SSH_CMD -e <<__SSH__ + set -eou pipefail + + hostname=\$(cat /etc/hostname) + if [[ "\$hostname" != "${hostName}" ]]; then + echo "Attempting to deploy NixOS ${hostName} on host \$hostname" + exit 1 + fi + + # refresh nix db which is required for nix-env -p ... --set + echo "Refreshing Nix database" + ${paths.nixOut}/bin/nix-store --load-db < ${paths.closureInfoOut}/registration + ${paths.nixOut}/bin/nix-env -p /nix/var/nix/profiles/system --set ${paths.toplevelOut} + + ${paths.toplevelOut}/bin/switch-to-configuration "''${@:-switch}" + __SSH__ + '' + ); + + rebuild = + with config.microvm.deploy; + pkgs.writeShellScriptBin "microvm-rebuild" '' + set -eou pipefail + + HOST="$1" + shift + TARGET="$1" + shift + OPTS="$@" + if [ $# -gt 0 ]; then + if [ "$1" == "--use-remote-sudo" ]; then + OPTS="$1" + shift + fi + fi + if [[ -z "$HOST" || -z "$TARGET" || $# -gt 0 ]]; then + echo "Usage: $0 root@ root@ [--use-remote-sudo] switch" + exit 1 + fi + + ${lib.getExe installOnHost} "$HOST" $OPTS + ${ + if canSwitchViaSsh then + ''${lib.getExe sshSwitch} "$TARGET" $OPTS'' + else + ''ssh "$HOST" -- systemctl restart "microvm@${hostName}.service"'' + } + ''; + }; +} diff --git a/example/microvm/store-disk.nix b/example/microvm/store-disk.nix new file mode 100755 index 0000000..615d6f9 --- /dev/null +++ b/example/microvm/store-disk.nix @@ -0,0 +1,124 @@ +{ + config, + lib, + pkgs, + ... 
+}: + +let + regInfo = pkgs.closureInfo { + rootPaths = [ config.system.build.toplevel ]; + }; + + erofs-utils = + # Are any extended options specified? + if + lib.any ( + with lib; + flip elem [ + "-Ededupe" + "-Efragments" + ] + ) config.microvm.storeDiskErofsFlags + then + # If extended options are present, + # stick to the single-threaded erofs-utils + # to not scare anyone with warning messages. + pkgs.buildPackages.erofs-utils + else + # If no extended options are configured, + # rebuild mkfs.erofs with multi-threading. + pkgs.buildPackages.erofs-utils.overrideAttrs (attrs: { + configureFlags = attrs.configureFlags ++ [ + "--enable-multithreading" + ]; + }); + + erofsFlags = builtins.concatStringsSep " " config.microvm.storeDiskErofsFlags; + squashfsFlags = builtins.concatStringsSep " " config.microvm.storeDiskSquashfsFlags; + + mkfsCommand = + { + squashfs = "gensquashfs ${squashfsFlags} -D store --all-root -q $out"; + erofs = "mkfs.erofs ${erofsFlags} -T 0 --all-root -L nix-store --mount-point=/nix/store $out store"; + } + .${config.microvm.storeDiskType}; + + writeClosure = pkgs.writeClosure or pkgs.writeReferencesToFile; + + storeDiskContents = writeClosure ( + [ config.system.build.toplevel ] ++ lib.optional config.nix.enable regInfo + ); + +in +{ + options.microvm.storeDisk = + with lib; + mkOption { + type = types.path; + description = '' + Generated + ''; + }; + + config = lib.mkMerge [ + (lib.mkIf (config.microvm.guest.enable && config.microvm.storeOnDisk) { + # nixos/modules/profiles/hardened.nix forbids erofs. + # HACK: Other NixOS modules populate + # config.boot.blacklistedKernelModules depending on the boot + # filesystems, so checking on that directly would result in an + # infinite recursion. 
+ microvm.storeDiskType = lib.mkDefault ( + if config.security.virtualisation.flushL1DataCache == "always" then "squashfs" else "erofs" + ); + boot.initrd.availableKernelModules = [ + config.microvm.storeDiskType + ]; + + microvm.storeDisk = + pkgs.runCommandLocal "microvm-store-disk.${config.microvm.storeDiskType}" + { + nativeBuildInputs = [ + pkgs.buildPackages.time + pkgs.buildPackages.bubblewrap + { + squashfs = pkgs.buildPackages.squashfs-tools-ng; + erofs = erofs-utils; + } + .${config.microvm.storeDiskType} + ]; + passthru = { + inherit regInfo; + }; + __structuredAttrs = true; + unsafeDiscardReferences.out = true; + } + '' + mkdir store + BWRAP_ARGS="--dev-bind / / --chdir $(pwd)" + for d in $(sort -u ${storeDiskContents}); do + BWRAP_ARGS="$BWRAP_ARGS --ro-bind $d $(pwd)/store/$(basename $d)" + done + + echo Creating a ${config.microvm.storeDiskType} + bwrap $BWRAP_ARGS -- time ${mkfsCommand} || \ + ( + echo "Bubblewrap failed. Falling back to copying...">&2 + cp -a $(sort -u ${storeDiskContents}) store/ + time ${mkfsCommand} + ) + ''; + }) + + (lib.mkIf (config.microvm.registerClosure && config.nix.enable) { + microvm.kernelParams = [ + "regInfo=${regInfo}/registration" + ]; + boot.postBootCommands = '' + if [[ "$(cat /proc/cmdline)" =~ regInfo=([^ ]*) ]]; then + ${config.nix.package.out}/bin/nix-store --load-db < ''${BASH_REMATCH[1]} + fi + ''; + }) + ]; +} diff --git a/example/microvm/system.nix b/example/microvm/system.nix new file mode 100755 index 0000000..33dba94 --- /dev/null +++ b/example/microvm/system.nix @@ -0,0 +1,82 @@ +{ + pkgs, + lib, + config, + ... +}: + +{ + config = lib.mkIf config.microvm.guest.enable { + assertions = [ + { + assertion = + (config.microvm.writableStoreOverlay != null) + -> (!config.nix.optimise.automatic && !config.nix.settings.auto-optimise-store); + message = '' + `nix.optimise.automatic` and `nix.settings.auto-optimise-store` do not work with `microvm.writableStoreOverlay`. 
+ ''; + } + ]; + + boot.loader.grub.enable = false; + # boot.initrd.systemd.enable = lib.mkDefault true; + boot.initrd.kernelModules = [ + "virtio_mmio" + "virtio_pci" + "virtio_blk" + "9pnet_virtio" + "9p" + "virtiofs" + ] + ++ + lib.optionals + (pkgs.stdenv.targetPlatform.system == "x86_64-linux" && config.microvm.hypervisor == "firecracker") + [ + # Keyboard controller that can receive CtrlAltDel + "i8042" + ] + ++ lib.optionals (config.microvm.writableStoreOverlay != null) [ + "overlay" + ]; + + microvm.kernelParams = + let + # When a store disk is used, we can drop references to the packed contents as the squashfs/erofs contains all paths. + toplevel = + if config.microvm.storeOnDisk then + builtins.unsafeDiscardStringContext config.system.build.toplevel + else + config.system.build.toplevel; + in + config.boot.kernelParams + ++ [ + "init=${toplevel}/init" + ]; + + # modules that consume boot time but have rare use-cases + boot.blacklistedKernelModules = [ + "rfkill" + "intel_pstate" + ] + ++ lib.optional (!config.microvm.graphics.enable) "drm"; + + systemd = + let + # nix-daemon works only with a writable /nix/store + enableNixDaemon = config.microvm.writableStoreOverlay != null; + in + { + services.nix-daemon.enable = lib.mkDefault enableNixDaemon; + sockets.nix-daemon.enable = lib.mkDefault enableNixDaemon; + + # consumes a lot of boot time + services.mount-pstore.enable = false; + + # just fails in the usual usage of microvm.nix + generators = { + systemd-gpt-auto-generator = "/dev/null"; + }; + }; + + }; +} diff --git a/example/microvm/virtiofsd/default.nix b/example/microvm/virtiofsd/default.nix new file mode 100755 index 0000000..966c3a9 --- /dev/null +++ b/example/microvm/virtiofsd/default.nix @@ -0,0 +1,93 @@ +{ + config, + lib, + pkgs, + ... +}: + +let + virtiofsShares = builtins.filter ({ proto, ... 
}: proto == "virtiofs") config.microvm.shares; + + requiresVirtiofsd = virtiofsShares != [ ]; + + inherit (pkgs.python3Packages) supervisor; + supervisord = lib.getExe' supervisor "supervisord"; + supervisorctl = lib.getExe' supervisor "supervisorctl"; +in +{ + microvm.binScripts = lib.mkIf requiresVirtiofsd { + virtiofsd-run = + let + supervisordConfig = { + supervisord.nodaemon = true; + + "eventlistener:notify" = { + command = pkgs.writers.writePython3 "supervisord-event-handler" { } ( + pkgs.replaceVars ./supervisord-event-handler.py { + # 1 for the event handler process + virtiofsdCount = 1 + builtins.length virtiofsShares; + } + ); + events = "PROCESS_STATE"; + }; + } + // builtins.listToAttrs ( + map ( + { + tag, + socket, + source, + readOnly, + ... + }: + { + name = "program:virtiofsd-${tag}"; + value = { + stderr_syslog = true; + stdout_syslog = true; + autorestart = true; + command = pkgs.writeShellScript "virtiofsd-${tag}" '' + if [ $(id -u) = 0 ]; then + OPT_RLIMIT="--rlimit-nofile 1048576" + else + OPT_RLIMIT="" + fi + exec ${lib.getExe pkgs.virtiofsd} \ + --socket-path=${lib.escapeShellArg socket} \ + ${ + lib.optionalString ( + config.microvm.virtiofsd.group != null + ) "--socket-group=${config.microvm.virtiofsd.group}" + } \ + --shared-dir=${lib.escapeShellArg source} \ + $OPT_RLIMIT \ + --thread-pool-size ${toString config.microvm.virtiofsd.threadPoolSize} \ + --posix-acl --xattr \ + ${ + lib.optionalString ( + config.microvm.virtiofsd.inodeFileHandles != null + ) "--inode-file-handles=${config.microvm.virtiofsd.inodeFileHandles}" + } \ + ${lib.optionalString (config.microvm.hypervisor == "crosvm") "--tag=${tag}"} \ + ${lib.optionalString readOnly "--readonly"} \ + ${lib.concatStringsSep " " config.microvm.virtiofsd.extraArgs} + ''; + }; + } + ) virtiofsShares + ); + + supervisordConfigFile = pkgs.writeText "${config.networking.hostName}-virtiofsd-supervisord.conf" ( + lib.generators.toINI { } supervisordConfig + ); + + in + '' + exec 
${supervisord} --configuration ${supervisordConfigFile} + ''; + + virtiofsd-shutdown = '' + exec ${supervisorctl} stop + ''; + }; +} diff --git a/example/microvm/virtiofsd/supervisord-event-handler.py b/example/microvm/virtiofsd/supervisord-event-handler.py new file mode 100755 index 0000000..6eed4ed --- /dev/null +++ b/example/microvm/virtiofsd/supervisord-event-handler.py @@ -0,0 +1,44 @@ +import subprocess +import sys + + +def write_stdout(s): + # only eventlistener protocol messages may be sent to stdout + sys.stdout.write(s) + sys.stdout.flush() + + +def write_stderr(s): + sys.stderr.write(s) + sys.stderr.flush() + + +def main(): + count = 0 + expected_count = @virtiofsdCount@ + + while True: + write_stdout('READY\n') + line = sys.stdin.readline() + + # read event payload and print it to stderr + headers = dict([x.split(':') for x in line.split()]) + sys.stdin.read(int(headers['len'])) + # body = dict([x.split(':') for x in data.split()]) + + if headers["eventname"] == "PROCESS_STATE_RUNNING": + count += 1 + write_stderr("Process state running...\n") + + if headers["eventname"] == "PROCESS_STATE_STOPPING": + count -= 1 + write_stderr("Process state stopping...\n") + + if count >= expected_count: + subprocess.run(["systemd-notify", "--ready"]) + + write_stdout('RESULT 2\nOK') + + +if __name__ == '__main__': + main() diff --git a/modules/config/default.nix b/modules/config/default.nix index 4311455..d51d29e 100755 --- a/modules/config/default.nix +++ b/modules/config/default.nix @@ -244,7 +244,7 @@ in ceresStorageDriveName = "NAS1"; erisStorageDriveName = "NAS2"; - ceresIP = "192.168.50.250"; + ceresIP = "192.168.50.240"; erisIP = "192.168.50.245"; deimosIP = "192.168.50.176"; marsIP = "192.168.50.218"; diff --git a/modules/config/instances/config/vaultwarden.nix b/modules/config/instances/config/vaultwarden.nix index 7f2a261..c530caf 100755 --- a/modules/config/instances/config/vaultwarden.nix +++ b/modules/config/instances/config/vaultwarden.nix @@ -37,7 +37,7 
@@ in interface = { id = "vm-${name}"; mac = "02:00:00:00:00:51"; - idUser = "vm-${name}"; + idUser = "vmuser-vault"; macUser = "02:00:00:00:00:03"; ip = "192.168.50.51"; gate = "192.168.50.1";