#!/bin/sh
# Copyright 2022 Helmut Grohne <helmut@subdivi.de>
# SPDX-License-Identifier: MIT

: <<'POD2MAN'
=head1 NAME

debvm-run - Run a VM image created by debvm-create

=head1 SYNOPSIS

B<debvm-run> [B<-g>] [B<-i> F<image>] [B<-s> I<sshport>] [B<--> I<qemu options>]

=head1 DESCRIPTION

B<debvm-run> is essentially a thin wrapper around B<qemu> for running a virtual machine image created by B<debvm-create> or something compatible.
The virtual machine image is expected to be a raw ext4 image with a non-empty file system label.
The architecture of the machine is detected from the contained F</bin/true>.
It must contain a symbolic link pointing to a kernel image at one of F<(|/boot)/vmlinu[xz]> and a symbolic link pointing to an initrd image at F<initrd.img> in the same directory as the kernel image.
Both are extracted and passed to B<qemu>.
A net interface configured for user mode is added automatically.

=head1 OPTIONS

=over 8

=item B<--append>=I<cmdline>

While the kernel command line can be modified by passing B<-append> to B<qemu> directly, doing that always replaces the entire command line and thus removes important values passed by B<debvm-run>.
This variant instead appends given command line arguments to the automatic ones.
Repeated use also causes appending rather than replacement.

=item B<-g>, B<--graphical>

By default, the option B<-nographic> is passed to B<qemu> and one interacts with the serial console of the machine.
This configuration is skipped in the presence of this option.
Note that B<debvm-create> defaults to installing a cloud kernel if available, so you may have to pass C<--include=linux-image-generic> during image construction to get graphics drivers.

=item B<-i> F<image>, B<--image>=F<image>

This option specifies the location of the virtual machine image file.
By default F<rootfs.ext4> in the working directory is used.

=item B<--netopt>=I<option>

B<debvm-run> sets up a user mode network by default.
It therefore passes a B<-netdev> option to B<qemu>.
Using this option, you can customize the value of that B<-netdev> option.
For instance, you can set up additional port forwards by passing e.g. C<--netopt hostfwd=:127.0.0.1:8080-:80>.
It can be used multiple times.

=item B<--skip>=I<task>

Skip a particular task or feature.
The option may be specified multiple times or list multiple tasks to be skipped by separating them with a comma.
By default, no tasks are skipped.
The following tasks may be skipped.

=over 4

=item B<network>

Do not configure a network card.

=item B<rngdev>

Do not pass a random number generator device.

=item B<root>

Skip all of the following tasks matching C<root/*>.
If either of these is present, the VM will not boot unless a suitable replacement is added in another way.

=item B<root/cmd>

Since B<debvm-run> uses B<qemu> as bootloader it normally passes the label of the root block device via the kernel command line.
This passing can be inhibited to supply a different location.

=item B<root/dev>

A block device for the root filesystem is no longer passed.
This can be used to customize the block device.

=back

=item B<-s> I<sshport>, B<--sshport>=I<sshport>

If given, B<qemu> is configured to pass connections to I<127.0.0.1:sshport> to port 22 of the virtual machine.
You can connect to your virtual machine without updating your known hosts like this:

    ssh -o NoHostAuthenticationForLocalhost=yes -p $sshport root@127.0.0.1

The option is a shorthand for C<--netopt hostfwd=tcp:127.0.0.1:sshport-:22>.

=item B<--> I<qemu options>

All options beyond a double dash are passed to B<qemu>.
This can be used to configure additional hardware components.
One possible use of this method is passing B<-snapshot> to avoid modifying the virtual machine image.

=back

=head1 EXAMPLES

Run a virtual machine stored in the image F<rootfs.ext4> (the default) with
local port 8022 routed to port 22 of the virtual machine. The B<-snapshot>
argument is passed to QEMU and prevents any permanent changes to
F<rootfs.ext4>, resulting in an ephemeral run.

    debvm-run -s 8022 -i rootfs.ext4 -- -snapshot

=head1 FAQ

=over 8

=item The debvm-run console renders wrong.

Make sure C<$TERM> is set to a value known inside the VM.
You may need to install B<ncurses-term> for more definitions.
The serial console will miss events of resizing the terminal emulator.
You may run C<setterm --resize> in that case.

=item How can I kill debvm-run?

The wrapped B<qemu> can be terminated by pressing Ctrl-a x.
Refer to the B<qemu> manual page for more escape sequences.

=back

=head1 LIMITATIONS

Due to the way kernel and initrd are being extracted before running B<qemu>, one cannot upgrade a kernel and then just reboot.
Attempting to do so will still use the old kernel.
Instead, B<qemu> must be terminated and B<debvm-run> should be launched again to pick up the new kernel.
In order to avoid accidental reboots, one may pass B<-no-reboot> to B<qemu>.

=head1 SEE ALSO

    debvm-create(1) qemu(1)

=cut
POD2MAN

set -u

IMAGE=rootfs.ext4
GRAPHICAL=
CMDLINE_APPEND=
NETOPTS=
SKIP=,
SSHPORT=

# Print one of the given arguments, selected by a 1-based index.
#   $1     index N
#   $2...  candidate arguments
# Output: the N-th candidate on stdout, without a trailing newline.
nth_arg() {
	# Dropping N words discards the index itself and the N-1 candidates in
	# front of the requested one, which thereby becomes $1.
	shift "$1"
	printf '%s' "$1"
}

# Print a message on stderr and terminate the script with exit status 1.
#   $*  words of the message, joined by single spaces
die() {
	# printf is used instead of echo because echo treats some arguments
	# specially: a lone "-n" is swallowed and some /bin/sh implementations
	# expand backslash escapes, which would garble messages that embed
	# user-supplied text such as the image file name.
	printf '%s\n' "$*" 1>&2
	exit 1
}
# Run a command the way "set -ex" would: trace it on stderr first and abort
# the whole script via die when it fails.
#   $@  command and its arguments
# Globals: with_set_ex_ret receives the exit status of the command.
with_set_ex() {
	echo "+ $*" 1>&2
	if "$@"; then
		with_set_ex_ret=0
	else
		with_set_ex_ret=$?
		die "failed with exit code $with_set_ex_ret"
	fi
}
# Emit the one-line synopsis through die, i.e. on stderr followed by exit 1.
usage() {
	die "usage: ${0} [-g] [-i image] [-s sshport] [-- qemu options]"
}
# Report a command line mistake on stderr, then hand over to usage.
#   $*  description of the mistake
usage_error() {
	echo "error: $*" >&2
	usage
}

# Handler for --append: accumulate extra kernel command line text.
#   $1  text to add
# Globals: CMDLINE_APPEND grows by $1; a single space separates it from any
# earlier content.
opt_append() {
	if test -n "${CMDLINE_APPEND:-}"; then
		CMDLINE_APPEND="$CMDLINE_APPEND $1"
	else
		CMDLINE_APPEND="$1"
	fi
}
# Handler for -g / --graphical: record that a graphical setup was requested.
# Globals: GRAPHICAL is set to a non-empty value.
opt_graphical() {
	GRAPHICAL="1"
}
# Handler for -i / --image: choose the image file to run.
#   $1  path of the image
# Globals: IMAGE is replaced by $1.
opt_image() {
	IMAGE="${1}"
}
# Handler for --netopt: add one option to the user mode -netdev value.
#   $1  option text, e.g. hostfwd=...
# Globals: NETOPTS grows by a comma followed by $1, so a non-empty NETOPTS
# always starts with a comma.
opt_netopt() {
	NETOPTS="${NETOPTS},${1}"
}
# Handler for --skip: remember tasks that shall be skipped.
#   $1  one task name or several separated by commas
# Globals: SKIP grows by $1 plus a trailing comma, which keeps every task
# name enclosed in commas for the lookup in check_skip.
opt_skip() {
	SKIP="${SKIP}${1},"
}
# Handler for -s / --sshport: remember the host port to forward to guest
# port 22.
#   $1  port number (stored as given, not validated here)
# Globals: SSHPORT is replaced by $1.
opt_sshport() {
	SSHPORT="${1}"
}

# Parse the command line.
# The leading ":" in the option string selects silent error reporting, which
# is why missing arguments and unknown options are handled by the ":" and "?"
# branches below.  The trailing "-:" declares "-" as an option that takes an
# argument, so a long option "--name" shows up as OPTCHAR "-" with OPTARG
# "name".
while getopts :gi:s:-: OPTCHAR; do
	case "$OPTCHAR" in
		g)	opt_graphical		;;
		i)	opt_image "$OPTARG"	;;
		s)	opt_sshport "$OPTARG"	;;
		-)
			# Long options are dispatched to the opt_<name> function.
			case "$OPTARG" in
				help)
					usage
				;;
				graphical)
					"opt_$OPTARG"
				;;
				append|image|netopt|skip|sshport)
					# "--name value": the value is the next
					# positional argument, which getopts has not
					# looked at yet.  Fetch it by index and step
					# OPTIND past it manually.
					test "$OPTIND" -gt "$#" && usage_error "missing argument for --$OPTARG"
					"opt_$OPTARG" "$(nth_arg "$OPTIND" "$@")"
					OPTIND=$((OPTIND+1))
				;;
				append=*|image=*|netopt=*|skip=*|sshport=*)
					# "--name=value": split at the first "=".
					"opt_${OPTARG%%=*}" "${OPTARG#*=}"
				;;
				*)
					usage_error "unrecognized option --$OPTARG"
				;;
			esac
		;;
		:)
			usage_error "missing argument for -$OPTARG"
		;;
		'?')
			usage_error "unrecognized option -$OPTARG"
		;;
		*)
			die "internal error while parsing command options, please report a bug"
		;;
	esac
done
# Drop everything that was parsed; the remaining arguments are later handed to
# qemu.
shift "$((OPTIND - 1))"

# -s/--sshport is a shorthand for a host forward to guest port 22.
test -z "$SSHPORT" || opt_netopt "hostfwd=tcp:127.0.0.1:$SSHPORT-:22"

# The image has to be an existing, non-empty regular file.
if ! test -f "$IMAGE"; then
	die "image '$IMAGE' not found"
fi
if ! test -s "$IMAGE"; then
	die "image '$IMAGE' is empty"
fi

# Compare the two bytes at offset 1080 of the image against the expected
# magic number (octal 123 357) supplied on stdin.
printf '\123\357' | cmp --bytes=2 "$IMAGE" - 1080 ||
	die "image '$IMAGE' is not in ext4 format"

# Tell whether a task was requested to be skipped.
#   $1  task name, optionally hierarchical such as root/cmd
# Globals: SKIP (read), a comma separated list that both starts and ends with
# a comma.
# Returns: 0 if the task or any of its ancestors (root/cmd -> root) is listed
# in SKIP, 1 otherwise.
check_skip() {
	while :; do
		case "$SKIP" in
			*",$1,"*)	return 0 ;;
		esac
		case "$1" in
			# Retry with the last path component removed.
			*/*)	set -- "${1%/*}" ;;
			# No ancestor left to try.
			*)	return 1 ;;
		esac
	done
}

# Remove the temporary kernel and initrd files that are still recorded.
# Installed as the handler for EXIT, INT, TERM and QUIT.
# Globals: KERNELTMP, INITRDTMP (read); an empty or unset value means there
# is nothing to remove for that file.
# Returns: 0
cleanup() {
	# Turn off command tracing for the remainder of the handler.
	set +x
	# The handler can fire before the variables have been assigned.  The
	# ":-" default keeps "set -u" from aborting it in that situation.
	if test -n "${KERNELTMP:-}"; then
		rm -f "$KERNELTMP"
	fi
	if test -n "${INITRDTMP:-}"; then
		rm -f "$INITRDTMP"
	fi
}

# Give both variables a defined (empty) value before the handler is armed, so
# that a handler run before or during the mktemp calls does not trip over an
# unset variable under "set -u".
KERNELTMP=
INITRDTMP=

trap cleanup EXIT INT TERM QUIT

# Scratch files for data extracted from the image.  Stop right away when one
# cannot be created instead of failing later on a redirection to an empty
# file name.
KERNELTMP=$(mktemp) || die "failed to create a temporary file"
INITRDTMP=$(mktemp) || die "failed to create a temporary file"

# Determine the Debian architecture of the VM userland.  The host architecture
# is the fallback; when the elf-arch helper is installed, /bin/true is copied
# out of the image and inspected instead.
ARCHITECTURE=$(dpkg --print-architecture)
VMARCH=$ARCHITECTURE
if command -v elf-arch >/dev/null 2>&1; then
	# KERNELTMP merely serves as scratch space here; it is overwritten
	# with the kernel image further down.
	/sbin/debugfs "$IMAGE" -R "cat /bin/true" > "$KERNELTMP"
	VMARCH=$(elf-arch "$KERNELTMP")
	echo "Detected VM architecture as $VMARCH" 1>&2
else
	echo "Assuming VM architecture as $VMARCH" 1>&2
fi

# Probe the candidate kernel symlinks in order and stop at the first one for
# which the debugfs stat output carries a "Fast link dest" line.  The sed
# program reduces that line to the link target and deletes all other lines.
for KERNELLINK in vmlinuz vmlinux boot/vmlinuz boot/vmlinux; do
	KERNELNAME=$(/sbin/debugfs "$IMAGE" -R "stat $KERNELLINK" | sed 's/Fast link dest: "\(.*\)"/\1/;t;d')
	test -n "$KERNELNAME" && break
done
# BOOTDIR is the directory part of the symlink that matched (with a trailing
# slash), or empty when the symlink sits in the root directory.
if test "${KERNELLINK%/*}" = "$KERNELLINK"; then
	BOOTDIR=
else
	BOOTDIR="${KERNELLINK%/*}/"
fi
test -n "$KERNELNAME" || die "failed to discover kernel image"
# A link target without a leading slash is relative to the directory holding
# the symlink.
test "${KERNELNAME#/}" = "$KERNELNAME" && KERNELNAME="$BOOTDIR$KERNELNAME"

# The initrd symlink is looked up next to the kernel symlink and resolved the
# same way.
INITRDNAME=$(/sbin/debugfs "$IMAGE" -R "stat ${BOOTDIR}initrd.img" | sed 's/Fast link dest: "\(.*\)"/\1/;t;d')
test -n "$INITRDNAME" || die "failed to discover initrd image"
test "${INITRDNAME#/}" = "$INITRDNAME" && INITRDNAME="$BOOTDIR$INITRDNAME"

# Copy both files out of the image.  The redirection applies to the stdout of
# the wrapped debugfs call; the trace printed by with_set_ex goes to stderr.
with_set_ex /sbin/debugfs "$IMAGE" -R "cat $KERNELNAME" > "$KERNELTMP"
with_set_ex /sbin/debugfs "$IMAGE" -R "cat $INITRDNAME" > "$INITRDTMP"

# Guess the kernel architecture.
# It starts out as the userland architecture and is only changed when file(1)
# is available and its description of the extracted kernel matches one of the
# userland/kernel combinations listed below.
KERNELARCH=$VMARCH
if command -v file >/dev/null 2>&1; then
	case "$VMARCH:$(file -b "$KERNELTMP")" in
		# 32bit ARM userland combined with a 64bit ARM kernel.
		"arm:Linux kernel ARM64 boot executable Image"*) KERNELARCH=arm64 ;;
		"armel:Linux kernel ARM64 boot executable Image"*) KERNELARCH=arm64 ;;
		"armhf:Linux kernel ARM64 boot executable Image"*) KERNELARCH=arm64 ;;
		# The boot stub looks the same on i386 and amd64, so we
		# actually inspect the kernel version here, which happens to
		# include amd64 for Debian kernels.
		"i386:Linux kernel x86 boot executable bzImage, version "*"-amd64 "*) KERNELARCH=amd64 ;;
		# mipsel userland combined with a 64bit kernel.
		"mipsel:ELF 64-bit LSB executable,"*) KERNELARCH=mips64el ;;
	esac
fi

# The root filesystem is referenced by its label on the kernel command line,
# so the label has to be non-empty and free of spaces.
IMAGE_LABEL=$(/sbin/e2label "$IMAGE")
if test -z "$IMAGE_LABEL"; then
	die "debvm-run requires a non-empty filesystem label"
fi
case "$IMAGE_LABEL" in
	*" "*)
		die "debvm-run requires a filesystem label without spaces"
	;;
esac

# When root/cmd is skipped, no root= parameter is put on the command line.
if check_skip root/cmd; then
	KERNEL_CMDLINE=
else
	KERNEL_CMDLINE="root=LABEL=$IMAGE_LABEL rw"
fi

# Pick two file descriptor numbers, starting at 3, for which no entry exists
# below /proc/self/fd.
KERNELFD=3
while test -h "/proc/self/fd/$KERNELFD"; do
	KERNELFD=$((KERNELFD + 1))
done
INITRDFD=$((KERNELFD + 1))
while test -h "/proc/self/fd/$INITRDFD"; do
	INITRDFD=$((INITRDFD + 1))
done
# The descriptor number of a redirection has to be a literal, hence the eval:
# the number is expanded before eval runs, whereas the single-quoted
# "$KERNELTMP" / "$INITRDTMP" are expanded by eval itself.
eval exec "$KERNELFD<"'"$KERNELTMP"'
eval exec "$INITRDFD<"'"$INITRDTMP"'
# qemu is pointed at /proc/self/fd/N for both files, so the temporary names
# can be unlinked right away.  Emptying the variables tells cleanup that
# nothing is left to remove.
rm -f "$KERNELTMP" "$INITRDTMP"
KERNELTMP=
INITRDTMP=

# Start assembling the qemu command line in the positional parameters.
# Generated options are always put in front, so the options the user passed
# after "--" stay at the end of the command line.
set -- \
	-no-user-config \
	-name "debvm-run $IMAGE" \
	-m 1G \
	-kernel "/proc/self/fd/$KERNELFD" \
	-initrd "/proc/self/fd/$INITRDFD" \
	"$@"

# If the image filename contains a comma, then that comma must be escaped by
# prefixing it with another comma or otherwise output filenames are able to
# inject drive options to qemu (and load the wrong file).
IMAGE_ESCAPED="$(printf "%s" "$IMAGE" | sed 's/,/,,/g')"

# Attach the image as the root block device unless root/dev is skipped.
if ! check_skip root/dev; then
	set -- \
		-drive "media=disk,format=raw,discard=unmap,file=$IMAGE_ESCAPED,if=virtio,cache=unsafe" \
		"$@"
fi

# Translate KERNELARCH (a Debian architecture) to a Debian CPU name.
# This utilizes the QEMU Debian package symlink mapping that ensures that
# calling qemu-system-${DEB_HOST_ARCH_CPU} will run the QEMU binary providing
# the correct emulator for that CPU.
KERNELARCHCPU="$(dpkg-architecture --force --host-arch "$KERNELARCH" --query DEB_HOST_ARCH_CPU)"
QEMU="qemu-system-$KERNELARCHCPU"
# Defaults that the per-CPU table below may override.  An empty CPU or MACHINE
# means the corresponding qemu option is not passed, an empty MAX_SMP means
# the number of virtual CPUs is not capped and an empty RNG_DEV means no
# random number generator device is added.
CPU=
MACHINE=
MAX_SMP=
NIC_DEV=virtio-net-pci,netdev=net0
RNG_DEV=virtio-rng-pci,rng=rng0

case "$KERNELARCHCPU" in
	amd64)
		MACHINE="type=q35"
	;;
	arm)
		CPU=max
		MACHINE="type=virt"
		MAX_SMP=8
	;;
	arm64)
		CPU=max,pauth-impdef=on
		MACHINE="type=virt,gic-version=max"
	;;
	m68k)
		MACHINE="type=virt"
		MAX_SMP=1
		# Uses the virtio "-device" variants rather than the "-pci"
		# ones for the network card and the random number generator.
		NIC_DEV=virtio-net-device,netdev=net0
		RNG_DEV=virtio-rng-device,rng=rng0
	;;
	mips64el)
		CPU=5KEc
		MAX_SMP=1
	;;
	mipsel)
		MAX_SMP=1
	;;
	powerpc)
		MAX_SMP=1
	;;
	riscv64)
		MACHINE="type=virt"
	;;
	sparc64)
		MAX_SMP=1
		RNG_DEV=
	;;
esac

# KVM is only considered when the kernel to be booted has the architecture of
# the host.
ENABLE_KVM=no
if test "$ARCHITECTURE" = "$KERNELARCH"; then
	ENABLE_KVM=yes
	case "$VMARCH:$KERNELARCH" in
		arm:arm64|armel:arm64|armhf:arm64)
			# 32bit userland below a 64bit kernel: probe with
			# linux32 whether the host supports that.
			if ! linux32 true >/dev/null 2>&1; then
				# This arm64 cannot run 32bit arm, so don't try KVM.
				ENABLE_KVM=no
			fi
		;;
	esac
fi
if test "$ENABLE_KVM" = yes; then
	if ! command -v "$QEMU" >/dev/null 2>&1; then
		# Fall back to kvm in case we badly guessed qemu.
		QEMU=kvm
	fi
	# Add the accelerator list to whatever machine options were chosen
	# for the architecture.
	MACHINE="${MACHINE:+$MACHINE,}accel=kvm:tcg"
	# While kvm will fall back gracefully, the following options can only
	# be passed when kvm really is available.
	if test -w /dev/kvm; then
		CPU=host
	fi
fi

# Turn the settings collected so far into qemu options; empty values result
# in no option at all.
if test -n "$MACHINE"; then
	set -- -machine "$MACHINE" "$@"
fi
if test -n "$CPU"; then
	set -- -cpu "$CPU" "$@"
fi
# Pass -smp only when more than one CPU is allowed and reported by nproc,
# limited to MAX_SMP when a cap is set.
if test -z "$MAX_SMP" || test "$MAX_SMP" -gt 1; then
	NPROC=$(nproc)
	if test "$NPROC" -gt 1; then
		test -n "$MAX_SMP" && test "$NPROC" -gt "$MAX_SMP" && NPROC=$MAX_SMP
		set -- -smp "$NPROC" "$@"
	fi
fi
# Random number generator device backed by /dev/urandom, unless the
# architecture has none configured or the rngdev task is skipped.
if test -n "$RNG_DEV" && ! check_skip rngdev; then
	set -- \
		-device "$RNG_DEV" \
		-object rng-random,filename=/dev/urandom,id=rng0 \
		"$@"
fi

# Without -g the VM is run with -nographic; with -g display and input devices
# are added instead.
if test -z "$GRAPHICAL"; then
	set -- -nographic "$@"
	case "$KERNELARCH" in
		amd64|i386)
			# Select the first serial port as kernel console.
			KERNEL_CMDLINE="${KERNEL_CMDLINE:+"$KERNEL_CMDLINE "}console=ttyS0"
		;;
	esac
	# Hand the terminal type to the VM via the kernel command line, but
	# only when both stdin and stdout are terminals and TERM is set.
	if test -t 0 && test -t 1 && test -n "${TERM:-}"; then
		KERNEL_CMDLINE="${KERNEL_CMDLINE:+"$KERNEL_CMDLINE "}TERM=$TERM"
	fi
else
	case "$KERNELARCH" in
		amd64|i386)
			set -- -vga virtio "$@"
		;;
		*)
			set -- \
				-device virtio-gpu-gl-pci \
				-display gtk,gl=on \
				"$@"
		;;
	esac
	# Input devices for the graphical session.
	set -- \
		-device virtio-keyboard-pci \
		-device virtio-tablet-pci \
		"$@"
fi

# Propagate the DNS domain name of the host, if it has one, as the domainname
# option of the user mode network.  It is put in front of the options given
# via --netopt/--sshport; every entry in NETOPTS carries its own leading
# comma.
DNSSEARCH=$(dnsdomainname)
if test -n "$DNSSEARCH"; then
	NETOPTS=",domainname=$DNSSEARCH$NETOPTS"
fi

# Text given via --append goes last on the kernel command line.  -append is
# only passed to qemu when there is anything to pass.
if test -n "$CMDLINE_APPEND"; then
	KERNEL_CMDLINE="${KERNEL_CMDLINE:+"$KERNEL_CMDLINE "}$CMDLINE_APPEND"
fi
if test -n "$KERNEL_CMDLINE"; then
	set -- -append "$KERNEL_CMDLINE" "$@"
fi

# User mode network backend plus the network card attached to it, unless the
# network task is skipped.
if ! check_skip network; then
	set -- -netdev "user,id=net0$NETOPTS" -device "$NIC_DEV" "$@"
fi

# Show the final command line on stderr, then replace this shell with qemu.
echo "+ $QEMU $*" 1>&2
exec "$QEMU" "$@"