Commit be50b206 authored by Guang Zeng's avatar Guang Zeng Committed by Paolo Bonzini
Browse files

kvm: x86: Add support for getting/setting expanded xstate buffer

With KVM_CAP_XSAVE, userspace uses a hardcoded 4KB buffer to get/set
xstate data from/to KVM. This doesn't work when dynamic xfeatures
(e.g. AMX) are exposed to the guest as they require a larger buffer
size.

Introduce a new capability (KVM_CAP_XSAVE2). Userspace VMM gets the
required xstate buffer size via KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2).
KVM_SET_XSAVE is extended to work with both legacy and new capabilities
by doing properly-sized memdup_user() based on the guest fpu container.
KVM_GET_XSAVE is kept for backward-compatible reason. Instead,
KVM_GET_XSAVE2 is introduced under KVM_CAP_XSAVE2 as the preferred
interface for getting xstate buffer (4KB or larger size) from KVM
(Link: https://lkml.org/lkml/2021/12/15/510

)

Also, update the api doc with the new KVM_GET_XSAVE2 ioctl.

Signed-off-by: default avatarGuang Zeng <guang.zeng@intel.com>
Signed-off-by: default avatarWei Wang <wei.w.wang@intel.com>
Signed-off-by: default avatarJing Liu <jing2.liu@intel.com>
Signed-off-by: default avatarKevin Tian <kevin.tian@intel.com>
Signed-off-by: default avatarYang Zhong <yang.zhong@intel.com>
Message-Id: <20220105123532.12586-19-yang.zhong@intel.com>
Signed-off-by: default avatarPaolo Bonzini <pbonzini@redhat.com>
parent c60427dd
Loading
Loading
Loading
Loading
+40 −2
Original line number Diff line number Diff line
@@ -1569,6 +1569,7 @@ otherwise it will return EBUSY error.

  struct kvm_xsave {
	__u32 region[1024];
	__u32 extra[0];
  };

This ioctl would copy current vcpu's xsave struct to the userspace.
@@ -1577,7 +1578,7 @@ This ioctl would copy current vcpu's xsave struct to the userspace.
4.43 KVM_SET_XSAVE
------------------

:Capability: KVM_CAP_XSAVE
:Capability: KVM_CAP_XSAVE and KVM_CAP_XSAVE2
:Architectures: x86
:Type: vcpu ioctl
:Parameters: struct kvm_xsave (in)
@@ -1588,9 +1589,18 @@ This ioctl would copy current vcpu's xsave struct to the userspace.

  struct kvm_xsave {
	__u32 region[1024];
	__u32 extra[0];
  };

This ioctl would copy userspace's xsave struct to the kernel.
This ioctl would copy userspace's xsave struct to the kernel. It copies
as many bytes as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2),
when invoked on the vm file descriptor. The size value returned by
KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) will always be at least 4096.
Currently, it is only greater than 4096 if a dynamic feature has been
enabled with ``arch_prctl()``, but this may change in the future.

The offsets of the state save areas in struct kvm_xsave follow the
contents of CPUID leaf 0xD on the host.


4.44 KVM_GET_XCRS
@@ -5535,6 +5545,34 @@ the trailing ``'\0'``, is indicated by ``name_size`` in the header.
The Stats Data block contains an array of 64-bit values in the same order
as the descriptors in Descriptors block.

4.42 KVM_GET_XSAVE2
------------------

:Capability: KVM_CAP_XSAVE2
:Architectures: x86
:Type: vcpu ioctl
:Parameters: struct kvm_xsave (out)
:Returns: 0 on success, -1 on error


::

  struct kvm_xsave {
	__u32 region[1024];
	__u32 extra[0];
  };

This ioctl would copy current vcpu's xsave struct to the userspace. It
copies as many bytes as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)
when invoked on the vm file descriptor. The size value returned by
KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) will always be at least 4096.
Currently, it is only greater than 4096 if a dynamic feature has been
enabled with ``arch_prctl()``, but this may change in the future.

The offsets of the state save areas in struct kvm_xsave follow the contents
of CPUID leaf 0xD on the host.


5. The kvm_run structure
========================

+15 −1
Original line number Diff line number Diff line
@@ -373,9 +373,23 @@ struct kvm_debugregs {
	__u64 reserved[9];
};

/* for KVM_CAP_XSAVE */
/* for KVM_CAP_XSAVE and KVM_CAP_XSAVE2 */
struct kvm_xsave {
	/*
	 * KVM_GET_XSAVE2 and KVM_SET_XSAVE write and read as many bytes
	 * as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)
	 * respectively, when invoked on the vm file descriptor.
	 *
	 * The size value returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)
	 * will always be at least 4096. Currently, it is only greater
	 * than 4096 if a dynamic feature has been enabled with
	 * ``arch_prctl()``, but this may change in the future.
	 *
	 * The offsets of the state save areas in struct kvm_xsave follow
	 * the contents of CPUID leaf 0xD on the host.
	 */
	__u32 region[1024];
	__u32 extra[0];
};

#define KVM_MAX_XCRS	16
+1 −1
Original line number Diff line number Diff line
@@ -32,7 +32,7 @@
u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
EXPORT_SYMBOL_GPL(kvm_cpu_caps);

static u32 xstate_required_size(u64 xstate_bv, bool compacted)
u32 xstate_required_size(u64 xstate_bv, bool compacted)
{
	int feature_bit = 0;
	u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
+2 −0
Original line number Diff line number Diff line
@@ -30,6 +30,8 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
	       u32 *ecx, u32 *edx, bool exact_only);

u32 xstate_required_size(u64 xstate_bv, bool compacted);

int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu);

+44 −1
Original line number Diff line number Diff line
@@ -4314,6 +4314,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
		else
			r = 0;
		break;
	case KVM_CAP_XSAVE2: {
		u64 guest_perm = xstate_get_guest_group_perm();

		r = xstate_required_size(supported_xcr0 & guest_perm, false);
		if (r < sizeof(struct kvm_xsave))
			r = sizeof(struct kvm_xsave);
		break;
	}
	default:
		break;
	}
@@ -4917,6 +4925,16 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
				       vcpu->arch.pkru);
}

static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
					  u8 *state, unsigned int size)
{
	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
		return;

	fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu,
				       state, size, vcpu->arch.pkru);
}

static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
					struct kvm_xsave *guest_xsave)
{
@@ -5370,6 +5388,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
		break;
	}
	case KVM_GET_XSAVE: {
		r = -EINVAL;
		if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave))
			break;

		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
		r = -ENOMEM;
		if (!u.xsave)
@@ -5384,7 +5406,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
		break;
	}
	case KVM_SET_XSAVE: {
		u.xsave = memdup_user(argp, sizeof(*u.xsave));
		int size = vcpu->arch.guest_fpu.uabi_size;

		u.xsave = memdup_user(argp, size);
		if (IS_ERR(u.xsave)) {
			r = PTR_ERR(u.xsave);
			goto out_nofree;
@@ -5393,6 +5417,25 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
		break;
	}

	case KVM_GET_XSAVE2: {
		int size = vcpu->arch.guest_fpu.uabi_size;

		u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT);
		r = -ENOMEM;
		if (!u.xsave)
			break;

		kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);

		r = -EFAULT;
		if (copy_to_user(argp, u.xsave, size))
			break;

		r = 0;
		break;
	}

	case KVM_GET_XCRS: {
		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
		r = -ENOMEM;
Loading