Commit f775b13

Rik van Riel authored and rkrcmar committed
x86,kvm: move qemu/guest FPU switching out to vcpu_run
Currently, every time a VCPU is scheduled out, the host kernel will first save the guest FPU/xstate context, then load the qemu userspace FPU context, only to then immediately save the qemu userspace FPU context back to memory. When scheduling in a VCPU, the same extraneous FPU loads and saves are done.

This could be avoided by moving from a model where the guest FPU is loaded and stored with preemption disabled, to a model where the qemu userspace FPU is swapped out for the guest FPU context for the duration of the KVM_RUN ioctl. This is done under the VCPU mutex, which is also taken when other tasks inspect the VCPU FPU context, so the code should already be safe for this change. That should come as no surprise, given that s390 already has this optimization.

This can fix a bug where KVM calls get_user_pages while owning the FPU, and the file system ends up requesting the FPU again:

[258270.527947]  __warn+0xcb/0xf0
[258270.527948]  warn_slowpath_null+0x1d/0x20
[258270.527951]  kernel_fpu_disable+0x3f/0x50
[258270.527953]  __kernel_fpu_begin+0x49/0x100
[258270.527955]  kernel_fpu_begin+0xe/0x10
[258270.527958]  crc32c_pcl_intel_update+0x84/0xb0
[258270.527961]  crypto_shash_update+0x3f/0x110
[258270.527968]  crc32c+0x63/0x8a [libcrc32c]
[258270.527975]  dm_bm_checksum+0x1b/0x20 [dm_persistent_data]
[258270.527978]  node_prepare_for_write+0x44/0x70 [dm_persistent_data]
[258270.527985]  dm_block_manager_write_callback+0x41/0x50 [dm_persistent_data]
[258270.527988]  submit_io+0x170/0x1b0 [dm_bufio]
[258270.527992]  __write_dirty_buffer+0x89/0x90 [dm_bufio]
[258270.527994]  __make_buffer_clean+0x4f/0x80 [dm_bufio]
[258270.527996]  __try_evict_buffer+0x42/0x60 [dm_bufio]
[258270.527998]  dm_bufio_shrink_scan+0xc0/0x130 [dm_bufio]
[258270.528002]  shrink_slab.part.40+0x1f5/0x420
[258270.528004]  shrink_node+0x22c/0x320
[258270.528006]  do_try_to_free_pages+0xf5/0x330
[258270.528008]  try_to_free_pages+0xe9/0x190
[258270.528009]  __alloc_pages_slowpath+0x40f/0xba0
[258270.528011]  __alloc_pages_nodemask+0x209/0x260
[258270.528014]  alloc_pages_vma+0x1f1/0x250
[258270.528017]  do_huge_pmd_anonymous_page+0x123/0x660
[258270.528021]  handle_mm_fault+0xfd3/0x1330
[258270.528025]  __get_user_pages+0x113/0x640
[258270.528027]  get_user_pages+0x4f/0x60
[258270.528063]  __gfn_to_pfn_memslot+0x120/0x3f0 [kvm]
[258270.528108]  try_async_pf+0x66/0x230 [kvm]
[258270.528135]  tdp_page_fault+0x130/0x280 [kvm]
[258270.528149]  kvm_mmu_page_fault+0x60/0x120 [kvm]
[258270.528158]  handle_ept_violation+0x91/0x170 [kvm_intel]
[258270.528162]  vmx_handle_exit+0x1ca/0x1400 [kvm_intel]

No performance changes were detected in quick ping-pong tests on my 4-socket system, which is expected, since an FPU+xstate load is on the order of 0.1us, while ping-ponging between CPUs is on the order of 20us, and somewhat noisy.

Cc: [email protected]
Signed-off-by: Rik van Riel <[email protected]>
Suggested-by: Christian Borntraeger <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
[Fixed a bug where reset_vcpu called put_fpu without a preceding load_fpu, which happened inside the KVM_CREATE_VCPU ioctl. - Radim]
Signed-off-by: Radim Krčmář <[email protected]>
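For orientation, the sketch below summarizes the new model described above. It is not part of this commit and is not kernel code; the names kvm_load_guest_fpu, kvm_put_guest_fpu, and vcpu_run are taken from the diff that follows, while the wrapper run_ioctl_sketch is hypothetical.

/*
 * Minimal sketch of the new flow: the user/guest FPU swap now brackets the
 * whole KVM_RUN ioctl instead of happening on every preemption of the VCPU
 * thread. Only run_ioctl_sketch is invented; the called functions appear in
 * the diff below.
 */
static int run_ioctl_sketch(struct kvm_vcpu *vcpu)
{
	int r;

	kvm_load_guest_fpu(vcpu);	/* save qemu user_fpu, load guest_fpu */
	r = vcpu_run(vcpu);		/* scheduling out no longer swaps FPU state */
	kvm_put_guest_fpu(vcpu);	/* save guest_fpu, restore user_fpu */

	return r;
}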
1 parent 609b700 commit f775b13

File tree

3 files changed: +31 -23 lines

arch/x86/include/asm/kvm_host.h

Lines changed: 13 additions & 0 deletions

@@ -536,7 +536,20 @@ struct kvm_vcpu_arch {
 	struct kvm_mmu_memory_cache mmu_page_cache;
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
 
+	/*
+	 * QEMU userspace and the guest each have their own FPU state.
+	 * In vcpu_run, we switch between the user and guest FPU contexts.
+	 * While running a VCPU, the VCPU thread will have the guest FPU
+	 * context.
+	 *
+	 * Note that while the PKRU state lives inside the fpu registers,
+	 * it is switched out separately at VMENTER and VMEXIT time. The
+	 * "guest_fpu" state here contains the guest FPU context, with the
+	 * host PRKU bits.
+	 */
+	struct fpu user_fpu;
 	struct fpu guest_fpu;
+
 	u64 xcr0;
 	u64 guest_supported_xcr0;
 	u32 guest_xstate_size;
arch/x86/kvm/x86.c

Lines changed: 17 additions & 22 deletions
@@ -2937,7 +2937,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 	pagefault_enable();
 	kvm_x86_ops->vcpu_put(vcpu);
-	kvm_put_guest_fpu(vcpu);
 	vcpu->arch.last_host_tsc = rdtsc();
 }
 
@@ -5254,13 +5253,10 @@ static void emulator_halt(struct x86_emulate_ctxt *ctxt)
 
 static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt)
 {
-	preempt_disable();
-	kvm_load_guest_fpu(emul_to_vcpu(ctxt));
 }
 
 static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt)
 {
-	preempt_enable();
 }
 
 static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
@@ -6952,7 +6948,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	preempt_disable();
 
 	kvm_x86_ops->prepare_guest_switch(vcpu);
-	kvm_load_guest_fpu(vcpu);
 
 	/*
 	 * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
@@ -7297,12 +7292,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		}
 	}
 
+	kvm_load_guest_fpu(vcpu);
+
 	if (unlikely(vcpu->arch.complete_userspace_io)) {
 		int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
 		vcpu->arch.complete_userspace_io = NULL;
 		r = cui(vcpu);
 		if (r <= 0)
-			goto out;
+			goto out_fpu;
 	} else
 		WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
 
@@ -7311,6 +7308,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	else
 		r = vcpu_run(vcpu);
 
+out_fpu:
+	kvm_put_guest_fpu(vcpu);
 out:
 	post_kvm_run_save(vcpu);
 	kvm_sigset_deactivate(vcpu);
@@ -7704,32 +7703,25 @@ static void fx_init(struct kvm_vcpu *vcpu)
 	vcpu->arch.cr0 |= X86_CR0_ET;
 }
 
+/* Swap (qemu) user FPU context for the guest FPU context. */
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 {
-	if (vcpu->guest_fpu_loaded)
-		return;
-
-	/*
-	 * Restore all possible states in the guest,
-	 * and assume host would use all available bits.
-	 * Guest xcr0 would be loaded later.
-	 */
-	vcpu->guest_fpu_loaded = 1;
-	__kernel_fpu_begin();
+	preempt_disable();
+	copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);
 	/* PKRU is separately restored in kvm_x86_ops->run. */
 	__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,
 				~XFEATURE_MASK_PKRU);
+	preempt_enable();
 	trace_kvm_fpu(1);
 }
 
+/* When vcpu_run ends, restore user space FPU context. */
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
-	if (!vcpu->guest_fpu_loaded)
-		return;
-
-	vcpu->guest_fpu_loaded = 0;
+	preempt_disable();
 	copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
-	__kernel_fpu_end();
+	copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state);
+	preempt_enable();
 	++vcpu->stat.fpu_reload;
 	trace_kvm_fpu(0);
 }
@@ -7846,7 +7838,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 		 * To avoid have the INIT path from kvm_apic_has_events() that be
 		 * called with loaded FPU and does not let userspace fix the state.
 		 */
-		kvm_put_guest_fpu(vcpu);
+		if (init_event)
+			kvm_put_guest_fpu(vcpu);
 		mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
 				XFEATURE_MASK_BNDREGS);
 		if (mpx_state_buffer)
@@ -7855,6 +7848,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 				XFEATURE_MASK_BNDCSR);
 		if (mpx_state_buffer)
 			memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
+		if (init_event)
+			kvm_load_guest_fpu(vcpu);
 	}
 
 	if (!init_event) {

include/linux/kvm_host.h

Lines changed: 1 addition & 1 deletion
@@ -232,7 +232,7 @@ struct kvm_vcpu {
 	struct mutex mutex;
 	struct kvm_run *run;
 
-	int guest_fpu_loaded, guest_xcr0_loaded;
+	int guest_xcr0_loaded;
 	struct swait_queue_head wq;
 	struct pid __rcu *pid;
 	int sigset_active;
