Here are a couple of fixes for the nested SVM implementation. I collected these
fixes mostly while trying to get Windows 7 64-bit running as an L2 guest. The most
important fixes in this set make lazy FPU switching work with nested SVM and fix
the nested TPR handling. Without the latter fix the L1 guest freezes when
trying to run Win7 as an L2 guest. Please review and comment on these patches :-)
Joerg
Diffstat:
arch/x86/kvm/svm.c | 187 ++++++++++++++++++++++++++++++++++----------------
arch/x86/kvm/trace.h | 12 ++--
2 files changed, 133 insertions(+), 66 deletions(-)
Shortlog:
Joerg Roedel (10):
KVM: SVM: Don't use kmap_atomic in nested_svm_map
KVM: SVM: Fix wrong interrupt injection in enable_irq_window
KVM: SVM: Fix schedule-while-atomic on nested exception handling
KVM: SVM: Sync all control registers on nested vmexit
KVM: SVM: Annotate nested_svm_map with might_sleep()
KVM: SVM: Fix nested msr intercept handling
KVM: SVM: Don't sync nested cr8 to lapic and back
KVM: SVM: Activate nested state only when guest state is complete
KVM: SVM: Make lazy FPU switching work with nested svm
KVM: SVM: Remove newlines from nested trace points
--
Cc: sta...@kernel.org
Signed-off-by: Joerg Roedel <joerg....@amd.com>
---
arch/x86/kvm/svm.c | 20 +++++++++++++++++---
1 files changed, 17 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7c96b8b..25d26ec 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -128,6 +128,7 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu);
static void svm_complete_interrupts(struct vcpu_svm *svm);
static int nested_svm_exit_handled(struct vcpu_svm *svm);
+static int nested_svm_exit_handled_atomic(struct vcpu_svm *svm);
static int nested_svm_vmexit(struct vcpu_svm *svm);
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
@@ -1386,7 +1387,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
svm->vmcb->control.exit_info_1 = error_code;
svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
- return nested_svm_exit_handled(svm);
+ return nested_svm_exit_handled_atomic(svm);
}
/* This function returns true if it is safe to enable the irq window */
@@ -1520,7 +1521,7 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
/*
* If this function returns true, this #vmexit was already handled
*/
-static int nested_svm_exit_handled(struct vcpu_svm *svm)
+static int __nested_svm_exit_handled(struct vcpu_svm *svm, bool atomic)
{
u32 exit_code = svm->vmcb->control.exit_code;
int vmexit = NESTED_EXIT_HOST;
@@ -1567,12 +1568,25 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
}
if (vmexit == NESTED_EXIT_DONE) {
- nested_svm_vmexit(svm);
+ if (!atomic)
+ nested_svm_vmexit(svm);
+ else
+ svm->nested.exit_required = true;
}
return vmexit;
}
+static int nested_svm_exit_handled(struct vcpu_svm *svm)
+{
+ return __nested_svm_exit_handled(svm, false);
+}
+
+static int nested_svm_exit_handled_atomic(struct vcpu_svm *svm)
+{
+ return __nested_svm_exit_handled(svm, true);
+}
+
static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
{
struct vmcb_control_area *dst = &dst_vmcb->control;
--
1.6.6
Signed-off-by: Joerg Roedel <joerg....@amd.com>
---
arch/x86/kvm/svm.c | 43 +++++++++++++++++++++++++++++++++++++++----
1 files changed, 39 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a64b871..ad419aa 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -973,6 +973,7 @@ static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
static void update_cr0_intercept(struct vcpu_svm *svm)
{
+ struct vmcb *vmcb = svm->vmcb;
ulong gcr0 = svm->vcpu.arch.cr0;
u64 *hcr0 = &svm->vmcb->save.cr0;
@@ -984,11 +985,25 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
- svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
- svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
+ vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
+ vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
+ if (is_nested(svm)) {
+ struct vmcb *hsave = svm->nested.hsave;
+
+ hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
+ hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
+ vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read;
+ vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
+ }
} else {
svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
+ if (is_nested(svm)) {
+ struct vmcb *hsave = svm->nested.hsave;
+
+ hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
+ hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
+ }
}
}
@@ -1263,7 +1278,22 @@ static int ud_interception(struct vcpu_svm *svm)
static void svm_fpu_activate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
+ u32 excp;
+
+ if (is_nested(svm)) {
+ u32 h_excp, n_excp;
+
+ h_excp = svm->nested.hsave->control.intercept_exceptions;
+ n_excp = svm->nested.intercept_exceptions;
+ h_excp &= ~(1 << NM_VECTOR);
+ excp = h_excp | n_excp;
+ } else {
+ excp = svm->vmcb->control.intercept_exceptions;
+ excp &= ~(1 << NM_VECTOR);
+ }
+
+ svm->vmcb->control.intercept_exceptions = excp;
+
svm->vcpu.fpu_active = 1;
update_cr0_intercept(svm);
}
@@ -1507,6 +1537,9 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
if (!npt_enabled)
return NESTED_EXIT_HOST;
break;
+ case SVM_EXIT_EXCP_BASE + NM_VECTOR:
+ nm_interception(svm);
+ break;
default:
break;
}
@@ -2972,8 +3005,10 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- update_cr0_intercept(svm);
svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
+ if (is_nested(svm))
+ svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
+ update_cr0_intercept(svm);
}
static struct kvm_x86_ops svm_x86_ops = {
--
1.6.6
Signed-off-by: Joerg Roedel <joerg....@amd.com>
---
arch/x86/kvm/svm.c | 32 ++++++++++++++++----------------
1 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 52f78dd..041ef6f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1417,7 +1417,7 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
return 0;
}
-static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
+static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa)
{
struct page *page;
@@ -1425,7 +1425,7 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
if (is_error_page(page))
goto error;
- return kmap_atomic(page, idx);
+ return kmap(page);
error:
kvm_release_page_clean(page);
@@ -1434,7 +1434,7 @@ error:
return NULL;
}
-static void nested_svm_unmap(void *addr, enum km_type idx)
+static void nested_svm_unmap(void *addr)
{
struct page *page;
@@ -1443,7 +1443,7 @@ static void nested_svm_unmap(void *addr, enum km_type idx)
page = kmap_atomic_to_page(addr);
- kunmap_atomic(addr, idx);
+ kunmap(addr);
kvm_release_page_dirty(page);
}
@@ -1458,7 +1458,7 @@ static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
return false;
- msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0);
+ msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm);
if (!msrpm)
goto out;
@@ -1486,7 +1486,7 @@ static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
ret = msrpm[t1] & ((1 << param) << t0);
out:
- nested_svm_unmap(msrpm, KM_USER0);
+ nested_svm_unmap(msrpm);
return ret;
}
@@ -1616,7 +1616,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb->control.exit_int_info,
vmcb->control.exit_int_info_err);
- nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0);
+ nested_vmcb = nested_svm_map(svm, svm->nested.vmcb);
if (!nested_vmcb)
return 1;
@@ -1706,7 +1706,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
/* Exit nested SVM mode */
svm->nested.vmcb = 0;
- nested_svm_unmap(nested_vmcb, KM_USER0);
+ nested_svm_unmap(nested_vmcb);
kvm_mmu_reset_context(&svm->vcpu);
kvm_mmu_load(&svm->vcpu);
@@ -1719,7 +1719,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
u32 *nested_msrpm;
int i;
- nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0);
+ nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm);
if (!nested_msrpm)
return false;
@@ -1728,7 +1728,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
- nested_svm_unmap(nested_msrpm, KM_USER0);
+ nested_svm_unmap(nested_msrpm);
return true;
}
@@ -1739,7 +1739,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax);
if (!nested_vmcb)
return false;
@@ -1851,7 +1851,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
- nested_svm_unmap(nested_vmcb, KM_USER0);
+ nested_svm_unmap(nested_vmcb);
enable_gif(svm);
@@ -1884,12 +1884,12 @@ static int vmload_interception(struct vcpu_svm *svm)
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax);
if (!nested_vmcb)
return 1;
nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
- nested_svm_unmap(nested_vmcb, KM_USER0);
+ nested_svm_unmap(nested_vmcb);
return 1;
}
@@ -1904,12 +1904,12 @@ static int vmsave_interception(struct vcpu_svm *svm)
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax);
if (!nested_vmcb)
return 1;
nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
- nested_svm_unmap(nested_vmcb, KM_USER0);
+ nested_svm_unmap(nested_vmcb);
return 1;
}
--
1.6.6
Cc: sta...@kernel.org
Signed-off-by: Joerg Roedel <joerg....@amd.com>
---
arch/x86/kvm/svm.c | 17 ++++++++---------
1 files changed, 8 insertions(+), 9 deletions(-)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 041ef6f..7c96b8b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1389,16 +1389,17 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
return nested_svm_exit_handled(svm);
}
-static inline int nested_svm_intr(struct vcpu_svm *svm)
+/* This function returns true if it is safe to enable the irq window */
+static inline bool nested_svm_intr(struct vcpu_svm *svm)
{
if (!is_nested(svm))
- return 0;
+ return true;
if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
- return 0;
+ return true;
if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
- return 0;
+ return false;
svm->vmcb->control.exit_code = SVM_EXIT_INTR;
@@ -1411,10 +1412,10 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
*/
svm->nested.exit_required = true;
trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
- return 1;
+ return false;
}
- return 0;
+ return true;
}
static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa)
@@ -2562,13 +2563,11 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- nested_svm_intr(svm);
-
/* In case GIF=0 we can't rely on the CPU to tell us when
* GIF becomes 1, because that's a separate STGI/VMRUN intercept.
* The next time we get that intercept, this function will be
* called again though and we'll get the vintr intercept. */
- if (gif_set(svm)) {
+ if (gif_set(svm) && nested_svm_intr(svm)) {
svm_set_vintr(svm);
svm_inject_irq(svm, 0x0);
}
--
1.6.6
Signed-off-by: Joerg Roedel <joerg....@amd.com>
---
arch/x86/kvm/svm.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9a4f9ee..3f59cbd 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1423,6 +1423,8 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa)
{
struct page *page;
+ might_sleep();
+
page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
if (is_error_page(page))
goto error;
--
1.6.6
Signed-off-by: Joerg Roedel <joerg....@amd.com>
---
arch/x86/kvm/svm.c | 4 ++++
1 files changed, 4 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 25d26ec..9a4f9ee 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1644,9 +1644,13 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
nested_vmcb->save.ds = vmcb->save.ds;
nested_vmcb->save.gdtr = vmcb->save.gdtr;
nested_vmcb->save.idtr = vmcb->save.idtr;
+ nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
if (npt_enabled)
nested_vmcb->save.cr3 = vmcb->save.cr3;
+ else
+ nested_vmcb->save.cr3 = svm->vcpu.arch.cr3;
nested_vmcb->save.cr2 = vmcb->save.cr2;
+ nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
nested_vmcb->save.rflags = vmcb->save.rflags;
nested_vmcb->save.rip = vmcb->save.rip;
nested_vmcb->save.rsp = vmcb->save.rsp;
--
1.6.6
kunmap() takes a struct page *, not the virtual address (a consistent
source of bugs).
kmap() is generally an unloved interface; it is slow and possibly
deadlock prone, but it's better than sleeping in atomic context. If you
can hack your way around it, that is preferred.
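To illustrate the pairing (sketch only; the helper names are made up and
this is not a patch against svm.c):

/*
 * kmap()/kunmap() take the struct page *, while kmap_atomic() and
 * kunmap_atomic() return and take the mapped virtual address.
 * nested_svm_unmap() above hands the address to kunmap() - that is
 * the mixup.
 */
static void *map_guest_page(struct kvm *kvm, gfn_t gfn, struct page **pagep)
{
	struct page *page = gfn_to_page(kvm, gfn);

	if (is_error_page(page)) {
		kvm_release_page_clean(page);
		return NULL;
	}

	*pagep = page;
	return kmap(page);		/* may sleep */
}

static void unmap_guest_page(struct page *page)
{
	kunmap(page);			/* takes the page, not the address */
	kvm_release_page_dirty(page);
}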
--
error compiling committee.c: too many arguments to function
What do you say to
if (nested_svm_intercepts(svm))
svm->nested.exit_required = true;
here, and recoding nested_svm_exit_handled() to call
nested_svm_intercepts()? I think it improves readability a little by
avoiding a function that changes behaviour according to how it is called.
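Something like this, untested, just to show the shape I have in mind
(the big switch body is elided):

/* Only answers "does L1 want this exit?"; never performs the #vmexit. */
static bool nested_svm_intercepts(struct vcpu_svm *svm)
{
	u32 exit_code = svm->vmcb->control.exit_code;

	/* ... the big switch from nested_svm_exit_handled() ... */

	return false;
}

static int nested_svm_exit_handled(struct vcpu_svm *svm)
{
	if (!nested_svm_intercepts(svm))
		return NESTED_EXIT_HOST;

	nested_svm_vmexit(svm);
	return NESTED_EXIT_DONE;
}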
Long term, we may want to split out the big switch into the individual
handlers, to avoid decoding the exit reason twice.
--
error compiling committee.c: too many arguments to function
--
...
> @@ -973,6 +973,7 @@ static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
>
> static void update_cr0_intercept(struct vcpu_svm *svm)
> {
> + struct vmcb *vmcb = svm->vmcb;
> ulong gcr0 = svm->vcpu.arch.cr0;
> u64 *hcr0 = &svm->vmcb->save.cr0;
>
> @@ -984,11 +985,25 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
>
>
> if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
> - svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
> - svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
> + vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
> + vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
> + if (is_nested(svm)) {
> + struct vmcb *hsave = svm->nested.hsave;
> +
> + hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
> + hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
> + vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read;
> + vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
>
Why are the last two lines needed?
> + }
> } else {
> svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
> svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
> + if (is_nested(svm)) {
> + struct vmcb *hsave = svm->nested.hsave;
> +
> + hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
> + hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
> + }
> }
> }
>
Maybe it's better to call update_cr0_intercept() after a vmexit instead,
both to avoid this repetition and because the if () may take a different
branch for the nested guest cr0 than for the guest cr0.
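Untested, but something along these lines is what I mean (the placement
is hypothetical):

static int nested_svm_vmexit(struct vcpu_svm *svm)
{
	/* ... copy state back to the nested vmcb, reload from hsave ... */

	/* recompute once, now that cr0 reflects the L1 view */
	update_cr0_intercept(svm);

	return 0;
}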
--
error compiling committee.c: too many arguments to function
--
Overall looks good. Would appreciate Alex looking over these as well.
--
error compiling committee.c: too many arguments to function
--
> On 02/18/2010 01:38 PM, Joerg Roedel wrote:
>> Hi,
>>
>> Here are a couple of fixes for the nested SVM implementation. I collected these
>> fixes mostly while trying to get Windows 7 64-bit running as an L2 guest. The most
>> important fixes in this set make lazy FPU switching work with nested SVM and fix
>> the nested TPR handling. Without the latter fix the L1 guest freezes when
>> trying to run Win7 as an L2 guest. Please review and comment on these patches :-)
>>
>
> Overall looks good. Would appreciate Alex looking over these as well.
The kmap thing is broken though, right?
Alex
--
Short for "To De Befined", I presume.
--
error compiling committee.c: too many arguments to function
--
> TDB.
TDB? That's not a patch description.
Alex
Oh yes, but that's a search and replace, not something needing deep rework.
--
error compiling committee.c: too many arguments to function
--
Ah true, thanks. I'll fix that.
> kmap() is generally an unloved interface, it is slow and possibly
> deadlock prone, but it's better than sleeping in atomic context. If
> you can hack your way around it, that is preferred.
Best would be to use kvm_read_guest, but I fear that this will have a
performance impact. Maybe I'll try this and measure whether it really has
a significant impact.
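For the msrpm check that would look roughly like this (untested sketch;
the helper name and the error handling are mine):

static bool nested_svm_msr_intercepted(struct vcpu_svm *svm,
				       u32 byte_offset, u32 bit)
{
	u64 gpa = svm->nested.vmcb_msrpm + byte_offset;
	u8 byte;

	if (kvm_read_guest(svm->vcpu.kvm, gpa, &byte, 1))
		return true;	/* unreadable bitmap: assume intercepted */

	return byte & (1 << bit);
}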
Joerg
That's a good idea, will change that. It improves readability.
> Long term, we may want to split out the big switch into the
> individual handlers, to avoid decoding the exit reason twice.
I don't think that's a good idea. The nested exit handling is at the
beginning of svm_handle_exit to hide the nested vcpu state from the KVM
logic, which may not be aware of nesting at all. My rationale is that the
host hypervisor shouldn't see any exits that need to be reinjected into
the L1 hypervisor.
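Simplified from the actual exit handler, the dispatch looks roughly like
this:

static int handle_exit(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (is_nested(svm)) {
		int vmexit;

		vmexit = nested_svm_exit_special(svm);
		if (vmexit == NESTED_EXIT_CONTINUE)
			vmexit = nested_svm_exit_handled(svm);
		if (vmexit == NESTED_EXIT_DONE)
			return 1;	/* reflected to L1, host never sees it */
	}

	/* ... normal exit-reason dispatch for the host follows ... */
	return 1;
}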
Joerg
Great. I'll post again with fixes soon.
Joerg
Oops, I just forgot to give this patch a proper commit message. I'll add
one for the next post.
Joerg
Because we don't know if the L1 hypervisor wants to intercept cr0 itself.
In that case its intercept bits need to stay enabled.
> >+ }
> > } else {
> > svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
> > svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
> >+ if (is_nested(svm)) {
> >+ struct vmcb *hsave = svm->nested.hsave;
> >+
> >+ hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
> >+ hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
> >+ }
> > }
> > }
>
> Maybe it's better to call update_cr0_intercept() after a vmexit
> instead, to avoid this repetition, and since the if () may take a
> different branch for the nested guest and guest cr0.
Thinking about it again, I am not sure this is needed at all. At vmexit
emulation we call svm_set_cr0, which itself calls update_cr0_intercept.
I'll try this.
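Condensed (most of the function elided), the path I mean:

/* the emulated #vmexit restores L1's cr0 through svm_set_cr0(): */
static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/* ... move the guest cr0 into vmcb->save.cr0 ... */

	update_cr0_intercept(svm);	/* refreshed on every cr0 write */
}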
Joerg