This is the updated version of the msrpm merge optimization in the nested vmrun
path. With nested-shadow paging I see a performance improvement by a factor of
two for kernel compile times in a uniprocessor (UP) guest.
A few other fixes are also included, such as the iopm check for nested IO
intercepts and the masking of the lower 12 bits of the msrpm address supplied in
the nested vmcb.
Please review and/or apply these changes.
Joerg
diffstat:
arch/x86/kvm/svm.c | 187 +++++++++++++++++++++++++++++++++++++---------------
1 files changed, 133 insertions(+), 54 deletions(-)
shortlog:
Joerg Roedel (5):
KVM: SVM: Move msrpm offset calculation to separate function
KVM: SVM: Optimize nested svm msrpm merging
KVM: SVM: Use svm_msrpm_offset in nested_svm_exit_handled_msr
KVM: SVM: Add correct handling of nested iopm
KVM: SVM: Ignore lower 12 bit of nested msrpm_pa
--
Signed-off-by: Joerg Roedel <joerg....@amd.com>
---
arch/x86/kvm/svm.c | 25 +++++++++++++++++++++++++
1 files changed, 25 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index bb75a44..3859e2c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -78,6 +78,7 @@ struct nested_state {
/* gpa pointers to the real vectors */
u64 vmcb_msrpm;
+ u64 vmcb_iopm;
/* A VMEXIT is required but not yet emulated */
bool exit_required;
@@ -1603,6 +1604,26 @@ static void nested_svm_unmap(struct page *page)
kvm_release_page_dirty(page);
}
+static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
+{
+ unsigned port;
+ u8 val, bit;
+ u64 gpa;
+
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
+ return NESTED_EXIT_HOST;
+
+ port = svm->vmcb->control.exit_info_1 >> 16;
+ gpa = svm->nested.vmcb_iopm + (port / 8);
+ bit = port % 8;
+ val = 0;
+
+ if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1))
+ val &= (1 << bit);
+
+ return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+}
+
static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
{
u32 offset, msr, value;
@@ -1665,6 +1686,9 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
case SVM_EXIT_MSR:
vmexit = nested_svm_exit_handled_msr(svm);
break;
+ case SVM_EXIT_IOIO:
+ vmexit = nested_svm_intercept_ioio(svm);
+ break;
case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
if (svm->nested.intercept_cr_read & cr_bits)
@@ -1989,6 +2013,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
svm->vmcb->save.cpl = nested_vmcb->save.cpl;
svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
+ svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
/* cache intercepts */
svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read;
--
1.7.0
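For reference, a quick worked example of the iopm lookup above (illustrative
numbers, not part of the patch): SVM keeps one intercept bit per IO port, so
for port 0x3f8 (the usual COM1 base) the code reads the byte at
	gpa = vmcb_iopm + 0x3f8 / 8 = vmcb_iopm + 127
and tests bit
	0x3f8 % 8 = 0
within it.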
Signed-off-by: Joerg Roedel <joerg....@amd.com>
---
arch/x86/kvm/svm.c | 53 ++++++++++++++++++++++++++++++++++++---------------
1 files changed, 37 insertions(+), 16 deletions(-)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index df6f491..d8d4e35 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -198,6 +198,28 @@ static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
#define MSRS_RANGE_SIZE 2048
#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
+#define MSR_INVALID 0xffffffff
+
+static u32 svm_msrpm_offset(u32 msr)
+{
+ u32 offset;
+ int i;
+
+ for (i = 0; i < NUM_MSR_MAPS; i++) {
+ if (msr < msrpm_ranges[i] ||
+ msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
+ continue;
+
+ offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
+ offset += (i * MSRS_RANGE_SIZE); /* add range offset */
+
+ /* Now we have the u8 offset - but need the u32 offset */
+ return offset / 4;
+ }
+
+ /* MSR not in any range */
+ return MSR_INVALID;
+}
#define MAX_INST_SIZE 15
@@ -417,23 +439,22 @@ err_1:
static void set_msr_interception(u32 *msrpm, unsigned msr,
int read, int write)
{
- int i;
+ u8 bit_read, bit_write;
+ unsigned long tmp;
+ u32 offset;
- for (i = 0; i < NUM_MSR_MAPS; i++) {
- if (msr >= msrpm_ranges[i] &&
- msr < msrpm_ranges[i] + MSRS_IN_RANGE) {
- u32 msr_offset = (i * MSRS_IN_RANGE + msr -
- msrpm_ranges[i]) * 2;
-
- u32 *base = msrpm + (msr_offset / 32);
- u32 msr_shift = msr_offset % 32;
- u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1);
- *base = (*base & ~(0x3 << msr_shift)) |
- (mask << msr_shift);
- return;
- }
- }
- BUG();
+ offset = svm_msrpm_offset(msr);
+ bit_read = 2 * (msr & 0x0f);
+ bit_write = 2 * (msr & 0x0f) + 1;
+
+ BUG_ON(offset == MSR_INVALID);
+
+ tmp = msrpm[offset];
+
+ read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
+ write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
+
+ msrpm[offset] = tmp;
}
static void svm_vcpu_init_msrpm(u32 *msrpm)
--
1.7.0
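To sanity-check the offset math above, a worked example (not part of the
patch) for MSR 0xc0000080 (EFER): it falls into range 1 (base 0xc0000000),
and with 4 MSRs per byte the byte offset within the range is
	(0xc0000080 - 0xc0000000) / 4 = 32
adding the range base gives
	32 + 1 * 2048 = 2080 bytes, i.e. u32 offset 2080 / 4 = 520
and the read/write bits for EFER inside that u32 are 2 * (msr & 0xf) = 0
and 1, matching the bit_read/bit_write computation in set_msr_interception()
above.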
Signed-off-by: Joerg Roedel <joerg....@amd.com>
---
arch/x86/kvm/svm.c | 44 ++++++++++++++------------------------------
1 files changed, 14 insertions(+), 30 deletions(-)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d15e0ea..bb75a44 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1605,40 +1605,24 @@ static void nested_svm_unmap(struct page *page)
static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
{
- u32 param = svm->vmcb->control.exit_info_1 & 1;
- u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
- bool ret = false;
- u32 t0, t1;
- u8 val;
+ u32 offset, msr, value;
+ int write, mask;
- if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
- return false;
+ msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+ offset = svm_msrpm_offset(msr);
+ write = svm->vmcb->control.exit_info_1 & 1;
+ mask = 1 << ((2 * (msr & 0xf)) + write);
- switch (msr) {
- case 0 ... 0x1fff:
- t0 = (msr * 2) % 8;
- t1 = msr / 8;
- break;
- case 0xc0000000 ... 0xc0001fff:
- t0 = (8192 + msr - 0xc0000000) * 2;
- t1 = (t0 / 8);
- t0 %= 8;
- break;
- case 0xc0010000 ... 0xc0011fff:
- t0 = (16384 + msr - 0xc0010000) * 2;
- t1 = (t0 / 8);
- t0 %= 8;
- break;
- default:
- ret = true;
- goto out;
- }
+ if (offset == MSR_INVALID)
+ return NESTED_EXIT_DONE;
- if (!kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + t1, &val, 1))
- ret = val & ((1 << param) << t0);
+ /* Offset is in 32 bit units but we need it in 8 bit units */
+ offset = svm->nested.vmcb_msrpm + (offset * 4);
-out:
- return ret;
+ if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
+ return NESTED_EXIT_DONE;
+
+ return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}
static int nested_svm_exit_special(struct vcpu_svm *svm)
--
1.7.0
Signed-off-by: Joerg Roedel <joerg....@amd.com>
---
arch/x86/kvm/svm.c | 67 +++++++++++++++++++++++++++++++++++++++++++++-------
1 files changed, 58 insertions(+), 9 deletions(-)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d8d4e35..d15e0ea 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -92,6 +92,9 @@ struct nested_state {
};
+#define MSRPM_OFFSETS 16
+static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
+
struct vcpu_svm {
struct kvm_vcpu vcpu;
struct vmcb *vmcb;
@@ -436,6 +439,34 @@ err_1:
}
+static void add_msr_offset(u32 offset)
+{
+ u32 old;
+ int i;
+
+again:
+ for (i = 0; i < MSRPM_OFFSETS; ++i) {
+ old = msrpm_offsets[i];
+
+ if (old == offset)
+ return;
+
+ if (old != MSR_INVALID)
+ continue;
+
+ if (cmpxchg(&msrpm_offsets[i], old, offset) != old)
+ goto again;
+
+ return;
+ }
+
+ /*
+ * If this BUG triggers the msrpm_offsets table has an overflow. Just
+ * increase MSRPM_OFFSETS in this case.
+ */
+ BUG();
+}
+
static void set_msr_interception(u32 *msrpm, unsigned msr,
int read, int write)
{
@@ -454,6 +485,8 @@ static void set_msr_interception(u32 *msrpm, unsigned msr,
read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
+ add_msr_offset(offset);
+
msrpm[offset] = tmp;
}
@@ -511,6 +544,8 @@ static __init int svm_hardware_setup(void)
memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
+ memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
+
if (boot_cpu_has(X86_FEATURE_NX))
kvm_enable_efer_bits(EFER_NX);
@@ -775,6 +810,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
svm->nested.hsave = page_address(hsave_page);
svm->nested.msrpm = page_address(nested_msrpm_pages);
+ svm_vcpu_init_msrpm(svm->nested.msrpm);
svm->vmcb = page_address(page);
clear_page(svm->vmcb);
@@ -1846,20 +1882,33 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
- u32 *nested_msrpm;
- struct page *page;
+ /*
+ * This function merges the msr permission bitmaps of kvm and the
+ * nested vmcb. It is optimized in that it only merges the parts where
+ * the kvm msr permission bitmap may contain zero bits
+ */
int i;
- nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, &page);
- if (!nested_msrpm)
- return false;
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+ return true;
- for (i = 0; i < PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++)
- svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i];
+ for (i = 0; i < MSRPM_OFFSETS; i++) {
+ u32 value, p;
+ u64 offset;
- svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
+ if (msrpm_offsets[i] == 0xffffffff)
+ break;
- nested_svm_unmap(page);
+ p = msrpm_offsets[i];
+ offset = svm->nested.vmcb_msrpm + (p * 4);
+
+ if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
+ return false;
+
+ svm->nested.msrpm[p] = svm->msrpm[p] | value;
+ }
+
+ svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
return true;
}
--
1.7.0
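The speedup above comes from two things: the old code mapped the guest's
msrpm page and OR-ed the entire bitmap (PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)
bytes) into the nested msrpm on every nested vmrun, while the new code only
reads the few u32s (at most MSRPM_OFFSETS = 16) that can actually contain
zero bits in the host bitmap, via kvm_read_guest() instead of a full
map/unmap.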
Signed-off-by: Joerg Roedel <joerg....@amd.com>
---
arch/x86/kvm/svm.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 3859e2c..cfc8a90 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2012,7 +2012,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
svm->vmcb->save.cpl = nested_vmcb->save.cpl;
- svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
+ svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
/* cache intercepts */
--
1.7.0
Why all this atomic cleverness? The possible offsets are all determined
statically. Even if you do them dynamically (makes sense when
considering pmu passthrough), it's per-vcpu and therefore single
threaded (just move msrpm_offsets into vcpu context).
> @@ -1846,20 +1882,33 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
>
> static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
> {
> - u32 *nested_msrpm;
> - struct page *page;
> + /*
> + * This function merges the msr permission bitmaps of kvm and the
> + * nested vmcb. It is omptimized in that it only merges the parts where
> + * the kvm msr permission bitmap may contain zero bits
> + */
>
A comment that describes the entire function can be moved above the
function, freeing a whole tab stop for contents.
--
Ah true, I will fix that. Thanks.
> btw, the op-level ternary expression is terrible, relying solely on
> *_bit()'s side effects. Please convert to an ordinary if.
>
> btw2, use __set_bit() which atomic operation is not needed.
Right, will switch to __set_bit and __clear_bit.
Joerg
This can fault - set_bit() accesses an unsigned long, which can be 8
bytes, while offset can point into the last u32 of msrpm. So this needs
either to revert to u32 shift/mask ops or msrpm be changed to a ulong
array (actually better, since bitmaps in general are defined as arrays
of ulongs).
btw, the top-level ternary expression is terrible, relying solely on
*_bit()'s side effects. Please convert to an ordinary if.
btw2, use __set_bit(), as the atomic operation is not needed.
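For illustration, a minimal sketch (not the posted patch) of the u32
shift/mask variant with ordinary ifs, which keeps all accesses within the
bitmap:

static void set_msr_interception(u32 *msrpm, unsigned msr,
				 int read, int write)
{
	u32 offset = svm_msrpm_offset(msr);
	u8  bit_read  = 2 * (msr & 0x0f);	/* read-intercept bit */
	u8  bit_write = bit_read + 1;		/* write-intercept bit */
	u32 val;

	BUG_ON(offset == MSR_INVALID);

	val = msrpm[offset];

	if (read)
		val &= ~(1u << bit_read);	/* allow reads */
	else
		val |= (1u << bit_read);	/* intercept reads */

	if (write)
		val &= ~(1u << bit_write);	/* allow writes */
	else
		val |= (1u << bit_write);	/* intercept writes */

	msrpm[offset] = val;
}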
--
A kvm_{test,set,clear}_guest_bit() would be useful, we have several
users already (not a requirement for this patchset).
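To make the idea concrete, a hypothetical sketch of such a helper (this is
not existing KVM API, just what it could roughly look like):

static int kvm_test_guest_bit(struct kvm *kvm, gpa_t gpa, unsigned int nr)
{
	u8 byte;

	/* read the guest byte that contains bit nr, counting from gpa */
	if (kvm_read_guest(kvm, gpa + nr / 8, &byte, 1))
		return -EFAULT;

	return (byte >> (nr % 8)) & 1;
}

Both nested_svm_exit_handled_msr() and nested_svm_intercept_ioio() could then
be expressed as a single call to such a helper.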
--
Hm, if the function would also calculate the mask, then it would be
useful for set_msr_interception() as well.
--
> The msr_offset table is the same for all guests. It doesn't make sense
> to keep it per vcpu because it will currently look the same for all
> vcpus. For standard guests this array contains 3 entrys. It is marked
> with __read_mostly for the same reason.
I'm still not convinced on this way of doing things. If it's static, make it static. If it's dynamic, make it dynamic. Dynamically generating a static list just sounds plain wrong to me.
Alex
The msr_offset table is the same for all guests. It doesn't make sense
to keep it per vcpu because it will currently look the same for all
vcpus. For standard guests this array contains 3 entries. It is marked
with __read_mostly for the same reason.
> >@@ -1846,20 +1882,33 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
> >
> > static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
> > {
> >- u32 *nested_msrpm;
> >- struct page *page;
> >+ /*
> >+ * This function merges the msr permission bitmaps of kvm and the
> >+ * nested vmcb. It is omptimized in that it only merges the parts where
> >+ * the kvm msr permission bitmap may contain zero bits
> >+ */
>
> A comment that describes the entire function can be moved above the
> function, freeing a whole tab stop for contents.
Ok, will move it out of the function.
Joerg
In that case, you can calculate it during module initialization.
--
Stop. I had a static list in the first version of the patch. This list
was fine except the fact that a developer needs to remember to update
this list if the list of non-intercepted msrs is expanded. The whole
reason for a dynamically built list is to take the task of maintaining
the list away from the developer and remove a possible source of hard to
find bugs. This is what the current approach does.
Joerg
I was more thinking of replacing the function calls with a list of MSRs. You can then take that list on module init, generate the MSR bitmap once and be good.
Later you can use the same list for the nested bitmap.
Alex
--
The problem was the two lists. If you had a
static struct svm_direct_access_msrs {
	u32 index;
	bool longmode_only;
} direct_access_msrs[] = {
	...
};
You could generate
static unsigned *msrpm_offsets_longmode, *msrpm_offsets_legacy;
as well as the original bitmaps at module init, no?
--
True for the msrs the guest always has access to. But for the lbr-msrs
the intercept bits may change at runtime. So an additional flag is
required to indicate if the bits should be cleared initially.
Joerg
The msr-bitmap is per-vcpu to support lbr-virtualization. The access to
the lbr-msrs is only enabled if the guest vcpu enables lbr-debugging.
A list of MSRs keeps the problem that the information is maintained in
two places: the list and the various set_msr_interception() function calls.
Joerg
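For context, the runtime toggling looks roughly like the existing
svm_enable_lbrv() in svm.c (a sketch based on that function; the exact MSR
list may differ):

static void svm_enable_lbrv(struct vcpu_svm *svm)
{
	u32 *msrpm = svm->msrpm;

	/* enable LBR virtualization and stop intercepting the LBR msrs */
	svm->vmcb->control.lbr_ctl = 1;
	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
	set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
	set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
}

These are exactly the intercept bits that would need the extra "clear
initially?" flag in a static list.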
> On Fri, Feb 26, 2010 at 03:10:13PM +0200, Avi Kivity wrote:
>> On 02/26/2010 03:04 PM, Joerg Roedel wrote:
>>>
>>>> I'm still not convinced on this way of doing things. If it's static,
>>>> make it static. If it's dynamic, make it dynamic. Dynamically
>>>> generating a static list just sounds plain wrong to me.
>>> Stop. I had a static list in the first version of the patch. This list
>>> was fine except the fact that a developer needs to remember to update
>>> this list if the list of non-intercepted msrs is expanded. The whole
>>> reason for a dynamically built list is to take the task of maintaining
>>> the list away from the developer and remove a possible source of hard to
>>> find bugs. This is what the current approach does.
>>
>> The problem was the two lists. If you had a
>>
>> static struct svm_direct_access_msrs = {
>> u32 index;
>> bool longmode_only;
>> } direct_access_msrs = {
>> ...
>> };
>>
>> You could generate
>>
>> static unsigned *msrpm_offsets_longmode, *msrpm_offsets_legacy;
>>
>> as well as the original bitmaps at module init, no?
>
> True for the msrs the guest always has access too. But for the lbr-msrs
> the intercept bits may change at runtime. So an addtional flag is
> required to indicate if the bits should be cleared initially.
So the msrpm bitmap changes dynamically for each vcpu? Great, make it fully dynamic then, changing the vcpu->arch.msrpm only from within its vcpu context. No need for atomic ops.
Alex
--
The msrpm_offsets table is global. But I think I will follow Avi's
suggestion and create a static direct_access_msrs list and generate the
msrpm_offsets at module_init. This solves the problem of two independent
lists too.
Joerg
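For illustration, a sketch of what such a list might look like (made-up
entries, not the actual patch):

static const struct svm_direct_access_msrs {
	u32 index;	/* MSR number */
	bool always;	/* intercept cleared for every guest at vcpu init */
} direct_access_msrs[] = {
	{ .index = MSR_STAR,			.always = true	},
	{ .index = MSR_IA32_SYSENTER_CS,	.always = true	},
	{ .index = MSR_IA32_LASTBRANCHFROMIP,	.always = false	},	/* LBR */
	{ .index = MSR_INVALID,			.always = false	},	/* end marker */
};

init_msrpm_offsets() in the patch below then walks such a list until it hits
the MSR_INVALID end marker, so the msrpm bitmaps and the msrpm_offsets table
are both generated from one place at module init.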
But with LBR virt, maybe a fully dynamic approach is better. Just have
static lists for updating the msrpm and offset table dynamically.
--
set_msr_interception() is the only caller which needs this mask. I
prefer to keep that calculation there. The msrpm merge function only
needs the offset.
Joerg
This is the third round of the msrpm merge optimization patches for
nested svm. The change compared to the previous post is the introduction of a
direct_access_msrs list which contains all msrs that a guest might
directly access. This list is used to initialize the msrpm bitmaps and
the msrpm_offset table used for merging the two bitmaps. This optimization
more than doubles the performance of kernel compiles in the nested guest
using nested-shadow paging.
The other fixes in this set are unchanged from the last version.
Patch 1/7 is new because it was forgotten in the last post.
Thanks,
Joerg
Diffstat:
arch/x86/kvm/svm.c | 259 +++++++++++++++++++++++++++++++++++++++-------------
1 files changed, 195 insertions(+), 64 deletions(-)
Shortlog:
Joerg Roedel (7):
KVM: SVM: Return correct values in nested_svm_exit_handled_msr
KVM: SVM: Move msrpm offset calculation to separate function
KVM: SVM: Introduce direct access msr list
KVM: SVM: Optimize nested svm msrpm merging
KVM: SVM: Use svm_msrpm_offset in nested_svm_exit_handled_msr
KVM: SVM: Add correct handling of nested iopm
KVM: SVM: Ignore lower 12 bit of nested msrpm_pa
--
Signed-off-by: Joerg Roedel <joerg....@amd.com>
---
arch/x86/kvm/svm.c | 10 +++++-----
1 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1397877..9dfbbae 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1546,16 +1546,16 @@ static void nested_svm_unmap(struct page *page)
kvm_release_page_dirty(page);
}
-static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
+static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
{
u32 param = svm->vmcb->control.exit_info_1 & 1;
u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
- bool ret = false;
u32 t0, t1;
+ int ret;
u8 val;
if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
- return false;
+ return NESTED_EXIT_HOST;
switch (msr) {
case 0 ... 0x1fff:
@@ -1573,12 +1573,12 @@ static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
t0 %= 8;
break;
default:
- ret = true;
+ ret = NESTED_EXIT_DONE;
goto out;
}
if (!kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + t1, &val, 1))
- ret = val & ((1 << param) << t0);
+ ret = val & ((1 << param) << t0) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
out:
return ret;
--
1.7.0
Signed-off-by: Joerg Roedel <joerg....@amd.com>
---
arch/x86/kvm/svm.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++------
1 files changed, 71 insertions(+), 9 deletions(-)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 18d7938..c04ce1e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -92,6 +92,9 @@ struct nested_state {
};
+#define MSRPM_OFFSETS 16
+static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
+
struct vcpu_svm {
struct kvm_vcpu vcpu;
struct vmcb *vmcb;
@@ -509,6 +512,49 @@ static void svm_vcpu_init_msrpm(u32 *msrpm)
}
}
+static void add_msr_offset(u32 offset)
+{
+ int i;
+
+ for (i = 0; i < MSRPM_OFFSETS; ++i) {
+
+ /* Offset already in list? */
+ if (msrpm_offsets[i] == offset)
+ return;
+
+ /* Slot used by another offset? */
+ if (msrpm_offsets[i] != MSR_INVALID)
+ continue;
+
+ /* Add offset to list */
+ msrpm_offsets[i] = offset;
+
+ return;
+ }
+
+ /*
+ * If this BUG triggers the msrpm_offsets table has an overflow. Just
+ * increase MSRPM_OFFSETS in this case.
+ */
+ BUG();
+}
+
+static void init_msrpm_offsets(void)
+{
+ int i;
+
+ memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
+
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+ u32 offset;
+
+ offset = svm_msrpm_offset(direct_access_msrs[i].index);
+ BUG_ON(offset == MSR_INVALID);
+
+ add_msr_offset(offset);
+ }
+}
+
static void svm_enable_lbrv(struct vcpu_svm *svm)
{
u32 *msrpm = svm->msrpm;
@@ -547,6 +593,8 @@ static __init int svm_hardware_setup(void)
memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
+ init_msrpm_offsets();
+
if (boot_cpu_has(X86_FEATURE_NX))
kvm_enable_efer_bits(EFER_NX);
@@ -811,6 +859,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
svm->nested.hsave = page_address(hsave_page);
svm->nested.msrpm = page_address(nested_msrpm_pages);
+ svm_vcpu_init_msrpm(svm->nested.msrpm);
svm->vmcb = page_address(page);
clear_page(svm->vmcb);
@@ -1882,20 +1931,33 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
- u32 *nested_msrpm;
- struct page *page;
+ /*
+ * This function merges the msr permission bitmaps of kvm and the
+ * nested vmcb. It is optimized in that it only merges the parts where
+ * the kvm msr permission bitmap may contain zero bits
+ */
int i;
- nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, &page);
- if (!nested_msrpm)
- return false;
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+ return true;
- for (i = 0; i < PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++)
- svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i];
+ for (i = 0; i < MSRPM_OFFSETS; i++) {
+ u32 value, p;
+ u64 offset;
- svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
+ if (msrpm_offsets[i] == 0xffffffff)
+ break;
- nested_svm_unmap(page);
+ offset = svm->nested.vmcb_msrpm + msrpm_offsets[i];
+ p = msrpm_offsets[i] / 4;
+
+ if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
+ return false;
+
+ svm->nested.msrpm[p] = svm->msrpm[p] | value;
+ }
+
+ svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
return true;
}
--
1.7.0
Applied, thanks.