
[PATCH 01/11] iommu-api: Rename ->{un}map function pointers to ->{un}map_range


Joerg Roedel
Jan 28, 2010, 6:40:01 AM
The new function pointer names better match the top-level
functions of the iommu-api which use them. The main
intention of this change is to free the ->{un}map pointer
names for two new mapping functions.

Signed-off-by: Joerg Roedel <joerg....@amd.com>
---
arch/x86/kernel/amd_iommu.c | 4 ++--
drivers/base/iommu.c | 4 ++--
drivers/pci/intel-iommu.c | 4 ++--
include/linux/iommu.h | 8 ++++----
4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index adb0ba0..59cae7c 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2515,8 +2515,8 @@ static struct iommu_ops amd_iommu_ops = {
.domain_destroy = amd_iommu_domain_destroy,
.attach_dev = amd_iommu_attach_device,
.detach_dev = amd_iommu_detach_device,
- .map = amd_iommu_map_range,
- .unmap = amd_iommu_unmap_range,
+ .map_range = amd_iommu_map_range,
+ .unmap_range = amd_iommu_unmap_range,
.iova_to_phys = amd_iommu_iova_to_phys,
.domain_has_cap = amd_iommu_domain_has_cap,
};
diff --git a/drivers/base/iommu.c b/drivers/base/iommu.c
index 8ad4ffe..f4c86c4 100644
--- a/drivers/base/iommu.c
+++ b/drivers/base/iommu.c
@@ -83,14 +83,14 @@ EXPORT_SYMBOL_GPL(iommu_detach_device);
int iommu_map_range(struct iommu_domain *domain, unsigned long iova,
phys_addr_t paddr, size_t size, int prot)
{
- return iommu_ops->map(domain, iova, paddr, size, prot);
+ return iommu_ops->map_range(domain, iova, paddr, size, prot);
}
EXPORT_SYMBOL_GPL(iommu_map_range);

void iommu_unmap_range(struct iommu_domain *domain, unsigned long iova,
size_t size)
{
- iommu_ops->unmap(domain, iova, size);
+ iommu_ops->unmap_range(domain, iova, size);
}
EXPORT_SYMBOL_GPL(iommu_unmap_range);

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 4173125..a714e3d 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -3714,8 +3714,8 @@ static struct iommu_ops intel_iommu_ops = {
.domain_destroy = intel_iommu_domain_destroy,
.attach_dev = intel_iommu_attach_device,
.detach_dev = intel_iommu_detach_device,
- .map = intel_iommu_map_range,
- .unmap = intel_iommu_unmap_range,
+ .map_range = intel_iommu_map_range,
+ .unmap_range = intel_iommu_unmap_range,
.iova_to_phys = intel_iommu_iova_to_phys,
.domain_has_cap = intel_iommu_domain_has_cap,
};
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 3af4ffd..0f18f37 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -36,10 +36,10 @@ struct iommu_ops {
void (*domain_destroy)(struct iommu_domain *domain);
int (*attach_dev)(struct iommu_domain *domain, struct device *dev);
void (*detach_dev)(struct iommu_domain *domain, struct device *dev);
- int (*map)(struct iommu_domain *domain, unsigned long iova,
- phys_addr_t paddr, size_t size, int prot);
- void (*unmap)(struct iommu_domain *domain, unsigned long iova,
- size_t size);
+ int (*map_range)(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot);
+ void (*unmap_range)(struct iommu_domain *domain, unsigned long iova,
+ size_t size);
phys_addr_t (*iova_to_phys)(struct iommu_domain *domain,
unsigned long iova);
int (*domain_has_cap)(struct iommu_domain *domain,
--
1.6.6



Joerg Roedel
Jan 28, 2010, 6:40:02 AM
This patch introduces a generic function to find out the
host page size for a given gfn. This function is needed by
the kvm iommu code. This patch also simplifies the x86
host_mapping_level function.

Signed-off-by: Joerg Roedel <joerg....@amd.com>
---

arch/x86/kvm/mmu.c | 18 ++----------------
include/linux/kvm_host.h | 1 +
virt/kvm/kvm_main.c | 25 +++++++++++++++++++++++++
3 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ff2b2e8..226fa8e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -467,24 +467,10 @@ static int has_wrprotected_page(struct kvm *kvm,

static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
{
- unsigned long page_size = PAGE_SIZE;
- struct vm_area_struct *vma;
- unsigned long addr;
+ unsigned long page_size;
int i, ret = 0;

- addr = gfn_to_hva(kvm, gfn);
- if (kvm_is_error_hva(addr))
- return PT_PAGE_TABLE_LEVEL;
-
- down_read(&current->mm->mmap_sem);
- vma = find_vma(current->mm, addr);
- if (!vma)
- goto out;
-
- page_size = vma_kernel_pagesize(vma);
-
-out:
- up_read(&current->mm->mmap_sem);
+ page_size = kvm_host_page_size(kvm, gfn);

for (i = PT_PAGE_TABLE_LEVEL;
i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index dfde04b..a946abc 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -300,6 +300,7 @@ int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
+unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn);
void mark_page_dirty(struct kvm *kvm, gfn_t gfn);

void kvm_vcpu_block(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7c5c873..87a4a77 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -45,6 +45,7 @@
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
+#include <linux/hugetlb.h>

#include <asm/processor.h>
#include <asm/io.h>
@@ -884,6 +885,30 @@ int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);

+unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
+{
+ struct vm_area_struct *vma;
+ unsigned long addr, size;
+
+ size = PAGE_SIZE;
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return PAGE_SIZE;
+
+ down_read(&current->mm->mmap_sem);
+ vma = find_vma(current->mm, addr);
+ if (!vma)
+ goto out;
+
+ size = vma_kernel_pagesize(vma);
+
+out:
+ up_read(&current->mm->mmap_sem);
+
+ return size;
+}
+
int memslot_id(struct kvm *kvm, gfn_t gfn)
{
int i;
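
A minimal usage sketch (not part of the patch; the surrounding code is
hypothetical): kvm_host_page_size() returns PAGE_SIZE unless the gfn's
host virtual address lies in a VMA with a larger page size, e.g. a
hugetlbfs mapping, so a caller can simply branch on the result:

	unsigned long psize = kvm_host_page_size(kvm, gfn);

	if (psize > PAGE_SIZE)
		pr_debug("gfn %llx backed by a %lu byte host page\n",
			 (unsigned long long)gfn, psize);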

Joerg Roedel
Jan 28, 2010, 6:40:02 AM
This patch implements the new callbacks for the IOMMU-API
with functions that can handle different page sizes in the
IOMMU page table.

Signed-off-by: Joerg Roedel <joerg....@amd.com>
---

arch/x86/kernel/amd_iommu.c | 29 +++++++++++++++++++++++++++++
1 files changed, 29 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 52e44af..0e068c9 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2552,6 +2552,33 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
iommu_flush_tlb_pde(domain);
}

+static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
+ phys_addr_t paddr, int gfp_order, int iommu_prot)
+{
+ unsigned long page_size = 0x1000UL << gfp_order;
+ struct protection_domain *domain = dom->priv;
+ int prot = 0;
+
+ if (iommu_prot & IOMMU_READ)
+ prot |= IOMMU_PROT_IR;
+ if (iommu_prot & IOMMU_WRITE)
+ prot |= IOMMU_PROT_IW;
+
+ return iommu_map_page(domain, iova, paddr, prot, page_size);
+}
+
+static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
+ int gfp_order)
+{
+ struct protection_domain *domain = dom->priv;
+ unsigned long page_size, unmap_size;
+
+ page_size = 0x1000UL << gfp_order;
+ unmap_size = iommu_unmap_page(domain, iova, page_size);
+
+ return get_order(unmap_size);
+}
+
static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
unsigned long iova)
{
@@ -2587,6 +2614,8 @@ static struct iommu_ops amd_iommu_ops = {
.domain_destroy = amd_iommu_domain_destroy,
.attach_dev = amd_iommu_attach_device,
.detach_dev = amd_iommu_detach_device,
+ .map = amd_iommu_map,
+ .unmap = amd_iommu_unmap,
.map_range = amd_iommu_map_range,
.unmap_range = amd_iommu_unmap_range,
.iova_to_phys = amd_iommu_iova_to_phys,
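
A minimal sketch of the order/size relationship these callbacks rely on
(assuming a 4 KiB base page, which is what the 0x1000UL literal encodes;
the helper below is illustrative only):

	static void order_size_example(void)
	{
		int order = 9;				/* 512 * 4 KiB = 2 MiB */
		unsigned long page_size = 0x1000UL << order;

		/* get_order() inverts the shift, so a fully successful unmap
		 * can report back the same order that ->map() was given */
		WARN_ON(get_order(page_size) != order);
	}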

Joerg Roedel
Jan 28, 2010, 6:40:03 AM
This patch changes the old map_size parameter of alloc_pte
to a page_size parameter, which can be used more easily to
allocate a PTE for intermediate page sizes.

Signed-off-by: Joerg Roedel <joerg....@amd.com>
---

arch/x86/include/asm/amd_iommu_types.h | 28 +++++++++++++++++
arch/x86/kernel/amd_iommu.c | 53 ++++++++++++++++++++------------
2 files changed, 61 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index ba19ad4..5e8da56 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -172,6 +172,34 @@
(~((1ULL << (12 + ((lvl) * 9))) - 1)))
#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr))

+/*
+ * Returns the page table level to use for a given page size
+ * Pagesize is expected to be a power-of-two
+ */
+#define PAGE_SIZE_LEVEL(pagesize) \
+ ((__ffs(pagesize) - 12) / 9)
+/*
+ * Returns the number of ptes to use for a given page size
+ * Pagesize is expected to be a power-of-two
+ */
+#define PAGE_SIZE_PTE_COUNT(pagesize) \
+ (1ULL << ((__ffs(pagesize) - 12) % 9))
+
+/*
+ * Aligns a given io-virtual address to a given page size
+ * Pagesize is expected to be a power-of-two
+ */
+#define PAGE_SIZE_ALIGN(address, pagesize) \
+ ((address) & ~((pagesize) - 1))
+/*
+ * Creates an IOMMU PTE for an address and a given pagesize
+ * The PTE has no permission bits set
+ * Pagesize is expected to be a power-of-two larger than 4096
+ */
+#define PAGE_SIZE_PTE(address, pagesize) \
+ (((address) | ((pagesize) - 1)) & \
+ (~(pagesize >> 1)) & PM_ADDR_MASK)
+
#define IOMMU_PTE_P (1ULL << 0)
#define IOMMU_PTE_TV (1ULL << 1)
#define IOMMU_PTE_U (1ULL << 59)
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 59cae7c..4170031 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -730,18 +730,22 @@ static bool increase_address_space(struct protection_domain *domain,

static u64 *alloc_pte(struct protection_domain *domain,
unsigned long address,
- int end_lvl,
+ unsigned long page_size,
u64 **pte_page,
gfp_t gfp)
{
+ int level, end_lvl;
u64 *pte, *page;
- int level;
+
+ BUG_ON(!is_power_of_2(page_size));

while (address > PM_LEVEL_SIZE(domain->mode))
increase_address_space(domain, gfp);

- level = domain->mode - 1;
- pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+ level = domain->mode - 1;
+ pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+ address = PAGE_SIZE_ALIGN(address, page_size);
+ end_lvl = PAGE_SIZE_LEVEL(page_size);

while (level > end_lvl) {
if (!IOMMU_PTE_PRESENT(*pte)) {
@@ -751,6 +755,10 @@ static u64 *alloc_pte(struct protection_domain *domain,
*pte = PM_LEVEL_PDE(level, virt_to_phys(page));
}

+ /* No level skipping support yet */
+ if (PM_PTE_LEVEL(*pte) != level)
+ return NULL;
+
level -= 1;

pte = IOMMU_PTE_PAGE(*pte);
@@ -806,31 +814,36 @@ static int iommu_map_page(struct protection_domain *dom,
unsigned long bus_addr,
unsigned long phys_addr,
int prot,
- int map_size)
+ unsigned long page_size)
{
u64 __pte, *pte;
-
- bus_addr = PAGE_ALIGN(bus_addr);
- phys_addr = PAGE_ALIGN(phys_addr);
-
- BUG_ON(!PM_ALIGNED(map_size, bus_addr));
- BUG_ON(!PM_ALIGNED(map_size, phys_addr));
+ int i, count;

if (!(prot & IOMMU_PROT_MASK))
return -EINVAL;

- pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL);
+ bus_addr = PAGE_ALIGN(bus_addr);
+ phys_addr = PAGE_ALIGN(phys_addr);
+ count = PAGE_SIZE_PTE_COUNT(page_size);
+ pte = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL);
+
+ for (i = 0; i < count; ++i)
+ if (IOMMU_PTE_PRESENT(pte[i]))
+ return -EBUSY;

- if (IOMMU_PTE_PRESENT(*pte))
- return -EBUSY;
+ if (page_size > PAGE_SIZE) {
+ __pte = PAGE_SIZE_PTE(phys_addr, page_size);
+ __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
+ } else
+ __pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;

- __pte = phys_addr | IOMMU_PTE_P;
if (prot & IOMMU_PROT_IR)
__pte |= IOMMU_PTE_IR;
if (prot & IOMMU_PROT_IW)
__pte |= IOMMU_PTE_IW;

- *pte = __pte;
+ for (i = 0; i < count; ++i)
+ pte[i] = __pte;

update_domain(dom);

@@ -877,7 +890,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
for (addr = e->address_start; addr < e->address_end;
addr += PAGE_SIZE) {
ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
- PM_MAP_4k);
+ PAGE_SIZE);
if (ret)
return ret;
/*
@@ -1005,7 +1018,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
u64 *pte, *pte_page;

for (i = 0; i < num_ptes; ++i) {
- pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k,
+ pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE,
&pte_page, gfp);
if (!pte)
goto out_free;
@@ -1711,7 +1724,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,

pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
if (!pte) {
- pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page,
+ pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page,
GFP_ATOMIC);
aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
} else
@@ -2457,7 +2470,7 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
paddr &= PAGE_MASK;

for (i = 0; i < npages; ++i) {
- ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
+ ret = iommu_map_page(domain, iova, paddr, prot, PAGE_SIZE);
if (ret)
return ret;
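
A small worked example of the new macros (the numbers assume the AMD
IOMMU's 4 KiB base page and 9-bit page-table levels; the helper is
purely illustrative and not part of the patch):

	static void page_size_macro_examples(void)
	{
		/* 2 MiB: one PTE, one level up in the page table */
		WARN_ON(PAGE_SIZE_LEVEL(0x200000UL) != 1);
		WARN_ON(PAGE_SIZE_PTE_COUNT(0x200000UL) != 1);

		/* 32 KiB: an intermediate size; it stays at level 0 but is
		 * encoded as 8 identical PTEs, which is why iommu_map_page()
		 * now writes 'count' copies of __pte */
		WARN_ON(PAGE_SIZE_LEVEL(0x8000UL) != 0);
		WARN_ON(PAGE_SIZE_PTE_COUNT(0x8000UL) != 8);
	}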

Joerg Roedel
Jan 28, 2010, 6:50:01 AM
These functions are no longer used and can be removed
safely. Their functionality is now provided by the
iommu_{un}map functions, which can also handle multiple
page sizes.

Signed-off-by: Joerg Roedel <joerg....@amd.com>
---

arch/x86/kernel/amd_iommu.c | 48 -------------------------------------------
drivers/base/iommu.c | 26 +---------------------
include/linux/iommu.h | 20 -----------------
3 files changed, 2 insertions(+), 92 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 0e068c9..d8da998 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2506,52 +2506,6 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
return ret;
}

-static int amd_iommu_map_range(struct iommu_domain *dom,
- unsigned long iova, phys_addr_t paddr,
- size_t size, int iommu_prot)
-{
- struct protection_domain *domain = dom->priv;
- unsigned long i, npages = iommu_num_pages(paddr, size, PAGE_SIZE);
- int prot = 0;
- int ret;
-
- if (iommu_prot & IOMMU_READ)
- prot |= IOMMU_PROT_IR;
- if (iommu_prot & IOMMU_WRITE)
- prot |= IOMMU_PROT_IW;
-
- iova &= PAGE_MASK;
- paddr &= PAGE_MASK;
-
- for (i = 0; i < npages; ++i) {
- ret = iommu_map_page(domain, iova, paddr, prot, PAGE_SIZE);
- if (ret)
- return ret;
-
- iova += PAGE_SIZE;
- paddr += PAGE_SIZE;
- }
-
- return 0;
-}
-
-static void amd_iommu_unmap_range(struct iommu_domain *dom,
- unsigned long iova, size_t size)
-{
-
- struct protection_domain *domain = dom->priv;
- unsigned long i, npages = iommu_num_pages(iova, size, PAGE_SIZE);
-
- iova &= PAGE_MASK;
-
- for (i = 0; i < npages; ++i) {
- iommu_unmap_page(domain, iova, PAGE_SIZE);
- iova += PAGE_SIZE;
- }
-
- iommu_flush_tlb_pde(domain);
-}
-

static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
phys_addr_t paddr, int gfp_order, int iommu_prot)
{
@@ -2616,8 +2570,6 @@ static struct iommu_ops amd_iommu_ops = {
.detach_dev = amd_iommu_detach_device,
.map = amd_iommu_map,
.unmap = amd_iommu_unmap,
- .map_range = amd_iommu_map_range,
- .unmap_range = amd_iommu_unmap_range,
.iova_to_phys = amd_iommu_iova_to_phys,
.domain_has_cap = amd_iommu_domain_has_cap,
};
diff --git a/drivers/base/iommu.c b/drivers/base/iommu.c

index 55d37e4..6e6b6a1 100644
--- a/drivers/base/iommu.c
+++ b/drivers/base/iommu.c
@@ -80,20 +80,6 @@ void iommu_detach_device(struct iommu_domain *domain, struct device *dev)
}
EXPORT_SYMBOL_GPL(iommu_detach_device);

-int iommu_map_range(struct iommu_domain *domain, unsigned long iova,
- phys_addr_t paddr, size_t size, int prot)
-{
- return iommu_ops->map_range(domain, iova, paddr, size, prot);
-}
-EXPORT_SYMBOL_GPL(iommu_map_range);
-
-void iommu_unmap_range(struct iommu_domain *domain, unsigned long iova,
- size_t size)
-{
- iommu_ops->unmap_range(domain, iova, size);
-}
-EXPORT_SYMBOL_GPL(iommu_unmap_range);
-
phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain,
unsigned long iova)
{
@@ -119,10 +105,7 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova,

BUG_ON((iova | paddr) & invalid_mask);

- if (iommu_ops->map)
- return iommu_ops->map(domain, iova, paddr, gfp_order, prot);
-
- return iommu_ops->map_range(domain, iova, paddr, size, prot);
+ return iommu_ops->map(domain, iova, paddr, gfp_order, prot);
}
EXPORT_SYMBOL_GPL(iommu_map);

@@ -136,11 +119,6 @@ int iommu_unmap(struct iommu_domain *domain, unsigned long iova, int gfp_order)

BUG_ON(iova & invalid_mask);

- if (iommu_ops->unmap)
- return iommu_ops->unmap(domain, iova, gfp_order);
-
- iommu_ops->unmap_range(domain, iova, size);
-
- return gfp_order;
+ return iommu_ops->unmap(domain, iova, gfp_order);
}
EXPORT_SYMBOL_GPL(iommu_unmap);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 5a7a3d8..be22ad8 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -40,10 +40,6 @@ struct iommu_ops {
phys_addr_t paddr, int gfp_order, int prot);
int (*unmap)(struct iommu_domain *domain, unsigned long iova,
int gfp_order);
- int (*map_range)(struct iommu_domain *domain, unsigned long iova,
- phys_addr_t paddr, size_t size, int prot);
- void (*unmap_range)(struct iommu_domain *domain, unsigned long iova,
- size_t size);
phys_addr_t (*iova_to_phys)(struct iommu_domain *domain,
unsigned long iova);
int (*domain_has_cap)(struct iommu_domain *domain,

@@ -60,10 +56,6 @@ extern int iommu_attach_device(struct iommu_domain *domain,
struct device *dev);
extern void iommu_detach_device(struct iommu_domain *domain,
struct device *dev);
-extern int iommu_map_range(struct iommu_domain *domain, unsigned long iova,
- phys_addr_t paddr, size_t size, int prot);
-extern void iommu_unmap_range(struct iommu_domain *domain, unsigned long iova,
- size_t size);
extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
phys_addr_t paddr, int gfp_order, int prot);
extern int iommu_unmap(struct iommu_domain *domain, unsigned long iova,
@@ -104,18 +96,6 @@ static inline void iommu_detach_device(struct iommu_domain *domain,
{
}

-static inline int iommu_map_range(struct iommu_domain *domain,
- unsigned long iova, phys_addr_t paddr,
- size_t size, int prot)
-{
- return -ENODEV;
-}
-
-static inline void iommu_unmap_range(struct iommu_domain *domain,
- unsigned long iova, size_t size)
-{
-}
-
static inline int iommu_map(struct iommu_domain *domain, unsigned long iova,
phys_addr_t paddr, int gfp_order, int prot)
{

Joerg Roedel
Jan 28, 2010, 6:50:01 AM
This patch changes the implementation of
kvm_iommu_map_pages to map the pages into the io virtual
address space with the host page size.

Signed-off-by: Joerg Roedel <joerg....@amd.com>
---

virt/kvm/iommu.c | 106 ++++++++++++++++++++++++++++++++++++++++++-----------
1 files changed, 84 insertions(+), 22 deletions(-)

diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 65a5143..92a434d 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -32,12 +32,27 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);
static void kvm_iommu_put_pages(struct kvm *kvm,
gfn_t base_gfn, unsigned long npages);

+static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn, unsigned long size)
+{
+ gfn_t end_gfn;
+ pfn_t pfn;
+
+ pfn = gfn_to_pfn_memslot(kvm, slot, gfn);
+ end_gfn = gfn + (size >> PAGE_SHIFT);
+ gfn += 1;
+
+ while (gfn < end_gfn)
+ gfn_to_pfn_memslot(kvm, slot, gfn++);
+
+ return pfn;
+}
+
int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
{
- gfn_t gfn = slot->base_gfn;
- unsigned long npages = slot->npages;
+ gfn_t gfn, end_gfn;
pfn_t pfn;
- int i, r = 0;
+ int r = 0;
struct iommu_domain *domain = kvm->arch.iommu_domain;
int flags;

@@ -45,31 +60,58 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
if (!domain)
return 0;

+ gfn = slot->base_gfn;
+ end_gfn = gfn + slot->npages;
+
flags = IOMMU_READ | IOMMU_WRITE;
if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)
flags |= IOMMU_CACHE;

- for (i = 0; i < npages; i++) {
- /* check if already mapped */
- if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn)))
+
+ while (gfn < end_gfn) {
+ unsigned long page_size;
+
+ /* Check if already mapped */
+ if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) {
+ gfn += 1;
continue;
+ }
+
+ /* Get the page size we could use to map */
+ page_size = kvm_host_page_size(kvm, gfn);
+
+ /* Make sure the page_size does not exceed the memslot */
+ while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn)
+ page_size >>= 1;
+
+ /* Make sure gfn is aligned to the page size we want to map */
+ while ((gfn << PAGE_SHIFT) & (page_size - 1))
+ page_size >>= 1;
+
+ /*
+ * Pin all pages we are about to map in memory. This is
+ * important because we unmap and unpin in 4kb steps later.
+ */
+ pfn = kvm_pin_pages(kvm, slot, gfn, page_size);
+
+ /* Map into IO address space */
+ r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn),
+ get_order(page_size), flags);
+
+ gfn += page_size >> PAGE_SHIFT;

- pfn = gfn_to_pfn_memslot(kvm, slot, gfn);
- r = iommu_map_range(domain,
- gfn_to_gpa(gfn),
- pfn_to_hpa(pfn),
- PAGE_SIZE, flags);
if (r) {
printk(KERN_ERR "kvm_iommu_map_address:"
"iommu failed to map pfn=%lx\n", pfn);
goto unmap_pages;
}
- gfn++;
+
}
+
return 0;

unmap_pages:
- kvm_iommu_put_pages(kvm, slot->base_gfn, i);
+ kvm_iommu_put_pages(kvm, slot->base_gfn, gfn);
return r;
}

@@ -186,27 +228,47 @@ out_unmap:
return r;
}

+static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages)
+{
+ unsigned long i;
+
+ for (i = 0; i < npages; ++i)
+ kvm_release_pfn_clean(pfn + i);
+}
+
static void kvm_iommu_put_pages(struct kvm *kvm,
gfn_t base_gfn, unsigned long npages)
{
- gfn_t gfn = base_gfn;
+ struct iommu_domain *domain;
+ gfn_t end_gfn, gfn;
pfn_t pfn;
- struct iommu_domain *domain = kvm->arch.iommu_domain;
- unsigned long i;
u64 phys;

+ domain = kvm->arch.iommu_domain;
+ end_gfn = base_gfn + npages;
+ gfn = base_gfn;
+
/* check if iommu exists and in use */
if (!domain)
return;

- for (i = 0; i < npages; i++) {
+ while (gfn < end_gfn) {
+ unsigned long unmap_pages;
+ int order;
+
+ /* Get physical address */
phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn));
- pfn = phys >> PAGE_SHIFT;
- kvm_release_pfn_clean(pfn);
- gfn++;
- }
+ pfn = phys >> PAGE_SHIFT;
+
+ /* Unmap address from IO address space */
+ order = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE);
+ unmap_pages = 1ULL << order;

- iommu_unmap_range(domain, gfn_to_gpa(base_gfn), PAGE_SIZE * npages);
+ /* Unpin all pages we just unmapped to not leak any memory */
+ kvm_unpin_pages(kvm, pfn, unmap_pages);
+
+ gfn += unmap_pages;
+ }
}

static int kvm_iommu_unmap_memslots(struct kvm *kvm)
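
To make the two clamping loops in kvm_iommu_map_pages() concrete, a
hedged example with made-up numbers (x86, 4 KiB guest pages, slot large
enough that the first loop changes nothing):

	gfn = 0x10280;
	page_size = kvm_host_page_size(kvm, gfn);	/* say 2 MiB (0x200000) */

	/* 0x10280 << PAGE_SHIFT = 0x10280000 is not 2 MiB or 1 MiB aligned,
	 * so the alignment loop halves page_size twice and settles on
	 * 512 KiB; the iommu_map() call then covers 128 gfns at once. */
	while ((gfn << PAGE_SHIFT) & (page_size - 1))
		page_size >>= 1;			/* -> 0x80000 */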

Joerg Roedel
Jan 28, 2010, 6:50:02 AM
This patch changes the iommu-api functions for mapping and
unmapping page ranges to use the new page-size based
interface. This allows the range-based functions to be
removed later.

Signed-off-by: Joerg Roedel <joerg....@amd.com>
---

drivers/pci/intel-iommu.c | 22 ++++++++++++----------
1 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index a714e3d..f1d3b85 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -3626,14 +3626,15 @@ static void intel_iommu_detach_device(struct iommu_domain *domain,
domain_remove_one_dev_info(dmar_domain, pdev);
}

-static int intel_iommu_map_range(struct iommu_domain *domain,
- unsigned long iova, phys_addr_t hpa,
- size_t size, int iommu_prot)
+static int intel_iommu_map(struct iommu_domain *domain,
+ unsigned long iova, phys_addr_t hpa,
+ int gfp_order, int iommu_prot)
{
struct dmar_domain *dmar_domain = domain->priv;
u64 max_addr;
int addr_width;
int prot = 0;
+ size_t size;
int ret;

if (iommu_prot & IOMMU_READ)
@@ -3643,6 +3644,7 @@ static int intel_iommu_map_range(struct iommu_domain *domain,
if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
prot |= DMA_PTE_SNP;

+ size = 0x1000UL << gfp_order;
max_addr = iova + size;
if (dmar_domain->max_addr < max_addr) {
int min_agaw;
@@ -3669,19 +3671,19 @@ static int intel_iommu_map_range(struct iommu_domain *domain,
return ret;
}

-static void intel_iommu_unmap_range(struct iommu_domain *domain,
- unsigned long iova, size_t size)
+static int intel_iommu_unmap(struct iommu_domain *domain,
+ unsigned long iova, int gfp_order)
{
struct dmar_domain *dmar_domain = domain->priv;
-
- if (!size)
- return;
+ size_t size = 0x1000UL << gfp_order;

dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
(iova + size - 1) >> VTD_PAGE_SHIFT);

if (dmar_domain->max_addr == iova + size)
dmar_domain->max_addr = iova;
+
+ return gfp_order;
}

static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
@@ -3714,8 +3716,8 @@ static struct iommu_ops intel_iommu_ops = {
.domain_destroy = intel_iommu_domain_destroy,
.attach_dev = intel_iommu_attach_device,
.detach_dev = intel_iommu_detach_device,
- .map_range = intel_iommu_map_range,
- .unmap_range = intel_iommu_unmap_range,
+ .map = intel_iommu_map,
+ .unmap = intel_iommu_unmap,
.iova_to_phys = intel_iommu_iova_to_phys,
.domain_has_cap = intel_iommu_domain_has_cap,
};

Joerg Roedel
Jan 28, 2010, 6:50:02 AM
This patch adds new callbacks for mapping and unmapping
pages to the iommu_ops structure. These callbacks are aware
of page sizes, which makes them different from the
->{un}map_range callbacks.

Signed-off-by: Joerg Roedel <joerg....@amd.com>
---

drivers/base/iommu.c | 6 ++++++
include/linux/iommu.h | 4 ++++
2 files changed, 10 insertions(+), 0 deletions(-)

diff --git a/drivers/base/iommu.c b/drivers/base/iommu.c
index cf7cbec..55d37e4 100644
--- a/drivers/base/iommu.c
+++ b/drivers/base/iommu.c
@@ -119,6 +119,9 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova,

BUG_ON((iova | paddr) & invalid_mask);

+ if (iommu_ops->map)
+ return iommu_ops->map(domain, iova, paddr, gfp_order, prot);
+
return iommu_ops->map_range(domain, iova, paddr, size, prot);
}

EXPORT_SYMBOL_GPL(iommu_map);
@@ -133,6 +136,9 @@ int iommu_unmap(struct iommu_domain *domain, unsigned long iova, int gfp_order)

BUG_ON(iova & invalid_mask);

+ if (iommu_ops->unmap)
+ return iommu_ops->unmap(domain, iova, gfp_order);
+
iommu_ops->unmap_range(domain, iova, size);

return gfp_order;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 6d0035b..5a7a3d8 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -36,6 +36,10 @@ struct iommu_ops {
void (*domain_destroy)(struct iommu_domain *domain);
int (*attach_dev)(struct iommu_domain *domain, struct device *dev);
void (*detach_dev)(struct iommu_domain *domain, struct device *dev);
+ int (*map)(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, int gfp_order, int prot);
+ int (*unmap)(struct iommu_domain *domain, unsigned long iova,
+ int gfp_order);
int (*map_range)(struct iommu_domain *domain, unsigned long iova,
phys_addr_t paddr, size_t size, int prot);
void (*unmap_range)(struct iommu_domain *domain, unsigned long iova,

Joerg Roedel
Jan 28, 2010, 6:50:02 AM
These two functions provide support for mapping and
unmapping physical addresses to io virtual addresses. The
difference from iommu_(un)map_range() is that the new
functions take a gfp_order parameter instead of a size. This
makes it easier for the IOMMU backend implementations to
detect whether a given range can be mapped by larger page
sizes.
These new functions should replace the old ones in the long
term.

Signed-off-by: Joerg Roedel <joerg....@amd.com>
---

drivers/base/iommu.c | 31 +++++++++++++++++++++++++++++++
include/linux/iommu.h | 16 ++++++++++++++++
2 files changed, 47 insertions(+), 0 deletions(-)

diff --git a/drivers/base/iommu.c b/drivers/base/iommu.c
index f4c86c4..cf7cbec 100644
--- a/drivers/base/iommu.c
+++ b/drivers/base/iommu.c
@@ -107,3 +107,34 @@ int iommu_domain_has_cap(struct iommu_domain *domain,
return iommu_ops->domain_has_cap(domain, cap);
}
EXPORT_SYMBOL_GPL(iommu_domain_has_cap);
+
+int iommu_map(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, int gfp_order, int prot)
+{
+ unsigned long invalid_mask;
+ size_t size;
+
+ size = 0x1000UL << gfp_order;
+ invalid_mask = size - 1;
+
+ BUG_ON((iova | paddr) & invalid_mask);
+
+ return iommu_ops->map_range(domain, iova, paddr, size, prot);
+}
+EXPORT_SYMBOL_GPL(iommu_map);
+
+int iommu_unmap(struct iommu_domain *domain, unsigned long iova, int gfp_order)
+{
+ unsigned long invalid_mask;
+ size_t size;
+
+ size = 0x1000UL << gfp_order;
+ invalid_mask = size - 1;
+
+ BUG_ON(iova & invalid_mask);
+
+ iommu_ops->unmap_range(domain, iova, size);
+
+ return gfp_order;
+}
+EXPORT_SYMBOL_GPL(iommu_unmap);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 0f18f37..6d0035b 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -60,6 +60,10 @@ extern int iommu_map_range(struct iommu_domain *domain, unsigned long iova,
phys_addr_t paddr, size_t size, int prot);
extern void iommu_unmap_range(struct iommu_domain *domain, unsigned long iova,
size_t size);
+extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, int gfp_order, int prot);
+extern int iommu_unmap(struct iommu_domain *domain, unsigned long iova,
+ int gfp_order);
extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain,
unsigned long iova);
extern int iommu_domain_has_cap(struct iommu_domain *domain,
@@ -108,6 +112,18 @@ static inline void iommu_unmap_range(struct iommu_domain *domain,
{
}

+static inline int iommu_map(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, int gfp_order, int prot)
+{
+ return -ENODEV;
+}
+
+static inline int iommu_unmap(struct iommu_domain *domain, unsigned long iova,
+ int gfp_order)
+{
+ return -ENODEV;
+}
+
static inline phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain,
unsigned long iova)
{
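
A minimal caller-side sketch of the new interface (domain, iova and
paddr are hypothetical, error handling omitted): a naturally aligned
2 MiB region is expressed through its order, just as with alloc_pages():

	int order = get_order(2 * 1024 * 1024);	/* 9 with 4 KiB pages */
	int ret;

	ret = iommu_map(domain, iova, paddr, order, IOMMU_READ | IOMMU_WRITE);
	if (ret == 0)
		iommu_unmap(domain, iova, order);	/* returns the unmapped order */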

David Woodhouse
Jan 28, 2010, 4:10:01 PM
On Thu, 2010-01-28 at 12:37 +0100, Joerg Roedel wrote:
> This patch changes the iommu-api functions for mapping and
> unmapping page ranges to use the new page-size based
> interface. This allows to remove the range based functions
> later.

> + size = 0x1000UL << gfp_order;

Um, that's not a page-size based interface. Page size isn't always 4KiB;
this code runs on IA64 too.

We have enough fun with CPU vs. DMA page size on IA64 already :)

--
David Woodhouse Open Source Technology Centre
David.W...@intel.com Intel Corporation
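
A hedged numeric illustration of the concern (values only, not taken
from the thread): with 64 KiB base pages, as IA64 can be configured,
the two expressions diverge for the same order:

	unsigned long a = 0x1000UL << 2;	/*  16 KiB - hard-codes 4 KiB pages     */
	unsigned long b = PAGE_SIZE << 2;	/* 256 KiB - what alloc_pages(2) covers */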

Marcelo Tosatti
Jan 28, 2010, 5:30:02 PM
On Thu, Jan 28, 2010 at 12:37:57PM +0100, Joerg Roedel wrote:
> This patch changes the implementation of of
> kvm_iommu_map_pages to map the pages with the host page size
> into the io virtual address space.
>
> Signed-off-by: Joerg Roedel <joerg....@amd.com>
> ---
> virt/kvm/iommu.c | 106 ++++++++++++++++++++++++++++++++++++++++++-----------
> 1 files changed, 84 insertions(+), 22 deletions(-)
>
> diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
> index 65a5143..92a434d 100644
> --- a/virt/kvm/iommu.c
> +++ b/virt/kvm/iommu.c
> @@ -32,12 +32,27 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);
> static void kvm_iommu_put_pages(struct kvm *kvm,
> gfn_t base_gfn, unsigned long npages);
>
> +static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
> + gfn_t gfn, unsigned long size)
> +{
> + gfn_t end_gfn;
> + pfn_t pfn;
> +
> + pfn = gfn_to_pfn_memslot(kvm, slot, gfn);

If gfn_to_pfn_memslot returns pfn of bad_page, you might create a
large iommu translation for it?

> + /* Map into IO address space */
> + r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn),
> + get_order(page_size), flags);
> +
> + gfn += page_size >> PAGE_SHIFT;

Should increase gfn after checking for failure, otherwise wrong
npages is passed to kvm_iommu_put_pages.

>
> - pfn = gfn_to_pfn_memslot(kvm, slot, gfn);
> - r = iommu_map_range(domain,
> - gfn_to_gpa(gfn),
> - pfn_to_hpa(pfn),
> - PAGE_SIZE, flags);
> if (r) {
> printk(KERN_ERR "kvm_iommu_map_address:"
> "iommu failed to map pfn=%lx\n", pfn);
> goto unmap_pages;
> }
> - gfn++;
> +
> }
> +
> return 0;
>
> unmap_pages:
> - kvm_iommu_put_pages(kvm, slot->base_gfn, i);
> + kvm_iommu_put_pages(kvm, slot->base_gfn, gfn);
> return r;
> }


Joerg Roedel
Jan 29, 2010, 4:10:02 AM
On Fri, Jan 29, 2010 at 09:59:42AM +1300, David Woodhouse wrote:
> On Thu, 2010-01-28 at 12:37 +0100, Joerg Roedel wrote:
> > This patch changes the iommu-api functions for mapping and
> > unmapping page ranges to use the new page-size based
> > interface. This allows to remove the range based functions
> > later.
>
> > + size = 0x1000UL << gfp_order;
>
> Um, that's not a page-size based interface. Page size isn't always 4KiB;
> this code runs on IA64 too.
>
> We have enough fun with CPU vs. DMA page size on IA64 already :)

Ah right. So this should be

size = PAGE_SIZE << gfp_order;

Right? The interface is meant to map the same amount of memory which
alloc_pages(gfp_order) would return. Same for the return value of the
unmap function.

Joerg

Joerg Roedel
Jan 29, 2010, 4:40:03 AM
On Thu, Jan 28, 2010 at 08:24:55PM -0200, Marcelo Tosatti wrote:
> On Thu, Jan 28, 2010 at 12:37:57PM +0100, Joerg Roedel wrote:
> > +static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
> > + gfn_t gfn, unsigned long size)
> > +{
> > + gfn_t end_gfn;
> > + pfn_t pfn;
> > +
> > + pfn = gfn_to_pfn_memslot(kvm, slot, gfn);
>
> If gfn_to_pfn_memslot returns pfn of bad_page, you might create a
> large iommu translation for it?

Right. But that was broken even before this patch. Anyway, I will fix
it.

> > + /* Map into IO address space */
> > + r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn),
> > + get_order(page_size), flags);
> > +
> > + gfn += page_size >> PAGE_SHIFT;
>
> Should increase gfn after checking for failure, otherwise wrong
> npages is passed to kvm_iommu_put_pages.

True. Will fix that too.

Thanks,

Joerg

Joerg Roedel
Feb 1, 2010, 9:20:02 AM
On Fri, Jan 29, 2010 at 10:05:26AM +0100, Joerg Roedel wrote:
> > Um, that's not a page-size based interface. Page size isn't always 4KiB;
> > this code runs on IA64 too.
> >
> > We have enough fun with CPU vs. DMA page size on IA64 already :)
>
> Ah right. So this should be
>
> size = PAGE_SIZE << gfp_order;
>
> Right? The interface is meant to map the same amount of memory which
> alloc_pages(gfp_order) would return. Same for the return value of the
> unmap function.

Ok, here is an updated patch (also updated in the iommu/largepage
branch). Does it look ok to you, David?

From a3ef8393d8027c795709e11b2f57c6013d2474a6 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg....@amd.com>
Date: Wed, 20 Jan 2010 17:17:37 +0100
Subject: [PATCH 04/11] VT-d: Change {un}map_range functions to implement {un}map interface

This patch changes the iommu-api functions for mapping and
unmapping page ranges to use the new page-size based
interface. This allows the range-based functions to be
removed later.

Signed-off-by: Joerg Roedel <joerg....@amd.com>


---
drivers/pci/intel-iommu.c | 22 ++++++++++++----------
1 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index a714e3d..371dc56 100644


--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -3626,14 +3626,15 @@ static void intel_iommu_detach_device(struct iommu_domain *domain,
domain_remove_one_dev_info(dmar_domain, pdev);
}

-static int intel_iommu_map_range(struct iommu_domain *domain,
- unsigned long iova, phys_addr_t hpa,
- size_t size, int iommu_prot)
+static int intel_iommu_map(struct iommu_domain *domain,
+ unsigned long iova, phys_addr_t hpa,
+ int gfp_order, int iommu_prot)
{
struct dmar_domain *dmar_domain = domain->priv;
u64 max_addr;
int addr_width;
int prot = 0;
+ size_t size;
int ret;

if (iommu_prot & IOMMU_READ)
@@ -3643,6 +3644,7 @@ static int intel_iommu_map_range(struct iommu_domain *domain,
if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
prot |= DMA_PTE_SNP;

+ size = PAGE_SIZE << gfp_order;
max_addr = iova + size;
if (dmar_domain->max_addr < max_addr) {
int min_agaw;
@@ -3669,19 +3671,19 @@ static int intel_iommu_map_range(struct iommu_domain *domain,
return ret;
}

-static void intel_iommu_unmap_range(struct iommu_domain *domain,
- unsigned long iova, size_t size)
+static int intel_iommu_unmap(struct iommu_domain *domain,
+ unsigned long iova, int gfp_order)
{
struct dmar_domain *dmar_domain = domain->priv;
-
- if (!size)
- return;
+ size_t size = PAGE_SIZE << gfp_order;

dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
(iova + size - 1) >> VTD_PAGE_SHIFT);

if (dmar_domain->max_addr == iova + size)
dmar_domain->max_addr = iova;
+
+ return gfp_order;
}

static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
@@ -3714,8 +3716,8 @@ static struct iommu_ops intel_iommu_ops = {
.domain_destroy = intel_iommu_domain_destroy,
.attach_dev = intel_iommu_attach_device,
.detach_dev = intel_iommu_detach_device,
- .map_range = intel_iommu_map_range,
- .unmap_range = intel_iommu_unmap_range,
+ .map = intel_iommu_map,
+ .unmap = intel_iommu_unmap,
.iova_to_phys = intel_iommu_iova_to_phys,
.domain_has_cap = intel_iommu_domain_has_cap,
};
--
1.6.6

Joerg Roedel
Feb 1, 2010, 9:20:02 AM
On Fri, Jan 29, 2010 at 10:32:33AM +0100, Joerg Roedel wrote:
> On Thu, Jan 28, 2010 at 08:24:55PM -0200, Marcelo Tosatti wrote:
> > On Thu, Jan 28, 2010 at 12:37:57PM +0100, Joerg Roedel wrote:
> > > +static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
> > > + gfn_t gfn, unsigned long size)
> > > +{
> > > + gfn_t end_gfn;
> > > + pfn_t pfn;
> > > +
> > > + pfn = gfn_to_pfn_memslot(kvm, slot, gfn);
> >
> > If gfn_to_pfn_memslot returns pfn of bad_page, you might create a
> > large iommu translation for it?
>
> Right. But that was broken even before this patch. Anyway, I will fix
> it.
>
> > > + /* Map into IO address space */
> > > + r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn),
> > > + get_order(page_size), flags);
> > > +
> > > + gfn += page_size >> PAGE_SHIFT;
> >
> > Should increase gfn after checking for failure, otherwise wrong
> > npages is passed to kvm_iommu_put_pages.
>
> True. Will fix that too.

Here is the updated patch (also updated in the iommu/largepage branch of
my tree). Does it look ok?

From fa921fe46a92dadf7f66391b519cb9ca92e2ee83 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg....@amd.com>
Date: Mon, 11 Jan 2010 16:38:18 +0100
Subject: [PATCH 06/11] kvm: Change kvm_iommu_map_pages to map large pages

This patch changes the implementation of
kvm_iommu_map_pages to map the pages into the io virtual
address space with the host page size.

Signed-off-by: Joerg Roedel <joerg....@amd.com>
---

virt/kvm/iommu.c | 113 +++++++++++++++++++++++++++++++++++++++++++----------
1 files changed, 91 insertions(+), 22 deletions(-)

diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 65a5143..f78286e 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -32,12 +32,30 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);


static void kvm_iommu_put_pages(struct kvm *kvm,
gfn_t base_gfn, unsigned long npages);

+static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn, unsigned long size)
+{
+ gfn_t end_gfn;
+ pfn_t pfn;
+
+ pfn = gfn_to_pfn_memslot(kvm, slot, gfn);
+ end_gfn = gfn + (size >> PAGE_SHIFT);
+ gfn += 1;
+
+ if (is_error_pfn(pfn))
+ return pfn;
+
+ while (gfn < end_gfn)
+ gfn_to_pfn_memslot(kvm, slot, gfn++);
+
+ return pfn;
+}
+
int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
{
- gfn_t gfn = slot->base_gfn;
- unsigned long npages = slot->npages;
+ gfn_t gfn, end_gfn;
pfn_t pfn;
- int i, r = 0;
+ int r = 0;
struct iommu_domain *domain = kvm->arch.iommu_domain;
int flags;

@@ -45,31 +63,62 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)


if (!domain)
return 0;

+ gfn = slot->base_gfn;
+ end_gfn = gfn + slot->npages;
+
flags = IOMMU_READ | IOMMU_WRITE;
if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)
flags |= IOMMU_CACHE;

- for (i = 0; i < npages; i++) {
- /* check if already mapped */
- if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn)))
+
+ while (gfn < end_gfn) {
+ unsigned long page_size;
+
+ /* Check if already mapped */
+ if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) {
+ gfn += 1;
+ continue;
+ }
+
+ /* Get the page size we could use to map */
+ page_size = kvm_host_page_size(kvm, gfn);
+
+ /* Make sure the page_size does not exceed the memslot */
+ while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn)
+ page_size >>= 1;
+
+ /* Make sure gfn is aligned to the page size we want to map */
+ while ((gfn << PAGE_SHIFT) & (page_size - 1))
+ page_size >>= 1;
+
+ /*
+ * Pin all pages we are about to map in memory. This is
+ * important because we unmap and unpin in 4kb steps later.
+ */
+ pfn = kvm_pin_pages(kvm, slot, gfn, page_size);
+ if (is_error_pfn(pfn)) {
+ gfn += 1;
continue;
+ }

- pfn = gfn_to_pfn_memslot(kvm, slot, gfn);
- r = iommu_map_range(domain,
- gfn_to_gpa(gfn),
- pfn_to_hpa(pfn),
- PAGE_SIZE, flags);

+ /* Map into IO address space */
+ r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn),
+ get_order(page_size), flags);

if (r) {
printk(KERN_ERR "kvm_iommu_map_address:"
"iommu failed to map pfn=%lx\n", pfn);
goto unmap_pages;
}
- gfn++;
+
+ gfn += page_size >> PAGE_SHIFT;
+
+
return 0;

unmap_pages:
- kvm_iommu_put_pages(kvm, slot->base_gfn, i);
+ kvm_iommu_put_pages(kvm, slot->base_gfn, gfn);
return r;
}

@@ -186,27 +235,47 @@ out_unmap:


return r;
}

+static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages)
+{
+ unsigned long i;
+
+ for (i = 0; i < npages; ++i)
+ kvm_release_pfn_clean(pfn + i);
+}
+

static void kvm_iommu_put_pages(struct kvm *kvm,
gfn_t base_gfn, unsigned long npages)

static int kvm_iommu_unmap_memslots(struct kvm *kvm)

--
1.6.6

Marcelo Tosatti
Feb 1, 2010, 2:40:02 PM

Yes, addresses the concern.

Joerg Roedel
Feb 5, 2010, 6:10:01 AM
Hi David,

On Mon, Feb 01, 2010 at 03:16:46PM +0100, Joerg Roedel wrote:
> On Fri, Jan 29, 2010 at 10:05:26AM +0100, Joerg Roedel wrote:
> > > Um, that's not a page-size based interface. Page size isn't always 4KiB;
> > > this code runs on IA64 too.
> > >
> > > We have enough fun with CPU vs. DMA page size on IA64 already :)
> >
> > Ah right. So this should be
> >
> > size = PAGE_SIZE << gfp_order;
> >
> > Right? The interface is meant to map the same amount of memory which
> > alloc_pages(gfp_order) would return. Same for the return value of the
> > unmap function.
>
> Ok, here is an updated patch (also updated in the iommu/largepage
> branch). Does it look ok to you David?

have you had a chance to look at the new version of the patch? If you
are fine with it and with the overall concept of this patchset, it would
be cool if you could give your Acks. I would like to send this code to
Linus in the next merge window.

Thanks,

Joerg

Joerg Roedel
Feb 5, 2010, 6:10:02 AM
Hi Marcelo, Avi,

Are there any further objections to this patchset? If not, it would
be cool if you could give me some acks for the kvm-specific parts.

Joerg

Avi Kivity
Feb 7, 2010, 4:40:02 AM
On 01/28/2010 01:37 PM, Joerg Roedel wrote:
> These two functions provide support for mapping and
> unmapping physical addresses to io virtual addresses. The
> difference to the iommu_(un)map_range() is that the new
> functions take a gfp_order parameter instead of a size. This
> allows the IOMMU backend implementations to detect easier if
> a given range can be mapped by larger page sizes.
> These new functions should replace the old ones in the long
> term.
>

These seem to be less flexible in the long term. Sure, it is easier for
the backend to map to multiple page sizes if your iommu supports
arbitrary power-of-two page sizes, but we should make the APIs easier
for the callers, not the backend.


--
error compiling committee.c: too many arguments to function

Avi Kivity
Feb 7, 2010, 6:00:02 AM
On 02/07/2010 12:50 PM, Joerg Roedel wrote:

> On Sun, Feb 07, 2010 at 11:38:30AM +0200, Avi Kivity wrote:
>
>> On 01/28/2010 01:37 PM, Joerg Roedel wrote:
>>
>>> These two functions provide support for mapping and
>>> unmapping physical addresses to io virtual addresses. The
>>> difference to the iommu_(un)map_range() is that the new
>>> functions take a gfp_order parameter instead of a size. This
>>> allows the IOMMU backend implementations to detect easier if
>>> a given range can be mapped by larger page sizes.
>>> These new functions should replace the old ones in the long
>>> term.
>>>
>>>
>> These seem to be less flexible in the long term. Sure, it is easier for
>> the backend to map to multiple page sizes if your iommu supports
>> arbitrary power-of-two page sizes, but we should make the APIs easier
>> for the callers, not the backend.
>>
> These functions are as flexible as the old ones which just tok a size.
> The benefit of the new interface is that is makes the ability of the
> IOMMU to map the area with a large page (an get the benefit of fewer
> hardware tlb walks) visible to the caller. With the old interface the
> caller is tempted to just map ist whole area using 4kb page sizes.
> It gives a bit more complexity to the caller, thats right. But given
> that the page allocator in Linux always returns pages which are aligned
> to its size takes a lot of that complexity away.
>
>

You are right - I was thinking of the kvm use case which is range
oriented, but the ordinary kernel interfaces are gfp_order oriented.

Joerg Roedel
Feb 7, 2010, 6:00:02 AM
On Sun, Feb 07, 2010 at 11:38:30AM +0200, Avi Kivity wrote:
> On 01/28/2010 01:37 PM, Joerg Roedel wrote:
>> These two functions provide support for mapping and
>> unmapping physical addresses to io virtual addresses. The
>> difference to the iommu_(un)map_range() is that the new
>> functions take a gfp_order parameter instead of a size. This
>> allows the IOMMU backend implementations to detect easier if
>> a given range can be mapped by larger page sizes.
>> These new functions should replace the old ones in the long
>> term.
>>
>
> These seem to be less flexible in the long term. Sure, it is easier for
> the backend to map to multiple page sizes if your iommu supports
> arbitrary power-of-two page sizes, but we should make the APIs easier
> for the callers, not the backend.

These functions are as flexible as the old ones, which just took a size.
The benefit of the new interface is that it makes the ability of the
IOMMU to map the area with a large page (and get the benefit of fewer
hardware tlb walks) visible to the caller. With the old interface the
caller is tempted to just map its whole area using 4kb page sizes.
It gives a bit more complexity to the caller, that's right. But given
that the page allocator in Linux always returns pages which are aligned
to their size, a lot of that complexity goes away.

Joerg
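
A rough sketch of the caller-side logic described above (map_region()
is a hypothetical helper, modeled on the KVM patch earlier in the
thread): split a range into the largest naturally aligned power-of-two
chunks and feed each chunk's order to iommu_map():

	static int map_region(struct iommu_domain *dom, unsigned long iova,
			      phys_addr_t paddr, size_t size, int prot)
	{
		int ret;

		/* iova, paddr and size are assumed to be PAGE_SIZE aligned */
		while (size) {
			unsigned long chunk = PAGE_SIZE;

			/* grow the chunk while the start stays naturally
			 * aligned and it still fits the remaining size */
			while (!((iova | paddr) & chunk) && (chunk << 1) <= size)
				chunk <<= 1;

			ret = iommu_map(dom, iova, paddr, get_order(chunk), prot);
			if (ret)
				return ret;

			iova  += chunk;
			paddr += chunk;
			size  -= chunk;
		}

		return 0;
	}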

Avi Kivity
Feb 7, 2010, 7:20:01 AM

How could it return a bad_page? The whole thing is called for a valid slot.

--
error compiling committee.c: too many arguments to function


Avi Kivity
Feb 7, 2010, 7:20:01 AM
On 01/28/2010 01:37 PM, Joerg Roedel wrote:
> This patch introduces a generic function to find out the
> host page size for a given gfn. This function is needed by
> the kvm iommu code. This patch also simplifies the x86
> host_mapping_level function.
>

Applied to kvm.git since this makes sense independently of the rest of
your patchset.

--
error compiling committee.c: too many arguments to function


Avi Kivity
Feb 7, 2010, 7:30:02 AM
On 02/05/2010 01:01 PM, Joerg Roedel wrote:
>
>> Yes, addresses the concern.
>>
> Are there any further objections against this patchset? If not it would
> be cool if you could give me some acks for the kvm specific parts of
> this patchset.
>

There are two ways we can get the kvm bits in:

- you publish a git tree excluding the kvm patches, I merge your tree,
then apply the kvm specific patches. This is best for avoiding
merge-time conflicts if I get other patches for the same area.
- you push everything including the kvm bits.

I'm fine with both, though prefer the former. If you choose the latter,
ACK for the kvm patches.

--
error compiling committee.c: too many arguments to function


Joerg Roedel
Feb 7, 2010, 7:50:01 AM
On Sun, Feb 07, 2010 at 02:22:35PM +0200, Avi Kivity wrote:
> On 02/05/2010 01:01 PM, Joerg Roedel wrote:
>>
>>> Yes, addresses the concern.
>>>
>> Are there any further objections against this patchset? If not it would
>> be cool if you could give me some acks for the kvm specific parts of
>> this patchset.
>>
>
> There are two ways we can get the kvm bits in:
>
> - you publish a git tree excluding the kvm patches, I merge your tree,
> then apply the kvm specific patches. This is best for avoiding
> merge-time conflicts if I get other patches for the same area.
> - you push everything including the kvm bits.
>
> I'm fine with both, though prefer the former. If you choose the latter,
> ACK for the kvm patches.

Hmm, the patch set is self-contained and hard to split. We had the same
problem with the initial iommu-api merge. Since I have further updates
for the AMD IOMMU driver, I think it's best to send these patches and
my updates directly to Linus after the KVM updates are merged in the
next merge window. Thanks for your ACKs :-)

Joerg

Joerg Roedel
Feb 7, 2010, 9:20:02 AM
On Sun, Feb 07, 2010 at 02:09:36PM +0200, Avi Kivity wrote:
> On 01/28/2010 01:37 PM, Joerg Roedel wrote:
>> This patch introduces a generic function to find out the
>> host page size for a given gfn. This function is needed by
>> the kvm iommu code. This patch also simplifies the x86
>> host_mapping_level function.
>>
>
> Applied to kvm.git since this makes sense independently of the rest of
> your patchset.

Yes, makes sense for this patch. I will rebase my branch once you have
pushed it out.

Joerg

Marcelo Tosatti
Feb 7, 2010, 2:30:02 PM

Userspace can pass a valid slot with a read-only vma, for example.

Avi Kivity
Feb 8, 2010, 4:30:02 AM
On 02/07/2010 08:41 PM, Marcelo Tosatti wrote:
>
>> How could it return a bad_page? The whole thing is called for a valid slot.
>>
> Userspace can pass a valid slot with a read-only vma, for example.
>

Oh yes, or without a vma at all.

--
error compiling committee.c: too many arguments to function

