[PATCH] ntb: ntb perf tool

96 views
Skip to first unread message

Dave Jiang

unread,
Jan 5, 2016, 6:37:35 PM1/5/16
to allen...@emc.com, jdm...@kudzu.us, linu...@googlegroups.com
Providing raw performance data via a tool that directly accesses data from
NTB w/o any software overhead. This allows measurement of the hardware
performance limit. In revision one we are only doing single direction
CPU and DMA writes. Eventually we will provide bi-directional writes.

The measurement using DMA engine for NTB performance measure does
not measure the raw performance of DMA engine over NTB due to software
overhead. But it should provide the peak performance through the Linux DMA
driver.

Signed-off-by: Dave Jiang <dave....@intel.com>
---
drivers/ntb/test/Kconfig | 8
drivers/ntb/test/Makefile | 1
drivers/ntb/test/ntb_perf.c | 755 +++++++++++++++++++++++++++++++++++++++++++
3 files changed, 764 insertions(+)
create mode 100644 drivers/ntb/test/ntb_perf.c

diff --git a/drivers/ntb/test/Kconfig b/drivers/ntb/test/Kconfig
index 01852f9..aad598a 100644
--- a/drivers/ntb/test/Kconfig
+++ b/drivers/ntb/test/Kconfig
@@ -17,3 +17,11 @@ config NTB_TOOL
functioning at a basic level.

If unsure, say N.
+
+config NTB_PERF
+ tristate "NTB RAW Perf Measuring Tool"
+ help
+ This is a tool to measure raw NTB performance by transferring data
+ to and from the window without additional software interaction.
+
+ If unsure, say N.
diff --git a/drivers/ntb/test/Makefile b/drivers/ntb/test/Makefile
index 0ea32a3..9e77e0b 100644
--- a/drivers/ntb/test/Makefile
+++ b/drivers/ntb/test/Makefile
@@ -1,2 +1,3 @@
obj-$(CONFIG_NTB_PINGPONG) += ntb_pingpong.o
obj-$(CONFIG_NTB_TOOL) += ntb_tool.o
+obj-$(CONFIG_NTB_PERF) += ntb_perf.o
diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c
new file mode 100644
index 0000000..fe0f13c
--- /dev/null
+++ b/drivers/ntb/test/ntb_perf.c
@@ -0,0 +1,755 @@
+/*
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copy
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * PCIe NTB Perf Linux driver
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/time.h>
+#include <linux/timer.h>
+#include <linux/dma-mapping.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/dmaengine.h>
+#include <linux/delay.h>
+
+#include <linux/ntb.h>
+
+#define DRIVER_NAME "ntb_perf"
+#define DRIVER_DESCRIPTION "PCIe NTB Performance Measurement Tool"
+
+#define DRIVER_LICENSE "Dual BSD/GPL"
+#define DRIVER_VERSION "1.0"
+#define DRIVER_AUTHOR "Dave Jiang <dave....@intel.com>"
+
+#define PERF_LINK_DOWN_TIMEOUT 10
+#define PERF_VERSION 0xffff0001
+#define MAX_THREADS 32
+#define MAX_TEST_SIZE 1024*1024 /* 1M */
+#define MAX_SRCS 32
+#define DMA_OUT_RESOURCE_TO 50
+#define DMA_RETRIES 20
+
+MODULE_LICENSE(DRIVER_LICENSE);
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESCRIPTION);
+
+static struct dentry *perf_debugfs_dir;
+
+static unsigned int seg_order = 19; /* 512K */
+module_param(seg_order, uint, 0644);
+MODULE_PARM_DESC(seg_order, "size order [n^2] of buffer segment for testing");
+
+static unsigned int run_order = 32; /* 4G */
+module_param(run_order, uint, 0644);
+MODULE_PARM_DESC(run_order, "size order [n^2] of total data to transfer");
+
+static bool use_dma = false;
+module_param(use_dma, bool, 0644);
+MODULE_PARM_DESC(use_dma, "Using DMA engine to measure performance");
+
+struct perf_mw {
+ phys_addr_t phys_addr;
+ resource_size_t phys_size;
+ resource_size_t xlat_align;
+ resource_size_t xlat_align_size;
+ void __iomem *vbase;
+ size_t xlat_size;
+ size_t buf_size;
+ void *virt_addr;
+ dma_addr_t dma_addr;
+};
+
+struct perf_ctx;
+
+struct pthr_ctx {
+ struct task_struct *thread;
+ struct perf_ctx *perf;
+ atomic_t dma_sync;
+ struct dma_chan *dma_chan;
+ int dma_prep_err;
+ int src_idx;
+ void *srcs[MAX_SRCS];
+};
+
+struct perf_ctx {
+ struct ntb_dev *ntb;
+ spinlock_t db_lock;
+ struct perf_mw mw;
+ bool link_is_up;
+ struct work_struct link_cleanup;
+ struct delayed_work link_work;
+ struct dentry *debugfs_node_dir;
+ struct dentry *debugfs_run;
+ struct dentry *debugfs_threads;
+ u8 perf_threads;
+ bool run;
+ struct pthr_ctx pthr_ctx[MAX_THREADS];
+ atomic_t tsync;
+};
+
+enum {
+ VERSION = 0,
+ MW_SZ_HIGH,
+ MW_SZ_LOW,
+ SPAD_MSG,
+ SPAD_ACK,
+ MAX_SPAD
+};
+
+static void perf_link_event(void *ctx)
+{
+ struct perf_ctx *perf = ctx;
+
+ if (ntb_link_is_up(perf->ntb, NULL, NULL) == 1)
+ schedule_delayed_work(&perf->link_work, 0);
+ else
+ schedule_work(&perf->link_cleanup);
+}
+
+static void perf_db_event(void *ctx, int vec)
+{
+ struct perf_ctx *perf = ctx;
+ u64 db_bits, db_mask;
+
+ db_mask = ntb_db_vector_mask(perf->ntb, vec);
+ db_bits = ntb_db_read(perf->ntb);
+
+ dev_dbg(&perf->ntb->dev, "doorbell vec %d mask %#llx bits %#llx\n",
+ vec, db_mask, db_bits);
+}
+
+static const struct ntb_ctx_ops perf_ops = {
+ .link_event = perf_link_event,
+ .db_event = perf_db_event,
+};
+
+static void perf_copy_callback(void *data)
+{
+ struct pthr_ctx *pctx = data;
+
+ atomic_dec(&pctx->dma_sync);
+}
+
+static ssize_t perf_copy(struct pthr_ctx *pctx, char *dst,
+ char *src, size_t size)
+{
+ struct perf_ctx *perf = pctx->perf;
+ struct dma_async_tx_descriptor *txd;
+ struct dma_chan *chan = pctx->dma_chan;
+ struct dma_device *device;
+ struct dmaengine_unmap_data *unmap;
+ dma_cookie_t cookie;
+ size_t src_off, dst_off;
+ struct perf_mw *mw = &perf->mw;
+ u64 vbase, dst_vaddr;
+ dma_addr_t dst_phys;
+ int retries = 0;
+
+ if (!use_dma) {
+ memcpy_toio(dst, src, size);
+ return size;
+ }
+
+ if (!chan) {
+ dev_err(&perf->ntb->dev, "DMA engine does not exist\n");
+ return -EINVAL;
+ }
+
+ device = chan->device;
+ src_off = (size_t)src & ~PAGE_MASK;
+ dst_off = (size_t)dst & ~PAGE_MASK;
+
+ if (!is_dma_copy_aligned(device, src_off, dst_off, size))
+ return -ENODEV;
+
+ vbase = (u64)(u64 *)mw->vbase;
+ dst_vaddr = (u64)(u64 *)dst;
+ dst_phys = mw->phys_addr + (dst_vaddr - vbase);
+
+ unmap = dmaengine_get_unmap_data(device->dev, 1, GFP_NOWAIT);
+ if (!unmap)
+ return -ENOMEM;
+
+ unmap->len = size;
+ unmap->addr[0] = dma_map_page(device->dev, virt_to_page(src),
+ src_off, size, DMA_TO_DEVICE);
+ if (dma_mapping_error(device->dev, unmap->addr[0]))
+ goto err_get_unmap;
+
+ unmap->to_cnt = 1;
+
+ do {
+ txd = device->device_prep_dma_memcpy(chan, dst_phys,
+ unmap->addr[0],
+ size, DMA_PREP_INTERRUPT);
+ if (!txd) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(DMA_OUT_RESOURCE_TO);
+ }
+ } while (!txd && (++retries < DMA_RETRIES));
+
+ if (!txd) {
+ pctx->dma_prep_err++;
+ goto err_get_unmap;
+ }
+
+ txd->callback = perf_copy_callback;
+ txd->callback_param = pctx;
+ dma_set_unmap(txd, unmap);
+
+ cookie = dmaengine_submit(txd);
+ if (dma_submit_error(cookie))
+ goto err_set_unmap;
+
+ atomic_inc(&pctx->dma_sync);
+ dma_async_issue_pending(chan);
+
+ return size;
+
+err_set_unmap:
+ dmaengine_unmap_put(unmap);
+err_get_unmap:
+ dmaengine_unmap_put(unmap);
+ return 0;
+}
+
+static int perf_move_data(struct pthr_ctx *pctx, char *dst, char *src,
+ u64 buf_size, u64 win_size, u64 total)
+{
+ int chunks, total_chunks, i;
+ int copied_chunks = 0;
+ u64 copied = 0, result;
+ char *tmp = dst;
+ u64 perf, diff_us;
+ ktime_t kstart, kstop, kdiff;
+
+ chunks = win_size / buf_size;
+ total_chunks = total / buf_size;
+ kstart = ktime_get();
+
+ for (i = 0; i < total_chunks; i++) {
+ result = perf_copy(pctx, tmp, src, buf_size);
+ copied += result;
+ copied_chunks++;
+ if (copied_chunks == chunks) {
+ tmp = dst;
+ copied_chunks = 0;
+ } else
+ tmp += buf_size;
+
+ /* probably should schedule every 4GB? */
+ if (((copied % (1ULL << 32)) == 0) && !use_dma) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ }
+ }
+
+ if (use_dma)
+ pr_info("%s: All DMA descriptors submitted\n", current->comm);
+
+ while (atomic_read(&pctx->dma_sync) != 0)
+ msleep(1);
+
+ kstop = ktime_get();
+ kdiff = ktime_sub(kstop, kstart);
+ diff_us = ktime_to_us(kdiff);
+
+ pr_info("%s: copied %Lu bytes\n", current->comm, copied);
+
+ pr_info("%s: lasted %Lu usecs\n", current->comm, diff_us);
+
+ perf = copied / diff_us;
+
+ pr_info("%s: MBytes/s: %Lu\n", current->comm, perf);
+
+ return 0;
+}
+
+static bool perf_dma_filter_fn(struct dma_chan *chan, void *node)
+{
+ return dev_to_node(&chan->dev->device) == (int)(unsigned long)node;
+}
+
+static int ntb_perf_thread(void *data)
+{
+ struct pthr_ctx *pctx = data;
+ struct perf_ctx *perf = pctx->perf;
+ struct pci_dev *pdev = perf->ntb->pdev;
+ struct perf_mw *mw = &perf->mw;
+ char *dst;
+ u64 win_size, buf_size, total;
+ void *src;
+ int rc, node, i;
+ struct dma_chan *dma_chan = NULL;
+
+ pr_info("kthread %s starting...\n", current->comm);
+
+ node = dev_to_node(&pdev->dev);
+
+ if (use_dma && !pctx->dma_chan) {
+ dma_cap_mask_t dma_mask;
+
+ dma_cap_zero(dma_mask);
+ dma_cap_set(DMA_MEMCPY, dma_mask);
+ dma_chan = dma_request_channel(dma_mask, perf_dma_filter_fn,
+ (void *)(unsigned long)node);
+ if (!dma_chan) {
+ pr_warn("%s: cannot acquire DMA channel, quitting\n",
+ current->comm);
+ return -ENODEV;
+ }
+ pctx->dma_chan = dma_chan;
+ }
+
+ for (i = 0; i < MAX_SRCS; i++) {
+ pctx->srcs[i] = kmalloc_node(MAX_TEST_SIZE, GFP_KERNEL, node);
+ if (!pctx->srcs[i]) {
+ rc = -ENOMEM;
+ goto err;
+ }
+ }
+
+ win_size = mw->phys_size;
+ buf_size = 1ULL << seg_order;
+ total = 1ULL << run_order;
+
+ if (buf_size > MAX_TEST_SIZE)
+ buf_size = MAX_TEST_SIZE;
+
+ dst = (char *)mw->vbase;
+
+ atomic_inc(&perf->tsync);
+ while (atomic_read(&perf->tsync) != perf->perf_threads)
+ schedule();
+
+ src = pctx->srcs[pctx->src_idx];
+ pctx->src_idx = (pctx->src_idx + 1) & (MAX_SRCS - 1);
+
+ rc = perf_move_data(pctx, dst, src, buf_size, win_size, total);
+
+ atomic_dec(&perf->tsync);
+
+ if (rc < 0) {
+ pr_err("%s: failed\n", current->comm);
+ rc = -ENXIO;
+ goto err;
+ }
+
+ for (i = 0; i < MAX_SRCS; i++) {
+ if (pctx->srcs[i]) {
+ kfree(pctx->srcs[i]);
+ pctx->srcs[i] = NULL;
+ }
+ }
+
+ return 0;
+
+err:
+ for (i = 0; i < MAX_SRCS; i++) {
+ if (pctx->srcs[i]) {
+ kfree(pctx->srcs[i]);
+ pctx->srcs[i] = NULL;
+ }
+ }
+
+ if (dma_chan) {
+ dma_release_channel(dma_chan);
+ pctx->dma_chan = NULL;
+ }
+
+ return rc;
+}
+
+static void perf_free_mw(struct perf_ctx *perf)
+{
+ struct perf_mw *mw = &perf->mw;
+ struct pci_dev *pdev = perf->ntb->pdev;
+
+ if (!mw->virt_addr)
+ return;
+
+ ntb_mw_clear_trans(perf->ntb, 0);
+ dma_free_coherent(&pdev->dev, mw->buf_size,
+ mw->virt_addr, mw->dma_addr);
+ mw->xlat_size = 0;
+ mw->buf_size = 0;
+ mw->virt_addr = NULL;
+}
+
+static int perf_set_mw(struct perf_ctx *perf, resource_size_t size)
+{
+ struct perf_mw *mw = &perf->mw;
+ size_t xlat_size, buf_size;
+
+ if (!size)
+ return -EINVAL;
+
+ xlat_size = round_up(size, mw->xlat_align_size);
+ buf_size = round_up(size, mw->xlat_align);
+
+ if (mw->xlat_size == xlat_size)
+ return 0;
+
+ if (mw->buf_size)
+ perf_free_mw(perf);
+
+ mw->xlat_size = xlat_size;
+ mw->buf_size = buf_size;
+
+ mw->virt_addr = dma_alloc_coherent(&perf->ntb->pdev->dev, buf_size,
+ &mw->dma_addr, GFP_KERNEL);
+ if (!mw->virt_addr) {
+ mw->xlat_size = 0;
+ mw->buf_size = 0;
+ }
+
+ return 0;
+}
+
+static void perf_link_work(struct work_struct *work)
+{
+ struct perf_ctx *perf =
+ container_of(work, struct perf_ctx, link_work.work);
+ struct ntb_dev *ndev = perf->ntb;
+ struct pci_dev *pdev = ndev->pdev;
+ u32 val;
+ u64 size;
+ int rc;
+
+ dev_dbg(&perf->ntb->pdev->dev, "%s called\n", __func__);
+
+ size = perf->mw.phys_size;
+ ntb_peer_spad_write(ndev, MW_SZ_HIGH, (u32)(size >> 32));
+ ntb_peer_spad_write(ndev, MW_SZ_LOW, (u32)size);
+ ntb_peer_spad_write(ndev, VERSION, PERF_VERSION);
+
+ /* now read what peer wrote */
+ val = ntb_spad_read(ndev, VERSION);
+ dev_dbg(&pdev->dev, "Remote version = %#x\n", val);
+ if (val != PERF_VERSION)
+ goto out;
+
+ val = ntb_spad_read(ndev, MW_SZ_HIGH);
+ size = (u64)val << 32;
+
+ val = ntb_spad_read(ndev, MW_SZ_LOW);
+ size |= val;
+
+ dev_dbg(&pdev->dev, "Remote MW size = %#llx\n", size);
+
+ rc = perf_set_mw(perf, size);
+ if (rc)
+ goto out1;
+
+ perf->link_is_up = true;
+
+ return;
+
+out1:
+ perf_free_mw(perf);
+
+out:
+ if (ntb_link_is_up(ndev, NULL, NULL) == 1)
+ schedule_delayed_work(&perf->link_work,
+ msecs_to_jiffies(PERF_LINK_DOWN_TIMEOUT));
+}
+
+static void perf_link_cleanup(struct work_struct *work)
+{
+ struct perf_ctx *perf = container_of(work,
+ struct perf_ctx,
+ link_cleanup);
+ int i;
+
+ dev_dbg(&perf->ntb->pdev->dev, "%s called\n", __func__);
+
+ if (!perf->link_is_up)
+ cancel_delayed_work_sync(&perf->link_work);
+
+ for (i = 0; i < MAX_SPAD; i++)
+ ntb_spad_write(perf->ntb, i, 0);
+}
+
+static int perf_setup_mw(struct ntb_dev *ntb, struct perf_ctx *perf)
+{
+ struct perf_mw *mw;
+ int rc;
+
+ mw = &perf->mw;
+
+ rc = ntb_mw_get_range(ntb, 0, &mw->phys_addr, &mw->phys_size,
+ &mw->xlat_align, &mw->xlat_align_size);
+ if (rc)
+ return rc;
+
+ perf->mw.vbase = ioremap_wc(mw->phys_addr, mw->phys_size);
+ if (!mw->vbase)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static ssize_t debugfs_run_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *offp)
+{
+ struct perf_ctx *perf = filp->private_data;
+ char *buf;
+ ssize_t ret, out_offset;
+
+ if (!perf)
+ return 0;
+
+ buf = kmalloc(64, GFP_KERNEL);
+ out_offset = snprintf(buf, 64, "%d\n", perf->run);
+ ret = simple_read_from_buffer(ubuf, count, offp, buf, out_offset);
+ kfree(buf);
+
+ return ret;
+}
+
+static ssize_t debugfs_run_write(struct file *filp, const char __user *ubuf,
+ size_t count, loff_t *offp)
+{
+ struct perf_ctx *perf = filp->private_data;
+ int node, i;
+
+ if (perf->link_is_up == false)
+ return 0;
+
+ if (perf->perf_threads == 0)
+ return 0;
+
+ if (atomic_read(&perf->tsync) == 0)
+ perf->run = false;
+
+ if (perf->run == true) {
+ /* lets stop the threads */
+ perf->run = false;
+ for (i = 0; i < MAX_THREADS; i++) {
+ if (perf->pthr_ctx[i].thread) {
+ kthread_stop(perf->pthr_ctx[i].thread);
+ perf->pthr_ctx[i].thread = NULL;
+ } else
+ break;
+ }
+ } else {
+ perf->run = true;
+
+ if (perf->perf_threads > MAX_THREADS) {
+ perf->perf_threads = MAX_THREADS;
+ pr_info("Reset total threads to: %u\n", MAX_THREADS);
+ }
+
+ /* no greater than 1M */
+ if (seg_order > 20) {
+ seg_order = 20;
+ pr_info("Fix seg_order to %u\n", seg_order);
+ }
+
+ if (run_order < seg_order) {
+ run_order = seg_order;
+ pr_info("Fix run_order to %u\n", run_order);
+ }
+
+ node = dev_to_node(&perf->ntb->dev);
+ /* launch kernel thread */
+ for (i = 0; i < perf->perf_threads; i++) {
+ struct pthr_ctx *pctx;
+
+ pctx = &perf->pthr_ctx[i];
+ atomic_set(&pctx->dma_sync, 0);
+ pctx->perf = perf;
+ pctx->thread =
+ kthread_create_on_node(ntb_perf_thread,
+ (void *)pctx,
+ node, "ntb_perf %d", i);
+ if (pctx->thread)
+ wake_up_process(pctx->thread);
+ else {
+ perf->run = false;
+ for (i = 0; i < MAX_THREADS; i++) {
+ if (pctx->thread) {
+ kthread_stop(pctx->thread);
+ pctx->thread = NULL;
+ } else
+ break;
+ }
+ }
+
+ if (perf->run == false)
+ return -ENXIO;
+ }
+
+ }
+
+ return count;
+}
+
+static const struct file_operations ntb_perf_debugfs_run = {
+ .owner = THIS_MODULE,
+ .open = simple_open,
+ .read = debugfs_run_read,
+ .write = debugfs_run_write,
+};
+
+static int perf_debugfs_setup(struct perf_ctx *perf)
+{
+ struct pci_dev *pdev = perf->ntb->pdev;
+
+ if (!perf_debugfs_dir)
+ return -ENODEV;
+
+ perf->debugfs_node_dir = debugfs_create_dir(pci_name(pdev),
+ perf_debugfs_dir);
+ if (!perf->debugfs_node_dir)
+ return -ENODEV;
+
+ perf->debugfs_run = debugfs_create_file("run", S_IRUSR | S_IWUSR,
+ perf->debugfs_node_dir, perf,
+ &ntb_perf_debugfs_run);
+ if (!perf->debugfs_run)
+ return -ENODEV;
+
+ perf->debugfs_threads = debugfs_create_u8("threads", S_IRUSR | S_IWUSR,
+ perf->debugfs_node_dir,
+ &perf->perf_threads);
+ if (!perf->debugfs_threads)
+ return -ENODEV;
+
+ return 0;
+}
+
+static int perf_probe(struct ntb_client *client, struct ntb_dev *ntb)
+{
+ struct pci_dev *pdev = ntb->pdev;
+ struct perf_ctx *perf;
+ int node;
+ int rc = 0;
+
+ node = dev_to_node(&pdev->dev);
+
+ perf = kzalloc_node(sizeof(*perf), GFP_KERNEL, node);
+ if (!perf) {
+ rc = -ENOMEM;
+ goto err_perf;
+ }
+
+ perf->ntb = ntb;
+ perf->perf_threads = 1;
+ atomic_set(&perf->tsync, 0);
+ perf->run = false;
+ spin_lock_init(&perf->db_lock);
+ perf_setup_mw(ntb, perf);
+ INIT_DELAYED_WORK(&perf->link_work, perf_link_work);
+ INIT_WORK(&perf->link_cleanup, perf_link_cleanup);
+
+ rc = ntb_set_ctx(ntb, perf, &perf_ops);
+ if (rc)
+ goto err_ctx;
+
+ perf->link_is_up = false;
+ ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
+ ntb_link_event(ntb);
+
+ if (debugfs_initialized() && !perf_debugfs_dir) {
+ perf_debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
+ if (!perf_debugfs_dir)
+ goto err_ctx;
+
+ rc = perf_debugfs_setup(perf);
+ if (rc)
+ goto err_ctx;
+ }
+
+ return 0;
+
+err_ctx:
+ cancel_delayed_work_sync(&perf->link_work);
+ cancel_work_sync(&perf->link_cleanup);
+ kfree(perf);
+err_perf:
+ return rc;
+}
+
+static void perf_remove(struct ntb_client *client, struct ntb_dev *ntb)
+{
+ struct perf_ctx *perf = ntb->ctx;
+ int i;
+
+ dev_dbg(&perf->ntb->dev, "%s called\n", __func__);
+
+ cancel_delayed_work_sync(&perf->link_work);
+ cancel_work_sync(&perf->link_cleanup);
+
+ ntb_clear_ctx(ntb);
+ ntb_link_disable(ntb);
+
+ if (perf_debugfs_dir) {
+ debugfs_remove_recursive(perf_debugfs_dir);
+ perf_debugfs_dir = NULL;
+ }
+
+ if (use_dma) {
+ for (i = 0; i < MAX_THREADS; i++) {
+ struct pthr_ctx *pctx = &perf->pthr_ctx[i];
+ if (pctx->dma_chan)
+ dma_release_channel(pctx->dma_chan);
+ }
+ }
+
+ kfree(perf);
+}
+
+static struct ntb_client perf_client = {
+ .ops = {
+ .probe = perf_probe,
+ .remove = perf_remove,
+ },
+};
+module_ntb_client(perf_client);

Jon Mason

unread,
Jan 6, 2016, 10:05:47 AM1/6/16
to Dave Jiang, Hubbe, Allen, linu...@googlegroups.com
nit, remove the new line above

> +#include <linux/ntb.h>
> +
> +#define DRIVER_NAME "ntb_perf"
> +#define DRIVER_DESCRIPTION "PCIe NTB Performance Measurement Tool"
> +
> +#define DRIVER_LICENSE "Dual BSD/GPL"
> +#define DRIVER_VERSION "1.0"
> +#define DRIVER_AUTHOR "Dave Jiang <dave....@intel.com>"
> +
> +#define PERF_LINK_DOWN_TIMEOUT 10
> +#define PERF_VERSION 0xffff0001
> +#define MAX_THREADS 32
> +#define MAX_TEST_SIZE 1024*1024 /* 1M */

using SZ_1M would be better than defining it here.

> +#define MAX_SRCS 32

Is this related to max threads above? If so, make it more obvious and
tie them together.
The comparison to one seems unnecessary
I'd prefer you #define SZ_4G and use it above
Please use upper_32_bits(n)

> + ntb_peer_spad_write(ndev, MW_SZ_LOW, (u32)size);

Please use lower_32_bits(n)

> + ntb_peer_spad_write(ndev, VERSION, PERF_VERSION);
> +
> + /* now read what peer wrote */
> + val = ntb_spad_read(ndev, VERSION);
> + dev_dbg(&pdev->dev, "Remote version = %#x\n", val);
> + if (val != PERF_VERSION)

It might be better to move the debug comparison below the if check and
make it an error/warning. That way it becomes more obvious if this is
ever triggered.
Complete aside here, but we might need to add logic to the core NTB
code to query the number of available SPADs and error out if we try to
get more than that.
Style suggestion, !perf->link_is_up

> + return 0;
> +
> + if (perf->perf_threads == 0)
> + return 0;
> +
> + if (atomic_read(&perf->tsync) == 0)
> + perf->run = false;
> +
> + if (perf->run == true) {

Style suggestion, "if (perf->run) {"

> + /* lets stop the threads */
> + perf->run = false;
> + for (i = 0; i < MAX_THREADS; i++) {
> + if (perf->pthr_ctx[i].thread) {
> + kthread_stop(perf->pthr_ctx[i].thread);
> + perf->pthr_ctx[i].thread = NULL;
> + } else
> + break;
> + }
> + } else {
> + perf->run = true;
> +
> + if (perf->perf_threads > MAX_THREADS) {
> + perf->perf_threads = MAX_THREADS;
> + pr_info("Reset total threads to: %u\n", MAX_THREADS);
> + }
> +
> + /* no greater than 1M */
> + if (seg_order > 20) {

SZ_1M / seg size?
It might be simpler to simply run through them all and stop them
(instead of checking for the first empty and quitting)
A few minor things, but overall looks good. Please fix them up and resubmit.

Thanks,
Jon

Allen Hubbe

unread,
Jan 6, 2016, 11:37:41 AM1/6/16
to Jon Mason, Dave Jiang, linu...@googlegroups.com
Jon Mason <jdm...@kudzu.us>:
> On Tue, Jan 5, 2016 at 6:37 PM, Dave Jiang <dave....@intel.com> wrote:

> > +static void perf_link_event(void *ctx)
> > +{
> > + struct perf_ctx *perf = ctx;
> > +
> > + if (ntb_link_is_up(perf->ntb, NULL, NULL) == 1)
>
> The comparison to one seems unnecessary
>

The api doc says it returns one, zero, or negative to indicate an error. The Intel NTB driver does not return an error. Would you prefer to change the api, to make the return simply a boolean, and have some other mechanism to query an error?

> > + if (!perf->link_is_up)
> > + cancel_delayed_work_sync(&perf->link_work);
> > +
> > + for (i = 0; i < MAX_SPAD; i++)
>
> Complete aside here, but we might need to add logic to the core NTB
> code to query the number of available SPADs and error out of we try to
> get more than that.

Use ntb_spad_count(), and the count does vary. For example, there are half the number of spads in RP/TB topology than B2B.


Jiang, Dave

unread,
Jan 6, 2016, 11:57:17 AM1/6/16
to jdm...@kudzu.us, allen...@emc.com, linu...@googlegroups.com
No these are arbitrary and not related.
You'll flood the dmesg if you do that because it's checked continuously
until both sides sync up. I moved it under the if check and kept it as
dev_dbg().

Jon Mason

unread,
Jan 6, 2016, 12:45:21 PM1/6/16
to Allen Hubbe, Dave Jiang, linu...@googlegroups.com
On Wed, Jan 6, 2016 at 11:37 AM, Allen Hubbe <Allen...@emc.com> wrote:
> Jon Mason <jdm...@kudzu.us>:
>> On Tue, Jan 5, 2016 at 6:37 PM, Dave Jiang <dave....@intel.com> wrote:
>
>> > +static void perf_link_event(void *ctx)
>> > +{
>> > + struct perf_ctx *perf = ctx;
>> > +
>> > + if (ntb_link_is_up(perf->ntb, NULL, NULL) == 1)
>>
>> The comparison to one seems unnecessary
>>
>
> It api doc says it returns one, zero, or negative to indicate an error. The Intel NTB driver does not return an error. Would you prefer to change the api, to make the return simply a boolean, and have some other mechanism to query an error?

Naa, this is fine. Sorry for the confusion.

>> > + if (!perf->link_is_up)
>> > + cancel_delayed_work_sync(&perf->link_work);
>> > +
>> > + for (i = 0; i < MAX_SPAD; i++)
>>
>> Complete aside here, but we might need to add logic to the core NTB
>> code to query the number of available SPADs and error out of we try to
>> get more than that.
>
> Use ntb_spad_count(), and the count does vary. For example, there are half the number of spads in RP/TB topology than B2B.

Well, I think this code is saying it only needs to count up to
MAX_SPAD here, but there needs to be a check in the probe function to
verify that MAX_SPAD isn't > ntb_spad_count() (if not already there).

>
> --
> You received this message because you are subscribed to the Google Groups "linux-ntb" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to linux-ntb+...@googlegroups.com.
> To post to this group, send email to linu...@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/linux-ntb/000001d148a0%2484a18480%248de48d80%24%40emc.com.
> For more options, visit https://groups.google.com/d/optout.

Dave Jiang

unread,
Jan 6, 2016, 1:04:48 PM1/6/16
to allen...@emc.com, jdm...@kudzu.us, linu...@googlegroups.com
Providing raw performance data via a tool that directly accesses data from
NTB w/o any software overhead. This allows measurement of the hardware
performance limit. In revision one we are only doing single direction
CPU and DMA writes. Eventually we will provide bi-directional writes.

The measurement using DMA engine for NTB performance measure does
not measure the raw performance of DMA engine over NTB due to software
overhead. But it should provide the peak performance through the Linux DMA
driver.

Signed-off-by: Dave Jiang <dave....@intel.com>
---
drivers/ntb/test/Kconfig | 8
drivers/ntb/test/Makefile | 1
drivers/ntb/test/ntb_perf.c | 752 +++++++++++++++++++++++++++++++++++++++++++
3 files changed, 761 insertions(+)
create mode 100644 drivers/ntb/test/ntb_perf.c

diff --git a/drivers/ntb/test/Kconfig b/drivers/ntb/test/Kconfig
index 01852f9..a5d0eda 100644
--- a/drivers/ntb/test/Kconfig
+++ b/drivers/ntb/test/Kconfig
@@ -17,3 +17,11 @@ config NTB_TOOL
functioning at a basic level.

If unsure, say N.
+
+config NTB_PERF
+ tristate "NTB RAW Perf Measuring Tool"
+ help
+ This is a tool to measure raw NTB performance by transferring data
+ to and from the window without additional software interaction.
+
+ If unsure, say N.
diff --git a/drivers/ntb/test/Makefile b/drivers/ntb/test/Makefile
index 0ea32a3..9e77e0b 100644
--- a/drivers/ntb/test/Makefile
+++ b/drivers/ntb/test/Makefile
@@ -1,2 +1,3 @@
obj-$(CONFIG_NTB_PINGPONG) += ntb_pingpong.o
obj-$(CONFIG_NTB_TOOL) += ntb_tool.o
+obj-$(CONFIG_NTB_PERF) += ntb_perf.o
diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c
new file mode 100644
index 0000000..af06286
--- /dev/null
+++ b/drivers/ntb/test/ntb_perf.c
@@ -0,0 +1,752 @@
+#include <linux/sizes.h>
+#include <linux/ntb.h>
+
+#define DRIVER_NAME "ntb_perf"
+#define DRIVER_DESCRIPTION "PCIe NTB Performance Measurement Tool"
+
+#define DRIVER_LICENSE "Dual BSD/GPL"
+#define DRIVER_VERSION "1.0"
+#define DRIVER_AUTHOR "Dave Jiang <dave....@intel.com>"
+
+#define PERF_LINK_DOWN_TIMEOUT 10
+#define PERF_VERSION 0xffff0001
+#define MAX_THREADS 32
+#define MAX_TEST_SIZE SZ_1M
+#define MAX_SRCS 32
+#define DMA_OUT_RESOURCE_TO 50
+#define DMA_RETRIES 20
+#define SZ_4G (1ULL << 32)
+#define MAX_SEG_ORDER 20 /* no larger than 1M for kmalloc buffer */
+
+MODULE_LICENSE(DRIVER_LICENSE);
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESCRIPTION);
+
+static struct dentry *perf_debugfs_dir;
+
+static unsigned int seg_order = 19; /* 512K */
+module_param(seg_order, uint, 0644);
+MODULE_PARM_DESC(seg_order, "size order [n^2] of buffer segment for testing");
+
+static unsigned int run_order = 32; /* 4G */
+module_param(run_order, uint, 0644);
+MODULE_PARM_DESC(run_order, "size order [n^2] of total data to transfer");
+
+static bool use_dma; /* default to 0 */
+static void perf_link_event(void *ctx)
+{
+ struct perf_ctx *perf = ctx;
+
+ if (ntb_link_is_up(perf->ntb, NULL, NULL) == 1)
+ schedule_delayed_work(&perf->link_work, 0);
+ else
+ schedule_work(&perf->link_cleanup);
+}
+
+static void perf_db_event(void *ctx, int vec)
+{
+ struct perf_ctx *perf = ctx;
+ u64 db_bits, db_mask;
+
+ db_mask = ntb_db_vector_mask(perf->ntb, vec);
+ db_bits = ntb_db_read(perf->ntb);
+
+ dev_dbg(&perf->ntb->dev, "doorbell vec %d mask %#llx bits %#llx\n",
+ vec, db_mask, db_bits);
+}
+
+static const struct ntb_ctx_ops perf_ops = {
+ .link_event = perf_link_event,
+ .db_event = perf_db_event,
+};
+
+static void perf_copy_callback(void *data)
+{
+ struct pthr_ctx *pctx = data;
+
+ atomic_dec(&pctx->dma_sync);
+}
+
+static ssize_t perf_copy(struct pthr_ctx *pctx, char *dst,
+ char *src, size_t size)
+{
+ unmap->addr[0],
+
+ for (i = 0; i < total_chunks; i++) {
+ result = perf_copy(pctx, tmp, src, buf_size);
+ copied += result;
+ copied_chunks++;
+ if (copied_chunks == chunks) {
+ tmp = dst;
+ copied_chunks = 0;
+ } else
+ tmp += buf_size;
+
+ /* Probably should schedule every 4GB to prevent soft hang. */
+ if (((copied % SZ_4G) == 0) && !use_dma) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ }
+ }
+
+ if (use_dma)
+ pr_info("%s: All DMA descriptors submitted\n", current->comm);
+
+ while (atomic_read(&pctx->dma_sync) != 0)
+ msleep(20);
+
+ kstop = ktime_get();
+ kdiff = ktime_sub(kstop, kstart);
+ diff_us = ktime_to_us(kdiff);
+
+ pr_info("%s: copied %llu bytes\n", current->comm, copied);
+
+ pr_info("%s: lasted %llu usecs\n", current->comm, diff_us);
+
+ perf = copied / diff_us;
+
+ pr_info("%s: MBytes/s: %llu\n", current->comm, perf);
+
+
+ for (i = 0; i < MAX_SRCS; i++) {
+ kfree(pctx->srcs[i]);
+ pctx->srcs[i] = NULL;
+ }
+
+ return 0;
+
+err:
+ for (i = 0; i < MAX_SRCS; i++) {
+ kfree(pctx->srcs[i]);
+ pctx->srcs[i] = NULL;
+ }
+
+{
+ struct perf_ctx *perf =
+ container_of(work, struct perf_ctx, link_work.work);
+ struct ntb_dev *ndev = perf->ntb;
+ struct pci_dev *pdev = ndev->pdev;
+ u32 val;
+ u64 size;
+ int rc;
+
+ dev_dbg(&perf->ntb->pdev->dev, "%s called\n", __func__);
+
+ size = perf->mw.phys_size;
+ ntb_peer_spad_write(ndev, MW_SZ_HIGH, upper_32_bits(size));
+ ntb_peer_spad_write(ndev, MW_SZ_LOW, lower_32_bits(size));
+ ntb_peer_spad_write(ndev, VERSION, PERF_VERSION);
+
+ /* now read what peer wrote */
+ val = ntb_spad_read(ndev, VERSION);
+ if (val != PERF_VERSION) {
+ dev_dbg(&pdev->dev, "Remote version = %#x\n", val);
+ goto out;
+ }
+{
+ struct perf_ctx *perf = container_of(work,
+ struct perf_ctx,
+ link_cleanup);
+ int i;
+
+ dev_dbg(&perf->ntb->pdev->dev, "%s called\n", __func__);
+
+ if (!perf->link_is_up)
+ cancel_delayed_work_sync(&perf->link_work);
+
+ for (i = 0; i < ntb_spad_count(perf->ntb); i++)
+ ntb_spad_write(perf->ntb, i, 0);
+}
+
+static int perf_setup_mw(struct ntb_dev *ntb, struct perf_ctx *perf)
+{
+ struct perf_mw *mw;
+ int rc;
+
+ mw = &perf->mw;
+
+ rc = ntb_mw_get_range(ntb, 0, &mw->phys_addr, &mw->phys_size,
+ &mw->xlat_align, &mw->xlat_align_size);
+ if (rc)
+ return rc;
+
+ perf->mw.vbase = ioremap_wc(mw->phys_addr, mw->phys_size);
+ if (!mw->vbase)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static ssize_t debugfs_run_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *offp)
+{
+ struct perf_ctx *perf = filp->private_data;
+ char *buf;
+ ssize_t ret, out_offset;
+
+ if (!perf)
+ return 0;
+
+ buf = kmalloc(64, GFP_KERNEL);
+ out_offset = snprintf(buf, 64, "%d\n", perf->run);
+ ret = simple_read_from_buffer(ubuf, count, offp, buf, out_offset);
+ kfree(buf);
+
+ return ret;
+}
+
+static ssize_t debugfs_run_write(struct file *filp, const char __user *ubuf,
+ size_t count, loff_t *offp)
+{
+ struct perf_ctx *perf = filp->private_data;
+ int node, i;
+
+ if (!perf->link_is_up)
+ return 0;
+
+ if (perf->perf_threads == 0)
+ return 0;
+
+ if (atomic_read(&perf->tsync) == 0)
+ perf->run = false;
+
+ if (perf->run) {
+ /* lets stop the threads */
+ perf->run = false;
+ for (i = 0; i < MAX_THREADS; i++) {
+ if (perf->pthr_ctx[i].thread) {
+ kthread_stop(perf->pthr_ctx[i].thread);
+ perf->pthr_ctx[i].thread = NULL;
+ } else
+ break;
+ }
+ } else {
+ perf->run = true;
+
+ if (perf->perf_threads > MAX_THREADS) {
+ perf->perf_threads = MAX_THREADS;
+ pr_info("Reset total threads to: %u\n", MAX_THREADS);
+ }
+
+ /* no greater than 1M */
+ if (seg_order > MAX_SEG_ORDER) {
+ seg_order = MAX_SEG_ORDER;
+ kthread_stop(pctx->thread);
+ pctx->thread = NULL;
+ }
+ perf->debugfs_node_dir, perf,
+ &ntb_perf_debugfs_run);
+ if (!perf->debugfs_run)
+ return -ENODEV;
+
+ perf->debugfs_threads = debugfs_create_u8("threads", S_IRUSR | S_IWUSR,
+ perf->debugfs_node_dir,
+ &perf->perf_threads);
+{
+ struct perf_ctx *perf = ntb->ctx;
+ int i;
+
+ dev_dbg(&perf->ntb->dev, "%s called\n", __func__);
+
+ cancel_delayed_work_sync(&perf->link_work);
+ cancel_work_sync(&perf->link_cleanup);
+
+ ntb_clear_ctx(ntb);
+ ntb_link_disable(ntb);
+
+ debugfs_remove_recursive(perf_debugfs_dir);
+ perf_debugfs_dir = NULL;
+
+ if (use_dma) {
+ for (i = 0; i < MAX_THREADS; i++) {
+ struct pthr_ctx *pctx = &perf->pthr_ctx[i];
+
+ if (pctx->dma_chan)
+ dma_release_channel(pctx->dma_chan);

Dave Jiang

unread,
Jan 11, 2016, 4:50:13 PM1/11/16
to allen...@emc.com, jdm...@kudzu.us, linu...@googlegroups.com
index 0000000..f18be06
+ chunks = div64_u64(win_size, buf_size);
+ total_chunks = div64_u64(total, buf_size);
+ perf = div64_u64(copied, diff_us);

Dave Jiang

unread,
Jan 11, 2016, 6:33:30 PM1/11/16
to allen...@emc.com, jdm...@kudzu.us, linu...@googlegroups.com
Providing raw performance data via a tool that directly accesses data from
NTB w/o any software overhead. This allows measurement of the hardware
performance limit. In revision one we are only doing single direction
CPU and DMA writes. Eventually we will provide bi-directional writes.

The measurement using DMA engine for NTB performance measure does
not measure the raw performance of DMA engine over NTB due to software
overhead. But it should provide the peak performance through the Linux DMA
driver.

Signed-off-by: Dave Jiang <dave....@intel.com>
---
drivers/ntb/ntb_transport.c | 6
drivers/ntb/test/Kconfig | 8
drivers/ntb/test/Makefile | 1
drivers/ntb/test/ntb_perf.c | 752 +++++++++++++++++++++++++++++++++++++++++++
4 files changed, 767 insertions(+)
create mode 100644 drivers/ntb/test/ntb_perf.c

diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c
index 60654d5..45583e6 100644
--- a/drivers/ntb/ntb_transport.c
+++ b/drivers/ntb/ntb_transport.c
@@ -177,6 +177,7 @@ struct ntb_transport_qp {
u64 tx_err_no_buf;
u64 tx_memcpy;
u64 tx_async;
+ u64 tx_dma_debug;
};

struct ntb_transport_mw {
@@ -486,6 +487,8 @@ static ssize_t debugfs_read(struct file *filp, char __user *ubuf, size_t count,
out_offset += snprintf(buf + out_offset, out_count - out_offset,
"tx_async - \t%llu\n", qp->tx_async);
out_offset += snprintf(buf + out_offset, out_count - out_offset,
+ "tx_dma_debug - \t%llu\n", qp->tx_dma_debug);
+ out_offset += snprintf(buf + out_offset, out_count - out_offset,
"tx_ring_full - \t%llu\n", qp->tx_ring_full);
out_offset += snprintf(buf + out_offset, out_count - out_offset,
"tx_err_no_buf - %llu\n", qp->tx_err_no_buf);
@@ -726,6 +729,7 @@ static void ntb_qp_link_down_reset(struct ntb_transport_qp *qp)
qp->tx_err_no_buf = 0;
qp->tx_memcpy = 0;
qp->tx_async = 0;
+ qp->tx_dma_debug = 0;
}

static void ntb_qp_link_cleanup(struct ntb_transport_qp *qp)
@@ -1499,6 +1503,8 @@ static void ntb_async_tx(struct ntb_transport_qp *qp,
if (!txd)
goto err_get_unmap;

+ qp->tx_dma_debug++;
+
txd->callback = ntb_tx_copy_callback;
txd->callback_param = entry;
dma_set_unmap(txd, unmap);
index 0000000..0ba0f7e
+ schedule_timeout(1);

Hubbe, Allen

unread,
Jan 12, 2016, 9:07:54 AM1/12/16
to Dave Jiang, jdm...@kudzu.us, linu...@googlegroups.com
From: Dave Jiang
> Providing raw performance data via a tool that directly access data from

> diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c
> index 60654d5..45583e6 100644
> --- a/drivers/ntb/ntb_transport.c
> +++ b/drivers/ntb/ntb_transport.c
> @@ -177,6 +177,7 @@ struct ntb_transport_qp {
> @@ -486,6 +487,8 @@ static ssize_t debugfs_read(struct file *filp, char
> @@ -726,6 +729,7 @@ static void ntb_qp_link_down_reset(struct
> @@ -1499,6 +1503,8 @@ static void ntb_async_tx(struct ntb_transport_qp

Git am failed on some of these hunks.

Why are there ntb_transport changes in the ntb_perf patch?

Allen

Jiang, Dave

unread,
Jan 12, 2016, 9:31:09 AM1/12/16
to Hubbe, Allen, jdm...@kudzu.us, linu...@googlegroups.com
There shouldn't be. Stacked Git must've messed up something.

>
> Allen
>
> --
> You received this message because you are subscribed to the Google Groups "linux-ntb" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to linux-ntb+...@googlegroups.com.
> To post to this group, send email to linu...@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/linux-ntb/40F65EF2B5E2254199711F58E3ACB84D99F07704%40MX104CL02.corp.emc.com.

Allen Hubbe

unread,
Jan 12, 2016, 10:05:48 AM1/12/16
to Jiang, Dave, jdm...@kudzu.us, linu...@googlegroups.com
From: Jiang, Dave <dave....@intel.com>
> > On Jan 12, 2016, at 7:08 AM, Hubbe, Allen <Allen...@emc.com> wrote:
> > From: Dave Jiang
> >> Providing raw performance data via a tool that directly access data
> from
> >
> >> diff --git a/drivers/ntb/ntb_transport.c
> b/drivers/ntb/ntb_transport.c
> >> index 60654d5..45583e6 100644
> >> --- a/drivers/ntb/ntb_transport.c
> >> +++ b/drivers/ntb/ntb_transport.c
> >> @@ -177,6 +177,7 @@ struct ntb_transport_qp {
> >> @@ -486,6 +487,8 @@ static ssize_t debugfs_read(struct file *filp,
> char
> >> @@ -726,6 +729,7 @@ static void ntb_qp_link_down_reset(struct
> >> @@ -1499,6 +1503,8 @@ static void ntb_async_tx(struct
> ntb_transport_qp
> >
> > Git am failed on some of these hunks.
> >
> > Why are there ntb_transport changes in the ntb_perf patch?
>
> There shouldn't be. Stack git must've messed up something.

I applied the patch without the ntb_transport changes. They are already provided by the patch "NTB: Address out of DMA descriptor issue with NTB".

With just the perf_tool changes, this v4 works for me, for cpu and dma.

It may still need some performance tuning. The numbers are much lower than I expected. Maybe I just need to run it with different parameters, or maybe there's some numa issue. It would be convenient to change the test parameters without reloading the modules.

There might also be a race in link-up. I got stuck with a lot of this, once, and no link related messages on the other side:
[60025.597128] ntb_perf: ntb_hw_intel 0000:00:03.0: Remote version = 0x0
[60025.607119] ntb_perf: ntb_hw_intel 0000:00:03.0: perf_link_work called
[60025.607122] ntb_perf: ntb_hw_intel 0000:00:03.0: Remote version = 0x0
[60025.617122] ntb_perf: ntb_hw_intel 0000:00:03.0: perf_link_work called
[60025.617127] ntb_perf: ntb_hw_intel 0000:00:03.0: Remote version = 0x0
[60025.627133] ntb_perf: ntb_hw_intel 0000:00:03.0: perf_link_work called
[60025.627136] ntb_perf: ntb_hw_intel 0000:00:03.0: Remote version = 0x0
[60025.637118] ntb_perf: ntb_hw_intel 0000:00:03.0: perf_link_work called
[60025.637121] ntb_perf: ntb_hw_intel 0000:00:03.0: Remote version = 0x0
[60025.647123] ntb_perf: ntb_hw_intel 0000:00:03.0: perf_link_work called
[60025.647127] ntb_perf: ntb_hw_intel 0000:00:03.0: Remote version = 0x0
[60025.657115] ntb_perf: ntb_hw_intel 0000:00:03.0: perf_link_work called

On my dual-ntb system, I see both ntb are used by ntb_perf:
[60124.223795] ntb_perf: ntb_hw_intel 0000:00:03.0: perf_link_cleanup called
[60124.226942] ntb_perf: ntb_hw_intel 0000:00:03.0: perf_link_work called
[60124.226947] ntb_perf: ntb_hw_intel 0000:00:03.0: Remote MW size = 0x100000
[60124.246408] ntb_perf: ntb_hw_intel 0000:80:03.0: perf_link_cleanup called
[60124.257389] ntb_perf: ntb_hw_intel 0000:80:03.0: perf_link_work called
[60124.264699] ntb_perf: ntb_hw_intel 0000:80:03.0: Remote MW size = 0x100000

Only one of these ntb is in the debugfs dir, so the other one can't be tested:
root@HYR:/sys/kernel/debug/ntb_perf# ls
0000:00:03.0

> +static int perf_probe(struct ntb_client *client, struct ntb_dev *ntb)
> +{
...
> + if (debugfs_initialized() && !perf_debugfs_dir) {
> + perf_debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
> + if (!perf_debugfs_dir)
> + goto err_ctx;
> +
> + rc = perf_debugfs_setup(perf);
> + if (rc)
> + goto err_ctx;
> + }

I think the problem is here. Should perf_debugfs_setup be outside of the if? It looks like the per-device setup is only done if the top level dir didn't already exist. If the top level dir already exists, per-device setup isn't called.

Allen

Allen Hubbe

unread,
Jan 12, 2016, 11:00:09 AM1/12/16
to Hubbe, Allen, Jiang, Dave, jdm...@kudzu.us, linu...@googlegroups.com
From: Allen Hubbe
> From: Jiang, Dave <dave....@intel.com>
> > On Jan 12, 2016, at 7:08 AM, Hubbe, Allen <Allen...@emc.com>
> > > From: Dave Jiang
> > >> Providing raw performance data via a tool that directly access data
> > from
> > >

> There might also be a race in link-up. I got stuck with a lot of this,
> once, and no link related messages on the other side:
> [60025.597128] ntb_perf: ntb_hw_intel 0000:00:03.0: Remote version = 0x0
> [60025.607119] ntb_perf: ntb_hw_intel 0000:00:03.0: perf_link_work

I think the race is between clearing local spads in link cleanup, and the peer writing the same spads in link work.

Suppose B is already loaded, and then A is loaded.
B: link was down - link cleanup - clear spads on B.

A: modprobe ntb_perf.
A: ntb_link_enable - tells the ntb hw to enable, but link up is async.
A: ntb_link_event - trigger a link event, even if there was no link state change.

Race #1: is the link up fast enough on A?
A: perf_link_event - link state is down, schedule link cleanup.

A & B: perf_link_event - link state is now up, schedule link work.

Race #2: A clears good data written by B, AND B reads good data written by A?
B: link work - write good data to spads on A, continue below...
A: link cleanup - clear spads on A (oops!).
B: link work continued - read spads on B have good data, work is done!

Result:
A: link work - write good data to spads on B, read spads on A are zero, reschedule... forever.

This would be a very short window to race, so there might be something else, too. I don't recall seeing link work run on B at all, and I missed the opportunity to save that log. Now it's lost among all the others in dmesg, and I can't distinguish "no output" from "maybe these messages" that far back in the log. If a banner was printed each time the module is loaded, that would help.

In ntrdma, I avoid this race by /not/ clearing the local registers. To avoid reading bad data, the peer updates a counter as long as the data is valid. Observing a change in that counter value indicates good data. That avoids a race between A and B each writing the same registers.

Allen

Dave Jiang

unread,
Jan 13, 2016, 12:12:20 PM1/13/16
to allen...@emc.com, jdm...@kudzu.us, linu...@googlegroups.com
Providing raw performance data via a tool that directly accesses data from
NTB w/o any software overhead. This allows measurement of the hardware
performance limit. In revision one we are only doing single direction
CPU and DMA writes. Eventually we will provide bi-directional writes.

The measurement using DMA engine for NTB performance measure does
not measure the raw performance of DMA engine over NTB due to software
overhead. But it should provide the peak performance through the Linux DMA
driver.

Signed-off-by: Dave Jiang <dave....@intel.com>
---
drivers/ntb/test/Kconfig | 8
drivers/ntb/test/Makefile | 1
drivers/ntb/test/ntb_perf.c | 748 +++++++++++++++++++++++++++++++++++++++++++
3 files changed, 757 insertions(+)
create mode 100644 drivers/ntb/test/ntb_perf.c
index 0000000..dcf9035
--- /dev/null
+++ b/drivers/ntb/test/ntb_perf.c
@@ -0,0 +1,748 @@
+ schedule_delayed_work(&perf->link_work, 2*HZ);
+ if (rc)
+ goto out1;
+
+ perf->link_is_up = true;
+
+ return;
+
+out1:
+ perf_free_mw(perf);
+
+out:
+ if (ntb_link_is_up(ndev, NULL, NULL) == 1)
+ schedule_delayed_work(&perf->link_work,
+ msecs_to_jiffies(PERF_LINK_DOWN_TIMEOUT));
+}
+
+/* Deferred cleanup work, scheduled when the NTB link goes down: if the
+ * link never finished coming up, cancel any pending link_work so it
+ * stops rescheduling itself.
+ */
+static void perf_link_cleanup(struct work_struct *work)
+{
+ struct perf_ctx *perf = container_of(work,
+ struct perf_ctx,
+ link_cleanup);
+
+ dev_dbg(&perf->ntb->pdev->dev, "%s called\n", __func__);
+
+ /* Only cancel while bring-up is still pending; once link_is_up is
+ * set, link_work has completed and there is nothing to cancel.
+ */
+ if (!perf->link_is_up)
+ cancel_delayed_work_sync(&perf->link_work);
+}
+
+/* NTB client probe: allocate the per-device perf context (NUMA-local to
+ * the NTB's PCI device), register the context/ops with the NTB core,
+ * enable the link, and create the debugfs interface.
+ *
+ * Returns 0 on success or a negative errno on failure.
+ */
+static int perf_probe(struct ntb_client *client, struct ntb_dev *ntb)
+{
+ struct pci_dev *pdev = ntb->pdev;
+ struct perf_ctx *perf;
+ int node;
+ int rc = 0;
+
+ /* Allocate on the NUMA node closest to the NTB hardware. */
+ node = dev_to_node(&pdev->dev);
+
+ perf = kzalloc_node(sizeof(*perf), GFP_KERNEL, node);
+ if (!perf) {
+ rc = -ENOMEM;
+ goto err_perf;
+ }
+
+ perf->ntb = ntb;
+ perf->perf_threads = 1;
+ atomic_set(&perf->tsync, 0);
+ perf->run = false;
+ spin_lock_init(&perf->db_lock);
+ perf_setup_mw(ntb, perf);
+ INIT_DELAYED_WORK(&perf->link_work, perf_link_work);
+ INIT_WORK(&perf->link_cleanup, perf_link_cleanup);
+
+ rc = ntb_set_ctx(ntb, perf, &perf_ops);
+ if (rc)
+ goto err_ctx;
+
+ perf->link_is_up = false;
+ ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
+ /* Kick a link event in case the link is already up (no state change
+ * would otherwise be reported).
+ */
+ ntb_link_event(ntb);
+
+ /* NOTE(review): per-device debugfs setup only runs when the top-level
+ * dir did not already exist, so a second NTB device gets no debugfs
+ * entry (see discussion in this thread).
+ */
+ if (debugfs_initialized() && !perf_debugfs_dir) {
+ perf_debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
+ /* NOTE(review): rc is still 0 here, so this failure path
+ * returns success after freeing perf.
+ */
+ if (!perf_debugfs_dir)
+ goto err_ctx;
+
+ rc = perf_debugfs_setup(perf);
+ if (rc)
+ goto err_ctx;
+ }
+
+ return 0;
+
+err_ctx:
+ cancel_delayed_work_sync(&perf->link_work);
+ cancel_work_sync(&perf->link_cleanup);
+ kfree(perf);
+err_perf:
+ return rc;
+}
+
+static void perf_remove(struct ntb_client *client, struct ntb_dev *ntb)
+{

Dave Jiang

unread,
Jan 13, 2016, 3:29:51 PM1/13/16
to allen...@emc.com, jdm...@kudzu.us, linu...@googlegroups.com
index 0000000..c8a37ba
+ node = dev_to_node(&perf->ntb->pdev->dev);
+ if (!debugfs_initialized())
+ return -ENODEV;
+
+ if (!perf_debugfs_dir) {
+ perf_debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
+ if (!perf_debugfs_dir)
+ return -ENODEV;
+ }

Allen Hubbe

unread,
Jan 14, 2016, 11:00:04 AM1/14/16
to Dave Jiang, jdm...@kudzu.us, linu...@googlegroups.com
> From: Dave Jiang <dave....@intel.com>
> Sent: Wednesday, January 13, 2016 3:30 PM
> To: Hubbe, Allen; jdm...@kudzu.us
> Cc: linu...@googlegroups.com
> Subject: [PATCH v6] ntb: ntb perf tool
>
> Providing raw performance data via a tool that directly access data from
> NTB w/o any software overhead. This allows measurement of the hardware
> performance limit. In revision one we are only doing single direction
> CPU and DMA writes. Eventually we will provide bi-directional writes.
>
> The measurement using DMA engine for NTB performance measure does
> not measure the raw performance of DMA engine over NTB due to software
> overhead. But it should provide the peak performance through the Linux
> DMA
> driver.
>
> Signed-off-by: Dave Jiang <dave....@intel.com>

Tested-by: Allen Hubbe <Allen...@emc.com>

This driver seems to work as described. My performance results were lower than expected, but I tested on a nonstandard ntb config.

Allen

Jon Mason

unread,
Jan 17, 2016, 10:19:37 PM1/17/16
to Allen Hubbe, Dave Jiang, linu...@googlegroups.com
On Thu, Jan 14, 2016 at 10:59:40AM -0500, Allen Hubbe wrote:
> > From: Dave Jiang <dave....@intel.com>
> > Sent: Wednesday, January 13, 2016 3:30 PM
> > To: Hubbe, Allen; jdm...@kudzu.us
> > Cc: linu...@googlegroups.com
> > Subject: [PATCH v6] ntb: ntb perf tool
> >
> > Providing raw performance data via a tool that directly access data from
> > NTB w/o any software overhead. This allows measurement of the hardware
> > performance limit. In revision one we are only doing single direction
> > CPU and DMA writes. Eventually we will provide bi-directional writes.
> >
> > The measurement using DMA engine for NTB performance measure does
> > not measure the raw performance of DMA engine over NTB due to software
> > overhead. But it should provide the peak performance through the Linux
> > DMA
> > driver.
> >
> > Signed-off-by: Dave Jiang <dave....@intel.com>
>
> Tested-by: Allen Hubbe <Allen...@emc.com>

Pulled in, thanks.

>
> This driver seems to work as described. My performance results were lower than expected, but I tested on a nonstandard ntb config.
>
> Allen
>
> --
> You received this message because you are subscribed to the Google Groups "linux-ntb" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to linux-ntb+...@googlegroups.com.
> To post to this group, send email to linu...@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/linux-ntb/000201d14ee4%2493e3e0b0%24bbaba210%24%40emc.com.
Reply all
Reply to author
Forward
0 new messages