
__builtin_memcpy() slower than memcpy/bcopy (and on Linux it is the opposite)?


Luigi Rizzo

Jan 23, 2013, 11:32:38 AM

Probably our compiler folks have some ideas on this...

When doing netmap I found that on FreeBSD memcpy/bcopy was expensive
and __builtin_memcpy() was even worse, so I ended up writing
my own routine (called pkt_copy() in the program below).
This happens with gcc 4.2.1, clang, and gcc 4.6.4.

I was then surprised to notice that on a recent Ubuntu using
gcc 4.6.2 (if that matters) __builtin_memcpy beats the other
methods by a large factor.

Here are the numbers, in millions of calls per second. Is the test
program flawed, or are the compilers built with different options?

Unfortunately I have no way to run the two versions of the code
on the same machine, but the hardware should be relatively similar
(i7-2600 @ 3.4 GHz on one, Xeon E5-1650 @ 3.2 GHz on the other).

Millions of calls per second (BSD / Linux):

block size (bytes)        31          32          64          2048

__builtin_memcpy     10 / 150    13 / 158    13 / 152    5.1 / 23.2
memcpy               23 /  64    47 /  64    45 /  64    5.4 /  3.8
bcopy                24 /  64    47 /  64    45 /  63    5.4 /  3.8
pkt_copy             65 /  63    65 /  63    64 /  63    5.5 /  3.7
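
(For scale, 64 million copies per second of 64-byte blocks is roughly 4 GB/s of data actually moved.)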


cheers
luigi


/*
* Copyright (C) 2012 Luigi Rizzo. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/

/*
* $Id: testlock.c 12015 2013-01-23 15:51:17Z luigi $
*
* Test program to study various ops and concurrency issues.
* Create multiple threads, possibly bind to cpus, and run a workload.
*
* cc -O2 -Werror -Wall testlock.c -o testlock -lpthread
* you might need -lrt
*/

#include <inttypes.h>
#include <sys/types.h>
#include <pthread.h> /* pthread_* */

#if defined(__APPLE__)

#include <libkern/OSAtomic.h>
#define atomic_add_int(p, n) OSAtomicAdd32(n, (int *)p)
#define atomic_cmpset_32(p, o, n) OSAtomicCompareAndSwap32(o, n, (int *)p)

#elif defined(linux)
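/* NOTE: this Linux fallback is not actually atomic; it is only a
 * placeholder so the single-threaded tests compile and run. */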

int atomic_cmpset_32(volatile uint32_t *p, uint32_t old, uint32_t new)
{
int ret = *p == old;
*p = new;
return ret;
}

#if defined(HAVE_GCC_ATOMICS)
int atomic_add_int(volatile int *p, int v)
{
return __sync_fetch_and_add(p, v);
}
#else
inline
uint32_t atomic_add_int(uint32_t *p, int v)
{
__asm __volatile (
" lock xaddl %0, %1 ; "
: "+r" (v), /* 0 (result) */
"=m" (*p) /* 1 */
: "m" (*p)); /* 2 */
return (v);
}
#endif

#else /* FreeBSD */
#include <sys/param.h>
#include <machine/atomic.h>
#include <pthread_np.h> /* pthread w/ affinity */

#if __FreeBSD_version > 500000
#include <sys/cpuset.h> /* cpu_set */
#if __FreeBSD_version > 800000
#define HAVE_AFFINITY
#endif

inline void prefetch (const void *x)
{
__asm volatile("prefetcht0 %0" :: "m" (*(const unsigned long *)x));
}


#else /* FreeBSD 4.x */
int atomic_cmpset_32(volatile uint32_t *p, uint32_t old, uint32_t new)
{
int ret = *p == old;
*p = new;
return ret;
}

#define PRIu64 "llu"
#endif /* FreeBSD 4.x */

#endif /* FreeBSD */

#include <signal.h> /* signal */
#include <stdlib.h>
#include <stdio.h>
#include <poll.h>
#include <inttypes.h> /* PRI* macros */
#include <string.h> /* strcmp */
#include <fcntl.h> /* open */
#include <unistd.h> /* getopt */


#include <sys/sysctl.h> /* sysctl */
#include <sys/time.h> /* timersub */

static inline int min(int a, int b) { return a < b ? a : b; }

#define ONE_MILLION 1000000
/* debug support */
#define ND(format, ...)
#define D(format, ...) \
fprintf(stderr, "%s [%d] " format "\n", \
__FUNCTION__, __LINE__, ##__VA_ARGS__)

int verbose = 0;

#if 1//def MY_RDTSC
/* Wrapper around `rdtsc' to take reliable timestamps flushing the pipeline */
#define my_rdtsc(t) \
do { \
u_int __regs[4]; \
\
do_cpuid(0, __regs); \
(t) = rdtsc(); \
} while (0)

static __inline void
do_cpuid(u_int ax, u_int *p)
{
__asm __volatile("cpuid"
: "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3])
: "0" (ax) );
}

static __inline uint64_t
rdtsc(void)
{
uint64_t rv;

// XXX does not work on linux-64 bit
__asm __volatile("rdtscp" : "=A" (rv) : : "%rax");
return (rv);
}
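/*
 * (A 64-bit-safe variant would read the two halves separately, e.g.:
 *	uint32_t lo, hi;
 *	__asm __volatile("rdtsc" : "=a" (lo), "=d" (hi));
 *	return ((uint64_t)hi << 32) | lo;
 * since the "=A" constraint only names the edx:eax pair on i386.)
 */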
#endif /* 1 */

struct targ;

/*** global arguments for all threads ***/
struct glob_arg {
struct {
uint32_t ctr[1024];
} v __attribute__ ((aligned(256) ));
int64_t m_cycles; /* total cycles */
int nthreads;
int cpus;
int privs; // 1 if has IO privileges
int arg; // microseconds in usleep
char *test_name;
void (*fn)(struct targ *);
uint64_t scale; // scaling factor
char *scale_name; // scaling factor
};

/*
* Arguments for a new thread.
*/
struct targ {
struct glob_arg *g;
int completed;
u_int *glob_ctr;
uint64_t volatile count;
struct timeval tic, toc;
int me;
pthread_t thread;
int affinity;
};


static struct targ *ta;
static int global_nthreads;

/* control-C handler */
static void
sigint_h(int sig)
{
int i;

(void)sig; /* UNUSED */
for (i = 0; i < global_nthreads; i++) {
/* cancel active threads. */
if (ta[i].completed)
continue;
D("Cancelling thread #%d\n", i);
pthread_cancel(ta[i].thread);
ta[i].completed = 0;
}
signal(SIGINT, SIG_DFL);
}


/* sysctl wrapper to return the number of active CPUs */
static int
system_ncpus(void)
{
#ifdef linux
return 1;
#else
int mib[2] = { CTL_HW, HW_NCPU}, ncpus;
size_t len = sizeof(mib);
sysctl(mib, len / sizeof(mib[0]), &ncpus, &len, NULL, 0);
D("system had %d cpus", ncpus);

return (ncpus);
#endif
}

/*
* try to get I/O privileges so we can execute cli/sti etc.
*/
int
getprivs(void)
{
int fd = open("/dev/io", O_RDWR);
if (fd < 0) {
D("cannot open /dev/io, fd %d", fd);
return 0;
}
return 1;
}

/* set the thread affinity. */
/* ARGSUSED */
#ifdef HAVE_AFFINITY
static int
setaffinity(pthread_t me, int i)
{
cpuset_t cpumask;

if (i == -1)
return 0;

/* Set thread affinity. */
CPU_ZERO(&cpumask);
CPU_SET(i, &cpumask);

if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) != 0) {
D("Unable to set affinity");
return 1;
}
return 0;
}
#endif


static void *
td_body(void *data)
{
struct targ *t = (struct targ *) data;

#ifdef HAVE_AFFINITY
if (0 == setaffinity(t->thread, t->affinity))
#endif
{
/* main loop.*/
D("testing %ld cycles", t->g->m_cycles);
gettimeofday(&t->tic, NULL);
t->g->fn(t);
gettimeofday(&t->toc, NULL);
}
t->completed = 1;
return (NULL);
}

void
test_sel(struct targ *t)
{
int64_t m;
for (m = 0; m < t->g->m_cycles; m++) {
fd_set r;
struct timeval to = { 0, t->g->arg};
FD_ZERO(&r);
FD_SET(0,&r);
// FD_SET(1,&r);
select(1, &r, NULL, NULL, &to);
t->count++;
}
}

void
test_poll(struct targ *t)
{
int64_t m, ms = t->g->arg/1000;
for (m = 0; m < t->g->m_cycles; m++) {
struct pollfd x;
x.fd = 0;
x.events = POLLIN;
poll(&x, 1, ms);
t->count++;
}
}

void
test_usleep(struct targ *t)
{
int64_t m;
for (m = 0; m < t->g->m_cycles; m++) {
usleep(t->g->arg);
t->count++;
}
}

void
test_cli(struct targ *t)
{
int64_t m, i;
if (!t->g->privs) {
D("%s", "privileged instructions not available");
return;
}
for (m = 0; m < t->g->m_cycles; m++) {
for (i = 0; i < ONE_MILLION; i++) {
__asm __volatile("cli;");
__asm __volatile("and %eax, %eax;");
__asm __volatile("sti;");
t->count++;
}
}
}

void
test_nop(struct targ *t)
{
int64_t m, i;
for (m = 0; m < t->g->m_cycles; m++) {
for (i = 0; i < ONE_MILLION; i++) {
__asm __volatile("nop;");
__asm __volatile("nop; nop; nop; nop; nop;");
//__asm __volatile("nop; nop; nop; nop; nop;");
t->count++;
}
}
}

void
test_rdtsc1(struct targ *t)
{
int64_t m, i;
uint64_t v;
(void)v;
for (m = 0; m < t->g->m_cycles; m++) {
for (i = 0; i < ONE_MILLION; i++) {
my_rdtsc(v);
t->count++;
}
}
}

void
test_rdtsc(struct targ *t)
{
int64_t m, i;
volatile uint64_t v;
(void)v;
for (m = 0; m < t->g->m_cycles; m++) {
for (i = 0; i < ONE_MILLION; i++) {
v = rdtsc();
t->count++;
}
}
}

void
test_add(struct targ *t)
{
int64_t m, i;
for (m = 0; m < t->g->m_cycles; m++) {
for (i = 0; i < ONE_MILLION; i++) {
t->glob_ctr[0] ++;
t->count++;
}
}
}

void
test_atomic_add(struct targ *t)
{
int64_t m, i;
for (m = 0; m < t->g->m_cycles; m++) {
for (i = 0; i < ONE_MILLION; i++) {
atomic_add_int(t->glob_ctr, 1);
t->count++;
}
}
}

void
test_atomic_cmpset(struct targ *t)
{
int64_t m, i;
for (m = 0; m < t->g->m_cycles; m++) {
for (i = 0; i < ONE_MILLION; i++) {
atomic_cmpset_32(t->glob_ctr, m, i);
t->count++;
}
}
}

void
test_time(struct targ *t)
{
int64_t m;
for (m = 0; m < t->g->m_cycles; m++) {
#ifndef __APPLE__
struct timespec ts;
clock_gettime(t->g->arg, &ts);
#endif
t->count++;
}
}

void
test_gettimeofday(struct targ *t)
{
int64_t m;
struct timeval ts;
for (m = 0; m < t->g->m_cycles; m++) {
gettimeofday(&ts, NULL);
t->count++;
}
}

/*
* getppid is the simplest system call (getpid is cached by glibc
* so it would not be a good test)
*/
void
test_getpid(struct targ *t)
{
int64_t m;
for (m = 0; m < t->g->m_cycles; m++) {
getppid();
t->count++;
}
}


#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
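
/*
 * fast_bcopy: same argument order as bcopy (src, dst, len). Copies eight
 * 64-bit words per iteration, so a length that is not a multiple of 64 is
 * effectively rounded up (the destination must tolerate a small overrun);
 * assumes suitably aligned, non-overlapping buffers. Lengths >= 1KB fall
 * back to plain bcopy().
 */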

static void
fast_bcopy(void *_src, void *_dst, int l)
{
uint64_t *src = _src;
uint64_t *dst = _dst;
if (unlikely(l >= 1024)) {
bcopy(src, dst, l);
return;
}
for (; likely(l > 0); l-=64) {
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
}
}

// XXX if you want to make sure there is no inlining...
// static void (*fp)(void *_src, void *_dst, int l) = fast_bcopy;

#define HU 0x3ffff
static struct glob_arg huge[HU+1];

void
test_fastcopy(struct targ *t)
{
int64_t m;
int len = t->g->arg;

if (len > (int)sizeof(struct glob_arg))
len = sizeof(struct glob_arg);
D("fast copying %d bytes", len);
for (m = 0; m < t->g->m_cycles; m++) {
fast_bcopy(t->g, (void *)&huge[m & HU], len);
t->count+=1;
}
}

void
test_bcopy(struct targ *t)
{
int64_t m;
int len = t->g->arg;

if (len > (int)sizeof(struct glob_arg))
len = sizeof(struct glob_arg);
D("bcopying %d bytes", len);
for (m = 0; m < t->g->m_cycles; m++) {
bcopy(t->g, (void *)&huge[m & HU], len);
t->count+=1;
}
}

void
test_builtin_memcpy(struct targ *t)
{
int64_t m;
int len = t->g->arg;

if (len > (int)sizeof(struct glob_arg))
len = sizeof(struct glob_arg);
D("bcopying %d bytes", len);
for (m = 0; m < t->g->m_cycles; m++) {
__builtin_memcpy(t->g, (void *)&huge[m & HU], len);
t->count+=1;
}
}

void
test_memcpy(struct targ *t)
{
int64_t m;
int len = t->g->arg;

if (len > (int)sizeof(struct glob_arg))
len = sizeof(struct glob_arg);
D("memcopying %d bytes", len);
for (m = 0; m < t->g->m_cycles; m++) {
memcpy((void *)&huge[m & HU], t->g, len);
t->count+=1;
}
}

struct entry {
void (*fn)(struct targ *);
char *name;
uint64_t scale;
uint64_t m_cycles;
};
struct entry tests[] = {
{ test_sel, "select", 1, 1000 },
{ test_poll, "poll", 1, 1000 },
{ test_usleep, "usleep", 1, 1000 },
{ test_time, "time", 1, 1000 },
{ test_gettimeofday, "gettimeofday", 1, 1000000 },
{ test_getpid, "getpid", 1, 1000000 },
{ test_bcopy, "bcopy", 1000, 100000000 },
{ test_builtin_memcpy, "__builtin_memcpy", 1000, 100000000 },
{ test_memcpy, "memcpy", 1000, 100000000 },
{ test_fastcopy, "fastcopy", 1000, 100000000 },
{ test_add, "add", ONE_MILLION, 100000000 },
{ test_nop, "nop", ONE_MILLION, 100000000 },
{ test_atomic_add, "atomic-add", ONE_MILLION, 100000000 },
{ test_cli, "cli", ONE_MILLION, 100000000 },
{ test_rdtsc, "rdtsc", ONE_MILLION, 100000000 }, // unserialized
{ test_rdtsc1, "rdtsc1", ONE_MILLION, 100000000 }, // serialized
{ test_atomic_cmpset, "cmpset", ONE_MILLION, 100000000 },
{ NULL, NULL, 0, 0 }
};

static void
usage(void)
{
const char *cmd = "test";
int i;

fprintf(stderr,
"Usage:\n"
"%s arguments\n"
"\t-m name test name\n"
"\t-n cycles (millions) of cycles\n"
"\t-l arg bytes, usec, ... \n"
"\t-t threads total threads\n"
"\t-c cores cores to use\n"
"\t-a n force affinity every n cores\n"
"\t-A n cache contention every n bytes\n"
"\t-w report_ms milliseconds between reports\n"
"",
cmd);
fprintf(stderr, "Available tests:\n");
for (i = 0; tests[i].name; i++) {
fprintf(stderr, "%12s\n", tests[i].name);
}

exit(0);
}

static int64_t
getnum(const char *s)
{
int64_t n;
char *e;

n = strtol(s, &e, 0);
switch (e ? *e : '\0') {
case 'k':
case 'K':
return n*1000;
case 'm':
case 'M':
return n*1000*1000;
case 'g':
case 'G':
return n*1000*1000*1000;
case 't':
case 'T':
return n*1000*1000*1000*1000;
default:
return n;
}
}

struct glob_arg g;
int
main(int argc, char **argv)
{
int i, ch, report_interval, affinity, align;

ND("g has size %d", (int)sizeof(g));
report_interval = 250; /* ms */
affinity = 0; /* no affinity */
align = 0; /* global variable */

bzero(&g, sizeof(g));

g.privs = getprivs();
g.nthreads = 1;
g.cpus = 1;
g.m_cycles = 0;

while ( (ch = getopt(argc, argv, "A:a:m:n:w:c:t:vl:")) != -1) {
switch(ch) {
default:
D("bad option %c %s", ch, optarg);
usage();
break;
case 'A': /* align */
align = atoi(optarg);
break;
case 'a': /* force affinity */
affinity = atoi(optarg);
break;
case 'n': /* cycles */
g.m_cycles = getnum(optarg);
break;
case 'w': /* report interval */
report_interval = atoi(optarg);
break;
case 'c':
g.cpus = atoi(optarg);
break;
case 't':
g.nthreads = atoi(optarg);
break;
case 'm':
g.test_name = optarg;
break;
case 'l':
g.arg = getnum(optarg);
break;

case 'v':
verbose++;
break;
}
}
argc -= optind;
argv += optind;
if (!g.test_name && argc > 0)
g.test_name = argv[0];

if (g.test_name) {
for (i = 0; tests[i].name; i++) {
if (!strcmp(g.test_name, tests[i].name)) {
g.fn = tests[i].fn;
g.scale = tests[i].scale;
if (g.m_cycles == 0)
g.m_cycles = tests[i].m_cycles;
if (g.scale == ONE_MILLION)
g.scale_name = "M";
else if (g.scale == 1000)
g.scale_name = "K";
else {
g.scale = 1;
g.scale_name = "";
}
break;
}
}
}
if (!g.fn) {
D("%s", "missing/unknown test name");
usage();
}
i = system_ncpus();
if (g.cpus < 0 || g.cpus > i) {
D("%d cpus is too high, have only %d cpus", g.cpus, i);
usage();
}
if (g.cpus == 0)
g.cpus = i;
if (g.nthreads < 1) {
D("bad nthreads %d, using 1", g.nthreads);
g.nthreads = 1;
}
i = sizeof(g.v.ctr) / g.nthreads*sizeof(g.v.ctr[0]);
if (align < 0 || align > i) {
D("bad align %d, max is %d", align, i);
align = i;
}

/* Install ^C handler. */
global_nthreads = g.nthreads;
signal(SIGINT, sigint_h);

ta = calloc(g.nthreads, sizeof(*ta));
/*
* Now create the desired number of threads, each one
* using a single descriptor.
*/
D("start %d threads on %d cores", g.nthreads, g.cpus);
for (i = 0; i < g.nthreads; i++) {
struct targ *t = &ta[i];
bzero(t, sizeof(*t));
t->g = &g;
t->me = i;
t->glob_ctr = &g.v.ctr[(i*align)/sizeof(g.v.ctr[0])];
D("thread %d ptr %p", i, t->glob_ctr);
t->affinity = affinity ? (affinity*i) % g.cpus : -1;
if (pthread_create(&t->thread, NULL, td_body, t) == -1) {
D("Unable to create thread %d", i);
t->completed = 1;
}
}
/* the main loop */

{
uint64_t my_count = 0, prev = 0;
uint64_t count = 0;
double delta_t;
struct timeval tic, toc;

gettimeofday(&toc, NULL);
for (;;) {
struct timeval now, delta;
uint64_t pps;
int done = 0;

delta.tv_sec = report_interval/1000;
delta.tv_usec = (report_interval%1000)*1000;
select(0, NULL, NULL, NULL, &delta);
gettimeofday(&now, NULL);
timersub(&now, &toc, &toc);
my_count = 0;
for (i = 0; i < g.nthreads; i++) {
my_count += ta[i].count;
if (ta[i].completed)
done++;
}
pps = toc.tv_sec* ONE_MILLION + toc.tv_usec;
if (pps < 10000)
continue;
pps = (my_count - prev)*ONE_MILLION / pps;
D("%" PRIu64 " %scycles/s scale %" PRIu64 " in %dus", pps/g.scale,
g.scale_name, g.scale, (int)(toc.tv_sec* ONE_MILLION + toc.tv_usec));
prev = my_count;
toc = now;
if (done == g.nthreads)
break;
}
D("total %" PRIu64 " cycles", prev);

timerclear(&tic);
timerclear(&toc);
for (i = 0; i < g.nthreads; i++) {
pthread_join(ta[i].thread, NULL);

if (ta[i].completed == 0)
continue;

/*
* Collect each thread's output and extract information about
* how long it took to send all the packets.
*/
count += ta[i].count;
if (!timerisset(&tic) || timercmp(&ta[i].tic, &tic, <))
tic = ta[i].tic;
if (!timerisset(&toc) || timercmp(&ta[i].toc, &toc, >))
toc = ta[i].toc;
}

/* print output. */
timersub(&toc, &tic, &toc);
delta_t = toc.tv_sec + 1e-6* toc.tv_usec;
D("total %8.6f seconds", delta_t);
}

return (0);
}
/* end of file */

Dimitry Andric

Jan 23, 2013, 2:26:14 PM

On 2013-01-23 17:32, Luigi Rizzo wrote:
> Probably our compiler folks have some ideas on this...
>
> When doing netmap i found that on FreeBSD memcpy/bcopy was expensive,
> __builtin_memcpy() was even worse,

Which compilation flags did you use to test this? When I compiled your
testcase program with clang 3.2, gcc 4.2 and gcc 4.7 at -O2, with all
other settings at their defaults, all three compilers just called libc's
memcpy() for the __builtin_memcpy tests.

For example, with gcc 4.7, the loop in test_builtin_memcpy becomes:

.L116:
movq %rbx, %rax
addq $1, %rbx
andl $262143, %eax
movq %rax, %rdx
salq $12, %rax
salq $8, %rdx
leaq huge(%rdx,%rax), %rsi
movq %r12, %rdx
call memcpy
movq 24(%rbp), %rax
movq 0(%rbp), %rdi
addq $1, %rax
cmpq %rbx, 4096(%rdi)
movq %rax, 24(%rbp)
jg .L116

The other routines are emitted as similar code. For test_bcopy() the
loop becomes:

.L123:
movq %rbx, %rax
addq $1, %rbx
andl $262143, %eax
movq %rax, %rdx
salq $12, %rax
salq $8, %rdx
leaq huge(%rdx,%rax), %rsi
movq %r12, %rdx
call bcopy
movq 24(%rbp), %rax
movq 0(%rbp), %rdi
addq $1, %rax
cmpq %rbx, 4096(%rdi)
movq %rax, 24(%rbp)
jg .L123

and similarly, for test_memcpy() it becomes:

.L109:
movq %rbx, %rax
addq $1, %rbx
andl $262143, %eax
movq %rax, %rdx
salq $12, %rax
salq $8, %rdx
leaq huge(%rdx,%rax), %rdi
movq %r12, %rdx
call memcpy
movq 24(%rbp), %rax
movq 0(%rbp), %rsi
addq $1, %rax
cmpq %rbx, 4096(%rsi)
movq %rax, 24(%rbp)
jg .L109

In our libc, bcopy and memcpy are implemented from the same source file,
with just the arguments swapped around. So I fail to see what could
cause the performance difference between __builtin_memcpy, memcpy and
bcopy that you are seeing.

Also, on amd64, this is implemented in lib/libc/amd64/string/bcopy.S, so
the compiler does not have any influence on its performance. Note the
routine uses "rep movsq" as its main loop, which is apparently not the
best way on modern CPUs. Maybe you have found another instance where
hand-rolled assembly is slower than compiler-optimized code... :-)
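
For illustration, here is a rough C sketch of what that main loop does (this
is not the actual bcopy.S code; overlap handling, alignment fix-ups and error
handling are omitted):

#include <stddef.h>

static void
rep_movsq_copy(void *dst, const void *src, size_t len)
{
    size_t nq = len >> 3;   /* whole 8-byte words */
    size_t nb = len & 7;    /* trailing bytes */

    /* "rep movsq" copies %rcx quadwords from (%rsi) to (%rdi),
     * advancing the pointers and the count as it goes. */
    __asm __volatile("rep movsq"
        : "+D" (dst), "+S" (src), "+c" (nq)
        : : "memory");
    __asm __volatile("rep movsb"
        : "+D" (dst), "+S" (src), "+c" (nb)
        : : "memory");
}

The catch is that "rep movs" has a sizeable fixed startup cost on these CPUs,
so for packet-sized copies a simple unrolled load/store loop can easily win.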

With gcc 4.7, your fast_bcopy() gets inlined to this:

.L131:
movq (%rax), %rdx
subl $64, %ecx
movq %rdx, (%rsi)
movq 8(%rax), %rdx
movq %rdx, 8(%rsi)
movq 16(%rax), %rdx
movq %rdx, 16(%rsi)
movq 24(%rax), %rdx
movq %rdx, 24(%rsi)
movq 32(%rax), %rdx
movq %rdx, 32(%rsi)
movq 40(%rax), %rdx
movq %rdx, 40(%rsi)
movq 48(%rax), %r9
movq %r9, 48(%rsi)
movq 56(%rax), %r9
addq $64, %rax
movq %r9, 56(%rsi)
addq $64, %rsi
testl %ecx, %ecx
jg .L131

while clang 3.2 produces:

.LBB14_5:
movq (%rdi), %rcx
movq %rcx, (%rsi)
movq 8(%rdi), %rcx
movq %rcx, 8(%rsi)
movq 16(%rdi), %rcx
movq %rcx, 16(%rsi)
addl $-64, %eax
movq 24(%rdi), %rcx
movq %rcx, 24(%rsi)
testl %eax, %eax
movq 32(%rdi), %rcx
movq %rcx, 32(%rsi)
movq 40(%rdi), %rcx
movq %rcx, 40(%rsi)
movq 48(%rdi), %rcx
movq %rcx, 48(%rsi)
movq 56(%rdi), %rcx
leaq 64(%rdi), %rdi
movq %rcx, 56(%rsi)
leaq 64(%rsi), %rsi
jg .LBB14_5

Both are most likely faster than the "rep movsq" logic in bcopy.S.


> and so i ended up writing
> my custom routine, (called pkt_copy() in the program below).
> This happens with gcc 4.2.1, clang, gcc 4.6.4
>
> I was then surprised to notice that on a recent ubuntu using
> gcc 4.6.2 (if that matters) the __builtin_memcpy beats other
> methods by a large factor.

On Ubuntu, I see the same thing as on FreeBSD; __builtin_memcpy just
calls the regular memcpy. However, eglibc's memcpy looks to be more
highly optimized; there are several CPU-specific implementations, for
example for i386 and amd64 arches:

sysdeps/i386/i586/memcpy_chk.S
sysdeps/i386/i586/memcpy.S
sysdeps/i386/i686/memcpy_chk.S
sysdeps/i386/i686/memcpy.S
sysdeps/i386/i686/multiarch/memcpy_chk.S
sysdeps/i386/i686/multiarch/memcpy.S
sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
sysdeps/i386/i686/multiarch/memcpy-ssse3.S
sysdeps/x86_64/memcpy_chk.S
sysdeps/x86_64/memcpy.S
sysdeps/x86_64/multiarch/memcpy_chk.S
sysdeps/x86_64/multiarch/memcpy.S
sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
sysdeps/x86_64/multiarch/memcpy-ssse3.S

Most likely, your test program on Ubuntu is calling the ssse3 version,
which should be much faster than any of the above loops.
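
The exported memcpy gets bound to one of these CPU-specific routines at
dynamic-link time through an IFUNC resolver. A minimal made-up sketch of the
idea (not eglibc's actual code, all names invented; needs a reasonably recent
gcc/binutils on an ELF target):

#include <stddef.h>

static void *
memcpy_plain(void *dst, const void *src, size_t n)
{
    char *d = dst;
    const char *s = src;
    while (n--)
        *d++ = *s++;
    return dst;
}

static void *
memcpy_fast(void *dst, const void *src, size_t n)
{
    /* stand-in for the SSSE3-optimized copy */
    return memcpy_plain(dst, src, n);
}

static int
cpu_has_ssse3(void)
{
    unsigned int eax, ebx, ecx, edx;
    __asm __volatile("cpuid"
        : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
        : "0" (1));
    return (ecx >> 9) & 1;          /* CPUID.1:ECX bit 9 = SSSE3 */
}

/* The resolver runs once, when the dynamic linker binds the symbol. */
void *(*resolve_memcpy(void))(void *, const void *, size_t)
{
    return cpu_has_ssse3() ? memcpy_fast : memcpy_plain;
}

void *my_memcpy(void *dst, const void *src, size_t n)
    __attribute__((ifunc("resolve_memcpy")));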


> Here are the number in millions of calls per second. Is the test
> program flawed, or the compiler is built with different options ?

I think the test program looks fine after lightly skimming it.
FreeBSD's memcpy is probably just slower for the CPUs you have been
testing on.

Artem Belevich

Jan 23, 2013, 2:29:45 PM

On Wed, Jan 23, 2013 at 8:32 AM, Luigi Rizzo <ri...@iet.unipi.it> wrote:
> Probably our compiler folks have some ideas on this...
>
> When doing netmap i found that on FreeBSD memcpy/bcopy was expensive,
> __builtin_memcpy() was even worse, and so i ended up writing
> my custom routine, (called pkt_copy() in the program below).
> This happens with gcc 4.2.1, clang, gcc 4.6.4

The program does not seem to have pkt_copy. It does have fast_bcopy.
Is that the one you meant by pkt_copy?

--Artem

Luigi Rizzo

Jan 23, 2013, 4:31:40 PM

On Wed, Jan 23, 2013 at 11:26 AM, Dimitry Andric <d...@freebsd.org> wrote:

> Which compilation flags did you use to test this? When I compiled your
> testcase program with clang 3.2, gcc 4.2 and gcc 4.7 at -O2, with all
> other settings at their defaults, all three compilers just called libc's
> memcpy() for the __builtin_memcpy tests.
>

just -O2 -Wall -Werror, no special -m* or -f* flags

cheers
luigi

Luigi Rizzo

Jan 23, 2013, 4:36:50 PM

On Wed, Jan 23, 2013 at 11:29 AM, Artem Belevich <a...@freebsd.org> wrote:

> On Wed, Jan 23, 2013 at 8:32 AM, Luigi Rizzo <ri...@iet.unipi.it> wrote:
> > Probably our compiler folks have some ideas on this...
> >
> > When doing netmap i found that on FreeBSD memcpy/bcopy was expensive,
> > __builtin_memcpy() was even worse, and so i ended up writing
> > my custom routine, (called pkt_copy() in the program below).
> > This happens with gcc 4.2.1, clang, gcc 4.6.4
>
> The program does not seem to have pkt_copy. It does have fast_bcopy.
> Is that the one you meant by pkt_copy?
>
>
Sorry for the confusion, I did some last-minute name changes.

pkt_copy() is the name of the C function; "./testloop -m fastcopy" is
the command you need to use to run it.

Luigi Rizzo

Jan 23, 2013, 9:54:42 PM

On Wed, Jan 23, 2013 at 05:32:38PM +0100, Luigi Rizzo wrote:
> Probably our compiler folks have some ideas on this...
>
> When doing netmap i found that on FreeBSD memcpy/bcopy was expensive,
> __builtin_memcpy() was even worse, and so i ended up writing
> my custom routine, (called pkt_copy() in the program below).
> This happens with gcc 4.2.1, clang, gcc 4.6.4
>
> I was then surprised to notice that on a recent ubuntu using
> gcc 4.6.2 (if that matters) the __builtin_memcpy beats other
> methods by a large factor.

So, it turns out that in my test program I had swapped the
source and destination operands for __builtin_memcpy(), and
this substantially changed the memory access pattern.

With the correct operands, __builtin_memcpy == memcpy == bcopy
on both FreeBSD and Linux.
On FreeBSD pkt_copy is still faster than the other methods for
small packets, whereas on Linux they are equivalent.

If you are curious why swapping source and dst changed things
so dramatically:

the test was supposed to read from a large chunk of
memory (over 1GB) to avoid always hitting L1 or L2.
Swapping the operands causes the reads to always hit the same cache
lines, thus saving a lot of misses. The difference between the two
machines is then probably due to how the cache is used on writes.
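
For reference, these are the two calls as they appear in the posted program;
the operands point in opposite directions, so the two tests read and write
completely different working sets:

/* test_builtin_memcpy(): the destination is the small glob_arg struct,
 * the source walks the >1GB "huge" array */
__builtin_memcpy(t->g, (void *)&huge[m & HU], len);

/* test_memcpy() (and the bcopy/fastcopy tests): the source is the small
 * struct, the destination walks the "huge" array */
memcpy((void *)&huge[m & HU], t->g, len);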

sorry for the noise.