/*
* Copyright (c) 2004 by Richard Harter.
* This software may be freely used and modified without
* restriction provided that this notice is preserved unaltered.
*
*/
/*
* This function sorts an array of integers in place using a math
* sort based histogram sort. 2*len locations of auxilliary space
* are used, where len is the data length.
*/
#include <stdlib.h>
#define MIN_SORTABLE_LENGTH 128
#define PROCESSED -1
int
int_msort(int * data, int len)
{
int data_min; /* Minimum value in data array */
int data_max; /* Maximum value in data array */
int index_min; /* Index of minimum value */
int i; /* Major loop index */
int j; /* Minor loop index */
int ifac; /* Integer scaling factor */
int temp; /* Temp loc for data moves */
int *space; /* Ptr to allocated space */
int *cmp_index; /* Ptr to computed indices */
int *cum; /* Ptr to cumulative distrib. */
int *hist; /* Ptr to cumulative distrib. */
int *sorted; /* Ptr to almost sorted data */
if (len <= 1) return 1;
if (len == 2) {
if (data[0] > data[1]) {
temp = data[0];
data[0] = data[1];
data[1] = temp;
}
return 1;
}
data_min = data[0];
data_max = data[0];
index_min = 0;
for (i=len; --i>0;) {
if (data[i] < data_min) {
data_min = data[i];
index_min = i;
}
}
if (index_min != 0) {
data[index_min] = data[0];
data[0] = data_min;
}
if (len >= MIN_SORTABLE_LENGTH) {
for (i=len; --i>0;) {
if (data[i] > data_max) data_max = data[i];
}
/* Compute interpolation function */
ifac = (data_max - data_min)/(len -1);
if (ifac<=0) ifac=1;
while (((data_max-data_min)/ifac)>(len-1)) ifac++;
/* allocate index and histogram space */
space = malloc((2*len+1)*sizeof(int));
cmp_index = space;
cum = space + len;
hist = cum + 1;
sorted = hist;
memset(cum,0,(len+1)*sizeof(int));
/* Compute raw interpolation indices */
for (i=len; --i>=0;) {
hist[cmp_index[i] = (data[i] - data_min)/ifac] += 1;
}
for (i=1;i<len;i++) cum[i] += cum[i-1];
for (i=0;i<len;i++) cmp_index[i] = cum[cmp_index[i]]++;
/* Math sort proper */
for (i=len; --i >= 0;) sorted[cmp_index[i]] = data[i];
memcpy(data, sorted, len*sizeof(int));
free (space);
}
/* End of math sort section, begin insertion sort section */
{
for (i=1; i<len; i++) {
if (data[i] >= data[i-1]) continue;
temp = data[i];
data[i] = data[i-1];
for (j = i-2; temp < data[j]; j--) data[j+1] = data[j];
data[j+1] = temp;
}
}
}
Richard Harter, c...@tiac.net
http://home.tiac.net/~cri, http://www.varinoma.com
All my life I wanted to be someone;
I guess I should have been more specific.
You forgot <string.h>
What is the PROCESSED macro for?
I'm unfamiliar with a function that sometimes returns a value
and sometimes doesn't return a value. How do you use the retrun value?
space + len is undefined behavior when space == NULL.
--
pete
It has some problems.
If len is greater than or equal to MIN_SORTABLE_LENGTH, and
if all of the array elements are equal, then the sort crashes.
The sort isn't really very fast.
I have a simple shellsort which is about as fast.
I have a stable mergesort which is about as fast.
I have a quicksort which is more than twice as fast.
--
pete
>Richard Harter wrote:
>>
>> Here is a version of the math (distribution) sort for sorting
>> integers. Comments and suggestions for improvement are always
>> welcome. It is about three times as fast as qsort out of the box.
>
>It has some problems.
>
>If len is greater than or equal to MIN_SORTABLE_LENGTH, and
>if all of the array elements are equal, then the sort crashes.
Ouch. Quite right. The lines
if (ifac<=0) ifac=1;
while (((data_max-data_min)/ifac)>(len-1)) ifac++;
should read
if (ifac<=0) ifac=1;
else while (((data_max-data_min)/ifac)>(len-1)) ifac++;
Thanks for pointing this out.
>
>The sort isn't really very fast.
>I have a simple shellsort which is about as fast.
>I have a stable mergesort which is about as fast.
>I have a quicksort which is more than twice as fast.
That's a bit surprising; it doesn't quite accord with my results.
What sort of test bed are you using? Is there any chance I could look
at your code for comparison?
Thanks for the comments.
>Richard Harter wrote:
>>
>> Here is a version of the math (distribution) sort for sorting
>> integers. Comments and suggestions for improvement are always
>> welcome. It is about three times as fast as qsort out of the box.
>
>You forgot <string.h>
Why would I want it?
>
>What is the PROCESSED macro for?
An artifact that should have been deleted.
>
>I'm unfamiliar with a function that sometimes returns a value
>and sometimes doesn't return a value. How do you use the retrun value?
There should be a "return 1" at the end.
>
>space + len is undefined behavior when space == NULL.
And, of course, there should be a
if (!space) return 0;
after the malloc.
My bad. Thanks for the catches.
Prototypes for memset and memcpy.
<snip>
Try about 1000 numbers generated from the following:
data[i] = (int) pow(3.5, rand() % 18) + i;
and let us know how your histogram-sort does against shell-, merge-,
heap- or quick-sort.
--
Paul Hsieh
http://www.pobox.com/~qed/
http://bstring.sf.net/
> Here is a version of the math (distribution) sort for sorting
> integers. Comments and suggestions for improvement are always
> welcome. It is about three times as fast as qsort out of the box.
[snip detail]
I tried it but I could not time it as it does not sort properly for me
and I have a check on this in my timing routine.
For example it fails to sort the 130 integers on my system:
{ 6, 8, 0, 3, 6, 8, 3, 9, 6, 0,
8, 7, 9, 1, 2, 3, 4, 8, 1, 0,
9, 3, 9, 0, 6, 9, 9, 6, 5, 2,
8, 9, 7, 5, 8, 0, 1, 6, 3, 2,
3, 5, 8, 5, 1, 3, 6, 0, 5, 1,
5, 8, 7, 1, 4, 9, 3, 1, 7, 1,
7, 2, 2, 9, 2, 7, 9, 3, 6, 6,
7, 7, 0, 0, 8, 5, 6, 1, 3, 5,
5, 2, 6, 0, 6, 9, 0, 7, 3, 1,
9, 1, 7, 0, 7, 4, 1, 0, 7, 3,
0, 6, 4, 1, 8, 1, 0, 1, 8, 0,
2, 1, 0, 3, 6, 6, 5, 5, 8, 0,
8, 3, 2, 4, 8, 6, 7, 7, 0, 7
}
which should give:
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 4, 4, 4,
4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9
}
but gives:
{ 43, 57, 62, 73, 89, 104, 118, 130, 130, 130,
130, 130, 130, 130, 130, 130, 130, 130, 130, 130,
130, 130, 130, 130, 130, 130, 130, 130, 130, 130,
130, 130, 130, 130, 130, 130, 130, 130, 130, 130,
130, 130, 130, 130, 130, 130, 130, 130, 130, 130,
130, 130, 130, 130, 130, 130, 130, 130, 130, 130,
130, 130, 130, 130, 130, 130, 130, 130, 130, 130,
130, 130, 130, 130, 130, 130, 130, 130, 130, 130,
130, 130, 130, 130, 130, 130, 130, 130, 130, 130,
130, 130, 130, 130, 130, 130, 130, 130, 131, 132,
132, 132, 132, 135, 141, 142, 144, 144, 145, 146
}
on my system. I have applied the fix you suggested in another post.
I am happy to do more tests if you find the problem or if you cannot
duplicate my result.
Brian Gladman
> Here is a version of the math (distribution) sort for sorting
> integers. Comments and suggestions for improvement are always
> welcome. It is about three times as fast as qsort out of the box.
[snip]
I tracked the bug that has turned up on my system to the line:
> for (i=0;i<len;i++) cmp_index[i] = cum[cmp_index[i]]++;
The problem here on my system is that cmp_index[i] gets updated before
the ++ increment is done so the index into cum[] used for this increment
is the new value of cmp_index[i] that has just been calculated. With:
for(i = 0; i < len; i++)
{
j = cmp_index[i]; cmp_index[i] = cum[j]++;
}
all goes well.
I will leave it up to the C language police to decide whether this is a
bug in my compiler or whether the original invokes undefined behaviour
(but I suspect it is the latter).
Brian Gladman
> That's a bit surprising; it doesn't quite accord with my results.
> What sort of test bed are you using? Is there any chance I could look
> at your code for comparison?
I used the fix suggested by BRG.
/* BEGIN e_driver.c */
/*
** See e_driver.txt.
*/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <assert.h>
#include "e_type.h"
#include "e_sort.h"
#include "distributions.h"
#define MAX_N 1999990
#define MIN_N (MAX_N - 1)
#define LOOPS 1
/*
** increase LOOPS if times are too fast
*/
#define SHOW_TOTAL_TIMES_ONLY 0
#define SORT_LIST { \
{nosort}, /* Do NOT comment out or move nosort() */ \
{s3sort,"s3sort","Shellsort"}, \
{stable,"stable","Stable merge sort, SMALL_MERGE is " \
xstr(SMALL_MERGE)}, \
{itsort,"itsort","Introquicksort with insertionsort, " \
"SMALL_PARTITION is "xstr(SMALL_PARTITION)}, \
{ti7qsort,"ti7","ti7qsort, SMALL_PARTITION is " \
xstr(SMALL_PARTITION)}, \
{imsort,"imsort","Harter integer sort, int_msort()"}, \
/*
{hsortm,"hsortm","casual heap"}, \
{Q_sort,"qsort","standard library qsort"}, \
{si_sort,"SI","si_sort, stable insertion sort"}, \
{i_sort,"I","i_sort, nonstable insertion sort"}, \
*/}
#define DISTRIBUTION_LIST { \
{all_different,"All different keys, randomly shuffled"}, \
{constant,"Constant"}, \
{two,"Two values, Badly written qsorts choke on this"}, \
{ramp,"Ramp, Drives center pivot qsorts, quadratic"}, \
/*
{sorted,"Already sorted"}, \
{reverse,"Reverse sorted"}, \
{card_shuffle,"card_shuffle"}, \
{randArray,"32 bit RAND"}, \
{perverse,"Perverse"}, \
{x7fff,"0x7fff values"}, \
{twenty,"Twenty values"}, \
{ten,"Ten values"}, \
{five,"Five values"}, \
{camel,"Camel, for floating types"}, \
*/}
#define TOO_FAST 0.01 /* time in seconds */
#define N_INCREMENT (++N)
#define SORTS (sizeof s_L / sizeof *s_L)
#define DISTRIBUTIONS (sizeof d_L / sizeof *d_L)
#define str(s) # s
#define xstr(s) str(s)
struct sf {
void(*sortfunc)(e_type *, size_t);
char *name;
char *description;
double total_time;
};
double sort_time(struct sf *s_L, e_type **s, size_t nmemb,
size_t sort, long unsigned loops, int *sort_error);
void copyarray(e_type *s1, e_type *s2, size_t nmemb);
int comparray(e_type *s1, e_type *s2, size_t nmemb);
void free_ptrs(e_type **s, size_t nmemb);
int main(void)
{
e_type *s[3];
size_t N, sort, dist, array;
int sort_error;
double loop_time, s_time, time_max = 0.0;
struct sf s_L[] = SORT_LIST;
struct df {
void(*distfunc)(e_type *, size_t, long unsigned *);
char *string;
} d_L[] = DISTRIBUTION_LIST;
long unsigned seed[] = {
SEED_0, SEED_1, SEED_2,
SEED_3, SEED_4, SEED_5
};
const size_t distributions = DISTRIBUTIONS;
const size_t sorts_1 = SORTS - 1;
const long unsigned loops = LOOPS;
const int STTO = SHOW_TOTAL_TIMES_ONLY;
assert(s_L[0].sortfunc == nosort);
assert(10 > SORTS);
assert(SMALL_MERGE >= 1);
assert(SMALL_PARTITION >= 4);
for (array = 0; array != sizeof s / sizeof *s; ++array) {
s[array] = malloc(MAX_N * sizeof *s[array]);
if (s[array] == NULL) {
free_ptrs(s, array);
fputs("malloc failure in e_driver program.\n", stderr);
exit(EXIT_FAILURE);
}
}
puts("/* BEGIN e_driver.c program output */\n");
if (sorts_1 != 1) {
printf("Timing %lu different sorting functions.\n",
(long unsigned)sorts_1);
} else {
puts("Timing a sorting function.");
}
fputs("Sorting arrays of N number of elements", stdout);
if (distributions != 1) {
printf(",\nin %lu different distributions",
(long unsigned)distributions);
}
puts(
".\nThe elements are of type " xstr(E_TYPE)
".\nThe times shown are in seconds."
);
if (loops > 1) {
puts(
"The sorts are done " xstr(LOOPS) " times\nbecause "
"N is too small to give measurable time otherwise."
);
}
putchar('\n');
for (dist = 0; dist != DISTRIBUTIONS; ++dist) {
for (sort = 1; SORTS > sort; ++sort) {
s_L[sort].total_time = 0.0;
}
printf("Distribution #%lu: %s\n",
dist + 1LU, d_L[dist].string);
if (!STTO) {
fputs(" N ", stdout);
for (sort = 1; SORTS > sort; ++sort) {
printf("%9s", s_L[sort].name);
}
putchar('\n');
} else {
printf("MIN N %lu\nMAX N %lu\n",
(long unsigned)MIN_N, (long unsigned)MAX_N);
}
for (N = MIN_N; MAX_N >= N; N_INCREMENT) {
if (!STTO) {
printf("%7lu", (long unsigned)N);
}
d_L[dist].distfunc(s[2], N, seed);
loop_time = sort_time(s_L, s, N, 0, LOOPS, &sort_error);
for (sort = 1; SORTS > sort; ++sort) {
s_time = sort_time(s_L, s, N, sort, LOOPS, &sort_error);
switch (sort_error) {
case 0:
break;
case 1:
free_ptrs(s, array);
puts("\n\nARRAY OUT OF ORDER!\n");
exit(EXIT_SUCCESS);
break;
default:
free_ptrs(s, array);
puts(
"\n\nSORT DISCREPANCY "
"BETWEEN LAST TWO SORTS!\n"
);
exit(EXIT_SUCCESS);
break;
}
s_time -= loop_time;
s_L[sort].total_time += s_time;
if (!STTO) {
printf(" %f", s_time);
}
if (s_time > time_max) {
time_max = s_time;
}
}
if (!STTO) {
putchar('\n');
}
}
puts("Total times:");
for (sort = 1; SORTS > sort; ++sort) {
printf(
"%9s: %f %s\n",
s_L[sort].name,
s_L[sort].total_time,
s_L[sort].description
);
}
putchar('\n');
}
free_ptrs(s, array);
if (TOO_FAST > time_max) {
puts(
"\n#define LOOPS " xstr(LOOPS)
"\n#define TOO_FAST " xstr(TOO_FAST)
"\n\nTOO_FAST > time_max\n\n"
"Increase LOOPS, if times are too fast.\n"
);
}
puts("/* END e_driver.c program output */");
return 0;
}
double sort_time(struct sf *s_L, e_type **s, size_t nmemb,
size_t sort, long unsigned loops, int *sort_error)
{
size_t i;
clock_t start, stop;
start = clock();
while (start == clock()) {
;
}
start = clock();
while (loops-- != 0) {
copyarray(s[1], s[2], nmemb);
s_L[sort].sortfunc(s[1], nmemb);
}
stop = clock();
*sort_error = 0;
switch (sort) {
case 0:
break;
case 1:
for (i = 1; nmemb > i; ++i) {
if (GT(s[1] + i - 1, s[1] + i)) {
*sort_error = 1;
break;
}
}
copyarray(s[0], s[1], nmemb);
break;
default:
if (comparray(s[0], s[1], nmemb) != 0) {
*sort_error = 2;
}
break;
}
return (double)(stop - start) / CLOCKS_PER_SEC;
}
void copyarray(e_type *s1, e_type *s2, size_t nmemb)
{
while (nmemb-- != 0) {
*s1++ = *s2++;
}
}
int comparray(e_type *s1, e_type *s2, size_t nmemb)
{
while (nmemb-- != 0) {
if (GT(s2, s1)) {
return -1;
}
if (GT(s1, s2)) {
return 1;
}
++s1;
++s2;
}
return 0;
}
void free_ptrs(e_type **s, size_t nmemb)
{
s += nmemb;
while (nmemb-- != 0) {
free(*--s);
}
}
/* END e_driver.c */
--
pete
e_driver.c is a program for testing and timing
functions for sorting one dimmensional arrays of any type.
There are six files:
e_type.h
e_driver.c
e_sort.c
e_sort.h
distributions.c
distributions.h
e_type.h:
e_type.h is the file for the programmer
to define the array element type.
It is set up so that the type can be switched back and forth
between unsigned and long double, just by a changing
one character in a preprocessor directive.
There is an example program, ptr_sort.c,
showing how to use the sorting functions with nonarithmetic types,
outside of the e_driver program.
Inside of the e_driver program however,
distributions.c is set up to test arithmetic types only.
e_driver.c:
e_driver.c is the main file.
All of the macros from the top, down to N_INCREMENT,
are intended to be easy to change
in order to varry the test conditions, such as
which sorting functions are being tested,
and the number of elements in the array.
To test functions on small arrays,
you might want to try the following values:
#define SHOW_TOTAL_TIMES_ONLY 1
#define LOOPS 100000
#define MIN_N 1
#define MAX_N (MIN_N + 49)
/* Comments */, are used to select which sorting functions
and which array distributions are tested.
e_sort.c:
All of the sorting functions are in e_sort.c.
e_sort.c has some static functions and prototypes.
e_sort.c also has the BUF_BYTES macro for the mergesort function.
#define BUF_BYTES (128 * sizeof(int))
e_sort.h:
The prototypes and macros for the sorting functions in e_sort.c
are in e_sort.h.
Some of the sorting functions in e_sort.h,
might not be listed in the SORT_LIST macro in e_driver.c.
distributions.c:
These are the test array ordering functions.
distributions.h:
These are the test array ordering function prototypes,
and the intitial values for the prng macro in distributions.c.
/* END e_driver.txt */
--
pete
#define STRUCTURES 1 /* Either 1 or 0 */
#define SORT_FUNCTIONS { \
no_sort, "This is the original order of the test array:",\
si_sort, "Stable insertionsort:", \
s8sort, "Nonstable Shellsort, four is after " \
"nine, and eight is before three:", \
/*
q_sort, "Standard library qsort, " \
"unspecified ordering of equal keys:", \
*/}
#define NUMBERS { \
{"one"},{ "two"},{"three"},{"four"},{"five"}, \
{"six"},{"seven"},{"eight"},{"nine"},{ "ten"}, \
}
#define GT(A, B) (lencomp((A), (B)) > 0)
#define NMEMB (sizeof numbers / sizeof *numbers)
#define SORTS (sizeof s_F / sizeof *s_F)
#define str(s) # s
#define xstr(s) str(s)
#if STRUCTURES != 0
#define E_TYPE \
struct node { \
void *data; \
struct ds *next; \
}
#else
#define E_TYPE char *
#endif
typedef E_TYPE e_type;
int lencomp(const void *, const void *);
void no_sort(e_type *, size_t);
void si_sort(e_type *, size_t);
void s8sort(e_type *, size_t);
void q_sort(e_type *, size_t);
int main(void)
{
size_t element, sort;
struct sf {
void (*function)(e_type *, size_t);
const char *description;
} s_F[] = SORT_FUNCTIONS;
const e_type numbers[] = NUMBERS;
e_type copy[NMEMB];
puts("/* BEGIN output from ptr_sort.c */\n\nArrays of type "
xstr(E_TYPE) ",\nare being sorted by string length.");
for (sort = 0; sort != SORTS; ++sort) {
putchar('\n');
puts(s_F[sort].description);
memcpy(copy, numbers, sizeof copy);
s_F[sort].function(copy, NMEMB);
for (element = 0; element != NMEMB; ++element) {
#if STRUCTURES != 0
puts(copy[element].data);
#else
puts(copy[element]);
#endif
}
}
puts("\n/* END output from ptr_sort.c */");
return 0;
}
int lencomp(const void *a, const void *b)
{
#if STRUCTURES != 0
const size_t a_len = strlen((*(e_type*)a).data);
const size_t b_len = strlen((*(e_type*)b).data);
#else
const size_t a_len = strlen(*(e_type*)a);
const size_t b_len = strlen(*(e_type*)b);
#endif
return b_len > a_len ? -1 : a_len != b_len;
}
void no_sort(e_type *array, size_t nmemb)
{
array || nmemb;
}
void si_sort(e_type *array, size_t nmemb)
{
e_type temp, *base, *low, *high;
if (nmemb > 1) {
--nmemb;
base = array;
do {
low = array++;
if (GT(low, array)) {
high = array;
temp = *high;
do {
*high-- = *low;
if (low == base) {
break;
}
--low;
} while (GT(low, &temp));
*high = temp;
}
} while (--nmemb != 0);
}
}
void s8sort(e_type *array, size_t nmemb)
{
e_type temp, *i, *j, *k, *after;
after = array + nmemb;
if (nmemb > (size_t)-1 / 3) {
nmemb /= 3;
}
do {
for (i = array + nmemb; after != i; ++i) {
j = i - nmemb;
if (GT(j, i)) {
k = i;
temp = *k;
do {
*k = *j;
k = j;
if (nmemb + array > j) {
break;
}
j -= nmemb;
} while (GT(j, &temp));
*k = temp;
}
}
nmemb = nmemb != 2 ? 3 * nmemb / 8 : 1;
} while (nmemb != 0);
}
void q_sort(e_type *array, size_t nmemb)
{
qsort(array, nmemb, sizeof *array, lencomp);
}
/* END ptr_sort.c */
/* BEGIN output from ptr_sort.c */
Arrays of type struct node { void *data; struct node *next; },
are being sorted by string length.
This is the original order of the test array:
one
two
three
four
five
six
seven
eight
nine
ten
Stable insertionsort:
one
two
six
ten
four
five
nine
three
seven
eight
Nonstable Shellsort, four is after nine, and eight is before three:
one
two
six
ten
five
nine
four
eight
three
seven
/* END output from ptr_sort.c */
--
pete
#ifndef H_E_TYPE
#define H_E_TYPE
#if 1
#define E_TYPE /*/ unsigned /*/ int /**/
#define SMALL_PARTITION 50
#define SMALL_MERGE 40
#else
#define E_TYPE long double
#define SMALL_PARTITION 20
#define SMALL_MERGE 20
#endif
#define GT(A, B) (*(A) > *(B))
typedef E_TYPE e_type;
#endif
/*
** for itloop and issort and itsort and ti7qloop:
** Make SMALL_PARTITION bigger for faster e_type
** Make SMALL_PARTITION smaller for slower e_type
** eg. 50 for unsigned int, 20 for long double
** assert(SMALL_PARTITION >= 4);
** SMALL_PARTITION must be greater than or equal to 4
*/
/*
** for merge and stable:
** Make SMALL_MERGE bigger for faster e_type
** Make SMALL_MERGE smaller for slower e_type
** eg. 40 for unsigned int, 20 for long double
** assert(SMALL_MERGE >= 1);
** SMALL_MERGE must be greater than or equal to one
*/
/*
** E_TYPE can be changed to almost any type.
** If E_TYPE is a nonarithmetic type,
** like a structure or a pointer,
** then the comparison macros may need to become more complicated.
*/
/* END e_type.h */
--
pete
#ifndef H_E_SORT
#define H_E_SORT
#include <stddef.h>
#include "e_type.h"
void nosort(e_type *array, size_t nmemb);
void Q_sort(e_type *array, size_t nmemb);
void hsortm(e_type *array, size_t nmemb);
void s3sort(e_type *array, size_t nmemb);
void itsort(e_type *array, size_t nmemb);
void i_sort(e_type *array, size_t nmemb);
void si_sort(e_type *array, size_t nmemb);
void ti7qsort(e_type *array, size_t nmemb);
void stable(e_type *array, size_t nmemb);
void imsort(e_type *array, size_t nmemb);
#endif
/* END e_sort.h */
--
pete
#include <stdlib.h>
#include <limits.h>
#include <assert.h>
#include "e_sort.h"
#define BUF_BYTES (128 * sizeof(int))
#define LU_RAND_SEED 123456789LU
#define LU_RAND(S) ((S) * 69069 + 362437)
/*
** Use:
** #define LU_RAND(S) ((S) * 69069 + 362437)
** for better sorting
** Use:
** #define LU_RAND(S) ((S) * 69069 + 362437 & 0xffffffff)
** to avoid implementation defined behavior
*/
#define SWAP(A, B, T) ((void)((T) = *(A), *(A) = *(B), *(B) = (T)))
#define SORT_2(A, B, T) \
{ \
if (GT((A), (B))) { \
SWAP((A), (B), (T)); \
} \
}
#define SORT_3(A, B, C, T) \
if (GT((A), (B))) { \
(T) = *(A); \
if (GT((C), (B))) { \
*(A) = *(B); \
if (GT(&(T), (C))) { \
*(B) = *(C); \
*(C) = (T); \
} else { \
*(B) = (T); \
} \
} else { \
*(A) = *(C); \
*(C) = (T); \
} \
} else { \
if (GT((B), (C))) { \
if (GT((A), (C))) { \
(T) = *(A); \
*(A) = *(C); \
*(C) = *(B); \
*(B) = (T); \
} else { \
SWAP((B), (C), (T)); \
} \
} \
}
static int compar(const void *, const void *);
static void issort(e_type *, size_t);
static void itloop(e_type *, size_t, size_t);
static void merge(e_type *, e_type *, size_t);
static int mgsort(e_type *, size_t);
static int sorted(e_type *, size_t);
static int rev_sorted(e_type *, size_t);
static long unsigned ti7qloop(e_type *, size_t, size_t, size_t);
static void rev_array(e_type *, size_t);
static int int_msort(int * data, int len);
static int compar(const void *arg1, const void *arg2)
{
return GT((e_type*)arg2, (e_type*)arg1)
? -1 : GT((e_type*)arg1, (e_type*)arg2);
}
void nosort(e_type *array, size_t nmemb)
{
array || nmemb;
}
void si_sort(e_type *array, size_t nmemb)
{
e_type temp, *base, *low, *high;
if (nmemb > 1) {
--nmemb;
base = array;
do {
low = array++;
if (GT(low, array)) {
high = array;
temp = *high;
do {
*high-- = *low;
if (low == base) {
break;
}
--low;
} while (GT(low, &temp));
*high = temp;
}
} while (--nmemb != 0);
}
}
void i_sort(e_type *array, size_t nmemb)
{
e_type temp, *first, *middle;
size_t counter;
if (nmemb > 1) {
first = middle = 1 + array;
counter = --nmemb;
while (--counter != 0) {
++first;
if (GT(middle, first)) {
middle = first;
}
}
if (GT(array, middle)) {
SWAP(array, middle, temp);
}
++array;
while (--nmemb != 0) {
first = array++;
if (GT(first, array)) {
middle = array;
temp = *middle;
do {
*middle-- = *first--;
} while (GT(first, &temp));
*middle = temp;
}
}
}
}
void s3sort(e_type *array, size_t nmemb)
{
e_type temp, *i, *j, *k, *after;
after = array + nmemb;
if (nmemb > (size_t)-1 / 3) {
nmemb /= 3;
}
do {
for (i = array + nmemb; i != after; ++i) {
j = i - nmemb;
if (GT(j, i)) {
k = i;
temp = *k;
do {
*k = *j;
k = j;
if (nmemb + array > j) {
break;
}
j -= nmemb;
} while (GT(j, &temp));
*k = temp;
}
}
nmemb = nmemb != 2 ? 3 * nmemb / 7 : 1;
} while (nmemb != 0);
}
#define SIFTDOWNM(A, I, N, P, C, T) \
{ \
(P) = (I); \
(T) = (A)[(P)]; \
(C) = (P) * 2; \
while ((N) > (C)) { \
if (GT((A) + (C) + 1, (A) + (C))) { \
++(C); \
} \
if (GT((A) + (C), &(T))) { \
(A)[(P)] = (A)[(C)]; \
(P) = (C); \
(C) *= 2; \
} else { \
break; \
} \
} \
if ((N) == (C) && GT((A) + (C), &(T))) { \
(A)[(P)] = (A)[(C)]; \
(P) = (C); \
} \
(A)[(P)] = (T); \
}
void hsortm(e_type *array, size_t nmemb)
{
size_t i, child, parent;
e_type temp;
if (nmemb > 1) {
i = --nmemb / 2;
do {
SIFTDOWNM(array, i, nmemb, parent, child, temp);
} while (i-- != 0);
SWAP(array, array + nmemb, temp);
while (--nmemb != 0) {
SIFTDOWNM(array, 0, nmemb, parent, child, temp);
SWAP(array, array + nmemb, temp);
}
}
}
static void itloop(e_type *array, size_t nmemb, size_t d)
{
e_type temp, *i, *j;
assert(SMALL_PARTITION >= 4);
while (nmemb > SMALL_PARTITION) {
if (!d--) {
hsortm(array, nmemb);
return;
}
i = nmemb-- / 2 + array;
SWAP(array, i, temp);
i = 1 + array;
j = nmemb + array;
SORT_3(i, array, j, temp);
do {
++i;
} while (GT(array, i));
do {
--j;
} while (GT(j, array));
while (j > i) {
SWAP(j, i, temp);
do {
++i;
} while (GT(array, i));
do {
--j;
} while (GT(j, array));
}
SWAP(array, j, temp);
itloop(j + 1, nmemb + array - j, d);
nmemb = j - array;
}
}
void itsort(e_type *array, size_t nmemb)
{
size_t d, n;
d = 2;
for (n = nmemb / 4; n != 0; n /= 2) {
++d;
}
itloop(array, nmemb, 2 * d);
issort(array, nmemb);
}
static void issort(e_type *array, size_t nmemb)
{
e_type temp, *last, *first, *middle, *small;
if (nmemb > 1) {
middle = 1 + array;
first = middle;
last = nmemb - 1 + array;
small = nmemb > SMALL_PARTITION
? array + SMALL_PARTITION : last;
while (first != small) {
++first;
if (GT(middle, first)) {
middle = first;
}
}
if (GT(array, middle)) {
SWAP(array, middle, temp);
}
++array;
while (array != last) {
first = array++;
if (GT(first, array)) {
middle = array;
temp = *middle;
do {
*middle-- = *first--;
} while (GT(first, &temp));
*middle = temp;
}
}
}
}
void ti7qsort(e_type *array, size_t nmemb)
{
size_t d, n;
if (nmemb > 1 && !sorted(array, nmemb)) {
if (!rev_sorted(array, nmemb)) {
d = 2;
for (n = nmemb / 4; n; n /= 2) {
++d;
}
ti7qloop(array, nmemb, d * 2, LU_RAND_SEED);
} else {
rev_array(array, nmemb);
}
}
}
static int sorted(e_type *array, size_t nmemb)
{
while (--nmemb != 0 && !GT(array, array + 1)) {
++array;
}
return !nmemb;
}
static int rev_sorted(e_type *array, size_t nmemb)
{
while (--nmemb != 0 && !GT(array + 1, array)) {
++array;
}
return !nmemb;
}
static void rev_array(e_type *array, size_t nmemb)
{
e_type temp, *end;
for (end = array + nmemb; --end > array; ++array) {
SWAP(array, end, temp);
}
}
static long unsigned
ti7qloop(e_type *array, size_t nmemb, size_t d, size_t seed)
{
e_type temp, *first, *last;
assert(SMALL_PARTITION >= 4);
while (nmemb > SMALL_PARTITION) {
if (sorted(array, nmemb)) {
return seed;
}
if (!d--) {
hsortm(array, nmemb);
return seed;
}
seed = LU_RAND(seed);
first = seed % --nmemb + array;
SWAP(array, first, temp);
first = 1 + array;
last = nmemb + array;
SORT_3(first, array, last, temp);
do {
++first;
} while (GT(array, first));
do {
--last;
} while (GT(last, array));
while (last > first) {
SWAP(last, first, temp);
do {
++first;
} while (GT(array, first));
do {
--last;
} while (GT(last, array));
}
SWAP(array, last, temp);
seed = ti7qloop(last + 1, nmemb + array - last, d, seed);
nmemb = last - array;
}
i_sort(array, nmemb);
return seed;
}
void stable(e_type *array, size_t nmemb)
{
assert(SMALL_MERGE >= 1);
if (!(nmemb > SMALL_MERGE && mgsort(array, nmemb))) {
/*
** If mgsort returns zero,
** then you might want to handle it differently.
*/
si_sort(array, nmemb);
}
}
static int mgsort(e_type *array, size_t nmemb)
{
e_type *buff;
e_type temp_array[BUF_BYTES > sizeof *array
? BUF_BYTES / sizeof *array : 1];
if (nmemb / 2 > sizeof temp_array / sizeof *temp_array) {
buff = malloc(nmemb / 2 * sizeof *buff);
if (buff != NULL) {
merge(array, buff, nmemb);
free(buff);
} else {
return 0;
}
} else {
merge(array, temp_array, nmemb);
}
return 1;
}
static void merge(e_type *base, e_type *buff, size_t nmemb)
{
assert(SMALL_MERGE >= 1);
if (nmemb > SMALL_MERGE) {
e_type *mid_ptr;
size_t const half = nmemb / 2;
e_type *const middle = base + half;
e_type *const after_buff = buff + half;
e_type *const after_array = base + nmemb;
merge(middle, buff, nmemb - half);
merge(base, buff, half);
while (!GT(base, middle) && middle != base) {
++base;
}
buff = after_buff;
mid_ptr = middle;
while (base != mid_ptr) {
*--buff = *--mid_ptr;
}
mid_ptr = middle;
while (middle != base) {
*base++ = *(GT(buff, mid_ptr) ? mid_ptr++ : buff++);
}
while (after_buff != buff && after_array != mid_ptr) {
*base++ = *(GT(buff, mid_ptr) ? mid_ptr++ : buff++);
}
while (after_buff != buff) {
*base++ = *buff++;
}
} else {
si_sort(base, nmemb);
}
}
void Q_sort(e_type *array, size_t nmemb)
{
qsort(array, nmemb, sizeof *array, compar);
}
#include <string.h>
#define MIN_SORTABLE_LENGTH 128
void imsort(e_type *array, size_t nmemb)
{
assert(INT_MAX >= nmemb);
int_msort(array, nmemb);
}
if (ifac<=0) ifac=1;
while (((data_max-data_min)/ifac)>(len-1)) ifac++;
/* allocate index and histogram space */
space = malloc((2*len+1)*sizeof(int));
if (!space) return 0;
cmp_index = space;
cum = space + len;
hist = cum + 1;
sorted = hist;
memset(cum,0,(len+1)*sizeof(int));
/* Compute raw interpolation indices */
for (i=len; --i>=0;) {
hist[cmp_index[i] = (data[i] - data_min)/ifac] += 1;
}
for (i=1;i<len;i++) cum[i] += cum[i-1];
for (i = 0; i < len; i++) {
j = cmp_index[i];
cmp_index[i] = cum[j]++;
}
/* Math sort proper */
for (i=len; --i >= 0;) sorted[cmp_index[i]] = data[i];
memcpy(data, sorted, len*sizeof(int));
free (space);
}
/* End of math sort section, begin insertion sort section */
{
for (i=1; i<len; i++) {
if (data[i] >= data[i-1]) continue;
temp = data[i];
data[i] = data[i-1];
for (j = i-2; temp < data[j]; j--) data[j+1] = data[j];
data[j+1] = temp;
}
}
return 1;
}
/* END e_sort.c */
--
pete
#ifndef H_DISTRIBUTIONS
#define H_DISTRIBUTIONS
#include <stddef.h>
#include "e_type.h"
#define SEED_0 0
#define SEED_1 123456789
#define SEED_2 362436069
#define SEED_3 521288629
#define SEED_4 88675123
#define SEED_5 886756453
void all_different(e_type *a, size_t n, long unsigned *seed);
void randArray(e_type *a, size_t n, long unsigned *seed);
void x7fff(e_type *a, size_t n, long unsigned *seed);
void two(e_type *, size_t n, long unsigned *seed);
void five(e_type *a, size_t n, long unsigned *seed);
void ten(e_type *a, size_t n, long unsigned *seed);
void twenty(e_type *a, size_t n, long unsigned *seed);
void sorted(e_type *a, size_t n, long unsigned *seed);
void reverse(e_type *a, size_t n, long unsigned *seed);
void constant(e_type *a, size_t n, long unsigned *seed);
void ramp(e_type *a, size_t n, long unsigned *seed);
void perverse(e_type *a, size_t n, long unsigned *seed);
void camel(e_type *a, size_t n, long unsigned *seed);
void card_shuffle(e_type *a, size_t n, long unsigned *seed);
#endif
/* END distributions.h */
--
pete
#include "distributions.h"
#define PERVERSE_CASES 4
#define RAND_32LU(S) \
(S[0] = S[1] ^ S[1] >> 7, \
S[1] = S[2], \
S[2] = S[3], \
S[3] = S[4], \
S[4] = S[5], \
S[5] =(S[5] ^ S[5] << 6 \
^ S[0] ^ S[0] << 13) & 0xffffffff, \
S[5] *(S[2] + S[2] + 1) & 0xffffffff \
)
void all_different(e_type *array, size_t n, long unsigned *seed)
{
size_t i, r;
array[0] = 0;
for (i = 1; n > i; ++i) {
r = RAND_32LU(seed) % (i + 1);
array[i] = 0;
array[i] = array[r];
array[r] = i;
}
}
void two(e_type *a, size_t n, long unsigned *seed)
{
while (n--) {
*a++ = RAND_32LU(seed) % 2;
}
}
void sorted(e_type *a, size_t n, long unsigned *seed)
{
a += n;
while (n--) {
*--a = n;
}
seed;
}
void reverse(e_type *a, size_t n, long unsigned *seed)
{
while (n--) {
*a++ = n;
}
seed;
}
void constant(e_type *a, size_t n, long unsigned *seed)
{
while (n--) {
*a++ = 6;
}
seed;
}
void ramp(e_type *a, size_t n, long unsigned *seed)
{
const size_t N = n;
const size_t mid = N / 2;
while (n != mid) {
*a++ = N - n--;
}
while (n) {
*a++ = n--;
}
seed;
}/* ramp makes center pivot qsorts go quadratic */
void perverse(e_type *a, size_t n, long unsigned *seed)
{
e_type number;
while (n--) {
number = RAND_32LU(seed) % PERVERSE_CASES;
*a++ = number == PERVERSE_CASES - 1 ? RAND_32LU(seed) : number;
}
}
void camel(e_type *a, size_t n, long unsigned *seed)
{
double const increment = 1.0 / n;
a += n;
while (n--) {
*--a = (e_type)(n % 2 ? 10 - n * increment : n * increment);
}
seed;
}
void card_shuffle(e_type *array, size_t n, long unsigned *seed)
{
const size_t half = n / 2;
array += n;
while (n--) {
*--array = n & 1 ? n / 2 + half : n / 2;
}
seed;
}
void randArray(e_type *a, size_t n, long unsigned *seed)
{
while (n--) {
*a++ = RAND_32LU(seed);
}
}
void x7fff(e_type *a, size_t n, long unsigned *seed)
{
while (n--) {
*a++ = RAND_32LU(seed) % 0x7fff;
}
}
void five(e_type *a, size_t n, long unsigned *seed)
{
while (n--) {
*a++ = RAND_32LU(seed) % 5;
}
}
void ten(e_type *a, size_t n, long unsigned *seed)
{
while (n--) {
*a++ = RAND_32LU(seed) % 10;
}
}
void twenty(e_type *a, size_t n, long unsigned *seed)
{
while (n--) {
*a++ = RAND_32LU(seed) % 20;
}
}
/* END distributions.c */
--
pete
Well, it doesn't make much sense timing a sorting routine that contains
calls to malloc() and free(). These operating system calls are usually very
expensive.
I suggest you replace malloc() with alloca() and remove the free(). This
allocates space on the stack instead of on the heap, and is much faster.
-Michael.
P.S. Could you please enlighten my slow-thinking head with an explanation of
the algorithm. I looked at your code, but could not figure out what's going
on :-(
>> > The sort isn't really very fast.
>> > I have a simple shellsort which is about as fast.
>> > I have a stable mergesort which is about as fast.
>> > I have a quicksort which is more than twice as fast.
>>
>> That's a bit surprising; it doesn't quite accord with my results.
>> What sort of test bed are you using? Is there any chance I could look
>> at your code for comparison?
>
>Try about 1000 numbers generated from the following:
>
>data[i] = (int) pow(3.5, rand() % 18) + i;
>
>and let us know how your histogram-sort does against shell-, merge-,
>heap- or quick-sort.
Chortle. Quite right. Even better, let the first n-1 numbers be the
descending sequence {n-1,...,1} and the last be n squared. That will
force O(n^2) performance. The caveat, and it is a big one, is that
the distribution is not badly skewed.
In a more general version one can recursively sort the bins; that will
have O(n*log n) in the worst case.
I guess you wanted this from Richard but I ended up tracking down a
problem and figured out how it works as I went along as a byproduct.
It first works out the minimum and maximum integers in the input array.
Then it divides this range into a number of 'buckets' and counts the
number of integers in each bucket (in an auxilliary array).
It then adds up the number of items in the buckets _below_ each such
bucket so that the resulting count will be the position in the final
output array of the _first_ value in the bucket.
Another array of the same length as the input is then used as follows.
The input array is scanned and the bucket for each item is computed.
The 'low count' for this bucket is then used to store this item in the
auxilliary array and the 'low count' for this bucket is then
incremented. The end result of this process is that all items will be in
their right bucket ranges within this auxilliary array but they will not
be properly ordered within each of these ranges. So finally the array
is sorted (I think with an insertion sort) and the result is copied to
the input array.
I hope I have this right - I also hope that Richard will tell us if I
have screwed this up :-)
Brian Gladman
>
>"Richard Harter" <c...@tiac.net> wrote in message
>news:41e95ff2....@news.sbtc.net...
>> On Sat, 15 Jan 2005 13:06:36 GMT, pete <pfi...@mindspring.com> wrote:
>>
>> >The sort isn't really very fast.
>> >I have a simple shellsort which is about as fast.
>> >I have a stable mergesort which is about as fast.
>> >I have a quicksort which is more than twice as fast.
>>
>> That's a bit surprising; it doesn't quite accord with my results.
>> What sort of test bed are you using? Is there any chance I could look
>> at your code for comparison?
>
>Well, it doesn't make much sense timing a sorting routine that contains
>calls to malloc() and free(). These operating system calls are usually very
>expensive.
>
>I suggest you replace malloc() with alloca() and remove the free(). This
>allocates space on the stack instead of on the heap, and is much faster.
It has *one* call to malloc and to free, not calls. Malloc costs, but
in this context the cost is small. The trouble with allocating space
on the stack (at least it used to be) is that can blow out the stack.
>
>-Michael.
>
>P.S. Could you please enlighten my slow-thinking head with an explanation of
>the algorithm. I looked at your code, but could not figure out what's going
>on :-(
Okay, let me give an example and work from there; my apologies if this
is a bit long winded.
Suppose we have an array with three different values in it that looks
like this:
Address: 0 1 2 3 4 5 6 7 8
Content: 1 3 3 3 2 1 3 2 1
The general idea is that we are going to compute an index for each
element that gives the final location of the element in the sorted
array. In this case the array of indices will be:
Indices: 0 5 6 7 3 1 7 4 2
That is, the 1's will occupy locations 0-2, the 2's locations 3-4, and
the 3's locations 5-8. How do we get there?
What we can do is construct a formula that gives the approximate
location of each element in the sorted array. IF (and this is an
important caveat as Paul indirectly points out) the data is
approximately evenly distributed an inverse interpolation will be a
good formula. In this case the inverse interpolation formula is
location = 3*datum - 3
which gives us computed indices of:
Computed: 0 6 6 6 3 0 6 3 0
which is sort of what we want but not really. However we can think of
the computed indices as bin numbers and count how many elements are in
each bin to get a histogram. Thus:
Bin numbr: 0 1 2 3 4 5 6 7 8
Bin count: 3 0 0 2 0 0 4 0 0
We can convert the histogram into a cumulative distribution by
cumulatively summing the counts. This gives us:
Bin number: 0 1 2 3 4 5 6 7 8
Cumulative: 0 3 3 3 5 5 5 9 9
The neat thing here is that the cumulative distribution gives the
initial address of the bins in the sorted array. Thus the 1's start
at address 0, the 2's start at address 1, and the 3's start at address
5. Note that there are some duplicates; they correspond to empty
bins.
Patience, we are almost there. We can now walk through the array of
computed indices and determine what the true indices are. To get a
true index we find the computed bin index and look in that slot in the
cumulative for the address of the next element to go in that bin. We
replace the computed index by the true index, and advance the value in
the cumulative bin so that it points to the next element to go in that
bin.
Since we are stepping forward through the array the sort is a stable
sort; if we did the permutation in place it wouldn't be stable.
This algorithm is an "almost sorted" algorithm; the elements within a
bin aren't sorted. This particular variant does an insertion sort as
a cleanup. If the data is reasonably distributed, i.e., if there
aren't any very large bins, it is an O(n) sort.
I hope all of this is clear.
It sounds like a compiler bug to me. In the following code
x = y++;
x should get the value of y before it is incremented and not after.
Be that as it may, your workaround shouldn't cost any more (at least
if you allocate the temp within the block.)
Thanks for checking.
>Richard Harter wrote:
>
>> That's a bit surprising; it doesn't quite accord with my results.
>> What sort of test bed are you using? Is there any chance I could look
>> at your code for comparison?
>
>I used the fix suggested by BRG.
Thanks for posting the code; I will download it, experiment, and
report back in a day or three.
>Michael Jørgensen wrote:
>>
>> P.S. Could you please enlighten my slow-thinking head with an explanation of
>> the algorithm. I looked at your code, but could not figure out what's going
>> on :-(
[snip]
>
>I guess you wanted this from Richard but I ended up tracking down a
>problem and figured out how it works as I went along as a byproduct.
[snip]
Very good.
[snip]
>>I will leave it up to the C language police to decide whether this is a
>>bug in my compiler or whether the original invokes undefined behaviour
>>(but I suspect it is the latter).
>
>
> It sounds like a compiler bug to me. In the following code
>
> x = y++;
>
> x should get the value of y before it is incremented and not after.
> Be that as it may, your workaround shouldn't cost any more (at least
> if you allocate the temp within the block.)
Yes but the implication of this in an expression of the form:
f[i] = g[ f[i] ]++
is not entirely obvious. After g[ f[i] ] has been calculated, there are
two further operations to be done:
(1) update f[i] on the left hand side, and
(2) post increment g[ f[i] ]
and the result is different depending on the order in which these two
operations are done.
I am not an expert on the C language specification but I don't think
that we are entitled to expect that (2) is necessarily done before (1).
In fact I am not sure that we can expect any defined order and it is
this that leads to my suspicion that the expression is undefined.
Brian Gladman
> I tracked the bug that has turned up on my system to the line:
>
> > for (i=0;i<len;i++) cmp_index[i] = cum[cmp_index[i]]++;
>
> The problem here on my system is that cmp_index[i] gets updated before
> the ++ increment is done so the index into cum[] used for this increment
> is the new value of cmp_index[i] that has just been calculated. With:
>
> for(i = 0; i < len; i++)
> {
> j = cmp_index[i]; cmp_index[i] = cum[j]++;
> }
>
> all goes well.
>
> I will leave it up to the C language police to decide whether this is a
> bug in my compiler or whether the original invokes undefined behaviour
> (but I suspect it is the latter).
You are correct: it is undefined in C because the prior value of a
stored object (cmp_index[i]) is read for purposes other than
determining the new value, namely specifying the index of cum[],
between the sequence points bounding the object storage.
Thad
Thanks - it certainly looked as if it would fall foul of this rule.
Brian Gladman
The binning process is an O(n) process, but produces 'b' bins which are in the correct order but internally are not sorted except in some specific circumstances. If b is larger or equal to the number of different values 'd' in the input (not the length) then you have an O(n) sort. The smaller 'b' is compared to 'd' the more approximate the sort.
For cases when b<<d, then either this process must be applied to each bin again [O(n)+O(bn)=O(bn)], or some other sort needs to be applied to the bin [O(n)+O(b * (n/b) * log n/b) = O(n)+O(n * (log n - log b)) = O(n log n - log b)]
Richard, have you considered using the known range of values in a bin after the first pass to "know" how many bins you need so that applying the same process again for a bin must be O(num_items_in_bin)? This way you would have O(n) sort using O(MAX[bin_range]) additional storage if I've done my sums right (which, I doubt!)
Ian Woods
Ouch. Does this mean that the assignment operator does not create a
sequence point? Does this mean that
a = x[a];
also invokes undefined behaviour? Is there a difference between
a = x[a]++;
and
a[i] = x[a[i]]++;
No, since a is read only for computing a new value of a.
> Is there a difference between
>
> a = x[a]++;
>
> and
> a[i] = x[a[i]]++;
No.
Thad
> Ouch. Does this mean that the assignment operator does not create a
> sequence point?
Yes, it means such.
Thad
You might consider using introspective sort (IntroSort).
My implementation (using ternary quicksort) is at:
www.michael-maniscalco.com/introsort.zip
- Michael
Richard Harter wrote:
> Here is a version of the math (distribution) sort for sorting
> integers. Comments and suggestions for improvement are always
> welcome. It is about three times as fast as qsort out of the box.
>
> /*
> * Copyright (c) 2004 by Richard Harter.
> * This software may be freely used and modified without
> * restriction provided that this notice is preserved unaltered.
> *
> */
>
> /*
> * This function sorts an array of integers in place using a math
> * sort based histogram sort. 2*len locations of auxilliary space
> * are used, where len is the data length.
> */
>
> #include <stdlib.h>
>
> #define MIN_SORTABLE_LENGTH 128
> #define PROCESSED -1
> cmp_index = space;
> cum = space + len;
> hist = cum + 1;
> sorted = hist;
> memset(cum,0,(len+1)*sizeof(int));
>
> /* Compute raw interpolation indices */
>
> for (i=len; --i>=0;) {
> hist[cmp_index[i] = (data[i] - data_min)/ifac] += 1;
> }
> for (i=1;i<len;i++) cum[i] += cum[i-1];
> for (i=0;i<len;i++) cmp_index[i] = cum[cmp_index[i]]++;
>
> /* Math sort proper */
>
> for (i=len; --i >= 0;) sorted[cmp_index[i]] = data[i];
> memcpy(data, sorted, len*sizeof(int));
> free (space);
> }
>
> /* End of math sort section, begin insertion sort section */
>
> {
> for (i=1; i<len; i++) {
> if (data[i] >= data[i-1]) continue;
> temp = data[i];
> data[i] = data[i-1];
> for (j = i-2; temp < data[j]; j--) data[j+1] = data[j];
> data[j+1] = temp;
> }
> }
> }
> Hi Richard
>
> You might consider using introspective sort (IntroSort).
> My implementation (using ternary quicksort) is at:
>
> www.michael-maniscalco.com/introsort.zip
Your implementation is very nice - thanks for sharing it.
I have a variation built around the use of the functional templates in
the C++ STL if you (or anyone else) is interested.
Brian Gladman
Sure. Can you post a link to it?
- Michael
Machine A:
Performance: 333 Mhz
Memory: 192 Mb
OS: Windows 98
Environment: DJGPP
Compiler: gcc -O2
Sorts Compared:
imsort: Harter integer sort, int_msort()
s3sort: Shellsort
stable: Stable merge sort, SMALL_MERGE is 40
itsort: Introquicksort with insertionsort, SMALL_PARTITION is 50
ti7: ti7qsort, SMALL_PARTITION is 50
Distribution #1: Ramp, Drives center pivot qsorts, quadratic
N loops imsort s3sort stable itsort ti7
250 40000 1.750 1.920 2.080 2.030 2.190
500 20000 1.710 2.040 2.200 2.420 2.480
1000 10000 1.870 2.210 2.250 2.750 2.530
2000 5000 2.200 2.470 2.530 3.240 2.800
5000 2000 2.310 3.180 2.910 4.730 3.130
10000 1000 2.470 3.400 3.130 5.610 3.180
20000 500 2.970 3.620 3.300 6.590 3.400
50000 200 4.610 4.180 3.460 7.960 3.790
100000 100 5.550 6.260 3.950 9.340 4.390
200000 50 6.260 8.900 4.560 11.590 4.730
500000 20 6.360 10.370 5.590 14.110 5.710
1000000 10 6.430 10.990 6.100 15.550 6.150
2000000 5 6.430 11.820 6.650 17.310 7.140
Distribution #2: All different keys, randomly shuffled
N loops imsort s3sort stable itsort ti7
250 40000 1.770 4.010 3.250 2.530 2.530
500 20000 1.810 4.780 3.950 2.800 2.860
1000 10000 2.100 5.600 4.240 3.020 3.190
2000 5000 2.200 6.480 4.840 3.460 3.570
5000 2000 3.130 7.900 5.610 3.790 3.950
10000 1000 4.160 8.950 6.260 4.220 4.390
20000 500 5.050 9.670 6.810 4.500 4.730
50000 200 6.640 11.810 7.750 5.100 5.170
100000 100 7.850 13.730 8.630 5.820 5.660
200000 50 9.550 16.690 9.500 6.310 6.310
500000 20 11.590 20.270 10.980 7.410 7.250
1000000 10 12.180 22.410 12.070 7.680 7.580
2000000 5 12.920 25.000 13.190 8.300 8.190
Distribution #3: 32 bit RAND
N loops imsort s3sort stable itsort ti7
250 40000 2.030 4.330 3.290 2.520 2.520
500 20000 1.970 4.830 3.780 2.740 2.910
1000 10000 2.040 5.660 4.240 2.970 3.190
2000 5000 2.470 6.540 4.830 3.400 3.630
5000 2000 3.190 8.520 5.660 3.910 4.070
10000 1000 4.230 8.840 6.210 4.110 4.340
20000 500 5.330 9.830 7.030 4.840 4.830
50000 200 7.360 11.480 7.580 4.990 5.110
100000 100 7.740 13.510 8.520 5.540 5.610
200000 50 9.010 16.640 9.500 6.260 6.210
500000 20 10.720 19.840 10.770 6.970 6.920
1000000 10 11.870 21.970 11.970 7.580 7.580
2000000 5 12.470 24.560 12.960 8.140 8.240
Distribution #4: Perverse
N loops imsort s3sort stable itsort ti7
250 40000 3.250 2.420 2.750 1.540 1.770
500 20000 4.450 2.870 3.020 1.760 1.930
1000 10000 4.610 3.300 3.520 1.920 2.030
2000 5000 3.680 3.290 3.850 1.980 1.920
5000 2000 4.130 4.230 4.400 2.310 2.040
10000 1000 4.560 4.390 4.780 2.420 2.080
20000 500 5.540 4.940 5.270 2.630 2.190
50000 200 7.310 5.610 5.990 3.030 2.360
100000 100 9.390 7.250 6.760 3.400 2.580
200000 50 10.430 10.110 7.580 4.060 3.020
500000 20 11.040 12.250 8.900 4.720 3.510
1000000 10 11.260 13.070 9.830 5.220 3.730
2000000 5 11.700 14.220 10.870 5.720 4.060
Machine B:
Performance: 2.65 Ghz
Memory: 512 Mb
OS: Windows XP
Environment: DJGPP
Compiler: gcc -O2
Sorts Compared:
imsort: Harter integer sort, int_msort()
s3sort: Shellsort
stable: Stable merge sort, SMALL_MERGE is 40
itsort: Introquicksort with insertionsort, SMALL_PARTITION is 50
ti7: ti7qsort, SMALL_PARTITION is 50
Distribution #1: Ramp, Drives center pivot qsorts, quadratic
N loops imsort s3sort stable itsort ti7
250 40000 0.280 0.330 0.270 0.330 0.330
500 20000 0.270 0.330 0.330 0.330 0.390
1000 10000 0.330 0.330 0.270 0.390 0.440
2000 5000 0.320 0.390 0.270 0.500 0.440
5000 2000 0.330 0.380 0.390 0.660 0.490
10000 1000 0.330 0.440 0.380 0.830 0.440
20000 500 0.280 0.390 0.330 0.890 0.440
50000 200 0.500 0.490 0.330 1.040 0.550
100000 100 0.490 0.540 0.380 1.090 0.550
200000 50 0.490 0.610 0.380 1.370 0.550
500000 20 0.500 0.660 0.440 1.650 0.600
1000000 10 0.540 0.820 0.540 1.870 0.710
2000000 5 0.540 0.870 0.600 2.080 0.770
Distribution #2: All different keys, randomly shuffled
N loops imsort s3sort stable itsort ti7
250 40000 0.330 0.710 0.490 0.280 0.330
500 20000 0.380 0.830 0.600 0.390 0.380
1000 10000 0.280 0.940 0.610 0.390 0.500
2000 5000 0.380 1.160 0.710 0.610 0.660
5000 2000 0.380 1.370 0.830 0.660 0.710
10000 1000 0.440 1.480 0.940 0.710 0.770
20000 500 0.340 1.540 0.940 0.770 0.780
50000 200 0.550 1.860 1.160 0.880 0.880
100000 100 0.720 1.980 1.220 0.880 0.990
200000 50 1.970 2.190 1.310 0.990 1.040
500000 20 3.010 2.630 1.430 1.090 1.150
1000000 10 3.300 2.970 1.550 1.100 1.150
2000000 5 3.450 3.290 1.640 1.210 1.250
Distribution #3: 32 bit RAND
N loops imsort s3sort stable itsort ti7
250 40000 0.390 0.710 0.440 0.330 0.330
500 20000 0.380 0.880 0.610 0.320 0.390
1000 10000 0.340 0.940 0.610 0.390 0.500
2000 5000 0.430 1.160 0.710 0.610 0.600
5000 2000 0.320 1.310 0.770 0.650 0.600
10000 1000 0.340 1.490 0.830 0.660 0.770
20000 500 0.440 1.600 0.980 0.830 0.820
50000 200 0.550 1.870 1.150 0.830 0.930
100000 100 0.650 1.970 1.210 0.920 0.930
200000 50 1.480 2.240 1.260 0.980 1.040
500000 20 2.630 2.580 1.420 1.040 1.090
1000000 10 3.180 2.850 1.480 1.150 1.200
2000000 5 3.450 3.180 1.640 1.210 1.260
Distribution #4: Perverse
N loops imsort s3sort stable itsort ti7
250 40000 0.430 0.440 0.390 0.220 0.270
500 20000 0.500 0.490 0.440 0.220 0.280
1000 10000 0.710 0.550 0.490 0.280 0.330
2000 5000 0.550 0.550 0.550 0.380 0.330
5000 2000 0.550 0.600 0.610 0.380 0.390
10000 1000 0.550 0.650 0.660 0.390 0.380
20000 500 0.610 0.710 0.660 0.440 0.440
50000 200 0.710 0.770 0.770 0.490 0.390
100000 100 0.830 0.830 0.890 0.440 0.440
200000 50 1.040 1.040 0.870 0.550 0.430
500000 20 1.370 1.200 1.040 0.600 0.430
1000000 10 1.530 1.370 1.090 0.600 0.490
2000000 5 1.590 1.420 1.200 0.660 0.540
I have made some modifications and have done some timing tests.
The tests were done on two different PCs of different vintage.
The overall results were similar on the two machines except for
certain scaling factors.
Briefly:
(1) The code is "hot" up to a threshold data set size.
Above the threshold the performance degrades significantly,
albeit not excessively. The threshold is hardware dependent; on
the older machine it is N~=20000 and on the newer machine
N~=200000. I presume that this is a locality of reference issue.
(2) There is some sensitivity to data distribution. It degrades
slightly when the distribution is "perverse", i.e., when most of
the data is packed into one bin. (See modification notes below.)
Conversely, performance is excellent for data that is (nearly)
uniformly distributed.
Modifications:
The code that was used in this test has been modified from the
originally posted code. The modifications include:
(1) Binary shifts are used instead of integer division to
determine bin number.
(2) Large bins are treated recursively, i.e., int_msort is
called when the number of elements in a bin exceeds a
threshold. The modification is not quite as effcient as
it should be because the final insertion code ends up
called more than once.
(3) Casts to unsigned have been added when computing data
differences.
(4) Memcpy has been replaced an partially unrolled loop.
(5) In some places array indexing has been replaced by
pointers.
(6) The code returns immediately when data_min equals
data_max.
Timing:
I have taken Pete's posted benchmark package and modified it
slightly for my purposes. The principal changes are
(1) The code was changed to process a suite of different
data set sizes.
(2) The number of repetitions for a given N is determined
by the formula reps=constant/N where constant is chosen
to make the timings lay in a reasonable range. In these
runs the constant was 10000000.
/*
* This function sorts an array of integers in place using a math
* sort based histogram sort. 3*len+1 locations of auxilliary space
* are used, where len is the data length.
*/
#include <stdlib.h>
#include <string.h>
#define MIN_SORTABLE_LENGTH 20
int
int_msort(int * data, int len)
{
int data_min; /* Minimum value in data array */
int data_max; /* Maximum value in data array */
int index_min; /* Index of minimum value */
int i; /* Major loop index */
int j; /* Minor loop index */
int ifac; /* Integer scaling factor */
int temp; /* Temp loc for data moves */
int save0; /* Save of data[0] */
int bin_shift; /* Shift to get data into bin */
int *datum; /* Ptr into data array */
int *space; /* Ptr to allocated space */
int *cmp_index; /* Ptr to computed indices */
int *cum; /* Ptr to cumulative distrib. */
int *hist; /* Ptr to cumulative distrib. */
int *sorted; /* Ptr to almost sorted data */
unsigned int range; /* Data range */
unsigned int reduced_range; /* Data value rel to minimum */
if (len <= 1) return 1;
if (len == 2) {
if (data[0] > data[1]) {
temp = data[0];
data[0] = data[1];
data[1] = temp;
}
return 1;
}
{
int *d;
data_min = data[0];
data_max = data[0];
d = data + 1;
for (i=len;--i>0;d++) {
if (data_min > *d) data_min = *d;
else if (data_max < *d) data_max = *d;
}
}
if (data_max == data_min) return 1;
if (len >= MIN_SORTABLE_LENGTH) {
/* Compute interpolation function */
bin_shift = 0;
reduced_range = (unsigned)(data_max - data_min);
for (; reduced_range > len; bin_shift++) reduced_range /= 2;
/* allocate index and histogram space */
space = malloc((3*len+1)*sizeof(int));
if (!space) return 0;
cmp_index = space;
sorted = cmp_index + len;
cum = sorted + len;
hist = cum + 1;
memset(cum,0,(len+1)*sizeof(int));
/* Compute raw interpolation indices */
{
int i;
if (bin_shift > 0) {
register int * r_index = cmp_index;
register int * r_hist = hist;
register int * r_data = data;
register int val;
for (i=len;--i>=0;) {
val = ((unsigned)(*r_data++ - data_min)) >> bin_shift;
(*(r_hist + val))++;
*r_index++ = val;
}
}
else {
unsigned int val;
for (i=len;--i>=0;) {
val = (unsigned)(data[i] - data_min);
cmp_index[i] = val;
hist[val]++;
}
}
}
/* for (i=1;i<len;i++) cum[i] += cum[i-1]; */
{
int *p, *q, lenm1;
p = cum;
q = p+1;
for (i=len;--i>0;) {*q++ += *p++;}
p = cmp_index;
q = p;
for (i = len; --i >= 0;) {*p++ = cum[*q++]++;}
}
for (i=len;--i>=0;) {sorted[cmp_index[i]] = data[i];}
{
int i;
int *p, *q;
p = data;
q = sorted;
for (i=len & 7; --i >= 0;) *p++ = *q++;
for (i = len >>3;--i>=0;) {
*p++ = *q++; *p++ = *q++;
*p++ = *q++; *p++ = *q++;
*p++ = *q++; *p++ = *q++;
*p++ = *q++; *p++ = *q++;
}
}
}
/* Recursion on large bins */
if (cum[0] > MIN_SORTABLE_LENGTH) {if (!int_msort(data, cum[0]))
return 0;}
for (i=1;i<len ;i++) {
if ((cum[i]-cum[i-1]) <= MIN_SORTABLE_LENGTH) continue;
if (!int_msort(data+cum[i-1],(cum[i]-cum[i-1]))) return 0;
}
free (space);
/* End of math sort section, begin insertion sort section */
{
for (i=0;i<len;i++) if (data[i] == data_min) break;
if (i != 0) {
data[i] = data[0];
data[0] = data_min;
}
for (i=1; i<len; i++) {
if (data[i] >= data[i-1]) continue;
temp = data[i];
data[i] = data[i-1];
for (j = i-2; temp < data[j]; j--) data[j+1] = data[j];
data[j+1] = temp;
}
}
return 1;
You made a nice job of it.
Here's the original, in case you're curious:
http://www.cs.princeton.edu/~rs/shell/driver.c
--
pete