[crush-tools] r539 committed - hashjoin: bug fix - duplicate headers printed when multiple input file...

4 views
Skip to first unread message

crush...@googlecode.com

unread,
Dec 8, 2014, 1:05:07 PM (12/8/14)
to crush...@googlegroups.com
Revision: 539
Author: jeremy...@gmail.com
Date: Mon Dec 8 18:03:51 2014 UTC
Log: hashjoin: bug fix - duplicate headers printed when multiple input
files are provided.


https://code.google.com/p/crush-tools/source/detail?r=539

Added:
/trunk/src/hashjoin/test/test_09.1.expected
/trunk/src/hashjoin/test/test_09.2.expected
/trunk/src/hashjoin/test/test_09.sh
Modified:
/trunk/src/hashjoin/hashjoin.c

=======================================
--- /dev/null
+++ /trunk/src/hashjoin/test/test_09.1.expected Mon Dec 8 18:03:51 2014 UTC
@@ -0,0 +1,7 @@
+Field-0,Field-1,Something-Else,Field-2,Field-3
+1,2,wee,3,4
+5,6,w00t,7,8
+7,8,nope,,
+1,2,wee,3,4
+5,6,w00t,7,8
+7,8,nope,,
=======================================
--- /dev/null
+++ /trunk/src/hashjoin/test/test_09.2.expected Mon Dec 8 18:03:51 2014 UTC
@@ -0,0 +1,6 @@
+1,2,wee,3,4
+5,6,w00t,7,8
+7,8,nope,,
+1,2,wee,3,4
+5,6,w00t,7,8
+7,8,nope,,
=======================================
--- /dev/null
+++ /trunk/src/hashjoin/test/test_09.sh Mon Dec 8 18:03:51 2014 UTC
@@ -0,0 +1,38 @@
+test_number=09
+description="multiple input files"
+
+subtest=1
+subtest_desc="with header"
+infile="$test_dir/input_header.log"
+dimfile="$test_dir/dimension_header.log"
+outfile="$test_dir/test_$test_number.$subtest.actual"
+expected="$test_dir/test_$test_number.$subtest.expected"
+
+$bin -K 'Field-0,Field-1' -J 'Field-2,Field-3' -f $dimfile $infile $infile \
+ > "$outfile"
+
+if [ $? -ne 0 ] ||
+ [ "`diff -q $outfile $expected`" ]; then
+ test_status $test_number $subtest "$description ($subtest_desc)" FAIL
+else
+ test_status $test_number $subtest "$description ($subtest_desc)" PASS
+ rm "$outfile"
+fi
+
+subtest=2
+subtest_desc="no header"
+infile="$test_dir/input_no_header.log"
+dimfile="$test_dir/dimension_no_header.log"
+outfile="$test_dir/test_$test_number.$subtest.actual"
+expected="$test_dir/test_$test_number.$subtest.expected"
+
+$bin -k 1,2 -l 1,2 -j 3,4 -f $dimfile $infile $infile \
+ > "$outfile"
+
+if [ $? -ne 0 ] ||
+ [ "`diff -q $outfile $expected`" ]; then
+ test_status $test_number $subtest "$description ($subtest_desc)" FAIL
+else
+ test_status $test_number $subtest "$description ($subtest_desc)" PASS
+ rm "$outfile"
+fi
=======================================
--- /trunk/src/hashjoin/hashjoin.c Fri Dec 5 15:59:56 2014 UTC
+++ /trunk/src/hashjoin/hashjoin.c Mon Dec 8 18:03:51 2014 UTC
@@ -47,6 +47,7 @@
hashtbl_t dimension;
FILE *infile;
dbfr_t *datareader;
+ int header_printed = 0;

char *keybuffer = NULL;
size_t keybuffer_sz = 0;
@@ -109,6 +110,8 @@
infile = stdin;

if (! args->key_labels) {
+ /* The user supplied --data-keys (indexes) which stay the same for each
+ * input file. */
n_key_fields = expand_nums(args->data_key_fields,
&key_fields, &n_key_fields);
decrement(key_fields, n_key_fields);
@@ -122,6 +125,8 @@
}

if (args->key_labels) {
+ /* The user supplied --key-labels which need to be converted to indexes
+ * for each input file. But see TODO below. */
n_key_fields = expand_label_list(args->key_labels,
datareader->next_line,
args->delim, &key_fields,
&n_key_fields);
@@ -132,12 +137,19 @@
if (args->dimension_labels && ! args->dimension_field_labels) {
dbfr_getline(datareader);
chomp(datareader->current_line);
+ /* TODO(jhinds): This does not account for the possibility of multiple
+ * input files with different formats. */
printf("%s%s%s\n", datareader->current_line,
args->delim, args->dimension_labels);
+ } else if (args->dimension_labels || args->key_labels && header_printed) {
+ /* The header has already been printed. Skip the first row of subsequent
+ * files. */
+ dbfr_getline(datareader);
}
+ header_printed = 1;

/* If the input has only a header row, quit now. */
- if (datareader->next_line == NULL) {
+ if (datareader->eof) {
infile = nextfile(argc, argv, &optind, "r");
continue;
}
Reply all
Reply to author
Forward
0 new messages