1
0
Fork 0
sortbin/tests/run_tests.sh

280 lines
8.5 KiB
Bash
Executable File

#!/bin/bash
#
# This script runs a series of tests on the SortBin tool.
#
# The tests are relatively small.
# The whole test suite requires less than 1 GB disk space and 1 GB memory.
#
# This script assumes it runs from the "tests" subdirectory of the repository
# and expects the SortBin tools already built in the "build" subdirectory.
#
# This script writes temporary data files to a subdirectory "testdata"
# in the current working directory.
#
# Directory that contains this script; the tools are located relative to it.
SCRIPTDIR="$(dirname "$0")"

# Directory for temporary test data (relative to the current directory).
TESTDATA=testdata

# GNU sort command, used to produce the reference output.
SORT=sort

# Tools under test: the "_dbg" binaries in the "build" directory
# next to the directory of this script.
RECGEN="$SCRIPTDIR/../build/recgen_dbg"
SORTBIN="$SCRIPTDIR/../build/sortbin_dbg"
# Print the "recgen" command line, then execute it.
# All arguments are handed to the tool unchanged.
# The exit status is that of the tool.
recgen () {
    printf '%s %s\n' "${RECGEN}" "$*"
    "${RECGEN}" "$@"
}
# Run the Unix "sort" command and show invocation.
# All arguments are handed to sort after the temporary-directory option.
# The exit status is that of sort.
#
# The command runs with LC_ALL=C so that lines are compared byte by byte.
# Without it, GNU sort collates (and decides equality for --unique)
# according to the caller's locale, and the reference output would no
# longer match the fixed checksums on systems with a non-C locale.
unixsort () {
    echo "LC_ALL=C ${SORT} --temporary-directory=${TESTDATA} $*"
    LC_ALL=C "${SORT}" "--temporary-directory=${TESTDATA}" "$@"
}
# Print the "sortbin" command line, then execute it.
# The tool always gets the test data directory as its temporary directory
# and the --verbose flag, followed by the caller's arguments unchanged.
# The exit status is that of the tool.
sortbin () {
    local tmpopt="--temporary-directory=${TESTDATA}"
    echo "${SORTBIN} ${tmpopt} --verbose $*"
    "${SORTBIN}" "${tmpopt}" --verbose "$@"
}
# Verify SHA-256 checksum.
# $1 = file name, relative to the test data directory
# $2 = expected SHA-256 sum
# Returns 0 when the checksum matches, non-zero otherwise.
verify_sha256 () {
    (
        # Work inside the test data directory so that the sha256sum report
        # shows the bare file name. Give up right away when the directory
        # is not accessible, instead of checking a file of the same name
        # in whatever directory happens to be current.
        cd "${TESTDATA}" || exit 1
        # Checksum lines use two characters between sum and file name:
        # a space followed by the mode indicator (space = text mode).
        printf '%s  %s\n' "${2}" "${1}" | sha256sum -c
    )
}
# Report how many lines an output file contains.
# $1 = output file name, relative to the test data directory
# Prints "<name>: <count> unique records".
count_unique () {
    local path="${TESTDATA}/$1"
    local total
    total=$(wc --lines < "${path}")
    printf '%s: %s unique records\n' "$1" "${total}"
}
# Abort the test run as soon as any command fails.
set -e

echo "Running tests"
echo

# Start from an empty test data directory:
# remove a leftover directory from an earlier run, then create a new one.
if [ -d "${TESTDATA}" ]; then
    rm -r "${TESTDATA}"
fi
mkdir -p "${TESTDATA}"
#
# Generate test input files
#

# Announce, generate and verify one input file.
# $1 = message to print before generating
# $2 = name of the file to create in the test data directory
# $3 = expected SHA-256 sum of the generated file
# remaining arguments = options passed to recgen before the file name
generate_input () {
    local banner=$1 fname=$2 digest=$3
    shift 3
    echo "${banner}"
    recgen "$@" "${TESTDATA}/${fname}"
    verify_sha256 "${fname}" "${digest}"
    echo
}

# For each input: hash_inNN is the expected checksum of the input file;
# hash_outNN / hash_outNNu are the expected checksums of the sorted and
# the sorted unique output, used by the checks further down.

hash_in01="fe72553a046d9e268c8c9da62902f63f1ba4b02901cd2413f9f1df7792e26304"
hash_out01="59df36e7f7895b56e59ef94cbad269cd5bec908d8a8e32aa2ca5ea624481431f"
hash_out01u="e46e33fbf34ca575652806a51d2042b64da073d76ed52e5e74c4becb85db974f"
generate_input "Generating in01: 10 MB, 10 bytes/record, ascii, 50% duplicates" \
    in01 "${hash_in01}" -S 1001 -s 10 -n 1000000 -a -d 0.5

hash_in02="c5cdd3faeaf11b1a5508e3f781148a8b899514999f61917c4897bef8dc65aacf"
hash_out02u="a95f80b2400a1e14645f1d186a065a7483c103229fc94f11bb0d62e4da6edd7d"
generate_input "Generating in02: 10 MB, 10 bytes/record, ascii, 95% duplicates" \
    in02 "${hash_in02}" -S 1002 -s 10 -n 1000000 -a -d 0.95

hash_in03="1a3c00a317f39e9dc336c2a46d31a3e3189a41285723f32a09be316d5273580c"
hash_out03="f17a2e566bbccecdab04373f92ba87784af77cb36c510fb7949d7fd719528a01"
hash_out03u="cbb975d1ce136e00668f14feb5880ebce474366ac5cb5c74287b7b4018c62b03"
generate_input "Generating in03: 100 MB, 10 bytes/record, ascii, 50% duplicates" \
    in03 "${hash_in03}" -S 1003 -s 10 -n 10000000 -a -d 0.5

hash_in04="75fee029fbaa762176ec4f72ecf356674dda04cefebfb01af48b17ee560f9c25"
hash_out04u="4df05a28cdf2af29c4e8c5bb22930ee68bea12cca362ae378ab5dc6c6049fa9f"
generate_input "Generating in04: 100 MB, 100 bytes/record, ascii, 50% duplicates" \
    in04 "${hash_in04}" -S 1004 -s 100 -n 1000000 -a -d 0.5

hash_in05="aacd9d9b1955a88397004a8ca6192b20fb9bcfa60633ac735cb6297a997f38ad"
hash_out05u="f8b5da29076b36f7a97c9fc15da6a1754e4cc85aead368bf0540b9940f8f8cdb"
generate_input "Generating in05: 10 MB, 10 bytes/record, binary, 50% duplicates" \
    in05 "${hash_in05}" -S 1005 -s 10 -n 1310720 -d 0.5
#
# Run GNU sort to check reference output.
#

# Sort one input file with GNU sort and verify the result.
# $1 = input file name
# $2 = output file name
# $3 = expected SHA-256 sum of the output
# $4 = "--unique" to sort with --unique and report the number of
#      records in the output (optional)
reference_sort () {
    local src=$1 dst=$2 digest=$3 mode=${4:-}
    if [ "${mode}" = "--unique" ]; then
        unixsort -o "${TESTDATA}/${dst}" --unique "${TESTDATA}/${src}"
        verify_sha256 "${dst}" "${digest}"
        count_unique "${dst}"
    else
        unixsort -o "${TESTDATA}/${dst}" "${TESTDATA}/${src}"
        verify_sha256 "${dst}" "${digest}"
    fi
}

echo "Running GNU sort to check reference output"
echo

reference_sort in01 out01 "${hash_out01}"
reference_sort in01 out01u "${hash_out01u}" --unique
reference_sort in02 out02u "${hash_out02u}" --unique
reference_sort in03 out03 "${hash_out03}"
reference_sort in03 out03u "${hash_out03u}" --unique
reference_sort in04 out04u "${hash_out04u}" --unique
echo
# Run one sortbin test case and verify its output.
# $1 = message to print before the run
# $2 = input file name
# $3 = output file name (the file is removed again after verification)
# $4 = expected SHA-256 sum of the output
# remaining arguments = options passed to sortbin before the file names
sortbin_case () {
    local banner=$1 src=$2 dst=$3 digest=$4
    shift 4
    echo "${banner}"
    sortbin "$@" "${TESTDATA}/${src}" "${TESTDATA}/${dst}"
    verify_sha256 "${dst}" "${digest}"
    rm "${TESTDATA}/${dst}"
    echo
}

#
# Test in-memory sorting.
#

sortbin_case "in01: 10 MB, in-memory sort, non-parallel, without background I/O" \
    in01 out01_x "${hash_out01}" \
    --size=10 --memory=100M --parallel=1 --no-iothread

sortbin_case "in01: 10 MB, in-memory sort, non-parallel" \
    in01 out01_x "${hash_out01}" \
    --size=10 --memory=100M --parallel=1 --iothread

sortbin_case "in01: 10 MB, in-memory sort, parallel" \
    in01 out01_x "${hash_out01}" \
    --size=10 --memory=100M --parallel=4 --iothread

sortbin_case "in01: 10 MB, in-memory sort, unique, parallel" \
    in01 out01u_x "${hash_out01u}" \
    --size=10 --memory=100M --parallel=4 --iothread --unique

sortbin_case "in02: 10 MB, many duplicates, in-memory sort, unique, parallel" \
    in02 out02u_x "${hash_out02u}" \
    --size=10 --memory=100M --parallel=4 --iothread --unique

sortbin_case "in03: 100 MB, in-memory sort, unique, parallel" \
    in03 out03u_x "${hash_out03u}" \
    --size=10 --memory=100M --parallel=4 --iothread --unique

sortbin_case "in04: 100 MB, big records, in-memory sort, unique, parallel" \
    in04 out04u_x "${hash_out04u}" \
    --size=100 --memory=100M --parallel=4 --iothread --unique

sortbin_case "in05: 10 MB, binary data, in-memory sort, unique, parallel" \
    in05 out05u_x "${hash_out05u}" \
    --size=10 --memory=100M --parallel=4 --iothread --unique

#
# Test external sorting.
#

sortbin_case "in03: 100 MB, external sort, 1 merge pass" \
    in03 out03_x "${hash_out03}" \
    --size=10 --memory=20M --parallel=4 --iothread

sortbin_case "in03: 100 MB, external sort, 1 merge pass, unique" \
    in03 out03u_x "${hash_out03u}" \
    --size=10 --memory=20M --parallel=4 --iothread --unique

sortbin_case "in05: 10 MB, binary data, external sort, 1 merge pass, equal block sizes" \
    in05 out05u_x "${hash_out05u}" \
    --size=10 --memory=5M --parallel=4 --iothread --unique

sortbin_case "in03: 100 MB, external sort, 2 merge passes, unique, without background I/O" \
    in03 out03u_x "${hash_out03u}" \
    --size=10 --memory=10M --parallel=4 --no-iothread --branch=4 --unique

sortbin_case "in03: 100 MB, external sort, 2 merge passes, unique" \
    in03 out03u_x "${hash_out03u}" \
    --size=10 --memory=20M --parallel=4 --iothread --branch=4 --unique

sortbin_case "in03: 100 MB, external sort, 2 merge passes, balanced merge tree" \
    in03 out03_x "${hash_out03}" \
    --size=10 --memory=22M --parallel=4 --iothread --branch=3

sortbin_case "in03: 100 MB, external sort, 2 merge passes, slightly unbalanced merge" \
    in03 out03_x "${hash_out03}" \
    --size=10 --memory=20M --parallel=4 --iothread --branch=9

# NOTE(review): this description is identical to the previous case although
# the options differ; confirm whether the text should tell the two apart.
sortbin_case "in03: 100 MB, external sort, 2 merge passes, slightly unbalanced merge" \
    in03 out03_x "${hash_out03}" \
    --size=10 --memory=13M --parallel=4 --iothread --branch=4

sortbin_case "in03: 100 MB, external sort, 2-way merging, 5 merge passes" \
    in03 out03_x "${hash_out03}" \
    --size=10 --memory=10M --parallel=4 --iothread --branch=2

sortbin_case "in03: 100 MB, external sort, 24-way merge, 1 merge pass" \
    in03 out03_x "${hash_out03}" \
    --size=10 --memory=8M --parallel=4 --iothread --branch=24

echo "Tests finished"