From a6fe2a199c83220e9787fd46a08ee1773e82dc2e Mon Sep 17 00:00:00 2001 From: Joris van Rantwijk Date: Sun, 3 Jul 2022 14:55:00 +0200 Subject: [PATCH] Add some automated tests --- .gitignore | 4 +- Makefile | 14 ++- tests/run_tests.sh | 279 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 293 insertions(+), 4 deletions(-) create mode 100755 tests/run_tests.sh diff --git a/.gitignore b/.gitignore index 7a4eee6..6a2c9e7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ *.o *~ -sortbin -recgen +/build +/tests/testdata diff --git a/Makefile b/Makefile index 28784ab..3eb3962 100644 --- a/Makefile +++ b/Makefile @@ -4,17 +4,22 @@ CXX = g++ CXXFLAGS = -Wall -O2 -pthread - # -fsanitize=address -fsanitize=undefined +CXXFLAGS_DEBUG = -g -fsanitize=address -fsanitize=undefined -fsanitize=leak SRCDIR = src BUILDDIR = build TOOLS = sortbin recgen BINFILES = $(patsubst %,$(BUILDDIR)/%,$(TOOLS)) +BINFILES_DEBUG = $(patsubst %,$(BUILDDIR)/%_dbg,$(TOOLS)) .PHONY: all all: $(BINFILES) +.PHONY: test +test: $(BINFILES_DEBUG) + cd tests ; ./run_tests.sh + $(BUILDDIR)/sortbin: $(SRCDIR)/sortbin.cpp $(BUILDDIR)/recgen: $(SRCDIR)/recgen.cpp @@ -22,7 +27,12 @@ $(BUILDDIR)/%: $(SRCDIR)/%.cpp @mkdir -p $(BUILDDIR) $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) $< $(LDLIBS) -o $@ +$(BUILDDIR)/%_dbg: $(SRCDIR)/%.cpp + @mkdir -p $(BUILDDIR) + $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CXXFLAGS_DEBUG) $(LDFLAGS) $< $(LDLIBS) -o $@ + .PHONY: clean clean: - $(RM) $(BINFILES) + $(RM) $(BINFILES) $(BINFILES_DEBUG) + $(RM) -r tests/testdata diff --git a/tests/run_tests.sh b/tests/run_tests.sh new file mode 100755 index 0000000..441fb1c --- /dev/null +++ b/tests/run_tests.sh @@ -0,0 +1,279 @@ +#!/bin/bash + +# +# This script runs a series of tests on the SortBin tool. +# +# The tests are relatively small. +# The whole test suite requires less than 1 GB disk space and 1 GB memory. +# +# This script assumes it runs from the "tests" subdirectory of the repository +# and expects the SortBin tools already built in the "build" subdirectory. +# +# This script writes temporary data files to a subdirectory "testdata" +# in the current working directory. +# + +SCRIPTDIR=$(dirname "$0") + +# Tools under test. +RECGEN="${SCRIPTDIR}/../build/recgen_dbg" +SORTBIN="${SCRIPTDIR}/../build/sortbin_dbg" + +# GNU sort command. +SORT="sort" + +# Directory for temporary test data. +TESTDATA="testdata" + + +# Run the "recgen" tool and show invocation. +recgen () { + echo "${RECGEN} $*" + "${RECGEN}" "$@" +} + + +# Run the Unix "sort" command and show invocation. +unixsort () { + echo "${SORT} --temporary-directory=${TESTDATA} $*" + "${SORT}" "--temporary-directory=${TESTDATA}" "$@" +} + + +# Run the "sortbin" tool and show invocation. +sortbin () { + echo "${SORTBIN} --temporary-directory=${TESTDATA} --verbose $*" + "${SORTBIN}" "--temporary-directory=${TESTDATA}" --verbose "$@" +} + + +# Verify SHA-256 checksum. +# $1 = file name +# $2 = expected SHA-256 sum +verify_sha256 () { + ( cd "${TESTDATA}" ; echo "${2} ${1}" | sha256sum -c ) +} + + +# Count number of lines in output file. +# $1 = output file name +count_unique () { + local nline + nline=$(wc --lines < "${TESTDATA}/$1") + echo "$1: ${nline} unique records" +} + + +# Stop on the first error. +set -e + +echo "Running tests" +echo + +# Wipe and create test data directory, if needed. +[ ! -d "${TESTDATA}" ] || rm -r "${TESTDATA}" +mkdir -p "${TESTDATA}" + + +# +# Generate test input files +# + +echo "Generating in01: 10 MB, 10 bytes/record, ascii, 50% duplicates" + +hash_in01="fe72553a046d9e268c8c9da62902f63f1ba4b02901cd2413f9f1df7792e26304" +hash_out01="59df36e7f7895b56e59ef94cbad269cd5bec908d8a8e32aa2ca5ea624481431f" +hash_out01u="e46e33fbf34ca575652806a51d2042b64da073d76ed52e5e74c4becb85db974f" + +recgen -S 1001 -s 10 -n 1000000 -a -d 0.5 "${TESTDATA}/in01" +verify_sha256 in01 ${hash_in01} +echo + +echo "Generating in02: 10 MB, 10 bytes/record, ascii, 95% duplicates" + +hash_in02="c5cdd3faeaf11b1a5508e3f781148a8b899514999f61917c4897bef8dc65aacf" +hash_out02u="a95f80b2400a1e14645f1d186a065a7483c103229fc94f11bb0d62e4da6edd7d" + +recgen -S 1002 -s 10 -n 1000000 -a -d 0.95 "${TESTDATA}/in02" +verify_sha256 in02 ${hash_in02} +echo + +echo "Generating in03: 100 MB, 10 bytes/record, ascii, 50% duplicates" + +hash_in03="1a3c00a317f39e9dc336c2a46d31a3e3189a41285723f32a09be316d5273580c" +hash_out03="f17a2e566bbccecdab04373f92ba87784af77cb36c510fb7949d7fd719528a01" +hash_out03u="cbb975d1ce136e00668f14feb5880ebce474366ac5cb5c74287b7b4018c62b03" + +recgen -S 1003 -s 10 -n 10000000 -a -d 0.5 "${TESTDATA}/in03" +verify_sha256 in03 ${hash_in03} +echo + +echo "Generating in04: 100 MB, 100 bytes/record, ascii, 50% duplicates" + +hash_in04="75fee029fbaa762176ec4f72ecf356674dda04cefebfb01af48b17ee560f9c25" +hash_out04u="4df05a28cdf2af29c4e8c5bb22930ee68bea12cca362ae378ab5dc6c6049fa9f" + +recgen -S 1004 -s 100 -n 1000000 -a -d 0.5 "${TESTDATA}/in04" +verify_sha256 in04 ${hash_in04} +echo + +echo "Generating in05: 10 MB, 10 bytes/record, binary, 50% duplicates" + +hash_in05="aacd9d9b1955a88397004a8ca6192b20fb9bcfa60633ac735cb6297a997f38ad" +hash_out05u="f8b5da29076b36f7a97c9fc15da6a1754e4cc85aead368bf0540b9940f8f8cdb" + +recgen -S 1005 -s 10 -n 1310720 -d 0.5 "${TESTDATA}/in05" +verify_sha256 in05 ${hash_in05} +echo + + +# +# Run GNU sort to check reference output. +# + +echo "Running GNU sort to check reference output" +echo + +unixsort -o "${TESTDATA}/out01" "${TESTDATA}/in01" +verify_sha256 out01 ${hash_out01} + +unixsort -o "${TESTDATA}/out01u" --unique "${TESTDATA}/in01" +verify_sha256 out01u ${hash_out01u} +count_unique out01u + +unixsort -o "${TESTDATA}/out02u" --unique "${TESTDATA}/in02" +verify_sha256 out02u ${hash_out02u} +count_unique out02u + +unixsort -o "${TESTDATA}/out03" "${TESTDATA}/in03" +verify_sha256 out03 ${hash_out03} + +unixsort -o "${TESTDATA}/out03u" --unique "${TESTDATA}/in03" +verify_sha256 out03u ${hash_out03u} +count_unique out03u + +unixsort -o "${TESTDATA}/out04u" --unique "${TESTDATA}/in04" +verify_sha256 out04u ${hash_out04u} +count_unique out04u + +echo + + +# +# Test in-memory sorting. +# + +echo "in01: 10 MB, in-memory sort, non-parallel, without background I/O" +sortbin --size=10 --memory=100M --parallel=1 --no-iothread "${TESTDATA}/in01" "${TESTDATA}/out01_x" +verify_sha256 out01_x ${hash_out01} +rm "${TESTDATA}/out01_x" +echo + +echo "in01: 10 MB, in-memory sort, non-parallel" +sortbin --size=10 --memory=100M --parallel=1 --iothread "${TESTDATA}/in01" "${TESTDATA}/out01_x" +verify_sha256 out01_x ${hash_out01} +rm "${TESTDATA}/out01_x" +echo + +echo "in01: 10 MB, in-memory sort, parallel" +sortbin --size=10 --memory=100M --parallel=4 --iothread "${TESTDATA}/in01" "${TESTDATA}/out01_x" +verify_sha256 out01_x ${hash_out01} +rm "${TESTDATA}/out01_x" +echo + +echo "in01: 10 MB, in-memory sort, unique, parallel" +sortbin --size=10 --memory=100M --parallel=4 --iothread --unique "${TESTDATA}/in01" "${TESTDATA}/out01u_x" +verify_sha256 out01u_x ${hash_out01u} +rm "${TESTDATA}/out01u_x" +echo + +echo "in02: 10 MB, many duplicates, in-memory sort, unique, parallel" +sortbin --size=10 --memory=100M --parallel=4 --iothread --unique "${TESTDATA}/in02" "${TESTDATA}/out02u_x" +verify_sha256 out02u_x ${hash_out02u} +rm "${TESTDATA}/out02u_x" +echo + +echo "in03: 100 MB, in-memory sort, unique, parallel" +sortbin --size=10 --memory=100M --parallel=4 --iothread --unique "${TESTDATA}/in03" "${TESTDATA}/out03u_x" +verify_sha256 out03u_x ${hash_out03u} +rm "${TESTDATA}/out03u_x" +echo + +echo "in04: 100 MB, big records, in-memory sort, unique, parallel" +sortbin --size=100 --memory=100M --parallel=4 --iothread --unique "${TESTDATA}/in04" "${TESTDATA}/out04u_x" +verify_sha256 out04u_x ${hash_out04u} +rm "${TESTDATA}/out04u_x" +echo + +echo "in05: 10 MB, binary data, in-memory sort, unique, parallel" +sortbin --size=10 --memory=100M --parallel=4 --iothread --unique "${TESTDATA}/in05" "${TESTDATA}/out05u_x" +verify_sha256 out05u_x ${hash_out05u} +rm "${TESTDATA}/out05u_x" +echo + + +# +# Test external sorting. +# + +echo "in03: 100 MB, external sort, 1 merge pass" +sortbin --size=10 --memory=20M --parallel=4 --iothread "${TESTDATA}/in03" "${TESTDATA}/out03_x" +verify_sha256 out03_x ${hash_out03} +rm "${TESTDATA}/out03_x" +echo + +echo "in03: 100 MB, external sort, 1 merge pass, unique" +sortbin --size=10 --memory=20M --parallel=4 --iothread --unique "${TESTDATA}/in03" "${TESTDATA}/out03u_x" +verify_sha256 out03u_x ${hash_out03u} +rm "${TESTDATA}/out03u_x" +echo + +echo "in05: 10 MB, binary data, external sort, 1 merge pass, equal block sizes" +sortbin --size=10 --memory=5M --parallel=4 --iothread --unique "${TESTDATA}/in05" "${TESTDATA}/out05u_x" +verify_sha256 out05u_x ${hash_out05u} +rm "${TESTDATA}/out05u_x" +echo + +echo "in03: 100 MB, external sort, 2 merge passes, unique, without background I/O" +sortbin --size=10 --memory=10M --parallel=4 --no-iothread --branch=4 --unique "${TESTDATA}/in03" "${TESTDATA}/out03u_x" +verify_sha256 out03u_x ${hash_out03u} +rm "${TESTDATA}/out03u_x" +echo + +echo "in03: 100 MB, external sort, 2 merge passes, unique" +sortbin --size=10 --memory=20M --parallel=4 --iothread --branch=4 --unique "${TESTDATA}/in03" "${TESTDATA}/out03u_x" +verify_sha256 out03u_x ${hash_out03u} +rm "${TESTDATA}/out03u_x" +echo + +echo "in03: 100 MB, external sort, 2 merge passes, balanced merge tree" +sortbin --size=10 --memory=22M --parallel=4 --iothread --branch=3 "${TESTDATA}/in03" "${TESTDATA}/out03_x" +verify_sha256 out03_x ${hash_out03} +rm "${TESTDATA}/out03_x" +echo + +echo "in03: 100 MB, external sort, 2 merge passes, slightly unbalanced merge" +sortbin --size=10 --memory=20M --parallel=4 --iothread --branch=9 "${TESTDATA}/in03" "${TESTDATA}/out03_x" +verify_sha256 out03_x ${hash_out03} +rm "${TESTDATA}/out03_x" +echo + +echo "in03: 100 MB, external sort, 2 merge passes, slightly unbalanced merge" +sortbin --size=10 --memory=13M --parallel=4 --iothread --branch=4 "${TESTDATA}/in03" "${TESTDATA}/out03_x" +verify_sha256 out03_x ${hash_out03} +rm "${TESTDATA}/out03_x" +echo + +echo "in03: 100 MB, external sort, 2-way merging, 5 merge passes" +sortbin --size=10 --memory=10M --parallel=4 --iothread --branch=2 "${TESTDATA}/in03" "${TESTDATA}/out03_x" +verify_sha256 out03_x ${hash_out03} +rm "${TESTDATA}/out03_x" +echo + +echo "in03: 100 MB, external sort, 24-way merge, 1 merge pass" +sortbin --size=10 --memory=8M --parallel=4 --iothread --branch=24 "${TESTDATA}/in03" "${TESTDATA}/out03_x" +verify_sha256 out03_x ${hash_out03} +rm "${TESTDATA}/out03_x" +echo + +echo "Tests finished"