sortbin/src/recgen.cpp

/*
 * Tool to generate random binary data records.
 *
 * Written by Joris van Rantwijk in 2022.
 */

#define _FILE_OFFSET_BITS 64

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <math.h>
#include <string.h>
#include <unistd.h>

#include <vector>


#define MAX_RECORD_SIZE             65536
#define REPORT_INTERVAL             1000000


namespace {  // anonymous namespace


/**
 * Pseudo random number generator "xoroshiro128+"
 *
 * This code is based on the reference implementation,
 * modified by Joris van Rantwijk to fit in a C++ class.
 *
 * Source: http://prng.di.unimi.it/
 *
 * The following comments apply to the reference implementation:
 *
 * Written in 2016-2018 by David Blackman and Sebastiano Vigna (vigna@acm.org)
 *
 * To the extent possible under law, the author has dedicated all copyright
 * and related and neighboring rights to this software to the public domain
 * worldwide. This software is distributed without any warranty.
 *
 * See <http://creativecommons.org/publicdomain/zero/1.0/>.
 *
 * This is xoroshiro128+ 1.0, our best and fastest small-state generator
 * for floating-point numbers. We suggest to use its upper bits for
 * floating-point generation, as it is slightly faster than
 * xoroshiro128++/xoroshiro128**. It passes all tests we are aware of
 * except for the four lower bits, which might fail linearity tests (and
 * just those), so if low linear complexity is not considered an issue (as
 * it is usually the case) it can be used to generate 64-bit outputs, too;
 * moreover, this generator has a very mild Hamming-weight dependency
 * making our test (http://prng.di.unimi.it/hwd.php) fail after 5 TB of
 * output; we believe this slight bias cannot affect any application. If
 * you are concerned, use xoroshiro128++, xoroshiro128** or xoshiro256+.
 *
 * We suggest to use a sign test to extract a random Boolean value, and
 * right shifts to extract subsets of bits.
 *
 * The state must be seeded so that it is not everywhere zero. If you have
 * a 64-bit seed, we suggest to seed a splitmix64 generator and use its
 * output to fill s.
 *
 * NOTE: the parameters (a=24, b=16, c=37) of this version give slightly
 * better results in our test than the 2016 version (a=55, b=14, c=36).
 */
class Xoroshiro128plus
{
public:
    /** Initialize the 128-bit generator state from two 64-bit words. */
    Xoroshiro128plus(uint64_t s0, uint64_t s1)
      : m_state{s0, s1}
    { }

    /** Return the next 64-bit output value and advance the state. */
    inline uint64_t next()
    {
        const uint64_t lo = m_state[0];
        const uint64_t hi = m_state[1];

        // The output is the sum of both state words before the update.
        const uint64_t output = lo + hi;

        // State transition with parameters a=24, b=16, c=37.
        const uint64_t mixed = lo ^ hi;
        m_state[1] = rotl(mixed, 37);
        m_state[0] = rotl(lo, 24) ^ mixed ^ (mixed << 16);

        return output;
    }

private:
    /** Rotate a 64-bit value left by "count" bits (0 < count < 64). */
    static inline uint64_t rotl(uint64_t value, int count)
    {
        const uint64_t upper = value << count;
        const uint64_t lower = value >> (64 - count);
        return upper | lower;
    }

    uint64_t m_state[2];
};


/**
 * Generator for fixed-size random records.
 *
 * Records are either drawn uniformly from the set of all possible records,
 * or (when a duplicate fraction is requested) derived from a limited set
 * of random keys, so that a targeted share of the generated records are
 * duplicates of each other.
 */
class RecordGenerator
{
public:
    /**
     * Prepare the generator.
     *
     * @param record_size         Size of each record in bytes (must be > 0).
     *                            In ASCII mode the last byte is a newline.
     * @param num_records         Number of records that will be generated.
     * @param duplicate_fraction  Target fraction of duplicate records.
     *                            A value <= 0 selects uniform records.
     * @param flag_ascii          True to generate ASCII records,
     *                            false to generate binary records.
     */
    RecordGenerator(
        unsigned int record_size,
        unsigned long long num_records,
        double duplicate_fraction,
        bool flag_ascii)
      : m_record_size(record_size),
        m_bits_per_record(0),
        m_highbit_threshold(0),
        m_make_duplicates(false),
        m_flag_ascii(flag_ascii)
    {
        assert(record_size > 0);

        if (duplicate_fraction > 0) {

            // Target a specific fraction of duplicate records.
            // We do this by defining a limited set of keys, such that each
            // key maps to a random record. To generate a record, we draw
            // uniformly from the set of keys, instead of from the set of
            // all possible records.

            // TODO : need a correction factor for the probability that
            //        unique secondary seeds collide in the record generation

            // Calculate target number of unique records.
            double target_unique = num_records * (1 - duplicate_fraction);

            // Calculate the amount of information per record (in bits).
            double info_per_record =
                m_flag_ascii ?
                    ((record_size - 1) * log2(36.0))
                    : (record_size * 8.0);

            // Determine how many unique keys we need to draw to
            // get an expected number of unique records that matches
            // our target.
            //
            // Draw N records from a set of V possible values.
            // Expected number of unique values:
            //
            //   U = V * (1 - (1 - 1/V)**N)
            //
            // Solve for N:
            //
            //   N = log(1 - U/V) / log(1 - 1/V)
            //
            if (info_per_record >= 128) {
                // The number of possible records is so big that we can
                // just pretend that every key will produce a different
                // record.
            } else {
                double v = exp(info_per_record * M_LN2);

                if (target_unique * 1.000001 >= v) {
                    // There are not enough different records to produce
                    // the requested number of unique records.
                    // Just produce as many as possible.
                    target_unique = num_records;
                } else {

                    // log(1 - 1/V) is inaccurate for very large values of V;
                    // approximate as (-1/V)
                    double t =
                        (v < 1.0e6) ?
                            log(1.0 - 1.0 / v)
                            : (-1.0 / v);

                    // log(1 - U/V) is inaccurate for very large values of V;
                    // approximate as (-U/V)
                    double q =
                        (v < 1.0e6 * target_unique) ?
                            log(1.0 - target_unique / v)
                            : (- target_unique / v);

                    // Calculate the target number of unique keys.
                    target_unique = q / t;
                }
            }

            // Determine the number of random bits for which the
            // expected number of unique keys matches our target.
            // First scan in steps of 1 bit.
            unsigned int need_bits = 2;
            while (need_bits < 127) {
                double expected_unique =
                    expected_num_unique(num_records, need_bits);
                if (expected_unique >= target_unique) {
                    break;
                }
                need_bits++;
            }

            // Fine scan in steps of 1/16 bit.
            unsigned int need_bits_frac16 = 0;
            while (need_bits_frac16 < 16) {
                double nbits = need_bits - 1 + need_bits_frac16 / 16.0;
                double expected_unique =
                    expected_num_unique(num_records, nbits);
                if (expected_unique >= target_unique) {
                    break;
                }
                need_bits_frac16++;
            }

            if (need_bits < 127) {
                // Use this number of bits per record.
                printf("use bits = %f\n", need_bits - 1 + need_bits_frac16 / 16.0);
                m_bits_per_record = need_bits;
                if (need_bits_frac16 < 16) {
                    m_highbit_threshold = static_cast<uint64_t>(
                        exp((63 + need_bits_frac16 / 16.0) * M_LN2));
                } else {
                    // The fine scan did not stop early, so the full
                    // "need_bits" bits are used. The threshold would be
                    // 2**64, which does not fit in a uint64_t (converting
                    // it is undefined behaviour). Accept every value.
                    m_highbit_threshold = UINT64_MAX;
                }
                m_make_duplicates = true;
            } else {
                // We need so many random bits that nobody will notice
                // if we just use a uniform distribution of records.
                // So let's do that.
                printf("use uniform\n");
                m_make_duplicates = false;
            }
        }
    }

    /**
     * Generate one record.
     *
     * @param record  Output buffer of at least "record_size" bytes.
     * @param rng     Primary random number generator; its state advances.
     */
    void generate_record(unsigned char * record, Xoroshiro128plus& rng)
    {
        if (m_make_duplicates) {
            // We have a budget of fewer than 128 random bits per record.
            // Create a random seed value of that many bits.
            // Then use it to initialize a secondary random number generator.
            // Then use that generator to generate the actual record.

            uint64_t s0 = 0, s1 = 0;
            unsigned int need_bits = m_bits_per_record;
            if (need_bits > 64) {
                s0 = rng.next();
                need_bits -= 64;
            }

            // Rejection sampling: discard draws above the threshold to
            // realize a fractional number of bits.
            do {
                s1 = rng.next();
            } while (s1 > m_highbit_threshold);
            s1 >>= (64 - need_bits);

            // An all-zero state makes xoroshiro128+ emit only zeros.
            // Map the all-zero key to a fixed non-zero state instead.
            // With at most 64 key bits, s0 is otherwise always zero,
            // so this state is not shared with any other key.
            if (s0 == 0 && s1 == 0) {
                s0 = UINT64_C(0x9e3779b97f4a7c15);
            }

            Xoroshiro128plus rng2(s0, s1);
            rng2.next();
            rng2.next();
            rng2.next();

            generate_uniform_record(record, rng2);
        } else {
            // Uniform distribution of records.
            generate_uniform_record(record, rng);
        }
    }

private:
    /** Fill "record" with uniformly random data drawn from "rng". */
    void generate_uniform_record(unsigned char * record, Xoroshiro128plus& rng)
    {
        if (m_flag_ascii) {

            // Generate ASCII record: characters 0-9 and a-z.
            for (unsigned int i = 0; i < m_record_size - 1; i++) {
                uint64_t r = rng.next() >> 32;
                unsigned int p = r % 36;
                if (p < 10) {
                    record[i] = '0' + p;
                } else {
                    record[i] = 'a' + (p - 10);
                }
            }

            // Append newline.
            record[m_record_size-1] = '\n';

        } else {

            // Generate binary record.
            for (unsigned int i = 0; i < m_record_size; i++) {
                uint64_t r = rng.next() >> 32;
                record[i] = r & 0xff;
            }

        }
    }

    /**
     * Calculate the expected number of unique records when drawing
     * "num_records" records from a set of 2**bits_per_record values.
     */
    double expected_num_unique(unsigned long long num_records,
                               double bits_per_record)
    {
        // We draw N records from a set of V values with replacement.
        //
        // The expected number of unique values drawn in one batch is
        //
        //   V * (1 - (1 - 1/V)**N)
        //

        double v = exp(bits_per_record * M_LN2);

        // The calculation (1 - 1/V)**N is inaccurate for very large
        // values of V. In that case, approximate as exp(- N / V).
        double t =
            (v < 1.0e6) ?
                pow(1.0 - 1.0 / v, num_records)
                : exp(- double(num_records) / v);

        return v * (1.0 - t);
    }

    unsigned int m_record_size;         // record size in bytes
    unsigned int m_bits_per_record;     // whole key bits (duplicate mode)
    uint64_t m_highbit_threshold;       // largest accepted raw 64-bit draw
    bool m_make_duplicates;             // true to draw records from keys
    bool m_flag_ascii;                  // true to generate ASCII records
};


/**
 * Generate random records and write them to a newly created output file.
 *
 * The output file is created exclusively; the call fails if the file
 * already exists. Progress is reported on stdout, errors on stderr.
 *
 * @param output_name         Path of the output file to create.
 * @param record_size         Size of each record in bytes.
 * @param num_records         Number of records to generate.
 * @param duplicate_fraction  Target fraction of duplicate records.
 * @param flag_ascii          True to generate ASCII records.
 * @param seed                Seed for the random number generator.
 * @return 0 on success, -1 on failure.
 */
int recgen(
    const char *output_name,
    unsigned int record_size,
    unsigned long long num_records,
    double duplicate_fraction,
    bool flag_ascii,
    uint64_t seed)
{
    // An all-zero state makes xoroshiro128+ emit only zeros, so seed 0
    // would produce a file full of identical all-zero records.
    // Use a fixed non-zero second state word in that case.
    // Non-zero seeds keep their original state (seed, 0).
    const uint64_t seed_s1 = (seed == 0) ? UINT64_C(0x9e3779b97f4a7c15) : 0;

    Xoroshiro128plus rng(seed, seed_s1);
    rng.next();
    rng.next();

    RecordGenerator record_generator(
        record_size,
        num_records,
        duplicate_fraction,
        flag_ascii);

    int fd = open(output_name, O_WRONLY | O_CREAT | O_EXCL, 0666);
    if (fd < 0) {
        fprintf(stderr, "ERROR: Can not create output file (%s)\n",
                strerror(errno));
        return -1;
    }

    FILE *outf = fdopen(fd, "w");
    if (outf == NULL) {
        fprintf(stderr, "ERROR: fdopen() failed (%s)\n", strerror(errno));
        close(fd);
        return -1;
    }

    std::vector<unsigned char> record(record_size);

    for (unsigned long long i = 0; i < num_records; i++) {
        if ((i % REPORT_INTERVAL) == 0) {
            printf("\rgenerated %llu / %llu records    ", i, num_records);
            fflush(stdout);
        }

        record_generator.generate_record(record.data(), rng);

        if (fwrite(record.data(), record_size, 1, outf) != 1) {
            fprintf(stderr, "ERROR: Writing to output file failed (%s)\n",
                    strerror(errno));
            fclose(outf);
            return -1;
        }
    }

    // Data may still be buffered; a write error (e.g. disk full) can
    // surface only when the stream is flushed by fclose().
    if (fclose(outf) != 0) {
        fprintf(stderr, "ERROR: Writing to output file failed (%s)\n",
                strerror(errno));
        return -1;
    }

    printf("\rgenerated %llu records - done        \n", num_records);
    fflush(stdout);

    return 0;
}


/** Print a summary of the command-line syntax to stderr. */
void usage()
{
    // ASCII records use the digits 0-9 and the letters a-z
    // (36 symbols), followed by a newline.
    fprintf(stderr,
        "\n"
        "Generate fixed-length random binary records.\n"
        "\n"
        "Usage: recgen [-a] [-d D] [-S R] -n N -s S outputfile\n"
        "\n"
        "Options:\n"
        "\n"
        "  -n N        specify number of records (required)\n"
        "  -s S        specify record size in bytes (required)\n"
        "  -a          generate ASCII records: 0-9, a-z, end in newline\n"
        "  -d D        specify fraction of duplicate records (0.0 to 1.0)\n"
        "  -S R        specify seed for random generator (default 1)\n"
        "\n");
}


} // anonymous namespace


/**
 * Program entry point: parse the command line and generate the records.
 *
 * Returns EXIT_SUCCESS when the output file was written (or help was
 * requested with -h), EXIT_FAILURE on invalid arguments or I/O errors.
 */
int main(int argc, char **argv)
{
    double duplicate_fraction = 0.0;
    unsigned long long num_records = 0;
    unsigned long record_size = 0;
    bool flag_ascii = false;
    unsigned long long seed = 1;
    int opt;

    // 'h' must be listed here, otherwise getopt() reports it as an
    // unknown option and the help case below is never reached.
    while ((opt = getopt(argc, argv, "ad:hn:s:S:")) != -1) {
        char *endptr;
        switch (opt) {
            case 'a':
                flag_ascii = true;
                break;
            case 'd':
                duplicate_fraction = strtod(optarg, &endptr);
                // The negated range test also rejects NaN.
                if (endptr == optarg
                        || *endptr != '\0'
                        || !(duplicate_fraction >= 0.0
                             && duplicate_fraction <= 1.0)) {
                    fprintf(stderr,
                        "ERROR: Invalid duplicate fraction"
                        " (must be between 0.0 and 1.0)\n");
                    return EXIT_FAILURE;
                }
                break;
            case 'n':
                // strtoull() silently negates values with a minus sign
                // and reports overflow only through errno.
                errno = 0;
                num_records = strtoull(optarg, &endptr, 10);
                if (endptr == optarg
                        || *endptr != '\0'
                        || errno == ERANGE
                        || strchr(optarg, '-') != NULL
                        || num_records == 0) {
                    fprintf(stderr,
                        "ERROR: Invalid number of records\n");
                    return EXIT_FAILURE;
                }
                break;
            case 's':
                errno = 0;
                record_size = strtoul(optarg, &endptr, 10);
                if (endptr == optarg
                        || *endptr != '\0'
                        || errno == ERANGE
                        || strchr(optarg, '-') != NULL
                        || record_size < 2
                        || record_size > MAX_RECORD_SIZE) {
                    fprintf(stderr,
                        "ERROR: Invalid record size\n");
                    return EXIT_FAILURE;
                }
                break;
            case 'S':
                errno = 0;
                seed = strtoull(optarg, &endptr, 10);
                // The UINT64_MAX comparison only matters on platforms
                // where unsigned long long is wider than 64 bits.
                if (endptr == optarg
                        || *endptr != '\0'
                        || errno == ERANGE
                        || strchr(optarg, '-') != NULL
                        || seed > UINT64_MAX) {
                    fprintf(stderr,
                        "ERROR: Invalid random seed\n");
                    return EXIT_FAILURE;
                }
                break;
            case 'h':
                usage();
                return EXIT_SUCCESS;
            default:
                usage();
                return EXIT_FAILURE;
        }
    }

    if (num_records == 0) {
        fprintf(stderr, "ERROR: Missing required parameter -n\n");
        usage();
        return EXIT_FAILURE;
    }
    if (record_size == 0) {
        fprintf(stderr, "ERROR: Missing required parameter -s\n");
        usage();
        return EXIT_FAILURE;
    }

    // Exactly one positional argument (the output file) is expected.
    if (argc < optind + 1) {
        fprintf(stderr,
                "ERROR: Output file name must be specified\n");
        usage();
        return EXIT_FAILURE;
    }

    if (argc > optind + 1) {
        fprintf(stderr, "ERROR: Unexpected command-line parameters\n");
        usage();
        return EXIT_FAILURE;
    }

    const char * output_name = argv[optind];

    int ret = recgen(
        output_name,
        record_size,
        num_records,
        duplicate_fraction,
        flag_ascii,
        seed);

    return (ret == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
}
Add tool to generate random records 2022-06-24 15:16:22 +02:00			`/*`
			`* Tool to generate random binary data records.`
			`*`
			`* Written by Joris van Rantwijk in 2022.`
			`*/`

			`#define _FILE_OFFSET_BITS 64`

			`#include <stdlib.h>`
			`#include <stdio.h>`
			`#include <stdint.h>`
			`#include <assert.h>`
			`#include <errno.h>`
			`#include <fcntl.h>`
			`#include <getopt.h>`
			`#include <math.h>`
			`#include <string.h>`
			`#include <unistd.h>`

			`#include <vector>`


			`#define MAX_RECORD_SIZE 65536`
recgen: Add explicit random seed option 2022-06-25 11:58:28 +02:00			`#define REPORT_INTERVAL 1000000`
Add tool to generate random records 2022-06-24 15:16:22 +02:00

			`namespace { // anonymous namespace`


Add comments from reference impl of xoroshiro128+ 2022-06-25 15:15:57 +02:00			`/**`
			`* Pseudo random number generator "xoroshiro128+"`
			`*`
			`* This code is based on the reference implementation,`
			`* modified by Joris van Rantwijk to fit in a C++ class.`
			`*`
			`* Source: http://prng.di.unimi.it/`
			`*`
			`* The following commpents apply to the reference implementation:`
			`*`
			`* Written in 2016-2018 by David Blackman and Sebastiano Vigna (vigna@acm.org)`
			`*`
			`* To the extent possible under law, the author has dedicated all copyright`
			`* and related and neighboring rights to this software to the public domain`
			`* worldwide. This software is distributed without any warranty.`
			`*`
			`* See <http://creativecommons.org/publicdomain/zero/1.0/>.`
			`*`
			`* This is xoroshiro128+ 1.0, our best and fastest small-state generator`
			`* for floating-point numbers. We suggest to use its upper bits for`
			`* floating-point generation, as it is slightly faster than`
			`* xoroshiro128++/xoroshiro128**. It passes all tests we are aware of`
			`* except for the four lower bits, which might fail linearity tests (and`
			`* just those), so if low linear complexity is not considered an issue (as`
			`* it is usually the case) it can be used to generate 64-bit outputs, too;`
			`* moreover, this generator has a very mild Hamming-weight dependency`
			`* making our test (http://prng.di.unimi.it/hwd.php) fail after 5 TB of`
			`* output; we believe this slight bias cannot affect any application. If`
			`* you are concerned, use xoroshiro128++, xoroshiro128** or xoshiro256+.`
			`*`
			`* We suggest to use a sign test to extract a random Boolean value, and`
			`* right shifts to extract subsets of bits.`
			`*`
			`* The state must be seeded so that it is not everywhere zero. If you have`
			`* a 64-bit seed, we suggest to seed a splitmix64 generator and use its`
			`* output to fill s.`
			`*`
			`* NOTE: the parameters (a=24, b=16, b=37) of this version give slightly`
			`* better results in our test than the 2016 version (a=55, b=14, c=36).`
			`*/`
Add tool to generate random records 2022-06-24 15:16:22 +02:00			`class Xoroshiro128plus`
			`{`
			`public:`
			`Xoroshiro128plus(uint64_t s0, uint64_t s1)`
			`{`
			`m_state[0] = s0;`
			`m_state[1] = s1;`
			`}`

			`inline uint64_t next()`
			`{`
			`const uint64_t s0 = m_state[0];`
			`uint64_t s1 = m_state[1];`
			`const uint64_t result = s0 + s1;`

			`s1 ^= s0;`
			`m_state[0] = rotl(s0, 24) ^ s1 ^ (s1 << 16);`
			`m_state[1] = rotl(s1, 37);`

			`return result;`
			`}`

			`private:`
			`static inline uint64_t rotl(uint64_t x, int k)`
			`{`
			`return (x << k) \| (x >> (64 - k));`
			`}`

			`uint64_t m_state[2];`
			`};`



			`class RecordGenerator`
			`{`
			`public:`
			`RecordGenerator(`
			`unsigned int record_size,`
			`unsigned long long num_records,`
			`double duplicate_fraction,`
			`bool flag_ascii)`
			`: m_record_size(record_size),`
recgen: Improve accuracy of duplicate fraction 2022-06-26 13:15:26 +02:00			`m_bits_per_record(0),`
			`m_highbit_threshold(0),`
			`m_make_duplicates(false),`
Add tool to generate random records 2022-06-24 15:16:22 +02:00			`m_flag_ascii(flag_ascii)`
			`{`
			`assert(record_size > 0);`

			`if (duplicate_fraction > 0) {`
recgen: Improve accuracy of duplicate fraction 2022-06-26 13:15:26 +02:00
			`// Target a specific fraction of duplicate records.`
			`// We do this by defining a limited set of keys, such that each`
			`// key maps to a random record. To generate a record, we draw`
			`// uniformly from the set of keys, instead of from the set of`
			`// all possible records.`

			`// TODO : need a correction factor for the probability that unique secondary seeds collide in the record generation`

			`// Calculate target number of unique records.`
			`double target_unique = num_records * (1 - duplicate_fraction);`

			`// Calculate the amount of information per record (in bits).`
			`double info_per_record =`
			`m_flag_ascii ?`
			`((record_size - 1) * log2(36.0))`
			`: (record_size * 8.0);`

			`// Determine how many unique keys we need to draw to`
			`// get an exepected number of unique records that matches`
			`// our target.`
Add tool to generate random records 2022-06-24 15:16:22 +02:00			`//`
recgen: Improve accuracy of duplicate fraction 2022-06-26 13:15:26 +02:00			`// Draw N records from a set of V possible values.`
			`// Expected number of unique values:`
			`//`
			`// U = V * (1 - (1 - 1/V)**N)`
			`//`
			`// Solve for N:`
			`//`
			`// N = log(1 - U/V) / log(1 - 1/V)`
			`//`
			`if (info_per_record >= 128) {`
			`// The number of records is so big that we can just pretend`
			`// that every key will produce a different record.`
			`} else {`
			`double v = exp(info_per_record * M_LN2);`

			`if (target_unique * 1.000001 >= v) {`
			`// There are not enough different records to produce`
			`// the requested number of unique records.`
			`// Just produce as many as possible.`
			`target_unique = num_records;`
			`} else {`

			`// log(1 - 1/V) is inaccurate for very large values of V;`
			`// approximate as (-1/V)`
			`double t =`
			`(v < 1.0e6) ?`
			`log(1.0 - 1.0 / v)`
			`: (-1.0 / v);`

			`// log(1 - U/V) is inaccurate for very large values of V;`
			`// approximate as (-U/V)`
			`double q =`
			`(v < 1.0e6 * target_unique) ?`
			`log(1.0 - target_unique / v)`
			`: (- target_unique / v);`

			`// Calculate the target number of unique keys.`
			`target_unique = q / t;`
			`}`
			`}`

			`// Determine the number of random bits for which the`
			`// expected number of unique keys matches our target.`
			`// First scan in steps of 1 bit.`
			`unsigned int need_bits = 2;`
			`while (need_bits < 127) {`
			`double expected_unique =`
			`expected_num_unique(num_records, need_bits);`
			`if (expected_unique >= target_unique) {`
			`break;`
			`}`
			`need_bits++;`
			`}`

			`// Fine scan in steps of 1/16 bit.`
			`unsigned int need_bits_frac16 = 0;`
			`while (need_bits_frac16 < 16) {`
			`double nbits = need_bits - 1 + need_bits_frac16 / 16.0;`
			`double expected_unique =`
			`expected_num_unique(num_records, nbits);`
			`if (expected_unique >= target_unique) {`
			`break;`
			`}`
			`need_bits_frac16++;`
			`}`

			`if (need_bits < 127) {`
			`// Use this number of bits per record.`
			`printf("use bits = %f\n", need_bits - 1 + need_bits_frac16 / 16.0);`
			`m_bits_per_record = need_bits;`
			`m_highbit_threshold =`
			`exp((63 + need_bits_frac16 / 16.0) * M_LN2);`
			`m_make_duplicates = true;`
			`} else {`
			`// We need so many random bits that nobody will notice`
			`// if we just use a uniform distribution of records.`
			`// So let's do that.`
			`printf("use uniform\n");`
			`m_make_duplicates = false;`
			`}`
Add tool to generate random records 2022-06-24 15:16:22 +02:00			`}`
			`}`

			`void generate_record(unsigned char * record, Xoroshiro128plus& rng)`
			`{`
recgen: Improve accuracy of duplicate fraction 2022-06-26 13:15:26 +02:00			`if (m_make_duplicates) {`
Add tool to generate random records 2022-06-24 15:16:22 +02:00			`// We have a budget of fewer than 128 random bits per record.`
recgen: Improve accuracy of duplicate fraction 2022-06-26 13:15:26 +02:00			`// Create a random seed value of that many bits.`
			`// Then use it to initialize a secondary random number generator.`
			`// Then use that generator to generate the actual record.`

Add tool to generate random records 2022-06-24 15:16:22 +02:00			`uint64_t s0 = 0, s1 = 0;`
recgen: Improve accuracy of duplicate fraction 2022-06-26 13:15:26 +02:00			`unsigned int need_bits = m_bits_per_record;`
			`if (need_bits > 64) {`
Add tool to generate random records 2022-06-24 15:16:22 +02:00			`s0 = rng.next();`
recgen: Improve accuracy of duplicate fraction 2022-06-26 13:15:26 +02:00			`need_bits -= 64;`
Add tool to generate random records 2022-06-24 15:16:22 +02:00			`}`
recgen: Improve accuracy of duplicate fraction 2022-06-26 13:15:26 +02:00			`do {`
			`s1 = rng.next();`
			`} while (s1 > m_highbit_threshold);`
			`s1 >>= (64 - need_bits);`

Add tool to generate random records 2022-06-24 15:16:22 +02:00			`Xoroshiro128plus rng2(s0, s1);`
			`rng2.next();`
			`rng2.next();`
recgen: Improve accuracy of duplicate fraction 2022-06-26 13:15:26 +02:00			`rng2.next();`

Add tool to generate random records 2022-06-24 15:16:22 +02:00			`generate_uniform_record(record, rng2);`
recgen: Improve accuracy of duplicate fraction 2022-06-26 13:15:26 +02:00			`} else {`
			`// Uniform distribution of records.`
			`generate_uniform_record(record, rng);`
Add tool to generate random records 2022-06-24 15:16:22 +02:00			`}`
			`}`

			`private:`
			`void generate_uniform_record(unsigned char * record, Xoroshiro128plus& rng)`
			`{`
			`if (m_flag_ascii) {`

			`// Generate ASCII record.`
			`for (unsigned int i = 0; i < m_record_size - 1; i++) {`
recgen: Improve accuracy of duplicate fraction 2022-06-26 13:15:26 +02:00			`uint64_t r = rng.next() >> 32;`
Add tool to generate random records 2022-06-24 15:16:22 +02:00			`unsigned int p = r % 36;`
			`if (p < 10) {`
			`record[i] = '0' + p;`
			`} else {`
			`record[i] = 'a' + (p - 10);`
			`}`
			`}`

			`// Append newline.`
			`record[m_record_size-1] = '\n';`

			`} else {`

			`// Generate binary record.`
			`for (unsigned int i = 0; i < m_record_size; i++) {`
recgen: Improve accuracy of duplicate fraction 2022-06-26 13:15:26 +02:00			`uint64_t r = rng.next() >> 32;`
Add tool to generate random records 2022-06-24 15:16:22 +02:00			`record[i] = r & 0xff;`
			`}`

			`}`
			`}`

recgen: Improve accuracy of duplicate fraction 2022-06-26 13:15:26 +02:00			`/** Calculate the expected number of unique records. */`
			`double expected_num_unique(unsigned long long num_records,`
			`double bits_per_record)`
			`{`
			`// We draw N records from a set of V values with replacement.`
			`//`
			`// The expected number of unique values drawn in one batch is`
			`//`
			`// V * (1 - (1 - 1/V)**N)`
			`//`

			`double v = exp(bits_per_record * M_LN2);`

			`// The calculation (1 - 1/V)**N is inaccurate for very large`
			`// values of V. In that case, approximation as exp(- N / V).`
			`double t =`
			`(v < 1.0e6) ?`
			`pow(1.0 - 1.0 / v, num_records)`
			`: exp(- double(num_records) / v);`

			`return v * (1.0 - t);`
			`}`

Add tool to generate random records 2022-06-24 15:16:22 +02:00			`unsigned int m_record_size;`
			`unsigned int m_bits_per_record;`
recgen: Improve accuracy of duplicate fraction 2022-06-26 13:15:26 +02:00			`uint64_t m_highbit_threshold;`
			`bool m_make_duplicates;`
Add tool to generate random records 2022-06-24 15:16:22 +02:00			`bool m_flag_ascii;`
			`};`


			`int recgen(`
			`const char *output_name,`
			`unsigned int record_size,`
			`unsigned long long num_records,`
			`double duplicate_fraction,`
			`bool flag_ascii,`
			`uint64_t seed)`
			`{`
			`Xoroshiro128plus rng(seed, 0);`
			`rng.next();`
			`rng.next();`

			`RecordGenerator record_generator(`
			`record_size,`
			`num_records,`
			`duplicate_fraction,`
			`flag_ascii);`

			`int fd = open(output_name, O_WRONLY \| O_CREAT \| O_EXCL, 0666);`
			`if (fd < 0) {`
			`fprintf(stderr, "ERROR: Can not create output file (%s)\n",`
			`strerror(errno));`
			`return -1;`
			`}`

			`FILE *outf = fdopen(fd, "w");`
			`if (outf == NULL) {`
			`fprintf(stderr, "ERROR: fdopen() failed (%s)\n", strerror(errno));`
			`close(fd);`
			`return -1;`
			`}`

			`std::vector<unsigned char> record(record_size);`

			`for (unsigned long long i = 0; i < num_records; i++) {`
			`if ((i % REPORT_INTERVAL) == 0) {`
			`printf("\rgenerated %llu / %llu records ", i, num_records);`
			`fflush(stdout);`
			`}`

			`record_generator.generate_record(record.data(), rng);`

			`if (fwrite(record.data(), record_size, 1, outf) != 1) {`
			`fprintf(stderr, "ERROR: Writing to output file failed (%s)\n",`
			`strerror(errno));`
			`fclose(outf);`
			`return -1;`
			`}`
			`}`

			`printf("\rgenerated %llu records - done \n", num_records);`
			`fflush(stdout);`

			`fclose(outf);`

			`return 0;`
			`}`


			`void usage()`
			`{`
			`fprintf(stderr,`
			`"\n"`
			`"Generate fixed-length random binary records.\n"`
			`"\n"`
recgen: Add explicit random seed option 2022-06-25 11:58:28 +02:00			`"Usage: recgen [-a] [-d D] [-S R] -n N -s S outputfile\n"`
Add tool to generate random records 2022-06-24 15:16:22 +02:00			`"\n"`
			`"Options:\n"`
			`"\n"`
			`" -n N specify number of records (required)\n"`
			`" -s S specify record size in bytes (required)\n"`
recgen: Improve accuracy of duplicate fraction 2022-06-26 13:15:26 +02:00			`" -a generate ASCII records: 0-1, a-z, end in newline\n"`
			`" -d D specify fraction of duplicate records (0.0 to 1.0)\n"`
recgen: Add explicit random seed option 2022-06-25 11:58:28 +02:00			`" -S R specify seed for random generator (default 1)\n"`
Add tool to generate random records 2022-06-24 15:16:22 +02:00			`"\n");`
			`}`


			`} // anonymous namespace`


			`int main(int argc, char **argv)`
			`{`
			`double duplicate_fraction = 0.0;`
			`unsigned long long num_records = 0;`
			`unsigned long record_size = 0;`
			`bool flag_ascii = false;`
recgen: Add explicit random seed option 2022-06-25 11:58:28 +02:00			`unsigned long long seed = 1;`
Add tool to generate random records 2022-06-24 15:16:22 +02:00			`int opt;`

recgen: Add explicit random seed option 2022-06-25 11:58:28 +02:00			`while ((opt = getopt(argc, argv, "ad:n:s:S:")) != -1) {`
Add tool to generate random records 2022-06-24 15:16:22 +02:00			`char *endptr;`
			`switch (opt) {`
			`case 'a':`
			`flag_ascii = true;`
			`break;`
			`case 'd':`
			`duplicate_fraction = strtod(optarg, &endptr);`
			`if (endptr == optarg`
			`\|\| *endptr != '\0'`
			`\|\| duplicate_fraction < 0.0`
			`\|\| duplicate_fraction > 1.0) {`
			`fprintf(stderr,`
			`"ERROR: Invalid duplicate fraction"`
			`" (must be between 0.0 and 1.0)\n");`
			`return EXIT_FAILURE;`
			`}`
			`break;`
			`case 'n':`
			`num_records = strtoull(optarg, &endptr, 10);`
			`if (endptr == optarg`
			`\|\| *endptr != '\0'`
			`\|\| num_records == 0) {`
			`fprintf(stderr,`
			`"ERROR: Invalid number of records\n");`
			`return EXIT_FAILURE;`
			`}`
			`break;`
			`case 's':`
			`record_size = strtoul(optarg, &endptr, 10);`
			`if (endptr == optarg`
			`\|\| *endptr != '\0'`
			`\|\| record_size < 2`
			`\|\| record_size > MAX_RECORD_SIZE) {`
			`fprintf(stderr,`
			`"ERROR: Invalid record size\n");`
			`return EXIT_FAILURE;`
			`}`
			`break;`
recgen: Add explicit random seed option 2022-06-25 11:58:28 +02:00			`case 'S':`
			`seed = strtoull(optarg, &endptr, 10);`
			`if (endptr == optarg`
			`\|\| *endptr != '\0'`
			`\|\| seed > UINT64_MAX) {`
			`fprintf(stderr,`
			`"ERROR: Invalid random seed\n");`
			`return EXIT_FAILURE;`
			`}`
			`break;`
Add tool to generate random records 2022-06-24 15:16:22 +02:00			`case 'h':`
			`usage();`
			`return EXIT_SUCCESS;`
			`default:`
			`usage();`
			`return EXIT_FAILURE;`
			`}`
			`}`

			`if (num_records == 0) {`
			`fprintf(stderr, "ERROR: Missing required parameter -n\n");`
			`usage();`
			`return EXIT_FAILURE;`
			`}`
			`if (record_size == 0) {`
			`fprintf(stderr, "ERROR: Missing required parameter -s\n");`
			`usage();`
			`return EXIT_FAILURE;`
			`}`

			`if (argc < optind + 1) {`
			`fprintf(stderr,`
			`"ERROR: Output file name must be specified\n");`
			`usage();`
			`return EXIT_FAILURE;`
			`}`

			`if (argc > optind + 1) {`
			`fprintf(stderr, "ERROR: Unexpected command-line parameters\n");`
			`usage();`
			`return EXIT_FAILURE;`
			`}`

			`const char * output_name = argv[optind];`

			`int ret = recgen(`
			`output_name,`
			`record_size,`
			`num_records,`
			`duplicate_fraction,`
			`flag_ascii,`
			`seed);`

			`return (ret == 0) ? EXIT_SUCCESS : EXIT_FAILURE;`
			`}`