1
0
Fork 0
sortbin/src/recgen.cpp

501 lines
15 KiB
C++
Raw Normal View History

2022-06-24 15:16:22 +02:00
/*
* Tool to generate random binary data records.
*
* Written by Joris van Rantwijk in 2022.
*/
#define _FILE_OFFSET_BITS 64
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <math.h>
#include <string.h>
#include <unistd.h>
#include <vector>
#define MAX_RECORD_SIZE 65536
#define REPORT_INTERVAL 1000000
2022-06-24 15:16:22 +02:00
namespace { // anonymous namespace
/**
* Pseudo random number generator "xoroshiro128+"
*
* This code is based on the reference implementation,
* modified by Joris van Rantwijk to fit in a C++ class.
*
* Source: http://prng.di.unimi.it/
*
* The following commpents apply to the reference implementation:
*
* Written in 2016-2018 by David Blackman and Sebastiano Vigna (vigna@acm.org)
*
* To the extent possible under law, the author has dedicated all copyright
* and related and neighboring rights to this software to the public domain
* worldwide. This software is distributed without any warranty.
*
* See <http://creativecommons.org/publicdomain/zero/1.0/>.
*
* This is xoroshiro128+ 1.0, our best and fastest small-state generator
* for floating-point numbers. We suggest to use its upper bits for
* floating-point generation, as it is slightly faster than
* xoroshiro128++/xoroshiro128**. It passes all tests we are aware of
* except for the four lower bits, which might fail linearity tests (and
* just those), so if low linear complexity is not considered an issue (as
* it is usually the case) it can be used to generate 64-bit outputs, too;
* moreover, this generator has a very mild Hamming-weight dependency
* making our test (http://prng.di.unimi.it/hwd.php) fail after 5 TB of
* output; we believe this slight bias cannot affect any application. If
* you are concerned, use xoroshiro128++, xoroshiro128** or xoshiro256+.
*
* We suggest to use a sign test to extract a random Boolean value, and
* right shifts to extract subsets of bits.
*
* The state must be seeded so that it is not everywhere zero. If you have
* a 64-bit seed, we suggest to seed a splitmix64 generator and use its
* output to fill s.
*
* NOTE: the parameters (a=24, b=16, b=37) of this version give slightly
* better results in our test than the 2016 version (a=55, b=14, c=36).
*/
2022-06-24 15:16:22 +02:00
class Xoroshiro128plus
{
public:
Xoroshiro128plus(uint64_t s0, uint64_t s1)
{
m_state[0] = s0;
m_state[1] = s1;
}
inline uint64_t next()
{
const uint64_t s0 = m_state[0];
uint64_t s1 = m_state[1];
const uint64_t result = s0 + s1;
s1 ^= s0;
m_state[0] = rotl(s0, 24) ^ s1 ^ (s1 << 16);
m_state[1] = rotl(s1, 37);
return result;
}
private:
static inline uint64_t rotl(uint64_t x, int k)
{
return (x << k) | (x >> (64 - k));
}
uint64_t m_state[2];
};
class RecordGenerator
{
public:
RecordGenerator(
unsigned int record_size,
unsigned long long num_records,
double duplicate_fraction,
bool flag_ascii)
: m_record_size(record_size),
m_bits_per_record(0),
m_highbit_threshold(0),
m_make_duplicates(false),
2022-06-24 15:16:22 +02:00
m_flag_ascii(flag_ascii)
{
assert(record_size > 0);
if (duplicate_fraction > 0) {
// Target a specific fraction of duplicate records.
// We do this by defining a limited set of keys, such that each
// key maps to a random record. To generate a record, we draw
// uniformly from the set of keys, instead of from the set of
// all possible records.
// TODO : need a correction factor for the probability that unique secondary seeds collide in the record generation
// Calculate target number of unique records.
double target_unique = num_records * (1 - duplicate_fraction);
// Calculate the amount of information per record (in bits).
double info_per_record =
m_flag_ascii ?
((record_size - 1) * log2(36.0))
: (record_size * 8.0);
// Determine how many unique keys we need to draw to
// get an exepected number of unique records that matches
// our target.
2022-06-24 15:16:22 +02:00
//
// Draw N records from a set of V possible values.
// Expected number of unique values:
//
// U = V * (1 - (1 - 1/V)**N)
//
// Solve for N:
//
// N = log(1 - U/V) / log(1 - 1/V)
//
if (info_per_record >= 128) {
// The number of records is so big that we can just pretend
// that every key will produce a different record.
} else {
double v = exp(info_per_record * M_LN2);
if (target_unique * 1.000001 >= v) {
// There are not enough different records to produce
// the requested number of unique records.
// Just produce as many as possible.
target_unique = num_records;
} else {
// log(1 - 1/V) is inaccurate for very large values of V;
// approximate as (-1/V)
double t =
(v < 1.0e6) ?
log(1.0 - 1.0 / v)
: (-1.0 / v);
// log(1 - U/V) is inaccurate for very large values of V;
// approximate as (-U/V)
double q =
(v < 1.0e6 * target_unique) ?
log(1.0 - target_unique / v)
: (- target_unique / v);
// Calculate the target number of unique keys.
target_unique = q / t;
}
}
// Determine the number of random bits for which the
// expected number of unique keys matches our target.
// First scan in steps of 1 bit.
unsigned int need_bits = 2;
while (need_bits < 127) {
double expected_unique =
expected_num_unique(num_records, need_bits);
if (expected_unique >= target_unique) {
break;
}
need_bits++;
}
// Fine scan in steps of 1/16 bit.
unsigned int need_bits_frac16 = 0;
while (need_bits_frac16 < 16) {
double nbits = need_bits - 1 + need_bits_frac16 / 16.0;
double expected_unique =
expected_num_unique(num_records, nbits);
if (expected_unique >= target_unique) {
break;
}
need_bits_frac16++;
}
if (need_bits < 127) {
// Use this number of bits per record.
printf("use bits = %f\n", need_bits - 1 + need_bits_frac16 / 16.0);
m_bits_per_record = need_bits;
m_highbit_threshold =
exp((63 + need_bits_frac16 / 16.0) * M_LN2);
m_make_duplicates = true;
} else {
// We need so many random bits that nobody will notice
// if we just use a uniform distribution of records.
// So let's do that.
printf("use uniform\n");
m_make_duplicates = false;
}
2022-06-24 15:16:22 +02:00
}
}
void generate_record(unsigned char * record, Xoroshiro128plus& rng)
{
if (m_make_duplicates) {
2022-06-24 15:16:22 +02:00
// We have a budget of fewer than 128 random bits per record.
// Create a random seed value of that many bits.
// Then use it to initialize a secondary random number generator.
// Then use that generator to generate the actual record.
2022-06-24 15:16:22 +02:00
uint64_t s0 = 0, s1 = 0;
unsigned int need_bits = m_bits_per_record;
if (need_bits > 64) {
2022-06-24 15:16:22 +02:00
s0 = rng.next();
need_bits -= 64;
2022-06-24 15:16:22 +02:00
}
do {
s1 = rng.next();
} while (s1 > m_highbit_threshold);
s1 >>= (64 - need_bits);
2022-06-24 15:16:22 +02:00
Xoroshiro128plus rng2(s0, s1);
rng2.next();
rng2.next();
rng2.next();
2022-06-24 15:16:22 +02:00
generate_uniform_record(record, rng2);
} else {
// Uniform distribution of records.
generate_uniform_record(record, rng);
2022-06-24 15:16:22 +02:00
}
}
private:
void generate_uniform_record(unsigned char * record, Xoroshiro128plus& rng)
{
if (m_flag_ascii) {
// Generate ASCII record.
for (unsigned int i = 0; i < m_record_size - 1; i++) {
uint64_t r = rng.next() >> 32;
2022-06-24 15:16:22 +02:00
unsigned int p = r % 36;
if (p < 10) {
record[i] = '0' + p;
} else {
record[i] = 'a' + (p - 10);
}
}
// Append newline.
record[m_record_size-1] = '\n';
} else {
// Generate binary record.
for (unsigned int i = 0; i < m_record_size; i++) {
uint64_t r = rng.next() >> 32;
2022-06-24 15:16:22 +02:00
record[i] = r & 0xff;
}
}
}
/** Calculate the expected number of unique records. */
double expected_num_unique(unsigned long long num_records,
double bits_per_record)
{
// We draw N records from a set of V values with replacement.
//
// The expected number of unique values drawn in one batch is
//
// V * (1 - (1 - 1/V)**N)
//
double v = exp(bits_per_record * M_LN2);
// The calculation (1 - 1/V)**N is inaccurate for very large
// values of V. In that case, approximation as exp(- N / V).
double t =
(v < 1.0e6) ?
pow(1.0 - 1.0 / v, num_records)
: exp(- double(num_records) / v);
return v * (1.0 - t);
}
2022-06-24 15:16:22 +02:00
unsigned int m_record_size;
unsigned int m_bits_per_record;
uint64_t m_highbit_threshold;
bool m_make_duplicates;
2022-06-24 15:16:22 +02:00
bool m_flag_ascii;
};
int recgen(
const char *output_name,
unsigned int record_size,
unsigned long long num_records,
double duplicate_fraction,
bool flag_ascii,
uint64_t seed)
{
Xoroshiro128plus rng(seed, 0);
rng.next();
rng.next();
RecordGenerator record_generator(
record_size,
num_records,
duplicate_fraction,
flag_ascii);
int fd = open(output_name, O_WRONLY | O_CREAT | O_EXCL, 0666);
if (fd < 0) {
fprintf(stderr, "ERROR: Can not create output file (%s)\n",
strerror(errno));
return -1;
}
FILE *outf = fdopen(fd, "w");
if (outf == NULL) {
fprintf(stderr, "ERROR: fdopen() failed (%s)\n", strerror(errno));
close(fd);
return -1;
}
std::vector<unsigned char> record(record_size);
for (unsigned long long i = 0; i < num_records; i++) {
if ((i % REPORT_INTERVAL) == 0) {
printf("\rgenerated %llu / %llu records ", i, num_records);
fflush(stdout);
}
record_generator.generate_record(record.data(), rng);
if (fwrite(record.data(), record_size, 1, outf) != 1) {
fprintf(stderr, "ERROR: Writing to output file failed (%s)\n",
strerror(errno));
fclose(outf);
return -1;
}
}
printf("\rgenerated %llu records - done \n", num_records);
fflush(stdout);
fclose(outf);
return 0;
}
void usage()
{
fprintf(stderr,
"\n"
"Generate fixed-length random binary records.\n"
"\n"
"Usage: recgen [-a] [-d D] [-S R] -n N -s S outputfile\n"
2022-06-24 15:16:22 +02:00
"\n"
"Options:\n"
"\n"
" -n N specify number of records (required)\n"
" -s S specify record size in bytes (required)\n"
" -a generate ASCII records: 0-1, a-z, end in newline\n"
" -d D specify fraction of duplicate records (0.0 to 1.0)\n"
" -S R specify seed for random generator (default 1)\n"
2022-06-24 15:16:22 +02:00
"\n");
}
} // anonymous namespace
int main(int argc, char **argv)
{
double duplicate_fraction = 0.0;
unsigned long long num_records = 0;
unsigned long record_size = 0;
bool flag_ascii = false;
unsigned long long seed = 1;
2022-06-24 15:16:22 +02:00
int opt;
while ((opt = getopt(argc, argv, "ad:n:s:S:")) != -1) {
2022-06-24 15:16:22 +02:00
char *endptr;
switch (opt) {
case 'a':
flag_ascii = true;
break;
case 'd':
duplicate_fraction = strtod(optarg, &endptr);
if (endptr == optarg
|| *endptr != '\0'
|| duplicate_fraction < 0.0
|| duplicate_fraction > 1.0) {
fprintf(stderr,
"ERROR: Invalid duplicate fraction"
" (must be between 0.0 and 1.0)\n");
return EXIT_FAILURE;
}
break;
case 'n':
num_records = strtoull(optarg, &endptr, 10);
if (endptr == optarg
|| *endptr != '\0'
|| num_records == 0) {
fprintf(stderr,
"ERROR: Invalid number of records\n");
return EXIT_FAILURE;
}
break;
case 's':
record_size = strtoul(optarg, &endptr, 10);
if (endptr == optarg
|| *endptr != '\0'
|| record_size < 2
|| record_size > MAX_RECORD_SIZE) {
fprintf(stderr,
"ERROR: Invalid record size\n");
return EXIT_FAILURE;
}
break;
case 'S':
seed = strtoull(optarg, &endptr, 10);
if (endptr == optarg
|| *endptr != '\0'
|| seed > UINT64_MAX) {
fprintf(stderr,
"ERROR: Invalid random seed\n");
return EXIT_FAILURE;
}
break;
2022-06-24 15:16:22 +02:00
case 'h':
usage();
return EXIT_SUCCESS;
default:
usage();
return EXIT_FAILURE;
}
}
if (num_records == 0) {
fprintf(stderr, "ERROR: Missing required parameter -n\n");
usage();
return EXIT_FAILURE;
}
if (record_size == 0) {
fprintf(stderr, "ERROR: Missing required parameter -s\n");
usage();
return EXIT_FAILURE;
}
if (argc < optind + 1) {
fprintf(stderr,
"ERROR: Output file name must be specified\n");
usage();
return EXIT_FAILURE;
}
if (argc > optind + 1) {
fprintf(stderr, "ERROR: Unexpected command-line parameters\n");
usage();
return EXIT_FAILURE;
}
const char * output_name = argv[optind];
int ret = recgen(
output_name,
record_size,
num_records,
duplicate_fraction,
flag_ascii,
seed);
return (ret == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
}