recgen: Improve accuracy of duplicate fraction
This commit is contained in:
parent
b091d8b1eb
commit
ae33feaca4
176
src/recgen.cpp
176
src/recgen.cpp
|
@ -109,48 +109,147 @@ public:
|
|||
double duplicate_fraction,
|
||||
bool flag_ascii)
|
||||
: m_record_size(record_size),
|
||||
m_bits_per_record(0),
|
||||
m_highbit_threshold(0),
|
||||
m_make_duplicates(false),
|
||||
m_flag_ascii(flag_ascii)
|
||||
{
|
||||
assert(record_size > 0);
|
||||
|
||||
if (duplicate_fraction > 0) {
|
||||
// Calculate the number of bits needed to achieve the
|
||||
// specified duplicate fraction.
|
||||
|
||||
// Target a specific fraction of duplicate records.
|
||||
// We do this by defining a limited set of keys, such that each
|
||||
// key maps to a random record. To generate a record, we draw
|
||||
// uniformly from the set of keys, instead of from the set of
|
||||
// all possible records.
|
||||
|
||||
// TODO : need a correction factor for the probability that unique secondary seeds collide in the record generation
|
||||
|
||||
// Calculate target number of unique records.
|
||||
double target_unique = num_records * (1 - duplicate_fraction);
|
||||
|
||||
// Calculate the amount of information per record (in bits).
|
||||
double info_per_record =
|
||||
m_flag_ascii ?
|
||||
((record_size - 1) * log2(36.0))
|
||||
: (record_size * 8.0);
|
||||
|
||||
// Determine how many unique keys we need to draw to
|
||||
// get an exepected number of unique records that matches
|
||||
// our target.
|
||||
//
|
||||
// This calculation is not exactly right, but it
|
||||
// gives reasonable results for duplication_fraction > 0.5.
|
||||
double num_values =
|
||||
double(num_records) * (1 / duplicate_fraction - 1);
|
||||
m_bits_per_record = lrint(ceil(log2(num_values)));
|
||||
// Draw N records from a set of V possible values.
|
||||
// Expected number of unique values:
|
||||
//
|
||||
// U = V * (1 - (1 - 1/V)**N)
|
||||
//
|
||||
// Solve for N:
|
||||
//
|
||||
// N = log(1 - U/V) / log(1 - 1/V)
|
||||
//
|
||||
if (info_per_record >= 128) {
|
||||
// The number of records is so big that we can just pretend
|
||||
// that every key will produce a different record.
|
||||
} else {
|
||||
// Use uniform distribution of records.
|
||||
m_bits_per_record = 8 * record_size;
|
||||
double v = exp(info_per_record * M_LN2);
|
||||
|
||||
if (target_unique * 1.000001 >= v) {
|
||||
// There are not enough different records to produce
|
||||
// the requested number of unique records.
|
||||
// Just produce as many as possible.
|
||||
target_unique = num_records;
|
||||
} else {
|
||||
|
||||
// log(1 - 1/V) is inaccurate for very large values of V;
|
||||
// approximate as (-1/V)
|
||||
double t =
|
||||
(v < 1.0e6) ?
|
||||
log(1.0 - 1.0 / v)
|
||||
: (-1.0 / v);
|
||||
|
||||
// log(1 - U/V) is inaccurate for very large values of V;
|
||||
// approximate as (-U/V)
|
||||
double q =
|
||||
(v < 1.0e6 * target_unique) ?
|
||||
log(1.0 - target_unique / v)
|
||||
: (- target_unique / v);
|
||||
|
||||
// Calculate the target number of unique keys.
|
||||
target_unique = q / t;
|
||||
}
|
||||
}
|
||||
|
||||
// Determine the number of random bits for which the
|
||||
// expected number of unique keys matches our target.
|
||||
// First scan in steps of 1 bit.
|
||||
unsigned int need_bits = 2;
|
||||
while (need_bits < 127) {
|
||||
double expected_unique =
|
||||
expected_num_unique(num_records, need_bits);
|
||||
if (expected_unique >= target_unique) {
|
||||
break;
|
||||
}
|
||||
need_bits++;
|
||||
}
|
||||
|
||||
// Fine scan in steps of 1/16 bit.
|
||||
unsigned int need_bits_frac16 = 0;
|
||||
while (need_bits_frac16 < 16) {
|
||||
double nbits = need_bits - 1 + need_bits_frac16 / 16.0;
|
||||
double expected_unique =
|
||||
expected_num_unique(num_records, nbits);
|
||||
if (expected_unique >= target_unique) {
|
||||
break;
|
||||
}
|
||||
need_bits_frac16++;
|
||||
}
|
||||
|
||||
if (need_bits < 127) {
|
||||
// Use this number of bits per record.
|
||||
printf("use bits = %f\n", need_bits - 1 + need_bits_frac16 / 16.0);
|
||||
m_bits_per_record = need_bits;
|
||||
m_highbit_threshold =
|
||||
exp((63 + need_bits_frac16 / 16.0) * M_LN2);
|
||||
m_make_duplicates = true;
|
||||
} else {
|
||||
// We need so many random bits that nobody will notice
|
||||
// if we just use a uniform distribution of records.
|
||||
// So let's do that.
|
||||
printf("use uniform\n");
|
||||
m_make_duplicates = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void generate_record(unsigned char * record, Xoroshiro128plus& rng)
|
||||
{
|
||||
if (m_bits_per_record >= 8 * m_record_size || m_bits_per_record >= 128) {
|
||||
// Just generate uniformly selected records.
|
||||
// Nobody will notice the difference.
|
||||
generate_uniform_record(record, rng);
|
||||
} else {
|
||||
if (m_make_duplicates) {
|
||||
// We have a budget of fewer than 128 random bits per record.
|
||||
// Create a random seed value of that many bits, then use it
|
||||
// to initialize a secondary random number generator to generate
|
||||
// the data.
|
||||
// Create a random seed value of that many bits.
|
||||
// Then use it to initialize a secondary random number generator.
|
||||
// Then use that generator to generate the actual record.
|
||||
|
||||
uint64_t s0 = 0, s1 = 0;
|
||||
if (m_bits_per_record > 64) {
|
||||
unsigned int need_bits = m_bits_per_record;
|
||||
if (need_bits > 64) {
|
||||
s0 = rng.next();
|
||||
s1 = rng.next() >> (128 - m_bits_per_record);
|
||||
} else {
|
||||
s0 = rng.next() >> (64 - m_bits_per_record);
|
||||
s1 = 0;
|
||||
need_bits -= 64;
|
||||
}
|
||||
do {
|
||||
s1 = rng.next();
|
||||
} while (s1 > m_highbit_threshold);
|
||||
s1 >>= (64 - need_bits);
|
||||
|
||||
Xoroshiro128plus rng2(s0, s1);
|
||||
rng2.next();
|
||||
rng2.next();
|
||||
rng2.next();
|
||||
|
||||
generate_uniform_record(record, rng2);
|
||||
} else {
|
||||
// Uniform distribution of records.
|
||||
generate_uniform_record(record, rng);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -161,7 +260,7 @@ private:
|
|||
|
||||
// Generate ASCII record.
|
||||
for (unsigned int i = 0; i < m_record_size - 1; i++) {
|
||||
uint64_t r = rng.next() >> 4;
|
||||
uint64_t r = rng.next() >> 32;
|
||||
unsigned int p = r % 36;
|
||||
if (p < 10) {
|
||||
record[i] = '0' + p;
|
||||
|
@ -177,15 +276,40 @@ private:
|
|||
|
||||
// Generate binary record.
|
||||
for (unsigned int i = 0; i < m_record_size; i++) {
|
||||
uint64_t r = rng.next() >> 4;
|
||||
uint64_t r = rng.next() >> 32;
|
||||
record[i] = r & 0xff;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/** Calculate the expected number of unique records. */
|
||||
double expected_num_unique(unsigned long long num_records,
|
||||
double bits_per_record)
|
||||
{
|
||||
// We draw N records from a set of V values with replacement.
|
||||
//
|
||||
// The expected number of unique values drawn in one batch is
|
||||
//
|
||||
// V * (1 - (1 - 1/V)**N)
|
||||
//
|
||||
|
||||
double v = exp(bits_per_record * M_LN2);
|
||||
|
||||
// The calculation (1 - 1/V)**N is inaccurate for very large
|
||||
// values of V. In that case, approximation as exp(- N / V).
|
||||
double t =
|
||||
(v < 1.0e6) ?
|
||||
pow(1.0 - 1.0 / v, num_records)
|
||||
: exp(- double(num_records) / v);
|
||||
|
||||
return v * (1.0 - t);
|
||||
}
|
||||
|
||||
unsigned int m_record_size;
|
||||
unsigned int m_bits_per_record;
|
||||
uint64_t m_highbit_threshold;
|
||||
bool m_make_duplicates;
|
||||
bool m_flag_ascii;
|
||||
};
|
||||
|
||||
|
@ -259,10 +383,10 @@ void usage()
|
|||
"\n"
|
||||
"Options:\n"
|
||||
"\n"
|
||||
" -a generate ASCII records: 0-1, a-z, end in newline\n"
|
||||
" -d D specify fraction of duplicate records (0.0 to 1.0)\n"
|
||||
" -n N specify number of records (required)\n"
|
||||
" -s S specify record size in bytes (required)\n"
|
||||
" -a generate ASCII records: 0-1, a-z, end in newline\n"
|
||||
" -d D specify fraction of duplicate records (0.0 to 1.0)\n"
|
||||
" -S R specify seed for random generator (default 1)\n"
|
||||
"\n");
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue