recgen: Improve accuracy of duplicate fraction

2022-06-26 13:15:26 +02:00 · 2022-06-26 13:15:26 +02:00 · ae33feaca4
parent b091d8b1eb
commit ae33feaca4
1 changed files with 151 additions and 27 deletions
--- a/src/recgen.cpp
+++ b/src/recgen.cpp
@ -109,48 +109,147 @@ public:
        double duplicate_fraction,
        bool flag_ascii)
      : m_record_size(record_size),
        m_bits_per_record(0),
        m_highbit_threshold(0),
        m_make_duplicates(false),
        m_flag_ascii(flag_ascii)
    {
        assert(record_size > 0);
        if (duplicate_fraction > 0) {
-            // Calculate the number of bits needed to achieve the
+
-            // specified duplicate fraction.
+            // Target a specific fraction of duplicate records.
            // We do this by defining a limited set of keys, such that each
            // key maps to a random record. To generate a record, we draw
            // uniformly from the set of keys, instead of from the set of
            // all possible records.
 // TODO : need a correction factor for the probability that unique secondary seeds collide in the record generation
            // Calculate target number of unique records.
            double target_unique = num_records * (1 - duplicate_fraction);
            // Calculate the amount of information per record (in bits).
            double info_per_record =
                m_flag_ascii ?
                    ((record_size - 1) * log2(36.0))
                    : (record_size * 8.0);
            // Determine how many unique keys we need to draw to
            // get an expected number of unique records that matches
            // our target.
            //
-            // This calculation is not exactly right, but it
+            // Draw N records from a set of V possible values.
-            // gives reasonable results for duplication_fraction > 0.5.
+            // Expected number of unique values:
-            double num_values =
+            //
-                double(num_records) * (1 / duplicate_fraction - 1);
+            //   U = V * (1 - (1 - 1/V)**N)
-            m_bits_per_record = lrint(ceil(log2(num_values)));
+            //
-        } else {
+            // Solve for N:
-            // Use uniform distribution of records.
+            //
-            m_bits_per_record = 8 * record_size;
+            //   N = log(1 - U/V) / log(1 - 1/V)
            //
            if (info_per_record >= 128) {
                // The number of possible records is so big that we can
                // just pretend that every key will produce a different record.
            } else {
                double v = exp(info_per_record * M_LN2);
                if (target_unique * 1.000001 >= v) {
                    // There are not enough different records to produce
                    // the requested number of unique records.
                    // Just produce as many as possible.
                    target_unique = num_records;
                } else {
                    // log(1 - 1/V) is inaccurate for very large values of V;
                    // approximate as (-1/V)
                    double t =
                        (v < 1.0e6) ?
                            log(1.0 - 1.0 / v)
                            : (-1.0 / v);
                    // log(1 - U/V) is inaccurate for very large values of V;
                    // approximate as (-U/V)
                    double q =
                        (v < 1.0e6 * target_unique) ?
                            log(1.0 - target_unique / v)
                            : (- target_unique / v);
                    // Calculate the target number of unique keys.
                    target_unique = q / t;
                }
            }
            // Determine the number of random bits for which the
            // expected number of unique keys matches our target.
            // First scan in steps of 1 bit.
            unsigned int need_bits = 2;
            while (need_bits < 127) {
                double expected_unique =
                    expected_num_unique(num_records, need_bits);
                if (expected_unique >= target_unique) {
                    break;
                }
                need_bits++;
            }
            // Fine scan in steps of 1/16 bit.
            unsigned int need_bits_frac16 = 0;
            while (need_bits_frac16 < 16) {
                double nbits = need_bits - 1 + need_bits_frac16 / 16.0;
                double expected_unique =
                    expected_num_unique(num_records, nbits);
                if (expected_unique >= target_unique) {
                    break;
                }
                need_bits_frac16++;
            }
            if (need_bits < 127) {
                // Use this number of bits per record.
                printf("use bits = %f\n", need_bits - 1 + need_bits_frac16 / 16.0);
                m_bits_per_record = need_bits;
                m_highbit_threshold =
                    exp((63 + need_bits_frac16 / 16.0) * M_LN2);
                m_make_duplicates = true;
            } else {
                // We need so many random bits that nobody will notice
                // if we just use a uniform distribution of records.
                // So let's do that.
                printf("use uniform\n");
                m_make_duplicates = false;
            }
        }
    }
    void generate_record(unsigned char * record, Xoroshiro128plus& rng)
    {
-        if (m_bits_per_record >= 8 * m_record_size || m_bits_per_record >= 128) {
+        if (m_make_duplicates) {
            // Just generate uniformly selected records.
            // Nobody will notice the difference.
            generate_uniform_record(record, rng);
        } else {
            // We have a budget of fewer than 128 random bits per record.
-            // Create a random seed value of that many bits, then use it
+            // Create a random seed value of that many bits.
-            // to initialize a secondary random number generator to generate
+            // Then use it to initialize a secondary random number generator.
-            // the data.
+            // Then use that generator to generate the actual record.
            uint64_t s0 = 0, s1 = 0;
-            if (m_bits_per_record > 64) {
+            unsigned int need_bits = m_bits_per_record;
            if (need_bits > 64) {
                s0 = rng.next();
-                s1 = rng.next() >> (128 - m_bits_per_record);
+                need_bits -= 64;
            } else {
                s0 = rng.next() >> (64 - m_bits_per_record);
                s1 = 0;
            }
            do {
                s1 = rng.next();
            } while (s1 > m_highbit_threshold);
            s1 >>= (64 - need_bits);
            Xoroshiro128plus rng2(s0, s1);
            rng2.next();
            rng2.next();
            rng2.next();
            generate_uniform_record(record, rng2);
        } else {
            // Uniform distribution of records.
            generate_uniform_record(record, rng);
        }
    }
@ -161,7 +260,7 @@ private:
            // Generate ASCII record.
            for (unsigned int i = 0; i < m_record_size - 1; i++) {
-                uint64_t r = rng.next() >> 4;
+                uint64_t r = rng.next() >> 32;
                unsigned int p = r % 36;
                if (p < 10) {
                    record[i] = '0' + p;
@ -177,15 +276,40 @@ private:
            // Generate binary record.
            for (unsigned int i = 0; i < m_record_size; i++) {
-                uint64_t r = rng.next() >> 4;
+                uint64_t r = rng.next() >> 32;
                record[i] = r & 0xff;
            }
        }
    }
    /**
     * Calculate the expected number of unique records.
     *
     * We draw N records with replacement from a set of
     * V = 2**bits_per_record equally likely values. The expected number
     * of distinct values drawn in one batch is
     *
     *   V * (1 - (1 - 1/V)**N)
     *
     * @param num_records      Number of records drawn (N).
     * @param bits_per_record  Number of random bits per record, i.e.
     *                         log2(V); may be fractional.
     * @return Expected number of distinct values among the N draws.
     */
    double expected_num_unique(unsigned long long num_records,
                               double bits_per_record)
    {
        // Drawing nothing yields nothing. This also avoids evaluating
        // 0 * log(0) below when V == 1.
        if (num_records == 0) {
            return 0.0;
        }

        const double n = num_records;
        const double v = exp2(bits_per_record);

        // Evaluate (1 - (1 - 1/V)**N) as -expm1(N * log1p(-1/V)).
        //
        // Computing the power (or exp(-N/V)) first and then subtracting
        // it from 1 loses precision for large V: the intermediate value
        // rounds to exactly 1.0 and the result collapses to 0 instead
        // of approaching N. The log1p/expm1 form stays accurate over
        // the whole range of V.
        return -v * expm1(n * log1p(-1.0 / v));
    }
    // Size of each generated record in bytes (constructor argument).
    unsigned int m_record_size;
    // Whole number of random seed bits drawn per record when duplicates
    // are being generated; left at 0 otherwise.
    unsigned int m_bits_per_record;
    // Upper bound for the seed word s1 in generate_record(): draws above
    // this value are rejected and redrawn. Set by the constructor to
    // 2**(63 + f/16), where f is the result of the 1/16-bit fine scan.
    uint64_t m_highbit_threshold;
    // Set to true by the constructor when it settles on a limited seed
    // budget (fewer than 127 bits) to produce duplicates; false when it
    // falls back to a uniform distribution of records.
    bool m_make_duplicates;
    // True when ASCII records were requested (constructor argument).
    bool m_flag_ascii;
 };
@ -259,10 +383,10 @@ void usage()
        "\n"
        "Options:\n"
        "\n"
        "  -a          generate ASCII records: 0-1, a-z, end in newline\n"
        "  -d D        specify fraction of duplicate records (0.0 to 1.0)\n"
        "  -n N        specify number of records (required)\n"
        "  -s S        specify record size in bytes (required)\n"
        "  -a          generate ASCII records: 0-1, a-z, end in newline\n"
        "  -d D        specify fraction of duplicate records (0.0 to 1.0)\n"
        "  -S R        specify seed for random generator (default 1)\n"
        "\n");
 }