Implement background I/O for the sorting pass

2022-06-28 20:42:36 +02:00 · 2022-06-28 20:42:36 +02:00 · 4f6b76e785
parent 57c20dbca3
commit 4f6b76e785
1 changed files with 182 additions and 26 deletions
--- a/src/sortbin.cpp
+++ b/src/sortbin.cpp
@ -31,6 +31,7 @@
 #include <unistd.h>

 #include <algorithm>
+#include <array>
 #include <condition_variable>
 #include <deque>
 #include <functional>
@ -327,7 +328,7 @@ public:
            }

            // Wait on this future.
-            fut.wait();
+            fut.get();
        }
    }

@ -1506,6 +1507,7 @@ void single_pass(
 * @param records_per_block     Number of records per sort block.
 * @param num_blocks            Number of sort blocks.
 * @param num_blocks_file1      Number of blocks for the first output file.
+ * @param io_pool               Optional thread pool for background I/O.
 * @param ctx                   Reference to context structure.
 */
 void sort_pass(
@ -1515,6 +1517,7 @@ void sort_pass(
    uint64_t records_per_block,
    uint64_t num_blocks,
    uint64_t num_blocks_file1,
+    ThreadPool * io_pool,
    const SortContext& ctx)
 {
    unsigned int record_size = ctx.record_size;
@ -1526,11 +1529,6 @@ void sort_pass(
    Timer timer;
    timer.start();

-// TODO : double-buffer with I/O in separate thread
-    // Allocate sort buffer.
-    assert(records_per_block < SIZE_MAX / record_size);
-    std::vector<unsigned char> buffer(records_per_block * record_size);
-
    // Create thread pool for parallel sorting.
    std::unique_ptr<ThreadPool> thread_pool;
    if (ctx.num_threads > 1) {
@ -1538,12 +1536,86 @@ void sort_pass(
        thread_pool.reset(new ThreadPool(ctx.num_threads));
    }

+    assert(records_per_block < SIZE_MAX / record_size);
+
+    // Allocate sort buffer(s).
+    std::array<std::vector<unsigned char>, 2> buffers;
+    buffers[0].resize(records_per_block * record_size);
+    if (io_pool != NULL) {
+        // Allocate a second buffer for background I/O.
+        buffers[1].resize(records_per_block * record_size);
+    }
+
+    // Helper function that returns the output file for a specified block.
+    // This function deals with the nasty complication that the first bunch
+    // of blocks must be written to a separate output file.
+    auto output_file_chooser =
+        [&output_file1, &output_file2, num_blocks_file1]
+        (uint64_t block_index) -> BinaryFile& {
+            if (block_index < num_blocks_file1) {
+                return output_file1;
+            } else {
+                return output_file2;
+            }
+        };
+
+    // Helper function that returns the buffer for a specified block.
+    // This function deals with the nasty complication that we may or may not
+    // be using two separate buffers for alternating blocks.
+    auto buffer_chooser =
+        [io_pool, &buffers]
+        (uint64_t block_index) -> unsigned char * {
+            if (io_pool != NULL) {
+                // Blocks alternate between the two buffers.
+                return buffers[block_index % 2].data();
+            } else {
+                // All blocks use the same buffer.
+                return buffers[0].data();
+            }
+        };
+
+    // Helper function to read a block from file to buffer.
+    // This function deals with block offset calculations and with
+    // the nasty complication that the last block may be shorter.
+    auto read_block =
+        [records_per_block, record_size, file_size]
+        (BinaryFile& input_file, uint64_t block_index, unsigned char * buf) {
+            uint64_t block_offset =
+                block_index * records_per_block * record_size;
+            size_t block_size = records_per_block * record_size;
+            if (block_size > file_size - block_offset) {
+                block_size = file_size - block_offset;
+            }
+            input_file.read(buf, block_offset, block_size);
+        };
+
+    // Helper function to write a block from buffer to file.
+    // This function deals with block offset calculations and with
+    // the nasty complication that the last block may be shorter.
+    auto write_block =
+        [records_per_block, record_size, file_size]
+        (BinaryFile& output_file, uint64_t block_index, unsigned char * buf) {
+            uint64_t block_offset =
+                block_index * records_per_block * record_size;
+            size_t block_size = records_per_block * record_size;
+            if (block_size > file_size - block_offset) {
+                block_size = file_size - block_offset;
+            }
+            output_file.write(buf, block_offset, block_size);
+        };
+
+    // Future object to wait for background I/O.
+    ThreadPool::FutureType io_future;
+
    // Loop over blocks to be sorted.
    for (uint64_t block_index = 0; block_index < num_blocks; block_index++) {

-        uint64_t first_record_idx = block_index * records_per_block;
+        // Determine number of records in this block
+        // (the last block may be shorter).
+        uint64_t block_start_idx = block_index * records_per_block;
        size_t   block_num_records =
-            std::min(records_per_block, num_records - first_record_idx);
+            (records_per_block < num_records - block_start_idx) ?
+                records_per_block : (num_records - block_start_idx);

        log(ctx,
            "  sorting block %" PRIu64 " / %" PRIu64 ": %" PRIu64 " records\n",
@ -1551,27 +1623,88 @@ void sort_pass(
            num_blocks,
            block_num_records);

-        // Read block.
-        input_file.read(
-            buffer.data(),
-            first_record_idx * record_size,
-            block_num_records * record_size);
+        // Choose a buffer to sort this block.
+        unsigned char * block_buffer = buffer_chooser(block_index);

-        // Sort records in this block.
+        // Read block via blocking I/O, if necessary.
+        // Use this to read all blocks when we do not use background I/O.
+        // Use this only for the first block when we have background I/O.
+        if (io_pool == NULL || block_index == 0) {
+            read_block(input_file, block_index, block_buffer);
+        }
+
+        // Handle background I/O, if necessary.
+        if (io_pool != NULL) {
+            ThreadPool::FutureType new_future;
+            if (block_index > 0 && block_index < num_blocks - 1) {
+                // Start background I/O to write the previous block, then
+                // read the next block when writing has finished.
+                new_future = io_pool->submit(
+                    [&input_file, &output_file_chooser, &buffer_chooser,
+                     &read_block, &write_block, block_index]
+                    () {
+                        write_block(
+                            output_file_chooser(block_index - 1),
+                            block_index - 1,
+                            buffer_chooser(block_index - 1));
+                        read_block(
+                            input_file,
+                            block_index + 1,
+                            buffer_chooser(block_index + 1));
+                    });
+            } else if (block_index < num_blocks - 1) {
+                // Start background I/O to read next block.
+                new_future = io_pool->submit(
+                    [&input_file, &buffer_chooser, &read_block, block_index]
+                    () {
+                        read_block(
+                            input_file,
+                            block_index + 1,
+                            buffer_chooser(block_index + 1));
+                    });
+            } else if (block_index > 0) {
+                // Start background I/O to write previous block.
+                new_future = io_pool->submit(
+                    [&output_file_chooser, &buffer_chooser, &write_block,
+                     block_index]
+                    () {
+                        write_block(
+                            output_file_chooser(block_index - 1),
+                            block_index - 1,
+                            buffer_chooser(block_index - 1));
+                    });
+            }
+
+            // Wait for completion of previous background I/O, if any.
+            if (block_index > 0) {
+                io_future.get();
+            }
+
+            // Keep future for the new background I/O, if any.
+            io_future = std::move(new_future);
+        }
+
+        // Sort records in this block in-place.
        sort_records(
-            buffer.data(),
+            block_buffer,
            record_size,
            block_num_records,
            ctx.num_threads,
            thread_pool.get());

-        // Write block.
-        BinaryFile& output_file =
-            (block_index < num_blocks_file1) ? output_file1 : output_file2;
-        output_file.write(
-            buffer.data(),
-            first_record_idx * record_size,
-            block_num_records * record_size);
+        // If we use background I/O and this is the last block, wait for
+        // any previous background write to complete.
+        if (io_pool != NULL && block_index == num_blocks - 1) {
+            io_future.get();
+        }
+
+        // Write block via blocking I/O, if necessary.
+        // Use this to write all blocks when we do not use background I/O.
+        // Use this only for the last block when we have background I/O.
+        if (io_pool == NULL || block_index == num_blocks - 1) {
+            BinaryFile& output_file = output_file_chooser(block_index);
+            write_block(output_file, block_index, block_buffer);
+        }
    }

    timer.stop();
@ -1696,6 +1829,8 @@ void merge_blocks(
 * @param num_blocks            Number of input blocks.
 * @param branch_factor         Number of blocks to merge per output block.
 * @param filter_dupl           True to eliminate duplicate records.
+ * @param read_thread_pool      Optional thread pool for background read I/O.
+ * @param write_thread_pool     Optional thread pool for background write I/O.
 * @param ctx                   Reference to context structure.
 */
 void merge_pass(
@ -1703,6 +1838,8 @@ void merge_pass(
    BinaryFile& output_file,
    const SortStrategy::MergePass& merge_pass,
    bool filter_dupl,
+    ThreadPool * read_thread_pool,
+    ThreadPool * write_thread_pool,
    const SortContext& ctx)
 {
    size_t num_blocks = merge_pass.records_per_block.size();
@ -1811,9 +1948,11 @@ SortStrategy plan_multi_pass_strategy(
    const SortContext& ctx)
 {
    // Plan the initial sort pass.
-    // Use blocks that are at most half of available memory,
-    // so we can use two buffers to overlap I/O and sorting.
-    uint64_t max_sort_block_size = ctx.memory_size / 2;
+
+    // When background I/O is enabled, use blocks that are at most half of
+    // available memory so we can use two buffers to overlap I/O and sorting.
+    uint64_t max_sort_block_size =
+        ctx.flag_io_thread ? (ctx.memory_size / 2) : ctx.memory_size;

    // Calculate number of records per block.
    // Make sure this is a multiple of the transfer alignment size.
@ -2057,6 +2196,20 @@ void sortbin(
        log(ctx, "creating temporary file\n");
        BinaryTempFile temp_file(ctx.temporary_directory, file_size);

+        // Create thread pools for background I/O.
+        std::array<std::unique_ptr<ThreadPool>, 2> io_thread_pool;
+        if (ctx.flag_io_thread) {
+            // Create separate threads for reading and writing.
+            // Just 1 thread per pool because
+            //  - More than 1 write thread is useless because there is
+            //    only one sequential output stream.
+            //  - More than 1 read thread may increase seeking while
+            //    in fact we want to encourage big sequential transfers.
+            log(ctx, "creating background I/O threads\n");
+            io_thread_pool[0].reset(new ThreadPool(1));
+            io_thread_pool[1].reset(new ThreadPool(1));
+        }
+
        // The merge passes alternate between tempfile-to-outputfile and
        // outputfile-to-tempfile.
        BinaryFile * output_or_temp_file[2] = { &output_file, &temp_file };
@ -2082,6 +2235,7 @@ void sortbin(
                strategy.records_per_sort_block,
                strategy.num_sort_blocks,
                strategy.num_sort_blocks_first_merge,
+                io_thread_pool[0].get(),
                ctx);
        }

@ -2117,6 +2271,8 @@ void sortbin(
                *pass_output_file,
                strategy.merge_pass[mp],
                filter_dupl,
+                io_thread_pool[0].get(),
+                io_thread_pool[1].get(),
                ctx);
        }
    }