1
0
Fork 0

Implement duplicate filtering - not yet tested

This commit is contained in:
Joris van Rantwijk 2022-06-22 22:45:09 +02:00
parent 1da4983c7d
commit af0d74dc21
1 changed files with 56 additions and 39 deletions

View File

@ -1224,30 +1224,30 @@ void sort_pass(
* @param instream2 Input stream containing block 2. * @param instream2 Input stream containing block 2.
* @param output_stream Output stream for the merged block. * @param output_stream Output stream for the merged block.
* @param record_size Record size in bytes. * @param record_size Record size in bytes.
* @param filter_dupl True to eliminate duplicate records.
*/ */
void merge_2_blocks( void merge_2_blocks(
RecordInputStream& instream1, RecordInputStream& instream1,
RecordInputStream& instream2, RecordInputStream& instream2,
RecordOutputStream& output_stream, RecordOutputStream& output_stream,
size_t record_size, size_t record_size)
bool filter_dupl)
{ {
// Merge until one stream runs empty. // Input blocks should not be empty.
if (!instream1.empty() && !instream2.empty()) { assert(!instream1.empty());
assert(!instream2.empty());
const unsigned char * rec1 = instream1.record(); const unsigned char * rec1 = instream1.record();
const unsigned char * rec2 = instream2.record(); const unsigned char * rec2 = instream2.record();
// Merge until one stream runs empty.
while (true) { while (true) {
// TODO TODO : filter duplicates
// Choose which record should go first. // Choose which record should go first.
if (record_compare(rec1, rec2, record_size) < 0) { if (record_compare(rec1, rec2, record_size) < 0) {
// Push record from stream 1 and load next record. // Push record from stream 1 and load next record.
output_stream.put(rec1); output_stream.put(rec1);
instream1.next_record(); instream1.next_record();
if (instream1.empty()) { if (instream1.empty()) {
rec1 = NULL;
break; break;
} }
rec1 = instream1.record(); rec1 = instream1.record();
@ -1256,12 +1256,12 @@ void merge_2_blocks(
output_stream.put(rec2); output_stream.put(rec2);
instream2.next_record(); instream2.next_record();
if (instream2.empty()) { if (instream2.empty()) {
rec2 = NULL;
break; break;
} }
rec2 = instream2.record(); rec2 = instream2.record();
} }
} }
}
// At most one of the streams still has records left. // At most one of the streams still has records left.
// Copy those records to the output. // Copy those records to the output.
@ -1320,28 +1320,46 @@ void merge_n_blocks(
// Get the first element of each block. // Get the first element of each block.
for (unsigned int i = 0; i < branch_factor; i++) { for (unsigned int i = 0; i < branch_factor; i++) {
if (!input_streams[i]->empty()) { // Input blocks should not be empty.
heap.emplace_back(input_streams[i]->record(), assert(!input_streams[i]->empty());
input_streams[i].get()); heap.emplace_back(input_streams[i]->record(), input_streams[i].get());
}
} }
// Make a heap of the first blocks. // Make a heap of the first blocks.
std::make_heap(heap.begin(), heap.end(), cmp_heap_elem); std::make_heap(heap.begin(), heap.end(), cmp_heap_elem);
// Allocate a temporary record for duplicate filtering.
std::vector<unsigned char> temp_record(record_size);
// The very first record can not be filtered out.
bool filter_first_pass = true;
// Keep merging until the heap runs empty. // Keep merging until the heap runs empty.
while (!heap.empty()) { while (!heap.empty()) {
// TODO TODO : filter duplicates
// Extract the first element from the heap. // Extract the first element from the heap.
const unsigned char * rec; const unsigned char * rec;
RecordInputStream * instream; RecordInputStream * instream;
std::tie(rec, instream) = heap[0]; std::tie(rec, instream) = heap[0];
std::pop_heap(heap.begin(), heap.end()); std::pop_heap(heap.begin(), heap.end());
// Push this element to the output block. if (filter_dupl) {
// Compare against previous record, only output if different.
if (filter_first_pass
|| record_compare(temp_record.data(), rec, record_size) != 0)
{
output_stream.put(rec); output_stream.put(rec);
record_copy(temp_record.data(), rec, record_size);
}
filter_first_pass = false;
} else {
// No filtering, just push record to the output block.
output_stream.put(rec);
}
// Try to pull the next record from this input stream. // Try to pull the next record from this input stream.
instream->next_record(); instream->next_record();
@ -1462,19 +1480,18 @@ void merge_pass(
instream->next_record(); instream->next_record();
} }
} else if (this_branch_factor == 2) { } else if (this_branch_factor == 2 && !filter_dupl) {
// Special case for merging 2 blocks. // Special case for merging 2 blocks.
merge_2_blocks( merge_2_blocks(
*input_streams[0], *input_streams[0],
*input_streams[1], *input_streams[1],
output_stream, output_stream,
ctx.record_size, ctx.record_size);
filter_dupl);
} else { } else {
// Merge more than 2 blocks. // Merge more than 2 blocks or filter duplicates.
merge_n_blocks( merge_n_blocks(
input_streams, input_streams,
output_stream, output_stream,