Implement duplicate filtering - not yet tested
This commit is contained in:
		
							parent
							
								
									1da4983c7d
								
							
						
					
					
						commit
						af0d74dc21
					
				
							
								
								
									
										95
									
								
								sortbin.cpp
								
								
								
								
							
							
						
						
									
										95
									
								
								sortbin.cpp
								
								
								
								
							|  | @ -1224,42 +1224,42 @@ void sort_pass( | ||||||
|  * @param instream2         Input stream containing block 2. |  * @param instream2         Input stream containing block 2. | ||||||
|  * @param output_stream     Output stream for the merged block. |  * @param output_stream     Output stream for the merged block. | ||||||
|  * @param record_size       Record size in bytes. |  * @param record_size       Record size in bytes. | ||||||
|  * @param filter_dupl       True to eliminate duplicate records. |  | ||||||
|  */ |  */ | ||||||
| void merge_2_blocks( | void merge_2_blocks( | ||||||
|     RecordInputStream& instream1, |     RecordInputStream& instream1, | ||||||
|     RecordInputStream& instream2, |     RecordInputStream& instream2, | ||||||
|     RecordOutputStream& output_stream, |     RecordOutputStream& output_stream, | ||||||
|     size_t record_size, |     size_t record_size) | ||||||
|     bool filter_dupl) |  | ||||||
| { | { | ||||||
|  |     // Input blocks should not be empty.
 | ||||||
|  |     assert(!instream1.empty()); | ||||||
|  |     assert(!instream2.empty()); | ||||||
|  | 
 | ||||||
|  |     const unsigned char * rec1 = instream1.record(); | ||||||
|  |     const unsigned char * rec2 = instream2.record(); | ||||||
|  | 
 | ||||||
|     // Merge until one stream runs empty.
 |     // Merge until one stream runs empty.
 | ||||||
|     if (!instream1.empty() && !instream2.empty()) { |     while (true) { | ||||||
|         const unsigned char * rec1 = instream1.record(); |  | ||||||
|         const unsigned char * rec2 = instream2.record(); |  | ||||||
| 
 | 
 | ||||||
|         while (true) { |         // Choose which record should go first.
 | ||||||
| 
 |         if (record_compare(rec1, rec2, record_size) < 0) { | ||||||
| // TODO TODO : filter duplicates
 |             // Push record from stream 1 and load next record.
 | ||||||
| 
 |             output_stream.put(rec1); | ||||||
|             // Choose which record should go first.
 |             instream1.next_record(); | ||||||
|             if (record_compare(rec1, rec2, record_size) < 0) { |             if (instream1.empty()) { | ||||||
|                 // Push record from stream 1 and load next record.
 |                 rec1 = NULL; | ||||||
|                 output_stream.put(rec1); |                 break; | ||||||
|                 instream1.next_record(); |  | ||||||
|                 if (instream1.empty()) { |  | ||||||
|                     break; |  | ||||||
|                 } |  | ||||||
|                 rec1 = instream1.record(); |  | ||||||
|             } else { |  | ||||||
|                 // Push record from stream 2 and load next record.
 |  | ||||||
|                 output_stream.put(rec2); |  | ||||||
|                 instream2.next_record(); |  | ||||||
|                 if (instream2.empty()) { |  | ||||||
|                     break; |  | ||||||
|                 } |  | ||||||
|                 rec2 = instream2.record(); |  | ||||||
|             } |             } | ||||||
|  |             rec1 = instream1.record(); | ||||||
|  |         } else { | ||||||
|  |             // Push record from stream 2 and load next record.
 | ||||||
|  |             output_stream.put(rec2); | ||||||
|  |             instream2.next_record(); | ||||||
|  |             if (instream2.empty()) { | ||||||
|  |                 rec2 = NULL; | ||||||
|  |                 break; | ||||||
|  |             } | ||||||
|  |             rec2 = instream2.record(); | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  | @ -1320,28 +1320,46 @@ void merge_n_blocks( | ||||||
| 
 | 
 | ||||||
|     // Get the first element of each block.
 |     // Get the first element of each block.
 | ||||||
|     for (unsigned int i = 0; i < branch_factor; i++) { |     for (unsigned int i = 0; i < branch_factor; i++) { | ||||||
|         if (!input_streams[i]->empty()) { |         // Input blocks should not be empty.
 | ||||||
|             heap.emplace_back(input_streams[i]->record(), |         assert(!input_streams[i]->empty()); | ||||||
|                               input_streams[i].get()); |         heap.emplace_back(input_streams[i]->record(), input_streams[i].get()); | ||||||
|         } |  | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     // Make a heap of the first blocks.
 |     // Make a heap of the first blocks.
 | ||||||
|     std::make_heap(heap.begin(), heap.end(), cmp_heap_elem); |     std::make_heap(heap.begin(), heap.end(), cmp_heap_elem); | ||||||
| 
 | 
 | ||||||
|  |     // Allocate a temporary record for duplicate filtering.
 | ||||||
|  |     std::vector<unsigned char> temp_record(record_size); | ||||||
|  | 
 | ||||||
|  |     // The very first record can not be filtered out.
 | ||||||
|  |     bool filter_first_pass = true; | ||||||
|  | 
 | ||||||
|     // Keep merging until the heap runs empty.
 |     // Keep merging until the heap runs empty.
 | ||||||
|     while (!heap.empty()) { |     while (!heap.empty()) { | ||||||
| 
 | 
 | ||||||
| // TODO TODO : filter duplicates
 |  | ||||||
| 
 |  | ||||||
|         // Extract the first element from the heap.
 |         // Extract the first element from the heap.
 | ||||||
|         const unsigned char * rec; |         const unsigned char * rec; | ||||||
|         RecordInputStream * instream; |         RecordInputStream * instream; | ||||||
|         std::tie(rec, instream) = heap[0]; |         std::tie(rec, instream) = heap[0]; | ||||||
|         std::pop_heap(heap.begin(), heap.end()); |         std::pop_heap(heap.begin(), heap.end()); | ||||||
| 
 | 
 | ||||||
|         // Push this element to the output block.
 |         if (filter_dupl) { | ||||||
|         output_stream.put(rec); | 
 | ||||||
|  |             // Compare against previous record, only output if different.
 | ||||||
|  |             if (filter_first_pass | ||||||
|  |                 || record_compare(temp_record.data(), rec, record_size) != 0) | ||||||
|  |             { | ||||||
|  |                 output_stream.put(rec); | ||||||
|  |                 record_copy(temp_record.data(), rec, record_size); | ||||||
|  |             } | ||||||
|  |             filter_first_pass = false; | ||||||
|  | 
 | ||||||
|  |         } else { | ||||||
|  | 
 | ||||||
|  |             // No filtering, just push record to the output block.
 | ||||||
|  |             output_stream.put(rec); | ||||||
|  | 
 | ||||||
|  |         } | ||||||
| 
 | 
 | ||||||
|         // Try to pull the next record from this input stream.
 |         // Try to pull the next record from this input stream.
 | ||||||
|         instream->next_record(); |         instream->next_record(); | ||||||
|  | @ -1462,19 +1480,18 @@ void merge_pass( | ||||||
|                 instream->next_record(); |                 instream->next_record(); | ||||||
|             } |             } | ||||||
| 
 | 
 | ||||||
|         } else if (this_branch_factor == 2) { |         } else if (this_branch_factor == 2 && !filter_dupl) { | ||||||
| 
 | 
 | ||||||
|             // Special case for merging 2 blocks.
 |             // Special case for merging 2 blocks.
 | ||||||
|             merge_2_blocks( |             merge_2_blocks( | ||||||
|                 *input_streams[0], |                 *input_streams[0], | ||||||
|                 *input_streams[1], |                 *input_streams[1], | ||||||
|                 output_stream, |                 output_stream, | ||||||
|                 ctx.record_size, |                 ctx.record_size); | ||||||
|                 filter_dupl); |  | ||||||
| 
 | 
 | ||||||
|         } else { |         } else { | ||||||
| 
 | 
 | ||||||
|             // Merge more than 2 blocks.
 |             // Merge more than 2 blocks or filter duplicates.
 | ||||||
|             merge_n_blocks( |             merge_n_blocks( | ||||||
|                 input_streams, |                 input_streams, | ||||||
|                 output_stream, |                 output_stream, | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue