Diffstat (limited to 'table')
-rw-r--r--  table/block_based/block_based_table_builder.cc  67
1 file changed, 32 insertions(+), 35 deletions(-)
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 3776004ca..bc7c113c6 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -1644,44 +1644,41 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
const size_t kSampleBytes = r->compression_opts.zstd_max_train_bytes > 0
? r->compression_opts.zstd_max_train_bytes
: r->compression_opts.max_dict_bytes;
+ const size_t kNumBlocksBuffered = r->data_block_and_keys_buffers.size();
+
+ // Abstract algebra teaches us that a finite cyclic group (such as the
+ // additive group of integers modulo N) is generated by any element coprime
+ // with N. Since N is variable (the number of buffered data blocks), we pick
+ // a prime, which is coprime with every N smaller than itself and thus with
+ // any feasible number of buffered blocks.
+ //
+ // One downside of this approach is the spread will be poor when
+ // `kPrimeGeneratorRemainder` is close to zero or close to
+ // `kNumBlocksBuffered`.
+ //
+ // Picked a random number between one and one trillion and then chose the
+ // next prime number greater than or equal to it.
+ const uint64_t kPrimeGenerator = 545055921143ull;
+ // Can avoid repeated division by just adding the remainder repeatedly.
+ const size_t kPrimeGeneratorRemainder = static_cast<size_t>(
+ kPrimeGenerator % static_cast<uint64_t>(kNumBlocksBuffered));
+ const size_t kInitSampleIdx = kNumBlocksBuffered / 2;
- // If buffer size is reasonable, we pre-generate a permutation to enforce
- // uniqueness. This prevents wasting samples on duplicates, which is
- // particularly likely when not many blocks were buffered.
- std::vector<uint16_t> data_block_order;
- size_t data_block_order_idx = 0;
- if (r->data_block_and_keys_buffers.size() <= ((1 << 16) - 1)) {
- data_block_order.resize(r->data_block_and_keys_buffers.size());
- std::iota(data_block_order.begin(), data_block_order.end(),
- static_cast<uint16_t>(0));
- // We could be smarter and interleave the shuffling and sample appending
- // logic. Then we could terminate as soon as `kSampleBytes` is reached,
- // saving some shuffling computation.
- RandomShuffle(data_block_order.begin(), data_block_order.end(),
- static_cast<uint32_t>(r->creation_time));
- }
-
- Random64 generator{r->creation_time};
std::string compression_dict_samples;
std::vector<size_t> compression_dict_sample_lens;
- if (!r->data_block_and_keys_buffers.empty()) {
- while ((data_block_order.empty() ||
- data_block_order_idx < data_block_order.size()) &&
- compression_dict_samples.size() < kSampleBytes) {
- size_t rand_idx;
- if (data_block_order.empty()) {
- rand_idx = static_cast<size_t>(
- generator.Uniform(r->data_block_and_keys_buffers.size()));
- } else {
- rand_idx = data_block_order[data_block_order_idx];
- ++data_block_order_idx;
- }
- size_t copy_len =
- std::min(kSampleBytes - compression_dict_samples.size(),
- r->data_block_and_keys_buffers[rand_idx].first.size());
- compression_dict_samples.append(
- r->data_block_and_keys_buffers[rand_idx].first, 0, copy_len);
- compression_dict_sample_lens.emplace_back(copy_len);
+ size_t buffer_idx = kInitSampleIdx;
+ for (size_t i = 0;
+ i < kNumBlocksBuffered && compression_dict_samples.size() < kSampleBytes;
+ ++i) {
+ size_t copy_len =
+ std::min(kSampleBytes - compression_dict_samples.size(),
+ r->data_block_and_keys_buffers[buffer_idx].first.size());
+ compression_dict_samples.append(
+ r->data_block_and_keys_buffers[buffer_idx].first, 0, copy_len);
+ compression_dict_sample_lens.emplace_back(copy_len);
+
+ buffer_idx += kPrimeGeneratorRemainder;
+ if (buffer_idx >= kNumBlocksBuffered) {
+ buffer_idx -= kNumBlocksBuffered;
}
}
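
A quick way to sanity-check the coprimality argument in the new comment: the
following standalone sketch (illustrative only; `kPrimeGenerator` and the
index-update logic mirror the diff, while the block count of 1000 is a made-up
example) walks indices with the same prime-derived stride and asserts that
every buffered block is visited exactly once before the walk wraps around.

    // prime_stride_demo.cc -- illustrative sketch, not part of the commit.
    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    int main() {
      const uint64_t kPrimeGenerator = 545055921143ull;  // prime from the diff
      const size_t kNumBlocksBuffered = 1000;  // hypothetical buffered blocks
      const size_t kPrimeGeneratorRemainder = static_cast<size_t>(
          kPrimeGenerator % static_cast<uint64_t>(kNumBlocksBuffered));
      const size_t kInitSampleIdx = kNumBlocksBuffered / 2;

      std::vector<bool> visited(kNumBlocksBuffered, false);
      size_t buffer_idx = kInitSampleIdx;
      for (size_t i = 0; i < kNumBlocksBuffered; ++i) {
        // gcd(stride, N) == 1, so no index repeats within the first N steps.
        assert(!visited[buffer_idx]);
        visited[buffer_idx] = true;
        buffer_idx += kPrimeGeneratorRemainder;
        if (buffer_idx >= kNumBlocksBuffered) {
          buffer_idx -= kNumBlocksBuffered;
        }
      }
      for (bool v : visited) {
        assert(v);  // every block index was reached exactly once
      }
      return 0;
    }

This is also why the removed RandomShuffle permutation is no longer needed: the
stride walk itself enumerates a permutation of the block indices, using O(1)
extra memory and no random-number generator state.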
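
One edge case worth flagging: the removed code guarded sampling with
`!r->data_block_and_keys_buffers.empty()`, whereas the new code evaluates
`kPrimeGenerator % kNumBlocksBuffered` unconditionally, which is undefined when
`kNumBlocksBuffered` is zero. If EnterUnbuffered() can run with no buffered
data blocks, a guard along these lines would restore the old behavior (a hedged
sketch; the variable name and placement are assumptions, not part of the shown
commit):

    // Hypothetical guard: only derive the stride when there is something to
    // sample; the sampling loop itself already no-ops when there are zero
    // buffered blocks.
    size_t prime_generator_remainder = 0;
    if (kNumBlocksBuffered > 0) {
      prime_generator_remainder = static_cast<size_t>(
          kPrimeGenerator % static_cast<uint64_t>(kNumBlocksBuffered));
    }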