author      Andrew Kryczka <andrewkr@fb.com>    2021-02-22 17:41:11 -0800
committer   Facebook GitHub Bot <facebook-github-bot@users.noreply.github.com>    2021-02-22 17:43:03 -0800
commit      daca92c17ae7d89fa34f7aeb58fbae22938eb226 (patch)
tree        995f88190c9028e91d912d0e81fe0187c8a50ac1 /table
parent      59d91796d23006838fb973a5db70fdf712b5237b (diff)
Pick samples for compression dictionary using prime number (#7987)
Summary: The sample selection technique taken in
https://github.com/facebook/rocksdb/issues/7970 was problematic because it had
two code paths for sample selection depending on the number of data blocks, and
one of those code paths involved an allocation. Using prime numbers, we can
consolidate into one code path without allocation. The downside is there will
be values of N (number of data blocks buffered) that suffer from poor spread in
the selected samples.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/7987

Test Plan: `make check -j48`

Reviewed By: pdillinger

Differential Revision: D26586147

Pulled By: ajkr

fbshipit-source-id: 62028e54336fadb6e2c7a7fe6747daa05a263d32
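As a quick illustration of the property the patch relies on (a standalone sketch, not part of the commit; names such as `stride` and `visited` are illustrative only): because 545055921143 is prime, the stride `kPrimeGenerator % N` shares no factor with any block count N below it, so repeatedly adding the stride modulo N visits every buffered block exactly once before the walk repeats.

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const uint64_t kPrimeGenerator = 545055921143ull;  // prime from the patch
  // Try several block counts N, including composites and a power of two.
  for (size_t n : {1, 2, 7, 100, 4096, 99999}) {
    const size_t stride = static_cast<size_t>(kPrimeGenerator % n);
    std::vector<bool> visited(n, false);
    size_t idx = n / 2;  // same starting point as kInitSampleIdx in the patch
    for (size_t i = 0; i < n; ++i) {
      visited[idx] = true;
      idx += stride;
      if (idx >= n) idx -= n;  // cheap modulo, as in the patch
    }
    size_t count = 0;
    for (bool v : visited) count += v;
    // Expect visited == N for every N: the walk is a full cycle of Z/NZ.
    std::printf("N=%zu stride=%zu visited=%zu/%zu\n", n, stride, count, n);
  }
  return 0;
}

A non-prime generator could share a factor g with some N and would then cycle through only N/g of the blocks, which is why the patch insists on a prime.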
Diffstat (limited to 'table')
-rw-r--r--  table/block_based/block_based_table_builder.cc | 67
1 file changed, 32 insertions(+), 35 deletions(-)
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 3776004ca..bc7c113c6 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -1644,44 +1644,41 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
const size_t kSampleBytes = r->compression_opts.zstd_max_train_bytes > 0
? r->compression_opts.zstd_max_train_bytes
: r->compression_opts.max_dict_bytes;
+ const size_t kNumBlocksBuffered = r->data_block_and_keys_buffers.size();
+
+ // Abstract algebra teaches us that a finite cyclic group (such as the
+ // additive group of integers modulo N) can be generated by a number that is
+ // coprime with N. Since N is variable (number of buffered data blocks), we
+ // must then pick a prime number in order to guarantee coprimeness with any N.
+ //
+ // One downside of this approach is the spread will be poor when
+ // `kPrimeGeneratorRemainder` is close to zero or close to
+ // `kNumBlocksBuffered`.
+ //
+ // Picked a random number between one and one trillion and then chose the
+ // next prime number greater than or equal to it.
+ const uint64_t kPrimeGenerator = 545055921143ull;
+ // Can avoid repeated division by just adding the remainder repeatedly.
+ const size_t kPrimeGeneratorRemainder = static_cast<size_t>(
+ kPrimeGenerator % static_cast<uint64_t>(kNumBlocksBuffered));
+ const size_t kInitSampleIdx = kNumBlocksBuffered / 2;
- // If buffer size is reasonable, we pre-generate a permutation to enforce
- // uniqueness. This prevents wasting samples on duplicates, which is
- // particularly likely when not many blocks were buffered.
- std::vector<uint16_t> data_block_order;
- size_t data_block_order_idx = 0;
- if (r->data_block_and_keys_buffers.size() <= ((1 << 16) - 1)) {
- data_block_order.resize(r->data_block_and_keys_buffers.size());
- std::iota(data_block_order.begin(), data_block_order.end(),
- static_cast<uint16_t>(0));
- // We could be smarter and interleave the shuffling and sample appending
- // logic. Then we could terminate as soon as `kSampleBytes` is reached,
- // saving some shuffling computation.
- RandomShuffle(data_block_order.begin(), data_block_order.end(),
- static_cast<uint32_t>(r->creation_time));
- }
-
- Random64 generator{r->creation_time};
std::string compression_dict_samples;
std::vector<size_t> compression_dict_sample_lens;
- if (!r->data_block_and_keys_buffers.empty()) {
- while ((data_block_order.empty() ||
- data_block_order_idx < data_block_order.size()) &&
- compression_dict_samples.size() < kSampleBytes) {
- size_t rand_idx;
- if (data_block_order.empty()) {
- rand_idx = static_cast<size_t>(
- generator.Uniform(r->data_block_and_keys_buffers.size()));
- } else {
- rand_idx = data_block_order[data_block_order_idx];
- ++data_block_order_idx;
- }
- size_t copy_len =
- std::min(kSampleBytes - compression_dict_samples.size(),
- r->data_block_and_keys_buffers[rand_idx].first.size());
- compression_dict_samples.append(
- r->data_block_and_keys_buffers[rand_idx].first, 0, copy_len);
- compression_dict_sample_lens.emplace_back(copy_len);
+ size_t buffer_idx = kInitSampleIdx;
+ for (size_t i = 0;
+ i < kNumBlocksBuffered && compression_dict_samples.size() < kSampleBytes;
+ ++i) {
+ size_t copy_len =
+ std::min(kSampleBytes - compression_dict_samples.size(),
+ r->data_block_and_keys_buffers[buffer_idx].first.size());
+ compression_dict_samples.append(
+ r->data_block_and_keys_buffers[buffer_idx].first, 0, copy_len);
+ compression_dict_sample_lens.emplace_back(copy_len);
+
+ buffer_idx += kPrimeGeneratorRemainder;
+ if (buffer_idx >= kNumBlocksBuffered) {
+ buffer_idx -= kNumBlocksBuffered;
}
}
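The "poor spread" caveat in the new comment is easy to reproduce (again a sketch, not RocksDB code): with N = 7 buffered blocks, 545055921143 % 7 == 6, so the stride is N - 1 and the walk steps through adjacent blocks in descending order. If kSampleBytes is exhausted after only a few iterations, every sample comes from one small neighborhood of the buffer.

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t kPrimeGenerator = 545055921143ull;
  const size_t n = 7;  // kNumBlocksBuffered; 545055921143 % 7 == 6 == n - 1
  const size_t stride = static_cast<size_t>(kPrimeGenerator % n);
  size_t idx = n / 2;  // kInitSampleIdx
  std::printf("stride = %zu\n", stride);
  for (size_t i = 0; i < n; ++i) {
    // Prints blocks 3, 2, 1, 0, 6, 5, 4: full coverage eventually, but
    // consecutive samples are adjacent, so a small budget sees one region.
    std::printf("sample %zu -> block %zu\n", i, idx);
    idx += stride;
    if (idx >= n) idx -= n;
  }
  return 0;
}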