author    Sarah Charlotte Johnson <scharlottej13@gmail.com>  2024-02-02 15:36:01 -0800
committer GitHub <noreply@github.com>  2024-02-02 15:36:01 -0800
commit    c1a3a39f869263cc13020a535dfe22b8774ca40b (patch)
tree      75e1898242594e7f14c238fafb78f4c7067aae98
parent    e4726942230c17867815ca65a3983a42c2680b75 (diff)
parent    bb23f9245dcd631c33000657bd21c1fe532abfcc (diff)
Merge pull request #1 from coiled/sarah/1trc
Updates to readme and data generation script
-rw-r--r--  README.md         16
-rw-r--r--  generate_data.py   9
2 files changed, 21 insertions(+), 4 deletions(-)
diff --git a/README.md b/README.md
index ed1b139..603dc34 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,15 @@
-# 1trc
\ No newline at end of file
+# One Trillion Row Challenge
+
+Inspired by Gunnar Morling's [one billion row challenge](https://github.com/gunnarmorling/1brc), we thought we'd take things one step further and start the one trillion row challenge (1TRC).
+
+## Data Generation
+
+You can generate the dataset yourself using the [data generation script](generate_data.py), adapted from [Jacob Tomlinson's data generation script](https://github.com/gunnarmorling/1brc/discussions/487). We've also hosted the dataset in a requester-pays S3 bucket `s3://coiled-datasets-rp/1trc` in `us-east-1`.
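+
+As a rough sketch, reading a single file from that bucket might look like the following (assumes `pandas` and `s3fs` are installed and that file names follow the script's `measurements-{i}.parquet` pattern; as the requester, you pay the transfer costs):
+
+```python
+import pandas as pd
+
+df = pd.read_parquet(
+    "s3://coiled-datasets-rp/1trc/measurements-0.parquet",
+    storage_options={"requester_pays": True},
+)
+```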
+
+The script draws a random sample of weather stations and, for each, a temperature drawn from a normal distribution centered on that station's mean, based on the values in [lookup.csv](lookup.csv).
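+
+In sketch form, that sampling step looks roughly like this (simplified; assumes the second column of `lookup.csv` holds each station's mean temperature):
+
+```python
+import numpy as np
+import pandas as pd
+
+lookup_df = pd.read_csv("lookup.csv")
+rng = np.random.default_rng()
+
+# Pick random stations, then draw each temperature from a normal
+# distribution centered on that station's mean (std = 10.0)
+idx = rng.integers(0, len(lookup_df), size=1_000)
+temps = rng.normal(loc=lookup_df.iloc[idx, 1].to_numpy(), scale=10.0)
+```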
+
+## The Challenge
+
+As in the 1BRC, the main task is to calculate the min, mean, and max temperature per weather station, with the results sorted alphabetically by station.
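+
+One possible (unoptimized) shape of a solution, assuming hypothetical column names `station` and `measure`:
+
+```python
+import pandas as pd
+
+df = pd.read_parquet("measurements-0.parquet")
+result = (
+    df.groupby("station")["measure"]
+    .agg(["min", "mean", "max"])
+    .sort_index()  # stations sorted alphabetically
+)
+```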
+
+
diff --git a/generate_data.py b/generate_data.py
index eca7e2a..f5de1a8 100644
--- a/generate_data.py
+++ b/generate_data.py
@@ -1,3 +1,6 @@
+# This script was adapted from Jacob Tomlinson's 1BRC submission
+# https://github.com/gunnarmorling/1brc/discussions/487
+
import os
import tempfile
import coiled
@@ -10,7 +13,7 @@
n = 1_000_000_000_000 # Total number of rows of data to generate
chunksize = 10_000_000 # Number of rows of data per file
std = 10.0 # Assume normally distributed temperatures with a standard deviation of 10
lookup_df = pd.read_csv("lookup.csv") # Lookup table of stations and their mean temperatures
-bucket = "s3://oss-scratch-space/1trc"
+bucket = "s3://coiled-datasets-rp/1trc"
def generate_chunk(partition_idx, bucket, chunksize, std, lookup_df):
@@ -35,10 +38,10 @@ def generate_chunk(partition_idx, bucket, chunksize, std, lookup_df):
df.station = df.station.map(lookup_df.station)
# Save this chunk to the output file
- filename = f"measurements-{partition_idx}.txt"
+ filename = f"measurements-{partition_idx}.parquet"
with tempfile.TemporaryDirectory() as tmpdir:
local = os.path.join(tmpdir, filename)
- df.to_csv(local, sep=";", header=False, index=False)
+ df.to_parquet(local, engine="pyarrow")
fs = fsspec.filesystem("s3")
fs.put(local, f"{bucket}/{filename}")
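
A minimal smoke test of the chunk generator, assuming `generate_chunk` and its imports are in scope (the scratch bucket name here is hypothetical; a tiny `chunksize` keeps the write cheap):

```python
import pandas as pd

lookup_df = pd.read_csv("lookup.csv")

# Writes one small Parquet chunk to a bucket you control
generate_chunk(
    partition_idx=0,
    bucket="s3://my-scratch-bucket/1trc-test",
    chunksize=1_000,
    std=10.0,
    lookup_df=lookup_df,
)
```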