diff options
author | scharlottej13 <sarah@coiled.io> | 2024-02-02 12:59:55 -0800 |
---|---|---|
committer | scharlottej13 <sarah@coiled.io> | 2024-02-02 15:09:02 -0800 |
commit | 771a151ce64f10f0d9e0ee6a81e245f00de80cf6 (patch) | |
tree | e3c896028e7d7de65e7f9556eeca95586c95b92c | |
parent | e4726942230c17867815ca65a3983a42c2680b75 (diff) |
Updates to readme and data generation script
-rw-r--r-- | README.md | 15 | ||||
-rw-r--r-- | generate_data.py | 6 |
2 files changed, 17 insertions, 4 deletions
@@ -1 +1,14 @@ -# 1trc
\ No newline at end of file +# One Trillion Row Challenge + +Inspired by Gunnar Morling's [one billion row challenge](https://github.com/gunnarmorling/1brc), we thought we'd take things one step further and start the one trillion row challenge (1TRC). + +## Data Generation + +You can generate the dataset yourself using the [data generation script](generate_data.py). We've also hosted the dataset in a requester pays S3 bucket s3://coiled-datasets-rp/1trc in `us-east-1`. + +It draws a random sample of weather stations and normally distributed temperatures drawn from the mean for each station based on the values in [lookup.csv](lookup.csv). +## The Challenge + +The main task, like the 1BRC, is to calculate the min, mean, and max values per weather station, sorted alphabetically. + + diff --git a/generate_data.py b/generate_data.py index eca7e2a..fa30785 100644 --- a/generate_data.py +++ b/generate_data.py @@ -10,7 +10,7 @@ n = 1_000_000_000_000 # Total number of rows of data to generate chunksize = 10_000_000 # Number of rows of data per file std = 10.0 # Assume normally distributed temperatures with a standard deviation of 10 lookup_df = pd.read_csv("lookup.csv") # Lookup table of stations and their mean temperatures -bucket = "s3://oss-scratch-space/1trc" +bucket = "s3://coiled-datasets-rp/1trc" def generate_chunk(partition_idx, bucket, chunksize, std, lookup_df): @@ -35,10 +35,10 @@ def generate_chunk(partition_idx, bucket, chunksize, std, lookup_df): df.station = df.station.map(lookup_df.station) # Save this chunk to the output file - filename = f"measurements-{partition_idx}.txt" + filename = f"measurements-{partition_idx}.parquet" with tempfile.TemporaryDirectory() as tmpdir: local = os.path.join(tmpdir, filename) - df.to_csv(local, sep=";", header=False, index=False) + df.to_parquet(local, engine="pyarrow") fs = fsspec.filesystem("s3") fs.put(local, f"{bucket}/{filename}") |