From 771a151ce64f10f0d9e0ee6a81e245f00de80cf6 Mon Sep 17 00:00:00 2001 From: scharlottej13 Date: Fri, 2 Feb 2024 12:59:55 -0800 Subject: Updates to readme and data generation script --- README.md | 15 ++++++++++++++- generate_data.py | 6 +++--- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index ed1b139..9b456e5 100644 --- a/README.md +++ b/README.md @@ -1 +1,14 @@ -# 1trc \ No newline at end of file +# One Trillion Row Challenge + +Inspired by Gunnar Morling's [one billion row challenge](https://github.com/gunnarmorling/1brc), we thought we'd take things one step further and start the one trillion row challenge (1TRC). + +## Data Generation + +You can generate the dataset yourself using the [data generation script](generate_data.py). We've also hosted the dataset in a requester pays S3 bucket s3://coiled-datasets-rp/1trc in `us-east-1`. + +It draws a random sample of weather stations and normally distributed temperatures drawn from the mean for each station based on the values in [lookup.csv](lookup.csv). +## The Challenge + +The main task, like the 1BRC, is to calculate the min, mean, and max values per weather station, sorted alphabetically. 
+ + diff --git a/generate_data.py index eca7e2a..fa30785 100644 --- a/generate_data.py +++ b/generate_data.py @@ -10,7 +10,7 @@ n = 1_000_000_000_000 # Total number of rows of data to generate chunksize = 10_000_000 # Number of rows of data per file std = 10.0 # Assume normally distributed temperatures with a standard deviation of 10 lookup_df = pd.read_csv("lookup.csv") # Lookup table of stations and their mean temperatures -bucket = "s3://oss-scratch-space/1trc" +bucket = "s3://coiled-datasets-rp/1trc" def generate_chunk(partition_idx, bucket, chunksize, std, lookup_df): @@ -35,10 +35,10 @@ def generate_chunk(partition_idx, bucket, chunksize, std, lookup_df): df.station = df.station.map(lookup_df.station) # Save this chunk to the output file - filename = f"measurements-{partition_idx}.txt" + filename = f"measurements-{partition_idx}.parquet" with tempfile.TemporaryDirectory() as tmpdir: local = os.path.join(tmpdir, filename) - df.to_csv(local, sep=";", header=False, index=False) + df.to_parquet(local, engine="pyarrow") fs = fsspec.filesystem("s3") fs.put(local, f"{bucket}/{filename}") -- cgit v1.2.3-70-g09d2 From 7ec23a39198016eb285cee324c0f967ffda8b084 Mon Sep 17 00:00:00 2001 From: Sarah Charlotte Johnson Date: Fri, 2 Feb 2024 15:32:50 -0800 Subject: Update README.md Co-authored-by: James Bourbeau --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9b456e5..075e993 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Inspired by Gunnar Morling's [one billion row challenge](https://github.com/gunn ## Data Generation -You can generate the dataset yourself using the [data generation script](generate_data.py). We've also hosted the dataset in a requester pays S3 bucket s3://coiled-datasets-rp/1trc in `us-east-1`. +You can generate the dataset yourself using the [data generation script](generate_data.py). 
We've also hosted the dataset in a requester pays S3 bucket `s3://coiled-datasets-rp/1trc` in `us-east-1`. It draws a random sample of weather stations and normally distributed temperatures drawn from the mean for each station based on the values in [lookup.csv](lookup.csv). ## The Challenge -- cgit v1.2.3-70-g09d2 From bb23f9245dcd631c33000657bd21c1fe532abfcc Mon Sep 17 00:00:00 2001 From: scharlottej13 Date: Fri, 2 Feb 2024 15:35:13 -0800 Subject: Add attribution to Jacob's script --- README.md | 3 ++- generate_data.py | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 075e993..603dc34 100644 --- a/README.md +++ b/README.md @@ -4,9 +4,10 @@ Inspired by Gunnar Morling's [one billion row challenge](https://github.com/gunn ## Data Generation -You can generate the dataset yourself using the [data generation script](generate_data.py). We've also hosted the dataset in a requester pays S3 bucket `s3://coiled-datasets-rp/1trc` in `us-east-1`. +You can generate the dataset yourself using the [data generation script](generate_data.py), adapted from [Jacob Tomlinson's data generation script](https://github.com/gunnarmorling/1brc/discussions/487). We've also hosted the dataset in a requester pays S3 bucket `s3://coiled-datasets-rp/1trc` in `us-east-1`. It draws a random sample of weather stations and normally distributed temperatures drawn from the mean for each station based on the values in [lookup.csv](lookup.csv). + ## The Challenge The main task, like the 1BRC, is to calculate the min, mean, and max values per weather station, sorted alphabetically. diff --git a/generate_data.py b/generate_data.py index fa30785..f5de1a8 100644 --- a/generate_data.py +++ b/generate_data.py @@ -1,3 +1,6 @@ +# This script was adapted from Jacob Tomlinson's 1BRC submission +# https://github.com/gunnarmorling/1brc/discussions/487 + import os import tempfile import coiled -- cgit v1.2.3-70-g09d2