summary refs log tree commit diff
diff options
context:
space:
mode:
author James Bourbeau <jrbourbeau@gmail.com> 2024-01-29 16:52:38 -0600
committer James Bourbeau <jrbourbeau@gmail.com> 2024-01-29 16:52:38 -0600
commit 970dc9727731c8360463bc7f00b4e388ea03b2f1 (patch)
tree eaf01c294a1ef007665939f346551644e98c3249
parent d11beb0b3c99682c0d6afc3bb7adea733cbfc7e7 (diff)
Initial commit
-rw-r--r-- environment.yml 8
-rw-r--r-- generate_data.py 65
-rw-r--r-- lookup.csv 414
3 files changed, 487 insertions, 0 deletions
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..dbb60dc
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,8 @@
+name: 1trc
+channels:
+ - conda-forge
+dependencies:
+ - python=3.12
+ - dask
+ - coiled
+ - s3fs
\ No newline at end of file
diff --git a/generate_data.py b/generate_data.py
new file mode 100644
index 0000000..eca7e2a
--- /dev/null
+++ b/generate_data.py
@@ -0,0 +1,65 @@
+import os
+import tempfile
+import coiled
+import fsspec
+import numpy as np
+import pandas as pd
+from dask.distributed import progress
+
+# Destination prefix; generated files are uploaded beneath it
+bucket = "s3://oss-scratch-space/1trc"
+# Lookup table of stations and their mean temperatures (path is relative to the working directory)
+lookup_df = pd.read_csv("lookup.csv")
+# Temperatures are assumed to be normally distributed with a standard deviation of 10
+std = 10.0
+# Number of rows of data per file
+chunksize = 10_000_000
+# Total number of rows of data to generate
+n = 1_000_000_000_000
+
+
+def generate_chunk(partition_idx, bucket, chunksize, std, lookup_df):
+    """Generate one chunk of sample measurements and upload it to ``bucket``.
+
+    Each row pairs a station drawn uniformly at random from ``lookup_df`` with a
+    temperature drawn from a normal distribution centred on that station's
+    ``mean_temp`` and rounded to one decimal place. The chunk is written as a
+    ``;``-separated file without header or index and uploaded as
+    ``{bucket}/measurements-{partition_idx}.txt``.
+
+    Parameters
+    ----------
+    partition_idx : int
+        Index of this chunk. Used both as the random seed and in the output
+        filename, so the same index always produces the same file.
+    bucket : str
+        Destination prefix handed to the fsspec ``s3`` filesystem.
+    chunksize : int
+        Number of rows to generate.
+    std : float
+        Standard deviation of the generated temperatures.
+    lookup_df : pandas.DataFrame
+        Table with ``station`` and ``mean_temp`` columns. Rows are looked up by
+        index label, so this assumes the default 0..len-1 index that
+        ``pd.read_csv`` produces.
+    """
+
+    n_rows = int(chunksize)
+    rng = np.random.default_rng(partition_idx)  # Deterministic data generation
+    df = pd.DataFrame(
+        {
+            # Choose a random station from the lookup table for each row in our output.
+            # The upper bound of ``integers`` is exclusive, so pass len(lookup_df)
+            # (not len - 1); otherwise the last station could never be selected.
+            "station": rng.integers(0, len(lookup_df), n_rows),
+            # Generate a normal distribution around zero for each row in our output
+            # Because the std is the same for every station we can adjust the mean for each row afterwards
+            "measure": rng.normal(0, std, n_rows),
+        }
+    )
+
+    # Offset each measurement by the station's mean value
+    df["measure"] = df["measure"] + df["station"].map(lookup_df["mean_temp"])
+    # Round the temperature to one decimal place
+    df["measure"] = df["measure"].round(decimals=1)
+    # Convert the station index to the station name
+    df["station"] = df["station"].map(lookup_df["station"])
+
+    # Save this chunk to a local temporary file, then upload it; the temporary
+    # directory (and the local copy) is removed when the ``with`` block exits
+    filename = f"measurements-{partition_idx}.txt"
+    with tempfile.TemporaryDirectory() as tmpdir:
+        local = os.path.join(tmpdir, filename)
+        df.to_csv(local, sep=";", header=False, index=False)
+        fs = fsspec.filesystem("s3")
+        fs.put(local, f"{bucket}/{filename}")
+
+
+if __name__ == "__main__":
+
+    # Start a Coiled cluster and connect a client to it; both are used as
+    # context managers so they are closed when the block exits.
+    with coiled.Cluster(
+        n_workers=500,
+        worker_cpu=1,
+        arm=True,
+        region="us-east-1",
+        spot_policy="spot_with_fallback",
+    ) as cluster:
+        with cluster.get_client() as client:
+            # Generate partitioned dataset: one task per chunk index.
+            # Integer division gives the chunk count without going through a float.
+            results = client.map(
+                generate_chunk,
+                range(n // chunksize),
+                bucket=bucket,
+                chunksize=chunksize,
+                std=std,
+                lookup_df=lookup_df,
+            )
+            progress(results)
+            # progress() only displays status. Gather the results (each task
+            # returns None) so that an exception raised in any task is re-raised
+            # here instead of the script exiting as if every chunk was written.
+            client.gather(results)
diff --git a/lookup.csv b/lookup.csv
new file mode 100644
index 0000000..cbde9f6
--- /dev/null
+++ b/lookup.csv
@@ -0,0 +1,414 @@
+station,mean_temp
+Abha,18.0
+Abidjan,26.0
+Abéché,29.4
+Accra,26.4
+Addis Ababa,16.0
+Adelaide,17.3
+Aden,29.1
+Ahvaz,25.4
+Albuquerque,14.0
+Alexandra,11.0
+Alexandria,20.0
+Algiers,18.2
+Alice Springs,21.0
+Almaty,10.0
+Amsterdam,10.2
+Anadyr,-6.9
+Anchorage,2.8
+Andorra la Vella,9.8
+Ankara,12.0
+Antananarivo,17.9
+Antsiranana,25.2
+Arkhangelsk,1.3
+Ashgabat,17.1
+Asmara,15.6
+Assab,30.5
+Astana,3.5
+Athens,19.2
+Atlanta,17.0
+Auckland,15.2
+Austin,20.7
+Baghdad,22.77
+Baguio,19.5
+Baku,15.1
+Baltimore,13.1
+Bamako,27.8
+Bangkok,28.6
+Bangui,26.0
+Banjul,26.0
+Barcelona,18.2
+Bata,25.1
+Batumi,14.0
+Beijing,12.9
+Beirut,20.9
+Belgrade,12.5
+Belize City,26.7
+Benghazi,19.9
+Bergen,7.7
+Berlin,10.3
+Bilbao,14.7
+Birao,26.5
+Bishkek,11.3
+Bissau,27.0
+Blantyre,22.2
+Bloemfontein,15.6
+Boise,11.4
+Bordeaux,14.2
+Bosaso,30.0
+Boston,10.9
+Bouaké,26.0
+Bratislava,10.5
+Brazzaville,25.0
+Bridgetown,27.0
+Brisbane,21.4
+Brussels,10.5
+Bucharest,10.8
+Budapest,11.3
+Bujumbura,23.8
+Bulawayo,18.9
+Burnie,13.1
+Busan,15.0
+Cabo San Lucas,23.9
+Cairns,25.0
+Cairo,21.4
+Calgary,4.4
+Canberra,13.1
+Cape Town,16.2
+Changsha,17.4
+Charlotte,16.1
+Chiang Mai,25.8
+Chicago,9.8
+Chihuahua,18.6
+Chișinău,10.2
+Chittagong,25.9
+Chongqing,18.6
+Christchurch,12.2
+City of San Marino,11.8
+Colombo,27.4
+Columbus,11.7
+Conakry,26.4
+Copenhagen,9.1
+Cotonou,27.2
+Cracow,9.3
+Da Lat,17.9
+Da Nang,25.8
+Dakar,24.0
+Dallas,19.0
+Damascus,17.0
+Dampier,26.4
+Dar es Salaam,25.8
+Darwin,27.6
+Denpasar,23.7
+Denver,10.4
+Detroit,10.0
+Dhaka,25.9
+Dikson,-11.1
+Dili,26.6
+Djibouti,29.9
+Dodoma,22.7
+Dolisie,24.0
+Douala,26.7
+Dubai,26.9
+Dublin,9.8
+Dunedin,11.1
+Durban,20.6
+Dushanbe,14.7
+Edinburgh,9.3
+Edmonton,4.2
+El Paso,18.1
+Entebbe,21.0
+Erbil,19.5
+Erzurum,5.1
+Fairbanks,-2.3
+Fianarantsoa,17.9
+"Flores, Petén",26.4
+Frankfurt,10.6
+Fresno,17.9
+Fukuoka,17.0
+Gabès,19.5
+Gaborone,21.0
+Gagnoa,26.0
+Gangtok,15.2
+Garissa,29.3
+Garoua,28.3
+George Town,27.9
+Ghanzi,21.4
+Gjoa Haven,-14.4
+Guadalajara,20.9
+Guangzhou,22.4
+Guatemala City,20.4
+Halifax,7.5
+Hamburg,9.7
+Hamilton,13.8
+Hanga Roa,20.5
+Hanoi,23.6
+Harare,18.4
+Harbin,5.0
+Hargeisa,21.7
+Hat Yai,27.0
+Havana,25.2
+Helsinki,5.9
+Heraklion,18.9
+Hiroshima,16.3
+Ho Chi Minh City,27.4
+Hobart,12.7
+Hong Kong,23.3
+Honiara,26.5
+Honolulu,25.4
+Houston,20.8
+Ifrane,11.4
+Indianapolis,11.8
+Iqaluit,-9.3
+Irkutsk,1.0
+Istanbul,13.9
+İzmir,17.9
+Jacksonville,20.3
+Jakarta,26.7
+Jayapura,27.0
+Jerusalem,18.3
+Johannesburg,15.5
+Jos,22.8
+Juba,27.8
+Kabul,12.1
+Kampala,20.0
+Kandi,27.7
+Kankan,26.5
+Kano,26.4
+Kansas City,12.5
+Karachi,26.0
+Karonga,24.4
+Kathmandu,18.3
+Khartoum,29.9
+Kingston,27.4
+Kinshasa,25.3
+Kolkata,26.7
+Kuala Lumpur,27.3
+Kumasi,26.0
+Kunming,15.7
+Kuopio,3.4
+Kuwait City,25.7
+Kyiv,8.4
+Kyoto,15.8
+La Ceiba,26.2
+La Paz,23.7
+Lagos,26.8
+Lahore,24.3
+Lake Havasu City,23.7
+Lake Tekapo,8.7
+Las Palmas de Gran Canaria,21.2
+Las Vegas,20.3
+Launceston,13.1
+Lhasa,7.6
+Libreville,25.9
+Lisbon,17.5
+Livingstone,21.8
+Ljubljana,10.9
+Lodwar,29.3
+Lomé,26.9
+London,11.3
+Los Angeles,18.6
+Louisville,13.9
+Luanda,25.8
+Lubumbashi,20.8
+Lusaka,19.9
+Luxembourg City,9.3
+Lviv,7.8
+Lyon,12.5
+Madrid,15.0
+Mahajanga,26.3
+Makassar,26.7
+Makurdi,26.0
+Malabo,26.3
+Malé,28.0
+Managua,27.3
+Manama,26.5
+Mandalay,28.0
+Mango,28.1
+Manila,28.4
+Maputo,22.8
+Marrakesh,19.6
+Marseille,15.8
+Maun,22.4
+Medan,26.5
+Mek'ele,22.7
+Melbourne,15.1
+Memphis,17.2
+Mexicali,23.1
+Mexico City,17.5
+Miami,24.9
+Milan,13.0
+Milwaukee,8.9
+Minneapolis,7.8
+Minsk,6.7
+Mogadishu,27.1
+Mombasa,26.3
+Monaco,16.4
+Moncton,6.1
+Monterrey,22.3
+Montreal,6.8
+Moscow,5.8
+Mumbai,27.1
+Murmansk,0.6
+Muscat,28.0
+Mzuzu,17.7
+N'Djamena,28.3
+Naha,23.1
+Nairobi,17.8
+Nakhon Ratchasima,27.3
+Napier,14.6
+Napoli,15.9
+Nashville,15.4
+Nassau,24.6
+Ndola,20.3
+New Delhi,25.0
+New Orleans,20.7
+New York City,12.9
+Ngaoundéré,22.0
+Niamey,29.3
+Nicosia,19.7
+Niigata,13.9
+Nouadhibou,21.3
+Nouakchott,25.7
+Novosibirsk,1.7
+Nuuk,-1.4
+Odesa,10.7
+Odienné,26.0
+Oklahoma City,15.9
+Omaha,10.6
+Oranjestad,28.1
+Oslo,5.7
+Ottawa,6.6
+Ouagadougou,28.3
+Ouahigouya,28.6
+Ouarzazate,18.9
+Oulu,2.7
+Palembang,27.3
+Palermo,18.5
+Palm Springs,24.5
+Palmerston North,13.2
+Panama City,28.0
+Parakou,26.8
+Paris,12.3
+Perth,18.7
+Petropavlovsk-Kamchatsky,1.9
+Philadelphia,13.2
+Phnom Penh,28.3
+Phoenix,23.9
+Pittsburgh,10.8
+Podgorica,15.3
+Pointe-Noire,26.1
+Pontianak,27.7
+Port Moresby,26.9
+Port Sudan,28.4
+Port Vila,24.3
+Port-Gentil,26.0
+Portland (OR),12.4
+Porto,15.7
+Prague,8.4
+Praia,24.4
+Pretoria,18.2
+Pyongyang,10.8
+Rabat,17.2
+Rangpur,24.4
+Reggane,28.3
+Reykjavík,4.3
+Riga,6.2
+Riyadh,26.0
+Rome,15.2
+Roseau,26.2
+Rostov-on-Don,9.9
+Sacramento,16.3
+Saint Petersburg,5.8
+Saint-Pierre,5.7
+Salt Lake City,11.6
+San Antonio,20.8
+San Diego,17.8
+San Francisco,14.6
+San Jose,16.4
+San José,22.6
+San Juan,27.2
+San Salvador,23.1
+Sana'a,20.0
+Santo Domingo,25.9
+Sapporo,8.9
+Sarajevo,10.1
+Saskatoon,3.3
+Seattle,11.3
+Ségou,28.0
+Seoul,12.5
+Seville,19.2
+Shanghai,16.7
+Singapore,27.0
+Skopje,12.4
+Sochi,14.2
+Sofia,10.6
+Sokoto,28.0
+Split,16.1
+St. John's,5.0
+St. Louis,13.9
+Stockholm,6.6
+Surabaya,27.1
+Suva,25.6
+Suwałki,7.2
+Sydney,17.7
+Tabora,23.0
+Tabriz,12.6
+Taipei,23.0
+Tallinn,6.4
+Tamale,27.9
+Tamanrasset,21.7
+Tampa,22.9
+Tashkent,14.8
+Tauranga,14.8
+Tbilisi,12.9
+Tegucigalpa,21.7
+Tehran,17.0
+Tel Aviv,20.0
+Thessaloniki,16.0
+Thiès,24.0
+Tijuana,17.8
+Timbuktu,28.0
+Tirana,15.2
+Toamasina,23.4
+Tokyo,15.4
+Toliara,24.1
+Toluca,12.4
+Toronto,9.4
+Tripoli,20.0
+Tromsø,2.9
+Tucson,20.9
+Tunis,18.4
+Ulaanbaatar,-0.4
+Upington,20.4
+Ürümqi,7.4
+Vaduz,10.1
+Valencia,18.3
+Valletta,18.8
+Vancouver,10.4
+Veracruz,25.4
+Vienna,10.4
+Vientiane,25.9
+Villahermosa,27.1
+Vilnius,6.0
+Virginia Beach,15.8
+Vladivostok,4.9
+Warsaw,8.5
+"Washington, D.C.",14.6
+Wau,27.8
+Wellington,12.9
+Whitehorse,-0.1
+Wichita,13.9
+Willemstad,28.0
+Winnipeg,3.0
+Wrocław,9.6
+Xi'an,14.1
+Yakutsk,-8.8
+Yangon,27.5
+Yaoundé,23.8
+Yellowknife,-4.3
+Yerevan,12.4
+Yinchuan,9.0
+Zagreb,10.7
+Zanzibar City,26.0
+Zürich,9.3