From 970dc9727731c8360463bc7f00b4e388ea03b2f1 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Mon, 29 Jan 2024 16:52:38 -0600 Subject: Initial commit --- environment.yml | 8 ++ generate_data.py | 65 +++++++++ lookup.csv | 414 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 487 insertions(+) create mode 100644 environment.yml create mode 100644 generate_data.py create mode 100644 lookup.csv diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..dbb60dc --- /dev/null +++ b/environment.yml @@ -0,0 +1,8 @@ +name: 1trc +channels: + - conda-forge +dependencies: + - python=3.12 + - dask + - coiled + - s3fs \ No newline at end of file diff --git a/generate_data.py b/generate_data.py new file mode 100644 index 0000000..eca7e2a --- /dev/null +++ b/generate_data.py @@ -0,0 +1,65 @@ +import os +import tempfile +import coiled +import fsspec +import numpy as np +import pandas as pd +from dask.distributed import progress + +n = 1_000_000_000_000 # Total number of rows of data to generate +chunksize = 10_000_000 # Number of rows of data per file +std = 10.0 # Assume normally distributed temperatures with a standard deviation of 10 +lookup_df = pd.read_csv("lookup.csv") # Lookup table of stations and their mean temperatures +bucket = "s3://oss-scratch-space/1trc" + + +def generate_chunk(partition_idx, bucket, chunksize, std, lookup_df): + """Generate some sample data based on the lookup table.""" + + rng = np.random.default_rng(partition_idx) # Determinisitic data generation + df = pd.DataFrame( + { + # Choose a random station from the lookup table for each row in our output + "station": rng.integers(0, len(lookup_df) - 1, int(chunksize)), + # Generate a normal distibution around zero for each row in our output + # Because the std is the same for every station we can adjust the mean for each row afterwards + "measure": rng.normal(0, std, int(chunksize)), + } + ) + + # Offset each measurement by the station's mean value + df.measure += df.station.map(lookup_df.mean_temp) + # Round the temprature to one decimal place + df.measure = df.measure.round(decimals=1) + # Convert the station index to the station name + df.station = df.station.map(lookup_df.station) + + # Save this chunk to the output file + filename = f"measurements-{partition_idx}.txt" + with tempfile.TemporaryDirectory() as tmpdir: + local = os.path.join(tmpdir, filename) + df.to_csv(local, sep=";", header=False, index=False) + fs = fsspec.filesystem("s3") + fs.put(local, f"{bucket}/{filename}") + + +if __name__ == "__main__": + + with coiled.Cluster( + n_workers=500, + worker_cpu=1, + arm=True, + region="us-east-1", + spot_policy="spot_with_fallback", + ) as cluster: + with cluster.get_client() as client: + # Generate partitioned dataset + results = client.map( + generate_chunk, + range(int(n / chunksize)), + bucket=bucket, + chunksize=chunksize, + std=std, + lookup_df=lookup_df, + ) + progress(results) diff --git a/lookup.csv b/lookup.csv new file mode 100644 index 0000000..cbde9f6 --- /dev/null +++ b/lookup.csv @@ -0,0 +1,414 @@ +station,mean_temp +Abha,18.0 +Abidjan,26.0 +Abéché,29.4 +Accra,26.4 +Addis Ababa,16.0 +Adelaide,17.3 +Aden,29.1 +Ahvaz,25.4 +Albuquerque,14.0 +Alexandra,11.0 +Alexandria,20.0 +Algiers,18.2 +Alice Springs,21.0 +Almaty,10.0 +Amsterdam,10.2 +Anadyr,-6.9 +Anchorage,2.8 +Andorra la Vella,9.8 +Ankara,12.0 +Antananarivo,17.9 +Antsiranana,25.2 +Arkhangelsk,1.3 +Ashgabat,17.1 +Asmara,15.6 +Assab,30.5 +Astana,3.5 +Athens,19.2 +Atlanta,17.0 +Auckland,15.2 +Austin,20.7 +Baghdad,22.77 +Baguio,19.5 +Baku,15.1 +Baltimore,13.1 +Bamako,27.8 +Bangkok,28.6 +Bangui,26.0 +Banjul,26.0 +Barcelona,18.2 +Bata,25.1 +Batumi,14.0 +Beijing,12.9 +Beirut,20.9 +Belgrade,12.5 +Belize City,26.7 +Benghazi,19.9 +Bergen,7.7 +Berlin,10.3 +Bilbao,14.7 +Birao,26.5 +Bishkek,11.3 +Bissau,27.0 +Blantyre,22.2 +Bloemfontein,15.6 +Boise,11.4 +Bordeaux,14.2 +Bosaso,30.0 +Boston,10.9 +Bouaké,26.0 +Bratislava,10.5 +Brazzaville,25.0 +Bridgetown,27.0 +Brisbane,21.4 +Brussels,10.5 +Bucharest,10.8 +Budapest,11.3 +Bujumbura,23.8 +Bulawayo,18.9 +Burnie,13.1 +Busan,15.0 +Cabo San Lucas,23.9 +Cairns,25.0 +Cairo,21.4 +Calgary,4.4 +Canberra,13.1 +Cape Town,16.2 +Changsha,17.4 +Charlotte,16.1 +Chiang Mai,25.8 +Chicago,9.8 +Chihuahua,18.6 +Chișinău,10.2 +Chittagong,25.9 +Chongqing,18.6 +Christchurch,12.2 +City of San Marino,11.8 +Colombo,27.4 +Columbus,11.7 +Conakry,26.4 +Copenhagen,9.1 +Cotonou,27.2 +Cracow,9.3 +Da Lat,17.9 +Da Nang,25.8 +Dakar,24.0 +Dallas,19.0 +Damascus,17.0 +Dampier,26.4 +Dar es Salaam,25.8 +Darwin,27.6 +Denpasar,23.7 +Denver,10.4 +Detroit,10.0 +Dhaka,25.9 +Dikson,-11.1 +Dili,26.6 +Djibouti,29.9 +Dodoma,22.7 +Dolisie,24.0 +Douala,26.7 +Dubai,26.9 +Dublin,9.8 +Dunedin,11.1 +Durban,20.6 +Dushanbe,14.7 +Edinburgh,9.3 +Edmonton,4.2 +El Paso,18.1 +Entebbe,21.0 +Erbil,19.5 +Erzurum,5.1 +Fairbanks,-2.3 +Fianarantsoa,17.9 +"Flores, Petén",26.4 +Frankfurt,10.6 +Fresno,17.9 +Fukuoka,17.0 +Gabès,19.5 +Gaborone,21.0 +Gagnoa,26.0 +Gangtok,15.2 +Garissa,29.3 +Garoua,28.3 +George Town,27.9 +Ghanzi,21.4 +Gjoa Haven,-14.4 +Guadalajara,20.9 +Guangzhou,22.4 +Guatemala City,20.4 +Halifax,7.5 +Hamburg,9.7 +Hamilton,13.8 +Hanga Roa,20.5 +Hanoi,23.6 +Harare,18.4 +Harbin,5.0 +Hargeisa,21.7 +Hat Yai,27.0 +Havana,25.2 +Helsinki,5.9 +Heraklion,18.9 +Hiroshima,16.3 +Ho Chi Minh City,27.4 +Hobart,12.7 +Hong Kong,23.3 +Honiara,26.5 +Honolulu,25.4 +Houston,20.8 +Ifrane,11.4 +Indianapolis,11.8 +Iqaluit,-9.3 +Irkutsk,1.0 +Istanbul,13.9 +İzmir,17.9 +Jacksonville,20.3 +Jakarta,26.7 +Jayapura,27.0 +Jerusalem,18.3 +Johannesburg,15.5 +Jos,22.8 +Juba,27.8 +Kabul,12.1 +Kampala,20.0 +Kandi,27.7 +Kankan,26.5 +Kano,26.4 +Kansas City,12.5 +Karachi,26.0 +Karonga,24.4 +Kathmandu,18.3 +Khartoum,29.9 +Kingston,27.4 +Kinshasa,25.3 +Kolkata,26.7 +Kuala Lumpur,27.3 +Kumasi,26.0 +Kunming,15.7 +Kuopio,3.4 +Kuwait City,25.7 +Kyiv,8.4 +Kyoto,15.8 +La Ceiba,26.2 +La Paz,23.7 +Lagos,26.8 +Lahore,24.3 +Lake Havasu City,23.7 +Lake Tekapo,8.7 +Las Palmas de Gran Canaria,21.2 +Las Vegas,20.3 +Launceston,13.1 +Lhasa,7.6 +Libreville,25.9 +Lisbon,17.5 +Livingstone,21.8 +Ljubljana,10.9 +Lodwar,29.3 +Lomé,26.9 +London,11.3 +Los Angeles,18.6 +Louisville,13.9 +Luanda,25.8 +Lubumbashi,20.8 +Lusaka,19.9 +Luxembourg City,9.3 +Lviv,7.8 +Lyon,12.5 +Madrid,15.0 +Mahajanga,26.3 +Makassar,26.7 +Makurdi,26.0 +Malabo,26.3 +Malé,28.0 +Managua,27.3 +Manama,26.5 +Mandalay,28.0 +Mango,28.1 +Manila,28.4 +Maputo,22.8 +Marrakesh,19.6 +Marseille,15.8 +Maun,22.4 +Medan,26.5 +Mek'ele,22.7 +Melbourne,15.1 +Memphis,17.2 +Mexicali,23.1 +Mexico City,17.5 +Miami,24.9 +Milan,13.0 +Milwaukee,8.9 +Minneapolis,7.8 +Minsk,6.7 +Mogadishu,27.1 +Mombasa,26.3 +Monaco,16.4 +Moncton,6.1 +Monterrey,22.3 +Montreal,6.8 +Moscow,5.8 +Mumbai,27.1 +Murmansk,0.6 +Muscat,28.0 +Mzuzu,17.7 +N'Djamena,28.3 +Naha,23.1 +Nairobi,17.8 +Nakhon Ratchasima,27.3 +Napier,14.6 +Napoli,15.9 +Nashville,15.4 +Nassau,24.6 +Ndola,20.3 +New Delhi,25.0 +New Orleans,20.7 +New York City,12.9 +Ngaoundéré,22.0 +Niamey,29.3 +Nicosia,19.7 +Niigata,13.9 +Nouadhibou,21.3 +Nouakchott,25.7 +Novosibirsk,1.7 +Nuuk,-1.4 +Odesa,10.7 +Odienné,26.0 +Oklahoma City,15.9 +Omaha,10.6 +Oranjestad,28.1 +Oslo,5.7 +Ottawa,6.6 +Ouagadougou,28.3 +Ouahigouya,28.6 +Ouarzazate,18.9 +Oulu,2.7 +Palembang,27.3 +Palermo,18.5 +Palm Springs,24.5 +Palmerston North,13.2 +Panama City,28.0 +Parakou,26.8 +Paris,12.3 +Perth,18.7 +Petropavlovsk-Kamchatsky,1.9 +Philadelphia,13.2 +Phnom Penh,28.3 +Phoenix,23.9 +Pittsburgh,10.8 +Podgorica,15.3 +Pointe-Noire,26.1 +Pontianak,27.7 +Port Moresby,26.9 +Port Sudan,28.4 +Port Vila,24.3 +Port-Gentil,26.0 +Portland (OR),12.4 +Porto,15.7 +Prague,8.4 +Praia,24.4 +Pretoria,18.2 +Pyongyang,10.8 +Rabat,17.2 +Rangpur,24.4 +Reggane,28.3 +Reykjavík,4.3 +Riga,6.2 +Riyadh,26.0 +Rome,15.2 +Roseau,26.2 +Rostov-on-Don,9.9 +Sacramento,16.3 +Saint Petersburg,5.8 +Saint-Pierre,5.7 +Salt Lake City,11.6 +San Antonio,20.8 +San Diego,17.8 +San Francisco,14.6 +San Jose,16.4 +San José,22.6 +San Juan,27.2 +San Salvador,23.1 +Sana'a,20.0 +Santo Domingo,25.9 +Sapporo,8.9 +Sarajevo,10.1 +Saskatoon,3.3 +Seattle,11.3 +Ségou,28.0 +Seoul,12.5 +Seville,19.2 +Shanghai,16.7 +Singapore,27.0 +Skopje,12.4 +Sochi,14.2 +Sofia,10.6 +Sokoto,28.0 +Split,16.1 +St. John's,5.0 +St. Louis,13.9 +Stockholm,6.6 +Surabaya,27.1 +Suva,25.6 +Suwałki,7.2 +Sydney,17.7 +Tabora,23.0 +Tabriz,12.6 +Taipei,23.0 +Tallinn,6.4 +Tamale,27.9 +Tamanrasset,21.7 +Tampa,22.9 +Tashkent,14.8 +Tauranga,14.8 +Tbilisi,12.9 +Tegucigalpa,21.7 +Tehran,17.0 +Tel Aviv,20.0 +Thessaloniki,16.0 +Thiès,24.0 +Tijuana,17.8 +Timbuktu,28.0 +Tirana,15.2 +Toamasina,23.4 +Tokyo,15.4 +Toliara,24.1 +Toluca,12.4 +Toronto,9.4 +Tripoli,20.0 +Tromsø,2.9 +Tucson,20.9 +Tunis,18.4 +Ulaanbaatar,-0.4 +Upington,20.4 +Ürümqi,7.4 +Vaduz,10.1 +Valencia,18.3 +Valletta,18.8 +Vancouver,10.4 +Veracruz,25.4 +Vienna,10.4 +Vientiane,25.9 +Villahermosa,27.1 +Vilnius,6.0 +Virginia Beach,15.8 +Vladivostok,4.9 +Warsaw,8.5 +"Washington, D.C.",14.6 +Wau,27.8 +Wellington,12.9 +Whitehorse,-0.1 +Wichita,13.9 +Willemstad,28.0 +Winnipeg,3.0 +Wrocław,9.6 +Xi'an,14.1 +Yakutsk,-8.8 +Yangon,27.5 +Yaoundé,23.8 +Yellowknife,-4.3 +Yerevan,12.4 +Yinchuan,9.0 +Zagreb,10.7 +Zanzibar City,26.0 +Zürich,9.3 -- cgit v1.2.3-70-g09d2