summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Daniel Patrick <github@danielpatrick.dev> 2024-03-03 08:41:56 +0000
committer: Gunnar Morling <gunnar.morling@googlemail.com> 2024-03-03 13:10:15 +0100
commit: 6daa93cca188eef12454c6061a25b1906cab57e7 (patch)
tree: a14a15adca179d4a5f48ac593b65f28b6b1678db
parent: c92346790e8548f52e81254227efc935356e5e53 (diff)
More accurate file size estimate
-rwxr-xr-x src/main/python/create_measurements.py 28
1 files changed, 12 insertions, 16 deletions
diff --git a/src/main/python/create_measurements.py b/src/main/python/create_measurements.py
index 26ec768..52e9fc1 100755
--- a/src/main/python/create_measurements.py
+++ b/src/main/python/create_measurements.py
@@ -84,22 +84,18 @@ def estimate_file_size(weather_station_names, num_rows_to_create):
"""
Tries to estimate how large a file the test data will be
"""
- max_string = float('-inf')
- min_string = float('inf')
- per_record_size = 0
- record_size_unit = "bytes"
-
- for station in weather_station_names:
- if len(station) > max_string:
- max_string = len(station)
- if len(station) < min_string:
- min_string = len(station)
- per_record_size = ((max_string + min_string * 2) + len(",-123.4")) / 2
-
- total_file_size = num_rows_to_create * per_record_size
- human_file_size = convert_bytes(total_file_size)
-
- return f"Estimated max file size is: {human_file_size}.\nTrue size is probably much smaller (around half)."
+ total_name_bytes = sum(len(s.encode("utf-8")) for s in weather_station_names)
+ avg_name_bytes = total_name_bytes / float(len(weather_station_names))
+
+ # avg_temp_bytes = sum(len(str(n / 10)) for n in range(-999, 1000)) / 1999
+ avg_temp_bytes = 4.400200100050025
+
+ # add 2 for separator and newline
+ avg_line_length = avg_name_bytes + avg_temp_bytes + 2
+
+ human_file_size = convert_bytes(num_rows_to_create * avg_line_length)
+
+ return f"Estimated max file size is: {human_file_size}."
def build_test_data(weather_station_names, num_rows_to_create):