String building benchmarks code
Below is the full code that can be used to run the three micro-benchmarks described in my post about string concatenation:
# -*- coding: utf-8 -*-
import csv
import io
import itertools
import functools
import math
import sys
import timeit
# Pick the in-memory buffer class by interpreter major version: io.BytesIO
# before Python 3, io.StringIO from Python 3 on.
StringIO = io.BytesIO if sys.version_info < (3,) else io.StringIO
def string_join(n_components):
    """Concatenate the decimal forms of 0 .. n_components - 1 using join.

    Each piece is appended to a list first; the list is joined into the
    resulting string in a single call at the end.
    """
    # The explicit append loop is the strategy being measured, so it is
    # deliberately not collapsed into a comprehension.
    pieces = []
    for index in range(n_components):
        pieces.append("%d" % index)
    return "".join(pieces)
def string_buffer(n_components):
    """Concatenate the decimal forms of 0 .. n_components - 1 via a StringIO.

    Each piece is written to an in-memory stream; the stream is then rewound
    and its entire content is read back as the result.
    """
    stream = StringIO()
    for index in range(n_components):
        stream.write("%d" % index)
    # Rewind first: read() only returns what follows the current position.
    stream.seek(0)
    return stream.read()
def string_concat(n_components):
    """Concatenate the decimal forms of 0 .. n_components - 1 using ``+=``.

    The result starts out empty and grows by one piece per iteration.
    """
    result = ""
    for index in range(n_components):
        result += "%d" % index
    return result
if sys.version_info < (3, 4):
    # The ``statistics`` module is only imported on 3.4+, so older
    # interpreters get simplified stand-ins for the two functions used here.
    def mean(values):
        """Return the arithmetic mean of ``values`` as a float.

        Raises ValueError when ``values`` is empty.
        """
        if not values:
            raise ValueError("mean requires at least one data point")
        # float() keeps Python 2 from floor-dividing when all values are ints.
        return sum(values) / float(len(values))

    def stdev(values, xbar=None):
        """Return the sample standard deviation (N - 1 denominator).

        ``xbar`` is the already-computed mean of ``values``; it is computed
        here when omitted.  Raises ValueError for fewer than two values,
        since the N - 1 denominator would be zero.
        """
        count = len(values)
        if count < 2:
            raise ValueError("stdev requires at least two data points")
        if xbar is None:
            xbar = mean(values)
        squared_deviations = sum((x - xbar) ** 2 for x in values)
        # float() again guards against Python 2 integer division.
        variance = squared_deviations / float(count - 1)
        return math.sqrt(variance)
else:
    from statistics import mean, stdev
def _format_truncated(value, truncation):
    """Round ``value`` to ``truncation`` decimal places and render it as text.

    A negative ``truncation`` rounds to the left of the decimal point (tens,
    hundreds, ...); the rendered text then shows no fractional digits.
    """
    rounded = round(value, truncation)
    # The format spec needs a non-negative digit count, so clamp at zero.
    decimals = truncation if truncation > 0 else 0
    # e.g. "0.1f" -> "13.1", "0.0f" -> "13"
    return format(rounded, "0.%df" % decimals)
def _format_mean_std(mean_val, std_val):
    """Write a string with the appropriate number of significant figures.

    This will round the mean so that its lowest significant digit is the
    highest significant digit of the standard deviation, e.g.

    >>> _format_mean_std(1.24, 0.3)
    '1.2 (± 0.3)'
    >>> _format_mean_std(5.7883, 11.9344)
    '6 (± 10)'
    >>> _format_mean_std(5.7883, 0.02455)
    '5.79 (± 0.02)'

    The standard deviation is rounded at its most significant digit.  A
    standard deviation of exactly zero has no magnitude to round to, so the
    mean is then written unrounded, followed by "(± 0)".
    """
    if std_val == 0:
        # log10(0) is undefined; without a spread there is nothing to round to.
        return "%s (± 0)" % (mean_val,)

    # Assume we have 1.24 ± 0.3, we want to round to 1.2, since any sig figs
    # an order of magnitude smaller than the standard deviation are suspect.
    # math.log10 is used rather than math.log(x, 10) because the latter is
    # inexact at powers of ten (log(1000, 10) comes out just below 3), which
    # would make the floor below one too small.
    std_magnitude = int(math.floor(math.log10(std_val)))

    # Truncate the standard deviation to 1 sig fig, and the mean to match.
    std_truncation = -std_magnitude
    mean_truncation = -std_magnitude

    # Don't truncate too far - if the stdev is >= the mean, keep the leading
    # digit of the mean.  A zero mean has no leading digit, so skip it.
    if mean_val != 0:
        mean_round_mag = -math.log10(abs(mean_val))
        if mean_round_mag > mean_truncation:
            # int() because math.ceil returns a float on Python 2.
            mean_truncation = int(math.ceil(mean_round_mag))

    mean_str = _format_truncated(mean_val, mean_truncation)
    std_str = _format_truncated(std_val, std_truncation)
    return "%s (± %s)" % (mean_str, std_str)
def _time_with_std(timer, number, k=5):
    """Measure a timer repeatedly and summarise it in microseconds.

    ``timer.timeit`` is called ``k`` times, each call running the snippet
    ``number`` times.  Every total is scaled to microseconds per single
    execution of the snippet.

    Returns a ``(mean, standard deviation)`` tuple of those per-execution
    times.
    """
    per_run_us = [
        # total seconds for ``number`` runs -> microseconds per run
        timer.timeit(number=number) * (1e6 / number)
        for _ in range(k)
    ]
    average = mean(per_run_us)
    spread = stdev(per_run_us, xbar=average)
    return average, spread
def main(csv_mode):
    """Benchmark the three string builders and print the results.

    For each component count, every builder is timed with 11 repeats and
    reported as "mean (± std)" in microseconds.  When ``csv_mode`` is true
    the rows are written to stdout as CSV (without a header row); otherwise
    an aligned table with a header is printed.
    """
    setup = "from __main__ import string_join, string_concat, string_buffer"
    # Column order of the output: join, buffer, concat.
    statements = ("string_join(%d)", "string_buffer(%d)", "string_concat(%d)")

    rows = []
    for n in [10, 100, 1000, 10000]:
        # Scale repetitions down as the input grows, so each measurement
        # builds about 100000 components in total.
        number = max((100000 // n, 1))
        row = [n]
        for statement in statements:
            timer = timeit.Timer(statement % n, setup=setup)
            mean_us, std_us = _time_with_std(timer, number=number, k=11)
            row.append(_format_mean_std(mean_us, std_us))
        rows.append(tuple(row))

    if csv_mode:
        csv.writer(sys.stdout).writerows(rows)
        return

    header = "{:^20} | {:^20} | {:^20} | {:^20}".format(
        "# components", "join_time (μs)", "buffer_time (μs)", "concat_time (μs)"
    )
    print(header)
    print("-" * len(header))
    row_format = "{:<20} | {:^20} | {:^20} | {:^20}"
    for row in rows:
        print(row_format.format(*row))
if __name__ == "__main__":
    # A lone "--csv" argument selects CSV output; anything else prints the
    # table.  Not worth a real argparse for a single flag.
    csv_mode = sys.argv[1:] == ["--csv"]
    main(csv_mode)
You can download the code here.