diff --git a/dev/devicelab/README.md b/dev/devicelab/README.md
index 6310f3fed4..8cc0021907 100644
--- a/dev/devicelab/README.md
+++ b/dev/devicelab/README.md
@@ -162,6 +162,47 @@ flags to `bin/run.dart`:
 
 An example of a local engine architecture is `android_debug_unopt_x86`.
 
+## Running an A/B test for engine changes
+
+You can run an A/B test that compares the performance of the default engine
+against a local engine build. The test runs the same benchmark a specified
+number of times against both engines, then outputs a tab-separated spreadsheet
+with the results. The results can be copied to a Google Spreadsheet for further
+inspection.
+
+Example:
+
+```sh
+../../bin/cache/dart-sdk/bin/dart bin/run.dart --ab=10 \
+  --local-engine=host_debug_unopt \
+  -t bin/tasks/web_benchmarks_canvaskit.dart
+```
+
+The `--ab=10` option tells the runner to run the A/B test 10 times.
+
+`--local-engine=host_debug_unopt` tells the A/B test to use the `host_debug_unopt`
+engine build. `--local-engine` is required for an A/B test.
+
+An A/B test runs exactly one task. Multiple tasks are not supported.
+
+Example output:
+
+```
+Score                                                           Average A (noise)  Average B (noise)  Speed-up
+bench_card_infinite_scroll.canvaskit.drawFrameDuration.average  2900.20 (8.44%)    2426.70 (8.94%)    1.20x
+bench_card_infinite_scroll.canvaskit.totalUiFrame.average       4964.00 (6.29%)    4098.00 (8.03%)    1.21x
+draw_rect.canvaskit.windowRenderDuration.average                1959.45 (16.56%)   2286.65 (0.61%)    0.86x
+draw_rect.canvaskit.sceneBuildDuration.average                  1969.45 (16.37%)   2294.90 (0.58%)    0.86x
+draw_rect.canvaskit.drawFrameDuration.average                   5335.20 (17.59%)   6437.60 (0.59%)    0.83x
+draw_rect.canvaskit.totalUiFrame.average                        6832.00 (13.16%)   7932.00 (0.34%)    0.86x
+```
+
+The output contains averages and noise values for each score. More importantly,
+it contains the speed-up value, i.e. how much _faster_ the local engine is than
+the default engine. Values less than 1.0 indicate a slow-down. For example,
+0.5x means the local engine is twice as slow as the default engine, and 2.0x
+means it's twice as fast. Higher is better.
+
 # Reproducing broken builds locally
 
 To reproduce the breakage locally `git checkout` the corresponding Flutter
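For reviewers who want to sanity-check the numbers in the README's example table: the average is the arithmetic mean of the collected samples, the noise is the population standard deviation divided by the mean, and the speed-up is the A average divided by the B average. Below is a minimal, self-contained sketch of that arithmetic; the helper names and sample values are made up for illustration, while the real implementation lives in `ab.dart`, added later in this patch.

```dart
import 'dart:math' as math;

// Arithmetic mean of the collected samples.
double average(List<double> xs) =>
    xs.reduce((double a, double b) => a + b) / xs.length;

// Noise: population standard deviation divided by the mean,
// mirroring the definition used in ab.dart below.
double noise(List<double> xs) {
  final double mean = average(xs);
  final double sumOfSquaredDeltas = xs.fold<double>(
    0.0,
    (double sum, double x) => sum + math.pow(x - mean, 2),
  );
  return math.sqrt(sumOfSquaredDeltas / xs.length) / mean;
}

void main() {
  // Hypothetical drawFrameDuration samples (microseconds) from three runs
  // with the default engine (A) and the local engine (B).
  final List<double> samplesA = <double>[2900.2, 3100.8, 2750.5];
  final List<double> samplesB = <double>[2426.7, 2500.1, 2390.2];

  // Speed-up is A over B: values above 1.0 mean the local engine is faster.
  print('Speed-up: ${(average(samplesA) / average(samplesB)).toStringAsFixed(2)}x');
  print('Noise A:  ${(noise(samplesA) * 100).toStringAsFixed(2)}%');
  print('Noise B:  ${(noise(samplesB) * 100).toStringAsFixed(2)}%');
}
```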
diff --git a/dev/devicelab/bin/run.dart b/dev/devicelab/bin/run.dart
index a3f176229c..7a2daa6728 100644
--- a/dev/devicelab/bin/run.dart
+++ b/dev/devicelab/bin/run.dart
@@ -9,18 +9,34 @@ import 'dart:io';
 import 'package:args/args.dart';
 import 'package:path/path.dart' as path;
 
+import 'package:flutter_devicelab/framework/ab.dart';
 import 'package:flutter_devicelab/framework/manifest.dart';
 import 'package:flutter_devicelab/framework/runner.dart';
 import 'package:flutter_devicelab/framework/utils.dart';
 
+ArgResults args;
+
 List<String> _taskNames = <String>[];
 
+/// Suppresses standard output, prints only standard error output.
+bool silent;
+
+/// The build of the local engine to use.
+///
+/// Required for A/B test mode.
+String localEngine;
+
+/// The path to the engine "src/" directory.
+String localEngineSrcPath;
+
+/// Whether to exit on first test failure.
+bool exitOnFirstTestFailure;
+
 /// Runs tasks.
 ///
 /// The tasks are chosen depending on the command-line options
 /// (see [_argParser]).
 Future<void> main(List<String> rawArgs) async {
-  ArgResults args;
   try {
     args = _argParser.parse(rawArgs);
   } on FormatException catch (error) {
@@ -55,10 +71,19 @@ Future<void> main(List<String> rawArgs) async {
     return;
   }
 
-  final bool silent = args['silent'] as bool;
-  final String localEngine = args['local-engine'] as String;
-  final String localEngineSrcPath = args['local-engine-src-path'] as String;
+  silent = args['silent'] as bool;
+  localEngine = args['local-engine'] as String;
+  localEngineSrcPath = args['local-engine-src-path'] as String;
+  exitOnFirstTestFailure = args['exit'] as bool;
 
+  if (args.wasParsed('ab')) {
+    await _runABTest();
+  } else {
+    await _runTasks();
+  }
+}
+
+Future<void> _runTasks() async {
   for (final String taskName in _taskNames) {
     section('Running task "$taskName"');
     final Map<String, dynamic> result = await runTask(
@@ -74,13 +99,73 @@ Future<void> main(List<String> rawArgs) async {
 
     if (!(result['success'] as bool)) {
       exitCode = 1;
-      if (args['exit'] as bool) {
+      if (exitOnFirstTestFailure) {
         return;
       }
     }
   }
 }
 
+Future<void> _runABTest() async {
+  final int runsPerTest = int.parse(args['ab'] as String);
+
+  if (_taskNames.length > 1) {
+    stderr.writeln('When running in A/B test mode, exactly one task must be passed, but got ${_taskNames.join(', ')}.\n');
+    stderr.writeln(_argParser.usage);
+    exit(1);
+  }
+
+  if (!args.wasParsed('local-engine')) {
+    stderr.writeln('When running in A/B test mode, --local-engine is required.\n');
+    stderr.writeln(_argParser.usage);
+    exit(1);
+  }
+
+  final String taskName = _taskNames.single;
+
+  print('$taskName A/B test. Will run $runsPerTest times.');
+
+  final ABTest abTest = ABTest();
+  for (int i = 1; i <= runsPerTest; i++) {
+    section('Run #$i');
+
+    print('Running with the default engine (A)');
+    final Map<String, dynamic> defaultEngineResult = await runTask(
+      taskName,
+      silent: silent,
+    );
+
+    print('Default engine result:');
+    print(const JsonEncoder.withIndent('  ').convert(defaultEngineResult));
+
+    if (!(defaultEngineResult['success'] as bool)) {
+      stderr.writeln('Task failed on the default engine.');
+      exit(1);
+    }
+
+    abTest.addAResult(defaultEngineResult);
+
+    print('Running with the local engine (B)');
+    final Map<String, dynamic> localEngineResult = await runTask(
+      taskName,
+      silent: silent,
+      localEngine: localEngine,
+      localEngineSrcPath: localEngineSrcPath,
+    );
+
+    print('Local engine result:');
+    print(const JsonEncoder.withIndent('  ').convert(localEngineResult));
+
+    if (!(localEngineResult['success'] as bool)) {
+      stderr.writeln('Task failed on the local engine.');
+      exit(1);
+    }
+
+    abTest.addBResult(localEngineResult);
+  }
+  print(abTest.printSummary());
+}
+
 void addTasks({
   List<ManifestTask> tasks,
   ArgResults args,
@@ -132,6 +217,22 @@ final ArgParser _argParser = ArgParser()
       }
     },
   )
+  ..addOption(
+    'ab',
+    help: 'Runs an A/B test comparing the default engine with the local\n'
+          'engine build for one task. This option does not support running\n'
+          'multiple tasks. The value is the number of times to run the task.\n'
+          'The task is expected to be a benchmark that reports score keys.\n'
+          'The A/B test collects the metrics reported by the task and\n'
+          'produces a report containing averages, noise, and the speed-up\n'
+          'between the two engines. --local-engine is required when running\n'
+          'an A/B test.',
+    callback: (String value) {
+      if (value != null && int.tryParse(value) == null) {
+        throw ArgParserException('Option --ab must be a number, but was "$value".');
+      }
+    },
+  )
   ..addFlag(
     'all',
     abbr: 'a',
@@ -152,7 +253,8 @@ final ArgParser _argParser = ArgParser()
     help: 'Name of a build output within the engine out directory, if you\n'
          'are building Flutter locally. Use this to select a specific\n'
          'version of the engine if you have built multiple engine targets.\n'
-         'This path is relative to --local-engine-src-path/out.',
+         'This path is relative to --local-engine-src-path/out. This option\n'
+         'is required when running an A/B test (see the --ab option).',
  )
  ..addFlag(
    'list',
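The `ABTest` class that `_runABTest` drives is defined in the new `ab.dart` file below. As a rough usage sketch (not part of the patch), a standalone harness could feed it results by hand. Here `fakeTaskResult` is a hypothetical helper that builds a result map in the shape `_addResult` expects: a `data` map plus a `benchmarkScoreKeys` list naming which entries to summarize.

```dart
import 'package:flutter_devicelab/framework/ab.dart';

// Hypothetical: builds a task result in the shape ab.dart's _addResult
// reads, with a single score key pointing into the 'data' map.
Map<String, dynamic> fakeTaskResult(double score) => <String, dynamic>{
  'success': true,
  'data': <String, dynamic>{'totalUiFrame.average': score},
  'benchmarkScoreKeys': <String>['totalUiFrame.average'],
};

void main() {
  final ABTest abTest = ABTest();
  for (int i = 0; i < 3; i++) {
    abTest.addAResult(fakeTaskResult(5000.0 + i)); // default engine run
    abTest.addBResult(fakeTaskResult(4000.0 + i)); // local engine run
  }
  // Prints the tab-separated table shown in the README above.
  print(abTest.printSummary());
}
```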
diff --git a/dev/devicelab/lib/framework/ab.dart b/dev/devicelab/lib/framework/ab.dart
new file mode 100644
index 0000000000..ad1de24753
--- /dev/null
+++ b/dev/devicelab/lib/framework/ab.dart
@@ -0,0 +1,136 @@
+// Copyright 2014 The Flutter Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+import 'dart:math' as math;
+import 'package:meta/meta.dart';
+
+/// Collects data from an A/B test and produces a summary for human evaluation.
+///
+/// See [printSummary] for more.
+class ABTest {
+  final Map<String, List<double>> _aResults = <String, List<double>>{};
+  final Map<String, List<double>> _bResults = <String, List<double>>{};
+
+  /// Adds the result of a single A run of the benchmark.
+  ///
+  /// The result may contain multiple score keys.
+  ///
+  /// [result] is expected to be a serialization of [TaskResult].
+  void addAResult(Map<String, dynamic> result) {
+    _addResult(result, _aResults);
+  }
+
+  /// Adds the result of a single B run of the benchmark.
+  ///
+  /// The result may contain multiple score keys.
+  ///
+  /// [result] is expected to be a serialization of [TaskResult].
+  void addBResult(Map<String, dynamic> result) {
+    _addResult(result, _bResults);
+  }
+
+  /// Returns the summary as a tab-separated spreadsheet.
+  ///
+  /// This value can be copied straight to a Google Spreadsheet for further analysis.
+  String printSummary() {
+    final Map<String, _ScoreSummary> summariesA = _summarize(_aResults);
+    final Map<String, _ScoreSummary> summariesB = _summarize(_bResults);
+    final Set<String> scoreKeyUnion = <String>{
+      ...summariesA.keys,
+      ...summariesB.keys,
+    };
+
+    final StringBuffer buffer = StringBuffer(
+      'Score\tAverage A (noise)\tAverage B (noise)\tSpeed-up\n',
+    );
+
+    for (final String scoreKey in scoreKeyUnion) {
+      final _ScoreSummary summaryA = summariesA[scoreKey];
+      final _ScoreSummary summaryB = summariesB[scoreKey];
+      buffer.write('$scoreKey\t');
+
+      if (summaryA != null) {
+        buffer.write('${summaryA.average.toStringAsFixed(2)} (${_ratioToPercent(summaryA.noise)})\t');
+      } else {
+        buffer.write('\t');
+      }
+
+      if (summaryB != null) {
+        buffer.write('${summaryB.average.toStringAsFixed(2)} (${_ratioToPercent(summaryB.noise)})\t');
+      } else {
+        buffer.write('\t');
+      }
+
+      if (summaryA != null && summaryB != null) {
+        buffer.write('${(summaryA.average / summaryB.average).toStringAsFixed(2)}x\t');
+      }
+
+      buffer.writeln();
+    }
+
+    return buffer.toString();
+  }
+}
+
+class _ScoreSummary {
+  _ScoreSummary({
+    @required this.average,
+    @required this.noise,
+  });
+
+  /// Average (arithmetic mean) of a series of values collected by a benchmark.
+  final double average;
+
+  /// The noise (standard deviation divided by [average]) in the collected
+  /// values.
+  final double noise;
+}
+
+void _addResult(Map<String, dynamic> result, Map<String, List<double>> results) {
+  final List<String> scoreKeys = (result['benchmarkScoreKeys'] as List<dynamic>).cast<String>();
+  final Map<String, dynamic> data = result['data'] as Map<String, dynamic>;
+  for (final String scoreKey in scoreKeys) {
+    final double score = (data[scoreKey] as num).toDouble();
+    results.putIfAbsent(scoreKey, () => <double>[]).add(score);
+  }
+}
+
+Map<String, _ScoreSummary> _summarize(Map<String, List<double>> results) {
+  return results.map<String, _ScoreSummary>((String scoreKey, List<double> values) {
+    final double average = _computeAverage(values);
+    return MapEntry<String, _ScoreSummary>(scoreKey, _ScoreSummary(
+      average: average,
+      // If the average is zero, the benchmark got the perfect score with no noise.
+      noise: average > 0
+        ? _computeStandardDeviationForPopulation(values) / average
+        : 0.0,
+    ));
+  });
+}
+
+/// Computes the arithmetic mean (or average) of given [values].
+double _computeAverage(Iterable<double> values) {
+  final double sum = values.reduce((double a, double b) => a + b);
+  return sum / values.length;
+}
+
+/// Computes population standard deviation.
+///
+/// Unlike sample standard deviation, which divides by N - 1, this divides by N.
+///
+/// See also:
+///
+///  * https://en.wikipedia.org/wiki/Standard_deviation
+double _computeStandardDeviationForPopulation(Iterable<double> population) {
+  final double mean = _computeAverage(population);
+  final double sumOfSquaredDeltas = population.fold<double>(
+    0.0,
+    (double previous, num value) => previous += math.pow(value - mean, 2),
+  );
+  return math.sqrt(sumOfSquaredDeltas / population.length);
+}
+
+String _ratioToPercent(double value) {
+  return '${(value * 100).toStringAsFixed(2)}%';
+}
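One way to convince yourself that `_computeStandardDeviationForPopulation` divides by N rather than N - 1 is the classic worked example from the Wikipedia article referenced above, where the population standard deviation comes out to exactly 2. Below is a standalone sketch; `populationStdDev` is an illustrative re-implementation, not part of the patch.

```dart
import 'dart:math' as math;

// Population standard deviation: divides the sum of squared deltas by N,
// matching _computeStandardDeviationForPopulation above.
double populationStdDev(List<double> population) {
  final double mean =
      population.reduce((double a, double b) => a + b) / population.length;
  final double sumOfSquaredDeltas = population.fold<double>(
    0.0,
    (double sum, double value) => sum + math.pow(value - mean, 2),
  );
  return math.sqrt(sumOfSquaredDeltas / population.length);
}

void main() {
  // For [2, 4, 4, 4, 5, 5, 7, 9] the mean is 5, the squared deltas sum
  // to 32, and 32 / 8 = 4, so the population standard deviation is 2.
  // Sample standard deviation would instead give sqrt(32 / 7) ≈ 2.14.
  final List<double> values = <double>[2, 4, 4, 4, 5, 5, 7, 9];
  print(populationStdDev(values)); // 2.0
}
```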