<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><meta http-equiv="content-type" content="text/html; charset=utf-8" />
<title>[242658] trunk/Tools</title>
</head>
<body>
<style type="text/css"><!--
#msg dl.meta { border: 1px #006 solid; background: #369; padding: 6px; color: #fff; }
#msg dl.meta dt { float: left; width: 6em; font-weight: bold; }
#msg dt:after { content:':';}
#msg dl, #msg dt, #msg ul, #msg li, #header, #footer, #logmsg { font-family: verdana,arial,helvetica,sans-serif; font-size: 10pt; }
#msg dl a { font-weight: bold}
#msg dl a:link { color:#fc3; }
#msg dl a:active { color:#ff0; }
#msg dl a:visited { color:#cc6; }
h3 { font-family: verdana,arial,helvetica,sans-serif; font-size: 10pt; font-weight: bold; }
#msg pre { overflow: auto; background: #ffc; border: 1px #fa0 solid; padding: 6px; }
#logmsg { background: #ffc; border: 1px #fa0 solid; padding: 1em 1em 0 1em; }
#logmsg p, #logmsg pre, #logmsg blockquote { margin: 0 0 1em 0; }
#logmsg p, #logmsg li, #logmsg dt, #logmsg dd { line-height: 14pt; }
#logmsg h1, #logmsg h2, #logmsg h3, #logmsg h4, #logmsg h5, #logmsg h6 { margin: .5em 0; }
#logmsg h1:first-child, #logmsg h2:first-child, #logmsg h3:first-child, #logmsg h4:first-child, #logmsg h5:first-child, #logmsg h6:first-child { margin-top: 0; }
#logmsg ul, #logmsg ol { padding: 0; list-style-position: inside; margin: 0 0 0 1em; }
#logmsg ul { text-indent: -1em; padding-left: 1em; }#logmsg ol { text-indent: -1.5em; padding-left: 1.5em; }
#logmsg > ul, #logmsg > ol { margin: 0 0 1em 0; }
#logmsg pre { background: #eee; padding: 1em; }
#logmsg blockquote { border: 1px solid #fa0; border-left-width: 10px; padding: 1em 1em 0 1em; background: white;}
#logmsg dl { margin: 0; }
#logmsg dt { font-weight: bold; }
#logmsg dd { margin: 0; padding: 0 0 0.5em 0; }
#logmsg dd:before { content:'\00bb';}
#logmsg table { border-spacing: 0px; border-collapse: collapse; border-top: 4px solid #fa0; border-bottom: 1px solid #fa0; background: #fff; }
#logmsg table th { text-align: left; font-weight: normal; padding: 0.2em 0.5em; border-top: 1px dotted #fa0; }
#logmsg table td { text-align: right; border-top: 1px dotted #fa0; padding: 0.2em 0.5em; }
#logmsg table thead th { text-align: center; border-bottom: 1px solid #fa0; }
#logmsg table th.Corner { text-align: left; }
#logmsg hr { border: none 0; border-top: 2px dashed #fa0; height: 1px; }
#header, #footer { color: #fff; background: #636; border: 1px #300 solid; padding: 6px; }
#patch { width: 100%; }
#patch h4 {font-family: verdana,arial,helvetica,sans-serif;font-size:10pt;padding:8px;background:#369;color:#fff;margin:0;}
#patch .propset h4, #patch .binary h4 {margin:0;}
#patch pre {padding:0;line-height:1.2em;margin:0;}
#patch .diff {width:100%;background:#eee;padding: 0 0 10px 0;overflow:auto;}
#patch .propset .diff, #patch .binary .diff {padding:10px 0;}
#patch span {display:block;padding:0 10px;}
#patch .modfile, #patch .addfile, #patch .delfile, #patch .propset, #patch .binary, #patch .copfile {border:1px solid #ccc;margin:10px 0;}
#patch ins {background:#dfd;text-decoration:none;display:block;padding:0 10px;}
#patch del {background:#fdd;text-decoration:none;display:block;padding:0 10px;}
#patch .lines, .info {color:#888;background:#fff;}
--></style>
<div id="msg">
<dl class="meta">
<dt>Revision</dt> <dd><a href="http://trac.webkit.org/projects/webkit/changeset/242658">242658</a></dd>
<dt>Author</dt> <dd>sbarati@apple.com</dd>
<dt>Date</dt> <dd>2019-03-08 15:20:15 -0800 (Fri, 08 Mar 2019)</dd>
</dl>
<h3>Log Message</h3>
<pre>Add a compare-results script to compare benchmark results
https://bugs.webkit.org/show_bug.cgi?id=195486
&lt;rdar://problem/48723397&gt;

Reviewed by Geoffrey Garen.

This patch adds a script to compare benchmark results using Welch's two-tailed t test.
Initially, this patch only reasons about PLT5/JetStream2/Speedometer2. It will be easy
to extend it to learn about our other benchmarks.

* Scripts/compare-results: Added.
(readJSONFile):
(detectJetStream2):
(JetStream2Results):
(detectSpeedometer2):
(Speedometer2Results):
(detectPLT5):
(PLT5Results):
(detectBenchmark):
(biggerIsBetter):
(ttest):
(getOptions):
(main):</pre>
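<p>For context (not part of this changeset), the sketch below illustrates the statistical core of the new script: a two-tailed Welch's t-test via <code>scipy.stats.ttest_ind</code> with <code>equal_var=False</code>. The per-iteration scores are invented purely for illustration.</p>
<pre>
# A minimal sketch of the comparison compare-results performs, on made-up scores.
from scipy import stats
import numpy

a = [145.2, 147.9, 146.4, 148.1, 145.8]  # hypothetical per-iteration scores for build A
b = [149.0, 150.3, 148.7, 151.2, 149.9]  # hypothetical per-iteration scores for build B

# Welch's t-test does not assume the two samples share a variance.
tStatistic, pValue = stats.ttest_ind(a, b, equal_var=False)

print "a mean = {:.5f}".format(numpy.mean(a))
print "b mean = {:.5f}".format(numpy.mean(b))
print "pValue = {:.10f}".format(pValue)
if pValue &lt;= 0.05:
    print "Results ARE significant"
else:
    print "Results ARE NOT significant"
</pre>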
<h3>Modified Paths</h3>
<ul>
<li><a href="#trunkToolsChangeLog">trunk/Tools/ChangeLog</a></li>
</ul>
<h3>Added Paths</h3>
<ul>
<li><a href="#trunkToolsScriptscompareresults">trunk/Tools/Scripts/compare-results</a></li>
</ul>
</div>
<div id="patch">
<h3>Diff</h3>
<a id="trunkToolsChangeLog"></a>
<div class="modfile"><h4>Modified: trunk/Tools/ChangeLog (242657 => 242658)</h4>
<pre class="diff"><span>
<span class="info">--- trunk/Tools/ChangeLog 2019-03-08 23:19:27 UTC (rev 242657)
+++ trunk/Tools/ChangeLog 2019-03-08 23:20:15 UTC (rev 242658)
</span><span class="lines">@@ -1,3 +1,29 @@
</span><ins>+2019-03-08  Saam barati  &lt;sbarati@apple.com&gt;
+
+        Add a compare-results script to compare benchmark results
+        https://bugs.webkit.org/show_bug.cgi?id=195486
+        &lt;rdar://problem/48723397&gt;
+
+        Reviewed by Geoffrey Garen.
+
+        This patch adds a script to compare benchmark results using Welch's two-tailed t test.
+        Initially, this patch only reasons about PLT5/JetStream2/Speedometer2. It will be easy
+        to extend it to learn about our other benchmarks.
+
+        * Scripts/compare-results: Added.
+        (readJSONFile):
+        (detectJetStream2):
+        (JetStream2Results):
+        (detectSpeedometer2):
+        (Speedometer2Results):
+        (detectPLT5):
+        (PLT5Results):
+        (detectBenchmark):
+        (biggerIsBetter):
+        (ttest):
+        (getOptions):
+        (main):
+
</ins><span class="cx"> 2019-03-08  Stephanie Lewis  &lt;slewis@apple.com&gt;
</span><span class="cx">
</span><span class="cx">         Ensure old tab state is cleared between iterations of run-benchmark
</span></span></pre></div>
<a id="trunkToolsScriptscompareresults"></a>
<div class="addfile"><h4>Added: trunk/Tools/Scripts/compare-results (0 => 242658)</h4>
<pre class="diff"><span>
<span class="info">--- trunk/Tools/Scripts/compare-results (rev 0)
+++ trunk/Tools/Scripts/compare-results 2019-03-08 23:20:15 UTC (rev 242658)
</span><span class="lines">@@ -0,0 +1,198 @@
</span><ins>+#!/usr/bin/env python -u
+
+# Copyright (C) 2019 Apple Inc. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+# 3. Neither the name of Apple Inc. ("Apple") nor the names of
+#    its contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import sys
+import argparse
+import json
+
+try:
+    from scipy import stats
+except:
+    print "ERROR: scipy package is not installed. Run `pip install scipy`"
+    sys.exit(1)
+
+try:
+    import numpy
+except:
+    print "ERROR: numpy package is not installed. Run `pip install numpy`"
+    sys.exit(1)
+
+def readJSONFile(path):
+    with open(path, 'r') as contents:
+        return json.loads(contents.read())
+
+Speedometer2 = "Speedometer2"
+JetStream2 = "JetStream2"
+PLT5 = "PLT5"
+
+def detectJetStream2(payload):
+ return "JetStream2.0" in payload
+
+def JetStream2Results(payload):
+    assert detectJetStream2(payload)
+
+    js = payload["JetStream2.0"]
+    iterations = len(js["tests"]["gaussian-blur"]["metrics"]["Score"]["current"])
+    results = []
+    for i in range(iterations):
+        scores = []
+        for test in js["tests"].keys():
+            scores.append(js["tests"][test]["metrics"]["Score"]["current"][i])
+        geomean = stats.gmean(scores)
+
+        results.append(geomean)
+
+    return results
+
+def detectSpeedometer2(payload):
+ return "Speedometer-2" in payload
+
+def Speedometer2Results(payload):
+    assert detectSpeedometer2(payload)
+    results = []
+    for arr in payload["Speedometer-2"]["metrics"]["Score"]["current"]:
+        results.append(numpy.mean(arr))
+    return results
+
+def detectPLT5(payload):
+ if "iterations" not in payload:
+ return False
+ iterations = payload["iterations"]
+ if not isinstance(iterations, list):
+ return False
+ if not len(iterations):
+ return False
+ if "cold" not in iterations[0]:
+ return False
+ if "warm" not in iterations[0]:
+ return False
+ if "Geometric" not in iterations[0]:
+ return False
+ return True
+
+def PLT5Results(payload):
+    assert detectPLT5(payload)
+    results = []
+    for obj in payload["iterations"]:
+        results.append(obj["Geometric"])
+    return results
+
+def detectBenchmark(payload):
+    if detectJetStream2(payload):
+        return JetStream2
+    if detectSpeedometer2(payload):
+        return Speedometer2
+    if detectPLT5(payload):
+        return PLT5
+    return None
+
+def biggerIsBetter(benchmarkType):
+    if benchmarkType == JetStream2:
+        return True
+    if benchmarkType == Speedometer2:
+        return True
+    if benchmarkType == PLT5:
+        return False
+
+    print "Should not be reached."
+    assert False
+
+def ttest(benchmarkType, a, b):
+    # We use two-tailed Welch's
+    (tStatistic, pValue) = stats.ttest_ind(a, b, equal_var=False)
+    aMean = numpy.mean(a)
+    bMean = numpy.mean(b)
+    print "a mean = {:.5f}".format(aMean)
+    print "b mean = {:.5f}".format(bMean)
+
+    print "pValue = {:.10f}".format(pValue)
+
+    if biggerIsBetter(benchmarkType):
+        print "(Bigger means are better.)"
+        if aMean > bMean:
+            print "{:.3f} times worse".format((aMean / bMean))
+        else:
+            print "{:.3f} times better".format((bMean / aMean))
+    else:
+        print "(Smaller means are better.)"
+        if aMean > bMean:
+            print "{:.3f} times better".format((aMean / bMean))
+        else:
+            print "{:.3f} times worse".format((bMean / aMean))
+
+    if pValue &lt;= 0.05:
+        print "Results ARE significant"
+    else:
+        print "Results ARE NOT significant"
+
+def getOptions():
+    parser = argparse.ArgumentParser(description="Compare two WebKit benchmark results. Pass in two JSON result files to compare them. This script prints the pValue along with the magnitude of the change.")
+
+    parser.add_argument("-a",
+        type=str,
+        required=True,
+        help="a of a/b. Path to JSON results file.")
+
+    parser.add_argument("-b",
+        type=str,
+        required=True,
+        help="b of a/b. Path to JSON results file.")
+
+    return parser.parse_known_args()[0]
+
+
+def main():
+    args = getOptions()
+
+    a = readJSONFile(args.a)
+    b = readJSONFile(args.b)
+
+    typeA = detectBenchmark(a)
+    typeB = detectBenchmark(b)
+
+    if typeA != typeB:
+        print "-a and -b are not the same benchmark. a={} b={}".format(typeA, typeB)
+        sys.exit(1)
+
+    if not (typeA and typeB):
+        print "Unknown benchmark type. a={} b={}".format(typeA, typeB)
+        sys.exit(1)
+
+    if typeA == JetStream2:
+        ttest(typeA, JetStream2Results(a), JetStream2Results(b))
+    elif typeA == Speedometer2:
+        ttest(typeA, Speedometer2Results(a), Speedometer2Results(b))
+    elif typeA == PLT5:
+        ttest(typeA, PLT5Results(a), PLT5Results(b))
+    else:
+        print "Unknown benchmark type"
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()
+
</ins></span></pre>
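<p>As a rough usage sketch (file names and numbers below are hypothetical), the script is pointed at two JSON result files and infers the benchmark from the shape of each payload; for example, a Speedometer2-style file keeps its per-iteration scores under <code>Speedometer-2 / metrics / Score / current</code>.</p>
<pre>
# Hypothetical invocation (result file names are made up):
#   Tools/Scripts/compare-results -a baseline.json -b patched.json
#
# A minimal Speedometer2-shaped payload that detectSpeedometer2() accepts;
# the numbers are invented, only the nesting matters.
import json

payload = {
    "Speedometer-2": {
        "metrics": {
            "Score": {
                # One inner list per run; Speedometer2Results() averages each inner list.
                "current": [[118.0, 121.5, 119.2], [120.1, 122.3, 118.8]],
            }
        }
    }
}

with open("baseline.json", "w") as f:  # hypothetical path
    json.dump(payload, f)
</pre>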
</div>
</div>
</body>
</html>