From b097e688f8bf55f5b0c04c30d2cb4a42ce195815 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 10 Dec 2018 16:00:27 -0800 Subject: [PATCH] Add CGO 2019 BOLT paper artifact Summary: Add scripts to reproduce paper results for open-source workloads (clang and gcc). Reviewed By: maksfb Differential Revision: D13408607 fbshipit-source-id: 16d3ddee51c --- paper/reproduce-bolt-cgo19/README.md | 197 +++++++++++++ paper/reproduce-bolt-cgo19/breakdown.sh | 42 +++ paper/reproduce-bolt-cgo19/clang/Makefile | 338 ++++++++++++++++++++++ paper/reproduce-bolt-cgo19/gcc/Makefile | 267 +++++++++++++++++ 4 files changed, 844 insertions(+) create mode 100644 paper/reproduce-bolt-cgo19/README.md create mode 100755 paper/reproduce-bolt-cgo19/breakdown.sh create mode 100644 paper/reproduce-bolt-cgo19/clang/Makefile create mode 100644 paper/reproduce-bolt-cgo19/gcc/Makefile diff --git a/paper/reproduce-bolt-cgo19/README.md b/paper/reproduce-bolt-cgo19/README.md new file mode 100644 index 0000000..b595acc --- /dev/null +++ b/paper/reproduce-bolt-cgo19/README.md @@ -0,0 +1,197 @@ +# reproduce-bolt-cgo19 + +The open-source workloads evaluated in this paper are clang 7 and gcc 8.2. Our goal is +to demonstrate that building clang/gcc with all optimizations, including LTO and +PGO, still leaves opportunities for a post-link optimizer such as BOLT to do +a better job at basic block placement and function reordering, significantly +improving workload performance. + +In a nutshell, the paper advocates for a two-step profiling +pipeline (PGO and BOLT) evaluated in a data-center environment, showing +that doing a single pass of profile collection is not +enough to leverage the full potential of profile-guided optimizations. 
+In this two-step approach, PGO or AutoFDO can be used to feed the +compiler with profile information, which is an important enabler of better +inlining decisions, while a second pass is used to collect profile for a +post-link optimizer such as BOLT, enabling us to improve basic block order +and function order in the final binary and further increase performance. + +Notice that even though a PGO-enabled compiler already uses profile information +to enhance layout decisions, the reason why BOLT can still +get performance wins on top of it is +because BOLT profile is more accurate at the final steps of the compilation +and excels at such tasks. BOLT profile +is collected and applied directly at binary level and there is no +imperfect conversion step trying to map PC addresses back to source code +that relies on the accuracy of debug information. Since BOLT doesn't rely +on source code, it can also optimize assembly-written code or library code +you are statically linking for which there are no sources available. + +Furthermore, profiles used in compilers and BOLT, +for space-efficiency reasons, are not traces but an aggregation of execution +counts. This aggregation loses information: a given function accumulates +the superposition of many traces, each one possibly exercising a different path +of basic blocks, e.g. depending on its callee. Thus, it has limited +applicability and significant code +changes may render it stale. For example, after the compiler decides to inline a function that +was not previously inlined in the code where the profile was originally collected, +it now lacks the correct profile for this function when called exclusively +at that call site. BOLT, by operating at the final binary after +all compilation decisions that substantially change code have been taken, is +in a better position to do code layout and low-level optimizations suitable +to a lower-level IR. 
+ +## Usage + +Clone this repo, cd to either the clang or the gcc folder, depending on the workload +you want to evaluate, and run make as in the following commands: + +``` +> cd clang # or gcc +> vim Makefile # edit NUMCORES according to your system, customize Makefile +> make +> cat results.txt +``` + +Check the results.txt file for the numbers corresponding to the clang-build bars in +Figures 7 and 8 of the paper. + +These Makefile rules are based on the steps described at +https://github.com/facebookincubator/BOLT/blob/master/docs/OptimizingClang.md + +# Hardware prerequisites + +You will need a machine with a fair amount of RAM (32GB RAM is OK for the GCC +evaluation, but more is needed for Clang because of the expensive LTO tasks +running in parallel) and around 120GB of free disk space. +This machine needs an Intel processor with LBR support for profile data +collection. By now, LBR is well established on Intel processors - +the Sandy Bridge (2011) microarchitecture and later ones support LBR. +The lower your core count, the slower it will be, as this is building a large +code base several times (adjust the NUMCORES Makefile variable). The whole +process (evaluating GCC and Clang) takes about 6 hours using 40 threads running +simultaneously on our Broadwell setup (see below for specs). + +# Software prerequisites + +We present next a brief list of software prerequisites along with the +corresponding CentOS 7 package install command: + +``` +> git -- yum install git +> cmake -- yum install cmake +> ninja -- yum install ninja-build +> flex -- yum install flex +``` + +Since we build Clang/LLVM, check here for its own list +of requirements: http://llvm.org/docs/GettingStarted.html#requirements + +In general, for building Clang/LLVM, you should be fine if your system has a +relatively modern C++ compiler such as gcc version 4.8.0 or higher. + +# Troubleshooting + +Make sure you understand the rules in the Makefile before diagnosing an issue +and check the log files. 
+If one of the steps to build a compiler failed, it is best to wipe the compiler +build folder entirely before running make again. + +There are 5 compilers installed for clang and 4 for gcc: + +``` +> benchmarks/stage1 +> benchmarks/stage2 # (clang only) +> benchmarks/clangbolt # or gccbolt +> benchmarks/clangpgo # or gccpgo +> benchmarks/clangpgobolt # or gccpgobolt +``` + +You may want to delete one of these folders if the rules failed to make the +compiler. The next time you run make, it will restart the build process for +the compiler you deleted. Once these 5 (or 4) compilers are built, the Makefile will limit +itself to measuring the speed of 4 configurations and reporting them to you. + +## Out of memory + +If your system freezes, you may have run out of memory when doing the expensive +full LTO step for clang when building benchmarks/clangpgo. Edit the Makefile +in Step 6 and change make install -j $(NUMCORES) to a lower number (remove +$(NUMCORES) and use the number of threads you believe your system will +handle). + +## Downloading sources + +If your machine uses a proxy and you run into trouble with the default Makefile +rules to download sources, it is easier to download the sources yourself and +put them into the designated folders, so make can proceed to the build steps by +using your manually downloaded sources. These are the source folders used: + +``` +> benchmarks/llvm # llvm repo with Clang, LLD and compiler-rt (check Step 1) +> benchmarks/gcc # gcc sources after running ./contrib/download_prerequisites +> # (check step 9) +> src/llvm # llvm repo with BOLT (check step 7) +``` + +If you wish to run the Makefile steps organized in separate download, build and +experimental phases, you can use special rules to do so. 
This can be +useful if you need to separately download all source files in a machine that has internet +connection, then transfer the files to a builder machine with restricted connection +where you will resume the build and experimental steps there. These special rules are: + +``` +> make download_sources +> make build_all +> make results +``` + +## makeinfo failures + +When building gcc, makeinfo failures can happen if the last modified dates of source files +are inconsistent. In this case, you will see a "missing makeinfo" failure in +a gcc build. Notice that this message may be present in logs, but it is not +a fatal error. It is only fatal if make thinks it needs to update the +documentation files. This may happen if you manually copied the gcc source files +without preserving their original file dates, causing make to conclude it needs +to regenerate tex files. To avoid this, always copy gcc sources by +packing them first with a tool such as tar/gzip. + +# Results on different Intel microarchitectures + +Our 2-node Ivy Bridge test machine (Xeon E5-2680 v2) @ 2.8GHz, 40 logical cores +with 32GB RAM finished the GCC evaluation in 3 hours and 55 minutes. GCC +results were the following: + +``` +> gccpgo is 16.65% faster than baseline +> gccbolt is 25.04% faster than baseline +> gccpgobolt is 28.40% faster than baseline +``` + +The Clang evaluation took 3 hours and 45 minutes. Clang results were the +following: + +``` +> clangpgo is 36.72% faster than baseline +> clangbolt is 40.74% faster than baseline +> clangpgobolt is 61.96% faster than baseline +``` + +Our Broadwell test machine (Xeon E5-2680 v4) @ 2.4GHz, 56 logical cores with +256GB RAM finished the clang evaluation in 2 hours and 9 minutes. 
Clang
+results were the following:
+
+```
+> clangpgo is 34.65% faster than baseline
+> clangbolt is 30.72% faster than baseline
+> clangpgobolt is 50.55% faster than baseline
+```
+
+The GCC evaluation took 2 hours and 57 minutes on our Broadwell test machine
+and the results are the following:
+```
+> gccpgo is 10.57% faster than baseline
+> gccbolt is 11.57% faster than baseline
+> gccpgobolt is 14.44% faster than baseline
+```
diff --git a/paper/reproduce-bolt-cgo19/breakdown.sh b/paper/reproduce-bolt-cgo19/breakdown.sh
new file mode 100755
index 0000000..bb6a7a5
--- /dev/null
+++ b/paper/reproduce-bolt-cgo19/breakdown.sh
@@ -0,0 +1,42 @@
+#!/bin/bash -x
+# run_one LABEL N: re-measure with the current ${PERF}/${BOLT}, save to ../results-LABEL-N.
+function run_one() {
+  make clean_measurements
+  make PERFCOUNTERS="${PERF}" BOLTOPTS="${BOLT}" results_bolt
+  mkdir -p "../results-${1}-${2}"
+  cp -v * "../results-${1}-${2}"
+}
+# run LABEL: drop the bolted compilers, then measure once per perf counter set.
+function run() {
+  make clean_bolted_builds
+  PERF=" "
+  run_one "${1}" 1
+  PERF="-e instructions,L1-dcache-load-misses,dTLB-load-misses "
+  run_one "${1}" 2
+  PERF="-e instructions,L1-icache-load-misses,iTLB-load-misses "
+  run_one "${1}" 3
+  PERF="-e cycles,instructions,LLC-load-misses "
+  run_one "${1}" 4
+}
+# run_suite DIR: evaluate progressively larger BOLT option sets from inside DIR.
+function run_suite() {
+  cd "${1}" || exit 1  # never run make/cp from the wrong directory
+
+  BOLT="-reorder-blocks=cache+ -dyno-stats -use-gnu-stack"
+  run bb-reorder
+
+  BOLT="-reorder-blocks=cache+ -dyno-stats -use-gnu-stack -icf=1"
+  run bb-reorder-icf
+
+  BOLT="-reorder-blocks=cache+ -dyno-stats -use-gnu-stack -icf=1 -split-functions=3 -split-all-cold"
+  run bb-reorder-icf-split
+
+  BOLT="-reorder-blocks=cache+ -reorder-functions=hfsort+ -split-functions=3 -split-all-cold -dyno-stats -icf=1 -use-gnu-stack"
+  run bb-func
+
+  BOLT="-reorder-blocks=cache+ -reorder-functions=hfsort+ -split-functions=3 -split-all-cold -dyno-stats -icf=1 -use-gnu-stack -simplify-rodata-loads -frame-opt=hot -indirect-call-promotion=jump-tables -indirect-call-promotion-topn=3 -plt=all"
+  run bb-all
+}
+
+run_suite clang
+
diff --git a/paper/reproduce-bolt-cgo19/clang/Makefile 
b/paper/reproduce-bolt-cgo19/clang/Makefile new file mode 100644 index 0000000..d1111c7 --- /dev/null +++ b/paper/reproduce-bolt-cgo19/clang/Makefile @@ -0,0 +1,338 @@ +# Makefile recipes to reproduce the open-source results reported in +# "BOLT: A Practical Binary Optimizer for Data Centers and Beyond" +# CGO 2019 +# +# The open-source workload evaluated on this paper is clang 7. Our goal is +# to demonstrate that building clang with all optimizations, including LTO and +# PGO, still leaves opportunities for a post-link optimizer such as BOLT to do +# a better job at basic block placement and function reordering, significantly +# improving workload performance. +# +# Technical aspects: +# +# You will probably need a machine with at least 64GB RAM. The lower your core +# count, the slower it will be, as this is building a large code base several +# times, which benefits with a higher core count. +# +# These rules are based on the steps described at +# https://github.com/facebookincubator/BOLT/blob/master/docs/OptimizingClang.md +# +# It is important to adjust NUMCORES to the number of cores in your system as +# it *will* affect results. +# +# Note: This is a regular Makefile. If you want to re-do a step, simply delete +# the rule target or touch one of its prerequisites to be more updated than the +# target. 
+
+# The recipes in this file use bash-only syntax (`&>`, `|&`, `<<<`,
+# `echo -ne`), so do not rely on /bin/sh, which is dash on Debian/Ubuntu.
+SHELL := /bin/bash
+
+# Number of parallel jobs passed to every nested build (ninja/make -j).
+NUMCORES := 40
+# Smaller if your system doesn't have enough memory to handle several LTO builds
+# in parallel
+NUMCORESLTO := 4
+TOPLEV := $(shell pwd)
+SOURCES := $(TOPLEV)/src
+BOLTSOURCE := $(SOURCES)/llvm
+BOLT := $(SOURCES)/install/bin/llvm-bolt
+PERF2BOLT := $(SOURCES)/install/bin/perf2bolt
+BENCHMARKS := $(TOPLEV)/benchmarks
+CLANGSOURCE := $(BENCHMARKS)/llvm
+GCCSOURCE := $(BENCHMARKS)/gcc
+CLANGSTAGE1 := $(BENCHMARKS)/stage1/install/bin/clang
+CLANGSTAGE2 := $(BENCHMARKS)/stage2/install/bin/clang
+PGOPROFILE := $(BENCHMARKS)/stage2/clang.profdata
+CLANGPGO := $(BENCHMARKS)/clangpgo/install/bin/clang
+RAWDATA := $(BENCHMARKS)/stage2/perf.data
+BOLTDATA := $(BENCHMARKS)/stage2/bolt.fdata
+BOLTLOG := $(TOPLEV)/bolt.log
+MEASUREMENTS := $(TOPLEV)/measurements
+COMPARISON := $(TOPLEV)/comparison.txt
+RESULTS := $(TOPLEV)/results.txt
+LOG_TRAIN := $(TOPLEV)/output_training.txt
+# Set to anything other than "true" to drive the cmake builds with make.
+USE_NINJA := true
+# Number of timed builds per compiler configuration.
+NUM_EXP := 3
+EXPERIMENTS := $(shell seq 1 $(NUM_EXP))
+
+# ============================= PERF OPTIONS ===================================
+# PERFCOUNTERS can also be overridden on the make command line.
+
+# Measure task-clock, cycles, instructions, branches, branch misses
+PERFCOUNTERS :=
+
+# Measure dcache, dTLB-load-misses
+#PERFCOUNTERS := -e instructions,L1-dcache-load-misses,dTLB-load-misses
+
+# Measure icache, iTLB-load-misses
+#PERFCOUNTERS := -e instructions,L1-icache-load-misses,iTLB-load-misses
+
+# Measure LLC
+#PERFCOUNTERS := -e cycles,instructions,LLC-load-misses
+
+# ============================= BOLT OPTIONS ===================================
+# BOLTOPTS can also be overridden on the make command line.
+
+# Full options
+BOLTOPTS := -reorder-blocks=cache+ -reorder-functions=hfsort+ \
+-split-functions=3 -split-all-cold -dyno-stats -icf=1 -use-gnu-stack
+
+# Evaluating just BB reorder
+#BOLTOPTS := -reorder-blocks=cache+ -dyno-stats -use-gnu-stack
+
+# Evaluating BB reorder and ICF
+#BOLTOPTS := -reorder-blocks=cache+ -dyno-stats -use-gnu-stack -icf=1
+
+# Generator and build tool used for the cmake-based clang builds.
+ifeq (true, $(USE_NINJA))
+CMAKE := cmake -G Ninja
+MAKE_CMD := ninja
+else
+CMAKE := cmake
+MAKE_CMD := make
+endif
+
+# ================================= RULES ======================================
+
+# Every target below that names a command rather than a file is phony, so a
+# stray file called e.g. "results" or "build_all" cannot mask it.
+.PHONY: all clean distclean clean_measurements clean_bolted_builds \
+  download_sources build_all results results_bolt \
+  print_results_clangpgo print_results_clangbolt print_results_clangpgobolt
+
+# Never run the rules of this file concurrently: Step 4 and Step 11 share
+# $(BENCHMARKS)/train, and the measurement rules time complete builds.
+# Parallelism inside each build is controlled by NUMCORES instead.
+.NOTPARALLEL:
+
+all: print_results_clangpgo print_results_clangbolt print_results_clangpgobolt
+
+# Fetch every source tree without building anything.
+download_sources: $(CLANGSOURCE) $(GCCSOURCE) $(BOLTSOURCE)
+
+# Build BOLT and all compiler configurations without measuring them.
+build_all: $(BOLT) $(CLANGSTAGE1) $(CLANGPGO) \
+  $(BENCHMARKS)/clangbolt/install/bin/clang \
+  $(BENCHMARKS)/clangpgobolt/install/bin/clang
+
+# Measure and report all three configurations against the stage1 baseline.
+results: print_results_clangpgo print_results_clangbolt print_results_clangpgobolt
+
+# Measure and report only the two BOLT-processed configurations.
+results_bolt: print_results_clangbolt print_results_clangpgobolt
+
+# Step 1: Download clang sources
+$(CLANGSOURCE):
+	mkdir -p $(BENCHMARKS)
+	cd $(BENCHMARKS) && git clone -q --depth=1 --branch=release_70 \
+	  https://git.llvm.org/git/llvm.git/ llvm
+	cd $(BENCHMARKS)/llvm/tools && git clone -q --depth=1 --branch=release_70 \
+	  https://git.llvm.org/git/clang.git/
+	cd $(BENCHMARKS)/llvm/projects && git clone -q --depth=1 --branch=release_70 \
+	  https://git.llvm.org/git/lld.git/
+	cd $(BENCHMARKS)/llvm/projects && git clone -q --depth=1 --branch=release_70 \
+	  https://git.llvm.org/git/compiler-rt.git/
+
+# Step 2: Building stage1 clang compiler so we use the same compiler used in the
+# paper. Our goal is to improve our workload on top of this compiler.
+# The -q linker flag keeps relocations in the binary; this installation is
+# later copied and processed by BOLT (Step 10).
+$(CLANGSTAGE1): $(CLANGSOURCE)
+	mkdir -p $(BENCHMARKS)/stage1
+	export LDFLAGS="-Wl,-q,-znow" && cd $(BENCHMARKS)/stage1 && $(CMAKE) \
+	  $(CLANGSOURCE) -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
+	  -DLLVM_ENABLE_ASSERTIONS=OFF \
+	  -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_ASM_COMPILER=gcc \
+	  -DCMAKE_INSTALL_PREFIX=$(BENCHMARKS)/stage1/install \
+	  -DENABLE_LINKER_BUILD_ID=ON
+	cd $(BENCHMARKS)/stage1 && $(MAKE_CMD) install -j $(NUMCORES)
+
+# Step 3: Building stage2 clang with instrumentation capability. This is our
+# workload (clang itself). 
We have to enable instrumentation in order to collect
+# profile data for it, which will enable us to build a faster version of it
+# named clangpgo.
+$(CLANGSTAGE2): $(CLANGSTAGE1)
+	mkdir -p $(BENCHMARKS)/stage2
+	cd $(BENCHMARKS)/stage2 && $(CMAKE) $(CLANGSOURCE) \
+	  -DLLVM_TARGETS_TO_BUILD=X86 \
+	  -DCMAKE_BUILD_TYPE=Release \
+	  -DLLVM_ENABLE_ASSERTIONS=OFF \
+	  -DCMAKE_C_COMPILER=$(<) \
+	  -DCMAKE_CXX_COMPILER=$(<)++ \
+	  -DLLVM_USE_LINKER=lld \
+	  -DLLVM_BUILD_INSTRUMENTED=ON \
+	  -DCMAKE_INSTALL_PREFIX=$(BENCHMARKS)/stage2/install
+	cd $(BENCHMARKS)/stage2 && $(MAKE_CMD) install -j $(NUMCORES)
+
+# Step 4: Gather the PGO training profile. A compiler only produces profile
+# data while it compiles something, so the instrumented stage2 clang is used
+# to build clang once more in the scratch directory $(BENCHMARKS)/train.
+# NOTE(review): the recipe never creates the "profiles" target itself;
+# presumably the instrumented compiler writes it as a side effect - confirm.
+$(BENCHMARKS)/stage2/profiles: $(CLANGSTAGE2)
+	mkdir -p $(BENCHMARKS)/train
+	cd $(BENCHMARKS)/train && $(CMAKE) $(CLANGSOURCE) \
+	  -DLLVM_TARGETS_TO_BUILD=X86 \
+	  -DCMAKE_BUILD_TYPE=Release \
+	  -DCMAKE_C_COMPILER=$(<) \
+	  -DCMAKE_CXX_COMPILER=$(<)++ \
+	  -DLLVM_USE_LINKER=lld \
+	  -DCMAKE_INSTALL_PREFIX=$(BENCHMARKS)/train/install
+	cd $(BENCHMARKS)/train && $(MAKE_CMD) clang -j $(NUMCORES)
+
+# Step 5: Merge the raw profiles from Step 4 into the single .profdata file
+# consumed by the PGO + LTO build of Step 6.
+$(PGOPROFILE): $(BENCHMARKS)/stage2/profiles
+	cd $(<) && $(BENCHMARKS)/stage1/install/bin/llvm-profdata merge \
+	  -output=$@ *.profraw
+
+# Step 6: Build the fastest version of our open-source workload: PGO- and LTO-
+# enabled. We will show that BOLT can further speedup this binary (which is
+# clang the compiler driver and C++ frontend). 
+$(CLANGPGO): $(PGOPROFILE)
+	mkdir -p $(BENCHMARKS)/clangpgo
+	cd $(BENCHMARKS)/clangpgo && LDFLAGS="-Wl,-q,-znow" $(CMAKE) $(CLANGSOURCE) \
+	  -DLLVM_TARGETS_TO_BUILD=X86 \
+	  -DCMAKE_BUILD_TYPE=Release \
+	  -DCMAKE_C_COMPILER=$(CLANGSTAGE1) \
+	  -DCMAKE_CXX_COMPILER=$(CLANGSTAGE1)++ \
+	  -DLLVM_ENABLE_ASSERTIONS=OFF \
+	  -DLLVM_USE_LINKER=lld \
+	  -DLLVM_ENABLE_LTO=Full \
+	  -DENABLE_LINKER_BUILD_ID=ON \
+	  -DLLVM_PROFDATA_FILE=$(PGOPROFILE) \
+	  -DCMAKE_INSTALL_PREFIX=$(BENCHMARKS)/clangpgo/install
+	cd $(BENCHMARKS)/clangpgo && $(MAKE_CMD) clang -j $(NUMCORES)
+	cd $(BENCHMARKS)/clangpgo && $(MAKE_CMD) install -j $(NUMCORESLTO)
+
+# Step 7: Fetch the BOLT sources, the tool under evaluation. BOLT is pinned to
+# rev dd94222 (the one tested for this artifact submission) on top of a pinned
+# LLVM revision, and BOLT's llvm.patch is applied to that LLVM tree. Feel free
+# to use master.
+$(BOLTSOURCE):
+	mkdir -p $(@D)
+	cd $(@D) && git clone https://github.com/llvm-mirror/llvm \
+	  llvm -q --single-branch
+	cd $@/tools && git checkout -b llvm-bolt \
+	  f137ed238db11440f03083b1c88b7ffc0f4af65e
+	cd $@/tools && git clone \
+	  https://github.com/facebookincubator/BOLT llvm-bolt
+	cd $@/tools/llvm-bolt && git checkout \
+	  dd94222dabf6f8942c0fb6eb122bbfa60569dd5e
+	cd $@ && patch -p1 < tools/llvm-bolt/llvm.patch
+
+# Step 8: Configure, build and install BOLT into $(SOURCES)/install. This
+# build always uses plain cmake and make, whatever USE_NINJA says.
+$(BOLT): $(BOLTSOURCE)
+	mkdir -p $(SOURCES)/build
+	cd $(SOURCES)/build && cmake $(<) \
+	  -DLLVM_TARGETS_TO_BUILD="X86;AArch64" \
+	  -DCMAKE_BUILD_TYPE=Release \
+	  -DCMAKE_INSTALL_PREFIX=$(SOURCES)/install
+	cd $(SOURCES)/build && make install -j $(NUMCORES)
+
+# Step 9: Download GCC sources. The profile collected during a GCC build will be
+# used as our training data for BOLT when optimizing clang.
+# We use a different project to build so our training set is different than our
+# evaluation set. 
+$(GCCSOURCE):
+	mkdir -p $(@D)
+	cd $(@D) && git clone -q --depth=1 --branch=gcc-8_2_0-release \
+	  https://github.com/gcc-mirror/gcc gcc
+	cd $@ && ./contrib/download_prerequisites
+
+# Step 10: Prepare the two clang installations that BOLT will rewrite in
+# place: a copy of stage1 (clangbolt) and a copy of clangpgo (clangpgobolt).
+# Working on copies leaves the original installations untouched, so bolt's
+# effect can be measured against both.
+$(BENCHMARKS)/clangbolt: $(CLANGSTAGE1)
+	mkdir -p $@
+	cp -r $(BENCHMARKS)/stage1/install $@
+
+$(BENCHMARKS)/clangpgobolt: $(CLANGPGO)
+	mkdir -p $@
+	cp -r $(BENCHMARKS)/clangpgo/install $@
+
+# Step 11: Record the BOLT training profile for one clang installation by
+# building gcc with it under Linux perf (user-space cycles with branch
+# records). The scratch directory $(BENCHMARKS)/train is recreated each time.
+$(RAWDATA).clangbolt $(RAWDATA).clangpgobolt: \
+$(RAWDATA).%: $(BENCHMARKS)/% $(BOLT) $(GCCSOURCE)
+	-rm -rf $(BENCHMARKS)/train
+	mkdir -p $(BENCHMARKS)/train
+	cd $(BENCHMARKS)/train && \
+	  CC=$(BENCHMARKS)/$*/install/bin/clang \
+	  CXX=$(BENCHMARKS)/$*/install/bin/clang++ \
+	  $(GCCSOURCE)/configure --disable-bootstrap \
+	  --enable-linker-build-id --enable-languages=c,c++ \
+	  --with-gnu-as --with-gnu-ld --disable-multilib
+	cd $(BENCHMARKS)/train && perf record -e cycles:u -j any,u -o $@ \
+	  -- make maybe-all-gcc -j $(NUMCORES) &> $(LOG_TRAIN).$*
+
+# Step 12: Aggregate data. This is a data conversion step, reading perf.data
+# generated by Linux perf and creating the profile file used by BOLT. This needs
+# to read every sample recorded at each hardware performance counter event, read
+# the LBR for this event (16 branches or 32 addresses) and convert them to
+# aggregated edge counts. 
+$(BOLTDATA).clangbolt $(BOLTDATA).clangpgobolt: \
+$(BOLTDATA).%: $(RAWDATA).% $(PERF2BOLT)
+	cd $(BENCHMARKS)/stage2 && \
+	  $(PERF2BOLT) $(BENCHMARKS)/$(*)/install/bin/clang-7 -p $< -o $@ -w $@.yaml \
+	  |& tee $(BOLTLOG).$*
+
+# Step 13: Run BOLT now that we have both inputs: the profile data collected by
+# perf and the input binary (clang). BOLT should provide a log of the work it
+# did and output a faster binary (faster clang, for this case).
+# pipefail makes a BOLT failure stop the build here; without it the exit
+# status of the pipeline would be the one of tee and cp would run regardless.
+$(BENCHMARKS)/clangbolt/install/bin/clang \
+$(BENCHMARKS)/clangpgobolt/install/bin/clang:\
+$(BENCHMARKS)/%bolt/install/bin/clang: $(BOLTDATA).%bolt
+	set -o pipefail && \
+	  $(BOLT) $(@)-7 -o $(@)-7.bolt -b $(<).yaml $(BOLTOPTS) |& \
+	  tee -a $(BOLTLOG).$(*)bolt
+	cp $(@)-7.bolt $(@)-7
+
+# Step 14: Measure compile time to build a large project (clang itself)
+# to evaluate a compiler performance.
+# Each trial configures and builds clang in a fresh ${@}.work directory under
+# perf stat; the per-trial CSV outputs are then concatenated into the target.
+$(MEASUREMENTS).clangbolt $(MEASUREMENTS).stage1 $(MEASUREMENTS).clangpgo \
+$(MEASUREMENTS).clangpgobolt: \
+$(MEASUREMENTS).%: $(BENCHMARKS)/%/install/bin/clang
+	for number in $(EXPERIMENTS); do \
+	  mkdir -p ${@}.work ; \
+	  echo Measuring trial number $${number} for $* ; \
+	  cd ${@}.work && $(CMAKE) $(CLANGSOURCE) \
+	    -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
+	    -DCMAKE_C_COMPILER=${^} -DCMAKE_CXX_COMPILER=${^}++ \
+	    -DLLVM_USE_LINKER=lld -DCMAKE_INSTALL_PREFIX=$(BENCHMARKS)/eval/install \
+	    &> ${@}.log.$${number}; \
+	  perf stat $(PERFCOUNTERS) -x , -o ${@}.exp.$${number} -- \
+	    $(MAKE_CMD) clang -j $(NUMCORES) &>> ${@}.log.$${number} ;\
+	  rm -rf ${@}.work ;\
+	done
+	cat ${@}.exp.* &> ${@}
+
+# Step 15: Aggregate comparison results in a single file
+# The baseline (stage1) measurements always come first, so Step 16 can take
+# side A from the head of the file and side B from its tail.
+$(TOPLEV)/clangpgo.txt: $(MEASUREMENTS).stage1 $(MEASUREMENTS).clangpgo
+	cat $^ &> $@
+
+$(TOPLEV)/clangbolt.txt: $(MEASUREMENTS).stage1 $(MEASUREMENTS).clangbolt
+	cat $^ &> $@
+
+$(TOPLEV)/clangpgobolt.txt: $(MEASUREMENTS).stage1 $(MEASUREMENTS).clangpgobolt
+	cat $^ &> $@
+
+# awk program printing every data point (first CSV field) followed by one
+# "Mean: <m> StdDev: <s>" line using the sample standard deviation. With
+# fewer than two data points the deviation is reported as 0, and with none
+# the mean is 0 too, instead of aborting on a division by zero.
+AWK_SCRIPT := ' \
+  BEGIN \
+  { \
+    sum = 0; \
+    sumsq = 0; \
+  }; \
+  { \
+    sum += $$1; \
+    sumsq += ($$1)^2; \
+    printf "Data point %s: %f\n", NR, $$1 \
+  } \
+  END \
+  { \
+    mean = (NR > 0) ? sum / NR : 0; \
+    variance = (NR > 1) ? (sumsq - sum^2 / NR) / (NR - 1) : 0; \
+    if (variance < 0) variance = 0; \
+    printf "Mean: %f StdDev: %f\n", mean, sqrt(variance) \
+  }; \
+'
+
+# Step 16: Compare and print results
+# Side A is the baseline and side B the evaluated compiler; the reported
+# speedup is (mean A / mean B - 1) * 100 and is appended to $(RESULTS).
+print_results_clangpgo print_results_clangbolt print_results_clangpgobolt: \
+print_results_%: $(TOPLEV)/%.txt
+	echo "SIDE A:"
+	cat $< | grep task-clock | head -n $(NUM_EXP) | awk -F',' \
+	  $(AWK_SCRIPT) |& tee $(COMPARISON).a
+	echo "SIDE B:"
+	cat $< | grep task-clock | tail -n $(NUM_EXP) | awk -F',' \
+	  $(AWK_SCRIPT) |& tee $(COMPARISON).b
+	ASIDE=`cat $(COMPARISON).a | tail -n 1 | awk '{print $$2}'` \
+	  BSIDE=`cat $(COMPARISON).b | tail -n 1 | awk '{print $$2}'` \
+	  sh <<< 'COMP=$$(echo "scale=4;($$ASIDE / $$BSIDE - 1) * 100" | bc); \
+	  echo -ne "\n\n $* is $${COMP}% faster than baseline, average of $(NUM_EXP) experiments\n\n"' |& \
+	  tee -a $(RESULTS)
+
+# Cleaning steps
+# clean deletes final results, so experiments can be restarted
+# without rebuilding everything
+# distclean further removes benchmarks and BOLT sources
+clean:
+	-rm -rf $(MEASUREMENTS).* $(COMPARISON).* $(RESULTS) $(TOPLEV)/clangpgo.txt \
+	  $(TOPLEV)/clangbolt.txt $(TOPLEV)/clangpgobolt.txt
+
+clean_measurements: clean
+
+# Remove the bolted installations so they are recreated and processed by BOLT
+# again, e.g. after changing BOLTOPTS.
+clean_bolted_builds:
+	-rm -rf $(BENCHMARKS)/clangbolt $(BENCHMARKS)/clangpgobolt
+
+distclean: clean
+	-rm -rf $(BENCHMARKS) $(SOURCES) $(BOLTLOG).* $(LOG_TRAIN).*
diff --git a/paper/reproduce-bolt-cgo19/gcc/Makefile b/paper/reproduce-bolt-cgo19/gcc/Makefile
new file mode 100644
index 0000000..19e2754
--- /dev/null
+++ b/paper/reproduce-bolt-cgo19/gcc/Makefile
@@ -0,0 +1,267 @@
+# Makefile recipes to reproduce the open-source results reported in
+# "BOLT: A Practical Binary Optimizer for Data Centers and Beyond"
+# CGO 2019
+#
+# The open-source workload evaluated on this paper is gcc 8.2.
+#
+# It is important to adjust NUMCORES to the number of cores in your system as
+# it *will* affect results.
+#
+# Note: This is a regular Makefile. 
If you want to re-do a step, simply delete
+# the rule target or touch one of its prerequisites to be more updated than the
+# target.
+
+# The recipes in this file use bash-only syntax (`&>`, `|&`, `<<<`,
+# `echo -ne`), so do not rely on /bin/sh, which is dash on Debian/Ubuntu.
+SHELL := /bin/bash
+
+# Number of parallel jobs passed to every nested build (make/ninja -j).
+NUMCORES := 40
+TOPLEV := $(shell pwd)
+SOURCES := $(TOPLEV)/src
+BOLTSOURCE := $(SOURCES)/llvm
+BOLT := $(SOURCES)/install/bin/llvm-bolt
+PERF2BOLT := $(SOURCES)/install/bin/perf2bolt
+BENCHMARKS := $(TOPLEV)/benchmarks
+CLANGSOURCE := $(BENCHMARKS)/llvm
+GCCSOURCE := $(BENCHMARKS)/gcc
+GCCSTAGE1 := $(BENCHMARKS)/stage1/install/bin/gcc
+GCCPGO := $(BENCHMARKS)/gccpgo/install/bin/gcc
+RAWDATA := $(BENCHMARKS)/perf.data
+BOLTDATA := $(BENCHMARKS)/bolt.fdata
+BOLTLOG := $(TOPLEV)/bolt.log
+MEASUREMENTS := $(TOPLEV)/measurements
+COMPARISON := $(TOPLEV)/comparison.txt
+RESULTS := $(TOPLEV)/results.txt
+LOG_TRAIN := $(TOPLEV)/output_training.txt
+# Set to anything other than "true" to drive the cmake builds with make.
+USE_NINJA := true
+# Number of timed builds per compiler configuration.
+NUM_EXP := 3
+EXPERIMENTS := $(shell seq 1 $(NUM_EXP))
+
+# Generator and build tool used for the cmake-based measurement builds.
+ifeq (true, $(USE_NINJA))
+CMAKE := cmake -G Ninja
+MAKE_CMD := ninja
+else
+CMAKE := cmake
+MAKE_CMD := make
+endif
+
+# Every target below that names a command rather than a file is phony, so a
+# stray file called e.g. "results" or "build_all" cannot mask it.
+.PHONY: all clean distclean clean_measurements \
+  download_sources build_all results \
+  print_results_gccpgo print_results_gccbolt print_results_gccpgobolt
+
+# Never run the rules of this file concurrently: the profile collection rules
+# share $(BENCHMARKS)/train, and the measurement rules time complete builds.
+# Parallelism inside each build is controlled by NUMCORES instead.
+.NOTPARALLEL:
+
+all: print_results_gccpgo print_results_gccbolt print_results_gccpgobolt
+
+# Fetch every source tree without building anything.
+download_sources: $(CLANGSOURCE) $(GCCSOURCE) $(BOLTSOURCE)
+
+# Build BOLT and all compiler configurations without measuring them.
+build_all: $(BOLT) $(GCCSTAGE1) $(GCCPGO) \
+  $(BENCHMARKS)/gccbolt/install/bin/gcc \
+  $(BENCHMARKS)/gccpgobolt/install/bin/gcc
+
+# Measure and report all three configurations against the stage1 baseline.
+results: print_results_gccpgo print_results_gccbolt print_results_gccpgobolt
+
+# Step 1: Download GCC sources. 
+$(GCCSOURCE):
+	mkdir -p $(@D)
+	cd $(@D) && git clone -q --depth=1 --branch=gcc-8_2_0-release \
+	  https://github.com/gcc-mirror/gcc gcc
+	cd $@ && ./contrib/download_prerequisites
+
+# Step 2: Build and install the baseline compiler, a regular bootstrapped gcc.
+$(GCCSTAGE1): $(GCCSOURCE)
+	mkdir -p $(BENCHMARKS)/stage1
+	cd $(BENCHMARKS)/stage1 && $(<)/configure --enable-bootstrap \
+	  --enable-linker-build-id --enable-languages=c,c++ \
+	  --with-gnu-as --with-gnu-ld --disable-multilib \
+	  --prefix=$(BENCHMARKS)/stage1/install
+	cd $(BENCHMARKS)/stage1 && make -j $(NUMCORES)
+	cd $(BENCHMARKS)/stage1 && make install -j $(NUMCORES)
+
+# Step 3: Build and install the PGO compiler. It is configured exactly like
+# the baseline but built with gcc's profiledbootstrap target.
+$(GCCPGO): $(GCCSOURCE)
+	mkdir -p $(BENCHMARKS)/gccpgo
+	cd $(BENCHMARKS)/gccpgo && $(<)/configure --enable-bootstrap \
+	  --enable-linker-build-id --enable-languages=c,c++ \
+	  --with-gnu-as --with-gnu-ld --disable-multilib \
+	  --prefix=$(BENCHMARKS)/gccpgo/install
+	cd $(BENCHMARKS)/gccpgo && make profiledbootstrap -j $(NUMCORES)
+	cd $(BENCHMARKS)/gccpgo && make install -j $(NUMCORES)
+
+# Step 4: Download the open-source BOLT tool (which is being evaluated here)
+# This is using BOLT rev dd94222, which was tested during this artifact
+# submission. Feel free to use master. 
+$(BOLTSOURCE):
+	mkdir -p $(@D)
+	cd $(@D) && git clone https://github.com/llvm-mirror/llvm \
+	  llvm -q --single-branch
+	cd $@/tools && git checkout -b llvm-bolt \
+	  f137ed238db11440f03083b1c88b7ffc0f4af65e
+	cd $@/tools && git clone \
+	  https://github.com/facebookincubator/BOLT llvm-bolt
+	cd $@/tools/llvm-bolt && git checkout \
+	  dd94222dabf6f8942c0fb6eb122bbfa60569dd5e
+	cd $@ && patch -p1 < tools/llvm-bolt/llvm.patch
+
+# Step 5: Configure, build and install BOLT into $(SOURCES)/install. This
+# build always uses plain cmake and make, whatever USE_NINJA says.
+$(BOLT): $(BOLTSOURCE)
+	mkdir -p $(SOURCES)/build
+	cd $(SOURCES)/build && cmake $(<) \
+	  -DLLVM_TARGETS_TO_BUILD="X86;AArch64" \
+	  -DCMAKE_BUILD_TYPE=Release \
+	  -DCMAKE_INSTALL_PREFIX=$(SOURCES)/install
+	cd $(SOURCES)/build && make install -j $(NUMCORES)
+
+# Step 6: Fetch the clang 7 sources (llvm plus clang, lld and compiler-rt).
+# Building clang is the input used to measure how fast each gcc is.
+$(CLANGSOURCE):
+	mkdir -p $(@D)
+	cd $(@D) && git clone -q --depth=1 --branch=release_70 \
+	  https://git.llvm.org/git/llvm.git/ llvm
+	cd $@/tools && git clone -q --depth=1 --branch=release_70 \
+	  https://git.llvm.org/git/clang.git/
+	cd $@/projects && git clone -q --depth=1 --branch=release_70 \
+	  https://git.llvm.org/git/lld.git/
+	cd $@/projects && git clone -q --depth=1 --branch=release_70 \
+	  https://git.llvm.org/git/compiler-rt.git/
+
+# Step 7: Create new gcc installations with gcc binaries processed by BOLT.
+# We have two bolted versions of gcc: stage1+bolt and pgo+bolt, and we
+# evaluate the effect of bolt on both.
+# In order to be processed by BOLT, these gcc setups are built differently.
+# We add the -q linker flag to add relocation metadata to binaries, and
+# we also use -fno-reorder-blocks-and-partition to disable a gcc 8
+# optimization that renders the binary unsupported by BOLT. This is related
+# to function splitting. BOLT does function splitting by itself, but
+# can't read binaries with split functions. 
+$(BENCHMARKS)/gccbolt: $(GCCSOURCE)
+	mkdir -p $@
+	cd $@ && $(<)/configure --enable-bootstrap \
+	  --enable-linker-build-id --enable-languages=c,c++ \
+	  --with-gnu-as --with-gnu-ld --disable-multilib \
+	  --with-boot-ldflags='-Wl,-q,-znow -static-libstdc++ -static-libgcc' \
+	  --with-stage1-ldflags='-Wl,-q,-znow' \
+	  --prefix=$@/install
+	cd $@ && \
+	  make -j $(NUMCORES) BOOT_CFLAGS='-O2 -g -fno-reorder-blocks-and-partition'
+	cd $@ && make install -j $(NUMCORES)
+
+# Same as gccbolt above, but built with gcc's profiledbootstrap target.
+$(BENCHMARKS)/gccpgobolt: $(GCCSOURCE)
+	mkdir -p $@
+	cd $@ && $(<)/configure --enable-bootstrap \
+	  --with-boot-ldflags='-Wl,-q,-znow -static-libstdc++ -static-libgcc' \
+	  --with-stage1-ldflags='-Wl,-q,-znow' \
+	  --enable-linker-build-id --enable-languages=c,c++ \
+	  --with-gnu-as --with-gnu-ld --disable-multilib \
+	  --prefix=$@/install
+	cd $@ && make profiledbootstrap -j $(NUMCORES) \
+	  BOOT_CFLAGS='-O2 -g -fno-reorder-blocks-and-partition'
+	cd $@ && make install -j $(NUMCORES)
+
+# Step 8: Record the BOLT training profile for one gcc installation by
+# building gcc itself with it under Linux perf (user-space cycles with branch
+# records). The scratch directory $(BENCHMARKS)/train is recreated each time.
+$(RAWDATA).gccbolt $(RAWDATA).gccpgobolt: \
+$(RAWDATA).%: $(BENCHMARKS)/% $(BOLT) $(GCCSOURCE)
+	-rm -rf $(BENCHMARKS)/train
+	mkdir -p $(BENCHMARKS)/train
+	cd $(BENCHMARKS)/train && \
+	  CC=$(BENCHMARKS)/$*/install/bin/gcc \
+	  CXX=$(BENCHMARKS)/$*/install/bin/g++ \
+	  $(GCCSOURCE)/configure --disable-bootstrap \
+	  --enable-languages=c,c++ --with-gnu-as --with-gnu-ld --disable-multilib
+	cd $(BENCHMARKS)/train && perf record -e cycles:u -j any,u -o $@ \
+	  -- make maybe-all-gcc -j $(NUMCORES) &> $(LOG_TRAIN).$*
+
+# Step 9: Aggregate data. This is a data conversion step, reading perf.data
+# generated by Linux perf and creating the profile file used by BOLT. 
This needs
+# to read every sample recorded at each hardware performance counter event, read
+# the LBR for this event (16 branches or 32 addresses) and convert them to
+# aggregated edge counts.
+$(BOLTDATA).gccbolt $(BOLTDATA).gccpgobolt: \
+$(BOLTDATA).%: $(RAWDATA).% $(PERF2BOLT)
+	cd $(BENCHMARKS) && \
+	  $(PERF2BOLT) $(BENCHMARKS)/$(*)/install/libexec/gcc/x86_64-pc-linux-gnu/8.2.0/cc1plus \
+	  -p $< -o $@ -w $@.yaml |& tee $(BOLTLOG).$*
+
+# Step 10: Run BOLT now that we have both inputs: the profile data collected by
+# perf and the input binary (gcc). BOLT should provide a log of the work it
+# did and output a faster binary (faster gcc, for this case).
+# The binary rewritten here is cc1plus, not the bin/gcc target of the rule, so
+# the target is touched at the end. Otherwise it would stay older than its
+# profile and every later make run would re-run BOLT on an already processed
+# cc1plus.
+# pipefail makes a BOLT failure stop the build here; without it the exit
+# status of the pipeline would be the one of tee and cp would run regardless.
+$(BENCHMARKS)/gccbolt/install/bin/gcc $(BENCHMARKS)/gccpgobolt/install/bin/gcc:\
+$(BENCHMARKS)/%bolt/install/bin/gcc: $(BOLTDATA).%bolt
+	set -o pipefail && \
+	  $(BOLT) $(@D)/../libexec/gcc/x86_64-pc-linux-gnu/8.2.0/cc1plus \
+	  -o $(@D)/../libexec/gcc/x86_64-pc-linux-gnu/8.2.0/cc1plus.bolt -b $(<).yaml \
+	  -reorder-blocks=cache+ -reorder-functions=hfsort+ -split-functions=3 \
+	  -split-all-cold -dyno-stats -icf=1 -use-gnu-stack |& \
+	  tee -a $(BOLTLOG).$(*)bolt
+	cp $(@D)/../libexec/gcc/x86_64-pc-linux-gnu/8.2.0/cc1plus.bolt \
+	  $(@D)/../libexec/gcc/x86_64-pc-linux-gnu/8.2.0/cc1plus
+	touch $@
+
+# Step 11: Measure compile time to build a large project (clang)
+# to evaluate compiler performance. 
+# Each trial configures and builds clang in a fresh ${@}.work directory under
+# perf stat; the per-trial CSV outputs are then concatenated into the target.
+# NOTE(review): the cmake arguments after -DCMAKE_CXX_COMPILER= were lost in
+# the copy of this file being edited. $(<D)/g++ and the install prefix are
+# reconstructed to mirror the clang Makefile - confirm against upstream.
+$(MEASUREMENTS).gccbolt $(MEASUREMENTS).stage1 $(MEASUREMENTS).gccpgo \
+$(MEASUREMENTS).gccpgobolt: \
+$(MEASUREMENTS).%: $(BENCHMARKS)/%/install/bin/gcc $(CLANGSOURCE)
+	for number in $(EXPERIMENTS); do \
+	  mkdir -p ${@}.work ; \
+	  echo Measuring trial number $${number} for $* ; \
+	  cd ${@}.work && $(CMAKE) $(CLANGSOURCE) \
+	    -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
+	    -DCMAKE_C_COMPILER=$(<) -DCMAKE_CXX_COMPILER=$(<D)/g++ \
+	    -DCMAKE_INSTALL_PREFIX=$(BENCHMARKS)/eval/install \
+	    &> ${@}.log.$${number}; \
+	  perf stat -x , -o ${@}.exp.$${number} \
+	    -- $(MAKE_CMD) clang -j $(NUMCORES) \
+	    &>> ${@}.log.$${number} ;\
+	  rm -rf ${@}.work ;\
+	done
+	cat ${@}.exp.* &> ${@}
+
+# Step 12: Aggregate comparison results in a single file
+# The baseline (stage1) measurements always come first, so Step 13 can take
+# side A from the head of the file and side B from its tail.
+$(TOPLEV)/gccpgo.txt: $(MEASUREMENTS).stage1 $(MEASUREMENTS).gccpgo
+	cat $^ &> $@
+
+$(TOPLEV)/gccbolt.txt: $(MEASUREMENTS).stage1 $(MEASUREMENTS).gccbolt
+	cat $^ &> $@
+
+$(TOPLEV)/gccpgobolt.txt: $(MEASUREMENTS).stage1 $(MEASUREMENTS).gccpgobolt
+	cat $^ &> $@
+
+# awk program printing every data point (first CSV field) followed by one
+# "Mean: <m> StdDev: <s>" line using the sample standard deviation. With
+# fewer than two data points the deviation is reported as 0, and with none
+# the mean is 0 too, instead of aborting on a division by zero.
+AWK_SCRIPT := ' \
+  BEGIN \
+  { \
+    sum = 0; \
+    sumsq = 0; \
+  }; \
+  { \
+    sum += $$1; \
+    sumsq += ($$1)^2; \
+    printf "Data point %s: %f\n", NR, $$1 \
+  } \
+  END \
+  { \
+    mean = (NR > 0) ? sum / NR : 0; \
+    variance = (NR > 1) ? (sumsq - sum^2 / NR) / (NR - 1) : 0; \
+    if (variance < 0) variance = 0; \
+    printf "Mean: %f StdDev: %f\n", mean, sqrt(variance) \
+  }; \
+'
+
+# Step 13: Compare and print results;
+# Side A is the baseline and side B the evaluated compiler; the reported
+# speedup is (mean A / mean B - 1) * 100 and is appended to $(RESULTS).
+print_results_gccpgo print_results_gccbolt print_results_gccpgobolt: \
+print_results_%: $(TOPLEV)/%.txt
+	echo "SIDE A:"
+	cat $< | grep task-clock | head -n $(NUM_EXP) | awk -F',' \
+	  $(AWK_SCRIPT) |& tee $(COMPARISON).a
+	echo "SIDE B:"
+	cat $< | grep task-clock | tail -n $(NUM_EXP) | awk -F',' \
+	  $(AWK_SCRIPT) |& tee $(COMPARISON).b
+	ASIDE=`cat $(COMPARISON).a | tail -n 1 | awk '{print $$2}'` \
+	  BSIDE=`cat $(COMPARISON).b | tail -n 1 | awk '{print $$2}'` \
+	  sh <<< 'COMP=$$(echo "scale=4;($$ASIDE / $$BSIDE - 1) * 100" | bc); \
+	  echo -ne "\n\n $* is $${COMP}% faster than baseline, average of $(NUM_EXP) experiments\n\n"' |& \
+	  tee -a $(RESULTS)
+
+# Cleaning steps
+# clean deletes final results, so experiments can be 
restarted
+# without rebuilding everything
+# distclean further removes benchmarks and BOLT sources
+clean:
+	-rm -rf $(MEASUREMENTS).* $(COMPARISON).* $(RESULTS) \
+	  $(addprefix $(TOPLEV)/,gccpgo.txt gccbolt.txt gccpgobolt.txt)
+
+# Alias of clean.
+clean_measurements: clean
+
+# Also removes every source tree and build, plus the BOLT and training logs.
+distclean: clean
+	-rm -rf $(BENCHMARKS) $(SOURCES) $(BOLTLOG).* $(LOG_TRAIN).*