From b097e688f8bf55f5b0c04c30d2cb4a42ce195815 Mon Sep 17 00:00:00 2001
From: Rafael Auler <rafaelauler@fb.com>
Date: Mon, 10 Dec 2018 16:00:27 -0800
Subject: [PATCH] Add CGO 2019 BOLT paper artifact

Summary:
Add scripts to reproduce paper results for open-source
workloads (clang and gcc).

Reviewed By: maksfb

Differential Revision: D13408607

fbshipit-source-id: 16d3ddee51c
---
 paper/reproduce-bolt-cgo19/README.md      | 197 +++++++++++++
 paper/reproduce-bolt-cgo19/breakdown.sh   |  42 +++
 paper/reproduce-bolt-cgo19/clang/Makefile | 338 ++++++++++++++++++++++
 paper/reproduce-bolt-cgo19/gcc/Makefile   | 267 +++++++++++++++++
 4 files changed, 844 insertions(+)
 create mode 100644 paper/reproduce-bolt-cgo19/README.md
 create mode 100755 paper/reproduce-bolt-cgo19/breakdown.sh
 create mode 100644 paper/reproduce-bolt-cgo19/clang/Makefile
 create mode 100644 paper/reproduce-bolt-cgo19/gcc/Makefile

diff --git a/paper/reproduce-bolt-cgo19/README.md b/paper/reproduce-bolt-cgo19/README.md
new file mode 100644
index 0000000..b595acc
--- /dev/null
+++ b/paper/reproduce-bolt-cgo19/README.md
@@ -0,0 +1,197 @@
+# reproduce-bolt-cgo19
+
+The open-source workloads evaluated in this paper are clang 7 and gcc 8.2. Our goal is
+to demonstrate that building clang/gcc with all optimizations, including LTO and
+PGO, still leaves opportunities for a post-link optimizer such as BOLT to do
+a better job at basic block placement and function reordering, significantly
+improving workload performance.
+
+In a nutshell, the paper advocates for a two-step profiling
+pipeline (PGO and BOLT) evaluated on a data-center environment, showing
+that doing a single pass of profile collection is not
+enough to leverage the full potential of profile-guided optimizations.
+In this two-step approach, PGO or AutoFDO can be used to feed the
+compiler with profile information, which is an important enabler of better
+inlining decisions, while a second pass is used to collect profile for a
+post-link optimizer such as BOLT, enabling us to improve basic block order
+and function order in the final binary and further increase performance.
+
+Notice that even though a PGO-enabled compiler already uses profile information
+to enhance layout decisions, the reason why BOLT can still
+get performance wins on top of it is
+because BOLT profile is more accurate at the final steps of the compilation
+and excels at such tasks. BOLT profile
+is collected and applied directly at binary level and there is no
+imperfect conversion step trying to map PC addresses back to source code
+that relies on the accuracy of debug information. Since BOLT doesn't rely
+on source code, it can also optimize assembly-written code or library code
+you are statically linking for which there are no sources available.
+
+Furthermore, profiles used in compilers and BOLT,
+for space-efficiency reasons, are not traces but an aggregation of execution
+counts. This aggregation loses information: a given function accumulates
+the superposition of many traces, each one possibly exercising a different path
+of basic blocks, e.g. depending on its callee. Thus, it has limited
+applicability and significant code
+changes may render it stale. For example, after the compiler decides to inline a function that
+was not previously inlined in the code where the profile was originally collected,
+it now lacks the correct profile for this function when called exclusively
+at that call site. BOLT, by operating at the final binary after
+all compilation decisions that substantially change code have been taken, is
+in a better position to do code layout and low-level optimizations suitable
+to a lower-level IR.
+
+## Usage
+
+Clone this repo, cd to either the clang or the gcc folder, depending on the workload
+you want to evaluate, and run make as in the following commands:
+
+```
+> cd clang      # or gcc
+> vim Makefile  # edit NUMCORES according to your system, customize Makefile
+> make
+> cat results.txt
+```
+
+Check the results.txt file with the numbers for the clang-build bars in
+Figures 7 and 8 of the paper.
+
+These Makefile rules are based on the steps described at
+https://github.com/facebookincubator/BOLT/blob/master/docs/OptimizingClang.md
+
+# Hardware prerequisites
+
+You will need a machine with a fair amount of RAM (32GB RAM is OK for the GCC
+evaluation, but more is needed for Clang because of the expensive LTO tasks
+running in parallel) and around 120GB of free disk space.
+This machine needs an Intel processor with LBR support for profile data
+collection. By now, LBR is well established on Intel processors:
+the Sandy Bridge microarchitecture (2011) and later ones support LBR.
+The lower your core count, the slower it will be, as this is building a large
+code base several times (adjust the NUMCORES Makefile variable). The whole
+process (evaluating GCC and Clang) takes about 6 hours using 40 threads running
+simultaneously on our Broadwell setup (see below for specs).
+
+# Software prerequisites
+
+We present next a brief list of software prerequisites along with the
+corresponding CentOS 7 package install command:
+
+```
+> git -- yum install git
+> cmake -- yum install cmake
+> ninja -- yum install ninja-build
+> flex -- yum install flex
+```
+
+Since we build Clang/LLVM, check here for its own list
+of requirements: http://llvm.org/docs/GettingStarted.html#requirements
+
+In general, for building Clang/LLVM, you should be fine if your system has a
+relatively modern C++ compiler such as gcc version 4.8.0 or higher.
+
+# Troubleshooting
+
+Make sure you understand the rules in the Makefile before diagnosing an issue
+and check the log files.
+If one of the steps to build a compiler failed, it is best to wipe the compiler
+build folder entirely before running make again.
+
+There are 5 compilers installed for clang and 4 for gcc:
+
+```
+> benchmarks/stage1
+> benchmarks/stage2       # (clang only)
+> benchmarks/clangbolt    # or gccbolt
+> benchmarks/clangpgo     # or gccpgo
+> benchmarks/clangpgobolt # or gccpgobolt
+```
+
+You may want to delete one of these folders if the rules failed to make the
+compiler. The next time you run make, it will restart the build process for
+the compiler you deleted. Once these 5 (or 4) compilers are built, the Makefile will limit
+itself to measure the speed of 4 configurations and report them to you.
+
+## Out of memory
+
+If your system freezes, you may have run out of memory when doing the expensive
+full LTO step for clang when building benchmarks/clangpgo. Edit the Makefile
+and lower NUMCORESLTO, which sets the job count of the install command in
+Step 6 (install -j $(NUMCORESLTO)), to the number of parallel LTO links you
+believe your system will handle.
+
+## Downloading sources
+
+If your machine uses a proxy and you run into trouble with the default Makefile
+rules to download sources, it is easier to download the sources yourself and
+put them into the designated folders, so make can proceed to the build steps by
+using your manually downloaded sources. These are the source folders used:
+
+```
+> benchmarks/llvm     # llvm repo with Clang, LLD and compiler-rt (check Step 1)
+> benchmarks/gcc      # gcc sources after running ./contrib/download_prerequisites
+>                     # (check step 9)
+> src/llvm            # llvm repo with BOLT (check step 7)
+```
+
+If you wish to run the Makefile steps organized in separate download, build and
+experimental phases, you can use special rules to do so. This can be
+useful if you need to download all source files on a machine that has an internet
+connection, then transfer the files to a builder machine with restricted connectivity,
+where you will run the build and experimental steps. These special rules are:
+
+```
+> make download_sources
+> make build_all
+> make results
+```
+
+## makeinfo failures
+
+When building gcc, makeinfo failures can happen if the last modified dates of source files
+are inconsistent. In this case, you will see a "missing makeinfo" failure in
+a gcc build. Notice that this message may be present in logs, but it is not
+a fatal error. It is only fatal if make thinks it needs to update the
+documentation files. This may happen if you manually copied the gcc source files
+without preserving their original file dates, causing make to conclude it needs
+to regenerate tex files. To avoid this, always copy gcc sources by
+packing them first with a tool such as tar/gzip.
+
+# Results on different Intel microarchitectures
+
+Our 2-node Ivy Bridge test machine (Xeon E5-2680 v2) @ 2.8GHz, 40 logical cores
+with 32GB RAM finished the GCC evaluation in 3 hours and 55 minutes. GCC
+results were the following:
+
+```
+> gccpgo is 16.65% faster than baseline
+> gccbolt is 25.04% faster than baseline
+> gccpgobolt is 28.40% faster than baseline
+```
+
+The Clang evaluation took 3 hours and 45 minutes. Clang results were the
+following:
+
+```
+> clangpgo is 36.72% faster than baseline
+> clangbolt is 40.74% faster than baseline
+> clangpgobolt is 61.96% faster than baseline
+```
+
+Our Broadwell test machine (Xeon E5-2680 v4) @ 2.4GHz, 56 logical cores with
+256GB RAM finished the clang evaluation in 2 hours and 9 minutes. Clang
+results were the following:
+
+```
+> clangpgo is 34.65% faster than baseline
+> clangbolt is 30.72% faster than baseline
+> clangpgobolt is 50.55% faster than baseline
+```
+
+The GCC evaluation took 2 hours and 57 minutes on our Broadwell test machine
+and the results are the following:
+```
+> gccpgo is 10.57% faster than baseline
+> gccbolt is 11.57% faster than baseline
+> gccpgobolt is 14.44% faster than baseline
+```
diff --git a/paper/reproduce-bolt-cgo19/breakdown.sh b/paper/reproduce-bolt-cgo19/breakdown.sh
new file mode 100755
index 0000000..bb6a7a5
--- /dev/null
+++ b/paper/reproduce-bolt-cgo19/breakdown.sh
@@ -0,0 +1,42 @@
+#!/bin/bash -x
+
+function run_one() {  # $1: configuration name, $2: perf counter group index
+    make clean_measurements   # drop old measurements so every trial is re-run
+    make PERFCOUNTERS="${PERF}" BOLTOPTS="${BOLT}" results_bolt
+    mkdir -p "../results-${1}-${2}"
+    cp -v * "../results-${1}-${2}"   # options before operands; no -r, so directories are skipped
+}
+
+function run() {
+    # Drop the BOLTed compilers so they are regenerated with the current BOLT
+    # options, then measure once per perf counter group (numbered 1..4).
+    make clean_bolted_builds
+    local trial=0
+    for PERF in " " "-e instructions,L1-dcache-load-misses,dTLB-load-misses " \
+        "-e instructions,L1-icache-load-misses,iTLB-load-misses " \
+        "-e cycles,instructions,LLC-load-misses "; do
+        run_one ${1} $((++trial))
+    done
+}
+
+function run_suite() {
+    cd "${1}" || exit 1   # abort instead of running make/cp in the wrong directory
+    # Option sets are cumulative: each configuration extends the previous one.
+    BOLT="-reorder-blocks=cache+ -dyno-stats -use-gnu-stack"
+    run bb-reorder
+
+    BOLT="-reorder-blocks=cache+ -dyno-stats -use-gnu-stack -icf=1"
+    run bb-reorder-icf
+
+    BOLT="-reorder-blocks=cache+ -dyno-stats -use-gnu-stack -icf=1 -split-functions=3 -split-all-cold"
+    run bb-reorder-icf-split
+
+    BOLT="-reorder-blocks=cache+ -reorder-functions=hfsort+ -split-functions=3 -split-all-cold -dyno-stats -icf=1 -use-gnu-stack"
+    run bb-func
+
+    BOLT="-reorder-blocks=cache+ -reorder-functions=hfsort+ -split-functions=3 -split-all-cold -dyno-stats -icf=1 -use-gnu-stack -simplify-rodata-loads -frame-opt=hot -indirect-call-promotion=jump-tables -indirect-call-promotion-topn=3 -plt=all"
+    run bb-all
+}
+
+run_suite clang
+
diff --git a/paper/reproduce-bolt-cgo19/clang/Makefile b/paper/reproduce-bolt-cgo19/clang/Makefile
new file mode 100644
index 0000000..d1111c7
--- /dev/null
+++ b/paper/reproduce-bolt-cgo19/clang/Makefile
@@ -0,0 +1,338 @@
+# Makefile recipes to reproduce the open-source results reported in
+# "BOLT: A Practical Binary Optimizer for Data Centers and Beyond"
+# CGO 2019
+#
+# The open-source workload evaluated on this paper is clang 7. Our goal is
+# to demonstrate that building clang with all optimizations, including LTO and
+# PGO, still leaves opportunities for a post-link optimizer such as BOLT to do
+# a better job at basic block placement and function reordering, significantly
+# improving workload performance.
+#
+# Technical aspects:
+#
+# You will probably need a machine with at least 64GB RAM. The lower your core
+# count, the slower it will be, as this is building a large code base several
+# times, which benefits from a higher core count.
+#
+# These rules are based on the steps described at
+#  https://github.com/facebookincubator/BOLT/blob/master/docs/OptimizingClang.md
+#
+# It is important to adjust NUMCORES to the number of cores in your system as
+# it *will* affect results.
+#
+# Note: This is a regular Makefile. If you want to re-do a step, simply delete
+# the rule target or touch one of its prerequisites so that it becomes newer
+# than the target.
+
+# Recipes use bash-only syntax (&>, &>>, |&, <<<), so never fall back to /bin/sh.
+SHELL          := /bin/bash
+NUMCORES       := 40
+# Lower NUMCORESLTO if memory cannot handle several LTO builds in parallel.
+NUMCORESLTO    := 4
+TOPLEV         := $(shell pwd)
+SOURCES        := $(TOPLEV)/src
+BOLTSOURCE     := $(SOURCES)/llvm
+BOLT           := $(SOURCES)/install/bin/llvm-bolt
+PERF2BOLT      := $(SOURCES)/install/bin/perf2bolt
+BENCHMARKS     := $(TOPLEV)/benchmarks
+CLANGSOURCE    := $(BENCHMARKS)/llvm
+GCCSOURCE      := $(BENCHMARKS)/gcc
+CLANGSTAGE1    := $(BENCHMARKS)/stage1/install/bin/clang
+CLANGSTAGE2    := $(BENCHMARKS)/stage2/install/bin/clang
+PGOPROFILE     := $(BENCHMARKS)/stage2/clang.profdata
+CLANGPGO       := $(BENCHMARKS)/clangpgo/install/bin/clang
+RAWDATA        := $(BENCHMARKS)/stage2/perf.data
+BOLTDATA       := $(BENCHMARKS)/stage2/bolt.fdata
+BOLTLOG        := $(TOPLEV)/bolt.log
+MEASUREMENTS   := $(TOPLEV)/measurements
+COMPARISON     := $(TOPLEV)/comparison.txt
+RESULTS        := $(TOPLEV)/results.txt
+LOG_TRAIN      := $(TOPLEV)/output_training.txt
+USE_NINJA      := true
+NUM_EXP        := 3
+EXPERIMENTS    := $(shell seq 1 $(NUM_EXP))
+# ============================= PERF OPTIONS ===================================
+
+# Measure task-clock, cycles, instructions, branches, branch misses
+PERFCOUNTERS   :=
+
+# Measure dcache, dTLB-load-misses
+#PERFCOUNTERS   := -e instructions,L1-dcache-load-misses,dTLB-load-misses
+
+# Measure icache, iTLB-load-misses
+#PERFCOUNTERS   := -e instructions,L1-icache-load-misses,iTLB-load-misses
+
+# Measure LLC
+#PERFCOUNTERS   := -e cycles,instructions,LLC-load-misses
+
+# ============================= BOLT OPTIONS ===================================
+
+# Full options
+BOLTOPTS      := -reorder-blocks=cache+ -reorder-functions=hfsort+ \
+-split-functions=3 -split-all-cold -dyno-stats -icf=1 -use-gnu-stack
+
+# Evaluating just BB reorder
+#BOLTOPTS      := -reorder-blocks=cache+ -dyno-stats -use-gnu-stack
+
+# Evaluating BB reorder and ICF
+#BOLTOPTS       := -reorder-blocks=cache+ -dyno-stats -use-gnu-stack -icf=1
+
+ifeq (true, $(USE_NINJA))
+CMAKE          := cmake -G Ninja
+MAKE_CMD       := ninja
+else
+CMAKE          := cmake
+MAKE_CMD       := make
+endif
+
+# ================================= RULES ======================================
+
+.PHONY: all clean distclean clean_measurements clean_bolted_builds download_sources build_all results results_bolt print_results_clangpgo print_results_clangbolt print_results_clangpgobolt
+
+all: print_results_clangpgo print_results_clangbolt print_results_clangpgobolt
+
+download_sources: $(CLANGSOURCE) $(GCCSOURCE) $(BOLTSOURCE)
+
+build_all: $(BOLT) $(CLANGSTAGE1) $(CLANGPGO) \
+  $(BENCHMARKS)/clangbolt/install/bin/clang \
+  $(BENCHMARKS)/clangpgobolt/install/bin/clang
+
+results: print_results_clangpgo print_results_clangbolt print_results_clangpgobolt
+
+results_bolt: print_results_clangbolt print_results_clangpgobolt
+
+# Step 1: Download clang sources
+$(CLANGSOURCE):
+	mkdir -p $(BENCHMARKS)
+	cd $(BENCHMARKS)               && git clone -q --depth=1 --branch=release_70 \
+	  https://git.llvm.org/git/llvm.git/ llvm
+	cd $(BENCHMARKS)/llvm/tools    && git clone -q --depth=1 --branch=release_70 \
+	  https://git.llvm.org/git/clang.git/
+	cd $(BENCHMARKS)/llvm/projects && git clone -q --depth=1 --branch=release_70 \
+	  https://git.llvm.org/git/lld.git/
+	cd $(BENCHMARKS)/llvm/projects && git clone -q --depth=1 --branch=release_70 \
+	  https://git.llvm.org/git/compiler-rt.git/
+
+# Step 2: Building stage1 clang compiler so we use the same compiler used in the
+# paper. Our goal is to improve our workload on top of this compiler.
+$(CLANGSTAGE1): $(BENCHMARKS)/llvm
+	mkdir -p $(BENCHMARKS)/stage1
+	export LDFLAGS="-Wl,-q,-znow"  && cd $(BENCHMARKS)/stage1 && $(CMAKE) \
+	  $(CLANGSOURCE) -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
+	  -DLLVM_ENABLE_ASSERTIONS=OFF \
+	  -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_ASM_COMPILER=gcc \
+	  -DCMAKE_INSTALL_PREFIX=$(BENCHMARKS)/stage1/install \
+	  -DENABLE_LINKER_BUILD_ID=ON
+	cd $(BENCHMARKS)/stage1 && $(MAKE_CMD) install -j $(NUMCORES)
+
+# Step 3: Building stage2 clang with instrumentation capability. This is our
+# workload (clang itself). We have to enable instrumentation in order to collect
+# profile data for it, which will enable us to build a faster version of it
+# named clangpgo.
+$(CLANGSTAGE2): $(CLANGSTAGE1)
+	mkdir -p $(BENCHMARKS)/stage2
+	cd $(BENCHMARKS)/stage2 && $(CMAKE) $(CLANGSOURCE) \
+	  -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
+	  -DLLVM_ENABLE_ASSERTIONS=OFF \
+	  -DCMAKE_C_COMPILER=$(CLANGSTAGE1) \
+	  -DCMAKE_CXX_COMPILER=$(CLANGSTAGE1)++ \
+	  -DLLVM_USE_LINKER=lld -DLLVM_BUILD_INSTRUMENTED=ON \
+	  -DCMAKE_INSTALL_PREFIX=$(BENCHMARKS)/stage2/install
+	cd $(BENCHMARKS)/stage2 && $(MAKE_CMD) install -j $(NUMCORES)
+
+# Step 4: Collect profile data for our workload. Remember our workload is clang,
+# and since it is a compiler, we have to build something to collect profile. We
+# build clang itself again for this.
+$(BENCHMARKS)/stage2/profiles: $(CLANGSTAGE2)
+	mkdir -p $(BENCHMARKS)/train
+	cd $(BENCHMARKS)/train && $(CMAKE) $(CLANGSOURCE) \
+	  -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
+	  -DCMAKE_C_COMPILER=$(CLANGSTAGE2) \
+	  -DCMAKE_CXX_COMPILER=$(CLANGSTAGE2)++ \
+	  -DLLVM_USE_LINKER=lld \
+	  -DCMAKE_INSTALL_PREFIX=$(BENCHMARKS)/train/install
+	cd $(BENCHMARKS)/train && $(MAKE_CMD) clang -j $(NUMCORES)
+
+# Step 5: Merge profiles. Intermediate step to generate the PGO data to build
+# a faster workload (clang + lto + pgo).
+$(PGOPROFILE): $(BENCHMARKS)/stage2/profiles
+	cd $(BENCHMARKS)/stage2/profiles && \
+	  $(BENCHMARKS)/stage1/install/bin/llvm-profdata merge \
+	  -output=$(PGOPROFILE) *.profraw
+
+# Step 6: Build the fastest version of our open-source workload: PGO- and LTO-
+# enabled. We will show that BOLT can further speedup this binary (which is
+# clang the compiler driver and C++ frontend).
+$(CLANGPGO): $(PGOPROFILE)
+	mkdir -p $(BENCHMARKS)/clangpgo
+	export LDFLAGS="-Wl,-q,-znow" && cd $(BENCHMARKS)/clangpgo && $(CMAKE) \
+	  $(CLANGSOURCE) \
+	  -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
+	  -DCMAKE_C_COMPILER=$(CLANGSTAGE1) \
+	  -DCMAKE_CXX_COMPILER=$(CLANGSTAGE1)++ \
+	  -DLLVM_ENABLE_ASSERTIONS=OFF \
+	  -DLLVM_USE_LINKER=lld \
+	  -DLLVM_ENABLE_LTO=Full \
+	  -DENABLE_LINKER_BUILD_ID=ON \
+	  -DLLVM_PROFDATA_FILE=$< \
+	  -DCMAKE_INSTALL_PREFIX=$(BENCHMARKS)/clangpgo/install
+	cd $(BENCHMARKS)/clangpgo && $(MAKE_CMD) clang -j $(NUMCORES)
+	cd $(BENCHMARKS)/clangpgo && $(MAKE_CMD) install -j $(NUMCORESLTO)
+
+# Step 7: Download the open-source BOLT tool (which is being evaluated here)
+# This is using BOLT rev dd94222, which was tested during this artifact
+# submission. Feel free to use master.
+$(BOLTSOURCE):
+	mkdir -p $(SOURCES)
+	cd $(SOURCES)            && git clone https://github.com/llvm-mirror/llvm \
+	  llvm -q --single-branch
+	cd $(SOURCES)/llvm/tools && git checkout -b llvm-bolt \
+	  f137ed238db11440f03083b1c88b7ffc0f4af65e
+	cd $(SOURCES)/llvm/tools && git clone \
+	  https://github.com/facebookincubator/BOLT llvm-bolt
+	cd $(SOURCES)/llvm/tools/llvm-bolt && git checkout \
+	  dd94222dabf6f8942c0fb6eb122bbfa60569dd5e
+	cd $(SOURCES)/llvm && patch -p1 < tools/llvm-bolt/llvm.patch
+
+# Step 8: Build BOLT
+$(BOLT): $(BOLTSOURCE)
+	mkdir -p $(SOURCES)/build
+	cd $(SOURCES)/build && cmake $(BOLTSOURCE) \
+	  -DLLVM_TARGETS_TO_BUILD="X86;AArch64" \
+	  -DCMAKE_BUILD_TYPE=Release \
+	  -DCMAKE_INSTALL_PREFIX=$(SOURCES)/install
+	cd $(SOURCES)/build && make install -j $(NUMCORES)
+
+# Step 9: Download GCC sources. The profile collected during a GCC build will be
+# used as our training data for BOLT when optimizing clang.
+# We use a different project to build so our training set is different than our
+# evaluation set.
+$(GCCSOURCE):
+	mkdir -p $(BENCHMARKS)
+	cd $(BENCHMARKS)    && git clone -q --depth=1 --branch=gcc-8_2_0-release \
+	  https://github.com/gcc-mirror/gcc gcc
+	cd $(BENCHMARKS)/gcc && ./contrib/download_prerequisites
+
+# Step 10: Create new clang installations with clang binaries processed by BOLT.
+# We have two bolted versions of clang: stage1+bolt and pgo+bolt, and we
+# evaluate the effect of bolt on both.
+$(BENCHMARKS)/clangbolt: $(CLANGSTAGE1)
+	mkdir -p $(BENCHMARKS)/clangbolt
+	cd $(BENCHMARKS)/clangbolt && cp -r $(BENCHMARKS)/stage1/install .
+
+$(BENCHMARKS)/clangpgobolt: $(CLANGPGO)
+	mkdir -p $(BENCHMARKS)/clangpgobolt
+	cd $(BENCHMARKS)/clangpgobolt && cp -r $(BENCHMARKS)/clangpgo/install .
+
+# Step 11: Collect BOLT data for a clang installation (when building gcc)
+# BOLT data is collected with Linux perf.
+$(RAWDATA).clangbolt $(RAWDATA).clangpgobolt: \
+$(RAWDATA).%: $(BENCHMARKS)/% $(BOLT) $(GCCSOURCE)
+	-rm -rf $(BENCHMARKS)/train
+	mkdir -p $(BENCHMARKS)/train
+	cd $(BENCHMARKS)/train && CC=$(<)/install/bin/clang \
+	  CXX=$(<)/install/bin/clang++ \
+	  $(GCCSOURCE)/configure --disable-bootstrap \
+	  --enable-linker-build-id --enable-languages=c,c++ \
+	  --with-gnu-as --with-gnu-ld --disable-multilib
+	cd $(BENCHMARKS)/train && perf record -e cycles:u -j any,u -o $@ \
+	  -- make maybe-all-gcc -j $(NUMCORES) &> $(LOG_TRAIN).$*
+
+# Step 12: Aggregate data. This is a data conversion step, reading perf.data
+# generated by Linux perf and creating the profile file used by BOLT. This needs
+# to read every sample recorded at each hardware performance counter event, read
+# the LBR for this event (16 branches or 32 addresses) and convert them to
+# aggregated edge counts. pipefail makes a perf2bolt failure fail the rule.
+$(BOLTDATA).clangbolt $(BOLTDATA).clangpgobolt: \
+$(BOLTDATA).%: $(RAWDATA).% $(PERF2BOLT)
+	set -o pipefail; cd $(BENCHMARKS)/stage2 && \
+	  $(PERF2BOLT) $(BENCHMARKS)/$(*)/install/bin/clang-7 -p $< -o $@ -w $@.yaml \
+	  |& tee $(BOLTLOG).$*
+
+# Step 13: Run BOLT now that we have both inputs: the profile data collected by
+# perf and the input binary (clang). BOLT writes clang-7.bolt, which then replaces
+# clang-7; pipefail keeps tee from masking a BOLT failure (the log is appended).
+$(BENCHMARKS)/clangbolt/install/bin/clang \
+$(BENCHMARKS)/clangpgobolt/install/bin/clang:\
+$(BENCHMARKS)/%bolt/install/bin/clang: $(BOLTDATA).%bolt
+	set -o pipefail; $(BOLT) $(@)-7 -o $(@)-7.bolt -b $(<).yaml $(BOLTOPTS) |& \
+	  tee -a $(BOLTLOG).$(*)bolt
+	cp $(@)-7.bolt $(@)-7
+
+# Step 14: Time NUM_EXP clean builds of clang with each compiler under perf stat;
+# a failed configure or build aborts the rule instead of recording a bogus time.
+$(MEASUREMENTS).clangbolt $(MEASUREMENTS).stage1 $(MEASUREMENTS).clangpgo \
+$(MEASUREMENTS).clangpgobolt: \
+$(MEASUREMENTS).%: $(BENCHMARKS)/%/install/bin/clang
+	for number in $(EXPERIMENTS); do \
+	  rm -rf $@.work && mkdir -p $@.work || exit 1 ; \
+	  echo Measuring trial number $${number} for $* ; \
+	  cd $@.work && $(CMAKE) $(CLANGSOURCE) \
+	    -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
+	    -DCMAKE_C_COMPILER=$< -DCMAKE_CXX_COMPILER=$<++ \
+	    -DLLVM_USE_LINKER=lld -DCMAKE_INSTALL_PREFIX=$(BENCHMARKS)/eval/install \
+	    &> $@.log.$${number} || exit 1 ; \
+	  perf stat $(PERFCOUNTERS) -x , -o $@.exp.$${number} -- \
+	    $(MAKE_CMD) clang -j $(NUMCORES) &>> $@.log.$${number} || exit 1 ; \
+	  rm -rf $@.work ; \
+	done
+	cat $(addprefix $@.exp.,$(EXPERIMENTS)) > $@
+
+# Step 15: Aggregate comparison results in a single file
+$(TOPLEV)/clangpgo.txt: $(MEASUREMENTS).stage1 $(MEASUREMENTS).clangpgo
+	cat $^ &> $@
+
+$(TOPLEV)/clangbolt.txt: $(MEASUREMENTS).stage1 $(MEASUREMENTS).clangbolt
+	cat $^ &> $@
+
+$(TOPLEV)/clangpgobolt.txt: $(MEASUREMENTS).stage1 $(MEASUREMENTS).clangpgobolt
+	cat $^ &> $@
+
+AWK_SCRIPT := '                                                               \
+	BEGIN                                                                       \
+	{                                                                           \
+	  sum = 0; sumsq = 0;                                                       \
+	};                                                                          \
+	{                                                                           \
+	  sum += $$1;                                                               \
+	  sumsq += ($$1)^2;                                                         \
+	  printf "Data point %s: %f\n", NR, $$1                                     \
+	}                                                                           \
+	END                                                                         \
+	{                                                                           \
+	  var = (NR > 1) ? (sumsq - sum^2/NR)/(NR-1) : 0;                           \
+	  if (NR > 0) printf "Mean: %f StdDev: %f\n", sum/NR, sqrt(var)             \
+	};                                                                          \
+'
+
+# Step 16: Compare and print results
+print_results_clangpgo print_results_clangbolt print_results_clangpgobolt: \
+print_results_%: $(TOPLEV)/%.txt
+	echo "SIDE A:"
+	cat $< | grep task-clock | head -n $(NUM_EXP) | awk -F',' \
+	  $(AWK_SCRIPT) |& tee $(COMPARISON).a
+	echo "SIDE B:"
+	cat $< | grep task-clock | tail -n $(NUM_EXP) | awk -F',' \
+	  $(AWK_SCRIPT) |& tee $(COMPARISON).b
+	ASIDE=`cat $(COMPARISON).a | tail -n 1 | awk '{print $$2}'` \
+	  BSIDE=`cat $(COMPARISON).b | tail -n 1 | awk '{print $$2}'` \
+	  sh <<< 'COMP=$$(echo "scale=4;($$ASIDE / $$BSIDE - 1) * 100" | bc); \
+	          echo -ne "\n\n $* is $${COMP}% faster than \
+	          baseline, average of $(NUM_EXP) experiments\n\n"' |& \
+	  tee -a $(RESULTS)
+
+# Cleaning steps
+# clean deletes final results, so experiments can be restarted
+#   without rebuilding everything
+# distclean further removes benchmarks and BOLT sources
+clean:
+	-rm -rf $(MEASUREMENTS).* $(COMPARISON).* $(RESULTS) $(TOPLEV)/clangpgo.txt \
+	  $(TOPLEV)/clangbolt.txt $(TOPLEV)/clangpgobolt.txt
+
+clean_measurements: clean
+
+clean_bolted_builds:
+	-rm -rf $(BENCHMARKS)/clangbolt $(BENCHMARKS)/clangpgobolt
+
+distclean: clean
+	-rm -rf $(BENCHMARKS) $(SOURCES) $(BOLTLOG).* $(LOG_TRAIN).*
diff --git a/paper/reproduce-bolt-cgo19/gcc/Makefile b/paper/reproduce-bolt-cgo19/gcc/Makefile
new file mode 100644
index 0000000..19e2754
--- /dev/null
+++ b/paper/reproduce-bolt-cgo19/gcc/Makefile
@@ -0,0 +1,267 @@
+# Makefile recipes to reproduce the open-source results reported in
+# "BOLT: A Practical Binary Optimizer for Data Centers and Beyond"
+# CGO 2019
+#
+# The open-source workload evaluated in this paper is gcc 8.2.
+# It is important to adjust NUMCORES to the number of cores in your system as
+# it *will* affect results.
+#
+# Note: This is a regular Makefile. If you want to re-do a step, simply delete
+# the rule target or touch one of its prerequisites to be more updated than the
+# target.
+
+# Tunables: NUMCORES (parallel build jobs), USE_NINJA (build clang with ninja
+# instead of make) and NUM_EXP (number of measurement trials per compiler).
+NUMCORES       := 40
+# Recipes use bash-only syntax (|&, &>, &>>, <<<); make's default /bin/sh is
+# not bash on every system, so select it explicitly.
+SHELL          := bash
+TOPLEV         := $(shell pwd)
+SOURCES        := $(TOPLEV)/src
+BOLTSOURCE     := $(SOURCES)/llvm
+BOLT           := $(SOURCES)/install/bin/llvm-bolt
+PERF2BOLT      := $(SOURCES)/install/bin/perf2bolt
+BENCHMARKS     := $(TOPLEV)/benchmarks
+CLANGSOURCE    := $(BENCHMARKS)/llvm
+GCCSOURCE      := $(BENCHMARKS)/gcc
+GCCSTAGE1      := $(BENCHMARKS)/stage1/install/bin/gcc
+GCCPGO         := $(BENCHMARKS)/gccpgo/install/bin/gcc
+RAWDATA        := $(BENCHMARKS)/perf.data
+BOLTDATA       := $(BENCHMARKS)/bolt.fdata
+BOLTLOG        := $(TOPLEV)/bolt.log
+MEASUREMENTS   := $(TOPLEV)/measurements
+COMPARISON     := $(TOPLEV)/comparison.txt
+RESULTS        := $(TOPLEV)/results.txt
+LOG_TRAIN      := $(TOPLEV)/output_training.txt
+USE_NINJA      := true
+NUM_EXP        := 3
+EXPERIMENTS    := $(shell seq 1 $(NUM_EXP))
+
+CMAKE          := $(if $(filter true,$(USE_NINJA)),cmake -G Ninja,cmake)
+MAKE_CMD       := $(if $(filter true,$(USE_NINJA)),ninja,make)
+
+# Convenience entry points; none of these names a file, so all are phony.
+.PHONY: all results download_sources build_all clean distclean \
+  clean_measurements print_results_gccpgo print_results_gccbolt \
+  print_results_gccpgobolt
+
+all results: print_results_gccpgo print_results_gccbolt print_results_gccpgobolt
+
+download_sources: $(CLANGSOURCE) $(GCCSOURCE) $(BOLTSOURCE)
+
+build_all: $(BOLT) $(GCCSTAGE1) $(GCCPGO) \
+  $(BENCHMARKS)/gccbolt/install/bin/gcc $(BENCHMARKS)/gccpgobolt/install/bin/gcc
+
+# Step 1: Download GCC sources (gcc-8_2_0-release) and their prerequisites.
+$(GCCSOURCE):
+	mkdir -p $(@D)
+	git clone -q --depth=1 --branch=gcc-8_2_0-release \
+	  https://github.com/gcc-mirror/gcc $@
+	cd $@ && ./contrib/download_prerequisites
+
+# Step 2: Build the baseline compiler: gcc configured out-of-tree with
+# bootstrap enabled, built with the default goal, installed in stage1/install.
+$(GCCSTAGE1): $(GCCSOURCE)
+	mkdir -p $(BENCHMARKS)/stage1
+	cd $(BENCHMARKS)/stage1 && $(<)/configure \
+	  --prefix=$(BENCHMARKS)/stage1/install --enable-bootstrap \
+	  --enable-linker-build-id --enable-languages=c,c++ \
+	  --with-gnu-as --with-gnu-ld --disable-multilib
+	cd $(BENCHMARKS)/stage1 && make -j $(NUMCORES)
+	cd $(BENCHMARKS)/stage1 && make install -j $(NUMCORES)
+
+# Step 3: Build the PGO compiler: gcc configured out-of-tree with bootstrap
+# enabled, built with the profiledbootstrap goal, installed in gccpgo/install.
+$(GCCPGO): $(GCCSOURCE)
+	mkdir -p $(BENCHMARKS)/gccpgo
+	cd $(BENCHMARKS)/gccpgo && $(<)/configure \
+	  --prefix=$(BENCHMARKS)/gccpgo/install --enable-bootstrap \
+	  --enable-linker-build-id --enable-languages=c,c++ \
+	  --with-gnu-as --with-gnu-ld --disable-multilib
+	cd $(BENCHMARKS)/gccpgo && make profiledbootstrap -j $(NUMCORES)
+	cd $(BENCHMARKS)/gccpgo && make install -j $(NUMCORES)
+
+# Step 4: Download the open-source BOLT tool (which is being evaluated here).
+# llvm is checked out at f137ed2 and BOLT (cloned into tools/llvm-bolt) at
+# dd94222, as tested for this artifact submission. Feel free to use master.
+$(BOLTSOURCE):
+	mkdir -p $(@D)
+	cd $(@D) && git clone https://github.com/llvm-mirror/llvm \
+	  llvm -q --single-branch
+	cd $@/tools && git checkout -b llvm-bolt \
+	  f137ed238db11440f03083b1c88b7ffc0f4af65e
+	cd $@/tools && git clone \
+	  https://github.com/facebookincubator/BOLT llvm-bolt
+	cd $@/tools/llvm-bolt && git checkout \
+	  dd94222dabf6f8942c0fb6eb122bbfa60569dd5e
+	cd $@ && patch -p1 < tools/llvm-bolt/llvm.patch
+
+# Step 5: Build BOLT. $(PERF2BOLT) is expected in the same install tree, so it
+# is declared as a by-product of this build; make then knows how to obtain it.
+$(BOLT): $(BOLTSOURCE)
+	mkdir -p $(SOURCES)/build
+	cd $(SOURCES)/build && cmake $(BOLTSOURCE) -DCMAKE_BUILD_TYPE=Release \
+	  -DLLVM_TARGETS_TO_BUILD="X86;AArch64" -DCMAKE_INSTALL_PREFIX=$(SOURCES)/install
+	cd $(SOURCES)/build && make install -j $(NUMCORES)
+$(PERF2BOLT): $(BOLT) ;
+
+# Step 6: Download clang sources (used as our input to test gcc speed):
+# llvm itself, clang under tools/, lld and compiler-rt under projects/.
+$(CLANGSOURCE):
+	mkdir -p $(@D)
+	cd $(@D) && git clone -q --depth=1 --branch=release_70 \
+	  https://git.llvm.org/git/llvm.git/ llvm
+	cd $@/tools && git clone -q --depth=1 --branch=release_70 \
+	  https://git.llvm.org/git/clang.git/
+	cd $@/projects && for repo in lld compiler-rt; do \
+	  git clone -q --depth=1 --branch=release_70 \
+	    https://git.llvm.org/git/$$repo.git/ || exit 1; done
+
+# Step 7: Create new gcc installations with gcc binaries processed by BOLT.
+# We have two bolted versions of gcc: stage1+bolt and pgo+bolt, and we
+# evaluate the effect of bolt on both.
+# In order to be processed by BOLT, these gcc setups are built differently.
+# We add the -q linker flag to add relocation metadata to binaries, and
+# we also use -fno-reorder-blocks-and-partition to disable a gcc 8
+# optimization that renders the binary unsupported by BOLT. This is related
+# to function splitting. BOLT does function splitting by itself, but
+# can't read binaries with split functions.
+#
+# Both setups share a single recipe. They differ only in the make goal used
+# for the bootstrap: gccbolt runs the default goal, gccpgobolt runs
+# profiledbootstrap. The goal is selected per target through the
+# target-specific variable BOLT_BOOTSTRAP_GOAL.
+#
+# -Wl,-q is passed through both --with-stage1-ldflags and
+# --with-boot-ldflags.
+
+$(BENCHMARKS)/gccbolt:    BOLT_BOOTSTRAP_GOAL :=
+$(BENCHMARKS)/gccpgobolt: BOLT_BOOTSTRAP_GOAL := profiledbootstrap
+
+$(BENCHMARKS)/gccbolt $(BENCHMARKS)/gccpgobolt: $(GCCSOURCE)
+	mkdir -p $@
+	cd $@ && \
+	  $(GCCSOURCE)/configure --enable-bootstrap \
+	  --enable-linker-build-id \
+	  --enable-languages=c,c++ \
+	  --with-gnu-as --with-gnu-ld --disable-multilib \
+	  --with-boot-ldflags='-Wl,-q,-znow -static-libstdc++ -static-libgcc' \
+	  --with-stage1-ldflags='-Wl,-q,-znow' \
+	  --prefix=$@/install
+	cd $@ && make $(BOLT_BOOTSTRAP_GOAL) -j $(NUMCORES) \
+	  BOOT_CFLAGS='-O2 -g -fno-reorder-blocks-and-partition'
+	cd $@ && make install -j $(NUMCORES)
+
+# Step 8: Collect BOLT data for a gcc installation (when building gcc itself)
+# with Linux perf. The build log goes to $(LOG_TRAIN).<config>, POSIX-redirected.
+$(RAWDATA).gccbolt $(RAWDATA).gccpgobolt: \
+$(RAWDATA).%: $(BENCHMARKS)/% $(BOLT) $(GCCSOURCE)
+	-rm -rf $(BENCHMARKS)/train
+	mkdir -p $(BENCHMARKS)/train
+	cd $(BENCHMARKS)/train && CC=$(<)/install/bin/gcc \
+	  CXX=$(<)/install/bin/g++ \
+	  $(GCCSOURCE)/configure --disable-bootstrap \
+	  --enable-languages=c,c++ --with-gnu-as --with-gnu-ld --disable-multilib
+	cd $(BENCHMARKS)/train && perf record -e cycles:u -j any,u -o $@ \
+	  -- make maybe-all-gcc -j $(NUMCORES) > $(LOG_TRAIN).$* 2>&1
+
+# Step 9: Aggregate data. This is a data conversion step, reading perf.data
+# generated by Linux perf and creating the profile file used by BOLT: every
+# recorded sample and its LBR (16 branches or 32 addresses) is converted to
+# aggregated edge counts. Besides $@, a $@.yaml profile is written (-w); that
+# YAML file is what the BOLT step reads. Output is logged via POSIX 2>&1 | tee.
+$(BOLTDATA).gccbolt $(BOLTDATA).gccpgobolt: \
+$(BOLTDATA).%: $(RAWDATA).% $(PERF2BOLT)
+	cd $(BENCHMARKS) && \
+	  $(PERF2BOLT) $(BENCHMARKS)/$(*)/install/libexec/gcc/x86_64-pc-linux-gnu/8.2.0/cc1plus \
+	  -p $< -o $@ -w $@.yaml 2>&1 | tee $(BOLTLOG).$*
+
+# Step 10: Run BOLT on cc1plus with the profile from perf, then install the
+# optimized cc1plus over the original. The target is touched last so make sees
+# the step as done and never re-runs BOLT on the already-optimized binary.
+$(BENCHMARKS)/gccbolt/install/bin/gcc $(BENCHMARKS)/gccpgobolt/install/bin/gcc:\
+$(BENCHMARKS)/%bolt/install/bin/gcc: $(BOLTDATA).%bolt
+	$(BOLT) $(@D)/../libexec/gcc/x86_64-pc-linux-gnu/8.2.0/cc1plus \
+	  -o $(@D)/../libexec/gcc/x86_64-pc-linux-gnu/8.2.0/cc1plus.bolt -b $(<).yaml \
+	  -reorder-blocks=cache+ -reorder-functions=hfsort+ -split-functions=3 \
+	  -split-all-cold -dyno-stats -icf=1 -use-gnu-stack 2>&1 | tee -a $(BOLTLOG).$(*)bolt
+	cp $(@D)/../libexec/gcc/x86_64-pc-linux-gnu/8.2.0/cc1plus.bolt \
+	  $(@D)/../libexec/gcc/x86_64-pc-linux-gnu/8.2.0/cc1plus
+	touch $@
+
+# Step 11: Measure compile time to build a large project (clang)
+# to evaluate compiler performance. Each trial configures and builds in a fresh
+# work directory; a failed configure or build aborts instead of being recorded.
+$(MEASUREMENTS).gccbolt $(MEASUREMENTS).stage1 $(MEASUREMENTS).gccpgo \
+$(MEASUREMENTS).gccpgobolt: \
+$(MEASUREMENTS).%: $(BENCHMARKS)/%/install/bin/gcc $(CLANGSOURCE)
+	for number in $(EXPERIMENTS); do \
+	  mkdir -p $@.work ; \
+	  echo Measuring trial number $${number} for $* ; \
+	  cd $@.work && $(CMAKE) $(CLANGSOURCE) \
+	    -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
+	    -DCMAKE_C_COMPILER=$(<) -DCMAKE_CXX_COMPILER=$(<D)/g++ \
+	    -DCMAKE_INSTALL_PREFIX=$(BENCHMARKS)/eval/install \
+	    -DCMAKE_CXX_LINK_FLAGS="-Wl,-rpath,$(<D)/../lib64" \
+	    > $@.log.$${number} 2>&1 || exit 1 ; \
+	  perf stat -x , -o $@.exp.$${number} -- $(MAKE_CMD) clang -j $(NUMCORES) \
+	    >> $@.log.$${number} 2>&1 || exit 1 ; \
+	  rm -rf $@.work ; \
+	done
+	cat $(addprefix $@.exp.,$(EXPERIMENTS)) > $@ 2>&1
+
+# Step 12: Aggregate comparison results in a single file.
+# Each <config>.txt holds the baseline (stage1) measurements followed by the
+# measurements of <config>; print_results reads side A from the head and side
+# B from the tail of this file.
+# The redirection is written in POSIX form so it does not depend on the recipe
+# shell being bash.
+$(TOPLEV)/gccpgo.txt $(TOPLEV)/gccbolt.txt $(TOPLEV)/gccpgobolt.txt: \
+$(TOPLEV)/%.txt: $(MEASUREMENTS).stage1 $(MEASUREMENTS).%
+	cat $^ > $@ 2>&1
+
+# awk program (single-quoted for the shell): prints every first-field sample,
+# then the mean and sample standard deviation; 0 is used where undefined.
+AWK_SCRIPT := '                                                               \
+  BEGIN { sum = 0; sumsq = 0; };                                              \
+  {                                                                           \
+    sum += $$1;                                                               \
+    sumsq += ($$1)^2;                                                         \
+    printf "Data point %s: %f\n", NR, $$1                                     \
+  }                                                                           \
+  END {                                                                       \
+    mean = (NR > 0) ? sum/NR : 0;                                             \
+    var = (NR > 1) ? (sumsq - sum^2/NR)/(NR-1) : 0;                           \
+    if (var < 0) var = 0;                                                     \
+    printf "Mean: %f StdDev: %f\n", mean, sqrt(var)                           \
+  };                                                                          \
+'
+
+# Step 13: Compare and print results. SIDE A is the first $(NUM_EXP) task-clock
+# samples of the aggregated file, SIDE B the last $(NUM_EXP); plain POSIX sh only.
+print_results_gccpgo print_results_gccbolt print_results_gccpgobolt: \
+print_results_%: $(TOPLEV)/%.txt
+	echo "SIDE A:"
+	grep task-clock $< | head -n $(NUM_EXP) | awk -F',' $(AWK_SCRIPT) 2>&1 | \
+	  tee $(COMPARISON).a
+	echo "SIDE B:"
+	grep task-clock $< | tail -n $(NUM_EXP) | awk -F',' $(AWK_SCRIPT) 2>&1 | \
+	  tee $(COMPARISON).b
+	{ ASIDE=`tail -n 1 $(COMPARISON).a | awk '{print $$2}'`; \
+	  BSIDE=`tail -n 1 $(COMPARISON).b | awk '{print $$2}'`; \
+	  COMP=`echo "scale=4;($$ASIDE / $$BSIDE - 1) * 100" | bc`; \
+	  printf '\n\n %s is %s%% faster than baseline, average of %s experiments\n\n' \
+	    '$*' "$$COMP" '$(NUM_EXP)'; } 2>&1 | tee -a $(RESULTS)
+
+# Cleaning steps
+# clean deletes final results (measurements, comparisons, aggregated .txt
+#   files), so experiments can be restarted without rebuilding everything
+# distclean further removes benchmarks and BOLT sources
+clean:
+	-$(RM) -r $(MEASUREMENTS).* $(COMPARISON).* $(RESULTS) \
+	  $(addprefix $(TOPLEV)/,gccpgo.txt gccbolt.txt gccpgobolt.txt)
+
+clean_measurements: clean
+# Everything clean removes, plus benchmarks, sources, BOLT and training logs.
+distclean: clean
+	-$(RM) -r $(BENCHMARKS) $(SOURCES) $(BOLTLOG).* $(LOG_TRAIN).*