Error if function using indirect jmp touches redzone

The indirect jmp mitigation clobbers the redzone, so verify that doing so is harmless.
Comment fix.
2020-07-07 14:29:31 -07:00 · 2020-05-14 16:23:55 -07:00 · 2020-05-14 15:15:50 -07:00 · 2020-05-14 15:09:45 -07:00 · 2020-05-14 14:31:19 -07:00 · 2020-05-14 11:20:12 -07:00
117 changed files with 9055 additions and 3494 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,5 +1,29 @@
+include(ExternalProject)
+
 set(BOLT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(BOLT_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
+set(CMAKE_CXX_STANDARD 14)
+
+ExternalProject_Add(bolt_rt
+  SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/runtime"
+  STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-stamps
+  BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins
+  CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+             -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+             -DCMAKE_BUILD_TYPE=Release
+             -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}
+             -DCMAKE_INSTALL_PREFIX=${LLVM_BINARY_DIR}
+  # Always re-run the bolt_rt build step so that cmake picks up source code
+  # changes, which is what you want when actively developing bolt_rt
+  BUILD_ALWAYS True
+  )
+
+install(CODE "execute_process\(COMMAND \${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=\${CMAKE_INSTALL_PREFIX} -P ${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/cmake_install.cmake \)"
+  COMPONENT bolt_rt)
+
+add_llvm_install_targets(install-bolt_rt
+  DEPENDS bolt_rt
+  COMPONENT bolt_rt)

 add_subdirectory(src)
 add_subdirectory(test)
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -21,6 +21,14 @@ We actively welcome your pull requests.
  before it can be merged.
 * When all of the tests are passing and all other conditions described above
  satisfied, the PR is ready for review and merge.
+* If you haven't already, complete the Contributor License Agreement ("CLA").
+
+## Contributor License Agreement ("CLA")
+
+In order to accept your pull request, we need you to submit a CLA. You only need
+to do this once to work on any of Facebook's open source projects.
+
+Complete your CLA here: <https://code.facebook.com/cla>

 ## Issues

--- a/docs/Heatmap.png
+++ b/docs/Heatmap.png
--- a/docs/Heatmaps.md
+++ b/docs/Heatmaps.md
@ -0,0 +1,50 @@
+# Code Heatmaps
+
+BOLT has gained the ability to print code heatmaps based on
+sampling-based LBR profiles generated by `perf`. The output is produced
+in colored ASCII to be displayed in a color-capable terminal. It looks
+something like this:
+
+![](./Heatmap.png)
+
+Heatmaps can be generated for BOLTed and non-BOLTed binaries. You can
+use them to compare the code layout before and after optimizations.
+
+To generate a heatmap, start with running your app under `perf`:
+
+```bash
+$ perf record -e cycles:u -j any,u -- <executable with args>
+```
+or if you want to monitor the existing process(es):
+```bash
+$ perf record -e cycles:u -j any,u [-p PID|-a] -- sleep <interval>
+```
+
+Note that at the moment running with LBR (`-j any,u` or `-b`) is
+a requirement.
+
+Once the run is complete and `perf.data` has been generated, run BOLT in
+heatmap mode:
+
+```bash
+$ llvm-bolt heatmap -p perf.data <executable>
+```
+
+By default the heatmap will be dumped to *stdout*. You can change it
+with the `-o <heatmapfile>` option. Each character/block in the heatmap
+shows the execution data accumulated for the corresponding 64 bytes of
+code. You can change this granularity with the `-block-size` option.
+E.g. set it to 4096 to see code usage grouped by 4K pages.
+Other useful options are:
+
+```bash
+-line-size=<uint>   - number of entries per line (default 256)
+-max-address=<uint> - maximum address considered valid for heatmap (default 4GB)
+```
+
+If you prefer to look at the data in a browser (or would like to share
+it that way), then you can use an HTML conversion tool. E.g.:
+
+```bash
+$ aha -b -f <heatmapfile> > <heatmapfile>.html
+```
--- a/llvm.patch
+++ b/llvm.patch
@ -848,7 +848,7 @@ index 8e9b4ac5632..d2c569e3399 100644
                          SMLoc Loc) override;
   void
 diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h
-index 582a836023b..0b15454ecd6 100644
+index 582a836023b..f1e341bd624 100644
 --- a/include/llvm/MC/MCStreamer.h
 +++ b/include/llvm/MC/MCStreamer.h
@@ -199,7 +199,7 @@ class MCStreamer {
@ -860,17 +860,6 @@ index 582a836023b..0b15454ecd6 100644
 
   /// \brief This is stack of current and previous section values saved by
   /// PushSection.
-@@ -290,8 +290,8 @@ public:
-   /// If the comment includes embedded \n's, they will each get the comment
-   /// prefix as appropriate.  The added comment should not end with a \n.
-   /// By default, each comment is terminated with an end of line, i.e. the
-  /// EOL param is set to true by default. If one prefers not to end the 
-  /// comment with a new line then the EOL param should be passed 
-+  /// EOL param is set to true by default. If one prefers not to end the
-+  /// comment with a new line then the EOL param should be passed
-   /// with a false value.
-   virtual void AddComment(const Twine &T, bool EOL = true) {}
- 
@@ -338,9 +338,7 @@ public:
 
   /// \brief Returns an index to represent the order a symbol was emitted in.
@ -1009,11 +998,10 @@ index 46504e74bc2..836fd8ddc45 100644
   Expected<Elf_Shdr_Range> sections() const;
 
   Expected<Elf_Sym_Range> symbols(const Elf_Shdr *Sec) const {
-@@ -396,6 +408,34 @@ void ELFFile<ELFT>::getRelocationTypeName(uint32_t Type,
-   }
+@@ -397,6 +409,34 @@ void ELFFile<ELFT>::getRelocationTypeName(uint32_t Type,
 }
 
-+template <class ELFT>
+ template <class ELFT>
 +Expected<const typename ELFFile<ELFT>::Elf_Dyn *>
 +ELFFile<ELFT>::dynamic_table_begin(const Elf_Phdr *Phdr) const {
 +  if (!Phdr)
@ -1041,9 +1029,10 @@ index 46504e74bc2..836fd8ddc45 100644
 +  return reinterpret_cast<const Elf_Dyn *>(base() + End);
 +}
 +
- template <class ELFT>
+template <class ELFT>
 Expected<const typename ELFT::Sym *>
 ELFFile<ELFT>::getRelocationSymbol(const Elf_Rel *Rel,
+                                    const Elf_Shdr *SymTab) const {
 diff --git a/include/llvm/Object/ELFObjectFile.h b/include/llvm/Object/ELFObjectFile.h
 index 4d001039238..62837bbcaa0 100644
 --- a/include/llvm/Object/ELFObjectFile.h
@ -1056,11 +1045,10 @@ index 4d001039238..62837bbcaa0 100644
   relocation_iterator section_rel_begin(DataRefImpl Sec) const override;
   relocation_iterator section_rel_end(DataRefImpl Sec) const override;
   section_iterator getRelocatedSection(DataRefImpl Sec) const override;
-@@ -716,6 +717,14 @@ bool ELFObjectFile<ELFT>::isSectionVirtual(DataRefImpl Sec) const {
-   return getSection(Sec)->sh_type == ELF::SHT_NOBITS;
+@@ -717,6 +718,14 @@ bool ELFObjectFile<ELFT>::isSectionVirtual(DataRefImpl Sec) const {
 }
 
-+template <class ELFT>
+ template <class ELFT>
 +bool ELFObjectFile<ELFT>::isSectionReadOnly(DataRefImpl Sec) const {
 +  const Elf_Shdr *EShdr = getSection(Sec);
 +  return EShdr->sh_flags & ELF::SHF_ALLOC &&
@ -1068,9 +1056,10 @@ index 4d001039238..62837bbcaa0 100644
 +         EShdr->sh_type == ELF::SHT_PROGBITS;
 +}
 +
- template <class ELFT>
+template <class ELFT>
 relocation_iterator
 ELFObjectFile<ELFT>::section_rel_begin(DataRefImpl Sec) const {
+   DataRefImpl RelData;
@@ -751,9 +760,6 @@ ELFObjectFile<ELFT>::section_rel_end(DataRefImpl Sec) const {
 template <class ELFT>
 section_iterator
@ -1101,7 +1090,7 @@ index 4d001039238..62837bbcaa0 100644
   if (sec->sh_type == ELF::SHT_REL)
     return getRel(Rel)->r_offset;
 diff --git a/include/llvm/Object/MachO.h b/include/llvm/Object/MachO.h
-index bfd3462bf69..9be0b260f34 100644
+index bfd3462bf69..52bc210b577 100644
 --- a/include/llvm/Object/MachO.h
 +++ b/include/llvm/Object/MachO.h
@@ -320,6 +320,7 @@ public:
@ -1112,15 +1101,6 @@ index bfd3462bf69..9be0b260f34 100644
   relocation_iterator section_rel_begin(DataRefImpl Sec) const override;
   relocation_iterator section_rel_end(DataRefImpl Sec) const override;
 
-@@ -331,7 +332,7 @@ public:
- 
-   relocation_iterator locrel_begin() const;
-   relocation_iterator locrel_end() const;
-  
-+
-   void moveRelocationNext(DataRefImpl &Rel) const override;
-   uint64_t getRelocationOffset(DataRefImpl Rel) const override;
-   symbol_iterator getRelocationSymbol(DataRefImpl Rel) const override;
 diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h
 index 9c4ae94d3a6..64342723371 100644
 --- a/include/llvm/Object/ObjectFile.h
@ -1215,18 +1195,9 @@ index d11f5a83779..0ad115c886b 100644
   /// FD is the file descriptor that this writes to.  If ShouldClose is true,
   /// this closes the file when the stream is destroyed. If FD is for stdout or
 diff --git a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
-index adada672af0..c9c79971a25 100644
+index adada672af0..b3d68ed66af 100644
 --- a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
 +++ b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
-@@ -38,7 +38,7 @@ DWARFAbbreviationDeclaration::DWARFAbbreviationDeclaration() {
- }
- 
- bool
-DWARFAbbreviationDeclaration::extract(DataExtractor Data, 
-+DWARFAbbreviationDeclaration::extract(DataExtractor Data,
-                                       uint32_t* OffsetPtr) {
-   clear();
-   const uint32_t Offset = *OffsetPtr;
@@ -61,13 +61,15 @@ DWARFAbbreviationDeclaration::extract(DataExtractor Data,
 
   // Read all of the abbreviation attributes and forms.
@ -1587,7 +1558,7 @@ index 3d274b63a4f..cef29f4b41d 100644
 
 StringRef RuntimeDyld::getErrorString() { return Dyld->getErrorString(); }
 diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
-index 36b43ec9b78..3dc3e8f325c 100644
+index 36b43ec9b78..1a56e590014 100644
 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
 +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -270,6 +270,25 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section,
@ -1688,15 +1659,6 @@ index 36b43ec9b78..3dc3e8f325c 100644
       resolveAArch64Branch(SectionID, Value, RelI, Stubs);
     } else if (RelType == ELF::R_AARCH64_ADR_GOT_PAGE) {
       // Craete new GOT entry or find existing one. If GOT entry is
-@@ -1410,7 +1478,7 @@ RuntimeDyldELF::processRelocationRef(
-     } else {
-       processSimpleRelocation(SectionID, Offset, RelType, Value);
-     }
-  
-+
-   } else if (Arch == Triple::ppc64 || Arch == Triple::ppc64le) {
-     if (RelType == ELF::R_PPC64_REL24) {
-       // Determine ABI variant in use for this object.
@@ -1632,7 +1700,7 @@ RuntimeDyldELF::processRelocationRef(
       // equivalent to the usual PLT implementation except that we use the stub
       // mechanism in RuntimeDyld (which puts stubs at the end of the section)
@ -1819,18 +1781,10 @@ index a0f9a857e3c..be32963b705 100644
         assert((cast<MCFillFragment>(F).getValue() == 0) &&
                "Invalid fill in virtual section!");
 diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp
-index 0e0ea965d14..0044566d9ab 100644
+index 0e0ea965d14..49885269d06 100644
 --- a/lib/MC/MCDwarf.cpp
 +++ b/lib/MC/MCDwarf.cpp
-@@ -41,6 +41,7 @@
- #include <cassert>
- #include <cstdint>
- #include <string>
-+#include <tuple>
- #include <utility>
- #include <vector>
- 
-@@ -156,12 +157,36 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section,
+@@ -156,12 +156,36 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section,
   unsigned Flags = DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0;
   unsigned Isa = 0;
   unsigned Discriminator = 0;
@ -1868,7 +1822,7 @@ index 0e0ea965d14..0044566d9ab 100644
     if (FileNum != LineEntry.getFileNum()) {
       FileNum = LineEntry.getFileNum();
       MCOS->EmitIntValue(dwarf::DW_LNS_set_file, 1);
-@@ -197,18 +222,33 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section,
+@@ -197,18 +221,33 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section,
     if (LineEntry.getFlags() & DWARF2_FLAG_EPILOGUE_BEGIN)
       MCOS->EmitIntValue(dwarf::DW_LNS_set_epilogue_begin, 1);
 
@ -1910,7 +1864,7 @@ index 0e0ea965d14..0044566d9ab 100644
   }
 
   // Emit a DW_LNE_end_sequence for the end of the section.
-@@ -250,7 +290,7 @@ void MCDwarfLineTable::Emit(MCObjectStreamer *MCOS,
+@@ -250,7 +289,7 @@ void MCDwarfLineTable::Emit(MCObjectStreamer *MCOS,
   MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfLineSection());
 
   // Handle the rest of the Compile Units.
@ -1919,16 +1873,7 @@ index 0e0ea965d14..0044566d9ab 100644
     CUIDTablePair.second.EmitCU(MCOS, Params, LineStr);
 
   if (LineStr)
-@@ -484,7 +524,7 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params,
- 
-   // Parameters of the state machine, are next.
-   MCOS->EmitIntValue(context.getAsmInfo()->getMinInstAlignment(), 1);
-  // maximum_operations_per_instruction 
-+  // maximum_operations_per_instruction
-   // For non-VLIW architectures this field is always 1.
-   // FIXME: VLIW architectures need to update this field accordingly.
-   if (LineTableVersion >= 4)
-@@ -514,8 +554,12 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params,
+@@ -514,8 +553,12 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params,
 
 void MCDwarfLineTable::EmitCU(MCObjectStreamer *MCOS,
                               MCDwarfLineTableParams Params,
@ -1943,7 +1888,7 @@ index 0e0ea965d14..0044566d9ab 100644
 
   // Put out the line tables.
   for (const auto &LineSec : MCLineSections.getMCLineEntries())
-@@ -1253,12 +1297,217 @@ public:
+@@ -1253,12 +1296,217 @@ public:
   void EmitCFIInstruction(const MCCFIInstruction &Instr);
 };
 
@ -2161,7 +2106,7 @@ index 0e0ea965d14..0044566d9ab 100644
 void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
   int dataAlignmentFactor = getDataAlignmentFactor(Streamer);
   auto *MRI = Streamer.getContext().getRegisterInfo();
-@@ -1373,7 +1622,28 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
+@@ -1373,7 +1621,28 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
     Streamer.EmitIntValue(dwarf::DW_CFA_GNU_args_size, 1);
     Streamer.EmitULEB128IntValue(Instr.getOffset());
     return;
@ -2286,7 +2231,7 @@ index 0a684588110..58199c97420 100644
                                          unsigned char Value,
                                          SMLoc Loc) {
 diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp
-index 776569894a5..0954b70df49 100644
+index 776569894a5..aa130bb2d6a 100644
 --- a/lib/MC/MCStreamer.cpp
 +++ b/lib/MC/MCStreamer.cpp
@@ -85,11 +85,15 @@ void MCStreamer::reset() {
@ -2329,15 +2274,6 @@ index 776569894a5..0954b70df49 100644
 }
 
 void MCStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
-@@ -513,7 +524,7 @@ void MCStreamer::EmitCFIEscape(StringRef Values) {
- 
- void MCStreamer::EmitCFIGnuArgsSize(int64_t Size) {
-   MCSymbol *Label = EmitCFILabel();
-  MCCFIInstruction Instruction = 
-+  MCCFIInstruction Instruction =
-     MCCFIInstruction::createGnuArgsSize(Label, Size);
-   MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
-   if (!CurFrame)
@@ -884,6 +895,14 @@ void MCStreamer::visitUsedExpr(const MCExpr &Expr) {
   }
 }
@ -2363,16 +2299,10 @@ index 776569894a5..0954b70df49 100644
                                    SMLoc Loc) {}
 void MCStreamer::EmitBundleAlignMode(unsigned AlignPow2) {}
 diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp
-index b544fa5c147..746c9f32865 100644
+index b544fa5c147..c885bf9f037 100644
 --- a/lib/Object/COFFObjectFile.cpp
 +++ b/lib/Object/COFFObjectFile.cpp
-@@ -339,11 +339,16 @@ unsigned COFFObjectFile::getSectionID(SectionRef Sec) const {
- 
- bool COFFObjectFile::isSectionVirtual(DataRefImpl Ref) const {
-   const coff_section *Sec = toSec(Ref);
-  // In COFF, a virtual section won't have any in-file 
-+  // In COFF, a virtual section won't have any in-file
-   // content, so the file pointer to the content will be zero.
+@@ -344,6 +344,11 @@ bool COFFObjectFile::isSectionVirtual(DataRefImpl Ref) const {
   return Sec->PointerToRawData == 0;
 }
 
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@ -0,0 +1,12 @@
+cmake_minimum_required(VERSION 3.1.0)
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+project(libbolt_rt_project)
+
+add_library(bolt_rt STATIC
+  instr.cpp
+  )
+
+install(TARGETS bolt_rt DESTINATION lib)
--- a/runtime/instr.cpp
+++ b/runtime/instr.cpp
@ -0,0 +1,285 @@
+//===-- instr.cpp -----------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// This file contains code that is linked to the final binary with a function
+// that is called at program exit to dump instrumented data collected during
+// execution.
+//
+//===----------------------------------------------------------------------===//
+//
+// BOLT runtime instrumentation library for x86 Linux.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cstdint>
+#include <elf.h>
+
+// All extern declarations here need to be defined by BOLT itself.
+
+// Counters inserted by instrumentation, incremented during runtime when
+// points of interest (locations) in the program are reached.
+extern uint64_t __bolt_instr_locations[];
+// Number of counters.
+extern uint32_t __bolt_instr_num_locs;
+// Filename to dump data to.
+extern char __bolt_instr_filename[];
+
+// A location is a function name plus offset. Function name needs to be
+// retrieved from the string table and is stored as an index to this table.
+struct Location {
+  uint32_t FunctionName; // Index into the string table where the name starts
+  uint32_t Offset;       // Offset that, with the function name, forms the location
+};
+
+// An edge description defines an instrumented edge in the program, fully
+// identified by where the jump is located and its destination.
+struct EdgeDescription {
+  Location From; // Where the jump is located
+  Location To;   // Destination of the jump
+};
+
+// These need to be read from disk. They are generated by BOLT and written to
+// an ELF note section in the binary itself.
+struct InstrumentationInfo {
+  EdgeDescription *Descriptions; // Edge table, indexed like the counters array
+  char *Strings;    // String table with function names used in this binary
+  int FileDesc;     // File descriptor for the file on disk backing this
+                    // information in memory via mmap
+  uint8_t *MMapPtr; // Start of the mapping; handed back to munmap at the end
+  int MMapSize;     // Size of the mapping in bytes, as passed to munmap
+};
+
+// Declare some syscall wrappers we use throughout this code to avoid linking
+// against system libc.
+static uint64_t
+__open(const char *pathname,
+       uint64_t flags,
+       uint64_t mode) {
+  uint64_t ret;
+  __asm__ __volatile__ (
+          "movq $2, %%rax\n" // 2 is the open syscall number on x86-64 Linux
+          "syscall"
+          : "=a"(ret)
+          : "D"(pathname), "S"(flags), "d"(mode)
+          : "cc", "rcx", "r11", "memory"); // syscall overwrites rcx and r11
+  return ret; // raw kernel result, returned unmodified
+}
+
+static uint64_t __write(uint64_t fd, const void *buf, uint64_t count) {
+  uint64_t ret;
+  __asm__ __volatile__ (
+          "movq $1, %%rax\n" // 1 is the write syscall number on x86-64 Linux
+          "syscall\n"
+          : "=a"(ret)
+          : "D"(fd), "S"(buf), "d"(count)
+          : "cc", "rcx", "r11", "memory"); // syscall overwrites rcx and r11
+  return ret; // raw kernel result, returned unmodified
+}
+
+static uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) {
+  uint64_t ret;
+  __asm__ __volatile__ (
+          "movq $8, %%rax\n" // 8 is the lseek syscall number on x86-64 Linux
+          "syscall\n"
+          : "=a"(ret)
+          : "D"(fd), "S"(pos), "d"(whence)
+          : "cc", "rcx", "r11", "memory"); // syscall overwrites rcx and r11
+  return ret; // raw kernel result, returned unmodified
+}
+
+static int __close(uint64_t fd) {
+  uint64_t ret;
+  __asm__ __volatile__ (
+          "movq $3, %%rax\n" // 3 is the close syscall number on x86-64 Linux
+          "syscall\n"
+          : "=a"(ret)
+          : "D"(fd)
+          : "cc", "rcx", "r11", "memory"); // syscall overwrites rcx and r11
+  return ret; // 64-bit kernel result narrowed to int
+}
+
+static void *__mmap(uint64_t addr, uint64_t size, uint64_t prot,
+                    uint64_t flags, uint64_t fd, uint64_t offset) {
+  void *ret;
+  register uint64_t r8 asm("r8") = fd;       // 5th syscall argument
+  register uint64_t r9 asm("r9") = offset;   // 6th syscall argument
+  register uint64_t r10 asm("r10") = flags;  // 4th syscall argument
+  __asm__ __volatile__ (
+          "movq $9, %%rax\n" // 9 is the mmap syscall number on x86-64 Linux
+          "syscall\n"
+          : "=a"(ret)
+          : "D"(addr), "S"(size), "d"(prot), "r"(r10), "r"(r8), "r"(r9)
+          : "cc", "rcx", "r11", "memory"); // syscall overwrites rcx and r11
+  return ret; // raw kernel result, returned unmodified
+}
+
+static uint64_t __munmap(void *addr, uint64_t size) {
+  uint64_t ret;
+  __asm__ __volatile__ (
+          "movq $11, %%rax\n" // 11 is the munmap syscall number on x86-64 Linux
+          "syscall\n"
+          : "=a"(ret)
+          : "D"(addr), "S"(size)
+          : "cc", "rcx", "r11", "memory"); // syscall overwrites rcx and r11
+  return ret; // raw kernel result, returned unmodified
+}
+
+static uint64_t __exit(uint64_t code) {
+  uint64_t ret;
+  __asm__ __volatile__ (
+          "movq $231, %%rax\n" // 231 is exit_group (not exit) on x86-64 Linux
+          "syscall\n"
+          : "=a"(ret)
+          : "D"(code)
+          : "cc", "rcx", "r11", "memory"); // syscall overwrites rcx and r11
+  return ret; // exit_group does not return on success
+}
+
+// Helper functions for writing strings to the .fdata file
+
+// Write number Num in base Base to OutBuf (no NUL is appended); returns a
+// pointer past the last digit written. Base must not exceed 16.
+static char *intToStr(char *OutBuf, uint32_t Num, uint32_t Base) {
+  const char *Chars = "0123456789abcdef";
+  char Buf[20]; // enough for the bases used in this file (10 and 16)
+  char *Ptr = Buf;
+  while (Num) { // digits are produced least-significant first
+    *Ptr++ = *(Chars + (Num % Base));
+    Num /= Base;
+  }
+  if (Ptr == Buf) { // Num was zero, so no digit was produced above
+    *OutBuf++ = '0';
+    return OutBuf;
+  }
+  while (Ptr != Buf) { // copy back in reverse to get most-significant first
+    *OutBuf++ = *--Ptr;
+  }
+  return OutBuf;
+}
+
+// Copy Str to OutBuf without its NUL; returns a pointer past the last copy.
+static char *strCopy(char *OutBuf, const char *Str) {
+  while (*Str) // stops at the terminator, which is not copied
+    *OutBuf++ = *Str++;
+  return OutBuf;
+}
+
+// Write Size bytes of Msg to stderr and terminate with exit code 1.
+static void reportError(const char *Msg, uint64_t Size) {
+  __write(2, Msg, Size); // file descriptor 2 is stderr
+  __exit(1);
+}
+
+// Compare Str1 with Str2; returns zero if they match and 1 otherwise. Compares
+// at most Size characters and stops early at a NUL common to both strings.
+static int compareStr(const char *Str1, const char *Str2, int Size) {
+  while (*Str1 == *Str2) {
+    if (*Str1 == '\0' || --Size == 0) // both ended, or Size chars matched
+      return 0;
+    ++Str1;
+    ++Str2;
+  }
+  return 1; // first differing character found
+}
+
+// Write to OutBuf a textual identifier for the program point Loc: "1 ", the
+// function name from Info.Strings and the hex offset, each followed by a space.
+static char *serializeLoc(const InstrumentationInfo &Info, char *OutBuf,
+                          const Location Loc) {
+  // fdata location format: Type Name Offset
+  // Type 1 - regular symbol
+  OutBuf = strCopy(OutBuf, "1 ");
+  const char *Str = Info.Strings + Loc.FunctionName;
+  while (*Str) { // copy the NUL-terminated name; length is not bounded here
+    *OutBuf++ = *Str++;
+  }
+  *OutBuf++ = ' ';
+  OutBuf = intToStr(OutBuf, Loc.Offset, 16);
+  *OutBuf++ = ' ';
+  return OutBuf; // one past the trailing space; no NUL is written
+}
+
+// Read and map to memory the descriptions written by BOLT into the executable's
+// notes section.
+//
+// Opens /proc/self/exe, maps the whole file read-only and walks the ELF section
+// headers looking for .bolt.instr.tables. On success the returned
+// InstrumentationInfo points into that mapping; the mapping and the file
+// descriptor are left open for the caller to release. If the file cannot be
+// opened or mapped, or the section is missing, a message is written to stderr
+// and the process is terminated through reportError().
+static InstrumentationInfo readDescriptions() {
+  InstrumentationInfo Result;
+  uint64_t FD = __open("/proc/self/exe",
+                       /*flags=*/0 /*O_RDONLY*/,
+                       /*mode=*/0666);
+  // The raw syscall wrappers report failure as a negated errno value.
+  if (static_cast<int64_t>(FD) < 0) {
+    const char ErrMsg[] =
+        "BOLT instrumentation runtime error: could not open /proc/self/exe\n";
+    reportError(ErrMsg, sizeof(ErrMsg) - 1);
+  }
+  Result.FileDesc = FD;
+
+  // mmap our binary to memory
+  uint64_t Size = __lseek(FD, 0, 2 /*SEEK_END*/);
+  uint8_t *BinContents = reinterpret_cast<uint8_t *>(
+      __mmap(0, Size, 0x1 /* PROT_READ*/, 0x2 /* MAP_PRIVATE*/, FD, 0));
+  // Error results from the kernel lie in [-4095, -1]; anything in that range
+  // is not a usable mapping and must not be dereferenced below.
+  const uint64_t MapAddr = reinterpret_cast<uint64_t>(BinContents);
+  if (MapAddr >= static_cast<uint64_t>(-4095)) {
+    const char ErrMsg[] =
+        "BOLT instrumentation runtime error: could not map /proc/self/exe\n";
+    reportError(ErrMsg, sizeof(ErrMsg) - 1);
+  }
+  Result.MMapPtr = BinContents;
+  Result.MMapSize = Size;
+  Elf64_Ehdr *Hdr = reinterpret_cast<Elf64_Ehdr *>(BinContents);
+  // Header of the section that holds the section names.
+  Elf64_Shdr *StringTblHeader = reinterpret_cast<Elf64_Shdr *>(
+      BinContents + Hdr->e_shoff + Hdr->e_shstrndx * Hdr->e_shentsize);
+
+  // Find .bolt.instr.tables with the data we need and set pointers to it
+  for (int I = 0; I < Hdr->e_shnum; ++I) {
+    Elf64_Shdr *Shdr = reinterpret_cast<Elf64_Shdr *>(
+        BinContents + Hdr->e_shoff + I * Hdr->e_shentsize);
+    char *SecName = reinterpret_cast<char *>(
+        BinContents + StringTblHeader->sh_offset + Shdr->sh_name);
+    if (compareStr(SecName, ".bolt.instr.tables", 64) != 0)
+      continue;
+
+    // Actual contents of the ELF note start after offset 20 decimal:
+    //  Offset 0: Producer name size (4 bytes)
+    //  Offset 4: Contents size (4 bytes)
+    //  Offset 8: Note type (4 bytes)
+    //  Offset 12: Producer name (BOLT\0) (5 bytes + align to 4-byte boundary)
+    //  Offset 20: Contents
+    Result.Descriptions =
+        reinterpret_cast<EdgeDescription *>(BinContents + Shdr->sh_offset + 20);
+    // String table is located after the full EdgeDescriptions table containing
+    // __bolt_instr_num_locs entries is finished
+    Result.Strings = reinterpret_cast<char *>(
+        BinContents + Shdr->sh_offset + 20 +
+        (__bolt_instr_num_locs * sizeof(EdgeDescription)));
+    return Result;
+  }
+  const char ErrMsg[] =
+      "BOLT instrumentation runtime error: could not find section "
+      ".bolt.instr.tables\n";
+  // sizeof counts the terminating NUL, which must not be written to stderr.
+  reportError(ErrMsg, sizeof(ErrMsg) - 1);
+  return Result;
+}
+
+// Write Num in decimal to OutBuf and return a pointer past the last digit. No
+// NUL terminator is written. Unlike intToStr this takes the full 64 bits, which
+// is the width of the instrumentation counters.
+static char *counterToStr(char *OutBuf, uint64_t Num) {
+  char Buf[20]; // UINT64_MAX has 20 decimal digits
+  char *Ptr = Buf;
+  // Digits come out least-significant first, so stage them and copy reversed.
+  do {
+    *Ptr++ = static_cast<char>('0' + Num % 10);
+    Num /= 10;
+  } while (Num);
+  while (Ptr != Buf)
+    *OutBuf++ = *--Ptr;
+  return OutBuf;
+}
+
+// This is the entry point called at program exit. BOLT patches the executable's
+// FINI entry in the .dynamic section with the address of this function. Our
+// goal here is to flush to disk all instrumentation data in memory, using
+// BOLT's fdata format.
+//
+// One line is written per edge with a non-zero counter:
+//   <from location> <to location> 0 <hit count>
+// Counters are read and printed at their full 64-bit width.
+extern "C" void __bolt_instr_data_dump() {
+  const InstrumentationInfo Info = readDescriptions();
+
+  uint64_t FD = __open(__bolt_instr_filename,
+                       /*flags=*/0x241 /*O_WRONLY|O_TRUNC|O_CREAT*/,
+                       /*mode=*/0666);
+
+  for (uint32_t I = 0, E = __bolt_instr_num_locs; I < E; ++I) {
+    // NOTE(review): nothing bounds the function names copied into this buffer;
+    // two very long (e.g. mangled C++) names could exceed it - confirm.
+    char LineBuf[2000];
+    char *Ptr = LineBuf;
+    const uint64_t HitCount = __bolt_instr_locations[I];
+    if (!HitCount)
+      continue;
+
+    EdgeDescription *Desc = &Info.Descriptions[I];
+    Ptr = serializeLoc(Info, Ptr, Desc->From);
+    Ptr = serializeLoc(Info, Ptr, Desc->To);
+    Ptr = strCopy(Ptr, "0 ");
+    Ptr = counterToStr(Ptr, HitCount);
+    *Ptr++ = '\n';
+    __write(FD, LineBuf, Ptr - LineBuf);
+  }
+  __close(FD);
+  __munmap(Info.MMapPtr, Info.MMapSize);
+  __close(Info.FileDesc);
+}
--- a/src/BinaryBasicBlock.cpp
+++ b/src/BinaryBasicBlock.cpp
@ -12,6 +12,7 @@
 #include "BinaryBasicBlock.h"
 #include "BinaryContext.h"
 #include "BinaryFunction.h"
+#include "ParallelUtilities.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
@ -96,6 +97,10 @@ bool BinaryBasicBlock::validateSuccessorInvariants() {
      }
    }
  } else {
+    // Unknown control flow.
+    if (Inst && BC.MIB->isIndirectBranch(*Inst))
+      return true;
+
    const MCSymbol *TBB = nullptr;
    const MCSymbol *FBB = nullptr;
    MCInst *CondBranch = nullptr;
@ -255,7 +260,7 @@ void BinaryBasicBlock::replaceSuccessor(BinaryBasicBlock *Succ,
                                        BinaryBasicBlock *NewSucc,
                                        uint64_t Count,
                                        uint64_t MispredictedCount) {
-  Succ->removePredecessor(this);
+  Succ->removePredecessor(this, /*Multiple=*/false);
  auto I = succ_begin();
  auto BI = BranchInfo.begin();
  for (; I != succ_end(); ++I) {
@ -280,7 +285,7 @@ void BinaryBasicBlock::removeAllSuccessors() {
 }

 void BinaryBasicBlock::removeSuccessor(BinaryBasicBlock *Succ) {
-  Succ->removePredecessor(this);
+  Succ->removePredecessor(this, /*Multiple=*/false);
  auto I = succ_begin();
  auto BI = BranchInfo.begin();
  for (; I != succ_end(); ++I) {
@ -299,13 +304,16 @@ void BinaryBasicBlock::addPredecessor(BinaryBasicBlock *Pred) {
  Predecessors.push_back(Pred);
 }

-void BinaryBasicBlock::removePredecessor(BinaryBasicBlock *Pred) {
+void BinaryBasicBlock::removePredecessor(BinaryBasicBlock *Pred,
+                                         bool Multiple) {
  // Note: the predecessor could be listed multiple times.
  bool Erased{false};
  for (auto PredI = Predecessors.begin(); PredI != Predecessors.end(); ) {
    if (*PredI == Pred) {
      Erased = true;
      PredI = Predecessors.erase(PredI);
+      if (!Multiple)
+        return;
    } else {
      ++PredI;
    }
@ -448,6 +456,7 @@ void BinaryBasicBlock::addBranchInstruction(const BinaryBasicBlock *Successor) {
  assert(isSuccessor(Successor));
  auto &BC = Function->getBinaryContext();
  MCInst NewInst;
+  std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
  BC.MIB->createUncondBranch(NewInst, Successor->getLabel(), BC.Ctx.get());
  Instructions.emplace_back(std::move(NewInst));
 }
@ -530,8 +539,8 @@ void BinaryBasicBlock::dump() const {
  outs() << "\n";
 }

-uint64_t BinaryBasicBlock::estimateSize() const {
-  return Function->getBinaryContext().computeCodeSize(begin(), end());
+uint64_t BinaryBasicBlock::estimateSize(const MCCodeEmitter *Emitter) const {
+  return Function->getBinaryContext().computeCodeSize(begin(), end(), Emitter);
 }

 BinaryBasicBlock::BinaryBranchInfo &
--- a/src/BinaryBasicBlock.h
+++ b/src/BinaryBasicBlock.h
@ -16,14 +16,15 @@

 #include "llvm/ADT/GraphTraits.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/raw_ostream.h"
 #include <limits>
-#include <utility>
 #include <set>
+#include <utility>

 namespace llvm {

@ -49,6 +50,12 @@ public:
  struct BinaryBranchInfo {
    uint64_t Count;
    uint64_t MispredictedCount; /// number of branches mispredicted
+    /// Order by Count, breaking ties by MispredictedCount.
+    bool operator<(const BinaryBranchInfo &Other) const {
+      return (Count < Other.Count) ||
+             (Count == Other.Count &&
+              MispredictedCount < Other.MispredictedCount);
+    }
  };

  static constexpr uint32_t INVALID_OFFSET =
@ -358,13 +365,17 @@ public:

  /// Find the fallthrough successor for a block, or nullptr if there is
  /// none.
-  const BinaryBasicBlock* getFallthrough() const {
+  BinaryBasicBlock* getFallthrough() {
    if (succ_size() == 2)
      return getConditionalSuccessor(false);
    else
      return getSuccessor();
  }

+  const BinaryBasicBlock *getFallthrough() const {
+    return const_cast<BinaryBasicBlock *>(this)->getFallthrough();
+  }
+
  /// Return branch info corresponding to a taken branch.
  const BinaryBranchInfo &getTakenBranchInfo() const {
    assert(BranchInfo.size() == 2 &&
@ -450,6 +461,13 @@ public:
    }
  }

+  /// Append each instruction in range \p R, in order, via addInstruction().
+  template <typename RangeTy>
+  void addInstructions(RangeTy R) {
+    for (auto &I : R)
+      addInstruction(I);
+  }
+
  /// Add instruction before Pos in this basic block.
  template <typename Itr>
  Itr insertPseudoInstr(Itr Pos, MCInst &Instr) {
@ -740,6 +758,11 @@ public:
    return Instructions.emplace(At, std::move(NewInst));
  }

+  iterator insertInstruction(iterator At, MCInst &NewInst) {
+    adjustNumPseudos(NewInst, 1);
+    return Instructions.emplace(At, NewInst); // NewInst is copied, not moved
+  }
+
  /// Helper to retrieve any terminators in \p BB before \p Pos. This is used
  /// to skip CFI instructions and to retrieve the first terminator instruction
  /// in basic blocks with two terminators (conditional jump and unconditional
@ -848,8 +871,11 @@ public:
    return InputRange.second - InputRange.first;
  }

-  /// Returns an estimate of size of basic block during run time.
-  uint64_t estimateSize() const;
+  /// Returns an estimate of size of basic block during run time optionally
+  /// using a user-supplied emitter for lock-free multi-thread work.
+  /// MCCodeEmitter is not thread safe and each thread should operate with its
+  /// own copy of it.
+  uint64_t estimateSize(const MCCodeEmitter *Emitter = nullptr) const;

  /// Return index in the current layout. The user is responsible for
  /// making sure the indices are up to date,
@ -884,7 +910,10 @@ private:

  /// Remove predecessor of the basic block. Don't use directly, instead
  /// use removeSuccessor() function.
-  void removePredecessor(BinaryBasicBlock *Pred);
+  /// If \p Multiple is set to true, it will remove all predecessors that
+  /// are equal to \p Pred. Otherwise, the first instance of \p Pred found
+  /// will be removed. This only matters in awkward, redundant CFGs.
+  void removePredecessor(BinaryBasicBlock *Pred, bool Multiple=true);

  /// Return offset of the basic block from the function start.
  uint32_t getOffset() const {
--- a/src/BinaryContext.cpp
+++ b/src/BinaryContext.cpp
--- a/src/BinaryContext.h
+++ b/src/BinaryContext.h
@ -17,8 +17,10 @@
 #include "BinaryData.h"
 #include "BinarySection.h"
 #include "DebugData.h"
+#include "JumpTable.h"
 #include "MCPlusBuilder.h"
 #include "llvm/ADT/iterator.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
@ -32,6 +34,7 @@
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Object/ObjectFile.h"
@ -41,8 +44,10 @@
 #include <functional>
 #include <map>
 #include <set>
+#include <shared_mutex>
 #include <string>
 #include <system_error>
+#include <type_traits>
 #include <unordered_map>
 #include <vector>

@ -55,8 +60,21 @@ using namespace object;
 namespace bolt {

 class BinaryFunction;
+class BinaryBasicBlock;
 class DataReader;

+enum class MemoryContentsType : char {
+  UNKNOWN = 0,              ///< Unknown contents.
+  POSSIBLE_JUMP_TABLE,      ///< Possibly a non-PIC jump table.
+  POSSIBLE_PIC_JUMP_TABLE,  ///< Possibly a PIC jump table.
+};
+
+/// Free memory allocated for \p List by swapping its contents into a
+/// default-constructed temporary that is destroyed immediately.
+template <typename T> void clearList(T &List) {
+  T().swap(List);
+}
+
 /// Helper function to truncate a \p Value to given size in \p Bytes.
 inline int64_t truncateToSize(int64_t Value, unsigned Bytes) {
  return Value & ((uint64_t) (int64_t) -1 >> (64 - Bytes * 8));
@ -137,9 +155,23 @@ class BinaryContext {
  /// Low level section registration.
  BinarySection &registerSection(BinarySection *Section);

+  /// Store all functions in the binary, sorted by original address.
+  std::map<uint64_t, BinaryFunction> BinaryFunctions;
+
+  /// A mutex that is used to control parallel accesses to BinaryFunctions
+  mutable std::shared_timed_mutex BinaryFunctionsMutex;
+
  /// Functions injected by BOLT
  std::vector<BinaryFunction *> InjectedBinaryFunctions;

+  /// Jump tables for all functions mapped by address.
+  std::map<uint64_t, JumpTable *> JumpTables;
+
+  /// Used in duplicateJumpTable() to uniquely identify a JT clone
+  /// Start our IDs with a high number so getJumpTableContainingAddress checks
+  /// with size won't overflow
+  uint32_t DuplicatedJumpTables{0x10000000};
+
 public:
  /// [name] -> [BinaryData*] map used for global symbol resolution.
  using SymbolMapType = std::map<std::string, BinaryData *>;
@ -160,6 +192,58 @@ public:
    FilterIterator<binary_data_const_iterator>;
  using FilteredBinaryDataIterator = FilterIterator<binary_data_iterator>;

+  /// Return BinaryFunction containing a given \p Address or nullptr if
+  /// no registered function has it.
+  ///
+  /// In a binary, a function has somewhat vague boundaries. E.g. a function can
+  /// refer to the first byte past the end of the function, and it will still be
+  /// referring to this function, not the function following it in the address
+  /// space. Thus we provide the following flags, which let a caller that has
+  /// more context about the search refine the lookup.
+  ///
+  /// If \p CheckPastEnd is true and the \p Address falls on a byte
+  /// immediately following the last byte of some function and there's no other
+  /// function that starts there, then return the function as the one containing
+  /// the \p Address. This is useful when we need to locate functions for
+  /// references pointing immediately past a function body.
+  ///
+  /// If \p UseMaxSize is true, then include the space between this function
+  /// body and the next object in address ranges that we check.
+  BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address,
+                                                     bool CheckPastEnd = false,
+                                                     bool UseMaxSize = false,
+                                                     bool Shallow = false);
+
+  /// Return BinaryFunction which has a fragment that starts at a given
+  /// \p Address. If the BinaryFunction is a child fragment, then return its
+  /// parent unless \p Shallow parameter is set to true.
+  BinaryFunction *getBinaryFunctionAtAddress(uint64_t Address,
+                                             bool Shallow = false);
+
+  /// Const overload of getBinaryFunctionAtAddress(); forwards to the non-const
+  /// version and returns its result as a pointer to const.
+  const BinaryFunction *getBinaryFunctionAtAddress(uint64_t Address,
+                                                   bool Shallow = false) const {
+    auto *MutableThis = const_cast<BinaryContext *>(this);
+    return MutableThis->getBinaryFunctionAtAddress(Address, Shallow);
+  }
+
+  /// Return the size of a single entry of a jump table of the given \p Type:
+  /// 4 for JumpTable::JTT_PIC, the code pointer size reported by AsmInfo
+  /// otherwise.
+  uint64_t getJumpTableEntrySize(JumpTable::JumpTableType Type) const {
+    if (Type == JumpTable::JTT_PIC)
+      return 4;
+    return AsmInfo->getCodePointerSize();
+  }
+
+  /// Return the JumpTable whose address range contains \p Address, or nullptr
+  /// if there is none. A table of size zero is treated as containing only its
+  /// own start address.
+  JumpTable *getJumpTableContainingAddress(uint64_t Address) {
+    // Step back from the first table starting past Address to the last table
+    // starting at or before it; no such table exists if we are at begin().
+    auto It = JumpTables.upper_bound(Address);
+    if (It == JumpTables.begin())
+      return nullptr;
+    --It;
+
+    const uint64_t Start = It->first;
+    JumpTable *JT = It->second;
+    const bool Contains =
+        Start + JT->getSize() > Address ||
+        (JT->getSize() == 0 && Start == Address);
+    return Contains ? JT : nullptr;
+  }
+
  /// [MCSymbol] -> [BinaryFunction]
  ///
  /// As we fold identical functions, multiple symbols can point
@ -167,6 +251,9 @@ public:
  std::unordered_map<const MCSymbol *,
                     BinaryFunction *> SymbolToFunctionMap;

+  /// A mutex that is used to control parallel accesses to SymbolToFunctionMap
+  mutable std::shared_timed_mutex SymbolToFunctionMapMutex;
+
  /// Look up the symbol entry that contains the given \p Address (based on
  /// the start address and size for each symbol).  Returns a pointer to
  /// the BinaryData for that symbol.  If no data is found, nullptr is returned.
@ -187,6 +274,10 @@ public:
  /// top level BinaryData.
  bool validateHoles() const;

+  /// Produce output address ranges based on input ranges for some module.
+  DebugAddressRangesVector translateModuleAddressRanges(
+      const DWARFAddressRangesVector &InputRanges) const;
+
  /// Get a bogus "absolute" section that will be associated with all
  /// absolute BinaryDatas.
  BinarySection &absoluteSection();
@ -202,6 +293,25 @@ public:
  /// is complete, e.g. after building CFGs for all functions.
  void assignMemData();

+  /// Construct BinaryFunction object and add it to internal maps.
+  BinaryFunction *createBinaryFunction(const std::string &Name,
+                                       BinarySection &Section,
+                                       uint64_t Address,
+                                       uint64_t Size,
+                                       bool IsSimple,
+                                       uint64_t SymbolSize = 0,
+                                       uint16_t Alignment = 0);
+
+  /// Return all functions in the binary, keyed by original address.
+  std::map<uint64_t, BinaryFunction> &getBinaryFunctions() {
+    return BinaryFunctions;
+  }
+
+  /// Const overload: return all functions in the binary as a read-only map.
+  const std::map<uint64_t, BinaryFunction> &getBinaryFunctions() const {
+    return BinaryFunctions;
+  }
+
  /// Create BOLT-injected function
  BinaryFunction *createInjectedBinaryFunction(const std::string &Name,
                                               bool IsSimple = true);
@ -210,7 +320,54 @@ public:
    return InjectedBinaryFunctions;
  }

-public:
+  /// Construct a jump table for \p Function at \p Address or return an existing
+  /// one at that location.
+  ///
+  /// May create an embedded jump table and return its label.
+  const MCSymbol *getOrCreateJumpTable(BinaryFunction &Function,
+                                       uint64_t Address,
+                                       JumpTable::JumpTableType Type);
+
+  /// Analyze a possible jump table of type \p Type at a given \p Address.
+  /// \p BF is a function referencing the jump table.
+  /// Return true if the jump table was detected at \p Address, and false
+  /// otherwise.
+  ///
+  /// If \p NextJTAddress is different from zero, it is used as an upper
+  /// bound for jump table memory layout.
+  ///
+  /// Optionally, populate \p Offsets with jump table entries. The entries
+  /// could be partially populated if the jump table detection fails.
+  bool analyzeJumpTable(const uint64_t Address,
+                        const JumpTable::JumpTableType Type,
+                        const BinaryFunction &BF,
+                        const uint64_t NextJTAddress = 0,
+                        JumpTable::OffsetsType *Offsets = nullptr);
+
+  /// After jump table locations are established, this function will populate
+  /// their OffsetEntries based on memory contents.
+  void populateJumpTables();
+
+  /// Returns a jump table ID and label pointing to the duplicated jump table.
+  /// Ordinarily, jump tables are identified by their address in the input
+  /// binary. We return an ID with the high bit set to differentiate it from
+  /// regular addresses, avoiding conflicts with standard jump tables.
+  std::pair<uint64_t, const MCSymbol *>
+  duplicateJumpTable(BinaryFunction &Function, JumpTable *JT,
+                     const MCSymbol *OldLabel);
+
+  /// Generate a unique name for jump table at a given \p Address belonging
+  /// to function \p BF.
+  std::string generateJumpTableName(const BinaryFunction &BF, uint64_t Address);
+
+  /// Return true if the array of bytes represents a valid code padding.
+  bool hasValidCodePadding(const BinaryFunction &BF);
+
+  /// Verify padding area between functions, and adjust max function size
+  /// accordingly.
+  void adjustCodePadding();
+
  /// Regular page size.
  static constexpr unsigned RegularPageSize = 0x1000;

@ -220,13 +377,20 @@ public:
  /// Map address to a constant island owner (constant data in code section)
  std::map<uint64_t, BinaryFunction *> AddressToConstantIslandMap;

+  /// A map from jump table address to insertion order.  Used for generating
+  /// jump table names.
+  std::map<uint64_t, size_t> JumpTableIds;
+
  /// Set of addresses in the code that are not a function start, and are
  /// referenced from outside of containing function. E.g. this could happen
  /// when a function has more than a single entry point.
-  std::set<uint64_t> InterproceduralReferences;
+  std::set<std::pair<BinaryFunction *, uint64_t>> InterproceduralReferences;

  std::unique_ptr<MCContext> Ctx;

+  /// A mutex that is used to control parallel accesses to Ctx
+  mutable std::shared_timed_mutex CtxMutex;
+
  std::unique_ptr<DWARFContext> DwCtx;

  std::unique_ptr<Triple> TheTriple;
@ -300,6 +464,9 @@ public:
  /// List of functions that always trap.
  std::vector<const BinaryFunction *> TrappedFunctions;

+  /// Map SDT locations to SDT markers info
+  std::unordered_map<uint64_t, SDTMarkerInfo> SDTMarkers;
+
  BinaryContext(std::unique_ptr<MCContext> Ctx,
                std::unique_ptr<DWARFContext> DwCtx,
                std::unique_ptr<Triple> TheTriple,
@ -383,6 +550,25 @@ public:
    BinaryDataMap.clear();
  }

+  /// Process \p Address reference from code in function \p BF.
+  /// \p IsPCRel indicates if the reference is PC-relative.
+  /// Return <Symbol, Addend> pair corresponding to the \p Address.
+  std::pair<const MCSymbol *, uint64_t> handleAddressRef(uint64_t Address,
+                                                         BinaryFunction &BF,
+                                                         bool IsPCRel);
+
+  /// Analyze memory contents at the given \p Address and return the type of
+  /// memory contents (such as a possible jump table).
+  MemoryContentsType analyzeMemoryAt(uint64_t Address, BinaryFunction &BF);
+
+  /// Look up the global \p Symbol by name and return the address of its
+  /// BinaryData, or std::errc::bad_address if no BinaryData is registered
+  /// under that name.
+  ErrorOr<uint64_t> getSymbolValue(const MCSymbol &Symbol) const {
+    if (const auto *BD = getBinaryDataByName(Symbol.getName()))
+      return BD->getAddress();
+    return std::make_error_code(std::errc::bad_address);
+  }

  /// Return a global symbol registered at a given \p Address and \p Size.
  /// If no symbol exists, create one with unique name using \p Prefix.
@ -448,6 +634,65 @@ public:
    return Itr != GlobalSymbols.end() ? Itr->second : nullptr;
  }

+  /// Return true if \p Name starts with one of the prefixes used for symbols
+  /// that were generated internally ("SYMBOLat", "DATAat", "HOLEat") and hence
+  /// were not present in the input binary.
+  ///
+  /// The check reads no BinaryContext state, so it is usable on a const
+  /// context.
+  bool isInternalSymbolName(const StringRef Name) const {
+    return Name.startswith("SYMBOLat") ||
+           Name.startswith("DATAat") ||
+           Name.startswith("HOLEat");
+  }
+
+  /// Return the symbol named "__hot_start", obtained from Ctx via
+  /// getOrCreateSymbol().
+  MCSymbol *getHotTextStartSymbol() const {
+    return Ctx->getOrCreateSymbol("__hot_start");
+  }
+
+  /// Return the symbol named "__hot_end", obtained from Ctx via
+  /// getOrCreateSymbol().
+  MCSymbol *getHotTextEndSymbol() const {
+    return Ctx->getOrCreateSymbol("__hot_end");
+  }
+
+  /// Return the text section reported by MOFI.
+  MCSection *getTextSection() const {
+    return MOFI->getTextSection();
+  }
+
+  /// Return the code section named \p SectionName, requested from Ctx as an
+  /// ELF SHT_PROGBITS section flagged SHF_EXECINSTR | SHF_ALLOC.
+  MCSection *getCodeSection(StringRef SectionName) const {
+    const unsigned Flags = ELF::SHF_EXECINSTR | ELF::SHF_ALLOC;
+    return Ctx->getELFSection(SectionName, ELF::SHT_PROGBITS, Flags);
+  }
+
+  /// \name Pre-assigned Section Names
+  /// @{
+
+  /// Name of the main code section.
+  const char *getMainCodeSectionName() const {
+    return ".text";
+  }
+
+  /// Name of the cold code section.
+  const char *getColdCodeSectionName() const {
+    return ".text.cold";
+  }
+
+  /// Name of the hot text mover section.
+  const char *getHotTextMoverSectionName() const {
+    return ".text.mover";
+  }
+
+  /// Name of the section for injected code.
+  const char *getInjectedCodeSectionName() const {
+    return ".text.injected";
+  }
+
+  /// Name of the section for injected cold code.
+  const char *getInjectedColdCodeSectionName() const {
+    return ".text.injected.cold";
+  }
+
+  /// Return the unique section named ".gdb_index", or an error object if
+  /// getUniqueSectionByName() cannot resolve that name to a single section.
+  ErrorOr<BinarySection &> getGdbIndexSection() const {
+    return getUniqueSectionByName(".gdb_index");
+  }
+
+  /// @}
+
+  /// Resolve inter-procedural dependencies.
+  void processInterproceduralReferences();
+
  /// Perform any necessary post processing on the symbol table after
  /// function disassembly is complete.  This processing fixes top
  /// level data holes and makes sure the symbol table is valid.
@ -535,6 +780,19 @@ public:
                                              Sections.end()));
  }

+  /// Iterate over all registered sections that are both allocatable and text.
+  iterator_range<FilteredSectionIterator> textSections() {
+    auto IsAllocatableText = [](const SectionIterator &Itr) {
+      return *Itr && Itr->isAllocatable() && Itr->isText();
+    };
+    FilteredSectionIterator First(IsAllocatableText, Sections.begin(),
+                                  Sections.end());
+    FilteredSectionIterator Last(IsAllocatableText, Sections.end(),
+                                 Sections.end());
+    return make_range(First, Last);
+  }
+
  /// Iterate over all registered allocatable sections.
  iterator_range<FilteredSectionConstIterator> allocatableSections() const {
    return const_cast<BinaryContext *>(this)->allocatableSections();
@ -586,7 +844,9 @@ public:
  /// functions only work for allocatable sections, i.e. ones with non-zero
  /// addresses.
  ErrorOr<BinarySection &> getSectionForAddress(uint64_t Address);
-  ErrorOr<const BinarySection &> getSectionForAddress(uint64_t Address) const;
+  ErrorOr<const BinarySection &> getSectionForAddress(uint64_t Address) const {
+    // Reuse the non-const lookup; its result converts to a const reference.
+    auto *MutableThis = const_cast<BinaryContext *>(this);
+    return MutableThis->getSectionForAddress(Address);
+  }

  /// Return section(s) associated with given \p Name.
  iterator_range<NameToSectionMapType::iterator>
@ -598,18 +858,10 @@ public:
    return make_range(NameToSection.equal_range(Name));
  }

-  /// Return the unique (allocatable) section associated with given \p Name.
+  /// Return the unique section associated with given \p Name.
  /// If there is more than one section with the same name, return an error
  /// object.
-  ErrorOr<BinarySection &> getUniqueSectionByName(StringRef SectionName) {
-    auto Sections = getSectionByName(SectionName);
-    if (Sections.begin() != Sections.end() &&
-        std::next(Sections.begin()) == Sections.end())
-      return *Sections.begin()->second;
-    return std::make_error_code(std::errc::bad_address);
-  }
-  ErrorOr<const BinarySection &>
-  getUniqueSectionByName(StringRef SectionName) const {
+  ErrorOr<BinarySection &> getUniqueSectionByName(StringRef SectionName) const {
    auto Sections = getSectionByName(SectionName);
    if (Sections.begin() != Sections.end() &&
        std::next(Sections.begin()) == Sections.end())
@ -617,22 +869,38 @@ public:
    return std::make_error_code(std::errc::bad_address);
  }

-  /// Given \p Address in the binary, extract and return a pointer value at that
-  /// address. The address has to be a valid statically allocated address for
-  /// the binary.
-  ErrorOr<uint64_t> extractPointerAtAddress(uint64_t Address) const;
+  /// Return an unsigned value of \p Size stored at \p Address. The address has
+  /// to be a valid statically allocated address for the binary.
+  ErrorOr<uint64_t> getUnsignedValueAtAddress(uint64_t Address,
+                                              size_t Size) const;
+
+  /// Return a signed value of \p Size stored at \p Address. The address has
+  /// to be a valid statically allocated address for the binary.
+  ErrorOr<uint64_t> getSignedValueAtAddress(uint64_t Address,
+                                            size_t Size) const;
+
+  /// Read an unsigned value at \p Address whose width is the code pointer size
+  /// reported by AsmInfo; a special case of getUnsignedValueAtAddress().
+  ErrorOr<uint64_t> getPointerAtAddress(uint64_t Address) const {
+    const size_t PointerSize = AsmInfo->getCodePointerSize();
+    return getUnsignedValueAtAddress(Address, PointerSize);
+  }

  /// Replaces all references to \p ChildBF with \p ParentBF. \p ChildBF is then
  /// removed from the list of functions \p BFs. The profile data of \p ChildBF
-  /// is merged into that of \p ParentBF.
-  void foldFunction(BinaryFunction &ChildBF,
-                    BinaryFunction &ParentBF,
-                    std::map<uint64_t, BinaryFunction> &BFs);
+  /// is merged into that of \p ParentBF. This function is thread safe.
+  void foldFunction(BinaryFunction &ChildBF, BinaryFunction &ParentBF);

  /// Add a Section relocation at a given \p Address.
  void addRelocation(uint64_t Address, MCSymbol *Symbol, uint64_t Type,
                     uint64_t Addend = 0, uint64_t Value = 0);

+  /// All PC-relative relocations in data objects.
+  std::map<uint64_t, std::pair<uint64_t, uint64_t>> PCRelocation;
+
+  /// Record a PC-relative relocation of the given \p Type and \p Value at
+  /// \p Address in PCRelocation, replacing any entry already stored for that
+  /// address.
+  void addPCRelativeDataRelocation(uint64_t Address, uint64_t Type,
+                                   uint64_t Value) {
+    PCRelocation[Address] = {Type, Value};
+  }
+
  /// Remove registered relocation at a given \p Address.
  bool removeRelocationAt(uint64_t Address);

@ -640,12 +908,15 @@ public:
  /// is no relocation at such address.
  const Relocation *getRelocationAt(uint64_t Address);

+  /// This function is thread safe.
  const BinaryFunction *getFunctionForSymbol(const MCSymbol *Symbol) const {
+    std::shared_lock<std::shared_timed_mutex> Lock(SymbolToFunctionMapMutex);
    auto BFI = SymbolToFunctionMap.find(Symbol);
    return BFI == SymbolToFunctionMap.end() ? nullptr : BFI->second;
  }

  BinaryFunction *getFunctionForSymbol(const MCSymbol *Symbol) {
+    std::shared_lock<std::shared_timed_mutex> Lock(SymbolToFunctionMapMutex);
    auto BFI = SymbolToFunctionMap.find(Symbol);
    return BFI == SymbolToFunctionMap.end() ? nullptr : BFI->second;
  }
@ -657,8 +928,7 @@ public:
  }

  /// Populate some internal data structures with debug info.
-  void preprocessDebugInfo(
-      std::map<uint64_t, BinaryFunction> &BinaryFunctions);
+  void preprocessDebugInfo();

  /// Add a filename entry from SrcCUID to DestCUID.
  unsigned addDebugFilenameToUnit(const uint32_t DestCUID,
@ -666,8 +936,7 @@ public:
                                  unsigned FileIndex);

  /// Return functions in output layout order
-  static std::vector<BinaryFunction *>
-  getSortedFunctions(std::map<uint64_t, BinaryFunction> &BinaryFunctions);
+  std::vector<BinaryFunction *> getSortedFunctions();

  /// Do the best effort to calculate the size of the function by emitting
  /// its code, and relaxing branch instructions.
@ -676,26 +945,33 @@ public:
  /// size is for the cold one.
  std::pair<size_t, size_t> calculateEmittedSize(BinaryFunction &BF);

-  /// Calculate the size of the instruction \p Inst.
-  uint64_t computeInstructionSize(const MCInst &Inst) const {
+  /// Calculate the size of the instruction \p Inst optionally using a
+  /// user-supplied emitter for lock-free multi-thread work. MCCodeEmitter is
+  /// not thread safe and each thread should operate with its own copy of it.
+  uint64_t
+  computeInstructionSize(const MCInst &Inst,
+                         const MCCodeEmitter *Emitter = nullptr) const {
+    if (!Emitter)
+      Emitter = this->MCE.get();
    SmallString<256> Code;
    SmallVector<MCFixup, 4> Fixups;
    raw_svector_ostream VecOS(Code);
-    MCE->encodeInstruction(Inst, VecOS, Fixups, *STI);
-
+    Emitter->encodeInstruction(Inst, VecOS, Fixups, *STI);
    return Code.size();
  }

  /// Compute the native code size for a range of instructions.
  /// Note: this can be imprecise wrt the final binary since happening prior to
  /// relaxation, as well as wrt the original binary because of opcode
-  /// shortening.
+  /// shortening. MCCodeEmitter is not thread safe and each thread should
+  /// operate with its own copy of it.
  template <typename Itr>
-  uint64_t computeCodeSize(Itr Beg, Itr End) const {
+  uint64_t computeCodeSize(Itr Beg, Itr End,
+                           const MCCodeEmitter *Emitter = nullptr) const {
    uint64_t Size = 0;
    while (Beg != End) {
      if (!MII->get(Beg->getOpcode()).isPseudo())
-        Size += computeInstructionSize(*Beg);
+        Size += computeInstructionSize(*Beg, Emitter);
      ++Beg;
    }
    return Size;
@ -760,8 +1036,44 @@ public:

  void exitWithBugReport(StringRef Message,
                         const BinaryFunction &Function) const;
+
+  /// Bundle of an MCCodeEmitter together with the MCObjectFileInfo and
+  /// MCContext it is created with in createIndependentMCCodeEmitter().
+  /// Members are destroyed in reverse declaration order, i.e. MCE first, then
+  /// LocalCtx, then LocalMOFI.
+  struct IndependentCodeEmitter {
+    std::unique_ptr<MCObjectFileInfo> LocalMOFI;
+    std::unique_ptr<MCContext> LocalCtx;
+    std::unique_ptr<MCCodeEmitter> MCE;
+  };
+
+  /// Create an MCCodeEmitter backed by its own MCContext and MCObjectFileInfo,
+  /// so that it does not share those resources with the main emitter available
+  /// through BinaryContext::MCE. AsmInfo, MRI, MII, TheTriple and TheTarget of
+  /// this BinaryContext are still used to construct it.
+  /// This is intended to create a lock-free environment for an auxiliary thread
+  /// that needs to perform work with an MCCodeEmitter that can be transient or
+  /// won't be used in the main code emitter.
+  IndependentCodeEmitter createIndependentMCCodeEmitter() const {
+    IndependentCodeEmitter MCEInstance;
+    // The context is constructed with a pointer to the MOFI, so the MOFI is
+    // allocated first and only initialized once the context exists.
+    MCEInstance.LocalMOFI = llvm::make_unique<MCObjectFileInfo>();
+    MCEInstance.LocalCtx = llvm::make_unique<MCContext>(
+        AsmInfo.get(), MRI.get(), MCEInstance.LocalMOFI.get());
+    MCEInstance.LocalMOFI->InitMCObjectFileInfo(*TheTriple, /*PIC=*/false,
+                                                *MCEInstance.LocalCtx);
+    // The emitter is bound to the local context rather than to this->Ctx.
+    MCEInstance.MCE.reset(
+        TheTarget->createMCCodeEmitter(*MII, *MRI, *MCEInstance.LocalCtx));
+    return MCEInstance;
+  }
 };

+/// Print \p ByteArray to \p OS as space-separated, two-digit lowercase hex
+/// values. Only participates in overload resolution when sizeof(T) == 1.
+template <typename T,
+          typename = std::enable_if_t<sizeof(T) == 1> >
+inline raw_ostream &operator<<(raw_ostream &OS,
+                               const ArrayRef<T> &ByteArray) {
+  const char *Sep = "";
+  for (const auto Byte : ByteArray) {
+    // Convert through unsigned char first: a negative value of a signed
+    // one-byte type (e.g. char on most targets) would otherwise be
+    // sign-extended when promoted and print as "ffffffXX" instead of "XX".
+    OS << Sep
+       << format("%.2x",
+                 static_cast<unsigned>(static_cast<unsigned char>(Byte)));
+    Sep = " ";
+  }
+  return OS;
+}
+
 } // namespace bolt
 } // namespace llvm

--- a/src/BinaryData.cpp
+++ b/src/BinaryData.cpp
@ -73,8 +73,8 @@ StringRef BinaryData::getOutputSectionName() const {
 }

 uint64_t BinaryData::getOutputAddress() const {
-  assert(OutputSection->getFileAddress());
-  return OutputSection->getFileAddress() + OutputOffset;
+  assert(OutputSection->getOutputAddress());
+  return OutputSection->getOutputAddress() + OutputOffset;
 }

 uint64_t BinaryData::getOffset() const {
--- a/src/BinaryData.h
+++ b/src/BinaryData.h
@ -106,7 +106,7 @@ public:
  bool isAtomic() const {
    return isTopLevelJumpTable() || !Parent;
  }
-  
+
  iterator_range<std::vector<std::string>::const_iterator> names() const {
    return make_range(Names.begin(), Names.end());
  }
--- a/src/BinaryFunction.cpp
+++ b/src/BinaryFunction.cpp
--- a/src/BinaryFunction.h
+++ b/src/BinaryFunction.h
@ -18,6 +18,7 @@
 #include "BinaryBasicBlock.h"
 #include "BinaryContext.h"
 #include "BinaryLoop.h"
+#include "BinarySection.h"
 #include "DataReader.h"
 #include "DebugData.h"
 #include "JumpTable.h"
@ -40,6 +41,7 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+#include <algorithm>

 using namespace llvm::object;

@ -53,108 +55,6 @@ namespace bolt {
 using DWARFUnitLineTable = std::pair<DWARFUnit *,
                                     const DWARFDebugLine::LineTable *>;

-/// Class encapsulating runtime statistics about an execution unit.
-class DynoStats {
-
-#define DYNO_STATS\
-  D(FIRST_DYNO_STAT,              "<reserved>", Fn)\
-  D(FORWARD_COND_BRANCHES,        "executed forward branches", Fn)\
-  D(FORWARD_COND_BRANCHES_TAKEN,  "taken forward branches", Fn)\
-  D(BACKWARD_COND_BRANCHES,       "executed backward branches", Fn)\
-  D(BACKWARD_COND_BRANCHES_TAKEN, "taken backward branches", Fn)\
-  D(UNCOND_BRANCHES,              "executed unconditional branches", Fn)\
-  D(FUNCTION_CALLS,               "all function calls", Fn)\
-  D(INDIRECT_CALLS,               "indirect calls", Fn)\
-  D(PLT_CALLS,                    "PLT calls", Fn)\
-  D(INSTRUCTIONS,                 "executed instructions", Fn)\
-  D(LOADS,                        "executed load instructions", Fn)\
-  D(STORES,                       "executed store instructions", Fn)\
-  D(JUMP_TABLE_BRANCHES,          "taken jump table branches", Fn)\
-  D(ALL_BRANCHES,                 "total branches",\
-      Fadd(ALL_CONDITIONAL, UNCOND_BRANCHES))\
-  D(ALL_TAKEN,                    "taken branches",\
-      Fadd(TAKEN_CONDITIONAL, UNCOND_BRANCHES))\
-  D(NONTAKEN_CONDITIONAL,         "non-taken conditional branches",\
-      Fsub(ALL_CONDITIONAL, TAKEN_CONDITIONAL))\
-  D(TAKEN_CONDITIONAL,            "taken conditional branches",\
-      Fadd(FORWARD_COND_BRANCHES_TAKEN, BACKWARD_COND_BRANCHES_TAKEN))\
-  D(ALL_CONDITIONAL,              "all conditional branches",\
-      Fadd(FORWARD_COND_BRANCHES, BACKWARD_COND_BRANCHES))\
-  D(VENEER_CALLS_AARCH64,         "linker-inserted veneer calls", Fn)\
-  D(LAST_DYNO_STAT,               "<reserved>", 0)
-
-public:
-#define D(name, ...) name,
-  enum Category : uint8_t { DYNO_STATS };
-#undef D
-
-
-private:
-  uint64_t Stats[LAST_DYNO_STAT+1];
-  bool PrintAArch64Stats;
-
-#define D(name, desc, ...) desc,
-  static constexpr const char *Desc[] = { DYNO_STATS };
-#undef D
-
-public:
-  DynoStats(bool PrintAArch64Stats ) {
-    this->PrintAArch64Stats = PrintAArch64Stats;
-    for (auto Stat = FIRST_DYNO_STAT + 0; Stat < LAST_DYNO_STAT; ++Stat)
-      Stats[Stat] = 0;
-  }
-
-  uint64_t &operator[](size_t I) {
-    assert(I > FIRST_DYNO_STAT && I < LAST_DYNO_STAT &&
-           "index out of bounds");
-    return Stats[I];
-  }
-
-  uint64_t operator[](size_t I) const {
-    switch (I) {
-#define D(name, desc, func) \
-    case name: \
-      return func;
-#define Fn Stats[I]
-#define Fadd(a, b) operator[](a) + operator[](b)
-#define Fsub(a, b) operator[](a) - operator[](b)
-#define F(a) operator[](a)
-#define Radd(a, b) (a + b)
-#define Rsub(a, b) (a - b)
-    DYNO_STATS
-#undef Rsub
-#undef Radd
-#undef F
-#undef Fsub
-#undef Fadd
-#undef Fn
-#undef D
-    default:
-      llvm_unreachable("index out of bounds");
-    }
-    return 0;
-  }
-
-  void print(raw_ostream &OS, const DynoStats *Other = nullptr) const;
-
-  void operator+=(const DynoStats &Other);
-  bool operator<(const DynoStats &Other) const;
-  bool operator==(const DynoStats &Other) const;
-  bool operator!=(const DynoStats &Other) const { return !operator==(Other); }
-  bool lessThan(const DynoStats &Other, ArrayRef<Category> Keys) const;
-
-  static const char* Description(const Category C) {
-    return Desc[C];
-  }
-};
-
-inline raw_ostream &operator<<(raw_ostream &OS, const DynoStats &Stats) {
-  Stats.print(OS, nullptr);
-  return OS;
-}
-
-DynoStats operator+(const DynoStats &A, const DynoStats &B);
-
 /// Types of macro-fusion alignment corrections.
 enum MacroFusionType {
  MFT_NONE,
@ -302,11 +202,27 @@ private:

  std::unique_ptr<BinaryLoopInfo> BLI;

+  /// All labels in the function that are referenced via relocations from
+  /// data objects. Typically these are jump table destinations and computed
+  /// goto labels.
+  std::set<uint64_t> ExternallyReferencedOffsets;
+
+  /// Offsets of indirect branches with unknown destinations.
+  std::set<uint64_t> UnknownIndirectBranchOffsets;
+
  /// False if the function is too complex to reconstruct its control
  /// flow graph.
  /// In relocation mode we still disassemble and re-assemble such functions.
  bool IsSimple{true};

+  /// True if the function has an indirect branch with unknown destination.
+  bool HasUnknownControlFlow{false};
+
+  /// The code from inside the function references one of the code locations
+  /// from the same function as a data, i.e. it's possible the label is used
+  /// inside an address calculation or could be referenced from outside.
+  bool HasInternalLabelReference{false};
+
  /// In AArch64, preserve nops to maintain code equal to input (assuming no
  /// optimizations are done).
  bool PreserveNops{false};
@ -336,6 +252,15 @@ private:
  /// destination.
  bool HasFixedIndirectBranch{false};

+  /// Is the function known to exceed its input size?
+  bool IsLarge{false};
+
+  /// True if the function is a fragment of another function. This means that
+  /// this function could only be entered via its parent or one of its sibling
+  /// fragments. It could be entered at any basic block. It can also return
+  /// the control to any basic block of its parent or its sibling.
+  bool IsFragment{false};
+
  /// The address for the code for this function in codegen memory.
  uint64_t ImageAddress{0};

@ -348,6 +273,12 @@ private:
  /// Name for the corresponding cold code section.
  std::string ColdCodeSectionName;

+  /// Parent function for split function fragments.
+  BinaryFunction *ParentFunction{nullptr};
+
+  /// All fragments for a parent function.
+  std::unordered_set<BinaryFunction *> Fragments;
+
  /// The profile data for the number of times the function was executed.
  uint64_t ExecutionCount{COUNT_NO_PROFILE};

@ -395,6 +326,9 @@ private:
  /// Function order for streaming into the destination binary.
  uint32_t Index{-1U};

+  /// Indicate that the function body has SDT marker
+  bool HasSDTMarker{false};
+
  /// Get basic block index assuming it belongs to this function.
  unsigned getIndex(const BinaryBasicBlock *BB) const {
    assert(BB->getIndex() < BasicBlocks.size());
@ -433,7 +367,7 @@ private:

  /// Associate DW_CFA_GNU_args_size info with invoke instructions
  /// (call instructions with non-empty landing pad).
-  void propagateGnuArgsSizeInfo();
+  void propagateGnuArgsSizeInfo(MCPlusBuilder::AllocatorIdTy AllocId);

  /// Synchronize branch instructions with CFG.
  void postProcessBranches();
@ -451,8 +385,8 @@ private:
  std::set<uint64_t> CodeOffsets;
  /// The address offset where we emitted the constant island, that is, the
  /// chunk of data in the function code area (AArch only)
-  int64_t OutputDataOffset;
-  int64_t OutputColdDataOffset;
+  int64_t OutputDataOffset{0};
+  int64_t OutputColdDataOffset{0};

  /// Map labels to corresponding basic blocks.
  std::unordered_map<const MCSymbol *, BinaryBasicBlock *> LabelToBB;
@ -537,25 +471,20 @@ private:
  /// function and that apply before the entry basic block).
  CFIInstrMapType CIEFrameInstructions;

-  /// All compound jump tables for this function.
+  /// All compound jump tables for this function. This duplicates what's stored
+  /// in the BinaryContext, but additionally it gives quick access for all
+  /// jump tables used by this function.
+  ///
  /// <OriginalAddress> -> <JumpTable *>
  std::map<uint64_t, JumpTable *> JumpTables;

-  /// A map from jump table address to insertion order.  Used for generating
-  /// jump table names.
-  mutable std::map<uint64_t, size_t> JumpTableIds;
-
-  /// Generate a unique name for this jump table at the given address that
-  /// should be repeatable no matter what the start address of the table is.
-  std::string generateJumpTableName(uint64_t Address) const;
-
  /// Iterate over all jump tables associated with this function.
  iterator_range<std::map<uint64_t, JumpTable *>::const_iterator>
  jumpTables() const {
    return make_range(JumpTables.begin(), JumpTables.end());
  }

-  /// All jump table sites in the function.
+  /// All jump table sites in the function before CFG is built.
  std::vector<std::pair<uint64_t, uint64_t>> JTSites;

  /// List of relocations in this function.
@ -625,6 +554,12 @@ private:
  /// Count the number of functions created.
  static uint64_t Count;

+  /// LocSym annotation records an index to this vector. This holds a label
+  /// for each instruction whose input/output offsets need to be known after
+  /// emission. Enables writing bolt address translation tables, used for
+  /// mapping control transfer in the output binary back to the original binary.
+  std::vector<const MCSymbol *> LocSyms;
+
  /// Register alternative function name.
  void addAlternativeName(std::string NewName) {
    Names.emplace_back(NewName);
@ -654,6 +589,17 @@ private:
    return getOrCreateLocalLabel(getAddress() + Offset);
  }

+  /// Register an internal offset in a function referenced from outside.
+  void registerReferencedOffset(uint64_t Offset) {
+    ExternallyReferencedOffsets.emplace(Offset);
+  }
+
+  /// True if there are references to internals of this function from data,
+  /// e.g. from jump tables.
+  bool hasInternalReference() const {
+    return !ExternallyReferencedOffsets.empty();
+  }
+
  /// Update all \p From references in the code to refer to \p To. Used
  /// in disassembled state only.
  void updateReferences(const MCSymbol *From, const MCSymbol *To);
@ -661,6 +607,16 @@ private:
  /// This is called in disassembled state.
  void addEntryPoint(uint64_t Address);

+  void setParentFunction(BinaryFunction *BF) {
+    assert((!ParentFunction || ParentFunction == BF) &&
+           "cannot have more than one parent function");
+    ParentFunction = BF;
+  }
+
+  void addFragment(BinaryFunction *BF) {
+    Fragments.insert(BF);
+  }
+
  /// Return true if there is a registered entry point at a given offset
  /// into the function.
  bool hasEntryPointAtOffset(uint64_t Offset) {
@ -687,9 +643,11 @@ private:

  /// Emit line number information corresponding to \p NewLoc. \p PrevLoc
  /// provides a context for de-duplication of line number info.
+  /// \p FirstInstr indicates if \p NewLoc represents the first instruction
+  /// in a sequence, such as a function fragment.
  ///
  /// Return new current location which is either \p NewLoc or \p PrevLoc.
-  SMLoc emitLineInfo(SMLoc NewLoc, SMLoc PrevLoc) const;
+  SMLoc emitLineInfo(SMLoc NewLoc, SMLoc PrevLoc, bool FirstInstr) const;

  BinaryFunction& operator=(const BinaryFunction &) = delete;
  BinaryFunction(const BinaryFunction &) = delete;
@ -759,6 +717,10 @@ public:
    return iterator_range<const_iterator>(begin(), end());
  }

+  // Iterators by pointer.
+  BasicBlockListType::iterator pbegin()  { return BasicBlocks.begin(); }
+  BasicBlockListType::iterator pend()    { return BasicBlocks.end(); }
+
  order_iterator       layout_begin()    { return BasicBlocksLayout.begin(); }
  const_order_iterator layout_begin()    const
                                         { return BasicBlocksLayout.begin(); }
@ -822,6 +784,13 @@ public:
    return *this;
  }

+  /// Return a symbol for an instruction location. \p Idx is recorded as an
+  /// annotation in the instruction.
+  const MCSymbol *getLocSym(size_t Idx) const {
+    assert(Idx < LocSyms.size() && "Invalid index");
+    return LocSyms[Idx];
+  }
+
  /// Update layout of basic blocks used for output.
  void updateBasicBlockLayout(BasicBlockOrderType &NewLayout) {
    BasicBlocksPreviousLayout = BasicBlocksLayout;
@ -899,13 +868,6 @@ public:
  /// Attempt to validate CFG invariants.
  bool validateCFG() const;

-  /// Return dynostats for the function.
-  ///
-  /// The function relies on branch instructions being in-sync with CFG for
-  /// branch instructions stats. Thus it is better to call it after
-  /// fixBranches().
-  DynoStats getDynoStats() const;
-
  BinaryBasicBlock *getBasicBlockForLabel(const MCSymbol *Label) {
    auto I = LabelToBB.find(Label);
    return I == LabelToBB.end() ? nullptr : I->second;
@ -939,7 +901,7 @@ public:
  /// Retrieve the landing pad BB associated with invoke instruction \p Invoke
  /// that is in \p BB. Return nullptr if none exists
  BinaryBasicBlock *getLandingPadBBFor(const BinaryBasicBlock &BB,
-                                       const MCInst &InvokeInst) {
+                                       const MCInst &InvokeInst) const {
    assert(BC.MIB->isInvoke(InvokeInst) && "must be invoke instruction");
    const auto LP = BC.MIB->getEHInfo(InvokeInst);
    if (LP && LP->first) {
@ -954,15 +916,20 @@ public:
  /// CFG is constructed or while instruction offsets are available in CFG.
  MCInst *getInstructionAtOffset(uint64_t Offset);

+  const MCInst *getInstructionAtOffset(uint64_t Offset) const {
+    return const_cast<BinaryFunction *>(this)->getInstructionAtOffset(Offset);
+  }
+
  /// Return jump table that covers a given \p Address in memory.
  JumpTable *getJumpTableContainingAddress(uint64_t Address) {
    auto JTI = JumpTables.upper_bound(Address);
    if (JTI == JumpTables.begin())
      return nullptr;
    --JTI;
-    if (JTI->first + JTI->second->getSize() > Address) {
+    if (JTI->first + JTI->second->getSize() > Address)
+      return JTI->second;
+    if (JTI->second->getSize() == 0 && JTI->first == Address)
      return JTI->second;
-    }
    return nullptr;
  }

@ -1000,7 +967,7 @@ public:

  /// Check if (possibly one out of many) function name matches the given
  /// regex.
-  bool hasNameRegex(const std::string &NameRegex) const;
+  const std::string *hasNameRegex(const StringRef NameRegex) const;

  /// Return a vector of all possible names for the function.
  const std::vector<std::string> &getNames() const {
@ -1124,6 +1091,7 @@ public:
  MCSymbol *getFunctionEndLabel() const {
    assert(BC.Ctx && "cannot be called with empty context");
    if (!FunctionEndLabel) {
+      std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
      FunctionEndLabel = BC.Ctx->createTempSymbol("func_end", true);
    }
    return FunctionEndLabel;
@ -1132,6 +1100,7 @@ public:
  /// Return MC symbol associated with the end of the cold part of the function.
  MCSymbol *getFunctionColdEndLabel() const {
    if (!FunctionColdEndLabel) {
+      std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
      FunctionColdEndLabel = BC.Ctx->createTempSymbol("func_cold_end", true);
    }
    return FunctionColdEndLabel;
@ -1232,7 +1201,7 @@ public:
  /// address in a function. During disassembly we have to make sure we create
  /// relocation at that location.
  void addPCRelativeRelocationAddress(uint64_t Address) {
-    assert(Address >= getAddress() && Address < getAddress() + getSize() &&
+    assert(containsAddress(Address, /*UseMaxSize=*/ true) &&
           "address is outside of the function");
    PCRelativeRelocationOffsets.emplace(Address - getAddress());
  }
@ -1240,16 +1209,41 @@ public:
  /// Get data used by this function.
  std::set<BinaryData *> dataUses(bool OnlyHot) const;

+  /// Return the name of the section this function originated from.
+  StringRef getOriginSectionName() const {
+    return getSection().getName();
+  }
+
  /// Return internal section name for this function.
  StringRef getCodeSectionName() const {
    return StringRef(CodeSectionName);
  }

+  /// Assign a code section name to the function.
+  void setCodeSectionName(StringRef Name) {
+    CodeSectionName = Name;
+  }
+
+  /// Get output code section.
+  ErrorOr<BinarySection &> getCodeSection() const {
+    return BC.getUniqueSectionByName(getCodeSectionName());
+  }
+
  /// Return cold code section name for the function.
  StringRef getColdCodeSectionName() const {
    return StringRef(ColdCodeSectionName);
  }

+  /// Assign a section name for the cold part of the function.
+  void setColdCodeSectionName(StringRef Name) {
+    ColdCodeSectionName = Name;
+  }
+
+  /// Get output code section for cold code of this function.
+  ErrorOr<BinarySection &> getColdCodeSection() const {
+    return BC.getUniqueSectionByName(getColdCodeSectionName());
+  }
+
  /// Return true iif the function will halt execution on entry.
  bool trapsOnEntry() const {
    return TrapsOnEntry;
@ -1264,6 +1258,16 @@ public:
    return IsSimple;
  }

+  /// Return true if the function has instruction(s) with unknown control flow.
+  bool hasUnknownControlFlow() const {
+    return HasUnknownControlFlow;
+  }
+
+  /// Return true if the function should be split for the output.
+  bool shouldSplit() const {
+    return IsLarge && !getBinaryContext().HasRelocations;
+  }
+
  /// Return true if the function body is non-contiguous.
  bool isSplit() const {
    return layout_size() &&
@ -1300,6 +1304,9 @@ public:
    return !JumpTables.empty();
  }

+  /// Return true if the function has an SDT marker.
+  bool hasSDTMarker() const { return HasSDTMarker; }
+
  const JumpTable *getJumpTable(const MCInst &Inst) const {
    const auto Address = BC.MIB->getJumpTable(Inst);
    return getJumpTableContainingAddress(Address);
@ -1329,7 +1336,7 @@ public:
  }

  /// Return true if the given address \p PC is inside the function body.
-  bool containsAddress(uint64_t PC, bool UseMaxSize=false) const {
+  bool containsAddress(uint64_t PC, bool UseMaxSize = false) const {
    if (UseMaxSize)
      return Address <= PC && PC < Address + MaxSize;
    return Address <= PC && PC < Address + Size;
@ -1338,7 +1345,8 @@ public:
  /// Add new names this function is known under.
  template <class ContainterTy>
  void addNewNames(const ContainterTy &NewNames) {
-    Names.insert(Names.begin(),  NewNames.begin(), NewNames.end());
+    Names.insert(Names.begin(), NewNames.begin(), NewNames.end());
+    std::sort(Names.begin(), Names.end());
  }

  /// Create a basic block at a given \p Offset in the
@ -1353,6 +1361,7 @@ public:
                   bool DeriveAlignment = false) {
    assert(BC.Ctx && "cannot be called with empty context");
    if (!Label) {
+      std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
      Label = BC.Ctx->createTempSymbol("BB", true);
    }
    auto BB = std::unique_ptr<BinaryBasicBlock>(
@ -1379,9 +1388,10 @@ public:
    assert((CurrentState == State::CFG || !getBasicBlockAtOffset(Offset)) &&
           "basic block already exists in pre-CFG state");

-    if (!Label)
+    if (!Label) {
+      std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
      Label = BC.Ctx->createTempSymbol("BB", true);
-
+    }
    auto BBPtr = createBasicBlock(Offset, Label, DeriveAlignment);
    BasicBlocks.emplace_back(BBPtr.release());

@ -1438,13 +1448,15 @@ public:
    BinaryBasicBlock *Start,
    std::vector<std::unique_ptr<BinaryBasicBlock>> &&NewBBs,
    const bool UpdateLayout = true,
-    const bool UpdateCFIState = true);
+    const bool UpdateCFIState = true,
+    const bool RecomputeLandingPads = true);

  iterator insertBasicBlocks(
    iterator StartBB,
    std::vector<std::unique_ptr<BinaryBasicBlock>> &&NewBBs,
    const bool UpdateLayout = true,
-    const bool UpdateCFIState = true);
+    const bool UpdateCFIState = true,
+    const bool RecomputeLandingPads = true);

  /// Update the basic block layout for this function.  The BBs from
  /// [Start->Index, Start->Index + NumNewBlocks) are inserted into the
@ -1463,6 +1475,20 @@ public:
  /// new blocks into the CFG.  This must be called after updateLayout.
  void updateCFIState(BinaryBasicBlock *Start, const unsigned NumNewBlocks);

+  /// Return true if we detected ambiguous jump tables in this function, which
+  /// happens when one JT is used by more than one indirect jump. This precludes
+  /// us from splitting edges for this JT unless we duplicate the JT (see
+  /// disambiguateJumpTables).
+  bool checkForAmbiguousJumpTables();
+
+  /// Detect when two distinct indirect jumps are using the same jump table and
+  /// duplicate it, allocating a separate JT for each indirect branch. This is
+  /// necessary for code transformations on the CFG that change an edge induced
+  /// by an indirect branch, e.g.: instrumentation or shrink wrapping. However,
+  /// this is only possible if we are not updating jump tables in place, but are
+  /// writing it to a new location (moving them).
+  void disambiguateJumpTables();
+
  /// Change \p OrigDest to \p NewDest in the jump table used at the end of
  /// \p BB. Returns false if \p OrigDest couldn't be find as a valid target
  /// and no replacement took place.
@ -1628,6 +1654,11 @@ public:
    return *this;
  }

+  BinaryFunction &setLarge(bool Large) {
+    IsLarge = Large;
+    return *this;
+  }
+
  BinaryFunction &setUsesGnuArgsSize(bool Uses = true) {
    UsesGnuArgsSize = Uses;
    return *this;
@ -1701,6 +1732,10 @@ public:
    return ImageSize;
  }

+  BinaryFunction *getParentFunction() const {
+    return ParentFunction;
+  }
+
  /// Set the profile data for the number of times the function was called.
  BinaryFunction &setExecutionCount(uint64_t Count) {
    ExecutionCount = Count;
@ -1807,6 +1842,7 @@ public:

    // Register our island at global namespace
    Symbol = BC.getOrCreateGlobalSymbol(Address, "ISLANDat");
+
    // Internal bookkeeping
    const auto Offset = Address - getAddress();
    assert((!IslandOffsets.count(Offset) || IslandOffsets[Offset] == Symbol) &&
@ -1823,20 +1859,20 @@ public:
  /// separate symbols when emitting our constant island on behalf of this other
  /// function.
  MCSymbol *
-  getOrCreateProxyIslandAccess(uint64_t Address, BinaryFunction *Referrer) {
+  getOrCreateProxyIslandAccess(uint64_t Address, BinaryFunction &Referrer) {
    auto Symbol = getOrCreateIslandAccess(Address);
    if (!Symbol)
      return nullptr;

    MCSymbol *Proxy;
-    if (!IslandProxies[Referrer].count(Symbol)) {
+    if (!IslandProxies[&Referrer].count(Symbol)) {
      Proxy =
          BC.Ctx->getOrCreateSymbol(Symbol->getName() +
-                                    ".proxy.for." + Referrer->getPrintName());
-      IslandProxies[Referrer][Symbol] = Proxy;
-      IslandProxies[Referrer][Proxy] = Symbol;
+                                    ".proxy.for." + Referrer.getPrintName());
+      IslandProxies[&Referrer][Symbol] = Proxy;
+      IslandProxies[&Referrer][Proxy] = Symbol;
    }
-    Proxy = IslandProxies[Referrer][Symbol];
+    Proxy = IslandProxies[&Referrer][Symbol];
    return Proxy;
  }

@ -1919,6 +1955,9 @@ public:
  /// Returns false if disassembly failed.
  void disassemble(ArrayRef<uint8_t> FunctionData);

+  /// Validate entry points.
+  void postProcessEntryPoints();
+
  /// Post-processing for jump tables after disassembly. Since their
  /// boundaries are not known until all call sites are seen, we need this
  /// extra pass to perform any final adjustments.
@ -1930,7 +1969,7 @@ public:
  ///
  /// Returns true on success and update the current function state to
  /// State::CFG. Returns false if CFG cannot be built.
-  bool buildCFG();
+  bool buildCFG(MCPlusBuilder::AllocatorIdTy);

  /// Read any kind of profile information available for the function.
  void readProfile();
@ -1951,7 +1990,7 @@ public:
  ///
  /// Return true upon successful processing, or false if the control flow
  /// cannot be statically evaluated for any given indirect branch.
-  bool postProcessIndirectBranches();
+  bool postProcessIndirectBranches(MCPlusBuilder::AllocatorIdTy AllocId);

  /// In functions with multiple entry points, the profile collection records
  /// data for other entry points in a different function entry. This function
@ -2119,7 +2158,7 @@ public:
  /// Emit function code. The caller is responsible for emitting function
  /// symbol(s) and setting the section to emit the code to.
  void emitBody(MCStreamer &Streamer, bool EmitColdPart,
-                bool EmitCodeOnly = false);
+                bool EmitCodeOnly = false, bool LabelsForOffsets = false);

  /// Emit function as a blob with relocations and labels for relocations.
  void emitBodyRaw(MCStreamer *Streamer);
@ -2151,6 +2190,8 @@ public:

  /// Sets the associated .debug_info entry.
  void addSubprogramDIE(const DWARFDie DIE) {
+    static std::mutex CriticalSectionMutex;
+    std::lock_guard<std::mutex> Lock(CriticalSectionMutex);
    SubprogramDIEs.emplace_back(DIE);
    if (!UnitLineTable.first) {
      if (const auto *LineTable =
@ -2253,7 +2294,7 @@ public:
  }

  /// Return output address ranges for a function.
-  DWARFAddressRangesVector getOutputAddressRanges() const;
+  DebugAddressRangesVector getOutputAddressRanges() const;

  /// Given an address corresponding to an instruction in the input binary,
  /// return an address of this instruction in output binary.
@ -2264,7 +2305,7 @@ public:

  /// Take address ranges corresponding to the input binary and translate
  /// them to address ranges in the output binary.
-  DWARFAddressRangesVector translateInputToOutputRanges(
+  DebugAddressRangesVector translateInputToOutputRanges(
      const DWARFAddressRangesVector &InputRanges) const;

  /// Similar to translateInputToOutputRanges() but operates on location lists
@ -2307,48 +2348,6 @@ public:
  const FragmentInfo &cold() const { return ColdFragment; }
 };

-/// Return program-wide dynostats.
-template <typename FuncsType>
-inline DynoStats getDynoStats(const FuncsType &Funcs) {
-  bool IsAArch64 = Funcs.begin()->second.getBinaryContext().isAArch64();
-  DynoStats dynoStats(IsAArch64);
-  for (auto &BFI : Funcs) {
-    auto &BF = BFI.second;
-    if (BF.isSimple()) {
-      dynoStats += BF.getDynoStats();
-    }
-  }
-  return dynoStats;
-}
-
-/// Call a function with optional before and after dynostats printing.
-template <typename FnType, typename FuncsType>
-inline void
-callWithDynoStats(FnType &&Func,
-                  const FuncsType &Funcs,
-                  StringRef Phase,
-                  const bool Flag) {
-  bool IsAArch64 = Funcs.begin()->second.getBinaryContext().isAArch64();
-  DynoStats DynoStatsBefore(IsAArch64);
-  if (Flag) {
-    DynoStatsBefore = getDynoStats(Funcs);
-  }
-
-  Func();
-
-  if (Flag) {
-    const auto DynoStatsAfter = getDynoStats(Funcs);
-    const auto Changed = (DynoStatsAfter != DynoStatsBefore);
-    outs() << "BOLT-INFO: program-wide dynostats after running "
-           << Phase << (Changed ? "" : " (no change)") << ":\n\n"
-           << DynoStatsBefore << '\n';
-    if (Changed) {
-      DynoStatsAfter.print(outs(), &DynoStatsBefore);
-    }
-    outs() << '\n';
-  }
-}
-
 inline raw_ostream &operator<<(raw_ostream &OS,
                               const BinaryFunction &Function) {
  OS << Function.getPrintName();
--- a/src/BinaryFunctionProfile.cpp
+++ b/src/BinaryFunctionProfile.cpp
@ -152,7 +152,7 @@ bool BinaryFunction::recordTrace(
      const auto *Instr = BB->getLastNonPseudoInstr();
      uint64_t Offset{0};
      if (Instr) {
-        Offset = BC.MIB->getAnnotationWithDefault<uint64_t>(*Instr, "Offset");
+        Offset = BC.MIB->getAnnotationWithDefault<uint32_t>(*Instr, "Offset");
      } else {
        Offset = BB->getOffset();
      }
@ -175,7 +175,11 @@ bool BinaryFunction::recordBranch(uint64_t From, uint64_t To,
    return false;
  }

-  // Could be bad LBR data; ignore the branch.
+  // Could be bad LBR data; ignore the branch. In the case of data collected
+  // in binaries optimized by BOLT, a source BB may be mapped to two output
+  // BBs as a result of optimizations. In that case, a branch between these
+  // two will be recorded as a branch from A going to A in the source address
+  // space. Keep processing.
  if (From == To) {
    return true;
  }
@ -200,7 +204,7 @@ bool BinaryFunction::recordBranch(uint64_t From, uint64_t To,
    const auto *LastInstr = ToBB->getLastNonPseudoInstr();
    if (LastInstr) {
      const auto LastInstrOffset =
-        BC.MIB->getAnnotationWithDefault<uint64_t>(*LastInstr, "Offset");
+        BC.MIB->getAnnotationWithDefault<uint32_t>(*LastInstr, "Offset");

      // With old .fdata we are getting FT branches for "jcc,jmp" sequences.
      if (To == LastInstrOffset && BC.MIB->isUnconditionalBranch(*LastInstr)) {
@ -226,23 +230,40 @@ bool BinaryFunction::recordBranch(uint64_t From, uint64_t To,
  // discarded it as a FT from __builtin_unreachable.
  auto *FromInstruction = getInstructionAtOffset(From);
  if (!FromInstruction) {
-    DEBUG(dbgs() << "no instruction for offset " << From << " in "
-                 << *this << '\n');
-    return false;
-  }
-
-  if (FromBB == ToBB) {
-    // Check for a return from a recursive call.
-    // Otherwise it's a simple loop.
+    // If the data was collected in a bolted binary, the From addresses may be
+    // translated to the first instruction of the source BB if BOLT inserted
+    // a new branch that did not exist in the source (we can't map it to the
+    // source instruction, so we map it to the first instr of source BB).
+    // We do not keep offsets for random instructions. So the check above will
+    // evaluate to true if the first instr is not a branch (call/jmp/ret/etc)
+    if (BC.DR.collectedInBoltedBinary()) {
+      if (FromBB->getInputOffset() != From) {
+        DEBUG(dbgs() << "offset " << From << " does not match a BB in " << *this
+                     << '\n');
+        return false;
+      }
+      FromInstruction = nullptr;
+    } else {
+      DEBUG(dbgs() << "no instruction for offset " << From << " in " << *this
+                   << '\n');
+      return false;
+    }
  }

  if (!FromBB->getSuccessor(ToBB->getLabel())) {
    // Check if this is a recursive call or a return from a recursive call.
-    if (ToBB->isEntryPoint() && (BC.MIB->isCall(*FromInstruction) ||
-                                 BC.MIB->isIndirectBranch(*FromInstruction))) {
+    if (FromInstruction && ToBB->isEntryPoint() &&
+        (BC.MIB->isCall(*FromInstruction) ||
+         BC.MIB->isIndirectBranch(*FromInstruction))) {
      // Execution count is already accounted for.
      return true;
    }
+    // For data collected in a bolted binary, we may have created two output BBs
+    // that map to one original block. Branches between these two blocks will
+    // appear here as one BB jumping to itself, even though it has no loop edges.
+    // Ignore these.
+    if (BC.DR.collectedInBoltedBinary() && FromBB == ToBB)
+      return true;

    DEBUG(dbgs() << "invalid branch in " << *this << '\n'
                 << Twine::utohexstr(From) << " -> "
@ -299,16 +320,15 @@ void BinaryFunction::postProcessProfile() {
    return;
  }

-  // Check if MCF post-processing was requested.
-  if (opts::DoMCF != MCF_DISABLE) {
-    removeTagsFromProfile();
-    solveMCF(*this, opts::DoMCF);
+  if (!(getProfileFlags() & PF_LBR)) {
+    // Check if MCF post-processing was requested.
+    if (opts::DoMCF != MCF_DISABLE) {
+      removeTagsFromProfile();
+      solveMCF(*this, opts::DoMCF);
+    }
    return;
  }

-  if (!(getProfileFlags() & PF_LBR))
-    return;
-
  // Pre-sort branch data.
  if (BranchData)
    std::stable_sort(BranchData->Data.begin(), BranchData->Data.end());
@ -368,6 +388,12 @@ void BinaryFunction::postProcessProfile() {
  if (opts::InferFallThroughs)
    inferFallThroughCounts();

+  // Check if MCF post-processing was requested.
+  if (opts::DoMCF != MCF_DISABLE) {
+    removeTagsFromProfile();
+    solveMCF(*this, opts::DoMCF);
+  }
+
  // Update profile information for jump tables based on CFG branch data.
  for (auto *BB : BasicBlocks) {
    const auto *LastInstr = BB->getLastNonPseudoInstr();
@ -843,6 +869,11 @@ float BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) {
    if (BI.From.Name == BI.To.Name) {
      // Try to record information with 0 count.
      IsValid = recordBranch(BI.From.Offset, BI.To.Offset, 0);
+    } else if (BC.DR.collectedInBoltedBinary()) {
+      // We can't check branch source for collections in bolted binaries because
+      // the source of the branch may be mapped to the first instruction in a BB
+      // instead of the original branch (which may not exist in the source bin).
+      IsValid = true;
    } else {
      // The branch has to originate from this function.
      // Check for calls, tail calls, rets and indirect branches.
--- a/src/BinaryPassManager.cpp
+++ b/src/BinaryPassManager.cpp
@ -201,6 +201,13 @@ PrintUCE("print-uce",
  cl::Hidden,
  cl::cat(BoltOptCategory));

+static cl::opt<bool>
+PrintProfileStats("print-profile-stats",
+  cl::desc("print profile quality/bias analysis"),
+  cl::ZeroOrMore,
+  cl::init(false),
+  cl::cat(BoltCategory));
+
 static cl::opt<bool>
 SimplifyConditionalTailCalls("simplify-conditional-tail-calls",
  cl::desc("simplify conditional tail calls by removing unnecessary jumps"),
@ -229,6 +236,14 @@ StringOps("inline-memcpy",
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

+static cl::list<std::string>
+SpecializeMemcpy1("memcpy1-spec",
+  cl::desc("list of functions with call sites for which to specialize memcpy() "
+           "for size 1"),
+  cl::value_desc("func1,func2:cs1:cs2,func3:cs1,..."),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
 static cl::opt<bool>
 StripRepRet("strip-rep-ret",
  cl::desc("strip 'repz' prefix from 'repz retq' sequence (on by default)"),
@ -292,6 +307,7 @@ const char BinaryFunctionPassManager::TimerGroupDesc[] =
    "Binary Function Pass Manager";

 void BinaryFunctionPassManager::runPasses() {
+  auto &BFs = BC.getBinaryFunctions();
  for (const auto &OptPassPair : Passes) {
    if (!OptPassPair.first)
      continue;
@ -307,7 +323,7 @@ void BinaryFunctionPassManager::runPasses() {

    callWithDynoStats(
      [this,&Pass] {
-        Pass->runOnFunctions(BC, BFs, LargeFunctions);
+        Pass->runOnFunctions(BC);
      },
      BFs,
      Pass->getName(),
@ -350,14 +366,10 @@ void BinaryFunctionPassManager::runPasses() {
  }
 }

-void BinaryFunctionPassManager::runAllPasses(
-  BinaryContext &BC,
-  std::map<uint64_t, BinaryFunction> &Functions,
-  std::set<uint64_t> &LargeFunctions
-) {
-  BinaryFunctionPassManager Manager(BC, Functions, LargeFunctions);
+void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
+  BinaryFunctionPassManager Manager(BC);

-  const auto InitialDynoStats = getDynoStats(Functions);
+  const auto InitialDynoStats = getDynoStats(BC.getBinaryFunctions());

  // Here we manage dependencies/order manually, since passes are run in the
  // order they're registered.
@ -365,6 +377,9 @@ void BinaryFunctionPassManager::runAllPasses(
  // Run this pass first to use stats for the original functions.
  Manager.registerPass(llvm::make_unique<PrintProgramStats>(NeverPrint));

+  if (opts::PrintProfileStats)
+    Manager.registerPass(llvm::make_unique<PrintProfileStats>(NeverPrint));
+
  Manager.registerPass(llvm::make_unique<ValidateInternalCalls>(NeverPrint));

  Manager.registerPass(llvm::make_unique<StripRepRet>(NeverPrint),
@ -374,7 +389,12 @@ void BinaryFunctionPassManager::runAllPasses(
                       opts::ICF);

  if (BC.isAArch64())
-      Manager.registerPass(llvm::make_unique<VeneerElimination>(PrintVeneerElimination));
+      Manager.registerPass(
+          llvm::make_unique<VeneerElimination>(PrintVeneerElimination));
+
+  Manager.registerPass(
+      llvm::make_unique<SpecializeMemcpy1>(NeverPrint, opts::SpecializeMemcpy1),
+      !opts::SpecializeMemcpy1.empty());

  Manager.registerPass(llvm::make_unique<InlineMemcpy>(NeverPrint),
                       opts::StringOps);
@ -463,10 +483,14 @@ void BinaryFunctionPassManager::runAllPasses(
  Manager.registerPass(
      llvm::make_unique<RetpolineInsertion>(PrintRetpolineInsertion));

-  Manager.registerPass(
-      llvm::make_unique<LFenceInsertion>());
+  // Insert lfences to mitigate Spectre v1 and LVI. This pass is not compatible
+  // with the retpoline mitigation pass.
+  Manager.registerPass(llvm::make_unique<LFenceInsertion>());

-  // Thighten branches according to offset differences between branch and
+  // Assign each function an output section.
+  Manager.registerPass(llvm::make_unique<AssignSections>());
+
+  // Tighten branches according to offset differences between branch and
  // targets. No extra instructions after this pass, otherwise we may have
  // relocations out of range and crash during linking.
  if (BC.isAArch64())
--- a/src/BinaryPassManager.h
+++ b/src/BinaryPassManager.h
@ -27,8 +27,6 @@ namespace bolt {
 class BinaryFunctionPassManager {
 private:
  BinaryContext &BC;
-  std::map<uint64_t, BinaryFunction> &BFs;
-  std::set<uint64_t> &LargeFunctions;
  std::vector<std::pair<const bool,
                        std::unique_ptr<BinaryFunctionPass>>> Passes;

@ -36,10 +34,8 @@ private:
  static const char TimerGroupName[];
  static const char TimerGroupDesc[];

-  BinaryFunctionPassManager(BinaryContext &BC,
-                            std::map<uint64_t, BinaryFunction> &BFs,
-                            std::set<uint64_t> &LargeFunctions)
-    : BC(BC), BFs(BFs), LargeFunctions(LargeFunctions) {}
+  BinaryFunctionPassManager(BinaryContext &BC)
+    : BC(BC) {}

  /// Adds a pass to this manager based on the value of its corresponding
  /// command-line option.
@ -57,10 +53,7 @@ private:
  void runPasses();

  /// Runs all enabled implemented passes on all functions.
-  static void runAllPasses(BinaryContext &BC,
-                           std::map<uint64_t, BinaryFunction> &Functions,
-                           std::set<uint64_t> &LargeFunctions);
-
+  static void runAllPasses(BinaryContext &BC);
 };

 } // namespace bolt
--- a/src/BinarySection.cpp
+++ b/src/BinarySection.cpp
@ -66,7 +66,7 @@ BinarySection::~BinarySection() {
    delete[] getData();
    return;
  }
-  
+
  if (!isAllocatable() &&
      (!hasSectionRef() ||
       OutputContents.data() != getContents(Section).data())) {
@ -78,7 +78,7 @@ void BinarySection::print(raw_ostream &OS) const {
  OS << getName() << ", "
     << "0x" << Twine::utohexstr(getAddress()) << ", "
     << getSize()
-     << " (0x" << Twine::utohexstr(getFileAddress()) << ", "
+     << " (0x" << Twine::utohexstr(getOutputAddress()) << ", "
     << getOutputSize() << ")"
     << ", data = " << getData()
     << ", output data = " << getOutputData();
@ -160,3 +160,23 @@ void BinarySection::reorderContents(const std::vector<BinaryData *> &Order,
  Contents = OutputContents = StringRef(NewData, OS.str().size());
  OutputSize = Contents.size();
 }
+
+/// Serialize an ELF note record: three 32-bit header words (name size
+/// including the NUL terminator, descriptor size, note type) followed by the
+/// NUL-terminated name and the descriptor, each zero-padded to a multiple of
+/// four bytes. Header words are emitted as raw bytes in host order.
+std::string BinarySection::encodeELFNote(StringRef NameStr, StringRef DescStr,
+                                         uint32_t Type) {
+  std::string Buffer;
+  raw_string_ostream Stream(Buffer);
+
+  const uint32_t NameBytes = NameStr.size() + 1;
+  const uint32_t DescBytes = DescStr.size();
+
+  // Fixed-size header.
+  const auto EmitWord = [&Stream](const uint32_t &Word) {
+    Stream.write(reinterpret_cast<const char *>(&Word), 4);
+  };
+  EmitWord(NameBytes);
+  EmitWord(DescBytes);
+  EmitWord(Type);
+
+  // Name, its terminator and the padding up to the next 4-byte boundary.
+  Stream << NameStr << '\0';
+  uint64_t NamePad = alignTo(NameBytes, 4) - NameBytes;
+  while (NamePad-- > 0)
+    Stream << '\0';
+
+  // Descriptor and its padding.
+  Stream << DescStr;
+  uint64_t DescPad = alignTo(DescStr.size(), 4) - DescStr.size();
+  while (DescPad-- > 0)
+    Stream << '\0';
+
+  return Stream.str();
+}
--- a/src/BinarySection.h
+++ b/src/BinarySection.h
@ -62,13 +62,16 @@ class BinarySection {
                                   // finalized?
  std::string OutputName;          // Output section name (if the section has
                                   // been renamed)
-  uint64_t FileAddress{0};         // Section address for the rewritten binary.
+  uint64_t OutputAddress{0};       // Section address for the rewritten binary.
  uint64_t OutputSize{0};          // Section size in the rewritten binary.
  uint64_t FileOffset{0};          // File offset in the rewritten binary file.
  StringRef OutputContents;        // Rewritten section contents.
  unsigned SectionID{-1u};         // Unique ID used for address mapping.
                                   // Set by ExecutableFileMemoryManager.
+  uint32_t Index{0};               // Section index in the output file.
  mutable bool IsReordered{false}; // Have the contents been reordered?
+  bool IsAnonymous{false};         // True if the name should not be included
+                                   // in the output file.

  uint64_t hash(const BinaryData &BD,
                std::map<const BinaryData *, uint64_t> &Cache) const;
@ -264,6 +267,7 @@ public:
  }
  bool isLocal() const { return IsLocal; }
  bool isReordered() const { return IsReordered; }
+  bool isAnonymous() const { return IsAnonymous; }
  unsigned getELFType() const { return ELFType; }
  unsigned getELFFlags() const { return ELFFlags; }

@ -280,7 +284,8 @@ public:
  /// Does this section contain the given \p Address?
  /// Note: this is in terms of the original mapped binary addresses.
+  /// A zero-sized section is treated as containing its own start address.
  bool containsAddress(uint64_t Address) const {
-    return getAddress() <= Address && Address < getEndAddress();
+    return (getAddress() <= Address && Address < getEndAddress()) ||
+           (getSize() == 0 && getAddress() == Address);
  }

  /// Does this section contain the range [\p Address, \p Address + \p Size)?
@ -371,7 +376,7 @@ public:
  uint64_t getAllocAddress() const {
    return reinterpret_cast<uint64_t>(getOutputData());
  }
-  uint64_t getFileAddress() const { return FileAddress; }
+  uint64_t getOutputAddress() const { return OutputAddress; }
  uint64_t getFileOffset() const { return FileOffset; }
  unsigned getSectionID() const {
    assert(hasValidSectionID() && "trying to use uninitialized section id");
@ -380,10 +385,13 @@ public:
  bool hasValidSectionID() const {
    return SectionID != -1u;
  }
+  uint32_t getIndex() const {
+    return Index;
+  }

  // mutation
-  void setFileAddress(uint64_t Address) {
-    FileAddress = Address;
+  void setOutputAddress(uint64_t Address) {
+    OutputAddress = Address;
  }
  void setFileOffset(uint64_t Offset) {
    FileOffset = Offset;
@ -392,9 +400,15 @@ public:
    assert(!hasValidSectionID() && "trying to set section id twice");
    SectionID = ID;
  }
+  void setIndex(uint32_t I) {
+    Index = I;
+  }
  void setOutputName(StringRef Name) {
    OutputName = Name;
  }
+  void setAnonymous(bool Flag) {
+    IsAnonymous = Flag;
+  }

  /// Reorder the contents of this section according to /p Order.  If
  /// /p Inplace is true, the entire contents of the section is reordered,
@ -402,6 +416,18 @@ public:
  void reorderContents(const std::vector<BinaryData *> &Order, bool Inplace);

  void print(raw_ostream &OS) const;
+
+  /// Write the contents of an ELF note section given the name of the producer,
+  /// a number identifying the type of note and the contents of the note in
+  /// \p DescStr.
+  static std::string encodeELFNote(StringRef NameStr, StringRef DescStr,
+                                   uint32_t Type);
+
+  /// Code for ELF notes written by producer 'BOLT'
+  enum {
+    NT_BOLT_BAT = 1,
+    NT_BOLT_INSTRUMENTATION_TABLES = 2
+  };
 };

 inline uint8_t *copyByteArray(const uint8_t *Data, uint64_t Size) {
@ -425,6 +451,21 @@ inline raw_ostream &operator<<(raw_ostream &OS, const BinarySection &Section) {
  return OS;
 }

+/// Plain record describing one SDT marker.
+/// NOTE(review): presumably filled from an SDT (SystemTap-style probe) note
+/// entry -- confirm against the code that populates it. Members have no
+/// default initializers, so value-initialize instances before reading them.
+struct SDTMarkerInfo {
+  uint64_t PC;
+  uint64_t Base;
+  uint64_t Semaphore;
+  StringRef Provider;
+  StringRef Name;
+  StringRef Args;
+
+  /// The offset of PC within the note section
+  unsigned PCOffset;
+
+  /// A label that marks the location of the SDT nop instruction
+  MCSymbol *Label;
+};
+
 } // namespace bolt
 } // namespace llvm

--- a/src/BoltAddressTranslation.cpp
+++ b/src/BoltAddressTranslation.cpp
@ -0,0 +1,304 @@
+//===--- BoltAddressTranslation.cpp ---------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+#include "BoltAddressTranslation.h"
+#include "BinaryFunction.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/Support/DataExtractor.h"
+
+#define DEBUG_TYPE "bolt-bat"
+
+namespace llvm {
+namespace bolt {
+
+const char* BoltAddressTranslation::SECTION_NAME = ".note.bolt_bat";
+
+/// Record the translation entries for \p BB into \p Map. Keys are output
+/// offsets relative to \p FuncAddress and values are input offsets; both are
+/// narrowed to 32 bits when stored in the map.
+void BoltAddressTranslation::writeEntriesForBB(MapTy &Map,
+                                               const BinaryBasicBlock &BB,
+                                               uint64_t FuncAddress) {
+  const uint64_t Key = BB.getOutputAddressRange().first - FuncAddress;
+  const uint64_t Val = BB.getInputOffset();
+
+  assert(Val != BinaryBasicBlock::INVALID_OFFSET &&
+         "Every output BB must track back to an input BB for profile "
+         "collection in bolted binaries");
+
+  DEBUG(dbgs() << "BB " << BB.getName() <<"\n");
+  DEBUG(dbgs() << "  Key: " << Twine::utohexstr(Key)
+               << " Val: " << Twine::utohexstr(Val) << "\n");
+  // std::map::insert leaves an already-present key untouched.
+  Map.insert(std::pair<uint32_t, uint32_t>(Key, Val));
+
+  // Look for special instructions we are interested in mapping offsets. These
+  // are key instructions for the profile identified by
+  // BC.keepOffsetForInstruction(Inst) and are instructions that cause control
+  // flow change. We also record offsets for the last instruction in the BB in
+  // some cases. These are harmless for BAT writing purposes, besides increasing
+  // the size of the table unnecessarily.
+  for (const auto &Inst : BB) {
+    if (!BC.MIB->hasAnnotation(Inst, "LocSym"))
+      continue;
+    const auto OutputOffset =
+        BC.MIB->getAnnotationAs<uint32_t>(Inst, "LocSym") - FuncAddress;
+
+    auto InputOffsetOrErr = BC.MIB->tryGetAnnotationAs<uint32_t>(Inst, "Offset");
+    DEBUG(if (!InputOffsetOrErr) {
+      auto *Function = BB.getFunction();
+      dbgs() << "Function: " << Function->getPrintName()
+             << " BB: " << BB.getName() << " lacking annotation for: ";
+      BC.printInstruction(dbgs(), Inst);
+      dbgs() << "\n";
+    });
+    assert(InputOffsetOrErr && "Expected annotation with input offset");
+    const auto InputOffset = *InputOffsetOrErr;
+
+    // Is this the first instruction in the BB? No need to duplicate the entry
+    if (Key == OutputOffset)
+      continue;
+
+    DEBUG(dbgs() << "  Key: " << Twine::utohexstr(OutputOffset)
+                 << " Val: " << Twine::utohexstr(InputOffset)
+                 << " (branch)\n");
+    // Tag the value with BRANCHENTRY so that readers can tell instruction
+    // entries apart from basic block starts.
+    Map.insert(
+        std::pair<uint32_t, uint32_t>(OutputOffset, InputOffset | BRANCHENTRY));
+  }
+}
+
+/// Build the per-function translation maps (with a separate map for the cold
+/// part of split functions) and serialize them to \p OS as raw host-order
+/// bytes:
+///   u32  number of maps
+///   per map:     u64 address, u32 number of entries,
+///                then (u32 key, u32 value) pairs
+///   u32  number of cold part mappings
+///   per mapping: u64 cold part address, u64 function output address
+void BoltAddressTranslation::write(raw_ostream &OS) {
+  DEBUG(dbgs() << "BOLT-DEBUG: Writing BOLT Address Translation Tables\n");
+  for (auto &BFI : BC.getBinaryFunctions()) {
+    auto &Function = BFI.second;
+
+    DEBUG(dbgs() << "Function name: " << Function.getPrintName() << "\n");
+    DEBUG(dbgs() << " Address reference: 0x"
+                 << Twine::utohexstr(Function.getOutputAddress()) << "\n");
+    MapTy Map;
+    const bool IsSplit = Function.isSplit();
+    for (const auto &BB : Function.layout()) {
+      // For split functions stop at the first cold block; cold blocks are
+      // written to a separate map below.
+      if (IsSplit && BB->isCold())
+        break;
+      writeEntriesForBB(Map, *BB, Function.getOutputAddress());
+    }
+    Maps.insert(std::pair<uint64_t, MapTy>(Function.getOutputAddress(), Map));
+
+    if (!IsSplit)
+      continue;
+
+    // Cold map
+    Map.clear();
+    DEBUG(dbgs() << " Cold part\n");
+    for (const auto &BB : Function.layout()) {
+      if (!BB->isCold())
+        continue;
+      writeEntriesForBB(Map, *BB, Function.cold().getAddress());
+    }
+    Maps.insert(std::pair<uint64_t, MapTy>(Function.cold().getAddress(), Map));
+    ColdPartSource.insert(std::pair<uint64_t, uint64_t>(
+        Function.cold().getAddress(), Function.getOutputAddress()));
+  }
+
+  const uint32_t NumFuncs = Maps.size();
+  OS.write(reinterpret_cast<const char *>(&NumFuncs), 4);
+  DEBUG(dbgs() << "Writing " << NumFuncs << " functions for BAT.\n");
+  for (auto &MapEntry : Maps) {
+    const uint64_t Address = MapEntry.first;
+    MapTy &Map = MapEntry.second;
+    const uint32_t NumEntries = Map.size();
+    DEBUG(dbgs() << "Writing " << NumEntries << " entries for 0x"
+                 << Twine::utohexstr(Address) << ".\n");
+    OS.write(reinterpret_cast<const char *>(&Address), 8);
+    OS.write(reinterpret_cast<const char *>(&NumEntries), 4);
+    for (auto &KeyVal : Map) {
+      OS.write(reinterpret_cast<const char *>(&KeyVal.first), 4);
+      OS.write(reinterpret_cast<const char *>(&KeyVal.second), 4);
+    }
+  }
+  const uint32_t NumColdEntries = ColdPartSource.size();
+  DEBUG(dbgs() << "Writing " << NumColdEntries << " cold part mappings.\n");
+  OS.write(reinterpret_cast<const char *>(&NumColdEntries), 4);
+  for (auto &ColdEntry : ColdPartSource) {
+    OS.write(reinterpret_cast<const char *>(&ColdEntry.first), 8);
+    OS.write(reinterpret_cast<const char *>(&ColdEntry.second), 8);
+    DEBUG(dbgs() << " " << Twine::utohexstr(ColdEntry.first) << " -> "
+          << Twine::utohexstr(ColdEntry.second) << "\n");
+  }
+
+  outs() << "BOLT-INFO: Wrote " << Maps.size() << " BAT maps\n";
+  outs() << "BOLT-INFO: Wrote " << NumColdEntries
+         << " BAT cold-to-hot entries\n";
+}
+
+/// Parse the ELF note in \p Buf (12-byte note header, producer name, then the
+/// serialized tables) and load the per-function maps into Maps and the
+/// cold-to-hot links into ColdPartSource. Returns errc::io_error if the note
+/// has the wrong type or producer name, or if the buffer is too short for the
+/// data it announces; returns a default-constructed error_code on success.
+std::error_code BoltAddressTranslation::parse(StringRef Buf) {
+  DataExtractor DE = DataExtractor(Buf, true, 8);
+  uint32_t Offset = 0;
+  if (Buf.size() < 12)
+    return make_error_code(llvm::errc::io_error);
+
+  const uint32_t NameSz = DE.getU32(&Offset);
+  const uint32_t DescSz = DE.getU32(&Offset);
+  const uint32_t Type = DE.getU32(&Offset);
+
+  // The bytes remaining after the header must hold the padded name plus the
+  // descriptor. Buf.size() >= 12 == Offset here, so the subtraction is safe.
+  if (Type != BinarySection::NT_BOLT_BAT ||
+      Buf.size() - Offset < alignTo(NameSz, 4) + DescSz)
+    return make_error_code(llvm::errc::io_error);
+
+  StringRef Name = Buf.slice(Offset, Offset + NameSz);
+  Offset = alignTo(Offset + NameSz, 4);
+  if (Name.substr(0, 4) != "BOLT")
+    return make_error_code(llvm::errc::io_error);
+
+  if (Buf.size() - Offset < 4)
+    return make_error_code(llvm::errc::io_error);
+
+  const uint32_t NumFunctions = DE.getU32(&Offset);
+  DEBUG(dbgs() << "Parsing " << NumFunctions << " functions\n");
+  for (uint32_t I = 0; I < NumFunctions; ++I) {
+    if (Buf.size() - Offset < 12)
+      return make_error_code(llvm::errc::io_error);
+
+    const uint64_t Address = DE.getU64(&Offset);
+    const uint32_t NumEntries = DE.getU32(&Offset);
+    MapTy Map;
+
+    DEBUG(dbgs() << "Parsing " << NumEntries << " entries for 0x"
+                 << Twine::utohexstr(Address) << "\n");
+    // Widen before multiplying so that a huge entry count cannot wrap around
+    // in 32 bits and slip past the bounds check.
+    if (Buf.size() - Offset < 8 * static_cast<uint64_t>(NumEntries))
+      return make_error_code(llvm::errc::io_error);
+    for (uint32_t J = 0; J < NumEntries; ++J) {
+      const uint32_t OutputAddr = DE.getU32(&Offset);
+      const uint32_t InputAddr = DE.getU32(&Offset);
+      Map.insert(std::pair<uint32_t, uint32_t>(OutputAddr, InputAddr));
+      DEBUG(dbgs() << Twine::utohexstr(OutputAddr) << " -> "
+                   << Twine::utohexstr(InputAddr) << "\n");
+    }
+    Maps.insert(std::pair<uint64_t, MapTy>(Address, Map));
+  }
+
+  if (Buf.size() - Offset < 4)
+    return make_error_code(llvm::errc::io_error);
+
+  const uint32_t NumColdEntries = DE.getU32(&Offset);
+  DEBUG(dbgs() << "Parsing " << NumColdEntries << " cold part mappings\n");
+  for (uint32_t I = 0; I < NumColdEntries; ++I) {
+    if (Buf.size() - Offset < 16)
+      return make_error_code(llvm::errc::io_error);
+    // Both addresses are serialized as 64-bit values; keep the full width.
+    const uint64_t ColdAddress = DE.getU64(&Offset);
+    const uint64_t HotAddress = DE.getU64(&Offset);
+    ColdPartSource.insert(
+        std::pair<uint64_t, uint64_t>(ColdAddress, HotAddress));
+    DEBUG(dbgs() << Twine::utohexstr(ColdAddress) << " -> "
+                 << Twine::utohexstr(HotAddress) << "\n");
+  }
+  outs() << "BOLT-INFO: Parsed " << Maps.size() << " BAT entries\n";
+  outs() << "BOLT-INFO: Parsed " << NumColdEntries
+         << " BAT cold-to-hot entries\n";
+
+  return std::error_code();
+}
+
+/// Map \p Offset, an offset inside the output copy of \p Func, back to an
+/// offset in the input function. Returns \p Offset unchanged when \p Func has
+/// no translation map or when no map key is less than or equal to \p Offset.
+uint64_t BoltAddressTranslation::translate(const BinaryFunction &Func,
+                                           uint64_t Offset,
+                                           bool IsBranchSrc) const {
+  const auto FuncMapIt = Maps.find(Func.getAddress());
+  if (FuncMapIt == Maps.end())
+    return Offset;
+
+  // Locate the entry with the largest key that does not exceed Offset.
+  const MapTy &Entries = FuncMapIt->second;
+  auto Entry = Entries.upper_bound(Offset);
+  if (Entry == Entries.begin())
+    return Offset;
+  --Entry;
+
+  const uint32_t InputBase = Entry->second & ~BRANCHENTRY;
+  // Branch sources resolve to the entry's own input offset, which sidesteps
+  // any instructions BOLT added to or removed from the block. Every other
+  // address keeps its distance from the entry's key.
+  return IsBranchSrc ? InputBase : Offset - Entry->first + InputBase;
+}
+
+/// Collect pairs of consecutive basic-block keys from the translation map of
+/// \p Func that lie between FirstLBR.To and SecondLBR.From, skipping entries
+/// tagged with BRANCHENTRY. Returns an empty list when FirstLBR.To >=
+/// SecondLBR.From or when no usable block boundaries are found, and NoneType
+/// when \p Func has no translation map.
+Optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
+BoltAddressTranslation::getFallthroughsInTrace(
+    const BinaryFunction &Func,
+    const LBREntry &FirstLBR, const LBREntry &SecondLBR) const {
+  SmallVector<std::pair<uint64_t, uint64_t>, 16> Res;
+
+  // Filter out trivial case
+  if (FirstLBR.To >= SecondLBR.From)
+    return Res;
+
+  const auto From = FirstLBR.To - Func.getAddress();
+  const auto To = SecondLBR.From - Func.getAddress();
+
+  auto Iter = Maps.find(Func.getAddress());
+  if (Iter == Maps.end()) {
+    return NoneType();
+  }
+
+  const MapTy &Map = Iter->second;
+  auto FromIter = Map.upper_bound(From);
+  if (FromIter == Map.begin())
+    return Res;
+  // Skip instruction entries, to create fallthroughs we are only interested in
+  // BB boundaries
+  do {
+    if (FromIter == Map.begin())
+      return Res;
+    --FromIter;
+  } while (FromIter->second & BRANCHENTRY);
+
+  auto ToIter = Map.upper_bound(To);
+  if (ToIter == Map.begin())
+    return Res;
+  --ToIter;
+  if (FromIter->first >= ToIter->first)
+    return Res;
+
+  // Note: this loop variable shadows the Maps iterator declared above.
+  for (auto Iter = FromIter; Iter != ToIter; ) {
+    const auto Src = Iter->first;
+    if (Iter->second & BRANCHENTRY) {
+      ++Iter;
+      continue;
+    }
+
+    ++Iter;
+    // ToIter was decremented from upper_bound and is therefore
+    // dereferenceable, so reading Iter->second before comparing against
+    // ToIter is safe.
+    while (Iter->second & BRANCHENTRY && Iter != ToIter) {
+      ++Iter;
+    }
+    // Reached ToIter without finding another block start.
+    if (Iter->second & BRANCHENTRY)
+      break;
+    Res.emplace_back(std::make_pair(Src, Iter->first));
+  }
+
+  return Res;
+}
+
+/// Look up the function output address recorded for the cold part starting at
+/// \p Address; yields 0 when no such cold part is known.
+uint64_t BoltAddressTranslation::fetchParentAddress(uint64_t Address) const {
+  const auto Found = ColdPartSource.find(Address);
+  return Found != ColdPartSource.end() ? Found->second : 0;
+}
+
+/// Report whether \p InputFile has a section named SECTION_NAME. Sections
+/// whose name cannot be read are ignored.
+bool BoltAddressTranslation::enabledFor(
+    llvm::object::ELFObjectFileBase *InputFile) const {
+  for (const auto &Sec : InputFile->sections()) {
+    StringRef Name;
+    const std::error_code Err = Sec.getName(Name);
+    if (!Err && Name == SECTION_NAME)
+      return true;
+  }
+  return false;
+}
+}
+}
--- a/src/BoltAddressTranslation.h
+++ b/src/BoltAddressTranslation.h
@ -0,0 +1,121 @@
+//===--- BoltAddressTranslation.h -----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_BOLTADDRESSTRANSLATION_H
+#define LLVM_TOOLS_LLVM_BOLT_BOLTADDRESSTRANSLATION_H
+
+#include "BinaryContext.h"
+#include "llvm/Object/ELFObjectFile.h"
+
+namespace llvm {
+
+namespace bolt {
+
+/// The map of output addresses to input ones to be used when translating
+/// samples collected in a binary that was already processed by BOLT. We do not
+/// support reoptimizing a binary already processed by BOLT, but we do support
+/// collecting samples in a binary processed by BOLT. We then translate samples
+/// back to addresses from the input (original) binary, one that can be
+/// optimized. The goal is to avoid special deployments of non-bolted binaries
+/// just for the purposes of data collection.
+///
+/// The in-memory representation of the map is as follows. Each function has its
+/// own map. A function is identified by its output address. This is the key to
+/// retrieve a translation map. The translation map is a collection of ordered
+/// keys identifying the start of a region (relative to the function start) in
+/// the output address space (addresses in the binary processed by BOLT).
+///
+/// A translation then happens when perf2bolt needs to convert sample addresses
+/// in the output address space back to input addresses, valid to run BOLT in
+/// the original input binary. To convert, perf2bolt first needs to fetch the
+/// translation map for a sample recorded in a given function. It then finds
+/// the largest key that is still less than or equal to the recorded address.
+/// It then converts this address to use the value of this key.
+///
+///   Example translation Map for function foo
+///      KEY                             VALUE                    BB?
+///    Output offset1 (first BB)         Original input offset1   Y
+///    ...
+///    Output offsetN (last branch)      Original input offsetN   N
+///
+/// The information on whether a given entry is a BB start or an instruction
+/// that changes control flow is encoded in the last (highest) bit of VALUE.
+///
+/// Notes:
+/// Instructions that will never appear in LBR because they do not cause control
+/// flow change are omitted from this map. Basic block locations are recorded
+/// because they can be a target of a jump (To address in the LBR) and also to
+/// recreate the BB layout of this function. We use the BB layout map to
+/// recreate fall-through jumps in the profile, given an LBR trace.
+class BoltAddressTranslation {
+public:
+  // In-memory representation of the address translation table
+  using MapTy = std::map<uint32_t, uint32_t>;
+
+  /// Name of the ELF section where the table will be serialized to in the
+  /// output binary
+  static const char *SECTION_NAME;
+
+  BoltAddressTranslation(BinaryContext &BC) : BC(BC) {}
+
+  /// Write the serialized address translation tables for each reordered
+  /// function
+  void write(raw_ostream &OS);
+
+  /// Read the serialized address translation tables and load them internally
+  /// in memory. Return a parse error if failed.
+  std::error_code parse(StringRef Buf);
+
+  /// If the maps are loaded in memory, perform the lookup to translate LBR
+  /// addresses in \p Func.
+  uint64_t translate(const BinaryFunction &Func, uint64_t Offset,
+                     bool IsBranchSrc) const;
+
+  /// Use the map keys containing basic block addresses to infer fall-throughs
+  /// taken in the path started at FirstLBR.To and ending at SecondLBR.From.
+  /// Return NoneType if there is no translation map for \p Func, or the
+  /// (possibly empty) list of fall-throughs otherwise.
+  Optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
+  getFallthroughsInTrace(const BinaryFunction &Func, const LBREntry &FirstLBR,
+                         const LBREntry &SecondLBR) const;
+
+  /// If available, fetch the address of the hot part linked to the cold part
+  /// at \p Address. Return 0 otherwise.
+  uint64_t fetchParentAddress(uint64_t Address) const;
+
+  /// True if the input binary has a translation table we can use to convert
+  /// addresses when aggregating profile
+  bool enabledFor(llvm::object::ELFObjectFileBase *InputFile) const;
+
+private:
+  /// Helper to update \p Map by inserting one or more BAT entries reflecting
+  /// \p BB for function located at \p FuncAddress. At least one entry will be
+  /// emitted for the start of the BB. More entries may be emitted to cover
+  /// the location of calls or any instruction that may change control flow.
+  void writeEntriesForBB(MapTy &Map, const BinaryBasicBlock &BB,
+                         uint64_t FuncAddress);
+
+  BinaryContext &BC;
+
+  /// Translation maps, keyed by the output address of the function (or of its
+  /// cold part) they describe
+  std::map<uint64_t, MapTy> Maps;
+
+  /// Links outlined cold blocks to their original function
+  std::map<uint64_t, uint64_t> ColdPartSource;
+
+  /// Identifies the address of a control-flow changing instruction in a
+  /// translation map entry
+  const static uint32_t BRANCHENTRY = 0x80000000;
+};
+}
+
+}
+
+#endif
--- a/src/BoltDiff.cpp
+++ b/src/BoltDiff.cpp
@ -204,7 +204,7 @@ class RewriteInstanceDiff {
  /// later when matching functions in binary 2 to corresponding functions
  /// in binary 1
  void buildLookupMaps() {
-    for (const auto &BFI : RI1.BinaryFunctions) {
+    for (const auto &BFI : RI1.BC->getBinaryFunctions()) {
      StringRef LTOName;
      const auto &Function = BFI.second;
      const auto Score = getNormalizedScore(Function, RI1);
@ -224,7 +224,7 @@ class RewriteInstanceDiff {
    }

    // Compute LTONameLookup2 and LargestBin2
-    for (const auto &BFI : RI2.BinaryFunctions) {
+    for (const auto &BFI : RI2.BC->getBinaryFunctions()) {
      StringRef LTOName;
      const auto &Function = BFI.second;
      const auto Score = getNormalizedScore(Function, RI2);
@ -245,7 +245,7 @@ class RewriteInstanceDiff {
  void matchFunctions() {
    outs() << "BOLT-DIFF: Mapping functions in Binary2 to Binary1\n";

-    for (const auto &BFI2 : RI2.BinaryFunctions) {
+    for (const auto &BFI2 : RI2.BC->getBinaryFunctions()) {
      const auto &Function2 = BFI2.second;
      StringRef LTOName;
      bool Match = false;
@ -451,7 +451,7 @@ class RewriteInstanceDiff {
  /// having a large difference in performance because hotness shifted from
  /// LTO variant 1 to variant 2, even though they represent the same function.
  void computeAggregatedLTOScore() {
-    for (const auto &BFI : RI1.BinaryFunctions) {
+    for (const auto &BFI : RI1.BC->getBinaryFunctions()) {
      const auto &Function = BFI.second;
      double Score = getNormalizedScore(Function, RI1);
      auto Iter = LTOMap1.find(&Function);
@ -461,7 +461,7 @@ class RewriteInstanceDiff {
    }

    double UnmappedScore{0};
-    for (const auto &BFI : RI2.BinaryFunctions) {
+    for (const auto &BFI : RI2.BC->getBinaryFunctions()) {
      const auto &Function = BFI.second;
      bool Matched = FuncMap.find(&Function) != FuncMap.end();
      double Score = getNormalizedScore(Function, RI2);
@ -475,7 +475,8 @@ class RewriteInstanceDiff {
      if (FuncMap.find(Iter->second) == FuncMap.end())
        UnmappedScore += Score;
    }
-    int64_t Unmapped = RI2.BinaryFunctions.size() - Bin2MappedFuncs.size();
+    int64_t Unmapped =
+      RI2.BC->getBinaryFunctions().size() - Bin2MappedFuncs.size();
    outs() << "BOLT-DIFF: " << Unmapped
           << " functions in Binary2 have no correspondence to any other "
              "function in Binary1.\n";
@ -595,7 +596,7 @@ class RewriteInstanceDiff {
  void reportUnmapped() {
    outs() << "List of functions from binary 2 that were not matched with any "
           << "function in binary 1:\n";
-    for (const auto &BFI2 : RI2.BinaryFunctions) {
+    for (const auto &BFI2 : RI2.BC->getBinaryFunctions()) {
      const auto &Function2 = BFI2.second;
      if (Bin2MappedFuncs.count(&Function2))
        continue;
@ -654,9 +655,9 @@ void RewriteInstance::compare(RewriteInstance &RI2) {
  if (opts::ICF) {
    IdenticalCodeFolding ICF(opts::NeverPrint);
    outs() << "BOLT-DIFF: Starting ICF pass for binary 1";
-    ICF.runOnFunctions(*BC, BinaryFunctions, LargeFunctions);
+    ICF.runOnFunctions(*BC);
    outs() << "BOLT-DIFF: Starting ICF pass for binary 2";
-    ICF.runOnFunctions(*RI2.BC, RI2.BinaryFunctions, RI2.LargeFunctions);
+    ICF.runOnFunctions(*RI2.BC);
  }

  RewriteInstanceDiff RID(*this, RI2);
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -48,8 +48,6 @@ add_public_gen_version_target(GenBoltRevision)
 set(LLVM_LINK_COMPONENTS
  ${LLVM_TARGETS_TO_BUILD}
  BOLTPasses
-  BOLTTargetAArch64
-  BOLTTargetX86
  CodeGen
  Core
  DebugInfoDWARF
@ -61,6 +59,18 @@ set(LLVM_LINK_COMPONENTS
  Support
  )

+string(FIND "${LLVM_TARGETS_TO_BUILD}" "AArch64" POSITION)
+if (NOT ${POSITION} EQUAL -1)
+  list(APPEND LLVM_LINK_COMPONENTS BOLTTargetAArch64)
+  set(BOLT_AArch64 On)
+endif()
+
+string(FIND "${LLVM_TARGETS_TO_BUILD}" "X86" POSITION)
+if (NOT ${POSITION} EQUAL -1)
+  list(APPEND LLVM_LINK_COMPONENTS BOLTTargetX86)
+  set(BOLT_X64 On)
+endif()
+
 add_llvm_tool(llvm-bolt
  llvm-bolt.cpp
  BinaryBasicBlock.cpp
@ -70,16 +80,20 @@ add_llvm_tool(llvm-bolt
  BinaryFunctionProfile.cpp
  BinaryPassManager.cpp
  BinarySection.cpp
+  BoltAddressTranslation.cpp
  BoltDiff.cpp
  CacheMetrics.cpp
  DataAggregator.cpp
  DataReader.cpp
  DebugData.cpp
  DWARFRewriter.cpp
+  DynoStats.cpp
  Exceptions.cpp
+  ExecutableFileMemoryManager.cpp
  Heatmap.cpp
  JumpTable.cpp
  MCPlusBuilder.cpp
+  ParallelUtilities.cpp
  ProfileReader.cpp
  ProfileWriter.cpp
  Relocation.cpp
@ -87,8 +101,17 @@ add_llvm_tool(llvm-bolt

  DEPENDS
  intrinsics_gen
+  bolt_rt
  )

+if (DEFINED BOLT_AArch64)
+  target_compile_definitions(llvm-bolt PRIVATE AARCH64_AVAILABLE)
+endif()
+
+if (DEFINED BOLT_X64)
+  target_compile_definitions(llvm-bolt PRIVATE X86_AVAILABLE)
+endif()
+
 add_llvm_tool_symlink(perf2bolt llvm-bolt)
 add_llvm_tool_symlink(llvm-boltdiff llvm-bolt)
 add_llvm_tool_symlink(llvm-bolt-heatmap llvm-bolt)
--- a/src/DWARFRewriter.cpp
+++ b/src/DWARFRewriter.cpp
@ -9,11 +9,10 @@
 //
 //===----------------------------------------------------------------------===//

-
-#include "BinaryBasicBlock.h"
+#include "DWARFRewriter.h"
 #include "BinaryContext.h"
 #include "BinaryFunction.h"
-#include "RewriteInstance.h"
+#include "ParallelUtilities.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
@ -57,62 +56,126 @@ KeepARanges("keep-aranges",
  cl::Hidden,
  cl::cat(BoltCategory));

+// When set (the default), compile units are processed serially rather than on
+// the thread pool.
+static cl::opt<bool>
+DeterministicDebugInfo("deterministic-debuginfo",
+  cl::desc("disables parallel execution of tasks that may produce "
+           "nondeterministic debug info"),
+  cl::init(true),
+  cl::cat(BoltCategory));
+
 } // namespace opts

+/// Create the .debug_abbrev/.debug_info patchers and the ranges and location
+/// list writers, update every compile unit (serially when opts::NoThreads or
+/// opts::DeterministicDebugInfo is set, otherwise on the shared thread pool),
+/// then flush pending ranges, finalize the debug sections and update the
+/// .gdb_index section.
-void RewriteInstance::updateDebugInfo() {
+void DWARFRewriter::updateDebugInfo() {
  SectionPatchers[".debug_abbrev"] = llvm::make_unique<DebugAbbrevPatcher>();
-  SectionPatchers[".debug_info"]  = llvm::make_unique<SimpleBinaryPatcher>();
+  SectionPatchers[".debug_info"] = llvm::make_unique<SimpleBinaryPatcher>();

-  RangesSectionsWriter = llvm::make_unique<DebugRangesSectionsWriter>(BC.get());
-  LocationListWriter = llvm::make_unique<DebugLocWriter>(BC.get());
+  DebugInfoPatcher =
+      static_cast<SimpleBinaryPatcher *>(SectionPatchers[".debug_info"].get());
+  AbbrevPatcher =
+      static_cast<DebugAbbrevPatcher *>(SectionPatchers[".debug_abbrev"].get());
+  assert(DebugInfoPatcher && AbbrevPatcher && "Patchers not initialized.");

-  for (auto &CU : BC->DwCtx->compile_units()) {
-    updateUnitDebugInfo(CU->getUnitDIE(false),
-                        std::vector<const BinaryFunction *>{});
+  RangesSectionsWriter = llvm::make_unique<DebugRangesSectionsWriter>(&BC);
+  LocationListWriter = llvm::make_unique<DebugLocWriter>(&BC);
+
+  // Each compile unit starts with its own empty function/ranges cache.
+  auto processUnitDIE = [&](const DWARFDie DIE) {
+    const BinaryFunction *CachedFunction = nullptr;
+    std::map<DebugAddressRangesVector, uint64_t> CachedRanges{};
+    updateUnitDebugInfo(DIE, std::vector<const BinaryFunction *>{},
+                        CachedFunction, CachedRanges);
+  };
+
+  if (opts::NoThreads || opts::DeterministicDebugInfo) {
+    for (auto &CU : BC.DwCtx->compile_units())
+      processUnitDIE(CU->getUnitDIE(false));
+  } else {
+    // Update unit debug info in parallel
+    auto &ThreadPool = ParallelUtilities::getThreadPool();
+    for (auto &CU : BC.DwCtx->compile_units())
+      ThreadPool.async(processUnitDIE, CU->getUnitDIE(false));
+
+    // The lambda captures locals by reference, so all tasks must finish
+    // before this function continues.
+    ThreadPool.wait();
  }

+  flushPendingRanges();
+
  finalizeDebugSections();

  updateGdbIndexSection();
 }

-void RewriteInstance::updateUnitDebugInfo(
-    const DWARFDie DIE,
-    std::vector<const BinaryFunction *> FunctionStack) {
-
+void DWARFRewriter::updateUnitDebugInfo(
+    const DWARFDie DIE, std::vector<const BinaryFunction *> FunctionStack,
+    const BinaryFunction *&CachedFunction,
+    std::map<DebugAddressRangesVector, uint64_t> &CachedRanges) {
  bool IsFunctionDef = false;
  switch (DIE.getTag()) {
  case dwarf::DW_TAG_compile_unit:
    {
      const auto ModuleRanges = DIE.getAddressRanges();
-      auto OutputRanges = translateModuleAddressRanges(ModuleRanges);
+      auto OutputRanges = BC.translateModuleAddressRanges(ModuleRanges);
      const auto RangesSectionOffset =
-        RangesSectionsWriter->addCURanges(DIE.getDwarfUnit()->getOffset(),
-                                          std::move(OutputRanges));
+      RangesSectionsWriter->addCURanges(DIE.getDwarfUnit()->getOffset(),
+                                        std::move(OutputRanges));
      updateDWARFObjectAddressRanges(DIE, RangesSectionOffset);
    }
    break;

  case dwarf::DW_TAG_subprogram:
    {
-      // The function cannot have multiple ranges on the input.
-      uint64_t SectionIndex, LowPC, HighPC;
-      if (DIE.getLowAndHighPC(LowPC, HighPC, SectionIndex)) {
-        IsFunctionDef = true;
-        const auto *Function = getBinaryFunctionAtAddress(LowPC);
-        if (Function && Function->isFolded()) {
-          Function = nullptr;
+      // Get function address either from ranges or [LowPC, HighPC) pair.
+      bool UsesRanges = false;
+      uint64_t Address;
+      uint64_t SectionIndex, HighPC;
+      if (!DIE.getLowAndHighPC(Address, HighPC, SectionIndex)) {
+        auto Ranges = DIE.getAddressRanges();
+        // Not a function definition.
+        if (Ranges.empty())
+          break;
+
+        Address = Ranges.front().LowPC;
+        UsesRanges = true;
+      }
+
+      IsFunctionDef = true;
+      const auto *Function = BC.getBinaryFunctionAtAddress(Address);
+      if (Function && Function->isFolded())
+        Function = nullptr;
+      FunctionStack.push_back(Function);
+
+      DebugAddressRangesVector FunctionRanges;
+      if (Function)
+        FunctionRanges = Function->getOutputAddressRanges();
+
+      // Update ranges.
+      if (UsesRanges) {
+        updateDWARFObjectAddressRanges(DIE,
+            RangesSectionsWriter->addRanges(FunctionRanges));
+      } else {
+        // Delay conversion of [LowPC, HighPC) into DW_AT_ranges if possible.
+        const auto *Abbrev = DIE.getAbbreviationDeclarationPtr();
+        assert(Abbrev && "abbrev expected");
+
+        // Create a critical section.
+        static std::shared_timed_mutex CriticalSectionMutex;
+        std::unique_lock<std::shared_timed_mutex> Lock(CriticalSectionMutex);
+
+        if (FunctionRanges.size() > 1) {
+          convertPending(Abbrev);
+          // Exit critical section early.
+          Lock.unlock();
+          convertToRanges(DIE, FunctionRanges);
+        } else if (ConvertedRangesAbbrevs.find(Abbrev) !=
+                   ConvertedRangesAbbrevs.end()) {
+          // Exit critical section early.
+          Lock.unlock();
+          convertToRanges(DIE, FunctionRanges);
+        } else {
+          if (FunctionRanges.empty())
+            FunctionRanges.emplace_back(DebugAddressRange());
+          PendingRanges[Abbrev].emplace_back(
+              std::make_pair(DIE, FunctionRanges.front()));
        }
-        FunctionStack.push_back(Function);
-        auto RangesSectionOffset =
-          RangesSectionsWriter->getEmptyRangesOffset();
-        if (Function) {
-          auto FunctionRanges = Function->getOutputAddressRanges();
-          RangesSectionOffset =
-            RangesSectionsWriter->addRanges(Function,
-                                            std::move(FunctionRanges));
-        }
-        updateDWARFObjectAddressRanges(DIE, RangesSectionOffset);
      }
    }
    break;
@ -136,8 +199,8 @@ void RewriteInstance::updateUnitDebugInfo(
                   << Twine::utohexstr(DIE.getDwarfUnit()->getOffset()) << '\n';
          }
        );
-        RangesSectionOffset =
-          RangesSectionsWriter->addRanges(Function, std::move(OutputRanges));
+        RangesSectionOffset = RangesSectionsWriter->addRanges(
+            Function, std::move(OutputRanges), CachedFunction, CachedRanges);
      }
      updateDWARFObjectAddressRanges(DIE, RangesSectionOffset);
    }
@ -186,9 +249,7 @@ void RewriteInstance::updateUnitDebugInfo(
            }
          }

-          auto DebugInfoPatcher =
-              static_cast<SimpleBinaryPatcher *>(
-                  SectionPatchers[".debug_info"].get());
+          std::lock_guard<std::mutex> Lock(DebugInfoPatcherMutex);
          DebugInfoPatcher->addLE32Patch(AttrOffset, LocListSectionOffset);
        } else {
          assert((Value.isFormClass(DWARFFormValue::FC_Exprloc) ||
@ -208,9 +269,8 @@ void RewriteInstance::updateUnitDebugInfo(
                         << " for DIE with tag " << DIE.getTag()
                         << " to 0x" << Twine::utohexstr(NewAddress) << '\n');
          }
-          auto DebugInfoPatcher =
-              static_cast<SimpleBinaryPatcher *>(
-                  SectionPatchers[".debug_info"].get());
+
+          std::lock_guard<std::mutex> Lock(DebugInfoPatcherMutex);
          DebugInfoPatcher->addLE64Patch(AttrOffset, NewAddress);
        } else if (opts::Verbosity >= 1) {
          errs() << "BOLT-WARNING: unexpected form value for attribute at 0x"
@ -222,14 +282,14 @@ void RewriteInstance::updateUnitDebugInfo(

  // Recursively update each child.
  for (auto Child = DIE.getFirstChild(); Child; Child = Child.getSibling()) {
-    updateUnitDebugInfo(Child, FunctionStack);
+    updateUnitDebugInfo(Child, FunctionStack, CachedFunction, CachedRanges);
  }

  if (IsFunctionDef)
    FunctionStack.pop_back();
 }

-void RewriteInstance::updateDWARFObjectAddressRanges(
+void DWARFRewriter::updateDWARFObjectAddressRanges(
    const DWARFDie DIE, uint64_t DebugRangesOffset) {

  // Some objects don't have an associated DIE and cannot be updated (such as
@ -239,17 +299,10 @@ void RewriteInstance::updateDWARFObjectAddressRanges(
  }

  if (opts::Verbosity >= 2 && DebugRangesOffset == -1U) {
-    errs() << "BOLT-WARNING: using invalid DW_AT_range for DIE at offset 0x"
+    errs() << "BOLT-WARNING: using invalid DW_AT_ranges for DIE at offset 0x"
           << Twine::utohexstr(DIE.getOffset()) << '\n';
  }

-  auto DebugInfoPatcher =
-      static_cast<SimpleBinaryPatcher *>(SectionPatchers[".debug_info"].get());
-  auto AbbrevPatcher =
-      static_cast<DebugAbbrevPatcher*>(SectionPatchers[".debug_abbrev"].get());
-
-  assert(DebugInfoPatcher && AbbrevPatcher && "Patchers not initialized.");
-
  const auto *AbbreviationDecl = DIE.getAbbreviationDeclarationPtr();
  if (!AbbreviationDecl) {
    if (opts::Verbosity >= 1) {
@ -260,14 +313,14 @@ void RewriteInstance::updateDWARFObjectAddressRanges(
    return;
  }

-  auto AbbrevCode = AbbreviationDecl->getCode();
-
  if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_ranges)) {
    // Case 1: The object was already non-contiguous and had DW_AT_ranges.
    // In this case we simply need to update the value of DW_AT_ranges.
    uint32_t AttrOffset = -1U;
    DIE.find(dwarf::DW_AT_ranges, &AttrOffset);
    assert(AttrOffset != -1U &&  "failed to locate DWARF attribute");
+
+    std::lock_guard<std::mutex> Lock(DebugInfoPatcherMutex);
    DebugInfoPatcher->addLE32Patch(AttrOffset, DebugRangesOffset);
  } else {
    // Case 2: The object has both DW_AT_low_pc and DW_AT_high_pc emitted back
@ -284,50 +337,8 @@ void RewriteInstance::updateDWARFObjectAddressRanges(
    // large size.
    if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_low_pc) &&
        AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_high_pc)) {
-      uint32_t LowPCOffset = -1U;
-      uint32_t HighPCOffset = -1U;
-      DWARFFormValue LowPCFormValue =
-          *DIE.find(dwarf::DW_AT_low_pc, &LowPCOffset);
-      DWARFFormValue HighPCFormValue =
-          *DIE.find(dwarf::DW_AT_high_pc, &HighPCOffset);
-
-      if (LowPCFormValue.getForm() != dwarf::DW_FORM_addr ||
-          (HighPCFormValue.getForm() != dwarf::DW_FORM_addr &&
-           HighPCFormValue.getForm() != dwarf::DW_FORM_data8 &&
-           HighPCFormValue.getForm() != dwarf::DW_FORM_data4)) {
-        errs() << "BOLT-WARNING: unexpected form value. Cannot update DIE "
-                 << "at offset 0x" << Twine::utohexstr(DIE.getOffset())
-                 << "\n";
-        return;
-      }
-      if (LowPCOffset == -1U || (LowPCOffset + 8 != HighPCOffset)) {
-        errs() << "BOLT-WARNING: high_pc expected immediately after low_pc. "
-               << "Cannot update DIE at offset 0x"
-               << Twine::utohexstr(DIE.getOffset()) << '\n';
-        return;
-      }
-
-      AbbrevPatcher->addAttributePatch(DIE.getDwarfUnit(),
-                                       AbbrevCode,
-                                       dwarf::DW_AT_low_pc,
-                                       dwarf::DW_AT_ranges,
-                                       dwarf::DW_FORM_sec_offset);
-      AbbrevPatcher->addAttributePatch(DIE.getDwarfUnit(),
-                                       AbbrevCode,
-                                       dwarf::DW_AT_high_pc,
-                                       dwarf::DW_AT_low_pc,
-                                       dwarf::DW_FORM_udata);
-      unsigned LowPCSize = 0;
-      if (HighPCFormValue.getForm() == dwarf::DW_FORM_addr ||
-          HighPCFormValue.getForm() == dwarf::DW_FORM_data8) {
-        LowPCSize = 12;
-      } else if (HighPCFormValue.getForm() == dwarf::DW_FORM_data4) {
-        LowPCSize = 8;
-      } else {
-        llvm_unreachable("unexpected form");
-      }
-      DebugInfoPatcher->addLE32Patch(LowPCOffset, DebugRangesOffset);
-      DebugInfoPatcher->addUDataPatch(LowPCOffset + 4, 0, LowPCSize);
+      convertToRanges(AbbreviationDecl);
+      convertToRanges(DIE, DebugRangesOffset);
    } else {
      if (opts::Verbosity >= 1) {
        errs() << "BOLT-WARNING: Cannot update ranges for DIE at offset 0x"
@ -337,8 +348,8 @@ void RewriteInstance::updateDWARFObjectAddressRanges(
  }
 }

-void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() {
-  for (auto &It : BinaryFunctions) {
+void DWARFRewriter::updateDebugLineInfoForNonSimpleFunctions() {
+  for (auto &It : BC.getBinaryFunctions()) {
    const auto &Function = It.second;

    if (Function.isSimple())
@ -353,7 +364,7 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() {

    std::vector<uint32_t> Results;
    MCSectionELF *FunctionSection =
-        BC->Ctx->getELFSection(Function.getCodeSectionName(),
+        BC.Ctx->getELFSection(Function.getCodeSectionName(),
                               ELF::SHT_PROGBITS,
                               ELF::SHF_EXECINSTR | ELF::SHF_ALLOC);

@ -361,10 +372,10 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() {
    if (LineTable->lookupAddressRange(Address, Function.getMaxSize(),
                                      Results)) {
      auto &OutputLineTable =
-          BC->Ctx->getMCDwarfLineTable(Unit->getOffset()).getMCLineSections();
+          BC.Ctx->getMCDwarfLineTable(Unit->getOffset()).getMCLineSections();
      for (auto RowIndex : Results) {
        const auto &Row = LineTable->Rows[RowIndex];
-        BC->Ctx->setCurrentDwarfLoc(
+        BC.Ctx->setCurrentDwarfLoc(
            Row.File,
            Row.Line,
            Row.Column,
@ -375,17 +386,17 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() {
            Row.Isa,
            Row.Discriminator,
            Row.Address);
-        auto Loc = BC->Ctx->getCurrentDwarfLoc();
-        BC->Ctx->clearDwarfLocSeen();
+        auto Loc = BC.Ctx->getCurrentDwarfLoc();
+        BC.Ctx->clearDwarfLocSeen();
        OutputLineTable.addLineEntry(MCDwarfLineEntry{nullptr, Loc},
                                     FunctionSection);
      }
      // Add an empty entry past the end of the function
      // for end_sequence mark.
-      BC->Ctx->setCurrentDwarfLoc(0, 0, 0, 0, 0, 0,
+      BC.Ctx->setCurrentDwarfLoc(0, 0, 0, 0, 0, 0,
                                  Address + Function.getMaxSize());
-      auto Loc = BC->Ctx->getCurrentDwarfLoc();
-      BC->Ctx->clearDwarfLocSeen();
+      auto Loc = BC.Ctx->getCurrentDwarfLoc();
+      BC.Ctx->clearDwarfLocSeen();
      OutputLineTable.addLineEntry(MCDwarfLineEntry{nullptr, Loc},
                                   FunctionSection);
    } else {
@ -395,9 +406,9 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() {
  }
 }

-void RewriteInstance::updateLineTableOffsets() {
+void DWARFRewriter::updateLineTableOffsets() {
  const auto *LineSection =
-    BC->Ctx->getObjectFileInfo()->getDwarfLineSection();
+    BC.Ctx->getObjectFileInfo()->getDwarfLineSection();
  auto CurrentFragment = LineSection->begin();
  uint32_t CurrentOffset = 0;
  uint32_t Offset = 0;
@ -406,7 +417,7 @@ void RewriteInstance::updateLineTableOffsets() {
  // output file, thus we can compute all table's offset by passing through
  // each fragment at most once, continuing from the last CU's beginning
  // instead of from the first fragment.
-  for (const auto &CUIDLineTablePair : BC->Ctx->getMCDwarfLineTables()) {
+  for (const auto &CUIDLineTablePair : BC.Ctx->getMCDwarfLineTables()) {
    auto Label = CUIDLineTablePair.second.getLabel();
    if (!Label)
      continue;
@ -415,10 +426,10 @@ void RewriteInstance::updateLineTableOffsets() {
    if (CUOffset == -1U)
      continue;

-    auto *CU = BC->DwCtx->getCompileUnitForOffset(CUOffset);
+    auto *CU = BC.DwCtx->getCompileUnitForOffset(CUOffset);
    assert(CU && "no CU found at offset");
    auto LTOffset =
-      BC->DwCtx->getAttrFieldOffsetForUnit(CU, dwarf::DW_AT_stmt_list);
+      BC.DwCtx->getAttrFieldOffsetForUnit(CU, dwarf::DW_AT_stmt_list);
    if (!LTOffset)
      continue;

@ -444,9 +455,9 @@ void RewriteInstance::updateLineTableOffsets() {
    Offset += Label->getOffset() - CurrentOffset;
    CurrentOffset = Label->getOffset();

-    auto DbgInfoSection = BC->getUniqueSectionByName(".debug_info");
+    auto DbgInfoSection = BC.getUniqueSectionByName(".debug_info");
    assert(DbgInfoSection && ".debug_info section must exist");
-    auto *Zero = BC->registerNameAtAddress("Zero", 0, 0, 0);
+    auto *Zero = BC.registerNameAtAddress("Zero", 0, 0, 0);
    DbgInfoSection->addRelocation(LTOffset,
                                  Zero,
                                  ELF::R_X86_64_32,
@ -463,43 +474,43 @@ void RewriteInstance::updateLineTableOffsets() {
  }
 }

-void RewriteInstance::finalizeDebugSections() {
+void DWARFRewriter::finalizeDebugSections() {
  // Skip .debug_aranges if we are re-generating .gdb_index.
-  if (opts::KeepARanges || !GdbIndexSection) {
+  if (opts::KeepARanges || !BC.getGdbIndexSection()) {
    SmallVector<char, 16> ARangesBuffer;
    raw_svector_ostream OS(ARangesBuffer);

-    auto MAB = std::unique_ptr<MCAsmBackend>(BC->TheTarget->createMCAsmBackend(
-        *BC->STI, *BC->MRI, MCTargetOptions()));
+    auto MAB = std::unique_ptr<MCAsmBackend>(BC.TheTarget->createMCAsmBackend(
+        *BC.STI, *BC.MRI, MCTargetOptions()));
    auto Writer = std::unique_ptr<MCObjectWriter>(MAB->createObjectWriter(OS));

    RangesSectionsWriter->writeArangesSection(Writer.get());
    const auto &ARangesContents = OS.str();

-    BC->registerOrUpdateNoteSection(".debug_aranges",
+    BC.registerOrUpdateNoteSection(".debug_aranges",
                                    copyByteArray(ARangesContents),
                                    ARangesContents.size());
  }

  auto RangesSectionContents = RangesSectionsWriter->finalize();
-  BC->registerOrUpdateNoteSection(".debug_ranges",
+  BC.registerOrUpdateNoteSection(".debug_ranges",
                                  copyByteArray(*RangesSectionContents),
                                  RangesSectionContents->size());

  auto LocationListSectionContents = LocationListWriter->finalize();
-  BC->registerOrUpdateNoteSection(".debug_loc",
+  BC.registerOrUpdateNoteSection(".debug_loc",
                                  copyByteArray(*LocationListSectionContents),
                                  LocationListSectionContents->size());
 }

-void RewriteInstance::updateGdbIndexSection() {
-  if (!GdbIndexSection)
+void DWARFRewriter::updateGdbIndexSection() {
+  if (!BC.getGdbIndexSection())
    return;

  // See https://sourceware.org/gdb/onlinedocs/gdb/Index-Section-Format.html for
  // .gdb_index section format.

-  StringRef GdbIndexContents = GdbIndexSection->getContents();
+  StringRef GdbIndexContents = BC.getGdbIndexSection()->getContents();

  const auto *Data = GdbIndexContents.data();

@ -523,13 +534,13 @@ void RewriteInstance::updateGdbIndexSection() {
  // Map CUs offsets to indices and verify existing index table.
  std::map<uint32_t, uint32_t> OffsetToIndexMap;
  const auto CUListSize = CUTypesOffset - CUListOffset;
-  const auto NumCUs = BC->DwCtx->getNumCompileUnits();
+  const auto NumCUs = BC.DwCtx->getNumCompileUnits();
  if (CUListSize != NumCUs * 16) {
    errs() << "BOLT-ERROR: .gdb_index: CU count mismatch\n";
    exit(1);
  }
  for (unsigned Index = 0; Index < NumCUs; ++Index, Data += 16) {
-    const auto *CU = BC->DwCtx->getCompileUnitAtIndex(Index);
+    const auto *CU = BC.DwCtx->getCompileUnitAtIndex(Index);
    const auto Offset = read64le(Data);
    if (CU->getOffset() != Offset) {
      errs() << "BOLT-ERROR: .gdb_index CU offset mismatch\n";
@ -595,7 +606,123 @@ void RewriteInstance::updateGdbIndexSection() {
  memcpy(Buffer, Data, TrailingSize);

  // Register the new section.
-  BC->registerOrUpdateNoteSection(".gdb_index",
+  BC.registerOrUpdateNoteSection(".gdb_index",
                                  NewGdbIndexContents,
                                  NewGdbIndexSize);
 }
+
+/// Queue the abbreviation patches that switch \p Abbrev from a
+/// DW_AT_low_pc/DW_AT_high_pc pair to DW_AT_ranges.
+void
+DWARFRewriter::convertToRanges(const DWARFAbbreviationDeclaration *Abbrev) {
+  // Serialize access to the abbrev patcher for both patches.
+  std::lock_guard<std::mutex> Guard(AbbrevPatcherMutex);
+
+  auto &Patcher = *AbbrevPatcher;
+
+  // Patch arguments: DW_AT_low_pc, DW_AT_ranges, DW_FORM_sec_offset.
+  Patcher.addAttributePatch(
+      Abbrev, dwarf::DW_AT_low_pc, dwarf::DW_AT_ranges,
+      dwarf::DW_FORM_sec_offset);
+
+  // Patch arguments: DW_AT_high_pc, DW_AT_low_pc, DW_FORM_udata.
+  Patcher.addAttributePatch(
+      Abbrev, dwarf::DW_AT_high_pc, dwarf::DW_AT_low_pc, dwarf::DW_FORM_udata);
+}
+
+/// Convert \p DIE to DW_AT_ranges using \p Ranges: an empty vector maps to the
+/// writer's empty-ranges offset, anything else is added to the ranges writer
+/// and the returned offset is used.
+void DWARFRewriter::convertToRanges(DWARFDie DIE,
+                                    const DebugAddressRangesVector &Ranges) {
+  const uint64_t RangesSectionOffset =
+      Ranges.empty() ? RangesSectionsWriter->getEmptyRangesOffset()
+                     : RangesSectionsWriter->addRanges(Ranges);
+
+  convertToRanges(DIE, RangesSectionOffset);
+}
+
+/// Convert \p Abbrev to DW_AT_ranges (once) together with every DIE that was
+/// queued in PendingRanges for it, then remember the abbrev as converted.
+/// This function does not lock ConvertedRangesAbbrevs or PendingRanges itself.
+void DWARFRewriter::convertPending(const DWARFAbbreviationDeclaration *Abbrev) {
+  // Nothing to do if this abbrev has been converted before.
+  if (ConvertedRangesAbbrevs.find(Abbrev) != ConvertedRangesAbbrevs.end())
+    return;
+
+  convertToRanges(Abbrev);
+
+  // DIEs that were waiting on this abbrev now have to use DW_AT_ranges too.
+  auto PendingIt = PendingRanges.find(Abbrev);
+  if (PendingIt != PendingRanges.end()) {
+    for (auto &DIEAndRange : PendingIt->second)
+      convertToRanges(DIEAndRange.first, {DIEAndRange.second});
+    PendingRanges.erase(PendingIt);
+  }
+
+  ConvertedRangesAbbrevs.emplace(Abbrev);
+}
+
+/// Patch DW_AT_low_pc/DW_AT_high_pc for every DIE still recorded in
+/// PendingRanges (entries of converted abbrevs are erased by convertPending).
+void DWARFRewriter::flushPendingRanges() {
+  for (auto &AbbrevEntry : PendingRanges)
+    for (auto &DIEAndRange : AbbrevEntry.second)
+      patchLowHigh(DIEAndRange.first, DIEAndRange.second);
+}
+
+namespace {
+
+/// Locate DW_AT_low_pc and DW_AT_high_pc of \p DIE and validate them.
+///
+/// \p LowPCOffset and \p HighPCOffset receive the attribute offsets reported
+/// by DWARFDie::find(); \p LowPCFormValue and \p HighPCFormValue receive the
+/// attribute values.
+///
+/// \returns true when both attributes exist, low_pc is DW_FORM_addr, high_pc
+/// is DW_FORM_addr, DW_FORM_data8 or DW_FORM_data4, and high_pc is located
+/// 8 bytes after low_pc. Otherwise a warning is printed and false is
+/// returned, in which case the caller must not patch the DIE.
+bool getRangeAttrData(
+    DWARFDie DIE,
+    uint32_t &LowPCOffset, uint32_t &HighPCOffset,
+    DWARFFormValue &LowPCFormValue, DWARFFormValue &HighPCFormValue) {
+  LowPCOffset = -1U;
+  HighPCOffset = -1U;
+  const auto LowPCValue = DIE.find(dwarf::DW_AT_low_pc, &LowPCOffset);
+  const auto HighPCValue = DIE.find(dwarf::DW_AT_high_pc, &HighPCOffset);
+
+  // Do not dereference an empty result: the DIE may lack either attribute.
+  if (!LowPCValue || !HighPCValue) {
+    errs() << "BOLT-WARNING: missing DW_AT_low_pc or DW_AT_high_pc. "
+           << "Cannot update DIE at offset 0x"
+           << Twine::utohexstr(DIE.getOffset()) << '\n';
+    return false;
+  }
+  LowPCFormValue = *LowPCValue;
+  HighPCFormValue = *HighPCValue;
+
+  if (LowPCFormValue.getForm() != dwarf::DW_FORM_addr ||
+      (HighPCFormValue.getForm() != dwarf::DW_FORM_addr &&
+       HighPCFormValue.getForm() != dwarf::DW_FORM_data8 &&
+       HighPCFormValue.getForm() != dwarf::DW_FORM_data4)) {
+    errs() << "BOLT-WARNING: unexpected form value. Cannot update DIE "
+             << "at offset 0x" << Twine::utohexstr(DIE.getOffset()) << "\n";
+    return false;
+  }
+  if (LowPCOffset == -1U || (LowPCOffset + 8 != HighPCOffset)) {
+    errs() << "BOLT-WARNING: high_pc expected immediately after low_pc. "
+           << "Cannot update DIE at offset 0x"
+           << Twine::utohexstr(DIE.getOffset()) << '\n';
+    return false;
+  }
+  return true;
+}
+
+}
+
+/// Patch the DW_AT_low_pc/DW_AT_high_pc values of \p DIE to describe \p Range.
+/// The DIE is left untouched (a warning was already printed) when its
+/// attributes cannot be located or have an unsupported layout.
+void DWARFRewriter::patchLowHigh(DWARFDie DIE, DebugAddressRange Range) {
+  uint32_t LowPCOffset, HighPCOffset;
+  DWARFFormValue LowPCFormValue, HighPCFormValue;
+  if (!getRangeAttrData(
+          DIE, LowPCOffset, HighPCOffset, LowPCFormValue, HighPCFormValue))
+    return;
+
+  DebugInfoPatcher->addLE64Patch(LowPCOffset, Range.LowPC);
+
+  const auto HighPCForm = HighPCFormValue.getForm();
+  if (HighPCForm == dwarf::DW_FORM_addr) {
+    // Address class: DW_AT_high_pc holds the absolute end address.
+    DebugInfoPatcher->addLE64Patch(HighPCOffset, Range.HighPC);
+  } else if (HighPCForm == dwarf::DW_FORM_data8) {
+    // Constant class: DW_AT_high_pc is an offset from DW_AT_low_pc.
+    DebugInfoPatcher->addLE64Patch(HighPCOffset, Range.HighPC - Range.LowPC);
+  } else {
+    // DW_FORM_data4: 32-bit offset from DW_AT_low_pc.
+    DebugInfoPatcher->addLE32Patch(HighPCOffset, Range.HighPC - Range.LowPC);
+  }
+}
+
+/// Overwrite the DW_AT_low_pc/DW_AT_high_pc bytes of \p DIE with a 32-bit
+/// \p RangesSectionOffset followed by a padded udata zero, matching the
+/// abbrev patches queued by convertToRanges(Abbrev).
+/// The DIE is left untouched (a warning was already printed) when its
+/// attributes cannot be located or have an unsupported layout.
+void DWARFRewriter::convertToRanges(DWARFDie DIE,
+                                    uint64_t RangesSectionOffset) {
+  uint32_t LowPCOffset, HighPCOffset;
+  DWARFFormValue LowPCFormValue, HighPCFormValue;
+  if (!getRangeAttrData(
+          DIE, LowPCOffset, HighPCOffset, LowPCFormValue, HighPCFormValue))
+    return;
+
+  // Bytes left for the udata value after the 4-byte ranges offset:
+  // 8 (low_pc) + size of high_pc - 4.
+  unsigned LowPCSize = 0;
+  if (HighPCFormValue.getForm() == dwarf::DW_FORM_addr ||
+      HighPCFormValue.getForm() == dwarf::DW_FORM_data8) {
+    LowPCSize = 12;
+  } else if (HighPCFormValue.getForm() == dwarf::DW_FORM_data4) {
+    LowPCSize = 8;
+  } else {
+    // getRangeAttrData() accepted the form, so no other value is possible.
+    llvm_unreachable("unexpected form");
+  }
+
+  std::lock_guard<std::mutex> Lock(DebugInfoPatcherMutex);
+  DebugInfoPatcher->addLE32Patch(LowPCOffset, RangesSectionOffset);
+  DebugInfoPatcher->addUDataPatch(LowPCOffset + 4, 0, LowPCSize);
+}
+
--- a/src/DWARFRewriter.h
+++ b/src/DWARFRewriter.h
@ -0,0 +1,125 @@
+//===--- DWARFRewriter.h --------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_DWARF_REWRITER_H
+#define LLVM_TOOLS_LLVM_BOLT_DWARF_REWRITER_H
+
+#include "DebugData.h"
+#include "RewriteInstance.h"
+#include <map>
+#include <mutex>
+
+namespace llvm {
+
+namespace bolt {
+
+class BinaryFunction;
+
+/// Updates DWARF debug information for a binary. Holds a reference to the
+/// BinaryContext and to the section patchers passed in at construction.
+class DWARFRewriter {
+  DWARFRewriter() = delete;
+
+  /// Context used to look up functions, sections and DWARF units.
+  BinaryContext &BC;
+
+  using SectionPatchersType = RewriteInstance::SectionPatchersType;
+
+  SectionPatchersType &SectionPatchers;
+
+  /// Patcher used for edits to DIE attribute values.
+  /// NOTE(review): presumably the ".debug_info" entry of SectionPatchers; the
+  /// assignment is not visible in this header -- confirm.
+  SimpleBinaryPatcher *DebugInfoPatcher{nullptr};
+
+  /// Held around DebugInfoPatcher calls made while updating DIEs.
+  std::mutex DebugInfoPatcherMutex;
+
+  /// Patcher used for abbreviation attribute patches.
+  DebugAbbrevPatcher *AbbrevPatcher{nullptr};
+
+  /// Held around AbbrevPatcher calls.
+  std::mutex AbbrevPatcherMutex;
+
+  /// Stores and serializes information that will be put into the .debug_ranges
+  /// and .debug_aranges DWARF sections.
+  std::unique_ptr<DebugRangesSectionsWriter> RangesSectionsWriter;
+
+  /// Writer whose finalized contents become the .debug_loc section.
+  std::unique_ptr<DebugLocWriter> LocationListWriter;
+
+  /// Recursively update debug info for \p DIE and all of its children.
+  /// \p FunctionStack holds the functions for the enclosing function
+  /// definition DIEs; an entry is null when no usable function was found.
+  /// \p CachedFunction and \p CachedRanges are forwarded to the ranges writer
+  /// when ranges are added.
+  void updateUnitDebugInfo(
+      const DWARFDie DIE, std::vector<const BinaryFunction *> FunctionStack,
+      const BinaryFunction *&CachedFunction,
+      std::map<DebugAddressRangesVector, uint64_t> &CachedRanges);
+
+  /// Patches the binary for an object's address ranges to be updated.
+  /// The object can be anything that has associated address ranges via either
+  /// DW_AT_low/high_pc or DW_AT_ranges (i.e. functions, lexical blocks, etc).
+  /// \p DebugRangesOffset is the offset in .debug_ranges of the object's
+  /// new address ranges in the output binary.
+  /// \p DIE is the object's DIE in the input binary.
+  void updateDWARFObjectAddressRanges(const DWARFDie DIE,
+                                      uint64_t DebugRangesOffset);
+
+  /// Generate new contents for the .debug_ranges and .debug_loc sections, and
+  /// for .debug_aranges unless it is skipped because .gdb_index is
+  /// re-generated.
+  void finalizeDebugSections();
+
+  /// Patches the binary for DWARF address ranges (e.g. in functions and lexical
+  /// blocks) to be updated.
+  void updateDebugAddressRanges();
+
+  /// Rewrite .gdb_index section if present.
+  void updateGdbIndexSection();
+
+  /// Abbreviations that were converted to use DW_AT_ranges.
+  std::set<const DWARFAbbreviationDeclaration *> ConvertedRangesAbbrevs;
+
+  /// DIEs with abbrevs that were not converted to DW_AT_ranges.
+  /// We only update those when all DIEs have been processed to guarantee that
+  /// the abbrev (which is shared) is intact.
+  std::map<const DWARFAbbreviationDeclaration *,
+           std::vector<std::pair<DWARFDie, DebugAddressRange>>> PendingRanges;
+
+  /// Convert \p Abbrev from using a simple DW_AT_(low|high)_pc range to
+  /// DW_AT_ranges.
+  void convertToRanges(const DWARFAbbreviationDeclaration *Abbrev);
+
+  /// Update \p DIE that was using DW_AT_(low|high)_pc with DW_AT_ranges offset.
+  void convertToRanges(DWARFDie DIE, uint64_t RangesSectionOffset);
+
+  /// Same as above, but takes a vector of \p Ranges as a parameter.
+  void convertToRanges(DWARFDie DIE, const DebugAddressRangesVector &Ranges);
+
+  /// Patch DW_AT_(low|high)_pc values for the \p DIE based on \p Range.
+  void patchLowHigh(DWARFDie DIE, DebugAddressRange Range);
+
+  /// Convert pending ranges associated with the given \p Abbrev.
+  void convertPending(const DWARFAbbreviationDeclaration *Abbrev);
+
+  /// Once all DIEs were seen, update DW_AT_(low|high)_pc values.
+  void flushPendingRanges();
+
+public:
+  /// Bind the rewriter to \p BC and \p SectionPatchers; both are stored by
+  /// reference.
+  DWARFRewriter(BinaryContext &BC,
+                SectionPatchersType &SectionPatchers)
+    : BC(BC), SectionPatchers(SectionPatchers) {}
+
+  /// Main function for updating the DWARF debug info.
+  void updateDebugInfo();
+
+  /// Computes output .debug_line line table offsets for each compile unit,
+  /// and updates stmt_list for a corresponding compile unit.
+  void updateLineTableOffsets();
+
+  /// Updates debug line information for non-simple functions, which are not
+  /// rewritten.
+  void updateDebugLineInfoForNonSimpleFunctions();
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
--- a/src/DataAggregator.cpp
+++ b/src/DataAggregator.cpp
@ -14,6 +14,7 @@

 #include "BinaryContext.h"
 #include "BinaryFunction.h"
+#include "BoltAddressTranslation.h"
 #include "DataAggregator.h"
 #include "Heatmap.h"
 #include "llvm/Support/Debug.h"
@ -54,6 +55,13 @@ IgnoreBuildID("ignore-build-id",
  cl::init(false),
  cl::cat(AggregatorCategory));

+static cl::opt<bool>
+FilterMemProfile("filter-mem-profile",
+  cl::desc("if processing a memory profile, filter out stack or heap accesses that "
+           "won't be useful for BOLT to reduce profile file size"),
+  cl::init(true),
+  cl::cat(AggregatorCategory));
+
 static cl::opt<unsigned>
 HeatmapBlock("block-size",
  cl::desc("size of a heat map block in bytes (default 64)"),
@ -88,6 +96,13 @@ TimeAggregator("time-aggr",
  cl::ZeroOrMore,
  cl::cat(AggregatorCategory));

+static cl::opt<bool>
+UseEventPC("use-event-pc",
+  cl::desc("use event PC in combination with LBR sampling"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(AggregatorCategory));
+
 static cl::opt<bool>
 WriteAutoFDOData("autofdo",
  cl::desc("generate autofdo textual data instead of bolt data"),
@ -210,6 +225,7 @@ void DataAggregator::launchPerfProcess(StringRef Name, PerfProcessInfo &PPI,
    *Str++ = 0;
  } while (true);

+  Argv.push_back("-f");
  Argv.push_back("-i");
  Argv.push_back(PerfDataFilename.data());
  Argv.push_back(nullptr);
@ -232,13 +248,18 @@ void DataAggregator::launchPerfProcess(StringRef Name, PerfProcessInfo &PPI,
  TempFiles.push_back(PPI.StderrPath.data());

  Optional<StringRef> Redirects[] = {
-      llvm::None,                       // Stdin
+      llvm::None,                        // Stdin
      StringRef(PPI.StdoutPath.data()),  // Stdout
      StringRef(PPI.StderrPath.data())}; // Stderr

-  DEBUG(dbgs() << "Launching perf: " << PerfPath.data() << " 1> "
-               << PPI.StdoutPath.data() << " 2> "
-               << PPI.StderrPath.data() << "\n");
+  DEBUG({
+      dbgs() << "Launching perf: ";
+      for (const char *Arg : Argv)
+        dbgs() << Arg << " ";
+      dbgs() << " 1> "
+             << PPI.StdoutPath.data() << " 2> "
+             << PPI.StderrPath.data() << "\n";
+    });

  if (Wait) {
    PPI.PI.ReturnCode =
@ -422,11 +443,8 @@ std::error_code DataAggregator::writeAutoFDOData() {
  return std::error_code();
 }

-void DataAggregator::parseProfile(
-    BinaryContext &BC,
-    std::map<uint64_t, BinaryFunction> &BFs) {
+void DataAggregator::parseProfile(BinaryContext &BC) {
  this->BC = &BC;
-  this->BFs = &BFs;

  if (opts::ReadPreAggregated) {
    parsePreAggregated();
@ -546,9 +564,7 @@ void DataAggregator::parseProfile(
  deleteTempFiles();
 }

-void DataAggregator::processProfile(
-    BinaryContext &BC,
-    std::map<uint64_t, BinaryFunction> &BFs) {
+void DataAggregator::processProfile(BinaryContext &BC) {
  if (opts::ReadPreAggregated)
    processPreAggregated();
  else if (opts::BasicAggregation)
@ -559,7 +575,7 @@ void DataAggregator::processProfile(
  processMemEvents();

  // Mark all functions with registered events as having a valid profile.
-  for (auto &BFI : BFs) {
+  for (auto &BFI : BC.getBinaryFunctions()) {
    auto &BF = BFI.second;
    if (BF.getBranchData()) {
      const auto Flags = opts::BasicAggregation ? BinaryFunction::PF_SAMPLE
@ -577,19 +593,46 @@ void DataAggregator::processProfile(
 }

 BinaryFunction *
-DataAggregator::getBinaryFunctionContainingAddress(uint64_t Address) {
+DataAggregator::getBinaryFunctionContainingAddress(uint64_t Address) const {
  if (!BC->containsAddress(Address))
    return nullptr;

-  auto FI = BFs->upper_bound(Address);
-  if (FI == BFs->begin())
-    return nullptr;
-  --FI;
+  // Use shallow search to avoid fetching the parent function, in case
+  // BinaryContext linked two functions. When aggregating data and writing the
+  // profile, we want to write offsets relative to the closest symbol in the
+  // symbol table, not relative to the parent function, to avoid creating
+  // profile that is too fragile and depends on the layout of other functions.
+  return BC->getBinaryFunctionContainingAddress(Address, /*CheckPastEnd=*/false,
+                                                /*UseMaxSize=*/true,
+                                                /*Shallow=*/true);
+}

-  const auto UsedSize = FI->second.getMaxSize();
-  if (Address >= FI->first + UsedSize)
-    return nullptr;
-  return &FI->second;
+/// Pick the name under which samples for \p Func are recorded.
+///
+/// Without BAT this is simply the first name of \p Func. With BAT, if a
+/// parent address is found for \p Func, \p Count is added to NumColdSamples
+/// and the function containing that address (when found) supplies the names.
+/// Among those names, the first one with at least two '/' separators is
+/// preferred; otherwise the first name is returned.
+StringRef DataAggregator::getLocationName(BinaryFunction &Func,
+                                          uint64_t Count) {
+  if (!BAT)
+    return Func.getNames()[0];
+
+  const BinaryFunction *NameSource = &Func;
+  if (const auto ParentAddr = BAT->fetchParentAddress(Func.getAddress())) {
+    NumColdSamples += Count;
+    if (auto *Parent = getBinaryFunctionContainingAddress(ParentAddr))
+      NameSource = Parent;
+  }
+
+  const auto &AllNames = NameSource->getNames();
+  // For a local function, prefer the name that embeds the file where it was
+  // declared, i.e. one matching the pattern Symbol/FileName/1.
+  for (const auto &Candidate : AllNames) {
+    const StringRef Ref = Candidate;
+    const size_t FirstSlash = Ref.find('/');
+    if (FirstSlash != StringRef::npos &&
+        Ref.find('/', FirstSlash + 1) != StringRef::npos)
+      return Ref;
+  }
+  return AllNames[0];
 }

 bool DataAggregator::doSample(BinaryFunction &Func, uint64_t Address,
@ -597,12 +640,17 @@ bool DataAggregator::doSample(BinaryFunction &Func, uint64_t Address,
  auto I = FuncsToSamples.find(Func.getNames()[0]);
  if (I == FuncsToSamples.end()) {
    bool Success;
+    StringRef LocName = getLocationName(Func, Count);
    std::tie(I, Success) = FuncsToSamples.insert(std::make_pair(
        Func.getNames()[0],
-        FuncSampleData(Func.getNames()[0], FuncSampleData::ContainerTy())));
+        FuncSampleData(LocName, FuncSampleData::ContainerTy())));
  }

-  I->second.bumpCount(Address - Func.getAddress(), Count);
+  Address -= Func.getAddress();
+  if (BAT)
+    Address = BAT->translate(Func, Address, /*IsBranchSrc=*/false);
+
+  I->second.bumpCount(Address, Count);
  return true;
 }

@ -612,12 +660,26 @@ bool DataAggregator::doIntraBranch(BinaryFunction &Func, uint64_t From,
  FuncBranchData *AggrData = Func.getBranchData();
  if (!AggrData) {
    AggrData = &FuncsToBranches[Func.getNames()[0]];
-    AggrData->Name = Func.getNames()[0];
+    AggrData->Name = getLocationName(Func, Count);
    Func.setBranchData(AggrData);
  }

-  AggrData->bumpBranchCount(From - Func.getAddress(), To - Func.getAddress(),
-                            Count, Mispreds);
+  From -= Func.getAddress();
+  To -= Func.getAddress();
+  DEBUG(dbgs() << "BOLT-DEBUG: bumpBranchCount: " << Func.getPrintName()
+               << " @ " << Twine::utohexstr(From) << " -> "
+               << Func.getPrintName() << " @ " << Twine::utohexstr(To)
+               << '\n');
+  if (BAT) {
+    From = BAT->translate(Func, From, /*IsBranchSrc=*/true);
+    To = BAT->translate(Func, To, /*IsBranchSrc=*/false);
+    DEBUG(dbgs() << "BOLT-DEBUG: BAT translation on bumpBranchCount: "
+                 << Func.getPrintName() << " @ " << Twine::utohexstr(From)
+                 << " -> " << Func.getPrintName() << " @ "
+                 << Twine::utohexstr(To) << '\n');
+  }
+
+  AggrData->bumpBranchCount(From, To, Count, Mispreds);
  return true;
 }

@ -630,26 +692,30 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc,
  StringRef SrcFunc;
  StringRef DstFunc;
  if (FromFunc) {
-    SrcFunc = FromFunc->getNames()[0];
+    SrcFunc = getLocationName(*FromFunc, Count);
    FromAggrData = FromFunc->getBranchData();
    if (!FromAggrData) {
-      FromAggrData = &FuncsToBranches[SrcFunc];
+      FromAggrData = &FuncsToBranches[FromFunc->getNames()[0]];
      FromAggrData->Name = SrcFunc;
      FromFunc->setBranchData(FromAggrData);
    }
    From -= FromFunc->getAddress();
+    if (BAT)
+      From = BAT->translate(*FromFunc, From, /*IsBranchSrc=*/true);

    FromFunc->recordExit(From, Mispreds, Count);
  }
  if (ToFunc) {
-    DstFunc = ToFunc->getNames()[0];
+    DstFunc = getLocationName(*ToFunc, 0);
    ToAggrData = ToFunc->getBranchData();
    if (!ToAggrData) {
-      ToAggrData = &FuncsToBranches[DstFunc];
+      ToAggrData = &FuncsToBranches[ToFunc->getNames()[0]];
      ToAggrData->Name = DstFunc;
      ToFunc->setBranchData(ToAggrData);
    }
    To -= ToFunc->getAddress();
+    if (BAT)
+      To = BAT->translate(*ToFunc, To, /*IsBranchSrc=*/false);

    ToFunc->recordEntry(To, Mispreds, Count);
  }
@ -684,13 +750,19 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
  auto *FromFunc = getBinaryFunctionContainingAddress(First.To);
  auto *ToFunc = getBinaryFunctionContainingAddress(Second.From);
  if (!FromFunc || !ToFunc) {
+    // At least one of FromFunc/ToFunc is null on this path, so every
+    // dereference is guarded; otherwise enabling debug output would crash.
+    DEBUG(dbgs() << "Out of range trace starting in "
+                 << (FromFunc ? FromFunc->getPrintName() : "None") << " @ "
+                 << Twine::utohexstr(First.To -
+                                     (FromFunc ? FromFunc->getAddress() : 0))
+                 << " and ending in "
+                 << (ToFunc ? ToFunc->getPrintName() : "None") << " @ "
+                 << Twine::utohexstr(Second.From -
+                                     (ToFunc ? ToFunc->getAddress() : 0))
+                 << '\n');
    NumLongRangeTraces += Count;
    return false;
  }
  if (FromFunc != ToFunc) {
    NumInvalidTraces += Count;
-    DEBUG(dbgs() << "Trace starting in " << FromFunc->getPrintName() << " @ "
-                 << Twine::utohexstr(First.To - FromFunc->getAddress())
+    DEBUG(dbgs() << "Invalid trace starting in " << FromFunc->getPrintName()
+                 << " @ " << Twine::utohexstr(First.To - FromFunc->getAddress())
                 << " and ending in " << ToFunc->getPrintName() << " @ "
                 << ToFunc->getPrintName() << " @ "
                 << Twine::utohexstr(Second.From - ToFunc->getAddress())
@ -698,12 +770,22 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
    return false;
  }

-  auto FTs = FromFunc->getFallthroughsInTrace(First, Second, Count);
+  auto FTs = BAT ? BAT->getFallthroughsInTrace(*FromFunc, First, Second)
+                 : FromFunc->getFallthroughsInTrace(First, Second, Count);
  if (!FTs) {
+    DEBUG(dbgs() << "Invalid trace starting in " << FromFunc->getPrintName()
+                 << " @ " << Twine::utohexstr(First.To - FromFunc->getAddress())
+                 << " and ending in " << ToFunc->getPrintName() << " @ "
+                 << ToFunc->getPrintName() << " @ "
+                 << Twine::utohexstr(Second.From - ToFunc->getAddress())
+                 << '\n');
    NumInvalidTraces += Count;
    return false;
  }

+  DEBUG(dbgs() << "Processing " << FTs->size() << " fallthroughs for "
+               << FromFunc->getPrintName() << ":" << Twine::utohexstr(First.To)
+               << " to " << Twine::utohexstr(Second.From) << ".\n");
  for (const auto &Pair : *FTs) {
    doIntraBranch(*FromFunc, Pair.first + FromFunc->getAddress(),
                  Pair.second + FromFunc->getAddress(), Count, false);
@ -796,7 +878,7 @@ ErrorOr<DataAggregator::PerfBranchSample> DataAggregator::parseBranchSample() {
  auto MMapInfoIter = BinaryMMapInfo.find(*PIDRes);
  if (MMapInfoIter == BinaryMMapInfo.end()) {
    consumeRestOfLine();
-    return Res;
+    return make_error_code(errc::no_such_process);
  }

  while (checkAndConsumeFS()) {}
@ -1009,8 +1091,11 @@ std::error_code DataAggregator::printLBRHeatMap() {

  while (hasData()) {
    auto SampleRes = parseBranchSample();
-    if (std::error_code EC = SampleRes.getError())
+    if (auto EC = SampleRes.getError()) {
+      if (EC == errc::no_such_process)
+        continue;
      return EC;
+    }

    auto &Sample = SampleRes.get();

@ -1071,33 +1156,39 @@ std::error_code DataAggregator::parseBranchEvents() {
  uint64_t NumTotalSamples{0};
  uint64_t NumEntries{0};
  uint64_t NumSamples{0};
+  uint64_t NumSamplesNoLBR{0};
  uint64_t NumTraces{0};

  while (hasData()) {
    ++NumTotalSamples;

    auto SampleRes = parseBranchSample();
-    if (std::error_code EC = SampleRes.getError())
+    if (auto EC = SampleRes.getError()) {
+      if (EC == errc::no_such_process)
+        continue;
      return EC;
+    }
+    ++NumSamples;

    auto &Sample = SampleRes.get();
    if (opts::WriteAutoFDOData)
      ++BasicSamples[Sample.PC];

-    if (Sample.LBR.empty())
+    if (Sample.LBR.empty()) {
+      ++NumSamplesNoLBR;
      continue;
+    }

-    ++NumSamples;
    NumEntries += Sample.LBR.size();

-    // LBRs are stored in reverse execution order. NextLBR refers to the next
-    // executed branch record.
-    const LBREntry *NextLBR{nullptr};
+    // LBRs are stored in reverse execution order. NextPC refers to the next
+    // recorded executed PC.
+    uint64_t NextPC = opts::UseEventPC ? Sample.PC : 0;
    for (const auto &LBR : Sample.LBR) {
-      if (NextLBR) {
+      if (NextPC) {
        // Record fall-through trace.
        const auto TraceFrom = LBR.To;
-        const auto TraceTo = NextLBR->From;
+        const auto TraceTo = NextPC;
        const auto *TraceBF = getBinaryFunctionContainingAddress(TraceFrom);
        if (TraceBF && TraceBF->containsAddress(TraceTo)) {
            auto &Info = FallthroughLBRs[Trace(TraceFrom, TraceTo)];
@ -1108,14 +1199,37 @@ std::error_code DataAggregator::parseBranchEvents() {
            }
        } else {
          if (TraceBF && getBinaryFunctionContainingAddress(TraceTo)) {
+            DEBUG(dbgs() << "Invalid trace starting in "
+                         << TraceBF->getPrintName() << " @ "
+                         << Twine::utohexstr(TraceFrom - TraceBF->getAddress())
+                         << " and ending @ " << Twine::utohexstr(TraceTo)
+                         << '\n');
            ++NumInvalidTraces;
          } else {
+            DEBUG(
+                dbgs() << "Out of range trace starting in "
+                       << (TraceBF ? TraceBF->getPrintName() : "None") << " @ "
+                       << Twine::utohexstr(
+                              TraceFrom - (TraceBF ? TraceBF->getAddress() : 0))
+                       << " and ending in "
+                       << (getBinaryFunctionContainingAddress(TraceTo)
+                               ? getBinaryFunctionContainingAddress(TraceTo)
+                                     ->getPrintName()
+                               : "None")
+                       << " @ "
+                       << Twine::utohexstr(
+                              TraceTo -
+                              (getBinaryFunctionContainingAddress(TraceTo)
+                                   ? getBinaryFunctionContainingAddress(TraceTo)
+                                         ->getAddress()
+                                   : 0))
+                       << '\n');
            ++NumLongRangeTraces;
          }
        }
        ++NumTraces;
      }
-      NextLBR = &LBR;
+      NextPC = LBR.From;

      auto From = LBR.From;
      if (!getBinaryFunctionContainingAddress(From))
@ -1159,14 +1273,23 @@ std::error_code DataAggregator::parseBranchEvents() {
  outs() << "PERF2BOLT: read " << NumSamples << " samples and "
         << NumEntries << " LBR entries\n";
  if (NumTotalSamples) {
-    const auto IgnoredSamples = NumTotalSamples - NumSamples;
-    const auto PercentIgnored = 100.0f * IgnoredSamples / NumTotalSamples;
-    outs() << "PERF2BOLT: " << IgnoredSamples << " samples";
-    printColored(outs(), PercentIgnored, 20, 50);
-    outs() << " were ignored\n";
-    if (PercentIgnored > 50.0f) {
-      errs() << "PERF2BOLT-WARNING: less than 50% of all recorded samples were "
-                "attributed to the input binary\n";
+    if (NumSamples && NumSamplesNoLBR == NumSamples) {
+      // Note: we don't know if perf2bolt is being used to parse memory samples
+      // at this point. In this case, it is OK to parse zero LBRs.
+      errs() << "PERF2BOLT-WARNING: all recorded samples for this binary lack "
+                "LBR. Record profile with perf record -j any or run perf2bolt "
+                "in no-LBR mode with -nl (the performance improvement in -nl "
+                "mode may be limited)\n";
+    } else {
+      const auto IgnoredSamples = NumTotalSamples - NumSamples;
+      const auto PercentIgnored = 100.0f * IgnoredSamples / NumTotalSamples;
+      outs() << "PERF2BOLT: " << IgnoredSamples << " samples";
+      printColored(outs(), PercentIgnored, 20, 50);
+      outs() << " were ignored\n";
+      if (PercentIgnored > 50.0f) {
+        errs() << "PERF2BOLT-WARNING: less than 50% of all recorded samples "
+                  "were attributed to the input binary\n";
+      }
    }
  }
  outs() << "PERF2BOLT: traces mismatching disassembled function contents: "
@ -1191,6 +1314,19 @@ std::error_code DataAggregator::parseBranchEvents() {
  }
  outs() << "\n";

+  if (NumColdSamples > 0) {
+    const auto ColdSamples = NumColdSamples * 100.0f / NumTotalSamples;
+    outs() << "PERF2BOLT: " << NumColdSamples
+           << format(" (%.1f%%)", ColdSamples)
+           << " samples recorded in cold regions of split functions.\n";
+    if (ColdSamples > 5.0f) {
+      outs()
+          << "WARNING: The BOLT-processed binary where samples were collected "
+             "likely used bad data or your service observed a large shift in "
+             "profile. You may want to audit this.\n";
+    }
+  }
+
  return std::error_code();
 }

@ -1330,11 +1466,17 @@ void DataAggregator::processMemEvents() {
    if (MemFunc) {
      MemName = MemFunc->getNames()[0];
      Addr -= MemFunc->getAddress();
-    } else if (Addr) {  // TODO: filter heap/stack/nulls here?
+    } else if (Addr) {
      if (auto *BD = BC->getBinaryDataContainingAddress(Addr)) {
        MemName = BD->getName();
        Addr -= BD->getAddress();
+      } else if (opts::FilterMemProfile) {
+        // Filter out heap/stack accesses
+        continue;
      }
+    } else if (opts::FilterMemProfile) {
+      // Filter out nulls
+      continue;
    }

    const Location FuncLoc(!FuncName.empty(), FuncName, PC);
@ -1394,7 +1536,7 @@ void DataAggregator::processPreAggregated() {
                     AggrEntry.From.Offset, false};
      LBREntry Second{AggrEntry.To.Offset, AggrEntry.To.Offset, false};
      doTrace(First, Second, AggrEntry.Count);
-      ++NumTraces;
+      NumTraces += AggrEntry.Count;
      break;
    }
    }
@ -1776,6 +1918,8 @@ std::error_code DataAggregator::writeAggregatedFile() const {
  uint64_t BranchValues{0};
  uint64_t MemValues{0};

+  if (BAT)
+    OutFile << "boltedcollection\n";
  if (opts::BasicAggregation) {
    OutFile << "no_lbr";
    for (const auto &Entry : EventNames) {
--- a/src/DataAggregator.h
+++ b/src/DataAggregator.h
@ -28,6 +28,7 @@ namespace bolt {

 class BinaryFunction;
 class BinaryContext;
+class BoltAddressTranslation;

 /// DataAggregator inherits all parsing logic from DataReader as well as
 /// its data structures used to represent aggregated profile data in memory.
@ -172,11 +173,13 @@ class DataAggregator : public DataReader {

  /// References to core BOLT data structures
  BinaryContext *BC{nullptr};
-  std::map<uint64_t, BinaryFunction> *BFs{nullptr};
+
+  BoltAddressTranslation *BAT{nullptr};

  /// Aggregation statistics
  uint64_t NumInvalidTraces{0};
  uint64_t NumLongRangeTraces{0};
+  uint64_t NumColdSamples{0};

  /// Looks into system PATH for Linux Perf and set up the aggregator to use it
  void findPerfExecutable();
@ -194,7 +197,16 @@ class DataAggregator : public DataReader {

  /// Look up which function contains an address by using out map of
  /// disassembled BinaryFunctions
-  BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address);
+  BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address) const;
+
+  /// Retrieve the location name to be used for samples recorded in \p Func.
+  /// If doing BAT translation, link cold parts to the hot part names (used by
+  /// the original binary).  \p Count specifies how many samples were recorded
+  /// at that location, so we can tally total activity in cold areas if we are
+  /// dealing with profiling data collected in a bolted binary. For LBRs,
+  /// \p Count should only be used for the source of the branch to avoid
+  /// counting cold activity twice (one for source and another for destination).
+  StringRef getLocationName(BinaryFunction &Func, uint64_t Count);

  /// Semantic actions - parser hooks to interpret parsed perf samples
  /// Register a sample (non-LBR mode), i.e. a new hit at \p Address
@ -226,7 +238,9 @@ class DataAggregator : public DataReader {
  std::error_code printLBRHeatMap();

  /// Parse a single perf sample containing a PID associated with a sequence of
-  /// LBR entries
+  /// LBR entries. If the PID does not correspond to the binary we are looking
+  /// for, return std::errc::no_such_process. If other parsing errors occur,
+  /// return the error. Otherwise, return the parsed sample.
  ErrorOr<PerfBranchSample> parseBranchSample();

  /// Parse a single perf sample containing a PID associated with an event name
@ -384,6 +398,14 @@ public:
  /// Set the file name to save aggregate data to
  void setOutputFDataName(StringRef Name) { OutputFDataName = Name; }

+  /// Set Bolt Address Translation Table when processing samples collected in
+  /// bolted binaries
+  void setBAT(BoltAddressTranslation *B) { BAT = B; }
+
+  /// Tells whether a translation table has been attached to this aggregation
+  /// job, i.e. whether samples collected on a bolted binary get remapped.
+  bool usesBAT() const { return BAT != nullptr; }
+
  /// Start an aggregation job asynchronously. Call "aggregate" to finish it
  /// with a list of disassembled functions.
  void start(StringRef PerfDataFilename);
@ -400,12 +422,10 @@ public:

  /// Parse profile and mark functions/objects with profile.
  /// Don't assign profile to functions yet.
-  void parseProfile(BinaryContext &BC,
-                    std::map<uint64_t, BinaryFunction> &BFs);
+  void parseProfile(BinaryContext &BC);

  /// Populate functions with profile.
-  void processProfile(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs);
+  void processProfile(BinaryContext &BC);

  /// Check whether \p FileName is a perf.data file
  static bool checkPerfDataMagic(StringRef FileName);
--- a/src/DataReader.cpp
+++ b/src/DataReader.cpp
@ -251,16 +251,31 @@ void FuncMemData::update(const Location &Offset, const Location &Addr) {
  ++Data[Iter->second].Count;
 }

+// Clear the Used flag on every per-function branch record and every
+// per-function memory record held by this reader.
+void DataReader::reset() {
+  for (auto &BranchEntry : getAllFuncsBranchData())
+    BranchEntry.second.Used = false;
+
+  for (auto &MemEntry : getAllFuncsMemData())
+    MemEntry.second.Used = false;
+}
+
 ErrorOr<std::unique_ptr<DataReader>>
 DataReader::readPerfData(StringRef Path, raw_ostream &Diag) {
-  ErrorOr<std::unique_ptr<MemoryBuffer>> MB =
-      MemoryBuffer::getFileOrSTDIN(Path);
-  if (std::error_code EC = MB.getError()) {
-    Diag << "Cannot open " << Path << ": " << EC.message() << "\n";
+  auto MB = MemoryBuffer::getFileOrSTDIN(Path);
+  if (auto EC = MB.getError()) {
+    Diag << "cannot open " << Path << ": " << EC.message() << "\n";
    return EC;
  }
-  auto DR = make_unique<DataReader>(std::move(MB.get()), Diag);
-  DR->parse();
+  auto DR = llvm::make_unique<DataReader>(std::move(MB.get()), Diag);
+  if (auto EC = DR->parse()) {
+    return EC;
+  }
+  if (!DR->ParsingBuf.empty()) {
+    Diag << "WARNING: invalid profile data detected at line " << DR->Line
+         << ". Possibly corrupted profile.\n";
+  }
+
  DR->buildLTONameMaps();
  return std::move(DR);
 }
@ -280,6 +295,13 @@ bool DataReader::expectAndConsumeFS() {
  return true;
 }

+// Skip a run of consecutive field separators at the current parsing position,
+// advancing the column counter by one per separator. The emptiness check keeps
+// this safe when the buffer ends right after a field (for example a last line
+// with no trailing newline), where indexing ParsingBuf[0] would be out of
+// bounds.
+void DataReader::consumeAllRemainingFS() {
+  while (!ParsingBuf.empty() && ParsingBuf[0] == FieldSeparator) {
+    ParsingBuf = ParsingBuf.drop_front(1);
+    Col += 1;
+  }
+}
+
 bool DataReader::checkAndConsumeNewLine() {
  if (ParsingBuf[0] != '\n')
    return false;
@ -374,12 +396,14 @@ ErrorOr<Location> DataReader::parseLocation(char EndChar,

  if (!expectAndConsumeFS())
    return make_error_code(llvm::errc::io_error);
+  consumeAllRemainingFS();

  // Read the string containing the symbol or the DSO name
  auto NameRes = parseString(FieldSeparator);
  if (std::error_code EC = NameRes.getError())
    return EC;
  StringRef Name = NameRes.get();
+  consumeAllRemainingFS();

  // Read the offset
  auto Offset = parseHexField(EndChar, EndNl);
@ -395,21 +419,25 @@ ErrorOr<BranchInfo> DataReader::parseBranchInfo() {
    return EC;
  Location From = Res.get();

+  consumeAllRemainingFS();
  Res = parseLocation(FieldSeparator);
  if (std::error_code EC = Res.getError())
    return EC;
  Location To = Res.get();

+  consumeAllRemainingFS();
  auto MRes = parseNumberField(FieldSeparator);
  if (std::error_code EC = MRes.getError())
    return EC;
  int64_t NumMispreds = MRes.get();

+  consumeAllRemainingFS();
  auto BRes = parseNumberField(FieldSeparator, /* EndNl = */ true);
  if (std::error_code EC = BRes.getError())
    return EC;
  int64_t NumBranches = BRes.get();

+  consumeAllRemainingFS();
  if (!checkAndConsumeNewLine()) {
    reportError("expected end of line");
    return make_error_code(llvm::errc::io_error);
@ -424,15 +452,18 @@ ErrorOr<MemInfo> DataReader::parseMemInfo() {
    return EC;
  Location Offset = Res.get();

+  consumeAllRemainingFS();
  Res = parseMemLocation(FieldSeparator);
  if (std::error_code EC = Res.getError())
    return EC;
  Location Addr = Res.get();

+  consumeAllRemainingFS();
  auto CountRes = parseNumberField(FieldSeparator, true);
  if (std::error_code EC = CountRes.getError())
    return EC;

+  consumeAllRemainingFS();
  if (!checkAndConsumeNewLine()) {
    reportError("expected end of line");
    return make_error_code(llvm::errc::io_error);
@ -447,11 +478,13 @@ ErrorOr<SampleInfo> DataReader::parseSampleInfo() {
    return EC;
  Location Address = Res.get();

+  consumeAllRemainingFS();
  auto BRes = parseNumberField(FieldSeparator, /* EndNl = */ true);
  if (std::error_code EC = BRes.getError())
    return EC;
  int64_t Occurrences = BRes.get();

+  consumeAllRemainingFS();
  if (!checkAndConsumeNewLine()) {
    reportError("expected end of line");
    return make_error_code(llvm::errc::io_error);
@ -483,6 +516,20 @@ ErrorOr<bool> DataReader::maybeParseNoLBRFlag() {
  return true;
 }

+// Consume an optional "boltedcollection" header line. Returns false and leaves
+// the buffer untouched when the marker is absent. Returns true once the marker
+// and the newline after it have been consumed. Reports an error when the
+// marker is followed by anything other than a newline.
+ErrorOr<bool> DataReader::maybeParseBATFlag() {
+  const StringRef Marker("boltedcollection");
+
+  // substr() clamps to the buffer size, so a too-short buffer cannot match.
+  if (ParsingBuf.substr(0, Marker.size()) != Marker)
+    return false;
+
+  ParsingBuf = ParsingBuf.drop_front(Marker.size());
+  Col += Marker.size();
+
+  if (checkAndConsumeNewLine())
+    return true;
+
+  reportError("malformed boltedcollection line");
+  return make_error_code(llvm::errc::io_error);
+}
+
+
 bool DataReader::hasBranchData() {
  if (ParsingBuf.size() == 0)
    return false;
@ -599,6 +646,17 @@ std::error_code DataReader::parse() {
  if (!FlagOrErr)
    return FlagOrErr.getError();
  NoLBRMode = *FlagOrErr;
+
+  auto BATFlagOrErr = maybeParseBATFlag();
+  if (!BATFlagOrErr)
+    return BATFlagOrErr.getError();
+  BATMode = *BATFlagOrErr;
+
+  if (!hasBranchData() && !hasMemData()) {
+    Diag << "ERROR: no valid profile data found\n";
+    return make_error_code(llvm::errc::io_error);
+  }
+
  if (NoLBRMode)
    return parseInNoLBRMode();

--- a/src/DataReader.h
+++ b/src/DataReader.h
@ -303,6 +303,9 @@ public:
  static ErrorOr<std::unique_ptr<DataReader>> readPerfData(StringRef Path,
                                                           raw_ostream &Diag);

+  /// Mark all profile objects unused.
+  void reset();
+
  /// Parses the input bolt data file into internal data structures. We expect
  /// the file format to follow the syntax below.
  ///
@ -398,6 +401,11 @@ public:
  /// Return false only if we are running with profiling data that lacks LBR.
  bool hasLBR() const { return !NoLBRMode; }

+  /// Return true if the profiling data was collected in a bolted binary. This
+  /// means we lose the ability to identify stale data at some branch locations,
+  /// since we have to be more permissive in some cases.
+  bool collectedInBoltedBinary() const { return BATMode; }
+
  /// Return true if event named \p Name was used to collect this profile data.
  bool usesEvent(StringRef Name) const {
    for (auto I = EventNames.begin(), E = EventNames.end(); I != E; ++I) {
@ -417,6 +425,7 @@ protected:

  void reportError(StringRef ErrorMsg);
  bool expectAndConsumeFS();
+  void consumeAllRemainingFS();
  bool checkAndConsumeNewLine();
  ErrorOr<StringRef> parseString(char EndChar, bool EndNl=false);
  ErrorOr<int64_t> parseNumberField(char EndChar, bool EndNl=false);
@ -432,6 +441,7 @@ protected:
  ErrorOr<SampleInfo> parseSampleInfo();
  ErrorOr<MemInfo> parseMemInfo();
  ErrorOr<bool> maybeParseNoLBRFlag();
+  ErrorOr<bool> maybeParseBATFlag();
  bool hasBranchData();
  bool hasMemData();

@ -448,6 +458,7 @@ protected:
  FuncsToSamplesMapTy FuncsToSamples;
  FuncsToMemEventsMapTy FuncsToMemEvents;
  bool NoLBRMode{false};
+  bool BATMode{false};
  StringSet<> EventNames;
  static const char FieldSeparator = ' ';

--- a/src/DebugData.cpp
+++ b/src/DebugData.cpp
@ -40,7 +40,7 @@ namespace {
 // Returns the number of written bytes.
 uint64_t writeAddressRanges(
    MCObjectWriter *Writer,
-    const DWARFAddressRangesVector &AddressRanges,
+    const DebugAddressRangesVector &AddressRanges,
    const bool WriteRelativeRanges = false) {
  for (auto &Range : AddressRanges) {
    Writer->writeLE64(Range.LowPC);
@ -62,26 +62,26 @@ DebugRangesSectionsWriter::DebugRangesSectionsWriter(BinaryContext *BC) {
    std::unique_ptr<MCObjectWriter>(BC->createObjectWriter(*RangesStream));

  // Add an empty range as the first entry;
-  SectionOffset += writeAddressRanges(Writer.get(), DWARFAddressRangesVector{});
-}
-
-uint64_t DebugRangesSectionsWriter::addCURanges(
-    uint64_t CUOffset,
-    DWARFAddressRangesVector &&Ranges) {
-  const auto RangesOffset = addRanges(Ranges);
-  CUAddressRanges.emplace(CUOffset, std::move(Ranges));
-
-  return RangesOffset;
+  SectionOffset += writeAddressRanges(Writer.get(), DebugAddressRangesVector{});
 }

 uint64_t
-DebugRangesSectionsWriter::addRanges(const BinaryFunction *Function,
-                                     DWARFAddressRangesVector &&Ranges) {
+DebugRangesSectionsWriter::addCURanges(uint64_t CUOffset,
+                                       DebugAddressRangesVector &&Ranges) {
+  const auto RangesOffset = addRanges(Ranges);
+
+  std::lock_guard<std::mutex> Lock(CUAddressRangesMutex);
+  CUAddressRanges.emplace(CUOffset, std::move(Ranges));
+  return RangesOffset;
+}
+
+uint64_t DebugRangesSectionsWriter::addRanges(
+    const BinaryFunction *Function, DebugAddressRangesVector &&Ranges,
+    const BinaryFunction *&CachedFunction,
+    std::map<DebugAddressRangesVector, uint64_t> &CachedRanges) {
  if (Ranges.empty())
    return getEmptyRangesOffset();

-  static const BinaryFunction *CachedFunction;
-
  if (Function == CachedFunction) {
    const auto RI = CachedRanges.find(Ranges);
    if (RI != CachedRanges.end())
@ -98,10 +98,13 @@ DebugRangesSectionsWriter::addRanges(const BinaryFunction *Function,
 }

 uint64_t
-DebugRangesSectionsWriter::addRanges(const DWARFAddressRangesVector &Ranges) {
+DebugRangesSectionsWriter::addRanges(const DebugAddressRangesVector &Ranges) {
  if (Ranges.empty())
    return getEmptyRangesOffset();

+  // Reading the SectionOffset and updating it should be atomic to guarantee
+  // unique and correct offsets in patches.
+  std::lock_guard<std::mutex> Lock(WriterMutex);
  const auto EntryOffset = SectionOffset;
  SectionOffset += writeAddressRanges(Writer.get(), Ranges);

@ -165,14 +168,17 @@ uint64_t DebugLocWriter::addList(const DWARFDebugLoc::LocationList &LocList) {
  if (LocList.Entries.empty())
    return getEmptyListOffset();

+  // Reading the SectionOffset and updating it should be atomic to guarantee
+  // unique and correct offsets in patches.
+  std::lock_guard<std::mutex> Lock(WriterMutex);
  const auto EntryOffset = SectionOffset;
+
  for (const auto &Entry : LocList.Entries) {
    Writer->writeLE64(Entry.Begin);
    Writer->writeLE64(Entry.End);
    Writer->writeLE16(Entry.Loc.size());
-    Writer->writeBytes(
-        StringRef(reinterpret_cast<const char *>(Entry.Loc.data()),
-                  Entry.Loc.size()));
+    Writer->writeBytes(StringRef(
+        reinterpret_cast<const char *>(Entry.Loc.data()), Entry.Loc.size()));
    SectionOffset += 2 * 8 + 2 + Entry.Loc.size();
  }
  Writer->writeLE64(0);
@ -229,42 +235,29 @@ void SimpleBinaryPatcher::patchBinary(std::string &BinaryContents) {
  }
 }

-void DebugAbbrevPatcher::addAttributePatch(const DWARFUnit *Unit,
-                                           uint32_t AbbrevCode,
-                                           dwarf::Attribute AttrTag,
-                                           uint8_t NewAttrTag,
-                                           uint8_t NewAttrForm) {
-  assert(Unit && "No compile unit specified.");
-  Patches[Unit].emplace_back(
-      AbbrevAttrPatch{AbbrevCode, AttrTag, NewAttrTag, NewAttrForm});
+// Queue a patch that rewrites attribute \p AttrTag of \p Abbrev to
+// \p NewAttrTag / \p NewAttrForm. Patches are kept in a set keyed on the
+// (abbreviation, attribute) pair, so a second request for the same pair does
+// not replace the first one.
+void DebugAbbrevPatcher::addAttributePatch(
+    const DWARFAbbreviationDeclaration *Abbrev,
+    dwarf::Attribute AttrTag,
+    uint8_t NewAttrTag,
+    uint8_t NewAttrForm) {
+  assert(Abbrev && "no abbreviation specified");
+
+  AbbrevPatches.insert(
+      AbbrevAttrPatch{Abbrev, AttrTag, NewAttrTag, NewAttrForm});
+}

 void DebugAbbrevPatcher::patchBinary(std::string &Contents) {
  SimpleBinaryPatcher Patcher;

-  for (const auto &UnitPatchesPair : Patches) {
-    const auto *Unit = UnitPatchesPair.first;
-    const auto *UnitAbbreviations = Unit->getAbbreviations();
-    assert(UnitAbbreviations &&
-           "Compile unit doesn't have associated abbreviations.");
-    const auto &UnitPatches = UnitPatchesPair.second;
-    for (const auto &AttrPatch : UnitPatches) {
-      const auto *AbbreviationDeclaration =
-        UnitAbbreviations->getAbbreviationDeclaration(AttrPatch.Code);
-      assert(AbbreviationDeclaration && "No abbreviation with given code.");
-      const auto Attribute =
-          AbbreviationDeclaration->findAttribute(AttrPatch.Attr);
+  for (const auto &Patch : AbbrevPatches) {
+    const auto Attribute = Patch.Abbrev->findAttribute(Patch.Attr);
+    assert(Attribute && "Specified attribute doesn't occur in abbreviation.");

-      assert(Attribute && "Specified attribute doesn't occur in abbreviation.");
-      // Because we're only handling standard values (i.e. no DW_FORM_GNU_* or
-      // DW_AT_APPLE_*), they are all small (< 128) and encoded in a single
-      // byte in ULEB128, otherwise it'll be more tricky as we may need to
-      // grow or shrink the section.
-      Patcher.addBytePatch(Attribute->AttrOffset,
-          AttrPatch.NewAttr);
-      Patcher.addBytePatch(Attribute->FormOffset,
-          AttrPatch.NewForm);
-    }
+    // Because we're only handling standard values (i.e. no DW_FORM_GNU_* or
+    // DW_AT_APPLE_*), they are all small (< 128) and encoded in a single
+    // byte in ULEB128, otherwise it'll be more tricky as we may need to
+    // grow or shrink the section.
+    Patcher.addBytePatch(Attribute->AttrOffset, Patch.NewAttr);
+    Patcher.addBytePatch(Attribute->FormOffset, Patch.NewForm);
  }
  Patcher.patchBinary(Contents);
 }
--- a/src/DebugData.h
+++ b/src/DebugData.h
@ -20,26 +20,42 @@
 #include "llvm/Support/SMLoc.h"
 #include "llvm/Support/raw_ostream.h"
 #include <map>
+#include <mutex>
 #include <string>
+#include <unordered_set>
 #include <utility>
 #include <vector>

-#include "BinaryBasicBlock.h"
-
 namespace llvm {

 class DWARFCompileUnit;
-class DWARFDebugInfoEntryMinimal;
 class MCObjectWriter;

 namespace bolt {

 class BinaryContext;
-class BasicBlockTable;
-class BinaryBasicBlock;
 class BinaryFunction;

-/// Eeferences a row in a DWARFDebugLine::LineTable by the DWARF
+/// Address range representation. Takes less space than DWARFAddressRange.
+struct DebugAddressRange {
+  /// Both bounds default to zero.
+  uint64_t LowPC = 0;
+  uint64_t HighPC = 0;
+
+  DebugAddressRange() = default;
+
+  /// Build a range from explicit bounds.
+  DebugAddressRange(uint64_t Low, uint64_t High) : LowPC(Low), HighPC(High) {}
+};
+
+/// Order ranges by LowPC first and, when LowPC is equal, by HighPC.
+static inline bool operator<(const DebugAddressRange &LHS,
+                             const DebugAddressRange &RHS) {
+  if (LHS.LowPC != RHS.LowPC)
+    return LHS.LowPC < RHS.LowPC;
+  return LHS.HighPC < RHS.HighPC;
+}
+
+/// DebugAddressRangesVector - represents a set of absolute address ranges.
+using DebugAddressRangesVector = SmallVector<DebugAddressRange, 2>;
+
+/// References a row in a DWARFDebugLine::LineTable by the DWARF
 /// Context index of the DWARF Compile Unit that owns the Line Table and the row
 /// index. This is tied to our IR during disassembly so that we can later update
 /// .debug_line information. RowIndex has a base of 1, which means a RowIndex
@ -84,14 +100,16 @@ public:
  DebugRangesSectionsWriter(BinaryContext *BC);

  /// Add ranges for CU matching \p CUOffset and return offset into section.
-  uint64_t addCURanges(uint64_t CUOffset, DWARFAddressRangesVector &&Ranges);
+  uint64_t addCURanges(uint64_t CUOffset, DebugAddressRangesVector &&Ranges);

  /// Add ranges with caching for \p Function.
-  uint64_t addRanges(const BinaryFunction *Function,
-                     DWARFAddressRangesVector &&Ranges);
+  uint64_t
+  addRanges(const BinaryFunction *Function, DebugAddressRangesVector &&Ranges,
+            const BinaryFunction *&CachedFunction,
+            std::map<DebugAddressRangesVector, uint64_t> &CachedRanges);

  /// Add ranges and return offset into section.
-  uint64_t addRanges(const DWARFAddressRangesVector &Ranges);
+  uint64_t addRanges(const DebugAddressRangesVector &Ranges);

  /// Writes .debug_aranges with the added ranges to the MCObjectWriter.
  void writeArangesSection(MCObjectWriter *Writer) const;
@ -106,7 +124,7 @@ public:
  uint64_t getEmptyRangesOffset() const { return EmptyRangesOffset; }

  /// Map DWARFCompileUnit index to ranges.
-  using CUAddressRangesType = std::map<uint64_t, DWARFAddressRangesVector>;
+  using CUAddressRangesType = std::map<uint64_t, DebugAddressRangesVector>;

  /// Return ranges for a given CU.
  const CUAddressRangesType &getCUAddressRanges() const {
@ -124,6 +142,8 @@ private:

  std::unique_ptr<MCObjectWriter> Writer;

+  std::mutex WriterMutex;
+
  /// Current offset in the section (updated as new entries are written).
  /// Starts with 16 since the first 16 bytes are reserved for an empty range.
  uint32_t SectionOffset{0};
@ -133,11 +153,10 @@ private:
  /// (first address, interval size).
  CUAddressRangesType CUAddressRanges;

+  std::mutex CUAddressRangesMutex;
+
  /// Offset of an empty address ranges list.
  static constexpr uint64_t EmptyRangesOffset{0};
-
-  /// Cached used for de-duplicating entries for the same function.
-  std::map<DWARFAddressRangesVector, uint64_t> CachedRanges;
 };

 /// Serializes the .debug_loc DWARF section with LocationLists.
@ -160,6 +179,8 @@ private:

  std::unique_ptr<MCObjectWriter> Writer;

+  std::mutex WriterMutex;
+
  /// Offset of an empty location list.
  static uint64_t const EmptyListOffset = 0;

@ -219,25 +240,33 @@ class DebugAbbrevPatcher : public BinaryPatcher {
 private:
  /// Patch of changing one attribute to another.
  struct AbbrevAttrPatch {
-    uint32_t Code;    // Code of abbreviation to be modified.
+    const DWARFAbbreviationDeclaration *Abbrev;
    dwarf::Attribute Attr;    // ID of attribute to be replaced.
-    uint8_t NewAttr;  // ID of the new attribute.
-    uint8_t NewForm;  // Form of the new attribute.
+    uint8_t NewAttr;          // ID of the new attribute.
+    uint8_t NewForm;          // Form of the new attribute.
+
+    bool operator==(const AbbrevAttrPatch &RHS) const {
+      return Abbrev == RHS.Abbrev && Attr == RHS.Attr;
+    }
  };

-  std::map<const DWARFUnit *, std::vector<AbbrevAttrPatch>> Patches;
+  /// Hash functor for AbbrevAttrPatch. It mixes the abbreviation pointer and
+  /// the attribute ID; the new attribute and form do not take part.
+  struct AbbrevHash {
+    std::size_t operator()(const AbbrevAttrPatch &P) const {
+      const uint64_t Key = (reinterpret_cast<uint64_t>(P.Abbrev) << 16) + P.Attr;
+      return std::hash<uint64_t>()(Key);
+    }
+  };
+
+  std::unordered_set<AbbrevAttrPatch, AbbrevHash> AbbrevPatches;

 public:
  ~DebugAbbrevPatcher() { }
-  /// Adds a patch to change an attribute of an abbreviation that belongs to
-  /// \p Unit to another attribute.
-  /// \p AbbrevCode code of the abbreviation to be modified.
+  /// Adds a patch to change an attribute of the abbreviation
+  /// \p Abbrev the abbreviation to be modified.
  /// \p AttrTag ID of the attribute to be replaced.
  /// \p NewAttrTag ID of the new attribute.
  /// \p NewAttrForm Form of the new attribute.
  /// We only handle standard forms, that are encoded in a single byte.
-  void addAttributePatch(const DWARFUnit *Unit,
-                         uint32_t AbbrevCode,
+  void addAttributePatch(const DWARFAbbreviationDeclaration *Abbrev,
                         dwarf::Attribute AttrTag,
                         uint8_t NewAttrTag,
                         uint8_t NewAttrForm);
--- a/src/DynoStats.cpp
+++ b/src/DynoStats.cpp
@ -0,0 +1,259 @@
+//===--- DynoStats.cpp ----------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "DynoStats.h"
+#include "BinaryBasicBlock.h"
+#include "BinaryFunction.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <numeric>
+#include <string>
+
+#undef  DEBUG_TYPE
+#define DEBUG_TYPE "bolt"
+
+using namespace llvm;
+using namespace bolt;
+
+namespace opts {
+
+// Option category declared elsewhere; the option below is registered in it.
+extern cl::OptionCategory BoltCategory;
+
+// Hidden, optional command-line option (default 1). DynoStats::print()
+// multiplies every reported counter by this value.
+static cl::opt<uint32_t>
+DynoStatsScale("dyno-stats-scale",
+  cl::desc("scale to be applied while reporting dyno stats"),
+  cl::Optional,
+  cl::init(1),
+  cl::Hidden,
+  cl::cat(BoltCategory));
+
+} // namespace opts
+
+namespace llvm {
+namespace bolt {
+
+constexpr const char *DynoStats::Desc[];
+
+/// Order two stat sets by lexicographically comparing their raw counters over
+/// the index range [FIRST_DYNO_STAT, LAST_DYNO_STAT).
+bool DynoStats::operator<(const DynoStats &Other) const {
+  const uint64_t *const Begin = &Stats[FIRST_DYNO_STAT];
+  const uint64_t *const End = &Stats[LAST_DYNO_STAT];
+  const uint64_t *const OtherBegin = &Other.Stats[FIRST_DYNO_STAT];
+  const uint64_t *const OtherEnd = &Other.Stats[LAST_DYNO_STAT];
+  return std::lexicographical_compare(Begin, End, OtherBegin, OtherEnd);
+}
+
+/// Two stat sets are equal when every raw counter in the index range
+/// [FIRST_DYNO_STAT, LAST_DYNO_STAT) matches.
+bool DynoStats::operator==(const DynoStats &Other) const {
+  const uint64_t *const Begin = &Stats[FIRST_DYNO_STAT];
+  const uint64_t *const End = &Stats[LAST_DYNO_STAT];
+  const uint64_t *const OtherBegin = &Other.Stats[FIRST_DYNO_STAT];
+  return std::equal(Begin, End, OtherBegin);
+}
+
+/// Compare this object against \p Other using only the categories listed in
+/// \p Keys, in the order given (most significant key first).
+///
+/// Returns true iff, at the first key where the two raw counters differ, this
+/// object's counter is the smaller one. Returns false when all keyed counters
+/// are equal or when \p Keys is empty.
+///
+/// The comparison is a proper lexicographic one, so it forms a strict weak
+/// ordering and is safe to use as a sort predicate. A comparator that only
+/// checks "is any keyed counter smaller" can report both A < B and B < A.
+bool DynoStats::lessThan(const DynoStats &Other,
+                         ArrayRef<Category> Keys) const {
+  for (const Category Key : Keys) {
+    const uint64_t LHS = Stats[Key];
+    const uint64_t RHS = Other.Stats[Key];
+    // The first differing key decides the ordering.
+    if (LHS != RHS)
+      return LHS < RHS;
+  }
+  return false;
+}
+
+/// Print one line per category to \p OS: the counter scaled by
+/// opts::DynoStatsScale followed by the category description. When \p Other
+/// is non-null each line also shows the relative change against the matching
+/// counter of \p Other, or "(=)" when both are equal.
+void DynoStats::print(raw_ostream &OS, const DynoStats *Other) const {
+  auto printStatWithDelta = [&](const std::string &Name, uint64_t Stat,
+                                uint64_t OtherStat) {
+    OS << format("%'20lld : ", Stat * opts::DynoStatsScale) << Name;
+    if (Other) {
+      if (Stat == OtherStat) {
+        OS << " (=)";
+      } else {
+        // Clamp the baseline to at least 1 to prevent divide by 0.
+        OtherStat = std::max(OtherStat, uint64_t(1));
+        OS << format(" (%+.1f%%)",
+                     ( (float) Stat - (float) OtherStat ) * 100.0 /
+                       (float) (OtherStat) );
+      }
+    }
+    OS << '\n';
+  };
+
+  // The reserved first and last enumerators are not printed.
+  for (auto Stat = DynoStats::FIRST_DYNO_STAT + 1;
+       Stat < DynoStats::LAST_DYNO_STAT;
+       ++Stat) {
+    // The veneer counter is only shown when AArch64 stats were requested.
+    const bool IsVeneerStat = (Stat == DynoStats::VENEER_CALLS_AARCH64);
+    if (IsVeneerStat && !PrintAArch64Stats)
+      continue;
+
+    printStatWithDelta(Desc[Stat], Stats[Stat], Other ? (*Other)[Stat] : 0);
+  }
+}
+
+/// Accumulate \p Other into this object, category by category, skipping the
+/// reserved first and last enumerators. Values of \p Other are read through
+/// its const operator[].
+void DynoStats::operator+=(const DynoStats &Other) {
+  auto Idx = DynoStats::FIRST_DYNO_STAT + 1;
+  while (Idx < DynoStats::LAST_DYNO_STAT) {
+    Stats[Idx] += Other[Idx];
+    ++Idx;
+  }
+}
+
+/// Compute dynamic statistics for a single function \p BF by walking its
+/// layout and weighting instructions and branches by block execution counts.
+///
+/// Returns all-zero stats for functions that are not simple, have no valid
+/// profile, or are folded.
+DynoStats getDynoStats(const BinaryFunction &BF) {
+  auto &BC = BF.getBinaryContext();
+
+  DynoStats Stats(/*PrintAArch64Stats*/ BC.isAArch64());
+
+  // Return empty-stats about the function we don't completely understand.
+  if (!BF.isSimple() || !BF.hasValidProfile())
+    return Stats;
+
+  // If the function was folded in non-relocation mode we keep its profile
+  // for optimization. However, it should be excluded from the dyno stats.
+  if (BF.isFolded())
+    return Stats;
+
+  // Update enumeration of basic blocks for correct detection of branch
+  // direction.
+  BF.updateLayoutIndices();
+
+  for (const auto &BB : BF.layout()) {
+    // The basic block execution count equals to the sum of incoming branch
+    // frequencies. This may deviate from the sum of outgoing branches of the
+    // basic block especially since the block may contain a function that
+    // does not return or a function that throws an exception.
+    const uint64_t BBExecutionCount =  BB->getKnownExecutionCount();
+
+    // Ignore empty blocks and blocks that were not executed.
+    if (BB->getNumNonPseudos() == 0 || BBExecutionCount == 0)
+      continue;
+
+    // Count AArch64 linker-inserted veneers
+    // NOTE(review): this adds the function-level execution count once per
+    // executed non-empty block - confirm that is intended.
+    if(BF.isAArch64Veneer())
+        Stats[DynoStats::VENEER_CALLS_AARCH64] += BF.getKnownExecutionCount();
+
+    // Count the number of calls by iterating through all instructions.
+    // Loads and stores are tallied in the same pass.
+    for (const auto &Instr : *BB) {
+      if (BC.MIB->isStore(Instr)) {
+        Stats[DynoStats::STORES] += BBExecutionCount;
+      }
+      if (BC.MIB->isLoad(Instr)) {
+        Stats[DynoStats::LOADS] += BBExecutionCount;
+      }
+
+      if (!BC.MIB->isCall(Instr))
+        continue;
+
+      // A conditional tail call is weighted by its "CTCTakenCount" annotation
+      // rather than by the block execution count.
+      uint64_t CallFreq = BBExecutionCount;
+      if (BC.MIB->getConditionalTailCall(Instr)) {
+        CallFreq =
+          BC.MIB->getAnnotationWithDefault<uint64_t>(Instr, "CTCTakenCount");
+      }
+      Stats[DynoStats::FUNCTION_CALLS] += CallFreq;
+      if (BC.MIB->isIndirectCall(Instr)) {
+        Stats[DynoStats::INDIRECT_CALLS] += CallFreq;
+      } else if (const auto *CallSymbol = BC.MIB->getTargetSymbol(Instr)) {
+        // NOTE: this local BF (the callee) shadows the function parameter BF
+        // for the remainder of this scope.
+        const auto *BF = BC.getFunctionForSymbol(CallSymbol);
+        if (BF && BF->isPLTFunction()) {
+          Stats[DynoStats::PLT_CALLS] += CallFreq;
+
+          // We don't process PLT functions and hence have to adjust relevant
+          // dynostats here for:
+          //
+          //   jmp *GOT_ENTRY(%rip)
+          //
+          // NOTE: this is arch-specific.
+          Stats[DynoStats::FUNCTION_CALLS] += CallFreq;
+          Stats[DynoStats::INDIRECT_CALLS] += CallFreq;
+          Stats[DynoStats::LOADS] += CallFreq;
+          Stats[DynoStats::INSTRUCTIONS] += CallFreq;
+        }
+      }
+    }
+
+    Stats[DynoStats::INSTRUCTIONS] += BB->getNumNonPseudos() * BBExecutionCount;
+
+    // Jump tables.
+    // Blocks without non-pseudo instructions were skipped above.
+    // NOTE(review): presumably that guarantees LastInstr is non-null - confirm.
+    const auto *LastInstr = BB->getLastNonPseudoInstr();
+    if (BC.MIB->getJumpTable(*LastInstr)) {
+      Stats[DynoStats::JUMP_TABLE_BRANCHES] += BBExecutionCount;
+      DEBUG(
+        static uint64_t MostFrequentJT;
+        if (BBExecutionCount > MostFrequentJT) {
+          MostFrequentJT = BBExecutionCount;
+          dbgs() << "BOLT-INFO: most frequently executed jump table is in "
+                 << "function " << BF << " in basic block " << BB->getName()
+                 << " executed totally " << BBExecutionCount << " times.\n";
+        }
+      );
+      continue;
+    }
+
+    // Indirect branches that are neither jump tables nor calls.
+    if (BC.MIB->isIndirectBranch(*LastInstr) && !BC.MIB->isCall(*LastInstr)) {
+      Stats[DynoStats::UNKNOWN_INDIRECT_BRANCHES] += BBExecutionCount;
+      continue;
+    }
+
+    // Update stats for branches.
+    const MCSymbol *TBB = nullptr;
+    const MCSymbol *FBB = nullptr;
+    MCInst *CondBranch = nullptr;
+    MCInst *UncondBranch = nullptr;
+    if (!BB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch)) {
+      continue;
+    }
+
+    // No terminating branch found: nothing to count for this block.
+    if (!CondBranch && !UncondBranch) {
+      continue;
+    }
+
+    // Simple unconditional branch.
+    if (!CondBranch) {
+      Stats[DynoStats::UNCOND_BRANCHES] += BBExecutionCount;
+      continue;
+    }
+
+    // CTCs: instruction annotations could be stripped, hence check the number
+    // of successors to identify conditional tail calls.
+    if (BB->succ_size() == 1) {
+      if (BB->branch_info_begin() != BB->branch_info_end())
+        Stats[DynoStats::UNCOND_BRANCHES] += BB->branch_info_begin()->Count;
+      continue;
+    }
+
+    // Conditional branch that could be followed by an unconditional branch.
+    // Missing profile counts are treated as zero.
+    auto TakenCount = BB->getTakenBranchInfo().Count;
+    if (TakenCount == BinaryBasicBlock::COUNT_NO_PROFILE)
+      TakenCount = 0;
+
+    auto NonTakenCount = BB->getFallthroughBranchInfo().Count;
+    if (NonTakenCount == BinaryBasicBlock::COUNT_NO_PROFILE)
+      NonTakenCount = 0;
+
+    if (BF.isForwardBranch(BB, BB->getConditionalSuccessor(true))) {
+      Stats[DynoStats::FORWARD_COND_BRANCHES] += BBExecutionCount;
+      Stats[DynoStats::FORWARD_COND_BRANCHES_TAKEN] += TakenCount;
+    } else {
+      Stats[DynoStats::BACKWARD_COND_BRANCHES] += BBExecutionCount;
+      Stats[DynoStats::BACKWARD_COND_BRANCHES_TAKEN] += TakenCount;
+    }
+
+    // The unconditional branch after the conditional one is counted with the
+    // fall-through (non-taken) count.
+    if (UncondBranch) {
+      Stats[DynoStats::UNCOND_BRANCHES] += NonTakenCount;
+    }
+  }
+
+  return Stats;
+}
+
+} // namespace bolt
+} // namespace llvm
--- a/src/DynoStats.h
+++ b/src/DynoStats.h
@ -0,0 +1,179 @@
+//===--- DynoStats.h ------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_DYNO_STATS_H
+#define LLVM_TOOLS_LLVM_BOLT_DYNO_STATS_H
+
+#include "BinaryFunction.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+namespace bolt {
+
+/// Class encapsulating runtime statistics about an execution unit.
+///
+/// Categories are described once in the DYNO_STATS X-macro below; the enum,
+/// the description table and the const accessor are all generated from it.
+class DynoStats {
+
+// Each entry is D(<enumerator>, <description>, <value expression>).
+// The value expression is expanded only inside the const operator[]:
+//   Fn         - read the stored counter for that category;
+//   Fadd(a, b) - sum of categories a and b;
+//   Fsub(a, b) - difference of categories a and b.
+// FIRST_DYNO_STAT and LAST_DYNO_STAT are reserved range markers.
+#define DYNO_STATS\
+  D(FIRST_DYNO_STAT,              "<reserved>", Fn)\
+  D(FORWARD_COND_BRANCHES,        "executed forward branches", Fn)\
+  D(FORWARD_COND_BRANCHES_TAKEN,  "taken forward branches", Fn)\
+  D(BACKWARD_COND_BRANCHES,       "executed backward branches", Fn)\
+  D(BACKWARD_COND_BRANCHES_TAKEN, "taken backward branches", Fn)\
+  D(UNCOND_BRANCHES,              "executed unconditional branches", Fn)\
+  D(FUNCTION_CALLS,               "all function calls", Fn)\
+  D(INDIRECT_CALLS,               "indirect calls", Fn)\
+  D(PLT_CALLS,                    "PLT calls", Fn)\
+  D(INSTRUCTIONS,                 "executed instructions", Fn)\
+  D(LOADS,                        "executed load instructions", Fn)\
+  D(STORES,                       "executed store instructions", Fn)\
+  D(JUMP_TABLE_BRANCHES,          "taken jump table branches", Fn)\
+  D(UNKNOWN_INDIRECT_BRANCHES,    "taken unknown indirect branches", Fn)\
+  D(ALL_BRANCHES,                 "total branches",\
+      Fadd(ALL_CONDITIONAL, UNCOND_BRANCHES))\
+  D(ALL_TAKEN,                    "taken branches",\
+      Fadd(TAKEN_CONDITIONAL, UNCOND_BRANCHES))\
+  D(NONTAKEN_CONDITIONAL,         "non-taken conditional branches",\
+      Fsub(ALL_CONDITIONAL, TAKEN_CONDITIONAL))\
+  D(TAKEN_CONDITIONAL,            "taken conditional branches",\
+      Fadd(FORWARD_COND_BRANCHES_TAKEN, BACKWARD_COND_BRANCHES_TAKEN))\
+  D(ALL_CONDITIONAL,              "all conditional branches",\
+      Fadd(FORWARD_COND_BRANCHES, BACKWARD_COND_BRANCHES))\
+  D(VENEER_CALLS_AARCH64,         "linker-inserted veneer calls", Fn)\
+  D(LAST_DYNO_STAT,               "<reserved>", 0)
+
+public:
+  /// Enumerators generated from the first field of every DYNO_STATS entry.
+#define D(name, ...) name,
+  enum Category : uint8_t { DYNO_STATS };
+#undef D
+
+
+private:
+  /// Raw counters indexed by Category.
+  uint64_t Stats[LAST_DYNO_STAT+1];
+  /// When false, print() omits the VENEER_CALLS_AARCH64 line.
+  bool PrintAArch64Stats;
+
+  /// Descriptions generated from the second field of every DYNO_STATS entry.
+#define D(name, desc, ...) desc,
+  static constexpr const char *Desc[] = { DYNO_STATS };
+#undef D
+
+public:
+  /// Zero the counters at indices [FIRST_DYNO_STAT, LAST_DYNO_STAT) and
+  /// record whether AArch64-specific stats should be printed.
+  DynoStats(bool PrintAArch64Stats) {
+    this->PrintAArch64Stats = PrintAArch64Stats;
+    for (auto Stat = FIRST_DYNO_STAT + 0; Stat < LAST_DYNO_STAT; ++Stat)
+      Stats[Stat] = 0;
+  }
+
+  /// Mutable access to the stored counter \p I. The index must lie strictly
+  /// between FIRST_DYNO_STAT and LAST_DYNO_STAT (checked by assert).
+  uint64_t &operator[](size_t I) {
+    assert(I > FIRST_DYNO_STAT && I < LAST_DYNO_STAT &&
+           "index out of bounds");
+    return Stats[I];
+  }
+
+  /// Read-only access to category \p I. Entries declared with Fn return the
+  /// stored counter; entries declared with Fadd/Fsub are computed on the fly
+  /// from other categories. LAST_DYNO_STAT yields 0; any other value hits
+  /// llvm_unreachable.
+  uint64_t operator[](size_t I) const {
+    switch (I) {
+    // Expand every DYNO_STATS entry into "case <name>: return <func>;".
+#define D(name, desc, func) \
+    case name: \
+      return func;
+#define Fn Stats[I]
+#define Fadd(a, b) operator[](a) + operator[](b)
+#define Fsub(a, b) operator[](a) - operator[](b)
+#define F(a) operator[](a)
+#define Radd(a, b) (a + b)
+#define Rsub(a, b) (a - b)
+    DYNO_STATS
+#undef Rsub
+#undef Radd
+#undef F
+#undef Fsub
+#undef Fadd
+#undef Fn
+#undef D
+    default:
+      llvm_unreachable("index out of bounds");
+    }
+    return 0;
+  }
+
+  /// Print all categories to \p OS; when \p Other is given, also print the
+  /// per-category delta relative to it.
+  void print(raw_ostream &OS, const DynoStats *Other = nullptr) const;
+
+  void operator+=(const DynoStats &Other);
+  bool operator<(const DynoStats &Other) const;
+  bool operator==(const DynoStats &Other) const;
+  bool operator!=(const DynoStats &Other) const { return !operator==(Other); }
+  /// Compare against \p Other using only the categories listed in \p Keys.
+  bool lessThan(const DynoStats &Other, ArrayRef<Category> Keys) const;
+
+  /// Return the human-readable description of category \p C.
+  static const char* Description(const Category C) {
+    return Desc[C];
+  }
+};
+
+/// Stream \p Stats to \p OS without any baseline for comparison.
+inline raw_ostream &operator<<(raw_ostream &OS, const DynoStats &Stats) {
+  // print() defaults its second argument to nullptr (no delta column).
+  Stats.print(OS);
+  return OS;
+}
+
+DynoStats operator+(const DynoStats &A, const DynoStats &B);
+
+/// Return dynostats for the function.
+///
+/// The function relies on branch instructions being in-sync with CFG for
+/// branch instructions stats. Thus it is better to call it after
+/// fixBranches().
+DynoStats getDynoStats(const BinaryFunction &BF);
+
+/// Return program-wide dynostats.
+///
+/// \p Funcs is a map-like container whose mapped values are BinaryFunction
+/// objects. Stats of every simple function are accumulated; non-simple
+/// functions are skipped. For an empty container an all-zero DynoStats with
+/// AArch64 stats disabled is returned.
+template <typename FuncsType>
+inline DynoStats getDynoStats(const FuncsType &Funcs) {
+  // The architecture is taken from the first function. Guard against an
+  // empty container, where dereferencing begin() is undefined behavior.
+  const bool IsAArch64 =
+      Funcs.begin() != Funcs.end() &&
+      Funcs.begin()->second.getBinaryContext().isAArch64();
+  DynoStats dynoStats(IsAArch64);
+  for (auto &BFI : Funcs) {
+    auto &BF = BFI.second;
+    if (BF.isSimple()) {
+      dynoStats += getDynoStats(BF);
+    }
+  }
+  return dynoStats;
+}
+
+/// Call a function with optional before and after dynostats printing.
+///
+/// \p Func is invoked exactly once with no arguments. When \p Flag is set,
+/// program-wide dynostats of \p Funcs are collected before and after the
+/// call. The "before" stats are always printed to outs() under a header
+/// naming \p Phase, and if anything changed the "after" stats follow with
+/// per-category deltas.
+template <typename FnType, typename FuncsType>
+inline void
+callWithDynoStats(FnType &&Func,
+                  const FuncsType &Funcs,
+                  StringRef Phase,
+                  const bool Flag) {
+  // The architecture is taken from the first function. Guard against an
+  // empty container, where dereferencing begin() is undefined behavior.
+  const bool IsAArch64 =
+      Funcs.begin() != Funcs.end() &&
+      Funcs.begin()->second.getBinaryContext().isAArch64();
+  DynoStats DynoStatsBefore(IsAArch64);
+  if (Flag) {
+    DynoStatsBefore = getDynoStats(Funcs);
+  }
+
+  Func();
+
+  if (Flag) {
+    const auto DynoStatsAfter = getDynoStats(Funcs);
+    const auto Changed = (DynoStatsAfter != DynoStatsBefore);
+    outs() << "BOLT-INFO: program-wide dynostats after running "
+           << Phase << (Changed ? "" : " (no change)") << ":\n\n"
+           << DynoStatsBefore << '\n';
+    if (Changed) {
+      DynoStatsAfter.print(outs(), &DynoStatsBefore);
+    }
+    outs() << '\n';
+  }
+}
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
--- a/src/Exceptions.cpp
+++ b/src/Exceptions.cpp
@ -266,7 +266,7 @@ void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
          return;
        }
        if (TTypeEncoding & DW_EH_PE_indirect) {
-          auto PointerOrErr = BC.extractPointerAtAddress(TypeAddress);
+          auto PointerOrErr = BC.getPointerAtAddress(TypeAddress);
          assert(PointerOrErr && "failed to decode indirect address");
          TypeAddress = *PointerOrErr;
        }
@ -349,9 +349,8 @@ void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
      if ((TTypeEncoding & DW_EH_PE_pcrel) && (TypeAddress == TTEntryAddress)) {
        TypeAddress = 0;
      }
-      if (TypeAddress &&
-          (TTypeEncoding & DW_EH_PE_indirect)) {
-        auto PointerOrErr = BC.extractPointerAtAddress(TypeAddress);
+      if (TypeAddress && (TTypeEncoding & DW_EH_PE_indirect)) {
+        auto PointerOrErr = BC.getPointerAtAddress(TypeAddress);
        assert(PointerOrErr && "failed to decode indirect address");
        TypeAddress = *PointerOrErr;
      }
@ -431,9 +430,14 @@ void BinaryFunction::updateEHRanges() {
        continue;

      // Same symbol is used for the beginning and the end of the range.
-      const MCSymbol *EHSymbol = BC.Ctx->createTempSymbol("EH", true);
+      const MCSymbol *EHSymbol;
      MCInst EHLabel;
-      BC.MIB->createEHLabel(EHLabel, EHSymbol, BC.Ctx.get());
+      {
+        std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
+        EHSymbol = BC.Ctx->createTempSymbol("EH", true);
+        BC.MIB->createEHLabel(EHLabel, EHSymbol, BC.Ctx.get());
+      }
+
      II = std::next(BB->insertPseudoInstr(II, EHLabel));

      // At this point we could be in one of the following states:
@ -526,42 +530,19 @@ void BinaryFunction::emitLSDA(MCStreamer *Streamer, bool EmitColdPart) {
  // a landing pad, this means that the first landing pad offset will be 0.
  // As a result, an exception handling runtime will ignore this landing pad,
  // because zero offset denotes the absence of a landing pad.
+  // For this reason, we emit LPStart value of 0 and output an absolute value
+  // of the landing pad in the table.
  //
-  // To workaround this issue, we issue a special LPStart for cold fragments
-  // that is equal to FDE start minus 1 byte.
-  //
-  // Note that main function fragment cannot start with a landing pad and we
-  // omit LPStart.
-  const MCExpr *LPStartExpr = nullptr;
-  std::function<void(const MCSymbol *)> emitLandingPad;
-  if (EmitColdPart) {
-    Streamer->EmitIntValue(dwarf::DW_EH_PE_udata4, 1); // LPStart format
-    LPStartExpr = MCBinaryExpr::createSub(
-                          MCSymbolRefExpr::create(StartSymbol, *BC.Ctx.get()),
-                          MCConstantExpr::create(1, *BC.Ctx.get()),
-                          *BC.Ctx.get());
-    Streamer->EmitValue(LPStartExpr, 4);
-    emitLandingPad = [&](const MCSymbol *LPSymbol) {
-      if (!LPSymbol) {
-        Streamer->EmitIntValue(0, 4);
-        return;
-      }
-      Streamer->EmitValue(MCBinaryExpr::createSub(
-                              MCSymbolRefExpr::create(LPSymbol, *BC.Ctx.get()),
-                              LPStartExpr,
-                              *BC.Ctx.get()),
-                          4);
-    };
-  } else {
-    Streamer->EmitIntValue(dwarf::DW_EH_PE_omit, 1); // LPStart format
-    emitLandingPad = [&](const MCSymbol *LPSymbol) {
-      if (!LPSymbol) {
-        Streamer->EmitIntValue(0, 4);
-        return;
-      }
-      Streamer->emitAbsoluteSymbolDiff(LPSymbol, StartSymbol, 4);
-    };
-  }
+  // FIXME: this may break PIEs and DSOs where the base address is not 0.
+  Streamer->EmitIntValue(dwarf::DW_EH_PE_udata4, 1); // LPStart format
+  Streamer->EmitIntValue(0, 4);
+  auto emitLandingPad = [&](const MCSymbol *LPSymbol) {
+    if (!LPSymbol) {
+      Streamer->EmitIntValue(0, 4);
+      return;
+    }
+    Streamer->EmitSymbolValue(LPSymbol, 4);
+  };

  Streamer->EmitIntValue(TTypeEncoding, 1);        // TType format

@ -697,17 +678,6 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const {
    return true;

  const FDE &CurFDE = *I->second;
-  if (Function.getSize() != CurFDE.getAddressRange()) {
-    if (opts::Verbosity >= 1) {
-      errs() << "BOLT-WARNING: CFI information size mismatch for function \""
-             << Function << "\""
-             << format(": Function size is %dB, CFI covers "
-                       "%dB\n",
-                       Function.getSize(), CurFDE.getAddressRange());
-    }
-    return false;
-  }
-
  auto LSDA = CurFDE.getLSDAAddress();
  Function.setLSDAAddress(LSDA ? *LSDA : 0);

@ -868,7 +838,8 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const {
          return false;
        default:
          if (opts::Verbosity >= 1) {
-            errs() << "BOLT-WARNING: Unrecognized CFI instruction\n";
+            errs() << "BOLT-WARNING: Unrecognized CFI instruction: "
+                   << Instr.Opcode << '\n';
          }
          return false;
        }
--- a/src/ExecutableFileMemoryManager.cpp
+++ b/src/ExecutableFileMemoryManager.cpp
@ -0,0 +1,110 @@
+//===--- ExecutableFileMemoryManager.cpp ----------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "ExecutableFileMemoryManager.h"
+#include "RewriteInstance.h"
+
+#undef  DEBUG_TYPE
+#define DEBUG_TYPE "efmm"
+
+using namespace llvm;
+using namespace object;
+using namespace bolt;
+
+namespace llvm {
+
+namespace bolt {
+
+/// Allocate memory for a section and register it with the BinaryContext.
+///
+/// While no object has finished loading (ObjectsLoaded == 0), debug sections
+/// (as decided by RewriteInstance::isDebugSection) get a plain new[] buffer
+/// and are registered as non-allocatable note sections. Everything else is
+/// allocated through SectionMemoryManager as code or data and registered as
+/// an allocatable SHT_PROGBITS section. Sections of objects loaded after the
+/// first one are registered as "<name>.bolt.extra.<ObjectsLoaded>".
+///
+/// \returns the address of the allocated memory.
+uint8_t *ExecutableFileMemoryManager::allocateSection(intptr_t Size,
+                                                      unsigned Alignment,
+                                                      unsigned SectionID,
+                                                      StringRef SectionName,
+                                                      bool IsCode,
+                                                      bool IsReadOnly) {
+  // Register a debug section as a note section.
+  if (!ObjectsLoaded && RewriteInstance::isDebugSection(SectionName)) {
+    // The buffer is handed to the section registered below.
+    uint8_t *DataCopy = new uint8_t[Size];
+    auto &Section = BC.registerOrUpdateNoteSection(SectionName,
+                                                   DataCopy,
+                                                   Size,
+                                                   Alignment);
+    Section.setSectionID(SectionID);
+    assert(!Section.isAllocatable() && "note sections cannot be allocatable");
+    return DataCopy;
+  }
+
+  uint8_t *Ret;
+  if (IsCode) {
+    Ret = SectionMemoryManager::allocateCodeSection(Size, Alignment,
+                                                    SectionID, SectionName);
+  } else {
+    Ret = SectionMemoryManager::allocateDataSection(Size, Alignment,
+                                                    SectionID, SectionName,
+                                                    IsReadOnly);
+  }
+
+  const auto Flags = BinarySection::getFlags(IsReadOnly, IsCode, true);
+  // Buf backs the renamed SectionName and must outlive every use of it below.
+  SmallVector<char, 256> Buf;
+  if (ObjectsLoaded > 0)
+    SectionName = (Twine(SectionName) + ".bolt.extra." + Twine(ObjectsLoaded))
+                      .toStringRef(Buf);
+
+  auto &Section = BC.registerOrUpdateSection(SectionName,
+                                             ELF::SHT_PROGBITS,
+                                             Flags,
+                                             Ret,
+                                             Size,
+                                             Alignment);
+  Section.setSectionID(SectionID);
+  assert(Section.isAllocatable() &&
+         "verify that allocatable is marked as allocatable");
+
+  // Format the address explicitly: streaming the raw pointer goes through
+  // raw_ostream's pointer overload, which adds its own "0x" prefix and would
+  // print "0x0x...".
+  DEBUG(dbgs() << "BOLT: allocating " << (Section.isLocal() ? "local " : "")
+               << (IsCode ? "code" : (IsReadOnly ? "read-only data" : "data"))
+               << " section : " << SectionName
+               << " with size " << Size << ", alignment " << Alignment
+               << " at 0x" << Twine::utohexstr(reinterpret_cast<uint64_t>(Ret))
+               << ", ID = " << SectionID << "\n");
+
+  return Ret;
+}
+
+/// Notifier for non-allocatable (note) section.
+///
+/// Registers a copy of the \p Size bytes at \p Data with the BinaryContext
+/// under \p SectionName and returns the registered section's output data.
+uint8_t *ExecutableFileMemoryManager::recordNoteSection(
+    const uint8_t *Data,
+    uintptr_t Size,
+    unsigned Alignment,
+    unsigned SectionID,
+    StringRef SectionName) {
+  DEBUG(dbgs() << "BOLT: note section " << SectionName << " with size " << Size
+               << ", alignment " << Alignment << " at 0x"
+               << Twine::utohexstr(reinterpret_cast<uint64_t>(Data)) << '\n');
+
+  auto &NoteSection = BC.registerOrUpdateNoteSection(
+      SectionName, copyByteArray(Data, Size), Size, Alignment);
+  NoteSection.setSectionID(SectionID);
+  assert(!NoteSection.isAllocatable() && "note sections cannot be allocatable");
+  return NoteSection.getOutputData();
+}
+
+/// Count one more loaded object, then let SectionMemoryManager finalize the
+/// memory. Returns whatever the base implementation returns.
+bool ExecutableFileMemoryManager::finalizeMemory(std::string *ErrMsg) {
+  DEBUG(dbgs() << "BOLT: finalizeMemory()\n");
+  ObjectsLoaded += 1;
+  const bool Result = SectionMemoryManager::finalizeMemory(ErrMsg);
+  return Result;
+}
+
+// Out-of-line destructor with an empty body.
+ExecutableFileMemoryManager::~ExecutableFileMemoryManager() { }
+
+}
+
+}
--- a/src/ExecutableFileMemoryManager.h
+++ b/src/ExecutableFileMemoryManager.h
@ -0,0 +1,100 @@
+//===--- ExecutableFileMemoryManager.h ------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_EXECUTABLE_FILE_MEMORY_MANAGER_H
+#define LLVM_TOOLS_LLVM_BOLT_EXECUTABLE_FILE_MEMORY_MANAGER_H
+
+#include "BinaryContext.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+namespace bolt {
+
+/// Location and extent of a segment, both in memory and in the file.
+struct SegmentInfo {
+  uint64_t Address;           ///< Address of the segment in memory.
+  uint64_t Size;              ///< Size of the segment in memory.
+  uint64_t FileOffset;        ///< Offset in the file.
+  uint64_t FileSize;          ///< Size in file.
+
+  /// Write all four fields to \p OS as hexadecimal values.
+  void print(raw_ostream &OS) const {
+    OS << "SegmentInfo { Address: 0x" << Twine::utohexstr(Address);
+    OS << ", Size: 0x" << Twine::utohexstr(Size);
+    OS << ", FileOffset: 0x" << Twine::utohexstr(FileOffset);
+    OS << ", FileSize: 0x" << Twine::utohexstr(FileSize) << "}";
+  }
+};
+
+/// Stream \p SegInfo to \p OS by forwarding to SegmentInfo::print().
+inline raw_ostream &operator<<(raw_ostream &OS, const SegmentInfo &SegInfo) {
+  SegInfo.print(OS);
+  return OS;
+}
+
+/// Class responsible for allocating and managing code and data sections.
+///
+/// Extends SectionMemoryManager: both allocation callbacks funnel into the
+/// private allocateSection() helper.
+class ExecutableFileMemoryManager : public SectionMemoryManager {
+private:
+  /// Common implementation behind allocateCodeSection() and
+  /// allocateDataSection().
+  uint8_t *allocateSection(intptr_t Size,
+                           unsigned Alignment,
+                           unsigned SectionID,
+                           StringRef SectionName,
+                           bool IsCode,
+                           bool IsReadOnly);
+  BinaryContext &BC;
+  /// Value reported by allowStubAllocation().
+  bool AllowStubs;
+
+public:
+  // Our linker's main purpose is to handle a single object file, created
+  // by RewriteInstance after reading the input binary and reordering it.
+  // After objects finish loading, we increment this. Therefore, whenever
+  // this is greater than zero, we are dealing with additional objects that
+  // will not be managed by BinaryContext but only exist to support linking
+  // user-supplied objects into the main input executable.
+  uint32_t ObjectsLoaded{0};
+
+  /// [start memory address] -> [segment info] mapping.
+  std::map<uint64_t, SegmentInfo> SegmentMapInfo;
+
+  ExecutableFileMemoryManager(BinaryContext &BC, bool AllowStubs)
+    : BC(BC), AllowStubs(AllowStubs) {}
+
+  ~ExecutableFileMemoryManager();
+
+  /// Allocate a code section; forwards with IsCode = true and, as the last
+  /// argument, IsReadOnly = true.
+  uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
+                               unsigned SectionID,
+                               StringRef SectionName) override {
+    return allocateSection(Size, Alignment, SectionID, SectionName,
+                           /*IsCode=*/true, true);
+  }
+
+  /// Allocate a data section; forwards with IsCode = false and the caller's
+  /// \p IsReadOnly.
+  uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
+                               unsigned SectionID, StringRef SectionName,
+                               bool IsReadOnly) override {
+    return allocateSection(Size, Alignment, SectionID, SectionName,
+                           /*IsCode=*/false, IsReadOnly);
+  }
+
+  /// Notifier for a non-allocatable (note) section; defined out of line.
+  uint8_t *recordNoteSection(const uint8_t *Data, uintptr_t Size,
+                             unsigned Alignment, unsigned SectionID,
+                             StringRef SectionName) override;
+
+  bool allowStubAllocation() const override { return AllowStubs; }
+
+  /// Defined out of line.
+  bool finalizeMemory(std::string *ErrMsg = nullptr) override;
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
--- a/src/JumpTable.cpp
+++ b/src/JumpTable.cpp
@ -10,6 +10,7 @@
 //===----------------------------------------------------------------------===//

 #include "JumpTable.h"
+#include "BinaryFunction.h"
 #include "BinarySection.h"
 #include "Relocation.h"
 #include "llvm/MC/MCStreamer.h"
@ -27,8 +28,27 @@ extern cl::opt<JumpTableSupportLevel> JumpTables;
 extern cl::opt<unsigned> Verbosity;
 }

+/// Construct a jump table named \p Name at \p Address inside \p Section,
+/// owned by function \p BF. The output entry size starts out equal to
+/// \p EntrySize. \p Labels is consumed: the map is moved into the new object.
+JumpTable::JumpTable(StringRef Name,
+                     uint64_t Address,
+                     std::size_t EntrySize,
+                     JumpTableType Type,
+                     LabelMapType &&Labels,
+                     BinaryFunction &BF,
+                     BinarySection &Section)
+  : BinaryData(Name, Address, 0, EntrySize, Section),
+    EntrySize(EntrySize),
+    OutputEntrySize(EntrySize),
+    Type(Type),
+    // A named rvalue-reference parameter is an lvalue; without std::move the
+    // whole label map would be copied instead of moved.
+    Labels(std::move(Labels)),
+    Parent(&BF) {
+}
+
 std::pair<size_t, size_t>
 JumpTable::getEntriesForAddress(const uint64_t Addr) const {
+  // Check if this is not an address, but a cloned JT id
+  if ((int64_t)Addr < 0ll)
+    return std::make_pair(0, Entries.size());
+
  const uint64_t InstOffset = Addr - getAddress();
  size_t StartIndex = 0, EndIndex = 0;
  uint64_t Offset = 0;
@ -55,13 +75,12 @@ JumpTable::getEntriesForAddress(const uint64_t Addr) const {
  return std::make_pair(StartIndex, EndIndex);
 }

-bool JumpTable::replaceDestination(uint64_t JTAddress,
-                                   const MCSymbol *OldDest,
+bool JumpTable::replaceDestination(uint64_t JTAddress, const MCSymbol *OldDest,
                                   MCSymbol *NewDest) {
  bool Patched{false};
  const auto Range = getEntriesForAddress(JTAddress);
-  for (auto I = &Entries[Range.first], E = &Entries[Range.second];
-       I != E; ++I) {
+  for (auto I = &Entries[Range.first], E = &Entries[Range.second]; I != E;
+       ++I) {
    auto &Entry = *I;
    if (Entry == OldDest) {
      Patched = true;
@ -153,16 +172,20 @@ uint64_t JumpTable::emit(MCStreamer *Streamer,

 void JumpTable::print(raw_ostream &OS) const {
  uint64_t Offset = 0;
+  if (Type == JTT_PIC)
+    OS << "PIC ";
+  OS << "Jump table " << getName() << " for function " << *Parent << " at 0x"
+     << Twine::utohexstr(getAddress()) << " with a total count of " << Count
+     << ":\n";
+  for (const auto EntryOffset : OffsetEntries) {
+    OS << "  " << Twine::utohexstr(EntryOffset) << '\n';
+  }
  for (const auto *Entry : Entries) {
    auto LI = Labels.find(Offset);
-    if (LI != Labels.end()) {
-      OS << "Jump Table " << LI->second->getName() << " at @0x"
-         << Twine::utohexstr(getAddress()+Offset);
-      if (Offset) {
-        OS << " (possibly part of larger jump table):\n";
-      } else {
-        OS << " with total count of " << Count << ":\n";
-      }
+    if (Offset && LI != Labels.end()) {
+      OS << "Jump Table " << LI->second->getName() << " at 0x"
+         << Twine::utohexstr(getAddress() + Offset)
+        << " (possibly part of larger jump table):\n";
    }
    OS << format("  0x%04" PRIx64 " : ", Offset) << Entry->getName();
    if (!Counts.empty()) {
@ -174,18 +197,3 @@ void JumpTable::print(raw_ostream &OS) const {
  }
  OS << "\n\n";
 }
-
-JumpTable::JumpTable(StringRef Name,
-                     uint64_t Address,
-                     std::size_t EntrySize,
-                     JumpTableType Type,
-                     decltype(OffsetEntries) &&OffsetEntries,
-                     decltype(Labels) &&Labels,
-                     BinarySection &Section)
-  : BinaryData(Name, Address, 0, EntrySize, Section),
-    EntrySize(EntrySize),
-    OutputEntrySize(EntrySize),
-    Type(Type),
-    OffsetEntries(OffsetEntries),
-    Labels(Labels)
-{ }
--- a/src/JumpTable.h
+++ b/src/JumpTable.h
@ -30,11 +30,19 @@ enum JumpTableSupportLevel : char {
  JTS_AGGRESSIVE = 4, /// Aggressive splitting of jump tables.
 };

+class BinaryFunction;
+
 /// Representation of a jump table.
 ///
 /// The jump table may include other jump tables that are referenced by
 /// a different label at a different offset in this jump table.
 class JumpTable : public BinaryData {
+  friend class BinaryContext;
+
+  JumpTable() = delete;
+  JumpTable(const JumpTable &) = delete;
+  JumpTable &operator=(const JumpTable &) = delete;
+
 public:
  enum JumpTableType : char {
    JTT_NORMAL,
@ -60,7 +68,8 @@ public:
  std::vector<MCSymbol *> Entries;

  /// All the entries as offsets into a function. Invalid after CFG is built.
-  std::vector<uint64_t> OffsetEntries;
+  using OffsetsType = std::vector<uint64_t>;
+  OffsetsType OffsetEntries;

  /// Map <Offset> -> <Label> used for embedded jump tables. Label at 0 offset
  /// is the main label for the jump table.
@ -75,6 +84,20 @@ public:
  /// Total number of times this jump table was used.
  uint64_t Count{0};

+  /// BinaryFunction this jump table belongs to.
+  BinaryFunction *Parent{nullptr};
+
+private:
+  /// Constructor should only be called by a BinaryContext.
+  JumpTable(StringRef Name,
+            uint64_t Address,
+            std::size_t EntrySize,
+            JumpTableType Type,
+            LabelMapType &&Labels,
+            BinaryFunction &BF,
+            BinarySection &Section);
+
+public:
  /// Return the size of the jump table.
  uint64_t getSize() const {
    return std::max(OffsetEntries.size(), Entries.size()) * EntrySize;
@ -89,15 +112,6 @@ public:
  /// starting at (or containing) 'Addr'.
  std::pair<size_t, size_t> getEntriesForAddress(const uint64_t Addr) const;

-  /// Constructor.
-  JumpTable(StringRef Name,
-            uint64_t Address,
-            std::size_t EntrySize,
-            JumpTableType Type,
-            decltype(OffsetEntries) &&OffsetEntries,
-            LabelMapType &&Labels,
-            BinarySection &Section);
-
  virtual bool isJumpTable() const override { return true; }

  /// Change all entries of the jump table in \p JTAddress pointing to
--- a/src/MCPlus.h
+++ b/src/MCPlus.h
@ -81,7 +81,7 @@ private:
 template <typename ValueType>
 class MCSimpleAnnotation : public MCAnnotation {
 public:
-  const ValueType &getValue() const { return Value; }
+  ValueType &getValue() { return Value; }
  bool equals(const MCAnnotation &Other) const override {
    return Value == static_cast<const MCSimpleAnnotation &>(Other).Value;
  }
--- a/src/MCPlusBuilder.cpp
+++ b/src/MCPlusBuilder.cpp
@ -148,12 +148,13 @@ int64_t MCPlusBuilder::getGnuArgsSize(const MCInst &Inst) const {
  return *Value;
 }

-void MCPlusBuilder::addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize) {
+void MCPlusBuilder::addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize,
+                                   AllocatorIdTy AllocId) {
  assert(GnuArgsSize >= 0 && "cannot set GNU_args_size to negative value");
  assert(getGnuArgsSize(Inst) == -1LL && "GNU_args_size already set");
  assert(isInvoke(Inst) && "GNU_args_size can only be set for invoke");

-  setAnnotationOpValue(Inst, MCAnnotation::kGnuArgsSize, GnuArgsSize);
+  setAnnotationOpValue(Inst, MCAnnotation::kGnuArgsSize, GnuArgsSize, AllocId);
 }

 uint64_t MCPlusBuilder::getJumpTable(const MCInst &Inst) const {
@ -163,13 +164,24 @@ uint64_t MCPlusBuilder::getJumpTable(const MCInst &Inst) const {
  return *Value;
 }

+// Read back the "JTIndexReg" annotation that setJumpTable() stores on \p Inst.
+// getAnnotationAs() asserts that the annotation exists.
+uint16_t MCPlusBuilder::getJumpTableIndexReg(const MCInst &Inst) const {
+  return getAnnotationAs<uint16_t>(Inst, "JTIndexReg");
+}
+
 bool MCPlusBuilder::setJumpTable(MCInst &Inst, uint64_t Value,
-                                 uint16_t IndexReg) {
+                                 uint16_t IndexReg, AllocatorIdTy AllocId) {
  if (!isIndirectBranch(Inst))
    return false;
-  assert(getJumpTable(Inst) == 0 && "jump table already set");
-  setAnnotationOpValue(Inst, MCAnnotation::kJumpTable, Value);
-  addAnnotation<>(Inst, "JTIndexReg", IndexReg);
+  setAnnotationOpValue(Inst, MCAnnotation::kJumpTable, Value, AllocId);
+  getOrCreateAnnotationAs<uint16_t>(Inst, "JTIndexReg", AllocId) = IndexReg;
+  return true;
+}
+
+bool MCPlusBuilder::unsetJumpTable(MCInst &Inst) {
+  if (!getJumpTable(Inst))
+    return false;
+  removeAnnotation(Inst, MCAnnotation::kJumpTable);
+  removeAnnotation(Inst, "JTIndexReg");
  return true;
 }

@ -214,41 +226,12 @@ bool MCPlusBuilder::removeAnnotation(MCInst &Inst, unsigned Index) {
    auto ImmValue = AnnotationInst->getOperand(I).getImm();
    if (extractAnnotationIndex(ImmValue) == Index) {
      AnnotationInst->erase(AnnotationInst->begin() + I);
-      auto *Annotation =
-        reinterpret_cast<MCAnnotation *>(extractAnnotationValue(ImmValue));
-      auto Itr = AnnotationPool.find(Annotation);
-      if (Itr != AnnotationPool.end()) {
-        AnnotationPool.erase(Itr);
-        Annotation->~MCAnnotation();
-      }
      return true;
    }
  }
-
  return false;
 }

-void MCPlusBuilder::removeAllAnnotations(MCInst &Inst) {
-  auto *AnnotationInst = getAnnotationInst(Inst);
-  if (!AnnotationInst)
-    return;
-
-  for (int I = AnnotationInst->getNumOperands() - 1; I >= 0; --I) {
-    auto ImmValue = AnnotationInst->getOperand(I).getImm();
-    AnnotationInst->erase(std::prev(AnnotationInst->end()));
-    auto *Annotation =
-      reinterpret_cast<MCAnnotation *>(extractAnnotationValue(ImmValue));
-    auto Itr = AnnotationPool.find(Annotation);
-    if (Itr != AnnotationPool.end()) {
-      AnnotationPool.erase(Itr);
-      Annotation->~MCAnnotation();
-    }
-  }
-
-  // Clear all attached MC+ info since it's no longer used.
-  Inst.erase(std::prev(Inst.end()));
-}
-
 void MCPlusBuilder::stripAnnotations(MCInst &Inst) {
  auto *AnnotationInst = getAnnotationInst(Inst);
  if (!AnnotationInst)
@ -268,7 +251,7 @@ MCPlusBuilder::printAnnotations(const MCInst &Inst, raw_ostream &OS) const {
    const auto Index = extractAnnotationIndex(Imm);
    const auto Value = extractAnnotationValue(Imm);
    const auto *Annotation =
-        reinterpret_cast<const MCAnnotation *>(Value);
+            reinterpret_cast<const MCAnnotation *>(Value);
    if (Index >= MCAnnotation::kGeneric) {
      OS << " # " << AnnotationNames[Index - MCAnnotation::kGeneric]
         << ": ";
@ -283,7 +266,7 @@ bool MCPlusBuilder::evaluateBranch(const MCInst &Inst, uint64_t Addr,
 }

 void MCPlusBuilder::getClobberedRegs(const MCInst &Inst,
-                                       BitVector &Regs) const {
+                                     BitVector &Regs) const {
  if (isPrefix(Inst) || isCFI(Inst))
    return;

@ -302,7 +285,7 @@ void MCPlusBuilder::getClobberedRegs(const MCInst &Inst,
 }

 void MCPlusBuilder::getTouchedRegs(const MCInst &Inst,
-                                     BitVector &Regs) const {
+                                   BitVector &Regs) const {
  if (isPrefix(Inst) || isCFI(Inst))
    return;

@ -325,7 +308,7 @@ void MCPlusBuilder::getTouchedRegs(const MCInst &Inst,
 }

 void MCPlusBuilder::getWrittenRegs(const MCInst &Inst,
-                                     BitVector &Regs) const {
+                                   BitVector &Regs) const {
  if (isPrefix(Inst) || isCFI(Inst))
    return;

@ -381,7 +364,7 @@ bool MCPlusBuilder::hasUseOfPhysReg(const MCInst &MI, unsigned Reg) const {

 const BitVector &
 MCPlusBuilder::getAliases(MCPhysReg Reg,
-                            bool OnlySmaller) const {
+                          bool OnlySmaller) const {
  // AliasMap caches a mapping of registers to the set of registers that
  // alias (are sub or superregs of itself, including itself).
  static std::vector<BitVector> AliasMap;
--- a/src/MCPlusBuilder.h
+++ b/src/MCPlusBuilder.h
@ -35,8 +35,11 @@
 #include <cassert>
 #include <cstdint>
 #include <map>
+#include <mutex>
 #include <set>
+#include <shared_mutex>
 #include <system_error>
+#include <unordered_map>
 #include <unordered_set>

 namespace llvm {
@ -44,26 +47,31 @@ namespace bolt {

 /// Different types of indirect branches encountered during disassembly.
 enum class IndirectBranchType : char {
-  UNKNOWN = 0,              /// Unable to determine type.
-  POSSIBLE_TAIL_CALL,       /// Possibly a tail call.
-  POSSIBLE_JUMP_TABLE,      /// Possibly a switch/jump table.
-  POSSIBLE_PIC_JUMP_TABLE,  /// Possibly a jump table for PIC.
-  POSSIBLE_GOTO,            /// Possibly a gcc's computed goto.
-  POSSIBLE_FIXED_BRANCH,    /// Possibly an indirect branch to a fixed location.
+  UNKNOWN = 0,             /// Unable to determine type.
+  POSSIBLE_TAIL_CALL,      /// Possibly a tail call.
+  POSSIBLE_JUMP_TABLE,     /// Possibly a switch/jump table.
+  POSSIBLE_PIC_JUMP_TABLE, /// Possibly a jump table for PIC.
+  POSSIBLE_GOTO,           /// Possibly a gcc's computed goto.
+  POSSIBLE_FIXED_BRANCH,   /// Possibly an indirect branch to a fixed location.
 };

 class MCPlusBuilder {
+public:
+  using AllocatorIdTy = uint16_t;
+
 private:
-  /// Annotation instruction allocator.
-  SpecificBumpPtrAllocator<MCInst> MCInstAllocator;
+  /// A struct that represents a single annotation allocator
+  struct AnnotationAllocator {
+    /// Allocates the MCInst that holds an instruction's annotation operands.
+    SpecificBumpPtrAllocator<MCInst> MCInstAllocator;
+    /// Allocates storage for annotation values.
+    BumpPtrAllocator ValueAllocator;
+    /// Annotations of non-trivial type. Their destructors are called
+    /// explicitly before ValueAllocator is reset.
+    std::unordered_set<MCPlus::MCAnnotation *> AnnotationPool;
+  };

-  /// Annotation value allocator.
-  BumpPtrAllocator Allocator;
+  /// A set of annotation allocators
+  std::unordered_map<AllocatorIdTy, AnnotationAllocator> AnnotationAllocators;

-  /// Record all the annotations with non-trivial type.  To prevent leaks, these
-  /// will need destructors called when the annotation is removed or when all
-  /// annotations are destroyed.
-  std::unordered_set<MCPlus::MCAnnotation*> AnnotationPool;
+  /// A variable that is used to generate unique ids for annotation allocators
+  AllocatorIdTy MaxAllocatorId = 0;

  /// We encode Index and Value into a 64-bit immediate operand value.
  static int64_t encodeAnnotationImm(unsigned Index, int64_t Value) {
@ -100,10 +108,12 @@ private:
    return AnnotationInst;
  }

-  void setAnnotationOpValue(MCInst &Inst, unsigned Index, int64_t Value) {
+  void setAnnotationOpValue(MCInst &Inst, unsigned Index, int64_t Value,
+                            AllocatorIdTy AllocatorId = 0) {
    auto *AnnotationInst = getAnnotationInst(Inst);
    if (!AnnotationInst) {
-      AnnotationInst = new (MCInstAllocator.Allocate()) MCInst();
+      auto &Allocator = getAnnotationAllocator(AllocatorId);
+      AnnotationInst = new (Allocator.MCInstAllocator.Allocate()) MCInst();
      AnnotationInst->setOpcode(TargetOpcode::ANNOTATION_LABEL);
      Inst.addOperand(MCOperand::createInst(AnnotationInst));
    }
@ -278,20 +288,55 @@ public:
 public:
  MCPlusBuilder(const MCInstrAnalysis *Analysis, const MCInstrInfo *Info,
                const MCRegisterInfo *RegInfo)
-    : Analysis(Analysis), Info(Info), RegInfo(RegInfo) {}
+      : Analysis(Analysis), Info(Info), RegInfo(RegInfo) {
+    // Initialize the default annotation allocator with id 0
+    AnnotationAllocators.emplace(0, AnnotationAllocator());
+    MaxAllocatorId++;
+  }
+
+  /// Register a fresh annotation allocator under the next unused id and
+  /// return that id
+  AllocatorIdTy initializeNewAnnotationAllocator() {
+    const AllocatorIdTy NewId = MaxAllocatorId;
+    AnnotationAllocators.emplace(NewId, AnnotationAllocator());
+    ++MaxAllocatorId;
+    return NewId;
+  }
+
+  /// Look up the annotation allocator registered under \p AllocatorId.
+  /// The id must already have been initialized.
+  AnnotationAllocator &getAnnotationAllocator(AllocatorIdTy AllocatorId) {
+    auto It = AnnotationAllocators.find(AllocatorId);
+    assert(It != AnnotationAllocators.end() && "allocator not initialized");
+    return It->second;
+  }
+
+  /// Check if an annotation allocator with the given id exists.
+  /// This only queries the allocator map and never creates an allocator.
+  bool checkAllocatorExists(AllocatorIdTy AllocatorId) const {
+    return AnnotationAllocators.count(AllocatorId) != 0;
+  }
+
+  /// Free the values allocator within the annotation allocator
+  ///
+  /// Calls the destructor of every annotation recorded in the allocator's
+  /// AnnotationPool, clears the pool and resets its ValueAllocator. The
+  /// allocator's MCInstAllocator is not touched.
+  void freeValuesAllocator(AllocatorIdTy AllocatorId) {
+    auto &Allocator = getAnnotationAllocator(AllocatorId);
+    for (auto *Annotation : Allocator.AnnotationPool)
+      Annotation->~MCAnnotation();
+
+    Allocator.AnnotationPool.clear();
+    Allocator.ValueAllocator.Reset();
+  }

  virtual ~MCPlusBuilder() {
    freeAnnotations();
  }

-  /// Free all memory allocated for annotations.
+  /// Free all memory allocated for annotations
  void freeAnnotations() {
-    for (auto *Annotation : AnnotationPool) {
-      Annotation->~MCAnnotation();
+    for (auto &Element : AnnotationAllocators) {
+      auto &Allocator = Element.second;
+      for (auto *Annotation : Allocator.AnnotationPool)
+        Annotation->~MCAnnotation();
+
+      Allocator.AnnotationPool.clear();
+      Allocator.ValueAllocator.Reset();
+      Allocator.MCInstAllocator.DestroyAll();
    }
-    AnnotationPool.clear();
-    MCInstAllocator.DestroyAll();
-    Allocator.Reset();
  }

  using CompFuncTy = std::function<bool(const MCSymbol *, const MCSymbol *)>;
@ -334,6 +379,11 @@ public:
    return false;
  }

+  /// Return true if inverting the branch with opcode \p Opcode is unsupported
+  virtual bool isUnsupportedBranch(unsigned Opcode) const {
+    return false;
+  }
+
  /// Return true of the instruction is of pseudo kind.
  bool isPseudo(const MCInst &Inst) const {
    return Info->get(Inst.getOpcode()).isPseudo();
@ -353,11 +403,28 @@ public:
    llvm_unreachable("not implemented");
  }

+  virtual void createPushRegisterIndirect(MCInst &Inst,
+                                          const MCPhysReg &BaseReg, int64_t Scale,
+                                          const MCPhysReg &IndexReg, int64_t Offset,
+                                          const MCExpr *OffsetExpr,
+                                          const MCPhysReg &AddrSegmentReg,
+                                          unsigned Size) const {
+    llvm_unreachable("not implemented");
+  }
+
  virtual void createPopRegister(MCInst &Inst, MCPhysReg Reg,
                                 unsigned Size) const {
    llvm_unreachable("not implemented");
  }

+  virtual void createPushFlags(MCInst &Inst, unsigned Size) const {
+    llvm_unreachable("not implemented");
+  }
+
+  virtual void createPopFlags(MCInst &Inst, unsigned Size) const {
+    llvm_unreachable("not implemented");
+  }
+
  virtual bool createDirectCall(MCInst &Inst, const MCSymbol *Target,
                                MCContext *Ctx) {
    llvm_unreachable("not implemented");
@ -368,7 +435,22 @@ public:
    llvm_unreachable("not implemented");
  }

-  virtual MCPhysReg getX86NoRegister() const {
+  virtual MCPhysReg getInstructionPointer() const {
+    llvm_unreachable("not implemented");
+  }
+
+  /// Return a register number that is guaranteed to not match with
+  /// any real register on the underlying architecture.
+  virtual MCPhysReg getNoRegister() const {
+    llvm_unreachable("not implemented");
+  }
+
+  /// Return a register corresponding to a function integer argument \p ArgNo
+  /// if the argument is passed in a register. Or return the result of
+  /// getNoRegister() otherwise. The enumeration starts at 0.
+  ///
+  /// Note: this should depend on a used calling convention.
+  virtual MCPhysReg getIntArgRegister(unsigned ArgNo) const {
    llvm_unreachable("not implemented");
  }

@ -394,6 +476,11 @@ public:
    return false;
  }

+  virtual bool isBreakpoint(const MCInst &Inst) const {
+    llvm_unreachable("not implemented");
+    return false;
+  }
+
  virtual bool isPrefix(const MCInst &Inst) const {
    llvm_unreachable("not implemented");
    return false;
@ -457,6 +544,11 @@ public:
    return false;
  }

+  virtual bool isLfence(const MCInst &Inst) const {
+    llvm_unreachable("not implemented");
+    return false;
+  }
+
  virtual bool isLeave(const MCInst &Inst) const {
    llvm_unreachable("not implemented");
    return false;
@ -482,6 +574,11 @@ public:
    return false;
  }

+  virtual bool isActualLoad(const MCInst &Inst) const {
+    llvm_unreachable("not implemented");
+    return false;
+  }
+
  virtual bool isLoad(const MCInst &Inst) const {
    llvm_unreachable("not implemented");
    return false;
@ -890,9 +987,9 @@ public:
  /// of the passed \p Symbol plus \p Addend. If the instruction does not have
  /// an immediate operand or has more than one - then return false. Otherwise
  /// return true.
-  virtual bool replaceImmWithSymbol(MCInst &Inst, MCSymbol *Symbol,
-                                    int64_t Addend, MCContext *Ctx,
-                                    int64_t &Value, uint64_t RelType) const {
+  virtual bool replaceImmWithSymbolRef(MCInst &Inst, const MCSymbol *Symbol,
+                                       int64_t Addend, MCContext *Ctx,
+                                       int64_t &Value, uint64_t RelType) const {
    llvm_unreachable("not implemented");
    return false;
  }
@ -957,14 +1054,21 @@ public:
  int64_t getGnuArgsSize(const MCInst &Inst) const;

  /// Add the value of GNU_args_size to Inst if it already has EH info.
-  void addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize);
+  void addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize,
+                      AllocatorIdTy AllocId = 0);

  /// Return jump table addressed by this instruction.
  uint64_t getJumpTable(const MCInst &Inst) const;

+  /// Return index register for instruction that uses a jump table.
+  uint16_t getJumpTableIndexReg(const MCInst &Inst) const;
+
  /// Set jump table addressed by this instruction.
-  bool setJumpTable(MCInst &Inst, uint64_t Value,
-                    uint16_t IndexReg);
+  bool setJumpTable(MCInst &Inst, uint64_t Value, uint16_t IndexReg,
+                    AllocatorIdTy AllocId = 0);
+
+  /// Disassociate instruction with a jump table.
+  bool unsetJumpTable(MCInst &Inst);

  /// Return destination of conditional tail call instruction if \p Inst is one.
  Optional<uint64_t> getConditionalTailCall(const MCInst &Inst) const;
@ -1126,7 +1230,7 @@ public:
  }

  /// Replace instruction opcode to be a tail call instead of jump.
-  virtual bool convertJmpToTailCall(MCInst &Inst, MCContext *Ctx) {
+  virtual bool convertJmpToTailCall(MCInst &Inst) {
    llvm_unreachable("not implemented");
    return false;
  }
@ -1334,6 +1438,32 @@ public:
    return false;
  }

+  /// Create instruction to left shift contents of target
+  virtual bool createShl(MCInst &Inst, const MCPhysReg &BaseReg, int64_t Scale,
+                         const MCPhysReg &IndexReg, int64_t Offset,
+                         const MCExpr *OffsetExpr,
+                         const MCPhysReg &AddrSegmentReg,
+                         uint8_t Immediate, int Size) const {
+    llvm_unreachable("not implemented");
+    return false;
+  }
+
+  /// Create instruction to load an effective address into a target
+  virtual bool createLea(MCInst &Inst, const MCPhysReg &BaseReg, int64_t Scale,
+                         const MCPhysReg &IndexReg, int64_t Offset,
+                         const MCExpr *OffsetExpr, const MCPhysReg &AddrSegmentReg,
+                         const MCPhysReg &DstReg, int Size) const {
+    llvm_unreachable("not implemented");
+    return false;
+  }
+
+  /// Create instruction to increment contents of target by 1
+  virtual bool createIncMemory(MCInst &Inst, const MCSymbol *Target,
+                               MCContext *Ctx) const {
+    llvm_unreachable("not implemented");
+    return false;
+  }
+
  /// Create a fragment of code (sequence of instructions) that load a 32-bit
  /// address from memory, zero-extends it to 64 and jump to it (indirect jump).
  virtual bool
@ -1364,6 +1494,21 @@ public:
    return true;
  }

+  /// Create an inline version of memcpy(dest, src, 1).
+  virtual std::vector<MCInst> createOneByteMemcpy() const {
+    llvm_unreachable("not implemented");
+    return {};
+  }
+
+  /// Create a sequence of instructions to compare contents of a register
+  /// \p RegNo to immediate \p Imm and jump to \p Target if they are equal.
+  virtual std::vector<MCInst>
+  createCmpJE(MCPhysReg RegNo, int64_t Imm, const MCSymbol *Target,
+              MCContext *Ctx) const {
+    llvm_unreachable("not implemented");
+    return {};
+  }
+
  /// Creates inline memcpy instruction. If \p ReturnEnd is true, then return
  /// (dest + n) instead of dest.
  virtual std::vector<MCInst> createInlineMemcpy(bool ReturnEnd) const {
@ -1411,7 +1556,6 @@ public:
    return true;
  }

-
  /// Return annotation index matching the \p Name.
  Optional<unsigned> getAnnotationIndex(StringRef Name) const {
    auto AI = AnnotationNameIndexMap.find(Name);
@ -1437,25 +1581,30 @@ public:
  /// Store an annotation value on an MCInst.  This assumes the annotation
  /// is not already present.
  template <typename ValueType>
-  const ValueType &addAnnotation(MCInst &Inst,
-                                 unsigned Index,
-                                 const ValueType &Val) {
+  const ValueType &addAnnotation(MCInst &Inst, unsigned Index,
+                                 const ValueType &Val,
+                                 AllocatorIdTy AllocatorId = 0) {
    assert(!hasAnnotation(Inst, Index));
-    auto *A = new (Allocator) MCPlus::MCSimpleAnnotation<ValueType>(Val);
+    auto &Allocator = getAnnotationAllocator(AllocatorId);
+    auto *A = new (Allocator.ValueAllocator)
+        MCPlus::MCSimpleAnnotation<ValueType>(Val);
+
    if (!std::is_trivial<ValueType>::value) {
-      AnnotationPool.insert(A);
+      Allocator.AnnotationPool.insert(A);
    }
-    setAnnotationOpValue(Inst, Index, reinterpret_cast<int64_t>(A));
+    setAnnotationOpValue(Inst, Index, reinterpret_cast<int64_t>(A),
+                         AllocatorId);
    return A->getValue();
  }

  /// Store an annotation value on an MCInst.  This assumes the annotation
  /// is not already present.
  template <typename ValueType>
-  const ValueType &addAnnotation(MCInst &Inst,
-                                 StringRef Name,
-                                 const ValueType &Val) {
-    return addAnnotation(Inst, getOrCreateAnnotationIndex(Name), Val);
+  const ValueType &addAnnotation(MCInst &Inst, StringRef Name,
+                                 const ValueType &Val,
+                                 AllocatorIdTy AllocatorId = 0) {
+    return addAnnotation(Inst, getOrCreateAnnotationIndex(Name), Val,
+                         AllocatorId);
  }

  /// Get an annotation as a specific value, but if the annotation does not
@ -1463,12 +1612,13 @@ public:
  /// Return a non-const ref so caller can freely modify its contents
  /// afterwards.
  template <typename ValueType>
-  ValueType& getOrCreateAnnotationAs(MCInst &Inst, unsigned Index) {
+  ValueType &getOrCreateAnnotationAs(MCInst &Inst, unsigned Index,
+                                     AllocatorIdTy AllocatorId = 0) {
    auto Val =
-      tryGetAnnotationAs<ValueType>(const_cast<const MCInst &>(Inst), Index);
+        tryGetAnnotationAs<ValueType>(const_cast<const MCInst &>(Inst), Index);
    if (!Val)
-      Val = addAnnotation(Inst, Index, ValueType());
-    return const_cast<ValueType&>(*Val);
+      Val = addAnnotation(Inst, Index, ValueType(), AllocatorId);
+    return const_cast<ValueType &>(*Val);
  }

  /// Get an annotation as a specific value, but if the annotation does not
@ -1476,25 +1626,26 @@ public:
  /// Return a non-const ref so caller can freely modify its contents
  /// afterwards.
  template <typename ValueType>
-  ValueType& getOrCreateAnnotationAs(MCInst &Inst, StringRef Name) {
+  ValueType &getOrCreateAnnotationAs(MCInst &Inst, StringRef Name,
+                                     AllocatorIdTy AllocatorId = 0) {
    const auto Index = getOrCreateAnnotationIndex(Name);
-    return getOrCreateAnnotationAs<ValueType>(Inst, Index);
+    return getOrCreateAnnotationAs<ValueType>(Inst, Index, AllocatorId);
  }

  /// Get an annotation as a specific value. Assumes that the annotation exists.
  /// Use hasAnnotation() if the annotation may not exist.
  template <typename ValueType>
-  const ValueType &getAnnotationAs(const MCInst &Inst, unsigned Index) const {
+  ValueType &getAnnotationAs(const MCInst &Inst, unsigned Index) const {
    auto Value = getAnnotationOpValue(Inst, Index);
    assert(Value && "annotation should exist");
-    return reinterpret_cast<const MCPlus::MCSimpleAnnotation<ValueType> *>
+    return reinterpret_cast<MCPlus::MCSimpleAnnotation<ValueType> *>
      (*Value)->getValue();
  }

  /// Get an annotation as a specific value. Assumes that the annotation exists.
  /// Use hasAnnotation() if the annotation may not exist.
  template <typename ValueType>
-  const ValueType &getAnnotationAs(const MCInst &Inst, StringRef Name) const {
+  ValueType &getAnnotationAs(const MCInst &Inst, StringRef Name) const {
    const auto Index = getAnnotationIndex(Name);
    assert(Index && "annotation should exist");
    return getAnnotationAs<ValueType>(Inst, *Index);
@ -1586,9 +1737,6 @@ public:
    return removeAnnotation(Inst, *Index);
  }

-  /// Remove all meta-data annotations from Inst.
-  void removeAllAnnotations(MCInst &Inst);
-
  /// Remove meta-data, but don't destroy it.
  void stripAnnotations(MCInst &Inst);

@ -1610,8 +1758,13 @@ public:
  /// empty vector of instructions.  The label is meant to indicate the basic
  /// block where all previous snippets are joined, i.e. the instructions that
  /// would immediate follow the original call.
-  using ICPdata = std::vector<std::pair<MCSymbol*, std::vector<MCInst>>>;
-  virtual ICPdata indirectCallPromotion(
+  using BlocksVectorTy = std::vector<std::pair<MCSymbol*, std::vector<MCInst>>>;
+  struct MultiBlocksCode {
+    BlocksVectorTy Blocks;
+    std::vector<MCSymbol*> Successors;
+  };
+
+  virtual BlocksVectorTy indirectCallPromotion(
    const MCInst &CallInst,
    const std::vector<std::pair<MCSymbol *, uint64_t>> &Targets,
    const std::vector<std::pair<MCSymbol *, uint64_t>> &VtableSyms,
@ -1620,19 +1773,18 @@ public:
    MCContext *Ctx
  ) {
    llvm_unreachable("not implemented");
-    return ICPdata();
+    return BlocksVectorTy();
  }

-  virtual ICPdata jumpTablePromotion(
+  virtual BlocksVectorTy jumpTablePromotion(
    const MCInst &IJmpInst,
    const std::vector<std::pair<MCSymbol *,uint64_t>>& Targets,
    const std::vector<MCInst *> &TargetFetchInsns,
    MCContext *Ctx
  ) const {
    llvm_unreachable("not implemented");
-    return ICPdata();
+    return BlocksVectorTy();
  }
-
 };

 } // namespace bolt
--- a/src/ParallelUtilities.cpp
+++ b/src/ParallelUtilities.cpp
@ -0,0 +1,232 @@
+//===--- ParallelUtilities.cpp -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "ParallelUtilities.h"
+#include "llvm/Support/Timer.h"
+#include <mutex>
+#include <shared_mutex>
+
+#define DEBUG_TYPE "par-utils"
+
+// Command-line options that control how parallel work is scheduled.
+namespace opts {
+extern cl::OptionCategory BoltCategory;
+
+// Used to construct the shared thread pool and, multiplied by the number of
+// tasks per thread, to decide how many blocks the work is divided into.
+cl::opt<unsigned>
+ThreadCount("thread-count",
+  cl::desc("number of threads"),
+  cl::init(hardware_concurrency()),
+  cl::cat(BoltCategory));
+
+// When set, the runOnEachFunction* helpers process every function on the
+// calling thread instead of submitting blocks to the thread pool.
+cl::opt<bool>
+NoThreads("no-threads",
+  cl::desc("disable multithreading"),
+  cl::init(false),
+  cl::cat(BoltCategory));
+
+// NOTE(review): not referenced in this file; presumably supplies the default
+// for the TasksPerThread parameters -- confirm against the header.
+cl::opt<unsigned>
+TaskCount("tasks-per-thread",
+  cl::desc("number of tasks to be created per thread"),
+  cl::init(20),
+  cl::cat(BoltCategory));
+
+} // namespace opts
+
+namespace llvm {
+namespace bolt {
+namespace ParallelUtilities {
+
+namespace {
+/// A single thread pool that is used to run parallel tasks
+std::unique_ptr<ThreadPool> ThreadPoolPtr;
+
+/// Estimate the cost of processing \p BF under \p SchedPolicy.
+///
+/// SP_TRIVIAL charges one unit per function without consulting
+/// \p SkipPredicate. Under every other policy a function accepted by
+/// \p SkipPredicate costs nothing.
+unsigned computeCostFor(const BinaryFunction &BF,
+                        const PredicateTy &SkipPredicate,
+                        const SchedulingPolicy &SchedPolicy) {
+  const bool IsTrivial = SchedPolicy == SchedulingPolicy::SP_TRIVIAL;
+  if (!IsTrivial && SkipPredicate && SkipPredicate(BF))
+    return 0;
+
+  switch (SchedPolicy) {
+  case SchedulingPolicy::SP_TRIVIAL:
+  case SchedulingPolicy::SP_CONSTANT:
+    return 1;
+  case SchedulingPolicy::SP_INST_LINEAR:
+    return BF.getSize();
+  case SchedulingPolicy::SP_INST_QUADRATIC:
+    return BF.getSize() * BF.getSize();
+  case SchedulingPolicy::SP_BB_LINEAR:
+    return BF.size();
+  case SchedulingPolicy::SP_BB_QUADRATIC:
+    return BF.size() * BF.size();
+  default:
+    llvm_unreachable("unsupported scheduling policy");
+  }
+}
+
+/// Sum the estimated cost of every function in \p BC under \p SchedPolicy.
+///
+/// If the sum is zero, a warning is printed, \p SchedPolicy is changed in
+/// place to SP_TRIVIAL and the number of functions is returned instead.
+inline unsigned estimateTotalCost(const BinaryContext &BC,
+                                  const PredicateTy &SkipPredicate,
+                                  SchedulingPolicy &SchedPolicy) {
+  const auto &Functions = BC.getBinaryFunctions();
+  if (SchedPolicy == SchedulingPolicy::SP_TRIVIAL)
+    return Functions.size();
+
+  unsigned Sum = 0;
+  for (const auto &Entry : Functions)
+    Sum += computeCostFor(Entry.second, SkipPredicate, SchedPolicy);
+
+  if (Sum != 0)
+    return Sum;
+
+  // No estimated work at all: fall back to one unit of cost per function.
+  outs() << "BOLT-WARNING: Running parallel work of 0 estimated cost, will "
+            "switch to  trivial scheduling.\n";
+  SchedPolicy = SP_TRIVIAL;
+  return Functions.size();
+}
+
+} // namespace
+
+/// Return the shared thread pool, constructing it from opts::ThreadCount on
+/// first use.
+ThreadPool &getThreadPool() {
+  if (!ThreadPoolPtr)
+    ThreadPoolPtr = std::make_unique<ThreadPool>(opts::ThreadCount);
+
+  return *ThreadPoolPtr;
+}
+
+/// Run \p WorkFunction on every function in \p BC that \p SkipPredicate does
+/// not accept.
+///
+/// With opts::NoThreads, \p ForceSequential, or a zero opts::ThreadCount, all
+/// functions are processed on the calling thread. Otherwise functions are
+/// grouped, in map order, into contiguous blocks of roughly equal estimated
+/// cost under \p SchedPolicy, aiming for \p TasksPerThread blocks per thread.
+/// Each block is submitted to the shared thread pool, and the call returns
+/// only after the pool has finished. \p LogName names the per-block timer,
+/// which is started and stopped inside DEBUG().
+void runOnEachFunction(BinaryContext &BC, SchedulingPolicy SchedPolicy,
+                       WorkFuncTy WorkFunction, PredicateTy SkipPredicate,
+                       std::string LogName, bool ForceSequential,
+                       unsigned TasksPerThread) {
+  if (BC.getBinaryFunctions().size() == 0)
+    return;
+
+  auto runBlock = [&](std::map<uint64_t, BinaryFunction>::iterator BlockBegin,
+                      std::map<uint64_t, BinaryFunction>::iterator BlockEnd) {
+    Timer T(LogName, LogName);
+    DEBUG(T.startTimer());
+
+    for (auto It = BlockBegin; It != BlockEnd; ++It) {
+      auto &BF = It->second;
+      if (SkipPredicate && SkipPredicate(BF))
+        continue;
+
+      WorkFunction(BF);
+    }
+    DEBUG(T.stopTimer());
+  };
+
+  // A zero thread count leaves nothing to distribute work across and would
+  // make the block count below zero, so process everything right here.
+  if (opts::NoThreads || ForceSequential || opts::ThreadCount == 0) {
+    runBlock(BC.getBinaryFunctions().begin(), BC.getBinaryFunctions().end());
+    return;
+  }
+
+  // Estimate the overall runtime cost using the scheduling policy
+  const unsigned TotalCost = estimateTotalCost(BC, SkipPredicate, SchedPolicy);
+  const unsigned BlocksCount = TasksPerThread * opts::ThreadCount;
+  // BlocksCount is zero when TasksPerThread is zero; never divide by it.
+  const unsigned BlockCost = (BlocksCount != 0 && TotalCost > BlocksCount)
+                                 ? TotalCost / BlocksCount
+                                 : 1;
+
+  // Divide work into blocks of equal cost
+  ThreadPool &Pool = getThreadPool();
+  auto BlockBegin = BC.getBinaryFunctions().begin();
+  unsigned CurrentCost = 0;
+
+  for (auto It = BC.getBinaryFunctions().begin();
+       It != BC.getBinaryFunctions().end(); ++It) {
+    auto &BF = It->second;
+    CurrentCost += computeCostFor(BF, SkipPredicate, SchedPolicy);
+
+    // Close the current block as soon as it has accumulated enough cost.
+    if (CurrentCost >= BlockCost) {
+      Pool.async(runBlock, BlockBegin, std::next(It));
+      BlockBegin = std::next(It);
+      CurrentCost = 0;
+    }
+  }
+  // Whatever is left over (possibly nothing) forms the last block.
+  Pool.async(runBlock, BlockBegin, BC.getBinaryFunctions().end());
+  Pool.wait();
+}
+
+/// Same as runOnEachFunction(), except that \p WorkFunction also receives an
+/// annotation allocator id.
+///
+/// The sequential path (opts::NoThreads, \p ForceSequential, or a zero
+/// opts::ThreadCount) passes allocator id 0. On the parallel path every
+/// scheduled block gets its own id, counting up from 1, and an allocator is
+/// created in BC.MIB for any id that does not exist yet.
+///
+/// MainLock is held exclusively by the calling thread while blocks are being
+/// scheduled, and every task starts by acquiring it in shared mode, so no task
+/// enters its loop before scheduling is complete.
+/// NOTE(review): presumably this keeps allocator creation from racing with
+/// tasks that look allocators up -- confirm.
+void runOnEachFunctionWithUniqueAllocId(
+    BinaryContext &BC, SchedulingPolicy SchedPolicy,
+    WorkFuncWithAllocTy WorkFunction, PredicateTy SkipPredicate,
+    std::string LogName, bool ForceSequential, unsigned TasksPerThread) {
+  if (BC.getBinaryFunctions().size() == 0)
+    return;
+
+  std::shared_timed_mutex MainLock;
+  auto runBlock = [&](std::map<uint64_t, BinaryFunction>::iterator BlockBegin,
+                      std::map<uint64_t, BinaryFunction>::iterator BlockEnd,
+                      MCPlusBuilder::AllocatorIdTy AllocId) {
+    Timer T(LogName, LogName);
+    DEBUG(T.startTimer());
+    std::shared_lock<std::shared_timed_mutex> Lock(MainLock);
+    for (auto It = BlockBegin; It != BlockEnd; ++It) {
+      auto &BF = It->second;
+      if (SkipPredicate && SkipPredicate(BF))
+        continue;
+
+      WorkFunction(BF, AllocId);
+    }
+    DEBUG(T.stopTimer());
+  };
+
+  // A zero thread count leaves nothing to distribute work across and would
+  // make the block count below zero, so process everything right here.
+  if (opts::NoThreads || ForceSequential || opts::ThreadCount == 0) {
+    runBlock(BC.getBinaryFunctions().begin(), BC.getBinaryFunctions().end(), 0);
+    return;
+  }
+  // This lock is used to postpone task execution
+  std::unique_lock<std::shared_timed_mutex> Lock(MainLock);
+
+  // Estimate the overall runtime cost using the scheduling policy
+  const unsigned TotalCost = estimateTotalCost(BC, SkipPredicate, SchedPolicy);
+  const unsigned BlocksCount = TasksPerThread * opts::ThreadCount;
+  // BlocksCount is zero when TasksPerThread is zero; never divide by it.
+  const unsigned BlockCost = (BlocksCount != 0 && TotalCost > BlocksCount)
+                                 ? TotalCost / BlocksCount
+                                 : 1;
+
+  // Divide work into blocks of equal cost
+  ThreadPool &Pool = getThreadPool();
+  auto BlockBegin = BC.getBinaryFunctions().begin();
+  unsigned CurrentCost = 0;
+  unsigned AllocId = 1;
+  for (auto It = BC.getBinaryFunctions().begin();
+       It != BC.getBinaryFunctions().end(); ++It) {
+    auto &BF = It->second;
+    CurrentCost += computeCostFor(BF, SkipPredicate, SchedPolicy);
+
+    // Close the current block as soon as it has accumulated enough cost.
+    if (CurrentCost >= BlockCost) {
+      // Allocators left over from an earlier run are reused.
+      if (!BC.MIB->checkAllocatorExists(AllocId)) {
+        auto Id = BC.MIB->initializeNewAnnotationAllocator();
+        assert(AllocId == Id && "unexpected allocator id created");
+      }
+      Pool.async(runBlock, BlockBegin, std::next(It), AllocId);
+      AllocId++;
+      BlockBegin = std::next(It);
+      CurrentCost = 0;
+    }
+  }
+
+  if (!BC.MIB->checkAllocatorExists(AllocId)) {
+    auto Id = BC.MIB->initializeNewAnnotationAllocator();
+    assert(AllocId == Id && "unexpected allocator id created");
+  }
+
+  // Whatever is left over (possibly nothing) forms the last block.
+  Pool.async(runBlock, BlockBegin, BC.getBinaryFunctions().end(), AllocId);
+  // Scheduling is done: let the queued tasks proceed, then wait for them.
+  Lock.unlock();
+  Pool.wait();
+}
+
+} // namespace ParallelUtilities
+} // namespace bolt
+} // namespace llvm
--- a/src/ParallelUtilities.h
+++ b/src/ParallelUtilities.h
@ -0,0 +1,78 @@
+//===-- ParallelUtilities.h - ----------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This class creates an interface that can be used to run parallel tasks that
+// operate on functions. Several scheduling criteria are supported using
+// SchedulingPolicy, and are defined by how the runtime cost should be
+// estimated.
+// If the NoThreads flag is passed, work will execute sequentially.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_PARALLEL_UTILITIES_H
+#define LLVM_TOOLS_LLVM_BOLT_PARALLEL_UTILITIES_H
+
+#include "BinaryContext.h"
+#include "BinaryFunction.h"
+#include "MCPlusBuilder.h"
+#include "llvm/Support/ThreadPool.h"
+
+using namespace llvm;
+
+namespace opts {
+extern cl::opt<unsigned> ThreadCount;
+extern cl::opt<bool> NoThreads;
+extern cl::opt<unsigned> TaskCount;
+}
+
+namespace llvm {
+namespace bolt {
+namespace ParallelUtilities {
+
+/// Work callback that, besides the function to process, receives an annotation
+/// allocator id.
+using WorkFuncWithAllocTy =
+    std::function<void(BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy)>;
+/// Work callback that receives only the function to process.
+using WorkFuncTy = std::function<void(BinaryFunction &BF)>;
+/// Predicate over a function; the run* entry points below use it as the skip
+/// filter.
+using PredicateTy = std::function<bool(const BinaryFunction &BF)>;
+
+/// Selects how the runtime cost of the work is estimated when scheduling.
+enum SchedulingPolicy {
+  SP_TRIVIAL,     ///< cost is estimated by the number of functions
+  SP_CONSTANT,    ///< cost is estimated by the number of non-skipped functions
+  SP_INST_LINEAR, ///< cost is estimated by inst count
+  SP_INST_QUADRATIC, ///< cost is estimated by the square of the inst count
+  SP_BB_LINEAR,      ///< cost is estimated by BB count
+  SP_BB_QUADRATIC,   ///< cost is estimated by the square of the BB count
+};
+
+/// Return the managed threadpool and initialize it if not initialized
+ThreadPool &getThreadPool();
+
+/// Perform the work on each BinaryFunction except those for which
+/// SkipPredicate returns true, scheduling heuristic is based on SchedPolicy.
+/// An empty SkipPredicate is the default.
+/// ForceSequential will selectively disable parallel execution and perform the
+/// work sequentially.
+void runOnEachFunction(BinaryContext &BC, SchedulingPolicy SchedPolicy,
+                       WorkFuncTy WorkFunction,
+                       PredicateTy SkipPredicate = PredicateTy(),
+                       std::string LogName = "", bool ForceSequential = false,
+                       unsigned TasksPerThread = opts::TaskCount);
+
+/// Perform the work on each BinaryFunction except those for which
+/// SkipPredicate returns true, and create a unique annotation allocator for
+/// each task. This should be used whenever the work function creates
+/// annotations to allow thread-safe annotation creation.
+/// ForceSequential will selectively disable parallel execution and perform the
+/// work sequentially.
+void runOnEachFunctionWithUniqueAllocId(
+    BinaryContext &BC, SchedulingPolicy SchedPolicy,
+    WorkFuncWithAllocTy WorkFunction, PredicateTy SkipPredicate,
+    std::string LogName = "", bool ForceSequential = false,
+    unsigned TasksPerThread = opts::TaskCount);
+
+} // namespace ParallelUtilities
+} // namespace bolt
+} // namespace llvm
+#endif
--- a/src/Passes/Aligner.cpp
+++ b/src/Passes/Aligner.cpp
@ -10,6 +10,7 @@
 //===----------------------------------------------------------------------===//

 #include "Aligner.h"
+#include "ParallelUtilities.h"

 #define DEBUG_TYPE "bolt-aligner"

@ -88,16 +89,16 @@ void alignMaxBytes(BinaryFunction &Function) {
 // the fuction by not more than the minimum over
 // -- the size of the function
 // -- the specified number of bytes
-void alignCompact(BinaryFunction &Function) {
+void alignCompact(BinaryFunction &Function, const MCCodeEmitter *Emitter) {
  const auto &BC = Function.getBinaryContext();
  size_t HotSize = 0;
  size_t ColdSize = 0;

  for (const auto *BB : Function.layout()) {
    if (BB->isCold())
-      ColdSize += BC.computeCodeSize(BB->begin(), BB->end());
+      ColdSize += BC.computeCodeSize(BB->begin(), BB->end(), Emitter);
    else
-      HotSize += BC.computeCodeSize(BB->begin(), BB->end());
+      HotSize += BC.computeCodeSize(BB->begin(), BB->end(), Emitter);
  }

  Function.setAlignment(opts::AlignFunctions);
@ -114,13 +115,15 @@ void alignCompact(BinaryFunction &Function) {

 } // end anonymous namespace

-void AlignerPass::alignBlocks(BinaryFunction &Function) {
+void AlignerPass::alignBlocks(BinaryFunction &Function,
+                              const MCCodeEmitter *Emitter) {
  if (!Function.hasValidProfile() || !Function.isSimple())
    return;

  const auto &BC = Function.getBinaryContext();

-  const auto FuncCount = std::max(1UL, Function.getKnownExecutionCount());
+  const auto FuncCount =
+      std::max<uint64_t>(1, Function.getKnownExecutionCount());
  BinaryBasicBlock *PrevBB{nullptr};
  for (auto *BB : Function.layout()) {
    auto Count = BB->getKnownExecutionCount();
@ -139,8 +142,9 @@ void AlignerPass::alignBlocks(BinaryFunction &Function) {
    if (Count < FTCount * 2)
      continue;

-    const auto BlockSize = BC.computeCodeSize(BB->begin(), BB->end());
-    const auto BytesToUse = std::min(opts::BlockAlignment - 1UL, BlockSize);
+    const auto BlockSize = BC.computeCodeSize(BB->begin(), BB->end(), Emitter);
+    const auto BytesToUse =
+        std::min<uint64_t>(opts::BlockAlignment - 1, BlockSize);

    if (opts::AlignBlocksMinSize && BlockSize < opts::AlignBlocksMinSize)
      continue;
@ -149,30 +153,36 @@ void AlignerPass::alignBlocks(BinaryFunction &Function) {
    BB->setAlignmentMaxBytes(BytesToUse);

    // Update stats.
-    AlignHistogram[BytesToUse]++;
-    AlignedBlocksCount += BB->getKnownExecutionCount();
+    DEBUG(
+      std::unique_lock<std::shared_timed_mutex> Lock(AlignHistogramMtx);
+      AlignHistogram[BytesToUse]++;
+      AlignedBlocksCount += BB->getKnownExecutionCount();
+    );
  }
 }

-void AlignerPass::runOnFunctions(BinaryContext &BC,
-                                 std::map<uint64_t, BinaryFunction> &BFs,
-                                 std::set<uint64_t> &LargeFunctions) {
+void AlignerPass::runOnFunctions(BinaryContext &BC) {
  if (!BC.HasRelocations)
    return;

  AlignHistogram.resize(opts::BlockAlignment);

-  for (auto &It : BFs) {
-    auto &Function = It.second;
+  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
+    // Create a separate MCCodeEmitter to allow lock free execution
+    auto Emitter = BC.createIndependentMCCodeEmitter();

    if (opts::UseCompactAligner)
-      alignCompact(Function);
+      alignCompact(BF, Emitter.MCE.get());
    else
-      alignMaxBytes(Function);
+      alignMaxBytes(BF);

    if (opts::AlignBlocks && !opts::PreserveBlocksAlignment)
-      alignBlocks(Function);
-  }
+      alignBlocks(BF, Emitter.MCE.get());
+  };
+
+  ParallelUtilities::runOnEachFunction(
+      BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun,
+      ParallelUtilities::PredicateTy(nullptr), "AlignerPass");

  DEBUG(
    dbgs() << "BOLT-DEBUG: max bytes per basic block alignment distribution:\n";
--- a/src/Passes/Aligner.h
+++ b/src/Passes/Aligner.h
@ -19,15 +19,15 @@ namespace bolt {

 class AlignerPass : public BinaryFunctionPass {
 private:
-
  /// Stats for usage of max bytes for basic block alignment.
  std::vector<uint32_t> AlignHistogram;
+  std::shared_timed_mutex AlignHistogramMtx;

  /// Stats: execution count of blocks that were aligned.
-  uint64_t AlignedBlocksCount{0};
+  std::atomic<uint64_t> AlignedBlocksCount{0};

  /// Assign alignment to basic blocks based on profile.
-  void alignBlocks(BinaryFunction &Function);
+  void alignBlocks(BinaryFunction &Function, const MCCodeEmitter *Emitter);

 public:
  explicit AlignerPass() : BinaryFunctionPass(false) {}
@ -37,9 +37,7 @@ public:
  }

  /// Pass entry point
-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;
 };

 } // namespace bolt
--- a/src/Passes/AllocCombiner.cpp
+++ b/src/Passes/AllocCombiner.cpp
@ -100,14 +100,13 @@ void AllocCombinerPass::combineAdjustments(BinaryContext &BC,
  }
 }

-void AllocCombinerPass::runOnFunctions(BinaryContext &BC,
-                                       std::map<uint64_t, BinaryFunction> &BFs,
-                                       std::set<uint64_t> &LargeFunctions) {
+void AllocCombinerPass::runOnFunctions(BinaryContext &BC) {
  if (opts::FrameOptimization == FOP_NONE)
    return;

  runForAllWeCare(
-      BFs, [&](BinaryFunction &Function) { combineAdjustments(BC, Function); });
+      BC.getBinaryFunctions(),
+      [&](BinaryFunction &Function) { combineAdjustments(BC, Function); });

  outs() << "BOLT-INFO: Allocation combiner: " << NumCombined
         << " empty spaces coalesced.\n";
--- a/src/Passes/AllocCombiner.h
+++ b/src/Passes/AllocCombiner.h
@ -40,9 +40,7 @@ public:
  }

  /// Pass entry point
-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;
 };

 } // namespace bolt
--- a/src/Passes/BinaryFunctionCallGraph.cpp
+++ b/src/Passes/BinaryFunctionCallGraph.cpp
@ -77,7 +77,6 @@ std::deque<BinaryFunction *> BinaryFunctionCallGraph::buildTraversalOrder() {
 }

 BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC,
-                                       std::map<uint64_t, BinaryFunction> &BFs,
                                       CgFilterFunction Filter,
                                       bool CgFromPerfData,
                                       bool IncludeColdCalls,
@ -126,7 +125,7 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC,
  uint64_t NoProfileCallsites = 0;
  uint64_t NumFallbacks = 0;
  uint64_t RecursiveCallsites = 0;
-  for (auto &It : BFs) {
+  for (auto &It : BC.getBinaryFunctions()) {
    auto *Function = &It.second;

    if (Filter(*Function)) {
--- a/src/Passes/BinaryFunctionCallGraph.h
+++ b/src/Passes/BinaryFunctionCallGraph.h
@ -57,7 +57,7 @@ private:
 using CgFilterFunction = std::function<bool (const BinaryFunction &BF)>;
 inline bool NoFilter(const BinaryFunction &) { return false; }

-/// Builds a call graph from the map of BinaryFunctions provided in BFs.
+/// Builds a call graph from the map of BinaryFunctions provided in BC.
 /// The arguments control how the graph is constructed.
 /// Filter is called on each function, any function that it returns true for
 /// is omitted from the graph.
@ -68,7 +68,6 @@ inline bool NoFilter(const BinaryFunction &) { return false; }
 /// UseEdgeCounts is used to control if the Weight attribute on Arcs is computed
 /// using the number of calls.
 BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC,
-                                       std::map<uint64_t, BinaryFunction> &BFs,
                                       CgFilterFunction Filter = NoFilter,
                                       bool CgFromPerfData = false,
                                       bool IncludeColdCalls = true,
--- a/src/Passes/BinaryPasses.cpp
+++ b/src/Passes/BinaryPasses.cpp
@ -10,9 +10,12 @@
 //===----------------------------------------------------------------------===//

 #include "BinaryPasses.h"
+#include "ParallelUtilities.h"
 #include "Passes/ReorderAlgorithm.h"
 #include "llvm/Support/Options.h"
+
 #include <numeric>
+#include <vector>

 #define DEBUG_TYPE "bolt-opts"

@ -54,8 +57,10 @@ extern cl::OptionCategory BoltOptCategory;
 extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion;
 extern cl::opt<unsigned> Verbosity;
 extern cl::opt<bool> SplitEH;
+extern cl::opt<bool> EnableBAT;
 extern cl::opt<bolt::BinaryFunction::SplittingType> SplitFunctions;
 extern bool shouldProcess(const bolt::BinaryFunction &Function);
+extern bool isHotTextMover(const bolt::BinaryFunction &Function);

 enum DynoStatsSortOrder : char {
  Ascending,
@ -134,6 +139,22 @@ PrintSortedBy("print-sorted-by",
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

+static cl::opt<bool>
+PrintUnknown("print-unknown",
+  cl::desc("print names of functions with unknown control flow"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(BoltCategory),
+  cl::Hidden);
+
+static cl::opt<bool>
+PrintUnknownCFG("print-unknown-cfg",
+  cl::desc("dump CFG of functions with unknown control flow"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(BoltCategory),
+  cl::ReallyHidden);
+
 static cl::opt<bolt::ReorderBasicBlocks::LayoutType>
 ReorderBlocks("reorder-blocks",
  cl::desc("change layout of basic blocks in a function"),
@ -267,7 +288,7 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) {
        if (!BB->isValid()) {
          dbgs() << "BOLT-INFO: UCE found unreachable block " << BB->getName()
                 << " in function " << Function << "\n";
-          BB->dump();
+          Function.dump();
        }
      }
    });
@ -275,7 +296,10 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) {
    DeletedBlocks += Count;
    DeletedBytes += Bytes;
    if (Count) {
-      Modified.insert(&Function);
+      {
+        std::unique_lock<std::shared_timed_mutex> Lock(ModifiedMtx);
+        Modified.insert(&Function);
+      }
      if (opts::Verbosity > 0) {
        outs() << "BOLT-INFO: Removed " << Count
               << " dead basic block(s) accounting for " << Bytes
@ -285,17 +309,19 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) {
  }
 }

-void EliminateUnreachableBlocks::runOnFunctions(
-  BinaryContext&,
-  std::map<uint64_t, BinaryFunction> &BFs,
-  std::set<uint64_t> &
-) {
-  for (auto &It : BFs) {
-    auto &Function = It.second;
-    if (shouldOptimize(Function)) {
-      runOnFunction(Function);
-    }
-  }
+/// Run runOnFunction() on every function that passes shouldOptimize(),
+/// dispatching through ParallelUtilities::runOnEachFunction, then print the
+/// accumulated number of removed blocks and bytes.
+void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
+  // NOTE(review): runOnFunction() updates DeletedBlocks and DeletedBytes
+  // outside of the ModifiedMtx lock; confirm these counters are safe to update
+  // from several tasks at once.
+  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
+    runOnFunction(BF);
+  };
+
+  // Functions that should not be optimized are skipped.
+  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
+    return !shouldOptimize(BF);
+  };
+
+  ParallelUtilities::runOnEachFunction(
+      BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun, SkipFunc,
+      "EliminateUnreachableBlocks");
+
  outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
         << DeletedBytes << " bytes of code.\n";
 }
@ -305,43 +331,43 @@ bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const {
          opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE);
 }

-void ReorderBasicBlocks::runOnFunctions(
-        BinaryContext &BC,
-        std::map<uint64_t, BinaryFunction> &BFs,
-        std::set<uint64_t> &LargeFunctions) {
+void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
  if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
    return;

  IsAArch64 = BC.isAArch64();
+  std::atomic<uint64_t> ModifiedFuncCount{0};

-  uint64_t ModifiedFuncCount = 0;
-  for (auto &It : BFs) {
-    auto &Function = It.second;
-
-    if (!shouldOptimize(Function))
-      continue;
-
+  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    const bool ShouldSplit =
-            (opts::SplitFunctions == BinaryFunction::ST_ALL) ||
-            (opts::SplitFunctions == BinaryFunction::ST_EH &&
-             Function.hasEHRanges()) ||
-            (LargeFunctions.find(It.first) != LargeFunctions.end());
-    modifyFunctionLayout(Function, opts::ReorderBlocks, opts::MinBranchClusters,
+        (opts::SplitFunctions == BinaryFunction::ST_ALL) ||
+        (opts::SplitFunctions == BinaryFunction::ST_EH && BF.hasEHRanges()) ||
+        BF.shouldSplit();
+    modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters,
                         ShouldSplit);
-
-    if (Function.hasLayoutChanged()) {
+    if (BF.hasLayoutChanged()) {
      ++ModifiedFuncCount;
    }
-  }
+  };
+
+  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
+    return !shouldOptimize(BF);
+  };
+
+  ParallelUtilities::runOnEachFunction(
+      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
+      "ReorderBasicBlocks");

  outs() << "BOLT-INFO: basic block reordering modified layout of "
-         << format("%zu (%.2lf%%) functions\n",
-                   ModifiedFuncCount, 100.0 * ModifiedFuncCount / BFs.size());
+         << format("%zu (%.2lf%%) functions\n", ModifiedFuncCount.load(),
+                   100.0 * ModifiedFuncCount.load() /
+                       BC.getBinaryFunctions().size());

  if (opts::PrintFuncStat > 0) {
    raw_ostream &OS = outs();
    // Copy all the values into vector in order to sort them
    std::map<uint64_t, BinaryFunction &> ScoreMap;
+    auto &BFs = BC.getBinaryFunctions();
    for (auto It = BFs.begin(); It != BFs.end(); ++It) {
      ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>(
          It->second.getFunctionScore(), It->second));
@ -349,8 +375,8 @@ void ReorderBasicBlocks::runOnFunctions(

    OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
    OS << "           There are " << BFs.size() << " functions in total. \n";
-    OS << "           Number of functions being modified: " << ModifiedFuncCount
-       << "\n";
+    OS << "           Number of functions being modified: "
+       << ModifiedFuncCount.load() << "\n";
    OS << "           User asks for detailed information on top "
       << opts::PrintFuncStat << " functions. (Ranked by function score)"
       << "\n\n";
@ -550,11 +576,8 @@ void ReorderBasicBlocks::splitFunction(BinaryFunction &BF) const {
  }
 }

-void FixupBranches::runOnFunctions(
-  BinaryContext &BC,
-  std::map<uint64_t, BinaryFunction> &BFs,
-  std::set<uint64_t> &) {
-  for (auto &It : BFs) {
+void FixupBranches::runOnFunctions(BinaryContext &BC) {
+  for (auto &It : BC.getBinaryFunctions()) {
    auto &Function = It.second;
    if (BC.HasRelocations || shouldOptimize(Function)) {
      if (BC.HasRelocations && !Function.isSimple())
@ -564,42 +587,38 @@ void FixupBranches::runOnFunctions(
  }
 }

-void FinalizeFunctions::runOnFunctions(
-  BinaryContext &BC,
-  std::map<uint64_t, BinaryFunction> &BFs,
-  std::set<uint64_t> &
-) {
-  for (auto &It : BFs) {
-    auto &Function = It.second;
-    const auto ShouldOptimize = shouldOptimize(Function);
-
-    // Always fix functions in relocation mode.
-    if (!BC.HasRelocations && !ShouldOptimize)
-      continue;
-
-    // Fix the CFI state.
-    if (ShouldOptimize && !Function.finalizeCFIState()) {
+/// Finalize every function that is not skipped: fix its CFI state, mark it
+/// finalized and update its exception handling information. The work is
+/// dispatched through ParallelUtilities::runOnEachFunction.
+void FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
+  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
+    // Fix the CFI state. A failure is fatal in relocation mode; otherwise the
+    // function is marked non-simple and is not finalized.
+    // NOTE(review): this callback may run on a thread pool worker; confirm
+    // that calling exit() from there is acceptable.
+    if (shouldOptimize(BF) && !BF.finalizeCFIState()) {
      if (BC.HasRelocations) {
-        errs() << "BOLT-ERROR: unable to fix CFI state for function "
-               << Function << ". Exiting.\n";
+        errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
+               << ". Exiting.\n";
        exit(1);
      }
-      Function.setSimple(false);
-      continue;
+      BF.setSimple(false);
+      return;
    }

-    Function.setFinalized();
+    BF.setFinalized();

    // Update exception handling information.
-    Function.updateEHRanges();
-  }
+    BF.updateEHRanges();
+  };
+
+  // Always fix functions in relocation mode; without relocations only the
+  // functions that pass shouldOptimize() are processed.
+  ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
+    return !BC.HasRelocations && !shouldOptimize(BF);
+  };
+
+  ParallelUtilities::runOnEachFunction(
+      BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
+      SkipPredicate, "FinalizeFunctions");
 }

-void LowerAnnotations::runOnFunctions(
-    BinaryContext &BC,
-    std::map<uint64_t, BinaryFunction> &BFs,
-    std::set<uint64_t> &) {
-  for (auto &It : BFs) {
+void LowerAnnotations::runOnFunctions(BinaryContext &BC) {
+  std::vector<std::pair<MCInst *, uint64_t>> PreservedSDTAnnotations;
+  std::vector<std::pair<MCInst *, uint32_t>> PreservedOffsetAnnotations;
+
+  for (auto &It : BC.getBinaryFunctions()) {
    auto &BF = It.second;
    int64_t CurrentGnuArgsSize = 0;

@ -612,9 +631,12 @@ void LowerAnnotations::runOnFunctions(
        CurrentGnuArgsSize = 0;
      }

-      for (auto II = BB->begin(); II != BB->end(); ++II) {
-        // Convert GnuArgsSize annotations into CFIs.
-        if (BF.usesGnuArgsSize() && BC.MIB->isInvoke(*II)) {
+      // First convert GnuArgsSize annotations into CFIs. This may change instr
+      // pointers, so do it before recording ptrs for preserved annotations
+      if (BF.usesGnuArgsSize()) {
+        for (auto II = BB->begin(); II != BB->end(); ++II) {
+          if (!BC.MIB->isInvoke(*II))
+            continue;
          const auto NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II);
          assert(NewGnuArgsSize >= 0 && "expected non-negative GNU_args_size");
          if (NewGnuArgsSize != CurrentGnuArgsSize) {
@ -624,13 +646,33 @@ void LowerAnnotations::runOnFunctions(
            II = std::next(InsertII);
          }
        }
-        BC.MIB->removeAllAnnotations(*II);
+      }
+
+      // Now record preserved annotations separately and then strip annotations
+      for (auto II = BB->begin(); II != BB->end(); ++II) {
+        if (BC.MIB->hasAnnotation(*II, "SDTMarker")) {
+          PreservedSDTAnnotations.push_back(std::make_pair(
+              &(*II), BC.MIB->getAnnotationAs<uint64_t>(*II, "SDTMarker")));
+        }
+
+        if (opts::EnableBAT && BC.MIB->hasAnnotation(*II, "Offset")) {
+          PreservedOffsetAnnotations.push_back(std::make_pair(
+              &(*II), BC.MIB->getAnnotationAs<uint32_t>(*II, "Offset")));
+        }
+
+        BC.MIB->stripAnnotations(*II);
      }
    }
  }

-  // Release all memory taken by annotations.
+  // Release all memory taken by annotations
  BC.MIB->freeAnnotations();
+
+  // Reinsert preserved annotations we need during code emission.
+  for (const auto &Item : PreservedSDTAnnotations)
+    BC.MIB->addAnnotation<uint64_t>(*Item.first, "SDTMarker", Item.second);
+  for (const auto &Item : PreservedOffsetAnnotations)
+    BC.MIB->addAnnotation<uint32_t>(*Item.first, "Offset", Item.second);
 }

 namespace {
@ -984,15 +1026,11 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC,
  return NumLocalCTCs > 0;
 }

-void SimplifyConditionalTailCalls::runOnFunctions(
-  BinaryContext &BC,
-  std::map<uint64_t, BinaryFunction> &BFs,
-  std::set<uint64_t> &
-) {
+void SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return;

-  for (auto &It : BFs) {
+  for (auto &It : BC.getBinaryFunctions()) {
    auto &Function = It.second;

    if (!shouldOptimize(Function))
@ -1080,9 +1118,7 @@ void Peepholes::removeUselessCondBranches(BinaryContext &BC,
  }
 }

-void Peepholes::runOnFunctions(BinaryContext &BC,
-                               std::map<uint64_t, BinaryFunction> &BFs,
-                               std::set<uint64_t> &LargeFunctions) {
+void Peepholes::runOnFunctions(BinaryContext &BC) {
  const char Opts =
    std::accumulate(opts::Peepholes.begin(),
                    opts::Peepholes.end(),
@ -1093,7 +1129,7 @@ void Peepholes::runOnFunctions(BinaryContext &BC,
  if (Opts == opts::PEEP_NONE || !BC.isX86())
    return;

-  for (auto &It : BFs) {
+  for (auto &It : BC.getBinaryFunctions()) {
    auto &Function = It.second;
    if (shouldOptimize(Function)) {
      if (Opts & opts::PEEP_SHORTEN)
@ -1197,12 +1233,8 @@ bool SimplifyRODataLoads::simplifyRODataLoads(
  return NumLocalLoadsSimplified > 0;
 }

-void SimplifyRODataLoads::runOnFunctions(
-  BinaryContext &BC,
-  std::map<uint64_t, BinaryFunction> &BFs,
-  std::set<uint64_t> &
-) {
-  for (auto &It : BFs) {
+void SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
+  for (auto &It : BC.getBinaryFunctions()) {
    auto &Function = It.second;
    if (shouldOptimize(Function) && simplifyRODataLoads(BC, Function)) {
      Modified.insert(&Function);
@ -1216,24 +1248,156 @@ void SimplifyRODataLoads::runOnFunctions(
         << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n";
 }

+/// Choose the output code section names for the functions in \p BC.
+///
+/// Injected functions always receive the injected code section names. All
+/// other functions are only touched in relocation mode.
+void AssignSections::runOnFunctions(BinaryContext &BC) {
+  for (auto *Injected : BC.getInjectedBinaryFunctions()) {
+    Injected->setCodeSectionName(BC.getInjectedCodeSectionName());
+    Injected->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
+  }
+
+  // Without relocations the section names are pre-assigned; nothing to do.
+  if (!BC.HasRelocations)
+    return;
+
+  const bool HaveProfiledFuncs = BC.NumProfiledFuncs > 0;
+  for (auto &Entry : BC.getBinaryFunctions()) {
+    auto &BF = Entry.second;
+
+    // Hot text mover functions use one dedicated section for both parts.
+    if (opts::isHotTextMover(BF)) {
+      BF.setCodeSectionName(BC.getHotTextMoverSectionName());
+      BF.setColdCodeSectionName(BC.getHotTextMoverSectionName());
+      continue;
+    }
+
+    // When some functions are profiled, a function with neither a valid index
+    // nor a valid profile goes to the cold section; all others go to main.
+    const bool PlaceInCold =
+        HaveProfiledFuncs && !BF.hasValidIndex() && !BF.hasValidProfile();
+    if (PlaceInCold)
+      BF.setCodeSectionName(BC.getColdCodeSectionName());
+    else
+      BF.setCodeSectionName(BC.getMainCodeSectionName());
+
+    if (BF.isSplit())
+      BF.setColdCodeSectionName(BC.getColdCodeSectionName());
+  }
+}
+
+/// Print a profile bias score that measures how far the profile is from flow
+/// conservation.
+///
+/// For each block of every non-empty simple function, the sum of the profiled
+/// incoming edge counts is compared with the sum of the profiled outgoing edge
+/// counts. The relative imbalance |out - in| / in is averaged over all scored
+/// blocks and reported together with its standard deviation. When verbosity is
+/// at least 1, the function with the worst per-function average is named too.
+void PrintProfileStats::runOnFunctions(BinaryContext &BC) {
+  double FlowImbalanceMean = 0.0;
+  size_t NumBlocksConsidered = 0;
+  double WorstBias = 0.0;
+  const BinaryFunction *WorstBiasFunc = nullptr;
+
+  // For each function CFG, we fill an IncomingMap with the sum of the frequency
+  // of incoming edges for each BB. Likewise for each OutgoingMap and the sum
+  // of the frequency of outgoing edges.
+  using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
+  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
+  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;
+
+  // Blocks that take part in the score: low frequency blocks, exit blocks (no
+  // outgoing flow) and entry points are left out. Both passes below must apply
+  // the same filter, otherwise the deviation is measured over a different set
+  // of blocks than the mean it is centered on.
+  auto isScoredBlock = [](FlowMapTy &IncomingMap, FlowMapTy &OutgoingMap,
+                          const BinaryBasicBlock &BB) {
+    return IncomingMap[&BB] >= 100 && OutgoingMap[&BB] != 0 &&
+           !BB.isEntryPoint();
+  };
+
+  // Relative flow imbalance of a scored block. The incoming count is never
+  // zero here because scored blocks have at least 100 incoming executions.
+  auto relativeImbalance = [](FlowMapTy &IncomingMap, FlowMapTy &OutgoingMap,
+                              const BinaryBasicBlock &BB) {
+    const double Difference =
+        static_cast<double>(OutgoingMap[&BB]) - IncomingMap[&BB];
+    return fabs(Difference / IncomingMap[&BB]);
+  };
+
+  // Compute mean
+  for (const auto &BFI : BC.getBinaryFunctions()) {
+    const BinaryFunction &Function = BFI.second;
+    if (Function.empty() || !Function.isSimple())
+      continue;
+    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
+    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
+
+    // Accumulate the flow along every profiled edge with a non-zero count.
+    for (const auto &BB : Function) {
+      auto TotalOutgoing = 0ULL;
+      auto SuccBIIter = BB.branch_info_begin();
+      for (auto Succ : BB.successors()) {
+        const auto Count = SuccBIIter->Count;
+        // Branch info is walked in lockstep with the successor list.
+        ++SuccBIIter;
+        if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0)
+          continue;
+        TotalOutgoing += Count;
+        IncomingMap[Succ] += Count;
+      }
+      OutgoingMap[&BB] = TotalOutgoing;
+    }
+
+    size_t NumBlocks = 0;
+    double Mean = 0.0;
+    for (const auto &BB : Function) {
+      if (!isScoredBlock(IncomingMap, OutgoingMap, BB))
+        continue;
+      ++NumBlocks;
+      Mean += relativeImbalance(IncomingMap, OutgoingMap, BB);
+    }
+
+    FlowImbalanceMean += Mean;
+    NumBlocksConsidered += NumBlocks;
+    if (!NumBlocks)
+      continue;
+
+    // Remember the function with the largest average imbalance per block.
+    const double FuncMean = Mean / NumBlocks;
+    if (FuncMean > WorstBias) {
+      WorstBias = FuncMean;
+      WorstBiasFunc = &Function;
+    }
+  }
+  if (NumBlocksConsidered > 0)
+    FlowImbalanceMean /= NumBlocksConsidered;
+
+  // Compute standard deviation over the same set of blocks as the mean
+  NumBlocksConsidered = 0;
+  double FlowImbalanceVar = 0.0;
+  for (const auto &BFI : BC.getBinaryFunctions()) {
+    const BinaryFunction &Function = BFI.second;
+    if (Function.empty() || !Function.isSimple())
+      continue;
+    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
+    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
+    for (const auto &BB : Function) {
+      if (!isScoredBlock(IncomingMap, OutgoingMap, BB))
+        continue;
+      ++NumBlocksConsidered;
+      const double Deviation =
+          relativeImbalance(IncomingMap, OutgoingMap, BB) - FlowImbalanceMean;
+      FlowImbalanceVar += Deviation * Deviation;
+    }
+  }
+  if (NumBlocksConsidered) {
+    FlowImbalanceVar /= NumBlocksConsidered;
+    FlowImbalanceVar = sqrt(FlowImbalanceVar);
+  }
+
+  // Report to user
+  outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
+                   (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar));
+  if (WorstBiasFunc && opts::Verbosity >= 1) {
+    outs() << "Worst average bias observed in " << WorstBiasFunc->getPrintName()
+           << "\n";
+    DEBUG(WorstBiasFunc->dump());
+  }
+}
+
 void
-PrintProgramStats::runOnFunctions(BinaryContext &BC,
-                                  std::map<uint64_t, BinaryFunction> &BFs,
-                                  std::set<uint64_t> &) {
+PrintProgramStats::runOnFunctions(BinaryContext &BC) {
  uint64_t NumSimpleFunctions{0};
  uint64_t NumStaleProfileFunctions{0};
  uint64_t NumNonSimpleProfiledFunctions{0};
+  uint64_t NumUnknownControlFlowFunctions{0};
  std::vector<BinaryFunction *> ProfiledFunctions;
  const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
-  for (auto &BFI : BFs) {
+  for (auto &BFI : BC.getBinaryFunctions()) {
    auto &Function = BFI.second;
    if (!Function.isSimple()) {
-      if (Function.hasProfile()) {
+      if (Function.hasProfile() && !Function.isPLTFunction()) {
        ++NumNonSimpleProfiledFunctions;
      }
      continue;
    }
    ++NumSimpleFunctions;
+    if (Function.hasUnknownControlFlow()) {
+      if (opts::PrintUnknownCFG) {
+        Function.dump();
+      } else if (opts::PrintUnknown) {
+        errs() << "function with unknown control flow: " << Function <<'\n';
+      }
+      ++NumUnknownControlFlowFunctions;
+    }
    if (!Function.hasProfile())
      continue;
    if (Function.hasValidProfile()) {
@ -1321,11 +1485,11 @@ PrintProgramStats::runOnFunctions(BinaryContext &BC,
    std::vector<const BinaryFunction *> Functions;
    std::map<const BinaryFunction *, DynoStats> Stats;

-    for (const auto &BFI : BFs) {
+    for (const auto &BFI : BC.getBinaryFunctions()) {
      const auto &BF = BFI.second;
      if (shouldOptimize(BF) && BF.hasValidProfile()) {
        Functions.push_back(&BF);
-        Stats.emplace(&BF, BF.getDynoStats());
+        Stats.emplace(&BF, getDynoStats(BF));
      }
    }

@ -1377,7 +1541,7 @@ PrintProgramStats::runOnFunctions(BinaryContext &BC,
    outs() << " are:\n";
    auto SFI = Functions.begin();
    for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) {
-      const auto Stats = (*SFI)->getDynoStats();
+      const auto Stats = getDynoStats(**SFI);
      outs() << "  " << **SFI;
      if (!SortAll) {
        outs() << " (";
@ -1427,12 +1591,13 @@ PrintProgramStats::runOnFunctions(BinaryContext &BC,
  // Collect and print information about suboptimal code layout on input.
  if (opts::ReportBadLayout) {
    std::vector<const BinaryFunction *> SuboptimalFuncs;
-    for (auto &BFI : BFs) {
+    for (auto &BFI : BC.getBinaryFunctions()) {
      const auto &BF = BFI.second;
      if (!BF.hasValidProfile())
        continue;

-      const auto HotThreshold = std::max(BF.getKnownExecutionCount(), 1UL);
+      const auto HotThreshold =
+          std::max<uint64_t>(BF.getKnownExecutionCount(), 1);
      bool HotSeen = false;
      for (const auto *BB : BF.rlayout()) {
        if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) {
@ -1463,13 +1628,19 @@ PrintProgramStats::runOnFunctions(BinaryContext &BC,
      }
    }
  }
+
+  if (NumUnknownControlFlowFunctions) {
+    outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
+           << " functions have instructions with unknown control flow";
+    if (!opts::PrintUnknown) {
+      outs() << ". Use -print-unknown to see the list.";
+    }
+    outs() << '\n';
+  }
 }

-void InstructionLowering::runOnFunctions(
-    BinaryContext &BC,
-    std::map<uint64_t, BinaryFunction> &BFs,
-    std::set<uint64_t> &LargeFunctions) {
-  for (auto &BFI : BFs) {
+void InstructionLowering::runOnFunctions(BinaryContext &BC) {
+  for (auto &BFI : BC.getBinaryFunctions()) {
    for (auto &BB : BFI.second) {
      for (auto &Instruction : BB) {
        BC.MIB->lowerTailCall(Instruction);
@ -1478,13 +1649,10 @@ void InstructionLowering::runOnFunctions(
  }
 }

-void StripRepRet::runOnFunctions(
-    BinaryContext &BC,
-    std::map<uint64_t, BinaryFunction> &BFs,
-    std::set<uint64_t> &LargeFunctions) {
+void StripRepRet::runOnFunctions(BinaryContext &BC) {
  uint64_t NumPrefixesRemoved = 0;
  uint64_t NumBytesSaved = 0;
-  for (auto &BFI : BFs) {
+  for (auto &BFI : BC.getBinaryFunctions()) {
    for (auto &BB : BFI.second) {
      auto LastInstRIter = BB.getLastNonPseudo();
      if (LastInstRIter == BB.rend() ||
@ -1504,17 +1672,15 @@ void StripRepRet::runOnFunctions(
  }
 }

-void InlineMemcpy::runOnFunctions(BinaryContext &BC,
-                                  std::map<uint64_t, BinaryFunction> &BFs,
-                                  std::set<uint64_t> &LargeFunctions) {
+void InlineMemcpy::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return;

  uint64_t NumInlined = 0;
  uint64_t NumInlinedDyno = 0;
-  for (auto &BFI : BFs) {
+  for (auto &BFI : BC.getBinaryFunctions()) {
    for (auto &BB : BFI.second) {
-      for(auto II = BB.begin(); II != BB.end(); ++II) {
+      for (auto II = BB.begin(); II != BB.end(); ++II) {
        auto &Inst = *II;

        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
@ -1554,5 +1720,139 @@ void InlineMemcpy::runOnFunctions(BinaryContext &BC,
  }
 }

+bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const { // True only if the base pass accepts Function and some Spec entry names it.
+  if (!BinaryFunctionPass::shouldOptimize(Function)) // The base-class filter is applied first.
+    return false;
+
+  for (auto &FunctionSpec : Spec) {
+    auto FunctionName = StringRef(FunctionSpec).split(':').first; // Text before the first ':' of the entry.
+    if (Function.hasNameRegex(FunctionName)) // Presumably a regex match against the function's names - confirm.
+      return true;
+  }
+
+  return false; // No Spec entry matched; also the result when Spec is empty.
+}
+
+std::set<size_t> // Call-site indices listed for Function by the first matching Spec entry; empty if none are listed.
+SpecializeMemcpy1::getCallSitesToOptimize(const BinaryFunction &Function) const{
+  StringRef SitesString; // Part of the matching entry after its first ':'; stays empty when nothing matches.
+  for (auto &FunctionSpec : Spec) {
+    StringRef FunctionName;
+    std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':');
+    if (Function.hasNameRegex(FunctionName))
+      break; // Keep the site list of the first entry whose name part matches.
+    SitesString = ""; // Discard the site list of a non-matching entry.
+  }
+
+  std::set<size_t> Sites;
+  SmallVector<StringRef, 4> SitesVec;
+  SitesString.split(SitesVec, ':'); // Individual sites are ':'-separated.
+  for (auto SiteString : SitesVec) {
+    if (SiteString.empty())
+      continue;
+    size_t Result;
+    if (!SiteString.getAsInteger(10, Result)) // Radix 10; presumably a false return means success, so unparsable items are skipped - confirm.
+      Sites.emplace(Result);
+  }
+
+  return Sites;
+}
+
+void SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) { // Guard selected memcpy() calls with a compare against 1 that branches to a dedicated one-byte copy block.
+  if (!BC.isX86()) // No-op on non-X86 targets.
+    return;
+
+  uint64_t NumSpecialized = 0; // Number of call sites rewritten.
+  uint64_t NumSpecializedDyno = 0; // Sum of the known execution counts of the rewritten call sites' blocks.
+  for (auto &BFI : BC.getBinaryFunctions()) {
+    auto &Function = BFI.second;
+    if (!shouldOptimize(Function))
+      continue;
+
+    auto CallsToOptimize = getCallSitesToOptimize(Function);
+    auto shouldOptimize = [&](size_t N) { // Shadows the member function: per-call-site filter; an empty set selects every site.
+      return CallsToOptimize.empty() || CallsToOptimize.count(N);
+    };
+
+    std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend()); // Snapshot of block pointers taken before blocks are split/added below.
+    size_t CallSiteID = 0; // 1-based index over the non-tail memcpy calls seen in this function.
+    for (auto *CurBB : Blocks) {
+      for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
+        auto &Inst = *II;
+
+        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
+            !Inst.getOperand(0).isExpr()) // Only calls whose single prime operand is an expression are considered.
+          continue;
+
+        const auto *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
+        if (CalleeSymbol->getName() != "memcpy" &&
+            CalleeSymbol->getName() != "memcpy@PLT")
+          continue;
+
+        if (BC.MIB->isTailCall(Inst)) // Tail calls are left alone and do not consume a call-site index.
+          continue;
+
+        ++CallSiteID;
+
+        if (!shouldOptimize(CallSiteID))
+          continue;
+
+        // Keep a copy of the call to memcpy(dest, src, size); it is moved into MemcpyBB below.
+        auto MemcpyInstr = Inst;
+
+        auto *OneByteMemcpyBB = CurBB->splitAt(II); // NOTE(review): presumably moves II and everything after it into the new block - confirm against splitAt().
+
+        BinaryBasicBlock *NextBB{nullptr};
+        if (OneByteMemcpyBB->getNumNonPseudos() > 1) { // More than one non-pseudo instruction: split once more and drop the first instruction of the tail.
+          NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin());
+          NextBB->eraseInstruction(NextBB->begin());
+        } else { // Otherwise continue at the block's successor and erase the block's first instruction.
+          NextBB = OneByteMemcpyBB->getSuccessor();
+          OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin());
+          assert(NextBB && "unexpected call to memcpy() with no return");
+        }
+
+        auto *MemcpyBB = Function.addBasicBlock(CurBB->getInputOffset()); // New block that receives the copied call.
+        auto CmpJCC = BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), // Integer argument register 2 - the size, going by the memcpy(dest, src, size) comment in this function.
+                                          1,
+                                          OneByteMemcpyBB->getLabel(),
+                                          BC.Ctx.get());
+        CurBB->addInstructions(CmpJCC);
+        CurBB->addSuccessor(MemcpyBB); // NOTE(review): only MemcpyBB is added here; confirm splitAt() already linked CurBB to OneByteMemcpyBB.
+
+        MemcpyBB->addInstruction(std::move(MemcpyInstr));
+        MemcpyBB->addSuccessor(NextBB);
+        MemcpyBB->setCFIState(NextBB->getCFIState());
+        MemcpyBB->setExecutionCount(0);
+
+        // To prevent the actual call from being moved to cold, we raise its
+        // execution count to 1 when the original block has a non-zero known count.
+        if (CurBB->getKnownExecutionCount() > 0)
+          MemcpyBB->setExecutionCount(1);
+
+        auto OneByteMemcpy = BC.MIB->createOneByteMemcpy();
+        OneByteMemcpyBB->addInstructions(OneByteMemcpy);
+
+        ++NumSpecialized;
+        NumSpecializedDyno += CurBB->getKnownExecutionCount();
+
+        CurBB = NextBB; // Continue scanning in the block that follows the rewritten call.
+
+        // Note: the loop's ++II skips NextBB's first instruction; we don't expect it to be a call to memcpy.
+        II = CurBB->begin(); // NOTE(review): if NextBB were empty, that ++II would step past end() - confirm this cannot happen.
+      }
+    }
+  }
+
+  if (NumSpecialized) { // Report only when at least one call site was rewritten.
+    outs() << "BOLT-INFO: specialized " << NumSpecialized
+           << " memcpy() call sites for size 1";
+    if (NumSpecializedDyno)
+      outs() << ". The calls were executed " << NumSpecializedDyno
+             << " times based on profile.";
+    outs() << '\n';
+  }
+}
+
 } // namespace bolt
 } // namespace llvm
--- a/src/Passes/BinaryPasses.h
+++ b/src/Passes/BinaryPasses.h
@ -16,9 +16,10 @@

 #include "BinaryContext.h"
 #include "BinaryFunction.h"
+#include "DynoStats.h"
 #include "HFSort.h"
 #include "llvm/Support/CommandLine.h"
-
+#include <atomic>
 #include <map>
 #include <set>
 #include <string>
@ -38,7 +39,7 @@ protected:

  /// Control whether a specific function should be skipped during
  /// optimization.
-  bool shouldOptimize(const BinaryFunction &BF) const;
+  virtual bool shouldOptimize(const BinaryFunction &BF) const;
 public:
  virtual ~BinaryFunctionPass() = default;

@ -53,9 +54,7 @@ public:
  virtual bool shouldPrint(const BinaryFunction &BF) const;

  /// Execute this pass on the given functions.
-  virtual void runOnFunctions(BinaryContext &BC,
-                              std::map<uint64_t, BinaryFunction> &BFs,
-                              std::set<uint64_t> &LargeFunctions) = 0;
+  virtual void runOnFunctions(BinaryContext &BC) = 0;
 };

 /// A pass to print program-wide dynostats.
@ -79,10 +78,8 @@ public:
    return false;
  }

-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override {
-    const auto NewDynoStats = getDynoStats(BFs);
+  void runOnFunctions(BinaryContext &BC) override {
+    const auto NewDynoStats = getDynoStats(BC.getBinaryFunctions());
    const auto Changed = (NewDynoStats != PrevDynoStats);
    outs() << "BOLT-INFO: program-wide dynostats "
           << Title << (Changed ? "" : " (no change)") << ":\n\n"
@ -98,9 +95,10 @@ public:
 /// Detect and eliminate unreachable basic blocks. We could have those
 /// filled with nops and they are used for alignment.
 class EliminateUnreachableBlocks : public BinaryFunctionPass {
+  std::shared_timed_mutex ModifiedMtx;
  std::unordered_set<const BinaryFunction *> Modified;
-  unsigned DeletedBlocks{0};
-  uint64_t DeletedBytes{0};
+  std::atomic<unsigned> DeletedBlocks{0};
+  std::atomic<uint64_t> DeletedBytes{0};
  void runOnFunction(BinaryFunction& Function);
 public:
  EliminateUnreachableBlocks(const cl::opt<bool> &PrintPass)
@ -112,9 +110,7 @@ class EliminateUnreachableBlocks : public BinaryFunctionPass {
  bool shouldPrint(const BinaryFunction &BF) const override {
    return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
  }
-  void runOnFunctions(BinaryContext&,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext&) override;
 };

 // Reorder the basic blocks for each function based on hotness.
@ -164,9 +160,7 @@ public:
    return "reordering";
  }
  bool shouldPrint(const BinaryFunction &BF) const override;
-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;
 };

 /// Sync local branches with CFG.
@ -178,9 +172,7 @@ class FixupBranches : public BinaryFunctionPass {
  const char *getName() const override {
    return "fix-branches";
  }
-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;
 };

 /// Fix the CFI state and exception handling information after all other
@ -193,9 +185,7 @@ class FinalizeFunctions : public BinaryFunctionPass {
  const char *getName() const override {
    return "finalize-functions";
  }
-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;
 };

 /// Convert and remove all BOLT-related annotations before LLVM code emission.
@ -207,9 +197,7 @@ class LowerAnnotations : public BinaryFunctionPass {
  const char *getName() const override {
    return "lower-annotations";
  }
-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;
 };

 /// An optimization to simplify conditional tail calls by removing
@ -281,9 +269,7 @@ class SimplifyConditionalTailCalls : public BinaryFunctionPass {
  bool shouldPrint(const BinaryFunction &BF) const override {
    return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
  }
-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;
 };

 /// Perform simple peephole optimizations.
@ -313,9 +299,7 @@ public:
  const char *getName() const override {
    return "peepholes";
  }
-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;
 };

 /// An optimization to simplify loads from read-only sections.The pass converts
@ -323,7 +307,7 @@ public:
 ///
 ///      mov 0x12f(%rip), %eax
 ///
-/// to their counterparts that use immediate opreands instead of memory loads:
+/// to their counterparts that use immediate operands instead of memory loads:
 ///
 ///     mov $0x4007dc, %eax
 ///
@ -348,9 +332,39 @@ public:
  bool shouldPrint(const BinaryFunction &BF) const override {
    return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
  }
-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;
+};
+
+/// Pass that assigns output sections to all functions.
+class AssignSections : public BinaryFunctionPass {
+ public:
+  explicit AssignSections()
+    : BinaryFunctionPass(false) { // NOTE(review): presumably the base class's PrintPass flag, hard-wired to false - confirm.
+  }
+
+  const char *getName() const override { // Name string of this pass.
+    return "assign-sections";
+  }
+  void runOnFunctions(BinaryContext &BC) override; // Only declared here; the definition is not part of this hunk.
+};
+
+/// Compute and report to the user the imbalance in flow equations for all
+/// CFGs, so we can detect poor-quality profiles. Prints average and standard
+/// deviation of the absolute differences of outgoing flow minus incoming flow
+/// for blocks of interest (excluding prologues, epilogues, and BB frequency
+/// lower than 100).
+class PrintProfileStats : public BinaryFunctionPass {
+ public:
+  explicit PrintProfileStats(const cl::opt<bool> &PrintPass)
+    : BinaryFunctionPass(PrintPass) { } // PrintPass is forwarded to the base class unchanged.
+
+  const char *getName() const override { // Name string of this pass.
+    return "profile-stats";
+  }
+  bool shouldPrint(const BinaryFunction &) const override { // Always false, whatever the function.
+    return false;
+  }
+  void runOnFunctions(BinaryContext &BC) override; // Only declared here; the definition is not part of this hunk.
 };

 /// Prints a list of the top 100 functions sorted by a set of
@ -366,9 +380,7 @@ class PrintProgramStats : public BinaryFunctionPass {
  bool shouldPrint(const BinaryFunction &) const override {
    return false;
  }
-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;
 };

 /// Pass for lowering any instructions that we have raised and that have
@ -382,9 +394,7 @@ public:
    return "inst-lowering";
  }

-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;
 };

 /// Pass for stripping 'repz' from 'repz retq' sequence of instructions.
@ -397,9 +407,7 @@ public:
    return "strip-rep-ret";
  }

-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;
 };

 /// Pass for inlining calls to memcpy using 'rep movsb' on X86.
@ -412,9 +420,30 @@ public:
    return "inline-memcpy";
  }

-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;
+};
+
+/// Pass for specializing memcpy for a size of 1 byte.
+class SpecializeMemcpy1 : public BinaryFunctionPass {
+private:
+  std::vector<std::string> Spec; // Copy of the constructor's option list; entries are parsed as "name[:site[:site...]]".
+
+  /// Return indices of the call sites to optimize. Count starts at 1.
+  /// An empty set means that all call sites in the function are optimized.
+  std::set<size_t> getCallSitesToOptimize(const BinaryFunction &) const;
+
+public:
+  explicit SpecializeMemcpy1(const cl::opt<bool> &PrintPass,
+                             cl::list<std::string> &Spec)
+    : BinaryFunctionPass(PrintPass), Spec(Spec) {} // The member Spec is initialized from the parameter of the same name.
+
+  bool shouldOptimize(const BinaryFunction &BF) const override; // Overrides the base-class per-function filter.
+
+  const char *getName() const override { // Name string of this pass.
+    return "specialize-memcpy";
+  }
+
+  void runOnFunctions(BinaryContext &BC) override;
 };

 enum FrameOptimizationType : char {
--- a/src/Passes/CMakeLists.txt
+++ b/src/Passes/CMakeLists.txt
@ -15,6 +15,7 @@ add_llvm_library(LLVMBOLTPasses
  IdenticalCodeFolding.cpp
  IndirectCallPromotion.cpp
  Inliner.cpp
+  Instrumentation.cpp
  JTFootprintReduction.cpp
  LivenessAnalysis.cpp
  LongJmp.cpp
--- a/src/Passes/CachePlusReorderAlgorithm.cpp
+++ b/src/Passes/CachePlusReorderAlgorithm.cpp
@ -23,6 +23,7 @@ using EdgeList = std::vector<std::pair<BinaryBasicBlock *, uint64_t>>;
 namespace opts {

 extern cl::OptionCategory BoltOptCategory;
+extern cl::opt<bool> NoThreads;

 cl::opt<unsigned>
 ClusterSplitThreshold("cluster-split-threshold",
@ -288,6 +289,12 @@ private:
      ExecutionCounts[BB->getLayoutIndex()] = EC;
    }

+    // Create a separate MCCodeEmitter to allow lock-free execution
+    BinaryContext::IndependentCodeEmitter Emitter;
+    if (!opts::NoThreads) {
+      Emitter = BF.getBinaryContext().createIndependentMCCodeEmitter();
+    }
+
    // Initialize clusters
    Clusters.reserve(BF.layout_size());
    AllClusters.reserve(BF.layout_size());
@ -295,7 +302,8 @@ private:
    Size.reserve(BF.layout_size());
    for (auto BB : BF.layout()) {
      size_t Index = BB->getLayoutIndex();
-      Size.push_back(std::max(BB->estimateSize(), size_t(1)));
+      Size.push_back(
+          std::max<uint64_t>(BB->estimateSize(Emitter.MCE.get()), 1));
      AllClusters.emplace_back(BB, ExecutionCounts[Index], Size[Index]);
      Clusters.push_back(&AllClusters[Index]);
      CurCluster.push_back(&AllClusters[Index]);
--- a/src/Passes/DataflowAnalysis.h
+++ b/src/Passes/DataflowAnalysis.h
@ -172,6 +172,9 @@ protected:
  /// Reference to the function being analysed
  BinaryFunction &Func;

+  /// The id of the annotation allocator to be used
+  MCPlusBuilder::AllocatorIdTy AllocatorId = 0;
+
  /// Tracks the state at basic block start (end) if direction of the dataflow
  /// is forward (backward).
  std::unordered_map<const BinaryBasicBlock *, StateTy> StateAtBBEntry;
@ -244,7 +247,7 @@ protected:

  StateTy &getOrCreateStateAt(MCInst &Point) {
    return BC.MIB->getOrCreateAnnotationAs<StateTy>(
-        Point, derived().getAnnotationIndex());
+        Point, derived().getAnnotationIndex(), AllocatorId);
  }

  StateTy &getOrCreateStateAt(ProgramPoint Point) {
@ -254,6 +257,11 @@ protected:
  }

 public:
+  /// Return the id of the annotation allocator stored in this analysis.
+  unsigned getAllocatorId() { // NOTE(review): declared unsigned, while the AllocatorId member is MCPlusBuilder::AllocatorIdTy.
+    return AllocatorId;
+  }
+
  /// If the direction of the dataflow is forward, operates on the last
  /// instruction of all predecessors when performing an iteration of the
  /// dataflow equation for the start of this BB.  If backwards, operates on
@ -267,8 +275,10 @@ public:

  /// We need the current binary context and the function that will be processed
  /// in this dataflow analysis.
-  DataflowAnalysis(const BinaryContext &BC, BinaryFunction &BF)
-      : BC(BC), Func(BF) {}
+  DataflowAnalysis(const BinaryContext &BC, BinaryFunction &BF,
+                   MCPlusBuilder::AllocatorIdTy AllocatorId = 0)
+      : BC(BC), Func(BF), AllocatorId(AllocatorId) {}
+
  virtual ~DataflowAnalysis() {
    cleanAnnotations();
  }
@ -324,15 +334,15 @@ public:
  void run() {
    derived().preflight();

-    // Initialize state for all points of the function
-    for (auto &BB : Func) {
-      auto &St = getOrCreateStateAt(BB);
-      St = derived().getStartingStateAtBB(BB);
-      for (auto &Inst : BB) {
-        auto &St = getOrCreateStateAt(Inst);
-        St = derived().getStartingStateAtPoint(Inst);
+      // Initialize state for all points of the function
+      for (auto &BB : Func) {
+        auto &St = getOrCreateStateAt(BB);
+        St = derived().getStartingStateAtBB(BB);
+        for (auto &Inst : BB) {
+          auto &St = getOrCreateStateAt(Inst);
+          St = derived().getStartingStateAtPoint(Inst);
+        }
      }
-    }
    assert(Func.begin() != Func.end() && "Unexpected empty function");

    std::queue<BinaryBasicBlock *> Worklist;
@ -545,8 +555,10 @@ public:
    return count(*Expressions[PointIdx], Expr);
  }

-  InstrsDataflowAnalysis(const BinaryContext &BC, BinaryFunction &BF)
-    : DataflowAnalysis<Derived, BitVector, Backward, StatePrinterTy>(BC, BF) {}
+  InstrsDataflowAnalysis(const BinaryContext &BC, BinaryFunction &BF,
+                         MCPlusBuilder::AllocatorIdTy AllocId = 0)
+      : DataflowAnalysis<Derived, BitVector, Backward, StatePrinterTy>(
+            BC, BF, AllocId) {}
  virtual ~InstrsDataflowAnalysis() {}
 };

--- a/src/Passes/DataflowInfoManager.cpp
+++ b/src/Passes/DataflowInfoManager.cpp
@ -19,7 +19,7 @@ ReachingDefOrUse</*Def=*/true> &DataflowInfoManager::getReachingDefs() {
  if (RD)
    return *RD;
  assert(RA && "RegAnalysis required");
-  RD.reset(new ReachingDefOrUse<true>(*RA, BC, BF));
+  RD.reset(new ReachingDefOrUse<true>(*RA, BC, BF, None, AllocatorId));
  RD->run();
  return *RD;
 }
@ -32,7 +32,7 @@ ReachingDefOrUse</*Def=*/false> &DataflowInfoManager::getReachingUses() {
  if (RU)
    return *RU;
  assert(RA && "RegAnalysis required");
-  RU.reset(new ReachingDefOrUse<false>(*RA, BC, BF));
+  RU.reset(new ReachingDefOrUse<false>(*RA, BC, BF, None, AllocatorId));
  RU->run();
  return *RU;
 }
@ -45,7 +45,7 @@ LivenessAnalysis &DataflowInfoManager::getLivenessAnalysis() {
  if (LA)
    return *LA;
  assert(RA && "RegAnalysis required");
-  LA.reset(new LivenessAnalysis(*RA, BC, BF));
+  LA.reset(new LivenessAnalysis(*RA, BC, BF, AllocatorId));
  LA->run();
  return *LA;
 }
@ -58,7 +58,7 @@ StackReachingUses &DataflowInfoManager::getStackReachingUses() {
  if (SRU)
    return *SRU;
  assert(FA && "FrameAnalysis required");
-  SRU.reset(new StackReachingUses(*FA, BC, BF));
+  SRU.reset(new StackReachingUses(*FA, BC, BF, AllocatorId));
  SRU->run();
  return *SRU;
 }
@ -70,7 +70,7 @@ void DataflowInfoManager::invalidateStackReachingUses() {
 DominatorAnalysis<false> &DataflowInfoManager::getDominatorAnalysis() {
  if (DA)
    return *DA;
-  DA.reset(new DominatorAnalysis<false>(BC, BF));
+  DA.reset(new DominatorAnalysis<false>(BC, BF, AllocatorId));
  DA->run();
  return *DA;
 }
@ -82,7 +82,7 @@ void DataflowInfoManager::invalidateDominatorAnalysis() {
 DominatorAnalysis<true> &DataflowInfoManager::getPostDominatorAnalysis() {
  if (PDA)
    return *PDA;
-  PDA.reset(new DominatorAnalysis<true>(BC, BF));
+  PDA.reset(new DominatorAnalysis<true>(BC, BF, AllocatorId));
  PDA->run();
  return *PDA;
 }
@ -94,7 +94,7 @@ void DataflowInfoManager::invalidatePostDominatorAnalysis() {
 StackPointerTracking &DataflowInfoManager::getStackPointerTracking() {
  if (SPT)
    return *SPT;
-  SPT.reset(new StackPointerTracking(BC, BF));
+  SPT.reset(new StackPointerTracking(BC, BF, AllocatorId));
  SPT->run();
  return *SPT;
 }
@ -107,7 +107,7 @@ void DataflowInfoManager::invalidateStackPointerTracking() {
 ReachingInsns<false> &DataflowInfoManager::getReachingInsns() {
  if (RI)
    return *RI;
-  RI.reset(new ReachingInsns<false>(BC, BF));
+  RI.reset(new ReachingInsns<false>(BC, BF, AllocatorId));
  RI->run();
  return *RI;
 }
@ -119,7 +119,7 @@ void DataflowInfoManager::invalidateReachingInsns() {
 ReachingInsns<true> &DataflowInfoManager::getReachingInsnsBackwards() {
  if (RIB)
    return *RIB;
-  RIB.reset(new ReachingInsns<true>(BC, BF));
+  RIB.reset(new ReachingInsns<true>(BC, BF, AllocatorId));
  RIB->run();
  return *RIB;
 }
@ -131,7 +131,8 @@ void DataflowInfoManager::invalidateReachingInsnsBackwards() {
 StackAllocationAnalysis &DataflowInfoManager::getStackAllocationAnalysis() {
  if (SAA)
    return *SAA;
-  SAA.reset(new StackAllocationAnalysis(BC, BF, getStackPointerTracking()));
+  SAA.reset(new StackAllocationAnalysis(BC, BF, getStackPointerTracking(),
+                                        AllocatorId));
  SAA->run();
  return *SAA;
 }
--- a/src/Passes/DataflowInfoManager.h
+++ b/src/Passes/DataflowInfoManager.h
@ -47,10 +47,15 @@ class DataflowInfoManager {
  std::unique_ptr<std::unordered_map<const MCInst *, BinaryBasicBlock *>>
      InsnToBB;

+  // Id of the allocator to be used for annotations added by any of the managed
+  // analyses
+  MCPlusBuilder::AllocatorIdTy AllocatorId;
+
 public:
  DataflowInfoManager(const BinaryContext &BC, BinaryFunction &BF,
-                      const RegAnalysis *RA, const FrameAnalysis *FA)
-      : RA(RA), FA(FA), BC(BC), BF(BF){};
+                      const RegAnalysis *RA, const FrameAnalysis *FA,
+                      MCPlusBuilder::AllocatorIdTy AllocId = 0)
+      : RA(RA), FA(FA), BC(BC), BF(BF), AllocatorId(AllocId){};

  /// Helper function to fetch the parent BB associated with a program point
  /// If PP is a BB itself, then return itself (cast to a BinaryBasicBlock)
--- a/src/Passes/DominatorAnalysis.h
+++ b/src/Passes/DominatorAnalysis.h
@ -35,34 +35,35 @@ class DominatorAnalysis
                                Backward>;

 public:
-  DominatorAnalysis(const BinaryContext &BC, BinaryFunction &BF)
-      : InstrsDataflowAnalysis<DominatorAnalysis<Backward>, Backward>(BC, BF) {}
+  DominatorAnalysis(const BinaryContext &BC, BinaryFunction &BF,
+                    MCPlusBuilder::AllocatorIdTy AllocId)
+      : InstrsDataflowAnalysis<DominatorAnalysis<Backward>, Backward>(BC, BF,
+                                                                      AllocId) {
+  }
  virtual ~DominatorAnalysis() {}

-  SmallVector<ProgramPoint, 4> getDominanceFrontierFor(const MCInst &Dom) {
-    SmallVector<ProgramPoint, 4> Result;
+  SmallSetVector<ProgramPoint, 4> getDominanceFrontierFor(const MCInst &Dom) {
+    SmallSetVector<ProgramPoint, 4> Result;
    auto DomIdx = this->ExprToIdx[&Dom];
    assert(!Backward && "Post-dom frontier not implemented");
    for (auto &BB : this->Func) {
      bool HasDominatedPred = false;
      bool HasNonDominatedPred = false;
-      SmallVector<ProgramPoint, 4> Candidates;
+      SmallSetVector<ProgramPoint, 4> Candidates;
      this->doForAllSuccsOrPreds(BB, [&](ProgramPoint P) {
        if ((*this->getStateAt(P))[DomIdx]) {
-          Candidates.emplace_back(P);
+          Candidates.insert(P);
          HasDominatedPred = true;
          return;
        }
        HasNonDominatedPred = true;
      });
      if (HasDominatedPred && HasNonDominatedPred)
-        Result.append(Candidates.begin(), Candidates.end());
+        Result.insert(Candidates.begin(), Candidates.end());
      if ((*this->getStateAt(ProgramPoint::getLastPointAt(BB)))[DomIdx] &&
          BB.succ_begin() == BB.succ_end())
-        Result.emplace_back(ProgramPoint::getLastPointAt(BB));
+        Result.insert(ProgramPoint::getLastPointAt(BB));
    }
-    std::sort(Result.begin(), Result.end());
-    Result.erase(std::unique(Result.begin(), Result.end()), Result.end());
    return Result;
  }

@ -104,8 +105,6 @@ public:
  }

  void run() {
-    NamedRegionTimer T1("DA", "Dominator Analysis", "Dataflow", "Dataflow",
-                        opts::TimeOpts);
    InstrsDataflowAnalysis<DominatorAnalysis<Backward>, Backward>::run();
  }

--- a/src/Passes/FrameAnalysis.cpp
+++ b/src/Passes/FrameAnalysis.cpp
@ -10,6 +10,8 @@
 //===----------------------------------------------------------------------===//
 #include "FrameAnalysis.h"
 #include "CallGraphWalker.h"
+#include "ParallelUtilities.h"
+#include "llvm/Support/ThreadPool.h"
 #include <fstream>

 #define DEBUG_TYPE "fa"
@ -17,8 +19,9 @@
 using namespace llvm;

 namespace opts {
-extern cl::opt<bool> TimeOpts;
+extern cl::OptionCategory BoltOptCategory;
 extern cl::opt<unsigned> Verbosity;
+
 extern bool shouldProcess(const bolt::BinaryFunction &Function);

 static cl::list<std::string>
@ -30,7 +33,17 @@ static cl::opt<std::string> FrameOptFunctionNamesFile(
    "funcs-file-fop",
    cl::desc("file with list of functions to frame optimize"));

+static cl::opt<bool>
+TimeFA("time-fa",
+  cl::desc("time frame analysis steps"),
+  cl::ReallyHidden,
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
 bool shouldFrameOptimize(const llvm::bolt::BinaryFunction &Function) {
+  if (Function.hasUnknownControlFlow())
+    return false;
+
  if (!FrameOptFunctionNamesFile.empty()) {
    assert(!FrameOptFunctionNamesFile.empty() && "unexpected empty file name");
    std::ifstream FuncsFile(FrameOptFunctionNamesFile, std::ios::in);
@ -85,7 +98,8 @@ namespace {
 class FrameAccessAnalysis {
  /// We depend on Stack Pointer Tracking to figure out the current SP offset
  /// value at a given program point
-  StackPointerTracking SPT;
+  StackPointerTracking &SPT;
+
  /// Context vars
  const BinaryContext &BC;
  const BinaryFunction &BF;
@ -150,14 +164,9 @@ class FrameAccessAnalysis {
  }

 public:
-  FrameAccessAnalysis(const BinaryContext &BC, BinaryFunction &BF)
-      : SPT(BC, BF), BC(BC), BF(BF) {
-    {
-      NamedRegionTimer T1("SPT", "Stack Pointer Tracking", "Dataflow",
-                          "Dataflow", opts::TimeOpts);
-      SPT.run();
-    }
-  }
+  FrameAccessAnalysis(const BinaryContext &BC, BinaryFunction &BF,
+                      StackPointerTracking &SPT)
+      : SPT(SPT), BC(BC), BF(BF) {}

  void enterNewBB() { Prev = nullptr; }
  const FrameIndexEntry &getFIE() const { return FIE; }
@ -393,7 +402,7 @@ bool FrameAnalysis::computeArgsAccessed(BinaryFunction &BF) {
               << "\n");
  bool UpdatedArgsTouched = false;
  bool NoInfo = false;
-  FrameAccessAnalysis FAA(BC, BF);
+  FrameAccessAnalysis FAA(BC, BF, getSPT(BF));

  for (auto BB : BF.layout()) {
    FAA.enterNewBB();
@ -452,7 +461,7 @@ bool FrameAnalysis::computeArgsAccessed(BinaryFunction &BF) {
 }

 bool FrameAnalysis::restoreFrameIndex(BinaryFunction &BF) {
-  FrameAccessAnalysis FAA(BC, BF);
+  FrameAccessAnalysis FAA(BC, BF, getSPT(BF));

  DEBUG(dbgs() << "Restoring frame indices for \"" << BF.getPrintName()
               << "\"\n");
@ -485,27 +494,42 @@ bool FrameAnalysis::restoreFrameIndex(BinaryFunction &BF) {
 }

 void FrameAnalysis::cleanAnnotations() {
-  for (auto &I : BFs) {
-    for (auto &BB : I.second) {
+  NamedRegionTimer T("cleanannotations", "clean annotations", "FA",
+                     "FA breakdown", opts::TimeFA);
+
+  ParallelUtilities::WorkFuncTy CleanFunction = [&](BinaryFunction &BF) {
+    for (auto &BB : BF) {
      for (auto &Inst : BB) {
        BC.MIB->removeAnnotation(Inst, "ArgAccessEntry");
        BC.MIB->removeAnnotation(Inst, "FrameAccessEntry");
      }
    }
-  }
+  };
+
+  ParallelUtilities::runOnEachFunction(
+      BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, CleanFunction,
+      ParallelUtilities::PredicateTy(nullptr), "cleanAnnotations");
 }

-FrameAnalysis::FrameAnalysis(BinaryContext &BC,
-                             std::map<uint64_t, BinaryFunction> &BFs,
-                             BinaryFunctionCallGraph &CG)
-    : BC(BC), BFs(BFs) {
+FrameAnalysis::FrameAnalysis(BinaryContext &BC, BinaryFunctionCallGraph &CG)
+    : BC(BC) {
  // Position 0 of the vector should be always associated with "assume access
  // everything".
  ArgAccessesVector.emplace_back(ArgAccesses(/*AssumeEverything*/ true));

-  traverseCG(CG);
+  if (!opts::NoThreads) {
+    NamedRegionTimer T1("precomputespt", "pre-compute spt", "FA",
+                        "FA breakdown", opts::TimeFA);
+    preComputeSPT();
+  }

-  for (auto &I : BFs) {
+  {
+    NamedRegionTimer T1("traversecg", "traverse call graph", "FA",
+                        "FA breakdown", opts::TimeFA);
+    traverseCG(CG);
+  }
+
+  for (auto &I : BC.getBinaryFunctions()) {
    auto Count = I.second.getExecutionCount();
    if (Count != BinaryFunction::COUNT_NO_PROFILE)
      CountDenominator += Count;
@ -521,8 +545,8 @@ FrameAnalysis::FrameAnalysis(BinaryContext &BC,
    }

    {
-      NamedRegionTimer T1("restorefi", "restore frame index", "FOP",
-                          "FOP breakdown", opts::TimeOpts);
+      NamedRegionTimer T1("restorefi", "restore frame index", "FA",
+                          "FA breakdown", opts::TimeFA);
      if (!restoreFrameIndex(I.second)) {
        ++NumFunctionsFailedRestoreFI;
        auto Count = I.second.getExecutionCount();
@ -533,6 +557,18 @@ FrameAnalysis::FrameAnalysis(BinaryContext &BC,
    }
    AnalyzedFunctions.insert(&I.second);
  }
+
+  {
+    NamedRegionTimer T1("clearspt", "clear spt", "FA", "FA breakdown",
+                        opts::TimeFA);
+    clearSPTMap();
+
+    // Clean up memory allocated for annotation values
+    if (!opts::NoThreads) {
+      for (auto Id : SPTAllocatorsId)
+        BC.MIB->freeValuesAllocator(Id);
+    }
+  }
 }

 void FrameAnalysis::printStats() {
@ -548,5 +584,60 @@ void FrameAnalysis::printStats() {
         << " could not have its frame indices restored.\n";
 }

+/// Release every cached StackPointerTracking object and empty SPTMap.
+///
+/// When opts::NoThreads is set the map is simply cleared. Otherwise the
+/// per-function objects are first reset through
+/// ParallelUtilities::runOnEachFunction, and the (now empty) entries are
+/// dropped afterwards.
+void FrameAnalysis::clearSPTMap() {
+  if (opts::NoThreads) {
+    SPTMap.clear();
+    return;
+  }
+
+  ParallelUtilities::WorkFuncTy ClearFunctionSPT = [&](BinaryFunction &BF) {
+    // Look the entry up without inserting. A function that passes the
+    // predicate below but has no entry must be skipped: dereferencing the
+    // end() iterator would be undefined behavior.
+    auto Iter = SPTMap.find(&BF);
+    if (Iter != SPTMap.end())
+      Iter->second.reset();
+  };
+
+  // Same filter that preComputeSPT() uses when it creates the map entries
+  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
+    return !BF.isSimple() || !BF.hasCFG();
+  };
+
+  ParallelUtilities::runOnEachFunction(
+      BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, ClearFunctionSPT,
+      SkipFunc, "clearSPTMap");
+
+  SPTMap.clear();
+}
+
+/// Run StackPointerTracking ahead of time for every function that is simple
+/// and has a CFG, storing each result in SPTMap.
+void FrameAnalysis::preComputeSPT() {
+  // Pre-computation must start from an empty map
+  assert(SPTMap.empty());
+
+  // Insert an empty slot per eligible function up front, so the workers below
+  // only assign to existing entries and never insert into the map
+  for (auto &Entry : BC.getBinaryFunctions()) {
+    auto &Func = Entry.second;
+    if (Func.isSimple() && Func.hasCFG())
+      SPTMap.emplace(&Func, std::unique_ptr<StackPointerTracking>());
+  }
+
+  // Likewise create the SPT annotation index before the parallel phase to
+  // allow lock-free parallel execution
+  BC.MIB->getOrCreateAnnotationIndex("StackPointerTracking");
+
+  // Worker: build the analysis into the pre-created slot, then run it
+  ParallelUtilities::WorkFuncWithAllocTy ProcessFunction =
+      [&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId) {
+        auto &Slot = SPTMap.find(&BF)->second;
+        Slot = std::make_unique<StackPointerTracking>(BC, BF, AllocId);
+        Slot->run();
+      };
+
+  // Skip exactly the functions that received no slot above
+  ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
+    return !(BF.isSimple() && BF.hasCFG());
+  };
+
+  ParallelUtilities::runOnEachFunctionWithUniqueAllocId(
+      BC, ParallelUtilities::SchedulingPolicy::SP_BB_QUADRATIC, ProcessFunction,
+      SkipPredicate, "preComputeSPT");
+}
+
 } // namespace bolt
 } // namespace llvm
--- a/src/Passes/FrameAnalysis.h
+++ b/src/Passes/FrameAnalysis.h
@ -93,7 +93,7 @@ raw_ostream &operator<<(raw_ostream &OS,
 /// Initialization:
 ///
 ///   FrameAnalysis FA(PrintPass);
-///   FA.runOnFunctions(BC, BFs, LargeFunctions);
+///   FA.runOnFunctions(BC);
 ///
 /// Usage (fetching frame access information about a given instruction):
 ///
@ -113,7 +113,6 @@ raw_ostream &operator<<(raw_ostream &OS,
 ///
 class FrameAnalysis {
  BinaryContext &BC;
-  std::map<uint64_t, BinaryFunction> &BFs;

  /// Map functions to the set of <stack offsets, size> tuples representing
  /// accesses to stack positions that belongs to caller
@ -168,9 +167,17 @@ class FrameAnalysis {
  /// to analyze and this information can't be safely determined for \p BF.
  bool restoreFrameIndex(BinaryFunction &BF);

+  /// A store for SPT info per function
+  std::unordered_map<const BinaryFunction *,
+                     std::unique_ptr<StackPointerTracking>>
+      SPTMap;
+
+  /// A vector that stores ids of the allocators that are used in SPT
+  /// computation
+  std::vector<MCPlusBuilder::AllocatorIdTy> SPTAllocatorsId;
+
 public:
  explicit FrameAnalysis(BinaryContext &BC,
-                         std::map<uint64_t, BinaryFunction> &BFs,
                         BinaryFunctionCallGraph &CG);

  /// Return true if we could fully analyze \p Func
@ -197,10 +204,30 @@ public:
    cleanAnnotations();
  }

-
  /// Print to standard output statistics about the analysis performed by this
  /// pass
  void printStats();
+
+  /// Return the StackPointerTracking analysis for \p BF.
+  ///
+  /// If no analysis object is stored for \p BF yet (no map entry, or an entry
+  /// holding an empty pointer), one is created, run, and cached in SPTMap
+  /// before being returned.
+  StackPointerTracking &getSPT(BinaryFunction &BF) {
+    // Single lookup: operator[] returns the existing slot or inserts an empty
+    // one. preComputeSPT() also inserts empty placeholder slots, so an empty
+    // pointer is treated the same as a missing entry.
+    auto &SPTPtr = SPTMap[&BF];
+    if (!SPTPtr) {
+      SPTPtr = std::make_unique<StackPointerTracking>(BC, BF);
+      SPTPtr->run();
+    }
+    return *SPTPtr;
+  }
+
+  /// Clean and de-allocate all SPT objects
+  void clearSPTMap();
+
+  /// Perform SPT analysis for all functions in parallel
+  void preComputeSPT();
 };

 } // namespace bolt
--- a/src/Passes/FrameOptimizer.cpp
+++ b/src/Passes/FrameOptimizer.cpp
@ -10,6 +10,7 @@
 //===----------------------------------------------------------------------===//

 #include "FrameOptimizer.h"
+#include "ParallelUtilities.h"
 #include "ShrinkWrapping.h"
 #include "StackAvailableExpressions.h"
 #include "StackReachingUses.h"
@ -47,7 +48,6 @@ RemoveStores("frame-opt-rm-stores",
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

-
 } // namespace opts

 namespace llvm {
@ -221,21 +221,36 @@ void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA,
  }
 }

-void FrameOptimizerPass::runOnFunctions(BinaryContext &BC,
-                                        std::map<uint64_t, BinaryFunction> &BFs,
-                                        std::set<uint64_t> &LargeFunctions) {
+void FrameOptimizerPass::runOnFunctions(BinaryContext &BC) {
  if (opts::FrameOptimization == FOP_NONE)
    return;

-  // Run FrameAnalysis pass
-  BinaryFunctionCallGraph CG = buildCallGraph(BC, BFs);
-  FrameAnalysis FA(BC, BFs, CG);
-  RegAnalysis RA(BC, &BFs, &CG);
+  std::unique_ptr<BinaryFunctionCallGraph> CG;
+  std::unique_ptr<FrameAnalysis> FA;
+  std::unique_ptr<RegAnalysis> RA;

-  // Our main loop: perform caller-saved register optimizations, then
-  // callee-saved register optimizations (shrink wrapping).
-  for (auto &I : BFs) {
-    if (!FA.hasFrameInfo(I.second))
+  {
+    NamedRegionTimer T1("callgraph", "create call graph", "FOP",
+                        "FOP breakdown", opts::TimeOpts);
+    CG = std::make_unique<BinaryFunctionCallGraph>(buildCallGraph(BC));
+  }
+
+  {
+    NamedRegionTimer T1("frameanalysis", "frame analysis", "FOP",
+                        "FOP breakdown", opts::TimeOpts);
+    FA = std::make_unique<FrameAnalysis>(BC, *CG);
+  }
+
+  {
+    NamedRegionTimer T1("reganalysis", "reg analysis", "FOP",
+                        "FOP breakdown", opts::TimeOpts);
+    RA = std::make_unique<RegAnalysis>(BC, &BC.getBinaryFunctions(), CG.get());
+  }
+
+  // Perform caller-saved register optimizations, then callee-saved register
+  // optimizations (shrink wrapping)
+  for (auto &I : BC.getBinaryFunctions()) {
+    if (!FA->hasFrameInfo(I.second))
      continue;
    // Restrict pass execution if user asked to only run on hot functions
    if (opts::FrameOptimization == FOP_HOT) {
@ -247,27 +262,28 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC,
                   << " ) exceeds our hotness threshold ( "
                   << BC.getHotThreshold() << " )\n");
    }
+
    {
      NamedRegionTimer T1("removeloads", "remove loads", "FOP", "FOP breakdown",
                          opts::TimeOpts);
-      removeUnnecessaryLoads(RA, FA, BC, I.second);
+      removeUnnecessaryLoads(*RA, *FA, BC, I.second);
    }
+
    if (opts::RemoveStores) {
      NamedRegionTimer T1("removestores", "remove stores", "FOP",
                          "FOP breakdown", opts::TimeOpts);
-      removeUnusedStores(FA, BC, I.second);
+      removeUnusedStores(*FA, BC, I.second);
    }
    // Don't even start shrink wrapping if no profiling info is available
    if (I.second.getKnownExecutionCount() == 0)
      continue;
-    {
-      NamedRegionTimer T1("movespills", "move spills", "FOP", "FOP breakdown",
-                          opts::TimeOpts);
-      DataflowInfoManager Info(BC, I.second, &RA, &FA);
-      ShrinkWrapping SW(FA, BC, I.second, Info);
-      if (SW.perform())
-        FuncsChanged.insert(&I.second);
-    }
+
+  }
+
+  {
+    NamedRegionTimer T1("shrinkwrapping", "shrink wrapping", "FOP",
+                        "FOP breakdown", opts::TimeOpts);
+    performShrinkWrapping(*RA, *FA, BC);
  }

  outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads
@ -278,9 +294,67 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC,
         << NumLoadsChangedToImm << " to use an immediate.\n"
         << "BOLT-INFO: FOP deleted " << NumLoadsDeleted << " load(s) and "
         << NumRedundantStores << " store(s).\n";
-  FA.printStats();
+  FA->printStats();
  ShrinkWrapping::printStats();
 }

+/// Apply shrink wrapping to every eligible function through
+/// ParallelUtilities::runOnEachFunctionWithUniqueAllocId. Functions for which
+/// ShrinkWrapping::perform() returns true are recorded in FuncsChanged.
+void FrameOptimizerPass::performShrinkWrapping(const RegAnalysis &RA,
+                                               const FrameAnalysis &FA,
+                                               BinaryContext &BC) {
+  // Create all annotation indices before the workers start, to allow safe
+  // parallel accesses to the annotation index in MIB
+  BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getSaveTagName());
+  BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getRestoreTagName());
+  BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getTodoTagName());
+  BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getSlotTagName());
+  BC.MIB->getOrCreateAnnotationIndex(
+      StackLayoutModifier::getOffsetCFIRegTagName());
+
+  // Remaining annotation names, registered in this exact order
+  const char *const AnnotationNames[] = {
+      "ReachingDefs",
+      "ReachingUses",
+      "LivenessAnalysis",
+      "StackReachingUses",
+      "PostDominatorAnalysis",
+      "DominatorAnalysis",
+      "StackPointerTracking",
+      "StackPointerTrackingForInternalCalls",
+      "StackAvailableExpressions",
+      "StackAllocationAnalysis",
+      "ShrinkWrap-Todo",
+      "PredictiveStackPointerTracking",
+      "ReachingInsnsBackward",
+      "ReachingInsns",
+      "AccessesDeletedPos",
+      "DeleteMe"};
+  for (const char *AnnotationName : AnnotationNames)
+    BC.MIB->getOrCreateAnnotationIndex(AnnotationName);
+
+  // Skip functions without frame info, functions below the hot threshold when
+  // only hot functions were requested, and functions with a zero known
+  // execution count
+  ParallelUtilities::PredicateTy ShouldSkip = [&](const BinaryFunction &BF) {
+    return !FA.hasFrameInfo(BF) ||
+           (opts::FrameOptimization == FOP_HOT &&
+            BF.getKnownExecutionCount() < BC.getHotThreshold()) ||
+           BF.getKnownExecutionCount() == 0;
+  };
+
+  ParallelUtilities::WorkFuncWithAllocTy ShrinkWrapFunction =
+      [&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocatorId) {
+        DataflowInfoManager DataflowInfo(BC, BF, &RA, &FA, AllocatorId);
+        ShrinkWrapping Wrapper(FA, BC, BF, DataflowInfo, AllocatorId);
+
+        if (!Wrapper.perform())
+          return;
+
+        // FuncsChanged is shared between workers; guard the insertion
+        std::lock_guard<std::mutex> Guard(FuncsChangedMutex);
+        FuncsChanged.insert(&BF);
+      };
+
+  ParallelUtilities::runOnEachFunctionWithUniqueAllocId(
+      BC, ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC,
+      ShrinkWrapFunction, ShouldSkip, "shrink-wrapping");
+}
+
 } // namespace bolt
 } // namespace llvm
--- a/src/Passes/FrameOptimizer.h
+++ b/src/Passes/FrameOptimizer.h
@ -86,6 +86,8 @@ class FrameOptimizerPass : public BinaryFunctionPass {

  DenseSet<const BinaryFunction *> FuncsChanged;

+  std::mutex FuncsChangedMutex;
+
  /// Perform a dataflow analysis in \p BF to reveal unnecessary reloads from
  /// the frame. Use the analysis to convert memory loads to register moves or
  /// immediate loads. Delete redundant register moves.
@ -99,6 +101,10 @@ class FrameOptimizerPass : public BinaryFunctionPass {
                          const BinaryContext &BC,
                          BinaryFunction &BF);

+  /// Perform shrinkwrapping step
+  void performShrinkWrapping(const RegAnalysis &RA, const FrameAnalysis &FA,
+                             BinaryContext &BC);
+
 public:
  explicit FrameOptimizerPass(const cl::opt<bool> &PrintPass)
      : BinaryFunctionPass(PrintPass) {}
@ -108,9 +114,7 @@ public:
  }

  /// Pass entry point
-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;

  bool shouldPrint(const BinaryFunction &BF) const override {
    return BinaryFunctionPass::shouldPrint(BF) && FuncsChanged.count(&BF) > 0;
--- a/src/Passes/HFSortPlus.cpp
+++ b/src/Passes/HFSortPlus.cpp
@ -11,6 +11,7 @@

 #include "BinaryFunction.h"
 #include "HFSort.h"
+#include "ParallelUtilities.h"
 #include "ReorderUtils.h"
 #include "llvm/Support/Options.h"

@ -319,50 +320,115 @@ public:
  /// Merge pairs of clusters while there is an improvement in the
  /// expected cache miss ratio
  void runPassTwo() {
-    while (Clusters.size() > 1) {
-      Cluster *BestClusterPred = nullptr;
-      Cluster *BestClusterSucc = nullptr;
-      double BestGain = -1;
-      for (auto ClusterPred : Clusters) {
-        // get candidates for merging with the current cluster
-        Adjacent.forAllAdjacent(
-          ClusterPred,
-          // find the best candidate
-          [&](Cluster *ClusterSucc) {
-            assert(ClusterPred != ClusterSucc && "loop edges are not supported");
-            // compute the gain of merging two clusters
-            const double Gain = mergeGain(ClusterPred, ClusterSucc);
+    // BucketsCount is hard-coded to make the algorithm determinestic regardless
+    // of the number of threads
+    const unsigned BucketsCount = 124;
+    unsigned IterationCount = 0;

-            // breaking ties by density to make the hottest clusters be merged first
-            if (Gain > BestGain || (std::abs(Gain - BestGain) < 1e-8 &&
-                                    compareClusterPairs(ClusterPred,
-                                                        ClusterSucc,
-                                                        BestClusterPred,
-                                                        BestClusterSucc))) {
-              BestGain = Gain;
-              BestClusterPred = ClusterPred;
-              BestClusterSucc = ClusterSucc;
-            }
-          });
+    llvm::ThreadPool *Pool;
+    if (!opts::NoThreads)
+      Pool = &ParallelUtilities::getThreadPool();
+
+    while (Clusters.size() > 1) {
+      MergeCandidateEntry GlobalMaximum;
+      std::vector<MergeCandidateEntry> LocalMaximums(BucketsCount);
+
+      // Compare two candidates with a given gain
+      auto compareCandidates = [](const MergeCandidateEntry &CandidateA,
+                                  const MergeCandidateEntry &CandidateB) {
+        // breaking ties by density to make the hottest clusters be
+        // merged first
+        return CandidateA.Gain > CandidateB.Gain ||
+               (std::abs(CandidateA.Gain - CandidateB.Gain) < 1e-8 &&
+                compareClusterPairs(
+                    CandidateA.ClusterPred, CandidateA.ClusterSucc,
+                    CandidateB.ClusterPred, CandidateB.ClusterSucc));
+      };
+
+      // find the best candidates to merge within a bucket range
+      auto findMaximaInBucket = [&](const unsigned Start, const unsigned End,
+                                    const unsigned BucketId) {
+        auto &LocalMaximum = LocalMaximums[BucketId];
+
+        for (unsigned Idx = Start; Idx < End; Idx++) {
+          if (Idx >= Clusters.size())
+            return;
+
+          auto ClusterPred = Clusters[Idx];
+
+          // get best candidates to merge with the current cluster
+          Adjacent.forAllAdjacent(
+              ClusterPred,
+              // find the best candidate
+              [&](Cluster *ClusterSucc) {
+                assert(ClusterPred != ClusterSucc &&
+                       "loop edges are not supported");
+
+                // compute the gain of merging two clusters
+                const double Gain = mergeGain(ClusterPred, ClusterSucc);
+
+                // create a new candidate
+                MergeCandidateEntry Candidate;
+                Candidate.Gain = Gain;
+                Candidate.ClusterPred = ClusterPred;
+                Candidate.ClusterSucc = ClusterSucc;
+
+                if (compareCandidates(Candidate, LocalMaximum))
+                  LocalMaximum = Candidate;
+              });
+        }
+      };
+
+      unsigned BucketSize = Clusters.size() / BucketsCount;
+      if (Clusters.size() % BucketsCount)
+        BucketSize++;
+
+      // find the best candidate within each bucket
+      unsigned BucketId = 0;
+      for (unsigned ClusterIdx = 0; ClusterIdx < Clusters.size();
+           ClusterIdx += BucketSize, BucketId++) {
+
+        if (opts::NoThreads) {
+          findMaximaInBucket(ClusterIdx, ClusterIdx + BucketSize, BucketId);
+        } else {
+          Pool->async(findMaximaInBucket, ClusterIdx, ClusterIdx + BucketSize,
+                      BucketId);
+        }
      }

-      // stop merging when there is no improvement
-      if (BestGain <= 0.0)
+      if (!opts::NoThreads)
+        Pool->wait();
+
+      // find glabal maximum
+      for (auto &LocalMaximum : LocalMaximums) {
+        if (LocalMaximum.Gain > 0 &&
+            compareCandidates(LocalMaximum, GlobalMaximum))
+          GlobalMaximum = LocalMaximum;
+      }
+
+      if (GlobalMaximum.Gain <= 0.0)
        break;

-      // merge the best pair of clusters
-      mergeClusters(BestClusterPred, BestClusterSucc);
+      DEBUG(outs() << "merging##" << GlobalMaximum.ClusterPred->id() << "##"
+                   << GlobalMaximum.ClusterSucc->id() << "@@"
+                   << GlobalMaximum.Gain << "\n");
+
+      mergeClusters(GlobalMaximum.ClusterPred, GlobalMaximum.ClusterSucc);
    }
+
+    DEBUG(outs() << "BOLT-INFO: hfsort+ pass two finished in" << IterationCount
+                 << " iterations.");
  }

  /// Run hfsort+ algorithm and return ordered set of function clusters.
  std::vector<Cluster> run() {
    DEBUG(dbgs() << "Starting hfsort+ w/"
-                 << (UseGainCache ? "gain cache" : "no cache")
-                 << " for " << Clusters.size() << " clusters "
+                 << (UseGainCache ? "gain cache" : "no cache") << " for "
+                 << Clusters.size() << " clusters "
                 << "with ITLBPageSize = " << ITLBPageSize << ", "
                 << "ITLBEntries = " << ITLBEntries << ", "
-                 << "and MergeProbability = " << opts::MergeProbability << "\n");
+                 << "and MergeProbability = " << opts::MergeProbability
+                 << "\n");

    // Pass 1
    runPassOne();
@ -370,7 +436,8 @@ public:
    // Pass 2
    runPassTwo();

-    DEBUG(dbgs() << "Completed hfsort+ with " << Clusters.size() << " clusters\n");
+    DEBUG(dbgs() << "Completed hfsort+ with " << Clusters.size()
+                 << " clusters\n");

    // Sorting clusters by density in decreasing order
    std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);
@ -418,6 +485,13 @@ public:
  }

 private:
+  /// A candidate pair of clusters to merge, together with the gain computed
+  /// for that merge. A default-constructed entry has a negative gain and null
+  /// clusters.
+  struct MergeCandidateEntry {
+    double Gain = -1;
+    Cluster *ClusterPred = nullptr;
+    Cluster *ClusterSucc = nullptr;
+  };
+
  /// Initialize the set of active clusters, function id to cluster mapping,
  /// total number of samples and function addresses.
  std::vector<Cluster *> initializeClusters() {
@ -502,7 +576,7 @@ private:
  // when a pair of clusters (x,y) gets merged, we need to invalidate the pairs
  // containing both x and y and all clusters adjacent to x and y (and recompute
  // them on the next iteration).
-  mutable ClusterPairCache<Cluster, double> GainCache;
+  mutable ClusterPairCacheThreadSafe<Cluster, double> GainCache;
 };

 } // end namespace anonymous
--- a/src/Passes/IdenticalCodeFolding.cpp
+++ b/src/Passes/IdenticalCodeFolding.cpp
@ -9,9 +9,12 @@
 //
 //===----------------------------------------------------------------------===//

-
 #include "Passes/IdenticalCodeFolding.h"
+#include "ParallelUtilities.h"
 #include "llvm/Support/Options.h"
+#include "llvm/Support/ThreadPool.h"
+#include "llvm/Support/Timer.h"
+#include <atomic>
 #include <map>
 #include <set>
 #include <unordered_map>
@ -32,6 +35,12 @@ UseDFS("icf-dfs",
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

+static cl::opt<bool>
+TimeICF("time-icf",
+  cl::desc("time icf steps"),
+  cl::ReallyHidden,
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
 } // namespace opts

 namespace {
@ -276,72 +285,108 @@ bool isIdenticalWith(const BinaryFunction &A, const BinaryFunction &B,

  return true;
 }
-}
+
+// Hash functor for the hash tables that identify identical functions: they
+// map a function to a bucket of functions identical to it. The functor
+// returns the function's hash without asking for it to be recomputed.
+struct KeyHash {
+  std::size_t operator()(const BinaryFunction *Func) const {
+    const bool RecomputeHash = false;
+    return Func->hash(RecomputeHash);
+  }
+};
+
+// Equality functor: two functions compare equal when they are the same object
+// or when isIdenticalWith() matches them with symbols ignored.
+struct KeyCongruent {
+  bool operator()(const BinaryFunction *LHS, const BinaryFunction *RHS) const {
+    return LHS == RHS ||
+           isIdenticalWith(*LHS, *RHS, /*IgnoreSymbols=*/true, opts::UseDFS);
+  }
+};
+
+// Equality functor: two functions compare equal when they are the same object
+// or when isIdenticalWith() matches them with symbols taken into account.
+struct KeyEqual {
+  bool operator()(const BinaryFunction *LHS, const BinaryFunction *RHS) const {
+    return LHS == RHS ||
+           isIdenticalWith(*LHS, *RHS, /*IgnoreSymbols=*/false, opts::UseDFS);
+  }
+};
+
+// Function -> set of functions that compare equal to it under KeyCongruent.
+using CongruentBucketsMap =
+    std::unordered_map<BinaryFunction *, std::set<BinaryFunction *>, KeyHash,
+                       KeyCongruent>;
+
+// Function -> list of functions that compare equal to it under KeyEqual.
+using IdenticalBucketsMap =
+    std::unordered_map<BinaryFunction *, std::vector<BinaryFunction *>,
+                       KeyHash, KeyEqual>;
+
+} // namespace

 namespace llvm {
 namespace bolt {

-void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC,
-                                        std::map<uint64_t, BinaryFunction> &BFs,
-                                        std::set<uint64_t> &) {
-  const auto OriginalFunctionCount = BFs.size();
-  uint64_t NumFunctionsFolded = 0;
-  uint64_t NumJTFunctionsFolded = 0;
-  uint64_t BytesSavedEstimate = 0;
-  uint64_t CallsSavedEstimate = 0;
+void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) {
+  const auto OriginalFunctionCount = BC.getBinaryFunctions().size();
+  uint64_t NumFunctionsFolded{0};
+  std::atomic<uint64_t> NumJTFunctionsFolded{0};
+  std::atomic<uint64_t> BytesSavedEstimate{0};
+  std::atomic<uint64_t> CallsSavedEstimate{0};
+  std::atomic<uint64_t> NumFoldedLastIteration{0};
+  CongruentBucketsMap CongruentBuckets;

-  // This hash table is used to identify identical functions. It maps
-  // a function to a bucket of functions identical to it.
-  struct KeyHash {
-    std::size_t operator()(const BinaryFunction *F) const {
-      return F->hash(/*Recompute=*/false);
-    }
-  };
-  struct KeyCongruent {
-    bool operator()(const BinaryFunction *A, const BinaryFunction *B) const {
-      return isIdenticalWith(*A, *B, /*IgnoreSymbols=*/true, opts::UseDFS);
-    }
-  };
-  struct KeyEqual {
-    bool operator()(const BinaryFunction *A, const BinaryFunction *B) const {
-      return isIdenticalWith(*A, *B, /*IgnoreSymbols=*/false, opts::UseDFS);
-    }
+  // Hash all the functions
+  auto hashFunctions = [&]() {
+    NamedRegionTimer HashFunctionsTimer("hashing", "hashing", "ICF breakdown",
+                                        "ICF breakdown", opts::TimeICF);
+    ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
+      // Make sure indices are in-order.
+      BF.updateLayoutIndices();
+
+      // Pre-compute hash before pushing into hashtable.
+      BF.hash(/*Recompute=*/true, opts::UseDFS);
+    };
+
+    ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
+      return !shouldOptimize(BF);
+    };
+
+    ParallelUtilities::runOnEachFunction(
+        BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun, SkipFunc,
+        "hashFunctions", /*ForceSequential*/ false, 2);
  };

-  // Create buckets with congruent functions - functions that potentially could
-  // be folded.
-  std::unordered_map<BinaryFunction *, std::set<BinaryFunction *>,
-                     KeyHash, KeyCongruent> CongruentBuckets;
-  for (auto &BFI : BFs) {
-    auto &BF = BFI.second;
-    if (!shouldOptimize(BF) || BF.isFolded())
-      continue;
-
-    // Make sure indices are in-order.
-    BF.updateLayoutIndices();
-
-    // Pre-compute hash before pushing into hashtable.
-    BF.hash(/*Recompute=*/true, opts::UseDFS);
-
-    CongruentBuckets[&BF].emplace(&BF);
-  }
-
-  // We repeat the pass until no new modifications happen.
-  unsigned Iteration = 1;
-  uint64_t NumFoldedLastIteration;
-  do {
-    NumFoldedLastIteration = 0;
-
-    DEBUG(dbgs() << "BOLT-DEBUG: ICF iteration " << Iteration << "...\n");
-
-    for (auto &CBI : CongruentBuckets) {
-      auto &Candidates = CBI.second;
-      if (Candidates.size() < 2)
+  // Creates buckets with congruent functions - functions that potentially
+  // could be folded.
+  auto createCongruentBuckets = [&]() {
+    NamedRegionTimer CongruentBucketsTimer("congruent buckets",
+                                           "congruent buckets", "ICF breakdown",
+                                           "ICF breakdown", opts::TimeICF);
+    for (auto &BFI : BC.getBinaryFunctions()) {
+      auto &BF = BFI.second;
+      if (!this->shouldOptimize(BF))
        continue;
+      CongruentBuckets[&BF].emplace(&BF);
+    }
+  };
+
+  // Partition each set of congruent functions into sets of identical functions
+  // and fold them
+  auto performFoldingPass = [&]() {
+    NamedRegionTimer FoldingPassesTimer("folding passes", "folding passes",
+                                        "ICF breakdown", "ICF breakdown",
+                                        opts::TimeICF);
+    Timer SinglePass("single fold pass", "single fold pass");
+    DEBUG(SinglePass.startTimer());
+
+    ThreadPool *ThPool;
+    if (!opts::NoThreads)
+      ThPool = &ParallelUtilities::getThreadPool();
+
+    // Fold identical functions within a single congruent bucket
+    auto procesSingleBucket = [&](std::set<BinaryFunction *> &Candidates) {
+      Timer T("folding single congruent list", "folding single congruent list");
+      DEBUG(T.startTimer());

      // Identical functions go into the same bucket.
-      std::unordered_map<BinaryFunction *, std::vector<BinaryFunction *>,
-                         KeyHash, KeyEqual> IdenticalBuckets;
+      IdenticalBucketsMap IdenticalBuckets;
      for (auto *BF : Candidates) {
        IdenticalBuckets[BF].emplace_back(BF);
      }
@ -355,9 +400,10 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC,
        // Fold functions. Keep the order consistent across invocations with
        // different options.
        std::stable_sort(Twins.begin(), Twins.end(),
-            [](const BinaryFunction *A, const BinaryFunction *B) {
-              return A->getFunctionNumber() < B->getFunctionNumber();
-            });
+                         [](const BinaryFunction *A, const BinaryFunction *B) {
+                           return A->getFunctionNumber() <
+                                  B->getFunctionNumber();
+                         });

        BinaryFunction *ParentBF = Twins[0];
        for (unsigned i = 1; i < Twins.size(); ++i) {
@ -375,7 +421,7 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC,
          BytesSavedEstimate += ChildBF->getSize();
          CallsSavedEstimate += std::min(ChildBF->getKnownExecutionCount(),
                                         ParentBF->getKnownExecutionCount());
-          BC.foldFunction(*ChildBF, *ParentBF, BFs);
+          BC.foldFunction(*ChildBF, *ParentBF);

          ++NumFoldedLastIteration;

@ -384,13 +430,44 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC,
        }
      }

+      DEBUG(T.stopTimer());
+    };
+
+    // Create a task for each congruent bucket
+    for (auto &Entry : CongruentBuckets) {
+      auto &Bucket = Entry.second;
+      if (Bucket.size() < 2)
+        continue;
+
+      if (opts::NoThreads)
+        procesSingleBucket(Bucket);
+      else
+        ThPool->async(procesSingleBucket, std::ref(Bucket));
    }
+
+    if (!opts::NoThreads)
+      ThPool->wait();
+
+    DEBUG(SinglePass.stopTimer());
+  };
+
+  hashFunctions();
+  createCongruentBuckets();
+
+  unsigned Iteration = 1;
+  // We repeat the pass until no new modifications happen.
+  do {
+    NumFoldedLastIteration = 0;
+    DEBUG(dbgs() << "BOLT-DEBUG: ICF iteration " << Iteration << "...\n");
+
+    performFoldingPass();
+
    NumFunctionsFolded += NumFoldedLastIteration;
    ++Iteration;

  } while (NumFoldedLastIteration > 0);

-  DEBUG(
+   DEBUG(
    // Print functions that are congruent but not identical.
    for (auto &CBI : CongruentBuckets) {
      auto &Candidates = CBI.second;
--- a/src/Passes/IdenticalCodeFolding.h
+++ b/src/Passes/IdenticalCodeFolding.h
@ -23,6 +23,16 @@ namespace bolt {
 /// references to a single one of them.
 ///
 class IdenticalCodeFolding : public BinaryFunctionPass {
+protected:
+  /// Reject functions with unknown control flow, functions that are already
+  /// folded, and functions with an SDT marker; for all others defer to
+  /// BinaryFunctionPass::shouldOptimize().
+  bool shouldOptimize(const BinaryFunction &BF) const override {
+    const bool Excluded =
+        BF.hasUnknownControlFlow() || BF.isFolded() || BF.hasSDTMarker();
+    return !Excluded && BinaryFunctionPass::shouldOptimize(BF);
+  }
 public:
  explicit IdenticalCodeFolding(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }
@ -30,9 +40,7 @@ public:
  const char *getName() const override {
    return "identical-code-folding";
  }
-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;
 };

 } // namespace bolt
--- a/src/Passes/IndirectCallPromotion.cpp
+++ b/src/Passes/IndirectCallPromotion.cpp
@ -40,11 +40,43 @@ IndirectCallPromotion("indirect-call-promotion",
  cl::cat(BoltOptCategory));

 static cl::opt<unsigned>
-IndirectCallPromotionThreshold(
-    "indirect-call-promotion-threshold",
-    cl::desc("threshold for optimizing a frequently taken indirect call"),
-    cl::init(90),
+ICPJTRemainingPercentThreshold(
+    "icp-jt-remaining-percent-threshold",
+    cl::desc("The percentage threshold against remaining unpromoted indirect "
+             "call count for the promotion for jump tables"),
+    cl::init(30),
    cl::ZeroOrMore,
+    cl::Hidden,
+    cl::cat(BoltOptCategory));
+
+static cl::opt<unsigned>
+ICPJTTotalPercentThreshold(
+    "icp-jt-total-percent-threshold",
+    cl::desc("The percentage threshold against total count for the promotion for "
+             "jump tables"),
+    cl::init(5),
+    cl::ZeroOrMore,
+    cl::Hidden,
+    cl::cat(BoltOptCategory));
+
+static cl::opt<unsigned>
+ICPCallsRemainingPercentThreshold(
+    "icp-calls-remaining-percent-threshold",
+    cl::desc("The percentage threshold against remaining unpromoted indirect "
+             "call count for the promotion for calls"),
+    cl::init(50),
+    cl::ZeroOrMore,
+    cl::Hidden,
+    cl::cat(BoltOptCategory));
+
+static cl::opt<unsigned>
+ICPCallsTotalPercentThreshold(
+    "icp-calls-total-percent-threshold",
+    cl::desc("The percentage threshold against total count for the promotion for "
+             "calls"),
+    cl::init(30),
+    cl::ZeroOrMore,
+    cl::Hidden,
    cl::cat(BoltOptCategory));

 static cl::opt<unsigned>
@ -52,7 +84,7 @@ IndirectCallPromotionMispredictThreshold(
    "indirect-call-promotion-mispredict-threshold",
    cl::desc("misprediction threshold for skipping ICP on an "
             "indirect call"),
-    cl::init(2),
+    cl::init(0),
    cl::ZeroOrMore,
    cl::cat(BoltOptCategory));

@ -69,17 +101,17 @@ IndirectCallPromotionUseMispredicts(
 static cl::opt<unsigned>
 IndirectCallPromotionTopN(
    "indirect-call-promotion-topn",
-    cl::desc("number of targets to consider when doing indirect "
-                   "call promotion"),
-    cl::init(1),
+    cl::desc("limit number of targets to consider when doing indirect "
+             "call promotion. 0 = no limit"),
+    cl::init(3),
    cl::ZeroOrMore,
    cl::cat(BoltOptCategory));

 static cl::opt<unsigned>
 IndirectCallPromotionCallsTopN(
    "indirect-call-promotion-calls-topn",
-    cl::desc("number of targets to consider when doing indirect "
-             "call promotion on calls"),
+    cl::desc("limit number of targets to consider when doing indirect "
+             "call promotion on calls. 0 = no limit"),
    cl::init(0),
    cl::ZeroOrMore,
    cl::cat(BoltOptCategory));
@ -87,8 +119,8 @@ IndirectCallPromotionCallsTopN(
 static cl::opt<unsigned>
 IndirectCallPromotionJumpTablesTopN(
    "indirect-call-promotion-jump-tables-topn",
-    cl::desc("number of targets to consider when doing indirect "
-             "call promotion on jump tables"),
+    cl::desc("limit number of targets to consider when doing indirect "
+             "call promotion on jump tables. 0 = no limit"),
    cl::init(0),
    cl::ZeroOrMore,
    cl::cat(BoltOptCategory));
@ -106,8 +138,8 @@ static cl::opt<unsigned>
 ICPTopCallsites(
    "icp-top-callsites",
    cl::desc("only optimize calls that contribute to this percentage of all "
-             "indirect calls"),
-    cl::init(0),
+             "indirect calls. 0 = all callsites"),
+    cl::init(99),
    cl::Hidden,
    cl::ZeroOrMore,
    cl::cat(BoltOptCategory));
@ -181,6 +213,42 @@ IndirectCallPromotion::Callsite::Callsite(BinaryFunction &BF,
  }
 }

+// Debug dump of the ICP decision for one call site, one line per target.
+void IndirectCallPromotion::printDecision(
+    llvm::raw_ostream &OS,
+    std::vector<IndirectCallPromotion::Callsite> &Targets, unsigned N) const {
+  uint64_t TotalCount = 0;
+  uint64_t TotalMispreds = 0;
+  for (const auto &S : Targets) {
+    TotalCount += S.Branches;
+    TotalMispreds += S.Mispreds;
+  }
+  // Report the real totals; clamp only the percentage divisors so that an
+  // all-zero profile cannot divide by zero.
+  const uint64_t CountDiv = TotalCount ? TotalCount : 1;
+  const uint64_t MispredDiv = TotalMispreds ? TotalMispreds : 1;
+
+  OS << "BOLT-INFO: ICP decision for call site with " << Targets.size()
+     << " targets, Count = " << TotalCount << ", Mispreds = " << TotalMispreds
+     << "\n";
+  size_t I = 0;
+  for (const auto &S : Targets) {
+    OS << "Count = " << S.Branches << ", "
+       << format("%.1f", (100.0 * S.Branches) / CountDiv) << ", "
+       << "Mispreds = " << S.Mispreds << ", "
+       << format("%.1f", (100.0 * S.Mispreds) / MispredDiv);
+    if (I < N)
+      OS << " * to be optimized *";
+    if (!S.JTIndices.empty()) {
+      OS << " Indices:";
+      for (const auto Idx : S.JTIndices)
+        OS << " " << Idx;
+    }
+    OS << "\n";
+    I += S.JTIndices.empty() ? 1 : S.JTIndices.size();
+  }
+}
+
 // Get list of targets for a given call sorted by most frequently
 // called first.
 std::vector<IndirectCallPromotion::Callsite>
@ -242,7 +310,8 @@ IndirectCallPromotion::getCallTargets(
      auto &A = *Result;
      const auto &B = *First;
      if (A.To.Sym && B.To.Sym && A.To.Sym == B.To.Sym) {
-        A.JTIndex.insert(A.JTIndex.end(), B.JTIndex.begin(), B.JTIndex.end());
+        A.JTIndices.insert(A.JTIndices.end(), B.JTIndices.begin(),
+                           B.JTIndices.end());
      } else {
        *(++Result) = *First;
      }
@ -272,10 +341,17 @@ IndirectCallPromotion::getCallTargets(
    }
  }

-  // Sort by most commonly called targets.
+  // Sort by target count, number of indices in case of jump table, and
+  // mispredicts. We prioritize targets with high count, small number of
+  // indices and high mispredicts.
  std::stable_sort(Targets.begin(), Targets.end(),
                   [](const Callsite &A, const Callsite &B) {
-                     return A.Branches > B.Branches;
+                     if (A.Branches != B.Branches)
+                       return A.Branches > B.Branches;
+                     else if (A.JTIndices.size() != B.JTIndices.size())
+                       return A.JTIndices.size() < B.JTIndices.size();
+                     else
+                       return A.Mispreds > B.Mispreds;
                   });

  // Remove non-symbol targets
@ -380,9 +456,9 @@ IndirectCallPromotion::maybeGetHotJumpTableTargets(

  uint64_t ArrayStart;
  if (DispExpr) {
-    auto *BD = BC.getBinaryDataByName(DispExpr->getSymbol().getName());
-    assert(BD && "global symbol needs a value");
-    ArrayStart = BD->getAddress();
+    auto DispValueOrError = BC.getSymbolValue(DispExpr->getSymbol());
+    assert(DispValueOrError && "global symbol needs a value");
+    ArrayStart = *DispValueOrError;
  } else {
    ArrayStart = static_cast<uint64_t>(DispValue);
  }
@ -491,7 +567,7 @@ IndirectCallPromotion::SymTargetsType
 IndirectCallPromotion::findCallTargetSymbols(
  BinaryContext &BC,
  std::vector<Callsite> &Targets,
-  const size_t N,
+  size_t &N,
  BinaryFunction &Function,
  BinaryBasicBlock *BB,
  MCInst &CallInst,
@ -511,7 +587,7 @@ IndirectCallPromotion::findCallTargetSymbols(
    if (!HotTargets.empty()) {
      auto findTargetsIndex = [&](uint64_t JTIndex) {
        for (size_t I = 0; I < Targets.size(); ++I) {
-          auto &JTIs = Targets[I].JTIndex;
+          auto &JTIs = Targets[I].JTIndices;
          if (std::find(JTIs.begin(), JTIs.end(), JTIndex) != JTIs.end())
            return I;
        }
@ -521,35 +597,81 @@ IndirectCallPromotion::findCallTargetSymbols(
                         "callsite");
      };

-      const auto MaxHotTargets = std::min(N, HotTargets.size());
-
      if (opts::Verbosity >= 1) {
-        for (size_t I = 0; I < MaxHotTargets; ++I) {
+        for (size_t I = 0; I < HotTargets.size(); ++I) {
          outs() << "BOLT-INFO: HotTarget[" << I << "] = ("
                 << HotTargets[I].first << ", " << HotTargets[I].second << ")\n";
        }
      }

+      // Recompute hottest targets, now discriminating which index is hot
+      // NOTE: This is a tradeoff. On one hand, we get index information. On the
+      // other hand, info coming from the memory profile is much less accurate
+      // than LBRs. So we may actually end up working with more coarse
+      // profile granularity in exchange for information about indices.
      std::vector<Callsite> NewTargets;
-      for (size_t I = 0; I < MaxHotTargets; ++I) {
+      std::map<const MCSymbol *, uint32_t> IndicesPerTarget;
+      uint64_t TotalMemAccesses = 0;
+      for (size_t I = 0; I < HotTargets.size(); ++I) {
+        const auto TargetIndex = findTargetsIndex(HotTargets[I].second);
+        ++IndicesPerTarget[Targets[TargetIndex].To.Sym];
+        TotalMemAccesses += HotTargets[I].first;
+      }
+      uint64_t RemainingMemAccesses = TotalMemAccesses;
+      const size_t TopN = opts::IndirectCallPromotionJumpTablesTopN != 0
+                              ? opts::IndirectCallPromotionJumpTablesTopN
+                              : opts::IndirectCallPromotionTopN;
+      size_t I{0};
+      for (; I < HotTargets.size(); ++I) {
+        const auto MemAccesses = HotTargets[I].first;
+        if (100 * MemAccesses <
+            TotalMemAccesses * opts::ICPJTTotalPercentThreshold)
+          break;
+        if (100 * MemAccesses <
+            RemainingMemAccesses * opts::ICPJTRemainingPercentThreshold)
+          break;
+        if (TopN && I >= TopN)
+          break;
+        RemainingMemAccesses -= MemAccesses;
+
        const auto JTIndex = HotTargets[I].second;
-        const auto TargetIndex = findTargetsIndex(JTIndex);
+        auto &Target = Targets[findTargetsIndex(JTIndex)];

-        NewTargets.push_back(Targets[TargetIndex]);
-        std::vector<uint64_t>({JTIndex}).swap(NewTargets.back().JTIndex);
+        NewTargets.push_back(Target);
+        std::vector<uint64_t>({JTIndex}).swap(NewTargets.back().JTIndices);
+        Target.JTIndices.erase(std::remove(Target.JTIndices.begin(),
+                                           Target.JTIndices.end(), JTIndex),
+                               Target.JTIndices.end());

-        Targets.erase(Targets.begin() + TargetIndex);
+        // Keep fixCFG counts sane if more indices use this same target later
+        assert(IndicesPerTarget[Target.To.Sym] > 0 && "wrong map");
+        NewTargets.back().Branches =
+            Target.Branches / IndicesPerTarget[Target.To.Sym];
+        NewTargets.back().Mispreds =
+            Target.Mispreds / IndicesPerTarget[Target.To.Sym];
+        assert(Target.Branches >= NewTargets.back().Branches);
+        assert(Target.Mispreds >= NewTargets.back().Mispreds);
+        Target.Branches -= NewTargets.back().Branches;
+        Target.Mispreds -= NewTargets.back().Mispreds;
      }
      std::copy(Targets.begin(), Targets.end(), std::back_inserter(NewTargets));
-      assert(NewTargets.size() == Targets.size() + MaxHotTargets);
      std::swap(NewTargets, Targets);
+      N = I;
+
+      if (N == 0 && opts::Verbosity >= 1) {
+        outs() << "BOLT-INFO: ICP failed in " << Function << " in "
+               << BB->getName()
+               << ": failed to meet thresholds after memory profile data was "
+                  "loaded.\n";
+        return SymTargets;
+      }
    }

    for (size_t I = 0, TgtIdx = 0; I < N; ++TgtIdx) {
      auto &Target = Targets[TgtIdx];
      assert(Target.To.Sym && "All ICP targets must be to known symbols");
-      assert(!Target.JTIndex.empty() && "Jump tables must have indices");
-      for (auto Idx : Target.JTIndex) {
+      assert(!Target.JTIndices.empty() && "Jump tables must have indices");
+      for (auto Idx : Target.JTIndices) {
        SymTargets.push_back(std::make_pair(Target.To.Sym, Idx));
        ++I;
      }
@ -558,7 +680,7 @@ IndirectCallPromotion::findCallTargetSymbols(
    for (size_t I = 0; I < N; ++I) {
      assert(Targets[I].To.Sym &&
             "All ICP targets must be to known symbols");
-      assert(Targets[I].JTIndex.empty() &&
+      assert(Targets[I].JTIndices.empty() &&
             "Can't have jump table indices for non-jump tables");
      SymTargets.push_back(std::make_pair(Targets[I].To.Sym, 0));
    }
@ -647,7 +769,7 @@ IndirectCallPromotion::maybeGetVtableSyms(
                            << "+" << MethodOffset << "/" << MI.Count
                            << "\n");

-    if (auto MethodAddr = BC.extractPointerAtAddress(Address)) {
+    if (auto MethodAddr = BC.getPointerAtAddress(Address)) {
      auto *MethodBD = BC.getBinaryDataAtAddress(MethodAddr.get());
      if (!MethodBD)  // skip unknown methods
        continue;
@ -697,7 +819,7 @@ IndirectCallPromotion::rewriteCall(
   BinaryFunction &Function,
   BinaryBasicBlock *IndCallBlock,
   const MCInst &CallInst,
-   MCPlusBuilder::ICPdata &&ICPcode,
+   MCPlusBuilder::BlocksVectorTy &&ICPcode,
   const std::vector<MCInst *> &MethodFetchInsns
 ) const {
  // Create new basic blocks with correct code in each one first.
@ -720,6 +842,10 @@ IndirectCallPromotion::rewriteCall(
  }

  auto MovedInst = IndCallBlock->splitInstructions(&CallInst);
+  // Link new BBs to the original input offset of the BB where the indirect
+  // call site is, so we can map samples recorded in new BBs back to the
+  // original BB seen in the input binary (if using BAT)
+  const auto OrigOffset = IndCallBlock->getInputOffset();

  IndCallBlock->eraseInstructions(MethodFetchInsns.begin(),
                                  MethodFetchInsns.end());
@ -737,7 +863,7 @@ IndirectCallPromotion::rewriteCall(
    auto &Sym = Itr->first;
    auto &Insts = Itr->second;
    assert(Sym);
-    auto TBB = Function.createBasicBlock(0, Sym);
+    auto TBB = Function.createBasicBlock(OrigOffset, Sym);
    for (auto &Inst : Insts) { // sanitize new instructions.
      if (BC.MIB->isCall(Inst))
        BC.MIB->removeAnnotation(Inst, "CallProfile");
@ -774,10 +900,12 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG(
  for (const auto &Target : Targets) {
    TotalIndirectBranches += Target.Branches;
  }
+  if (TotalIndirectBranches == 0)
+    TotalIndirectBranches = 1;
  std::vector<BinaryBranchInfo> BBI;
  std::vector<BinaryBranchInfo> ScaledBBI;
  for (const auto &Target : Targets) {
-    const auto NumEntries = std::max(1UL, Target.JTIndex.size());
+    const auto NumEntries = std::max(1UL, Target.JTIndices.size());
    for (size_t I = 0; I < NumEntries; ++I) {
      BBI.push_back(
          BinaryBranchInfo{(Target.Branches + NumEntries - 1) / NumEntries,
@ -796,7 +924,7 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG(

    std::vector<MCSymbol*> SymTargets;
    for (const auto &Target : Targets) {
-      const auto NumEntries = std::max(1UL, Target.JTIndex.size());
+      const auto NumEntries = std::max(1UL, Target.JTIndices.size());
      for (size_t I = 0; I < NumEntries; ++I) {
        SymTargets.push_back(Target.To.Sym);
      }
@ -924,15 +1052,12 @@ IndirectCallPromotion::canPromoteCallsite(const BinaryBasicBlock *BB,
  } else if (opts::IndirectCallPromotionCallsTopN != 0) {
    TopN = opts::IndirectCallPromotionCallsTopN;
  }
-  const auto TrialN = std::min(TopN, Targets.size());
+  const auto TrialN = TopN ? std::min(TopN, Targets.size()) : Targets.size();

  if (opts::ICPTopCallsites > 0) {
    auto &BC = BB->getFunction()->getBinaryContext();
-    if (BC.MIB->hasAnnotation(Inst, "DoICP")) {
-      computeStats(TrialN);
-      return TrialN;
-    }
-    return 0;
+    if (!BC.MIB->hasAnnotation(Inst, "DoICP"))
+      return 0;
  }

  // Pick the top N targets.
@ -974,35 +1099,28 @@ IndirectCallPromotion::canPromoteCallsite(const BinaryBasicBlock *BB,
    // Count total number of calls for (at most) the top N targets.
    // We may choose a smaller N (TrialN vs. N) if the frequency threshold
    // is exceeded by fewer targets.
-    double Threshold = double(opts::IndirectCallPromotionThreshold);
-    for (size_t I = 0; I < TrialN && Threshold > 0; ++I, ++MaxTargets) {
-      if (N + (Targets[I].JTIndex.empty() ? 1 : Targets[I].JTIndex.size()) >
+    const unsigned TotalThreshold = IsJumpTable
+                                        ? opts::ICPJTTotalPercentThreshold
+                                        : opts::ICPCallsTotalPercentThreshold;
+    const unsigned RemainingThreshold =
+        IsJumpTable ? opts::ICPJTRemainingPercentThreshold
+                    : opts::ICPCallsRemainingPercentThreshold;
+    uint64_t NumRemainingCalls = NumCalls;
+    for (size_t I = 0; I < TrialN; ++I, ++MaxTargets) {
+      if (100 * Targets[I].Branches < NumCalls * TotalThreshold)
+        break;
+      if (100 * Targets[I].Branches < NumRemainingCalls * RemainingThreshold)
+        break;
+      if (N + (Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size()) >
          TrialN)
        break;
      TotalCallsTopN += Targets[I].Branches;
      TotalMispredictsTopN += Targets[I].Mispreds;
-      Threshold -= (100.0 * Targets[I].Branches) / NumCalls;
-      N += Targets[I].JTIndex.empty() ? 1 : Targets[I].JTIndex.size();
+      NumRemainingCalls -= Targets[I].Branches;
+      N += Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size();
    }
    computeStats(MaxTargets);

-    // Compute the frequency of the top N call targets.  If this frequency
-    // is greater than the threshold, we should try ICP on this callsite.
-    const double TopNFrequency = (100.0 * TotalCallsTopN) / NumCalls;
-
-    if (TopNFrequency == 0 ||
-        TopNFrequency < opts::IndirectCallPromotionThreshold) {
-      if (opts::Verbosity >= 1) {
-        const auto InstIdx = &Inst - &(*BB->begin());
-        outs() << "BOLT-INFO: ICP failed in " << *BB->getFunction() << " @ "
-               << InstIdx << " in " << BB->getName() << ", calls = "
-               << NumCalls << ", top N frequency "
-               << format("%.1f", TopNFrequency) << "% < "
-               << opts::IndirectCallPromotionThreshold << "%\n";
-      }
-      return 0;
-    }
-
    // Don't check misprediction frequency for jump tables -- we don't really
    // care as long as we are saving loads from the jump table.
    if (!IsJumpTable || opts::ICPJumpTablesByTarget) {
@ -1069,7 +1187,7 @@ IndirectCallPromotion::printCallsiteInfo(const BinaryBasicBlock *BB,
           << ", taken freq = " << format("%.1f", Frequency) << "%"
           << ", mis. freq = " << format("%.1f", MisFrequency) << "%";
    bool First = true;
-    for (auto JTIndex : Targets[I].JTIndex) {
+    for (auto JTIndex : Targets[I].JTIndices) {
      outs() << (First ? ", indices = " : ", ") << JTIndex;
      First = false;
    }
@ -1082,14 +1200,12 @@ IndirectCallPromotion::printCallsiteInfo(const BinaryBasicBlock *BB,
  });
 }

-void IndirectCallPromotion::runOnFunctions(
-  BinaryContext &BC,
-  std::map<uint64_t, BinaryFunction> &BFs,
-  std::set<uint64_t> &LargeFunctions
-) {
+void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
  if (opts::IndirectCallPromotion == ICP_NONE)
    return;

+  auto &BFs = BC.getBinaryFunctions();
+
  const bool OptimizeCalls =
    (opts::IndirectCallPromotion == ICP_CALLS ||
     opts::IndirectCallPromotion == ICP_ALL);
@ -1100,7 +1216,7 @@ void IndirectCallPromotion::runOnFunctions(
  std::unique_ptr<RegAnalysis> RA;
  std::unique_ptr<BinaryFunctionCallGraph> CG;
  if (OptimizeJumpTables) {
-    CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC, BFs)));
+    CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC)));
    RA.reset(new RegAnalysis(BC, &BFs, &*CG));
  }

@ -1148,8 +1264,13 @@ void IndirectCallPromotion::runOnFunctions(
  // If icp-top-callsites is enabled, compute the total number of indirect
  // calls and then optimize the hottest callsites that contribute to that
  // total.
-  if (opts::ICPTopCallsites > 0) {
-    using IndirectCallsite = std::pair<uint64_t, MCInst *>;
+  SetVector<BinaryFunction *> Functions;
+  if (opts::ICPTopCallsites == 0) {
+    for (auto &KV : BFs) {
+      Functions.insert(&KV.second);
+    }
+  } else {
+    using IndirectCallsite = std::tuple<uint64_t, MCInst *, BinaryFunction *>;
    std::vector<IndirectCallsite> IndirectCalls;
    size_t TotalIndirectCalls = 0;

@ -1183,7 +1304,7 @@ void IndirectCallPromotion::runOnFunctions(
              NumCalls += BInfo.Branches;
            }

-            IndirectCalls.push_back(std::make_pair(NumCalls, &Inst));
+            IndirectCalls.push_back(std::make_tuple(NumCalls, &Inst, &Function));
            TotalIndirectCalls += NumCalls;
          }
        }
@ -1198,30 +1319,25 @@ void IndirectCallPromotion::runOnFunctions(
    const float TopPerc = opts::ICPTopCallsites / 100.0f;
    int64_t MaxCalls = TotalIndirectCalls * TopPerc;
    size_t Num = 0;
-    for (auto &IC : IndirectCalls) {
+    for (const auto &IC : IndirectCalls) {
      if (MaxCalls <= 0)
        break;
-      MaxCalls -= IC.first;
+      MaxCalls -= std::get<0>(IC);
+      BC.MIB->addAnnotation(*std::get<1>(IC), "DoICP", true);
+      Functions.insert(std::get<2>(IC));
      ++Num;
    }
    outs() << "BOLT-INFO: ICP Total indirect calls = " << TotalIndirectCalls
           << ", " << Num << " callsites cover " << opts::ICPTopCallsites
           << "% of all indirect calls\n";
-
-    // Mark sites to optimize with "DoICP" annotation.
-    for (size_t I = 0; I < Num; ++I) {
-      auto *Inst = IndirectCalls[I].second;
-      BC.MIB->addAnnotation(*Inst, "DoICP", true);
-    }
  }

-  for (auto &BFIt : BFs) {
-    auto &Function = BFIt.second;
+  for (auto *FuncPtr : Functions) {
+    auto &Function = *FuncPtr;

-    if (!Function.isSimple() || !opts::shouldProcess(Function))
-      continue;
-
-    if (!Function.hasProfile())
+    if (!Function.isSimple() ||
+        !opts::shouldProcess(Function) ||
+        !Function.hasProfile())
      continue;

    const bool HasLayout = !Function.layout_empty();
@ -1309,7 +1425,10 @@ void IndirectCallPromotion::runOnFunctions(
        // this callsite.
        size_t N = canPromoteCallsite(BB, Inst, Targets, NumCalls);

-        if (!N)
+        // If it is a jump table and it failed to meet our initial threshold,
+        // proceed to findCallTargetSymbols -- it may reevaluate N if
+        // memory profile is present
+        if (!N && !IsJumpTable)
          continue;

        if (opts::Verbosity >= 1) {
@ -1326,6 +1445,13 @@ void IndirectCallPromotion::runOnFunctions(
                                                      Inst,
                                                      TargetFetchInst);

+        // findCallTargetSymbols may have changed N if mem profile is available
+        // for jump tables
+        if (!N)
+          continue;
+
+        DEBUG(printDecision(dbgs(), Targets, N));
+
        // If we can't resolve any of the target symbols, punt on this callsite.
        // TODO: can this ever happen?
        if (SymTargets.size() < N) {
@ -1446,12 +1572,12 @@ void IndirectCallPromotion::runOnFunctions(
         << "BOLT-INFO: ICP percentage of indirect calls that can be "
            "optimized = "
         << format("%.1f", (100.0 * TotalNumFrequentCalls) /
-                   std::max(TotalIndirectCalls, 1ul))
+                   std::max<size_t>(TotalIndirectCalls, 1))
         << "%\n"
         << "BOLT-INFO: ICP percentage of indirect callsites that are "
            "optimized = "
         << format("%.1f", (100.0 * TotalOptimizedIndirectCallsites) /
-                   std::max(TotalIndirectCallsites, 1ul))
+                   std::max<uint64_t>(TotalIndirectCallsites, 1))
         << "%\n"
         << "BOLT-INFO: ICP number of method load elimination candidates = "
         << TotalMethodLoadEliminationCandidates
@ -1459,17 +1585,17 @@ void IndirectCallPromotion::runOnFunctions(
         << "BOLT-INFO: ICP percentage of method calls candidates that have "
            "loads eliminated = "
         << format("%.1f", (100.0 * TotalMethodLoadsEliminated) /
-                   std::max(TotalMethodLoadEliminationCandidates, 1ul))
+                   std::max<uint64_t>(TotalMethodLoadEliminationCandidates, 1))
         << "%\n"
         << "BOLT-INFO: ICP percentage of indirect branches that are "
            "optimized = "
         << format("%.1f", (100.0 * TotalNumFrequentJmps) /
-                   std::max(TotalIndirectJmps, 1ul))
+                   std::max<uint64_t>(TotalIndirectJmps, 1))
         << "%\n"
         << "BOLT-INFO: ICP percentage of jump table callsites that are "
         << "optimized = "
         << format("%.1f", (100.0 * TotalOptimizedJumpTableCallsites) /
-                   std::max(TotalJumpTableCallsites, 1ul))
+                   std::max<uint64_t>(TotalJumpTableCallsites, 1))
         << "%\n"
         << "BOLT-INFO: ICP number of jump table callsites that can use hot "
         << "indices = " << TotalIndexBasedCandidates
@ -1477,7 +1603,7 @@ void IndirectCallPromotion::runOnFunctions(
         << "BOLT-INFO: ICP percentage of jump table callsites that use hot "
            "indices = "
         << format("%.1f", (100.0 * TotalIndexBasedJumps) /
-                   std::max(TotalIndexBasedCandidates, 1ul))
+                   std::max<uint64_t>(TotalIndexBasedCandidates, 1))
         << "%\n";

 #ifndef NDEBUG
--- a/src/Passes/IndirectCallPromotion.h
+++ b/src/Passes/IndirectCallPromotion.h
@ -119,7 +119,7 @@ class IndirectCallPromotion : public BinaryFunctionPass {
    uint64_t Mispreds{0};
    uint64_t Branches{0};
    // Indices in the jmp table (jt only)
-    std::vector<uint64_t> JTIndex;
+    std::vector<uint64_t> JTIndices;
    bool isValid() const {
      return From.isValid() && To.isValid();
    }
@ -128,7 +128,7 @@ class IndirectCallPromotion : public BinaryFunctionPass {
             uint64_t Mispreds, uint64_t Branches,
             uint64_t JTIndex)
    : From(From), To(To), Mispreds(Mispreds), Branches(Branches),
-      JTIndex(1, JTIndex) { }
+      JTIndices(1, JTIndex) { }
  };

  std::unordered_set<const BinaryFunction *> Modified;
@ -177,6 +177,10 @@ class IndirectCallPromotion : public BinaryFunctionPass {
  // Total number of jump table sites that use hot indices.
  uint64_t TotalIndexBasedJumps{0};

+  void printDecision(llvm::raw_ostream &OS,
+                     std::vector<IndirectCallPromotion::Callsite> &Targets,
+                     unsigned N) const;
+
  std::vector<Callsite> getCallTargets(BinaryBasicBlock &BB,
                                       const MCInst &Inst) const;

@ -201,7 +205,7 @@ class IndirectCallPromotion : public BinaryFunctionPass {

  SymTargetsType findCallTargetSymbols(BinaryContext &BC,
                                       std::vector<Callsite> &Targets,
-                                       const size_t N,
+                                       size_t &N,
                                       BinaryFunction &Function,
                                       BinaryBasicBlock *BB,
                                       MCInst &Inst,
@ -218,7 +222,7 @@ class IndirectCallPromotion : public BinaryFunctionPass {
              BinaryFunction &Function,
              BinaryBasicBlock *IndCallBlock,
              const MCInst &CallInst,
-              MCPlusBuilder::ICPdata &&ICPcode,
+              MCPlusBuilder::BlocksVectorTy &&ICPcode,
              const std::vector<MCInst *> &MethodFetchInsns) const;

  BinaryBasicBlock *fixCFG(BinaryContext &BC,
@ -239,9 +243,7 @@ class IndirectCallPromotion : public BinaryFunctionPass {
  bool shouldPrint(const BinaryFunction &BF) const override {
    return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
  }
-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;
 };

 } // namespace bolt
--- a/src/Passes/Inliner.cpp
+++ b/src/Passes/Inliner.cpp
@ -180,6 +180,9 @@ Inliner::InliningInfo Inliner::getInliningInfo(const BinaryFunction &BF) const {

  // Perform necessary checks unless the option overrides it.
  if (!opts::mustConsider(BF)) {
+    if (BF.hasSDTMarker())
+      return INL_NONE;
+
    if (BF.hasEHRanges())
      return INL_NONE;

@ -248,9 +251,8 @@ Inliner::InliningInfo Inliner::getInliningInfo(const BinaryFunction &BF) const {
 }

 void
-Inliner::findInliningCandidates(BinaryContext &BC,
-                                const std::map<uint64_t, BinaryFunction> &BFs) {
-  for (const auto &BFI : BFs) {
+Inliner::findInliningCandidates(BinaryContext &BC) {
+  for (const auto &BFI : BC.getBinaryFunctions()) {
    const auto &Function = BFI.second;
    const auto InlInfo = getInliningInfo(Function);
    if (InlInfo.Type != INL_NONE)
@ -532,16 +534,14 @@ bool Inliner::inlineCallsInFunction(BinaryFunction &Function) {
  return DidInlining;
 }

-void Inliner::runOnFunctions(BinaryContext &BC,
-                             std::map<uint64_t, BinaryFunction> &BFs,
-                             std::set<uint64_t> &) {
+void Inliner::runOnFunctions(BinaryContext &BC) {
  opts::syncOptions();

  if (!opts::inliningEnabled())
    return;

  uint64_t TotalSize = 0;
-  for (auto &BFI : BFs)
+  for (auto &BFI : BC.getBinaryFunctions())
    TotalSize += BFI.second.getSize();

  bool InlinedOnce;
@ -553,10 +553,10 @@ void Inliner::runOnFunctions(BinaryContext &BC,
    InlinedOnce = false;

    InliningCandidates.clear();
-    findInliningCandidates(BC, BFs);
+    findInliningCandidates(BC);

    std::vector<BinaryFunction *> ConsideredFunctions;
-    for (auto &BFI : BFs) {
+    for (auto &BFI : BC.getBinaryFunctions()) {
      auto &Function = BFI.second;
      if (!shouldOptimize(Function))
        continue;
--- a/src/Passes/Inliner.h
+++ b/src/Passes/Inliner.h
@ -39,7 +39,7 @@ private:
      : Type(Type)
    {}
  };
-    
+
  std::unordered_map<const BinaryFunction *, InliningInfo> InliningCandidates;

  /// Count total amount of bytes inlined for all instances of Inliner.
@ -58,7 +58,7 @@ private:

  /// Size in bytes of a tail call instruction.
  static uint64_t SizeOfTailCallInst;
-  
+
  /// Set of functions modified by inlining (used for printing).
  std::unordered_set<const BinaryFunction *> Modified;

@ -68,8 +68,7 @@ private:
  /// Return the size in bytes of a tail call instruction.
  uint64_t getSizeOfTailCallInst(const BinaryContext &BC);

-  void findInliningCandidates(BinaryContext &BC,
-                              const std::map<uint64_t, BinaryFunction> &BFs);
+  void findInliningCandidates(BinaryContext &BC);

  bool inlineCallsInFunction(BinaryFunction &Function);

@ -97,9 +96,7 @@ public:
    return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
  }

-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;
 };

 } // namespace bolt
--- a/src/Passes/Instrumentation.cpp
+++ b/src/Passes/Instrumentation.cpp
@ -0,0 +1,314 @@
+//===--- Passes/Instrumentation.cpp ---------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "Instrumentation.h"
+#include "Passes/DataflowInfoManager.h"
+#include "llvm/Support/Options.h"
+
+#define DEBUG_TYPE "bolt-instrumentation"
+
+using namespace llvm;
+
+namespace opts {
+// Option category shared with the rest of the tool; defined in another
+// translation unit.
+extern cl::OptionCategory BoltCategory;
+
+// Function filter defined elsewhere. Instrumentation::runOnFunctions() skips
+// every function for which this returns false.
+extern bool shouldProcess(const llvm::bolt::BinaryFunction &Function);
+
+// Path of the profile file. Instrumentation::emit() writes this string into
+// the output binary at the __bolt_instr_filename symbol.
+cl::opt<std::string> InstrumentationFilename(
+    "instrumentation-file",
+    cl::desc("file name where instrumented profile will be saved"),
+    cl::init("/tmp/prof.fdata"),
+    cl::Optional,
+    cl::cat(BoltCategory));
+
+// When set, Instrumentation::runOnFunctions() skips functions whose
+// getKnownExecutionCount() is zero.
+cl::opt<bool> InstrumentHotOnly(
+    "instrument-hot-only",
+    cl::desc("only insert instrumentation on hot functions (need profile)"),
+    cl::init(false),
+    cl::Optional,
+    cl::cat(BoltCategory));
+}
+
+namespace llvm {
+namespace bolt {
+
+/// Return the position inside StringTable at which the first name of
+/// \p Function starts. The name is appended (followed by a NUL byte) the first
+/// time a function is seen; later queries return the remembered position.
+uint32_t Instrumentation::getFunctionNameIndex(const BinaryFunction &Function) {
+  const auto Known = FuncToStringIdx.find(&Function);
+  if (Known == FuncToStringIdx.end()) {
+    // A new name begins where the table currently ends.
+    const auto Offset = StringTable.size();
+    FuncToStringIdx.emplace(&Function, Offset);
+    StringTable.append(Function.getNames()[0]);
+    StringTable.push_back('\0');
+    return Offset;
+  }
+  return Known->second;
+}
+
+/// Build the record describing one counter: string-table positions of the
+/// source and destination function names plus the two offsets handed in by
+/// the caller.
+Instrumentation::CounterDescription Instrumentation::createDescription(
+    const BinaryFunction &FromFunction, uint32_t From,
+    const BinaryFunction &ToFunction, uint32_t To) {
+  // The source name is looked up first so that a previously unseen source
+  // function is placed in the string table ahead of a previously unseen
+  // destination function.
+  const uint32_t FromNameIdx = getFunctionNameIndex(FromFunction);
+  const uint32_t ToNameIdx = getFunctionNameIndex(ToFunction);
+  return CounterDescription{FromNameIdx, From, ToNameIdx, To};
+}
+
+/// Allocate a new counter for the edge \p FromFunction + \p FromOffset ->
+/// \p ToFunc + \p ToOffset and return the five instructions that bump it.
+/// As a side effect, one entry is appended to both Descriptions and Labels.
+std::vector<MCInst> Instrumentation::createInstrumentationSnippet(
+    BinaryFunction &FromFunction, uint32_t FromOffset, BinaryFunction &ToFunc,
+    uint32_t ToOffset) {
+  Descriptions.push_back(
+      createDescription(FromFunction, FromOffset, ToFunc, ToOffset));
+
+  BinaryContext &BC = FromFunction.getBinaryContext();
+  auto &Builder = *BC.MIB;
+
+  // Temporary symbol naming the memory slot of this counter; emit() places
+  // the label in the counters section.
+  MCSymbol *CounterSym = BC.Ctx->createTempSymbol("InstrEntry", true);
+  Labels.push_back(CounterSym);
+
+  // Don't clobber application red zone (ABI dependent): the stack pointer is
+  // adjusted by this many bytes around the flags save/restore.
+  const int RedZoneBytes = 128;
+
+  std::vector<MCInst> Snippet(5);
+  Builder.createStackPointerIncrement(Snippet[0], RedZoneBytes,
+                                      /*NoFlagsClobber=*/true);
+  Builder.createPushFlags(Snippet[1], 2);
+  Builder.createIncMemory(Snippet[2], CounterSym, BC.Ctx.get());
+  Builder.createPopFlags(Snippet[3], 2);
+  Builder.createStackPointerDecrement(Snippet[4], RedZoneBytes,
+                                      /*NoFlagsClobber=*/true);
+  return Snippet;
+}
+
+/// Place one counter for the edge \p FromFunction + \p From ->
+/// \p ToFunc + \p ToOffset.
+///
+/// \p Iter points at the instruction in \p FromBB being instrumented; when the
+/// counter is inserted into \p FromBB, \p Iter is updated so that it ends up
+/// just past the inserted instructions. \p TargetBB is the local destination
+/// block, or null when the destination is outside \p FromFunction.
+///
+/// Returns false (and allocates nothing) when \p TargetBB is null and the
+/// instruction is not a call; returns true otherwise. For critical edges the
+/// counter code is only queued in SplitWorklist/SplitInstrs and is placed
+/// later by runOnFunctions().
+bool Instrumentation::instrumentOneTarget(BinaryBasicBlock::iterator &Iter,
+                                          BinaryFunction &FromFunction,
+                                          BinaryBasicBlock &FromBB,
+                                          uint32_t From, BinaryFunction &ToFunc,
+                                          BinaryBasicBlock *TargetBB,
+                                          uint32_t ToOffset) {
+  BinaryContext &BC = FromFunction.getBinaryContext();
+  const MCInst &Inst = *Iter;
+
+  // A site without a local target block can only be instrumented when it is
+  // a call. Bail out before createInstrumentationSnippet() runs: that call
+  // appends to Descriptions and Labels, so creating the snippet first would
+  // leave an orphan counter and table entry for a site that is never
+  // instrumented.
+  if (!TargetBB && !BC.MIB->isCall(Inst))
+    return false;
+
+  std::vector<MCInst> CounterInstrs =
+      createInstrumentationSnippet(FromFunction, From, ToFunc, ToOffset);
+
+  // Insert the whole snippet in front of \p Pos inside \p BB, keeping the
+  // snippet order and leaving \p Pos just past the last inserted instruction.
+  auto InsertBefore = [&CounterInstrs](BinaryBasicBlock &BB,
+                                       BinaryBasicBlock::iterator &Pos) {
+    for (auto &NewInst : CounterInstrs) {
+      Pos = BB.insertInstruction(Pos, NewInst);
+      ++Pos;
+    }
+  };
+
+  // Call leaving the function: count right before the call instruction.
+  if (!TargetBB) {
+    InsertBefore(FromBB, Iter);
+    return true;
+  }
+
+  // Indirect branch, conditional branches or fall-throughs
+  // Regular cond branch, put counter at start of target block
+  if (TargetBB->pred_size() == 1 && &FromBB != TargetBB &&
+      !TargetBB->isEntryPoint()) {
+    auto RemoteIter = TargetBB->begin();
+    InsertBefore(*TargetBB, RemoteIter);
+    return true;
+  }
+
+  // The source block has a single successor, so the counter can live in the
+  // source block itself.
+  if (FromBB.succ_size() == 1 && &FromBB != TargetBB) {
+    InsertBefore(FromBB, Iter);
+    return true;
+  }
+
+  // Critical edge, create BB and put counter there
+  SplitWorklist.emplace_back(std::make_pair(&FromBB, TargetBB));
+  SplitInstrs.emplace_back(std::move(CounterInstrs));
+  return true;
+}
+
+/// Insert counter code into every eligible function of \p BC. Does nothing
+/// unless BC.isX86() holds. For each basic block this instruments direct
+/// calls/branches, every successor of a jump-table branch, and the
+/// fall-through edge; critical edges collected by instrumentOneTarget() are
+/// split once the whole function has been scanned. A one-line summary is
+/// printed to outs() at the end.
+void Instrumentation::runOnFunctions(BinaryContext &BC) {
+  if (!BC.isX86())
+    return;
+
+  // Both sections are registered here without data; their contents are
+  // produced later by emit() and emitTablesAsELFNote().
+  const auto Flags = BinarySection::getFlags(/*IsReadOnly=*/false,
+                                             /*IsText=*/false,
+                                             /*IsAllocatable=*/true);
+  BC.registerOrUpdateSection(".bolt.instr.counters", ELF::SHT_PROGBITS, Flags,
+                             nullptr, 0, 1,
+                             /*local=*/true);
+
+  BC.registerOrUpdateNoteSection(".bolt.instr.tables", nullptr,
+                                  0,
+                                  /*Alignment=*/1,
+                                  /*IsReadOnly=*/true, ELF::SHT_NOTE);
+
+  uint64_t InstrumentationSites{0ULL};
+  // NOTE(review): this counter is never incremented in this function, so the
+  // summary below always reports 0 "saving flags" - confirm intent.
+  uint64_t InstrumentationSitesSavingFlags{0ULL};
+  for (auto &BFI : BC.getBinaryFunctions()) {
+    BinaryFunction &Function = BFI.second;
+    // Skip non-simple functions, functions filtered out by shouldProcess(),
+    // and (with -instrument-hot-only) functions with no known execution count.
+    if (!Function.isSimple() || !opts::shouldProcess(Function)
+        || (opts::InstrumentHotOnly && !Function.getKnownExecutionCount()))
+      continue;
+    Function.disambiguateJumpTables();
+    // The critical-edge worklist is per function; start from a clean state.
+    SplitWorklist.clear();
+    SplitInstrs.clear();
+
+    for (auto BBI = Function.begin(); BBI != Function.end(); ++BBI) {
+      auto &BB{*BBI};
+      bool HasUnconditionalBranch{false};
+      bool HasJumpTable{false};
+
+      for (auto I = BB.begin(); I != BB.end(); ++I) {
+        const auto &Inst = *I;
+        // Only instructions carrying an "Offset" annotation are considered.
+        if (!BC.MIB->hasAnnotation(Inst, "Offset"))
+          continue;
+
+        // Classify the instruction. Jump-table branches and unconditional
+        // branches are remembered for the fall-through handling below; any
+        // other instruction is skipped unless it is a call or a conditional
+        // branch that is not an unsupported branch.
+        const bool IsJumpTable = Function.getJumpTable(Inst);
+        if (IsJumpTable)
+          HasJumpTable = true;
+        else if (BC.MIB->isUnconditionalBranch(Inst))
+          HasUnconditionalBranch = true;
+        else if ((!BC.MIB->isCall(Inst) &&
+                  !BC.MIB->isConditionalBranch(Inst)) ||
+                 BC.MIB->isUnsupportedBranch(Inst.getOpcode()))
+          continue;
+
+        uint32_t FromOffset = BC.MIB->getAnnotationAs<uint32_t>(Inst, "Offset");
+        const MCSymbol *Target = BC.MIB->getTargetSymbol(Inst);
+        BinaryBasicBlock *TargetBB = Function.getBasicBlockForLabel(Target);
+        // Destination offset is the target block's input offset, or 0 when
+        // the target is not a block of this function.
+        uint32_t ToOffset = TargetBB ? TargetBB->getInputOffset() : 0;
+        // A local target block means the destination is this function;
+        // otherwise look the destination function up by symbol.
+        BinaryFunction *TargetFunc =
+            TargetBB ? &Function : BC.getFunctionForSymbol(Target);
+        // Should be null for indirect branches/calls
+        if (TargetFunc) {
+          if (instrumentOneTarget(I, Function, BB, FromOffset, *TargetFunc,
+                                  TargetBB, ToOffset))
+            ++InstrumentationSites;
+          continue;
+        }
+
+        // No single known target: for a jump table, instrument the edge to
+        // each successor block separately.
+        if (IsJumpTable) {
+          for (auto &Succ : BB.successors()) {
+            if (instrumentOneTarget(I, Function, BB, FromOffset, Function,
+                                    &*Succ, Succ->getInputOffset()))
+              ++InstrumentationSites;
+          }
+          continue;
+        }
+
+        // FIXME: handle indirect calls
+      } // End of instructions loop
+
+      // Instrument fallthroughs (when the direct jump instruction is missing)
+      if (!HasUnconditionalBranch && !HasJumpTable && BB.succ_size() > 0 &&
+          BB.size() > 0) {
+        auto *FTBB = BB.getFallthrough();
+        assert(FTBB && "expected valid fall-through basic block");
+        // I is the insertion point handed to instrumentOneTarget(): when the
+        // counter is placed in this block it goes at the block's beginning.
+        auto I = BB.begin();
+        // Find the last non-pseudo instruction; its offset identifies the
+        // source of the fall-through edge.
+        auto LastInstr = BB.end();
+        --LastInstr;
+        while (LastInstr != I && BC.MIB->isPseudo(*LastInstr))
+          --LastInstr;
+        uint32_t FromOffset = 0;
+        // The last instruction in the BB should have an annotation, except
+        // if it was branching to the end of the function as a result of
+        // __builtin_unreachable(), in which case it was deleted by fixBranches.
+        // Ignore this case. FIXME: force fixBranches() to preserve the offset.
+        if (!BC.MIB->hasAnnotation(*LastInstr, "Offset"))
+          continue;
+
+        FromOffset = BC.MIB->getAnnotationAs<uint32_t>(*LastInstr, "Offset");
+        if (instrumentOneTarget(I, Function, BB, FromOffset, Function, FTBB,
+                                FTBB->getInputOffset()))
+          ++InstrumentationSites;
+      }
+    } // End of BBs loop
+
+    // Consume list of critical edges: split them and add instrumentation to the
+    // newly created BBs
+    // SplitInstrs is walked in lockstep with SplitWorklist: entry i holds the
+    // counter code for edge i.
+    auto Iter = SplitInstrs.begin();
+    for (auto &BBPair : SplitWorklist) {
+      auto *NewBB = Function.splitEdge(BBPair.first, BBPair.second);
+      NewBB->addInstructions(Iter->begin(), Iter->end());
+      ++Iter;
+    }
+  }
+
+  outs() << "BOLT-INSTRUMENTER: Instrumented " << InstrumentationSites
+         << " sites, " << InstrumentationSitesSavingFlags << " saving flags.\n";
+}
+
+/// Serialize the counter descriptions followed by the string table, wrap the
+/// bytes in a "BOLT" ELF note and store it in the .bolt.instr.tables section.
+void Instrumentation::emitTablesAsELFNote(BinaryContext &BC) {
+  std::string Payload;
+  raw_string_ostream Stream(Payload);
+
+  // Copies the in-memory bytes of one 32-bit field into the payload.
+  const auto EmitWord = [&Stream](const uint32_t &Word) {
+    Stream.write(reinterpret_cast<const char *>(&Word), sizeof(Word));
+  };
+
+  // Descriptions come first: one four-field record per counter, in the order
+  // the counters were created.
+  for (const CounterDescription &Entry : Descriptions) {
+    EmitWord(Entry.FromFuncStringIdx);
+    EmitWord(Entry.FromOffset);
+    EmitWord(Entry.ToFuncStringIdx);
+    EmitWord(Entry.ToOffset);
+  }
+
+  // The string table follows the last record directly.
+  Stream << StringTable;
+  Stream.flush();
+
+  const auto Note = BinarySection::encodeELFNote(
+      "BOLT", Payload, BinarySection::NT_BOLT_INSTRUMENTATION_TABLES);
+  BC.registerOrUpdateNoteSection(".bolt.instr.tables", copyByteArray(Note),
+                                 Note.size(),
+                                 /*Alignment=*/1,
+                                 /*IsReadOnly=*/true, ELF::SHT_NOTE);
+}
+
+/// Emit the data the instrumented binary needs at run time: the tables note,
+/// one zero-initialized 8-byte slot per counter, the 4-byte counter count and
+/// the NUL-terminated profile file name. Size statistics are printed to
+/// outs().
+void Instrumentation::emit(BinaryContext &BC, MCStreamer &Streamer) {
+  emitTablesAsELFNote(BC);
+
+  const auto SectionFlags = BinarySection::getFlags(/*IsReadOnly=*/false,
+                                                    /*IsText=*/false,
+                                                    /*IsAllocatable=*/true);
+  auto *CountersSection = BC.Ctx->getELFSection(".bolt.instr.counters",
+                                                ELF::SHT_PROGBITS,
+                                                SectionFlags);
+
+  // Symbols used by the instrumentation runtime library to dump the
+  // instrumentation data to disk.
+  // Start of the memory region holding the counters (Labels.size() slots of
+  // 8 bytes each).
+  MCSymbol *CountersSym = BC.Ctx->getOrCreateSymbol("__bolt_instr_locations");
+  // Location of the 4-byte number of counters.
+  MCSymbol *NumCountersSym =
+      BC.Ctx->getOrCreateSymbol("__bolt_instr_num_locs");
+  // File name the profile is going to be written to after the target binary
+  // finishes a run.
+  // NOTE(review): unlike the two symbols above, this one is not given the
+  // MCSA_Global attribute below - confirm this is intended.
+  MCSymbol *ProfilePathSym =
+      BC.Ctx->getOrCreateSymbol("__bolt_instr_filename");
+
+  const auto NumCounters = Labels.size();
+
+  Streamer.SwitchSection(CountersSection);
+
+  // Counter array: every counter label is followed by its 8-byte slot.
+  Streamer.EmitLabel(CountersSym);
+  Streamer.EmitSymbolAttribute(CountersSym, MCSymbolAttr::MCSA_Global);
+  for (const auto &CounterLabel : Labels) {
+    Streamer.EmitLabel(CounterLabel);
+    Streamer.emitFill(8, 0);
+  }
+
+  // Number of counters.
+  Streamer.EmitLabel(NumCountersSym);
+  Streamer.EmitSymbolAttribute(NumCountersSym, MCSymbolAttr::MCSA_Global);
+  Streamer.EmitIntValue(NumCounters, /*Size=*/4);
+
+  // Profile path followed by a terminating zero byte.
+  Streamer.EmitLabel(ProfilePathSym);
+  Streamer.EmitBytes(opts::InstrumentationFilename);
+  Streamer.emitFill(1, 0);
+
+  outs() << "BOLT-INSTRUMENTER: Total size of counters: "
+         << (NumCounters * 8) << " bytes (static alloc memory)\n";
+  outs() << "BOLT-INSTRUMENTER: Total size of string table emitted: "
+         << StringTable.size() << " bytes in file\n";
+  outs() << "BOLT-INSTRUMENTER: Total size of descriptors: "
+         << (NumCounters * 16) << " bytes in file\n";
+  outs() << "BOLT-INSTRUMENTER: Profile will be saved to file "
+         << opts::InstrumentationFilename << "\n";
+}
+
+}
+}
--- a/src/Passes/Instrumentation.h
+++ b/src/Passes/Instrumentation.h
@ -0,0 +1,128 @@
+//===--- Passes/Instrumentation.h -----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_INSTRUMENTATION_H
+#define LLVM_TOOLS_LLVM_BOLT_PASSES_INSTRUMENTATION_H
+
+#include "BinaryPasses.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+
+namespace llvm {
+namespace bolt {
+
+/// This is an instrumentation pass that modifies the input binary to generate
+/// a profile after execution finishes. It modifies branches to increment
+/// counters stored in the process memory and inserts a new function that
+/// dumps this data to an fdata file.
+///
+/// The runtime for instrumentation has a string table that holds function
+/// names. It also must include two data structures: the counter values being
+/// incremented after each instrumented branch and a description of these
+/// counters to be written in a file during dump. The description references
+/// string indices in the string table for function names, as well as function
+/// offsets locating branch source and destination. The counter values will be
+/// converted to decimal form when writing the dumped fdata.
+///
+/// OPPORTUNITIES ON PERFORMANCE
+/// This instrumentation is experimental and currently uses a naive approach
+/// where every branch is instrumented. This is not ideal for runtime
+/// performance, but should be good enough for us to evaluate/debug LBR profile
+/// quality against instrumentation. Hopefully we can make this more efficient
+/// in the future, but most optimizations here can cost a lot in BOLT processing
+/// time. Keep in mind the instrumentation pass runs on every single BB of the
+/// entire input binary, thus it is very expensive to do analyses, such as FLAGS
+/// liveness to avoid spilling flags on every branch, if the binary is large.
+///
+/// MISSING: instrumentation of indirect calls
+class Instrumentation {
+public:
+  Instrumentation() {}
+
+  /// Modifies all functions by inserting instrumentation code (first step)
+  void runOnFunctions(BinaryContext &BC);
+
+  /// Emit data structures that will be necessary during runtime (second step)
+  void emit(BinaryContext &BC, MCStreamer &Streamer);
+
+private:
+  // Instrumented branch location information
+  struct CounterDescription {
+    // Position in StringTable where the source function's name starts
+    uint32_t FromFuncStringIdx;
+    // Offset of the branch source, as passed to createDescription()
+    uint32_t FromOffset;
+    // Position in StringTable where the destination function's name starts
+    uint32_t ToFuncStringIdx;
+    // Offset of the branch destination, as passed to createDescription()
+    uint32_t ToOffset;
+  };
+
+  /// Retrieve the string table index for the name of \p Function. We encode
+  /// instrumented locations descriptions with the aid of a string table to
+  /// manage memory of the instrumentation runtime in a more efficient way.
+  /// If this function name is not represented in the string table yet, it will
+  /// be inserted and its index returned.
+  uint32_t getFunctionNameIndex(const BinaryFunction &Function);
+
+  /// Populate all information needed to identify an instrumented location:
+  /// branch source location in terms of function name plus offset, as well as
+  /// branch destination (also name + offset). This will be encoded in the
+  /// binary as static data and function name strings will reference a strtab.
+  CounterDescription createDescription(const BinaryFunction &FromFunction,
+                                       uint32_t From,
+                                       const BinaryFunction &ToFunction,
+                                       uint32_t To);
+
+
+  /// Create the sequence of instructions to instrument a branch happening
+  /// at \p FromFunction + \p FromOffset to \p ToFunc + \p ToOffset
+  std::vector<MCInst> createInstrumentationSnippet(BinaryFunction &FromFunction,
+                                                   uint32_t FromOffset,
+                                                   BinaryFunction &ToFunc,
+                                                   uint32_t ToOffset);
+
+  /// Instrument the branch in \p Iter located at \p FromFunction + \p From,
+  /// basic block \p FromBB. The destination of the branch is \p ToFunc +
+  /// \p ToOffset. \p TargetBB should be non-null if this is a local branch
+  /// and null if it is a call. Return true on success.
+  bool instrumentOneTarget(BinaryBasicBlock::iterator &Iter,
+                           BinaryFunction &FromFunction,
+                           BinaryBasicBlock &FromBB, uint32_t From,
+                           BinaryFunction &ToFunc, BinaryBasicBlock *TargetBB,
+                           uint32_t ToOffset);
+
+  /// Create a non-allocatable ELF section with read-only tables necessary for
+  /// writing the instrumented data profile during program finish. The runtime
+  /// library needs to open the program executable file and read this data from
+  /// disk, this is not loaded by the system.
+  void emitTablesAsELFNote(BinaryContext &BC);
+
+  /// Critical edges worklist
+  /// This worklist keeps track of CFG edges <From-To> that need to be split.
+  /// This task is deferred until we finish processing all BBs because we can't
+  /// modify the CFG while iterating over it. For each edge, \p SplitInstrs
+  /// stores the list of instrumentation instructions as a vector of MCInsts.
+  /// instrumentOneTarget() populates this, runOnFunctions() consumes.
+  std::vector<std::pair<BinaryBasicBlock *, BinaryBasicBlock *>> SplitWorklist;
+  std::vector<std::vector<MCInst>> SplitInstrs;
+
+  /// Stores function names, to be emitted to the runtime
+  std::string StringTable;
+
+  /// strtab indices in StringTable for each function name
+  std::unordered_map<const BinaryFunction *, uint32_t> FuncToStringIdx;
+
+  /// One description per counter; createInstrumentationSnippet() appends to
+  /// this vector and to \p Labels together, so entry i describes counter i.
+  std::vector<CounterDescription> Descriptions;
+
+  /// Identify all counters used in runtime while instrumentation is running
+  std::vector<MCSymbol *> Labels;
+};
+
+}
+}
+
+#endif
--- a/src/Passes/JTFootprintReduction.cpp
+++ b/src/Passes/JTFootprintReduction.cpp
@ -243,21 +243,17 @@ void JTFootprintReduction::optimizeFunction(BinaryContext &BC,
  }
 }

-void JTFootprintReduction::runOnFunctions(
-  BinaryContext &BC,
-  std::map<uint64_t, BinaryFunction> &BFs,
-  std::set<uint64_t> &LargeFunctions
-) {
+void JTFootprintReduction::runOnFunctions(BinaryContext &BC) {
  if (opts::JumpTables == JTS_BASIC && BC.HasRelocations)
    return;

  std::unique_ptr<RegAnalysis> RA;
  std::unique_ptr<BinaryFunctionCallGraph> CG;
  if (!opts::JTFootprintOnlyPIC) {
-    CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC, BFs)));
-    RA.reset(new RegAnalysis(BC, &BFs, &*CG));
+    CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC)));
+    RA.reset(new RegAnalysis(BC, &BC.getBinaryFunctions(), &*CG));
  }
-  for (auto &BFIt : BFs) {
+  for (auto &BFIt : BC.getBinaryFunctions()) {
    auto &Function = BFIt.second;

    if (!Function.isSimple() || !opts::shouldProcess(Function))
--- a/src/Passes/JTFootprintReduction.h
+++ b/src/Passes/JTFootprintReduction.h
@ -75,9 +75,7 @@ public:
  bool shouldPrint(const BinaryFunction &BF) const override {
    return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
  }
-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;
 };

 } // namespace bolt
--- a/src/Passes/LFenceInsertion.cpp
+++ b/src/Passes/LFenceInsertion.cpp
@ -8,11 +8,21 @@
 //===----------------------------------------------------------------------===//
 //
 // This class implements a pass that inserts LFENCE instructions before each
-// conditional branch to protect against Spectre Variant 1.
-// The performance impact of this is significant!
+// conditional branch to protect against Spectre Variant 1, as well as the
+// various LVI mitigations.
+//
+// The runtime performance impact of this is significant!
+//
+// NOTE: This pass is incompatible with RetpolineInsertion. It is also
+//       incompatible with ABIs that allow red-zones, due to the
+//       flags-preserving jmp mitigation clobbering 8 bytes in the red-zone.
+//       Options are to disable red-zone when compiling the target binary,
+//       or configure the compilers to never generate memory-indirect jmps.
 //===----------------------------------------------------------------------===//
 #include "LFenceInsertion.h"
 #include "RewriteInstance.h"
+#include "RetpolineInsertion.h" //IndirectBranchInfo
+#include "ParallelUtilities.h"
 #include "llvm/Support/raw_ostream.h"

 #define DEBUG_TYPE "bolt-lfence"
@ -20,6 +30,7 @@
 using namespace llvm;
 using namespace bolt;
 namespace opts {
+extern bool shouldProcess(const bolt::BinaryFunction &Function);

 extern cl::OptionCategory BoltCategory;

@ -30,14 +41,53 @@ InsertLFences("insert-lfences",
  cl::ZeroOrMore,
  cl::cat(BoltCategory));

+// Hidden per-mitigation switches, all enabled by default. Each one gates the
+// corresponding rewrite in LFenceInsertion::runOnFunctions().
+
+// Gates insertion of an lfence before conditional branches.
+llvm::cl::opt<bool>
+LFenceConditionalBranches("lfence-conditional-branches",
+  cl::desc("determine if all conditional branches should be lfence mitigated"),
+  cl::init(true),
+  cl::ZeroOrMore,
+  cl::Hidden,
+  cl::cat(BoltCategory));
+// Gates insertion of an lfence after loads from memory.
+llvm::cl::opt<bool>
+LFenceLoads("lfence-loads",
+  cl::desc("determine if all loads should be lfence mitigated"),
+  cl::init(true),
+  cl::ZeroOrMore,
+  cl::Hidden,
+  cl::cat(BoltCategory));
+// Gates the dummy-write + lfence sequence placed before returns.
+llvm::cl::opt<bool>
+LFenceReturns("lfence-returns",
+  cl::desc("determine if all returns should be lfence mitigated"),
+  cl::init(true),
+  cl::ZeroOrMore,
+  cl::Hidden,
+  cl::cat(BoltCategory));
+// Gates the rewrite of memory-indirect calls.
+llvm::cl::opt<bool>
+LFenceIndirectCalls("lfence-indirect-calls",
+  cl::desc("determine if all indirect calls should be lfence mitigated"),
+  cl::init(true),
+  cl::ZeroOrMore,
+  cl::Hidden,
+  cl::cat(BoltCategory));
+// Gates the rewrite of memory-indirect jumps into push/ret.
+llvm::cl::opt<bool>
+LFenceIndirectJumps("lfence-indirect-jumps",
+  cl::desc("determine if all indirect jumps should be lfence mitigated"),
+  cl::init(true),
+  cl::ZeroOrMore,
+  cl::Hidden,
+  cl::cat(BoltCategory));
+
 } // namespace opts

 namespace llvm {
 namespace bolt {

-void LFenceInsertion::runOnFunctions(BinaryContext &BC,
-                                     std::map<uint64_t, BinaryFunction> &BFs,
-                                     std::set<uint64_t> &LargeFunctions) {
+/// Print the red-zone diagnostic to errs() and terminate the process with
+/// exit status 1. Never returns.
+static void report_redzone_error() {
+  static const char Diagnostic[] =
+      "BOLT-ERROR: 'Redzone access in function with indirect jmp mitigation'\n";
+  errs() << Diagnostic;
+  exit(1);
+}
+
+void LFenceInsertion::runOnFunctions(BinaryContext &BC) {

  if (!opts::InsertLFences)
    return;
@ -49,25 +99,234 @@ void LFenceInsertion::runOnFunctions(BinaryContext &BC,

  auto &MIB = *BC.MIB;
  uint32_t LFencedBranches = 0;
-  for (auto &It : BFs) {
+  uint32_t LFencedLoads = 0;
+  uint32_t LFencedRets = 0;
+  uint32_t LFencedIndirectCalls = 0;
+  uint32_t LFencedIndirectJmps = 0;
+  for (auto &It : BC.getBinaryFunctions()) {
    auto &Function = It.second;
+    bool MemIndirectJmp = false;
+    bool Redzone = false;
+
+    // For performance reasons, we may want to skip some functions and
+    // manually add lfences to them only where absolutely needed.
+    if (!opts::shouldProcess(Function))
+      continue;
+
    for (auto &BB : Function) {
+      bool LastWasLFence = false;
      for (auto It = BB.begin(); It != BB.end(); ++It) {
        auto &Inst = *It;

-        if (!MIB.isConditionalBranch(Inst))
-          continue;
+        if (MIB.isActualLoad(Inst) && MIB.isBranchOnMem(Inst)) {
+          IndirectBranchInfo BrInfo(Inst, MIB);
+          const auto &MemRef = BrInfo.Memory;

-        MCInst LFence;
-        MIB.createLfence(LFence);
-        It = BB.insertInstruction(It, std::move(LFence));
-        ++It;
-        LFencedBranches++;
+          if (MemRef.BaseRegNum == MIB.getStackPointer() &&
+              MemRef.DispValue < 0) {
+            if (MemIndirectJmp) {
+              report_redzone_error();
+            }
+            Redzone = true;
+          }
+        }
+
+        if (opts::LFenceConditionalBranches &&
+            MIB.isConditionalBranch(Inst)) {
+          // Inserts a lfence before every conditional branch.
+          // For example:
+          //   cmp %reg1, %reg2
+          //   je <jump_dest>
+          // gets rewritten to:
+          //   cmp %reg1, %reg2
+          //   lfence
+          //   je <jump_dest>
+          if (!LastWasLFence) {
+            MCInst LFence;
+            MIB.createLfence(LFence);
+            It = BB.insertInstruction(It, std::move(LFence));
+            ++It;
+          }
+          LFencedBranches++;
+          LastWasLFence = false;
+        } else if (opts::LFenceLoads &&
+                   MIB.isActualLoad(Inst) &&
+                   !MIB.isReturn(Inst) &&
+                   !MIB.isIndirectBranch(Inst) &&
+                   !MIB.isIndirectCall(Inst)) {
+          // Inserts an lfence after every load from memory.
+          // For example:
+          //   mov    0x8(%rbx), %rdi
+          // Gets rewritten to:
+          //   mov    0x8(%rbx), %rdi
+          //   lfence
+          ++It;
+          MCInst LFence;
+          MIB.createLfence(LFence);
+          It = BB.insertInstruction(It, std::move(LFence));
+          LFencedLoads++;
+          LastWasLFence = true;
+        } else if (opts::LFenceReturns &&
+                   MIB.isReturn(Inst) && !MIB.isIndirectBranch(Inst)) {
+          // Inserts a dummy write + lfence before every ret.
+          // For example:
+          //   retq
+          // gets rewritten to:
+          //   shlq $0, (%rsp)
+          //   lfence
+          //   retq
+          MCInst Shlq;
+          MIB.createShl(Shlq, MIB.getStackPointer(), 1, MIB.getNoRegister(), 0, nullptr,
+                        MIB.getNoRegister(), 0, 8);
+          It = BB.insertInstruction(It, std::move(Shlq));
+          ++It;
+          MCInst LFence;
+          MIB.createLfence(LFence);
+          It = BB.insertInstruction(It, std::move(LFence));
+          ++It;
+          LFencedRets++;
+          LastWasLFence = false;
+        } else if (opts::LFenceIndirectCalls &&
+                   MIB.isIndirectCall(Inst) && MIB.isLoad(Inst) && !MIB.isIndirectBranch(Inst)) {
+          // Translates indirect calls into lea/mov/jmp then applies the jmp mitigation.
+          // For example:
+          //   callq *(%rsi)
+          // gets rewritten to:
+          //   pushq %rdi //Dummy to overwrite later
+          //   pushq %rdi
+          //   leaq 0x18(%rip), %rdi //After the retq
+          //   movq %rdi, 8(%rsp) //Overwrite the dummy
+          //   popq %rdi
+          //   lfence
+          //   pushq (%rsi)
+          //   lfence //XXX Not needed, according to Intel?
+          //   shlq $0, (%rsp)
+          //   lfence
+          //   retq
+          IndirectBranchInfo BrInfo(Inst, MIB);
+          const auto &MemRef = BrInfo.Memory;
+          auto *Ctx = BC.Ctx.get();
+          assert(BrInfo.isMem());
+
+          // Create a separate MCCodeEmitter to allow lock-free execution
+          BinaryContext::IndependentCodeEmitter Emitter;
+          if (!opts::NoThreads) {
+            Emitter = BC.createIndependentMCCodeEmitter();
+          }
+
+          int offset = 0x15 + BC.computeCodeSize(It, std::next(It), Emitter.MCE.get());
+
+          MCPhysReg ScratchReg = MIB.getIntArgRegister(0);
+          MCInst Pushq1; //Dummy, to overwrite later.
+          MIB.createPushRegister(Pushq1, ScratchReg, 8);
+          It = BB.insertInstruction(It, std::move(Pushq1));
+          ++It;
+          MCInst Pushq2;
+          MIB.createPushRegister(Pushq2, ScratchReg, 8);
+          It = BB.insertInstruction(It, std::move(Pushq2));
+          ++It;
+          MCInst Leaq;
+          MIB.createLea(Leaq, MIB.getInstructionPointer(), 1, MIB.getNoRegister(),
+                        offset, nullptr, MIB.getNoRegister(), ScratchReg, 8);
+          It = BB.insertInstruction(It, std::move(Leaq));
+          ++It;
+          MCInst Movq;
+          MIB.createSaveToStack(Movq, MIB.getStackPointer(), 8, ScratchReg, 8);
+          It = BB.insertInstruction(It, std::move(Movq));
+          ++It;
+          MCInst Popq;
+          MIB.createPopRegister(Popq, ScratchReg, 8);
+          It = BB.insertInstruction(It, std::move(Popq));
+          ++It;
+          MCInst LFence1;
+          MIB.createLfence(LFence1);
+          It = BB.insertInstruction(It, std::move(LFence1));
+          ++It;
+          MCInst Pushq3;
+          MIB.createPushRegisterIndirect(Pushq3, MemRef.BaseRegNum, MemRef.ScaleValue,
+                                         MemRef.IndexRegNum, MemRef.DispValue, MemRef.DispExpr,
+                                         MemRef.SegRegNum, 8);
+          It = BB.insertInstruction(It, std::move(Pushq3));
+          ++It;
+          MCInst LFence2;
+          MIB.createLfence(LFence2);
+          It = BB.insertInstruction(It, std::move(LFence2));
+          ++It;
+          MCInst Shlq;
+          MIB.createShl(Shlq, MIB.getStackPointer(), 1, MIB.getNoRegister(), 0, nullptr,
+                        MIB.getNoRegister(), 0, 8);
+          It = BB.insertInstruction(It, std::move(Shlq));
+          ++It;
+          MCInst LFence3;
+          MIB.createLfence(LFence3);
+          It = BB.insertInstruction(It, std::move(LFence3));
+          ++It;
+          MCInst Retq;
+          MIB.createReturn(Retq);
+          BB.replaceInstruction(It, std::vector<MCInst>({Retq}));
+          LFencedIndirectCalls++;
+          LastWasLFence = false;
+        } else if (opts::LFenceIndirectJumps &&
+                   MIB.isIndirectBranch(Inst) && MIB.isLoad(Inst)) {
+          // Maps indirect jumps to "push; ret", then applies ret mitigation.
+          // For example:
+          //   jmpq *(%rsi)
+          // gets rewritten to:
+          //   pushq (%rsi)
+          //   lfence //XXX Not needed, according to Intel?
+          //   shlq $0, (%rsp)
+          //   lfence
+          //   retq
+
+          // Since this mitigation clobbers the redzone, we need to make
+          // sure that this function never uses it.
+          if (Redzone) {
+            report_redzone_error();
+          }
+          MemIndirectJmp = true;
+
+          IndirectBranchInfo BrInfo(Inst, MIB);
+          const auto &MemRef = BrInfo.Memory;
+
+          MCInst Push;
+          MIB.createPushRegisterIndirect(Push, MemRef.BaseRegNum, MemRef.ScaleValue,
+                                         MemRef.IndexRegNum, MemRef.DispValue, MemRef.DispExpr,
+                                         MemRef.SegRegNum, 8);
+          It = BB.insertInstruction(It, std::move(Push));
+          ++It;
+          MCInst LFence1;
+          MIB.createLfence(LFence1);
+          It = BB.insertInstruction(It, std::move(LFence1));
+          ++It;
+          MCInst Shlq;
+          MIB.createShl(Shlq, MIB.getStackPointer(), 1, MIB.getNoRegister(), 0, nullptr,
+                        MIB.getNoRegister(), 0, 8);
+          It = BB.insertInstruction(It, std::move(Shlq));
+          ++It;
+          MCInst LFence2;
+          MIB.createLfence(LFence2);
+          It = BB.insertInstruction(It, std::move(LFence2));
+          ++It;
+          MCInst Retq;
+          MIB.createReturn(Retq);
+          BB.replaceInstruction(It, std::vector<MCInst>({Retq}));
+          LFencedIndirectJmps++;
+          LastWasLFence = false;
+        } else if (MIB.isLfence(Inst)) {
+          LastWasLFence = true;
+        } else {
+          LastWasLFence = false;
+        }
      }
    }
  }
-  outs() << "\nBOLT-INFO: The number of lfenced branches is : " << LFencedBranches
-         << "\n";
+
+  outs() << "\nBOLT-INFO: The number of lfenced branches is : " << LFencedBranches;
+  outs() << "\nBOLT-INFO: The number of lfenced loads is : " << LFencedLoads;
+  outs() << "\nBOLT-INFO: The number of lfenced rets is : " << LFencedRets;
+  outs() << "\nBOLT-INFO: The number of lfenced indirect calls is : " << LFencedIndirectCalls;
+  outs() << "\nBOLT-INFO: The number of lfenced indirect jmps is : " << LFencedIndirectJmps
+         << "\n\n";
 }

 } // namespace bolt
--- a/src/Passes/LFenceInsertion.h
+++ b/src/Passes/LFenceInsertion.h
@ -28,9 +28,7 @@ public:

  const char *getName() const override { return "lfence-insertion"; }

-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;
 };
 } // namespace bolt
 } // namespace llvm
--- a/src/Passes/LivenessAnalysis.h
+++ b/src/Passes/LivenessAnalysis.h
@ -38,8 +38,8 @@ class LivenessAnalysis

 public:
  LivenessAnalysis(const RegAnalysis &RA, const BinaryContext &BC,
-                   BinaryFunction &BF)
-      : Parent(BC, BF), RA(RA), NumRegs(BC.MRI->getNumRegs()) {}
+                   BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId)
+      : Parent(BC, BF, AllocId), RA(RA), NumRegs(BC.MRI->getNumRegs()) {}
  virtual ~LivenessAnalysis();

  bool isAlive(ProgramPoint PP, MCPhysReg Reg) const {
@ -50,8 +50,6 @@ public:
  }

  void run() {
-    NamedRegionTimer T1("LA", "Liveness Analysis", "Dataflow", "Dataflow",
-                        opts::TimeOpts);
    Parent::run();
  }

--- a/src/Passes/LongJmp.cpp
+++ b/src/Passes/LongJmp.cpp
@ -84,7 +84,7 @@ LongJmpPass::createNewStub(BinaryBasicBlock &SourceBB, const MCSymbol *TgtSym,
  MCInst Inst;
  BC.MIB->createUncondBranch(Inst, TgtSym, BC.Ctx.get());
  if (TgtIsFunc)
-    BC.MIB->convertJmpToTailCall(Inst, BC.Ctx.get());
+    BC.MIB->convertJmpToTailCall(Inst);
  StubBB->addInstruction(Inst);
  StubBB->setExecutionCount(0);

@ -427,9 +427,9 @@ uint64_t LongJmpPass::getSymbolAddress(const BinaryContext &BC,
  if (Iter == HotAddresses.end()) {
    // Look at BinaryContext's resolution for this symbol - this is a symbol not
    // mapped to a BinaryFunction
-    auto *BD = BC.getBinaryDataByName(Target->getName());
-    assert(BD && "Unrecognized symbol");
-    return BD ? BD->getAddress() : 0;
+    auto ValueOrError = BC.getSymbolValue(*Target);
+    assert(ValueOrError && "Unrecognized symbol");
+    return *ValueOrError;
  }
  return Iter->second;
 }
@ -595,11 +595,9 @@ bool LongJmpPass::relax(BinaryFunction &Func) {
  return Modified;
 }

-void LongJmpPass::runOnFunctions(BinaryContext &BC,
-                                 std::map<uint64_t, BinaryFunction> &BFs,
-                                 std::set<uint64_t> &LargeFunctions) {
+void LongJmpPass::runOnFunctions(BinaryContext &BC) {
  outs() << "BOLT-INFO: Starting stub-insertion pass\n";
-  auto Sorted = BinaryContext::getSortedFunctions(BFs);
+  auto Sorted = BC.getSortedFunctions();
  bool Modified;
  uint32_t Iterations{0};
  do {
--- a/src/Passes/LongJmp.h
+++ b/src/Passes/LongJmp.h
@ -150,9 +150,7 @@ public:

  const char *getName() const override { return "long-jmp"; }

-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;
 };
 }
 }
--- a/src/Passes/MCF.cpp
+++ b/src/Passes/MCF.cpp
@ -2460,12 +2460,10 @@ void solveMCF(BinaryFunction &BF, MCFCostFunction CostFunction) {
      }
    };

-    size_t CurEdgeNum{0};
    auto Next = std::next(BBI);
    for (auto Succ : BB.successors()) {
      int IsFT = (Next != E && Succ == *Next) ? 1 : 0;
      AddSuccArc(Succ, BI->Count, IsFT);
-      ++CurEdgeNum;
      ++BI;
    }

--- a/src/Passes/PLTCall.cpp
+++ b/src/Passes/PLTCall.cpp
@ -43,15 +43,12 @@ PLT("plt",
 namespace llvm {
 namespace bolt {

-void PLTCall::runOnFunctions(
-    BinaryContext &BC,
-    std::map<uint64_t, BinaryFunction> &BFs,
-    std::set<uint64_t> &) {
+void PLTCall::runOnFunctions(BinaryContext &BC) {
  if (opts::PLT == OT_NONE)
    return;

  uint64_t NumCallsOptimized = 0;
-  for (auto &It : BFs) {
+  for (auto &It : BC.getBinaryFunctions()) {
    auto &Function = It.second;
    if (!shouldOptimize(Function))
      continue;
--- a/src/Passes/PLTCall.h
+++ b/src/Passes/PLTCall.h
@ -38,9 +38,7 @@ public:
  bool shouldPrint(const BinaryFunction &BF) const override {
    return BinaryFunctionPass::shouldPrint(BF);
 }
-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;
 };

 } // namespace bolt
--- a/src/Passes/ReachingDefOrUse.h
+++ b/src/Passes/ReachingDefOrUse.h
@ -36,9 +36,10 @@ class ReachingDefOrUse

 public:
  ReachingDefOrUse(const RegAnalysis &RA, const BinaryContext &BC,
-                   BinaryFunction &BF, Optional<MCPhysReg> TrackingReg = None)
-      : InstrsDataflowAnalysis<ReachingDefOrUse<Def>, !Def>(BC, BF), RA(RA),
-        TrackingReg(TrackingReg) {}
+                   BinaryFunction &BF, Optional<MCPhysReg> TrackingReg = None,
+                   MCPlusBuilder::AllocatorIdTy AllocId = 0)
+      : InstrsDataflowAnalysis<ReachingDefOrUse<Def>, !Def>(BC, BF, AllocId),
+        RA(RA), TrackingReg(TrackingReg) {}
  virtual ~ReachingDefOrUse() {}

  bool isReachedBy(MCPhysReg Reg, ExprIterator Candidates) {
@ -60,8 +61,6 @@ public:
  }

  void run() {
-    NamedRegionTimer T1("RD", "Reaching Defs", "Dataflow", "Dataflow",
-                        opts::TimeOpts);
    InstrsDataflowAnalysis<ReachingDefOrUse<Def>, !Def>::run();
  }

--- a/src/Passes/ReachingInsns.h
+++ b/src/Passes/ReachingInsns.h
@ -29,8 +29,9 @@ class ReachingInsns
  friend class DataflowAnalysis<ReachingInsns<Backward>, BitVector, Backward>;

 public:
-  ReachingInsns(const BinaryContext &BC, BinaryFunction &BF)
-      : InstrsDataflowAnalysis<ReachingInsns, Backward>(BC, BF) {}
+  ReachingInsns(const BinaryContext &BC, BinaryFunction &BF,
+                MCPlusBuilder::AllocatorIdTy AllocId = 0)
+      : InstrsDataflowAnalysis<ReachingInsns, Backward>(BC, BF, AllocId) {}
  virtual ~ReachingInsns() {}

  bool isInLoop(const BinaryBasicBlock &BB) {
@ -46,8 +47,6 @@ public:
  }

  void run() {
-    NamedRegionTimer T1("RI", "Reaching Insns", "Dataflow", "Dataflow",
-                        opts::TimeOpts);
    InstrsDataflowAnalysis<ReachingInsns<Backward>, Backward>::run();
  }

--- a/src/Passes/RegAnalysis.h
+++ b/src/Passes/RegAnalysis.h
@ -36,7 +36,8 @@ public:
  /// set of clobbered registers.
  BitVector getFunctionClobberList(const BinaryFunction *Func);

-  RegAnalysis(BinaryContext &BC, std::map<uint64_t, BinaryFunction> *BFs,
+  RegAnalysis(BinaryContext &BC,
+              std::map<uint64_t, BinaryFunction> *BFs,
              BinaryFunctionCallGraph *CG);

  /// Compute the set of registers \p Inst may read from, marking them in
--- a/src/Passes/RegReAssign.cpp
+++ b/src/Passes/RegReAssign.cpp
@ -339,7 +339,7 @@ bool RegReAssign::conservativePassOverFunction(BinaryContext &BC,
 void RegReAssign::setupAggressivePass(BinaryContext &BC,
                                     std::map<uint64_t, BinaryFunction> &BFs) {
  setupConservativePass(BC, BFs);
-  CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC, BFs)));
+  CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC)));
  RA.reset(new RegAnalysis(BC, &BFs, &*CG));

  GPRegs = BitVector(BC.MRI->getNumRegs(), false);
@ -380,18 +380,16 @@ void RegReAssign::setupConservativePass(
  });
 }

-void RegReAssign::runOnFunctions(BinaryContext &BC,
-                                std::map<uint64_t, BinaryFunction> &BFs,
-                                std::set<uint64_t> &LargeFunctions) {
+void RegReAssign::runOnFunctions(BinaryContext &BC) {
  RegScore = std::vector<int64_t>(BC.MRI->getNumRegs(), 0);
  RankedRegs = std::vector<size_t>(BC.MRI->getNumRegs(), 0);

  if (opts::AggressiveReAssign)
-    setupAggressivePass(BC, BFs);
+    setupAggressivePass(BC, BC.getBinaryFunctions());
  else
-    setupConservativePass(BC, BFs);
+    setupConservativePass(BC, BC.getBinaryFunctions());

-  for (auto &I : BFs) {
+  for (auto &I : BC.getBinaryFunctions()) {
    auto &Function = I.second;

    if (!Function.isSimple() || !opts::shouldProcess(Function))
--- a/src/Passes/RegReAssign.h
+++ b/src/Passes/RegReAssign.h
@ -58,9 +58,7 @@ public:
    return BinaryFunctionPass::shouldPrint(BF) && FuncsChanged.count(&BF) > 0;
  }

-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;
 };
 }
 }
--- a/src/Passes/ReorderAlgorithm.cpp
+++ b/src/Passes/ReorderAlgorithm.cpp
@ -27,6 +27,7 @@ using namespace bolt;
 namespace opts {

 extern cl::OptionCategory BoltOptCategory;
+extern cl::opt<bool> NoThreads;

 static cl::opt<bool>
 PrintClusters("print-clusters",
@ -65,7 +66,13 @@ struct HashPair {

 }

-void ClusterAlgorithm::computeClusterAverageFrequency() {
+void ClusterAlgorithm::computeClusterAverageFrequency(const BinaryContext &BC) {
+  // Create a separate MCCodeEmitter to allow lock-free execution
+  BinaryContext::IndependentCodeEmitter Emitter;
+  if (!opts::NoThreads) {
+    Emitter = BC.createIndependentMCCodeEmitter();
+  }
+
  AvgFreq.resize(Clusters.size(), 0.0);
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
    double Freq = 0.0;
@ -75,7 +82,7 @@ void ClusterAlgorithm::computeClusterAverageFrequency() {
        Freq += BB->getExecutionCount();
        // Estimate the size of a block in bytes at run time
        // NOTE: This might be inaccurate
-        ClusterSize += BB->estimateSize();
+        ClusterSize += BB->estimateSize(Emitter.MCE.get());
      }
    }
    AvgFreq[I] = ClusterSize == 0 ? 0 : Freq / ClusterSize;
@ -525,7 +532,7 @@ void OptimizeBranchReorderAlgorithm::reorderBasicBlocks(
  auto &ClusterEdges = CAlgo->ClusterEdges;

  // Compute clusters' average frequencies.
-  CAlgo->computeClusterAverageFrequency();
+  CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
  std::vector<double> &AvgFreq = CAlgo->AvgFreq;

  if (opts::PrintClusters)
@ -627,7 +634,7 @@ void OptimizeCacheReorderAlgorithm::reorderBasicBlocks(
  std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;

  // Compute clusters' average frequencies.
-  CAlgo->computeClusterAverageFrequency();
+  CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
  std::vector<double> &AvgFreq = CAlgo->AvgFreq;

  if (opts::PrintClusters)
--- a/src/Passes/ReorderAlgorithm.h
+++ b/src/Passes/ReorderAlgorithm.h
@ -53,7 +53,7 @@ public:
  /// the sum of average frequencies of its blocks (execution count / # instrs).
  /// The average frequencies are stored in the AvgFreq vector, index by the
  /// cluster indices in the Clusters vector.
-  void computeClusterAverageFrequency();
+  void computeClusterAverageFrequency(const BinaryContext &BC);

  /// Clear clusters and related info.
  virtual void reset();
--- a/src/Passes/ReorderData.cpp
+++ b/src/Passes/ReorderData.cpp
@ -379,9 +379,7 @@ bool ReorderData::markUnmoveableSymbols(BinaryContext &BC,
  return FoundUnmoveable;
 }

-void ReorderData::runOnFunctions(BinaryContext &BC,
-                                 std::map<uint64_t, BinaryFunction> &BFs,
-                                 std::set<uint64_t> &LargeFunctions) {
+void ReorderData::runOnFunctions(BinaryContext &BC) {
  static const char* DefaultSections[] = {
    ".rodata",
    ".data",
@ -435,7 +433,8 @@ void ReorderData::runOnFunctions(BinaryContext &BC,
      std::tie(Order, SplitPointIdx) = sortedByCount(BC, *Section);
    } else {
      outs() << "BOLT-INFO: reorder-sections: ordering data by funcs\n";
-      std::tie(Order, SplitPointIdx) = sortedByFunc(BC, *Section, BFs);
+      std::tie(Order, SplitPointIdx) =
+        sortedByFunc(BC, *Section, BC.getBinaryFunctions());
    }
    auto SplitPoint = Order.begin() + SplitPointIdx;

--- a/src/Passes/ReorderData.h
+++ b/src/Passes/ReorderData.h
@ -57,9 +57,7 @@ public:
    return "reorder-data";
  }

-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;
 };

 } // namespace bolt
--- a/src/Passes/ReorderFunctions.cpp
+++ b/src/Passes/ReorderFunctions.cpp
@ -276,21 +276,13 @@ std::vector<std::string> readFunctionOrderFile() {

 }

-void ReorderFunctions::runOnFunctions(BinaryContext &BC,
-                                      std::map<uint64_t, BinaryFunction> &BFs,
-                                      std::set<uint64_t> &LargeFunctions) {
-  if (!BC.HasRelocations && opts::ReorderFunctions != RT_NONE) {
-    errs() << "BOLT-ERROR: Function reordering only works when "
-           << "relocs are enabled.\n";
-    exit(1);
-  }
-
+void ReorderFunctions::runOnFunctions(BinaryContext &BC) {
+  auto &BFs = BC.getBinaryFunctions();
  if (opts::ReorderFunctions != RT_NONE &&
      opts::ReorderFunctions != RT_EXEC_COUNT &&
      opts::ReorderFunctions != RT_USER) {
    Cg = buildCallGraph(BC,
-                        BFs,
-                        [this](const BinaryFunction &BF) {
+                        [](const BinaryFunction &BF) {
                          if (!BF.hasProfile())
                            return true;
                          if (BF.getState() != BinaryFunction::State::CFG)
--- a/src/Passes/ReorderFunctions.h
+++ b/src/Passes/ReorderFunctions.h
@ -41,9 +41,7 @@ public:
  const char *getName() const override {
    return "reorder-functions";
  }
-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;
 };

 } // namespace bolt
--- a/src/Passes/ReorderUtils.h
+++ b/src/Passes/ReorderUtils.h
@ -106,6 +106,49 @@ private:
  BitVector Valid;
 };

+// Caches one value of type ValueType for every ordered pair of Clusters. The
+// values live in flat Size x Size tables addressed by
+// First->id() * Size + Second->id().
+// It can invalidate all cache entries associated with a given Cluster.
+// The functions set, get and contains are thread safe when called with
+// distinct keys.
+//
+// Cluster must provide id(); ids are expected to be smaller than the Size
+// passed to the constructor and are not range-checked here.
+template <typename Cluster, typename ValueType>
+class ClusterPairCacheThreadSafe {
+public:
+  // Creates a cache for all pairs among Size clusters; every entry starts out
+  // invalid.
+  explicit ClusterPairCacheThreadSafe(size_t Size)
+      : Size(Size), Cache(Size * Size), Valid(Size * Size, 0) {}
+
+  // Returns true if a value is currently cached for (First, Second).
+  bool contains(const Cluster *First, const Cluster *Second) const {
+    return Valid[index(First, Second)] != 0;
+  }
+
+  // Returns the value cached for (First, Second). The entry must be present;
+  // this is only checked by an assertion.
+  ValueType get(const Cluster *First, const Cluster *Second) const {
+    assert(contains(First, Second));
+    return Cache[index(First, Second)];
+  }
+
+  // Stores Value for (First, Second) and marks the entry as valid.
+  void set(const Cluster *First, const Cluster *Second, ValueType Value) {
+    const auto Index = index(First, Second);
+    Cache[Index] = Value;
+    Valid[Index] = 1;
+  }
+
+  // Drops every entry in which C appears as either element of the pair.
+  void invalidate(const Cluster *C) {
+    const size_t Id = C->id();
+
+    // Entries where C is the first element of the pair.
+    for (size_t Idx = Id * Size; Idx < (Id + 1) * Size; ++Idx)
+      Valid[Idx] = 0;
+
+    // Entries where C is the second element of the pair.
+    for (size_t Row = 0; Row < Size; ++Row)
+      Valid[(Row * Size) + Id] = 0;
+  }
+
+private:
+  size_t Size;
+  std::vector<ValueType> Cache;
+  // One flag byte per cache entry (non-zero means the entry is valid). The
+  // flags are kept independent of ValueType so the cache works for any value
+  // type. std::vector<bool> is deliberately avoided: it may pack several
+  // elements into one word, which would make concurrent writes to distinct
+  // entries unsafe.
+  std::vector<char> Valid;
+
+  // Maps a pair of clusters to its position in the flat tables.
+  size_t index(const Cluster *First, const Cluster *Second) const {
+    return (First->id() * Size) + Second->id();
+  }
+};
+
 } // namespace bolt
 } // namespace llvm

--- a/src/Passes/RetpolineInsertion.cpp
+++ b/src/Passes/RetpolineInsertion.cpp
@ -138,9 +138,10 @@ BinaryFunction *createNewRetpoline(BinaryContext &BC,
      BB2.addInstruction(PushR11);

      MCInst LoadCalleeAddrs;
-      MIB.createLoad(LoadCalleeAddrs, BrInfo.BaseRegNum, BrInfo.ScaleValue,
-                     BrInfo.IndexRegNum, BrInfo.DispValue, BrInfo.DispExpr,
-                     BrInfo.SegRegNum, MIB.getX86R11(), 8);
+      const auto &MemRef = BrInfo.Memory;
+      MIB.createLoad(LoadCalleeAddrs, MemRef.BaseRegNum, MemRef.ScaleValue,
+                     MemRef.IndexRegNum, MemRef.DispValue, MemRef.DispExpr,
+                     MemRef.SegRegNum, MIB.getX86R11(), 8);

      BB2.addInstruction(LoadCalleeAddrs);

@ -186,27 +187,29 @@ std::string createRetpolineFunctionTag(BinaryContext &BC,

  std::string Tag = "__retpoline_mem_";

+  const auto &MemRef = BrInfo.Memory;
+
  std::string DispExprStr;
-  if (BrInfo.DispExpr) {
+  if (MemRef.DispExpr) {
    llvm::raw_string_ostream Ostream(DispExprStr);
-    BrInfo.DispExpr->print(Ostream, BC.AsmInfo.get());
+    MemRef.DispExpr->print(Ostream, BC.AsmInfo.get());
    Ostream.flush();
  }

-  Tag += BrInfo.BaseRegNum != BC.MIB->getX86NoRegister()
-             ? "r" + to_string(BrInfo.BaseRegNum)
+  Tag += MemRef.BaseRegNum != BC.MIB->getNoRegister()
+             ? "r" + to_string(MemRef.BaseRegNum)
             : "";

  Tag +=
-      BrInfo.DispExpr ? "+" + DispExprStr : "+" + to_string(BrInfo.DispValue);
+      MemRef.DispExpr ? "+" + DispExprStr : "+" + to_string(MemRef.DispValue);

-  Tag += BrInfo.IndexRegNum != BC.MIB->getX86NoRegister()
-             ? "+" + to_string(BrInfo.ScaleValue) + "*" +
-                   to_string(BrInfo.IndexRegNum)
+  Tag += MemRef.IndexRegNum != BC.MIB->getNoRegister()
+             ? "+" + to_string(MemRef.ScaleValue) + "*" +
+                   to_string(MemRef.IndexRegNum)
             : "";

-  Tag += BrInfo.SegRegNum != BC.MIB->getX86NoRegister()
-             ? "_seg_" + to_string(BrInfo.SegRegNum)
+  Tag += MemRef.SegRegNum != BC.MIB->getNoRegister()
+             ? "_seg_" + to_string(MemRef.SegRegNum)
             : "";

  return Tag;
@ -232,10 +235,11 @@ void createBranchReplacement(BinaryContext &BC,
  auto &MIB = *BC.MIB;
  // Load the branch address in r11 if available
  if (BrInfo.isMem() && R11Available) {
+    const auto &MemRef = BrInfo.Memory;
    MCInst LoadCalleeAddrs;
-    MIB.createLoad(LoadCalleeAddrs, BrInfo.BaseRegNum, BrInfo.ScaleValue,
-                   BrInfo.IndexRegNum, BrInfo.DispValue, BrInfo.DispExpr,
-                   BrInfo.SegRegNum, MIB.getX86R11(), 8);
+    MIB.createLoad(LoadCalleeAddrs, MemRef.BaseRegNum, MemRef.ScaleValue,
+                   MemRef.IndexRegNum, MemRef.DispValue, MemRef.DispExpr,
+                   MemRef.SegRegNum, MIB.getX86R11(), 8);
    Replacement.push_back(LoadCalleeAddrs);
  }

@ -255,9 +259,10 @@ IndirectBranchInfo::IndirectBranchInfo(MCInst &Inst, MCPlusBuilder &MIB) {

  if (MIB.isBranchOnMem(Inst)) {
    IsMem = true;
-    if (!MIB.evaluateX86MemoryOperand(Inst, &BaseRegNum, &ScaleValue,
-                                      &IndexRegNum, &DispValue, &SegRegNum,
-                                      &DispExpr)) {
+    if (!MIB.evaluateX86MemoryOperand(Inst, &Memory.BaseRegNum,
+                                      &Memory.ScaleValue,
+                                      &Memory.IndexRegNum, &Memory.DispValue,
+                                      &Memory.SegRegNum, &Memory.DispExpr)) {
      llvm_unreachable("not expected");
    }
  } else if (MIB.isBranchOnReg(Inst)) {
@ -268,10 +273,7 @@ IndirectBranchInfo::IndirectBranchInfo(MCInst &Inst, MCPlusBuilder &MIB) {
  }
 }

-void RetpolineInsertion::runOnFunctions(BinaryContext &BC,
-                                        std::map<uint64_t, BinaryFunction> &BFs,
-                                        std::set<uint64_t> &LargeFunctions) {
-
+void RetpolineInsertion::runOnFunctions(BinaryContext &BC) {
  if (!opts::InsertRetpolines)
    return;

@ -282,7 +284,7 @@ void RetpolineInsertion::runOnFunctions(BinaryContext &BC,

  auto &MIB = *BC.MIB;
  uint32_t RetpolinedBranches = 0;
-  for (auto &It : BFs) {
+  for (auto &It : BC.getBinaryFunctions()) {
    auto &Function = It.second;
    for (auto &BB : Function) {
      for (auto It = BB.begin(); It != BB.end(); ++It) {
@ -309,12 +311,13 @@ void RetpolineInsertion::runOnFunctions(BinaryContext &BC,
        // If the instruction addressing pattern uses rsp and the retpoline
        // loads the callee address then displacement needs to be updated
        if (BrInfo.isMem() && !R11Available) {
+          auto &MemRef = BrInfo.Memory;
          auto Addend = (BrInfo.isJump() || BrInfo.isTailCall()) ? 8 : 16;
-          if (BrInfo.BaseRegNum == MIB.getStackPointer()) {
-            BrInfo.DispValue += Addend;
+          if (MemRef.BaseRegNum == MIB.getStackPointer()) {
+            MemRef.DispValue += Addend;
          }
-          if (BrInfo.IndexRegNum == MIB.getStackPointer())
-            BrInfo.DispValue += Addend * BrInfo.ScaleValue;
+          if (MemRef.IndexRegNum == MIB.getStackPointer())
+            MemRef.DispValue += Addend * MemRef.ScaleValue;
        }

        TargetRetpoline = getOrCreateRetpoline(BC, BrInfo, R11Available);
--- a/src/Passes/RetpolineInsertion.h
+++ b/src/Passes/RetpolineInsertion.h
@ -34,19 +34,21 @@ public:
  bool isJump() const { return !IsCall; }
  bool isTailCall() const { return IsTailCall; }

+  /// Components of the memory operand of a branch-on-memory instruction.
+  /// The IndirectBranchInfo constructor fills every field through
+  /// evaluateX86MemoryOperand(); the passes then forward them to builders such
+  /// as createLoad().
+  struct MemOpInfo {
+    /// Base register; callers compare it against getNoRegister() to detect
+    /// that no base register is used.
+    unsigned BaseRegNum;
+    /// Multiplier applied to the index register.
+    int64_t ScaleValue;
+    /// Index register; compared against getNoRegister() when unused.
+    unsigned IndexRegNum;
+    /// Constant displacement; used when DispExpr is null.
+    int64_t DispValue;
+    /// Segment register; compared against getNoRegister() when unused.
+    unsigned SegRegNum;
+    /// Symbolic displacement; when non-null it is used in place of DispValue.
+    /// NOTE(review): this struct is stored inside an anonymous union of
+    /// IndirectBranchInfo -- confirm the default initializer below actually
+    /// takes effect there rather than relying on it.
+    const MCExpr *DispExpr{nullptr};
+  };
+
  union {
    // Register branch information
    MCPhysReg BranchReg;

    // Memory branch information
-    struct {
-      unsigned BaseRegNum;
-      int64_t ScaleValue;
-      unsigned IndexRegNum;
-      int64_t DispValue;
-      unsigned SegRegNum;
-      const MCExpr *DispExpr{nullptr};
-    };
+    MemOpInfo Memory;
  };
 };

@ -71,9 +73,7 @@ public:

  const char *getName() const override { return "retpoline-insertion"; }

-  void runOnFunctions(BinaryContext &BC,
-                      std::map<uint64_t, BinaryFunction> &BFs,
-                      std::set<uint64_t> &LargeFunctions) override;
+  void runOnFunctions(BinaryContext &BC) override;
 };
 } // namespace bolt
 } // namespace llvm
--- a/src/Passes/ShrinkWrapping.cpp
+++ b/src/Passes/ShrinkWrapping.cpp
@ -102,7 +102,7 @@ void CalleeSavedAnalysis::analyzeSaves() {
        CalleeSaved.set(FIE->RegOrImm);
        SaveFIEByReg[FIE->RegOrImm] = &*FIE;
        SavingCost[FIE->RegOrImm] += InsnToBB[&Inst]->getKnownExecutionCount();
-        BC.MIB->addAnnotation(Inst, getSaveTag(), FIE->RegOrImm);
+        BC.MIB->addAnnotation(Inst, getSaveTag(), FIE->RegOrImm, AllocatorId);
        OffsetsByReg[FIE->RegOrImm] = FIE->StackOffset;
        DEBUG(dbgs() << "Logging new candidate for Callee-Saved Reg: "
                     << FIE->RegOrImm << "\n");
@ -153,7 +153,8 @@ void CalleeSavedAnalysis::analyzeRestores() {
                     << "\n");
        if (LoadFIEByReg[FIE->RegOrImm] == nullptr)
          LoadFIEByReg[FIE->RegOrImm] = &*FIE;
-        BC.MIB->addAnnotation(Inst, getRestoreTag(), FIE->RegOrImm);
+        BC.MIB->addAnnotation(Inst, getRestoreTag(), FIE->RegOrImm,
+                              AllocatorId);
        HasRestores.set(FIE->RegOrImm);
      }
      Prev = &Inst;
@ -311,7 +312,7 @@ void StackLayoutModifier::checkStackPointerRestore(MCInst &Point) {

  // We are restoring SP to an old value based on FP. Mark it as a stack
  // access to be fixed later.
-  BC.MIB->addAnnotation(Point, getSlotTag(), Output);
+  BC.MIB->addAnnotation(Point, getSlotTag(), Output, AllocatorId);
 }

 void StackLayoutModifier::classifyStackAccesses() {
@ -354,7 +355,7 @@ void StackLayoutModifier::classifyStackAccesses() {
      // We are free to go. Add it as available stack slot which we know how
      // to move it.
      AvailableRegions[FIEX->StackOffset] = FIEX->Size;
-      BC.MIB->addAnnotation(Inst, getSlotTag(), FIEX->StackOffset);
+      BC.MIB->addAnnotation(Inst, getSlotTag(), FIEX->StackOffset, AllocatorId);
      RegionToRegMap[FIEX->StackOffset].insert(FIEX->RegOrImm);
      RegToRegionMap[FIEX->RegOrImm].insert(FIEX->StackOffset);
      DEBUG(dbgs() << "Adding region " << FIEX->StackOffset << " size "
@ -371,7 +372,7 @@ void StackLayoutModifier::classifyCFIs() {
  auto recordAccess = [&](MCInst *Inst, int64_t Offset) {
    const uint16_t Reg = BC.MRI->getLLVMRegNum(CfaReg, /*isEH=*/false);
    if (Reg == BC.MIB->getStackPointer() || Reg == BC.MIB->getFramePointer()) {
-      BC.MIB->addAnnotation(*Inst, getSlotTag(), Offset);
+      BC.MIB->addAnnotation(*Inst, getSlotTag(), Offset, AllocatorId);
      DEBUG(dbgs() << "Recording CFI " << Offset << "\n");
    } else {
      IsSimple = false;
@ -400,12 +401,14 @@ void StackLayoutModifier::classifyCFIs() {
        recordAccess(&Inst, CFI->getOffset());
        BC.MIB->addAnnotation(Inst, getOffsetCFIRegTag(),
                              BC.MRI->getLLVMRegNum(CFI->getRegister(),
-                                                    /*isEH=*/false));
+                                                    /*isEH=*/false),
+                              AllocatorId);
        break;
      case MCCFIInstruction::OpSameValue:
        BC.MIB->addAnnotation(Inst, getOffsetCFIRegTag(),
                              BC.MRI->getLLVMRegNum(CFI->getRegister(),
-                                                    /*isEH=*/false));
+                                                    /*isEH=*/false),
+                              AllocatorId);
        break;
      case MCCFIInstruction::OpRememberState:
        CFIStack.push(std::make_pair(CfaOffset, CfaReg));
@ -432,7 +435,7 @@ void StackLayoutModifier::classifyCFIs() {
 void StackLayoutModifier::scheduleChange(
    MCInst &Inst, StackLayoutModifier::WorklistItem Item) {
  auto &WList = BC.MIB->getOrCreateAnnotationAs<std::vector<WorklistItem>>(
-      Inst, getTodoTag());
+      Inst, getTodoTag(), AllocatorId);
  WList.push_back(Item);
 }

@ -510,7 +513,7 @@ bool StackLayoutModifier::collapseRegion(MCInst *Alloc, int64_t RegionAddr,
      }

      if (Slot == RegionAddr) {
-        BC.MIB->addAnnotation(Inst, "AccessesDeletedPos", 0U);
+        BC.MIB->addAnnotation(Inst, "AccessesDeletedPos", 0U, AllocatorId);
        continue;
      }
      if (BC.MIB->isPush(Inst) || BC.MIB->isPop(Inst)) {
@ -771,7 +774,7 @@ void ShrinkWrapping::pruneUnwantedCSRs() {
 }

 void ShrinkWrapping::computeSaveLocations() {
-  SavePos = std::vector<SmallPtrSet<MCInst *, 4>>(BC.MRI->getNumRegs());
+  SavePos = std::vector<SmallSetVector<MCInst *, 4>>(BC.MRI->getNumRegs());
  auto &RI = Info.getReachingInsnsBackwards();
  auto &DA = Info.getDominatorAnalysis();
  auto &SPT = Info.getStackPointerTracking();
@ -960,7 +963,7 @@ ShrinkWrapping::doRestorePlacement(MCInst *BestPosSave, unsigned CSR,
  // In case of a critical edge, we need to create extra BBs to host restores
  // into edges transitioning to the dominance frontier, otherwise we pull these
  // restores to inside the dominated area.
-  Frontier = DA.getDominanceFrontierFor(*BestPosSave);
+  Frontier = DA.getDominanceFrontierFor(*BestPosSave).takeVector();
  DEBUG({
    dbgs() << "Dumping dominance frontier for ";
    BC.printInstruction(dbgs(), *BestPosSave);
@ -1454,13 +1457,13 @@ protected:
 public:
  PredictiveStackPointerTracking(const BinaryContext &BC, BinaryFunction &BF,
                                 decltype(ShrinkWrapping::Todo) &TodoMap,
-                                 DataflowInfoManager &Info)
-      : StackPointerTrackingBase<PredictiveStackPointerTracking>(BC, BF),
+                                 DataflowInfoManager &Info,
+                                 MCPlusBuilder::AllocatorIdTy AllocatorId = 0)
+      : StackPointerTrackingBase<PredictiveStackPointerTracking>(BC, BF,
+                                                                 AllocatorId),
        TodoMap(TodoMap), Info(Info) {}

  void run() {
-    NamedRegionTimer T1("PSPT", "Predictive Stack Pointer Tracking", "Dataflow",
-                        "Dataflow", opts::TimeOpts);
    StackPointerTrackingBase<PredictiveStackPointerTracking>::run();
  }
 };
@ -1553,7 +1556,7 @@ void ShrinkWrapping::rebuildCFIForSP() {
        continue;
      auto *CFI = BF.getCFIFor(Inst);
      if (CFI->getOperation() == MCCFIInstruction::OpDefCfaOffset)
-        BC.MIB->addAnnotation(Inst, "DeleteMe", 0U);
+        BC.MIB->addAnnotation(Inst, "DeleteMe", 0U, AllocatorId);
    }
  }

@ -1812,7 +1815,7 @@ BBIterTy ShrinkWrapping::processInsertionsList(
 }

 bool ShrinkWrapping::processInsertions() {
-  PredictiveStackPointerTracking PSPT(BC, BF, Todo, Info);
+  PredictiveStackPointerTracking PSPT(BC, BF, Todo, Info, AllocatorId);
  PSPT.run();

  bool Changes{false};
@ -1910,6 +1913,15 @@ bool ShrinkWrapping::perform() {
  PopOffsetByReg = std::vector<int64_t>(BC.MRI->getNumRegs(), 0LL);
  DomOrder = std::vector<MCPhysReg>(BC.MRI->getNumRegs(), 0);

+  if (BF.checkForAmbiguousJumpTables()) {
+    DEBUG(dbgs() << "BOLT-DEBUG: ambiguous JTs in " << BF.getPrintName()
+                 << ".\n");
+    // We could call disambiguateJumpTables here, but it is probably not worth
+    // the cost (of duplicating potentially large jump tables that could regress
+    // dcache misses). Moreover, ambiguous JTs are rare and coming from code
+    // written in assembly language. Just bail.
+    return false;
+  }
  SLM.initialize();
  CSA.compute();
  classifyCSRUses();
--- a/src/Passes/ShrinkWrapping.h
+++ b/src/Passes/ShrinkWrapping.h
@ -27,6 +27,8 @@ class CalleeSavedAnalysis {
  const BinaryContext &BC;
  BinaryFunction &BF;
  DataflowInfoManager &Info;
+  MCPlusBuilder::AllocatorIdTy AllocatorId;
+
  Optional<unsigned> SaveTagIndex;
  Optional<unsigned> RestoreTagIndex;

@ -39,12 +41,6 @@ class CalleeSavedAnalysis {
  /// function.
  void analyzeRestores();

-  /// Returns the identifying string used to annotate instructions with metadata
-  /// for this analysis. These are deleted in the destructor.
-  static StringRef getSaveTagName() {
-    return StringRef("CSA-SavedReg");
-  }
-
  unsigned getSaveTag() {
    if (SaveTagIndex)
      return *SaveTagIndex;
@ -52,10 +48,6 @@ class CalleeSavedAnalysis {
    return *SaveTagIndex;
  }

-  static StringRef getRestoreTagName() {
-    return StringRef("CSA-RestoredReg");
-  }
-
  unsigned getRestoreTag() {
    if (RestoreTagIndex)
      return *RestoreTagIndex;
@ -72,8 +64,9 @@ public:
  std::vector<const FrameIndexEntry*> LoadFIEByReg;

  CalleeSavedAnalysis(const FrameAnalysis &FA, const BinaryContext &BC,
-                      BinaryFunction &BF, DataflowInfoManager &Info)
-      : FA(FA), BC(BC), BF(BF), Info(Info),
+                      BinaryFunction &BF, DataflowInfoManager &Info,
+                      MCPlusBuilder::AllocatorIdTy AllocId)
+      : FA(FA), BC(BC), BF(BF), Info(Info), AllocatorId(AllocId),
        CalleeSaved(BC.MRI->getNumRegs(), false),
        OffsetsByReg(BC.MRI->getNumRegs(), 0LL),
        HasRestores(BC.MRI->getNumRegs(), false),
@ -112,6 +105,17 @@ public:
  /// instructions).
  std::vector<MCInst *> getSavesByReg(uint16_t Reg);
  std::vector<MCInst *> getRestoresByReg(uint16_t Reg);
+
+  /// Identifying string of the annotation this analysis uses to attach its
+  /// metadata to instructions. These are deleted in the destructor.
+  static StringRef getSaveTagName() { return "CSA-SavedReg"; }
+
+  /// Counterpart of getSaveTagName() yielding the "CSA-RestoredReg"
+  /// identifying string.
+  static StringRef getRestoreTagName() { return "CSA-RestoredReg"; }
+
 };

 /// Identifies in a given binary function all stack regions being used and allow
@ -122,6 +126,7 @@ class StackLayoutModifier {
  const BinaryContext &BC;
  BinaryFunction &BF;
  DataflowInfoManager &Info;
+  MCPlusBuilder::AllocatorIdTy AllocatorId;

  // Keep track of stack slots we know how to safely move
  std::map<int64_t, int64_t> AvailableRegions;
@ -217,20 +222,11 @@ private:
    return *OffsetCFIRegTagIndex;
  }

-  static StringRef getTodoTagName() {
-    return StringRef("SLM-TodoTag");
-  }
-  static StringRef getSlotTagName() {
-    return StringRef("SLM-SlotTag");
-  }
-  static StringRef getOffsetCFIRegTagName() {
-    return StringRef("SLM-OffsetCFIReg");
-  }
-
 public:
  StackLayoutModifier(const FrameAnalysis &FA, const BinaryContext &BC,
-                      BinaryFunction &BF, DataflowInfoManager &Info)
-      : FA(FA), BC(BC), BF(BF), Info(Info) {}
+                      BinaryFunction &BF, DataflowInfoManager &Info,
+                      MCPlusBuilder::AllocatorIdTy AllocId)
+      : FA(FA), BC(BC), BF(BF), Info(Info), AllocatorId(AllocId) {}

  ~StackLayoutModifier() {
    for (auto &BB : BF) {
@ -283,6 +279,19 @@ public:
  /// Perform initial assessment of the function trying to understand its stack
  /// accesses.
  void initialize();
+
+  static StringRef getTodoTagName() {
+    return StringRef("SLM-TodoTag");
+  }
+
+  static StringRef getSlotTagName() {
+    return StringRef("SLM-SlotTag");
+  }
+
+  static StringRef getOffsetCFIRegTagName() {
+    return StringRef("SLM-OffsetCFIReg");
+  }
+
 };

 /// Implements a pass to optimize callee-saved register spills. These spills
@ -294,6 +303,7 @@ class ShrinkWrapping {
  const BinaryContext &BC;
  BinaryFunction &BF;
  DataflowInfoManager &Info;
+  MCPlusBuilder::AllocatorIdTy AllocatorId;
  StackLayoutModifier SLM;
  /// For each CSR, store a vector of all CFI indexes deleted as a consequence
  /// of moving this Callee-Saved Reg
@ -306,7 +316,7 @@ class ShrinkWrapping {
  std::vector<int64_t> PopOffsetByReg;
  std::vector<MCPhysReg> DomOrder;
  CalleeSavedAnalysis CSA;
-  std::vector<SmallPtrSet<MCInst *, 4>> SavePos;
+  std::vector<SmallSetVector<MCInst *, 4>> SavePos;
  std::vector<uint64_t> BestSaveCount;
  std::vector<MCInst *> BestSavePos;

@ -381,7 +391,7 @@ private:
  void scheduleChange(ProgramPoint PP, T&& ...Item) {
    if (PP.isInst()) {
      auto &WList = BC.MIB->getOrCreateAnnotationAs<std::vector<WorklistItem>>(
-          *PP.getInst(), getAnnotationIndex());
+          *PP.getInst(), getAnnotationIndex(), AllocatorId);
      WList.emplace_back(std::forward<T>(Item)...);
      return;
    }
@ -398,7 +408,7 @@ private:
      BB = *BB->succ_begin();
    }
    auto &WList = BC.MIB->getOrCreateAnnotationAs<std::vector<WorklistItem>>(
-      *BB->begin(), getAnnotationIndex());
+        *BB->begin(), getAnnotationIndex(), AllocatorId);
    WList.emplace_back(std::forward<T>(Item)...);
  }

@ -517,9 +527,10 @@ private:

 public:
  ShrinkWrapping(const FrameAnalysis &FA, const BinaryContext &BC,
-                  BinaryFunction &BF, DataflowInfoManager &Info)
-      : FA(FA), BC(BC), BF(BF), Info(Info), SLM(FA, BC, BF, Info),
-        CSA(FA, BC, BF, Info) {}
+                 BinaryFunction &BF, DataflowInfoManager &Info,
+                 MCPlusBuilder::AllocatorIdTy AllocId)
+      : FA(FA), BC(BC), BF(BF), Info(Info), AllocatorId(AllocId),
+        SLM(FA, BC, BF, Info, AllocId), CSA(FA, BC, BF, Info, AllocId) {}

  ~ShrinkWrapping() {
    for (auto &BB : BF) {
--- a/src/Passes/StackAllocationAnalysis.h
+++ b/src/Passes/StackAllocationAnalysis.h
@ -35,14 +35,13 @@ class StackAllocationAnalysis

 public:
  StackAllocationAnalysis(const BinaryContext &BC, BinaryFunction &BF,
-                          StackPointerTracking &SPT)
-      : InstrsDataflowAnalysis<StackAllocationAnalysis, false>(BC, BF),
+                          StackPointerTracking &SPT,
+                          MCPlusBuilder::AllocatorIdTy AllocId)
+      : InstrsDataflowAnalysis<StackAllocationAnalysis, false>(BC, BF, AllocId),
        SPT(SPT) {}
  virtual ~StackAllocationAnalysis() {}

  void run() {
-    NamedRegionTimer T1("SAA", "Stack Allocation Analysis", "Dataflow",
-                        "Dataflow", opts::TimeOpts);
    InstrsDataflowAnalysis<StackAllocationAnalysis, false>::run();
  }

--- a/src/Passes/StackAvailableExpressions.h
+++ b/src/Passes/StackAvailableExpressions.h
@ -36,8 +36,6 @@ public:
  virtual ~StackAvailableExpressions() {}

  void run() {
-    NamedRegionTimer T1("SAE", "Stack Available Expressions", "Dataflow",
-                        "Dataflow", opts::TimeOpts);
    InstrsDataflowAnalysis<StackAvailableExpressions>::run();
  }

--- a/src/Passes/StackPointerTracking.cpp
+++ b/src/Passes/StackPointerTracking.cpp
@ -14,9 +14,10 @@
 namespace llvm {
 namespace bolt {

-StackPointerTracking::StackPointerTracking(const BinaryContext &BC,
-                                           BinaryFunction &BF)
-    : StackPointerTrackingBase<StackPointerTracking>(BC, BF) {}
+StackPointerTracking::StackPointerTracking(
+    const BinaryContext &BC, BinaryFunction &BF,
+    MCPlusBuilder::AllocatorIdTy AllocatorId)
+    : StackPointerTrackingBase<StackPointerTracking>(BC, BF, AllocatorId) {}

 } // end namespace bolt
 } // end namespace llvm
--- a/Show More
+++ b/Show More