Compare commits

..

2 Commits

Author SHA1 Message Date
Nolan Leake
2ece5fa87a Spectre-mitigation pass -insert-lfences 2019-12-19 16:28:38 -08:00
Jeffrey Griffin
8b8ff91e08 identify PIC jump tables with an extra MOV64rr
this code pattern was seen generated by rust 1.31.0, which uses a patched LLVM
8.0.0 internally. in general analyzePICJumpTable now follows the data flow of
R1 and R2 backward through MOVs until the LEA, at which point the two should
have converged.

it also now does not skip instructions affecting R1, which improves correctness
at the cost of rejecting other possible valid code patterns which were
previously accidentally accepted correctly, but this seems unlikely.
2019-02-08 14:12:03 -08:00
117 changed files with 3482 additions and 9043 deletions

View File

@ -1,29 +1,5 @@
include(ExternalProject)
set(BOLT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(BOLT_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
set(CMAKE_CXX_STANDARD 14)
ExternalProject_Add(bolt_rt
SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/runtime"
STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-stamps
BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}
-DCMAKE_INSTALL_PREFIX=${LLVM_BINARY_DIR}
# You might want to set this to True if actively developing bolt_rt, otherwise
# cmake will not rebuild it after source code changes
BUILD_ALWAYS True
)
install(CODE "execute_process\(COMMAND \${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=\${CMAKE_INSTALL_PREFIX} -P ${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/cmake_install.cmake \)"
COMPONENT bolt_rt)
add_llvm_install_targets(install-bolt_rt
DEPENDS bolt_rt
COMPONENT bolt_rt)
add_subdirectory(src)
add_subdirectory(test)

View File

@ -21,14 +21,6 @@ We actively welcome your pull requests.
before it can be merged.
* When all of the tests are passing and all other conditions described above
satisfied, the PR is ready for review and merge.
* If you haven't already, complete the Contributor License Agreement ("CLA").
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Facebook's open source projects.
Complete your CLA here: <https://code.facebook.com/cla>
## Issues

Binary file not shown.

Before

Width:  |  Height:  |  Size: 30 KiB

View File

@ -1,50 +0,0 @@
# Code Heatmaps
BOLT has gained the ability to print code heatmaps based on
sampling-based LBR profiles generated by `perf`. The output is produced
in colored ASCII to be displayed in a color-capable terminal. It looks
something like this:
![](./Heatmap.png)
Heatmaps can be generated for BOLTed and non-BOLTed binaries. You can
use them to compare the code layout before and after optimizations.
To generate a heatmap, start with running your app under `perf`:
```bash
$ perf record -e cycles:u -j any,u -- <executable with args>
```
or if you want to monitor the existing process(es):
```bash
$ perf record -e cycles:u -j any,u [-p PID|-a] -- sleep <interval>
```
Note that at the moment running with LBR (`-j any,u` or `-b`) is
a requirement.
Once the run is complete and `perf.data` has been generated, run BOLT in
heatmap mode:
```bash
$ llvm-bolt heatmap -p perf.data <executable>
```
By default the heatmap is dumped to *stdout*. You can redirect it to a file
with the `-o <heatmapfile>` option. Each character/block in the heatmap
shows the execution data accumulated for the corresponding 64 bytes of
code. You can change this granularity with the `-block-size` option,
e.g. set it to 4096 to see code usage grouped by 4K pages.
Other useful options are:
```bash
-line-size=<uint> - number of entries per line (default 256)
-max-address=<uint> - maximum address considered valid for heatmap (default 4GB)
```
If you prefer to look at the data in a browser (or would like to share
it that way), then you can use an HTML conversion tool. E.g.:
```bash
$ aha -b -f <heatmapfile> > <heatmapfile>.html
```

View File

@ -848,7 +848,7 @@ index 8e9b4ac5632..d2c569e3399 100644
SMLoc Loc) override;
void
diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h
index 582a836023b..f1e341bd624 100644
index 582a836023b..0b15454ecd6 100644
--- a/include/llvm/MC/MCStreamer.h
+++ b/include/llvm/MC/MCStreamer.h
@@ -199,7 +199,7 @@ class MCStreamer {
@ -860,6 +860,17 @@ index 582a836023b..f1e341bd624 100644
/// \brief This is stack of current and previous section values saved by
/// PushSection.
@@ -290,8 +290,8 @@ public:
/// If the comment includes embedded \n's, they will each get the comment
/// prefix as appropriate. The added comment should not end with a \n.
/// By default, each comment is terminated with an end of line, i.e. the
- /// EOL param is set to true by default. If one prefers not to end the
- /// comment with a new line then the EOL param should be passed
+ /// EOL param is set to true by default. If one prefers not to end the
+ /// comment with a new line then the EOL param should be passed
/// with a false value.
virtual void AddComment(const Twine &T, bool EOL = true) {}
@@ -338,9 +338,7 @@ public:
/// \brief Returns an index to represent the order a symbol was emitted in.
@ -998,10 +1009,11 @@ index 46504e74bc2..836fd8ddc45 100644
Expected<Elf_Shdr_Range> sections() const;
Expected<Elf_Sym_Range> symbols(const Elf_Shdr *Sec) const {
@@ -397,6 +409,34 @@ void ELFFile<ELFT>::getRelocationTypeName(uint32_t Type,
@@ -396,6 +408,34 @@ void ELFFile<ELFT>::getRelocationTypeName(uint32_t Type,
}
}
template <class ELFT>
+template <class ELFT>
+Expected<const typename ELFFile<ELFT>::Elf_Dyn *>
+ELFFile<ELFT>::dynamic_table_begin(const Elf_Phdr *Phdr) const {
+ if (!Phdr)
@ -1029,10 +1041,9 @@ index 46504e74bc2..836fd8ddc45 100644
+ return reinterpret_cast<const Elf_Dyn *>(base() + End);
+}
+
+template <class ELFT>
template <class ELFT>
Expected<const typename ELFT::Sym *>
ELFFile<ELFT>::getRelocationSymbol(const Elf_Rel *Rel,
const Elf_Shdr *SymTab) const {
diff --git a/include/llvm/Object/ELFObjectFile.h b/include/llvm/Object/ELFObjectFile.h
index 4d001039238..62837bbcaa0 100644
--- a/include/llvm/Object/ELFObjectFile.h
@ -1045,10 +1056,11 @@ index 4d001039238..62837bbcaa0 100644
relocation_iterator section_rel_begin(DataRefImpl Sec) const override;
relocation_iterator section_rel_end(DataRefImpl Sec) const override;
section_iterator getRelocatedSection(DataRefImpl Sec) const override;
@@ -717,6 +718,14 @@ bool ELFObjectFile<ELFT>::isSectionVirtual(DataRefImpl Sec) const {
@@ -716,6 +717,14 @@ bool ELFObjectFile<ELFT>::isSectionVirtual(DataRefImpl Sec) const {
return getSection(Sec)->sh_type == ELF::SHT_NOBITS;
}
template <class ELFT>
+template <class ELFT>
+bool ELFObjectFile<ELFT>::isSectionReadOnly(DataRefImpl Sec) const {
+ const Elf_Shdr *EShdr = getSection(Sec);
+ return EShdr->sh_flags & ELF::SHF_ALLOC &&
@ -1056,10 +1068,9 @@ index 4d001039238..62837bbcaa0 100644
+ EShdr->sh_type == ELF::SHT_PROGBITS;
+}
+
+template <class ELFT>
template <class ELFT>
relocation_iterator
ELFObjectFile<ELFT>::section_rel_begin(DataRefImpl Sec) const {
DataRefImpl RelData;
@@ -751,9 +760,6 @@ ELFObjectFile<ELFT>::section_rel_end(DataRefImpl Sec) const {
template <class ELFT>
section_iterator
@ -1090,7 +1101,7 @@ index 4d001039238..62837bbcaa0 100644
if (sec->sh_type == ELF::SHT_REL)
return getRel(Rel)->r_offset;
diff --git a/include/llvm/Object/MachO.h b/include/llvm/Object/MachO.h
index bfd3462bf69..52bc210b577 100644
index bfd3462bf69..9be0b260f34 100644
--- a/include/llvm/Object/MachO.h
+++ b/include/llvm/Object/MachO.h
@@ -320,6 +320,7 @@ public:
@ -1101,6 +1112,15 @@ index bfd3462bf69..52bc210b577 100644
relocation_iterator section_rel_begin(DataRefImpl Sec) const override;
relocation_iterator section_rel_end(DataRefImpl Sec) const override;
@@ -331,7 +332,7 @@ public:
relocation_iterator locrel_begin() const;
relocation_iterator locrel_end() const;
-
+
void moveRelocationNext(DataRefImpl &Rel) const override;
uint64_t getRelocationOffset(DataRefImpl Rel) const override;
symbol_iterator getRelocationSymbol(DataRefImpl Rel) const override;
diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h
index 9c4ae94d3a6..64342723371 100644
--- a/include/llvm/Object/ObjectFile.h
@ -1195,9 +1215,18 @@ index d11f5a83779..0ad115c886b 100644
/// FD is the file descriptor that this writes to. If ShouldClose is true,
/// this closes the file when the stream is destroyed. If FD is for stdout or
diff --git a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
index adada672af0..b3d68ed66af 100644
index adada672af0..c9c79971a25 100644
--- a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
+++ b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
@@ -38,7 +38,7 @@ DWARFAbbreviationDeclaration::DWARFAbbreviationDeclaration() {
}
bool
-DWARFAbbreviationDeclaration::extract(DataExtractor Data,
+DWARFAbbreviationDeclaration::extract(DataExtractor Data,
uint32_t* OffsetPtr) {
clear();
const uint32_t Offset = *OffsetPtr;
@@ -61,13 +61,15 @@ DWARFAbbreviationDeclaration::extract(DataExtractor Data,
// Read all of the abbreviation attributes and forms.
@ -1558,7 +1587,7 @@ index 3d274b63a4f..cef29f4b41d 100644
StringRef RuntimeDyld::getErrorString() { return Dyld->getErrorString(); }
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index 36b43ec9b78..1a56e590014 100644
index 36b43ec9b78..3dc3e8f325c 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -270,6 +270,25 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section,
@ -1659,6 +1688,15 @@ index 36b43ec9b78..1a56e590014 100644
resolveAArch64Branch(SectionID, Value, RelI, Stubs);
} else if (RelType == ELF::R_AARCH64_ADR_GOT_PAGE) {
// Craete new GOT entry or find existing one. If GOT entry is
@@ -1410,7 +1478,7 @@ RuntimeDyldELF::processRelocationRef(
} else {
processSimpleRelocation(SectionID, Offset, RelType, Value);
}
-
+
} else if (Arch == Triple::ppc64 || Arch == Triple::ppc64le) {
if (RelType == ELF::R_PPC64_REL24) {
// Determine ABI variant in use for this object.
@@ -1632,7 +1700,7 @@ RuntimeDyldELF::processRelocationRef(
// equivalent to the usual PLT implementation except that we use the stub
// mechanism in RuntimeDyld (which puts stubs at the end of the section)
@ -1781,10 +1819,18 @@ index a0f9a857e3c..be32963b705 100644
assert((cast<MCFillFragment>(F).getValue() == 0) &&
"Invalid fill in virtual section!");
diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp
index 0e0ea965d14..49885269d06 100644
index 0e0ea965d14..0044566d9ab 100644
--- a/lib/MC/MCDwarf.cpp
+++ b/lib/MC/MCDwarf.cpp
@@ -156,12 +156,36 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section,
@@ -41,6 +41,7 @@
#include <cassert>
#include <cstdint>
#include <string>
+#include <tuple>
#include <utility>
#include <vector>
@@ -156,12 +157,36 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section,
unsigned Flags = DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0;
unsigned Isa = 0;
unsigned Discriminator = 0;
@ -1822,7 +1868,7 @@ index 0e0ea965d14..49885269d06 100644
if (FileNum != LineEntry.getFileNum()) {
FileNum = LineEntry.getFileNum();
MCOS->EmitIntValue(dwarf::DW_LNS_set_file, 1);
@@ -197,18 +221,33 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section,
@@ -197,18 +222,33 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section,
if (LineEntry.getFlags() & DWARF2_FLAG_EPILOGUE_BEGIN)
MCOS->EmitIntValue(dwarf::DW_LNS_set_epilogue_begin, 1);
@ -1864,7 +1910,7 @@ index 0e0ea965d14..49885269d06 100644
}
// Emit a DW_LNE_end_sequence for the end of the section.
@@ -250,7 +289,7 @@ void MCDwarfLineTable::Emit(MCObjectStreamer *MCOS,
@@ -250,7 +290,7 @@ void MCDwarfLineTable::Emit(MCObjectStreamer *MCOS,
MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfLineSection());
// Handle the rest of the Compile Units.
@ -1873,7 +1919,16 @@ index 0e0ea965d14..49885269d06 100644
CUIDTablePair.second.EmitCU(MCOS, Params, LineStr);
if (LineStr)
@@ -514,8 +553,12 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params,
@@ -484,7 +524,7 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params,
// Parameters of the state machine, are next.
MCOS->EmitIntValue(context.getAsmInfo()->getMinInstAlignment(), 1);
- // maximum_operations_per_instruction
+ // maximum_operations_per_instruction
// For non-VLIW architectures this field is always 1.
// FIXME: VLIW architectures need to update this field accordingly.
if (LineTableVersion >= 4)
@@ -514,8 +554,12 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params,
void MCDwarfLineTable::EmitCU(MCObjectStreamer *MCOS,
MCDwarfLineTableParams Params,
@ -1888,7 +1943,7 @@ index 0e0ea965d14..49885269d06 100644
// Put out the line tables.
for (const auto &LineSec : MCLineSections.getMCLineEntries())
@@ -1253,12 +1296,217 @@ public:
@@ -1253,12 +1297,217 @@ public:
void EmitCFIInstruction(const MCCFIInstruction &Instr);
};
@ -2106,7 +2161,7 @@ index 0e0ea965d14..49885269d06 100644
void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
int dataAlignmentFactor = getDataAlignmentFactor(Streamer);
auto *MRI = Streamer.getContext().getRegisterInfo();
@@ -1373,7 +1621,28 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
@@ -1373,7 +1622,28 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
Streamer.EmitIntValue(dwarf::DW_CFA_GNU_args_size, 1);
Streamer.EmitULEB128IntValue(Instr.getOffset());
return;
@ -2231,7 +2286,7 @@ index 0a684588110..58199c97420 100644
unsigned char Value,
SMLoc Loc) {
diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp
index 776569894a5..aa130bb2d6a 100644
index 776569894a5..0954b70df49 100644
--- a/lib/MC/MCStreamer.cpp
+++ b/lib/MC/MCStreamer.cpp
@@ -85,11 +85,15 @@ void MCStreamer::reset() {
@ -2274,6 +2329,15 @@ index 776569894a5..aa130bb2d6a 100644
}
void MCStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
@@ -513,7 +524,7 @@ void MCStreamer::EmitCFIEscape(StringRef Values) {
void MCStreamer::EmitCFIGnuArgsSize(int64_t Size) {
MCSymbol *Label = EmitCFILabel();
- MCCFIInstruction Instruction =
+ MCCFIInstruction Instruction =
MCCFIInstruction::createGnuArgsSize(Label, Size);
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
if (!CurFrame)
@@ -884,6 +895,14 @@ void MCStreamer::visitUsedExpr(const MCExpr &Expr) {
}
}
@ -2299,10 +2363,16 @@ index 776569894a5..aa130bb2d6a 100644
SMLoc Loc) {}
void MCStreamer::EmitBundleAlignMode(unsigned AlignPow2) {}
diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp
index b544fa5c147..c885bf9f037 100644
index b544fa5c147..746c9f32865 100644
--- a/lib/Object/COFFObjectFile.cpp
+++ b/lib/Object/COFFObjectFile.cpp
@@ -344,6 +344,11 @@ bool COFFObjectFile::isSectionVirtual(DataRefImpl Ref) const {
@@ -339,11 +339,16 @@ unsigned COFFObjectFile::getSectionID(SectionRef Sec) const {
bool COFFObjectFile::isSectionVirtual(DataRefImpl Ref) const {
const coff_section *Sec = toSec(Ref);
- // In COFF, a virtual section won't have any in-file
+ // In COFF, a virtual section won't have any in-file
// content, so the file pointer to the content will be zero.
return Sec->PointerToRawData == 0;
}

View File

@ -1,12 +0,0 @@
cmake_minimum_required(VERSION 3.1.0)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
project(libbolt_rt_project)
add_library(bolt_rt STATIC
instr.cpp
)
install(TARGETS bolt_rt DESTINATION lib)

View File

@ -1,285 +0,0 @@
//===-- instr.cpp -----------------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
// This file contains code that is linked to the final binary with a function
// that is called at program exit to dump instrumented data collected during
// execution.
//
//===----------------------------------------------------------------------===//
//
// BOLT runtime instrumentation library for x86 Linux.
//
//===----------------------------------------------------------------------===//
#include <cstdint>
#include <elf.h>
// All extern declarations here need to be defined by BOLT itself.
// Counters inserted by instrumentation, incremented during runtime when
// points of interest (locations) in the program are reached.
extern uint64_t __bolt_instr_locations[];
// Number of counters.
extern uint32_t __bolt_instr_num_locs;
// Filename to dump data to.
extern char __bolt_instr_filename[];
// A location is a function name plus offset. The function name is not stored
// inline: it needs to be retrieved from the string table and is stored here
// as an index into that table.
struct Location {
// Byte offset of the NUL-terminated function name inside the string table
// (serializeLoc reads the name from Info.Strings + FunctionName).
uint32_t FunctionName;
// Offset that accompanies the function name; serializeLoc prints it in
// hexadecimal.
uint32_t Offset;
};
// An edge description defines an instrumented edge in the program, fully
// identified by where the jump is located and its destination. The dump
// routine pairs the I-th description with the I-th counter in
// __bolt_instr_locations.
struct EdgeDescription {
// Location of the jump (source of the edge).
Location From;
// Destination of the jump (target of the edge).
Location To;
};
// These need to be read from disk. They are generated by BOLT and written to
// an ELF note section in the binary itself. Besides the two table pointers,
// the struct carries what is needed to release the file mapping that backs
// them (descriptor, mapping base and mapping size).
struct InstrumentationInfo {
// Table of edge descriptions; readDescriptions() points it into the mmap'ed
// copy of the binary.
EdgeDescription *Descriptions;
// String table with function names used in this binary.
char *Strings;
// File descriptor for the file on disk backing this information in memory
// via mmap.
int FileDesc;
// Base address of the mapping.
uint8_t *MMapPtr;
// Size of the mapping in bytes.
// NOTE(review): readDescriptions() assigns a uint64_t file size to this int,
// so the value is narrowed for very large binaries.
int MMapSize;
};
// Declare some syscall wrappers we use throughout this code to avoid linking
// against system libc.
// Open `pathname` by invoking syscall number 2 (open in the x86-64 Linux
// syscall table) directly, without going through libc.
//
// pathname, flags and mode are handed to the kernel in rdi, rsi and rdx (the
// "D", "S" and "d" constraints). The return value is whatever the kernel
// leaves in rax: no errno translation is done here, so callers must interpret
// the raw result themselves (on Linux a failed syscall returns a negative
// errno value, which shows up here as a very large uint64_t).
static uint64_t
__open(const char *pathname,
uint64_t flags,
uint64_t mode) {
uint64_t ret;
// rcx and r11 are listed as clobbers because the `syscall` instruction
// overwrites them; "memory" stops the compiler from caching memory contents
// across the call.
__asm__ __volatile__ (
"movq $2, %%rax\n"
"syscall"
: "=a"(ret)
: "D"(pathname), "S"(flags), "d"(mode)
: "cc", "rcx", "r11", "memory");
return ret;
}
// Write `count` bytes starting at `buf` to descriptor `fd` using syscall
// number 1 (write on x86-64 Linux), bypassing libc.
//
// Arguments go in rdi, rsi and rdx. The kernel's raw rax value is returned
// unchanged; nothing here retries short writes or translates errors.
static uint64_t __write(uint64_t fd, const void *buf, uint64_t count) {
uint64_t ret;
// rcx/r11 are clobbered by the `syscall` instruction itself.
__asm__ __volatile__ (
"movq $1, %%rax\n"
"syscall\n"
: "=a"(ret)
: "D"(fd), "S"(buf), "d"(count)
: "cc", "rcx", "r11", "memory");
return ret;
}
// Reposition the file offset of `fd` using syscall number 8 (lseek on x86-64
// Linux), bypassing libc.
//
// fd, pos and whence are passed in rdi, rsi and rdx. Returns the raw value the
// kernel leaves in rax, with no error translation.
static uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) {
uint64_t ret;
// rcx/r11 are clobbered by the `syscall` instruction itself.
__asm__ __volatile__ (
"movq $8, %%rax\n"
"syscall\n"
: "=a"(ret)
: "D"(fd), "S"(pos), "d"(whence)
: "cc", "rcx", "r11", "memory");
return ret;
}
// Close descriptor `fd` using syscall number 3 (close on x86-64 Linux),
// bypassing libc.
//
// The kernel result is captured as a 64-bit value and then narrowed to the
// int return type.
static int __close(uint64_t fd) {
uint64_t ret;
// rcx/r11 are clobbered by the `syscall` instruction itself.
__asm__ __volatile__ (
"movq $3, %%rax\n"
"syscall\n"
: "=a"(ret)
: "D"(fd)
: "cc", "rcx", "r11", "memory");
return ret;
}
// Create a memory mapping using syscall number 9 (mmap on x86-64 Linux),
// bypassing libc.
//
// addr, size and prot are passed in rdi, rsi and rdx. The remaining three
// arguments are pinned to the registers the kernel expects through explicit
// register variables: flags in r10, fd in r8 and offset in r9.
//
// Returns the raw value the kernel leaves in rax, reinterpreted as a pointer;
// it is not normalised to MAP_FAILED, so callers have to recognise an error
// value themselves.
static void *__mmap(uint64_t addr, uint64_t size, uint64_t prot,
uint64_t flags, uint64_t fd, uint64_t offset) {
void *ret;
register uint64_t r8 asm("r8") = fd;
register uint64_t r9 asm("r9") = offset;
register uint64_t r10 asm("r10") = flags;
// rcx/r11 are clobbered by the `syscall` instruction itself.
__asm__ __volatile__ (
"movq $9, %%rax\n"
"syscall\n"
: "=a"(ret)
: "D"(addr), "S"(size), "d"(prot), "r"(r10), "r"(r8), "r"(r9)
: "cc", "rcx", "r11", "memory");
return ret;
}
// Remove the mapping of `size` bytes starting at `addr` using syscall number
// 11 (munmap on x86-64 Linux), bypassing libc.
//
// addr and size are passed in rdi and rsi. Returns the raw value the kernel
// leaves in rax, with no error translation.
static uint64_t __munmap(void *addr, uint64_t size) {
uint64_t ret;
// rcx/r11 are clobbered by the `syscall` instruction itself.
__asm__ __volatile__ (
"movq $11, %%rax\n"
"syscall\n"
: "=a"(ret)
: "D"(addr), "S"(size)
: "cc", "rcx", "r11", "memory");
return ret;
}
// Terminate the process with status `code` using syscall number 231
// (exit_group on x86-64 Linux, which ends every thread of the process),
// bypassing libc.
//
// The function is declared as returning the kernel result, but a successful
// exit_group does not return to the caller.
static uint64_t __exit(uint64_t code) {
uint64_t ret;
// rcx/r11 are clobbered by the `syscall` instruction itself.
__asm__ __volatile__ (
"movq $231, %%rax\n"
"syscall\n"
: "=a"(ret)
: "D"(code)
: "cc", "rcx", "r11", "memory");
return ret;
}
// Helper functions for writing strings to the .fdata file
// Write number Num using Base to the buffer in OutBuf, returns a pointer to
// the end of the string.
//
// Digits are lowercase ("0123456789abcdef"). No terminating NUL is written,
// so the caller continues appending at the returned pointer. OutBuf must have
// room for up to 32 characters (the base-2 worst case for a 32-bit value).
//
// Base must be in the range [2, 16]. For any other base nothing is written
// and OutBuf is returned unchanged: base 0 would divide by zero, base 1 would
// never terminate, and bases above 16 have no digit characters.
static char *intToStr(char *OutBuf, uint32_t Num, uint32_t Base) {
  if (Base < 2 || Base > 16)
    return OutBuf;

  const char *Chars = "0123456789abcdef";
  // Digits are produced least-significant first, so they are staged here and
  // copied out in reverse. 32 entries cover base 2, where every bit of Num
  // becomes one character.
  char Buf[32];
  char *Ptr = Buf;
  // do/while so that Num == 0 still yields the single digit '0'.
  do {
    *Ptr++ = Chars[Num % Base];
    Num /= Base;
  } while (Num);

  while (Ptr != Buf)
    *OutBuf++ = *--Ptr;
  return OutBuf;
}
// Append the characters of the NUL-terminated string Str to OutBuf and return
// the position one past the last character written. The terminating NUL itself
// is not copied, so further output can be appended at the returned pointer.
static char *strCopy(char *OutBuf, const char *Str) {
  for (const char *Src = Str; *Src != '\0'; ++Src, ++OutBuf)
    *OutBuf = *Src;
  return OutBuf;
}
// Emit the first Size bytes of Msg on file descriptor 2 (stderr) and then
// terminate the process with exit status 1.
static void reportError(const char *Msg, uint64_t Size) {
  const uint64_t StdErrFD = 2;
  const uint64_t FailureStatus = 1;
  __write(StdErrFD, Msg, Size);
  __exit(FailureStatus);
}
// Perform a string comparison and returns zero if Str1 matches Str2. Compares
// at most Size characters.
//
// Returns 0 when the strings agree on every character examined, i.e. either
// both end (NUL) at the same position or the first Size characters are equal.
// Returns 1 as soon as a differing character is found. A Size of zero or less
// means no characters are compared, so the result is 0; this keeps the
// comparison bounded instead of scanning until a mismatch or NUL.
static int compareStr(const char *Str1, const char *Str2, int Size) {
  for (; Size > 0; --Size, ++Str1, ++Str2) {
    if (*Str1 != *Str2)
      return 1;
    // Both strings ended together: equal.
    if (*Str1 == '\0')
      return 0;
  }
  // Character budget exhausted without finding a difference.
  return 0;
}
// Append to OutBuf the textual form of the program point Loc: the literal
// location type, the function name looked up in Info's string table, and the
// offset in hexadecimal, each followed by a single space. Returns a pointer
// just past the last character written; no terminating NUL is added.
//
// NOTE(review): nothing here bounds the write against the size of OutBuf, so
// the caller's buffer has to be large enough for the longest function name.
static char *serializeLoc(const InstrumentationInfo &Info, char *OutBuf,
                          const Location Loc) {
  // fdata location format: Type Name Offset
  // Type 1 - regular symbol
  char *Cursor = strCopy(OutBuf, "1 ");
  // Loc.FunctionName is a byte offset into the string table.
  Cursor = strCopy(Cursor, Info.Strings + Loc.FunctionName);
  *Cursor++ = ' ';
  Cursor = intToStr(Cursor, Loc.Offset, 16);
  *Cursor++ = ' ';
  return Cursor;
}
// Read and map to memory the descriptions written by BOLT into the executable's
// notes section.
//
// Opens /proc/self/exe, maps the whole file read-only, and walks the ELF
// section headers looking for .bolt.instr.tables. On success the returned
// structure holds pointers into the mapping (edge descriptions and string
// table) together with the descriptor, mapping base and mapping size.
//
// If the binary cannot be opened or mapped, or the section is missing, an
// error is printed through reportError(), which terminates the process.
static InstrumentationInfo readDescriptions() {
  InstrumentationInfo Result;
  Result.Descriptions = nullptr;
  Result.Strings = nullptr;

  const uint64_t FD = __open("/proc/self/exe",
                             /*flags=*/0 /*O_RDONLY*/,
                             /*mode=*/0666);
  // The syscall wrappers return the kernel's raw result; a failure comes back
  // as a negative errno value rather than through errno.
  if (static_cast<int64_t>(FD) < 0) {
    const char OpenErrMsg[] =
        "BOLT instrumentation runtime error: could not open /proc/self/exe\n";
    reportError(OpenErrMsg, sizeof(OpenErrMsg) - 1);
  }
  Result.FileDesc = FD;

  // mmap our binary to memory
  const uint64_t Size = __lseek(FD, 0, 2 /*SEEK_END*/);
  void *Mapping =
      __mmap(0, Size, 0x1 /* PROT_READ*/, 0x2 /* MAP_PRIVATE*/, FD, 0);
  // A failed mmap yields an address in [-4095, -1]. A failed lseek above is
  // caught here as well, since its error value is not a mappable size.
  if (reinterpret_cast<uint64_t>(Mapping) > static_cast<uint64_t>(-4096)) {
    const char MapErrMsg[] =
        "BOLT instrumentation runtime error: could not map /proc/self/exe\n";
    reportError(MapErrMsg, sizeof(MapErrMsg) - 1);
  }
  uint8_t *BinContents = reinterpret_cast<uint8_t *>(Mapping);
  Result.MMapPtr = BinContents;
  // NOTE(review): MMapSize is an int, so sizes that do not fit are narrowed.
  Result.MMapSize = Size;

  Elf64_Ehdr *Hdr = reinterpret_cast<Elf64_Ehdr *>(BinContents);
  uint8_t *SectionHdrs = BinContents + Hdr->e_shoff;
  // Header of the section that stores the section names.
  Elf64_Shdr *StringTblHeader = reinterpret_cast<Elf64_Shdr *>(
      SectionHdrs + Hdr->e_shstrndx * Hdr->e_shentsize);

  // Find .bolt.instr.tables with the data we need and set pointers to it
  for (int I = 0; I < Hdr->e_shnum; ++I) {
    Elf64_Shdr *Shdr =
        reinterpret_cast<Elf64_Shdr *>(SectionHdrs + I * Hdr->e_shentsize);
    char *SecName = reinterpret_cast<char *>(
        BinContents + StringTblHeader->sh_offset + Shdr->sh_name);
    if (compareStr(SecName, ".bolt.instr.tables", 64) != 0)
      continue;

    // Actual contents of the ELF note start after offset 20 decimal:
    // Offset 0: Producer name size (4 bytes)
    // Offset 4: Contents size (4 bytes)
    // Offset 8: Note type (4 bytes)
    // Offset 12: Producer name (BOLT\0) (5 bytes + align to 4-byte boundary)
    // Offset 20: Contents
    uint8_t *Contents = BinContents + Shdr->sh_offset + 20;
    Result.Descriptions = reinterpret_cast<EdgeDescription *>(Contents);
    // String table is located after the full EdgeDescriptions table containing
    // __bolt_instr_num_locs entries is finished
    Result.Strings = reinterpret_cast<char *>(
        Contents + (__bolt_instr_num_locs * sizeof(EdgeDescription)));
    return Result;
  }

  const char ErrMsg[] =
      "BOLT instrumentation runtime error: could not find section "
      ".bolt.instr.tables\n";
  // sizeof - 1: do not write the terminating NUL to stderr.
  reportError(ErrMsg, sizeof(ErrMsg) - 1);
  return Result;
}
// This is the entry point called at program exit. BOLT patches the executable's
// FINI entry in the .dynamic section with the address of this function. Our
// goal here is to flush to disk all instrumentation data in memory, using
// BOLT's fdata format.
extern "C" void __bolt_instr_data_dump() {
const InstrumentationInfo Info = readDescriptions();
uint64_t FD = __open(__bolt_instr_filename,
/*flags=*/0x241 /*O_WRONLY|O_TRUNC|O_CREAT*/,
/*mode=*/0666);
for (int I = 0, E = __bolt_instr_num_locs; I < E; ++I) {
char LineBuf[2000];
char *Ptr = LineBuf;
uint32_t HitCount = __bolt_instr_locations[I];
if (!HitCount)
continue;
EdgeDescription *Desc = &Info.Descriptions[I];
Ptr = serializeLoc(Info, Ptr, Desc->From);
Ptr = serializeLoc(Info, Ptr, Desc->To);
Ptr = strCopy(Ptr, "0 ");
Ptr = intToStr(Ptr, HitCount, 10);
*Ptr++ = '\n';
__write(FD, LineBuf, Ptr - LineBuf);
}
__close(FD);
__munmap(Info.MMapPtr, Info.MMapSize);
__close(Info.FileDesc);
}

View File

@ -12,7 +12,6 @@
#include "BinaryBasicBlock.h"
#include "BinaryContext.h"
#include "BinaryFunction.h"
#include "ParallelUtilities.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
@ -97,10 +96,6 @@ bool BinaryBasicBlock::validateSuccessorInvariants() {
}
}
} else {
// Unknown control flow.
if (Inst && BC.MIB->isIndirectBranch(*Inst))
return true;
const MCSymbol *TBB = nullptr;
const MCSymbol *FBB = nullptr;
MCInst *CondBranch = nullptr;
@ -260,7 +255,7 @@ void BinaryBasicBlock::replaceSuccessor(BinaryBasicBlock *Succ,
BinaryBasicBlock *NewSucc,
uint64_t Count,
uint64_t MispredictedCount) {
Succ->removePredecessor(this, /*Multiple=*/false);
Succ->removePredecessor(this);
auto I = succ_begin();
auto BI = BranchInfo.begin();
for (; I != succ_end(); ++I) {
@ -285,7 +280,7 @@ void BinaryBasicBlock::removeAllSuccessors() {
}
void BinaryBasicBlock::removeSuccessor(BinaryBasicBlock *Succ) {
Succ->removePredecessor(this, /*Multiple=*/false);
Succ->removePredecessor(this);
auto I = succ_begin();
auto BI = BranchInfo.begin();
for (; I != succ_end(); ++I) {
@ -304,16 +299,13 @@ void BinaryBasicBlock::addPredecessor(BinaryBasicBlock *Pred) {
Predecessors.push_back(Pred);
}
void BinaryBasicBlock::removePredecessor(BinaryBasicBlock *Pred,
bool Multiple) {
void BinaryBasicBlock::removePredecessor(BinaryBasicBlock *Pred) {
// Note: the predecessor could be listed multiple times.
bool Erased{false};
for (auto PredI = Predecessors.begin(); PredI != Predecessors.end(); ) {
if (*PredI == Pred) {
Erased = true;
PredI = Predecessors.erase(PredI);
if (!Multiple)
return;
} else {
++PredI;
}
@ -456,7 +448,6 @@ void BinaryBasicBlock::addBranchInstruction(const BinaryBasicBlock *Successor) {
assert(isSuccessor(Successor));
auto &BC = Function->getBinaryContext();
MCInst NewInst;
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
BC.MIB->createUncondBranch(NewInst, Successor->getLabel(), BC.Ctx.get());
Instructions.emplace_back(std::move(NewInst));
}
@ -539,8 +530,8 @@ void BinaryBasicBlock::dump() const {
outs() << "\n";
}
uint64_t BinaryBasicBlock::estimateSize(const MCCodeEmitter *Emitter) const {
return Function->getBinaryContext().computeCodeSize(begin(), end(), Emitter);
uint64_t BinaryBasicBlock::estimateSize() const {
return Function->getBinaryContext().computeCodeSize(begin(), end());
}
BinaryBasicBlock::BinaryBranchInfo &

View File

@ -16,15 +16,14 @@
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/raw_ostream.h"
#include <limits>
#include <set>
#include <utility>
#include <set>
namespace llvm {
@ -50,12 +49,6 @@ public:
struct BinaryBranchInfo {
  uint64_t Count;
  uint64_t MispredictedCount; /// number of branches mispredicted

  /// Lexicographic ordering: compare by Count first and fall back to
  /// MispredictedCount only when the counts are equal.
  bool operator<(const BinaryBranchInfo &Other) const {
    if (Count != Other.Count)
      return Count < Other.Count;
    return MispredictedCount < Other.MispredictedCount;
  }
};
static constexpr uint32_t INVALID_OFFSET =
@ -365,17 +358,13 @@ public:
/// Find the fallthrough successor for a block, or nullptr if there is
/// none.
BinaryBasicBlock* getFallthrough() {
const BinaryBasicBlock* getFallthrough() const {
if (succ_size() == 2)
return getConditionalSuccessor(false);
else
return getSuccessor();
}
const BinaryBasicBlock *getFallthrough() const {
return const_cast<BinaryBasicBlock *>(this)->getFallthrough();
}
/// Return branch info corresponding to a taken branch.
const BinaryBranchInfo &getTakenBranchInfo() const {
assert(BranchInfo.size() == 2 &&
@ -461,13 +450,6 @@ public:
}
}
/// Add a range of instructions to the end of this basic block.
template <typename RangeTy>
void addInstructions(RangeTy R) {
for (auto &I : R)
addInstruction(I);
}
/// Add instruction before Pos in this basic block.
template <typename Itr>
Itr insertPseudoInstr(Itr Pos, MCInst &Instr) {
@ -758,11 +740,6 @@ public:
return Instructions.emplace(At, std::move(NewInst));
}
iterator insertInstruction(iterator At, MCInst &NewInst) {
adjustNumPseudos(NewInst, 1);
return Instructions.emplace(At, NewInst);
}
/// Helper to retrieve any terminators in \p BB before \p Pos. This is used
/// to skip CFI instructions and to retrieve the first terminator instruction
/// in basic blocks with two terminators (conditional jump and unconditional
@ -871,11 +848,8 @@ public:
return InputRange.second - InputRange.first;
}
/// Returns an estimate of size of basic block during run time optionally
/// using a user-supplied emitter for lock-free multi-thread work.
/// MCCodeEmitter is not thread safe and each thread should operate with its
/// own copy of it.
uint64_t estimateSize(const MCCodeEmitter *Emitter = nullptr) const;
/// Returns an estimate of size of basic block during run time.
uint64_t estimateSize() const;
/// Return index in the current layout. The user is responsible for
/// making sure the indices are up to date,
@ -910,10 +884,7 @@ private:
/// Remove predecessor of the basic block. Don't use directly, instead
/// use removeSuccessor() function.
/// If \p Multiple is set to true, it will remove all predecessors that
/// are equal to \p Pred. Otherwise, the first instance of \p Pred found
/// will be removed. This only matters in awkward, redundant CFGs.
void removePredecessor(BinaryBasicBlock *Pred, bool Multiple=true);
void removePredecessor(BinaryBasicBlock *Pred);
/// Return offset of the basic block from the function start.
uint32_t getOffset() const {

File diff suppressed because it is too large Load Diff

View File

@ -17,10 +17,8 @@
#include "BinaryData.h"
#include "BinarySection.h"
#include "DebugData.h"
#include "JumpTable.h"
#include "MCPlusBuilder.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
@ -34,7 +32,6 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Object/ObjectFile.h"
@ -44,10 +41,8 @@
#include <functional>
#include <map>
#include <set>
#include <shared_mutex>
#include <string>
#include <system_error>
#include <type_traits>
#include <unordered_map>
#include <vector>
@ -60,21 +55,8 @@ using namespace object;
namespace bolt {
class BinaryFunction;
class BinaryBasicBlock;
class DataReader;
enum class MemoryContentsType : char {
UNKNOWN = 0, /// Unknown contents.
POSSIBLE_JUMP_TABLE, /// Possibly a non-PIC jump table.
POSSIBLE_PIC_JUMP_TABLE, /// Possibly a PIC jump table.
};
/// Release the storage held by \p List by swapping its contents into an empty,
/// default-constructed temporary that is destroyed immediately afterwards.
template <typename T> void clearList(T &List) {
  T().swap(List);
}
/// Helper function to truncate a \p Value to given size in \p Bytes.
inline int64_t truncateToSize(int64_t Value, unsigned Bytes) {
return Value & ((uint64_t) (int64_t) -1 >> (64 - Bytes * 8));
@ -155,23 +137,9 @@ class BinaryContext {
/// Low level section registration.
BinarySection &registerSection(BinarySection *Section);
/// Store all functions in the binary, sorted by original address.
std::map<uint64_t, BinaryFunction> BinaryFunctions;
/// A mutex that is used to control parallel accesses to BinaryFunctions
mutable std::shared_timed_mutex BinaryFunctionsMutex;
/// Functions injected by BOLT
std::vector<BinaryFunction *> InjectedBinaryFunctions;
/// Jump tables for all functions mapped by address.
std::map<uint64_t, JumpTable *> JumpTables;
/// Used in duplicateJumpTable() to uniquely identify a JT clone
/// Start our IDs with a high number so getJumpTableContainingAddress checks
/// with size won't overflow
uint32_t DuplicatedJumpTables{0x10000000};
public:
/// [name] -> [BinaryData*] map used for global symbol resolution.
using SymbolMapType = std::map<std::string, BinaryData *>;
@ -192,58 +160,6 @@ public:
FilterIterator<binary_data_const_iterator>;
using FilteredBinaryDataIterator = FilterIterator<binary_data_iterator>;
/// Return BinaryFunction containing a given \p Address or nullptr if
/// no registered function has it.
///
/// In a binary a function has somewhat vague boundaries. E.g. a function can
/// refer to the first byte past the end of the function, and it will still be
/// referring to this function, not the function following it in the address
/// space. Thus we have the following flags that allow to lookup for
/// a function where a caller has more context for the search.
///
/// If \p CheckPastEnd is true and the \p Address falls on a byte
/// immediately following the last byte of some function and there's no other
/// function that starts there, then return the function as the one containing
/// the \p Address. This is useful when we need to locate functions for
/// references pointing immediately past a function body.
///
/// If \p UseMaxSize is true, then include the space between this function
/// body and the next object in address ranges that we check.
BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address,
bool CheckPastEnd = false,
bool UseMaxSize = false,
bool Shallow = false);
/// Return BinaryFunction which has a fragment that starts at a given
/// \p Address. If the BinaryFunction is a child fragment, then return its
/// parent unless \p Shallow parameter is set to true.
BinaryFunction *getBinaryFunctionAtAddress(uint64_t Address,
bool Shallow = false);
/// Const overload of the lookup by fragment start \p Address; forwards to the
/// non-const getBinaryFunctionAtAddress() with the same \p Shallow flag.
const BinaryFunction *getBinaryFunctionAtAddress(uint64_t Address,
                                                 bool Shallow = false) const {
  auto *MutableThis = const_cast<BinaryContext *>(this);
  return MutableThis->getBinaryFunctionAtAddress(Address, Shallow);
}
/// Return the size of one entry of a jump table of the given \p Type:
/// 4 for JumpTable::JTT_PIC tables, and the code pointer size reported by
/// AsmInfo for every other type.
uint64_t getJumpTableEntrySize(JumpTable::JumpTableType Type) const {
  if (Type == JumpTable::JTT_PIC)
    return 4;
  return AsmInfo->getCodePointerSize();
}
/// Return the JumpTable whose range [start, start + size) contains
/// \p Address, or nullptr if there is none. A table whose size is zero is
/// still returned when \p Address equals its start address.
JumpTable *getJumpTableContainingAddress(uint64_t Address) {
  // Find the first table starting strictly after Address; the only possible
  // match is the table right before it.
  auto NextJT = JumpTables.upper_bound(Address);
  if (NextJT == JumpTables.begin())
    return nullptr;
  --NextJT;

  const uint64_t Start = NextJT->first;
  JumpTable *Candidate = NextJT->second;
  const auto Size = Candidate->getSize();

  const bool CoversAddress = Start + Size > Address;
  const bool EmptyTableAtAddress = Size == 0 && Start == Address;
  return (CoversAddress || EmptyTableAtAddress) ? Candidate : nullptr;
}
/// [MCSymbol] -> [BinaryFunction]
///
/// As we fold identical functions, multiple symbols can point
@ -251,9 +167,6 @@ public:
std::unordered_map<const MCSymbol *,
BinaryFunction *> SymbolToFunctionMap;
/// A mutex that is used to control parallel accesses to SymbolToFunctionMap
mutable std::shared_timed_mutex SymbolToFunctionMapMutex;
/// Look up the symbol entry that contains the given \p Address (based on
/// the start address and size for each symbol). Returns a pointer to
/// the BinaryData for that symbol. If no data is found, nullptr is returned.
@ -274,10 +187,6 @@ public:
/// top level BinaryData.
bool validateHoles() const;
/// Produce output address ranges based on input ranges for some module.
DebugAddressRangesVector translateModuleAddressRanges(
const DWARFAddressRangesVector &InputRanges) const;
/// Get a bogus "absolute" section that will be associated with all
/// absolute BinaryDatas.
BinarySection &absoluteSection();
@ -293,25 +202,6 @@ public:
/// is complete, e.g. after building CFGs for all functions.
void assignMemData();
/// Construct BinaryFunction object and add it to internal maps.
BinaryFunction *createBinaryFunction(const std::string &Name,
BinarySection &Section,
uint64_t Address,
uint64_t Size,
bool IsSimple,
uint64_t SymbolSize = 0,
uint16_t Alignment = 0);
/// Return all functions for this rewrite instance.
std::map<uint64_t, BinaryFunction> &getBinaryFunctions() {
return BinaryFunctions;
}
/// Return all functions for this rewrite instance.
const std::map<uint64_t, BinaryFunction> &getBinaryFunctions() const {
return BinaryFunctions;
}
/// Create BOLT-injected function
BinaryFunction *createInjectedBinaryFunction(const std::string &Name,
bool IsSimple = true);
@ -320,54 +210,7 @@ public:
return InjectedBinaryFunctions;
}
/// Construct a jump table for \p Function at \p Address or return an existing
/// one at that location.
///
/// May create an embedded jump table and return its label as the second
/// element of the pair.
const MCSymbol *getOrCreateJumpTable(BinaryFunction &Function,
uint64_t Address,
JumpTable::JumpTableType Type);
/// Analyze a possible jump table of type \p Type at a given \p Address.
/// \p BF is a function referencing the jump table.
/// Return true if the jump table was detected at \p Address, and false
/// otherwise.
///
/// If \p NextJTAddress is different from zero, it is used as an upper
/// bound for jump table memory layout.
///
/// Optionally, populate \p Offsets with jump table entries. The entries
/// could be partially populated if the jump table detection fails.
bool analyzeJumpTable(const uint64_t Address,
const JumpTable::JumpTableType Type,
const BinaryFunction &BF,
const uint64_t NextJTAddress = 0,
JumpTable::OffsetsType *Offsets = nullptr);
/// After jump table locations are established, this function will populate
/// their OffsetEntries based on memory contents.
void populateJumpTables();
/// Returns a jump table ID and label pointing to the duplicated jump table.
/// Ordinarily, jump tables are identified by their address in the input
/// binary. We return an ID with the high bit set to differentiate it from
/// regular addresses, avoiding conflicts with standard jump tables.
std::pair<uint64_t, const MCSymbol *>
duplicateJumpTable(BinaryFunction &Function, JumpTable *JT,
const MCSymbol *OldLabel);
/// Generate a unique name for jump table at a given \p Address belonging
/// to function \p BF.
std::string generateJumpTableName(const BinaryFunction &BF, uint64_t Address);
/// Return true if the array of bytes represents a valid code padding.
bool hasValidCodePadding(const BinaryFunction &BF);
/// Verify padding area between functions, and adjust max function size
/// accordingly.
void adjustCodePadding();
public:
/// Regular page size.
static constexpr unsigned RegularPageSize = 0x1000;
@ -377,20 +220,13 @@ public:
/// Map address to a constant island owner (constant data in code section)
std::map<uint64_t, BinaryFunction *> AddressToConstantIslandMap;
/// A map from jump table address to insertion order. Used for generating
/// jump table names.
std::map<uint64_t, size_t> JumpTableIds;
/// Set of addresses in the code that are not a function start, and are
/// referenced from outside of containing function. E.g. this could happen
/// when a function has more than a single entry point.
std::set<std::pair<BinaryFunction *, uint64_t>> InterproceduralReferences;
std::set<uint64_t> InterproceduralReferences;
std::unique_ptr<MCContext> Ctx;
/// A mutex that is used to control parallel accesses to Ctx
mutable std::shared_timed_mutex CtxMutex;
std::unique_ptr<DWARFContext> DwCtx;
std::unique_ptr<Triple> TheTriple;
@ -464,9 +300,6 @@ public:
/// List of functions that always trap.
std::vector<const BinaryFunction *> TrappedFunctions;
/// Map SDT locations to SDT markers info
std::unordered_map<uint64_t, SDTMarkerInfo> SDTMarkers;
BinaryContext(std::unique_ptr<MCContext> Ctx,
std::unique_ptr<DWARFContext> DwCtx,
std::unique_ptr<Triple> TheTriple,
@ -550,25 +383,6 @@ public:
BinaryDataMap.clear();
}
/// Process \p Address reference from code in function \p BF.
/// \p IsPCRel indicates if the reference is PC-relative.
/// Return <Symbol, Addend> pair corresponding to the \p Address.
std::pair<const MCSymbol *, uint64_t> handleAddressRef(uint64_t Address,
BinaryFunction &BF,
bool IsPCRel);
/// Analyze memory contents at the given \p Address and return the type of
/// memory contents (such as a possible jump table).
MemoryContentsType analyzeMemoryAt(uint64_t Address, BinaryFunction &BF);
/// Return the address of the BinaryData registered under the name of the
/// global \p Symbol, or std::errc::bad_address if no BinaryData is found
/// under that name.
ErrorOr<uint64_t> getSymbolValue(const MCSymbol &Symbol) const {
  if (const auto *BD = getBinaryDataByName(Symbol.getName()))
    return BD->getAddress();
  return std::make_error_code(std::errc::bad_address);
}
/// Return a global symbol registered at a given \p Address and \p Size.
/// If no symbol exists, create one with unique name using \p Prefix.
@ -634,65 +448,6 @@ public:
return Itr != GlobalSymbols.end() ? Itr->second : nullptr;
}
/// Return true if \p Name was generated internally and was not present in
/// the input binary, i.e. it starts with one of the prefixes "SYMBOLat",
/// "DATAat" or "HOLEat".
bool isInternalSymbolName(const StringRef Name) {
  if (Name.startswith("SYMBOLat"))
    return true;
  if (Name.startswith("DATAat"))
    return true;
  return Name.startswith("HOLEat");
}
/// Return the symbol named "__hot_start", obtained from
/// Ctx->getOrCreateSymbol().
MCSymbol *getHotTextStartSymbol() const {
return Ctx->getOrCreateSymbol("__hot_start");
}
/// Return the symbol named "__hot_end", obtained from
/// Ctx->getOrCreateSymbol().
MCSymbol *getHotTextEndSymbol() const {
return Ctx->getOrCreateSymbol("__hot_end");
}
/// Return the text section reported by MOFI.
MCSection *getTextSection() const {
return MOFI->getTextSection();
}
/// Return the ELF section named \p SectionName as requested from Ctx with
/// type SHT_PROGBITS and flags SHF_EXECINSTR | SHF_ALLOC.
MCSection *getCodeSection(StringRef SectionName) const {
  const unsigned CodeFlags = ELF::SHF_EXECINSTR | ELF::SHF_ALLOC;
  return Ctx->getELFSection(SectionName, ELF::SHT_PROGBITS, CodeFlags);
}
/// \name Pre-assigned Section Names
/// @{
/// Name of the main code section (".text").
const char *getMainCodeSectionName() const {
return ".text";
}
/// Name of the cold code section (".text.cold").
const char *getColdCodeSectionName() const {
return ".text.cold";
}
/// Name of the hot text mover section (".text.mover").
const char *getHotTextMoverSectionName() const {
return ".text.mover";
}
/// Name of the section for injected code (".text.injected").
const char *getInjectedCodeSectionName() const {
return ".text.injected";
}
/// Name of the section for injected cold code (".text.injected.cold").
const char *getInjectedColdCodeSectionName() const {
return ".text.injected.cold";
}
/// Return the result of getUniqueSectionByName(".gdb_index"): the section
/// reference on success, an error otherwise.
ErrorOr<BinarySection &> getGdbIndexSection() const {
return getUniqueSectionByName(".gdb_index");
}
/// @}
/// Resolve inter-procedural dependencies.
void processInterproceduralReferences();
/// Perform any necessary post processing on the symbol table after
/// function disassembly is complete. This processing fixes top
/// level data holes and makes sure the symbol table is valid.
@ -780,19 +535,6 @@ public:
Sections.end()));
}
/// Return a range over the registered sections that are non-null,
/// allocatable and marked as text.
iterator_range<FilteredSectionIterator> textSections() {
  auto IsAllocatableText = [](const SectionIterator &Itr) {
    if (!*Itr)
      return false;
    return Itr->isAllocatable() && Itr->isText();
  };
  FilteredSectionIterator First(IsAllocatableText,
                                Sections.begin(),
                                Sections.end());
  FilteredSectionIterator Last(IsAllocatableText,
                               Sections.end(),
                               Sections.end());
  return make_range(First, Last);
}
/// Iterate over all registered allocatable sections.
iterator_range<FilteredSectionConstIterator> allocatableSections() const {
return const_cast<BinaryContext *>(this)->allocatableSections();
@ -844,9 +586,7 @@ public:
/// functions only work for allocatable sections, i.e. ones with non-zero
/// addresses.
ErrorOr<BinarySection &> getSectionForAddress(uint64_t Address);
ErrorOr<const BinarySection &> getSectionForAddress(uint64_t Address) const {
return const_cast<BinaryContext *>(this)->getSectionForAddress(Address);
}
ErrorOr<const BinarySection &> getSectionForAddress(uint64_t Address) const;
/// Return section(s) associated with given \p Name.
iterator_range<NameToSectionMapType::iterator>
@ -858,10 +598,18 @@ public:
return make_range(NameToSection.equal_range(Name));
}
/// Return the unique section associated with given \p Name.
/// Return the unique (allocatable) section associated with given \p Name.
/// If there is more than one section with the same name, return an error
/// object.
ErrorOr<BinarySection &> getUniqueSectionByName(StringRef SectionName) const {
ErrorOr<BinarySection &> getUniqueSectionByName(StringRef SectionName) {
auto Sections = getSectionByName(SectionName);
if (Sections.begin() != Sections.end() &&
std::next(Sections.begin()) == Sections.end())
return *Sections.begin()->second;
return std::make_error_code(std::errc::bad_address);
}
ErrorOr<const BinarySection &>
getUniqueSectionByName(StringRef SectionName) const {
auto Sections = getSectionByName(SectionName);
if (Sections.begin() != Sections.end() &&
std::next(Sections.begin()) == Sections.end())
@ -869,38 +617,22 @@ public:
return std::make_error_code(std::errc::bad_address);
}
/// Return an unsigned value of \p Size stored at \p Address. The address has
/// to be a valid statically allocated address for the binary.
ErrorOr<uint64_t> getUnsignedValueAtAddress(uint64_t Address,
size_t Size) const;
/// Return a signed value of \p Size stored at \p Address. The address has
/// to be a valid statically allocated address for the binary.
ErrorOr<uint64_t> getSignedValueAtAddress(uint64_t Address,
size_t Size) const;
/// Special case of getUnsignedValueAtAddress() that uses a pointer size.
ErrorOr<uint64_t> getPointerAtAddress(uint64_t Address) const {
return getUnsignedValueAtAddress(Address, AsmInfo->getCodePointerSize());
}
/// Given \p Address in the binary, extract and return a pointer value at that
/// address. The address has to be a valid statically allocated address for
/// the binary.
ErrorOr<uint64_t> extractPointerAtAddress(uint64_t Address) const;
/// Replaces all references to \p ChildBF with \p ParentBF. \p ChildBF is then
/// removed from the list of functions \p BFs. The profile data of \p ChildBF
/// is merged into that of \p ParentBF. This function is thread safe.
void foldFunction(BinaryFunction &ChildBF, BinaryFunction &ParentBF);
/// is merged into that of \p ParentBF.
void foldFunction(BinaryFunction &ChildBF,
BinaryFunction &ParentBF,
std::map<uint64_t, BinaryFunction> &BFs);
/// Add a Section relocation at a given \p Address.
void addRelocation(uint64_t Address, MCSymbol *Symbol, uint64_t Type,
uint64_t Addend = 0, uint64_t Value = 0);
/// All PC-relative relocations in data objects.
std::map<uint64_t, std::pair<uint64_t, uint64_t>> PCRelocation;
/// Record a PC-relative relocation for \p Address in PCRelocation as the
/// pair (\p Type, \p Value). An entry already stored for \p Address is
/// overwritten.
void addPCRelativeDataRelocation(uint64_t Address, uint64_t Type,
                                 uint64_t Value) {
  PCRelocation[Address] = {Type, Value};
}
/// Remove registered relocation at a given \p Address.
bool removeRelocationAt(uint64_t Address);
@ -908,15 +640,12 @@ public:
/// is no relocation at such address.
const Relocation *getRelocationAt(uint64_t Address);
/// Return the function mapped to \p Symbol in SymbolToFunctionMap, or
/// nullptr if the symbol has no entry. This function is thread safe: the
/// lookup runs under a shared lock on SymbolToFunctionMapMutex.
const BinaryFunction *getFunctionForSymbol(const MCSymbol *Symbol) const {
  std::shared_lock<std::shared_timed_mutex> Lock(SymbolToFunctionMapMutex);
  const auto Found = SymbolToFunctionMap.find(Symbol);
  if (Found == SymbolToFunctionMap.end())
    return nullptr;
  return Found->second;
}
/// Non-const flavor: return the function mapped to \p Symbol in
/// SymbolToFunctionMap, or nullptr if the symbol has no entry. The lookup
/// runs under a shared lock on SymbolToFunctionMapMutex.
BinaryFunction *getFunctionForSymbol(const MCSymbol *Symbol) {
  std::shared_lock<std::shared_timed_mutex> Lock(SymbolToFunctionMapMutex);
  const auto Found = SymbolToFunctionMap.find(Symbol);
  if (Found == SymbolToFunctionMap.end())
    return nullptr;
  return Found->second;
}
@ -928,7 +657,8 @@ public:
}
/// Populate some internal data structures with debug info.
void preprocessDebugInfo();
void preprocessDebugInfo(
std::map<uint64_t, BinaryFunction> &BinaryFunctions);
/// Add a filename entry from SrcCUID to DestCUID.
unsigned addDebugFilenameToUnit(const uint32_t DestCUID,
@ -936,7 +666,8 @@ public:
unsigned FileIndex);
/// Return functions in output layout order
std::vector<BinaryFunction *> getSortedFunctions();
static std::vector<BinaryFunction *>
getSortedFunctions(std::map<uint64_t, BinaryFunction> &BinaryFunctions);
/// Do the best effort to calculate the size of the function by emitting
/// its code, and relaxing branch instructions.
@ -945,33 +676,26 @@ public:
/// size is for the cold one.
std::pair<size_t, size_t> calculateEmittedSize(BinaryFunction &BF);
/// Calculate the size of the instruction \p Inst optionally using a
/// user-supplied emitter for lock-free multi-thread work. MCCodeEmitter is
/// not thread safe and each thread should operate with its own copy of it.
uint64_t
computeInstructionSize(const MCInst &Inst,
const MCCodeEmitter *Emitter = nullptr) const {
if (!Emitter)
Emitter = this->MCE.get();
/// Calculate the size of the instruction \p Inst.
uint64_t computeInstructionSize(const MCInst &Inst) const {
SmallString<256> Code;
SmallVector<MCFixup, 4> Fixups;
raw_svector_ostream VecOS(Code);
Emitter->encodeInstruction(Inst, VecOS, Fixups, *STI);
MCE->encodeInstruction(Inst, VecOS, Fixups, *STI);
return Code.size();
}
/// Compute the native code size for a range of instructions.
/// Note: this can be imprecise wrt the final binary since happening prior to
/// relaxation, as well as wrt the original binary because of opcode
/// shortening. MCCodeEmitter is not thread safe and each thread should operate
/// with its own copy of it.
/// shortening.
template <typename Itr>
uint64_t computeCodeSize(Itr Beg, Itr End,
const MCCodeEmitter *Emitter = nullptr) const {
uint64_t computeCodeSize(Itr Beg, Itr End) const {
uint64_t Size = 0;
while (Beg != End) {
if (!MII->get(Beg->getOpcode()).isPseudo())
Size += computeInstructionSize(*Beg, Emitter);
Size += computeInstructionSize(*Beg);
++Beg;
}
return Size;
@ -1036,44 +760,8 @@ public:
void exitWithBugReport(StringRef Message,
const BinaryFunction &Function) const;
/// Bundle of the objects that make up an independent code emitter, as
/// returned by createIndependentMCCodeEmitter().
///
/// NOTE(review): members are destroyed in reverse declaration order (MCE,
/// then LocalCtx, then LocalMOFI). LocalCtx is constructed with a pointer to
/// LocalMOFI and MCE is created against LocalCtx, so keep this order unless
/// the lifetime requirements are confirmed to allow otherwise.
struct IndependentCodeEmitter {
// Object file info the local context is constructed with.
std::unique_ptr<MCObjectFileInfo> LocalMOFI;
// Context private to this emitter.
std::unique_ptr<MCContext> LocalCtx;
// The code emitter created against LocalCtx.
std::unique_ptr<MCCodeEmitter> MCE;
};
/// Encapsulates an independent MCCodeEmitter that doesn't share resources
/// with the main one available through BinaryContext::MCE, managed by
/// BinaryContext.
/// This is intended to create a lock-free environment for an auxiliary thread
/// that needs to perform work with an MCCodeEmitter that can be transient or
/// won't be used in the main code emitter.
///
/// The returned bundle owns a new MCObjectFileInfo, a new MCContext and the
/// MCCodeEmitter created against that context. AsmInfo, MRI, MII, TheTriple
/// and TheTarget are taken from this BinaryContext.
IndependentCodeEmitter createIndependentMCCodeEmitter() const {
IndependentCodeEmitter MCEInstance;
MCEInstance.LocalMOFI = llvm::make_unique<MCObjectFileInfo>();
// The context is handed the MOFI pointer before the MOFI is initialized;
// the initialization below in turn needs the context, hence this order.
MCEInstance.LocalCtx = llvm::make_unique<MCContext>(
AsmInfo.get(), MRI.get(), MCEInstance.LocalMOFI.get());
MCEInstance.LocalMOFI->InitMCObjectFileInfo(*TheTriple, /*PIC=*/false,
*MCEInstance.LocalCtx);
// reset() takes ownership of the emitter returned by the target.
MCEInstance.MCE.reset(
TheTarget->createMCCodeEmitter(*MII, *MRI, *MCEInstance.LocalCtx));
return MCEInstance;
}
};
/// Print \p ByteArray to \p OS as lowercase two-digit hex values separated
/// by single spaces (e.g. "de ad be ef"). Nothing is printed for an empty
/// array. The overload only participates for element types of size 1.
///
/// \returns \p OS, to allow chaining.
template <typename T,
          typename = std::enable_if_t<sizeof(T) == 1> >
inline raw_ostream &operator<<(raw_ostream &OS,
                               const ArrayRef<T> &ByteArray) {
  const char *Sep = "";
  for (const auto Byte : ByteArray) {
    // Convert through uint8_t before widening. A negative signed byte
    // (char, int8_t) would otherwise be sign-extended by the integer
    // promotion and "%.2x" would print it as "ffffffXX" instead of "XX".
    const unsigned Value = static_cast<unsigned>(static_cast<uint8_t>(Byte));
    OS << Sep << format("%.2x", Value);
    Sep = " ";
  }
  return OS;
}
} // namespace bolt
} // namespace llvm

View File

@ -73,8 +73,8 @@ StringRef BinaryData::getOutputSectionName() const {
}
uint64_t BinaryData::getOutputAddress() const {
assert(OutputSection->getOutputAddress());
return OutputSection->getOutputAddress() + OutputOffset;
assert(OutputSection->getFileAddress());
return OutputSection->getFileAddress() + OutputOffset;
}
uint64_t BinaryData::getOffset() const {

View File

@ -106,7 +106,7 @@ public:
bool isAtomic() const {
return isTopLevelJumpTable() || !Parent;
}
iterator_range<std::vector<std::string>::const_iterator> names() const {
return make_range(Names.begin(), Names.end());
}

File diff suppressed because it is too large Load Diff

View File

@ -18,7 +18,6 @@
#include "BinaryBasicBlock.h"
#include "BinaryContext.h"
#include "BinaryLoop.h"
#include "BinarySection.h"
#include "DataReader.h"
#include "DebugData.h"
#include "JumpTable.h"
@ -41,7 +40,6 @@
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <algorithm>
using namespace llvm::object;
@ -55,6 +53,108 @@ namespace bolt {
using DWARFUnitLineTable = std::pair<DWARFUnit *,
const DWARFDebugLine::LineTable *>;
/// Class encapsulating runtime statistics about an execution unit.
///
/// The set of statistics is described once by the DYNO_STATS X-macro below
/// and expanded several times with different definitions of D():
///   D(enumerator, description, value-expression)
/// In the value-expression, Fn reads the counter stored in Stats, while
/// Fadd(a, b) / Fsub(a, b) derive the value from two other categories (see
/// the const operator[]). The order of the entries fixes the enumerator
/// values and the index of each description in Desc.
class DynoStats {
#define DYNO_STATS\
D(FIRST_DYNO_STAT, "<reserved>", Fn)\
D(FORWARD_COND_BRANCHES, "executed forward branches", Fn)\
D(FORWARD_COND_BRANCHES_TAKEN, "taken forward branches", Fn)\
D(BACKWARD_COND_BRANCHES, "executed backward branches", Fn)\
D(BACKWARD_COND_BRANCHES_TAKEN, "taken backward branches", Fn)\
D(UNCOND_BRANCHES, "executed unconditional branches", Fn)\
D(FUNCTION_CALLS, "all function calls", Fn)\
D(INDIRECT_CALLS, "indirect calls", Fn)\
D(PLT_CALLS, "PLT calls", Fn)\
D(INSTRUCTIONS, "executed instructions", Fn)\
D(LOADS, "executed load instructions", Fn)\
D(STORES, "executed store instructions", Fn)\
D(JUMP_TABLE_BRANCHES, "taken jump table branches", Fn)\
D(ALL_BRANCHES, "total branches",\
Fadd(ALL_CONDITIONAL, UNCOND_BRANCHES))\
D(ALL_TAKEN, "taken branches",\
Fadd(TAKEN_CONDITIONAL, UNCOND_BRANCHES))\
D(NONTAKEN_CONDITIONAL, "non-taken conditional branches",\
Fsub(ALL_CONDITIONAL, TAKEN_CONDITIONAL))\
D(TAKEN_CONDITIONAL, "taken conditional branches",\
Fadd(FORWARD_COND_BRANCHES_TAKEN, BACKWARD_COND_BRANCHES_TAKEN))\
D(ALL_CONDITIONAL, "all conditional branches",\
Fadd(FORWARD_COND_BRANCHES, BACKWARD_COND_BRANCHES))\
D(VENEER_CALLS_AARCH64, "linker-inserted veneer calls", Fn)\
D(LAST_DYNO_STAT, "<reserved>", 0)
public:
/// One enumerator per DYNO_STATS entry, in list order.
#define D(name, ...) name,
enum Category : uint8_t { DYNO_STATS };
#undef D
private:
/// Counter storage indexed by Category.
uint64_t Stats[LAST_DYNO_STAT+1];
/// Copy of the flag passed to the constructor.
bool PrintAArch64Stats;
/// Description string of every DYNO_STATS entry, in list order.
#define D(name, desc, ...) desc,
static constexpr const char *Desc[] = { DYNO_STATS };
#undef D
public:
/// Store \p PrintAArch64Stats and zero the counters with indices in
/// [FIRST_DYNO_STAT, LAST_DYNO_STAT). Stats[LAST_DYNO_STAT] is not written
/// here; the const operator[] returns the literal 0 for that category.
DynoStats(bool PrintAArch64Stats ) {
this->PrintAArch64Stats = PrintAArch64Stats;
for (auto Stat = FIRST_DYNO_STAT + 0; Stat < LAST_DYNO_STAT; ++Stat)
Stats[Stat] = 0;
}
/// Mutable access to the stored counter \p I. Asserts that \p I lies strictly
/// between FIRST_DYNO_STAT and LAST_DYNO_STAT.
/// NOTE(review): this also hands out the slots of derived categories
/// (Fadd/Fsub entries), whose stored value the const accessor never reads.
uint64_t &operator[](size_t I) {
assert(I > FIRST_DYNO_STAT && I < LAST_DYNO_STAT &&
"index out of bounds");
return Stats[I];
}
/// Read the value of category \p I. DYNO_STATS is expanded into one switch
/// case per entry returning that entry's value-expression: the stored
/// counter for Fn entries, a sum or difference of two other categories
/// (evaluated through this same operator) for Fadd/Fsub entries.
uint64_t operator[](size_t I) const {
switch (I) {
#define D(name, desc, func) \
case name: \
return func;
#define Fn Stats[I]
#define Fadd(a, b) operator[](a) + operator[](b)
#define Fsub(a, b) operator[](a) - operator[](b)
// NOTE(review): F, Radd and Rsub are defined here but no entry of the
// DYNO_STATS list above references them.
#define F(a) operator[](a)
#define Radd(a, b) (a + b)
#define Rsub(a, b) (a - b)
DYNO_STATS
#undef Rsub
#undef Radd
#undef F
#undef Fsub
#undef Fadd
#undef Fn
#undef D
default:
llvm_unreachable("index out of bounds");
}
// Every switch path above returns or is marked unreachable.
return 0;
}
void print(raw_ostream &OS, const DynoStats *Other = nullptr) const;
void operator+=(const DynoStats &Other);
bool operator<(const DynoStats &Other) const;
bool operator==(const DynoStats &Other) const;
/// Negation of operator==.
bool operator!=(const DynoStats &Other) const { return !operator==(Other); }
bool lessThan(const DynoStats &Other, ArrayRef<Category> Keys) const;
/// Return the description string of category \p C (no bounds check).
static const char* Description(const Category C) {
return Desc[C];
}
};
/// Stream \p Stats to \p OS by calling DynoStats::print() with a null
/// \c Other argument.
///
/// \returns \p OS, to allow chaining.
inline raw_ostream &operator<<(raw_ostream &OS, const DynoStats &Stats) {
  const DynoStats *NoOther = nullptr;
  Stats.print(OS, NoOther);
  return OS;
}
DynoStats operator+(const DynoStats &A, const DynoStats &B);
/// Types of macro-fusion alignment corrections.
enum MacroFusionType {
MFT_NONE,
@ -202,27 +302,11 @@ private:
std::unique_ptr<BinaryLoopInfo> BLI;
/// All labels in the function that are referenced via relocations from
/// data objects. Typically these are jump table destinations and computed
/// goto labels.
std::set<uint64_t> ExternallyReferencedOffsets;
/// Offsets of indirect branches with unknown destinations.
std::set<uint64_t> UnknownIndirectBranchOffsets;
/// False if the function is too complex to reconstruct its control
/// flow graph.
/// In relocation mode we still disassemble and re-assemble such functions.
bool IsSimple{true};
/// True if the function has an indirect branch with unknown destination.
bool HasUnknownControlFlow{false};
/// The code from inside the function references one of the code locations
/// from the same function as a data, i.e. it's possible the label is used
/// inside an address calculation or could be referenced from outside.
bool HasInternalLabelReference{false};
/// In AArch64, preserve nops to maintain code equal to input (assuming no
/// optimizations are done).
bool PreserveNops{false};
@ -252,15 +336,6 @@ private:
/// destination.
bool HasFixedIndirectBranch{false};
/// Is the function known to exceed its input size?
bool IsLarge{false};
/// True if the function is a fragment of another function. This means that
/// this function could only be entered via its parent or one of its sibling
/// fragments. It could be entered at any basic block. It can also return
/// the control to any basic block of its parent or its sibling.
bool IsFragment{false};
/// The address for the code for this function in codegen memory.
uint64_t ImageAddress{0};
@ -273,12 +348,6 @@ private:
/// Name for the corresponding cold code section.
std::string ColdCodeSectionName;
/// Parent function for split function fragments.
BinaryFunction *ParentFunction{nullptr};
/// All fragments for a parent function.
std::unordered_set<BinaryFunction *> Fragments;
/// The profile data for the number of times the function was executed.
uint64_t ExecutionCount{COUNT_NO_PROFILE};
@ -326,9 +395,6 @@ private:
/// Function order for streaming into the destination binary.
uint32_t Index{-1U};
/// Indicate that the function body has SDT marker
bool HasSDTMarker{false};
/// Get basic block index assuming it belongs to this function.
unsigned getIndex(const BinaryBasicBlock *BB) const {
assert(BB->getIndex() < BasicBlocks.size());
@ -367,7 +433,7 @@ private:
/// Associate DW_CFA_GNU_args_size info with invoke instructions
/// (call instructions with non-empty landing pad).
void propagateGnuArgsSizeInfo(MCPlusBuilder::AllocatorIdTy AllocId);
void propagateGnuArgsSizeInfo();
/// Synchronize branch instructions with CFG.
void postProcessBranches();
@ -385,8 +451,8 @@ private:
std::set<uint64_t> CodeOffsets;
/// The address offset where we emitted the constant island, that is, the
/// chunk of data in the function code area (AArch only)
int64_t OutputDataOffset{0};
int64_t OutputColdDataOffset{0};
int64_t OutputDataOffset;
int64_t OutputColdDataOffset;
/// Map labels to corresponding basic blocks.
std::unordered_map<const MCSymbol *, BinaryBasicBlock *> LabelToBB;
@ -471,20 +537,25 @@ private:
/// function and that apply before the entry basic block).
CFIInstrMapType CIEFrameInstructions;
/// All compound jump tables for this function. This duplicates what's stored
/// in the BinaryContext, but additionally it gives quick access for all
/// jump tables used by this function.
///
/// All compound jump tables for this function.
/// <OriginalAddress> -> <JumpTable *>
std::map<uint64_t, JumpTable *> JumpTables;
/// A map from jump table address to insertion order. Used for generating
/// jump table names.
mutable std::map<uint64_t, size_t> JumpTableIds;
/// Generate a unique name for this jump table at the given address that
/// should be repeatable no matter what the start address of the table is.
std::string generateJumpTableName(uint64_t Address) const;
/// Iterate over all jump tables associated with this function.
iterator_range<std::map<uint64_t, JumpTable *>::const_iterator>
jumpTables() const {
return make_range(JumpTables.begin(), JumpTables.end());
}
/// All jump table sites in the function before CFG is built.
/// All jump table sites in the function.
std::vector<std::pair<uint64_t, uint64_t>> JTSites;
/// List of relocations in this function.
@ -554,12 +625,6 @@ private:
/// Count the number of functions created.
static uint64_t Count;
/// LocSym annotation records an index to this vector. This holds a label
/// for each instruction whose input/output offsets need to be known after
/// emission. Enables writing bolt address translation tables, used for
/// mapping control transfer in the output binary back to the original binary.
std::vector<const MCSymbol *> LocSyms;
/// Register alternative function name.
void addAlternativeName(std::string NewName) {
Names.emplace_back(NewName);
@ -589,17 +654,6 @@ private:
return getOrCreateLocalLabel(getAddress() + Offset);
}
/// Register an internal offset in a function referenced from outside.
void registerReferencedOffset(uint64_t Offset) {
ExternallyReferencedOffsets.emplace(Offset);
}
/// True if there are references to internals of this function from data,
/// e.g. from jump tables.
bool hasInternalReference() const {
return !ExternallyReferencedOffsets.empty();
}
/// Update all \p From references in the code to refer to \p To. Used
/// in disassembled state only.
void updateReferences(const MCSymbol *From, const MCSymbol *To);
@ -607,16 +661,6 @@ private:
/// This is called in disassembled state.
void addEntryPoint(uint64_t Address);
void setParentFunction(BinaryFunction *BF) {
assert((!ParentFunction || ParentFunction == BF) &&
"cannot have more than one parent function");
ParentFunction = BF;
}
void addFragment(BinaryFunction *BF) {
Fragments.insert(BF);
}
/// Return true if there is a registered entry point at a given offset
/// into the function.
bool hasEntryPointAtOffset(uint64_t Offset) {
@ -643,11 +687,9 @@ private:
/// Emit line number information corresponding to \p NewLoc. \p PrevLoc
/// provides a context for de-duplication of line number info.
/// \p FirstInstr indicates if \p NewLoc represents the first instruction
/// in a sequence, such as a function fragment.
///
/// Return new current location which is either \p NewLoc or \p PrevLoc.
SMLoc emitLineInfo(SMLoc NewLoc, SMLoc PrevLoc, bool FirstInstr) const;
SMLoc emitLineInfo(SMLoc NewLoc, SMLoc PrevLoc) const;
BinaryFunction& operator=(const BinaryFunction &) = delete;
BinaryFunction(const BinaryFunction &) = delete;
@ -717,10 +759,6 @@ public:
return iterator_range<const_iterator>(begin(), end());
}
// Iterators by pointer.
BasicBlockListType::iterator pbegin() { return BasicBlocks.begin(); }
BasicBlockListType::iterator pend() { return BasicBlocks.end(); }
order_iterator layout_begin() { return BasicBlocksLayout.begin(); }
const_order_iterator layout_begin() const
{ return BasicBlocksLayout.begin(); }
@ -784,13 +822,6 @@ public:
return *this;
}
/// Return a symbol for an instruction location. \p Idx is recorded as an
/// annotation in the instruction.
const MCSymbol *getLocSym(size_t Idx) const {
assert(Idx < LocSyms.size() && "Invalid index");
return LocSyms[Idx];
}
/// Update layout of basic blocks used for output.
void updateBasicBlockLayout(BasicBlockOrderType &NewLayout) {
BasicBlocksPreviousLayout = BasicBlocksLayout;
@ -868,6 +899,13 @@ public:
/// Attempt to validate CFG invariants.
bool validateCFG() const;
/// Return dynostats for the function.
///
/// The function relies on branch instructions being in-sync with CFG for
/// branch instructions stats. Thus it is better to call it after
/// fixBranches().
DynoStats getDynoStats() const;
BinaryBasicBlock *getBasicBlockForLabel(const MCSymbol *Label) {
auto I = LabelToBB.find(Label);
return I == LabelToBB.end() ? nullptr : I->second;
@ -901,7 +939,7 @@ public:
/// Retrieve the landing pad BB associated with invoke instruction \p Invoke
/// that is in \p BB. Return nullptr if none exists
BinaryBasicBlock *getLandingPadBBFor(const BinaryBasicBlock &BB,
const MCInst &InvokeInst) const {
const MCInst &InvokeInst) {
assert(BC.MIB->isInvoke(InvokeInst) && "must be invoke instruction");
const auto LP = BC.MIB->getEHInfo(InvokeInst);
if (LP && LP->first) {
@ -916,20 +954,15 @@ public:
/// CFG is constructed or while instruction offsets are available in CFG.
MCInst *getInstructionAtOffset(uint64_t Offset);
/// Const overload; forwards to the non-const getInstructionAtOffset() with
/// the same \p Offset.
const MCInst *getInstructionAtOffset(uint64_t Offset) const {
  auto *MutableThis = const_cast<BinaryFunction *>(this);
  return MutableThis->getInstructionAtOffset(Offset);
}
/// Return jump table that covers a given \p Address in memory.
JumpTable *getJumpTableContainingAddress(uint64_t Address) {
auto JTI = JumpTables.upper_bound(Address);
if (JTI == JumpTables.begin())
return nullptr;
--JTI;
if (JTI->first + JTI->second->getSize() > Address)
return JTI->second;
if (JTI->second->getSize() == 0 && JTI->first == Address)
if (JTI->first + JTI->second->getSize() > Address) {
return JTI->second;
}
return nullptr;
}
@ -967,7 +1000,7 @@ public:
/// Check if (possibly one out of many) function name matches the given
/// regex.
const std::string *hasNameRegex(const StringRef NameRegex) const;
bool hasNameRegex(const std::string &NameRegex) const;
/// Return a vector of all possible names for the function.
const std::vector<std::string> &getNames() const {
@ -1091,7 +1124,6 @@ public:
MCSymbol *getFunctionEndLabel() const {
assert(BC.Ctx && "cannot be called with empty context");
if (!FunctionEndLabel) {
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
FunctionEndLabel = BC.Ctx->createTempSymbol("func_end", true);
}
return FunctionEndLabel;
@ -1100,7 +1132,6 @@ public:
/// Return MC symbol associated with the end of the cold part of the function.
MCSymbol *getFunctionColdEndLabel() const {
if (!FunctionColdEndLabel) {
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
FunctionColdEndLabel = BC.Ctx->createTempSymbol("func_cold_end", true);
}
return FunctionColdEndLabel;
@ -1201,7 +1232,7 @@ public:
/// address in a function. During disassembly we have to make sure we create
/// relocation at that location.
void addPCRelativeRelocationAddress(uint64_t Address) {
assert(containsAddress(Address, /*UseMaxSize=*/ true) &&
assert(Address >= getAddress() && Address < getAddress() + getSize() &&
"address is outside of the function");
PCRelativeRelocationOffsets.emplace(Address - getAddress());
}
@ -1209,41 +1240,16 @@ public:
/// Get data used by this function.
std::set<BinaryData *> dataUses(bool OnlyHot) const;
/// Return the name of the section this function originated from.
StringRef getOriginSectionName() const {
return getSection().getName();
}
/// Return internal section name for this function.
StringRef getCodeSectionName() const {
return StringRef(CodeSectionName);
}
/// Assign a code section name to the function.
void setCodeSectionName(StringRef Name) {
CodeSectionName = Name;
}
/// Get output code section.
ErrorOr<BinarySection &> getCodeSection() const {
return BC.getUniqueSectionByName(getCodeSectionName());
}
/// Return cold code section name for the function.
StringRef getColdCodeSectionName() const {
return StringRef(ColdCodeSectionName);
}
/// Assign a section name for the cold part of the function.
void setColdCodeSectionName(StringRef Name) {
ColdCodeSectionName = Name;
}
/// Get output code section for cold code of this function.
ErrorOr<BinarySection &> getColdCodeSection() const {
return BC.getUniqueSectionByName(getColdCodeSectionName());
}
/// Return true iff the function will halt execution on entry.
bool trapsOnEntry() const {
return TrapsOnEntry;
@ -1258,16 +1264,6 @@ public:
return IsSimple;
}
/// Return true if the function has instruction(s) with unknown control flow.
bool hasUnknownControlFlow() const {
return HasUnknownControlFlow;
}
/// Return true if the function should be split for the output.
bool shouldSplit() const {
return IsLarge && !getBinaryContext().HasRelocations;
}
/// Return true if the function body is non-contiguous.
bool isSplit() const {
return layout_size() &&
@ -1304,9 +1300,6 @@ public:
return !JumpTables.empty();
}
/// Return true if the function has SDT marker
bool hasSDTMarker() const { return HasSDTMarker; }
const JumpTable *getJumpTable(const MCInst &Inst) const {
const auto Address = BC.MIB->getJumpTable(Inst);
return getJumpTableContainingAddress(Address);
@ -1336,7 +1329,7 @@ public:
}
/// Return true if the given address \p PC is inside the function body.
bool containsAddress(uint64_t PC, bool UseMaxSize = false) const {
bool containsAddress(uint64_t PC, bool UseMaxSize=false) const {
if (UseMaxSize)
return Address <= PC && PC < Address + MaxSize;
return Address <= PC && PC < Address + Size;
@ -1345,8 +1338,7 @@ public:
/// Add new names this function is known under.
template <class ContainterTy>
void addNewNames(const ContainterTy &NewNames) {
Names.insert(Names.begin(), NewNames.begin(), NewNames.end());
std::sort(Names.begin(), Names.end());
Names.insert(Names.begin(), NewNames.begin(), NewNames.end());
}
/// Create a basic block at a given \p Offset in the
@ -1361,7 +1353,6 @@ public:
bool DeriveAlignment = false) {
assert(BC.Ctx && "cannot be called with empty context");
if (!Label) {
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
Label = BC.Ctx->createTempSymbol("BB", true);
}
auto BB = std::unique_ptr<BinaryBasicBlock>(
@ -1388,10 +1379,9 @@ public:
assert((CurrentState == State::CFG || !getBasicBlockAtOffset(Offset)) &&
"basic block already exists in pre-CFG state");
if (!Label) {
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
if (!Label)
Label = BC.Ctx->createTempSymbol("BB", true);
}
auto BBPtr = createBasicBlock(Offset, Label, DeriveAlignment);
BasicBlocks.emplace_back(BBPtr.release());
@ -1448,15 +1438,13 @@ public:
BinaryBasicBlock *Start,
std::vector<std::unique_ptr<BinaryBasicBlock>> &&NewBBs,
const bool UpdateLayout = true,
const bool UpdateCFIState = true,
const bool RecomputeLandingPads = true);
const bool UpdateCFIState = true);
iterator insertBasicBlocks(
iterator StartBB,
std::vector<std::unique_ptr<BinaryBasicBlock>> &&NewBBs,
const bool UpdateLayout = true,
const bool UpdateCFIState = true,
const bool RecomputeLandingPads = true);
const bool UpdateCFIState = true);
/// Update the basic block layout for this function. The BBs from
/// [Start->Index, Start->Index + NumNewBlocks) are inserted into the
@ -1475,20 +1463,6 @@ public:
/// new blocks into the CFG. This must be called after updateLayout.
void updateCFIState(BinaryBasicBlock *Start, const unsigned NumNewBlocks);
/// Return true if we detected ambiguous jump tables in this function, which
/// happen when one JT is used in more than one indirect jumps. This precludes
/// us from splitting edges for this JT unless we duplicate the JT (see
/// disambiguateJumpTables).
bool checkForAmbiguousJumpTables();
/// Detect when two distinct indirect jumps are using the same jump table and
/// duplicate it, allocating a separate JT for each indirect branch. This is
/// necessary for code transformations on the CFG that change an edge induced
/// by an indirect branch, e.g.: instrumentation or shrink wrapping. However,
/// this is only possible if we are not updating jump tables in place, but are
/// writing it to a new location (moving them).
void disambiguateJumpTables();
/// Change \p OrigDest to \p NewDest in the jump table used at the end of
/// \p BB. Returns false if \p OrigDest couldn't be found as a valid target
/// and no replacement took place.
@ -1654,11 +1628,6 @@ public:
return *this;
}
BinaryFunction &setLarge(bool Large) {
IsLarge = Large;
return *this;
}
BinaryFunction &setUsesGnuArgsSize(bool Uses = true) {
UsesGnuArgsSize = Uses;
return *this;
@ -1732,10 +1701,6 @@ public:
return ImageSize;
}
BinaryFunction *getParentFunction() const {
return ParentFunction;
}
/// Set the profile data for the number of times the function was called.
BinaryFunction &setExecutionCount(uint64_t Count) {
ExecutionCount = Count;
@ -1842,7 +1807,6 @@ public:
// Register our island at global namespace
Symbol = BC.getOrCreateGlobalSymbol(Address, "ISLANDat");
// Internal bookkeeping
const auto Offset = Address - getAddress();
assert((!IslandOffsets.count(Offset) || IslandOffsets[Offset] == Symbol) &&
@ -1859,20 +1823,20 @@ public:
/// separate symbols when emitting our constant island on behalf of this other
/// function.
MCSymbol *
getOrCreateProxyIslandAccess(uint64_t Address, BinaryFunction &Referrer) {
getOrCreateProxyIslandAccess(uint64_t Address, BinaryFunction *Referrer) {
auto Symbol = getOrCreateIslandAccess(Address);
if (!Symbol)
return nullptr;
MCSymbol *Proxy;
if (!IslandProxies[&Referrer].count(Symbol)) {
if (!IslandProxies[Referrer].count(Symbol)) {
Proxy =
BC.Ctx->getOrCreateSymbol(Symbol->getName() +
".proxy.for." + Referrer.getPrintName());
IslandProxies[&Referrer][Symbol] = Proxy;
IslandProxies[&Referrer][Proxy] = Symbol;
".proxy.for." + Referrer->getPrintName());
IslandProxies[Referrer][Symbol] = Proxy;
IslandProxies[Referrer][Proxy] = Symbol;
}
Proxy = IslandProxies[&Referrer][Symbol];
Proxy = IslandProxies[Referrer][Symbol];
return Proxy;
}
@ -1955,9 +1919,6 @@ public:
/// Returns false if disassembly failed.
void disassemble(ArrayRef<uint8_t> FunctionData);
/// Validate entry points.
void postProcessEntryPoints();
/// Post-processing for jump tables after disassembly. Since their
/// boundaries are not known until all call sites are seen, we need this
/// extra pass to perform any final adjustments.
@ -1969,7 +1930,7 @@ public:
///
/// Returns true on success and update the current function state to
/// State::CFG. Returns false if CFG cannot be built.
bool buildCFG(MCPlusBuilder::AllocatorIdTy);
bool buildCFG();
/// Read any kind of profile information available for the function.
void readProfile();
@ -1990,7 +1951,7 @@ public:
///
/// Return true upon successful processing, or false if the control flow
/// cannot be statically evaluated for any given indirect branch.
bool postProcessIndirectBranches(MCPlusBuilder::AllocatorIdTy AllocId);
bool postProcessIndirectBranches();
/// In functions with multiple entry points, the profile collection records
/// data for other entry points in a different function entry. This function
@ -2158,7 +2119,7 @@ public:
/// Emit function code. The caller is responsible for emitting function
/// symbol(s) and setting the section to emit the code to.
void emitBody(MCStreamer &Streamer, bool EmitColdPart,
bool EmitCodeOnly = false, bool LabelsForOffsets = false);
bool EmitCodeOnly = false);
/// Emit function as a blob with relocations and labels for relocations.
void emitBodyRaw(MCStreamer *Streamer);
@ -2190,8 +2151,6 @@ public:
/// Sets the associated .debug_info entry.
void addSubprogramDIE(const DWARFDie DIE) {
static std::mutex CriticalSectionMutex;
std::lock_guard<std::mutex> Lock(CriticalSectionMutex);
SubprogramDIEs.emplace_back(DIE);
if (!UnitLineTable.first) {
if (const auto *LineTable =
@ -2294,7 +2253,7 @@ public:
}
/// Return output address ranges for a function.
DebugAddressRangesVector getOutputAddressRanges() const;
DWARFAddressRangesVector getOutputAddressRanges() const;
/// Given an address corresponding to an instruction in the input binary,
/// return an address of this instruction in output binary.
@ -2305,7 +2264,7 @@ public:
/// Take address ranges corresponding to the input binary and translate
/// them to address ranges in the output binary.
DebugAddressRangesVector translateInputToOutputRanges(
DWARFAddressRangesVector translateInputToOutputRanges(
const DWARFAddressRangesVector &InputRanges) const;
/// Similar to translateInputToOutputRanges() but operates on location lists
@ -2348,6 +2307,48 @@ public:
const FragmentInfo &cold() const { return ColdFragment; }
};
/// Return program-wide dynostats.
///
/// Accumulates the per-function dynostats of every function in \p Funcs for
/// which isSimple() returns true. \p Funcs is iterated as a map-like container
/// whose elements expose the function object as `.second`.
///
/// The AArch64 flag used to construct the result is read from the binary
/// context of the first function. An empty \p Funcs has no function to query,
/// so in that case the result is constructed with the flag set to false rather
/// than dereferencing the end iterator.
template <typename FuncsType>
inline DynoStats getDynoStats(const FuncsType &Funcs) {
  const auto FirstIt = Funcs.begin();
  const bool IsAArch64 =
      FirstIt != Funcs.end() &&
      FirstIt->second.getBinaryContext().isAArch64();

  DynoStats dynoStats(IsAArch64);
  for (auto &BFI : Funcs) {
    auto &BF = BFI.second;
    if (BF.isSimple()) {
      dynoStats += BF.getDynoStats();
    }
  }
  return dynoStats;
}
/// Call a function with optional before and after dynostats printing.
///
/// \p Func is invoked exactly once, regardless of \p Flag. When \p Flag is
/// set, program-wide dynostats over \p Funcs are collected before and after
/// the call and a report is written to outs(): a heading naming \p Phase
/// (suffixed with " (no change)" when both snapshots compare equal), the
/// "before" snapshot, and, only when the snapshots differ, the "after"
/// snapshot printed with the "before" snapshot passed to print().
///
/// The AArch64 flag for the initial snapshot is read from the first function
/// in \p Funcs. An empty \p Funcs is tolerated: the flag then defaults to
/// false instead of dereferencing the end iterator.
template <typename FnType, typename FuncsType>
inline void
callWithDynoStats(FnType &&Func,
                  const FuncsType &Funcs,
                  StringRef Phase,
                  const bool Flag) {
  const auto FirstIt = Funcs.begin();
  const bool IsAArch64 =
      FirstIt != Funcs.end() &&
      FirstIt->second.getBinaryContext().isAArch64();

  DynoStats DynoStatsBefore(IsAArch64);
  if (Flag) {
    DynoStatsBefore = getDynoStats(Funcs);
  }

  Func();

  if (Flag) {
    const auto DynoStatsAfter = getDynoStats(Funcs);
    const auto Changed = (DynoStatsAfter != DynoStatsBefore);
    outs() << "BOLT-INFO: program-wide dynostats after running "
           << Phase << (Changed ? "" : " (no change)") << ":\n\n"
           << DynoStatsBefore << '\n';
    if (Changed) {
      DynoStatsAfter.print(outs(), &DynoStatsBefore);
    }
    outs() << '\n';
  }
}
inline raw_ostream &operator<<(raw_ostream &OS,
const BinaryFunction &Function) {
OS << Function.getPrintName();

View File

@ -152,7 +152,7 @@ bool BinaryFunction::recordTrace(
const auto *Instr = BB->getLastNonPseudoInstr();
uint64_t Offset{0};
if (Instr) {
Offset = BC.MIB->getAnnotationWithDefault<uint32_t>(*Instr, "Offset");
Offset = BC.MIB->getAnnotationWithDefault<uint64_t>(*Instr, "Offset");
} else {
Offset = BB->getOffset();
}
@ -175,11 +175,7 @@ bool BinaryFunction::recordBranch(uint64_t From, uint64_t To,
return false;
}
// Could be bad LBR data; ignore the branch. In the case of data collected
// in binaries optimized by BOLT, a source BB may be mapped to two output
// BBs as a result of optimizations. In that case, a branch between these
// two will be recorded as a branch from A going to A in the source address
// space. Keep processing.
// Could be bad LBR data; ignore the branch.
if (From == To) {
return true;
}
@ -204,7 +200,7 @@ bool BinaryFunction::recordBranch(uint64_t From, uint64_t To,
const auto *LastInstr = ToBB->getLastNonPseudoInstr();
if (LastInstr) {
const auto LastInstrOffset =
BC.MIB->getAnnotationWithDefault<uint32_t>(*LastInstr, "Offset");
BC.MIB->getAnnotationWithDefault<uint64_t>(*LastInstr, "Offset");
// With old .fdata we are getting FT branches for "jcc,jmp" sequences.
if (To == LastInstrOffset && BC.MIB->isUnconditionalBranch(*LastInstr)) {
@ -230,40 +226,23 @@ bool BinaryFunction::recordBranch(uint64_t From, uint64_t To,
// discarded it as a FT from __builtin_unreachable.
auto *FromInstruction = getInstructionAtOffset(From);
if (!FromInstruction) {
// If the data was collected in a bolted binary, the From addresses may be
// translated to the first instruction of the source BB if BOLT inserted
// a new branch that did not exist in the source (we can't map it to the
// source instruction, so we map it to the first instr of source BB).
// We do not keep offsets for random instructions. So the check above will
// evaluate to true if the first instr is not a branch (call/jmp/ret/etc)
if (BC.DR.collectedInBoltedBinary()) {
if (FromBB->getInputOffset() != From) {
DEBUG(dbgs() << "offset " << From << " does not match a BB in " << *this
<< '\n');
return false;
}
FromInstruction = nullptr;
} else {
DEBUG(dbgs() << "no instruction for offset " << From << " in " << *this
<< '\n');
return false;
}
DEBUG(dbgs() << "no instruction for offset " << From << " in "
<< *this << '\n');
return false;
}
if (FromBB == ToBB) {
// Check for a return from a recursive call.
// Otherwise it's a simple loop.
}
if (!FromBB->getSuccessor(ToBB->getLabel())) {
// Check if this is a recursive call or a return from a recursive call.
if (FromInstruction && ToBB->isEntryPoint() &&
(BC.MIB->isCall(*FromInstruction) ||
BC.MIB->isIndirectBranch(*FromInstruction))) {
if (ToBB->isEntryPoint() && (BC.MIB->isCall(*FromInstruction) ||
BC.MIB->isIndirectBranch(*FromInstruction))) {
// Execution count is already accounted for.
return true;
}
// For data collected in a bolted binary, we may have created two output BBs
// that map to one original block. Branches between these two blocks will
// appear here as one BB jumping to itself, even though it has no loop edges.
// Ignore these.
if (BC.DR.collectedInBoltedBinary() && FromBB == ToBB)
return true;
DEBUG(dbgs() << "invalid branch in " << *this << '\n'
<< Twine::utohexstr(From) << " -> "
@ -320,15 +299,16 @@ void BinaryFunction::postProcessProfile() {
return;
}
if (!(getProfileFlags() & PF_LBR)) {
// Check if MCF post-processing was requested.
if (opts::DoMCF != MCF_DISABLE) {
removeTagsFromProfile();
solveMCF(*this, opts::DoMCF);
}
// Check if MCF post-processing was requested.
if (opts::DoMCF != MCF_DISABLE) {
removeTagsFromProfile();
solveMCF(*this, opts::DoMCF);
return;
}
if (!(getProfileFlags() & PF_LBR))
return;
// Pre-sort branch data.
if (BranchData)
std::stable_sort(BranchData->Data.begin(), BranchData->Data.end());
@ -388,12 +368,6 @@ void BinaryFunction::postProcessProfile() {
if (opts::InferFallThroughs)
inferFallThroughCounts();
// Check if MCF post-processing was requested.
if (opts::DoMCF != MCF_DISABLE) {
removeTagsFromProfile();
solveMCF(*this, opts::DoMCF);
}
// Update profile information for jump tables based on CFG branch data.
for (auto *BB : BasicBlocks) {
const auto *LastInstr = BB->getLastNonPseudoInstr();
@ -869,11 +843,6 @@ float BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) {
if (BI.From.Name == BI.To.Name) {
// Try to record information with 0 count.
IsValid = recordBranch(BI.From.Offset, BI.To.Offset, 0);
} else if (BC.DR.collectedInBoltedBinary()) {
// We can't check branch source for collections in bolted binaries because
// the source of the branch may be mapped to the first instruction in a BB
// instead of the original branch (which may not exist in the source bin).
IsValid = true;
} else {
// The branch has to originate from this function.
// Check for calls, tail calls, rets and indirect branches.

View File

@ -201,13 +201,6 @@ PrintUCE("print-uce",
cl::Hidden,
cl::cat(BoltOptCategory));
static cl::opt<bool>
PrintProfileStats("print-profile-stats",
cl::desc("print profile quality/bias analysis"),
cl::ZeroOrMore,
cl::init(false),
cl::cat(BoltCategory));
static cl::opt<bool>
SimplifyConditionalTailCalls("simplify-conditional-tail-calls",
cl::desc("simplify conditional tail calls by removing unnecessary jumps"),
@ -236,14 +229,6 @@ StringOps("inline-memcpy",
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
static cl::list<std::string>
SpecializeMemcpy1("memcpy1-spec",
cl::desc("list of functions with call sites for which to specialize memcpy() "
"for size 1"),
cl::value_desc("func1,func2:cs1:cs2,func3:cs1,..."),
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
static cl::opt<bool>
StripRepRet("strip-rep-ret",
cl::desc("strip 'repz' prefix from 'repz retq' sequence (on by default)"),
@ -307,7 +292,6 @@ const char BinaryFunctionPassManager::TimerGroupDesc[] =
"Binary Function Pass Manager";
void BinaryFunctionPassManager::runPasses() {
auto &BFs = BC.getBinaryFunctions();
for (const auto &OptPassPair : Passes) {
if (!OptPassPair.first)
continue;
@ -323,7 +307,7 @@ void BinaryFunctionPassManager::runPasses() {
callWithDynoStats(
[this,&Pass] {
Pass->runOnFunctions(BC);
Pass->runOnFunctions(BC, BFs, LargeFunctions);
},
BFs,
Pass->getName(),
@ -366,10 +350,14 @@ void BinaryFunctionPassManager::runPasses() {
}
}
void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
BinaryFunctionPassManager Manager(BC);
void BinaryFunctionPassManager::runAllPasses(
BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &Functions,
std::set<uint64_t> &LargeFunctions
) {
BinaryFunctionPassManager Manager(BC, Functions, LargeFunctions);
const auto InitialDynoStats = getDynoStats(BC.getBinaryFunctions());
const auto InitialDynoStats = getDynoStats(Functions);
// Here we manage dependencies/order manually, since passes are run in the
// order they're registered.
@ -377,9 +365,6 @@ void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
// Run this pass first to use stats for the original functions.
Manager.registerPass(llvm::make_unique<PrintProgramStats>(NeverPrint));
if (opts::PrintProfileStats)
Manager.registerPass(llvm::make_unique<PrintProfileStats>(NeverPrint));
Manager.registerPass(llvm::make_unique<ValidateInternalCalls>(NeverPrint));
Manager.registerPass(llvm::make_unique<StripRepRet>(NeverPrint),
@ -389,12 +374,7 @@ void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
opts::ICF);
if (BC.isAArch64())
Manager.registerPass(
llvm::make_unique<VeneerElimination>(PrintVeneerElimination));
Manager.registerPass(
llvm::make_unique<SpecializeMemcpy1>(NeverPrint, opts::SpecializeMemcpy1),
!opts::SpecializeMemcpy1.empty());
Manager.registerPass(llvm::make_unique<VeneerElimination>(PrintVeneerElimination));
Manager.registerPass(llvm::make_unique<InlineMemcpy>(NeverPrint),
opts::StringOps);
@ -483,14 +463,10 @@ void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
Manager.registerPass(
llvm::make_unique<RetpolineInsertion>(PrintRetpolineInsertion));
// Insert lfences to mitigate Spectre v1 and LVI. This pass is not compatible
// with the retpoline mitigation pass.
Manager.registerPass(llvm::make_unique<LFenceInsertion>());
Manager.registerPass(
llvm::make_unique<LFenceInsertion>());
// Assign each function an output section.
Manager.registerPass(llvm::make_unique<AssignSections>());
// Tighten branches according to offset differences between branch and
// Tighten branches according to offset differences between branch and
// targets. No extra instructions after this pass, otherwise we may have
// relocations out of range and crash during linking.
if (BC.isAArch64())

View File

@ -27,6 +27,8 @@ namespace bolt {
class BinaryFunctionPassManager {
private:
BinaryContext &BC;
std::map<uint64_t, BinaryFunction> &BFs;
std::set<uint64_t> &LargeFunctions;
std::vector<std::pair<const bool,
std::unique_ptr<BinaryFunctionPass>>> Passes;
@ -34,8 +36,10 @@ private:
static const char TimerGroupName[];
static const char TimerGroupDesc[];
BinaryFunctionPassManager(BinaryContext &BC)
: BC(BC) {}
BinaryFunctionPassManager(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions)
: BC(BC), BFs(BFs), LargeFunctions(LargeFunctions) {}
/// Adds a pass to this manager based on the value of its corresponding
/// command-line option.
@ -53,7 +57,10 @@ private:
void runPasses();
/// Runs all enabled implemented passes on all functions.
static void runAllPasses(BinaryContext &BC);
static void runAllPasses(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &Functions,
std::set<uint64_t> &LargeFunctions);
};
} // namespace bolt

View File

@ -66,7 +66,7 @@ BinarySection::~BinarySection() {
delete[] getData();
return;
}
if (!isAllocatable() &&
(!hasSectionRef() ||
OutputContents.data() != getContents(Section).data())) {
@ -78,7 +78,7 @@ void BinarySection::print(raw_ostream &OS) const {
OS << getName() << ", "
<< "0x" << Twine::utohexstr(getAddress()) << ", "
<< getSize()
<< " (0x" << Twine::utohexstr(getOutputAddress()) << ", "
<< " (0x" << Twine::utohexstr(getFileAddress()) << ", "
<< getOutputSize() << ")"
<< ", data = " << getData()
<< ", output data = " << getOutputData();
@ -160,23 +160,3 @@ void BinarySection::reorderContents(const std::vector<BinaryData *> &Order,
Contents = OutputContents = StringRef(NewData, OS.str().size());
OutputSize = Contents.size();
}
// Serialize a note record into a string: three 4-byte words (name size
// including its terminating NUL, descriptor size, type), followed by the
// NUL-terminated name and then the descriptor, each zero-padded up to a
// multiple of 4 bytes. The words are emitted as raw bytes in the host's
// in-memory representation.
std::string BinarySection::encodeELFNote(StringRef NameStr, StringRef DescStr,
                                         uint32_t Type) {
  std::string Buffer;
  raw_string_ostream Stream(Buffer);

  // Emit one 4-byte header word exactly as it is laid out in memory.
  auto EmitWord = [&Stream](const uint32_t &Word) {
    Stream.write(reinterpret_cast<const char *>(&Word), 4);
  };
  // Emit the zero bytes needed to bring a field of Size bytes up to the next
  // 4-byte boundary (nothing when Size is already a multiple of 4).
  auto EmitPadding = [&Stream](uint64_t Size) {
    for (uint64_t Pos = Size; Pos < alignTo(Size, 4); ++Pos) {
      Stream << '\0';
    }
  };

  const uint32_t NameSz = NameStr.size() + 1; // +1 for the trailing NUL.
  const uint32_t DescSz = DescStr.size();

  EmitWord(NameSz);
  EmitWord(DescSz);
  EmitWord(Type);

  Stream << NameStr << '\0';
  EmitPadding(NameSz);

  Stream << DescStr;
  EmitPadding(DescStr.size());

  return Stream.str();
}

View File

@ -62,16 +62,13 @@ class BinarySection {
// finalized?
std::string OutputName; // Output section name (if the section has
// been renamed)
uint64_t OutputAddress{0}; // Section address for the rewritten binary.
uint64_t FileAddress{0}; // Section address for the rewritten binary.
uint64_t OutputSize{0}; // Section size in the rewritten binary.
uint64_t FileOffset{0}; // File offset in the rewritten binary file.
StringRef OutputContents; // Rewritten section contents.
unsigned SectionID{-1u}; // Unique ID used for address mapping.
// Set by ExecutableFileMemoryManager.
uint32_t Index{0}; // Section index in the output file.
mutable bool IsReordered{false}; // Have the contents been reordered?
bool IsAnonymous{false}; // True if the name should not be included
// in the output file.
uint64_t hash(const BinaryData &BD,
std::map<const BinaryData *, uint64_t> &Cache) const;
@ -267,7 +264,6 @@ public:
}
bool isLocal() const { return IsLocal; }
bool isReordered() const { return IsReordered; }
bool isAnonymous() const { return IsAnonymous; }
unsigned getELFType() const { return ELFType; }
unsigned getELFFlags() const { return ELFFlags; }
@ -284,8 +280,7 @@ public:
/// Does this section contain the given \p Address?
/// Note: this is in terms of the original mapped binary addresses.
bool containsAddress(uint64_t Address) const {
return (getAddress() <= Address && Address < getEndAddress()) ||
(getSize() == 0 && getAddress() == Address);
return getAddress() <= Address && Address < getEndAddress();
}
/// Does this section contain the range [\p Address, \p Address + \p Size)?
@ -376,7 +371,7 @@ public:
uint64_t getAllocAddress() const {
return reinterpret_cast<uint64_t>(getOutputData());
}
uint64_t getOutputAddress() const { return OutputAddress; }
uint64_t getFileAddress() const { return FileAddress; }
uint64_t getFileOffset() const { return FileOffset; }
unsigned getSectionID() const {
assert(hasValidSectionID() && "trying to use uninitialized section id");
@ -385,13 +380,10 @@ public:
bool hasValidSectionID() const {
return SectionID != -1u;
}
uint32_t getIndex() const {
return Index;
}
// mutation
void setOutputAddress(uint64_t Address) {
OutputAddress = Address;
void setFileAddress(uint64_t Address) {
FileAddress = Address;
}
void setFileOffset(uint64_t Offset) {
FileOffset = Offset;
@ -400,15 +392,9 @@ public:
assert(!hasValidSectionID() && "trying to set section id twice");
SectionID = ID;
}
void setIndex(uint32_t I) {
Index = I;
}
void setOutputName(StringRef Name) {
OutputName = Name;
}
void setAnonymous(bool Flag) {
IsAnonymous = Flag;
}
/// Reorder the contents of this section according to \p Order. If
/// \p Inplace is true, the entire contents of the section is reordered,
@ -416,18 +402,6 @@ public:
void reorderContents(const std::vector<BinaryData *> &Order, bool Inplace);
void print(raw_ostream &OS) const;
/// Write the contents of an ELF note section given the name of the producer,
/// a number identifying the type of note and the contents of the note in
/// \p DescStr.
static std::string encodeELFNote(StringRef NameStr, StringRef DescStr,
uint32_t Type);
/// Code for ELF notes written by producer 'BOLT'
enum {
NT_BOLT_BAT = 1,
NT_BOLT_INSTRUMENTATION_TABLES = 2
};
};
inline uint8_t *copyByteArray(const uint8_t *Data, uint64_t Size) {
@ -451,21 +425,6 @@ inline raw_ostream &operator<<(raw_ostream &OS, const BinarySection &Section) {
return OS;
}
struct SDTMarkerInfo {
uint64_t PC;
uint64_t Base;
uint64_t Semaphore;
StringRef Provider;
StringRef Name;
StringRef Args;
/// The offset of PC within the note section
unsigned PCOffset;
/// A label that marks the location of the SDT nop instruction
MCSymbol *Label;
};
} // namespace bolt
} // namespace llvm

View File

@ -1,304 +0,0 @@
//===--- BoltAddressTranslation.cpp ---------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#include "BoltAddressTranslation.h"
#include "BinaryFunction.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/Support/DataExtractor.h"
#define DEBUG_TYPE "bolt-bat"
namespace llvm {
namespace bolt {
const char* BoltAddressTranslation::SECTION_NAME = ".note.bolt_bat";
// Add the translation entries for basic block \p BB to \p Map.
//
// One entry maps the block's output offset (start of its output address range
// minus \p FuncAddress) to its input offset. In addition, every instruction
// in the block that carries a "LocSym" annotation gets its own entry, keyed by
// that annotation minus \p FuncAddress and holding the instruction's "Offset"
// annotation OR'ed with BRANCHENTRY, unless its key coincides with the
// block's own key.
void BoltAddressTranslation::writeEntriesForBB(MapTy &Map,
                                               const BinaryBasicBlock &BB,
                                               uint64_t FuncAddress) {
  const uint64_t BBOutputOffset =
      BB.getOutputAddressRange().first - FuncAddress;
  const uint64_t BBInputOffset = BB.getInputOffset();
  assert(BBInputOffset != BinaryBasicBlock::INVALID_OFFSET &&
         "Every output BB must track back to an input BB for profile "
         "collection in bolted binaries");

  DEBUG(dbgs() << "BB " << BB.getName() <<"\n");
  DEBUG(dbgs() << " Key: " << Twine::utohexstr(BBOutputOffset)
               << " Val: " << Twine::utohexstr(BBInputOffset) << "\n");
  Map.insert(std::pair<uint32_t, uint32_t>(BBOutputOffset, BBInputOffset));

  // Walk the block's instructions and record the annotated ones. Only
  // instructions with a "LocSym" annotation are considered; each of them is
  // also required to have an "Offset" annotation.
  for (const auto &Inst : BB) {
    const bool HasLocSym = BC.MIB->hasAnnotation(Inst, "LocSym");
    if (!HasLocSym)
      continue;

    const auto InstOutputOffset =
        BC.MIB->getAnnotationAs<uint32_t>(Inst, "LocSym") - FuncAddress;
    auto InputOffsetOrErr =
        BC.MIB->tryGetAnnotationAs<uint32_t>(Inst, "Offset");
    DEBUG(if (!InputOffsetOrErr) {
      auto *Function = BB.getFunction();
      dbgs() << "Function: " << Function->getPrintName()
             << " BB: " << BB.getName() << " lacking annotation for: ";
      BC.printInstruction(dbgs(), Inst);
      dbgs() << "\n";
    });
    assert(InputOffsetOrErr && "Expected annotation with input offset");
    const auto InstInputOffset = *InputOffsetOrErr;

    // An instruction sitting at the block's own key is already covered by the
    // block entry inserted above; only other offsets get a branch entry.
    if (InstOutputOffset != BBOutputOffset) {
      DEBUG(dbgs() << " Key: " << Twine::utohexstr(InstOutputOffset)
                   << " Val: " << Twine::utohexstr(InstInputOffset)
                   << " (branch)\n");
      Map.insert(std::pair<uint32_t, uint32_t>(InstOutputOffset,
                                               InstInputOffset | BRANCHENTRY));
    }
  }
}
// Build the translation maps for every function in the binary context and
// serialize them to \p OS.
//
// Stream layout, as produced below (all integers are written as raw bytes in
// the host's in-memory representation):
//   u32 number of maps
//   per map:  u64 address, u32 number of entries, then per entry two 4-byte
//             values (key, value)
//   u32 number of cold-part mappings
//   per mapping: two 8-byte values (cold address, hot address)
void BoltAddressTranslation::write(raw_ostream &OS) {
  // Dump Size raw bytes of the object starting at Ptr.
  auto EmitRaw = [&OS](const void *Ptr, uint64_t Size) {
    OS.write(static_cast<const char *>(Ptr), Size);
  };

  DEBUG(dbgs() << "BOLT-DEBUG: Writing BOLT Address Translation Tables\n");

  // Phase 1: collect one map per function, plus a second one for the cold
  // part of each split function.
  for (auto &BFI : BC.getBinaryFunctions()) {
    auto &Function = BFI.second;
    DEBUG(dbgs() << "Function name: " << Function.getPrintName() << "\n");
    DEBUG(dbgs() << " Address reference: 0x"
                 << Twine::utohexstr(Function.getOutputAddress()) << "\n");

    const bool IsSplit = Function.isSplit();

    // Hot part: for a split function, stop at the first cold block in layout
    // order.
    MapTy HotMap;
    for (const auto &BB : Function.layout()) {
      if (IsSplit && BB->isCold())
        break;
      writeEntriesForBB(HotMap, *BB, Function.getOutputAddress());
    }
    Maps.insert(
        std::pair<uint64_t, MapTy>(Function.getOutputAddress(), HotMap));

    if (IsSplit) {
      // Cold part: keyed by, and with offsets relative to, the cold
      // fragment's address.
      MapTy ColdMap;
      DEBUG(dbgs() << " Cold part\n");
      for (const auto &BB : Function.layout()) {
        if (BB->isCold())
          writeEntriesForBB(ColdMap, *BB, Function.cold().getAddress());
      }
      Maps.insert(
          std::pair<uint64_t, MapTy>(Function.cold().getAddress(), ColdMap));
      ColdPartSource.insert(std::pair<uint64_t, uint64_t>(
          Function.cold().getAddress(), Function.getOutputAddress()));
    }
  }

  // Phase 2: serialize the collected maps.
  const uint32_t NumFuncs = Maps.size();
  EmitRaw(&NumFuncs, 4);
  DEBUG(dbgs() << "Writing " << NumFuncs << " functions for BAT.\n");
  for (auto &MapEntry : Maps) {
    const uint64_t Address = MapEntry.first;
    MapTy &Entries = MapEntry.second;
    const uint32_t NumEntries = Entries.size();
    DEBUG(dbgs() << "Writing " << NumEntries << " entries for 0x"
                 << Twine::utohexstr(Address) << ".\n");
    EmitRaw(&Address, 8);
    EmitRaw(&NumEntries, 4);
    for (auto &KeyVal : Entries) {
      EmitRaw(&KeyVal.first, 4);
      EmitRaw(&KeyVal.second, 4);
    }
  }

  // Phase 3: serialize the cold-address -> hot-address table.
  const uint32_t NumColdEntries = ColdPartSource.size();
  DEBUG(dbgs() << "Writing " << NumColdEntries << " cold part mappings.\n");
  EmitRaw(&NumColdEntries, 4);
  for (auto &ColdEntry : ColdPartSource) {
    EmitRaw(&ColdEntry.first, 8);
    EmitRaw(&ColdEntry.second, 8);
    DEBUG(dbgs() << " " << Twine::utohexstr(ColdEntry.first) << " -> "
                 << Twine::utohexstr(ColdEntry.second) << "\n");
  }

  outs() << "BOLT-INFO: Wrote " << Maps.size() << " BAT maps\n";
  outs() << "BOLT-INFO: Wrote " << NumColdEntries
         << " BAT cold-to-hot entries\n";
}
// Parse a BAT note from \p Buf and populate Maps and ColdPartSource.
//
// Expected layout, as consumed below:
//   u32 NameSz, u32 DescSz, u32 Type      (Type must be NT_BOLT_BAT)
//   name, NameSz bytes, padded to 4 bytes (must start with "BOLT")
//   u32 number of functions
//   per function: u64 address, u32 number of entries, then per entry
//                 u32 output offset, u32 input offset
//   u32 number of cold-part mappings
//   per mapping: u64 cold address, u64 hot address
//
// Returns an io_error code if the buffer is too short for any of the fields
// above, if the type does not match, or if the name does not start with
// "BOLT"; returns a default-constructed (success) error code otherwise.
// Entries parsed before a failure is detected remain in Maps/ColdPartSource.
std::error_code BoltAddressTranslation::parse(StringRef Buf) {
  DataExtractor DE = DataExtractor(Buf, true, 8);
  uint32_t Offset = 0;
  if (Buf.size() < 12)
    return make_error_code(llvm::errc::io_error);

  const uint32_t NameSz = DE.getU32(&Offset);
  const uint32_t DescSz = DE.getU32(&Offset);
  const uint32_t Type = DE.getU32(&Offset);

  // The bytes remaining after the 12-byte header must hold the padded name
  // plus the descriptor. Compare against the remaining size (size - offset);
  // alignTo() yields a 64-bit value, so the right-hand side cannot wrap.
  if (Type != BinarySection::NT_BOLT_BAT ||
      Buf.size() - Offset < alignTo(NameSz, 4) + DescSz)
    return make_error_code(llvm::errc::io_error);

  StringRef Name = Buf.slice(Offset, Offset + NameSz);
  Offset = alignTo(Offset + NameSz, 4);
  if (Name.substr(0, 4) != "BOLT")
    return make_error_code(llvm::errc::io_error);

  if (Buf.size() - Offset < 4)
    return make_error_code(llvm::errc::io_error);

  const uint32_t NumFunctions = DE.getU32(&Offset);
  DEBUG(dbgs() << "Parsing " << NumFunctions << " functions\n");
  for (uint32_t I = 0; I < NumFunctions; ++I) {
    if (Buf.size() - Offset < 12)
      return make_error_code(llvm::errc::io_error);

    const uint64_t Address = DE.getU64(&Offset);
    const uint32_t NumEntries = DE.getU32(&Offset);
    MapTy Map;

    DEBUG(dbgs() << "Parsing " << NumEntries << " entries for 0x"
                 << Twine::utohexstr(Address) << "\n");
    // Each entry occupies 8 bytes. Do the multiplication in 64 bits so that a
    // huge NumEntries cannot wrap around and slip past the bounds check.
    if (Buf.size() - Offset < static_cast<uint64_t>(NumEntries) * 8)
      return make_error_code(llvm::errc::io_error);
    for (uint32_t J = 0; J < NumEntries; ++J) {
      const uint32_t OutputAddr = DE.getU32(&Offset);
      const uint32_t InputAddr = DE.getU32(&Offset);
      Map.insert(std::pair<uint32_t, uint32_t>(OutputAddr, InputAddr));
      DEBUG(dbgs() << Twine::utohexstr(OutputAddr) << " -> "
                   << Twine::utohexstr(InputAddr) << "\n");
    }
    Maps.insert(std::pair<uint64_t, MapTy>(Address, Map));
  }

  if (Buf.size() - Offset < 4)
    return make_error_code(llvm::errc::io_error);

  const uint32_t NumColdEntries = DE.getU32(&Offset);
  DEBUG(dbgs() << "Parsing " << NumColdEntries << " cold part mappings\n");
  for (uint32_t I = 0; I < NumColdEntries; ++I) {
    if (Buf.size() - Offset < 16)
      return make_error_code(llvm::errc::io_error);
    // These are full 64-bit addresses (read with getU64 and stored as
    // uint64_t keys/values); keep them in 64-bit locals so the upper half is
    // not discarded.
    const uint64_t ColdAddress = DE.getU64(&Offset);
    const uint64_t HotAddress = DE.getU64(&Offset);
    ColdPartSource.insert(
        std::pair<uint64_t, uint64_t>(ColdAddress, HotAddress));
    DEBUG(dbgs() << Twine::utohexstr(ColdAddress) << " -> "
                 << Twine::utohexstr(HotAddress) << "\n");
  }
  outs() << "BOLT-INFO: Parsed " << Maps.size() << " BAT entries\n";
  outs() << "BOLT-INFO: Parsed " << NumColdEntries
         << " BAT cold-to-hot entries\n";

  return std::error_code();
}
uint64_t BoltAddressTranslation::translate(const BinaryFunction &Func,
uint64_t Offset,
bool IsBranchSrc) const {
auto Iter = Maps.find(Func.getAddress());
if (Iter == Maps.end())
return Offset;
const MapTy &Map = Iter->second;
auto KeyVal = Map.upper_bound(Offset);
if (KeyVal == Map.begin())
return Offset;
--KeyVal;
const uint32_t Val = KeyVal->second & ~BRANCHENTRY;
// Branch source addresses are translated to the first instruction of the
// source BB to avoid accounting for modifications BOLT may have made in the
// BB regarding deletion/addition of instructions.
if (IsBranchSrc)
return Val;
return Offset - KeyVal->first + Val;
}
/// Collect the fall-throughs implied by a trace that starts at FirstLBR.To
/// and ends at SecondLBR.From inside \p Func, as (source, destination) pairs
/// of map keys. Returns NoneType when \p Func has no translation map, and an
/// empty list when the trace is trivial or cannot be placed in the map.
Optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
BoltAddressTranslation::getFallthroughsInTrace(
    const BinaryFunction &Func, const LBREntry &FirstLBR,
    const LBREntry &SecondLBR) const {
  SmallVector<std::pair<uint64_t, uint64_t>, 16> Fallthroughs;

  // Nothing to report when the trace does not move forward.
  if (FirstLBR.To >= SecondLBR.From)
    return Fallthroughs;

  const auto FuncAddress = Func.getAddress();
  const auto TraceStart = FirstLBR.To - FuncAddress;
  const auto TraceEnd = SecondLBR.From - FuncAddress;

  const auto FuncMapIt = Maps.find(FuncAddress);
  if (FuncMapIt == Maps.end())
    return NoneType();
  const MapTy &Entries = FuncMapIt->second;

  // Walk backwards from the trace start until a non-branch (basic block)
  // entry is found; fall-throughs are only built between BB boundaries.
  auto FirstBB = Entries.upper_bound(TraceStart);
  while (true) {
    if (FirstBB == Entries.begin())
      return Fallthroughs;
    --FirstBB;
    if (!(FirstBB->second & BRANCHENTRY))
      break;
  }

  // Last entry whose key does not exceed the trace end.
  auto LastEntry = Entries.upper_bound(TraceEnd);
  if (LastEntry == Entries.begin())
    return Fallthroughs;
  --LastEntry;

  if (FirstBB->first >= LastEntry->first)
    return Fallthroughs;

  auto Cur = FirstBB;
  while (Cur != LastEntry) {
    if (Cur->second & BRANCHENTRY) {
      ++Cur;
      continue;
    }

    const auto Src = Cur->first;

    // Advance to the next basic block entry, never moving past LastEntry.
    ++Cur;
    while (Cur != LastEntry && (Cur->second & BRANCHENTRY))
      ++Cur;

    // Stopped on LastEntry and it is a branch entry: no further BB to link.
    if (Cur->second & BRANCHENTRY)
      break;

    Fallthroughs.emplace_back(Src, Cur->first);
  }

  return Fallthroughs;
}
/// Look up \p Address in the cold-part table and return the address it is
/// mapped to, or 0 when \p Address has no entry.
uint64_t BoltAddressTranslation::fetchParentAddress(uint64_t Address) const {
  const auto Found = ColdPartSource.find(Address);
  return Found != ColdPartSource.end() ? Found->second : 0;
}
/// Return true if \p InputFile contains a section named SECTION_NAME.
/// Sections whose name cannot be read are ignored.
bool BoltAddressTranslation::enabledFor(
    llvm::object::ELFObjectFileBase *InputFile) const {
  for (const auto &Sec : InputFile->sections()) {
    StringRef Name;
    const std::error_code NameError = Sec.getName(Name);
    if (!NameError && Name == SECTION_NAME)
      return true;
  }
  return false;
}
}
}

View File

@ -1,121 +0,0 @@
//===--- BoltAddressTranslation.h -----------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_BOLTADDRESSTRANSLATION_H
#define LLVM_TOOLS_LLVM_BOLT_BOLTADDRESSTRANSLATION_H
#include "BinaryContext.h"
#include "llvm/Object/ELFObjectFile.h"
namespace llvm {
namespace bolt {
/// The map of output addresses to input ones to be used when translating
/// samples collected in a binary that was already processed by BOLT. We do not
/// support reoptimizing a binary already processed by BOLT, but we do support
/// collecting samples in a binary processed by BOLT. We then translate samples
/// back to addresses from the input (original) binary, one that can be
/// optimized. The goal is to avoid special deployments of non-bolted binaries
/// just for the purposes of data collection.
///
/// The in-memory representation of the map is as follows. Each function has its
/// own map. A function is identified by its output address. This is the key to
/// retrieve a translation map. The translation map is a collection of ordered
/// keys identifying the start of a region (relative to the function start) in
/// the output address space (addresses in the binary processed by BOLT).
///
/// A translation then happens when perf2bolt needs to convert sample addresses
/// in the output address space back to input addresses, valid to run BOLT in
/// the original input binary. To convert, perf2bolt first needs to fetch the
/// translation map for a sample recorded in a given function. It then finds
/// the largest key that is still smaller than or equal to the recorded address.
/// It then converts this address to use the value of this key.
///
/// Example translation Map for function foo
/// KEY VALUE BB?
/// Output offset1 (first BB) Original input offset1 Y
/// ...
/// Output offsetN (last branch) Original input offsetN N
///
/// The information on whether a given entry is a BB start or an instruction
/// that changes control flow is encoded in the last (highest) bit of VALUE.
///
/// Notes:
/// Instructions that will never appear in LBR because they do not cause control
/// flow change are omitted from this map. Basic block locations are recorded
/// because they can be a target of a jump (To address in the LBR) and also to
/// recreate the BB layout of this function. We use the BB layout map to
/// recreate fall-through jumps in the profile, given an LBR trace.
class BoltAddressTranslation {
public:
// In-memory representation of the address translation table for a single
// function: output offset (key) -> input offset (value). The highest bit of
// the value is the BRANCHENTRY flag.
using MapTy = std::map<uint32_t, uint32_t>;
/// Name of the ELF section where the table will be serialized to in the
/// output binary
static const char *SECTION_NAME;
/// Create an empty translation table bound to \p BC.
BoltAddressTranslation(BinaryContext &BC) : BC(BC) {}
/// Write the serialized address translation tables for each reordered
/// function
void write(raw_ostream &OS);
/// Read the serialized address translation tables and load them internally
/// in memory. Return a parse error if failed.
std::error_code parse(StringRef Buf);
/// If the maps are loaded in memory, perform the lookup to translate LBR
/// addresses in \p Func. \p Offset is returned unchanged when no map is
/// available for \p Func or no map entry covers it. When \p IsBranchSrc is
/// set, the value of the covering entry is returned as is instead of being
/// adjusted by the distance of \p Offset from the entry key.
uint64_t translate(const BinaryFunction &Func, uint64_t Offset,
bool IsBranchSrc) const;
/// Use the map keys containing basic block addresses to infer fall-throughs
/// taken in the path started at FirstLBR.To and ending at SecondLBR.From.
/// Return NoneType if there is no translation map for \p Func, or the
/// (possibly empty) list of fall-throughs otherwise.
Optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
getFallthroughsInTrace(const BinaryFunction &Func, const LBREntry &FirstLBR,
const LBREntry &SecondLBR) const;
/// If available, fetch the address of the hot part linked to the cold part
/// at \p Address. Return 0 otherwise.
uint64_t fetchParentAddress(uint64_t Address) const;
/// True if the input binary has a translation table we can use to convert
/// addresses when aggregating profile, i.e. it has a section named
/// SECTION_NAME.
bool enabledFor(llvm::object::ELFObjectFileBase *InputFile) const;
private:
/// Helper to update \p Map by inserting one or more BAT entries reflecting
/// \p BB for function located at \p FuncAddress. At least one entry will be
/// emitted for the start of the BB. More entries may be emitted to cover
/// the location of calls or any instruction that may change control flow.
void writeEntriesForBB(MapTy &Map, const BinaryBasicBlock &BB,
uint64_t FuncAddress);
/// Binary context this table was created for.
BinaryContext &BC;
/// Per-function translation maps, keyed by the function's output address.
std::map<uint64_t, MapTy> Maps;
/// Links outlined cold blocks to their original function: address of the
/// cold part -> address of the hot part.
std::map<uint64_t, uint64_t> ColdPartSource;
/// Flag set in the value of a translation map entry to identify the address
/// of a control-flow changing instruction (as opposed to a basic block
/// start).
const static uint32_t BRANCHENTRY = 0x80000000;
};
}
}
#endif

View File

@ -204,7 +204,7 @@ class RewriteInstanceDiff {
/// later when matching functions in binary 2 to corresponding functions
/// in binary 1
void buildLookupMaps() {
for (const auto &BFI : RI1.BC->getBinaryFunctions()) {
for (const auto &BFI : RI1.BinaryFunctions) {
StringRef LTOName;
const auto &Function = BFI.second;
const auto Score = getNormalizedScore(Function, RI1);
@ -224,7 +224,7 @@ class RewriteInstanceDiff {
}
// Compute LTONameLookup2 and LargestBin2
for (const auto &BFI : RI2.BC->getBinaryFunctions()) {
for (const auto &BFI : RI2.BinaryFunctions) {
StringRef LTOName;
const auto &Function = BFI.second;
const auto Score = getNormalizedScore(Function, RI2);
@ -245,7 +245,7 @@ class RewriteInstanceDiff {
void matchFunctions() {
outs() << "BOLT-DIFF: Mapping functions in Binary2 to Binary1\n";
for (const auto &BFI2 : RI2.BC->getBinaryFunctions()) {
for (const auto &BFI2 : RI2.BinaryFunctions) {
const auto &Function2 = BFI2.second;
StringRef LTOName;
bool Match = false;
@ -451,7 +451,7 @@ class RewriteInstanceDiff {
/// having a large difference in performance because hotness shifted from
/// LTO variant 1 to variant 2, even though they represent the same function.
void computeAggregatedLTOScore() {
for (const auto &BFI : RI1.BC->getBinaryFunctions()) {
for (const auto &BFI : RI1.BinaryFunctions) {
const auto &Function = BFI.second;
double Score = getNormalizedScore(Function, RI1);
auto Iter = LTOMap1.find(&Function);
@ -461,7 +461,7 @@ class RewriteInstanceDiff {
}
double UnmappedScore{0};
for (const auto &BFI : RI2.BC->getBinaryFunctions()) {
for (const auto &BFI : RI2.BinaryFunctions) {
const auto &Function = BFI.second;
bool Matched = FuncMap.find(&Function) != FuncMap.end();
double Score = getNormalizedScore(Function, RI2);
@ -475,8 +475,7 @@ class RewriteInstanceDiff {
if (FuncMap.find(Iter->second) == FuncMap.end())
UnmappedScore += Score;
}
int64_t Unmapped =
RI2.BC->getBinaryFunctions().size() - Bin2MappedFuncs.size();
int64_t Unmapped = RI2.BinaryFunctions.size() - Bin2MappedFuncs.size();
outs() << "BOLT-DIFF: " << Unmapped
<< " functions in Binary2 have no correspondence to any other "
"function in Binary1.\n";
@ -596,7 +595,7 @@ class RewriteInstanceDiff {
void reportUnmapped() {
outs() << "List of functions from binary 2 that were not matched with any "
<< "function in binary 1:\n";
for (const auto &BFI2 : RI2.BC->getBinaryFunctions()) {
for (const auto &BFI2 : RI2.BinaryFunctions) {
const auto &Function2 = BFI2.second;
if (Bin2MappedFuncs.count(&Function2))
continue;
@ -655,9 +654,9 @@ void RewriteInstance::compare(RewriteInstance &RI2) {
if (opts::ICF) {
IdenticalCodeFolding ICF(opts::NeverPrint);
outs() << "BOLT-DIFF: Starting ICF pass for binary 1";
ICF.runOnFunctions(*BC);
ICF.runOnFunctions(*BC, BinaryFunctions, LargeFunctions);
outs() << "BOLT-DIFF: Starting ICF pass for binary 2";
ICF.runOnFunctions(*RI2.BC);
ICF.runOnFunctions(*RI2.BC, RI2.BinaryFunctions, RI2.LargeFunctions);
}
RewriteInstanceDiff RID(*this, RI2);

View File

@ -48,6 +48,8 @@ add_public_gen_version_target(GenBoltRevision)
set(LLVM_LINK_COMPONENTS
${LLVM_TARGETS_TO_BUILD}
BOLTPasses
BOLTTargetAArch64
BOLTTargetX86
CodeGen
Core
DebugInfoDWARF
@ -59,18 +61,6 @@ set(LLVM_LINK_COMPONENTS
Support
)
string(FIND "${LLVM_TARGETS_TO_BUILD}" "AArch64" POSITION)
if (NOT ${POSITION} EQUAL -1)
list(APPEND LLVM_LINK_COMPONENTS BOLTTargetAArch64)
set(BOLT_AArch64 On)
endif()
string(FIND "${LLVM_TARGETS_TO_BUILD}" "X86" POSITION)
if (NOT ${POSITION} EQUAL -1)
list(APPEND LLVM_LINK_COMPONENTS BOLTTargetX86)
set(BOLT_X64 On)
endif()
add_llvm_tool(llvm-bolt
llvm-bolt.cpp
BinaryBasicBlock.cpp
@ -80,20 +70,16 @@ add_llvm_tool(llvm-bolt
BinaryFunctionProfile.cpp
BinaryPassManager.cpp
BinarySection.cpp
BoltAddressTranslation.cpp
BoltDiff.cpp
CacheMetrics.cpp
DataAggregator.cpp
DataReader.cpp
DebugData.cpp
DWARFRewriter.cpp
DynoStats.cpp
Exceptions.cpp
ExecutableFileMemoryManager.cpp
Heatmap.cpp
JumpTable.cpp
MCPlusBuilder.cpp
ParallelUtilities.cpp
ProfileReader.cpp
ProfileWriter.cpp
Relocation.cpp
@ -101,17 +87,8 @@ add_llvm_tool(llvm-bolt
DEPENDS
intrinsics_gen
bolt_rt
)
if (DEFINED BOLT_AArch64)
target_compile_definitions(llvm-bolt PRIVATE AARCH64_AVAILABLE)
endif()
if (DEFINED BOLT_X64)
target_compile_definitions(llvm-bolt PRIVATE X86_AVAILABLE)
endif()
add_llvm_tool_symlink(perf2bolt llvm-bolt)
add_llvm_tool_symlink(llvm-boltdiff llvm-bolt)
add_llvm_tool_symlink(llvm-bolt-heatmap llvm-bolt)

View File

@ -9,10 +9,11 @@
//
//===----------------------------------------------------------------------===//
#include "DWARFRewriter.h"
#include "BinaryBasicBlock.h"
#include "BinaryContext.h"
#include "BinaryFunction.h"
#include "ParallelUtilities.h"
#include "RewriteInstance.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
@ -56,126 +57,62 @@ KeepARanges("keep-aranges",
cl::Hidden,
cl::cat(BoltCategory));
static cl::opt<bool>
DeterministicDebugInfo("deterministic-debuginfo",
cl::desc("disables parallel execution of tasks that may produce"
"nondeterministic debug info"),
cl::init(true),
cl::cat(BoltCategory));
} // namespace opts
void DWARFRewriter::updateDebugInfo() {
void RewriteInstance::updateDebugInfo() {
SectionPatchers[".debug_abbrev"] = llvm::make_unique<DebugAbbrevPatcher>();
SectionPatchers[".debug_info"] = llvm::make_unique<SimpleBinaryPatcher>();
SectionPatchers[".debug_info"] = llvm::make_unique<SimpleBinaryPatcher>();
DebugInfoPatcher =
static_cast<SimpleBinaryPatcher *>(SectionPatchers[".debug_info"].get());
AbbrevPatcher =
static_cast<DebugAbbrevPatcher *>(SectionPatchers[".debug_abbrev"].get());
assert(DebugInfoPatcher && AbbrevPatcher && "Patchers not initialized.");
RangesSectionsWriter = llvm::make_unique<DebugRangesSectionsWriter>(BC.get());
LocationListWriter = llvm::make_unique<DebugLocWriter>(BC.get());
RangesSectionsWriter = llvm::make_unique<DebugRangesSectionsWriter>(&BC);
LocationListWriter = llvm::make_unique<DebugLocWriter>(&BC);
auto processUnitDIE = [&](const DWARFDie DIE) {
const BinaryFunction *CachedFunction = nullptr;
std::map<DebugAddressRangesVector, uint64_t> CachedRanges{};
updateUnitDebugInfo(DIE, std::vector<const BinaryFunction *>{},
CachedFunction, CachedRanges);
};
if (opts::NoThreads || opts::DeterministicDebugInfo) {
for (auto &CU : BC.DwCtx->compile_units())
processUnitDIE(CU->getUnitDIE(false));
} else {
// Update unit debug info in parallel
auto &ThreadPool = ParallelUtilities::getThreadPool();
for (auto &CU : BC.DwCtx->compile_units())
ThreadPool.async(processUnitDIE, CU->getUnitDIE(false));
ThreadPool.wait();
for (auto &CU : BC->DwCtx->compile_units()) {
updateUnitDebugInfo(CU->getUnitDIE(false),
std::vector<const BinaryFunction *>{});
}
flushPendingRanges();
finalizeDebugSections();
updateGdbIndexSection();
}
void DWARFRewriter::updateUnitDebugInfo(
const DWARFDie DIE, std::vector<const BinaryFunction *> FunctionStack,
const BinaryFunction *&CachedFunction,
std::map<DebugAddressRangesVector, uint64_t> &CachedRanges) {
void RewriteInstance::updateUnitDebugInfo(
const DWARFDie DIE,
std::vector<const BinaryFunction *> FunctionStack) {
bool IsFunctionDef = false;
switch (DIE.getTag()) {
case dwarf::DW_TAG_compile_unit:
{
const auto ModuleRanges = DIE.getAddressRanges();
auto OutputRanges = BC.translateModuleAddressRanges(ModuleRanges);
auto OutputRanges = translateModuleAddressRanges(ModuleRanges);
const auto RangesSectionOffset =
RangesSectionsWriter->addCURanges(DIE.getDwarfUnit()->getOffset(),
std::move(OutputRanges));
RangesSectionsWriter->addCURanges(DIE.getDwarfUnit()->getOffset(),
std::move(OutputRanges));
updateDWARFObjectAddressRanges(DIE, RangesSectionOffset);
}
break;
case dwarf::DW_TAG_subprogram:
{
// Get function address either from ranges or [LowPC, HighPC) pair.
bool UsesRanges = false;
uint64_t Address;
uint64_t SectionIndex, HighPC;
if (!DIE.getLowAndHighPC(Address, HighPC, SectionIndex)) {
auto Ranges = DIE.getAddressRanges();
// Not a function definition.
if (Ranges.empty())
break;
Address = Ranges.front().LowPC;
UsesRanges = true;
}
IsFunctionDef = true;
const auto *Function = BC.getBinaryFunctionAtAddress(Address);
if (Function && Function->isFolded())
Function = nullptr;
FunctionStack.push_back(Function);
DebugAddressRangesVector FunctionRanges;
if (Function)
FunctionRanges = Function->getOutputAddressRanges();
// Update ranges.
if (UsesRanges) {
updateDWARFObjectAddressRanges(DIE,
RangesSectionsWriter->addRanges(FunctionRanges));
} else {
// Delay conversion of [LowPC, HighPC) into DW_AT_ranges if possible.
const auto *Abbrev = DIE.getAbbreviationDeclarationPtr();
assert(Abbrev && "abbrev expected");
// Create a critical section.
static std::shared_timed_mutex CriticalSectionMutex;
std::unique_lock<std::shared_timed_mutex> Lock(CriticalSectionMutex);
if (FunctionRanges.size() > 1) {
convertPending(Abbrev);
// Exit critical section early.
Lock.unlock();
convertToRanges(DIE, FunctionRanges);
} else if (ConvertedRangesAbbrevs.find(Abbrev) !=
ConvertedRangesAbbrevs.end()) {
// Exit critical section early.
Lock.unlock();
convertToRanges(DIE, FunctionRanges);
} else {
if (FunctionRanges.empty())
FunctionRanges.emplace_back(DebugAddressRange());
PendingRanges[Abbrev].emplace_back(
std::make_pair(DIE, FunctionRanges.front()));
// The function cannot have multiple ranges on the input.
uint64_t SectionIndex, LowPC, HighPC;
if (DIE.getLowAndHighPC(LowPC, HighPC, SectionIndex)) {
IsFunctionDef = true;
const auto *Function = getBinaryFunctionAtAddress(LowPC);
if (Function && Function->isFolded()) {
Function = nullptr;
}
FunctionStack.push_back(Function);
auto RangesSectionOffset =
RangesSectionsWriter->getEmptyRangesOffset();
if (Function) {
auto FunctionRanges = Function->getOutputAddressRanges();
RangesSectionOffset =
RangesSectionsWriter->addRanges(Function,
std::move(FunctionRanges));
}
updateDWARFObjectAddressRanges(DIE, RangesSectionOffset);
}
}
break;
@ -199,8 +136,8 @@ void DWARFRewriter::updateUnitDebugInfo(
<< Twine::utohexstr(DIE.getDwarfUnit()->getOffset()) << '\n';
}
);
RangesSectionOffset = RangesSectionsWriter->addRanges(
Function, std::move(OutputRanges), CachedFunction, CachedRanges);
RangesSectionOffset =
RangesSectionsWriter->addRanges(Function, std::move(OutputRanges));
}
updateDWARFObjectAddressRanges(DIE, RangesSectionOffset);
}
@ -249,7 +186,9 @@ void DWARFRewriter::updateUnitDebugInfo(
}
}
std::lock_guard<std::mutex> Lock(DebugInfoPatcherMutex);
auto DebugInfoPatcher =
static_cast<SimpleBinaryPatcher *>(
SectionPatchers[".debug_info"].get());
DebugInfoPatcher->addLE32Patch(AttrOffset, LocListSectionOffset);
} else {
assert((Value.isFormClass(DWARFFormValue::FC_Exprloc) ||
@ -269,8 +208,9 @@ void DWARFRewriter::updateUnitDebugInfo(
<< " for DIE with tag " << DIE.getTag()
<< " to 0x" << Twine::utohexstr(NewAddress) << '\n');
}
std::lock_guard<std::mutex> Lock(DebugInfoPatcherMutex);
auto DebugInfoPatcher =
static_cast<SimpleBinaryPatcher *>(
SectionPatchers[".debug_info"].get());
DebugInfoPatcher->addLE64Patch(AttrOffset, NewAddress);
} else if (opts::Verbosity >= 1) {
errs() << "BOLT-WARNING: unexpected form value for attribute at 0x"
@ -282,14 +222,14 @@ void DWARFRewriter::updateUnitDebugInfo(
// Recursively update each child.
for (auto Child = DIE.getFirstChild(); Child; Child = Child.getSibling()) {
updateUnitDebugInfo(Child, FunctionStack, CachedFunction, CachedRanges);
updateUnitDebugInfo(Child, FunctionStack);
}
if (IsFunctionDef)
FunctionStack.pop_back();
}
void DWARFRewriter::updateDWARFObjectAddressRanges(
void RewriteInstance::updateDWARFObjectAddressRanges(
const DWARFDie DIE, uint64_t DebugRangesOffset) {
// Some objects don't have an associated DIE and cannot be updated (such as
@ -299,10 +239,17 @@ void DWARFRewriter::updateDWARFObjectAddressRanges(
}
if (opts::Verbosity >= 2 && DebugRangesOffset == -1U) {
errs() << "BOLT-WARNING: using invalid DW_AT_ranges for DIE at offset 0x"
errs() << "BOLT-WARNING: using invalid DW_AT_range for DIE at offset 0x"
<< Twine::utohexstr(DIE.getOffset()) << '\n';
}
auto DebugInfoPatcher =
static_cast<SimpleBinaryPatcher *>(SectionPatchers[".debug_info"].get());
auto AbbrevPatcher =
static_cast<DebugAbbrevPatcher*>(SectionPatchers[".debug_abbrev"].get());
assert(DebugInfoPatcher && AbbrevPatcher && "Patchers not initialized.");
const auto *AbbreviationDecl = DIE.getAbbreviationDeclarationPtr();
if (!AbbreviationDecl) {
if (opts::Verbosity >= 1) {
@ -313,14 +260,14 @@ void DWARFRewriter::updateDWARFObjectAddressRanges(
return;
}
auto AbbrevCode = AbbreviationDecl->getCode();
if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_ranges)) {
// Case 1: The object was already non-contiguous and had DW_AT_ranges.
// In this case we simply need to update the value of DW_AT_ranges.
uint32_t AttrOffset = -1U;
DIE.find(dwarf::DW_AT_ranges, &AttrOffset);
assert(AttrOffset != -1U && "failed to locate DWARF attribute");
std::lock_guard<std::mutex> Lock(DebugInfoPatcherMutex);
DebugInfoPatcher->addLE32Patch(AttrOffset, DebugRangesOffset);
} else {
// Case 2: The object has both DW_AT_low_pc and DW_AT_high_pc emitted back
@ -337,8 +284,50 @@ void DWARFRewriter::updateDWARFObjectAddressRanges(
// large size.
if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_low_pc) &&
AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_high_pc)) {
convertToRanges(AbbreviationDecl);
convertToRanges(DIE, DebugRangesOffset);
uint32_t LowPCOffset = -1U;
uint32_t HighPCOffset = -1U;
DWARFFormValue LowPCFormValue =
*DIE.find(dwarf::DW_AT_low_pc, &LowPCOffset);
DWARFFormValue HighPCFormValue =
*DIE.find(dwarf::DW_AT_high_pc, &HighPCOffset);
if (LowPCFormValue.getForm() != dwarf::DW_FORM_addr ||
(HighPCFormValue.getForm() != dwarf::DW_FORM_addr &&
HighPCFormValue.getForm() != dwarf::DW_FORM_data8 &&
HighPCFormValue.getForm() != dwarf::DW_FORM_data4)) {
errs() << "BOLT-WARNING: unexpected form value. Cannot update DIE "
<< "at offset 0x" << Twine::utohexstr(DIE.getOffset())
<< "\n";
return;
}
if (LowPCOffset == -1U || (LowPCOffset + 8 != HighPCOffset)) {
errs() << "BOLT-WARNING: high_pc expected immediately after low_pc. "
<< "Cannot update DIE at offset 0x"
<< Twine::utohexstr(DIE.getOffset()) << '\n';
return;
}
AbbrevPatcher->addAttributePatch(DIE.getDwarfUnit(),
AbbrevCode,
dwarf::DW_AT_low_pc,
dwarf::DW_AT_ranges,
dwarf::DW_FORM_sec_offset);
AbbrevPatcher->addAttributePatch(DIE.getDwarfUnit(),
AbbrevCode,
dwarf::DW_AT_high_pc,
dwarf::DW_AT_low_pc,
dwarf::DW_FORM_udata);
unsigned LowPCSize = 0;
if (HighPCFormValue.getForm() == dwarf::DW_FORM_addr ||
HighPCFormValue.getForm() == dwarf::DW_FORM_data8) {
LowPCSize = 12;
} else if (HighPCFormValue.getForm() == dwarf::DW_FORM_data4) {
LowPCSize = 8;
} else {
llvm_unreachable("unexpected form");
}
DebugInfoPatcher->addLE32Patch(LowPCOffset, DebugRangesOffset);
DebugInfoPatcher->addUDataPatch(LowPCOffset + 4, 0, LowPCSize);
} else {
if (opts::Verbosity >= 1) {
errs() << "BOLT-WARNING: Cannot update ranges for DIE at offset 0x"
@ -348,8 +337,8 @@ void DWARFRewriter::updateDWARFObjectAddressRanges(
}
}
void DWARFRewriter::updateDebugLineInfoForNonSimpleFunctions() {
for (auto &It : BC.getBinaryFunctions()) {
void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() {
for (auto &It : BinaryFunctions) {
const auto &Function = It.second;
if (Function.isSimple())
@ -364,7 +353,7 @@ void DWARFRewriter::updateDebugLineInfoForNonSimpleFunctions() {
std::vector<uint32_t> Results;
MCSectionELF *FunctionSection =
BC.Ctx->getELFSection(Function.getCodeSectionName(),
BC->Ctx->getELFSection(Function.getCodeSectionName(),
ELF::SHT_PROGBITS,
ELF::SHF_EXECINSTR | ELF::SHF_ALLOC);
@ -372,10 +361,10 @@ void DWARFRewriter::updateDebugLineInfoForNonSimpleFunctions() {
if (LineTable->lookupAddressRange(Address, Function.getMaxSize(),
Results)) {
auto &OutputLineTable =
BC.Ctx->getMCDwarfLineTable(Unit->getOffset()).getMCLineSections();
BC->Ctx->getMCDwarfLineTable(Unit->getOffset()).getMCLineSections();
for (auto RowIndex : Results) {
const auto &Row = LineTable->Rows[RowIndex];
BC.Ctx->setCurrentDwarfLoc(
BC->Ctx->setCurrentDwarfLoc(
Row.File,
Row.Line,
Row.Column,
@ -386,17 +375,17 @@ void DWARFRewriter::updateDebugLineInfoForNonSimpleFunctions() {
Row.Isa,
Row.Discriminator,
Row.Address);
auto Loc = BC.Ctx->getCurrentDwarfLoc();
BC.Ctx->clearDwarfLocSeen();
auto Loc = BC->Ctx->getCurrentDwarfLoc();
BC->Ctx->clearDwarfLocSeen();
OutputLineTable.addLineEntry(MCDwarfLineEntry{nullptr, Loc},
FunctionSection);
}
// Add an empty entry past the end of the function
// for end_sequence mark.
BC.Ctx->setCurrentDwarfLoc(0, 0, 0, 0, 0, 0,
BC->Ctx->setCurrentDwarfLoc(0, 0, 0, 0, 0, 0,
Address + Function.getMaxSize());
auto Loc = BC.Ctx->getCurrentDwarfLoc();
BC.Ctx->clearDwarfLocSeen();
auto Loc = BC->Ctx->getCurrentDwarfLoc();
BC->Ctx->clearDwarfLocSeen();
OutputLineTable.addLineEntry(MCDwarfLineEntry{nullptr, Loc},
FunctionSection);
} else {
@ -406,9 +395,9 @@ void DWARFRewriter::updateDebugLineInfoForNonSimpleFunctions() {
}
}
void DWARFRewriter::updateLineTableOffsets() {
void RewriteInstance::updateLineTableOffsets() {
const auto *LineSection =
BC.Ctx->getObjectFileInfo()->getDwarfLineSection();
BC->Ctx->getObjectFileInfo()->getDwarfLineSection();
auto CurrentFragment = LineSection->begin();
uint32_t CurrentOffset = 0;
uint32_t Offset = 0;
@ -417,7 +406,7 @@ void DWARFRewriter::updateLineTableOffsets() {
// output file, thus we can compute all table's offset by passing through
// each fragment at most once, continuing from the last CU's beginning
// instead of from the first fragment.
for (const auto &CUIDLineTablePair : BC.Ctx->getMCDwarfLineTables()) {
for (const auto &CUIDLineTablePair : BC->Ctx->getMCDwarfLineTables()) {
auto Label = CUIDLineTablePair.second.getLabel();
if (!Label)
continue;
@ -426,10 +415,10 @@ void DWARFRewriter::updateLineTableOffsets() {
if (CUOffset == -1U)
continue;
auto *CU = BC.DwCtx->getCompileUnitForOffset(CUOffset);
auto *CU = BC->DwCtx->getCompileUnitForOffset(CUOffset);
assert(CU && "no CU found at offset");
auto LTOffset =
BC.DwCtx->getAttrFieldOffsetForUnit(CU, dwarf::DW_AT_stmt_list);
BC->DwCtx->getAttrFieldOffsetForUnit(CU, dwarf::DW_AT_stmt_list);
if (!LTOffset)
continue;
@ -455,9 +444,9 @@ void DWARFRewriter::updateLineTableOffsets() {
Offset += Label->getOffset() - CurrentOffset;
CurrentOffset = Label->getOffset();
auto DbgInfoSection = BC.getUniqueSectionByName(".debug_info");
auto DbgInfoSection = BC->getUniqueSectionByName(".debug_info");
assert(DbgInfoSection && ".debug_info section must exist");
auto *Zero = BC.registerNameAtAddress("Zero", 0, 0, 0);
auto *Zero = BC->registerNameAtAddress("Zero", 0, 0, 0);
DbgInfoSection->addRelocation(LTOffset,
Zero,
ELF::R_X86_64_32,
@ -474,43 +463,43 @@ void DWARFRewriter::updateLineTableOffsets() {
}
}
void DWARFRewriter::finalizeDebugSections() {
void RewriteInstance::finalizeDebugSections() {
// Skip .debug_aranges if we are re-generating .gdb_index.
if (opts::KeepARanges || !BC.getGdbIndexSection()) {
if (opts::KeepARanges || !GdbIndexSection) {
SmallVector<char, 16> ARangesBuffer;
raw_svector_ostream OS(ARangesBuffer);
auto MAB = std::unique_ptr<MCAsmBackend>(BC.TheTarget->createMCAsmBackend(
*BC.STI, *BC.MRI, MCTargetOptions()));
auto MAB = std::unique_ptr<MCAsmBackend>(BC->TheTarget->createMCAsmBackend(
*BC->STI, *BC->MRI, MCTargetOptions()));
auto Writer = std::unique_ptr<MCObjectWriter>(MAB->createObjectWriter(OS));
RangesSectionsWriter->writeArangesSection(Writer.get());
const auto &ARangesContents = OS.str();
BC.registerOrUpdateNoteSection(".debug_aranges",
BC->registerOrUpdateNoteSection(".debug_aranges",
copyByteArray(ARangesContents),
ARangesContents.size());
}
auto RangesSectionContents = RangesSectionsWriter->finalize();
BC.registerOrUpdateNoteSection(".debug_ranges",
BC->registerOrUpdateNoteSection(".debug_ranges",
copyByteArray(*RangesSectionContents),
RangesSectionContents->size());
auto LocationListSectionContents = LocationListWriter->finalize();
BC.registerOrUpdateNoteSection(".debug_loc",
BC->registerOrUpdateNoteSection(".debug_loc",
copyByteArray(*LocationListSectionContents),
LocationListSectionContents->size());
}
void DWARFRewriter::updateGdbIndexSection() {
if (!BC.getGdbIndexSection())
void RewriteInstance::updateGdbIndexSection() {
if (!GdbIndexSection)
return;
// See https://sourceware.org/gdb/onlinedocs/gdb/Index-Section-Format.html for
// .gdb_index section format.
StringRef GdbIndexContents = BC.getGdbIndexSection()->getContents();
StringRef GdbIndexContents = GdbIndexSection->getContents();
const auto *Data = GdbIndexContents.data();
@ -534,13 +523,13 @@ void DWARFRewriter::updateGdbIndexSection() {
// Map CUs offsets to indices and verify existing index table.
std::map<uint32_t, uint32_t> OffsetToIndexMap;
const auto CUListSize = CUTypesOffset - CUListOffset;
const auto NumCUs = BC.DwCtx->getNumCompileUnits();
const auto NumCUs = BC->DwCtx->getNumCompileUnits();
if (CUListSize != NumCUs * 16) {
errs() << "BOLT-ERROR: .gdb_index: CU count mismatch\n";
exit(1);
}
for (unsigned Index = 0; Index < NumCUs; ++Index, Data += 16) {
const auto *CU = BC.DwCtx->getCompileUnitAtIndex(Index);
const auto *CU = BC->DwCtx->getCompileUnitAtIndex(Index);
const auto Offset = read64le(Data);
if (CU->getOffset() != Offset) {
errs() << "BOLT-ERROR: .gdb_index CU offset mismatch\n";
@ -606,123 +595,7 @@ void DWARFRewriter::updateGdbIndexSection() {
memcpy(Buffer, Data, TrailingSize);
// Register the new section.
BC.registerOrUpdateNoteSection(".gdb_index",
BC->registerOrUpdateNoteSection(".gdb_index",
NewGdbIndexContents,
NewGdbIndexSize);
}
/// Register abbreviation patches for \p Abbrev that turn its
/// DW_AT_low_pc/DW_AT_high_pc pair into DW_AT_ranges (DW_FORM_sec_offset)
/// followed by DW_AT_low_pc (DW_FORM_udata).
void DWARFRewriter::convertToRanges(
    const DWARFAbbreviationDeclaration *Abbrev) {
  // Hold the abbrev patcher mutex while both patches are registered.
  std::lock_guard<std::mutex> Guard(AbbrevPatcherMutex);

  // DW_AT_low_pc becomes DW_AT_ranges.
  AbbrevPatcher->addAttributePatch(Abbrev, dwarf::DW_AT_low_pc,
                                   dwarf::DW_AT_ranges,
                                   dwarf::DW_FORM_sec_offset);

  // DW_AT_high_pc becomes DW_AT_low_pc.
  AbbrevPatcher->addAttributePatch(Abbrev, dwarf::DW_AT_high_pc,
                                   dwarf::DW_AT_low_pc,
                                   dwarf::DW_FORM_udata);
}
/// Convert \p DIE to use DW_AT_ranges for \p Ranges. An empty \p Ranges maps
/// to the ranges writer's empty-ranges offset; otherwise the ranges are added
/// to the ranges section and the returned offset is used.
void DWARFRewriter::convertToRanges(DWARFDie DIE,
                                    const DebugAddressRangesVector &Ranges) {
  const uint64_t SectionOffset =
      Ranges.empty() ? RangesSectionsWriter->getEmptyRangesOffset()
                     : RangesSectionsWriter->addRanges(Ranges);
  convertToRanges(DIE, SectionOffset);
}
/// Switch \p Abbrev over to DW_AT_ranges, unless that was already done, and
/// convert every DIE that was queued in PendingRanges for this abbreviation.
void DWARFRewriter::convertPending(const DWARFAbbreviationDeclaration *Abbrev) {
  // Each abbreviation is converted at most once.
  if (ConvertedRangesAbbrevs.find(Abbrev) != ConvertedRangesAbbrevs.end())
    return;

  convertToRanges(Abbrev);

  auto Pending = PendingRanges.find(Abbrev);
  if (Pending != PendingRanges.end()) {
    // Queued DIEs carry a single range each; convert them and drop the queue.
    for (auto &DIEAndRange : Pending->second)
      convertToRanges(DIEAndRange.first, {DIEAndRange.second});
    PendingRanges.erase(Pending);
  }

  ConvertedRangesAbbrevs.emplace(Abbrev);
}
/// Patch the low/high PC of every DIE still queued in PendingRanges with the
/// range recorded for it.
void DWARFRewriter::flushPendingRanges() {
  for (auto &AbbrevAndDIEs : PendingRanges)
    for (auto &DIEAndRange : AbbrevAndDIEs.second)
      patchLowHigh(DIEAndRange.first, DIEAndRange.second);
}
namespace {

/// Look up DW_AT_low_pc and DW_AT_high_pc of \p DIE and return their form
/// values and offsets through the reference parameters. Both offsets are
/// preset to -1U before the lookup.
///
/// A warning is printed (and nothing else is done) when the forms are not the
/// expected ones or when high_pc does not start 8 bytes after low_pc; the
/// output parameters keep whatever the lookup produced in that case.
///
/// NOTE(review): the results of DIE.find() are dereferenced without checking
/// that the attribute was found -- confirm callers only pass DIEs that have
/// both attributes.
void getRangeAttrData(
    DWARFDie DIE,
    uint32_t &LowPCOffset, uint32_t &HighPCOffset,
    DWARFFormValue &LowPCFormValue, DWARFFormValue &HighPCFormValue) {
  LowPCOffset = -1U;
  HighPCOffset = -1U;
  LowPCFormValue = *DIE.find(dwarf::DW_AT_low_pc, &LowPCOffset);
  HighPCFormValue = *DIE.find(dwarf::DW_AT_high_pc, &HighPCOffset);

  const bool LowFormIsAddr =
      LowPCFormValue.getForm() == dwarf::DW_FORM_addr;
  const bool HighFormIsSupported =
      HighPCFormValue.getForm() == dwarf::DW_FORM_addr ||
      HighPCFormValue.getForm() == dwarf::DW_FORM_data8 ||
      HighPCFormValue.getForm() == dwarf::DW_FORM_data4;
  if (!LowFormIsAddr || !HighFormIsSupported) {
    errs() << "BOLT-WARNING: unexpected form value. Cannot update DIE "
           << "at offset 0x" << Twine::utohexstr(DIE.getOffset()) << "\n";
    return;
  }

  const bool HighFollowsLow =
      LowPCOffset != -1U && LowPCOffset + 8 == HighPCOffset;
  if (!HighFollowsLow) {
    errs() << "BOLT-WARNING: high_pc expected immediately after low_pc. "
           << "Cannot update DIE at offset 0x"
           << Twine::utohexstr(DIE.getOffset()) << '\n';
  }
}

} // anonymous namespace
/// Patch DW_AT_low_pc/DW_AT_high_pc of \p DIE in place from \p Range: low_pc
/// receives Range.LowPC as a 64-bit value and high_pc receives the difference
/// HighPC - LowPC, written as 64 or 32 bits depending on the high_pc form.
void DWARFRewriter::patchLowHigh(DWARFDie DIE, DebugAddressRange Range) {
  uint32_t LowPCOffset, HighPCOffset;
  DWARFFormValue LowPCFormValue, HighPCFormValue;
  getRangeAttrData(
      DIE, LowPCOffset, HighPCOffset, LowPCFormValue, HighPCFormValue);

  DebugInfoPatcher->addLE64Patch(LowPCOffset, Range.LowPC);

  const bool HighIs64Bit =
      HighPCFormValue.getForm() == dwarf::DW_FORM_addr ||
      HighPCFormValue.getForm() == dwarf::DW_FORM_data8;
  if (!HighIs64Bit) {
    DebugInfoPatcher->addLE32Patch(HighPCOffset, Range.HighPC - Range.LowPC);
    return;
  }
  DebugInfoPatcher->addLE64Patch(HighPCOffset, Range.HighPC - Range.LowPC);
}
/// Update \p DIE, which used DW_AT_low_pc/DW_AT_high_pc, so that it carries
/// \p RangesSectionOffset instead: a 32-bit patch with the offset is written
/// at the old low_pc position, followed by a udata patch of value 0.
void DWARFRewriter::convertToRanges(DWARFDie DIE,
                                    uint64_t RangesSectionOffset) {
  uint32_t LowPCOffset, HighPCOffset;
  DWARFFormValue LowPCFormValue, HighPCFormValue;
  getRangeAttrData(
      DIE, LowPCOffset, HighPCOffset, LowPCFormValue, HighPCFormValue);

  // Size passed to the udata patch. Presumably this is what remains of the
  // original low_pc + high_pc bytes (8 + 8 or 8 + 4) after the 4-byte ranges
  // offset -- confirm against SimpleBinaryPatcher::addUDataPatch.
  unsigned LowPCSize = 0;
  switch (HighPCFormValue.getForm()) {
  case dwarf::DW_FORM_addr:
  case dwarf::DW_FORM_data8:
    LowPCSize = 12;
    break;
  case dwarf::DW_FORM_data4:
    LowPCSize = 8;
    break;
  default:
    llvm_unreachable("unexpected form");
  }

  std::lock_guard<std::mutex> Guard(DebugInfoPatcherMutex);
  DebugInfoPatcher->addLE32Patch(LowPCOffset, RangesSectionOffset);
  DebugInfoPatcher->addUDataPatch(LowPCOffset + 4, 0, LowPCSize);
}

View File

@ -1,125 +0,0 @@
//===--- DWARFRewriter.h --------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_DWARF_REWRITER_H
#define LLVM_TOOLS_LLVM_BOLT_DWARF_REWRITER_H
#include "DebugData.h"
#include "RewriteInstance.h"
#include <map>
#include <mutex>
namespace llvm {
namespace bolt {
class BinaryFunction;
/// Updates DWARF debug information (debug info DIEs, address ranges, line
/// table offsets and .gdb_index) for a binary described by a BinaryContext.
/// Patches are recorded through the section patchers supplied at construction.
class DWARFRewriter {
DWARFRewriter() = delete;
/// Binary context this rewriter operates on.
BinaryContext &BC;
using SectionPatchersType = RewriteInstance::SectionPatchersType;
/// Section patchers supplied by the owner at construction time.
SectionPatchersType &SectionPatchers;
/// Patcher receiving the low_pc/high_pc and ranges-offset patches.
SimpleBinaryPatcher *DebugInfoPatcher{nullptr};
/// Held by convertToRanges(DIE, offset) while it adds patches to
/// DebugInfoPatcher.
std::mutex DebugInfoPatcherMutex;
/// Patcher receiving abbreviation attribute patches.
DebugAbbrevPatcher *AbbrevPatcher{nullptr};
/// Held by convertToRanges(Abbrev) while it adds patches to AbbrevPatcher.
std::mutex AbbrevPatcherMutex;
/// Stores and serializes information that will be put into the .debug_ranges
/// and .debug_aranges DWARF sections.
std::unique_ptr<DebugRangesSectionsWriter> RangesSectionsWriter;
/// Writer for location lists.
std::unique_ptr<DebugLocWriter> LocationListWriter;
/// Recursively update debug info for all DIEs starting at \p DIE.
/// NOTE(review): earlier wording referred to \p Unit and \p Function, which
/// are not parameters of this declaration. Presumably \p FunctionStack holds
/// the functions of the enclosing DW_TAG_subprogram nodes, and
/// \p CachedFunction / \p CachedRanges are caller-provided caches for range
/// lookups -- confirm against the definition.
void updateUnitDebugInfo(
const DWARFDie DIE, std::vector<const BinaryFunction *> FunctionStack,
const BinaryFunction *&CachedFunction,
std::map<DebugAddressRangesVector, uint64_t> &CachedRanges);
/// Patches the binary for an object's address ranges to be updated.
/// The object can be anything that has associated address ranges via either
/// DW_AT_low/high_pc or DW_AT_ranges (i.e. functions, lexical blocks, etc).
/// \p DebugRangesOffset is the offset in .debug_ranges of the object's
/// new address ranges in the output binary.
/// \p DIE is the object's DIE in the input binary.
void updateDWARFObjectAddressRanges(const DWARFDie DIE,
uint64_t DebugRangesOffset);
/// Generate new contents for .debug_ranges and .debug_aranges sections.
void finalizeDebugSections();
/// Patches the binary for DWARF address ranges (e.g. in functions and lexical
/// blocks) to be updated.
void updateDebugAddressRanges();
/// Rewrite .gdb_index section if present.
void updateGdbIndexSection();
/// Abbreviations that were converted to use DW_AT_ranges.
std::set<const DWARFAbbreviationDeclaration *> ConvertedRangesAbbrevs;
/// DIEs with abbrevs that were not converted to DW_AT_ranges.
/// We only update those when all DIEs have been processed to guarantee that
/// the abbrev (which is shared) is intact.
std::map<const DWARFAbbreviationDeclaration *,
std::vector<std::pair<DWARFDie, DebugAddressRange>>> PendingRanges;
/// Convert \p Abbrev from using a simple DW_AT_(low|high)_pc range to
/// DW_AT_ranges.
void convertToRanges(const DWARFAbbreviationDeclaration *Abbrev);
/// Update \p DIE that was using DW_AT_(low|high)_pc with DW_AT_ranges offset.
void convertToRanges(DWARFDie DIE, uint64_t RangesSectionOffset);
/// Same as above, but takes a vector of \p Ranges as a parameter.
void convertToRanges(DWARFDie DIE, const DebugAddressRangesVector &Ranges);
/// Patch DW_AT_(low|high)_pc values for the \p DIE based on \p Range.
void patchLowHigh(DWARFDie DIE, DebugAddressRange Range);
/// Convert pending ranges associated with the given \p Abbrev.
void convertPending(const DWARFAbbreviationDeclaration *Abbrev);
/// Once all DIEs were seen, update DW_AT_(low|high)_pc values.
void flushPendingRanges();
public:
/// Construct a rewriter bound to \p BC and \p SectionPatchers; both are held
/// by reference, so they must outlive this object.
DWARFRewriter(BinaryContext &BC,
SectionPatchersType &SectionPatchers)
: BC(BC), SectionPatchers(SectionPatchers) {}
/// Main function for updating the DWARF debug info.
void updateDebugInfo();
/// Computes output .debug_line line table offsets for each compile unit,
/// and updates stmt_list for a corresponding compile unit.
void updateLineTableOffsets();
/// Updates debug line information for non-simple functions, which are not
/// rewritten.
void updateDebugLineInfoForNonSimpleFunctions();
};
} // namespace bolt
} // namespace llvm
#endif

View File

@ -14,7 +14,6 @@
#include "BinaryContext.h"
#include "BinaryFunction.h"
#include "BoltAddressTranslation.h"
#include "DataAggregator.h"
#include "Heatmap.h"
#include "llvm/Support/Debug.h"
@ -55,13 +54,6 @@ IgnoreBuildID("ignore-build-id",
cl::init(false),
cl::cat(AggregatorCategory));
static cl::opt<bool>
FilterMemProfile("filter-mem-profile",
cl::desc("if processing a memory profile, filter out stack or heap accesses that "
"won't be useful for BOLT to reduce profile file size"),
cl::init(true),
cl::cat(AggregatorCategory));
static cl::opt<unsigned>
HeatmapBlock("block-size",
cl::desc("size of a heat map block in bytes (default 64)"),
@ -96,13 +88,6 @@ TimeAggregator("time-aggr",
cl::ZeroOrMore,
cl::cat(AggregatorCategory));
static cl::opt<bool>
UseEventPC("use-event-pc",
cl::desc("use event PC in combination with LBR sampling"),
cl::init(false),
cl::ZeroOrMore,
cl::cat(AggregatorCategory));
static cl::opt<bool>
WriteAutoFDOData("autofdo",
cl::desc("generate autofdo textual data instead of bolt data"),
@ -225,7 +210,6 @@ void DataAggregator::launchPerfProcess(StringRef Name, PerfProcessInfo &PPI,
*Str++ = 0;
} while (true);
Argv.push_back("-f");
Argv.push_back("-i");
Argv.push_back(PerfDataFilename.data());
Argv.push_back(nullptr);
@ -248,18 +232,13 @@ void DataAggregator::launchPerfProcess(StringRef Name, PerfProcessInfo &PPI,
TempFiles.push_back(PPI.StderrPath.data());
Optional<StringRef> Redirects[] = {
llvm::None, // Stdin
llvm::None, // Stdin
StringRef(PPI.StdoutPath.data()), // Stdout
StringRef(PPI.StderrPath.data())}; // Stderr
DEBUG({
dbgs() << "Launching perf: ";
for (const char *Arg : Argv)
dbgs() << Arg << " ";
dbgs() << " 1> "
<< PPI.StdoutPath.data() << " 2> "
<< PPI.StderrPath.data() << "\n";
});
DEBUG(dbgs() << "Launching perf: " << PerfPath.data() << " 1> "
<< PPI.StdoutPath.data() << " 2> "
<< PPI.StderrPath.data() << "\n");
if (Wait) {
PPI.PI.ReturnCode =
@ -443,8 +422,11 @@ std::error_code DataAggregator::writeAutoFDOData() {
return std::error_code();
}
void DataAggregator::parseProfile(BinaryContext &BC) {
void DataAggregator::parseProfile(
BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs) {
this->BC = &BC;
this->BFs = &BFs;
if (opts::ReadPreAggregated) {
parsePreAggregated();
@ -564,7 +546,9 @@ void DataAggregator::parseProfile(BinaryContext &BC) {
deleteTempFiles();
}
void DataAggregator::processProfile(BinaryContext &BC) {
void DataAggregator::processProfile(
BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs) {
if (opts::ReadPreAggregated)
processPreAggregated();
else if (opts::BasicAggregation)
@ -575,7 +559,7 @@ void DataAggregator::processProfile(BinaryContext &BC) {
processMemEvents();
// Mark all functions with registered events as having a valid profile.
for (auto &BFI : BC.getBinaryFunctions()) {
for (auto &BFI : BFs) {
auto &BF = BFI.second;
if (BF.getBranchData()) {
const auto Flags = opts::BasicAggregation ? BinaryFunction::PF_SAMPLE
@ -593,46 +577,19 @@ void DataAggregator::processProfile(BinaryContext &BC) {
}
BinaryFunction *
DataAggregator::getBinaryFunctionContainingAddress(uint64_t Address) const {
DataAggregator::getBinaryFunctionContainingAddress(uint64_t Address) {
if (!BC->containsAddress(Address))
return nullptr;
// Use shallow search to avoid fetching the parent function, in case
// BinaryContext linked two functions. When aggregating data and writing the
// profile, we want to write offsets relative to the closest symbol in the
// symbol table, not relative to the parent function, to avoid creating
// profile that is too fragile and depends on the layout of other functions.
return BC->getBinaryFunctionContainingAddress(Address, /*CheckPastEnd=*/false,
/*UseMaxSize=*/true,
/*Shallow=*/true);
}
auto FI = BFs->upper_bound(Address);
if (FI == BFs->begin())
return nullptr;
--FI;
StringRef DataAggregator::getLocationName(BinaryFunction &Func,
uint64_t Count) {
if (!BAT)
return Func.getNames()[0];
const auto *OrigFunc = &Func;
if (const auto HotAddr = BAT->fetchParentAddress(Func.getAddress())) {
NumColdSamples += Count;
auto *HotFunc = getBinaryFunctionContainingAddress(HotAddr);
if (HotFunc)
OrigFunc = HotFunc;
}
const auto &Names = OrigFunc->getNames();
// If it is a local function, prefer the name containing the file name where
// the local function was declared
for (const auto &Name : Names) {
StringRef AlternativeName = Name;
size_t FileNameIdx = AlternativeName.find('/');
// Confirm the alternative name has the pattern Symbol/FileName/1 before
// using it
if (FileNameIdx == StringRef::npos ||
AlternativeName.find('/', FileNameIdx + 1) == StringRef::npos)
continue;
return AlternativeName;
}
return Names[0];
const auto UsedSize = FI->second.getMaxSize();
if (Address >= FI->first + UsedSize)
return nullptr;
return &FI->second;
}
bool DataAggregator::doSample(BinaryFunction &Func, uint64_t Address,
@ -640,17 +597,12 @@ bool DataAggregator::doSample(BinaryFunction &Func, uint64_t Address,
auto I = FuncsToSamples.find(Func.getNames()[0]);
if (I == FuncsToSamples.end()) {
bool Success;
StringRef LocName = getLocationName(Func, Count);
std::tie(I, Success) = FuncsToSamples.insert(std::make_pair(
Func.getNames()[0],
FuncSampleData(LocName, FuncSampleData::ContainerTy())));
FuncSampleData(Func.getNames()[0], FuncSampleData::ContainerTy())));
}
Address -= Func.getAddress();
if (BAT)
Address = BAT->translate(Func, Address, /*IsBranchSrc=*/false);
I->second.bumpCount(Address, Count);
I->second.bumpCount(Address - Func.getAddress(), Count);
return true;
}
@ -660,26 +612,12 @@ bool DataAggregator::doIntraBranch(BinaryFunction &Func, uint64_t From,
FuncBranchData *AggrData = Func.getBranchData();
if (!AggrData) {
AggrData = &FuncsToBranches[Func.getNames()[0]];
AggrData->Name = getLocationName(Func, Count);
AggrData->Name = Func.getNames()[0];
Func.setBranchData(AggrData);
}
From -= Func.getAddress();
To -= Func.getAddress();
DEBUG(dbgs() << "BOLT-DEBUG: bumpBranchCount: " << Func.getPrintName()
<< " @ " << Twine::utohexstr(From) << " -> "
<< Func.getPrintName() << " @ " << Twine::utohexstr(To)
<< '\n');
if (BAT) {
From = BAT->translate(Func, From, /*IsBranchSrc=*/true);
To = BAT->translate(Func, To, /*IsBranchSrc=*/false);
DEBUG(dbgs() << "BOLT-DEBUG: BAT translation on bumpBranchCount: "
<< Func.getPrintName() << " @ " << Twine::utohexstr(From)
<< " -> " << Func.getPrintName() << " @ "
<< Twine::utohexstr(To) << '\n');
}
AggrData->bumpBranchCount(From, To, Count, Mispreds);
AggrData->bumpBranchCount(From - Func.getAddress(), To - Func.getAddress(),
Count, Mispreds);
return true;
}
@ -692,30 +630,26 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc,
StringRef SrcFunc;
StringRef DstFunc;
if (FromFunc) {
SrcFunc = getLocationName(*FromFunc, Count);
SrcFunc = FromFunc->getNames()[0];
FromAggrData = FromFunc->getBranchData();
if (!FromAggrData) {
FromAggrData = &FuncsToBranches[FromFunc->getNames()[0]];
FromAggrData = &FuncsToBranches[SrcFunc];
FromAggrData->Name = SrcFunc;
FromFunc->setBranchData(FromAggrData);
}
From -= FromFunc->getAddress();
if (BAT)
From = BAT->translate(*FromFunc, From, /*IsBranchSrc=*/true);
FromFunc->recordExit(From, Mispreds, Count);
}
if (ToFunc) {
DstFunc = getLocationName(*ToFunc, 0);
DstFunc = ToFunc->getNames()[0];
ToAggrData = ToFunc->getBranchData();
if (!ToAggrData) {
ToAggrData = &FuncsToBranches[ToFunc->getNames()[0]];
ToAggrData = &FuncsToBranches[DstFunc];
ToAggrData->Name = DstFunc;
ToFunc->setBranchData(ToAggrData);
}
To -= ToFunc->getAddress();
if (BAT)
To = BAT->translate(*ToFunc, To, /*IsBranchSrc=*/false);
ToFunc->recordEntry(To, Mispreds, Count);
}
@ -750,19 +684,13 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
auto *FromFunc = getBinaryFunctionContainingAddress(First.To);
auto *ToFunc = getBinaryFunctionContainingAddress(Second.From);
if (!FromFunc || !ToFunc) {
DEBUG(
dbgs() << "Out of range trace starting in " << FromFunc->getPrintName()
<< " @ " << Twine::utohexstr(First.To - FromFunc->getAddress())
<< " and ending in " << ToFunc->getPrintName() << " @ "
<< ToFunc->getPrintName() << " @ "
<< Twine::utohexstr(Second.From - ToFunc->getAddress()) << '\n');
NumLongRangeTraces += Count;
return false;
}
if (FromFunc != ToFunc) {
NumInvalidTraces += Count;
DEBUG(dbgs() << "Invalid trace starting in " << FromFunc->getPrintName()
<< " @ " << Twine::utohexstr(First.To - FromFunc->getAddress())
DEBUG(dbgs() << "Trace starting in " << FromFunc->getPrintName() << " @ "
<< Twine::utohexstr(First.To - FromFunc->getAddress())
<< " and ending in " << ToFunc->getPrintName() << " @ "
<< ToFunc->getPrintName() << " @ "
<< Twine::utohexstr(Second.From - ToFunc->getAddress())
@ -770,22 +698,12 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
return false;
}
auto FTs = BAT ? BAT->getFallthroughsInTrace(*FromFunc, First, Second)
: FromFunc->getFallthroughsInTrace(First, Second, Count);
auto FTs = FromFunc->getFallthroughsInTrace(First, Second, Count);
if (!FTs) {
DEBUG(dbgs() << "Invalid trace starting in " << FromFunc->getPrintName()
<< " @ " << Twine::utohexstr(First.To - FromFunc->getAddress())
<< " and ending in " << ToFunc->getPrintName() << " @ "
<< ToFunc->getPrintName() << " @ "
<< Twine::utohexstr(Second.From - ToFunc->getAddress())
<< '\n');
NumInvalidTraces += Count;
return false;
}
DEBUG(dbgs() << "Processing " << FTs->size() << " fallthroughs for "
<< FromFunc->getPrintName() << ":" << Twine::utohexstr(First.To)
<< " to " << Twine::utohexstr(Second.From) << ".\n");
for (const auto &Pair : *FTs) {
doIntraBranch(*FromFunc, Pair.first + FromFunc->getAddress(),
Pair.second + FromFunc->getAddress(), Count, false);
@ -878,7 +796,7 @@ ErrorOr<DataAggregator::PerfBranchSample> DataAggregator::parseBranchSample() {
auto MMapInfoIter = BinaryMMapInfo.find(*PIDRes);
if (MMapInfoIter == BinaryMMapInfo.end()) {
consumeRestOfLine();
return make_error_code(errc::no_such_process);
return Res;
}
while (checkAndConsumeFS()) {}
@ -1091,11 +1009,8 @@ std::error_code DataAggregator::printLBRHeatMap() {
while (hasData()) {
auto SampleRes = parseBranchSample();
if (auto EC = SampleRes.getError()) {
if (EC == errc::no_such_process)
continue;
if (std::error_code EC = SampleRes.getError())
return EC;
}
auto &Sample = SampleRes.get();
@ -1156,39 +1071,33 @@ std::error_code DataAggregator::parseBranchEvents() {
uint64_t NumTotalSamples{0};
uint64_t NumEntries{0};
uint64_t NumSamples{0};
uint64_t NumSamplesNoLBR{0};
uint64_t NumTraces{0};
while (hasData()) {
++NumTotalSamples;
auto SampleRes = parseBranchSample();
if (auto EC = SampleRes.getError()) {
if (EC == errc::no_such_process)
continue;
if (std::error_code EC = SampleRes.getError())
return EC;
}
++NumSamples;
auto &Sample = SampleRes.get();
if (opts::WriteAutoFDOData)
++BasicSamples[Sample.PC];
if (Sample.LBR.empty()) {
++NumSamplesNoLBR;
if (Sample.LBR.empty())
continue;
}
++NumSamples;
NumEntries += Sample.LBR.size();
// LBRs are stored in reverse execution order. NextPC refers to the next
// recorded executed PC.
uint64_t NextPC = opts::UseEventPC ? Sample.PC : 0;
// LBRs are stored in reverse execution order. NextLBR refers to the next
// executed branch record.
const LBREntry *NextLBR{nullptr};
for (const auto &LBR : Sample.LBR) {
if (NextPC) {
if (NextLBR) {
// Record fall-through trace.
const auto TraceFrom = LBR.To;
const auto TraceTo = NextPC;
const auto TraceTo = NextLBR->From;
const auto *TraceBF = getBinaryFunctionContainingAddress(TraceFrom);
if (TraceBF && TraceBF->containsAddress(TraceTo)) {
auto &Info = FallthroughLBRs[Trace(TraceFrom, TraceTo)];
@ -1199,37 +1108,14 @@ std::error_code DataAggregator::parseBranchEvents() {
}
} else {
if (TraceBF && getBinaryFunctionContainingAddress(TraceTo)) {
DEBUG(dbgs() << "Invalid trace starting in "
<< TraceBF->getPrintName() << " @ "
<< Twine::utohexstr(TraceFrom - TraceBF->getAddress())
<< " and ending @ " << Twine::utohexstr(TraceTo)
<< '\n');
++NumInvalidTraces;
} else {
DEBUG(
dbgs() << "Out of range trace starting in "
<< (TraceBF ? TraceBF->getPrintName() : "None") << " @ "
<< Twine::utohexstr(
TraceFrom - (TraceBF ? TraceBF->getAddress() : 0))
<< " and ending in "
<< (getBinaryFunctionContainingAddress(TraceTo)
? getBinaryFunctionContainingAddress(TraceTo)
->getPrintName()
: "None")
<< " @ "
<< Twine::utohexstr(
TraceTo -
(getBinaryFunctionContainingAddress(TraceTo)
? getBinaryFunctionContainingAddress(TraceTo)
->getAddress()
: 0))
<< '\n');
++NumLongRangeTraces;
}
}
++NumTraces;
}
NextPC = LBR.From;
NextLBR = &LBR;
auto From = LBR.From;
if (!getBinaryFunctionContainingAddress(From))
@ -1273,23 +1159,14 @@ std::error_code DataAggregator::parseBranchEvents() {
outs() << "PERF2BOLT: read " << NumSamples << " samples and "
<< NumEntries << " LBR entries\n";
if (NumTotalSamples) {
if (NumSamples && NumSamplesNoLBR == NumSamples) {
// Note: we don't know if perf2bolt is being used to parse memory samples
// at this point. In this case, it is OK to parse zero LBRs.
errs() << "PERF2BOLT-WARNING: all recorded samples for this binary lack "
"LBR. Record profile with perf record -j any or run perf2bolt "
"in no-LBR mode with -nl (the performance improvement in -nl "
"mode may be limited)\n";
} else {
const auto IgnoredSamples = NumTotalSamples - NumSamples;
const auto PercentIgnored = 100.0f * IgnoredSamples / NumTotalSamples;
outs() << "PERF2BOLT: " << IgnoredSamples << " samples";
printColored(outs(), PercentIgnored, 20, 50);
outs() << " were ignored\n";
if (PercentIgnored > 50.0f) {
errs() << "PERF2BOLT-WARNING: less than 50% of all recorded samples "
"were attributed to the input binary\n";
}
const auto IgnoredSamples = NumTotalSamples - NumSamples;
const auto PercentIgnored = 100.0f * IgnoredSamples / NumTotalSamples;
outs() << "PERF2BOLT: " << IgnoredSamples << " samples";
printColored(outs(), PercentIgnored, 20, 50);
outs() << " were ignored\n";
if (PercentIgnored > 50.0f) {
errs() << "PERF2BOLT-WARNING: less than 50% of all recorded samples were "
"attributed to the input binary\n";
}
}
outs() << "PERF2BOLT: traces mismatching disassembled function contents: "
@ -1314,19 +1191,6 @@ std::error_code DataAggregator::parseBranchEvents() {
}
outs() << "\n";
if (NumColdSamples > 0) {
const auto ColdSamples = NumColdSamples * 100.0f / NumTotalSamples;
outs() << "PERF2BOLT: " << NumColdSamples
<< format(" (%.1f%%)", ColdSamples)
<< " samples recorded in cold regions of split functions.\n";
if (ColdSamples > 5.0f) {
outs()
<< "WARNING: The BOLT-processed binary where samples were collected "
"likely used bad data or your service observed a large shift in "
"profile. You may want to audit this.\n";
}
}
return std::error_code();
}
@ -1466,17 +1330,11 @@ void DataAggregator::processMemEvents() {
if (MemFunc) {
MemName = MemFunc->getNames()[0];
Addr -= MemFunc->getAddress();
} else if (Addr) {
} else if (Addr) { // TODO: filter heap/stack/nulls here?
if (auto *BD = BC->getBinaryDataContainingAddress(Addr)) {
MemName = BD->getName();
Addr -= BD->getAddress();
} else if (opts::FilterMemProfile) {
// Filter out heap/stack accesses
continue;
}
} else if (opts::FilterMemProfile) {
// Filter out nulls
continue;
}
const Location FuncLoc(!FuncName.empty(), FuncName, PC);
@ -1536,7 +1394,7 @@ void DataAggregator::processPreAggregated() {
AggrEntry.From.Offset, false};
LBREntry Second{AggrEntry.To.Offset, AggrEntry.To.Offset, false};
doTrace(First, Second, AggrEntry.Count);
NumTraces += AggrEntry.Count;
++NumTraces;
break;
}
}
@ -1918,8 +1776,6 @@ std::error_code DataAggregator::writeAggregatedFile() const {
uint64_t BranchValues{0};
uint64_t MemValues{0};
if (BAT)
OutFile << "boltedcollection\n";
if (opts::BasicAggregation) {
OutFile << "no_lbr";
for (const auto &Entry : EventNames) {

View File

@ -28,7 +28,6 @@ namespace bolt {
class BinaryFunction;
class BinaryContext;
class BoltAddressTranslation;
/// DataAggregator inherits all parsing logic from DataReader as well as
/// its data structures used to represent aggregated profile data in memory.
@ -173,13 +172,11 @@ class DataAggregator : public DataReader {
/// References to core BOLT data structures
BinaryContext *BC{nullptr};
BoltAddressTranslation *BAT{nullptr};
std::map<uint64_t, BinaryFunction> *BFs{nullptr};
/// Aggregation statistics
uint64_t NumInvalidTraces{0};
uint64_t NumLongRangeTraces{0};
uint64_t NumColdSamples{0};
/// Looks into system PATH for Linux Perf and set up the aggregator to use it
void findPerfExecutable();
@ -197,16 +194,7 @@ class DataAggregator : public DataReader {
/// Look up which function contains an address by using out map of
/// disassembled BinaryFunctions
BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address) const;
/// Retrieve the location name to be used for samples recorded in \p Func.
/// If doing BAT translation, link cold parts to the hot part names (used by
/// the original binary). \p Count specifies how many samples were recorded
/// at that location, so we can tally total activity in cold areas if we are
/// dealing with profiling data collected in a bolted binary. For LBRs,
/// \p Count should only be used for the source of the branch to avoid
/// counting cold activity twice (one for source and another for destination).
StringRef getLocationName(BinaryFunction &Func, uint64_t Count);
BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address);
/// Semantic actions - parser hooks to interpret parsed perf samples
/// Register a sample (non-LBR mode), i.e. a new hit at \p Address
@ -238,9 +226,7 @@ class DataAggregator : public DataReader {
std::error_code printLBRHeatMap();
/// Parse a single perf sample containing a PID associated with a sequence of
/// LBR entries. If the PID does not correspond to the binary we are looking
/// for, return std::errc::no_such_process. If other parsing errors occur,
/// return the error. Otherwise, return the parsed sample.
/// LBR entries
ErrorOr<PerfBranchSample> parseBranchSample();
/// Parse a single perf sample containing a PID associated with an event name
@ -398,14 +384,6 @@ public:
/// Remember \p Name as the file the aggregated data will be saved to.
void setOutputFDataName(StringRef Name) {
  OutputFDataName = Name;
}
/// Install the Bolt Address Translation table \p B, used when processing
/// samples collected in bolted binaries.
void setBAT(BoltAddressTranslation *B) {
  BAT = B;
}
/// Returns true when a translation table has been installed, i.e. this
/// aggregation job remaps samples collected on binaries already processed by
/// BOLT.
bool usesBAT() const {
  return BAT != nullptr;
}
/// Start an aggregation job asynchronously. Call "aggregate" to finish it
/// with a list of disassembled functions.
void start(StringRef PerfDataFilename);
@ -422,10 +400,12 @@ public:
/// Parse profile and mark functions/objects with profile.
/// Don't assign profile to functions yet.
void parseProfile(BinaryContext &BC);
void parseProfile(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs);
/// Populate functions with profile.
void processProfile(BinaryContext &BC);
void processProfile(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs);
/// Check whether \p FileName is a perf.data file
static bool checkPerfDataMagic(StringRef FileName);

View File

@ -251,31 +251,16 @@ void FuncMemData::update(const Location &Offset, const Location &Addr) {
++Data[Iter->second].Count;
}
/// Clear the Used flag on every branch-data and memory-data profile object.
void DataReader::reset() {
  for (auto &BranchEntry : getAllFuncsBranchData())
    BranchEntry.second.Used = false;

  for (auto &MemEntry : getAllFuncsMemData())
    MemEntry.second.Used = false;
}
ErrorOr<std::unique_ptr<DataReader>>
DataReader::readPerfData(StringRef Path, raw_ostream &Diag) {
auto MB = MemoryBuffer::getFileOrSTDIN(Path);
if (auto EC = MB.getError()) {
Diag << "cannot open " << Path << ": " << EC.message() << "\n";
ErrorOr<std::unique_ptr<MemoryBuffer>> MB =
MemoryBuffer::getFileOrSTDIN(Path);
if (std::error_code EC = MB.getError()) {
Diag << "Cannot open " << Path << ": " << EC.message() << "\n";
return EC;
}
auto DR = llvm::make_unique<DataReader>(std::move(MB.get()), Diag);
if (auto EC = DR->parse()) {
return EC;
}
if (!DR->ParsingBuf.empty()) {
Diag << "WARNING: invalid profile data detected at line " << DR->Line
<< ". Possibly corrupted profile.\n";
}
auto DR = make_unique<DataReader>(std::move(MB.get()), Diag);
DR->parse();
DR->buildLTONameMaps();
return std::move(DR);
}
@ -295,13 +280,6 @@ bool DataReader::expectAndConsumeFS() {
return true;
}
/// Skip every consecutive field separator at the front of ParsingBuf,
/// advancing Col by one for each separator consumed. Stops at the first
/// non-separator character or when the buffer is exhausted.
void DataReader::consumeAllRemainingFS() {
  // Check for an empty buffer first: indexing element 0 of an empty buffer is
  // out of bounds when the input ends in trailing separators.
  while (!ParsingBuf.empty() && ParsingBuf[0] == FieldSeparator) {
    ParsingBuf = ParsingBuf.drop_front(1);
    Col += 1;
  }
}
bool DataReader::checkAndConsumeNewLine() {
if (ParsingBuf[0] != '\n')
return false;
@ -396,14 +374,12 @@ ErrorOr<Location> DataReader::parseLocation(char EndChar,
if (!expectAndConsumeFS())
return make_error_code(llvm::errc::io_error);
consumeAllRemainingFS();
// Read the string containing the symbol or the DSO name
auto NameRes = parseString(FieldSeparator);
if (std::error_code EC = NameRes.getError())
return EC;
StringRef Name = NameRes.get();
consumeAllRemainingFS();
// Read the offset
auto Offset = parseHexField(EndChar, EndNl);
@ -419,25 +395,21 @@ ErrorOr<BranchInfo> DataReader::parseBranchInfo() {
return EC;
Location From = Res.get();
consumeAllRemainingFS();
Res = parseLocation(FieldSeparator);
if (std::error_code EC = Res.getError())
return EC;
Location To = Res.get();
consumeAllRemainingFS();
auto MRes = parseNumberField(FieldSeparator);
if (std::error_code EC = MRes.getError())
return EC;
int64_t NumMispreds = MRes.get();
consumeAllRemainingFS();
auto BRes = parseNumberField(FieldSeparator, /* EndNl = */ true);
if (std::error_code EC = BRes.getError())
return EC;
int64_t NumBranches = BRes.get();
consumeAllRemainingFS();
if (!checkAndConsumeNewLine()) {
reportError("expected end of line");
return make_error_code(llvm::errc::io_error);
@ -452,18 +424,15 @@ ErrorOr<MemInfo> DataReader::parseMemInfo() {
return EC;
Location Offset = Res.get();
consumeAllRemainingFS();
Res = parseMemLocation(FieldSeparator);
if (std::error_code EC = Res.getError())
return EC;
Location Addr = Res.get();
consumeAllRemainingFS();
auto CountRes = parseNumberField(FieldSeparator, true);
if (std::error_code EC = CountRes.getError())
return EC;
consumeAllRemainingFS();
if (!checkAndConsumeNewLine()) {
reportError("expected end of line");
return make_error_code(llvm::errc::io_error);
@ -478,13 +447,11 @@ ErrorOr<SampleInfo> DataReader::parseSampleInfo() {
return EC;
Location Address = Res.get();
consumeAllRemainingFS();
auto BRes = parseNumberField(FieldSeparator, /* EndNl = */ true);
if (std::error_code EC = BRes.getError())
return EC;
int64_t Occurrences = BRes.get();
consumeAllRemainingFS();
if (!checkAndConsumeNewLine()) {
reportError("expected end of line");
return make_error_code(llvm::errc::io_error);
@ -516,20 +483,6 @@ ErrorOr<bool> DataReader::maybeParseNoLBRFlag() {
return true;
}
/// Consume an optional "boltedcollection" marker line from the front of
/// ParsingBuf. Returns false (consuming nothing) when the buffer does not
/// start with the marker, true when the marker and its newline were consumed,
/// and io_error when the marker is not followed by a newline.
ErrorOr<bool> DataReader::maybeParseBATFlag() {
  const StringRef Marker("boltedcollection");
  if (!ParsingBuf.startswith(Marker))
    return false;

  ParsingBuf = ParsingBuf.drop_front(16);
  Col += 16;

  if (checkAndConsumeNewLine())
    return true;

  reportError("malformed boltedcollection line");
  return make_error_code(llvm::errc::io_error);
}
bool DataReader::hasBranchData() {
if (ParsingBuf.size() == 0)
return false;
@ -646,17 +599,6 @@ std::error_code DataReader::parse() {
if (!FlagOrErr)
return FlagOrErr.getError();
NoLBRMode = *FlagOrErr;
auto BATFlagOrErr = maybeParseBATFlag();
if (!BATFlagOrErr)
return BATFlagOrErr.getError();
BATMode = *BATFlagOrErr;
if (!hasBranchData() && !hasMemData()) {
Diag << "ERROR: no valid profile data found\n";
return make_error_code(llvm::errc::io_error);
}
if (NoLBRMode)
return parseInNoLBRMode();

View File

@ -303,9 +303,6 @@ public:
static ErrorOr<std::unique_ptr<DataReader>> readPerfData(StringRef Path,
raw_ostream &Diag);
/// Mark all profile objects unused.
void reset();
/// Parses the input bolt data file into internal data structures. We expect
/// the file format to follow the syntax below.
///
@ -401,11 +398,6 @@ public:
/// Returns true unless the profiling data being read lacks LBR.
bool hasLBR() const {
  return !NoLBRMode;
}
/// Returns true when the profiling data was collected in a bolted binary.
/// In that case stale data cannot be identified at some branch locations,
/// because matching has to be more permissive.
bool collectedInBoltedBinary() const {
  return BATMode;
}
/// Return true if event named \p Name was used to collect this profile data.
bool usesEvent(StringRef Name) const {
for (auto I = EventNames.begin(), E = EventNames.end(); I != E; ++I) {
@ -425,7 +417,6 @@ protected:
void reportError(StringRef ErrorMsg);
bool expectAndConsumeFS();
void consumeAllRemainingFS();
bool checkAndConsumeNewLine();
ErrorOr<StringRef> parseString(char EndChar, bool EndNl=false);
ErrorOr<int64_t> parseNumberField(char EndChar, bool EndNl=false);
@ -441,7 +432,6 @@ protected:
ErrorOr<SampleInfo> parseSampleInfo();
ErrorOr<MemInfo> parseMemInfo();
ErrorOr<bool> maybeParseNoLBRFlag();
ErrorOr<bool> maybeParseBATFlag();
bool hasBranchData();
bool hasMemData();
@ -458,7 +448,6 @@ protected:
FuncsToSamplesMapTy FuncsToSamples;
FuncsToMemEventsMapTy FuncsToMemEvents;
bool NoLBRMode{false};
bool BATMode{false};
StringSet<> EventNames;
static const char FieldSeparator = ' ';

View File

@ -40,7 +40,7 @@ namespace {
// Returns the number of written bytes.
uint64_t writeAddressRanges(
MCObjectWriter *Writer,
const DebugAddressRangesVector &AddressRanges,
const DWARFAddressRangesVector &AddressRanges,
const bool WriteRelativeRanges = false) {
for (auto &Range : AddressRanges) {
Writer->writeLE64(Range.LowPC);
@ -62,26 +62,26 @@ DebugRangesSectionsWriter::DebugRangesSectionsWriter(BinaryContext *BC) {
std::unique_ptr<MCObjectWriter>(BC->createObjectWriter(*RangesStream));
// Add an empty range as the first entry;
SectionOffset += writeAddressRanges(Writer.get(), DebugAddressRangesVector{});
SectionOffset += writeAddressRanges(Writer.get(), DWARFAddressRangesVector{});
}
uint64_t
DebugRangesSectionsWriter::addCURanges(uint64_t CUOffset,
DebugAddressRangesVector &&Ranges) {
uint64_t DebugRangesSectionsWriter::addCURanges(
uint64_t CUOffset,
DWARFAddressRangesVector &&Ranges) {
const auto RangesOffset = addRanges(Ranges);
std::lock_guard<std::mutex> Lock(CUAddressRangesMutex);
CUAddressRanges.emplace(CUOffset, std::move(Ranges));
return RangesOffset;
}
uint64_t DebugRangesSectionsWriter::addRanges(
const BinaryFunction *Function, DebugAddressRangesVector &&Ranges,
const BinaryFunction *&CachedFunction,
std::map<DebugAddressRangesVector, uint64_t> &CachedRanges) {
uint64_t
DebugRangesSectionsWriter::addRanges(const BinaryFunction *Function,
DWARFAddressRangesVector &&Ranges) {
if (Ranges.empty())
return getEmptyRangesOffset();
static const BinaryFunction *CachedFunction;
if (Function == CachedFunction) {
const auto RI = CachedRanges.find(Ranges);
if (RI != CachedRanges.end())
@ -98,13 +98,10 @@ uint64_t DebugRangesSectionsWriter::addRanges(
}
uint64_t
DebugRangesSectionsWriter::addRanges(const DebugAddressRangesVector &Ranges) {
DebugRangesSectionsWriter::addRanges(const DWARFAddressRangesVector &Ranges) {
if (Ranges.empty())
return getEmptyRangesOffset();
// Reading the SectionOffset and updating it should be atomic to guarantee
// unique and correct offsets in patches.
std::lock_guard<std::mutex> Lock(WriterMutex);
const auto EntryOffset = SectionOffset;
SectionOffset += writeAddressRanges(Writer.get(), Ranges);
@ -168,17 +165,14 @@ uint64_t DebugLocWriter::addList(const DWARFDebugLoc::LocationList &LocList) {
if (LocList.Entries.empty())
return getEmptyListOffset();
// Reading the SectionOffset and updating it should be atomic to guarantee
// unique and correct offsets in patches.
std::lock_guard<std::mutex> Lock(WriterMutex);
const auto EntryOffset = SectionOffset;
for (const auto &Entry : LocList.Entries) {
Writer->writeLE64(Entry.Begin);
Writer->writeLE64(Entry.End);
Writer->writeLE16(Entry.Loc.size());
Writer->writeBytes(StringRef(
reinterpret_cast<const char *>(Entry.Loc.data()), Entry.Loc.size()));
Writer->writeBytes(
StringRef(reinterpret_cast<const char *>(Entry.Loc.data()),
Entry.Loc.size()));
SectionOffset += 2 * 8 + 2 + Entry.Loc.size();
}
Writer->writeLE64(0);
@ -235,29 +229,42 @@ void SimpleBinaryPatcher::patchBinary(std::string &BinaryContents) {
}
}
void DebugAbbrevPatcher::addAttributePatch(
const DWARFAbbreviationDeclaration *Abbrev,
dwarf::Attribute AttrTag,
uint8_t NewAttrTag,
uint8_t NewAttrForm) {
assert(Abbrev && "no abbreviation specified");
AbbrevPatches.emplace(
AbbrevAttrPatch{Abbrev, AttrTag, NewAttrTag, NewAttrForm});
void DebugAbbrevPatcher::addAttributePatch(const DWARFUnit *Unit,
uint32_t AbbrevCode,
dwarf::Attribute AttrTag,
uint8_t NewAttrTag,
uint8_t NewAttrForm) {
assert(Unit && "No compile unit specified.");
Patches[Unit].emplace_back(
AbbrevAttrPatch{AbbrevCode, AttrTag, NewAttrTag, NewAttrForm});
}
void DebugAbbrevPatcher::patchBinary(std::string &Contents) {
SimpleBinaryPatcher Patcher;
for (const auto &Patch : AbbrevPatches) {
const auto Attribute = Patch.Abbrev->findAttribute(Patch.Attr);
assert(Attribute && "Specified attribute doesn't occur in abbreviation.");
for (const auto &UnitPatchesPair : Patches) {
const auto *Unit = UnitPatchesPair.first;
const auto *UnitAbbreviations = Unit->getAbbreviations();
assert(UnitAbbreviations &&
"Compile unit doesn't have associated abbreviations.");
const auto &UnitPatches = UnitPatchesPair.second;
for (const auto &AttrPatch : UnitPatches) {
const auto *AbbreviationDeclaration =
UnitAbbreviations->getAbbreviationDeclaration(AttrPatch.Code);
assert(AbbreviationDeclaration && "No abbreviation with given code.");
const auto Attribute =
AbbreviationDeclaration->findAttribute(AttrPatch.Attr);
// Because we're only handling standard values (i.e. no DW_FORM_GNU_* or
// DW_AT_APPLE_*), they are all small (< 128) and encoded in a single
// byte in ULEB128, otherwise it'll be more tricky as we may need to
// grow or shrink the section.
Patcher.addBytePatch(Attribute->AttrOffset, Patch.NewAttr);
Patcher.addBytePatch(Attribute->FormOffset, Patch.NewForm);
assert(Attribute && "Specified attribute doesn't occur in abbreviation.");
// Because we're only handling standard values (i.e. no DW_FORM_GNU_* or
// DW_AT_APPLE_*), they are all small (< 128) and encoded in a single
// byte in ULEB128, otherwise it'll be more tricky as we may need to
// grow or shrink the section.
Patcher.addBytePatch(Attribute->AttrOffset,
AttrPatch.NewAttr);
Patcher.addBytePatch(Attribute->FormOffset,
AttrPatch.NewForm);
}
}
Patcher.patchBinary(Contents);
}

View File

@ -20,42 +20,26 @@
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/raw_ostream.h"
#include <map>
#include <mutex>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#include "BinaryBasicBlock.h"
namespace llvm {
class DWARFCompileUnit;
class DWARFDebugInfoEntryMinimal;
class MCObjectWriter;
namespace bolt {
class BinaryContext;
class BasicBlockTable;
class BinaryBasicBlock;
class BinaryFunction;
/// Address range representation. Takes less space than DWARFAddressRange.
struct DebugAddressRange {
uint64_t LowPC{0};
uint64_t HighPC{0};
DebugAddressRange() = default;
DebugAddressRange(uint64_t LowPC, uint64_t HighPC)
: LowPC(LowPC), HighPC(HighPC) {}
};
static inline bool operator<(const DebugAddressRange &LHS,
const DebugAddressRange &RHS) {
return std::tie(LHS.LowPC, LHS.HighPC) < std::tie(RHS.LowPC, RHS.HighPC);
}
/// DebugAddressRangesVector - represents a set of absolute address ranges.
using DebugAddressRangesVector = SmallVector<DebugAddressRange, 2>;
/// References a row in a DWARFDebugLine::LineTable by the DWARF
/// References a row in a DWARFDebugLine::LineTable by the DWARF
/// Context index of the DWARF Compile Unit that owns the Line Table and the row
/// index. This is tied to our IR during disassembly so that we can later update
/// .debug_line information. RowIndex has a base of 1, which means a RowIndex
@ -100,16 +84,14 @@ public:
DebugRangesSectionsWriter(BinaryContext *BC);
/// Add ranges for CU matching \p CUOffset and return offset into section.
uint64_t addCURanges(uint64_t CUOffset, DebugAddressRangesVector &&Ranges);
uint64_t addCURanges(uint64_t CUOffset, DWARFAddressRangesVector &&Ranges);
/// Add ranges with caching for \p Function.
uint64_t
addRanges(const BinaryFunction *Function, DebugAddressRangesVector &&Ranges,
const BinaryFunction *&CachedFunction,
std::map<DebugAddressRangesVector, uint64_t> &CachedRanges);
uint64_t addRanges(const BinaryFunction *Function,
DWARFAddressRangesVector &&Ranges);
/// Add ranges and return offset into section.
uint64_t addRanges(const DebugAddressRangesVector &Ranges);
uint64_t addRanges(const DWARFAddressRangesVector &Ranges);
/// Writes .debug_aranges with the added ranges to the MCObjectWriter.
void writeArangesSection(MCObjectWriter *Writer) const;
@ -124,7 +106,7 @@ public:
uint64_t getEmptyRangesOffset() const { return EmptyRangesOffset; }
/// Map DWARFCompileUnit index to ranges.
using CUAddressRangesType = std::map<uint64_t, DebugAddressRangesVector>;
using CUAddressRangesType = std::map<uint64_t, DWARFAddressRangesVector>;
/// Return ranges for a given CU.
const CUAddressRangesType &getCUAddressRanges() const {
@ -142,8 +124,6 @@ private:
std::unique_ptr<MCObjectWriter> Writer;
std::mutex WriterMutex;
/// Current offset in the section (updated as new entries are written).
/// Starts with 16 since the first 16 bytes are reserved for an empty range.
uint32_t SectionOffset{0};
@ -153,10 +133,11 @@ private:
/// (first address, interval size).
CUAddressRangesType CUAddressRanges;
std::mutex CUAddressRangesMutex;
/// Offset of an empty address ranges list.
static constexpr uint64_t EmptyRangesOffset{0};
/// Cache used for de-duplicating entries for the same function.
std::map<DWARFAddressRangesVector, uint64_t> CachedRanges;
};
/// Serializes the .debug_loc DWARF section with LocationLists.
@ -179,8 +160,6 @@ private:
std::unique_ptr<MCObjectWriter> Writer;
std::mutex WriterMutex;
/// Offset of an empty location list.
static uint64_t const EmptyListOffset = 0;
@ -240,33 +219,25 @@ class DebugAbbrevPatcher : public BinaryPatcher {
private:
/// Patch of changing one attribute to another.
struct AbbrevAttrPatch {
const DWARFAbbreviationDeclaration *Abbrev;
uint32_t Code; // Code of abbreviation to be modified.
dwarf::Attribute Attr; // ID of attribute to be replaced.
uint8_t NewAttr; // ID of the new attribute.
uint8_t NewForm; // Form of the new attribute.
bool operator==(const AbbrevAttrPatch &RHS) const {
return Abbrev == RHS.Abbrev && Attr == RHS.Attr;
}
uint8_t NewAttr; // ID of the new attribute.
uint8_t NewForm; // Form of the new attribute.
};
struct AbbrevHash {
std::size_t operator()(const AbbrevAttrPatch &P) const {
return std::hash<uint64_t>()(((uint64_t)P.Abbrev << 16) + P.Attr);
}
};
std::unordered_set<AbbrevAttrPatch, AbbrevHash> AbbrevPatches;
std::map<const DWARFUnit *, std::vector<AbbrevAttrPatch>> Patches;
public:
~DebugAbbrevPatcher() { }
/// Adds a patch to change an attribute of the abbreviation
/// \p Abbrev the abbreviation to be modified.
/// Adds a patch to change an attribute of an abbreviation that belongs to
/// \p Unit to another attribute.
/// \p AbbrevCode code of the abbreviation to be modified.
/// \p AttrTag ID of the attribute to be replaced.
/// \p NewAttrTag ID of the new attribute.
/// \p NewAttrForm Form of the new attribute.
/// We only handle standard forms, that are encoded in a single byte.
void addAttributePatch(const DWARFAbbreviationDeclaration *Abbrev,
void addAttributePatch(const DWARFUnit *Unit,
uint32_t AbbrevCode,
dwarf::Attribute AttrTag,
uint8_t NewAttrTag,
uint8_t NewAttrForm);

View File

@ -1,259 +0,0 @@
//===--- DynoStats.cpp ----------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#include "DynoStats.h"
#include "BinaryBasicBlock.h"
#include "BinaryFunction.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <numeric>
#include <string>
#undef DEBUG_TYPE
#define DEBUG_TYPE "bolt"
using namespace llvm;
using namespace bolt;
namespace opts {
// Option category defined in another translation unit; only referenced here.
extern cl::OptionCategory BoltCategory;
/// Hidden, optional command-line option `-dyno-stats-scale` (default 1).
/// DynoStats::print() multiplies every reported value by this factor.
static cl::opt<uint32_t>
DynoStatsScale("dyno-stats-scale",
cl::desc("scale to be applied while reporting dyno stats"),
cl::Optional,
cl::init(1),
cl::Hidden,
cl::cat(BoltCategory));
} // namespace opts
namespace llvm {
namespace bolt {
constexpr const char *DynoStats::Desc[];
/// Order two stat sets lexicographically over the raw counters stored in
/// slots [FIRST_DYNO_STAT, LAST_DYNO_STAT). The raw storage is compared
/// directly; the const operator[] is not consulted.
bool DynoStats::operator<(const DynoStats &Other) const {
  const uint64_t *const LHSBegin = &Stats[FIRST_DYNO_STAT];
  const uint64_t *const LHSEnd = &Stats[LAST_DYNO_STAT];
  const uint64_t *const RHSBegin = &Other.Stats[FIRST_DYNO_STAT];
  const uint64_t *const RHSEnd = &Other.Stats[LAST_DYNO_STAT];
  return std::lexicographical_compare(LHSBegin, LHSEnd, RHSBegin, RHSEnd);
}
/// Two stat sets are equal when every raw counter in slots
/// [FIRST_DYNO_STAT, LAST_DYNO_STAT) matches element-wise.
bool DynoStats::operator==(const DynoStats &Other) const {
  const uint64_t *const Begin = &Stats[FIRST_DYNO_STAT];
  const uint64_t *const End = &Stats[LAST_DYNO_STAT];
  const uint64_t *const OtherBegin = &Other.Stats[FIRST_DYNO_STAT];
  return std::equal(Begin, End, OtherBegin);
}
bool DynoStats::lessThan(const DynoStats &Other,
ArrayRef<Category> Keys) const {
return std::lexicographical_compare(
Keys.begin(), Keys.end(),
Keys.begin(), Keys.end(),
[this,&Other](const Category A, const Category) {
return Stats[A] < Other.Stats[A];
}
);
}
/// Print one line per category (excluding the two reserved sentinel slots) to
/// \p OS. Each value is scaled by opts::DynoStatsScale. When \p Other is
/// non-null, a relative delta against the same category in \p Other is
/// appended: " (=)" if equal, otherwise a signed percentage.
///
/// Both sides of the delta are read through the const operator[], which
/// computes derived categories from the base counters. Previously this
/// object's value came from the raw storage while \p Other's value was
/// computed, so derived rows compared mismatched quantities for objects whose
/// raw derived slots were never accumulated.
void DynoStats::print(raw_ostream &OS, const DynoStats *Other) const {
  auto printStatWithDelta = [&](const std::string &Name, uint64_t Stat,
                                uint64_t OtherStat) {
    OS << format("%'20lld : ", Stat * opts::DynoStatsScale) << Name;
    if (Other) {
      if (Stat != OtherStat) {
        OtherStat = std::max(OtherStat, uint64_t(1)); // to prevent divide by 0
        OS << format(" (%+.1f%%)",
                     (static_cast<float>(Stat) - static_cast<float>(OtherStat)) *
                         100.0 / static_cast<float>(OtherStat));
      } else {
        OS << " (=)";
      }
    }
    OS << '\n';
  };

  // Bind a const reference explicitly so the computing (const) overload of
  // operator[] is the one selected.
  const DynoStats &Self = *this;
  for (auto Stat = DynoStats::FIRST_DYNO_STAT + 1;
       Stat < DynoStats::LAST_DYNO_STAT;
       ++Stat) {
    // The veneer row is only shown when this object was built for AArch64.
    if (!PrintAArch64Stats && Stat == DynoStats::VENEER_CALLS_AARCH64)
      continue;

    printStatWithDelta(Desc[Stat], Self[Stat], Other ? (*Other)[Stat] : 0);
  }
}
/// Accumulate \p Other into this object, category by category, skipping the
/// two reserved sentinel slots. \p Other is read through its const
/// operator[], so the value added for a derived category is the computed one.
void DynoStats::operator+=(const DynoStats &Other) {
  auto Stat = DynoStats::FIRST_DYNO_STAT + 1;
  while (Stat < DynoStats::LAST_DYNO_STAT) {
    Stats[Stat] += Other[Stat];
    ++Stat;
  }
}
/// Compute dynamic (profile-weighted) statistics for a single function.
///
/// Returns an all-zero DynoStats when \p BF is not simple, has no valid
/// profile, or is folded. Otherwise walks the function's layout and, for each
/// executed non-empty basic block, accumulates load/store/call/instruction
/// counts weighted by the block's known execution count, followed by the
/// branch statistics derived from the block's terminator.
DynoStats getDynoStats(const BinaryFunction &BF) {
auto &BC = BF.getBinaryContext();
DynoStats Stats(/*PrintAArch64Stats*/ BC.isAArch64());
// Return empty-stats about the function we don't completely understand.
if (!BF.isSimple() || !BF.hasValidProfile())
return Stats;
// If the function was folded in non-relocation mode we keep its profile
// for optimization. However, it should be excluded from the dyno stats.
if (BF.isFolded())
return Stats;
// Update enumeration of basic blocks for correct detection of branch
// direction.
BF.updateLayoutIndices();
for (const auto &BB : BF.layout()) {
// The basic block execution count equals to the sum of incoming branch
// frequencies. This may deviate from the sum of outgoing branches of the
// basic block especially since the block may contain a function that
// does not return or a function that throws an exception.
const uint64_t BBExecutionCount = BB->getKnownExecutionCount();
// Ignore empty blocks and blocks that were not executed.
if (BB->getNumNonPseudos() == 0 || BBExecutionCount == 0)
continue;
// Count AArch64 linker-inserted veneers
// NOTE(review): this adds the whole function's execution count once per
// executed non-empty block, not once per function - confirm that is intended.
if(BF.isAArch64Veneer())
Stats[DynoStats::VENEER_CALLS_AARCH64] += BF.getKnownExecutionCount();
// Count the number of calls by iterating through all instructions.
// Loads and stores are tallied for every instruction; everything after the
// isCall() check below applies to call instructions only.
for (const auto &Instr : *BB) {
if (BC.MIB->isStore(Instr)) {
Stats[DynoStats::STORES] += BBExecutionCount;
}
if (BC.MIB->isLoad(Instr)) {
Stats[DynoStats::LOADS] += BBExecutionCount;
}
if (!BC.MIB->isCall(Instr))
continue;
// A conditional tail call is weighted by its "CTCTakenCount" annotation
// instead of the block's execution count.
uint64_t CallFreq = BBExecutionCount;
if (BC.MIB->getConditionalTailCall(Instr)) {
CallFreq =
BC.MIB->getAnnotationWithDefault<uint64_t>(Instr, "CTCTakenCount");
}
Stats[DynoStats::FUNCTION_CALLS] += CallFreq;
if (BC.MIB->isIndirectCall(Instr)) {
Stats[DynoStats::INDIRECT_CALLS] += CallFreq;
} else if (const auto *CallSymbol = BC.MIB->getTargetSymbol(Instr)) {
// Note: this local BF (the callee) shadows the function parameter BF.
const auto *BF = BC.getFunctionForSymbol(CallSymbol);
if (BF && BF->isPLTFunction()) {
Stats[DynoStats::PLT_CALLS] += CallFreq;
// We don't process PLT functions and hence have to adjust relevant
// dynostats here for:
//
// jmp *GOT_ENTRY(%rip)
//
// NOTE: this is arch-specific.
// The PLT stub's jump is accounted as one extra call, indirect call,
// load and instruction on top of the direct call counted above.
Stats[DynoStats::FUNCTION_CALLS] += CallFreq;
Stats[DynoStats::INDIRECT_CALLS] += CallFreq;
Stats[DynoStats::LOADS] += CallFreq;
Stats[DynoStats::INSTRUCTIONS] += CallFreq;
}
}
}
Stats[DynoStats::INSTRUCTIONS] += BB->getNumNonPseudos() * BBExecutionCount;
// Jump tables.
// The block has at least one non-pseudo instruction here (empty blocks were
// skipped above), which is what makes the dereference of LastInstr below
// presumably safe - verify against getLastNonPseudoInstr().
const auto *LastInstr = BB->getLastNonPseudoInstr();
if (BC.MIB->getJumpTable(*LastInstr)) {
Stats[DynoStats::JUMP_TABLE_BRANCHES] += BBExecutionCount;
DEBUG(
static uint64_t MostFrequentJT;
if (BBExecutionCount > MostFrequentJT) {
MostFrequentJT = BBExecutionCount;
dbgs() << "BOLT-INFO: most frequently executed jump table is in "
<< "function " << BF << " in basic block " << BB->getName()
<< " executed totally " << BBExecutionCount << " times.\n";
}
);
continue;
}
// Indirect branches that are not calls and carry no jump table.
if (BC.MIB->isIndirectBranch(*LastInstr) && !BC.MIB->isCall(*LastInstr)) {
Stats[DynoStats::UNKNOWN_INDIRECT_BRANCHES] += BBExecutionCount;
continue;
}
// Update stats for branches.
const MCSymbol *TBB = nullptr;
const MCSymbol *FBB = nullptr;
MCInst *CondBranch = nullptr;
MCInst *UncondBranch = nullptr;
// Skip the block if its terminators cannot be analyzed.
if (!BB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch)) {
continue;
}
// No branch instruction at all: nothing to count.
if (!CondBranch && !UncondBranch) {
continue;
}
// Simple unconditional branch.
if (!CondBranch) {
Stats[DynoStats::UNCOND_BRANCHES] += BBExecutionCount;
continue;
}
// CTCs: instruction annotations could be stripped, hence check the number
// of successors to identify conditional tail calls.
if (BB->succ_size() == 1) {
if (BB->branch_info_begin() != BB->branch_info_end())
Stats[DynoStats::UNCOND_BRANCHES] += BB->branch_info_begin()->Count;
continue;
}
// Conditional branch that could be followed by an unconditional branch.
// Missing profile counts are treated as zero.
auto TakenCount = BB->getTakenBranchInfo().Count;
if (TakenCount == BinaryBasicBlock::COUNT_NO_PROFILE)
TakenCount = 0;
auto NonTakenCount = BB->getFallthroughBranchInfo().Count;
if (NonTakenCount == BinaryBasicBlock::COUNT_NO_PROFILE)
NonTakenCount = 0;
// Direction is decided by the layout position of the taken successor.
if (BF.isForwardBranch(BB, BB->getConditionalSuccessor(true))) {
Stats[DynoStats::FORWARD_COND_BRANCHES] += BBExecutionCount;
Stats[DynoStats::FORWARD_COND_BRANCHES_TAKEN] += TakenCount;
} else {
Stats[DynoStats::BACKWARD_COND_BRANCHES] += BBExecutionCount;
Stats[DynoStats::BACKWARD_COND_BRANCHES_TAKEN] += TakenCount;
}
// The trailing unconditional branch executes whenever the conditional one
// is not taken.
if (UncondBranch) {
Stats[DynoStats::UNCOND_BRANCHES] += NonTakenCount;
}
}
return Stats;
}
} // namespace bolt
} // namespace llvm

View File

@ -1,179 +0,0 @@
//===--- DynoStats.h ------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_DYNO_STATS_H
#define LLVM_TOOLS_LLVM_BOLT_DYNO_STATS_H
#include "BinaryFunction.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
namespace llvm {
namespace bolt {
/// Class encapsulating runtime statistics about an execution unit.
///
/// The set of categories is defined once by the DYNO_STATS X-macro below and
/// expanded three times: into the Category enum, into the Desc[] description
/// table, and into the switch of the const operator[]. Each entry is
/// D(name, description, expression), where the expression is either Fn (read
/// the stored counter) or a combination of other categories (a derived stat).
class DynoStats {
// Master list of categories. FIRST_DYNO_STAT and LAST_DYNO_STAT are reserved
// sentinels bracketing the real categories.
#define DYNO_STATS\
D(FIRST_DYNO_STAT, "<reserved>", Fn)\
D(FORWARD_COND_BRANCHES, "executed forward branches", Fn)\
D(FORWARD_COND_BRANCHES_TAKEN, "taken forward branches", Fn)\
D(BACKWARD_COND_BRANCHES, "executed backward branches", Fn)\
D(BACKWARD_COND_BRANCHES_TAKEN, "taken backward branches", Fn)\
D(UNCOND_BRANCHES, "executed unconditional branches", Fn)\
D(FUNCTION_CALLS, "all function calls", Fn)\
D(INDIRECT_CALLS, "indirect calls", Fn)\
D(PLT_CALLS, "PLT calls", Fn)\
D(INSTRUCTIONS, "executed instructions", Fn)\
D(LOADS, "executed load instructions", Fn)\
D(STORES, "executed store instructions", Fn)\
D(JUMP_TABLE_BRANCHES, "taken jump table branches", Fn)\
D(UNKNOWN_INDIRECT_BRANCHES, "taken unknown indirect branches", Fn)\
D(ALL_BRANCHES, "total branches",\
Fadd(ALL_CONDITIONAL, UNCOND_BRANCHES))\
D(ALL_TAKEN, "taken branches",\
Fadd(TAKEN_CONDITIONAL, UNCOND_BRANCHES))\
D(NONTAKEN_CONDITIONAL, "non-taken conditional branches",\
Fsub(ALL_CONDITIONAL, TAKEN_CONDITIONAL))\
D(TAKEN_CONDITIONAL, "taken conditional branches",\
Fadd(FORWARD_COND_BRANCHES_TAKEN, BACKWARD_COND_BRANCHES_TAKEN))\
D(ALL_CONDITIONAL, "all conditional branches",\
Fadd(FORWARD_COND_BRANCHES, BACKWARD_COND_BRANCHES))\
D(VENEER_CALLS_AARCH64, "linker-inserted veneer calls", Fn)\
D(LAST_DYNO_STAT, "<reserved>", 0)
public:
// First expansion: the category names become enumerators.
#define D(name, ...) name,
enum Category : uint8_t { DYNO_STATS };
#undef D
private:
// One storage slot per enumerator, sentinels included.
uint64_t Stats[LAST_DYNO_STAT+1];
// Whether print() shows the VENEER_CALLS_AARCH64 row.
bool PrintAArch64Stats;
// Second expansion: the human-readable descriptions, indexed by Category.
#define D(name, desc, ...) desc,
static constexpr const char *Desc[] = { DYNO_STATS };
#undef D
public:
/// Create a zeroed set of stats. Slots [FIRST_DYNO_STAT, LAST_DYNO_STAT) are
/// cleared; the LAST_DYNO_STAT slot itself is not written here.
DynoStats(bool PrintAArch64Stats) {
this->PrintAArch64Stats = PrintAArch64Stats;
for (auto Stat = FIRST_DYNO_STAT + 0; Stat < LAST_DYNO_STAT; ++Stat)
Stats[Stat] = 0;
}
/// Mutable access to the stored counter for category \p I. Asserts that
/// \p I lies strictly between the two sentinels. This returns the raw slot
/// even for derived categories.
uint64_t &operator[](size_t I) {
assert(I > FIRST_DYNO_STAT && I < LAST_DYNO_STAT &&
"index out of bounds");
return Stats[I];
}
/// Read-only value of category \p I. Categories declared with Fn return the
/// stored counter; derived categories are computed on the fly from other
/// categories via the Fadd/Fsub expressions in DYNO_STATS. LAST_DYNO_STAT
/// yields 0.
uint64_t operator[](size_t I) const {
switch (I) {
// Third expansion: one `case` per category returning its expression.
#define D(name, desc, func) \
case name: \
return func;
#define Fn Stats[I]
#define Fadd(a, b) operator[](a) + operator[](b)
#define Fsub(a, b) operator[](a) - operator[](b)
#define F(a) operator[](a)
#define Radd(a, b) (a + b)
#define Rsub(a, b) (a - b)
DYNO_STATS
#undef Rsub
#undef Radd
#undef F
#undef Fsub
#undef Fadd
#undef Fn
#undef D
default:
llvm_unreachable("index out of bounds");
}
return 0;
}
/// Print all categories to \p OS, optionally with deltas against \p Other.
void print(raw_ostream &OS, const DynoStats *Other = nullptr) const;
/// Accumulate \p Other into this object.
void operator+=(const DynoStats &Other);
bool operator<(const DynoStats &Other) const;
bool operator==(const DynoStats &Other) const;
bool operator!=(const DynoStats &Other) const { return !operator==(Other); }
/// Compare against \p Other considering only the categories in \p Keys.
bool lessThan(const DynoStats &Other, ArrayRef<Category> Keys) const;
/// Human-readable description of category \p C.
static const char* Description(const Category C) {
return Desc[C];
}
};
/// Stream \p Stats to \p OS in the plain (no-delta) format of
/// DynoStats::print() and return the stream for chaining.
inline raw_ostream &operator<<(raw_ostream &OS, const DynoStats &Stats) {
  const DynoStats *const NoBaseline = nullptr;
  Stats.print(OS, NoBaseline);
  return OS;
}
/// Sum of two stat sets.
/// NOTE(review): only the declaration is visible here; the definition is
/// presumably provided in another translation unit - confirm.
DynoStats operator+(const DynoStats &A, const DynoStats &B);
/// Return dynostats for the function.
///
/// The function relies on branch instructions being in-sync with CFG for
/// branch instructions stats. Thus it is better to call it after
/// fixBranches().
///
/// \p BF is the function to collect statistics for.
DynoStats getDynoStats(const BinaryFunction &BF);
/// Return program-wide dynostats.
///
/// \p Funcs is a map-like container whose values (`->second`) are functions.
/// The stats of every simple function are summed. The AArch64 flag of the
/// result is taken from the binary context of the first function.
///
/// An empty \p Funcs yields an all-zero, non-AArch64 result; previously the
/// first element was dereferenced unconditionally, which is undefined
/// behavior on an empty container.
template <typename FuncsType>
inline DynoStats getDynoStats(const FuncsType &Funcs) {
  const bool IsAArch64 =
      Funcs.begin() != Funcs.end() &&
      Funcs.begin()->second.getBinaryContext().isAArch64();
  DynoStats dynoStats(IsAArch64);
  for (auto &BFI : Funcs) {
    auto &BF = BFI.second;
    if (BF.isSimple()) {
      dynoStats += getDynoStats(BF);
    }
  }
  return dynoStats;
}
/// Call a function with optional before and after dynostats printing.
///
/// \p Func   callable invoked exactly once with no arguments.
/// \p Funcs  map-like container of functions the stats are collected over.
/// \p Phase  name of the phase, used in the printed header.
/// \p Flag   when false, \p Func is simply called and nothing is printed.
///
/// When \p Flag is set, program-wide stats are collected before and after
/// \p Func runs. The "before" stats are always printed; if anything changed,
/// the "after" stats follow with per-category deltas.
///
/// An empty \p Funcs is handled (treated as non-AArch64); previously the
/// first element was dereferenced unconditionally, which is undefined
/// behavior on an empty container.
template <typename FnType, typename FuncsType>
inline void
callWithDynoStats(FnType &&Func,
                  const FuncsType &Funcs,
                  StringRef Phase,
                  const bool Flag) {
  const bool IsAArch64 =
      Funcs.begin() != Funcs.end() &&
      Funcs.begin()->second.getBinaryContext().isAArch64();
  DynoStats DynoStatsBefore(IsAArch64);
  if (Flag) {
    DynoStatsBefore = getDynoStats(Funcs);
  }

  Func();

  if (Flag) {
    const auto DynoStatsAfter = getDynoStats(Funcs);
    const auto Changed = (DynoStatsAfter != DynoStatsBefore);
    outs() << "BOLT-INFO: program-wide dynostats after running "
           << Phase << (Changed ? "" : " (no change)") << ":\n\n"
           << DynoStatsBefore << '\n';
    if (Changed) {
      DynoStatsAfter.print(outs(), &DynoStatsBefore);
    }
    outs() << '\n';
  }
}
} // namespace bolt
} // namespace llvm
#endif

View File

@ -266,7 +266,7 @@ void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
return;
}
if (TTypeEncoding & DW_EH_PE_indirect) {
auto PointerOrErr = BC.getPointerAtAddress(TypeAddress);
auto PointerOrErr = BC.extractPointerAtAddress(TypeAddress);
assert(PointerOrErr && "failed to decode indirect address");
TypeAddress = *PointerOrErr;
}
@ -349,8 +349,9 @@ void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
if ((TTypeEncoding & DW_EH_PE_pcrel) && (TypeAddress == TTEntryAddress)) {
TypeAddress = 0;
}
if (TypeAddress && (TTypeEncoding & DW_EH_PE_indirect)) {
auto PointerOrErr = BC.getPointerAtAddress(TypeAddress);
if (TypeAddress &&
(TTypeEncoding & DW_EH_PE_indirect)) {
auto PointerOrErr = BC.extractPointerAtAddress(TypeAddress);
assert(PointerOrErr && "failed to decode indirect address");
TypeAddress = *PointerOrErr;
}
@ -430,14 +431,9 @@ void BinaryFunction::updateEHRanges() {
continue;
// Same symbol is used for the beginning and the end of the range.
const MCSymbol *EHSymbol;
const MCSymbol *EHSymbol = BC.Ctx->createTempSymbol("EH", true);
MCInst EHLabel;
{
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
EHSymbol = BC.Ctx->createTempSymbol("EH", true);
BC.MIB->createEHLabel(EHLabel, EHSymbol, BC.Ctx.get());
}
BC.MIB->createEHLabel(EHLabel, EHSymbol, BC.Ctx.get());
II = std::next(BB->insertPseudoInstr(II, EHLabel));
// At this point we could be in one of the following states:
@ -530,19 +526,42 @@ void BinaryFunction::emitLSDA(MCStreamer *Streamer, bool EmitColdPart) {
// a landing pad, this means that the first landing pad offset will be 0.
// As a result, an exception handling runtime will ignore this landing pad,
// because zero offset denotes the absence of a landing pad.
// For this reason, we emit LPStart value of 0 and output an absolute value
// of the landing pad in the table.
//
// FIXME: this may break PIEs and DSOs where the base address is not 0.
Streamer->EmitIntValue(dwarf::DW_EH_PE_udata4, 1); // LPStart format
Streamer->EmitIntValue(0, 4);
auto emitLandingPad = [&](const MCSymbol *LPSymbol) {
if (!LPSymbol) {
Streamer->EmitIntValue(0, 4);
return;
}
Streamer->EmitSymbolValue(LPSymbol, 4);
};
// To workaround this issue, we issue a special LPStart for cold fragments
// that is equal to FDE start minus 1 byte.
//
// Note that main function fragment cannot start with a landing pad and we
// omit LPStart.
const MCExpr *LPStartExpr = nullptr;
std::function<void(const MCSymbol *)> emitLandingPad;
if (EmitColdPart) {
Streamer->EmitIntValue(dwarf::DW_EH_PE_udata4, 1); // LPStart format
LPStartExpr = MCBinaryExpr::createSub(
MCSymbolRefExpr::create(StartSymbol, *BC.Ctx.get()),
MCConstantExpr::create(1, *BC.Ctx.get()),
*BC.Ctx.get());
Streamer->EmitValue(LPStartExpr, 4);
emitLandingPad = [&](const MCSymbol *LPSymbol) {
if (!LPSymbol) {
Streamer->EmitIntValue(0, 4);
return;
}
Streamer->EmitValue(MCBinaryExpr::createSub(
MCSymbolRefExpr::create(LPSymbol, *BC.Ctx.get()),
LPStartExpr,
*BC.Ctx.get()),
4);
};
} else {
Streamer->EmitIntValue(dwarf::DW_EH_PE_omit, 1); // LPStart format
emitLandingPad = [&](const MCSymbol *LPSymbol) {
if (!LPSymbol) {
Streamer->EmitIntValue(0, 4);
return;
}
Streamer->emitAbsoluteSymbolDiff(LPSymbol, StartSymbol, 4);
};
}
Streamer->EmitIntValue(TTypeEncoding, 1); // TType format
@ -678,6 +697,17 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const {
return true;
const FDE &CurFDE = *I->second;
if (Function.getSize() != CurFDE.getAddressRange()) {
if (opts::Verbosity >= 1) {
errs() << "BOLT-WARNING: CFI information size mismatch for function \""
<< Function << "\""
<< format(": Function size is %dB, CFI covers "
"%dB\n",
Function.getSize(), CurFDE.getAddressRange());
}
return false;
}
auto LSDA = CurFDE.getLSDAAddress();
Function.setLSDAAddress(LSDA ? *LSDA : 0);
@ -838,8 +868,7 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const {
return false;
default:
if (opts::Verbosity >= 1) {
errs() << "BOLT-WARNING: Unrecognized CFI instruction: "
<< Instr.Opcode << '\n';
errs() << "BOLT-WARNING: Unrecognized CFI instruction\n";
}
return false;
}

View File

@ -1,110 +0,0 @@
//===--- ExecutableFileMemoryManager.cpp ----------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#include "ExecutableFileMemoryManager.h"
#include "RewriteInstance.h"
#undef DEBUG_TYPE
#define DEBUG_TYPE "efmm"
using namespace llvm;
using namespace object;
using namespace bolt;
namespace llvm {
namespace bolt {
/// Common implementation behind allocateCodeSection()/allocateDataSection().
///
/// Allocates \p Size bytes for section \p SectionName and registers the
/// section with the BinaryContext, tagging it with \p SectionID.
///
/// - While no object has finished loading (ObjectsLoaded == 0), debug
///   sections are given a plain heap buffer and registered as non-allocatable
///   note sections.
/// - All other sections are allocated through SectionMemoryManager and
///   registered as SHT_PROGBITS sections. For objects loaded after the first
///   one, the registered name gets a ".bolt.extra.<ObjectsLoaded>" suffix.
///
/// \returns pointer to the start of the allocated memory.
uint8_t *ExecutableFileMemoryManager::allocateSection(intptr_t Size,
unsigned Alignment,
unsigned SectionID,
StringRef SectionName,
bool IsCode,
bool IsReadOnly) {
// Register a debug section as a note section.
if (!ObjectsLoaded && RewriteInstance::isDebugSection(SectionName)) {
// NOTE(review): this buffer is not freed here; ownership presumably passes
// to the registered section - confirm against registerOrUpdateNoteSection().
uint8_t *DataCopy = new uint8_t[Size];
auto &Section = BC.registerOrUpdateNoteSection(SectionName,
DataCopy,
Size,
Alignment);
Section.setSectionID(SectionID);
assert(!Section.isAllocatable() && "note sections cannot be allocatable");
return DataCopy;
}
// Delegate the actual memory allocation to the base memory manager.
uint8_t *Ret;
if (IsCode) {
Ret = SectionMemoryManager::allocateCodeSection(Size, Alignment,
SectionID, SectionName);
} else {
Ret = SectionMemoryManager::allocateDataSection(Size, Alignment,
SectionID, SectionName,
IsReadOnly);
}
const auto Flags = BinarySection::getFlags(IsReadOnly, IsCode, true);
// Buf backs the renamed SectionName below, so it must stay alive until the
// last use of SectionName in this function.
SmallVector<char, 256> Buf;
if (ObjectsLoaded > 0)
SectionName = (Twine(SectionName) + ".bolt.extra." + Twine(ObjectsLoaded))
.toStringRef(Buf);
auto &Section = BC.registerOrUpdateSection(SectionName,
ELF::SHT_PROGBITS,
Flags,
Ret,
Size,
Alignment);
Section.setSectionID(SectionID);
assert(Section.isAllocatable() &&
"verify that allocatable is marked as allocatable");
DEBUG(dbgs() << "BOLT: allocating " << (Section.isLocal() ? "local " : "")
<< (IsCode ? "code" : (IsReadOnly ? "read-only data" : "data"))
<< " section : " << SectionName
<< " with size " << Size << ", alignment " << Alignment
<< " at 0x" << Ret << ", ID = " << SectionID << "\n");
return Ret;
}
/// Notifier for non-allocatable (note) section.
///
/// Registers (or updates) \p SectionName in the BinaryContext with a copy of
/// the \p Size bytes at \p Data, tags it with \p SectionID, and returns the
/// section's output data pointer.
uint8_t *ExecutableFileMemoryManager::recordNoteSection(
    const uint8_t *Data,
    uintptr_t Size,
    unsigned Alignment,
    unsigned SectionID,
    StringRef SectionName) {
  DEBUG(dbgs() << "BOLT: note section " << SectionName << " with size " << Size
               << ", alignment " << Alignment << " at 0x"
               << Twine::utohexstr(reinterpret_cast<uint64_t>(Data)) << '\n');

  auto &NoteSection = BC.registerOrUpdateNoteSection(
      SectionName, copyByteArray(Data, Size), Size, Alignment);
  NoteSection.setSectionID(SectionID);
  assert(!NoteSection.isAllocatable() &&
         "note sections cannot be allocatable");
  return NoteSection.getOutputData();
}
/// Mark one more object as fully loaded, then let the base memory manager
/// finalize memory. Returns whatever the base implementation returns.
bool ExecutableFileMemoryManager::finalizeMemory(std::string *ErrMsg) {
  DEBUG(dbgs() << "BOLT: finalizeMemory()\n");
  // The counter is bumped before delegating, as allocateSection() keys its
  // behavior off ObjectsLoaded.
  ObjectsLoaded += 1;
  const bool Result = SectionMemoryManager::finalizeMemory(ErrMsg);
  return Result;
}
/// Out-of-line destructor; performs no work beyond member and base cleanup.
ExecutableFileMemoryManager::~ExecutableFileMemoryManager() = default;
}
}

View File

@ -1,100 +0,0 @@
//===--- ExecutableFileMemoryManager.h ------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_EXECUTABLE_FILE_MEMORY_MANAGER_H
#define LLVM_TOOLS_LLVM_BOLT_EXECUTABLE_FILE_MEMORY_MANAGER_H
#include "BinaryContext.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ExecutionEngine/SectionMemoryManager.h"
#include "llvm/Support/raw_ostream.h"
namespace llvm {
namespace bolt {
/// Plain description of one segment: where it lives in memory and in the file.
struct SegmentInfo {
  uint64_t Address;    ///< Address of the segment in memory.
  uint64_t Size;       ///< Size of the segment in memory.
  uint64_t FileOffset; ///< Offset in the file.
  uint64_t FileSize;   ///< Size in file.

  /// Write a single-line, hex-formatted description of this segment to \p OS.
  void print(raw_ostream &OS) const {
    OS << "SegmentInfo { Address: 0x" << Twine::utohexstr(Address);
    OS << ", Size: 0x" << Twine::utohexstr(Size);
    OS << ", FileOffset: 0x" << Twine::utohexstr(FileOffset);
    OS << ", FileSize: 0x" << Twine::utohexstr(FileSize);
    OS << "}";
  }
};
/// Stream \p SegInfo to \p OS using SegmentInfo::print() and return the
/// stream for chaining.
inline raw_ostream &operator<<(raw_ostream &OS, const SegmentInfo &SegInfo) {
  raw_ostream &Out = OS;
  SegInfo.print(Out);
  return Out;
}
/// Class responsible for allocating and managing code and data sections.
///
/// Extends SectionMemoryManager so that every section allocated during
/// linking is also registered with the BinaryContext.
class ExecutableFileMemoryManager : public SectionMemoryManager {
private:
/// Shared implementation for allocateCodeSection()/allocateDataSection().
uint8_t *allocateSection(intptr_t Size,
unsigned Alignment,
unsigned SectionID,
StringRef SectionName,
bool IsCode,
bool IsReadOnly);
/// Context that sections are registered with.
BinaryContext &BC;
/// Value reported by allowStubAllocation().
bool AllowStubs;
public:
// Our linker's main purpose is to handle a single object file, created
// by RewriteInstance after reading the input binary and reordering it.
// After objects finish loading, we increment this. Therefore, whenever
// this is greater than zero, we are dealing with additional objects that
// will not be managed by BinaryContext but only exist to support linking
// user-supplied objects into the main input executable.
uint32_t ObjectsLoaded{0};
/// [start memory address] -> [segment info] mapping.
std::map<uint64_t, SegmentInfo> SegmentMapInfo;
ExecutableFileMemoryManager(BinaryContext &BC, bool AllowStubs)
: BC(BC), AllowStubs(AllowStubs) {}
~ExecutableFileMemoryManager();
/// Allocate a code section; forwarded to allocateSection() as read-only code.
uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
unsigned SectionID,
StringRef SectionName) override {
return allocateSection(Size, Alignment, SectionID, SectionName,
/*IsCode=*/true, true);
}
/// Allocate a data section; forwarded to allocateSection() as non-code with
/// the caller's \p IsReadOnly.
uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
unsigned SectionID, StringRef SectionName,
bool IsReadOnly) override {
return allocateSection(Size, Alignment, SectionID, SectionName,
/*IsCode=*/false, IsReadOnly);
}
/// Record a non-allocatable (note) section with a copy of its contents.
uint8_t *recordNoteSection(const uint8_t *Data, uintptr_t Size,
unsigned Alignment, unsigned SectionID,
StringRef SectionName) override;
/// Whether stub allocation is permitted, as chosen at construction time.
bool allowStubAllocation() const override { return AllowStubs; }
/// Counts the finished object and delegates to the base implementation.
bool finalizeMemory(std::string *ErrMsg = nullptr) override;
};
} // namespace bolt
} // namespace llvm
#endif

View File

@ -10,7 +10,6 @@
//===----------------------------------------------------------------------===//
#include "JumpTable.h"
#include "BinaryFunction.h"
#include "BinarySection.h"
#include "Relocation.h"
#include "llvm/MC/MCStreamer.h"
#include <utility>
@ -28,27 +27,8 @@ extern cl::opt<JumpTableSupportLevel> JumpTables;
extern cl::opt<unsigned> Verbosity;
}
/// Construct a jump table called \p Name at \p Address inside \p Section,
/// owned by function \p BF.
///
/// \p EntrySize initializes both the input and the output entry size.
/// \p Labels is taken by rvalue reference and its contents are moved into
/// the new object, so the caller's map is left in a moved-from state.
JumpTable::JumpTable(StringRef Name,
                     uint64_t Address,
                     std::size_t EntrySize,
                     JumpTableType Type,
                     LabelMapType &&Labels,
                     BinaryFunction &BF,
                     BinarySection &Section)
  : BinaryData(Name, Address, 0, EntrySize, Section),
    EntrySize(EntrySize),
    OutputEntrySize(EntrySize),
    Type(Type),
    // A named rvalue-reference parameter is an lvalue; without std::move the
    // whole label map would be copied instead of moved.
    Labels(std::move(Labels)),
    Parent(&BF) {
}
std::pair<size_t, size_t>
JumpTable::getEntriesForAddress(const uint64_t Addr) const {
// Check if this is not an address, but a cloned JT id
if ((int64_t)Addr < 0ll)
return std::make_pair(0, Entries.size());
const uint64_t InstOffset = Addr - getAddress();
size_t StartIndex = 0, EndIndex = 0;
uint64_t Offset = 0;
@ -75,12 +55,13 @@ JumpTable::getEntriesForAddress(const uint64_t Addr) const {
return std::make_pair(StartIndex, EndIndex);
}
bool JumpTable::replaceDestination(uint64_t JTAddress, const MCSymbol *OldDest,
bool JumpTable::replaceDestination(uint64_t JTAddress,
const MCSymbol *OldDest,
MCSymbol *NewDest) {
bool Patched{false};
const auto Range = getEntriesForAddress(JTAddress);
for (auto I = &Entries[Range.first], E = &Entries[Range.second]; I != E;
++I) {
for (auto I = &Entries[Range.first], E = &Entries[Range.second];
I != E; ++I) {
auto &Entry = *I;
if (Entry == OldDest) {
Patched = true;
@ -172,20 +153,16 @@ uint64_t JumpTable::emit(MCStreamer *Streamer,
void JumpTable::print(raw_ostream &OS) const {
uint64_t Offset = 0;
if (Type == JTT_PIC)
OS << "PIC ";
OS << "Jump table " << getName() << " for function " << *Parent << " at 0x"
<< Twine::utohexstr(getAddress()) << " with a total count of " << Count
<< ":\n";
for (const auto EntryOffset : OffsetEntries) {
OS << " " << Twine::utohexstr(EntryOffset) << '\n';
}
for (const auto *Entry : Entries) {
auto LI = Labels.find(Offset);
if (Offset && LI != Labels.end()) {
OS << "Jump Table " << LI->second->getName() << " at 0x"
<< Twine::utohexstr(getAddress() + Offset)
<< " (possibly part of larger jump table):\n";
if (LI != Labels.end()) {
OS << "Jump Table " << LI->second->getName() << " at @0x"
<< Twine::utohexstr(getAddress()+Offset);
if (Offset) {
OS << " (possibly part of larger jump table):\n";
} else {
OS << " with total count of " << Count << ":\n";
}
}
OS << format(" 0x%04" PRIx64 " : ", Offset) << Entry->getName();
if (!Counts.empty()) {
@ -197,3 +174,18 @@ void JumpTable::print(raw_ostream &OS) const {
}
OS << "\n\n";
}
/// Construct a jump table called \p Name at \p Address inside \p Section.
///
/// \p EntrySize initializes both the input and the output entry size.
/// \p OffsetEntries and \p Labels are taken by rvalue reference and their
/// contents are moved into the new object, so the caller's containers are
/// left in a moved-from state.
JumpTable::JumpTable(StringRef Name,
                     uint64_t Address,
                     std::size_t EntrySize,
                     JumpTableType Type,
                     decltype(OffsetEntries) &&OffsetEntries,
                     decltype(Labels) &&Labels,
                     BinarySection &Section)
  : BinaryData(Name, Address, 0, EntrySize, Section),
    EntrySize(EntrySize),
    OutputEntrySize(EntrySize),
    Type(Type),
    // Named rvalue-reference parameters are lvalues; std::move is required
    // to actually steal their storage rather than deep-copy it.
    OffsetEntries(std::move(OffsetEntries)),
    Labels(std::move(Labels))
{ }

View File

@ -30,19 +30,11 @@ enum JumpTableSupportLevel : char {
JTS_AGGRESSIVE = 4, /// Aggressive splitting of jump tables.
};
class BinaryFunction;
/// Representation of a jump table.
///
/// The jump table may include other jump tables that are referenced by
/// a different label at a different offset in this jump table.
class JumpTable : public BinaryData {
friend class BinaryContext;
JumpTable() = delete;
JumpTable(const JumpTable &) = delete;
JumpTable &operator=(const JumpTable &) = delete;
public:
enum JumpTableType : char {
JTT_NORMAL,
@ -68,8 +60,7 @@ public:
std::vector<MCSymbol *> Entries;
/// All the entries as offsets into a function. Invalid after CFG is built.
using OffsetsType = std::vector<uint64_t>;
OffsetsType OffsetEntries;
std::vector<uint64_t> OffsetEntries;
/// Map <Offset> -> <Label> used for embedded jump tables. Label at 0 offset
/// is the main label for the jump table.
@ -84,20 +75,6 @@ public:
/// Total number of times this jump table was used.
uint64_t Count{0};
/// BinaryFunction this jump table belongs to.
BinaryFunction *Parent{nullptr};
private:
/// Constructor should only be called by a BinaryContext.
JumpTable(StringRef Name,
uint64_t Address,
std::size_t EntrySize,
JumpTableType Type,
LabelMapType &&Labels,
BinaryFunction &BF,
BinarySection &Section);
public:
/// Return the size of the jump table.
uint64_t getSize() const {
return std::max(OffsetEntries.size(), Entries.size()) * EntrySize;
@ -112,6 +89,15 @@ public:
/// starting at (or containing) 'Addr'.
std::pair<size_t, size_t> getEntriesForAddress(const uint64_t Addr) const;
/// Constructor.
JumpTable(StringRef Name,
uint64_t Address,
std::size_t EntrySize,
JumpTableType Type,
decltype(OffsetEntries) &&OffsetEntries,
LabelMapType &&Labels,
BinarySection &Section);
virtual bool isJumpTable() const override { return true; }
/// Change all entries of the jump table in \p JTAddress pointing to

View File

@ -81,7 +81,7 @@ private:
template <typename ValueType>
class MCSimpleAnnotation : public MCAnnotation {
public:
ValueType &getValue() { return Value; }
const ValueType &getValue() const { return Value; }
bool equals(const MCAnnotation &Other) const override {
return Value == static_cast<const MCSimpleAnnotation &>(Other).Value;
}

View File

@ -148,13 +148,12 @@ int64_t MCPlusBuilder::getGnuArgsSize(const MCInst &Inst) const {
return *Value;
}
void MCPlusBuilder::addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize,
AllocatorIdTy AllocId) {
void MCPlusBuilder::addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize) {
assert(GnuArgsSize >= 0 && "cannot set GNU_args_size to negative value");
assert(getGnuArgsSize(Inst) == -1LL && "GNU_args_size already set");
assert(isInvoke(Inst) && "GNU_args_size can only be set for invoke");
setAnnotationOpValue(Inst, MCAnnotation::kGnuArgsSize, GnuArgsSize, AllocId);
setAnnotationOpValue(Inst, MCAnnotation::kGnuArgsSize, GnuArgsSize);
}
uint64_t MCPlusBuilder::getJumpTable(const MCInst &Inst) const {
@ -164,24 +163,13 @@ uint64_t MCPlusBuilder::getJumpTable(const MCInst &Inst) const {
return *Value;
}
/// Return the value stored in the "JTIndexReg" annotation of \p Inst.
/// The annotation must already be present on the instruction.
uint16_t MCPlusBuilder::getJumpTableIndexReg(const MCInst &Inst) const {
  const uint16_t IndexReg = getAnnotationAs<uint16_t>(Inst, "JTIndexReg");
  return IndexReg;
}
bool MCPlusBuilder::setJumpTable(MCInst &Inst, uint64_t Value,
uint16_t IndexReg, AllocatorIdTy AllocId) {
uint16_t IndexReg) {
if (!isIndirectBranch(Inst))
return false;
setAnnotationOpValue(Inst, MCAnnotation::kJumpTable, Value, AllocId);
getOrCreateAnnotationAs<uint16_t>(Inst, "JTIndexReg", AllocId) = IndexReg;
return true;
}
bool MCPlusBuilder::unsetJumpTable(MCInst &Inst) {
if (!getJumpTable(Inst))
return false;
removeAnnotation(Inst, MCAnnotation::kJumpTable);
removeAnnotation(Inst, "JTIndexReg");
assert(getJumpTable(Inst) == 0 && "jump table already set");
setAnnotationOpValue(Inst, MCAnnotation::kJumpTable, Value);
addAnnotation<>(Inst, "JTIndexReg", IndexReg);
return true;
}
@ -226,12 +214,41 @@ bool MCPlusBuilder::removeAnnotation(MCInst &Inst, unsigned Index) {
auto ImmValue = AnnotationInst->getOperand(I).getImm();
if (extractAnnotationIndex(ImmValue) == Index) {
AnnotationInst->erase(AnnotationInst->begin() + I);
auto *Annotation =
reinterpret_cast<MCAnnotation *>(extractAnnotationValue(ImmValue));
auto Itr = AnnotationPool.find(Annotation);
if (Itr != AnnotationPool.end()) {
AnnotationPool.erase(Itr);
Annotation->~MCAnnotation();
}
return true;
}
}
return false;
}
/// Remove every annotation attached to \p Inst, running the destructor of
/// each annotation that is registered in AnnotationPool, and then drop the
/// last operand of \p Inst. Does nothing when \p Inst has no annotation
/// instruction.
void MCPlusBuilder::removeAllAnnotations(MCInst &Inst) {
  auto *AnnotationInst = getAnnotationInst(Inst);
  if (AnnotationInst == nullptr)
    return;

  // Peel operands off the back one at a time until none remain.
  while (AnnotationInst->getNumOperands() != 0) {
    auto LastOp = std::prev(AnnotationInst->end());
    const auto ImmValue = LastOp->getImm();
    AnnotationInst->erase(LastOp);

    auto *Annotation =
        reinterpret_cast<MCAnnotation *>(extractAnnotationValue(ImmValue));
    auto PoolIt = AnnotationPool.find(Annotation);
    // Only values recorded in the pool get their destructor invoked.
    if (PoolIt == AnnotationPool.end())
      continue;
    AnnotationPool.erase(PoolIt);
    Annotation->~MCAnnotation();
  }

  // Clear all attached MC+ info since it's no longer used.
  Inst.erase(std::prev(Inst.end()));
}
void MCPlusBuilder::stripAnnotations(MCInst &Inst) {
auto *AnnotationInst = getAnnotationInst(Inst);
if (!AnnotationInst)
@ -251,7 +268,7 @@ MCPlusBuilder::printAnnotations(const MCInst &Inst, raw_ostream &OS) const {
const auto Index = extractAnnotationIndex(Imm);
const auto Value = extractAnnotationValue(Imm);
const auto *Annotation =
reinterpret_cast<const MCAnnotation *>(Value);
reinterpret_cast<const MCAnnotation *>(Value);
if (Index >= MCAnnotation::kGeneric) {
OS << " # " << AnnotationNames[Index - MCAnnotation::kGeneric]
<< ": ";
@ -266,7 +283,7 @@ bool MCPlusBuilder::evaluateBranch(const MCInst &Inst, uint64_t Addr,
}
void MCPlusBuilder::getClobberedRegs(const MCInst &Inst,
BitVector &Regs) const {
BitVector &Regs) const {
if (isPrefix(Inst) || isCFI(Inst))
return;
@ -285,7 +302,7 @@ void MCPlusBuilder::getClobberedRegs(const MCInst &Inst,
}
void MCPlusBuilder::getTouchedRegs(const MCInst &Inst,
BitVector &Regs) const {
BitVector &Regs) const {
if (isPrefix(Inst) || isCFI(Inst))
return;
@ -308,7 +325,7 @@ void MCPlusBuilder::getTouchedRegs(const MCInst &Inst,
}
void MCPlusBuilder::getWrittenRegs(const MCInst &Inst,
BitVector &Regs) const {
BitVector &Regs) const {
if (isPrefix(Inst) || isCFI(Inst))
return;
@ -364,7 +381,7 @@ bool MCPlusBuilder::hasUseOfPhysReg(const MCInst &MI, unsigned Reg) const {
const BitVector &
MCPlusBuilder::getAliases(MCPhysReg Reg,
bool OnlySmaller) const {
bool OnlySmaller) const {
// AliasMap caches a mapping of registers to the set of registers that
// alias (are sub or superregs of itself, including itself).
static std::vector<BitVector> AliasMap;

View File

@ -35,11 +35,8 @@
#include <cassert>
#include <cstdint>
#include <map>
#include <mutex>
#include <set>
#include <shared_mutex>
#include <system_error>
#include <unordered_map>
#include <unordered_set>
namespace llvm {
@ -47,31 +44,26 @@ namespace bolt {
/// Different types of indirect branches encountered during disassembly.
enum class IndirectBranchType : char {
UNKNOWN = 0, /// Unable to determine type.
POSSIBLE_TAIL_CALL, /// Possibly a tail call.
POSSIBLE_JUMP_TABLE, /// Possibly a switch/jump table.
POSSIBLE_PIC_JUMP_TABLE, /// Possibly a jump table for PIC.
POSSIBLE_GOTO, /// Possibly a gcc's computed goto.
POSSIBLE_FIXED_BRANCH, /// Possibly an indirect branch to a fixed location.
UNKNOWN = 0, /// Unable to determine type.
POSSIBLE_TAIL_CALL, /// Possibly a tail call.
POSSIBLE_JUMP_TABLE, /// Possibly a switch/jump table.
POSSIBLE_PIC_JUMP_TABLE, /// Possibly a jump table for PIC.
POSSIBLE_GOTO, /// Possibly a gcc's computed goto.
POSSIBLE_FIXED_BRANCH, /// Possibly an indirect branch to a fixed location.
};
class MCPlusBuilder {
public:
using AllocatorIdTy = uint16_t;
private:
/// A struct that represents a single annotation allocator
struct AnnotationAllocator {
SpecificBumpPtrAllocator<MCInst> MCInstAllocator;
BumpPtrAllocator ValueAllocator;
std::unordered_set<MCPlus::MCAnnotation *> AnnotationPool;
};
/// Annotation instruction allocator.
SpecificBumpPtrAllocator<MCInst> MCInstAllocator;
/// A set of annotation allocators
std::unordered_map<AllocatorIdTy, AnnotationAllocator> AnnotationAllocators;
/// Annotation value allocator.
BumpPtrAllocator Allocator;
/// A variable that is used to generate unique ids for annotation allocators
AllocatorIdTy MaxAllocatorId = 0;
/// Record all the annotations with non-trivial type. To prevent leaks, these
/// will need destructors called when the annotation is removed or when all
/// annotations are destroyed.
std::unordered_set<MCPlus::MCAnnotation*> AnnotationPool;
/// We encode Index and Value into a 64-bit immediate operand value.
static int64_t encodeAnnotationImm(unsigned Index, int64_t Value) {
@ -108,12 +100,10 @@ private:
return AnnotationInst;
}
void setAnnotationOpValue(MCInst &Inst, unsigned Index, int64_t Value,
AllocatorIdTy AllocatorId = 0) {
void setAnnotationOpValue(MCInst &Inst, unsigned Index, int64_t Value) {
auto *AnnotationInst = getAnnotationInst(Inst);
if (!AnnotationInst) {
auto &Allocator = getAnnotationAllocator(AllocatorId);
AnnotationInst = new (Allocator.MCInstAllocator.Allocate()) MCInst();
AnnotationInst = new (MCInstAllocator.Allocate()) MCInst();
AnnotationInst->setOpcode(TargetOpcode::ANNOTATION_LABEL);
Inst.addOperand(MCOperand::createInst(AnnotationInst));
}
@ -288,55 +278,20 @@ public:
public:
MCPlusBuilder(const MCInstrAnalysis *Analysis, const MCInstrInfo *Info,
const MCRegisterInfo *RegInfo)
: Analysis(Analysis), Info(Info), RegInfo(RegInfo) {
// Initialize the default annotation allocator with id 0
AnnotationAllocators.emplace(0, AnnotationAllocator());
MaxAllocatorId++;
}
/// Register a fresh annotation allocator under the next unused id and
/// return that id.
AllocatorIdTy initializeNewAnnotationAllocator() {
  const AllocatorIdTy NewId = MaxAllocatorId;
  AnnotationAllocators.emplace(NewId, AnnotationAllocator());
  ++MaxAllocatorId;
  return NewId;
}
/// Return the annotation allocator registered under \p AllocatorId.
/// The id must have been initialized beforehand; this is asserted.
AnnotationAllocator &getAnnotationAllocator(AllocatorIdTy AllocatorId) {
  // One lookup serves both the sanity check and the access; this runs for
  // every annotation allocation, so avoid hashing the id twice.
  auto It = AnnotationAllocators.find(AllocatorId);
  assert(It != AnnotationAllocators.end() &&
         "allocator not initialized");
  return It->second;
}
/// Return true when an annotation allocator is registered under
/// \p AllocatorId.
bool checkAllocatorExists(AllocatorIdTy AllocatorId) {
  return AnnotationAllocators.find(AllocatorId) != AnnotationAllocators.end();
}
/// Destroy every annotation recorded in the pool of allocator
/// \p AllocatorId, empty that pool, and reset the allocator's value storage.
void freeValuesAllocator(AllocatorIdTy AllocatorId) {
  auto &Target = getAnnotationAllocator(AllocatorId);
  auto &Pool = Target.AnnotationPool;
  // Run destructors explicitly before the backing memory is reset below.
  for (auto PoolIt = Pool.begin(), PoolEnd = Pool.end(); PoolIt != PoolEnd;
       ++PoolIt)
    (*PoolIt)->~MCAnnotation();
  Pool.clear();
  Target.ValueAllocator.Reset();
}
: Analysis(Analysis), Info(Info), RegInfo(RegInfo) {}
virtual ~MCPlusBuilder() {
freeAnnotations();
}
/// Free all memory allocated for annotations
/// Free all memory allocated for annotations.
void freeAnnotations() {
for (auto &Element : AnnotationAllocators) {
auto &Allocator = Element.second;
for (auto *Annotation : Allocator.AnnotationPool)
Annotation->~MCAnnotation();
Allocator.AnnotationPool.clear();
Allocator.ValueAllocator.Reset();
Allocator.MCInstAllocator.DestroyAll();
for (auto *Annotation : AnnotationPool) {
Annotation->~MCAnnotation();
}
AnnotationPool.clear();
MCInstAllocator.DestroyAll();
Allocator.Reset();
}
using CompFuncTy = std::function<bool(const MCSymbol *, const MCSymbol *)>;
@ -379,11 +334,6 @@ public:
return false;
}
/// Check whether we support inverting this branch
virtual bool isUnsupportedBranch(unsigned Opcode) const {
return false;
}
/// Return true if the instruction is of pseudo kind.
bool isPseudo(const MCInst &Inst) const {
return Info->get(Inst.getOpcode()).isPseudo();
@ -403,28 +353,11 @@ public:
llvm_unreachable("not implemented");
}
virtual void createPushRegisterIndirect(MCInst &Inst,
const MCPhysReg &BaseReg, int64_t Scale,
const MCPhysReg &IndexReg, int64_t Offset,
const MCExpr *OffsetExpr,
const MCPhysReg &AddrSegmentReg,
unsigned Size) const {
llvm_unreachable("not implemented");
}
virtual void createPopRegister(MCInst &Inst, MCPhysReg Reg,
unsigned Size) const {
llvm_unreachable("not implemented");
}
virtual void createPushFlags(MCInst &Inst, unsigned Size) const {
llvm_unreachable("not implemented");
}
virtual void createPopFlags(MCInst &Inst, unsigned Size) const {
llvm_unreachable("not implemented");
}
virtual bool createDirectCall(MCInst &Inst, const MCSymbol *Target,
MCContext *Ctx) {
llvm_unreachable("not implemented");
@ -435,22 +368,7 @@ public:
llvm_unreachable("not implemented");
}
virtual MCPhysReg getInstructionPointer() const {
llvm_unreachable("not implemented");
}
/// Return a register number that is guaranteed to not match with
/// any real register on the underlying architecture.
virtual MCPhysReg getNoRegister() const {
llvm_unreachable("not implemented");
}
/// Return a register corresponding to a function integer argument \p ArgNo
/// if the argument is passed in a register. Or return the result of
/// getNoRegister() otherwise. The enumeration starts at 0.
///
/// Note: this should depend on a used calling convention.
virtual MCPhysReg getIntArgRegister(unsigned ArgNo) const {
virtual MCPhysReg getX86NoRegister() const {
llvm_unreachable("not implemented");
}
@ -476,11 +394,6 @@ public:
return false;
}
virtual bool isBreakpoint(const MCInst &Inst) const {
llvm_unreachable("not implemented");
return false;
}
virtual bool isPrefix(const MCInst &Inst) const {
llvm_unreachable("not implemented");
return false;
@ -544,11 +457,6 @@ public:
return false;
}
virtual bool isLfence(const MCInst &Inst) const {
llvm_unreachable("not implemented");
return false;
}
virtual bool isLeave(const MCInst &Inst) const {
llvm_unreachable("not implemented");
return false;
@ -574,11 +482,6 @@ public:
return false;
}
virtual bool isActualLoad(const MCInst &Inst) const {
llvm_unreachable("not implemented");
return false;
}
virtual bool isLoad(const MCInst &Inst) const {
llvm_unreachable("not implemented");
return false;
@ -987,9 +890,9 @@ public:
/// of the passed \p Symbol plus \p Addend. If the instruction does not have
/// an immediate operand or has more than one - then return false. Otherwise
/// return true.
virtual bool replaceImmWithSymbolRef(MCInst &Inst, const MCSymbol *Symbol,
int64_t Addend, MCContext *Ctx,
int64_t &Value, uint64_t RelType) const {
virtual bool replaceImmWithSymbol(MCInst &Inst, MCSymbol *Symbol,
int64_t Addend, MCContext *Ctx,
int64_t &Value, uint64_t RelType) const {
llvm_unreachable("not implemented");
return false;
}
@ -1054,21 +957,14 @@ public:
int64_t getGnuArgsSize(const MCInst &Inst) const;
/// Add the value of GNU_args_size to Inst if it already has EH info.
void addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize,
AllocatorIdTy AllocId = 0);
void addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize);
/// Return jump table addressed by this instruction.
uint64_t getJumpTable(const MCInst &Inst) const;
/// Return index register for instruction that uses a jump table.
uint16_t getJumpTableIndexReg(const MCInst &Inst) const;
/// Set jump table addressed by this instruction.
bool setJumpTable(MCInst &Inst, uint64_t Value, uint16_t IndexReg,
AllocatorIdTy AllocId = 0);
/// Disassociate instruction with a jump table.
bool unsetJumpTable(MCInst &Inst);
bool setJumpTable(MCInst &Inst, uint64_t Value,
uint16_t IndexReg);
/// Return destination of conditional tail call instruction if \p Inst is one.
Optional<uint64_t> getConditionalTailCall(const MCInst &Inst) const;
@ -1230,7 +1126,7 @@ public:
}
/// Replace instruction opcode to be a tail call instead of jump.
virtual bool convertJmpToTailCall(MCInst &Inst) {
virtual bool convertJmpToTailCall(MCInst &Inst, MCContext *Ctx) {
llvm_unreachable("not implemented");
return false;
}
@ -1438,32 +1334,6 @@ public:
return false;
}
/// Create instruction to left shift contents of target
virtual bool createShl(MCInst &Inst, const MCPhysReg &BaseReg, int64_t Scale,
const MCPhysReg &IndexReg, int64_t Offset,
const MCExpr *OffsetExpr,
const MCPhysReg &AddrSegmentReg,
uint8_t Immediate, int Size) const {
llvm_unreachable("not implemented");
return false;
}
/// Create instruction to load an effective address into a target
virtual bool createLea(MCInst &Inst, const MCPhysReg &BaseReg, int64_t Scale,
const MCPhysReg &IndexReg, int64_t Offset,
const MCExpr *OffsetExpr, const MCPhysReg &AddrSegmentReg,
const MCPhysReg &DstReg, int Size) const {
llvm_unreachable("not implemented");
return false;
}
/// Create instruction to increment contents of target by 1
virtual bool createIncMemory(MCInst &Inst, const MCSymbol *Target,
MCContext *Ctx) const {
llvm_unreachable("not implemented");
return false;
}
/// Create a fragment of code (sequence of instructions) that load a 32-bit
/// address from memory, zero-extends it to 64 and jump to it (indirect jump).
virtual bool
@ -1494,21 +1364,6 @@ public:
return true;
}
/// Create an inline version of memcpy(dest, src, 1).
virtual std::vector<MCInst> createOneByteMemcpy() const {
llvm_unreachable("not implemented");
return {};
}
/// Create a sequence of instructions to compare contents of a register
/// \p RegNo to immediate \p Imm and jump to \p Target if they are equal.
virtual std::vector<MCInst>
createCmpJE(MCPhysReg RegNo, int64_t Imm, const MCSymbol *Target,
MCContext *Ctx) const {
llvm_unreachable("not implemented");
return {};
}
/// Creates inline memcpy instruction. If \p ReturnEnd is true, then return
/// (dest + n) instead of dest.
virtual std::vector<MCInst> createInlineMemcpy(bool ReturnEnd) const {
@ -1556,6 +1411,7 @@ public:
return true;
}
/// Return annotation index matching the \p Name.
Optional<unsigned> getAnnotationIndex(StringRef Name) const {
auto AI = AnnotationNameIndexMap.find(Name);
@ -1581,30 +1437,25 @@ public:
/// Store an annotation value on an MCInst. This assumes the annotation
/// is not already present.
template <typename ValueType>
const ValueType &addAnnotation(MCInst &Inst, unsigned Index,
const ValueType &Val,
AllocatorIdTy AllocatorId = 0) {
const ValueType &addAnnotation(MCInst &Inst,
unsigned Index,
const ValueType &Val) {
assert(!hasAnnotation(Inst, Index));
auto &Allocator = getAnnotationAllocator(AllocatorId);
auto *A = new (Allocator.ValueAllocator)
MCPlus::MCSimpleAnnotation<ValueType>(Val);
auto *A = new (Allocator) MCPlus::MCSimpleAnnotation<ValueType>(Val);
if (!std::is_trivial<ValueType>::value) {
Allocator.AnnotationPool.insert(A);
AnnotationPool.insert(A);
}
setAnnotationOpValue(Inst, Index, reinterpret_cast<int64_t>(A),
AllocatorId);
setAnnotationOpValue(Inst, Index, reinterpret_cast<int64_t>(A));
return A->getValue();
}
/// Store an annotation value on an MCInst. This assumes the annotation
/// is not already present.
template <typename ValueType>
const ValueType &addAnnotation(MCInst &Inst, StringRef Name,
const ValueType &Val,
AllocatorIdTy AllocatorId = 0) {
return addAnnotation(Inst, getOrCreateAnnotationIndex(Name), Val,
AllocatorId);
const ValueType &addAnnotation(MCInst &Inst,
StringRef Name,
const ValueType &Val) {
return addAnnotation(Inst, getOrCreateAnnotationIndex(Name), Val);
}
/// Get an annotation as a specific value, but if the annotation does not
@ -1612,13 +1463,12 @@ public:
/// Return a non-const ref so caller can freely modify its contents
/// afterwards.
template <typename ValueType>
ValueType &getOrCreateAnnotationAs(MCInst &Inst, unsigned Index,
AllocatorIdTy AllocatorId = 0) {
ValueType& getOrCreateAnnotationAs(MCInst &Inst, unsigned Index) {
auto Val =
tryGetAnnotationAs<ValueType>(const_cast<const MCInst &>(Inst), Index);
tryGetAnnotationAs<ValueType>(const_cast<const MCInst &>(Inst), Index);
if (!Val)
Val = addAnnotation(Inst, Index, ValueType(), AllocatorId);
return const_cast<ValueType &>(*Val);
Val = addAnnotation(Inst, Index, ValueType());
return const_cast<ValueType&>(*Val);
}
/// Get an annotation as a specific value, but if the annotation does not
@ -1626,26 +1476,25 @@ public:
/// Return a non-const ref so caller can freely modify its contents
/// afterwards.
template <typename ValueType>
ValueType &getOrCreateAnnotationAs(MCInst &Inst, StringRef Name,
AllocatorIdTy AllocatorId = 0) {
ValueType& getOrCreateAnnotationAs(MCInst &Inst, StringRef Name) {
const auto Index = getOrCreateAnnotationIndex(Name);
return getOrCreateAnnotationAs<ValueType>(Inst, Index, AllocatorId);
return getOrCreateAnnotationAs<ValueType>(Inst, Index);
}
/// Get an annotation as a specific value. Assumes that the annotation exists.
/// Use hasAnnotation() if the annotation may not exist.
template <typename ValueType>
ValueType &getAnnotationAs(const MCInst &Inst, unsigned Index) const {
const ValueType &getAnnotationAs(const MCInst &Inst, unsigned Index) const {
auto Value = getAnnotationOpValue(Inst, Index);
assert(Value && "annotation should exist");
return reinterpret_cast<MCPlus::MCSimpleAnnotation<ValueType> *>
return reinterpret_cast<const MCPlus::MCSimpleAnnotation<ValueType> *>
(*Value)->getValue();
}
/// Get an annotation as a specific value. Assumes that the annotation exists.
/// Use hasAnnotation() if the annotation may not exist.
template <typename ValueType>
ValueType &getAnnotationAs(const MCInst &Inst, StringRef Name) const {
const ValueType &getAnnotationAs(const MCInst &Inst, StringRef Name) const {
const auto Index = getAnnotationIndex(Name);
assert(Index && "annotation should exist");
return getAnnotationAs<ValueType>(Inst, *Index);
@ -1737,6 +1586,9 @@ public:
return removeAnnotation(Inst, *Index);
}
/// Remove all meta-data annotations from Inst.
void removeAllAnnotations(MCInst &Inst);
/// Remove meta-data, but don't destroy it.
void stripAnnotations(MCInst &Inst);
@ -1758,13 +1610,8 @@ public:
/// empty vector of instructions. The label is meant to indicate the basic
/// block where all previous snippets are joined, i.e. the instructions that
/// would immediately follow the original call.
using BlocksVectorTy = std::vector<std::pair<MCSymbol*, std::vector<MCInst>>>;
struct MultiBlocksCode {
BlocksVectorTy Blocks;
std::vector<MCSymbol*> Successors;
};
virtual BlocksVectorTy indirectCallPromotion(
using ICPdata = std::vector<std::pair<MCSymbol*, std::vector<MCInst>>>;
virtual ICPdata indirectCallPromotion(
const MCInst &CallInst,
const std::vector<std::pair<MCSymbol *, uint64_t>> &Targets,
const std::vector<std::pair<MCSymbol *, uint64_t>> &VtableSyms,
@ -1773,18 +1620,19 @@ public:
MCContext *Ctx
) {
llvm_unreachable("not implemented");
return BlocksVectorTy();
return ICPdata();
}
virtual BlocksVectorTy jumpTablePromotion(
virtual ICPdata jumpTablePromotion(
const MCInst &IJmpInst,
const std::vector<std::pair<MCSymbol *,uint64_t>>& Targets,
const std::vector<MCInst *> &TargetFetchInsns,
MCContext *Ctx
) const {
llvm_unreachable("not implemented");
return BlocksVectorTy();
return ICPdata();
}
};
} // namespace bolt

View File

@ -1,232 +0,0 @@
//===--- ParallelUtilities.cpp -------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#include "ParallelUtilities.h"
#include "llvm/Support/Timer.h"
#include <mutex>
#include <shared_mutex>
#define DEBUG_TYPE "par-utils"
// Command-line options controlling parallel execution. All of them are
// registered in BoltCategory, which is defined elsewhere.
namespace opts {
extern cl::OptionCategory BoltCategory;
// Number of threads; defaults to hardware_concurrency(). In this file it
// sizes the shared thread pool and is multiplied with the per-thread task
// count to decide how many work blocks to create.
cl::opt<unsigned>
ThreadCount("thread-count",
cl::desc("number of threads"),
cl::init(hardware_concurrency()),
cl::cat(BoltCategory));
// When set, the runOnEachFunction* helpers in this file execute all work
// sequentially on the calling thread.
cl::opt<bool>
NoThreads("no-threads",
cl::desc("disable multithreading"),
cl::init(false),
cl::cat(BoltCategory));
// Number of tasks per thread (default 20). Not referenced in this file;
// presumably supplies the default for the TasksPerThread parameters --
// TODO confirm against the header.
cl::opt<unsigned>
TaskCount("tasks-per-thread",
cl::desc("number of tasks to be created per thread"),
cl::init(20),
cl::cat(BoltCategory));
} // namespace opts
namespace llvm {
namespace bolt {
namespace ParallelUtilities {
namespace {
/// A single thread pool that is used to run parallel tasks
std::unique_ptr<ThreadPool> ThreadPoolPtr;
/// Estimate the cost of processing \p BF under \p SchedPolicy.
///
/// Trivial scheduling charges 1 for every function without consulting
/// \p SkipPredicate. For all other policies a function rejected by
/// \p SkipPredicate costs 0; otherwise the cost is constant, or linear or
/// quadratic in either BF.getSize() or BF.size(), depending on the policy.
unsigned computeCostFor(const BinaryFunction &BF,
                        const PredicateTy &SkipPredicate,
                        const SchedulingPolicy &SchedPolicy) {
  if (SchedPolicy == SchedulingPolicy::SP_TRIVIAL)
    return 1;

  const bool IsSkipped = SkipPredicate && SkipPredicate(BF);
  if (IsSkipped)
    return 0;

  switch (SchedPolicy) {
  case SchedulingPolicy::SP_CONSTANT:
    return 1;
  case SchedulingPolicy::SP_INST_LINEAR:
    return BF.getSize();
  case SchedulingPolicy::SP_INST_QUADRATIC: {
    const auto Size = BF.getSize();
    return Size * Size;
  }
  case SchedulingPolicy::SP_BB_LINEAR:
    return BF.size();
  case SchedulingPolicy::SP_BB_QUADRATIC: {
    const auto Count = BF.size();
    return Count * Count;
  }
  default:
    llvm_unreachable("unsupported scheduling policy");
  }
}
/// Estimate the total cost of running over all functions in \p BC.
///
/// Under trivial scheduling the cost is simply the number of functions.
/// Otherwise it is the sum of computeCostFor() over every function. If that
/// sum is zero, a warning is printed, \p SchedPolicy is overwritten with
/// SP_TRIVIAL (visible to the caller through the reference) and the number
/// of functions is returned instead.
inline unsigned estimateTotalCost(const BinaryContext &BC,
                                  const PredicateTy &SkipPredicate,
                                  SchedulingPolicy &SchedPolicy) {
  if (SchedPolicy == SchedulingPolicy::SP_TRIVIAL)
    return BC.getBinaryFunctions().size();

  unsigned TotalCost = 0;
  for (const auto &BFI : BC.getBinaryFunctions())
    TotalCost += computeCostFor(BFI.second, SkipPredicate, SchedPolicy);

  // Switch to trivial scheduling if total estimated work is zero
  if (TotalCost == 0) {
    outs() << "BOLT-WARNING: Running parallel work of 0 estimated cost, will "
              "switch to trivial scheduling.\n";
    // Qualified like every other use of the enumerators in this file.
    SchedPolicy = SchedulingPolicy::SP_TRIVIAL;
    TotalCost = BC.getBinaryFunctions().size();
  }
  return TotalCost;
}
} // namespace
/// Return the shared thread pool, creating it with opts::ThreadCount threads
/// the first time it is requested.
ThreadPool &getThreadPool() {
  if (!ThreadPoolPtr)
    ThreadPoolPtr = std::make_unique<ThreadPool>(opts::ThreadCount);
  return *ThreadPoolPtr;
}
/// Run \p WorkFunction on every function in \p BC that is not rejected by
/// \p SkipPredicate.
///
/// With opts::NoThreads or \p ForceSequential the work runs on the calling
/// thread. Otherwise the functions are split into consecutive blocks of
/// roughly equal estimated cost (per \p SchedPolicy), aiming for
/// \p TasksPerThread * opts::ThreadCount blocks, and each block is submitted
/// to the shared thread pool. The call returns only after all blocks finish.
/// \p LogName names the timer that is used in debug runs.
void runOnEachFunction(BinaryContext &BC, SchedulingPolicy SchedPolicy,
                       WorkFuncTy WorkFunction, PredicateTy SkipPredicate,
                       std::string LogName, bool ForceSequential,
                       unsigned TasksPerThread) {
  if (BC.getBinaryFunctions().size() == 0)
    return;

  // Process the half-open range [BlockBegin, BlockEnd) of functions.
  auto runBlock = [&](std::map<uint64_t, BinaryFunction>::iterator BlockBegin,
                      std::map<uint64_t, BinaryFunction>::iterator BlockEnd) {
    Timer T(LogName, LogName);
    DEBUG(T.startTimer());

    for (auto It = BlockBegin; It != BlockEnd; ++It) {
      auto &BF = It->second;
      if (SkipPredicate && SkipPredicate(BF))
        continue;

      WorkFunction(BF);
    }
    DEBUG(T.stopTimer());
  };

  if (opts::NoThreads || ForceSequential) {
    runBlock(BC.getBinaryFunctions().begin(), BC.getBinaryFunctions().end());
    return;
  }

  // Estimate the overall runtime cost using the scheduling policy
  const unsigned TotalCost = estimateTotalCost(BC, SkipPredicate, SchedPolicy);
  // Clamp to at least one block: TotalCost is never zero here, so a zero
  // product (e.g. tasks-per-thread=0) would otherwise divide by zero below.
  const unsigned RequestedBlocks = TasksPerThread * opts::ThreadCount;
  const unsigned BlocksCount = RequestedBlocks != 0 ? RequestedBlocks : 1;
  const unsigned BlockCost =
      TotalCost > BlocksCount ? TotalCost / BlocksCount : 1;

  // Divide work into blocks of equal cost
  ThreadPool &Pool = getThreadPool();
  auto BlockBegin = BC.getBinaryFunctions().begin();
  unsigned CurrentCost = 0;

  for (auto It = BC.getBinaryFunctions().begin();
       It != BC.getBinaryFunctions().end(); ++It) {
    auto &BF = It->second;
    CurrentCost += computeCostFor(BF, SkipPredicate, SchedPolicy);

    if (CurrentCost >= BlockCost) {
      Pool.async(runBlock, BlockBegin, std::next(It));
      BlockBegin = std::next(It);
      CurrentCost = 0;
    }
  }
  // Whatever is left after the last full block forms the final task.
  Pool.async(runBlock, BlockBegin, BC.getBinaryFunctions().end());
  Pool.wait();
}
/// Same as runOnEachFunction(), but \p WorkFunction additionally receives an
/// annotation allocator id.
///
/// In sequential mode (opts::NoThreads or \p ForceSequential) all functions
/// are processed on the calling thread with allocator id 0. In parallel mode
/// each block gets its own id, starting from 1; missing allocators are
/// created in BC.MIB before the block is submitted. Tasks are held back by
/// an exclusive lock until every block has been queued.
void runOnEachFunctionWithUniqueAllocId(
    BinaryContext &BC, SchedulingPolicy SchedPolicy,
    WorkFuncWithAllocTy WorkFunction, PredicateTy SkipPredicate,
    std::string LogName, bool ForceSequential, unsigned TasksPerThread) {
  if (BC.getBinaryFunctions().size() == 0)
    return;

  std::shared_timed_mutex MainLock;
  // Process the half-open range [BlockBegin, BlockEnd) with one allocator.
  auto runBlock = [&](std::map<uint64_t, BinaryFunction>::iterator BlockBegin,
                      std::map<uint64_t, BinaryFunction>::iterator BlockEnd,
                      MCPlusBuilder::AllocatorIdTy AllocId) {
    Timer T(LogName, LogName);
    DEBUG(T.startTimer());
    // Blocks here until the scheduling code below releases its unique lock.
    std::shared_lock<std::shared_timed_mutex> Lock(MainLock);
    for (auto It = BlockBegin; It != BlockEnd; ++It) {
      auto &BF = It->second;
      if (SkipPredicate && SkipPredicate(BF))
        continue;

      WorkFunction(BF, AllocId);
    }
    DEBUG(T.stopTimer());
  };

  if (opts::NoThreads || ForceSequential) {
    runBlock(BC.getBinaryFunctions().begin(), BC.getBinaryFunctions().end(), 0);
    return;
  }

  // This lock is used to postpone task execution
  std::unique_lock<std::shared_timed_mutex> Lock(MainLock);

  // Estimate the overall runtime cost using the scheduling policy
  const unsigned TotalCost = estimateTotalCost(BC, SkipPredicate, SchedPolicy);
  // Clamp to at least one block: TotalCost is never zero here, so a zero
  // product (e.g. tasks-per-thread=0) would otherwise divide by zero below.
  const unsigned RequestedBlocks = TasksPerThread * opts::ThreadCount;
  const unsigned BlocksCount = RequestedBlocks != 0 ? RequestedBlocks : 1;
  const unsigned BlockCost =
      TotalCost > BlocksCount ? TotalCost / BlocksCount : 1;

  // Divide work into blocks of equal cost
  ThreadPool &Pool = getThreadPool();
  auto BlockBegin = BC.getBinaryFunctions().begin();
  unsigned CurrentCost = 0;
  unsigned AllocId = 1;
  for (auto It = BC.getBinaryFunctions().begin();
       It != BC.getBinaryFunctions().end(); ++It) {
    auto &BF = It->second;
    CurrentCost += computeCostFor(BF, SkipPredicate, SchedPolicy);

    if (CurrentCost >= BlockCost) {
      if (!BC.MIB->checkAllocatorExists(AllocId)) {
        auto Id = BC.MIB->initializeNewAnnotationAllocator();
        (void)Id; // Only read by the assert; silence NDEBUG builds.
        assert(AllocId == Id && "unexpected allocator id created");
      }
      Pool.async(runBlock, BlockBegin, std::next(It), AllocId);
      AllocId++;
      BlockBegin = std::next(It);
      CurrentCost = 0;
    }
  }

  // Whatever is left after the last full block forms the final task.
  if (!BC.MIB->checkAllocatorExists(AllocId)) {
    auto Id = BC.MIB->initializeNewAnnotationAllocator();
    (void)Id; // Only read by the assert; silence NDEBUG builds.
    assert(AllocId == Id && "unexpected allocator id created");
  }

  Pool.async(runBlock, BlockBegin, BC.getBinaryFunctions().end(), AllocId);
  // All blocks are queued: let the workers proceed, then wait for them.
  Lock.unlock();
  Pool.wait();
}
} // namespace ParallelUtilities
} // namespace bolt
} // namespace llvm

View File

@ -1,78 +0,0 @@
//===-- ParallelUtilities.h - ----------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// This class creates an interface that can be used to run parallel tasks that
// operate on functions. Several scheduling criteria are supported using
// SchedulingPolicy, and are defined by how the runtime cost should be
// estimated.
// If the NoThreads flag is passed, work will execute sequentially.
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_PARALLEL_UTILITIES_H
#define LLVM_TOOLS_LLVM_BOLT_PARALLEL_UTILITIES_H
#include "BinaryContext.h"
#include "BinaryFunction.h"
#include "MCPlusBuilder.h"
#include "llvm/Support/ThreadPool.h"
using namespace llvm;
/// Command-line options consumed by the parallel utilities. They are only
/// declared here (extern); the definitions live in another translation unit.
namespace opts {
/// Multiplied by the tasks-per-thread value to obtain the number of work
/// blocks the functions are divided into.
extern cl::opt<unsigned> ThreadCount;
/// When set, work executes sequentially instead of on the thread pool.
extern cl::opt<bool> NoThreads;
/// Default value of the TasksPerThread argument of the run* entry points.
extern cl::opt<unsigned> TaskCount;
}
namespace llvm {
namespace bolt {
namespace ParallelUtilities {
/// Work callback that receives the function to process together with the
/// annotation allocator id assigned to the task running it.
using WorkFuncWithAllocTy =
std::function<void(BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy)>;
/// Work callback that receives only the function to process.
using WorkFuncTy = std::function<void(BinaryFunction &BF)>;
/// Predicate over a function; used below as the "skip this function" filter.
using PredicateTy = std::function<bool(const BinaryFunction &BF)>;
/// Selects how the runtime cost of processing the functions is estimated.
enum SchedulingPolicy {
SP_TRIVIAL, /// cost is estimated by the number of functions
SP_CONSTANT, /// cost is estimated by the number of non-skipped functions
SP_INST_LINEAR, /// cost is estimated by inst count
SP_INST_QUADRATIC, /// cost is estimated by the square of the inst count
SP_BB_LINEAR, /// cost is estimated by BB count
SP_BB_QUADRATIC, /// cost is estimated by the square of the BB count
};
/// Return the managed thread pool, initializing it if it has not been
/// initialized yet.
ThreadPool &getThreadPool();
/// Perform the work on each BinaryFunction except those for which
/// SkipPredicate returns true; the scheduling heuristic is based on
/// SchedPolicy.
/// ForceSequential will selectively disable parallel execution and perform the
/// work sequentially.
void runOnEachFunction(BinaryContext &BC, SchedulingPolicy SchedPolicy,
WorkFuncTy WorkFunction,
PredicateTy SkipPredicate = PredicateTy(),
std::string LogName = "", bool ForceSequential = false,
unsigned TasksPerThread = opts::TaskCount);
/// Perform the work on each BinaryFunction except those for which
/// SkipPredicate returns true, and create a unique annotation allocator for
/// each task. This should be used whenever the work function creates
/// annotations to allow thread-safe annotation creation.
/// ForceSequential will selectively disable parallel execution and perform the
/// work sequentially.
void runOnEachFunctionWithUniqueAllocId(
BinaryContext &BC, SchedulingPolicy SchedPolicy,
WorkFuncWithAllocTy WorkFunction, PredicateTy SkipPredicate,
std::string LogName = "", bool ForceSequential = false,
unsigned TasksPerThread = opts::TaskCount);
} // namespace ParallelUtilities
} // namespace bolt
} // namespace llvm
#endif

View File

@ -10,7 +10,6 @@
//===----------------------------------------------------------------------===//
#include "Aligner.h"
#include "ParallelUtilities.h"
#define DEBUG_TYPE "bolt-aligner"
@ -89,16 +88,16 @@ void alignMaxBytes(BinaryFunction &Function) {
// the fuction by not more than the minimum over
// -- the size of the function
// -- the specified number of bytes
void alignCompact(BinaryFunction &Function, const MCCodeEmitter *Emitter) {
void alignCompact(BinaryFunction &Function) {
const auto &BC = Function.getBinaryContext();
size_t HotSize = 0;
size_t ColdSize = 0;
for (const auto *BB : Function.layout()) {
if (BB->isCold())
ColdSize += BC.computeCodeSize(BB->begin(), BB->end(), Emitter);
ColdSize += BC.computeCodeSize(BB->begin(), BB->end());
else
HotSize += BC.computeCodeSize(BB->begin(), BB->end(), Emitter);
HotSize += BC.computeCodeSize(BB->begin(), BB->end());
}
Function.setAlignment(opts::AlignFunctions);
@ -115,15 +114,13 @@ void alignCompact(BinaryFunction &Function, const MCCodeEmitter *Emitter) {
} // end anonymous namespace
void AlignerPass::alignBlocks(BinaryFunction &Function,
const MCCodeEmitter *Emitter) {
void AlignerPass::alignBlocks(BinaryFunction &Function) {
if (!Function.hasValidProfile() || !Function.isSimple())
return;
const auto &BC = Function.getBinaryContext();
const auto FuncCount =
std::max<uint64_t>(1, Function.getKnownExecutionCount());
const auto FuncCount = std::max(1UL, Function.getKnownExecutionCount());
BinaryBasicBlock *PrevBB{nullptr};
for (auto *BB : Function.layout()) {
auto Count = BB->getKnownExecutionCount();
@ -142,9 +139,8 @@ void AlignerPass::alignBlocks(BinaryFunction &Function,
if (Count < FTCount * 2)
continue;
const auto BlockSize = BC.computeCodeSize(BB->begin(), BB->end(), Emitter);
const auto BytesToUse =
std::min<uint64_t>(opts::BlockAlignment - 1, BlockSize);
const auto BlockSize = BC.computeCodeSize(BB->begin(), BB->end());
const auto BytesToUse = std::min(opts::BlockAlignment - 1UL, BlockSize);
if (opts::AlignBlocksMinSize && BlockSize < opts::AlignBlocksMinSize)
continue;
@ -153,36 +149,30 @@ void AlignerPass::alignBlocks(BinaryFunction &Function,
BB->setAlignmentMaxBytes(BytesToUse);
// Update stats.
DEBUG(
std::unique_lock<std::shared_timed_mutex> Lock(AlignHistogramMtx);
AlignHistogram[BytesToUse]++;
AlignedBlocksCount += BB->getKnownExecutionCount();
);
AlignHistogram[BytesToUse]++;
AlignedBlocksCount += BB->getKnownExecutionCount();
}
}
void AlignerPass::runOnFunctions(BinaryContext &BC) {
void AlignerPass::runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) {
if (!BC.HasRelocations)
return;
AlignHistogram.resize(opts::BlockAlignment);
ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
// Create a separate MCCodeEmitter to allow lock free execution
auto Emitter = BC.createIndependentMCCodeEmitter();
for (auto &It : BFs) {
auto &Function = It.second;
if (opts::UseCompactAligner)
alignCompact(BF, Emitter.MCE.get());
alignCompact(Function);
else
alignMaxBytes(BF);
alignMaxBytes(Function);
if (opts::AlignBlocks && !opts::PreserveBlocksAlignment)
alignBlocks(BF, Emitter.MCE.get());
};
ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun,
ParallelUtilities::PredicateTy(nullptr), "AlignerPass");
alignBlocks(Function);
}
DEBUG(
dbgs() << "BOLT-DEBUG: max bytes per basic block alignment distribution:\n";

View File

@ -19,15 +19,15 @@ namespace bolt {
class AlignerPass : public BinaryFunctionPass {
private:
/// Stats for usage of max bytes for basic block alignment.
std::vector<uint32_t> AlignHistogram;
std::shared_timed_mutex AlignHistogramMtx;
/// Stats: execution count of blocks that were aligned.
std::atomic<uint64_t> AlignedBlocksCount{0};
uint64_t AlignedBlocksCount{0};
/// Assign alignment to basic blocks based on profile.
void alignBlocks(BinaryFunction &Function, const MCCodeEmitter *Emitter);
void alignBlocks(BinaryFunction &Function);
public:
explicit AlignerPass() : BinaryFunctionPass(false) {}
@ -37,7 +37,9 @@ public:
}
/// Pass entry point
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
} // namespace bolt

View File

@ -100,13 +100,14 @@ void AllocCombinerPass::combineAdjustments(BinaryContext &BC,
}
}
void AllocCombinerPass::runOnFunctions(BinaryContext &BC) {
void AllocCombinerPass::runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) {
if (opts::FrameOptimization == FOP_NONE)
return;
runForAllWeCare(
BC.getBinaryFunctions(),
[&](BinaryFunction &Function) { combineAdjustments(BC, Function); });
BFs, [&](BinaryFunction &Function) { combineAdjustments(BC, Function); });
outs() << "BOLT-INFO: Allocation combiner: " << NumCombined
<< " empty spaces coalesced.\n";

View File

@ -40,7 +40,9 @@ public:
}
/// Pass entry point
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
} // namespace bolt

View File

@ -77,6 +77,7 @@ std::deque<BinaryFunction *> BinaryFunctionCallGraph::buildTraversalOrder() {
}
BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
CgFilterFunction Filter,
bool CgFromPerfData,
bool IncludeColdCalls,
@ -125,7 +126,7 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC,
uint64_t NoProfileCallsites = 0;
uint64_t NumFallbacks = 0;
uint64_t RecursiveCallsites = 0;
for (auto &It : BC.getBinaryFunctions()) {
for (auto &It : BFs) {
auto *Function = &It.second;
if (Filter(*Function)) {

View File

@ -57,7 +57,7 @@ private:
using CgFilterFunction = std::function<bool (const BinaryFunction &BF)>;
inline bool NoFilter(const BinaryFunction &) { return false; }
/// Builds a call graph from the map of BinaryFunctions provided in BC.
/// Builds a call graph from the map of BinaryFunctions provided in BFs.
/// The arguments control how the graph is constructed.
/// Filter is called on each function, any function that it returns true for
/// is omitted from the graph.
@ -68,6 +68,7 @@ inline bool NoFilter(const BinaryFunction &) { return false; }
/// UseEdgeCounts is used to control if the Weight attribute on Arcs is computed
/// using the number of calls.
BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
CgFilterFunction Filter = NoFilter,
bool CgFromPerfData = false,
bool IncludeColdCalls = true,

View File

@ -10,12 +10,9 @@
//===----------------------------------------------------------------------===//
#include "BinaryPasses.h"
#include "ParallelUtilities.h"
#include "Passes/ReorderAlgorithm.h"
#include "llvm/Support/Options.h"
#include <numeric>
#include <vector>
#define DEBUG_TYPE "bolt-opts"
@ -57,10 +54,8 @@ extern cl::OptionCategory BoltOptCategory;
extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion;
extern cl::opt<unsigned> Verbosity;
extern cl::opt<bool> SplitEH;
extern cl::opt<bool> EnableBAT;
extern cl::opt<bolt::BinaryFunction::SplittingType> SplitFunctions;
extern bool shouldProcess(const bolt::BinaryFunction &Function);
extern bool isHotTextMover(const bolt::BinaryFunction &Function);
enum DynoStatsSortOrder : char {
Ascending,
@ -139,22 +134,6 @@ PrintSortedBy("print-sorted-by",
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
static cl::opt<bool>
PrintUnknown("print-unknown",
cl::desc("print names of functions with unknown control flow"),
cl::init(false),
cl::ZeroOrMore,
cl::cat(BoltCategory),
cl::Hidden);
static cl::opt<bool>
PrintUnknownCFG("print-unknown-cfg",
cl::desc("dump CFG of functions with unknown control flow"),
cl::init(false),
cl::ZeroOrMore,
cl::cat(BoltCategory),
cl::ReallyHidden);
static cl::opt<bolt::ReorderBasicBlocks::LayoutType>
ReorderBlocks("reorder-blocks",
cl::desc("change layout of basic blocks in a function"),
@ -288,7 +267,7 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) {
if (!BB->isValid()) {
dbgs() << "BOLT-INFO: UCE found unreachable block " << BB->getName()
<< " in function " << Function << "\n";
Function.dump();
BB->dump();
}
}
});
@ -296,10 +275,7 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) {
DeletedBlocks += Count;
DeletedBytes += Bytes;
if (Count) {
{
std::unique_lock<std::shared_timed_mutex> Lock(ModifiedMtx);
Modified.insert(&Function);
}
Modified.insert(&Function);
if (opts::Verbosity > 0) {
outs() << "BOLT-INFO: Removed " << Count
<< " dead basic block(s) accounting for " << Bytes
@ -309,19 +285,17 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) {
}
}
void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
runOnFunction(BF);
};
ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
return !shouldOptimize(BF);
};
ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun, SkipFunc,
"EliminateUnreachableBlocks");
void EliminateUnreachableBlocks::runOnFunctions(
BinaryContext&,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &
) {
for (auto &It : BFs) {
auto &Function = It.second;
if (shouldOptimize(Function)) {
runOnFunction(Function);
}
}
outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
<< DeletedBytes << " bytes of code.\n";
}
@ -331,43 +305,43 @@ bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const {
opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE);
}
void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
void ReorderBasicBlocks::runOnFunctions(
BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) {
if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
return;
IsAArch64 = BC.isAArch64();
std::atomic<uint64_t> ModifiedFuncCount{0};
ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
uint64_t ModifiedFuncCount = 0;
for (auto &It : BFs) {
auto &Function = It.second;
if (!shouldOptimize(Function))
continue;
const bool ShouldSplit =
(opts::SplitFunctions == BinaryFunction::ST_ALL) ||
(opts::SplitFunctions == BinaryFunction::ST_EH && BF.hasEHRanges()) ||
BF.shouldSplit();
modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters,
(opts::SplitFunctions == BinaryFunction::ST_ALL) ||
(opts::SplitFunctions == BinaryFunction::ST_EH &&
Function.hasEHRanges()) ||
(LargeFunctions.find(It.first) != LargeFunctions.end());
modifyFunctionLayout(Function, opts::ReorderBlocks, opts::MinBranchClusters,
ShouldSplit);
if (BF.hasLayoutChanged()) {
if (Function.hasLayoutChanged()) {
++ModifiedFuncCount;
}
};
ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
return !shouldOptimize(BF);
};
ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
"ReorderBasicBlocks");
}
outs() << "BOLT-INFO: basic block reordering modified layout of "
<< format("%zu (%.2lf%%) functions\n", ModifiedFuncCount.load(),
100.0 * ModifiedFuncCount.load() /
BC.getBinaryFunctions().size());
<< format("%zu (%.2lf%%) functions\n",
ModifiedFuncCount, 100.0 * ModifiedFuncCount / BFs.size());
if (opts::PrintFuncStat > 0) {
raw_ostream &OS = outs();
// Copy all the values into vector in order to sort them
std::map<uint64_t, BinaryFunction &> ScoreMap;
auto &BFs = BC.getBinaryFunctions();
for (auto It = BFs.begin(); It != BFs.end(); ++It) {
ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>(
It->second.getFunctionScore(), It->second));
@ -375,8 +349,8 @@ void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
OS << " There are " << BFs.size() << " functions in total. \n";
OS << " Number of functions being modified: "
<< ModifiedFuncCount.load() << "\n";
OS << " Number of functions being modified: " << ModifiedFuncCount
<< "\n";
OS << " User asks for detailed information on top "
<< opts::PrintFuncStat << " functions. (Ranked by function score)"
<< "\n\n";
@ -576,8 +550,11 @@ void ReorderBasicBlocks::splitFunction(BinaryFunction &BF) const {
}
}
void FixupBranches::runOnFunctions(BinaryContext &BC) {
for (auto &It : BC.getBinaryFunctions()) {
void FixupBranches::runOnFunctions(
BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &) {
for (auto &It : BFs) {
auto &Function = It.second;
if (BC.HasRelocations || shouldOptimize(Function)) {
if (BC.HasRelocations && !Function.isSimple())
@ -587,38 +564,42 @@ void FixupBranches::runOnFunctions(BinaryContext &BC) {
}
}
void FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
if (shouldOptimize(BF) && !BF.finalizeCFIState()) {
void FinalizeFunctions::runOnFunctions(
BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &
) {
for (auto &It : BFs) {
auto &Function = It.second;
const auto ShouldOptimize = shouldOptimize(Function);
// Always fix functions in relocation mode.
if (!BC.HasRelocations && !ShouldOptimize)
continue;
// Fix the CFI state.
if (ShouldOptimize && !Function.finalizeCFIState()) {
if (BC.HasRelocations) {
errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
<< ". Exiting.\n";
errs() << "BOLT-ERROR: unable to fix CFI state for function "
<< Function << ". Exiting.\n";
exit(1);
}
BF.setSimple(false);
return;
Function.setSimple(false);
continue;
}
BF.setFinalized();
Function.setFinalized();
// Update exception handling information.
BF.updateEHRanges();
};
ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
return !BC.HasRelocations && !shouldOptimize(BF);
};
ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
SkipPredicate, "FinalizeFunctions");
Function.updateEHRanges();
}
}
void LowerAnnotations::runOnFunctions(BinaryContext &BC) {
std::vector<std::pair<MCInst *, uint64_t>> PreservedSDTAnnotations;
std::vector<std::pair<MCInst *, uint32_t>> PreservedOffsetAnnotations;
for (auto &It : BC.getBinaryFunctions()) {
void LowerAnnotations::runOnFunctions(
BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &) {
for (auto &It : BFs) {
auto &BF = It.second;
int64_t CurrentGnuArgsSize = 0;
@ -631,12 +612,9 @@ void LowerAnnotations::runOnFunctions(BinaryContext &BC) {
CurrentGnuArgsSize = 0;
}
// First convert GnuArgsSize annotations into CFIs. This may change instr
// pointers, so do it before recording ptrs for preserved annotations
if (BF.usesGnuArgsSize()) {
for (auto II = BB->begin(); II != BB->end(); ++II) {
if (!BC.MIB->isInvoke(*II))
continue;
for (auto II = BB->begin(); II != BB->end(); ++II) {
// Convert GnuArgsSize annotations into CFIs.
if (BF.usesGnuArgsSize() && BC.MIB->isInvoke(*II)) {
const auto NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II);
assert(NewGnuArgsSize >= 0 && "expected non-negative GNU_args_size");
if (NewGnuArgsSize != CurrentGnuArgsSize) {
@ -646,33 +624,13 @@ void LowerAnnotations::runOnFunctions(BinaryContext &BC) {
II = std::next(InsertII);
}
}
}
// Now record preserved annotations separately and then strip annotations
for (auto II = BB->begin(); II != BB->end(); ++II) {
if (BC.MIB->hasAnnotation(*II, "SDTMarker")) {
PreservedSDTAnnotations.push_back(std::make_pair(
&(*II), BC.MIB->getAnnotationAs<uint64_t>(*II, "SDTMarker")));
}
if (opts::EnableBAT && BC.MIB->hasAnnotation(*II, "Offset")) {
PreservedOffsetAnnotations.push_back(std::make_pair(
&(*II), BC.MIB->getAnnotationAs<uint32_t>(*II, "Offset")));
}
BC.MIB->stripAnnotations(*II);
BC.MIB->removeAllAnnotations(*II);
}
}
}
// Release all memory taken by annotations
// Release all memory taken by annotations.
BC.MIB->freeAnnotations();
// Reinsert preserved annotations we need during code emission.
for (const auto &Item : PreservedSDTAnnotations)
BC.MIB->addAnnotation<uint64_t>(*Item.first, "SDTMarker", Item.second);
for (const auto &Item : PreservedOffsetAnnotations)
BC.MIB->addAnnotation<uint32_t>(*Item.first, "Offset", Item.second);
}
namespace {
@ -1026,11 +984,15 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC,
return NumLocalCTCs > 0;
}
void SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
void SimplifyConditionalTailCalls::runOnFunctions(
BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &
) {
if (!BC.isX86())
return;
for (auto &It : BC.getBinaryFunctions()) {
for (auto &It : BFs) {
auto &Function = It.second;
if (!shouldOptimize(Function))
@ -1118,7 +1080,9 @@ void Peepholes::removeUselessCondBranches(BinaryContext &BC,
}
}
void Peepholes::runOnFunctions(BinaryContext &BC) {
void Peepholes::runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) {
const char Opts =
std::accumulate(opts::Peepholes.begin(),
opts::Peepholes.end(),
@ -1129,7 +1093,7 @@ void Peepholes::runOnFunctions(BinaryContext &BC) {
if (Opts == opts::PEEP_NONE || !BC.isX86())
return;
for (auto &It : BC.getBinaryFunctions()) {
for (auto &It : BFs) {
auto &Function = It.second;
if (shouldOptimize(Function)) {
if (Opts & opts::PEEP_SHORTEN)
@ -1233,8 +1197,12 @@ bool SimplifyRODataLoads::simplifyRODataLoads(
return NumLocalLoadsSimplified > 0;
}
void SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
for (auto &It : BC.getBinaryFunctions()) {
void SimplifyRODataLoads::runOnFunctions(
BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &
) {
for (auto &It : BFs) {
auto &Function = It.second;
if (shouldOptimize(Function) && simplifyRODataLoads(BC, Function)) {
Modified.insert(&Function);
@ -1248,156 +1216,24 @@ void SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
<< "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n";
}
/// Pick the output code section names for the functions in BC.
///
/// Injected functions always get the injected hot/cold section names. All
/// other functions are only touched in relocation mode: hot-text-mover
/// functions go to the hot-text-mover section, and the rest go to the main
/// code section unless profiled functions exist and the function has neither
/// a valid index nor a valid profile, in which case it goes to the cold
/// section. Split functions additionally get the cold section as their cold
/// part.
void AssignSections::runOnFunctions(BinaryContext &BC) {
  for (auto *Injected : BC.getInjectedBinaryFunctions()) {
    Injected->setCodeSectionName(BC.getInjectedCodeSectionName());
    Injected->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
  }

  // In non-relocation mode functions have pre-assigned section names.
  if (!BC.HasRelocations)
    return;

  // A separate cold section is only used when some function has a profile.
  const bool HaveProfiledFuncs = BC.NumProfiledFuncs > 0;

  for (auto &Entry : BC.getBinaryFunctions()) {
    auto &BF = Entry.second;

    if (opts::isHotTextMover(BF)) {
      // Both parts of a hot-text mover live in the dedicated section.
      BF.setCodeSectionName(BC.getHotTextMoverSectionName());
      BF.setColdCodeSectionName(BC.getHotTextMoverSectionName());
    } else {
      const bool GoesToMain =
          !HaveProfiledFuncs || BF.hasValidIndex() || BF.hasValidProfile();
      if (GoesToMain)
        BF.setCodeSectionName(BC.getMainCodeSectionName());
      else
        BF.setCodeSectionName(BC.getColdCodeSectionName());

      if (BF.isSplit())
        BF.setColdCodeSectionName(BC.getColdCodeSectionName());
    }
  }
}
/// Print a profile bias score: the mean and standard deviation, over all
/// scored basic blocks of non-empty simple functions, of the relative
/// difference between a block's outgoing and incoming edge counts.
///
/// A block is scored when its incoming count is at least 100, its outgoing
/// count is non-zero and it is not an entry point. The same filter is applied
/// to both the mean and the standard deviation so that the deviation is taken
/// over exactly the blocks the mean was computed from.
///
/// With opts::Verbosity >= 1 the function with the worst per-function average
/// is also reported.
void PrintProfileStats::runOnFunctions(BinaryContext &BC) {
  // For each function CFG, we fill an IncomingMap with the sum of the frequency
  // of incoming edges for each BB. Likewise for each OutgoingMap and the sum
  // of the frequency of outgoing edges.
  using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;

  // Do not compute score for low frequency blocks, entry or exit blocks.
  auto isScored = [](FlowMapTy &Incoming, FlowMapTy &Outgoing,
                     const BinaryBasicBlock &BB) {
    return Incoming[&BB] >= 100 && Outgoing[&BB] != 0 && !BB.isEntryPoint();
  };

  // |outgoing - incoming| relative to the incoming count of the block.
  auto relativeImbalance = [](FlowMapTy &Incoming, FlowMapTy &Outgoing,
                              const BinaryBasicBlock &BB) {
    const double Difference =
        static_cast<double>(Outgoing[&BB]) - Incoming[&BB];
    return fabs(Difference / Incoming[&BB]);
  };

  double FlowImbalanceMean = 0.0;
  size_t NumBlocksConsidered = 0;
  double WorstBias = 0.0;
  const BinaryFunction *WorstBiasFunc = nullptr;

  // Compute mean
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;

    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];

    // Accumulate edge counts; successors and branch info are walked in step.
    for (const auto &BB : Function) {
      uint64_t TotalOutgoing = 0;
      auto SuccBIIter = BB.branch_info_begin();
      for (auto Succ : BB.successors()) {
        const auto Count = SuccBIIter->Count;
        ++SuccBIIter;
        if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0)
          continue;
        TotalOutgoing += Count;
        IncomingMap[Succ] += Count;
      }
      OutgoingMap[&BB] = TotalOutgoing;
    }

    size_t NumBlocks = 0;
    double ImbalanceSum = 0.0;
    for (const auto &BB : Function) {
      if (!isScored(IncomingMap, OutgoingMap, BB))
        continue;
      ++NumBlocks;
      ImbalanceSum += relativeImbalance(IncomingMap, OutgoingMap, BB);
    }

    FlowImbalanceMean += ImbalanceSum;
    NumBlocksConsidered += NumBlocks;
    if (!NumBlocks)
      continue;

    const double FuncMean = ImbalanceSum / NumBlocks;
    if (FuncMean > WorstBias) {
      WorstBias = FuncMean;
      WorstBiasFunc = &Function;
    }
  }
  if (NumBlocksConsidered > 0)
    FlowImbalanceMean /= NumBlocksConsidered;

  // Compute standard deviation
  NumBlocksConsidered = 0;
  double FlowImbalanceVar = 0.0;
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;

    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const auto &BB : Function) {
      if (!isScored(IncomingMap, OutgoingMap, BB))
        continue;
      ++NumBlocksConsidered;
      const double Deviation =
          relativeImbalance(IncomingMap, OutgoingMap, BB) - FlowImbalanceMean;
      FlowImbalanceVar += Deviation * Deviation;
    }
  }
  if (NumBlocksConsidered) {
    FlowImbalanceVar /= NumBlocksConsidered;
    FlowImbalanceVar = sqrt(FlowImbalanceVar);
  }

  // Report to user
  outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
                   (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar));
  if (WorstBiasFunc && opts::Verbosity >= 1) {
    outs() << "Worst average bias observed in " << WorstBiasFunc->getPrintName()
           << "\n";
    DEBUG(WorstBiasFunc->dump());
  }
}
void
PrintProgramStats::runOnFunctions(BinaryContext &BC) {
PrintProgramStats::runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &) {
uint64_t NumSimpleFunctions{0};
uint64_t NumStaleProfileFunctions{0};
uint64_t NumNonSimpleProfiledFunctions{0};
uint64_t NumUnknownControlFlowFunctions{0};
std::vector<BinaryFunction *> ProfiledFunctions;
const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
for (auto &BFI : BC.getBinaryFunctions()) {
for (auto &BFI : BFs) {
auto &Function = BFI.second;
if (!Function.isSimple()) {
if (Function.hasProfile() && !Function.isPLTFunction()) {
if (Function.hasProfile()) {
++NumNonSimpleProfiledFunctions;
}
continue;
}
++NumSimpleFunctions;
if (Function.hasUnknownControlFlow()) {
if (opts::PrintUnknownCFG) {
Function.dump();
} else if (opts::PrintUnknown) {
errs() << "function with unknown control flow: " << Function <<'\n';
}
++NumUnknownControlFlowFunctions;
}
if (!Function.hasProfile())
continue;
if (Function.hasValidProfile()) {
@ -1485,11 +1321,11 @@ PrintProgramStats::runOnFunctions(BinaryContext &BC) {
std::vector<const BinaryFunction *> Functions;
std::map<const BinaryFunction *, DynoStats> Stats;
for (const auto &BFI : BC.getBinaryFunctions()) {
for (const auto &BFI : BFs) {
const auto &BF = BFI.second;
if (shouldOptimize(BF) && BF.hasValidProfile()) {
Functions.push_back(&BF);
Stats.emplace(&BF, getDynoStats(BF));
Stats.emplace(&BF, BF.getDynoStats());
}
}
@ -1541,7 +1377,7 @@ PrintProgramStats::runOnFunctions(BinaryContext &BC) {
outs() << " are:\n";
auto SFI = Functions.begin();
for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) {
const auto Stats = getDynoStats(**SFI);
const auto Stats = (*SFI)->getDynoStats();
outs() << " " << **SFI;
if (!SortAll) {
outs() << " (";
@ -1591,13 +1427,12 @@ PrintProgramStats::runOnFunctions(BinaryContext &BC) {
// Collect and print information about suboptimal code layout on input.
if (opts::ReportBadLayout) {
std::vector<const BinaryFunction *> SuboptimalFuncs;
for (auto &BFI : BC.getBinaryFunctions()) {
for (auto &BFI : BFs) {
const auto &BF = BFI.second;
if (!BF.hasValidProfile())
continue;
const auto HotThreshold =
std::max<uint64_t>(BF.getKnownExecutionCount(), 1);
const auto HotThreshold = std::max(BF.getKnownExecutionCount(), 1UL);
bool HotSeen = false;
for (const auto *BB : BF.rlayout()) {
if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) {
@ -1628,19 +1463,13 @@ PrintProgramStats::runOnFunctions(BinaryContext &BC) {
}
}
}
if (NumUnknownControlFlowFunctions) {
outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
<< " functions have instructions with unknown control flow";
if (!opts::PrintUnknown) {
outs() << ". Use -print-unknown to see the list.";
}
outs() << '\n';
}
}
void InstructionLowering::runOnFunctions(BinaryContext &BC) {
for (auto &BFI : BC.getBinaryFunctions()) {
void InstructionLowering::runOnFunctions(
BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) {
for (auto &BFI : BFs) {
for (auto &BB : BFI.second) {
for (auto &Instruction : BB) {
BC.MIB->lowerTailCall(Instruction);
@ -1649,10 +1478,13 @@ void InstructionLowering::runOnFunctions(BinaryContext &BC) {
}
}
void StripRepRet::runOnFunctions(BinaryContext &BC) {
void StripRepRet::runOnFunctions(
BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) {
uint64_t NumPrefixesRemoved = 0;
uint64_t NumBytesSaved = 0;
for (auto &BFI : BC.getBinaryFunctions()) {
for (auto &BFI : BFs) {
for (auto &BB : BFI.second) {
auto LastInstRIter = BB.getLastNonPseudo();
if (LastInstRIter == BB.rend() ||
@ -1672,15 +1504,17 @@ void StripRepRet::runOnFunctions(BinaryContext &BC) {
}
}
void InlineMemcpy::runOnFunctions(BinaryContext &BC) {
void InlineMemcpy::runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) {
if (!BC.isX86())
return;
uint64_t NumInlined = 0;
uint64_t NumInlinedDyno = 0;
for (auto &BFI : BC.getBinaryFunctions()) {
for (auto &BFI : BFs) {
for (auto &BB : BFI.second) {
for (auto II = BB.begin(); II != BB.end(); ++II) {
for(auto II = BB.begin(); II != BB.end(); ++II) {
auto &Inst = *II;
if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
@ -1720,139 +1554,5 @@ void InlineMemcpy::runOnFunctions(BinaryContext &BC) {
}
}
bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const {
if (!BinaryFunctionPass::shouldOptimize(Function))
return false;
for (auto &FunctionSpec : Spec) {
auto FunctionName = StringRef(FunctionSpec).split(':').first;
if (Function.hasNameRegex(FunctionName))
return true;
}
return false;
}
std::set<size_t>
SpecializeMemcpy1::getCallSitesToOptimize(const BinaryFunction &Function) const{
StringRef SitesString;
for (auto &FunctionSpec : Spec) {
StringRef FunctionName;
std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':');
if (Function.hasNameRegex(FunctionName))
break;
SitesString = "";
}
std::set<size_t> Sites;
SmallVector<StringRef, 4> SitesVec;
SitesString.split(SitesVec, ':');
for (auto SiteString : SitesVec) {
if (SiteString.empty())
continue;
size_t Result;
if (!SiteString.getAsInteger(10, Result))
Sites.emplace(Result);
}
return Sites;
}
/// Specialize selected memcpy() call sites for a copy size of 1 (x86 only).
///
/// For every selected non-tail call to "memcpy" / "memcpy@PLT", the block is
/// split at the call, the current block gets a compare-and-jump of integer
/// argument register 2 against the constant 1 that targets a block filled
/// with the instructions from createOneByteMemcpy(), and a copy of the
/// original call is placed in a newly added block. Prints the number of
/// specialized call sites when it is non-zero.
void SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
if (!BC.isX86())
return;
uint64_t NumSpecialized = 0;
uint64_t NumSpecializedDyno = 0;
for (auto &BFI : BC.getBinaryFunctions()) {
auto &Function = BFI.second;
if (!shouldOptimize(Function))
continue;
auto CallsToOptimize = getCallSitesToOptimize(Function);
// NOTE: this lambda shadows the member shouldOptimize() called above; from
// here on the name refers to the per-call-site filter. An empty site set
// selects every call site.
auto shouldOptimize = [&](size_t N) {
return CallsToOptimize.empty() || CallsToOptimize.count(N);
};
// Iterate over a copy of the block pointers; the loop body below splits
// blocks and adds new ones to the function.
std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend());
size_t CallSiteID = 0;
for (auto *CurBB : Blocks) {
for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
auto &Inst = *II;
// Only consider calls with a single prime operand that is an expression.
if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
!Inst.getOperand(0).isExpr())
continue;
const auto *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
if (CalleeSymbol->getName() != "memcpy" &&
CalleeSymbol->getName() != "memcpy@PLT")
continue;
if (BC.MIB->isTailCall(Inst))
continue;
// Call sites are numbered from 1 in visiting order; sites rejected by the
// filter below still consume a number.
++CallSiteID;
if (!shouldOptimize(CallSiteID))
continue;
// Create a copy of a call to memcpy(dest, src, size).
auto MemcpyInstr = Inst;
// Split at the call; presumably the new block starts with the call
// instruction, which is erased from it below - confirm against splitAt().
auto *OneByteMemcpyBB = CurBB->splitAt(II);
BinaryBasicBlock *NextBB{nullptr};
if (OneByteMemcpyBB->getNumNonPseudos() > 1) {
// More instructions follow the call: split again and drop the first
// instruction of the resulting block.
NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin());
NextBB->eraseInstruction(NextBB->begin());
} else {
// Otherwise continue in the block's successor and erase the first
// instruction of the split-off block.
NextBB = OneByteMemcpyBB->getSuccessor();
OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin());
assert(NextBB && "unexpected call to memcpy() with no return");
}
// New block that will hold the copy of the original memcpy() call.
auto *MemcpyBB = Function.addBasicBlock(CurBB->getInputOffset());
// Compare integer argument register 2 with 1 and jump to the one-byte
// block on equality.
auto CmpJCC = BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2),
1,
OneByteMemcpyBB->getLabel(),
BC.Ctx.get());
CurBB->addInstructions(CmpJCC);
CurBB->addSuccessor(MemcpyBB);
MemcpyBB->addInstruction(std::move(MemcpyInstr));
MemcpyBB->addSuccessor(NextBB);
MemcpyBB->setCFIState(NextBB->getCFIState());
MemcpyBB->setExecutionCount(0);
// To prevent the actual call from being moved to cold, we set its
// execution count to 1.
if (CurBB->getKnownExecutionCount() > 0)
MemcpyBB->setExecutionCount(1);
auto OneByteMemcpy = BC.MIB->createOneByteMemcpy();
OneByteMemcpyBB->addInstructions(OneByteMemcpy);
++NumSpecialized;
NumSpecializedDyno += CurBB->getKnownExecutionCount();
// Continue scanning in the block that follows the specialized call. The
// loop's ++II then advances past that block's first instruction.
CurBB = NextBB;
// Note: we don't expect the next instruction to be a call to memcpy.
II = CurBB->begin();
}
}
}
// Report only when at least one call site was specialized.
if (NumSpecialized) {
outs() << "BOLT-INFO: specialized " << NumSpecialized
<< " memcpy() call sites for size 1";
if (NumSpecializedDyno)
outs() << ". The calls were executed " << NumSpecializedDyno
<< " times based on profile.";
outs() << '\n';
}
}
} // namespace bolt
} // namespace llvm

View File

@ -16,10 +16,9 @@
#include "BinaryContext.h"
#include "BinaryFunction.h"
#include "DynoStats.h"
#include "HFSort.h"
#include "llvm/Support/CommandLine.h"
#include <atomic>
#include <map>
#include <set>
#include <string>
@ -39,7 +38,7 @@ protected:
/// Control whether a specific function should be skipped during
/// optimization.
virtual bool shouldOptimize(const BinaryFunction &BF) const;
bool shouldOptimize(const BinaryFunction &BF) const;
public:
virtual ~BinaryFunctionPass() = default;
@ -54,7 +53,9 @@ public:
virtual bool shouldPrint(const BinaryFunction &BF) const;
/// Execute this pass on the given functions.
virtual void runOnFunctions(BinaryContext &BC) = 0;
virtual void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) = 0;
};
/// A pass to print program-wide dynostats.
@ -78,8 +79,10 @@ public:
return false;
}
void runOnFunctions(BinaryContext &BC) override {
const auto NewDynoStats = getDynoStats(BC.getBinaryFunctions());
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override {
const auto NewDynoStats = getDynoStats(BFs);
const auto Changed = (NewDynoStats != PrevDynoStats);
outs() << "BOLT-INFO: program-wide dynostats "
<< Title << (Changed ? "" : " (no change)") << ":\n\n"
@ -95,10 +98,9 @@ public:
/// Detect and eliminate unreachable basic blocks. We could have those
/// filled with nops and they are used for alignment.
class EliminateUnreachableBlocks : public BinaryFunctionPass {
std::shared_timed_mutex ModifiedMtx;
std::unordered_set<const BinaryFunction *> Modified;
std::atomic<unsigned> DeletedBlocks{0};
std::atomic<uint64_t> DeletedBytes{0};
unsigned DeletedBlocks{0};
uint64_t DeletedBytes{0};
void runOnFunction(BinaryFunction& Function);
public:
EliminateUnreachableBlocks(const cl::opt<bool> &PrintPass)
@ -110,7 +112,9 @@ class EliminateUnreachableBlocks : public BinaryFunctionPass {
bool shouldPrint(const BinaryFunction &BF) const override {
return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
}
void runOnFunctions(BinaryContext&) override;
void runOnFunctions(BinaryContext&,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
// Reorder the basic blocks for each function based on hotness.
@ -160,7 +164,9 @@ public:
return "reordering";
}
bool shouldPrint(const BinaryFunction &BF) const override;
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
/// Sync local branches with CFG.
@ -172,7 +178,9 @@ class FixupBranches : public BinaryFunctionPass {
const char *getName() const override {
return "fix-branches";
}
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
/// Fix the CFI state and exception handling information after all other
@ -185,7 +193,9 @@ class FinalizeFunctions : public BinaryFunctionPass {
const char *getName() const override {
return "finalize-functions";
}
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
/// Convert and remove all BOLT-related annotations before LLVM code emission.
@ -197,7 +207,9 @@ class LowerAnnotations : public BinaryFunctionPass {
const char *getName() const override {
return "lower-annotations";
}
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
/// An optimization to simplify conditional tail calls by removing
@ -269,7 +281,9 @@ class SimplifyConditionalTailCalls : public BinaryFunctionPass {
bool shouldPrint(const BinaryFunction &BF) const override {
return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
}
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
/// Perform simple peephole optimizations.
@ -299,7 +313,9 @@ public:
const char *getName() const override {
return "peepholes";
}
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
/// An optimization to simplify loads from read-only sections. The pass converts
@ -307,7 +323,7 @@ public:
///
/// mov 0x12f(%rip), %eax
///
/// to their counterparts that use immediate operands instead of memory loads:
/// to their counterparts that use immediate operands instead of memory loads:
///
/// mov $0x4007dc, %eax
///
@ -332,39 +348,9 @@ public:
bool shouldPrint(const BinaryFunction &BF) const override {
return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
}
void runOnFunctions(BinaryContext &BC) override;
};
/// Pass that assigns an output section to every function.
class AssignSections : public BinaryFunctionPass {
public:
  /// Hands a constant `false` to the base-class constructor rather than
  /// forwarding a print option.
  explicit AssignSections() : BinaryFunctionPass(false) {}

  /// Identifier of this pass.
  const char *getName() const override { return "assign-sections"; }

  /// Entry point of the pass; defined out of line.
  void runOnFunctions(BinaryContext &BC) override;
};
/// Compute and report to the user the imbalance in flow equations for all
/// CFGs, so we can detect bad quality profile. Prints average and standard
/// deviation of the absolute differences of outgoing flow minus incoming flow
/// for blocks of interest (excluding prologues, epilogues, and BB frequency
/// lower than 100).
class PrintProfileStats : public BinaryFunctionPass {
public:
explicit PrintProfileStats(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) { }
const char *getName() const override {
return "profile-stats";
}
bool shouldPrint(const BinaryFunction &) const override {
return false;
}
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
/// Prints a list of the top 100 functions sorted by a set of
@ -380,7 +366,9 @@ class PrintProgramStats : public BinaryFunctionPass {
bool shouldPrint(const BinaryFunction &) const override {
return false;
}
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
/// Pass for lowering any instructions that we have raised and that have
@ -394,7 +382,9 @@ public:
return "inst-lowering";
}
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
/// Pass for stripping 'repz' from 'repz retq' sequence of instructions.
@ -407,7 +397,9 @@ public:
return "strip-rep-ret";
}
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
/// Pass for inlining calls to memcpy using 'rep movsb' on X86.
@ -420,30 +412,9 @@ public:
return "inline-memcpy";
}
void runOnFunctions(BinaryContext &BC) override;
};
/// Pass for specializing memcpy for a size of 1 byte.
class SpecializeMemcpy1 : public BinaryFunctionPass {
private:
std::vector<std::string> Spec;
/// Return indices of the call sites to optimize. Count starts at 1.
/// Returns an empty set for all call sites in the function.
std::set<size_t> getCallSitesToOptimize(const BinaryFunction &) const;
public:
explicit SpecializeMemcpy1(const cl::opt<bool> &PrintPass,
cl::list<std::string> &Spec)
: BinaryFunctionPass(PrintPass), Spec(Spec) {}
bool shouldOptimize(const BinaryFunction &BF) const override;
const char *getName() const override {
return "specialize-memcpy";
}
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
enum FrameOptimizationType : char {

View File

@ -15,7 +15,6 @@ add_llvm_library(LLVMBOLTPasses
IdenticalCodeFolding.cpp
IndirectCallPromotion.cpp
Inliner.cpp
Instrumentation.cpp
JTFootprintReduction.cpp
LivenessAnalysis.cpp
LongJmp.cpp

View File

@ -23,7 +23,6 @@ using EdgeList = std::vector<std::pair<BinaryBasicBlock *, uint64_t>>;
namespace opts {
extern cl::OptionCategory BoltOptCategory;
extern cl::opt<bool> NoThreads;
cl::opt<unsigned>
ClusterSplitThreshold("cluster-split-threshold",
@ -289,12 +288,6 @@ private:
ExecutionCounts[BB->getLayoutIndex()] = EC;
}
// Create a separate MCCodeEmitter to allow lock-free execution
BinaryContext::IndependentCodeEmitter Emitter;
if (!opts::NoThreads) {
Emitter = BF.getBinaryContext().createIndependentMCCodeEmitter();
}
// Initialize clusters
Clusters.reserve(BF.layout_size());
AllClusters.reserve(BF.layout_size());
@ -302,8 +295,7 @@ private:
Size.reserve(BF.layout_size());
for (auto BB : BF.layout()) {
size_t Index = BB->getLayoutIndex();
Size.push_back(
std::max<uint64_t>(BB->estimateSize(Emitter.MCE.get()), 1));
Size.push_back(std::max(BB->estimateSize(), size_t(1)));
AllClusters.emplace_back(BB, ExecutionCounts[Index], Size[Index]);
Clusters.push_back(&AllClusters[Index]);
CurCluster.push_back(&AllClusters[Index]);

View File

@ -172,9 +172,6 @@ protected:
/// Reference to the function being analysed
BinaryFunction &Func;
/// The id of the annotation allocator to be used
MCPlusBuilder::AllocatorIdTy AllocatorId = 0;
/// Tracks the state at basic block start (end) if direction of the dataflow
/// is forward (backward).
std::unordered_map<const BinaryBasicBlock *, StateTy> StateAtBBEntry;
@ -247,7 +244,7 @@ protected:
StateTy &getOrCreateStateAt(MCInst &Point) {
return BC.MIB->getOrCreateAnnotationAs<StateTy>(
Point, derived().getAnnotationIndex(), AllocatorId);
Point, derived().getAnnotationIndex());
}
StateTy &getOrCreateStateAt(ProgramPoint Point) {
@ -257,11 +254,6 @@ protected:
}
public:
/// Return the allocator id
unsigned getAllocatorId() {
return AllocatorId;
}
/// If the direction of the dataflow is forward, operates on the last
/// instruction of all predecessors when performing an iteration of the
/// dataflow equation for the start of this BB. If backwards, operates on
@ -275,10 +267,8 @@ public:
/// We need the current binary context and the function that will be processed
/// in this dataflow analysis.
DataflowAnalysis(const BinaryContext &BC, BinaryFunction &BF,
MCPlusBuilder::AllocatorIdTy AllocatorId = 0)
: BC(BC), Func(BF), AllocatorId(AllocatorId) {}
DataflowAnalysis(const BinaryContext &BC, BinaryFunction &BF)
: BC(BC), Func(BF) {}
virtual ~DataflowAnalysis() {
cleanAnnotations();
}
@ -334,15 +324,15 @@ public:
void run() {
derived().preflight();
// Initialize state for all points of the function
for (auto &BB : Func) {
auto &St = getOrCreateStateAt(BB);
St = derived().getStartingStateAtBB(BB);
for (auto &Inst : BB) {
auto &St = getOrCreateStateAt(Inst);
St = derived().getStartingStateAtPoint(Inst);
}
// Initialize state for all points of the function
for (auto &BB : Func) {
auto &St = getOrCreateStateAt(BB);
St = derived().getStartingStateAtBB(BB);
for (auto &Inst : BB) {
auto &St = getOrCreateStateAt(Inst);
St = derived().getStartingStateAtPoint(Inst);
}
}
assert(Func.begin() != Func.end() && "Unexpected empty function");
std::queue<BinaryBasicBlock *> Worklist;
@ -555,10 +545,8 @@ public:
return count(*Expressions[PointIdx], Expr);
}
InstrsDataflowAnalysis(const BinaryContext &BC, BinaryFunction &BF,
MCPlusBuilder::AllocatorIdTy AllocId = 0)
: DataflowAnalysis<Derived, BitVector, Backward, StatePrinterTy>(
BC, BF, AllocId) {}
InstrsDataflowAnalysis(const BinaryContext &BC, BinaryFunction &BF)
: DataflowAnalysis<Derived, BitVector, Backward, StatePrinterTy>(BC, BF) {}
virtual ~InstrsDataflowAnalysis() {}
};

View File

@ -19,7 +19,7 @@ ReachingDefOrUse</*Def=*/true> &DataflowInfoManager::getReachingDefs() {
if (RD)
return *RD;
assert(RA && "RegAnalysis required");
RD.reset(new ReachingDefOrUse<true>(*RA, BC, BF, None, AllocatorId));
RD.reset(new ReachingDefOrUse<true>(*RA, BC, BF));
RD->run();
return *RD;
}
@ -32,7 +32,7 @@ ReachingDefOrUse</*Def=*/false> &DataflowInfoManager::getReachingUses() {
if (RU)
return *RU;
assert(RA && "RegAnalysis required");
RU.reset(new ReachingDefOrUse<false>(*RA, BC, BF, None, AllocatorId));
RU.reset(new ReachingDefOrUse<false>(*RA, BC, BF));
RU->run();
return *RU;
}
@ -45,7 +45,7 @@ LivenessAnalysis &DataflowInfoManager::getLivenessAnalysis() {
if (LA)
return *LA;
assert(RA && "RegAnalysis required");
LA.reset(new LivenessAnalysis(*RA, BC, BF, AllocatorId));
LA.reset(new LivenessAnalysis(*RA, BC, BF));
LA->run();
return *LA;
}
@ -58,7 +58,7 @@ StackReachingUses &DataflowInfoManager::getStackReachingUses() {
if (SRU)
return *SRU;
assert(FA && "FrameAnalysis required");
SRU.reset(new StackReachingUses(*FA, BC, BF, AllocatorId));
SRU.reset(new StackReachingUses(*FA, BC, BF));
SRU->run();
return *SRU;
}
@ -70,7 +70,7 @@ void DataflowInfoManager::invalidateStackReachingUses() {
DominatorAnalysis<false> &DataflowInfoManager::getDominatorAnalysis() {
if (DA)
return *DA;
DA.reset(new DominatorAnalysis<false>(BC, BF, AllocatorId));
DA.reset(new DominatorAnalysis<false>(BC, BF));
DA->run();
return *DA;
}
@ -82,7 +82,7 @@ void DataflowInfoManager::invalidateDominatorAnalysis() {
DominatorAnalysis<true> &DataflowInfoManager::getPostDominatorAnalysis() {
if (PDA)
return *PDA;
PDA.reset(new DominatorAnalysis<true>(BC, BF, AllocatorId));
PDA.reset(new DominatorAnalysis<true>(BC, BF));
PDA->run();
return *PDA;
}
@ -94,7 +94,7 @@ void DataflowInfoManager::invalidatePostDominatorAnalysis() {
StackPointerTracking &DataflowInfoManager::getStackPointerTracking() {
if (SPT)
return *SPT;
SPT.reset(new StackPointerTracking(BC, BF, AllocatorId));
SPT.reset(new StackPointerTracking(BC, BF));
SPT->run();
return *SPT;
}
@ -107,7 +107,7 @@ void DataflowInfoManager::invalidateStackPointerTracking() {
ReachingInsns<false> &DataflowInfoManager::getReachingInsns() {
if (RI)
return *RI;
RI.reset(new ReachingInsns<false>(BC, BF, AllocatorId));
RI.reset(new ReachingInsns<false>(BC, BF));
RI->run();
return *RI;
}
@ -119,7 +119,7 @@ void DataflowInfoManager::invalidateReachingInsns() {
ReachingInsns<true> &DataflowInfoManager::getReachingInsnsBackwards() {
if (RIB)
return *RIB;
RIB.reset(new ReachingInsns<true>(BC, BF, AllocatorId));
RIB.reset(new ReachingInsns<true>(BC, BF));
RIB->run();
return *RIB;
}
@ -131,8 +131,7 @@ void DataflowInfoManager::invalidateReachingInsnsBackwards() {
StackAllocationAnalysis &DataflowInfoManager::getStackAllocationAnalysis() {
if (SAA)
return *SAA;
SAA.reset(new StackAllocationAnalysis(BC, BF, getStackPointerTracking(),
AllocatorId));
SAA.reset(new StackAllocationAnalysis(BC, BF, getStackPointerTracking()));
SAA->run();
return *SAA;
}

View File

@ -47,15 +47,10 @@ class DataflowInfoManager {
std::unique_ptr<std::unordered_map<const MCInst *, BinaryBasicBlock *>>
InsnToBB;
// Id of the allocator to be used for annotations added by any of the managed
// analysis
MCPlusBuilder::AllocatorIdTy AllocatorId;
public:
DataflowInfoManager(const BinaryContext &BC, BinaryFunction &BF,
const RegAnalysis *RA, const FrameAnalysis *FA,
MCPlusBuilder::AllocatorIdTy AllocId = 0)
: RA(RA), FA(FA), BC(BC), BF(BF), AllocatorId(AllocId){};
const RegAnalysis *RA, const FrameAnalysis *FA)
: RA(RA), FA(FA), BC(BC), BF(BF){};
/// Helper function to fetch the parent BB associated with a program point
/// If PP is a BB itself, then return itself (cast to a BinaryBasicBlock)

View File

@ -35,35 +35,34 @@ class DominatorAnalysis
Backward>;
public:
DominatorAnalysis(const BinaryContext &BC, BinaryFunction &BF,
MCPlusBuilder::AllocatorIdTy AllocId)
: InstrsDataflowAnalysis<DominatorAnalysis<Backward>, Backward>(BC, BF,
AllocId) {
}
DominatorAnalysis(const BinaryContext &BC, BinaryFunction &BF)
: InstrsDataflowAnalysis<DominatorAnalysis<Backward>, Backward>(BC, BF) {}
virtual ~DominatorAnalysis() {}
SmallSetVector<ProgramPoint, 4> getDominanceFrontierFor(const MCInst &Dom) {
SmallSetVector<ProgramPoint, 4> Result;
SmallVector<ProgramPoint, 4> getDominanceFrontierFor(const MCInst &Dom) {
SmallVector<ProgramPoint, 4> Result;
auto DomIdx = this->ExprToIdx[&Dom];
assert(!Backward && "Post-dom frontier not implemented");
for (auto &BB : this->Func) {
bool HasDominatedPred = false;
bool HasNonDominatedPred = false;
SmallSetVector<ProgramPoint, 4> Candidates;
SmallVector<ProgramPoint, 4> Candidates;
this->doForAllSuccsOrPreds(BB, [&](ProgramPoint P) {
if ((*this->getStateAt(P))[DomIdx]) {
Candidates.insert(P);
Candidates.emplace_back(P);
HasDominatedPred = true;
return;
}
HasNonDominatedPred = true;
});
if (HasDominatedPred && HasNonDominatedPred)
Result.insert(Candidates.begin(), Candidates.end());
Result.append(Candidates.begin(), Candidates.end());
if ((*this->getStateAt(ProgramPoint::getLastPointAt(BB)))[DomIdx] &&
BB.succ_begin() == BB.succ_end())
Result.insert(ProgramPoint::getLastPointAt(BB));
Result.emplace_back(ProgramPoint::getLastPointAt(BB));
}
std::sort(Result.begin(), Result.end());
Result.erase(std::unique(Result.begin(), Result.end()), Result.end());
return Result;
}
@ -105,6 +104,8 @@ public:
}
void run() {
NamedRegionTimer T1("DA", "Dominator Analysis", "Dataflow", "Dataflow",
opts::TimeOpts);
InstrsDataflowAnalysis<DominatorAnalysis<Backward>, Backward>::run();
}

View File

@ -10,8 +10,6 @@
//===----------------------------------------------------------------------===//
#include "FrameAnalysis.h"
#include "CallGraphWalker.h"
#include "ParallelUtilities.h"
#include "llvm/Support/ThreadPool.h"
#include <fstream>
#define DEBUG_TYPE "fa"
@ -19,9 +17,8 @@
using namespace llvm;
namespace opts {
extern cl::OptionCategory BoltOptCategory;
extern cl::opt<bool> TimeOpts;
extern cl::opt<unsigned> Verbosity;
extern bool shouldProcess(const bolt::BinaryFunction &Function);
static cl::list<std::string>
@ -33,17 +30,7 @@ static cl::opt<std::string> FrameOptFunctionNamesFile(
"funcs-file-fop",
cl::desc("file with list of functions to frame optimize"));
static cl::opt<bool>
TimeFA("time-fa",
cl::desc("time frame analysis steps"),
cl::ReallyHidden,
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
bool shouldFrameOptimize(const llvm::bolt::BinaryFunction &Function) {
if (Function.hasUnknownControlFlow())
return false;
if (!FrameOptFunctionNamesFile.empty()) {
assert(!FrameOptFunctionNamesFile.empty() && "unexpected empty file name");
std::ifstream FuncsFile(FrameOptFunctionNamesFile, std::ios::in);
@ -98,8 +85,7 @@ namespace {
class FrameAccessAnalysis {
/// We depend on Stack Pointer Tracking to figure out the current SP offset
/// value at a given program point
StackPointerTracking &SPT;
StackPointerTracking SPT;
/// Context vars
const BinaryContext &BC;
const BinaryFunction &BF;
@ -164,9 +150,14 @@ class FrameAccessAnalysis {
}
public:
FrameAccessAnalysis(const BinaryContext &BC, BinaryFunction &BF,
StackPointerTracking &SPT)
: SPT(SPT), BC(BC), BF(BF) {}
FrameAccessAnalysis(const BinaryContext &BC, BinaryFunction &BF)
: SPT(BC, BF), BC(BC), BF(BF) {
{
NamedRegionTimer T1("SPT", "Stack Pointer Tracking", "Dataflow",
"Dataflow", opts::TimeOpts);
SPT.run();
}
}
void enterNewBB() { Prev = nullptr; }
const FrameIndexEntry &getFIE() const { return FIE; }
@ -402,7 +393,7 @@ bool FrameAnalysis::computeArgsAccessed(BinaryFunction &BF) {
<< "\n");
bool UpdatedArgsTouched = false;
bool NoInfo = false;
FrameAccessAnalysis FAA(BC, BF, getSPT(BF));
FrameAccessAnalysis FAA(BC, BF);
for (auto BB : BF.layout()) {
FAA.enterNewBB();
@ -461,7 +452,7 @@ bool FrameAnalysis::computeArgsAccessed(BinaryFunction &BF) {
}
bool FrameAnalysis::restoreFrameIndex(BinaryFunction &BF) {
FrameAccessAnalysis FAA(BC, BF, getSPT(BF));
FrameAccessAnalysis FAA(BC, BF);
DEBUG(dbgs() << "Restoring frame indices for \"" << BF.getPrintName()
<< "\"\n");
@ -494,42 +485,27 @@ bool FrameAnalysis::restoreFrameIndex(BinaryFunction &BF) {
}
void FrameAnalysis::cleanAnnotations() {
NamedRegionTimer T("cleanannotations", "clean annotations", "FA",
"FA breakdown", opts::TimeFA);
ParallelUtilities::WorkFuncTy CleanFunction = [&](BinaryFunction &BF) {
for (auto &BB : BF) {
for (auto &I : BFs) {
for (auto &BB : I.second) {
for (auto &Inst : BB) {
BC.MIB->removeAnnotation(Inst, "ArgAccessEntry");
BC.MIB->removeAnnotation(Inst, "FrameAccessEntry");
}
}
};
ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, CleanFunction,
ParallelUtilities::PredicateTy(nullptr), "cleanAnnotations");
}
}
FrameAnalysis::FrameAnalysis(BinaryContext &BC, BinaryFunctionCallGraph &CG)
: BC(BC) {
FrameAnalysis::FrameAnalysis(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
BinaryFunctionCallGraph &CG)
: BC(BC), BFs(BFs) {
// Position 0 of the vector should be always associated with "assume access
// everything".
ArgAccessesVector.emplace_back(ArgAccesses(/*AssumeEverything*/ true));
if (!opts::NoThreads) {
NamedRegionTimer T1("precomputespt", "pre-compute spt", "FA",
"FA breakdown", opts::TimeFA);
preComputeSPT();
}
traverseCG(CG);
{
NamedRegionTimer T1("traversecg", "traverse call graph", "FA",
"FA breakdown", opts::TimeFA);
traverseCG(CG);
}
for (auto &I : BC.getBinaryFunctions()) {
for (auto &I : BFs) {
auto Count = I.second.getExecutionCount();
if (Count != BinaryFunction::COUNT_NO_PROFILE)
CountDenominator += Count;
@ -545,8 +521,8 @@ FrameAnalysis::FrameAnalysis(BinaryContext &BC, BinaryFunctionCallGraph &CG)
}
{
NamedRegionTimer T1("restorefi", "restore frame index", "FA",
"FA breakdown", opts::TimeFA);
NamedRegionTimer T1("restorefi", "restore frame index", "FOP",
"FOP breakdown", opts::TimeOpts);
if (!restoreFrameIndex(I.second)) {
++NumFunctionsFailedRestoreFI;
auto Count = I.second.getExecutionCount();
@ -557,18 +533,6 @@ FrameAnalysis::FrameAnalysis(BinaryContext &BC, BinaryFunctionCallGraph &CG)
}
AnalyzedFunctions.insert(&I.second);
}
{
NamedRegionTimer T1("clearspt", "clear spt", "FA", "FA breakdown",
opts::TimeFA);
clearSPTMap();
// Clean up memory allocated for annotation values
if (!opts::NoThreads) {
for (auto Id : SPTAllocatorsId)
BC.MIB->freeValuesAllocator(Id);
}
}
}
void FrameAnalysis::printStats() {
@ -584,60 +548,5 @@ void FrameAnalysis::printStats() {
<< " could not have its frame indices restored.\n";
}
void FrameAnalysis::clearSPTMap() {
if (opts::NoThreads) {
SPTMap.clear();
return;
}
ParallelUtilities::WorkFuncTy ClearFunctionSPT = [&](BinaryFunction &BF) {
auto &SPTPtr = SPTMap.find(&BF)->second;
SPTPtr.reset();
};
ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
return !BF.isSimple() || !BF.hasCFG();
};
ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, ClearFunctionSPT,
SkipFunc, "clearSPTMap");
SPTMap.clear();
}
void FrameAnalysis::preComputeSPT() {
// Make sure that the SPTMap is empty
assert(SPTMap.size() == 0);
// Create map entries to allow lock-free parallel execution
for (auto &BFI : BC.getBinaryFunctions()) {
auto &BF = BFI.second;
if (!BF.isSimple() || !BF.hasCFG())
continue;
SPTMap.emplace(&BF, std::unique_ptr<StackPointerTracking>());
}
// Create an index for the SPT annotation to allow lock-free parallel
// execution
BC.MIB->getOrCreateAnnotationIndex("StackPointerTracking");
// Run SPT in parallel
ParallelUtilities::WorkFuncWithAllocTy ProcessFunction =
[&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId) {
auto &SPTPtr = SPTMap.find(&BF)->second;
SPTPtr = std::make_unique<StackPointerTracking>(BC, BF, AllocId);
SPTPtr->run();
};
ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
return !BF.isSimple() || !BF.hasCFG();
};
ParallelUtilities::runOnEachFunctionWithUniqueAllocId(
BC, ParallelUtilities::SchedulingPolicy::SP_BB_QUADRATIC, ProcessFunction,
SkipPredicate, "preComputeSPT");
}
} // namespace bolt
} // namespace llvm

View File

@ -93,7 +93,7 @@ raw_ostream &operator<<(raw_ostream &OS,
/// Initialization:
///
/// FrameAnalysis FA(PrintPass);
/// FA.runOnFunctions(BC);
/// FA.runOnFunctions(BC, BFs, LargeFunctions);
///
/// Usage (fetching frame access information about a given instruction):
///
@ -113,6 +113,7 @@ raw_ostream &operator<<(raw_ostream &OS,
///
class FrameAnalysis {
BinaryContext &BC;
std::map<uint64_t, BinaryFunction> &BFs;
/// Map functions to the set of <stack offsets, size> tuples representing
/// accesses to stack positions that belongs to caller
@ -167,17 +168,9 @@ class FrameAnalysis {
/// to analyze and this information can't be safely determined for \p BF.
bool restoreFrameIndex(BinaryFunction &BF);
/// A store for SPT info per function
std::unordered_map<const BinaryFunction *,
std::unique_ptr<StackPointerTracking>>
SPTMap;
/// A vector that stores ids of the allocators that are used in SPT
/// computation
std::vector<MCPlusBuilder::AllocatorIdTy> SPTAllocatorsId;
public:
explicit FrameAnalysis(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
BinaryFunctionCallGraph &CG);
/// Return true if we could fully analyze \p Func
@ -204,30 +197,10 @@ public:
cleanAnnotations();
}
/// Print to standard output statistics about the analysis performed by this
/// pass
void printStats();
/// Return the StackPointerTracking analysis cached for \p BF. When SPTMap has
/// no entry for the function yet, a new analysis is created, stored and run
/// before it is returned.
StackPointerTracking &getSPT(BinaryFunction &BF) {
  // One lookup serves both the hit and the miss path.
  auto Iter = SPTMap.find(&BF);
  if (Iter == SPTMap.end()) {
    Iter = SPTMap
               .emplace(&BF, std::make_unique<StackPointerTracking>(BC, BF))
               .first;
    Iter->second->run();
  }
  // An entry may exist while still holding a null pointer (entries are
  // emplaced with an empty unique_ptr elsewhere), so check the pointer
  // itself rather than the iterator.
  assert(Iter->second && "item should exist");
  return *Iter->second;
}
/// Clean and de-allocate all SPT objects
void clearSPTMap();
/// Perform SPT analysis for all functions in parallel
void preComputeSPT();
};
} // namespace bolt

View File

@ -10,7 +10,6 @@
//===----------------------------------------------------------------------===//
#include "FrameOptimizer.h"
#include "ParallelUtilities.h"
#include "ShrinkWrapping.h"
#include "StackAvailableExpressions.h"
#include "StackReachingUses.h"
@ -48,6 +47,7 @@ RemoveStores("frame-opt-rm-stores",
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
} // namespace opts
namespace llvm {
@ -221,36 +221,21 @@ void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA,
}
}
void FrameOptimizerPass::runOnFunctions(BinaryContext &BC) {
void FrameOptimizerPass::runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) {
if (opts::FrameOptimization == FOP_NONE)
return;
std::unique_ptr<BinaryFunctionCallGraph> CG;
std::unique_ptr<FrameAnalysis> FA;
std::unique_ptr<RegAnalysis> RA;
// Run FrameAnalysis pass
BinaryFunctionCallGraph CG = buildCallGraph(BC, BFs);
FrameAnalysis FA(BC, BFs, CG);
RegAnalysis RA(BC, &BFs, &CG);
{
NamedRegionTimer T1("callgraph", "create call graph", "FOP",
"FOP breakdown", opts::TimeOpts);
CG = std::make_unique<BinaryFunctionCallGraph>(buildCallGraph(BC));
}
{
NamedRegionTimer T1("frameanalysis", "frame analysis", "FOP",
"FOP breakdown", opts::TimeOpts);
FA = std::make_unique<FrameAnalysis>(BC, *CG);
}
{
NamedRegionTimer T1("reganalysis", "reg analysis", "FOP",
"FOP breakdown", opts::TimeOpts);
RA = std::make_unique<RegAnalysis>(BC, &BC.getBinaryFunctions(), CG.get());
}
// Perform caller-saved register optimizations, then callee-saved register
// optimizations (shrink wrapping)
for (auto &I : BC.getBinaryFunctions()) {
if (!FA->hasFrameInfo(I.second))
// Our main loop: perform caller-saved register optimizations, then
// callee-saved register optimizations (shrink wrapping).
for (auto &I : BFs) {
if (!FA.hasFrameInfo(I.second))
continue;
// Restrict pass execution if user asked to only run on hot functions
if (opts::FrameOptimization == FOP_HOT) {
@ -262,28 +247,27 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC) {
<< " ) exceeds our hotness threshold ( "
<< BC.getHotThreshold() << " )\n");
}
{
NamedRegionTimer T1("removeloads", "remove loads", "FOP", "FOP breakdown",
opts::TimeOpts);
removeUnnecessaryLoads(*RA, *FA, BC, I.second);
removeUnnecessaryLoads(RA, FA, BC, I.second);
}
if (opts::RemoveStores) {
NamedRegionTimer T1("removestores", "remove stores", "FOP",
"FOP breakdown", opts::TimeOpts);
removeUnusedStores(*FA, BC, I.second);
removeUnusedStores(FA, BC, I.second);
}
// Don't even start shrink wrapping if no profiling info is available
if (I.second.getKnownExecutionCount() == 0)
continue;
}
{
NamedRegionTimer T1("shrinkwrapping", "shrink wrapping", "FOP",
"FOP breakdown", opts::TimeOpts);
performShrinkWrapping(*RA, *FA, BC);
{
NamedRegionTimer T1("movespills", "move spills", "FOP", "FOP breakdown",
opts::TimeOpts);
DataflowInfoManager Info(BC, I.second, &RA, &FA);
ShrinkWrapping SW(FA, BC, I.second, Info);
if (SW.perform())
FuncsChanged.insert(&I.second);
}
}
outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads
@ -294,67 +278,9 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC) {
<< NumLoadsChangedToImm << " to use an immediate.\n"
<< "BOLT-INFO: FOP deleted " << NumLoadsDeleted << " load(s) and "
<< NumRedundantStores << " store(s).\n";
FA->printStats();
FA.printStats();
ShrinkWrapping::printStats();
}
/// Run the shrink-wrapping step over the functions of \p BC.
///
/// \param RA register analysis handed to each function's DataflowInfoManager.
/// \param FA frame analysis; functions without frame info are skipped.
/// \param BC binary context whose functions are processed.
///
/// Functions for which ShrinkWrapping::perform() returns true are recorded in
/// FuncsChanged (insertion is guarded by FuncsChangedMutex).
void FrameOptimizerPass::performShrinkWrapping(const RegAnalysis &RA,
                                               const FrameAnalysis &FA,
                                               BinaryContext &BC) {
  // Create all annotation indices up front so that the annotation index in
  // MIB can be accessed safely in parallel later on.
  auto &Builder = *BC.MIB;
  Builder.getOrCreateAnnotationIndex(CalleeSavedAnalysis::getSaveTagName());
  Builder.getOrCreateAnnotationIndex(CalleeSavedAnalysis::getRestoreTagName());
  Builder.getOrCreateAnnotationIndex(StackLayoutModifier::getTodoTagName());
  Builder.getOrCreateAnnotationIndex(StackLayoutModifier::getSlotTagName());
  Builder.getOrCreateAnnotationIndex(
      StackLayoutModifier::getOffsetCFIRegTagName());
  Builder.getOrCreateAnnotationIndex("ReachingDefs");
  Builder.getOrCreateAnnotationIndex("ReachingUses");
  Builder.getOrCreateAnnotationIndex("LivenessAnalysis");
  Builder.getOrCreateAnnotationIndex("StackReachingUses");
  Builder.getOrCreateAnnotationIndex("PostDominatorAnalysis");
  Builder.getOrCreateAnnotationIndex("DominatorAnalysis");
  Builder.getOrCreateAnnotationIndex("StackPointerTracking");
  Builder.getOrCreateAnnotationIndex("StackPointerTrackingForInternalCalls");
  Builder.getOrCreateAnnotationIndex("StackAvailableExpressions");
  Builder.getOrCreateAnnotationIndex("StackAllocationAnalysis");
  Builder.getOrCreateAnnotationIndex("ShrinkWrap-Todo");
  Builder.getOrCreateAnnotationIndex("PredictiveStackPointerTracking");
  Builder.getOrCreateAnnotationIndex("ReachingInsnsBackward");
  Builder.getOrCreateAnnotationIndex("ReachingInsns");
  Builder.getOrCreateAnnotationIndex("AccessesDeletedPos");
  Builder.getOrCreateAnnotationIndex("DeleteMe");

  // A function is skipped when it has no frame info, when only hot functions
  // were requested and it is below the hot threshold, or when it has no known
  // execution count at all.
  ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
    return !FA.hasFrameInfo(BF) ||
           (opts::FrameOptimization == FOP_HOT &&
            (BF.getKnownExecutionCount() < BC.getHotThreshold())) ||
           BF.getKnownExecutionCount() == 0;
  };

  // Per-function work: shrink-wrap and remember the function if it changed.
  ParallelUtilities::WorkFuncWithAllocTy WorkFunction =
      [&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocatorId) {
        DataflowInfoManager DataflowInfo(BC, BF, &RA, &FA, AllocatorId);
        ShrinkWrapping Wrapper(FA, BC, BF, DataflowInfo, AllocatorId);
        if (!Wrapper.perform())
          return;

        std::lock_guard<std::mutex> Guard(FuncsChangedMutex);
        FuncsChanged.insert(&BF);
      };

  ParallelUtilities::runOnEachFunctionWithUniqueAllocId(
      BC, ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC, WorkFunction,
      SkipPredicate, "shrink-wrapping");
}
} // namespace bolt
} // namespace llvm

View File

@ -86,8 +86,6 @@ class FrameOptimizerPass : public BinaryFunctionPass {
DenseSet<const BinaryFunction *> FuncsChanged;
std::mutex FuncsChangedMutex;
/// Perform a dataflow analysis in \p BF to reveal unnecessary reloads from
/// the frame. Use the analysis to convert memory loads to register moves or
/// immediate loads. Delete redundant register moves.
@ -101,10 +99,6 @@ class FrameOptimizerPass : public BinaryFunctionPass {
const BinaryContext &BC,
BinaryFunction &BF);
/// Perform shrinkwrapping step
void performShrinkWrapping(const RegAnalysis &RA, const FrameAnalysis &FA,
BinaryContext &BC);
public:
explicit FrameOptimizerPass(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) {}
@ -114,7 +108,9 @@ public:
}
/// Pass entry point
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
bool shouldPrint(const BinaryFunction &BF) const override {
return BinaryFunctionPass::shouldPrint(BF) && FuncsChanged.count(&BF) > 0;

View File

@ -11,7 +11,6 @@
#include "BinaryFunction.h"
#include "HFSort.h"
#include "ParallelUtilities.h"
#include "ReorderUtils.h"
#include "llvm/Support/Options.h"
@ -320,115 +319,50 @@ public:
/// Merge pairs of clusters while there is an improvement in the
/// expected cache miss ratio
void runPassTwo() {
// BucketsCount is hard-coded to make the algorithm deterministic regardless
// of the number of threads
const unsigned BucketsCount = 124;
unsigned IterationCount = 0;
llvm::ThreadPool *Pool;
if (!opts::NoThreads)
Pool = &ParallelUtilities::getThreadPool();
while (Clusters.size() > 1) {
MergeCandidateEntry GlobalMaximum;
std::vector<MergeCandidateEntry> LocalMaximums(BucketsCount);
Cluster *BestClusterPred = nullptr;
Cluster *BestClusterSucc = nullptr;
double BestGain = -1;
for (auto ClusterPred : Clusters) {
// get candidates for merging with the current cluster
Adjacent.forAllAdjacent(
ClusterPred,
// find the best candidate
[&](Cluster *ClusterSucc) {
assert(ClusterPred != ClusterSucc && "loop edges are not supported");
// compute the gain of merging two clusters
const double Gain = mergeGain(ClusterPred, ClusterSucc);
// Compare two candidates with a given gain
auto compareCandidates = [](const MergeCandidateEntry &CandidateA,
const MergeCandidateEntry &CandidateB) {
// breaking ties by density to make the hottest clusters be
// merged first
return CandidateA.Gain > CandidateB.Gain ||
(std::abs(CandidateA.Gain - CandidateB.Gain) < 1e-8 &&
compareClusterPairs(
CandidateA.ClusterPred, CandidateA.ClusterSucc,
CandidateB.ClusterPred, CandidateB.ClusterSucc));
};
// find the best candidates to merge within a bucket range
auto findMaximaInBucket = [&](const unsigned Start, const unsigned End,
const unsigned BucketId) {
auto &LocalMaximum = LocalMaximums[BucketId];
for (unsigned Idx = Start; Idx < End; Idx++) {
if (Idx >= Clusters.size())
return;
auto ClusterPred = Clusters[Idx];
// get best candidates to merge with the current cluster
Adjacent.forAllAdjacent(
ClusterPred,
// find the best candidate
[&](Cluster *ClusterSucc) {
assert(ClusterPred != ClusterSucc &&
"loop edges are not supported");
// compute the gain of merging two clusters
const double Gain = mergeGain(ClusterPred, ClusterSucc);
// create a new candidate
MergeCandidateEntry Candidate;
Candidate.Gain = Gain;
Candidate.ClusterPred = ClusterPred;
Candidate.ClusterSucc = ClusterSucc;
if (compareCandidates(Candidate, LocalMaximum))
LocalMaximum = Candidate;
});
}
};
unsigned BucketSize = Clusters.size() / BucketsCount;
if (Clusters.size() % BucketsCount)
BucketSize++;
// find the best candidate within each bucket
unsigned BucketId = 0;
for (unsigned ClusterIdx = 0; ClusterIdx < Clusters.size();
ClusterIdx += BucketSize, BucketId++) {
if (opts::NoThreads) {
findMaximaInBucket(ClusterIdx, ClusterIdx + BucketSize, BucketId);
} else {
Pool->async(findMaximaInBucket, ClusterIdx, ClusterIdx + BucketSize,
BucketId);
}
// breaking ties by density to make the hottest clusters be merged first
if (Gain > BestGain || (std::abs(Gain - BestGain) < 1e-8 &&
compareClusterPairs(ClusterPred,
ClusterSucc,
BestClusterPred,
BestClusterSucc))) {
BestGain = Gain;
BestClusterPred = ClusterPred;
BestClusterSucc = ClusterSucc;
}
});
}
if (!opts::NoThreads)
Pool->wait();
// find global maximum
for (auto &LocalMaximum : LocalMaximums) {
if (LocalMaximum.Gain > 0 &&
compareCandidates(LocalMaximum, GlobalMaximum))
GlobalMaximum = LocalMaximum;
}
if (GlobalMaximum.Gain <= 0.0)
// stop merging when there is no improvement
if (BestGain <= 0.0)
break;
DEBUG(outs() << "merging##" << GlobalMaximum.ClusterPred->id() << "##"
<< GlobalMaximum.ClusterSucc->id() << "@@"
<< GlobalMaximum.Gain << "\n");
mergeClusters(GlobalMaximum.ClusterPred, GlobalMaximum.ClusterSucc);
// merge the best pair of clusters
mergeClusters(BestClusterPred, BestClusterSucc);
}
DEBUG(outs() << "BOLT-INFO: hfsort+ pass two finished in" << IterationCount
<< " iterations.");
}
/// Run hfsort+ algorithm and return ordered set of function clusters.
std::vector<Cluster> run() {
DEBUG(dbgs() << "Starting hfsort+ w/"
<< (UseGainCache ? "gain cache" : "no cache") << " for "
<< Clusters.size() << " clusters "
<< (UseGainCache ? "gain cache" : "no cache")
<< " for " << Clusters.size() << " clusters "
<< "with ITLBPageSize = " << ITLBPageSize << ", "
<< "ITLBEntries = " << ITLBEntries << ", "
<< "and MergeProbability = " << opts::MergeProbability
<< "\n");
<< "and MergeProbability = " << opts::MergeProbability << "\n");
// Pass 1
runPassOne();
@ -436,8 +370,7 @@ public:
// Pass 2
runPassTwo();
DEBUG(dbgs() << "Completed hfsort+ with " << Clusters.size()
<< " clusters\n");
DEBUG(dbgs() << "Completed hfsort+ with " << Clusters.size() << " clusters\n");
// Sorting clusters by density in decreasing order
std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);
@ -485,13 +418,6 @@ public:
}
private:
/// A merge candidate: a predecessor/successor pair of clusters together with
/// the gain associated with merging them. A default-constructed entry has a
/// gain of -1 and no clusters.
struct MergeCandidateEntry {
  double Gain = -1;
  Cluster *ClusterPred = nullptr;
  Cluster *ClusterSucc = nullptr;
};
/// Initialize the set of active clusters, function id to cluster mapping,
/// total number of samples and function addresses.
std::vector<Cluster *> initializeClusters() {
@ -576,7 +502,7 @@ private:
// when a pair of clusters (x,y) gets merged, we need to invalidate the pairs
// containing both x and y and all clusters adjacent to x and y (and recompute
// them on the next iteration).
mutable ClusterPairCacheThreadSafe<Cluster, double> GainCache;
mutable ClusterPairCache<Cluster, double> GainCache;
};
} // end namespace anonymous

View File

@ -9,12 +9,9 @@
//
//===----------------------------------------------------------------------===//
#include "Passes/IdenticalCodeFolding.h"
#include "ParallelUtilities.h"
#include "llvm/Support/Options.h"
#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/Timer.h"
#include <atomic>
#include <map>
#include <set>
#include <unordered_map>
@ -35,12 +32,6 @@ UseDFS("icf-dfs",
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
static cl::opt<bool>
TimeICF("time-icf",
cl::desc("time icf steps"),
cl::ReallyHidden,
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
} // namespace opts
namespace {
@ -285,108 +276,72 @@ bool isIdenticalWith(const BinaryFunction &A, const BinaryFunction &B,
return true;
}
// The hash tables declared below are used to identify identical functions.
// They map a function to a bucket of functions identical to it.

/// Hash functor for the bucket maps: returns the function's hash without
/// requesting a recomputation.
struct KeyHash {
  std::size_t operator()(const BinaryFunction *F) const {
    const std::size_t Hash = F->hash(/*Recompute=*/false);
    return Hash;
  }
};
/// Equality functor that treats two functions as equal when they are the same
/// object or when isIdenticalWith() accepts them with symbols ignored.
struct KeyCongruent {
  bool operator()(const BinaryFunction *A, const BinaryFunction *B) const {
    // The pointer comparison short-circuits the full comparison.
    return A == B ||
           isIdenticalWith(*A, *B, /*IgnoreSymbols=*/true, opts::UseDFS);
  }
};
/// Equality functor that treats two functions as equal when they are the same
/// object or when isIdenticalWith() accepts them with symbols taken into
/// account.
struct KeyEqual {
  bool operator()(const BinaryFunction *A, const BinaryFunction *B) const {
    // The pointer comparison short-circuits the full comparison.
    return A == B ||
           isIdenticalWith(*A, *B, /*IgnoreSymbols=*/false, opts::UseDFS);
  }
};
/// Maps a function to the set of functions that compare equal to it under
/// KeyCongruent.
using CongruentBucketsMap =
    std::unordered_map<BinaryFunction *, std::set<BinaryFunction *>, KeyHash,
                       KeyCongruent>;

/// Maps a function to the list of functions that compare equal to it under
/// KeyEqual.
using IdenticalBucketsMap =
    std::unordered_map<BinaryFunction *, std::vector<BinaryFunction *>,
                       KeyHash, KeyEqual>;
} // namespace
}
namespace llvm {
namespace bolt {
void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) {
const auto OriginalFunctionCount = BC.getBinaryFunctions().size();
uint64_t NumFunctionsFolded{0};
std::atomic<uint64_t> NumJTFunctionsFolded{0};
std::atomic<uint64_t> BytesSavedEstimate{0};
std::atomic<uint64_t> CallsSavedEstimate{0};
std::atomic<uint64_t> NumFoldedLastIteration{0};
CongruentBucketsMap CongruentBuckets;
void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &) {
const auto OriginalFunctionCount = BFs.size();
uint64_t NumFunctionsFolded = 0;
uint64_t NumJTFunctionsFolded = 0;
uint64_t BytesSavedEstimate = 0;
uint64_t CallsSavedEstimate = 0;
// Hash all the functions
auto hashFunctions = [&]() {
NamedRegionTimer HashFunctionsTimer("hashing", "hashing", "ICF breakdown",
"ICF breakdown", opts::TimeICF);
ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
// Make sure indices are in-order.
BF.updateLayoutIndices();
// Pre-compute hash before pushing into hashtable.
BF.hash(/*Recompute=*/true, opts::UseDFS);
};
ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
return !shouldOptimize(BF);
};
ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun, SkipFunc,
"hashFunctions", /*ForceSequential*/ false, 2);
// This hash table is used to identify identical functions. It maps
// a function to a bucket of functions identical to it.
struct KeyHash {
std::size_t operator()(const BinaryFunction *F) const {
return F->hash(/*Recompute=*/false);
}
};
// Creates buckets with congruent functions - functions that potentially
// could be folded.
auto createCongruentBuckets = [&]() {
NamedRegionTimer CongruentBucketsTimer("congruent buckets",
"congruent buckets", "ICF breakdown",
"ICF breakdown", opts::TimeICF);
for (auto &BFI : BC.getBinaryFunctions()) {
auto &BF = BFI.second;
if (!this->shouldOptimize(BF))
continue;
CongruentBuckets[&BF].emplace(&BF);
struct KeyCongruent {
bool operator()(const BinaryFunction *A, const BinaryFunction *B) const {
return isIdenticalWith(*A, *B, /*IgnoreSymbols=*/true, opts::UseDFS);
}
};
struct KeyEqual {
bool operator()(const BinaryFunction *A, const BinaryFunction *B) const {
return isIdenticalWith(*A, *B, /*IgnoreSymbols=*/false, opts::UseDFS);
}
};
// Partition each set of congruent functions into sets of identical functions
// and fold them
auto performFoldingPass = [&]() {
NamedRegionTimer FoldingPassesTimer("folding passes", "folding passes",
"ICF breakdown", "ICF breakdown",
opts::TimeICF);
Timer SinglePass("single fold pass", "single fold pass");
DEBUG(SinglePass.startTimer());
// Create buckets with congruent functions - functions that potentially could
// be folded.
std::unordered_map<BinaryFunction *, std::set<BinaryFunction *>,
KeyHash, KeyCongruent> CongruentBuckets;
for (auto &BFI : BFs) {
auto &BF = BFI.second;
if (!shouldOptimize(BF) || BF.isFolded())
continue;
ThreadPool *ThPool;
if (!opts::NoThreads)
ThPool = &ParallelUtilities::getThreadPool();
// Make sure indices are in-order.
BF.updateLayoutIndices();
// Fold identical functions within a single congruent bucket
auto procesSingleBucket = [&](std::set<BinaryFunction *> &Candidates) {
Timer T("folding single congruent list", "folding single congruent list");
DEBUG(T.startTimer());
// Pre-compute hash before pushing into hashtable.
BF.hash(/*Recompute=*/true, opts::UseDFS);
CongruentBuckets[&BF].emplace(&BF);
}
// We repeat the pass until no new modifications happen.
unsigned Iteration = 1;
uint64_t NumFoldedLastIteration;
do {
NumFoldedLastIteration = 0;
DEBUG(dbgs() << "BOLT-DEBUG: ICF iteration " << Iteration << "...\n");
for (auto &CBI : CongruentBuckets) {
auto &Candidates = CBI.second;
if (Candidates.size() < 2)
continue;
// Identical functions go into the same bucket.
IdenticalBucketsMap IdenticalBuckets;
std::unordered_map<BinaryFunction *, std::vector<BinaryFunction *>,
KeyHash, KeyEqual> IdenticalBuckets;
for (auto *BF : Candidates) {
IdenticalBuckets[BF].emplace_back(BF);
}
@ -400,10 +355,9 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) {
// Fold functions. Keep the order consistent across invocations with
// different options.
std::stable_sort(Twins.begin(), Twins.end(),
[](const BinaryFunction *A, const BinaryFunction *B) {
return A->getFunctionNumber() <
B->getFunctionNumber();
});
[](const BinaryFunction *A, const BinaryFunction *B) {
return A->getFunctionNumber() < B->getFunctionNumber();
});
BinaryFunction *ParentBF = Twins[0];
for (unsigned i = 1; i < Twins.size(); ++i) {
@ -421,7 +375,7 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) {
BytesSavedEstimate += ChildBF->getSize();
CallsSavedEstimate += std::min(ChildBF->getKnownExecutionCount(),
ParentBF->getKnownExecutionCount());
BC.foldFunction(*ChildBF, *ParentBF);
BC.foldFunction(*ChildBF, *ParentBF, BFs);
++NumFoldedLastIteration;
@ -430,44 +384,13 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) {
}
}
DEBUG(T.stopTimer());
};
// Create a task for each congruent bucket
for (auto &Entry : CongruentBuckets) {
auto &Bucket = Entry.second;
if (Bucket.size() < 2)
continue;
if (opts::NoThreads)
procesSingleBucket(Bucket);
else
ThPool->async(procesSingleBucket, std::ref(Bucket));
}
if (!opts::NoThreads)
ThPool->wait();
DEBUG(SinglePass.stopTimer());
};
hashFunctions();
createCongruentBuckets();
unsigned Iteration = 1;
// We repeat the pass until no new modifications happen.
do {
NumFoldedLastIteration = 0;
DEBUG(dbgs() << "BOLT-DEBUG: ICF iteration " << Iteration << "...\n");
performFoldingPass();
NumFunctionsFolded += NumFoldedLastIteration;
++Iteration;
} while (NumFoldedLastIteration > 0);
DEBUG(
DEBUG(
// Print functions that are congruent but not identical.
for (auto &CBI : CongruentBuckets) {
auto &Candidates = CBI.second;

View File

@ -23,16 +23,6 @@ namespace bolt {
/// references to a single one of them.
///
class IdenticalCodeFolding : public BinaryFunctionPass {
protected:
/// Decide whether \p BF should be processed by this pass.
///
/// Functions with unknown control flow, already-folded functions and
/// functions with an SDT marker are rejected; everything else is left to
/// BinaryFunctionPass::shouldOptimize().
bool shouldOptimize(const BinaryFunction &BF) const override {
  // Checks are evaluated left to right and stop at the first rejection.
  return !BF.hasUnknownControlFlow() && !BF.isFolded() &&
         !BF.hasSDTMarker() && BinaryFunctionPass::shouldOptimize(BF);
}
public:
explicit IdenticalCodeFolding(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) { }
@ -40,7 +30,9 @@ public:
const char *getName() const override {
return "identical-code-folding";
}
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
} // namespace bolt

View File

@ -40,43 +40,11 @@ IndirectCallPromotion("indirect-call-promotion",
cl::cat(BoltOptCategory));
static cl::opt<unsigned>
ICPJTRemainingPercentThreshold(
"icp-jt-remaining-percent-threshold",
cl::desc("The percentage threshold against remaining unpromoted indirect "
"call count for the promotion for jump tables"),
cl::init(30),
IndirectCallPromotionThreshold(
"indirect-call-promotion-threshold",
cl::desc("threshold for optimizing a frequently taken indirect call"),
cl::init(90),
cl::ZeroOrMore,
cl::Hidden,
cl::cat(BoltOptCategory));
static cl::opt<unsigned>
ICPJTTotalPercentThreshold(
"icp-jt-total-percent-threshold",
cl::desc("The percentage threshold against total count for the promotion for "
"jump tables"),
cl::init(5),
cl::ZeroOrMore,
cl::Hidden,
cl::cat(BoltOptCategory));
static cl::opt<unsigned>
ICPCallsRemainingPercentThreshold(
"icp-calls-remaining-percent-threshold",
cl::desc("The percentage threshold against remaining unpromoted indirect "
"call count for the promotion for calls"),
cl::init(50),
cl::ZeroOrMore,
cl::Hidden,
cl::cat(BoltOptCategory));
static cl::opt<unsigned>
ICPCallsTotalPercentThreshold(
"icp-calls-total-percent-threshold",
cl::desc("The percentage threshold against total count for the promotion for "
"calls"),
cl::init(30),
cl::ZeroOrMore,
cl::Hidden,
cl::cat(BoltOptCategory));
static cl::opt<unsigned>
@ -84,7 +52,7 @@ IndirectCallPromotionMispredictThreshold(
"indirect-call-promotion-mispredict-threshold",
cl::desc("misprediction threshold for skipping ICP on an "
"indirect call"),
cl::init(0),
cl::init(2),
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
@ -101,17 +69,17 @@ IndirectCallPromotionUseMispredicts(
static cl::opt<unsigned>
IndirectCallPromotionTopN(
"indirect-call-promotion-topn",
cl::desc("limit number of targets to consider when doing indirect "
"call promotion. 0 = no limit"),
cl::init(3),
cl::desc("number of targets to consider when doing indirect "
"call promotion"),
cl::init(1),
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
static cl::opt<unsigned>
IndirectCallPromotionCallsTopN(
"indirect-call-promotion-calls-topn",
cl::desc("limit number of targets to consider when doing indirect "
"call promotion on calls. 0 = no limit"),
cl::desc("number of targets to consider when doing indirect "
"call promotion on calls"),
cl::init(0),
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
@ -119,8 +87,8 @@ IndirectCallPromotionCallsTopN(
static cl::opt<unsigned>
IndirectCallPromotionJumpTablesTopN(
"indirect-call-promotion-jump-tables-topn",
cl::desc("limit number of targets to consider when doing indirect "
"call promotion on jump tables. 0 = no limit"),
cl::desc("number of targets to consider when doing indirect "
"call promotion on jump tables"),
cl::init(0),
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
@ -138,8 +106,8 @@ static cl::opt<unsigned>
ICPTopCallsites(
"icp-top-callsites",
cl::desc("only optimize calls that contribute to this percentage of all "
"indirect calls. 0 = all callsites"),
cl::init(99),
"indirect calls"),
cl::init(0),
cl::Hidden,
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
@ -213,42 +181,6 @@ IndirectCallPromotion::Callsite::Callsite(BinaryFunction &BF,
}
}
/// Print a summary of the ICP decision for one call site to \p OS.
///
/// \param OS      stream the report is written to.
/// \param Targets call targets of the call site, printed in the given order.
/// \param N       number of promoted entries; a target is flagged as
///                "to be optimized" while the running entry count is below N.
///                A target with jump table indices counts once per index,
///                any other target counts once.
void IndirectCallPromotion::printDecision(
    llvm::raw_ostream &OS,
    std::vector<IndirectCallPromotion::Callsite> &Targets, unsigned N) const {
  uint64_t BranchSum = 0;
  uint64_t MispredSum = 0;
  for (const auto &Site : Targets) {
    BranchSum += Site.Branches;
    MispredSum += Site.Mispreds;
  }

  // Fall back to 1 for empty totals so the percentages below never divide by
  // zero. Note that the adjusted totals are also what gets printed.
  const uint64_t TotalCount = BranchSum ? BranchSum : 1;
  const uint64_t TotalMispreds = MispredSum ? MispredSum : 1;

  OS << "BOLT-INFO: ICP decision for call site with " << Targets.size()
     << " targets, Count = " << TotalCount << ", Mispreds = " << TotalMispreds
     << "\n";

  size_t EntriesSeen = 0;
  for (const auto &Site : Targets) {
    const double CountPercent = (100.0 * Site.Branches) / TotalCount;
    const double MispredPercent = (100.0 * Site.Mispreds) / TotalMispreds;

    OS << "Count = " << Site.Branches << ", "
       << format("%.1f", CountPercent) << ", "
       << "Mispreds = " << Site.Mispreds << ", "
       << format("%.1f", MispredPercent);

    if (EntriesSeen < N)
      OS << " * to be optimized *";

    const bool HasIndices = !Site.JTIndices.empty();
    if (HasIndices) {
      OS << " Indices:";
      for (const auto Idx : Site.JTIndices)
        OS << " " << Idx;
    }
    OS << "\n";

    EntriesSeen += HasIndices ? Site.JTIndices.size() : 1;
  }
}
// Get list of targets for a given call sorted by most frequently
// called first.
std::vector<IndirectCallPromotion::Callsite>
@ -310,8 +242,7 @@ IndirectCallPromotion::getCallTargets(
auto &A = *Result;
const auto &B = *First;
if (A.To.Sym && B.To.Sym && A.To.Sym == B.To.Sym) {
A.JTIndices.insert(A.JTIndices.end(), B.JTIndices.begin(),
B.JTIndices.end());
A.JTIndex.insert(A.JTIndex.end(), B.JTIndex.begin(), B.JTIndex.end());
} else {
*(++Result) = *First;
}
@ -341,17 +272,10 @@ IndirectCallPromotion::getCallTargets(
}
}
// Sort by target count, number of indices in case of jump table, and
// mispredicts. We prioritize targets with high count, small number of
// indices and high mispredicts
// Sort by most commonly called targets.
std::stable_sort(Targets.begin(), Targets.end(),
[](const Callsite &A, const Callsite &B) {
if (A.Branches != B.Branches)
return A.Branches > B.Branches;
else if (A.JTIndices.size() != B.JTIndices.size())
return A.JTIndices.size() < B.JTIndices.size();
else
return A.Mispreds > B.Mispreds;
return A.Branches > B.Branches;
});
// Remove non-symbol targets
@ -456,9 +380,9 @@ IndirectCallPromotion::maybeGetHotJumpTableTargets(
uint64_t ArrayStart;
if (DispExpr) {
auto DispValueOrError = BC.getSymbolValue(DispExpr->getSymbol());
assert(DispValueOrError && "global symbol needs a value");
ArrayStart = *DispValueOrError;
auto *BD = BC.getBinaryDataByName(DispExpr->getSymbol().getName());
assert(BD && "global symbol needs a value");
ArrayStart = BD->getAddress();
} else {
ArrayStart = static_cast<uint64_t>(DispValue);
}
@ -567,7 +491,7 @@ IndirectCallPromotion::SymTargetsType
IndirectCallPromotion::findCallTargetSymbols(
BinaryContext &BC,
std::vector<Callsite> &Targets,
size_t &N,
const size_t N,
BinaryFunction &Function,
BinaryBasicBlock *BB,
MCInst &CallInst,
@ -587,7 +511,7 @@ IndirectCallPromotion::findCallTargetSymbols(
if (!HotTargets.empty()) {
auto findTargetsIndex = [&](uint64_t JTIndex) {
for (size_t I = 0; I < Targets.size(); ++I) {
auto &JTIs = Targets[I].JTIndices;
auto &JTIs = Targets[I].JTIndex;
if (std::find(JTIs.begin(), JTIs.end(), JTIndex) != JTIs.end())
return I;
}
@ -597,81 +521,35 @@ IndirectCallPromotion::findCallTargetSymbols(
"callsite");
};
const auto MaxHotTargets = std::min(N, HotTargets.size());
if (opts::Verbosity >= 1) {
for (size_t I = 0; I < HotTargets.size(); ++I) {
for (size_t I = 0; I < MaxHotTargets; ++I) {
outs() << "BOLT-INFO: HotTarget[" << I << "] = ("
<< HotTargets[I].first << ", " << HotTargets[I].second << ")\n";
}
}
// Recompute hottest targets, now discriminating which index is hot
// NOTE: This is a tradeoff. On one hand, we get index information. On the
// other hand, info coming from the memory profile is much less accurate
// than LBRs. So we may actually end up working with more coarse
// profile granularity in exchange for information about indices.
std::vector<Callsite> NewTargets;
std::map<const MCSymbol *, uint32_t> IndicesPerTarget;
uint64_t TotalMemAccesses = 0;
for (size_t I = 0; I < HotTargets.size(); ++I) {
const auto TargetIndex = findTargetsIndex(HotTargets[I].second);
++IndicesPerTarget[Targets[TargetIndex].To.Sym];
TotalMemAccesses += HotTargets[I].first;
}
uint64_t RemainingMemAccesses = TotalMemAccesses;
const size_t TopN = opts::IndirectCallPromotionJumpTablesTopN != 0
? opts::IndirectCallPromotionTopN
: opts::IndirectCallPromotionTopN;
size_t I{0};
for (; I < HotTargets.size(); ++I) {
const auto MemAccesses = HotTargets[I].first;
if (100 * MemAccesses <
TotalMemAccesses * opts::ICPJTTotalPercentThreshold)
break;
if (100 * MemAccesses <
RemainingMemAccesses * opts::ICPJTRemainingPercentThreshold)
break;
if (TopN && I >= TopN)
break;
RemainingMemAccesses -= MemAccesses;
for (size_t I = 0; I < MaxHotTargets; ++I) {
const auto JTIndex = HotTargets[I].second;
auto &Target = Targets[findTargetsIndex(JTIndex)];
const auto TargetIndex = findTargetsIndex(JTIndex);
NewTargets.push_back(Target);
std::vector<uint64_t>({JTIndex}).swap(NewTargets.back().JTIndices);
Target.JTIndices.erase(std::remove(Target.JTIndices.begin(),
Target.JTIndices.end(), JTIndex),
Target.JTIndices.end());
NewTargets.push_back(Targets[TargetIndex]);
std::vector<uint64_t>({JTIndex}).swap(NewTargets.back().JTIndex);
// Keep fixCFG counts sane if more indices use this same target later
assert(IndicesPerTarget[Target.To.Sym] > 0 && "wrong map");
NewTargets.back().Branches =
Target.Branches / IndicesPerTarget[Target.To.Sym];
NewTargets.back().Mispreds =
Target.Mispreds / IndicesPerTarget[Target.To.Sym];
assert(Target.Branches >= NewTargets.back().Branches);
assert(Target.Mispreds >= NewTargets.back().Mispreds);
Target.Branches -= NewTargets.back().Branches;
Target.Mispreds -= NewTargets.back().Mispreds;
Targets.erase(Targets.begin() + TargetIndex);
}
std::copy(Targets.begin(), Targets.end(), std::back_inserter(NewTargets));
assert(NewTargets.size() == Targets.size() + MaxHotTargets);
std::swap(NewTargets, Targets);
N = I;
if (N == 0 && opts::Verbosity >= 1) {
outs() << "BOLT-INFO: ICP failed in " << Function << " in "
<< BB->getName()
<< ": failed to meet thresholds after memory profile data was "
"loaded.\n";
return SymTargets;
}
}
for (size_t I = 0, TgtIdx = 0; I < N; ++TgtIdx) {
auto &Target = Targets[TgtIdx];
assert(Target.To.Sym && "All ICP targets must be to known symbols");
assert(!Target.JTIndices.empty() && "Jump tables must have indices");
for (auto Idx : Target.JTIndices) {
assert(!Target.JTIndex.empty() && "Jump tables must have indices");
for (auto Idx : Target.JTIndex) {
SymTargets.push_back(std::make_pair(Target.To.Sym, Idx));
++I;
}
@ -680,7 +558,7 @@ IndirectCallPromotion::findCallTargetSymbols(
for (size_t I = 0; I < N; ++I) {
assert(Targets[I].To.Sym &&
"All ICP targets must be to known symbols");
assert(Targets[I].JTIndices.empty() &&
assert(Targets[I].JTIndex.empty() &&
"Can't have jump table indices for non-jump tables");
SymTargets.push_back(std::make_pair(Targets[I].To.Sym, 0));
}
@ -769,7 +647,7 @@ IndirectCallPromotion::maybeGetVtableSyms(
<< "+" << MethodOffset << "/" << MI.Count
<< "\n");
if (auto MethodAddr = BC.getPointerAtAddress(Address)) {
if (auto MethodAddr = BC.extractPointerAtAddress(Address)) {
auto *MethodBD = BC.getBinaryDataAtAddress(MethodAddr.get());
if (!MethodBD) // skip unknown methods
continue;
@ -819,7 +697,7 @@ IndirectCallPromotion::rewriteCall(
BinaryFunction &Function,
BinaryBasicBlock *IndCallBlock,
const MCInst &CallInst,
MCPlusBuilder::BlocksVectorTy &&ICPcode,
MCPlusBuilder::ICPdata &&ICPcode,
const std::vector<MCInst *> &MethodFetchInsns
) const {
// Create new basic blocks with correct code in each one first.
@ -842,10 +720,6 @@ IndirectCallPromotion::rewriteCall(
}
auto MovedInst = IndCallBlock->splitInstructions(&CallInst);
// Link new BBs to the original input offset of the BB where the indirect
// call site is, so we can map samples recorded in new BBs back to the
// original BB seen in the input binary (if using BAT)
const auto OrigOffset = IndCallBlock->getInputOffset();
IndCallBlock->eraseInstructions(MethodFetchInsns.begin(),
MethodFetchInsns.end());
@ -863,7 +737,7 @@ IndirectCallPromotion::rewriteCall(
auto &Sym = Itr->first;
auto &Insts = Itr->second;
assert(Sym);
auto TBB = Function.createBasicBlock(OrigOffset, Sym);
auto TBB = Function.createBasicBlock(0, Sym);
for (auto &Inst : Insts) { // sanitize new instructions.
if (BC.MIB->isCall(Inst))
BC.MIB->removeAnnotation(Inst, "CallProfile");
@ -900,12 +774,10 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG(
for (const auto &Target : Targets) {
TotalIndirectBranches += Target.Branches;
}
if (TotalIndirectBranches == 0)
TotalIndirectBranches = 1;
std::vector<BinaryBranchInfo> BBI;
std::vector<BinaryBranchInfo> ScaledBBI;
for (const auto &Target : Targets) {
const auto NumEntries = std::max(1UL, Target.JTIndices.size());
const auto NumEntries = std::max(1UL, Target.JTIndex.size());
for (size_t I = 0; I < NumEntries; ++I) {
BBI.push_back(
BinaryBranchInfo{(Target.Branches + NumEntries - 1) / NumEntries,
@ -924,7 +796,7 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG(
std::vector<MCSymbol*> SymTargets;
for (const auto &Target : Targets) {
const auto NumEntries = std::max(1UL, Target.JTIndices.size());
const auto NumEntries = std::max(1UL, Target.JTIndex.size());
for (size_t I = 0; I < NumEntries; ++I) {
SymTargets.push_back(Target.To.Sym);
}
@ -1052,12 +924,15 @@ IndirectCallPromotion::canPromoteCallsite(const BinaryBasicBlock *BB,
} else if (opts::IndirectCallPromotionCallsTopN != 0) {
TopN = opts::IndirectCallPromotionCallsTopN;
}
const auto TrialN = TopN ? std::min(TopN, Targets.size()) : Targets.size();
const auto TrialN = std::min(TopN, Targets.size());
if (opts::ICPTopCallsites > 0) {
auto &BC = BB->getFunction()->getBinaryContext();
if (!BC.MIB->hasAnnotation(Inst, "DoICP"))
return 0;
if (BC.MIB->hasAnnotation(Inst, "DoICP")) {
computeStats(TrialN);
return TrialN;
}
return 0;
}
// Pick the top N targets.
@ -1099,28 +974,35 @@ IndirectCallPromotion::canPromoteCallsite(const BinaryBasicBlock *BB,
// Count total number of calls for (at most) the top N targets.
// We may choose a smaller N (TrialN vs. N) if the frequency threshold
// is exceeded by fewer targets.
const unsigned TotalThreshold = IsJumpTable
? opts::ICPJTTotalPercentThreshold
: opts::ICPCallsTotalPercentThreshold;
const unsigned RemainingThreshold =
IsJumpTable ? opts::ICPJTRemainingPercentThreshold
: opts::ICPCallsRemainingPercentThreshold;
uint64_t NumRemainingCalls = NumCalls;
for (size_t I = 0; I < TrialN; ++I, ++MaxTargets) {
if (100 * Targets[I].Branches < NumCalls * TotalThreshold)
break;
if (100 * Targets[I].Branches < NumRemainingCalls * RemainingThreshold)
break;
if (N + (Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size()) >
double Threshold = double(opts::IndirectCallPromotionThreshold);
for (size_t I = 0; I < TrialN && Threshold > 0; ++I, ++MaxTargets) {
if (N + (Targets[I].JTIndex.empty() ? 1 : Targets[I].JTIndex.size()) >
TrialN)
break;
TotalCallsTopN += Targets[I].Branches;
TotalMispredictsTopN += Targets[I].Mispreds;
NumRemainingCalls -= Targets[I].Branches;
N += Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size();
Threshold -= (100.0 * Targets[I].Branches) / NumCalls;
N += Targets[I].JTIndex.empty() ? 1 : Targets[I].JTIndex.size();
}
computeStats(MaxTargets);
// Compute the frequency of the top N call targets. If this frequency
// is greater than the threshold, we should try ICP on this callsite.
const double TopNFrequency = (100.0 * TotalCallsTopN) / NumCalls;
if (TopNFrequency == 0 ||
TopNFrequency < opts::IndirectCallPromotionThreshold) {
if (opts::Verbosity >= 1) {
const auto InstIdx = &Inst - &(*BB->begin());
outs() << "BOLT-INFO: ICP failed in " << *BB->getFunction() << " @ "
<< InstIdx << " in " << BB->getName() << ", calls = "
<< NumCalls << ", top N frequency "
<< format("%.1f", TopNFrequency) << "% < "
<< opts::IndirectCallPromotionThreshold << "%\n";
}
return 0;
}
// Don't check misprediction frequency for jump tables -- we don't really
// care as long as we are saving loads from the jump table.
if (!IsJumpTable || opts::ICPJumpTablesByTarget) {
@ -1187,7 +1069,7 @@ IndirectCallPromotion::printCallsiteInfo(const BinaryBasicBlock *BB,
<< ", taken freq = " << format("%.1f", Frequency) << "%"
<< ", mis. freq = " << format("%.1f", MisFrequency) << "%";
bool First = true;
for (auto JTIndex : Targets[I].JTIndices) {
for (auto JTIndex : Targets[I].JTIndex) {
outs() << (First ? ", indices = " : ", ") << JTIndex;
First = false;
}
@ -1200,12 +1082,14 @@ IndirectCallPromotion::printCallsiteInfo(const BinaryBasicBlock *BB,
});
}
void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
void IndirectCallPromotion::runOnFunctions(
BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions
) {
if (opts::IndirectCallPromotion == ICP_NONE)
return;
auto &BFs = BC.getBinaryFunctions();
const bool OptimizeCalls =
(opts::IndirectCallPromotion == ICP_CALLS ||
opts::IndirectCallPromotion == ICP_ALL);
@ -1216,7 +1100,7 @@ void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
std::unique_ptr<RegAnalysis> RA;
std::unique_ptr<BinaryFunctionCallGraph> CG;
if (OptimizeJumpTables) {
CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC)));
CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC, BFs)));
RA.reset(new RegAnalysis(BC, &BFs, &*CG));
}
@ -1264,13 +1148,8 @@ void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
// If icp-top-callsites is enabled, compute the total number of indirect
// calls and then optimize the hottest callsites that contribute to that
// total.
SetVector<BinaryFunction *> Functions;
if (opts::ICPTopCallsites == 0) {
for (auto &KV : BFs) {
Functions.insert(&KV.second);
}
} else {
using IndirectCallsite = std::tuple<uint64_t, MCInst *, BinaryFunction *>;
if (opts::ICPTopCallsites > 0) {
using IndirectCallsite = std::pair<uint64_t, MCInst *>;
std::vector<IndirectCallsite> IndirectCalls;
size_t TotalIndirectCalls = 0;
@ -1304,7 +1183,7 @@ void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
NumCalls += BInfo.Branches;
}
IndirectCalls.push_back(std::make_tuple(NumCalls, &Inst, &Function));
IndirectCalls.push_back(std::make_pair(NumCalls, &Inst));
TotalIndirectCalls += NumCalls;
}
}
@ -1319,25 +1198,30 @@ void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
const float TopPerc = opts::ICPTopCallsites / 100.0f;
int64_t MaxCalls = TotalIndirectCalls * TopPerc;
size_t Num = 0;
for (const auto &IC : IndirectCalls) {
for (auto &IC : IndirectCalls) {
if (MaxCalls <= 0)
break;
MaxCalls -= std::get<0>(IC);
BC.MIB->addAnnotation(*std::get<1>(IC), "DoICP", true);
Functions.insert(std::get<2>(IC));
MaxCalls -= IC.first;
++Num;
}
outs() << "BOLT-INFO: ICP Total indirect calls = " << TotalIndirectCalls
<< ", " << Num << " callsites cover " << opts::ICPTopCallsites
<< "% of all indirect calls\n";
// Mark sites to optimize with "DoICP" annotation.
for (size_t I = 0; I < Num; ++I) {
auto *Inst = IndirectCalls[I].second;
BC.MIB->addAnnotation(*Inst, "DoICP", true);
}
}
for (auto *FuncPtr : Functions) {
auto &Function = *FuncPtr;
for (auto &BFIt : BFs) {
auto &Function = BFIt.second;
if (!Function.isSimple() ||
!opts::shouldProcess(Function) ||
!Function.hasProfile())
if (!Function.isSimple() || !opts::shouldProcess(Function))
continue;
if (!Function.hasProfile())
continue;
const bool HasLayout = !Function.layout_empty();
@ -1425,10 +1309,7 @@ void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
// this callsite.
size_t N = canPromoteCallsite(BB, Inst, Targets, NumCalls);
// If it is a jump table and it failed to meet our initial threshold,
// proceed to findCallTargetSymbols -- it may reevaluate N if
// memory profile is present
if (!N && !IsJumpTable)
if (!N)
continue;
if (opts::Verbosity >= 1) {
@ -1445,13 +1326,6 @@ void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
Inst,
TargetFetchInst);
// findCallTargetSymbols may have changed N if mem profile is available
// for jump tables
if (!N)
continue;
DEBUG(printDecision(dbgs(), Targets, N));
// If we can't resolve any of the target symbols, punt on this callsite.
// TODO: can this ever happen?
if (SymTargets.size() < N) {
@ -1572,12 +1446,12 @@ void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
<< "BOLT-INFO: ICP percentage of indirect calls that can be "
"optimized = "
<< format("%.1f", (100.0 * TotalNumFrequentCalls) /
std::max<size_t>(TotalIndirectCalls, 1))
std::max(TotalIndirectCalls, 1ul))
<< "%\n"
<< "BOLT-INFO: ICP percentage of indirect callsites that are "
"optimized = "
<< format("%.1f", (100.0 * TotalOptimizedIndirectCallsites) /
std::max<uint64_t>(TotalIndirectCallsites, 1))
std::max(TotalIndirectCallsites, 1ul))
<< "%\n"
<< "BOLT-INFO: ICP number of method load elimination candidates = "
<< TotalMethodLoadEliminationCandidates
@ -1585,17 +1459,17 @@ void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
<< "BOLT-INFO: ICP percentage of method calls candidates that have "
"loads eliminated = "
<< format("%.1f", (100.0 * TotalMethodLoadsEliminated) /
std::max<uint64_t>(TotalMethodLoadEliminationCandidates, 1))
std::max(TotalMethodLoadEliminationCandidates, 1ul))
<< "%\n"
<< "BOLT-INFO: ICP percentage of indirect branches that are "
"optimized = "
<< format("%.1f", (100.0 * TotalNumFrequentJmps) /
std::max<uint64_t>(TotalIndirectJmps, 1))
std::max(TotalIndirectJmps, 1ul))
<< "%\n"
<< "BOLT-INFO: ICP percentage of jump table callsites that are "
<< "optimized = "
<< format("%.1f", (100.0 * TotalOptimizedJumpTableCallsites) /
std::max<uint64_t>(TotalJumpTableCallsites, 1))
std::max(TotalJumpTableCallsites, 1ul))
<< "%\n"
<< "BOLT-INFO: ICP number of jump table callsites that can use hot "
<< "indices = " << TotalIndexBasedCandidates
@ -1603,7 +1477,7 @@ void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
<< "BOLT-INFO: ICP percentage of jump table callsites that use hot "
"indices = "
<< format("%.1f", (100.0 * TotalIndexBasedJumps) /
std::max<uint64_t>(TotalIndexBasedCandidates, 1))
std::max(TotalIndexBasedCandidates, 1ul))
<< "%\n";
#ifndef NDEBUG

View File

@ -119,7 +119,7 @@ class IndirectCallPromotion : public BinaryFunctionPass {
uint64_t Mispreds{0};
uint64_t Branches{0};
// Indices in the jmp table (jt only)
std::vector<uint64_t> JTIndices;
std::vector<uint64_t> JTIndex;
bool isValid() const {
return From.isValid() && To.isValid();
}
@ -128,7 +128,7 @@ class IndirectCallPromotion : public BinaryFunctionPass {
uint64_t Mispreds, uint64_t Branches,
uint64_t JTIndex)
: From(From), To(To), Mispreds(Mispreds), Branches(Branches),
JTIndices(1, JTIndex) { }
JTIndex(1, JTIndex) { }
};
std::unordered_set<const BinaryFunction *> Modified;
@ -177,10 +177,6 @@ class IndirectCallPromotion : public BinaryFunctionPass {
// Total number of jump table sites that use hot indices.
uint64_t TotalIndexBasedJumps{0};
void printDecision(llvm::raw_ostream &OS,
std::vector<IndirectCallPromotion::Callsite> &Targets,
unsigned N) const;
std::vector<Callsite> getCallTargets(BinaryBasicBlock &BB,
const MCInst &Inst) const;
@ -205,7 +201,7 @@ class IndirectCallPromotion : public BinaryFunctionPass {
SymTargetsType findCallTargetSymbols(BinaryContext &BC,
std::vector<Callsite> &Targets,
size_t &N,
const size_t N,
BinaryFunction &Function,
BinaryBasicBlock *BB,
MCInst &Inst,
@ -222,7 +218,7 @@ class IndirectCallPromotion : public BinaryFunctionPass {
BinaryFunction &Function,
BinaryBasicBlock *IndCallBlock,
const MCInst &CallInst,
MCPlusBuilder::BlocksVectorTy &&ICPcode,
MCPlusBuilder::ICPdata &&ICPcode,
const std::vector<MCInst *> &MethodFetchInsns) const;
BinaryBasicBlock *fixCFG(BinaryContext &BC,
@ -243,7 +239,9 @@ class IndirectCallPromotion : public BinaryFunctionPass {
bool shouldPrint(const BinaryFunction &BF) const override {
return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
}
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
} // namespace bolt

View File

@ -180,9 +180,6 @@ Inliner::InliningInfo Inliner::getInliningInfo(const BinaryFunction &BF) const {
// Perform necessary checks unless the option overrides it.
if (!opts::mustConsider(BF)) {
if (BF.hasSDTMarker())
return INL_NONE;
if (BF.hasEHRanges())
return INL_NONE;
@ -251,8 +248,9 @@ Inliner::InliningInfo Inliner::getInliningInfo(const BinaryFunction &BF) const {
}
void
Inliner::findInliningCandidates(BinaryContext &BC) {
for (const auto &BFI : BC.getBinaryFunctions()) {
Inliner::findInliningCandidates(BinaryContext &BC,
const std::map<uint64_t, BinaryFunction> &BFs) {
for (const auto &BFI : BFs) {
const auto &Function = BFI.second;
const auto InlInfo = getInliningInfo(Function);
if (InlInfo.Type != INL_NONE)
@ -534,14 +532,16 @@ bool Inliner::inlineCallsInFunction(BinaryFunction &Function) {
return DidInlining;
}
void Inliner::runOnFunctions(BinaryContext &BC) {
void Inliner::runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &) {
opts::syncOptions();
if (!opts::inliningEnabled())
return;
uint64_t TotalSize = 0;
for (auto &BFI : BC.getBinaryFunctions())
for (auto &BFI : BFs)
TotalSize += BFI.second.getSize();
bool InlinedOnce;
@ -553,10 +553,10 @@ void Inliner::runOnFunctions(BinaryContext &BC) {
InlinedOnce = false;
InliningCandidates.clear();
findInliningCandidates(BC);
findInliningCandidates(BC, BFs);
std::vector<BinaryFunction *> ConsideredFunctions;
for (auto &BFI : BC.getBinaryFunctions()) {
for (auto &BFI : BFs) {
auto &Function = BFI.second;
if (!shouldOptimize(Function))
continue;

View File

@ -39,7 +39,7 @@ private:
: Type(Type)
{}
};
std::unordered_map<const BinaryFunction *, InliningInfo> InliningCandidates;
/// Count total amount of bytes inlined for all instances of Inliner.
@ -58,7 +58,7 @@ private:
/// Size in bytes of a tail call instruction.
static uint64_t SizeOfTailCallInst;
/// Set of functions modified by inlining (used for printing).
std::unordered_set<const BinaryFunction *> Modified;
@ -68,7 +68,8 @@ private:
/// Return the size in bytes of a tail call instruction.
uint64_t getSizeOfTailCallInst(const BinaryContext &BC);
void findInliningCandidates(BinaryContext &BC);
void findInliningCandidates(BinaryContext &BC,
const std::map<uint64_t, BinaryFunction> &BFs);
bool inlineCallsInFunction(BinaryFunction &Function);
@ -96,7 +97,9 @@ public:
return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
}
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
} // namespace bolt

View File

@ -1,314 +0,0 @@
//===--- Passes/Instrumentation.cpp ---------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#include "Instrumentation.h"
#include "Passes/DataflowInfoManager.h"
#include "llvm/Support/Options.h"
#define DEBUG_TYPE "bolt-instrumentation"
using namespace llvm;
// Command-line options controlling the instrumentation pass.
namespace opts {
// Declared here, defined in another translation unit.
extern cl::OptionCategory BoltCategory;
extern bool shouldProcess(const llvm::bolt::BinaryFunction &Function);
// Path of the profile file. Instrumentation::emit() embeds this string in the
// output binary under the __bolt_instr_filename symbol.
cl::opt<std::string> InstrumentationFilename(
"instrumentation-file",
cl::desc("file name where instrumented profile will be saved"),
cl::init("/tmp/prof.fdata"),
cl::Optional,
cl::cat(BoltCategory));
// When set, Instrumentation::runOnFunctions() skips every function whose
// getKnownExecutionCount() is zero.
cl::opt<bool> InstrumentHotOnly(
"instrument-hot-only",
cl::desc("only insert instrumentation on hot functions (need profile)"),
cl::init(false),
cl::Optional,
cl::cat(BoltCategory));
} // namespace opts
namespace llvm {
namespace bolt {
/// Return the offset inside StringTable at which the name of \p Function
/// starts. The first request for a function appends its first name followed by
/// a NUL terminator to StringTable and caches the offset; later requests are
/// served from the cache.
uint32_t Instrumentation::getFunctionNameIndex(const BinaryFunction &Function) {
  const auto Cached = FuncToStringIdx.find(&Function);
  if (Cached == FuncToStringIdx.end()) {
    // Not seen before: the new entry begins at the current end of the table.
    const auto Offset = StringTable.size();
    FuncToStringIdx.emplace(std::make_pair(&Function, Offset));
    StringTable.append(Function.getNames()[0]);
    StringTable.append(1, '\0');
    return Offset;
  }
  return Cached->second;
}
/// Build the record that identifies one instrumented edge: source function
/// name index and offset, then destination function name index and offset.
/// Function names are interned in the string table on demand (source first,
/// destination second).
Instrumentation::CounterDescription Instrumentation::createDescription(
    const BinaryFunction &FromFunction, uint32_t From,
    const BinaryFunction &ToFunction, uint32_t To) {
  // Intern the names in a fixed order so the string table layout is stable.
  const uint32_t FromNameIdx = getFunctionNameIndex(FromFunction);
  const uint32_t ToNameIdx = getFunctionNameIndex(ToFunction);
  // Field order: FromFuncStringIdx, FromOffset, ToFuncStringIdx, ToOffset.
  return CounterDescription{FromNameIdx, From, ToNameIdx, To};
}
/// Create the five-instruction sequence that bumps the counter associated with
/// the edge \p FromFunction + \p FromOffset -> \p ToFunc + \p ToOffset.
///
/// Every call appends exactly one entry to Descriptions and one entry to
/// Labels, so the two vectors stay index-aligned.
std::vector<MCInst> Instrumentation::createInstrumentationSnippet(
    BinaryFunction &FromFunction, uint32_t FromOffset, BinaryFunction &ToFunc,
    uint32_t ToOffset) {
  // Bookkeeping first: describe the edge, then allocate the counter's label.
  Descriptions.emplace_back(
      createDescription(FromFunction, FromOffset, ToFunc, ToOffset));

  BinaryContext &BC = FromFunction.getBinaryContext();
  MCSymbol *CounterLabel = BC.Ctx->createTempSymbol("InstrEntry", true);
  Labels.emplace_back(CounterLabel);

  auto &MIB = *BC.MIB;
  std::vector<MCInst> Snippet(5);
  // Don't clobber application red zone (ABI dependent): move the stack
  // pointer by 128 bytes around the flags save/restore.
  MIB.createStackPointerIncrement(Snippet[0], 128, /*NoFlagsClobber=*/true);
  MIB.createPushFlags(Snippet[1], 2);
  // The actual counter update, addressed through the label created above.
  MIB.createIncMemory(Snippet[2], CounterLabel, &*BC.Ctx);
  MIB.createPopFlags(Snippet[3], 2);
  MIB.createStackPointerDecrement(Snippet[4], 128, /*NoFlagsClobber=*/true);
  return Snippet;
}
/// Place a counter for the edge leaving \p FromBB at \p Iter towards
/// \p ToFunc + \p ToOffset. \p TargetBB is the destination block for local
/// control flow and null otherwise.
///
/// Placement policy, in order:
///   1. No target block: only calls are instrumented, right before the call.
///   2. Target has a single predecessor, is not \p FromBB and is not an entry
///      point: counter goes at the top of the target block.
///   3. \p FromBB has a single successor that is not itself: counter goes in
///      \p FromBB before \p Iter.
///   4. Otherwise the edge is queued in SplitWorklist/SplitInstrs so the
///      caller can split it later.
///
/// \p Iter is advanced past any instructions inserted into \p FromBB.
/// Returns false only when there is no target block and the instruction is
/// not a call.
///
/// NOTE(review): the snippet, and with it a Descriptions/Labels entry, is
/// created before the placement decision, so the false-return path still
/// reserves a counter -- confirm this is acceptable.
bool Instrumentation::instrumentOneTarget(BinaryBasicBlock::iterator &Iter,
                                          BinaryFunction &FromFunction,
                                          BinaryBasicBlock &FromBB,
                                          uint32_t From, BinaryFunction &ToFunc,
                                          BinaryBasicBlock *TargetBB,
                                          uint32_t ToOffset) {
  std::vector<MCInst> Snippet =
      createInstrumentationSnippet(FromFunction, From, ToFunc, ToOffset);

  // Insert the whole snippet into BB in front of At, stepping At over each
  // newly inserted instruction.
  const auto InsertBefore = [&Snippet](BinaryBasicBlock &BB,
                                       BinaryBasicBlock::iterator &At) {
    for (auto &NewInst : Snippet) {
      At = BB.insertInstruction(At, NewInst);
      ++At;
    }
  };

  if (!TargetBB) {
    // Without a local target only calls can be handled.
    BinaryContext &BC = FromFunction.getBinaryContext();
    if (!BC.MIB->isCall(*Iter))
      return false;
    InsertBefore(FromBB, Iter);
    return true;
  }

  // Indirect branch, conditional branches or fall-throughs.
  // Regular cond branch, put counter at start of target block.
  const bool TargetOwnedByEdge = TargetBB->pred_size() == 1 &&
                                 &FromBB != TargetBB &&
                                 !TargetBB->isEntryPoint();
  if (TargetOwnedByEdge) {
    BinaryBasicBlock::iterator TargetPos = TargetBB->begin();
    InsertBefore(*TargetBB, TargetPos);
    return true;
  }

  if (FromBB.succ_size() == 1 && &FromBB != TargetBB) {
    InsertBefore(FromBB, Iter);
    return true;
  }

  // Critical edge, create BB and put counter there (done by the caller once
  // the whole function has been scanned).
  SplitWorklist.emplace_back(std::make_pair(&FromBB, TargetBB));
  SplitInstrs.emplace_back(std::move(Snippet));
  return true;
}
/// First step of instrumentation: insert counter-increment snippets into every
/// eligible function of \p BC.
///
/// Does nothing unless the binary is x86. Registers the
/// ".bolt.instr.counters" data section and an (initially empty)
/// ".bolt.instr.tables" note section, then for each function that is simple,
/// passes opts::shouldProcess() and (with -instrument-hot-only) has a non-zero
/// known execution count:
///   * instruments calls, conditional/unconditional branches and jump-table
///     branches that carry an "Offset" annotation;
///   * instruments fall-through edges of blocks that end without an
///     unconditional branch or jump table;
///   * splits the critical edges queued by instrumentOneTarget() and places
///     the pending snippets in the new blocks.
/// Prints the number of instrumented sites when done.
void Instrumentation::runOnFunctions(BinaryContext &BC) {
if (!BC.isX86())
return;
const auto Flags = BinarySection::getFlags(/*IsReadOnly=*/false,
/*IsText=*/false,
/*IsAllocatable=*/true);
// Writable, allocatable section that will hold the counters; its contents
// are produced later by emit().
BC.registerOrUpdateSection(".bolt.instr.counters", ELF::SHT_PROGBITS, Flags,
nullptr, 0, 1,
/*local=*/true);
// Placeholder note section; emitTablesAsELFNote() re-registers it with data.
BC.registerOrUpdateNoteSection(".bolt.instr.tables", nullptr,
0,
/*Alignment=*/1,
/*IsReadOnly=*/true, ELF::SHT_NOTE);
uint64_t InstrumentationSites{0ULL};
// NOTE(review): this counter is never incremented in this function, so the
// final report always prints 0 for it.
uint64_t InstrumentationSitesSavingFlags{0ULL};
for (auto &BFI : BC.getBinaryFunctions()) {
BinaryFunction &Function = BFI.second;
if (!Function.isSimple() || !opts::shouldProcess(Function)
|| (opts::InstrumentHotOnly && !Function.getKnownExecutionCount()))
continue;
Function.disambiguateJumpTables();
// The split worklist is per function: SplitWorklist[i] pairs with
// SplitInstrs[i].
SplitWorklist.clear();
SplitInstrs.clear();
for (auto BBI = Function.begin(); BBI != Function.end(); ++BBI) {
auto &BB{*BBI};
bool HasUnconditionalBranch{false};
bool HasJumpTable{false};
// instrumentOneTarget() may insert instructions into BB and advances I past
// them, so I still designates the original instruction afterwards.
for (auto I = BB.begin(); I != BB.end(); ++I) {
const auto &Inst = *I;
// Only instructions with a recorded "Offset" annotation are considered.
if (!BC.MIB->hasAnnotation(Inst, "Offset"))
continue;
const bool IsJumpTable = Function.getJumpTable(Inst);
// Classify the instruction; anything that is neither a jump table branch,
// an unconditional branch, a call nor a conditional branch -- or that is an
// unsupported branch -- is skipped.
if (IsJumpTable)
HasJumpTable = true;
else if (BC.MIB->isUnconditionalBranch(Inst))
HasUnconditionalBranch = true;
else if ((!BC.MIB->isCall(Inst) &&
!BC.MIB->isConditionalBranch(Inst)) ||
BC.MIB->isUnsupportedBranch(Inst.getOpcode()))
continue;
uint32_t FromOffset = BC.MIB->getAnnotationAs<uint32_t>(Inst, "Offset");
const MCSymbol *Target = BC.MIB->getTargetSymbol(Inst);
BinaryBasicBlock *TargetBB = Function.getBasicBlockForLabel(Target);
// A target outside this function is recorded with destination offset 0.
uint32_t ToOffset = TargetBB ? TargetBB->getInputOffset() : 0;
BinaryFunction *TargetFunc =
TargetBB ? &Function : BC.getFunctionForSymbol(Target);
// Should be null for indirect branches/calls
if (TargetFunc) {
if (instrumentOneTarget(I, Function, BB, FromOffset, *TargetFunc,
TargetBB, ToOffset))
++InstrumentationSites;
continue;
}
// Jump tables: one counter per successor block of BB.
if (IsJumpTable) {
for (auto &Succ : BB.successors()) {
if (instrumentOneTarget(I, Function, BB, FromOffset, Function,
&*Succ, Succ->getInputOffset()))
++InstrumentationSites;
}
continue;
}
// FIXME: handle indirect calls
} // End of instructions loop
// Instrument fallthroughs (when the direct jump instruction is missing)
if (!HasUnconditionalBranch && !HasJumpTable && BB.succ_size() > 0 &&
BB.size() > 0) {
auto *FTBB = BB.getFallthrough();
assert(FTBB && "expected valid fall-through basic block");
// I is the insertion point handed to instrumentOneTarget (start of BB);
// LastInstr is the last non-pseudo instruction and supplies the source
// offset of the fall-through edge.
auto I = BB.begin();
auto LastInstr = BB.end();
--LastInstr;
while (LastInstr != I && BC.MIB->isPseudo(*LastInstr))
--LastInstr;
uint32_t FromOffset = 0;
// The last instruction in the BB should have an annotation, except
// if it was branching to the end of the function as a result of
// __builtin_unreachable(), in which case it was deleted by fixBranches.
// Ignore this case. FIXME: force fixBranches() to preserve the offset.
if (!BC.MIB->hasAnnotation(*LastInstr, "Offset"))
continue;
FromOffset = BC.MIB->getAnnotationAs<uint32_t>(*LastInstr, "Offset");
if (instrumentOneTarget(I, Function, BB, FromOffset, Function, FTBB,
FTBB->getInputOffset()))
++InstrumentationSites;
}
} // End of BBs loop
// Consume list of critical edges: split them and add instrumentation to the
// newly created BBs
auto Iter = SplitInstrs.begin();
for (auto &BBPair : SplitWorklist) {
auto *NewBB = Function.splitEdge(BBPair.first, BBPair.second);
NewBB->addInstructions(Iter->begin(), Iter->end());
++Iter;
}
}
outs() << "BOLT-INSTRUMENTER: Instrumented " << InstrumentationSites
<< " sites, " << InstrumentationSitesSavingFlags << " saving flags.\n";
}
/// Serialize the counter descriptions followed by the function-name string
/// table, wrap the result in a "BOLT" ELF note and register it as the
/// ".bolt.instr.tables" note section of \p BC.
void Instrumentation::emitTablesAsELFNote(BinaryContext &BC) {
  std::string NotePayload;
  raw_string_ostream Payload(NotePayload);

  // Each description field is dumped as 4 raw bytes, exactly as stored in
  // memory.
  const auto EmitWord = [&Payload](const uint32_t &Word) {
    Payload.write(reinterpret_cast<const char *>(&Word), 4);
  };

  // Descriptions table: one 16-byte record per counter.
  for (const auto &Desc : Descriptions) {
    EmitWord(Desc.FromFuncStringIdx);
    EmitWord(Desc.FromOffset);
    EmitWord(Desc.ToFuncStringIdx);
    EmitWord(Desc.ToOffset);
  }

  // The string table lives immediately after the descriptions.
  Payload << StringTable;
  Payload.flush();

  const auto BoltInfo = BinarySection::encodeELFNote(
      "BOLT", NotePayload, BinarySection::NT_BOLT_INSTRUMENTATION_TABLES);
  BC.registerOrUpdateNoteSection(".bolt.instr.tables", copyByteArray(BoltInfo),
                                 BoltInfo.size(),
                                 /*Alignment=*/1,
                                 /*IsReadOnly=*/true, ELF::SHT_NOTE);
}
/// Second step of instrumentation: emit the runtime data.
///
/// Registers the tables note section, then streams into
/// ".bolt.instr.counters": one zero-initialized 8-byte slot per counter label,
/// the 4-byte counter count, and the NUL-terminated profile file name. Ends by
/// printing size statistics.
void Instrumentation::emit(BinaryContext &BC, MCStreamer &Streamer) {
  emitTablesAsELFNote(BC);

  const auto SectionFlags = BinarySection::getFlags(/*IsReadOnly=*/false,
                                                    /*IsText=*/false,
                                                    /*IsAllocatable=*/true);
  auto *CountersSection = BC.Ctx->getELFSection(".bolt.instr.counters",
                                                ELF::SHT_PROGBITS,
                                                SectionFlags);

  // Symbols consumed by the instrumentation runtime library when it dumps the
  // profile to disk.
  // Start of the counters region: Labels.size() counters of 8 bytes each.
  MCSymbol *Locs = BC.Ctx->getOrCreateSymbol("__bolt_instr_locations");
  // Number of counters.
  MCSymbol *NumLocs = BC.Ctx->getOrCreateSymbol("__bolt_instr_num_locs");
  // Name of the file the profile is going to be written to after the target
  // binary finishes a run.
  MCSymbol *FilenameSym = BC.Ctx->getOrCreateSymbol("__bolt_instr_filename");

  // Define Sym at the current position and export it as a global.
  const auto EmitGlobalLabel = [&Streamer](MCSymbol *Sym) {
    Streamer.EmitLabel(Sym);
    Streamer.EmitSymbolAttribute(Sym, MCSymbolAttr::MCSA_Global);
  };

  Streamer.SwitchSection(CountersSection);

  EmitGlobalLabel(Locs);
  for (MCSymbol *CounterLabel : Labels) {
    Streamer.EmitLabel(CounterLabel);
    Streamer.emitFill(8, 0);
  }

  const auto NumCounters = Labels.size();
  EmitGlobalLabel(NumLocs);
  Streamer.EmitIntValue(NumCounters, /*Size=*/4);

  // NOTE(review): unlike the two symbols above, the file name symbol is not
  // given MCSA_Global here -- confirm that is intended.
  Streamer.EmitLabel(FilenameSym);
  Streamer.EmitBytes(opts::InstrumentationFilename);
  Streamer.emitFill(1, 0);

  outs() << "BOLT-INSTRUMENTER: Total size of counters: "
         << (NumCounters * 8) << " bytes (static alloc memory)\n";
  outs() << "BOLT-INSTRUMENTER: Total size of string table emitted: "
         << StringTable.size() << " bytes in file\n";
  outs() << "BOLT-INSTRUMENTER: Total size of descriptors: "
         << (NumCounters * 16) << " bytes in file\n";
  outs() << "BOLT-INSTRUMENTER: Profile will be saved to file "
         << opts::InstrumentationFilename << "\n";
}
}
}

View File

@ -1,128 +0,0 @@
//===--- Passes/Instrumentation.h -----------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_INSTRUMENTATION_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_INSTRUMENTATION_H
#include "BinaryPasses.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
namespace llvm {
namespace bolt {
/// This is an instrumentation pass that modifies the input binary to generate
/// a profile after execution finishes. It modifies branches to increment
/// counters stored in the process memory and inserts a new function that
/// dumps this data to an fdata file.
///
/// The runtime for instrumentation has a string table that holds function
/// names. It also must include two data structures: the counter values being
/// incremented after each instrumented branch and a description of these
/// counters to be written in a file during dump. The description references
/// string indices in the string table for function names, as well as function
/// offsets locating branch source and destination. The counter values will be
/// converted to decimal form when writing the dumped fdata.
///
/// OPPORTUNITIES ON PERFORMANCE
/// This instrumentation is experimental and currently uses a naive approach
/// where every branch is instrumented. This is not ideal for runtime
/// performance, but should be good enough for us to evaluate/debug LBR profile
/// quality against instrumentation. Hopefully we can make this more efficient
/// in the future, but most optimizations here can cost a lot in BOLT processing
/// time. Keep in mind the instrumentation pass runs on every single BB of the
/// entire input binary, thus it is very expensive to do analyses, such as FLAGS
/// liveness to avoid spilling flags on every branch, if the binary is large.
///
/// MISSING: instrumentation of indirect calls
class Instrumentation {
public:
Instrumentation() {}
/// Modifies all functions by inserting instrumentation code (first step)
void runOnFunctions(BinaryContext &BC);
/// Emit data structures that will be necessary during runtime (second step)
void emit(BinaryContext &BC, MCStreamer &Streamer);
private:
/// Instrumented branch location information: one record per counter, made of
/// four 32-bit fields.
struct CounterDescription {
/// String table index of the name of the function containing the branch.
uint32_t FromFuncStringIdx;
/// Offset of the branch source.
uint32_t FromOffset;
/// String table index of the name of the destination function.
uint32_t ToFuncStringIdx;
/// Offset of the branch destination.
uint32_t ToOffset;
};
/// Retrieve the string table index for the name of \p Function. We encode
/// instrumented locations descriptions with the aid of a string table to
/// manage memory of the instrumentation runtime in a more efficient way.
/// If this function name is not represented in the string table yet, it will
/// be inserted and its index returned.
uint32_t getFunctionNameIndex(const BinaryFunction &Function);
/// Populate all information needed to identify an instrumented location:
/// branch source location in terms of function name plus offset, as well as
/// branch destination (also name + offset). This will be encoded in the
/// binary as static data and function name strings will reference a strtab.
CounterDescription createDescription(const BinaryFunction &FromFunction,
uint32_t From,
const BinaryFunction &ToFunction,
uint32_t To);
/// Create the sequence of instructions to instrument a branch happening
/// at \p FromFunction + \p FromOffset to \p ToFunc + \p ToOffset
std::vector<MCInst> createInstrumentationSnippet(BinaryFunction &FromFunction,
uint32_t FromOffset,
BinaryFunction &ToFunc,
uint32_t ToOffset);
/// Instrument the branch in \p Iter located at \p FromFunction + \p From,
/// basic block \p FromBB. The destination of the branch is \p ToFunc +
/// \p ToOffset. \p TargetBB should be non-null if this is a local branch
/// and null if it is a call. Return true on success.
bool instrumentOneTarget(BinaryBasicBlock::iterator &Iter,
BinaryFunction &FromFunction,
BinaryBasicBlock &FromBB, uint32_t From,
BinaryFunction &ToFunc, BinaryBasicBlock *TargetBB,
uint32_t ToOffset);
/// Create a non-allocatable ELF section with read-only tables necessary for
/// writing the instrumented data profile during program finish. The runtime
/// library needs to open the program executable file and read this data from
/// disk, this is not loaded by the system.
void emitTablesAsELFNote(BinaryContext &BC);
/// Critical edges worklist
/// This worklist keeps track of CFG edges <From-To> that need to be split.
/// This task is deferred until we finish processing all BBs because we can't
/// modify the CFG while iterating over it. For each edge, \p SplitInstrs
/// stores the list of instrumentation instructions as a vector of MCInsts
/// (entry i of SplitInstrs belongs to entry i of SplitWorklist).
/// instrumentOneTarget() populates this, runOnFunctions() consumes.
std::vector<std::pair<BinaryBasicBlock *, BinaryBasicBlock *>> SplitWorklist;
std::vector<std::vector<MCInst>> SplitInstrs;
/// Stores function names, to be emitted to the runtime
std::string StringTable;
/// strtab indices in StringTable for each function name
std::unordered_map<const BinaryFunction *, uint32_t> FuncToStringIdx;
/// Description of every counter, in creation order; grows in lockstep with
/// Labels.
std::vector<CounterDescription> Descriptions;
/// Identify all counters used in runtime while instrumentation is running
std::vector<MCSymbol *> Labels;
};
}
}
#endif

View File

@ -243,17 +243,21 @@ void JTFootprintReduction::optimizeFunction(BinaryContext &BC,
}
}
void JTFootprintReduction::runOnFunctions(BinaryContext &BC) {
void JTFootprintReduction::runOnFunctions(
BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions
) {
if (opts::JumpTables == JTS_BASIC && BC.HasRelocations)
return;
std::unique_ptr<RegAnalysis> RA;
std::unique_ptr<BinaryFunctionCallGraph> CG;
if (!opts::JTFootprintOnlyPIC) {
CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC)));
RA.reset(new RegAnalysis(BC, &BC.getBinaryFunctions(), &*CG));
CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC, BFs)));
RA.reset(new RegAnalysis(BC, &BFs, &*CG));
}
for (auto &BFIt : BC.getBinaryFunctions()) {
for (auto &BFIt : BFs) {
auto &Function = BFIt.second;
if (!Function.isSimple() || !opts::shouldProcess(Function))

View File

@ -75,7 +75,9 @@ public:
bool shouldPrint(const BinaryFunction &BF) const override {
return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
}
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
} // namespace bolt

View File

@ -8,21 +8,11 @@
//===----------------------------------------------------------------------===//
//
// This class implements a pass that inserts LFENCE instructions before each
// conditional branch to protect against Spectre Variant 1, as well as the
// various LVI mitigations.
//
// The runtime performance impact of this is significant!
//
// NOTE: This pass is incompatible with RetpolineInsertion. It is also
// incompatible with ABIs that allow red-zones, due to the
// flags-preserving jmp mitigation clobbering 8 bytes in the red-zone.
// Options are to disable red-zone when compiling the target binary,
// or configure the compilers to never generate memory-indirect jmps.
// conditional branch to protect against Spectre Variant 1.
// The performance impact of this is significant!
//===----------------------------------------------------------------------===//
#include "LFenceInsertion.h"
#include "RewriteInstance.h"
#include "RetpolineInsertion.h" //IndirectBranchInfo
#include "ParallelUtilities.h"
#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "bolt-lfence"
@ -30,7 +20,6 @@
using namespace llvm;
using namespace bolt;
namespace opts {
extern bool shouldProcess(const bolt::BinaryFunction &Function);
extern cl::OptionCategory BoltCategory;
@ -41,53 +30,14 @@ InsertLFences("insert-lfences",
cl::ZeroOrMore,
cl::cat(BoltCategory));
llvm::cl::opt<bool>
LFenceConditionalBranches("lfence-conditional-branches",
cl::desc("determine if all conditional branches should be lfence mitigated"),
cl::init(true),
cl::ZeroOrMore,
cl::Hidden,
cl::cat(BoltCategory));
llvm::cl::opt<bool>
LFenceLoads("lfence-loads",
cl::desc("determine if all loads should be lfence mitigated"),
cl::init(true),
cl::ZeroOrMore,
cl::Hidden,
cl::cat(BoltCategory));
llvm::cl::opt<bool>
LFenceReturns("lfence-returns",
cl::desc("determine if all returns should be lfence mitigated"),
cl::init(true),
cl::ZeroOrMore,
cl::Hidden,
cl::cat(BoltCategory));
llvm::cl::opt<bool>
LFenceIndirectCalls("lfence-indirect-calls",
cl::desc("determine if all indirect calls should be lfence mitigated"),
cl::init(true),
cl::ZeroOrMore,
cl::Hidden,
cl::cat(BoltCategory));
llvm::cl::opt<bool>
LFenceIndirectJumps("lfence-indirect-jumps",
cl::desc("determine if all indirect jumps should be lfence mitigated"),
cl::init(true),
cl::ZeroOrMore,
cl::Hidden,
cl::cat(BoltCategory));
} // namespace opts
namespace llvm {
namespace bolt {
static void report_redzone_error() {
errs() << "BOLT-ERROR: 'Redzone access in function with indirect jmp mitigation'\n";
exit(1);
}
void LFenceInsertion::runOnFunctions(BinaryContext &BC) {
void LFenceInsertion::runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) {
if (!opts::InsertLFences)
return;
@ -99,234 +49,25 @@ void LFenceInsertion::runOnFunctions(BinaryContext &BC) {
auto &MIB = *BC.MIB;
uint32_t LFencedBranches = 0;
uint32_t LFencedLoads = 0;
uint32_t LFencedRets = 0;
uint32_t LFencedIndirectCalls = 0;
uint32_t LFencedIndirectJmps = 0;
for (auto &It : BC.getBinaryFunctions()) {
for (auto &It : BFs) {
auto &Function = It.second;
bool MemIndirectJmp = false;
bool Redzone = false;
// For performance reasons, we may want to skip some functions and
// manually add lfences to them only where absolutely needed.
if (!opts::shouldProcess(Function))
continue;
for (auto &BB : Function) {
bool LastWasLFence = false;
for (auto It = BB.begin(); It != BB.end(); ++It) {
auto &Inst = *It;
if (MIB.isActualLoad(Inst) && MIB.isBranchOnMem(Inst)) {
IndirectBranchInfo BrInfo(Inst, MIB);
const auto &MemRef = BrInfo.Memory;
if (!MIB.isConditionalBranch(Inst))
continue;
if (MemRef.BaseRegNum == MIB.getStackPointer() &&
MemRef.DispValue < 0) {
if (MemIndirectJmp) {
report_redzone_error();
}
Redzone = true;
}
}
if (opts::LFenceConditionalBranches &&
MIB.isConditionalBranch(Inst)) {
// Inserts a lfence before every conditional branch.
// For example:
// cmp %reg1, %reg2
// je <jump_dest>
// gets rewritten to:
// cmp %reg1, %reg2
// lfence
// je <jump_dest>
if (!LastWasLFence) {
MCInst LFence;
MIB.createLfence(LFence);
It = BB.insertInstruction(It, std::move(LFence));
++It;
}
LFencedBranches++;
LastWasLFence = false;
} else if (opts::LFenceLoads &&
MIB.isActualLoad(Inst) &&
!MIB.isReturn(Inst) &&
!MIB.isIndirectBranch(Inst) &&
!MIB.isIndirectCall(Inst)) {
// Inserts an lfence after every load from memory.
// For example:
// mov 0x8(%rbx), %rdi
// Gets rewritten to:
// mov 0x8(%rbx), %rdi
// lfence
++It;
MCInst LFence;
MIB.createLfence(LFence);
It = BB.insertInstruction(It, std::move(LFence));
LFencedLoads++;
LastWasLFence = true;
} else if (opts::LFenceReturns &&
MIB.isReturn(Inst) && !MIB.isIndirectBranch(Inst)) {
// Inserts a dummy write + lfence before every ret.
// For example:
// retq
// gets rewritten to:
// shlq $0, (%rsp)
// lfence
// retq
MCInst Shlq;
MIB.createShl(Shlq, MIB.getStackPointer(), 1, MIB.getNoRegister(), 0, nullptr,
MIB.getNoRegister(), 0, 8);
It = BB.insertInstruction(It, std::move(Shlq));
++It;
MCInst LFence;
MIB.createLfence(LFence);
It = BB.insertInstruction(It, std::move(LFence));
++It;
LFencedRets++;
LastWasLFence = false;
} else if (opts::LFenceIndirectCalls &&
MIB.isIndirectCall(Inst) && MIB.isLoad(Inst) && !MIB.isIndirectBranch(Inst)) {
// Translates indirect calls into lea/mov/jmp then applies the jmp mitigation.
// For example:
// callq *(%rsi)
// gets rewritten to:
// pushq %rdi //Dummy to overwrite later
// pushq %rdi
// leaq 0x18(%rip), %rdi //After the retq
// movq %rdi, 8(%rsp) //Overwrite the dummy
// popq %rdi
// lfence
// pushq (%rsi)
// lfence //XXX Not needed, according to Intel?
// shlq $0, (%rsp)
// lfence
// retq
IndirectBranchInfo BrInfo(Inst, MIB);
const auto &MemRef = BrInfo.Memory;
auto *Ctx = BC.Ctx.get();
assert(BrInfo.isMem());
// Create a separate MCCodeEmitter to allow lock-free execution
BinaryContext::IndependentCodeEmitter Emitter;
if (!opts::NoThreads) {
Emitter = BC.createIndependentMCCodeEmitter();
}
int offset = 0x15 + BC.computeCodeSize(It, std::next(It), Emitter.MCE.get());
MCPhysReg ScratchReg = MIB.getIntArgRegister(0);
MCInst Pushq1; //Dummy, to overwrite later.
MIB.createPushRegister(Pushq1, ScratchReg, 8);
It = BB.insertInstruction(It, std::move(Pushq1));
++It;
MCInst Pushq2;
MIB.createPushRegister(Pushq2, ScratchReg, 8);
It = BB.insertInstruction(It, std::move(Pushq2));
++It;
MCInst Leaq;
MIB.createLea(Leaq, MIB.getInstructionPointer(), 1, MIB.getNoRegister(),
offset, nullptr, MIB.getNoRegister(), ScratchReg, 8);
It = BB.insertInstruction(It, std::move(Leaq));
++It;
MCInst Movq;
MIB.createSaveToStack(Movq, MIB.getStackPointer(), 8, ScratchReg, 8);
It = BB.insertInstruction(It, std::move(Movq));
++It;
MCInst Popq;
MIB.createPopRegister(Popq, ScratchReg, 8);
It = BB.insertInstruction(It, std::move(Popq));
++It;
MCInst LFence1;
MIB.createLfence(LFence1);
It = BB.insertInstruction(It, std::move(LFence1));
++It;
MCInst Pushq3;
MIB.createPushRegisterIndirect(Pushq3, MemRef.BaseRegNum, MemRef.ScaleValue,
MemRef.IndexRegNum, MemRef.DispValue, MemRef.DispExpr,
MemRef.SegRegNum, 8);
It = BB.insertInstruction(It, std::move(Pushq3));
++It;
MCInst LFence2;
MIB.createLfence(LFence2);
It = BB.insertInstruction(It, std::move(LFence2));
++It;
MCInst Shlq;
MIB.createShl(Shlq, MIB.getStackPointer(), 1, MIB.getNoRegister(), 0, nullptr,
MIB.getNoRegister(), 0, 8);
It = BB.insertInstruction(It, std::move(Shlq));
++It;
MCInst LFence3;
MIB.createLfence(LFence3);
It = BB.insertInstruction(It, std::move(LFence3));
++It;
MCInst Retq;
MIB.createReturn(Retq);
BB.replaceInstruction(It, std::vector<MCInst>({Retq}));
LFencedIndirectCalls++;
LastWasLFence = false;
} else if (opts::LFenceIndirectJumps &&
MIB.isIndirectBranch(Inst) && MIB.isLoad(Inst)) {
// Maps indirect jumps to "push; ret", then applies ret mitigation.
// For example:
// jmpq *(%rsi)
// gets rewritten to:
// pushq (%rsi)
// lfence //XXX Not needed, according to Intel?
// shlq $0, (%rsp)
// lfence
// retq
// Since this mitigation clobbers the redzone, we need to make
// sure that this function never uses it.
if (Redzone) {
report_redzone_error();
}
MemIndirectJmp = true;
IndirectBranchInfo BrInfo(Inst, MIB);
const auto &MemRef = BrInfo.Memory;
MCInst Push;
MIB.createPushRegisterIndirect(Push, MemRef.BaseRegNum, MemRef.ScaleValue,
MemRef.IndexRegNum, MemRef.DispValue, MemRef.DispExpr,
MemRef.SegRegNum, 8);
It = BB.insertInstruction(It, std::move(Push));
++It;
MCInst LFence1;
MIB.createLfence(LFence1);
It = BB.insertInstruction(It, std::move(LFence1));
++It;
MCInst Shlq;
MIB.createShl(Shlq, MIB.getStackPointer(), 1, MIB.getNoRegister(), 0, nullptr,
MIB.getNoRegister(), 0, 8);
It = BB.insertInstruction(It, std::move(Shlq));
++It;
MCInst LFence2;
MIB.createLfence(LFence2);
It = BB.insertInstruction(It, std::move(LFence2));
++It;
MCInst Retq;
MIB.createReturn(Retq);
BB.replaceInstruction(It, std::vector<MCInst>({Retq}));
LFencedIndirectJmps++;
LastWasLFence = false;
} else if (MIB.isLfence(Inst)) {
LastWasLFence = true;
} else {
LastWasLFence = false;
}
MCInst LFence;
MIB.createLfence(LFence);
It = BB.insertInstruction(It, std::move(LFence));
++It;
LFencedBranches++;
}
}
}
outs() << "\nBOLT-INFO: The number of lfenced branches is : " << LFencedBranches;
outs() << "\nBOLT-INFO: The number of lfenced loads is : " << LFencedLoads;
outs() << "\nBOLT-INFO: The number of lfenced rets is : " << LFencedRets;
outs() << "\nBOLT-INFO: The number of lfenced indirect calls is : " << LFencedIndirectCalls;
outs() << "\nBOLT-INFO: The number of lfenced indirect jmps is : " << LFencedIndirectJmps
<< "\n\n";
outs() << "\nBOLT-INFO: The number of lfenced branches is : " << LFencedBranches
<< "\n";
}
} // namespace bolt

View File

@ -28,7 +28,9 @@ public:
const char *getName() const override { return "lfence-insertion"; }
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
} // namespace bolt
} // namespace llvm

View File

@ -38,8 +38,8 @@ class LivenessAnalysis
public:
LivenessAnalysis(const RegAnalysis &RA, const BinaryContext &BC,
BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId)
: Parent(BC, BF, AllocId), RA(RA), NumRegs(BC.MRI->getNumRegs()) {}
BinaryFunction &BF)
: Parent(BC, BF), RA(RA), NumRegs(BC.MRI->getNumRegs()) {}
virtual ~LivenessAnalysis();
bool isAlive(ProgramPoint PP, MCPhysReg Reg) const {
@ -50,6 +50,8 @@ public:
}
/// Run the analysis by delegating to Parent::run().
/// A NamedRegionTimer named "LA" ("Liveness Analysis", group "Dataflow") is
/// constructed first and stays in scope for the whole call; opts::TimeOpts is
/// passed as its last argument (presumably the enable switch -- confirm
/// against the NamedRegionTimer constructor).
void run() {
NamedRegionTimer T1("LA", "Liveness Analysis", "Dataflow", "Dataflow",
opts::TimeOpts);
Parent::run();
}

View File

@ -84,7 +84,7 @@ LongJmpPass::createNewStub(BinaryBasicBlock &SourceBB, const MCSymbol *TgtSym,
MCInst Inst;
BC.MIB->createUncondBranch(Inst, TgtSym, BC.Ctx.get());
if (TgtIsFunc)
BC.MIB->convertJmpToTailCall(Inst);
BC.MIB->convertJmpToTailCall(Inst, BC.Ctx.get());
StubBB->addInstruction(Inst);
StubBB->setExecutionCount(0);
@ -427,9 +427,9 @@ uint64_t LongJmpPass::getSymbolAddress(const BinaryContext &BC,
if (Iter == HotAddresses.end()) {
// Look at BinaryContext's resolution for this symbol - this is a symbol not
// mapped to a BinaryFunction
auto ValueOrError = BC.getSymbolValue(*Target);
assert(ValueOrError && "Unrecognized symbol");
return *ValueOrError;
auto *BD = BC.getBinaryDataByName(Target->getName());
assert(BD && "Unrecognized symbol");
return BD ? BD->getAddress() : 0;
}
return Iter->second;
}
@ -595,9 +595,11 @@ bool LongJmpPass::relax(BinaryFunction &Func) {
return Modified;
}
void LongJmpPass::runOnFunctions(BinaryContext &BC) {
void LongJmpPass::runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) {
outs() << "BOLT-INFO: Starting stub-insertion pass\n";
auto Sorted = BC.getSortedFunctions();
auto Sorted = BinaryContext::getSortedFunctions(BFs);
bool Modified;
uint32_t Iterations{0};
do {

View File

@ -150,7 +150,9 @@ public:
const char *getName() const override { return "long-jmp"; }
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
}
}

View File

@ -2460,10 +2460,12 @@ void solveMCF(BinaryFunction &BF, MCFCostFunction CostFunction) {
}
};
size_t CurEdgeNum{0};
auto Next = std::next(BBI);
for (auto Succ : BB.successors()) {
int IsFT = (Next != E && Succ == *Next) ? 1 : 0;
AddSuccArc(Succ, BI->Count, IsFT);
++CurEdgeNum;
++BI;
}

View File

@ -43,12 +43,15 @@ PLT("plt",
namespace llvm {
namespace bolt {
void PLTCall::runOnFunctions(BinaryContext &BC) {
void PLTCall::runOnFunctions(
BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &) {
if (opts::PLT == OT_NONE)
return;
uint64_t NumCallsOptimized = 0;
for (auto &It : BC.getBinaryFunctions()) {
for (auto &It : BFs) {
auto &Function = It.second;
if (!shouldOptimize(Function))
continue;

View File

@ -38,7 +38,9 @@ public:
bool shouldPrint(const BinaryFunction &BF) const override {
return BinaryFunctionPass::shouldPrint(BF);
}
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
} // namespace bolt

View File

@ -36,10 +36,9 @@ class ReachingDefOrUse
public:
ReachingDefOrUse(const RegAnalysis &RA, const BinaryContext &BC,
BinaryFunction &BF, Optional<MCPhysReg> TrackingReg = None,
MCPlusBuilder::AllocatorIdTy AllocId = 0)
: InstrsDataflowAnalysis<ReachingDefOrUse<Def>, !Def>(BC, BF, AllocId),
RA(RA), TrackingReg(TrackingReg) {}
BinaryFunction &BF, Optional<MCPhysReg> TrackingReg = None)
: InstrsDataflowAnalysis<ReachingDefOrUse<Def>, !Def>(BC, BF), RA(RA),
TrackingReg(TrackingReg) {}
virtual ~ReachingDefOrUse() {}
bool isReachedBy(MCPhysReg Reg, ExprIterator Candidates) {
@ -61,6 +60,8 @@ public:
}
/// Run the analysis by delegating to the InstrsDataflowAnalysis base class.
/// A NamedRegionTimer named "RD" ("Reaching Defs", group "Dataflow") is
/// constructed first and stays in scope for the whole call; opts::TimeOpts is
/// passed as its last argument (presumably the enable switch -- confirm
/// against the NamedRegionTimer constructor).
void run() {
NamedRegionTimer T1("RD", "Reaching Defs", "Dataflow", "Dataflow",
opts::TimeOpts);
InstrsDataflowAnalysis<ReachingDefOrUse<Def>, !Def>::run();
}

View File

@ -29,9 +29,8 @@ class ReachingInsns
friend class DataflowAnalysis<ReachingInsns<Backward>, BitVector, Backward>;
public:
ReachingInsns(const BinaryContext &BC, BinaryFunction &BF,
MCPlusBuilder::AllocatorIdTy AllocId = 0)
: InstrsDataflowAnalysis<ReachingInsns, Backward>(BC, BF, AllocId) {}
ReachingInsns(const BinaryContext &BC, BinaryFunction &BF)
: InstrsDataflowAnalysis<ReachingInsns, Backward>(BC, BF) {}
virtual ~ReachingInsns() {}
bool isInLoop(const BinaryBasicBlock &BB) {
@ -47,6 +46,8 @@ public:
}
/// Run the analysis by delegating to the InstrsDataflowAnalysis base class.
/// A NamedRegionTimer named "RI" ("Reaching Insns", group "Dataflow") is
/// constructed first and stays in scope for the whole call; opts::TimeOpts is
/// passed as its last argument (presumably the enable switch -- confirm
/// against the NamedRegionTimer constructor).
void run() {
NamedRegionTimer T1("RI", "Reaching Insns", "Dataflow", "Dataflow",
opts::TimeOpts);
InstrsDataflowAnalysis<ReachingInsns<Backward>, Backward>::run();
}

View File

@ -36,8 +36,7 @@ public:
/// set of clobbered registers.
BitVector getFunctionClobberList(const BinaryFunction *Func);
RegAnalysis(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> *BFs,
RegAnalysis(BinaryContext &BC, std::map<uint64_t, BinaryFunction> *BFs,
BinaryFunctionCallGraph *CG);
/// Compute the set of registers \p Inst may read from, marking them in

View File

@ -339,7 +339,7 @@ bool RegReAssign::conservativePassOverFunction(BinaryContext &BC,
void RegReAssign::setupAggressivePass(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs) {
setupConservativePass(BC, BFs);
CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC)));
CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC, BFs)));
RA.reset(new RegAnalysis(BC, &BFs, &*CG));
GPRegs = BitVector(BC.MRI->getNumRegs(), false);
@ -380,16 +380,18 @@ void RegReAssign::setupConservativePass(
});
}
void RegReAssign::runOnFunctions(BinaryContext &BC) {
void RegReAssign::runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) {
RegScore = std::vector<int64_t>(BC.MRI->getNumRegs(), 0);
RankedRegs = std::vector<size_t>(BC.MRI->getNumRegs(), 0);
if (opts::AggressiveReAssign)
setupAggressivePass(BC, BC.getBinaryFunctions());
setupAggressivePass(BC, BFs);
else
setupConservativePass(BC, BC.getBinaryFunctions());
setupConservativePass(BC, BFs);
for (auto &I : BC.getBinaryFunctions()) {
for (auto &I : BFs) {
auto &Function = I.second;
if (!Function.isSimple() || !opts::shouldProcess(Function))

View File

@ -58,7 +58,9 @@ public:
return BinaryFunctionPass::shouldPrint(BF) && FuncsChanged.count(&BF) > 0;
}
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
}
}

View File

@ -27,7 +27,6 @@ using namespace bolt;
namespace opts {
extern cl::OptionCategory BoltOptCategory;
extern cl::opt<bool> NoThreads;
static cl::opt<bool>
PrintClusters("print-clusters",
@ -66,13 +65,7 @@ struct HashPair {
}
void ClusterAlgorithm::computeClusterAverageFrequency(const BinaryContext &BC) {
// Create a separate MCCodeEmitter to allow lock-free execution
BinaryContext::IndependentCodeEmitter Emitter;
if (!opts::NoThreads) {
Emitter = BC.createIndependentMCCodeEmitter();
}
void ClusterAlgorithm::computeClusterAverageFrequency() {
AvgFreq.resize(Clusters.size(), 0.0);
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
double Freq = 0.0;
@ -82,7 +75,7 @@ void ClusterAlgorithm::computeClusterAverageFrequency(const BinaryContext &BC) {
Freq += BB->getExecutionCount();
// Estimate the size of a block in bytes at run time
// NOTE: This might be inaccurate
ClusterSize += BB->estimateSize(Emitter.MCE.get());
ClusterSize += BB->estimateSize();
}
}
AvgFreq[I] = ClusterSize == 0 ? 0 : Freq / ClusterSize;
@ -532,7 +525,7 @@ void OptimizeBranchReorderAlgorithm::reorderBasicBlocks(
auto &ClusterEdges = CAlgo->ClusterEdges;
// Compute clusters' average frequencies.
CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
CAlgo->computeClusterAverageFrequency();
std::vector<double> &AvgFreq = CAlgo->AvgFreq;
if (opts::PrintClusters)
@ -634,7 +627,7 @@ void OptimizeCacheReorderAlgorithm::reorderBasicBlocks(
std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
// Compute clusters' average frequencies.
CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
CAlgo->computeClusterAverageFrequency();
std::vector<double> &AvgFreq = CAlgo->AvgFreq;
if (opts::PrintClusters)

View File

@ -53,7 +53,7 @@ public:
/// the sum of average frequencies of its blocks (execution count / # instrs).
/// The average frequencies are stored in the AvgFreq vector, indexed by the
/// cluster indices in the Clusters vector.
void computeClusterAverageFrequency(const BinaryContext &BC);
void computeClusterAverageFrequency();
/// Clear clusters and related info.
virtual void reset();

View File

@ -379,7 +379,9 @@ bool ReorderData::markUnmoveableSymbols(BinaryContext &BC,
return FoundUnmoveable;
}
void ReorderData::runOnFunctions(BinaryContext &BC) {
void ReorderData::runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) {
static const char* DefaultSections[] = {
".rodata",
".data",
@ -433,8 +435,7 @@ void ReorderData::runOnFunctions(BinaryContext &BC) {
std::tie(Order, SplitPointIdx) = sortedByCount(BC, *Section);
} else {
outs() << "BOLT-INFO: reorder-sections: ordering data by funcs\n";
std::tie(Order, SplitPointIdx) =
sortedByFunc(BC, *Section, BC.getBinaryFunctions());
std::tie(Order, SplitPointIdx) = sortedByFunc(BC, *Section, BFs);
}
auto SplitPoint = Order.begin() + SplitPointIdx;

View File

@ -57,7 +57,9 @@ public:
return "reorder-data";
}
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
} // namespace bolt

View File

@ -276,13 +276,21 @@ std::vector<std::string> readFunctionOrderFile() {
}
void ReorderFunctions::runOnFunctions(BinaryContext &BC) {
auto &BFs = BC.getBinaryFunctions();
void ReorderFunctions::runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) {
if (!BC.HasRelocations && opts::ReorderFunctions != RT_NONE) {
errs() << "BOLT-ERROR: Function reordering only works when "
<< "relocs are enabled.\n";
exit(1);
}
if (opts::ReorderFunctions != RT_NONE &&
opts::ReorderFunctions != RT_EXEC_COUNT &&
opts::ReorderFunctions != RT_USER) {
Cg = buildCallGraph(BC,
[](const BinaryFunction &BF) {
BFs,
[this](const BinaryFunction &BF) {
if (!BF.hasProfile())
return true;
if (BF.getState() != BinaryFunction::State::CFG)

View File

@ -41,7 +41,9 @@ public:
const char *getName() const override {
return "reorder-functions";
}
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
} // namespace bolt

View File

@ -106,49 +106,6 @@ private:
BitVector Valid;
};
// This class holds cached results of specified type for a pair of Clusters.
// It can invalidate all cache entries associated with a given Cluster.
// The functions set, get and contains are thread safe when called with
// distinct keys.
//
// Entries are addressed as a dense Size x Size matrix using Cluster::id(), so
// every id passed in must be smaller than the Size given at construction.
template <typename Cluster, typename ValueType>
class ClusterPairCacheThreadSafe {
public:
  /// Create an empty cache able to hold one value for every ordered pair of
  /// clusters whose ids lie in [0, Size).
  explicit ClusterPairCacheThreadSafe(size_t Size)
      : Size(Size), Cache(Size * Size), Valid(Size * Size, '\0') {}

  /// Return true if a value is currently cached for (First, Second).
  bool contains(const Cluster *First, const Cluster *Second) const {
    return Valid[index(First, Second)] != 0;
  }

  /// Return the value cached for (First, Second). The entry must be present
  /// (checked with an assert).
  ValueType get(const Cluster *First, const Cluster *Second) const {
    assert(contains(First, Second));
    return Cache[index(First, Second)];
  }

  /// Store Value for (First, Second) and mark the entry as present.
  void set(const Cluster *First, const Cluster *Second, ValueType Value) {
    const auto Index = index(First, Second);
    Cache[Index] = Value;
    Valid[Index] = 1;
  }

  /// Drop every entry that has C as either member of the pair.
  void invalidate(const Cluster *C) {
    // Row of C: all pairs (C, *).
    for (size_t idx = C->id() * Size; idx < (C->id() + 1) * Size; idx++)
      Valid[idx] = 0;
    // Column of C: all pairs (*, C).
    for (size_t id = 0; id < Size; id++)
      Valid[(id * Size) + C->id()] = 0;
  }

private:
  size_t Size;
  std::vector<ValueType> Cache;
  // One byte per entry. This is deliberately not std::vector<bool>: its
  // bit-packed storage makes neighbouring flags share a byte, which would
  // break the "distinct keys" thread-safety promise above. A byte map also
  // keeps the class usable with any ValueType, not only ones convertible
  // to/from bool.
  std::vector<char> Valid;

  /// Flatten the (First, Second) pair into an offset in the Size x Size matrix.
  size_t index(const Cluster *First, const Cluster *Second) const {
    return (First->id() * Size) + Second->id();
  }
};
} // namespace bolt
} // namespace llvm

View File

@ -138,10 +138,9 @@ BinaryFunction *createNewRetpoline(BinaryContext &BC,
BB2.addInstruction(PushR11);
MCInst LoadCalleeAddrs;
const auto &MemRef = BrInfo.Memory;
MIB.createLoad(LoadCalleeAddrs, MemRef.BaseRegNum, MemRef.ScaleValue,
MemRef.IndexRegNum, MemRef.DispValue, MemRef.DispExpr,
MemRef.SegRegNum, MIB.getX86R11(), 8);
MIB.createLoad(LoadCalleeAddrs, BrInfo.BaseRegNum, BrInfo.ScaleValue,
BrInfo.IndexRegNum, BrInfo.DispValue, BrInfo.DispExpr,
BrInfo.SegRegNum, MIB.getX86R11(), 8);
BB2.addInstruction(LoadCalleeAddrs);
@ -187,29 +186,27 @@ std::string createRetpolineFunctionTag(BinaryContext &BC,
std::string Tag = "__retpoline_mem_";
const auto &MemRef = BrInfo.Memory;
std::string DispExprStr;
if (MemRef.DispExpr) {
if (BrInfo.DispExpr) {
llvm::raw_string_ostream Ostream(DispExprStr);
MemRef.DispExpr->print(Ostream, BC.AsmInfo.get());
BrInfo.DispExpr->print(Ostream, BC.AsmInfo.get());
Ostream.flush();
}
Tag += MemRef.BaseRegNum != BC.MIB->getNoRegister()
? "r" + to_string(MemRef.BaseRegNum)
Tag += BrInfo.BaseRegNum != BC.MIB->getX86NoRegister()
? "r" + to_string(BrInfo.BaseRegNum)
: "";
Tag +=
MemRef.DispExpr ? "+" + DispExprStr : "+" + to_string(MemRef.DispValue);
BrInfo.DispExpr ? "+" + DispExprStr : "+" + to_string(BrInfo.DispValue);
Tag += MemRef.IndexRegNum != BC.MIB->getNoRegister()
? "+" + to_string(MemRef.ScaleValue) + "*" +
to_string(MemRef.IndexRegNum)
Tag += BrInfo.IndexRegNum != BC.MIB->getX86NoRegister()
? "+" + to_string(BrInfo.ScaleValue) + "*" +
to_string(BrInfo.IndexRegNum)
: "";
Tag += MemRef.SegRegNum != BC.MIB->getNoRegister()
? "_seg_" + to_string(MemRef.SegRegNum)
Tag += BrInfo.SegRegNum != BC.MIB->getX86NoRegister()
? "_seg_" + to_string(BrInfo.SegRegNum)
: "";
return Tag;
@ -235,11 +232,10 @@ void createBranchReplacement(BinaryContext &BC,
auto &MIB = *BC.MIB;
// Load the branch address in r11 if available
if (BrInfo.isMem() && R11Available) {
const auto &MemRef = BrInfo.Memory;
MCInst LoadCalleeAddrs;
MIB.createLoad(LoadCalleeAddrs, MemRef.BaseRegNum, MemRef.ScaleValue,
MemRef.IndexRegNum, MemRef.DispValue, MemRef.DispExpr,
MemRef.SegRegNum, MIB.getX86R11(), 8);
MIB.createLoad(LoadCalleeAddrs, BrInfo.BaseRegNum, BrInfo.ScaleValue,
BrInfo.IndexRegNum, BrInfo.DispValue, BrInfo.DispExpr,
BrInfo.SegRegNum, MIB.getX86R11(), 8);
Replacement.push_back(LoadCalleeAddrs);
}
@ -259,10 +255,9 @@ IndirectBranchInfo::IndirectBranchInfo(MCInst &Inst, MCPlusBuilder &MIB) {
if (MIB.isBranchOnMem(Inst)) {
IsMem = true;
if (!MIB.evaluateX86MemoryOperand(Inst, &Memory.BaseRegNum,
&Memory.ScaleValue,
&Memory.IndexRegNum, &Memory.DispValue,
&Memory.SegRegNum, &Memory.DispExpr)) {
if (!MIB.evaluateX86MemoryOperand(Inst, &BaseRegNum, &ScaleValue,
&IndexRegNum, &DispValue, &SegRegNum,
&DispExpr)) {
llvm_unreachable("not expected");
}
} else if (MIB.isBranchOnReg(Inst)) {
@ -273,7 +268,10 @@ IndirectBranchInfo::IndirectBranchInfo(MCInst &Inst, MCPlusBuilder &MIB) {
}
}
void RetpolineInsertion::runOnFunctions(BinaryContext &BC) {
void RetpolineInsertion::runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) {
if (!opts::InsertRetpolines)
return;
@ -284,7 +282,7 @@ void RetpolineInsertion::runOnFunctions(BinaryContext &BC) {
auto &MIB = *BC.MIB;
uint32_t RetpolinedBranches = 0;
for (auto &It : BC.getBinaryFunctions()) {
for (auto &It : BFs) {
auto &Function = It.second;
for (auto &BB : Function) {
for (auto It = BB.begin(); It != BB.end(); ++It) {
@ -311,13 +309,12 @@ void RetpolineInsertion::runOnFunctions(BinaryContext &BC) {
// If the instruction addressing pattern uses rsp and the retpoline
// loads the callee address then displacement needs to be updated
if (BrInfo.isMem() && !R11Available) {
auto &MemRef = BrInfo.Memory;
auto Addend = (BrInfo.isJump() || BrInfo.isTailCall()) ? 8 : 16;
if (MemRef.BaseRegNum == MIB.getStackPointer()) {
MemRef.DispValue += Addend;
if (BrInfo.BaseRegNum == MIB.getStackPointer()) {
BrInfo.DispValue += Addend;
}
if (MemRef.IndexRegNum == MIB.getStackPointer())
MemRef.DispValue += Addend * MemRef.ScaleValue;
if (BrInfo.IndexRegNum == MIB.getStackPointer())
BrInfo.DispValue += Addend * BrInfo.ScaleValue;
}
TargetRetpoline = getOrCreateRetpoline(BC, BrInfo, R11Available);

View File

@ -34,21 +34,19 @@ public:
bool isJump() const { return !IsCall; }
bool isTailCall() const { return IsTailCall; }
/// Components of the memory operand of an indirect branch. The
/// IndirectBranchInfo constructor fills these in through
/// MIB.evaluateX86MemoryOperand() when the branch goes through memory.
/// Only DispExpr has a default initializer; the other fields hold no defined
/// value until they are populated.
struct MemOpInfo {
// Base register; compared against MIB.getNoRegister() when absent.
unsigned BaseRegNum;
// Scale applied to the index register.
int64_t ScaleValue;
// Index register; compared against MIB.getNoRegister() when absent.
unsigned IndexRegNum;
// Numeric displacement, used when DispExpr is null.
int64_t DispValue;
// Segment register; compared against MIB.getNoRegister() when absent.
unsigned SegRegNum;
// Symbolic displacement; when non-null it is used instead of DispValue.
const MCExpr *DispExpr{nullptr};
};
union {
// Register branch information
MCPhysReg BranchReg;
// Memory branch information
MemOpInfo Memory;
struct {
unsigned BaseRegNum;
int64_t ScaleValue;
unsigned IndexRegNum;
int64_t DispValue;
unsigned SegRegNum;
const MCExpr *DispExpr{nullptr};
};
};
};
@ -73,7 +71,9 @@ public:
const char *getName() const override { return "retpoline-insertion"; }
void runOnFunctions(BinaryContext &BC) override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
} // namespace bolt
} // namespace llvm

View File

@ -102,7 +102,7 @@ void CalleeSavedAnalysis::analyzeSaves() {
CalleeSaved.set(FIE->RegOrImm);
SaveFIEByReg[FIE->RegOrImm] = &*FIE;
SavingCost[FIE->RegOrImm] += InsnToBB[&Inst]->getKnownExecutionCount();
BC.MIB->addAnnotation(Inst, getSaveTag(), FIE->RegOrImm, AllocatorId);
BC.MIB->addAnnotation(Inst, getSaveTag(), FIE->RegOrImm);
OffsetsByReg[FIE->RegOrImm] = FIE->StackOffset;
DEBUG(dbgs() << "Logging new candidate for Callee-Saved Reg: "
<< FIE->RegOrImm << "\n");
@ -153,8 +153,7 @@ void CalleeSavedAnalysis::analyzeRestores() {
<< "\n");
if (LoadFIEByReg[FIE->RegOrImm] == nullptr)
LoadFIEByReg[FIE->RegOrImm] = &*FIE;
BC.MIB->addAnnotation(Inst, getRestoreTag(), FIE->RegOrImm,
AllocatorId);
BC.MIB->addAnnotation(Inst, getRestoreTag(), FIE->RegOrImm);
HasRestores.set(FIE->RegOrImm);
}
Prev = &Inst;
@ -312,7 +311,7 @@ void StackLayoutModifier::checkStackPointerRestore(MCInst &Point) {
// We are restoring SP to an old value based on FP. Mark it as a stack
// access to be fixed later.
BC.MIB->addAnnotation(Point, getSlotTag(), Output, AllocatorId);
BC.MIB->addAnnotation(Point, getSlotTag(), Output);
}
void StackLayoutModifier::classifyStackAccesses() {
@ -355,7 +354,7 @@ void StackLayoutModifier::classifyStackAccesses() {
// We are free to go. Add it as available stack slot which we know how
// to move it.
AvailableRegions[FIEX->StackOffset] = FIEX->Size;
BC.MIB->addAnnotation(Inst, getSlotTag(), FIEX->StackOffset, AllocatorId);
BC.MIB->addAnnotation(Inst, getSlotTag(), FIEX->StackOffset);
RegionToRegMap[FIEX->StackOffset].insert(FIEX->RegOrImm);
RegToRegionMap[FIEX->RegOrImm].insert(FIEX->StackOffset);
DEBUG(dbgs() << "Adding region " << FIEX->StackOffset << " size "
@ -372,7 +371,7 @@ void StackLayoutModifier::classifyCFIs() {
auto recordAccess = [&](MCInst *Inst, int64_t Offset) {
const uint16_t Reg = BC.MRI->getLLVMRegNum(CfaReg, /*isEH=*/false);
if (Reg == BC.MIB->getStackPointer() || Reg == BC.MIB->getFramePointer()) {
BC.MIB->addAnnotation(*Inst, getSlotTag(), Offset, AllocatorId);
BC.MIB->addAnnotation(*Inst, getSlotTag(), Offset);
DEBUG(dbgs() << "Recording CFI " << Offset << "\n");
} else {
IsSimple = false;
@ -401,14 +400,12 @@ void StackLayoutModifier::classifyCFIs() {
recordAccess(&Inst, CFI->getOffset());
BC.MIB->addAnnotation(Inst, getOffsetCFIRegTag(),
BC.MRI->getLLVMRegNum(CFI->getRegister(),
/*isEH=*/false),
AllocatorId);
/*isEH=*/false));
break;
case MCCFIInstruction::OpSameValue:
BC.MIB->addAnnotation(Inst, getOffsetCFIRegTag(),
BC.MRI->getLLVMRegNum(CFI->getRegister(),
/*isEH=*/false),
AllocatorId);
/*isEH=*/false));
break;
case MCCFIInstruction::OpRememberState:
CFIStack.push(std::make_pair(CfaOffset, CfaReg));
@ -435,7 +432,7 @@ void StackLayoutModifier::classifyCFIs() {
void StackLayoutModifier::scheduleChange(
MCInst &Inst, StackLayoutModifier::WorklistItem Item) {
auto &WList = BC.MIB->getOrCreateAnnotationAs<std::vector<WorklistItem>>(
Inst, getTodoTag(), AllocatorId);
Inst, getTodoTag());
WList.push_back(Item);
}
@ -513,7 +510,7 @@ bool StackLayoutModifier::collapseRegion(MCInst *Alloc, int64_t RegionAddr,
}
if (Slot == RegionAddr) {
BC.MIB->addAnnotation(Inst, "AccessesDeletedPos", 0U, AllocatorId);
BC.MIB->addAnnotation(Inst, "AccessesDeletedPos", 0U);
continue;
}
if (BC.MIB->isPush(Inst) || BC.MIB->isPop(Inst)) {
@ -774,7 +771,7 @@ void ShrinkWrapping::pruneUnwantedCSRs() {
}
void ShrinkWrapping::computeSaveLocations() {
SavePos = std::vector<SmallSetVector<MCInst *, 4>>(BC.MRI->getNumRegs());
SavePos = std::vector<SmallPtrSet<MCInst *, 4>>(BC.MRI->getNumRegs());
auto &RI = Info.getReachingInsnsBackwards();
auto &DA = Info.getDominatorAnalysis();
auto &SPT = Info.getStackPointerTracking();
@ -963,7 +960,7 @@ ShrinkWrapping::doRestorePlacement(MCInst *BestPosSave, unsigned CSR,
// In case of a critical edge, we need to create extra BBs to host restores
// into edges transitioning to the dominance frontier, otherwise we pull these
// restores to inside the dominated area.
Frontier = DA.getDominanceFrontierFor(*BestPosSave).takeVector();
Frontier = DA.getDominanceFrontierFor(*BestPosSave);
DEBUG({
dbgs() << "Dumping dominance frontier for ";
BC.printInstruction(dbgs(), *BestPosSave);
@ -1457,13 +1454,13 @@ protected:
public:
PredictiveStackPointerTracking(const BinaryContext &BC, BinaryFunction &BF,
decltype(ShrinkWrapping::Todo) &TodoMap,
DataflowInfoManager &Info,
MCPlusBuilder::AllocatorIdTy AllocatorId = 0)
: StackPointerTrackingBase<PredictiveStackPointerTracking>(BC, BF,
AllocatorId),
DataflowInfoManager &Info)
: StackPointerTrackingBase<PredictiveStackPointerTracking>(BC, BF),
TodoMap(TodoMap), Info(Info) {}
/// Run the analysis by delegating to StackPointerTrackingBase::run().
/// A NamedRegionTimer named "PSPT" ("Predictive Stack Pointer Tracking",
/// group "Dataflow") is constructed first and stays in scope for the whole
/// call; opts::TimeOpts is passed as its last argument (presumably the enable
/// switch -- confirm against the NamedRegionTimer constructor).
void run() {
NamedRegionTimer T1("PSPT", "Predictive Stack Pointer Tracking", "Dataflow",
"Dataflow", opts::TimeOpts);
StackPointerTrackingBase<PredictiveStackPointerTracking>::run();
}
};
@ -1556,7 +1553,7 @@ void ShrinkWrapping::rebuildCFIForSP() {
continue;
auto *CFI = BF.getCFIFor(Inst);
if (CFI->getOperation() == MCCFIInstruction::OpDefCfaOffset)
BC.MIB->addAnnotation(Inst, "DeleteMe", 0U, AllocatorId);
BC.MIB->addAnnotation(Inst, "DeleteMe", 0U);
}
}
@ -1815,7 +1812,7 @@ BBIterTy ShrinkWrapping::processInsertionsList(
}
bool ShrinkWrapping::processInsertions() {
PredictiveStackPointerTracking PSPT(BC, BF, Todo, Info, AllocatorId);
PredictiveStackPointerTracking PSPT(BC, BF, Todo, Info);
PSPT.run();
bool Changes{false};
@ -1913,15 +1910,6 @@ bool ShrinkWrapping::perform() {
PopOffsetByReg = std::vector<int64_t>(BC.MRI->getNumRegs(), 0LL);
DomOrder = std::vector<MCPhysReg>(BC.MRI->getNumRegs(), 0);
if (BF.checkForAmbiguousJumpTables()) {
DEBUG(dbgs() << "BOLT-DEBUG: ambiguous JTs in " << BF.getPrintName()
<< ".\n");
// We could call disambiguateJumpTables here, but it is probably not worth
// the cost (of duplicating potentially large jump tables that could regress
// dcache misses). Moreover, ambiguous JTs are rare and coming from code
// written in assembly language. Just bail.
return false;
}
SLM.initialize();
CSA.compute();
classifyCSRUses();

View File

@ -27,8 +27,6 @@ class CalleeSavedAnalysis {
const BinaryContext &BC;
BinaryFunction &BF;
DataflowInfoManager &Info;
MCPlusBuilder::AllocatorIdTy AllocatorId;
Optional<unsigned> SaveTagIndex;
Optional<unsigned> RestoreTagIndex;
@ -41,6 +39,12 @@ class CalleeSavedAnalysis {
/// function.
void analyzeRestores();
/// Returns the identifying string used to annotate instructions with metadata
/// for this analysis. These are deleted in the destructor.
static StringRef getSaveTagName() {
return StringRef("CSA-SavedReg");
}
unsigned getSaveTag() {
if (SaveTagIndex)
return *SaveTagIndex;
@ -48,6 +52,10 @@ class CalleeSavedAnalysis {
return *SaveTagIndex;
}
/// Returns the annotation name string "CSA-RestoredReg".
/// NOTE(review): presumably the name behind getRestoreTag(), which
/// analyzeRestores() attaches to restore instructions -- confirm.
static StringRef getRestoreTagName() {
return StringRef("CSA-RestoredReg");
}
unsigned getRestoreTag() {
if (RestoreTagIndex)
return *RestoreTagIndex;
@ -64,9 +72,8 @@ public:
std::vector<const FrameIndexEntry*> LoadFIEByReg;
CalleeSavedAnalysis(const FrameAnalysis &FA, const BinaryContext &BC,
BinaryFunction &BF, DataflowInfoManager &Info,
MCPlusBuilder::AllocatorIdTy AllocId)
: FA(FA), BC(BC), BF(BF), Info(Info), AllocatorId(AllocId),
BinaryFunction &BF, DataflowInfoManager &Info)
: FA(FA), BC(BC), BF(BF), Info(Info),
CalleeSaved(BC.MRI->getNumRegs(), false),
OffsetsByReg(BC.MRI->getNumRegs(), 0LL),
HasRestores(BC.MRI->getNumRegs(), false),
@ -105,17 +112,6 @@ public:
/// instructions).
std::vector<MCInst *> getSavesByReg(uint16_t Reg);
std::vector<MCInst *> getRestoresByReg(uint16_t Reg);
/// Returns the identifying string used to annotate instructions with metadata
/// for this analysis. These are deleted in the destructor.
static StringRef getSaveTagName() {
return StringRef("CSA-SavedReg");
}
/// Returns the annotation name string "CSA-RestoredReg".
/// NOTE(review): presumably the name behind getRestoreTag(), which
/// analyzeRestores() attaches to restore instructions -- confirm.
static StringRef getRestoreTagName() {
return StringRef("CSA-RestoredReg");
}
};
/// Identifies in a given binary function all stack regions being used and allow
@ -126,7 +122,6 @@ class StackLayoutModifier {
const BinaryContext &BC;
BinaryFunction &BF;
DataflowInfoManager &Info;
MCPlusBuilder::AllocatorIdTy AllocatorId;
// Keep track of stack slots we know how to safely move
std::map<int64_t, int64_t> AvailableRegions;
@ -222,11 +217,20 @@ private:
return *OffsetCFIRegTagIndex;
}
/// Returns the annotation name string "SLM-TodoTag".
/// NOTE(review): presumably the name behind getTodoTag(), under which
/// scheduleChange() keeps a std::vector<WorklistItem> per instruction --
/// confirm.
static StringRef getTodoTagName() {
return StringRef("SLM-TodoTag");
}
/// Returns the annotation name string "SLM-SlotTag".
/// NOTE(review): presumably the name behind getSlotTag(), under which stack
/// offsets are recorded on instructions (e.g. in classifyStackAccesses()) --
/// confirm.
static StringRef getSlotTagName() {
return StringRef("SLM-SlotTag");
}
/// Returns the annotation name string "SLM-OffsetCFIReg".
/// NOTE(review): presumably the name behind getOffsetCFIRegTag(), under which
/// classifyCFIs() records the register referenced by a CFI instruction --
/// confirm.
static StringRef getOffsetCFIRegTagName() {
return StringRef("SLM-OffsetCFIReg");
}
public:
StackLayoutModifier(const FrameAnalysis &FA, const BinaryContext &BC,
BinaryFunction &BF, DataflowInfoManager &Info,
MCPlusBuilder::AllocatorIdTy AllocId)
: FA(FA), BC(BC), BF(BF), Info(Info), AllocatorId(AllocId) {}
BinaryFunction &BF, DataflowInfoManager &Info)
: FA(FA), BC(BC), BF(BF), Info(Info) {}
~StackLayoutModifier() {
for (auto &BB : BF) {
@ -279,19 +283,6 @@ public:
/// Perform initial assessment of the function trying to understand its stack
/// accesses.
void initialize();
/// Returns the name of the "todo" annotation, as a StringRef over the literal
/// "SLM-TodoTag" (static storage).
/// NOTE(review): the "SLM-" prefix suggests this belongs to
/// StackLayoutModifier; what the annotation stores is not visible here.
static StringRef getTodoTagName() {
return StringRef("SLM-TodoTag");
}
/// Returns the name of the "slot" annotation, as a StringRef over the literal
/// "SLM-SlotTag" (static storage).
/// NOTE(review): presumably tags instructions with a stack-slot value; the
/// payload type is not visible here -- confirm at the use sites.
static StringRef getSlotTagName() {
return StringRef("SLM-SlotTag");
}
/// Returns the name of the "offset CFI register" annotation, as a StringRef
/// over the literal "SLM-OffsetCFIReg" (static storage).
/// NOTE(review): the meaning of the stored value is not visible here --
/// confirm at the use sites.
static StringRef getOffsetCFIRegTagName() {
return StringRef("SLM-OffsetCFIReg");
}
};
/// Implements a pass to optimize callee-saved register spills. These spills
@ -303,7 +294,6 @@ class ShrinkWrapping {
const BinaryContext &BC;
BinaryFunction &BF;
DataflowInfoManager &Info;
MCPlusBuilder::AllocatorIdTy AllocatorId;
StackLayoutModifier SLM;
/// For each CSR, store a vector of all CFI indexes deleted as a consequence
/// of moving this Callee-Saved Reg
@ -316,7 +306,7 @@ class ShrinkWrapping {
std::vector<int64_t> PopOffsetByReg;
std::vector<MCPhysReg> DomOrder;
CalleeSavedAnalysis CSA;
std::vector<SmallSetVector<MCInst *, 4>> SavePos;
std::vector<SmallPtrSet<MCInst *, 4>> SavePos;
std::vector<uint64_t> BestSaveCount;
std::vector<MCInst *> BestSavePos;
@ -391,7 +381,7 @@ private:
void scheduleChange(ProgramPoint PP, T&& ...Item) {
if (PP.isInst()) {
auto &WList = BC.MIB->getOrCreateAnnotationAs<std::vector<WorklistItem>>(
*PP.getInst(), getAnnotationIndex(), AllocatorId);
*PP.getInst(), getAnnotationIndex());
WList.emplace_back(std::forward<T>(Item)...);
return;
}
@ -408,7 +398,7 @@ private:
BB = *BB->succ_begin();
}
auto &WList = BC.MIB->getOrCreateAnnotationAs<std::vector<WorklistItem>>(
*BB->begin(), getAnnotationIndex(), AllocatorId);
*BB->begin(), getAnnotationIndex());
WList.emplace_back(std::forward<T>(Item)...);
}
@ -527,10 +517,9 @@ private:
public:
ShrinkWrapping(const FrameAnalysis &FA, const BinaryContext &BC,
BinaryFunction &BF, DataflowInfoManager &Info,
MCPlusBuilder::AllocatorIdTy AllocId)
: FA(FA), BC(BC), BF(BF), Info(Info), AllocatorId(AllocId),
SLM(FA, BC, BF, Info, AllocId), CSA(FA, BC, BF, Info, AllocId) {}
BinaryFunction &BF, DataflowInfoManager &Info)
: FA(FA), BC(BC), BF(BF), Info(Info), SLM(FA, BC, BF, Info),
CSA(FA, BC, BF, Info) {}
~ShrinkWrapping() {
for (auto &BB : BF) {

View File

@ -35,13 +35,14 @@ class StackAllocationAnalysis
public:
StackAllocationAnalysis(const BinaryContext &BC, BinaryFunction &BF,
StackPointerTracking &SPT,
MCPlusBuilder::AllocatorIdTy AllocId)
: InstrsDataflowAnalysis<StackAllocationAnalysis, false>(BC, BF, AllocId),
StackPointerTracking &SPT)
: InstrsDataflowAnalysis<StackAllocationAnalysis, false>(BC, BF),
SPT(SPT) {}
/// Virtual destructor with an empty body; nothing is released here explicitly.
virtual ~StackAllocationAnalysis() {}
/// Runs the analysis by delegating to the base-class
/// InstrsDataflowAnalysis<StackAllocationAnalysis, false>::run().
///
/// A NamedRegionTimer labelled "SAA" / "Stack Allocation Analysis" in the
/// "Dataflow" group is constructed first and stays alive until this function
/// returns, so its lifetime covers the whole base-class run.
void run() {
// opts::TimeOpts is forwarded as the timer's last argument.
// NOTE(review): presumably this is the flag that enables timing -- confirm
// against the NamedRegionTimer constructor.
NamedRegionTimer T1("SAA", "Stack Allocation Analysis", "Dataflow",
"Dataflow", opts::TimeOpts);
InstrsDataflowAnalysis<StackAllocationAnalysis, false>::run();
}

View File

@ -36,6 +36,8 @@ public:
/// Virtual destructor with an empty body; nothing is released here explicitly.
virtual ~StackAvailableExpressions() {}
/// Runs the analysis by delegating to the base-class
/// InstrsDataflowAnalysis<StackAvailableExpressions>::run().
///
/// A NamedRegionTimer labelled "SAE" / "Stack Available Expressions" in the
/// "Dataflow" group is constructed first and stays alive until this function
/// returns, so its lifetime covers the whole base-class run.
void run() {
// opts::TimeOpts is forwarded as the timer's last argument.
// NOTE(review): presumably this is the flag that enables timing -- confirm
// against the NamedRegionTimer constructor.
NamedRegionTimer T1("SAE", "Stack Available Expressions", "Dataflow",
"Dataflow", opts::TimeOpts);
InstrsDataflowAnalysis<StackAvailableExpressions>::run();
}

View File

@ -14,10 +14,9 @@
namespace llvm {
namespace bolt {
/// Constructs a StackPointerTracking instance for function \p BF in context
/// \p BC. All three arguments are forwarded unchanged to the
/// StackPointerTrackingBase<StackPointerTracking> base; the constructor body
/// itself is empty.
///
/// \param BC          binary context, passed through to the base class.
/// \param BF          function to analyze, passed through to the base class.
/// \param AllocatorId allocator identifier, passed through to the base class.
///                    NOTE(review): presumably selects the allocator used for
///                    the analysis' annotations -- confirm in the base class.
StackPointerTracking::StackPointerTracking(
const BinaryContext &BC, BinaryFunction &BF,
MCPlusBuilder::AllocatorIdTy AllocatorId)
: StackPointerTrackingBase<StackPointerTracking>(BC, BF, AllocatorId) {}
/// Constructs a StackPointerTracking instance for function \p BF in context
/// \p BC. Both arguments are forwarded unchanged to the
/// StackPointerTrackingBase<StackPointerTracking> base; the constructor body
/// itself is empty.
///
/// \param BC binary context, passed through to the base class.
/// \param BF function to analyze, passed through to the base class.
StackPointerTracking::StackPointerTracking(const BinaryContext &BC,
BinaryFunction &BF)
: StackPointerTrackingBase<StackPointerTracking>(BC, BF) {}
} // end namespace bolt
} // end namespace llvm

Some files were not shown because too many files have changed in this diff Show More