Compare commits
111 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
130d2c7589 | ||
|
|
fe8082af97 | ||
|
|
faef3fcd5b | ||
|
|
c1312e2956 | ||
|
|
f6d8aed1a9 | ||
|
|
f40c55f512 | ||
|
|
061fb7d1e7 | ||
|
|
0655e9a71f | ||
|
|
9d4a9c0e82 | ||
|
|
7e63dc16ee | ||
|
|
7fd4544586 | ||
|
|
753c1e1ee4 | ||
|
|
d36e6fc435 | ||
|
|
c6fa8fb91d | ||
|
|
6a339b9949 | ||
|
|
f353064d08 | ||
|
|
5a485bd2d8 | ||
|
|
1895790cab | ||
|
|
b782793df5 | ||
|
|
490fedfb4b | ||
|
|
b2258a5314 | ||
|
|
f069640bf3 | ||
|
|
75a18d4aad | ||
|
|
9e322d2a6d | ||
|
|
e8144c0d6d | ||
|
|
7c8ea6450e | ||
|
|
b965f62407 | ||
|
|
495f6c1737 | ||
|
|
108b67d892 | ||
|
|
7d5b72ea49 | ||
|
|
b8a5ae9471 | ||
|
|
5fa33f3084 | ||
|
|
a1f4793b70 | ||
|
|
227b921898 | ||
|
|
acb8749ccf | ||
|
|
e0fe32729a | ||
|
|
5de692014f | ||
|
|
bce6479435 | ||
|
|
01e75a8c9f | ||
|
|
c8cda8a4be | ||
|
|
8dfe2267b8 | ||
|
|
5bdb9857c2 | ||
|
|
303346b57c | ||
|
|
86de981e90 | ||
|
|
9c0b72b76c | ||
|
|
c8aea9f568 | ||
|
|
e67d72ca13 | ||
|
|
9058415d17 | ||
|
|
8c428d62d7 | ||
|
|
86ed045912 | ||
|
|
9d160e29dc | ||
|
|
39195b099f | ||
|
|
1fba68f849 | ||
|
|
0af97f2a82 | ||
|
|
f3f36a96cc | ||
|
|
7a38eb2d37 | ||
|
|
32f147aee1 | ||
|
|
89e112a748 | ||
|
|
8d1124f84e | ||
|
|
9d99ab8592 | ||
|
|
77d27b5983 | ||
|
|
3ecda6eccb | ||
|
|
721b037ab2 | ||
|
|
24847345a0 | ||
|
|
ea49a61463 | ||
|
|
a0fe82caa2 | ||
|
|
414ea2254e | ||
|
|
76b97902ed | ||
|
|
2771404c36 | ||
|
|
155eedaf2f | ||
|
|
85eb82a23d | ||
|
|
9bf3324c3b | ||
|
|
02f4d9be2b | ||
|
|
d1bb2e73af | ||
|
|
1cba184e00 | ||
|
|
e98a81c170 | ||
|
|
c94edca4d1 | ||
|
|
32fd328ade | ||
|
|
449c43120f | ||
|
|
58a4f2fd42 | ||
|
|
71d734e9ed | ||
|
|
dd45439d19 | ||
|
|
b52cc1b764 | ||
|
|
491695350a | ||
|
|
96bf250d9b | ||
|
|
f8d4184063 | ||
|
|
dd54572c9e | ||
|
|
3941112452 | ||
|
|
83de75bc91 | ||
|
|
c31ad699df | ||
|
|
1963f5cd26 | ||
|
|
7cacc8888a | ||
|
|
a9642d2664 | ||
|
|
b754aa70cd | ||
|
|
0109ba0130 | ||
|
|
85e16a2954 | ||
|
|
c7ee7d0e81 | ||
|
|
00f309f1be | ||
|
|
a36d0144ed | ||
|
|
a25bab5f01 | ||
|
|
26c5fe5f50 | ||
|
|
9c2d2e271f | ||
|
|
5de10a6526 | ||
|
|
39b15dd3ee | ||
|
|
54756ae8d8 | ||
|
|
22dc4b331c | ||
|
|
bfa5a65e12 | ||
|
|
adb6e1d8b1 | ||
|
|
91489df461 | ||
|
|
4a4b7d939b | ||
|
|
e37d18ee51 |
@ -1,5 +1,29 @@
|
||||
include(ExternalProject)

set(BOLT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(BOLT_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
set(CMAKE_CXX_STANDARD 14)

# The instrumentation runtime (bolt_rt) lives in runtime/ and is configured as
# its own CMake project, always as a Release build, using the same compilers
# and make program as this build. It installs into the LLVM binary directory.
ExternalProject_Add(bolt_rt
  SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/runtime"
  STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-stamps
  BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins
  CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
             -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
             -DCMAKE_BUILD_TYPE=Release
             -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}
             -DCMAKE_INSTALL_PREFIX=${LLVM_BINARY_DIR}
  # Keep this set to True while actively developing bolt_rt; otherwise cmake
  # will not rebuild it after source code changes.
  BUILD_ALWAYS True
  )

# At install time, run the install script generated for bolt_rt with the
# install prefix that is in effect for this (outer) install.
install(CODE "execute_process\(COMMAND \${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=\${CMAKE_INSTALL_PREFIX} -P ${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/cmake_install.cmake \)"
  COMPONENT bolt_rt)

add_llvm_install_targets(install-bolt_rt
                         DEPENDS bolt_rt
                         COMPONENT bolt_rt)

add_subdirectory(src)
add_subdirectory(test)
|
||||
|
||||
@ -21,6 +21,14 @@ We actively welcome your pull requests.
|
||||
before it can be merged.
|
||||
* When all of the tests are passing and all other conditions described above
|
||||
are satisfied, the PR is ready for review and merge.
|
||||
* If you haven't already, complete the Contributor License Agreement ("CLA").
|
||||
|
||||
## Contributor License Agreement ("CLA")
|
||||
|
||||
In order to accept your pull request, we need you to submit a CLA. You only need
|
||||
to do this once to work on any of Facebook's open source projects.
|
||||
|
||||
Complete your CLA here: <https://code.facebook.com/cla>
|
||||
|
||||
## Issues
|
||||
|
||||
|
||||
BIN
docs/Heatmap.png
Normal file
BIN
docs/Heatmap.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 30 KiB |
50
docs/Heatmaps.md
Normal file
50
docs/Heatmaps.md
Normal file
@ -0,0 +1,50 @@
|
||||
# Code Heatmaps
|
||||
|
||||
BOLT has gained the ability to print code heatmaps based on
|
||||
sampling-based LBR profiles generated by `perf`. The output is produced
|
||||
in colored ASCII to be displayed in a color-capable terminal. It looks
|
||||
something like this:
|
||||
|
||||

|
||||
|
||||
Heatmaps can be generated for BOLTed and non-BOLTed binaries. You can
|
||||
use them to compare the code layout before and after optimizations.
|
||||
|
||||
To generate a heatmap, start with running your app under `perf`:
|
||||
|
||||
```bash
|
||||
$ perf record -e cycles:u -j any,u -- <executable with args>
|
||||
```
|
||||
or if you want to monitor the existing process(es):
|
||||
```bash
|
||||
$ perf record -e cycles:u -j any,u [-p PID|-a] -- sleep <interval>
|
||||
```
|
||||
|
||||
Note that at the moment running with LBR (`-j any,u` or `-b`) is
|
||||
a requirement.
|
||||
|
||||
Once the run is complete, and `perf.data` is generated, run BOLT in
|
||||
heatmap mode:
|
||||
|
||||
```bash
|
||||
$ llvm-bolt heatmap -p perf.data <executable>
|
||||
```
|
||||
|
||||
By default the heatmap will be dumped to *stdout*. You can change it
|
||||
with the `-o <heatmapfile>` option. Each character/block in the heatmap
|
||||
shows the execution data accumulated for the corresponding 64 bytes of
|
||||
code. You can change this granularity with the `-block-size` option.
|
||||
E.g. set it to 4096 to see code usage grouped by 4K pages.
|
||||
Other useful options are:
|
||||
|
||||
```bash
|
||||
-line-size=<uint> - number of entries per line (default 256)
|
||||
-max-address=<uint> - maximum address considered valid for heatmap (default 4GB)
|
||||
```
|
||||
|
||||
If you prefer to look at the data in a browser (or would like to share
|
||||
it that way), then you can use an HTML conversion tool. E.g.:
|
||||
|
||||
```bash
|
||||
$ aha -b -f <heatmapfile> > <heatmapfile>.html
|
||||
```
|
||||
114
llvm.patch
114
llvm.patch
@ -848,7 +848,7 @@ index 8e9b4ac5632..d2c569e3399 100644
|
||||
SMLoc Loc) override;
|
||||
void
|
||||
diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h
|
||||
index 582a836023b..0b15454ecd6 100644
|
||||
index 582a836023b..f1e341bd624 100644
|
||||
--- a/include/llvm/MC/MCStreamer.h
|
||||
+++ b/include/llvm/MC/MCStreamer.h
|
||||
@@ -199,7 +199,7 @@ class MCStreamer {
|
||||
@ -860,17 +860,6 @@ index 582a836023b..0b15454ecd6 100644
|
||||
|
||||
/// \brief This is stack of current and previous section values saved by
|
||||
/// PushSection.
|
||||
@@ -290,8 +290,8 @@ public:
|
||||
/// If the comment includes embedded \n's, they will each get the comment
|
||||
/// prefix as appropriate. The added comment should not end with a \n.
|
||||
/// By default, each comment is terminated with an end of line, i.e. the
|
||||
- /// EOL param is set to true by default. If one prefers not to end the
|
||||
- /// comment with a new line then the EOL param should be passed
|
||||
+ /// EOL param is set to true by default. If one prefers not to end the
|
||||
+ /// comment with a new line then the EOL param should be passed
|
||||
/// with a false value.
|
||||
virtual void AddComment(const Twine &T, bool EOL = true) {}
|
||||
|
||||
@@ -338,9 +338,7 @@ public:
|
||||
|
||||
/// \brief Returns an index to represent the order a symbol was emitted in.
|
||||
@ -1009,11 +998,10 @@ index 46504e74bc2..836fd8ddc45 100644
|
||||
Expected<Elf_Shdr_Range> sections() const;
|
||||
|
||||
Expected<Elf_Sym_Range> symbols(const Elf_Shdr *Sec) const {
|
||||
@@ -396,6 +408,34 @@ void ELFFile<ELFT>::getRelocationTypeName(uint32_t Type,
|
||||
}
|
||||
@@ -397,6 +409,34 @@ void ELFFile<ELFT>::getRelocationTypeName(uint32_t Type,
|
||||
}
|
||||
|
||||
+template <class ELFT>
|
||||
template <class ELFT>
|
||||
+Expected<const typename ELFFile<ELFT>::Elf_Dyn *>
|
||||
+ELFFile<ELFT>::dynamic_table_begin(const Elf_Phdr *Phdr) const {
|
||||
+ if (!Phdr)
|
||||
@ -1041,9 +1029,10 @@ index 46504e74bc2..836fd8ddc45 100644
|
||||
+ return reinterpret_cast<const Elf_Dyn *>(base() + End);
|
||||
+}
|
||||
+
|
||||
template <class ELFT>
|
||||
+template <class ELFT>
|
||||
Expected<const typename ELFT::Sym *>
|
||||
ELFFile<ELFT>::getRelocationSymbol(const Elf_Rel *Rel,
|
||||
const Elf_Shdr *SymTab) const {
|
||||
diff --git a/include/llvm/Object/ELFObjectFile.h b/include/llvm/Object/ELFObjectFile.h
|
||||
index 4d001039238..62837bbcaa0 100644
|
||||
--- a/include/llvm/Object/ELFObjectFile.h
|
||||
@ -1056,11 +1045,10 @@ index 4d001039238..62837bbcaa0 100644
|
||||
relocation_iterator section_rel_begin(DataRefImpl Sec) const override;
|
||||
relocation_iterator section_rel_end(DataRefImpl Sec) const override;
|
||||
section_iterator getRelocatedSection(DataRefImpl Sec) const override;
|
||||
@@ -716,6 +717,14 @@ bool ELFObjectFile<ELFT>::isSectionVirtual(DataRefImpl Sec) const {
|
||||
return getSection(Sec)->sh_type == ELF::SHT_NOBITS;
|
||||
@@ -717,6 +718,14 @@ bool ELFObjectFile<ELFT>::isSectionVirtual(DataRefImpl Sec) const {
|
||||
}
|
||||
|
||||
+template <class ELFT>
|
||||
template <class ELFT>
|
||||
+bool ELFObjectFile<ELFT>::isSectionReadOnly(DataRefImpl Sec) const {
|
||||
+ const Elf_Shdr *EShdr = getSection(Sec);
|
||||
+ return EShdr->sh_flags & ELF::SHF_ALLOC &&
|
||||
@ -1068,9 +1056,10 @@ index 4d001039238..62837bbcaa0 100644
|
||||
+ EShdr->sh_type == ELF::SHT_PROGBITS;
|
||||
+}
|
||||
+
|
||||
template <class ELFT>
|
||||
+template <class ELFT>
|
||||
relocation_iterator
|
||||
ELFObjectFile<ELFT>::section_rel_begin(DataRefImpl Sec) const {
|
||||
DataRefImpl RelData;
|
||||
@@ -751,9 +760,6 @@ ELFObjectFile<ELFT>::section_rel_end(DataRefImpl Sec) const {
|
||||
template <class ELFT>
|
||||
section_iterator
|
||||
@ -1101,7 +1090,7 @@ index 4d001039238..62837bbcaa0 100644
|
||||
if (sec->sh_type == ELF::SHT_REL)
|
||||
return getRel(Rel)->r_offset;
|
||||
diff --git a/include/llvm/Object/MachO.h b/include/llvm/Object/MachO.h
|
||||
index bfd3462bf69..9be0b260f34 100644
|
||||
index bfd3462bf69..52bc210b577 100644
|
||||
--- a/include/llvm/Object/MachO.h
|
||||
+++ b/include/llvm/Object/MachO.h
|
||||
@@ -320,6 +320,7 @@ public:
|
||||
@ -1112,15 +1101,6 @@ index bfd3462bf69..9be0b260f34 100644
|
||||
relocation_iterator section_rel_begin(DataRefImpl Sec) const override;
|
||||
relocation_iterator section_rel_end(DataRefImpl Sec) const override;
|
||||
|
||||
@@ -331,7 +332,7 @@ public:
|
||||
|
||||
relocation_iterator locrel_begin() const;
|
||||
relocation_iterator locrel_end() const;
|
||||
-
|
||||
+
|
||||
void moveRelocationNext(DataRefImpl &Rel) const override;
|
||||
uint64_t getRelocationOffset(DataRefImpl Rel) const override;
|
||||
symbol_iterator getRelocationSymbol(DataRefImpl Rel) const override;
|
||||
diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h
|
||||
index 9c4ae94d3a6..64342723371 100644
|
||||
--- a/include/llvm/Object/ObjectFile.h
|
||||
@ -1215,18 +1195,9 @@ index d11f5a83779..0ad115c886b 100644
|
||||
/// FD is the file descriptor that this writes to. If ShouldClose is true,
|
||||
/// this closes the file when the stream is destroyed. If FD is for stdout or
|
||||
diff --git a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
|
||||
index adada672af0..c9c79971a25 100644
|
||||
index adada672af0..b3d68ed66af 100644
|
||||
--- a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
|
||||
+++ b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
|
||||
@@ -38,7 +38,7 @@ DWARFAbbreviationDeclaration::DWARFAbbreviationDeclaration() {
|
||||
}
|
||||
|
||||
bool
|
||||
-DWARFAbbreviationDeclaration::extract(DataExtractor Data,
|
||||
+DWARFAbbreviationDeclaration::extract(DataExtractor Data,
|
||||
uint32_t* OffsetPtr) {
|
||||
clear();
|
||||
const uint32_t Offset = *OffsetPtr;
|
||||
@@ -61,13 +61,15 @@ DWARFAbbreviationDeclaration::extract(DataExtractor Data,
|
||||
|
||||
// Read all of the abbreviation attributes and forms.
|
||||
@ -1587,7 +1558,7 @@ index 3d274b63a4f..cef29f4b41d 100644
|
||||
|
||||
StringRef RuntimeDyld::getErrorString() { return Dyld->getErrorString(); }
|
||||
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
|
||||
index 36b43ec9b78..3dc3e8f325c 100644
|
||||
index 36b43ec9b78..1a56e590014 100644
|
||||
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
|
||||
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
|
||||
@@ -270,6 +270,25 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section,
|
||||
@ -1688,15 +1659,6 @@ index 36b43ec9b78..3dc3e8f325c 100644
|
||||
resolveAArch64Branch(SectionID, Value, RelI, Stubs);
|
||||
} else if (RelType == ELF::R_AARCH64_ADR_GOT_PAGE) {
|
||||
// Craete new GOT entry or find existing one. If GOT entry is
|
||||
@@ -1410,7 +1478,7 @@ RuntimeDyldELF::processRelocationRef(
|
||||
} else {
|
||||
processSimpleRelocation(SectionID, Offset, RelType, Value);
|
||||
}
|
||||
-
|
||||
+
|
||||
} else if (Arch == Triple::ppc64 || Arch == Triple::ppc64le) {
|
||||
if (RelType == ELF::R_PPC64_REL24) {
|
||||
// Determine ABI variant in use for this object.
|
||||
@@ -1632,7 +1700,7 @@ RuntimeDyldELF::processRelocationRef(
|
||||
// equivalent to the usual PLT implementation except that we use the stub
|
||||
// mechanism in RuntimeDyld (which puts stubs at the end of the section)
|
||||
@ -1819,18 +1781,10 @@ index a0f9a857e3c..be32963b705 100644
|
||||
assert((cast<MCFillFragment>(F).getValue() == 0) &&
|
||||
"Invalid fill in virtual section!");
|
||||
diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp
|
||||
index 0e0ea965d14..0044566d9ab 100644
|
||||
index 0e0ea965d14..49885269d06 100644
|
||||
--- a/lib/MC/MCDwarf.cpp
|
||||
+++ b/lib/MC/MCDwarf.cpp
|
||||
@@ -41,6 +41,7 @@
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
+#include <tuple>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
@@ -156,12 +157,36 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section,
|
||||
@@ -156,12 +156,36 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section,
|
||||
unsigned Flags = DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0;
|
||||
unsigned Isa = 0;
|
||||
unsigned Discriminator = 0;
|
||||
@ -1868,7 +1822,7 @@ index 0e0ea965d14..0044566d9ab 100644
|
||||
if (FileNum != LineEntry.getFileNum()) {
|
||||
FileNum = LineEntry.getFileNum();
|
||||
MCOS->EmitIntValue(dwarf::DW_LNS_set_file, 1);
|
||||
@@ -197,18 +222,33 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section,
|
||||
@@ -197,18 +221,33 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section,
|
||||
if (LineEntry.getFlags() & DWARF2_FLAG_EPILOGUE_BEGIN)
|
||||
MCOS->EmitIntValue(dwarf::DW_LNS_set_epilogue_begin, 1);
|
||||
|
||||
@ -1910,7 +1864,7 @@ index 0e0ea965d14..0044566d9ab 100644
|
||||
}
|
||||
|
||||
// Emit a DW_LNE_end_sequence for the end of the section.
|
||||
@@ -250,7 +290,7 @@ void MCDwarfLineTable::Emit(MCObjectStreamer *MCOS,
|
||||
@@ -250,7 +289,7 @@ void MCDwarfLineTable::Emit(MCObjectStreamer *MCOS,
|
||||
MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfLineSection());
|
||||
|
||||
// Handle the rest of the Compile Units.
|
||||
@ -1919,16 +1873,7 @@ index 0e0ea965d14..0044566d9ab 100644
|
||||
CUIDTablePair.second.EmitCU(MCOS, Params, LineStr);
|
||||
|
||||
if (LineStr)
|
||||
@@ -484,7 +524,7 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params,
|
||||
|
||||
// Parameters of the state machine, are next.
|
||||
MCOS->EmitIntValue(context.getAsmInfo()->getMinInstAlignment(), 1);
|
||||
- // maximum_operations_per_instruction
|
||||
+ // maximum_operations_per_instruction
|
||||
// For non-VLIW architectures this field is always 1.
|
||||
// FIXME: VLIW architectures need to update this field accordingly.
|
||||
if (LineTableVersion >= 4)
|
||||
@@ -514,8 +554,12 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params,
|
||||
@@ -514,8 +553,12 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params,
|
||||
|
||||
void MCDwarfLineTable::EmitCU(MCObjectStreamer *MCOS,
|
||||
MCDwarfLineTableParams Params,
|
||||
@ -1943,7 +1888,7 @@ index 0e0ea965d14..0044566d9ab 100644
|
||||
|
||||
// Put out the line tables.
|
||||
for (const auto &LineSec : MCLineSections.getMCLineEntries())
|
||||
@@ -1253,12 +1297,217 @@ public:
|
||||
@@ -1253,12 +1296,217 @@ public:
|
||||
void EmitCFIInstruction(const MCCFIInstruction &Instr);
|
||||
};
|
||||
|
||||
@ -2161,7 +2106,7 @@ index 0e0ea965d14..0044566d9ab 100644
|
||||
void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
|
||||
int dataAlignmentFactor = getDataAlignmentFactor(Streamer);
|
||||
auto *MRI = Streamer.getContext().getRegisterInfo();
|
||||
@@ -1373,7 +1622,28 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
|
||||
@@ -1373,7 +1621,28 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
|
||||
Streamer.EmitIntValue(dwarf::DW_CFA_GNU_args_size, 1);
|
||||
Streamer.EmitULEB128IntValue(Instr.getOffset());
|
||||
return;
|
||||
@ -2286,7 +2231,7 @@ index 0a684588110..58199c97420 100644
|
||||
unsigned char Value,
|
||||
SMLoc Loc) {
|
||||
diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp
|
||||
index 776569894a5..0954b70df49 100644
|
||||
index 776569894a5..aa130bb2d6a 100644
|
||||
--- a/lib/MC/MCStreamer.cpp
|
||||
+++ b/lib/MC/MCStreamer.cpp
|
||||
@@ -85,11 +85,15 @@ void MCStreamer::reset() {
|
||||
@ -2329,15 +2274,6 @@ index 776569894a5..0954b70df49 100644
|
||||
}
|
||||
|
||||
void MCStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
|
||||
@@ -513,7 +524,7 @@ void MCStreamer::EmitCFIEscape(StringRef Values) {
|
||||
|
||||
void MCStreamer::EmitCFIGnuArgsSize(int64_t Size) {
|
||||
MCSymbol *Label = EmitCFILabel();
|
||||
- MCCFIInstruction Instruction =
|
||||
+ MCCFIInstruction Instruction =
|
||||
MCCFIInstruction::createGnuArgsSize(Label, Size);
|
||||
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
|
||||
if (!CurFrame)
|
||||
@@ -884,6 +895,14 @@ void MCStreamer::visitUsedExpr(const MCExpr &Expr) {
|
||||
}
|
||||
}
|
||||
@ -2363,16 +2299,10 @@ index 776569894a5..0954b70df49 100644
|
||||
SMLoc Loc) {}
|
||||
void MCStreamer::EmitBundleAlignMode(unsigned AlignPow2) {}
|
||||
diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp
|
||||
index b544fa5c147..746c9f32865 100644
|
||||
index b544fa5c147..c885bf9f037 100644
|
||||
--- a/lib/Object/COFFObjectFile.cpp
|
||||
+++ b/lib/Object/COFFObjectFile.cpp
|
||||
@@ -339,11 +339,16 @@ unsigned COFFObjectFile::getSectionID(SectionRef Sec) const {
|
||||
|
||||
bool COFFObjectFile::isSectionVirtual(DataRefImpl Ref) const {
|
||||
const coff_section *Sec = toSec(Ref);
|
||||
- // In COFF, a virtual section won't have any in-file
|
||||
+ // In COFF, a virtual section won't have any in-file
|
||||
// content, so the file pointer to the content will be zero.
|
||||
@@ -344,6 +344,11 @@ bool COFFObjectFile::isSectionVirtual(DataRefImpl Ref) const {
|
||||
return Sec->PointerToRawData == 0;
|
||||
}
|
||||
|
||||
|
||||
12
runtime/CMakeLists.txt
Normal file
12
runtime/CMakeLists.txt
Normal file
@ -0,0 +1,12 @@
|
||||
cmake_minimum_required(VERSION 3.1.0)

# Require plain C++11 (no compiler-specific dialect) for the runtime library.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

project(libbolt_rt_project)

# The runtime is a single translation unit built into a static archive.
add_library(bolt_rt STATIC instr.cpp)

install(TARGETS bolt_rt DESTINATION lib)
|
||||
285
runtime/instr.cpp
Normal file
285
runtime/instr.cpp
Normal file
@ -0,0 +1,285 @@
|
||||
//===-- instr.cpp -----------------------------------------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
// This file contains code that is linked to the final binary with a function
|
||||
// that is called at program exit to dump instrumented data collected during
|
||||
// execution.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// BOLT runtime instrumentation library for x86 Linux.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include <cstdint>
|
||||
#include <elf.h>
|
||||
|
||||
// All extern declarations here need to be defined by BOLT itself.
|
||||
|
||||
// Counters inserted by instrumentation, incremented during runtime when
|
||||
// points of interest (locations) in the program are reached.
|
||||
extern uint64_t __bolt_instr_locations[];
|
||||
// Number of counters.
|
||||
extern uint32_t __bolt_instr_num_locs;
|
||||
// Filename to dump data to.
|
||||
extern char __bolt_instr_filename[];
|
||||
|
||||
// A program point expressed as "function name + offset". The name is not
// stored inline: FunctionName is an index into the string table, from which
// the actual name has to be retrieved.
struct Location {
  uint32_t FunctionName; // Index of the function's name in the string table
  uint32_t Offset;       // Offset paired with that function name
};
|
||||
|
||||
// An edge description defines an instrumented edge in the program, fully
|
||||
// identified by where the jump is located and its destination.
|
||||
struct EdgeDescription {
|
||||
Location From;
|
||||
Location To;
|
||||
};
|
||||
|
||||
// These need to be read from disk. They are generated by BOLT and written to
|
||||
// an ELF note section in the binary itself.
|
||||
struct InstrumentationInfo {
|
||||
EdgeDescription *Descriptions;
|
||||
char *Strings; // String table with function names used in this binary
|
||||
int FileDesc; // File descriptor for the file on disk backing this
|
||||
// information in memory via mmap
|
||||
uint8_t *MMapPtr; // The mmap ptr
|
||||
int MMapSize; // The mmap size
|
||||
};
|
||||
|
||||
// Declare some syscall wrappers we use throughout this code to avoid linking
|
||||
// against system libc.
|
||||
// Raw open(2): x86-64 Linux syscall number 2, issued without going through
// libc. Arguments travel in rdi/rsi/rdx; the value the kernel leaves in rax is
// returned unmodified (the syscall instruction itself clobbers rcx and r11).
static uint64_t
__open(const char *pathname,
       uint64_t flags,
       uint64_t mode) {
  // rax carries the syscall number in and the result out.
  uint64_t Result = 2;
  __asm__ __volatile__("syscall"
                       : "+a"(Result)
                       : "D"(pathname), "S"(flags), "d"(mode)
                       : "cc", "rcx", "r11", "memory");
  return Result;
}
|
||||
|
||||
// Raw write(2): x86-64 Linux syscall number 1. Passes fd/buf/count in
// rdi/rsi/rdx and returns whatever the kernel leaves in rax.
static uint64_t __write(uint64_t fd, const void *buf, uint64_t count) {
  // rax carries the syscall number in and the result out.
  uint64_t Result = 1;
  __asm__ __volatile__("syscall\n"
                       : "+a"(Result)
                       : "D"(fd), "S"(buf), "d"(count)
                       : "cc", "rcx", "r11", "memory");
  return Result;
}
|
||||
|
||||
// Raw lseek(2): x86-64 Linux syscall number 8. Passes fd/pos/whence in
// rdi/rsi/rdx and returns whatever the kernel leaves in rax.
static uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) {
  // rax carries the syscall number in and the result out.
  uint64_t Result = 8;
  __asm__ __volatile__("syscall\n"
                       : "+a"(Result)
                       : "D"(fd), "S"(pos), "d"(whence)
                       : "cc", "rcx", "r11", "memory");
  return Result;
}
|
||||
|
||||
// Raw close(2): x86-64 Linux syscall number 3. Passes fd in rdi; the 64-bit
// value left in rax is narrowed to int on return.
static int __close(uint64_t fd) {
  // rax carries the syscall number in and the result out.
  uint64_t Result = 3;
  __asm__ __volatile__("syscall\n"
                       : "+a"(Result)
                       : "D"(fd)
                       : "cc", "rcx", "r11", "memory");
  return Result;
}
|
||||
|
||||
// Raw mmap(2): x86-64 Linux syscall number 9. The first three arguments go in
// rdi/rsi/rdx; the remaining three must be in r10/r8/r9, which have no
// dedicated asm constraint letters, so they are pinned with explicit register
// variables. Returns the value the kernel leaves in rax as a pointer.
static void *__mmap(uint64_t addr, uint64_t size, uint64_t prot,
                    uint64_t flags, uint64_t fd, uint64_t offset) {
  register uint64_t FlagsReg asm("r10") = flags;
  register uint64_t FdReg asm("r8") = fd;
  register uint64_t OffsetReg asm("r9") = offset;
  // rax carries the syscall number in and the result out.
  uint64_t Result = 9;
  __asm__ __volatile__("syscall\n"
                       : "+a"(Result)
                       : "D"(addr), "S"(size), "d"(prot), "r"(FlagsReg),
                         "r"(FdReg), "r"(OffsetReg)
                       : "cc", "rcx", "r11", "memory");
  return reinterpret_cast<void *>(Result);
}
|
||||
|
||||
// Raw munmap(2): x86-64 Linux syscall number 11. Passes addr/size in rdi/rsi
// and returns whatever the kernel leaves in rax.
static uint64_t __munmap(void *addr, uint64_t size) {
  // rax carries the syscall number in and the result out.
  uint64_t Result = 11;
  __asm__ __volatile__("syscall\n"
                       : "+a"(Result)
                       : "D"(addr), "S"(size)
                       : "cc", "rcx", "r11", "memory");
  return Result;
}
|
||||
|
||||
// Terminates the process with the given status via x86-64 Linux syscall
// number 231 (exit_group). The return statement only exists to satisfy the
// signature.
static uint64_t __exit(uint64_t code) {
  // rax carries the syscall number in and the result out.
  uint64_t Result = 231;
  __asm__ __volatile__("syscall\n"
                       : "+a"(Result)
                       : "D"(code)
                       : "cc", "rcx", "r11", "memory");
  return Result;
}
|
||||
|
||||
// Helper functions for writing strings to the .fdata file
|
||||
|
||||
// Write number Num using Base to the buffer in OutBuf, returns a pointer to
// the end of the string (one past the last character written). No terminating
// NUL is appended.
//
// Base must be in [2, 16]; digits above 9 are written in lowercase. For any
// other base nothing is written and OutBuf is returned unchanged, because
// base 0 would divide by zero, base 1 would never terminate and bases above 16
// would index past the digit table.
//
// OutBuf must have room for every digit: at most 64 characters (a 64-bit value
// in base 2), 20 for base 10 and 16 for base 16.
static char *intToStr(char *OutBuf, uint64_t Num, uint32_t Base) {
  const char *Chars = "0123456789abcdef";
  if (Base < 2 || Base > 16)
    return OutBuf;

  // Digits are produced least-significant first, so stage them here and then
  // copy them out in reverse. 64 entries cover the worst case (base 2).
  char Buf[64];
  char *Ptr = Buf;
  // do/while so that zero still yields a single '0' digit.
  do {
    *Ptr++ = Chars[Num % Base];
    Num /= Base;
  } while (Num);

  while (Ptr != Buf)
    *OutBuf++ = *--Ptr;
  return OutBuf;
}
|
||||
|
||||
// Copies the characters of the NUL-terminated string Str into OutBuf, without
// the terminator, and returns a pointer just past the last character written.
static char *strCopy(char *OutBuf, const char *Str) {
  for (; *Str != '\0'; ++Str, ++OutBuf)
    *OutBuf = *Str;
  return OutBuf;
}
|
||||
|
||||
// Print Msg to STDERR and quits with error code 1.
|
||||
static void reportError(const char *Msg, uint64_t Size) {
|
||||
__write(2, Msg, Size);
|
||||
__exit(1);
|
||||
}
|
||||
|
||||
// Perform a string comparison and returns zero if Str1 matches Str2, one
// otherwise. Compares at most Size characters and stops early at the first NUL
// terminator, so two strings that agree on their first Size characters count
// as a match. A Size of zero or less compares nothing and reports a match
// without reading either string.
static int compareStr(const char *Str1, const char *Str2, int Size) {
  for (; Size > 0; --Size, ++Str1, ++Str2) {
    if (*Str1 != *Str2)
      return 1;
    // Both strings ended at the same position: everything matched.
    if (*Str1 == '\0')
      return 0;
  }
  // The character budget ran out without finding a difference.
  return 0;
}
|
||||
|
||||
// Write as a string in OutBuf an identifier for the program point at function
|
||||
// whose name is in the string table index FuncStrIndex plus Offset.
|
||||
static char *serializeLoc(const InstrumentationInfo &Info, char *OutBuf,
|
||||
const Location Loc) {
|
||||
// fdata location format: Type Name Offset
|
||||
// Type 1 - regular symbol
|
||||
OutBuf = strCopy(OutBuf, "1 ");
|
||||
const char *Str = Info.Strings + Loc.FunctionName;
|
||||
while (*Str) {
|
||||
*OutBuf++ = *Str++;
|
||||
}
|
||||
*OutBuf++ = ' ';
|
||||
OutBuf = intToStr(OutBuf, Loc.Offset, 16);
|
||||
*OutBuf++ = ' ';
|
||||
return OutBuf;
|
||||
}
|
||||
|
||||
// Read and map to memory the descriptions written by BOLT into the executable's
|
||||
// notes section
|
||||
static InstrumentationInfo readDescriptions() {
|
||||
InstrumentationInfo Result;
|
||||
uint64_t FD = __open("/proc/self/exe",
|
||||
/*flags=*/0 /*O_RDONLY*/,
|
||||
/*mode=*/0666);
|
||||
Result.FileDesc = FD;
|
||||
|
||||
// mmap our binary to memory
|
||||
uint64_t Size = __lseek(FD, 0, 2 /*SEEK_END*/);
|
||||
uint8_t *BinContents = reinterpret_cast<uint8_t *>(
|
||||
__mmap(0, Size, 0x1 /* PROT_READ*/, 0x2 /* MAP_PRIVATE*/, FD, 0));
|
||||
Result.MMapPtr = BinContents;
|
||||
Result.MMapSize = Size;
|
||||
Elf64_Ehdr *Hdr = reinterpret_cast<Elf64_Ehdr *>(BinContents);
|
||||
Elf64_Shdr *Shdr = reinterpret_cast<Elf64_Shdr *>(BinContents + Hdr->e_shoff);
|
||||
Elf64_Shdr *StringTblHeader = reinterpret_cast<Elf64_Shdr *>(
|
||||
BinContents + Hdr->e_shoff + Hdr->e_shstrndx * Hdr->e_shentsize);
|
||||
|
||||
// Find .bolt.instr.tables with the data we need and set pointers to it
|
||||
for (int I = 0; I < Hdr->e_shnum; ++I) {
|
||||
char *SecName = reinterpret_cast<char *>(
|
||||
BinContents + StringTblHeader->sh_offset + Shdr->sh_name);
|
||||
if (compareStr(SecName, ".bolt.instr.tables", 64) != 0) {
|
||||
Shdr = reinterpret_cast<Elf64_Shdr *>(BinContents + Hdr->e_shoff +
|
||||
(I + 1) * Hdr->e_shentsize);
|
||||
continue;
|
||||
}
|
||||
// Actual contents of the ELF note start after offset 20 decimal:
|
||||
// Offset 0: Producer name size (4 bytes)
|
||||
// Offset 4: Contents size (4 bytes)
|
||||
// Offset 8: Note type (4 bytes)
|
||||
// Offset 12: Producer name (BOLT\0) (5 bytes + align to 4-byte boundary)
|
||||
// Offset 20: Contents
|
||||
Result.Descriptions =
|
||||
reinterpret_cast<EdgeDescription *>(BinContents + Shdr->sh_offset + 20);
|
||||
// String table is located after the full EdgeDescriptions table containing
|
||||
// __bolt_instr_num_locs entries is finished
|
||||
Result.Strings = reinterpret_cast<char *>(
|
||||
BinContents + Shdr->sh_offset + 20 +
|
||||
(__bolt_instr_num_locs * sizeof(EdgeDescription)));
|
||||
return Result;
|
||||
}
|
||||
const char ErrMsg[] =
|
||||
"BOLT instrumentation runtime error: could not find section "
|
||||
".bolt.instr.tables\n";
|
||||
reportError(ErrMsg, sizeof(ErrMsg));
|
||||
return Result;
|
||||
}
|
||||
|
||||
// This is the entry point called at program exit. BOLT patches the executable's
|
||||
// FINI entry in the .dynamic section with the address of this function. Our
|
||||
// goal here is to flush to disk all instrumentation data in memory, using
|
||||
// BOLT's fdata format.
|
||||
extern "C" void __bolt_instr_data_dump() {
|
||||
const InstrumentationInfo Info = readDescriptions();
|
||||
|
||||
uint64_t FD = __open(__bolt_instr_filename,
|
||||
/*flags=*/0x241 /*O_WRONLY|O_TRUNC|O_CREAT*/,
|
||||
/*mode=*/0666);
|
||||
|
||||
for (int I = 0, E = __bolt_instr_num_locs; I < E; ++I) {
|
||||
char LineBuf[2000];
|
||||
char *Ptr = LineBuf;
|
||||
uint32_t HitCount = __bolt_instr_locations[I];
|
||||
if (!HitCount)
|
||||
continue;
|
||||
|
||||
EdgeDescription *Desc = &Info.Descriptions[I];
|
||||
Ptr = serializeLoc(Info, Ptr, Desc->From);
|
||||
Ptr = serializeLoc(Info, Ptr, Desc->To);
|
||||
Ptr = strCopy(Ptr, "0 ");
|
||||
Ptr = intToStr(Ptr, HitCount, 10);
|
||||
*Ptr++ = '\n';
|
||||
__write(FD, LineBuf, Ptr - LineBuf);
|
||||
}
|
||||
__close(FD);
|
||||
__munmap(Info.MMapPtr, Info.MMapSize);
|
||||
__close(Info.FileDesc);
|
||||
}
|
||||
@ -12,6 +12,7 @@
|
||||
#include "BinaryBasicBlock.h"
|
||||
#include "BinaryContext.h"
|
||||
#include "BinaryFunction.h"
|
||||
#include "ParallelUtilities.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/MC/MCAsmInfo.h"
|
||||
#include "llvm/MC/MCContext.h"
|
||||
@ -96,6 +97,10 @@ bool BinaryBasicBlock::validateSuccessorInvariants() {
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Unknown control flow.
|
||||
if (Inst && BC.MIB->isIndirectBranch(*Inst))
|
||||
return true;
|
||||
|
||||
const MCSymbol *TBB = nullptr;
|
||||
const MCSymbol *FBB = nullptr;
|
||||
MCInst *CondBranch = nullptr;
|
||||
@ -255,7 +260,7 @@ void BinaryBasicBlock::replaceSuccessor(BinaryBasicBlock *Succ,
|
||||
BinaryBasicBlock *NewSucc,
|
||||
uint64_t Count,
|
||||
uint64_t MispredictedCount) {
|
||||
Succ->removePredecessor(this);
|
||||
Succ->removePredecessor(this, /*Multiple=*/false);
|
||||
auto I = succ_begin();
|
||||
auto BI = BranchInfo.begin();
|
||||
for (; I != succ_end(); ++I) {
|
||||
@ -280,7 +285,7 @@ void BinaryBasicBlock::removeAllSuccessors() {
|
||||
}
|
||||
|
||||
void BinaryBasicBlock::removeSuccessor(BinaryBasicBlock *Succ) {
|
||||
Succ->removePredecessor(this);
|
||||
Succ->removePredecessor(this, /*Multiple=*/false);
|
||||
auto I = succ_begin();
|
||||
auto BI = BranchInfo.begin();
|
||||
for (; I != succ_end(); ++I) {
|
||||
@ -299,13 +304,16 @@ void BinaryBasicBlock::addPredecessor(BinaryBasicBlock *Pred) {
|
||||
Predecessors.push_back(Pred);
|
||||
}
|
||||
|
||||
void BinaryBasicBlock::removePredecessor(BinaryBasicBlock *Pred) {
|
||||
void BinaryBasicBlock::removePredecessor(BinaryBasicBlock *Pred,
|
||||
bool Multiple) {
|
||||
// Note: the predecessor could be listed multiple times.
|
||||
bool Erased{false};
|
||||
for (auto PredI = Predecessors.begin(); PredI != Predecessors.end(); ) {
|
||||
if (*PredI == Pred) {
|
||||
Erased = true;
|
||||
PredI = Predecessors.erase(PredI);
|
||||
if (!Multiple)
|
||||
return;
|
||||
} else {
|
||||
++PredI;
|
||||
}
|
||||
@ -448,6 +456,7 @@ void BinaryBasicBlock::addBranchInstruction(const BinaryBasicBlock *Successor) {
|
||||
assert(isSuccessor(Successor));
|
||||
auto &BC = Function->getBinaryContext();
|
||||
MCInst NewInst;
|
||||
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
|
||||
BC.MIB->createUncondBranch(NewInst, Successor->getLabel(), BC.Ctx.get());
|
||||
Instructions.emplace_back(std::move(NewInst));
|
||||
}
|
||||
@ -530,8 +539,8 @@ void BinaryBasicBlock::dump() const {
|
||||
outs() << "\n";
|
||||
}
|
||||
|
||||
uint64_t BinaryBasicBlock::estimateSize() const {
|
||||
return Function->getBinaryContext().computeCodeSize(begin(), end());
|
||||
uint64_t BinaryBasicBlock::estimateSize(const MCCodeEmitter *Emitter) const {
|
||||
return Function->getBinaryContext().computeCodeSize(begin(), end(), Emitter);
|
||||
}
|
||||
|
||||
BinaryBasicBlock::BinaryBranchInfo &
|
||||
|
||||
@ -16,14 +16,15 @@
|
||||
|
||||
#include "llvm/ADT/GraphTraits.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/MC/MCCodeEmitter.h"
|
||||
#include "llvm/MC/MCInst.h"
|
||||
#include "llvm/MC/MCSymbol.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/ErrorOr.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include <limits>
|
||||
#include <utility>
|
||||
#include <set>
|
||||
#include <utility>
|
||||
|
||||
namespace llvm {
|
||||
|
||||
@ -49,6 +50,12 @@ public:
|
||||
/// Pair of counters describing one branch.
struct BinaryBranchInfo {
  uint64_t Count; // NOTE(review): presumably the branch execution count — confirm
  uint64_t MispredictedCount; /// number of branches mispredicted

  /// Lexicographic ordering: compare by Count first and fall back to
  /// MispredictedCount only when the Count values are equal.
  bool operator<(const BinaryBranchInfo &Other) const {
    if (Count != Other.Count)
      return Count < Other.Count;
    return MispredictedCount < Other.MispredictedCount;
  }
};
|
||||
|
||||
static constexpr uint32_t INVALID_OFFSET =
|
||||
@ -358,13 +365,17 @@ public:
|
||||
|
||||
/// Find the fallthrough successor for a block, or nullptr if there is
|
||||
/// none.
|
||||
const BinaryBasicBlock* getFallthrough() const {
|
||||
BinaryBasicBlock* getFallthrough() {
|
||||
if (succ_size() == 2)
|
||||
return getConditionalSuccessor(false);
|
||||
else
|
||||
return getSuccessor();
|
||||
}
|
||||
|
||||
const BinaryBasicBlock *getFallthrough() const {
|
||||
return const_cast<BinaryBasicBlock *>(this)->getFallthrough();
|
||||
}
|
||||
|
||||
/// Return branch info corresponding to a taken branch.
|
||||
const BinaryBranchInfo &getTakenBranchInfo() const {
|
||||
assert(BranchInfo.size() == 2 &&
|
||||
@ -450,6 +461,13 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
/// Add a range of instructions to the end of this basic block.
///
/// \p R can be any type usable in a range-based for loop; every element is
/// forwarded, in iteration order, to addInstruction().
template <typename RangeTy>
void addInstructions(RangeTy R) {
  for (auto &Inst : R)
    addInstruction(Inst);
}
|
||||
|
||||
/// Add instruction before Pos in this basic block.
|
||||
template <typename Itr>
|
||||
Itr insertPseudoInstr(Itr Pos, MCInst &Instr) {
|
||||
@ -740,6 +758,11 @@ public:
|
||||
return Instructions.emplace(At, std::move(NewInst));
|
||||
}
|
||||
|
||||
/// Emplace \p NewInst into the instruction list at position \p At and return
/// the iterator produced by the container.
///
/// \p NewInst is passed to emplace() as an lvalue, so the caller's object is
/// not moved from. adjustNumPseudos() is called for \p NewInst with a delta
/// of 1 before the insertion.
iterator insertInstruction(iterator At, MCInst &NewInst) {
  adjustNumPseudos(NewInst, 1);
  return Instructions.emplace(At, NewInst);
}
|
||||
|
||||
/// Helper to retrieve any terminators in \p BB before \p Pos. This is used
|
||||
/// to skip CFI instructions and to retrieve the first terminator instruction
|
||||
/// in basic blocks with two terminators (conditional jump and unconditional
|
||||
@ -848,8 +871,11 @@ public:
|
||||
return InputRange.second - InputRange.first;
|
||||
}
|
||||
|
||||
/// Returns an estimate of size of basic block during run time.
|
||||
uint64_t estimateSize() const;
|
||||
/// Returns an estimate of size of basic block during run time optionally
|
||||
/// using a user-supplied emitter for lock-free multi-thread work.
|
||||
/// MCCodeEmitter is not thread safe and each thread should operate with its
|
||||
/// own copy of it.
|
||||
uint64_t estimateSize(const MCCodeEmitter *Emitter = nullptr) const;
|
||||
|
||||
/// Return index in the current layout. The user is responsible for
|
||||
/// making sure the indices are up to date,
|
||||
@ -884,7 +910,10 @@ private:
|
||||
|
||||
/// Remove predecessor of the basic block. Don't use directly, instead
|
||||
/// use removeSuccessor() function.
|
||||
void removePredecessor(BinaryBasicBlock *Pred);
|
||||
/// If \p Multiple is set to true, it will remove all predecessors that
|
||||
/// are equal to \p Pred. Otherwise, the first instance of \p Pred found
|
||||
/// will be removed. This only matters in awkward, redundant CFGs.
|
||||
void removePredecessor(BinaryBasicBlock *Pred, bool Multiple=true);
|
||||
|
||||
/// Return offset of the basic block from the function start.
|
||||
uint32_t getOffset() const {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -17,8 +17,10 @@
|
||||
#include "BinaryData.h"
|
||||
#include "BinarySection.h"
|
||||
#include "DebugData.h"
|
||||
#include "JumpTable.h"
|
||||
#include "MCPlusBuilder.h"
|
||||
#include "llvm/ADT/iterator.h"
|
||||
#include "llvm/ADT/ArrayRef.h"
|
||||
#include "llvm/ADT/Triple.h"
|
||||
#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
|
||||
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
|
||||
@ -32,6 +34,7 @@
|
||||
#include "llvm/MC/MCInstrInfo.h"
|
||||
#include "llvm/MC/MCObjectFileInfo.h"
|
||||
#include "llvm/MC/MCRegisterInfo.h"
|
||||
#include "llvm/MC/MCSectionELF.h"
|
||||
#include "llvm/MC/MCSubtargetInfo.h"
|
||||
#include "llvm/MC/MCSymbol.h"
|
||||
#include "llvm/Object/ObjectFile.h"
|
||||
@ -41,8 +44,10 @@
|
||||
#include <functional>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <shared_mutex>
|
||||
#include <string>
|
||||
#include <system_error>
|
||||
#include <type_traits>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
@ -55,8 +60,21 @@ using namespace object;
|
||||
namespace bolt {
|
||||
|
||||
class BinaryFunction;
|
||||
class BinaryBasicBlock;
|
||||
class DataReader;
|
||||
|
||||
/// Classification of memory contents. The underlying type is char;
/// UNKNOWN is explicitly 0 and the remaining enumerators follow
/// consecutively.
enum class MemoryContentsType : char {
  UNKNOWN = 0,             /// Unknown contents.
  POSSIBLE_JUMP_TABLE,     /// Possibly a non-PIC jump table.
  POSSIBLE_PIC_JUMP_TABLE, /// Possibly a PIC jump table.
};
|
||||
|
||||
/// Free memory allocated for \p List.
///
/// \p List is swapped with a default-constructed object of the same type. The
/// previous contents end up in the local temporary and are destroyed when it
/// goes out of scope, so \p List is left in its default-constructed state.
/// \p T must be default-constructible and provide a member swap().
template <typename T> void clearList(T &List) {
  T Empty;
  Empty.swap(List);
}
|
||||
|
||||
/// Helper function to truncate a \p Value to given size in \p Bytes.
|
||||
inline int64_t truncateToSize(int64_t Value, unsigned Bytes) {
|
||||
return Value & ((uint64_t) (int64_t) -1 >> (64 - Bytes * 8));
|
||||
@ -137,9 +155,23 @@ class BinaryContext {
|
||||
/// Low level section registration.
|
||||
BinarySection &registerSection(BinarySection *Section);
|
||||
|
||||
/// Store all functions in the binary, sorted by original address.
|
||||
std::map<uint64_t, BinaryFunction> BinaryFunctions;
|
||||
|
||||
/// A mutex that is used to control parallel accesses to BinaryFunctions
|
||||
mutable std::shared_timed_mutex BinaryFunctionsMutex;
|
||||
|
||||
/// Functions injected by BOLT
|
||||
std::vector<BinaryFunction *> InjectedBinaryFunctions;
|
||||
|
||||
/// Jump tables for all functions mapped by address.
|
||||
std::map<uint64_t, JumpTable *> JumpTables;
|
||||
|
||||
/// Used in duplicateJumpTable() to uniquely identify a JT clone
|
||||
/// Start our IDs with a high number so getJumpTableContainingAddress checks
|
||||
/// with size won't overflow
|
||||
uint32_t DuplicatedJumpTables{0x10000000};
|
||||
|
||||
public:
|
||||
/// [name] -> [BinaryData*] map used for global symbol resolution.
|
||||
using SymbolMapType = std::map<std::string, BinaryData *>;
|
||||
@ -160,6 +192,58 @@ public:
|
||||
FilterIterator<binary_data_const_iterator>;
|
||||
using FilteredBinaryDataIterator = FilterIterator<binary_data_iterator>;
|
||||
|
||||
/// Return BinaryFunction containing a given \p Address or nullptr if
|
||||
/// no registered function has it.
|
||||
///
|
||||
/// In a binary a function has somewhat vague boundaries. E.g. a function can
|
||||
/// refer to the first byte past the end of the function, and it will still be
|
||||
/// referring to this function, not the function following it in the address
|
||||
/// space. Thus we provide the following flags, which let a caller that has
|
||||
/// more context about the reference refine the search.
|
||||
///
|
||||
/// If \p CheckPastEnd is true and the \p Address falls on a byte
|
||||
/// immediately following the last byte of some function and there's no other
|
||||
/// function that starts there, then return the function as the one containing
|
||||
/// the \p Address. This is useful when we need to locate functions for
|
||||
/// references pointing immediately past a function body.
|
||||
///
|
||||
/// If \p UseMaxSize is true, then include the space between this function
|
||||
/// body and the next object in address ranges that we check.
|
||||
BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address,
|
||||
bool CheckPastEnd = false,
|
||||
bool UseMaxSize = false,
|
||||
bool Shallow = false);
|
||||
|
||||
/// Return BinaryFunction which has a fragment that starts at a given
|
||||
/// \p Address. If the BinaryFunction is a child fragment, then return its
|
||||
/// parent unless \p Shallow parameter is set to true.
|
||||
BinaryFunction *getBinaryFunctionAtAddress(uint64_t Address,
|
||||
bool Shallow = false);
|
||||
|
||||
/// Const overload of getBinaryFunctionAtAddress(). It casts away constness
/// and forwards \p Address and \p Shallow to the non-const overload,
/// returning the result as a pointer to const.
const BinaryFunction *getBinaryFunctionAtAddress(uint64_t Address,
                                                 bool Shallow = false) const {
  auto *MutableThis = const_cast<BinaryContext *>(this);
  return MutableThis->getBinaryFunctionAtAddress(Address, Shallow);
}
|
||||
|
||||
/// Return size of an entry for the given jump table \p Type.
|
||||
uint64_t getJumpTableEntrySize(JumpTable::JumpTableType Type) const {
|
||||
return Type == JumpTable::JTT_PIC ? 4 : AsmInfo->getCodePointerSize();
|
||||
}
|
||||
|
||||
/// Return JumpTable containing a given \p Address.
|
||||
JumpTable *getJumpTableContainingAddress(uint64_t Address) {
|
||||
auto JTI = JumpTables.upper_bound(Address);
|
||||
if (JTI == JumpTables.begin())
|
||||
return nullptr;
|
||||
--JTI;
|
||||
if (JTI->first + JTI->second->getSize() > Address)
|
||||
return JTI->second;
|
||||
if (JTI->second->getSize() == 0 && JTI->first == Address)
|
||||
return JTI->second;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
/// [MCSymbol] -> [BinaryFunction]
|
||||
///
|
||||
/// As we fold identical functions, multiple symbols can point
|
||||
@ -167,6 +251,9 @@ public:
|
||||
std::unordered_map<const MCSymbol *,
|
||||
BinaryFunction *> SymbolToFunctionMap;
|
||||
|
||||
/// A mutex that is used to control parallel accesses to SymbolToFunctionMap
|
||||
mutable std::shared_timed_mutex SymbolToFunctionMapMutex;
|
||||
|
||||
/// Look up the symbol entry that contains the given \p Address (based on
|
||||
/// the start address and size for each symbol). Returns a pointer to
|
||||
/// the BinaryData for that symbol. If no data is found, nullptr is returned.
|
||||
@ -187,6 +274,10 @@ public:
|
||||
/// top level BinaryData.
|
||||
bool validateHoles() const;
|
||||
|
||||
/// Produce output address ranges based on input ranges for some module.
|
||||
DebugAddressRangesVector translateModuleAddressRanges(
|
||||
const DWARFAddressRangesVector &InputRanges) const;
|
||||
|
||||
/// Get a bogus "absolute" section that will be associated with all
|
||||
/// absolute BinaryDatas.
|
||||
BinarySection &absoluteSection();
|
||||
@ -202,6 +293,25 @@ public:
|
||||
/// is complete, e.g. after building CFGs for all functions.
|
||||
void assignMemData();
|
||||
|
||||
/// Construct BinaryFunction object and add it to internal maps.
|
||||
BinaryFunction *createBinaryFunction(const std::string &Name,
|
||||
BinarySection &Section,
|
||||
uint64_t Address,
|
||||
uint64_t Size,
|
||||
bool IsSimple,
|
||||
uint64_t SymbolSize = 0,
|
||||
uint16_t Alignment = 0);
|
||||
|
||||
/// Return all functions for this rewrite instance.
|
||||
std::map<uint64_t, BinaryFunction> &getBinaryFunctions() {
|
||||
return BinaryFunctions;
|
||||
}
|
||||
|
||||
/// Return all functions for this rewrite instance.
|
||||
const std::map<uint64_t, BinaryFunction> &getBinaryFunctions() const {
|
||||
return BinaryFunctions;
|
||||
}
|
||||
|
||||
/// Create BOLT-injected function
|
||||
BinaryFunction *createInjectedBinaryFunction(const std::string &Name,
|
||||
bool IsSimple = true);
|
||||
@ -210,7 +320,54 @@ public:
|
||||
return InjectedBinaryFunctions;
|
||||
}
|
||||
|
||||
public:
|
||||
/// Construct a jump table for \p Function at \p Address or return an existing
|
||||
/// one at that location.
|
||||
///
|
||||
/// May create an embedded jump table; in either case the label of the jump
|
||||
/// table is returned.
|
||||
const MCSymbol *getOrCreateJumpTable(BinaryFunction &Function,
|
||||
uint64_t Address,
|
||||
JumpTable::JumpTableType Type);
|
||||
|
||||
/// Analyze a possible jump table of type \p Type at a given \p Address.
|
||||
/// \p BF is a function referencing the jump table.
|
||||
/// Return true if the jump table was detected at \p Address, and false
|
||||
/// otherwise.
|
||||
///
|
||||
/// If \p NextJTAddress is different from zero, it is used as an upper
|
||||
/// bound for jump table memory layout.
|
||||
///
|
||||
/// Optionally, populate \p Offsets with jump table entries. The entries
|
||||
/// could be partially populated if the jump table detection fails.
|
||||
bool analyzeJumpTable(const uint64_t Address,
|
||||
const JumpTable::JumpTableType Type,
|
||||
const BinaryFunction &BF,
|
||||
const uint64_t NextJTAddress = 0,
|
||||
JumpTable::OffsetsType *Offsets = nullptr);
|
||||
|
||||
/// After jump table locations are established, this function will populate
|
||||
/// their OffsetEntries based on memory contents.
|
||||
void populateJumpTables();
|
||||
|
||||
/// Returns a jump table ID and label pointing to the duplicated jump table.
|
||||
/// Ordinarily, jump tables are identified by their address in the input
|
||||
/// binary. We return an ID with the high bit set to differentiate it from
|
||||
/// regular addresses, avoiding conflicts with standard jump tables.
|
||||
std::pair<uint64_t, const MCSymbol *>
|
||||
duplicateJumpTable(BinaryFunction &Function, JumpTable *JT,
|
||||
const MCSymbol *OldLabel);
|
||||
|
||||
/// Generate a unique name for jump table at a given \p Address belonging
|
||||
/// to function \p BF.
|
||||
std::string generateJumpTableName(const BinaryFunction &BF, uint64_t Address);
|
||||
|
||||
/// Return true if the array of bytes represents a valid code padding.
|
||||
bool hasValidCodePadding(const BinaryFunction &BF);
|
||||
|
||||
/// Verify padding area between functions, and adjust max function size
|
||||
/// accordingly.
|
||||
void adjustCodePadding();
|
||||
|
||||
/// Regular page size.
|
||||
static constexpr unsigned RegularPageSize = 0x1000;
|
||||
|
||||
@ -220,13 +377,20 @@ public:
|
||||
/// Map address to a constant island owner (constant data in code section)
|
||||
std::map<uint64_t, BinaryFunction *> AddressToConstantIslandMap;
|
||||
|
||||
/// A map from jump table address to insertion order. Used for generating
|
||||
/// jump table names.
|
||||
std::map<uint64_t, size_t> JumpTableIds;
|
||||
|
||||
/// Set of addresses in the code that are not a function start, and are
|
||||
/// referenced from outside of containing function. E.g. this could happen
|
||||
/// when a function has more than a single entry point.
|
||||
std::set<uint64_t> InterproceduralReferences;
|
||||
std::set<std::pair<BinaryFunction *, uint64_t>> InterproceduralReferences;
|
||||
|
||||
std::unique_ptr<MCContext> Ctx;
|
||||
|
||||
/// A mutex that is used to control parallel accesses to Ctx
|
||||
mutable std::shared_timed_mutex CtxMutex;
|
||||
|
||||
std::unique_ptr<DWARFContext> DwCtx;
|
||||
|
||||
std::unique_ptr<Triple> TheTriple;
|
||||
@ -300,6 +464,9 @@ public:
|
||||
/// List of functions that always trap.
|
||||
std::vector<const BinaryFunction *> TrappedFunctions;
|
||||
|
||||
/// Map SDT locations to SDT markers info
|
||||
std::unordered_map<uint64_t, SDTMarkerInfo> SDTMarkers;
|
||||
|
||||
BinaryContext(std::unique_ptr<MCContext> Ctx,
|
||||
std::unique_ptr<DWARFContext> DwCtx,
|
||||
std::unique_ptr<Triple> TheTriple,
|
||||
@ -383,6 +550,25 @@ public:
|
||||
BinaryDataMap.clear();
|
||||
}
|
||||
|
||||
/// Process \p Address reference from code in function \p BF.
|
||||
/// \p IsPCRel indicates if the reference is PC-relative.
|
||||
/// Return <Symbol, Addend> pair corresponding to the \p Address.
|
||||
std::pair<const MCSymbol *, uint64_t> handleAddressRef(uint64_t Address,
|
||||
BinaryFunction &BF,
|
||||
bool IsPCRel);
|
||||
|
||||
/// Analyze memory contents at the given \p Address and return the type of
|
||||
/// memory contents (such as a possible jump table).
|
||||
MemoryContentsType analyzeMemoryAt(uint64_t Address, BinaryFunction &BF);
|
||||
|
||||
/// Return a value of the global \p Symbol or an error if the value
|
||||
/// was not set.
|
||||
ErrorOr<uint64_t> getSymbolValue(const MCSymbol &Symbol) const {
|
||||
const auto *BD = getBinaryDataByName(Symbol.getName());
|
||||
if (!BD)
|
||||
return std::make_error_code(std::errc::bad_address);
|
||||
return BD->getAddress();
|
||||
}
|
||||
|
||||
/// Return a global symbol registered at a given \p Address and \p Size.
|
||||
/// If no symbol exists, create one with unique name using \p Prefix.
|
||||
@ -448,6 +634,65 @@ public:
|
||||
return Itr != GlobalSymbols.end() ? Itr->second : nullptr;
|
||||
}
|
||||
|
||||
/// Return true if \p SymbolName was generated internally and was not present
|
||||
/// in the input binary.
|
||||
bool isInternalSymbolName(const StringRef Name) {
|
||||
return Name.startswith("SYMBOLat") ||
|
||||
Name.startswith("DATAat") ||
|
||||
Name.startswith("HOLEat");
|
||||
}
|
||||
|
||||
/// Return the "__hot_start" symbol, creating it in Ctx if it does not exist
/// yet.
MCSymbol *getHotTextStartSymbol() const {
  return Ctx->getOrCreateSymbol("__hot_start");
}

/// Return the "__hot_end" symbol, creating it in Ctx if it does not exist
/// yet.
MCSymbol *getHotTextEndSymbol() const {
  return Ctx->getOrCreateSymbol("__hot_end");
}
|
||||
|
||||
/// Return the text section reported by the object file info (MOFI).
MCSection *getTextSection() const {
  return MOFI->getTextSection();
}
|
||||
|
||||
/// Return code section with a given name.
|
||||
MCSection *getCodeSection(StringRef SectionName) const {
|
||||
return Ctx->getELFSection(SectionName,
|
||||
ELF::SHT_PROGBITS,
|
||||
ELF::SHF_EXECINSTR | ELF::SHF_ALLOC);
|
||||
}
|
||||
|
||||
/// \name Pre-assigned Section Names
|
||||
/// @{
|
||||
|
||||
const char *getMainCodeSectionName() const {
|
||||
return ".text";
|
||||
}
|
||||
|
||||
const char *getColdCodeSectionName() const {
|
||||
return ".text.cold";
|
||||
}
|
||||
|
||||
const char *getHotTextMoverSectionName() const {
|
||||
return ".text.mover";
|
||||
}
|
||||
|
||||
const char *getInjectedCodeSectionName() const {
|
||||
return ".text.injected";
|
||||
}
|
||||
|
||||
const char *getInjectedColdCodeSectionName() const {
|
||||
return ".text.injected.cold";
|
||||
}
|
||||
|
||||
/// Return the unique section named ".gdb_index", or the error produced by
/// getUniqueSectionByName() when that lookup fails.
ErrorOr<BinarySection &> getGdbIndexSection() const {
  return getUniqueSectionByName(".gdb_index");
}
|
||||
|
||||
/// @}
|
||||
|
||||
/// Resolve inter-procedural dependencies.
|
||||
void processInterproceduralReferences();
|
||||
|
||||
/// Perform any necessary post processing on the symbol table after
|
||||
/// function disassembly is complete. This processing fixes top
|
||||
/// level data holes and makes sure the symbol table is valid.
|
||||
@ -535,6 +780,19 @@ public:
|
||||
Sections.end()));
|
||||
}
|
||||
|
||||
/// Iterate over all registered code sections.
|
||||
iterator_range<FilteredSectionIterator> textSections() {
|
||||
auto isText = [](const SectionIterator &Itr) {
|
||||
return *Itr && Itr->isAllocatable() && Itr->isText();
|
||||
};
|
||||
return make_range(FilteredSectionIterator(isText,
|
||||
Sections.begin(),
|
||||
Sections.end()),
|
||||
FilteredSectionIterator(isText,
|
||||
Sections.end(),
|
||||
Sections.end()));
|
||||
}
|
||||
|
||||
/// Iterate over all registered allocatable sections.
|
||||
iterator_range<FilteredSectionConstIterator> allocatableSections() const {
|
||||
return const_cast<BinaryContext *>(this)->allocatableSections();
|
||||
@ -586,7 +844,9 @@ public:
|
||||
/// functions only work for allocatable sections, i.e. ones with non-zero
|
||||
/// addresses.
|
||||
ErrorOr<BinarySection &> getSectionForAddress(uint64_t Address);
|
||||
ErrorOr<const BinarySection &> getSectionForAddress(uint64_t Address) const;
|
||||
/// Const overload of getSectionForAddress(). It casts away constness and
/// forwards \p Address to the non-const overload; the result is returned as
/// a reference to const.
ErrorOr<const BinarySection &> getSectionForAddress(uint64_t Address) const {
  auto *MutableThis = const_cast<BinaryContext *>(this);
  return MutableThis->getSectionForAddress(Address);
}
|
||||
|
||||
/// Return section(s) associated with given \p Name.
|
||||
iterator_range<NameToSectionMapType::iterator>
|
||||
@ -598,18 +858,10 @@ public:
|
||||
return make_range(NameToSection.equal_range(Name));
|
||||
}
|
||||
|
||||
/// Return the unique (allocatable) section associated with given \p Name.
|
||||
/// Return the unique section associated with given \p Name.
|
||||
/// If there is more than one section with the same name, return an error
|
||||
/// object.
|
||||
ErrorOr<BinarySection &> getUniqueSectionByName(StringRef SectionName) {
|
||||
auto Sections = getSectionByName(SectionName);
|
||||
if (Sections.begin() != Sections.end() &&
|
||||
std::next(Sections.begin()) == Sections.end())
|
||||
return *Sections.begin()->second;
|
||||
return std::make_error_code(std::errc::bad_address);
|
||||
}
|
||||
ErrorOr<const BinarySection &>
|
||||
getUniqueSectionByName(StringRef SectionName) const {
|
||||
ErrorOr<BinarySection &> getUniqueSectionByName(StringRef SectionName) const {
|
||||
auto Sections = getSectionByName(SectionName);
|
||||
if (Sections.begin() != Sections.end() &&
|
||||
std::next(Sections.begin()) == Sections.end())
|
||||
@ -617,22 +869,38 @@ public:
|
||||
return std::make_error_code(std::errc::bad_address);
|
||||
}
|
||||
|
||||
/// Given \p Address in the binary, extract and return a pointer value at that
|
||||
/// address. The address has to be a valid statically allocated address for
|
||||
/// the binary.
|
||||
ErrorOr<uint64_t> extractPointerAtAddress(uint64_t Address) const;
|
||||
/// Return an unsigned value of \p Size stored at \p Address. The address has
|
||||
/// to be a valid statically allocated address for the binary.
|
||||
ErrorOr<uint64_t> getUnsignedValueAtAddress(uint64_t Address,
|
||||
size_t Size) const;
|
||||
|
||||
/// Return a signed value of \p Size stored at \p Address. The address has
|
||||
/// to be a valid statically allocated address for the binary.
|
||||
ErrorOr<uint64_t> getSignedValueAtAddress(uint64_t Address,
|
||||
size_t Size) const;
|
||||
|
||||
/// Special case of getUnsignedValueAtAddress() that uses a pointer size.
|
||||
ErrorOr<uint64_t> getPointerAtAddress(uint64_t Address) const {
|
||||
return getUnsignedValueAtAddress(Address, AsmInfo->getCodePointerSize());
|
||||
}
|
||||
|
||||
/// Replaces all references to \p ChildBF with \p ParentBF. \p ChildBF is then
|
||||
/// removed from the list of functions \p BFs. The profile data of \p ChildBF
|
||||
/// is merged into that of \p ParentBF.
|
||||
void foldFunction(BinaryFunction &ChildBF,
|
||||
BinaryFunction &ParentBF,
|
||||
std::map<uint64_t, BinaryFunction> &BFs);
|
||||
/// is merged into that of \p ParentBF. This function is thread safe.
|
||||
void foldFunction(BinaryFunction &ChildBF, BinaryFunction &ParentBF);
|
||||
|
||||
/// Add a Section relocation at a given \p Address.
|
||||
void addRelocation(uint64_t Address, MCSymbol *Symbol, uint64_t Type,
|
||||
uint64_t Addend = 0, uint64_t Value = 0);
|
||||
|
||||
/// All PC-relative relocations in data objects.
|
||||
std::map<uint64_t, std::pair<uint64_t, uint64_t>> PCRelocation;
|
||||
|
||||
/// Record a PC-relative relocation of \p Type with \p Value at \p Address
/// in the PCRelocation map. An entry already stored for \p Address is
/// overwritten.
void addPCRelativeDataRelocation(uint64_t Address, uint64_t Type,
                                 uint64_t Value) {
  PCRelocation[Address] = std::make_pair(Type, Value);
}
|
||||
|
||||
/// Remove registered relocation at a given \p Address.
|
||||
bool removeRelocationAt(uint64_t Address);
|
||||
|
||||
@ -640,12 +908,15 @@ public:
|
||||
/// is no relocation at such address.
|
||||
const Relocation *getRelocationAt(uint64_t Address);
|
||||
|
||||
/// This function is thread safe.
|
||||
const BinaryFunction *getFunctionForSymbol(const MCSymbol *Symbol) const {
|
||||
std::shared_lock<std::shared_timed_mutex> Lock(SymbolToFunctionMapMutex);
|
||||
auto BFI = SymbolToFunctionMap.find(Symbol);
|
||||
return BFI == SymbolToFunctionMap.end() ? nullptr : BFI->second;
|
||||
}
|
||||
|
||||
/// Non-const overload: return the function registered for \p Symbol in
/// SymbolToFunctionMap, or nullptr if the symbol has no entry.
///
/// A shared lock on SymbolToFunctionMapMutex is held for the duration of
/// the lookup.
BinaryFunction *getFunctionForSymbol(const MCSymbol *Symbol) {
  std::shared_lock<std::shared_timed_mutex> Lock(SymbolToFunctionMapMutex);
  const auto BFI = SymbolToFunctionMap.find(Symbol);
  if (BFI == SymbolToFunctionMap.end())
    return nullptr;
  return BFI->second;
}
|
||||
@ -657,8 +928,7 @@ public:
|
||||
}
|
||||
|
||||
/// Populate some internal data structures with debug info.
|
||||
void preprocessDebugInfo(
|
||||
std::map<uint64_t, BinaryFunction> &BinaryFunctions);
|
||||
void preprocessDebugInfo();
|
||||
|
||||
/// Add a filename entry from SrcCUID to DestCUID.
|
||||
unsigned addDebugFilenameToUnit(const uint32_t DestCUID,
|
||||
@ -666,8 +936,7 @@ public:
|
||||
unsigned FileIndex);
|
||||
|
||||
/// Return functions in output layout order
|
||||
static std::vector<BinaryFunction *>
|
||||
getSortedFunctions(std::map<uint64_t, BinaryFunction> &BinaryFunctions);
|
||||
std::vector<BinaryFunction *> getSortedFunctions();
|
||||
|
||||
/// Do the best effort to calculate the size of the function by emitting
|
||||
/// its code, and relaxing branch instructions.
|
||||
@ -676,26 +945,33 @@ public:
|
||||
/// size is for the cold one.
|
||||
std::pair<size_t, size_t> calculateEmittedSize(BinaryFunction &BF);
|
||||
|
||||
/// Calculate the size of the instruction \p Inst.
|
||||
uint64_t computeInstructionSize(const MCInst &Inst) const {
|
||||
/// Calculate the size of the instruction \p Inst optionally using a
|
||||
/// user-supplied emitter for lock-free multi-thread work. MCCodeEmitter is
|
||||
/// not thread safe and each thread should operate with its own copy of it.
|
||||
uint64_t
|
||||
computeInstructionSize(const MCInst &Inst,
|
||||
const MCCodeEmitter *Emitter = nullptr) const {
|
||||
if (!Emitter)
|
||||
Emitter = this->MCE.get();
|
||||
SmallString<256> Code;
|
||||
SmallVector<MCFixup, 4> Fixups;
|
||||
raw_svector_ostream VecOS(Code);
|
||||
MCE->encodeInstruction(Inst, VecOS, Fixups, *STI);
|
||||
|
||||
Emitter->encodeInstruction(Inst, VecOS, Fixups, *STI);
|
||||
return Code.size();
|
||||
}
|
||||
|
||||
/// Compute the native code size for a range of instructions.
|
||||
/// Note: this can be imprecise wrt the final binary since happening prior to
|
||||
/// relaxation, as well as wrt the original binary because of opcode
|
||||
/// shortening.
|
||||
/// shortening. MCCodeEmitter is not thread safe and each thread should operate
|
||||
/// with its own copy of it.
|
||||
template <typename Itr>
|
||||
uint64_t computeCodeSize(Itr Beg, Itr End) const {
|
||||
uint64_t computeCodeSize(Itr Beg, Itr End,
|
||||
const MCCodeEmitter *Emitter = nullptr) const {
|
||||
uint64_t Size = 0;
|
||||
while (Beg != End) {
|
||||
if (!MII->get(Beg->getOpcode()).isPseudo())
|
||||
Size += computeInstructionSize(*Beg);
|
||||
Size += computeInstructionSize(*Beg, Emitter);
|
||||
++Beg;
|
||||
}
|
||||
return Size;
|
||||
@ -760,8 +1036,44 @@ public:
|
||||
|
||||
void exitWithBugReport(StringRef Message,
|
||||
const BinaryFunction &Function) const;
|
||||
|
||||
struct IndependentCodeEmitter {
|
||||
std::unique_ptr<MCObjectFileInfo> LocalMOFI;
|
||||
std::unique_ptr<MCContext> LocalCtx;
|
||||
std::unique_ptr<MCCodeEmitter> MCE;
|
||||
};
|
||||
|
||||
/// Encapsulates an independent MCCodeEmitter that doesn't share resources
|
||||
/// with the main one available through BinaryContext::MCE, managed by
|
||||
/// BinaryContext.
|
||||
/// This is intended to create a lock-free environment for an auxiliary thread
|
||||
/// that needs to perform work with an MCCodeEmitter that can be transient or
|
||||
/// won't be used in the main code emitter.
|
||||
IndependentCodeEmitter createIndependentMCCodeEmitter() const {
|
||||
IndependentCodeEmitter MCEInstance;
|
||||
MCEInstance.LocalMOFI = llvm::make_unique<MCObjectFileInfo>();
|
||||
MCEInstance.LocalCtx = llvm::make_unique<MCContext>(
|
||||
AsmInfo.get(), MRI.get(), MCEInstance.LocalMOFI.get());
|
||||
MCEInstance.LocalMOFI->InitMCObjectFileInfo(*TheTriple, /*PIC=*/false,
|
||||
*MCEInstance.LocalCtx);
|
||||
MCEInstance.MCE.reset(
|
||||
TheTarget->createMCCodeEmitter(*MII, *MRI, *MCEInstance.LocalCtx));
|
||||
return MCEInstance;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T,
|
||||
typename = std::enable_if_t<sizeof(T) == 1> >
|
||||
inline raw_ostream &operator<<(raw_ostream &OS,
|
||||
const ArrayRef<T> &ByteArray) {
|
||||
const char *Sep = "";
|
||||
for (const auto Byte : ByteArray) {
|
||||
OS << Sep << format("%.2x", Byte);
|
||||
Sep = " ";
|
||||
}
|
||||
return OS;
|
||||
}
|
||||
|
||||
} // namespace bolt
|
||||
} // namespace llvm
|
||||
|
||||
|
||||
@ -73,8 +73,8 @@ StringRef BinaryData::getOutputSectionName() const {
|
||||
}
|
||||
|
||||
uint64_t BinaryData::getOutputAddress() const {
|
||||
assert(OutputSection->getFileAddress());
|
||||
return OutputSection->getFileAddress() + OutputOffset;
|
||||
assert(OutputSection->getOutputAddress());
|
||||
return OutputSection->getOutputAddress() + OutputOffset;
|
||||
}
|
||||
|
||||
uint64_t BinaryData::getOffset() const {
|
||||
|
||||
@ -106,7 +106,7 @@ public:
|
||||
bool isAtomic() const {
|
||||
return isTopLevelJumpTable() || !Parent;
|
||||
}
|
||||
|
||||
|
||||
/// Return a read-only range over all entries of Names.
iterator_range<std::vector<std::string>::const_iterator> names() const {
  return make_range(Names.begin(), Names.end());
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -18,6 +18,7 @@
|
||||
#include "BinaryBasicBlock.h"
|
||||
#include "BinaryContext.h"
|
||||
#include "BinaryLoop.h"
|
||||
#include "BinarySection.h"
|
||||
#include "DataReader.h"
|
||||
#include "DebugData.h"
|
||||
#include "JumpTable.h"
|
||||
@ -40,6 +41,7 @@
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
using namespace llvm::object;
|
||||
|
||||
@ -53,108 +55,6 @@ namespace bolt {
|
||||
using DWARFUnitLineTable = std::pair<DWARFUnit *,
|
||||
const DWARFDebugLine::LineTable *>;
|
||||
|
||||
/// Class encapsulating runtime statistics about an execution unit.
|
||||
class DynoStats {
|
||||
|
||||
#define DYNO_STATS\
|
||||
D(FIRST_DYNO_STAT, "<reserved>", Fn)\
|
||||
D(FORWARD_COND_BRANCHES, "executed forward branches", Fn)\
|
||||
D(FORWARD_COND_BRANCHES_TAKEN, "taken forward branches", Fn)\
|
||||
D(BACKWARD_COND_BRANCHES, "executed backward branches", Fn)\
|
||||
D(BACKWARD_COND_BRANCHES_TAKEN, "taken backward branches", Fn)\
|
||||
D(UNCOND_BRANCHES, "executed unconditional branches", Fn)\
|
||||
D(FUNCTION_CALLS, "all function calls", Fn)\
|
||||
D(INDIRECT_CALLS, "indirect calls", Fn)\
|
||||
D(PLT_CALLS, "PLT calls", Fn)\
|
||||
D(INSTRUCTIONS, "executed instructions", Fn)\
|
||||
D(LOADS, "executed load instructions", Fn)\
|
||||
D(STORES, "executed store instructions", Fn)\
|
||||
D(JUMP_TABLE_BRANCHES, "taken jump table branches", Fn)\
|
||||
D(ALL_BRANCHES, "total branches",\
|
||||
Fadd(ALL_CONDITIONAL, UNCOND_BRANCHES))\
|
||||
D(ALL_TAKEN, "taken branches",\
|
||||
Fadd(TAKEN_CONDITIONAL, UNCOND_BRANCHES))\
|
||||
D(NONTAKEN_CONDITIONAL, "non-taken conditional branches",\
|
||||
Fsub(ALL_CONDITIONAL, TAKEN_CONDITIONAL))\
|
||||
D(TAKEN_CONDITIONAL, "taken conditional branches",\
|
||||
Fadd(FORWARD_COND_BRANCHES_TAKEN, BACKWARD_COND_BRANCHES_TAKEN))\
|
||||
D(ALL_CONDITIONAL, "all conditional branches",\
|
||||
Fadd(FORWARD_COND_BRANCHES, BACKWARD_COND_BRANCHES))\
|
||||
D(VENEER_CALLS_AARCH64, "linker-inserted veneer calls", Fn)\
|
||||
D(LAST_DYNO_STAT, "<reserved>", 0)
|
||||
|
||||
public:
|
||||
#define D(name, ...) name,
|
||||
enum Category : uint8_t { DYNO_STATS };
|
||||
#undef D
|
||||
|
||||
|
||||
private:
|
||||
uint64_t Stats[LAST_DYNO_STAT+1];
|
||||
bool PrintAArch64Stats;
|
||||
|
||||
#define D(name, desc, ...) desc,
|
||||
static constexpr const char *Desc[] = { DYNO_STATS };
|
||||
#undef D
|
||||
|
||||
public:
|
||||
DynoStats(bool PrintAArch64Stats ) {
|
||||
this->PrintAArch64Stats = PrintAArch64Stats;
|
||||
for (auto Stat = FIRST_DYNO_STAT + 0; Stat < LAST_DYNO_STAT; ++Stat)
|
||||
Stats[Stat] = 0;
|
||||
}
|
||||
|
||||
uint64_t &operator[](size_t I) {
|
||||
assert(I > FIRST_DYNO_STAT && I < LAST_DYNO_STAT &&
|
||||
"index out of bounds");
|
||||
return Stats[I];
|
||||
}
|
||||
|
||||
uint64_t operator[](size_t I) const {
|
||||
switch (I) {
|
||||
#define D(name, desc, func) \
|
||||
case name: \
|
||||
return func;
|
||||
#define Fn Stats[I]
|
||||
#define Fadd(a, b) operator[](a) + operator[](b)
|
||||
#define Fsub(a, b) operator[](a) - operator[](b)
|
||||
#define F(a) operator[](a)
|
||||
#define Radd(a, b) (a + b)
|
||||
#define Rsub(a, b) (a - b)
|
||||
DYNO_STATS
|
||||
#undef Rsub
|
||||
#undef Radd
|
||||
#undef F
|
||||
#undef Fsub
|
||||
#undef Fadd
|
||||
#undef Fn
|
||||
#undef D
|
||||
default:
|
||||
llvm_unreachable("index out of bounds");
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void print(raw_ostream &OS, const DynoStats *Other = nullptr) const;
|
||||
|
||||
void operator+=(const DynoStats &Other);
|
||||
bool operator<(const DynoStats &Other) const;
|
||||
bool operator==(const DynoStats &Other) const;
|
||||
bool operator!=(const DynoStats &Other) const { return !operator==(Other); }
|
||||
bool lessThan(const DynoStats &Other, ArrayRef<Category> Keys) const;
|
||||
|
||||
static const char* Description(const Category C) {
|
||||
return Desc[C];
|
||||
}
|
||||
};
|
||||
|
||||
/// Stream \p Stats into \p OS via DynoStats::print(), passing no other
/// DynoStats object, and return \p OS for chaining.
inline raw_ostream &operator<<(raw_ostream &OS, const DynoStats &Stats) {
  const DynoStats *const NoOther = nullptr;
  Stats.print(OS, NoOther);
  return OS;
}
|
||||
|
||||
DynoStats operator+(const DynoStats &A, const DynoStats &B);
|
||||
|
||||
/// Types of macro-fusion alignment corrections.
|
||||
enum MacroFusionType {
|
||||
MFT_NONE,
|
||||
@ -302,11 +202,27 @@ private:
|
||||
|
||||
std::unique_ptr<BinaryLoopInfo> BLI;
|
||||
|
||||
/// All labels in the function that are referenced via relocations from
|
||||
/// data objects. Typically these are jump table destinations and computed
|
||||
/// goto labels.
|
||||
std::set<uint64_t> ExternallyReferencedOffsets;
|
||||
|
||||
/// Offsets of indirect branches with unknown destinations.
|
||||
std::set<uint64_t> UnknownIndirectBranchOffsets;
|
||||
|
||||
/// False if the function is too complex to reconstruct its control
|
||||
/// flow graph.
|
||||
/// In relocation mode we still disassemble and re-assemble such functions.
|
||||
bool IsSimple{true};
|
||||
|
||||
/// True if the function has an indirect branch with unknown destination.
|
||||
bool HasUnknownControlFlow{false};
|
||||
|
||||
/// The code from inside the function references one of the code locations
|
||||
/// from the same function as a data, i.e. it's possible the label is used
|
||||
/// inside an address calculation or could be referenced from outside.
|
||||
bool HasInternalLabelReference{false};
|
||||
|
||||
/// In AArch64, preserve nops to maintain code equal to input (assuming no
|
||||
/// optimizations are done).
|
||||
bool PreserveNops{false};
|
||||
@ -336,6 +252,15 @@ private:
|
||||
/// destination.
|
||||
bool HasFixedIndirectBranch{false};
|
||||
|
||||
/// Is the function known to exceed its input size?
|
||||
bool IsLarge{false};
|
||||
|
||||
/// True if the function is a fragment of another function. This means that
|
||||
/// this function could only be entered via its parent or one of its sibling
|
||||
/// fragments. It could be entered at any basic block. It can also return
|
||||
/// the control to any basic block of its parent or its sibling.
|
||||
bool IsFragment{false};
|
||||
|
||||
/// The address for the code for this function in codegen memory.
|
||||
uint64_t ImageAddress{0};
|
||||
|
||||
@ -348,6 +273,12 @@ private:
|
||||
/// Name for the corresponding cold code section.
|
||||
std::string ColdCodeSectionName;
|
||||
|
||||
/// Parent function for split function fragments.
|
||||
BinaryFunction *ParentFunction{nullptr};
|
||||
|
||||
/// All fragments for a parent function.
|
||||
std::unordered_set<BinaryFunction *> Fragments;
|
||||
|
||||
/// The profile data for the number of times the function was executed.
|
||||
uint64_t ExecutionCount{COUNT_NO_PROFILE};
|
||||
|
||||
@ -395,6 +326,9 @@ private:
|
||||
/// Function order for streaming into the destination binary.
|
||||
uint32_t Index{-1U};
|
||||
|
||||
/// Indicate that the function body has SDT marker
|
||||
bool HasSDTMarker{false};
|
||||
|
||||
/// Get basic block index assuming it belongs to this function.
|
||||
unsigned getIndex(const BinaryBasicBlock *BB) const {
|
||||
assert(BB->getIndex() < BasicBlocks.size());
|
||||
@ -433,7 +367,7 @@ private:
|
||||
|
||||
/// Associate DW_CFA_GNU_args_size info with invoke instructions
|
||||
/// (call instructions with non-empty landing pad).
|
||||
void propagateGnuArgsSizeInfo();
|
||||
void propagateGnuArgsSizeInfo(MCPlusBuilder::AllocatorIdTy AllocId);
|
||||
|
||||
/// Synchronize branch instructions with CFG.
|
||||
void postProcessBranches();
|
||||
@ -451,8 +385,8 @@ private:
|
||||
std::set<uint64_t> CodeOffsets;
|
||||
/// The address offset where we emitted the constant island, that is, the
|
||||
/// chunk of data in the function code area (AArch64 only)
|
||||
int64_t OutputDataOffset;
|
||||
int64_t OutputColdDataOffset;
|
||||
int64_t OutputDataOffset{0};
|
||||
int64_t OutputColdDataOffset{0};
|
||||
|
||||
/// Map labels to corresponding basic blocks.
|
||||
std::unordered_map<const MCSymbol *, BinaryBasicBlock *> LabelToBB;
|
||||
@ -537,25 +471,20 @@ private:
|
||||
/// function and that apply before the entry basic block).
|
||||
CFIInstrMapType CIEFrameInstructions;
|
||||
|
||||
/// All compound jump tables for this function.
|
||||
/// All compound jump tables for this function. This duplicates what's stored
|
||||
/// in the BinaryContext, but additionally it gives quick access for all
|
||||
/// jump tables used by this function.
|
||||
///
|
||||
/// <OriginalAddress> -> <JumpTable *>
|
||||
std::map<uint64_t, JumpTable *> JumpTables;
|
||||
|
||||
/// A map from jump table address to insertion order. Used for generating
|
||||
/// jump table names.
|
||||
mutable std::map<uint64_t, size_t> JumpTableIds;
|
||||
|
||||
/// Generate a unique name for this jump table at the given address that
|
||||
/// should be repeatable no matter what the start address of the table is.
|
||||
std::string generateJumpTableName(uint64_t Address) const;
|
||||
|
||||
/// Iterate over all jump tables associated with this function.
|
||||
iterator_range<std::map<uint64_t, JumpTable *>::const_iterator>
|
||||
jumpTables() const {
|
||||
return make_range(JumpTables.begin(), JumpTables.end());
|
||||
}
|
||||
|
||||
/// All jump table sites in the function.
|
||||
/// All jump table sites in the function before CFG is built.
|
||||
std::vector<std::pair<uint64_t, uint64_t>> JTSites;
|
||||
|
||||
/// List of relocations in this function.
|
||||
@ -625,6 +554,12 @@ private:
|
||||
/// Count the number of functions created.
|
||||
static uint64_t Count;
|
||||
|
||||
/// LocSym annotation records an index to this vector. This holds a label
|
||||
/// for each instruction whose input/output offsets need to be known after
|
||||
/// emission. Enables writing bolt address translation tables, used for
|
||||
/// mapping control transfer in the output binary back to the original binary.
|
||||
std::vector<const MCSymbol *> LocSyms;
|
||||
|
||||
/// Register alternative function name.
|
||||
void addAlternativeName(std::string NewName) {
|
||||
Names.emplace_back(NewName);
|
||||
@ -654,6 +589,17 @@ private:
|
||||
return getOrCreateLocalLabel(getAddress() + Offset);
|
||||
}
|
||||
|
||||
/// Register an internal offset in a function referenced from outside.
|
||||
void registerReferencedOffset(uint64_t Offset) {
|
||||
ExternallyReferencedOffsets.emplace(Offset);
|
||||
}
|
||||
|
||||
/// True if there are references to internals of this function from data,
|
||||
/// e.g. from jump tables.
|
||||
bool hasInternalReference() const {
|
||||
return !ExternallyReferencedOffsets.empty();
|
||||
}
|
||||
|
||||
/// Update all \p From references in the code to refer to \p To. Used
|
||||
/// in disassembled state only.
|
||||
void updateReferences(const MCSymbol *From, const MCSymbol *To);
|
||||
@ -661,6 +607,16 @@ private:
|
||||
/// This is called in disassembled state.
|
||||
void addEntryPoint(uint64_t Address);
|
||||
|
||||
/// Set \p BF as the parent of this function. Asserts that no different
/// parent was assigned before.
void setParentFunction(BinaryFunction *BF) {
  assert((ParentFunction == nullptr || ParentFunction == BF) &&
         "cannot have more than one parent function");
  ParentFunction = BF;
}
|
||||
|
||||
/// Add \p BF to the set of fragments of this function. Adding the same
/// fragment again has no effect.
void addFragment(BinaryFunction *BF) {
  Fragments.emplace(BF);
}
|
||||
|
||||
/// Return true if there is a registered entry point at a given offset
|
||||
/// into the function.
|
||||
bool hasEntryPointAtOffset(uint64_t Offset) {
|
||||
@ -687,9 +643,11 @@ private:
|
||||
|
||||
/// Emit line number information corresponding to \p NewLoc. \p PrevLoc
|
||||
/// provides a context for de-duplication of line number info.
|
||||
/// \p FirstInstr indicates if \p NewLoc represents the first instruction
|
||||
/// in a sequence, such as a function fragment.
|
||||
///
|
||||
/// Return new current location which is either \p NewLoc or \p PrevLoc.
|
||||
SMLoc emitLineInfo(SMLoc NewLoc, SMLoc PrevLoc) const;
|
||||
SMLoc emitLineInfo(SMLoc NewLoc, SMLoc PrevLoc, bool FirstInstr) const;
|
||||
|
||||
BinaryFunction& operator=(const BinaryFunction &) = delete;
|
||||
BinaryFunction(const BinaryFunction &) = delete;
|
||||
@ -759,6 +717,10 @@ public:
|
||||
return iterator_range<const_iterator>(begin(), end());
|
||||
}
|
||||
|
||||
// Iterators by pointer.
|
||||
BasicBlockListType::iterator pbegin() { return BasicBlocks.begin(); }
|
||||
BasicBlockListType::iterator pend() { return BasicBlocks.end(); }
|
||||
|
||||
order_iterator layout_begin() { return BasicBlocksLayout.begin(); }
|
||||
const_order_iterator layout_begin() const
|
||||
{ return BasicBlocksLayout.begin(); }
|
||||
@ -822,6 +784,13 @@ public:
|
||||
return *this;
|
||||
}
|
||||
|
||||
/// Return a symbol for an instruction location. \p Idx is recorded as an
|
||||
/// annotation in the instruction.
|
||||
const MCSymbol *getLocSym(size_t Idx) const {
|
||||
assert(Idx < LocSyms.size() && "Invalid index");
|
||||
return LocSyms[Idx];
|
||||
}
|
||||
|
||||
/// Update layout of basic blocks used for output.
|
||||
void updateBasicBlockLayout(BasicBlockOrderType &NewLayout) {
|
||||
BasicBlocksPreviousLayout = BasicBlocksLayout;
|
||||
@ -899,13 +868,6 @@ public:
|
||||
/// Attempt to validate CFG invariants.
|
||||
bool validateCFG() const;
|
||||
|
||||
/// Return dynostats for the function.
|
||||
///
|
||||
/// The function relies on branch instructions being in-sync with CFG for
|
||||
/// branch instructions stats. Thus it is better to call it after
|
||||
/// fixBranches().
|
||||
DynoStats getDynoStats() const;
|
||||
|
||||
BinaryBasicBlock *getBasicBlockForLabel(const MCSymbol *Label) {
|
||||
auto I = LabelToBB.find(Label);
|
||||
return I == LabelToBB.end() ? nullptr : I->second;
|
||||
@ -939,7 +901,7 @@ public:
|
||||
/// Retrieve the landing pad BB associated with invoke instruction \p Invoke
|
||||
/// that is in \p BB. Return nullptr if none exists
|
||||
BinaryBasicBlock *getLandingPadBBFor(const BinaryBasicBlock &BB,
|
||||
const MCInst &InvokeInst) {
|
||||
const MCInst &InvokeInst) const {
|
||||
assert(BC.MIB->isInvoke(InvokeInst) && "must be invoke instruction");
|
||||
const auto LP = BC.MIB->getEHInfo(InvokeInst);
|
||||
if (LP && LP->first) {
|
||||
@ -954,15 +916,20 @@ public:
|
||||
/// CFG is constructed or while instruction offsets are available in CFG.
|
||||
MCInst *getInstructionAtOffset(uint64_t Offset);
|
||||
|
||||
/// Const overload: forwards to the non-const getInstructionAtOffset() and
/// returns its result as a pointer to const.
const MCInst *getInstructionAtOffset(uint64_t Offset) const {
  auto *MutableThis = const_cast<BinaryFunction *>(this);
  return MutableThis->getInstructionAtOffset(Offset);
}
|
||||
|
||||
/// Return jump table that covers a given \p Address in memory.
|
||||
JumpTable *getJumpTableContainingAddress(uint64_t Address) {
|
||||
auto JTI = JumpTables.upper_bound(Address);
|
||||
if (JTI == JumpTables.begin())
|
||||
return nullptr;
|
||||
--JTI;
|
||||
if (JTI->first + JTI->second->getSize() > Address) {
|
||||
if (JTI->first + JTI->second->getSize() > Address)
|
||||
return JTI->second;
|
||||
if (JTI->second->getSize() == 0 && JTI->first == Address)
|
||||
return JTI->second;
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@ -1000,7 +967,7 @@ public:
|
||||
|
||||
/// Check if (possibly one out of many) function name matches the given
|
||||
/// regex.
|
||||
bool hasNameRegex(const std::string &NameRegex) const;
|
||||
const std::string *hasNameRegex(const StringRef NameRegex) const;
|
||||
|
||||
/// Return a vector of all possible names for the function.
|
||||
const std::vector<std::string> &getNames() const {
|
||||
@ -1124,6 +1091,7 @@ public:
|
||||
MCSymbol *getFunctionEndLabel() const {
|
||||
assert(BC.Ctx && "cannot be called with empty context");
|
||||
if (!FunctionEndLabel) {
|
||||
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
|
||||
FunctionEndLabel = BC.Ctx->createTempSymbol("func_end", true);
|
||||
}
|
||||
return FunctionEndLabel;
|
||||
@ -1132,6 +1100,7 @@ public:
|
||||
/// Return MC symbol associated with the end of the cold part of the function.
|
||||
MCSymbol *getFunctionColdEndLabel() const {
|
||||
if (!FunctionColdEndLabel) {
|
||||
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
|
||||
FunctionColdEndLabel = BC.Ctx->createTempSymbol("func_cold_end", true);
|
||||
}
|
||||
return FunctionColdEndLabel;
|
||||
@ -1232,7 +1201,7 @@ public:
|
||||
/// address in a function. During disassembly we have to make sure we create
|
||||
/// relocation at that location.
|
||||
void addPCRelativeRelocationAddress(uint64_t Address) {
|
||||
assert(Address >= getAddress() && Address < getAddress() + getSize() &&
|
||||
assert(containsAddress(Address, /*UseMaxSize=*/ true) &&
|
||||
"address is outside of the function");
|
||||
PCRelativeRelocationOffsets.emplace(Address - getAddress());
|
||||
}
|
||||
@ -1240,16 +1209,41 @@ public:
|
||||
/// Get data used by this function.
|
||||
std::set<BinaryData *> dataUses(bool OnlyHot) const;
|
||||
|
||||
/// Return then name of the section this function originated from.
|
||||
StringRef getOriginSectionName() const {
|
||||
return getSection().getName();
|
||||
}
|
||||
|
||||
/// Return internal section name for this function.
|
||||
StringRef getCodeSectionName() const {
|
||||
return StringRef(CodeSectionName);
|
||||
}
|
||||
|
||||
/// Assign a code section name to the function.
|
||||
void setCodeSectionName(StringRef Name) {
|
||||
CodeSectionName = Name;
|
||||
}
|
||||
|
||||
/// Get output code section.
|
||||
ErrorOr<BinarySection &> getCodeSection() const {
|
||||
return BC.getUniqueSectionByName(getCodeSectionName());
|
||||
}
|
||||
|
||||
/// Return cold code section name for the function.
|
||||
StringRef getColdCodeSectionName() const {
|
||||
return StringRef(ColdCodeSectionName);
|
||||
}
|
||||
|
||||
/// Assign a section name for the cold part of the function.
|
||||
void setColdCodeSectionName(StringRef Name) {
|
||||
ColdCodeSectionName = Name;
|
||||
}
|
||||
|
||||
/// Get output code section for cold code of this function.
|
||||
ErrorOr<BinarySection &> getColdCodeSection() const {
|
||||
return BC.getUniqueSectionByName(getColdCodeSectionName());
|
||||
}
|
||||
|
||||
/// Return true iff the function will halt execution on entry.
|
||||
bool trapsOnEntry() const {
|
||||
return TrapsOnEntry;
|
||||
@ -1264,6 +1258,16 @@ public:
|
||||
return IsSimple;
|
||||
}
|
||||
|
||||
/// Return true if the function has instruction(s) with unknown control flow.
|
||||
bool hasUnknownControlFlow() const {
|
||||
return HasUnknownControlFlow;
|
||||
}
|
||||
|
||||
/// Return true if the function should be split for the output.
|
||||
bool shouldSplit() const {
|
||||
return IsLarge && !getBinaryContext().HasRelocations;
|
||||
}
|
||||
|
||||
/// Return true if the function body is non-contiguous.
|
||||
bool isSplit() const {
|
||||
return layout_size() &&
|
||||
@ -1300,6 +1304,9 @@ public:
|
||||
return !JumpTables.empty();
|
||||
}
|
||||
|
||||
/// Return true if the function has SDT marker
|
||||
bool hasSDTMarker() const { return HasSDTMarker; }
|
||||
|
||||
const JumpTable *getJumpTable(const MCInst &Inst) const {
|
||||
const auto Address = BC.MIB->getJumpTable(Inst);
|
||||
return getJumpTableContainingAddress(Address);
|
||||
@ -1329,7 +1336,7 @@ public:
|
||||
}
|
||||
|
||||
/// Return true if the given address \p PC is inside the function body.
|
||||
bool containsAddress(uint64_t PC, bool UseMaxSize=false) const {
|
||||
bool containsAddress(uint64_t PC, bool UseMaxSize = false) const {
|
||||
if (UseMaxSize)
|
||||
return Address <= PC && PC < Address + MaxSize;
|
||||
return Address <= PC && PC < Address + Size;
|
||||
@ -1338,7 +1345,8 @@ public:
|
||||
/// Add new names this function is known under.
|
||||
template <class ContainterTy>
|
||||
void addNewNames(const ContainterTy &NewNames) {
|
||||
Names.insert(Names.begin(), NewNames.begin(), NewNames.end());
|
||||
Names.insert(Names.begin(), NewNames.begin(), NewNames.end());
|
||||
std::sort(Names.begin(), Names.end());
|
||||
}
|
||||
|
||||
/// Create a basic block at a given \p Offset in the
|
||||
@ -1353,6 +1361,7 @@ public:
|
||||
bool DeriveAlignment = false) {
|
||||
assert(BC.Ctx && "cannot be called with empty context");
|
||||
if (!Label) {
|
||||
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
|
||||
Label = BC.Ctx->createTempSymbol("BB", true);
|
||||
}
|
||||
auto BB = std::unique_ptr<BinaryBasicBlock>(
|
||||
@ -1379,9 +1388,10 @@ public:
|
||||
assert((CurrentState == State::CFG || !getBasicBlockAtOffset(Offset)) &&
|
||||
"basic block already exists in pre-CFG state");
|
||||
|
||||
if (!Label)
|
||||
if (!Label) {
|
||||
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
|
||||
Label = BC.Ctx->createTempSymbol("BB", true);
|
||||
|
||||
}
|
||||
auto BBPtr = createBasicBlock(Offset, Label, DeriveAlignment);
|
||||
BasicBlocks.emplace_back(BBPtr.release());
|
||||
|
||||
@ -1438,13 +1448,15 @@ public:
|
||||
BinaryBasicBlock *Start,
|
||||
std::vector<std::unique_ptr<BinaryBasicBlock>> &&NewBBs,
|
||||
const bool UpdateLayout = true,
|
||||
const bool UpdateCFIState = true);
|
||||
const bool UpdateCFIState = true,
|
||||
const bool RecomputeLandingPads = true);
|
||||
|
||||
iterator insertBasicBlocks(
|
||||
iterator StartBB,
|
||||
std::vector<std::unique_ptr<BinaryBasicBlock>> &&NewBBs,
|
||||
const bool UpdateLayout = true,
|
||||
const bool UpdateCFIState = true);
|
||||
const bool UpdateCFIState = true,
|
||||
const bool RecomputeLandingPads = true);
|
||||
|
||||
/// Update the basic block layout for this function. The BBs from
|
||||
/// [Start->Index, Start->Index + NumNewBlocks) are inserted into the
|
||||
@ -1463,6 +1475,20 @@ public:
|
||||
/// new blocks into the CFG. This must be called after updateLayout.
|
||||
void updateCFIState(BinaryBasicBlock *Start, const unsigned NumNewBlocks);
|
||||
|
||||
/// Return true if we detected ambiguous jump tables in this function, which
|
||||
/// happen when one JT is used in more than one indirect jump. This precludes
|
||||
/// us from splitting edges for this JT unless we duplicate the JT (see
|
||||
/// disambiguateJumpTables).
|
||||
bool checkForAmbiguousJumpTables();
|
||||
|
||||
/// Detect when two distinct indirect jumps are using the same jump table and
|
||||
/// duplicate it, allocating a separate JT for each indirect branch. This is
|
||||
/// necessary for code transformations on the CFG that change an edge induced
|
||||
/// by an indirect branch, e.g.: instrumentation or shrink wrapping. However,
|
||||
/// this is only possible if we are not updating jump tables in place, but are
|
||||
/// writing it to a new location (moving them).
|
||||
void disambiguateJumpTables();
|
||||
|
||||
/// Change \p OrigDest to \p NewDest in the jump table used at the end of
|
||||
/// \p BB. Returns false if \p OrigDest couldn't be found as a valid target
|
||||
/// and no replacement took place.
|
||||
@ -1628,6 +1654,11 @@ public:
|
||||
return *this;
|
||||
}
|
||||
|
||||
/// Set the IsLarge flag to \p Large and return this function so that setter
/// calls can be chained.
BinaryFunction &setLarge(bool Large) {
  this->IsLarge = Large;
  BinaryFunction &Self = *this;
  return Self;
}
|
||||
|
||||
BinaryFunction &setUsesGnuArgsSize(bool Uses = true) {
|
||||
UsesGnuArgsSize = Uses;
|
||||
return *this;
|
||||
@ -1701,6 +1732,10 @@ public:
|
||||
return ImageSize;
|
||||
}
|
||||
|
||||
/// Return the parent function recorded for this function; the pointer is
/// null when no parent has been set.
BinaryFunction *getParentFunction() const { return ParentFunction; }
|
||||
|
||||
/// Set the profile data for the number of times the function was called.
|
||||
BinaryFunction &setExecutionCount(uint64_t Count) {
|
||||
ExecutionCount = Count;
|
||||
@ -1807,6 +1842,7 @@ public:
|
||||
|
||||
// Register our island at global namespace
|
||||
Symbol = BC.getOrCreateGlobalSymbol(Address, "ISLANDat");
|
||||
|
||||
// Internal bookkeeping
|
||||
const auto Offset = Address - getAddress();
|
||||
assert((!IslandOffsets.count(Offset) || IslandOffsets[Offset] == Symbol) &&
|
||||
@ -1823,20 +1859,20 @@ public:
|
||||
/// separate symbols when emitting our constant island on behalf of this other
|
||||
/// function.
|
||||
MCSymbol *
|
||||
getOrCreateProxyIslandAccess(uint64_t Address, BinaryFunction *Referrer) {
|
||||
getOrCreateProxyIslandAccess(uint64_t Address, BinaryFunction &Referrer) {
|
||||
auto Symbol = getOrCreateIslandAccess(Address);
|
||||
if (!Symbol)
|
||||
return nullptr;
|
||||
|
||||
MCSymbol *Proxy;
|
||||
if (!IslandProxies[Referrer].count(Symbol)) {
|
||||
if (!IslandProxies[&Referrer].count(Symbol)) {
|
||||
Proxy =
|
||||
BC.Ctx->getOrCreateSymbol(Symbol->getName() +
|
||||
".proxy.for." + Referrer->getPrintName());
|
||||
IslandProxies[Referrer][Symbol] = Proxy;
|
||||
IslandProxies[Referrer][Proxy] = Symbol;
|
||||
".proxy.for." + Referrer.getPrintName());
|
||||
IslandProxies[&Referrer][Symbol] = Proxy;
|
||||
IslandProxies[&Referrer][Proxy] = Symbol;
|
||||
}
|
||||
Proxy = IslandProxies[Referrer][Symbol];
|
||||
Proxy = IslandProxies[&Referrer][Symbol];
|
||||
return Proxy;
|
||||
}
|
||||
|
||||
@ -1919,6 +1955,9 @@ public:
|
||||
/// Returns false if disassembly failed.
|
||||
void disassemble(ArrayRef<uint8_t> FunctionData);
|
||||
|
||||
/// Validate entry points.
|
||||
void postProcessEntryPoints();
|
||||
|
||||
/// Post-processing for jump tables after disassembly. Since their
|
||||
/// boundaries are not known until all call sites are seen, we need this
|
||||
/// extra pass to perform any final adjustments.
|
||||
@ -1930,7 +1969,7 @@ public:
|
||||
///
|
||||
/// Returns true on success and update the current function state to
|
||||
/// State::CFG. Returns false if CFG cannot be built.
|
||||
bool buildCFG();
|
||||
bool buildCFG(MCPlusBuilder::AllocatorIdTy);
|
||||
|
||||
/// Read any kind of profile information available for the function.
|
||||
void readProfile();
|
||||
@ -1951,7 +1990,7 @@ public:
|
||||
///
|
||||
/// Return true upon successful processing, or false if the control flow
|
||||
/// cannot be statically evaluated for any given indirect branch.
|
||||
bool postProcessIndirectBranches();
|
||||
bool postProcessIndirectBranches(MCPlusBuilder::AllocatorIdTy AllocId);
|
||||
|
||||
/// In functions with multiple entry points, the profile collection records
|
||||
/// data for other entry points in a different function entry. This function
|
||||
@ -2119,7 +2158,7 @@ public:
|
||||
/// Emit function code. The caller is responsible for emitting function
|
||||
/// symbol(s) and setting the section to emit the code to.
|
||||
void emitBody(MCStreamer &Streamer, bool EmitColdPart,
|
||||
bool EmitCodeOnly = false);
|
||||
bool EmitCodeOnly = false, bool LabelsForOffsets = false);
|
||||
|
||||
/// Emit function as a blob with relocations and labels for relocations.
|
||||
void emitBodyRaw(MCStreamer *Streamer);
|
||||
@ -2151,6 +2190,8 @@ public:
|
||||
|
||||
/// Sets the associated .debug_info entry.
|
||||
void addSubprogramDIE(const DWARFDie DIE) {
|
||||
static std::mutex CriticalSectionMutex;
|
||||
std::lock_guard<std::mutex> Lock(CriticalSectionMutex);
|
||||
SubprogramDIEs.emplace_back(DIE);
|
||||
if (!UnitLineTable.first) {
|
||||
if (const auto *LineTable =
|
||||
@ -2253,7 +2294,7 @@ public:
|
||||
}
|
||||
|
||||
/// Return output address ranges for a function.
|
||||
DWARFAddressRangesVector getOutputAddressRanges() const;
|
||||
DebugAddressRangesVector getOutputAddressRanges() const;
|
||||
|
||||
/// Given an address corresponding to an instruction in the input binary,
|
||||
/// return an address of this instruction in output binary.
|
||||
@ -2264,7 +2305,7 @@ public:
|
||||
|
||||
/// Take address ranges corresponding to the input binary and translate
|
||||
/// them to address ranges in the output binary.
|
||||
DWARFAddressRangesVector translateInputToOutputRanges(
|
||||
DebugAddressRangesVector translateInputToOutputRanges(
|
||||
const DWARFAddressRangesVector &InputRanges) const;
|
||||
|
||||
/// Similar to translateInputToOutputRanges() but operates on location lists
|
||||
@ -2307,48 +2348,6 @@ public:
|
||||
const FragmentInfo &cold() const { return ColdFragment; }
|
||||
};
|
||||
|
||||
/// Return program-wide dynostats.
|
||||
template <typename FuncsType>
|
||||
inline DynoStats getDynoStats(const FuncsType &Funcs) {
|
||||
bool IsAArch64 = Funcs.begin()->second.getBinaryContext().isAArch64();
|
||||
DynoStats dynoStats(IsAArch64);
|
||||
for (auto &BFI : Funcs) {
|
||||
auto &BF = BFI.second;
|
||||
if (BF.isSimple()) {
|
||||
dynoStats += BF.getDynoStats();
|
||||
}
|
||||
}
|
||||
return dynoStats;
|
||||
}
|
||||
|
||||
/// Call a function with optional before and after dynostats printing.
|
||||
template <typename FnType, typename FuncsType>
|
||||
inline void
|
||||
callWithDynoStats(FnType &&Func,
|
||||
const FuncsType &Funcs,
|
||||
StringRef Phase,
|
||||
const bool Flag) {
|
||||
bool IsAArch64 = Funcs.begin()->second.getBinaryContext().isAArch64();
|
||||
DynoStats DynoStatsBefore(IsAArch64);
|
||||
if (Flag) {
|
||||
DynoStatsBefore = getDynoStats(Funcs);
|
||||
}
|
||||
|
||||
Func();
|
||||
|
||||
if (Flag) {
|
||||
const auto DynoStatsAfter = getDynoStats(Funcs);
|
||||
const auto Changed = (DynoStatsAfter != DynoStatsBefore);
|
||||
outs() << "BOLT-INFO: program-wide dynostats after running "
|
||||
<< Phase << (Changed ? "" : " (no change)") << ":\n\n"
|
||||
<< DynoStatsBefore << '\n';
|
||||
if (Changed) {
|
||||
DynoStatsAfter.print(outs(), &DynoStatsBefore);
|
||||
}
|
||||
outs() << '\n';
|
||||
}
|
||||
}
|
||||
|
||||
inline raw_ostream &operator<<(raw_ostream &OS,
|
||||
const BinaryFunction &Function) {
|
||||
OS << Function.getPrintName();
|
||||
|
||||
@ -152,7 +152,7 @@ bool BinaryFunction::recordTrace(
|
||||
const auto *Instr = BB->getLastNonPseudoInstr();
|
||||
uint64_t Offset{0};
|
||||
if (Instr) {
|
||||
Offset = BC.MIB->getAnnotationWithDefault<uint64_t>(*Instr, "Offset");
|
||||
Offset = BC.MIB->getAnnotationWithDefault<uint32_t>(*Instr, "Offset");
|
||||
} else {
|
||||
Offset = BB->getOffset();
|
||||
}
|
||||
@ -175,7 +175,11 @@ bool BinaryFunction::recordBranch(uint64_t From, uint64_t To,
|
||||
return false;
|
||||
}
|
||||
|
||||
// Could be bad LBR data; ignore the branch.
|
||||
// Could be bad LBR data; ignore the branch. In the case of data collected
|
||||
// in binaries optimized by BOLT, a source BB may be mapped to two output
|
||||
// BBs as a result of optimizations. In that case, a branch between these
|
||||
// two will be recorded as a branch from A going to A in the source address
|
||||
// space. Keep processing.
|
||||
if (From == To) {
|
||||
return true;
|
||||
}
|
||||
@ -200,7 +204,7 @@ bool BinaryFunction::recordBranch(uint64_t From, uint64_t To,
|
||||
const auto *LastInstr = ToBB->getLastNonPseudoInstr();
|
||||
if (LastInstr) {
|
||||
const auto LastInstrOffset =
|
||||
BC.MIB->getAnnotationWithDefault<uint64_t>(*LastInstr, "Offset");
|
||||
BC.MIB->getAnnotationWithDefault<uint32_t>(*LastInstr, "Offset");
|
||||
|
||||
// With old .fdata we are getting FT branches for "jcc,jmp" sequences.
|
||||
if (To == LastInstrOffset && BC.MIB->isUnconditionalBranch(*LastInstr)) {
|
||||
@ -226,23 +230,40 @@ bool BinaryFunction::recordBranch(uint64_t From, uint64_t To,
|
||||
// discarded it as a FT from __builtin_unreachable.
|
||||
auto *FromInstruction = getInstructionAtOffset(From);
|
||||
if (!FromInstruction) {
|
||||
DEBUG(dbgs() << "no instruction for offset " << From << " in "
|
||||
<< *this << '\n');
|
||||
return false;
|
||||
}
|
||||
|
||||
if (FromBB == ToBB) {
|
||||
// Check for a return from a recursive call.
|
||||
// Otherwise it's a simple loop.
|
||||
// If the data was collected in a bolted binary, the From addresses may be
|
||||
// translated to the first instruction of the source BB if BOLT inserted
|
||||
// a new branch that did not exist in the source (we can't map it to the
|
||||
// source instruction, so we map it to the first instr of source BB).
|
||||
// We do not keep offsets for random instructions. So the check above will
|
||||
// evaluate to true if the first instr is not a branch (call/jmp/ret/etc)
|
||||
if (BC.DR.collectedInBoltedBinary()) {
|
||||
if (FromBB->getInputOffset() != From) {
|
||||
DEBUG(dbgs() << "offset " << From << " does not match a BB in " << *this
|
||||
<< '\n');
|
||||
return false;
|
||||
}
|
||||
FromInstruction = nullptr;
|
||||
} else {
|
||||
DEBUG(dbgs() << "no instruction for offset " << From << " in " << *this
|
||||
<< '\n');
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!FromBB->getSuccessor(ToBB->getLabel())) {
|
||||
// Check if this is a recursive call or a return from a recursive call.
|
||||
if (ToBB->isEntryPoint() && (BC.MIB->isCall(*FromInstruction) ||
|
||||
BC.MIB->isIndirectBranch(*FromInstruction))) {
|
||||
if (FromInstruction && ToBB->isEntryPoint() &&
|
||||
(BC.MIB->isCall(*FromInstruction) ||
|
||||
BC.MIB->isIndirectBranch(*FromInstruction))) {
|
||||
// Execution count is already accounted for.
|
||||
return true;
|
||||
}
|
||||
// For data collected in a bolted binary, we may have created two output BBs
|
||||
// that map to one original block. Branches between these two blocks will
|
||||
// appear here as one BB jumping to itself, even though it has no loop edges.
|
||||
// Ignore these.
|
||||
if (BC.DR.collectedInBoltedBinary() && FromBB == ToBB)
|
||||
return true;
|
||||
|
||||
DEBUG(dbgs() << "invalid branch in " << *this << '\n'
|
||||
<< Twine::utohexstr(From) << " -> "
|
||||
@ -299,16 +320,15 @@ void BinaryFunction::postProcessProfile() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Check if MCF post-processing was requested.
|
||||
if (opts::DoMCF != MCF_DISABLE) {
|
||||
removeTagsFromProfile();
|
||||
solveMCF(*this, opts::DoMCF);
|
||||
if (!(getProfileFlags() & PF_LBR)) {
|
||||
// Check if MCF post-processing was requested.
|
||||
if (opts::DoMCF != MCF_DISABLE) {
|
||||
removeTagsFromProfile();
|
||||
solveMCF(*this, opts::DoMCF);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (!(getProfileFlags() & PF_LBR))
|
||||
return;
|
||||
|
||||
// Pre-sort branch data.
|
||||
if (BranchData)
|
||||
std::stable_sort(BranchData->Data.begin(), BranchData->Data.end());
|
||||
@ -368,6 +388,12 @@ void BinaryFunction::postProcessProfile() {
|
||||
if (opts::InferFallThroughs)
|
||||
inferFallThroughCounts();
|
||||
|
||||
// Check if MCF post-processing was requested.
|
||||
if (opts::DoMCF != MCF_DISABLE) {
|
||||
removeTagsFromProfile();
|
||||
solveMCF(*this, opts::DoMCF);
|
||||
}
|
||||
|
||||
// Update profile information for jump tables based on CFG branch data.
|
||||
for (auto *BB : BasicBlocks) {
|
||||
const auto *LastInstr = BB->getLastNonPseudoInstr();
|
||||
@ -843,6 +869,11 @@ float BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) {
|
||||
if (BI.From.Name == BI.To.Name) {
|
||||
// Try to record information with 0 count.
|
||||
IsValid = recordBranch(BI.From.Offset, BI.To.Offset, 0);
|
||||
} else if (BC.DR.collectedInBoltedBinary()) {
|
||||
// We can't check branch source for collections in bolted binaries because
|
||||
// the source of the branch may be mapped to the first instruction in a BB
|
||||
// instead of the original branch (which may not exist in the source bin).
|
||||
IsValid = true;
|
||||
} else {
|
||||
// The branch has to originate from this function.
|
||||
// Check for calls, tail calls, rets and indirect branches.
|
||||
|
||||
@ -201,6 +201,13 @@ PrintUCE("print-uce",
|
||||
cl::Hidden,
|
||||
cl::cat(BoltOptCategory));
|
||||
|
||||
static cl::opt<bool>
|
||||
PrintProfileStats("print-profile-stats",
|
||||
cl::desc("print profile quality/bias analysis"),
|
||||
cl::ZeroOrMore,
|
||||
cl::init(false),
|
||||
cl::cat(BoltCategory));
|
||||
|
||||
static cl::opt<bool>
|
||||
SimplifyConditionalTailCalls("simplify-conditional-tail-calls",
|
||||
cl::desc("simplify conditional tail calls by removing unnecessary jumps"),
|
||||
@ -229,6 +236,14 @@ StringOps("inline-memcpy",
|
||||
cl::ZeroOrMore,
|
||||
cl::cat(BoltOptCategory));
|
||||
|
||||
static cl::list<std::string>
|
||||
SpecializeMemcpy1("memcpy1-spec",
|
||||
cl::desc("list of functions with call sites for which to specialize memcpy() "
|
||||
"for size 1"),
|
||||
cl::value_desc("func1,func2:cs1:cs2,func3:cs1,..."),
|
||||
cl::ZeroOrMore,
|
||||
cl::cat(BoltOptCategory));
|
||||
|
||||
static cl::opt<bool>
|
||||
StripRepRet("strip-rep-ret",
|
||||
cl::desc("strip 'repz' prefix from 'repz retq' sequence (on by default)"),
|
||||
@ -292,6 +307,7 @@ const char BinaryFunctionPassManager::TimerGroupDesc[] =
|
||||
"Binary Function Pass Manager";
|
||||
|
||||
void BinaryFunctionPassManager::runPasses() {
|
||||
auto &BFs = BC.getBinaryFunctions();
|
||||
for (const auto &OptPassPair : Passes) {
|
||||
if (!OptPassPair.first)
|
||||
continue;
|
||||
@ -307,7 +323,7 @@ void BinaryFunctionPassManager::runPasses() {
|
||||
|
||||
callWithDynoStats(
|
||||
[this,&Pass] {
|
||||
Pass->runOnFunctions(BC, BFs, LargeFunctions);
|
||||
Pass->runOnFunctions(BC);
|
||||
},
|
||||
BFs,
|
||||
Pass->getName(),
|
||||
@ -350,14 +366,10 @@ void BinaryFunctionPassManager::runPasses() {
|
||||
}
|
||||
}
|
||||
|
||||
void BinaryFunctionPassManager::runAllPasses(
|
||||
BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &Functions,
|
||||
std::set<uint64_t> &LargeFunctions
|
||||
) {
|
||||
BinaryFunctionPassManager Manager(BC, Functions, LargeFunctions);
|
||||
void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
|
||||
BinaryFunctionPassManager Manager(BC);
|
||||
|
||||
const auto InitialDynoStats = getDynoStats(Functions);
|
||||
const auto InitialDynoStats = getDynoStats(BC.getBinaryFunctions());
|
||||
|
||||
// Here we manage dependencies/order manually, since passes are run in the
|
||||
// order they're registered.
|
||||
@ -365,6 +377,9 @@ void BinaryFunctionPassManager::runAllPasses(
|
||||
// Run this pass first to use stats for the original functions.
|
||||
Manager.registerPass(llvm::make_unique<PrintProgramStats>(NeverPrint));
|
||||
|
||||
if (opts::PrintProfileStats)
|
||||
Manager.registerPass(llvm::make_unique<PrintProfileStats>(NeverPrint));
|
||||
|
||||
Manager.registerPass(llvm::make_unique<ValidateInternalCalls>(NeverPrint));
|
||||
|
||||
Manager.registerPass(llvm::make_unique<StripRepRet>(NeverPrint),
|
||||
@ -374,7 +389,12 @@ void BinaryFunctionPassManager::runAllPasses(
|
||||
opts::ICF);
|
||||
|
||||
if (BC.isAArch64())
|
||||
Manager.registerPass(llvm::make_unique<VeneerElimination>(PrintVeneerElimination));
|
||||
Manager.registerPass(
|
||||
llvm::make_unique<VeneerElimination>(PrintVeneerElimination));
|
||||
|
||||
Manager.registerPass(
|
||||
llvm::make_unique<SpecializeMemcpy1>(NeverPrint, opts::SpecializeMemcpy1),
|
||||
!opts::SpecializeMemcpy1.empty());
|
||||
|
||||
Manager.registerPass(llvm::make_unique<InlineMemcpy>(NeverPrint),
|
||||
opts::StringOps);
|
||||
@ -463,10 +483,14 @@ void BinaryFunctionPassManager::runAllPasses(
|
||||
Manager.registerPass(
|
||||
llvm::make_unique<RetpolineInsertion>(PrintRetpolineInsertion));
|
||||
|
||||
Manager.registerPass(
|
||||
llvm::make_unique<LFenceInsertion>());
|
||||
// Insert lfences to mitigate Spectre v1 and LVI. This pass is not compatible
|
||||
// with the retpoline mitigation pass.
|
||||
Manager.registerPass(llvm::make_unique<LFenceInsertion>());
|
||||
|
||||
// Thighten branches according to offset differences between branch and
|
||||
// Assign each function an output section.
|
||||
Manager.registerPass(llvm::make_unique<AssignSections>());
|
||||
|
||||
// Tighten branches according to offset differences between branch and
|
||||
// targets. No extra instructions after this pass, otherwise we may have
|
||||
// relocations out of range and crash during linking.
|
||||
if (BC.isAArch64())
|
||||
|
||||
@ -27,8 +27,6 @@ namespace bolt {
|
||||
class BinaryFunctionPassManager {
|
||||
private:
|
||||
BinaryContext &BC;
|
||||
std::map<uint64_t, BinaryFunction> &BFs;
|
||||
std::set<uint64_t> &LargeFunctions;
|
||||
std::vector<std::pair<const bool,
|
||||
std::unique_ptr<BinaryFunctionPass>>> Passes;
|
||||
|
||||
@ -36,10 +34,8 @@ private:
|
||||
static const char TimerGroupName[];
|
||||
static const char TimerGroupDesc[];
|
||||
|
||||
BinaryFunctionPassManager(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions)
|
||||
: BC(BC), BFs(BFs), LargeFunctions(LargeFunctions) {}
|
||||
BinaryFunctionPassManager(BinaryContext &BC)
|
||||
: BC(BC) {}
|
||||
|
||||
/// Adds a pass to this manager based on the value of its corresponding
|
||||
/// command-line option.
|
||||
@ -57,10 +53,7 @@ private:
|
||||
void runPasses();
|
||||
|
||||
/// Runs all enabled implemented passes on all functions.
|
||||
static void runAllPasses(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &Functions,
|
||||
std::set<uint64_t> &LargeFunctions);
|
||||
|
||||
static void runAllPasses(BinaryContext &BC);
|
||||
};
|
||||
|
||||
} // namespace bolt
|
||||
|
||||
@ -66,7 +66,7 @@ BinarySection::~BinarySection() {
|
||||
delete[] getData();
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
if (!isAllocatable() &&
|
||||
(!hasSectionRef() ||
|
||||
OutputContents.data() != getContents(Section).data())) {
|
||||
@ -78,7 +78,7 @@ void BinarySection::print(raw_ostream &OS) const {
|
||||
OS << getName() << ", "
|
||||
<< "0x" << Twine::utohexstr(getAddress()) << ", "
|
||||
<< getSize()
|
||||
<< " (0x" << Twine::utohexstr(getFileAddress()) << ", "
|
||||
<< " (0x" << Twine::utohexstr(getOutputAddress()) << ", "
|
||||
<< getOutputSize() << ")"
|
||||
<< ", data = " << getData()
|
||||
<< ", output data = " << getOutputData();
|
||||
@ -160,3 +160,23 @@ void BinarySection::reorderContents(const std::vector<BinaryData *> &Order,
|
||||
Contents = OutputContents = StringRef(NewData, OS.str().size());
|
||||
OutputSize = Contents.size();
|
||||
}
|
||||
|
||||
/// Serialize an ELF note record: three 4-byte header words (name size
/// including the terminating NUL, descriptor size, note type), followed by the
/// NUL-terminated name and the descriptor, each zero-padded up to a multiple
/// of 4 bytes. The header words are emitted in host byte order.
std::string BinarySection::encodeELFNote(StringRef NameStr, StringRef DescStr,
                                         uint32_t Type) {
  std::string Buffer;
  raw_string_ostream Stream(Buffer);

  // The name size accounts for the trailing NUL written after the name.
  const uint32_t NameSz = NameStr.size() + 1;
  const uint32_t DescSz = DescStr.size();

  // Appends NUL bytes until \p Written reaches the next 4-byte boundary.
  auto PadToWord = [&Stream](uint64_t Written) {
    const uint64_t Aligned = alignTo(Written, 4);
    while (Written < Aligned) {
      Stream << '\0';
      ++Written;
    }
  };

  // Note header.
  Stream.write(reinterpret_cast<const char *>(&NameSz), 4);
  Stream.write(reinterpret_cast<const char *>(&DescSz), 4);
  Stream.write(reinterpret_cast<const char *>(&Type), 4);

  // Name field.
  Stream << NameStr << '\0';
  PadToWord(NameSz);

  // Descriptor field.
  Stream << DescStr;
  PadToWord(DescStr.size());

  return Stream.str();
}
|
||||
|
||||
@ -62,13 +62,16 @@ class BinarySection {
|
||||
// finalized?
|
||||
std::string OutputName; // Output section name (if the section has
|
||||
// been renamed)
|
||||
uint64_t FileAddress{0}; // Section address for the rewritten binary.
|
||||
uint64_t OutputAddress{0}; // Section address for the rewritten binary.
|
||||
uint64_t OutputSize{0}; // Section size in the rewritten binary.
|
||||
uint64_t FileOffset{0}; // File offset in the rewritten binary file.
|
||||
StringRef OutputContents; // Rewritten section contents.
|
||||
unsigned SectionID{-1u}; // Unique ID used for address mapping.
|
||||
// Set by ExecutableFileMemoryManager.
|
||||
uint32_t Index{0}; // Section index in the output file.
|
||||
mutable bool IsReordered{false}; // Have the contents been reordered?
|
||||
bool IsAnonymous{false}; // True if the name should not be included
|
||||
// in the output file.
|
||||
|
||||
uint64_t hash(const BinaryData &BD,
|
||||
std::map<const BinaryData *, uint64_t> &Cache) const;
|
||||
@ -264,6 +267,7 @@ public:
|
||||
}
|
||||
bool isLocal() const { return IsLocal; }
|
||||
bool isReordered() const { return IsReordered; }
|
||||
bool isAnonymous() const { return IsAnonymous; }
|
||||
unsigned getELFType() const { return ELFType; }
|
||||
unsigned getELFFlags() const { return ELFFlags; }
|
||||
|
||||
@ -280,7 +284,8 @@ public:
|
||||
/// Does this section contain the given \p Address?
|
||||
/// Note: this is in terms of the original mapped binary addresses.
|
||||
bool containsAddress(uint64_t Address) const {
|
||||
return getAddress() <= Address && Address < getEndAddress();
|
||||
return (getAddress() <= Address && Address < getEndAddress()) ||
|
||||
(getSize() == 0 && getAddress() == Address);
|
||||
}
|
||||
|
||||
/// Does this section contain the range [\p Address, \p Address + \p Size)?
|
||||
@ -371,7 +376,7 @@ public:
|
||||
uint64_t getAllocAddress() const {
|
||||
return reinterpret_cast<uint64_t>(getOutputData());
|
||||
}
|
||||
uint64_t getFileAddress() const { return FileAddress; }
|
||||
uint64_t getOutputAddress() const { return OutputAddress; }
|
||||
uint64_t getFileOffset() const { return FileOffset; }
|
||||
unsigned getSectionID() const {
|
||||
assert(hasValidSectionID() && "trying to use uninitialized section id");
|
||||
@ -380,10 +385,13 @@ public:
|
||||
bool hasValidSectionID() const {
|
||||
return SectionID != -1u;
|
||||
}
|
||||
uint32_t getIndex() const {
|
||||
return Index;
|
||||
}
|
||||
|
||||
// mutation
|
||||
void setFileAddress(uint64_t Address) {
|
||||
FileAddress = Address;
|
||||
void setOutputAddress(uint64_t Address) {
|
||||
OutputAddress = Address;
|
||||
}
|
||||
void setFileOffset(uint64_t Offset) {
|
||||
FileOffset = Offset;
|
||||
@ -392,9 +400,15 @@ public:
|
||||
assert(!hasValidSectionID() && "trying to set section id twice");
|
||||
SectionID = ID;
|
||||
}
|
||||
void setIndex(uint32_t I) {
|
||||
Index = I;
|
||||
}
|
||||
void setOutputName(StringRef Name) {
|
||||
OutputName = Name;
|
||||
}
|
||||
void setAnonymous(bool Flag) {
|
||||
IsAnonymous = Flag;
|
||||
}
|
||||
|
||||
/// Reorder the contents of this section according to \p Order. If
|
||||
/// \p Inplace is true, the entire contents of the section is reordered,
|
||||
@ -402,6 +416,18 @@ public:
|
||||
void reorderContents(const std::vector<BinaryData *> &Order, bool Inplace);
|
||||
|
||||
void print(raw_ostream &OS) const;
|
||||
|
||||
/// Write the contents of an ELF note section given the name of the producer,
|
||||
/// a number identifying the type of note and the contents of the note in
|
||||
/// \p DescStr.
|
||||
static std::string encodeELFNote(StringRef NameStr, StringRef DescStr,
|
||||
uint32_t Type);
|
||||
|
||||
/// Code for ELF notes written by producer 'BOLT'
|
||||
enum {
|
||||
NT_BOLT_BAT = 1,
|
||||
NT_BOLT_INSTRUMENTATION_TABLES = 2
|
||||
};
|
||||
};
|
||||
|
||||
inline uint8_t *copyByteArray(const uint8_t *Data, uint64_t Size) {
|
||||
@ -425,6 +451,21 @@ inline raw_ostream &operator<<(raw_ostream &OS, const BinarySection &Section) {
|
||||
return OS;
|
||||
}
|
||||
|
||||
struct SDTMarkerInfo {
|
||||
uint64_t PC;
|
||||
uint64_t Base;
|
||||
uint64_t Semaphore;
|
||||
StringRef Provider;
|
||||
StringRef Name;
|
||||
StringRef Args;
|
||||
|
||||
/// The offset of PC within the note section
|
||||
unsigned PCOffset;
|
||||
|
||||
/// A label that marks the location of the SDT nop instruction
|
||||
MCSymbol *Label;
|
||||
};
|
||||
|
||||
} // namespace bolt
|
||||
} // namespace llvm
|
||||
|
||||
|
||||
304
src/BoltAddressTranslation.cpp
Normal file
304
src/BoltAddressTranslation.cpp
Normal file
@ -0,0 +1,304 @@
|
||||
//===--- BoltAddressTranslation.cpp ---------------------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
#include "BoltAddressTranslation.h"
|
||||
#include "BinaryFunction.h"
|
||||
#include "llvm/MC/MCAsmLayout.h"
|
||||
#include "llvm/Support/DataExtractor.h"
|
||||
|
||||
#define DEBUG_TYPE "bolt-bat"
|
||||
|
||||
namespace llvm {
|
||||
namespace bolt {
|
||||
|
||||
const char* BoltAddressTranslation::SECTION_NAME = ".note.bolt_bat";
|
||||
|
||||
/// Add the translation entries for \p BB to \p Map. One entry maps the output
/// offset of the block start (relative to \p FuncAddress) to the block's input
/// offset. Every instruction carrying a "LocSym" annotation gets an additional
/// entry tagged with BRANCHENTRY, unless it sits at the block start.
void BoltAddressTranslation::writeEntriesForBB(MapTy &Map,
                                               const BinaryBasicBlock &BB,
                                               uint64_t FuncAddress) {
  const uint64_t BBOutputOffset =
      BB.getOutputAddressRange().first - FuncAddress;
  const uint64_t BBInputOffset = BB.getInputOffset();

  assert(BBInputOffset != BinaryBasicBlock::INVALID_OFFSET &&
         "Every output BB must track back to an input BB for profile "
         "collection in bolted binaries");

  DEBUG(dbgs() << "BB " << BB.getName() <<"\n");
  DEBUG(dbgs() << " Key: " << Twine::utohexstr(BBOutputOffset)
               << " Val: " << Twine::utohexstr(BBInputOffset) << "\n");
  // Keys and values are stored as 32-bit offsets.
  Map.emplace(static_cast<uint32_t>(BBOutputOffset),
              static_cast<uint32_t>(BBInputOffset));

  // Walk the block looking for the instructions whose offsets we want to map.
  // These are the key instructions for the profile identified by
  // BC.keepOffsetForInstruction(Inst), i.e. instructions that change control
  // flow. Offsets for the last instruction of the BB are recorded too in some
  // cases; that is harmless for BAT writing apart from growing the table.
  for (const auto &Inst : BB) {
    const bool HasLocation = BC.MIB->hasAnnotation(Inst, "LocSym");
    if (!HasLocation)
      continue;

    const uint64_t InstOutputOffset =
        BC.MIB->getAnnotationAs<uint32_t>(Inst, "LocSym") - FuncAddress;

    auto InputOffsetOrErr =
        BC.MIB->tryGetAnnotationAs<uint32_t>(Inst, "Offset");
    DEBUG(if (!InputOffsetOrErr) {
      auto *Function = BB.getFunction();
      dbgs() << "Function: " << Function->getPrintName()
             << " BB: " << BB.getName() << " lacking annotation for: ";
      BC.printInstruction(dbgs(), Inst);
      dbgs() << "\n";
    });
    assert(InputOffsetOrErr && "Expected annotation with input offset");
    const auto InstInputOffset = *InputOffsetOrErr;

    // The first instruction of the BB is already covered by the block entry.
    if (InstOutputOffset != BBOutputOffset) {
      DEBUG(dbgs() << " Key: " << Twine::utohexstr(InstOutputOffset)
                   << " Val: " << Twine::utohexstr(InstInputOffset)
                   << " (branch)\n");
      Map.emplace(static_cast<uint32_t>(InstOutputOffset),
                  static_cast<uint32_t>(InstInputOffset | BRANCHENTRY));
    }
  }
}
|
||||
|
||||
/// Build the translation maps for every function in the binary context and
/// serialize them to \p OS. Layout: a 4-byte function count; per function an
/// 8-byte address, a 4-byte entry count and that many (4-byte key, 4-byte
/// value) pairs; then a 4-byte count of cold-part mappings followed by
/// (8-byte cold address, 8-byte hot address) pairs.
void BoltAddressTranslation::write(raw_ostream &OS) {
  // Dumps \p Size raw bytes starting at \p Data into the output stream.
  auto EmitRaw = [&OS](const void *Data, size_t Size) {
    OS.write(reinterpret_cast<const char *>(Data), Size);
  };

  DEBUG(dbgs() << "BOLT-DEBUG: Writing BOLT Address Translation Tables\n");
  for (auto &BFI : BC.getBinaryFunctions()) {
    auto &Function = BFI.second;

    DEBUG(dbgs() << "Function name: " << Function.getPrintName() << "\n");
    DEBUG(dbgs() << " Address reference: 0x"
                 << Twine::utohexstr(Function.getOutputAddress()) << "\n");

    const bool IsSplit = Function.isSplit();

    // Hot part: for split functions, stop at the first cold block.
    MapTy HotMap;
    for (const auto &BB : Function.layout()) {
      if (IsSplit && BB->isCold())
        break;
      writeEntriesForBB(HotMap, *BB, Function.getOutputAddress());
    }
    Maps.emplace(Function.getOutputAddress(), HotMap);

    if (IsSplit) {
      // Cold part: keyed by the address of the cold fragment and linked back
      // to the hot part through ColdPartSource.
      MapTy ColdMap;
      DEBUG(dbgs() << " Cold part\n");
      for (const auto &BB : Function.layout()) {
        if (BB->isCold())
          writeEntriesForBB(ColdMap, *BB, Function.cold().getAddress());
      }
      Maps.emplace(Function.cold().getAddress(), ColdMap);
      ColdPartSource.emplace(Function.cold().getAddress(),
                             Function.getOutputAddress());
    }
  }

  // Per-function translation tables.
  const uint32_t NumFuncs = Maps.size();
  EmitRaw(&NumFuncs, 4);
  DEBUG(dbgs() << "Writing " << NumFuncs << " functions for BAT.\n");
  for (auto &FuncEntry : Maps) {
    const uint64_t Address = FuncEntry.first;
    MapTy &Table = FuncEntry.second;
    const uint32_t NumEntries = Table.size();
    DEBUG(dbgs() << "Writing " << NumEntries << " entries for 0x"
                 << Twine::utohexstr(Address) << ".\n");
    EmitRaw(&Address, 8);
    EmitRaw(&NumEntries, 4);
    for (auto &Translation : Table) {
      EmitRaw(&Translation.first, 4);
      EmitRaw(&Translation.second, 4);
    }
  }

  // Cold-to-hot fragment links.
  const uint32_t NumColdEntries = ColdPartSource.size();
  DEBUG(dbgs() << "Writing " << NumColdEntries << " cold part mappings.\n");
  EmitRaw(&NumColdEntries, 4);
  for (auto &Link : ColdPartSource) {
    EmitRaw(&Link.first, 8);
    EmitRaw(&Link.second, 8);
    DEBUG(dbgs() << " " << Twine::utohexstr(Link.first) << " -> "
                 << Twine::utohexstr(Link.second) << "\n");
  }

  outs() << "BOLT-INFO: Wrote " << Maps.size() << " BAT maps\n";
  outs() << "BOLT-INFO: Wrote " << NumColdEntries
         << " BAT cold-to-hot entries\n";
}
|
||||
|
||||
/// Parse the contents of a BAT ELF note held in \p Buf and load the
/// translation tables into Maps and ColdPartSource.
///
/// Expected layout: the ELF note header (name size, descriptor size, type),
/// the producer name padded to 4 bytes, a 4-byte function count, the
/// per-function tables (8-byte address, 4-byte entry count, then pairs of
/// 4-byte output/input offsets), a 4-byte cold mapping count and finally pairs
/// of 8-byte cold/hot addresses.
///
/// \returns an empty error code on success, or errc::io_error if the note has
/// the wrong type or producer name, or if the buffer is too short for the data
/// it claims to hold.
std::error_code BoltAddressTranslation::parse(StringRef Buf) {
  DataExtractor DE = DataExtractor(Buf, true, 8);
  uint32_t Offset = 0;
  // The note header consists of three 4-byte words.
  if (Buf.size() < 12)
    return make_error_code(llvm::errc::io_error);

  const uint32_t NameSz = DE.getU32(&Offset);
  const uint32_t DescSz = DE.getU32(&Offset);
  const uint32_t Type = DE.getU32(&Offset);

  // The padded name and the descriptor must fit in the bytes that remain
  // after the header. alignTo() yields a 64-bit value, so the sum cannot wrap.
  if (Type != BinarySection::NT_BOLT_BAT ||
      Buf.size() - Offset < alignTo(NameSz, 4) + DescSz)
    return make_error_code(llvm::errc::io_error);

  StringRef Name = Buf.slice(Offset, Offset + NameSz);
  Offset = alignTo(Offset + NameSz, 4);
  if (Name.substr(0, 4) != "BOLT")
    return make_error_code(llvm::errc::io_error);

  if (Buf.size() - Offset < 4)
    return make_error_code(llvm::errc::io_error);

  const uint32_t NumFunctions = DE.getU32(&Offset);
  DEBUG(dbgs() << "Parsing " << NumFunctions << " functions\n");
  for (uint32_t I = 0; I < NumFunctions; ++I) {
    // Function address (8 bytes) plus entry count (4 bytes).
    if (Buf.size() - Offset < 12)
      return make_error_code(llvm::errc::io_error);

    const uint64_t Address = DE.getU64(&Offset);
    const uint32_t NumEntries = DE.getU32(&Offset);
    MapTy Map;

    DEBUG(dbgs() << "Parsing " << NumEntries << " entries for 0x"
                 << Twine::utohexstr(Address) << "\n");
    // Each entry takes 8 bytes. Compute the total in 64 bits so that a huge
    // entry count cannot wrap around and slip past the bounds check.
    if (Buf.size() - Offset < static_cast<uint64_t>(NumEntries) * 8)
      return make_error_code(llvm::errc::io_error);
    for (uint32_t J = 0; J < NumEntries; ++J) {
      const uint32_t OutputAddr = DE.getU32(&Offset);
      const uint32_t InputAddr = DE.getU32(&Offset);
      Map.insert(std::pair<uint32_t, uint32_t>(OutputAddr, InputAddr));
      DEBUG(dbgs() << Twine::utohexstr(OutputAddr) << " -> "
                   << Twine::utohexstr(InputAddr) << "\n");
    }
    Maps.insert(std::pair<uint64_t, MapTy>(Address, Map));
  }

  if (Buf.size() - Offset < 4)
    return make_error_code(llvm::errc::io_error);

  const uint32_t NumColdEntries = DE.getU32(&Offset);
  DEBUG(dbgs() << "Parsing " << NumColdEntries << " cold part mappings\n");
  for (uint32_t I = 0; I < NumColdEntries; ++I) {
    if (Buf.size() - Offset < 16)
      return make_error_code(llvm::errc::io_error);
    // Addresses are serialized as 64-bit values; keep their full width so
    // that addresses above 4 GiB are not truncated.
    const uint64_t ColdAddress = DE.getU64(&Offset);
    const uint64_t HotAddress = DE.getU64(&Offset);
    ColdPartSource.insert(
        std::pair<uint64_t, uint64_t>(ColdAddress, HotAddress));
    DEBUG(dbgs() << Twine::utohexstr(ColdAddress) << " -> "
                 << Twine::utohexstr(HotAddress) << "\n");
  }
  outs() << "BOLT-INFO: Parsed " << Maps.size() << " BAT entries\n";
  outs() << "BOLT-INFO: Parsed " << NumColdEntries
         << " BAT cold-to-hot entries\n";

  return std::error_code();
}
|
||||
|
||||
uint64_t BoltAddressTranslation::translate(const BinaryFunction &Func,
|
||||
uint64_t Offset,
|
||||
bool IsBranchSrc) const {
|
||||
auto Iter = Maps.find(Func.getAddress());
|
||||
if (Iter == Maps.end())
|
||||
return Offset;
|
||||
|
||||
const MapTy &Map = Iter->second;
|
||||
auto KeyVal = Map.upper_bound(Offset);
|
||||
if (KeyVal == Map.begin())
|
||||
return Offset;
|
||||
|
||||
--KeyVal;
|
||||
|
||||
const uint32_t Val = KeyVal->second & ~BRANCHENTRY;
|
||||
// Branch source addresses are translated to the first instruction of the
|
||||
// source BB to avoid accounting for modifications BOLT may have made in the
|
||||
// BB regarding deletion/addition of instructions.
|
||||
if (IsBranchSrc)
|
||||
return Val;
|
||||
return Offset - KeyVal->first + Val;
|
||||
}
|
||||
|
||||
/// Collect the fall-throughs between basic block entries of \p Func's
/// translation table on the path from FirstLBR.To to SecondLBR.From. Each
/// result is a pair of output offsets (block start, next block start).
/// Returns an empty list for trivial or unmatched traces, and NoneType when
/// \p Func has no translation table.
Optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
BoltAddressTranslation::getFallthroughsInTrace(
    const BinaryFunction &Func,
    const LBREntry &FirstLBR, const LBREntry &SecondLBR) const {
  SmallVector<std::pair<uint64_t, uint64_t>, 16> Fallthroughs;

  // Nothing to report if the trace does not move forward.
  if (FirstLBR.To >= SecondLBR.From)
    return Fallthroughs;

  const auto StartOffset = FirstLBR.To - Func.getAddress();
  const auto EndOffset = SecondLBR.From - Func.getAddress();

  const auto FuncTable = Maps.find(Func.getAddress());
  if (FuncTable == Maps.end())
    return NoneType();
  const MapTy &Table = FuncTable->second;

  // Step back from the first key past StartOffset until a BB entry is found;
  // instruction (branch) entries are not block boundaries.
  auto First = Table.upper_bound(StartOffset);
  for (;;) {
    if (First == Table.begin())
      return Fallthroughs;
    --First;
    if (!(First->second & BRANCHENTRY))
      break;
  }

  auto Last = Table.upper_bound(EndOffset);
  if (Last == Table.begin())
    return Fallthroughs;
  --Last;
  if (First->first >= Last->first)
    return Fallthroughs;

  auto Cursor = First;
  while (Cursor != Last) {
    const auto Src = Cursor->first;
    const bool IsBranchEntry = (Cursor->second & BRANCHENTRY) != 0;
    ++Cursor;
    if (IsBranchEntry)
      continue;

    // Advance to the next BB entry, never walking past Last.
    while (Cursor != Last && (Cursor->second & BRANCHENTRY))
      ++Cursor;
    // Reached Last and it is an instruction entry: no further BB boundary.
    if (Cursor->second & BRANCHENTRY)
      break;
    Fallthroughs.emplace_back(Src, Cursor->first);
  }

  return Fallthroughs;
}
|
||||
|
||||
uint64_t BoltAddressTranslation::fetchParentAddress(uint64_t Address) const {
|
||||
auto Iter = ColdPartSource.find(Address);
|
||||
if (Iter == ColdPartSource.end())
|
||||
return 0;
|
||||
return Iter->second;
|
||||
}
|
||||
|
||||
/// Report whether \p InputFile has a section named SECTION_NAME. Sections
/// whose name cannot be read are ignored.
bool BoltAddressTranslation::enabledFor(
    llvm::object::ELFObjectFileBase *InputFile) const {
  for (const auto &Section : InputFile->sections()) {
    StringRef Name;
    const std::error_code EC = Section.getName(Name);
    if (!EC && Name == SECTION_NAME)
      return true;
  }
  return false;
}
|
||||
}
|
||||
}
|
||||
121
src/BoltAddressTranslation.h
Normal file
121
src/BoltAddressTranslation.h
Normal file
@ -0,0 +1,121 @@
|
||||
//===--- BoltAddressTranslation.h -----------------------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_TOOLS_LLVM_BOLT_BOLTADDRESSTRANSLATION_H
|
||||
#define LLVM_TOOLS_LLVM_BOLT_BOLTADDRESSTRANSLATION_H
|
||||
|
||||
#include "BinaryContext.h"
|
||||
#include "llvm/Object/ELFObjectFile.h"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
namespace bolt {
|
||||
|
||||
/// The map of output addresses to input ones to be used when translating
|
||||
/// samples collected in a binary that was already processed by BOLT. We do not
|
||||
/// support reoptimizing a binary already processed by BOLT, but we do support
|
||||
/// collecting samples in a binary processed by BOLT. We then translate samples
|
||||
/// back to addresses from the input (original) binary, one that can be
|
||||
/// optimized. The goal is to avoid special deployments of non-bolted binaries
|
||||
/// just for the purposes of data collection.
|
||||
///
|
||||
/// The in-memory representation of the map is as follows. Each function has its
|
||||
/// own map. A function is identified by its output address. This is the key to
|
||||
/// retrieve a translation map. The translation map is a collection of ordered
|
||||
/// keys identifying the start of a region (relative to the function start) in
|
||||
/// the output address space (addresses in the binary processed by BOLT).
|
||||
///
|
||||
/// A translation then happens when perf2bolt needs to convert sample addresses
|
||||
/// in the output address space back to input addresses, valid to run BOLT in
|
||||
/// the original input binary. To convert, perf2bolt first needs to fetch the
|
||||
/// translation map for a sample recorded in a given function. It then finds
|
||||
/// the largest key that is still smaller than or equal to the recorded address.
|
||||
/// It then converts this address to use the value of this key.
|
||||
///
|
||||
/// Example translation Map for function foo
|
||||
/// KEY VALUE BB?
|
||||
/// Output offset1 (first BB) Original input offset1 Y
|
||||
/// ...
|
||||
/// Output offsetN (last branch) Original input offsetN N
|
||||
///
|
||||
/// The information on whether a given entry is a BB start or an instruction
|
||||
/// that changes control flow is encoded in the last (highest) bit of VALUE.
|
||||
///
|
||||
/// Notes:
|
||||
/// Instructions that will never appear in LBR because they do not cause control
|
||||
/// flow change are omitted from this map. Basic block locations are recorded
|
||||
/// because they can be a target of a jump (To address in the LBR) and also to
|
||||
/// recreate the BB layout of this function. We use the BB layout map to
|
||||
/// recreate fall-through jumps in the profile, given an LBR trace.
|
||||
class BoltAddressTranslation {
|
||||
public:
|
||||
// In-memory representation of the address translation table
|
||||
using MapTy = std::map<uint32_t, uint32_t>;
|
||||
|
||||
/// Name of the ELF section where the table will be serialized to in the
|
||||
/// output binary
|
||||
static const char *SECTION_NAME;
|
||||
|
||||
BoltAddressTranslation(BinaryContext &BC) : BC(BC) {}
|
||||
|
||||
/// Write the serialized address translation tables for each reordered
|
||||
/// function
|
||||
void write(raw_ostream &OS);
|
||||
|
||||
/// Read the serialized address translation tables and load them internally
|
||||
/// in memory. Return a parse error if failed.
|
||||
std::error_code parse(StringRef Buf);
|
||||
|
||||
/// If the maps are loaded in memory, perform the lookup to translate LBR
|
||||
/// addresses in \p Func.
|
||||
uint64_t translate(const BinaryFunction &Func, uint64_t Offset,
|
||||
bool IsBranchSrc) const;
|
||||
|
||||
/// Use the map keys containing basic block addresses to infer fall-throughs
|
||||
/// taken in the path started at FirstLBR.To and ending at SecondLBR.From.
|
||||
/// Return NoneType if trace is invalid or the list of fall-throughs
|
||||
/// otherwise.
|
||||
Optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
|
||||
getFallthroughsInTrace(const BinaryFunction &Func, const LBREntry &FirstLBR,
|
||||
const LBREntry &SecondLBR) const;
|
||||
|
||||
/// If available, fetch the address of the hot part linked to the cold part
|
||||
/// at \p Address. Return 0 otherwise.
|
||||
uint64_t fetchParentAddress(uint64_t Address) const;
|
||||
|
||||
/// True if the input binary has a translation table we can use to convert
|
||||
/// addresses when aggregating profile
|
||||
bool enabledFor(llvm::object::ELFObjectFileBase *InputFile) const;
|
||||
|
||||
private:
|
||||
/// Helper to update \p Map by inserting one or more BAT entries reflecting
|
||||
/// \p BB for function located at \p FuncAddress. At least one entry will be
|
||||
/// emitted for the start of the BB. More entries may be emitted to cover
|
||||
/// the location of calls or any instruction that may change control flow.
|
||||
void writeEntriesForBB(MapTy &Map, const BinaryBasicBlock &BB,
|
||||
uint64_t FuncAddress);
|
||||
|
||||
BinaryContext &BC;
|
||||
|
||||
std::map<uint64_t, MapTy> Maps;
|
||||
|
||||
/// Links outlined cold blocks to their original function
|
||||
std::map<uint64_t, uint64_t> ColdPartSource;
|
||||
|
||||
/// Identifies the address of a control-flow changing instruction in a
|
||||
/// translation map entry
|
||||
const static uint32_t BRANCHENTRY = 0x80000000;
|
||||
};
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
@ -204,7 +204,7 @@ class RewriteInstanceDiff {
|
||||
/// later when matching functions in binary 2 to corresponding functions
|
||||
/// in binary 1
|
||||
void buildLookupMaps() {
|
||||
for (const auto &BFI : RI1.BinaryFunctions) {
|
||||
for (const auto &BFI : RI1.BC->getBinaryFunctions()) {
|
||||
StringRef LTOName;
|
||||
const auto &Function = BFI.second;
|
||||
const auto Score = getNormalizedScore(Function, RI1);
|
||||
@ -224,7 +224,7 @@ class RewriteInstanceDiff {
|
||||
}
|
||||
|
||||
// Compute LTONameLookup2 and LargestBin2
|
||||
for (const auto &BFI : RI2.BinaryFunctions) {
|
||||
for (const auto &BFI : RI2.BC->getBinaryFunctions()) {
|
||||
StringRef LTOName;
|
||||
const auto &Function = BFI.second;
|
||||
const auto Score = getNormalizedScore(Function, RI2);
|
||||
@ -245,7 +245,7 @@ class RewriteInstanceDiff {
|
||||
void matchFunctions() {
|
||||
outs() << "BOLT-DIFF: Mapping functions in Binary2 to Binary1\n";
|
||||
|
||||
for (const auto &BFI2 : RI2.BinaryFunctions) {
|
||||
for (const auto &BFI2 : RI2.BC->getBinaryFunctions()) {
|
||||
const auto &Function2 = BFI2.second;
|
||||
StringRef LTOName;
|
||||
bool Match = false;
|
||||
@ -451,7 +451,7 @@ class RewriteInstanceDiff {
|
||||
/// having a large difference in performance because hotness shifted from
|
||||
/// LTO variant 1 to variant 2, even though they represent the same function.
|
||||
void computeAggregatedLTOScore() {
|
||||
for (const auto &BFI : RI1.BinaryFunctions) {
|
||||
for (const auto &BFI : RI1.BC->getBinaryFunctions()) {
|
||||
const auto &Function = BFI.second;
|
||||
double Score = getNormalizedScore(Function, RI1);
|
||||
auto Iter = LTOMap1.find(&Function);
|
||||
@ -461,7 +461,7 @@ class RewriteInstanceDiff {
|
||||
}
|
||||
|
||||
double UnmappedScore{0};
|
||||
for (const auto &BFI : RI2.BinaryFunctions) {
|
||||
for (const auto &BFI : RI2.BC->getBinaryFunctions()) {
|
||||
const auto &Function = BFI.second;
|
||||
bool Matched = FuncMap.find(&Function) != FuncMap.end();
|
||||
double Score = getNormalizedScore(Function, RI2);
|
||||
@ -475,7 +475,8 @@ class RewriteInstanceDiff {
|
||||
if (FuncMap.find(Iter->second) == FuncMap.end())
|
||||
UnmappedScore += Score;
|
||||
}
|
||||
int64_t Unmapped = RI2.BinaryFunctions.size() - Bin2MappedFuncs.size();
|
||||
int64_t Unmapped =
|
||||
RI2.BC->getBinaryFunctions().size() - Bin2MappedFuncs.size();
|
||||
outs() << "BOLT-DIFF: " << Unmapped
|
||||
<< " functions in Binary2 have no correspondence to any other "
|
||||
"function in Binary1.\n";
|
||||
@ -595,7 +596,7 @@ class RewriteInstanceDiff {
|
||||
void reportUnmapped() {
|
||||
outs() << "List of functions from binary 2 that were not matched with any "
|
||||
<< "function in binary 1:\n";
|
||||
for (const auto &BFI2 : RI2.BinaryFunctions) {
|
||||
for (const auto &BFI2 : RI2.BC->getBinaryFunctions()) {
|
||||
const auto &Function2 = BFI2.second;
|
||||
if (Bin2MappedFuncs.count(&Function2))
|
||||
continue;
|
||||
@ -654,9 +655,9 @@ void RewriteInstance::compare(RewriteInstance &RI2) {
|
||||
if (opts::ICF) {
|
||||
IdenticalCodeFolding ICF(opts::NeverPrint);
|
||||
outs() << "BOLT-DIFF: Starting ICF pass for binary 1";
|
||||
ICF.runOnFunctions(*BC, BinaryFunctions, LargeFunctions);
|
||||
ICF.runOnFunctions(*BC);
|
||||
outs() << "BOLT-DIFF: Starting ICF pass for binary 2";
|
||||
ICF.runOnFunctions(*RI2.BC, RI2.BinaryFunctions, RI2.LargeFunctions);
|
||||
ICF.runOnFunctions(*RI2.BC);
|
||||
}
|
||||
|
||||
RewriteInstanceDiff RID(*this, RI2);
|
||||
|
||||
@ -48,8 +48,6 @@ add_public_gen_version_target(GenBoltRevision)
|
||||
set(LLVM_LINK_COMPONENTS
|
||||
${LLVM_TARGETS_TO_BUILD}
|
||||
BOLTPasses
|
||||
BOLTTargetAArch64
|
||||
BOLTTargetX86
|
||||
CodeGen
|
||||
Core
|
||||
DebugInfoDWARF
|
||||
@ -61,6 +59,18 @@ set(LLVM_LINK_COMPONENTS
|
||||
Support
|
||||
)
|
||||
|
||||
string(FIND "${LLVM_TARGETS_TO_BUILD}" "AArch64" POSITION)
|
||||
if (NOT ${POSITION} EQUAL -1)
|
||||
list(APPEND LLVM_LINK_COMPONENTS BOLTTargetAArch64)
|
||||
set(BOLT_AArch64 On)
|
||||
endif()
|
||||
|
||||
string(FIND "${LLVM_TARGETS_TO_BUILD}" "X86" POSITION)
|
||||
if (NOT ${POSITION} EQUAL -1)
|
||||
list(APPEND LLVM_LINK_COMPONENTS BOLTTargetX86)
|
||||
set(BOLT_X64 On)
|
||||
endif()
|
||||
|
||||
add_llvm_tool(llvm-bolt
|
||||
llvm-bolt.cpp
|
||||
BinaryBasicBlock.cpp
|
||||
@ -70,16 +80,20 @@ add_llvm_tool(llvm-bolt
|
||||
BinaryFunctionProfile.cpp
|
||||
BinaryPassManager.cpp
|
||||
BinarySection.cpp
|
||||
BoltAddressTranslation.cpp
|
||||
BoltDiff.cpp
|
||||
CacheMetrics.cpp
|
||||
DataAggregator.cpp
|
||||
DataReader.cpp
|
||||
DebugData.cpp
|
||||
DWARFRewriter.cpp
|
||||
DynoStats.cpp
|
||||
Exceptions.cpp
|
||||
ExecutableFileMemoryManager.cpp
|
||||
Heatmap.cpp
|
||||
JumpTable.cpp
|
||||
MCPlusBuilder.cpp
|
||||
ParallelUtilities.cpp
|
||||
ProfileReader.cpp
|
||||
ProfileWriter.cpp
|
||||
Relocation.cpp
|
||||
@ -87,8 +101,17 @@ add_llvm_tool(llvm-bolt
|
||||
|
||||
DEPENDS
|
||||
intrinsics_gen
|
||||
bolt_rt
|
||||
)
|
||||
|
||||
if (DEFINED BOLT_AArch64)
|
||||
target_compile_definitions(llvm-bolt PRIVATE AARCH64_AVAILABLE)
|
||||
endif()
|
||||
|
||||
if (DEFINED BOLT_X64)
|
||||
target_compile_definitions(llvm-bolt PRIVATE X86_AVAILABLE)
|
||||
endif()
|
||||
|
||||
add_llvm_tool_symlink(perf2bolt llvm-bolt)
|
||||
add_llvm_tool_symlink(llvm-boltdiff llvm-bolt)
|
||||
add_llvm_tool_symlink(llvm-bolt-heatmap llvm-bolt)
|
||||
|
||||
@ -9,11 +9,10 @@
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
|
||||
#include "BinaryBasicBlock.h"
|
||||
#include "DWARFRewriter.h"
|
||||
#include "BinaryContext.h"
|
||||
#include "BinaryFunction.h"
|
||||
#include "RewriteInstance.h"
|
||||
#include "ParallelUtilities.h"
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
#include "llvm/BinaryFormat/Dwarf.h"
|
||||
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
|
||||
@ -57,62 +56,126 @@ KeepARanges("keep-aranges",
|
||||
cl::Hidden,
|
||||
cl::cat(BoltCategory));
|
||||
|
||||
/// When set (the default), DWARFRewriter::updateDebugInfo() walks the compile
/// units one after another instead of handing them to the thread pool.
static cl::opt<bool>
DeterministicDebugInfo("deterministic-debuginfo",
  // The two literals are concatenated by the compiler, so the first one must
  // end with a space to keep "produce" and "nondeterministic" separate.
  cl::desc("disables parallel execution of tasks that may produce "
           "nondeterministic debug info"),
  cl::init(true),
  cl::cat(BoltCategory));
|
||||
|
||||
} // namespace opts
|
||||
|
||||
void RewriteInstance::updateDebugInfo() {
|
||||
void DWARFRewriter::updateDebugInfo() {
|
||||
SectionPatchers[".debug_abbrev"] = llvm::make_unique<DebugAbbrevPatcher>();
|
||||
SectionPatchers[".debug_info"] = llvm::make_unique<SimpleBinaryPatcher>();
|
||||
SectionPatchers[".debug_info"] = llvm::make_unique<SimpleBinaryPatcher>();
|
||||
|
||||
RangesSectionsWriter = llvm::make_unique<DebugRangesSectionsWriter>(BC.get());
|
||||
LocationListWriter = llvm::make_unique<DebugLocWriter>(BC.get());
|
||||
DebugInfoPatcher =
|
||||
static_cast<SimpleBinaryPatcher *>(SectionPatchers[".debug_info"].get());
|
||||
AbbrevPatcher =
|
||||
static_cast<DebugAbbrevPatcher *>(SectionPatchers[".debug_abbrev"].get());
|
||||
assert(DebugInfoPatcher && AbbrevPatcher && "Patchers not initialized.");
|
||||
|
||||
for (auto &CU : BC->DwCtx->compile_units()) {
|
||||
updateUnitDebugInfo(CU->getUnitDIE(false),
|
||||
std::vector<const BinaryFunction *>{});
|
||||
RangesSectionsWriter = llvm::make_unique<DebugRangesSectionsWriter>(&BC);
|
||||
LocationListWriter = llvm::make_unique<DebugLocWriter>(&BC);
|
||||
|
||||
auto processUnitDIE = [&](const DWARFDie DIE) {
|
||||
const BinaryFunction *CachedFunction = nullptr;
|
||||
std::map<DebugAddressRangesVector, uint64_t> CachedRanges{};
|
||||
updateUnitDebugInfo(DIE, std::vector<const BinaryFunction *>{},
|
||||
CachedFunction, CachedRanges);
|
||||
};
|
||||
|
||||
if (opts::NoThreads || opts::DeterministicDebugInfo) {
|
||||
for (auto &CU : BC.DwCtx->compile_units())
|
||||
processUnitDIE(CU->getUnitDIE(false));
|
||||
} else {
|
||||
// Update unit debug info in parallel
|
||||
auto &ThreadPool = ParallelUtilities::getThreadPool();
|
||||
for (auto &CU : BC.DwCtx->compile_units())
|
||||
ThreadPool.async(processUnitDIE, CU->getUnitDIE(false));
|
||||
|
||||
ThreadPool.wait();
|
||||
}
|
||||
|
||||
flushPendingRanges();
|
||||
|
||||
finalizeDebugSections();
|
||||
|
||||
updateGdbIndexSection();
|
||||
}
|
||||
|
||||
void RewriteInstance::updateUnitDebugInfo(
|
||||
const DWARFDie DIE,
|
||||
std::vector<const BinaryFunction *> FunctionStack) {
|
||||
|
||||
void DWARFRewriter::updateUnitDebugInfo(
|
||||
const DWARFDie DIE, std::vector<const BinaryFunction *> FunctionStack,
|
||||
const BinaryFunction *&CachedFunction,
|
||||
std::map<DebugAddressRangesVector, uint64_t> &CachedRanges) {
|
||||
bool IsFunctionDef = false;
|
||||
switch (DIE.getTag()) {
|
||||
case dwarf::DW_TAG_compile_unit:
|
||||
{
|
||||
const auto ModuleRanges = DIE.getAddressRanges();
|
||||
auto OutputRanges = translateModuleAddressRanges(ModuleRanges);
|
||||
auto OutputRanges = BC.translateModuleAddressRanges(ModuleRanges);
|
||||
const auto RangesSectionOffset =
|
||||
RangesSectionsWriter->addCURanges(DIE.getDwarfUnit()->getOffset(),
|
||||
std::move(OutputRanges));
|
||||
RangesSectionsWriter->addCURanges(DIE.getDwarfUnit()->getOffset(),
|
||||
std::move(OutputRanges));
|
||||
updateDWARFObjectAddressRanges(DIE, RangesSectionOffset);
|
||||
}
|
||||
break;
|
||||
|
||||
case dwarf::DW_TAG_subprogram:
|
||||
{
|
||||
// The function cannot have multiple ranges on the input.
|
||||
uint64_t SectionIndex, LowPC, HighPC;
|
||||
if (DIE.getLowAndHighPC(LowPC, HighPC, SectionIndex)) {
|
||||
IsFunctionDef = true;
|
||||
const auto *Function = getBinaryFunctionAtAddress(LowPC);
|
||||
if (Function && Function->isFolded()) {
|
||||
Function = nullptr;
|
||||
// Get function address either from ranges or [LowPC, HighPC) pair.
|
||||
bool UsesRanges = false;
|
||||
uint64_t Address;
|
||||
uint64_t SectionIndex, HighPC;
|
||||
if (!DIE.getLowAndHighPC(Address, HighPC, SectionIndex)) {
|
||||
auto Ranges = DIE.getAddressRanges();
|
||||
// Not a function definition.
|
||||
if (Ranges.empty())
|
||||
break;
|
||||
|
||||
Address = Ranges.front().LowPC;
|
||||
UsesRanges = true;
|
||||
}
|
||||
|
||||
IsFunctionDef = true;
|
||||
const auto *Function = BC.getBinaryFunctionAtAddress(Address);
|
||||
if (Function && Function->isFolded())
|
||||
Function = nullptr;
|
||||
FunctionStack.push_back(Function);
|
||||
|
||||
DebugAddressRangesVector FunctionRanges;
|
||||
if (Function)
|
||||
FunctionRanges = Function->getOutputAddressRanges();
|
||||
|
||||
// Update ranges.
|
||||
if (UsesRanges) {
|
||||
updateDWARFObjectAddressRanges(DIE,
|
||||
RangesSectionsWriter->addRanges(FunctionRanges));
|
||||
} else {
|
||||
// Delay conversion of [LowPC, HighPC) into DW_AT_ranges if possible.
|
||||
const auto *Abbrev = DIE.getAbbreviationDeclarationPtr();
|
||||
assert(Abbrev && "abbrev expected");
|
||||
|
||||
// Create a critical section.
|
||||
static std::shared_timed_mutex CriticalSectionMutex;
|
||||
std::unique_lock<std::shared_timed_mutex> Lock(CriticalSectionMutex);
|
||||
|
||||
if (FunctionRanges.size() > 1) {
|
||||
convertPending(Abbrev);
|
||||
// Exit critical section early.
|
||||
Lock.unlock();
|
||||
convertToRanges(DIE, FunctionRanges);
|
||||
} else if (ConvertedRangesAbbrevs.find(Abbrev) !=
|
||||
ConvertedRangesAbbrevs.end()) {
|
||||
// Exit critical section early.
|
||||
Lock.unlock();
|
||||
convertToRanges(DIE, FunctionRanges);
|
||||
} else {
|
||||
if (FunctionRanges.empty())
|
||||
FunctionRanges.emplace_back(DebugAddressRange());
|
||||
PendingRanges[Abbrev].emplace_back(
|
||||
std::make_pair(DIE, FunctionRanges.front()));
|
||||
}
|
||||
FunctionStack.push_back(Function);
|
||||
auto RangesSectionOffset =
|
||||
RangesSectionsWriter->getEmptyRangesOffset();
|
||||
if (Function) {
|
||||
auto FunctionRanges = Function->getOutputAddressRanges();
|
||||
RangesSectionOffset =
|
||||
RangesSectionsWriter->addRanges(Function,
|
||||
std::move(FunctionRanges));
|
||||
}
|
||||
updateDWARFObjectAddressRanges(DIE, RangesSectionOffset);
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -136,8 +199,8 @@ void RewriteInstance::updateUnitDebugInfo(
|
||||
<< Twine::utohexstr(DIE.getDwarfUnit()->getOffset()) << '\n';
|
||||
}
|
||||
);
|
||||
RangesSectionOffset =
|
||||
RangesSectionsWriter->addRanges(Function, std::move(OutputRanges));
|
||||
RangesSectionOffset = RangesSectionsWriter->addRanges(
|
||||
Function, std::move(OutputRanges), CachedFunction, CachedRanges);
|
||||
}
|
||||
updateDWARFObjectAddressRanges(DIE, RangesSectionOffset);
|
||||
}
|
||||
@ -186,9 +249,7 @@ void RewriteInstance::updateUnitDebugInfo(
|
||||
}
|
||||
}
|
||||
|
||||
auto DebugInfoPatcher =
|
||||
static_cast<SimpleBinaryPatcher *>(
|
||||
SectionPatchers[".debug_info"].get());
|
||||
std::lock_guard<std::mutex> Lock(DebugInfoPatcherMutex);
|
||||
DebugInfoPatcher->addLE32Patch(AttrOffset, LocListSectionOffset);
|
||||
} else {
|
||||
assert((Value.isFormClass(DWARFFormValue::FC_Exprloc) ||
|
||||
@ -208,9 +269,8 @@ void RewriteInstance::updateUnitDebugInfo(
|
||||
<< " for DIE with tag " << DIE.getTag()
|
||||
<< " to 0x" << Twine::utohexstr(NewAddress) << '\n');
|
||||
}
|
||||
auto DebugInfoPatcher =
|
||||
static_cast<SimpleBinaryPatcher *>(
|
||||
SectionPatchers[".debug_info"].get());
|
||||
|
||||
std::lock_guard<std::mutex> Lock(DebugInfoPatcherMutex);
|
||||
DebugInfoPatcher->addLE64Patch(AttrOffset, NewAddress);
|
||||
} else if (opts::Verbosity >= 1) {
|
||||
errs() << "BOLT-WARNING: unexpected form value for attribute at 0x"
|
||||
@ -222,14 +282,14 @@ void RewriteInstance::updateUnitDebugInfo(
|
||||
|
||||
// Recursively update each child.
|
||||
for (auto Child = DIE.getFirstChild(); Child; Child = Child.getSibling()) {
|
||||
updateUnitDebugInfo(Child, FunctionStack);
|
||||
updateUnitDebugInfo(Child, FunctionStack, CachedFunction, CachedRanges);
|
||||
}
|
||||
|
||||
if (IsFunctionDef)
|
||||
FunctionStack.pop_back();
|
||||
}
|
||||
|
||||
void RewriteInstance::updateDWARFObjectAddressRanges(
|
||||
void DWARFRewriter::updateDWARFObjectAddressRanges(
|
||||
const DWARFDie DIE, uint64_t DebugRangesOffset) {
|
||||
|
||||
// Some objects don't have an associated DIE and cannot be updated (such as
|
||||
@ -239,17 +299,10 @@ void RewriteInstance::updateDWARFObjectAddressRanges(
|
||||
}
|
||||
|
||||
if (opts::Verbosity >= 2 && DebugRangesOffset == -1U) {
|
||||
errs() << "BOLT-WARNING: using invalid DW_AT_range for DIE at offset 0x"
|
||||
errs() << "BOLT-WARNING: using invalid DW_AT_ranges for DIE at offset 0x"
|
||||
<< Twine::utohexstr(DIE.getOffset()) << '\n';
|
||||
}
|
||||
|
||||
auto DebugInfoPatcher =
|
||||
static_cast<SimpleBinaryPatcher *>(SectionPatchers[".debug_info"].get());
|
||||
auto AbbrevPatcher =
|
||||
static_cast<DebugAbbrevPatcher*>(SectionPatchers[".debug_abbrev"].get());
|
||||
|
||||
assert(DebugInfoPatcher && AbbrevPatcher && "Patchers not initialized.");
|
||||
|
||||
const auto *AbbreviationDecl = DIE.getAbbreviationDeclarationPtr();
|
||||
if (!AbbreviationDecl) {
|
||||
if (opts::Verbosity >= 1) {
|
||||
@ -260,14 +313,14 @@ void RewriteInstance::updateDWARFObjectAddressRanges(
|
||||
return;
|
||||
}
|
||||
|
||||
auto AbbrevCode = AbbreviationDecl->getCode();
|
||||
|
||||
if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_ranges)) {
|
||||
// Case 1: The object was already non-contiguous and had DW_AT_ranges.
|
||||
// In this case we simply need to update the value of DW_AT_ranges.
|
||||
uint32_t AttrOffset = -1U;
|
||||
DIE.find(dwarf::DW_AT_ranges, &AttrOffset);
|
||||
assert(AttrOffset != -1U && "failed to locate DWARF attribute");
|
||||
|
||||
std::lock_guard<std::mutex> Lock(DebugInfoPatcherMutex);
|
||||
DebugInfoPatcher->addLE32Patch(AttrOffset, DebugRangesOffset);
|
||||
} else {
|
||||
// Case 2: The object has both DW_AT_low_pc and DW_AT_high_pc emitted back
|
||||
@ -284,50 +337,8 @@ void RewriteInstance::updateDWARFObjectAddressRanges(
|
||||
// large size.
|
||||
if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_low_pc) &&
|
||||
AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_high_pc)) {
|
||||
uint32_t LowPCOffset = -1U;
|
||||
uint32_t HighPCOffset = -1U;
|
||||
DWARFFormValue LowPCFormValue =
|
||||
*DIE.find(dwarf::DW_AT_low_pc, &LowPCOffset);
|
||||
DWARFFormValue HighPCFormValue =
|
||||
*DIE.find(dwarf::DW_AT_high_pc, &HighPCOffset);
|
||||
|
||||
if (LowPCFormValue.getForm() != dwarf::DW_FORM_addr ||
|
||||
(HighPCFormValue.getForm() != dwarf::DW_FORM_addr &&
|
||||
HighPCFormValue.getForm() != dwarf::DW_FORM_data8 &&
|
||||
HighPCFormValue.getForm() != dwarf::DW_FORM_data4)) {
|
||||
errs() << "BOLT-WARNING: unexpected form value. Cannot update DIE "
|
||||
<< "at offset 0x" << Twine::utohexstr(DIE.getOffset())
|
||||
<< "\n";
|
||||
return;
|
||||
}
|
||||
if (LowPCOffset == -1U || (LowPCOffset + 8 != HighPCOffset)) {
|
||||
errs() << "BOLT-WARNING: high_pc expected immediately after low_pc. "
|
||||
<< "Cannot update DIE at offset 0x"
|
||||
<< Twine::utohexstr(DIE.getOffset()) << '\n';
|
||||
return;
|
||||
}
|
||||
|
||||
AbbrevPatcher->addAttributePatch(DIE.getDwarfUnit(),
|
||||
AbbrevCode,
|
||||
dwarf::DW_AT_low_pc,
|
||||
dwarf::DW_AT_ranges,
|
||||
dwarf::DW_FORM_sec_offset);
|
||||
AbbrevPatcher->addAttributePatch(DIE.getDwarfUnit(),
|
||||
AbbrevCode,
|
||||
dwarf::DW_AT_high_pc,
|
||||
dwarf::DW_AT_low_pc,
|
||||
dwarf::DW_FORM_udata);
|
||||
unsigned LowPCSize = 0;
|
||||
if (HighPCFormValue.getForm() == dwarf::DW_FORM_addr ||
|
||||
HighPCFormValue.getForm() == dwarf::DW_FORM_data8) {
|
||||
LowPCSize = 12;
|
||||
} else if (HighPCFormValue.getForm() == dwarf::DW_FORM_data4) {
|
||||
LowPCSize = 8;
|
||||
} else {
|
||||
llvm_unreachable("unexpected form");
|
||||
}
|
||||
DebugInfoPatcher->addLE32Patch(LowPCOffset, DebugRangesOffset);
|
||||
DebugInfoPatcher->addUDataPatch(LowPCOffset + 4, 0, LowPCSize);
|
||||
convertToRanges(AbbreviationDecl);
|
||||
convertToRanges(DIE, DebugRangesOffset);
|
||||
} else {
|
||||
if (opts::Verbosity >= 1) {
|
||||
errs() << "BOLT-WARNING: Cannot update ranges for DIE at offset 0x"
|
||||
@ -337,8 +348,8 @@ void RewriteInstance::updateDWARFObjectAddressRanges(
|
||||
}
|
||||
}
|
||||
|
||||
void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() {
|
||||
for (auto &It : BinaryFunctions) {
|
||||
void DWARFRewriter::updateDebugLineInfoForNonSimpleFunctions() {
|
||||
for (auto &It : BC.getBinaryFunctions()) {
|
||||
const auto &Function = It.second;
|
||||
|
||||
if (Function.isSimple())
|
||||
@ -353,7 +364,7 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() {
|
||||
|
||||
std::vector<uint32_t> Results;
|
||||
MCSectionELF *FunctionSection =
|
||||
BC->Ctx->getELFSection(Function.getCodeSectionName(),
|
||||
BC.Ctx->getELFSection(Function.getCodeSectionName(),
|
||||
ELF::SHT_PROGBITS,
|
||||
ELF::SHF_EXECINSTR | ELF::SHF_ALLOC);
|
||||
|
||||
@ -361,10 +372,10 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() {
|
||||
if (LineTable->lookupAddressRange(Address, Function.getMaxSize(),
|
||||
Results)) {
|
||||
auto &OutputLineTable =
|
||||
BC->Ctx->getMCDwarfLineTable(Unit->getOffset()).getMCLineSections();
|
||||
BC.Ctx->getMCDwarfLineTable(Unit->getOffset()).getMCLineSections();
|
||||
for (auto RowIndex : Results) {
|
||||
const auto &Row = LineTable->Rows[RowIndex];
|
||||
BC->Ctx->setCurrentDwarfLoc(
|
||||
BC.Ctx->setCurrentDwarfLoc(
|
||||
Row.File,
|
||||
Row.Line,
|
||||
Row.Column,
|
||||
@ -375,17 +386,17 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() {
|
||||
Row.Isa,
|
||||
Row.Discriminator,
|
||||
Row.Address);
|
||||
auto Loc = BC->Ctx->getCurrentDwarfLoc();
|
||||
BC->Ctx->clearDwarfLocSeen();
|
||||
auto Loc = BC.Ctx->getCurrentDwarfLoc();
|
||||
BC.Ctx->clearDwarfLocSeen();
|
||||
OutputLineTable.addLineEntry(MCDwarfLineEntry{nullptr, Loc},
|
||||
FunctionSection);
|
||||
}
|
||||
// Add an empty entry past the end of the function
|
||||
// for end_sequence mark.
|
||||
BC->Ctx->setCurrentDwarfLoc(0, 0, 0, 0, 0, 0,
|
||||
BC.Ctx->setCurrentDwarfLoc(0, 0, 0, 0, 0, 0,
|
||||
Address + Function.getMaxSize());
|
||||
auto Loc = BC->Ctx->getCurrentDwarfLoc();
|
||||
BC->Ctx->clearDwarfLocSeen();
|
||||
auto Loc = BC.Ctx->getCurrentDwarfLoc();
|
||||
BC.Ctx->clearDwarfLocSeen();
|
||||
OutputLineTable.addLineEntry(MCDwarfLineEntry{nullptr, Loc},
|
||||
FunctionSection);
|
||||
} else {
|
||||
@ -395,9 +406,9 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() {
|
||||
}
|
||||
}
|
||||
|
||||
void RewriteInstance::updateLineTableOffsets() {
|
||||
void DWARFRewriter::updateLineTableOffsets() {
|
||||
const auto *LineSection =
|
||||
BC->Ctx->getObjectFileInfo()->getDwarfLineSection();
|
||||
BC.Ctx->getObjectFileInfo()->getDwarfLineSection();
|
||||
auto CurrentFragment = LineSection->begin();
|
||||
uint32_t CurrentOffset = 0;
|
||||
uint32_t Offset = 0;
|
||||
@ -406,7 +417,7 @@ void RewriteInstance::updateLineTableOffsets() {
|
||||
// output file, thus we can compute all table's offset by passing through
|
||||
// each fragment at most once, continuing from the last CU's beginning
|
||||
// instead of from the first fragment.
|
||||
for (const auto &CUIDLineTablePair : BC->Ctx->getMCDwarfLineTables()) {
|
||||
for (const auto &CUIDLineTablePair : BC.Ctx->getMCDwarfLineTables()) {
|
||||
auto Label = CUIDLineTablePair.second.getLabel();
|
||||
if (!Label)
|
||||
continue;
|
||||
@ -415,10 +426,10 @@ void RewriteInstance::updateLineTableOffsets() {
|
||||
if (CUOffset == -1U)
|
||||
continue;
|
||||
|
||||
auto *CU = BC->DwCtx->getCompileUnitForOffset(CUOffset);
|
||||
auto *CU = BC.DwCtx->getCompileUnitForOffset(CUOffset);
|
||||
assert(CU && "no CU found at offset");
|
||||
auto LTOffset =
|
||||
BC->DwCtx->getAttrFieldOffsetForUnit(CU, dwarf::DW_AT_stmt_list);
|
||||
BC.DwCtx->getAttrFieldOffsetForUnit(CU, dwarf::DW_AT_stmt_list);
|
||||
if (!LTOffset)
|
||||
continue;
|
||||
|
||||
@ -444,9 +455,9 @@ void RewriteInstance::updateLineTableOffsets() {
|
||||
Offset += Label->getOffset() - CurrentOffset;
|
||||
CurrentOffset = Label->getOffset();
|
||||
|
||||
auto DbgInfoSection = BC->getUniqueSectionByName(".debug_info");
|
||||
auto DbgInfoSection = BC.getUniqueSectionByName(".debug_info");
|
||||
assert(DbgInfoSection && ".debug_info section must exist");
|
||||
auto *Zero = BC->registerNameAtAddress("Zero", 0, 0, 0);
|
||||
auto *Zero = BC.registerNameAtAddress("Zero", 0, 0, 0);
|
||||
DbgInfoSection->addRelocation(LTOffset,
|
||||
Zero,
|
||||
ELF::R_X86_64_32,
|
||||
@ -463,43 +474,43 @@ void RewriteInstance::updateLineTableOffsets() {
|
||||
}
|
||||
}
|
||||
|
||||
void RewriteInstance::finalizeDebugSections() {
|
||||
void DWARFRewriter::finalizeDebugSections() {
|
||||
// Skip .debug_aranges if we are re-generating .gdb_index.
|
||||
if (opts::KeepARanges || !GdbIndexSection) {
|
||||
if (opts::KeepARanges || !BC.getGdbIndexSection()) {
|
||||
SmallVector<char, 16> ARangesBuffer;
|
||||
raw_svector_ostream OS(ARangesBuffer);
|
||||
|
||||
auto MAB = std::unique_ptr<MCAsmBackend>(BC->TheTarget->createMCAsmBackend(
|
||||
*BC->STI, *BC->MRI, MCTargetOptions()));
|
||||
auto MAB = std::unique_ptr<MCAsmBackend>(BC.TheTarget->createMCAsmBackend(
|
||||
*BC.STI, *BC.MRI, MCTargetOptions()));
|
||||
auto Writer = std::unique_ptr<MCObjectWriter>(MAB->createObjectWriter(OS));
|
||||
|
||||
RangesSectionsWriter->writeArangesSection(Writer.get());
|
||||
const auto &ARangesContents = OS.str();
|
||||
|
||||
BC->registerOrUpdateNoteSection(".debug_aranges",
|
||||
BC.registerOrUpdateNoteSection(".debug_aranges",
|
||||
copyByteArray(ARangesContents),
|
||||
ARangesContents.size());
|
||||
}
|
||||
|
||||
auto RangesSectionContents = RangesSectionsWriter->finalize();
|
||||
BC->registerOrUpdateNoteSection(".debug_ranges",
|
||||
BC.registerOrUpdateNoteSection(".debug_ranges",
|
||||
copyByteArray(*RangesSectionContents),
|
||||
RangesSectionContents->size());
|
||||
|
||||
auto LocationListSectionContents = LocationListWriter->finalize();
|
||||
BC->registerOrUpdateNoteSection(".debug_loc",
|
||||
BC.registerOrUpdateNoteSection(".debug_loc",
|
||||
copyByteArray(*LocationListSectionContents),
|
||||
LocationListSectionContents->size());
|
||||
}
|
||||
|
||||
void RewriteInstance::updateGdbIndexSection() {
|
||||
if (!GdbIndexSection)
|
||||
void DWARFRewriter::updateGdbIndexSection() {
|
||||
if (!BC.getGdbIndexSection())
|
||||
return;
|
||||
|
||||
// See https://sourceware.org/gdb/onlinedocs/gdb/Index-Section-Format.html for
|
||||
// .gdb_index section format.
|
||||
|
||||
StringRef GdbIndexContents = GdbIndexSection->getContents();
|
||||
StringRef GdbIndexContents = BC.getGdbIndexSection()->getContents();
|
||||
|
||||
const auto *Data = GdbIndexContents.data();
|
||||
|
||||
@ -523,13 +534,13 @@ void RewriteInstance::updateGdbIndexSection() {
|
||||
// Map CUs offsets to indices and verify existing index table.
|
||||
std::map<uint32_t, uint32_t> OffsetToIndexMap;
|
||||
const auto CUListSize = CUTypesOffset - CUListOffset;
|
||||
const auto NumCUs = BC->DwCtx->getNumCompileUnits();
|
||||
const auto NumCUs = BC.DwCtx->getNumCompileUnits();
|
||||
if (CUListSize != NumCUs * 16) {
|
||||
errs() << "BOLT-ERROR: .gdb_index: CU count mismatch\n";
|
||||
exit(1);
|
||||
}
|
||||
for (unsigned Index = 0; Index < NumCUs; ++Index, Data += 16) {
|
||||
const auto *CU = BC->DwCtx->getCompileUnitAtIndex(Index);
|
||||
const auto *CU = BC.DwCtx->getCompileUnitAtIndex(Index);
|
||||
const auto Offset = read64le(Data);
|
||||
if (CU->getOffset() != Offset) {
|
||||
errs() << "BOLT-ERROR: .gdb_index CU offset mismatch\n";
|
||||
@ -595,7 +606,123 @@ void RewriteInstance::updateGdbIndexSection() {
|
||||
memcpy(Buffer, Data, TrailingSize);
|
||||
|
||||
// Register the new section.
|
||||
BC->registerOrUpdateNoteSection(".gdb_index",
|
||||
BC.registerOrUpdateNoteSection(".gdb_index",
|
||||
NewGdbIndexContents,
|
||||
NewGdbIndexSize);
|
||||
}
|
||||
|
||||
void
|
||||
DWARFRewriter::convertToRanges(const DWARFAbbreviationDeclaration *Abbrev) {
|
||||
std::lock_guard<std::mutex> Lock(AbbrevPatcherMutex);
|
||||
AbbrevPatcher->addAttributePatch(Abbrev,
|
||||
dwarf::DW_AT_low_pc,
|
||||
dwarf::DW_AT_ranges,
|
||||
dwarf::DW_FORM_sec_offset);
|
||||
AbbrevPatcher->addAttributePatch(Abbrev,
|
||||
dwarf::DW_AT_high_pc,
|
||||
dwarf::DW_AT_low_pc,
|
||||
dwarf::DW_FORM_udata);
|
||||
}
|
||||
|
||||
/// Convert \p DIE to use DW_AT_ranges pointing at \p Ranges.
///
/// An empty \p Ranges maps to the writer's shared "empty ranges" offset;
/// otherwise the ranges are added to RangesSectionsWriter and the offset it
/// returns is used. The actual patching is delegated to the
/// (DWARFDie, uint64_t) overload.
void DWARFRewriter::convertToRanges(DWARFDie DIE,
                                    const DebugAddressRangesVector &Ranges) {
  const uint64_t RangesSectionOffset =
      Ranges.empty() ? RangesSectionsWriter->getEmptyRangesOffset()
                     : RangesSectionsWriter->addRanges(Ranges);

  convertToRanges(DIE, RangesSectionOffset);
}
|
||||
|
||||
void DWARFRewriter::convertPending(const DWARFAbbreviationDeclaration *Abbrev) {
|
||||
if (ConvertedRangesAbbrevs.count(Abbrev))
|
||||
return;
|
||||
|
||||
convertToRanges(Abbrev);
|
||||
|
||||
auto I = PendingRanges.find(Abbrev);
|
||||
if (I != PendingRanges.end()) {
|
||||
for (auto &Pair : I->second) {
|
||||
convertToRanges(Pair.first, {Pair.second});
|
||||
}
|
||||
PendingRanges.erase(I);
|
||||
}
|
||||
|
||||
ConvertedRangesAbbrevs.emplace(Abbrev);
|
||||
}
|
||||
|
||||
void DWARFRewriter::flushPendingRanges() {
|
||||
for (auto &I : PendingRanges) {
|
||||
for (auto &RangePair : I.second) {
|
||||
patchLowHigh(RangePair.first, RangePair.second);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
void getRangeAttrData(
|
||||
DWARFDie DIE,
|
||||
uint32_t &LowPCOffset, uint32_t &HighPCOffset,
|
||||
DWARFFormValue &LowPCFormValue, DWARFFormValue &HighPCFormValue) {
|
||||
LowPCOffset = -1U;
|
||||
HighPCOffset = -1U;
|
||||
LowPCFormValue = *DIE.find(dwarf::DW_AT_low_pc, &LowPCOffset);
|
||||
HighPCFormValue = *DIE.find(dwarf::DW_AT_high_pc, &HighPCOffset);
|
||||
|
||||
if (LowPCFormValue.getForm() != dwarf::DW_FORM_addr ||
|
||||
(HighPCFormValue.getForm() != dwarf::DW_FORM_addr &&
|
||||
HighPCFormValue.getForm() != dwarf::DW_FORM_data8 &&
|
||||
HighPCFormValue.getForm() != dwarf::DW_FORM_data4)) {
|
||||
errs() << "BOLT-WARNING: unexpected form value. Cannot update DIE "
|
||||
<< "at offset 0x" << Twine::utohexstr(DIE.getOffset()) << "\n";
|
||||
return;
|
||||
}
|
||||
if (LowPCOffset == -1U || (LowPCOffset + 8 != HighPCOffset)) {
|
||||
errs() << "BOLT-WARNING: high_pc expected immediately after low_pc. "
|
||||
<< "Cannot update DIE at offset 0x"
|
||||
<< Twine::utohexstr(DIE.getOffset()) << '\n';
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// Patch DW_AT_low_pc and DW_AT_high_pc of \p DIE in place so that they
/// describe \p Range, without converting the DIE to DW_AT_ranges.
///
/// low_pc is always written as a 64-bit address. What is written for high_pc
/// depends on its form:
///   - DW_FORM_addr:  the absolute end address Range.HighPC (64-bit);
///   - DW_FORM_data8: the offset HighPC - LowPC (64-bit);
///   - otherwise:     the offset HighPC - LowPC (32-bit).
///
/// NOTE(review): unlike convertToRanges(DWARFDie, uint64_t) this takes no
/// lock around DebugInfoPatcher -- presumably it is only reached from
/// single-threaded code; confirm.
void DWARFRewriter::patchLowHigh(DWARFDie DIE, DebugAddressRange Range) {
  uint32_t LowPCOffset, HighPCOffset;
  DWARFFormValue LowPCFormValue, HighPCFormValue;
  getRangeAttrData(
      DIE, LowPCOffset, HighPCOffset, LowPCFormValue, HighPCFormValue);

  DebugInfoPatcher->addLE64Patch(LowPCOffset, Range.LowPC);

  const auto HighPCForm = HighPCFormValue.getForm();
  if (HighPCForm == dwarf::DW_FORM_addr) {
    // An address-class high_pc is the end address itself, not an offset from
    // low_pc, so it must not have LowPC subtracted.
    DebugInfoPatcher->addLE64Patch(HighPCOffset, Range.HighPC);
  } else if (HighPCForm == dwarf::DW_FORM_data8) {
    DebugInfoPatcher->addLE64Patch(HighPCOffset, Range.HighPC - Range.LowPC);
  } else {
    DebugInfoPatcher->addLE32Patch(HighPCOffset, Range.HighPC - Range.LowPC);
  }
}
|
||||
|
||||
/// Overwrite the low_pc/high_pc bytes of \p DIE in .debug_info so that they
/// hold a 4-byte reference to \p RangesSectionOffset followed by a padded
/// zero value.
///
/// The padding width depends on the form of high_pc: 12 bytes for
/// DW_FORM_addr / DW_FORM_data8 and 8 bytes for DW_FORM_data4 (presumably
/// the 8-byte low_pc plus the high_pc field, minus the 4 bytes taken by the
/// ranges offset -- confirm). Any other form is treated as unreachable.
/// Both patches are added while holding DebugInfoPatcherMutex.
void DWARFRewriter::convertToRanges(DWARFDie DIE,
                                    uint64_t RangesSectionOffset) {
  uint32_t LowPCOffset, HighPCOffset;
  DWARFFormValue LowPCFormValue, HighPCFormValue;
  getRangeAttrData(
      DIE, LowPCOffset, HighPCOffset, LowPCFormValue, HighPCFormValue);

  unsigned LowPCSize = 0;
  switch (HighPCFormValue.getForm()) {
  case dwarf::DW_FORM_addr:
  case dwarf::DW_FORM_data8:
    LowPCSize = 12;
    break;
  case dwarf::DW_FORM_data4:
    LowPCSize = 8;
    break;
  default:
    llvm_unreachable("unexpected form");
  }

  std::lock_guard<std::mutex> Guard(DebugInfoPatcherMutex);
  DebugInfoPatcher->addLE32Patch(LowPCOffset, RangesSectionOffset);
  DebugInfoPatcher->addUDataPatch(LowPCOffset + 4, 0, LowPCSize);
}
|
||||
|
||||
|
||||
125
src/DWARFRewriter.h
Normal file
125
src/DWARFRewriter.h
Normal file
@ -0,0 +1,125 @@
|
||||
//===--- DWARFRewriter.h --------------------------------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_TOOLS_LLVM_BOLT_DWARF_REWRITER_H
|
||||
#define LLVM_TOOLS_LLVM_BOLT_DWARF_REWRITER_H
|
||||
|
||||
#include "DebugData.h"
|
||||
#include "RewriteInstance.h"
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
|
||||
namespace llvm {
|
||||
|
||||
namespace bolt {
|
||||
|
||||
class BinaryFunction;
|
||||
|
||||
class DWARFRewriter {
|
||||
DWARFRewriter() = delete;
|
||||
|
||||
BinaryContext &BC;
|
||||
|
||||
using SectionPatchersType = RewriteInstance::SectionPatchersType;
|
||||
|
||||
SectionPatchersType &SectionPatchers;
|
||||
|
||||
SimpleBinaryPatcher *DebugInfoPatcher{nullptr};
|
||||
|
||||
std::mutex DebugInfoPatcherMutex;
|
||||
|
||||
DebugAbbrevPatcher *AbbrevPatcher{nullptr};
|
||||
|
||||
std::mutex AbbrevPatcherMutex;
|
||||
|
||||
/// Stores and serializes information that will be put into the .debug_ranges
|
||||
/// and .debug_aranges DWARF sections.
|
||||
std::unique_ptr<DebugRangesSectionsWriter> RangesSectionsWriter;
|
||||
|
||||
std::unique_ptr<DebugLocWriter> LocationListWriter;
|
||||
|
||||
/// Recursively update debug info for all DIEs in \p Unit.
|
||||
/// If \p Function is not empty, it points to a function corresponding
|
||||
/// to a parent DW_TAG_subprogram node of the current \p DIE.
|
||||
void updateUnitDebugInfo(
|
||||
const DWARFDie DIE, std::vector<const BinaryFunction *> FunctionStack,
|
||||
const BinaryFunction *&CachedFunction,
|
||||
std::map<DebugAddressRangesVector, uint64_t> &CachedRanges);
|
||||
|
||||
/// Patches the binary for an object's address ranges to be updated.
|
||||
/// The object can be anything that has associated address ranges via either
|
||||
/// DW_AT_low/high_pc or DW_AT_ranges (i.e. functions, lexical blocks, etc).
|
||||
/// \p DebugRangesOffset is the offset in .debug_ranges of the object's
|
||||
/// new address ranges in the output binary.
|
||||
/// \p Unit Compile unit the object belongs to.
|
||||
/// \p DIE is the object's DIE in the input binary.
|
||||
void updateDWARFObjectAddressRanges(const DWARFDie DIE,
|
||||
uint64_t DebugRangesOffset);
|
||||
|
||||
/// Generate new contents for .debug_ranges and .debug_aranges sections.
|
||||
void finalizeDebugSections();
|
||||
|
||||
/// Patches the binary for DWARF address ranges (e.g. in functions and lexical
|
||||
/// blocks) to be updated.
|
||||
void updateDebugAddressRanges();
|
||||
|
||||
/// Rewrite .gdb_index section if present.
|
||||
void updateGdbIndexSection();
|
||||
|
||||
/// Abbreviations that were converted to use DW_AT_ranges.
|
||||
std::set<const DWARFAbbreviationDeclaration *> ConvertedRangesAbbrevs;
|
||||
|
||||
/// DIEs with abbrevs that were not converted to DW_AT_ranges.
|
||||
/// We only update those when all DIEs have been processed to guarantee that
|
||||
/// the abbrev (which is shared) is intact.
|
||||
std::map<const DWARFAbbreviationDeclaration *,
|
||||
std::vector<std::pair<DWARFDie, DebugAddressRange>>> PendingRanges;
|
||||
|
||||
/// Convert \p Abbrev from using a simple DW_AT_(low|high)_pc range to
|
||||
/// DW_AT_ranges.
|
||||
void convertToRanges(const DWARFAbbreviationDeclaration *Abbrev);
|
||||
|
||||
/// Update \p DIE that was using DW_AT_(low|high)_pc with DW_AT_ranges offset.
|
||||
void convertToRanges(DWARFDie DIE, uint64_t RangesSectionOffset);
|
||||
|
||||
/// Same as above, but takes a vector of \p Ranges as a parameter.
|
||||
void convertToRanges(DWARFDie DIE, const DebugAddressRangesVector &Ranges);
|
||||
|
||||
/// Patch DW_AT_(low|high)_pc values for the \p DIE based on \p Range.
|
||||
void patchLowHigh(DWARFDie DIE, DebugAddressRange Range);
|
||||
|
||||
/// Convert pending ranges associated with the given \p Abbrev.
|
||||
void convertPending(const DWARFAbbreviationDeclaration *Abbrev);
|
||||
|
||||
/// Once all DIEs were seen, update DW_AT_(low|high)_pc values.
|
||||
void flushPendingRanges();
|
||||
|
||||
public:
|
||||
DWARFRewriter(BinaryContext &BC,
|
||||
SectionPatchersType &SectionPatchers)
|
||||
: BC(BC), SectionPatchers(SectionPatchers) {}
|
||||
|
||||
/// Main function for updating the DWARF debug info.
|
||||
void updateDebugInfo();
|
||||
|
||||
/// Computes output .debug_line line table offsets for each compile unit,
|
||||
/// and updates stmt_list for a corresponding compile unit.
|
||||
void updateLineTableOffsets();
|
||||
|
||||
/// Updates debug line information for non-simple functions, which are not
|
||||
/// rewritten.
|
||||
void updateDebugLineInfoForNonSimpleFunctions();
|
||||
};
|
||||
|
||||
} // namespace bolt
|
||||
} // namespace llvm
|
||||
|
||||
#endif
|
||||
@ -14,6 +14,7 @@
|
||||
|
||||
#include "BinaryContext.h"
|
||||
#include "BinaryFunction.h"
|
||||
#include "BoltAddressTranslation.h"
|
||||
#include "DataAggregator.h"
|
||||
#include "Heatmap.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
@ -54,6 +55,13 @@ IgnoreBuildID("ignore-build-id",
|
||||
cl::init(false),
|
||||
cl::cat(AggregatorCategory));
|
||||
|
||||
static cl::opt<bool>
|
||||
FilterMemProfile("filter-mem-profile",
|
||||
cl::desc("if processing a memory profile, filter out stack or heap accesses that "
|
||||
"won't be useful for BOLT to reduce profile file size"),
|
||||
cl::init(true),
|
||||
cl::cat(AggregatorCategory));
|
||||
|
||||
static cl::opt<unsigned>
|
||||
HeatmapBlock("block-size",
|
||||
cl::desc("size of a heat map block in bytes (default 64)"),
|
||||
@ -88,6 +96,13 @@ TimeAggregator("time-aggr",
|
||||
cl::ZeroOrMore,
|
||||
cl::cat(AggregatorCategory));
|
||||
|
||||
static cl::opt<bool>
|
||||
UseEventPC("use-event-pc",
|
||||
cl::desc("use event PC in combination with LBR sampling"),
|
||||
cl::init(false),
|
||||
cl::ZeroOrMore,
|
||||
cl::cat(AggregatorCategory));
|
||||
|
||||
static cl::opt<bool>
|
||||
WriteAutoFDOData("autofdo",
|
||||
cl::desc("generate autofdo textual data instead of bolt data"),
|
||||
@ -210,6 +225,7 @@ void DataAggregator::launchPerfProcess(StringRef Name, PerfProcessInfo &PPI,
|
||||
*Str++ = 0;
|
||||
} while (true);
|
||||
|
||||
Argv.push_back("-f");
|
||||
Argv.push_back("-i");
|
||||
Argv.push_back(PerfDataFilename.data());
|
||||
Argv.push_back(nullptr);
|
||||
@ -232,13 +248,18 @@ void DataAggregator::launchPerfProcess(StringRef Name, PerfProcessInfo &PPI,
|
||||
TempFiles.push_back(PPI.StderrPath.data());
|
||||
|
||||
Optional<StringRef> Redirects[] = {
|
||||
llvm::None, // Stdin
|
||||
llvm::None, // Stdin
|
||||
StringRef(PPI.StdoutPath.data()), // Stdout
|
||||
StringRef(PPI.StderrPath.data())}; // Stderr
|
||||
|
||||
DEBUG(dbgs() << "Launching perf: " << PerfPath.data() << " 1> "
|
||||
<< PPI.StdoutPath.data() << " 2> "
|
||||
<< PPI.StderrPath.data() << "\n");
|
||||
DEBUG({
|
||||
dbgs() << "Launching perf: ";
|
||||
for (const char *Arg : Argv)
|
||||
dbgs() << Arg << " ";
|
||||
dbgs() << " 1> "
|
||||
<< PPI.StdoutPath.data() << " 2> "
|
||||
<< PPI.StderrPath.data() << "\n";
|
||||
});
|
||||
|
||||
if (Wait) {
|
||||
PPI.PI.ReturnCode =
|
||||
@ -422,11 +443,8 @@ std::error_code DataAggregator::writeAutoFDOData() {
|
||||
return std::error_code();
|
||||
}
|
||||
|
||||
void DataAggregator::parseProfile(
|
||||
BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs) {
|
||||
void DataAggregator::parseProfile(BinaryContext &BC) {
|
||||
this->BC = &BC;
|
||||
this->BFs = &BFs;
|
||||
|
||||
if (opts::ReadPreAggregated) {
|
||||
parsePreAggregated();
|
||||
@ -546,9 +564,7 @@ void DataAggregator::parseProfile(
|
||||
deleteTempFiles();
|
||||
}
|
||||
|
||||
void DataAggregator::processProfile(
|
||||
BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs) {
|
||||
void DataAggregator::processProfile(BinaryContext &BC) {
|
||||
if (opts::ReadPreAggregated)
|
||||
processPreAggregated();
|
||||
else if (opts::BasicAggregation)
|
||||
@ -559,7 +575,7 @@ void DataAggregator::processProfile(
|
||||
processMemEvents();
|
||||
|
||||
// Mark all functions with registered events as having a valid profile.
|
||||
for (auto &BFI : BFs) {
|
||||
for (auto &BFI : BC.getBinaryFunctions()) {
|
||||
auto &BF = BFI.second;
|
||||
if (BF.getBranchData()) {
|
||||
const auto Flags = opts::BasicAggregation ? BinaryFunction::PF_SAMPLE
|
||||
@ -577,19 +593,46 @@ void DataAggregator::processProfile(
|
||||
}
|
||||
|
||||
BinaryFunction *
|
||||
DataAggregator::getBinaryFunctionContainingAddress(uint64_t Address) {
|
||||
DataAggregator::getBinaryFunctionContainingAddress(uint64_t Address) const {
|
||||
if (!BC->containsAddress(Address))
|
||||
return nullptr;
|
||||
|
||||
auto FI = BFs->upper_bound(Address);
|
||||
if (FI == BFs->begin())
|
||||
return nullptr;
|
||||
--FI;
|
||||
// Use shallow search to avoid fetching the parent function, in case
|
||||
// BinaryContext linked two functions. When aggregating data and writing the
|
||||
// profile, we want to write offsets relative to the closest symbol in the
|
||||
// symbol table, not relative to the parent function, to avoid creating
|
||||
// profile that is too fragile and depends on the layout of other functions.
|
||||
return BC->getBinaryFunctionContainingAddress(Address, /*CheckPastEnd=*/false,
|
||||
/*UseMaxSize=*/true,
|
||||
/*Shallow=*/true);
|
||||
}
|
||||
|
||||
const auto UsedSize = FI->second.getMaxSize();
|
||||
if (Address >= FI->first + UsedSize)
|
||||
return nullptr;
|
||||
return &FI->second;
|
||||
/// Return the name under which samples recorded in \p Func are reported.
///
/// Without a translation table (BAT) this is simply the first name of
/// \p Func. With BAT, if a parent address is found for \p Func, \p Count is
/// added to NumColdSamples and the function containing that parent address
/// (when one exists) supplies the names instead. Among those names, the first
/// one containing at least two '/' characters is preferred; otherwise the
/// first name is returned.
StringRef DataAggregator::getLocationName(BinaryFunction &Func,
                                          uint64_t Count) {
  if (!BAT)
    return Func.getNames()[0];

  const BinaryFunction *NamedFunc = &Func;
  if (const auto ParentAddr = BAT->fetchParentAddress(Func.getAddress())) {
    // A non-zero parent address means the sample landed in a fragment that
    // BAT links to another function; tally it as cold activity.
    NumColdSamples += Count;
    if (auto *ParentFunc = getBinaryFunctionContainingAddress(ParentAddr))
      NamedFunc = ParentFunc;
  }

  const auto &Names = NamedFunc->getNames();
  // For local functions, prefer the name that embeds the file name where the
  // function was declared, i.e. one shaped like Symbol/FileName/1.
  for (const auto &Name : Names) {
    const StringRef Candidate = Name;
    const size_t FirstSlash = Candidate.find('/');
    if (FirstSlash == StringRef::npos)
      continue;
    if (Candidate.find('/', FirstSlash + 1) != StringRef::npos)
      return Candidate;
  }
  return Names[0];
}
|
||||
|
||||
bool DataAggregator::doSample(BinaryFunction &Func, uint64_t Address,
|
||||
@ -597,12 +640,17 @@ bool DataAggregator::doSample(BinaryFunction &Func, uint64_t Address,
|
||||
auto I = FuncsToSamples.find(Func.getNames()[0]);
|
||||
if (I == FuncsToSamples.end()) {
|
||||
bool Success;
|
||||
StringRef LocName = getLocationName(Func, Count);
|
||||
std::tie(I, Success) = FuncsToSamples.insert(std::make_pair(
|
||||
Func.getNames()[0],
|
||||
FuncSampleData(Func.getNames()[0], FuncSampleData::ContainerTy())));
|
||||
FuncSampleData(LocName, FuncSampleData::ContainerTy())));
|
||||
}
|
||||
|
||||
I->second.bumpCount(Address - Func.getAddress(), Count);
|
||||
Address -= Func.getAddress();
|
||||
if (BAT)
|
||||
Address = BAT->translate(Func, Address, /*IsBranchSrc=*/false);
|
||||
|
||||
I->second.bumpCount(Address, Count);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -612,12 +660,26 @@ bool DataAggregator::doIntraBranch(BinaryFunction &Func, uint64_t From,
|
||||
FuncBranchData *AggrData = Func.getBranchData();
|
||||
if (!AggrData) {
|
||||
AggrData = &FuncsToBranches[Func.getNames()[0]];
|
||||
AggrData->Name = Func.getNames()[0];
|
||||
AggrData->Name = getLocationName(Func, Count);
|
||||
Func.setBranchData(AggrData);
|
||||
}
|
||||
|
||||
AggrData->bumpBranchCount(From - Func.getAddress(), To - Func.getAddress(),
|
||||
Count, Mispreds);
|
||||
From -= Func.getAddress();
|
||||
To -= Func.getAddress();
|
||||
DEBUG(dbgs() << "BOLT-DEBUG: bumpBranchCount: " << Func.getPrintName()
|
||||
<< " @ " << Twine::utohexstr(From) << " -> "
|
||||
<< Func.getPrintName() << " @ " << Twine::utohexstr(To)
|
||||
<< '\n');
|
||||
if (BAT) {
|
||||
From = BAT->translate(Func, From, /*IsBranchSrc=*/true);
|
||||
To = BAT->translate(Func, To, /*IsBranchSrc=*/false);
|
||||
DEBUG(dbgs() << "BOLT-DEBUG: BAT translation on bumpBranchCount: "
|
||||
<< Func.getPrintName() << " @ " << Twine::utohexstr(From)
|
||||
<< " -> " << Func.getPrintName() << " @ "
|
||||
<< Twine::utohexstr(To) << '\n');
|
||||
}
|
||||
|
||||
AggrData->bumpBranchCount(From, To, Count, Mispreds);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -630,26 +692,30 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc,
|
||||
StringRef SrcFunc;
|
||||
StringRef DstFunc;
|
||||
if (FromFunc) {
|
||||
SrcFunc = FromFunc->getNames()[0];
|
||||
SrcFunc = getLocationName(*FromFunc, Count);
|
||||
FromAggrData = FromFunc->getBranchData();
|
||||
if (!FromAggrData) {
|
||||
FromAggrData = &FuncsToBranches[SrcFunc];
|
||||
FromAggrData = &FuncsToBranches[FromFunc->getNames()[0]];
|
||||
FromAggrData->Name = SrcFunc;
|
||||
FromFunc->setBranchData(FromAggrData);
|
||||
}
|
||||
From -= FromFunc->getAddress();
|
||||
if (BAT)
|
||||
From = BAT->translate(*FromFunc, From, /*IsBranchSrc=*/true);
|
||||
|
||||
FromFunc->recordExit(From, Mispreds, Count);
|
||||
}
|
||||
if (ToFunc) {
|
||||
DstFunc = ToFunc->getNames()[0];
|
||||
DstFunc = getLocationName(*ToFunc, 0);
|
||||
ToAggrData = ToFunc->getBranchData();
|
||||
if (!ToAggrData) {
|
||||
ToAggrData = &FuncsToBranches[DstFunc];
|
||||
ToAggrData = &FuncsToBranches[ToFunc->getNames()[0]];
|
||||
ToAggrData->Name = DstFunc;
|
||||
ToFunc->setBranchData(ToAggrData);
|
||||
}
|
||||
To -= ToFunc->getAddress();
|
||||
if (BAT)
|
||||
To = BAT->translate(*ToFunc, To, /*IsBranchSrc=*/false);
|
||||
|
||||
ToFunc->recordEntry(To, Mispreds, Count);
|
||||
}
|
||||
@ -684,13 +750,19 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
|
||||
auto *FromFunc = getBinaryFunctionContainingAddress(First.To);
|
||||
auto *ToFunc = getBinaryFunctionContainingAddress(Second.From);
|
||||
if (!FromFunc || !ToFunc) {
|
||||
DEBUG(
|
||||
dbgs() << "Out of range trace starting in " << FromFunc->getPrintName()
|
||||
<< " @ " << Twine::utohexstr(First.To - FromFunc->getAddress())
|
||||
<< " and ending in " << ToFunc->getPrintName() << " @ "
|
||||
<< ToFunc->getPrintName() << " @ "
|
||||
<< Twine::utohexstr(Second.From - ToFunc->getAddress()) << '\n');
|
||||
NumLongRangeTraces += Count;
|
||||
return false;
|
||||
}
|
||||
if (FromFunc != ToFunc) {
|
||||
NumInvalidTraces += Count;
|
||||
DEBUG(dbgs() << "Trace starting in " << FromFunc->getPrintName() << " @ "
|
||||
<< Twine::utohexstr(First.To - FromFunc->getAddress())
|
||||
DEBUG(dbgs() << "Invalid trace starting in " << FromFunc->getPrintName()
|
||||
<< " @ " << Twine::utohexstr(First.To - FromFunc->getAddress())
|
||||
<< " and ending in " << ToFunc->getPrintName() << " @ "
|
||||
<< ToFunc->getPrintName() << " @ "
|
||||
<< Twine::utohexstr(Second.From - ToFunc->getAddress())
|
||||
@ -698,12 +770,22 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
|
||||
return false;
|
||||
}
|
||||
|
||||
auto FTs = FromFunc->getFallthroughsInTrace(First, Second, Count);
|
||||
auto FTs = BAT ? BAT->getFallthroughsInTrace(*FromFunc, First, Second)
|
||||
: FromFunc->getFallthroughsInTrace(First, Second, Count);
|
||||
if (!FTs) {
|
||||
DEBUG(dbgs() << "Invalid trace starting in " << FromFunc->getPrintName()
|
||||
<< " @ " << Twine::utohexstr(First.To - FromFunc->getAddress())
|
||||
<< " and ending in " << ToFunc->getPrintName() << " @ "
|
||||
<< ToFunc->getPrintName() << " @ "
|
||||
<< Twine::utohexstr(Second.From - ToFunc->getAddress())
|
||||
<< '\n');
|
||||
NumInvalidTraces += Count;
|
||||
return false;
|
||||
}
|
||||
|
||||
DEBUG(dbgs() << "Processing " << FTs->size() << " fallthroughs for "
|
||||
<< FromFunc->getPrintName() << ":" << Twine::utohexstr(First.To)
|
||||
<< " to " << Twine::utohexstr(Second.From) << ".\n");
|
||||
for (const auto &Pair : *FTs) {
|
||||
doIntraBranch(*FromFunc, Pair.first + FromFunc->getAddress(),
|
||||
Pair.second + FromFunc->getAddress(), Count, false);
|
||||
@ -796,7 +878,7 @@ ErrorOr<DataAggregator::PerfBranchSample> DataAggregator::parseBranchSample() {
|
||||
auto MMapInfoIter = BinaryMMapInfo.find(*PIDRes);
|
||||
if (MMapInfoIter == BinaryMMapInfo.end()) {
|
||||
consumeRestOfLine();
|
||||
return Res;
|
||||
return make_error_code(errc::no_such_process);
|
||||
}
|
||||
|
||||
while (checkAndConsumeFS()) {}
|
||||
@ -1009,8 +1091,11 @@ std::error_code DataAggregator::printLBRHeatMap() {
|
||||
|
||||
while (hasData()) {
|
||||
auto SampleRes = parseBranchSample();
|
||||
if (std::error_code EC = SampleRes.getError())
|
||||
if (auto EC = SampleRes.getError()) {
|
||||
if (EC == errc::no_such_process)
|
||||
continue;
|
||||
return EC;
|
||||
}
|
||||
|
||||
auto &Sample = SampleRes.get();
|
||||
|
||||
@ -1071,33 +1156,39 @@ std::error_code DataAggregator::parseBranchEvents() {
|
||||
uint64_t NumTotalSamples{0};
|
||||
uint64_t NumEntries{0};
|
||||
uint64_t NumSamples{0};
|
||||
uint64_t NumSamplesNoLBR{0};
|
||||
uint64_t NumTraces{0};
|
||||
|
||||
while (hasData()) {
|
||||
++NumTotalSamples;
|
||||
|
||||
auto SampleRes = parseBranchSample();
|
||||
if (std::error_code EC = SampleRes.getError())
|
||||
if (auto EC = SampleRes.getError()) {
|
||||
if (EC == errc::no_such_process)
|
||||
continue;
|
||||
return EC;
|
||||
}
|
||||
++NumSamples;
|
||||
|
||||
auto &Sample = SampleRes.get();
|
||||
if (opts::WriteAutoFDOData)
|
||||
++BasicSamples[Sample.PC];
|
||||
|
||||
if (Sample.LBR.empty())
|
||||
if (Sample.LBR.empty()) {
|
||||
++NumSamplesNoLBR;
|
||||
continue;
|
||||
}
|
||||
|
||||
++NumSamples;
|
||||
NumEntries += Sample.LBR.size();
|
||||
|
||||
// LBRs are stored in reverse execution order. NextLBR refers to the next
|
||||
// executed branch record.
|
||||
const LBREntry *NextLBR{nullptr};
|
||||
// LBRs are stored in reverse execution order. NextPC refers to the next
|
||||
// recorded executed PC.
|
||||
uint64_t NextPC = opts::UseEventPC ? Sample.PC : 0;
|
||||
for (const auto &LBR : Sample.LBR) {
|
||||
if (NextLBR) {
|
||||
if (NextPC) {
|
||||
// Record fall-through trace.
|
||||
const auto TraceFrom = LBR.To;
|
||||
const auto TraceTo = NextLBR->From;
|
||||
const auto TraceTo = NextPC;
|
||||
const auto *TraceBF = getBinaryFunctionContainingAddress(TraceFrom);
|
||||
if (TraceBF && TraceBF->containsAddress(TraceTo)) {
|
||||
auto &Info = FallthroughLBRs[Trace(TraceFrom, TraceTo)];
|
||||
@ -1108,14 +1199,37 @@ std::error_code DataAggregator::parseBranchEvents() {
|
||||
}
|
||||
} else {
|
||||
if (TraceBF && getBinaryFunctionContainingAddress(TraceTo)) {
|
||||
DEBUG(dbgs() << "Invalid trace starting in "
|
||||
<< TraceBF->getPrintName() << " @ "
|
||||
<< Twine::utohexstr(TraceFrom - TraceBF->getAddress())
|
||||
<< " and ending @ " << Twine::utohexstr(TraceTo)
|
||||
<< '\n');
|
||||
++NumInvalidTraces;
|
||||
} else {
|
||||
DEBUG(
|
||||
dbgs() << "Out of range trace starting in "
|
||||
<< (TraceBF ? TraceBF->getPrintName() : "None") << " @ "
|
||||
<< Twine::utohexstr(
|
||||
TraceFrom - (TraceBF ? TraceBF->getAddress() : 0))
|
||||
<< " and ending in "
|
||||
<< (getBinaryFunctionContainingAddress(TraceTo)
|
||||
? getBinaryFunctionContainingAddress(TraceTo)
|
||||
->getPrintName()
|
||||
: "None")
|
||||
<< " @ "
|
||||
<< Twine::utohexstr(
|
||||
TraceTo -
|
||||
(getBinaryFunctionContainingAddress(TraceTo)
|
||||
? getBinaryFunctionContainingAddress(TraceTo)
|
||||
->getAddress()
|
||||
: 0))
|
||||
<< '\n');
|
||||
++NumLongRangeTraces;
|
||||
}
|
||||
}
|
||||
++NumTraces;
|
||||
}
|
||||
NextLBR = &LBR;
|
||||
NextPC = LBR.From;
|
||||
|
||||
auto From = LBR.From;
|
||||
if (!getBinaryFunctionContainingAddress(From))
|
||||
@ -1159,14 +1273,23 @@ std::error_code DataAggregator::parseBranchEvents() {
|
||||
outs() << "PERF2BOLT: read " << NumSamples << " samples and "
|
||||
<< NumEntries << " LBR entries\n";
|
||||
if (NumTotalSamples) {
|
||||
const auto IgnoredSamples = NumTotalSamples - NumSamples;
|
||||
const auto PercentIgnored = 100.0f * IgnoredSamples / NumTotalSamples;
|
||||
outs() << "PERF2BOLT: " << IgnoredSamples << " samples";
|
||||
printColored(outs(), PercentIgnored, 20, 50);
|
||||
outs() << " were ignored\n";
|
||||
if (PercentIgnored > 50.0f) {
|
||||
errs() << "PERF2BOLT-WARNING: less than 50% of all recorded samples were "
|
||||
"attributed to the input binary\n";
|
||||
if (NumSamples && NumSamplesNoLBR == NumSamples) {
|
||||
// Note: we don't know if perf2bolt is being used to parse memory samples
|
||||
// at this point. In this case, it is OK to parse zero LBRs.
|
||||
errs() << "PERF2BOLT-WARNING: all recorded samples for this binary lack "
|
||||
"LBR. Record profile with perf record -j any or run perf2bolt "
|
||||
"in no-LBR mode with -nl (the performance improvement in -nl "
|
||||
"mode may be limited)\n";
|
||||
} else {
|
||||
const auto IgnoredSamples = NumTotalSamples - NumSamples;
|
||||
const auto PercentIgnored = 100.0f * IgnoredSamples / NumTotalSamples;
|
||||
outs() << "PERF2BOLT: " << IgnoredSamples << " samples";
|
||||
printColored(outs(), PercentIgnored, 20, 50);
|
||||
outs() << " were ignored\n";
|
||||
if (PercentIgnored > 50.0f) {
|
||||
errs() << "PERF2BOLT-WARNING: less than 50% of all recorded samples "
|
||||
"were attributed to the input binary\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
outs() << "PERF2BOLT: traces mismatching disassembled function contents: "
|
||||
@ -1191,6 +1314,19 @@ std::error_code DataAggregator::parseBranchEvents() {
|
||||
}
|
||||
outs() << "\n";
|
||||
|
||||
if (NumColdSamples > 0) {
|
||||
const auto ColdSamples = NumColdSamples * 100.0f / NumTotalSamples;
|
||||
outs() << "PERF2BOLT: " << NumColdSamples
|
||||
<< format(" (%.1f%%)", ColdSamples)
|
||||
<< " samples recorded in cold regions of split functions.\n";
|
||||
if (ColdSamples > 5.0f) {
|
||||
outs()
|
||||
<< "WARNING: The BOLT-processed binary where samples were collected "
|
||||
"likely used bad data or your service observed a large shift in "
|
||||
"profile. You may want to audit this.\n";
|
||||
}
|
||||
}
|
||||
|
||||
return std::error_code();
|
||||
}
|
||||
|
||||
@ -1330,11 +1466,17 @@ void DataAggregator::processMemEvents() {
|
||||
if (MemFunc) {
|
||||
MemName = MemFunc->getNames()[0];
|
||||
Addr -= MemFunc->getAddress();
|
||||
} else if (Addr) { // TODO: filter heap/stack/nulls here?
|
||||
} else if (Addr) {
|
||||
if (auto *BD = BC->getBinaryDataContainingAddress(Addr)) {
|
||||
MemName = BD->getName();
|
||||
Addr -= BD->getAddress();
|
||||
} else if (opts::FilterMemProfile) {
|
||||
// Filter out heap/stack accesses
|
||||
continue;
|
||||
}
|
||||
} else if (opts::FilterMemProfile) {
|
||||
// Filter out nulls
|
||||
continue;
|
||||
}
|
||||
|
||||
const Location FuncLoc(!FuncName.empty(), FuncName, PC);
|
||||
@ -1394,7 +1536,7 @@ void DataAggregator::processPreAggregated() {
|
||||
AggrEntry.From.Offset, false};
|
||||
LBREntry Second{AggrEntry.To.Offset, AggrEntry.To.Offset, false};
|
||||
doTrace(First, Second, AggrEntry.Count);
|
||||
++NumTraces;
|
||||
NumTraces += AggrEntry.Count;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -1776,6 +1918,8 @@ std::error_code DataAggregator::writeAggregatedFile() const {
|
||||
uint64_t BranchValues{0};
|
||||
uint64_t MemValues{0};
|
||||
|
||||
if (BAT)
|
||||
OutFile << "boltedcollection\n";
|
||||
if (opts::BasicAggregation) {
|
||||
OutFile << "no_lbr";
|
||||
for (const auto &Entry : EventNames) {
|
||||
|
||||
@ -28,6 +28,7 @@ namespace bolt {
|
||||
|
||||
class BinaryFunction;
|
||||
class BinaryContext;
|
||||
class BoltAddressTranslation;
|
||||
|
||||
/// DataAggregator inherits all parsing logic from DataReader as well as
|
||||
/// its data structures used to represent aggregated profile data in memory.
|
||||
@ -172,11 +173,13 @@ class DataAggregator : public DataReader {
|
||||
|
||||
/// References to core BOLT data structures
|
||||
BinaryContext *BC{nullptr};
|
||||
std::map<uint64_t, BinaryFunction> *BFs{nullptr};
|
||||
|
||||
BoltAddressTranslation *BAT{nullptr};
|
||||
|
||||
/// Aggregation statistics
|
||||
uint64_t NumInvalidTraces{0};
|
||||
uint64_t NumLongRangeTraces{0};
|
||||
uint64_t NumColdSamples{0};
|
||||
|
||||
/// Looks into system PATH for Linux Perf and set up the aggregator to use it
|
||||
void findPerfExecutable();
|
||||
@ -194,7 +197,16 @@ class DataAggregator : public DataReader {
|
||||
|
||||
/// Look up which function contains an address by using our map of
|
||||
/// disassembled BinaryFunctions
|
||||
BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address);
|
||||
BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address) const;
|
||||
|
||||
/// Retrieve the location name to be used for samples recorded in \p Func.
|
||||
/// If doing BAT translation, link cold parts to the hot part names (used by
|
||||
/// the original binary). \p Count specifies how many samples were recorded
|
||||
/// at that location, so we can tally total activity in cold areas if we are
|
||||
/// dealing with profiling data collected in a bolted binary. For LBRs,
|
||||
/// \p Count should only be used for the source of the branch to avoid
|
||||
/// counting cold activity twice (one for source and another for destination).
|
||||
StringRef getLocationName(BinaryFunction &Func, uint64_t Count);
|
||||
|
||||
/// Semantic actions - parser hooks to interpret parsed perf samples
|
||||
/// Register a sample (non-LBR mode), i.e. a new hit at \p Address
|
||||
@ -226,7 +238,9 @@ class DataAggregator : public DataReader {
|
||||
std::error_code printLBRHeatMap();
|
||||
|
||||
/// Parse a single perf sample containing a PID associated with a sequence of
|
||||
/// LBR entries
|
||||
/// LBR entries. If the PID does not correspond to the binary we are looking
|
||||
/// for, return std::errc::no_such_process. If other parsing errors occur,
|
||||
/// return the error. Otherwise, return the parsed sample.
|
||||
ErrorOr<PerfBranchSample> parseBranchSample();
|
||||
|
||||
/// Parse a single perf sample containing a PID associated with an event name
|
||||
@ -384,6 +398,14 @@ public:
|
||||
/// Set the file name to save aggregate data to
|
||||
void setOutputFDataName(StringRef Name) { OutputFDataName = Name; }
|
||||
|
||||
/// Set Bolt Address Translation Table when processing samples collected in
|
||||
/// bolted binaries
|
||||
void setBAT(BoltAddressTranslation *B) { BAT = B; }
|
||||
|
||||
/// Returns true if this aggregation job is using a translation table to
|
||||
/// remap samples collected on binaries already processed by BOLT.
|
||||
bool usesBAT() const { return BAT; }
|
||||
|
||||
/// Start an aggregation job asynchronously. Call "aggregate" to finish it
|
||||
/// with a list of disassembled functions.
|
||||
void start(StringRef PerfDataFilename);
|
||||
@ -400,12 +422,10 @@ public:
|
||||
|
||||
/// Parse profile and mark functions/objects with profile.
|
||||
/// Don't assign profile to functions yet.
|
||||
void parseProfile(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs);
|
||||
void parseProfile(BinaryContext &BC);
|
||||
|
||||
/// Populate functions with profile.
|
||||
void processProfile(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs);
|
||||
void processProfile(BinaryContext &BC);
|
||||
|
||||
/// Check whether \p FileName is a perf.data file
|
||||
static bool checkPerfDataMagic(StringRef FileName);
|
||||
|
||||
@ -251,16 +251,31 @@ void FuncMemData::update(const Location &Offset, const Location &Addr) {
|
||||
++Data[Iter->second].Count;
|
||||
}
|
||||
|
||||
void DataReader::reset() {
|
||||
for (auto &Pair : getAllFuncsBranchData()) {
|
||||
Pair.second.Used = false;
|
||||
}
|
||||
for (auto &Pair : getAllFuncsMemData()) {
|
||||
Pair.second.Used = false;
|
||||
}
|
||||
}
|
||||
|
||||
ErrorOr<std::unique_ptr<DataReader>>
|
||||
DataReader::readPerfData(StringRef Path, raw_ostream &Diag) {
|
||||
ErrorOr<std::unique_ptr<MemoryBuffer>> MB =
|
||||
MemoryBuffer::getFileOrSTDIN(Path);
|
||||
if (std::error_code EC = MB.getError()) {
|
||||
Diag << "Cannot open " << Path << ": " << EC.message() << "\n";
|
||||
auto MB = MemoryBuffer::getFileOrSTDIN(Path);
|
||||
if (auto EC = MB.getError()) {
|
||||
Diag << "cannot open " << Path << ": " << EC.message() << "\n";
|
||||
return EC;
|
||||
}
|
||||
auto DR = make_unique<DataReader>(std::move(MB.get()), Diag);
|
||||
DR->parse();
|
||||
auto DR = llvm::make_unique<DataReader>(std::move(MB.get()), Diag);
|
||||
if (auto EC = DR->parse()) {
|
||||
return EC;
|
||||
}
|
||||
if (!DR->ParsingBuf.empty()) {
|
||||
Diag << "WARNING: invalid profile data detected at line " << DR->Line
|
||||
<< ". Possibly corrupted profile.\n";
|
||||
}
|
||||
|
||||
DR->buildLTONameMaps();
|
||||
return std::move(DR);
|
||||
}
|
||||
@ -280,6 +295,13 @@ bool DataReader::expectAndConsumeFS() {
|
||||
return true;
|
||||
}
|
||||
|
||||
void DataReader::consumeAllRemainingFS() {
|
||||
while (ParsingBuf[0] == FieldSeparator) {
|
||||
ParsingBuf = ParsingBuf.drop_front(1);
|
||||
Col += 1;
|
||||
}
|
||||
}
|
||||
|
||||
bool DataReader::checkAndConsumeNewLine() {
|
||||
if (ParsingBuf[0] != '\n')
|
||||
return false;
|
||||
@ -374,12 +396,14 @@ ErrorOr<Location> DataReader::parseLocation(char EndChar,
|
||||
|
||||
if (!expectAndConsumeFS())
|
||||
return make_error_code(llvm::errc::io_error);
|
||||
consumeAllRemainingFS();
|
||||
|
||||
// Read the string containing the symbol or the DSO name
|
||||
auto NameRes = parseString(FieldSeparator);
|
||||
if (std::error_code EC = NameRes.getError())
|
||||
return EC;
|
||||
StringRef Name = NameRes.get();
|
||||
consumeAllRemainingFS();
|
||||
|
||||
// Read the offset
|
||||
auto Offset = parseHexField(EndChar, EndNl);
|
||||
@ -395,21 +419,25 @@ ErrorOr<BranchInfo> DataReader::parseBranchInfo() {
|
||||
return EC;
|
||||
Location From = Res.get();
|
||||
|
||||
consumeAllRemainingFS();
|
||||
Res = parseLocation(FieldSeparator);
|
||||
if (std::error_code EC = Res.getError())
|
||||
return EC;
|
||||
Location To = Res.get();
|
||||
|
||||
consumeAllRemainingFS();
|
||||
auto MRes = parseNumberField(FieldSeparator);
|
||||
if (std::error_code EC = MRes.getError())
|
||||
return EC;
|
||||
int64_t NumMispreds = MRes.get();
|
||||
|
||||
consumeAllRemainingFS();
|
||||
auto BRes = parseNumberField(FieldSeparator, /* EndNl = */ true);
|
||||
if (std::error_code EC = BRes.getError())
|
||||
return EC;
|
||||
int64_t NumBranches = BRes.get();
|
||||
|
||||
consumeAllRemainingFS();
|
||||
if (!checkAndConsumeNewLine()) {
|
||||
reportError("expected end of line");
|
||||
return make_error_code(llvm::errc::io_error);
|
||||
@ -424,15 +452,18 @@ ErrorOr<MemInfo> DataReader::parseMemInfo() {
|
||||
return EC;
|
||||
Location Offset = Res.get();
|
||||
|
||||
consumeAllRemainingFS();
|
||||
Res = parseMemLocation(FieldSeparator);
|
||||
if (std::error_code EC = Res.getError())
|
||||
return EC;
|
||||
Location Addr = Res.get();
|
||||
|
||||
consumeAllRemainingFS();
|
||||
auto CountRes = parseNumberField(FieldSeparator, true);
|
||||
if (std::error_code EC = CountRes.getError())
|
||||
return EC;
|
||||
|
||||
consumeAllRemainingFS();
|
||||
if (!checkAndConsumeNewLine()) {
|
||||
reportError("expected end of line");
|
||||
return make_error_code(llvm::errc::io_error);
|
||||
@ -447,11 +478,13 @@ ErrorOr<SampleInfo> DataReader::parseSampleInfo() {
|
||||
return EC;
|
||||
Location Address = Res.get();
|
||||
|
||||
consumeAllRemainingFS();
|
||||
auto BRes = parseNumberField(FieldSeparator, /* EndNl = */ true);
|
||||
if (std::error_code EC = BRes.getError())
|
||||
return EC;
|
||||
int64_t Occurrences = BRes.get();
|
||||
|
||||
consumeAllRemainingFS();
|
||||
if (!checkAndConsumeNewLine()) {
|
||||
reportError("expected end of line");
|
||||
return make_error_code(llvm::errc::io_error);
|
||||
@ -483,6 +516,20 @@ ErrorOr<bool> DataReader::maybeParseNoLBRFlag() {
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Consume an optional "boltedcollection" header line.
///
/// Returns false, leaving the buffer untouched, when the buffer does not start
/// with that marker. When it does, the marker is consumed and must be followed
/// immediately by a newline: on success returns true, otherwise reports an
/// error and returns io_error.
ErrorOr<bool> DataReader::maybeParseBATFlag() {
  const StringRef Marker("boltedcollection");
  const size_t MarkerLen = Marker.size();

  const bool HasMarker = ParsingBuf.size() >= MarkerLen &&
                         ParsingBuf.substr(0, MarkerLen) == Marker;
  if (!HasMarker)
    return false;

  ParsingBuf = ParsingBuf.drop_front(MarkerLen);
  Col += MarkerLen;

  if (checkAndConsumeNewLine())
    return true;

  reportError("malformed boltedcollection line");
  return make_error_code(llvm::errc::io_error);
}
|
||||
|
||||
|
||||
bool DataReader::hasBranchData() {
|
||||
if (ParsingBuf.size() == 0)
|
||||
return false;
|
||||
@ -599,6 +646,17 @@ std::error_code DataReader::parse() {
|
||||
if (!FlagOrErr)
|
||||
return FlagOrErr.getError();
|
||||
NoLBRMode = *FlagOrErr;
|
||||
|
||||
auto BATFlagOrErr = maybeParseBATFlag();
|
||||
if (!BATFlagOrErr)
|
||||
return BATFlagOrErr.getError();
|
||||
BATMode = *BATFlagOrErr;
|
||||
|
||||
if (!hasBranchData() && !hasMemData()) {
|
||||
Diag << "ERROR: no valid profile data found\n";
|
||||
return make_error_code(llvm::errc::io_error);
|
||||
}
|
||||
|
||||
if (NoLBRMode)
|
||||
return parseInNoLBRMode();
|
||||
|
||||
|
||||
@ -303,6 +303,9 @@ public:
|
||||
static ErrorOr<std::unique_ptr<DataReader>> readPerfData(StringRef Path,
|
||||
raw_ostream &Diag);
|
||||
|
||||
/// Mark all profile objects unused.
|
||||
void reset();
|
||||
|
||||
/// Parses the input bolt data file into internal data structures. We expect
|
||||
/// the file format to follow the syntax below.
|
||||
///
|
||||
@ -398,6 +401,11 @@ public:
|
||||
/// Return false only if we are running with profiling data that lacks LBR.
|
||||
bool hasLBR() const { return !NoLBRMode; }
|
||||
|
||||
/// Return true if the profiling data was collected in a bolted binary. This
|
||||
/// means we lose the ability to identify stale data at some branch locations,
|
||||
/// since we have to be more permissive in some cases.
|
||||
bool collectedInBoltedBinary() const { return BATMode; }
|
||||
|
||||
/// Return true if event named \p Name was used to collect this profile data.
|
||||
bool usesEvent(StringRef Name) const {
|
||||
for (auto I = EventNames.begin(), E = EventNames.end(); I != E; ++I) {
|
||||
@ -417,6 +425,7 @@ protected:
|
||||
|
||||
void reportError(StringRef ErrorMsg);
|
||||
bool expectAndConsumeFS();
|
||||
void consumeAllRemainingFS();
|
||||
bool checkAndConsumeNewLine();
|
||||
ErrorOr<StringRef> parseString(char EndChar, bool EndNl=false);
|
||||
ErrorOr<int64_t> parseNumberField(char EndChar, bool EndNl=false);
|
||||
@ -432,6 +441,7 @@ protected:
|
||||
ErrorOr<SampleInfo> parseSampleInfo();
|
||||
ErrorOr<MemInfo> parseMemInfo();
|
||||
ErrorOr<bool> maybeParseNoLBRFlag();
|
||||
ErrorOr<bool> maybeParseBATFlag();
|
||||
bool hasBranchData();
|
||||
bool hasMemData();
|
||||
|
||||
@ -448,6 +458,7 @@ protected:
|
||||
FuncsToSamplesMapTy FuncsToSamples;
|
||||
FuncsToMemEventsMapTy FuncsToMemEvents;
|
||||
bool NoLBRMode{false};
|
||||
bool BATMode{false};
|
||||
StringSet<> EventNames;
|
||||
static const char FieldSeparator = ' ';
|
||||
|
||||
|
||||
@ -40,7 +40,7 @@ namespace {
|
||||
// Returns the number of written bytes.
|
||||
uint64_t writeAddressRanges(
|
||||
MCObjectWriter *Writer,
|
||||
const DWARFAddressRangesVector &AddressRanges,
|
||||
const DebugAddressRangesVector &AddressRanges,
|
||||
const bool WriteRelativeRanges = false) {
|
||||
for (auto &Range : AddressRanges) {
|
||||
Writer->writeLE64(Range.LowPC);
|
||||
@ -62,26 +62,26 @@ DebugRangesSectionsWriter::DebugRangesSectionsWriter(BinaryContext *BC) {
|
||||
std::unique_ptr<MCObjectWriter>(BC->createObjectWriter(*RangesStream));
|
||||
|
||||
// Add an empty range as the first entry;
|
||||
SectionOffset += writeAddressRanges(Writer.get(), DWARFAddressRangesVector{});
|
||||
}
|
||||
|
||||
uint64_t DebugRangesSectionsWriter::addCURanges(
|
||||
uint64_t CUOffset,
|
||||
DWARFAddressRangesVector &&Ranges) {
|
||||
const auto RangesOffset = addRanges(Ranges);
|
||||
CUAddressRanges.emplace(CUOffset, std::move(Ranges));
|
||||
|
||||
return RangesOffset;
|
||||
SectionOffset += writeAddressRanges(Writer.get(), DebugAddressRangesVector{});
|
||||
}
|
||||
|
||||
uint64_t
|
||||
DebugRangesSectionsWriter::addRanges(const BinaryFunction *Function,
|
||||
DWARFAddressRangesVector &&Ranges) {
|
||||
DebugRangesSectionsWriter::addCURanges(uint64_t CUOffset,
|
||||
DebugAddressRangesVector &&Ranges) {
|
||||
const auto RangesOffset = addRanges(Ranges);
|
||||
|
||||
std::lock_guard<std::mutex> Lock(CUAddressRangesMutex);
|
||||
CUAddressRanges.emplace(CUOffset, std::move(Ranges));
|
||||
return RangesOffset;
|
||||
}
|
||||
|
||||
uint64_t DebugRangesSectionsWriter::addRanges(
|
||||
const BinaryFunction *Function, DebugAddressRangesVector &&Ranges,
|
||||
const BinaryFunction *&CachedFunction,
|
||||
std::map<DebugAddressRangesVector, uint64_t> &CachedRanges) {
|
||||
if (Ranges.empty())
|
||||
return getEmptyRangesOffset();
|
||||
|
||||
static const BinaryFunction *CachedFunction;
|
||||
|
||||
if (Function == CachedFunction) {
|
||||
const auto RI = CachedRanges.find(Ranges);
|
||||
if (RI != CachedRanges.end())
|
||||
@ -98,10 +98,13 @@ DebugRangesSectionsWriter::addRanges(const BinaryFunction *Function,
|
||||
}
|
||||
|
||||
uint64_t
|
||||
DebugRangesSectionsWriter::addRanges(const DWARFAddressRangesVector &Ranges) {
|
||||
DebugRangesSectionsWriter::addRanges(const DebugAddressRangesVector &Ranges) {
|
||||
if (Ranges.empty())
|
||||
return getEmptyRangesOffset();
|
||||
|
||||
// Reading the SectionOffset and updating it should be atomic to guarantee
|
||||
// unique and correct offsets in patches.
|
||||
std::lock_guard<std::mutex> Lock(WriterMutex);
|
||||
const auto EntryOffset = SectionOffset;
|
||||
SectionOffset += writeAddressRanges(Writer.get(), Ranges);
|
||||
|
||||
@ -165,14 +168,17 @@ uint64_t DebugLocWriter::addList(const DWARFDebugLoc::LocationList &LocList) {
|
||||
if (LocList.Entries.empty())
|
||||
return getEmptyListOffset();
|
||||
|
||||
// Reading the SectionOffset and updating it should be atomic to guarantee
|
||||
// unique and correct offsets in patches.
|
||||
std::lock_guard<std::mutex> Lock(WriterMutex);
|
||||
const auto EntryOffset = SectionOffset;
|
||||
|
||||
for (const auto &Entry : LocList.Entries) {
|
||||
Writer->writeLE64(Entry.Begin);
|
||||
Writer->writeLE64(Entry.End);
|
||||
Writer->writeLE16(Entry.Loc.size());
|
||||
Writer->writeBytes(
|
||||
StringRef(reinterpret_cast<const char *>(Entry.Loc.data()),
|
||||
Entry.Loc.size()));
|
||||
Writer->writeBytes(StringRef(
|
||||
reinterpret_cast<const char *>(Entry.Loc.data()), Entry.Loc.size()));
|
||||
SectionOffset += 2 * 8 + 2 + Entry.Loc.size();
|
||||
}
|
||||
Writer->writeLE64(0);
|
||||
@ -229,42 +235,29 @@ void SimpleBinaryPatcher::patchBinary(std::string &BinaryContents) {
|
||||
}
|
||||
}
|
||||
|
||||
void DebugAbbrevPatcher::addAttributePatch(const DWARFUnit *Unit,
|
||||
uint32_t AbbrevCode,
|
||||
dwarf::Attribute AttrTag,
|
||||
uint8_t NewAttrTag,
|
||||
uint8_t NewAttrForm) {
|
||||
assert(Unit && "No compile unit specified.");
|
||||
Patches[Unit].emplace_back(
|
||||
AbbrevAttrPatch{AbbrevCode, AttrTag, NewAttrTag, NewAttrForm});
|
||||
void DebugAbbrevPatcher::addAttributePatch(
|
||||
const DWARFAbbreviationDeclaration *Abbrev,
|
||||
dwarf::Attribute AttrTag,
|
||||
uint8_t NewAttrTag,
|
||||
uint8_t NewAttrForm) {
|
||||
assert(Abbrev && "no abbreviation specified");
|
||||
AbbrevPatches.emplace(
|
||||
AbbrevAttrPatch{Abbrev, AttrTag, NewAttrTag, NewAttrForm});
|
||||
}
|
||||
|
||||
void DebugAbbrevPatcher::patchBinary(std::string &Contents) {
|
||||
SimpleBinaryPatcher Patcher;
|
||||
|
||||
for (const auto &UnitPatchesPair : Patches) {
|
||||
const auto *Unit = UnitPatchesPair.first;
|
||||
const auto *UnitAbbreviations = Unit->getAbbreviations();
|
||||
assert(UnitAbbreviations &&
|
||||
"Compile unit doesn't have associated abbreviations.");
|
||||
const auto &UnitPatches = UnitPatchesPair.second;
|
||||
for (const auto &AttrPatch : UnitPatches) {
|
||||
const auto *AbbreviationDeclaration =
|
||||
UnitAbbreviations->getAbbreviationDeclaration(AttrPatch.Code);
|
||||
assert(AbbreviationDeclaration && "No abbreviation with given code.");
|
||||
const auto Attribute =
|
||||
AbbreviationDeclaration->findAttribute(AttrPatch.Attr);
|
||||
for (const auto &Patch : AbbrevPatches) {
|
||||
const auto Attribute = Patch.Abbrev->findAttribute(Patch.Attr);
|
||||
assert(Attribute && "Specified attribute doesn't occur in abbreviation.");
|
||||
|
||||
assert(Attribute && "Specified attribute doesn't occur in abbreviation.");
|
||||
// Because we're only handling standard values (i.e. no DW_FORM_GNU_* or
|
||||
// DW_AT_APPLE_*), they are all small (< 128) and encoded in a single
|
||||
// byte in ULEB128, otherwise it'll be more tricky as we may need to
|
||||
// grow or shrink the section.
|
||||
Patcher.addBytePatch(Attribute->AttrOffset,
|
||||
AttrPatch.NewAttr);
|
||||
Patcher.addBytePatch(Attribute->FormOffset,
|
||||
AttrPatch.NewForm);
|
||||
}
|
||||
// Because we're only handling standard values (i.e. no DW_FORM_GNU_* or
|
||||
// DW_AT_APPLE_*), they are all small (< 128) and encoded in a single
|
||||
// byte in ULEB128, otherwise it'll be more tricky as we may need to
|
||||
// grow or shrink the section.
|
||||
Patcher.addBytePatch(Attribute->AttrOffset, Patch.NewAttr);
|
||||
Patcher.addBytePatch(Attribute->FormOffset, Patch.NewForm);
|
||||
}
|
||||
Patcher.patchBinary(Contents);
|
||||
}
|
||||
|
||||
@ -20,26 +20,42 @@
|
||||
#include "llvm/Support/SMLoc.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <unordered_set>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "BinaryBasicBlock.h"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
class DWARFCompileUnit;
|
||||
class DWARFDebugInfoEntryMinimal;
|
||||
class MCObjectWriter;
|
||||
|
||||
namespace bolt {
|
||||
|
||||
class BinaryContext;
|
||||
class BasicBlockTable;
|
||||
class BinaryBasicBlock;
|
||||
class BinaryFunction;
|
||||
|
||||
/// Eeferences a row in a DWARFDebugLine::LineTable by the DWARF
|
||||
/// Address range representation. Takes less space than DWARFAddressRange.
struct DebugAddressRange {
  /// Lower bound of the range; zero when default-constructed.
  uint64_t LowPC{0};

  /// Upper bound of the range; zero when default-constructed.
  uint64_t HighPC{0};

  DebugAddressRange() = default;

  DebugAddressRange(uint64_t LowPC, uint64_t HighPC)
      : LowPC(LowPC), HighPC(HighPC) {}
};

/// Order ranges by LowPC first; ranges with equal LowPC are ordered by HighPC.
/// Written as an explicit two-field comparison so that no tuple helper (and
/// therefore no <tuple> include) is required.
static inline bool operator<(const DebugAddressRange &LHS,
                             const DebugAddressRange &RHS) {
  if (LHS.LowPC != RHS.LowPC)
    return LHS.LowPC < RHS.LowPC;
  return LHS.HighPC < RHS.HighPC;
}
|
||||
|
||||
/// DebugAddressRangesVector - represents a set of absolute address ranges.
|
||||
using DebugAddressRangesVector = SmallVector<DebugAddressRange, 2>;
|
||||
|
||||
/// References a row in a DWARFDebugLine::LineTable by the DWARF
|
||||
/// Context index of the DWARF Compile Unit that owns the Line Table and the row
|
||||
/// index. This is tied to our IR during disassembly so that we can later update
|
||||
/// .debug_line information. RowIndex has a base of 1, which means a RowIndex
|
||||
@ -84,14 +100,16 @@ public:
|
||||
DebugRangesSectionsWriter(BinaryContext *BC);
|
||||
|
||||
/// Add ranges for CU matching \p CUOffset and return offset into section.
|
||||
uint64_t addCURanges(uint64_t CUOffset, DWARFAddressRangesVector &&Ranges);
|
||||
uint64_t addCURanges(uint64_t CUOffset, DebugAddressRangesVector &&Ranges);
|
||||
|
||||
/// Add ranges with caching for \p Function.
|
||||
uint64_t addRanges(const BinaryFunction *Function,
|
||||
DWARFAddressRangesVector &&Ranges);
|
||||
uint64_t
|
||||
addRanges(const BinaryFunction *Function, DebugAddressRangesVector &&Ranges,
|
||||
const BinaryFunction *&CachedFunction,
|
||||
std::map<DebugAddressRangesVector, uint64_t> &CachedRanges);
|
||||
|
||||
/// Add ranges and return offset into section.
|
||||
uint64_t addRanges(const DWARFAddressRangesVector &Ranges);
|
||||
uint64_t addRanges(const DebugAddressRangesVector &Ranges);
|
||||
|
||||
/// Writes .debug_aranges with the added ranges to the MCObjectWriter.
|
||||
void writeArangesSection(MCObjectWriter *Writer) const;
|
||||
@ -106,7 +124,7 @@ public:
|
||||
uint64_t getEmptyRangesOffset() const { return EmptyRangesOffset; }
|
||||
|
||||
/// Map DWARFCompileUnit index to ranges.
|
||||
using CUAddressRangesType = std::map<uint64_t, DWARFAddressRangesVector>;
|
||||
using CUAddressRangesType = std::map<uint64_t, DebugAddressRangesVector>;
|
||||
|
||||
/// Return ranges for a given CU.
|
||||
const CUAddressRangesType &getCUAddressRanges() const {
|
||||
@ -124,6 +142,8 @@ private:
|
||||
|
||||
std::unique_ptr<MCObjectWriter> Writer;
|
||||
|
||||
std::mutex WriterMutex;
|
||||
|
||||
/// Current offset in the section (updated as new entries are written).
|
||||
/// Starts with 16 since the first 16 bytes are reserved for an empty range.
|
||||
uint32_t SectionOffset{0};
|
||||
@ -133,11 +153,10 @@ private:
|
||||
/// (first address, interval size).
|
||||
CUAddressRangesType CUAddressRanges;
|
||||
|
||||
std::mutex CUAddressRangesMutex;
|
||||
|
||||
/// Offset of an empty address ranges list.
|
||||
static constexpr uint64_t EmptyRangesOffset{0};
|
||||
|
||||
/// Cached used for de-duplicating entries for the same function.
|
||||
std::map<DWARFAddressRangesVector, uint64_t> CachedRanges;
|
||||
};
|
||||
|
||||
/// Serializes the .debug_loc DWARF section with LocationLists.
|
||||
@ -160,6 +179,8 @@ private:
|
||||
|
||||
std::unique_ptr<MCObjectWriter> Writer;
|
||||
|
||||
std::mutex WriterMutex;
|
||||
|
||||
/// Offset of an empty location list.
|
||||
static uint64_t const EmptyListOffset = 0;
|
||||
|
||||
@ -219,25 +240,33 @@ class DebugAbbrevPatcher : public BinaryPatcher {
|
||||
private:
|
||||
/// Patch of changing one attribute to another.
|
||||
struct AbbrevAttrPatch {
|
||||
uint32_t Code; // Code of abbreviation to be modified.
|
||||
const DWARFAbbreviationDeclaration *Abbrev;
|
||||
dwarf::Attribute Attr; // ID of attribute to be replaced.
|
||||
uint8_t NewAttr; // ID of the new attribute.
|
||||
uint8_t NewForm; // Form of the new attribute.
|
||||
uint8_t NewAttr; // ID of the new attribute.
|
||||
uint8_t NewForm; // Form of the new attribute.
|
||||
|
||||
bool operator==(const AbbrevAttrPatch &RHS) const {
|
||||
return Abbrev == RHS.Abbrev && Attr == RHS.Attr;
|
||||
}
|
||||
};
|
||||
|
||||
std::map<const DWARFUnit *, std::vector<AbbrevAttrPatch>> Patches;
|
||||
struct AbbrevHash {
|
||||
std::size_t operator()(const AbbrevAttrPatch &P) const {
|
||||
return std::hash<uint64_t>()(((uint64_t)P.Abbrev << 16) + P.Attr);
|
||||
}
|
||||
};
|
||||
|
||||
std::unordered_set<AbbrevAttrPatch, AbbrevHash> AbbrevPatches;
|
||||
|
||||
public:
|
||||
~DebugAbbrevPatcher() { }
|
||||
/// Adds a patch to change an attribute of an abbreviation that belongs to
|
||||
/// \p Unit to another attribute.
|
||||
/// \p AbbrevCode code of the abbreviation to be modified.
|
||||
/// Adds a patch to change an attribute of the abbreviation
|
||||
/// \p Abbrev the abbreviation to be modified.
|
||||
/// \p AttrTag ID of the attribute to be replaced.
|
||||
/// \p NewAttrTag ID of the new attribute.
|
||||
/// \p NewAttrForm Form of the new attribute.
|
||||
/// We only handle standard forms, that are encoded in a single byte.
|
||||
void addAttributePatch(const DWARFUnit *Unit,
|
||||
uint32_t AbbrevCode,
|
||||
void addAttributePatch(const DWARFAbbreviationDeclaration *Abbrev,
|
||||
dwarf::Attribute AttrTag,
|
||||
uint8_t NewAttrTag,
|
||||
uint8_t NewAttrForm);
|
||||
|
||||
259
src/DynoStats.cpp
Normal file
259
src/DynoStats.cpp
Normal file
@ -0,0 +1,259 @@
|
||||
//===--- DynoStats.cpp ----------------------------------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
|
||||
#include "DynoStats.h"
|
||||
#include "BinaryBasicBlock.h"
|
||||
#include "BinaryFunction.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/MC/MCInst.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include <algorithm>
|
||||
#include <numeric>
|
||||
#include <string>
|
||||
|
||||
#undef DEBUG_TYPE
|
||||
#define DEBUG_TYPE "bolt"
|
||||
|
||||
using namespace llvm;
|
||||
using namespace bolt;
|
||||
|
||||
namespace opts {
|
||||
|
||||
extern cl::OptionCategory BoltCategory;
|
||||
|
||||
static cl::opt<uint32_t>
|
||||
DynoStatsScale("dyno-stats-scale",
|
||||
cl::desc("scale to be applied while reporting dyno stats"),
|
||||
cl::Optional,
|
||||
cl::init(1),
|
||||
cl::Hidden,
|
||||
cl::cat(BoltCategory));
|
||||
|
||||
} // namespace opts
|
||||
|
||||
namespace llvm {
|
||||
namespace bolt {
|
||||
|
||||
// Out-of-class definition of the static constexpr description table that is
// declared and initialized inside class DynoStats. Before C++17 such a member
// needs a namespace-scope definition when it is odr-used.
constexpr const char *DynoStats::Desc[];
|
||||
|
||||
/// Lexicographically compare the stored counters of two stat sets, slot by
/// slot from FIRST_DYNO_STAT up to (but not including) LAST_DYNO_STAT.
bool DynoStats::operator<(const DynoStats &Other) const {
  const uint64_t *const Begin = &Stats[FIRST_DYNO_STAT];
  const uint64_t *const End = &Stats[LAST_DYNO_STAT];
  const uint64_t *const OtherBegin = &Other.Stats[FIRST_DYNO_STAT];
  const uint64_t *const OtherEnd = &Other.Stats[LAST_DYNO_STAT];
  return std::lexicographical_compare(Begin, End, OtherBegin, OtherEnd);
}
|
||||
|
||||
/// Two stat sets are equal when every stored counter in the range
/// [FIRST_DYNO_STAT, LAST_DYNO_STAT) matches.
bool DynoStats::operator==(const DynoStats &Other) const {
  const uint64_t *const Begin = &Stats[FIRST_DYNO_STAT];
  const uint64_t *const End = &Stats[LAST_DYNO_STAT];
  const uint64_t *const OtherBegin = &Other.Stats[FIRST_DYNO_STAT];
  return std::equal(Begin, End, OtherBegin);
}
|
||||
|
||||
bool DynoStats::lessThan(const DynoStats &Other,
|
||||
ArrayRef<Category> Keys) const {
|
||||
return std::lexicographical_compare(
|
||||
Keys.begin(), Keys.end(),
|
||||
Keys.begin(), Keys.end(),
|
||||
[this,&Other](const Category A, const Category) {
|
||||
return Stats[A] < Other.Stats[A];
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
/// Print one line per category, skipping the two reserved end markers.
///
/// Each line shows the counter multiplied by opts::DynoStatsScale followed by
/// the category description. When \p Other is non-null the line also carries
/// the relative change of this counter versus \p Other's, or " (=)" when the
/// two are equal. The AArch64 veneer line is printed only when
/// PrintAArch64Stats is set.
///
/// NOTE(review): this object's value is read from the raw Stats array while
/// \p Other's value goes through the const operator[], which computes derived
/// categories - confirm that the asymmetry is intended.
void DynoStats::print(raw_ostream &OS, const DynoStats *Other) const {
  auto printStatWithDelta = [&](const std::string &Name, uint64_t Stat,
                                uint64_t OtherStat) {
    // "%lld" consumes a long long, and uint64_t is not guaranteed to be that
    // type, so convert explicitly before handing the value to format().
    const auto Scaled = static_cast<long long>(Stat * opts::DynoStatsScale);
    OS << format("%'20lld : ", Scaled) << Name;
    if (Other) {
      if (Stat != OtherStat) {
        OtherStat = std::max(OtherStat, uint64_t(1)); // to prevent divide by 0
        OS << format(" (%+.1f%%)",
                     ( (float) Stat - (float) OtherStat ) * 100.0 /
                       (float) (OtherStat) );
      } else {
        OS << " (=)";
      }
    }
    OS << '\n';
  };

  for (auto Stat = DynoStats::FIRST_DYNO_STAT + 1;
       Stat < DynoStats::LAST_DYNO_STAT;
       ++Stat) {

    if (!PrintAArch64Stats && Stat == DynoStats::VENEER_CALLS_AARCH64)
      continue;

    printStatWithDelta(Desc[Stat], Stats[Stat], Other ? (*Other)[Stat] : 0);
  }
}
|
||||
|
||||
/// Add every category of \p Other into this object, skipping the two reserved
/// end markers. \p Other is read through its const operator[], so categories
/// that operator[] derives from other counters are added as computed values.
void DynoStats::operator+=(const DynoStats &Other) {
  auto Stat = DynoStats::FIRST_DYNO_STAT + 1;
  while (Stat < DynoStats::LAST_DYNO_STAT) {
    Stats[Stat] += Other[Stat];
    ++Stat;
  }
}
|
||||
|
||||
/// Compute dynamic execution statistics for a single function from its
/// profile.
///
/// All-zero statistics are returned when the function is not simple, has no
/// valid profile, or was folded. Otherwise every basic block in the layout
/// that has at least one non-pseudo instruction and a non-zero known execution
/// count contributes to the load/store, call, instruction and branch counters.
DynoStats getDynoStats(const BinaryFunction &BF) {
  auto &BC = BF.getBinaryContext();

  DynoStats Stats(/*PrintAArch64Stats*/ BC.isAArch64());

  // Return empty-stats about the function we don't completely understand.
  if (!BF.isSimple() || !BF.hasValidProfile())
    return Stats;

  // If the function was folded in non-relocation mode we keep its profile
  // for optimization. However, it should be excluded from the dyno stats.
  if (BF.isFolded())
    return Stats;

  // Update enumeration of basic blocks for correct detection of branch'
  // direction.
  BF.updateLayoutIndices();

  for (const auto &BB : BF.layout()) {
    // The basic block execution count equals to the sum of incoming branch
    // frequencies. This may deviate from the sum of outgoing branches of the
    // basic block especially since the block may contain a function that
    // does not return or a function that throws an exception.
    const uint64_t BBExecutionCount = BB->getKnownExecutionCount();

    // Ignore empty blocks and blocks that were not executed.
    if (BB->getNumNonPseudos() == 0 || BBExecutionCount == 0)
      continue;

    // Count AArch64 linker-inserted veneers
    if (BF.isAArch64Veneer())
      Stats[DynoStats::VENEER_CALLS_AARCH64] += BF.getKnownExecutionCount();

    // Count the number of calls by iterating through all instructions.
    for (const auto &Instr : *BB) {
      if (BC.MIB->isStore(Instr)) {
        Stats[DynoStats::STORES] += BBExecutionCount;
      }
      if (BC.MIB->isLoad(Instr)) {
        Stats[DynoStats::LOADS] += BBExecutionCount;
      }

      if (!BC.MIB->isCall(Instr))
        continue;

      // A conditional tail call is counted with its own taken count instead
      // of the execution count of the whole block.
      uint64_t CallFreq = BBExecutionCount;
      if (BC.MIB->getConditionalTailCall(Instr)) {
        CallFreq =
          BC.MIB->getAnnotationWithDefault<uint64_t>(Instr, "CTCTakenCount");
      }
      Stats[DynoStats::FUNCTION_CALLS] += CallFreq;
      if (BC.MIB->isIndirectCall(Instr)) {
        Stats[DynoStats::INDIRECT_CALLS] += CallFreq;
      } else if (const auto *CallSymbol = BC.MIB->getTargetSymbol(Instr)) {
        // Named Callee so that it does not shadow the function parameter BF.
        const auto *Callee = BC.getFunctionForSymbol(CallSymbol);
        if (Callee && Callee->isPLTFunction()) {
          Stats[DynoStats::PLT_CALLS] += CallFreq;

          // We don't process PLT functions and hence have to adjust relevant
          // dynostats here for:
          //
          //   jmp *GOT_ENTRY(%rip)
          //
          // NOTE: this is arch-specific.
          Stats[DynoStats::FUNCTION_CALLS] += CallFreq;
          Stats[DynoStats::INDIRECT_CALLS] += CallFreq;
          Stats[DynoStats::LOADS] += CallFreq;
          Stats[DynoStats::INSTRUCTIONS] += CallFreq;
        }
      }
    }

    Stats[DynoStats::INSTRUCTIONS] += BB->getNumNonPseudos() * BBExecutionCount;

    // Jump tables.
    // Blocks without non-pseudo instructions were skipped above, so a last
    // non-pseudo instruction is expected to exist here.
    const auto *LastInstr = BB->getLastNonPseudoInstr();
    if (BC.MIB->getJumpTable(*LastInstr)) {
      Stats[DynoStats::JUMP_TABLE_BRANCHES] += BBExecutionCount;
      DEBUG(
        static uint64_t MostFrequentJT;
        if (BBExecutionCount > MostFrequentJT) {
          MostFrequentJT = BBExecutionCount;
          dbgs() << "BOLT-INFO: most frequently executed jump table is in "
                 << "function " << BF << " in basic block " << BB->getName()
                 << " executed totally " << BBExecutionCount << " times.\n";
        }
      );
      continue;
    }

    if (BC.MIB->isIndirectBranch(*LastInstr) && !BC.MIB->isCall(*LastInstr)) {
      Stats[DynoStats::UNKNOWN_INDIRECT_BRANCHES] += BBExecutionCount;
      continue;
    }

    // Update stats for branches.
    const MCSymbol *TBB = nullptr;
    const MCSymbol *FBB = nullptr;
    MCInst *CondBranch = nullptr;
    MCInst *UncondBranch = nullptr;
    if (!BB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch)) {
      continue;
    }

    if (!CondBranch && !UncondBranch) {
      continue;
    }

    // Simple unconditional branch.
    if (!CondBranch) {
      Stats[DynoStats::UNCOND_BRANCHES] += BBExecutionCount;
      continue;
    }

    // CTCs: instruction annotations could be stripped, hence check the number
    // of successors to identify conditional tail calls.
    if (BB->succ_size() == 1) {
      if (BB->branch_info_begin() != BB->branch_info_end())
        Stats[DynoStats::UNCOND_BRANCHES] += BB->branch_info_begin()->Count;
      continue;
    }

    // Conditional branch that could be followed by an unconditional branch.
    // Counts without profile information are treated as zero.
    auto TakenCount = BB->getTakenBranchInfo().Count;
    if (TakenCount == BinaryBasicBlock::COUNT_NO_PROFILE)
      TakenCount = 0;

    auto NonTakenCount = BB->getFallthroughBranchInfo().Count;
    if (NonTakenCount == BinaryBasicBlock::COUNT_NO_PROFILE)
      NonTakenCount = 0;

    if (BF.isForwardBranch(BB, BB->getConditionalSuccessor(true))) {
      Stats[DynoStats::FORWARD_COND_BRANCHES] += BBExecutionCount;
      Stats[DynoStats::FORWARD_COND_BRANCHES_TAKEN] += TakenCount;
    } else {
      Stats[DynoStats::BACKWARD_COND_BRANCHES] += BBExecutionCount;
      Stats[DynoStats::BACKWARD_COND_BRANCHES_TAKEN] += TakenCount;
    }

    // The unconditional branch after the conditional one executes only when
    // the conditional branch is not taken.
    if (UncondBranch) {
      Stats[DynoStats::UNCOND_BRANCHES] += NonTakenCount;
    }
  }

  return Stats;
}
|
||||
|
||||
} // namespace bolt
|
||||
} // namespace llvm
|
||||
179
src/DynoStats.h
Normal file
179
src/DynoStats.h
Normal file
@ -0,0 +1,179 @@
|
||||
//===--- DynoStats.h ------------------------------------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_TOOLS_LLVM_BOLT_DYNO_STATS_H
|
||||
#define LLVM_TOOLS_LLVM_BOLT_DYNO_STATS_H
|
||||
|
||||
#include "BinaryFunction.h"
|
||||
#include "llvm/ADT/ArrayRef.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
namespace bolt {
|
||||
|
||||
/// Class encapsulating runtime statistics about an execution unit.
|
||||
class DynoStats {
|
||||
|
||||
#define DYNO_STATS\
|
||||
D(FIRST_DYNO_STAT, "<reserved>", Fn)\
|
||||
D(FORWARD_COND_BRANCHES, "executed forward branches", Fn)\
|
||||
D(FORWARD_COND_BRANCHES_TAKEN, "taken forward branches", Fn)\
|
||||
D(BACKWARD_COND_BRANCHES, "executed backward branches", Fn)\
|
||||
D(BACKWARD_COND_BRANCHES_TAKEN, "taken backward branches", Fn)\
|
||||
D(UNCOND_BRANCHES, "executed unconditional branches", Fn)\
|
||||
D(FUNCTION_CALLS, "all function calls", Fn)\
|
||||
D(INDIRECT_CALLS, "indirect calls", Fn)\
|
||||
D(PLT_CALLS, "PLT calls", Fn)\
|
||||
D(INSTRUCTIONS, "executed instructions", Fn)\
|
||||
D(LOADS, "executed load instructions", Fn)\
|
||||
D(STORES, "executed store instructions", Fn)\
|
||||
D(JUMP_TABLE_BRANCHES, "taken jump table branches", Fn)\
|
||||
D(UNKNOWN_INDIRECT_BRANCHES, "taken unknown indirect branches", Fn)\
|
||||
D(ALL_BRANCHES, "total branches",\
|
||||
Fadd(ALL_CONDITIONAL, UNCOND_BRANCHES))\
|
||||
D(ALL_TAKEN, "taken branches",\
|
||||
Fadd(TAKEN_CONDITIONAL, UNCOND_BRANCHES))\
|
||||
D(NONTAKEN_CONDITIONAL, "non-taken conditional branches",\
|
||||
Fsub(ALL_CONDITIONAL, TAKEN_CONDITIONAL))\
|
||||
D(TAKEN_CONDITIONAL, "taken conditional branches",\
|
||||
Fadd(FORWARD_COND_BRANCHES_TAKEN, BACKWARD_COND_BRANCHES_TAKEN))\
|
||||
D(ALL_CONDITIONAL, "all conditional branches",\
|
||||
Fadd(FORWARD_COND_BRANCHES, BACKWARD_COND_BRANCHES))\
|
||||
D(VENEER_CALLS_AARCH64, "linker-inserted veneer calls", Fn)\
|
||||
D(LAST_DYNO_STAT, "<reserved>", 0)
|
||||
|
||||
public:
|
||||
#define D(name, ...) name,
|
||||
enum Category : uint8_t { DYNO_STATS };
|
||||
#undef D
|
||||
|
||||
|
||||
private:
|
||||
uint64_t Stats[LAST_DYNO_STAT+1];
|
||||
bool PrintAArch64Stats;
|
||||
|
||||
#define D(name, desc, ...) desc,
|
||||
static constexpr const char *Desc[] = { DYNO_STATS };
|
||||
#undef D
|
||||
|
||||
public:
|
||||
DynoStats(bool PrintAArch64Stats) {
|
||||
this->PrintAArch64Stats = PrintAArch64Stats;
|
||||
for (auto Stat = FIRST_DYNO_STAT + 0; Stat < LAST_DYNO_STAT; ++Stat)
|
||||
Stats[Stat] = 0;
|
||||
}
|
||||
|
||||
uint64_t &operator[](size_t I) {
|
||||
assert(I > FIRST_DYNO_STAT && I < LAST_DYNO_STAT &&
|
||||
"index out of bounds");
|
||||
return Stats[I];
|
||||
}
|
||||
|
||||
uint64_t operator[](size_t I) const {
|
||||
switch (I) {
|
||||
#define D(name, desc, func) \
|
||||
case name: \
|
||||
return func;
|
||||
#define Fn Stats[I]
|
||||
#define Fadd(a, b) operator[](a) + operator[](b)
|
||||
#define Fsub(a, b) operator[](a) - operator[](b)
|
||||
#define F(a) operator[](a)
|
||||
#define Radd(a, b) (a + b)
|
||||
#define Rsub(a, b) (a - b)
|
||||
DYNO_STATS
|
||||
#undef Rsub
|
||||
#undef Radd
|
||||
#undef F
|
||||
#undef Fsub
|
||||
#undef Fadd
|
||||
#undef Fn
|
||||
#undef D
|
||||
default:
|
||||
llvm_unreachable("index out of bounds");
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void print(raw_ostream &OS, const DynoStats *Other = nullptr) const;
|
||||
|
||||
void operator+=(const DynoStats &Other);
|
||||
bool operator<(const DynoStats &Other) const;
|
||||
bool operator==(const DynoStats &Other) const;
|
||||
bool operator!=(const DynoStats &Other) const { return !operator==(Other); }
|
||||
bool lessThan(const DynoStats &Other, ArrayRef<Category> Keys) const;
|
||||
|
||||
static const char* Description(const Category C) {
|
||||
return Desc[C];
|
||||
}
|
||||
};
|
||||
|
||||
/// Stream \p Stats to \p OS in the plain format, without a comparison
/// against another stat set.
inline raw_ostream &operator<<(raw_ostream &OS, const DynoStats &Stats) {
  const DynoStats *const NoBaseline = nullptr;
  Stats.print(OS, NoBaseline);
  return OS;
}
|
||||
|
||||
/// Sum of two stat sets.
/// NOTE(review): only the declaration is visible here; the definition is
/// presumably provided in another translation unit - confirm it exists.
DynoStats operator+(const DynoStats &A, const DynoStats &B);
|
||||
|
||||
/// Return dynostats for the function.
|
||||
///
|
||||
/// The function relies on branch instructions being in-sync with CFG for
|
||||
/// branch instructions stats. Thus it is better to call it after
|
||||
/// fixBranches().
|
||||
DynoStats getDynoStats(const BinaryFunction &BF);
|
||||
|
||||
/// Return program-wide dynostats.
|
||||
template <typename FuncsType>
|
||||
inline DynoStats getDynoStats(const FuncsType &Funcs) {
|
||||
bool IsAArch64 = Funcs.begin()->second.getBinaryContext().isAArch64();
|
||||
DynoStats dynoStats(IsAArch64);
|
||||
for (auto &BFI : Funcs) {
|
||||
auto &BF = BFI.second;
|
||||
if (BF.isSimple()) {
|
||||
dynoStats += getDynoStats(BF);
|
||||
}
|
||||
}
|
||||
return dynoStats;
|
||||
}
|
||||
|
||||
/// Call a function with optional before and after dynostats printing.
|
||||
template <typename FnType, typename FuncsType>
|
||||
inline void
|
||||
callWithDynoStats(FnType &&Func,
|
||||
const FuncsType &Funcs,
|
||||
StringRef Phase,
|
||||
const bool Flag) {
|
||||
bool IsAArch64 = Funcs.begin()->second.getBinaryContext().isAArch64();
|
||||
DynoStats DynoStatsBefore(IsAArch64);
|
||||
if (Flag) {
|
||||
DynoStatsBefore = getDynoStats(Funcs);
|
||||
}
|
||||
|
||||
Func();
|
||||
|
||||
if (Flag) {
|
||||
const auto DynoStatsAfter = getDynoStats(Funcs);
|
||||
const auto Changed = (DynoStatsAfter != DynoStatsBefore);
|
||||
outs() << "BOLT-INFO: program-wide dynostats after running "
|
||||
<< Phase << (Changed ? "" : " (no change)") << ":\n\n"
|
||||
<< DynoStatsBefore << '\n';
|
||||
if (Changed) {
|
||||
DynoStatsAfter.print(outs(), &DynoStatsBefore);
|
||||
}
|
||||
outs() << '\n';
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace bolt
|
||||
} // namespace llvm
|
||||
|
||||
#endif
|
||||
@ -266,7 +266,7 @@ void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
|
||||
return;
|
||||
}
|
||||
if (TTypeEncoding & DW_EH_PE_indirect) {
|
||||
auto PointerOrErr = BC.extractPointerAtAddress(TypeAddress);
|
||||
auto PointerOrErr = BC.getPointerAtAddress(TypeAddress);
|
||||
assert(PointerOrErr && "failed to decode indirect address");
|
||||
TypeAddress = *PointerOrErr;
|
||||
}
|
||||
@ -349,9 +349,8 @@ void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
|
||||
if ((TTypeEncoding & DW_EH_PE_pcrel) && (TypeAddress == TTEntryAddress)) {
|
||||
TypeAddress = 0;
|
||||
}
|
||||
if (TypeAddress &&
|
||||
(TTypeEncoding & DW_EH_PE_indirect)) {
|
||||
auto PointerOrErr = BC.extractPointerAtAddress(TypeAddress);
|
||||
if (TypeAddress && (TTypeEncoding & DW_EH_PE_indirect)) {
|
||||
auto PointerOrErr = BC.getPointerAtAddress(TypeAddress);
|
||||
assert(PointerOrErr && "failed to decode indirect address");
|
||||
TypeAddress = *PointerOrErr;
|
||||
}
|
||||
@ -431,9 +430,14 @@ void BinaryFunction::updateEHRanges() {
|
||||
continue;
|
||||
|
||||
// Same symbol is used for the beginning and the end of the range.
|
||||
const MCSymbol *EHSymbol = BC.Ctx->createTempSymbol("EH", true);
|
||||
const MCSymbol *EHSymbol;
|
||||
MCInst EHLabel;
|
||||
BC.MIB->createEHLabel(EHLabel, EHSymbol, BC.Ctx.get());
|
||||
{
|
||||
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
|
||||
EHSymbol = BC.Ctx->createTempSymbol("EH", true);
|
||||
BC.MIB->createEHLabel(EHLabel, EHSymbol, BC.Ctx.get());
|
||||
}
|
||||
|
||||
II = std::next(BB->insertPseudoInstr(II, EHLabel));
|
||||
|
||||
// At this point we could be in one of the following states:
|
||||
@ -526,42 +530,19 @@ void BinaryFunction::emitLSDA(MCStreamer *Streamer, bool EmitColdPart) {
|
||||
// a landing pad, this means that the first landing pad offset will be 0.
|
||||
// As a result, an exception handling runtime will ignore this landing pad,
|
||||
// because zero offset denotes the absence of a landing pad.
|
||||
// For this reason, we emit LPStart value of 0 and output an absolute value
|
||||
// of the landing pad in the table.
|
||||
//
|
||||
// To workaround this issue, we issue a special LPStart for cold fragments
|
||||
// that is equal to FDE start minus 1 byte.
|
||||
//
|
||||
// Note that main function fragment cannot start with a landing pad and we
|
||||
// omit LPStart.
|
||||
const MCExpr *LPStartExpr = nullptr;
|
||||
std::function<void(const MCSymbol *)> emitLandingPad;
|
||||
if (EmitColdPart) {
|
||||
Streamer->EmitIntValue(dwarf::DW_EH_PE_udata4, 1); // LPStart format
|
||||
LPStartExpr = MCBinaryExpr::createSub(
|
||||
MCSymbolRefExpr::create(StartSymbol, *BC.Ctx.get()),
|
||||
MCConstantExpr::create(1, *BC.Ctx.get()),
|
||||
*BC.Ctx.get());
|
||||
Streamer->EmitValue(LPStartExpr, 4);
|
||||
emitLandingPad = [&](const MCSymbol *LPSymbol) {
|
||||
if (!LPSymbol) {
|
||||
Streamer->EmitIntValue(0, 4);
|
||||
return;
|
||||
}
|
||||
Streamer->EmitValue(MCBinaryExpr::createSub(
|
||||
MCSymbolRefExpr::create(LPSymbol, *BC.Ctx.get()),
|
||||
LPStartExpr,
|
||||
*BC.Ctx.get()),
|
||||
4);
|
||||
};
|
||||
} else {
|
||||
Streamer->EmitIntValue(dwarf::DW_EH_PE_omit, 1); // LPStart format
|
||||
emitLandingPad = [&](const MCSymbol *LPSymbol) {
|
||||
if (!LPSymbol) {
|
||||
Streamer->EmitIntValue(0, 4);
|
||||
return;
|
||||
}
|
||||
Streamer->emitAbsoluteSymbolDiff(LPSymbol, StartSymbol, 4);
|
||||
};
|
||||
}
|
||||
// FIXME: this may break PIEs and DSOs where the base address is not 0.
|
||||
Streamer->EmitIntValue(dwarf::DW_EH_PE_udata4, 1); // LPStart format
|
||||
Streamer->EmitIntValue(0, 4);
|
||||
auto emitLandingPad = [&](const MCSymbol *LPSymbol) {
|
||||
if (!LPSymbol) {
|
||||
Streamer->EmitIntValue(0, 4);
|
||||
return;
|
||||
}
|
||||
Streamer->EmitSymbolValue(LPSymbol, 4);
|
||||
};
|
||||
|
||||
Streamer->EmitIntValue(TTypeEncoding, 1); // TType format
|
||||
|
||||
@ -697,17 +678,6 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const {
|
||||
return true;
|
||||
|
||||
const FDE &CurFDE = *I->second;
|
||||
if (Function.getSize() != CurFDE.getAddressRange()) {
|
||||
if (opts::Verbosity >= 1) {
|
||||
errs() << "BOLT-WARNING: CFI information size mismatch for function \""
|
||||
<< Function << "\""
|
||||
<< format(": Function size is %dB, CFI covers "
|
||||
"%dB\n",
|
||||
Function.getSize(), CurFDE.getAddressRange());
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
auto LSDA = CurFDE.getLSDAAddress();
|
||||
Function.setLSDAAddress(LSDA ? *LSDA : 0);
|
||||
|
||||
@ -868,7 +838,8 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const {
|
||||
return false;
|
||||
default:
|
||||
if (opts::Verbosity >= 1) {
|
||||
errs() << "BOLT-WARNING: Unrecognized CFI instruction\n";
|
||||
errs() << "BOLT-WARNING: Unrecognized CFI instruction: "
|
||||
<< Instr.Opcode << '\n';
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
110
src/ExecutableFileMemoryManager.cpp
Normal file
110
src/ExecutableFileMemoryManager.cpp
Normal file
@ -0,0 +1,110 @@
|
||||
//===--- ExecutableFileMemoryManager.cpp ----------------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "ExecutableFileMemoryManager.h"
|
||||
#include "RewriteInstance.h"
|
||||
|
||||
#undef DEBUG_TYPE
|
||||
#define DEBUG_TYPE "efmm"
|
||||
|
||||
using namespace llvm;
|
||||
using namespace object;
|
||||
using namespace bolt;
|
||||
|
||||
namespace llvm {
|
||||
|
||||
namespace bolt {
|
||||
|
||||
uint8_t *ExecutableFileMemoryManager::allocateSection(intptr_t Size,
|
||||
unsigned Alignment,
|
||||
unsigned SectionID,
|
||||
StringRef SectionName,
|
||||
bool IsCode,
|
||||
bool IsReadOnly) {
|
||||
// Register a debug section as a note section.
|
||||
if (!ObjectsLoaded && RewriteInstance::isDebugSection(SectionName)) {
|
||||
uint8_t *DataCopy = new uint8_t[Size];
|
||||
auto &Section = BC.registerOrUpdateNoteSection(SectionName,
|
||||
DataCopy,
|
||||
Size,
|
||||
Alignment);
|
||||
Section.setSectionID(SectionID);
|
||||
assert(!Section.isAllocatable() && "note sections cannot be allocatable");
|
||||
return DataCopy;
|
||||
}
|
||||
|
||||
uint8_t *Ret;
|
||||
if (IsCode) {
|
||||
Ret = SectionMemoryManager::allocateCodeSection(Size, Alignment,
|
||||
SectionID, SectionName);
|
||||
} else {
|
||||
Ret = SectionMemoryManager::allocateDataSection(Size, Alignment,
|
||||
SectionID, SectionName,
|
||||
IsReadOnly);
|
||||
}
|
||||
|
||||
const auto Flags = BinarySection::getFlags(IsReadOnly, IsCode, true);
|
||||
SmallVector<char, 256> Buf;
|
||||
if (ObjectsLoaded > 0)
|
||||
SectionName = (Twine(SectionName) + ".bolt.extra." + Twine(ObjectsLoaded))
|
||||
.toStringRef(Buf);
|
||||
|
||||
auto &Section = BC.registerOrUpdateSection(SectionName,
|
||||
ELF::SHT_PROGBITS,
|
||||
Flags,
|
||||
Ret,
|
||||
Size,
|
||||
Alignment);
|
||||
Section.setSectionID(SectionID);
|
||||
assert(Section.isAllocatable() &&
|
||||
"verify that allocatable is marked as allocatable");
|
||||
|
||||
DEBUG(dbgs() << "BOLT: allocating " << (Section.isLocal() ? "local " : "")
|
||||
<< (IsCode ? "code" : (IsReadOnly ? "read-only data" : "data"))
|
||||
<< " section : " << SectionName
|
||||
<< " with size " << Size << ", alignment " << Alignment
|
||||
<< " at 0x" << Ret << ", ID = " << SectionID << "\n");
|
||||
|
||||
return Ret;
|
||||
}
|
||||
|
||||
/// Notifier for non-allocatable (note) section.
|
||||
uint8_t *ExecutableFileMemoryManager::recordNoteSection(
|
||||
const uint8_t *Data,
|
||||
uintptr_t Size,
|
||||
unsigned Alignment,
|
||||
unsigned SectionID,
|
||||
StringRef SectionName) {
|
||||
DEBUG(dbgs() << "BOLT: note section "
|
||||
<< SectionName
|
||||
<< " with size " << Size << ", alignment " << Alignment
|
||||
<< " at 0x"
|
||||
<< Twine::utohexstr(reinterpret_cast<uint64_t>(Data)) << '\n');
|
||||
auto &Section = BC.registerOrUpdateNoteSection(SectionName,
|
||||
copyByteArray(Data, Size),
|
||||
Size,
|
||||
Alignment);
|
||||
Section.setSectionID(SectionID);
|
||||
assert(!Section.isAllocatable() && "note sections cannot be allocatable");
|
||||
return Section.getOutputData();
|
||||
}
|
||||
|
||||
bool ExecutableFileMemoryManager::finalizeMemory(std::string *ErrMsg) {
|
||||
DEBUG(dbgs() << "BOLT: finalizeMemory()\n");
|
||||
++ObjectsLoaded;
|
||||
return SectionMemoryManager::finalizeMemory(ErrMsg);
|
||||
}
|
||||
|
||||
/// Out-of-line destructor; performs no work of its own.
ExecutableFileMemoryManager::~ExecutableFileMemoryManager() = default;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
100
src/ExecutableFileMemoryManager.h
Normal file
100
src/ExecutableFileMemoryManager.h
Normal file
@ -0,0 +1,100 @@
|
||||
//===--- ExecutableFileMemoryManager.h ------------------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_TOOLS_LLVM_BOLT_EXECUTABLE_FILE_MEMORY_MANAGER_H
|
||||
#define LLVM_TOOLS_LLVM_BOLT_EXECUTABLE_FILE_MEMORY_MANAGER_H
|
||||
|
||||
#include "BinaryContext.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/ADT/Twine.h"
|
||||
#include "llvm/ExecutionEngine/SectionMemoryManager.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
namespace bolt {
|
||||
|
||||
struct SegmentInfo {
|
||||
uint64_t Address; /// Address of the segment in memory.
|
||||
uint64_t Size; /// Size of the segment in memory.
|
||||
uint64_t FileOffset; /// Offset in the file.
|
||||
uint64_t FileSize; /// Size in file.
|
||||
|
||||
void print(raw_ostream &OS) const {
|
||||
OS << "SegmentInfo { Address: 0x"
|
||||
<< Twine::utohexstr(Address) << ", Size: 0x"
|
||||
<< Twine::utohexstr(Size) << ", FileOffset: 0x"
|
||||
<< Twine::utohexstr(FileOffset) << ", FileSize: 0x"
|
||||
<< Twine::utohexstr(FileSize) << "}";
|
||||
};
|
||||
};
|
||||
|
||||
/// Stream insertion for SegmentInfo: forwards to SegmentInfo::print() and
/// returns \p OS so that insertions can be chained.
inline raw_ostream &operator<<(raw_ostream &OS, const SegmentInfo &SegInfo) {
  SegInfo.print(OS);
  return OS;
}
|
||||
|
||||
/// Class responsible for allocating and managing code and data sections.
|
||||
class ExecutableFileMemoryManager : public SectionMemoryManager {
|
||||
private:
|
||||
uint8_t *allocateSection(intptr_t Size,
|
||||
unsigned Alignment,
|
||||
unsigned SectionID,
|
||||
StringRef SectionName,
|
||||
bool IsCode,
|
||||
bool IsReadOnly);
|
||||
BinaryContext &BC;
|
||||
bool AllowStubs;
|
||||
|
||||
public:
|
||||
// Our linker's main purpose is to handle a single object file, created
|
||||
// by RewriteInstance after reading the input binary and reordering it.
|
||||
// After objects finish loading, we increment this. Therefore, whenever
|
||||
// this is greater than zero, we are dealing with additional objects that
|
||||
// will not be managed by BinaryContext but only exist to support linking
|
||||
// user-supplied objects into the main input executable.
|
||||
uint32_t ObjectsLoaded{0};
|
||||
|
||||
/// [start memory address] -> [segment info] mapping.
|
||||
std::map<uint64_t, SegmentInfo> SegmentMapInfo;
|
||||
|
||||
ExecutableFileMemoryManager(BinaryContext &BC, bool AllowStubs)
|
||||
: BC(BC), AllowStubs(AllowStubs) {}
|
||||
|
||||
~ExecutableFileMemoryManager();
|
||||
|
||||
uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
|
||||
unsigned SectionID,
|
||||
StringRef SectionName) override {
|
||||
return allocateSection(Size, Alignment, SectionID, SectionName,
|
||||
/*IsCode=*/true, true);
|
||||
}
|
||||
|
||||
uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
|
||||
unsigned SectionID, StringRef SectionName,
|
||||
bool IsReadOnly) override {
|
||||
return allocateSection(Size, Alignment, SectionID, SectionName,
|
||||
/*IsCode=*/false, IsReadOnly);
|
||||
}
|
||||
|
||||
uint8_t *recordNoteSection(const uint8_t *Data, uintptr_t Size,
|
||||
unsigned Alignment, unsigned SectionID,
|
||||
StringRef SectionName) override;
|
||||
|
||||
bool allowStubAllocation() const override { return AllowStubs; }
|
||||
|
||||
bool finalizeMemory(std::string *ErrMsg = nullptr) override;
|
||||
};
|
||||
|
||||
} // namespace bolt
|
||||
} // namespace llvm
|
||||
|
||||
#endif
|
||||
@ -10,6 +10,7 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "JumpTable.h"
#include "BinaryFunction.h"
#include "BinarySection.h"
#include "Relocation.h"
#include "llvm/MC/MCStreamer.h"
#include <utility>
|
||||
@ -27,8 +28,27 @@ extern cl::opt<JumpTableSupportLevel> JumpTables;
|
||||
extern cl::opt<unsigned> Verbosity;
|
||||
}
|
||||
|
||||
/// Construct a jump table named \p Name at \p Address inside \p Section.
///
/// \param Name      name passed on to the BinaryData base.
/// \param Address   address passed on to the BinaryData base.
/// \param EntrySize size of one entry; also the initial OutputEntrySize.
/// \param Type      kind of jump table.
/// \param Labels    label map; its contents are moved into the new object.
/// \param BF        function recorded as the parent of this jump table.
/// \param Section   section passed on to the BinaryData base.
JumpTable::JumpTable(StringRef Name,
                     uint64_t Address,
                     std::size_t EntrySize,
                     JumpTableType Type,
                     LabelMapType &&Labels,
                     BinaryFunction &BF,
                     BinarySection &Section)
  : BinaryData(Name, Address, 0, EntrySize, Section),
    EntrySize(EntrySize),
    OutputEntrySize(EntrySize),
    Type(Type),
    // A named rvalue-reference parameter is an lvalue, so without std::move
    // the whole map would be copied.
    Labels(std::move(Labels)),
    Parent(&BF) {
}
|
||||
|
||||
std::pair<size_t, size_t>
|
||||
JumpTable::getEntriesForAddress(const uint64_t Addr) const {
|
||||
// Check if this is not an address, but a cloned JT id
|
||||
if ((int64_t)Addr < 0ll)
|
||||
return std::make_pair(0, Entries.size());
|
||||
|
||||
const uint64_t InstOffset = Addr - getAddress();
|
||||
size_t StartIndex = 0, EndIndex = 0;
|
||||
uint64_t Offset = 0;
|
||||
@ -55,13 +75,12 @@ JumpTable::getEntriesForAddress(const uint64_t Addr) const {
|
||||
return std::make_pair(StartIndex, EndIndex);
|
||||
}
|
||||
|
||||
bool JumpTable::replaceDestination(uint64_t JTAddress,
|
||||
const MCSymbol *OldDest,
|
||||
bool JumpTable::replaceDestination(uint64_t JTAddress, const MCSymbol *OldDest,
|
||||
MCSymbol *NewDest) {
|
||||
bool Patched{false};
|
||||
const auto Range = getEntriesForAddress(JTAddress);
|
||||
for (auto I = &Entries[Range.first], E = &Entries[Range.second];
|
||||
I != E; ++I) {
|
||||
for (auto I = &Entries[Range.first], E = &Entries[Range.second]; I != E;
|
||||
++I) {
|
||||
auto &Entry = *I;
|
||||
if (Entry == OldDest) {
|
||||
Patched = true;
|
||||
@ -153,16 +172,20 @@ uint64_t JumpTable::emit(MCStreamer *Streamer,
|
||||
|
||||
void JumpTable::print(raw_ostream &OS) const {
|
||||
uint64_t Offset = 0;
|
||||
if (Type == JTT_PIC)
|
||||
OS << "PIC ";
|
||||
OS << "Jump table " << getName() << " for function " << *Parent << " at 0x"
|
||||
<< Twine::utohexstr(getAddress()) << " with a total count of " << Count
|
||||
<< ":\n";
|
||||
for (const auto EntryOffset : OffsetEntries) {
|
||||
OS << " " << Twine::utohexstr(EntryOffset) << '\n';
|
||||
}
|
||||
for (const auto *Entry : Entries) {
|
||||
auto LI = Labels.find(Offset);
|
||||
if (LI != Labels.end()) {
|
||||
OS << "Jump Table " << LI->second->getName() << " at @0x"
|
||||
<< Twine::utohexstr(getAddress()+Offset);
|
||||
if (Offset) {
|
||||
OS << " (possibly part of larger jump table):\n";
|
||||
} else {
|
||||
OS << " with total count of " << Count << ":\n";
|
||||
}
|
||||
if (Offset && LI != Labels.end()) {
|
||||
OS << "Jump Table " << LI->second->getName() << " at 0x"
|
||||
<< Twine::utohexstr(getAddress() + Offset)
|
||||
<< " (possibly part of larger jump table):\n";
|
||||
}
|
||||
OS << format(" 0x%04" PRIx64 " : ", Offset) << Entry->getName();
|
||||
if (!Counts.empty()) {
|
||||
@ -174,18 +197,3 @@ void JumpTable::print(raw_ostream &OS) const {
|
||||
}
|
||||
OS << "\n\n";
|
||||
}
|
||||
|
||||
JumpTable::JumpTable(StringRef Name,
|
||||
uint64_t Address,
|
||||
std::size_t EntrySize,
|
||||
JumpTableType Type,
|
||||
decltype(OffsetEntries) &&OffsetEntries,
|
||||
decltype(Labels) &&Labels,
|
||||
BinarySection &Section)
|
||||
: BinaryData(Name, Address, 0, EntrySize, Section),
|
||||
EntrySize(EntrySize),
|
||||
OutputEntrySize(EntrySize),
|
||||
Type(Type),
|
||||
OffsetEntries(OffsetEntries),
|
||||
Labels(Labels)
|
||||
{ }
|
||||
|
||||
@ -30,11 +30,19 @@ enum JumpTableSupportLevel : char {
|
||||
JTS_AGGRESSIVE = 4, /// Aggressive splitting of jump tables.
|
||||
};
|
||||
|
||||
class BinaryFunction;
|
||||
|
||||
/// Representation of a jump table.
|
||||
///
|
||||
/// The jump table may include other jump tables that are referenced by
|
||||
/// a different label at a different offset in this jump table.
|
||||
class JumpTable : public BinaryData {
|
||||
friend class BinaryContext;
|
||||
|
||||
JumpTable() = delete;
|
||||
JumpTable(const JumpTable &) = delete;
|
||||
JumpTable &operator=(const JumpTable &) = delete;
|
||||
|
||||
public:
|
||||
enum JumpTableType : char {
|
||||
JTT_NORMAL,
|
||||
@ -60,7 +68,8 @@ public:
|
||||
std::vector<MCSymbol *> Entries;
|
||||
|
||||
/// All the entries as offsets into a function. Invalid after CFG is built.
|
||||
std::vector<uint64_t> OffsetEntries;
|
||||
using OffsetsType = std::vector<uint64_t>;
|
||||
OffsetsType OffsetEntries;
|
||||
|
||||
/// Map <Offset> -> <Label> used for embedded jump tables. Label at 0 offset
|
||||
/// is the main label for the jump table.
|
||||
@ -75,6 +84,20 @@ public:
|
||||
/// Total number of times this jump table was used.
|
||||
uint64_t Count{0};
|
||||
|
||||
/// BinaryFunction this jump table belongs to.
|
||||
BinaryFunction *Parent{nullptr};
|
||||
|
||||
private:
|
||||
/// Constructor should only be called by a BinaryContext.
|
||||
JumpTable(StringRef Name,
|
||||
uint64_t Address,
|
||||
std::size_t EntrySize,
|
||||
JumpTableType Type,
|
||||
LabelMapType &&Labels,
|
||||
BinaryFunction &BF,
|
||||
BinarySection &Section);
|
||||
|
||||
public:
|
||||
/// Return the size of the jump table.
|
||||
uint64_t getSize() const {
|
||||
return std::max(OffsetEntries.size(), Entries.size()) * EntrySize;
|
||||
@ -89,15 +112,6 @@ public:
|
||||
/// starting at (or containing) 'Addr'.
|
||||
std::pair<size_t, size_t> getEntriesForAddress(const uint64_t Addr) const;
|
||||
|
||||
/// Constructor.
|
||||
JumpTable(StringRef Name,
|
||||
uint64_t Address,
|
||||
std::size_t EntrySize,
|
||||
JumpTableType Type,
|
||||
decltype(OffsetEntries) &&OffsetEntries,
|
||||
LabelMapType &&Labels,
|
||||
BinarySection &Section);
|
||||
|
||||
virtual bool isJumpTable() const override { return true; }
|
||||
|
||||
/// Change all entries of the jump table in \p JTAddress pointing to
|
||||
|
||||
@ -81,7 +81,7 @@ private:
|
||||
template <typename ValueType>
|
||||
class MCSimpleAnnotation : public MCAnnotation {
|
||||
public:
|
||||
const ValueType &getValue() const { return Value; }
|
||||
ValueType &getValue() { return Value; }
|
||||
bool equals(const MCAnnotation &Other) const override {
|
||||
return Value == static_cast<const MCSimpleAnnotation &>(Other).Value;
|
||||
}
|
||||
|
||||
@ -148,12 +148,13 @@ int64_t MCPlusBuilder::getGnuArgsSize(const MCInst &Inst) const {
|
||||
return *Value;
|
||||
}
|
||||
|
||||
void MCPlusBuilder::addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize) {
|
||||
void MCPlusBuilder::addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize,
|
||||
AllocatorIdTy AllocId) {
|
||||
assert(GnuArgsSize >= 0 && "cannot set GNU_args_size to negative value");
|
||||
assert(getGnuArgsSize(Inst) == -1LL && "GNU_args_size already set");
|
||||
assert(isInvoke(Inst) && "GNU_args_size can only be set for invoke");
|
||||
|
||||
setAnnotationOpValue(Inst, MCAnnotation::kGnuArgsSize, GnuArgsSize);
|
||||
setAnnotationOpValue(Inst, MCAnnotation::kGnuArgsSize, GnuArgsSize, AllocId);
|
||||
}
|
||||
|
||||
uint64_t MCPlusBuilder::getJumpTable(const MCInst &Inst) const {
|
||||
@ -163,13 +164,24 @@ uint64_t MCPlusBuilder::getJumpTable(const MCInst &Inst) const {
|
||||
return *Value;
|
||||
}
|
||||
|
||||
uint16_t MCPlusBuilder::getJumpTableIndexReg(const MCInst &Inst) const {
|
||||
return getAnnotationAs<uint16_t>(Inst, "JTIndexReg");
|
||||
}
|
||||
|
||||
bool MCPlusBuilder::setJumpTable(MCInst &Inst, uint64_t Value,
|
||||
uint16_t IndexReg) {
|
||||
uint16_t IndexReg, AllocatorIdTy AllocId) {
|
||||
if (!isIndirectBranch(Inst))
|
||||
return false;
|
||||
assert(getJumpTable(Inst) == 0 && "jump table already set");
|
||||
setAnnotationOpValue(Inst, MCAnnotation::kJumpTable, Value);
|
||||
addAnnotation<>(Inst, "JTIndexReg", IndexReg);
|
||||
setAnnotationOpValue(Inst, MCAnnotation::kJumpTable, Value, AllocId);
|
||||
getOrCreateAnnotationAs<uint16_t>(Inst, "JTIndexReg", AllocId) = IndexReg;
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Remove the jump-table annotations from \p Inst.
///
/// \returns false, leaving \p Inst untouched, when getJumpTable() reports
/// zero for it; true after both the kJumpTable and the "JTIndexReg"
/// annotations have been removed.
bool MCPlusBuilder::unsetJumpTable(MCInst &Inst) {
  const bool HasJumpTable = getJumpTable(Inst) != 0;
  if (HasJumpTable) {
    removeAnnotation(Inst, MCAnnotation::kJumpTable);
    removeAnnotation(Inst, "JTIndexReg");
  }
  return HasJumpTable;
}
|
||||
|
||||
@ -214,41 +226,12 @@ bool MCPlusBuilder::removeAnnotation(MCInst &Inst, unsigned Index) {
|
||||
auto ImmValue = AnnotationInst->getOperand(I).getImm();
|
||||
if (extractAnnotationIndex(ImmValue) == Index) {
|
||||
AnnotationInst->erase(AnnotationInst->begin() + I);
|
||||
auto *Annotation =
|
||||
reinterpret_cast<MCAnnotation *>(extractAnnotationValue(ImmValue));
|
||||
auto Itr = AnnotationPool.find(Annotation);
|
||||
if (Itr != AnnotationPool.end()) {
|
||||
AnnotationPool.erase(Itr);
|
||||
Annotation->~MCAnnotation();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void MCPlusBuilder::removeAllAnnotations(MCInst &Inst) {
|
||||
auto *AnnotationInst = getAnnotationInst(Inst);
|
||||
if (!AnnotationInst)
|
||||
return;
|
||||
|
||||
for (int I = AnnotationInst->getNumOperands() - 1; I >= 0; --I) {
|
||||
auto ImmValue = AnnotationInst->getOperand(I).getImm();
|
||||
AnnotationInst->erase(std::prev(AnnotationInst->end()));
|
||||
auto *Annotation =
|
||||
reinterpret_cast<MCAnnotation *>(extractAnnotationValue(ImmValue));
|
||||
auto Itr = AnnotationPool.find(Annotation);
|
||||
if (Itr != AnnotationPool.end()) {
|
||||
AnnotationPool.erase(Itr);
|
||||
Annotation->~MCAnnotation();
|
||||
}
|
||||
}
|
||||
|
||||
// Clear all attached MC+ info since it's no longer used.
|
||||
Inst.erase(std::prev(Inst.end()));
|
||||
}
|
||||
|
||||
void MCPlusBuilder::stripAnnotations(MCInst &Inst) {
|
||||
auto *AnnotationInst = getAnnotationInst(Inst);
|
||||
if (!AnnotationInst)
|
||||
@ -268,7 +251,7 @@ MCPlusBuilder::printAnnotations(const MCInst &Inst, raw_ostream &OS) const {
|
||||
const auto Index = extractAnnotationIndex(Imm);
|
||||
const auto Value = extractAnnotationValue(Imm);
|
||||
const auto *Annotation =
|
||||
reinterpret_cast<const MCAnnotation *>(Value);
|
||||
reinterpret_cast<const MCAnnotation *>(Value);
|
||||
if (Index >= MCAnnotation::kGeneric) {
|
||||
OS << " # " << AnnotationNames[Index - MCAnnotation::kGeneric]
|
||||
<< ": ";
|
||||
@ -283,7 +266,7 @@ bool MCPlusBuilder::evaluateBranch(const MCInst &Inst, uint64_t Addr,
|
||||
}
|
||||
|
||||
void MCPlusBuilder::getClobberedRegs(const MCInst &Inst,
|
||||
BitVector &Regs) const {
|
||||
BitVector &Regs) const {
|
||||
if (isPrefix(Inst) || isCFI(Inst))
|
||||
return;
|
||||
|
||||
@ -302,7 +285,7 @@ void MCPlusBuilder::getClobberedRegs(const MCInst &Inst,
|
||||
}
|
||||
|
||||
void MCPlusBuilder::getTouchedRegs(const MCInst &Inst,
|
||||
BitVector &Regs) const {
|
||||
BitVector &Regs) const {
|
||||
if (isPrefix(Inst) || isCFI(Inst))
|
||||
return;
|
||||
|
||||
@ -325,7 +308,7 @@ void MCPlusBuilder::getTouchedRegs(const MCInst &Inst,
|
||||
}
|
||||
|
||||
void MCPlusBuilder::getWrittenRegs(const MCInst &Inst,
|
||||
BitVector &Regs) const {
|
||||
BitVector &Regs) const {
|
||||
if (isPrefix(Inst) || isCFI(Inst))
|
||||
return;
|
||||
|
||||
@ -381,7 +364,7 @@ bool MCPlusBuilder::hasUseOfPhysReg(const MCInst &MI, unsigned Reg) const {
|
||||
|
||||
const BitVector &
|
||||
MCPlusBuilder::getAliases(MCPhysReg Reg,
|
||||
bool OnlySmaller) const {
|
||||
bool OnlySmaller) const {
|
||||
// AliasMap caches a mapping of registers to the set of registers that
|
||||
// alias (are sub or superregs of itself, including itself).
|
||||
static std::vector<BitVector> AliasMap;
|
||||
|
||||
@ -35,8 +35,11 @@
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
#include <set>
|
||||
#include <shared_mutex>
|
||||
#include <system_error>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
|
||||
namespace llvm {
|
||||
@ -44,26 +47,31 @@ namespace bolt {
|
||||
|
||||
/// Different types of indirect branches encountered during disassembly.
|
||||
enum class IndirectBranchType : char {
|
||||
UNKNOWN = 0, /// Unable to determine type.
|
||||
POSSIBLE_TAIL_CALL, /// Possibly a tail call.
|
||||
POSSIBLE_JUMP_TABLE, /// Possibly a switch/jump table.
|
||||
POSSIBLE_PIC_JUMP_TABLE, /// Possibly a jump table for PIC.
|
||||
POSSIBLE_GOTO, /// Possibly a gcc's computed goto.
|
||||
POSSIBLE_FIXED_BRANCH, /// Possibly an indirect branch to a fixed location.
|
||||
UNKNOWN = 0, /// Unable to determine type.
|
||||
POSSIBLE_TAIL_CALL, /// Possibly a tail call.
|
||||
POSSIBLE_JUMP_TABLE, /// Possibly a switch/jump table.
|
||||
POSSIBLE_PIC_JUMP_TABLE, /// Possibly a jump table for PIC.
|
||||
POSSIBLE_GOTO, /// Possibly a gcc's computed goto.
|
||||
POSSIBLE_FIXED_BRANCH, /// Possibly an indirect branch to a fixed location.
|
||||
};
|
||||
|
||||
class MCPlusBuilder {
|
||||
public:
|
||||
using AllocatorIdTy = uint16_t;
|
||||
|
||||
private:
|
||||
/// Annotation instruction allocator.
|
||||
SpecificBumpPtrAllocator<MCInst> MCInstAllocator;
|
||||
/// A struct that represents a single annotation allocator
|
||||
struct AnnotationAllocator {
|
||||
SpecificBumpPtrAllocator<MCInst> MCInstAllocator;
|
||||
BumpPtrAllocator ValueAllocator;
|
||||
std::unordered_set<MCPlus::MCAnnotation *> AnnotationPool;
|
||||
};
|
||||
|
||||
/// Annotation value allocator.
|
||||
BumpPtrAllocator Allocator;
|
||||
/// A set of annotation allocators
|
||||
std::unordered_map<AllocatorIdTy, AnnotationAllocator> AnnotationAllocators;
|
||||
|
||||
/// Record all the annotations with non-trivial type. To prevent leaks, these
|
||||
/// will need destructors called when the annotation is removed or when all
|
||||
/// annotations are destroyed.
|
||||
std::unordered_set<MCPlus::MCAnnotation*> AnnotationPool;
|
||||
/// A variable that is used to generate unique ids for annotation allocators
|
||||
AllocatorIdTy MaxAllocatorId = 0;
|
||||
|
||||
/// We encode Index and Value into a 64-bit immediate operand value.
|
||||
static int64_t encodeAnnotationImm(unsigned Index, int64_t Value) {
|
||||
@ -100,10 +108,12 @@ private:
|
||||
return AnnotationInst;
|
||||
}
|
||||
|
||||
void setAnnotationOpValue(MCInst &Inst, unsigned Index, int64_t Value) {
|
||||
void setAnnotationOpValue(MCInst &Inst, unsigned Index, int64_t Value,
|
||||
AllocatorIdTy AllocatorId = 0) {
|
||||
auto *AnnotationInst = getAnnotationInst(Inst);
|
||||
if (!AnnotationInst) {
|
||||
AnnotationInst = new (MCInstAllocator.Allocate()) MCInst();
|
||||
auto &Allocator = getAnnotationAllocator(AllocatorId);
|
||||
AnnotationInst = new (Allocator.MCInstAllocator.Allocate()) MCInst();
|
||||
AnnotationInst->setOpcode(TargetOpcode::ANNOTATION_LABEL);
|
||||
Inst.addOperand(MCOperand::createInst(AnnotationInst));
|
||||
}
|
||||
@ -278,20 +288,55 @@ public:
|
||||
public:
|
||||
MCPlusBuilder(const MCInstrAnalysis *Analysis, const MCInstrInfo *Info,
|
||||
const MCRegisterInfo *RegInfo)
|
||||
: Analysis(Analysis), Info(Info), RegInfo(RegInfo) {}
|
||||
: Analysis(Analysis), Info(Info), RegInfo(RegInfo) {
|
||||
// Initialize the default annotation allocator with id 0
|
||||
AnnotationAllocators.emplace(0, AnnotationAllocator());
|
||||
MaxAllocatorId++;
|
||||
}
|
||||
|
||||
/// Initialize a new annotation allocator and return its id
|
||||
AllocatorIdTy initializeNewAnnotationAllocator() {
|
||||
AnnotationAllocators.emplace(MaxAllocatorId, AnnotationAllocator());
|
||||
return MaxAllocatorId++;
|
||||
}
|
||||
|
||||
/// Return the annotation allocator of a given id
|
||||
AnnotationAllocator &getAnnotationAllocator(AllocatorIdTy AllocatorId) {
|
||||
assert(AnnotationAllocators.count(AllocatorId) &&
|
||||
"allocator not initialized");
|
||||
return AnnotationAllocators.find(AllocatorId)->second;
|
||||
}
|
||||
|
||||
// Check if an annotation allocator with the given id exists
|
||||
bool checkAllocatorExists(AllocatorIdTy AllocatorId) {
|
||||
return AnnotationAllocators.count(AllocatorId);
|
||||
}
|
||||
|
||||
/// Free the values allocator within the annotation allocator
|
||||
void freeValuesAllocator(AllocatorIdTy AllocatorId) {
|
||||
auto &Allocator = getAnnotationAllocator(AllocatorId);
|
||||
for (auto *Annotation : Allocator.AnnotationPool)
|
||||
Annotation->~MCAnnotation();
|
||||
|
||||
Allocator.AnnotationPool.clear();
|
||||
Allocator.ValueAllocator.Reset();
|
||||
}
|
||||
|
||||
/// Virtual destructor: releases all annotation memory via freeAnnotations().
virtual ~MCPlusBuilder() { freeAnnotations(); }
|
||||
|
||||
/// Free all memory allocated for annotations.
|
||||
/// Free all memory allocated for annotations
|
||||
void freeAnnotations() {
|
||||
for (auto *Annotation : AnnotationPool) {
|
||||
Annotation->~MCAnnotation();
|
||||
for (auto &Element : AnnotationAllocators) {
|
||||
auto &Allocator = Element.second;
|
||||
for (auto *Annotation : Allocator.AnnotationPool)
|
||||
Annotation->~MCAnnotation();
|
||||
|
||||
Allocator.AnnotationPool.clear();
|
||||
Allocator.ValueAllocator.Reset();
|
||||
Allocator.MCInstAllocator.DestroyAll();
|
||||
}
|
||||
AnnotationPool.clear();
|
||||
MCInstAllocator.DestroyAll();
|
||||
Allocator.Reset();
|
||||
}
|
||||
|
||||
using CompFuncTy = std::function<bool(const MCSymbol *, const MCSymbol *)>;
|
||||
@ -334,6 +379,11 @@ public:
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Check whether we support inverting this branch
|
||||
virtual bool isUnsupportedBranch(unsigned Opcode) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Return true if the instruction is of pseudo kind.
|
||||
bool isPseudo(const MCInst &Inst) const {
|
||||
return Info->get(Inst.getOpcode()).isPseudo();
|
||||
@ -353,11 +403,28 @@ public:
|
||||
llvm_unreachable("not implemented");
|
||||
}
|
||||
|
||||
/// Hook with no base implementation: calling it reaches llvm_unreachable.
/// NOTE(review): presumably builds in \p Inst a push of \p Size bytes from
/// the memory operand described by the base/scale/index/offset/segment
/// parameters - confirm against the overriding implementation.
virtual void createPushRegisterIndirect(MCInst &Inst,
                                        const MCPhysReg &BaseReg,
                                        int64_t Scale,
                                        const MCPhysReg &IndexReg,
                                        int64_t Offset,
                                        const MCExpr *OffsetExpr,
                                        const MCPhysReg &AddrSegmentReg,
                                        unsigned Size) const {
  llvm_unreachable("not implemented");
}
|
||||
|
||||
/// Hook with no base implementation: calling it reaches llvm_unreachable.
/// NOTE(review): presumably builds in \p Inst a pop of \p Size bytes into
/// \p Reg - confirm against the overriding implementation.
virtual void createPopRegister(MCInst &Inst,
                               MCPhysReg Reg,
                               unsigned Size) const {
  llvm_unreachable("not implemented");
}
|
||||
|
||||
/// Hook with no base implementation: calling it reaches llvm_unreachable.
/// NOTE(review): presumably builds in \p Inst a push of the flags register
/// using \p Size bytes - confirm against the overriding implementation.
virtual void createPushFlags(MCInst &Inst,
                             unsigned Size) const {
  llvm_unreachable("not implemented");
}
|
||||
|
||||
/// Hook with no base implementation: calling it reaches llvm_unreachable.
/// NOTE(review): presumably builds in \p Inst a pop into the flags register
/// using \p Size bytes - confirm against the overriding implementation.
virtual void createPopFlags(MCInst &Inst,
                            unsigned Size) const {
  llvm_unreachable("not implemented");
}
|
||||
|
||||
virtual bool createDirectCall(MCInst &Inst, const MCSymbol *Target,
|
||||
MCContext *Ctx) {
|
||||
llvm_unreachable("not implemented");
|
||||
@ -368,7 +435,22 @@ public:
|
||||
llvm_unreachable("not implemented");
|
||||
}
|
||||
|
||||
virtual MCPhysReg getX86NoRegister() const {
|
||||
/// Hook with no base implementation: calling it reaches llvm_unreachable.
/// NOTE(review): presumably returns the instruction pointer register of the
/// target - confirm against the overriding implementation.
virtual MCPhysReg getInstructionPointer() const {
  llvm_unreachable("not implemented");
}
|
||||
|
||||
/// Return a register number that is guaranteed to not match with
|
||||
/// any real register on the underlying architecture.
|
||||
virtual MCPhysReg getNoRegister() const {
|
||||
llvm_unreachable("not implemented");
|
||||
}
|
||||
|
||||
/// Return a register corresponding to a function integer argument \p ArgNo
|
||||
/// if the argument is passed in a register. Or return the result of
|
||||
/// getNoRegister() otherwise. The enumeration starts at 0.
|
||||
///
|
||||
/// Note: this should depend on a used calling convention.
|
||||
virtual MCPhysReg getIntArgRegister(unsigned ArgNo) const {
|
||||
llvm_unreachable("not implemented");
|
||||
}
|
||||
|
||||
@ -394,6 +476,11 @@ public:
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Hook with no base implementation: calling it reaches llvm_unreachable.
/// NOTE(review): presumably reports whether \p Inst is a breakpoint
/// instruction - confirm against the overriding implementation.
virtual bool isBreakpoint(const MCInst &Inst) const {
  llvm_unreachable("not implemented");
  // Kept after the unreachable marker so every path still returns a value.
  return false;
}
|
||||
|
||||
virtual bool isPrefix(const MCInst &Inst) const {
|
||||
llvm_unreachable("not implemented");
|
||||
return false;
|
||||
@ -457,6 +544,11 @@ public:
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Hook with no base implementation: calling it reaches llvm_unreachable.
/// NOTE(review): presumably reports whether \p Inst is an LFENCE
/// instruction - confirm against the overriding implementation.
virtual bool isLfence(const MCInst &Inst) const {
  llvm_unreachable("not implemented");
  // Kept after the unreachable marker so every path still returns a value.
  return false;
}
|
||||
|
||||
virtual bool isLeave(const MCInst &Inst) const {
|
||||
llvm_unreachable("not implemented");
|
||||
return false;
|
||||
@ -482,6 +574,11 @@ public:
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual bool isActualLoad(const MCInst &Inst) const {
|
||||
llvm_unreachable("not implemented");
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual bool isLoad(const MCInst &Inst) const {
|
||||
llvm_unreachable("not implemented");
|
||||
return false;
|
||||
@ -890,9 +987,9 @@ public:
|
||||
/// of the passed \p Symbol plus \p Addend. If the instruction does not have
|
||||
/// an immediate operand or has more than one - then return false. Otherwise
|
||||
/// return true.
|
||||
virtual bool replaceImmWithSymbol(MCInst &Inst, MCSymbol *Symbol,
|
||||
int64_t Addend, MCContext *Ctx,
|
||||
int64_t &Value, uint64_t RelType) const {
|
||||
virtual bool replaceImmWithSymbolRef(MCInst &Inst, const MCSymbol *Symbol,
|
||||
int64_t Addend, MCContext *Ctx,
|
||||
int64_t &Value, uint64_t RelType) const {
|
||||
llvm_unreachable("not implemented");
|
||||
return false;
|
||||
}
|
||||
@ -957,14 +1054,21 @@ public:
|
||||
int64_t getGnuArgsSize(const MCInst &Inst) const;
|
||||
|
||||
/// Add the value of GNU_args_size to Inst if it already has EH info.
|
||||
void addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize);
|
||||
void addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize,
|
||||
AllocatorIdTy AllocId = 0);
|
||||
|
||||
/// Return jump table addressed by this instruction.
|
||||
uint64_t getJumpTable(const MCInst &Inst) const;
|
||||
|
||||
/// Return index register for instruction that uses a jump table.
|
||||
uint16_t getJumpTableIndexReg(const MCInst &Inst) const;
|
||||
|
||||
/// Set jump table addressed by this instruction.
|
||||
bool setJumpTable(MCInst &Inst, uint64_t Value,
|
||||
uint16_t IndexReg);
|
||||
bool setJumpTable(MCInst &Inst, uint64_t Value, uint16_t IndexReg,
|
||||
AllocatorIdTy AllocId = 0);
|
||||
|
||||
/// Disassociate instruction with a jump table.
|
||||
bool unsetJumpTable(MCInst &Inst);
|
||||
|
||||
/// Return destination of conditional tail call instruction if \p Inst is one.
|
||||
Optional<uint64_t> getConditionalTailCall(const MCInst &Inst) const;
|
||||
@ -1126,7 +1230,7 @@ public:
|
||||
}
|
||||
|
||||
/// Replace instruction opcode to be a tail call instead of jump.
|
||||
virtual bool convertJmpToTailCall(MCInst &Inst, MCContext *Ctx) {
|
||||
virtual bool convertJmpToTailCall(MCInst &Inst) {
|
||||
llvm_unreachable("not implemented");
|
||||
return false;
|
||||
}
|
||||
@ -1334,6 +1438,32 @@ public:
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Create instruction to left shift contents of target
|
||||
virtual bool createShl(MCInst &Inst, const MCPhysReg &BaseReg, int64_t Scale,
|
||||
const MCPhysReg &IndexReg, int64_t Offset,
|
||||
const MCExpr *OffsetExpr,
|
||||
const MCPhysReg &AddrSegmentReg,
|
||||
uint8_t Immediate, int Size) const {
|
||||
llvm_unreachable("not implemented");
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Create instruction to load an effective address into a target
|
||||
virtual bool createLea(MCInst &Inst, const MCPhysReg &BaseReg, int64_t Scale,
|
||||
const MCPhysReg &IndexReg, int64_t Offset,
|
||||
const MCExpr *OffsetExpr, const MCPhysReg &AddrSegmentReg,
|
||||
const MCPhysReg &DstReg, int Size) const {
|
||||
llvm_unreachable("not implemented");
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Create instruction to increment contents of target by 1
|
||||
virtual bool createIncMemory(MCInst &Inst, const MCSymbol *Target,
|
||||
MCContext *Ctx) const {
|
||||
llvm_unreachable("not implemented");
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Create a fragment of code (sequence of instructions) that load a 32-bit
|
||||
/// address from memory, zero-extends it to 64 and jump to it (indirect jump).
|
||||
virtual bool
|
||||
@ -1364,6 +1494,21 @@ public:
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Create an inline version of memcpy(dest, src, 1).
|
||||
virtual std::vector<MCInst> createOneByteMemcpy() const {
|
||||
llvm_unreachable("not implemented");
|
||||
return {};
|
||||
}
|
||||
|
||||
/// Create a sequence of instructions to compare contents of a register
|
||||
/// \p RegNo to immediate \p Imm and jump to \p Target if they are equal.
|
||||
virtual std::vector<MCInst>
|
||||
createCmpJE(MCPhysReg RegNo, int64_t Imm, const MCSymbol *Target,
|
||||
MCContext *Ctx) const {
|
||||
llvm_unreachable("not implemented");
|
||||
return {};
|
||||
}
|
||||
|
||||
/// Creates inline memcpy instruction. If \p ReturnEnd is true, then return
|
||||
/// (dest + n) instead of dest.
|
||||
virtual std::vector<MCInst> createInlineMemcpy(bool ReturnEnd) const {
|
||||
@ -1411,7 +1556,6 @@ public:
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/// Return annotation index matching the \p Name.
|
||||
Optional<unsigned> getAnnotationIndex(StringRef Name) const {
|
||||
auto AI = AnnotationNameIndexMap.find(Name);
|
||||
@ -1437,25 +1581,30 @@ public:
|
||||
/// Store an annotation value on an MCInst. This assumes the annotation
|
||||
/// is not already present.
|
||||
template <typename ValueType>
|
||||
const ValueType &addAnnotation(MCInst &Inst,
|
||||
unsigned Index,
|
||||
const ValueType &Val) {
|
||||
const ValueType &addAnnotation(MCInst &Inst, unsigned Index,
|
||||
const ValueType &Val,
|
||||
AllocatorIdTy AllocatorId = 0) {
|
||||
assert(!hasAnnotation(Inst, Index));
|
||||
auto *A = new (Allocator) MCPlus::MCSimpleAnnotation<ValueType>(Val);
|
||||
auto &Allocator = getAnnotationAllocator(AllocatorId);
|
||||
auto *A = new (Allocator.ValueAllocator)
|
||||
MCPlus::MCSimpleAnnotation<ValueType>(Val);
|
||||
|
||||
if (!std::is_trivial<ValueType>::value) {
|
||||
AnnotationPool.insert(A);
|
||||
Allocator.AnnotationPool.insert(A);
|
||||
}
|
||||
setAnnotationOpValue(Inst, Index, reinterpret_cast<int64_t>(A));
|
||||
setAnnotationOpValue(Inst, Index, reinterpret_cast<int64_t>(A),
|
||||
AllocatorId);
|
||||
return A->getValue();
|
||||
}
|
||||
|
||||
/// Store an annotation value on an MCInst. This assumes the annotation
|
||||
/// is not already present.
|
||||
template <typename ValueType>
|
||||
const ValueType &addAnnotation(MCInst &Inst,
|
||||
StringRef Name,
|
||||
const ValueType &Val) {
|
||||
return addAnnotation(Inst, getOrCreateAnnotationIndex(Name), Val);
|
||||
const ValueType &addAnnotation(MCInst &Inst, StringRef Name,
|
||||
const ValueType &Val,
|
||||
AllocatorIdTy AllocatorId = 0) {
|
||||
return addAnnotation(Inst, getOrCreateAnnotationIndex(Name), Val,
|
||||
AllocatorId);
|
||||
}
|
||||
|
||||
/// Get an annotation as a specific value, but if the annotation does not
|
||||
@ -1463,12 +1612,13 @@ public:
|
||||
/// Return a non-const ref so caller can freely modify its contents
|
||||
/// afterwards.
|
||||
template <typename ValueType>
|
||||
ValueType& getOrCreateAnnotationAs(MCInst &Inst, unsigned Index) {
|
||||
ValueType &getOrCreateAnnotationAs(MCInst &Inst, unsigned Index,
|
||||
AllocatorIdTy AllocatorId = 0) {
|
||||
auto Val =
|
||||
tryGetAnnotationAs<ValueType>(const_cast<const MCInst &>(Inst), Index);
|
||||
tryGetAnnotationAs<ValueType>(const_cast<const MCInst &>(Inst), Index);
|
||||
if (!Val)
|
||||
Val = addAnnotation(Inst, Index, ValueType());
|
||||
return const_cast<ValueType&>(*Val);
|
||||
Val = addAnnotation(Inst, Index, ValueType(), AllocatorId);
|
||||
return const_cast<ValueType &>(*Val);
|
||||
}
|
||||
|
||||
/// Get an annotation as a specific value, but if the annotation does not
|
||||
@ -1476,25 +1626,26 @@ public:
|
||||
/// Return a non-const ref so caller can freely modify its contents
|
||||
/// afterwards.
|
||||
template <typename ValueType>
|
||||
ValueType& getOrCreateAnnotationAs(MCInst &Inst, StringRef Name) {
|
||||
ValueType &getOrCreateAnnotationAs(MCInst &Inst, StringRef Name,
|
||||
AllocatorIdTy AllocatorId = 0) {
|
||||
const auto Index = getOrCreateAnnotationIndex(Name);
|
||||
return getOrCreateAnnotationAs<ValueType>(Inst, Index);
|
||||
return getOrCreateAnnotationAs<ValueType>(Inst, Index, AllocatorId);
|
||||
}
|
||||
|
||||
/// Get an annotation as a specific value. Assumes that the annotation exists.
|
||||
/// Use hasAnnotation() if the annotation may not exist.
|
||||
template <typename ValueType>
|
||||
const ValueType &getAnnotationAs(const MCInst &Inst, unsigned Index) const {
|
||||
ValueType &getAnnotationAs(const MCInst &Inst, unsigned Index) const {
|
||||
auto Value = getAnnotationOpValue(Inst, Index);
|
||||
assert(Value && "annotation should exist");
|
||||
return reinterpret_cast<const MCPlus::MCSimpleAnnotation<ValueType> *>
|
||||
return reinterpret_cast<MCPlus::MCSimpleAnnotation<ValueType> *>
|
||||
(*Value)->getValue();
|
||||
}
|
||||
|
||||
/// Get an annotation as a specific value. Assumes that the annotation exists.
|
||||
/// Use hasAnnotation() if the annotation may not exist.
|
||||
template <typename ValueType>
|
||||
const ValueType &getAnnotationAs(const MCInst &Inst, StringRef Name) const {
|
||||
ValueType &getAnnotationAs(const MCInst &Inst, StringRef Name) const {
|
||||
const auto Index = getAnnotationIndex(Name);
|
||||
assert(Index && "annotation should exist");
|
||||
return getAnnotationAs<ValueType>(Inst, *Index);
|
||||
@ -1586,9 +1737,6 @@ public:
|
||||
return removeAnnotation(Inst, *Index);
|
||||
}
|
||||
|
||||
/// Remove all meta-data annotations from Inst.
|
||||
void removeAllAnnotations(MCInst &Inst);
|
||||
|
||||
/// Remove meta-data, but don't destroy it.
|
||||
void stripAnnotations(MCInst &Inst);
|
||||
|
||||
@ -1610,8 +1758,13 @@ public:
|
||||
/// empty vector of instructions. The label is meant to indicate the basic
|
||||
/// block where all previous snippets are joined, i.e. the instructions that
|
||||
/// would immediately follow the original call.
|
||||
using ICPdata = std::vector<std::pair<MCSymbol*, std::vector<MCInst>>>;
|
||||
virtual ICPdata indirectCallPromotion(
|
||||
using BlocksVectorTy = std::vector<std::pair<MCSymbol*, std::vector<MCInst>>>;
|
||||
struct MultiBlocksCode {
|
||||
BlocksVectorTy Blocks;
|
||||
std::vector<MCSymbol*> Successors;
|
||||
};
|
||||
|
||||
virtual BlocksVectorTy indirectCallPromotion(
|
||||
const MCInst &CallInst,
|
||||
const std::vector<std::pair<MCSymbol *, uint64_t>> &Targets,
|
||||
const std::vector<std::pair<MCSymbol *, uint64_t>> &VtableSyms,
|
||||
@ -1620,19 +1773,18 @@ public:
|
||||
MCContext *Ctx
|
||||
) {
|
||||
llvm_unreachable("not implemented");
|
||||
return ICPdata();
|
||||
return BlocksVectorTy();
|
||||
}
|
||||
|
||||
virtual ICPdata jumpTablePromotion(
|
||||
virtual BlocksVectorTy jumpTablePromotion(
|
||||
const MCInst &IJmpInst,
|
||||
const std::vector<std::pair<MCSymbol *,uint64_t>>& Targets,
|
||||
const std::vector<MCInst *> &TargetFetchInsns,
|
||||
MCContext *Ctx
|
||||
) const {
|
||||
llvm_unreachable("not implemented");
|
||||
return ICPdata();
|
||||
return BlocksVectorTy();
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
} // namespace bolt
|
||||
|
||||
232
src/ParallelUtilities.cpp
Normal file
232
src/ParallelUtilities.cpp
Normal file
@ -0,0 +1,232 @@
|
||||
//===--- ParallelUtilities.cpp -------------------------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "ParallelUtilities.h"
|
||||
#include "llvm/Support/Timer.h"
|
||||
#include <mutex>
|
||||
#include <shared_mutex>
|
||||
|
||||
#define DEBUG_TYPE "par-utils"
|
||||
|
||||
namespace opts {
|
||||
extern cl::OptionCategory BoltCategory;
|
||||
|
||||
// Number of threads passed to the shared ThreadPool when it is first created
// (see getThreadPool()); also multiplied by the tasks-per-thread value to get
// the number of work blocks. Defaults to hardware_concurrency().
cl::opt<unsigned>
|
||||
ThreadCount("thread-count",
|
||||
cl::desc("number of threads"),
|
||||
cl::init(hardware_concurrency()),
|
||||
cl::cat(BoltCategory));
|
||||
|
||||
// When set, the runOnEachFunction* helpers process every function on the
// calling thread instead of submitting blocks to the thread pool.
cl::opt<bool>
|
||||
NoThreads("no-threads",
|
||||
cl::desc("disable multithreading"),
|
||||
cl::init(false),
|
||||
cl::cat(BoltCategory));
|
||||
|
||||
// Default value of the TasksPerThread argument of the runOnEachFunction*
// helpers (declared in ParallelUtilities.h). Defaults to 20.
cl::opt<unsigned>
|
||||
TaskCount("tasks-per-thread",
|
||||
cl::desc("number of tasks to be created per thread"),
|
||||
cl::init(20),
|
||||
cl::cat(BoltCategory));
|
||||
|
||||
} // namespace opts
|
||||
|
||||
namespace llvm {
|
||||
namespace bolt {
|
||||
namespace ParallelUtilities {
|
||||
|
||||
namespace {
|
||||
/// A single thread pool that is used to run parallel tasks
|
||||
std::unique_ptr<ThreadPool> ThreadPoolPtr;
|
||||
|
||||
unsigned computeCostFor(const BinaryFunction &BF,
|
||||
const PredicateTy &SkipPredicate,
|
||||
const SchedulingPolicy &SchedPolicy) {
|
||||
if (SchedPolicy == SchedulingPolicy::SP_TRIVIAL)
|
||||
return 1;
|
||||
|
||||
if (SkipPredicate && SkipPredicate(BF))
|
||||
return 0;
|
||||
|
||||
switch (SchedPolicy) {
|
||||
case SchedulingPolicy::SP_CONSTANT:
|
||||
return 1;
|
||||
case SchedulingPolicy::SP_INST_LINEAR:
|
||||
return BF.getSize();
|
||||
case SchedulingPolicy::SP_INST_QUADRATIC:
|
||||
return BF.getSize() * BF.getSize();
|
||||
case SchedulingPolicy::SP_BB_LINEAR:
|
||||
return BF.size();
|
||||
case SchedulingPolicy::SP_BB_QUADRATIC:
|
||||
return BF.size() * BF.size();
|
||||
default:
|
||||
llvm_unreachable("unsupported scheduling policy");
|
||||
}
|
||||
}
|
||||
|
||||
inline unsigned estimateTotalCost(const BinaryContext &BC,
|
||||
const PredicateTy &SkipPredicate,
|
||||
SchedulingPolicy &SchedPolicy) {
|
||||
if (SchedPolicy == SchedulingPolicy::SP_TRIVIAL)
|
||||
return BC.getBinaryFunctions().size();
|
||||
|
||||
unsigned TotalCost = 0;
|
||||
for (auto &BFI : BC.getBinaryFunctions()) {
|
||||
auto &BF = BFI.second;
|
||||
TotalCost += computeCostFor(BF, SkipPredicate, SchedPolicy);
|
||||
}
|
||||
|
||||
// Switch to trivial scheduling if total estimated work is zero
|
||||
if (TotalCost == 0) {
|
||||
outs() << "BOLT-WARNING: Running parallel work of 0 estimated cost, will "
|
||||
"switch to trivial scheduling.\n";
|
||||
|
||||
SchedPolicy = SP_TRIVIAL;
|
||||
TotalCost = BC.getBinaryFunctions().size();
|
||||
}
|
||||
return TotalCost;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/// Return the shared thread pool, constructing it with opts::ThreadCount
/// threads on the first call.
ThreadPool &getThreadPool() {
  if (!ThreadPoolPtr.get())
    ThreadPoolPtr = std::make_unique<ThreadPool>(opts::ThreadCount);

  return *ThreadPoolPtr;
}
|
||||
|
||||
/// Run \p WorkFunction on every function of \p BC for which \p SkipPredicate
/// is unset or returns false.
///
/// The work runs on the calling thread when opts::NoThreads or
/// \p ForceSequential is set, or when the configured thread count is zero.
/// Otherwise the function map is cut into consecutive blocks whose estimated
/// cost (per \p SchedPolicy) is about
/// TotalCost / (TasksPerThread * ThreadCount); each block is submitted to the
/// shared thread pool and the call returns after Pool.wait().
///
/// \p LogName names the per-block Timer, which is only started and stopped
/// inside DEBUG().
void runOnEachFunction(BinaryContext &BC, SchedulingPolicy SchedPolicy,
                       WorkFuncTy WorkFunction, PredicateTy SkipPredicate,
                       std::string LogName, bool ForceSequential,
                       unsigned TasksPerThread) {
  if (BC.getBinaryFunctions().size() == 0)
    return;

  auto runBlock = [&](std::map<uint64_t, BinaryFunction>::iterator BlockBegin,
                      std::map<uint64_t, BinaryFunction>::iterator BlockEnd) {
    Timer T(LogName, LogName);
    DEBUG(T.startTimer());

    for (auto It = BlockBegin; It != BlockEnd; ++It) {
      auto &BF = It->second;
      if (SkipPredicate && SkipPredicate(BF))
        continue;

      WorkFunction(BF);
    }
    DEBUG(T.stopTimer());
  };

  // With a thread count of zero there is nothing to parallelize over, and the
  // block-size computation below would divide by zero; run sequentially.
  const unsigned NumThreads = opts::ThreadCount;
  if (opts::NoThreads || ForceSequential || NumThreads == 0) {
    runBlock(BC.getBinaryFunctions().begin(), BC.getBinaryFunctions().end());
    return;
  }

  // Estimate the overall runtime cost using the scheduling policy
  const unsigned TotalCost = estimateTotalCost(BC, SkipPredicate, SchedPolicy);

  // TasksPerThread may be zero; clamp so the division below is well defined.
  unsigned BlocksCount = TasksPerThread * NumThreads;
  if (BlocksCount == 0)
    BlocksCount = 1;
  const unsigned BlockCost =
      TotalCost > BlocksCount ? TotalCost / BlocksCount : 1;

  // Divide work into blocks of equal cost
  ThreadPool &Pool = getThreadPool();
  auto BlockBegin = BC.getBinaryFunctions().begin();
  unsigned CurrentCost = 0;

  for (auto It = BC.getBinaryFunctions().begin();
       It != BC.getBinaryFunctions().end(); ++It) {
    auto &BF = It->second;
    CurrentCost += computeCostFor(BF, SkipPredicate, SchedPolicy);

    if (CurrentCost >= BlockCost) {
      Pool.async(runBlock, BlockBegin, std::next(It));
      BlockBegin = std::next(It);
      CurrentCost = 0;
    }
  }

  // Schedule whatever is left over; skip the task if the last block closed
  // exactly at the end of the map.
  if (BlockBegin != BC.getBinaryFunctions().end())
    Pool.async(runBlock, BlockBegin, BC.getBinaryFunctions().end());
  Pool.wait();
}
|
||||
|
||||
/// Same as runOnEachFunction(), but every scheduled block is handed its own
/// annotation allocator id, which is forwarded to \p WorkFunction.
///
/// In the sequential case (opts::NoThreads, \p ForceSequential, or a thread
/// count of zero) all functions are processed on the calling thread with
/// allocator id 0. In the parallel case allocator ids start at 1 and
/// allocators that do not exist yet are created through BC.MIB before the
/// block is submitted. Tasks take a shared lock on a mutex that this function
/// holds exclusively while scheduling, so no task starts its work until all
/// blocks have been submitted.
void runOnEachFunctionWithUniqueAllocId(
    BinaryContext &BC, SchedulingPolicy SchedPolicy,
    WorkFuncWithAllocTy WorkFunction, PredicateTy SkipPredicate,
    std::string LogName, bool ForceSequential, unsigned TasksPerThread) {
  if (BC.getBinaryFunctions().size() == 0)
    return;

  std::shared_timed_mutex MainLock;
  auto runBlock = [&](std::map<uint64_t, BinaryFunction>::iterator BlockBegin,
                      std::map<uint64_t, BinaryFunction>::iterator BlockEnd,
                      MCPlusBuilder::AllocatorIdTy AllocId) {
    Timer T(LogName, LogName);
    DEBUG(T.startTimer());
    std::shared_lock<std::shared_timed_mutex> Lock(MainLock);
    for (auto It = BlockBegin; It != BlockEnd; ++It) {
      auto &BF = It->second;
      if (SkipPredicate && SkipPredicate(BF))
        continue;

      WorkFunction(BF, AllocId);
    }
    DEBUG(T.stopTimer());
  };

  // With a thread count of zero there is nothing to parallelize over, and the
  // block-size computation below would divide by zero; run sequentially.
  const unsigned NumThreads = opts::ThreadCount;
  if (opts::NoThreads || ForceSequential || NumThreads == 0) {
    runBlock(BC.getBinaryFunctions().begin(), BC.getBinaryFunctions().end(), 0);
    return;
  }

  // This lock is used to postpone task execution
  std::unique_lock<std::shared_timed_mutex> Lock(MainLock);

  // Estimate the overall runtime cost using the scheduling policy
  const unsigned TotalCost = estimateTotalCost(BC, SkipPredicate, SchedPolicy);

  // TasksPerThread may be zero; clamp so the division below is well defined.
  unsigned BlocksCount = TasksPerThread * NumThreads;
  if (BlocksCount == 0)
    BlocksCount = 1;
  const unsigned BlockCost =
      TotalCost > BlocksCount ? TotalCost / BlocksCount : 1;

  // Make sure an annotation allocator with the given id exists, creating it
  // if necessary. Ids are expected to be handed out consecutively.
  auto ensureAllocator = [&BC](unsigned WantedId) {
    if (BC.MIB->checkAllocatorExists(WantedId))
      return;
    auto Id = BC.MIB->initializeNewAnnotationAllocator();
    (void)Id; // only read by the assert below
    assert(WantedId == Id && "unexpected allocator id created");
  };

  // Divide work into blocks of equal cost
  ThreadPool &Pool = getThreadPool();
  auto BlockBegin = BC.getBinaryFunctions().begin();
  unsigned CurrentCost = 0;
  unsigned AllocId = 1;
  for (auto It = BC.getBinaryFunctions().begin();
       It != BC.getBinaryFunctions().end(); ++It) {
    auto &BF = It->second;
    CurrentCost += computeCostFor(BF, SkipPredicate, SchedPolicy);

    if (CurrentCost >= BlockCost) {
      ensureAllocator(AllocId);
      Pool.async(runBlock, BlockBegin, std::next(It), AllocId);
      AllocId++;
      BlockBegin = std::next(It);
      CurrentCost = 0;
    }
  }

  // Schedule whatever is left over; skip both the allocator and the task if
  // the last block closed exactly at the end of the map.
  if (BlockBegin != BC.getBinaryFunctions().end()) {
    ensureAllocator(AllocId);
    Pool.async(runBlock, BlockBegin, BC.getBinaryFunctions().end(), AllocId);
  }

  // Release the tasks and wait for them to finish.
  Lock.unlock();
  Pool.wait();
}
|
||||
|
||||
} // namespace ParallelUtilities
|
||||
} // namespace bolt
|
||||
} // namespace llvm
|
||||
78
src/ParallelUtilities.h
Normal file
78
src/ParallelUtilities.h
Normal file
@ -0,0 +1,78 @@
|
||||
//===-- ParallelUtilities.h - ----------------------------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
// This class creates an interface that can be used to run parallel tasks that
|
||||
// operate on functions. Several scheduling criteria are supported using
|
||||
// SchedulingPolicy, and are defined by how the runtime cost should be
|
||||
// estimated.
|
||||
// If the NoThreads flag is passed, work will execute sequentially.
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_TOOLS_LLVM_BOLT_PARALLEL_UTILITIES_H
|
||||
#define LLVM_TOOLS_LLVM_BOLT_PARALLEL_UTILITIES_H
|
||||
|
||||
#include "BinaryContext.h"
|
||||
#include "BinaryFunction.h"
|
||||
#include "MCPlusBuilder.h"
|
||||
#include "llvm/Support/ThreadPool.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
namespace opts {
|
||||
extern cl::opt<unsigned> ThreadCount;
|
||||
extern cl::opt<bool> NoThreads;
|
||||
extern cl::opt<unsigned> TaskCount;
|
||||
}
|
||||
|
||||
namespace llvm {
|
||||
namespace bolt {
|
||||
namespace ParallelUtilities {
|
||||
|
||||
using WorkFuncWithAllocTy =
|
||||
std::function<void(BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy)>;
|
||||
using WorkFuncTy = std::function<void(BinaryFunction &BF)>;
|
||||
using PredicateTy = std::function<bool(const BinaryFunction &BF)>;
|
||||
|
||||
/// Selects how the cost of a function is estimated when the function list is
/// divided into blocks of work.
enum SchedulingPolicy {
|
||||
SP_TRIVIAL, ///< cost is estimated by the number of functions
|
||||
SP_CONSTANT, ///< cost is estimated by the number of non-skipped functions
|
||||
SP_INST_LINEAR, ///< cost is estimated by inst count
|
||||
SP_INST_QUADRATIC, ///< cost is estimated by the square of the inst count
|
||||
SP_BB_LINEAR, ///< cost is estimated by BB count
|
||||
SP_BB_QUADRATIC, ///< cost is estimated by the square of the BB count
|
||||
};
|
||||
|
||||
/// Return the managed thread pool, initializing it if it is not yet initialized
|
||||
ThreadPool &getThreadPool();
|
||||
|
||||
/// Perform the work on each BinaryFunction except those that are accepted
|
||||
/// by SkipPredicate, scheduling heuristic is based on SchedPolicy.
|
||||
/// ForceSequential will selectively disable parallel execution and perform the
|
||||
/// work sequentially.
|
||||
void runOnEachFunction(BinaryContext &BC, SchedulingPolicy SchedPolicy,
|
||||
WorkFuncTy WorkFunction,
|
||||
PredicateTy SkipPredicate = PredicateTy(),
|
||||
std::string LogName = "", bool ForceSequential = false,
|
||||
unsigned TasksPerThread = opts::TaskCount);
|
||||
|
||||
/// Perform the work on each BinaryFunction except those that are accepted
|
||||
/// by SkipPredicate, and create a unique annotation allocator for each
|
||||
/// task. This should be used whenever the work function creates annotations to
|
||||
/// allow thread-safe annotation creation.
|
||||
/// ForceSequential will selectively disable parallel execution and perform the
|
||||
/// work sequentially.
|
||||
void runOnEachFunctionWithUniqueAllocId(
|
||||
BinaryContext &BC, SchedulingPolicy SchedPolicy,
|
||||
WorkFuncWithAllocTy WorkFunction, PredicateTy SkipPredicate,
|
||||
std::string LogName = "", bool ForceSequential = false,
|
||||
unsigned TasksPerThread = opts::TaskCount);
|
||||
|
||||
} // namespace ParallelUtilities
|
||||
} // namespace bolt
|
||||
} // namespace llvm
|
||||
#endif
|
||||
@ -10,6 +10,7 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "Aligner.h"
|
||||
#include "ParallelUtilities.h"
|
||||
|
||||
#define DEBUG_TYPE "bolt-aligner"
|
||||
|
||||
@ -88,16 +89,16 @@ void alignMaxBytes(BinaryFunction &Function) {
|
||||
// the function by not more than the minimum over
|
||||
// -- the size of the function
|
||||
// -- the specified number of bytes
|
||||
void alignCompact(BinaryFunction &Function) {
|
||||
void alignCompact(BinaryFunction &Function, const MCCodeEmitter *Emitter) {
|
||||
const auto &BC = Function.getBinaryContext();
|
||||
size_t HotSize = 0;
|
||||
size_t ColdSize = 0;
|
||||
|
||||
for (const auto *BB : Function.layout()) {
|
||||
if (BB->isCold())
|
||||
ColdSize += BC.computeCodeSize(BB->begin(), BB->end());
|
||||
ColdSize += BC.computeCodeSize(BB->begin(), BB->end(), Emitter);
|
||||
else
|
||||
HotSize += BC.computeCodeSize(BB->begin(), BB->end());
|
||||
HotSize += BC.computeCodeSize(BB->begin(), BB->end(), Emitter);
|
||||
}
|
||||
|
||||
Function.setAlignment(opts::AlignFunctions);
|
||||
@ -114,13 +115,15 @@ void alignCompact(BinaryFunction &Function) {
|
||||
|
||||
} // end anonymous namespace
|
||||
|
||||
void AlignerPass::alignBlocks(BinaryFunction &Function) {
|
||||
void AlignerPass::alignBlocks(BinaryFunction &Function,
|
||||
const MCCodeEmitter *Emitter) {
|
||||
if (!Function.hasValidProfile() || !Function.isSimple())
|
||||
return;
|
||||
|
||||
const auto &BC = Function.getBinaryContext();
|
||||
|
||||
const auto FuncCount = std::max(1UL, Function.getKnownExecutionCount());
|
||||
const auto FuncCount =
|
||||
std::max<uint64_t>(1, Function.getKnownExecutionCount());
|
||||
BinaryBasicBlock *PrevBB{nullptr};
|
||||
for (auto *BB : Function.layout()) {
|
||||
auto Count = BB->getKnownExecutionCount();
|
||||
@ -139,8 +142,9 @@ void AlignerPass::alignBlocks(BinaryFunction &Function) {
|
||||
if (Count < FTCount * 2)
|
||||
continue;
|
||||
|
||||
const auto BlockSize = BC.computeCodeSize(BB->begin(), BB->end());
|
||||
const auto BytesToUse = std::min(opts::BlockAlignment - 1UL, BlockSize);
|
||||
const auto BlockSize = BC.computeCodeSize(BB->begin(), BB->end(), Emitter);
|
||||
const auto BytesToUse =
|
||||
std::min<uint64_t>(opts::BlockAlignment - 1, BlockSize);
|
||||
|
||||
if (opts::AlignBlocksMinSize && BlockSize < opts::AlignBlocksMinSize)
|
||||
continue;
|
||||
@ -149,30 +153,36 @@ void AlignerPass::alignBlocks(BinaryFunction &Function) {
|
||||
BB->setAlignmentMaxBytes(BytesToUse);
|
||||
|
||||
// Update stats.
|
||||
AlignHistogram[BytesToUse]++;
|
||||
AlignedBlocksCount += BB->getKnownExecutionCount();
|
||||
DEBUG(
|
||||
std::unique_lock<std::shared_timed_mutex> Lock(AlignHistogramMtx);
|
||||
AlignHistogram[BytesToUse]++;
|
||||
AlignedBlocksCount += BB->getKnownExecutionCount();
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
void AlignerPass::runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) {
|
||||
void AlignerPass::runOnFunctions(BinaryContext &BC) {
|
||||
if (!BC.HasRelocations)
|
||||
return;
|
||||
|
||||
AlignHistogram.resize(opts::BlockAlignment);
|
||||
|
||||
for (auto &It : BFs) {
|
||||
auto &Function = It.second;
|
||||
ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
|
||||
// Create a separate MCCodeEmitter to allow lock free execution
|
||||
auto Emitter = BC.createIndependentMCCodeEmitter();
|
||||
|
||||
if (opts::UseCompactAligner)
|
||||
alignCompact(Function);
|
||||
alignCompact(BF, Emitter.MCE.get());
|
||||
else
|
||||
alignMaxBytes(Function);
|
||||
alignMaxBytes(BF);
|
||||
|
||||
if (opts::AlignBlocks && !opts::PreserveBlocksAlignment)
|
||||
alignBlocks(Function);
|
||||
}
|
||||
alignBlocks(BF, Emitter.MCE.get());
|
||||
};
|
||||
|
||||
ParallelUtilities::runOnEachFunction(
|
||||
BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun,
|
||||
ParallelUtilities::PredicateTy(nullptr), "AlignerPass");
|
||||
|
||||
DEBUG(
|
||||
dbgs() << "BOLT-DEBUG: max bytes per basic block alignment distribution:\n";
|
||||
|
||||
@ -19,15 +19,15 @@ namespace bolt {
|
||||
|
||||
class AlignerPass : public BinaryFunctionPass {
|
||||
private:
|
||||
|
||||
/// Stats for usage of max bytes for basic block alignment.
|
||||
std::vector<uint32_t> AlignHistogram;
|
||||
std::shared_timed_mutex AlignHistogramMtx;
|
||||
|
||||
/// Stats: execution count of blocks that were aligned.
|
||||
uint64_t AlignedBlocksCount{0};
|
||||
std::atomic<uint64_t> AlignedBlocksCount{0};
|
||||
|
||||
/// Assign alignment to basic blocks based on profile.
|
||||
void alignBlocks(BinaryFunction &Function);
|
||||
void alignBlocks(BinaryFunction &Function, const MCCodeEmitter *Emitter);
|
||||
|
||||
public:
|
||||
explicit AlignerPass() : BinaryFunctionPass(false) {}
|
||||
@ -37,9 +37,7 @@ public:
|
||||
}
|
||||
|
||||
/// Pass entry point
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
} // namespace bolt
|
||||
|
||||
@ -100,14 +100,13 @@ void AllocCombinerPass::combineAdjustments(BinaryContext &BC,
|
||||
}
|
||||
}
|
||||
|
||||
void AllocCombinerPass::runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) {
|
||||
void AllocCombinerPass::runOnFunctions(BinaryContext &BC) {
|
||||
if (opts::FrameOptimization == FOP_NONE)
|
||||
return;
|
||||
|
||||
runForAllWeCare(
|
||||
BFs, [&](BinaryFunction &Function) { combineAdjustments(BC, Function); });
|
||||
BC.getBinaryFunctions(),
|
||||
[&](BinaryFunction &Function) { combineAdjustments(BC, Function); });
|
||||
|
||||
outs() << "BOLT-INFO: Allocation combiner: " << NumCombined
|
||||
<< " empty spaces coalesced.\n";
|
||||
|
||||
@ -40,9 +40,7 @@ public:
|
||||
}
|
||||
|
||||
/// Pass entry point
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
} // namespace bolt
|
||||
|
||||
@ -77,7 +77,6 @@ std::deque<BinaryFunction *> BinaryFunctionCallGraph::buildTraversalOrder() {
|
||||
}
|
||||
|
||||
BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
CgFilterFunction Filter,
|
||||
bool CgFromPerfData,
|
||||
bool IncludeColdCalls,
|
||||
@ -126,7 +125,7 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC,
|
||||
uint64_t NoProfileCallsites = 0;
|
||||
uint64_t NumFallbacks = 0;
|
||||
uint64_t RecursiveCallsites = 0;
|
||||
for (auto &It : BFs) {
|
||||
for (auto &It : BC.getBinaryFunctions()) {
|
||||
auto *Function = &It.second;
|
||||
|
||||
if (Filter(*Function)) {
|
||||
|
||||
@ -57,7 +57,7 @@ private:
|
||||
using CgFilterFunction = std::function<bool (const BinaryFunction &BF)>;
|
||||
inline bool NoFilter(const BinaryFunction &) { return false; }
|
||||
|
||||
/// Builds a call graph from the map of BinaryFunctions provided in BFs.
|
||||
/// Builds a call graph from the map of BinaryFunctions provided in BC.
|
||||
/// The arguments control how the graph is constructed.
|
||||
/// Filter is called on each function, any function that it returns true for
|
||||
/// is omitted from the graph.
|
||||
@ -68,7 +68,6 @@ inline bool NoFilter(const BinaryFunction &) { return false; }
|
||||
/// UseEdgeCounts is used to control if the Weight attribute on Arcs is computed
|
||||
/// using the number of calls.
|
||||
BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
CgFilterFunction Filter = NoFilter,
|
||||
bool CgFromPerfData = false,
|
||||
bool IncludeColdCalls = true,
|
||||
|
||||
@ -10,9 +10,12 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "BinaryPasses.h"
|
||||
#include "ParallelUtilities.h"
|
||||
#include "Passes/ReorderAlgorithm.h"
|
||||
#include "llvm/Support/Options.h"
|
||||
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
#define DEBUG_TYPE "bolt-opts"
|
||||
|
||||
@ -54,8 +57,10 @@ extern cl::OptionCategory BoltOptCategory;
|
||||
extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion;
|
||||
extern cl::opt<unsigned> Verbosity;
|
||||
extern cl::opt<bool> SplitEH;
|
||||
extern cl::opt<bool> EnableBAT;
|
||||
extern cl::opt<bolt::BinaryFunction::SplittingType> SplitFunctions;
|
||||
extern bool shouldProcess(const bolt::BinaryFunction &Function);
|
||||
extern bool isHotTextMover(const bolt::BinaryFunction &Function);
|
||||
|
||||
enum DynoStatsSortOrder : char {
|
||||
Ascending,
|
||||
@ -134,6 +139,22 @@ PrintSortedBy("print-sorted-by",
|
||||
cl::ZeroOrMore,
|
||||
cl::cat(BoltOptCategory));
|
||||
|
||||
static cl::opt<bool>
|
||||
PrintUnknown("print-unknown",
|
||||
cl::desc("print names of functions with unknown control flow"),
|
||||
cl::init(false),
|
||||
cl::ZeroOrMore,
|
||||
cl::cat(BoltCategory),
|
||||
cl::Hidden);
|
||||
|
||||
static cl::opt<bool>
|
||||
PrintUnknownCFG("print-unknown-cfg",
|
||||
cl::desc("dump CFG of functions with unknown control flow"),
|
||||
cl::init(false),
|
||||
cl::ZeroOrMore,
|
||||
cl::cat(BoltCategory),
|
||||
cl::ReallyHidden);
|
||||
|
||||
static cl::opt<bolt::ReorderBasicBlocks::LayoutType>
|
||||
ReorderBlocks("reorder-blocks",
|
||||
cl::desc("change layout of basic blocks in a function"),
|
||||
@ -267,7 +288,7 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) {
|
||||
if (!BB->isValid()) {
|
||||
dbgs() << "BOLT-INFO: UCE found unreachable block " << BB->getName()
|
||||
<< " in function " << Function << "\n";
|
||||
BB->dump();
|
||||
Function.dump();
|
||||
}
|
||||
}
|
||||
});
|
||||
@ -275,7 +296,10 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) {
|
||||
DeletedBlocks += Count;
|
||||
DeletedBytes += Bytes;
|
||||
if (Count) {
|
||||
Modified.insert(&Function);
|
||||
{
|
||||
std::unique_lock<std::shared_timed_mutex> Lock(ModifiedMtx);
|
||||
Modified.insert(&Function);
|
||||
}
|
||||
if (opts::Verbosity > 0) {
|
||||
outs() << "BOLT-INFO: Removed " << Count
|
||||
<< " dead basic block(s) accounting for " << Bytes
|
||||
@ -285,17 +309,19 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) {
|
||||
}
|
||||
}
|
||||
|
||||
void EliminateUnreachableBlocks::runOnFunctions(
|
||||
BinaryContext&,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &
|
||||
) {
|
||||
for (auto &It : BFs) {
|
||||
auto &Function = It.second;
|
||||
if (shouldOptimize(Function)) {
|
||||
runOnFunction(Function);
|
||||
}
|
||||
}
|
||||
void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
|
||||
ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
|
||||
runOnFunction(BF);
|
||||
};
|
||||
|
||||
ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
|
||||
return !shouldOptimize(BF);
|
||||
};
|
||||
|
||||
ParallelUtilities::runOnEachFunction(
|
||||
BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun, SkipFunc,
|
||||
"EliminateUnreachableBlocks");
|
||||
|
||||
outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
|
||||
<< DeletedBytes << " bytes of code.\n";
|
||||
}
|
||||
@ -305,43 +331,43 @@ bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const {
|
||||
opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE);
|
||||
}
|
||||
|
||||
void ReorderBasicBlocks::runOnFunctions(
|
||||
BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) {
|
||||
void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
|
||||
if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
|
||||
return;
|
||||
|
||||
IsAArch64 = BC.isAArch64();
|
||||
std::atomic<uint64_t> ModifiedFuncCount{0};
|
||||
|
||||
uint64_t ModifiedFuncCount = 0;
|
||||
for (auto &It : BFs) {
|
||||
auto &Function = It.second;
|
||||
|
||||
if (!shouldOptimize(Function))
|
||||
continue;
|
||||
|
||||
ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
|
||||
const bool ShouldSplit =
|
||||
(opts::SplitFunctions == BinaryFunction::ST_ALL) ||
|
||||
(opts::SplitFunctions == BinaryFunction::ST_EH &&
|
||||
Function.hasEHRanges()) ||
|
||||
(LargeFunctions.find(It.first) != LargeFunctions.end());
|
||||
modifyFunctionLayout(Function, opts::ReorderBlocks, opts::MinBranchClusters,
|
||||
(opts::SplitFunctions == BinaryFunction::ST_ALL) ||
|
||||
(opts::SplitFunctions == BinaryFunction::ST_EH && BF.hasEHRanges()) ||
|
||||
BF.shouldSplit();
|
||||
modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters,
|
||||
ShouldSplit);
|
||||
|
||||
if (Function.hasLayoutChanged()) {
|
||||
if (BF.hasLayoutChanged()) {
|
||||
++ModifiedFuncCount;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
|
||||
return !shouldOptimize(BF);
|
||||
};
|
||||
|
||||
ParallelUtilities::runOnEachFunction(
|
||||
BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
|
||||
"ReorderBasicBlocks");
|
||||
|
||||
outs() << "BOLT-INFO: basic block reordering modified layout of "
|
||||
<< format("%zu (%.2lf%%) functions\n",
|
||||
ModifiedFuncCount, 100.0 * ModifiedFuncCount / BFs.size());
|
||||
<< format("%zu (%.2lf%%) functions\n", ModifiedFuncCount.load(),
|
||||
100.0 * ModifiedFuncCount.load() /
|
||||
BC.getBinaryFunctions().size());
|
||||
|
||||
if (opts::PrintFuncStat > 0) {
|
||||
raw_ostream &OS = outs();
|
||||
// Copy all the values into vector in order to sort them
|
||||
std::map<uint64_t, BinaryFunction &> ScoreMap;
|
||||
auto &BFs = BC.getBinaryFunctions();
|
||||
for (auto It = BFs.begin(); It != BFs.end(); ++It) {
|
||||
ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>(
|
||||
It->second.getFunctionScore(), It->second));
|
||||
@ -349,8 +375,8 @@ void ReorderBasicBlocks::runOnFunctions(
|
||||
|
||||
OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
|
||||
OS << " There are " << BFs.size() << " functions in total. \n";
|
||||
OS << " Number of functions being modified: " << ModifiedFuncCount
|
||||
<< "\n";
|
||||
OS << " Number of functions being modified: "
|
||||
<< ModifiedFuncCount.load() << "\n";
|
||||
OS << " User asks for detailed information on top "
|
||||
<< opts::PrintFuncStat << " functions. (Ranked by function score)"
|
||||
<< "\n\n";
|
||||
@ -550,11 +576,8 @@ void ReorderBasicBlocks::splitFunction(BinaryFunction &BF) const {
|
||||
}
|
||||
}
|
||||
|
||||
void FixupBranches::runOnFunctions(
|
||||
BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &) {
|
||||
for (auto &It : BFs) {
|
||||
void FixupBranches::runOnFunctions(BinaryContext &BC) {
|
||||
for (auto &It : BC.getBinaryFunctions()) {
|
||||
auto &Function = It.second;
|
||||
if (BC.HasRelocations || shouldOptimize(Function)) {
|
||||
if (BC.HasRelocations && !Function.isSimple())
|
||||
@ -564,42 +587,38 @@ void FixupBranches::runOnFunctions(
|
||||
}
|
||||
}
|
||||
|
||||
void FinalizeFunctions::runOnFunctions(
|
||||
BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &
|
||||
) {
|
||||
for (auto &It : BFs) {
|
||||
auto &Function = It.second;
|
||||
const auto ShouldOptimize = shouldOptimize(Function);
|
||||
|
||||
// Always fix functions in relocation mode.
|
||||
if (!BC.HasRelocations && !ShouldOptimize)
|
||||
continue;
|
||||
|
||||
// Fix the CFI state.
|
||||
if (ShouldOptimize && !Function.finalizeCFIState()) {
|
||||
void FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
|
||||
ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
|
||||
if (shouldOptimize(BF) && !BF.finalizeCFIState()) {
|
||||
if (BC.HasRelocations) {
|
||||
errs() << "BOLT-ERROR: unable to fix CFI state for function "
|
||||
<< Function << ". Exiting.\n";
|
||||
errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
|
||||
<< ". Exiting.\n";
|
||||
exit(1);
|
||||
}
|
||||
Function.setSimple(false);
|
||||
continue;
|
||||
BF.setSimple(false);
|
||||
return;
|
||||
}
|
||||
|
||||
Function.setFinalized();
|
||||
BF.setFinalized();
|
||||
|
||||
// Update exception handling information.
|
||||
Function.updateEHRanges();
|
||||
}
|
||||
BF.updateEHRanges();
|
||||
};
|
||||
|
||||
ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
|
||||
return !BC.HasRelocations && !shouldOptimize(BF);
|
||||
};
|
||||
|
||||
ParallelUtilities::runOnEachFunction(
|
||||
BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
|
||||
SkipPredicate, "FinalizeFunctions");
|
||||
}
|
||||
|
||||
void LowerAnnotations::runOnFunctions(
|
||||
BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &) {
|
||||
for (auto &It : BFs) {
|
||||
void LowerAnnotations::runOnFunctions(BinaryContext &BC) {
|
||||
std::vector<std::pair<MCInst *, uint64_t>> PreservedSDTAnnotations;
|
||||
std::vector<std::pair<MCInst *, uint32_t>> PreservedOffsetAnnotations;
|
||||
|
||||
for (auto &It : BC.getBinaryFunctions()) {
|
||||
auto &BF = It.second;
|
||||
int64_t CurrentGnuArgsSize = 0;
|
||||
|
||||
@ -612,9 +631,12 @@ void LowerAnnotations::runOnFunctions(
|
||||
CurrentGnuArgsSize = 0;
|
||||
}
|
||||
|
||||
for (auto II = BB->begin(); II != BB->end(); ++II) {
|
||||
// Convert GnuArgsSize annotations into CFIs.
|
||||
if (BF.usesGnuArgsSize() && BC.MIB->isInvoke(*II)) {
|
||||
// First convert GnuArgsSize annotations into CFIs. This may change instr
|
||||
// pointers, so do it before recording ptrs for preserved annotations
|
||||
if (BF.usesGnuArgsSize()) {
|
||||
for (auto II = BB->begin(); II != BB->end(); ++II) {
|
||||
if (!BC.MIB->isInvoke(*II))
|
||||
continue;
|
||||
const auto NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II);
|
||||
assert(NewGnuArgsSize >= 0 && "expected non-negative GNU_args_size");
|
||||
if (NewGnuArgsSize != CurrentGnuArgsSize) {
|
||||
@ -624,13 +646,33 @@ void LowerAnnotations::runOnFunctions(
|
||||
II = std::next(InsertII);
|
||||
}
|
||||
}
|
||||
BC.MIB->removeAllAnnotations(*II);
|
||||
}
|
||||
|
||||
// Now record preserved annotations separately and then strip annotations
|
||||
for (auto II = BB->begin(); II != BB->end(); ++II) {
|
||||
if (BC.MIB->hasAnnotation(*II, "SDTMarker")) {
|
||||
PreservedSDTAnnotations.push_back(std::make_pair(
|
||||
&(*II), BC.MIB->getAnnotationAs<uint64_t>(*II, "SDTMarker")));
|
||||
}
|
||||
|
||||
if (opts::EnableBAT && BC.MIB->hasAnnotation(*II, "Offset")) {
|
||||
PreservedOffsetAnnotations.push_back(std::make_pair(
|
||||
&(*II), BC.MIB->getAnnotationAs<uint32_t>(*II, "Offset")));
|
||||
}
|
||||
|
||||
BC.MIB->stripAnnotations(*II);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Release all memory taken by annotations.
|
||||
// Release all memory taken by annotations
|
||||
BC.MIB->freeAnnotations();
|
||||
|
||||
// Reinsert preserved annotations we need during code emission.
|
||||
for (const auto &Item : PreservedSDTAnnotations)
|
||||
BC.MIB->addAnnotation<uint64_t>(*Item.first, "SDTMarker", Item.second);
|
||||
for (const auto &Item : PreservedOffsetAnnotations)
|
||||
BC.MIB->addAnnotation<uint32_t>(*Item.first, "Offset", Item.second);
|
||||
}
|
||||
|
||||
namespace {
|
||||
@ -984,15 +1026,11 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC,
|
||||
return NumLocalCTCs > 0;
|
||||
}
|
||||
|
||||
void SimplifyConditionalTailCalls::runOnFunctions(
|
||||
BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &
|
||||
) {
|
||||
void SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
|
||||
if (!BC.isX86())
|
||||
return;
|
||||
|
||||
for (auto &It : BFs) {
|
||||
for (auto &It : BC.getBinaryFunctions()) {
|
||||
auto &Function = It.second;
|
||||
|
||||
if (!shouldOptimize(Function))
|
||||
@ -1080,9 +1118,7 @@ void Peepholes::removeUselessCondBranches(BinaryContext &BC,
|
||||
}
|
||||
}
|
||||
|
||||
void Peepholes::runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) {
|
||||
void Peepholes::runOnFunctions(BinaryContext &BC) {
|
||||
const char Opts =
|
||||
std::accumulate(opts::Peepholes.begin(),
|
||||
opts::Peepholes.end(),
|
||||
@ -1093,7 +1129,7 @@ void Peepholes::runOnFunctions(BinaryContext &BC,
|
||||
if (Opts == opts::PEEP_NONE || !BC.isX86())
|
||||
return;
|
||||
|
||||
for (auto &It : BFs) {
|
||||
for (auto &It : BC.getBinaryFunctions()) {
|
||||
auto &Function = It.second;
|
||||
if (shouldOptimize(Function)) {
|
||||
if (Opts & opts::PEEP_SHORTEN)
|
||||
@ -1197,12 +1233,8 @@ bool SimplifyRODataLoads::simplifyRODataLoads(
|
||||
return NumLocalLoadsSimplified > 0;
|
||||
}
|
||||
|
||||
void SimplifyRODataLoads::runOnFunctions(
|
||||
BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &
|
||||
) {
|
||||
for (auto &It : BFs) {
|
||||
void SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
|
||||
for (auto &It : BC.getBinaryFunctions()) {
|
||||
auto &Function = It.second;
|
||||
if (shouldOptimize(Function) && simplifyRODataLoads(BC, Function)) {
|
||||
Modified.insert(&Function);
|
||||
@ -1216,24 +1248,156 @@ void SimplifyRODataLoads::runOnFunctions(
|
||||
<< "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n";
|
||||
}
|
||||
|
||||
/// Assign output code section names to all functions known to \p BC.
///
/// Injected functions always receive the injected (hot and cold) section
/// names. The remaining functions are only touched in relocation mode.
void AssignSections::runOnFunctions(BinaryContext &BC) {
  for (auto *Injected : BC.getInjectedBinaryFunctions()) {
    Injected->setCodeSectionName(BC.getInjectedCodeSectionName());
    Injected->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
  }

  // In non-relocation mode functions have pre-assigned section names.
  if (!BC.HasRelocations)
    return;

  // The cold section is only used when at least one function has a profile.
  const bool HaveProfiledFuncs = BC.NumProfiledFuncs > 0;

  for (auto &Entry : BC.getBinaryFunctions()) {
    auto &BF = Entry.second;

    if (opts::isHotTextMover(BF)) {
      // Both parts of a hot-text mover live in the dedicated mover section.
      BF.setCodeSectionName(BC.getHotTextMoverSectionName());
      BF.setColdCodeSectionName(BC.getHotTextMoverSectionName());
    } else {
      // A function is placed into the cold section only if a cold section is
      // in use and the function has neither a valid index nor a valid profile.
      const bool PlaceInCold =
          HaveProfiledFuncs && !BF.hasValidIndex() && !BF.hasValidProfile();
      if (PlaceInCold)
        BF.setCodeSectionName(BC.getColdCodeSectionName());
      else
        BF.setCodeSectionName(BC.getMainCodeSectionName());

      if (BF.isSplit())
        BF.setColdCodeSectionName(BC.getColdCodeSectionName());
    }
  }
}
|
||||
|
||||
/// Compute and print the profile bias score for all functions in \p BC.
///
/// For every non-empty simple function, the incoming and outgoing edge counts
/// of each basic block are summed up. The relative imbalance of a block is
/// |outgoing - incoming| / incoming. The pass prints the mean of that value
/// over all scored blocks together with its standard deviation and, with
/// verbosity >= 1, the function with the worst per-function average.
void PrintProfileStats::runOnFunctions(BinaryContext &BC) {
  double FlowImbalanceMean = 0.0;
  size_t NumBlocksConsidered = 0;
  double WorstBias = 0.0;
  const BinaryFunction *WorstBiasFunc = nullptr;

  // For each function CFG, we fill an IncomingMap with the sum of the frequency
  // of incoming edges for each BB. Likewise for each OutgoingMap and the sum
  // of the frequency of outgoing edges.
  using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;

  // Do not compute score for low frequency blocks, entry or exit blocks.
  // The same predicate is used by the mean and the deviation pass so that both
  // are computed over exactly the same set of blocks.
  auto isScoredBlock = [](const BinaryBasicBlock &BB, uint64_t Incoming,
                          uint64_t Outgoing) {
    return Incoming >= 100 && Outgoing != 0 && !BB.isEntryPoint();
  };

  // Relative flow imbalance of a block. Only called for scored blocks, for
  // which Incoming is at least 100 and therefore non-zero.
  auto relativeImbalance = [](uint64_t Incoming, uint64_t Outgoing) {
    const double Difference = static_cast<double>(Outgoing) - Incoming;
    return fabs(Difference / Incoming);
  };

  // Compute mean
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;

    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const auto &BB : Function) {
      uint64_t TotalOutgoing = 0;
      // Branch info is walked in lock-step with the successor list.
      auto SuccBIIter = BB.branch_info_begin();
      for (auto Succ : BB.successors()) {
        const auto Count = SuccBIIter->Count;
        ++SuccBIIter;
        if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0)
          continue;
        TotalOutgoing += Count;
        IncomingMap[Succ] += Count;
      }
      OutgoingMap[&BB] = TotalOutgoing;
    }

    size_t NumBlocks = 0;
    double Mean = 0.0;
    for (const auto &BB : Function) {
      const uint64_t Incoming = IncomingMap[&BB];
      const uint64_t Outgoing = OutgoingMap[&BB];
      if (!isScoredBlock(BB, Incoming, Outgoing))
        continue;
      ++NumBlocks;
      Mean += relativeImbalance(Incoming, Outgoing);
    }

    FlowImbalanceMean += Mean;
    NumBlocksConsidered += NumBlocks;
    if (!NumBlocks)
      continue;

    // Track the function with the largest average imbalance.
    const double FuncMean = Mean / NumBlocks;
    if (FuncMean > WorstBias) {
      WorstBias = FuncMean;
      WorstBiasFunc = &Function;
    }
  }
  if (NumBlocksConsidered > 0)
    FlowImbalanceMean /= NumBlocksConsidered;

  // Compute standard deviation
  NumBlocksConsidered = 0;
  double FlowImbalanceVar = 0.0;
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;

    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const auto &BB : Function) {
      const uint64_t Incoming = IncomingMap[&BB];
      const uint64_t Outgoing = OutgoingMap[&BB];
      if (!isScoredBlock(BB, Incoming, Outgoing))
        continue;
      ++NumBlocksConsidered;
      const double Deviation =
          relativeImbalance(Incoming, Outgoing) - FlowImbalanceMean;
      FlowImbalanceVar += Deviation * Deviation;
    }
  }

  double FlowImbalanceStdDev = 0.0;
  if (NumBlocksConsidered)
    FlowImbalanceStdDev = sqrt(FlowImbalanceVar / NumBlocksConsidered);

  // Report to user
  outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
                   (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceStdDev));
  if (WorstBiasFunc && opts::Verbosity >= 1) {
    outs() << "Worst average bias observed in " << WorstBiasFunc->getPrintName()
           << "\n";
    DEBUG(WorstBiasFunc->dump());
  }
}
|
||||
|
||||
void
|
||||
PrintProgramStats::runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &) {
|
||||
PrintProgramStats::runOnFunctions(BinaryContext &BC) {
|
||||
uint64_t NumSimpleFunctions{0};
|
||||
uint64_t NumStaleProfileFunctions{0};
|
||||
uint64_t NumNonSimpleProfiledFunctions{0};
|
||||
uint64_t NumUnknownControlFlowFunctions{0};
|
||||
std::vector<BinaryFunction *> ProfiledFunctions;
|
||||
const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
|
||||
for (auto &BFI : BFs) {
|
||||
for (auto &BFI : BC.getBinaryFunctions()) {
|
||||
auto &Function = BFI.second;
|
||||
if (!Function.isSimple()) {
|
||||
if (Function.hasProfile()) {
|
||||
if (Function.hasProfile() && !Function.isPLTFunction()) {
|
||||
++NumNonSimpleProfiledFunctions;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
++NumSimpleFunctions;
|
||||
if (Function.hasUnknownControlFlow()) {
|
||||
if (opts::PrintUnknownCFG) {
|
||||
Function.dump();
|
||||
} else if (opts::PrintUnknown) {
|
||||
errs() << "function with unknown control flow: " << Function <<'\n';
|
||||
}
|
||||
++NumUnknownControlFlowFunctions;
|
||||
}
|
||||
if (!Function.hasProfile())
|
||||
continue;
|
||||
if (Function.hasValidProfile()) {
|
||||
@ -1321,11 +1485,11 @@ PrintProgramStats::runOnFunctions(BinaryContext &BC,
|
||||
std::vector<const BinaryFunction *> Functions;
|
||||
std::map<const BinaryFunction *, DynoStats> Stats;
|
||||
|
||||
for (const auto &BFI : BFs) {
|
||||
for (const auto &BFI : BC.getBinaryFunctions()) {
|
||||
const auto &BF = BFI.second;
|
||||
if (shouldOptimize(BF) && BF.hasValidProfile()) {
|
||||
Functions.push_back(&BF);
|
||||
Stats.emplace(&BF, BF.getDynoStats());
|
||||
Stats.emplace(&BF, getDynoStats(BF));
|
||||
}
|
||||
}
|
||||
|
||||
@ -1377,7 +1541,7 @@ PrintProgramStats::runOnFunctions(BinaryContext &BC,
|
||||
outs() << " are:\n";
|
||||
auto SFI = Functions.begin();
|
||||
for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) {
|
||||
const auto Stats = (*SFI)->getDynoStats();
|
||||
const auto Stats = getDynoStats(**SFI);
|
||||
outs() << " " << **SFI;
|
||||
if (!SortAll) {
|
||||
outs() << " (";
|
||||
@ -1427,12 +1591,13 @@ PrintProgramStats::runOnFunctions(BinaryContext &BC,
|
||||
// Collect and print information about suboptimal code layout on input.
|
||||
if (opts::ReportBadLayout) {
|
||||
std::vector<const BinaryFunction *> SuboptimalFuncs;
|
||||
for (auto &BFI : BFs) {
|
||||
for (auto &BFI : BC.getBinaryFunctions()) {
|
||||
const auto &BF = BFI.second;
|
||||
if (!BF.hasValidProfile())
|
||||
continue;
|
||||
|
||||
const auto HotThreshold = std::max(BF.getKnownExecutionCount(), 1UL);
|
||||
const auto HotThreshold =
|
||||
std::max<uint64_t>(BF.getKnownExecutionCount(), 1);
|
||||
bool HotSeen = false;
|
||||
for (const auto *BB : BF.rlayout()) {
|
||||
if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) {
|
||||
@ -1463,13 +1628,19 @@ PrintProgramStats::runOnFunctions(BinaryContext &BC,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (NumUnknownControlFlowFunctions) {
|
||||
outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
|
||||
<< " functions have instructions with unknown control flow";
|
||||
if (!opts::PrintUnknown) {
|
||||
outs() << ". Use -print-unknown to see the list.";
|
||||
}
|
||||
outs() << '\n';
|
||||
}
|
||||
}
|
||||
|
||||
void InstructionLowering::runOnFunctions(
|
||||
BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) {
|
||||
for (auto &BFI : BFs) {
|
||||
void InstructionLowering::runOnFunctions(BinaryContext &BC) {
|
||||
for (auto &BFI : BC.getBinaryFunctions()) {
|
||||
for (auto &BB : BFI.second) {
|
||||
for (auto &Instruction : BB) {
|
||||
BC.MIB->lowerTailCall(Instruction);
|
||||
@ -1478,13 +1649,10 @@ void InstructionLowering::runOnFunctions(
|
||||
}
|
||||
}
|
||||
|
||||
void StripRepRet::runOnFunctions(
|
||||
BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) {
|
||||
void StripRepRet::runOnFunctions(BinaryContext &BC) {
|
||||
uint64_t NumPrefixesRemoved = 0;
|
||||
uint64_t NumBytesSaved = 0;
|
||||
for (auto &BFI : BFs) {
|
||||
for (auto &BFI : BC.getBinaryFunctions()) {
|
||||
for (auto &BB : BFI.second) {
|
||||
auto LastInstRIter = BB.getLastNonPseudo();
|
||||
if (LastInstRIter == BB.rend() ||
|
||||
@ -1504,17 +1672,15 @@ void StripRepRet::runOnFunctions(
|
||||
}
|
||||
}
|
||||
|
||||
void InlineMemcpy::runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) {
|
||||
void InlineMemcpy::runOnFunctions(BinaryContext &BC) {
|
||||
if (!BC.isX86())
|
||||
return;
|
||||
|
||||
uint64_t NumInlined = 0;
|
||||
uint64_t NumInlinedDyno = 0;
|
||||
for (auto &BFI : BFs) {
|
||||
for (auto &BFI : BC.getBinaryFunctions()) {
|
||||
for (auto &BB : BFI.second) {
|
||||
for(auto II = BB.begin(); II != BB.end(); ++II) {
|
||||
for (auto II = BB.begin(); II != BB.end(); ++II) {
|
||||
auto &Inst = *II;
|
||||
|
||||
if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
|
||||
@ -1554,5 +1720,139 @@ void InlineMemcpy::runOnFunctions(BinaryContext &BC,
|
||||
}
|
||||
}
|
||||
|
||||
bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const {
|
||||
if (!BinaryFunctionPass::shouldOptimize(Function))
|
||||
return false;
|
||||
|
||||
for (auto &FunctionSpec : Spec) {
|
||||
auto FunctionName = StringRef(FunctionSpec).split(':').first;
|
||||
if (Function.hasNameRegex(FunctionName))
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
std::set<size_t>
|
||||
SpecializeMemcpy1::getCallSitesToOptimize(const BinaryFunction &Function) const{
|
||||
StringRef SitesString;
|
||||
for (auto &FunctionSpec : Spec) {
|
||||
StringRef FunctionName;
|
||||
std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':');
|
||||
if (Function.hasNameRegex(FunctionName))
|
||||
break;
|
||||
SitesString = "";
|
||||
}
|
||||
|
||||
std::set<size_t> Sites;
|
||||
SmallVector<StringRef, 4> SitesVec;
|
||||
SitesString.split(SitesVec, ':');
|
||||
for (auto SiteString : SitesVec) {
|
||||
if (SiteString.empty())
|
||||
continue;
|
||||
size_t Result;
|
||||
if (!SiteString.getAsInteger(10, Result))
|
||||
Sites.emplace(Result);
|
||||
}
|
||||
|
||||
return Sites;
|
||||
}
|
||||
|
||||
/// Specialize selected memcpy() call sites for a copy size of 1 (x86 only).
///
/// Each selected call site is preceded by a compare-and-jump on integer
/// argument register 2 against 1. The jump targets a block holding the code
/// returned by createOneByteMemcpy(), while the original call is moved into a
/// newly added block. Non-tail calls to "memcpy" / "memcpy@PLT" are numbered
/// starting from 1 per function, and an empty site list selects all of them.
void SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return;

  // Direct, non-tail call to memcpy with a single symbolic operand.
  auto isMemcpyCallSite = [&](const MCInst &Inst) {
    if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
        !Inst.getOperand(0).isExpr())
      return false;

    const auto *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
    if (CalleeSymbol->getName() != "memcpy" &&
        CalleeSymbol->getName() != "memcpy@PLT")
      return false;

    return !BC.MIB->isTailCall(Inst);
  };

  uint64_t NumSpecialized = 0;
  uint64_t NumSpecializedDyno = 0;
  for (auto &Entry : BC.getBinaryFunctions()) {
    auto &BF = Entry.second;
    if (!shouldOptimize(BF))
      continue;

    const auto SelectedSites = getCallSitesToOptimize(BF);
    auto isSelectedSite = [&](size_t N) {
      return SelectedSites.empty() || SelectedSites.count(N);
    };

    // Snapshot the block list: new blocks are added while iterating.
    std::vector<BinaryBasicBlock *> Worklist(BF.pbegin(), BF.pend());
    size_t SiteID = 0;
    for (auto *CurBB : Worklist) {
      for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
        if (!isMemcpyCallSite(*II))
          continue;

        ++SiteID;

        if (!isSelectedSite(SiteID))
          continue;

        // Create a copy of a call to memcpy(dest, src, size). This has to
        // happen before the block is split.
        auto CallCopy = *II;

        auto *FastPathBB = CurBB->splitAt(II);

        BinaryBasicBlock *TailBB{nullptr};
        if (FastPathBB->getNumNonPseudos() > 1) {
          TailBB = FastPathBB->splitAt(FastPathBB->begin());
          TailBB->eraseInstruction(TailBB->begin());
        } else {
          TailBB = FastPathBB->getSuccessor();
          FastPathBB->eraseInstruction(FastPathBB->begin());
          assert(TailBB && "unexpected call to memcpy() with no return");
        }

        auto *SlowPathBB = BF.addBasicBlock(CurBB->getInputOffset());
        auto SizeCheck = BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2),
                                             1,
                                             FastPathBB->getLabel(),
                                             BC.Ctx.get());
        CurBB->addInstructions(SizeCheck);
        CurBB->addSuccessor(SlowPathBB);

        SlowPathBB->addInstruction(std::move(CallCopy));
        SlowPathBB->addSuccessor(TailBB);
        SlowPathBB->setCFIState(TailBB->getCFIState());
        SlowPathBB->setExecutionCount(0);

        // To prevent the actual call from being moved to cold, we set its
        // execution count to 1.
        if (CurBB->getKnownExecutionCount() > 0)
          SlowPathBB->setExecutionCount(1);

        auto FastPathCode = BC.MIB->createOneByteMemcpy();
        FastPathBB->addInstructions(FastPathCode);

        ++NumSpecialized;
        NumSpecializedDyno += CurBB->getKnownExecutionCount();

        // Continue scanning in the block that follows the rewritten call.
        CurBB = TailBB;

        // Note: we don't expect the next instruction to be a call to memcpy.
        II = CurBB->begin();
      }
    }
  }

  if (!NumSpecialized)
    return;

  outs() << "BOLT-INFO: specialized " << NumSpecialized
         << " memcpy() call sites for size 1";
  if (NumSpecializedDyno)
    outs() << ". The calls were executed " << NumSpecializedDyno
           << " times based on profile.";
  outs() << '\n';
}
|
||||
|
||||
} // namespace bolt
|
||||
} // namespace llvm
|
||||
|
||||
@ -16,9 +16,10 @@
|
||||
|
||||
#include "BinaryContext.h"
|
||||
#include "BinaryFunction.h"
|
||||
#include "DynoStats.h"
|
||||
#include "HFSort.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
|
||||
#include <atomic>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <string>
|
||||
@ -38,7 +39,7 @@ protected:
|
||||
|
||||
/// Control whether a specific function should be skipped during
|
||||
/// optimization.
|
||||
bool shouldOptimize(const BinaryFunction &BF) const;
|
||||
virtual bool shouldOptimize(const BinaryFunction &BF) const;
|
||||
public:
|
||||
virtual ~BinaryFunctionPass() = default;
|
||||
|
||||
@ -53,9 +54,7 @@ public:
|
||||
virtual bool shouldPrint(const BinaryFunction &BF) const;
|
||||
|
||||
/// Execute this pass on the given functions.
|
||||
virtual void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) = 0;
|
||||
virtual void runOnFunctions(BinaryContext &BC) = 0;
|
||||
};
|
||||
|
||||
/// A pass to print program-wide dynostats.
|
||||
@ -79,10 +78,8 @@ public:
|
||||
return false;
|
||||
}
|
||||
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override {
|
||||
const auto NewDynoStats = getDynoStats(BFs);
|
||||
void runOnFunctions(BinaryContext &BC) override {
|
||||
const auto NewDynoStats = getDynoStats(BC.getBinaryFunctions());
|
||||
const auto Changed = (NewDynoStats != PrevDynoStats);
|
||||
outs() << "BOLT-INFO: program-wide dynostats "
|
||||
<< Title << (Changed ? "" : " (no change)") << ":\n\n"
|
||||
@ -98,9 +95,10 @@ public:
|
||||
/// Detect and eliminate unreachable basic blocks. We could have those
|
||||
/// filled with nops and they are used for alignment.
|
||||
class EliminateUnreachableBlocks : public BinaryFunctionPass {
|
||||
std::shared_timed_mutex ModifiedMtx;
|
||||
std::unordered_set<const BinaryFunction *> Modified;
|
||||
unsigned DeletedBlocks{0};
|
||||
uint64_t DeletedBytes{0};
|
||||
std::atomic<unsigned> DeletedBlocks{0};
|
||||
std::atomic<uint64_t> DeletedBytes{0};
|
||||
void runOnFunction(BinaryFunction& Function);
|
||||
public:
|
||||
EliminateUnreachableBlocks(const cl::opt<bool> &PrintPass)
|
||||
@ -112,9 +110,7 @@ class EliminateUnreachableBlocks : public BinaryFunctionPass {
|
||||
bool shouldPrint(const BinaryFunction &BF) const override {
|
||||
return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
|
||||
}
|
||||
void runOnFunctions(BinaryContext&,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext&) override;
|
||||
};
|
||||
|
||||
// Reorder the basic blocks for each function based on hotness.
|
||||
@ -164,9 +160,7 @@ public:
|
||||
return "reordering";
|
||||
}
|
||||
bool shouldPrint(const BinaryFunction &BF) const override;
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
/// Sync local branches with CFG.
|
||||
@ -178,9 +172,7 @@ class FixupBranches : public BinaryFunctionPass {
|
||||
const char *getName() const override {
|
||||
return "fix-branches";
|
||||
}
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
/// Fix the CFI state and exception handling information after all other
|
||||
@ -193,9 +185,7 @@ class FinalizeFunctions : public BinaryFunctionPass {
|
||||
const char *getName() const override {
|
||||
return "finalize-functions";
|
||||
}
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
/// Convert and remove all BOLT-related annotations before LLVM code emission.
|
||||
@ -207,9 +197,7 @@ class LowerAnnotations : public BinaryFunctionPass {
|
||||
const char *getName() const override {
|
||||
return "lower-annotations";
|
||||
}
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
/// An optimization to simplify conditional tail calls by removing
|
||||
@ -281,9 +269,7 @@ class SimplifyConditionalTailCalls : public BinaryFunctionPass {
|
||||
bool shouldPrint(const BinaryFunction &BF) const override {
|
||||
return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
|
||||
}
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
/// Perform simple peephole optimizations.
|
||||
@ -313,9 +299,7 @@ public:
|
||||
const char *getName() const override {
|
||||
return "peepholes";
|
||||
}
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
/// An optimization to simplify loads from read-only sections. The pass converts
|
||||
@ -323,7 +307,7 @@ public:
|
||||
///
|
||||
/// mov 0x12f(%rip), %eax
|
||||
///
|
||||
/// to their counterparts that use immediate opreands instead of memory loads:
|
||||
/// to their counterparts that use immediate operands instead of memory loads:
|
||||
///
|
||||
/// mov $0x4007dc, %eax
|
||||
///
|
||||
@ -348,9 +332,39 @@ public:
|
||||
bool shouldPrint(const BinaryFunction &BF) const override {
|
||||
return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
|
||||
}
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
/// Assign output sections to all functions.
///
/// The constructor takes no arguments and always forwards the literal `false`
/// to the BinaryFunctionPass base constructor.
/// NOTE(review): presumably that flag controls printing for the pass, since
/// other passes forward a `PrintPass` option in the same position -- confirm
/// against BinaryFunctionPass.
|
||||
class AssignSections : public BinaryFunctionPass {
|
||||
public:
|
||||
explicit AssignSections()
|
||||
: BinaryFunctionPass(false) {
|
||||
}
|
||||
|
||||
/// Returns the fixed pass name "assign-sections".
const char *getName() const override {
|
||||
return "assign-sections";
|
||||
}
|
||||
/// Entry point of the pass; only declared here, the definition is elsewhere.
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
/// Compute and report to the user the imbalance in flow equations for all
|
||||
/// CFGs, so we can detect bad quality profile. Prints average and standard
|
||||
/// deviation of the absolute differences of outgoing flow minus incoming flow
|
||||
/// for blocks of interest (excluding prologues, epilogues, and BB frequency
|
||||
/// lower than 100).
|
||||
class PrintProfileStats : public BinaryFunctionPass {
|
||||
public:
|
||||
/// Forwards the \p PrintPass option to the BinaryFunctionPass constructor.
explicit PrintProfileStats(const cl::opt<bool> &PrintPass)
|
||||
: BinaryFunctionPass(PrintPass) { }
|
||||
|
||||
/// Returns the fixed pass name "profile-stats".
const char *getName() const override {
|
||||
return "profile-stats";
|
||||
}
|
||||
/// Unconditionally returns false; the function argument is ignored.
bool shouldPrint(const BinaryFunction &) const override {
|
||||
return false;
|
||||
}
|
||||
/// Entry point of the pass; only declared here, the definition is elsewhere.
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
/// Prints a list of the top 100 functions sorted by a set of
|
||||
@ -366,9 +380,7 @@ class PrintProgramStats : public BinaryFunctionPass {
|
||||
bool shouldPrint(const BinaryFunction &) const override {
|
||||
return false;
|
||||
}
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
/// Pass for lowering any instructions that we have raised and that have
|
||||
@ -382,9 +394,7 @@ public:
|
||||
return "inst-lowering";
|
||||
}
|
||||
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
/// Pass for stripping 'repz' from 'repz retq' sequence of instructions.
|
||||
@ -397,9 +407,7 @@ public:
|
||||
return "strip-rep-ret";
|
||||
}
|
||||
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
/// Pass for inlining calls to memcpy using 'rep movsb' on X86.
|
||||
@ -412,9 +420,30 @@ public:
|
||||
return "inline-memcpy";
|
||||
}
|
||||
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
/// Pass for specializing memcpy for a size of 1 byte.
|
||||
class SpecializeMemcpy1 : public BinaryFunctionPass {
|
||||
private:
|
||||
/// Copy of the string list handed to the constructor (the cl::list argument
/// is copied into this vector by the member initializer).
std::vector<std::string> Spec;
|
||||
|
||||
/// Return indices of the call sites to optimize. Count starts at 1.
|
||||
/// Returns an empty set for all call sites in the function.
|
||||
std::set<size_t> getCallSitesToOptimize(const BinaryFunction &) const;
|
||||
|
||||
public:
|
||||
/// Forwards \p PrintPass to the BinaryFunctionPass constructor and stores a
/// copy of \p Spec in the member of the same name.
explicit SpecializeMemcpy1(const cl::opt<bool> &PrintPass,
|
||||
cl::list<std::string> &Spec)
|
||||
: BinaryFunctionPass(PrintPass), Spec(Spec) {}
|
||||
|
||||
/// Override of the base-class predicate; only declared here.
bool shouldOptimize(const BinaryFunction &BF) const override;
|
||||
|
||||
/// Returns the fixed pass name "specialize-memcpy".
const char *getName() const override {
|
||||
return "specialize-memcpy";
|
||||
}
|
||||
|
||||
/// Entry point of the pass; only declared here, the definition is elsewhere.
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
enum FrameOptimizationType : char {
|
||||
|
||||
@ -15,6 +15,7 @@ add_llvm_library(LLVMBOLTPasses
|
||||
IdenticalCodeFolding.cpp
|
||||
IndirectCallPromotion.cpp
|
||||
Inliner.cpp
|
||||
Instrumentation.cpp
|
||||
JTFootprintReduction.cpp
|
||||
LivenessAnalysis.cpp
|
||||
LongJmp.cpp
|
||||
|
||||
@ -23,6 +23,7 @@ using EdgeList = std::vector<std::pair<BinaryBasicBlock *, uint64_t>>;
|
||||
namespace opts {
|
||||
|
||||
extern cl::OptionCategory BoltOptCategory;
|
||||
extern cl::opt<bool> NoThreads;
|
||||
|
||||
cl::opt<unsigned>
|
||||
ClusterSplitThreshold("cluster-split-threshold",
|
||||
@ -288,6 +289,12 @@ private:
|
||||
ExecutionCounts[BB->getLayoutIndex()] = EC;
|
||||
}
|
||||
|
||||
// Create a separate MCCodeEmitter to allow lock-free execution
|
||||
BinaryContext::IndependentCodeEmitter Emitter;
|
||||
if (!opts::NoThreads) {
|
||||
Emitter = BF.getBinaryContext().createIndependentMCCodeEmitter();
|
||||
}
|
||||
|
||||
// Initialize clusters
|
||||
Clusters.reserve(BF.layout_size());
|
||||
AllClusters.reserve(BF.layout_size());
|
||||
@ -295,7 +302,8 @@ private:
|
||||
Size.reserve(BF.layout_size());
|
||||
for (auto BB : BF.layout()) {
|
||||
size_t Index = BB->getLayoutIndex();
|
||||
Size.push_back(std::max(BB->estimateSize(), size_t(1)));
|
||||
Size.push_back(
|
||||
std::max<uint64_t>(BB->estimateSize(Emitter.MCE.get()), 1));
|
||||
AllClusters.emplace_back(BB, ExecutionCounts[Index], Size[Index]);
|
||||
Clusters.push_back(&AllClusters[Index]);
|
||||
CurCluster.push_back(&AllClusters[Index]);
|
||||
|
||||
@ -172,6 +172,9 @@ protected:
|
||||
/// Reference to the function being analysed
|
||||
BinaryFunction &Func;
|
||||
|
||||
/// The id of the annotation allocator to be used
|
||||
MCPlusBuilder::AllocatorIdTy AllocatorId = 0;
|
||||
|
||||
/// Tracks the state at basic block start (end) if direction of the dataflow
|
||||
/// is forward (backward).
|
||||
std::unordered_map<const BinaryBasicBlock *, StateTy> StateAtBBEntry;
|
||||
@ -244,7 +247,7 @@ protected:
|
||||
|
||||
StateTy &getOrCreateStateAt(MCInst &Point) {
|
||||
return BC.MIB->getOrCreateAnnotationAs<StateTy>(
|
||||
Point, derived().getAnnotationIndex());
|
||||
Point, derived().getAnnotationIndex(), AllocatorId);
|
||||
}
|
||||
|
||||
StateTy &getOrCreateStateAt(ProgramPoint Point) {
|
||||
@ -254,6 +257,11 @@ protected:
|
||||
}
|
||||
|
||||
public:
|
||||
/// Return the allocator id
|
||||
unsigned getAllocatorId() {
|
||||
return AllocatorId;
|
||||
}
|
||||
|
||||
/// If the direction of the dataflow is forward, operates on the last
|
||||
/// instruction of all predecessors when performing an iteration of the
|
||||
/// dataflow equation for the start of this BB. If backwards, operates on
|
||||
@ -267,8 +275,10 @@ public:
|
||||
|
||||
/// We need the current binary context and the function that will be processed
|
||||
/// in this dataflow analysis.
|
||||
DataflowAnalysis(const BinaryContext &BC, BinaryFunction &BF)
|
||||
: BC(BC), Func(BF) {}
|
||||
DataflowAnalysis(const BinaryContext &BC, BinaryFunction &BF,
|
||||
MCPlusBuilder::AllocatorIdTy AllocatorId = 0)
|
||||
: BC(BC), Func(BF), AllocatorId(AllocatorId) {}
|
||||
|
||||
virtual ~DataflowAnalysis() {
|
||||
cleanAnnotations();
|
||||
}
|
||||
@ -324,15 +334,15 @@ public:
|
||||
void run() {
|
||||
derived().preflight();
|
||||
|
||||
// Initialize state for all points of the function
|
||||
for (auto &BB : Func) {
|
||||
auto &St = getOrCreateStateAt(BB);
|
||||
St = derived().getStartingStateAtBB(BB);
|
||||
for (auto &Inst : BB) {
|
||||
auto &St = getOrCreateStateAt(Inst);
|
||||
St = derived().getStartingStateAtPoint(Inst);
|
||||
// Initialize state for all points of the function
|
||||
for (auto &BB : Func) {
|
||||
auto &St = getOrCreateStateAt(BB);
|
||||
St = derived().getStartingStateAtBB(BB);
|
||||
for (auto &Inst : BB) {
|
||||
auto &St = getOrCreateStateAt(Inst);
|
||||
St = derived().getStartingStateAtPoint(Inst);
|
||||
}
|
||||
}
|
||||
}
|
||||
assert(Func.begin() != Func.end() && "Unexpected empty function");
|
||||
|
||||
std::queue<BinaryBasicBlock *> Worklist;
|
||||
@ -545,8 +555,10 @@ public:
|
||||
return count(*Expressions[PointIdx], Expr);
|
||||
}
|
||||
|
||||
InstrsDataflowAnalysis(const BinaryContext &BC, BinaryFunction &BF)
|
||||
: DataflowAnalysis<Derived, BitVector, Backward, StatePrinterTy>(BC, BF) {}
|
||||
InstrsDataflowAnalysis(const BinaryContext &BC, BinaryFunction &BF,
|
||||
MCPlusBuilder::AllocatorIdTy AllocId = 0)
|
||||
: DataflowAnalysis<Derived, BitVector, Backward, StatePrinterTy>(
|
||||
BC, BF, AllocId) {}
|
||||
virtual ~InstrsDataflowAnalysis() {}
|
||||
};
|
||||
|
||||
|
||||
@ -19,7 +19,7 @@ ReachingDefOrUse</*Def=*/true> &DataflowInfoManager::getReachingDefs() {
|
||||
if (RD)
|
||||
return *RD;
|
||||
assert(RA && "RegAnalysis required");
|
||||
RD.reset(new ReachingDefOrUse<true>(*RA, BC, BF));
|
||||
RD.reset(new ReachingDefOrUse<true>(*RA, BC, BF, None, AllocatorId));
|
||||
RD->run();
|
||||
return *RD;
|
||||
}
|
||||
@ -32,7 +32,7 @@ ReachingDefOrUse</*Def=*/false> &DataflowInfoManager::getReachingUses() {
|
||||
if (RU)
|
||||
return *RU;
|
||||
assert(RA && "RegAnalysis required");
|
||||
RU.reset(new ReachingDefOrUse<false>(*RA, BC, BF));
|
||||
RU.reset(new ReachingDefOrUse<false>(*RA, BC, BF, None, AllocatorId));
|
||||
RU->run();
|
||||
return *RU;
|
||||
}
|
||||
@ -45,7 +45,7 @@ LivenessAnalysis &DataflowInfoManager::getLivenessAnalysis() {
|
||||
if (LA)
|
||||
return *LA;
|
||||
assert(RA && "RegAnalysis required");
|
||||
LA.reset(new LivenessAnalysis(*RA, BC, BF));
|
||||
LA.reset(new LivenessAnalysis(*RA, BC, BF, AllocatorId));
|
||||
LA->run();
|
||||
return *LA;
|
||||
}
|
||||
@ -58,7 +58,7 @@ StackReachingUses &DataflowInfoManager::getStackReachingUses() {
|
||||
if (SRU)
|
||||
return *SRU;
|
||||
assert(FA && "FrameAnalysis required");
|
||||
SRU.reset(new StackReachingUses(*FA, BC, BF));
|
||||
SRU.reset(new StackReachingUses(*FA, BC, BF, AllocatorId));
|
||||
SRU->run();
|
||||
return *SRU;
|
||||
}
|
||||
@ -70,7 +70,7 @@ void DataflowInfoManager::invalidateStackReachingUses() {
|
||||
DominatorAnalysis<false> &DataflowInfoManager::getDominatorAnalysis() {
|
||||
if (DA)
|
||||
return *DA;
|
||||
DA.reset(new DominatorAnalysis<false>(BC, BF));
|
||||
DA.reset(new DominatorAnalysis<false>(BC, BF, AllocatorId));
|
||||
DA->run();
|
||||
return *DA;
|
||||
}
|
||||
@ -82,7 +82,7 @@ void DataflowInfoManager::invalidateDominatorAnalysis() {
|
||||
DominatorAnalysis<true> &DataflowInfoManager::getPostDominatorAnalysis() {
|
||||
if (PDA)
|
||||
return *PDA;
|
||||
PDA.reset(new DominatorAnalysis<true>(BC, BF));
|
||||
PDA.reset(new DominatorAnalysis<true>(BC, BF, AllocatorId));
|
||||
PDA->run();
|
||||
return *PDA;
|
||||
}
|
||||
@ -94,7 +94,7 @@ void DataflowInfoManager::invalidatePostDominatorAnalysis() {
|
||||
StackPointerTracking &DataflowInfoManager::getStackPointerTracking() {
|
||||
if (SPT)
|
||||
return *SPT;
|
||||
SPT.reset(new StackPointerTracking(BC, BF));
|
||||
SPT.reset(new StackPointerTracking(BC, BF, AllocatorId));
|
||||
SPT->run();
|
||||
return *SPT;
|
||||
}
|
||||
@ -107,7 +107,7 @@ void DataflowInfoManager::invalidateStackPointerTracking() {
|
||||
ReachingInsns<false> &DataflowInfoManager::getReachingInsns() {
|
||||
if (RI)
|
||||
return *RI;
|
||||
RI.reset(new ReachingInsns<false>(BC, BF));
|
||||
RI.reset(new ReachingInsns<false>(BC, BF, AllocatorId));
|
||||
RI->run();
|
||||
return *RI;
|
||||
}
|
||||
@ -119,7 +119,7 @@ void DataflowInfoManager::invalidateReachingInsns() {
|
||||
ReachingInsns<true> &DataflowInfoManager::getReachingInsnsBackwards() {
|
||||
if (RIB)
|
||||
return *RIB;
|
||||
RIB.reset(new ReachingInsns<true>(BC, BF));
|
||||
RIB.reset(new ReachingInsns<true>(BC, BF, AllocatorId));
|
||||
RIB->run();
|
||||
return *RIB;
|
||||
}
|
||||
@ -131,7 +131,8 @@ void DataflowInfoManager::invalidateReachingInsnsBackwards() {
|
||||
StackAllocationAnalysis &DataflowInfoManager::getStackAllocationAnalysis() {
|
||||
if (SAA)
|
||||
return *SAA;
|
||||
SAA.reset(new StackAllocationAnalysis(BC, BF, getStackPointerTracking()));
|
||||
SAA.reset(new StackAllocationAnalysis(BC, BF, getStackPointerTracking(),
|
||||
AllocatorId));
|
||||
SAA->run();
|
||||
return *SAA;
|
||||
}
|
||||
|
||||
@ -47,10 +47,15 @@ class DataflowInfoManager {
|
||||
std::unique_ptr<std::unordered_map<const MCInst *, BinaryBasicBlock *>>
|
||||
InsnToBB;
|
||||
|
||||
// Id of the allocator to be used for annotations added by any of the managed
|
||||
// analyses
|
||||
MCPlusBuilder::AllocatorIdTy AllocatorId;
|
||||
|
||||
public:
|
||||
DataflowInfoManager(const BinaryContext &BC, BinaryFunction &BF,
|
||||
const RegAnalysis *RA, const FrameAnalysis *FA)
|
||||
: RA(RA), FA(FA), BC(BC), BF(BF){};
|
||||
const RegAnalysis *RA, const FrameAnalysis *FA,
|
||||
MCPlusBuilder::AllocatorIdTy AllocId = 0)
|
||||
: RA(RA), FA(FA), BC(BC), BF(BF), AllocatorId(AllocId){};
|
||||
|
||||
/// Helper function to fetch the parent BB associated with a program point
|
||||
/// If PP is a BB itself, then return itself (cast to a BinaryBasicBlock)
|
||||
|
||||
@ -35,34 +35,35 @@ class DominatorAnalysis
|
||||
Backward>;
|
||||
|
||||
public:
|
||||
DominatorAnalysis(const BinaryContext &BC, BinaryFunction &BF)
|
||||
: InstrsDataflowAnalysis<DominatorAnalysis<Backward>, Backward>(BC, BF) {}
|
||||
DominatorAnalysis(const BinaryContext &BC, BinaryFunction &BF,
|
||||
MCPlusBuilder::AllocatorIdTy AllocId)
|
||||
: InstrsDataflowAnalysis<DominatorAnalysis<Backward>, Backward>(BC, BF,
|
||||
AllocId) {
|
||||
}
|
||||
virtual ~DominatorAnalysis() {}
|
||||
|
||||
SmallVector<ProgramPoint, 4> getDominanceFrontierFor(const MCInst &Dom) {
|
||||
SmallVector<ProgramPoint, 4> Result;
|
||||
SmallSetVector<ProgramPoint, 4> getDominanceFrontierFor(const MCInst &Dom) {
|
||||
SmallSetVector<ProgramPoint, 4> Result;
|
||||
auto DomIdx = this->ExprToIdx[&Dom];
|
||||
assert(!Backward && "Post-dom frontier not implemented");
|
||||
for (auto &BB : this->Func) {
|
||||
bool HasDominatedPred = false;
|
||||
bool HasNonDominatedPred = false;
|
||||
SmallVector<ProgramPoint, 4> Candidates;
|
||||
SmallSetVector<ProgramPoint, 4> Candidates;
|
||||
this->doForAllSuccsOrPreds(BB, [&](ProgramPoint P) {
|
||||
if ((*this->getStateAt(P))[DomIdx]) {
|
||||
Candidates.emplace_back(P);
|
||||
Candidates.insert(P);
|
||||
HasDominatedPred = true;
|
||||
return;
|
||||
}
|
||||
HasNonDominatedPred = true;
|
||||
});
|
||||
if (HasDominatedPred && HasNonDominatedPred)
|
||||
Result.append(Candidates.begin(), Candidates.end());
|
||||
Result.insert(Candidates.begin(), Candidates.end());
|
||||
if ((*this->getStateAt(ProgramPoint::getLastPointAt(BB)))[DomIdx] &&
|
||||
BB.succ_begin() == BB.succ_end())
|
||||
Result.emplace_back(ProgramPoint::getLastPointAt(BB));
|
||||
Result.insert(ProgramPoint::getLastPointAt(BB));
|
||||
}
|
||||
std::sort(Result.begin(), Result.end());
|
||||
Result.erase(std::unique(Result.begin(), Result.end()), Result.end());
|
||||
return Result;
|
||||
}
|
||||
|
||||
@ -104,8 +105,6 @@ public:
|
||||
}
|
||||
|
||||
void run() {
|
||||
NamedRegionTimer T1("DA", "Dominator Analysis", "Dataflow", "Dataflow",
|
||||
opts::TimeOpts);
|
||||
InstrsDataflowAnalysis<DominatorAnalysis<Backward>, Backward>::run();
|
||||
}
|
||||
|
||||
|
||||
@ -10,6 +10,8 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
#include "FrameAnalysis.h"
|
||||
#include "CallGraphWalker.h"
|
||||
#include "ParallelUtilities.h"
|
||||
#include "llvm/Support/ThreadPool.h"
|
||||
#include <fstream>
|
||||
|
||||
#define DEBUG_TYPE "fa"
|
||||
@ -17,8 +19,9 @@
|
||||
using namespace llvm;
|
||||
|
||||
namespace opts {
|
||||
extern cl::opt<bool> TimeOpts;
|
||||
extern cl::OptionCategory BoltOptCategory;
|
||||
extern cl::opt<unsigned> Verbosity;
|
||||
|
||||
extern bool shouldProcess(const bolt::BinaryFunction &Function);
|
||||
|
||||
static cl::list<std::string>
|
||||
@ -30,7 +33,17 @@ static cl::opt<std::string> FrameOptFunctionNamesFile(
|
||||
"funcs-file-fop",
|
||||
cl::desc("file with list of functions to frame optimize"));
|
||||
|
||||
static cl::opt<bool>
|
||||
TimeFA("time-fa",
|
||||
cl::desc("time frame analysis steps"),
|
||||
cl::ReallyHidden,
|
||||
cl::ZeroOrMore,
|
||||
cl::cat(BoltOptCategory));
|
||||
|
||||
bool shouldFrameOptimize(const llvm::bolt::BinaryFunction &Function) {
|
||||
if (Function.hasUnknownControlFlow())
|
||||
return false;
|
||||
|
||||
if (!FrameOptFunctionNamesFile.empty()) {
|
||||
assert(!FrameOptFunctionNamesFile.empty() && "unexpected empty file name");
|
||||
std::ifstream FuncsFile(FrameOptFunctionNamesFile, std::ios::in);
|
||||
@ -85,7 +98,8 @@ namespace {
|
||||
class FrameAccessAnalysis {
|
||||
/// We depend on Stack Pointer Tracking to figure out the current SP offset
|
||||
/// value at a given program point
|
||||
StackPointerTracking SPT;
|
||||
StackPointerTracking &SPT;
|
||||
|
||||
/// Context vars
|
||||
const BinaryContext &BC;
|
||||
const BinaryFunction &BF;
|
||||
@ -150,14 +164,9 @@ class FrameAccessAnalysis {
|
||||
}
|
||||
|
||||
public:
|
||||
FrameAccessAnalysis(const BinaryContext &BC, BinaryFunction &BF)
|
||||
: SPT(BC, BF), BC(BC), BF(BF) {
|
||||
{
|
||||
NamedRegionTimer T1("SPT", "Stack Pointer Tracking", "Dataflow",
|
||||
"Dataflow", opts::TimeOpts);
|
||||
SPT.run();
|
||||
}
|
||||
}
|
||||
FrameAccessAnalysis(const BinaryContext &BC, BinaryFunction &BF,
|
||||
StackPointerTracking &SPT)
|
||||
: SPT(SPT), BC(BC), BF(BF) {}
|
||||
|
||||
void enterNewBB() { Prev = nullptr; }
|
||||
const FrameIndexEntry &getFIE() const { return FIE; }
|
||||
@ -393,7 +402,7 @@ bool FrameAnalysis::computeArgsAccessed(BinaryFunction &BF) {
|
||||
<< "\n");
|
||||
bool UpdatedArgsTouched = false;
|
||||
bool NoInfo = false;
|
||||
FrameAccessAnalysis FAA(BC, BF);
|
||||
FrameAccessAnalysis FAA(BC, BF, getSPT(BF));
|
||||
|
||||
for (auto BB : BF.layout()) {
|
||||
FAA.enterNewBB();
|
||||
@ -452,7 +461,7 @@ bool FrameAnalysis::computeArgsAccessed(BinaryFunction &BF) {
|
||||
}
|
||||
|
||||
bool FrameAnalysis::restoreFrameIndex(BinaryFunction &BF) {
|
||||
FrameAccessAnalysis FAA(BC, BF);
|
||||
FrameAccessAnalysis FAA(BC, BF, getSPT(BF));
|
||||
|
||||
DEBUG(dbgs() << "Restoring frame indices for \"" << BF.getPrintName()
|
||||
<< "\"\n");
|
||||
@ -485,27 +494,42 @@ bool FrameAnalysis::restoreFrameIndex(BinaryFunction &BF) {
|
||||
}
|
||||
|
||||
void FrameAnalysis::cleanAnnotations() {
|
||||
for (auto &I : BFs) {
|
||||
for (auto &BB : I.second) {
|
||||
NamedRegionTimer T("cleanannotations", "clean annotations", "FA",
|
||||
"FA breakdown", opts::TimeFA);
|
||||
|
||||
ParallelUtilities::WorkFuncTy CleanFunction = [&](BinaryFunction &BF) {
|
||||
for (auto &BB : BF) {
|
||||
for (auto &Inst : BB) {
|
||||
BC.MIB->removeAnnotation(Inst, "ArgAccessEntry");
|
||||
BC.MIB->removeAnnotation(Inst, "FrameAccessEntry");
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
ParallelUtilities::runOnEachFunction(
|
||||
BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, CleanFunction,
|
||||
ParallelUtilities::PredicateTy(nullptr), "cleanAnnotations");
|
||||
}
|
||||
|
||||
FrameAnalysis::FrameAnalysis(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
BinaryFunctionCallGraph &CG)
|
||||
: BC(BC), BFs(BFs) {
|
||||
FrameAnalysis::FrameAnalysis(BinaryContext &BC, BinaryFunctionCallGraph &CG)
|
||||
: BC(BC) {
|
||||
// Position 0 of the vector should always be associated with "assume access
|
||||
// everything".
|
||||
ArgAccessesVector.emplace_back(ArgAccesses(/*AssumeEverything*/ true));
|
||||
|
||||
traverseCG(CG);
|
||||
if (!opts::NoThreads) {
|
||||
NamedRegionTimer T1("precomputespt", "pre-compute spt", "FA",
|
||||
"FA breakdown", opts::TimeFA);
|
||||
preComputeSPT();
|
||||
}
|
||||
|
||||
for (auto &I : BFs) {
|
||||
{
|
||||
NamedRegionTimer T1("traversecg", "traverse call graph", "FA",
|
||||
"FA breakdown", opts::TimeFA);
|
||||
traverseCG(CG);
|
||||
}
|
||||
|
||||
for (auto &I : BC.getBinaryFunctions()) {
|
||||
auto Count = I.second.getExecutionCount();
|
||||
if (Count != BinaryFunction::COUNT_NO_PROFILE)
|
||||
CountDenominator += Count;
|
||||
@ -521,8 +545,8 @@ FrameAnalysis::FrameAnalysis(BinaryContext &BC,
|
||||
}
|
||||
|
||||
{
|
||||
NamedRegionTimer T1("restorefi", "restore frame index", "FOP",
|
||||
"FOP breakdown", opts::TimeOpts);
|
||||
NamedRegionTimer T1("restorefi", "restore frame index", "FA",
|
||||
"FA breakdown", opts::TimeFA);
|
||||
if (!restoreFrameIndex(I.second)) {
|
||||
++NumFunctionsFailedRestoreFI;
|
||||
auto Count = I.second.getExecutionCount();
|
||||
@ -533,6 +557,18 @@ FrameAnalysis::FrameAnalysis(BinaryContext &BC,
|
||||
}
|
||||
AnalyzedFunctions.insert(&I.second);
|
||||
}
|
||||
|
||||
{
|
||||
NamedRegionTimer T1("clearspt", "clear spt", "FA", "FA breakdown",
|
||||
opts::TimeFA);
|
||||
clearSPTMap();
|
||||
|
||||
// Clean up memory allocated for annotation values
|
||||
if (!opts::NoThreads) {
|
||||
for (auto Id : SPTAllocatorsId)
|
||||
BC.MIB->freeValuesAllocator(Id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void FrameAnalysis::printStats() {
|
||||
@ -548,5 +584,60 @@ void FrameAnalysis::printStats() {
|
||||
<< " could not have its frame indices restored.\n";
|
||||
}
|
||||
|
||||
/// Release every StackPointerTracking object held in SPTMap and empty the map.
///
/// With opts::NoThreads set, the map is simply cleared. Otherwise each stored
/// object is first reset through ParallelUtilities::runOnEachFunction, and the
/// map entries are then dropped with a final clear().
void FrameAnalysis::clearSPTMap() {
|
||||
if (opts::NoThreads) {
|
||||
SPTMap.clear();
|
||||
return;
|
||||
}
|
||||
|
||||
// Work item: reset the unique_ptr stored in BF's map entry.
// NOTE(review): the find() result is dereferenced without an end() check, so
// this relies on every function not rejected by SkipFunc having an entry.
// preComputeSPT() creates entries under the same isSimple()/hasCFG()
// condition -- confirm nothing can invalidate that in between.
ParallelUtilities::WorkFuncTy ClearFunctionSPT = [&](BinaryFunction &BF) {
|
||||
auto &SPTPtr = SPTMap.find(&BF)->second;
|
||||
SPTPtr.reset();
|
||||
};
|
||||
|
||||
// Predicate passed alongside the work item; true for functions that are not
// simple or that have no CFG.
ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
|
||||
return !BF.isSimple() || !BF.hasCFG();
|
||||
};
|
||||
|
||||
ParallelUtilities::runOnEachFunction(
|
||||
BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, ClearFunctionSPT,
|
||||
SkipFunc, "clearSPTMap");
|
||||
|
||||
// The pointers were reset above; now remove the entries themselves.
SPTMap.clear();
|
||||
}
|
||||
|
||||
/// Build and run a StackPointerTracking object for every function that is
/// simple and has a CFG, storing the results in SPTMap.
///
/// The map entries and the "StackPointerTracking" annotation index are created
/// up front, before the per-function work is handed to
/// ParallelUtilities::runOnEachFunctionWithUniqueAllocId, so the work items
/// only assign into entries that already exist.
void FrameAnalysis::preComputeSPT() {
|
||||
// Make sure that the SPTMap is empty
|
||||
assert(SPTMap.size() == 0);
|
||||
|
||||
// Create map entries to allow lock-free parallel execution
|
||||
for (auto &BFI : BC.getBinaryFunctions()) {
|
||||
auto &BF = BFI.second;
|
||||
if (!BF.isSimple() || !BF.hasCFG())
|
||||
continue;
|
||||
// The entry starts out with an empty unique_ptr; it is filled in below.
SPTMap.emplace(&BF, std::unique_ptr<StackPointerTracking>());
|
||||
}
|
||||
|
||||
// Create an index for the SPT annotation to allow lock-free parallel
|
||||
// execution
|
||||
BC.MIB->getOrCreateAnnotationIndex("StackPointerTracking");
|
||||
|
||||
// Run SPT in parallel
|
||||
// Work item: construct the SPT object with the allocator id supplied by the
// caller of the work item, store it in the pre-created entry, and run it.
ParallelUtilities::WorkFuncWithAllocTy ProcessFunction =
|
||||
[&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId) {
|
||||
auto &SPTPtr = SPTMap.find(&BF)->second;
|
||||
SPTPtr = std::make_unique<StackPointerTracking>(BC, BF, AllocId);
|
||||
SPTPtr->run();
|
||||
};
|
||||
|
||||
// Same condition as the `continue` in the entry-creation loop above.
ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
|
||||
return !BF.isSimple() || !BF.hasCFG();
|
||||
};
|
||||
|
||||
ParallelUtilities::runOnEachFunctionWithUniqueAllocId(
|
||||
BC, ParallelUtilities::SchedulingPolicy::SP_BB_QUADRATIC, ProcessFunction,
|
||||
SkipPredicate, "preComputeSPT");
|
||||
}
|
||||
|
||||
} // namespace bolt
|
||||
} // namespace llvm
|
||||
|
||||
@ -93,7 +93,7 @@ raw_ostream &operator<<(raw_ostream &OS,
|
||||
/// Initialization:
|
||||
///
|
||||
/// FrameAnalysis FA(PrintPass);
|
||||
/// FA.runOnFunctions(BC, BFs, LargeFunctions);
|
||||
/// FA.runOnFunctions(BC);
|
||||
///
|
||||
/// Usage (fetching frame access information about a given instruction):
|
||||
///
|
||||
@ -113,7 +113,6 @@ raw_ostream &operator<<(raw_ostream &OS,
|
||||
///
|
||||
class FrameAnalysis {
|
||||
BinaryContext &BC;
|
||||
std::map<uint64_t, BinaryFunction> &BFs;
|
||||
|
||||
/// Map functions to the set of <stack offsets, size> tuples representing
|
||||
/// accesses to stack positions that belong to the caller
|
||||
@ -168,9 +167,17 @@ class FrameAnalysis {
|
||||
/// to analyze and this information can't be safely determined for \p BF.
|
||||
bool restoreFrameIndex(BinaryFunction &BF);
|
||||
|
||||
/// A store for SPT info per function
|
||||
std::unordered_map<const BinaryFunction *,
|
||||
std::unique_ptr<StackPointerTracking>>
|
||||
SPTMap;
|
||||
|
||||
/// A vector that stores ids of the allocators that are used in SPT
|
||||
/// computation
|
||||
std::vector<MCPlusBuilder::AllocatorIdTy> SPTAllocatorsId;
|
||||
|
||||
public:
|
||||
explicit FrameAnalysis(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
BinaryFunctionCallGraph &CG);
|
||||
|
||||
/// Return true if we could fully analyze \p Func
|
||||
@ -197,10 +204,30 @@ public:
|
||||
cleanAnnotations();
|
||||
}
|
||||
|
||||
|
||||
/// Print to standard output statistics about the analysis performed by this
|
||||
/// pass
|
||||
void printStats();
|
||||
|
||||
/// Get or create an SPT object and run the analysis.
///
/// If SPTMap has no entry for \p BF, a StackPointerTracking object is created
/// for it, stored in the map, and run() is invoked on it before it is
/// returned. If an entry already exists, the stored object is returned as is;
/// run() is not invoked on that path.
/// NOTE(review): preComputeSPT() inserts entries holding an empty unique_ptr
/// before filling them in, and the existing-entry path dereferences the stored
/// pointer without a null check -- confirm entries are always populated before
/// this is reached.
|
||||
StackPointerTracking &getSPT(BinaryFunction &BF) {
|
||||
if (!SPTMap.count(&BF)) {
|
||||
SPTMap.emplace(&BF, std::make_unique<StackPointerTracking>(BC, BF));
|
||||
auto Iter = SPTMap.find(&BF);
|
||||
assert(Iter != SPTMap.end() && "item should exist");
|
||||
Iter->second->run();
|
||||
return *Iter->second;
|
||||
}
|
||||
|
||||
// Entry already present: hand back the stored object.
auto Iter = SPTMap.find(&BF);
|
||||
assert(Iter != SPTMap.end() && "item should exist");
|
||||
return *Iter->second;
|
||||
}
|
||||
|
||||
/// Clean and de-allocate all SPT objects
|
||||
void clearSPTMap();
|
||||
|
||||
/// Perform SPT analysis for all functions in parallel
|
||||
void preComputeSPT();
|
||||
};
|
||||
|
||||
} // namespace bolt
|
||||
|
||||
@ -10,6 +10,7 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "FrameOptimizer.h"
|
||||
#include "ParallelUtilities.h"
|
||||
#include "ShrinkWrapping.h"
|
||||
#include "StackAvailableExpressions.h"
|
||||
#include "StackReachingUses.h"
|
||||
@ -47,7 +48,6 @@ RemoveStores("frame-opt-rm-stores",
|
||||
cl::ZeroOrMore,
|
||||
cl::cat(BoltOptCategory));
|
||||
|
||||
|
||||
} // namespace opts
|
||||
|
||||
namespace llvm {
|
||||
@ -221,21 +221,36 @@ void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA,
|
||||
}
|
||||
}
|
||||
|
||||
void FrameOptimizerPass::runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) {
|
||||
void FrameOptimizerPass::runOnFunctions(BinaryContext &BC) {
|
||||
if (opts::FrameOptimization == FOP_NONE)
|
||||
return;
|
||||
|
||||
// Run FrameAnalysis pass
|
||||
BinaryFunctionCallGraph CG = buildCallGraph(BC, BFs);
|
||||
FrameAnalysis FA(BC, BFs, CG);
|
||||
RegAnalysis RA(BC, &BFs, &CG);
|
||||
std::unique_ptr<BinaryFunctionCallGraph> CG;
|
||||
std::unique_ptr<FrameAnalysis> FA;
|
||||
std::unique_ptr<RegAnalysis> RA;
|
||||
|
||||
// Our main loop: perform caller-saved register optimizations, then
|
||||
// callee-saved register optimizations (shrink wrapping).
|
||||
for (auto &I : BFs) {
|
||||
if (!FA.hasFrameInfo(I.second))
|
||||
{
|
||||
NamedRegionTimer T1("callgraph", "create call graph", "FOP",
|
||||
"FOP breakdown", opts::TimeOpts);
|
||||
CG = std::make_unique<BinaryFunctionCallGraph>(buildCallGraph(BC));
|
||||
}
|
||||
|
||||
{
|
||||
NamedRegionTimer T1("frameanalysis", "frame analysis", "FOP",
|
||||
"FOP breakdown", opts::TimeOpts);
|
||||
FA = std::make_unique<FrameAnalysis>(BC, *CG);
|
||||
}
|
||||
|
||||
{
|
||||
NamedRegionTimer T1("reganalysis", "reg analysis", "FOP",
|
||||
"FOP breakdown", opts::TimeOpts);
|
||||
RA = std::make_unique<RegAnalysis>(BC, &BC.getBinaryFunctions(), CG.get());
|
||||
}
|
||||
|
||||
// Perform caller-saved register optimizations, then callee-saved register
|
||||
// optimizations (shrink wrapping)
|
||||
for (auto &I : BC.getBinaryFunctions()) {
|
||||
if (!FA->hasFrameInfo(I.second))
|
||||
continue;
|
||||
// Restrict pass execution if user asked to only run on hot functions
|
||||
if (opts::FrameOptimization == FOP_HOT) {
|
||||
@ -247,27 +262,28 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC,
|
||||
<< " ) exceeds our hotness threshold ( "
|
||||
<< BC.getHotThreshold() << " )\n");
|
||||
}
|
||||
|
||||
{
|
||||
NamedRegionTimer T1("removeloads", "remove loads", "FOP", "FOP breakdown",
|
||||
opts::TimeOpts);
|
||||
removeUnnecessaryLoads(RA, FA, BC, I.second);
|
||||
removeUnnecessaryLoads(*RA, *FA, BC, I.second);
|
||||
}
|
||||
|
||||
if (opts::RemoveStores) {
|
||||
NamedRegionTimer T1("removestores", "remove stores", "FOP",
|
||||
"FOP breakdown", opts::TimeOpts);
|
||||
removeUnusedStores(FA, BC, I.second);
|
||||
removeUnusedStores(*FA, BC, I.second);
|
||||
}
|
||||
// Don't even start shrink wrapping if no profiling info is available
|
||||
if (I.second.getKnownExecutionCount() == 0)
|
||||
continue;
|
||||
{
|
||||
NamedRegionTimer T1("movespills", "move spills", "FOP", "FOP breakdown",
|
||||
opts::TimeOpts);
|
||||
DataflowInfoManager Info(BC, I.second, &RA, &FA);
|
||||
ShrinkWrapping SW(FA, BC, I.second, Info);
|
||||
if (SW.perform())
|
||||
FuncsChanged.insert(&I.second);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
{
|
||||
NamedRegionTimer T1("shrinkwrapping", "shrink wrapping", "FOP",
|
||||
"FOP breakdown", opts::TimeOpts);
|
||||
performShrinkWrapping(*RA, *FA, BC);
|
||||
}
|
||||
|
||||
outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads
|
||||
@ -278,9 +294,67 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC,
|
||||
<< NumLoadsChangedToImm << " to use an immediate.\n"
|
||||
<< "BOLT-INFO: FOP deleted " << NumLoadsDeleted << " load(s) and "
|
||||
<< NumRedundantStores << " store(s).\n";
|
||||
FA.printStats();
|
||||
FA->printStats();
|
||||
ShrinkWrapping::printStats();
|
||||
}
|
||||
|
||||
void FrameOptimizerPass::performShrinkWrapping(const RegAnalysis &RA,
|
||||
const FrameAnalysis &FA,
|
||||
BinaryContext &BC) {
|
||||
// Initialize necessary annotations to allow safe parallel accesses to
|
||||
// annotation index in MIB
|
||||
BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getSaveTagName());
|
||||
BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getRestoreTagName());
|
||||
BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getTodoTagName());
|
||||
BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getSlotTagName());
|
||||
BC.MIB->getOrCreateAnnotationIndex(
|
||||
StackLayoutModifier::getOffsetCFIRegTagName());
|
||||
BC.MIB->getOrCreateAnnotationIndex("ReachingDefs");
|
||||
BC.MIB->getOrCreateAnnotationIndex("ReachingUses");
|
||||
BC.MIB->getOrCreateAnnotationIndex("LivenessAnalysis");
|
||||
BC.MIB->getOrCreateAnnotationIndex("StackReachingUses");
|
||||
BC.MIB->getOrCreateAnnotationIndex("PostDominatorAnalysis");
|
||||
BC.MIB->getOrCreateAnnotationIndex("DominatorAnalysis");
|
||||
BC.MIB->getOrCreateAnnotationIndex("StackPointerTracking");
|
||||
BC.MIB->getOrCreateAnnotationIndex("StackPointerTrackingForInternalCalls");
|
||||
BC.MIB->getOrCreateAnnotationIndex("StackAvailableExpressions");
|
||||
BC.MIB->getOrCreateAnnotationIndex("StackAllocationAnalysis");
|
||||
BC.MIB->getOrCreateAnnotationIndex("ShrinkWrap-Todo");
|
||||
BC.MIB->getOrCreateAnnotationIndex("PredictiveStackPointerTracking");
|
||||
BC.MIB->getOrCreateAnnotationIndex("ReachingInsnsBackward");
|
||||
BC.MIB->getOrCreateAnnotationIndex("ReachingInsns");
|
||||
BC.MIB->getOrCreateAnnotationIndex("AccessesDeletedPos");
|
||||
BC.MIB->getOrCreateAnnotationIndex("DeleteMe");
|
||||
|
||||
ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
|
||||
if (!FA.hasFrameInfo(BF))
|
||||
return true;
|
||||
|
||||
if (opts::FrameOptimization == FOP_HOT &&
|
||||
(BF.getKnownExecutionCount() < BC.getHotThreshold()))
|
||||
return true;
|
||||
|
||||
if (BF.getKnownExecutionCount() == 0)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
};
|
||||
|
||||
ParallelUtilities::WorkFuncWithAllocTy WorkFunction =
|
||||
[&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocatorId) {
|
||||
DataflowInfoManager Info(BC, BF, &RA, &FA, AllocatorId);
|
||||
ShrinkWrapping SW(FA, BC, BF, Info, AllocatorId);
|
||||
|
||||
if (SW.perform()) {
|
||||
std::lock_guard<std::mutex> Lock(FuncsChangedMutex);
|
||||
FuncsChanged.insert(&BF);
|
||||
}
|
||||
};
|
||||
|
||||
ParallelUtilities::runOnEachFunctionWithUniqueAllocId(
|
||||
BC, ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC, WorkFunction,
|
||||
SkipPredicate, "shrink-wrapping");
|
||||
}
|
||||
|
||||
} // namespace bolt
|
||||
} // namespace llvm
|
||||
|
||||
@ -86,6 +86,8 @@ class FrameOptimizerPass : public BinaryFunctionPass {
|
||||
|
||||
DenseSet<const BinaryFunction *> FuncsChanged;
|
||||
|
||||
std::mutex FuncsChangedMutex;
|
||||
|
||||
/// Perform a dataflow analysis in \p BF to reveal unnecessary reloads from
|
||||
/// the frame. Use the analysis to convert memory loads to register moves or
|
||||
/// immediate loads. Delete redundant register moves.
|
||||
@ -99,6 +101,10 @@ class FrameOptimizerPass : public BinaryFunctionPass {
|
||||
const BinaryContext &BC,
|
||||
BinaryFunction &BF);
|
||||
|
||||
/// Perform shrinkwrapping step
|
||||
void performShrinkWrapping(const RegAnalysis &RA, const FrameAnalysis &FA,
|
||||
BinaryContext &BC);
|
||||
|
||||
public:
|
||||
explicit FrameOptimizerPass(const cl::opt<bool> &PrintPass)
|
||||
: BinaryFunctionPass(PrintPass) {}
|
||||
@ -108,9 +114,7 @@ public:
|
||||
}
|
||||
|
||||
/// Pass entry point
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
|
||||
bool shouldPrint(const BinaryFunction &BF) const override {
|
||||
return BinaryFunctionPass::shouldPrint(BF) && FuncsChanged.count(&BF) > 0;
|
||||
|
||||
@ -11,6 +11,7 @@
|
||||
|
||||
#include "BinaryFunction.h"
|
||||
#include "HFSort.h"
|
||||
#include "ParallelUtilities.h"
|
||||
#include "ReorderUtils.h"
|
||||
#include "llvm/Support/Options.h"
|
||||
|
||||
@ -319,50 +320,115 @@ public:
|
||||
/// Merge pairs of clusters while there is an improvement in the
|
||||
/// expected cache miss ratio
|
||||
void runPassTwo() {
|
||||
while (Clusters.size() > 1) {
|
||||
Cluster *BestClusterPred = nullptr;
|
||||
Cluster *BestClusterSucc = nullptr;
|
||||
double BestGain = -1;
|
||||
for (auto ClusterPred : Clusters) {
|
||||
// get candidates for merging with the current cluster
|
||||
Adjacent.forAllAdjacent(
|
||||
ClusterPred,
|
||||
// find the best candidate
|
||||
[&](Cluster *ClusterSucc) {
|
||||
assert(ClusterPred != ClusterSucc && "loop edges are not supported");
|
||||
// compute the gain of merging two clusters
|
||||
const double Gain = mergeGain(ClusterPred, ClusterSucc);
|
||||
// BucketsCount is hard-coded to make the algorithm deterministic regardless
|
||||
// of the number of threads
|
||||
const unsigned BucketsCount = 124;
|
||||
unsigned IterationCount = 0;
|
||||
|
||||
// breaking ties by density to make the hottest clusters be merged first
|
||||
if (Gain > BestGain || (std::abs(Gain - BestGain) < 1e-8 &&
|
||||
compareClusterPairs(ClusterPred,
|
||||
ClusterSucc,
|
||||
BestClusterPred,
|
||||
BestClusterSucc))) {
|
||||
BestGain = Gain;
|
||||
BestClusterPred = ClusterPred;
|
||||
BestClusterSucc = ClusterSucc;
|
||||
}
|
||||
});
|
||||
llvm::ThreadPool *Pool;
|
||||
if (!opts::NoThreads)
|
||||
Pool = &ParallelUtilities::getThreadPool();
|
||||
|
||||
while (Clusters.size() > 1) {
|
||||
MergeCandidateEntry GlobalMaximum;
|
||||
std::vector<MergeCandidateEntry> LocalMaximums(BucketsCount);
|
||||
|
||||
// Compare two candidates with a given gain
|
||||
auto compareCandidates = [](const MergeCandidateEntry &CandidateA,
|
||||
const MergeCandidateEntry &CandidateB) {
|
||||
// breaking ties by density to make the hottest clusters be
|
||||
// merged first
|
||||
return CandidateA.Gain > CandidateB.Gain ||
|
||||
(std::abs(CandidateA.Gain - CandidateB.Gain) < 1e-8 &&
|
||||
compareClusterPairs(
|
||||
CandidateA.ClusterPred, CandidateA.ClusterSucc,
|
||||
CandidateB.ClusterPred, CandidateB.ClusterSucc));
|
||||
};
|
||||
|
||||
// find the best candidates to merge within a bucket range
|
||||
auto findMaximaInBucket = [&](const unsigned Start, const unsigned End,
|
||||
const unsigned BucketId) {
|
||||
auto &LocalMaximum = LocalMaximums[BucketId];
|
||||
|
||||
for (unsigned Idx = Start; Idx < End; Idx++) {
|
||||
if (Idx >= Clusters.size())
|
||||
return;
|
||||
|
||||
auto ClusterPred = Clusters[Idx];
|
||||
|
||||
// get best candidates to merge with the current cluster
|
||||
Adjacent.forAllAdjacent(
|
||||
ClusterPred,
|
||||
// find the best candidate
|
||||
[&](Cluster *ClusterSucc) {
|
||||
assert(ClusterPred != ClusterSucc &&
|
||||
"loop edges are not supported");
|
||||
|
||||
// compute the gain of merging two clusters
|
||||
const double Gain = mergeGain(ClusterPred, ClusterSucc);
|
||||
|
||||
// create a new candidate
|
||||
MergeCandidateEntry Candidate;
|
||||
Candidate.Gain = Gain;
|
||||
Candidate.ClusterPred = ClusterPred;
|
||||
Candidate.ClusterSucc = ClusterSucc;
|
||||
|
||||
if (compareCandidates(Candidate, LocalMaximum))
|
||||
LocalMaximum = Candidate;
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
unsigned BucketSize = Clusters.size() / BucketsCount;
|
||||
if (Clusters.size() % BucketsCount)
|
||||
BucketSize++;
|
||||
|
||||
// find the best candidate within each bucket
|
||||
unsigned BucketId = 0;
|
||||
for (unsigned ClusterIdx = 0; ClusterIdx < Clusters.size();
|
||||
ClusterIdx += BucketSize, BucketId++) {
|
||||
|
||||
if (opts::NoThreads) {
|
||||
findMaximaInBucket(ClusterIdx, ClusterIdx + BucketSize, BucketId);
|
||||
} else {
|
||||
Pool->async(findMaximaInBucket, ClusterIdx, ClusterIdx + BucketSize,
|
||||
BucketId);
|
||||
}
|
||||
}
|
||||
|
||||
// stop merging when there is no improvement
|
||||
if (BestGain <= 0.0)
|
||||
if (!opts::NoThreads)
|
||||
Pool->wait();
|
||||
|
||||
// find global maximum
|
||||
for (auto &LocalMaximum : LocalMaximums) {
|
||||
if (LocalMaximum.Gain > 0 &&
|
||||
compareCandidates(LocalMaximum, GlobalMaximum))
|
||||
GlobalMaximum = LocalMaximum;
|
||||
}
|
||||
|
||||
if (GlobalMaximum.Gain <= 0.0)
|
||||
break;
|
||||
|
||||
// merge the best pair of clusters
|
||||
mergeClusters(BestClusterPred, BestClusterSucc);
|
||||
DEBUG(outs() << "merging##" << GlobalMaximum.ClusterPred->id() << "##"
|
||||
<< GlobalMaximum.ClusterSucc->id() << "@@"
|
||||
<< GlobalMaximum.Gain << "\n");
|
||||
|
||||
mergeClusters(GlobalMaximum.ClusterPred, GlobalMaximum.ClusterSucc);
|
||||
}
|
||||
|
||||
DEBUG(outs() << "BOLT-INFO: hfsort+ pass two finished in" << IterationCount
|
||||
<< " iterations.");
|
||||
}
|
||||
|
||||
/// Run hfsort+ algorithm and return ordered set of function clusters.
|
||||
std::vector<Cluster> run() {
|
||||
DEBUG(dbgs() << "Starting hfsort+ w/"
|
||||
<< (UseGainCache ? "gain cache" : "no cache")
|
||||
<< " for " << Clusters.size() << " clusters "
|
||||
<< (UseGainCache ? "gain cache" : "no cache") << " for "
|
||||
<< Clusters.size() << " clusters "
|
||||
<< "with ITLBPageSize = " << ITLBPageSize << ", "
|
||||
<< "ITLBEntries = " << ITLBEntries << ", "
|
||||
<< "and MergeProbability = " << opts::MergeProbability << "\n");
|
||||
<< "and MergeProbability = " << opts::MergeProbability
|
||||
<< "\n");
|
||||
|
||||
// Pass 1
|
||||
runPassOne();
|
||||
@ -370,7 +436,8 @@ public:
|
||||
// Pass 2
|
||||
runPassTwo();
|
||||
|
||||
DEBUG(dbgs() << "Completed hfsort+ with " << Clusters.size() << " clusters\n");
|
||||
DEBUG(dbgs() << "Completed hfsort+ with " << Clusters.size()
|
||||
<< " clusters\n");
|
||||
|
||||
// Sorting clusters by density in decreasing order
|
||||
std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);
|
||||
@ -418,6 +485,13 @@ public:
|
||||
}
|
||||
|
||||
private:
|
||||
/// A struct that is used to store a merge candidate
|
||||
struct MergeCandidateEntry {
|
||||
double Gain{-1};
|
||||
Cluster *ClusterPred{nullptr};
|
||||
Cluster *ClusterSucc{nullptr};
|
||||
};
|
||||
|
||||
/// Initialize the set of active clusters, function id to cluster mapping,
|
||||
/// total number of samples and function addresses.
|
||||
std::vector<Cluster *> initializeClusters() {
|
||||
@ -502,7 +576,7 @@ private:
|
||||
// when a pair of clusters (x,y) gets merged, we need to invalidate the pairs
|
||||
// containing both x and y and all clusters adjacent to x and y (and recompute
|
||||
// them on the next iteration).
|
||||
mutable ClusterPairCache<Cluster, double> GainCache;
|
||||
mutable ClusterPairCacheThreadSafe<Cluster, double> GainCache;
|
||||
};
|
||||
|
||||
} // end namespace anonymous
|
||||
|
||||
@ -9,9 +9,12 @@
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
|
||||
#include "Passes/IdenticalCodeFolding.h"
|
||||
#include "ParallelUtilities.h"
|
||||
#include "llvm/Support/Options.h"
|
||||
#include "llvm/Support/ThreadPool.h"
|
||||
#include "llvm/Support/Timer.h"
|
||||
#include <atomic>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <unordered_map>
|
||||
@ -32,6 +35,12 @@ UseDFS("icf-dfs",
|
||||
cl::ZeroOrMore,
|
||||
cl::cat(BoltOptCategory));
|
||||
|
||||
// Boolean command-line flag "time-icf" (cl::ReallyHidden). It is passed as the
// last argument to the NamedRegionTimer objects created for the ICF steps
// (hashing, congruent buckets, folding passes) in this file.
static cl::opt<bool>
|
||||
TimeICF("time-icf",
|
||||
cl::desc("time icf steps"),
|
||||
cl::ReallyHidden,
|
||||
cl::ZeroOrMore,
|
||||
cl::cat(BoltOptCategory));
|
||||
} // namespace opts
|
||||
|
||||
namespace {
|
||||
@ -276,72 +285,108 @@ bool isIdenticalWith(const BinaryFunction &A, const BinaryFunction &B,
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// This hash table is used to identify identical functions. It maps
|
||||
// a function to a bucket of functions identical to it.
|
||||
struct KeyHash {
|
||||
std::size_t operator()(const BinaryFunction *F) const {
|
||||
return F->hash(/*Recompute=*/false);
|
||||
}
|
||||
};
|
||||
|
||||
struct KeyCongruent {
|
||||
bool operator()(const BinaryFunction *A, const BinaryFunction *B) const {
|
||||
if (A == B)
|
||||
return true;
|
||||
return isIdenticalWith(*A, *B, /*IgnoreSymbols=*/true, opts::UseDFS);
|
||||
}
|
||||
};
|
||||
|
||||
struct KeyEqual {
|
||||
bool operator()(const BinaryFunction *A, const BinaryFunction *B) const {
|
||||
if (A == B)
|
||||
return true;
|
||||
return isIdenticalWith(*A, *B, /*IgnoreSymbols=*/false, opts::UseDFS);
|
||||
}
|
||||
};
|
||||
|
||||
typedef std::unordered_map<BinaryFunction *, std::set<BinaryFunction *>,
|
||||
KeyHash, KeyCongruent>
|
||||
CongruentBucketsMap;
|
||||
|
||||
typedef std::unordered_map<BinaryFunction *, std::vector<BinaryFunction *>,
|
||||
KeyHash, KeyEqual>
|
||||
IdenticalBucketsMap;
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace llvm {
|
||||
namespace bolt {
|
||||
|
||||
void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &) {
|
||||
const auto OriginalFunctionCount = BFs.size();
|
||||
uint64_t NumFunctionsFolded = 0;
|
||||
uint64_t NumJTFunctionsFolded = 0;
|
||||
uint64_t BytesSavedEstimate = 0;
|
||||
uint64_t CallsSavedEstimate = 0;
|
||||
void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) {
|
||||
const auto OriginalFunctionCount = BC.getBinaryFunctions().size();
|
||||
uint64_t NumFunctionsFolded{0};
|
||||
std::atomic<uint64_t> NumJTFunctionsFolded{0};
|
||||
std::atomic<uint64_t> BytesSavedEstimate{0};
|
||||
std::atomic<uint64_t> CallsSavedEstimate{0};
|
||||
std::atomic<uint64_t> NumFoldedLastIteration{0};
|
||||
CongruentBucketsMap CongruentBuckets;
|
||||
|
||||
// This hash table is used to identify identical functions. It maps
|
||||
// a function to a bucket of functions identical to it.
|
||||
struct KeyHash {
|
||||
std::size_t operator()(const BinaryFunction *F) const {
|
||||
return F->hash(/*Recompute=*/false);
|
||||
}
|
||||
};
|
||||
struct KeyCongruent {
|
||||
bool operator()(const BinaryFunction *A, const BinaryFunction *B) const {
|
||||
return isIdenticalWith(*A, *B, /*IgnoreSymbols=*/true, opts::UseDFS);
|
||||
}
|
||||
};
|
||||
struct KeyEqual {
|
||||
bool operator()(const BinaryFunction *A, const BinaryFunction *B) const {
|
||||
return isIdenticalWith(*A, *B, /*IgnoreSymbols=*/false, opts::UseDFS);
|
||||
}
|
||||
// Hash all the functions
|
||||
auto hashFunctions = [&]() {
|
||||
NamedRegionTimer HashFunctionsTimer("hashing", "hashing", "ICF breakdown",
|
||||
"ICF breakdown", opts::TimeICF);
|
||||
ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
|
||||
// Make sure indices are in-order.
|
||||
BF.updateLayoutIndices();
|
||||
|
||||
// Pre-compute hash before pushing into hashtable.
|
||||
BF.hash(/*Recompute=*/true, opts::UseDFS);
|
||||
};
|
||||
|
||||
ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
|
||||
return !shouldOptimize(BF);
|
||||
};
|
||||
|
||||
ParallelUtilities::runOnEachFunction(
|
||||
BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun, SkipFunc,
|
||||
"hashFunctions", /*ForceSequential*/ false, 2);
|
||||
};
|
||||
|
||||
// Create buckets with congruent functions - functions that potentially could
|
||||
// be folded.
|
||||
std::unordered_map<BinaryFunction *, std::set<BinaryFunction *>,
|
||||
KeyHash, KeyCongruent> CongruentBuckets;
|
||||
for (auto &BFI : BFs) {
|
||||
auto &BF = BFI.second;
|
||||
if (!shouldOptimize(BF) || BF.isFolded())
|
||||
continue;
|
||||
|
||||
// Make sure indices are in-order.
|
||||
BF.updateLayoutIndices();
|
||||
|
||||
// Pre-compute hash before pushing into hashtable.
|
||||
BF.hash(/*Recompute=*/true, opts::UseDFS);
|
||||
|
||||
CongruentBuckets[&BF].emplace(&BF);
|
||||
}
|
||||
|
||||
// We repeat the pass until no new modifications happen.
|
||||
unsigned Iteration = 1;
|
||||
uint64_t NumFoldedLastIteration;
|
||||
do {
|
||||
NumFoldedLastIteration = 0;
|
||||
|
||||
DEBUG(dbgs() << "BOLT-DEBUG: ICF iteration " << Iteration << "...\n");
|
||||
|
||||
for (auto &CBI : CongruentBuckets) {
|
||||
auto &Candidates = CBI.second;
|
||||
if (Candidates.size() < 2)
|
||||
// Creates buckets with congruent functions - functions that potentially
|
||||
// could be folded.
|
||||
auto createCongruentBuckets = [&]() {
|
||||
NamedRegionTimer CongruentBucketsTimer("congruent buckets",
|
||||
"congruent buckets", "ICF breakdown",
|
||||
"ICF breakdown", opts::TimeICF);
|
||||
for (auto &BFI : BC.getBinaryFunctions()) {
|
||||
auto &BF = BFI.second;
|
||||
if (!this->shouldOptimize(BF))
|
||||
continue;
|
||||
CongruentBuckets[&BF].emplace(&BF);
|
||||
}
|
||||
};
|
||||
|
||||
// Partition each set of congruent functions into sets of identical functions
|
||||
// and fold them
|
||||
auto performFoldingPass = [&]() {
|
||||
NamedRegionTimer FoldingPassesTimer("folding passes", "folding passes",
|
||||
"ICF breakdown", "ICF breakdown",
|
||||
opts::TimeICF);
|
||||
Timer SinglePass("single fold pass", "single fold pass");
|
||||
DEBUG(SinglePass.startTimer());
|
||||
|
||||
ThreadPool *ThPool;
|
||||
if (!opts::NoThreads)
|
||||
ThPool = &ParallelUtilities::getThreadPool();
|
||||
|
||||
// Fold identical functions within a single congruent bucket
|
||||
auto procesSingleBucket = [&](std::set<BinaryFunction *> &Candidates) {
|
||||
Timer T("folding single congruent list", "folding single congruent list");
|
||||
DEBUG(T.startTimer());
|
||||
|
||||
// Identical functions go into the same bucket.
|
||||
std::unordered_map<BinaryFunction *, std::vector<BinaryFunction *>,
|
||||
KeyHash, KeyEqual> IdenticalBuckets;
|
||||
IdenticalBucketsMap IdenticalBuckets;
|
||||
for (auto *BF : Candidates) {
|
||||
IdenticalBuckets[BF].emplace_back(BF);
|
||||
}
|
||||
@ -355,9 +400,10 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC,
|
||||
// Fold functions. Keep the order consistent across invocations with
|
||||
// different options.
|
||||
std::stable_sort(Twins.begin(), Twins.end(),
|
||||
[](const BinaryFunction *A, const BinaryFunction *B) {
|
||||
return A->getFunctionNumber() < B->getFunctionNumber();
|
||||
});
|
||||
[](const BinaryFunction *A, const BinaryFunction *B) {
|
||||
return A->getFunctionNumber() <
|
||||
B->getFunctionNumber();
|
||||
});
|
||||
|
||||
BinaryFunction *ParentBF = Twins[0];
|
||||
for (unsigned i = 1; i < Twins.size(); ++i) {
|
||||
@ -375,7 +421,7 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC,
|
||||
BytesSavedEstimate += ChildBF->getSize();
|
||||
CallsSavedEstimate += std::min(ChildBF->getKnownExecutionCount(),
|
||||
ParentBF->getKnownExecutionCount());
|
||||
BC.foldFunction(*ChildBF, *ParentBF, BFs);
|
||||
BC.foldFunction(*ChildBF, *ParentBF);
|
||||
|
||||
++NumFoldedLastIteration;
|
||||
|
||||
@ -384,13 +430,44 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC,
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG(T.stopTimer());
|
||||
};
|
||||
|
||||
// Create a task for each congruent bucket
|
||||
for (auto &Entry : CongruentBuckets) {
|
||||
auto &Bucket = Entry.second;
|
||||
if (Bucket.size() < 2)
|
||||
continue;
|
||||
|
||||
if (opts::NoThreads)
|
||||
procesSingleBucket(Bucket);
|
||||
else
|
||||
ThPool->async(procesSingleBucket, std::ref(Bucket));
|
||||
}
|
||||
|
||||
if (!opts::NoThreads)
|
||||
ThPool->wait();
|
||||
|
||||
DEBUG(SinglePass.stopTimer());
|
||||
};
|
||||
|
||||
hashFunctions();
|
||||
createCongruentBuckets();
|
||||
|
||||
unsigned Iteration = 1;
|
||||
// We repeat the pass until no new modifications happen.
|
||||
do {
|
||||
NumFoldedLastIteration = 0;
|
||||
DEBUG(dbgs() << "BOLT-DEBUG: ICF iteration " << Iteration << "...\n");
|
||||
|
||||
performFoldingPass();
|
||||
|
||||
NumFunctionsFolded += NumFoldedLastIteration;
|
||||
++Iteration;
|
||||
|
||||
} while (NumFoldedLastIteration > 0);
|
||||
|
||||
DEBUG(
|
||||
DEBUG(
|
||||
// Print functions that are congruent but not identical.
|
||||
for (auto &CBI : CongruentBuckets) {
|
||||
auto &Candidates = CBI.second;
|
||||
|
||||
@ -23,6 +23,16 @@ namespace bolt {
|
||||
/// references to a single one of them.
|
||||
///
|
||||
class IdenticalCodeFolding : public BinaryFunctionPass {
|
||||
protected:
|
||||
bool shouldOptimize(const BinaryFunction &BF) const override {
|
||||
if (BF.hasUnknownControlFlow())
|
||||
return false;
|
||||
if (BF.isFolded())
|
||||
return false;
|
||||
if (BF.hasSDTMarker())
|
||||
return false;
|
||||
return BinaryFunctionPass::shouldOptimize(BF);
|
||||
}
|
||||
public:
|
||||
explicit IdenticalCodeFolding(const cl::opt<bool> &PrintPass)
|
||||
: BinaryFunctionPass(PrintPass) { }
|
||||
@ -30,9 +40,7 @@ public:
|
||||
const char *getName() const override {
|
||||
return "identical-code-folding";
|
||||
}
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
} // namespace bolt
|
||||
|
||||
@ -40,11 +40,43 @@ IndirectCallPromotion("indirect-call-promotion",
|
||||
cl::cat(BoltOptCategory));
|
||||
|
||||
static cl::opt<unsigned>
|
||||
IndirectCallPromotionThreshold(
|
||||
"indirect-call-promotion-threshold",
|
||||
cl::desc("threshold for optimizing a frequently taken indirect call"),
|
||||
cl::init(90),
|
||||
ICPJTRemainingPercentThreshold(
|
||||
"icp-jt-remaining-percent-threshold",
|
||||
cl::desc("The percentage threshold against remaining unpromoted indirect "
|
||||
"call count for the promotion for jump tables"),
|
||||
cl::init(30),
|
||||
cl::ZeroOrMore,
|
||||
cl::Hidden,
|
||||
cl::cat(BoltOptCategory));
|
||||
|
||||
// Hidden option "icp-jt-total-percent-threshold", default 5. When scanning hot
// jump-table targets, the scan stops at the first target for which
// 100 * MemAccesses < TotalMemAccesses * this value.
static cl::opt<unsigned>
|
||||
ICPJTTotalPercentThreshold(
|
||||
"icp-jt-total-percent-threshold",
|
||||
cl::desc("The percentage threshold against total count for the promotion for "
|
||||
"jump tables"),
|
||||
cl::init(5),
|
||||
cl::ZeroOrMore,
|
||||
cl::Hidden,
|
||||
cl::cat(BoltOptCategory));
|
||||
|
||||
// Hidden option "icp-calls-remaining-percent-threshold", default 50.
// NOTE(review): presumably the call-site counterpart of the jump-table
// "remaining" threshold; its use is not visible here -- confirm.
static cl::opt<unsigned>
|
||||
ICPCallsRemainingPercentThreshold(
|
||||
"icp-calls-remaining-percent-threshold",
|
||||
cl::desc("The percentage threshold against remaining unpromoted indirect "
|
||||
"call count for the promotion for calls"),
|
||||
cl::init(50),
|
||||
cl::ZeroOrMore,
|
||||
cl::Hidden,
|
||||
cl::cat(BoltOptCategory));
|
||||
|
||||
// Hidden option "icp-calls-total-percent-threshold", default 30.
// NOTE(review): presumably the call-site counterpart of the jump-table
// "total" threshold; its use is not visible here -- confirm.
static cl::opt<unsigned>
|
||||
ICPCallsTotalPercentThreshold(
|
||||
"icp-calls-total-percent-threshold",
|
||||
cl::desc("The percentage threshold against total count for the promotion for "
|
||||
"calls"),
|
||||
cl::init(30),
|
||||
cl::ZeroOrMore,
|
||||
cl::Hidden,
|
||||
cl::cat(BoltOptCategory));
|
||||
|
||||
static cl::opt<unsigned>
|
||||
@ -52,7 +84,7 @@ IndirectCallPromotionMispredictThreshold(
|
||||
"indirect-call-promotion-mispredict-threshold",
|
||||
cl::desc("misprediction threshold for skipping ICP on an "
|
||||
"indirect call"),
|
||||
cl::init(2),
|
||||
cl::init(0),
|
||||
cl::ZeroOrMore,
|
||||
cl::cat(BoltOptCategory));
|
||||
|
||||
@ -69,17 +101,17 @@ IndirectCallPromotionUseMispredicts(
|
||||
static cl::opt<unsigned>
|
||||
IndirectCallPromotionTopN(
|
||||
"indirect-call-promotion-topn",
|
||||
cl::desc("number of targets to consider when doing indirect "
|
||||
"call promotion"),
|
||||
cl::init(1),
|
||||
cl::desc("limit number of targets to consider when doing indirect "
|
||||
"call promotion. 0 = no limit"),
|
||||
cl::init(3),
|
||||
cl::ZeroOrMore,
|
||||
cl::cat(BoltOptCategory));
|
||||
|
||||
static cl::opt<unsigned>
|
||||
IndirectCallPromotionCallsTopN(
|
||||
"indirect-call-promotion-calls-topn",
|
||||
cl::desc("number of targets to consider when doing indirect "
|
||||
"call promotion on calls"),
|
||||
cl::desc("limit number of targets to consider when doing indirect "
|
||||
"call promotion on calls. 0 = no limit"),
|
||||
cl::init(0),
|
||||
cl::ZeroOrMore,
|
||||
cl::cat(BoltOptCategory));
|
||||
@ -87,8 +119,8 @@ IndirectCallPromotionCallsTopN(
|
||||
static cl::opt<unsigned>
|
||||
IndirectCallPromotionJumpTablesTopN(
|
||||
"indirect-call-promotion-jump-tables-topn",
|
||||
cl::desc("number of targets to consider when doing indirect "
|
||||
"call promotion on jump tables"),
|
||||
cl::desc("limit number of targets to consider when doing indirect "
|
||||
"call promotion on jump tables. 0 = no limit"),
|
||||
cl::init(0),
|
||||
cl::ZeroOrMore,
|
||||
cl::cat(BoltOptCategory));
|
||||
@ -106,8 +138,8 @@ static cl::opt<unsigned>
|
||||
ICPTopCallsites(
|
||||
"icp-top-callsites",
|
||||
cl::desc("only optimize calls that contribute to this percentage of all "
|
||||
"indirect calls"),
|
||||
cl::init(0),
|
||||
"indirect calls. 0 = all callsites"),
|
||||
cl::init(99),
|
||||
cl::Hidden,
|
||||
cl::ZeroOrMore,
|
||||
cl::cat(BoltOptCategory));
|
||||
@ -181,6 +213,42 @@ IndirectCallPromotion::Callsite::Callsite(BinaryFunction &BF,
|
||||
}
|
||||
}
|
||||
|
||||
/// Print a summary of the ICP decision for one call site to \p OS.
///
/// A header line reports the number of targets and the summed branch and
/// mispredict counts (a zero sum is reported, and used as a divisor, as 1).
/// One line per target follows with its counts and their share of the totals.
/// Targets are flagged " * to be optimized *" while the running index is
/// below \p N; the index advances by the number of jump table indices of the
/// target, or by one if it has none.
void IndirectCallPromotion::printDecision(
    llvm::raw_ostream &OS,
    std::vector<IndirectCallPromotion::Callsite> &Targets, unsigned N) const {
  uint64_t BranchSum = 0;
  uint64_t MispredSum = 0;
  for (const auto &Target : Targets) {
    BranchSum += Target.Branches;
    MispredSum += Target.Mispreds;
  }

  // Avoid dividing by zero when computing the percentages below.
  if (BranchSum == 0)
    BranchSum = 1;
  if (MispredSum == 0)
    MispredSum = 1;

  OS << "BOLT-INFO: ICP decision for call site with " << Targets.size();
  OS << " targets, Count = " << BranchSum;
  OS << ", Mispreds = " << MispredSum << "\n";

  size_t Promoted = 0;
  for (const auto &Target : Targets) {
    const double BranchPct = (100.0 * Target.Branches) / BranchSum;
    const double MispredPct = (100.0 * Target.Mispreds) / MispredSum;

    OS << "Count = " << Target.Branches << ", ";
    OS << format("%.1f", BranchPct) << ", ";
    OS << "Mispreds = " << Target.Mispreds << ", ";
    OS << format("%.1f", MispredPct);

    if (Promoted < N)
      OS << " * to be optimized *";

    const bool HasIndices = !Target.JTIndices.empty();
    if (HasIndices) {
      OS << " Indices:";
      for (const auto JTIdx : Target.JTIndices)
        OS << " " << JTIdx;
    }
    OS << "\n";

    if (HasIndices)
      Promoted += Target.JTIndices.size();
    else
      Promoted += 1;
  }
}
|
||||
|
||||
// Get list of targets for a given call sorted by most frequently
|
||||
// called first.
|
||||
std::vector<IndirectCallPromotion::Callsite>
|
||||
@ -242,7 +310,8 @@ IndirectCallPromotion::getCallTargets(
|
||||
auto &A = *Result;
|
||||
const auto &B = *First;
|
||||
if (A.To.Sym && B.To.Sym && A.To.Sym == B.To.Sym) {
|
||||
A.JTIndex.insert(A.JTIndex.end(), B.JTIndex.begin(), B.JTIndex.end());
|
||||
A.JTIndices.insert(A.JTIndices.end(), B.JTIndices.begin(),
|
||||
B.JTIndices.end());
|
||||
} else {
|
||||
*(++Result) = *First;
|
||||
}
|
||||
@ -272,10 +341,17 @@ IndirectCallPromotion::getCallTargets(
|
||||
}
|
||||
}
|
||||
|
||||
// Sort by most commonly called targets.
|
||||
// Sort by target count, number of indices in case of jump table, and
|
||||
// mispredicts. We prioritize targets with high count, small number of
|
||||
// indices and high mispredicts
|
||||
std::stable_sort(Targets.begin(), Targets.end(),
|
||||
[](const Callsite &A, const Callsite &B) {
|
||||
return A.Branches > B.Branches;
|
||||
if (A.Branches != B.Branches)
|
||||
return A.Branches > B.Branches;
|
||||
else if (A.JTIndices.size() != B.JTIndices.size())
|
||||
return A.JTIndices.size() < B.JTIndices.size();
|
||||
else
|
||||
return A.Mispreds > B.Mispreds;
|
||||
});
|
||||
|
||||
// Remove non-symbol targets
|
||||
@ -380,9 +456,9 @@ IndirectCallPromotion::maybeGetHotJumpTableTargets(
|
||||
|
||||
uint64_t ArrayStart;
|
||||
if (DispExpr) {
|
||||
auto *BD = BC.getBinaryDataByName(DispExpr->getSymbol().getName());
|
||||
assert(BD && "global symbol needs a value");
|
||||
ArrayStart = BD->getAddress();
|
||||
auto DispValueOrError = BC.getSymbolValue(DispExpr->getSymbol());
|
||||
assert(DispValueOrError && "global symbol needs a value");
|
||||
ArrayStart = *DispValueOrError;
|
||||
} else {
|
||||
ArrayStart = static_cast<uint64_t>(DispValue);
|
||||
}
|
||||
@ -491,7 +567,7 @@ IndirectCallPromotion::SymTargetsType
|
||||
IndirectCallPromotion::findCallTargetSymbols(
|
||||
BinaryContext &BC,
|
||||
std::vector<Callsite> &Targets,
|
||||
const size_t N,
|
||||
size_t &N,
|
||||
BinaryFunction &Function,
|
||||
BinaryBasicBlock *BB,
|
||||
MCInst &CallInst,
|
||||
@ -511,7 +587,7 @@ IndirectCallPromotion::findCallTargetSymbols(
|
||||
if (!HotTargets.empty()) {
|
||||
auto findTargetsIndex = [&](uint64_t JTIndex) {
|
||||
for (size_t I = 0; I < Targets.size(); ++I) {
|
||||
auto &JTIs = Targets[I].JTIndex;
|
||||
auto &JTIs = Targets[I].JTIndices;
|
||||
if (std::find(JTIs.begin(), JTIs.end(), JTIndex) != JTIs.end())
|
||||
return I;
|
||||
}
|
||||
@ -521,35 +597,81 @@ IndirectCallPromotion::findCallTargetSymbols(
|
||||
"callsite");
|
||||
};
|
||||
|
||||
const auto MaxHotTargets = std::min(N, HotTargets.size());
|
||||
|
||||
if (opts::Verbosity >= 1) {
|
||||
for (size_t I = 0; I < MaxHotTargets; ++I) {
|
||||
for (size_t I = 0; I < HotTargets.size(); ++I) {
|
||||
outs() << "BOLT-INFO: HotTarget[" << I << "] = ("
|
||||
<< HotTargets[I].first << ", " << HotTargets[I].second << ")\n";
|
||||
}
|
||||
}
|
||||
|
||||
// Recompute hottest targets, now discriminating which index is hot
|
||||
// NOTE: This is a tradeoff. On one hand, we get index information. On the
|
||||
// other hand, info coming from the memory profile is much less accurate
|
||||
// than LBRs. So we may actually end up working with more coarse
|
||||
// profile granularity in exchange for information about indices.
|
||||
std::vector<Callsite> NewTargets;
|
||||
for (size_t I = 0; I < MaxHotTargets; ++I) {
|
||||
std::map<const MCSymbol *, uint32_t> IndicesPerTarget;
|
||||
uint64_t TotalMemAccesses = 0;
|
||||
for (size_t I = 0; I < HotTargets.size(); ++I) {
|
||||
const auto TargetIndex = findTargetsIndex(HotTargets[I].second);
|
||||
++IndicesPerTarget[Targets[TargetIndex].To.Sym];
|
||||
TotalMemAccesses += HotTargets[I].first;
|
||||
}
|
||||
uint64_t RemainingMemAccesses = TotalMemAccesses;
|
||||
// Limit on the number of hot jump-table targets to promote: use the
// jump-table-specific limit when it is set (non-zero), otherwise fall back to
// the generic top-N limit. Previously both arms of the conditional yielded the
// generic limit, so the jump-table option was tested but never applied.
const size_t TopN = opts::IndirectCallPromotionJumpTablesTopN != 0
                        ? opts::IndirectCallPromotionJumpTablesTopN
                        : opts::IndirectCallPromotionTopN;
|
||||
size_t I{0};
|
||||
for (; I < HotTargets.size(); ++I) {
|
||||
const auto MemAccesses = HotTargets[I].first;
|
||||
if (100 * MemAccesses <
|
||||
TotalMemAccesses * opts::ICPJTTotalPercentThreshold)
|
||||
break;
|
||||
if (100 * MemAccesses <
|
||||
RemainingMemAccesses * opts::ICPJTRemainingPercentThreshold)
|
||||
break;
|
||||
if (TopN && I >= TopN)
|
||||
break;
|
||||
RemainingMemAccesses -= MemAccesses;
|
||||
|
||||
const auto JTIndex = HotTargets[I].second;
|
||||
const auto TargetIndex = findTargetsIndex(JTIndex);
|
||||
auto &Target = Targets[findTargetsIndex(JTIndex)];
|
||||
|
||||
NewTargets.push_back(Targets[TargetIndex]);
|
||||
std::vector<uint64_t>({JTIndex}).swap(NewTargets.back().JTIndex);
|
||||
NewTargets.push_back(Target);
|
||||
std::vector<uint64_t>({JTIndex}).swap(NewTargets.back().JTIndices);
|
||||
Target.JTIndices.erase(std::remove(Target.JTIndices.begin(),
|
||||
Target.JTIndices.end(), JTIndex),
|
||||
Target.JTIndices.end());
|
||||
|
||||
Targets.erase(Targets.begin() + TargetIndex);
|
||||
// Keep fixCFG counts sane if more indices use this same target later
|
||||
assert(IndicesPerTarget[Target.To.Sym] > 0 && "wrong map");
|
||||
NewTargets.back().Branches =
|
||||
Target.Branches / IndicesPerTarget[Target.To.Sym];
|
||||
NewTargets.back().Mispreds =
|
||||
Target.Mispreds / IndicesPerTarget[Target.To.Sym];
|
||||
assert(Target.Branches >= NewTargets.back().Branches);
|
||||
assert(Target.Mispreds >= NewTargets.back().Mispreds);
|
||||
Target.Branches -= NewTargets.back().Branches;
|
||||
Target.Mispreds -= NewTargets.back().Mispreds;
|
||||
}
|
||||
std::copy(Targets.begin(), Targets.end(), std::back_inserter(NewTargets));
|
||||
assert(NewTargets.size() == Targets.size() + MaxHotTargets);
|
||||
std::swap(NewTargets, Targets);
|
||||
N = I;
|
||||
|
||||
if (N == 0 && opts::Verbosity >= 1) {
|
||||
outs() << "BOLT-INFO: ICP failed in " << Function << " in "
|
||||
<< BB->getName()
|
||||
<< ": failed to meet thresholds after memory profile data was "
|
||||
"loaded.\n";
|
||||
return SymTargets;
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t I = 0, TgtIdx = 0; I < N; ++TgtIdx) {
|
||||
auto &Target = Targets[TgtIdx];
|
||||
assert(Target.To.Sym && "All ICP targets must be to known symbols");
|
||||
assert(!Target.JTIndex.empty() && "Jump tables must have indices");
|
||||
for (auto Idx : Target.JTIndex) {
|
||||
assert(!Target.JTIndices.empty() && "Jump tables must have indices");
|
||||
for (auto Idx : Target.JTIndices) {
|
||||
SymTargets.push_back(std::make_pair(Target.To.Sym, Idx));
|
||||
++I;
|
||||
}
|
||||
@ -558,7 +680,7 @@ IndirectCallPromotion::findCallTargetSymbols(
|
||||
for (size_t I = 0; I < N; ++I) {
|
||||
assert(Targets[I].To.Sym &&
|
||||
"All ICP targets must be to known symbols");
|
||||
assert(Targets[I].JTIndex.empty() &&
|
||||
assert(Targets[I].JTIndices.empty() &&
|
||||
"Can't have jump table indices for non-jump tables");
|
||||
SymTargets.push_back(std::make_pair(Targets[I].To.Sym, 0));
|
||||
}
|
||||
@ -647,7 +769,7 @@ IndirectCallPromotion::maybeGetVtableSyms(
|
||||
<< "+" << MethodOffset << "/" << MI.Count
|
||||
<< "\n");
|
||||
|
||||
if (auto MethodAddr = BC.extractPointerAtAddress(Address)) {
|
||||
if (auto MethodAddr = BC.getPointerAtAddress(Address)) {
|
||||
auto *MethodBD = BC.getBinaryDataAtAddress(MethodAddr.get());
|
||||
if (!MethodBD) // skip unknown methods
|
||||
continue;
|
||||
@ -697,7 +819,7 @@ IndirectCallPromotion::rewriteCall(
|
||||
BinaryFunction &Function,
|
||||
BinaryBasicBlock *IndCallBlock,
|
||||
const MCInst &CallInst,
|
||||
MCPlusBuilder::ICPdata &&ICPcode,
|
||||
MCPlusBuilder::BlocksVectorTy &&ICPcode,
|
||||
const std::vector<MCInst *> &MethodFetchInsns
|
||||
) const {
|
||||
// Create new basic blocks with correct code in each one first.
|
||||
@ -720,6 +842,10 @@ IndirectCallPromotion::rewriteCall(
|
||||
}
|
||||
|
||||
auto MovedInst = IndCallBlock->splitInstructions(&CallInst);
|
||||
// Link new BBs to the original input offset of the BB where the indirect
|
||||
// call site is, so we can map samples recorded in new BBs back to the
|
||||
// original BB seen in the input binary (if using BAT)
|
||||
const auto OrigOffset = IndCallBlock->getInputOffset();
|
||||
|
||||
IndCallBlock->eraseInstructions(MethodFetchInsns.begin(),
|
||||
MethodFetchInsns.end());
|
||||
@ -737,7 +863,7 @@ IndirectCallPromotion::rewriteCall(
|
||||
auto &Sym = Itr->first;
|
||||
auto &Insts = Itr->second;
|
||||
assert(Sym);
|
||||
auto TBB = Function.createBasicBlock(0, Sym);
|
||||
auto TBB = Function.createBasicBlock(OrigOffset, Sym);
|
||||
for (auto &Inst : Insts) { // sanitize new instructions.
|
||||
if (BC.MIB->isCall(Inst))
|
||||
BC.MIB->removeAnnotation(Inst, "CallProfile");
|
||||
@ -774,10 +900,12 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG(
|
||||
for (const auto &Target : Targets) {
|
||||
TotalIndirectBranches += Target.Branches;
|
||||
}
|
||||
if (TotalIndirectBranches == 0)
|
||||
TotalIndirectBranches = 1;
|
||||
std::vector<BinaryBranchInfo> BBI;
|
||||
std::vector<BinaryBranchInfo> ScaledBBI;
|
||||
for (const auto &Target : Targets) {
|
||||
const auto NumEntries = std::max(1UL, Target.JTIndex.size());
|
||||
const auto NumEntries = std::max(1UL, Target.JTIndices.size());
|
||||
for (size_t I = 0; I < NumEntries; ++I) {
|
||||
BBI.push_back(
|
||||
BinaryBranchInfo{(Target.Branches + NumEntries - 1) / NumEntries,
|
||||
@ -796,7 +924,7 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG(
|
||||
|
||||
std::vector<MCSymbol*> SymTargets;
|
||||
for (const auto &Target : Targets) {
|
||||
const auto NumEntries = std::max(1UL, Target.JTIndex.size());
|
||||
const auto NumEntries = std::max(1UL, Target.JTIndices.size());
|
||||
for (size_t I = 0; I < NumEntries; ++I) {
|
||||
SymTargets.push_back(Target.To.Sym);
|
||||
}
|
||||
@ -924,15 +1052,12 @@ IndirectCallPromotion::canPromoteCallsite(const BinaryBasicBlock *BB,
|
||||
} else if (opts::IndirectCallPromotionCallsTopN != 0) {
|
||||
TopN = opts::IndirectCallPromotionCallsTopN;
|
||||
}
|
||||
const auto TrialN = std::min(TopN, Targets.size());
|
||||
const auto TrialN = TopN ? std::min(TopN, Targets.size()) : Targets.size();
|
||||
|
||||
if (opts::ICPTopCallsites > 0) {
|
||||
auto &BC = BB->getFunction()->getBinaryContext();
|
||||
if (BC.MIB->hasAnnotation(Inst, "DoICP")) {
|
||||
computeStats(TrialN);
|
||||
return TrialN;
|
||||
}
|
||||
return 0;
|
||||
if (!BC.MIB->hasAnnotation(Inst, "DoICP"))
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Pick the top N targets.
|
||||
@ -974,35 +1099,28 @@ IndirectCallPromotion::canPromoteCallsite(const BinaryBasicBlock *BB,
|
||||
// Count total number of calls for (at most) the top N targets.
|
||||
// We may choose a smaller N (TrialN vs. N) if the frequency threshold
|
||||
// is exceeded by fewer targets.
|
||||
double Threshold = double(opts::IndirectCallPromotionThreshold);
|
||||
for (size_t I = 0; I < TrialN && Threshold > 0; ++I, ++MaxTargets) {
|
||||
if (N + (Targets[I].JTIndex.empty() ? 1 : Targets[I].JTIndex.size()) >
|
||||
const unsigned TotalThreshold = IsJumpTable
|
||||
? opts::ICPJTTotalPercentThreshold
|
||||
: opts::ICPCallsTotalPercentThreshold;
|
||||
const unsigned RemainingThreshold =
|
||||
IsJumpTable ? opts::ICPJTRemainingPercentThreshold
|
||||
: opts::ICPCallsRemainingPercentThreshold;
|
||||
uint64_t NumRemainingCalls = NumCalls;
|
||||
for (size_t I = 0; I < TrialN; ++I, ++MaxTargets) {
|
||||
if (100 * Targets[I].Branches < NumCalls * TotalThreshold)
|
||||
break;
|
||||
if (100 * Targets[I].Branches < NumRemainingCalls * RemainingThreshold)
|
||||
break;
|
||||
if (N + (Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size()) >
|
||||
TrialN)
|
||||
break;
|
||||
TotalCallsTopN += Targets[I].Branches;
|
||||
TotalMispredictsTopN += Targets[I].Mispreds;
|
||||
Threshold -= (100.0 * Targets[I].Branches) / NumCalls;
|
||||
N += Targets[I].JTIndex.empty() ? 1 : Targets[I].JTIndex.size();
|
||||
NumRemainingCalls -= Targets[I].Branches;
|
||||
N += Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size();
|
||||
}
|
||||
computeStats(MaxTargets);
|
||||
|
||||
// Compute the frequency of the top N call targets. If this frequency
|
||||
// is greater than the threshold, we should try ICP on this callsite.
|
||||
const double TopNFrequency = (100.0 * TotalCallsTopN) / NumCalls;
|
||||
|
||||
if (TopNFrequency == 0 ||
|
||||
TopNFrequency < opts::IndirectCallPromotionThreshold) {
|
||||
if (opts::Verbosity >= 1) {
|
||||
const auto InstIdx = &Inst - &(*BB->begin());
|
||||
outs() << "BOLT-INFO: ICP failed in " << *BB->getFunction() << " @ "
|
||||
<< InstIdx << " in " << BB->getName() << ", calls = "
|
||||
<< NumCalls << ", top N frequency "
|
||||
<< format("%.1f", TopNFrequency) << "% < "
|
||||
<< opts::IndirectCallPromotionThreshold << "%\n";
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Don't check misprediction frequency for jump tables -- we don't really
|
||||
// care as long as we are saving loads from the jump table.
|
||||
if (!IsJumpTable || opts::ICPJumpTablesByTarget) {
|
||||
@ -1069,7 +1187,7 @@ IndirectCallPromotion::printCallsiteInfo(const BinaryBasicBlock *BB,
|
||||
<< ", taken freq = " << format("%.1f", Frequency) << "%"
|
||||
<< ", mis. freq = " << format("%.1f", MisFrequency) << "%";
|
||||
bool First = true;
|
||||
for (auto JTIndex : Targets[I].JTIndex) {
|
||||
for (auto JTIndex : Targets[I].JTIndices) {
|
||||
outs() << (First ? ", indices = " : ", ") << JTIndex;
|
||||
First = false;
|
||||
}
|
||||
@ -1082,14 +1200,12 @@ IndirectCallPromotion::printCallsiteInfo(const BinaryBasicBlock *BB,
|
||||
});
|
||||
}
|
||||
|
||||
void IndirectCallPromotion::runOnFunctions(
|
||||
BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions
|
||||
) {
|
||||
void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
|
||||
if (opts::IndirectCallPromotion == ICP_NONE)
|
||||
return;
|
||||
|
||||
auto &BFs = BC.getBinaryFunctions();
|
||||
|
||||
const bool OptimizeCalls =
|
||||
(opts::IndirectCallPromotion == ICP_CALLS ||
|
||||
opts::IndirectCallPromotion == ICP_ALL);
|
||||
@ -1100,7 +1216,7 @@ void IndirectCallPromotion::runOnFunctions(
|
||||
std::unique_ptr<RegAnalysis> RA;
|
||||
std::unique_ptr<BinaryFunctionCallGraph> CG;
|
||||
if (OptimizeJumpTables) {
|
||||
CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC, BFs)));
|
||||
CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC)));
|
||||
RA.reset(new RegAnalysis(BC, &BFs, &*CG));
|
||||
}
|
||||
|
||||
@ -1148,8 +1264,13 @@ void IndirectCallPromotion::runOnFunctions(
|
||||
// If icp-top-callsites is enabled, compute the total number of indirect
|
||||
// calls and then optimize the hottest callsites that contribute to that
|
||||
// total.
|
||||
if (opts::ICPTopCallsites > 0) {
|
||||
using IndirectCallsite = std::pair<uint64_t, MCInst *>;
|
||||
SetVector<BinaryFunction *> Functions;
|
||||
if (opts::ICPTopCallsites == 0) {
|
||||
for (auto &KV : BFs) {
|
||||
Functions.insert(&KV.second);
|
||||
}
|
||||
} else {
|
||||
using IndirectCallsite = std::tuple<uint64_t, MCInst *, BinaryFunction *>;
|
||||
std::vector<IndirectCallsite> IndirectCalls;
|
||||
size_t TotalIndirectCalls = 0;
|
||||
|
||||
@ -1183,7 +1304,7 @@ void IndirectCallPromotion::runOnFunctions(
|
||||
NumCalls += BInfo.Branches;
|
||||
}
|
||||
|
||||
IndirectCalls.push_back(std::make_pair(NumCalls, &Inst));
|
||||
IndirectCalls.push_back(std::make_tuple(NumCalls, &Inst, &Function));
|
||||
TotalIndirectCalls += NumCalls;
|
||||
}
|
||||
}
|
||||
@ -1198,30 +1319,25 @@ void IndirectCallPromotion::runOnFunctions(
|
||||
const float TopPerc = opts::ICPTopCallsites / 100.0f;
|
||||
int64_t MaxCalls = TotalIndirectCalls * TopPerc;
|
||||
size_t Num = 0;
|
||||
for (auto &IC : IndirectCalls) {
|
||||
for (const auto &IC : IndirectCalls) {
|
||||
if (MaxCalls <= 0)
|
||||
break;
|
||||
MaxCalls -= IC.first;
|
||||
MaxCalls -= std::get<0>(IC);
|
||||
BC.MIB->addAnnotation(*std::get<1>(IC), "DoICP", true);
|
||||
Functions.insert(std::get<2>(IC));
|
||||
++Num;
|
||||
}
|
||||
outs() << "BOLT-INFO: ICP Total indirect calls = " << TotalIndirectCalls
|
||||
<< ", " << Num << " callsites cover " << opts::ICPTopCallsites
|
||||
<< "% of all indirect calls\n";
|
||||
|
||||
// Mark sites to optimize with "DoICP" annotation.
|
||||
for (size_t I = 0; I < Num; ++I) {
|
||||
auto *Inst = IndirectCalls[I].second;
|
||||
BC.MIB->addAnnotation(*Inst, "DoICP", true);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto &BFIt : BFs) {
|
||||
auto &Function = BFIt.second;
|
||||
for (auto *FuncPtr : Functions) {
|
||||
auto &Function = *FuncPtr;
|
||||
|
||||
if (!Function.isSimple() || !opts::shouldProcess(Function))
|
||||
continue;
|
||||
|
||||
if (!Function.hasProfile())
|
||||
if (!Function.isSimple() ||
|
||||
!opts::shouldProcess(Function) ||
|
||||
!Function.hasProfile())
|
||||
continue;
|
||||
|
||||
const bool HasLayout = !Function.layout_empty();
|
||||
@ -1309,7 +1425,10 @@ void IndirectCallPromotion::runOnFunctions(
|
||||
// this callsite.
|
||||
size_t N = canPromoteCallsite(BB, Inst, Targets, NumCalls);
|
||||
|
||||
if (!N)
|
||||
// If it is a jump table and it failed to meet our initial threshold,
|
||||
// proceed to findCallTargetSymbols -- it may reevaluate N if
|
||||
// memory profile is present
|
||||
if (!N && !IsJumpTable)
|
||||
continue;
|
||||
|
||||
if (opts::Verbosity >= 1) {
|
||||
@ -1326,6 +1445,13 @@ void IndirectCallPromotion::runOnFunctions(
|
||||
Inst,
|
||||
TargetFetchInst);
|
||||
|
||||
// findCallTargetSymbols may have changed N if mem profile is available
|
||||
// for jump tables
|
||||
if (!N)
|
||||
continue;
|
||||
|
||||
DEBUG(printDecision(dbgs(), Targets, N));
|
||||
|
||||
// If we can't resolve any of the target symbols, punt on this callsite.
|
||||
// TODO: can this ever happen?
|
||||
if (SymTargets.size() < N) {
|
||||
@ -1446,12 +1572,12 @@ void IndirectCallPromotion::runOnFunctions(
|
||||
<< "BOLT-INFO: ICP percentage of indirect calls that can be "
|
||||
"optimized = "
|
||||
<< format("%.1f", (100.0 * TotalNumFrequentCalls) /
|
||||
std::max(TotalIndirectCalls, 1ul))
|
||||
std::max<size_t>(TotalIndirectCalls, 1))
|
||||
<< "%\n"
|
||||
<< "BOLT-INFO: ICP percentage of indirect callsites that are "
|
||||
"optimized = "
|
||||
<< format("%.1f", (100.0 * TotalOptimizedIndirectCallsites) /
|
||||
std::max(TotalIndirectCallsites, 1ul))
|
||||
std::max<uint64_t>(TotalIndirectCallsites, 1))
|
||||
<< "%\n"
|
||||
<< "BOLT-INFO: ICP number of method load elimination candidates = "
|
||||
<< TotalMethodLoadEliminationCandidates
|
||||
@ -1459,17 +1585,17 @@ void IndirectCallPromotion::runOnFunctions(
|
||||
<< "BOLT-INFO: ICP percentage of method calls candidates that have "
|
||||
"loads eliminated = "
|
||||
<< format("%.1f", (100.0 * TotalMethodLoadsEliminated) /
|
||||
std::max(TotalMethodLoadEliminationCandidates, 1ul))
|
||||
std::max<uint64_t>(TotalMethodLoadEliminationCandidates, 1))
|
||||
<< "%\n"
|
||||
<< "BOLT-INFO: ICP percentage of indirect branches that are "
|
||||
"optimized = "
|
||||
<< format("%.1f", (100.0 * TotalNumFrequentJmps) /
|
||||
std::max(TotalIndirectJmps, 1ul))
|
||||
std::max<uint64_t>(TotalIndirectJmps, 1))
|
||||
<< "%\n"
|
||||
<< "BOLT-INFO: ICP percentage of jump table callsites that are "
|
||||
<< "optimized = "
|
||||
<< format("%.1f", (100.0 * TotalOptimizedJumpTableCallsites) /
|
||||
std::max(TotalJumpTableCallsites, 1ul))
|
||||
std::max<uint64_t>(TotalJumpTableCallsites, 1))
|
||||
<< "%\n"
|
||||
<< "BOLT-INFO: ICP number of jump table callsites that can use hot "
|
||||
<< "indices = " << TotalIndexBasedCandidates
|
||||
@ -1477,7 +1603,7 @@ void IndirectCallPromotion::runOnFunctions(
|
||||
<< "BOLT-INFO: ICP percentage of jump table callsites that use hot "
|
||||
"indices = "
|
||||
<< format("%.1f", (100.0 * TotalIndexBasedJumps) /
|
||||
std::max(TotalIndexBasedCandidates, 1ul))
|
||||
std::max<uint64_t>(TotalIndexBasedCandidates, 1))
|
||||
<< "%\n";
|
||||
|
||||
#ifndef NDEBUG
|
||||
|
||||
@ -119,7 +119,7 @@ class IndirectCallPromotion : public BinaryFunctionPass {
|
||||
uint64_t Mispreds{0};
|
||||
uint64_t Branches{0};
|
||||
// Indices in the jmp table (jt only)
|
||||
std::vector<uint64_t> JTIndex;
|
||||
std::vector<uint64_t> JTIndices;
|
||||
bool isValid() const {
|
||||
return From.isValid() && To.isValid();
|
||||
}
|
||||
@ -128,7 +128,7 @@ class IndirectCallPromotion : public BinaryFunctionPass {
|
||||
uint64_t Mispreds, uint64_t Branches,
|
||||
uint64_t JTIndex)
|
||||
: From(From), To(To), Mispreds(Mispreds), Branches(Branches),
|
||||
JTIndex(1, JTIndex) { }
|
||||
JTIndices(1, JTIndex) { }
|
||||
};
|
||||
|
||||
std::unordered_set<const BinaryFunction *> Modified;
|
||||
@ -177,6 +177,10 @@ class IndirectCallPromotion : public BinaryFunctionPass {
|
||||
// Total number of jump table sites that use hot indices.
|
||||
uint64_t TotalIndexBasedJumps{0};
|
||||
|
||||
void printDecision(llvm::raw_ostream &OS,
|
||||
std::vector<IndirectCallPromotion::Callsite> &Targets,
|
||||
unsigned N) const;
|
||||
|
||||
std::vector<Callsite> getCallTargets(BinaryBasicBlock &BB,
|
||||
const MCInst &Inst) const;
|
||||
|
||||
@ -201,7 +205,7 @@ class IndirectCallPromotion : public BinaryFunctionPass {
|
||||
|
||||
SymTargetsType findCallTargetSymbols(BinaryContext &BC,
|
||||
std::vector<Callsite> &Targets,
|
||||
const size_t N,
|
||||
size_t &N,
|
||||
BinaryFunction &Function,
|
||||
BinaryBasicBlock *BB,
|
||||
MCInst &Inst,
|
||||
@ -218,7 +222,7 @@ class IndirectCallPromotion : public BinaryFunctionPass {
|
||||
BinaryFunction &Function,
|
||||
BinaryBasicBlock *IndCallBlock,
|
||||
const MCInst &CallInst,
|
||||
MCPlusBuilder::ICPdata &&ICPcode,
|
||||
MCPlusBuilder::BlocksVectorTy &&ICPcode,
|
||||
const std::vector<MCInst *> &MethodFetchInsns) const;
|
||||
|
||||
BinaryBasicBlock *fixCFG(BinaryContext &BC,
|
||||
@ -239,9 +243,7 @@ class IndirectCallPromotion : public BinaryFunctionPass {
|
||||
bool shouldPrint(const BinaryFunction &BF) const override {
|
||||
return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
|
||||
}
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
} // namespace bolt
|
||||
|
||||
@ -180,6 +180,9 @@ Inliner::InliningInfo Inliner::getInliningInfo(const BinaryFunction &BF) const {
|
||||
|
||||
// Perform necessary checks unless the option overrides it.
|
||||
if (!opts::mustConsider(BF)) {
|
||||
if (BF.hasSDTMarker())
|
||||
return INL_NONE;
|
||||
|
||||
if (BF.hasEHRanges())
|
||||
return INL_NONE;
|
||||
|
||||
@ -248,9 +251,8 @@ Inliner::InliningInfo Inliner::getInliningInfo(const BinaryFunction &BF) const {
|
||||
}
|
||||
|
||||
void
|
||||
Inliner::findInliningCandidates(BinaryContext &BC,
|
||||
const std::map<uint64_t, BinaryFunction> &BFs) {
|
||||
for (const auto &BFI : BFs) {
|
||||
Inliner::findInliningCandidates(BinaryContext &BC) {
|
||||
for (const auto &BFI : BC.getBinaryFunctions()) {
|
||||
const auto &Function = BFI.second;
|
||||
const auto InlInfo = getInliningInfo(Function);
|
||||
if (InlInfo.Type != INL_NONE)
|
||||
@ -532,16 +534,14 @@ bool Inliner::inlineCallsInFunction(BinaryFunction &Function) {
|
||||
return DidInlining;
|
||||
}
|
||||
|
||||
void Inliner::runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &) {
|
||||
void Inliner::runOnFunctions(BinaryContext &BC) {
|
||||
opts::syncOptions();
|
||||
|
||||
if (!opts::inliningEnabled())
|
||||
return;
|
||||
|
||||
uint64_t TotalSize = 0;
|
||||
for (auto &BFI : BFs)
|
||||
for (auto &BFI : BC.getBinaryFunctions())
|
||||
TotalSize += BFI.second.getSize();
|
||||
|
||||
bool InlinedOnce;
|
||||
@ -553,10 +553,10 @@ void Inliner::runOnFunctions(BinaryContext &BC,
|
||||
InlinedOnce = false;
|
||||
|
||||
InliningCandidates.clear();
|
||||
findInliningCandidates(BC, BFs);
|
||||
findInliningCandidates(BC);
|
||||
|
||||
std::vector<BinaryFunction *> ConsideredFunctions;
|
||||
for (auto &BFI : BFs) {
|
||||
for (auto &BFI : BC.getBinaryFunctions()) {
|
||||
auto &Function = BFI.second;
|
||||
if (!shouldOptimize(Function))
|
||||
continue;
|
||||
|
||||
@ -39,7 +39,7 @@ private:
|
||||
: Type(Type)
|
||||
{}
|
||||
};
|
||||
|
||||
|
||||
std::unordered_map<const BinaryFunction *, InliningInfo> InliningCandidates;
|
||||
|
||||
/// Count total amount of bytes inlined for all instances of Inliner.
|
||||
@ -58,7 +58,7 @@ private:
|
||||
|
||||
/// Size in bytes of a tail call instruction.
|
||||
static uint64_t SizeOfTailCallInst;
|
||||
|
||||
|
||||
/// Set of functions modified by inlining (used for printing).
|
||||
std::unordered_set<const BinaryFunction *> Modified;
|
||||
|
||||
@ -68,8 +68,7 @@ private:
|
||||
/// Return the size in bytes of a tail call instruction.
|
||||
uint64_t getSizeOfTailCallInst(const BinaryContext &BC);
|
||||
|
||||
void findInliningCandidates(BinaryContext &BC,
|
||||
const std::map<uint64_t, BinaryFunction> &BFs);
|
||||
void findInliningCandidates(BinaryContext &BC);
|
||||
|
||||
bool inlineCallsInFunction(BinaryFunction &Function);
|
||||
|
||||
@ -97,9 +96,7 @@ public:
|
||||
return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
|
||||
}
|
||||
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
} // namespace bolt
|
||||
|
||||
314
src/Passes/Instrumentation.cpp
Normal file
314
src/Passes/Instrumentation.cpp
Normal file
@ -0,0 +1,314 @@
|
||||
//===--- Passes/Instrumentation.cpp ---------------------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
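//
// This file implements the Instrumentation pass, which inserts
// counter-increment snippets at calls, branches and fall-throughs and emits
// the counters together with tables describing them.
//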
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "Instrumentation.h"
|
||||
#include "Passes/DataflowInfoManager.h"
|
||||
#include "llvm/Support/Options.h"
|
||||
|
||||
#define DEBUG_TYPE "bolt-instrumentation"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
namespace opts {
|
||||
extern cl::OptionCategory BoltCategory;
|
||||
|
||||
extern bool shouldProcess(const llvm::bolt::BinaryFunction &Function);
|
||||
|
||||
cl::opt<std::string> InstrumentationFilename(
|
||||
"instrumentation-file",
|
||||
cl::desc("file name where instrumented profile will be saved"),
|
||||
cl::init("/tmp/prof.fdata"),
|
||||
cl::Optional,
|
||||
cl::cat(BoltCategory));
|
||||
|
||||
cl::opt<bool> InstrumentHotOnly(
|
||||
"instrument-hot-only",
|
||||
cl::desc("only insert instrumentation on hot functions (need profile)"),
|
||||
cl::init(false),
|
||||
cl::Optional,
|
||||
cl::cat(BoltCategory));
|
||||
}
|
||||
|
||||
namespace llvm {
|
||||
namespace bolt {
|
||||
|
||||
/// Return the position of \p Function's name inside StringTable.
///
/// The first time a function is seen, its first name (getNames()[0]) is
/// appended to StringTable followed by a NUL byte, and the position where the
/// name starts is cached in FuncToStringIdx. Later calls return the cached
/// position without touching the table.
uint32_t Instrumentation::getFunctionNameIndex(const BinaryFunction &Function) {
  const auto Known = FuncToStringIdx.find(&Function);
  if (Known == FuncToStringIdx.end()) {
    // The new name starts where the table currently ends.
    const auto NameStart = StringTable.size();
    FuncToStringIdx.emplace(std::make_pair(&Function, NameStart));
    StringTable.append(Function.getNames()[0]);
    StringTable.append(1, '\0');
    return NameStart;
  }
  return Known->second;
}
|
||||
|
||||
/// Build the CounterDescription for an edge that starts at offset \p From of
/// \p FromFunction and ends at offset \p To of \p ToFunction.
///
/// Functions are referenced by the position of their name in the string
/// table, so this may append names to the table as a side effect.
Instrumentation::CounterDescription Instrumentation::createDescription(
    const BinaryFunction &FromFunction, uint32_t From,
    const BinaryFunction &ToFunction, uint32_t To) {
  // Look up the source name before the destination name: a lookup may append
  // to the string table, so the order determines the table layout.
  const uint32_t FromNameIdx = getFunctionNameIndex(FromFunction);
  const uint32_t ToNameIdx = getFunctionNameIndex(ToFunction);

  CounterDescription Desc;
  Desc.FromFuncStringIdx = FromNameIdx;
  Desc.FromOffset = From;
  Desc.ToFuncStringIdx = ToNameIdx;
  Desc.ToOffset = To;
  return Desc;
}
|
||||
|
||||
/// Register a new counter for the edge (\p FromFunction, \p FromOffset) ->
/// (\p ToFunc, \p ToOffset) and return the instructions that increment it.
///
/// One entry is appended to Descriptions and one to Labels per call, so both
/// containers grow in lockstep. The returned snippet always holds five
/// instructions: adjust the stack pointer, push flags, increment the counter
/// in memory, pop flags, undo the stack pointer adjustment.
std::vector<MCInst> Instrumentation::createInstrumentationSnippet(
    BinaryFunction &FromFunction, uint32_t FromOffset, BinaryFunction &ToFunc,
    uint32_t ToOffset) {
  Descriptions.emplace_back(
      createDescription(FromFunction, FromOffset, ToFunc, ToOffset));

  BinaryContext &BC = FromFunction.getBinaryContext();
  MCSymbol *CounterLabel = BC.Ctx->createTempSymbol("InstrEntry", true);
  Labels.emplace_back(CounterLabel);

  // Amount the stack pointer is moved by so that the flags push does not
  // clobber the application's red zone (ABI dependent).
  const int RedZoneBytes = 128;
  // Size argument handed to the flags push/pop builders.
  const int FlagsSize = 2;

  std::vector<MCInst> Snippet(5);
  auto Slot = Snippet.begin();
  BC.MIB->createStackPointerIncrement(*Slot++, RedZoneBytes,
                                      /*NoFlagsClobber=*/true);
  BC.MIB->createPushFlags(*Slot++, FlagsSize);
  BC.MIB->createIncMemory(*Slot++, CounterLabel, &*BC.Ctx);
  BC.MIB->createPopFlags(*Slot++, FlagsSize);
  BC.MIB->createStackPointerDecrement(*Slot++, RedZoneBytes,
                                      /*NoFlagsClobber=*/true);
  return Snippet;
}
|
||||
|
||||
/// Create a counter for the edge leaving \p FromBB at offset \p From and
/// decide where its increment snippet goes.
///
/// \param Iter      position in \p FromBB of the instruction being
///                  instrumented; advanced past any instructions inserted in
///                  front of it.
/// \param TargetBB  destination block inside the same function, or null when
///                  the destination is not a basic block of this function.
/// \returns false only when \p TargetBB is null and the instruction at
///          \p Iter is not a call; true in every other case.
///
/// Note that the counter (description + label) is registered before any of
/// the checks, i.e. also on the path that returns false.
bool Instrumentation::instrumentOneTarget(BinaryBasicBlock::iterator &Iter,
                                          BinaryFunction &FromFunction,
                                          BinaryBasicBlock &FromBB,
                                          uint32_t From, BinaryFunction &ToFunc,
                                          BinaryBasicBlock *TargetBB,
                                          uint32_t ToOffset) {
  std::vector<MCInst> Snippet =
      createInstrumentationSnippet(FromFunction, From, ToFunc, ToOffset);

  // Insert the whole snippet at Pos, stepping Pos past each instruction as it
  // is inserted.
  auto InsertAt = [&Snippet](BinaryBasicBlock &Block,
                             BinaryBasicBlock::iterator &Pos) {
    for (auto &NewInst : Snippet) {
      Pos = Block.insertInstruction(Pos, NewInst);
      ++Pos;
    }
  };

  // No target block: only calls are instrumented, right at the call site.
  if (!TargetBB) {
    BinaryContext &BC = FromFunction.getBinaryContext();
    if (!BC.MIB->isCall(*Iter))
      return false;
    InsertAt(FromBB, Iter);
    return true;
  }

  // Indirect branch, conditional branches or fall-throughs.
  const bool IsSelfEdge = &FromBB == TargetBB;

  // The target has this edge as its only way in: count at the start of the
  // target block.
  if (TargetBB->pred_size() == 1 && !IsSelfEdge && !TargetBB->isEntryPoint()) {
    BinaryBasicBlock::iterator TargetPos = TargetBB->begin();
    InsertAt(*TargetBB, TargetPos);
    return true;
  }

  // The source has this edge as its only way out: count in the source block.
  if (FromBB.succ_size() == 1 && !IsSelfEdge) {
    InsertAt(FromBB, Iter);
    return true;
  }

  // Critical edge: queue it so that the caller can split the edge and place
  // the snippet into the newly created block.
  SplitWorklist.emplace_back(std::make_pair(&FromBB, TargetBB));
  SplitInstrs.emplace_back(std::move(Snippet));
  return true;
}
|
||||
|
||||
/// Instrument every eligible function of \p BC.
///
/// Does nothing unless the target is x86. Otherwise it registers the
/// ".bolt.instr.counters" section and the ".bolt.instr.tables" note section
/// (both without contents at this point) and then, for each function that is
/// simple, accepted by opts::shouldProcess and (with -instrument-hot-only)
/// has a non-zero known execution count, adds counters for:
///   * calls and branches with a resolvable target,
///   * every successor of a block ending in a jump table,
///   * fall-through edges of blocks without an unconditional branch.
/// Critical edges collected by instrumentOneTarget() are split at the end of
/// each function and receive their snippet in the new block.
void Instrumentation::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return;

  const auto CounterSectionFlags =
      BinarySection::getFlags(/*IsReadOnly=*/false,
                              /*IsText=*/false,
                              /*IsAllocatable=*/true);
  BC.registerOrUpdateSection(".bolt.instr.counters", ELF::SHT_PROGBITS,
                             CounterSectionFlags, nullptr, 0, 1,
                             /*local=*/true);

  BC.registerOrUpdateNoteSection(".bolt.instr.tables", nullptr, 0,
                                 /*Alignment=*/1,
                                 /*IsReadOnly=*/true, ELF::SHT_NOTE);

  uint64_t NumSites{0ULL};
  // Nothing in this function updates this counter; it is only reported in the
  // summary line at the end.
  uint64_t NumSitesSavingFlags{0ULL};

  for (auto &Entry : BC.getBinaryFunctions()) {
    BinaryFunction &Function = Entry.second;
    const bool Eligible =
        Function.isSimple() && opts::shouldProcess(Function) &&
        (!opts::InstrumentHotOnly || Function.getKnownExecutionCount());
    if (!Eligible)
      continue;

    Function.disambiguateJumpTables();
    SplitWorklist.clear();
    SplitInstrs.clear();

    for (auto &BB : Function) {
      bool SawUnconditionalBranch{false};
      bool SawJumpTable{false};

      // Iterator-based on purpose: instrumentOneTarget() may insert
      // instructions into BB and moves I accordingly.
      for (auto I = BB.begin(); I != BB.end(); ++I) {
        const auto &Inst = *I;
        if (!BC.MIB->hasAnnotation(Inst, "Offset"))
          continue;

        const bool IsJumpTable = Function.getJumpTable(Inst);
        if (IsJumpTable) {
          SawJumpTable = true;
        } else if (BC.MIB->isUnconditionalBranch(Inst)) {
          SawUnconditionalBranch = true;
        } else if (!(BC.MIB->isCall(Inst) ||
                     BC.MIB->isConditionalBranch(Inst)) ||
                   BC.MIB->isUnsupportedBranch(Inst.getOpcode())) {
          // Neither a call nor a branch we know how to instrument.
          continue;
        }

        const uint32_t FromOffset =
            BC.MIB->getAnnotationAs<uint32_t>(Inst, "Offset");
        const MCSymbol *Target = BC.MIB->getTargetSymbol(Inst);
        BinaryBasicBlock *TargetBB = Function.getBasicBlockForLabel(Target);

        // A target block means an intra-function edge; otherwise try to
        // resolve the symbol to another function.
        uint32_t ToOffset = 0;
        BinaryFunction *TargetFunc = &Function;
        if (TargetBB)
          ToOffset = TargetBB->getInputOffset();
        else
          TargetFunc = BC.getFunctionForSymbol(Target);

        // TargetFunc should be null for indirect branches/calls.
        if (TargetFunc) {
          if (instrumentOneTarget(I, Function, BB, FromOffset, *TargetFunc,
                                  TargetBB, ToOffset))
            ++NumSites;
        } else if (IsJumpTable) {
          // One counter per successor of the jump table block.
          for (auto &Succ : BB.successors()) {
            if (instrumentOneTarget(I, Function, BB, FromOffset, Function,
                                    &*Succ, Succ->getInputOffset()))
              ++NumSites;
          }
        }

        // FIXME: handle indirect calls
      } // End of instructions loop

      // Instrument fallthroughs (when the direct jump instruction is missing)
      if (SawUnconditionalBranch || SawJumpTable || BB.succ_size() == 0 ||
          BB.size() == 0)
        continue;

      auto *FTBB = BB.getFallthrough();
      assert(FTBB && "expected valid fall-through basic block");

      // Walk back from the end of the block to the last non-pseudo
      // instruction (stopping at the first instruction at the latest).
      auto First = BB.begin();
      auto Last = BB.end();
      --Last;
      while (Last != First && BC.MIB->isPseudo(*Last))
        --Last;

      // The last instruction in the BB should have an annotation, except
      // if it was branching to the end of the function as a result of
      // __builtin_unreachable(), in which case it was deleted by fixBranches.
      // Ignore this case. FIXME: force fixBranches() to preserve the offset.
      if (!BC.MIB->hasAnnotation(*Last, "Offset"))
        continue;

      const uint32_t FromOffset =
          BC.MIB->getAnnotationAs<uint32_t>(*Last, "Offset");
      // The insertion position handed over is the start of the block, while
      // the offset recorded for the edge is that of the last instruction.
      if (instrumentOneTarget(First, Function, BB, FromOffset, Function, FTBB,
                              FTBB->getInputOffset()))
        ++NumSites;
    } // End of BBs loop

    // Consume list of critical edges: split them and add instrumentation to
    // the newly created BBs. SplitInstrs holds one snippet per worklist entry,
    // in the same order.
    auto SnippetIt = SplitInstrs.begin();
    for (auto &Edge : SplitWorklist) {
      auto *EdgeBB = Function.splitEdge(Edge.first, Edge.second);
      EdgeBB->addInstructions(SnippetIt->begin(), SnippetIt->end());
      ++SnippetIt;
    }
  }

  outs() << "BOLT-INSTRUMENTER: Instrumented " << NumSites
         << " sites, " << NumSitesSavingFlags << " saving flags.\n";
}
|
||||
|
||||
/// Serialize the counter descriptions and the string table into an ELF note
/// and store it as the contents of the ".bolt.instr.tables" section.
///
/// Payload layout: for each entry of Descriptions, four 4-byte fields
/// (FromFuncStringIdx, FromOffset, ToFuncStringIdx, ToOffset) copied from
/// memory as-is, immediately followed by the whole StringTable.
void Instrumentation::emitTablesAsELFNote(BinaryContext &BC) {
  std::string Payload;
  raw_string_ostream OS(Payload);

  // Copy the first four bytes of a description field into the stream.
  auto WriteField = [&OS](const void *Field) {
    OS.write(static_cast<const char *>(Field), 4);
  };

  // Start of the vector with descriptions (one CounterDescription for each
  // counter).
  for (const auto &Desc : Descriptions) {
    WriteField(&Desc.FromFuncStringIdx);
    WriteField(&Desc.FromOffset);
    WriteField(&Desc.ToFuncStringIdx);
    WriteField(&Desc.ToOffset);
  }

  // Our string table lives immediately after descriptions vector
  OS << StringTable;
  OS.flush();

  const auto Note = BinarySection::encodeELFNote(
      "BOLT", Payload, BinarySection::NT_BOLT_INSTRUMENTATION_TABLES);
  BC.registerOrUpdateNoteSection(".bolt.instr.tables", copyByteArray(Note),
                                 Note.size(),
                                 /*Alignment=*/1,
                                 /*IsReadOnly=*/true, ELF::SHT_NOTE);
}
|
||||
|
||||
/// Emit the instrumentation data through \p Streamer and print a summary.
///
/// First the description/string tables are stored as an ELF note (see
/// emitTablesAsELFNote). Then, in section ".bolt.instr.counters", the
/// following is emitted in this order:
///   * __bolt_instr_locations: one zero-filled 8-byte counter per entry of
///     Labels, each preceded by its own label;
///   * __bolt_instr_num_locs: the number of counters as a 4-byte integer;
///   * __bolt_instr_filename: the NUL-terminated output file name taken from
///     -instrumentation-file.
/// All three named symbols are marked global so that code outside this
/// object (the instrumentation runtime library) can refer to them.
void Instrumentation::emit(BinaryContext &BC, MCStreamer &Streamer) {
  emitTablesAsELFNote(BC);

  const auto Flags = BinarySection::getFlags(/*IsReadOnly=*/false,
                                             /*IsText=*/false,
                                             /*IsAllocatable=*/true);
  auto *Section = BC.Ctx->getELFSection(".bolt.instr.counters",
                                        ELF::SHT_PROGBITS,
                                        Flags);

  // All of the following symbols will be exported as globals to be used by the
  // instrumentation runtime library to dump the instrumentation data to disk.
  // Label marking start of the memory region containing instrumentation
  // counters, total vector size is Labels.size() 8-byte counters
  MCSymbol *Locs = BC.Ctx->getOrCreateSymbol("__bolt_instr_locations");
  MCSymbol *NumLocs = BC.Ctx->getOrCreateSymbol("__bolt_instr_num_locs");
  /// File name where profile is going to be written to after the target
  /// binary finishes a run
  MCSymbol *FilenameSym = BC.Ctx->getOrCreateSymbol("__bolt_instr_filename");

  // Define Sym at the current position and export it.
  auto EmitGlobalLabel = [&Streamer](MCSymbol *Sym) {
    Streamer.EmitLabel(Sym);
    Streamer.EmitSymbolAttribute(Sym, MCSymbolAttr::MCSA_Global);
  };

  Streamer.SwitchSection(Section);

  EmitGlobalLabel(Locs);
  for (const auto &Label : Labels) {
    Streamer.EmitLabel(Label);
    Streamer.emitFill(8, 0);
  }

  EmitGlobalLabel(NumLocs);
  Streamer.EmitIntValue(Labels.size(), /*Size=*/4);

  // Previously this label was the only one of the three left non-global,
  // contradicting the export intent stated above.
  EmitGlobalLabel(FilenameSym);
  Streamer.EmitBytes(opts::InstrumentationFilename);
  Streamer.emitFill(1, 0);

  outs() << "BOLT-INSTRUMENTER: Total size of counters: "
         << (Labels.size() * 8) << " bytes (static alloc memory)\n";
  outs() << "BOLT-INSTRUMENTER: Total size of string table emitted: "
         << StringTable.size() << " bytes in file\n";
  outs() << "BOLT-INSTRUMENTER: Total size of descriptors: "
         << (Labels.size() * 16) << " bytes in file\n";
  outs() << "BOLT-INSTRUMENTER: Profile will be saved to file "
         << opts::InstrumentationFilename << "\n";
}
|
||||
|
||||
}
|
||||
}
|
||||
128
src/Passes/Instrumentation.h
Normal file
128
src/Passes/Instrumentation.h
Normal file
@ -0,0 +1,128 @@
|
||||
//===--- Passes/Instrumentation.h -----------------------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_INSTRUMENTATION_H
|
||||
#define LLVM_TOOLS_LLVM_BOLT_PASSES_INSTRUMENTATION_H
|
||||
|
||||
#include "BinaryPasses.h"
|
||||
#include "llvm/MC/MCSection.h"
|
||||
#include "llvm/MC/MCStreamer.h"
|
||||
#include "llvm/MC/MCSymbol.h"
|
||||
|
||||
namespace llvm {
|
||||
namespace bolt {
|
||||
|
||||
/// This is an instrumentation pass that modifies the input binary to generate
|
||||
/// a profile after execution finishes. It modifies branches to increment
|
||||
/// counters stored in the process memory and inserts a new function that
|
||||
/// dumps this data to an fdata file.
|
||||
///
|
||||
/// The runtime for instrumentation has a string table that holds function
|
||||
/// names. It also must include two data structures: the counter values being
|
||||
/// incremented after each instrumented branch and a description of these
|
||||
/// counters to be written in a file during dump. The description references
|
||||
/// string indices in the string table for function names, as well as function
|
||||
/// offsets locating branch source and destination. The counter values will be
|
||||
/// converted to decimal form when writing the dumped fdata.
|
||||
///
|
||||
/// OPPORTUNITIES ON PERFORMANCE
|
||||
/// This instrumentation is experimental and currently uses a naive approach
|
||||
/// where every branch is instrumented. This is not ideal for runtime
|
||||
/// performance, but should be good enough for us to evaluate/debug LBR profile
|
||||
/// quality against instrumentation. Hopefully we can make this more efficient
|
||||
/// in the future, but most optimizations here can cost a lot in BOLT processing
|
||||
/// time. Keep in mind the instrumentation pass runs on every single BB of the
|
||||
/// entire input binary, thus it is very expensive to do analyses, such as FLAGS
|
||||
/// liveness to avoid spilling flags on every branch, if the binary is large.
|
||||
///
|
||||
/// MISSING: instrumentation of indirect calls
|
||||
class Instrumentation {
|
||||
public:
|
||||
Instrumentation() {}
|
||||
|
||||
/// Modifies all functions by inserting instrumentation code (first step)
|
||||
void runOnFunctions(BinaryContext &BC);
|
||||
|
||||
/// Emit data structures that will be necessary during runtime (second step)
|
||||
void emit(BinaryContext &BC, MCStreamer &Streamer);
|
||||
|
||||
private:
|
||||
// Instrumented branch location information
|
||||
struct CounterDescription {
|
||||
uint32_t FromFuncStringIdx;
|
||||
uint32_t FromOffset;
|
||||
uint32_t ToFuncStringIdx;
|
||||
uint32_t ToOffset;
|
||||
};
|
||||
|
||||
/// Retrieve the string table index for the name of \p Function. We encode
|
||||
/// instrumented locations descriptions with the aid of a string table to
|
||||
/// manage memory of the instrumentation runtime in a more efficient way.
|
||||
/// If this function name is not represented in the string table yet, it will
|
||||
/// be inserted and its index returned.
|
||||
uint32_t getFunctionNameIndex(const BinaryFunction &Function);
|
||||
|
||||
/// Populate all information needed to identify an instrumented location:
|
||||
/// branch source location in terms of function name plus offset, as well as
|
||||
/// branch destination (also name + offset). This will be encoded in the
|
||||
/// binary as static data and function name strings will reference a strtab.
|
||||
CounterDescription createDescription(const BinaryFunction &FromFunction,
|
||||
uint32_t From,
|
||||
const BinaryFunction &ToFunction,
|
||||
uint32_t To);
|
||||
|
||||
|
||||
/// Create the sequence of instructions to instrument a branch happening
|
||||
/// at \p FromFunction + \p FromOffset to \p ToFunc + \p ToOffset
|
||||
std::vector<MCInst> createInstrumentationSnippet(BinaryFunction &FromFunction,
|
||||
uint32_t FromOffset,
|
||||
BinaryFunction &ToFunc,
|
||||
uint32_t ToOffset);
|
||||
|
||||
/// Instrument the branch in \p Iter located at \p FromFunction + \p From,
|
||||
/// basic block \p FromBB. The destination of the branch is \p ToFunc +
|
||||
/// \p ToOffset. \p TargetBB should be non-null if this is a local branch
|
||||
/// and null if it is a call. Return true on success.
|
||||
bool instrumentOneTarget(BinaryBasicBlock::iterator &Iter,
|
||||
BinaryFunction &FromFunction,
|
||||
BinaryBasicBlock &FromBB, uint32_t From,
|
||||
BinaryFunction &ToFunc, BinaryBasicBlock *TargetBB,
|
||||
uint32_t ToOffset);
|
||||
|
||||
/// Create a non-allocatable ELF section with read-only tables necessary for
|
||||
/// writing the instrumented data profile during program finish. The runtime
|
||||
/// library needs to open the program executable file and read this data from
|
||||
/// disk, this is not loaded by the system.
|
||||
void emitTablesAsELFNote(BinaryContext &BC);
|
||||
|
||||
/// Critical edges worklist
|
||||
/// This worklist keeps track of CFG edges <From-To> that needs to be split.
|
||||
/// This task is deferred until we finish processing all BBs because we can't
|
||||
/// modify the CFG while iterating over it. For each edge, \p SplitInstrs
|
||||
/// stores the list of instrumentation instructions as a vector of MCInsts.
|
||||
/// instrumentOneTarget() populates this, runOnFunctions() consumes.
|
||||
std::vector<std::pair<BinaryBasicBlock *, BinaryBasicBlock *>> SplitWorklist;
|
||||
std::vector<std::vector<MCInst>> SplitInstrs;
|
||||
|
||||
/// Stores function names, to be emitted to the runtime
|
||||
std::string StringTable;
|
||||
|
||||
/// strtab indices in StringTable for each function name
|
||||
std::unordered_map<const BinaryFunction *, uint32_t> FuncToStringIdx;
|
||||
std::vector<CounterDescription> Descriptions;
|
||||
|
||||
/// Identify all counters used in runtime while instrumentation is running
|
||||
std::vector<MCSymbol *> Labels;
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
@ -243,21 +243,17 @@ void JTFootprintReduction::optimizeFunction(BinaryContext &BC,
|
||||
}
|
||||
}
|
||||
|
||||
void JTFootprintReduction::runOnFunctions(
|
||||
BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions
|
||||
) {
|
||||
void JTFootprintReduction::runOnFunctions(BinaryContext &BC) {
|
||||
if (opts::JumpTables == JTS_BASIC && BC.HasRelocations)
|
||||
return;
|
||||
|
||||
std::unique_ptr<RegAnalysis> RA;
|
||||
std::unique_ptr<BinaryFunctionCallGraph> CG;
|
||||
if (!opts::JTFootprintOnlyPIC) {
|
||||
CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC, BFs)));
|
||||
RA.reset(new RegAnalysis(BC, &BFs, &*CG));
|
||||
CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC)));
|
||||
RA.reset(new RegAnalysis(BC, &BC.getBinaryFunctions(), &*CG));
|
||||
}
|
||||
for (auto &BFIt : BFs) {
|
||||
for (auto &BFIt : BC.getBinaryFunctions()) {
|
||||
auto &Function = BFIt.second;
|
||||
|
||||
if (!Function.isSimple() || !opts::shouldProcess(Function))
|
||||
|
||||
@ -75,9 +75,7 @@ public:
|
||||
bool shouldPrint(const BinaryFunction &BF) const override {
|
||||
return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
|
||||
}
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
} // namespace bolt
|
||||
|
||||
@ -8,11 +8,21 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This class implements a pass that inserts LFENCE instructions before each
|
||||
// conditional branch to protect against Spectre Variant 1.
|
||||
// The performance impact of this is significant!
|
||||
// conditional branch to protect against Spectre Variant 1, as well as the
|
||||
// various LVI mitigations.
|
||||
//
|
||||
// The runtime performance impact of this is significant!
|
||||
//
|
||||
// NOTE: This pass is incompatible with RetpolineInsertion. It is also
|
||||
// incompatible with ABIs that allow red-zones, due to the
|
||||
// flags-preserving jmp mitigation clobbering 8 bytes in the red-zone.
|
||||
// Options are to disable red-zone when compiling the target binary,
|
||||
// or configure the compilers to never generate memory-indirect jmps.
|
||||
//===----------------------------------------------------------------------===//
|
||||
#include "LFenceInsertion.h"
|
||||
#include "RewriteInstance.h"
|
||||
#include "RetpolineInsertion.h" //IndirectBranchInfo
|
||||
#include "ParallelUtilities.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
|
||||
#define DEBUG_TYPE "bolt-lfence"
|
||||
@ -20,6 +30,7 @@
|
||||
using namespace llvm;
|
||||
using namespace bolt;
|
||||
namespace opts {
|
||||
extern bool shouldProcess(const bolt::BinaryFunction &Function);
|
||||
|
||||
extern cl::OptionCategory BoltCategory;
|
||||
|
||||
@ -30,14 +41,53 @@ InsertLFences("insert-lfences",
|
||||
cl::ZeroOrMore,
|
||||
cl::cat(BoltCategory));
|
||||
|
||||
llvm::cl::opt<bool>
|
||||
LFenceConditionalBranches("lfence-conditional-branches",
|
||||
cl::desc("determine if all conditional branches should be lfence mitigated"),
|
||||
cl::init(true),
|
||||
cl::ZeroOrMore,
|
||||
cl::Hidden,
|
||||
cl::cat(BoltCategory));
|
||||
llvm::cl::opt<bool>
|
||||
LFenceLoads("lfence-loads",
|
||||
cl::desc("determine if all loads should be lfence mitigated"),
|
||||
cl::init(true),
|
||||
cl::ZeroOrMore,
|
||||
cl::Hidden,
|
||||
cl::cat(BoltCategory));
|
||||
llvm::cl::opt<bool>
|
||||
LFenceReturns("lfence-returns",
|
||||
cl::desc("determine if all returns should be lfence mitigated"),
|
||||
cl::init(true),
|
||||
cl::ZeroOrMore,
|
||||
cl::Hidden,
|
||||
cl::cat(BoltCategory));
|
||||
llvm::cl::opt<bool>
|
||||
LFenceIndirectCalls("lfence-indirect-calls",
|
||||
cl::desc("determine if all indirect calls should be lfence mitigated"),
|
||||
cl::init(true),
|
||||
cl::ZeroOrMore,
|
||||
cl::Hidden,
|
||||
cl::cat(BoltCategory));
|
||||
llvm::cl::opt<bool>
|
||||
LFenceIndirectJumps("lfence-indirect-jumps",
|
||||
cl::desc("determine if all indirect jumps should be lfence mitigated"),
|
||||
cl::init(true),
|
||||
cl::ZeroOrMore,
|
||||
cl::Hidden,
|
||||
cl::cat(BoltCategory));
|
||||
|
||||
} // namespace opts
|
||||
|
||||
namespace llvm {
|
||||
namespace bolt {
|
||||
|
||||
void LFenceInsertion::runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) {
|
||||
static void report_redzone_error() {
|
||||
errs() << "BOLT-ERROR: 'Redzone access in function with indirect jmp mitigation'\n";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
void LFenceInsertion::runOnFunctions(BinaryContext &BC) {
|
||||
|
||||
if (!opts::InsertLFences)
|
||||
return;
|
||||
@ -49,25 +99,234 @@ void LFenceInsertion::runOnFunctions(BinaryContext &BC,
|
||||
|
||||
auto &MIB = *BC.MIB;
|
||||
uint32_t LFencedBranches = 0;
|
||||
for (auto &It : BFs) {
|
||||
uint32_t LFencedLoads = 0;
|
||||
uint32_t LFencedRets = 0;
|
||||
uint32_t LFencedIndirectCalls = 0;
|
||||
uint32_t LFencedIndirectJmps = 0;
|
||||
for (auto &It : BC.getBinaryFunctions()) {
|
||||
auto &Function = It.second;
|
||||
bool MemIndirectJmp = false;
|
||||
bool Redzone = false;
|
||||
|
||||
// For performance reasons, we may want to skip some functions and
|
||||
// manually add lfences to them only where absolutely needed.
|
||||
if (!opts::shouldProcess(Function))
|
||||
continue;
|
||||
|
||||
for (auto &BB : Function) {
|
||||
bool LastWasLFence = false;
|
||||
for (auto It = BB.begin(); It != BB.end(); ++It) {
|
||||
auto &Inst = *It;
|
||||
|
||||
if (!MIB.isConditionalBranch(Inst))
|
||||
continue;
|
||||
if (MIB.isActualLoad(Inst) && MIB.isBranchOnMem(Inst)) {
|
||||
IndirectBranchInfo BrInfo(Inst, MIB);
|
||||
const auto &MemRef = BrInfo.Memory;
|
||||
|
||||
MCInst LFence;
|
||||
MIB.createLfence(LFence);
|
||||
It = BB.insertInstruction(It, std::move(LFence));
|
||||
++It;
|
||||
LFencedBranches++;
|
||||
if (MemRef.BaseRegNum == MIB.getStackPointer() &&
|
||||
MemRef.DispValue < 0) {
|
||||
if (MemIndirectJmp) {
|
||||
report_redzone_error();
|
||||
}
|
||||
Redzone = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (opts::LFenceConditionalBranches &&
|
||||
MIB.isConditionalBranch(Inst)) {
|
||||
// Inserts an lfence before every conditional branch.
|
||||
// For example:
|
||||
// cmp %reg1, %reg2
|
||||
// je <jump_dest>
|
||||
// gets rewritten to:
|
||||
// cmp %reg1, %reg2
|
||||
// lfence
|
||||
// je <jump_dest>
|
||||
if (!LastWasLFence) {
|
||||
MCInst LFence;
|
||||
MIB.createLfence(LFence);
|
||||
It = BB.insertInstruction(It, std::move(LFence));
|
||||
++It;
|
||||
}
|
||||
LFencedBranches++;
|
||||
LastWasLFence = false;
|
||||
} else if (opts::LFenceLoads &&
|
||||
MIB.isActualLoad(Inst) &&
|
||||
!MIB.isReturn(Inst) &&
|
||||
!MIB.isIndirectBranch(Inst) &&
|
||||
!MIB.isIndirectCall(Inst)) {
|
||||
// Inserts an lfence after every load from memory.
|
||||
// For example:
|
||||
// mov 0x8(%rbx), %rdi
|
||||
// Gets rewritten to:
|
||||
// mov 0x8(%rbx), %rdi
|
||||
// lfence
|
||||
++It;
|
||||
MCInst LFence;
|
||||
MIB.createLfence(LFence);
|
||||
It = BB.insertInstruction(It, std::move(LFence));
|
||||
LFencedLoads++;
|
||||
LastWasLFence = true;
|
||||
} else if (opts::LFenceReturns &&
|
||||
MIB.isReturn(Inst) && !MIB.isIndirectBranch(Inst)) {
|
||||
// Inserts a dummy write + lfence before every ret.
|
||||
// For example:
|
||||
// retq
|
||||
// gets rewritten to:
|
||||
// shlq $0, (%rsp)
|
||||
// lfence
|
||||
// retq
|
||||
MCInst Shlq;
|
||||
MIB.createShl(Shlq, MIB.getStackPointer(), 1, MIB.getNoRegister(), 0, nullptr,
|
||||
MIB.getNoRegister(), 0, 8);
|
||||
It = BB.insertInstruction(It, std::move(Shlq));
|
||||
++It;
|
||||
MCInst LFence;
|
||||
MIB.createLfence(LFence);
|
||||
It = BB.insertInstruction(It, std::move(LFence));
|
||||
++It;
|
||||
LFencedRets++;
|
||||
LastWasLFence = false;
|
||||
} else if (opts::LFenceIndirectCalls &&
|
||||
MIB.isIndirectCall(Inst) && MIB.isLoad(Inst) && !MIB.isIndirectBranch(Inst)) {
|
||||
// Translates indirect calls into lea/mov/jmp then applies the jmp mitigation.
|
||||
// For example:
|
||||
// callq *(%rsi)
|
||||
// gets rewritten to:
|
||||
// pushq %rdi //Dummy to overwrite later
|
||||
// pushq %rdi
|
||||
// leaq 0x18(%rip), %rdi //After the retq
|
||||
// movq %rdi, 8(%rsp) //Overwrite the dummy
|
||||
// popq %rdi
|
||||
// lfence
|
||||
// pushq (%rsi)
|
||||
// lfence //XXX Not needed, according to Intel?
|
||||
// shlq $0, (%rsp)
|
||||
// lfence
|
||||
// retq
|
||||
IndirectBranchInfo BrInfo(Inst, MIB);
|
||||
const auto &MemRef = BrInfo.Memory;
|
||||
auto *Ctx = BC.Ctx.get();
|
||||
assert(BrInfo.isMem());
|
||||
|
||||
// Create a separate MCCodeEmitter to allow lock-free execution
|
||||
BinaryContext::IndependentCodeEmitter Emitter;
|
||||
if (!opts::NoThreads) {
|
||||
Emitter = BC.createIndependentMCCodeEmitter();
|
||||
}
|
||||
|
||||
int offset = 0x15 + BC.computeCodeSize(It, std::next(It), Emitter.MCE.get());
|
||||
|
||||
MCPhysReg ScratchReg = MIB.getIntArgRegister(0);
|
||||
MCInst Pushq1; //Dummy, to overwrite later.
|
||||
MIB.createPushRegister(Pushq1, ScratchReg, 8);
|
||||
It = BB.insertInstruction(It, std::move(Pushq1));
|
||||
++It;
|
||||
MCInst Pushq2;
|
||||
MIB.createPushRegister(Pushq2, ScratchReg, 8);
|
||||
It = BB.insertInstruction(It, std::move(Pushq2));
|
||||
++It;
|
||||
MCInst Leaq;
|
||||
MIB.createLea(Leaq, MIB.getInstructionPointer(), 1, MIB.getNoRegister(),
|
||||
offset, nullptr, MIB.getNoRegister(), ScratchReg, 8);
|
||||
It = BB.insertInstruction(It, std::move(Leaq));
|
||||
++It;
|
||||
MCInst Movq;
|
||||
MIB.createSaveToStack(Movq, MIB.getStackPointer(), 8, ScratchReg, 8);
|
||||
It = BB.insertInstruction(It, std::move(Movq));
|
||||
++It;
|
||||
MCInst Popq;
|
||||
MIB.createPopRegister(Popq, ScratchReg, 8);
|
||||
It = BB.insertInstruction(It, std::move(Popq));
|
||||
++It;
|
||||
MCInst LFence1;
|
||||
MIB.createLfence(LFence1);
|
||||
It = BB.insertInstruction(It, std::move(LFence1));
|
||||
++It;
|
||||
MCInst Pushq3;
|
||||
MIB.createPushRegisterIndirect(Pushq3, MemRef.BaseRegNum, MemRef.ScaleValue,
|
||||
MemRef.IndexRegNum, MemRef.DispValue, MemRef.DispExpr,
|
||||
MemRef.SegRegNum, 8);
|
||||
It = BB.insertInstruction(It, std::move(Pushq3));
|
||||
++It;
|
||||
MCInst LFence2;
|
||||
MIB.createLfence(LFence2);
|
||||
It = BB.insertInstruction(It, std::move(LFence2));
|
||||
++It;
|
||||
MCInst Shlq;
|
||||
MIB.createShl(Shlq, MIB.getStackPointer(), 1, MIB.getNoRegister(), 0, nullptr,
|
||||
MIB.getNoRegister(), 0, 8);
|
||||
It = BB.insertInstruction(It, std::move(Shlq));
|
||||
++It;
|
||||
MCInst LFence3;
|
||||
MIB.createLfence(LFence3);
|
||||
It = BB.insertInstruction(It, std::move(LFence3));
|
||||
++It;
|
||||
MCInst Retq;
|
||||
MIB.createReturn(Retq);
|
||||
BB.replaceInstruction(It, std::vector<MCInst>({Retq}));
|
||||
LFencedIndirectCalls++;
|
||||
LastWasLFence = false;
|
||||
} else if (opts::LFenceIndirectJumps &&
|
||||
MIB.isIndirectBranch(Inst) && MIB.isLoad(Inst)) {
|
||||
// Maps indirect jumps to "push; ret", then applies ret mitigation.
|
||||
// For example:
|
||||
// jmpq *(%rsi)
|
||||
// gets rewritten to:
|
||||
// pushq (%rsi)
|
||||
// lfence //XXX Not needed, according to Intel?
|
||||
// shlq $0, (%rsp)
|
||||
// lfence
|
||||
// retq
|
||||
|
||||
// Since this mitigation clobbers the redzone, we need to make
|
||||
// sure that this function never uses it.
|
||||
if (Redzone) {
|
||||
report_redzone_error();
|
||||
}
|
||||
MemIndirectJmp = true;
|
||||
|
||||
IndirectBranchInfo BrInfo(Inst, MIB);
|
||||
const auto &MemRef = BrInfo.Memory;
|
||||
|
||||
MCInst Push;
|
||||
MIB.createPushRegisterIndirect(Push, MemRef.BaseRegNum, MemRef.ScaleValue,
|
||||
MemRef.IndexRegNum, MemRef.DispValue, MemRef.DispExpr,
|
||||
MemRef.SegRegNum, 8);
|
||||
It = BB.insertInstruction(It, std::move(Push));
|
||||
++It;
|
||||
MCInst LFence1;
|
||||
MIB.createLfence(LFence1);
|
||||
It = BB.insertInstruction(It, std::move(LFence1));
|
||||
++It;
|
||||
MCInst Shlq;
|
||||
MIB.createShl(Shlq, MIB.getStackPointer(), 1, MIB.getNoRegister(), 0, nullptr,
|
||||
MIB.getNoRegister(), 0, 8);
|
||||
It = BB.insertInstruction(It, std::move(Shlq));
|
||||
++It;
|
||||
MCInst LFence2;
|
||||
MIB.createLfence(LFence2);
|
||||
It = BB.insertInstruction(It, std::move(LFence2));
|
||||
++It;
|
||||
MCInst Retq;
|
||||
MIB.createReturn(Retq);
|
||||
BB.replaceInstruction(It, std::vector<MCInst>({Retq}));
|
||||
LFencedIndirectJmps++;
|
||||
LastWasLFence = false;
|
||||
} else if (MIB.isLfence(Inst)) {
|
||||
LastWasLFence = true;
|
||||
} else {
|
||||
LastWasLFence = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
outs() << "\nBOLT-INFO: The number of lfenced branches is : " << LFencedBranches
|
||||
<< "\n";
|
||||
|
||||
outs() << "\nBOLT-INFO: The number of lfenced branches is : " << LFencedBranches;
|
||||
outs() << "\nBOLT-INFO: The number of lfenced loads is : " << LFencedLoads;
|
||||
outs() << "\nBOLT-INFO: The number of lfenced rets is : " << LFencedRets;
|
||||
outs() << "\nBOLT-INFO: The number of lfenced indirect calls is : " << LFencedIndirectCalls;
|
||||
outs() << "\nBOLT-INFO: The number of lfenced indirect jmps is : " << LFencedIndirectJmps
|
||||
<< "\n\n";
|
||||
}
|
||||
|
||||
} // namespace bolt
|
||||
|
||||
@ -28,9 +28,7 @@ public:
|
||||
|
||||
const char *getName() const override { return "lfence-insertion"; }
|
||||
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
} // namespace bolt
|
||||
} // namespace llvm
|
||||
|
||||
@ -38,8 +38,8 @@ class LivenessAnalysis
|
||||
|
||||
public:
|
||||
LivenessAnalysis(const RegAnalysis &RA, const BinaryContext &BC,
|
||||
BinaryFunction &BF)
|
||||
: Parent(BC, BF), RA(RA), NumRegs(BC.MRI->getNumRegs()) {}
|
||||
BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId)
|
||||
: Parent(BC, BF, AllocId), RA(RA), NumRegs(BC.MRI->getNumRegs()) {}
|
||||
virtual ~LivenessAnalysis();
|
||||
|
||||
bool isAlive(ProgramPoint PP, MCPhysReg Reg) const {
|
||||
@ -50,8 +50,6 @@ public:
|
||||
}
|
||||
|
||||
void run() {
|
||||
NamedRegionTimer T1("LA", "Liveness Analysis", "Dataflow", "Dataflow",
|
||||
opts::TimeOpts);
|
||||
Parent::run();
|
||||
}
|
||||
|
||||
|
||||
@ -84,7 +84,7 @@ LongJmpPass::createNewStub(BinaryBasicBlock &SourceBB, const MCSymbol *TgtSym,
|
||||
MCInst Inst;
|
||||
BC.MIB->createUncondBranch(Inst, TgtSym, BC.Ctx.get());
|
||||
if (TgtIsFunc)
|
||||
BC.MIB->convertJmpToTailCall(Inst, BC.Ctx.get());
|
||||
BC.MIB->convertJmpToTailCall(Inst);
|
||||
StubBB->addInstruction(Inst);
|
||||
StubBB->setExecutionCount(0);
|
||||
|
||||
@ -427,9 +427,9 @@ uint64_t LongJmpPass::getSymbolAddress(const BinaryContext &BC,
|
||||
if (Iter == HotAddresses.end()) {
|
||||
// Look at BinaryContext's resolution for this symbol - this is a symbol not
|
||||
// mapped to a BinaryFunction
|
||||
auto *BD = BC.getBinaryDataByName(Target->getName());
|
||||
assert(BD && "Unrecognized symbol");
|
||||
return BD ? BD->getAddress() : 0;
|
||||
auto ValueOrError = BC.getSymbolValue(*Target);
|
||||
assert(ValueOrError && "Unrecognized symbol");
|
||||
return *ValueOrError;
|
||||
}
|
||||
return Iter->second;
|
||||
}
|
||||
@ -595,11 +595,9 @@ bool LongJmpPass::relax(BinaryFunction &Func) {
|
||||
return Modified;
|
||||
}
|
||||
|
||||
void LongJmpPass::runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) {
|
||||
void LongJmpPass::runOnFunctions(BinaryContext &BC) {
|
||||
outs() << "BOLT-INFO: Starting stub-insertion pass\n";
|
||||
auto Sorted = BinaryContext::getSortedFunctions(BFs);
|
||||
auto Sorted = BC.getSortedFunctions();
|
||||
bool Modified;
|
||||
uint32_t Iterations{0};
|
||||
do {
|
||||
|
||||
@ -150,9 +150,7 @@ public:
|
||||
|
||||
const char *getName() const override { return "long-jmp"; }
|
||||
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@ -2460,12 +2460,10 @@ void solveMCF(BinaryFunction &BF, MCFCostFunction CostFunction) {
|
||||
}
|
||||
};
|
||||
|
||||
size_t CurEdgeNum{0};
|
||||
auto Next = std::next(BBI);
|
||||
for (auto Succ : BB.successors()) {
|
||||
int IsFT = (Next != E && Succ == *Next) ? 1 : 0;
|
||||
AddSuccArc(Succ, BI->Count, IsFT);
|
||||
++CurEdgeNum;
|
||||
++BI;
|
||||
}
|
||||
|
||||
|
||||
@ -43,15 +43,12 @@ PLT("plt",
|
||||
namespace llvm {
|
||||
namespace bolt {
|
||||
|
||||
void PLTCall::runOnFunctions(
|
||||
BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &) {
|
||||
void PLTCall::runOnFunctions(BinaryContext &BC) {
|
||||
if (opts::PLT == OT_NONE)
|
||||
return;
|
||||
|
||||
uint64_t NumCallsOptimized = 0;
|
||||
for (auto &It : BFs) {
|
||||
for (auto &It : BC.getBinaryFunctions()) {
|
||||
auto &Function = It.second;
|
||||
if (!shouldOptimize(Function))
|
||||
continue;
|
||||
|
||||
@ -38,9 +38,7 @@ public:
|
||||
bool shouldPrint(const BinaryFunction &BF) const override {
|
||||
return BinaryFunctionPass::shouldPrint(BF);
|
||||
}
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
} // namespace bolt
|
||||
|
||||
@ -36,9 +36,10 @@ class ReachingDefOrUse
|
||||
|
||||
public:
|
||||
ReachingDefOrUse(const RegAnalysis &RA, const BinaryContext &BC,
|
||||
BinaryFunction &BF, Optional<MCPhysReg> TrackingReg = None)
|
||||
: InstrsDataflowAnalysis<ReachingDefOrUse<Def>, !Def>(BC, BF), RA(RA),
|
||||
TrackingReg(TrackingReg) {}
|
||||
BinaryFunction &BF, Optional<MCPhysReg> TrackingReg = None,
|
||||
MCPlusBuilder::AllocatorIdTy AllocId = 0)
|
||||
: InstrsDataflowAnalysis<ReachingDefOrUse<Def>, !Def>(BC, BF, AllocId),
|
||||
RA(RA), TrackingReg(TrackingReg) {}
|
||||
virtual ~ReachingDefOrUse() {}
|
||||
|
||||
bool isReachedBy(MCPhysReg Reg, ExprIterator Candidates) {
|
||||
@ -60,8 +61,6 @@ public:
|
||||
}
|
||||
|
||||
void run() {
|
||||
NamedRegionTimer T1("RD", "Reaching Defs", "Dataflow", "Dataflow",
|
||||
opts::TimeOpts);
|
||||
InstrsDataflowAnalysis<ReachingDefOrUse<Def>, !Def>::run();
|
||||
}
|
||||
|
||||
|
||||
@ -29,8 +29,9 @@ class ReachingInsns
|
||||
friend class DataflowAnalysis<ReachingInsns<Backward>, BitVector, Backward>;
|
||||
|
||||
public:
|
||||
ReachingInsns(const BinaryContext &BC, BinaryFunction &BF)
|
||||
: InstrsDataflowAnalysis<ReachingInsns, Backward>(BC, BF) {}
|
||||
ReachingInsns(const BinaryContext &BC, BinaryFunction &BF,
|
||||
MCPlusBuilder::AllocatorIdTy AllocId = 0)
|
||||
: InstrsDataflowAnalysis<ReachingInsns, Backward>(BC, BF, AllocId) {}
|
||||
virtual ~ReachingInsns() {}
|
||||
|
||||
bool isInLoop(const BinaryBasicBlock &BB) {
|
||||
@ -46,8 +47,6 @@ public:
|
||||
}
|
||||
|
||||
void run() {
|
||||
NamedRegionTimer T1("RI", "Reaching Insns", "Dataflow", "Dataflow",
|
||||
opts::TimeOpts);
|
||||
InstrsDataflowAnalysis<ReachingInsns<Backward>, Backward>::run();
|
||||
}
|
||||
|
||||
|
||||
@ -36,7 +36,8 @@ public:
|
||||
/// set of clobbered registers.
|
||||
BitVector getFunctionClobberList(const BinaryFunction *Func);
|
||||
|
||||
RegAnalysis(BinaryContext &BC, std::map<uint64_t, BinaryFunction> *BFs,
|
||||
RegAnalysis(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> *BFs,
|
||||
BinaryFunctionCallGraph *CG);
|
||||
|
||||
/// Compute the set of registers \p Inst may read from, marking them in
|
||||
|
||||
@ -339,7 +339,7 @@ bool RegReAssign::conservativePassOverFunction(BinaryContext &BC,
|
||||
void RegReAssign::setupAggressivePass(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs) {
|
||||
setupConservativePass(BC, BFs);
|
||||
CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC, BFs)));
|
||||
CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC)));
|
||||
RA.reset(new RegAnalysis(BC, &BFs, &*CG));
|
||||
|
||||
GPRegs = BitVector(BC.MRI->getNumRegs(), false);
|
||||
@ -380,18 +380,16 @@ void RegReAssign::setupConservativePass(
|
||||
});
|
||||
}
|
||||
|
||||
void RegReAssign::runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) {
|
||||
void RegReAssign::runOnFunctions(BinaryContext &BC) {
|
||||
RegScore = std::vector<int64_t>(BC.MRI->getNumRegs(), 0);
|
||||
RankedRegs = std::vector<size_t>(BC.MRI->getNumRegs(), 0);
|
||||
|
||||
if (opts::AggressiveReAssign)
|
||||
setupAggressivePass(BC, BFs);
|
||||
setupAggressivePass(BC, BC.getBinaryFunctions());
|
||||
else
|
||||
setupConservativePass(BC, BFs);
|
||||
setupConservativePass(BC, BC.getBinaryFunctions());
|
||||
|
||||
for (auto &I : BFs) {
|
||||
for (auto &I : BC.getBinaryFunctions()) {
|
||||
auto &Function = I.second;
|
||||
|
||||
if (!Function.isSimple() || !opts::shouldProcess(Function))
|
||||
|
||||
@ -58,9 +58,7 @@ public:
|
||||
return BinaryFunctionPass::shouldPrint(BF) && FuncsChanged.count(&BF) > 0;
|
||||
}
|
||||
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@ -27,6 +27,7 @@ using namespace bolt;
|
||||
namespace opts {
|
||||
|
||||
extern cl::OptionCategory BoltOptCategory;
|
||||
extern cl::opt<bool> NoThreads;
|
||||
|
||||
static cl::opt<bool>
|
||||
PrintClusters("print-clusters",
|
||||
@ -65,7 +66,13 @@ struct HashPair {
|
||||
|
||||
}
|
||||
|
||||
void ClusterAlgorithm::computeClusterAverageFrequency() {
|
||||
void ClusterAlgorithm::computeClusterAverageFrequency(const BinaryContext &BC) {
|
||||
// Create a separate MCCodeEmitter to allow lock-free execution
|
||||
BinaryContext::IndependentCodeEmitter Emitter;
|
||||
if (!opts::NoThreads) {
|
||||
Emitter = BC.createIndependentMCCodeEmitter();
|
||||
}
|
||||
|
||||
AvgFreq.resize(Clusters.size(), 0.0);
|
||||
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
|
||||
double Freq = 0.0;
|
||||
@ -75,7 +82,7 @@ void ClusterAlgorithm::computeClusterAverageFrequency() {
|
||||
Freq += BB->getExecutionCount();
|
||||
// Estimate the size of a block in bytes at run time
|
||||
// NOTE: This might be inaccurate
|
||||
ClusterSize += BB->estimateSize();
|
||||
ClusterSize += BB->estimateSize(Emitter.MCE.get());
|
||||
}
|
||||
}
|
||||
AvgFreq[I] = ClusterSize == 0 ? 0 : Freq / ClusterSize;
|
||||
@ -525,7 +532,7 @@ void OptimizeBranchReorderAlgorithm::reorderBasicBlocks(
|
||||
auto &ClusterEdges = CAlgo->ClusterEdges;
|
||||
|
||||
// Compute clusters' average frequencies.
|
||||
CAlgo->computeClusterAverageFrequency();
|
||||
CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
|
||||
std::vector<double> &AvgFreq = CAlgo->AvgFreq;
|
||||
|
||||
if (opts::PrintClusters)
|
||||
@ -627,7 +634,7 @@ void OptimizeCacheReorderAlgorithm::reorderBasicBlocks(
|
||||
std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
|
||||
|
||||
// Compute clusters' average frequencies.
|
||||
CAlgo->computeClusterAverageFrequency();
|
||||
CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
|
||||
std::vector<double> &AvgFreq = CAlgo->AvgFreq;
|
||||
|
||||
if (opts::PrintClusters)
|
||||
|
||||
@ -53,7 +53,7 @@ public:
|
||||
/// the sum of average frequencies of its blocks (execution count / # instrs).
|
||||
/// The average frequencies are stored in the AvgFreq vector, indexed by the
|
||||
/// cluster indices in the Clusters vector.
|
||||
void computeClusterAverageFrequency();
|
||||
void computeClusterAverageFrequency(const BinaryContext &BC);
|
||||
|
||||
/// Clear clusters and related info.
|
||||
virtual void reset();
|
||||
|
||||
@ -379,9 +379,7 @@ bool ReorderData::markUnmoveableSymbols(BinaryContext &BC,
|
||||
return FoundUnmoveable;
|
||||
}
|
||||
|
||||
void ReorderData::runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) {
|
||||
void ReorderData::runOnFunctions(BinaryContext &BC) {
|
||||
static const char* DefaultSections[] = {
|
||||
".rodata",
|
||||
".data",
|
||||
@ -435,7 +433,8 @@ void ReorderData::runOnFunctions(BinaryContext &BC,
|
||||
std::tie(Order, SplitPointIdx) = sortedByCount(BC, *Section);
|
||||
} else {
|
||||
outs() << "BOLT-INFO: reorder-sections: ordering data by funcs\n";
|
||||
std::tie(Order, SplitPointIdx) = sortedByFunc(BC, *Section, BFs);
|
||||
std::tie(Order, SplitPointIdx) =
|
||||
sortedByFunc(BC, *Section, BC.getBinaryFunctions());
|
||||
}
|
||||
auto SplitPoint = Order.begin() + SplitPointIdx;
|
||||
|
||||
|
||||
@ -57,9 +57,7 @@ public:
|
||||
return "reorder-data";
|
||||
}
|
||||
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
} // namespace bolt
|
||||
|
||||
@ -276,21 +276,13 @@ std::vector<std::string> readFunctionOrderFile() {
|
||||
|
||||
}
|
||||
|
||||
void ReorderFunctions::runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) {
|
||||
if (!BC.HasRelocations && opts::ReorderFunctions != RT_NONE) {
|
||||
errs() << "BOLT-ERROR: Function reordering only works when "
|
||||
<< "relocs are enabled.\n";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
void ReorderFunctions::runOnFunctions(BinaryContext &BC) {
|
||||
auto &BFs = BC.getBinaryFunctions();
|
||||
if (opts::ReorderFunctions != RT_NONE &&
|
||||
opts::ReorderFunctions != RT_EXEC_COUNT &&
|
||||
opts::ReorderFunctions != RT_USER) {
|
||||
Cg = buildCallGraph(BC,
|
||||
BFs,
|
||||
[this](const BinaryFunction &BF) {
|
||||
[](const BinaryFunction &BF) {
|
||||
if (!BF.hasProfile())
|
||||
return true;
|
||||
if (BF.getState() != BinaryFunction::State::CFG)
|
||||
|
||||
@ -41,9 +41,7 @@ public:
|
||||
const char *getName() const override {
|
||||
return "reorder-functions";
|
||||
}
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
} // namespace bolt
|
||||
|
||||
@ -106,6 +106,49 @@ private:
|
||||
BitVector Valid;
|
||||
};
|
||||
|
||||
// This class holds cached results of specified type for a pair of Clusters.
// It can invalidate all cache entries associated with a given Cluster.
// The functions set, get and contains are thread safe when called with
// distinct keys.
//
// Entries live in a dense Size x Size table addressed by
// (First->id(), Second->id()); every id passed in must be smaller than Size.
//
// Validity flags are kept as one char per entry. They are deliberately not a
// std::vector<bool>: its packed bits would make writes for neighbouring keys
// touch the same storage, which would defeat the distinct-key guarantee above.
// They are also independent of ValueType, so ValueType only has to be default
// constructible and copy assignable.
// NOTE(review): instantiating with ValueType = bool would still make Cache a
// packed std::vector<bool>; avoid that instantiation for concurrent use.
template <typename Cluster, typename ValueType>
class ClusterPairCacheThreadSafe {
public:
  // Creates an empty cache able to hold every ordered pair of Size clusters.
  explicit ClusterPairCacheThreadSafe(size_t Size)
      : Size(Size), Cache(Size * Size), Valid(Size * Size, 0) {}

  // Returns true if a value is currently cached for (First, Second).
  bool contains(const Cluster *First, const Cluster *Second) const {
    return Valid[index(First, Second)] != 0;
  }

  // Returns the value cached for (First, Second); the entry must be present.
  ValueType get(const Cluster *First, const Cluster *Second) const {
    assert(contains(First, Second));
    return Cache[index(First, Second)];
  }

  // Stores Value for (First, Second) and marks the entry as present.
  void set(const Cluster *First, const Cluster *Second, ValueType Value) {
    const auto Index = index(First, Second);
    Cache[Index] = Value;
    Valid[Index] = 1;
  }

  // Drops every entry in which C appears as either the first or the second
  // element of the pair, i.e. the whole row and the whole column of C.
  void invalidate(const Cluster *C) {
    const size_t Id = C->id();
    assert(Id < Size && "cluster id out of range");
    for (size_t Other = 0; Other < Size; ++Other) {
      Valid[(Id * Size) + Other] = 0;
      Valid[(Other * Size) + Id] = 0;
    }
  }

private:
  // Number of clusters; the tables below hold Size * Size entries.
  size_t Size;
  // Cached values, meaningful only where the matching Valid flag is set.
  std::vector<ValueType> Cache;
  // One flag per entry: non-zero when the entry in Cache is present.
  std::vector<char> Valid;

  // Maps an ordered pair of clusters to its position in the flat tables.
  size_t index(const Cluster *First, const Cluster *Second) const {
    assert(First->id() < Size && Second->id() < Size &&
           "cluster id out of range");
    return (First->id() * Size) + Second->id();
  }
};
|
||||
|
||||
} // namespace bolt
|
||||
} // namespace llvm
|
||||
|
||||
|
||||
@ -138,9 +138,10 @@ BinaryFunction *createNewRetpoline(BinaryContext &BC,
|
||||
BB2.addInstruction(PushR11);
|
||||
|
||||
MCInst LoadCalleeAddrs;
|
||||
MIB.createLoad(LoadCalleeAddrs, BrInfo.BaseRegNum, BrInfo.ScaleValue,
|
||||
BrInfo.IndexRegNum, BrInfo.DispValue, BrInfo.DispExpr,
|
||||
BrInfo.SegRegNum, MIB.getX86R11(), 8);
|
||||
const auto &MemRef = BrInfo.Memory;
|
||||
MIB.createLoad(LoadCalleeAddrs, MemRef.BaseRegNum, MemRef.ScaleValue,
|
||||
MemRef.IndexRegNum, MemRef.DispValue, MemRef.DispExpr,
|
||||
MemRef.SegRegNum, MIB.getX86R11(), 8);
|
||||
|
||||
BB2.addInstruction(LoadCalleeAddrs);
|
||||
|
||||
@ -186,27 +187,29 @@ std::string createRetpolineFunctionTag(BinaryContext &BC,
|
||||
|
||||
std::string Tag = "__retpoline_mem_";
|
||||
|
||||
const auto &MemRef = BrInfo.Memory;
|
||||
|
||||
std::string DispExprStr;
|
||||
if (BrInfo.DispExpr) {
|
||||
if (MemRef.DispExpr) {
|
||||
llvm::raw_string_ostream Ostream(DispExprStr);
|
||||
BrInfo.DispExpr->print(Ostream, BC.AsmInfo.get());
|
||||
MemRef.DispExpr->print(Ostream, BC.AsmInfo.get());
|
||||
Ostream.flush();
|
||||
}
|
||||
|
||||
Tag += BrInfo.BaseRegNum != BC.MIB->getX86NoRegister()
|
||||
? "r" + to_string(BrInfo.BaseRegNum)
|
||||
Tag += MemRef.BaseRegNum != BC.MIB->getNoRegister()
|
||||
? "r" + to_string(MemRef.BaseRegNum)
|
||||
: "";
|
||||
|
||||
Tag +=
|
||||
BrInfo.DispExpr ? "+" + DispExprStr : "+" + to_string(BrInfo.DispValue);
|
||||
MemRef.DispExpr ? "+" + DispExprStr : "+" + to_string(MemRef.DispValue);
|
||||
|
||||
Tag += BrInfo.IndexRegNum != BC.MIB->getX86NoRegister()
|
||||
? "+" + to_string(BrInfo.ScaleValue) + "*" +
|
||||
to_string(BrInfo.IndexRegNum)
|
||||
Tag += MemRef.IndexRegNum != BC.MIB->getNoRegister()
|
||||
? "+" + to_string(MemRef.ScaleValue) + "*" +
|
||||
to_string(MemRef.IndexRegNum)
|
||||
: "";
|
||||
|
||||
Tag += BrInfo.SegRegNum != BC.MIB->getX86NoRegister()
|
||||
? "_seg_" + to_string(BrInfo.SegRegNum)
|
||||
Tag += MemRef.SegRegNum != BC.MIB->getNoRegister()
|
||||
? "_seg_" + to_string(MemRef.SegRegNum)
|
||||
: "";
|
||||
|
||||
return Tag;
|
||||
@ -232,10 +235,11 @@ void createBranchReplacement(BinaryContext &BC,
|
||||
auto &MIB = *BC.MIB;
|
||||
// Load the branch address in r11 if available
|
||||
if (BrInfo.isMem() && R11Available) {
|
||||
const auto &MemRef = BrInfo.Memory;
|
||||
MCInst LoadCalleeAddrs;
|
||||
MIB.createLoad(LoadCalleeAddrs, BrInfo.BaseRegNum, BrInfo.ScaleValue,
|
||||
BrInfo.IndexRegNum, BrInfo.DispValue, BrInfo.DispExpr,
|
||||
BrInfo.SegRegNum, MIB.getX86R11(), 8);
|
||||
MIB.createLoad(LoadCalleeAddrs, MemRef.BaseRegNum, MemRef.ScaleValue,
|
||||
MemRef.IndexRegNum, MemRef.DispValue, MemRef.DispExpr,
|
||||
MemRef.SegRegNum, MIB.getX86R11(), 8);
|
||||
Replacement.push_back(LoadCalleeAddrs);
|
||||
}
|
||||
|
||||
@ -255,9 +259,10 @@ IndirectBranchInfo::IndirectBranchInfo(MCInst &Inst, MCPlusBuilder &MIB) {
|
||||
|
||||
if (MIB.isBranchOnMem(Inst)) {
|
||||
IsMem = true;
|
||||
if (!MIB.evaluateX86MemoryOperand(Inst, &BaseRegNum, &ScaleValue,
|
||||
&IndexRegNum, &DispValue, &SegRegNum,
|
||||
&DispExpr)) {
|
||||
if (!MIB.evaluateX86MemoryOperand(Inst, &Memory.BaseRegNum,
|
||||
&Memory.ScaleValue,
|
||||
&Memory.IndexRegNum, &Memory.DispValue,
|
||||
&Memory.SegRegNum, &Memory.DispExpr)) {
|
||||
llvm_unreachable("not expected");
|
||||
}
|
||||
} else if (MIB.isBranchOnReg(Inst)) {
|
||||
@ -268,10 +273,7 @@ IndirectBranchInfo::IndirectBranchInfo(MCInst &Inst, MCPlusBuilder &MIB) {
|
||||
}
|
||||
}
|
||||
|
||||
void RetpolineInsertion::runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) {
|
||||
|
||||
void RetpolineInsertion::runOnFunctions(BinaryContext &BC) {
|
||||
if (!opts::InsertRetpolines)
|
||||
return;
|
||||
|
||||
@ -282,7 +284,7 @@ void RetpolineInsertion::runOnFunctions(BinaryContext &BC,
|
||||
|
||||
auto &MIB = *BC.MIB;
|
||||
uint32_t RetpolinedBranches = 0;
|
||||
for (auto &It : BFs) {
|
||||
for (auto &It : BC.getBinaryFunctions()) {
|
||||
auto &Function = It.second;
|
||||
for (auto &BB : Function) {
|
||||
for (auto It = BB.begin(); It != BB.end(); ++It) {
|
||||
@ -309,12 +311,13 @@ void RetpolineInsertion::runOnFunctions(BinaryContext &BC,
|
||||
// If the instruction addressing pattern uses rsp and the retpoline
|
||||
// loads the callee address then displacement needs to be updated
|
||||
if (BrInfo.isMem() && !R11Available) {
|
||||
auto &MemRef = BrInfo.Memory;
|
||||
auto Addend = (BrInfo.isJump() || BrInfo.isTailCall()) ? 8 : 16;
|
||||
if (BrInfo.BaseRegNum == MIB.getStackPointer()) {
|
||||
BrInfo.DispValue += Addend;
|
||||
if (MemRef.BaseRegNum == MIB.getStackPointer()) {
|
||||
MemRef.DispValue += Addend;
|
||||
}
|
||||
if (BrInfo.IndexRegNum == MIB.getStackPointer())
|
||||
BrInfo.DispValue += Addend * BrInfo.ScaleValue;
|
||||
if (MemRef.IndexRegNum == MIB.getStackPointer())
|
||||
MemRef.DispValue += Addend * MemRef.ScaleValue;
|
||||
}
|
||||
|
||||
TargetRetpoline = getOrCreateRetpoline(BC, BrInfo, R11Available);
|
||||
|
||||
@ -34,19 +34,21 @@ public:
|
||||
bool isJump() const { return !IsCall; }
|
||||
bool isTailCall() const { return IsTailCall; }
|
||||
|
||||
struct MemOpInfo {
|
||||
unsigned BaseRegNum;
|
||||
int64_t ScaleValue;
|
||||
unsigned IndexRegNum;
|
||||
int64_t DispValue;
|
||||
unsigned SegRegNum;
|
||||
const MCExpr *DispExpr{nullptr};
|
||||
};
|
||||
|
||||
union {
|
||||
// Register branch information
|
||||
MCPhysReg BranchReg;
|
||||
|
||||
// Memory branch information
|
||||
struct {
|
||||
unsigned BaseRegNum;
|
||||
int64_t ScaleValue;
|
||||
unsigned IndexRegNum;
|
||||
int64_t DispValue;
|
||||
unsigned SegRegNum;
|
||||
const MCExpr *DispExpr{nullptr};
|
||||
};
|
||||
MemOpInfo Memory;
|
||||
};
|
||||
};
|
||||
|
||||
@ -71,9 +73,7 @@ public:
|
||||
|
||||
const char *getName() const override { return "retpoline-insertion"; }
|
||||
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
} // namespace bolt
|
||||
} // namespace llvm
|
||||
|
||||
@ -102,7 +102,7 @@ void CalleeSavedAnalysis::analyzeSaves() {
|
||||
CalleeSaved.set(FIE->RegOrImm);
|
||||
SaveFIEByReg[FIE->RegOrImm] = &*FIE;
|
||||
SavingCost[FIE->RegOrImm] += InsnToBB[&Inst]->getKnownExecutionCount();
|
||||
BC.MIB->addAnnotation(Inst, getSaveTag(), FIE->RegOrImm);
|
||||
BC.MIB->addAnnotation(Inst, getSaveTag(), FIE->RegOrImm, AllocatorId);
|
||||
OffsetsByReg[FIE->RegOrImm] = FIE->StackOffset;
|
||||
DEBUG(dbgs() << "Logging new candidate for Callee-Saved Reg: "
|
||||
<< FIE->RegOrImm << "\n");
|
||||
@ -153,7 +153,8 @@ void CalleeSavedAnalysis::analyzeRestores() {
|
||||
<< "\n");
|
||||
if (LoadFIEByReg[FIE->RegOrImm] == nullptr)
|
||||
LoadFIEByReg[FIE->RegOrImm] = &*FIE;
|
||||
BC.MIB->addAnnotation(Inst, getRestoreTag(), FIE->RegOrImm);
|
||||
BC.MIB->addAnnotation(Inst, getRestoreTag(), FIE->RegOrImm,
|
||||
AllocatorId);
|
||||
HasRestores.set(FIE->RegOrImm);
|
||||
}
|
||||
Prev = &Inst;
|
||||
@ -311,7 +312,7 @@ void StackLayoutModifier::checkStackPointerRestore(MCInst &Point) {
|
||||
|
||||
// We are restoring SP to an old value based on FP. Mark it as a stack
|
||||
// access to be fixed later.
|
||||
BC.MIB->addAnnotation(Point, getSlotTag(), Output);
|
||||
BC.MIB->addAnnotation(Point, getSlotTag(), Output, AllocatorId);
|
||||
}
|
||||
|
||||
void StackLayoutModifier::classifyStackAccesses() {
|
||||
@ -354,7 +355,7 @@ void StackLayoutModifier::classifyStackAccesses() {
|
||||
// We are free to go. Add it as available stack slot which we know how
|
||||
// to move it.
|
||||
AvailableRegions[FIEX->StackOffset] = FIEX->Size;
|
||||
BC.MIB->addAnnotation(Inst, getSlotTag(), FIEX->StackOffset);
|
||||
BC.MIB->addAnnotation(Inst, getSlotTag(), FIEX->StackOffset, AllocatorId);
|
||||
RegionToRegMap[FIEX->StackOffset].insert(FIEX->RegOrImm);
|
||||
RegToRegionMap[FIEX->RegOrImm].insert(FIEX->StackOffset);
|
||||
DEBUG(dbgs() << "Adding region " << FIEX->StackOffset << " size "
|
||||
@ -371,7 +372,7 @@ void StackLayoutModifier::classifyCFIs() {
|
||||
auto recordAccess = [&](MCInst *Inst, int64_t Offset) {
|
||||
const uint16_t Reg = BC.MRI->getLLVMRegNum(CfaReg, /*isEH=*/false);
|
||||
if (Reg == BC.MIB->getStackPointer() || Reg == BC.MIB->getFramePointer()) {
|
||||
BC.MIB->addAnnotation(*Inst, getSlotTag(), Offset);
|
||||
BC.MIB->addAnnotation(*Inst, getSlotTag(), Offset, AllocatorId);
|
||||
DEBUG(dbgs() << "Recording CFI " << Offset << "\n");
|
||||
} else {
|
||||
IsSimple = false;
|
||||
@ -400,12 +401,14 @@ void StackLayoutModifier::classifyCFIs() {
|
||||
recordAccess(&Inst, CFI->getOffset());
|
||||
BC.MIB->addAnnotation(Inst, getOffsetCFIRegTag(),
|
||||
BC.MRI->getLLVMRegNum(CFI->getRegister(),
|
||||
/*isEH=*/false));
|
||||
/*isEH=*/false),
|
||||
AllocatorId);
|
||||
break;
|
||||
case MCCFIInstruction::OpSameValue:
|
||||
BC.MIB->addAnnotation(Inst, getOffsetCFIRegTag(),
|
||||
BC.MRI->getLLVMRegNum(CFI->getRegister(),
|
||||
/*isEH=*/false));
|
||||
/*isEH=*/false),
|
||||
AllocatorId);
|
||||
break;
|
||||
case MCCFIInstruction::OpRememberState:
|
||||
CFIStack.push(std::make_pair(CfaOffset, CfaReg));
|
||||
@ -432,7 +435,7 @@ void StackLayoutModifier::classifyCFIs() {
|
||||
void StackLayoutModifier::scheduleChange(
|
||||
MCInst &Inst, StackLayoutModifier::WorklistItem Item) {
|
||||
auto &WList = BC.MIB->getOrCreateAnnotationAs<std::vector<WorklistItem>>(
|
||||
Inst, getTodoTag());
|
||||
Inst, getTodoTag(), AllocatorId);
|
||||
WList.push_back(Item);
|
||||
}
|
||||
|
||||
@ -510,7 +513,7 @@ bool StackLayoutModifier::collapseRegion(MCInst *Alloc, int64_t RegionAddr,
|
||||
}
|
||||
|
||||
if (Slot == RegionAddr) {
|
||||
BC.MIB->addAnnotation(Inst, "AccessesDeletedPos", 0U);
|
||||
BC.MIB->addAnnotation(Inst, "AccessesDeletedPos", 0U, AllocatorId);
|
||||
continue;
|
||||
}
|
||||
if (BC.MIB->isPush(Inst) || BC.MIB->isPop(Inst)) {
|
||||
@ -771,7 +774,7 @@ void ShrinkWrapping::pruneUnwantedCSRs() {
|
||||
}
|
||||
|
||||
void ShrinkWrapping::computeSaveLocations() {
|
||||
SavePos = std::vector<SmallPtrSet<MCInst *, 4>>(BC.MRI->getNumRegs());
|
||||
SavePos = std::vector<SmallSetVector<MCInst *, 4>>(BC.MRI->getNumRegs());
|
||||
auto &RI = Info.getReachingInsnsBackwards();
|
||||
auto &DA = Info.getDominatorAnalysis();
|
||||
auto &SPT = Info.getStackPointerTracking();
|
||||
@ -960,7 +963,7 @@ ShrinkWrapping::doRestorePlacement(MCInst *BestPosSave, unsigned CSR,
|
||||
// In case of a critical edge, we need to create extra BBs to host restores
|
||||
// into edges transitioning to the dominance frontier, otherwise we pull these
|
||||
// restores to inside the dominated area.
|
||||
Frontier = DA.getDominanceFrontierFor(*BestPosSave);
|
||||
Frontier = DA.getDominanceFrontierFor(*BestPosSave).takeVector();
|
||||
DEBUG({
|
||||
dbgs() << "Dumping dominance frontier for ";
|
||||
BC.printInstruction(dbgs(), *BestPosSave);
|
||||
@ -1454,13 +1457,13 @@ protected:
|
||||
public:
|
||||
PredictiveStackPointerTracking(const BinaryContext &BC, BinaryFunction &BF,
|
||||
decltype(ShrinkWrapping::Todo) &TodoMap,
|
||||
DataflowInfoManager &Info)
|
||||
: StackPointerTrackingBase<PredictiveStackPointerTracking>(BC, BF),
|
||||
DataflowInfoManager &Info,
|
||||
MCPlusBuilder::AllocatorIdTy AllocatorId = 0)
|
||||
: StackPointerTrackingBase<PredictiveStackPointerTracking>(BC, BF,
|
||||
AllocatorId),
|
||||
TodoMap(TodoMap), Info(Info) {}
|
||||
|
||||
void run() {
|
||||
NamedRegionTimer T1("PSPT", "Predictive Stack Pointer Tracking", "Dataflow",
|
||||
"Dataflow", opts::TimeOpts);
|
||||
StackPointerTrackingBase<PredictiveStackPointerTracking>::run();
|
||||
}
|
||||
};
|
||||
@ -1553,7 +1556,7 @@ void ShrinkWrapping::rebuildCFIForSP() {
|
||||
continue;
|
||||
auto *CFI = BF.getCFIFor(Inst);
|
||||
if (CFI->getOperation() == MCCFIInstruction::OpDefCfaOffset)
|
||||
BC.MIB->addAnnotation(Inst, "DeleteMe", 0U);
|
||||
BC.MIB->addAnnotation(Inst, "DeleteMe", 0U, AllocatorId);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1812,7 +1815,7 @@ BBIterTy ShrinkWrapping::processInsertionsList(
|
||||
}
|
||||
|
||||
bool ShrinkWrapping::processInsertions() {
|
||||
PredictiveStackPointerTracking PSPT(BC, BF, Todo, Info);
|
||||
PredictiveStackPointerTracking PSPT(BC, BF, Todo, Info, AllocatorId);
|
||||
PSPT.run();
|
||||
|
||||
bool Changes{false};
|
||||
@ -1910,6 +1913,15 @@ bool ShrinkWrapping::perform() {
|
||||
PopOffsetByReg = std::vector<int64_t>(BC.MRI->getNumRegs(), 0LL);
|
||||
DomOrder = std::vector<MCPhysReg>(BC.MRI->getNumRegs(), 0);
|
||||
|
||||
if (BF.checkForAmbiguousJumpTables()) {
|
||||
DEBUG(dbgs() << "BOLT-DEBUG: ambiguous JTs in " << BF.getPrintName()
|
||||
<< ".\n");
|
||||
// We could call disambiguateJumpTables here, but it is probably not worth
|
||||
// the cost (of duplicating potentially large jump tables that could regress
|
||||
// dcache misses). Moreover, ambiguous JTs are rare and coming from code
|
||||
// written in assembly language. Just bail.
|
||||
return false;
|
||||
}
|
||||
SLM.initialize();
|
||||
CSA.compute();
|
||||
classifyCSRUses();
|
||||
|
||||
@ -27,6 +27,8 @@ class CalleeSavedAnalysis {
|
||||
const BinaryContext &BC;
|
||||
BinaryFunction &BF;
|
||||
DataflowInfoManager &Info;
|
||||
MCPlusBuilder::AllocatorIdTy AllocatorId;
|
||||
|
||||
Optional<unsigned> SaveTagIndex;
|
||||
Optional<unsigned> RestoreTagIndex;
|
||||
|
||||
@ -39,12 +41,6 @@ class CalleeSavedAnalysis {
|
||||
/// function.
|
||||
void analyzeRestores();
|
||||
|
||||
/// Returns the identifying string used to annotate instructions with metadata
|
||||
/// for this analysis. These are deleted in the destructor.
|
||||
static StringRef getSaveTagName() {
|
||||
return StringRef("CSA-SavedReg");
|
||||
}
|
||||
|
||||
unsigned getSaveTag() {
|
||||
if (SaveTagIndex)
|
||||
return *SaveTagIndex;
|
||||
@ -52,10 +48,6 @@ class CalleeSavedAnalysis {
|
||||
return *SaveTagIndex;
|
||||
}
|
||||
|
||||
static StringRef getRestoreTagName() {
|
||||
return StringRef("CSA-RestoredReg");
|
||||
}
|
||||
|
||||
unsigned getRestoreTag() {
|
||||
if (RestoreTagIndex)
|
||||
return *RestoreTagIndex;
|
||||
@ -72,8 +64,9 @@ public:
|
||||
std::vector<const FrameIndexEntry*> LoadFIEByReg;
|
||||
|
||||
CalleeSavedAnalysis(const FrameAnalysis &FA, const BinaryContext &BC,
|
||||
BinaryFunction &BF, DataflowInfoManager &Info)
|
||||
: FA(FA), BC(BC), BF(BF), Info(Info),
|
||||
BinaryFunction &BF, DataflowInfoManager &Info,
|
||||
MCPlusBuilder::AllocatorIdTy AllocId)
|
||||
: FA(FA), BC(BC), BF(BF), Info(Info), AllocatorId(AllocId),
|
||||
CalleeSaved(BC.MRI->getNumRegs(), false),
|
||||
OffsetsByReg(BC.MRI->getNumRegs(), 0LL),
|
||||
HasRestores(BC.MRI->getNumRegs(), false),
|
||||
@ -112,6 +105,17 @@ public:
|
||||
/// instructions).
|
||||
std::vector<MCInst *> getSavesByReg(uint16_t Reg);
|
||||
std::vector<MCInst *> getRestoresByReg(uint16_t Reg);
|
||||
|
||||
/// Returns the identifying string used to annotate instructions with metadata
|
||||
/// for this analysis. These are deleted in the destructor.
|
||||
static StringRef getSaveTagName() {
|
||||
return StringRef("CSA-SavedReg");
|
||||
}
|
||||
|
||||
static StringRef getRestoreTagName() {
|
||||
return StringRef("CSA-RestoredReg");
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
/// Identifies in a given binary function all stack regions being used and allow
|
||||
@ -122,6 +126,7 @@ class StackLayoutModifier {
|
||||
const BinaryContext &BC;
|
||||
BinaryFunction &BF;
|
||||
DataflowInfoManager &Info;
|
||||
MCPlusBuilder::AllocatorIdTy AllocatorId;
|
||||
|
||||
// Keep track of stack slots we know how to safely move
|
||||
std::map<int64_t, int64_t> AvailableRegions;
|
||||
@ -217,20 +222,11 @@ private:
|
||||
return *OffsetCFIRegTagIndex;
|
||||
}
|
||||
|
||||
static StringRef getTodoTagName() {
|
||||
return StringRef("SLM-TodoTag");
|
||||
}
|
||||
static StringRef getSlotTagName() {
|
||||
return StringRef("SLM-SlotTag");
|
||||
}
|
||||
static StringRef getOffsetCFIRegTagName() {
|
||||
return StringRef("SLM-OffsetCFIReg");
|
||||
}
|
||||
|
||||
public:
|
||||
StackLayoutModifier(const FrameAnalysis &FA, const BinaryContext &BC,
|
||||
BinaryFunction &BF, DataflowInfoManager &Info)
|
||||
: FA(FA), BC(BC), BF(BF), Info(Info) {}
|
||||
BinaryFunction &BF, DataflowInfoManager &Info,
|
||||
MCPlusBuilder::AllocatorIdTy AllocId)
|
||||
: FA(FA), BC(BC), BF(BF), Info(Info), AllocatorId(AllocId) {}
|
||||
|
||||
~StackLayoutModifier() {
|
||||
for (auto &BB : BF) {
|
||||
@ -283,6 +279,19 @@ public:
|
||||
/// Perform initial assessment of the function trying to understand its stack
|
||||
/// accesses.
|
||||
void initialize();
|
||||
|
||||
static StringRef getTodoTagName() {
|
||||
return StringRef("SLM-TodoTag");
|
||||
}
|
||||
|
||||
static StringRef getSlotTagName() {
|
||||
return StringRef("SLM-SlotTag");
|
||||
}
|
||||
|
||||
static StringRef getOffsetCFIRegTagName() {
|
||||
return StringRef("SLM-OffsetCFIReg");
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
/// Implements a pass to optimize callee-saved register spills. These spills
|
||||
@ -294,6 +303,7 @@ class ShrinkWrapping {
|
||||
const BinaryContext &BC;
|
||||
BinaryFunction &BF;
|
||||
DataflowInfoManager &Info;
|
||||
MCPlusBuilder::AllocatorIdTy AllocatorId;
|
||||
StackLayoutModifier SLM;
|
||||
/// For each CSR, store a vector of all CFI indexes deleted as a consequence
|
||||
/// of moving this Callee-Saved Reg
|
||||
@ -306,7 +316,7 @@ class ShrinkWrapping {
|
||||
std::vector<int64_t> PopOffsetByReg;
|
||||
std::vector<MCPhysReg> DomOrder;
|
||||
CalleeSavedAnalysis CSA;
|
||||
std::vector<SmallPtrSet<MCInst *, 4>> SavePos;
|
||||
std::vector<SmallSetVector<MCInst *, 4>> SavePos;
|
||||
std::vector<uint64_t> BestSaveCount;
|
||||
std::vector<MCInst *> BestSavePos;
|
||||
|
||||
@ -381,7 +391,7 @@ private:
|
||||
void scheduleChange(ProgramPoint PP, T&& ...Item) {
|
||||
if (PP.isInst()) {
|
||||
auto &WList = BC.MIB->getOrCreateAnnotationAs<std::vector<WorklistItem>>(
|
||||
*PP.getInst(), getAnnotationIndex());
|
||||
*PP.getInst(), getAnnotationIndex(), AllocatorId);
|
||||
WList.emplace_back(std::forward<T>(Item)...);
|
||||
return;
|
||||
}
|
||||
@ -398,7 +408,7 @@ private:
|
||||
BB = *BB->succ_begin();
|
||||
}
|
||||
auto &WList = BC.MIB->getOrCreateAnnotationAs<std::vector<WorklistItem>>(
|
||||
*BB->begin(), getAnnotationIndex());
|
||||
*BB->begin(), getAnnotationIndex(), AllocatorId);
|
||||
WList.emplace_back(std::forward<T>(Item)...);
|
||||
}
|
||||
|
||||
@ -517,9 +527,10 @@ private:
|
||||
|
||||
public:
|
||||
ShrinkWrapping(const FrameAnalysis &FA, const BinaryContext &BC,
|
||||
BinaryFunction &BF, DataflowInfoManager &Info)
|
||||
: FA(FA), BC(BC), BF(BF), Info(Info), SLM(FA, BC, BF, Info),
|
||||
CSA(FA, BC, BF, Info) {}
|
||||
BinaryFunction &BF, DataflowInfoManager &Info,
|
||||
MCPlusBuilder::AllocatorIdTy AllocId)
|
||||
: FA(FA), BC(BC), BF(BF), Info(Info), AllocatorId(AllocId),
|
||||
SLM(FA, BC, BF, Info, AllocId), CSA(FA, BC, BF, Info, AllocId) {}
|
||||
|
||||
~ShrinkWrapping() {
|
||||
for (auto &BB : BF) {
|
||||
|
||||
@ -35,14 +35,13 @@ class StackAllocationAnalysis
|
||||
|
||||
public:
|
||||
StackAllocationAnalysis(const BinaryContext &BC, BinaryFunction &BF,
|
||||
StackPointerTracking &SPT)
|
||||
: InstrsDataflowAnalysis<StackAllocationAnalysis, false>(BC, BF),
|
||||
StackPointerTracking &SPT,
|
||||
MCPlusBuilder::AllocatorIdTy AllocId)
|
||||
: InstrsDataflowAnalysis<StackAllocationAnalysis, false>(BC, BF, AllocId),
|
||||
SPT(SPT) {}
|
||||
virtual ~StackAllocationAnalysis() {}
|
||||
|
||||
void run() {
|
||||
NamedRegionTimer T1("SAA", "Stack Allocation Analysis", "Dataflow",
|
||||
"Dataflow", opts::TimeOpts);
|
||||
InstrsDataflowAnalysis<StackAllocationAnalysis, false>::run();
|
||||
}
|
||||
|
||||
|
||||
@ -36,8 +36,6 @@ public:
|
||||
virtual ~StackAvailableExpressions() {}
|
||||
|
||||
void run() {
|
||||
NamedRegionTimer T1("SAE", "Stack Available Expressions", "Dataflow",
|
||||
"Dataflow", opts::TimeOpts);
|
||||
InstrsDataflowAnalysis<StackAvailableExpressions>::run();
|
||||
}
|
||||
|
||||
|
||||
@ -14,9 +14,10 @@
|
||||
namespace llvm {
|
||||
namespace bolt {
|
||||
|
||||
StackPointerTracking::StackPointerTracking(const BinaryContext &BC,
|
||||
BinaryFunction &BF)
|
||||
: StackPointerTrackingBase<StackPointerTracking>(BC, BF) {}
|
||||
StackPointerTracking::StackPointerTracking(
|
||||
const BinaryContext &BC, BinaryFunction &BF,
|
||||
MCPlusBuilder::AllocatorIdTy AllocatorId)
|
||||
: StackPointerTrackingBase<StackPointerTracking>(BC, BF, AllocatorId) {}
|
||||
|
||||
} // end namespace bolt
|
||||
} // end namespace llvm
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user