BOLT/src/Passes/CachePlusReorderAlgorithm.cpp
laith sakka 227b921898 Run reorder blocks in parallel
Summary:
This diff change reorderBasicBlocks pass to run in parallel,
it does so by adding locks to the fix branches function,
and creating temporary MCCodeEmitters when estimating basic block code size.

Differential Revision: D16161149

fbshipit-source-id: ef3774c51cd
2019-07-15 19:31:24 -07:00

713 lines
24 KiB
C++

//===--- CachePlusReorderAlgorithm.cpp - Order basic blocks ---------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#include "BinaryBasicBlock.h"
#include "BinaryFunction.h"
#include "CacheMetrics.h"
#include "ReorderAlgorithm.h"
#include "ReorderUtils.h"
#include "llvm/Support/Options.h"
using namespace llvm;
using namespace bolt;
using EdgeList = std::vector<std::pair<BinaryBasicBlock *, uint64_t>>;
namespace opts {
extern cl::OptionCategory BoltOptCategory;
extern cl::opt<bool> NoThreads;
cl::opt<unsigned>
ClusterSplitThreshold("cluster-split-threshold",
cl::desc("The maximum size of a cluster to apply splitting"),
cl::init(128),
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
}
namespace llvm {
namespace bolt {
namespace {
// A cluster (ordered sequence) of basic blocks
class Cluster {
public:
Cluster(BinaryBasicBlock *BB, uint64_t ExecutionCount_, uint64_t Size_)
: Id(BB->getLayoutIndex()),
IsEntry(BB->getLayoutIndex() == 0),
ExecutionCount(ExecutionCount_),
Size(Size_),
Score(0) {
Blocks.push_back(BB);
}
size_t id() const {
return Id;
}
uint64_t size() const {
return Size;
}
double density() const {
return static_cast<double>(ExecutionCount) / Size;
}
bool isCold() const {
return ExecutionCount == 0;
}
uint64_t executionCount() const {
return ExecutionCount;
}
bool isEntryPoint() const {
return IsEntry;
}
double score() const {
return Score;
}
const std::vector<BinaryBasicBlock *> &blocks() const {
return Blocks;
}
/// Update the list of basic blocks and aggregated cluster data
void merge(const Cluster *Other,
const std::vector<BinaryBasicBlock *> &MergedBlocks,
double MergedScore) {
Blocks = MergedBlocks;
IsEntry |= Other->IsEntry;
ExecutionCount += Other->ExecutionCount;
Size += Other->Size;
Score = MergedScore;
}
void clear() {
Blocks.clear();
}
private:
std::vector<BinaryBasicBlock *> Blocks;
size_t Id;
bool IsEntry;
uint64_t ExecutionCount;
uint64_t Size;
double Score;
};
using ClusterIter = std::vector<BinaryBasicBlock *>::const_iterator;
// A wrapper around three clusters of basic blocks; it is used to avoid extra
// instantiation of the vectors.
class MergedCluster {
public:
MergedCluster(ClusterIter Begin1,
ClusterIter End1,
ClusterIter Begin2,
ClusterIter End2,
ClusterIter Begin3,
ClusterIter End3)
: Begin1(Begin1),
End1(End1),
Begin2(Begin2),
End2(End2),
Begin3(Begin3),
End3(End3) {}
template<typename F>
void forEach(const F &Func) const {
for (auto It = Begin1; It != End1; It++)
Func(*It);
for (auto It = Begin2; It != End2; It++)
Func(*It);
for (auto It = Begin3; It != End3; It++)
Func(*It);
}
std::vector<BinaryBasicBlock *> getBlocks() const {
std::vector<BinaryBasicBlock *> Result;
Result.reserve(std::distance(Begin1, End1) +
std::distance(Begin2, End2) +
std::distance(Begin3, End3));
Result.insert(Result.end(), Begin1, End1);
Result.insert(Result.end(), Begin2, End2);
Result.insert(Result.end(), Begin3, End3);
return Result;
}
const BinaryBasicBlock *getFirstBlock() const {
return *Begin1;
}
private:
ClusterIter Begin1;
ClusterIter End1;
ClusterIter Begin2;
ClusterIter End2;
ClusterIter Begin3;
ClusterIter End3;
};
/// Deterministically compare clusters by their density in decreasing order
bool compareClusters(const Cluster *C1, const Cluster *C2) {
// Original entry point to the front
if (C1->isEntryPoint())
return true;
if (C2->isEntryPoint())
return false;
const double D1 = C1->density();
const double D2 = C2->density();
if (D1 != D2)
return D1 > D2;
// Making the order deterministic
return C1->id() < C2->id();
}
/// Deterministically compare pairs of clusters
bool compareClusterPairs(const Cluster *A1, const Cluster *B1,
const Cluster *A2, const Cluster *B2) {
const auto Samples1 = A1->executionCount() + B1->executionCount();
const auto Samples2 = A2->executionCount() + B2->executionCount();
if (Samples1 != Samples2)
return Samples1 < Samples2;
// Making the order deterministic
if (A1 != A2)
return A1->id() < A2->id();
return B1->id() < B2->id();
}
} // end namespace anonymous
/// CachePlus - layout of basic blocks with i-cache optimization.
///
/// Similarly to OptimizeCacheReorderAlgorithm, this algorithm is a greedy
/// heuristic that works with clusters (ordered sequences) of basic blocks.
/// Initially all clusters are isolated basic blocks. On every iteration,
/// we pick a pair of clusters whose merging yields the biggest increase in
/// the ExtTSP metric (see CacheMetrics.cpp for exact implementation), which
/// models how i-cache "friendly" a specific cluster is. A pair of clusters
/// giving the maximum gain is merged into a new cluster. The procedure stops
/// when there is only one cluster left, or when merging does not increase
/// ExtTSP. In the latter case, the remaining clusters are sorted by density.
///
/// An important aspect is the way two clusters are merged. Unlike earlier
/// algorithms (e.g., OptimizeCacheReorderAlgorithm or Pettis-Hansen), two
/// clusters, X and Y, are first split into three, X1, X2, and Y. Then we
/// consider all possible ways of gluing the three clusters (e.g., X1YX2, X1X2Y,
/// X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score.
/// This improves the quality of the final result (the search space is larger)
/// while keeping the implementation sufficiently fast.
class CachePlus {
public:
CachePlus(const BinaryFunction &BF)
: BF(BF),
Adjacent(BF.layout_size()),
Cache(BF.layout_size()) {
initialize();
}
/// Run cache+ algorithm and return a basic block ordering
std::vector<BinaryBasicBlock *> run() {
// Pass 1: Merge blocks with their fallthrough successors
mergeFallthroughs();
// Pass 2: Merge pairs of clusters while improving the ExtTSP metric
mergeClusterPairs();
// Pass 3: Merge cold blocks to reduce code size
mergeColdClusters();
// Sorting clusters by density
std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);
// Collect the basic blocks in the order specified by their clusters
std::vector<BinaryBasicBlock *> Result;
Result.reserve(BF.layout_size());
for (auto Cluster : Clusters) {
Result.insert(Result.end(),
Cluster->blocks().begin(),
Cluster->blocks().end());
}
return Result;
}
private:
/// Initialize the set of active clusters, edges between blocks, and
/// adjacency matrix.
void initialize() {
// Initialize indices of basic blocks
size_t LayoutIndex = 0;
for (auto BB : BF.layout()) {
BB->setLayoutIndex(LayoutIndex);
LayoutIndex++;
}
// Initialize edges for the blocks and compute their total in/out weights
OutEdges = std::vector<EdgeList>(BF.layout_size());
auto InWeight = std::vector<uint64_t>(BF.layout_size(), 0);
auto OutWeight = std::vector<uint64_t>(BF.layout_size(), 0);
for (auto BB : BF.layout()) {
auto BI = BB->branch_info_begin();
for (auto I : BB->successors()) {
assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
"missing profile for a jump");
if (I != BB && BI->Count > 0) {
InWeight[I->getLayoutIndex()] += BI->Count;
OutEdges[BB->getLayoutIndex()].push_back(std::make_pair(I, BI->Count));
OutWeight[BB->getLayoutIndex()] += BI->Count;
}
++BI;
}
}
// Initialize execution count for every basic block, which is the
// maximum over the sums of all in and out edge weights.
// Also execution count of the entry point is set to at least 1
auto ExecutionCounts = std::vector<uint64_t>(BF.layout_size(), 0);
for (auto BB : BF.layout()) {
uint64_t EC = BB->getKnownExecutionCount();
EC = std::max(EC, InWeight[BB->getLayoutIndex()]);
EC = std::max(EC, OutWeight[BB->getLayoutIndex()]);
if (BB->getLayoutIndex() == 0)
EC = std::max(EC, uint64_t(1));
ExecutionCounts[BB->getLayoutIndex()] = EC;
}
// Create a separate MCCodeEmitter to allow lock-free execution
BinaryContext::IndependentCodeEmitter Emitter;
if (!opts::NoThreads) {
Emitter = BF.getBinaryContext().createIndependentMCCodeEmitter();
}
// Initialize clusters
Clusters.reserve(BF.layout_size());
AllClusters.reserve(BF.layout_size());
CurCluster.reserve(BF.layout_size());
Size.reserve(BF.layout_size());
for (auto BB : BF.layout()) {
size_t Index = BB->getLayoutIndex();
Size.push_back(
std::max<uint64_t>(BB->estimateSize(Emitter.MCE.get()), 1));
AllClusters.emplace_back(BB, ExecutionCounts[Index], Size[Index]);
Clusters.push_back(&AllClusters[Index]);
CurCluster.push_back(&AllClusters[Index]);
}
// Initialize adjacency matrix
Adjacent.initialize(Clusters);
for (auto BB : BF.layout()) {
auto BI = BB->branch_info_begin();
for (auto I : BB->successors()) {
if (BB != I && BI->Count > 0) {
Adjacent.set(Clusters[BB->getLayoutIndex()],
Clusters[I->getLayoutIndex()]);
}
++BI;
}
}
// Initialize fallthrough successors
findFallthroughBlocks(InWeight, OutWeight);
}
/// Merge blocks with their fallthrough successors.
void mergeFallthroughs() {
for (auto BB : BF.layout()) {
if (FallthroughPred[BB->getLayoutIndex()] == nullptr &&
FallthroughSucc[BB->getLayoutIndex()] != nullptr) {
auto CurBB = BB;
while (FallthroughSucc[CurBB->getLayoutIndex()] != nullptr) {
const auto NextBB = FallthroughSucc[CurBB->getLayoutIndex()];
mergeClusters(&AllClusters[BB->getLayoutIndex()],
&AllClusters[NextBB->getLayoutIndex()],
0);
CurBB = NextBB;
}
}
}
}
/// Merge pairs of clusters while improving the ExtTSP metric
void mergeClusterPairs() {
while (Clusters.size() > 1) {
Cluster *BestClusterPred = nullptr;
Cluster *BestClusterSucc = nullptr;
std::pair<double, size_t> BestGain(-1, 0);
for (auto ClusterPred : Clusters) {
// Do not merge cold blocks
if (ClusterPred->isCold())
continue;
// Get candidates for merging with the current cluster
Adjacent.forAllAdjacent(
ClusterPred,
// Find the best candidate
[&](Cluster *ClusterSucc) {
assert(ClusterPred != ClusterSucc && "loop edges are not supported");
assert(!ClusterSucc->isCold() && "cannot merge cold clusters");
// Compute the gain of merging two clusters
auto Gain = mergeGain(ClusterPred, ClusterSucc);
if (Gain.first <= 0.0)
return;
// Breaking ties by density to make the hottest clusters be merged first
if (Gain.first > BestGain.first ||
(std::abs(Gain.first - BestGain.first) < 1e-8 &&
compareClusterPairs(ClusterPred,
ClusterSucc,
BestClusterPred,
BestClusterSucc))) {
BestGain = Gain;
BestClusterPred = ClusterPred;
BestClusterSucc = ClusterSucc;
}
});
}
// Stop merging when there is no improvement
if (BestGain.first <= 0.0)
break;
// Merge the best pair of clusters
mergeClusters(BestClusterPred, BestClusterSucc, BestGain.second);
}
}
/// Merge cold blocks to reduce code size
void mergeColdClusters() {
for (auto SrcBB : BF.layout()) {
// Iterating in reverse order to make sure original fall-trough jumps are
// merged first
for (auto Itr = SrcBB->succ_rbegin(); Itr != SrcBB->succ_rend(); ++Itr) {
BinaryBasicBlock *DstBB = *Itr;
auto SrcCluster = CurCluster[SrcBB->getLayoutIndex()];
auto DstCluster = CurCluster[DstBB->getLayoutIndex()];
if (SrcCluster != DstCluster && !DstCluster->isEntryPoint() &&
SrcCluster->blocks().back() == SrcBB &&
DstCluster->blocks().front() == DstBB) {
mergeClusters(SrcCluster, DstCluster, 0);
}
}
}
}
/// For a pair of blocks, A and B, block B is the fallthrough successor of A,
/// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps
/// to B are from A. Such blocks should be adjacent in an optimal ordering,
/// and the method finds such pairs of blocks.
void findFallthroughBlocks(const std::vector<uint64_t> &InWeight,
const std::vector<uint64_t> &OutWeight) {
FallthroughSucc = std::vector<BinaryBasicBlock *>(BF.size(), nullptr);
FallthroughPred = std::vector<BinaryBasicBlock *>(BF.size(), nullptr);
// Find fallthroughs based on edge weights
for (auto BB : BF.layout()) {
if (BB->succ_size() == 1 &&
BB->getSuccessor()->pred_size() == 1 &&
BB->getSuccessor()->getLayoutIndex() != 0) {
FallthroughSucc[BB->getLayoutIndex()] = BB->getSuccessor();
FallthroughPred[BB->getSuccessor()->getLayoutIndex()] = BB;
continue;
}
if (OutWeight[BB->getLayoutIndex()] == 0)
continue;
for (auto Edge : OutEdges[BB->getLayoutIndex()]) {
const auto SuccBB = Edge.first;
// Successor cannot be the first BB, which is pinned
if (OutWeight[BB->getLayoutIndex()] == Edge.second &&
InWeight[SuccBB->getLayoutIndex()] == Edge.second &&
SuccBB->getLayoutIndex() != 0) {
FallthroughSucc[BB->getLayoutIndex()] = SuccBB;
FallthroughPred[SuccBB->getLayoutIndex()] = BB;
break;
}
}
}
// There might be 'cycles' in the fallthrough dependencies (since profile
// data isn't 100% accurate).
// Break the cycles by choosing the block with smallest index as the tail
for (auto BB : BF.layout()) {
const auto Idx = BB->getLayoutIndex();
if (FallthroughSucc[Idx] == nullptr || FallthroughPred[Idx] == nullptr)
continue;
auto SuccBB = FallthroughSucc[Idx];
while (SuccBB != nullptr && SuccBB != BB) {
SuccBB = FallthroughSucc[SuccBB->getLayoutIndex()];
}
if (SuccBB == nullptr)
continue;
// break the cycle
FallthroughSucc[FallthroughPred[Idx]->getLayoutIndex()] = nullptr;
FallthroughPred[Idx] = nullptr;
}
}
/// Compute ExtTSP score for a given order of basic blocks
double score(const MergedCluster& MergedBlocks) const {
uint64_t NotSet = static_cast<uint64_t>(-1);
EstimatedAddr.assign(BF.layout_size(), NotSet);
uint64_t CurAddr = 0;
MergedBlocks.forEach(
[&](const BinaryBasicBlock *BB) {
size_t Index = BB->getLayoutIndex();
EstimatedAddr[Index] = CurAddr;
CurAddr += Size[Index];
}
);
double Score = 0;
MergedBlocks.forEach(
[&](const BinaryBasicBlock *BB) {
size_t Index = BB->getLayoutIndex();
for (auto Edge : OutEdges[Index]) {
auto SuccBB = Edge.first;
size_t SuccIndex = SuccBB->getLayoutIndex();
if (EstimatedAddr[SuccIndex] != NotSet) {
Score += CacheMetrics::extTSPScore(EstimatedAddr[Index],
Size[Index],
EstimatedAddr[SuccIndex],
Edge.second);
}
}
}
);
return Score;
}
/// Verify if it is valid to merge two clusters into the new one
bool isValidMerge(const Cluster *ClusterPred,
const Cluster *ClusterSucc,
size_t MergeType,
const MergedCluster& MergedBlocks) const {
// Does the new cluster preserve the original entry point?
if ((ClusterPred->isEntryPoint() || ClusterSucc->isEntryPoint()) &&
MergedBlocks.getFirstBlock()->getLayoutIndex() != 0)
return false;
// This corresponds to a concatentation of clusters w/o splitting, which is
// always safe
if (MergeType == 0)
return true;
size_t Offset = MergeType / 5;
// The basic blocks on the boundary of a split of ClusterPred
auto BB1 = ClusterPred->blocks()[Offset - 1];
auto BB2 = ClusterPred->blocks()[Offset];
// Does the splitting break FT successors?
if (FallthroughSucc[BB1->getLayoutIndex()] != nullptr) {
assert(FallthroughSucc[BB1->getLayoutIndex()] == BB2 &&
"Fallthrough successor is not preserved");
return false;
}
// Do not split large clusters to reduce computation time
if (ClusterPred->blocks().size() > opts::ClusterSplitThreshold) {
return false;
}
return true;
}
/// The gain of merging two clusters.
///
/// The function considers all possible ways of merging two clusters and
/// computes the one having the largest increase in ExtTSP metric. The result
/// is a pair with the first element being the gain and the second element being
/// the corresponding merging type (encoded as an integer).
std::pair<double, size_t> mergeGain(const Cluster *ClusterPred,
const Cluster *ClusterSucc) const {
if (Cache.contains(ClusterPred, ClusterSucc)) {
return Cache.get(ClusterPred, ClusterSucc);
}
// The current score of two separate clusters
const auto CurScore = ClusterPred->score() + ClusterSucc->score();
// Merge two clusters and update the best Gain
auto computeMergeGain = [&](const std::pair<double, size_t> &CurGain,
const Cluster *ClusterPred,
const Cluster *ClusterSucc,
size_t MergeType) {
auto MergedBlocks = mergeBlocks(ClusterPred->blocks(),
ClusterSucc->blocks(),
MergeType);
if (!isValidMerge(ClusterPred, ClusterSucc, MergeType, MergedBlocks))
return CurGain;
// The score of the new cluster
const auto NewScore = score(MergedBlocks);
if (NewScore > CurScore && NewScore - CurScore > CurGain.first)
return std::make_pair(NewScore - CurScore, MergeType);
else
return CurGain;
};
std::pair<double, size_t> Gain = std::make_pair(-1, 0);
// Try to concatenate two clusters w/o splitting
Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, 0);
// Try to split ClusterPred into two sub-clusters in various ways and then
// merge it with ClusterSucc
for (size_t Offset = 1; Offset < ClusterPred->blocks().size(); Offset++) {
for (size_t Type = 1; Type <= 4; Type++) {
size_t MergeType = Type + Offset * 5;
Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, MergeType);
}
}
Cache.set(ClusterPred, ClusterSucc, Gain);
return Gain;
}
/// Merge two clusters of blocks respecting a given merge 'type' and 'offset'.
///
/// If MergeType == 0, then the result is a concatentation of two clusters.
/// Otherwise, the first cluster is cut into two sub-clusters at the offset,
/// and merged using all possible ways of concatenating three clusters.
MergedCluster mergeBlocks(const std::vector<BinaryBasicBlock *> &X,
const std::vector<BinaryBasicBlock *> &Y,
size_t MergeType) const {
// Merging w/o splitting existing clusters
if (MergeType == 0) {
ClusterIter Empty;
return MergedCluster(X.begin(), X.end(), Y.begin(), Y.end(), Empty, Empty);
}
size_t Type = MergeType % 5;
size_t Offset = MergeType / 5;
assert(0 < Offset && Offset < X.size() &&
"Invalid offset while merging clusters");
// Split the first cluster, X, into X1 and X2
ClusterIter BeginX1 = X.begin();
ClusterIter EndX1 = X.begin() + Offset;
ClusterIter BeginX2 = X.begin() + Offset;
ClusterIter EndX2 = X.end();
ClusterIter BeginY = Y.begin();
ClusterIter EndY = Y.end();
// Construct a new cluster from three existing ones
switch(Type) {
case 1: return MergedCluster(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2);
case 2: return MergedCluster(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1);
case 3: return MergedCluster(BeginX2, EndX2, BeginY, EndY, BeginX1, EndX1);
case 4: return MergedCluster(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY);
default:
llvm_unreachable("unexpected merge type");
}
}
/// Merge cluster From into cluster Into, update the list of active clusters,
/// adjacency information, and the corresponding cache.
void mergeClusters(Cluster *Into, Cluster *From, size_t MergeType) {
assert(Into != From && "Cluster cannot be merged with itself");
// Merge the blocks of clusters
auto MergedBlocks = mergeBlocks(Into->blocks(), From->blocks(), MergeType);
Into->merge(From, MergedBlocks.getBlocks(), score(MergedBlocks));
From->clear();
// Remove cluster From from the list of active clusters
auto Iter = std::remove(Clusters.begin(), Clusters.end(), From);
Clusters.erase(Iter, Clusters.end());
// Update block clusters
for (auto BB : Into->blocks()) {
CurCluster[BB->getLayoutIndex()] = Into;
}
// Invalidate caches
Cache.invalidate(Into);
// Update the adjacency matrix
Adjacent.merge(Into, From);
}
// The binary function
const BinaryFunction &BF;
// All clusters
std::vector<Cluster> AllClusters;
// Active clusters. The vector gets udpated at runtime when clusters are merged
std::vector<Cluster *> Clusters;
// Current cluster of a basic block
std::vector<Cluster *> CurCluster;
// Size of the block
std::vector<uint64_t> Size;
// Outgoing edges of the block
std::vector<EdgeList> OutEdges;
// Cluster adjacency matrix
AdjacencyMatrix<Cluster> Adjacent;
// Fallthrough successor of the block
std::vector<BinaryBasicBlock *> FallthroughSucc;
// Fallthrough predecessor of the block
std::vector<BinaryBasicBlock *> FallthroughPred;
// A cache that keeps precomputed values of mergeGain for pairs of clusters;
// when a pair of clusters (x,y) gets merged, we invalidate the pairs
// containing both x and y and all clusters adjacent to x and y (and recompute
// them on the next iteration).
mutable ClusterPairCache<Cluster, std::pair<double, size_t>> Cache;
// A reusable vector used within score() method
mutable std::vector<uint64_t> EstimatedAddr;
};
void CachePlusReorderAlgorithm::reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const {
if (BF.layout_empty())
return;
// Are there jumps with positive execution count?
size_t NumHotBlocks = 0;
for (auto BB : BF.layout()) {
if (BB->getKnownExecutionCount() > 0)
NumHotBlocks++;
}
// Do not change layout of functions w/o profile information
if (NumHotBlocks == 0 || BF.layout_size() <= 1) {
for (auto BB : BF.layout()) {
Order.push_back(BB);
}
return;
}
// Apply the algorithm
Order = CachePlus(BF).run();
// Verify correctness
assert(Order[0]->isEntryPoint() && "Original entry point is not preserved");
assert(Order.size() == BF.layout_size() && "Wrong size of reordered layout");
}
} // namespace bolt
} // namespace llvm