Intel® Implicit SPMD Program Compiler (Intel® ISPC)  1.13.0
opt.cpp
1 /*
2  Copyright (c) 2010-2020, Intel Corporation
3  All rights reserved.
4 
5  Redistribution and use in source and binary forms, with or without
6  modification, are permitted provided that the following conditions are
7  met:
8 
9  * Redistributions of source code must retain the above copyright
10  notice, this list of conditions and the following disclaimer.
11 
12  * Redistributions in binary form must reproduce the above copyright
13  notice, this list of conditions and the following disclaimer in the
14  documentation and/or other materials provided with the distribution.
15 
16  * Neither the name of Intel Corporation nor the names of its
17  contributors may be used to endorse or promote products derived from
18  this software without specific prior written permission.
19 
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
22  IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
25  OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33 
34 /** @file opt.cpp
35  @brief Implementations of various ispc optimization passes that operate
36  on the LLVM IR.
37 */
38 
39 #include "opt.h"
40 #include "ctx.h"
41 #include "llvmutil.h"
42 #include "module.h"
43 #include "sym.h"
44 #include "util.h"
45 
46 #include <map>
47 #include <set>
48 #include <stdio.h>
49 
50 #include "llvm/InitializePasses.h"
51 #include <llvm/IR/BasicBlock.h>
52 #include <llvm/IR/Constants.h>
53 #include <llvm/IR/Function.h>
54 #include <llvm/IR/Instructions.h>
55 #include <llvm/IR/Intrinsics.h>
56 #include <llvm/IR/Module.h>
57 #include <llvm/Pass.h>
58 
59 #include <llvm/Transforms/Instrumentation.h>
60 
61 #include "llvm/IR/LegacyPassManager.h"
62 
63 #include <llvm/PassRegistry.h>
64 
65 #include <llvm/IR/DebugInfo.h>
66 #include <llvm/IR/IRPrintingPasses.h>
67 #include <llvm/IR/PatternMatch.h>
68 #include <llvm/IR/Verifier.h>
69 
70 #include <llvm/Analysis/ConstantFolding.h>
71 
72 #include <llvm/Analysis/TargetLibraryInfo.h>
73 #if ISPC_LLVM_VERSION >= ISPC_LLVM_7_0
74 #include "llvm/Transforms/InstCombine/InstCombine.h"
75 #include "llvm/Transforms/Utils.h"
76 #endif
77 #include <llvm/ADT/SmallSet.h>
78 #include <llvm/ADT/Triple.h>
79 #include <llvm/Target/TargetOptions.h>
80 #include <llvm/Transforms/IPO.h>
81 #include <llvm/Transforms/Scalar.h>
82 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
83 
84 #include <llvm/Analysis/TargetTransformInfo.h>
85 #include <llvm/IR/DataLayout.h>
86 
87 #include "llvm/Analysis/TypeBasedAliasAnalysis.h"
88 #include "llvm/Transforms/IPO/FunctionAttrs.h"
89 #include "llvm/Transforms/Scalar/GVN.h"
90 #include <llvm/Analysis/BasicAliasAnalysis.h>
91 #include <llvm/Analysis/Passes.h>
92 #include <llvm/BinaryFormat/Dwarf.h>
93 #include <llvm/Support/raw_ostream.h>
94 #include <llvm/Target/TargetMachine.h>
95 #if ISPC_LLVM_VERSION >= ISPC_LLVM_10_0
96 #include "llvm/IR/IntrinsicsX86.h"
97 #endif
98 
99 #include <llvm/IR/IntrinsicInst.h>
100 #ifdef ISPC_HOST_IS_LINUX
101 #include <alloca.h>
102 #elif defined(ISPC_HOST_IS_WINDOWS)
103 #include <malloc.h>
104 #ifndef __MINGW32__
105 #define alloca _alloca
106 #endif
107 #endif // ISPC_HOST_IS_WINDOWS
108 
109 #ifndef PRId64
110 #define PRId64 "lld"
111 #endif
112 #ifndef PRIu64
113 #define PRIu64 "llu"
114 #endif
115 #ifndef ISPC_NO_DUMPS
116 #include <llvm/Support/FileSystem.h>
117 #include <llvm/Support/Regex.h>
118 #endif
119 
120 static llvm::Pass *CreateIntrinsicsOptPass();
121 static llvm::Pass *CreateInstructionSimplifyPass();
122 static llvm::Pass *CreatePeepholePass();
123 
124 static llvm::Pass *CreateImproveMemoryOpsPass();
125 static llvm::Pass *CreateGatherCoalescePass();
126 static llvm::Pass *CreateReplacePseudoMemoryOpsPass();
127 
128 static llvm::Pass *CreateIsCompileTimeConstantPass(bool isLastTry);
129 static llvm::Pass *CreateMakeInternalFuncsStaticPass();
130 
131 #ifndef ISPC_NO_DUMPS
132 static llvm::Pass *CreateDebugPass(char *output);
133 static llvm::Pass *CreateDebugPassFile(int number, llvm::StringRef name);
134 #endif
135 
136 static llvm::Pass *CreateReplaceStdlibShiftPass();
137 
138 static llvm::Pass *CreateFixBooleanSelectPass();
139 
140 #ifndef ISPC_NO_DUMPS
141 #define DEBUG_START_PASS(NAME) \
142  if (g->debugPrint && \
143  (getenv("FUNC") == NULL || (getenv("FUNC") != NULL && !strncmp(bb.getParent()->getName().str().c_str(), \
144  getenv("FUNC"), strlen(getenv("FUNC")))))) { \
145  fprintf(stderr, "Start of " NAME "\n"); \
146  fprintf(stderr, "---------------\n"); \
147  bb.dump(); \
148  fprintf(stderr, "---------------\n\n"); \
149  } else /* eat semicolon */
150 
151 #define DEBUG_END_PASS(NAME) \
152  if (g->debugPrint && \
153  (getenv("FUNC") == NULL || (getenv("FUNC") != NULL && !strncmp(bb.getParent()->getName().str().c_str(), \
154  getenv("FUNC"), strlen(getenv("FUNC")))))) { \
155  fprintf(stderr, "End of " NAME " %s\n", modifiedAny ? "** CHANGES **" : ""); \
156  fprintf(stderr, "---------------\n"); \
157  bb.dump(); \
158  fprintf(stderr, "---------------\n\n"); \
159  } else /* eat semicolon */
160 #else
161 #define DEBUG_START_PASS(NAME)
162 #define DEBUG_END_PASS(NAME)
163 #endif
164 
165 ///////////////////////////////////////////////////////////////////////////
166 
167 /** This utility routine copies the metadata (if any) attached to the
168  'from' instruction in the IR to the 'to' instruction.
169 
170  For flexibility, this function takes an llvm::Value rather than an
171  llvm::Instruction for the 'to' parameter; at some places in the code
172  below, we sometimes use an llvm::Value to start out storing a value and
173  then later store instructions. If an llvm::Value is passed to this, the
174  routine just returns without doing anything; if it is in fact an
175  llvm::Instruction, then the metadata can be copied to it.
176  */
177 static void lCopyMetadata(llvm::Value *vto, const llvm::Instruction *from) {
178  llvm::Instruction *to = llvm::dyn_cast<llvm::Instruction>(vto);
179  if (!to)
180  return;
181 
182  llvm::SmallVector<std::pair<unsigned int, llvm::MDNode *>, 8> metadata;
183 
184  from->getAllMetadata(metadata);
185  for (unsigned int i = 0; i < metadata.size(); ++i)
186  to->setMetadata(metadata[i].first, metadata[i].second);
187 }
188 
189 /** We have a protocol with the front-end LLVM IR code generation process
190  that allows us to encode the source file position that corresponds with
191  instructions. (For example, this allows us to issue performance
192  warnings related to things like scatter and gather after optimization
193  has been performed, so that we aren't warning about scatters and
194  gathers that have been improved to stores and loads by optimization
195  passes.) Note that this is slightly redundant with the source file
196  position encoding generated for debugging symbols; however, while we don't
197  always generate debugging information, we do always generate this
198  position data.
199 
200  This function finds the SourcePos that the metadata in the instruction
201  (if present) corresponds to. See the implementation of
202  FunctionEmitContext::addGSMetadata(), which encodes the source position during
203  code generation.
204 
205  @param inst Instruction to try to find the source position of
206  @param pos Output variable in which to store the position
207  @returns True if source file position metadata was present and *pos
208  has been set. False otherwise.
209 */
210 static bool lGetSourcePosFromMetadata(const llvm::Instruction *inst, SourcePos *pos) {
211  llvm::MDNode *filename = inst->getMetadata("filename");
212  llvm::MDNode *first_line = inst->getMetadata("first_line");
213  llvm::MDNode *first_column = inst->getMetadata("first_column");
214  llvm::MDNode *last_line = inst->getMetadata("last_line");
215  llvm::MDNode *last_column = inst->getMetadata("last_column");
216 
217  if (!filename || !first_line || !first_column || !last_line || !last_column)
218  return false;
219 
220  // All of these asserts are things that FunctionEmitContext::addGSMetadata() is
221  // expected to have done in its operation
222  llvm::MDString *str = llvm::dyn_cast<llvm::MDString>(filename->getOperand(0));
223  Assert(str);
224  llvm::ConstantInt *first_lnum =
225 
226  llvm::mdconst::extract<llvm::ConstantInt>(first_line->getOperand(0));
227  Assert(first_lnum);
228 
229  llvm::ConstantInt *first_colnum =
230 
231  llvm::mdconst::extract<llvm::ConstantInt>(first_column->getOperand(0));
232  Assert(first_colnum);
233 
234  llvm::ConstantInt *last_lnum =
235 
236  llvm::mdconst::extract<llvm::ConstantInt>(last_line->getOperand(0));
237  Assert(last_lnum);
238 
239  llvm::ConstantInt *last_colnum = llvm::mdconst::extract<llvm::ConstantInt>(last_column->getOperand(0));
240  Assert(last_colnum);
241 
242  *pos = SourcePos(str->getString().data(), (int)first_lnum->getZExtValue(), (int)first_colnum->getZExtValue(),
243  (int)last_lnum->getZExtValue(), (int)last_colnum->getZExtValue());
244  return true;
245 }
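// Illustrative sketch (not part of the original file; the intrinsic name is assumed): the
// metadata consumed above is expected to look roughly like
//   %v = call <8 x i32> @__pseudo_gather64.i32(...), !filename !10, !first_line !11, ...
// where !10 = !{!"foo.ispc"} holds an MDString and !11 = !{i32 42} holds a ConstantInt,
// matching the MDString/ConstantInt extraction performed by lGetSourcePosFromMetadata().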
246 
247 static llvm::Instruction *lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, const char *name,
248  llvm::Instruction *insertBefore = NULL) {
249  llvm::Value *args[2] = {arg0, arg1};
250  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[2]);
251  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
252 }
253 
254 static llvm::Instruction *lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::Value *arg2,
255  const char *name, llvm::Instruction *insertBefore = NULL) {
256  llvm::Value *args[3] = {arg0, arg1, arg2};
257  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[3]);
258  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
259 }
260 
261 static llvm::Instruction *lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::Value *arg2,
262  llvm::Value *arg3, const char *name, llvm::Instruction *insertBefore = NULL) {
263  llvm::Value *args[4] = {arg0, arg1, arg2, arg3};
264  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[4]);
265  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
266 }
267 
268 static llvm::Instruction *lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::Value *arg2,
269  llvm::Value *arg3, llvm::Value *arg4, const char *name,
270  llvm::Instruction *insertBefore = NULL) {
271  llvm::Value *args[5] = {arg0, arg1, arg2, arg3, arg4};
272  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[5]);
273  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
274 }
275 
276 static llvm::Instruction *lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::Value *arg2,
277  llvm::Value *arg3, llvm::Value *arg4, llvm::Value *arg5, const char *name,
278  llvm::Instruction *insertBefore = NULL) {
279  llvm::Value *args[6] = {arg0, arg1, arg2, arg3, arg4, arg5};
280  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[6]);
281  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
282 }
283 
284 static llvm::Instruction *lGEPInst(llvm::Value *ptr, llvm::Value *offset, const char *name,
285  llvm::Instruction *insertBefore) {
286  llvm::Value *index[1] = {offset};
287  llvm::ArrayRef<llvm::Value *> arrayRef(&index[0], &index[1]);
288 
289  return llvm::GetElementPtrInst::Create(PTYPE(ptr), ptr, arrayRef, name, insertBefore);
290 }
291 
292 /** Given a vector of constant values (int, float, or bool) representing an
293  execution mask, convert it to a bitvector where the 0th bit corresponds
294  to the first vector value and so forth.
295 */
296 static uint64_t lConstElementsToMask(const llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> &elements) {
297  Assert(elements.size() <= 64);
298 
299  uint64_t mask = 0;
300  for (unsigned int i = 0; i < elements.size(); ++i) {
301  llvm::APInt intMaskValue;
302  // SSE has the "interesting" approach of encoding blending
303  // masks as <n x float>.
304  llvm::ConstantFP *cf = llvm::dyn_cast<llvm::ConstantFP>(elements[i]);
305  if (cf != NULL) {
306  llvm::APFloat apf = cf->getValueAPF();
307  intMaskValue = apf.bitcastToAPInt();
308  } else {
309  // Otherwise get it as an int
310  llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(elements[i]);
311  Assert(ci != NULL); // vs return -1 if NULL?
312  intMaskValue = ci->getValue();
313  }
314  // Is the high-bit set? If so, OR in the appropriate bit in
315  // the result mask
316  if (intMaskValue.countLeadingOnes() > 0)
317  mask |= (1ull << i);
318  }
319  return mask;
320 }
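// Worked example (a sketch): for a 4-wide SSE float blend mask
//   < 0xffffffff, 0x0, 0x0, 0xffffffff >
// the values bitcast to integers have their high bit set in lanes 0 and 3 only, so the
// routine above returns the bit mask 0b1001 == 9.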
321 
322 /** Given an llvm::Value representing a vector mask, see if the value is a
323  constant. If so, return true and set *bits to be the integer mask
324  found by taking the high bits of the mask values in turn and
325  concatenating them into a single integer. In other words, given the
326  4-wide mask: < 0xffffffff, 0, 0, 0xffffffff >, we have 0b1001 = 9.
327  */
328 static bool lGetMask(llvm::Value *factor, uint64_t *mask) {
329  llvm::ConstantDataVector *cdv = llvm::dyn_cast<llvm::ConstantDataVector>(factor);
330  if (cdv != NULL) {
331  llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
332  for (int i = 0; i < (int)cdv->getNumElements(); ++i)
333  elements.push_back(cdv->getElementAsConstant(i));
334  *mask = lConstElementsToMask(elements);
335  return true;
336  }
337 
338  llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(factor);
339  if (cv != NULL) {
340  llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
341  for (int i = 0; i < (int)cv->getNumOperands(); ++i) {
342  llvm::Constant *c = llvm::dyn_cast<llvm::Constant>(cv->getOperand(i));
343  if (c == NULL)
344  return false;
345  if (llvm::isa<llvm::ConstantExpr>(cv->getOperand(i)))
346  return false; // We can not handle constant expressions here
347  elements.push_back(c);
348  }
349  *mask = lConstElementsToMask(elements);
350  return true;
351  } else if (llvm::isa<llvm::ConstantAggregateZero>(factor)) {
352  *mask = 0;
353  return true;
354  } else {
355 #if 0
356  llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(factor);
357  if (ce != NULL) {
358  llvm::TargetMachine *targetMachine = g->target->GetTargetMachine();
359  const llvm::TargetData *td = targetMachine->getTargetData();
360  llvm::Constant *c = llvm::ConstantFoldConstantExpression(ce, td);
361  c->dump();
362  factor = c;
363  }
364  // else we should be able to handle it above...
365  Assert(!llvm::isa<llvm::Constant>(factor));
366 #endif
367  return false;
368  }
369 }
370 
371 typedef enum { ALL_ON, ALL_OFF, MIXED, UNKNOWN } MaskStatus;
372 
373 /** Determines if the given mask value is all on, all off, mixed, or
374  unknown at compile time.
375 */
376 static MaskStatus lGetMaskStatus(llvm::Value *mask, int vecWidth = -1) {
377  uint64_t bits;
378  if (lGetMask(mask, &bits) == false)
379  return UNKNOWN;
380 
381  if (bits == 0)
382  return ALL_OFF;
383 
384  if (vecWidth == -1)
385  vecWidth = g->target->getVectorWidth();
386  Assert(vecWidth <= 64);
387 
388  for (int i = 0; i < vecWidth; ++i) {
389  if ((bits & (1ull << i)) == 0)
390  return MIXED;
391  }
392  return ALL_ON;
393 }
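// For example: a constant mask with all lanes' high bits set yields ALL_ON, an all-zero
// mask yields ALL_OFF, a mix of set and clear lanes yields MIXED, and any mask whose value
// cannot be determined at compile time (e.g. a run-time computed vector) yields UNKNOWN.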
394 
395 ///////////////////////////////////////////////////////////////////////////
396 // This is a wrapper over the class llvm::legacy::PassManager. It duplicates the PassManager function run()
397 // and changes the PassManager function add() by adding some checks and debug passes.
398 // This wrapper can control:
399 // - whether to switch off the optimization pass with a given number;
400 // - whether to dump the LLVM IR after the optimization pass with a given number;
401 // - whether to generate LLVM IR debug info for gdb after the optimization pass with a given number.
402 class DebugPassManager {
403   public:
404  DebugPassManager() : number(0) {}
405  void add(llvm::Pass *P, int stage);
406  bool run(llvm::Module &M) { return PM.run(M); }
407  llvm::legacy::PassManager &getPM() { return PM; }
408 
409  private:
410  llvm::legacy::PassManager PM;
411  int number;
412 };
413 
414 void DebugPassManager::add(llvm::Pass *P, int stage = -1) {
415  // taking number of optimization
416  if (stage == -1) {
417  number++;
418  } else {
419  number = stage;
420  }
421  if (g->off_stages.find(number) == g->off_stages.end()) {
422  // adding optimization (not switched off)
423  PM.add(P);
424 #ifndef ISPC_NO_DUMPS
425  if (g->debug_stages.find(number) != g->debug_stages.end()) {
426  // adding dump of LLVM IR after optimization
427  if (g->dumpFile) {
428  PM.add(CreateDebugPassFile(number, P->getPassName()));
429  } else {
430  char buf[100];
431  snprintf(buf, sizeof(buf), "\n\n*****LLVM IR after phase %d: %s*****\n\n", number,
432  P->getPassName().data());
433  PM.add(CreateDebugPass(buf));
434  }
435  }
436 #endif
437  }
438 }
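// Usage sketch (assumption noted): the stage numbers passed to add() are the phase numbers
// that users can reference on the ispc command line to disable a phase (populating
// g->off_stages) or to dump the IR right after it runs (populating g->debug_stages); the
// exact option spellings (assumed to be --off-phase=/--debug-phase=) are not defined here.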
439 ///////////////////////////////////////////////////////////////////////////
440 
441 void Optimize(llvm::Module *module, int optLevel) {
442 #ifndef ISPC_NO_DUMPS
443  if (g->debugPrint) {
444  printf("*** Code going into optimization ***\n");
445  module->dump();
446  }
447 #endif
448  DebugPassManager optPM;
449  optPM.add(llvm::createVerifierPass(), 0);
450 
451  optPM.add(new llvm::TargetLibraryInfoWrapperPass(llvm::Triple(module->getTargetTriple())));
452 
453  llvm::TargetMachine *targetMachine = g->target->GetTargetMachine();
454 
455  optPM.getPM().add(createTargetTransformInfoWrapperPass(targetMachine->getTargetIRAnalysis()));
456 
457  optPM.add(llvm::createIndVarSimplifyPass());
458 
459  if (optLevel == 0) {
460  // This is more or less the minimum set of optimizations that we
461  // need to do to generate code that will actually run. (We can't
462  // run absolutely no optimizations, since the front-end needs us to
463  // take the various __pseudo_* functions it has emitted and turn
464  // them into something that can actually execute.)
465  optPM.add(CreateImproveMemoryOpsPass(), 100);
466 
467  if (g->opt.disableHandlePseudoMemoryOps == false)
468  optPM.add(CreateReplacePseudoMemoryOpsPass());
469 
470  optPM.add(CreateIntrinsicsOptPass(), 102);
471  optPM.add(CreateIsCompileTimeConstantPass(true));
472  optPM.add(llvm::createFunctionInliningPass());
473  optPM.add(CreateMakeInternalFuncsStaticPass());
474  optPM.add(llvm::createCFGSimplificationPass());
475  optPM.add(llvm::createGlobalDCEPass());
476  } else {
477  llvm::PassRegistry *registry = llvm::PassRegistry::getPassRegistry();
478  llvm::initializeCore(*registry);
479  llvm::initializeScalarOpts(*registry);
480  llvm::initializeIPO(*registry);
481  llvm::initializeAnalysis(*registry);
482  llvm::initializeTransformUtils(*registry);
483  llvm::initializeInstCombine(*registry);
484  llvm::initializeInstrumentation(*registry);
485  llvm::initializeTarget(*registry);
486 
487  optPM.add(llvm::createGlobalDCEPass(), 185);
488 
489  // Setup to use LLVM default AliasAnalysis
490  // Ideally, we want to call:
491  // llvm::PassManagerBuilder pm_Builder;
492  // pm_Builder.OptLevel = optLevel;
493  // pm_Builder.addInitialAliasAnalysisPasses(optPM);
494  // but the addInitialAliasAnalysisPasses() is a private function
495  // so we explicitly enable them here.
496  // This needs to be kept in sync with future LLVM changes.
497  // An alternative is to call populateFunctionPassManager()
498  optPM.add(llvm::createTypeBasedAAWrapperPass(), 190);
499  optPM.add(llvm::createBasicAAWrapperPass());
500  optPM.add(llvm::createCFGSimplificationPass());
501 
502  optPM.add(llvm::createSROAPass());
503 
504  optPM.add(llvm::createEarlyCSEPass());
505  optPM.add(llvm::createLowerExpectIntrinsicPass());
506 
507  // Early optimizations to try to reduce the total amount of code to
508  // work with if we can
509  optPM.add(llvm::createReassociatePass(), 200);
510  optPM.add(llvm::createConstantPropagationPass());
511  optPM.add(llvm::createDeadInstEliminationPass());
512  optPM.add(llvm::createCFGSimplificationPass());
513 
514  optPM.add(llvm::createPromoteMemoryToRegisterPass());
515  optPM.add(llvm::createAggressiveDCEPass());
516 
517  if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) {
518  optPM.add(llvm::createInstructionCombiningPass(), 210);
519  optPM.add(CreateImproveMemoryOpsPass());
520  }
521  if (!g->opt.disableMaskAllOnOptimizations) {
522  optPM.add(CreateIntrinsicsOptPass(), 215);
523  optPM.add(CreateInstructionSimplifyPass());
524  }
525  optPM.add(llvm::createDeadInstEliminationPass(), 220);
526 
527  // On to more serious optimizations
528  optPM.add(llvm::createSROAPass());
529  optPM.add(llvm::createInstructionCombiningPass());
530  optPM.add(llvm::createCFGSimplificationPass());
531  optPM.add(llvm::createPromoteMemoryToRegisterPass());
532  optPM.add(llvm::createGlobalOptimizerPass());
533  optPM.add(llvm::createReassociatePass());
534  optPM.add(llvm::createIPConstantPropagationPass());
535 
536  optPM.add(CreateReplaceStdlibShiftPass(), 229);
537 
538  optPM.add(llvm::createDeadArgEliminationPass(), 230);
539  optPM.add(llvm::createInstructionCombiningPass());
540  optPM.add(llvm::createCFGSimplificationPass());
541  optPM.add(llvm::createPruneEHPass());
542  optPM.add(llvm::createPostOrderFunctionAttrsLegacyPass());
543  optPM.add(llvm::createReversePostOrderFunctionAttrsPass());
544 
545  optPM.add(llvm::createFunctionInliningPass());
546  optPM.add(llvm::createConstantPropagationPass());
547  optPM.add(llvm::createDeadInstEliminationPass());
548  optPM.add(llvm::createCFGSimplificationPass());
549 
550  optPM.add(llvm::createArgumentPromotionPass());
551 
552  optPM.add(llvm::createAggressiveDCEPass());
553  optPM.add(llvm::createInstructionCombiningPass(), 241);
554  optPM.add(llvm::createJumpThreadingPass());
555  optPM.add(llvm::createCFGSimplificationPass());
556 
557  optPM.add(llvm::createSROAPass());
558 
559  optPM.add(llvm::createInstructionCombiningPass());
560  optPM.add(llvm::createTailCallEliminationPass());
561 
562  if (!g->opt.disableMaskAllOnOptimizations) {
563  optPM.add(CreateIntrinsicsOptPass(), 250);
564  optPM.add(CreateInstructionSimplifyPass());
565  }
566 
567  if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) {
568  optPM.add(llvm::createInstructionCombiningPass(), 255);
569  optPM.add(CreateImproveMemoryOpsPass());
570 
571  if (g->opt.disableCoalescing == false && g->target->getISA() != Target::GENERIC) {
572  // It is important to run this here to make it easier to
573  // find matching gathers that we can coalesce.
574  optPM.add(llvm::createEarlyCSEPass(), 260);
575  optPM.add(CreateGatherCoalescePass());
576  }
577  }
578 
579  optPM.add(llvm::createFunctionInliningPass(), 265);
580  optPM.add(llvm::createConstantPropagationPass());
581  optPM.add(CreateIntrinsicsOptPass());
582  optPM.add(CreateInstructionSimplifyPass());
583 
584  if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) {
585  optPM.add(llvm::createInstructionCombiningPass(), 270);
586  optPM.add(CreateImproveMemoryOpsPass());
587  }
588 
589  optPM.add(llvm::createIPSCCPPass(), 275);
590  optPM.add(llvm::createDeadArgEliminationPass());
591  optPM.add(llvm::createAggressiveDCEPass());
592  optPM.add(llvm::createInstructionCombiningPass());
593  optPM.add(llvm::createCFGSimplificationPass());
594 
595  if (g->opt.disableHandlePseudoMemoryOps == false) {
596  optPM.add(CreateReplacePseudoMemoryOpsPass(), 280);
597  }
598  optPM.add(CreateIntrinsicsOptPass(), 281);
599  optPM.add(CreateInstructionSimplifyPass());
600 
601  optPM.add(llvm::createFunctionInliningPass());
602  optPM.add(llvm::createArgumentPromotionPass());
603 
604  optPM.add(llvm::createSROAPass());
605 
606  optPM.add(llvm::createInstructionCombiningPass());
607  optPM.add(CreateInstructionSimplifyPass());
608  optPM.add(llvm::createCFGSimplificationPass());
609  optPM.add(llvm::createReassociatePass());
610  optPM.add(llvm::createLoopRotatePass());
611  optPM.add(llvm::createLICMPass());
612  optPM.add(llvm::createLoopUnswitchPass(false));
613  optPM.add(llvm::createInstructionCombiningPass());
614  optPM.add(CreateInstructionSimplifyPass());
615  optPM.add(llvm::createIndVarSimplifyPass());
616  optPM.add(llvm::createLoopIdiomPass());
617  optPM.add(llvm::createLoopDeletionPass());
618  if (g->opt.unrollLoops) {
619  optPM.add(llvm::createLoopUnrollPass(), 300);
620  }
621  optPM.add(llvm::createGVNPass(), 301);
622 
623  optPM.add(CreateIsCompileTimeConstantPass(true));
624  optPM.add(CreateIntrinsicsOptPass());
625  optPM.add(CreateInstructionSimplifyPass());
626 
627  optPM.add(llvm::createMemCpyOptPass());
628  optPM.add(llvm::createSCCPPass());
629  optPM.add(llvm::createInstructionCombiningPass());
630  optPM.add(CreateInstructionSimplifyPass());
631  optPM.add(llvm::createJumpThreadingPass());
632  optPM.add(llvm::createCorrelatedValuePropagationPass());
633  optPM.add(llvm::createDeadStoreEliminationPass());
634  optPM.add(llvm::createAggressiveDCEPass());
635  optPM.add(llvm::createCFGSimplificationPass());
636  optPM.add(llvm::createInstructionCombiningPass());
637  optPM.add(CreateInstructionSimplifyPass());
638  optPM.add(CreatePeepholePass());
639  optPM.add(llvm::createFunctionInliningPass());
640  optPM.add(llvm::createAggressiveDCEPass());
641  optPM.add(llvm::createStripDeadPrototypesPass());
642  optPM.add(CreateMakeInternalFuncsStaticPass());
643  optPM.add(llvm::createGlobalDCEPass());
644  optPM.add(llvm::createConstantMergePass());
645 
646  // Should be the last
647  optPM.add(CreateFixBooleanSelectPass(), 400);
648  }
649 
650  // Finish up by making sure we didn't mess anything up in the IR along
651  // the way.
652  optPM.add(llvm::createVerifierPass(), LAST_OPT_NUMBER);
653  optPM.run(*module);
654 
655 #ifndef ISPC_NO_DUMPS
656  if (g->debugPrint) {
657  printf("\n*****\nFINAL OUTPUT\n*****\n");
658  module->dump();
659  }
660 #endif
661 }
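// Note on the stage numbering used above (as observed in this function): the optLevel == 0
// pipeline uses phase numbers around 100, the optimizing pipeline runs roughly from 185 up
// to 400 (CreateFixBooleanSelectPass), and the final verifier is registered with
// LAST_OPT_NUMBER so it is always the last stage that can be disabled or dumped.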
662 
663 ///////////////////////////////////////////////////////////////////////////
664 // IntrinsicsOpt
665 
666 /** This is a relatively simple optimization pass that does a few small
667  optimizations that LLVM's x86 optimizer doesn't currently handle.
668  (Specifically, MOVMSK of a constant can be replaced with the
669  corresponding constant value, BLENDVPS and AVX masked load/store with
670  either an 'all on' or 'all off' mask can be replaced with simpler
671  operations.)
672 
673  @todo The better thing to do would be to submit a patch to LLVM to get
674  these; they're presumably pretty simple patterns to match.
675 */
676 class IntrinsicsOpt : public llvm::FunctionPass {
677  public:
678  IntrinsicsOpt() : FunctionPass(ID){};
679 
680  llvm::StringRef getPassName() const { return "Intrinsics Cleanup Optimization"; }
681 
682  bool runOnBasicBlock(llvm::BasicBlock &BB);
683 
684  bool runOnFunction(llvm::Function &F);
685 
686  static char ID;
687 
688  private:
689  struct MaskInstruction {
690  MaskInstruction(llvm::Function *f) { function = f; }
691  llvm::Function *function;
692  };
693  std::vector<MaskInstruction> maskInstructions;
694 
695  /** Structure that records everything we need to know about a blend
696  instruction for this optimization pass.
697  */
698  struct BlendInstruction {
699  BlendInstruction(llvm::Function *f, uint64_t ao, int o0, int o1, int of)
700  : function(f), allOnMask(ao), op0(o0), op1(o1), opFactor(of) {}
701  /** Function pointer for the blend instruction */
702  llvm::Function *function;
703  /** Mask value for an "all on" mask for this instruction */
704  uint64_t allOnMask;
705  /** The operand number in the llvm CallInst corresponds to the
706  first operand to blend with. */
707  int op0;
708  /** The operand number in the CallInst corresponding to the second
709  operand to blend with. */
710  int op1;
711  /** The operand in the call inst where the blending factor is
712  found. */
713  int opFactor;
714  };
715  std::vector<BlendInstruction> blendInstructions;
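// Example of how this table is populated (see runOnBasicBlock() below): the SSE4.1
// blendvps intrinsic is registered as BlendInstruction(x86_sse41_blendvps, 0xf, 0, 1, 2),
// i.e. a 4-wide all-on mask of 0xf, operands 0 and 1 blended under the factor in operand 2.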
716 
717  bool matchesMaskInstruction(llvm::Function *function);
718  BlendInstruction *matchingBlendInstruction(llvm::Function *function);
719 };
720 
721 char IntrinsicsOpt::ID = 0;
722 
723 /** Given an llvm::Value, return true if we can determine that it's an
724  undefined value. This only makes a weak attempt at chasing this down,
725  only detecting flat-out undef values, and bitcasts of undef values.
726 
727  @todo Is it worth working harder to find more of these? It starts to
728  get tricky, since having an undef operand doesn't necessarily mean that
729  the result will be undefined. (And for that matter, is there an LLVM
730  call that will do this for us?)
731  */
732 static bool lIsUndef(llvm::Value *value) {
733  if (llvm::isa<llvm::UndefValue>(value))
734  return true;
735 
736  llvm::BitCastInst *bci = llvm::dyn_cast<llvm::BitCastInst>(value);
737  if (bci)
738  return lIsUndef(bci->getOperand(0));
739 
740  return false;
741 }
742 
743 bool IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
744  DEBUG_START_PASS("IntrinsicsOpt");
745 
746  // We can't initialize mask/blend function vector during pass initialization,
747  // as they may be optimized out by the time the pass is invoked.
748 
749  // All of the mask instructions we may encounter. Note that even if
750  // compiling for AVX, we may still encounter the regular 4-wide SSE
751  // MOVMSK instruction.
752  if (llvm::Function *ssei8Movmsk =
753  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_sse2_pmovmskb_128))) {
754  maskInstructions.push_back(ssei8Movmsk);
755  }
756  if (llvm::Function *sseFloatMovmsk =
757  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_sse_movmsk_ps))) {
758  maskInstructions.push_back(sseFloatMovmsk);
759  }
760  if (llvm::Function *__movmsk = m->module->getFunction("__movmsk")) {
761  maskInstructions.push_back(__movmsk);
762  }
763  if (llvm::Function *avxFloatMovmsk =
764  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_movmsk_ps_256))) {
765  maskInstructions.push_back(avxFloatMovmsk);
766  }
767 
768  // And all of the blend instructions
769  blendInstructions.push_back(BlendInstruction(
770  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_sse41_blendvps)), 0xf, 0, 1, 2));
771  blendInstructions.push_back(BlendInstruction(
772  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_blendv_ps_256)), 0xff, 0, 1, 2));
773 
774  llvm::Function *avxMaskedLoad32 =
775  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskload_ps_256));
776  llvm::Function *avxMaskedLoad64 =
777  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskload_pd_256));
778  llvm::Function *avxMaskedStore32 =
779  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskstore_ps_256));
780  llvm::Function *avxMaskedStore64 =
781  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskstore_pd_256));
782 
783  bool modifiedAny = false;
784 restart:
785  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
786  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
787  if (callInst == NULL || callInst->getCalledFunction() == NULL)
788  continue;
789 
790  BlendInstruction *blend = matchingBlendInstruction(callInst->getCalledFunction());
791  if (blend != NULL) {
792  llvm::Value *v[2] = {callInst->getArgOperand(blend->op0), callInst->getArgOperand(blend->op1)};
793  llvm::Value *factor = callInst->getArgOperand(blend->opFactor);
794 
795  // If the values are the same, then no need to blend..
796  if (v[0] == v[1]) {
797  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, v[0]);
798  modifiedAny = true;
799  goto restart;
800  }
801 
802  // If one of the two is undefined, we're allowed to replace
803  // with the value of the other. (In other words, the only
804  // valid case is that the blend factor ends up having a value
805  // that only selects from the defined one of the two operands,
806  // otherwise the result is undefined and any value is fine,
807  // ergo the defined one is an acceptable result.)
808  if (lIsUndef(v[0])) {
809  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, v[1]);
810  modifiedAny = true;
811  goto restart;
812  }
813  if (lIsUndef(v[1])) {
814  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, v[0]);
815  modifiedAny = true;
816  goto restart;
817  }
818 
819  uint64_t mask;
820  if (lGetMask(factor, &mask) == true) {
821  llvm::Value *value = NULL;
822  if (mask == 0)
823  // Mask all off -> replace with the first blend value
824  value = v[0];
825  else if (mask == blend->allOnMask)
826  // Mask all on -> replace with the second blend value
827  value = v[1];
828 
829  if (value != NULL) {
830  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, value);
831  modifiedAny = true;
832  goto restart;
833  }
834  }
835  } else if (matchesMaskInstruction(callInst->getCalledFunction())) {
836  llvm::Value *factor = callInst->getArgOperand(0);
837  uint64_t mask;
838  if (lGetMask(factor, &mask) == true) {
839  // If the vector-valued mask has a known value, replace it
840  // with the corresponding integer mask from its elements
841  // high bits.
842  llvm::Value *value = (callInst->getType() == LLVMTypes::Int32Type) ? LLVMInt32(mask) : LLVMInt64(mask);
843  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, value);
844  modifiedAny = true;
845  goto restart;
846  }
847  } else if (callInst->getCalledFunction() == avxMaskedLoad32 ||
848  callInst->getCalledFunction() == avxMaskedLoad64) {
849  llvm::Value *factor = callInst->getArgOperand(1);
850  uint64_t mask;
851  if (lGetMask(factor, &mask) == true) {
852  if (mask == 0) {
853  // nothing being loaded, replace with undef value
854  llvm::Type *returnType = callInst->getType();
855  Assert(llvm::isa<llvm::VectorType>(returnType));
856  llvm::Value *undefValue = llvm::UndefValue::get(returnType);
857  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, undefValue);
858  modifiedAny = true;
859  goto restart;
860  } else if (mask == 0xff) {
861  // all lanes active; replace with a regular load
862  llvm::Type *returnType = callInst->getType();
863  Assert(llvm::isa<llvm::VectorType>(returnType));
864  // cast the i8 * to the appropriate type
865  const char *name = LLVMGetName(callInst->getArgOperand(0), "_cast");
866  llvm::Value *castPtr = new llvm::BitCastInst(callInst->getArgOperand(0),
867  llvm::PointerType::get(returnType, 0), name, callInst);
868  lCopyMetadata(castPtr, callInst);
869  int align;
870  if (g->opt.forceAlignedMemory)
871  align = g->target->getNativeVectorAlignment();
872  else
873  align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
874  name = LLVMGetName(callInst->getArgOperand(0), "_load");
875  llvm::Instruction *loadInst =
876 #if ISPC_LLVM_VERSION <= ISPC_LLVM_9_0
877  new llvm::LoadInst(castPtr, name, false /* not volatile */, align, (llvm::Instruction *)NULL);
878 #else // LLVM 10.0+
879  new llvm::LoadInst(castPtr, name, false /* not volatile */, llvm::MaybeAlign(align),
880  (llvm::Instruction *)NULL);
881 #endif
882  lCopyMetadata(loadInst, callInst);
883  llvm::ReplaceInstWithInst(callInst, loadInst);
884  modifiedAny = true;
885  goto restart;
886  }
887  }
888  } else if (callInst->getCalledFunction() == avxMaskedStore32 ||
889  callInst->getCalledFunction() == avxMaskedStore64) {
890  // NOTE: mask is the 2nd parameter, not the 3rd one!!
891  llvm::Value *factor = callInst->getArgOperand(1);
892  uint64_t mask;
893  if (lGetMask(factor, &mask) == true) {
894  if (mask == 0) {
895  // nothing actually being stored, just remove the inst
896  callInst->eraseFromParent();
897  modifiedAny = true;
898  goto restart;
899  } else if (mask == 0xff) {
900  // all lanes storing, so replace with a regular store
901  llvm::Value *rvalue = callInst->getArgOperand(2);
902  llvm::Type *storeType = rvalue->getType();
903  const char *name = LLVMGetName(callInst->getArgOperand(0), "_ptrcast");
904  llvm::Value *castPtr = new llvm::BitCastInst(callInst->getArgOperand(0),
905  llvm::PointerType::get(storeType, 0), name, callInst);
906  lCopyMetadata(castPtr, callInst);
907 
908  llvm::StoreInst *storeInst = new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
909  int align;
910  if (g->opt.forceAlignedMemory)
911  align = g->target->getNativeVectorAlignment();
912  else
913  align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
914 #if ISPC_LLVM_VERSION <= ISPC_LLVM_9_0
915  storeInst->setAlignment(align);
916 #else // LLVM 10.0+
917  storeInst->setAlignment(llvm::MaybeAlign(align));
918 #endif
919  lCopyMetadata(storeInst, callInst);
920  llvm::ReplaceInstWithInst(callInst, storeInst);
921 
922  modifiedAny = true;
923  goto restart;
924  }
925  }
926  }
927  }
928 
929  DEBUG_END_PASS("IntrinsicsOpt");
930 
931  return modifiedAny;
932 }
933 
934 bool IntrinsicsOpt::runOnFunction(llvm::Function &F) {
935 
936  bool modifiedAny = false;
937  for (llvm::BasicBlock &BB : F) {
938  modifiedAny |= runOnBasicBlock(BB);
939  }
940  return modifiedAny;
941 }
942 
943 bool IntrinsicsOpt::matchesMaskInstruction(llvm::Function *function) {
944  for (unsigned int i = 0; i < maskInstructions.size(); ++i) {
945  if (maskInstructions[i].function != NULL && function == maskInstructions[i].function) {
946  return true;
947  }
948  }
949  return false;
950 }
951 
952 IntrinsicsOpt::BlendInstruction *IntrinsicsOpt::matchingBlendInstruction(llvm::Function *function) {
953  for (unsigned int i = 0; i < blendInstructions.size(); ++i) {
954  if (blendInstructions[i].function != NULL && function == blendInstructions[i].function) {
955  return &blendInstructions[i];
956  }
957  }
958  return NULL;
959 }
960 
961 static llvm::Pass *CreateIntrinsicsOptPass() { return new IntrinsicsOpt; }
962 
963 ///////////////////////////////////////////////////////////////////////////
964 
965 /** This simple optimization pass looks for a vector select instruction
966  with an all-on or all-off constant mask, simplifying it to the
967  appropriate operand if so.
968 
969  @todo The better thing to do would be to submit a patch to LLVM to get
970  these; they're presumably pretty simple patterns to match.
971 */
972 class InstructionSimplifyPass : public llvm::FunctionPass {
973  public:
974  InstructionSimplifyPass() : FunctionPass(ID) {}
975 
976  llvm::StringRef getPassName() const { return "Vector Select Optimization"; }
977  bool runOnBasicBlock(llvm::BasicBlock &BB);
978  bool runOnFunction(llvm::Function &F);
979 
980  static char ID;
981 
982  private:
983  static bool simplifySelect(llvm::SelectInst *selectInst, llvm::BasicBlock::iterator iter);
984  static llvm::Value *simplifyBoolVec(llvm::Value *value);
985  static bool simplifyCall(llvm::CallInst *callInst, llvm::BasicBlock::iterator iter);
986 };
987 
988 char InstructionSimplifyPass::ID = 0;
989 
990 llvm::Value *InstructionSimplifyPass::simplifyBoolVec(llvm::Value *value) {
991  llvm::TruncInst *trunc = llvm::dyn_cast<llvm::TruncInst>(value);
992  if (trunc != NULL) {
993  // Convert trunc({sext,zext}(i1 vector)) -> (i1 vector)
994  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(value);
995  if (sext && sext->getOperand(0)->getType() == LLVMTypes::Int1VectorType)
996  return sext->getOperand(0);
997 
998  llvm::ZExtInst *zext = llvm::dyn_cast<llvm::ZExtInst>(value);
999  if (zext && zext->getOperand(0)->getType() == LLVMTypes::Int1VectorType)
1000  return zext->getOperand(0);
1001  }
1002  /*
1003  // This optimization has discernable benefit on the perf
1004  // suite on latest LLVM versions.
1005  // On 3.4+ (maybe even older), it can result in illegal
1006  // operations, so it's being disabled.
1007  llvm::ICmpInst *icmp = llvm::dyn_cast<llvm::ICmpInst>(value);
1008  if (icmp != NULL) {
1009  // icmp(ne, {sext,zext}(foo), zeroinitializer) -> foo
1010  if (icmp->getSignedPredicate() == llvm::CmpInst::ICMP_NE) {
1011  llvm::Value *op1 = icmp->getOperand(1);
1012  if (llvm::isa<llvm::ConstantAggregateZero>(op1)) {
1013  llvm::Value *op0 = icmp->getOperand(0);
1014  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(op0);
1015  if (sext)
1016  return sext->getOperand(0);
1017  llvm::ZExtInst *zext = llvm::dyn_cast<llvm::ZExtInst>(op0);
1018  if (zext)
1019  return zext->getOperand(0);
1020  }
1021  }
1022 
1023  }
1024  */
1025  return NULL;
1026 }
1027 
1028 bool InstructionSimplifyPass::simplifySelect(llvm::SelectInst *selectInst, llvm::BasicBlock::iterator iter) {
1029  if (selectInst->getType()->isVectorTy() == false)
1030  return false;
1031  Assert(selectInst->getOperand(1) != NULL);
1032  Assert(selectInst->getOperand(2) != NULL);
1033  llvm::Value *factor = selectInst->getOperand(0);
1034 
1035  // Simplify all-on or all-off mask values
1036  MaskStatus maskStatus = lGetMaskStatus(factor);
1037  llvm::Value *value = NULL;
1038  if (maskStatus == ALL_ON)
1039  // Mask all on -> replace with the first select value
1040  value = selectInst->getOperand(1);
1041  else if (maskStatus == ALL_OFF)
1042  // Mask all off -> replace with the second select value
1043  value = selectInst->getOperand(2);
1044  if (value != NULL) {
1045  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, value);
1046  return true;
1047  }
1048 
1049  // Sometimes earlier LLVM optimization passes generate unnecessarily
1050  // complex expressions for the selection vector, which in turn confuses
1051  // the code generators and leads to sub-optimal code (particularly for
1052  // 8 and 16-bit masks). We'll try to simplify them out here so that
1053  // the code generator patterns match..
1054  if ((factor = simplifyBoolVec(factor)) != NULL) {
1055  llvm::Instruction *newSelect = llvm::SelectInst::Create(factor, selectInst->getOperand(1),
1056  selectInst->getOperand(2), selectInst->getName());
1057  llvm::ReplaceInstWithInst(selectInst, newSelect);
1058  return true;
1059  }
1060 
1061  return false;
1062 }
1063 
1064 bool InstructionSimplifyPass::simplifyCall(llvm::CallInst *callInst, llvm::BasicBlock::iterator iter) {
1065  llvm::Function *calledFunc = callInst->getCalledFunction();
1066 
1067  // Turn a __movmsk call with a compile-time constant vector into the
1068  // equivalent scalar value.
1069  if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk"))
1070  return false;
1071 
1072  uint64_t mask;
1073  if (lGetMask(callInst->getArgOperand(0), &mask) == true) {
1074  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, LLVMInt64(mask));
1075  return true;
1076  }
1077  return false;
1078 }
1079 
1080 bool InstructionSimplifyPass::runOnBasicBlock(llvm::BasicBlock &bb) {
1081  DEBUG_START_PASS("InstructionSimplify");
1082 
1083  bool modifiedAny = false;
1084 
1085 restart:
1086  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
1087  llvm::SelectInst *selectInst = llvm::dyn_cast<llvm::SelectInst>(&*iter);
1088  if (selectInst && simplifySelect(selectInst, iter)) {
1089  modifiedAny = true;
1090  goto restart;
1091  }
1092  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
1093  if (callInst && simplifyCall(callInst, iter)) {
1094  modifiedAny = true;
1095  goto restart;
1096  }
1097  }
1098 
1099  DEBUG_END_PASS("InstructionSimplify");
1100 
1101  return modifiedAny;
1102 }
1103 
1104 bool InstructionSimplifyPass::runOnFunction(llvm::Function &F) {
1105 
1106  bool modifiedAny = false;
1107  for (llvm::BasicBlock &BB : F) {
1108  modifiedAny |= runOnBasicBlock(BB);
1109  }
1110  return modifiedAny;
1111 }
1112 
1113 static llvm::Pass *CreateInstructionSimplifyPass() { return new InstructionSimplifyPass; }
1114 
1115 ///////////////////////////////////////////////////////////////////////////
1116 // ImproveMemoryOpsPass
1117 
1118 /** When the front-end emits gathers and scatters, it generates an array of
1119  vector-width pointers to represent the set of addresses to read from or
1120  write to. This optimization detects cases when the base pointer is a
1121  uniform pointer or when the indexing is into an array that can be
1122  converted into scatters/gathers from a single base pointer and an array
1123  of offsets.
1124 
1125  See for example the comments discussing the __pseudo_gather functions
1126  in builtins.cpp for more information about this.
1127  */
1128 class ImproveMemoryOpsPass : public llvm::FunctionPass {
1129  public:
1130  static char ID;
1131  ImproveMemoryOpsPass() : FunctionPass(ID) {}
1132 
1133  llvm::StringRef getPassName() const { return "Improve Memory Ops"; }
1134 
1135  bool runOnBasicBlock(llvm::BasicBlock &BB);
1136 
1137  bool runOnFunction(llvm::Function &F);
1138 };
1139 
1140 char ImproveMemoryOpsPass::ID = 0;
1141 
1142 /** Check to make sure that this value is actually a pointer in the end.
1143  We need to make sure that given an expression like vec(offset) +
1144  ptr2int(ptr), lGetBasePointer() doesn't return vec(offset) for the base
1145  pointer such that we then treat ptr2int(ptr) as an offset. This ends
1146  up being important so that we don't generate LLVM GEP instructions like
1147  "gep inttoptr 8, i64 %ptr", which in turn can lead to incorrect code
1148  since LLVM's pointer aliasing analysis assumes that operands after the
1149  first one to a GEP aren't pointers.
1150  */
1151 static llvm::Value *lCheckForActualPointer(llvm::Value *v) {
1152  if (v == NULL) {
1153  return NULL;
1154  } else if (llvm::isa<llvm::PointerType>(v->getType())) {
1155  return v;
1156  } else if (llvm::isa<llvm::PtrToIntInst>(v)) {
1157  return v;
1158  }
1159  // This one is tricky, as it's a heuristic tuned for LLVM 3.7+, which may
1160  // optimize a load of double* followed by ptr2int into a straight load of i64.
1161  // This heuristic should be good enough to catch all the cases we should
1162  // detect and nothing else.
1163  else if (llvm::isa<llvm::LoadInst>(v)) {
1164  return v;
1165  }
1166 
1167  else if (llvm::CastInst *ci = llvm::dyn_cast<llvm::CastInst>(v)) {
1168  llvm::Value *t = lCheckForActualPointer(ci->getOperand(0));
1169  if (t == NULL) {
1170  return NULL;
1171  } else {
1172  return v;
1173  }
1174  } else {
1175  llvm::ConstantExpr *uce = llvm::dyn_cast<llvm::ConstantExpr>(v);
1176  if (uce != NULL && uce->getOpcode() == llvm::Instruction::PtrToInt)
1177  return v;
1178  return NULL;
1179  }
1180 }
1181 
1182 /** Given a llvm::Value representing a varying pointer, this function
1183  checks to see if all of the elements of the vector have the same value
1184  (i.e. there's a common base pointer). If a broadcast has already been detected,
1185  it checks that the first element of the vector is not undef. If one of the conditions
1186  is true, it returns the common pointer value; otherwise it returns NULL.
1187  */
1188 static llvm::Value *lGetBasePointer(llvm::Value *v, llvm::Instruction *insertBefore, bool broadcastDetected) {
1189  if (llvm::isa<llvm::InsertElementInst>(v) || llvm::isa<llvm::ShuffleVectorInst>(v)) {
1190  // If we have already detected broadcast we want to look for
1191  // the vector with the first not-undef element
1192  llvm::Value *element = LLVMFlattenInsertChain(v, g->target->getVectorWidth(), true, false, broadcastDetected);
1193  // TODO: it's probably ok to allow undefined elements and return
1194  // the base pointer if all of the other elements have the same
1195  // value.
1196  if (element != NULL) {
1197  // all elements are the same and not NULLs
1198  return lCheckForActualPointer(element);
1199  } else {
1200  return NULL;
1201  }
1202  }
1203 
1204  // This case comes up with global/static arrays
1205  if (llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(v)) {
1206  return lCheckForActualPointer(cv->getSplatValue());
1207  } else if (llvm::ConstantDataVector *cdv = llvm::dyn_cast<llvm::ConstantDataVector>(v)) {
1208  return lCheckForActualPointer(cdv->getSplatValue());
1209  }
1210  // It is a little bit tricky to use operations with pointers, casted to int with another bit size
1211  // but sometimes it is useful, so we handle this case here.
1212  else if (llvm::CastInst *ci = llvm::dyn_cast<llvm::CastInst>(v)) {
1213  llvm::Value *t = lGetBasePointer(ci->getOperand(0), insertBefore, broadcastDetected);
1214  if (t == NULL) {
1215  return NULL;
1216  } else {
1217  return llvm::CastInst::Create(ci->getOpcode(), t, ci->getType()->getScalarType(), LLVMGetName(t, "_cast"),
1218  insertBefore);
1219  }
1220  }
1221 
1222  return NULL;
1223 }
1224 
1225 /** Given the two operands to a constant add expression, see if we have the
1226  form "base pointer + offset", whee op0 is the base pointer and op1 is
1227  the offset; if so return the base and the offset. */
1228 static llvm::Constant *lGetConstantAddExprBaseOffset(llvm::Constant *op0, llvm::Constant *op1, llvm::Constant **delta) {
1229  llvm::ConstantExpr *op = llvm::dyn_cast<llvm::ConstantExpr>(op0);
1230  if (op == NULL || op->getOpcode() != llvm::Instruction::PtrToInt)
1231  // the first operand isn't a pointer
1232  return NULL;
1233 
1234  llvm::ConstantInt *opDelta = llvm::dyn_cast<llvm::ConstantInt>(op1);
1235  if (opDelta == NULL)
1236  // the second operand isn't an integer operand
1237  return NULL;
1238 
1239  *delta = opDelta;
1240  return op0;
1241 }
1242 
1243 static llvm::Value *lExtractFromInserts(llvm::Value *v, unsigned int index) {
1244  llvm::InsertValueInst *iv = llvm::dyn_cast<llvm::InsertValueInst>(v);
1245  if (iv == NULL)
1246  return NULL;
1247 
1248  Assert(iv->hasIndices() && iv->getNumIndices() == 1);
1249  if (iv->getIndices()[0] == index)
1250  return iv->getInsertedValueOperand();
1251  else
1252  return lExtractFromInserts(iv->getAggregateOperand(), index);
1253 }
1254 
1255 /** Given a varying pointer in ptrs, this function checks to see if it can
1256  be determined to be indexing from a common uniform base pointer. If
1257  so, the function returns the base pointer llvm::Value and initializes
1258  *offsets with an int vector of the per-lane offsets
1259  */
1260 static llvm::Value *lGetBasePtrAndOffsets(llvm::Value *ptrs, llvm::Value **offsets, llvm::Instruction *insertBefore) {
1261 #ifndef ISPC_NO_DUMPS
1262  if (g->debugPrint) {
1263  fprintf(stderr, "lGetBasePtrAndOffsets\n");
1264  LLVMDumpValue(ptrs);
1265  }
1266 #endif
1267 
1268  bool broadcastDetected = false;
1269  // Looking for %gep_offset = shufflevector <8 x i64> %0, <8 x i64> undef, <8 x i32> zeroinitializer
1270  llvm::ShuffleVectorInst *shuffle = llvm::dyn_cast<llvm::ShuffleVectorInst>(ptrs);
1271  if (shuffle != NULL) {
1272  llvm::Value *indices = shuffle->getOperand(2);
1273  llvm::Value *vec = shuffle->getOperand(1);
1274  if (lIsUndef(vec) && llvm::isa<llvm::ConstantAggregateZero>(indices)) {
1275  broadcastDetected = true;
1276  }
1277  }
1278  llvm::Value *base = lGetBasePointer(ptrs, insertBefore, broadcastDetected);
1279  if (base != NULL) {
1280  // We have a straight up varying pointer with no indexing that's
1281  // actually all the same value.
1282  if (g->target->is32Bit())
1283  *offsets = LLVMInt32Vector(0);
1284  else
1285  *offsets = LLVMInt64Vector((int64_t)0);
1286 
1287  if (broadcastDetected) {
1288  llvm::Value *op = shuffle->getOperand(0);
1289  llvm::BinaryOperator *bop_var = llvm::dyn_cast<llvm::BinaryOperator>(op);
1290  if (bop_var != NULL && ((bop_var->getOpcode() == llvm::Instruction::Add) || IsOrEquivalentToAdd(bop_var))) {
1291  // We expect here ConstantVector as
1292  // <i64 4, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>
1293  llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(bop_var->getOperand(1));
1294  if (cv != NULL) {
1295  llvm::Value *zeroMask =
1296 #if ISPC_LLVM_VERSION < ISPC_LLVM_11_0
1297  llvm::ConstantVector::getSplat(cv->getType()->getVectorNumElements(),
1298 #else // LLVM 11.0+
1299  llvm::ConstantVector::getSplat({cv->getType()->getVectorNumElements(), false},
1300 #endif
1301  llvm::Constant::getNullValue(llvm::Type::getInt32Ty(*g->ctx)));
1302  // Create offset
1303  llvm::Value *shuffle_offset = new llvm::ShuffleVectorInst(cv, llvm::UndefValue::get(cv->getType()),
1304  zeroMask, "shuffle", bop_var);
1305  *offsets = llvm::BinaryOperator::Create(llvm::Instruction::Add, *offsets, shuffle_offset,
1306  "new_offsets", insertBefore);
1307  }
1308  }
1309  }
1310  return base;
1311  }
1312 
1313  llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(ptrs);
1314  if (bop != NULL && ((bop->getOpcode() == llvm::Instruction::Add) || IsOrEquivalentToAdd(bop))) {
1315  // If we have a common pointer plus something, then we're also
1316  // good.
1317  if ((base = lGetBasePtrAndOffsets(bop->getOperand(0), offsets, insertBefore)) != NULL) {
1318  *offsets = llvm::BinaryOperator::Create(llvm::Instruction::Add, *offsets, bop->getOperand(1), "new_offsets",
1319  insertBefore);
1320  return base;
1321  } else if ((base = lGetBasePtrAndOffsets(bop->getOperand(1), offsets, insertBefore)) != NULL) {
1322  *offsets = llvm::BinaryOperator::Create(llvm::Instruction::Add, *offsets, bop->getOperand(0), "new_offsets",
1323  insertBefore);
1324  return base;
1325  }
1326  }
1327  llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(ptrs);
1328  if (cv != NULL) {
1329  // Indexing into global arrays can lead to this form, with
1330  // ConstantVectors..
1331  llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
1332  for (int i = 0; i < (int)cv->getNumOperands(); ++i) {
1333  llvm::Constant *c = llvm::dyn_cast<llvm::Constant>(cv->getOperand(i));
1334  if (c == NULL)
1335  return NULL;
1336  elements.push_back(c);
1337  }
1338 
1339  llvm::Constant *delta[ISPC_MAX_NVEC];
1340  for (unsigned int i = 0; i < elements.size(); ++i) {
1341  // For each element, try to decompose it into either a straight
1342  // up base pointer, or a base pointer plus an integer value.
1343  llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(elements[i]);
1344  if (ce == NULL)
1345  return NULL;
1346 
1347  delta[i] = NULL;
1348  llvm::Value *elementBase = NULL; // base pointer for this element
1349  if (ce->getOpcode() == llvm::Instruction::PtrToInt) {
1350  // If the element is just a ptr to int instruction, treat
1351  // it as having an offset of zero
1352  elementBase = ce;
1353  delta[i] = g->target->is32Bit() ? LLVMInt32(0) : LLVMInt64(0);
1354  } else if ((ce->getOpcode() == llvm::Instruction::Add) || IsOrEquivalentToAdd(ce)) {
1355  // Try both orderings of the operands to see if we can get
1356  // a pointer+offset out of them.
1357  elementBase = lGetConstantAddExprBaseOffset(ce->getOperand(0), ce->getOperand(1), &delta[i]);
1358  if (elementBase == NULL)
1359  elementBase = lGetConstantAddExprBaseOffset(ce->getOperand(1), ce->getOperand(0), &delta[i]);
1360  }
1361 
1362  // We weren't able to find a base pointer in the above. (We
1363  // don't expect this to happen; if it does, it may be necessary
1364  // to handle more cases in the decomposition above.)
1365  if (elementBase == NULL)
1366  return NULL;
1367 
1368  Assert(delta[i] != NULL);
1369  if (base == NULL)
1370  // The first time we've found a base pointer
1371  base = elementBase;
1372  else if (base != elementBase)
1373  // Different program instances have different base
1374  // pointers, so no luck.
1375  return NULL;
1376  }
1377 
1378  Assert(base != NULL);
1379  llvm::ArrayRef<llvm::Constant *> deltas(&delta[0], &delta[elements.size()]);
1380  *offsets = llvm::ConstantVector::get(deltas);
1381  return base;
1382  }
1383 
1384  llvm::ExtractValueInst *ev = llvm::dyn_cast<llvm::ExtractValueInst>(ptrs);
1385  if (ev != NULL) {
1386  Assert(ev->getNumIndices() == 1);
1387  int index = ev->getIndices()[0];
1388  ptrs = lExtractFromInserts(ev->getAggregateOperand(), index);
1389  if (ptrs != NULL)
1390  return lGetBasePtrAndOffsets(ptrs, offsets, insertBefore);
1391  }
1392 
1393  return NULL;
1394 }
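// Illustrative example (a sketch, not from the original source): for a varying pointer
// built as broadcast(%base) + <i64 0, i64 4, i64 8, i64 12>, this routine returns %base
// and sets *offsets to the constant vector <0, 4, 8, 12>, which later lets a gather or
// scatter be emitted from a single uniform base pointer plus per-lane offsets.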
1395 
1396 /** Given a vector expression in vec, separate it into a compile-time
1397  constant component and a variable component, returning the two parts in
1398  *constOffset and *variableOffset. (It should be the case that the sum
1399  of these two is exactly equal to the original vector.)
1400 
1401  This routine only handles some (important) patterns; in some cases it
1402  will fail and return components that are actually compile-time
1403  constants in *variableOffset.
1404 
1405  Finally, if there aren't any constant (or, respectively, variable)
1406  components, the corresponding return value may be set to NULL.
1407  */
1408 static void lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset, llvm::Value **variableOffset,
1409  llvm::Instruction *insertBefore) {
1410  if (llvm::isa<llvm::ConstantVector>(vec) || llvm::isa<llvm::ConstantDataVector>(vec) ||
1411  llvm::isa<llvm::ConstantAggregateZero>(vec)) {
1412  *constOffset = vec;
1413  *variableOffset = NULL;
1414  return;
1415  }
1416 
1417  llvm::CastInst *cast = llvm::dyn_cast<llvm::CastInst>(vec);
1418  if (cast != NULL) {
1419  // Check the cast target.
1420  llvm::Value *co, *vo;
1421  lExtractConstantOffset(cast->getOperand(0), &co, &vo, insertBefore);
1422 
1423  // make new cast instructions for the two parts
1424  if (co == NULL)
1425  *constOffset = NULL;
1426  else
1427  *constOffset =
1428  llvm::CastInst::Create(cast->getOpcode(), co, cast->getType(), LLVMGetName(co, "_cast"), insertBefore);
1429  if (vo == NULL)
1430  *variableOffset = NULL;
1431  else
1432  *variableOffset =
1433  llvm::CastInst::Create(cast->getOpcode(), vo, cast->getType(), LLVMGetName(vo, "_cast"), insertBefore);
1434  return;
1435  }
1436 
1437  llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(vec);
1438  if (bop != NULL) {
1439  llvm::Value *op0 = bop->getOperand(0);
1440  llvm::Value *op1 = bop->getOperand(1);
1441  llvm::Value *c0, *v0, *c1, *v1;
1442 
1443  if ((bop->getOpcode() == llvm::Instruction::Add) || IsOrEquivalentToAdd(bop)) {
1444  lExtractConstantOffset(op0, &c0, &v0, insertBefore);
1445  lExtractConstantOffset(op1, &c1, &v1, insertBefore);
1446 
1447  if (c0 == NULL || llvm::isa<llvm::ConstantAggregateZero>(c0))
1448  *constOffset = c1;
1449  else if (c1 == NULL || llvm::isa<llvm::ConstantAggregateZero>(c1))
1450  *constOffset = c0;
1451  else
1452  *constOffset = llvm::BinaryOperator::Create(llvm::Instruction::Add, c0, c1, LLVMGetName("add", c0, c1),
1453  insertBefore);
1454 
1455  if (v0 == NULL || llvm::isa<llvm::ConstantAggregateZero>(v0))
1456  *variableOffset = v1;
1457  else if (v1 == NULL || llvm::isa<llvm::ConstantAggregateZero>(v1))
1458  *variableOffset = v0;
1459  else
1460  *variableOffset = llvm::BinaryOperator::Create(llvm::Instruction::Add, v0, v1,
1461  LLVMGetName("add", v0, v1), insertBefore);
1462  return;
1463  } else if (bop->getOpcode() == llvm::Instruction::Shl) {
1464  lExtractConstantOffset(op0, &c0, &v0, insertBefore);
1465  lExtractConstantOffset(op1, &c1, &v1, insertBefore);
1466 
1467  // Writing the shift as a product of constant and variable terms, we have:
1468  // (c0 + v0) * (2^(c1 + v1)) = c0 * 2^c1 * 2^v1 + v0 * 2^c1 * 2^v1
1469  // We can optimize only if v1 == NULL.
1470  if ((v1 != NULL) || (c0 == NULL) || (c1 == NULL)) {
1471  *constOffset = NULL;
1472  *variableOffset = vec;
1473  } else if (v0 == NULL) {
1474  *constOffset = vec;
1475  *variableOffset = NULL;
1476  } else {
1477  *constOffset = llvm::BinaryOperator::Create(llvm::Instruction::Shl, c0, c1, LLVMGetName("shl", c0, c1),
1478  insertBefore);
1479  *variableOffset = llvm::BinaryOperator::Create(llvm::Instruction::Shl, v0, c1,
1480  LLVMGetName("shl", v0, c1), insertBefore);
1481  }
1482  return;
1483  } else if (bop->getOpcode() == llvm::Instruction::Mul) {
1484  lExtractConstantOffset(op0, &c0, &v0, insertBefore);
1485  lExtractConstantOffset(op1, &c1, &v1, insertBefore);
1486 
1487  // Given the product of constant and variable terms, we have:
1488  // (c0 + v0) * (c1 + v1) == (c0 c1) + (v0 c1 + c0 v1 + v0 v1)
1489  // Note that the first term is a constant and the last three are
1490  // variable.
1491  if (c0 != NULL && c1 != NULL)
1492  *constOffset = llvm::BinaryOperator::Create(llvm::Instruction::Mul, c0, c1, LLVMGetName("mul", c0, c1),
1493  insertBefore);
1494  else
1495  *constOffset = NULL;
1496 
1497  llvm::Value *va = NULL, *vb = NULL, *vc = NULL;
1498  if (v0 != NULL && c1 != NULL)
1499  va = llvm::BinaryOperator::Create(llvm::Instruction::Mul, v0, c1, LLVMGetName("mul", v0, c1),
1500  insertBefore);
1501  if (c0 != NULL && v1 != NULL)
1502  vb = llvm::BinaryOperator::Create(llvm::Instruction::Mul, c0, v1, LLVMGetName("mul", c0, v1),
1503  insertBefore);
1504  if (v0 != NULL && v1 != NULL)
1505  vc = llvm::BinaryOperator::Create(llvm::Instruction::Mul, v0, v1, LLVMGetName("mul", v0, v1),
1506  insertBefore);
1507 
1508  llvm::Value *vab = NULL;
1509  if (va != NULL && vb != NULL)
1510  vab = llvm::BinaryOperator::Create(llvm::Instruction::Add, va, vb, LLVMGetName("add", va, vb),
1511  insertBefore);
1512  else if (va != NULL)
1513  vab = va;
1514  else
1515  vab = vb;
1516 
1517  if (vab != NULL && vc != NULL)
1518  *variableOffset = llvm::BinaryOperator::Create(llvm::Instruction::Add, vab, vc,
1519  LLVMGetName("add", vab, vc), insertBefore);
1520  else if (vab != NULL)
1521  *variableOffset = vab;
1522  else
1523  *variableOffset = vc;
1524 
1525  return;
1526  }
1527  }
1528 
1529  // Nothing matched, just return what we have as a variable component
1530  *constOffset = NULL;
1531  *variableOffset = vec;
1532 }
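// Editor's example (illustrative, not from the original source): for an offset vector
// computed as
//     %off = add (mul %index, <i32 4, i32 4, ...>), <i32 16, i32 16, ...>
// lExtractConstantOffset() hands back the splat <16, ...> in *constOffset and a (newly
// created) multiply of %index by <4, ...> in *variableOffset; adding the two parts back
// together reproduces the original offsets.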
1533 
1534 /* Returns true if the given value is a constant vector of integers with
1535  the same value in all of the elements. (Returns the splatted value in
1536  *splat, if so). */
1537 static bool lIsIntegerSplat(llvm::Value *v, int *splat) {
1538  llvm::ConstantDataVector *cvec = llvm::dyn_cast<llvm::ConstantDataVector>(v);
1539  if (cvec == NULL)
1540  return false;
1541 
1542  llvm::Constant *splatConst = cvec->getSplatValue();
1543  if (splatConst == NULL)
1544  return false;
1545 
1546  llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(splatConst);
1547  if (ci == NULL)
1548  return false;
1549 
1550  int64_t splatVal = ci->getSExtValue();
1551  *splat = (int)splatVal;
1552  return true;
1553 }
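// For example (editor's note): a constant such as <i32 4, i32 4, i32 4, i32 4> makes
// lIsIntegerSplat() return true with *splat == 4; a non-uniform constant vector, or any
// non-constant value, makes it return false.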
1554 
1555 static llvm::Value *lExtract248Scale(llvm::Value *splatOperand, int splatValue, llvm::Value *otherOperand,
1556  llvm::Value **result) {
1557  if (splatValue == 2 || splatValue == 4 || splatValue == 8) {
1558  *result = otherOperand;
1559  return LLVMInt32(splatValue);
1560  }
1561  // Even if the splat value isn't exactly 2, 4, or 8, we'll see if we
1562  // can pull a factor of 8, 4, or 2 out of it anyway; this may in
1563  // turn allow other optimizations later.
1564  for (int scale = 8; scale >= 2; scale /= 2) {
1565  llvm::Instruction *insertBefore = llvm::dyn_cast<llvm::Instruction>(*result);
1566  Assert(insertBefore != NULL);
1567 
1568  if ((splatValue % scale) == 0) {
1569  // *result = otherOperand * splatOperand / scale;
1570  llvm::Value *splatScaleVec = (splatOperand->getType() == LLVMTypes::Int32VectorType)
1571  ? LLVMInt32Vector(scale)
1572  : LLVMInt64Vector(scale);
1573  llvm::Value *splatDiv =
1574  llvm::BinaryOperator::Create(llvm::Instruction::SDiv, splatOperand, splatScaleVec, "div", insertBefore);
1575  *result = llvm::BinaryOperator::Create(llvm::Instruction::Mul, splatDiv, otherOperand, "mul", insertBefore);
1576  return LLVMInt32(scale);
1577  }
1578  }
1579  return LLVMInt32(1);
1580 }
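// Editor's sketch of the intent: with a splat of 12 (i.e. a multiply by <12, 12, ...>)
// there is no exact 2/4/8 match, but 4 divides 12, so the splat operand is rewritten as
// (<12, ...> / 4) * otherOperand and an i32 scale of 4 is returned; a splat of, say, 7
// falls through all the checks and yields a scale of 1.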
1581 
1582 /** Given a vector of integer offsets to a base pointer being used for a
1583  gather or a scatter, see if its root operation is a multiply of some
1584  value by a vector of all 2s/4s/8s. If not, a scale of 1 is returned.
1585 
1586  If it is, return an i32 value of 2, 4, or 8 from the function and modify
1587  *vec so that it points to the operand that is being multiplied by
1588  2/4/8.
1589 
1590  We go through all this trouble so that we can pass the i32 scale factor
1591  to the {gather,scatter}_base_offsets function as a separate scale
1592  factor for the offsets. This in turn is used in a way so that the LLVM
1593  x86 code generator matches it to apply x86's free scale by 2x, 4x, or
1594  8x to one of two registers being added together for an addressing
1595  calculation.
1596  */
1597 static llvm::Value *lExtractOffsetVector248Scale(llvm::Value **vec) {
1598  llvm::CastInst *cast = llvm::dyn_cast<llvm::CastInst>(*vec);
1599  if (cast != NULL) {
1600  llvm::Value *castOp = cast->getOperand(0);
1601  // Check the cast target.
1602  llvm::Value *scale = lExtractOffsetVector248Scale(&castOp);
1603  if (scale == NULL)
1604  return NULL;
1605 
1606  // make a new cast instruction so that we end up with the right
1607  // type
1608  *vec = llvm::CastInst::Create(cast->getOpcode(), castOp, cast->getType(), "offset_cast", cast);
1609  return scale;
1610  }
1611 
1612  // If we don't have a binary operator, then just give up
1613  llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(*vec);
1614  if (bop == NULL)
1615  return LLVMInt32(1);
1616 
1617  llvm::Value *op0 = bop->getOperand(0), *op1 = bop->getOperand(1);
1618  if ((bop->getOpcode() == llvm::Instruction::Add) || IsOrEquivalentToAdd(bop)) {
1619  if (llvm::isa<llvm::ConstantAggregateZero>(op0)) {
1620  *vec = op1;
1621  return lExtractOffsetVector248Scale(vec);
1622  } else if (llvm::isa<llvm::ConstantAggregateZero>(op1)) {
1623  *vec = op0;
1624  return lExtractOffsetVector248Scale(vec);
1625  } else {
1626  llvm::Value *s0 = lExtractOffsetVector248Scale(&op0);
1627  llvm::Value *s1 = lExtractOffsetVector248Scale(&op1);
1628  if (s0 == s1) {
1629  *vec = llvm::BinaryOperator::Create(llvm::Instruction::Add, op0, op1, "new_add", bop);
1630  return s0;
1631  } else
1632  return LLVMInt32(1);
1633  }
1634  } else if (bop->getOpcode() == llvm::Instruction::Mul) {
1635  // Check each operand for being one of the scale factors we care about.
1636  int splat;
1637  if (lIsIntegerSplat(op0, &splat))
1638  return lExtract248Scale(op0, splat, op1, vec);
1639  else if (lIsIntegerSplat(op1, &splat))
1640  return lExtract248Scale(op1, splat, op0, vec);
1641  else
1642  return LLVMInt32(1);
1643  } else
1644  return LLVMInt32(1);
1645 }
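// Editor's example (illustrative): for an offset vector built as
//     %offsets = mul <N x i64> %index, <i64 8, i64 8, ...>
// this returns the i32 constant 8 and rewrites *vec to point at %index, letting the
// caller pass the factor of 8 to the gather/scatter as a separate scale that the x86
// addressing mode can apply for free.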
1646 
1647 #if 0
1648 static llvm::Value *
1649 lExtractUniforms(llvm::Value **vec, llvm::Instruction *insertBefore) {
1650  fprintf(stderr, " lextract: ");
1651  (*vec)->dump();
1652  fprintf(stderr, "\n");
1653 
1654  if (llvm::isa<llvm::ConstantVector>(*vec) ||
1655  llvm::isa<llvm::ConstantDataVector>(*vec) ||
1656  llvm::isa<llvm::ConstantAggregateZero>(*vec))
1657  return NULL;
1658 
1659  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(*vec);
1660  if (sext != NULL) {
1661  llvm::Value *sextOp = sext->getOperand(0);
1662  // Check the sext target.
1663  llvm::Value *unif = lExtractUniforms(&sextOp, insertBefore);
1664  if (unif == NULL)
1665  return NULL;
1666 
1667  // make a new sext instruction so that we end up with the right
1668  // type
1669  *vec = new llvm::SExtInst(sextOp, sext->getType(), "offset_sext", sext);
1670  return unif;
1671  }
1672 
1673  if (LLVMVectorValuesAllEqual(*vec)) {
1674  // FIXME: we may want to redo all of the expression here, in scalar
1675  // form (if at all possible), for code quality...
1676  llvm::Value *unif =
1677  llvm::ExtractElementInst::Create(*vec, LLVMInt32(0),
1678  "first_uniform", insertBefore);
1679  *vec = NULL;
1680  return unif;
1681  }
1682 
1683  llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(*vec);
1684  if (bop == NULL)
1685  return NULL;
1686 
1687  llvm::Value *op0 = bop->getOperand(0), *op1 = bop->getOperand(1);
1688  if (bop->getOpcode() == llvm::Instruction::Add) {
1689  llvm::Value *s0 = lExtractUniforms(&op0, insertBefore);
1690  llvm::Value *s1 = lExtractUniforms(&op1, insertBefore);
1691  if (s0 == NULL && s1 == NULL)
1692  return NULL;
1693 
1694  if (op0 == NULL)
1695  *vec = op1;
1696  else if (op1 == NULL)
1697  *vec = op0;
1698  else
1699  *vec = llvm::BinaryOperator::Create(llvm::Instruction::Add,
1700  op0, op1, "new_add", insertBefore);
1701 
1702  if (s0 == NULL)
1703  return s1;
1704  else if (s1 == NULL)
1705  return s0;
1706  else
1707  return llvm::BinaryOperator::Create(llvm::Instruction::Add, s0, s1,
1708  "add_unif", insertBefore);
1709  }
1710 #if 0
1711  else if (bop->getOpcode() == llvm::Instruction::Mul) {
1712  // Check each operand for being one of the scale factors we care about.
1713  int splat;
1714  if (lIs248Splat(op0, &splat)) {
1715  *vec = op1;
1716  return LLVMInt32(splat);
1717  }
1718  else if (lIs248Splat(op1, &splat)) {
1719  *vec = op0;
1720  return LLVMInt32(splat);
1721  }
1722  else
1723  return LLVMInt32(1);
1724  }
1725 #endif
1726  else
1727  return NULL;
1728 }
1729 
1730 
1731 static void
1732 lExtractUniformsFromOffset(llvm::Value **basePtr, llvm::Value **offsetVector,
1733  llvm::Value *offsetScale,
1734  llvm::Instruction *insertBefore) {
1735 #if 1
1736  (*basePtr)->dump();
1737  printf("\n");
1738  (*offsetVector)->dump();
1739  printf("\n");
1740  offsetScale->dump();
1741  printf("-----\n");
1742 #endif
1743 
1744  llvm::Value *uniformDelta = lExtractUniforms(offsetVector, insertBefore);
1745  if (uniformDelta == NULL)
1746  return;
1747 
1748  *basePtr = lGEPInst(*basePtr, arrayRef, "new_base", insertBefore);
1749 
1750  // this should only happen if we have only uniforms, but that in turn
1751  // shouldn't be a gather/scatter!
1752  Assert(*offsetVector != NULL);
1753 }
1754 #endif
1755 
1756 static bool lVectorIs32BitInts(llvm::Value *v) {
1757  int nElts;
1758  int64_t elts[ISPC_MAX_NVEC];
1759  if (!LLVMExtractVectorInts(v, elts, &nElts))
1760  return false;
1761 
1762  for (int i = 0; i < nElts; ++i)
1763  if ((int32_t)elts[i] != elts[i])
1764  return false;
1765 
1766  return true;
1767 }
1768 
1769 /** Check to see if the two offset vectors can safely be represented with
1770  32-bit values. If so, return true and update the pointed-to
1771  llvm::Value *s to be the 32-bit equivalents. */
1772 static bool lOffsets32BitSafe(llvm::Value **variableOffsetPtr, llvm::Value **constOffsetPtr,
1773  llvm::Instruction *insertBefore) {
1774  llvm::Value *variableOffset = *variableOffsetPtr;
1775  llvm::Value *constOffset = *constOffsetPtr;
1776 
1777  if (variableOffset->getType() != LLVMTypes::Int32VectorType) {
1778  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(variableOffset);
1779  if (sext != NULL && sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType)
1780  // sext of a 32-bit vector -> the 32-bit vector is good
1781  variableOffset = sext->getOperand(0);
1782  else if (lVectorIs32BitInts(variableOffset))
1783  // The only constant vector we should have here is a vector of
1784  // all zeros (i.e. a ConstantAggregateZero), but just in case,
1785  // do the more general check with lVectorIs32BitInts().
1786  variableOffset = new llvm::TruncInst(variableOffset, LLVMTypes::Int32VectorType,
1787  LLVMGetName(variableOffset, "_trunc"), insertBefore);
1788  else
1789  return false;
1790  }
1791 
1792  if (constOffset->getType() != LLVMTypes::Int32VectorType) {
1793  if (lVectorIs32BitInts(constOffset)) {
1794  // Truncate them so we have a 32-bit vector type for them.
1795  constOffset = new llvm::TruncInst(constOffset, LLVMTypes::Int32VectorType,
1796  LLVMGetName(constOffset, "_trunc"), insertBefore);
1797  } else {
1798  // FIXME: otherwise we just assume that all constant offsets
1799  // can actually always fit into 32-bits... (This could be
1800  // wrong, but it should be only in pretty esoteric cases). We
1801  // make this assumption for now since we sometimes generate
1802  // constants that need constant folding before we really have a
1803  // constant vector out of them, and
1804  // llvm::ConstantFoldInstruction() doesn't seem to be doing
1805  // enough for us in some cases if we call it from here.
1806  constOffset = new llvm::TruncInst(constOffset, LLVMTypes::Int32VectorType,
1807  LLVMGetName(constOffset, "_trunc"), insertBefore);
1808  }
1809  }
1810 
1811  *variableOffsetPtr = variableOffset;
1812  *constOffsetPtr = constOffset;
1813  return true;
1814 }
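// Editor's note with a concrete case: if the variable offsets arrived as
//     %voff64 = sext <N x i32> %voff32 to <N x i64>
// the sext is simply peeled off and %voff32 is used; 64-bit constant offsets whose
// elements all fit in 32 bits are narrowed with a TruncInst. Only when the variable part
// is neither a sext of a 32-bit vector nor a vector of 32-bit-safe constants does the
// routine return false and leave the 64-bit offsets alone.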
1815 
1816 /** Check to see if the offset value is composed of a string of Adds,
1817  SExts, and Constant Vectors that are 32-bit safe. Recursively
1818  explores the operands of Add instructions (as they might themselves
1819  be adds that eventually terminate in constant vectors or a SExt.)
1820  */
1821 
1822 static bool lIs32BitSafeHelper(llvm::Value *v) {
1823  // handle Adds, SExts, Constant Vectors
1824  if (llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(v)) {
1825  if ((bop->getOpcode() == llvm::Instruction::Add) || IsOrEquivalentToAdd(bop)) {
1826  return lIs32BitSafeHelper(bop->getOperand(0)) && lIs32BitSafeHelper(bop->getOperand(1));
1827  }
1828  return false;
1829  } else if (llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(v)) {
1830  return sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType;
1831  } else
1832  return lVectorIs32BitInts(v);
1833 }
1834 
1835 /** Check to see if the single offset vector can safely be represented with
1836  32-bit values. If so, return true and update the pointed-to
1837  llvm::Value * to be the 32-bit equivalent. */
1838 static bool lOffsets32BitSafe(llvm::Value **offsetPtr, llvm::Instruction *insertBefore) {
1839  llvm::Value *offset = *offsetPtr;
1840 
1841  if (offset->getType() == LLVMTypes::Int32VectorType)
1842  return true;
1843 
1844  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(offset);
1845  if (sext != NULL && sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType) {
1846  // sext of a 32-bit vector -> the 32-bit vector is good
1847  *offsetPtr = sext->getOperand(0);
1848  return true;
1849  } else if (lIs32BitSafeHelper(offset)) {
1850  // The only constant vector we should have here is a vector of
1851  // all zeros (i.e. a ConstantAggregateZero), but just in case,
1852  // do the more general check with lVectorIs32BitInts().
1853 
1854  // Alternatively, offset could be a sequence of adds terminating
1855  // in safe constant vectors or a SExt.
1856  *offsetPtr =
1857  new llvm::TruncInst(offset, LLVMTypes::Int32VectorType, LLVMGetName(offset, "_trunc"), insertBefore);
1858  return true;
1859  } else
1860  return false;
1861 }
1862 
1863 static bool lGSToGSBaseOffsets(llvm::CallInst *callInst) {
1864  struct GSInfo {
1865  GSInfo(const char *pgFuncName, const char *pgboFuncName, const char *pgbo32FuncName, bool ig, bool ip)
1866  : isGather(ig), isPrefetch(ip) {
1867  func = m->module->getFunction(pgFuncName);
1868  baseOffsetsFunc = m->module->getFunction(pgboFuncName);
1869  baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
1870  }
1871  llvm::Function *func;
1872  llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
1873  const bool isGather;
1874  const bool isPrefetch;
1875  };
1876 
1877  GSInfo gsFuncs[] = {
1878  GSInfo(
1879  "__pseudo_gather32_i8",
1880  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8",
1881  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8",
1882  true, false),
1883  GSInfo("__pseudo_gather32_i16",
1884  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16"
1885  : "__pseudo_gather_factored_base_offsets32_i16",
1886  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16"
1887  : "__pseudo_gather_factored_base_offsets32_i16",
1888  true, false),
1889  GSInfo("__pseudo_gather32_i32",
1890  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32"
1891  : "__pseudo_gather_factored_base_offsets32_i32",
1892  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32"
1893  : "__pseudo_gather_factored_base_offsets32_i32",
1894  true, false),
1895  GSInfo("__pseudo_gather32_float",
1896  g->target->hasGather() ? "__pseudo_gather_base_offsets32_float"
1897  : "__pseudo_gather_factored_base_offsets32_float",
1898  g->target->hasGather() ? "__pseudo_gather_base_offsets32_float"
1899  : "__pseudo_gather_factored_base_offsets32_float",
1900  true, false),
1901  GSInfo("__pseudo_gather32_i64",
1902  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64"
1903  : "__pseudo_gather_factored_base_offsets32_i64",
1904  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64"
1905  : "__pseudo_gather_factored_base_offsets32_i64",
1906  true, false),
1907  GSInfo("__pseudo_gather32_double",
1908  g->target->hasGather() ? "__pseudo_gather_base_offsets32_double"
1909  : "__pseudo_gather_factored_base_offsets32_double",
1910  g->target->hasGather() ? "__pseudo_gather_base_offsets32_double"
1911  : "__pseudo_gather_factored_base_offsets32_double",
1912  true, false),
1913 
1914  GSInfo("__pseudo_scatter32_i8",
1915  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8"
1916  : "__pseudo_scatter_factored_base_offsets32_i8",
1917  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8"
1918  : "__pseudo_scatter_factored_base_offsets32_i8",
1919  false, false),
1920  GSInfo("__pseudo_scatter32_i16",
1921  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16"
1922  : "__pseudo_scatter_factored_base_offsets32_i16",
1923  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16"
1924  : "__pseudo_scatter_factored_base_offsets32_i16",
1925  false, false),
1926  GSInfo("__pseudo_scatter32_i32",
1927  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32"
1928  : "__pseudo_scatter_factored_base_offsets32_i32",
1929  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32"
1930  : "__pseudo_scatter_factored_base_offsets32_i32",
1931  false, false),
1932  GSInfo("__pseudo_scatter32_float",
1933  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float"
1934  : "__pseudo_scatter_factored_base_offsets32_float",
1935  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float"
1936  : "__pseudo_scatter_factored_base_offsets32_float",
1937  false, false),
1938  GSInfo("__pseudo_scatter32_i64",
1939  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64"
1940  : "__pseudo_scatter_factored_base_offsets32_i64",
1941  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64"
1942  : "__pseudo_scatter_factored_base_offsets32_i64",
1943  false, false),
1944  GSInfo("__pseudo_scatter32_double",
1945  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double"
1946  : "__pseudo_scatter_factored_base_offsets32_double",
1947  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double"
1948  : "__pseudo_scatter_factored_base_offsets32_double",
1949  false, false),
1950 
1951  GSInfo(
1952  "__pseudo_gather64_i8",
1953  g->target->hasGather() ? "__pseudo_gather_base_offsets64_i8" : "__pseudo_gather_factored_base_offsets64_i8",
1954  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8",
1955  true, false),
1956  GSInfo("__pseudo_gather64_i16",
1957  g->target->hasGather() ? "__pseudo_gather_base_offsets64_i16"
1958  : "__pseudo_gather_factored_base_offsets64_i16",
1959  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16"
1960  : "__pseudo_gather_factored_base_offsets32_i16",
1961  true, false),
1962  GSInfo("__pseudo_gather64_i32",
1963  g->target->hasGather() ? "__pseudo_gather_base_offsets64_i32"
1964  : "__pseudo_gather_factored_base_offsets64_i32",
1965  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32"
1966  : "__pseudo_gather_factored_base_offsets32_i32",
1967  true, false),
1968  GSInfo("__pseudo_gather64_float",
1969  g->target->hasGather() ? "__pseudo_gather_base_offsets64_float"
1970  : "__pseudo_gather_factored_base_offsets64_float",
1971  g->target->hasGather() ? "__pseudo_gather_base_offsets32_float"
1972  : "__pseudo_gather_factored_base_offsets32_float",
1973  true, false),
1974  GSInfo("__pseudo_gather64_i64",
1975  g->target->hasGather() ? "__pseudo_gather_base_offsets64_i64"
1976  : "__pseudo_gather_factored_base_offsets64_i64",
1977  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64"
1978  : "__pseudo_gather_factored_base_offsets32_i64",
1979  true, false),
1980  GSInfo("__pseudo_gather64_double",
1981  g->target->hasGather() ? "__pseudo_gather_base_offsets64_double"
1982  : "__pseudo_gather_factored_base_offsets64_double",
1983  g->target->hasGather() ? "__pseudo_gather_base_offsets32_double"
1984  : "__pseudo_gather_factored_base_offsets32_double",
1985  true, false),
1986 
1987  GSInfo("__pseudo_scatter64_i8",
1988  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i8"
1989  : "__pseudo_scatter_factored_base_offsets64_i8",
1990  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8"
1991  : "__pseudo_scatter_factored_base_offsets32_i8",
1992  false, false),
1993  GSInfo("__pseudo_scatter64_i16",
1994  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i16"
1995  : "__pseudo_scatter_factored_base_offsets64_i16",
1996  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16"
1997  : "__pseudo_scatter_factored_base_offsets32_i16",
1998  false, false),
1999  GSInfo("__pseudo_scatter64_i32",
2000  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i32"
2001  : "__pseudo_scatter_factored_base_offsets64_i32",
2002  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32"
2003  : "__pseudo_scatter_factored_base_offsets32_i32",
2004  false, false),
2005  GSInfo("__pseudo_scatter64_float",
2006  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_float"
2007  : "__pseudo_scatter_factored_base_offsets64_float",
2008  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float"
2009  : "__pseudo_scatter_factored_base_offsets32_float",
2010  false, false),
2011  GSInfo("__pseudo_scatter64_i64",
2012  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i64"
2013  : "__pseudo_scatter_factored_base_offsets64_i64",
2014  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64"
2015  : "__pseudo_scatter_factored_base_offsets32_i64",
2016  false, false),
2017  GSInfo("__pseudo_scatter64_double",
2018  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_double"
2019  : "__pseudo_scatter_factored_base_offsets64_double",
2020  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double"
2021  : "__pseudo_scatter_factored_base_offsets32_double",
2022  false, false),
2023  GSInfo("__pseudo_prefetch_read_varying_1",
2024  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : "__prefetch_read_varying_1",
2025  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : "__prefetch_read_varying_1",
2026  false, true),
2027 
2028  GSInfo("__pseudo_prefetch_read_varying_2",
2029  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : "__prefetch_read_varying_2",
2030  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : "__prefetch_read_varying_2",
2031  false, true),
2032 
2033  GSInfo("__pseudo_prefetch_read_varying_3",
2034  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : "__prefetch_read_varying_3",
2035  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : "__prefetch_read_varying_3",
2036  false, true),
2037 
2038  GSInfo("__pseudo_prefetch_read_varying_nt",
2039  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" : "__prefetch_read_varying_nt",
2040  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" : "__prefetch_read_varying_nt",
2041  false, true),
2042  };
2043 
2044  int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
2045  for (int i = 0; i < numGSFuncs; ++i)
2046  Assert(gsFuncs[i].func != NULL && gsFuncs[i].baseOffsetsFunc != NULL && gsFuncs[i].baseOffsets32Func != NULL);
2047 
2048  GSInfo *info = NULL;
2049  for (int i = 0; i < numGSFuncs; ++i)
2050  if (gsFuncs[i].func != NULL && callInst->getCalledFunction() == gsFuncs[i].func) {
2051  info = &gsFuncs[i];
2052  break;
2053  }
2054  if (info == NULL)
2055  return false;
2056 
2057  // Try to transform the array of pointers to a single base pointer
2058  // and an array of int32 offsets. (All the hard work is done by
2059  // lGetBasePtrAndOffsets).
2060  llvm::Value *ptrs = callInst->getArgOperand(0);
2061  llvm::Value *offsetVector = NULL;
2062  llvm::Value *basePtr = lGetBasePtrAndOffsets(ptrs, &offsetVector, callInst);
2063 
2064  if (basePtr == NULL || offsetVector == NULL ||
2065  (info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch() == false))
2066  // It's actually a fully general gather/scatter with a varying
2067  // set of base pointers, so leave it as is and continue onward
2068  // to the next instruction...
2069  return false;
2070 
2071  // Cast the base pointer to a void *, since that's what the
2072  // __pseudo_*_base_offsets_* functions want.
2073  basePtr = new llvm::IntToPtrInst(basePtr, LLVMTypes::VoidPointerType, LLVMGetName(basePtr, "_2void"), callInst);
2074  lCopyMetadata(basePtr, callInst);
2075 
2076  llvm::Function *gatherScatterFunc = info->baseOffsetsFunc;
2077 
2078  if ((info->isGather == true && g->target->hasGather()) ||
2079  (info->isGather == false && info->isPrefetch == false && g->target->hasScatter()) ||
2080  (info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch())) {
2081 
2082  // See if the offsets are scaled by 2, 4, or 8. If so,
2083  // extract that scale factor and rewrite the offsets to remove
2084  // it.
2085  llvm::Value *offsetScale = lExtractOffsetVector248Scale(&offsetVector);
2086 
2087  // If we're doing 32-bit addressing on a 64-bit target, here we
2088  // will see if we can call one of the 32-bit variants of the pseudo
2089  // gather/scatter functions.
2090  if (g->opt.force32BitAddressing && lOffsets32BitSafe(&offsetVector, callInst)) {
2091  gatherScatterFunc = info->baseOffsets32Func;
2092  }
2093 
2094  if (info->isGather || info->isPrefetch) {
2095  llvm::Value *mask = callInst->getArgOperand(1);
2096 
2097  // Generate a new function call to the next pseudo gather
2098  // base+offsets instruction. Note that we're passing a NULL
2099  // llvm::Instruction to llvm::CallInst::Create; this means that
2100  // the instruction isn't inserted into a basic block and that
2101  // way we can then call ReplaceInstWithInst().
2102  llvm::Instruction *newCall = lCallInst(gatherScatterFunc, basePtr, offsetScale, offsetVector, mask,
2103  callInst->getName().str().c_str(), NULL);
2104  lCopyMetadata(newCall, callInst);
2105  llvm::ReplaceInstWithInst(callInst, newCall);
2106  } else {
2107  llvm::Value *storeValue = callInst->getArgOperand(1);
2108  llvm::Value *mask = callInst->getArgOperand(2);
2109 
2110  // Generate a new function call to the next pseudo scatter
2111  // base+offsets instruction. See above for why passing NULL
2112  // for the Instruction * is intended.
2113  llvm::Instruction *newCall =
2114  lCallInst(gatherScatterFunc, basePtr, offsetScale, offsetVector, storeValue, mask, "", NULL);
2115  lCopyMetadata(newCall, callInst);
2116  llvm::ReplaceInstWithInst(callInst, newCall);
2117  }
2118  } else {
2119  // Try to decompose the offset vector into a compile time constant
2120  // component and a varying component. The constant component is
2121  // passed as a separate parameter to the gather/scatter functions,
2122  // which in turn allows their implementations to end up emitting
2123  // x86 instructions with constant offsets encoded in them.
2124  llvm::Value *constOffset = NULL;
2125  llvm::Value *variableOffset = NULL;
2126  lExtractConstantOffset(offsetVector, &constOffset, &variableOffset, callInst);
2127  if (constOffset == NULL)
2128  constOffset = LLVMIntAsType(0, offsetVector->getType());
2129  if (variableOffset == NULL)
2130  variableOffset = LLVMIntAsType(0, offsetVector->getType());
2131 
2132  // See if the varying component is scaled by 2, 4, or 8. If so,
2133  // extract that scale factor and rewrite variableOffset to remove
2134  // it. (This also is pulled out so that we can match the scales by
2135  // 2/4/8 offered by x86 addressing operators.)
2136  llvm::Value *offsetScale = lExtractOffsetVector248Scale(&variableOffset);
2137 
2138  // If we're doing 32-bit addressing on a 64-bit target, here we
2139  // will see if we can call one of the 32-bit variants of the pseudo
2140  // gather/scatter functions.
2141  if (g->opt.force32BitAddressing && lOffsets32BitSafe(&variableOffset, &constOffset, callInst)) {
2142  gatherScatterFunc = info->baseOffsets32Func;
2143  }
2144 
2145  if (info->isGather || info->isPrefetch) {
2146  llvm::Value *mask = callInst->getArgOperand(1);
2147 
2148  // Generate a new function call to the next pseudo gather
2149  // base+offsets instruction. Note that we're passing a NULL
2150  // llvm::Instruction to llvm::CallInst::Create; this means that
2151  // the instruction isn't inserted into a basic block and that
2152  // way we can then call ReplaceInstWithInst().
2153  llvm::Instruction *newCall = lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, constOffset,
2154  mask, callInst->getName().str().c_str(), NULL);
2155  lCopyMetadata(newCall, callInst);
2156  llvm::ReplaceInstWithInst(callInst, newCall);
2157  } else {
2158  llvm::Value *storeValue = callInst->getArgOperand(1);
2159  llvm::Value *mask = callInst->getArgOperand(2);
2160 
2161  // Generate a new function call to the next pseudo scatter
2162  // base+offsets instruction. See above for why passing NULL
2163  // for the Instruction * is intended.
2164  llvm::Instruction *newCall = lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, constOffset,
2165  storeValue, mask, "", NULL);
2166  lCopyMetadata(newCall, callInst);
2167  llvm::ReplaceInstWithInst(callInst, newCall);
2168  }
2169  }
2170  return true;
2171 }
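// Editor's sketch of the rewrite performed above (schematic, not literal IR): a fully
// general gather such as
//     __pseudo_gather32_i32(%ptrs, %mask)
// whose per-lane pointers decompose into a shared base plus offsets is turned into
//     __pseudo_gather_base_offsets32_i32(%base, %scale, %offsets, %mask)
// on targets with a native gather, or into the "factored" variant that takes separate
// variable and constant offset operands (plus the scale) on targets without one.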
2172 
2173 /** Try to improve the decomposition between compile-time constant and
2174  compile-time unknown offsets in calls to the __pseudo_*_base_offsets*
2175  functions. After other optimizations have run, we will sometimes be
2176  able to pull more terms out of the unknown part and add them into the
2177  compile-time-known part.
2178  */
2179 static bool lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) {
2180  struct GSBOInfo {
2181  GSBOInfo(const char *pgboFuncName, const char *pgbo32FuncName, bool ig, bool ip)
2182  : isGather(ig), isPrefetch(ip) {
2183  baseOffsetsFunc = m->module->getFunction(pgboFuncName);
2184  baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
2185  }
2186  llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
2187  const bool isGather;
2188  const bool isPrefetch;
2189  };
2190 
2191  GSBOInfo gsFuncs[] = {
2192  GSBOInfo(
2193  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8",
2194  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8",
2195  true, false),
2196  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16"
2197  : "__pseudo_gather_factored_base_offsets32_i16",
2198  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16"
2199  : "__pseudo_gather_factored_base_offsets32_i16",
2200  true, false),
2201  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32"
2202  : "__pseudo_gather_factored_base_offsets32_i32",
2203  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32"
2204  : "__pseudo_gather_factored_base_offsets32_i32",
2205  true, false),
2206  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_float"
2207  : "__pseudo_gather_factored_base_offsets32_float",
2208  g->target->hasGather() ? "__pseudo_gather_base_offsets32_float"
2209  : "__pseudo_gather_factored_base_offsets32_float",
2210  true, false),
2211  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64"
2212  : "__pseudo_gather_factored_base_offsets32_i64",
2213  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64"
2214  : "__pseudo_gather_factored_base_offsets32_i64",
2215  true, false),
2216  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_double"
2217  : "__pseudo_gather_factored_base_offsets32_double",
2218  g->target->hasGather() ? "__pseudo_gather_base_offsets32_double"
2219  : "__pseudo_gather_factored_base_offsets32_double",
2220  true, false),
2221 
2222  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8"
2223  : "__pseudo_scatter_factored_base_offsets32_i8",
2224  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8"
2225  : "__pseudo_scatter_factored_base_offsets32_i8",
2226  false, false),
2227  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16"
2228  : "__pseudo_scatter_factored_base_offsets32_i16",
2229  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16"
2230  : "__pseudo_scatter_factored_base_offsets32_i16",
2231  false, false),
2232  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32"
2233  : "__pseudo_scatter_factored_base_offsets32_i32",
2234  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32"
2235  : "__pseudo_scatter_factored_base_offsets32_i32",
2236  false, false),
2237  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float"
2238  : "__pseudo_scatter_factored_base_offsets32_float",
2239  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float"
2240  : "__pseudo_scatter_factored_base_offsets32_float",
2241  false, false),
2242  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64"
2243  : "__pseudo_scatter_factored_base_offsets32_i64",
2244  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64"
2245  : "__pseudo_scatter_factored_base_offsets32_i64",
2246  false, false),
2247  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double"
2248  : "__pseudo_scatter_factored_base_offsets32_double",
2249  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double"
2250  : "__pseudo_scatter_factored_base_offsets32_double",
2251  false, false),
2252 
2253  GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : "__prefetch_read_varying_1",
2254  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : "__prefetch_read_varying_1",
2255  false, true),
2256 
2257  GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : "__prefetch_read_varying_2",
2258  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : "__prefetch_read_varying_2",
2259  false, true),
2260 
2261  GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : "__prefetch_read_varying_3",
2262  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : "__prefetch_read_varying_3",
2263  false, true),
2264 
2265  GSBOInfo(
2266  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" : "__prefetch_read_varying_nt",
2267  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" : "__prefetch_read_varying_nt",
2268  false, true),
2269  };
2270 
2271  int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
2272  for (int i = 0; i < numGSFuncs; ++i)
2273  Assert(gsFuncs[i].baseOffsetsFunc != NULL && gsFuncs[i].baseOffsets32Func != NULL);
2274 
2275  llvm::Function *calledFunc = callInst->getCalledFunction();
2276  Assert(calledFunc != NULL);
2277 
2278  // Is one of the gather/scatter functions that decompose into
2279  // base+offsets being called?
2280  GSBOInfo *info = NULL;
2281  for (int i = 0; i < numGSFuncs; ++i)
2282  if (calledFunc == gsFuncs[i].baseOffsetsFunc || calledFunc == gsFuncs[i].baseOffsets32Func) {
2283  info = &gsFuncs[i];
2284  break;
2285  }
2286  if (info == NULL)
2287  return false;
2288 
2289  // Grab the old variable offset
2290  llvm::Value *origVariableOffset = callInst->getArgOperand(1);
2291 
2292  // If it's zero, we're done. Don't go and think that we're clever by
2293  // adding these zeros to the constant offsets.
2294  if (llvm::isa<llvm::ConstantAggregateZero>(origVariableOffset))
2295  return false;
2296 
2297  // Try to decompose the old variable offset
2298  llvm::Value *constOffset = NULL;
2299  llvm::Value *variableOffset = NULL;
2300  lExtractConstantOffset(origVariableOffset, &constOffset, &variableOffset, callInst);
2301 
2302  // No luck
2303  if (constOffset == NULL)
2304  return false;
2305 
2306  // Total luck: everything could be moved to the constant offset
2307  if (variableOffset == NULL)
2308  variableOffset = LLVMIntAsType(0, origVariableOffset->getType());
2309 
2310  // We need to scale the value we add to the constant offset by the
2311  // 2/4/8 scale for the variable offset, if present.
2312  llvm::ConstantInt *varScale = llvm::dyn_cast<llvm::ConstantInt>(callInst->getArgOperand(2));
2313  Assert(varScale != NULL);
2314 
2315  llvm::Value *scaleSmear;
2316  if (origVariableOffset->getType() == LLVMTypes::Int64VectorType)
2317  scaleSmear = LLVMInt64Vector((int64_t)varScale->getZExtValue());
2318  else
2319  scaleSmear = LLVMInt32Vector((int32_t)varScale->getZExtValue());
2320 
2321  constOffset =
2322  llvm::BinaryOperator::Create(llvm::Instruction::Mul, constOffset, scaleSmear, constOffset->getName(), callInst);
2323 
2324  // And add the additional offset to the original constant offset
2325  constOffset = llvm::BinaryOperator::Create(llvm::Instruction::Add, constOffset, callInst->getArgOperand(3),
2326  callInst->getArgOperand(3)->getName(), callInst);
2327 
2328  // Finally, update the values of the operands to the gather/scatter
2329  // function.
2330  callInst->setArgOperand(1, variableOffset);
2331  callInst->setArgOperand(3, constOffset);
2332 
2333  return true;
2334 }
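// Editor's example of the effect (schematic): for a base+offsets gather whose variable
// offsets were %v + <16, 16, ...> with scale 4 and constant offsets <8, 8, ...>, the
// splatted 16 is peeled out of the variable part, multiplied by the scale, and folded
// into the constant part, leaving variable offsets of %v and constant offsets that fold
// to <72, 72, ...> (8 + 4*16) once the emitted Mul/Add instructions are constant-folded.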
2335 
2336 static llvm::Value *lComputeCommonPointer(llvm::Value *base, llvm::Value *offsets, llvm::Instruction *insertBefore) {
2337  llvm::Value *firstOffset = LLVMExtractFirstVectorElement(offsets);
2338  return lGEPInst(base, firstOffset, "ptr", insertBefore);
2339 }
2340 
2341 static llvm::Constant *lGetOffsetScaleVec(llvm::Value *offsetScale, llvm::Type *vecType) {
2342  llvm::ConstantInt *offsetScaleInt = llvm::dyn_cast<llvm::ConstantInt>(offsetScale);
2343  Assert(offsetScaleInt != NULL);
2344  uint64_t scaleValue = offsetScaleInt->getZExtValue();
2345 
2346  std::vector<llvm::Constant *> scales;
2347  for (int i = 0; i < g->target->getVectorWidth(); ++i) {
2348  if (vecType == LLVMTypes::Int64VectorType)
2349  scales.push_back(LLVMInt64(scaleValue));
2350  else {
2351  Assert(vecType == LLVMTypes::Int32VectorType);
2352  scales.push_back(LLVMInt32((int32_t)scaleValue));
2353  }
2354  }
2355  return llvm::ConstantVector::get(scales);
2356 }
2357 
2358 /** After earlier optimization passes have run, we are sometimes able to
2359  determine that gathers/scatters are actually accessing memory in a more
2360  regular fashion and then change the operation to something simpler and
2361  more efficient. For example, if all of the lanes in a gather are
2362  reading from the same location, we can instead do a scalar load and
2363  broadcast. This pass examines gathers and scatters and tries to
2364  simplify them if at all possible.
2365 
2366  @todo Currently, this only looks for all program instances going to the
2367  same location and all going to a linear sequence of locations in
2368  memory. There are a number of other cases that might make sense to
2369  look for, including things that could be handled with a vector load +
2370  shuffle or things that could be handled with hybrids of e.g. 2 4-wide
2371  vector loads with AVX, etc.
2372 */
2373 static bool lGSToLoadStore(llvm::CallInst *callInst) {
2374  struct GatherImpInfo {
2375  GatherImpInfo(const char *pName, const char *lmName, llvm::Type *st, int a)
2376  : align(a), isFactored(!g->target->hasGather()) {
2377  pseudoFunc = m->module->getFunction(pName);
2378  loadMaskedFunc = m->module->getFunction(lmName);
2379  Assert(pseudoFunc != NULL && loadMaskedFunc != NULL);
2380  scalarType = st;
2381  }
2382 
2383  llvm::Function *pseudoFunc;
2384  llvm::Function *loadMaskedFunc;
2385  llvm::Type *scalarType;
2386  const int align;
2387  const bool isFactored;
2388  };
2389 
2390  GatherImpInfo gInfo[] = {
2391  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8"
2392  : "__pseudo_gather_factored_base_offsets32_i8",
2393  "__masked_load_i8", LLVMTypes::Int8Type, 1),
2394  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16"
2395  : "__pseudo_gather_factored_base_offsets32_i16",
2396  "__masked_load_i16", LLVMTypes::Int16Type, 2),
2397  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32"
2398  : "__pseudo_gather_factored_base_offsets32_i32",
2399  "__masked_load_i32", LLVMTypes::Int32Type, 4),
2400  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_float"
2401  : "__pseudo_gather_factored_base_offsets32_float",
2402  "__masked_load_float", LLVMTypes::FloatType, 4),
2403  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64"
2404  : "__pseudo_gather_factored_base_offsets32_i64",
2405  "__masked_load_i64", LLVMTypes::Int64Type, 8),
2406  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_double"
2407  : "__pseudo_gather_factored_base_offsets32_double",
2408  "__masked_load_double", LLVMTypes::DoubleType, 8),
2409  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i8"
2410  : "__pseudo_gather_factored_base_offsets64_i8",
2411  "__masked_load_i8", LLVMTypes::Int8Type, 1),
2412  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i16"
2413  : "__pseudo_gather_factored_base_offsets64_i16",
2414  "__masked_load_i16", LLVMTypes::Int16Type, 2),
2415  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i32"
2416  : "__pseudo_gather_factored_base_offsets64_i32",
2417  "__masked_load_i32", LLVMTypes::Int32Type, 4),
2418  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_float"
2419  : "__pseudo_gather_factored_base_offsets64_float",
2420  "__masked_load_float", LLVMTypes::FloatType, 4),
2421  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i64"
2422  : "__pseudo_gather_factored_base_offsets64_i64",
2423  "__masked_load_i64", LLVMTypes::Int64Type, 8),
2424  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_double"
2425  : "__pseudo_gather_factored_base_offsets64_double",
2426  "__masked_load_double", LLVMTypes::DoubleType, 8),
2427  };
2428 
2429  struct ScatterImpInfo {
2430  ScatterImpInfo(const char *pName, const char *msName, llvm::Type *vpt, int a)
2431  : align(a), isFactored(!g->target->hasScatter()) {
2432  pseudoFunc = m->module->getFunction(pName);
2433  maskedStoreFunc = m->module->getFunction(msName);
2434  vecPtrType = vpt;
2435  Assert(pseudoFunc != NULL && maskedStoreFunc != NULL);
2436  }
2437  llvm::Function *pseudoFunc;
2438  llvm::Function *maskedStoreFunc;
2439  llvm::Type *vecPtrType;
2440  const int align;
2441  const bool isFactored;
2442  };
2443 
2444  ScatterImpInfo sInfo[] = {
2445  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8"
2446  : "__pseudo_scatter_factored_base_offsets32_i8",
2447  "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1),
2448  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16"
2449  : "__pseudo_scatter_factored_base_offsets32_i16",
2450  "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2),
2451  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32"
2452  : "__pseudo_scatter_factored_base_offsets32_i32",
2453  "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4),
2454  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float"
2455  : "__pseudo_scatter_factored_base_offsets32_float",
2456  "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4),
2457  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64"
2458  : "__pseudo_scatter_factored_base_offsets32_i64",
2459  "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8),
2460  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double"
2461  : "__pseudo_scatter_factored_base_offsets32_double",
2462  "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8),
2463  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i8"
2464  : "__pseudo_scatter_factored_base_offsets64_i8",
2465  "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1),
2466  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i16"
2467  : "__pseudo_scatter_factored_base_offsets64_i16",
2468  "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2),
2469  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i32"
2470  : "__pseudo_scatter_factored_base_offsets64_i32",
2471  "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4),
2472  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_float"
2473  : "__pseudo_scatter_factored_base_offsets64_float",
2474  "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4),
2475  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i64"
2476  : "__pseudo_scatter_factored_base_offsets64_i64",
2477  "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8),
2478  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_double"
2479  : "__pseudo_scatter_factored_base_offsets64_double",
2480  "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8),
2481  };
2482 
2483  llvm::Function *calledFunc = callInst->getCalledFunction();
2484 
2485  GatherImpInfo *gatherInfo = NULL;
2486  ScatterImpInfo *scatterInfo = NULL;
2487  for (unsigned int i = 0; i < sizeof(gInfo) / sizeof(gInfo[0]); ++i) {
2488  if (gInfo[i].pseudoFunc != NULL && calledFunc == gInfo[i].pseudoFunc) {
2489  gatherInfo = &gInfo[i];
2490  break;
2491  }
2492  }
2493  for (unsigned int i = 0; i < sizeof(sInfo) / sizeof(sInfo[0]); ++i) {
2494  if (sInfo[i].pseudoFunc != NULL && calledFunc == sInfo[i].pseudoFunc) {
2495  scatterInfo = &sInfo[i];
2496  break;
2497  }
2498  }
2499  if (gatherInfo == NULL && scatterInfo == NULL)
2500  return false;
2501 
2502  SourcePos pos;
2503  lGetSourcePosFromMetadata(callInst, &pos);
2504 
2505  llvm::Value *base = callInst->getArgOperand(0);
2506  llvm::Value *fullOffsets = NULL;
2507  llvm::Value *storeValue = NULL;
2508  llvm::Value *mask = NULL;
2509 
2510  if ((gatherInfo != NULL && gatherInfo->isFactored) || (scatterInfo != NULL && scatterInfo->isFactored)) {
2511  llvm::Value *varyingOffsets = callInst->getArgOperand(1);
2512  llvm::Value *offsetScale = callInst->getArgOperand(2);
2513  llvm::Value *constOffsets = callInst->getArgOperand(3);
2514  if (scatterInfo)
2515  storeValue = callInst->getArgOperand(4);
2516  mask = callInst->getArgOperand((gatherInfo != NULL) ? 4 : 5);
2517 
2518  // Compute the full offset vector: offsetScale * varyingOffsets + constOffsets
2519  llvm::Constant *offsetScaleVec = lGetOffsetScaleVec(offsetScale, varyingOffsets->getType());
2520 
2521  llvm::Value *scaledVarying = llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec,
2522  varyingOffsets, "scaled_varying", callInst);
2523  fullOffsets = llvm::BinaryOperator::Create(llvm::Instruction::Add, scaledVarying, constOffsets,
2524  "varying+const_offsets", callInst);
2525  } else {
2526  if (scatterInfo)
2527  storeValue = callInst->getArgOperand(3);
2528  mask = callInst->getArgOperand((gatherInfo != NULL) ? 3 : 4);
2529 
2530  llvm::Value *offsetScale = callInst->getArgOperand(1);
2531  llvm::Value *offsets = callInst->getArgOperand(2);
2532  llvm::Value *offsetScaleVec = lGetOffsetScaleVec(offsetScale, offsets->getType());
2533 
2534  fullOffsets =
2535  llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec, offsets, "scaled_offsets", callInst);
2536  }
2537 
2538  Debug(SourcePos(), "GSToLoadStore: %s.", fullOffsets->getName().str().c_str());
2539 
2540  if (LLVMVectorValuesAllEqual(fullOffsets)) {
2541  // If all the offsets are equal, then compute the single
2542  // pointer they all represent based on the first one of them
2543  // (arbitrarily).
2544  llvm::Value *ptr = lComputeCommonPointer(base, fullOffsets, callInst);
2545  lCopyMetadata(ptr, callInst);
2546 
2547  if (gatherInfo != NULL) {
2548  // A gather with everyone going to the same location is
2549  // handled as a scalar load and broadcast across the lanes.
2550  Debug(pos, "Transformed gather to scalar load and broadcast!");
2551 
2552  ptr =
2553  new llvm::BitCastInst(ptr, llvm::PointerType::get(gatherInfo->scalarType, 0), ptr->getName(), callInst);
2554  llvm::Value *scalarValue = new llvm::LoadInst(ptr, callInst->getName(), callInst);
2555 
2556  // Generate the following sequence:
2557  // %name123 = insertelement <4 x i32> undef, i32 %val, i32 0
2558  // %name124 = shufflevector <4 x i32> %name123, <4 x i32> undef,
2559  // <4 x i32> zeroinitializer
2560  llvm::Value *undef1Value = llvm::UndefValue::get(callInst->getType());
2561  llvm::Value *undef2Value = llvm::UndefValue::get(callInst->getType());
2562  llvm::Value *insertVec =
2563  llvm::InsertElementInst::Create(undef1Value, scalarValue, LLVMInt32(0), callInst->getName(), callInst);
2564  llvm::Value *zeroMask =
2565 #if ISPC_LLVM_VERSION < ISPC_LLVM_11_0
2566  llvm::ConstantVector::getSplat(callInst->getType()->getVectorNumElements(),
2567 #else // LLVM 11.0+
2568  llvm::ConstantVector::getSplat({callInst->getType()->getVectorNumElements(), false},
2569 #endif
2570  llvm::Constant::getNullValue(llvm::Type::getInt32Ty(*g->ctx)));
2571  llvm::Value *shufValue = new llvm::ShuffleVectorInst(insertVec, undef2Value, zeroMask, callInst->getName());
2572 
2573  lCopyMetadata(shufValue, callInst);
2574  llvm::ReplaceInstWithInst(callInst, llvm::dyn_cast<llvm::Instruction>(shufValue));
2575  return true;
2576  } else {
2577  // A scatter with everyone going to the same location is
2578  // undefined (if there's more than one program instance in
2579  // the gang). Issue a warning.
2580  if (g->target->getVectorWidth() > 1)
2581  Warning(pos, "Undefined behavior: all program instances are "
2582  "writing to the same location!");
2583 
2584  // We could do something similar to the gather case, where
2585  // we arbitrarily write one of the values, but we need to
2586  // a) check to be sure the mask isn't all off and b) pick
2587  // the value from an executing program instance in that
2588  // case. We'll just let a bunch of the program instances
2589  // do redundant writes, since this isn't important to make
2590  // fast anyway...
2591  return false;
2592  }
2593  } else {
2594  int step = gatherInfo ? gatherInfo->align : scatterInfo->align;
2595 
2596  if (step > 0 && LLVMVectorIsLinear(fullOffsets, step)) {
2597  // We have a linear sequence of memory locations being accessed
2598  // starting with the location given by the offset from
2599  // offsetElements[0], with a stride of 4 or 8 bytes (for 32 bit
2600  // and 64 bit gather/scatters, respectively).
2601  llvm::Value *ptr = lComputeCommonPointer(base, fullOffsets, callInst);
2602  lCopyMetadata(ptr, callInst);
2603 
2604  if (gatherInfo != NULL) {
2605  Debug(pos, "Transformed gather to unaligned vector load!");
2606  llvm::Instruction *newCall =
2607  lCallInst(gatherInfo->loadMaskedFunc, ptr, mask, LLVMGetName(ptr, "_masked_load"));
2608  lCopyMetadata(newCall, callInst);
2609  llvm::ReplaceInstWithInst(callInst, newCall);
2610  return true;
2611  } else {
2612  Debug(pos, "Transformed scatter to unaligned vector store!");
2613  ptr = new llvm::BitCastInst(ptr, scatterInfo->vecPtrType, "ptrcast", callInst);
2614  llvm::Instruction *newCall = lCallInst(scatterInfo->maskedStoreFunc, ptr, storeValue, mask, "");
2615  lCopyMetadata(newCall, callInst);
2616  llvm::ReplaceInstWithInst(callInst, newCall);
2617  return true;
2618  }
2619  }
2620  return false;
2621  }
2622 }
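// Editor's summary of the cases handled above (illustrative): if every lane ends up
// addressing the same location, a gather becomes a scalar load followed by an
// insertelement/shufflevector broadcast (an all-lanes-to-one-location scatter is only
// warned about, not transformed); if the lanes address a linear run of elements, a
// gather becomes a single __masked_load_* call and a scatter becomes a single
// __pseudo_masked_store_* call through a pointer to the first element; anything else
// is left as a true gather/scatter.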
2623 
2624 ///////////////////////////////////////////////////////////////////////////
2625 // MaskedStoreOptPass
2626 
2627 /** Masked stores are generally more complex than regular stores; for
2628  example, they require multiple instructions to simulate under SSE.
2629  This optimization detects cases where masked stores can be replaced
2630  with regular stores or removed entirely, for the cases of an 'all on'
2631  mask and an 'all off' mask, respectively.
2632 */
2633 static bool lImproveMaskedStore(llvm::CallInst *callInst) {
2634  struct MSInfo {
2635  MSInfo(const char *name, const int a) : align(a) {
2636  func = m->module->getFunction(name);
2637  Assert(func != NULL);
2638  }
2639  llvm::Function *func;
2640  const int align;
2641  };
2642 
2643  MSInfo msInfo[] = {MSInfo("__pseudo_masked_store_i8", 1), MSInfo("__pseudo_masked_store_i16", 2),
2644  MSInfo("__pseudo_masked_store_i32", 4), MSInfo("__pseudo_masked_store_float", 4),
2645  MSInfo("__pseudo_masked_store_i64", 8), MSInfo("__pseudo_masked_store_double", 8),
2646  MSInfo("__masked_store_blend_i8", 1), MSInfo("__masked_store_blend_i16", 2),
2647  MSInfo("__masked_store_blend_i32", 4), MSInfo("__masked_store_blend_float", 4),
2648  MSInfo("__masked_store_blend_i64", 8), MSInfo("__masked_store_blend_double", 8),
2649  MSInfo("__masked_store_i8", 1), MSInfo("__masked_store_i16", 2),
2650  MSInfo("__masked_store_i32", 4), MSInfo("__masked_store_float", 4),
2651  MSInfo("__masked_store_i64", 8), MSInfo("__masked_store_double", 8)};
2652 
2653  llvm::Function *called = callInst->getCalledFunction();
2654 
2655  int nMSFuncs = sizeof(msInfo) / sizeof(msInfo[0]);
2656  MSInfo *info = NULL;
2657  for (int i = 0; i < nMSFuncs; ++i) {
2658  if (msInfo[i].func != NULL && called == msInfo[i].func) {
2659  info = &msInfo[i];
2660  break;
2661  }
2662  }
2663  if (info == NULL)
2664  return false;
2665 
2666  // Got one; grab the operands
2667  llvm::Value *lvalue = callInst->getArgOperand(0);
2668  llvm::Value *rvalue = callInst->getArgOperand(1);
2669  llvm::Value *mask = callInst->getArgOperand(2);
2670 
2671  MaskStatus maskStatus = lGetMaskStatus(mask);
2672  if (maskStatus == ALL_OFF) {
2673  // Zero mask - no-op, so remove the store completely. (This
2674  // may in turn lead to being able to optimize out instructions
2675  // that compute the rvalue...)
2676  callInst->eraseFromParent();
2677  return true;
2678  } else if (maskStatus == ALL_ON) {
2679  // The mask is all on, so turn this into a regular store
2680  llvm::Type *rvalueType = rvalue->getType();
2681  llvm::Type *ptrType = llvm::PointerType::get(rvalueType, 0);
2682 
2683  lvalue = new llvm::BitCastInst(lvalue, ptrType, "lvalue_to_ptr_type", callInst);
2684  lCopyMetadata(lvalue, callInst);
2685  llvm::Instruction *store =
2686  new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
2687 #if ISPC_LLVM_VERSION <= ISPC_LLVM_9_0
2688  g->opt.forceAlignedMemory ? g->target->getNativeVectorAlignment() : info->align);
2689 #else // LLVM 10.0+
2690  llvm::MaybeAlign(g->opt.forceAlignedMemory ? g->target->getNativeVectorAlignment()
2691  : info->align));
2692 #endif
2693  lCopyMetadata(store, callInst);
2694  llvm::ReplaceInstWithInst(callInst, store);
2695  return true;
2696  }
2697 
2698  return false;
2699 }
2700 
2701 static bool lImproveMaskedLoad(llvm::CallInst *callInst, llvm::BasicBlock::iterator iter) {
2702  struct MLInfo {
2703  MLInfo(const char *name, const int a) : align(a) {
2704  func = m->module->getFunction(name);
2705  Assert(func != NULL);
2706  }
2707  llvm::Function *func;
2708  const int align;
2709  };
2710 
2711  MLInfo mlInfo[] = {MLInfo("__masked_load_i8", 1), MLInfo("__masked_load_i16", 2),
2712  MLInfo("__masked_load_i32", 4), MLInfo("__masked_load_float", 4),
2713  MLInfo("__masked_load_i64", 8), MLInfo("__masked_load_double", 8)};
2714 
2715  llvm::Function *called = callInst->getCalledFunction();
2716 
2717  int nFuncs = sizeof(mlInfo) / sizeof(mlInfo[0]);
2718  MLInfo *info = NULL;
2719  for (int i = 0; i < nFuncs; ++i) {
2720  if (mlInfo[i].func != NULL && called == mlInfo[i].func) {
2721  info = &mlInfo[i];
2722  break;
2723  }
2724  }
2725  if (info == NULL)
2726  return false;
2727 
2728  // Got one; grab the operands
2729  llvm::Value *ptr = callInst->getArgOperand(0);
2730  llvm::Value *mask = callInst->getArgOperand(1);
2731 
2732  MaskStatus maskStatus = lGetMaskStatus(mask);
2733  if (maskStatus == ALL_OFF) {
2734  // Zero mask - no-op, so replace the load with an undef value
2735  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, llvm::UndefValue::get(callInst->getType()));
2736  return true;
2737  } else if (maskStatus == ALL_ON) {
2738  // The mask is all on, so turn this into a regular load
2739  llvm::Type *ptrType = llvm::PointerType::get(callInst->getType(), 0);
2740  ptr = new llvm::BitCastInst(ptr, ptrType, "ptr_cast_for_load", callInst);
2741  llvm::Instruction *load = new llvm::LoadInst(
2742  ptr, callInst->getName(), false /* not volatile */,
2743 #if ISPC_LLVM_VERSION <= ISPC_LLVM_9_0
2744  g->opt.forceAlignedMemory ? g->target->getNativeVectorAlignment() : info->align, (llvm::Instruction *)NULL);
2745 #else // LLVM 10.0+
2746  llvm::MaybeAlign(g->opt.forceAlignedMemory ? g->target->getNativeVectorAlignment() : info->align),
2747  (llvm::Instruction *)NULL);
2748 #endif
2749  lCopyMetadata(load, callInst);
2750  llvm::ReplaceInstWithInst(callInst, load);
2751  return true;
2752  } else
2753  return false;
2754 }
2755 
2756 bool ImproveMemoryOpsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
2757  DEBUG_START_PASS("ImproveMemoryOps");
2758 
2759  bool modifiedAny = false;
2760 restart:
2761  // Iterate through all of the instructions in the basic block.
2762  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
2763  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
2764  // If we don't have a call to one of the
2765  // __pseudo_{gather,scatter}_* functions, then just go on to the
2766  // next instruction.
2767  if (callInst == NULL || callInst->getCalledFunction() == NULL)
2768  continue;
2769 
2770  if (lGSToGSBaseOffsets(callInst)) {
2771  modifiedAny = true;
2772  goto restart;
2773  }
2774  if (lGSBaseOffsetsGetMoreConst(callInst)) {
2775  modifiedAny = true;
2776  goto restart;
2777  }
2778  if (lGSToLoadStore(callInst)) {
2779  modifiedAny = true;
2780  goto restart;
2781  }
2782  if (lImproveMaskedStore(callInst)) {
2783  modifiedAny = true;
2784  goto restart;
2785  }
2786  if (lImproveMaskedLoad(callInst, iter)) {
2787  modifiedAny = true;
2788  goto restart;
2789  }
2790  }
2791 
2792  DEBUG_END_PASS("ImproveMemoryOps");
2793 
2794  return modifiedAny;
2795 }
2796 
2797 bool ImproveMemoryOpsPass::runOnFunction(llvm::Function &F) {
2798 
2799  bool modifiedAny = false;
2800  for (llvm::BasicBlock &BB : F) {
2801  modifiedAny |= runOnBasicBlock(BB);
2802  }
2803  return modifiedAny;
2804 }
2805 
2806 static llvm::Pass *CreateImproveMemoryOpsPass() { return new ImproveMemoryOpsPass; }
2807 
2808 ///////////////////////////////////////////////////////////////////////////
2809 // GatherCoalescePass
2810 
2811 // This pass implements two optimizations to improve the performance of
2812 // gathers; currently only gathers of 32-bit values where it can be
2813 // determined at compile time that the mask is all on are supported, though
2814 // both of those limitations may be generalized in the future.
2815 //
2816 // First, for any single gather, see if it's worthwhile to break it into
2817 // any of scalar, 2-wide (i.e. 64-bit), 4-wide, or 8-wide loads. Further,
2818 // we generate code that shuffles the loaded values into the gather's result.
2819 // Doing fewer, larger loads in this manner, when possible, can be more efficient.
2820 //
2821 // Second, this pass can coalesce memory accesses across multiple
2822 // gathers. If we have a series of gathers without any memory writes in
2823 // the middle, then we try to analyze their reads collectively and choose
2824 // an efficient set of loads for them. Not only does this help if
2825 // different gathers reuse values from the same location in memory, but
2826 // it's specifically helpful when data with AOS layout is being accessed;
2827 // in this case, we're often able to generate wide vector loads and
2828 // appropriate shuffles automatically.
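// Illustrative sketch (hypothetical example, not taken from any ispc program):
// given an AOS layout like
//     struct Pt { float x, y, z; };
// a group of gathers reading pts[idx].x, pts[idx].y, and pts[idx].z, where the
// varying part of the offset happens to be equal in every lane, shares one base
// pointer and differs only in the constant byte offsets (0, 4, 8). The pass
// below can then cover all three with a couple of wide vector loads plus
// shuffles instead of three separate gathers.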
2829 
2830 class GatherCoalescePass : public llvm::FunctionPass {
2831  public:
2832  static char ID;
2833  GatherCoalescePass() : FunctionPass(ID) {}
2834 
2835  llvm::StringRef getPassName() const { return "Gather Coalescing"; }
2836  bool runOnBasicBlock(llvm::BasicBlock &BB);
2837  bool runOnFunction(llvm::Function &F);
2838 };
2839 
2840 char GatherCoalescePass::ID = 0;
2841 
2842 /** Representation of a memory load that the gather coalescing code has
2843  decided to generate.
2844  */
2845 struct CoalescedLoadOp {
2846  CoalescedLoadOp(int64_t s, int c) {
2847  start = s;
2848  count = c;
2849  load = element0 = element1 = NULL;
2850  }
2851 
2852  /** Starting offset of the load from the common base pointer (in terms
2853  of numbers of items of the underlying element type--*not* in terms
2854  of bytes). */
2855  int64_t start;
2856 
2857  /** Number of elements to load at this location */
2858  int count;
2859 
2860  /** Value loaded from memory for this load op */
2861  llvm::Value *load;
2862 
2863  /** For 2-wide loads (i.e. 64-bit loads), these store the lower and
2864  upper 32 bits of the result, respectively. */
2865  llvm::Value *element0, *element1;
2866 };
2867 
2868 /** This function determines whether it makes sense (and is safe) to
2869  generate a vector load of width vectorWidth, starting at *iter. It
2870  returns true if so, setting *newIter to point to the next element in
2871  the set that isn't taken care of by the generated load. If a vector
2872  load of the given width doesn't make sense, then false is returned.
2873  */
2874 static bool lVectorLoadIsEfficient(std::set<int64_t>::iterator iter, std::set<int64_t>::iterator end,
2875  std::set<int64_t>::iterator *newIter, int vectorWidth) {
2876  // We're considering a vector load of width vectorWidth, starting at
2877  // the offset "start".
2878  int64_t start = *iter;
2879 
2880  // The basic idea is that we'll look at the subsequent elements in the
2881  // load set after the initial one at start. As long as subsequent
2882  // elements:
2883  //
2884  // 1. Aren't so far separated that they no longer fit into the range
2885  // [start, start+vectorWidth)
2886  //
2887  // 2. And don't have too large a gap in between them (e.g., it's not
2888  // worth generating an 8-wide load for two elements with offsets 0
2889  // and 7, but no loads requested in between).
2890  //
2891  // Then we continue moving forward through the elements until we either
2892  // fill up the vector or run out of elements.
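    //
    // Illustrative sketch (hypothetical offsets): for the offset set {0, 1, 2, 3}
    // and vectorWidth == 4, the span grows to exactly 4 and we accept the load;
    // for {0, 7} the gap between consecutive offsets exceeds 3, so we give up
    // here and fall back to scalar loads.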
2893 
2894  // lastAccepted holds the last offset we've processed and accepted as
2895  // valid for the vector load under consideration.
2896  int64_t lastAccepted = start;
2897 
2898  while (iter != end) {
2899  // What is the separation in offset values from the last element we
2900  // added to the set for this load?
2901  int64_t delta = *iter - lastAccepted;
2902  if (delta > 3)
2903  // If there's too big a gap, then we won't issue the load
2904  return false;
2905 
2906  int64_t span = *iter - start + 1;
2907 
2908  if (span == vectorWidth) {
2909  // We've extended far enough that we have exactly filled up the
2910  // entire vector width; we can't go any further, so return with
2911  // success. (Update *newIter to point at the next element
2912  // after the last one accepted here.)
2913  *newIter = ++iter;
2914  return true;
2915  } else if (span > vectorWidth) {
2916  // The current offset won't fit into a vectorWidth-wide load
2917  // starting from start. It's still generally worthwhile
2918  // issuing the load we've been considering, though, since it
2919  // will provide values for a number of previous offsets. This
2920  // load will have one or more elements at the end of its range
2921  // that are not needed by any of the offsets under
2922  // consideration. As such, there are three cases where issuing
2923  // this load is a bad idea:
2924  //
2925  // 1. 2-wide loads: we know that we haven't completely filled
2926  // the 2-wide vector, since otherwise the if() test above
2927  // would have succeeded previously. Therefore, we must have
2928  // a situation with offsets like (4,6,...); it would be a
2929  // silly idea to issue a 2-wide load to get the value for
2930  // the 4 offset, versus failing here and issuing a scalar
2931  // load instead.
2932  //
2933  // 2. If there are too many unnecessary values at the end of
2934  // the load extent (defined as more than half of them)--in
2935  // this case, it'd be better to issue a vector load of
2936  // smaller width anyway.
2937  //
2938  // 3. If the gap between the last accepted offset and the
2939  // current one under consideration is more than the page
2940  // size. In this case we can't be sure whether or not some
2941  // of the unused elements at the end of the load will
2942  // straddle a page boundary and thus lead to an undesirable
2943  // fault. (It's hard to imagine this happening in practice,
2944  // except under contrived circumstances, but better safe
2945  // than sorry.)
2946  const int pageSize = 4096;
2947  if (vectorWidth != 2 && (lastAccepted - start) > (vectorWidth / 2) && (*iter - lastAccepted) < pageSize) {
2948  *newIter = iter;
2949  return true;
2950  } else
2951  return false;
2952  }
2953 
2954  // Continue moving forward
2955  lastAccepted = *iter;
2956  ++iter;
2957  }
2958 
2959  return false;
2960 }
2961 
2962 /** Given a set of offsets from a common base pointer whose values we need to
2963  load from memory, determine a reasonable set of load operations that
2964  gets all of the corresponding values loaded (ideally, using as many
2965  wide vector loads as possible rather than scalar loads). Return
2966  a CoalescedLoadOp for each one in the *loads array.
2967  */
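// Illustrative sketch (hypothetical offsets): for requested offsets
// {0, 1, 2, 3, 10, 11}, the greedy selection below typically emits one 4-wide
// load at offset 0 and one 2-wide load at offset 10, rather than six scalar
// loads.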
2968 static void lSelectLoads(const std::vector<int64_t> &loadOffsets, std::vector<CoalescedLoadOp> *loads) {
2969  // First, get a sorted set of unique offsets to load from.
2970  std::set<int64_t> allOffsets;
2971  for (unsigned int i = 0; i < loadOffsets.size(); ++i)
2972  allOffsets.insert(loadOffsets[i]);
2973 
2974  std::set<int64_t>::iterator iter = allOffsets.begin();
2975  while (iter != allOffsets.end()) {
2976  Debug(SourcePos(), "Load needed at %" PRId64 ".", *iter);
2977  ++iter;
2978  }
2979 
2980  // Now, iterate over the offsets from low to high. Starting at the
2981  // current offset, we see if a vector load starting from that offset
2982  // will cover loads at subsequent offsets as well.
2983  iter = allOffsets.begin();
2984  while (iter != allOffsets.end()) {
2985  // Consider vector loads with each of the widths in
2986  // vectorWidths[], in order.
2987  int vectorWidths[] = {8, 4, 2};
2988  int nVectorWidths = sizeof(vectorWidths) / sizeof(vectorWidths[0]);
2989  bool gotOne = false;
2990  for (int i = 0; i < nVectorWidths; ++i) {
2991  // See if a load of vector with width vectorWidths[i] would be
2992  // effective (i.e. would cover a reasonable number of the
2993  // offsets that need to be loaded from).
2994  std::set<int64_t>::iterator newIter;
2995  if (lVectorLoadIsEfficient(iter, allOffsets.end(), &newIter, vectorWidths[i])) {
2996  // Yes: create the corresponding coalesced load and update
2997  // the iterator to the returned iterator; doing so skips
2998  // over the additional offsets that are taken care of by
2999  // this load.
3000  loads->push_back(CoalescedLoadOp(*iter, vectorWidths[i]));
3001  iter = newIter;
3002  gotOne = true;
3003  break;
3004  }
3005  }
3006 
3007  if (gotOne == false) {
3008  // We couldn't find a vector load starting from this offset
3009  // that made sense, so emit a scalar load and continue onward.
3010  loads->push_back(CoalescedLoadOp(*iter, 1));
3011  ++iter;
3012  }
3013  }
3014 }
3015 
3016 /** Print a performance message with the details of the result of
3017  coalescing over a group of gathers. */
3018 static void lCoalescePerfInfo(const std::vector<llvm::CallInst *> &coalesceGroup,
3019  const std::vector<CoalescedLoadOp> &loadOps) {
3020  SourcePos pos;
3021  lGetSourcePosFromMetadata(coalesceGroup[0], &pos);
3022 
3023  // Create a string that indicates the line numbers of the subsequent
3024  // gathers from the first one that were coalesced here.
3025  char otherPositions[512];
3026  otherPositions[0] = '\0';
3027  if (coalesceGroup.size() > 1) {
3028  const char *plural = (coalesceGroup.size() > 2) ? "s" : "";
3029  char otherBuf[32];
3030  snprintf(otherBuf, sizeof(otherBuf), "(other%s at line%s ", plural, plural);
3031  strncat(otherPositions, otherBuf, sizeof(otherPositions) - strlen(otherPositions) - 1);
3032 
3033  for (int i = 1; i < (int)coalesceGroup.size(); ++i) {
3034  SourcePos p;
3035  bool ok = lGetSourcePosFromMetadata(coalesceGroup[i], &p);
3036  if (ok) {
3037  char buf[32];
3038  snprintf(buf, sizeof(buf), "%d", p.first_line);
3039  strncat(otherPositions, buf, sizeof(otherPositions) - strlen(otherPositions) - 1);
3040  if (i < (int)coalesceGroup.size() - 1)
3041  strncat(otherPositions, ", ", sizeof(otherPositions) - strlen(otherPositions) - 1);
3042  }
3043  }
3044  strncat(otherPositions, ") ", sizeof(otherPositions) - strlen(otherPositions) - 1);
3045  }
3046 
3047  // Count how many loads of each size there were.
3048  std::map<int, int> loadOpsCount;
3049  for (int i = 0; i < (int)loadOps.size(); ++i)
3050  ++loadOpsCount[loadOps[i].count];
3051 
3052  // Generate a string that describes the mix of load ops
3053  char loadOpsInfo[512];
3054  loadOpsInfo[0] = '\0';
3055  std::map<int, int>::const_iterator iter = loadOpsCount.begin();
3056  while (iter != loadOpsCount.end()) {
3057  char buf[32];
3058  snprintf(buf, sizeof(buf), "%d x %d-wide", iter->second, iter->first);
3059  if ((strlen(loadOpsInfo) + strlen(buf)) >= 512) {
3060  break;
3061  }
3062  strncat(loadOpsInfo, buf, sizeof(loadOpsInfo) - strlen(loadOpsInfo) - 1);
3063  ++iter;
3064  if (iter != loadOpsCount.end())
3065  strncat(loadOpsInfo, ", ", sizeof(loadOpsInfo) - strlen(loadOpsInfo) - 1);
3066  }
3067 
3068  if (g->opt.level > 0) {
3069  if (coalesceGroup.size() == 1)
3070  PerformanceWarning(pos, "Coalesced gather into %d load%s (%s).", (int)loadOps.size(),
3071  (loadOps.size() > 1) ? "s" : "", loadOpsInfo);
3072  else
3073  PerformanceWarning(pos,
3074  "Coalesced %d gathers starting here %sinto %d "
3075  "load%s (%s).",
3076  (int)coalesceGroup.size(), otherPositions, (int)loadOps.size(),
3077  (loadOps.size() > 1) ? "s" : "", loadOpsInfo);
3078  }
3079 }
3080 
3081 /** Utility routine that computes an offset from a base pointer and then
3082  returns the result of a load of the given type from the resulting
3083  location:
3084 
3085  return *((type *)(basePtr + offset))
3086  */
3087 llvm::Value *lGEPAndLoad(llvm::Value *basePtr, int64_t offset, int align, llvm::Instruction *insertBefore,
3088  llvm::Type *type) {
3089  llvm::Value *ptr = lGEPInst(basePtr, LLVMInt64(offset), "new_base", insertBefore);
3090  ptr = new llvm::BitCastInst(ptr, llvm::PointerType::get(type, 0), "ptr_cast", insertBefore);
3091 #if ISPC_LLVM_VERSION <= ISPC_LLVM_9_0
3092  return new llvm::LoadInst(ptr, "gather_load", false /* not volatile */, align, insertBefore);
3093 #else // LLVM 10.0+
3094  return new llvm::LoadInst(ptr, "gather_load", false /* not volatile */, llvm::MaybeAlign(align), insertBefore);
3095 #endif
3096 }
3097 
3098 /* Having decided that we're going to emit a series of loads, as encoded in
3099  the loadOps array, this function emits the corresponding load
3100  instructions.
3101  */
3102 static void lEmitLoads(llvm::Value *basePtr, std::vector<CoalescedLoadOp> &loadOps, int elementSize,
3103  llvm::Instruction *insertBefore) {
3104  Debug(SourcePos(), "Coalesce doing %d loads.", (int)loadOps.size());
3105  for (int i = 0; i < (int)loadOps.size(); ++i) {
3106  Debug(SourcePos(), "Load #%d @ %" PRId64 ", %d items", i, loadOps[i].start, loadOps[i].count);
3107 
3108  // basePtr is an i8 *, so the offset from it should be in terms of
3109  // bytes, not underlying i32 elements.
3110  int64_t start = loadOps[i].start * elementSize;
3111 
3112  int align = 4;
3113  switch (loadOps[i].count) {
3114  case 1:
3115  // Single 32-bit scalar load
3116  loadOps[i].load = lGEPAndLoad(basePtr, start, align, insertBefore, LLVMTypes::Int32Type);
3117  break;
3118  case 2: {
3119  // Emit 2 x i32 loads as i64 loads and then break the result
3120  // into two 32-bit parts.
3121  loadOps[i].load = lGEPAndLoad(basePtr, start, align, insertBefore, LLVMTypes::Int64Type);
3122  // element0 = (int32)value;
3123  loadOps[i].element0 =
3124  new llvm::TruncInst(loadOps[i].load, LLVMTypes::Int32Type, "load64_elt0", insertBefore);
3125  // element1 = (int32)(value >> 32)
3126  llvm::Value *shift = llvm::BinaryOperator::Create(llvm::Instruction::LShr, loadOps[i].load, LLVMInt64(32),
3127  "load64_shift", insertBefore);
3128  loadOps[i].element1 = new llvm::TruncInst(shift, LLVMTypes::Int32Type, "load64_elt1", insertBefore);
3129  break;
3130  }
3131  case 4: {
3132  // 4-wide vector load
3133  if (g->opt.forceAlignedMemory) {
3134  align = g->target->getNativeVectorAlignment();
3135  }
3136  llvm::VectorType *vt = llvm::VectorType::get(LLVMTypes::Int32Type, 4);
3137  loadOps[i].load = lGEPAndLoad(basePtr, start, align, insertBefore, vt);
3138  break;
3139  }
3140  case 8: {
3141  // 8-wide vector load
3142  if (g->opt.forceAlignedMemory) {
3143  align = g->target->getNativeVectorAlignment();
3144  }
3145  llvm::VectorType *vt = llvm::VectorType::get(LLVMTypes::Int32Type, 8);
3146  loadOps[i].load = lGEPAndLoad(basePtr, start, align, insertBefore, vt);
3147  break;
3148  }
3149  default:
3150  FATAL("Unexpected load count in lEmitLoads()");
3151  }
3152  }
3153 }
3154 
3155 /** Convert any loads of 8-wide vectors into two 4-wide vectors
3156  (logically). This allows the result-assembly code below to always operate on
3157  4-wide vectors, which leads to better code. Returns a new vector of
3158  load operations.
3159  */
3160 static std::vector<CoalescedLoadOp> lSplit8WideLoads(const std::vector<CoalescedLoadOp> &loadOps,
3161  llvm::Instruction *insertBefore) {
3162  std::vector<CoalescedLoadOp> ret;
3163  for (unsigned int i = 0; i < loadOps.size(); ++i) {
3164  if (loadOps[i].count == 8) {
3165  // Create fake CoalescedLoadOps, where the load llvm::Value is
3166  // actually a shuffle that pulls either the first 4 or the last
3167  // 4 values out of the original 8-wide loaded value.
3168  int32_t shuf[2][4] = {{0, 1, 2, 3}, {4, 5, 6, 7}};
3169 
3170  ret.push_back(CoalescedLoadOp(loadOps[i].start, 4));
3171  ret.back().load = LLVMShuffleVectors(loadOps[i].load, loadOps[i].load, shuf[0], 4, insertBefore);
3172 
3173  ret.push_back(CoalescedLoadOp(loadOps[i].start + 4, 4));
3174  ret.back().load = LLVMShuffleVectors(loadOps[i].load, loadOps[i].load, shuf[1], 4, insertBefore);
3175  } else
3176  ret.push_back(loadOps[i]);
3177  }
3178 
3179  return ret;
3180 }
3181 
3182 /** Given a 1-wide load of a 32-bit value, merge its value into the result
3183  vector for any and all elements for which it applies.
3184  */
3185 static llvm::Value *lApplyLoad1(llvm::Value *result, const CoalescedLoadOp &load, const int64_t offsets[4], bool set[4],
3186  llvm::Instruction *insertBefore) {
3187  for (int elt = 0; elt < 4; ++elt) {
3188  if (offsets[elt] >= load.start && offsets[elt] < load.start + load.count) {
3189  Debug(SourcePos(),
3190  "Load 1 @ %" PRId64 " matches for element #%d "
3191  "(value %" PRId64 ")",
3192  load.start, elt, offsets[elt]);
3193  // If this load gives one of the values that we need, then we
3194  // can just insert it in directly
3195  Assert(set[elt] == false);
3196  result = llvm::InsertElementInst::Create(result, load.load, LLVMInt32(elt), "insert_load", insertBefore);
3197  set[elt] = true;
3198  }
3199  }
3200 
3201  return result;
3202 }
3203 
3204 /** Similarly, incorporate the values from a 2-wide load into any vector
3205  elements that they apply to. */
3206 static llvm::Value *lApplyLoad2(llvm::Value *result, const CoalescedLoadOp &load, const int64_t offsets[4], bool set[4],
3207  llvm::Instruction *insertBefore) {
3208  int elt = 0;
3209  while (elt < 4) {
3210  // First, try to do a 64-bit-wide insert into the result vector.
3211  // We can do this when we're currently at an even element, when the
3212  // current and next element have consecutive values, and where the
3213  // original 64-bit load is at the offset needed by the current
3214  // element.
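        // Illustrative sketch (hypothetical values): with offsets = {4, 5, 9, 2}
        // and load.start == 4, elements 0 and 1 are filled by a single 64-bit
        // insert (bitcast the <4 x i32> result to <2 x i64>, insert, bitcast
        // back), which is what this branch does.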
3215  if ((elt & 1) == 0 && offsets[elt] + 1 == offsets[elt + 1] && offsets[elt] == load.start) {
3216  Debug(SourcePos(),
3217  "Load 2 @ %" PRId64 " matches for elements #%d,%d "
3218  "(values %" PRId64 ",%" PRId64 ")",
3219  load.start, elt, elt + 1, offsets[elt], offsets[elt + 1]);
3220  Assert(set[elt] == false && ((elt < 3) && set[elt + 1] == false));
3221 
3222  // In this case, we bitcast from a 4xi32 to a 2xi64 vector
3223  llvm::Type *vec2x64Type = llvm::VectorType::get(LLVMTypes::Int64Type, 2);
3224  result = new llvm::BitCastInst(result, vec2x64Type, "to2x64", insertBefore);
3225 
3226  // And now we can insert the 64-bit wide value into the
3227  // appropriate element
3228  result = llvm::InsertElementInst::Create(result, load.load, LLVMInt32(elt / 2), "insert64", insertBefore);
3229 
3230  // And back to 4xi32.
3231  llvm::Type *vec4x32Type = llvm::VectorType::get(LLVMTypes::Int32Type, 4);
3232  result = new llvm::BitCastInst(result, vec4x32Type, "to4x32", insertBefore);
3233 
3234  set[elt] = true;
3235  if (elt < 3) {
3236  set[elt + 1] = true;
3237  }
3238  // Advance elt one extra time, since we just took care of two
3239  // elements
3240  ++elt;
3241  } else if (offsets[elt] >= load.start && offsets[elt] < load.start + load.count) {
3242  Debug(SourcePos(),
3243  "Load 2 @ %" PRId64 " matches for element #%d "
3244  "(value %" PRId64 ")",
3245  load.start, elt, offsets[elt]);
3246  // Otherwise, insert one of the 32-bit pieces into an element
3247  // of the final vector
3248  Assert(set[elt] == false);
3249  llvm::Value *toInsert = (offsets[elt] == load.start) ? load.element0 : load.element1;
3250  result = llvm::InsertElementInst::Create(result, toInsert, LLVMInt32(elt), "insert_load", insertBefore);
3251  set[elt] = true;
3252  }
3253  ++elt;
3254  }
3255 
3256  return result;
3257 }
3258 
3259 #if 1
3260 /* This approach works better with AVX, while the #else path generates
3261  slightly better code with SSE. Need to continue to dig into performance
3262  details with this stuff in general... */
3263 
3264 /** And handle a 4-wide load */
3265 static llvm::Value *lApplyLoad4(llvm::Value *result, const CoalescedLoadOp &load, const int64_t offsets[4], bool set[4],
3266  llvm::Instruction *insertBefore) {
3267  // Conceptually, we're going to consider doing a shufflevector with
3268  // the 4-wide load and the 4-wide result we have so far to generate a
3269  // new 4-wide vector. We'll start with shuffle indices that just
3270  // select each element of the result so far for the result.
3271  int32_t shuf[4] = {4, 5, 6, 7};
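    // Illustrative sketch (hypothetical values): with offsets = {8, 9, 1, 2} and
    // load.start == 8, the loop below sets shuf to {0, 1, 6, 7}: elements 0 and 1
    // come from the new load, while elements 2 and 3 of the existing result are
    // kept.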
3272 
3273  for (int elt = 0; elt < 4; ++elt) {
3274  if (offsets[elt] >= load.start && offsets[elt] < load.start + load.count) {
3275  Debug(SourcePos(),
3276  "Load 4 @ %" PRId64 " matches for element #%d "
3277  "(value %" PRId64 ")",
3278  load.start, elt, offsets[elt]);
3279 
3280  // If the current element falls within the range of locations
3281  // that the 4-wide load covers, then compute the appropriate
3282  // shuffle index that extracts the appropriate element from the
3283  // load.
3284  Assert(set[elt] == false);
3285  shuf[elt] = int32_t(offsets[elt] - load.start);
3286  set[elt] = true;
3287  }
3288  }
3289 
3290  // Now, issue a shufflevector instruction if any of the values from the
3291  // load we just considered were applicable.
3292  if (shuf[0] != 4 || shuf[1] != 5 || shuf[2] != 6 || shuf[3] != 7)
3293  result = LLVMShuffleVectors(load.load, result, shuf, 4, insertBefore);
3294 
3295  return result;
3296 }
3297 
3298 /** We need to fill in the values for a 4-wide result vector. This
3299  function looks at all of the generated loads and extracts the
3300  appropriate elements from the appropriate loads to assemble the result.
3301  Here the offsets[] parameter gives the 4 offsets from the base pointer
3302  for the four elements of the result.
3303 */
3304 static llvm::Value *lAssemble4Vector(const std::vector<CoalescedLoadOp> &loadOps, const int64_t offsets[4],
3305  llvm::Instruction *insertBefore) {
3306  llvm::Type *returnType = llvm::VectorType::get(LLVMTypes::Int32Type, 4);
3307  llvm::Value *result = llvm::UndefValue::get(returnType);
3308 
3309  Debug(SourcePos(), "Starting search for loads [%" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 "].", offsets[0],
3310  offsets[1], offsets[2], offsets[3]);
3311 
3312  // Track whether we have found a valid value for each of the four
3313  // elements of the result
3314  bool set[4] = {false, false, false, false};
3315 
3316  // Loop over all of the loads and check each one to see if it provides
3317  // a value that's applicable to the result
3318  for (int load = 0; load < (int)loadOps.size(); ++load) {
3319  const CoalescedLoadOp &li = loadOps[load];
3320 
3321  switch (li.count) {
3322  case 1:
3323  result = lApplyLoad1(result, li, offsets, set, insertBefore);
3324  break;
3325  case 2:
3326  result = lApplyLoad2(result, li, offsets, set, insertBefore);
3327  break;
3328  case 4:
3329  result = lApplyLoad4(result, li, offsets, set, insertBefore);
3330  break;
3331  default:
3332  FATAL("Unexpected load count in lAssemble4Vector()");
3333  }
3334  }
3335 
3336  Debug(SourcePos(), "Done with search for loads [%" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 "].", offsets[0],
3337  offsets[1], offsets[2], offsets[3]);
3338 
3339  for (int i = 0; i < 4; ++i)
3340  Assert(set[i] == true);
3341 
3342  return result;
3343 }
3344 
3345 #else
3346 
3347 static llvm::Value *lApplyLoad4s(llvm::Value *result, const std::vector<CoalescedLoadOp> &loadOps,
3348  const int64_t offsets[4], bool set[4], llvm::Instruction *insertBefore) {
3349  int32_t firstMatchElements[4] = {-1, -1, -1, -1};
3350  const CoalescedLoadOp *firstMatch = NULL;
3351 
3352  Assert(llvm::isa<llvm::UndefValue>(result));
3353 
3354  for (int load = 0; load < (int)loadOps.size(); ++load) {
3355  const CoalescedLoadOp &loadop = loadOps[load];
3356  if (loadop.count != 4)
3357  continue;
3358 
3359  int32_t matchElements[4] = {-1, -1, -1, -1};
3360  bool anyMatched = false;
3361  for (int elt = 0; elt < 4; ++elt) {
3362  if (offsets[elt] >= loadop.start && offsets[elt] < loadop.start + loadop.count) {
3363  Debug(SourcePos(),
3364  "Load 4 @ %" PRId64 " matches for element #%d "
3365  "(value %" PRId64 ")",
3366  loadop.start, elt, offsets[elt]);
3367  anyMatched = true;
3368  Assert(set[elt] == false);
3369  matchElements[elt] = offsets[elt] - loadop.start;
3370  set[elt] = true;
3371  }
3372  }
3373 
3374  if (anyMatched) {
3375  if (llvm::isa<llvm::UndefValue>(result)) {
3376  if (firstMatch == NULL) {
3377  firstMatch = &loadop;
3378  for (int i = 0; i < 4; ++i)
3379  firstMatchElements[i] = matchElements[i];
3380  } else {
3381  int32_t shuffle[4] = {-1, -1, -1, -1};
3382  for (int i = 0; i < 4; ++i) {
3383  if (firstMatchElements[i] != -1)
3384  shuffle[i] = firstMatchElements[i];
3385  else
3386  shuffle[i] = 4 + matchElements[i];
3387  }
3388  result = LLVMShuffleVectors(firstMatch->load, loadop.load, shuffle, 4, insertBefore);
3389  firstMatch = NULL;
3390  }
3391  } else {
3392  int32_t shuffle[4] = {-1, -1, -1, -1};
3393  for (int i = 0; i < 4; ++i) {
3394  if (matchElements[i] != -1)
3395  shuffle[i] = 4 + matchElements[i];
3396  else
3397  shuffle[i] = i;
3398  }
3399  result = LLVMShuffleVectors(result, loadop.load, shuffle, 4, insertBefore);
3400  }
3401  }
3402  }
3403 
3404  if (firstMatch != NULL && llvm::isa<llvm::UndefValue>(result))
3405  return LLVMShuffleVectors(firstMatch->load, result, firstMatchElements, 4, insertBefore);
3406  else
3407  return result;
3408 }
3409 
3410 static llvm::Value *lApplyLoad12s(llvm::Value *result, const std::vector<CoalescedLoadOp> &loadOps,
3411  const int64_t offsets[4], bool set[4], llvm::Instruction *insertBefore) {
3412  // Loop over all of the loads and check each one to see if it provides
3413  // a value that's applicable to the result
3414  for (int load = 0; load < (int)loadOps.size(); ++load) {
3415  const CoalescedLoadOp &loadop = loadOps[load];
3416  Assert(loadop.count == 1 || loadop.count == 2 || loadop.count == 4);
3417 
3418  if (loadop.count == 1)
3419  result = lApplyLoad1(result, loadop, offsets, set, insertBefore);
3420  else if (loadop.count == 2)
3421  result = lApplyLoad2(result, loadop, offsets, set, insertBefore);
3422  }
3423  return result;
3424 }
3425 
3426 /** We need to fill in the values for a 4-wide result vector. This
3427  function looks at all of the generated loads and extracts the
3428  appropriate elements from the appropriate loads to assemble the result.
3429  Here the offsets[] parameter gives the 4 offsets from the base pointer
3430  for the four elements of the result.
3431 */
3432 static llvm::Value *lAssemble4Vector(const std::vector<CoalescedLoadOp> &loadOps, const int64_t offsets[4],
3433  llvm::Instruction *insertBefore) {
3434  llvm::Type *returnType = llvm::VectorType::get(LLVMTypes::Int32Type, 4);
3435  llvm::Value *result = llvm::UndefValue::get(returnType);
3436 
3437  Debug(SourcePos(), "Starting search for loads [%" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 "].", offsets[0],
3438  offsets[1], offsets[2], offsets[3]);
3439 
3440  // Track whether we have found a valid value for each of the four
3441  // elements of the result
3442  bool set[4] = {false, false, false, false};
3443 
3444  result = lApplyLoad4s(result, loadOps, offsets, set, insertBefore);
3445  result = lApplyLoad12s(result, loadOps, offsets, set, insertBefore);
3446 
3447  Debug(SourcePos(), "Done with search for loads [%" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 "].", offsets[0],
3448  offsets[1], offsets[2], offsets[3]);
3449 
3450  for (int i = 0; i < 4; ++i)
3451  Assert(set[i] == true);
3452 
3453  return result;
3454 }
3455 #endif
3456 
3457 /** Given the set of loads that we've done and the set of result values to
3458  be computed, this function computes the final llvm::Value *s for each
3459  result vector.
3460  */
3461 static void lAssembleResultVectors(const std::vector<CoalescedLoadOp> &loadOps,
3462  const std::vector<int64_t> &constOffsets, std::vector<llvm::Value *> &results,
3463  llvm::Instruction *insertBefore) {
3464  // We work on 4-wide chunks of the final values, even when we're
3465  // computing 8-wide or 16-wide vectors. This gives better code from
3466  // LLVM's SSE/AVX code generators.
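    // Illustrative sketch (hypothetical sizes): on an 8-wide target with two
    // coalesced gathers, constOffsets has 16 entries, so four 4-wide chunks are
    // assembled below and each gather's result is the concatenation of two of
    // them.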
3467  Assert((constOffsets.size() % 4) == 0);
3468  std::vector<llvm::Value *> vec4s;
3469  for (int i = 0; i < (int)constOffsets.size(); i += 4)
3470  vec4s.push_back(lAssemble4Vector(loadOps, &constOffsets[i], insertBefore));
3471 
3472  // And now concatenate 1, 2, or 4 of the 4-wide vectors computed above
3473  // into 4, 8, or 16-wide final result vectors.
3474  int numGathers = constOffsets.size() / g->target->getVectorWidth();
3475  for (int i = 0; i < numGathers; ++i) {
3476  llvm::Value *result = NULL;
3477  switch (g->target->getVectorWidth()) {
3478  case 4:
3479  result = vec4s[i];
3480  break;
3481  case 8:
3482  result = LLVMConcatVectors(vec4s[2 * i], vec4s[2 * i + 1], insertBefore);
3483  break;
3484  case 16: {
3485  llvm::Value *v1 = LLVMConcatVectors(vec4s[4 * i], vec4s[4 * i + 1], insertBefore);
3486  llvm::Value *v2 = LLVMConcatVectors(vec4s[4 * i + 2], vec4s[4 * i + 3], insertBefore);
3487  result = LLVMConcatVectors(v1, v2, insertBefore);
3488  break;
3489  }
3490  default:
3491  FATAL("Unhandled vector width in lAssembleResultVectors()");
3492  }
3493 
3494  results.push_back(result);
3495  }
3496 }
3497 
3498 /** Given a call to a gather function, extract the base pointer, the 2/4/8
3499  scale, and the first element of the varying offsets, and use them to compute
3500  the scalar base pointer that is shared by all of the gathers in the group.
3501  (Thus, this base pointer plus the constant offsets term for each gather
3502  gives the set of addresses to use for each gather.)
3503  */
3504 static llvm::Value *lComputeBasePtr(llvm::CallInst *gatherInst, llvm::Instruction *insertBefore) {
3505  llvm::Value *basePtr = gatherInst->getArgOperand(0);
3506  llvm::Value *variableOffsets = gatherInst->getArgOperand(1);
3507  llvm::Value *offsetScale = gatherInst->getArgOperand(2);
3508 
3509  // All of the variable offsets values should be the same, due to
3510  // checking for this in GatherCoalescePass::runOnBasicBlock(). Thus,
3511  // extract the first value and use that as a scalar.
3512  llvm::Value *variable = LLVMExtractFirstVectorElement(variableOffsets);
3513  Assert(variable != NULL);
3514  if (variable->getType() == LLVMTypes::Int64Type)
3515  offsetScale = new llvm::ZExtInst(offsetScale, LLVMTypes::Int64Type, "scale_to64", insertBefore);
3516  llvm::Value *offset =
3517  llvm::BinaryOperator::Create(llvm::Instruction::Mul, variable, offsetScale, "offset", insertBefore);
3518 
3519  return lGEPInst(basePtr, offset, "new_base", insertBefore);
3520 }
3521 
3522 /** Extract the constant offsets (from the common base pointer) from each
3523  of the gathers in a set to be coalesced. These come in as byte
3524  offsets, but we'll transform them into offsets in terms of the size of
3525  the base scalar type being gathered. (e.g. for an i32 gather, we might
3526  have offsets like <0,4,16,20>, which would be transformed to <0,1,4,5>
3527  here.)
3528  */
3529 static void lExtractConstOffsets(const std::vector<llvm::CallInst *> &coalesceGroup, int elementSize,
3530  std::vector<int64_t> *constOffsets) {
3531  int width = g->target->getVectorWidth();
3532  *constOffsets = std::vector<int64_t>(coalesceGroup.size() * width, 0);
3533 
3534  int64_t *endPtr = &((*constOffsets)[0]);
3535  for (int i = 0; i < (int)coalesceGroup.size(); ++i, endPtr += width) {
3536  llvm::Value *offsets = coalesceGroup[i]->getArgOperand(3);
3537  int nElts;
3538  bool ok = LLVMExtractVectorInts(offsets, endPtr, &nElts);
3539  Assert(ok && nElts == width);
3540  }
3541 
3542  for (int i = 0; i < (int)constOffsets->size(); ++i)
3543  (*constOffsets)[i] /= elementSize;
3544 }
3545 
3546 /** Actually do the coalescing. We have a set of gathers all accessing
3547  addresses of the form:
3548 
3549  (ptr + {1,2,4,8} * varyingOffset) + constOffset, a.k.a.
3550  basePtr + constOffset
3551 
3552  where varyingOffset actually has the same value across all of the SIMD
3553  lanes and where the part in parenthesis has the same value for all of
3554  the gathers in the group.
3555  */
3556 static bool lCoalesceGathers(const std::vector<llvm::CallInst *> &coalesceGroup) {
3557  llvm::Instruction *insertBefore = coalesceGroup[0];
3558 
3559  // First, compute the shared base pointer for all of the gathers
3560  llvm::Value *basePtr = lComputeBasePtr(coalesceGroup[0], insertBefore);
3561 
3562  int elementSize = 0;
3563  if (coalesceGroup[0]->getType() == LLVMTypes::Int32VectorType ||
3564  coalesceGroup[0]->getType() == LLVMTypes::FloatVectorType)
3565  elementSize = 4;
3566  else if (coalesceGroup[0]->getType() == LLVMTypes::Int64VectorType ||
3567  coalesceGroup[0]->getType() == LLVMTypes::DoubleVectorType)
3568  elementSize = 8;
3569  else
3570  FATAL("Unexpected gather type in lCoalesceGathers");
3571 
3572  // Extract the constant offsets from the gathers into the constOffsets
3573  // vector: the first vectorWidth elements will be those for the first
3574  // gather, the next vectorWidth those for the next gather, and so
3575  // forth.
3576  std::vector<int64_t> constOffsets;
3577  lExtractConstOffsets(coalesceGroup, elementSize, &constOffsets);
3578 
3579  // Determine a set of loads to perform to get all of the values we need
3580  // loaded.
3581  std::vector<CoalescedLoadOp> loadOps;
3582  lSelectLoads(constOffsets, &loadOps);
3583 
3584  lCoalescePerfInfo(coalesceGroup, loadOps);
3585 
3586  // Actually emit load instructions for them
3587  lEmitLoads(basePtr, loadOps, elementSize, insertBefore);
3588 
3589  // Now, for any loads that give us <8 x i32> vectors, split their
3590  // values into two <4 x i32> vectors; it turns out that LLVM gives us
3591  // better code on AVX when we assemble the pieces from 4-wide vectors.
3592  loadOps = lSplit8WideLoads(loadOps, insertBefore);
3593 
3594  // Given all of these chunks of values, shuffle together a vector that
3595  // gives us each result value; the i'th element of results[] gives the
3596  // result for the i'th gather in coalesceGroup.
3597  std::vector<llvm::Value *> results;
3598  lAssembleResultVectors(loadOps, constOffsets, results, insertBefore);
3599 
3600  // Finally, replace each of the original gathers with the instruction
3601  // that gives the value from the coalescing process.
3602  Assert(results.size() == coalesceGroup.size());
3603  for (int i = 0; i < (int)results.size(); ++i) {
3604  llvm::Instruction *ir = llvm::dyn_cast<llvm::Instruction>(results[i]);
3605  Assert(ir != NULL);
3606 
3607  llvm::Type *origType = coalesceGroup[i]->getType();
3608  if (origType != ir->getType())
3609  ir = new llvm::BitCastInst(ir, origType, ir->getName(), coalesceGroup[i]);
3610 
3611  // Previously, all of the instructions to compute the final result
3612  // were inserted into the basic block here; now we remove the very last one
3613  // of them (that holds the final result) from the basic block.
3614  // This way, the following ReplaceInstWithInst() call will operate
3615  // successfully. (It expects that the second argument not be in any
3616  // basic block.)
3617  ir->removeFromParent();
3618 
3619  llvm::ReplaceInstWithInst(coalesceGroup[i], ir);
3620  }
3621 
3622  return true;
3623 }
3624 
3625 /** Given an instruction, returns true if the instruction may write to
3626  memory. This is a conservative test in that it may return true for
3627  some instructions that don't actually end up writing to memory, but
3628  should never return false for an instruction that does write to
3629  memory. */
3630 static bool lInstructionMayWriteToMemory(llvm::Instruction *inst) {
3631  if (llvm::isa<llvm::StoreInst>(inst) || llvm::isa<llvm::AtomicRMWInst>(inst) ||
3632  llvm::isa<llvm::AtomicCmpXchgInst>(inst))
3633  // FIXME: we could be less conservative and try to allow stores if
3634  // we are sure that the pointers don't overlap..
3635  return true;
3636 
3637  // Otherwise, any call instruction that doesn't have an attribute
3638  // indicating it won't write to memory has to be treated as a potential
3639  // store.
3640  llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst);
3641  if (ci != NULL) {
3642  llvm::Function *calledFunc = ci->getCalledFunction();
3643  if (calledFunc == NULL)
3644  return true;
3645 
3646  if (calledFunc->onlyReadsMemory() || calledFunc->doesNotAccessMemory())
3647  return false;
3648  return true;
3649  }
3650 
3651  return false;
3652 }
3653 
3654 bool GatherCoalescePass::runOnBasicBlock(llvm::BasicBlock &bb) {
3655  DEBUG_START_PASS("GatherCoalescePass");
3656 
3657  llvm::Function *gatherFuncs[] = {
3658  m->module->getFunction("__pseudo_gather_factored_base_offsets32_i32"),
3659  m->module->getFunction("__pseudo_gather_factored_base_offsets32_float"),
3660  m->module->getFunction("__pseudo_gather_factored_base_offsets64_i32"),
3661  m->module->getFunction("__pseudo_gather_factored_base_offsets64_float"),
3662  };
3663  int nGatherFuncs = sizeof(gatherFuncs) / sizeof(gatherFuncs[0]);
3664 
3665  bool modifiedAny = false;
3666 
3667 restart:
3668  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
3669  // Iterate over all of the instructions and look for calls to
3670  // __pseudo_gather_factored_base_offsets{32,64}_{i32,float} calls.
3671  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
3672  if (callInst == NULL)
3673  continue;
3674 
3675  llvm::Function *calledFunc = callInst->getCalledFunction();
3676  if (calledFunc == NULL)
3677  continue;
3678 
3679  int i;
3680  for (i = 0; i < nGatherFuncs; ++i)
3681  if (gatherFuncs[i] != NULL && calledFunc == gatherFuncs[i])
3682  break;
3683  if (i == nGatherFuncs)
3684  // Doesn't match any of the types of gathers we care about
3685  continue;
3686 
3687  SourcePos pos;
3688  lGetSourcePosFromMetadata(callInst, &pos);
3689  Debug(pos, "Checking for coalescable gathers starting here...");
3690 
3691  llvm::Value *base = callInst->getArgOperand(0);
3692  llvm::Value *variableOffsets = callInst->getArgOperand(1);
3693  llvm::Value *offsetScale = callInst->getArgOperand(2);
3694  llvm::Value *mask = callInst->getArgOperand(4);
3695 
3696  // To apply this optimization, we need a set of one or more gathers
3697  // that fulfill the following conditions:
3698  //
3699  // - Mask all on
3700  // - The variable offsets to all have the same value (i.e., to be
3701  // uniform).
3702  // - Same base pointer, variable offsets, and offset scale (for
3703  // more than one gather)
3704  //
3705  // Then and only then do we have a common base pointer with all
3706  // offsets from that constants (in which case we can potentially
3707  // coalesce).
3708  if (lGetMaskStatus(mask) != ALL_ON)
3709  continue;
3710 
3711  if (!LLVMVectorValuesAllEqual(variableOffsets))
3712  continue;
3713 
3714  // coalesceGroup stores the set of gathers that we're going to try to
3715  // coalesce over
3716  std::vector<llvm::CallInst *> coalesceGroup;
3717  coalesceGroup.push_back(callInst);
3718 
3719  // Start iterating at the instruction after the initial gather;
3720  // look at the remainder of instructions in the basic block (up
3721  // until we reach a write to memory) to try to find any other
3722  // gathers that can coalesce with this one.
3723  llvm::BasicBlock::iterator fwdIter = iter;
3724  ++fwdIter;
3725  for (; fwdIter != bb.end(); ++fwdIter) {
3726  // Must stop once we come to an instruction that may write to
3727  // memory; otherwise we could end up moving a read before this
3728  // write.
3729  if (lInstructionMayWriteToMemory(&*fwdIter))
3730  break;
3731 
3732  llvm::CallInst *fwdCall = llvm::dyn_cast<llvm::CallInst>(&*fwdIter);
3733  if (fwdCall == NULL || fwdCall->getCalledFunction() != calledFunc)
3734  continue;
3735 
3736  SourcePos fwdPos;
3737  // TODO: need to redesign metadata attached to pseudo calls;
3738  // LLVM drops metadata frequently, which results in bad diagnostics.
3739  lGetSourcePosFromMetadata(fwdCall, &fwdPos);
3740 
3741 #ifndef ISPC_NO_DUMPS
3742  if (g->debugPrint) {
3743  if (base != fwdCall->getArgOperand(0)) {
3744  Debug(fwdPos, "base pointers mismatch");
3745  LLVMDumpValue(base);
3746  LLVMDumpValue(fwdCall->getArgOperand(0));
3747  }
3748  if (variableOffsets != fwdCall->getArgOperand(1)) {
3749  Debug(fwdPos, "varying offsets mismatch");
3750  LLVMDumpValue(variableOffsets);
3751  LLVMDumpValue(fwdCall->getArgOperand(1));
3752  }
3753  if (offsetScale != fwdCall->getArgOperand(2)) {
3754  Debug(fwdPos, "offset scales mismatch");
3755  LLVMDumpValue(offsetScale);
3756  LLVMDumpValue(fwdCall->getArgOperand(2));
3757  }
3758  if (mask != fwdCall->getArgOperand(4)) {
3759  Debug(fwdPos, "masks mismatch");
3760  LLVMDumpValue(mask);
3761  LLVMDumpValue(fwdCall->getArgOperand(4));
3762  }
3763  }
3764 #endif
3765 
3766  if (base == fwdCall->getArgOperand(0) && variableOffsets == fwdCall->getArgOperand(1) &&
3767  offsetScale == fwdCall->getArgOperand(2) && mask == fwdCall->getArgOperand(4)) {
3768  Debug(fwdPos, "This gather can be coalesced.");
3769  coalesceGroup.push_back(fwdCall);
3770 
3771  if (coalesceGroup.size() == 4)
3772  // FIXME: untested heuristic: don't try to coalesce
3773  // over a window of more than 4 gathers, so that we
3774  // don't cause too much register pressure and end up
3775  // spilling to memory anyway.
3776  break;
3777  } else
3778  Debug(fwdPos, "This gather doesn't match the initial one.");
3779  }
3780 
3781  Debug(pos, "Done with checking for matching gathers");
3782 
3783  // Now that we have a group of gathers, see if we can coalesce them
3784  // into something more efficient than the original set of gathers.
3785  if (lCoalesceGathers(coalesceGroup)) {
3786  modifiedAny = true;
3787  goto restart;
3788  }
3789  }
3790 
3791  DEBUG_END_PASS("GatherCoalescePass");
3792 
3793  return modifiedAny;
3794 }
3795 
3796 bool GatherCoalescePass::runOnFunction(llvm::Function &F) {
3797 
3798  bool modifiedAny = false;
3799  for (llvm::BasicBlock &BB : F) {
3800  modifiedAny |= runOnBasicBlock(BB);
3801  }
3802  return modifiedAny;
3803 }
3804 
3805 static llvm::Pass *CreateGatherCoalescePass() { return new GatherCoalescePass; }
3806 
3807 ///////////////////////////////////////////////////////////////////////////
3808 // ReplacePseudoMemoryOpsPass
3809 
3810 /** For any gathers and scatters remaining after the GSToLoadStorePass
3811  runs, we need to turn them into actual native gathers and scatters.
3812  This task is handled by the ReplacePseudoMemoryOpsPass here.
3813  */
3814 class ReplacePseudoMemoryOpsPass : public llvm::FunctionPass {
3815  public:
3816  static char ID;
3817  ReplacePseudoMemoryOpsPass() : FunctionPass(ID) {}
3818 
3819  llvm::StringRef getPassName() const { return "Replace Pseudo Memory Ops"; }
3820  bool runOnBasicBlock(llvm::BasicBlock &BB);
3821  bool runOnFunction(llvm::Function &F);
3822 };
3823 
3824 char ReplacePseudoMemoryOpsPass::ID = 0;
3825 
3826 /** This routine attempts to determine if the given pointer in lvalue is
3827  pointing to stack-allocated memory. It's conservative in that it
3828  should never return true for non-stack allocated memory, but may return
3829  false for memory that actually is stack allocated. The basic strategy
3830  is to traverse through the operands and see if the pointer originally
3831  comes from an AllocaInst.
3832 */
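// Illustrative sketch (hypothetical IR, vector width assumed to match the
// target): a pointer that traces back through bitcasts/GEPs to something like
//     %buf = alloca <8 x float>
// is considered safe to blend (load + select + store), whereas a pointer loaded
// from memory or passed in as an argument is not, since the full-width load and
// store could touch memory the program would never otherwise access.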
3833 static bool lIsSafeToBlend(llvm::Value *lvalue) {
3834  llvm::BitCastInst *bc = llvm::dyn_cast<llvm::BitCastInst>(lvalue);
3835  if (bc != NULL)
3836  return lIsSafeToBlend(bc->getOperand(0));
3837  else {
3838  llvm::AllocaInst *ai = llvm::dyn_cast<llvm::AllocaInst>(lvalue);
3839  if (ai) {
3840  llvm::Type *type = ai->getType();
3841  llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(type);
3842  Assert(pt != NULL);
3843  type = pt->getElementType();
3844  llvm::ArrayType *at;
3845  while ((at = llvm::dyn_cast<llvm::ArrayType>(type))) {
3846  type = at->getElementType();
3847  }
3848  llvm::VectorType *vt = llvm::dyn_cast<llvm::VectorType>(type);
3849  return (vt != NULL && (int)vt->getNumElements() == g->target->getVectorWidth());
3850  } else {
3851  llvm::GetElementPtrInst *gep = llvm::dyn_cast<llvm::GetElementPtrInst>(lvalue);
3852  if (gep != NULL)
3853  return lIsSafeToBlend(gep->getOperand(0));
3854  else
3855  return false;
3856  }
3857  }
3858 }
3859 
3860 static bool lReplacePseudoMaskedStore(llvm::CallInst *callInst) {
3861  struct LMSInfo {
3862  LMSInfo(const char *pname, const char *bname, const char *msname) {
3863  pseudoFunc = m->module->getFunction(pname);
3864  blendFunc = m->module->getFunction(bname);
3865  maskedStoreFunc = m->module->getFunction(msname);
3866  Assert(pseudoFunc != NULL && blendFunc != NULL && maskedStoreFunc != NULL);
3867  }
3868  llvm::Function *pseudoFunc;
3869  llvm::Function *blendFunc;
3870  llvm::Function *maskedStoreFunc;
3871  };
3872 
3873  LMSInfo msInfo[] = {
3874  LMSInfo("__pseudo_masked_store_i8", "__masked_store_blend_i8", "__masked_store_i8"),
3875  LMSInfo("__pseudo_masked_store_i16", "__masked_store_blend_i16", "__masked_store_i16"),
3876  LMSInfo("__pseudo_masked_store_i32", "__masked_store_blend_i32", "__masked_store_i32"),
3877  LMSInfo("__pseudo_masked_store_float", "__masked_store_blend_float", "__masked_store_float"),
3878  LMSInfo("__pseudo_masked_store_i64", "__masked_store_blend_i64", "__masked_store_i64"),
3879  LMSInfo("__pseudo_masked_store_double", "__masked_store_blend_double", "__masked_store_double")};
3880 
3881  LMSInfo *info = NULL;
3882  for (unsigned int i = 0; i < sizeof(msInfo) / sizeof(msInfo[0]); ++i) {
3883  if (msInfo[i].pseudoFunc != NULL && callInst->getCalledFunction() == msInfo[i].pseudoFunc) {
3884  info = &msInfo[i];
3885  break;
3886  }
3887  }
3888  if (info == NULL)
3889  return false;
3890 
3891  llvm::Value *lvalue = callInst->getArgOperand(0);
3892  llvm::Value *rvalue = callInst->getArgOperand(1);
3893  llvm::Value *mask = callInst->getArgOperand(2);
3894 
3895  // We need to choose between doing the load + blend + store trick,
3896  // or serializing the masked store. Even on targets with a native
3897  // masked store instruction, the blend is preferable when it's safe, since
3898  // it lets us keep values in registers rather than going out to the stack.
3899  bool doBlend = (!g->opt.disableBlendedMaskedStores && lIsSafeToBlend(lvalue));
3900 
3901  // Generate the call to the appropriate masked store function and
3902  // replace the __pseudo_* one with it.
3903  llvm::Function *fms = doBlend ? info->blendFunc : info->maskedStoreFunc;
3904  llvm::Instruction *inst = lCallInst(fms, lvalue, rvalue, mask, "", callInst);
3905  lCopyMetadata(inst, callInst);
3906 
3907  callInst->eraseFromParent();
3908  return true;
3909 }
3910 
3911 static bool lReplacePseudoGS(llvm::CallInst *callInst) {
3912  struct LowerGSInfo {
3913  LowerGSInfo(const char *pName, const char *aName, bool ig, bool ip) : isGather(ig), isPrefetch(ip) {
3914  pseudoFunc = m->module->getFunction(pName);
3915  actualFunc = m->module->getFunction(aName);
3916  }
3917  llvm::Function *pseudoFunc;
3918  llvm::Function *actualFunc;
3919  const bool isGather;
3920  const bool isPrefetch;
3921  };
3922 
3923  LowerGSInfo lgsInfo[] = {
3924  LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true, false),
3925  LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true, false),
3926  LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true, false),
3927  LowerGSInfo("__pseudo_gather32_float", "__gather32_float", true, false),
3928  LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true, false),
3929  LowerGSInfo("__pseudo_gather32_double", "__gather32_double", true, false),
3930 
3931  LowerGSInfo("__pseudo_gather64_i8", "__gather64_i8", true, false),
3932  LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true, false),
3933  LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true, false),
3934  LowerGSInfo("__pseudo_gather64_float", "__gather64_float", true, false),
3935  LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true, false),
3936  LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true, false),
3937 
3938  LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8", "__gather_factored_base_offsets32_i8", true, false),
3939  LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16", "__gather_factored_base_offsets32_i16", true, false),
3940  LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32", "__gather_factored_base_offsets32_i32", true, false),
3941  LowerGSInfo("__pseudo_gather_factored_base_offsets32_float", "__gather_factored_base_offsets32_float", true,
3942  false),
3943  LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64", "__gather_factored_base_offsets32_i64", true, false),
3944  LowerGSInfo("__pseudo_gather_factored_base_offsets32_double", "__gather_factored_base_offsets32_double", true,
3945  false),
3946 
3947  LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8", "__gather_factored_base_offsets64_i8", true, false),
3948  LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16", "__gather_factored_base_offsets64_i16", true, false),
3949  LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32", "__gather_factored_base_offsets64_i32", true, false),
3950  LowerGSInfo("__pseudo_gather_factored_base_offsets64_float", "__gather_factored_base_offsets64_float", true,
3951  false),
3952  LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64", "__gather_factored_base_offsets64_i64", true, false),
3953  LowerGSInfo("__pseudo_gather_factored_base_offsets64_double", "__gather_factored_base_offsets64_double", true,
3954  false),
3955 
3956  LowerGSInfo("__pseudo_gather_base_offsets32_i8", "__gather_base_offsets32_i8", true, false),
3957  LowerGSInfo("__pseudo_gather_base_offsets32_i16", "__gather_base_offsets32_i16", true, false),
3958  LowerGSInfo("__pseudo_gather_base_offsets32_i32", "__gather_base_offsets32_i32", true, false),
3959  LowerGSInfo("__pseudo_gather_base_offsets32_float", "__gather_base_offsets32_float", true, false),
3960  LowerGSInfo("__pseudo_gather_base_offsets32_i64", "__gather_base_offsets32_i64", true, false),
3961  LowerGSInfo("__pseudo_gather_base_offsets32_double", "__gather_base_offsets32_double", true, false),
3962 
3963  LowerGSInfo("__pseudo_gather_base_offsets64_i8", "__gather_base_offsets64_i8", true, false),
3964  LowerGSInfo("__pseudo_gather_base_offsets64_i16", "__gather_base_offsets64_i16", true, false),
3965  LowerGSInfo("__pseudo_gather_base_offsets64_i32", "__gather_base_offsets64_i32", true, false),
3966  LowerGSInfo("__pseudo_gather_base_offsets64_float", "__gather_base_offsets64_float", true, false),
3967  LowerGSInfo("__pseudo_gather_base_offsets64_i64", "__gather_base_offsets64_i64", true, false),
3968  LowerGSInfo("__pseudo_gather_base_offsets64_double", "__gather_base_offsets64_double", true, false),
3969 
3970  LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false, false),
3971  LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false, false),
3972  LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false, false),
3973  LowerGSInfo("__pseudo_scatter32_float", "__scatter32_float", false, false),
3974  LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false, false),
3975  LowerGSInfo("__pseudo_scatter32_double", "__scatter32_double", false, false),
3976 
3977  LowerGSInfo("__pseudo_scatter64_i8", "__scatter64_i8", false, false),
3978  LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false, false),
3979  LowerGSInfo("__pseudo_scatter64_i32", "__scatter64_i32", false, false),
3980  LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false, false),
3981  LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false, false),
3982  LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false, false),
3983 
3984  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8", "__scatter_factored_base_offsets32_i8", false,
3985  false),
3986  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16", "__scatter_factored_base_offsets32_i16", false,
3987  false),
3988  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32", "__scatter_factored_base_offsets32_i32", false,
3989  false),
3990  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float", "__scatter_factored_base_offsets32_float", false,
3991  false),
3992  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64", "__scatter_factored_base_offsets32_i64", false,
3993  false),
3994  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double", "__scatter_factored_base_offsets32_double",
3995  false, false),
3996 
3997  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8", "__scatter_factored_base_offsets64_i8", false,
3998  false),
3999  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16", "__scatter_factored_base_offsets64_i16", false,
4000  false),
4001  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32", "__scatter_factored_base_offsets64_i32", false,
4002  false),
4003  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float", "__scatter_factored_base_offsets64_float", false,
4004  false),
4005  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64", "__scatter_factored_base_offsets64_i64", false,
4006  false),
4007  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double", "__scatter_factored_base_offsets64_double",
4008  false, false),
4009 
4010  LowerGSInfo("__pseudo_scatter_base_offsets32_i8", "__scatter_base_offsets32_i8", false, false),
4011  LowerGSInfo("__pseudo_scatter_base_offsets32_i16", "__scatter_base_offsets32_i16", false, false),
4012  LowerGSInfo("__pseudo_scatter_base_offsets32_i32", "__scatter_base_offsets32_i32", false, false),
4013  LowerGSInfo("__pseudo_scatter_base_offsets32_float", "__scatter_base_offsets32_float", false, false),
4014  LowerGSInfo("__pseudo_scatter_base_offsets32_i64", "__scatter_base_offsets32_i64", false, false),
4015  LowerGSInfo("__pseudo_scatter_base_offsets32_double", "__scatter_base_offsets32_double", false, false),
4016 
4017  LowerGSInfo("__pseudo_scatter_base_offsets64_i8", "__scatter_base_offsets64_i8", false, false),
4018  LowerGSInfo("__pseudo_scatter_base_offsets64_i16", "__scatter_base_offsets64_i16", false, false),
4019  LowerGSInfo("__pseudo_scatter_base_offsets64_i32", "__scatter_base_offsets64_i32", false, false),
4020  LowerGSInfo("__pseudo_scatter_base_offsets64_float", "__scatter_base_offsets64_float", false, false),
4021  LowerGSInfo("__pseudo_scatter_base_offsets64_i64", "__scatter_base_offsets64_i64", false, false),
4022  LowerGSInfo("__pseudo_scatter_base_offsets64_double", "__scatter_base_offsets64_double", false, false),
4023 
4024  LowerGSInfo("__pseudo_prefetch_read_varying_1", "__prefetch_read_varying_1", false, true),
4025  LowerGSInfo("__pseudo_prefetch_read_varying_1_native", "__prefetch_read_varying_1_native", false, true),
4026 
4027  LowerGSInfo("__pseudo_prefetch_read_varying_2", "__prefetch_read_varying_2", false, true),
4028  LowerGSInfo("__pseudo_prefetch_read_varying_2_native", "__prefetch_read_varying_2_native", false, true),
4029 
4030  LowerGSInfo("__pseudo_prefetch_read_varying_3", "__prefetch_read_varying_3", false, true),
4031  LowerGSInfo("__pseudo_prefetch_read_varying_3_native", "__prefetch_read_varying_3_native", false, true),
4032 
4033  LowerGSInfo("__pseudo_prefetch_read_varying_nt", "__prefetch_read_varying_nt", false, true),
4034  LowerGSInfo("__pseudo_prefetch_read_varying_nt_native", "__prefetch_read_varying_nt_native", false, true),
4035  };
4036 
4037  llvm::Function *calledFunc = callInst->getCalledFunction();
4038 
4039  LowerGSInfo *info = NULL;
4040  for (unsigned int i = 0; i < sizeof(lgsInfo) / sizeof(lgsInfo[0]); ++i) {
4041  if (lgsInfo[i].pseudoFunc != NULL && calledFunc == lgsInfo[i].pseudoFunc) {
4042  info = &lgsInfo[i];
4043  break;
4044  }
4045  }
4046  if (info == NULL)
4047  return false;
4048 
4049  Assert(info->actualFunc != NULL);
4050 
4051  // Get the source position from the metadata attached to the call
4052  // instruction so that we can issue PerformanceWarning()s below.
4053  SourcePos pos;
4054  bool gotPosition = lGetSourcePosFromMetadata(callInst, &pos);
4055 
4056  callInst->setCalledFunction(info->actualFunc);
4057  if (gotPosition && (g->target->getVectorWidth() > 1) && (g->opt.level > 0)) {
4058  if (info->isGather)
4059  PerformanceWarning(pos, "Gather required to load value.");
4060  else if (!info->isPrefetch)
4061  PerformanceWarning(pos, "Scatter required to store value.");
4062  }
4063  return true;
4064 }
4065 
4066 bool ReplacePseudoMemoryOpsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
4067  DEBUG_START_PASS("ReplacePseudoMemoryOpsPass");
4068 
4069  bool modifiedAny = false;
4070 
4071 restart:
4072  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
4073  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
4074  if (callInst == NULL || callInst->getCalledFunction() == NULL)
4075  continue;
4076 
4077  if (lReplacePseudoGS(callInst)) {
4078  modifiedAny = true;
4079  goto restart;
4080  } else if (lReplacePseudoMaskedStore(callInst)) {
4081  modifiedAny = true;
4082  goto restart;
4083  }
4084  }
4085 
4086  DEBUG_END_PASS("ReplacePseudoMemoryOpsPass");
4087 
4088  return modifiedAny;
4089 }
4090 
4091 bool ReplacePseudoMemoryOpsPass::runOnFunction(llvm::Function &F) {
4092 
4093  bool modifiedAny = false;
4094  for (llvm::BasicBlock &BB : F) {
4095  modifiedAny |= runOnBasicBlock(BB);
4096  }
4097  return modifiedAny;
4098 }
4099 
4100 static llvm::Pass *CreateReplacePseudoMemoryOpsPass() { return new ReplacePseudoMemoryOpsPass(); }
4101 
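runOnBasicBlock() above, like several other passes in this file, uses a restart idiom: once a call has been rewritten in place, the basic-block iterator can no longer be trusted, so the scan resumes from bb.begin() and only terminates when a full sweep makes no changes. A minimal sketch of that shape, with a hypothetical rewriteIfInteresting() helper standing in for the real lReplacePseudoGS()/lReplacePseudoMaskedStore() checks:

static bool rewriteIfInteresting(llvm::Instruction *inst); // hypothetical stand-in

static bool lScanAndRewriteBlock(llvm::BasicBlock &bb) {
    bool modifiedAny = false;
restart:
    for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
        if (rewriteIfInteresting(&*iter)) {
            // The rewrite may have invalidated 'iter'; start over from the top.
            modifiedAny = true;
            goto restart;
        }
    }
    return modifiedAny;
}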
4102 ///////////////////////////////////////////////////////////////////////////
4103 // IsCompileTimeConstantPass
4104 
4105 /** LLVM IR implementations of target-specific functions may include calls
4106  to the functions "bool __is_compile_time_constant_*(...)"; these allow
4107  them to have specialized code paths for when the corresponding value is
4108  known at compile time. For masks, for example, this lets them avoid the
4109  cost of the MOVMSK call that the general path needs at runtime to
4110  compute the mask's value when it isn't known in advance.
4111 
4112  This pass resolves these calls into either 'true' or 'false' values so
4113  that later optimization passes can operate with these as constants.
4114 
4115  See stdlib.m4 for a number of uses of this idiom.
4116  */
4117 
4118 class IsCompileTimeConstantPass : public llvm::FunctionPass {
4119  public:
4120  static char ID;
4121  IsCompileTimeConstantPass(bool last = false) : FunctionPass(ID) { isLastTry = last; }
4122 
4123  llvm::StringRef getPassName() const { return "Resolve \"is compile time constant\""; }
4124  bool runOnBasicBlock(llvm::BasicBlock &BB);
4125  bool runOnFunction(llvm::Function &F);
4126 
4127  bool isLastTry;
4128 };
4129 
4130 char IsCompileTimeConstantPass::ID = 0;
4131 
4132 bool IsCompileTimeConstantPass::runOnBasicBlock(llvm::BasicBlock &bb) {
4133  DEBUG_START_PASS("IsCompileTimeConstantPass");
4134 
4135  llvm::Function *funcs[] = {m->module->getFunction("__is_compile_time_constant_mask"),
4136  m->module->getFunction("__is_compile_time_constant_uniform_int32"),
4137  m->module->getFunction("__is_compile_time_constant_varying_int32")};
4138 
4139  bool modifiedAny = false;
4140 restart:
4141  for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
4142  // Iterate through the instructions looking for calls to the
4143  // __is_compile_time_constant_*() functions
4144  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
4145  if (callInst == NULL)
4146  continue;
4147 
4148  int j;
4149  int nFuncs = sizeof(funcs) / sizeof(funcs[0]);
4150  for (j = 0; j < nFuncs; ++j) {
4151  if (funcs[j] != NULL && callInst->getCalledFunction() == funcs[j])
4152  break;
4153  }
4154  if (j == nFuncs)
4155  // not a __is_compile_time_constant_* function
4156  continue;
4157 
4158  // This optimization pass can be disabled with either the (poorly
4159  // named) disableGatherScatterFlattening option or with
4160  // disableMaskAllOnOptimizations.
4161  if (g->opt.disableGatherScatterFlattening || g->opt.disableMaskAllOnOptimizations) {
4162  llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMFalse);
4163  modifiedAny = true;
4164  goto restart;
4165  }
4166 
4167  // Is it a constant? Bingo, turn the call's value into a constant
4168  // true value.
4169  llvm::Value *operand = callInst->getArgOperand(0);
4170  if (llvm::isa<llvm::Constant>(operand)) {
4171  llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMTrue);
4172  modifiedAny = true;
4173  goto restart;
4174  }
4175 
4176  // This pass runs multiple times during optimization. Up until the
4177  // very last time, it only replaces the call with a 'true' if the
4178  // value is known to be constant and otherwise leaves the call
4179  // alone, in case further optimization passes can help resolve its
4180  // value. The last time through, it eventually has to give up, and
4181  // replaces any remaining ones with 'false' constants.
4182  if (isLastTry) {
4183  llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMFalse);
4184  modifiedAny = true;
4185  goto restart;
4186  }
4187  }
4188 
4189  DEBUG_END_PASS("IsCompileTimeConstantPass");
4190 
4191  return modifiedAny;
4192 }
4193 
4194 bool IsCompileTimeConstantPass::runOnFunction(llvm::Function &F) {
4195 
4196  bool modifiedAny = false;
4197  for (llvm::BasicBlock &BB : F) {
4198  modifiedAny |= runOnBasicBlock(BB);
4199  }
4200  return modifiedAny;
4201 }
4202 
4203 static llvm::Pass *CreateIsCompileTimeConstantPass(bool isLastTry) { return new IsCompileTimeConstantPass(isLastTry); }
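The isLastTry flag matters because this pass is meant to be scheduled more than once: early runs only fold calls whose argument is provably constant, while the final run (isLastTry == true) resolves whatever remains to 'false'. A minimal sketch of that scheduling with the legacy pass manager, assuming m->module is the module being optimized and omitting the surrounding pass pipeline:

llvm::legacy::PassManager pm;
pm.add(CreateIsCompileTimeConstantPass(false)); // early: fold only proven constants
// ... other optimization passes that may expose more constants ...
pm.add(CreateIsCompileTimeConstantPass(true));  // last try: remaining calls become 'false'
pm.run(*m->module);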
4204 
4205 //////////////////////////////////////////////////////////////////////////
4206 // DebugPass
4207 
4208 /** This pass is added to the list of passes after the optimizations that
4209  we want to debug; it prints a dump of the LLVM IR to stderr. It also
4210  prints the name and number of the preceding optimization.
4211  */
4212 #ifndef ISPC_NO_DUMPS
4213 class DebugPass : public llvm::ModulePass {
4214  public:
4215  static char ID;
4216  DebugPass(char *output) : ModulePass(ID) { snprintf(str_output, sizeof(str_output), "%s", output); }
4217 
4218  llvm::StringRef getPassName() const { return "Dump LLVM IR"; }
4219  bool runOnModule(llvm::Module &m);
4220 
4221  private:
4222  char str_output[100];
4223 };
4224 
4225 char DebugPass::ID = 0;
4226 
4227 bool DebugPass::runOnModule(llvm::Module &module) {
4228  fprintf(stderr, "%s", str_output);
4229  fflush(stderr);
4230  module.dump();
4231  return true;
4232 }
4233 
4234 static llvm::Pass *CreateDebugPass(char *output) { return new DebugPass(output); }
4235 #endif
4236 
4237 //////////////////////////////////////////////////////////////////////////
4238 // DebugPassFile
4239 
4240 /** This pass is added to the list of passes after the optimizations that
4241  we want to debug; it prints a dump of the LLVM IR to a file.
4242  */
4243 #ifndef ISPC_NO_DUMPS
4244 class DebugPassFile : public llvm::ModulePass {
4245  public:
4246  static char ID;
4247  DebugPassFile(int number, llvm::StringRef name) : ModulePass(ID), pnum(number), pname(name) {}
4248 
4249  llvm::StringRef getPassName() const { return "Dump LLVM IR"; }
4250  bool runOnModule(llvm::Module &m);
4251  bool doInitialization(llvm::Module &m);
4252 
4253  private:
4254  void run(llvm::Module &m, bool init);
4255  int pnum;
4256  llvm::StringRef pname;
4257 };
4258 
4259 char DebugPassFile::ID = 0;
4260 
4261 /**
4262  * Strips all non-alphanumeric characters from the given string.
4263  */
4264 std::string sanitize(std::string in) {
4265  llvm::Regex r("[^[:alnum:]]");
4266  while (r.match(in))
4267  in = r.sub("", in);
4268  return in;
4269 }
4270 
4271 void DebugPassFile::run(llvm::Module &module, bool init) {
4272  std::error_code EC;
4273  char fname[100];
4274  snprintf(fname, sizeof(fname), "%s_%d_%s.ll", init ? "init" : "ir", pnum, sanitize(std::string(pname)).c_str());
4275  llvm::raw_fd_ostream OS(fname, EC, llvm::sys::fs::F_None);
4276  Assert(!EC && "IR dump file creation failed!");
4277  module.print(OS, 0);
4278 }
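As a concrete illustration of the naming scheme (the pass number and name here are hypothetical): a dump requested for pass number 3 named "Intrinsics Cleanup" has its space stripped by sanitize(), so run() writes the IR to ir_3_IntrinsicsCleanup.ll, or init_3_IntrinsicsCleanup.ll when called from doInitialization():

bool init = false; // true when called from doInitialization()
char fname[100];
snprintf(fname, sizeof(fname), "%s_%d_%s.ll", init ? "init" : "ir", 3,
         sanitize(std::string("Intrinsics Cleanup")).c_str());
// fname is now "ir_3_IntrinsicsCleanup.ll"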
4279 
4280 bool DebugPassFile::runOnModule(llvm::Module &module) {
4281  run(module, false);
4282  return true;
4283 }
4284 
4285 bool DebugPassFile::doInitialization(llvm::Module &module) {
4286  run(module, true);
4287  return true;
4288 }
4289 
4290 static llvm::Pass *CreateDebugPassFile(int number, llvm::StringRef name) { return new DebugPassFile(number, name); }
4291 #endif
4292 
4293 ///////////////////////////////////////////////////////////////////////////
4294 // MakeInternalFuncsStaticPass
4295 
4296 /** There are a number of target-specific functions that we use during
4297  these optimization passes. By the time we are done with optimization,
4298  any uses of these should be inlined and no calls to these functions
4299  should remain. This pass marks all of these functions as having
4300  private linkage so that subsequent passes can eliminate them as dead
4301  code, thus cleaning up the final code output by the compiler. We can't
4302  just declare these as static from the start, however, since then they
4303  end up being eliminated as dead code during early optimization passes
4304  even though we may need to generate calls to them during later
4305  optimization passes.
4306  */
4307 class MakeInternalFuncsStaticPass : public llvm::ModulePass {
4308  public:
4309  static char ID;
4310  MakeInternalFuncsStaticPass(bool last = false) : ModulePass(ID) {}
4311 
4312  void getAnalysisUsage(llvm::AnalysisUsage &AU) const { AU.setPreservesCFG(); }
4313 
4314  llvm::StringRef getPassName() const { return "Make internal funcs \"static\""; }
4315  bool runOnModule(llvm::Module &m);
4316 };
4317 
4318 char MakeInternalFuncsStaticPass::ID = 0;
4319 
4320 bool MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
4321  const char *names[] = {
4322  "__avg_up_uint8",
4323  "__avg_up_int8",
4324  "__avg_up_uint16",
4325  "__avg_up_int16",
4326  "__avg_down_uint8",
4327  "__avg_down_int8",
4328  "__avg_down_uint16",
4329  "__avg_down_int16",
4330  "__fast_masked_vload",
4331  "__gather_factored_base_offsets32_i8",
4332  "__gather_factored_base_offsets32_i16",
4333  "__gather_factored_base_offsets32_i32",
4334  "__gather_factored_base_offsets32_i64",
4335  "__gather_factored_base_offsets32_float",
4336  "__gather_factored_base_offsets32_double",
4337  "__gather_factored_base_offsets64_i8",
4338  "__gather_factored_base_offsets64_i16",
4339  "__gather_factored_base_offsets64_i32",
4340  "__gather_factored_base_offsets64_i64",
4341  "__gather_factored_base_offsets64_float",
4342  "__gather_factored_base_offsets64_double",
4343  "__gather_base_offsets32_i8",
4344  "__gather_base_offsets32_i16",
4345  "__gather_base_offsets32_i32",
4346  "__gather_base_offsets32_i64",
4347  "__gather_base_offsets32_float",
4348  "__gather_base_offsets32_double",
4349  "__gather_base_offsets64_i8",
4350  "__gather_base_offsets64_i16",
4351  "__gather_base_offsets64_i32",
4352  "__gather_base_offsets64_i64",
4353  "__gather_base_offsets64_float",
4354  "__gather_base_offsets64_double",
4355  "__gather32_i8",
4356  "__gather32_i16",
4357  "__gather32_i32",
4358  "__gather32_i64",
4359  "__gather32_float",
4360  "__gather32_double",
4361  "__gather64_i8",
4362  "__gather64_i16",
4363  "__gather64_i32",
4364  "__gather64_i64",
4365  "__gather64_float",
4366  "__gather64_double",
4367  "__gather_elt32_i8",
4368  "__gather_elt32_i16",
4369  "__gather_elt32_i32",
4370  "__gather_elt32_i64",
4371  "__gather_elt32_float",
4372  "__gather_elt32_double",
4373  "__gather_elt64_i8",
4374  "__gather_elt64_i16",
4375  "__gather_elt64_i32",
4376  "__gather_elt64_i64",
4377  "__gather_elt64_float",
4378  "__gather_elt64_double",
4379  "__masked_load_i8",
4380  "__masked_load_i16",
4381  "__masked_load_i32",
4382  "__masked_load_i64",
4383  "__masked_load_float",
4384  "__masked_load_double",
4385  "__masked_store_i8",
4386  "__masked_store_i16",
4387  "__masked_store_i32",
4388  "__masked_store_i64",
4389  "__masked_store_float",
4390  "__masked_store_double",
4391  "__masked_store_blend_i8",
4392  "__masked_store_blend_i16",
4393  "__masked_store_blend_i32",
4394  "__masked_store_blend_i64",
4395  "__masked_store_blend_float",
4396  "__masked_store_blend_double",
4397  "__scatter_factored_base_offsets32_i8",
4398  "__scatter_factored_base_offsets32_i16",
4399  "__scatter_factored_base_offsets32_i32",
4400  "__scatter_factored_base_offsets32_i64",
4401  "__scatter_factored_base_offsets32_float",
4402  "__scatter_factored_base_offsets32_double",
4403  "__scatter_factored_base_offsets64_i8",
4404  "__scatter_factored_base_offsets64_i16",
4405  "__scatter_factored_base_offsets64_i32",
4406  "__scatter_factored_base_offsets64_i64",
4407  "__scatter_factored_base_offsets64_float",
4408  "__scatter_factored_base_offsets64_double",
4409  "__scatter_base_offsets32_i8",
4410  "__scatter_base_offsets32_i16",
4411  "__scatter_base_offsets32_i32",
4412  "__scatter_base_offsets32_i64",
4413  "__scatter_base_offsets32_float",
4414  "__scatter_base_offsets32_double",
4415  "__scatter_base_offsets64_i8",
4416  "__scatter_base_offsets64_i16",
4417  "__scatter_base_offsets64_i32",
4418  "__scatter_base_offsets64_i64",
4419  "__scatter_base_offsets64_float",
4420  "__scatter_base_offsets64_double",
4421  "__scatter_elt32_i8",
4422  "__scatter_elt32_i16",
4423  "__scatter_elt32_i32",
4424  "__scatter_elt32_i64",
4425  "__scatter_elt32_float",
4426  "__scatter_elt32_double",
4427  "__scatter_elt64_i8",
4428  "__scatter_elt64_i16",
4429  "__scatter_elt64_i32",
4430  "__scatter_elt64_i64",
4431  "__scatter_elt64_float",
4432  "__scatter_elt64_double",
4433  "__scatter32_i8",
4434  "__scatter32_i16",
4435  "__scatter32_i32",
4436  "__scatter32_i64",
4437  "__scatter32_float",
4438  "__scatter32_double",
4439  "__scatter64_i8",
4440  "__scatter64_i16",
4441  "__scatter64_i32",
4442  "__scatter64_i64",
4443  "__scatter64_float",
4444  "__scatter64_double",
4445  "__prefetch_read_varying_1",
4446  "__prefetch_read_varying_2",
4447  "__prefetch_read_varying_3",
4448  "__prefetch_read_varying_nt",
4449  "__keep_funcs_live",
4450  };
4451 
4452  bool modifiedAny = false;
4453  int count = sizeof(names) / sizeof(names[0]);
4454  for (int i = 0; i < count; ++i) {
4455  llvm::Function *f = m->module->getFunction(names[i]);
4456  if (f != NULL && f->empty() == false) {
4457  f->setLinkage(llvm::GlobalValue::InternalLinkage);
4458  modifiedAny = true;
4459  }
4460  }
4461 
4462  return modifiedAny;
4463 }
4464 
4465 static llvm::Pass *CreateMakeInternalFuncsStaticPass() { return new MakeInternalFuncsStaticPass; }
4466 
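The payoff of the linkage change comes from whatever dead-code elimination runs afterwards: once a builtin is internal and unreferenced, DCE is allowed to drop its definition. A minimal sketch of the effect for a single entry from the table above ("__gather32_float" is just one of the listed names):

llvm::Function *f = m->module->getFunction("__gather32_float");
if (f != NULL && !f->empty()) {
    f->setLinkage(llvm::GlobalValue::InternalLinkage);
    // A later dead-code-elimination pass (e.g. llvm::createGlobalDCEPass())
    // may now remove the definition if nothing references it.
}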
4467 ///////////////////////////////////////////////////////////////////////////
4468 // PeepholePass
4469 
4470 class PeepholePass : public llvm::FunctionPass {
4471  public:
4472  PeepholePass();
4473 
4474  llvm::StringRef getPassName() const { return "Peephole Optimizations"; }
4475  bool runOnBasicBlock(llvm::BasicBlock &BB);
4476  bool runOnFunction(llvm::Function &F);
4477 
4478  static char ID;
4479 };
4480 
4481 char PeepholePass::ID = 0;
4482 
4483 PeepholePass::PeepholePass() : FunctionPass(ID) {}
4484 
4485 using namespace llvm::PatternMatch;
4486 
4487 template <typename Op_t, unsigned Opcode> struct CastClassTypes_match {
4488  Op_t Op;
4489  const llvm::Type *fromType, *toType;
4490 
4491  CastClassTypes_match(const Op_t &OpMatch, const llvm::Type *f, const llvm::Type *t)
4492  : Op(OpMatch), fromType(f), toType(t) {}
4493 
4494  template <typename OpTy> bool match(OpTy *V) {
4495  if (llvm::Operator *O = llvm::dyn_cast<llvm::Operator>(V))
4496  return (O->getOpcode() == Opcode && Op.match(O->getOperand(0)) && O->getType() == toType &&
4497  O->getOperand(0)->getType() == fromType);
4498  return false;
4499  }
4500 };
4501 
4502 template <typename OpTy> inline CastClassTypes_match<OpTy, llvm::Instruction::SExt> m_SExt8To16(const OpTy &Op) {
4505 }
4506 
4507 template <typename OpTy> inline CastClassTypes_match<OpTy, llvm::Instruction::ZExt> m_ZExt8To16(const OpTy &Op) {
4510 }
4511 
4512 template <typename OpTy> inline CastClassTypes_match<OpTy, llvm::Instruction::Trunc> m_Trunc16To8(const OpTy &Op) {
4515 }
4516 
4517 template <typename OpTy> inline CastClassTypes_match<OpTy, llvm::Instruction::SExt> m_SExt16To32(const OpTy &Op) {
4520 }
4521 
4522 template <typename OpTy> inline CastClassTypes_match<OpTy, llvm::Instruction::ZExt> m_ZExt16To32(const OpTy &Op) {
4525 }
4526 
4527 template <typename OpTy> inline CastClassTypes_match<OpTy, llvm::Instruction::Trunc> m_Trunc32To16(const OpTy &Op) {
4530 }
4531 
4532 template <typename Op_t> struct UDiv2_match {
4533  Op_t Op;
4534 
4535  UDiv2_match(const Op_t &OpMatch) : Op(OpMatch) {}
4536 
4537  template <typename OpTy> bool match(OpTy *V) {
4538  llvm::BinaryOperator *bop;
4539  llvm::ConstantDataVector *cdv;
4540  if ((bop = llvm::dyn_cast<llvm::BinaryOperator>(V)) &&
4541  (cdv = llvm::dyn_cast<llvm::ConstantDataVector>(bop->getOperand(1))) && cdv->getSplatValue() != NULL) {
4542  const llvm::APInt &apInt = cdv->getUniqueInteger();
4543 
4544  switch (bop->getOpcode()) {
4545  case llvm::Instruction::UDiv:
4546  // divide by 2
4547  return (apInt.isIntN(2) && Op.match(bop->getOperand(0)));
4548  case llvm::Instruction::LShr:
4549  // shift right by 1
4550  return (apInt.isIntN(1) && Op.match(bop->getOperand(0)));
4551  default:
4552  return false;
4553  }
4554  }
4555  return false;
4556  }
4557 };
4558 
4559 template <typename V> inline UDiv2_match<V> m_UDiv2(const V &v) { return UDiv2_match<V>(v); }
4560 
4561 template <typename Op_t> struct SDiv2_match {
4562  Op_t Op;
4563 
4564  SDiv2_match(const Op_t &OpMatch) : Op(OpMatch) {}
4565 
4566  template <typename OpTy> bool match(OpTy *V) {
4567  llvm::BinaryOperator *bop;
4568  llvm::ConstantDataVector *cdv;
4569  if ((bop = llvm::dyn_cast<llvm::BinaryOperator>(V)) &&
4570  (cdv = llvm::dyn_cast<llvm::ConstantDataVector>(bop->getOperand(1))) && cdv->getSplatValue() != NULL) {
4571  const llvm::APInt &apInt = cdv->getUniqueInteger();
4572 
4573  switch (bop->getOpcode()) {
4574  case llvm::Instruction::SDiv:
4575  // divide by 2
4576  return (apInt.isIntN(2) && Op.match(bop->getOperand(0)));
4577  case llvm::Instruction::AShr:
4578  // shift right by 1
4579  return (apInt.isIntN(1) && Op.match(bop->getOperand(0)));
4580  default:
4581  return false;
4582  }
4583  }
4584  return false;
4585  }
4586 };
4587 
4588 template <typename V> inline SDiv2_match<V> m_SDiv2(const V &v) { return SDiv2_match<V>(v); }
4589 
4590 // Returns true if the given function has a call to an intrinsic function
4591 // in its definition.
4592 static bool lHasIntrinsicInDefinition(llvm::Function *func) {
4593  llvm::Function::iterator bbiter = func->begin();
4594  for (; bbiter != func->end(); ++bbiter) {
4595  for (llvm::BasicBlock::iterator institer = bbiter->begin(); institer != bbiter->end(); ++institer) {
4596  if (llvm::isa<llvm::IntrinsicInst>(institer))
4597  return true;
4598  }
4599  }
4600  return false;
4601 }
4602 
4603 static llvm::Instruction *lGetBinaryIntrinsic(const char *name, llvm::Value *opa, llvm::Value *opb) {
4604  llvm::Function *func = m->module->getFunction(name);
4605  Assert(func != NULL);
4606 
4607  // Make sure that the definition of the llvm::Function has a call to an
4608  // intrinsic function in its instructions; otherwise we will generate
4609  // infinite loops where we "helpfully" turn the default implementations
4610  // of target builtins like __avg_up_uint8 that are implemented with plain
4611  // arithmetic ops into recursive calls to themselves.
4612  if (lHasIntrinsicInDefinition(func))
4613  return lCallInst(func, opa, opb, name);
4614  else
4615  return NULL;
4616 }
4617 
4618 //////////////////////////////////////////////////
4619 
4620 static llvm::Instruction *lMatchAvgUpUInt8(llvm::Value *inst) {
4621  // (unsigned int8)(((unsigned int16)a + (unsigned int16)b + 1)/2)
4622  llvm::Value *opa, *opb;
4623  const llvm::APInt *delta;
4624  if (match(inst, m_Trunc16To8(m_UDiv2(m_CombineOr(
4625  m_CombineOr(m_Add(m_ZExt8To16(m_Value(opa)), m_Add(m_ZExt8To16(m_Value(opb)), m_APInt(delta))),
4626  m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_APInt(delta)), m_ZExt8To16(m_Value(opb)))),
4627  m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_ZExt8To16(m_Value(opb))), m_APInt(delta))))))) {
4628  if (delta->isIntN(1) == false)
4629  return NULL;
4630 
4631  return lGetBinaryIntrinsic("__avg_up_uint8", opa, opb);
4632  }
4633  return NULL;
4634 }
4635 
4636 static llvm::Instruction *lMatchAvgDownUInt8(llvm::Value *inst) {
4637  // (unsigned int8)(((unsigned int16)a + (unsigned int16)b)/2)
4638  llvm::Value *opa, *opb;
4639  if (match(inst, m_Trunc16To8(m_UDiv2(m_Add(m_ZExt8To16(m_Value(opa)), m_ZExt8To16(m_Value(opb))))))) {
4640  return lGetBinaryIntrinsic("__avg_down_uint8", opa, opb);
4641  }
4642  return NULL;
4643 }
4644 
4645 static llvm::Instruction *lMatchAvgUpUInt16(llvm::Value *inst) {
4646  // (unsigned int16)(((unsigned int32)a + (unsigned int32)b + 1)/2)
4647  llvm::Value *opa, *opb;
4648  const llvm::APInt *delta;
4649  if (match(inst,
4650  m_Trunc32To16(m_UDiv2(m_CombineOr(
4651  m_CombineOr(m_Add(m_ZExt16To32(m_Value(opa)), m_Add(m_ZExt16To32(m_Value(opb)), m_APInt(delta))),
4652  m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_APInt(delta)), m_ZExt16To32(m_Value(opb)))),
4653  m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_ZExt16To32(m_Value(opb))), m_APInt(delta))))))) {
4654  if (delta->isIntN(1) == false)
4655  return NULL;
4656 
4657  return lGetBinaryIntrinsic("__avg_up_uint16", opa, opb);
4658  }
4659  return NULL;
4660 }
4661 
4662 static llvm::Instruction *lMatchAvgDownUInt16(llvm::Value *inst) {
4663  // (unsigned int16)(((unsigned int32)a + (unsigned int32)b)/2)
4664  llvm::Value *opa, *opb;
4665  if (match(inst, m_Trunc32To16(m_UDiv2(m_Add(m_ZExt16To32(m_Value(opa)), m_ZExt16To32(m_Value(opb))))))) {
4666  return lGetBinaryIntrinsic("__avg_down_uint16", opa, opb);
4667  }
4668  return NULL;
4669 }
4670 
4671 static llvm::Instruction *lMatchAvgUpInt8(llvm::Value *inst) {
4672  // (int8)(((int16)a + (int16)b + 1)/2)
4673  llvm::Value *opa, *opb;
4674  const llvm::APInt *delta;
4675  if (match(inst, m_Trunc16To8(m_SDiv2(m_CombineOr(
4676  m_CombineOr(m_Add(m_SExt8To16(m_Value(opa)), m_Add(m_SExt8To16(m_Value(opb)), m_APInt(delta))),
4677  m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_APInt(delta)), m_SExt8To16(m_Value(opb)))),
4678  m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_SExt8To16(m_Value(opb))), m_APInt(delta))))))) {
4679  if (delta->isIntN(1) == false)
4680  return NULL;
4681 
4682  return lGetBinaryIntrinsic("__avg_up_int8", opa, opb);
4683  }
4684  return NULL;
4685 }
4686 
4687 static llvm::Instruction *lMatchAvgDownInt8(llvm::Value *inst) {
4688  // (int8)(((int16)a + (int16)b)/2)
4689  llvm::Value *opa, *opb;
4690  if (match(inst, m_Trunc16To8(m_SDiv2(m_Add(m_SExt8To16(m_Value(opa)), m_SExt8To16(m_Value(opb))))))) {
4691  return lGetBinaryIntrinsic("__avg_down_int8", opa, opb);
4692  }
4693  return NULL;
4694 }
4695 
4696 static llvm::Instruction *lMatchAvgUpInt16(llvm::Value *inst) {
4697  // (int16)(((int32)a + (int32)b + 1)/2)
4698  llvm::Value *opa, *opb;
4699  const llvm::APInt *delta;
4700  if (match(inst,
4701  m_Trunc32To16(m_SDiv2(m_CombineOr(
4702  m_CombineOr(m_Add(m_SExt16To32(m_Value(opa)), m_Add(m_SExt16To32(m_Value(opb)), m_APInt(delta))),
4703  m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_APInt(delta)), m_SExt16To32(m_Value(opb)))),
4704  m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_SExt16To32(m_Value(opb))), m_APInt(delta))))))) {
4705  if (delta->isIntN(1) == false)
4706  return NULL;
4707 
4708  return lGetBinaryIntrinsic("__avg_up_int16", opa, opb);
4709  }
4710  return NULL;
4711 }
4712 
4713 static llvm::Instruction *lMatchAvgDownInt16(llvm::Value *inst) {
4714  // (int16)(((int32)a + (int32)b)/2)
4715  llvm::Value *opa, *opb;
4716  if (match(inst, m_Trunc32To16(m_SDiv2(m_Add(m_SExt16To32(m_Value(opa)), m_SExt16To32(m_Value(opb))))))) {
4717  return lGetBinaryIntrinsic("__avg_down_int16", opa, opb);
4718  }
4719  return NULL;
4720 }
4721 
4722 bool PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) {
4723  DEBUG_START_PASS("PeepholePass");
4724 
4725  bool modifiedAny = false;
4726 restart:
4727  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
4728  llvm::Instruction *inst = &*iter;
4729 
4730  llvm::Instruction *builtinCall = lMatchAvgUpUInt8(inst);
4731  if (!builtinCall)
4732  builtinCall = lMatchAvgUpUInt16(inst);
4733  if (!builtinCall)
4734  builtinCall = lMatchAvgDownUInt8(inst);
4735  if (!builtinCall)
4736  builtinCall = lMatchAvgDownUInt16(inst);
4737  if (!builtinCall)
4738  builtinCall = lMatchAvgUpInt8(inst);
4739  if (!builtinCall)
4740  builtinCall = lMatchAvgUpInt16(inst);
4741  if (!builtinCall)
4742  builtinCall = lMatchAvgDownInt8(inst);
4743  if (!builtinCall)
4744  builtinCall = lMatchAvgDownInt16(inst);
4745  if (builtinCall != NULL) {
4746  llvm::ReplaceInstWithInst(inst, builtinCall);
4747  modifiedAny = true;
4748  goto restart;
4749  }
4750  }
4751 
4752  DEBUG_END_PASS("PeepholePass");
4753 
4754  return modifiedAny;
4755 }
4756 
4757 bool PeepholePass::runOnFunction(llvm::Function &F) {
4758 
4759  bool modifiedAny = false;
4760  for (llvm::BasicBlock &BB : F) {
4761  modifiedAny |= runOnBasicBlock(BB);
4762  }
4763  return modifiedAny;
4764 }
4765 
4766 static llvm::Pass *CreatePeepholePass() { return new PeepholePass; }
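All of the matchers above target the same source-level idiom: widen both operands, add them (plus one for the rounding-up variants), halve, and narrow back. A scalar sketch of the unsigned 8-bit cases; lMatchAvgUpUInt8()/lMatchAvgDownUInt8() match the vectorized LLVM IR form of exactly these expressions:

#include <stdint.h>

static inline uint8_t avg_up_uint8(uint8_t a, uint8_t b) {
    return (uint8_t)(((uint16_t)a + (uint16_t)b + 1) / 2); // rounds up -> __avg_up_uint8
}

static inline uint8_t avg_down_uint8(uint8_t a, uint8_t b) {
    return (uint8_t)(((uint16_t)a + (uint16_t)b) / 2); // rounds down -> __avg_down_uint8
}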
4767 
4768 /** Given an llvm::Value known to be an integer, return its value as
4769  an int64_t.
4770 */
4771 static int64_t lGetIntValue(llvm::Value *offset) {
4772  llvm::ConstantInt *intOffset = llvm::dyn_cast<llvm::ConstantInt>(offset);
4773  Assert(intOffset && (intOffset->getBitWidth() == 32 || intOffset->getBitWidth() == 64));
4774  return intOffset->getSExtValue();
4775 }
4776 
4777 ///////////////////////////////////////////////////////////////////////////
4778 // ReplaceStdlibShiftPass
4779 
4780 class ReplaceStdlibShiftPass : public llvm::FunctionPass {
4781  public:
4782  static char ID;
4783  ReplaceStdlibShiftPass() : FunctionPass(ID) {}
4784 
4785  llvm::StringRef getPassName() const { return "Resolve \"replace extract insert chains\""; }
4786 
4787  bool runOnBasicBlock(llvm::BasicBlock &BB);
4788 
4789  bool runOnFunction(llvm::Function &F);
4790 };
4791 
4792 char ReplaceStdlibShiftPass::ID = 0;
4793 
4794 // This pass replaces shift() with a ShuffleVector when the offset is a constant.
4795 // rotate(), which is similar in functionality, has a slightly different
4796 // implementation. This is due to LLVM (createInstructionCombiningPass)
4797 // optimizing the rotate() implementation better when similar implementations
4798 // are used for both. This is a hack to produce similarly optimized code for
4799 // shift().
4800 bool ReplaceStdlibShiftPass::runOnBasicBlock(llvm::BasicBlock &bb) {
4801  DEBUG_START_PASS("ReplaceStdlibShiftPass");
4802  bool modifiedAny = false;
4803 
4804  llvm::Function *shifts[6];
4805  shifts[0] = m->module->getFunction("shift___vytuni");
4806  shifts[1] = m->module->getFunction("shift___vysuni");
4807  shifts[2] = m->module->getFunction("shift___vyiuni");
4808  shifts[3] = m->module->getFunction("shift___vyIuni");
4809  shifts[4] = m->module->getFunction("shift___vyfuni");
4810  shifts[5] = m->module->getFunction("shift___vyduni");
4811 
4812  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
4813  llvm::Instruction *inst = &*iter;
4814 
4815  if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst)) {
4816  llvm::Function *func = ci->getCalledFunction();
4817  for (int i = 0; i < 6; i++) {
4818  if (shifts[i] && (shifts[i] == func)) {
4819  // we matched a call
4820  llvm::Value *shiftedVec = ci->getArgOperand(0);
4821  llvm::Value *shiftAmt = ci->getArgOperand(1);
4822  if (llvm::isa<llvm::Constant>(shiftAmt)) {
4823  int vectorWidth = g->target->getVectorWidth();
4824  int *shuffleVals = new int[vectorWidth];
4825  int shiftInt = lGetIntValue(shiftAmt);
4826  for (int i = 0; i < vectorWidth; i++) {
4827  int s = i + shiftInt;
4828  s = (s < 0) ? vectorWidth : s;
4829  s = (s >= vectorWidth) ? vectorWidth : s;
4830  shuffleVals[i] = s;
4831  }
4832  llvm::Value *shuffleIdxs = LLVMInt32Vector(shuffleVals);
4833  llvm::Value *zeroVec = llvm::ConstantAggregateZero::get(shiftedVec->getType());
4834  llvm::Value *shuffle =
4835  new llvm::ShuffleVectorInst(shiftedVec, zeroVec, shuffleIdxs, "vecShift", ci);
4836  ci->replaceAllUsesWith(shuffle);
4837  modifiedAny = true;
4838  delete[] shuffleVals;
4839  } else if (g->opt.level > 0) {
4840  PerformanceWarning(SourcePos(), "Stdlib shift() called without constant shift amount.");
4841  }
4842  }
4843  }
4844  }
4845  }
4846 
4847  DEBUG_END_PASS("ReplaceStdlibShiftPass");
4848 
4849  return modifiedAny;
4850 }
4851 
4852 bool ReplaceStdlibShiftPass::runOnFunction(llvm::Function &F) {
4853 
4854  bool modifiedAny = false;
4855  for (llvm::BasicBlock &BB : F) {
4856  modifiedAny |= runOnBasicBlock(BB);
4857  }
4858  return modifiedAny;
4859 }
4860 
4861 static llvm::Pass *CreateReplaceStdlibShiftPass() { return new ReplaceStdlibShiftPass(); }
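For a constant shift amount, the loop above builds shuffle indices in which any lane shifted in from outside the vector gets index vectorWidth, i.e. the first element of the all-zero second shuffle operand. A worked example for an assumed vectorWidth of 4:

// shift by +1: shuffleVals == {1, 2, 3, 4}
//   -> shuffle(v, zeroVec) == {v[1], v[2], v[3], 0}
// shift by -1: shuffleVals == {4, 0, 1, 2}
//   -> shuffle(v, zeroVec) == {0, v[0], v[1], v[2]}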
4862 
4863 ///////////////////////////////////////////////////////////////////////////////
4864 // FixBooleanSelect
4865 //
4866 // The problem is that in LLVM 3.3 the optimizer doesn't like
4867 // the following instruction sequence:
4868 // %cmp = fcmp olt <8 x float> %a, %b
4869 // %sext_cmp = sext <8 x i1> %cmp to <8 x i32>
4870 // %new_mask = and <8 x i32> %sext_cmp, %mask
4871 // and optimizes it to the following:
4872 // %cmp = fcmp olt <8 x float> %a, %b
4873 // %cond = select <8 x i1> %cmp, <8 x i32> %mask, <8 x i32> zeroinitializer
4874 //
4875 // It wouldn't be a problem if codegen produced good code for it. But it
4876 // doesn't, especially for vectors larger than native vectors.
4877 //
4878 // This optimization reverts this pattern and should be the last one before
4879 // code gen.
4880 //
4881 // Note that this problem was introduced in LLVM 3.3 and fixed in LLVM 3.4;
4882 // see commit r194542.
4883 //
4884 // After LLVM 3.3 this optimization should probably stay for experimental
4885 // purposes, and code should be compared with and without it from
4886 // time to time to make sure that LLVM does the right thing.
4887 ///////////////////////////////////////////////////////////////////////////////
4888 
4889 class FixBooleanSelectPass : public llvm::FunctionPass {
4890  public:
4891  static char ID;
4892  FixBooleanSelectPass() : FunctionPass(ID) {}
4893 
4894  llvm::StringRef getPassName() const { return "Resolve \"replace extract insert chains\""; }
4895  bool runOnFunction(llvm::Function &F);
4896 
4897  private:
4898  llvm::Instruction *fixSelect(llvm::SelectInst *sel, llvm::SExtInst *sext);
4899 };
4900 
4901 char FixBooleanSelectPass::ID = 0;
4902 
4903 llvm::Instruction *FixBooleanSelectPass::fixSelect(llvm::SelectInst *sel, llvm::SExtInst *sext) {
4904  // Select instruction result type and its integer equivalent
4905  llvm::VectorType *orig_type = llvm::dyn_cast<llvm::VectorType>(sel->getType());
4906  Assert(orig_type);
4907  llvm::VectorType *int_type = llvm::VectorType::getInteger(orig_type);
4908 
4909  // Result value and optional pointer to instruction to delete
4910  llvm::Instruction *result = 0, *optional_to_delete = 0;
4911 
4912  // It can be a vector of integers or a vector of floating-point values.
4913  if (orig_type->getElementType()->isIntegerTy()) {
4914  // Generate sext+and, remove select.
4915  result = llvm::BinaryOperator::CreateAnd(sext, sel->getTrueValue(), "and_mask", sel);
4916  } else {
4917  llvm::BitCastInst *bc = llvm::dyn_cast<llvm::BitCastInst>(sel->getTrueValue());
4918 
4919  if (bc && bc->hasOneUse() && bc->getSrcTy()->isIntOrIntVectorTy() && bc->getSrcTy()->isVectorTy() &&
4920  llvm::isa<llvm::Instruction>(bc->getOperand(0)) &&
4921  llvm::dyn_cast<llvm::Instruction>(bc->getOperand(0))->getParent() == sel->getParent()) {
4922  // The bitcast is casting from an integer type and its operand is an instruction located in the same basic
4923  // block (otherwise it's unsafe to use it). bitcast+select => sext+and+bitcast. Create the 'and':
4924  llvm::BinaryOperator *and_inst = llvm::BinaryOperator::CreateAnd(sext, bc->getOperand(0), "and_mask", sel);
4925  // Bitcast back to original type
4926  result = new llvm::BitCastInst(and_inst, sel->getType(), "bitcast_mask_out", sel);
4927  // Original bitcast will be removed
4928  optional_to_delete = bc;
4929  } else {
4930  // General case: select => bitcast+sext+and+bitcast
4931  // Bitcast
4932  llvm::BitCastInst *bc_in = new llvm::BitCastInst(sel->getTrueValue(), int_type, "bitcast_mask_in", sel);
4933  // And
4934  llvm::BinaryOperator *and_inst = llvm::BinaryOperator::CreateAnd(sext, bc_in, "and_mask", sel);
4935  // Bitcast back to original type
4936  result = new llvm::BitCastInst(and_inst, sel->getType(), "bitcast_mask_out", sel);
4937  }
4938  }
4939 
4940  // Done, finalize.
4941  sel->replaceAllUsesWith(result);
4942  sel->eraseFromParent();
4943  if (optional_to_delete) {
4944  optional_to_delete->eraseFromParent();
4945  }
4946 
4947  return result;
4948 }
4949 
4950 bool FixBooleanSelectPass::runOnFunction(llvm::Function &F) {
4951  bool modifiedAny = false;
4952 
4953  return modifiedAny;
4954 }
4955 
4956 static llvm::Pass *CreateFixBooleanSelectPass() { return new FixBooleanSelectPass(); }