Intel SPMD Program Compiler  1.12.0
opt.cpp
1 /*
2  Copyright (c) 2010-2019, Intel Corporation
3  All rights reserved.
4 
5  Redistribution and use in source and binary forms, with or without
6  modification, are permitted provided that the following conditions are
7  met:
8 
9  * Redistributions of source code must retain the above copyright
10  notice, this list of conditions and the following disclaimer.
11 
12  * Redistributions in binary form must reproduce the above copyright
13  notice, this list of conditions and the following disclaimer in the
14  documentation and/or other materials provided with the distribution.
15 
16  * Neither the name of Intel Corporation nor the names of its
17  contributors may be used to endorse or promote products derived from
18  this software without specific prior written permission.
19 
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
22  IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
25  OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33 
34 /** @file opt.cpp
35  @brief Implementations of various ispc optimization passes that operate
36  on the LLVM IR.
37 */
38 
39 #include "opt.h"
40 #include "ctx.h"
41 #include "llvmutil.h"
42 #include "module.h"
43 #include "sym.h"
44 #include "util.h"
45 
46 #include <map>
47 #include <set>
48 #include <stdio.h>
49 
50 #include <llvm/Pass.h>
51 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
52 #include <llvm/BasicBlock.h>
53 #include <llvm/Constants.h>
54 #include <llvm/Function.h>
55 #include <llvm/Instructions.h>
56 #include <llvm/Intrinsics.h>
57 #include <llvm/Module.h>
58 #ifdef ISPC_NVPTX_ENABLED
59 #include <llvm/InlineAsm.h>
60 #endif /* ISPC_NVPTX_ENABLED */
61 #else // LLVM 3.3+
62 #include <llvm/IR/BasicBlock.h>
63 #include <llvm/IR/Constants.h>
64 #include <llvm/IR/Function.h>
65 #include <llvm/IR/Instructions.h>
66 #include <llvm/IR/Intrinsics.h>
67 #include <llvm/IR/Module.h>
68 #ifdef ISPC_NVPTX_ENABLED
69 #include <llvm/IR/InlineAsm.h>
70 #endif /* ISPC_NVPTX_ENABLED */
71 #endif
72 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_4 // LLVM 3.4+
73 #include <llvm/Transforms/Instrumentation.h>
74 #endif
75 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
76 #include "llvm/PassManager.h"
77 #else // LLVM 3.7+
78 #include "llvm/IR/LegacyPassManager.h"
79 #endif
80 #include <llvm/PassRegistry.h>
81 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_5 // LLVM 3.5+
82 #include <llvm/IR/DebugInfo.h>
83 #include <llvm/IR/IRPrintingPasses.h>
84 #include <llvm/IR/PatternMatch.h>
85 #include <llvm/IR/Verifier.h>
86 #else // < 3.5
87 #include <llvm/Analysis/Verifier.h>
88 #include <llvm/Assembly/PrintModulePass.h>
89 #include <llvm/DebugInfo.h>
90 #include <llvm/Support/PatternMatch.h>
91 #endif
92 #include <llvm/Analysis/ConstantFolding.h>
93 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
94 #include <llvm/Target/TargetLibraryInfo.h>
95 #else // LLVM 3.7+
96 #include <llvm/Analysis/TargetLibraryInfo.h>
97 #endif
98 #include <llvm/ADT/SmallSet.h>
99 #include <llvm/ADT/Triple.h>
100 #include <llvm/Transforms/IPO.h>
101 #include <llvm/Transforms/Scalar.h>
102 #if ISPC_LLVM_VERSION >= ISPC_LLVM_7_0
103 #include "llvm/Transforms/InstCombine/InstCombine.h"
104 #include "llvm/Transforms/Utils.h"
105 #endif
106 #include <llvm/Target/TargetOptions.h>
107 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
108 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
109 #include <llvm/DataLayout.h>
110 #else // LLVM 3.3+
111 #include <llvm/Analysis/TargetTransformInfo.h>
112 #include <llvm/IR/DataLayout.h>
113 #endif
114 #include <llvm/Target/TargetMachine.h>
115 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_8 // LLVM 3.8+
116 #include "llvm/Analysis/TypeBasedAliasAnalysis.h"
117 #include <llvm/Analysis/BasicAliasAnalysis.h>
118 #endif
119 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_9 // LLVM 3.9+
120 #include "llvm/Transforms/IPO/FunctionAttrs.h"
121 #include "llvm/Transforms/Scalar/GVN.h"
122 #endif
123 #include <llvm/Analysis/Passes.h>
124 #include <llvm/Support/raw_ostream.h>
125 #if ISPC_LLVM_VERSION >= ISPC_LLVM_5_0 // LLVM 5.0+
126 #include <llvm/BinaryFormat/Dwarf.h>
127 #else // LLVM up to 4.x
128 #include <llvm/Support/Dwarf.h>
129 #endif
130 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_6
131 #include <llvm/IR/IntrinsicInst.h>
132 #endif
133 #ifdef ISPC_HOST_IS_LINUX
134 #include <alloca.h>
135 #elif defined(ISPC_HOST_IS_WINDOWS)
136 #include <malloc.h>
137 #ifndef __MINGW32__
138 #define alloca _alloca
139 #endif
140 #endif // ISPC_HOST_IS_WINDOWS
141 
142 #ifndef PRId64
143 #define PRId64 "lld"
144 #endif
145 #ifndef PRIu64
146 #define PRIu64 "llu"
147 #endif
148 #ifndef ISPC_NO_DUMPS
149 #include <llvm/Support/FileSystem.h>
150 #include <llvm/Support/Regex.h>
151 #endif
152 
153 static llvm::Pass *CreateIntrinsicsOptPass();
154 static llvm::Pass *CreateInstructionSimplifyPass();
155 static llvm::Pass *CreatePeepholePass();
156 
157 static llvm::Pass *CreateImproveMemoryOpsPass();
158 static llvm::Pass *CreateGatherCoalescePass();
159 static llvm::Pass *CreateReplacePseudoMemoryOpsPass();
160 
161 static llvm::Pass *CreateIsCompileTimeConstantPass(bool isLastTry);
162 static llvm::Pass *CreateMakeInternalFuncsStaticPass();
163 
164 #ifndef ISPC_NO_DUMPS
165 static llvm::Pass *CreateDebugPass(char *output);
166 static llvm::Pass *CreateDebugPassFile(int number, llvm::StringRef name);
167 #endif
168 
169 static llvm::Pass *CreateReplaceStdlibShiftPass();
170 
171 static llvm::Pass *CreateFixBooleanSelectPass();
172 #ifdef ISPC_NVPTX_ENABLED
173 static llvm::Pass *CreatePromoteLocalToPrivatePass();
174 #endif /* ISPC_NVPTX_ENABLED */
175 
176 #ifndef ISPC_NO_DUMPS
177 #define DEBUG_START_PASS(NAME) \
178  if (g->debugPrint && \
179  (getenv("FUNC") == NULL || (getenv("FUNC") != NULL && !strncmp(bb.getParent()->getName().str().c_str(), \
180  getenv("FUNC"), strlen(getenv("FUNC")))))) { \
181  fprintf(stderr, "Start of " NAME "\n"); \
182  fprintf(stderr, "---------------\n"); \
183  bb.dump(); \
184  fprintf(stderr, "---------------\n\n"); \
185  } else /* eat semicolon */
186 
187 #define DEBUG_END_PASS(NAME) \
188  if (g->debugPrint && \
189  (getenv("FUNC") == NULL || (getenv("FUNC") != NULL && !strncmp(bb.getParent()->getName().str().c_str(), \
190  getenv("FUNC"), strlen(getenv("FUNC")))))) { \
191  fprintf(stderr, "End of " NAME " %s\n", modifiedAny ? "** CHANGES **" : ""); \
192  fprintf(stderr, "---------------\n"); \
193  bb.dump(); \
194  fprintf(stderr, "---------------\n\n"); \
195  } else /* eat semicolon */
196 #else
197 #define DEBUG_START_PASS(NAME)
198 #define DEBUG_END_PASS(NAME)
199 #endif
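// A minimal sketch of how these macros are meant to be used: they bracket a
// pass's runOnBasicBlock() implementation, and setting the FUNC environment
// variable restricts the dumps to functions whose names start with that
// prefix.  The pass name "MyPass" below is hypothetical, not one of the
// passes defined in this file.
#if 0
bool MyPass::runOnBasicBlock(llvm::BasicBlock &bb) {
    DEBUG_START_PASS("MyPass"); // dumps bb before the pass runs (when g->debugPrint is set)
    bool modifiedAny = false;
    // ... transform instructions in bb, setting modifiedAny on any change ...
    DEBUG_END_PASS("MyPass");   // dumps bb again and notes whether anything changed
    return modifiedAny;
}
#endif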
200 
201 ///////////////////////////////////////////////////////////////////////////
202 
203 /** This utility routine copies the metadata (if any) attached to the
204  'from' instruction in the IR to the 'to' instruction.
205 
206  For flexibility, this function takes an llvm::Value rather than an
207  llvm::Instruction for the 'to' parameter; at some places in the code
208  below, we start out storing a value in an llvm::Value and only later
209  store instructions. If an llvm::Value is passed to this, the
210  routine just returns without doing anything; if it is in fact an
211  llvm::Instruction, then the metadata can be copied to it.
212  */
213 static void lCopyMetadata(llvm::Value *vto, const llvm::Instruction *from) {
214  llvm::Instruction *to = llvm::dyn_cast<llvm::Instruction>(vto);
215  if (!to)
216  return;
217 
218  llvm::SmallVector<std::pair<unsigned int, llvm::MDNode *>, 8> metadata;
219 
220  from->getAllMetadata(metadata);
221  for (unsigned int i = 0; i < metadata.size(); ++i)
222  to->setMetadata(metadata[i].first, metadata[i].second);
223 }
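// Typical use, as seen repeatedly later in this file: when an instruction is
// rewritten (e.g. a masked load replaced with a regular load), the replacement
// should inherit the original's metadata so that the source-position
// information used for performance warnings survives the rewrite.  A small
// sketch, with oldInst/newInst as placeholder values:
#if 0
static void lReplaceKeepingMetadata(llvm::Instruction *oldInst, llvm::Instruction *newInst) {
    lCopyMetadata(newInst, oldInst); // copy "filename", "first_line", etc.
    llvm::ReplaceInstWithInst(oldInst, newInst);
}
#endif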
224 
225 /** We have a protocol with the front-end LLVM IR code generation process
226  that allows us to encode the source file position that corresponds with
227  instructions. (For example, this allows us to issue performance
228  warnings related to things like scatter and gather after optimization
229  has been performed, so that we aren't warning about scatters and
230  gathers that have been improved to stores and loads by optimization
231  passes.) Note that this is slightly redundant with the source file
232  position encoding generated for debugging symbols; we don't always
233  generate debugging information, but we do always generate this
234  position data.
235 
236  This function finds the SourcePos that the metadata in the instruction
237  (if present) corresponds to. See the implementation of
238  FunctionEmitContext::addGSMetadata(), which encodes the source position during
239  code generation.
240 
241  @param inst Instruction to try to find the source position of
242  @param pos Output variable in which to store the position
243  @returns True if source file position metadata was present and *pos
244  has been set. False otherwise.
245 */
246 static bool lGetSourcePosFromMetadata(const llvm::Instruction *inst, SourcePos *pos) {
247  llvm::MDNode *filename = inst->getMetadata("filename");
248  llvm::MDNode *first_line = inst->getMetadata("first_line");
249  llvm::MDNode *first_column = inst->getMetadata("first_column");
250  llvm::MDNode *last_line = inst->getMetadata("last_line");
251  llvm::MDNode *last_column = inst->getMetadata("last_column");
252 
253  if (!filename || !first_line || !first_column || !last_line || !last_column)
254  return false;
255 
256  // All of these asserts check things that FunctionEmitContext::addGSMetadata()
257  // is expected to have ensured when it created this metadata
258  llvm::MDString *str = llvm::dyn_cast<llvm::MDString>(filename->getOperand(0));
259  Assert(str);
260  llvm::ConstantInt *first_lnum =
261 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5
262  llvm::dyn_cast<llvm::ConstantInt>(first_line->getOperand(0));
263 #else /* LLVM 3.6+ */
264  llvm::mdconst::extract<llvm::ConstantInt>(first_line->getOperand(0));
265 #endif
266  Assert(first_lnum);
267 
268  llvm::ConstantInt *first_colnum =
269 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5
270  llvm::dyn_cast<llvm::ConstantInt>(first_column->getOperand(0));
271 #else /* LLVM 3.6+ */
272  llvm::mdconst::extract<llvm::ConstantInt>(first_column->getOperand(0));
273 #endif
274  Assert(first_colnum);
275 
276  llvm::ConstantInt *last_lnum =
277 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5
278  llvm::dyn_cast<llvm::ConstantInt>(last_line->getOperand(0));
279 #else /* LLVM 3.6+ */
280  llvm::mdconst::extract<llvm::ConstantInt>(last_line->getOperand(0));
281 #endif
282  Assert(last_lnum);
283 
284  llvm::ConstantInt *last_colnum =
285 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5
286  llvm::dyn_cast<llvm::ConstantInt>(last_column->getOperand(0));
287 #else /* LLVM 3.6+ */
288  llvm::mdconst::extract<llvm::ConstantInt>(last_column->getOperand(0));
289 #endif
290  Assert(last_colnum);
291 
292  *pos = SourcePos(str->getString().data(), (int)first_lnum->getZExtValue(), (int)first_colnum->getZExtValue(),
293  (int)last_lnum->getZExtValue(), (int)last_colnum->getZExtValue());
294  return true;
295 }
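// A sketch of how the recovered position is typically consumed: a pass that
// still sees a real gather after optimization can attach a performance
// warning to the user's source line.  This assumes the PerformanceWarning()
// helper declared in util.h; the function below is illustrative only.
#if 0
static void lWarnAboutGather(const llvm::Instruction *inst) {
    SourcePos pos;
    if (lGetSourcePosFromMetadata(inst, &pos))
        PerformanceWarning(pos, "Gather required to load value.");
}
#endif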
296 
297 static llvm::Instruction *lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, const char *name,
298  llvm::Instruction *insertBefore = NULL) {
299  llvm::Value *args[2] = {arg0, arg1};
300  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[2]);
301  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
302 }
303 
304 static llvm::Instruction *lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::Value *arg2,
305  const char *name, llvm::Instruction *insertBefore = NULL) {
306  llvm::Value *args[3] = {arg0, arg1, arg2};
307  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[3]);
308  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
309 }
310 
311 static llvm::Instruction *lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::Value *arg2,
312  llvm::Value *arg3, const char *name, llvm::Instruction *insertBefore = NULL) {
313  llvm::Value *args[4] = {arg0, arg1, arg2, arg3};
314  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[4]);
315  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
316 }
317 
318 static llvm::Instruction *lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::Value *arg2,
319  llvm::Value *arg3, llvm::Value *arg4, const char *name,
320  llvm::Instruction *insertBefore = NULL) {
321  llvm::Value *args[5] = {arg0, arg1, arg2, arg3, arg4};
322  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[5]);
323  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
324 }
325 
326 static llvm::Instruction *lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::Value *arg2,
327  llvm::Value *arg3, llvm::Value *arg4, llvm::Value *arg5, const char *name,
328  llvm::Instruction *insertBefore = NULL) {
329  llvm::Value *args[6] = {arg0, arg1, arg2, arg3, arg4, arg5};
330  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[6]);
331  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
332 }
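// These overloads just wrap llvm::CallInst::Create() for the small, fixed
// argument counts used in this file.  For example, building a two-argument
// call looks like the following; the function and value names are
// illustrative placeholders, not symbols defined elsewhere in ispc.
#if 0
static llvm::Instruction *lEmitExampleCall(llvm::Function *callee, llvm::Value *ptr, llvm::Value *mask,
                                           llvm::Instruction *insertBefore) {
    return lCallInst(callee, ptr, mask, "example_call", insertBefore);
}
#endif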
333 
334 static llvm::Instruction *lGEPInst(llvm::Value *ptr, llvm::Value *offset, const char *name,
335  llvm::Instruction *insertBefore) {
336  llvm::Value *index[1] = {offset};
337  llvm::ArrayRef<llvm::Value *> arrayRef(&index[0], &index[1]);
338 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
339  return llvm::GetElementPtrInst::Create(ptr, arrayRef, name, insertBefore);
340 #else // LLVM 3.7+
341  return llvm::GetElementPtrInst::Create(PTYPE(ptr), ptr, arrayRef, name, insertBefore);
342 #endif
343 }
344 
345 /** Given a vector of constant values (int, float, or bool) representing an
346  execution mask, convert it to a bitvector where the 0th bit corresponds
347  to the first vector value and so forth.
348 */
349 static uint64_t lConstElementsToMask(const llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> &elements) {
350  Assert(elements.size() <= 64);
351 
352  uint64_t mask = 0;
353  for (unsigned int i = 0; i < elements.size(); ++i) {
354  llvm::APInt intMaskValue;
355  // SSE has the "interesting" approach of encoding blending
356  // masks as <n x float>.
357  llvm::ConstantFP *cf = llvm::dyn_cast<llvm::ConstantFP>(elements[i]);
358  if (cf != NULL) {
359  llvm::APFloat apf = cf->getValueAPF();
360  intMaskValue = apf.bitcastToAPInt();
361  } else {
362  // Otherwise get it as an int
363  llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(elements[i]);
364  Assert(ci != NULL); // vs return -1 if NULL?
365  intMaskValue = ci->getValue();
366  }
367  // Is the high-bit set? If so, OR in the appropriate bit in
368  // the result mask
369  if (intMaskValue.countLeadingOnes() > 0)
370  mask |= (1ull << i);
371  }
372  return mask;
373 }
374 
375 /** Given an llvm::Value representing a vector mask, see if the value is a
376  constant. If so, return true and set *bits to be the integer mask
377  found by taking the high bits of the mask values in turn and
378  concatenating them into a single integer. In other words, given the
379  4-wide mask: < 0xffffffff, 0, 0, 0xffffffff >, we have 0b1001 = 9.
380  */
381 static bool lGetMask(llvm::Value *factor, uint64_t *mask) {
382  llvm::ConstantDataVector *cdv = llvm::dyn_cast<llvm::ConstantDataVector>(factor);
383  if (cdv != NULL) {
384  llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
385  for (int i = 0; i < (int)cdv->getNumElements(); ++i)
386  elements.push_back(cdv->getElementAsConstant(i));
387  *mask = lConstElementsToMask(elements);
388  return true;
389  }
390 
391  llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(factor);
392  if (cv != NULL) {
393  llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
394  for (int i = 0; i < (int)cv->getNumOperands(); ++i) {
395  llvm::Constant *c = llvm::dyn_cast<llvm::Constant>(cv->getOperand(i));
396  if (c == NULL)
397  return false;
398  if (llvm::isa<llvm::ConstantExpr>(cv->getOperand(i)))
399  return false; // We can not handle constant expressions here
400  elements.push_back(c);
401  }
402  *mask = lConstElementsToMask(elements);
403  return true;
404  } else if (llvm::isa<llvm::ConstantAggregateZero>(factor)) {
405  *mask = 0;
406  return true;
407  } else {
408 #if 0
409  llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(factor);
410  if (ce != NULL) {
411  llvm::TargetMachine *targetMachine = g->target->GetTargetMachine();
412  const llvm::TargetData *td = targetMachine->getTargetData();
413  llvm::Constant *c = llvm::ConstantFoldConstantExpression(ce, td);
414  c->dump();
415  factor = c;
416  }
417  // else we should be able to handle it above...
418  Assert(!llvm::isa<llvm::Constant>(factor));
419 #endif
420  return false;
421  }
422 }
423 
424 enum MaskStatus { ALL_ON, ALL_OFF, MIXED, UNKNOWN };
425 
426 /** Determines if the given mask value is all on, all off, mixed, or
427  unknown at compile time.
428 */
429 static MaskStatus lGetMaskStatus(llvm::Value *mask, int vecWidth = -1) {
430  uint64_t bits;
431  if (lGetMask(mask, &bits) == false)
432  return UNKNOWN;
433 
434  if (bits == 0)
435  return ALL_OFF;
436 
437  if (vecWidth == -1)
438  vecWidth = g->target->getVectorWidth();
439  Assert(vecWidth <= 64);
440 
441  for (int i = 0; i < vecWidth; ++i) {
442  if ((bits & (1ull << i)) == 0)
443  return MIXED;
444  }
445  return ALL_ON;
446 }
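// A sketch of how the three mask helpers above compose, assuming a 4-wide
// target; the constants are illustrative, not taken from a real compilation.
#if 0
static void lMaskHelpersExample(llvm::Value *maskVec) {
    // Suppose maskVec is the constant vector < 0xffffffff, 0, 0, 0xffffffff >.
    uint64_t bits;
    if (lGetMask(maskVec, &bits)) {
        // bits == 0b1001 == 9: lanes 0 and 3 have their high bits set.
        MaskStatus status = lGetMaskStatus(maskVec, 4);
        // status == MIXED here; an all-zero vector gives ALL_OFF, a vector
        // with every lane's high bit set gives ALL_ON, and a non-constant
        // mask makes lGetMask() return false, so lGetMaskStatus() reports
        // UNKNOWN.
        (void)status;
    }
}
#endif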
447 
448 ///////////////////////////////////////////////////////////////////////////
449 // This is a wrapper over the llvm::PassManager class. It duplicates the PassManager run()
450 // function and extends the add() function with some extra checks and debug passes.
451 // This wrapper can control:
452 // - whether we want to switch off the optimization with a given number;
453 // - whether we want to dump the LLVM IR after the optimization with a given number;
454 // - whether we want to generate LLVM IR debug information for gdb after the optimization with a given number.
455 class DebugPassManager {
456  public:
457  DebugPassManager() : number(0) {}
458  void add(llvm::Pass *P, int stage);
459  bool run(llvm::Module &M) { return PM.run(M); }
460 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
461  llvm::PassManager &getPM() { return PM; }
462 #else /* LLVM 3.7+ */
463  llvm::legacy::PassManager &getPM() { return PM; }
464 #endif
465  private:
466 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
467  llvm::PassManager PM;
468 #else /* LLVM 3.7+ */
469  llvm::legacy::PassManager PM;
470 #endif
471  int number;
472 };
473 
474 void DebugPassManager::add(llvm::Pass *P, int stage = -1) {
475  // Determine the stage number for this optimization.
476  if (stage == -1) {
477  number++;
478  } else {
479  number = stage;
480  }
481  if (g->off_stages.find(number) == g->off_stages.end()) {
482  // adding optimization (not switched off)
483  PM.add(P);
484 #ifndef ISPC_NO_DUMPS
485  if (g->debug_stages.find(number) != g->debug_stages.end()) {
486  // adding dump of LLVM IR after optimization
487  if (g->dumpFile) {
488  PM.add(CreateDebugPassFile(number, P->getPassName()));
489  } else {
490  char buf[100];
491 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
492  snprintf(buf, sizeof(buf), "\n\n*****LLVM IR after phase %d: %s*****\n\n", number, P->getPassName());
493 #else // LLVM 4.0+
494  snprintf(buf, sizeof(buf), "\n\n*****LLVM IR after phase %d: %s*****\n\n", number,
495  P->getPassName().data());
496 #endif
497  PM.add(CreateDebugPass(buf));
498  }
499  }
500 #endif
501 
502 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_4 || ISPC_LLVM_VERSION == ISPC_LLVM_3_5 // only 3.4 and 3.5
503  if (g->debugIR == number) {
504  // adding generating of LLVM IR debug after optimization
505  char buf[100];
506  snprintf(buf, sizeof(buf), "Debug_IR_after_%d_phase.bc", number);
507  PM.add(llvm::createDebugIRPass(true, true, ".", buf));
508  }
509 #endif
510  }
511 }
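// The stage number passed to add() is what g->debug_stages and g->off_stages
// (populated from ispc's phase-related command-line options) are matched
// against.  A condensed sketch of how Optimize() below drives this wrapper;
// the particular passes chosen here are just for illustration:
#if 0
static void lRunExamplePipeline(llvm::Module *module) {
    DebugPassManager optPM;
    optPM.add(llvm::createVerifierPass(), 0);       // explicit stage number 0
    optPM.add(llvm::createCFGSimplificationPass()); // no stage given: auto-increments to 1
    optPM.add(CreatePeepholePass(), 250);           // jump explicitly to stage 250
    optPM.run(*module);
}
#endif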
512 ///////////////////////////////////////////////////////////////////////////
513 
514 void Optimize(llvm::Module *module, int optLevel) {
515 #ifndef ISPC_NO_DUMPS
516  if (g->debugPrint) {
517  printf("*** Code going into optimization ***\n");
518  module->dump();
519  }
520 #endif
521  DebugPassManager optPM;
522  optPM.add(llvm::createVerifierPass(), 0);
523 
524 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
525  llvm::TargetLibraryInfo *targetLibraryInfo = new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple()));
526  optPM.add(targetLibraryInfo);
527 #else // LLVM 3.7+
528  optPM.add(new llvm::TargetLibraryInfoWrapperPass(llvm::Triple(module->getTargetTriple())));
529 #endif
530 
531 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_4
532  optPM.add(new llvm::DataLayout(*g->target->getDataLayout()));
533 #elif ISPC_LLVM_VERSION == ISPC_LLVM_3_5
534  optPM.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
535 #elif ISPC_LLVM_VERSION == ISPC_LLVM_3_6
536  llvm::DataLayoutPass *dlp = new llvm::DataLayoutPass();
537  dlp->doInitialization(*module);
538  optPM.add(dlp);
539 #endif // LLVM 3.7+ doesn't have DataLayoutPass anymore.
540 
541  llvm::TargetMachine *targetMachine = g->target->GetTargetMachine();
542 
543 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
544  optPM.add(new llvm::TargetTransformInfo(targetMachine->getScalarTargetTransformInfo(),
545  targetMachine->getVectorTargetTransformInfo()));
546 #elif ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
547  targetMachine->addAnalysisPasses(optPM.getPM());
548 #else // LLVM 3.7+
549  optPM.getPM().add(createTargetTransformInfoWrapperPass(targetMachine->getTargetIRAnalysis()));
550 #endif
551 
552  optPM.add(llvm::createIndVarSimplifyPass());
553 
554  if (optLevel == 0) {
555  // This is more or less the minimum set of optimizations that we
556  // need to do to generate code that will actually run. (We can't
557  // run absolutely no optimizations, since the front-end needs us to
558  // take the various __pseudo_* functions it has emitted and turn
559  // them into something that can actually execute.)
560  optPM.add(CreateImproveMemoryOpsPass(), 100);
561 #ifdef ISPC_NVPTX_ENABLED
562  if (g->target->getISA() != Target::NVPTX)
563 #endif /* ISPC_NVPTX_ENABLED */
564  optPM.add(CreateImproveMemoryOpsPass(), 100);
565 
566  if (g->opt.disableHandlePseudoMemoryOps == false)
567  optPM.add(CreateReplacePseudoMemoryOpsPass());
568 
569  optPM.add(CreateIntrinsicsOptPass(), 102);
570  optPM.add(CreateIsCompileTimeConstantPass(true));
571  optPM.add(llvm::createFunctionInliningPass());
572  optPM.add(CreateMakeInternalFuncsStaticPass());
573  optPM.add(llvm::createCFGSimplificationPass());
574  optPM.add(llvm::createGlobalDCEPass());
575  } else {
576  llvm::PassRegistry *registry = llvm::PassRegistry::getPassRegistry();
577  llvm::initializeCore(*registry);
578  llvm::initializeScalarOpts(*registry);
579  llvm::initializeIPO(*registry);
580  llvm::initializeAnalysis(*registry);
581 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_7
582  llvm::initializeIPA(*registry);
583 #endif
584  llvm::initializeTransformUtils(*registry);
585  llvm::initializeInstCombine(*registry);
586  llvm::initializeInstrumentation(*registry);
587  llvm::initializeTarget(*registry);
588 
589  optPM.add(llvm::createGlobalDCEPass(), 185);
590 
591  // Setup to use LLVM default AliasAnalysis
592  // Ideally, we want to call:
593  // llvm::PassManagerBuilder pm_Builder;
594  // pm_Builder.OptLevel = optLevel;
595  // pm_Builder.addInitialAliasAnalysisPasses(optPM);
596  // but the addInitialAliasAnalysisPasses() is a private function
597  // so we explicitly enable them here.
598  // This needs to be kept in sync with future LLVM changes.
599  // An alternative is to call populateFunctionPassManager()
600 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_7
601  optPM.add(llvm::createTypeBasedAliasAnalysisPass(), 190);
602  optPM.add(llvm::createBasicAliasAnalysisPass());
603 #else
604  optPM.add(llvm::createTypeBasedAAWrapperPass(), 190);
605  optPM.add(llvm::createBasicAAWrapperPass());
606 #endif
607  optPM.add(llvm::createCFGSimplificationPass());
608 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
609  optPM.add(llvm::createScalarReplAggregatesPass());
610 #else
611  optPM.add(llvm::createSROAPass());
612 #endif
613  optPM.add(llvm::createEarlyCSEPass());
614  optPM.add(llvm::createLowerExpectIntrinsicPass());
615 
616  // Early optimizations to try to reduce the total amount of code to
617  // work with if we can
618  optPM.add(llvm::createReassociatePass(), 200);
619  optPM.add(llvm::createConstantPropagationPass());
620  optPM.add(llvm::createDeadInstEliminationPass());
621  optPM.add(llvm::createCFGSimplificationPass());
622 
623  optPM.add(llvm::createPromoteMemoryToRegisterPass());
624  optPM.add(llvm::createAggressiveDCEPass());
625 
626  if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) {
627  optPM.add(llvm::createInstructionCombiningPass(), 210);
629  }
630  if (!g->opt.disableMaskAllOnOptimizations) {
631  optPM.add(CreateIntrinsicsOptPass(), 215);
632  optPM.add(CreateInstructionSimplifyPass());
633  }
634  optPM.add(llvm::createDeadInstEliminationPass(), 220);
635 
636  // Max struct size threshold for scalar replacement is
637  // 1) 4 fields (r,g,b,w)
638  // 2) field size: vectorWidth * sizeof(float)
639 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
640  const int field_limit = 4;
641  int sr_threshold = g->target->getVectorWidth() * sizeof(float) * field_limit;
642 #endif
643 
644  // On to more serious optimizations
645 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
646  optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold));
647 #else
648  optPM.add(llvm::createSROAPass());
649 #endif
650  optPM.add(llvm::createInstructionCombiningPass());
651  optPM.add(llvm::createCFGSimplificationPass());
652  optPM.add(llvm::createPromoteMemoryToRegisterPass());
653  optPM.add(llvm::createGlobalOptimizerPass());
654  optPM.add(llvm::createReassociatePass());
655  optPM.add(llvm::createIPConstantPropagationPass());
656 
657 #ifdef ISPC_NVPTX_ENABLED
658  if (g->target->getISA() != Target::NVPTX)
659 #endif /* ISPC_NVPTX_ENABLED */
660  optPM.add(CreateReplaceStdlibShiftPass(), 229);
661 
662  optPM.add(llvm::createDeadArgEliminationPass(), 230);
663  optPM.add(llvm::createInstructionCombiningPass());
664  optPM.add(llvm::createCFGSimplificationPass());
665  optPM.add(llvm::createPruneEHPass());
666 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_9 // 3.9+
667  optPM.add(llvm::createPostOrderFunctionAttrsLegacyPass());
668  optPM.add(llvm::createReversePostOrderFunctionAttrsPass());
669 #elif ISPC_LLVM_VERSION == ISPC_LLVM_3_8 // 3.8
670  optPM.add(llvm::createPostOrderFunctionAttrsPass());
671  optPM.add(llvm::createReversePostOrderFunctionAttrsPass());
672 #else // 3.7 and earlier
673  optPM.add(llvm::createFunctionAttrsPass());
674 #endif
675  optPM.add(llvm::createFunctionInliningPass());
676  optPM.add(llvm::createConstantPropagationPass());
677  optPM.add(llvm::createDeadInstEliminationPass());
678  optPM.add(llvm::createCFGSimplificationPass());
679 
680  optPM.add(llvm::createArgumentPromotionPass());
681 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_3
682  // Starting from 3.4 this functionality was moved to
683  // InstructionCombiningPass. See r184459 for details.
684  optPM.add(llvm::createSimplifyLibCallsPass(), 240);
685 #endif
686  optPM.add(llvm::createAggressiveDCEPass());
687  optPM.add(llvm::createInstructionCombiningPass(), 241);
688  optPM.add(llvm::createJumpThreadingPass());
689  optPM.add(llvm::createCFGSimplificationPass());
690 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
691  optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold));
692 #else
693  optPM.add(llvm::createSROAPass());
694 #endif
695  optPM.add(llvm::createInstructionCombiningPass());
696  optPM.add(llvm::createTailCallEliminationPass());
697 
698  if (!g->opt.disableMaskAllOnOptimizations) {
699  optPM.add(CreateIntrinsicsOptPass(), 250);
700  optPM.add(CreateInstructionSimplifyPass());
701  }
702 
703  if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) {
704  optPM.add(llvm::createInstructionCombiningPass(), 255);
706 
707  if (g->opt.disableCoalescing == false && g->target->getISA() != Target::GENERIC) {
708  // It is important to run this here to make it easier to
709  // find matching gathers that we can coalesce.
710  optPM.add(llvm::createEarlyCSEPass(), 260);
711  optPM.add(CreateGatherCoalescePass());
712  }
713  }
714 
715  optPM.add(llvm::createFunctionInliningPass(), 265);
716  optPM.add(llvm::createConstantPropagationPass());
717  optPM.add(CreateIntrinsicsOptPass());
719 
720  if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) {
721  optPM.add(llvm::createInstructionCombiningPass(), 270);
723  }
724 
725  optPM.add(llvm::createIPSCCPPass(), 275);
726  optPM.add(llvm::createDeadArgEliminationPass());
727  optPM.add(llvm::createAggressiveDCEPass());
728  optPM.add(llvm::createInstructionCombiningPass());
729  optPM.add(llvm::createCFGSimplificationPass());
730 
731  if (g->opt.disableHandlePseudoMemoryOps == false) {
732  optPM.add(CreateReplacePseudoMemoryOpsPass());
733  }
734  optPM.add(CreateIntrinsicsOptPass(), 281);
736 
737  optPM.add(llvm::createFunctionInliningPass());
738  optPM.add(llvm::createArgumentPromotionPass());
739 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
740  optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold, false));
741 #else
742  optPM.add(llvm::createSROAPass());
743 #endif
744  optPM.add(llvm::createInstructionCombiningPass());
746  optPM.add(llvm::createCFGSimplificationPass());
747  optPM.add(llvm::createReassociatePass());
748  optPM.add(llvm::createLoopRotatePass());
749  optPM.add(llvm::createLICMPass());
750  optPM.add(llvm::createLoopUnswitchPass(false));
751  optPM.add(llvm::createInstructionCombiningPass());
753  optPM.add(llvm::createIndVarSimplifyPass());
754  optPM.add(llvm::createLoopIdiomPass());
755  optPM.add(llvm::createLoopDeletionPass());
756  if (g->opt.unrollLoops) {
757  optPM.add(llvm::createLoopUnrollPass(), 300);
758  }
759  optPM.add(llvm::createGVNPass(), 301);
760 
762  optPM.add(CreateIntrinsicsOptPass());
764 
765  optPM.add(llvm::createMemCpyOptPass());
766  optPM.add(llvm::createSCCPPass());
767  optPM.add(llvm::createInstructionCombiningPass());
769  optPM.add(llvm::createJumpThreadingPass());
770  optPM.add(llvm::createCorrelatedValuePropagationPass());
771  optPM.add(llvm::createDeadStoreEliminationPass());
772  optPM.add(llvm::createAggressiveDCEPass());
773  optPM.add(llvm::createCFGSimplificationPass());
774  optPM.add(llvm::createInstructionCombiningPass());
776  optPM.add(CreatePeepholePass());
777  optPM.add(llvm::createFunctionInliningPass());
778  optPM.add(llvm::createAggressiveDCEPass());
779  optPM.add(llvm::createStripDeadPrototypesPass());
781  optPM.add(llvm::createGlobalDCEPass());
782  optPM.add(llvm::createConstantMergePass());
783 
784  // Should be the last
785  optPM.add(CreateFixBooleanSelectPass(), 400);
786 #ifdef ISPC_NVPTX_ENABLED
787  if (g->target->getISA() == Target::NVPTX) {
788  optPM.add(CreatePromoteLocalToPrivatePass());
789  optPM.add(llvm::createGlobalDCEPass());
790 
791  optPM.add(llvm::createTypeBasedAliasAnalysisPass());
792  optPM.add(llvm::createBasicAliasAnalysisPass());
793  optPM.add(llvm::createCFGSimplificationPass());
794  // Here clang has an experimental pass SROAPass instead of
795  // ScalarReplAggregatesPass. We should add it in the future.
796 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
797  optPM.add(llvm::createScalarReplAggregatesPass());
798 #else
799  optPM.add(llvm::createSROAPass());
800 #endif
801  optPM.add(llvm::createEarlyCSEPass());
802  optPM.add(llvm::createLowerExpectIntrinsicPass());
803  optPM.add(llvm::createTypeBasedAliasAnalysisPass());
804  optPM.add(llvm::createBasicAliasAnalysisPass());
805 
806  // Early optimizations to try to reduce the total amount of code to
807  // work with if we can
808  optPM.add(llvm::createReassociatePass());
809  optPM.add(llvm::createConstantPropagationPass());
810  optPM.add(llvm::createDeadInstEliminationPass());
811  optPM.add(llvm::createCFGSimplificationPass());
812 
813  optPM.add(llvm::createPromoteMemoryToRegisterPass());
814  optPM.add(llvm::createAggressiveDCEPass());
815 
816  optPM.add(llvm::createInstructionCombiningPass());
817  optPM.add(llvm::createDeadInstEliminationPass());
818 
819  // On to more serious optimizations
820  optPM.add(llvm::createInstructionCombiningPass());
821  optPM.add(llvm::createCFGSimplificationPass());
822  optPM.add(llvm::createPromoteMemoryToRegisterPass());
823  optPM.add(llvm::createGlobalOptimizerPass());
824  optPM.add(llvm::createReassociatePass());
825  optPM.add(llvm::createIPConstantPropagationPass());
826 
827  optPM.add(llvm::createDeadArgEliminationPass());
828  optPM.add(llvm::createInstructionCombiningPass());
829  optPM.add(llvm::createCFGSimplificationPass());
830  optPM.add(llvm::createPruneEHPass());
831  optPM.add(llvm::createFunctionAttrsPass());
832  optPM.add(llvm::createFunctionInliningPass());
833  optPM.add(llvm::createConstantPropagationPass());
834  optPM.add(llvm::createDeadInstEliminationPass());
835  optPM.add(llvm::createCFGSimplificationPass());
836 
837  optPM.add(llvm::createArgumentPromotionPass());
838 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_3
839  // Starting from 3.4 this functionality was moved to
840  // InstructionCombiningPass. See r184459 for details.
841  optPM.add(llvm::createSimplifyLibCallsPass());
842 #endif
843  optPM.add(llvm::createAggressiveDCEPass());
844  optPM.add(llvm::createInstructionCombiningPass());
845  optPM.add(llvm::createJumpThreadingPass());
846  optPM.add(llvm::createCFGSimplificationPass());
847  optPM.add(llvm::createInstructionCombiningPass());
848  optPM.add(llvm::createTailCallEliminationPass());
849 
850  optPM.add(llvm::createInstructionCombiningPass());
851 
852  optPM.add(llvm::createFunctionInliningPass());
853  optPM.add(llvm::createConstantPropagationPass());
854 
855  optPM.add(llvm::createInstructionCombiningPass());
856 
857  optPM.add(llvm::createIPSCCPPass());
858  optPM.add(llvm::createDeadArgEliminationPass());
859  optPM.add(llvm::createAggressiveDCEPass());
860  optPM.add(llvm::createInstructionCombiningPass());
861  optPM.add(llvm::createCFGSimplificationPass());
862 
863  optPM.add(llvm::createFunctionInliningPass());
864  optPM.add(llvm::createArgumentPromotionPass());
865  optPM.add(llvm::createInstructionCombiningPass());
866  optPM.add(llvm::createCFGSimplificationPass());
867  optPM.add(llvm::createReassociatePass());
868  optPM.add(llvm::createLoopRotatePass());
869  optPM.add(llvm::createLICMPass());
870 // optPM.add(llvm::createLoopUnswitchPass(false));
871 #if 1
872  optPM.add(llvm::createInstructionCombiningPass());
873  optPM.add(llvm::createIndVarSimplifyPass());
874  optPM.add(llvm::createLoopIdiomPass());
875  optPM.add(llvm::createLoopDeletionPass());
876  optPM.add(llvm::createLoopUnrollPass());
877  optPM.add(llvm::createGVNPass());
878  optPM.add(llvm::createMemCpyOptPass());
879  optPM.add(llvm::createSCCPPass());
880  optPM.add(llvm::createInstructionCombiningPass());
881  optPM.add(llvm::createJumpThreadingPass());
882  optPM.add(llvm::createCorrelatedValuePropagationPass());
883  optPM.add(llvm::createDeadStoreEliminationPass());
884  optPM.add(llvm::createAggressiveDCEPass());
885  optPM.add(llvm::createCFGSimplificationPass());
886  optPM.add(llvm::createInstructionCombiningPass());
887  optPM.add(llvm::createFunctionInliningPass());
888  optPM.add(llvm::createAggressiveDCEPass());
889  optPM.add(llvm::createStripDeadPrototypesPass());
890  optPM.add(llvm::createGlobalDCEPass());
891  optPM.add(llvm::createConstantMergePass());
892 #endif
893  }
894 #endif /* ISPC_NVPTX_ENABLED */
895  }
896 
897  // Finish up by making sure we didn't mess anything up in the IR along
898  // the way.
899  optPM.add(llvm::createVerifierPass(), LAST_OPT_NUMBER);
900  optPM.run(*module);
901 
902 #ifndef ISPC_NO_DUMPS
903  if (g->debugPrint) {
904  printf("\n*****\nFINAL OUTPUT\n*****\n");
905  module->dump();
906  }
907 #endif
908 }
909 
910 ///////////////////////////////////////////////////////////////////////////
911 // IntrinsicsOpt
912 
913 /** This is a relatively simple optimization pass that does a few small
914  optimizations that LLVM's x86 optimizer doesn't currently handle.
915  (Specifically, MOVMSK of a constant can be replaced with the
916  corresponding constant value, BLENDVPS and AVX masked load/store with
917  either an 'all on' or 'all off' masks can be replaced with simpler
918  operations.
919 
920  @todo The better thing to do would be to submit a patch to LLVM to get
921  these; they're presumably pretty simple patterns to match.
922 */
923 class IntrinsicsOpt : public llvm::BasicBlockPass {
924  public:
925  IntrinsicsOpt() : BasicBlockPass(ID){};
926 
927 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
928  const char *getPassName() const { return "Intrinsics Cleanup Optimization"; }
929 #else // LLVM 4.0+
930  llvm::StringRef getPassName() const { return "Intrinsics Cleanup Optimization"; }
931 #endif
932  bool runOnBasicBlock(llvm::BasicBlock &BB);
933 
934  static char ID;
935 
936  private:
937  struct MaskInstruction {
938  MaskInstruction(llvm::Function *f) { function = f; }
939  llvm::Function *function;
940  };
941  std::vector<MaskInstruction> maskInstructions;
942 
943  /** Structure that records everything we need to know about a blend
944  instruction for this optimization pass.
945  */
946  struct BlendInstruction {
947  BlendInstruction(llvm::Function *f, uint64_t ao, int o0, int o1, int of)
948  : function(f), allOnMask(ao), op0(o0), op1(o1), opFactor(of) {}
949  /** Function pointer for the blend instruction */
950  llvm::Function *function;
951  /** Mask value for an "all on" mask for this instruction */
952  uint64_t allOnMask;
953  /** The operand number in the llvm CallInst corresponds to the
954  first operand to blend with. */
955  int op0;
956  /** The operand number in the CallInst corresponding to the second
957  operand to blend with. */
958  int op1;
959  /** The operand in the call inst where the blending factor is
960  found. */
961  int opFactor;
962  };
963  std::vector<BlendInstruction> blendInstructions;
964 
965  bool matchesMaskInstruction(llvm::Function *function);
966  BlendInstruction *matchingBlendInstruction(llvm::Function *function);
967 };
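// As a concrete example of what runOnBasicBlock() below does: a __movmsk (or
// SSE/AVX movmsk intrinsic) call whose vector operand is the compile-time
// constant mask < 0xffffffff, 0, 0, 0xffffffff > folds directly to the scalar
// constant 9 (0b1001); a BLENDVPS whose blend factor is known all-on or
// all-off collapses to one of its two value operands; and an AVX masked load
// or store with an all-on mask is rewritten as an ordinary load or store.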
968 
969 char IntrinsicsOpt::ID = 0;
970 
971 /** Given an llvm::Value, return true if we can determine that it's an
972  undefined value. This only makes a weak attempt at chasing this down,
973  only detecting flat-out undef values, and bitcasts of undef values.
974 
975  @todo Is it worth working harder to find more of these? It starts to
976  get tricky, since having an undef operand doesn't necessarily mean that
977  the result will be undefined. (And for that matter, is there an LLVM
978  call that will do this for us?)
979  */
980 static bool lIsUndef(llvm::Value *value) {
981  if (llvm::isa<llvm::UndefValue>(value))
982  return true;
983 
984  llvm::BitCastInst *bci = llvm::dyn_cast<llvm::BitCastInst>(value);
985  if (bci)
986  return lIsUndef(bci->getOperand(0));
987 
988  return false;
989 }
990 
991 bool IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
992  DEBUG_START_PASS("IntrinsicsOpt");
993 
994  // We can't initialize mask/blend function vector during pass initialization,
995  // as they may be optimized out by the time the pass is invoked.
996 
997  // All of the mask instructions we may encounter. Note that even if
998  // compiling for AVX, we may still encounter the regular 4-wide SSE
999  // MOVMSK instruction.
1000  if (llvm::Function *ssei8Movmsk =
1001  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_sse2_pmovmskb_128))) {
1002  maskInstructions.push_back(ssei8Movmsk);
1003  }
1004  if (llvm::Function *sseFloatMovmsk =
1005  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_sse_movmsk_ps))) {
1006  maskInstructions.push_back(sseFloatMovmsk);
1007  }
1008  if (llvm::Function *__movmsk = m->module->getFunction("__movmsk")) {
1009  maskInstructions.push_back(__movmsk);
1010  }
1011  if (llvm::Function *avxFloatMovmsk =
1012  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_movmsk_ps_256))) {
1013  maskInstructions.push_back(avxFloatMovmsk);
1014  }
1015 
1016  // And all of the blend instructions
1017  blendInstructions.push_back(BlendInstruction(
1018  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_sse41_blendvps)), 0xf, 0, 1, 2));
1019  blendInstructions.push_back(BlendInstruction(
1020  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_blendv_ps_256)), 0xff, 0, 1, 2));
1021 
1022  llvm::Function *avxMaskedLoad32 =
1023  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskload_ps_256));
1024  llvm::Function *avxMaskedLoad64 =
1025  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskload_pd_256));
1026  llvm::Function *avxMaskedStore32 =
1027  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskstore_ps_256));
1028  llvm::Function *avxMaskedStore64 =
1029  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskstore_pd_256));
1030 
1031  bool modifiedAny = false;
1032 restart:
1033  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
1034  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
1035  if (callInst == NULL || callInst->getCalledFunction() == NULL)
1036  continue;
1037 
1038  BlendInstruction *blend = matchingBlendInstruction(callInst->getCalledFunction());
1039  if (blend != NULL) {
1040  llvm::Value *v[2] = {callInst->getArgOperand(blend->op0), callInst->getArgOperand(blend->op1)};
1041  llvm::Value *factor = callInst->getArgOperand(blend->opFactor);
1042 
1043  // If the values are the same, then no need to blend..
1044  if (v[0] == v[1]) {
1045  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, v[0]);
1046  modifiedAny = true;
1047  goto restart;
1048  }
1049 
1050  // If one of the two is undefined, we're allowed to replace
1051  // with the value of the other. (In other words, the only
1052  // valid case is that the blend factor ends up having a value
1053  // that only selects from the defined one of the two operands,
1054  // otherwise the result is undefined and any value is fine,
1055  // ergo the defined one is an acceptable result.)
1056  if (lIsUndef(v[0])) {
1057  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, v[1]);
1058  modifiedAny = true;
1059  goto restart;
1060  }
1061  if (lIsUndef(v[1])) {
1062  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, v[0]);
1063  modifiedAny = true;
1064  goto restart;
1065  }
1066 
1067  uint64_t mask;
1068  if (lGetMask(factor, &mask) == true) {
1069  llvm::Value *value = NULL;
1070  if (mask == 0)
1071  // Mask all off -> replace with the first blend value
1072  value = v[0];
1073  else if (mask == blend->allOnMask)
1074  // Mask all on -> replace with the second blend value
1075  value = v[1];
1076 
1077  if (value != NULL) {
1078  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, value);
1079  modifiedAny = true;
1080  goto restart;
1081  }
1082  }
1083  } else if (matchesMaskInstruction(callInst->getCalledFunction())) {
1084  llvm::Value *factor = callInst->getArgOperand(0);
1085  uint64_t mask;
1086  if (lGetMask(factor, &mask) == true) {
1087  // If the vector-valued mask has a known value, replace it
1088  // with the corresponding integer mask from its elements
1089  // high bits.
1090  llvm::Value *value = (callInst->getType() == LLVMTypes::Int32Type) ? LLVMInt32(mask) : LLVMInt64(mask);
1091  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, value);
1092  modifiedAny = true;
1093  goto restart;
1094  }
1095  } else if (callInst->getCalledFunction() == avxMaskedLoad32 ||
1096  callInst->getCalledFunction() == avxMaskedLoad64) {
1097  llvm::Value *factor = callInst->getArgOperand(1);
1098  uint64_t mask;
1099  if (lGetMask(factor, &mask) == true) {
1100  if (mask == 0) {
1101  // nothing being loaded, replace with undef value
1102  llvm::Type *returnType = callInst->getType();
1103  Assert(llvm::isa<llvm::VectorType>(returnType));
1104  llvm::Value *undefValue = llvm::UndefValue::get(returnType);
1105  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, undefValue);
1106  modifiedAny = true;
1107  goto restart;
1108  } else if (mask == 0xff) {
1109  // all lanes active; replace with a regular load
1110  llvm::Type *returnType = callInst->getType();
1111  Assert(llvm::isa<llvm::VectorType>(returnType));
1112  // cast the i8 * to the appropriate type
1113  const char *name = LLVMGetName(callInst->getArgOperand(0), "_cast");
1114  llvm::Value *castPtr = new llvm::BitCastInst(callInst->getArgOperand(0),
1115  llvm::PointerType::get(returnType, 0), name, callInst);
1116  lCopyMetadata(castPtr, callInst);
1117  int align;
1118  if (g->opt.forceAlignedMemory)
1119  align = g->target->getNativeVectorAlignment();
1120  else
1121  align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
1122  name = LLVMGetName(callInst->getArgOperand(0), "_load");
1123  llvm::Instruction *loadInst =
1124  new llvm::LoadInst(castPtr, name, false /* not volatile */, align, (llvm::Instruction *)NULL);
1125  lCopyMetadata(loadInst, callInst);
1126  llvm::ReplaceInstWithInst(callInst, loadInst);
1127  modifiedAny = true;
1128  goto restart;
1129  }
1130  }
1131  } else if (callInst->getCalledFunction() == avxMaskedStore32 ||
1132  callInst->getCalledFunction() == avxMaskedStore64) {
1133  // NOTE: mask is the 2nd parameter, not the 3rd one!!
1134  llvm::Value *factor = callInst->getArgOperand(1);
1135  uint64_t mask;
1136  if (lGetMask(factor, &mask) == true) {
1137  if (mask == 0) {
1138  // nothing actually being stored, just remove the inst
1139  callInst->eraseFromParent();
1140  modifiedAny = true;
1141  goto restart;
1142  } else if (mask == 0xff) {
1143  // all lanes storing, so replace with a regular store
1144  llvm::Value *rvalue = callInst->getArgOperand(2);
1145  llvm::Type *storeType = rvalue->getType();
1146  const char *name = LLVMGetName(callInst->getArgOperand(0), "_ptrcast");
1147  llvm::Value *castPtr = new llvm::BitCastInst(callInst->getArgOperand(0),
1148  llvm::PointerType::get(storeType, 0), name, callInst);
1149  lCopyMetadata(castPtr, callInst);
1150 
1151  llvm::StoreInst *storeInst = new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
1152  int align;
1153  if (g->opt.forceAlignedMemory)
1154  align = g->target->getNativeVectorAlignment();
1155  else
1156  align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
1157  storeInst->setAlignment(align);
1158  lCopyMetadata(storeInst, callInst);
1159  llvm::ReplaceInstWithInst(callInst, storeInst);
1160 
1161  modifiedAny = true;
1162  goto restart;
1163  }
1164  }
1165  }
1166  }
1167 
1168  DEBUG_END_PASS("IntrinsicsOpt");
1169 
1170  return modifiedAny;
1171 }
1172 
1173 bool IntrinsicsOpt::matchesMaskInstruction(llvm::Function *function) {
1174  for (unsigned int i = 0; i < maskInstructions.size(); ++i) {
1175  if (maskInstructions[i].function != NULL && function == maskInstructions[i].function) {
1176  return true;
1177  }
1178  }
1179  return false;
1180 }
1181 
1182 IntrinsicsOpt::BlendInstruction *IntrinsicsOpt::matchingBlendInstruction(llvm::Function *function) {
1183  for (unsigned int i = 0; i < blendInstructions.size(); ++i) {
1184  if (blendInstructions[i].function != NULL && function == blendInstructions[i].function) {
1185  return &blendInstructions[i];
1186  }
1187  }
1188  return NULL;
1189 }
1190 
1191 static llvm::Pass *CreateIntrinsicsOptPass() { return new IntrinsicsOpt; }
1192 
1193 ///////////////////////////////////////////////////////////////////////////
1194 
1195 /** This simple optimization pass looks for a vector select instruction
1196  with an all-on or all-off constant mask, simplifying it to the
1197  appropriate operand if so.
1198 
1199  @todo The better thing to do would be to submit a patch to LLVM to get
1200  these; they're presumably pretty simple patterns to match.
1201 */
1202 class InstructionSimplifyPass : public llvm::BasicBlockPass {
1203  public:
1204  InstructionSimplifyPass() : BasicBlockPass(ID) {}
1205 
1206 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
1207  const char *getPassName() const { return "Vector Select Optimization"; }
1208 #else // LLVM 4.0+
1209  llvm::StringRef getPassName() const { return "Vector Select Optimization"; }
1210 #endif
1211  bool runOnBasicBlock(llvm::BasicBlock &BB);
1212 
1213  static char ID;
1214 
1215  private:
1216  static bool simplifySelect(llvm::SelectInst *selectInst, llvm::BasicBlock::iterator iter);
1217  static llvm::Value *simplifyBoolVec(llvm::Value *value);
1218  static bool simplifyCall(llvm::CallInst *callInst, llvm::BasicBlock::iterator iter);
1219 };
1220 
1221 char InstructionSimplifyPass::ID = 0;
1222 
1223 llvm::Value *InstructionSimplifyPass::simplifyBoolVec(llvm::Value *value) {
1224  llvm::TruncInst *trunc = llvm::dyn_cast<llvm::TruncInst>(value);
1225  if (trunc != NULL) {
1226  // Convert trunc({sext,zext}(i1 vector)) -> (i1 vector)
1227  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(trunc->getOperand(0));
1228  if (sext && sext->getOperand(0)->getType() == LLVMTypes::Int1VectorType)
1229  return sext->getOperand(0);
1230 
1231  llvm::ZExtInst *zext = llvm::dyn_cast<llvm::ZExtInst>(trunc->getOperand(0));
1232  if (zext && zext->getOperand(0)->getType() == LLVMTypes::Int1VectorType)
1233  return zext->getOperand(0);
1234  }
1235  /*
1236  // This optimization has discernable benefit on the perf
1237  // suite on latest LLVM versions.
1238  // On 3.4+ (maybe even older), it can result in illegal
1239  // operations, so it's being disabled.
1240  llvm::ICmpInst *icmp = llvm::dyn_cast<llvm::ICmpInst>(value);
1241  if (icmp != NULL) {
1242  // icmp(ne, {sext,zext}(foo), zeroinitializer) -> foo
1243  if (icmp->getSignedPredicate() == llvm::CmpInst::ICMP_NE) {
1244  llvm::Value *op1 = icmp->getOperand(1);
1245  if (llvm::isa<llvm::ConstantAggregateZero>(op1)) {
1246  llvm::Value *op0 = icmp->getOperand(0);
1247  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(op0);
1248  if (sext)
1249  return sext->getOperand(0);
1250  llvm::ZExtInst *zext = llvm::dyn_cast<llvm::ZExtInst>(op0);
1251  if (zext)
1252  return zext->getOperand(0);
1253  }
1254  }
1255 
1256  }
1257  */
1258  return NULL;
1259 }
1260 
1261 bool InstructionSimplifyPass::simplifySelect(llvm::SelectInst *selectInst, llvm::BasicBlock::iterator iter) {
1262  if (selectInst->getType()->isVectorTy() == false)
1263  return false;
1264 
1265  llvm::Value *factor = selectInst->getOperand(0);
1266 
1267  // Simplify all-on or all-off mask values
1268  MaskStatus maskStatus = lGetMaskStatus(factor);
1269  llvm::Value *value = NULL;
1270  if (maskStatus == ALL_ON)
1271  // Mask all on -> replace with the first select value
1272  value = selectInst->getOperand(1);
1273  else if (maskStatus == ALL_OFF)
1274  // Mask all off -> replace with the second select value
1275  value = selectInst->getOperand(2);
1276  if (value != NULL) {
1277  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, value);
1278  return true;
1279  }
1280 
1281  // Sometimes earlier LLVM optimization passes generate unnecessarily
1282  // complex expressions for the selection vector, which in turn confuses
1283  // the code generators and leads to sub-optimal code (particularly for
1284  // 8 and 16-bit masks). We'll try to simplify them out here so that
1285  // the code generator patterns match..
1286  if ((factor = simplifyBoolVec(factor)) != NULL) {
1287  llvm::Instruction *newSelect = llvm::SelectInst::Create(factor, selectInst->getOperand(1),
1288  selectInst->getOperand(2), selectInst->getName());
1289  llvm::ReplaceInstWithInst(selectInst, newSelect);
1290  return true;
1291  }
1292 
1293  return false;
1294 }
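// For example, a select whose mask is a constant all-on vector is replaced
// outright with its first value operand, and an all-off mask yields the
// second; a mask of the form trunc(sext(<N x i1> %m)) is rewritten to select
// on %m directly, so the backend sees the simple i1 vector it has patterns
// for.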
1295 
1296 bool InstructionSimplifyPass::simplifyCall(llvm::CallInst *callInst, llvm::BasicBlock::iterator iter) {
1297  llvm::Function *calledFunc = callInst->getCalledFunction();
1298 
1299  // Turn a __movmsk call with a compile-time constant vector into the
1300  // equivalent scalar value.
1301  if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk"))
1302  return false;
1303 
1304  uint64_t mask;
1305  if (lGetMask(callInst->getArgOperand(0), &mask) == true) {
1306  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, LLVMInt64(mask));
1307  return true;
1308  }
1309  return false;
1310 }
1311 
1312 bool InstructionSimplifyPass::runOnBasicBlock(llvm::BasicBlock &bb) {
1313  DEBUG_START_PASS("InstructionSimplify");
1314 
1315  bool modifiedAny = false;
1316 
1317 restart:
1318  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
1319  llvm::SelectInst *selectInst = llvm::dyn_cast<llvm::SelectInst>(&*iter);
1320  if (selectInst && simplifySelect(selectInst, iter)) {
1321  modifiedAny = true;
1322  goto restart;
1323  }
1324  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
1325  if (callInst && simplifyCall(callInst, iter)) {
1326  modifiedAny = true;
1327  goto restart;
1328  }
1329  }
1330 
1331  DEBUG_END_PASS("InstructionSimplify");
1332 
1333  return modifiedAny;
1334 }
1335 
1336 static llvm::Pass *CreateInstructionSimplifyPass() { return new InstructionSimplifyPass; }
1337 
1338 ///////////////////////////////////////////////////////////////////////////
1339 // ImproveMemoryOpsPass
1340 
1341 /** When the front-end emits gathers and scatters, it generates an array of
1342  vector-width pointers to represent the set of addresses to read from or
1343  write to. This optimization detects cases when the base pointer is a
1344  uniform pointer or when the indexing is into an array that can be
1345  converted into scatters/gathers from a single base pointer and an array
1346  of offsets.
1347 
1348  See for example the comments discussing the __pseudo_gather functions
1349  in builtins.cpp for more information about this.
1350  */
1351 class ImproveMemoryOpsPass : public llvm::BasicBlockPass {
1352  public:
1353  static char ID;
1354  ImproveMemoryOpsPass() : BasicBlockPass(ID) {}
1355 
1356 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
1357  const char *getPassName() const { return "Improve Memory Ops"; }
1358 #else // LLVM 4.0+
1359  llvm::StringRef getPassName() const { return "Improve Memory Ops"; }
1360 #endif
1361  bool runOnBasicBlock(llvm::BasicBlock &BB);
1362 };
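// Concretely: for a varying access such as "a[programIndex + i]" off a
// uniform base pointer, the front end emits a full vector of per-lane
// pointers.  The routines below try to rediscover the structure "common base
// pointer plus a vector of integer offsets" so that the access can be turned
// into a gather/scatter from a single base pointer and an offset vector (or
// into simpler memory operations when the mask and offsets allow it).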
1363 
1364 char ImproveMemoryOpsPass::ID = 0;
1365 
1366 /** Check to make sure that this value is actually a pointer in the end.
1367  We need to make sure that given an expression like vec(offset) +
1368  ptr2int(ptr), lGetBasePointer() doesn't return vec(offset) for the base
1369  pointer such that we then treat ptr2int(ptr) as an offset. This ends
1370  up being important so that we don't generate LLVM GEP instructions like
1371  "gep inttoptr 8, i64 %ptr", which in turn can lead to incorrect code
1372  since LLVM's pointer aliasing analysis assumes that operands after the
1373  first one to a GEP aren't pointers.
1374  */
1375 static llvm::Value *lCheckForActualPointer(llvm::Value *v) {
1376  if (v == NULL) {
1377  return NULL;
1378  } else if (llvm::isa<llvm::PointerType>(v->getType())) {
1379  return v;
1380  } else if (llvm::isa<llvm::PtrToIntInst>(v)) {
1381  return v;
1382  }
1383 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_7
1384  // This one is tricky, as it's a heuristic tuned for LLVM 3.7+, which may
1385  // optimize a load of double* followed by ptr2int into a straight load of i64.
1386  // This heuristic should be good enough to catch all the cases we should
1387  // detect and nothing else.
1388  else if (llvm::isa<llvm::LoadInst>(v)) {
1389  return v;
1390  }
1391 #endif
1392  else if (llvm::CastInst *ci = llvm::dyn_cast<llvm::CastInst>(v)) {
1393  llvm::Value *t = lCheckForActualPointer(ci->getOperand(0));
1394  if (t == NULL) {
1395  return NULL;
1396  } else {
1397  return v;
1398  }
1399  } else {
1400  llvm::ConstantExpr *uce = llvm::dyn_cast<llvm::ConstantExpr>(v);
1401  if (uce != NULL && uce->getOpcode() == llvm::Instruction::PtrToInt)
1402  return v;
1403  return NULL;
1404  }
1405 }
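// The case being guarded against: given "vec(offset) + ptr2int(ptr)", naively
// treating the first addend as the base would put the real pointer into a GEP
// index position (e.g. "gep inttoptr 8, i64 %ptr" as mentioned above), which
// LLVM's pointer aliasing analysis assumes is not a pointer, potentially
// leading to incorrect code.  Returning NULL here makes the caller fall back
// to the generic handling instead.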
1406 
1407 /** Given a llvm::Value representing a varying pointer, this function
1408  checks to see if all of the elements of the vector have the same value
1409  (i.e. there's a common base pointer). If a broadcast has already been detected,
1410  it checks that the first element of the vector is not undef. If one of the conditions
1411  is true, it returns the common pointer value; otherwise it returns NULL.
1412  */
1413 static llvm::Value *lGetBasePointer(llvm::Value *v, llvm::Instruction *insertBefore, bool broadcastDetected) {
1414  if (llvm::isa<llvm::InsertElementInst>(v) || llvm::isa<llvm::ShuffleVectorInst>(v)) {
1415  // If we have already detected broadcast we want to look for
1416  // the vector with the first not-undef element
1417  llvm::Value *element = LLVMFlattenInsertChain(v, g->target->getVectorWidth(), true, false, broadcastDetected);
1418  // TODO: it's probably ok to allow undefined elements and return
1419  // the base pointer if all of the other elements have the same
1420  // value.
1421  if (element != NULL) {
1422  // all elements are the same and none are NULL
1423  return lCheckForActualPointer(element);
1424  } else {
1425  return NULL;
1426  }
1427  }
1428 
1429  // This case comes up with global/static arrays
1430  if (llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(v)) {
1431  return lCheckForActualPointer(cv->getSplatValue());
1432  } else if (llvm::ConstantDataVector *cdv = llvm::dyn_cast<llvm::ConstantDataVector>(v)) {
1433  return lCheckForActualPointer(cdv->getSplatValue());
1434  }
1435  // Operations on pointers that have been cast to integers of a different bit size are a
1436  // little bit tricky, but they are sometimes useful, so we handle this case here.
1437  else if (llvm::CastInst *ci = llvm::dyn_cast<llvm::CastInst>(v)) {
1438  llvm::Value *t = lGetBasePointer(ci->getOperand(0), insertBefore, broadcastDetected);
1439  if (t == NULL) {
1440  return NULL;
1441  } else {
1442  return llvm::CastInst::Create(ci->getOpcode(), t, ci->getType()->getScalarType(), LLVMGetName(t, "_cast"),
1443  insertBefore);
1444  }
1445  }
1446 
1447  return NULL;
1448 }
1449 
1450 /** Given the two operands to a constant add expression, see if we have the
1451  form "base pointer + offset", where op0 is the base pointer and op1 is
1452  the offset; if so return the base and the offset. */
1453 static llvm::Constant *lGetConstantAddExprBaseOffset(llvm::Constant *op0, llvm::Constant *op1, llvm::Constant **delta) {
1454  llvm::ConstantExpr *op = llvm::dyn_cast<llvm::ConstantExpr>(op0);
1455  if (op == NULL || op->getOpcode() != llvm::Instruction::PtrToInt)
1456  // the first operand isn't a pointer
1457  return NULL;
1458 
1459  llvm::ConstantInt *opDelta = llvm::dyn_cast<llvm::ConstantInt>(op1);
1460  if (opDelta == NULL)
1461  // the second operand isn't an integer constant
1462  return NULL;
1463 
1464  *delta = opDelta;
1465  return op0;
1466 }
1467 
1468 static llvm::Value *lExtractFromInserts(llvm::Value *v, unsigned int index) {
1469  llvm::InsertValueInst *iv = llvm::dyn_cast<llvm::InsertValueInst>(v);
1470  if (iv == NULL)
1471  return NULL;
1472 
1473  Assert(iv->hasIndices() && iv->getNumIndices() == 1);
1474  if (iv->getIndices()[0] == index)
1475  return iv->getInsertedValueOperand();
1476  else
1477  return lExtractFromInserts(iv->getAggregateOperand(), index);
1478 }
1479 
1480 /** Given a varying pointer in ptrs, this function checks to see if it can
1481  be determined to be indexing from a common uniform base pointer. If
1482  so, the function returns the base pointer llvm::Value and initializes
1483  *offsets with an int vector of the per-lane offsets.
1484  */
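// Sketch of the most common case handled below (values are illustrative):
//     %base = ptrtoint float* %array to i64
//     %ptrs = add <8 x i64> (splat of %base), %perLaneOffsets
// Here the function would return the pointer behind %base and set *offsets
// to %perLaneOffsets. The ConstantVector and ExtractValue cases further down
// handle the equivalent patterns that arise for global/static arrays and for
// pointers passed through aggregates.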
1485 static llvm::Value *lGetBasePtrAndOffsets(llvm::Value *ptrs, llvm::Value **offsets, llvm::Instruction *insertBefore) {
1486 #ifndef ISPC_NO_DUMPS
1487  if (g->debugPrint) {
1488  fprintf(stderr, "lGetBasePtrAndOffsets\n");
1489  LLVMDumpValue(ptrs);
1490  }
1491 #endif
1492 
1493  bool broadcastDetected = false;
1494  // Looking for %gep_offset = shufflevector <8 x i64> %0, <8 x i64> undef, <8 x i32> zeroinitializer
1495  llvm::ShuffleVectorInst *shuffle = llvm::dyn_cast<llvm::ShuffleVectorInst>(ptrs);
1496  if (shuffle != NULL) {
1497  llvm::Value *indices = shuffle->getOperand(2);
1498  llvm::Value *vec = shuffle->getOperand(1);
1499  if (lIsUndef(vec) && llvm::isa<llvm::ConstantAggregateZero>(indices)) {
1500  broadcastDetected = true;
1501  }
1502  }
1503  llvm::Value *base = lGetBasePointer(ptrs, insertBefore, broadcastDetected);
1504  if (base != NULL) {
1505  // We have a straight up varying pointer with no indexing that's
1506  // actually all the same value.
1507  if (g->target->is32Bit())
1508  *offsets = LLVMInt32Vector(0);
1509  else
1510  *offsets = LLVMInt64Vector((int64_t)0);
1511 
1512  if (broadcastDetected) {
1513  llvm::Value *op = shuffle->getOperand(0);
1514  llvm::BinaryOperator *bop_var = llvm::dyn_cast<llvm::BinaryOperator>(op);
1515  if (bop_var != NULL && bop_var->getOpcode() == llvm::Instruction::Add) {
1516  // Here we expect a ConstantVector such as
1517  // <i64 4, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>
1518  llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(bop_var->getOperand(1));
1519  if (cv != NULL) {
1520  llvm::Value *zeroMask =
1521  llvm::ConstantVector::getSplat(cv->getType()->getVectorNumElements(),
1522  llvm::Constant::getNullValue(llvm::Type::getInt32Ty(*g->ctx)));
1523  // Create offset
1524  llvm::Value *shuffle_offset = new llvm::ShuffleVectorInst(cv, llvm::UndefValue::get(cv->getType()),
1525  zeroMask, "shuffle", bop_var);
1526  *offsets = llvm::BinaryOperator::Create(llvm::Instruction::Add, *offsets, shuffle_offset,
1527  "new_offsets", insertBefore);
1528  }
1529  }
1530  }
1531  return base;
1532  }
1533 
1534  llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(ptrs);
1535  if (bop != NULL && bop->getOpcode() == llvm::Instruction::Add) {
1536  // If we have a common pointer plus something, then we're also
1537  // good.
1538  if ((base = lGetBasePtrAndOffsets(bop->getOperand(0), offsets, insertBefore)) != NULL) {
1539  *offsets = llvm::BinaryOperator::Create(llvm::Instruction::Add, *offsets, bop->getOperand(1), "new_offsets",
1540  insertBefore);
1541  return base;
1542  } else if ((base = lGetBasePtrAndOffsets(bop->getOperand(1), offsets, insertBefore)) != NULL) {
1543  *offsets = llvm::BinaryOperator::Create(llvm::Instruction::Add, *offsets, bop->getOperand(0), "new_offsets",
1544  insertBefore);
1545  return base;
1546  }
1547  }
1548  llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(ptrs);
1549  if (cv != NULL) {
1550  // Indexing into global arrays can lead to this form, with
1551  // ConstantVectors.
1552  llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
1553  for (int i = 0; i < (int)cv->getNumOperands(); ++i) {
1554  llvm::Constant *c = llvm::dyn_cast<llvm::Constant>(cv->getOperand(i));
1555  if (c == NULL)
1556  return NULL;
1557  elements.push_back(c);
1558  }
1559 
1560  llvm::Constant *delta[ISPC_MAX_NVEC];
1561  for (unsigned int i = 0; i < elements.size(); ++i) {
1562  // For each element, try to decompose it into either a straight
1563  // up base pointer, or a base pointer plus an integer value.
1564  llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(elements[i]);
1565  if (ce == NULL)
1566  return NULL;
1567 
1568  delta[i] = NULL;
1569  llvm::Value *elementBase = NULL; // base pointer for this element
1570  if (ce->getOpcode() == llvm::Instruction::PtrToInt) {
1571  // If the element is just a ptr to int instruction, treat
1572  // it as having an offset of zero
1573  elementBase = ce;
1574  delta[i] = g->target->is32Bit() ? LLVMInt32(0) : LLVMInt64(0);
1575  } else if (ce->getOpcode() == llvm::Instruction::Add) {
1576  // Try both orderings of the operands to see if we can get
1577  // a pointer+offset out of them.
1578  elementBase = lGetConstantAddExprBaseOffset(ce->getOperand(0), ce->getOperand(1), &delta[i]);
1579  if (elementBase == NULL)
1580  elementBase = lGetConstantAddExprBaseOffset(ce->getOperand(1), ce->getOperand(0), &delta[i]);
1581  }
1582 
1583  // We weren't able to find a base pointer in the above. (We
1584  // don't expect this to happen; if it does, it may be necessary
1585  // to handle more cases in the decomposition above.)
1586  if (elementBase == NULL)
1587  return NULL;
1588 
1589  Assert(delta[i] != NULL);
1590  if (base == NULL)
1591  // The first time we've found a base pointer
1592  base = elementBase;
1593  else if (base != elementBase)
1594  // Different program instances have different base
1595  // pointers, so no luck.
1596  return NULL;
1597  }
1598 
1599  Assert(base != NULL);
1600  llvm::ArrayRef<llvm::Constant *> deltas(&delta[0], &delta[elements.size()]);
1601  *offsets = llvm::ConstantVector::get(deltas);
1602  return base;
1603  }
1604 
1605  llvm::ExtractValueInst *ev = llvm::dyn_cast<llvm::ExtractValueInst>(ptrs);
1606  if (ev != NULL) {
1607  Assert(ev->getNumIndices() == 1);
1608  int index = ev->getIndices()[0];
1609  ptrs = lExtractFromInserts(ev->getAggregateOperand(), index);
1610  if (ptrs != NULL)
1611  return lGetBasePtrAndOffsets(ptrs, offsets, insertBefore);
1612  }
1613 
1614  return NULL;
1615 }
1616 
1617 /** Given a vector expression in vec, separate it into a compile-time
1618  constant component and a variable component, returning the two parts in
1619  *constOffset and *variableOffset. (It should be the case that the sum
1620  of these two is exactly equal to the original vector.)
1621 
1622  This routine only handles some (important) patterns; in some cases it
1623  will fail and return components that are actually compile-time
1624  constants in *variableOffset.
1625 
1626  Finally, if there aren't any constant (or, respectively, variable)
1627  components, the corresponding return value may be set to NULL.
1628  */
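// For instance (illustrative), given
//     %vec = add <8 x i32> %varying, <8 x i32> <i32 16, i32 16, ...>
// this sets *constOffset to <16, 16, ...> and *variableOffset to %varying.
// For a product such as (add %v, <4, ...>) * <2, ...>, the constant part is
// <8, ...> and the variable part is (mul %v, <2, ...>), following the
// (c0 + v0) * (c1 + v1) expansion in the code below.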
1629 static void lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset, llvm::Value **variableOffset,
1630  llvm::Instruction *insertBefore) {
1631  if (llvm::isa<llvm::ConstantVector>(vec) || llvm::isa<llvm::ConstantDataVector>(vec) ||
1632  llvm::isa<llvm::ConstantAggregateZero>(vec)) {
1633  *constOffset = vec;
1634  *variableOffset = NULL;
1635  return;
1636  }
1637 
1638  llvm::CastInst *cast = llvm::dyn_cast<llvm::CastInst>(vec);
1639  if (cast != NULL) {
1640  // Check the cast target.
1641  llvm::Value *co, *vo;
1642  lExtractConstantOffset(cast->getOperand(0), &co, &vo, insertBefore);
1643 
1644  // make new cast instructions for the two parts
1645  if (co == NULL)
1646  *constOffset = NULL;
1647  else
1648  *constOffset =
1649  llvm::CastInst::Create(cast->getOpcode(), co, cast->getType(), LLVMGetName(co, "_cast"), insertBefore);
1650  if (vo == NULL)
1651  *variableOffset = NULL;
1652  else
1653  *variableOffset =
1654  llvm::CastInst::Create(cast->getOpcode(), vo, cast->getType(), LLVMGetName(vo, "_cast"), insertBefore);
1655  return;
1656  }
1657 
1658  llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(vec);
1659  if (bop != NULL) {
1660  llvm::Value *op0 = bop->getOperand(0);
1661  llvm::Value *op1 = bop->getOperand(1);
1662  llvm::Value *c0, *v0, *c1, *v1;
1663 
1664  if (bop->getOpcode() == llvm::Instruction::Add) {
1665  lExtractConstantOffset(op0, &c0, &v0, insertBefore);
1666  lExtractConstantOffset(op1, &c1, &v1, insertBefore);
1667 
1668  if (c0 == NULL || llvm::isa<llvm::ConstantAggregateZero>(c0))
1669  *constOffset = c1;
1670  else if (c1 == NULL || llvm::isa<llvm::ConstantAggregateZero>(c1))
1671  *constOffset = c0;
1672  else
1673  *constOffset = llvm::BinaryOperator::Create(llvm::Instruction::Add, c0, c1, LLVMGetName("add", c0, c1),
1674  insertBefore);
1675 
1676  if (v0 == NULL || llvm::isa<llvm::ConstantAggregateZero>(v0))
1677  *variableOffset = v1;
1678  else if (v1 == NULL || llvm::isa<llvm::ConstantAggregateZero>(v1))
1679  *variableOffset = v0;
1680  else
1681  *variableOffset = llvm::BinaryOperator::Create(llvm::Instruction::Add, v0, v1,
1682  LLVMGetName("add", v0, v1), insertBefore);
1683  return;
1684  } else if (bop->getOpcode() == llvm::Instruction::Shl) {
1685  lExtractConstantOffset(op0, &c0, &v0, insertBefore);
1686  lExtractConstantOffset(op1, &c1, &v1, insertBefore);
1687 
1688  // Treating the shift as a multiply by a power of two, we have:
1689  // (c0 + v0) * (2^(c1 + v1)) = c0 * 2^c1 * 2^v1 + v0 * 2^c1 * 2^v1
1690  // We can optimize only if v1 == NULL.
1691  if ((v1 != NULL) || (c0 == NULL) || (c1 == NULL)) {
1692  *constOffset = NULL;
1693  *variableOffset = vec;
1694  } else if (v0 == NULL) {
1695  *constOffset = vec;
1696  *variableOffset = NULL;
1697  } else {
1698  *constOffset = llvm::BinaryOperator::Create(llvm::Instruction::Shl, c0, c1, LLVMGetName("shl", c0, c1),
1699  insertBefore);
1700  *variableOffset = llvm::BinaryOperator::Create(llvm::Instruction::Shl, v0, c1,
1701  LLVMGetName("shl", v0, c1), insertBefore);
1702  }
1703  return;
1704  } else if (bop->getOpcode() == llvm::Instruction::Mul) {
1705  lExtractConstantOffset(op0, &c0, &v0, insertBefore);
1706  lExtractConstantOffset(op1, &c1, &v1, insertBefore);
1707 
1708  // Given the product of constant and variable terms, we have:
1709  // (c0 + v0) * (c1 + v1) == (c0 c1) + (v0 c1 + c0 v1 + v0 v1)
1710  // Note that the first term is a constant and the last three are
1711  // variable.
1712  if (c0 != NULL && c1 != NULL)
1713  *constOffset = llvm::BinaryOperator::Create(llvm::Instruction::Mul, c0, c1, LLVMGetName("mul", c0, c1),
1714  insertBefore);
1715  else
1716  *constOffset = NULL;
1717 
1718  llvm::Value *va = NULL, *vb = NULL, *vc = NULL;
1719  if (v0 != NULL && c1 != NULL)
1720  va = llvm::BinaryOperator::Create(llvm::Instruction::Mul, v0, c1, LLVMGetName("mul", v0, c1),
1721  insertBefore);
1722  if (c0 != NULL && v1 != NULL)
1723  vb = llvm::BinaryOperator::Create(llvm::Instruction::Mul, c0, v1, LLVMGetName("mul", c0, v1),
1724  insertBefore);
1725  if (v0 != NULL && v1 != NULL)
1726  vc = llvm::BinaryOperator::Create(llvm::Instruction::Mul, v0, v1, LLVMGetName("mul", v0, v1),
1727  insertBefore);
1728 
1729  llvm::Value *vab = NULL;
1730  if (va != NULL && vb != NULL)
1731  vab = llvm::BinaryOperator::Create(llvm::Instruction::Add, va, vb, LLVMGetName("add", va, vb),
1732  insertBefore);
1733  else if (va != NULL)
1734  vab = va;
1735  else
1736  vab = vb;
1737 
1738  if (vab != NULL && vc != NULL)
1739  *variableOffset = llvm::BinaryOperator::Create(llvm::Instruction::Add, vab, vc,
1740  LLVMGetName("add", vab, vc), insertBefore);
1741  else if (vab != NULL)
1742  *variableOffset = vab;
1743  else
1744  *variableOffset = vc;
1745 
1746  return;
1747  }
1748  }
1749 
1750  // Nothing matched, just return what we have as a variable component
1751  *constOffset = NULL;
1752  *variableOffset = vec;
1753 }
1754 
1755 /* Returns true if the given value is a constant vector of integers with
1756  the same value in all of the elements. (Returns the splatted value in
1757  *splat, if so). */
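// E.g. (illustrative): for v = <4 x i32> <i32 8, i32 8, i32 8, i32 8> this
// returns true with *splat == 8; for a non-constant vector, or a constant
// vector whose elements differ, it returns false.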
1758 static bool lIsIntegerSplat(llvm::Value *v, int *splat) {
1759  llvm::ConstantDataVector *cvec = llvm::dyn_cast<llvm::ConstantDataVector>(v);
1760  if (cvec == NULL)
1761  return false;
1762 
1763  llvm::Constant *splatConst = cvec->getSplatValue();
1764  if (splatConst == NULL)
1765  return false;
1766 
1767  llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(splatConst);
1768  if (ci == NULL)
1769  return false;
1770 
1771  int64_t splatVal = ci->getSExtValue();
1772  *splat = (int)splatVal;
1773  return true;
1774 }
1775 
1776 static llvm::Value *lExtract248Scale(llvm::Value *splatOperand, int splatValue, llvm::Value *otherOperand,
1777  llvm::Value **result) {
1778  if (splatValue == 2 || splatValue == 4 || splatValue == 8) {
1779  *result = otherOperand;
1780  return LLVMInt32(splatValue);
1781  }
1782  // Even if we don't have a common scale by exactly 2, 4, or 8, we'll
1783  // see if we can pull out that much of the scale anyway; this may in
1784  // turn allow other optimizations later.
1785  for (int scale = 8; scale >= 2; scale /= 2) {
1786  llvm::Instruction *insertBefore = llvm::dyn_cast<llvm::Instruction>(*result);
1787  Assert(insertBefore != NULL);
1788 
1789  if ((splatValue % scale) == 0) {
1790  // *result = otherOperand * splatOperand / scale;
1791  llvm::Value *splatScaleVec = (splatOperand->getType() == LLVMTypes::Int32VectorType)
1792  ? LLVMInt32Vector(scale)
1793  : LLVMInt64Vector(scale);
1794  llvm::Value *splatDiv =
1795  llvm::BinaryOperator::Create(llvm::Instruction::SDiv, splatOperand, splatScaleVec, "div", insertBefore);
1796  *result = llvm::BinaryOperator::Create(llvm::Instruction::Mul, splatDiv, otherOperand, "mul", insertBefore);
1797  return LLVMInt32(scale);
1798  }
1799  }
1800  return LLVMInt32(1);
1801 }
1802 
1803 /** Given a vector of integer offsets to a base pointer being used for a
1804  gather or a scatter, see if its root operation is a multiply of some
1805  vector by a vector of all 2s, 4s, or 8s. If not, an i32 value of 1 is returned.
1806 
1807  If it is, return an i32 value of 2, 4, or 8 from the function and modify
1808  *vec so that it points to the operand that is being multiplied by
1809  2/4/8.
1810 
1811  We go through all this trouble so that we can pass the i32 scale factor
1812  to the {gather,scatter}_base_offsets function as a separate scale
1813  factor for the offsets. This in turn lets the LLVM x86 code
1814  generator match it and apply x86's free scaling by 2x, 4x, or
1815  8x to one of the two registers being added together in an addressing
1816  calculation.
1817  */
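// Illustrative examples (hypothetical values):
//     *vec = mul <8 x i32> %index, <8 x i32> <4, 4, ...>
//         -> returns i32 4 and rewrites *vec to %index
//     *vec = mul <8 x i32> %index, <8 x i32> <12, 12, ...>
//         -> returns i32 4 and rewrites *vec to (sdiv <12, ...>, <4, ...>) * %index,
//            i.e. a residual scale of 3 is left in the offsets
//     *vec with no such multiply at its root
//         -> returns i32 1 and leaves *vec unchanged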
1818 static llvm::Value *lExtractOffsetVector248Scale(llvm::Value **vec) {
1819  llvm::CastInst *cast = llvm::dyn_cast<llvm::CastInst>(*vec);
1820  if (cast != NULL) {
1821  llvm::Value *castOp = cast->getOperand(0);
1822  // Check the cast target.
1823  llvm::Value *scale = lExtractOffsetVector248Scale(&castOp);
1824  if (scale == NULL)
1825  return NULL;
1826 
1827  // make a new cast instruction so that we end up with the right
1828  // type
1829  *vec = llvm::CastInst::Create(cast->getOpcode(), castOp, cast->getType(), "offset_cast", cast);
1830  return scale;
1831  }
1832 
1833  // If we don't have a binary operator, then just give up
1834  llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(*vec);
1835  if (bop == NULL)
1836  return LLVMInt32(1);
1837 
1838  llvm::Value *op0 = bop->getOperand(0), *op1 = bop->getOperand(1);
1839  if (bop->getOpcode() == llvm::Instruction::Add) {
1840  if (llvm::isa<llvm::ConstantAggregateZero>(op0)) {
1841  *vec = op1;
1842  return lExtractOffsetVector248Scale(vec);
1843  } else if (llvm::isa<llvm::ConstantAggregateZero>(op1)) {
1844  *vec = op0;
1845  return lExtractOffsetVector248Scale(vec);
1846  } else {
1847  llvm::Value *s0 = lExtractOffsetVector248Scale(&op0);
1848  llvm::Value *s1 = lExtractOffsetVector248Scale(&op1);
1849  if (s0 == s1) {
1850  *vec = llvm::BinaryOperator::Create(llvm::Instruction::Add, op0, op1, "new_add", bop);
1851  return s0;
1852  } else
1853  return LLVMInt32(1);
1854  }
1855  } else if (bop->getOpcode() == llvm::Instruction::Mul) {
1856  // Check each operand for being one of the scale factors we care about.
1857  int splat;
1858  if (lIsIntegerSplat(op0, &splat))
1859  return lExtract248Scale(op0, splat, op1, vec);
1860  else if (lIsIntegerSplat(op1, &splat))
1861  return lExtract248Scale(op1, splat, op0, vec);
1862  else
1863  return LLVMInt32(1);
1864  } else
1865  return LLVMInt32(1);
1866 }
1867 
1868 #if 0
1869 static llvm::Value *
1870 lExtractUniforms(llvm::Value **vec, llvm::Instruction *insertBefore) {
1871  fprintf(stderr, " lextract: ");
1872  (*vec)->dump();
1873  fprintf(stderr, "\n");
1874 
1875  if (llvm::isa<llvm::ConstantVector>(*vec) ||
1876  llvm::isa<llvm::ConstantDataVector>(*vec) ||
1877  llvm::isa<llvm::ConstantAggregateZero>(*vec))
1878  return NULL;
1879 
1880  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(*vec);
1881  if (sext != NULL) {
1882  llvm::Value *sextOp = sext->getOperand(0);
1883  // Check the sext target.
1884  llvm::Value *unif = lExtractUniforms(&sextOp, insertBefore);
1885  if (unif == NULL)
1886  return NULL;
1887 
1888  // make a new sext instruction so that we end up with the right
1889  // type
1890  *vec = new llvm::SExtInst(sextOp, sext->getType(), "offset_sext", sext);
1891  return unif;
1892  }
1893 
1894  if (LLVMVectorValuesAllEqual(*vec)) {
1895  // FIXME: we may want to redo all of the expression here, in scalar
1896  // form (if at all possible), for code quality...
1897  llvm::Value *unif =
1898  llvm::ExtractElementInst::Create(*vec, LLVMInt32(0),
1899  "first_uniform", insertBefore);
1900  *vec = NULL;
1901  return unif;
1902  }
1903 
1904  llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(*vec);
1905  if (bop == NULL)
1906  return NULL;
1907 
1908  llvm::Value *op0 = bop->getOperand(0), *op1 = bop->getOperand(1);
1909  if (bop->getOpcode() == llvm::Instruction::Add) {
1910  llvm::Value *s0 = lExtractUniforms(&op0, insertBefore);
1911  llvm::Value *s1 = lExtractUniforms(&op1, insertBefore);
1912  if (s0 == NULL && s1 == NULL)
1913  return NULL;
1914 
1915  if (op0 == NULL)
1916  *vec = op1;
1917  else if (op1 == NULL)
1918  *vec = op0;
1919  else
1920  *vec = llvm::BinaryOperator::Create(llvm::Instruction::Add,
1921  op0, op1, "new_add", insertBefore);
1922 
1923  if (s0 == NULL)
1924  return s1;
1925  else if (s1 == NULL)
1926  return s0;
1927  else
1928  return llvm::BinaryOperator::Create(llvm::Instruction::Add, s0, s1,
1929  "add_unif", insertBefore);
1930  }
1931 #if 0
1932  else if (bop->getOpcode() == llvm::Instruction::Mul) {
1933  // Check each operand for being one of the scale factors we care about.
1934  int splat;
1935  if (lIs248Splat(op0, &splat)) {
1936  *vec = op1;
1937  return LLVMInt32(splat);
1938  }
1939  else if (lIs248Splat(op1, &splat)) {
1940  *vec = op0;
1941  return LLVMInt32(splat);
1942  }
1943  else
1944  return LLVMInt32(1);
1945  }
1946 #endif
1947  else
1948  return NULL;
1949 }
1950 
1951 
1952 static void
1953 lExtractUniformsFromOffset(llvm::Value **basePtr, llvm::Value **offsetVector,
1954  llvm::Value *offsetScale,
1955  llvm::Instruction *insertBefore) {
1956 #if 1
1957  (*basePtr)->dump();
1958  printf("\n");
1959  (*offsetVector)->dump();
1960  printf("\n");
1961  offsetScale->dump();
1962  printf("-----\n");
1963 #endif
1964 
1965  llvm::Value *uniformDelta = lExtractUniforms(offsetVector, insertBefore);
1966  if (uniformDelta == NULL)
1967  return;
1968 
1969  *basePtr = lGEPInst(*basePtr, arrayRef, "new_base", insertBefore);
1970 
1971  // this should only happen if we have only uniforms, but that in turn
1972  // shouldn't be a gather/scatter!
1973  Assert(*offsetVector != NULL);
1974 }
1975 #endif
1976 
1977 static bool lVectorIs32BitInts(llvm::Value *v) {
1978  int nElts;
1979  int64_t elts[ISPC_MAX_NVEC];
1980  if (!LLVMExtractVectorInts(v, elts, &nElts))
1981  return false;
1982 
1983  for (int i = 0; i < nElts; ++i)
1984  if ((int32_t)elts[i] != elts[i])
1985  return false;
1986 
1987  return true;
1988 }
1989 
1990 /** Check to see if the two offset vectors can safely be represented with
1991  32-bit values. If so, return true and update the pointed-to
1992  llvm::Value *s to be the 32-bit equivalents. */
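// For example (illustrative): if the variable offset is
//     sext <8 x i32> %off32 to <8 x i64>
// it is replaced with %off32 directly, and a 64-bit constant offset vector
// whose elements all fit in 32 bits is narrowed with a trunc. If the
// variable part can't be shown to be 32-bit safe, false is returned and the
// 64-bit gather/scatter variant is kept.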
1993 static bool lOffsets32BitSafe(llvm::Value **variableOffsetPtr, llvm::Value **constOffsetPtr,
1994  llvm::Instruction *insertBefore) {
1995  llvm::Value *variableOffset = *variableOffsetPtr;
1996  llvm::Value *constOffset = *constOffsetPtr;
1997 
1998  if (variableOffset->getType() != LLVMTypes::Int32VectorType) {
1999  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(variableOffset);
2000  if (sext != NULL && sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType)
2001  // sext of a 32-bit vector -> the 32-bit vector is good
2002  variableOffset = sext->getOperand(0);
2003  else if (lVectorIs32BitInts(variableOffset))
2004  // The only constant vector we should have here is a vector of
2005  // all zeros (i.e. a ConstantAggregateZero), but just in case,
2006  // do the more general check with lVectorIs32BitInts().
2007  variableOffset = new llvm::TruncInst(variableOffset, LLVMTypes::Int32VectorType,
2008  LLVMGetName(variableOffset, "_trunc"), insertBefore);
2009  else
2010  return false;
2011  }
2012 
2013  if (constOffset->getType() != LLVMTypes::Int32VectorType) {
2014  if (lVectorIs32BitInts(constOffset)) {
2015  // Truncate them so we have a 32-bit vector type for them.
2016  constOffset = new llvm::TruncInst(constOffset, LLVMTypes::Int32VectorType,
2017  LLVMGetName(constOffset, "_trunc"), insertBefore);
2018  } else {
2019  // FIXME: otherwise we just assume that all constant offsets
2020  // can actually always fit into 32 bits... (This could be
2021  // wrong, but only in pretty esoteric cases.) We
2022  // make this assumption for now since we sometimes generate
2023  // constants that need constant folding before we really have a
2024  // constant vector out of them, and
2025  // llvm::ConstantFoldInstruction() doesn't seem to be doing
2026  // enough for us in some cases if we call it from here.
2027  constOffset = new llvm::TruncInst(constOffset, LLVMTypes::Int32VectorType,
2028  LLVMGetName(constOffset, "_trunc"), insertBefore);
2029  }
2030  }
2031 
2032  *variableOffsetPtr = variableOffset;
2033  *constOffsetPtr = constOffset;
2034  return true;
2035 }
2036 
2037 /** Check to see if the offset value is composed of a string of Adds,
2038  SExts, and Constant Vectors that are 32-bit safe. Recursively
2039  explores the operands of Add instructions (as they might themselves
2040  be adds that eventually terminate in constant vectors or a SExt.)
2041  */
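// E.g. (illustrative): add(sext <8 x i32> %a to <8 x i64>, <8 x i64> <1, 1, ...>)
// is considered 32-bit safe, since both operands are; an arbitrary 64-bit
// computation such as a mul of two i64 vectors is not.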
2042 
2043 static bool lIs32BitSafeHelper(llvm::Value *v) {
2044  // handle Adds, SExts, Constant Vectors
2045  if (llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(v)) {
2046  if (bop->getOpcode() == llvm::Instruction::Add) {
2047  return lIs32BitSafeHelper(bop->getOperand(0)) && lIs32BitSafeHelper(bop->getOperand(1));
2048  }
2049  return false;
2050  } else if (llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(v)) {
2051  return sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType;
2052  } else
2053  return lVectorIs32BitInts(v);
2054 }
2055 
2056 /** Check to see if the single offset vector can safely be represented with
2057  32-bit values. If so, return true and update the pointed-to
2058  llvm::Value * to be the 32-bit equivalent. */
2059 static bool lOffsets32BitSafe(llvm::Value **offsetPtr, llvm::Instruction *insertBefore) {
2060  llvm::Value *offset = *offsetPtr;
2061 
2062  if (offset->getType() == LLVMTypes::Int32VectorType)
2063  return true;
2064 
2065  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(offset);
2066  if (sext != NULL && sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType) {
2067  // sext of a 32-bit vector -> the 32-bit vector is good
2068  *offsetPtr = sext->getOperand(0);
2069  return true;
2070  } else if (lIs32BitSafeHelper(offset)) {
2071  // The only constant vector we should have here is a vector of
2072  // all zeros (i.e. a ConstantAggregateZero), but just in case,
2073  // do the more general check with lVectorIs32BitInts().
2074 
2075  // Alternatively, offset could be a sequence of adds terminating
2076  // in safe constant vectors or a SExt.
2077  *offsetPtr =
2078  new llvm::TruncInst(offset, LLVMTypes::Int32VectorType, LLVMGetName(offset, "_trunc"), insertBefore);
2079  return true;
2080  } else
2081  return false;
2082 }
2083 
2084 static bool lGSToGSBaseOffsets(llvm::CallInst *callInst) {
2085  struct GSInfo {
2086  GSInfo(const char *pgFuncName, const char *pgboFuncName, const char *pgbo32FuncName, bool ig, bool ip)
2087  : isGather(ig), isPrefetch(ip) {
2088  func = m->module->getFunction(pgFuncName);
2089  baseOffsetsFunc = m->module->getFunction(pgboFuncName);
2090  baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
2091  }
2092  llvm::Function *func;
2093  llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
2094  const bool isGather;
2095  const bool isPrefetch;
2096  };
2097 
2098  GSInfo gsFuncs[] = {
2099  GSInfo(
2100  "__pseudo_gather32_i8",
2101  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8",
2102  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8",
2103  true, false),
2104  GSInfo("__pseudo_gather32_i16",
2105  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16"
2106  : "__pseudo_gather_factored_base_offsets32_i16",
2107  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16"
2108  : "__pseudo_gather_factored_base_offsets32_i16",
2109  true, false),
2110  GSInfo("__pseudo_gather32_i32",
2111  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32"
2112  : "__pseudo_gather_factored_base_offsets32_i32",
2113  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32"
2114  : "__pseudo_gather_factored_base_offsets32_i32",
2115  true, false),
2116  GSInfo("__pseudo_gather32_float",
2117  g->target->hasGather() ? "__pseudo_gather_base_offsets32_float"
2118  : "__pseudo_gather_factored_base_offsets32_float",
2119  g->target->hasGather() ? "__pseudo_gather_base_offsets32_float"
2120  : "__pseudo_gather_factored_base_offsets32_float",
2121  true, false),
2122  GSInfo("__pseudo_gather32_i64",
2123  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64"
2124  : "__pseudo_gather_factored_base_offsets32_i64",
2125  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64"
2126  : "__pseudo_gather_factored_base_offsets32_i64",
2127  true, false),
2128  GSInfo("__pseudo_gather32_double",
2129  g->target->hasGather() ? "__pseudo_gather_base_offsets32_double"
2130  : "__pseudo_gather_factored_base_offsets32_double",
2131  g->target->hasGather() ? "__pseudo_gather_base_offsets32_double"
2132  : "__pseudo_gather_factored_base_offsets32_double",
2133  true, false),
2134 
2135  GSInfo("__pseudo_scatter32_i8",
2136  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8"
2137  : "__pseudo_scatter_factored_base_offsets32_i8",
2138  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8"
2139  : "__pseudo_scatter_factored_base_offsets32_i8",
2140  false, false),
2141  GSInfo("__pseudo_scatter32_i16",
2142  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16"
2143  : "__pseudo_scatter_factored_base_offsets32_i16",
2144  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16"
2145  : "__pseudo_scatter_factored_base_offsets32_i16",
2146  false, false),
2147  GSInfo("__pseudo_scatter32_i32",
2148  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32"
2149  : "__pseudo_scatter_factored_base_offsets32_i32",
2150  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32"
2151  : "__pseudo_scatter_factored_base_offsets32_i32",
2152  false, false),
2153  GSInfo("__pseudo_scatter32_float",
2154  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float"
2155  : "__pseudo_scatter_factored_base_offsets32_float",
2156  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float"
2157  : "__pseudo_scatter_factored_base_offsets32_float",
2158  false, false),
2159  GSInfo("__pseudo_scatter32_i64",
2160  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64"
2161  : "__pseudo_scatter_factored_base_offsets32_i64",
2162  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64"
2163  : "__pseudo_scatter_factored_base_offsets32_i64",
2164  false, false),
2165  GSInfo("__pseudo_scatter32_double",
2166  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double"
2167  : "__pseudo_scatter_factored_base_offsets32_double",
2168  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double"
2169  : "__pseudo_scatter_factored_base_offsets32_double",
2170  false, false),
2171 
2172  GSInfo(
2173  "__pseudo_gather64_i8",
2174  g->target->hasGather() ? "__pseudo_gather_base_offsets64_i8" : "__pseudo_gather_factored_base_offsets64_i8",
2175  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8",
2176  true, false),
2177  GSInfo("__pseudo_gather64_i16",
2178  g->target->hasGather() ? "__pseudo_gather_base_offsets64_i16"
2179  : "__pseudo_gather_factored_base_offsets64_i16",
2180  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16"
2181  : "__pseudo_gather_factored_base_offsets32_i16",
2182  true, false),
2183  GSInfo("__pseudo_gather64_i32",
2184  g->target->hasGather() ? "__pseudo_gather_base_offsets64_i32"
2185  : "__pseudo_gather_factored_base_offsets64_i32",
2186  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32"
2187  : "__pseudo_gather_factored_base_offsets32_i32",
2188  true, false),
2189  GSInfo("__pseudo_gather64_float",
2190  g->target->hasGather() ? "__pseudo_gather_base_offsets64_float"
2191  : "__pseudo_gather_factored_base_offsets64_float",
2192  g->target->hasGather() ? "__pseudo_gather_base_offsets32_float"
2193  : "__pseudo_gather_factored_base_offsets32_float",
2194  true, false),
2195  GSInfo("__pseudo_gather64_i64",
2196  g->target->hasGather() ? "__pseudo_gather_base_offsets64_i64"
2197  : "__pseudo_gather_factored_base_offsets64_i64",
2198  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64"
2199  : "__pseudo_gather_factored_base_offsets32_i64",
2200  true, false),
2201  GSInfo("__pseudo_gather64_double",
2202  g->target->hasGather() ? "__pseudo_gather_base_offsets64_double"
2203  : "__pseudo_gather_factored_base_offsets64_double",
2204  g->target->hasGather() ? "__pseudo_gather_base_offsets32_double"
2205  : "__pseudo_gather_factored_base_offsets32_double",
2206  true, false),
2207 
2208  GSInfo("__pseudo_scatter64_i8",
2209  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i8"
2210  : "__pseudo_scatter_factored_base_offsets64_i8",
2211  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8"
2212  : "__pseudo_scatter_factored_base_offsets32_i8",
2213  false, false),
2214  GSInfo("__pseudo_scatter64_i16",
2215  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i16"
2216  : "__pseudo_scatter_factored_base_offsets64_i16",
2217  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16"
2218  : "__pseudo_scatter_factored_base_offsets32_i16",
2219  false, false),
2220  GSInfo("__pseudo_scatter64_i32",
2221  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i32"
2222  : "__pseudo_scatter_factored_base_offsets64_i32",
2223  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32"
2224  : "__pseudo_scatter_factored_base_offsets32_i32",
2225  false, false),
2226  GSInfo("__pseudo_scatter64_float",
2227  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_float"
2228  : "__pseudo_scatter_factored_base_offsets64_float",
2229  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float"
2230  : "__pseudo_scatter_factored_base_offsets32_float",
2231  false, false),
2232  GSInfo("__pseudo_scatter64_i64",
2233  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i64"
2234  : "__pseudo_scatter_factored_base_offsets64_i64",
2235  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64"
2236  : "__pseudo_scatter_factored_base_offsets32_i64",
2237  false, false),
2238  GSInfo("__pseudo_scatter64_double",
2239  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_double"
2240  : "__pseudo_scatter_factored_base_offsets64_double",
2241  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double"
2242  : "__pseudo_scatter_factored_base_offsets32_double",
2243  false, false),
2244  GSInfo("__pseudo_prefetch_read_varying_1",
2245  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : "__prefetch_read_varying_1",
2246  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : "__prefetch_read_varying_1",
2247  false, true),
2248 
2249  GSInfo("__pseudo_prefetch_read_varying_2",
2250  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : "__prefetch_read_varying_2",
2251  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : "__prefetch_read_varying_2",
2252  false, true),
2253 
2254  GSInfo("__pseudo_prefetch_read_varying_3",
2255  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : "__prefetch_read_varying_3",
2256  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : "__prefetch_read_varying_3",
2257  false, true),
2258 
2259  GSInfo("__pseudo_prefetch_read_varying_nt",
2260  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" : "__prefetch_read_varying_nt",
2261  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" : "__prefetch_read_varying_nt",
2262  false, true),
2263  };
2264 
2265  int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
2266  for (int i = 0; i < numGSFuncs; ++i)
2267  Assert(gsFuncs[i].func != NULL && gsFuncs[i].baseOffsetsFunc != NULL && gsFuncs[i].baseOffsets32Func != NULL);
2268 
2269  GSInfo *info = NULL;
2270  for (int i = 0; i < numGSFuncs; ++i)
2271  if (gsFuncs[i].func != NULL && callInst->getCalledFunction() == gsFuncs[i].func) {
2272  info = &gsFuncs[i];
2273  break;
2274  }
2275  if (info == NULL)
2276  return false;
2277 
2278  // Try to transform the array of pointers to a single base pointer
2279  // and an array of int32 offsets. (All the hard work is done by
2280  // lGetBasePtrAndOffsets).
2281  llvm::Value *ptrs = callInst->getArgOperand(0);
2282  llvm::Value *offsetVector = NULL;
2283  llvm::Value *basePtr = lGetBasePtrAndOffsets(ptrs, &offsetVector, callInst);
2284 
2285  if (basePtr == NULL || offsetVector == NULL ||
2286  (info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch() == false))
2287  // It's actually a fully general gather/scatter with a varying
2288  // set of base pointers, so leave it as is and continue onward
2289  // to the next instruction...
2290  return false;
2291 
2292  // Cast the base pointer to a void *, since that's what the
2293  // __pseudo_*_base_offsets_* functions want.
2294  basePtr = new llvm::IntToPtrInst(basePtr, LLVMTypes::VoidPointerType, LLVMGetName(basePtr, "_2void"), callInst);
2295  lCopyMetadata(basePtr, callInst);
2296 
2297  llvm::Function *gatherScatterFunc = info->baseOffsetsFunc;
2298 
2299  if ((info->isGather == true && g->target->hasGather()) ||
2300  (info->isGather == false && info->isPrefetch == false && g->target->hasScatter()) ||
2301  (info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch())) {
2302 
2303  // See if the offsets are scaled by 2, 4, or 8. If so,
2304  // extract that scale factor and rewrite the offsets to remove
2305  // it.
2306  llvm::Value *offsetScale = lExtractOffsetVector248Scale(&offsetVector);
2307 
2308  // If we're doing 32-bit addressing on a 64-bit target, here we
2309  // will see if we can call one of the 32-bit variants of the pseudo
2310  // gather/scatter functions.
2311  if (g->opt.force32BitAddressing && lOffsets32BitSafe(&offsetVector, callInst)) {
2312  gatherScatterFunc = info->baseOffsets32Func;
2313  }
2314 
2315  if (info->isGather || info->isPrefetch) {
2316  llvm::Value *mask = callInst->getArgOperand(1);
2317 
2318  // Generate a new function call to the next pseudo gather
2319  // base+offsets instruction. Note that we're passing a NULL
2320  // llvm::Instruction to llvm::CallInst::Create; this means that
2321  // the instruction isn't inserted into a basic block and that
2322  // way we can then call ReplaceInstWithInst().
2323  llvm::Instruction *newCall = lCallInst(gatherScatterFunc, basePtr, offsetScale, offsetVector, mask,
2324  callInst->getName().str().c_str(), NULL);
2325  lCopyMetadata(newCall, callInst);
2326  llvm::ReplaceInstWithInst(callInst, newCall);
2327  } else {
2328  llvm::Value *storeValue = callInst->getArgOperand(1);
2329  llvm::Value *mask = callInst->getArgOperand(2);
2330 
2331  // Generate a new function call to the next pseudo scatter
2332  // base+offsets instruction. See above for why passing NULL
2333  // for the Instruction * is intended.
2334  llvm::Instruction *newCall =
2335  lCallInst(gatherScatterFunc, basePtr, offsetScale, offsetVector, storeValue, mask, "", NULL);
2336  lCopyMetadata(newCall, callInst);
2337  llvm::ReplaceInstWithInst(callInst, newCall);
2338  }
2339  } else {
2340  // Try to decompose the offset vector into a compile time constant
2341  // component and a varying component. The constant component is
2342  // passed as a separate parameter to the gather/scatter functions,
2343  // which in turn allows their implementations to end up emitting
2344  // x86 instructions with constant offsets encoded in them.
2345  llvm::Value *constOffset = NULL;
2346  llvm::Value *variableOffset = NULL;
2347  lExtractConstantOffset(offsetVector, &constOffset, &variableOffset, callInst);
2348  if (constOffset == NULL)
2349  constOffset = LLVMIntAsType(0, offsetVector->getType());
2350  if (variableOffset == NULL)
2351  variableOffset = LLVMIntAsType(0, offsetVector->getType());
2352 
2353  // See if the varying component is scaled by 2, 4, or 8. If so,
2354  // extract that scale factor and rewrite variableOffset to remove
2355  // it. (This is also pulled out so that we can match the 2/4/8 scales
2356  // offered by x86 addressing operators.)
2357  llvm::Value *offsetScale = lExtractOffsetVector248Scale(&variableOffset);
2358 
2359  // If we're doing 32-bit addressing on a 64-bit target, here we
2360  // will see if we can call one of the 32-bit variants of the pseudo
2361  // gather/scatter functions.
2362  if (g->opt.force32BitAddressing && lOffsets32BitSafe(&variableOffset, &constOffset, callInst)) {
2363  gatherScatterFunc = info->baseOffsets32Func;
2364  }
2365 
2366  if (info->isGather || info->isPrefetch) {
2367  llvm::Value *mask = callInst->getArgOperand(1);
2368 
2369  // Generate a new function call to the next pseudo gather
2370  // base+offsets instruction. Note that we're passing a NULL
2371  // llvm::Instruction to llvm::CallInst::Create; this means that
2372  // the instruction isn't inserted into a basic block and that
2373  // way we can then call ReplaceInstWithInst().
2374  llvm::Instruction *newCall = lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, constOffset,
2375  mask, callInst->getName().str().c_str(), NULL);
2376  lCopyMetadata(newCall, callInst);
2377  llvm::ReplaceInstWithInst(callInst, newCall);
2378  } else {
2379  llvm::Value *storeValue = callInst->getArgOperand(1);
2380  llvm::Value *mask = callInst->getArgOperand(2);
2381 
2382  // Generate a new function call to the next pseudo scatter
2383  // base+offsets instruction. See above for why passing NULL
2384  // for the Instruction * is intended.
2385  llvm::Instruction *newCall = lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, constOffset,
2386  storeValue, mask, "", NULL);
2387  lCopyMetadata(newCall, callInst);
2388  llvm::ReplaceInstWithInst(callInst, newCall);
2389  }
2390  }
2391  return true;
2392 }
2393 
2394 /** Try to improve the decomposition between compile-time constant and
2395  compile-time unknown offsets in calls to the __pseudo_*_base_offsets*
2396  functions. After other optimizations have run, we will sometimes be
2397  able to pull more terms out of the unknown part and add them into the
2398  compile-time-known part.
2399  */
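// Sketch (illustrative values): for a call such as
//     @__pseudo_gather_factored_base_offsets32_float(i8* %b,
//         <8 x i32> (add %v, <4, 4, ...>), i32 4, <8 x i32> %constOff, <mask>)
// the <4, 4, ...> term is pulled out of the variable offsets, multiplied by
// the scale (here 4), and added into the constant offsets, leaving %v as the
// new variable offset operand.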
2400 static bool lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) {
2401  struct GSBOInfo {
2402  GSBOInfo(const char *pgboFuncName, const char *pgbo32FuncName, bool ig, bool ip)
2403  : isGather(ig), isPrefetch(ip) {
2404  baseOffsetsFunc = m->module->getFunction(pgboFuncName);
2405  baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
2406  }
2407  llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
2408  const bool isGather;
2409  const bool isPrefetch;
2410  };
2411 
2412  GSBOInfo gsFuncs[] = {
2413  GSBOInfo(
2414  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8",
2415  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8",
2416  true, false),
2417  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16"
2418  : "__pseudo_gather_factored_base_offsets32_i16",
2419  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16"
2420  : "__pseudo_gather_factored_base_offsets32_i16",
2421  true, false),
2422  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32"
2423  : "__pseudo_gather_factored_base_offsets32_i32",
2424  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32"
2425  : "__pseudo_gather_factored_base_offsets32_i32",
2426  true, false),
2427  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_float"
2428  : "__pseudo_gather_factored_base_offsets32_float",
2429  g->target->hasGather() ? "__pseudo_gather_base_offsets32_float"
2430  : "__pseudo_gather_factored_base_offsets32_float",
2431  true, false),
2432  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64"
2433  : "__pseudo_gather_factored_base_offsets32_i64",
2434  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64"
2435  : "__pseudo_gather_factored_base_offsets32_i64",
2436  true, false),
2437  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_double"
2438  : "__pseudo_gather_factored_base_offsets32_double",
2439  g->target->hasGather() ? "__pseudo_gather_base_offsets32_double"
2440  : "__pseudo_gather_factored_base_offsets32_double",
2441  true, false),
2442 
2443  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8"
2444  : "__pseudo_scatter_factored_base_offsets32_i8",
2445  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8"
2446  : "__pseudo_scatter_factored_base_offsets32_i8",
2447  false, false),
2448  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16"
2449  : "__pseudo_scatter_factored_base_offsets32_i16",
2450  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16"
2451  : "__pseudo_scatter_factored_base_offsets32_i16",
2452  false, false),
2453  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32"
2454  : "__pseudo_scatter_factored_base_offsets32_i32",
2455  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32"
2456  : "__pseudo_scatter_factored_base_offsets32_i32",
2457  false, false),
2458  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float"
2459  : "__pseudo_scatter_factored_base_offsets32_float",
2460  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float"
2461  : "__pseudo_scatter_factored_base_offsets32_float",
2462  false, false),
2463  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64"
2464  : "__pseudo_scatter_factored_base_offsets32_i64",
2465  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64"
2466  : "__pseudo_scatter_factored_base_offsets32_i64",
2467  false, false),
2468  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double"
2469  : "__pseudo_scatter_factored_base_offsets32_double",
2470  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double"
2471  : "__pseudo_scatter_factored_base_offsets32_double",
2472  false, false),
2473 
2474  GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : "__prefetch_read_varying_1",
2475  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : "__prefetch_read_varying_1",
2476  false, true),
2477 
2478  GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : "__prefetch_read_varying_2",
2479  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : "__prefetch_read_varying_2",
2480  false, true),
2481 
2482  GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : "__prefetch_read_varying_3",
2483  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : "__prefetch_read_varying_3",
2484  false, true),
2485 
2486  GSBOInfo(
2487  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" : "__prefetch_read_varying_nt",
2488  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" : "__prefetch_read_varying_nt",
2489  false, true),
2490  };
2491 
2492  int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
2493  for (int i = 0; i < numGSFuncs; ++i)
2494  Assert(gsFuncs[i].baseOffsetsFunc != NULL && gsFuncs[i].baseOffsets32Func != NULL);
2495 
2496  llvm::Function *calledFunc = callInst->getCalledFunction();
2497  Assert(calledFunc != NULL);
2498 
2499  // Is one of the gather/scatter functions that decompose into
2500  // base+offsets being called?
2501  GSBOInfo *info = NULL;
2502  for (int i = 0; i < numGSFuncs; ++i)
2503  if (calledFunc == gsFuncs[i].baseOffsetsFunc || calledFunc == gsFuncs[i].baseOffsets32Func) {
2504  info = &gsFuncs[i];
2505  break;
2506  }
2507  if (info == NULL)
2508  return false;
2509 
2510  // Grab the old variable offset
2511  llvm::Value *origVariableOffset = callInst->getArgOperand(1);
2512 
2513  // If it's zero, we're done. Don't go and think that we're clever by
2514  // adding these zeros to the constant offsets.
2515  if (llvm::isa<llvm::ConstantAggregateZero>(origVariableOffset))
2516  return false;
2517 
2518  // Try to decompose the old variable offset
2519  llvm::Value *constOffset = NULL;
2520  llvm::Value *variableOffset = NULL;
2521  lExtractConstantOffset(origVariableOffset, &constOffset, &variableOffset, callInst);
2522 
2523  // No luck
2524  if (constOffset == NULL)
2525  return false;
2526 
2527  // Total luck: everything could be moved to the constant offset
2528  if (variableOffset == NULL)
2529  variableOffset = LLVMIntAsType(0, origVariableOffset->getType());
2530 
2531  // We need to scale the value we add to the constant offset by the
2532  // 2/4/8 scale for the variable offset, if present.
2533  llvm::ConstantInt *varScale = llvm::dyn_cast<llvm::ConstantInt>(callInst->getArgOperand(2));
2534  Assert(varScale != NULL);
2535 
2536  llvm::Value *scaleSmear;
2537  if (origVariableOffset->getType() == LLVMTypes::Int64VectorType)
2538  scaleSmear = LLVMInt64Vector((int64_t)varScale->getZExtValue());
2539  else
2540  scaleSmear = LLVMInt32Vector((int32_t)varScale->getZExtValue());
2541 
2542  constOffset =
2543  llvm::BinaryOperator::Create(llvm::Instruction::Mul, constOffset, scaleSmear, constOffset->getName(), callInst);
2544 
2545  // And add the additional offset to the original constant offset
2546  constOffset = llvm::BinaryOperator::Create(llvm::Instruction::Add, constOffset, callInst->getArgOperand(3),
2547  callInst->getArgOperand(3)->getName(), callInst);
2548 
2549  // Finally, update the values of the operands to the gather/scatter
2550  // function.
2551  callInst->setArgOperand(1, variableOffset);
2552  callInst->setArgOperand(3, constOffset);
2553 
2554  return true;
2555 }
2556 
2557 static llvm::Value *lComputeCommonPointer(llvm::Value *base, llvm::Value *offsets, llvm::Instruction *insertBefore) {
2558  llvm::Value *firstOffset = LLVMExtractFirstVectorElement(offsets);
2559  return lGEPInst(base, firstOffset, "ptr", insertBefore);
2560 }
2561 
2562 static llvm::Constant *lGetOffsetScaleVec(llvm::Value *offsetScale, llvm::Type *vecType) {
2563  llvm::ConstantInt *offsetScaleInt = llvm::dyn_cast<llvm::ConstantInt>(offsetScale);
2564  Assert(offsetScaleInt != NULL);
2565  uint64_t scaleValue = offsetScaleInt->getZExtValue();
2566 
2567  std::vector<llvm::Constant *> scales;
2568  for (int i = 0; i < g->target->getVectorWidth(); ++i) {
2569  if (vecType == LLVMTypes::Int64VectorType)
2570  scales.push_back(LLVMInt64(scaleValue));
2571  else {
2572  Assert(vecType == LLVMTypes::Int32VectorType);
2573  scales.push_back(LLVMInt32((int32_t)scaleValue));
2574  }
2575  }
2576  return llvm::ConstantVector::get(scales);
2577 }
2578 
2579 /** After earlier optimization passes have run, we are sometimes able to
2580  determine that gathers/scatters are actually accessing memory in a more
2581  regular fashion and then change the operation to something simpler and
2582  more efficient. For example, if all of the lanes in a gather are
2583  reading from the same location, we can instead do a scalar load and
2584  broadcast. This pass examines gathers and scatters and tries to
2585  simplify them if at all possible.
2586 
2587  @todo Currently, this only looks for all program instances going to the
2588  same location and all going to a linear sequence of locations in
2589  memory. There are a number of other cases that might make sense to
2590  look for, including things that could be handled with a vector load +
2591  shuffle or things that could be handled with hybrids of e.g. 2 4-wide
2592  vector loads with AVX, etc.
2593 */
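// Two illustrative situations this pass looks for (values hypothetical):
//  * every lane computes the same address, e.g. offsets = <0, 0, 0, ...>:
//    the gather becomes a scalar load of the common location followed by a
//    broadcast of the loaded value;
//  * the offsets form a linear sequence with the element size as the stride,
//    e.g. <0, 4, 8, ...> for a float gather: the gather becomes a (masked)
//    vector load starting at the common base plus the first lane's offset.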
2594 static bool lGSToLoadStore(llvm::CallInst *callInst) {
2595  struct GatherImpInfo {
2596  GatherImpInfo(const char *pName, const char *lmName, llvm::Type *st, int a)
2597  : align(a), isFactored(!g->target->hasGather()) {
2598  pseudoFunc = m->module->getFunction(pName);
2599  loadMaskedFunc = m->module->getFunction(lmName);
2600  Assert(pseudoFunc != NULL && loadMaskedFunc != NULL);
2601  scalarType = st;
2602  }
2603 
2604  llvm::Function *pseudoFunc;
2605  llvm::Function *loadMaskedFunc;
2606  llvm::Type *scalarType;
2607  const int align;
2608  const bool isFactored;
2609  };
2610 
2611  GatherImpInfo gInfo[] = {
2612  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8"
2613  : "__pseudo_gather_factored_base_offsets32_i8",
2614  "__masked_load_i8", LLVMTypes::Int8Type, 1),
2615  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16"
2616  : "__pseudo_gather_factored_base_offsets32_i16",
2617  "__masked_load_i16", LLVMTypes::Int16Type, 2),
2618  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32"
2619  : "__pseudo_gather_factored_base_offsets32_i32",
2620  "__masked_load_i32", LLVMTypes::Int32Type, 4),
2621  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_float"
2622  : "__pseudo_gather_factored_base_offsets32_float",
2623  "__masked_load_float", LLVMTypes::FloatType, 4),
2624  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64"
2625  : "__pseudo_gather_factored_base_offsets32_i64",
2626  "__masked_load_i64", LLVMTypes::Int64Type, 8),
2627  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_double"
2628  : "__pseudo_gather_factored_base_offsets32_double",
2629  "__masked_load_double", LLVMTypes::DoubleType, 8),
2630  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i8"
2631  : "__pseudo_gather_factored_base_offsets64_i8",
2632  "__masked_load_i8", LLVMTypes::Int8Type, 1),
2633  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i16"
2634  : "__pseudo_gather_factored_base_offsets64_i16",
2635  "__masked_load_i16", LLVMTypes::Int16Type, 2),
2636  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i32"
2637  : "__pseudo_gather_factored_base_offsets64_i32",
2638  "__masked_load_i32", LLVMTypes::Int32Type, 4),
2639  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_float"
2640  : "__pseudo_gather_factored_base_offsets64_float",
2641  "__masked_load_float", LLVMTypes::FloatType, 4),
2642  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i64"
2643  : "__pseudo_gather_factored_base_offsets64_i64",
2644  "__masked_load_i64", LLVMTypes::Int64Type, 8),
2645  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_double"
2646  : "__pseudo_gather_factored_base_offsets64_double",
2647  "__masked_load_double", LLVMTypes::DoubleType, 8),
2648  };
2649 
2650  struct ScatterImpInfo {
2651  ScatterImpInfo(const char *pName, const char *msName, llvm::Type *vpt, int a)
2652  : align(a), isFactored(!g->target->hasScatter()) {
2653  pseudoFunc = m->module->getFunction(pName);
2654  maskedStoreFunc = m->module->getFunction(msName);
2655  vecPtrType = vpt;
2656  Assert(pseudoFunc != NULL && maskedStoreFunc != NULL);
2657  }
2658  llvm::Function *pseudoFunc;
2659  llvm::Function *maskedStoreFunc;
2660  llvm::Type *vecPtrType;
2661  const int align;
2662  const bool isFactored;
2663  };
2664 
2665  ScatterImpInfo sInfo[] = {
2666  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8"
2667  : "__pseudo_scatter_factored_base_offsets32_i8",
2668  "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1),
2669  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16"
2670  : "__pseudo_scatter_factored_base_offsets32_i16",
2671  "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2),
2672  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32"
2673  : "__pseudo_scatter_factored_base_offsets32_i32",
2674  "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4),
2675  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float"
2676  : "__pseudo_scatter_factored_base_offsets32_float",
2677  "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4),
2678  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64"
2679  : "__pseudo_scatter_factored_base_offsets32_i64",
2680  "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8),
2681  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double"
2682  : "__pseudo_scatter_factored_base_offsets32_double",
2683  "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8),
2684  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i8"
2685  : "__pseudo_scatter_factored_base_offsets64_i8",
2686  "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1),
2687  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i16"
2688  : "__pseudo_scatter_factored_base_offsets64_i16",
2689  "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2),
2690  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i32"
2691  : "__pseudo_scatter_factored_base_offsets64_i32",
2692  "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4),
2693  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_float"
2694  : "__pseudo_scatter_factored_base_offsets64_float",
2695  "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4),
2696  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i64"
2697  : "__pseudo_scatter_factored_base_offsets64_i64",
2698  "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8),
2699  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_double"
2700  : "__pseudo_scatter_factored_base_offsets64_double",
2701  "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8),
2702  };
2703 
2704  llvm::Function *calledFunc = callInst->getCalledFunction();
2705 
2706  GatherImpInfo *gatherInfo = NULL;
2707  ScatterImpInfo *scatterInfo = NULL;
2708  for (unsigned int i = 0; i < sizeof(gInfo) / sizeof(gInfo[0]); ++i) {
2709  if (gInfo[i].pseudoFunc != NULL && calledFunc == gInfo[i].pseudoFunc) {
2710  gatherInfo = &gInfo[i];
2711  break;
2712  }
2713  }
2714  for (unsigned int i = 0; i < sizeof(sInfo) / sizeof(sInfo[0]); ++i) {
2715  if (sInfo[i].pseudoFunc != NULL && calledFunc == sInfo[i].pseudoFunc) {
2716  scatterInfo = &sInfo[i];
2717  break;
2718  }
2719  }
2720  if (gatherInfo == NULL && scatterInfo == NULL)
2721  return false;
2722 
2723  SourcePos pos;
2724  lGetSourcePosFromMetadata(callInst, &pos);
2725 
2726  llvm::Value *base = callInst->getArgOperand(0);
2727  llvm::Value *fullOffsets = NULL;
2728  llvm::Value *storeValue = NULL;
2729  llvm::Value *mask = NULL;
2730 
2731  if ((gatherInfo != NULL && gatherInfo->isFactored) || (scatterInfo != NULL && scatterInfo->isFactored)) {
2732  llvm::Value *varyingOffsets = callInst->getArgOperand(1);
2733  llvm::Value *offsetScale = callInst->getArgOperand(2);
2734  llvm::Value *constOffsets = callInst->getArgOperand(3);
2735  if (scatterInfo)
2736  storeValue = callInst->getArgOperand(4);
2737  mask = callInst->getArgOperand((gatherInfo != NULL) ? 4 : 5);
2738 
2739  // Compute the full offset vector: offsetScale * varyingOffsets + constOffsets
2740  llvm::Constant *offsetScaleVec = lGetOffsetScaleVec(offsetScale, varyingOffsets->getType());
2741 
2742  llvm::Value *scaledVarying = llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec,
2743  varyingOffsets, "scaled_varying", callInst);
2744  fullOffsets = llvm::BinaryOperator::Create(llvm::Instruction::Add, scaledVarying, constOffsets,
2745  "varying+const_offsets", callInst);
2746  } else {
2747  if (scatterInfo)
2748  storeValue = callInst->getArgOperand(3);
2749  mask = callInst->getArgOperand((gatherInfo != NULL) ? 3 : 4);
2750 
2751  llvm::Value *offsetScale = callInst->getArgOperand(1);
2752  llvm::Value *offsets = callInst->getArgOperand(2);
2753  llvm::Value *offsetScaleVec = lGetOffsetScaleVec(offsetScale, offsets->getType());
2754 
2755  fullOffsets =
2756  llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec, offsets, "scaled_offsets", callInst);
2757  }
2758 
2759  Debug(SourcePos(), "GSToLoadStore: %s.", fullOffsets->getName().str().c_str());
2760 
2761  if (LLVMVectorValuesAllEqual(fullOffsets)) {
2762  // If all the offsets are equal, then compute the single
2763  // pointer they all represent based on the first one of them
2764  // (arbitrarily).
2765  llvm::Value *ptr = lComputeCommonPointer(base, fullOffsets, callInst);
2766  lCopyMetadata(ptr, callInst);
2767 
2768  if (gatherInfo != NULL) {
2769  // A gather with everyone going to the same location is
2770  // handled as a scalar load and broadcast across the lanes.
2771  Debug(pos, "Transformed gather to scalar load and broadcast!");
2772 
2773  ptr =
2774  new llvm::BitCastInst(ptr, llvm::PointerType::get(gatherInfo->scalarType, 0), ptr->getName(), callInst);
2775  llvm::Value *scalarValue = new llvm::LoadInst(ptr, callInst->getName(), callInst);
2776 
2777  // Generate the following sequence:
2778  // %name123 = insertelement <4 x i32> undef, i32 %val, i32 0
2779  // %name124 = shufflevector <4 x i32> %name123, <4 x i32> undef,
2780  // <4 x i32> zeroinitializer
2781  llvm::Value *undef1Value = llvm::UndefValue::get(callInst->getType());
2782  llvm::Value *undef2Value = llvm::UndefValue::get(callInst->getType());
2783  llvm::Value *insertVec =
2784  llvm::InsertElementInst::Create(undef1Value, scalarValue, LLVMInt32(0), callInst->getName(), callInst);
2785  llvm::Value *zeroMask =
2786  llvm::ConstantVector::getSplat(callInst->getType()->getVectorNumElements(),
2787  llvm::Constant::getNullValue(llvm::Type::getInt32Ty(*g->ctx)));
2788  llvm::Value *shufValue = new llvm::ShuffleVectorInst(insertVec, undef2Value, zeroMask, callInst->getName());
2789 
2790  lCopyMetadata(shufValue, callInst);
2791  llvm::ReplaceInstWithInst(callInst, llvm::dyn_cast<llvm::Instruction>(shufValue));
2792  return true;
2793  } else {
2794  // A scatter with everyone going to the same location is
2795  // undefined (if there's more than one program instance in
2796  // the gang). Issue a warning.
2797  if (g->target->getVectorWidth() > 1)
2798  Warning(pos, "Undefined behavior: all program instances are "
2799  "writing to the same location!");
2800 
2801  // We could do something similar to the gather case, where
2802  // we arbitrarily write one of the values, but we need to
2803  // a) check to be sure the mask isn't all off and b) pick
2804  // the value from an executing program instance in that
2805  // case. We'll just let a bunch of the program instances
2806  // do redundant writes, since this isn't important to make
2807  // fast anyway...
2808  return false;
2809  }
2810  } else {
2811  int step = gatherInfo ? gatherInfo->align : scatterInfo->align;
2812 
2813  if (step > 0 && LLVMVectorIsLinear(fullOffsets, step)) {
2814  // We have a linear sequence of memory locations being accessed
2815  // starting with the location given by the offset from
2816  // offsetElements[0], with a stride equal to the element size in bytes
2817  // (e.g., 4 or 8 bytes for 32-bit and 64-bit element types, respectively).
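  // For example (illustrative): a 4-wide i32 gather whose computed byte
  // offsets are <0, 4, 8, 12> reads 16 contiguous bytes, so it can be
  // replaced with a single (possibly unaligned) masked vector load from
  // the common pointer rather than four separate memory accesses.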
2818  llvm::Value *ptr = lComputeCommonPointer(base, fullOffsets, callInst);
2819  lCopyMetadata(ptr, callInst);
2820 
2821  if (gatherInfo != NULL) {
2822  Debug(pos, "Transformed gather to unaligned vector load!");
2823  llvm::Instruction *newCall =
2824  lCallInst(gatherInfo->loadMaskedFunc, ptr, mask, LLVMGetName(ptr, "_masked_load"));
2825  lCopyMetadata(newCall, callInst);
2826  llvm::ReplaceInstWithInst(callInst, newCall);
2827  return true;
2828  } else {
2829  Debug(pos, "Transformed scatter to unaligned vector store!");
2830  ptr = new llvm::BitCastInst(ptr, scatterInfo->vecPtrType, "ptrcast", callInst);
2831  llvm::Instruction *newCall = lCallInst(scatterInfo->maskedStoreFunc, ptr, storeValue, mask, "");
2832  lCopyMetadata(newCall, callInst);
2833  llvm::ReplaceInstWithInst(callInst, newCall);
2834  return true;
2835  }
2836  }
2837  return false;
2838  }
2839 }
2840 
2841 ///////////////////////////////////////////////////////////////////////////
2842 // MaskedStoreOptPass
2843 
2844 /** Masked stores are generally more complex than regular stores; for
2845  example, they require multiple instructions to simulate under SSE.
2846  This optimization detects cases where masked stores can be replaced
2847  with regular stores or removed entirely, for the cases of an 'all on'
2848  mask and an 'all off' mask, respectively.
2849 */
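/* For illustration (example not from the original source): an ispc store
   performed under varying control flow, e.g.

       if (v < 0)
           a[programIndex] = v;

   is typically emitted as a __pseudo_masked_store_*() call. If the mask can
   be shown to be all-on at compile time, the call below is turned into an
   ordinary unmasked vector store; if it is provably all-off, the call is
   simply erased. */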
2850 static bool lImproveMaskedStore(llvm::CallInst *callInst) {
2851  struct MSInfo {
2852  MSInfo(const char *name, const int a) : align(a) {
2853  func = m->module->getFunction(name);
2854  Assert(func != NULL);
2855  }
2856  llvm::Function *func;
2857  const int align;
2858  };
2859 
2860  MSInfo msInfo[] = {MSInfo("__pseudo_masked_store_i8", 1), MSInfo("__pseudo_masked_store_i16", 2),
2861  MSInfo("__pseudo_masked_store_i32", 4), MSInfo("__pseudo_masked_store_float", 4),
2862  MSInfo("__pseudo_masked_store_i64", 8), MSInfo("__pseudo_masked_store_double", 8),
2863  MSInfo("__masked_store_blend_i8", 1), MSInfo("__masked_store_blend_i16", 2),
2864  MSInfo("__masked_store_blend_i32", 4), MSInfo("__masked_store_blend_float", 4),
2865  MSInfo("__masked_store_blend_i64", 8), MSInfo("__masked_store_blend_double", 8),
2866  MSInfo("__masked_store_i8", 1), MSInfo("__masked_store_i16", 2),
2867  MSInfo("__masked_store_i32", 4), MSInfo("__masked_store_float", 4),
2868  MSInfo("__masked_store_i64", 8), MSInfo("__masked_store_double", 8)};
2869 
2870  llvm::Function *called = callInst->getCalledFunction();
2871 
2872  int nMSFuncs = sizeof(msInfo) / sizeof(msInfo[0]);
2873  MSInfo *info = NULL;
2874  for (int i = 0; i < nMSFuncs; ++i) {
2875  if (msInfo[i].func != NULL && called == msInfo[i].func) {
2876  info = &msInfo[i];
2877  break;
2878  }
2879  }
2880  if (info == NULL)
2881  return false;
2882 
2883  // Got one; grab the operands
2884  llvm::Value *lvalue = callInst->getArgOperand(0);
2885  llvm::Value *rvalue = callInst->getArgOperand(1);
2886  llvm::Value *mask = callInst->getArgOperand(2);
2887 
2888  MaskStatus maskStatus = lGetMaskStatus(mask);
2889  if (maskStatus == ALL_OFF) {
2890  // Zero mask - no-op, so remove the store completely. (This
2891  // may in turn lead to being able to optimize out instructions
2892  // that compute the rvalue...)
2893  callInst->eraseFromParent();
2894  return true;
2895  } else if (maskStatus == ALL_ON) {
2896  // The mask is all on, so turn this into a regular store
2897  llvm::Type *rvalueType = rvalue->getType();
2898  llvm::Type *ptrType = llvm::PointerType::get(rvalueType, 0);
2899 
2900  lvalue = new llvm::BitCastInst(lvalue, ptrType, "lvalue_to_ptr_type", callInst);
2901  lCopyMetadata(lvalue, callInst);
2902  llvm::Instruction *store =
2903  new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
2904  g->opt.forceAlignedMemory ? g->target->getNativeVectorAlignment() : info->align);
2905  lCopyMetadata(store, callInst);
2906  llvm::ReplaceInstWithInst(callInst, store);
2907  return true;
2908  }
2909 
2910  return false;
2911 }
2912 
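/** Analogous to lImproveMaskedStore() above, but for masked loads: if the
    mask of a __masked_load_*() call is statically all-off, the load is a
    no-op and its result is replaced with an undef value; if the mask is
    statically all-on, the call is turned into a regular, unmasked vector
    load. */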
2913 static bool lImproveMaskedLoad(llvm::CallInst *callInst, llvm::BasicBlock::iterator iter) {
2914  struct MLInfo {
2915  MLInfo(const char *name, const int a) : align(a) {
2916  func = m->module->getFunction(name);
2917  Assert(func != NULL);
2918  }
2919  llvm::Function *func;
2920  const int align;
2921  };
2922 
2923  MLInfo mlInfo[] = {MLInfo("__masked_load_i8", 1), MLInfo("__masked_load_i16", 2),
2924  MLInfo("__masked_load_i32", 4), MLInfo("__masked_load_float", 4),
2925  MLInfo("__masked_load_i64", 8), MLInfo("__masked_load_double", 8)};
2926 
2927  llvm::Function *called = callInst->getCalledFunction();
2928 
2929  int nFuncs = sizeof(mlInfo) / sizeof(mlInfo[0]);
2930  MLInfo *info = NULL;
2931  for (int i = 0; i < nFuncs; ++i) {
2932  if (mlInfo[i].func != NULL && called == mlInfo[i].func) {
2933  info = &mlInfo[i];
2934  break;
2935  }
2936  }
2937  if (info == NULL)
2938  return false;
2939 
2940  // Got one; grab the operands
2941  llvm::Value *ptr = callInst->getArgOperand(0);
2942  llvm::Value *mask = callInst->getArgOperand(1);
2943 
2944  MaskStatus maskStatus = lGetMaskStatus(mask);
2945  if (maskStatus == ALL_OFF) {
2946  // Zero mask - no-op, so replace the load with an undef value
2947  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, llvm::UndefValue::get(callInst->getType()));
2948  return true;
2949  } else if (maskStatus == ALL_ON) {
2950  // The mask is all on, so turn this into a regular load
2951  llvm::Type *ptrType = llvm::PointerType::get(callInst->getType(), 0);
2952  ptr = new llvm::BitCastInst(ptr, ptrType, "ptr_cast_for_load", callInst);
2953  llvm::Instruction *load = new llvm::LoadInst(
2954  ptr, callInst->getName(), false /* not volatile */,
2955  g->opt.forceAlignedMemory ? g->target->getNativeVectorAlignment() : info->align, (llvm::Instruction *)NULL);
2956  lCopyMetadata(load, callInst);
2957  llvm::ReplaceInstWithInst(callInst, load);
2958  return true;
2959  } else
2960  return false;
2961 }
2962 
2963 bool ImproveMemoryOpsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
2964  DEBUG_START_PASS("ImproveMemoryOps");
2965 
2966  bool modifiedAny = false;
2967 restart:
2968  // Iterate through all of the instructions in the basic block.
2969  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
2970  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
2971  // If we don't have a call to one of the
2972  // __pseudo_{gather,scatter}_* functions, then just go on to the
2973  // next instruction.
2974  if (callInst == NULL || callInst->getCalledFunction() == NULL)
2975  continue;
2976 
2977  if (lGSToGSBaseOffsets(callInst)) {
2978  modifiedAny = true;
2979  goto restart;
2980  }
2981  if (lGSBaseOffsetsGetMoreConst(callInst)) {
2982  modifiedAny = true;
2983  goto restart;
2984  }
2985  if (lGSToLoadStore(callInst)) {
2986  modifiedAny = true;
2987  goto restart;
2988  }
2989  if (lImproveMaskedStore(callInst)) {
2990  modifiedAny = true;
2991  goto restart;
2992  }
2993  if (lImproveMaskedLoad(callInst, iter)) {
2994  modifiedAny = true;
2995  goto restart;
2996  }
2997  }
2998 
2999  DEBUG_END_PASS("ImproveMemoryOps");
3000 
3001  return modifiedAny;
3002 }
3003 
3004 static llvm::Pass *CreateImproveMemoryOpsPass() { return new ImproveMemoryOpsPass; }
3005 
3006 ///////////////////////////////////////////////////////////////////////////
3007 // GatherCoalescePass
3008 
3009 // This pass implements two optimizations to improve the performance of
3010 // gathers; currently only gathers of 32-bit values where it can be
3011 // determined at compile time that the mask is all on are supported, though
3012 // both of those limitations may be generalized in the future.
3013 //
3014 // First, for any single gather, see if it's worthwhile to break it into
3015 // any of scalar, 2-wide (i.e. 64-bit), 4-wide, or 8-wide loads, generating
3016 // code that shuffles the results of those loads into the gather's result
3017 // vector. Doing fewer, larger loads in this manner, when possible, can be more efficient.
3018 //
3019 // Second, this pass can coalesce memory accesses across multiple
3020 // gathers. If we have a series of gathers without any memory writes in
3021 // the middle, then we try to analyze their reads collectively and choose
3022 // an efficient set of loads for them. Not only does this help if
3023 // different gathers reuse values from the same location in memory, but
3024 // it's specifically helpful when data with AOS layout is being accessed;
3025 // in this case, we're often able to generate wide vector loads and
3026 // appropriate shuffles automatically.
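//
// As an illustrative example (not from the original source), with an
// array-of-structures layout such as
//
//     struct Point { float x, y, z; };
//     uniform Point pts[...];
//     // with a varying index i:
//     float x = pts[i].x, y = pts[i].y, z = pts[i].z;   // three gathers
//
// the three gathers share a common base pointer and differ only in their
// constant offsets, so their reads can often be satisfied by a few wide
// contiguous loads plus shuffles instead of three separate gathers.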
3027 
3028 class GatherCoalescePass : public llvm::BasicBlockPass {
3029  public:
3030  static char ID;
3031  GatherCoalescePass() : BasicBlockPass(ID) {}
3032 
3033 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
3034  const char *getPassName() const { return "Gather Coalescing"; }
3035 #else // LLVM 4.0+
3036  llvm::StringRef getPassName() const { return "Gather Coalescing"; }
3037 #endif
3038  bool runOnBasicBlock(llvm::BasicBlock &BB);
3039 };
3040 
3041 char GatherCoalescePass::ID = 0;
3042 
3043 /** Representation of a memory load that the gather coalescing code has
3044  decided to generate.
3045  */
3046 struct CoalescedLoadOp {
3047  CoalescedLoadOp(int64_t s, int c) {
3048  start = s;
3049  count = c;
3050  load = element0 = element1 = NULL;
3051  }
3052 
3053  /** Starting offset of the load from the common base pointer (in terms
3054  of numbers of items of the underlying element type--*not* in terms
3055  of bytes). */
3056  int64_t start;
3057 
3058  /** Number of elements to load at this location */
3059  int count;
3060 
3061  /** Value loaded from memory for this load op */
3062  llvm::Value *load;
3063 
3064  /** For 2-wide loads (i.e. 64-bit loads), these store the lower and
3065  upper 32 bits of the result, respectively. */
3066  llvm::Value *element0, *element1;
3067 };
3068 
3069 /** This function determines whether it makes sense (and is safe) to
3070  generate a vector load of width vectorWidth, starting at *iter. It
3071  returns true if so, setting *newIter to point to the next element in
3072  the set that isn't taken care of by the generated load. If a vector
3073  load of the given width doesn't make sense, then false is returned.
3074  */
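/* A few illustrative cases for vectorWidth == 4: the offset set
   {0, 1, 2, 3} is accepted as a single 4-wide load; {0, 2, 3, 10} yields a
   4-wide load covering {0, 2, 3}, with *newIter left pointing at 10; and
   {0, 7} is rejected, since the gap between consecutive offsets exceeds
   the limit of 3 enforced below. */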
3075 static bool lVectorLoadIsEfficient(std::set<int64_t>::iterator iter, std::set<int64_t>::iterator end,
3076  std::set<int64_t>::iterator *newIter, int vectorWidth) {
3077  // We're considering a vector load of width vectorWidth, starting at
3078  // the offset "start".
3079  int64_t start = *iter;
3080 
3081  // The basic idea is that we'll look at the subsequent elements in the
3082  // load set after the initial one at start. As long as subsequent
3083  // elements:
3084  //
3085  // 1. Aren't so far separated that they no longer fit into the range
3086  // [start, start+vectorWidth)
3087  //
3088  // 2. And don't have too large a gap in between them (e.g., it's not
3089  // worth generating an 8-wide load for two elements with offsets 0
3090  // and 7 with no loads requested in between).
3091  //
3092  // Then we continue moving forward through the elements until we either
3093  // fill up the vector or run out of elements.
3094 
3095  // lastAccepted holds the last offset we've processed and accepted as
3096  // valid for the vector load under consideration
3097  int64_t lastAccepted = start;
3098 
3099  while (iter != end) {
3100  // What is the separation in offset values from the last element we
3101  // added to the set for this load?
3102  int64_t delta = *iter - lastAccepted;
3103  if (delta > 3)
3104  // If there's too big a gap, then we won't issue the load
3105  return false;
3106 
3107  int64_t span = *iter - start + 1;
3108 
3109  if (span == vectorWidth) {
3110  // We've extended far enough that we have exactly filled up the
3111  // entire vector width; we can't go any further, so return with
3112  // success. (Update *newIter to point at the next element
3113  // after the last one accepted here.)
3114  *newIter = ++iter;
3115  return true;
3116  } else if (span > vectorWidth) {
3117  // The current offset won't fit into a vectorWidth-wide load
3118  // starting from start. It's still generally worthwhile
3119  // issuing the load we've been considering, though, since it
3120  // will provide values for a number of previous offsets. This
3121  // load will have one or more elements at the end of its range
3122  // that are not needed by any of the offsets under
3123  // consideration. As such, there are three cases where issuing
3124  // this load is a bad idea:
3125  //
3126  // 1. 2-wide loads: we know that we haven't completely filled
3127  // the 2-wide vector, since otherwise the if() test above
3128  // would have succeeded previously. Therefore, we must have
3129  // a situation with offsets like (4,6,...); it would be a
3130  // silly idea to issue a 2-wide load to get the value for
3131  // the 4 offset, versus failing here and issuing a scalar
3132  // load instead.
3133  //
3134  // 2. If there are too many unnecessary values at the end of
3135  // the load extent (defined as more than half of them)--in
3136  // this case, it'd be better to issue a vector load of
3137  // smaller width anyway.
3138  //
3139  // 3. If the gap between the last accepted offset and the
3140  // current one under consideration is more than the page
3141  // size. In this case we can't be sure whether or not some
3142  // of the unused elements at the end of the load will
3143  // straddle a page boundary and thus lead to an undesirable
3144  // fault. (It's hard to imagine this happening in practice,
3145  // except under contrived circumstances, but better safe
3146  // than sorry.)
3147  const int pageSize = 4096;
3148  if (vectorWidth != 2 && (lastAccepted - start) > (vectorWidth / 2) && (*iter - lastAccepted) < pageSize) {
3149  *newIter = iter;
3150  return true;
3151  } else
3152  return false;
3153  }
3154 
3155  // Continue moving forward
3156  lastAccepted = *iter;
3157  ++iter;
3158  }
3159 
3160  return false;
3161 }
3162 
3163 /** Given a set of offsets from a common base pointer that we need to get
3164  loaded into memory, determine a reasonable set of load operations that
3165  gets all of the corresponding values in memory (ideally, including as
3166  many as possible wider vector loads rather than scalar loads). Return
3167  a CoalescedLoadOp for each one in the *loads array.
3168  */
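/* For example (illustrative): the offsets {0, 1, 2, 3, 10} produce a
   4-wide load at offset 0 followed by a scalar load at offset 10, and
   {0, 1, 8, 9} produces two 2-wide loads, at offsets 0 and 8. */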
3169 static void lSelectLoads(const std::vector<int64_t> &loadOffsets, std::vector<CoalescedLoadOp> *loads) {
3170  // First, get a sorted set of unique offsets to load from.
3171  std::set<int64_t> allOffsets;
3172  for (unsigned int i = 0; i < loadOffsets.size(); ++i)
3173  allOffsets.insert(loadOffsets[i]);
3174 
3175  std::set<int64_t>::iterator iter = allOffsets.begin();
3176  while (iter != allOffsets.end()) {
3177  Debug(SourcePos(), "Load needed at %" PRId64 ".", *iter);
3178  ++iter;
3179  }
3180 
3181  // Now, iterate over the offsets from low to high. Starting at the
3182  // current offset, we see if a vector load starting from that offset
3183  // will cover loads at subsequent offsets as well.
3184  iter = allOffsets.begin();
3185  while (iter != allOffsets.end()) {
3186  // Consider vector loads of each of the widths in
3187  // vectorWidths[], in order.
3188  int vectorWidths[] = {8, 4, 2};
3189  int nVectorWidths = sizeof(vectorWidths) / sizeof(vectorWidths[0]);
3190  bool gotOne = false;
3191  for (int i = 0; i < nVectorWidths; ++i) {
3192  // See if a load of vector with width vectorWidths[i] would be
3193  // effective (i.e. would cover a reasonable number of the
3194  // offsets that need to be loaded from).
3195  std::set<int64_t>::iterator newIter;
3196  if (lVectorLoadIsEfficient(iter, allOffsets.end(), &newIter, vectorWidths[i])) {
3197  // Yes: create the corresponding coalesced load and update
3198  // the iterator to the returned iterator; doing so skips
3199  // over the additional offsets that are taken care of by
3200  // this load.
3201  loads->push_back(CoalescedLoadOp(*iter, vectorWidths[i]));
3202  iter = newIter;
3203  gotOne = true;
3204  break;
3205  }
3206  }
3207 
3208  if (gotOne == false) {
3209  // We couldn't find a vector load starting from this offset
3210  // that made sense, so emit a scalar load and continue onward.
3211  loads->push_back(CoalescedLoadOp(*iter, 1));
3212  ++iter;
3213  }
3214  }
3215 }
3216 
3217 /** Print a performance message with the details of the result of
3218  coalescing over a group of gathers. */
3219 static void lCoalescePerfInfo(const std::vector<llvm::CallInst *> &coalesceGroup,
3220  const std::vector<CoalescedLoadOp> &loadOps) {
3221  SourcePos pos;
3222  lGetSourcePosFromMetadata(coalesceGroup[0], &pos);
3223 
3224  // Create a string that indicates the line numbers of the subsequent
3225  // gathers from the first one that were coalesced here.
3226  char otherPositions[512];
3227  otherPositions[0] = '\0';
3228  if (coalesceGroup.size() > 1) {
3229  const char *plural = (coalesceGroup.size() > 2) ? "s" : "";
3230  char otherBuf[32];
3231  snprintf(otherBuf, sizeof(otherBuf), "(other%s at line%s ", plural, plural);
3232  strncat(otherPositions, otherBuf, sizeof(otherPositions) - strlen(otherPositions) - 1);
3233 
3234  for (int i = 1; i < (int)coalesceGroup.size(); ++i) {
3235  SourcePos p;
3236  bool ok = lGetSourcePosFromMetadata(coalesceGroup[i], &p);
3237  if (ok) {
3238  char buf[32];
3239  snprintf(buf, sizeof(buf), "%d", p.first_line);
3240  strncat(otherPositions, buf, sizeof(otherPositions) - strlen(otherPositions) - 1);
3241  if (i < (int)coalesceGroup.size() - 1)
3242  strncat(otherPositions, ", ", sizeof(otherPositions) - strlen(otherPositions) - 1);
3243  }
3244  }
3245  strncat(otherPositions, ") ", sizeof(otherPositions) - strlen(otherPositions) - 1);
3246  }
3247 
3248  // Count how many loads of each size there were.
3249  std::map<int, int> loadOpsCount;
3250  for (int i = 0; i < (int)loadOps.size(); ++i)
3251  ++loadOpsCount[loadOps[i].count];
3252 
3253  // Generate a string that describes the mix of load ops
3254  char loadOpsInfo[512];
3255  loadOpsInfo[0] = '\0';
3256  std::map<int, int>::const_iterator iter = loadOpsCount.begin();
3257  while (iter != loadOpsCount.end()) {
3258  char buf[32];
3259  snprintf(buf, sizeof(buf), "%d x %d-wide", iter->second, iter->first);
3260  if ((strlen(loadOpsInfo) + strlen(buf)) >= 512) {
3261  break;
3262  }
3263  strncat(loadOpsInfo, buf, sizeof(loadOpsInfo) - strlen(loadOpsInfo) - 1);
3264  ++iter;
3265  if (iter != loadOpsCount.end())
3266  strncat(loadOpsInfo, ", ", sizeof(loadOpsInfo) - strlen(loadOpsInfo) - 1);
3267  }
3268 
3269  if (g->opt.level > 0) {
3270  if (coalesceGroup.size() == 1)
3271  PerformanceWarning(pos, "Coalesced gather into %d load%s (%s).", (int)loadOps.size(),
3272  (loadOps.size() > 1) ? "s" : "", loadOpsInfo);
3273  else
3274  PerformanceWarning(pos,
3275  "Coalesced %d gathers starting here %sinto %d "
3276  "load%s (%s).",
3277  (int)coalesceGroup.size(), otherPositions, (int)loadOps.size(),
3278  (loadOps.size() > 1) ? "s" : "", loadOpsInfo);
3279  }
3280 }
3281 
3282 /** Utility routine that computes an offset from a base pointer and then
3283  returns the result of a load of the given type from the resulting
3284  location:
3285 
3286  return *((type *)(basePtr + offset))
3287  */
3288 llvm::Value *lGEPAndLoad(llvm::Value *basePtr, int64_t offset, int align, llvm::Instruction *insertBefore,
3289  llvm::Type *type) {
3290  llvm::Value *ptr = lGEPInst(basePtr, LLVMInt64(offset), "new_base", insertBefore);
3291  ptr = new llvm::BitCastInst(ptr, llvm::PointerType::get(type, 0), "ptr_cast", insertBefore);
3292  return new llvm::LoadInst(ptr, "gather_load", false /* not volatile */, align, insertBefore);
3293 }
3294 
3295 /* Having decided that we're going to emit a series of loads, as encoded in
3296  the loadOps array, this function emits the corresponding load
3297  instructions.
3298  */
3299 static void lEmitLoads(llvm::Value *basePtr, std::vector<CoalescedLoadOp> &loadOps, int elementSize,
3300  llvm::Instruction *insertBefore) {
3301  Debug(SourcePos(), "Coalesce doing %d loads.", (int)loadOps.size());
3302  for (int i = 0; i < (int)loadOps.size(); ++i) {
3303  Debug(SourcePos(), "Load #%d @ %" PRId64 ", %d items", i, loadOps[i].start, loadOps[i].count);
3304 
3305  // basePtr is an i8 *, so the offset from it should be in terms of
3306  // bytes, not underlying i32 elements.
3307  int64_t start = loadOps[i].start * elementSize;
3308 
3309  int align = 4;
3310  switch (loadOps[i].count) {
3311  case 1:
3312  // Single 32-bit scalar load
3313  loadOps[i].load = lGEPAndLoad(basePtr, start, align, insertBefore, LLVMTypes::Int32Type);
3314  break;
3315  case 2: {
3316  // Emit 2 x i32 loads as i64 loads and then break the result
3317  // into two 32-bit parts.
3318  loadOps[i].load = lGEPAndLoad(basePtr, start, align, insertBefore, LLVMTypes::Int64Type);
3319  // element0 = (int32)value;
3320  loadOps[i].element0 =
3321  new llvm::TruncInst(loadOps[i].load, LLVMTypes::Int32Type, "load64_elt0", insertBefore);
3322  // element1 = (int32)(value >> 32)
3323  llvm::Value *shift = llvm::BinaryOperator::Create(llvm::Instruction::LShr, loadOps[i].load, LLVMInt64(32),
3324  "load64_shift", insertBefore);
3325  loadOps[i].element1 = new llvm::TruncInst(shift, LLVMTypes::Int32Type, "load64_elt1", insertBefore);
3326  break;
3327  }
3328  case 4: {
3329  // 4-wide vector load
3330  if (g->opt.forceAlignedMemory) {
3331  align = g->target->getNativeVectorAlignment();
3332  }
3333  llvm::VectorType *vt = llvm::VectorType::get(LLVMTypes::Int32Type, 4);
3334  loadOps[i].load = lGEPAndLoad(basePtr, start, align, insertBefore, vt);
3335  break;
3336  }
3337  case 8: {
3338  // 8-wide vector load
3339  if (g->opt.forceAlignedMemory) {
3340  align = g->target->getNativeVectorAlignment();
3341  }
3342  llvm::VectorType *vt = llvm::VectorType::get(LLVMTypes::Int32Type, 8);
3343  loadOps[i].load = lGEPAndLoad(basePtr, start, align, insertBefore, vt);
3344  break;
3345  }
3346  default:
3347  FATAL("Unexpected load count in lEmitLoads()");
3348  }
3349  }
3350 }
3351 
3352 /** Convert any loads of 8-wide vectors into two 4-wide vectors
3353  (logically). This allows the assembly code below to always operate on
3354  4-wide vectors, which leads to better code. Returns a new vector of
3355  load operations.
3356  */
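/* For example (illustrative): an 8-wide load starting at element offset 16
   becomes two logical 4-wide CoalescedLoadOps at offsets 16 and 20, whose
   "load" values are shuffles selecting elements <0,1,2,3> and <4,5,6,7> of
   the original 8-wide loaded value. */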
3357 static std::vector<CoalescedLoadOp> lSplit8WideLoads(const std::vector<CoalescedLoadOp> &loadOps,
3358  llvm::Instruction *insertBefore) {
3359  std::vector<CoalescedLoadOp> ret;
3360  for (unsigned int i = 0; i < loadOps.size(); ++i) {
3361  if (loadOps[i].count == 8) {
3362  // Create fake CoalescedLoadOps, where the load llvm::Value is
3363  // actually a shuffle that pulls either the first 4 or the last
3364  // 4 values out of the original 8-wide loaded value.
3365  int32_t shuf[2][4] = {{0, 1, 2, 3}, {4, 5, 6, 7}};
3366 
3367  ret.push_back(CoalescedLoadOp(loadOps[i].start, 4));
3368  ret.back().load = LLVMShuffleVectors(loadOps[i].load, loadOps[i].load, shuf[0], 4, insertBefore);
3369 
3370  ret.push_back(CoalescedLoadOp(loadOps[i].start + 4, 4));
3371  ret.back().load = LLVMShuffleVectors(loadOps[i].load, loadOps[i].load, shuf[1], 4, insertBefore);
3372  } else
3373  ret.push_back(loadOps[i]);
3374  }
3375 
3376  return ret;
3377 }
3378 
3379 /** Given a 1-wide load of a 32-bit value, merge its value into the result
3380  vector for any and all elements for which it applies.
3381  */
3382 static llvm::Value *lApplyLoad1(llvm::Value *result, const CoalescedLoadOp &load, const int64_t offsets[4], bool set[4],
3383  llvm::Instruction *insertBefore) {
3384  for (int elt = 0; elt < 4; ++elt) {
3385  if (offsets[elt] >= load.start && offsets[elt] < load.start + load.count) {
3386  Debug(SourcePos(),
3387  "Load 1 @ %" PRId64 " matches for element #%d "
3388  "(value %" PRId64 ")",
3389  load.start, elt, offsets[elt]);
3390  // If this load gives one of the values that we need, then we
3391  // can just insert it in directly
3392  Assert(set[elt] == false);
3393  result = llvm::InsertElementInst::Create(result, load.load, LLVMInt32(elt), "insert_load", insertBefore);
3394  set[elt] = true;
3395  }
3396  }
3397 
3398  return result;
3399 }
3400 
3401 /** Similarly, incorporate the values from a 2-wide load into any vector
3402  elements that they apply to. */
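/* For example (illustrative): with result offsets <8, 9, 20, 21> and a
   2-wide load starting at offset 8, the first case below inserts the whole
   64-bit loaded value into element 0 of the result viewed as <2 x i64>;
   when only one half of a 2-wide load is needed, its element0 or element1
   half is inserted individually instead. */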
3403 static llvm::Value *lApplyLoad2(llvm::Value *result, const CoalescedLoadOp &load, const int64_t offsets[4], bool set[4],
3404  llvm::Instruction *insertBefore) {
3405  int elt = 0;
3406  while (elt < 4) {
3407  // First, try to do a 64-bit-wide insert into the result vector.
3408  // We can do this when we're currently at an even element, when the
3409  // current and next element have consecutive values, and where the
3410  // original 64-bit load is at the offset needed by the current
3411  // element.
3412  if ((elt & 1) == 0 && offsets[elt] + 1 == offsets[elt + 1] && offsets[elt] == load.start) {
3413  Debug(SourcePos(),
3414  "Load 2 @ %" PRId64 " matches for elements #%d,%d "
3415  "(values %" PRId64 ",%" PRId64 ")",
3416  load.start, elt, elt + 1, offsets[elt], offsets[elt + 1]);
3417  Assert(set[elt] == false && ((elt < 3) && set[elt + 1] == false));
3418 
3419  // In this case, we bitcast from a 4xi32 to a 2xi64 vector
3420  llvm::Type *vec2x64Type = llvm::VectorType::get(LLVMTypes::Int64Type, 2);
3421  result = new llvm::BitCastInst(result, vec2x64Type, "to2x64", insertBefore);
3422 
3423  // And now we can insert the 64-bit wide value into the
3424  // appropriate element
3425  result = llvm::InsertElementInst::Create(result, load.load, LLVMInt32(elt / 2), "insert64", insertBefore);
3426 
3427  // And back to 4xi32.
3428  llvm::Type *vec4x32Type = llvm::VectorType::get(LLVMTypes::Int32Type, 4);
3429  result = new llvm::BitCastInst(result, vec4x32Type, "to4x32", insertBefore);
3430 
3431  set[elt] = true;
3432  if (elt < 3) {
3433  set[elt + 1] = true;
3434  }
3435  // Advance elt one extra time, since we just took care of two
3436  // elements
3437  ++elt;
3438  } else if (offsets[elt] >= load.start && offsets[elt] < load.start + load.count) {
3439  Debug(SourcePos(),
3440  "Load 2 @ %" PRId64 " matches for element #%d "
3441  "(value %" PRId64 ")",
3442  load.start, elt, offsets[elt]);
3443  // Otherwise, insert one of the 32-bit pieces into an element
3444  // of the final vector
3445  Assert(set[elt] == false);
3446  llvm::Value *toInsert = (offsets[elt] == load.start) ? load.element0 : load.element1;
3447  result = llvm::InsertElementInst::Create(result, toInsert, LLVMInt32(elt), "insert_load", insertBefore);
3448  set[elt] = true;
3449  }
3450  ++elt;
3451  }
3452 
3453  return result;
3454 }
3455 
3456 #if 1
3457 /* This approach works better with AVX, while the #else path generates
3458  slightly better code with SSE. Need to continue to dig into performance
3459  details with this stuff in general... */
3460 
3461 /** And handle a 4-wide load */
3462 static llvm::Value *lApplyLoad4(llvm::Value *result, const CoalescedLoadOp &load, const int64_t offsets[4], bool set[4],
3463  llvm::Instruction *insertBefore) {
3464  // Conceptually, we're going to consider doing a shufflevector with
3465  // the 4-wide load and the 4-wide result we have so far to generate a
3466  // new 4-wide vector. We'll start with shuffle indices that just
3467  // select each element of the result so far for the result.
3468  int32_t shuf[4] = {4, 5, 6, 7};
3469 
3470  for (int elt = 0; elt < 4; ++elt) {
3471  if (offsets[elt] >= load.start && offsets[elt] < load.start + load.count) {
3472  Debug(SourcePos(),
3473  "Load 4 @ %" PRId64 " matches for element #%d "
3474  "(value %" PRId64 ")",
3475  load.start, elt, offsets[elt]);
3476 
3477  // If the current element falls within the range of locations
3478  // that the 4-wide load covers, then compute the appropriate
3479  // shuffle index that extracts the appropriate element from the
3480  // load.
3481  Assert(set[elt] == false);
3482  shuf[elt] = int32_t(offsets[elt] - load.start);
3483  set[elt] = true;
3484  }
3485  }
3486 
3487  // Now, issue a shufflevector instruction if any of the values from the
3488  // load we just considered were applicable.
3489  if (shuf[0] != 4 || shuf[1] != 5 || shuf[2] != 6 || shuf[3] != 7)
3490  result = LLVMShuffleVectors(load.load, result, shuf, 4, insertBefore);
3491 
3492  return result;
3493 }
3494 
3495 /** We need to fill in the values for a 4-wide result vector. This
3496  function looks at all of the generated loads and extracts the
3497  appropriate elements from the appropriate loads to assemble the result.
3498  Here the offsets[] parameter gives the 4 offsets from the base pointer
3499  for the four elements of the result.
3500 */
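/* For example (illustrative): for result offsets <0, 2, 3, 8> covered by a
   4-wide load at offset 0 and a scalar load at offset 8, elements 0-2 of
   the result are pulled out of the 4-wide load via lApplyLoad4() and
   element 3 is filled in from the scalar load via lApplyLoad1(). */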
3501 static llvm::Value *lAssemble4Vector(const std::vector<CoalescedLoadOp> &loadOps, const int64_t offsets[4],
3502  llvm::Instruction *insertBefore) {
3503  llvm::Type *returnType = llvm::VectorType::get(LLVMTypes::Int32Type, 4);
3504  llvm::Value *result = llvm::UndefValue::get(returnType);
3505 
3506  Debug(SourcePos(), "Starting search for loads [%" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 "].", offsets[0],
3507  offsets[1], offsets[2], offsets[3]);
3508 
3509  // Track whether we have found a valid value for each of the four
3510  // elements of the result
3511  bool set[4] = {false, false, false, false};
3512 
3513  // Loop over all of the loads and check each one to see if it provides
3514  // a value that's applicable to the result
3515  for (int load = 0; load < (int)loadOps.size(); ++load) {
3516  const CoalescedLoadOp &li = loadOps[load];
3517 
3518  switch (li.count) {
3519  case 1:
3520  result = lApplyLoad1(result, li, offsets, set, insertBefore);
3521  break;
3522  case 2:
3523  result = lApplyLoad2(result, li, offsets, set, insertBefore);
3524  break;
3525  case 4:
3526  result = lApplyLoad4(result, li, offsets, set, insertBefore);
3527  break;
3528  default:
3529  FATAL("Unexpected load count in lAssemble4Vector()");
3530  }
3531  }
3532 
3533  Debug(SourcePos(), "Done with search for loads [%" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 "].", offsets[0],
3534  offsets[1], offsets[2], offsets[3]);
3535 
3536  for (int i = 0; i < 4; ++i)
3537  Assert(set[i] == true);
3538 
3539  return result;
3540 }
3541 
3542 #else
3543 
3544 static llvm::Value *lApplyLoad4s(llvm::Value *result, const std::vector<CoalescedLoadOp> &loadOps,
3545  const int64_t offsets[4], bool set[4], llvm::Instruction *insertBefore) {
3546  int32_t firstMatchElements[4] = {-1, -1, -1, -1};
3547  const CoalescedLoadOp *firstMatch = NULL;
3548 
3549  Assert(llvm::isa<llvm::UndefValue>(result));
3550 
3551  for (int load = 0; load < (int)loadOps.size(); ++load) {
3552  const CoalescedLoadOp &loadop = loadOps[load];
3553  if (loadop.count != 4)
3554  continue;
3555 
3556  int32_t matchElements[4] = {-1, -1, -1, -1};
3557  bool anyMatched = false;
3558  for (int elt = 0; elt < 4; ++elt) {
3559  if (offsets[elt] >= loadop.start && offsets[elt] < loadop.start + loadop.count) {
3560  Debug(SourcePos(),
3561  "Load 4 @ %" PRId64 " matches for element #%d "
3562  "(value %" PRId64 ")",
3563  loadop.start, elt, offsets[elt]);
3564  anyMatched = true;
3565  Assert(set[elt] == false);
3566  matchElements[elt] = offsets[elt] - loadop.start;
3567  set[elt] = true;
3568  }
3569  }
3570 
3571  if (anyMatched) {
3572  if (llvm::isa<llvm::UndefValue>(result)) {
3573  if (firstMatch == NULL) {
3574  firstMatch = &loadop;
3575  for (int i = 0; i < 4; ++i)
3576  firstMatchElements[i] = matchElements[i];
3577  } else {
3578  int32_t shuffle[4] = {-1, -1, -1, -1};
3579  for (int i = 0; i < 4; ++i) {
3580  if (firstMatchElements[i] != -1)
3581  shuffle[i] = firstMatchElements[i];
3582  else
3583  shuffle[i] = 4 + matchElements[i];
3584  }
3585  result = LLVMShuffleVectors(firstMatch->load, loadop.load, shuffle, 4, insertBefore);
3586  firstMatch = NULL;
3587  }
3588  } else {
3589  int32_t shuffle[4] = {-1, -1, -1, -1};
3590  for (int i = 0; i < 4; ++i) {
3591  if (matchElements[i] != -1)
3592  shuffle[i] = 4 + matchElements[i];
3593  else
3594  shuffle[i] = i;
3595  }
3596  result = LLVMShuffleVectors(result, loadop.load, shuffle, 4, insertBefore);
3597  }
3598  }
3599  }
3600 
3601  if (firstMatch != NULL && llvm::isa<llvm::UndefValue>(result))
3602  return LLVMShuffleVectors(firstMatch->load, result, firstMatchElements, 4, insertBefore);
3603  else
3604  return result;
3605 }
3606 
3607 static llvm::Value *lApplyLoad12s(llvm::Value *result, const std::vector<CoalescedLoadOp> &loadOps,
3608  const int64_t offsets[4], bool set[4], llvm::Instruction *insertBefore) {
3609  // Loop over all of the loads and check each one to see if it provides
3610  // a value that's applicable to the result
3611  for (int load = 0; load < (int)loadOps.size(); ++load) {
3612  const CoalescedLoadOp &loadop = loadOps[load];
3613  Assert(loadop.count == 1 || loadop.count == 2 || loadop.count == 4);
3614 
3615  if (loadop.count == 1)
3616  result = lApplyLoad1(result, loadop, offsets, set, insertBefore);
3617  else if (loadop.count == 2)
3618  result = lApplyLoad2(result, loadop, offsets, set, insertBefore);
3619  }
3620  return result;
3621 }
3622 
3623 /** We need to fill in the values for a 4-wide result vector. This
3624  function looks at all of the generated loads and extracts the
3625  appropriate elements from the appropriate loads to assemble the result.
3626  Here the offsets[] parameter gives the 4 offsets from the base pointer
3627  for the four elements of the result.
3628 */
3629 static llvm::Value *lAssemble4Vector(const std::vector<CoalescedLoadOp> &loadOps, const int64_t offsets[4],
3630  llvm::Instruction *insertBefore) {
3631  llvm::Type *returnType = llvm::VectorType::get(LLVMTypes::Int32Type, 4);
3632  llvm::Value *result = llvm::UndefValue::get(returnType);
3633 
3634  Debug(SourcePos(), "Starting search for loads [%" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 "].", offsets[0],
3635  offsets[1], offsets[2], offsets[3]);
3636 
3637  // Track whether we have found a valid value for each of the four
3638  // elements of the result
3639  bool set[4] = {false, false, false, false};
3640 
3641  result = lApplyLoad4s(result, loadOps, offsets, set, insertBefore);
3642  result = lApplyLoad12s(result, loadOps, offsets, set, insertBefore);
3643 
3644  Debug(SourcePos(), "Done with search for loads [%" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 "].", offsets[0],
3645  offsets[1], offsets[2], offsets[3]);
3646 
3647  for (int i = 0; i < 4; ++i)
3648  Assert(set[i] == true);
3649 
3650  return result;
3651 }
3652 #endif
3653 
3654 /** Given the set of loads that we've done and the set of result values to
3655  be computed, this function computes the final llvm::Value *s for each
3656  result vector.
3657  */
3658 static void lAssembleResultVectors(const std::vector<CoalescedLoadOp> &loadOps,
3659  const std::vector<int64_t> &constOffsets, std::vector<llvm::Value *> &results,
3660  llvm::Instruction *insertBefore) {
3661  // We work on 4-wide chunks of the final values, even when we're
3662  // computing 8-wide or 16-wide vectors. This gives better code from
3663  // LLVM's SSE/AVX code generators.
3664  Assert((constOffsets.size() % 4) == 0);
3665  std::vector<llvm::Value *> vec4s;
3666  for (int i = 0; i < (int)constOffsets.size(); i += 4)
3667  vec4s.push_back(lAssemble4Vector(loadOps, &constOffsets[i], insertBefore));
3668 
3669  // And now concatenate 1, 2, or 4 of the 4-wide vectors computed above
3670  // into 4, 8, or 16-wide final result vectors.
3671  int numGathers = constOffsets.size() / g->target->getVectorWidth();
3672  for (int i = 0; i < numGathers; ++i) {
3673  llvm::Value *result = NULL;
3674  switch (g->target->getVectorWidth()) {
3675  case 4:
3676  result = vec4s[i];
3677  break;
3678  case 8:
3679  result = LLVMConcatVectors(vec4s[2 * i], vec4s[2 * i + 1], insertBefore);
3680  break;
3681  case 16: {
3682  llvm::Value *v1 = LLVMConcatVectors(vec4s[4 * i], vec4s[4 * i + 1], insertBefore);
3683  llvm::Value *v2 = LLVMConcatVectors(vec4s[4 * i + 2], vec4s[4 * i + 3], insertBefore);
3684  result = LLVMConcatVectors(v1, v2, insertBefore);
3685  break;
3686  }
3687  default:
3688  FATAL("Unhandled vector width in lAssembleResultVectors()");
3689  }
3690 
3691  results.push_back(result);
3692  }
3693 }
3694 
3695 /** Given a call to a gather function, extract the base pointer, the 2/4/8
3696  scale, and the first varying offsets value, and use them to compute the
3697  scalar base pointer that is shared by all of the gathers in the group.
3698  (Thus, this base pointer plus the constant offsets term for each gather
3699  gives the set of addresses to use for each gather.)
3700  */
3701 static llvm::Value *lComputeBasePtr(llvm::CallInst *gatherInst, llvm::Instruction *insertBefore) {
3702  llvm::Value *basePtr = gatherInst->getArgOperand(0);
3703  llvm::Value *variableOffsets = gatherInst->getArgOperand(1);
3704  llvm::Value *offsetScale = gatherInst->getArgOperand(2);
3705 
3706  // All of the variable offsets values should be the same, due to
3707  // checking for this in GatherCoalescePass::runOnBasicBlock(). Thus,
3708  // extract the first value and use that as a scalar.
3709  llvm::Value *variable = LLVMExtractFirstVectorElement(variableOffsets);
3710  if (variable->getType() == LLVMTypes::Int64Type)
3711  offsetScale = new llvm::ZExtInst(offsetScale, LLVMTypes::Int64Type, "scale_to64", insertBefore);
3712  llvm::Value *offset =
3713  llvm::BinaryOperator::Create(llvm::Instruction::Mul, variable, offsetScale, "offset", insertBefore);
3714 
3715  return lGEPInst(basePtr, offset, "new_base", insertBefore);
3716 }
3717 
3718 /** Extract the constant offsets (from the common base pointer) from each
3719  of the gathers in a set to be coalesced. These come in as byte
3720  offsets, but we'll transform them into offsets in terms of the size of
3721  the base scalar type being gathered. (e.g. for an i32 gather, we might
3722  have offsets like <0,4,16,20>, which would be transformed to <0,1,4,5>
3723  here.)
3724  */
3725 static void lExtractConstOffsets(const std::vector<llvm::CallInst *> &coalesceGroup, int elementSize,
3726  std::vector<int64_t> *constOffsets) {
3727  int width = g->target->getVectorWidth();
3728  *constOffsets = std::vector<int64_t>(coalesceGroup.size() * width, 0);
3729 
3730  int64_t *endPtr = &((*constOffsets)[0]);
3731  for (int i = 0; i < (int)coalesceGroup.size(); ++i, endPtr += width) {
3732  llvm::Value *offsets = coalesceGroup[i]->getArgOperand(3);
3733  int nElts;
3734  bool ok = LLVMExtractVectorInts(offsets, endPtr, &nElts);
3735  Assert(ok && nElts == width);
3736  }
3737 
3738  for (int i = 0; i < (int)constOffsets->size(); ++i)
3739  (*constOffsets)[i] /= elementSize;
3740 }
3741 
3742 /** Actually do the coalescing. We have a set of gathers all accessing
3743  addresses of the form:
3744 
3745  (ptr + {1,2,4,8} * varyingOffset) + constOffset, a.k.a.
3746  basePtr + constOffset
3747 
3748  where varyingOffset actually has the same value across all of the SIMD
3749  lanes and where the part in parentheses has the same value for all of
3750  the gathers in the group.
3751  */
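/* Illustrative sketch of the steps below: for a group of two 8-wide i32
   gathers, constOffsets ends up holding 16 element offsets (8 per gather);
   lSelectLoads() chooses a set of loads covering all of them,
   lEmitLoads() emits those loads, and the assembled 4-wide chunks are
   concatenated back into two 8-wide result vectors that replace the two
   original gather calls. */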
3752 static bool lCoalesceGathers(const std::vector<llvm::CallInst *> &coalesceGroup) {
3753  llvm::Instruction *insertBefore = coalesceGroup[0];
3754 
3755  // First, compute the shared base pointer for all of the gathers
3756  llvm::Value *basePtr = lComputeBasePtr(coalesceGroup[0], insertBefore);
3757 
3758  int elementSize = 0;
3759  if (coalesceGroup[0]->getType() == LLVMTypes::Int32VectorType ||
3760  coalesceGroup[0]->getType() == LLVMTypes::FloatVectorType)
3761  elementSize = 4;
3762  else if (coalesceGroup[0]->getType() == LLVMTypes::Int64VectorType ||
3763  coalesceGroup[0]->getType() == LLVMTypes::DoubleVectorType)
3764  elementSize = 8;
3765  else
3766  FATAL("Unexpected gather type in lCoalesceGathers");
3767 
3768  // Extract the constant offsets from the gathers into the constOffsets
3769  // vector: the first vectorWidth elements will be those for the first
3770  // gather, the next vectorWidth those for the next gather, and so
3771  // forth.
3772  std::vector<int64_t> constOffsets;
3773  lExtractConstOffsets(coalesceGroup, elementSize, &constOffsets);
3774 
3775  // Determine a set of loads to perform to get all of the values we need
3776  // loaded.
3777  std::vector<CoalescedLoadOp> loadOps;
3778  lSelectLoads(constOffsets, &loadOps);
3779 
3780  lCoalescePerfInfo(coalesceGroup, loadOps);
3781 
3782  // Actually emit load instructions for them
3783  lEmitLoads(basePtr, loadOps, elementSize, insertBefore);
3784 
3785  // Now, for any loads that give us <8 x i32> vectors, split their
3786  // values into two <4 x i32> vectors; it turns out that LLVM gives us
3787  // better code on AVX when we assemble the pieces from 4-wide vectors.
3788  loadOps = lSplit8WideLoads(loadOps, insertBefore);
3789 
3790  // Given all of these chunks of values, shuffle together a vector that
3791  // gives us each result value; the i'th element of results[] gives the
3792  // result for the i'th gather in coalesceGroup.
3793  std::vector<llvm::Value *> results;
3794  lAssembleResultVectors(loadOps, constOffsets, results, insertBefore);
3795 
3796  // Finally, replace each of the original gathers with the instruction
3797  // that gives the value from the coalescing process.
3798  Assert(results.size() == coalesceGroup.size());
3799  for (int i = 0; i < (int)results.size(); ++i) {
3800  llvm::Instruction *ir = llvm::dyn_cast<llvm::Instruction>(results[i]);
3801  Assert(ir != NULL);
3802 
3803  llvm::Type *origType = coalesceGroup[i]->getType();
3804  if (origType != ir->getType())
3805  ir = new llvm::BitCastInst(ir, origType, ir->getName(), coalesceGroup[i]);
3806 
3807  // Previously, all of the instructions to compute the final result
3808  // were inserted into the basic block here; now we remove the very last
3809  // one of them (the one that holds the final result) from the basic block.
3810  // This way, the following ReplaceInstWithInst() call will operate
3811  // successfully. (It expects that the second argument not be in any
3812  // basic block.)
3813  ir->removeFromParent();
3814 
3815  llvm::ReplaceInstWithInst(coalesceGroup[i], ir);
3816  }
3817 
3818  return true;
3819 }
3820 
3821 /** Given an instruction, returns true if the instruction may write to
3822  memory. This is a conservative test in that it may return true for
3823  some instructions that don't actually end up writing to memory, but
3824  should never return false for an instruction that does write to
3825  memory. */
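/* For example: a store, atomicrmw, or cmpxchg instruction is always
   treated as a potential write, as is a call to an unknown function or to
   a function without a readonly/readnone-style attribute; calls to
   functions known not to write memory are not. */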
3826 static bool lInstructionMayWriteToMemory(llvm::Instruction *inst) {
3827  if (llvm::isa<llvm::StoreInst>(inst) || llvm::isa<llvm::AtomicRMWInst>(inst) ||
3828  llvm::isa<llvm::AtomicCmpXchgInst>(inst))
3829  // FIXME: we could be less conservative and try to allow stores if
3830  // we are sure that the pointers don't overlap..
3831  return true;
3832 
3833  // Otherwise, any call instruction that doesn't have an attribute
3834  // indicating it won't write to memory has to be treated as a potential
3835  // store.
3836  llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst);
3837  if (ci != NULL) {
3838  llvm::Function *calledFunc = ci->getCalledFunction();
3839  if (calledFunc == NULL)
3840  return true;
3841 
3842  if (calledFunc->onlyReadsMemory() || calledFunc->doesNotAccessMemory())
3843  return false;
3844  return true;
3845  }
3846 
3847  return false;
3848 }
3849 
3850 bool GatherCoalescePass::runOnBasicBlock(llvm::BasicBlock &bb) {
3851  DEBUG_START_PASS("GatherCoalescePass");
3852 
3853  llvm::Function *gatherFuncs[] = {
3854  m->module->getFunction("__pseudo_gather_factored_base_offsets32_i32"),
3855  m->module->getFunction("__pseudo_gather_factored_base_offsets32_float"),
3856  m->module->getFunction("__pseudo_gather_factored_base_offsets64_i32"),
3857  m->module->getFunction("__pseudo_gather_factored_base_offsets64_float"),
3858  };
3859  int nGatherFuncs = sizeof(gatherFuncs) / sizeof(gatherFuncs[0]);
3860 
3861  bool modifiedAny = false;
3862 
3863 restart:
3864  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
3865  // Iterate over all of the instructions and look for calls to
3866  // __pseudo_gather_factored_base_offsets{32,64}_{i32,float} functions.
3867  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
3868  if (callInst == NULL)
3869  continue;
3870 
3871  llvm::Function *calledFunc = callInst->getCalledFunction();
3872  if (calledFunc == NULL)
3873  continue;
3874 
3875  int i;
3876  for (i = 0; i < nGatherFuncs; ++i)
3877  if (gatherFuncs[i] != NULL && calledFunc == gatherFuncs[i])
3878  break;
3879  if (i == nGatherFuncs)
3880  // Doesn't match any of the types of gathers we care about
3881  continue;
3882 
3883  SourcePos pos;
3884  lGetSourcePosFromMetadata(callInst, &pos);
3885  Debug(pos, "Checking for coalescable gathers starting here...");
3886 
3887  llvm::Value *base = callInst->getArgOperand(0);
3888  llvm::Value *variableOffsets = callInst->getArgOperand(1);
3889  llvm::Value *offsetScale = callInst->getArgOperand(2);
3890  llvm::Value *mask = callInst->getArgOperand(4);
3891 
3892  // To apply this optimization, we need a set of one or more gathers
3893  // that fulfill the following conditions:
3894  //
3895  // - Mask all on
3896  // - The variable offsets all have the same value (i.e., they are
3897  // uniform).
3898  // - Same base pointer, variable offsets, and offset scale (for
3899  // more than one gather)
3900  //
3901  // Then and only then do we have a common base pointer with all
3902  // offsets from it being constants (in which case we can potentially
3903  // coalesce).
3904  if (lGetMaskStatus(mask) != ALL_ON)
3905  continue;
3906 
3907  if (!LLVMVectorValuesAllEqual(variableOffsets))
3908  continue;
3909 
3910  // coalesceGroup stores the set of gathers that we're going to try to
3911  // coalesce over
3912  std::vector<llvm::CallInst *> coalesceGroup;
3913  coalesceGroup.push_back(callInst);
3914 
3915  // Start iterating at the instruction after the initial gather;
3916  // look at the remainder of instructions in the basic block (up
3917  // until we reach a write to memory) to try to find any other
3918  // gathers that can coalesce with this one.
3919  llvm::BasicBlock::iterator fwdIter = iter;
3920  ++fwdIter;
3921  for (; fwdIter != bb.end(); ++fwdIter) {
3922  // Must stop once we come to an instruction that may write to
3923  // memory; otherwise we could end up moving a read before this
3924  // write.
3925  if (lInstructionMayWriteToMemory(&*fwdIter))
3926  break;
3927 
3928  llvm::CallInst *fwdCall = llvm::dyn_cast<llvm::CallInst>(&*fwdIter);
3929  if (fwdCall == NULL || fwdCall->getCalledFunction() != calledFunc)
3930  continue;
3931 
3932  SourcePos fwdPos;
3933  // TODO: need to redesign metadata attached to pseudo calls,
3934  // LLVM drops metadata frequently and it results in bad diagnostics.
3935  lGetSourcePosFromMetadata(fwdCall, &fwdPos);
3936 
3937 #ifndef ISPC_NO_DUMPS
3938  if (g->debugPrint) {
3939  if (base != fwdCall->getArgOperand(0)) {
3940  Debug(fwdPos, "base pointers mismatch");
3941  LLVMDumpValue(base);
3942  LLVMDumpValue(fwdCall->getArgOperand(0));
3943  }
3944  if (variableOffsets != fwdCall->getArgOperand(1)) {
3945  Debug(fwdPos, "varying offsets mismatch");
3946  LLVMDumpValue(variableOffsets);
3947  LLVMDumpValue(fwdCall->getArgOperand(1));
3948  }
3949  if (offsetScale != fwdCall->getArgOperand(2)) {
3950  Debug(fwdPos, "offset scales mismatch");
3951  LLVMDumpValue(offsetScale);
3952  LLVMDumpValue(fwdCall->getArgOperand(2));
3953  }
3954  if (mask != fwdCall->getArgOperand(4)) {
3955  Debug(fwdPos, "masks mismatch");
3956  LLVMDumpValue(mask);
3957  LLVMDumpValue(fwdCall->getArgOperand(4));
3958  }
3959  }
3960 #endif
3961 
3962  if (base == fwdCall->getArgOperand(0) && variableOffsets == fwdCall->getArgOperand(1) &&
3963  offsetScale == fwdCall->getArgOperand(2) && mask == fwdCall->getArgOperand(4)) {
3964  Debug(fwdPos, "This gather can be coalesced.");
3965  coalesceGroup.push_back(fwdCall);
3966 
3967  if (coalesceGroup.size() == 4)
3968  // FIXME: untested heuristic: don't try to coalesce
3969  // over a window of more than 4 gathers, so that we
3970  // don't cause too much register pressure and end up
3971  // spilling to memory anyway.
3972  break;
3973  } else
3974  Debug(fwdPos, "This gather doesn't match the initial one.");
3975  }
3976 
3977  Debug(pos, "Done with checking for matching gathers");
3978 
3979  // Now that we have a group of gathers, see if we can coalesce them
3980  // into something more efficient than the original set of gathers.
3981  if (lCoalesceGathers(coalesceGroup)) {
3982  modifiedAny = true;
3983  goto restart;
3984  }
3985  }
3986 
3987  DEBUG_END_PASS("GatherCoalescePass");
3988 
3989  return modifiedAny;
3990 }
3991 
3992 static llvm::Pass *CreateGatherCoalescePass() { return new GatherCoalescePass; }
3993 
3994 ///////////////////////////////////////////////////////////////////////////
3995 // ReplacePseudoMemoryOpsPass
3996 
3997 /** For any gathers and scatters remaining after lGSToLoadStore()
3998  runs, we need to turn them into actual native gathers and scatters.
3999  This task is handled by the ReplacePseudoMemoryOpsPass here.
4000  */
4001 class ReplacePseudoMemoryOpsPass : public llvm::BasicBlockPass {
4002  public:
4003  static char ID;
4004  ReplacePseudoMemoryOpsPass() : BasicBlockPass(ID) {}
4005 
4006 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
4007  const char *getPassName() const { return "Replace Pseudo Memory Ops"; }
4008 #else // LLVM 4.0+
4009  llvm::StringRef getPassName() const { return "Replace Pseudo Memory Ops"; }
4010 #endif
4011  bool runOnBasicBlock(llvm::BasicBlock &BB);
4012 };
4013 
4014 char ReplacePseudoMemoryOpsPass::ID = 0;
4015 
 4016 /** This routine attempts to determine whether the given lvalue pointer is
 4017  pointing to stack-allocated memory. It's conservative in that it
4018  should never return true for non-stack allocated memory, but may return
4019  false for memory that actually is stack allocated. The basic strategy
4020  is to traverse through the operands and see if the pointer originally
4021  comes from an AllocaInst.
4022 */
4023 static bool lIsSafeToBlend(llvm::Value *lvalue) {
4024  llvm::BitCastInst *bc = llvm::dyn_cast<llvm::BitCastInst>(lvalue);
4025  if (bc != NULL)
4026  return lIsSafeToBlend(bc->getOperand(0));
4027  else {
4028  llvm::AllocaInst *ai = llvm::dyn_cast<llvm::AllocaInst>(lvalue);
4029  if (ai) {
4030  llvm::Type *type = ai->getType();
4031  llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(type);
4032  assert(pt != NULL);
4033  type = pt->getElementType();
4034  llvm::ArrayType *at;
4035  while ((at = llvm::dyn_cast<llvm::ArrayType>(type))) {
4036  type = at->getElementType();
4037  }
4038  llvm::VectorType *vt = llvm::dyn_cast<llvm::VectorType>(type);
4039  return (vt != NULL && (int)vt->getNumElements() == g->target->getVectorWidth());
4040  } else {
4041  llvm::GetElementPtrInst *gep = llvm::dyn_cast<llvm::GetElementPtrInst>(lvalue);
4042  if (gep != NULL)
4043  return lIsSafeToBlend(gep->getOperand(0));
4044  else
4045  return false;
4046  }
4047  }
4048 }
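// Editor's note (illustrative, not part of opt.cpp): lIsSafeToBlend() returns true for an
// lvalue whose pointer provably originates in an alloca of a target-width vector (possibly
// wrapped in arrays and reached through bitcast/GEP chains), e.g. for an assumed 8-wide target:
//
//     %buf = alloca [4 x <8 x float>]        ; stack storage of full-width vectors
//     %p   = getelementptr ... %buf ...      ; still rooted in the alloca
//     %q   = bitcast ... %p ...              ; still rooted in the alloca
//
// It returns false for pointers coming from loads, function arguments, or anything else whose
// provenance is unknown, since a load+blend+store sequence would read and rewrite memory that
// inactive program instances must not touch.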
4049 
4050 static bool lReplacePseudoMaskedStore(llvm::CallInst *callInst) {
4051  struct LMSInfo {
4052  LMSInfo(const char *pname, const char *bname, const char *msname) {
4053  pseudoFunc = m->module->getFunction(pname);
4054  blendFunc = m->module->getFunction(bname);
4055  maskedStoreFunc = m->module->getFunction(msname);
4056  Assert(pseudoFunc != NULL && blendFunc != NULL && maskedStoreFunc != NULL);
4057  }
4058  llvm::Function *pseudoFunc;
4059  llvm::Function *blendFunc;
4060  llvm::Function *maskedStoreFunc;
4061  };
4062 
4063  LMSInfo msInfo[] = {
4064  LMSInfo("__pseudo_masked_store_i8", "__masked_store_blend_i8", "__masked_store_i8"),
4065  LMSInfo("__pseudo_masked_store_i16", "__masked_store_blend_i16", "__masked_store_i16"),
4066  LMSInfo("__pseudo_masked_store_i32", "__masked_store_blend_i32", "__masked_store_i32"),
4067  LMSInfo("__pseudo_masked_store_float", "__masked_store_blend_float", "__masked_store_float"),
4068  LMSInfo("__pseudo_masked_store_i64", "__masked_store_blend_i64", "__masked_store_i64"),
4069  LMSInfo("__pseudo_masked_store_double", "__masked_store_blend_double", "__masked_store_double")};
4070 
4071  LMSInfo *info = NULL;
4072  for (unsigned int i = 0; i < sizeof(msInfo) / sizeof(msInfo[0]); ++i) {
4073  if (msInfo[i].pseudoFunc != NULL && callInst->getCalledFunction() == msInfo[i].pseudoFunc) {
4074  info = &msInfo[i];
4075  break;
4076  }
4077  }
4078  if (info == NULL)
4079  return false;
4080 
4081  llvm::Value *lvalue = callInst->getArgOperand(0);
4082  llvm::Value *rvalue = callInst->getArgOperand(1);
4083  llvm::Value *mask = callInst->getArgOperand(2);
4084 
 4085  // We need to choose between doing the load + blend + store trick
 4086  // or serializing the masked store. Even on targets with a native
 4087  // masked store instruction, the blend approach is preferable since it
 4088  // lets us keep values in registers rather than going out to the stack.
4089  bool doBlend = (!g->opt.disableBlendedMaskedStores && lIsSafeToBlend(lvalue));
4090 
4091  // Generate the call to the appropriate masked store function and
4092  // replace the __pseudo_* one with it.
4093  llvm::Function *fms = doBlend ? info->blendFunc : info->maskedStoreFunc;
4094  llvm::Instruction *inst = lCallInst(fms, lvalue, rvalue, mask, "", callInst);
4095  lCopyMetadata(inst, callInst);
4096 
4097  callInst->eraseFromParent();
4098  return true;
4099 }
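// Editor's sketch (standalone, not part of opt.cpp): a scalar emulation of the two lowering
// strategies chosen above, for a hypothetical 4-wide vector. The blend form reads and rewrites
// every lane of the destination, which is why it is only used when lIsSafeToBlend() proves the
// destination is private stack memory.

#include <cstdint>
#include <cstdio>

int main() {
    int32_t dst[4] = {10, 20, 30, 40};
    const int32_t src[4] = {1, 2, 3, 4};
    const bool mask[4] = {true, false, true, false};

    // Blend form: load the whole destination, merge per lane, store the whole thing back.
    int32_t blended[4];
    for (int i = 0; i < 4; ++i)
        blended[i] = mask[i] ? src[i] : dst[i];
    for (int i = 0; i < 4; ++i)
        dst[i] = blended[i];

    // Serialized form (what a __masked_store_* call does conceptually): touch only active lanes.
    // for (int i = 0; i < 4; ++i) if (mask[i]) dst[i] = src[i];

    printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]); // prints: 1 20 3 40
    return 0;
}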
4100 
4101 static bool lReplacePseudoGS(llvm::CallInst *callInst) {
4102  struct LowerGSInfo {
4103  LowerGSInfo(const char *pName, const char *aName, bool ig, bool ip) : isGather(ig), isPrefetch(ip) {
4104  pseudoFunc = m->module->getFunction(pName);
4105  actualFunc = m->module->getFunction(aName);
4106  }
4107  llvm::Function *pseudoFunc;
4108  llvm::Function *actualFunc;
4109  const bool isGather;
4110  const bool isPrefetch;
4111  };
4112 
4113  LowerGSInfo lgsInfo[] = {
4114  LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true, false),
4115  LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true, false),
4116  LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true, false),
4117  LowerGSInfo("__pseudo_gather32_float", "__gather32_float", true, false),
4118  LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true, false),
4119  LowerGSInfo("__pseudo_gather32_double", "__gather32_double", true, false),
4120 
4121  LowerGSInfo("__pseudo_gather64_i8", "__gather64_i8", true, false),
4122  LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true, false),
4123  LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true, false),
4124  LowerGSInfo("__pseudo_gather64_float", "__gather64_float", true, false),
4125  LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true, false),
4126  LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true, false),
4127 
4128  LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8", "__gather_factored_base_offsets32_i8", true, false),
4129  LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16", "__gather_factored_base_offsets32_i16", true, false),
4130  LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32", "__gather_factored_base_offsets32_i32", true, false),
4131  LowerGSInfo("__pseudo_gather_factored_base_offsets32_float", "__gather_factored_base_offsets32_float", true,
4132  false),
4133  LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64", "__gather_factored_base_offsets32_i64", true, false),
4134  LowerGSInfo("__pseudo_gather_factored_base_offsets32_double", "__gather_factored_base_offsets32_double", true,
4135  false),
4136 
4137  LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8", "__gather_factored_base_offsets64_i8", true, false),
4138  LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16", "__gather_factored_base_offsets64_i16", true, false),
4139  LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32", "__gather_factored_base_offsets64_i32", true, false),
4140  LowerGSInfo("__pseudo_gather_factored_base_offsets64_float", "__gather_factored_base_offsets64_float", true,
4141  false),
4142  LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64", "__gather_factored_base_offsets64_i64", true, false),
4143  LowerGSInfo("__pseudo_gather_factored_base_offsets64_double", "__gather_factored_base_offsets64_double", true,
4144  false),
4145 
4146  LowerGSInfo("__pseudo_gather_base_offsets32_i8", "__gather_base_offsets32_i8", true, false),
4147  LowerGSInfo("__pseudo_gather_base_offsets32_i16", "__gather_base_offsets32_i16", true, false),
4148  LowerGSInfo("__pseudo_gather_base_offsets32_i32", "__gather_base_offsets32_i32", true, false),
4149  LowerGSInfo("__pseudo_gather_base_offsets32_float", "__gather_base_offsets32_float", true, false),
4150  LowerGSInfo("__pseudo_gather_base_offsets32_i64", "__gather_base_offsets32_i64", true, false),
4151  LowerGSInfo("__pseudo_gather_base_offsets32_double", "__gather_base_offsets32_double", true, false),
4152 
4153  LowerGSInfo("__pseudo_gather_base_offsets64_i8", "__gather_base_offsets64_i8", true, false),
4154  LowerGSInfo("__pseudo_gather_base_offsets64_i16", "__gather_base_offsets64_i16", true, false),
4155  LowerGSInfo("__pseudo_gather_base_offsets64_i32", "__gather_base_offsets64_i32", true, false),
4156  LowerGSInfo("__pseudo_gather_base_offsets64_float", "__gather_base_offsets64_float", true, false),
4157  LowerGSInfo("__pseudo_gather_base_offsets64_i64", "__gather_base_offsets64_i64", true, false),
4158  LowerGSInfo("__pseudo_gather_base_offsets64_double", "__gather_base_offsets64_double", true, false),
4159 
4160  LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false, false),
4161  LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false, false),
4162  LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false, false),
4163  LowerGSInfo("__pseudo_scatter32_float", "__scatter32_float", false, false),
4164  LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false, false),
4165  LowerGSInfo("__pseudo_scatter32_double", "__scatter32_double", false, false),
4166 
4167  LowerGSInfo("__pseudo_scatter64_i8", "__scatter64_i8", false, false),
4168  LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false, false),
4169  LowerGSInfo("__pseudo_scatter64_i32", "__scatter64_i32", false, false),
4170  LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false, false),
4171  LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false, false),
4172  LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false, false),
4173 
4174  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8", "__scatter_factored_base_offsets32_i8", false,
4175  false),
4176  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16", "__scatter_factored_base_offsets32_i16", false,
4177  false),
4178  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32", "__scatter_factored_base_offsets32_i32", false,
4179  false),
4180  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float", "__scatter_factored_base_offsets32_float", false,
4181  false),
4182  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64", "__scatter_factored_base_offsets32_i64", false,
4183  false),
4184  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double", "__scatter_factored_base_offsets32_double",
4185  false, false),
4186 
4187  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8", "__scatter_factored_base_offsets64_i8", false,
4188  false),
4189  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16", "__scatter_factored_base_offsets64_i16", false,
4190  false),
4191  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32", "__scatter_factored_base_offsets64_i32", false,
4192  false),
4193  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float", "__scatter_factored_base_offsets64_float", false,
4194  false),
4195  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64", "__scatter_factored_base_offsets64_i64", false,
4196  false),
4197  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double", "__scatter_factored_base_offsets64_double",
4198  false, false),
4199 
4200  LowerGSInfo("__pseudo_scatter_base_offsets32_i8", "__scatter_base_offsets32_i8", false, false),
4201  LowerGSInfo("__pseudo_scatter_base_offsets32_i16", "__scatter_base_offsets32_i16", false, false),
4202  LowerGSInfo("__pseudo_scatter_base_offsets32_i32", "__scatter_base_offsets32_i32", false, false),
4203  LowerGSInfo("__pseudo_scatter_base_offsets32_float", "__scatter_base_offsets32_float", false, false),
4204  LowerGSInfo("__pseudo_scatter_base_offsets32_i64", "__scatter_base_offsets32_i64", false, false),
4205  LowerGSInfo("__pseudo_scatter_base_offsets32_double", "__scatter_base_offsets32_double", false, false),
4206 
4207  LowerGSInfo("__pseudo_scatter_base_offsets64_i8", "__scatter_base_offsets64_i8", false, false),
4208  LowerGSInfo("__pseudo_scatter_base_offsets64_i16", "__scatter_base_offsets64_i16", false, false),
4209  LowerGSInfo("__pseudo_scatter_base_offsets64_i32", "__scatter_base_offsets64_i32", false, false),
4210  LowerGSInfo("__pseudo_scatter_base_offsets64_float", "__scatter_base_offsets64_float", false, false),
4211  LowerGSInfo("__pseudo_scatter_base_offsets64_i64", "__scatter_base_offsets64_i64", false, false),
4212  LowerGSInfo("__pseudo_scatter_base_offsets64_double", "__scatter_base_offsets64_double", false, false),
4213 
4214  LowerGSInfo("__pseudo_prefetch_read_varying_1", "__prefetch_read_varying_1", false, true),
4215  LowerGSInfo("__pseudo_prefetch_read_varying_1_native", "__prefetch_read_varying_1_native", false, true),
4216 
4217  LowerGSInfo("__pseudo_prefetch_read_varying_2", "__prefetch_read_varying_2", false, true),
4218  LowerGSInfo("__pseudo_prefetch_read_varying_2_native", "__prefetch_read_varying_2_native", false, true),
4219 
4220  LowerGSInfo("__pseudo_prefetch_read_varying_3", "__prefetch_read_varying_3", false, true),
4221  LowerGSInfo("__pseudo_prefetch_read_varying_3_native", "__prefetch_read_varying_3_native", false, true),
4222 
4223  LowerGSInfo("__pseudo_prefetch_read_varying_nt", "__prefetch_read_varying_nt", false, true),
4224  LowerGSInfo("__pseudo_prefetch_read_varying_nt_native", "__prefetch_read_varying_nt_native", false, true),
4225  };
4226 
4227  llvm::Function *calledFunc = callInst->getCalledFunction();
4228 
4229  LowerGSInfo *info = NULL;
4230  for (unsigned int i = 0; i < sizeof(lgsInfo) / sizeof(lgsInfo[0]); ++i) {
4231  if (lgsInfo[i].pseudoFunc != NULL && calledFunc == lgsInfo[i].pseudoFunc) {
4232  info = &lgsInfo[i];
4233  break;
4234  }
4235  }
4236  if (info == NULL)
4237  return false;
4238 
4239  Assert(info->actualFunc != NULL);
4240 
4241  // Get the source position from the metadata attached to the call
4242  // instruction so that we can issue PerformanceWarning()s below.
4243  SourcePos pos;
4244  bool gotPosition = lGetSourcePosFromMetadata(callInst, &pos);
4245 
4246  callInst->setCalledFunction(info->actualFunc);
4247  if (gotPosition && (g->target->getVectorWidth() > 1) && (g->opt.level > 0)) {
4248  if (info->isGather)
4249  PerformanceWarning(pos, "Gather required to load value.");
4250  else if (!info->isPrefetch)
4251  PerformanceWarning(pos, "Scatter required to store value.");
4252  }
4253  return true;
4254 }
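// Editor's note (not part of opt.cpp): the rewrite above is a pure retargeting of the call. For
// example, a remaining call to __pseudo_gather32_float simply has its callee switched to
// __gather32_float; the arguments are left untouched, and the SourcePos recovered from the
// call's metadata is used to issue the "Gather required to load value." performance warning at
// the original ispc source location.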
4255 
 4256 bool ReplacePseudoMemoryOpsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
 4257  DEBUG_START_PASS("ReplacePseudoMemoryOpsPass");
4258 
4259  bool modifiedAny = false;
4260 
4261 restart:
4262  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
4263  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
4264  if (callInst == NULL || callInst->getCalledFunction() == NULL)
4265  continue;
4266 
4267  if (lReplacePseudoGS(callInst)) {
4268  modifiedAny = true;
4269  goto restart;
4270  } else if (lReplacePseudoMaskedStore(callInst)) {
4271  modifiedAny = true;
4272  goto restart;
4273  }
4274  }
4275 
4276  DEBUG_END_PASS("ReplacePseudoMemoryOpsPass");
4277 
4278  return modifiedAny;
4279 }
4280 
 4281 static llvm::Pass *CreateReplacePseudoMemoryOpsPass() { return new ReplacePseudoMemoryOpsPass; }
 4282 
4283 ///////////////////////////////////////////////////////////////////////////
4284 // IsCompileTimeConstantPass
4285 
 4286 /** LLVM IR implementations of target-specific functions may include calls
 4287  to the functions "bool __is_compile_time_constant_*(...)"; these allow
 4288  them to have specialized code paths for cases where the corresponding
 4289  value is known at compile time. For masks, for example, this lets them
 4290  avoid the cost of a MOVMSK call to compute the mask's value at runtime
 4291  when that value is already known at compile time.
4292 
4293  This pass resolves these calls into either 'true' or 'false' values so
4294  that later optimization passes can operate with these as constants.
4295 
4296  See stdlib.m4 for a number of uses of this idiom.
4297  */
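// Editor's sketch (illustrative pseudo-builtin code, paraphrased rather than quoted from
// stdlib.m4): the idiom this pass resolves looks roughly like
//
//     if (__is_compile_time_constant_mask(mask)) {
//         // Reached only when this pass has replaced the call with 'true', i.e. the mask is a
//         // compile-time constant; the test below then folds away.
//         if (__all(mask))
//             /* unmasked fast path */;
//         else
//             /* constant-mask path */;
//     } else {
//         /* general path, kept when the call is finally resolved to 'false' */
//     }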
4298 
4299 class IsCompileTimeConstantPass : public llvm::BasicBlockPass {
4300  public:
4301  static char ID;
4302  IsCompileTimeConstantPass(bool last = false) : BasicBlockPass(ID) { isLastTry = last; }
4303 
4304 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
4305  const char *getPassName() const { return "Resolve \"is compile time constant\""; }
4306 #else // LLVM 4.0+
4307  llvm::StringRef getPassName() const { return "Resolve \"is compile time constant\""; }
4308 #endif
4309  bool runOnBasicBlock(llvm::BasicBlock &BB);
4310 
 4311  bool isLastTry;
 4312 };
4313 
 4314 char IsCompileTimeConstantPass::ID = 0;
 4315 
 4316 bool IsCompileTimeConstantPass::runOnBasicBlock(llvm::BasicBlock &bb) {
 4317  DEBUG_START_PASS("IsCompileTimeConstantPass");
4318 
4319  llvm::Function *funcs[] = {m->module->getFunction("__is_compile_time_constant_mask"),
4320  m->module->getFunction("__is_compile_time_constant_uniform_int32"),
4321  m->module->getFunction("__is_compile_time_constant_varying_int32")};
4322 
4323  bool modifiedAny = false;
4324 restart:
4325  for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
4326  // Iterate through the instructions looking for calls to the
4327  // __is_compile_time_constant_*() functions
4328  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
4329  if (callInst == NULL)
4330  continue;
4331 
4332  int j;
4333  int nFuncs = sizeof(funcs) / sizeof(funcs[0]);
4334  for (j = 0; j < nFuncs; ++j) {
4335  if (funcs[j] != NULL && callInst->getCalledFunction() == funcs[j])
4336  break;
4337  }
4338  if (j == nFuncs)
4339  // not a __is_compile_time_constant_* function
4340  continue;
4341 
4342  // This optimization pass can be disabled with both the (poorly
4343  // named) disableGatherScatterFlattening option and
4344  // disableMaskAllOnOptimizations.
 4345  if (g->opt.disableGatherScatterFlattening || g->opt.disableMaskAllOnOptimizations) {
 4346  llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMFalse);
4347  modifiedAny = true;
4348  goto restart;
4349  }
4350 
4351  // Is it a constant? Bingo, turn the call's value into a constant
4352  // true value.
4353  llvm::Value *operand = callInst->getArgOperand(0);
4354  if (llvm::isa<llvm::Constant>(operand)) {
4355  llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMTrue);
4356  modifiedAny = true;
4357  goto restart;
4358  }
4359 
4360  // This pass runs multiple times during optimization. Up until the
4361  // very last time, it only replaces the call with a 'true' if the
4362  // value is known to be constant and otherwise leaves the call
4363  // alone, in case further optimization passes can help resolve its
4364  // value. The last time through, it eventually has to give up, and
4365  // replaces any remaining ones with 'false' constants.
4366  if (isLastTry) {
4367  llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMFalse);
4368  modifiedAny = true;
4369  goto restart;
4370  }
4371  }
4372 
4373  DEBUG_END_PASS("IsCompileTimeConstantPass");
4374 
4375  return modifiedAny;
4376 }
4377 
4378 static llvm::Pass *CreateIsCompileTimeConstantPass(bool isLastTry) { return new IsCompileTimeConstantPass(isLastTry); }
4379 
4380 //////////////////////////////////////////////////////////////////////////
4381 // DebugPass
4382 
 4383 /** This pass is added to the list of passes after optimizations that
 4384  we want to debug; it prints a dump of the LLVM IR to stderr, along
 4385  with the name and number of the preceding optimization pass.
 4386  */
4387 #ifndef ISPC_NO_DUMPS
4388 class DebugPass : public llvm::ModulePass {
4389  public:
4390  static char ID;
4391  DebugPass(char *output) : ModulePass(ID) { snprintf(str_output, sizeof(str_output), "%s", output); }
4392 
4393 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
4394  const char *getPassName() const { return "Dump LLVM IR"; }
4395 #else // LLVM 4.0+
4396  llvm::StringRef getPassName() const { return "Dump LLVM IR"; }
4397 #endif
4398  bool runOnModule(llvm::Module &m);
4399 
4400  private:
4401  char str_output[100];
4402 };
4403 
4404 char DebugPass::ID = 0;
4405 
4406 bool DebugPass::runOnModule(llvm::Module &module) {
4407  fprintf(stderr, "%s", str_output);
4408  fflush(stderr);
4409  module.dump();
4410  return true;
4411 }
4412 
4413 static llvm::Pass *CreateDebugPass(char *output) { return new DebugPass(output); }
4414 #endif
4415 
4416 //////////////////////////////////////////////////////////////////////////
4417 // DebugPassFile
4418 
 4419 /** This pass is added to the list of passes after optimizations that
 4420  we want to debug; it prints a dump of the LLVM IR to a file.
 4421  */
4422 #ifndef ISPC_NO_DUMPS
4423 class DebugPassFile : public llvm::ModulePass {
4424  public:
4425  static char ID;
4426  DebugPassFile(int number, llvm::StringRef name) : ModulePass(ID), pnum(number), pname(name) {}
4427 
4428 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
4429  const char *getPassName() const { return "Dump LLVM IR"; }
4430 #else // LLVM 4.0+
4431  llvm::StringRef getPassName() const { return "Dump LLVM IR"; }
4432 #endif
4433  bool runOnModule(llvm::Module &m);
4434  bool doInitialization(llvm::Module &m);
4435 
4436  private:
4437  void run(llvm::Module &m, bool init);
4438  int pnum;
4439  llvm::StringRef pname;
4440 };
4441 
4442 char DebugPassFile::ID = 0;
4443 
4444 /**
 4445  * Strips all non-alphanumeric characters from the given string.
4446  */
4447 std::string sanitize(std::string in) {
4448  llvm::Regex r("[^[:alnum:]]");
4449  while (r.match(in))
4450  in = r.sub("", in);
4451  return in;
4452 }
4453 
4454 void DebugPassFile::run(llvm::Module &module, bool init) {
4455  std::error_code EC;
4456  char fname[100];
4457  snprintf(fname, sizeof(fname), "%s_%d_%s.ll", init ? "init" : "ir", pnum, sanitize(pname).c_str());
4458  llvm::raw_fd_ostream OS(fname, EC, llvm::sys::fs::F_None);
4459  Assert(!EC && "IR dump file creation failed!");
4460  module.print(OS, 0);
4461 }
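// Editor's note (not part of opt.cpp): for example, with pnum == 2 and a pass named
// "Gather/Scatter Flattening" (hypothetical values), doInitialization() writes the pre-pass IR
// to "init_2_GatherScatterFlattening.ll" and runOnModule() writes the post-pass IR to
// "ir_2_GatherScatterFlattening.ll"; sanitize() is what strips the '/' and the space from the
// pass name.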
4462 
4463 bool DebugPassFile::runOnModule(llvm::Module &module) {
4464  run(module, false);
4465  return true;
4466 }
4467 
4468 bool DebugPassFile::doInitialization(llvm::Module &module) {
4469  run(module, true);
4470  return true;
4471 }
4472 
4473 static llvm::Pass *CreateDebugPassFile(int number, llvm::StringRef name) { return new DebugPassFile(number, name); }
4474 #endif
4475 
4476 ///////////////////////////////////////////////////////////////////////////
4477 // MakeInternalFuncsStaticPass
4478 
4479 /** There are a number of target-specific functions that we use during
4480  these optimization passes. By the time we are done with optimization,
4481  any uses of these should be inlined and no calls to these functions
4482  should remain. This pass marks all of these functions as having
4483  private linkage so that subsequent passes can eliminate them as dead
4484  code, thus cleaning up the final code output by the compiler. We can't
4485  just declare these as static from the start, however, since then they
4486  end up being eliminated as dead code during early optimization passes
4487  even though we may need to generate calls to them during later
4488  optimization passes.
4489  */
4490 class MakeInternalFuncsStaticPass : public llvm::ModulePass {
4491  public:
4492  static char ID;
4493  MakeInternalFuncsStaticPass(bool last = false) : ModulePass(ID) {}
4494 
4495  void getAnalysisUsage(llvm::AnalysisUsage &AU) const { AU.setPreservesCFG(); }
4496 
4497 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
4498  const char *getPassName() const { return "Make internal funcs \"static\""; }
4499 #else // LLVM 4.0+
4500  llvm::StringRef getPassName() const { return "Make internal funcs \"static\""; }
4501 #endif
4502  bool runOnModule(llvm::Module &m);
4503 };
4504 
 4505 char MakeInternalFuncsStaticPass::ID = 0;
 4506 
4507 bool MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
4508  const char *names[] = {
4509  "__avg_up_uint8",
4510  "__avg_up_int8",
4511  "__avg_up_uint16",
4512  "__avg_up_int16",
4513  "__avg_down_uint8",
4514  "__avg_down_int8",
4515  "__avg_down_uint16",
4516  "__avg_down_int16",
4517  "__fast_masked_vload",
4518  "__gather_factored_base_offsets32_i8",
4519  "__gather_factored_base_offsets32_i16",
4520  "__gather_factored_base_offsets32_i32",
4521  "__gather_factored_base_offsets32_i64",
4522  "__gather_factored_base_offsets32_float",
4523  "__gather_factored_base_offsets32_double",
4524  "__gather_factored_base_offsets64_i8",
4525  "__gather_factored_base_offsets64_i16",
4526  "__gather_factored_base_offsets64_i32",
4527  "__gather_factored_base_offsets64_i64",
4528  "__gather_factored_base_offsets64_float",
4529  "__gather_factored_base_offsets64_double",
4530  "__gather_base_offsets32_i8",
4531  "__gather_base_offsets32_i16",
4532  "__gather_base_offsets32_i32",
4533  "__gather_base_offsets32_i64",
4534  "__gather_base_offsets32_float",
4535  "__gather_base_offsets32_double",
4536  "__gather_base_offsets64_i8",
4537  "__gather_base_offsets64_i16",
4538  "__gather_base_offsets64_i32",
4539  "__gather_base_offsets64_i64",
4540  "__gather_base_offsets64_float",
4541  "__gather_base_offsets64_double",
4542  "__gather32_i8",
4543  "__gather32_i16",
4544  "__gather32_i32",
4545  "__gather32_i64",
4546  "__gather32_float",
4547  "__gather32_double",
4548  "__gather64_i8",
4549  "__gather64_i16",
4550  "__gather64_i32",
4551  "__gather64_i64",
4552  "__gather64_float",
4553  "__gather64_double",
4554  "__gather_elt32_i8",
4555  "__gather_elt32_i16",
4556  "__gather_elt32_i32",
4557  "__gather_elt32_i64",
4558  "__gather_elt32_float",
4559  "__gather_elt32_double",
4560  "__gather_elt64_i8",
4561  "__gather_elt64_i16",
4562  "__gather_elt64_i32",
4563  "__gather_elt64_i64",
4564  "__gather_elt64_float",
4565  "__gather_elt64_double",
4566  "__masked_load_i8",
4567  "__masked_load_i16",
4568  "__masked_load_i32",
4569  "__masked_load_i64",
4570  "__masked_load_float",
4571  "__masked_load_double",
4572  "__masked_store_i8",
4573  "__masked_store_i16",
4574  "__masked_store_i32",
4575  "__masked_store_i64",
4576  "__masked_store_float",
4577  "__masked_store_double",
4578  "__masked_store_blend_i8",
4579  "__masked_store_blend_i16",
4580  "__masked_store_blend_i32",
4581  "__masked_store_blend_i64",
4582  "__masked_store_blend_float",
4583  "__masked_store_blend_double",
4584  "__scatter_factored_base_offsets32_i8",
4585  "__scatter_factored_base_offsets32_i16",
4586  "__scatter_factored_base_offsets32_i32",
4587  "__scatter_factored_base_offsets32_i64",
4588  "__scatter_factored_base_offsets32_float",
4589  "__scatter_factored_base_offsets32_double",
4590  "__scatter_factored_base_offsets64_i8",
4591  "__scatter_factored_base_offsets64_i16",
4592  "__scatter_factored_base_offsets64_i32",
4593  "__scatter_factored_base_offsets64_i64",
4594  "__scatter_factored_base_offsets64_float",
4595  "__scatter_factored_base_offsets64_double",
4596  "__scatter_base_offsets32_i8",
4597  "__scatter_base_offsets32_i16",
4598  "__scatter_base_offsets32_i32",
4599  "__scatter_base_offsets32_i64",
4600  "__scatter_base_offsets32_float",
4601  "__scatter_base_offsets32_double",
4602  "__scatter_base_offsets64_i8",
4603  "__scatter_base_offsets64_i16",
4604  "__scatter_base_offsets64_i32",
4605  "__scatter_base_offsets64_i64",
4606  "__scatter_base_offsets64_float",
4607  "__scatter_base_offsets64_double",
4608  "__scatter_elt32_i8",
4609  "__scatter_elt32_i16",
4610  "__scatter_elt32_i32",
4611  "__scatter_elt32_i64",
4612  "__scatter_elt32_float",
4613  "__scatter_elt32_double",
4614  "__scatter_elt64_i8",
4615  "__scatter_elt64_i16",
4616  "__scatter_elt64_i32",
4617  "__scatter_elt64_i64",
4618  "__scatter_elt64_float",
4619  "__scatter_elt64_double",
4620  "__scatter32_i8",
4621  "__scatter32_i16",
4622  "__scatter32_i32",
4623  "__scatter32_i64",
4624  "__scatter32_float",
4625  "__scatter32_double",
4626  "__scatter64_i8",
4627  "__scatter64_i16",
4628  "__scatter64_i32",
4629  "__scatter64_i64",
4630  "__scatter64_float",
4631  "__scatter64_double",
4632  "__prefetch_read_varying_1",
4633  "__prefetch_read_varying_2",
4634  "__prefetch_read_varying_3",
4635  "__prefetch_read_varying_nt",
4636  "__keep_funcs_live",
4637  };
4638 
4639  bool modifiedAny = false;
4640  int count = sizeof(names) / sizeof(names[0]);
4641  for (int i = 0; i < count; ++i) {
4642  llvm::Function *f = m->module->getFunction(names[i]);
4643  if (f != NULL && f->empty() == false) {
4644  f->setLinkage(llvm::GlobalValue::InternalLinkage);
4645  modifiedAny = true;
4646  }
4647  }
4648 
4649  return modifiedAny;
4650 }
4651 
 4652 static llvm::Pass *CreateMakeInternalFuncsStaticPass() { return new MakeInternalFuncsStaticPass; }
 4653 
4654 ///////////////////////////////////////////////////////////////////////////
4655 // PeepholePass
4656 
4657 class PeepholePass : public llvm::BasicBlockPass {
4658  public:
4659  PeepholePass();
4660 
4661 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
4662  const char *getPassName() const { return "Peephole Optimizations"; }
4663 #else // LLVM 4.0+
4664  llvm::StringRef getPassName() const { return "Peephole Optimizations"; }
4665 #endif
4666  bool runOnBasicBlock(llvm::BasicBlock &BB);
4667 
4668  static char ID;
4669 };
4670 
4671 char PeepholePass::ID = 0;
4672 
4673 PeepholePass::PeepholePass() : BasicBlockPass(ID) {}
4674 
4675 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_3
4676 
4677 using namespace llvm::PatternMatch;
4678 
4679 template <typename Op_t, unsigned Opcode> struct CastClassTypes_match {
4680  Op_t Op;
4681  const llvm::Type *fromType, *toType;
4682 
4683  CastClassTypes_match(const Op_t &OpMatch, const llvm::Type *f, const llvm::Type *t)
4684  : Op(OpMatch), fromType(f), toType(t) {}
4685 
4686  template <typename OpTy> bool match(OpTy *V) {
4687  if (llvm::Operator *O = llvm::dyn_cast<llvm::Operator>(V))
4688  return (O->getOpcode() == Opcode && Op.match(O->getOperand(0)) && O->getType() == toType &&
4689  O->getOperand(0)->getType() == fromType);
4690  return false;
4691  }
4692 };
4693 
4694 template <typename OpTy> inline CastClassTypes_match<OpTy, llvm::Instruction::SExt> m_SExt8To16(const OpTy &Op) {
4695  return CastClassTypes_match<OpTy, llvm::Instruction::SExt>(Op, LLVMTypes::Int8VectorType,
 4696  LLVMTypes::Int16VectorType);
 4697 }
4698 
4699 template <typename OpTy> inline CastClassTypes_match<OpTy, llvm::Instruction::ZExt> m_ZExt8To16(const OpTy &Op) {
4700  return CastClassTypes_match<OpTy, llvm::Instruction::ZExt>(Op, LLVMTypes::Int8VectorType,
 4701  LLVMTypes::Int16VectorType);
 4702 }
4703 
4704 template <typename OpTy> inline CastClassTypes_match<OpTy, llvm::Instruction::Trunc> m_Trunc16To8(const OpTy &Op) {
4705  return CastClassTypes_match<OpTy, llvm::Instruction::Trunc>(Op, LLVMTypes::Int16VectorType,
 4706  LLVMTypes::Int8VectorType);
 4707 }
4708 
4709 template <typename OpTy> inline CastClassTypes_match<OpTy, llvm::Instruction::SExt> m_SExt16To32(const OpTy &Op) {
4710  return CastClassTypes_match<OpTy, llvm::Instruction::SExt>(Op, LLVMTypes::Int16VectorType,
 4711  LLVMTypes::Int32VectorType);
 4712 }
4713 
4714 template <typename OpTy> inline CastClassTypes_match<OpTy, llvm::Instruction::ZExt> m_ZExt16To32(const OpTy &Op) {
4715  return CastClassTypes_match<OpTy, llvm::Instruction::ZExt>(Op, LLVMTypes::Int16VectorType,
 4716  LLVMTypes::Int32VectorType);
 4717 }
4718 
4719 template <typename OpTy> inline CastClassTypes_match<OpTy, llvm::Instruction::Trunc> m_Trunc32To16(const OpTy &Op) {
4720  return CastClassTypes_match<OpTy, llvm::Instruction::Trunc>(Op, LLVMTypes::Int32VectorType,
 4721  LLVMTypes::Int16VectorType);
 4722 }
4723 
4724 template <typename Op_t> struct UDiv2_match {
4725  Op_t Op;
4726 
4727  UDiv2_match(const Op_t &OpMatch) : Op(OpMatch) {}
4728 
4729  template <typename OpTy> bool match(OpTy *V) {
4730  llvm::BinaryOperator *bop;
4731  llvm::ConstantDataVector *cdv;
4732  if ((bop = llvm::dyn_cast<llvm::BinaryOperator>(V)) &&
4733  (cdv = llvm::dyn_cast<llvm::ConstantDataVector>(bop->getOperand(1))) && cdv->getSplatValue() != NULL) {
4734  const llvm::APInt &apInt = cdv->getUniqueInteger();
4735 
4736  switch (bop->getOpcode()) {
4737  case llvm::Instruction::UDiv:
4738  // divide by 2
4739  return (apInt.isIntN(2) && Op.match(bop->getOperand(0)));
4740  case llvm::Instruction::LShr:
 4741  // shift right by 1
4742  return (apInt.isIntN(1) && Op.match(bop->getOperand(0)));
4743  default:
4744  return false;
4745  }
4746  }
4747  return false;
4748  }
4749 };
4750 
4751 template <typename V> inline UDiv2_match<V> m_UDiv2(const V &v) { return UDiv2_match<V>(v); }
4752 
4753 template <typename Op_t> struct SDiv2_match {
4754  Op_t Op;
4755 
4756  SDiv2_match(const Op_t &OpMatch) : Op(OpMatch) {}
4757 
4758  template <typename OpTy> bool match(OpTy *V) {
4759  llvm::BinaryOperator *bop;
4760  llvm::ConstantDataVector *cdv;
4761  if ((bop = llvm::dyn_cast<llvm::BinaryOperator>(V)) &&
4762  (cdv = llvm::dyn_cast<llvm::ConstantDataVector>(bop->getOperand(1))) && cdv->getSplatValue() != NULL) {
4763  const llvm::APInt &apInt = cdv->getUniqueInteger();
4764 
4765  switch (bop->getOpcode()) {
4766  case llvm::Instruction::SDiv:
4767  // divide by 2
4768  return (apInt.isIntN(2) && Op.match(bop->getOperand(0)));
4769  case llvm::Instruction::AShr:
 4770  // shift right by 1
4771  return (apInt.isIntN(1) && Op.match(bop->getOperand(0)));
4772  default:
4773  return false;
4774  }
4775  }
4776  return false;
4777  }
4778 };
4779 
4780 template <typename V> inline SDiv2_match<V> m_SDiv2(const V &v) { return SDiv2_match<V>(v); }
4781 
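// Editor's note (not part of opt.cpp): composed, these helpers describe the IR shape of a
// widening average. For instance, m_Trunc16To8(m_UDiv2(m_Add(m_ZExt8To16(a), m_ZExt8To16(b))))
// matches a sequence like
//
//     %ax = zext <N x i8> %a to <N x i16>
//     %bx = zext <N x i8> %b to <N x i16>
//     %s  = add <N x i16> %ax, %bx
//     %d  = udiv <N x i16> %s, <splat of 2>     ; or lshr by a splat of 1
//     %r  = trunc <N x i16> %d to <N x i8>
//
// where N is the target's vector width.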
4782 // Returns true if the given function has a call to an intrinsic function
4783 // in its definition.
4784 static bool lHasIntrinsicInDefinition(llvm::Function *func) {
4785  llvm::Function::iterator bbiter = func->begin();
4786  for (; bbiter != func->end(); ++bbiter) {
4787  for (llvm::BasicBlock::iterator institer = bbiter->begin(); institer != bbiter->end(); ++institer) {
4788  if (llvm::isa<llvm::IntrinsicInst>(institer))
4789  return true;
4790  }
4791  }
4792  return false;
4793 }
4794 
4795 static llvm::Instruction *lGetBinaryIntrinsic(const char *name, llvm::Value *opa, llvm::Value *opb) {
4796  llvm::Function *func = m->module->getFunction(name);
4797  Assert(func != NULL);
4798 
4799  // Make sure that the definition of the llvm::Function has a call to an
4800  // intrinsic function in its instructions; otherwise we will generate
4801  // infinite loops where we "helpfully" turn the default implementations
4802  // of target builtins like __avg_up_uint8 that are implemented with plain
4803  // arithmetic ops into recursive calls to themselves.
4804  if (lHasIntrinsicInDefinition(func))
4805  return lCallInst(func, opa, opb, name);
4806  else
4807  return NULL;
4808 }
4809 
4810 //////////////////////////////////////////////////
4811 
4812 static llvm::Instruction *lMatchAvgUpUInt8(llvm::Value *inst) {
4813  // (unsigned int8)(((unsigned int16)a + (unsigned int16)b + 1)/2)
4814  llvm::Value *opa, *opb;
4815  const llvm::APInt *delta;
4816  if (match(inst, m_Trunc16To8(m_UDiv2(m_CombineOr(
4817  m_CombineOr(m_Add(m_ZExt8To16(m_Value(opa)), m_Add(m_ZExt8To16(m_Value(opb)), m_APInt(delta))),
4818  m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_APInt(delta)), m_ZExt8To16(m_Value(opb)))),
4819  m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_ZExt8To16(m_Value(opb))), m_APInt(delta))))))) {
4820  if (delta->isIntN(1) == false)
4821  return NULL;
4822 
4823  return lGetBinaryIntrinsic("__avg_up_uint8", opa, opb);
4824  }
4825  return NULL;
4826 }
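// Editor's sketch (standalone, not part of opt.cpp): the scalar arithmetic that the matcher
// above recognizes. Widening to 16 bits before the add keeps the +1 and the division exact, so
// the rounded-up average never overflows an 8-bit intermediate.

#include <cassert>
#include <cstdint>

static uint8_t avg_up_uint8(uint8_t a, uint8_t b) {
    return (uint8_t)(((uint16_t)a + (uint16_t)b + 1) / 2);
}

int main() {
    assert(avg_up_uint8(1, 2) == 2);       // rounds up on odd sums
    assert(avg_up_uint8(254, 255) == 255); // no wraparound, thanks to the widening
    assert(avg_up_uint8(0, 0) == 0);
    return 0;
}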
4827 
4828 static llvm::Instruction *lMatchAvgDownUInt8(llvm::Value *inst) {
4829  // (unsigned int8)(((unsigned int16)a + (unsigned int16)b)/2)
4830  llvm::Value *opa, *opb;
4831  if (match(inst, m_Trunc16To8(m_UDiv2(m_Add(m_ZExt8To16(m_Value(opa)), m_ZExt8To16(m_Value(opb))))))) {
4832  return lGetBinaryIntrinsic("__avg_down_uint8", opa, opb);
4833  }
4834  return NULL;
4835 }
4836 
4837 static llvm::Instruction *lMatchAvgUpUInt16(llvm::Value *inst) {
4838  // (unsigned int16)(((unsigned int32)a + (unsigned int32)b + 1)/2)
4839  llvm::Value *opa, *opb;
4840  const llvm::APInt *delta;
4841  if (match(inst,
4842  m_Trunc32To16(m_UDiv2(m_CombineOr(
4843  m_CombineOr(m_Add(m_ZExt16To32(m_Value(opa)), m_Add(m_ZExt16To32(m_Value(opb)), m_APInt(delta))),
4844  m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_APInt(delta)), m_ZExt16To32(m_Value(opb)))),
4845  m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_ZExt16To32(m_Value(opb))), m_APInt(delta))))))) {
4846  if (delta->isIntN(1) == false)
4847  return NULL;
4848 
4849  return lGetBinaryIntrinsic("__avg_up_uint16", opa, opb);
4850  }
4851  return NULL;
4852 }
4853 
4854 static llvm::Instruction *lMatchAvgDownUInt16(llvm::Value *inst) {
4855  // (unsigned int16)(((unsigned int32)a + (unsigned int32)b)/2)
4856  llvm::Value *opa, *opb;
4857  if (match(inst, m_Trunc32To16(m_UDiv2(m_Add(m_ZExt16To32(m_Value(opa)), m_ZExt16To32(m_Value(opb))))))) {
4858  return lGetBinaryIntrinsic("__avg_down_uint16", opa, opb);
4859  }
4860  return NULL;
4861 }
4862 
4863 static llvm::Instruction *lMatchAvgUpInt8(llvm::Value *inst) {
4864  // (int8)(((int16)a + (int16)b + 1)/2)
4865  llvm::Value *opa, *opb;
4866  const llvm::APInt *delta;
4867  if (match(inst, m_Trunc16To8(m_SDiv2(m_CombineOr(
4868  m_CombineOr(m_Add(m_SExt8To16(m_Value(opa)), m_Add(m_SExt8To16(m_Value(opb)), m_APInt(delta))),
4869  m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_APInt(delta)), m_SExt8To16(m_Value(opb)))),
4870  m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_SExt8To16(m_Value(opb))), m_APInt(delta))))))) {
4871  if (delta->isIntN(1) == false)
4872  return NULL;
4873 
4874  return lGetBinaryIntrinsic("__avg_up_int8", opa, opb);
4875  }
4876  return NULL;
4877 }
4878 
4879 static llvm::Instruction *lMatchAvgDownInt8(llvm::Value *inst) {
4880  // (int8)(((int16)a + (int16)b)/2)
4881  llvm::Value *opa, *opb;
4882  if (match(inst, m_Trunc16To8(m_SDiv2(m_Add(m_SExt8To16(m_Value(opa)), m_SExt8To16(m_Value(opb))))))) {
4883  return lGetBinaryIntrinsic("__avg_down_int8", opa, opb);
4884  }
4885  return NULL;
4886 }
4887 
4888 static llvm::Instruction *lMatchAvgUpInt16(llvm::Value *inst) {
4889  // (int16)(((int32)a + (int32)b + 1)/2)
4890  llvm::Value *opa, *opb;
4891  const llvm::APInt *delta;
4892  if (match(inst,
4893  m_Trunc32To16(m_SDiv2(m_CombineOr(
4894  m_CombineOr(m_Add(m_SExt16To32(m_Value(opa)), m_Add(m_SExt16To32(m_Value(opb)), m_APInt(delta))),
4895  m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_APInt(delta)), m_SExt16To32(m_Value(opb)))),
4896  m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_SExt16To32(m_Value(opb))), m_APInt(delta))))))) {
4897  if (delta->isIntN(1) == false)
4898  return NULL;
4899 
4900  return lGetBinaryIntrinsic("__avg_up_int16", opa, opb);
4901  }
4902  return NULL;
4903 }
4904 
4905 static llvm::Instruction *lMatchAvgDownInt16(llvm::Value *inst) {
4906  // (int16)(((int32)a + (int32)b)/2)
4907  llvm::Value *opa, *opb;
4908  if (match(inst, m_Trunc32To16(m_SDiv2(m_Add(m_SExt16To32(m_Value(opa)), m_SExt16To32(m_Value(opb))))))) {
4909  return lGetBinaryIntrinsic("__avg_down_int16", opa, opb);
4910  }
4911  return NULL;
4912 }
4913 #endif // !LLVM_3_2
4914 
4915 bool PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) {
4916  DEBUG_START_PASS("PeepholePass");
4917 
4918  bool modifiedAny = false;
4919 restart:
4920  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
4921  llvm::Instruction *inst = &*iter;
4922 
4923  llvm::Instruction *builtinCall = NULL;
4924 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_3
4925  if (!builtinCall)
4926  builtinCall = lMatchAvgUpUInt8(inst);
4927  if (!builtinCall)
4928  builtinCall = lMatchAvgUpUInt16(inst);
4929  if (!builtinCall)
4930  builtinCall = lMatchAvgDownUInt8(inst);
4931  if (!builtinCall)
4932  builtinCall = lMatchAvgDownUInt16(inst);
4933  if (!builtinCall)
4934  builtinCall = lMatchAvgUpInt8(inst);
4935  if (!builtinCall)
4936  builtinCall = lMatchAvgUpInt16(inst);
4937  if (!builtinCall)
4938  builtinCall = lMatchAvgDownInt8(inst);
4939  if (!builtinCall)
4940  builtinCall = lMatchAvgDownInt16(inst);
4941 #endif // !LLVM_3_2
4942  if (builtinCall != NULL) {
4943  llvm::ReplaceInstWithInst(inst, builtinCall);
4944  modifiedAny = true;
4945  goto restart;
4946  }
4947  }
4948 
4949  DEBUG_END_PASS("PeepholePass");
4950 
4951  return modifiedAny;
4952 }
4953 
4954 static llvm::Pass *CreatePeepholePass() { return new PeepholePass; }
4955 
4956 /** Given an llvm::Value known to be an integer, return its value as
4957  an int64_t.
4958 */
4959 static int64_t lGetIntValue(llvm::Value *offset) {
4960  llvm::ConstantInt *intOffset = llvm::dyn_cast<llvm::ConstantInt>(offset);
4961  Assert(intOffset && (intOffset->getBitWidth() == 32 || intOffset->getBitWidth() == 64));
4962  return intOffset->getSExtValue();
4963 }
4964 
4965 ///////////////////////////////////////////////////////////////////////////
4966 // ReplaceStdlibShiftPass
4967 
4968 class ReplaceStdlibShiftPass : public llvm::BasicBlockPass {
4969  public:
4970  static char ID;
4971  ReplaceStdlibShiftPass() : BasicBlockPass(ID) {}
4972 
4973 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
4974  const char *getPassName() const { return "Resolve \"replace extract insert chains\""; }
4975 #else // LLVM 4.0+
4976  llvm::StringRef getPassName() const { return "Resolve \"replace extract insert chains\""; }
4977 #endif
4978  bool runOnBasicBlock(llvm::BasicBlock &BB);
4979 };
4980 
 4981 char ReplaceStdlibShiftPass::ID = 0;
 4982 
 4983 // This pass replaces shift() with ShuffleVector when the offset is a constant.
 4984 // rotate(), which is similar in functionality, has a slightly different
 4985 // implementation. This is because LLVM (createInstructionCombiningPass)
 4986 // optimizes the rotate() implementation better when similar implementations
 4987 // are used for both. This is a hack to produce similarly optimized code for
 4988 // shift().
4989 bool ReplaceStdlibShiftPass::runOnBasicBlock(llvm::BasicBlock &bb) {
4990  DEBUG_START_PASS("ReplaceStdlibShiftPass");
4991  bool modifiedAny = false;
4992 
4993  llvm::Function *shifts[6];
4994  shifts[0] = m->module->getFunction("shift___vytuni");
4995  shifts[1] = m->module->getFunction("shift___vysuni");
4996  shifts[2] = m->module->getFunction("shift___vyiuni");
4997  shifts[3] = m->module->getFunction("shift___vyIuni");
4998  shifts[4] = m->module->getFunction("shift___vyfuni");
4999  shifts[5] = m->module->getFunction("shift___vyduni");
5000 
5001  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
5002  llvm::Instruction *inst = &*iter;
5003 
5004  if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst)) {
5005  llvm::Function *func = ci->getCalledFunction();
5006  for (int i = 0; i < 6; i++) {
5007  if (shifts[i] && (shifts[i] == func)) {
5008  // we matched a call
5009  llvm::Value *shiftedVec = ci->getArgOperand(0);
5010  llvm::Value *shiftAmt = ci->getArgOperand(1);
5011  if (llvm::isa<llvm::Constant>(shiftAmt)) {
5012  int vectorWidth = g->target->getVectorWidth();
5013  int *shuffleVals = new int[vectorWidth];
5014  int shiftInt = lGetIntValue(shiftAmt);
5015  for (int i = 0; i < vectorWidth; i++) {
5016  int s = i + shiftInt;
5017  s = (s < 0) ? vectorWidth : s;
5018  s = (s >= vectorWidth) ? vectorWidth : s;
5019  shuffleVals[i] = s;
5020  }
5021  llvm::Value *shuffleIdxs = LLVMInt32Vector(shuffleVals);
5022  llvm::Value *zeroVec = llvm::ConstantAggregateZero::get(shiftedVec->getType());
5023  llvm::Value *shuffle =
5024  new llvm::ShuffleVectorInst(shiftedVec, zeroVec, shuffleIdxs, "vecShift", ci);
5025  ci->replaceAllUsesWith(shuffle);
5026  modifiedAny = true;
5027  delete[] shuffleVals;
5028  } else if (g->opt.level > 0) {
5029  PerformanceWarning(SourcePos(), "Stdlib shift() called without constant shift amount.");
5030  }
5031  }
5032  }
5033  }
5034  }
5035 
5036  DEBUG_END_PASS("ReplaceStdlibShiftPass");
5037 
5038  return modifiedAny;
5039 }
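// Editor's sketch (standalone, not part of opt.cpp): the shuffle-index computation used above,
// for a hypothetical 4-wide vector. Any index that falls outside [0, vectorWidth) is clamped to
// vectorWidth, which in the ShuffleVectorInst selects lane 0 of the zero vector passed as the
// second operand, so shifted-out lanes become 0.

#include <cstdio>

static void shiftIndices(int vectorWidth, int shiftInt, int *out) {
    for (int i = 0; i < vectorWidth; i++) {
        int s = i + shiftInt;
        s = (s < 0) ? vectorWidth : s;
        s = (s >= vectorWidth) ? vectorWidth : s;
        out[i] = s;
    }
}

int main() {
    int idx[4];
    shiftIndices(4, 1, idx);
    printf("shift(+1): %d %d %d %d\n", idx[0], idx[1], idx[2], idx[3]); // 1 2 3 4
    shiftIndices(4, -1, idx);
    printf("shift(-1): %d %d %d %d\n", idx[0], idx[1], idx[2], idx[3]); // 4 0 1 2
    return 0;
}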
5040 
5041 static llvm::Pass *CreateReplaceStdlibShiftPass() { return new ReplaceStdlibShiftPass(); }
5042 
5043 ///////////////////////////////////////////////////////////////////////////////
5044 // FixBooleanSelect
5045 //
 5046 // The problem is that in LLVM 3.3, the optimizer doesn't like
 5047 // the following instruction sequence:
5048 // %cmp = fcmp olt <8 x float> %a, %b
5049 // %sext_cmp = sext <8 x i1> %cmp to <8 x i32>
5050 // %new_mask = and <8 x i32> %sext_cmp, %mask
5051 // and optimizes it to the following:
5052 // %cmp = fcmp olt <8 x float> %a, %b
5053 // %cond = select <8 x i1> %cmp, <8 x i32> %mask, <8 x i32> zeroinitializer
5054 //
5055 // It wouldn't be a problem if codegen produced good code for it. But it
5056 // doesn't, especially for vectors larger than native vectors.
5057 //
5058 // This optimization reverts this pattern and should be the last one before
5059 // code gen.
5060 //
5061 // Note that this problem was introduced in LLVM 3.3. But in LLVM 3.4 it was
5062 // fixed. See commit r194542.
5063 //
5064 // After LLVM 3.3 this optimization should probably stay for experimental
5065 // purposes and code should be compared with and without this optimization from
 5066 // time to time to make sure that LLVM does the right thing.
5067 ///////////////////////////////////////////////////////////////////////////////
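// Editor's sketch (standalone, not part of opt.cpp): per lane, the 'select' form that the
// LLVM 3.3 optimizer produces and the sext+and form this pass restores compute the same 32-bit
// value, since sign-extending an i1 yields either all zeros or all ones.

#include <cassert>
#include <cstdint>

int main() {
    const int32_t mask = 0x12345678;
    for (int cmp = 0; cmp <= 1; ++cmp) {
        int32_t select_form = cmp ? mask : 0; // select i1 %cmp, i32 %mask, i32 0
        int32_t sext_cmp = -(int32_t)cmp;     // sext i1 -> i32: 0 or -1 (all ones)
        int32_t and_form = sext_cmp & mask;   // and %sext_cmp, %mask
        assert(select_form == and_form);
    }
    return 0;
}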
5068 
5069 class FixBooleanSelectPass : public llvm::FunctionPass {
5070  public:
5071  static char ID;
5072  FixBooleanSelectPass() : FunctionPass(ID) {}
5073 
5074 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
5075  const char *getPassName() const { return "Resolve \"replace extract insert chains\""; }
5076 #else // LLVM 4.0+
5077  llvm::StringRef getPassName() const { return "Resolve \"replace extract insert chains\""; }
5078 #endif
5079  bool runOnFunction(llvm::Function &F);
5080 
5081  private:
5082  llvm::Instruction *fixSelect(llvm::SelectInst *sel, llvm::SExtInst *sext);
5083 };
5084 
5085 char FixBooleanSelectPass::ID = 0;
5086 
5087 llvm::Instruction *FixBooleanSelectPass::fixSelect(llvm::SelectInst *sel, llvm::SExtInst *sext) {
5088  // Select instruction result type and its integer equivalent
5089  llvm::VectorType *orig_type = llvm::dyn_cast<llvm::VectorType>(sel->getType());
5090  llvm::VectorType *int_type = llvm::VectorType::getInteger(orig_type);
5091 
5092  // Result value and optional pointer to instruction to delete
5093  llvm::Instruction *result = 0, *optional_to_delete = 0;
5094 
5095  // It can be vector of integers or vector of floating point values.
5096  if (orig_type->getElementType()->isIntegerTy()) {
5097  // Generate sext+and, remove select.
5098  result = llvm::BinaryOperator::CreateAnd(sext, sel->getTrueValue(), "and_mask", sel);
5099  } else {
5100  llvm::BitCastInst *bc = llvm::dyn_cast<llvm::BitCastInst>(sel->getTrueValue());
5101 
5102  if (bc && bc->hasOneUse() && bc->getSrcTy()->isIntOrIntVectorTy() && bc->getSrcTy()->isVectorTy() &&
5103  llvm::isa<llvm::Instruction>(bc->getOperand(0)) &&
5104  llvm::dyn_cast<llvm::Instruction>(bc->getOperand(0))->getParent() == sel->getParent()) {
 5105  // The bitcast is casting from an integer type, and its operand is an instruction located in the same basic
 5106  // block (otherwise it's unsafe to use it). Transform bitcast+select => sext+and+bitcast: create the 'and'
5107  llvm::BinaryOperator *and_inst = llvm::BinaryOperator::CreateAnd(sext, bc->getOperand(0), "and_mask", sel);
5108  // Bitcast back to original type
5109  result = new llvm::BitCastInst(and_inst, sel->getType(), "bitcast_mask_out", sel);
5110  // Original bitcast will be removed
5111  optional_to_delete = bc;
5112  } else {
5113  // General case: select => bitcast+sext+and+bitcast
5114  // Bitcast
5115  llvm::BitCastInst *bc_in = new llvm::BitCastInst(sel->getTrueValue(), int_type, "bitcast_mask_in", sel);
5116  // And
5117  llvm::BinaryOperator *and_inst = llvm::BinaryOperator::CreateAnd(sext, bc_in, "and_mask", sel);
5118  // Bitcast back to original type
5119  result = new llvm::BitCastInst(and_inst, sel->getType(), "bitcast_mask_out", sel);
5120  }
5121  }
5122 
5123  // Done, finalize.
5124  sel->replaceAllUsesWith(result);
5125  sel->eraseFromParent();
5126  if (optional_to_delete) {
5127  optional_to_delete->eraseFromParent();
5128  }
5129 
5130  return result;
5131 }
5132 
5133 bool FixBooleanSelectPass::runOnFunction(llvm::Function &F) {
5134  bool modifiedAny = false;
5135 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_3 // LLVM 3.3 only
5136 
5137  // Don't optimize generic targets.
5138  if (g->target->getISA() == Target::GENERIC) {
5139  return false;
5140  }
5141 
5142  for (llvm::Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
5143  llvm::BasicBlock *bb = &*I;
5144  for (llvm::BasicBlock::iterator iter = bb->begin(), e = bb->end(); iter != e; ++iter) {
5145  llvm::Instruction *inst = &*iter;
5146 
5147  llvm::CmpInst *cmp = llvm::dyn_cast<llvm::CmpInst>(inst);
5148 
5149  if (cmp && cmp->getType()->isVectorTy() && cmp->getType()->getVectorElementType()->isIntegerTy(1)) {
5150 
5151  // Search for select instruction uses.
5152  int selects = 0;
5153  llvm::VectorType *sext_type = 0;
5154  for (llvm::Instruction::use_iterator it = cmp->use_begin(); it != cmp->use_end(); ++it) {
5155  llvm::SelectInst *sel = llvm::dyn_cast<llvm::SelectInst>(*it);
5156  if (sel && sel->getType()->isVectorTy() && sel->getType()->getScalarSizeInBits() > 1) {
5157  selects++;
 5158  // We pick the first one; in the typical case all the select types are the same.
5159  sext_type = llvm::dyn_cast<llvm::VectorType>(sel->getType());
5160  break;
5161  }
5162  }
5163  if (selects == 0) {
5164  continue;
5165  }
5166  // Get an integer equivalent, if it's not yet an integer.
5167  sext_type = llvm::VectorType::getInteger(sext_type);
5168 
5169  // Do transformation
5170  llvm::BasicBlock::iterator iter_copy = iter;
5171  llvm::Instruction *next_inst = &*(++iter_copy);
5172  // Create or reuse sext
5173  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(next_inst);
5174  if (sext && sext->getOperand(0) == cmp && sext->getDestTy() == sext_type) {
5175  // This sext can be reused
5176  } else {
5177  if (next_inst) {
5178  sext = new llvm::SExtInst(cmp, sext_type, "sext_cmp", next_inst);
5179  } else {
5180  sext = new llvm::SExtInst(cmp, sext_type, "sext_cmp", bb);
5181  }
5182  }
5183 
5184  // Walk and fix selects
5185  std::vector<llvm::SelectInst *> sel_uses;
5186  for (llvm::Instruction::use_iterator it = cmp->use_begin(); it != cmp->use_end(); ++it) {
5187  llvm::SelectInst *sel = llvm::dyn_cast<llvm::SelectInst>(*it);
5188  if (sel && sel->getType()->getScalarSizeInBits() == sext_type->getScalarSizeInBits()) {
5189 
5190  // Check that second operand is zero.
5191  llvm::Constant *false_cond = llvm::dyn_cast<llvm::Constant>(sel->getFalseValue());
5192  if (false_cond && false_cond->isZeroValue()) {
5193  sel_uses.push_back(sel);
5194  modifiedAny = true;
5195  }
5196  }
5197  }
5198 
5199  for (int i = 0; i < sel_uses.size(); i++) {
5200  fixSelect(sel_uses[i], sext);
5201  }
5202  }
5203  }
5204  }
5205 
5206 #endif // LLVM 3.3
5207 
5208  return modifiedAny;
5209 }
5210 
5211 static llvm::Pass *CreateFixBooleanSelectPass() { return new FixBooleanSelectPass(); }
5212 
5213 #ifdef ISPC_NVPTX_ENABLED
5214 ///////////////////////////////////////////////////////////////////////////////
5215 // Detect addrspace(3)
5216 ///////////////////////////////////////////////////////////////////////////////
5217 
5218 class PromoteLocalToPrivatePass : public llvm::BasicBlockPass {
5219  public:
5220  static char ID; // Pass identification, replacement for typeid
5221  PromoteLocalToPrivatePass() : BasicBlockPass(ID) {}
5222 
5223  bool runOnBasicBlock(llvm::BasicBlock &BB);
5224 };
5225 
5226 char PromoteLocalToPrivatePass::ID = 0;
5227 
5228 bool PromoteLocalToPrivatePass::runOnBasicBlock(llvm::BasicBlock &BB) {
5229  std::vector<llvm::AllocaInst *> Allocas;
5230 
5231  bool modifiedAny = false;
5232 
5233 #if 1
5234 restart:
5235  for (llvm::BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) {
5236  llvm::Instruction *inst = &*I;
5237  if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst)) {
5238  llvm::Function *func = ci->getCalledFunction();
5239  if (func && func->getName() == "llvm.trap") {
5240  std::vector<llvm::Type *> funcTyArgs;
5241  llvm::FunctionType *funcTy = llvm::FunctionType::get(
5242  /*Result=*/llvm::Type::getVoidTy(*g->ctx),
5243  /*Params=*/funcTyArgs,
5244  /*isVarArg=*/false);
5245  llvm::InlineAsm *trap_ptx = llvm::InlineAsm::get(funcTy, "trap;", "", false);
5246  assert(trap_ptx != NULL);
5247  llvm::Instruction *trap_call = llvm::CallInst::Create(trap_ptx);
5248  assert(trap_call != NULL);
5249  llvm::ReplaceInstWithInst(ci, trap_call);
5250  modifiedAny = true;
5251  goto restart;
5252  }
5253  }
5254  }
5255 #endif
5256 
5257 #if 0
5258  llvm::Function *cvtFunc = m->module->getFunction("__cvt_loc2gen_var");
5259 
5260  // Find allocas that are safe to promote, by looking at all instructions in
5261  // the entry node
5262  for (llvm::BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
5263  {
5264  llvm::Instruction *inst = &*I;
5265  if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst))
5266  {
5267  llvm::Function *func = ci->getCalledFunction();
5268  if (cvtFunc && (cvtFunc == func))
5269  {
5270 #if 0
5271  fprintf(stderr , "--found cvt-- name= %s \n",
5272  I->getName().str().c_str());
5273 #endif
5274  llvm::AllocaInst *alloca = new llvm::AllocaInst(LLVMTypes::Int64Type, "opt_loc2var", ci);
5275  assert(alloca != NULL);
5276 #if 0
5277  const int align = 8; // g->target->getNativeVectorAlignment();
5278  alloca->setAlignment(align);
5279 #endif
5280  ci->replaceAllUsesWith(alloca);
5281  modifiedAny = true;
5282  }
5283  }
5284  }
5285 #endif
5286  return modifiedAny;
5287 }
5288 
5289 static llvm::Pass *CreatePromoteLocalToPrivatePass() { return new PromoteLocalToPrivatePass(); }
5290 
5291 #endif /* ISPC_NVPTX_ENABLED */
static llvm::Pass * CreateFixBooleanSelectPass()
Definition: opt.cpp:5211
static void lExtractConstOffsets(const std::vector< llvm::CallInst *> &coalesceGroup, int elementSize, std::vector< int64_t > *constOffsets)
Definition: opt.cpp:3725
void run(llvm::Module &m, bool init)
Definition: opt.cpp:4454
static llvm::Type * FloatType
Definition: llvmutil.h:79
static llvm::Type * Int32VectorPointerType
Definition: llvmutil.h:102
const char * getPassName() const
Definition: opt.cpp:5075
const char * getPassName() const
Definition: opt.cpp:928
llvm::Value * LLVMShuffleVectors(llvm::Value *v1, llvm::Value *v2, int32_t shuf[], int shufSize, llvm::Instruction *insertBefore)
Definition: llvmutil.cpp:1502
Opt opt
Definition: ispc.h:542
DebugPassFile(int number, llvm::StringRef name)
Definition: opt.cpp:4426
llvm::Constant * LLVMInt64Vector(int64_t i)
Definition: llvmutil.cpp:368
llvm::Instruction * fixSelect(llvm::SelectInst *sel, llvm::SExtInst *sext)
Definition: opt.cpp:5087
static bool lIsSafeToBlend(llvm::Value *lvalue)
Definition: opt.cpp:4023
static bool lCoalesceGathers(const std::vector< llvm::CallInst *> &coalesceGroup)
Definition: opt.cpp:3752
Declaration of the FunctionEmitContext class
void PerformanceWarning(SourcePos p, const char *format,...) PRINTF_FUNC
Definition: util.cpp:394
bool hasVecPrefetch() const
Definition: ispc.h:303
static llvm::Type * DoubleType
Definition: llvmutil.h:80
static llvm::Value * lExtractFromInserts(llvm::Value *v, unsigned int index)
Definition: opt.cpp:1468
bool disableBlendedMaskedStores
Definition: ispc.h:487
static llvm::Value * lExtractOffsetVector248Scale(llvm::Value **vec)
Definition: opt.cpp:1818
static bool simplifyCall(llvm::CallInst *callInst, llvm::BasicBlock::iterator iter)
Definition: opt.cpp:1296
Module * m
Definition: ispc.cpp:102
void Optimize(llvm::Module *module, int optLevel)
Definition: opt.cpp:514
static llvm::Pass * CreateImproveMemoryOpsPass()
Definition: opt.cpp:3004
int first_line
Definition: ispc.h:135
bool runOnModule(llvm::Module &m)
Definition: opt.cpp:4507
Target * target
Definition: ispc.h:544
static llvm::Instruction * lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, const char *name, llvm::Instruction *insertBefore=NULL)
Definition: opt.cpp:297
static bool lVectorLoadIsEfficient(std::set< int64_t >::iterator iter, std::set< int64_t >::iterator end, std::set< int64_t >::iterator *newIter, int vectorWidth)
Definition: opt.cpp:3075
int getNativeVectorAlignment() const
Definition: ispc.h:275
static void lSelectLoads(const std::vector< int64_t > &loadOffsets, std::vector< CoalescedLoadOp > *loads)
Definition: opt.cpp:3169
static llvm::Constant * lGetConstantAddExprBaseOffset(llvm::Constant *op0, llvm::Constant *op1, llvm::Constant **delta)
Definition: opt.cpp:1453
const char * getPassName() const
Definition: opt.cpp:4429
static llvm::Value * lApplyLoad1(llvm::Value *result, const CoalescedLoadOp &load, const int64_t offsets[4], bool set[4], llvm::Instruction *insertBefore)
Definition: opt.cpp:3382
llvm::Value * lGEPAndLoad(llvm::Value *basePtr, int64_t offset, int align, llvm::Instruction *insertBefore, llvm::Type *type)
Definition: opt.cpp:3288
#define DEBUG_START_PASS(NAME)
Definition: opt.cpp:177
static char ID
Definition: opt.cpp:4668
static char ID
Definition: opt.cpp:5071
bool LLVMExtractVectorInts(llvm::Value *v, int64_t ret[], int *nElts)
Definition: llvmutil.cpp:690
llvm::Constant * LLVMInt32Vector(int32_t i)
Definition: llvmutil.cpp:308
static llvm::Value * lAssemble4Vector(const std::vector< CoalescedLoadOp > &loadOps, const int64_t offsets[4], llvm::Instruction *insertBefore)
Definition: opt.cpp:3501
static llvm::VectorType * Int32VectorType
Definition: llvmutil.h:95
Declarations related to optimization passes.
llvm::Value * element0
Definition: opt.cpp:3066
static llvm::Pass * CreateReplaceStdlibShiftPass()
Definition: opt.cpp:5041
std::vector< MaskInstruction > maskInstructions
Definition: opt.cpp:941
bool forceAlignedMemory
Definition: ispc.h:466
static void lCoalescePerfInfo(const std::vector< llvm::CallInst *> &coalesceGroup, const std::vector< CoalescedLoadOp > &loadOps)
Definition: opt.cpp:3219
static llvm::Type * FloatVectorPointerType
Definition: llvmutil.h:104
bool runOnModule(llvm::Module &m)
Definition: opt.cpp:4463
BlendInstruction * matchingBlendInstruction(llvm::Function *function)
Definition: opt.cpp:1182
static char ID
Definition: opt.cpp:3030
static bool lVectorIs32BitInts(llvm::Value *v)
Definition: opt.cpp:1977
static bool lGetSourcePosFromMetadata(const llvm::Instruction *inst, SourcePos *pos)
Definition: opt.cpp:246
IsCompileTimeConstantPass(bool last=false)
Definition: opt.cpp:4302
static bool lGetMask(llvm::Value *factor, uint64_t *mask)
Definition: opt.cpp:381
static llvm::Type * Int16VectorPointerType
Definition: llvmutil.h:101
bool runOnBasicBlock(llvm::BasicBlock &BB)
Definition: opt.cpp:1312
const char * getPassName() const
Definition: opt.cpp:4974
static bool lIsIntegerSplat(llvm::Value *v, int *splat)
Definition: opt.cpp:1758
MakeInternalFuncsStaticPass(bool last=false)
Definition: opt.cpp:4493
static llvm::Pass * CreateReplacePseudoMemoryOpsPass()
Definition: opt.cpp:4281
static llvm::Type * Int16Type
Definition: llvmutil.h:76
static llvm::Type * DoubleVectorPointerType
Definition: llvmutil.h:105
bool run(llvm::Module &M)
Definition: opt.cpp:459
llvm::Constant * LLVMFalse
Definition: llvmutil.cpp:91
static bool lInstructionMayWriteToMemory(llvm::Instruction *inst)
Definition: opt.cpp:3826
bool runOnBasicBlock(llvm::BasicBlock &BB)
Definition: opt.cpp:2963
static llvm::Pass * CreateIntrinsicsOptPass()
Definition: opt.cpp:1191
bool disableCoalescing
Definition: ispc.h:527
static llvm::Pass * CreatePeepholePass()
Definition: opt.cpp:4954
llvm::PassManager PM
Definition: opt.cpp:467
static bool lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst)
Definition: opt.cpp:2400
#define Assert(expr)
Definition: ispc.h:163
static llvm::VectorType * Int1VectorType
Definition: llvmutil.h:92
static llvm::Instruction * lGEPInst(llvm::Value *ptr, llvm::Value *offset, const char *name, llvm::Instruction *insertBefore)
Definition: opt.cpp:334
static bool lIsUndef(llvm::Value *value)
Definition: opt.cpp:980
static void lCopyMetadata(llvm::Value *vto, const llvm::Instruction *from)
Definition: opt.cpp:213
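lCopyMetadata() keeps source-position metadata attached when a pass replaces one instruction with another. A sketch of that pattern using the stock LLVM metadata API (illustrative; the routine at opt.cpp:213 may differ in details):

    // Copy every metadata kind from `from` onto `vto`, if `vto` is an instruction.
    #include <llvm/ADT/SmallVector.h>
    #include <llvm/IR/Instruction.h>
    #include <llvm/IR/Metadata.h>

    static void copyAllMetadata(llvm::Value *vto, const llvm::Instruction *from) {
        llvm::Instruction *to = llvm::dyn_cast<llvm::Instruction>(vto);
        if (to == NULL)
            return; // nothing to attach metadata to
        llvm::SmallVector<std::pair<unsigned, llvm::MDNode *>, 8> metadata;
        from->getAllMetadata(metadata);
        for (unsigned i = 0; i < metadata.size(); ++i)
            to->setMetadata(metadata[i].first, metadata[i].second);
    }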
Header file with declarations for symbol and symbol table classes.
static llvm::Pass * CreateDebugPassFile(int number, llvm::StringRef name)
Definition: opt.cpp:4473
std::set< int > debug_stages
Definition: ispc.h:577
bool disableMaskAllOnOptimizations
Definition: ispc.h:471
int level
Definition: ispc.h:432
llvm::ConstantInt * LLVMInt32(int32_t i)
Definition: llvmutil.cpp:228
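LLVMInt32() here and the LLVMInt32Vector() entry earlier in this index are the llvmutil constant builders the passes reach for when rewriting memory operations. A small, hypothetical usage sketch (the values 16 and 0 are arbitrary):

    // Hypothetical example of the constant-building helpers.
    #include "llvmutil.h"

    static void buildExampleConstants(llvm::Value **alignOut,
                                      llvm::Constant **zeroVecOut) {
        *alignOut = LLVMInt32(16);        // scalar i32 constant 16
        *zeroVecOut = LLVMInt32Vector(0); // <N x i32> with every lane set to 0
    }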
llvm::Module * module
Definition: module.h:156
bool matchesMaskInstruction(llvm::Function *function)
Definition: opt.cpp:1173
const char * getPassName() const
Definition: opt.cpp:4394
static llvm::Type * Int8VectorPointerType
Definition: llvmutil.h:100
Globals * g
Definition: ispc.cpp:101
bool disableGatherScatterOptimizations
Definition: ispc.h:505
bool debugPrint
Definition: ispc.h:568
static llvm::VectorType * Int8VectorType
Definition: llvmutil.h:93
bool LLVMVectorValuesAllEqual(llvm::Value *v, llvm::Value **splat=NULL)
Definition: llvmutil.cpp:1061
std::string sanitize(std::string in)
Definition: opt.cpp:4447
static uint64_t lConstElementsToMask(const llvm::SmallVector< llvm::Constant *, ISPC_MAX_NVEC > &elements)
Definition: opt.cpp:349
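lConstElementsToMask() folds a fully constant mask vector down to a per-lane bitmask so the passes can test for all-on or all-off execution. A simplified sketch, assuming only ConstantInt lanes (the real opt.cpp:349 routine handles more constant kinds):

    // Simplified, illustrative version; not the exact opt.cpp code.
    #include "ispc.h" // ISPC_MAX_NVEC
    #include <stdint.h>
    #include <llvm/ADT/SmallVector.h>
    #include <llvm/IR/Constants.h>

    static uint64_t constElementsToMaskSketch(
            const llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> &elements) {
        uint64_t mask = 0;
        for (unsigned i = 0; i < elements.size(); ++i) {
            llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(elements[i]);
            if (ci != NULL && ci->getZExtValue() != 0)
                mask |= (1ull << i); // lane i of the execution mask is on
        }
        return mask;
    }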
llvm::Constant * LLVMTrue
Definition: llvmutil.cpp:90
static llvm::Pass * CreateInstructionSimplifyPass()
Definition: opt.cpp:1336
static llvm::Pass * CreateDebugPass(char *output)
Definition: opt.cpp:4413
static llvm::Value * simplifyBoolVec(llvm::Value *value)
Definition: opt.cpp:1223
static llvm::Pass * CreateIsCompileTimeConstantPass(bool isLastTry)
Definition: opt.cpp:4378
llvm::Value * element1
Definition: opt.cpp:3066
static llvm::VectorType * FloatVectorType
Definition: llvmutil.h:97
const char * getPassName() const
Definition: opt.cpp:4305
static bool lReplacePseudoGS(llvm::CallInst *callInst)
Definition: opt.cpp:4101
static bool lReplacePseudoMaskedStore(llvm::CallInst *callInst)
Definition: opt.cpp:4050
static llvm::Type * Int64Type
Definition: llvmutil.h:78
static llvm::Type * Int8Type
Definition: llvmutil.h:75
static llvm::VectorType * Int64VectorType
Definition: llvmutil.h:96
Header file with declarations for various LLVM utility routines.
static bool simplifySelect(llvm::SelectInst *selectInst, llvm::BasicBlock::iterator iter)
Definition: opt.cpp:1261
static llvm::Value * lGetBasePointer(llvm::Value *v, llvm::Instruction *insertBefore, bool broadcastDetected)
Definition: opt.cpp:1413
static llvm::Value * lComputeBasePtr(llvm::CallInst *gatherInst, llvm::Instruction *insertBefore)
Definition: opt.cpp:3701
const char * getPassName() const
Definition: opt.cpp:4498
bool hasScatter() const
Definition: ispc.h:293
bool runOnModule(llvm::Module &m)
Definition: opt.cpp:4406
BlendInstruction(llvm::Function *f, uint64_t ao, int o0, int o1, int of)
Definition: opt.cpp:947
bool unrollLoops
Definition: ispc.h:446
DebugPass(char *output)
Definition: opt.cpp:4391
llvm::Value * LLVMExtractFirstVectorElement(llvm::Value *v)
Definition: llvmutil.cpp:1472
const char * getPassName() const
Definition: opt.cpp:4007
Representation of a range of positions in a source file.
Definition: ispc.h:131
void getAnalysisUsage(llvm::AnalysisUsage &AU) const
Definition: opt.cpp:4495
static char ID
Definition: opt.cpp:4390
static char ID
Definition: opt.cpp:934
static char ID
Definition: opt.cpp:1353
static std::vector< CoalescedLoadOp > lSplit8WideLoads(const std::vector< CoalescedLoadOp > &loadOps, llvm::Instruction *insertBefore)
Definition: opt.cpp:3357
const char * LLVMGetName(llvm::Value *v, const char *)
Definition: llvmutil.cpp:1518
static llvm::Pass * CreateGatherCoalescePass()
Definition: opt.cpp:3992
void LLVMDumpValue(llvm::Value *v)
Definition: llvmutil.cpp:1362
bool disableHandlePseudoMemoryOps
Definition: ispc.h:477
bool force32BitAddressing
Definition: ispc.h:452
static char ID
Definition: opt.cpp:1213
static llvm::Pass * CreateMakeInternalFuncsStaticPass()
Definition: opt.cpp:4652
bool hasGather() const
Definition: ispc.h:291
bool runOnFunction(llvm::Function &F)
Definition: opt.cpp:5133
void Warning(SourcePos p, const char *format,...) PRINTF_FUNC
Definition: util.cpp:375
static llvm::PointerType * VoidPointerType
Definition: llvmutil.h:71
int getVectorWidth() const
Definition: ispc.h:279
llvm::Value * load
Definition: opt.cpp:3062
#define FATAL(message)
Definition: util.h:112
bool doInitialization(llvm::Module &m)
Definition: opt.cpp:4468
const char * getPassName() const
Definition: opt.cpp:4662
bool runOnBasicBlock(llvm::BasicBlock &BB)
Definition: opt.cpp:3850
static llvm::Type * Int64VectorPointerType
Definition: llvmutil.h:103
static llvm::Type * Int32Type
Definition: llvmutil.h:77
const llvm::DataLayout * getDataLayout() const
Definition: ispc.h:258
#define PTYPE(p)
Definition: llvmutil.h:55
#define ISPC_MAX_NVEC
Definition: ispc.h:66
bool runOnBasicBlock(llvm::BasicBlock &BB)
Definition: opt.cpp:4316
static bool lOffsets32BitSafe(llvm::Value **variableOffsetPtr, llvm::Value **constOffsetPtr, llvm::Instruction *insertBefore)
Definition: opt.cpp:1993
static char ID
Definition: opt.cpp:4970
#define DEBUG_END_PASS(NAME)
Definition: opt.cpp:187
bool runOnBasicBlock(llvm::BasicBlock &BB)
Definition: opt.cpp:991
void add(llvm::Pass *P, int stage)
Definition: opt.cpp:474
ISA getISA() const
Definition: ispc.h:263
bool dumpFile
Definition: ispc.h:580
llvm::Value * LLVMFlattenInsertChain(llvm::Value *inst, int vectorWidth, bool compare=true, bool undef=true, bool searchFirstUndef=false)
Definition: llvmutil.cpp:565
static bool lGSToGSBaseOffsets(llvm::CallInst *callInst)
Definition: opt.cpp:2084
const char * getPassName() const
Definition: opt.cpp:1357
IntrinsicsOpt()
Definition: opt.cpp:925
CoalescedLoadOp(int64_t s, int c)
Definition: opt.cpp:3047
int64_t start
Definition: opt.cpp:3056
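The CoalescedLoadOp constructor above, together with the start, load, and element0/element1 entries scattered through this index, describes the bookkeeping record used by the gather-coalescing pass. Pieced together from those entries alone, a plausible shape is sketched below (the count field name and all comments are assumptions; the real struct in opt.cpp may differ):

    // Reconstructed sketch from index entries; not copied from opt.cpp.
    #include <stdint.h>
    #include <llvm/IR/Value.h>

    struct CoalescedLoadOpSketch {
        CoalescedLoadOpSketch(int64_t s, int c) : start(s), count(c) {
            load = element0 = element1 = NULL;
        }
        int64_t start;         // starting offset covered by this coalesced load
        int count;             // number of elements loaded (name is an assumption)
        llvm::Value *load;     // the load instruction, once it has been emitted
        llvm::Value *element0; // individual extracted values, filled in later
        llvm::Value *element1; //   (assumed purpose, based on the field names)
    };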
bool runOnBasicBlock(llvm::BasicBlock &BB)
Definition: opt.cpp:4989
static void lEmitLoads(llvm::Value *basePtr, std::vector< CoalescedLoadOp > &loadOps, int elementSize, llvm::Instruction *insertBefore)
Definition: opt.cpp:3299
static llvm::VectorType * DoubleVectorType
Definition: llvmutil.h:98
static void lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset, llvm::Value **variableOffset, llvm::Instruction *insertBefore)
Definition: opt.cpp:1629
MaskStatus
Definition: opt.cpp:424
bool runOnBasicBlock(llvm::BasicBlock &BB)
Definition: opt.cpp:4256