Intel SPMD Program Compiler  1.9.1
opt.cpp
1 /*
2  Copyright (c) 2010-2016, Intel Corporation
3  All rights reserved.
4 
5  Redistribution and use in source and binary forms, with or without
6  modification, are permitted provided that the following conditions are
7  met:
8 
9  * Redistributions of source code must retain the above copyright
10  notice, this list of conditions and the following disclaimer.
11 
12  * Redistributions in binary form must reproduce the above copyright
13  notice, this list of conditions and the following disclaimer in the
14  documentation and/or other materials provided with the distribution.
15 
16  * Neither the name of Intel Corporation nor the names of its
17  contributors may be used to endorse or promote products derived from
18  this software without specific prior written permission.
19 
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
22  IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
25  OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33 
34 /** @file opt.cpp
35  @brief Implementations of various ispc optimization passes that operate
36  on the LLVM IR.
37 */
38 
39 #include "opt.h"
40 #include "ctx.h"
41 #include "sym.h"
42 #include "module.h"
43 #include "util.h"
44 #include "llvmutil.h"
45 
46 #include <stdio.h>
47 #include <map>
48 #include <set>
49 
50 #include <llvm/Pass.h>
51 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
52  #include <llvm/Module.h>
53  #include <llvm/Instructions.h>
54  #include <llvm/Intrinsics.h>
55  #include <llvm/Function.h>
56  #include <llvm/BasicBlock.h>
57  #include <llvm/Constants.h>
58 #ifdef ISPC_NVPTX_ENABLED
59  #include <llvm/InlineAsm.h>
60 #endif /* ISPC_NVPTX_ENABLED */
61 #else // LLVM 3.3+
62  #include <llvm/IR/Module.h>
63  #include <llvm/IR/Instructions.h>
64  #include <llvm/IR/Intrinsics.h>
65  #include <llvm/IR/Function.h>
66  #include <llvm/IR/BasicBlock.h>
67  #include <llvm/IR/Constants.h>
68 #ifdef ISPC_NVPTX_ENABLED
69  #include <llvm/IR/InlineAsm.h>
70 #endif /* ISPC_NVPTX_ENABLED */
71 #endif
72 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_4 // LLVM 3.4+
73  #include <llvm/Transforms/Instrumentation.h>
74 #endif
75 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
76  #include "llvm/PassManager.h"
77 #else // LLVM 3.7+
78  #include "llvm/IR/LegacyPassManager.h"
79 #endif
80 #include <llvm/PassRegistry.h>
81 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_5 // LLVM 3.5+
82  #include <llvm/IR/Verifier.h>
83  #include <llvm/IR/IRPrintingPasses.h>
84  #include <llvm/IR/PatternMatch.h>
85  #include <llvm/IR/DebugInfo.h>
86 #else // < 3.5
87  #include <llvm/Analysis/Verifier.h>
88  #include <llvm/Assembly/PrintModulePass.h>
89  #include <llvm/Support/PatternMatch.h>
90  #include <llvm/DebugInfo.h>
91 #endif
92 #include <llvm/Analysis/ConstantFolding.h>
93 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
94  #include <llvm/Target/TargetLibraryInfo.h>
95 #else // LLVM 3.7+
96  #include <llvm/Analysis/TargetLibraryInfo.h>
97 #endif
98 #include <llvm/ADT/Triple.h>
99 #include <llvm/ADT/SmallSet.h>
100 #include <llvm/Transforms/Scalar.h>
101 #include <llvm/Transforms/IPO.h>
102 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
103 #include <llvm/Target/TargetOptions.h>
104 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
105  #include <llvm/DataLayout.h>
106 #else // LLVM 3.3+
107  #include <llvm/IR/DataLayout.h>
108  #include <llvm/Analysis/TargetTransformInfo.h>
109 #endif
110 #include <llvm/Target/TargetMachine.h>
111 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_8 // LLVM 3.8+
112  #include <llvm/Analysis/BasicAliasAnalysis.h>
113  #include "llvm/Analysis/TypeBasedAliasAnalysis.h"
114 #endif
115 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_9 // LLVM 3.9+
116  #include "llvm/Transforms/IPO/FunctionAttrs.h"
117  #include "llvm/Transforms/Scalar/GVN.h"
118 #endif
119 #include <llvm/Analysis/Passes.h>
120 #include <llvm/Support/raw_ostream.h>
121 #include <llvm/Support/Dwarf.h>
122 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_6
123  #include <llvm/IR/IntrinsicInst.h>
124 #endif
125 #ifdef ISPC_IS_LINUX
126  #include <alloca.h>
127 #elif defined(ISPC_IS_WINDOWS)
128  #include <malloc.h>
129  #ifndef __MINGW32__
130  #define alloca _alloca
131  #endif
132 #endif // ISPC_IS_WINDOWS
133 
134 #ifndef PRId64
135 #define PRId64 "lld"
136 #endif
137 #ifndef PRIu64
138 #define PRIu64 "llu"
139 #endif
140 
141 static llvm::Pass *CreateIntrinsicsOptPass();
142 static llvm::Pass *CreateInstructionSimplifyPass();
143 static llvm::Pass *CreatePeepholePass();
144 
145 static llvm::Pass *CreateImproveMemoryOpsPass();
146 static llvm::Pass *CreateGatherCoalescePass();
147 static llvm::Pass *CreateReplacePseudoMemoryOpsPass();
148 
149 static llvm::Pass *CreateIsCompileTimeConstantPass(bool isLastTry);
150 static llvm::Pass *CreateMakeInternalFuncsStaticPass();
151 
152 static llvm::Pass *CreateDebugPass(char * output);
153 
154 static llvm::Pass *CreateReplaceStdlibShiftPass();
155 
156 static llvm::Pass *CreateFixBooleanSelectPass();
157 #ifdef ISPC_NVPTX_ENABLED
158 static llvm::Pass *CreatePromoteLocalToPrivatePass();
159 #endif /* ISPC_NVPTX_ENABLED */
160 
161 #define DEBUG_START_PASS(NAME) \
162  if (g->debugPrint && \
163  (getenv("FUNC") == NULL || \
164  !strncmp(bb.getParent()->getName().str().c_str(), getenv("FUNC"), \
165  strlen(getenv("FUNC"))))) { \
166  fprintf(stderr, "Start of " NAME "\n"); \
167  fprintf(stderr, "---------------\n"); \
168  bb.dump(); \
169  fprintf(stderr, "---------------\n\n"); \
170  } else /* eat semicolon */
171 
172 #define DEBUG_END_PASS(NAME) \
173  if (g->debugPrint && \
174  (getenv("FUNC") == NULL || \
175  !strncmp(bb.getParent()->getName().str().c_str(), getenv("FUNC"), \
176  strlen(getenv("FUNC"))))) { \
177  fprintf(stderr, "End of " NAME " %s\n", modifiedAny ? "** CHANGES **" : ""); \
178  fprintf(stderr, "---------------\n"); \
179  bb.dump(); \
180  fprintf(stderr, "---------------\n\n"); \
181  } else /* eat semicolon */
182 
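// Illustrative usage sketch (added for exposition, not part of the original
// file): a BasicBlockPass typically brackets its runOnBasicBlock() body with
// these two macros, exactly as IntrinsicsOpt::runOnBasicBlock() does further
// below.  Both macros expect a llvm::BasicBlock reference named 'bb' and, for
// DEBUG_END_PASS, a bool 'modifiedAny' to be in scope; the pass name below is
// hypothetical.
//
//   bool SomeHypotheticalPass::runOnBasicBlock(llvm::BasicBlock &bb) {
//       DEBUG_START_PASS("SomeHypotheticalPass");
//       bool modifiedAny = false;
//       // ... rewrite instructions in bb, setting modifiedAny as needed ...
//       DEBUG_END_PASS("SomeHypotheticalPass");
//       return modifiedAny;
//   }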
183 
184 
185 ///////////////////////////////////////////////////////////////////////////
186 
187 
188 /** This utility routine copies the metadata (if any) attached to the
189  'from' instruction in the IR to the 'to' instruction.
190 
191  For flexibility, this function takes an llvm::Value rather than an
192  llvm::Instruction for the 'to' parameter; in some places in the code
193  below, we start out storing a value in an llvm::Value and only later
194  store instructions there. If an llvm::Value that is not an instruction
195  is passed to this routine, it just returns without doing anything; if it
196  is in fact an llvm::Instruction, then the metadata is copied to it.
197  */
198 static void
199 lCopyMetadata(llvm::Value *vto, const llvm::Instruction *from) {
200  llvm::Instruction *to = llvm::dyn_cast<llvm::Instruction>(vto);
201  if (!to)
202  return;
203 
204  llvm::SmallVector<std::pair<unsigned int, llvm::MDNode *>, 8> metadata;
205 
206  from->getAllMetadata(metadata);
207  for (unsigned int i = 0; i < metadata.size(); ++i)
208  to->setMetadata(metadata[i].first, metadata[i].second);
209 }
210 
211 
212 /** We have a protocol with the front-end LLVM IR code generation process
213  that allows us to encode the source file position that corresponds with
214  instructions. (For example, this allows us to issue performance
215  warnings related to things like scatter and gather after optimization
216  has been performed, so that we aren't warning about scatters and
217  gathers that have been improved to stores and loads by optimization
218  passes.) Note that this is slightly redundant with the source file
219  position encoding generated for debugging symbols; however, we don't
220  always generate debugging information, whereas we do always generate this
221  position data.
222 
223  This function finds the SourcePos that the metadata in the instruction
224  (if present) corresponds to. See the implementation of
225  FunctionEmitContext::addGSMetadata(), which encodes the source position during
226  code generation.
227 
228  @param inst Instruction to try to find the source position of
229  @param pos Output variable in which to store the position
230  @returns True if source file position metadata was present and *pos
231  has been set. False otherwise.
232 */
233 static bool
234 lGetSourcePosFromMetadata(const llvm::Instruction *inst, SourcePos *pos) {
235  llvm::MDNode *filename = inst->getMetadata("filename");
236  llvm::MDNode *first_line = inst->getMetadata("first_line");
237  llvm::MDNode *first_column = inst->getMetadata("first_column");
238  llvm::MDNode *last_line = inst->getMetadata("last_line");
239  llvm::MDNode *last_column = inst->getMetadata("last_column");
240 
241  if (!filename || !first_line || !first_column || !last_line || !last_column)
242  return false;
243 
244  // All of these asserts are things that FunctionEmitContext::addGSMetadata() is
245  // expected to have done in its operation
246  llvm::MDString *str = llvm::dyn_cast<llvm::MDString>(filename->getOperand(0));
247  Assert(str);
248  llvm::ConstantInt *first_lnum =
249 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5
250  llvm::dyn_cast<llvm::ConstantInt>(first_line->getOperand(0));
251 #else /* LLVM 3.6+ */
252  llvm::mdconst::extract<llvm::ConstantInt>(first_line->getOperand(0));
253 #endif
254  Assert(first_lnum);
255 
256  llvm::ConstantInt *first_colnum =
257 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5
258  llvm::dyn_cast<llvm::ConstantInt>(first_column->getOperand(0));
259 #else /* LLVM 3.6+ */
260  llvm::mdconst::extract<llvm::ConstantInt>(first_column->getOperand(0));
261 #endif
262  Assert(first_colnum);
263 
264  llvm::ConstantInt *last_lnum =
265 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5
266  llvm::dyn_cast<llvm::ConstantInt>(last_line->getOperand(0));
267 #else /* LLVM 3.6+ */
268  llvm::mdconst::extract<llvm::ConstantInt>(last_line->getOperand(0));
269 #endif
270  Assert(last_lnum);
271 
272  llvm::ConstantInt *last_colnum =
273 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5
274  llvm::dyn_cast<llvm::ConstantInt>(last_column->getOperand(0));
275 #else /* LLVM 3.6+ */
276  llvm::mdconst::extract<llvm::ConstantInt>(last_column->getOperand(0));
277 #endif
278  Assert(last_colnum);
279 
280  *pos = SourcePos(str->getString().data(), (int)first_lnum->getZExtValue(),
281  (int)first_colnum->getZExtValue(), (int)last_lnum->getZExtValue(),
282  (int)last_colnum->getZExtValue());
283  return true;
284 }
285 
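// Illustrative sketch (added for exposition, not part of the original file):
// the rough textual-IR shape that lGetSourcePosFromMetadata() expects.  Only
// the metadata kind names ("filename", "first_line", ...) come from the code
// above; the instruction, callee, and metadata node numbers are made up.
//
//   %v = call <8 x i32> @__pseudo_gather...(...), !filename !10,
//            !first_line !11, !first_column !12, !last_line !13, !last_column !14
//   !10 = !{!"foo.ispc"}
//   !11 = !{i32 42}      ; first_line
//   !12 = !{i32 7}       ; first_column
//   !13 = !{i32 42}      ; last_line
//   !14 = !{i32 19}      ; last_column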
286 
287 static llvm::Instruction *
288 lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
289  const char *name, llvm::Instruction *insertBefore = NULL) {
290  llvm::Value *args[2] = { arg0, arg1 };
291  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[2]);
292  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
293 }
294 
295 
296 static llvm::Instruction *
297 lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
298  llvm::Value *arg2, const char *name,
299  llvm::Instruction *insertBefore = NULL) {
300  llvm::Value *args[3] = { arg0, arg1, arg2 };
301  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[3]);
302  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
303 }
304 
305 
306 static llvm::Instruction *
307 lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
308  llvm::Value *arg2, llvm::Value *arg3, const char *name,
309  llvm::Instruction *insertBefore = NULL) {
310  llvm::Value *args[4] = { arg0, arg1, arg2, arg3 };
311  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[4]);
312  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
313 }
314 
315 static llvm::Instruction *
316 lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
317  llvm::Value *arg2, llvm::Value *arg3, llvm::Value *arg4,
318  const char *name, llvm::Instruction *insertBefore = NULL) {
319  llvm::Value *args[5] = { arg0, arg1, arg2, arg3, arg4 };
320  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[5]);
321  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
322 }
323 
324 
325 static llvm::Instruction *
326 lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
327  llvm::Value *arg2, llvm::Value *arg3, llvm::Value *arg4,
328  llvm::Value *arg5, const char *name,
329  llvm::Instruction *insertBefore = NULL) {
330  llvm::Value *args[6] = { arg0, arg1, arg2, arg3, arg4, arg5 };
331  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[6]);
332  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
333 }
334 
335 
336 static llvm::Instruction *
337 lGEPInst(llvm::Value *ptr, llvm::Value *offset, const char *name,
338  llvm::Instruction *insertBefore) {
339  llvm::Value *index[1] = { offset };
340  llvm::ArrayRef<llvm::Value *> arrayRef(&index[0], &index[1]);
341 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
342  return llvm::GetElementPtrInst::Create(ptr, arrayRef, name,
343  insertBefore);
344 #else // LLVM 3.7+
345  return llvm::GetElementPtrInst::Create(PTYPE(ptr), ptr, arrayRef,
346  name, insertBefore);
347 #endif
348 }
349 
350 
351 /** Given a vector of constant values (int, float, or bool) representing an
352  execution mask, convert it to a bitvector where the 0th bit corresponds
353  to the first vector value and so forth.
354 */
355 static uint64_t
356 lConstElementsToMask(const llvm::SmallVector<llvm::Constant *,
357  ISPC_MAX_NVEC> &elements) {
358  Assert(elements.size() <= 64);
359 
360  uint64_t mask = 0;
361  for (unsigned int i = 0; i < elements.size(); ++i) {
362  llvm::APInt intMaskValue;
363  // SSE has the "interesting" approach of encoding blending
364  // masks as <n x float>.
365  llvm::ConstantFP *cf = llvm::dyn_cast<llvm::ConstantFP>(elements[i]);
366  if (cf != NULL) {
367  llvm::APFloat apf = cf->getValueAPF();
368  intMaskValue = apf.bitcastToAPInt();
369  }
370  else {
371  // Otherwise get it as an int
372  llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(elements[i]);
373  Assert(ci != NULL); // vs return -1 if NULL?
374  intMaskValue = ci->getValue();
375  }
376  // Is the high-bit set? If so, OR in the appropriate bit in
377  // the result mask
378  if (intMaskValue.countLeadingOnes() > 0)
379  mask |= (1ull << i);
380  }
381  return mask;
382 }
383 
384 
385 /** Given an llvm::Value representing a vector mask, see if the value is a
386  constant. If so, return true and set *bits to be the integer mask
387  found by taking the high bits of the mask values in turn and
388  concatenating them into a single integer. In other words, given the
389  4-wide mask: < 0xffffffff, 0, 0, 0xffffffff >, we have 0b1001 = 9.
390  */
391 static bool
392 lGetMask(llvm::Value *factor, uint64_t *mask) {
393  llvm::ConstantDataVector *cdv = llvm::dyn_cast<llvm::ConstantDataVector>(factor);
394  if (cdv != NULL) {
395  llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
396  for (int i = 0; i < (int)cdv->getNumElements(); ++i)
397  elements.push_back(cdv->getElementAsConstant(i));
398  *mask = lConstElementsToMask(elements);
399  return true;
400  }
401 
402  llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(factor);
403  if (cv != NULL) {
404  llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
405  for (int i = 0; i < (int)cv->getNumOperands(); ++i) {
406  llvm::Constant *c =
407  llvm::dyn_cast<llvm::Constant>(cv->getOperand(i));
408  if (c == NULL)
409  return false;
410  if (llvm::isa<llvm::ConstantExpr>(cv->getOperand(i)) )
411  return false; // We can not handle constant expressions here
412  elements.push_back(c);
413  }
414  *mask = lConstElementsToMask(elements);
415  return true;
416  }
417  else if (llvm::isa<llvm::ConstantAggregateZero>(factor)) {
418  *mask = 0;
419  return true;
420  }
421  else {
422 #if 0
423  llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(factor);
424  if (ce != NULL) {
425  llvm::TargetMachine *targetMachine = g->target->GetTargetMachine();
426  const llvm::TargetData *td = targetMachine->getTargetData();
427  llvm::Constant *c = llvm::ConstantFoldConstantExpression(ce, td);
428  c->dump();
429  factor = c;
430  }
431  // else we should be able to handle it above...
432  Assert(!llvm::isa<llvm::Constant>(factor));
433 #endif
434  return false;
435  }
436 }
437 
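#if 0
// Worked example (added for exposition, not part of the original file): how
// lConstElementsToMask()/lGetMask() pack per-lane high bits into an integer,
// for the 4-wide mask < 0xffffffff, 0, 0, 0xffffffff > discussed below.
// Plain C++ with a hypothetical helper name; no LLVM types involved.
static uint64_t lExampleMaskPacking() {
    const uint32_t lanes[4] = { 0xffffffffu, 0u, 0u, 0xffffffffu };
    uint64_t mask = 0;
    for (int i = 0; i < 4; ++i)
        if (lanes[i] & 0x80000000u)   // is the lane's high (sign) bit set?
            mask |= (1ull << i);
    return mask;                      // 0b1001 == 9
}
#endif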
438 
439 enum MaskStatus { ALL_ON, ALL_OFF, MIXED, UNKNOWN };
440 
441 /** Determines if the given mask value is all on, all off, mixed, or
442  unknown at compile time.
443 */
444 static MaskStatus
445 lGetMaskStatus(llvm::Value *mask, int vecWidth = -1) {
446  uint64_t bits;
447  if (lGetMask(mask, &bits) == false)
448  return UNKNOWN;
449 
450  if (bits == 0)
451  return ALL_OFF;
452 
453  if (vecWidth == -1)
454  vecWidth = g->target->getVectorWidth();
455  Assert(vecWidth <= 64);
456 
457  for (int i = 0; i < vecWidth; ++i) {
458  if ((bits & (1ull << i)) == 0)
459  return MIXED;
460  }
461  return ALL_ON;
462 }
463 
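// Quick reference (added for exposition, not part of the original file): for
// a hypothetical 8-wide target, lGetMaskStatus() classifies masks as follows.
//   bits == 0xff -> ALL_ON   (every lane's bit is set)
//   bits == 0x00 -> ALL_OFF
//   bits == 0x0f -> MIXED    (lanes 4..7 are off)
// A mask whose value lGetMask() can't determine at compile time -> UNKNOWN.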
464 
465 ///////////////////////////////////////////////////////////////////////////
466 // This is a wrapper around the llvm::PassManager class. It duplicates the PassManager run()
467 // function and extends the PassManager add() function with some checks and debug passes.
468 // The wrapper can control:
469 // - whether we want to switch off the optimization with a given number;
470 // - whether we want to dump the LLVM IR after the optimization with a given number;
471 // - whether we want to generate LLVM IR debug info for gdb after the optimization with a given number.
472 class DebugPassManager {
473 public:
474  DebugPassManager(): number(0){}
475  void add(llvm::Pass * P, int stage);
476  bool run(llvm::Module& M) {return PM.run(M);}
477 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
478  llvm::PassManager& getPM() {return PM;}
479 #else /* LLVM 3.7+ */
480  llvm::legacy::PassManager& getPM() {return PM;}
481 #endif
482 private:
483 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
484  llvm::PassManager PM;
485 #else /* LLVM 3.7+ */
486  llvm::legacy::PassManager PM;
487 #endif
488  int number;
489 };
490 
491 void
492 DebugPassManager::add(llvm::Pass * P, int stage = -1) {
493  // determine the number of this optimization stage
494  if (stage == -1) {
495  number++;
496  }
497  else {
498  number = stage;
499  }
500  if (g->off_stages.find(number) == g->off_stages.end()) {
501  // adding optimization (not switched off)
502  PM.add(P);
503  if (g->debug_stages.find(number) != g->debug_stages.end()) {
504  // adding dump of LLVM IR after optimization
505  char buf[100];
506  sprintf(buf, "\n\n*****LLVM IR after phase %d: %s*****\n\n",
507  number, P->getPassName());
508  PM.add(CreateDebugPass(buf));
509  }
510 
511 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_4 || ISPC_LLVM_VERSION == ISPC_LLVM_3_5 // only 3.4 and 3.5
512  if (g->debugIR == number) {
513  // add generation of LLVM IR debug info after this optimization
514  char buf[100];
515  sprintf(buf, "Debug_IR_after_%d_phase.bc", number);
516  PM.add(llvm::createDebugIRPass(true, true, ".", buf));
517  }
518 #endif
519  }
520 }
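#if 0
// Minimal usage sketch (added for exposition, not part of the original file;
// the function name is hypothetical).  Optimize() below drives the wrapper in
// exactly this way: a pass added with an explicit stage number resets the
// counter, later passes without one simply increment it, and g->off_stages /
// g->debug_stages control whether a stage is skipped or followed by an IR dump.
static void lDebugPassManagerExample(llvm::Module &M) {
    DebugPassManager examplePM;
    examplePM.add(llvm::createVerifierPass(), 0);        // stage 0
    examplePM.add(llvm::createCFGSimplificationPass());  // stage 1 (previous + 1)
    examplePM.run(M);
}
#endif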
521 ///////////////////////////////////////////////////////////////////////////
522 
523 void
524 Optimize(llvm::Module *module, int optLevel) {
525  if (g->debugPrint) {
526  printf("*** Code going into optimization ***\n");
527  module->dump();
528  }
529  DebugPassManager optPM;
530  optPM.add(llvm::createVerifierPass(),0);
531 
532 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
533  llvm::TargetLibraryInfo *targetLibraryInfo =
534  new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple()));
535  optPM.add(targetLibraryInfo);
536 #else // LLVM 3.7+
537  optPM.add(new llvm::TargetLibraryInfoWrapperPass(llvm::Triple(module->getTargetTriple())));
538 #endif
539 
540 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_4
541  optPM.add(new llvm::DataLayout(*g->target->getDataLayout()));
542 #elif ISPC_LLVM_VERSION == ISPC_LLVM_3_5
543  optPM.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
544 #elif ISPC_LLVM_VERSION == ISPC_LLVM_3_6
545  llvm::DataLayoutPass *dlp= new llvm::DataLayoutPass();
546  dlp->doInitialization(*module);
547  optPM.add(dlp);
548 #endif // LLVM 3.7+ doesn't have DataLayoutPass anymore.
549 
550  llvm::TargetMachine *targetMachine = g->target->GetTargetMachine();
551 
552 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
553  optPM.add(new llvm::TargetTransformInfo(targetMachine->getScalarTargetTransformInfo(),
554  targetMachine->getVectorTargetTransformInfo()));
555 #elif ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
556  targetMachine->addAnalysisPasses(optPM.getPM());
557 #else // LLVM 3.7+
558  optPM.getPM().add(createTargetTransformInfoWrapperPass(targetMachine->getTargetIRAnalysis()));
559 #endif
560 
561  optPM.add(llvm::createIndVarSimplifyPass());
562 
563  if (optLevel == 0) {
564  // This is more or less the minimum set of optimizations that we
565  // need to do to generate code that will actually run. (We can't
566  // run absolutely no optimizations, since the front-end needs us to
567  // take the various __pseudo_* functions it has emitted and turn
568  // them into something that can actually execute.)
569  optPM.add(CreateImproveMemoryOpsPass(), 100);
570 #ifdef ISPC_NVPTX_ENABLED
571  if (g->opt.disableGatherScatterOptimizations == false &&
572  g->target->getVectorWidth() > 1)
573 #endif /* ISPC_NVPTX_ENABLED */
574  optPM.add(CreateImproveMemoryOpsPass(), 100);
575 
576  if (g->opt.disableHandlePseudoMemoryOps == false)
577  optPM.add(CreateReplacePseudoMemoryOpsPass());
578 
579  optPM.add(CreateIntrinsicsOptPass(), 102);
580  optPM.add(CreateIsCompileTimeConstantPass(true));
581  optPM.add(llvm::createFunctionInliningPass());
582  optPM.add(CreateMakeInternalFuncsStaticPass());
583  optPM.add(llvm::createCFGSimplificationPass());
584  optPM.add(llvm::createGlobalDCEPass());
585  }
586  else {
587  llvm::PassRegistry *registry = llvm::PassRegistry::getPassRegistry();
588  llvm::initializeCore(*registry);
589  llvm::initializeScalarOpts(*registry);
590  llvm::initializeIPO(*registry);
591  llvm::initializeAnalysis(*registry);
592 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_7
593  llvm::initializeIPA(*registry);
594 #endif
595  llvm::initializeTransformUtils(*registry);
596  llvm::initializeInstCombine(*registry);
597  llvm::initializeInstrumentation(*registry);
598  llvm::initializeTarget(*registry);
599 
600  optPM.add(llvm::createGlobalDCEPass(), 185);
601 
602  // Setup to use LLVM default AliasAnalysis
603  // Ideally, we would want to call:
604  // llvm::PassManagerBuilder pm_Builder;
605  // pm_Builder.OptLevel = optLevel;
606  // pm_Builder.addInitialAliasAnalysisPasses(optPM);
607  // but addInitialAliasAnalysisPasses() is a private function,
608  // so we explicitly add these passes here.
609  // This needs to be kept in sync with future LLVM changes.
610  // An alternative is to call populateFunctionPassManager()
611 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_7
612  optPM.add(llvm::createTypeBasedAliasAnalysisPass(), 190);
613  optPM.add(llvm::createBasicAliasAnalysisPass());
614 #else
615  optPM.add(llvm::createTypeBasedAAWrapperPass(), 190);
616  optPM.add(llvm::createBasicAAWrapperPass());
617 #endif
618  optPM.add(llvm::createCFGSimplificationPass());
619 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
620  optPM.add(llvm::createScalarReplAggregatesPass());
621 #else
622  optPM.add(llvm::createSROAPass());
623 #endif
624  optPM.add(llvm::createEarlyCSEPass());
625  optPM.add(llvm::createLowerExpectIntrinsicPass());
626 
627  // Early optimizations to try to reduce the total amount of code to
628  // work with if we can
629  optPM.add(llvm::createReassociatePass(), 200);
630  optPM.add(llvm::createConstantPropagationPass());
631  optPM.add(llvm::createDeadInstEliminationPass());
632  optPM.add(llvm::createCFGSimplificationPass());
633 
634  optPM.add(llvm::createPromoteMemoryToRegisterPass());
635  optPM.add(llvm::createAggressiveDCEPass());
636 
637 
638  if (g->opt.disableGatherScatterOptimizations == false &&
639  g->target->getVectorWidth() > 1) {
640  optPM.add(llvm::createInstructionCombiningPass(), 210);
641  optPM.add(CreateImproveMemoryOpsPass());
642  }
643  if (g->opt.disableMaskAllOnOptimizations == false) {
644  optPM.add(CreateIntrinsicsOptPass(), 215);
645  optPM.add(CreateInstructionSimplifyPass());
646  }
647  optPM.add(llvm::createDeadInstEliminationPass(), 220);
648 
649  // Max struct size threshold for scalar replacement is
650  // 1) 4 fields (r,g,b,w)
651  // 2) field size: vectorWidth * sizeof(float)
652 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
653  const int field_limit = 4;
654  int sr_threshold = g->target->getVectorWidth() * sizeof(float) * field_limit;
655 #endif
656 
657  // On to more serious optimizations
658 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
659  optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold));
660 #else
661  optPM.add(llvm::createSROAPass());
662 #endif
663  optPM.add(llvm::createInstructionCombiningPass());
664  optPM.add(llvm::createCFGSimplificationPass());
665  optPM.add(llvm::createPromoteMemoryToRegisterPass());
666  optPM.add(llvm::createGlobalOptimizerPass());
667  optPM.add(llvm::createReassociatePass());
668  optPM.add(llvm::createIPConstantPropagationPass());
669 
670 #ifdef ISPC_NVPTX_ENABLED
671  if (g->target->getISA() != Target::NVPTX)
672 #endif /* ISPC_NVPTX_ENABLED */
673  optPM.add(CreateReplaceStdlibShiftPass(),229);
674 
675  optPM.add(llvm::createDeadArgEliminationPass(),230);
676  optPM.add(llvm::createInstructionCombiningPass());
677  optPM.add(llvm::createCFGSimplificationPass());
678  optPM.add(llvm::createPruneEHPass());
679 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_9 // 3.9+
680  optPM.add(llvm::createPostOrderFunctionAttrsLegacyPass());
681  optPM.add(llvm::createReversePostOrderFunctionAttrsPass());
682 #elif ISPC_LLVM_VERSION == ISPC_LLVM_3_8 // 3.8
683  optPM.add(llvm::createPostOrderFunctionAttrsPass());
684  optPM.add(llvm::createReversePostOrderFunctionAttrsPass());
685 #else // 3.7 and earlier
686  optPM.add(llvm::createFunctionAttrsPass());
687 #endif
688  optPM.add(llvm::createFunctionInliningPass());
689  optPM.add(llvm::createConstantPropagationPass());
690  optPM.add(llvm::createDeadInstEliminationPass());
691  optPM.add(llvm::createCFGSimplificationPass());
692 
693  optPM.add(llvm::createArgumentPromotionPass());
694 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_3
695  // Starting from 3.4 this functionality was moved to
696  // InstructionCombiningPass. See r184459 for details.
697  optPM.add(llvm::createSimplifyLibCallsPass(), 240);
698 #endif
699  optPM.add(llvm::createAggressiveDCEPass());
700  optPM.add(llvm::createInstructionCombiningPass(), 241);
701  optPM.add(llvm::createJumpThreadingPass());
702  optPM.add(llvm::createCFGSimplificationPass());
703 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
704  optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold));
705 #else
706  optPM.add(llvm::createSROAPass());
707 #endif
708  optPM.add(llvm::createInstructionCombiningPass());
709  optPM.add(llvm::createTailCallEliminationPass());
710 
711  if (g->opt.disableMaskAllOnOptimizations == false) {
712  optPM.add(CreateIntrinsicsOptPass(), 250);
713  optPM.add(CreateInstructionSimplifyPass());
714  }
715 
716  if (g->opt.disableGatherScatterOptimizations == false &&
717  g->target->getVectorWidth() > 1) {
718  optPM.add(llvm::createInstructionCombiningPass(), 255);
719  optPM.add(CreateImproveMemoryOpsPass());
720 
721  if (g->opt.disableCoalescing == false &&
722  g->target->getISA() != Target::GENERIC) {
723  // It is important to run this here to make it easier to
724  // find matching gathers we can coalesce.
725  optPM.add(llvm::createEarlyCSEPass(), 260);
726  optPM.add(CreateGatherCoalescePass());
727  }
728  }
729 
730  optPM.add(llvm::createFunctionInliningPass(), 265);
731  optPM.add(llvm::createConstantPropagationPass());
732  optPM.add(CreateIntrinsicsOptPass());
733  optPM.add(CreateInstructionSimplifyPass());
734 
735  if (g->opt.disableGatherScatterOptimizations == false &&
736  g->target->getVectorWidth() > 1) {
737  optPM.add(llvm::createInstructionCombiningPass(), 270);
738  optPM.add(CreateImproveMemoryOpsPass());
739  }
740 
741  optPM.add(llvm::createIPSCCPPass(), 275);
742  optPM.add(llvm::createDeadArgEliminationPass());
743  optPM.add(llvm::createAggressiveDCEPass());
744  optPM.add(llvm::createInstructionCombiningPass());
745  optPM.add(llvm::createCFGSimplificationPass());
746 
747  if (g->opt.disableHandlePseudoMemoryOps == false) {
748  optPM.add(CreateReplacePseudoMemoryOpsPass(), 280);
749  }
750  optPM.add(CreateIntrinsicsOptPass(),281);
751  optPM.add(CreateInstructionSimplifyPass());
752 
753  optPM.add(llvm::createFunctionInliningPass());
754  optPM.add(llvm::createArgumentPromotionPass());
755 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
756  optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold, false));
757 #else
758  optPM.add(llvm::createSROAPass());
759 #endif
760  optPM.add(llvm::createInstructionCombiningPass());
761  optPM.add(CreateInstructionSimplifyPass());
762  optPM.add(llvm::createCFGSimplificationPass());
763  optPM.add(llvm::createReassociatePass());
764  optPM.add(llvm::createLoopRotatePass());
765  optPM.add(llvm::createLICMPass());
766  optPM.add(llvm::createLoopUnswitchPass(false));
767  optPM.add(llvm::createInstructionCombiningPass());
768  optPM.add(CreateInstructionSimplifyPass());
769  optPM.add(llvm::createIndVarSimplifyPass());
770  optPM.add(llvm::createLoopIdiomPass());
771  optPM.add(llvm::createLoopDeletionPass());
772  if (g->opt.unrollLoops) {
773  optPM.add(llvm::createLoopUnrollPass(), 300);
774  }
775  optPM.add(llvm::createGVNPass(), 301);
776 
777  optPM.add(CreateIsCompileTimeConstantPass(true));
778  optPM.add(CreateIntrinsicsOptPass());
779  optPM.add(CreateInstructionSimplifyPass());
780 
781  optPM.add(llvm::createMemCpyOptPass());
782  optPM.add(llvm::createSCCPPass());
783  optPM.add(llvm::createInstructionCombiningPass());
784  optPM.add(CreateInstructionSimplifyPass());
785  optPM.add(llvm::createJumpThreadingPass());
786  optPM.add(llvm::createCorrelatedValuePropagationPass());
787  optPM.add(llvm::createDeadStoreEliminationPass());
788  optPM.add(llvm::createAggressiveDCEPass());
789  optPM.add(llvm::createCFGSimplificationPass());
790  optPM.add(llvm::createInstructionCombiningPass());
791  optPM.add(CreateInstructionSimplifyPass());
792  optPM.add(CreatePeepholePass());
793  optPM.add(llvm::createFunctionInliningPass());
794  optPM.add(llvm::createAggressiveDCEPass());
795  optPM.add(llvm::createStripDeadPrototypesPass());
796  optPM.add(CreateMakeInternalFuncsStaticPass());
797  optPM.add(llvm::createGlobalDCEPass());
798  optPM.add(llvm::createConstantMergePass());
799 
800  // Should be the last
801  optPM.add(CreateFixBooleanSelectPass(), 400);
802 #ifdef ISPC_NVPTX_ENABLED
803  if (g->target->getISA() == Target::NVPTX)
804  {
805  optPM.add(CreatePromoteLocalToPrivatePass());
806  optPM.add(llvm::createGlobalDCEPass());
807 
808  optPM.add(llvm::createTypeBasedAliasAnalysisPass());
809  optPM.add(llvm::createBasicAliasAnalysisPass());
810  optPM.add(llvm::createCFGSimplificationPass());
811  // Here clang has an experimental pass SROAPass instead of
812  // ScalarReplAggregatesPass. We should add it in the future.
813 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
814  optPM.add(llvm::createScalarReplAggregatesPass());
815 #else
816  optPM.add(llvm::createSROAPass());
817 #endif
818  optPM.add(llvm::createEarlyCSEPass());
819  optPM.add(llvm::createLowerExpectIntrinsicPass());
820  optPM.add(llvm::createTypeBasedAliasAnalysisPass());
821  optPM.add(llvm::createBasicAliasAnalysisPass());
822 
823  // Early optimizations to try to reduce the total amount of code to
824  // work with if we can
825  optPM.add(llvm::createReassociatePass());
826  optPM.add(llvm::createConstantPropagationPass());
827  optPM.add(llvm::createDeadInstEliminationPass());
828  optPM.add(llvm::createCFGSimplificationPass());
829 
830  optPM.add(llvm::createPromoteMemoryToRegisterPass());
831  optPM.add(llvm::createAggressiveDCEPass());
832 
833 
834  optPM.add(llvm::createInstructionCombiningPass());
835  optPM.add(llvm::createDeadInstEliminationPass());
836 
837  // On to more serious optimizations
838  optPM.add(llvm::createInstructionCombiningPass());
839  optPM.add(llvm::createCFGSimplificationPass());
840  optPM.add(llvm::createPromoteMemoryToRegisterPass());
841  optPM.add(llvm::createGlobalOptimizerPass());
842  optPM.add(llvm::createReassociatePass());
843  optPM.add(llvm::createIPConstantPropagationPass());
844 
845  optPM.add(llvm::createDeadArgEliminationPass());
846  optPM.add(llvm::createInstructionCombiningPass());
847  optPM.add(llvm::createCFGSimplificationPass());
848  optPM.add(llvm::createPruneEHPass());
849  optPM.add(llvm::createFunctionAttrsPass());
850  optPM.add(llvm::createFunctionInliningPass());
851  optPM.add(llvm::createConstantPropagationPass());
852  optPM.add(llvm::createDeadInstEliminationPass());
853  optPM.add(llvm::createCFGSimplificationPass());
854 
855  optPM.add(llvm::createArgumentPromotionPass());
856 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_3
857  // Starting from 3.4 this functionality was moved to
858  // InstructionCombiningPass. See r184459 for details.
859  optPM.add(llvm::createSimplifyLibCallsPass());
860 #endif
861  optPM.add(llvm::createAggressiveDCEPass());
862  optPM.add(llvm::createInstructionCombiningPass());
863  optPM.add(llvm::createJumpThreadingPass());
864  optPM.add(llvm::createCFGSimplificationPass());
865  optPM.add(llvm::createInstructionCombiningPass());
866  optPM.add(llvm::createTailCallEliminationPass());
867 
868  optPM.add(llvm::createInstructionCombiningPass());
869 
870  optPM.add(llvm::createFunctionInliningPass());
871  optPM.add(llvm::createConstantPropagationPass());
872 
873  optPM.add(llvm::createInstructionCombiningPass());
874 
875  optPM.add(llvm::createIPSCCPPass());
876  optPM.add(llvm::createDeadArgEliminationPass());
877  optPM.add(llvm::createAggressiveDCEPass());
878  optPM.add(llvm::createInstructionCombiningPass());
879  optPM.add(llvm::createCFGSimplificationPass());
880 
881  optPM.add(llvm::createFunctionInliningPass());
882  optPM.add(llvm::createArgumentPromotionPass());
883  optPM.add(llvm::createInstructionCombiningPass());
884  optPM.add(llvm::createCFGSimplificationPass());
885  optPM.add(llvm::createReassociatePass());
886  optPM.add(llvm::createLoopRotatePass());
887  optPM.add(llvm::createLICMPass());
888 // optPM.add(llvm::createLoopUnswitchPass(false));
889 #if 1
890  optPM.add(llvm::createInstructionCombiningPass());
891  optPM.add(llvm::createIndVarSimplifyPass());
892  optPM.add(llvm::createLoopIdiomPass());
893  optPM.add(llvm::createLoopDeletionPass());
894  optPM.add(llvm::createLoopUnrollPass());
895  optPM.add(llvm::createGVNPass());
896  optPM.add(llvm::createMemCpyOptPass());
897  optPM.add(llvm::createSCCPPass());
898  optPM.add(llvm::createInstructionCombiningPass());
899  optPM.add(llvm::createJumpThreadingPass());
900  optPM.add(llvm::createCorrelatedValuePropagationPass());
901  optPM.add(llvm::createDeadStoreEliminationPass());
902  optPM.add(llvm::createAggressiveDCEPass());
903  optPM.add(llvm::createCFGSimplificationPass());
904  optPM.add(llvm::createInstructionCombiningPass());
905  optPM.add(llvm::createFunctionInliningPass());
906  optPM.add(llvm::createAggressiveDCEPass());
907  optPM.add(llvm::createStripDeadPrototypesPass());
908  optPM.add(llvm::createGlobalDCEPass());
909  optPM.add(llvm::createConstantMergePass());
910 #endif
911  }
912 #endif /* ISPC_NVPTX_ENABLED */
913  }
914 
915  // Finish up by making sure we didn't mess anything up in the IR along
916  // the way.
917  optPM.add(llvm::createVerifierPass(), LAST_OPT_NUMBER);
918  optPM.run(*module);
919 
920  if (g->debugPrint) {
921  printf("\n*****\nFINAL OUTPUT\n*****\n");
922  module->dump();
923  }
924 
925 }
926 
927 
928 
929 ///////////////////////////////////////////////////////////////////////////
930 // IntrinsicsOpt
931 
932 /** This is a relatively simple optimization pass that does a few small
933  optimizations that LLVM's x86 optimizer doesn't currently handle.
934  (Specifically, MOVMSK of a constant can be replaced with the
935  corresponding constant value, and BLENDVPS and AVX masked load/store
936  with either an 'all on' or 'all off' mask can be replaced with simpler
937  operations.)
938 
939  @todo The better thing to do would be to submit a patch to LLVM to get
940  these; they're presumably pretty simple patterns to match.
941 */
942 class IntrinsicsOpt : public llvm::BasicBlockPass {
943 public:
944  IntrinsicsOpt() : BasicBlockPass(ID) {};
945 
946  const char *getPassName() const { return "Intrinsics Cleanup Optimization"; }
947  bool runOnBasicBlock(llvm::BasicBlock &BB);
948 
949  static char ID;
950 
951 private:
952  struct MaskInstruction {
953  MaskInstruction(llvm::Function *f) { function = f; }
954  llvm::Function *function;
955  };
956  std::vector<MaskInstruction> maskInstructions;
957 
958  /** Structure that records everything we need to know about a blend
959  instruction for this optimization pass.
960  */
961  struct BlendInstruction {
962  BlendInstruction(llvm::Function *f, uint64_t ao, int o0, int o1, int of)
963  : function(f), allOnMask(ao), op0(o0), op1(o1), opFactor(of) { }
964  /** Function pointer for the blend instruction */
965  llvm::Function *function;
966  /** Mask value for an "all on" mask for this instruction */
967  uint64_t allOnMask;
968  /** The operand number in the llvm CallInst corresponds to the
969  first operand to blend with. */
970  int op0;
971  /** The operand number in the CallInst corresponding to the second
972  operand to blend with. */
973  int op1;
974  /** The operand in the call inst where the blending factor is
975  found. */
976  int opFactor;
977  };
978  std::vector<BlendInstruction> blendInstructions;
979 
980  bool matchesMaskInstruction(llvm::Function *function);
981  BlendInstruction *matchingBlendInstruction(llvm::Function *function);
982 };
983 
984 char IntrinsicsOpt::ID = 0;
985 
986 
987 /** Given an llvm::Value, return true if we can determine that it's an
988  undefined value. This only makes a weak attempt at chasing this down,
989  only detecting flat-out undef values, and bitcasts of undef values.
990 
991  @todo Is it worth working harder to find more of these? It starts to
992  get tricky, since having an undef operand doesn't necessarily mean that
993  the result will be undefined. (And for that matter, is there an LLVM
994  call that will do this for us?)
995  */
996 static bool
997 lIsUndef(llvm::Value *value) {
998  if (llvm::isa<llvm::UndefValue>(value))
999  return true;
1000 
1001  llvm::BitCastInst *bci = llvm::dyn_cast<llvm::BitCastInst>(value);
1002  if (bci)
1003  return lIsUndef(bci->getOperand(0));
1004 
1005  return false;
1006 }
1007 
1008 
1009 bool
1010 IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
1011  DEBUG_START_PASS("IntrinsicsOpt");
1012 
1013  // We can't initialize the mask/blend function vector during pass initialization,
1014  // as they may be optimized out by the time the pass is invoked.
1015 
1016  // All of the mask instructions we may encounter. Note that even if
1017  // compiling for AVX, we may still encounter the regular 4-wide SSE
1018  // MOVMSK instruction.
1019  if (llvm::Function *ssei8Movmsk =
1020  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_sse2_pmovmskb_128))) {
1021  maskInstructions.push_back(ssei8Movmsk);
1022  }
1023  if (llvm::Function *sseFloatMovmsk =
1024  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_sse_movmsk_ps))) {
1025  maskInstructions.push_back(sseFloatMovmsk);
1026  }
1027  if (llvm::Function *__movmsk =
1028  m->module->getFunction("__movmsk")) {
1029  maskInstructions.push_back(__movmsk);
1030  }
1031  if (llvm::Function *avxFloatMovmsk =
1032  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_movmsk_ps_256))) {
1033  maskInstructions.push_back(avxFloatMovmsk);
1034  }
1035 
1036  // And all of the blend instructions
1037  blendInstructions.push_back(BlendInstruction(
1038  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_sse41_blendvps)),
1039  0xf, 0, 1, 2));
1040  blendInstructions.push_back(BlendInstruction(
1041  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_blendv_ps_256)),
1042  0xff, 0, 1, 2));
1043 
1044  llvm::Function *avxMaskedLoad32 =
1045  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskload_ps_256));
1046  llvm::Function *avxMaskedLoad64 =
1047  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskload_pd_256));
1048  llvm::Function *avxMaskedStore32 =
1049  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskstore_ps_256));
1050  llvm::Function *avxMaskedStore64 =
1051  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskstore_pd_256));
1052 
1053  bool modifiedAny = false;
1054  restart:
1055  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
1056  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
1057  if (callInst == NULL || callInst->getCalledFunction() == NULL)
1058  continue;
1059 
1060  BlendInstruction *blend = matchingBlendInstruction(callInst->getCalledFunction());
1061  if (blend != NULL) {
1062  llvm::Value *v[2] = { callInst->getArgOperand(blend->op0),
1063  callInst->getArgOperand(blend->op1) };
1064  llvm::Value *factor = callInst->getArgOperand(blend->opFactor);
1065 
1066  // If the values are the same, then no need to blend..
1067  if (v[0] == v[1]) {
1068  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
1069  iter, v[0]);
1070  modifiedAny = true;
1071  goto restart;
1072  }
1073 
1074  // If one of the two is undefined, we're allowed to replace
1075  // with the value of the other. (In other words, the only
1076  // valid case is that the blend factor ends up having a value
1077  // that only selects from the defined one of the two operands,
1078  // otherwise the result is undefined and any value is fine,
1079  // ergo the defined one is an acceptable result.)
1080  if (lIsUndef(v[0])) {
1081  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
1082  iter, v[1]);
1083  modifiedAny = true;
1084  goto restart;
1085  }
1086  if (lIsUndef(v[1])) {
1087  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
1088  iter, v[0]);
1089  modifiedAny = true;
1090  goto restart;
1091  }
1092 
1093  uint64_t mask;
1094  if (lGetMask(factor, &mask) == true) {
1095  llvm::Value *value = NULL;
1096  if (mask == 0)
1097  // Mask all off -> replace with the first blend value
1098  value = v[0];
1099  else if (mask == blend->allOnMask)
1100  // Mask all on -> replace with the second blend value
1101  value = v[1];
1102 
1103  if (value != NULL) {
1104  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
1105  iter, value);
1106  modifiedAny = true;
1107  goto restart;
1108  }
1109  }
1110  }
1111  else if (matchesMaskInstruction(callInst->getCalledFunction())) {
1112  llvm::Value *factor = callInst->getArgOperand(0);
1113  uint64_t mask;
1114  if (lGetMask(factor, &mask) == true) {
1115  // If the vector-valued mask has a known value, replace it
1116  // with the corresponding integer mask from its elements
1117  // high bits.
1118  llvm::Value *value = (callInst->getType() == LLVMTypes::Int32Type) ?
1119  LLVMInt32(mask) : LLVMInt64(mask);
1120  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
1121  iter, value);
1122  modifiedAny = true;
1123  goto restart;
1124  }
1125  }
1126  else if (callInst->getCalledFunction() == avxMaskedLoad32 ||
1127  callInst->getCalledFunction() == avxMaskedLoad64) {
1128  llvm::Value *factor = callInst->getArgOperand(1);
1129  uint64_t mask;
1130  if (lGetMask(factor, &mask) == true) {
1131  if (mask == 0) {
1132  // nothing being loaded, replace with undef value
1133  llvm::Type *returnType = callInst->getType();
1134  Assert(llvm::isa<llvm::VectorType>(returnType));
1135  llvm::Value *undefValue = llvm::UndefValue::get(returnType);
1136  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
1137  iter, undefValue);
1138  modifiedAny = true;
1139  goto restart;
1140  }
1141  else if (mask == 0xff) {
1142  // all lanes active; replace with a regular load
1143  llvm::Type *returnType = callInst->getType();
1144  Assert(llvm::isa<llvm::VectorType>(returnType));
1145  // cast the i8 * to the appropriate type
1146  const char *name = LLVMGetName(callInst->getArgOperand(0), "_cast");
1147  llvm::Value *castPtr =
1148  new llvm::BitCastInst(callInst->getArgOperand(0),
1149  llvm::PointerType::get(returnType, 0),
1150  name, callInst);
1151  lCopyMetadata(castPtr, callInst);
1152  int align;
1153  if (g->opt.forceAlignedMemory)
1154  align = g->target->getNativeVectorAlignment();
1155  else
1156  align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
1157  name = LLVMGetName(callInst->getArgOperand(0), "_load");
1158  llvm::Instruction *loadInst =
1159  new llvm::LoadInst(castPtr, name, false /* not volatile */,
1160  align, (llvm::Instruction *)NULL);
1161  lCopyMetadata(loadInst, callInst);
1162  llvm::ReplaceInstWithInst(callInst, loadInst);
1163  modifiedAny = true;
1164  goto restart;
1165  }
1166  }
1167  }
1168  else if (callInst->getCalledFunction() == avxMaskedStore32 ||
1169  callInst->getCalledFunction() == avxMaskedStore64) {
1170  // NOTE: mask is the 2nd parameter, not the 3rd one!!
1171  llvm::Value *factor = callInst->getArgOperand(1);
1172  uint64_t mask;
1173  if (lGetMask(factor, &mask) == true) {
1174  if (mask == 0) {
1175  // nothing actually being stored, just remove the inst
1176  callInst->eraseFromParent();
1177  modifiedAny = true;
1178  goto restart;
1179  }
1180  else if (mask == 0xff) {
1181  // all lanes storing, so replace with a regular store
1182  llvm::Value *rvalue = callInst->getArgOperand(2);
1183  llvm::Type *storeType = rvalue->getType();
1184  const char *name = LLVMGetName(callInst->getArgOperand(0),
1185  "_ptrcast");
1186  llvm::Value *castPtr =
1187  new llvm::BitCastInst(callInst->getArgOperand(0),
1188  llvm::PointerType::get(storeType, 0),
1189  name, callInst);
1190  lCopyMetadata(castPtr, callInst);
1191 
1192  llvm::StoreInst *storeInst =
1193  new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
1194  int align;
1195  if (g->opt.forceAlignedMemory)
1196  align = g->target->getNativeVectorAlignment();
1197  else
1198  align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
1199  storeInst->setAlignment(align);
1200  lCopyMetadata(storeInst, callInst);
1201  llvm::ReplaceInstWithInst(callInst, storeInst);
1202 
1203  modifiedAny = true;
1204  goto restart;
1205  }
1206  }
1207  }
1208  }
1209 
1210  DEBUG_END_PASS("IntrinsicsOpt");
1211 
1212  return modifiedAny;
1213 }
1214 
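// Illustrative before/after sketch (added for exposition, not part of the
// original file) of two rewrites performed above, in rough textual IR; value
// names and exact operand types are approximate.
//
//   ; __movmsk of a compile-time mask folds to the packed integer value
//   ; (i32 or i64, depending on the declared return type of __movmsk):
//   %m = call i64 @__movmsk(<4 x i32> <i32 -1, i32 0, i32 0, i32 -1>)
//     -->  i64 9
//
//   ; an AVX masked load whose mask is known to be all-on becomes a plain
//   ; vector load through a bitcast pointer (align 4 for the float variant):
//   %v = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %p, <8 x i32> %allOn)
//     -->  %pc = bitcast i8* %p to <8 x float>*
//          %v  = load <8 x float>, <8 x float>* %pc, align 4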
1215 
1216 bool
1217 IntrinsicsOpt::matchesMaskInstruction(llvm::Function *function) {
1218  for (unsigned int i = 0; i < maskInstructions.size(); ++i) {
1219  if (maskInstructions[i].function != NULL &&
1220  function == maskInstructions[i].function) {
1221  return true;
1222  }
1223  }
1224  return false;
1225 }
1226 
1227 
1228 IntrinsicsOpt::BlendInstruction *
1229 IntrinsicsOpt::matchingBlendInstruction(llvm::Function *function) {
1230  for (unsigned int i = 0; i < blendInstructions.size(); ++i) {
1231  if (blendInstructions[i].function != NULL &&
1232  function == blendInstructions[i].function) {
1233  return &blendInstructions[i];
1234  }
1235  }
1236  return NULL;
1237 }
1238 
1239 
1240 static llvm::Pass *
1241 CreateIntrinsicsOptPass() {
1242  return new IntrinsicsOpt;
1243 }
1244 
1245 
1246 ///////////////////////////////////////////////////////////////////////////
1247 
1248 /** This simple optimization pass looks for a vector select instruction
1249  with an all-on or all-off constant mask, simplifying it to the
1250  appropriate operand if so.
1251 
1252  @todo The better thing to do would be to submit a patch to LLVM to get
1253  these; they're presumably pretty simple patterns to match.
1254 */
1255 class InstructionSimplifyPass : public llvm::BasicBlockPass {
1256 public:
1257  InstructionSimplifyPass()
1258  : BasicBlockPass(ID) { }
1259 
1260  const char *getPassName() const { return "Vector Select Optimization"; }
1261  bool runOnBasicBlock(llvm::BasicBlock &BB);
1262 
1263  static char ID;
1264 
1265 private:
1266  static bool simplifySelect(llvm::SelectInst *selectInst,
1267  llvm::BasicBlock::iterator iter);
1268  static llvm::Value *simplifyBoolVec(llvm::Value *value);
1269  static bool simplifyCall(llvm::CallInst *callInst,
1270  llvm::BasicBlock::iterator iter);
1271 };
1272 
1273 char InstructionSimplifyPass::ID = 0;
1274 
1275 
1276 llvm::Value *
1277 InstructionSimplifyPass::simplifyBoolVec(llvm::Value *value) {
1278  llvm::TruncInst *trunc = llvm::dyn_cast<llvm::TruncInst>(value);
1279  if (trunc != NULL) {
1280  // Convert trunc({sext,zext}(i1 vector)) -> (i1 vector)
1281  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(trunc->getOperand(0));
1282  if (sext &&
1283  sext->getOperand(0)->getType() == LLVMTypes::Int1VectorType)
1284  return sext->getOperand(0);
1285 
1286  llvm::ZExtInst *zext = llvm::dyn_cast<llvm::ZExtInst>(trunc->getOperand(0));
1287  if (zext &&
1288  zext->getOperand(0)->getType() == LLVMTypes::Int1VectorType)
1289  return zext->getOperand(0);
1290  }
1291  /*
1292  // This optimization has discernable benefit on the perf
1293  // suite on latest LLVM versions.
1294  // On 3.4+ (maybe even older), it can result in illegal
1295  // operations, so it's being disabled.
1296  llvm::ICmpInst *icmp = llvm::dyn_cast<llvm::ICmpInst>(value);
1297  if (icmp != NULL) {
1298  // icmp(ne, {sext,zext}(foo), zeroinitializer) -> foo
1299  if (icmp->getSignedPredicate() == llvm::CmpInst::ICMP_NE) {
1300  llvm::Value *op1 = icmp->getOperand(1);
1301  if (llvm::isa<llvm::ConstantAggregateZero>(op1)) {
1302  llvm::Value *op0 = icmp->getOperand(0);
1303  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(op0);
1304  if (sext)
1305  return sext->getOperand(0);
1306  llvm::ZExtInst *zext = llvm::dyn_cast<llvm::ZExtInst>(op0);
1307  if (zext)
1308  return zext->getOperand(0);
1309  }
1310  }
1311 
1312  }
1313  */
1314  return NULL;
1315 }
1316 
1317 
1318 bool
1319 InstructionSimplifyPass::simplifySelect(llvm::SelectInst *selectInst,
1320  llvm::BasicBlock::iterator iter) {
1321  if (selectInst->getType()->isVectorTy() == false)
1322  return false;
1323 
1324  llvm::Value *factor = selectInst->getOperand(0);
1325 
1326  // Simplify all-on or all-off mask values
1327  MaskStatus maskStatus = lGetMaskStatus(factor);
1328  llvm::Value *value = NULL;
1329  if (maskStatus == ALL_ON)
1330  // Mask all on -> replace with the first select value
1331  value = selectInst->getOperand(1);
1332  else if (maskStatus == ALL_OFF)
1333  // Mask all off -> replace with the second select value
1334  value = selectInst->getOperand(2);
1335  if (value != NULL) {
1336  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
1337  iter, value);
1338  return true;
1339  }
1340 
1341  // Sometimes earlier LLVM optimization passes generate unnecessarily
1342  // complex expressions for the selection vector, which in turn confuses
1343  // the code generators and leads to sub-optimal code (particularly for
1344  // 8 and 16-bit masks). We'll try to simplify them out here so that
1345  // the code generator patterns match.
1346  if ((factor = simplifyBoolVec(factor)) != NULL) {
1347  llvm::Instruction *newSelect =
1348  llvm::SelectInst::Create(factor, selectInst->getOperand(1),
1349  selectInst->getOperand(2),
1350  selectInst->getName());
1351  llvm::ReplaceInstWithInst(selectInst, newSelect);
1352  return true;
1353  }
1354 
1355  return false;
1356 }
1357 
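// Illustrative sketch (added for exposition, not part of the original file):
// the two simplifications above, in rough textual IR with made-up value names.
//
//   ; a constant all-on selection mask collapses the select to its first operand
//   %r = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>,
//               <4 x float> %a, <4 x float> %b
//     -->  %a
//
//   ; trunc(sext(<4 x i1>)) used as the condition is replaced by the original
//   ; i1 vector, so the code generator sees the simple mask directly
//   %s = sext <4 x i1> %m to <4 x i32>
//   %t = trunc <4 x i32> %s to <4 x i1>
//   %r = select <4 x i1> %t, <4 x float> %a, <4 x float> %b
//     -->  %r = select <4 x i1> %m, <4 x float> %a, <4 x float> %b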
1358 
1359 bool
1360 InstructionSimplifyPass::simplifyCall(llvm::CallInst *callInst,
1361  llvm::BasicBlock::iterator iter) {
1362  llvm::Function *calledFunc = callInst->getCalledFunction();
1363 
1364  // Turn a __movmsk call with a compile-time constant vector into the
1365  // equivalent scalar value.
1366  if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk"))
1367  return false;
1368 
1369  uint64_t mask;
1370  if (lGetMask(callInst->getArgOperand(0), &mask) == true) {
1371  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
1372  iter, LLVMInt64(mask));
1373  return true;
1374  }
1375  return false;
1376 }
1377 
1378 
1379 bool
1380 InstructionSimplifyPass::runOnBasicBlock(llvm::BasicBlock &bb) {
1381  DEBUG_START_PASS("InstructionSimplify");
1382 
1383  bool modifiedAny = false;
1384 
1385  restart:
1386  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
1387  llvm::SelectInst *selectInst = llvm::dyn_cast<llvm::SelectInst>(&*iter);
1388  if (selectInst && simplifySelect(selectInst, iter)) {
1389  modifiedAny = true;
1390  goto restart;
1391  }
1392  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
1393  if (callInst && simplifyCall(callInst, iter)) {
1394  modifiedAny = true;
1395  goto restart;
1396  }
1397  }
1398 
1399  DEBUG_END_PASS("InstructionSimplify");
1400 
1401  return modifiedAny;
1402 }
1403 
1404 
1405 static llvm::Pass *
1406 CreateInstructionSimplifyPass() {
1407  return new InstructionSimplifyPass;
1408 }
1409 
1410 
1411 ///////////////////////////////////////////////////////////////////////////
1412 // ImproveMemoryOpsPass
1413 
1414 /** When the front-end emits gathers and scatters, it generates an array of
1415  vector-width pointers to represent the set of addresses to read from or
1416  write to. This optimization detects cases when the base pointer is a
1417  uniform pointer or when the indexing is into an array that can be
1418  converted into scatters/gathers from a single base pointer and an array
1419  of offsets.
1420 
1421  See for example the comments discussing the __pseudo_gather functions
1422  in builtins.cpp for more information about this.
1423  */
1424 class ImproveMemoryOpsPass : public llvm::BasicBlockPass {
1425 public:
1426  static char ID;
1427  ImproveMemoryOpsPass() : BasicBlockPass(ID) { }
1428 
1429  const char *getPassName() const { return "Improve Memory Ops"; }
1430  bool runOnBasicBlock(llvm::BasicBlock &BB);
1431 };
1432 
1433 char ImproveMemoryOpsPass::ID = 0;
1434 
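// Illustrative sketch (added for exposition, not part of the original file) of
// the kind of rewrite this pass performs, in pseudo-IR.  The function names and
// argument order are approximations of the __pseudo_* routines discussed in
// builtins.cpp, not exact signatures.
//
//   ; before: a gather through a vector of pointers that are really
//   ;         "uniform base + varying offsets"
//   %ptrs = add <8 x i64> %base_splat, %offsets
//   %v    = call <8 x i32> @__pseudo_gather...(<8 x i64> %ptrs, <8 x i32> %mask)
//
//   ; after: the common base pointer is factored out so that later passes and
//   ;        the target can use a cheaper base+offset addressing form
//   %v    = call <8 x i32> @__pseudo_gather_base_offsets...(i8* %base,
//                <8 x i64> %offsets, <8 x i32> %mask)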
1435 
1436 
1437 /** Check to make sure that this value is actually a pointer in the end.
1438  We need to make sure that given an expression like vec(offset) +
1439  ptr2int(ptr), lGetBasePointer() doesn't return vec(offset) for the base
1440  pointer such that we then treat ptr2int(ptr) as an offset. This ends
1441  up being important so that we don't generate LLVM GEP instructions like
1442  "gep inttoptr 8, i64 %ptr", which in turn can lead to incorrect code
1443  since LLVM's pointer aliasing analysis assumes that operands after the
1444  first one to a GEP aren't pointers.
1445  */
1446 static llvm::Value *
1447 lCheckForActualPointer(llvm::Value *v) {
1448  if (v == NULL) {
1449  return NULL;
1450  }
1451  else if (llvm::isa<llvm::PointerType>(v->getType())) {
1452  return v;
1453  }
1454  else if (llvm::isa<llvm::PtrToIntInst>(v)) {
1455  return v;
1456  }
1457  else if (llvm::CastInst *ci = llvm::dyn_cast<llvm::CastInst>(v)) {
1458  llvm::Value *t = lCheckForActualPointer(ci->getOperand(0));
1459  if (t == NULL) {
1460  return NULL;
1461  }
1462  else {
1463  return v;
1464  }
1465  }
1466  else {
1467  llvm::ConstantExpr *uce =
1468  llvm::dyn_cast<llvm::ConstantExpr>(v);
1469  if (uce != NULL &&
1470  uce->getOpcode() == llvm::Instruction::PtrToInt)
1471  return v;
1472  return NULL;
1473  }
1474 }
1475 
1476 
1477 /** Given a llvm::Value representing a varying pointer, this function
1478  checks to see if all of the elements of the vector have the same value
1479  (i.e. there's a common base pointer). If so, it returns the common
1480  pointer value; otherwise it returns NULL.
1481  */
1482 static llvm::Value *
1483 lGetBasePointer(llvm::Value *v, llvm::Instruction *insertBefore) {
1484  if (llvm::isa<llvm::InsertElementInst>(v) ||
1485  llvm::isa<llvm::ShuffleVectorInst>(v)) {
1486  llvm::Value *element = LLVMFlattenInsertChain
1487  (v, g->target->getVectorWidth(), true, false);
1488  // TODO: it's probably ok to allow undefined elements and return
1489  // the base pointer if all of the other elements have the same
1490  // value.
1491  if (element != NULL) {
1492  //all elements are the same and not NULLs
1493  return lCheckForActualPointer(element);
1494  }
1495  else {
1496  return NULL;
1497  }
1498  }
1499 
1500  // This case comes up with global/static arrays
1501  if (llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(v)) {
1502  return lCheckForActualPointer(cv->getSplatValue());
1503  }
1504  else if (llvm::ConstantDataVector *cdv = llvm::dyn_cast<llvm::ConstantDataVector>(v)) {
1505  return lCheckForActualPointer(cdv->getSplatValue());
1506  }
1507  // It is a little bit tricky to handle operations on pointers that have been
1508  // cast to an integer of a different bit width, but it is sometimes useful, so we handle this case here.
1509  else if (llvm::CastInst *ci = llvm::dyn_cast<llvm::CastInst>(v)) {
1510  llvm::Value *t = lGetBasePointer(ci->getOperand(0), insertBefore);
1511  if (t == NULL) {
1512  return NULL;
1513  }
1514  else {
1515  return llvm::CastInst::Create(ci->getOpcode(),
1516  t, ci->getType()->getScalarType(),
1517  LLVMGetName(t, "_cast"),
1518  insertBefore);
1519  }
1520  }
1521 
1522  return NULL;
1523 }
1524 
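// Illustrative sketch (added for exposition, not part of the original file):
// the insert-element "splat" pattern lGetBasePointer() recognizes, in rough
// textual IR with made-up value names.  Every lane holds the same %ptrAsInt
// (a ptrtoint of a uniform pointer), so the common value is returned as the
// base; lCheckForActualPointer() then confirms it really is a pointer.
//
//   %p0 = insertelement <4 x i64> undef, i64 %ptrAsInt, i32 0
//   %p1 = insertelement <4 x i64> %p0,   i64 %ptrAsInt, i32 1
//   %p2 = insertelement <4 x i64> %p1,   i64 %ptrAsInt, i32 2
//   %p3 = insertelement <4 x i64> %p2,   i64 %ptrAsInt, i32 3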
1525 
1526 /** Given the two operands to a constant add expression, see if we have the
1527  form "base pointer + offset", where op0 is the base pointer and op1 is
1528  the offset; if so return the base and the offset. */
1529 static llvm::Constant *
1530 lGetConstantAddExprBaseOffset(llvm::Constant *op0, llvm::Constant *op1,
1531  llvm::Constant **delta) {
1532  llvm::ConstantExpr *op = llvm::dyn_cast<llvm::ConstantExpr>(op0);
1533  if (op == NULL || op->getOpcode() != llvm::Instruction::PtrToInt)
1534  // the first operand isn't a pointer
1535  return NULL;
1536 
1537  llvm::ConstantInt *opDelta = llvm::dyn_cast<llvm::ConstantInt>(op1);
1538  if (opDelta == NULL)
1539  // the second operand isn't an integer operand
1540  return NULL;
1541 
1542  *delta = opDelta;
1543  return op0;
1544 }
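/* Example (editorial; @g is hypothetical): for the constant expression
   "add (ptrtoint [16 x float]* @g to i64), i64 32", op0 is the ptrtoint and
   op1 is i64 32, so the base (op0) is returned and *delta is set to 32.
   With the operands passed in the opposite order it returns NULL, which is
   why the caller below tries both orderings. */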
1545 
1546 
1547 static llvm::Value *
1548 lExtractFromInserts(llvm::Value *v, unsigned int index) {
1549  llvm::InsertValueInst *iv = llvm::dyn_cast<llvm::InsertValueInst>(v);
1550  if (iv == NULL)
1551  return NULL;
1552 
1553  Assert(iv->hasIndices() && iv->getNumIndices() == 1);
1554  if (iv->getIndices()[0] == index)
1555  return iv->getInsertedValueOperand();
1556  else
1557  return lExtractFromInserts(iv->getAggregateOperand(), index);
1558 }
1559 
1560 
1561 /** Given a varying pointer in ptrs, this function checks to see if it can
1562  be determined to be indexing from a common uniform base pointer. If
1563  so, the function returns the base pointer llvm::Value and initializes
1564  *offsets with an int vector of the per-lane offsets.
1565  */
1566 static llvm::Value *
1567 lGetBasePtrAndOffsets(llvm::Value *ptrs, llvm::Value **offsets,
1568  llvm::Instruction *insertBefore) {
1569  if (g->debugPrint) {
1570  fprintf(stderr, "lGetBasePtrAndOffsets\n");
1571  LLVMDumpValue(ptrs);
1572  }
1573 
1574  llvm::Value *base = lGetBasePointer(ptrs, insertBefore);
1575  if (base != NULL) {
1576  // We have a straight up varying pointer with no indexing that's
1577  // actually all the same value.
1578  if (g->target->is32Bit())
1579  *offsets = LLVMInt32Vector(0);
1580  else
1581  *offsets = LLVMInt64Vector((int64_t)0);
1582  return base;
1583  }
1584 
1585  llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(ptrs);
1586  if (bop != NULL && bop->getOpcode() == llvm::Instruction::Add) {
1587  // If we have a common pointer plus something, then we're also
1588  // good.
1589  if ((base = lGetBasePtrAndOffsets(bop->getOperand(0),
1590  offsets, insertBefore)) != NULL) {
1591  *offsets =
1592  llvm::BinaryOperator::Create(llvm::Instruction::Add, *offsets,
1593  bop->getOperand(1), "new_offsets",
1594  insertBefore);
1595  return base;
1596  }
1597  else if ((base = lGetBasePtrAndOffsets(bop->getOperand(1),
1598  offsets, insertBefore)) != NULL) {
1599  *offsets =
1600  llvm::BinaryOperator::Create(llvm::Instruction::Add, *offsets,
1601  bop->getOperand(0), "new_offsets",
1602  insertBefore);
1603  return base;
1604  }
1605  }
1606 
1607  llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(ptrs);
1608  if (cv != NULL) {
1609  // Indexing into global arrays can lead to this form, with
1610  // ConstantVectors.
1611  llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
1612  for (int i = 0; i < (int)cv->getNumOperands(); ++i) {
1613  llvm::Constant *c =
1614  llvm::dyn_cast<llvm::Constant>(cv->getOperand(i));
1615  if (c == NULL)
1616  return NULL;
1617  elements.push_back(c);
1618  }
1619 
1620  llvm::Constant *delta[ISPC_MAX_NVEC];
1621  for (unsigned int i = 0; i < elements.size(); ++i) {
1622  // For each element, try to decompose it into either a straight
1623  // up base pointer, or a base pointer plus an integer value.
1624  llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(elements[i]);
1625  if (ce == NULL)
1626  return NULL;
1627 
1628  delta[i] = NULL;
1629  llvm::Value *elementBase = NULL; // base pointer for this element
1630  if (ce->getOpcode() == llvm::Instruction::PtrToInt) {
1631  // If the element is just a ptr to int instruction, treat
1632  // it as having an offset of zero
1633  elementBase = ce;
1634  delta[i] = g->target->is32Bit() ? LLVMInt32(0) : LLVMInt64(0);
1635  }
1636  else if (ce->getOpcode() == llvm::Instruction::Add) {
1637  // Try both orderings of the operands to see if we can get
1638  // a pointer+offset out of them.
1639  elementBase =
1640  lGetConstantAddExprBaseOffset(ce->getOperand(0),
1641  ce->getOperand(1),
1642  &delta[i]);
1643  if (elementBase == NULL)
1644  elementBase =
1645  lGetConstantAddExprBaseOffset(ce->getOperand(1),
1646  ce->getOperand(0),
1647  &delta[i]);
1648  }
1649 
1650  // We weren't able to find a base pointer in the above. (We
1651  // don't expect this to happen; if it does, it may be necessary
1652  // to handle more cases in the decomposition above.)
1653  if (elementBase == NULL)
1654  return NULL;
1655 
1656  Assert(delta[i] != NULL);
1657  if (base == NULL)
1658  // The first time we've found a base pointer
1659  base = elementBase;
1660  else if (base != elementBase)
1661  // Different program instances have different base
1662  // pointers, so no luck.
1663  return NULL;
1664  }
1665 
1666  Assert(base != NULL);
1667  llvm::ArrayRef<llvm::Constant *> deltas(&delta[0],
1668  &delta[elements.size()]);
1669  *offsets = llvm::ConstantVector::get(deltas);
1670  return base;
1671  }
1672 
1673  llvm::ExtractValueInst *ev = llvm::dyn_cast<llvm::ExtractValueInst>(ptrs);
1674  if (ev != NULL) {
1675  Assert(ev->getNumIndices() == 1);
1676  int index = ev->getIndices()[0];
1677  ptrs = lExtractFromInserts(ev->getAggregateOperand(), index);
1678  if (ptrs != NULL)
1679  return lGetBasePtrAndOffsets(ptrs, offsets, insertBefore);
1680  }
1681 
1682  return NULL;
1683 }
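/* Putting the pieces together (editorial sketch; names are hypothetical):
   on a 4-wide target, a varying pointer of the form
       %ptrs = (lane-wide splat of "ptrtoint float* @arr to i64") + <i64 0, i64 4, i64 8, i64 12>
   decomposes into base = the ptrtoint of @arr and *offsets = the
   <0, 4, 8, 12> vector (added onto an all-zero initial offset vector). */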
1684 
1685 
1686 /** Given a vector expression in vec, separate it into a compile-time
1687  constant component and a variable component, returning the two parts in
1688  *constOffset and *variableOffset. (It should be the case that the sum
1689  of these two is exactly equal to the original vector.)
1690 
1691  This routine only handles some (important) patterns; in some cases it
1692  will fail and return components that are actually compile-time
1693  constants in *variableOffset.
1694 
1695  Finally, if there aren't any constant (or, respectively, variable)
1696  components, the corresponding return value may be set to NULL.
1697  */
1698 static void
1699 lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset,
1700  llvm::Value **variableOffset,
1701  llvm::Instruction *insertBefore) {
1702  if (llvm::isa<llvm::ConstantVector>(vec) ||
1703  llvm::isa<llvm::ConstantDataVector>(vec) ||
1704  llvm::isa<llvm::ConstantAggregateZero>(vec)) {
1705  *constOffset = vec;
1706  *variableOffset = NULL;
1707  return;
1708  }
1709 
1710  llvm::CastInst *cast = llvm::dyn_cast<llvm::CastInst>(vec);
1711  if (cast != NULL) {
1712  // Check the cast target.
1713  llvm::Value *co, *vo;
1714  lExtractConstantOffset(cast->getOperand(0), &co, &vo, insertBefore);
1715 
1716  // make new cast instructions for the two parts
1717  if (co == NULL)
1718  *constOffset = NULL;
1719  else
1720  *constOffset = llvm::CastInst::Create(cast->getOpcode(),
1721  co, cast->getType(),
1722  LLVMGetName(co, "_cast"),
1723  insertBefore);
1724  if (vo == NULL)
1725  *variableOffset = NULL;
1726  else
1727  *variableOffset = llvm::CastInst::Create(cast->getOpcode(),
1728  vo, cast->getType(),
1729  LLVMGetName(vo, "_cast"),
1730  insertBefore);
1731  return;
1732  }
1733 
1734  llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(vec);
1735  if (bop != NULL) {
1736  llvm::Value *op0 = bop->getOperand(0);
1737  llvm::Value *op1 = bop->getOperand(1);
1738  llvm::Value *c0, *v0, *c1, *v1;
1739 
1740  if (bop->getOpcode() == llvm::Instruction::Add) {
1741  lExtractConstantOffset(op0, &c0, &v0, insertBefore);
1742  lExtractConstantOffset(op1, &c1, &v1, insertBefore);
1743 
1744  if (c0 == NULL || llvm::isa<llvm::ConstantAggregateZero>(c0))
1745  *constOffset = c1;
1746  else if (c1 == NULL || llvm::isa<llvm::ConstantAggregateZero>(c1))
1747  *constOffset = c0;
1748  else
1749  *constOffset =
1750  llvm::BinaryOperator::Create(llvm::Instruction::Add, c0, c1,
1751  LLVMGetName("add", c0, c1),
1752  insertBefore);
1753 
1754  if (v0 == NULL || llvm::isa<llvm::ConstantAggregateZero>(v0))
1755  *variableOffset = v1;
1756  else if (v1 == NULL || llvm::isa<llvm::ConstantAggregateZero>(v1))
1757  *variableOffset = v0;
1758  else
1759  *variableOffset =
1760  llvm::BinaryOperator::Create(llvm::Instruction::Add, v0, v1,
1761  LLVMGetName("add", v0, v1),
1762  insertBefore);
1763  return;
1764  }
1765  else if (bop->getOpcode() == llvm::Instruction::Shl) {
1766  lExtractConstantOffset(op0, &c0, &v0, insertBefore);
1767  lExtractConstantOffset(op1, &c1, &v1, insertBefore);
1768 
1769  // Given the product of constant and variable terms, we have:
1770  // (c0 + v0) * (2^(c1 + v1)) = c0 * 2^c1 * 2^v1 + v0 * 2^c1 * 2^v1
1771  // We can optimize only if v1 == NULL.
1772  if ((v1 != NULL) || (c0 == NULL) || (c1 == NULL)) {
1773  *constOffset = NULL;
1774  *variableOffset = vec;
1775  }
1776  else if (v0 == NULL) {
1777  *constOffset = vec;
1778  *variableOffset = NULL;
1779  }
1780  else {
1781  *constOffset =
1782  llvm::BinaryOperator::Create(llvm::Instruction::Shl, c0, c1,
1783  LLVMGetName("shl", c0, c1),
1784  insertBefore);
1785  *variableOffset =
1786  llvm::BinaryOperator::Create(llvm::Instruction::Shl, v0, c1,
1787  LLVMGetName("shl", v0, c1),
1788  insertBefore);
1789  }
1790  return;
1791  }
1792  else if (bop->getOpcode() == llvm::Instruction::Mul) {
1793  lExtractConstantOffset(op0, &c0, &v0, insertBefore);
1794  lExtractConstantOffset(op1, &c1, &v1, insertBefore);
1795 
1796  // Given the product of constant and variable terms, we have:
1797  // (c0 + v0) * (c1 + v1) == (c0 c1) + (v0 c1 + c0 v1 + v0 v1)
1798  // Note that the first term is a constant and the last three are
1799  // variable.
1800  if (c0 != NULL && c1 != NULL)
1801  *constOffset =
1802  llvm::BinaryOperator::Create(llvm::Instruction::Mul, c0, c1,
1803  LLVMGetName("mul", c0, c1),
1804  insertBefore);
1805  else
1806  *constOffset = NULL;
1807 
1808  llvm::Value *va = NULL, *vb = NULL, *vc = NULL;
1809  if (v0 != NULL && c1 != NULL)
1810  va = llvm::BinaryOperator::Create(llvm::Instruction::Mul, v0, c1,
1811  LLVMGetName("mul", v0, c1), insertBefore);
1812  if (c0 != NULL && v1 != NULL)
1813  vb = llvm::BinaryOperator::Create(llvm::Instruction::Mul, c0, v1,
1814  LLVMGetName("mul", c0, v1), insertBefore);
1815  if (v0 != NULL && v1 != NULL)
1816  vc = llvm::BinaryOperator::Create(llvm::Instruction::Mul, v0, v1,
1817  LLVMGetName("mul", v0, v1), insertBefore);
1818 
1819 
1820  llvm::Value *vab = NULL;
1821  if (va != NULL && vb != NULL)
1822  vab = llvm::BinaryOperator::Create(llvm::Instruction::Add, va, vb,
1823  LLVMGetName("add", va, vb), insertBefore);
1824  else if (va != NULL)
1825  vab = va;
1826  else
1827  vab = vb;
1828 
1829  if (vab != NULL && vc != NULL)
1830  *variableOffset =
1831  llvm::BinaryOperator::Create(llvm::Instruction::Add, vab, vc,
1832  LLVMGetName("add", vab, vc), insertBefore);
1833  else if (vab != NULL)
1834  *variableOffset = vab;
1835  else
1836  *variableOffset = vc;
1837 
1838  return;
1839  }
1840  }
1841 
1842  // Nothing matched, just return what we have as a variable component
1843  *constOffset = NULL;
1844  *variableOffset = vec;
1845 }
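/* Decomposition example (editorial; %i is hypothetical): for an offset
   vector computed as <i32 16, 16, 16, 16> + (%i * <i32 4, 4, 4, 4>), this
   returns *constOffset = <16, 16, 16, 16> and *variableOffset =
   %i * <4, 4, 4, 4>; as the comment above promises, the two parts sum back
   to the original vector. */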
1846 
1847 
1848 /* Returns true if the given value is a constant vector of integers with
1849  the same value in all of the elements. (Returns the splatted value in
1850  *splat, if so). */
1851 static bool
1852 lIsIntegerSplat(llvm::Value *v, int *splat) {
1853  llvm::ConstantDataVector *cvec =
1854  llvm::dyn_cast<llvm::ConstantDataVector>(v);
1855  if (cvec == NULL)
1856  return false;
1857 
1858  llvm::Constant *splatConst = cvec->getSplatValue();
1859  if (splatConst == NULL)
1860  return false;
1861 
1862  llvm::ConstantInt *ci =
1863  llvm::dyn_cast<llvm::ConstantInt>(splatConst);
1864  if (ci == NULL)
1865  return false;
1866 
1867  int64_t splatVal = ci->getSExtValue();
1868  *splat = (int)splatVal;
1869  return true;
1870 }
1871 
1872 
1873 static llvm::Value *
1874 lExtract248Scale(llvm::Value *splatOperand, int splatValue,
1875  llvm::Value *otherOperand, llvm::Value **result) {
1876  if (splatValue == 2 || splatValue == 4 || splatValue == 8) {
1877  *result = otherOperand;
1878  return LLVMInt32(splatValue);
1879  }
1880  // Even if we don't have a common scale by exactly 2, 4, or 8, we'll
1881  // see if we can pull out that much of the scale anyway; this may in
1882  // turn allow other optimizations later.
1883  for (int scale = 8; scale >= 2; scale /= 2) {
1884  llvm::Instruction *insertBefore =
1885  llvm::dyn_cast<llvm::Instruction>(*result);
1886  Assert(insertBefore != NULL);
1887 
1888  if ((splatValue % scale) == 0) {
1889  // *result = otherOperand * splatOperand / scale;
1890  llvm::Value *splatScaleVec =
1891  (splatOperand->getType() == LLVMTypes::Int32VectorType) ?
1892  LLVMInt32Vector(scale) : LLVMInt64Vector(scale);
1893  llvm::Value *splatDiv =
1894  llvm::BinaryOperator::Create(llvm::Instruction::SDiv,
1895  splatOperand, splatScaleVec,
1896  "div", insertBefore);
1897  *result =
1898  llvm::BinaryOperator::Create(llvm::Instruction::Mul,
1899  splatDiv, otherOperand,
1900  "mul", insertBefore);
1901  return LLVMInt32(scale);
1902  }
1903  }
1904  return LLVMInt32(1);
1905 }
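/* Example (editorial): with splatValue == 12, the loop pulls out a scale of
   4: it emits splatOperand / 4 (a splat of 3), multiplies that by
   otherOperand to form the new *result, and returns i32 4, so the residual
   factor of 3 stays folded into the rewritten offsets. */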
1906 
1907 
1908 /** Given a vector of integer offsets to a base pointer being used for a
1909  gather or a scatter, see if its root operation is a multiply of some
1910  vector of values by a vector of all 2s, 4s, or 8s.
1911 
1912  If it is, return an i32 value of 2, 4, or 8 from the function and
1913  modify *vec so that it points to the operand that is being multiplied
1914  by 2/4/8; otherwise return a scale of 1.
1915 
1916  We go through all this trouble so that we can pass the i32 scale factor
1917  to the {gather,scatter}_base_offsets function as a separate scale
1918  factor for the offsets. This in turn lets the LLVM x86 code generator
1919  match it to x86's free scaling by 2x, 4x, or 8x of one of the two
1920  registers being added together for an addressing
1921  calculation.
1922  */
1923 static llvm::Value *
1924 lExtractOffsetVector248Scale(llvm::Value **vec) {
1925  llvm::CastInst *cast = llvm::dyn_cast<llvm::CastInst>(*vec);
1926  if (cast != NULL) {
1927  llvm::Value *castOp = cast->getOperand(0);
1928  // Check the cast target.
1929  llvm::Value *scale = lExtractOffsetVector248Scale(&castOp);
1930  if (scale == NULL)
1931  return NULL;
1932 
1933  // make a new cast instruction so that we end up with the right
1934  // type
1935  *vec = llvm::CastInst::Create(cast->getOpcode(), castOp, cast->getType(), "offset_cast", cast);
1936  return scale;
1937  }
1938 
1939 
1940  // If we don't have a binary operator, then just give up
1941  llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(*vec);
1942  if (bop == NULL)
1943  return LLVMInt32(1);
1944 
1945  llvm::Value *op0 = bop->getOperand(0), *op1 = bop->getOperand(1);
1946  if (bop->getOpcode() == llvm::Instruction::Add) {
1947  if (llvm::isa<llvm::ConstantAggregateZero>(op0)) {
1948  *vec = op1;
1949  return lExtractOffsetVector248Scale(vec);
1950  }
1951  else if (llvm::isa<llvm::ConstantAggregateZero>(op1)) {
1952  *vec = op0;
1953  return lExtractOffsetVector248Scale(vec);
1954  }
1955  else {
1956  llvm::Value *s0 = lExtractOffsetVector248Scale(&op0);
1957  llvm::Value *s1 = lExtractOffsetVector248Scale(&op1);
1958  if (s0 == s1) {
1959  *vec = llvm::BinaryOperator::Create(llvm::Instruction::Add,
1960  op0, op1, "new_add", bop);
1961  return s0;
1962  }
1963  else
1964  return LLVMInt32(1);
1965  }
1966  }
1967  else if (bop->getOpcode() == llvm::Instruction::Mul) {
1968  // Check each operand for being one of the scale factors we care about.
1969  int splat;
1970  if (lIsIntegerSplat(op0, &splat))
1971  return lExtract248Scale(op0, splat, op1, vec);
1972  else if (lIsIntegerSplat(op1, &splat))
1973  return lExtract248Scale(op1, splat, op0, vec);
1974  else
1975  return LLVMInt32(1);
1976  }
1977  else
1978  return LLVMInt32(1);
1979 }
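/* Example (editorial; %idx is hypothetical): if the offsets were computed as
   "mul <8 x i32> %idx, <i32 8, 8, ...>", the splat of 8 is recognized, so
   i32 8 is returned and *vec is rewritten to point at %idx; the scale is
   then passed separately to the base+offsets pseudo function. */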
1980 
1981 #if 0
1982 static llvm::Value *
1983 lExtractUniforms(llvm::Value **vec, llvm::Instruction *insertBefore) {
1984  fprintf(stderr, " lextract: ");
1985  (*vec)->dump();
1986  fprintf(stderr, "\n");
1987 
1988  if (llvm::isa<llvm::ConstantVector>(*vec) ||
1989  llvm::isa<llvm::ConstantDataVector>(*vec) ||
1990  llvm::isa<llvm::ConstantAggregateZero>(*vec))
1991  return NULL;
1992 
1993  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(*vec);
1994  if (sext != NULL) {
1995  llvm::Value *sextOp = sext->getOperand(0);
1996  // Check the sext target.
1997  llvm::Value *unif = lExtractUniforms(&sextOp, insertBefore);
1998  if (unif == NULL)
1999  return NULL;
2000 
2001  // make a new sext instruction so that we end up with the right
2002  // type
2003  *vec = new llvm::SExtInst(sextOp, sext->getType(), "offset_sext", sext);
2004  return unif;
2005  }
2006 
2007  if (LLVMVectorValuesAllEqual(*vec)) {
2008  // FIXME: we may want to redo all of the expression here, in scalar
2009  // form (if at all possible), for code quality...
2010  llvm::Value *unif =
2011  llvm::ExtractElementInst::Create(*vec, LLVMInt32(0),
2012  "first_uniform", insertBefore);
2013  *vec = NULL;
2014  return unif;
2015  }
2016 
2017  llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(*vec);
2018  if (bop == NULL)
2019  return NULL;
2020 
2021  llvm::Value *op0 = bop->getOperand(0), *op1 = bop->getOperand(1);
2022  if (bop->getOpcode() == llvm::Instruction::Add) {
2023  llvm::Value *s0 = lExtractUniforms(&op0, insertBefore);
2024  llvm::Value *s1 = lExtractUniforms(&op1, insertBefore);
2025  if (s0 == NULL && s1 == NULL)
2026  return NULL;
2027 
2028  if (op0 == NULL)
2029  *vec = op1;
2030  else if (op1 == NULL)
2031  *vec = op0;
2032  else
2033  *vec = llvm::BinaryOperator::Create(llvm::Instruction::Add,
2034  op0, op1, "new_add", insertBefore);
2035 
2036  if (s0 == NULL)
2037  return s1;
2038  else if (s1 == NULL)
2039  return s0;
2040  else
2041  return llvm::BinaryOperator::Create(llvm::Instruction::Add, s0, s1,
2042  "add_unif", insertBefore);
2043  }
2044 #if 0
2045  else if (bop->getOpcode() == llvm::Instruction::Mul) {
2046  // Check each operand for being one of the scale factors we care about.
2047  int splat;
2048  if (lIs248Splat(op0, &splat)) {
2049  *vec = op1;
2050  return LLVMInt32(splat);
2051  }
2052  else if (lIs248Splat(op1, &splat)) {
2053  *vec = op0;
2054  return LLVMInt32(splat);
2055  }
2056  else
2057  return LLVMInt32(1);
2058  }
2059 #endif
2060  else
2061  return NULL;
2062 }
2063 
2064 
2065 static void
2066 lExtractUniformsFromOffset(llvm::Value **basePtr, llvm::Value **offsetVector,
2067  llvm::Value *offsetScale,
2068  llvm::Instruction *insertBefore) {
2069 #if 1
2070  (*basePtr)->dump();
2071  printf("\n");
2072  (*offsetVector)->dump();
2073  printf("\n");
2074  offsetScale->dump();
2075  printf("-----\n");
2076 #endif
2077 
2078  llvm::Value *uniformDelta = lExtractUniforms(offsetVector, insertBefore);
2079  if (uniformDelta == NULL)
2080  return;
2081 
2082  *basePtr = lGEPInst(*basePtr, arrayRef, "new_base", insertBefore);
2083 
2084  // this should only happen if we have only uniforms, but that in turn
2085  // shouldn't be a gather/scatter!
2086  Assert(*offsetVector != NULL);
2087 }
2088 #endif
2089 
2090 
2091 static bool
2092 lVectorIs32BitInts(llvm::Value *v) {
2093  int nElts;
2094  int64_t elts[ISPC_MAX_NVEC];
2095  if (!LLVMExtractVectorInts(v, elts, &nElts))
2096  return false;
2097 
2098  for (int i = 0; i < nElts; ++i)
2099  if ((int32_t)elts[i] != elts[i])
2100  return false;
2101 
2102  return true;
2103 }
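/* Example (editorial): <2 x i64> <i64 7, i64 -1> passes this check (both
   values survive the round trip through int32_t), while
   <2 x i64> <i64 4294967296, i64 0> does not. */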
2104 
2105 
2106 /** Check to see if the two offset vectors can safely be represented with
2107  32-bit values. If so, return true and update the pointed-to
2108  llvm::Value *s to be the 32-bit equivalents. */
2109 static bool
2110 lOffsets32BitSafe(llvm::Value **variableOffsetPtr,
2111  llvm::Value **constOffsetPtr,
2112  llvm::Instruction *insertBefore) {
2113  llvm::Value *variableOffset = *variableOffsetPtr;
2114  llvm::Value *constOffset = *constOffsetPtr;
2115 
2116  if (variableOffset->getType() != LLVMTypes::Int32VectorType) {
2117  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(variableOffset);
2118  if (sext != NULL &&
2119  sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType)
2120  // sext of a 32-bit vector -> the 32-bit vector is good
2121  variableOffset = sext->getOperand(0);
2122  else if (lVectorIs32BitInts(variableOffset))
2123  // The only constant vector we should have here is a vector of
2124  // all zeros (i.e. a ConstantAggregateZero), but just in case,
2125  // do the more general check with lVectorIs32BitInts().
2126  variableOffset =
2127  new llvm::TruncInst(variableOffset, LLVMTypes::Int32VectorType,
2128  LLVMGetName(variableOffset, "_trunc"),
2129  insertBefore);
2130  else
2131  return false;
2132  }
2133 
2134  if (constOffset->getType() != LLVMTypes::Int32VectorType) {
2135  if (lVectorIs32BitInts(constOffset)) {
2136  // Truncate them so we have a 32-bit vector type for them.
2137  constOffset =
2138  new llvm::TruncInst(constOffset, LLVMTypes::Int32VectorType,
2139  LLVMGetName(constOffset, "_trunc"), insertBefore);
2140  }
2141  else {
2142  // FIXME: otherwise we just assume that all constant offsets
2143  // can actually always fit into 32-bits... (This could be
2144  // wrong, but it should be only in pretty esoteric cases). We
2145  // make this assumption for now since we sometimes generate
2146  // constants that need constant folding before we really have a
2147  // constant vector out of them, and
2148  // llvm::ConstantFoldInstruction() doesn't seem to be doing
2149  // enough for us in some cases if we call it from here.
2150  constOffset =
2151  new llvm::TruncInst(constOffset, LLVMTypes::Int32VectorType,
2152  LLVMGetName(constOffset, "_trunc"), insertBefore);
2153  }
2154  }
2155 
2156  *variableOffsetPtr = variableOffset;
2157  *constOffsetPtr = constOffset;
2158  return true;
2159 }
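/* Example (editorial; %o32 is hypothetical): if the variable offset is
   "sext <8 x i32> %o32 to <8 x i64>", the sext is peeled off and %o32 is
   used directly; a 64-bit constant offset whose elements all fit in 32 bits
   is truncated down to an <8 x i32> vector instead. */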
2160 
2161 /** Check to see if the offset value is composed of a string of Adds,
2162  SExts, and Constant Vectors that are 32-bit safe. Recursively
2163  explores the operands of Add instructions (as they might themselves
2164  be adds that eventually terminate in constant vectors or a SExt).
2165  */
2166 
2167 static bool
2168 lIs32BitSafeHelper(llvm::Value *v) {
2169  // handle Adds, SExts, Constant Vectors
2170  if (llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(v)) {
2171  if (bop->getOpcode() == llvm::Instruction::Add) {
2172  return lIs32BitSafeHelper(bop->getOperand(0))
2173  && lIs32BitSafeHelper(bop->getOperand(1));
2174  }
2175  return false;
2176  }
2177  else if (llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(v)) {
2178  return sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType;
2179  }
2180  else return lVectorIs32BitInts(v);
2181 }
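/* Example (editorial; %a and %b are hypothetical): an offset of the form
   add(sext <N x i32> %a, add(<N x i64> <8, 8, ...>, sext <N x i32> %b)) is
   accepted, since every leaf of the Add tree is either a SExt from a 32-bit
   vector or a constant vector of 32-bit-safe values. */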
2182 
2183 /** Check to see if the single offset vector can safely be represented with
2184  32-bit values. If so, return true and update the pointed-to
2185  llvm::Value * to be the 32-bit equivalent. */
2186 static bool
2187 lOffsets32BitSafe(llvm::Value **offsetPtr,
2188  llvm::Instruction *insertBefore) {
2189  llvm::Value *offset = *offsetPtr;
2190 
2191  if (offset->getType() == LLVMTypes::Int32VectorType)
2192  return true;
2193 
2194  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(offset);
2195  if (sext != NULL &&
2196  sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType) {
2197  // sext of a 32-bit vector -> the 32-bit vector is good
2198  *offsetPtr = sext->getOperand(0);
2199  return true;
2200  }
2201  else if (lIs32BitSafeHelper(offset)) {
2202  // The only constant vector we should have here is a vector of
2203  // all zeros (i.e. a ConstantAggregateZero), but just in case,
2204  // do the more general check with lVectorIs32BitInts().
2205 
2206  // Alternatively, offset could be a sequence of adds terminating
2207  // in safe constant vectors or a SExt.
2208  *offsetPtr =
2209  new llvm::TruncInst(offset, LLVMTypes::Int32VectorType,
2210  LLVMGetName(offset, "_trunc"),
2211  insertBefore);
2212  return true;
2213  }
2214  else
2215  return false;
2216 }
2217 
2218 
2219 static bool
2220 lGSToGSBaseOffsets(llvm::CallInst *callInst) {
2221  struct GSInfo {
2222  GSInfo(const char *pgFuncName, const char *pgboFuncName,
2223  const char *pgbo32FuncName, bool ig, bool ip)
2224  : isGather(ig), isPrefetch(ip) {
2225  func = m->module->getFunction(pgFuncName);
2226  baseOffsetsFunc = m->module->getFunction(pgboFuncName);
2227  baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
2228  }
2229  llvm::Function *func;
2230  llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
2231  const bool isGather;
2232  const bool isPrefetch;
2233  };
2234 
2235  GSInfo gsFuncs[] = {
2236  GSInfo("__pseudo_gather32_i8",
2237  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
2238  "__pseudo_gather_factored_base_offsets32_i8",
2239  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
2240  "__pseudo_gather_factored_base_offsets32_i8",
2241  true, false),
2242  GSInfo("__pseudo_gather32_i16",
2243  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
2244  "__pseudo_gather_factored_base_offsets32_i16",
2245  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
2246  "__pseudo_gather_factored_base_offsets32_i16",
2247  true, false),
2248  GSInfo("__pseudo_gather32_i32",
2249  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
2250  "__pseudo_gather_factored_base_offsets32_i32",
2251  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
2252  "__pseudo_gather_factored_base_offsets32_i32",
2253  true, false),
2254  GSInfo("__pseudo_gather32_float",
2255  g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
2256  "__pseudo_gather_factored_base_offsets32_float",
2257  g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
2258  "__pseudo_gather_factored_base_offsets32_float",
2259  true, false),
2260  GSInfo("__pseudo_gather32_i64",
2261  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
2262  "__pseudo_gather_factored_base_offsets32_i64",
2263  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
2264  "__pseudo_gather_factored_base_offsets32_i64",
2265  true, false),
2266  GSInfo("__pseudo_gather32_double",
2267  g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
2268  "__pseudo_gather_factored_base_offsets32_double",
2269  g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
2270  "__pseudo_gather_factored_base_offsets32_double",
2271  true, false),
2272 
2273  GSInfo("__pseudo_scatter32_i8",
2274  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
2275  "__pseudo_scatter_factored_base_offsets32_i8",
2276  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
2277  "__pseudo_scatter_factored_base_offsets32_i8",
2278  false, false),
2279  GSInfo("__pseudo_scatter32_i16",
2280  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
2281  "__pseudo_scatter_factored_base_offsets32_i16",
2282  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
2283  "__pseudo_scatter_factored_base_offsets32_i16",
2284  false, false),
2285  GSInfo("__pseudo_scatter32_i32",
2286  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
2287  "__pseudo_scatter_factored_base_offsets32_i32",
2288  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
2289  "__pseudo_scatter_factored_base_offsets32_i32",
2290  false, false),
2291  GSInfo("__pseudo_scatter32_float",
2292  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
2293  "__pseudo_scatter_factored_base_offsets32_float",
2294  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
2295  "__pseudo_scatter_factored_base_offsets32_float",
2296  false, false),
2297  GSInfo("__pseudo_scatter32_i64",
2298  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
2299  "__pseudo_scatter_factored_base_offsets32_i64",
2300  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
2301  "__pseudo_scatter_factored_base_offsets32_i64",
2302  false, false),
2303  GSInfo("__pseudo_scatter32_double",
2304  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
2305  "__pseudo_scatter_factored_base_offsets32_double",
2306  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
2307  "__pseudo_scatter_factored_base_offsets32_double",
2308  false, false),
2309 
2310  GSInfo("__pseudo_gather64_i8",
2311  g->target->hasGather() ? "__pseudo_gather_base_offsets64_i8" :
2312  "__pseudo_gather_factored_base_offsets64_i8",
2313  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
2314  "__pseudo_gather_factored_base_offsets32_i8",
2315  true, false),
2316  GSInfo("__pseudo_gather64_i16",
2317  g->target->hasGather() ? "__pseudo_gather_base_offsets64_i16" :
2318  "__pseudo_gather_factored_base_offsets64_i16",
2319  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
2320  "__pseudo_gather_factored_base_offsets32_i16",
2321  true, false),
2322  GSInfo("__pseudo_gather64_i32",
2323  g->target->hasGather() ? "__pseudo_gather_base_offsets64_i32" :
2324  "__pseudo_gather_factored_base_offsets64_i32",
2325  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
2326  "__pseudo_gather_factored_base_offsets32_i32",
2327  true, false),
2328  GSInfo("__pseudo_gather64_float",
2329  g->target->hasGather() ? "__pseudo_gather_base_offsets64_float" :
2330  "__pseudo_gather_factored_base_offsets64_float",
2331  g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
2332  "__pseudo_gather_factored_base_offsets32_float",
2333  true, false),
2334  GSInfo("__pseudo_gather64_i64",
2335  g->target->hasGather() ? "__pseudo_gather_base_offsets64_i64" :
2336  "__pseudo_gather_factored_base_offsets64_i64",
2337  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
2338  "__pseudo_gather_factored_base_offsets32_i64",
2339  true, false),
2340  GSInfo("__pseudo_gather64_double",
2341  g->target->hasGather() ? "__pseudo_gather_base_offsets64_double" :
2342  "__pseudo_gather_factored_base_offsets64_double",
2343  g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
2344  "__pseudo_gather_factored_base_offsets32_double",
2345  true, false),
2346 
2347  GSInfo("__pseudo_scatter64_i8",
2348  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i8" :
2349  "__pseudo_scatter_factored_base_offsets64_i8",
2350  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
2351  "__pseudo_scatter_factored_base_offsets32_i8",
2352  false, false),
2353  GSInfo("__pseudo_scatter64_i16",
2354  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i16" :
2355  "__pseudo_scatter_factored_base_offsets64_i16",
2356  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
2357  "__pseudo_scatter_factored_base_offsets32_i16",
2358  false, false),
2359  GSInfo("__pseudo_scatter64_i32",
2360  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i32" :
2361  "__pseudo_scatter_factored_base_offsets64_i32",
2362  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
2363  "__pseudo_scatter_factored_base_offsets32_i32",
2364  false, false),
2365  GSInfo("__pseudo_scatter64_float",
2366  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_float" :
2367  "__pseudo_scatter_factored_base_offsets64_float",
2368  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
2369  "__pseudo_scatter_factored_base_offsets32_float",
2370  false, false),
2371  GSInfo("__pseudo_scatter64_i64",
2372  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i64" :
2373  "__pseudo_scatter_factored_base_offsets64_i64",
2374  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
2375  "__pseudo_scatter_factored_base_offsets32_i64",
2376  false, false),
2377  GSInfo("__pseudo_scatter64_double",
2378  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_double" :
2379  "__pseudo_scatter_factored_base_offsets64_double",
2380  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
2381  "__pseudo_scatter_factored_base_offsets32_double",
2382  false, false),
2383 
2384  GSInfo("__pseudo_prefetch_read_varying_1",
2385  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
2386  "__prefetch_read_varying_1",
2387  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
2388  "__prefetch_read_varying_1",
2389  false, true),
2390 
2391  GSInfo("__pseudo_prefetch_read_varying_2",
2392  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
2393  "__prefetch_read_varying_2",
2394  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
2395  "__prefetch_read_varying_2",
2396  false, true),
2397 
2398  GSInfo("__pseudo_prefetch_read_varying_3",
2399  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
2400  "__prefetch_read_varying_3",
2401  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
2402  "__prefetch_read_varying_3",
2403  false, true),
2404 
2405  GSInfo("__pseudo_prefetch_read_varying_nt",
2406  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
2407  "__prefetch_read_varying_nt",
2408  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
2409  "__prefetch_read_varying_nt",
2410  false, true),
2411  };
2412 
2413  int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
2414  for (int i = 0; i < numGSFuncs; ++i)
2415  Assert(gsFuncs[i].func != NULL && gsFuncs[i].baseOffsetsFunc != NULL &&
2416  gsFuncs[i].baseOffsets32Func != NULL);
2417 
2418  GSInfo *info = NULL;
2419  for (int i = 0; i < numGSFuncs; ++i)
2420  if (gsFuncs[i].func != NULL &&
2421  callInst->getCalledFunction() == gsFuncs[i].func) {
2422  info = &gsFuncs[i];
2423  break;
2424  }
2425  if (info == NULL)
2426  return false;
2427 
2428  // Try to transform the array of pointers to a single base pointer
2429  // and an array of int32 offsets. (All the hard work is done by
2430  // lGetBasePtrAndOffsets).
2431  llvm::Value *ptrs = callInst->getArgOperand(0);
2432  llvm::Value *offsetVector = NULL;
2433  llvm::Value *basePtr = lGetBasePtrAndOffsets(ptrs, &offsetVector,
2434  callInst);
2435 
2436  if (basePtr == NULL || offsetVector == NULL ||
2437  (info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch() == false))
2438  // It's actually a fully general gather/scatter with a varying
2439  // set of base pointers, so leave it as is and continue onward
2440  // to the next instruction...
2441  return false;
2442 
2443  // Cast the base pointer to a void *, since that's what the
2444  // __pseudo_*_base_offsets_* functions want.
2445  basePtr = new llvm::IntToPtrInst(basePtr, LLVMTypes::VoidPointerType,
2446  LLVMGetName(basePtr, "_2void"), callInst);
2447  lCopyMetadata(basePtr, callInst);
2448 
2449  llvm::Function *gatherScatterFunc = info->baseOffsetsFunc;
2450 
2451  if ((info->isGather == true && g->target->hasGather()) ||
2452  (info->isGather == false && info->isPrefetch == false && g->target->hasScatter()) ||
2453  (info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch())) {
2454 
2455  // See if the offsets are scaled by 2, 4, or 8. If so,
2456  // extract that scale factor and rewrite the offsets to remove
2457  // it.
2458  llvm::Value *offsetScale = lExtractOffsetVector248Scale(&offsetVector);
2459 
2460  // If we're doing 32-bit addressing on a 64-bit target, here we
2461  // will see if we can call one of the 32-bit variants of the pseudo
2462  // gather/scatter functions.
2463  if (g->opt.force32BitAddressing &&
2464  lOffsets32BitSafe(&offsetVector, callInst)) {
2465  gatherScatterFunc = info->baseOffsets32Func;
2466  }
2467 
2468  if (info->isGather || info->isPrefetch) {
2469  llvm::Value *mask = callInst->getArgOperand(1);
2470 
2471  // Generate a new function call to the next pseudo gather
2472  // base+offsets instruction. Note that we're passing a NULL
2473  // llvm::Instruction to llvm::CallInst::Create; this means that
2474  // the instruction isn't inserted into a basic block and that
2475  // way we can then call ReplaceInstWithInst().
2476  llvm::Instruction *newCall =
2477  lCallInst(gatherScatterFunc, basePtr, offsetScale, offsetVector,
2478  mask, callInst->getName().str().c_str(),
2479  NULL);
2480  lCopyMetadata(newCall, callInst);
2481  llvm::ReplaceInstWithInst(callInst, newCall);
2482  }
2483  else {
2484  llvm::Value *storeValue = callInst->getArgOperand(1);
2485  llvm::Value *mask = callInst->getArgOperand(2);
2486 
2487  // Generate a new function call to the next pseudo scatter
2488  // base+offsets instruction. See above for why passing NULL
2489  // for the Instruction * is intended.
2490  llvm::Instruction *newCall =
2491  lCallInst(gatherScatterFunc, basePtr, offsetScale,
2492  offsetVector, storeValue, mask, "", NULL);
2493  lCopyMetadata(newCall, callInst);
2494  llvm::ReplaceInstWithInst(callInst, newCall);
2495  }
2496  }
2497  else {
2498  // Try to decompose the offset vector into a compile time constant
2499  // component and a varying component. The constant component is
2500  // passed as a separate parameter to the gather/scatter functions,
2501  // which in turn allows their implementations to end up emitting
2502  // x86 instructions with constant offsets encoded in them.
2503  llvm::Value *constOffset, *variableOffset;
2504  lExtractConstantOffset(offsetVector, &constOffset, &variableOffset,
2505  callInst);
2506  if (constOffset == NULL)
2507  constOffset = LLVMIntAsType(0, offsetVector->getType());
2508  if (variableOffset == NULL)
2509  variableOffset = LLVMIntAsType(0, offsetVector->getType());
2510 
2511  // See if the varying component is scaled by 2, 4, or 8. If so,
2512  // extract that scale factor and rewrite variableOffset to remove
2513  // it. (This also is pulled out so that we can match the scales by
2514  // 2/4/8 offered by x86 addressing operators.)
2515  llvm::Value *offsetScale = lExtractOffsetVector248Scale(&variableOffset);
2516 
2517  // If we're doing 32-bit addressing on a 64-bit target, here we
2518  // will see if we can call one of the 32-bit variants of the pseudo
2519  // gather/scatter functions.
2520  if (g->opt.force32BitAddressing &&
2521  lOffsets32BitSafe(&variableOffset, &constOffset, callInst)) {
2522  gatherScatterFunc = info->baseOffsets32Func;
2523  }
2524 
2525  if (info->isGather || info->isPrefetch) {
2526  llvm::Value *mask = callInst->getArgOperand(1);
2527 
2528  // Generate a new function call to the next pseudo gather
2529  // base+offsets instruction. Note that we're passing a NULL
2530  // llvm::Instruction to llvm::CallInst::Create; this means that
2531  // the instruction isn't inserted into a basic block and that
2532  // way we can then call ReplaceInstWithInst().
2533  llvm::Instruction *newCall =
2534  lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale,
2535  constOffset, mask, callInst->getName().str().c_str(),
2536  NULL);
2537  lCopyMetadata(newCall, callInst);
2538  llvm::ReplaceInstWithInst(callInst, newCall);
2539  }
2540  else {
2541  llvm::Value *storeValue = callInst->getArgOperand(1);
2542  llvm::Value *mask = callInst->getArgOperand(2);
2543 
2544  // Generate a new function call to the next pseudo scatter
2545  // base+offsets instruction. See above for why passing NULL
2546  // for the Instruction * is intended.
2547  llvm::Instruction *newCall =
2548  lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale,
2549  constOffset, storeValue, mask, "", NULL);
2550  lCopyMetadata(newCall, callInst);
2551  llvm::ReplaceInstWithInst(callInst, newCall);
2552  }
2553  }
2554  return true;
2555 }
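/* End-to-end sketch of this rewrite (editorial; the value names are
   hypothetical): a call such as
       @__pseudo_scatter32_i32(<N x i32> %ptrs, <N x i32> %vals, %mask)
   whose %ptrs decompose into base + offsets becomes, on a target with
   native scatter support,
       @__pseudo_scatter_base_offsets32_i32(i8* %base, i32 %scale, <N x i32> %offsets, <N x i32> %vals, %mask)
   matching the operand order passed to lCallInst() above. */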
2556 
2557 
2558 /** Try to improve the decomposition between compile-time constant and
2559  compile-time unknown offsets in calls to the __pseudo_*_base_offsets*
2560  functions. After other optimizations have run, we will sometimes be
2561  able to pull more terms out of the unknown part and add them into the
2562  compile-time-known part.
2563  */
2564 static bool
2565 lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) {
2566  struct GSBOInfo {
2567  GSBOInfo(const char *pgboFuncName, const char *pgbo32FuncName, bool ig, bool ip)
2568  : isGather(ig), isPrefetch(ip) {
2569  baseOffsetsFunc = m->module->getFunction(pgboFuncName);
2570  baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
2571  }
2572  llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
2573  const bool isGather;
2574  const bool isPrefetch;
2575  };
2576 
2577  GSBOInfo gsFuncs[] = {
2578  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
2579  "__pseudo_gather_factored_base_offsets32_i8",
2580  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
2581  "__pseudo_gather_factored_base_offsets32_i8",
2582  true, false),
2583  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
2584  "__pseudo_gather_factored_base_offsets32_i16",
2585  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
2586  "__pseudo_gather_factored_base_offsets32_i16",
2587  true, false),
2588  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
2589  "__pseudo_gather_factored_base_offsets32_i32",
2590  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
2591  "__pseudo_gather_factored_base_offsets32_i32",
2592  true, false),
2593  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
2594  "__pseudo_gather_factored_base_offsets32_float",
2595  g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
2596  "__pseudo_gather_factored_base_offsets32_float",
2597  true, false),
2598  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
2599  "__pseudo_gather_factored_base_offsets32_i64",
2600  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
2601  "__pseudo_gather_factored_base_offsets32_i64",
2602  true, false),
2603  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
2604  "__pseudo_gather_factored_base_offsets32_double",
2605  g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
2606  "__pseudo_gather_factored_base_offsets32_double",
2607  true, false),
2608 
2609  GSBOInfo( g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
2610  "__pseudo_scatter_factored_base_offsets32_i8",
2611  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
2612  "__pseudo_scatter_factored_base_offsets32_i8",
2613  false, false),
2614  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
2615  "__pseudo_scatter_factored_base_offsets32_i16",
2616  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
2617  "__pseudo_scatter_factored_base_offsets32_i16",
2618  false, false),
2619  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
2620  "__pseudo_scatter_factored_base_offsets32_i32",
2621  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
2622  "__pseudo_scatter_factored_base_offsets32_i32",
2623  false, false),
2624  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
2625  "__pseudo_scatter_factored_base_offsets32_float",
2626  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
2627  "__pseudo_scatter_factored_base_offsets32_float",
2628  false, false),
2629  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
2630  "__pseudo_scatter_factored_base_offsets32_i64",
2631  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
2632  "__pseudo_scatter_factored_base_offsets32_i64",
2633  false, false),
2634  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
2635  "__pseudo_scatter_factored_base_offsets32_double",
2636  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
2637  "__pseudo_scatter_factored_base_offsets32_double",
2638  false, false),
2639 
2640  GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
2641  "__prefetch_read_varying_1",
2642  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
2643  "__prefetch_read_varying_1",
2644  false, true),
2645 
2646  GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
2647  "__prefetch_read_varying_2",
2648  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
2649  "__prefetch_read_varying_2",
2650  false, true),
2651 
2652  GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
2653  "__prefetch_read_varying_3",
2654  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
2655  "__prefetch_read_varying_3",
2656  false, true),
2657 
2658  GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
2659  "__prefetch_read_varying_nt",
2660  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
2661  "__prefetch_read_varying_nt",
2662  false, true),
2663  };
2664 
2665  int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
2666  for (int i = 0; i < numGSFuncs; ++i)
2667  Assert(gsFuncs[i].baseOffsetsFunc != NULL &&
2668  gsFuncs[i].baseOffsets32Func != NULL);
2669 
2670  llvm::Function *calledFunc = callInst->getCalledFunction();
2671  Assert(calledFunc != NULL);
2672 
2673  // Is one of the gather/scatter functions that decompose into
2674  // base+offsets being called?
2675  GSBOInfo *info = NULL;
2676  for (int i = 0; i < numGSFuncs; ++i)
2677  if (calledFunc == gsFuncs[i].baseOffsetsFunc ||
2678  calledFunc == gsFuncs[i].baseOffsets32Func) {
2679  info = &gsFuncs[i];
2680  break;
2681  }
2682  if (info == NULL)
2683  return false;
2684 
2685  // Grab the old variable offset
2686  llvm::Value *origVariableOffset = callInst->getArgOperand(1);
2687 
2688  // If it's zero, we're done. Don't go and think that we're clever by
2689  // adding these zeros to the constant offsets.
2690  if (llvm::isa<llvm::ConstantAggregateZero>(origVariableOffset))
2691  return false;
2692 
2693  // Try to decompose the old variable offset
2694  llvm::Value *constOffset, *variableOffset;
2695  lExtractConstantOffset(origVariableOffset, &constOffset, &variableOffset,
2696  callInst);
2697 
2698  // No luck
2699  if (constOffset == NULL)
2700  return false;
2701 
2702  // Total luck: everything could be moved to the constant offset
2703  if (variableOffset == NULL)
2704  variableOffset = LLVMIntAsType(0, origVariableOffset->getType());
2705 
2706  // We need to scale the value we add to the constant offset by the
2707  // 2/4/8 scale for the variable offset, if present.
2708  llvm::ConstantInt *varScale =
2709  llvm::dyn_cast<llvm::ConstantInt>(callInst->getArgOperand(2));
2710  Assert(varScale != NULL);
2711 
2712  llvm::Value *scaleSmear;
2713  if (origVariableOffset->getType() == LLVMTypes::Int64VectorType)
2714  scaleSmear = LLVMInt64Vector((int64_t)varScale->getZExtValue());
2715  else
2716  scaleSmear = LLVMInt32Vector((int32_t)varScale->getZExtValue());
2717 
2718  constOffset = llvm::BinaryOperator::Create(llvm::Instruction::Mul, constOffset,
2719  scaleSmear, constOffset->getName(),
2720  callInst);
2721 
2722  // And add the additional offset to the original constant offset
2723  constOffset = llvm::BinaryOperator::Create(llvm::Instruction::Add, constOffset,
2724  callInst->getArgOperand(3),
2725  callInst->getArgOperand(3)->getName(),
2726  callInst);
2727 
2728  // Finally, update the values of the operands to the gather/scatter
2729  // function.
2730  callInst->setArgOperand(1, variableOffset);
2731  callInst->setArgOperand(3, constOffset);
2732 
2733  return true;
2734 }
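/* Worked example (editorial): suppose the call's variable offset is
   (<4, 8, 12, 16> + %v) with a scale of 4 and an all-zero constant offset.
   The decomposition pulls out <4, 8, 12, 16>, multiplies it by the smeared
   scale to get <16, 32, 48, 64>, adds that to the old constant offset, and
   leaves just %v as the new variable offset. */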
2735 
2736 
2737 static llvm::Value *
2738 lComputeCommonPointer(llvm::Value *base, llvm::Value *offsets,
2739  llvm::Instruction *insertBefore) {
2740  llvm::Value *firstOffset = LLVMExtractFirstVectorElement(offsets);
2741  return lGEPInst(base, firstOffset, "ptr", insertBefore);
2742 }
2743 
2744 
2745 static llvm::Constant *
2746 lGetOffsetScaleVec(llvm::Value *offsetScale, llvm::Type *vecType) {
2747  llvm::ConstantInt *offsetScaleInt =
2748  llvm::dyn_cast<llvm::ConstantInt>(offsetScale);
2749  Assert(offsetScaleInt != NULL);
2750  uint64_t scaleValue = offsetScaleInt->getZExtValue();
2751 
2752  std::vector<llvm::Constant *> scales;
2753  for (int i = 0; i < g->target->getVectorWidth(); ++i) {
2754  if (vecType == LLVMTypes::Int64VectorType)
2755  scales.push_back(LLVMInt64(scaleValue));
2756  else {
2757  Assert(vecType == LLVMTypes::Int32VectorType);
2758  scales.push_back(LLVMInt32((int32_t)scaleValue));
2759  }
2760  }
2761  return llvm::ConstantVector::get(scales);
2762 }
2763 
2764 
2765 /** After earlier optimization passes have run, we are sometimes able to
2766  determine that gathers/scatters are actually accessing memory in a more
2767  regular fashion and then change the operation to something simpler and
2768  more efficient. For example, if all of the lanes in a gather are
2769  reading from the same location, we can instead do a scalar load and
2770  broadcast. This pass examines gathers and scatters and tries to
2771  simplify them if at all possible.
2772 
2773  @todo Currently, this only looks for all program instances going to the
2774  same location and all going to a linear sequence of locations in
2775  memory. There are a number of other cases that might make sense to
2776  look for, including things that could be handled with a vector load +
2777  shuffle or things that could be handled with hybrids of e.g. 2 4-wide
2778  vector loads with AVX, etc.
2779 */
2780 static bool
2781 lGSToLoadStore(llvm::CallInst *callInst) {
2782  struct GatherImpInfo {
2783  GatherImpInfo(const char *pName, const char *lmName, llvm::Type *st,
2784  int a)
2785  : align(a), isFactored(!g->target->hasGather()) {
2786  pseudoFunc = m->module->getFunction(pName);
2787  loadMaskedFunc = m->module->getFunction(lmName);
2788  Assert(pseudoFunc != NULL && loadMaskedFunc != NULL);
2789  scalarType = st;
2790  }
2791 
2792  llvm::Function *pseudoFunc;
2793  llvm::Function *loadMaskedFunc;
2794  llvm::Type *scalarType;
2795  const int align;
2796  const bool isFactored;
2797  };
2798 
2799  GatherImpInfo gInfo[] = {
2800  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
2801  "__pseudo_gather_factored_base_offsets32_i8",
2802  "__masked_load_i8", LLVMTypes::Int8Type, 1),
2803  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
2804  "__pseudo_gather_factored_base_offsets32_i16",
2805  "__masked_load_i16", LLVMTypes::Int16Type, 2),
2806  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
2807  "__pseudo_gather_factored_base_offsets32_i32",
2808  "__masked_load_i32", LLVMTypes::Int32Type, 4),
2809  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
2810  "__pseudo_gather_factored_base_offsets32_float",
2811  "__masked_load_float", LLVMTypes::FloatType, 4),
2812  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
2813  "__pseudo_gather_factored_base_offsets32_i64",
2814  "__masked_load_i64", LLVMTypes::Int64Type, 8),
2815  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
2816  "__pseudo_gather_factored_base_offsets32_double",
2817  "__masked_load_double", LLVMTypes::DoubleType, 8),
2818  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i8" :
2819  "__pseudo_gather_factored_base_offsets64_i8",
2820  "__masked_load_i8", LLVMTypes::Int8Type, 1),
2821  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i16" :
2822  "__pseudo_gather_factored_base_offsets64_i16",
2823  "__masked_load_i16", LLVMTypes::Int16Type, 2),
2824  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i32" :
2825  "__pseudo_gather_factored_base_offsets64_i32",
2826  "__masked_load_i32", LLVMTypes::Int32Type, 4),
2827  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_float" :
2828  "__pseudo_gather_factored_base_offsets64_float",
2829  "__masked_load_float", LLVMTypes::FloatType, 4),
2830  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i64" :
2831  "__pseudo_gather_factored_base_offsets64_i64",
2832  "__masked_load_i64", LLVMTypes::Int64Type, 8),
2833  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_double" :
2834  "__pseudo_gather_factored_base_offsets64_double",
2835  "__masked_load_double", LLVMTypes::DoubleType, 8),
2836  };
2837 
2838  struct ScatterImpInfo {
2839  ScatterImpInfo(const char *pName, const char *msName,
2840  llvm::Type *vpt, int a)
2841  : align(a), isFactored(!g->target->hasScatter()) {
2842  pseudoFunc = m->module->getFunction(pName);
2843  maskedStoreFunc = m->module->getFunction(msName);
2844  vecPtrType = vpt;
2845  Assert(pseudoFunc != NULL && maskedStoreFunc != NULL);
2846  }
2847  llvm::Function *pseudoFunc;
2848  llvm::Function *maskedStoreFunc;
2849  llvm::Type *vecPtrType;
2850  const int align;
2851  const bool isFactored;
2852  };
2853 
2854  ScatterImpInfo sInfo[] = {
2855  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
2856  "__pseudo_scatter_factored_base_offsets32_i8",
2857  "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1),
2858  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
2859  "__pseudo_scatter_factored_base_offsets32_i16",
2860  "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2),
2861  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
2862  "__pseudo_scatter_factored_base_offsets32_i32",
2863  "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4),
2864  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
2865  "__pseudo_scatter_factored_base_offsets32_float",
2866  "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4),
2867  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
2868  "__pseudo_scatter_factored_base_offsets32_i64",
2869  "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8),
2870  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
2871  "__pseudo_scatter_factored_base_offsets32_double",
2872  "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8),
2873  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i8" :
2874  "__pseudo_scatter_factored_base_offsets64_i8",
2875  "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1),
2876  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i16" :
2877  "__pseudo_scatter_factored_base_offsets64_i16",
2878  "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2),
2879  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i32" :
2880  "__pseudo_scatter_factored_base_offsets64_i32",
2881  "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4),
2882  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_float" :
2883  "__pseudo_scatter_factored_base_offsets64_float",
2884  "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4),
2885  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i64" :
2886  "__pseudo_scatter_factored_base_offsets64_i64",
2887  "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8),
2888  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_double" :
2889  "__pseudo_scatter_factored_base_offsets64_double",
2890  "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8),
2891  };
2892 
2893  llvm::Function *calledFunc = callInst->getCalledFunction();
2894 
2895  GatherImpInfo *gatherInfo = NULL;
2896  ScatterImpInfo *scatterInfo = NULL;
2897  for (unsigned int i = 0; i < sizeof(gInfo) / sizeof(gInfo[0]); ++i) {
2898  if (gInfo[i].pseudoFunc != NULL &&
2899  calledFunc == gInfo[i].pseudoFunc) {
2900  gatherInfo = &gInfo[i];
2901  break;
2902  }
2903  }
2904  for (unsigned int i = 0; i < sizeof(sInfo) / sizeof(sInfo[0]); ++i) {
2905  if (sInfo[i].pseudoFunc != NULL &&
2906  calledFunc == sInfo[i].pseudoFunc) {
2907  scatterInfo = &sInfo[i];
2908  break;
2909  }
2910  }
2911  if (gatherInfo == NULL && scatterInfo == NULL)
2912  return false;
2913 
2914  SourcePos pos;
2915  lGetSourcePosFromMetadata(callInst, &pos);
2916 
2917  llvm::Value *base = callInst->getArgOperand(0);
2918  llvm::Value *fullOffsets = NULL;
2919  llvm::Value *storeValue = NULL;
2920  llvm::Value *mask = NULL;
2921 
2922  if ((gatherInfo != NULL && gatherInfo->isFactored) ||
2923  (scatterInfo != NULL && scatterInfo->isFactored)) {
2924  llvm::Value *varyingOffsets = callInst->getArgOperand(1);
2925  llvm::Value *offsetScale = callInst->getArgOperand(2);
2926  llvm::Value *constOffsets = callInst->getArgOperand(3);
2927  if (scatterInfo)
2928  storeValue = callInst->getArgOperand(4);
2929  mask = callInst->getArgOperand((gatherInfo != NULL) ? 4 : 5);
2930 
2931  // Compute the full offset vector: offsetScale * varyingOffsets + constOffsets
2932  llvm::Constant *offsetScaleVec =
2933  lGetOffsetScaleVec(offsetScale, varyingOffsets->getType());
2934 
2935  llvm::Value *scaledVarying =
2936  llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec,
2937  varyingOffsets, "scaled_varying", callInst);
2938  fullOffsets =
2939  llvm::BinaryOperator::Create(llvm::Instruction::Add, scaledVarying,
2940  constOffsets, "varying+const_offsets",
2941  callInst);
2942  }
2943  else {
2944  if (scatterInfo)
2945  storeValue = callInst->getArgOperand(3);
2946  mask = callInst->getArgOperand((gatherInfo != NULL) ? 3 : 4);
2947 
2948  llvm::Value *offsetScale = callInst->getArgOperand(1);
2949  llvm::Value *offsets = callInst->getArgOperand(2);
2950  llvm::Value *offsetScaleVec =
2951  lGetOffsetScaleVec(offsetScale, offsets->getType());
2952 
2953  fullOffsets =
2954  llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec,
2955  offsets, "scaled_offsets", callInst);
2956  }
2957 
2958  Debug(SourcePos(), "GSToLoadStore: %s.",
2959  fullOffsets->getName().str().c_str());
2960 
2961  if (LLVMVectorValuesAllEqual(fullOffsets)) {
2962  // If all the offsets are equal, then compute the single
2963  // pointer they all represent based on the first one of them
2964  // (arbitrarily).
2965  llvm::Value *ptr = lComputeCommonPointer(base, fullOffsets, callInst);
2966  lCopyMetadata(ptr, callInst);
2967 
2968  if (gatherInfo != NULL) {
2969  // A gather with everyone going to the same location is
2970  // handled as a scalar load and broadcast across the lanes.
2971  Debug(pos, "Transformed gather to scalar load and broadcast!");
2972 
2973  ptr = new llvm::BitCastInst(ptr, llvm::PointerType::get(gatherInfo->scalarType, 0),
2974  ptr->getName(), callInst);
2975  llvm::Value *scalarValue = new llvm::LoadInst(ptr, callInst->getName(), callInst);
2976 
2977  // Generate the following sequence:
2978  // %name123 = insertelement <4 x i32> undef, i32 %val, i32 0
2979  // %name124 = shufflevector <4 x i32> %name123, <4 x i32> undef,
2980  // <4 x i32> zeroinitializer
2981  llvm::Value *undef1Value = llvm::UndefValue::get(callInst->getType());
2982  llvm::Value *undef2Value = llvm::UndefValue::get(callInst->getType());
2983  llvm::Value *insertVec = llvm::InsertElementInst::Create(
2984  undef1Value, scalarValue, LLVMInt32(0), callInst->getName(), callInst);
2985  llvm::Value *zeroMask = llvm::ConstantVector::getSplat(
2986  callInst->getType()->getVectorNumElements(),
2987  llvm::Constant::getNullValue(llvm::Type::getInt32Ty(*g->ctx)));
2988  llvm::Value *shufValue = new llvm::ShuffleVectorInst(
2989  insertVec, undef2Value, zeroMask, callInst->getName());
2990 
2991  lCopyMetadata(shufValue, callInst);
2992  llvm::ReplaceInstWithInst(callInst,
2993  llvm::dyn_cast<llvm::Instruction>(shufValue));
2994  return true;
2995  }
2996  else {
2997  // A scatter with everyone going to the same location is
2998  // undefined (if there's more than one program instance in
2999  // the gang). Issue a warning.
3000  if (g->target->getVectorWidth() > 1)
3001  Warning(pos, "Undefined behavior: all program instances are "
3002  "writing to the same location!");
3003 
3004  // We could do something similar to the gather case, where
3005  // we arbitrarily write one of the values, but we would need to
3006  // a) check to be sure the mask isn't all off and b) pick
3007  // the value from an executing program instance in that
3008  // case. We'll just let a bunch of the program instances
3009  // do redundant writes, since this isn't important to make
3010  // fast anyway...
3011  return false;
3012  }
3013  }
3014  else {
3015  int step = gatherInfo ? gatherInfo->align : scatterInfo->align;
3016 
3017  if (step > 0 && LLVMVectorIsLinear(fullOffsets, step)) {
3018              // We have a linear sequence of memory locations being accessed,
3019              // starting at the location given by the first offset element,
3020              // with a stride equal to the size of the element type being
3021              // gathered or scattered.
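              // As an illustrative example, a gather of i32 values whose
              // computed offsets come out as <0, 4, 8, 12, ...> is linear
              // with a step of 4, so it can be replaced with a single
              // (possibly unaligned) masked vector load from the common
              // base pointer.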
3022  llvm::Value *ptr = lComputeCommonPointer(base, fullOffsets, callInst);
3023  lCopyMetadata(ptr, callInst);
3024 
3025  if (gatherInfo != NULL) {
3026  Debug(pos, "Transformed gather to unaligned vector load!");
3027  llvm::Instruction *newCall =
3028  lCallInst(gatherInfo->loadMaskedFunc, ptr, mask,
3029  LLVMGetName(ptr, "_masked_load"));
3030  lCopyMetadata(newCall, callInst);
3031  llvm::ReplaceInstWithInst(callInst, newCall);
3032  return true;
3033  }
3034  else {
3035  Debug(pos, "Transformed scatter to unaligned vector store!");
3036  ptr = new llvm::BitCastInst(ptr, scatterInfo->vecPtrType, "ptrcast",
3037  callInst);
3038  llvm::Instruction *newCall =
3039  lCallInst(scatterInfo->maskedStoreFunc, ptr, storeValue,
3040  mask, "");
3041  lCopyMetadata(newCall, callInst);
3042  llvm::ReplaceInstWithInst(callInst, newCall);
3043  return true;
3044  }
3045  }
3046  return false;
3047  }
3048 }
3049 
3050 
3051 ///////////////////////////////////////////////////////////////////////////
3052 // MaskedStoreOptPass
3053 
3054 /** Masked stores are generally more complex than regular stores; for
3055  example, they require multiple instructions to simulate under SSE.
3056  This optimization detects cases where masked stores can be replaced
3057  with regular stores or removed entirely, for the cases of an 'all on'
3058  mask and an 'all off' mask, respectively.
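
    As a purely illustrative example (hypothetical ispc source, not part
    of this file):

        if (programIndex < n)
            a[programIndex] = v;    // compiles to a masked store

    If the mask can be proven all-on at compile time, the pseudo masked
    store below becomes an ordinary vector store; if it is provably
    all-off, the call is erased entirely.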
3059 */
3060 static bool
3061 lImproveMaskedStore(llvm::CallInst *callInst) {
3062  struct MSInfo {
3063  MSInfo(const char *name, const int a)
3064  : align(a) {
3065  func = m->module->getFunction(name);
3066  Assert(func != NULL);
3067  }
3068  llvm::Function *func;
3069  const int align;
3070  };
3071 
3072  MSInfo msInfo[] = {
3073  MSInfo("__pseudo_masked_store_i8", 1),
3074  MSInfo("__pseudo_masked_store_i16", 2),
3075  MSInfo("__pseudo_masked_store_i32", 4),
3076  MSInfo("__pseudo_masked_store_float", 4),
3077  MSInfo("__pseudo_masked_store_i64", 8),
3078  MSInfo("__pseudo_masked_store_double", 8),
3079  MSInfo("__masked_store_blend_i8", 1),
3080  MSInfo("__masked_store_blend_i16", 2),
3081  MSInfo("__masked_store_blend_i32", 4),
3082  MSInfo("__masked_store_blend_float", 4),
3083  MSInfo("__masked_store_blend_i64", 8),
3084  MSInfo("__masked_store_blend_double", 8),
3085  MSInfo("__masked_store_i8", 1),
3086  MSInfo("__masked_store_i16", 2),
3087  MSInfo("__masked_store_i32", 4),
3088  MSInfo("__masked_store_float", 4),
3089  MSInfo("__masked_store_i64", 8),
3090  MSInfo("__masked_store_double", 8)
3091  };
3092 
3093  llvm::Function *called = callInst->getCalledFunction();
3094 
3095  int nMSFuncs = sizeof(msInfo) / sizeof(msInfo[0]);
3096  MSInfo *info = NULL;
3097  for (int i = 0; i < nMSFuncs; ++i) {
3098  if (msInfo[i].func != NULL && called == msInfo[i].func) {
3099  info = &msInfo[i];
3100  break;
3101  }
3102  }
3103  if (info == NULL)
3104  return false;
3105 
3106  // Got one; grab the operands
3107  llvm::Value *lvalue = callInst->getArgOperand(0);
3108  llvm::Value *rvalue = callInst->getArgOperand(1);
3109  llvm::Value *mask = callInst->getArgOperand(2);
3110 
3111  MaskStatus maskStatus = lGetMaskStatus(mask);
3112  if (maskStatus == ALL_OFF) {
3113  // Zero mask - no-op, so remove the store completely. (This
3114  // may in turn lead to being able to optimize out instructions
3115  // that compute the rvalue...)
3116  callInst->eraseFromParent();
3117  return true;
3118  }
3119  else if (maskStatus == ALL_ON) {
3120  // The mask is all on, so turn this into a regular store
3121  llvm::Type *rvalueType = rvalue->getType();
3122  llvm::Type *ptrType = llvm::PointerType::get(rvalueType, 0);
3123 
3124  lvalue = new llvm::BitCastInst(lvalue, ptrType, "lvalue_to_ptr_type", callInst);
3125  lCopyMetadata(lvalue, callInst);
3126  llvm::Instruction *store =
3127          new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
3128                              g->opt.forceAlignedMemory ?
3129                              g->target->getNativeVectorAlignment() : info->align);
3130  lCopyMetadata(store, callInst);
3131  llvm::ReplaceInstWithInst(callInst, store);
3132  return true;
3133  }
3134 
3135  return false;
3136 }
3137 
3138 
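/** The counterpart of lImproveMaskedStore() for masked loads: if the mask
    is provably all off, the masked load is replaced with an undef value
    (nothing needs to be read); if it is provably all on, it is replaced
    with a regular vector load.
*/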
3139 static bool
3140 lImproveMaskedLoad(llvm::CallInst *callInst,
3141  llvm::BasicBlock::iterator iter) {
3142  struct MLInfo {
3143  MLInfo(const char *name, const int a)
3144  : align(a) {
3145  func = m->module->getFunction(name);
3146  Assert(func != NULL);
3147  }
3148  llvm::Function *func;
3149  const int align;
3150  };
3151 
3152  MLInfo mlInfo[] = {
3153  MLInfo("__masked_load_i8", 1),
3154  MLInfo("__masked_load_i16", 2),
3155  MLInfo("__masked_load_i32", 4),
3156  MLInfo("__masked_load_float", 4),
3157  MLInfo("__masked_load_i64", 8),
3158  MLInfo("__masked_load_double", 8)
3159  };
3160 
3161  llvm::Function *called = callInst->getCalledFunction();
3162 
3163  int nFuncs = sizeof(mlInfo) / sizeof(mlInfo[0]);
3164  MLInfo *info = NULL;
3165  for (int i = 0; i < nFuncs; ++i) {
3166  if (mlInfo[i].func != NULL && called == mlInfo[i].func) {
3167  info = &mlInfo[i];
3168  break;
3169  }
3170  }
3171  if (info == NULL)
3172  return false;
3173 
3174  // Got one; grab the operands
3175  llvm::Value *ptr = callInst->getArgOperand(0);
3176  llvm::Value *mask = callInst->getArgOperand(1);
3177 
3178  MaskStatus maskStatus = lGetMaskStatus(mask);
3179  if (maskStatus == ALL_OFF) {
3180  // Zero mask - no-op, so replace the load with an undef value
3181  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
3182  iter, llvm::UndefValue::get(callInst->getType()));
3183  return true;
3184  }
3185  else if (maskStatus == ALL_ON) {
3186  // The mask is all on, so turn this into a regular load
3187  llvm::Type *ptrType = llvm::PointerType::get(callInst->getType(), 0);
3188  ptr = new llvm::BitCastInst(ptr, ptrType, "ptr_cast_for_load",
3189  callInst);
3190  llvm::Instruction *load =
3191          new llvm::LoadInst(ptr, callInst->getName(), false /* not volatile */,
3192                             g->opt.forceAlignedMemory ?
3193                             g->target->getNativeVectorAlignment() : info->align,
3194  (llvm::Instruction *)NULL);
3195  lCopyMetadata(load, callInst);
3196  llvm::ReplaceInstWithInst(callInst, load);
3197  return true;
3198  }
3199  else
3200  return false;
3201 }
3202 
3203 
3204  bool
3205  ImproveMemoryOpsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
3206      DEBUG_START_PASS("ImproveMemoryOps");
3207 
3208  bool modifiedAny = false;
3209  restart:
3210  // Iterate through all of the instructions in the basic block.
3211  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
3212  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
3213  // If we don't have a call to one of the
3214  // __pseudo_{gather,scatter}_* functions, then just go on to the
3215  // next instruction.
3216  if (callInst == NULL ||
3217  callInst->getCalledFunction() == NULL)
3218  continue;
3219 
3220  if (lGSToGSBaseOffsets(callInst)) {
3221  modifiedAny = true;
3222  goto restart;
3223  }
3224  if (lGSBaseOffsetsGetMoreConst(callInst)) {
3225  modifiedAny = true;
3226  goto restart;
3227  }
3228  if (lGSToLoadStore(callInst)) {
3229  modifiedAny = true;
3230  goto restart;
3231  }
3232  if (lImproveMaskedStore(callInst)) {
3233  modifiedAny = true;
3234  goto restart;
3235  }
3236  if (lImproveMaskedLoad(callInst, iter)) {
3237  modifiedAny = true;
3238  goto restart;
3239  }
3240  }
3241 
3242  DEBUG_END_PASS("ImproveMemoryOps");
3243 
3244  return modifiedAny;
3245 }
3246 
3247 
3248  static llvm::Pass *
3249  CreateImproveMemoryOpsPass() {
3250      return new ImproveMemoryOpsPass;
3251 }
3252 
3253 
3254 ///////////////////////////////////////////////////////////////////////////
3255 // GatherCoalescePass
3256 
3257 // This pass implements two optimizations to improve the performance of
3258 // gathers; currently only gathers of 32-bit values where it can be
3259 // determined at compile time that the mask is all on are supported, though
3260 // both of those limitations may be generalized in the future.
3261 //
3262 // First, for any single gather, see if it's worthwhile to break it into
3263 // any of scalar, 2-wide (i.e. 64-bit), 4-wide, or 8-wide loads. Further,
3264 // we generate code that shuffles these loads around. Doing fewer, larger
3265 // loads in this manner, when possible, can be more efficient.
3266 //
3267 // Second, this pass can coalesce memory accesses across multiple
3268 // gathers. If we have a series of gathers without any memory writes in
3269 // the middle, then we try to analyze their reads collectively and choose
3270 // an efficient set of loads for them. Not only does this help if
3271 // different gathers reuse values from the same location in memory, but
3272 // it's specifically helpful when data with AOS layout is being accessed;
3273 // in this case, we're often able to generate wide vector loads and
3274 // appropriate shuffles automatically.
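//
// As an illustrative (hypothetical) example, consider AOS accesses like:
//
//     // struct Point { float x, y, z; };  uniform Point pts[...];
//     float x = pts[programIndex].x;
//     float y = pts[programIndex].y;
//     float z = pts[programIndex].z;
//
// Each of these compiles to a gather whose per-lane offsets are
// compile-time constants from a common base pointer.  This pass can
// often replace the three gathers with a few wide vector loads of
// consecutive Point structs plus shuffles that extract the x, y, and z
// components.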
3275 
3276 class GatherCoalescePass : public llvm::BasicBlockPass {
3277 public:
3278  static char ID;
3279  GatherCoalescePass() : BasicBlockPass(ID) { }
3280 
3281  const char *getPassName() const { return "Gather Coalescing"; }
3282  bool runOnBasicBlock(llvm::BasicBlock &BB);
3283 };
3284 
3285 char GatherCoalescePass::ID = 0;
3286 
3287 
3288 /** Representation of a memory load that the gather coalescing code has
3289  decided to generate.
3290  */
3291  struct CoalescedLoadOp {
3292      CoalescedLoadOp(int64_t s, int c) {
3293  start = s;
3294  count = c;
3295  load = element0 = element1 = NULL;
3296  }
3297 
3298  /** Starting offset of the load from the common base pointer (in terms
3299  of numbers of items of the underlying element type--*not* in terms
3300  of bytes). */
3301  int64_t start;
3302 
3303  /** Number of elements to load at this location */
3304  int count;
3305 
3306  /** Value loaded from memory for this load op */
3307  llvm::Value *load;
3308 
3309  /** For 2-wide loads (i.e. 64-bit loads), these store the lower and
3310  upper 32 bits of the result, respectively. */
3311  llvm::Value *element0, *element1;
3312 };
3313 
3314 
3315 /** This function determines whether it makes sense (and is safe) to
3316  generate a vector load of width vectorWidth, starting at *iter. It
3317  returns true if so, setting *newIter to point to the next element in
3318  the set that isn't taken care of by the generated load. If a vector
3319  load of the given width doesn't make sense, then false is returned.
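
    A small worked example (illustrative only): for the offset set
    { 0, 1, 2, 3, 8 } and vectorWidth == 4, the walk starting at offset 0
    accepts 1, 2, and 3 (the span reaches exactly 4), so a 4-wide load at
    offset 0 is reported and *newIter ends up pointing at 8.  With
    vectorWidth == 8 the same walk fails, since the gap between 3 and 8
    exceeds the maximum allowed delta of 3.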
3320  */
3321 static bool
3322 lVectorLoadIsEfficient(std::set<int64_t>::iterator iter,
3323  std::set<int64_t>::iterator end,
3324  std::set<int64_t>::iterator *newIter, int vectorWidth) {
3325  // We're considering a vector load of width vectorWidth, starting at
3326  // the offset "start".
3327  int64_t start = *iter;
3328 
3329  // The basic idea is that we'll look at the subsequent elements in the
3330  // load set after the initial one at start. As long as subsequent
3331  // elements:
3332  //
3333  // 1. Aren't so far separated that they no longer fit into the range
3334  // [start, start+vectorWidth)
3335  //
3336  // 2. And don't have too large a gap in between them (e.g., it's not
3337  // worth generating an 8-wide load for two elements with offsets 0
3338  // and 7, but no loads requested in between).
3339  //
3340  // Then we continue moving forward through the elements until we either
3341  // fill up the vector or run out of elements.
3342 
3343  // lastAccepted holds the last offset we've processed and accepted as
3344      // valid for the vector load under consideration
3345  int64_t lastAccepted = start;
3346 
3347  while (iter != end) {
3348  // What is the separation in offset values from the last element we
3349  // added to the set for this load?
3350  int64_t delta = *iter - lastAccepted;
3351  if (delta > 3)
3352  // If there's too big a gap, then we won't issue the load
3353  return false;
3354 
3355  int64_t span = *iter - start + 1;
3356 
3357  if (span == vectorWidth) {
3358  // We've extended far enough that we have exactly filled up the
3359  // entire vector width; we can't go any further, so return with
3360  // success. (Update *newIter to point at the next element
3361  // after the last one accepted here.)
3362  *newIter = ++iter;
3363  return true;
3364  }
3365  else if (span > vectorWidth) {
3366  // The current offset won't fit into a vectorWidth-wide load
3367  // starting from start. It's still generally worthwhile
3368  // issuing the load we've been considering, though, since it
3369  // will provide values for a number of previous offsets. This
3370  // load will have one or more elements at the end of its range
3371              // that are not needed by any of the offsets under
3372  // consideration. As such, there are three cases where issuing
3373  // this load is a bad idea:
3374  //
3375  // 1. 2-wide loads: we know that we haven't completely filled
3376  // the 2-wide vector, since otherwise the if() test above
3377  // would have succeeded previously. Therefore, we must have
3378  // a situation with offsets like (4,6,...); it would be a
3379  // silly idea to issue a 2-wide load to get the value for
3380  // the 4 offset, versus failing here and issuing a scalar
3381  // load instead.
3382  //
3383  // 2. If there are too many unnecessary values at the end of
3384  // the load extent (defined as more than half of them)--in
3385  // this case, it'd be better to issue a vector load of
3386  // smaller width anyway.
3387  //
3388  // 3. If the gap between the last accepted offset and the
3389  // current one under consideration is more than the page
3390  // size. In this case we can't be sure whether or not some
3391  // of the unused elements at the end of the load will
3392  // straddle a page boundary and thus lead to an undesirable
3393  // fault. (It's hard to imagine this happening in practice,
3394  // except under contrived circumstances, but better safe
3395  // than sorry.)
3396  const int pageSize = 4096;
3397  if (vectorWidth != 2 &&
3398  (lastAccepted - start) > (vectorWidth / 2) &&
3399  (*iter - lastAccepted) < pageSize) {
3400  *newIter = iter;
3401  return true;
3402  }
3403  else
3404  return false;
3405  }
3406 
3407  // Continue moving forward
3408  lastAccepted = *iter;
3409  ++iter;
3410  }
3411 
3412  return false;
3413 }
3414 
3415 
3416 /** Given a set of offsets from a common base pointer that we need to get
3417  loaded into memory, determine a reasonable set of load operations that
3418  gets all of the corresponding values in memory (ideally, including as
3419  many as possible wider vector loads rather than scalar loads). Return
3420  a CoalescedLoadOp for each one in the *loads array.
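
    For example (illustrative only), the offsets { 0, 1, 2, 3, 4, 5, 6,
    7, 16 } would be covered by a single 8-wide load starting at offset 0
    plus a scalar load at offset 16.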
3421  */
3422 static void
3423 lSelectLoads(const std::vector<int64_t> &loadOffsets,
3424  std::vector<CoalescedLoadOp> *loads) {
3425  // First, get a sorted set of unique offsets to load from.
3426  std::set<int64_t> allOffsets;
3427  for (unsigned int i = 0; i < loadOffsets.size(); ++i)
3428  allOffsets.insert(loadOffsets[i]);
3429 
3430  std::set<int64_t>::iterator iter = allOffsets.begin();
3431  while (iter != allOffsets.end()) {
3432  Debug(SourcePos(), "Load needed at %" PRId64 ".", *iter);
3433  ++iter;
3434  }
3435 
3436  // Now, iterate over the offsets from low to high. Starting at the
3437  // current offset, we see if a vector load starting from that offset
3438  // will cover loads at subsequent offsets as well.
3439  iter = allOffsets.begin();
3440  while (iter != allOffsets.end()) {
3441          // Consider vector loads of each of the widths in
3442          // vectorWidths[], in order.
3443  int vectorWidths[] = { 8, 4, 2 };
3444  int nVectorWidths = sizeof(vectorWidths) / sizeof(vectorWidths[0]);
3445  bool gotOne = false;
3446  for (int i = 0; i < nVectorWidths; ++i) {
3447  // See if a load of vector with width vectorWidths[i] would be
3448  // effective (i.e. would cover a reasonable number of the
3449  // offsets that need to be loaded from).
3450  std::set<int64_t>::iterator newIter;
3451  if (lVectorLoadIsEfficient(iter, allOffsets.end(), &newIter,
3452  vectorWidths[i])) {
3453  // Yes: create the corresponding coalesced load and update
3454  // the iterator to the returned iterator; doing so skips
3455  // over the additional offsets that are taken care of by
3456  // this load.
3457  loads->push_back(CoalescedLoadOp(*iter, vectorWidths[i]));
3458  iter = newIter;
3459  gotOne = true;
3460  break;
3461  }
3462  }
3463 
3464  if (gotOne == false) {
3465  // We couldn't find a vector load starting from this offset
3466  // that made sense, so emit a scalar load and continue onward.
3467  loads->push_back(CoalescedLoadOp(*iter, 1));
3468  ++iter;
3469  }
3470  }
3471 }
3472 
3473 
3474 /** Print a performance message with the details of the result of
3475  coalescing over a group of gathers. */
3476 static void
3477 lCoalescePerfInfo(const std::vector<llvm::CallInst *> &coalesceGroup,
3478  const std::vector<CoalescedLoadOp> &loadOps) {
3479  SourcePos pos;
3480  lGetSourcePosFromMetadata(coalesceGroup[0], &pos);
3481 
3482  // Create a string that indicates the line numbers of the subsequent
3483  // gathers from the first one that were coalesced here.
3484  char otherPositions[512];
3485  otherPositions[0] = '\0';
3486  if (coalesceGroup.size() > 1) {
3487  const char *plural = (coalesceGroup.size() > 2) ? "s" : "";
3488  char otherBuf[32];
3489  sprintf(otherBuf, "(other%s at line%s ", plural, plural);
3490  strcat(otherPositions, otherBuf);
3491 
3492  for (int i = 1; i < (int)coalesceGroup.size(); ++i) {
3493  SourcePos p;
3494  bool ok = lGetSourcePosFromMetadata(coalesceGroup[i], &p);
3495  if (ok) {
3496  char buf[32];
3497  sprintf(buf, "%d", p.first_line);
3498  strcat(otherPositions, buf);
3499  if (i < (int)coalesceGroup.size() - 1)
3500  strcat(otherPositions, ", ");
3501  }
3502  }
3503  strcat(otherPositions, ") ");
3504  }
3505 
3506  // Count how many loads of each size there were.
3507  std::map<int, int> loadOpsCount;
3508  for (int i = 0; i < (int)loadOps.size(); ++i)
3509  ++loadOpsCount[loadOps[i].count];
3510 
3511      // Generate a string that describes the mix of load ops
3512  char loadOpsInfo[512];
3513  loadOpsInfo[0] = '\0';
3514  std::map<int, int>::const_iterator iter = loadOpsCount.begin();
3515  while (iter != loadOpsCount.end()) {
3516  char buf[32];
3517  sprintf(buf, "%d x %d-wide", iter->second, iter->first);
3518  strcat(loadOpsInfo, buf);
3519  ++iter;
3520  if (iter != loadOpsCount.end())
3521  strcat(loadOpsInfo, ", ");
3522  }
3523 
3524  if (coalesceGroup.size() == 1)
3525  PerformanceWarning(pos, "Coalesced gather into %d load%s (%s).",
3526  (int)loadOps.size(),
3527  (loadOps.size() > 1) ? "s" : "", loadOpsInfo);
3528  else
3529  PerformanceWarning(pos, "Coalesced %d gathers starting here %sinto %d "
3530  "load%s (%s).", (int)coalesceGroup.size(),
3531  otherPositions,(int)loadOps.size(),
3532  (loadOps.size() > 1) ? "s" : "", loadOpsInfo);
3533 }
3534 
3535 
3536 /** Utility routine that computes an offset from a base pointer and then
3537  returns the result of a load of the given type from the resulting
3538  location:
3539 
3540  return *((type *)(basePtr + offset))
3541  */
3542 llvm::Value *
3543 lGEPAndLoad(llvm::Value *basePtr, int64_t offset, int align,
3544  llvm::Instruction *insertBefore, llvm::Type *type) {
3545  llvm::Value *ptr = lGEPInst(basePtr, LLVMInt64(offset), "new_base",
3546  insertBefore);
3547  ptr = new llvm::BitCastInst(ptr, llvm::PointerType::get(type, 0),
3548  "ptr_cast", insertBefore);
3549  return new llvm::LoadInst(ptr, "gather_load", false /* not volatile */,
3550  align, insertBefore);
3551 }
3552 
3553 
3554  /* Having decided that we're going to emit a series of loads, as encoded in
3555  the loadOps array, this function emits the corresponding load
3556  instructions.
3557  */
3558 static void
3559 lEmitLoads(llvm::Value *basePtr, std::vector<CoalescedLoadOp> &loadOps,
3560  int elementSize, llvm::Instruction *insertBefore) {
3561  Debug(SourcePos(), "Coalesce doing %d loads.", (int)loadOps.size());
3562  for (int i = 0; i < (int)loadOps.size(); ++i) {
3563  Debug(SourcePos(), "Load #%d @ %" PRId64 ", %d items", i, loadOps[i].start,
3564  loadOps[i].count);
3565 
3566  // basePtr is an i8 *, so the offset from it should be in terms of
3567  // bytes, not underlying i32 elements.
3568  int64_t start = loadOps[i].start * elementSize;
3569 
3570  int align = 4;
3571  switch (loadOps[i].count) {
3572  case 1:
3573  // Single 32-bit scalar load
3574              loadOps[i].load = lGEPAndLoad(basePtr, start, align, insertBefore,
3575                                            LLVMTypes::Int32Type);
3576              break;
3577  case 2: {
3578  // Emit 2 x i32 loads as i64 loads and then break the result
3579  // into two 32-bit parts.
3580              loadOps[i].load = lGEPAndLoad(basePtr, start, align, insertBefore,
3581                                            LLVMTypes::Int64Type);
3582              // element0 = (int32)value;
3583  loadOps[i].element0 =
3584  new llvm::TruncInst(loadOps[i].load, LLVMTypes::Int32Type,
3585  "load64_elt0", insertBefore);
3586  // element1 = (int32)(value >> 32)
3587  llvm::Value *shift =
3588  llvm::BinaryOperator::Create(llvm::Instruction::LShr,
3589  loadOps[i].load, LLVMInt64(32),
3590  "load64_shift", insertBefore);
3591  loadOps[i].element1 =
3592  new llvm::TruncInst(shift, LLVMTypes::Int32Type,
3593  "load64_elt1", insertBefore);
3594  break;
3595  }
3596  case 4: {
3597  // 4-wide vector load
3598  if (g->opt.forceAlignedMemory) {
3599  align = g->target->getNativeVectorAlignment();
3600  }
3601  llvm::VectorType *vt =
3602  llvm::VectorType::get(LLVMTypes::Int32Type, 4);
3603  loadOps[i].load = lGEPAndLoad(basePtr, start, align,
3604  insertBefore, vt);
3605  break;
3606  }
3607  case 8: {
3608  // 8-wide vector load
3609  if (g->opt.forceAlignedMemory) {
3610  align = g->target->getNativeVectorAlignment();
3611  }
3612  llvm::VectorType *vt =
3613  llvm::VectorType::get(LLVMTypes::Int32Type, 8);
3614  loadOps[i].load = lGEPAndLoad(basePtr, start, align,
3615  insertBefore, vt);
3616  break;
3617  }
3618  default:
3619  FATAL("Unexpected load count in lEmitLoads()");
3620  }
3621  }
3622 }
3623 
3624 
3625 /** Convert any loads of 8-wide vectors into two 4-wide vectors
3626  (logically). This allows the assembly code below to always operate on
3627  4-wide vectors, which leads to better code. Returns a new vector of
3628  load operations.
3629  */
3630 static std::vector<CoalescedLoadOp>
3631 lSplit8WideLoads(const std::vector<CoalescedLoadOp> &loadOps,
3632  llvm::Instruction *insertBefore) {
3633  std::vector<CoalescedLoadOp> ret;
3634  for (unsigned int i = 0; i < loadOps.size(); ++i) {
3635  if (loadOps[i].count == 8) {
3636              // Create fake CoalescedLoadOps, where the load llvm::Value is
3637  // actually a shuffle that pulls either the first 4 or the last
3638  // 4 values out of the original 8-wide loaded value.
3639  int32_t shuf[2][4] = { { 0, 1, 2, 3 }, { 4, 5, 6, 7 } };
3640 
3641  ret.push_back(CoalescedLoadOp(loadOps[i].start, 4));
3642  ret.back().load = LLVMShuffleVectors(loadOps[i].load, loadOps[i].load,
3643  shuf[0], 4, insertBefore);
3644 
3645  ret.push_back(CoalescedLoadOp(loadOps[i].start+4, 4));
3646  ret.back().load = LLVMShuffleVectors(loadOps[i].load, loadOps[i].load,
3647  shuf[1], 4, insertBefore);
3648  }
3649  else
3650  ret.push_back(loadOps[i]);
3651  }
3652 
3653  return ret;
3654 }
3655 
3656 
3657 /** Given a 1-wide load of a 32-bit value, merge its value into the result
3658  vector for any and all elements for which it applies.
3659  */
3660 static llvm::Value *
3661 lApplyLoad1(llvm::Value *result, const CoalescedLoadOp &load,
3662  const int64_t offsets[4], bool set[4],
3663  llvm::Instruction *insertBefore) {
3664  for (int elt = 0; elt < 4; ++elt) {
3665  if (offsets[elt] >= load.start &&
3666  offsets[elt] < load.start + load.count) {
3667  Debug(SourcePos(), "Load 1 @ %" PRId64 " matches for element #%d "
3668  "(value %" PRId64 ")", load.start, elt, offsets[elt]);
3669  // If this load gives one of the values that we need, then we
3670  // can just insert it in directly
3671  Assert(set[elt] == false);
3672  result =
3673  llvm::InsertElementInst::Create(result, load.load, LLVMInt32(elt),
3674  "insert_load", insertBefore);
3675  set[elt] = true;
3676  }
3677  }
3678 
3679  return result;
3680 }
3681 
3682 
3683 /** Similarly, incorporate the values from a 2-wide load into any vector
3684  elements that they apply to. */
3685 static llvm::Value *
3686 lApplyLoad2(llvm::Value *result, const CoalescedLoadOp &load,
3687  const int64_t offsets[4], bool set[4],
3688  llvm::Instruction *insertBefore) {
3689  for (int elt = 0; elt < 4; ++elt) {
3690  // First, try to do a 64-bit-wide insert into the result vector.
3691  // We can do this when we're currently at an even element, when the
3692  // current and next element have consecutive values, and where the
3693  // original 64-bit load is at the offset needed by the current
3694  // element.
3695  if ((elt & 1) == 0 &&
3696  offsets[elt] + 1 == offsets[elt+1] &&
3697  offsets[elt] == load.start) {
3698  Debug(SourcePos(), "Load 2 @ %" PRId64 " matches for elements #%d,%d "
3699  "(values %" PRId64 ",%" PRId64 ")", load.start, elt, elt+1,
3700  offsets[elt], offsets[elt+1]);
3701  Assert(set[elt] == false && set[elt+1] == false);
3702 
3703  // In this case, we bitcast from a 4xi32 to a 2xi64 vector
3704  llvm::Type *vec2x64Type =
3705  llvm::VectorType::get(LLVMTypes::Int64Type, 2);
3706  result = new llvm::BitCastInst(result, vec2x64Type, "to2x64",
3707  insertBefore);
3708 
3709  // And now we can insert the 64-bit wide value into the
3710              // appropriate element
3711  result = llvm::InsertElementInst::Create(result, load.load,
3712  LLVMInt32(elt/2),
3713  "insert64", insertBefore);
3714 
3715  // And back to 4xi32.
3716  llvm::Type *vec4x32Type =
3717  llvm::VectorType::get(LLVMTypes::Int32Type, 4);
3718  result = new llvm::BitCastInst(result, vec4x32Type, "to4x32",
3719  insertBefore);
3720 
3721  set[elt] = set[elt+1] = true;
3722  // Advance elt one extra time, since we just took care of two
3723  // elements
3724  ++elt;
3725  }
3726  else if (offsets[elt] >= load.start &&
3727  offsets[elt] < load.start + load.count) {
3728  Debug(SourcePos(), "Load 2 @ %" PRId64 " matches for element #%d "
3729  "(value %" PRId64 ")", load.start, elt, offsets[elt]);
3730  // Otherwise, insert one of the 32-bit pieces into an element
3731  // of the final vector
3732  Assert(set[elt] == false);
3733  llvm::Value *toInsert = (offsets[elt] == load.start) ?
3734  load.element0 : load.element1;
3735  result =
3736  llvm::InsertElementInst::Create(result, toInsert, LLVMInt32(elt),
3737  "insert_load", insertBefore);
3738  set[elt] = true;
3739  }
3740  }
3741 
3742  return result;
3743 }
3744 
3745 
3746 #if 1
3747 /* This approach works better with AVX, while the #else path generates
3748  slightly better code with SSE. Need to continue to dig into performance
3749  details with this stuff in general... */
3750 
3751 /** And handle a 4-wide load */
3752 static llvm::Value *
3753 lApplyLoad4(llvm::Value *result, const CoalescedLoadOp &load,
3754  const int64_t offsets[4], bool set[4],
3755  llvm::Instruction *insertBefore) {
3756      // Conceptually, we're going to consider doing a shufflevector with
3757  // the 4-wide load and the 4-wide result we have so far to generate a
3758  // new 4-wide vector. We'll start with shuffle indices that just
3759  // select each element of the result so far for the result.
3760  int32_t shuf[4] = { 4, 5, 6, 7 };
3761 
3762  for (int elt = 0; elt < 4; ++elt) {
3763  if (offsets[elt] >= load.start &&
3764  offsets[elt] < load.start + load.count) {
3765  Debug(SourcePos(), "Load 4 @ %" PRId64 " matches for element #%d "
3766  "(value %" PRId64 ")", load.start, elt, offsets[elt]);
3767 
3768  // If the current element falls within the range of locations
3769  // that the 4-wide load covers, then compute the appropriate
3770  // shuffle index that extracts the appropriate element from the
3771  // load.
3772  Assert(set[elt] == false);
3773  shuf[elt] = int32_t(offsets[elt] - load.start);
3774  set[elt] = true;
3775  }
3776  }
3777 
3778  // Now, issue a shufflevector instruction if any of the values from the
3779  // load we just considered were applicable.
3780  if (shuf[0] != 4 || shuf[1] != 5 || shuf[2] != 6 || shuf[3] != 7)
3781  result = LLVMShuffleVectors(load.load, result, shuf, 4, insertBefore);
3782 
3783  return result;
3784 }
3785 
3786 
3787  /** We need to fill in the values for a 4-wide result vector. This
3788  function looks at all of the generated loads and extracts the
3789  appropriate elements from the appropriate loads to assemble the result.
3790  Here the offsets[] parameter gives the 4 offsets from the base pointer
3791  for the four elements of the result.
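
    As an illustrative example, with result offsets { 2, 3, 5, 9 } and
    loads { 4-wide @ 2, scalar @ 9 }, the 4-wide load supplies elements
    0, 1, and 2 of the result via a single shufflevector (picking indices
    0, 1, and 3 of the load), and the scalar load is then inserted into
    element 3.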
3792 */
3793 static llvm::Value *
3794 lAssemble4Vector(const std::vector<CoalescedLoadOp> &loadOps,
3795  const int64_t offsets[4], llvm::Instruction *insertBefore) {
3796  llvm::Type *returnType =
3797  llvm::VectorType::get(LLVMTypes::Int32Type, 4);
3798  llvm::Value *result = llvm::UndefValue::get(returnType);
3799 
3800  Debug(SourcePos(), "Starting search for loads [%" PRId64 " %" PRId64 " %"
3801  PRId64 " %" PRId64 "].", offsets[0], offsets[1], offsets[2], offsets[3]);
3802 
3803  // Track whether we have found a valid value for each of the four
3804  // elements of the result
3805  bool set[4] = { false, false, false, false };
3806 
3807  // Loop over all of the loads and check each one to see if it provides
3808  // a value that's applicable to the result
3809  for (int load = 0; load < (int)loadOps.size(); ++load) {
3810  const CoalescedLoadOp &li = loadOps[load];
3811 
3812  switch (li.count) {
3813  case 1:
3814  result = lApplyLoad1(result, li, offsets, set, insertBefore);
3815  break;
3816  case 2:
3817  result = lApplyLoad2(result, li, offsets, set, insertBefore);
3818  break;
3819  case 4:
3820  result = lApplyLoad4(result, li, offsets, set, insertBefore);
3821  break;
3822  default:
3823  FATAL("Unexpected load count in lAssemble4Vector()");
3824  }
3825  }
3826 
3827  Debug(SourcePos(), "Done with search for loads [%" PRId64 " %" PRId64 " %"
3828  PRId64 " %" PRId64 "].", offsets[0], offsets[1], offsets[2], offsets[3]);
3829 
3830  for (int i = 0; i < 4; ++i)
3831  Assert(set[i] == true);
3832 
3833  return result;
3834 }
3835 
3836 #else
3837 
3838 static llvm::Value *
3839 lApplyLoad4s(llvm::Value *result, const std::vector<CoalescedLoadOp> &loadOps,
3840  const int64_t offsets[4], bool set[4],
3841  llvm::Instruction *insertBefore) {
3842  int32_t firstMatchElements[4] = { -1, -1, -1, -1 };
3843  const CoalescedLoadOp *firstMatch = NULL;
3844 
3845  Assert(llvm::isa<llvm::UndefValue>(result));
3846 
3847  for (int load = 0; load < (int)loadOps.size(); ++load) {
3848  const CoalescedLoadOp &loadop = loadOps[load];
3849  if (loadop.count != 4)
3850  continue;
3851 
3852  int32_t matchElements[4] = { -1, -1, -1, -1 };
3853  bool anyMatched = false;
3854  for (int elt = 0; elt < 4; ++elt) {
3855  if (offsets[elt] >= loadop.start &&
3856  offsets[elt] < loadop.start + loadop.count) {
3857  Debug(SourcePos(), "Load 4 @ %" PRId64 " matches for element #%d "
3858  "(value %" PRId64 ")", loadop.start, elt, offsets[elt]);
3859  anyMatched = true;
3860  Assert(set[elt] == false);
3861  matchElements[elt] = offsets[elt] - loadop.start;
3862  set[elt] = true;
3863  }
3864  }
3865 
3866  if (anyMatched) {
3867  if (llvm::isa<llvm::UndefValue>(result)) {
3868  if (firstMatch == NULL) {
3869  firstMatch = &loadop;
3870  for (int i = 0; i < 4; ++i)
3871  firstMatchElements[i] = matchElements[i];
3872  }
3873  else {
3874  int32_t shuffle[4] = { -1, -1, -1, -1 };
3875  for (int i = 0; i < 4; ++i) {
3876  if (firstMatchElements[i] != -1)
3877  shuffle[i] = firstMatchElements[i];
3878  else
3879  shuffle[i] = 4 + matchElements[i];
3880  }
3881  result = LLVMShuffleVectors(firstMatch->load, loadop.load, shuffle,
3882  4, insertBefore);
3883  firstMatch = NULL;
3884  }
3885  }
3886  else {
3887  int32_t shuffle[4] = { -1, -1, -1, -1 };
3888  for (int i = 0; i < 4; ++i) {
3889  if (matchElements[i] != -1)
3890  shuffle[i] = 4 + matchElements[i];
3891  else
3892  shuffle[i] = i;
3893  }
3894  result = LLVMShuffleVectors(result, loadop.load, shuffle, 4,
3895  insertBefore);
3896  }
3897  }
3898  }
3899 
3900  if (firstMatch != NULL && llvm::isa<llvm::UndefValue>(result))
3901  return LLVMShuffleVectors(firstMatch->load, result, firstMatchElements,
3902  4, insertBefore);
3903  else
3904  return result;
3905 }
3906 
3907 
3908 static llvm::Value *
3909 lApplyLoad12s(llvm::Value *result, const std::vector<CoalescedLoadOp> &loadOps,
3910  const int64_t offsets[4], bool set[4],
3911  llvm::Instruction *insertBefore) {
3912  // Loop over all of the loads and check each one to see if it provides
3913  // a value that's applicable to the result
3914  for (int load = 0; load < (int)loadOps.size(); ++load) {
3915  const CoalescedLoadOp &loadop = loadOps[load];
3916  Assert(loadop.count == 1 || loadop.count == 2 || loadop.count == 4);
3917 
3918  if (loadop.count == 1)
3919  result = lApplyLoad1(result, loadop, offsets, set, insertBefore);
3920  else if (loadop.count == 2)
3921  result = lApplyLoad2(result, loadop, offsets, set, insertBefore);
3922  }
3923  return result;
3924 }
3925 
3926 
3927  /** We need to fill in the values for a 4-wide result vector. This
3928  function looks at all of the generated loads and extracts the
3929  appropriate elements from the appropriate loads to assemble the result.
3930  Here the offsets[] parameter gives the 4 offsets from the base pointer
3931  for the four elements of the result.
3932 */
3933 static llvm::Value *
3934 lAssemble4Vector(const std::vector<CoalescedLoadOp> &loadOps,
3935  const int64_t offsets[4], llvm::Instruction *insertBefore) {
3936  llvm::Type *returnType =
3937  llvm::VectorType::get(LLVMTypes::Int32Type, 4);
3938  llvm::Value *result = llvm::UndefValue::get(returnType);
3939 
3940  Debug(SourcePos(), "Starting search for loads [%" PRId64 " %" PRId64 " %"
3941  PRId64 " %" PRId64 "].", offsets[0], offsets[1], offsets[2], offsets[3]);
3942 
3943  // Track whether we have found a valid value for each of the four
3944  // elements of the result
3945  bool set[4] = { false, false, false, false };
3946 
3947  result = lApplyLoad4s(result, loadOps, offsets, set, insertBefore);
3948  result = lApplyLoad12s(result, loadOps, offsets, set, insertBefore);
3949 
3950  Debug(SourcePos(), "Done with search for loads [%" PRId64 " %" PRId64 " %"
3951  PRId64 " %" PRId64 "].", offsets[0], offsets[1], offsets[2], offsets[3]);
3952 
3953  for (int i = 0; i < 4; ++i)
3954  Assert(set[i] == true);
3955 
3956  return result;
3957 }
3958 #endif
3959 
3960 
3961 /** Given the set of loads that we've done and the set of result values to
3962  be computed, this function computes the final llvm::Value *s for each
3963  result vector.
3964  */
3965 static void
3966 lAssembleResultVectors(const std::vector<CoalescedLoadOp> &loadOps,
3967  const std::vector<int64_t> &constOffsets,
3968  std::vector<llvm::Value *> &results,
3969  llvm::Instruction *insertBefore) {
3970  // We work on 4-wide chunks of the final values, even when we're
3971  // computing 8-wide or 16-wide vectors. This gives better code from
3972  // LLVM's SSE/AVX code generators.
3973  Assert((constOffsets.size() % 4) == 0);
3974  std::vector<llvm::Value *> vec4s;
3975  for (int i = 0; i < (int)constOffsets.size(); i += 4)
3976  vec4s.push_back(lAssemble4Vector(loadOps, &constOffsets[i],
3977  insertBefore));
3978 
3979  // And now concatenate 1, 2, or 4 of the 4-wide vectors computed above
3980  // into 4, 8, or 16-wide final result vectors.
3981  int numGathers = constOffsets.size() / g->target->getVectorWidth();
3982  for (int i = 0; i < numGathers; ++i) {
3983  llvm::Value *result = NULL;
3984  switch (g->target->getVectorWidth()) {
3985  case 4:
3986  result = vec4s[i];
3987  break;
3988  case 8:
3989  result = LLVMConcatVectors(vec4s[2*i], vec4s[2*i+1], insertBefore);
3990  break;
3991  case 16: {
3992  llvm::Value *v1 = LLVMConcatVectors(vec4s[4*i], vec4s[4*i+1],
3993  insertBefore);
3994  llvm::Value *v2 = LLVMConcatVectors(vec4s[4*i+2], vec4s[4*i+3],
3995  insertBefore);
3996  result = LLVMConcatVectors(v1, v2, insertBefore);
3997  break;
3998  }
3999  default:
4000  FATAL("Unhandled vector width in lAssembleResultVectors()");
4001  }
4002 
4003  results.push_back(result);
4004  }
4005 }
4006 
4007 
4008 /** Given a call to a gather function, extract the base pointer, the 2/4/8
4009      scale, and the first varying offsets value, and use them to compute the
4010  scalar base pointer that is shared by all of the gathers in the group.
4011  (Thus, this base pointer plus the constant offsets term for each gather
4012      gives the set of addresses to use for each gather.)
4013  */
4014 static llvm::Value *
4015 lComputeBasePtr(llvm::CallInst *gatherInst, llvm::Instruction *insertBefore) {
4016  llvm::Value *basePtr = gatherInst->getArgOperand(0);
4017  llvm::Value *variableOffsets = gatherInst->getArgOperand(1);
4018  llvm::Value *offsetScale = gatherInst->getArgOperand(2);
4019 
4020  // All of the variable offsets values should be the same, due to
4021  // checking for this in GatherCoalescePass::runOnBasicBlock(). Thus,
4022  // extract the first value and use that as a scalar.
4023  llvm::Value *variable = LLVMExtractFirstVectorElement(variableOffsets);
4024  if (variable->getType() == LLVMTypes::Int64Type)
4025  offsetScale = new llvm::ZExtInst(offsetScale, LLVMTypes::Int64Type,
4026  "scale_to64", insertBefore);
4027  llvm::Value *offset =
4028  llvm::BinaryOperator::Create(llvm::Instruction::Mul, variable,
4029  offsetScale, "offset", insertBefore);
4030 
4031  return lGEPInst(basePtr, offset, "new_base", insertBefore);
4032 }
4033 
4034 
4035 /** Extract the constant offsets (from the common base pointer) from each
4036  of the gathers in a set to be coalesced. These come in as byte
4037  offsets, but we'll transform them into offsets in terms of the size of
4038  the base scalar type being gathered. (e.g. for an i32 gather, we might
4039  have offsets like <0,4,16,20>, which would be transformed to <0,1,4,5>
4040  here.)
4041  */
4042 static void
4043 lExtractConstOffsets(const std::vector<llvm::CallInst *> &coalesceGroup,
4044  int elementSize, std::vector<int64_t> *constOffsets) {
4045  int width = g->target->getVectorWidth();
4046  *constOffsets = std::vector<int64_t>(coalesceGroup.size() * width, 0);
4047 
4048  int64_t *endPtr = &((*constOffsets)[0]);
4049  for (int i = 0; i < (int)coalesceGroup.size(); ++i, endPtr += width) {
4050  llvm::Value *offsets = coalesceGroup[i]->getArgOperand(3);
4051  int nElts;
4052  bool ok = LLVMExtractVectorInts(offsets, endPtr, &nElts);
4053  Assert(ok && nElts == width);
4054  }
4055 
4056  for (int i = 0; i < (int)constOffsets->size(); ++i)
4057  (*constOffsets)[i] /= elementSize;
4058 }
4059 
4060 
4061 /** Actually do the coalescing. We have a set of gathers all accessing
4062  addresses of the form:
4063 
4064  (ptr + {1,2,4,8} * varyingOffset) + constOffset, a.k.a.
4065  basePtr + constOffset
4066 
4067  where varyingOffset actually has the same value across all of the SIMD
4068      lanes and where the part in parentheses has the same value for all of
4069  the gathers in the group.
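
    For instance (illustrative only), three gathers over an AOS float3
    layout might share basePtr = ptr + 4 * varyingOffset and carry
    constOffset vectors of <0,12,24,...>, <4,16,28,...>, and
    <8,20,32,...>; the coalescing then only needs to cover that combined
    set of constant byte offsets with an efficient set of loads from
    basePtr.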
4070  */
4071 static bool
4072 lCoalesceGathers(const std::vector<llvm::CallInst *> &coalesceGroup) {
4073  llvm::Instruction *insertBefore = coalesceGroup[0];
4074 
4075  // First, compute the shared base pointer for all of the gathers
4076  llvm::Value *basePtr = lComputeBasePtr(coalesceGroup[0], insertBefore);
4077 
4078  int elementSize = 0;
4079  if (coalesceGroup[0]->getType() == LLVMTypes::Int32VectorType ||
4080  coalesceGroup[0]->getType() == LLVMTypes::FloatVectorType)
4081  elementSize = 4;
4082  else if (coalesceGroup[0]->getType() == LLVMTypes::Int64VectorType ||
4083  coalesceGroup[0]->getType() == LLVMTypes::DoubleVectorType)
4084  elementSize = 8;
4085  else
4086  FATAL("Unexpected gather type in lCoalesceGathers");
4087 
4088  // Extract the constant offsets from the gathers into the constOffsets
4089  // vector: the first vectorWidth elements will be those for the first
4090  // gather, the next vectorWidth those for the next gather, and so
4091  // forth.
4092  std::vector<int64_t> constOffsets;
4093  lExtractConstOffsets(coalesceGroup, elementSize, &constOffsets);
4094 
4095  // Determine a set of loads to perform to get all of the values we need
4096  // loaded.
4097  std::vector<CoalescedLoadOp> loadOps;
4098  lSelectLoads(constOffsets, &loadOps);
4099 
4100  lCoalescePerfInfo(coalesceGroup, loadOps);
4101 
4102  // Actually emit load instructions for them
4103  lEmitLoads(basePtr, loadOps, elementSize, insertBefore);
4104 
4105  // Now, for any loads that give us <8 x i32> vectors, split their
4106  // values into two <4 x i32> vectors; it turns out that LLVM gives us
4107  // better code on AVX when we assemble the pieces from 4-wide vectors.
4108  loadOps = lSplit8WideLoads(loadOps, insertBefore);
4109 
4110  // Given all of these chunks of values, shuffle together a vector that
4111  // gives us each result value; the i'th element of results[] gives the
4112  // result for the i'th gather in coalesceGroup.
4113  std::vector<llvm::Value *> results;
4114  lAssembleResultVectors(loadOps, constOffsets, results, insertBefore);
4115 
4116  // Finally, replace each of the original gathers with the instruction
4117  // that gives the value from the coalescing process.
4118  Assert(results.size() == coalesceGroup.size());
4119  for (int i = 0; i < (int)results.size(); ++i) {
4120  llvm::Instruction *ir = llvm::dyn_cast<llvm::Instruction>(results[i]);
4121  Assert(ir != NULL);
4122 
4123  llvm::Type *origType = coalesceGroup[i]->getType();
4124  if (origType != ir->getType())
4125  ir = new llvm::BitCastInst(ir, origType, ir->getName(),
4126  coalesceGroup[i]);
4127 
4128  // Previously, all of the instructions to compute the final result
4129          // were inserted into the basic block here; now we remove the very last
4130  // of them (that holds the final result) from the basic block.
4131  // This way, the following ReplaceInstWithInst() call will operate
4132  // successfully. (It expects that the second argument not be in any
4133  // basic block.)
4134  ir->removeFromParent();
4135 
4136  llvm::ReplaceInstWithInst(coalesceGroup[i], ir);
4137  }
4138 
4139  return true;
4140 }
4141 
4142 
4143  /** Given an instruction, returns true if the instruction may write to
4144  memory. This is a conservative test in that it may return true for
4145  some instructions that don't actually end up writing to memory, but
4146  should never return false for an instruction that does write to
4147  memory. */
4148 static bool
4149 lInstructionMayWriteToMemory(llvm::Instruction *inst) {
4150  if (llvm::isa<llvm::StoreInst>(inst) ||
4151  llvm::isa<llvm::AtomicRMWInst>(inst) ||
4152  llvm::isa<llvm::AtomicCmpXchgInst>(inst))
4153  // FIXME: we could be less conservative and try to allow stores if
4154  // we are sure that the pointers don't overlap..
4155  return true;
4156 
4157  // Otherwise, any call instruction that doesn't have an attribute
4158  // indicating it won't write to memory has to be treated as a potential
4159  // store.
4160  llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst);
4161  if (ci != NULL) {
4162  llvm::Function *calledFunc = ci->getCalledFunction();
4163  if (calledFunc == NULL)
4164  return true;
4165 
4166  if (calledFunc->onlyReadsMemory() || calledFunc->doesNotAccessMemory())
4167  return false;
4168  return true;
4169  }
4170 
4171  return false;
4172 }
4173 
4174 
4175  bool
4176  GatherCoalescePass::runOnBasicBlock(llvm::BasicBlock &bb) {
4177      DEBUG_START_PASS("GatherCoalescePass");
4178 
4179  llvm::Function *gatherFuncs[] = {
4180  m->module->getFunction("__pseudo_gather_factored_base_offsets32_i32"),
4181  m->module->getFunction("__pseudo_gather_factored_base_offsets32_float"),
4182  m->module->getFunction("__pseudo_gather_factored_base_offsets64_i32"),
4183  m->module->getFunction("__pseudo_gather_factored_base_offsets64_float"),
4184  };
4185  int nGatherFuncs = sizeof(gatherFuncs) / sizeof(gatherFuncs[0]);
4186 
4187  bool modifiedAny = false;
4188 
4189  restart:
4190  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e;
4191  ++iter) {
4192  // Iterate over all of the instructions and look for calls to
4193  // __pseudo_gather_factored_base_offsets{32,64}_{i32,float} calls.
4194  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
4195  if (callInst == NULL)
4196  continue;
4197 
4198  llvm::Function *calledFunc = callInst->getCalledFunction();
4199  if (calledFunc == NULL)
4200  continue;
4201 
4202  int i;
4203  for (i = 0; i < nGatherFuncs; ++i)
4204  if (gatherFuncs[i] != NULL && calledFunc == gatherFuncs[i])
4205  break;
4206  if (i == nGatherFuncs)
4207  // Doesn't match any of the types of gathers we care about
4208  continue;
4209 
4210  SourcePos pos;
4211  lGetSourcePosFromMetadata(callInst, &pos);
4212  Debug(pos, "Checking for coalescable gathers starting here...");
4213 
4214  llvm::Value *base = callInst->getArgOperand(0);
4215  llvm::Value *variableOffsets = callInst->getArgOperand(1);
4216  llvm::Value *offsetScale = callInst->getArgOperand(2);
4217  llvm::Value *mask = callInst->getArgOperand(4);
4218 
4219  // To apply this optimization, we need a set of one or more gathers
4220  // that fulfill the following conditions:
4221  //
4222  // - Mask all on
4223  // - The variable offsets to all have the same value (i.e., to be
4224  // uniform).
4225  // - Same base pointer, variable offsets, and offset scale (for
4226  // more than one gather)
4227  //
4228  // Then and only then do we have a common base pointer with all
4229          // offsets from it being constants (in which case we can potentially
4230  // coalesce).
4231  if (lGetMaskStatus(mask) != ALL_ON)
4232  continue;
4233 
4234  if (!LLVMVectorValuesAllEqual(variableOffsets))
4235  continue;
4236 
4237  // coalesceGroup stores the set of gathers that we're going to try to
4238  // coalesce over
4239  std::vector<llvm::CallInst *> coalesceGroup;
4240  coalesceGroup.push_back(callInst);
4241 
4242  // Start iterating at the instruction after the initial gather;
4243  // look at the remainder of instructions in the basic block (up
4244  // until we reach a write to memory) to try to find any other
4245  // gathers that can coalesce with this one.
4246  llvm::BasicBlock::iterator fwdIter = iter;
4247  ++fwdIter;
4248  for (; fwdIter != bb.end(); ++fwdIter) {
4249  // Must stop once we come to an instruction that may write to
4250  // memory; otherwise we could end up moving a read before this
4251  // write.
4252  if (lInstructionMayWriteToMemory(&*fwdIter))
4253  break;
4254 
4255  llvm::CallInst *fwdCall = llvm::dyn_cast<llvm::CallInst>(&*fwdIter);
4256  if (fwdCall == NULL ||
4257  fwdCall->getCalledFunction() != calledFunc)
4258  continue;
4259 
4260  SourcePos fwdPos;
4261  bool ok = lGetSourcePosFromMetadata(fwdCall, &fwdPos);
4262  Assert(ok);
4263 
4264  if (g->debugPrint) {
4265  if (base != fwdCall->getArgOperand(0)) {
4266  Debug(fwdPos, "base pointers mismatch");
4267  LLVMDumpValue(base);
4268  LLVMDumpValue(fwdCall->getArgOperand(0));
4269  }
4270  if (variableOffsets != fwdCall->getArgOperand(1)) {
4271  Debug(fwdPos, "varying offsets mismatch");
4272  LLVMDumpValue(variableOffsets);
4273  LLVMDumpValue(fwdCall->getArgOperand(1));
4274  }
4275  if (offsetScale != fwdCall->getArgOperand(2)) {
4276  Debug(fwdPos, "offset scales mismatch");
4277  LLVMDumpValue(offsetScale);
4278  LLVMDumpValue(fwdCall->getArgOperand(2));
4279  }
4280  if (mask != fwdCall->getArgOperand(4)) {
4281  Debug(fwdPos, "masks mismatch");
4282  LLVMDumpValue(mask);
4283  LLVMDumpValue(fwdCall->getArgOperand(4));
4284  }
4285  }
4286 
4287  if (base == fwdCall->getArgOperand(0) &&
4288  variableOffsets == fwdCall->getArgOperand(1) &&
4289  offsetScale == fwdCall->getArgOperand(2) &&
4290  mask == fwdCall->getArgOperand(4)) {
4291  Debug(fwdPos, "This gather can be coalesced.");
4292  coalesceGroup.push_back(fwdCall);
4293 
4294  if (coalesceGroup.size() == 4)
4295  // FIXME: untested heuristic: don't try to coalesce
4296  // over a window of more than 4 gathers, so that we
4297  // don't cause too much register pressure and end up
4298  // spilling to memory anyway.
4299  break;
4300  }
4301  else
4302  Debug(fwdPos, "This gather doesn't match the initial one.");
4303  }
4304 
4305  Debug(pos, "Done with checking for matching gathers");
4306 
4307  // Now that we have a group of gathers, see if we can coalesce them
4308  // into something more efficient than the original set of gathers.
4309  if (lCoalesceGathers(coalesceGroup)) {
4310  modifiedAny = true;
4311  goto restart;
4312  }
4313  }
4314 
4315  DEBUG_END_PASS("GatherCoalescePass");
4316 
4317  return modifiedAny;
4318 }
4319 
4320 
4321  static llvm::Pass *
4322  CreateGatherCoalescePass() {
4323      return new GatherCoalescePass;
4324 }
4325 
4326 
4327 ///////////////////////////////////////////////////////////////////////////
4328 // ReplacePseudoMemoryOpsPass
4329 
4330 /** For any gathers and scatters remaining after the GSToLoadStorePass
4331  runs, we need to turn them into actual native gathers and scatters.
4332  This task is handled by the ReplacePseudoMemoryOpsPass here.
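
    For example, a __pseudo_gather_factored_base_offsets32_float call that
    survives the earlier passes is rewritten here as a call to the
    target's __gather_factored_base_offsets32_float implementation.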
4333  */
4334 class ReplacePseudoMemoryOpsPass : public llvm::BasicBlockPass {
4335 public:
4336  static char ID;
4337  ReplacePseudoMemoryOpsPass() : BasicBlockPass(ID) { }
4338 
4339  const char *getPassName() const { return "Replace Pseudo Memory Ops"; }
4340  bool runOnBasicBlock(llvm::BasicBlock &BB);
4341 };
4342 
4343 
4344  char ReplacePseudoMemoryOpsPass::ID = 0;
4345 
4346 /** This routine attempts to determine if the given pointer in lvalue is
4347  pointing to stack-allocated memory. It's conservative in that it
4348  should never return true for non-stack allocated memory, but may return
4349  false for memory that actually is stack allocated. The basic strategy
4350  is to traverse through the operands and see if the pointer originally
4351  comes from an AllocaInst.
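
    For example, a pointer obtained by bitcasting or GEPing into an
    alloca of a gang-width vector (i.e. storage for a varying local
    variable) is treated as safe, while a pointer passed in from outside
    the function is not: the load + blend + store sequence touches all
    lanes of the vector regardless of the mask, which is only known to be
    harmless for such stack storage.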
4352 */
4353 static bool
4354 lIsSafeToBlend(llvm::Value *lvalue) {
4355  llvm::BitCastInst *bc = llvm::dyn_cast<llvm::BitCastInst>(lvalue);
4356  if (bc != NULL)
4357  return lIsSafeToBlend(bc->getOperand(0));
4358  else {
4359  llvm::AllocaInst *ai = llvm::dyn_cast<llvm::AllocaInst>(lvalue);
4360  if (ai) {
4361  llvm::Type *type = ai->getType();
4362  llvm::PointerType *pt =
4363  llvm::dyn_cast<llvm::PointerType>(type);
4364  assert(pt != NULL);
4365  type = pt->getElementType();
4366  llvm::ArrayType *at;
4367  while ((at = llvm::dyn_cast<llvm::ArrayType>(type))) {
4368  type = at->getElementType();
4369  }
4370  llvm::VectorType *vt =
4371  llvm::dyn_cast<llvm::VectorType>(type);
4372  return (vt != NULL &&
4373  (int)vt->getNumElements() == g->target->getVectorWidth());
4374  }
4375  else {
4376  llvm::GetElementPtrInst *gep =
4377  llvm::dyn_cast<llvm::GetElementPtrInst>(lvalue);
4378  if (gep != NULL)
4379  return lIsSafeToBlend(gep->getOperand(0));
4380  else
4381  return false;
4382  }
4383  }
4384 }
4385 
4386 
4387 static bool
4388 lReplacePseudoMaskedStore(llvm::CallInst *callInst) {
4389  struct LMSInfo {
4390  LMSInfo(const char *pname, const char *bname, const char *msname) {
4391  pseudoFunc = m->module->getFunction(pname);
4392  blendFunc = m->module->getFunction(bname);
4393  maskedStoreFunc = m->module->getFunction(msname);
4394  Assert(pseudoFunc != NULL && blendFunc != NULL &&
4395  maskedStoreFunc != NULL);
4396  }
4397  llvm::Function *pseudoFunc;
4398  llvm::Function *blendFunc;
4399  llvm::Function *maskedStoreFunc;
4400  };
4401 
4402  LMSInfo msInfo[] = {
4403  LMSInfo("__pseudo_masked_store_i8", "__masked_store_blend_i8",
4404  "__masked_store_i8"),
4405  LMSInfo("__pseudo_masked_store_i16", "__masked_store_blend_i16",
4406  "__masked_store_i16"),
4407  LMSInfo("__pseudo_masked_store_i32", "__masked_store_blend_i32",
4408  "__masked_store_i32"),
4409  LMSInfo("__pseudo_masked_store_float", "__masked_store_blend_float",
4410  "__masked_store_float"),
4411  LMSInfo("__pseudo_masked_store_i64", "__masked_store_blend_i64",
4412  "__masked_store_i64"),
4413  LMSInfo("__pseudo_masked_store_double", "__masked_store_blend_double",
4414  "__masked_store_double")
4415  };
4416 
4417  LMSInfo *info = NULL;
4418  for (unsigned int i = 0; i < sizeof(msInfo) / sizeof(msInfo[0]); ++i) {
4419  if (msInfo[i].pseudoFunc != NULL &&
4420  callInst->getCalledFunction() == msInfo[i].pseudoFunc) {
4421  info = &msInfo[i];
4422  break;
4423  }
4424  }
4425  if (info == NULL)
4426  return false;
4427 
4428  llvm::Value *lvalue = callInst->getArgOperand(0);
4429  llvm::Value *rvalue = callInst->getArgOperand(1);
4430  llvm::Value *mask = callInst->getArgOperand(2);
4431 
4432  // We need to choose between doing the load + blend + store trick,
4433  // or serializing the masked store. Even on targets with a native
4434  // masked store instruction, the blend approach is preferable (when it
4435  // is safe) since it lets us keep values in registers rather than on the stack.
4436  bool doBlend = (!g->opt.disableBlendedMaskedStores &&
4437  lIsSafeToBlend(lvalue));
4438 
4439  // Generate the call to the appropriate masked store function and
4440  // replace the __pseudo_* one with it.
4441  llvm::Function *fms = doBlend ? info->blendFunc : info->maskedStoreFunc;
4442  llvm::Instruction *inst = lCallInst(fms, lvalue, rvalue, mask, "", callInst);
4443  lCopyMetadata(inst, callInst);
4444 
4445  callInst->eraseFromParent();
4446  return true;
4447 }
4448 
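// Editor's note: for reference, the blend path chosen above conceptually
// expands to something like the following (a sketch, not the actual builtin
// from the target's bitcode library):
//
//   %old = load <W x T>* %ptr                     ; full-width load
//   %new = select <W x i1> %mask, %rvalue, %old   ; keep old values in off lanes
//   store <W x T> %new, <W x T>* %ptr             ; full-width store
//
// This is only safe when lIsSafeToBlend() has shown that the pointer refers
// to a stack slot of the target's vector width, so the lanes touched by the
// full-width load/store cannot fault or be observed by other threads.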
4449 
4450 static bool
4451 lReplacePseudoGS(llvm::CallInst *callInst) {
4452  struct LowerGSInfo {
4453  LowerGSInfo(const char *pName, const char *aName, bool ig, bool ip)
4454  : isGather(ig), isPrefetch(ip) {
4455  pseudoFunc = m->module->getFunction(pName);
4456  actualFunc = m->module->getFunction(aName);
4457  }
4458  llvm::Function *pseudoFunc;
4459  llvm::Function *actualFunc;
4460  const bool isGather;
4461  const bool isPrefetch;
4462  };
4463 
4464  LowerGSInfo lgsInfo[] = {
4465  LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true, false),
4466  LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true, false),
4467  LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true, false),
4468  LowerGSInfo("__pseudo_gather32_float", "__gather32_float", true, false),
4469  LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true, false),
4470  LowerGSInfo("__pseudo_gather32_double", "__gather32_double", true, false),
4471 
4472  LowerGSInfo("__pseudo_gather64_i8", "__gather64_i8", true, false),
4473  LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true, false),
4474  LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true, false),
4475  LowerGSInfo("__pseudo_gather64_float", "__gather64_float", true, false),
4476  LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true, false),
4477  LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true, false),
4478 
4479  LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8",
4480  "__gather_factored_base_offsets32_i8", true, false),
4481  LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16",
4482  "__gather_factored_base_offsets32_i16", true, false),
4483  LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32",
4484  "__gather_factored_base_offsets32_i32", true, false),
4485  LowerGSInfo("__pseudo_gather_factored_base_offsets32_float",
4486  "__gather_factored_base_offsets32_float", true, false),
4487  LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64",
4488  "__gather_factored_base_offsets32_i64", true, false),
4489  LowerGSInfo("__pseudo_gather_factored_base_offsets32_double",
4490  "__gather_factored_base_offsets32_double", true, false),
4491 
4492  LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8",
4493  "__gather_factored_base_offsets64_i8", true, false),
4494  LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16",
4495  "__gather_factored_base_offsets64_i16", true, false),
4496  LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32",
4497  "__gather_factored_base_offsets64_i32", true, false),
4498  LowerGSInfo("__pseudo_gather_factored_base_offsets64_float",
4499  "__gather_factored_base_offsets64_float", true, false),
4500  LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64",
4501  "__gather_factored_base_offsets64_i64", true, false),
4502  LowerGSInfo("__pseudo_gather_factored_base_offsets64_double",
4503  "__gather_factored_base_offsets64_double", true, false),
4504 
4505  LowerGSInfo("__pseudo_gather_base_offsets32_i8",
4506  "__gather_base_offsets32_i8", true, false),
4507  LowerGSInfo("__pseudo_gather_base_offsets32_i16",
4508  "__gather_base_offsets32_i16", true, false),
4509  LowerGSInfo("__pseudo_gather_base_offsets32_i32",
4510  "__gather_base_offsets32_i32", true, false),
4511  LowerGSInfo("__pseudo_gather_base_offsets32_float",
4512  "__gather_base_offsets32_float", true, false),
4513  LowerGSInfo("__pseudo_gather_base_offsets32_i64",
4514  "__gather_base_offsets32_i64", true, false),
4515  LowerGSInfo("__pseudo_gather_base_offsets32_double",
4516  "__gather_base_offsets32_double", true, false),
4517 
4518  LowerGSInfo("__pseudo_gather_base_offsets64_i8",
4519  "__gather_base_offsets64_i8", true, false),
4520  LowerGSInfo("__pseudo_gather_base_offsets64_i16",
4521  "__gather_base_offsets64_i16", true, false),
4522  LowerGSInfo("__pseudo_gather_base_offsets64_i32",
4523  "__gather_base_offsets64_i32", true, false),
4524  LowerGSInfo("__pseudo_gather_base_offsets64_float",
4525  "__gather_base_offsets64_float", true, false),
4526  LowerGSInfo("__pseudo_gather_base_offsets64_i64",
4527  "__gather_base_offsets64_i64", true, false),
4528  LowerGSInfo("__pseudo_gather_base_offsets64_double",
4529  "__gather_base_offsets64_double", true, false),
4530 
4531  LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false, false),
4532  LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false, false),
4533  LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false, false),
4534  LowerGSInfo("__pseudo_scatter32_float", "__scatter32_float", false, false),
4535  LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false, false),
4536  LowerGSInfo("__pseudo_scatter32_double", "__scatter32_double", false, false),
4537 
4538  LowerGSInfo("__pseudo_scatter64_i8", "__scatter64_i8", false, false),
4539  LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false, false),
4540  LowerGSInfo("__pseudo_scatter64_i32", "__scatter64_i32", false, false),
4541  LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false, false),
4542  LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false, false),
4543  LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false, false),
4544 
4545  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8",
4546  "__scatter_factored_base_offsets32_i8", false, false),
4547  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16",
4548  "__scatter_factored_base_offsets32_i16", false, false),
4549  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32",
4550  "__scatter_factored_base_offsets32_i32", false, false),
4551  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float",
4552  "__scatter_factored_base_offsets32_float", false, false),
4553  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64",
4554  "__scatter_factored_base_offsets32_i64", false, false),
4555  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double",
4556  "__scatter_factored_base_offsets32_double", false, false),
4557 
4558  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8",
4559  "__scatter_factored_base_offsets64_i8", false, false),
4560  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16",
4561  "__scatter_factored_base_offsets64_i16", false, false),
4562  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32",
4563  "__scatter_factored_base_offsets64_i32", false, false),
4564  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float",
4565  "__scatter_factored_base_offsets64_float", false, false),
4566  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64",
4567  "__scatter_factored_base_offsets64_i64", false, false),
4568  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double",
4569  "__scatter_factored_base_offsets64_double", false, false),
4570 
4571 
4572  LowerGSInfo("__pseudo_scatter_base_offsets32_i8",
4573  "__scatter_base_offsets32_i8", false, false),
4574  LowerGSInfo("__pseudo_scatter_base_offsets32_i16",
4575  "__scatter_base_offsets32_i16", false, false),
4576  LowerGSInfo("__pseudo_scatter_base_offsets32_i32",
4577  "__scatter_base_offsets32_i32", false, false),
4578  LowerGSInfo("__pseudo_scatter_base_offsets32_float",
4579  "__scatter_base_offsets32_float", false, false),
4580  LowerGSInfo("__pseudo_scatter_base_offsets32_i64",
4581  "__scatter_base_offsets32_i64", false, false),
4582  LowerGSInfo("__pseudo_scatter_base_offsets32_double",
4583  "__scatter_base_offsets32_double", false, false),
4584 
4585  LowerGSInfo("__pseudo_scatter_base_offsets64_i8",
4586  "__scatter_base_offsets64_i8", false, false),
4587  LowerGSInfo("__pseudo_scatter_base_offsets64_i16",
4588  "__scatter_base_offsets64_i16", false, false),
4589  LowerGSInfo("__pseudo_scatter_base_offsets64_i32",
4590  "__scatter_base_offsets64_i32", false, false),
4591  LowerGSInfo("__pseudo_scatter_base_offsets64_float",
4592  "__scatter_base_offsets64_float", false, false),
4593  LowerGSInfo("__pseudo_scatter_base_offsets64_i64",
4594  "__scatter_base_offsets64_i64", false, false),
4595  LowerGSInfo("__pseudo_scatter_base_offsets64_double",
4596  "__scatter_base_offsets64_double", false, false),
4597 
4598  LowerGSInfo("__pseudo_prefetch_read_varying_1",
4599  "__prefetch_read_varying_1", false, true),
4600  LowerGSInfo("__pseudo_prefetch_read_varying_1_native",
4601  "__prefetch_read_varying_1_native", false, true),
4602 
4603  LowerGSInfo("__pseudo_prefetch_read_varying_2",
4604  "__prefetch_read_varying_2", false, true),
4605  LowerGSInfo("__pseudo_prefetch_read_varying_2_native",
4606  "__prefetch_read_varying_2_native", false, true),
4607 
4608  LowerGSInfo("__pseudo_prefetch_read_varying_3",
4609  "__prefetch_read_varying_3", false, true),
4610  LowerGSInfo("__pseudo_prefetch_read_varying_3_native",
4611  "__prefetch_read_varying_3_native", false, true),
4612 
4613  LowerGSInfo("__pseudo_prefetch_read_varying_nt",
4614  "__prefetch_read_varying_nt", false, true),
4615  LowerGSInfo("__pseudo_prefetch_read_varying_nt_native",
4616  "__prefetch_read_varying_nt_native", false, true),
4617  };
4618 
4619  llvm::Function *calledFunc = callInst->getCalledFunction();
4620 
4621  LowerGSInfo *info = NULL;
4622  for (unsigned int i = 0; i < sizeof(lgsInfo) / sizeof(lgsInfo[0]); ++i) {
4623  if (lgsInfo[i].pseudoFunc != NULL &&
4624  calledFunc == lgsInfo[i].pseudoFunc) {
4625  info = &lgsInfo[i];
4626  break;
4627  }
4628  }
4629  if (info == NULL)
4630  return false;
4631 
4632  Assert(info->actualFunc != NULL);
4633 
4634  // Get the source position from the metadata attached to the call
4635  // instruction so that we can issue PerformanceWarning()s below.
4636  SourcePos pos;
4637  bool gotPosition = lGetSourcePosFromMetadata(callInst, &pos);
4638 
4639  callInst->setCalledFunction(info->actualFunc);
4640  if (gotPosition && g->target->getVectorWidth() > 1) {
4641  if (info->isGather)
4642  PerformanceWarning(pos, "Gather required to load value.");
4643  else if (!info->isPrefetch)
4644  PerformanceWarning(pos, "Scatter required to store value.");
4645  }
4646  return true;
4647 }
4648 
4649 
4650 bool
4651 ReplacePseudoMemoryOpsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
4652  DEBUG_START_PASS("ReplacePseudoMemoryOpsPass");
4653 
4654  bool modifiedAny = false;
4655 
4656  restart:
4657  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
4658  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
4659  if (callInst == NULL ||
4660  callInst->getCalledFunction() == NULL)
4661  continue;
4662 
4663  if (lReplacePseudoGS(callInst)) {
4664  modifiedAny = true;
4665  goto restart;
4666  }
4667  else if (lReplacePseudoMaskedStore(callInst)) {
4668  modifiedAny = true;
4669  goto restart;
4670  }
4671  }
4672 
4673  DEBUG_END_PASS("ReplacePseudoMemoryOpsPass");
4674 
4675  return modifiedAny;
4676 }
4677 
4678 
4679 static llvm::Pass *
4680 CreateReplacePseudoMemoryOpsPass() {
4681  return new ReplacePseudoMemoryOpsPass;
4682 }
4683 
4684 
4685 ///////////////////////////////////////////////////////////////////////////
4686 // IsCompileTimeConstantPass
4687 
4688 /** LLVM IR implementations of target-specific functions may include calls
4689  to the functions "bool __is_compile_time_constant_*(...)"; these allow
4690  them to have specialized code paths for cases where the corresponding
4691  value is known at compile time. For masks, for example, this lets them
4692  avoid the cost of a MOVMSK call at runtime to compute the mask's value
4693  when the mask is in fact known at compile time.
4694 
4695  This pass resolves these calls into either 'true' or 'false' values so
4696  that later optimization passes can operate with these as constants.
4697 
4698  See stdlib.m4 for a number of uses of this idiom.
4699  */
4700 
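// Editor's note: a minimal sketch of the idiom this pass resolves, loosely
// modeled on the builtins generated from stdlib.m4 (names and types here are
// illustrative assumptions, not the exact library code):
//
//   %known = call i1 @__is_compile_time_constant_mask(<W x MASK> %mask)
//   br i1 %known, label %fast, label %general
// fast:       ; reached only if this pass folds %known to 'true', i.e. the
//             ; mask operand was a constant; the branch then goes away
// general:    ; fallback that computes the value at runtime (e.g. via movmsk)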
4701 class IsCompileTimeConstantPass : public llvm::BasicBlockPass {
4702 public:
4703  static char ID;
4704  IsCompileTimeConstantPass(bool last = false) : BasicBlockPass(ID) {
4705  isLastTry = last;
4706  }
4707 
4708  const char *getPassName() const { return "Resolve \"is compile time constant\""; }
4709  bool runOnBasicBlock(llvm::BasicBlock &BB);
4710 
4711  bool isLastTry;
4712 };
4713 
4714 char IsCompileTimeConstantPass::ID = 0;
4715 
4716 
4717 bool
4718 IsCompileTimeConstantPass::runOnBasicBlock(llvm::BasicBlock &bb) {
4719  DEBUG_START_PASS("IsCompileTimeConstantPass");
4720 
4721  llvm::Function *funcs[] = {
4722  m->module->getFunction("__is_compile_time_constant_mask"),
4723  m->module->getFunction("__is_compile_time_constant_uniform_int32"),
4724  m->module->getFunction("__is_compile_time_constant_varying_int32")
4725  };
4726 
4727  bool modifiedAny = false;
4728  restart:
4729  for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
4730  // Iterate through the instructions looking for calls to the
4731  // __is_compile_time_constant_*() functions
4732  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
4733  if (callInst == NULL)
4734  continue;
4735 
4736  int j;
4737  int nFuncs = sizeof(funcs) / sizeof(funcs[0]);
4738  for (j = 0; j < nFuncs; ++j) {
4739  if (funcs[j] != NULL && callInst->getCalledFunction() == funcs[j])
4740  break;
4741  }
4742  if (j == nFuncs)
4743  // not a __is_compile_time_constant_* function
4744  continue;
4745 
4746  // This optimization pass can be disabled with both the (poorly
4747  // named) disableGatherScatterFlattening option and
4748  // disableMaskAllOnOptimizations.
4749  if (g->opt.disableGatherScatterFlattening ||
4750  g->opt.disableMaskAllOnOptimizations) {
4751  llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMFalse);
4752  modifiedAny = true;
4753  goto restart;
4754  }
4755 
4756  // Is it a constant? Bingo, turn the call's value into a constant
4757  // true value.
4758  llvm::Value *operand = callInst->getArgOperand(0);
4759  if (llvm::isa<llvm::Constant>(operand)) {
4760  llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMTrue);
4761  modifiedAny = true;
4762  goto restart;
4763  }
4764 
4765  // This pass runs multiple times during optimization. Up until the
4766  // very last time, it only replaces the call with a 'true' if the
4767  // value is known to be constant and otherwise leaves the call
4768  // alone, in case further optimization passes can help resolve its
4769  // value. The last time through, it eventually has to give up, and
4770  // replaces any remaining ones with 'false' constants.
4771  if (isLastTry) {
4772  llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMFalse);
4773  modifiedAny = true;
4774  goto restart;
4775  }
4776  }
4777 
4778  DEBUG_END_PASS("IsCompileTimeConstantPass");
4779 
4780  return modifiedAny;
4781 }
4782 
4783 
4784 static llvm::Pass *
4785 CreateIsCompileTimeConstantPass(bool isLastTry) {
4786  return new IsCompileTimeConstantPass(isLastTry);
4787 }
4788 
4789 //////////////////////////////////////////////////////////////////////////
4790 // DebugPass
4791 
4792 /** This pass is added to the list of passes after each optimization that
4793  we want to debug; it dumps the LLVM IR to stderr and also prints the
4794  name and number of the preceding optimization.
4795  */
4796 class DebugPass : public llvm::ModulePass {
4797 public:
4798  static char ID;
4799  DebugPass(char * output) : ModulePass(ID) {
4800  sprintf(str_output, "%s", output);
4801  }
4802 
4803  const char *getPassName() const { return "Dump LLVM IR"; }
4804  bool runOnModule(llvm::Module &m);
4805 
4806 private:
4807  char str_output[100];
4808 };
4809 
4810 char DebugPass::ID = 0;
4811 
4812 bool
4813 DebugPass::runOnModule(llvm::Module &module) {
4814  fprintf(stderr, "%s", str_output);
4815  fflush(stderr);
4816  module.dump();
4817  return true;
4818 }
4819 
4820 static llvm::Pass *
4821 CreateDebugPass(char * output) {
4822  return new DebugPass(output);
4823 }
4824 
4825 ///////////////////////////////////////////////////////////////////////////
4826 // MakeInternalFuncsStaticPass
4827 
4828 /** There are a number of target-specific functions that we use during
4829  these optimization passes. By the time we are done with optimization,
4830  any uses of these should be inlined and no calls to these functions
4831  should remain. This pass marks all of these functions as having
4832  private linkage so that subsequent passes can eliminate them as dead
4833  code, thus cleaning up the final code output by the compiler. We can't
4834  just declare these as static from the start, however, since then they
4835  end up being eliminated as dead code during early optimization passes
4836  even though we may need to generate calls to them during later
4837  optimization passes.
4838  */
4839 class MakeInternalFuncsStaticPass : public llvm::ModulePass {
4840 public:
4841  static char ID;
4842  MakeInternalFuncsStaticPass(bool last = false) : ModulePass(ID) {
4843  }
4844 
4845  void getAnalysisUsage(llvm::AnalysisUsage &AU) const {
4846  AU.setPreservesCFG();
4847  }
4848 
4849  const char *getPassName() const { return "Make internal funcs \"static\""; }
4850  bool runOnModule(llvm::Module &m);
4851 };
4852 
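// Editor's note: the observable effect of this pass on the IR is just a
// linkage change; for example a builtin that is still referenced after
// inlining, such as
//
//   define <W x i32> @__gather32_i32(...) { ... }
//
// becomes
//
//   define internal <W x i32> @__gather32_i32(...) { ... }
//
// so later dead-code-elimination passes can drop it once no calls remain.
// (The signature shown is illustrative, not the exact declaration.)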
4853 char MakeInternalFuncsStaticPass::ID = 0;
4854 
4855 bool
4856 MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
4857  const char *names[] = {
4858  "__avg_up_uint8",
4859  "__avg_up_int8",
4860  "__avg_up_uint16",
4861  "__avg_up_int16",
4862  "__avg_down_uint8",
4863  "__avg_down_int8",
4864  "__avg_down_uint16",
4865  "__avg_down_int16",
4866  "__fast_masked_vload",
4867  "__gather_factored_base_offsets32_i8", "__gather_factored_base_offsets32_i16",
4868  "__gather_factored_base_offsets32_i32", "__gather_factored_base_offsets32_i64",
4869  "__gather_factored_base_offsets32_float", "__gather_factored_base_offsets32_double",
4870  "__gather_factored_base_offsets64_i8", "__gather_factored_base_offsets64_i16",
4871  "__gather_factored_base_offsets64_i32", "__gather_factored_base_offsets64_i64",
4872  "__gather_factored_base_offsets64_float", "__gather_factored_base_offsets64_double",
4873  "__gather_base_offsets32_i8", "__gather_base_offsets32_i16",
4874  "__gather_base_offsets32_i32", "__gather_base_offsets32_i64",
4875  "__gather_base_offsets32_float", "__gather_base_offsets32_double",
4876  "__gather_base_offsets64_i8", "__gather_base_offsets64_i16",
4877  "__gather_base_offsets64_i32", "__gather_base_offsets64_i64",
4878  "__gather_base_offsets64_float", "__gather_base_offsets64_double",
4879  "__gather32_i8", "__gather32_i16",
4880  "__gather32_i32", "__gather32_i64",
4881  "__gather32_float", "__gather32_double",
4882  "__gather64_i8", "__gather64_i16",
4883  "__gather64_i32", "__gather64_i64",
4884  "__gather64_float", "__gather64_double",
4885  "__gather_elt32_i8", "__gather_elt32_i16",
4886  "__gather_elt32_i32", "__gather_elt32_i64",
4887  "__gather_elt32_float", "__gather_elt32_double",
4888  "__gather_elt64_i8", "__gather_elt64_i16",
4889  "__gather_elt64_i32", "__gather_elt64_i64",
4890  "__gather_elt64_float", "__gather_elt64_double",
4891  "__masked_load_i8", "__masked_load_i16",
4892  "__masked_load_i32", "__masked_load_i64",
4893  "__masked_load_float", "__masked_load_double",
4894  "__masked_store_i8", "__masked_store_i16",
4895  "__masked_store_i32", "__masked_store_i64",
4896  "__masked_store_float", "__masked_store_double",
4897  "__masked_store_blend_i8", "__masked_store_blend_i16",
4898  "__masked_store_blend_i32", "__masked_store_blend_i64",
4899  "__masked_store_blend_float", "__masked_store_blend_double",
4900  "__scatter_factored_base_offsets32_i8", "__scatter_factored_base_offsets32_i16",
4901  "__scatter_factored_base_offsets32_i32", "__scatter_factored_base_offsets32_i64",
4902  "__scatter_factored_base_offsets32_float", "__scatter_factored_base_offsets32_double",
4903  "__scatter_factored_base_offsets64_i8", "__scatter_factored_base_offsets64_i16",
4904  "__scatter_factored_base_offsets64_i32", "__scatter_factored_base_offsets64_i64",
4905  "__scatter_factored_base_offsets64_float", "__scatter_factored_base_offsets64_double",
4906  "__scatter_base_offsets32_i8", "__scatter_base_offsets32_i16",
4907  "__scatter_base_offsets32_i32", "__scatter_base_offsets32_i64",
4908  "__scatter_base_offsets32_float", "__scatter_base_offsets32_double",
4909  "__scatter_base_offsets64_i8", "__scatter_base_offsets64_i16",
4910  "__scatter_base_offsets64_i32", "__scatter_base_offsets64_i64",
4911  "__scatter_base_offsets64_float", "__scatter_base_offsets64_double",
4912  "__scatter_elt32_i8", "__scatter_elt32_i16",
4913  "__scatter_elt32_i32", "__scatter_elt32_i64",
4914  "__scatter_elt32_float", "__scatter_elt32_double",
4915  "__scatter_elt64_i8", "__scatter_elt64_i16",
4916  "__scatter_elt64_i32", "__scatter_elt64_i64",
4917  "__scatter_elt64_float", "__scatter_elt64_double",
4918  "__scatter32_i8", "__scatter32_i16",
4919  "__scatter32_i32", "__scatter32_i64",
4920  "__scatter32_float", "__scatter32_double",
4921  "__scatter64_i8", "__scatter64_i16",
4922  "__scatter64_i32", "__scatter64_i64",
4923  "__scatter64_float", "__scatter64_double",
4924  "__prefetch_read_varying_1", "__prefetch_read_varying_2",
4925  "__prefetch_read_varying_3", "__prefetch_read_varying_nt",
4926  "__keep_funcs_live",
4927  };
4928 
4929  bool modifiedAny = false;
4930  int count = sizeof(names) / sizeof(names[0]);
4931  for (int i = 0; i < count; ++i) {
4932  llvm::Function *f = m->module->getFunction(names[i]);
4933  if (f != NULL && f->empty() == false) {
4934  f->setLinkage(llvm::GlobalValue::InternalLinkage);
4935  modifiedAny = true;
4936  }
4937  }
4938 
4939  return modifiedAny;
4940 }
4941 
4942 
4943 static llvm::Pass *
4944 CreateMakeInternalFuncsStaticPass() {
4945  return new MakeInternalFuncsStaticPass;
4946 }
4947 
4948 
4949 ///////////////////////////////////////////////////////////////////////////
4950 // PeepholePass
4951 
4952 class PeepholePass : public llvm::BasicBlockPass {
4953 public:
4954  PeepholePass();
4955 
4956  const char *getPassName() const { return "Peephole Optimizations"; }
4957  bool runOnBasicBlock(llvm::BasicBlock &BB);
4958 
4959  static char ID;
4960 };
4961 
4962 char PeepholePass::ID = 0;
4963 
4963 
4964 PeepholePass::PeepholePass()
4965  : BasicBlockPass(ID) {
4966 }
4967 
4968 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_3
4969 
4970 using namespace llvm::PatternMatch;
4971 
4972 template<typename Op_t, unsigned Opcode>
4973 struct CastClassTypes_match {
4974  Op_t Op;
4975  const llvm::Type *fromType, *toType;
4976 
4977  CastClassTypes_match(const Op_t &OpMatch, const llvm::Type *f,
4978  const llvm::Type *t)
4979  : Op(OpMatch), fromType(f), toType(t) {}
4980 
4981  template<typename OpTy>
4982  bool match(OpTy *V) {
4983  if (llvm::Operator *O = llvm::dyn_cast<llvm::Operator>(V))
4984  return (O->getOpcode() == Opcode && Op.match(O->getOperand(0)) &&
4985  O->getType() == toType &&
4986  O->getOperand(0)->getType() == fromType);
4987  return false;
4988  }
4989 };
4990 
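// Editor's note: CastClassTypes_match is used via the small m_* helpers
// below; for instance m_SExt8To16(m_Value(v)) matches an instruction of the
// form "sext <W x i8> %x to <W x i16>", binding %x to v, in the style of
// LLVM's own PatternMatch combinators.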
4991 template<typename OpTy>
4992 inline CastClassTypes_match<OpTy, llvm::Instruction::SExt>
4993 m_SExt8To16(const OpTy &Op) {
4994  return CastClassTypes_match<OpTy, llvm::Instruction::SExt>(
4995  Op,
4996  LLVMTypes::Int8VectorType,
4997  LLVMTypes::Int16VectorType);
4998 }
4999 
5000 template<typename OpTy>
5001 inline CastClassTypes_match<OpTy, llvm::Instruction::ZExt>
5002 m_ZExt8To16(const OpTy &Op) {
5003  return CastClassTypes_match<OpTy, llvm::Instruction::ZExt>(
5004  Op,
5005  LLVMTypes::Int8VectorType,
5006  LLVMTypes::Int16VectorType);
5007 }
5008 
5009 
5010 template<typename OpTy>
5011 inline CastClassTypes_match<OpTy, llvm::Instruction::Trunc>
5012 m_Trunc16To8(const OpTy &Op) {
5013  return CastClassTypes_match<OpTy, llvm::Instruction::Trunc>(
5014  Op,
5015  LLVMTypes::Int16VectorType,
5016  LLVMTypes::Int8VectorType);
5017 }
5018 
5019 template<typename OpTy>
5020 inline CastClassTypes_match<OpTy, llvm::Instruction::SExt>
5021 m_SExt16To32(const OpTy &Op) {
5022  return CastClassTypes_match<OpTy, llvm::Instruction::SExt>(
5023  Op,
5024  LLVMTypes::Int16VectorType,
5025  LLVMTypes::Int32VectorType);
5026 }
5027 
5028 template<typename OpTy>
5029 inline CastClassTypes_match<OpTy, llvm::Instruction::ZExt>
5030 m_ZExt16To32(const OpTy &Op) {
5031  return CastClassTypes_match<OpTy, llvm::Instruction::ZExt>(
5032  Op,
5033  LLVMTypes::Int16VectorType,
5034  LLVMTypes::Int32VectorType);
5035 }
5036 
5037 
5038 template<typename OpTy>
5039 inline CastClassTypes_match<OpTy, llvm::Instruction::Trunc>
5040 m_Trunc32To16(const OpTy &Op) {
5041  return CastClassTypes_match<OpTy, llvm::Instruction::Trunc>(
5042  Op,
5043  LLVMTypes::Int32VectorType,
5044  LLVMTypes::Int16VectorType);
5045 }
5046 
5047 template<typename Op_t>
5048 struct UDiv2_match {
5049  Op_t Op;
5050 
5051  UDiv2_match(const Op_t &OpMatch)
5052  : Op(OpMatch) {}
5053 
5054  template<typename OpTy>
5055  bool match(OpTy *V) {
5056  llvm::BinaryOperator *bop;
5057  llvm::ConstantDataVector *cdv;
5058  if ((bop = llvm::dyn_cast<llvm::BinaryOperator>(V)) &&
5059  (cdv = llvm::dyn_cast<llvm::ConstantDataVector>(bop->getOperand(1))) &&
5060  cdv->getSplatValue() != NULL) {
5061  const llvm::APInt &apInt = cdv->getUniqueInteger();
5062 
5063  switch (bop->getOpcode()) {
5064  case llvm::Instruction::UDiv:
5065  // divide by 2
5066  return (apInt.isIntN(2) && Op.match(bop->getOperand(0)));
5067  case llvm::Instruction::LShr:
5068  // logical shift right by 1 (i.e. unsigned divide by 2)
5069  return (apInt.isIntN(1) && Op.match(bop->getOperand(0)));
5070  default:
5071  return false;
5072  }
5073  }
5074  return false;
5075  }
5076 };
5077 
5078 template<typename V>
5079 inline UDiv2_match<V>
5080 m_UDiv2(const V &v) {
5081  return UDiv2_match<V>(v);
5082 }
5083 
5084 template<typename Op_t>
5085 struct SDiv2_match {
5086  Op_t Op;
5087 
5088  SDiv2_match(const Op_t &OpMatch)
5089  : Op(OpMatch) {}
5090 
5091  template<typename OpTy>
5092  bool match(OpTy *V) {
5093  llvm::BinaryOperator *bop;
5094  llvm::ConstantDataVector *cdv;
5095  if ((bop = llvm::dyn_cast<llvm::BinaryOperator>(V)) &&
5096  (cdv = llvm::dyn_cast<llvm::ConstantDataVector>(bop->getOperand(1))) &&
5097  cdv->getSplatValue() != NULL) {
5098  const llvm::APInt &apInt = cdv->getUniqueInteger();
5099 
5100  switch (bop->getOpcode()) {
5101  case llvm::Instruction::SDiv:
5102  // divide by 2
5103  return (apInt.isIntN(2) && Op.match(bop->getOperand(0)));
5104  case llvm::Instruction::AShr:
5105  // arithmetic shift right by 1 (i.e. signed divide by 2)
5106  return (apInt.isIntN(1) && Op.match(bop->getOperand(0)));
5107  default:
5108  return false;
5109  }
5110  }
5111  return false;
5112  }
5113 };
5114 
5115 template<typename V>
5116 inline SDiv2_match<V>
5117 m_SDiv2(const V &v) {
5118  return SDiv2_match<V>(v);
5119 }
5120 
5121 // Returns true if the given function has a call to an intrinsic function
5122 // in its definition.
5123 static bool
5124 lHasIntrinsicInDefinition(llvm::Function *func) {
5125  llvm::Function::iterator bbiter = func->begin();
5126  for (; bbiter != func->end(); ++bbiter) {
5127  for (llvm::BasicBlock::iterator institer = bbiter->begin();
5128  institer != bbiter->end(); ++institer) {
5129  if (llvm::isa<llvm::IntrinsicInst>(institer))
5130  return true;
5131  }
5132  }
5133  return false;
5134 }
5135 
5136 static llvm::Instruction *
5137 lGetBinaryIntrinsic(const char *name, llvm::Value *opa, llvm::Value *opb) {
5138  llvm::Function *func = m->module->getFunction(name);
5139  Assert(func != NULL);
5140 
5141  // Make sure that the definition of the llvm::Function has a call to an
5142  // intrinsic function in its instructions; otherwise we will generate
5143  // infinite loops where we "helpfully" turn the default implementations
5144  // of target builtins like __avg_up_uint8 that are implemented with plain
5145  // arithmetic ops into recursive calls to themselves.
5146  if (lHasIntrinsicInDefinition(func))
5147  return lCallInst(func, opa, opb, name);
5148  else
5149  return NULL;
5150 }
5151 
5152 //////////////////////////////////////////////////
5153 
5154 static llvm::Instruction *
5155 lMatchAvgUpUInt8(llvm::Value *inst) {
5156  // (unsigned int8)(((unsigned int16)a + (unsigned int16)b + 1)/2)
5157  llvm::Value *opa, *opb;
5158  const llvm::APInt *delta;
5159  if (match(inst, m_Trunc16To8(m_UDiv2(m_CombineOr(
5160  m_CombineOr(
5161  m_Add(m_ZExt8To16(m_Value(opa)),
5162  m_Add(m_ZExt8To16(m_Value(opb)), m_APInt(delta))),
5163  m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_APInt(delta)),
5164  m_ZExt8To16(m_Value(opb)))),
5165  m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_ZExt8To16(m_Value(opb))),
5166  m_APInt(delta))))))) {
5167  if (delta->isIntN(1) == false)
5168  return NULL;
5169 
5170  return lGetBinaryIntrinsic("__avg_up_uint8", opa, opb);
5171  }
5172  return NULL;
5173 }
5174 
5175 
5176 static llvm::Instruction *
5177 lMatchAvgDownUInt8(llvm::Value *inst) {
5178  // (unsigned int8)(((unsigned int16)a + (unsigned int16)b)/2)
5179  llvm::Value *opa, *opb;
5180  if (match(inst, m_Trunc16To8(m_UDiv2(
5181  m_Add(m_ZExt8To16(m_Value(opa)),
5182  m_ZExt8To16(m_Value(opb))))))) {
5183  return lGetBinaryIntrinsic("__avg_down_uint8", opa, opb);
5184  }
5185  return NULL;
5186 }
5187 
5188 static llvm::Instruction *
5189 lMatchAvgUpUInt16(llvm::Value *inst) {
5190  // (unsigned int16)(((unsigned int32)a + (unsigned int32)b + 1)/2)
5191  llvm::Value *opa, *opb;
5192  const llvm::APInt *delta;
5193  if (match(inst, m_Trunc32To16(m_UDiv2(m_CombineOr(
5194  m_CombineOr(
5195  m_Add(m_ZExt16To32(m_Value(opa)),
5196  m_Add(m_ZExt16To32(m_Value(opb)), m_APInt(delta))),
5197  m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_APInt(delta)),
5198  m_ZExt16To32(m_Value(opb)))),
5199  m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_ZExt16To32(m_Value(opb))),
5200  m_APInt(delta))))))) {
5201  if (delta->isIntN(1) == false)
5202  return NULL;
5203 
5204  return lGetBinaryIntrinsic("__avg_up_uint16", opa, opb);
5205  }
5206  return NULL;
5207 }
5208 
5209 
5210 static llvm::Instruction *
5211 lMatchAvgDownUInt16(llvm::Value *inst) {
5212  // (unsigned int16)(((unsigned int32)a + (unsigned int32)b)/2)
5213  llvm::Value *opa, *opb;
5214  if (match(inst, m_Trunc32To16(m_UDiv2(
5215  m_Add(m_ZExt16To32(m_Value(opa)),
5216  m_ZExt16To32(m_Value(opb))))))) {
5217  return lGetBinaryIntrinsic("__avg_down_uint16", opa, opb);
5218  }
5219  return NULL;
5220 }
5221 
5222 
5223 static llvm::Instruction *
5224 lMatchAvgUpInt8(llvm::Value *inst) {
5225  // (int8)(((int16)a + (int16)b + 1)/2)
5226  llvm::Value *opa, *opb;
5227  const llvm::APInt *delta;
5228  if (match(inst, m_Trunc16To8(m_SDiv2(m_CombineOr(
5229  m_CombineOr(
5230  m_Add(m_SExt8To16(m_Value(opa)),
5231  m_Add(m_SExt8To16(m_Value(opb)), m_APInt(delta))),
5232  m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_APInt(delta)),
5233  m_SExt8To16(m_Value(opb)))),
5234  m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_SExt8To16(m_Value(opb))),
5235  m_APInt(delta))))))) {
5236  if (delta->isIntN(1) == false)
5237  return NULL;
5238 
5239  return lGetBinaryIntrinsic("__avg_up_int8", opa, opb);
5240  }
5241  return NULL;
5242 }
5243 
5244 
5245 static llvm::Instruction *
5246 lMatchAvgDownInt8(llvm::Value *inst) {
5247  // (int8)(((int16)a + (int16)b)/2)
5248  llvm::Value *opa, *opb;
5249  if (match(inst, m_Trunc16To8(m_SDiv2(
5250  m_Add(m_SExt8To16(m_Value(opa)),
5251  m_SExt8To16(m_Value(opb))))))) {
5252  return lGetBinaryIntrinsic("__avg_down_int8", opa, opb);
5253  }
5254  return NULL;
5255 }
5256 
5257 static llvm::Instruction *
5258 lMatchAvgUpInt16(llvm::Value *inst) {
5259  // (int16)(((int32)a + (int32)b + 1)/2)
5260  llvm::Value *opa, *opb;
5261  const llvm::APInt *delta;
5262  if (match(inst, m_Trunc32To16(m_SDiv2(m_CombineOr(
5263  m_CombineOr(
5264  m_Add(m_SExt16To32(m_Value(opa)),
5265  m_Add(m_SExt16To32(m_Value(opb)), m_APInt(delta))),
5266  m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_APInt(delta)),
5267  m_SExt16To32(m_Value(opb)))),
5268  m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_SExt16To32(m_Value(opb))),
5269  m_APInt(delta))))))) {
5270  if (delta->isIntN(1) == false)
5271  return NULL;
5272 
5273  return lGetBinaryIntrinsic("__avg_up_int16", opa, opb);
5274  }
5275  return NULL;
5276 }
5277 
5278 static llvm::Instruction *
5279 lMatchAvgDownInt16(llvm::Value *inst) {
5280  // (int16)(((int32)a + (int32)b)/2)
5281  llvm::Value *opa, *opb;
5282  if (match(inst, m_Trunc32To16(m_SDiv2(
5283  m_Add(m_SExt16To32(m_Value(opa)),
5284  m_SExt16To32(m_Value(opb))))))) {
5285  return lGetBinaryIntrinsic("__avg_down_int16", opa, opb);
5286  }
5287  return NULL;
5288 }
5289 #endif // !LLVM_3_2
5290 
5291 
5292 bool
5293 PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) {
5294  DEBUG_START_PASS("PeepholePass");
5295 
5296  bool modifiedAny = false;
5297  restart:
5298  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
5299  llvm::Instruction *inst = &*iter;
5300 
5301  llvm::Instruction *builtinCall = NULL;
5302 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_3
5303  if (!builtinCall)
5304  builtinCall = lMatchAvgUpUInt8(inst);
5305  if (!builtinCall)
5306  builtinCall = lMatchAvgUpUInt16(inst);
5307  if (!builtinCall)
5308  builtinCall = lMatchAvgDownUInt8(inst);
5309  if (!builtinCall)
5310  builtinCall = lMatchAvgDownUInt16(inst);
5311  if (!builtinCall)
5312  builtinCall = lMatchAvgUpInt8(inst);
5313  if (!builtinCall)
5314  builtinCall = lMatchAvgUpInt16(inst);
5315  if (!builtinCall)
5316  builtinCall = lMatchAvgDownInt8(inst);
5317  if (!builtinCall)
5318  builtinCall = lMatchAvgDownInt16(inst);
5319 #endif // !LLVM_3_2
5320  if (builtinCall != NULL) {
5321  llvm::ReplaceInstWithInst(inst, builtinCall);
5322  modifiedAny = true;
5323  goto restart;
5324  }
5325  }
5326 
5327  DEBUG_END_PASS("PeepholePass");
5328 
5329  return modifiedAny;
5330 }
5331 
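// Editor's note: as a usage example (hypothetical ispc source), the matchers
// above let a rounding average written as
//
//   uint8 avg = (uint8)(((uint16)a + (uint16)b + 1) / 2);
//
// be rewritten into a single call to __avg_up_uint8(a, b), which targets
// with a native byte-average instruction (e.g. SSE's pavgb) can implement
// directly.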
5332 static llvm::Pass *
5333 CreatePeepholePass() {
5334  return new PeepholePass;
5335 }
5336 
5337 /** Given an llvm::Value known to be an integer, return its value as
5338  an int64_t.
5339 */
5340 static int64_t
5341 lGetIntValue(llvm::Value *offset) {
5342  llvm::ConstantInt *intOffset = llvm::dyn_cast<llvm::ConstantInt>(offset);
5343  Assert(intOffset && (intOffset->getBitWidth() == 32 ||
5344  intOffset->getBitWidth() == 64));
5345  return intOffset->getSExtValue();
5346 }
5347 
5348 ///////////////////////////////////////////////////////////////////////////
5349 // ReplaceStdlibShiftPass
5350 
5351 class ReplaceStdlibShiftPass : public llvm::BasicBlockPass {
5352 public:
5353  static char ID;
5354  ReplaceStdlibShiftPass() : BasicBlockPass(ID) {
5355  }
5356 
5357  const char *getPassName() const { return "Resolve \"replace extract insert chains\""; }
5358  bool runOnBasicBlock(llvm::BasicBlock &BB);
5359 
5360 };
5361 
5362 char ReplaceStdlibShiftPass::ID = 0;
5363 
5364 bool
5365 ReplaceStdlibShiftPass::runOnBasicBlock(llvm::BasicBlock &bb) {
5366  DEBUG_START_PASS("ReplaceStdlibShiftPass");
5367  bool modifiedAny = false;
5368 
5369  llvm::Function *shifts[6];
5370  shifts[0] = m->module->getFunction("__shift_i8");
5371  shifts[1] = m->module->getFunction("__shift_i16");
5372  shifts[2] = m->module->getFunction("__shift_i32");
5373  shifts[3] = m->module->getFunction("__shift_i64");
5374  shifts[4] = m->module->getFunction("__shift_float");
5375  shifts[5] = m->module->getFunction("__shift_double");
5376 
5377  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
5378  llvm::Instruction *inst = &*iter;
5379 
5380  if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst)) {
5381  llvm::Function *func = ci->getCalledFunction();
5382  for (int i = 0; i < 6; i++) {
5383  if (shifts[i] && (shifts[i] == func)) {
5384  // we matched a call
5385  llvm::Value *shiftedVec = ci->getArgOperand(0);
5386  llvm::Value *shiftAmt = ci->getArgOperand(1);
5387  if (llvm::isa<llvm::Constant>(shiftAmt)) {
5388  int vectorWidth = g->target->getVectorWidth();
5389  int * shuffleVals = new int[vectorWidth];
5390  int shiftInt = lGetIntValue(shiftAmt);
5391  for (int i = 0; i < vectorWidth; i++) {
5392  int s = i + shiftInt;
5393  s = (s < 0) ? vectorWidth : s;
5394  s = (s >= vectorWidth) ? vectorWidth : s;
5395  shuffleVals[i] = s;
5396  }
5397  llvm::Value *shuffleIdxs = LLVMInt32Vector(shuffleVals);
5398  llvm::Value *zeroVec = llvm::ConstantAggregateZero::get(shiftedVec->getType());
5399  llvm::Value *shuffle = new llvm::ShuffleVectorInst(shiftedVec, zeroVec,
5400  shuffleIdxs, "vecShift", ci);
5401  ci->replaceAllUsesWith(shuffle);
5402  modifiedAny = true;
5403  delete [] shuffleVals;
5404  } else {
5405  PerformanceWarning(SourcePos(), "Stdlib shift() called without constant shift amount.");
5406  }
5407  }
5408  }
5409  }
5410  }
5411 
5412  DEBUG_END_PASS("ReplaceStdlibShiftPass");
5413 
5414  return modifiedAny;
5415 }
5416 
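// Editor's note: a worked example of the index computation above, assuming a
// hypothetical 4-wide target and a constant shift amount of +1: shuffleVals
// becomes {1, 2, 3, 4}. Indices 0..3 select lanes of the original vector and
// index 4 selects lane 0 of zeroVec, so the shuffle yields
// { v[1], v[2], v[3], 0 }, i.e. the vector shifted by one with zero fill.
// Negative shift amounts likewise map out-of-range lanes to vectorWidth and
// hence to zero.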
5417 
5418 static llvm::Pass *
5419 CreateReplaceStdlibShiftPass() {
5420  return new ReplaceStdlibShiftPass();
5421 }
5422 
5423 
5424 
5425 ///////////////////////////////////////////////////////////////////////////////
5426 // FixBooleanSelect
5427 //
5428 // The problem is that in LLVM 3.3 the optimizer doesn't like
5429 // the following instruction sequence:
5430 // %cmp = fcmp olt <8 x float> %a, %b
5431 // %sext_cmp = sext <8 x i1> %cmp to <8 x i32>
5432 // %new_mask = and <8 x i32> %sext_cmp, %mask
5433 // and optimizes it to the following:
5434 // %cmp = fcmp olt <8 x float> %a, %b
5435 // %cond = select <8 x i1> %cmp, <8 x i32> %mask, <8 x i32> zeroinitializer
5436 //
5437 // It wouldn't be a problem if codegen produced good code for it. But it
5438 // doesn't, especially for vectors wider than the target's native vector width.
5439 //
5440 // This optimization reverts this pattern and should be the last one before
5441 // code gen.
5442 //
5443 // Note that this problem was introduced in LLVM 3.3. But in LLVM 3.4 it was
5444 // fixed. See commit r194542.
5445 //
5446 // After LLVM 3.3 this optimization should probably stay for experimental
5447 // purposes, and generated code should be compared with and without it from
5448 // time to time to make sure that LLVM does the right thing.
5449 ///////////////////////////////////////////////////////////////////////////////
5450 
5451 class FixBooleanSelectPass : public llvm::FunctionPass {
5452 public:
5453  static char ID;
5454  FixBooleanSelectPass() :FunctionPass(ID) {}
5455 
5456  const char *getPassName() const { return "Resolve \"replace extract insert chains\""; }
5457  bool runOnFunction(llvm::Function &F);
5458 
5459 private:
5460  llvm::Instruction* fixSelect(llvm::SelectInst* sel, llvm::SExtInst* sext);
5461 };
5462 
5463 char FixBooleanSelectPass::ID = 0;
5464 
5465 llvm::Instruction* FixBooleanSelectPass::fixSelect(llvm::SelectInst* sel, llvm::SExtInst* sext) {
5466  // Select instruction result type and its integer equivalent
5467  llvm::VectorType *orig_type = llvm::dyn_cast<llvm::VectorType>(sel->getType());
5468  llvm::VectorType *int_type = llvm::VectorType::getInteger(orig_type);
5469 
5470  // Result value and optional pointer to instruction to delete
5471  llvm::Instruction *result = 0, *optional_to_delete = 0;
5472 
5473  // It can be a vector of integers or a vector of floating point values.
5474  if (orig_type->getElementType()->isIntegerTy()) {
5475  // Generate sext+and, remove select.
5476  result = llvm::BinaryOperator::CreateAnd(sext, sel->getTrueValue(), "and_mask", sel);
5477  } else {
5478  llvm::BitCastInst* bc = llvm::dyn_cast<llvm::BitCastInst>(sel->getTrueValue());
5479 
5480  if (bc && bc->hasOneUse() && bc->getSrcTy()->isIntOrIntVectorTy() && bc->getSrcTy()->isVectorTy() &&
5481  llvm::isa<llvm::Instruction>(bc->getOperand(0)) &&
5482  llvm::dyn_cast<llvm::Instruction>(bc->getOperand(0))->getParent() == sel->getParent()) {
5483  // The bitcast is casting from an integer type and its operand is an instruction located in the same basic block (otherwise it's unsafe to reuse it).
5484  // bitcast+select => sext+and+bitcast
5485  // Create and
5486  llvm::BinaryOperator* and_inst = llvm::BinaryOperator::CreateAnd(sext, bc->getOperand(0), "and_mask", sel);
5487  // Bitcast back to original type
5488  result = new llvm::BitCastInst(and_inst, sel->getType(), "bitcast_mask_out", sel);
5489  // Original bitcast will be removed
5490  optional_to_delete = bc;
5491  } else {
5492  // General case: select => bitcast+sext+and+bitcast
5493  // Bitcast
5494  llvm::BitCastInst* bc_in = new llvm::BitCastInst(sel->getTrueValue(), int_type, "bitcast_mask_in", sel);
5495  // And
5496  llvm::BinaryOperator* and_inst = llvm::BinaryOperator::CreateAnd(sext, bc_in, "and_mask", sel);
5497  // Bitcast back to original type
5498  result = new llvm::BitCastInst(and_inst, sel->getType(), "bitcast_mask_out", sel);
5499  }
5500  }
5501 
5502  // Done, finalize.
5503  sel->replaceAllUsesWith(result);
5504  sel->eraseFromParent();
5505  if (optional_to_delete) {
5506  optional_to_delete->eraseFromParent();
5507  }
5508 
5509  return result;
5510 }
5511 
5512 bool
5513 FixBooleanSelectPass::runOnFunction(llvm::Function &F) {
5514  bool modifiedAny = false;
5515 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_3 // LLVM 3.3 only
5516 
5517  // Don't optimize generic targets.
5518  if (g->target->getISA() == Target::GENERIC) {
5519  return false;
5520  }
5521 
5522  for (llvm::Function::iterator I = F.begin(), E = F.end();
5523  I != E; ++I) {
5524  llvm::BasicBlock* bb = &*I;
5525  for (llvm::BasicBlock::iterator iter = bb->begin(), e = bb->end(); iter != e; ++iter) {
5526  llvm::Instruction *inst = &*iter;
5527 
5528  llvm::CmpInst *cmp = llvm::dyn_cast<llvm::CmpInst>(inst);
5529 
5530  if (cmp &&
5531  cmp->getType()->isVectorTy() &&
5532  cmp->getType()->getVectorElementType()->isIntegerTy(1)) {
5533 
5534  // Search for select instruction uses.
5535  int selects = 0;
5536  llvm::VectorType* sext_type = 0;
5537  for (llvm::Instruction::use_iterator it=cmp->use_begin(); it!=cmp->use_end(); ++it ) {
5538  llvm::SelectInst* sel = llvm::dyn_cast<llvm::SelectInst>(*it);
5539  if (sel &&
5540  sel->getType()->isVectorTy() &&
5541  sel->getType()->getScalarSizeInBits() > 1) {
5542  selects++;
5543  // We pick the first one; in the typical case all of the select types are the same.
5544  sext_type = llvm::dyn_cast<llvm::VectorType>(sel->getType());
5545  break;
5546  }
5547  }
5548  if (selects == 0) {
5549  continue;
5550  }
5551  // Get an integer equivalent, if it's not yet an integer.
5552  sext_type = llvm::VectorType::getInteger(sext_type);
5553 
5554  // Do transformation
5555  llvm::BasicBlock::iterator iter_copy=iter;
5556  llvm::Instruction* next_inst = &*(++iter_copy);
5557  // Create or reuse sext
5558  llvm::SExtInst* sext = llvm::dyn_cast<llvm::SExtInst>(next_inst);
5559  if (sext &&
5560  sext->getOperand(0) == cmp &&
5561  sext->getDestTy() == sext_type) {
5562  // This sext can be reused
5563  } else {
5564  if (next_inst) {
5565  sext = new llvm::SExtInst(cmp, sext_type, "sext_cmp", next_inst);
5566  } else {
5567  sext = new llvm::SExtInst(cmp, sext_type, "sext_cmp", bb);
5568  }
5569  }
5570 
5571  // Walk and fix selects
5572  std::vector<llvm::SelectInst*> sel_uses;
5573  for (llvm::Instruction::use_iterator it=cmp->use_begin(); it!=cmp->use_end(); ++it) {
5574  llvm::SelectInst* sel = llvm::dyn_cast<llvm::SelectInst>(*it);
5575  if (sel &&
5576  sel->getType()->getScalarSizeInBits() == sext_type->getScalarSizeInBits()) {
5577 
5578  // Check that second operand is zero.
5579  llvm::Constant* false_cond = llvm::dyn_cast<llvm::Constant>(sel->getFalseValue());
5580  if (false_cond &&
5581  false_cond->isZeroValue()) {
5582  sel_uses.push_back(sel);
5583  modifiedAny = true;
5584  }
5585  }
5586  }
5587 
5588  for (int i=0; i<sel_uses.size(); i++) {
5589  fixSelect(sel_uses[i], sext);
5590  }
5591  }
5592  }
5593  }
5594 
5595 #endif // LLVM 3.3
5596 
5597  return modifiedAny;
5598 }
5599 
5600 
5601 static llvm::Pass *
5602 CreateFixBooleanSelectPass() {
5603  return new FixBooleanSelectPass();
5604 }
5605 
5606 #ifdef ISPC_NVPTX_ENABLED
5607 ///////////////////////////////////////////////////////////////////////////////
5608 // Detect addrspace(3)
5609 ///////////////////////////////////////////////////////////////////////////////
5610 
5611 class PromoteLocalToPrivatePass: public llvm::BasicBlockPass
5612 {
5613  public:
5614  static char ID; // Pass identification, replacement for typeid
5615  PromoteLocalToPrivatePass() : BasicBlockPass(ID) {}
5616 
5617  bool runOnBasicBlock(llvm::BasicBlock &BB);
5618 };
5619 
5620 char PromoteLocalToPrivatePass::ID = 0;
5621 
5622 bool
5623 PromoteLocalToPrivatePass::runOnBasicBlock(llvm::BasicBlock &BB)
5624 {
5625  std::vector<llvm::AllocaInst*> Allocas;
5626 
5627  bool modifiedAny = false;
5628 
5629 #if 1
5630 restart:
5631  for (llvm::BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
5632  {
5633  llvm::Instruction *inst = &*I;
5634  if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst))
5635  {
5636  llvm::Function *func = ci->getCalledFunction();
5637  if (func && func->getName() == "llvm.trap")
5638  {
5639  std::vector<llvm::Type*> funcTyArgs;
5640  llvm::FunctionType *funcTy = llvm::FunctionType::get(
5641  /*Result=*/llvm::Type::getVoidTy(*g->ctx),
5642  /*Params=*/funcTyArgs,
5643  /*isVarArg=*/false);
5644  llvm::InlineAsm *trap_ptx = llvm::InlineAsm::get(funcTy, "trap;", "", false);
5645  assert(trap_ptx != NULL);
5646  llvm::Instruction *trap_call = llvm::CallInst::Create(trap_ptx);
5647  assert(trap_call != NULL);
5648  llvm::ReplaceInstWithInst(ci, trap_call);
5649  modifiedAny = true;
5650  goto restart;
5651  }
5652  }
5653  }
5654 #endif
5655 
5656 #if 0
5657  llvm::Function *cvtFunc = m->module->getFunction("__cvt_loc2gen_var");
5658 
5659  // Find allocas that are safe to promote, by looking at all instructions in
5660  // the entry node
5661  for (llvm::BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
5662  {
5663  llvm::Instruction *inst = &*I;
5664  if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst))
5665  {
5666  llvm::Function *func = ci->getCalledFunction();
5667  if (cvtFunc && (cvtFunc == func))
5668  {
5669 #if 0
5670  fprintf(stderr , "--found cvt-- name= %s \n",
5671  I->getName().str().c_str());
5672 #endif
5673  llvm::AllocaInst *alloca = new llvm::AllocaInst(LLVMTypes::Int64Type, "opt_loc2var", ci);
5674  assert(alloca != NULL);
5675 #if 0
5676  const int align = 8; // g->target->getNativeVectorAlignment();
5677  alloca->setAlignment(align);
5678 #endif
5679  ci->replaceAllUsesWith(alloca);
5680  modifiedAny = true;
5681  }
5682  }
5683  }
5684 #endif
5685  return modifiedAny;
5686 }
5687 
5688 static llvm::Pass *
5689 CreatePromoteLocalToPrivatePass() {
5690  return new PromoteLocalToPrivatePass();
5691 }
5692 
5693 
5694 
5695 #endif /* ISPC_NVPTX_ENABLED */
5696 