Intel SPMD Program Compiler  1.9.2
opt.cpp
1 /*
2  Copyright (c) 2010-2016, Intel Corporation
3  All rights reserved.
4 
5  Redistribution and use in source and binary forms, with or without
6  modification, are permitted provided that the following conditions are
7  met:
8 
9  * Redistributions of source code must retain the above copyright
10  notice, this list of conditions and the following disclaimer.
11 
12  * Redistributions in binary form must reproduce the above copyright
13  notice, this list of conditions and the following disclaimer in the
14  documentation and/or other materials provided with the distribution.
15 
16  * Neither the name of Intel Corporation nor the names of its
17  contributors may be used to endorse or promote products derived from
18  this software without specific prior written permission.
19 
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
22  IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
25  OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33 
34 /** @file opt.cpp
35  @brief Implementations of various ispc optimization passes that operate
36  on the LLVM IR.
37 */
38 
39 #include "opt.h"
40 #include "ctx.h"
41 #include "sym.h"
42 #include "module.h"
43 #include "util.h"
44 #include "llvmutil.h"
45 
46 #include <stdio.h>
47 #include <map>
48 #include <set>
49 
50 #include <llvm/Pass.h>
51 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
52  #include <llvm/Module.h>
53  #include <llvm/Instructions.h>
54  #include <llvm/Intrinsics.h>
55  #include <llvm/Function.h>
56  #include <llvm/BasicBlock.h>
57  #include <llvm/Constants.h>
58 #ifdef ISPC_NVPTX_ENABLED
59  #include <llvm/InlineAsm.h>
60 #endif /* ISPC_NVPTX_ENABLED */
61 #else // LLVM 3.3+
62  #include <llvm/IR/Module.h>
63  #include <llvm/IR/Instructions.h>
64  #include <llvm/IR/Intrinsics.h>
65  #include <llvm/IR/Function.h>
66  #include <llvm/IR/BasicBlock.h>
67  #include <llvm/IR/Constants.h>
68 #ifdef ISPC_NVPTX_ENABLED
69  #include <llvm/IR/InlineAsm.h>
70 #endif /* ISPC_NVPTX_ENABLED */
71 #endif
72 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_4 // LLVM 3.4+
73  #include <llvm/Transforms/Instrumentation.h>
74 #endif
75 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
76  #include "llvm/PassManager.h"
77 #else // LLVM 3.7+
78  #include "llvm/IR/LegacyPassManager.h"
79 #endif
80 #include <llvm/PassRegistry.h>
81 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_5 // LLVM 3.5+
82  #include <llvm/IR/Verifier.h>
83  #include <llvm/IR/IRPrintingPasses.h>
84  #include <llvm/IR/PatternMatch.h>
85  #include <llvm/IR/DebugInfo.h>
86 #else // < 3.5
87  #include <llvm/Analysis/Verifier.h>
88  #include <llvm/Assembly/PrintModulePass.h>
89  #include <llvm/Support/PatternMatch.h>
90  #include <llvm/DebugInfo.h>
91 #endif
92 #include <llvm/Analysis/ConstantFolding.h>
93 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
94  #include <llvm/Target/TargetLibraryInfo.h>
95 #else // LLVM 3.7+
96  #include <llvm/Analysis/TargetLibraryInfo.h>
97 #endif
98 #include <llvm/ADT/Triple.h>
99 #include <llvm/ADT/SmallSet.h>
100 #include <llvm/Transforms/Scalar.h>
101 #include <llvm/Transforms/IPO.h>
102 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
103 #include <llvm/Target/TargetOptions.h>
104 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
105  #include <llvm/DataLayout.h>
106 #else // LLVM 3.3+
107  #include <llvm/IR/DataLayout.h>
108  #include <llvm/Analysis/TargetTransformInfo.h>
109 #endif
110 #include <llvm/Target/TargetMachine.h>
111 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_8 // LLVM 3.8+
112  #include <llvm/Analysis/BasicAliasAnalysis.h>
113  #include "llvm/Analysis/TypeBasedAliasAnalysis.h"
114 #endif
115 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_9 // LLVM 3.9+
116  #include "llvm/Transforms/IPO/FunctionAttrs.h"
117  #include "llvm/Transforms/Scalar/GVN.h"
118 #endif
119 #include <llvm/Analysis/Passes.h>
120 #include <llvm/Support/raw_ostream.h>
121 #if ISPC_LLVM_VERSION >= ISPC_LLVM_5_0 // LLVM 5.0+
122  #include <llvm/BinaryFormat/Dwarf.h>
123 #else // LLVM up to 4.x
124  #include <llvm/Support/Dwarf.h>
125 #endif
126 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_6
127  #include <llvm/IR/IntrinsicInst.h>
128 #endif
129 #ifdef ISPC_IS_LINUX
130  #include <alloca.h>
131 #elif defined(ISPC_IS_WINDOWS)
132  #include <malloc.h>
133  #ifndef __MINGW32__
134  #define alloca _alloca
135  #endif
136 #endif // ISPC_IS_WINDOWS
137 
138 #ifndef PRId64
139 #define PRId64 "lld"
140 #endif
141 #ifndef PRIu64
142 #define PRIu64 "llu"
143 #endif
144 
145 static llvm::Pass *CreateIntrinsicsOptPass();
146 static llvm::Pass *CreateInstructionSimplifyPass();
147 static llvm::Pass *CreatePeepholePass();
148 
149 static llvm::Pass *CreateImproveMemoryOpsPass();
150 static llvm::Pass *CreateGatherCoalescePass();
151 static llvm::Pass *CreateReplacePseudoMemoryOpsPass();
152 
153 static llvm::Pass *CreateIsCompileTimeConstantPass(bool isLastTry);
154 static llvm::Pass *CreateMakeInternalFuncsStaticPass();
155 
156 static llvm::Pass *CreateDebugPass(char * output);
157 
158 static llvm::Pass *CreateReplaceStdlibShiftPass();
159 
160 static llvm::Pass *CreateFixBooleanSelectPass();
161 #ifdef ISPC_NVPTX_ENABLED
162 static llvm::Pass *CreatePromoteLocalToPrivatePass();
163 #endif /* ISPC_NVPTX_ENABLED */
164 
165 #define DEBUG_START_PASS(NAME) \
166  if (g->debugPrint && \
167  (getenv("FUNC") == NULL || \
168  !strncmp(bb.getParent()->getName().str().c_str(), getenv("FUNC"), \
169  strlen(getenv("FUNC"))))) { \
170  fprintf(stderr, "Start of " NAME "\n"); \
171  fprintf(stderr, "---------------\n"); \
172  bb.dump(); \
173  fprintf(stderr, "---------------\n\n"); \
174  } else /* eat semicolon */
175 
176 #define DEBUG_END_PASS(NAME) \
177  if (g->debugPrint && \
178  (getenv("FUNC") == NULL || \
179  !strncmp(bb.getParent()->getName().str().c_str(), getenv("FUNC"), \
180  strlen(getenv("FUNC"))))) { \
181  fprintf(stderr, "End of " NAME " %s\n", modifiedAny ? "** CHANGES **" : ""); \
182  fprintf(stderr, "---------------\n"); \
183  bb.dump(); \
184  fprintf(stderr, "---------------\n\n"); \
185  } else /* eat semicolon */
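// Usage sketch for the two macros above (illustrative; "SomePass" is a
// hypothetical pass name). Both expect a basic block named 'bb' in scope, and
// DEBUG_END_PASS additionally expects a local flag named 'modifiedAny', as in
// the passes defined later in this file.
#if 0
bool SomePass::runOnBasicBlock(llvm::BasicBlock &bb) {
    DEBUG_START_PASS("SomePass");
    bool modifiedAny = false;
    // ... rewrite instructions in bb, setting modifiedAny on any change ...
    DEBUG_END_PASS("SomePass");
    return modifiedAny;
}
#endif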
186 
187 
188 
189 ///////////////////////////////////////////////////////////////////////////
190 
191 
192 /** This utility routine copies the metadata (if any) attached to the
193  'from' instruction in the IR to the 'to' instruction.
194 
195  For flexibility, this function takes an llvm::Value rather than an
196  llvm::Instruction for the 'to' parameter; in some places in the code
197  below, we start out storing a value in an llvm::Value and only later
198  replace it with an llvm::Instruction. If a plain llvm::Value is passed
199  in, the routine just returns without doing anything; if it is in fact an
200  llvm::Instruction, then the metadata can be copied to it.
201  */
202 static void
203 lCopyMetadata(llvm::Value *vto, const llvm::Instruction *from) {
204  llvm::Instruction *to = llvm::dyn_cast<llvm::Instruction>(vto);
205  if (!to)
206  return;
207 
208  llvm::SmallVector<std::pair<unsigned int, llvm::MDNode *>, 8> metadata;
209 
210  from->getAllMetadata(metadata);
211  for (unsigned int i = 0; i < metadata.size(); ++i)
212  to->setMetadata(metadata[i].first, metadata[i].second);
213 }
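// Usage sketch (illustrative; 'castPtr' and 'callInst' stand for values in a
// pass that rewrites a masked intrinsic call, much as IntrinsicsOpt does below):
#if 0
llvm::Instruction *loadInst =
    new llvm::LoadInst(castPtr, "example_load", false /* not volatile */,
                       callInst /* insert before */);
lCopyMetadata(loadInst, callInst);   // forward !filename, !first_line, etc.
#endif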
214 
215 
216 /** We have a protocol with the front-end LLVM IR code generation process
217  that allows us to encode the source file position that corresponds with
218  instructions. (For example, this allows us to issue performance
219  warnings related to things like scatter and gather after optimization
220  has been performed, so that we aren't warning about scatters and
221  gathers that have been improved to stores and loads by optimization
222  passes.) Note that this is slightly redundant with the source file
223  position encoding generated for debugging symbols; however, debugging
224  information isn't always generated, whereas this position data always
225  is.
226 
227  This function finds the SourcePos that the metadata in the instruction
228  (if present) corresponds to. See the implementation of
229  FunctionEmitContext::addGSMetadata(), which encodes the source position during
230  code generation.
231 
232  @param inst Instruction to try to find the source position of
233  @param pos Output variable in which to store the position
234  @returns True if source file position metadata was present and *pos
235  has been set. False otherwise.
236 */
237 static bool
238 lGetSourcePosFromMetadata(const llvm::Instruction *inst, SourcePos *pos) {
239  llvm::MDNode *filename = inst->getMetadata("filename");
240  llvm::MDNode *first_line = inst->getMetadata("first_line");
241  llvm::MDNode *first_column = inst->getMetadata("first_column");
242  llvm::MDNode *last_line = inst->getMetadata("last_line");
243  llvm::MDNode *last_column = inst->getMetadata("last_column");
244 
245  if (!filename || !first_line || !first_column || !last_line || !last_column)
246  return false;
247 
248  // All of these asserts are things that FunctionEmitContext::addGSMetadata() is
249  // expected to have done in its operation
250  llvm::MDString *str = llvm::dyn_cast<llvm::MDString>(filename->getOperand(0));
251  Assert(str);
252  llvm::ConstantInt *first_lnum =
253 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5
254  llvm::dyn_cast<llvm::ConstantInt>(first_line->getOperand(0));
255 #else /* LLVM 3.6+ */
256  llvm::mdconst::extract<llvm::ConstantInt>(first_line->getOperand(0));
257 #endif
258  Assert(first_lnum);
259 
260  llvm::ConstantInt *first_colnum =
261 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5
262  llvm::dyn_cast<llvm::ConstantInt>(first_column->getOperand(0));
263 #else /* LLVM 3.6+ */
264  llvm::mdconst::extract<llvm::ConstantInt>(first_column->getOperand(0));
265 #endif
266  Assert(first_colnum);
267 
268  llvm::ConstantInt *last_lnum =
269 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5
270  llvm::dyn_cast<llvm::ConstantInt>(last_line->getOperand(0));
271 #else /* LLVM 3.6+ */
272  llvm::mdconst::extract<llvm::ConstantInt>(last_line->getOperand(0));
273 #endif
274  Assert(last_lnum);
275 
276  llvm::ConstantInt *last_colnum =
277 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5
278  llvm::dyn_cast<llvm::ConstantInt>(last_column->getOperand(0));
279 #else /* LLVM 3.6+ */
280  llvm::mdconst::extract<llvm::ConstantInt>(last_column->getOperand(0));
281 #endif
282  Assert(last_colnum);
283 
284  *pos = SourcePos(str->getString().data(), (int)first_lnum->getZExtValue(),
285  (int)first_colnum->getZExtValue(), (int)last_lnum->getZExtValue(),
286  (int)last_colnum->getZExtValue());
287  return true;
288 }
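// Usage sketch (illustrative; 'inst' is a hypothetical gather/scatter call and
// PerformanceWarning() is ispc's reporting helper from util.h):
#if 0
SourcePos pos;
if (lGetSourcePosFromMetadata(inst, &pos))
    PerformanceWarning(pos, "Gather required to load value.");
#endif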
289 
290 
291 static llvm::Instruction *
292 lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
293  const char *name, llvm::Instruction *insertBefore = NULL) {
294  llvm::Value *args[2] = { arg0, arg1 };
295  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[2]);
296  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
297 }
298 
299 
300 static llvm::Instruction *
301 lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
302  llvm::Value *arg2, const char *name,
303  llvm::Instruction *insertBefore = NULL) {
304  llvm::Value *args[3] = { arg0, arg1, arg2 };
305  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[3]);
306  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
307 }
308 
309 
310 static llvm::Instruction *
311 lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
312  llvm::Value *arg2, llvm::Value *arg3, const char *name,
313  llvm::Instruction *insertBefore = NULL) {
314  llvm::Value *args[4] = { arg0, arg1, arg2, arg3 };
315  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[4]);
316  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
317 }
318 
319 static llvm::Instruction *
320 lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
321  llvm::Value *arg2, llvm::Value *arg3, llvm::Value *arg4,
322  const char *name, llvm::Instruction *insertBefore = NULL) {
323  llvm::Value *args[5] = { arg0, arg1, arg2, arg3, arg4 };
324  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[5]);
325  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
326 }
327 
328 
329 static llvm::Instruction *
330 lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
331  llvm::Value *arg2, llvm::Value *arg3, llvm::Value *arg4,
332  llvm::Value *arg5, const char *name,
333  llvm::Instruction *insertBefore = NULL) {
334  llvm::Value *args[6] = { arg0, arg1, arg2, arg3, arg4, arg5 };
335  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[6]);
336  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
337 }
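// Usage sketch (illustrative; the builtin name and the operand values here are
// hypothetical): these overloads build a call to an ispc builtin with explicit
// operands, inserted before an existing instruction.
#if 0
llvm::Function *gatherFunc =
    m->module->getFunction("__pseudo_gather_base_offsets32_i32");
llvm::Instruction *call =
    lCallInst(gatherFunc, basePtr, offsetScale, offsets, maskValue,
              "gather", insertBefore);
#endif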
338 
339 
340 static llvm::Instruction *
341 lGEPInst(llvm::Value *ptr, llvm::Value *offset, const char *name,
342  llvm::Instruction *insertBefore) {
343  llvm::Value *index[1] = { offset };
344  llvm::ArrayRef<llvm::Value *> arrayRef(&index[0], &index[1]);
345 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
346  return llvm::GetElementPtrInst::Create(ptr, arrayRef, name,
347  insertBefore);
348 #else // LLVM 3.7+
349  return llvm::GetElementPtrInst::Create(PTYPE(ptr), ptr, arrayRef,
350  name, insertBefore);
351 #endif
352 }
353 
354 
355 /** Given a vector of constant values (int, float, or bool) representing an
356  execution mask, convert it to a bitvector where the 0th bit corresponds
357  to the first vector value and so forth.
358 */
359 static uint64_t
360 lConstElementsToMask(const llvm::SmallVector<llvm::Constant *,
361  ISPC_MAX_NVEC> &elements) {
362  Assert(elements.size() <= 64);
363 
364  uint64_t mask = 0;
365  for (unsigned int i = 0; i < elements.size(); ++i) {
366  llvm::APInt intMaskValue;
367  // SSE has the "interesting" approach of encoding blending
368  // masks as <n x float>.
369  llvm::ConstantFP *cf = llvm::dyn_cast<llvm::ConstantFP>(elements[i]);
370  if (cf != NULL) {
371  llvm::APFloat apf = cf->getValueAPF();
372  intMaskValue = apf.bitcastToAPInt();
373  }
374  else {
375  // Otherwise get it as an int
376  llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(elements[i]);
377  Assert(ci != NULL); // vs return -1 if NULL?
378  intMaskValue = ci->getValue();
379  }
380  // Is the high-bit set? If so, OR in the appropriate bit in
381  // the result mask
382  if (intMaskValue.countLeadingOnes() > 0)
383  mask |= (1ull << i);
384  }
385  return mask;
386 }
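// Worked example (illustrative): for a 4-wide mask whose constant elements are
// < -1, 0, 0, -1 > (or float elements whose sign bit is set in lanes 0 and 3),
// the high-bit test succeeds for lanes 0 and 3, so the returned mask is
// 0b1001 == 0x9.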
387 
388 
389 /** Given an llvm::Value representing a vector mask, see if the value is a
390  constant. If so, return true and set *bits to be the integer mask
391  found by taking the high bits of the mask values in turn and
392  concatenating them into a single integer. In other words, given the
393  4-wide mask: < 0xffffffff, 0, 0, 0xffffffff >, we have 0b1001 = 9.
394  */
395 static bool
396 lGetMask(llvm::Value *factor, uint64_t *mask) {
397  llvm::ConstantDataVector *cdv = llvm::dyn_cast<llvm::ConstantDataVector>(factor);
398  if (cdv != NULL) {
399  llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
400  for (int i = 0; i < (int)cdv->getNumElements(); ++i)
401  elements.push_back(cdv->getElementAsConstant(i));
402  *mask = lConstElementsToMask(elements);
403  return true;
404  }
405 
406  llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(factor);
407  if (cv != NULL) {
408  llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
409  for (int i = 0; i < (int)cv->getNumOperands(); ++i) {
410  llvm::Constant *c =
411  llvm::dyn_cast<llvm::Constant>(cv->getOperand(i));
412  if (c == NULL)
413  return false;
414  if (llvm::isa<llvm::ConstantExpr>(cv->getOperand(i)) )
415  return false; // We can not handle constant expressions here
416  elements.push_back(c);
417  }
418  *mask = lConstElementsToMask(elements);
419  return true;
420  }
421  else if (llvm::isa<llvm::ConstantAggregateZero>(factor)) {
422  *mask = 0;
423  return true;
424  }
425  else {
426 #if 0
427  llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(factor);
428  if (ce != NULL) {
429  llvm::TargetMachine *targetMachine = g->target->GetTargetMachine();
430  const llvm::TargetData *td = targetMachine->getTargetData();
431  llvm::Constant *c = llvm::ConstantFoldConstantExpression(ce, td);
432  c->dump();
433  factor = c;
434  }
435  // else we should be able to handle it above...
436  Assert(!llvm::isa<llvm::Constant>(factor));
437 #endif
438  return false;
439  }
440 }
441 
442 
443 enum MaskStatus { ALL_ON, ALL_OFF, MIXED, UNKNOWN };
444 
445 /** Determines if the given mask value is all on, all off, mixed, or
446  unknown at compile time.
447 */
448 static MaskStatus
449 lGetMaskStatus(llvm::Value *mask, int vecWidth = -1) {
450  uint64_t bits;
451  if (lGetMask(mask, &bits) == false)
452  return UNKNOWN;
453 
454  if (bits == 0)
455  return ALL_OFF;
456 
457  if (vecWidth == -1)
458  vecWidth = g->target->getVectorWidth();
459  Assert(vecWidth <= 64);
460 
461  for (int i = 0; i < vecWidth; ++i) {
462  if ((bits & (1ull << i)) == 0)
463  return MIXED;
464  }
465  return ALL_ON;
466 }
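// Usage sketch (illustrative; 'factor' is a hypothetical select/blend mask
// operand, as in InstructionSimplifyPass::simplifySelect() below):
#if 0
switch (lGetMaskStatus(factor)) {
case ALL_ON:   /* keep only the "on" operand */      break;
case ALL_OFF:  /* keep only the "off" operand */     break;
case MIXED:
case UNKNOWN:  /* leave the instruction unchanged */ break;
}
#endif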
467 
468 
469 ///////////////////////////////////////////////////////////////////////////
470 // This is a wrapper around llvm::PassManager. It forwards PassManager's run()
471 // and extends its add() with some extra checks and optional debug passes.
472 // The wrapper can control:
473 // - whether the optimization with a given number is switched off;
474 // - whether the LLVM IR is dumped after the optimization with a given number;
475 // - whether LLVM IR debug info for gdb is generated after the optimization with a given number.
476 class DebugPassManager {
477 public:
478  DebugPassManager(): number(0){}
479  void add(llvm::Pass * P, int stage);
480  bool run(llvm::Module& M) {return PM.run(M);}
481 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
482  llvm::PassManager& getPM() {return PM;}
483 #else /* LLVM 3.7+ */
484  llvm::legacy::PassManager& getPM() {return PM;}
485 #endif
486 private:
487 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
488  llvm::PassManager PM;
489 #else /* LLVM 3.7+ */
490  llvm::legacy::PassManager PM;
491 #endif
492  int number;
493 };
494 
495 void
496 DebugPassManager::add(llvm::Pass * P, int stage = -1) {
497  // Track the number of the current optimization stage.
498  if (stage == -1) {
499  number++;
500  }
501  else {
502  number = stage;
503  }
504  if (g->off_stages.find(number) == g->off_stages.end()) {
505  // add the optimization pass (it hasn't been switched off)
506  PM.add(P);
507  if (g->debug_stages.find(number) != g->debug_stages.end()) {
508  // add a dump of the LLVM IR after this optimization
509  char buf[100];
510 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
511  sprintf(buf, "\n\n*****LLVM IR after phase %d: %s*****\n\n",
512  number, P->getPassName());
513 #else // LLVM 4.0+
514  sprintf(buf, "\n\n*****LLVM IR after phase %d: %s*****\n\n",
515  number, P->getPassName().data());
516 #endif
517  PM.add(CreateDebugPass(buf));
518  }
519 
520 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_4 || ISPC_LLVM_VERSION == ISPC_LLVM_3_5 // only 3.4 and 3.5
521  if (g->debugIR == number) {
522  // add generation of LLVM IR debug info after this optimization
523  char buf[100];
524  sprintf(buf, "Debug_IR_after_%d_phase.bc", number);
525  PM.add(llvm::createDebugIRPass(true, true, ".", buf));
526  }
527 #endif
528  }
529 }
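// Usage note (approximate; based on ispc's command-line help): the stage
// numbers registered above correspond to ispc's --debug-phase=<num> and
// --off-phase=<num> flags, which respectively dump the IR after, or skip, the
// pass registered with that number. Treat the exact flag spelling as an
// assumption here.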
530 ///////////////////////////////////////////////////////////////////////////
531 
532 void
533 Optimize(llvm::Module *module, int optLevel) {
534  if (g->debugPrint) {
535  printf("*** Code going into optimization ***\n");
536  module->dump();
537  }
538  DebugPassManager optPM;
539  optPM.add(llvm::createVerifierPass(),0);
540 
541 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
542  llvm::TargetLibraryInfo *targetLibraryInfo =
543  new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple()));
544  optPM.add(targetLibraryInfo);
545 #else // LLVM 3.7+
546  optPM.add(new llvm::TargetLibraryInfoWrapperPass(llvm::Triple(module->getTargetTriple())));
547 #endif
548 
549 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_4
550  optPM.add(new llvm::DataLayout(*g->target->getDataLayout()));
551 #elif ISPC_LLVM_VERSION == ISPC_LLVM_3_5
552  optPM.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
553 #elif ISPC_LLVM_VERSION == ISPC_LLVM_3_6
554  llvm::DataLayoutPass *dlp= new llvm::DataLayoutPass();
555  dlp->doInitialization(*module);
556  optPM.add(dlp);
557 #endif // LLVM 3.7+ doesn't have DataLayoutPass anymore.
558 
559  llvm::TargetMachine *targetMachine = g->target->GetTargetMachine();
560 
561 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
562  optPM.add(new llvm::TargetTransformInfo(targetMachine->getScalarTargetTransformInfo(),
563  targetMachine->getVectorTargetTransformInfo()));
564 #elif ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
565  targetMachine->addAnalysisPasses(optPM.getPM());
566 #else // LLVM 3.7+
567  optPM.getPM().add(createTargetTransformInfoWrapperPass(targetMachine->getTargetIRAnalysis()));
568 #endif
569 
570  optPM.add(llvm::createIndVarSimplifyPass());
571 
572  if (optLevel == 0) {
573  // This is more or less the minimum set of optimizations that we
574  // need to do to generate code that will actually run. (We can't
575  // run absolutely no optimizations, since the front-end needs us to
576  // take the various __pseudo_* functions it has emitted and turn
577  // them into something that can actually execute.)
578  optPM.add(CreateImproveMemoryOpsPass(), 100);
579 #ifdef ISPC_NVPTX_ENABLED
580  if (g->opt.disableGatherScatterOptimizations == false &&
581  g->target->getVectorWidth() > 1)
582 #endif /* ISPC_NVPTX_ENABLED */
583  optPM.add(CreateImproveMemoryOpsPass(), 100);
584 
585  if (g->opt.disableHandlePseudoMemoryOps == false)
586  optPM.add(CreateReplacePseudoMemoryOpsPass(), 101);
587 
588  optPM.add(CreateIntrinsicsOptPass(), 102);
590  optPM.add(llvm::createFunctionInliningPass());
592  optPM.add(llvm::createCFGSimplificationPass());
593  optPM.add(llvm::createGlobalDCEPass());
594  }
595  else {
596  llvm::PassRegistry *registry = llvm::PassRegistry::getPassRegistry();
597  llvm::initializeCore(*registry);
598  llvm::initializeScalarOpts(*registry);
599  llvm::initializeIPO(*registry);
600  llvm::initializeAnalysis(*registry);
601 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_7
602  llvm::initializeIPA(*registry);
603 #endif
604  llvm::initializeTransformUtils(*registry);
605  llvm::initializeInstCombine(*registry);
606  llvm::initializeInstrumentation(*registry);
607  llvm::initializeTarget(*registry);
608 
609  optPM.add(llvm::createGlobalDCEPass(), 185);
610 
611  // Set up to use LLVM's default AliasAnalysis.
612  // Ideally, we would call:
613  // llvm::PassManagerBuilder pm_Builder;
614  // pm_Builder.OptLevel = optLevel;
615  // pm_Builder.addInitialAliasAnalysisPasses(optPM);
616  // but addInitialAliasAnalysisPasses() is a private function,
617  // so we enable the passes explicitly here.
618  // This needs to be kept in sync with future LLVM changes.
619  // An alternative is to call populateFunctionPassManager().
620 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_7
621  optPM.add(llvm::createTypeBasedAliasAnalysisPass(), 190);
622  optPM.add(llvm::createBasicAliasAnalysisPass());
623 #else
624  optPM.add(llvm::createTypeBasedAAWrapperPass(), 190);
625  optPM.add(llvm::createBasicAAWrapperPass());
626 #endif
627  optPM.add(llvm::createCFGSimplificationPass());
628 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
629  optPM.add(llvm::createScalarReplAggregatesPass());
630 #else
631  optPM.add(llvm::createSROAPass());
632 #endif
633  optPM.add(llvm::createEarlyCSEPass());
634  optPM.add(llvm::createLowerExpectIntrinsicPass());
635 
636  // Early optimizations to try to reduce the total amount of code to
637  // work with if we can
638  optPM.add(llvm::createReassociatePass(), 200);
639  optPM.add(llvm::createConstantPropagationPass());
640  optPM.add(llvm::createDeadInstEliminationPass());
641  optPM.add(llvm::createCFGSimplificationPass());
642 
643  optPM.add(llvm::createPromoteMemoryToRegisterPass());
644  optPM.add(llvm::createAggressiveDCEPass());
645 
646 
647  if (g->opt.disableGatherScatterOptimizations == false &&
648  g->target->getVectorWidth() > 1) {
649  optPM.add(llvm::createInstructionCombiningPass(), 210);
651  }
652  if (!g->opt.disableMaskAllOnOptimizations) {
653  optPM.add(CreateIntrinsicsOptPass(), 215);
654  optPM.add(CreateInstructionSimplifyPass());
655  }
656  optPM.add(llvm::createDeadInstEliminationPass(), 220);
657 
658  // Max struct size threshold for scalar replacement is
659  // 1) 4 fields (r,g,b,w)
660  // 2) field size: vectorWidth * sizeof(float)
661 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
662  const int field_limit = 4;
663  int sr_threshold = g->target->getVectorWidth() * sizeof(float) * field_limit;
664 #endif
665 
666  // On to more serious optimizations
667 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
668  optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold));
669 #else
670  optPM.add(llvm::createSROAPass());
671 #endif
672  optPM.add(llvm::createInstructionCombiningPass());
673  optPM.add(llvm::createCFGSimplificationPass());
674  optPM.add(llvm::createPromoteMemoryToRegisterPass());
675  optPM.add(llvm::createGlobalOptimizerPass());
676  optPM.add(llvm::createReassociatePass());
677  optPM.add(llvm::createIPConstantPropagationPass());
678 
679 #ifdef ISPC_NVPTX_ENABLED
680  if (g->target->getISA() != Target::NVPTX)
681 #endif /* ISPC_NVPTX_ENABLED */
682  optPM.add(CreateReplaceStdlibShiftPass(),229);
683 
684  optPM.add(llvm::createDeadArgEliminationPass(),230);
685  optPM.add(llvm::createInstructionCombiningPass());
686  optPM.add(llvm::createCFGSimplificationPass());
687  optPM.add(llvm::createPruneEHPass());
688 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_9 // 3.9+
689  optPM.add(llvm::createPostOrderFunctionAttrsLegacyPass());
690  optPM.add(llvm::createReversePostOrderFunctionAttrsPass());
691 #elif ISPC_LLVM_VERSION == ISPC_LLVM_3_8 // 3.8
692  optPM.add(llvm::createPostOrderFunctionAttrsPass());
693  optPM.add(llvm::createReversePostOrderFunctionAttrsPass());
694 #else // 3.7 and earlier
695  optPM.add(llvm::createFunctionAttrsPass());
696 #endif
697  optPM.add(llvm::createFunctionInliningPass());
698  optPM.add(llvm::createConstantPropagationPass());
699  optPM.add(llvm::createDeadInstEliminationPass());
700  optPM.add(llvm::createCFGSimplificationPass());
701 
702  optPM.add(llvm::createArgumentPromotionPass());
703 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_3
704  // Starting from 3.4 this functionality was moved to
705  // InstructionCombiningPass. See r184459 for details.
706  optPM.add(llvm::createSimplifyLibCallsPass(), 240);
707 #endif
708  optPM.add(llvm::createAggressiveDCEPass());
709  optPM.add(llvm::createInstructionCombiningPass(), 241);
710  optPM.add(llvm::createJumpThreadingPass());
711  optPM.add(llvm::createCFGSimplificationPass());
712 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
713  optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold));
714 #else
715  optPM.add(llvm::createSROAPass());
716 #endif
717  optPM.add(llvm::createInstructionCombiningPass());
718  optPM.add(llvm::createTailCallEliminationPass());
719 
720  if (!g->opt.disableMaskAllOnOptimizations) {
721  optPM.add(CreateIntrinsicsOptPass(), 250);
722  optPM.add(CreateInstructionSimplifyPass());
723  }
724 
725  if (g->opt.disableGatherScatterOptimizations == false &&
726  g->target->getVectorWidth() > 1) {
727  optPM.add(llvm::createInstructionCombiningPass(), 255);
729 
730  if (g->opt.disableCoalescing == false &&
731  g->target->getISA() != Target::GENERIC) {
732  // It is important to run this here to make it easier to
733  // find matching gathers that we can coalesce.
734  optPM.add(llvm::createEarlyCSEPass(), 260);
735  optPM.add(CreateGatherCoalescePass());
736  }
737  }
738 
739  optPM.add(llvm::createFunctionInliningPass(), 265);
740  optPM.add(llvm::createConstantPropagationPass());
741  optPM.add(CreateIntrinsicsOptPass());
743 
744  if (g->opt.disableGatherScatterOptimizations == false &&
745  g->target->getVectorWidth() > 1) {
746  optPM.add(llvm::createInstructionCombiningPass(), 270);
748  }
749 
750  optPM.add(llvm::createIPSCCPPass(), 275);
751  optPM.add(llvm::createDeadArgEliminationPass());
752  optPM.add(llvm::createAggressiveDCEPass());
753  optPM.add(llvm::createInstructionCombiningPass());
754  optPM.add(llvm::createCFGSimplificationPass());
755 
756  if (g->opt.disableHandlePseudoMemoryOps == false) {
757  optPM.add(CreateReplacePseudoMemoryOpsPass());
758  }
759  optPM.add(CreateIntrinsicsOptPass(),281);
761 
762  optPM.add(llvm::createFunctionInliningPass());
763  optPM.add(llvm::createArgumentPromotionPass());
764 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
765  optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold, false));
766 #else
767  optPM.add(llvm::createSROAPass());
768 #endif
769  optPM.add(llvm::createInstructionCombiningPass());
771  optPM.add(llvm::createCFGSimplificationPass());
772  optPM.add(llvm::createReassociatePass());
773  optPM.add(llvm::createLoopRotatePass());
774  optPM.add(llvm::createLICMPass());
775  optPM.add(llvm::createLoopUnswitchPass(false));
776  optPM.add(llvm::createInstructionCombiningPass());
778  optPM.add(llvm::createIndVarSimplifyPass());
779  optPM.add(llvm::createLoopIdiomPass());
780  optPM.add(llvm::createLoopDeletionPass());
781  if (g->opt.unrollLoops) {
782  optPM.add(llvm::createLoopUnrollPass(), 300);
783  }
784  optPM.add(llvm::createGVNPass(), 301);
785 
787  optPM.add(CreateIntrinsicsOptPass());
789 
790  optPM.add(llvm::createMemCpyOptPass());
791  optPM.add(llvm::createSCCPPass());
792  optPM.add(llvm::createInstructionCombiningPass());
794  optPM.add(llvm::createJumpThreadingPass());
795  optPM.add(llvm::createCorrelatedValuePropagationPass());
796  optPM.add(llvm::createDeadStoreEliminationPass());
797  optPM.add(llvm::createAggressiveDCEPass());
798  optPM.add(llvm::createCFGSimplificationPass());
799  optPM.add(llvm::createInstructionCombiningPass());
801  optPM.add(CreatePeepholePass());
802  optPM.add(llvm::createFunctionInliningPass());
803  optPM.add(llvm::createAggressiveDCEPass());
804  optPM.add(llvm::createStripDeadPrototypesPass());
806  optPM.add(llvm::createGlobalDCEPass());
807  optPM.add(llvm::createConstantMergePass());
808 
809  // Should be the last
810  optPM.add(CreateFixBooleanSelectPass(), 400);
811 #ifdef ISPC_NVPTX_ENABLED
812  if (g->target->getISA() == Target::NVPTX)
813  {
814  optPM.add(CreatePromoteLocalToPrivatePass());
815  optPM.add(llvm::createGlobalDCEPass());
816 
817  optPM.add(llvm::createTypeBasedAliasAnalysisPass());
818  optPM.add(llvm::createBasicAliasAnalysisPass());
819  optPM.add(llvm::createCFGSimplificationPass());
820  // Here clang has an experimental pass SROAPass instead of
821  // ScalarReplAggregatesPass. We should add it in the future.
822 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
823  optPM.add(llvm::createScalarReplAggregatesPass());
824 #else
825  optPM.add(llvm::createSROAPass());
826 #endif
827  optPM.add(llvm::createEarlyCSEPass());
828  optPM.add(llvm::createLowerExpectIntrinsicPass());
829  optPM.add(llvm::createTypeBasedAliasAnalysisPass());
830  optPM.add(llvm::createBasicAliasAnalysisPass());
831 
832  // Early optimizations to try to reduce the total amount of code to
833  // work with if we can
834  optPM.add(llvm::createReassociatePass());
835  optPM.add(llvm::createConstantPropagationPass());
836  optPM.add(llvm::createDeadInstEliminationPass());
837  optPM.add(llvm::createCFGSimplificationPass());
838 
839  optPM.add(llvm::createPromoteMemoryToRegisterPass());
840  optPM.add(llvm::createAggressiveDCEPass());
841 
842 
843  optPM.add(llvm::createInstructionCombiningPass());
844  optPM.add(llvm::createDeadInstEliminationPass());
845 
846  // On to more serious optimizations
847  optPM.add(llvm::createInstructionCombiningPass());
848  optPM.add(llvm::createCFGSimplificationPass());
849  optPM.add(llvm::createPromoteMemoryToRegisterPass());
850  optPM.add(llvm::createGlobalOptimizerPass());
851  optPM.add(llvm::createReassociatePass());
852  optPM.add(llvm::createIPConstantPropagationPass());
853 
854  optPM.add(llvm::createDeadArgEliminationPass());
855  optPM.add(llvm::createInstructionCombiningPass());
856  optPM.add(llvm::createCFGSimplificationPass());
857  optPM.add(llvm::createPruneEHPass());
858  optPM.add(llvm::createFunctionAttrsPass());
859  optPM.add(llvm::createFunctionInliningPass());
860  optPM.add(llvm::createConstantPropagationPass());
861  optPM.add(llvm::createDeadInstEliminationPass());
862  optPM.add(llvm::createCFGSimplificationPass());
863 
864  optPM.add(llvm::createArgumentPromotionPass());
865 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_3
866  // Starting from 3.4 this functionality was moved to
867  // InstructionCombiningPass. See r184459 for details.
868  optPM.add(llvm::createSimplifyLibCallsPass());
869 #endif
870  optPM.add(llvm::createAggressiveDCEPass());
871  optPM.add(llvm::createInstructionCombiningPass());
872  optPM.add(llvm::createJumpThreadingPass());
873  optPM.add(llvm::createCFGSimplificationPass());
874  optPM.add(llvm::createInstructionCombiningPass());
875  optPM.add(llvm::createTailCallEliminationPass());
876 
877  optPM.add(llvm::createInstructionCombiningPass());
878 
879  optPM.add(llvm::createFunctionInliningPass());
880  optPM.add(llvm::createConstantPropagationPass());
881 
882  optPM.add(llvm::createInstructionCombiningPass());
883 
884  optPM.add(llvm::createIPSCCPPass());
885  optPM.add(llvm::createDeadArgEliminationPass());
886  optPM.add(llvm::createAggressiveDCEPass());
887  optPM.add(llvm::createInstructionCombiningPass());
888  optPM.add(llvm::createCFGSimplificationPass());
889 
890  optPM.add(llvm::createFunctionInliningPass());
891  optPM.add(llvm::createArgumentPromotionPass());
892  optPM.add(llvm::createInstructionCombiningPass());
893  optPM.add(llvm::createCFGSimplificationPass());
894  optPM.add(llvm::createReassociatePass());
895  optPM.add(llvm::createLoopRotatePass());
896  optPM.add(llvm::createLICMPass());
897 // optPM.add(llvm::createLoopUnswitchPass(false));
898 #if 1
899  optPM.add(llvm::createInstructionCombiningPass());
900  optPM.add(llvm::createIndVarSimplifyPass());
901  optPM.add(llvm::createLoopIdiomPass());
902  optPM.add(llvm::createLoopDeletionPass());
903  optPM.add(llvm::createLoopUnrollPass());
904  optPM.add(llvm::createGVNPass());
905  optPM.add(llvm::createMemCpyOptPass());
906  optPM.add(llvm::createSCCPPass());
907  optPM.add(llvm::createInstructionCombiningPass());
908  optPM.add(llvm::createJumpThreadingPass());
909  optPM.add(llvm::createCorrelatedValuePropagationPass());
910  optPM.add(llvm::createDeadStoreEliminationPass());
911  optPM.add(llvm::createAggressiveDCEPass());
912  optPM.add(llvm::createCFGSimplificationPass());
913  optPM.add(llvm::createInstructionCombiningPass());
914  optPM.add(llvm::createFunctionInliningPass());
915  optPM.add(llvm::createAggressiveDCEPass());
916  optPM.add(llvm::createStripDeadPrototypesPass());
917  optPM.add(llvm::createGlobalDCEPass());
918  optPM.add(llvm::createConstantMergePass());
919 #endif
920  }
921 #endif /* ISPC_NVPTX_ENABLED */
922  }
923 
924  // Finish up by making sure we didn't mess anything up in the IR along
925  // the way.
926  optPM.add(llvm::createVerifierPass(), LAST_OPT_NUMBER);
927  optPM.run(*module);
928 
929  if (g->debugPrint) {
930  printf("\n*****\nFINAL OUTPUT\n*****\n");
931  module->dump();
932  }
933 
934 }
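// Usage sketch (illustrative): the driver invokes this entry point roughly as
//     Optimize(module, g->opt.level);
// after the front end has emitted the module (see module.cpp; treat the exact
// call site as an assumption here).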
935 
936 
937 
938 ///////////////////////////////////////////////////////////////////////////
939 // IntrinsicsOpt
940 
941 /** This is a relatively simple optimization pass that does a few small
942  optimizations that LLVM's x86 optimizer doesn't currently handle.
943  (Specifically, MOVMSK of a constant can be replaced with the
944  corresponding constant value, and BLENDVPS and AVX masked load/store
945  with either an 'all on' or 'all off' mask can be replaced with simpler
946  operations.)
947 
948  @todo The better thing to do would be to submit a patch to LLVM to get
949  these; they're presumably pretty simple patterns to match.
950 */
951 class IntrinsicsOpt : public llvm::BasicBlockPass {
952 public:
953  IntrinsicsOpt() : BasicBlockPass(ID) {};
954 
955 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
956  const char *getPassName() const { return "Intrinsics Cleanup Optimization"; }
957 #else // LLVM 4.0+
958  llvm::StringRef getPassName() const { return "Intrinsics Cleanup Optimization"; }
959 #endif
960  bool runOnBasicBlock(llvm::BasicBlock &BB);
961 
962  static char ID;
963 
964 private:
965  struct MaskInstruction {
966  MaskInstruction(llvm::Function *f) { function = f; }
967  llvm::Function *function;
968  };
969  std::vector<MaskInstruction> maskInstructions;
970 
971  /** Structure that records everything we need to know about a blend
972  instruction for this optimization pass.
973  */
974  struct BlendInstruction {
975  BlendInstruction(llvm::Function *f, uint64_t ao, int o0, int o1, int of)
976  : function(f), allOnMask(ao), op0(o0), op1(o1), opFactor(of) { }
977  /** Function pointer for the blend instruction */
978  llvm::Function *function;
979  /** Mask value for an "all on" mask for this instruction */
980  uint64_t allOnMask;
981  /** The operand number in the llvm CallInst corresponds to the
982  first operand to blend with. */
983  int op0;
984  /** The operand number in the CallInst corresponding to the second
985  operand to blend with. */
986  int op1;
987  /** The operand in the call inst where the blending factor is
988  found. */
989  int opFactor;
990  };
991  std::vector<BlendInstruction> blendInstructions;
992 
993  bool matchesMaskInstruction(llvm::Function *function);
994  BlendInstruction *matchingBlendInstruction(llvm::Function *function);
995 };
996 
997 char IntrinsicsOpt::ID = 0;
998 
999 
1000 /** Given an llvm::Value, return true if we can determine that it's an
1001  undefined value. This makes only a weak attempt at chasing it down,
1002  detecting just flat-out undef values and bitcasts of undef values.
1003 
1004  @todo Is it worth working harder to find more of these? It starts to
1005  get tricky, since having an undef operand doesn't necessarily mean that
1006  the result will be undefined. (And for that matter, is there an LLVM
1007  call that will do this for us?)
1008  */
1009 static bool
1010 lIsUndef(llvm::Value *value) {
1011  if (llvm::isa<llvm::UndefValue>(value))
1012  return true;
1013 
1014  llvm::BitCastInst *bci = llvm::dyn_cast<llvm::BitCastInst>(value);
1015  if (bci)
1016  return lIsUndef(bci->getOperand(0));
1017 
1018  return false;
1019 }
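// Example (illustrative): lIsUndef() returns true both for a plain 'undef'
// operand and for something like
//     %u = bitcast <8 x float> undef to <8 x i32>
// which lets the blend simplification below pick the defined operand.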
1020 
1021 
1022 bool
1023 IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
1024  DEBUG_START_PASS("IntrinsicsOpt");
1025 
1026  // We can't initialize the mask/blend function vectors during pass initialization,
1027  // as those functions may be optimized out by the time the pass is invoked.
1028 
1029  // All of the mask instructions we may encounter. Note that even if
1030  // compiling for AVX, we may still encounter the regular 4-wide SSE
1031  // MOVMSK instruction.
1032  if (llvm::Function *ssei8Movmsk =
1033  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_sse2_pmovmskb_128))) {
1034  maskInstructions.push_back(ssei8Movmsk);
1035  }
1036  if (llvm::Function *sseFloatMovmsk =
1037  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_sse_movmsk_ps))) {
1038  maskInstructions.push_back(sseFloatMovmsk);
1039  }
1040  if (llvm::Function *__movmsk =
1041  m->module->getFunction("__movmsk")) {
1042  maskInstructions.push_back(__movmsk);
1043  }
1044  if (llvm::Function *avxFloatMovmsk =
1045  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_movmsk_ps_256))) {
1046  maskInstructions.push_back(avxFloatMovmsk);
1047  }
1048 
1049  // And all of the blend instructions
1050  blendInstructions.push_back(BlendInstruction(
1051  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_sse41_blendvps)),
1052  0xf, 0, 1, 2));
1053  blendInstructions.push_back(BlendInstruction(
1054  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_blendv_ps_256)),
1055  0xff, 0, 1, 2));
1056 
1057  llvm::Function *avxMaskedLoad32 =
1058  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskload_ps_256));
1059  llvm::Function *avxMaskedLoad64 =
1060  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskload_pd_256));
1061  llvm::Function *avxMaskedStore32 =
1062  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskstore_ps_256));
1063  llvm::Function *avxMaskedStore64 =
1064  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskstore_pd_256));
1065 
1066  bool modifiedAny = false;
1067  restart:
1068  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
1069  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
1070  if (callInst == NULL || callInst->getCalledFunction() == NULL)
1071  continue;
1072 
1073  BlendInstruction *blend = matchingBlendInstruction(callInst->getCalledFunction());
1074  if (blend != NULL) {
1075  llvm::Value *v[2] = { callInst->getArgOperand(blend->op0),
1076  callInst->getArgOperand(blend->op1) };
1077  llvm::Value *factor = callInst->getArgOperand(blend->opFactor);
1078 
1079  // If the values are the same, then no need to blend..
1080  if (v[0] == v[1]) {
1081  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
1082  iter, v[0]);
1083  modifiedAny = true;
1084  goto restart;
1085  }
1086 
1087  // If one of the two is undefined, we're allowed to replace
1088  // with the value of the other. (In other words, the only
1089  // valid case is that the blend factor ends up having a value
1090  // that only selects from the defined one of the two operands,
1091  // otherwise the result is undefined and any value is fine,
1092  // ergo the defined one is an acceptable result.)
1093  if (lIsUndef(v[0])) {
1094  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
1095  iter, v[1]);
1096  modifiedAny = true;
1097  goto restart;
1098  }
1099  if (lIsUndef(v[1])) {
1100  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
1101  iter, v[0]);
1102  modifiedAny = true;
1103  goto restart;
1104  }
1105 
1106  uint64_t mask;
1107  if (lGetMask(factor, &mask) == true) {
1108  llvm::Value *value = NULL;
1109  if (mask == 0)
1110  // Mask all off -> replace with the first blend value
1111  value = v[0];
1112  else if (mask == blend->allOnMask)
1113  // Mask all on -> replace with the second blend value
1114  value = v[1];
1115 
1116  if (value != NULL) {
1117  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
1118  iter, value);
1119  modifiedAny = true;
1120  goto restart;
1121  }
1122  }
1123  }
1124  else if (matchesMaskInstruction(callInst->getCalledFunction())) {
1125  llvm::Value *factor = callInst->getArgOperand(0);
1126  uint64_t mask;
1127  if (lGetMask(factor, &mask) == true) {
1128  // If the vector-valued mask has a known value, replace it
1129  // with the corresponding integer mask from its elements
1130  // high bits.
1131  llvm::Value *value = (callInst->getType() == LLVMTypes::Int32Type) ?
1132  LLVMInt32(mask) : LLVMInt64(mask);
1133  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
1134  iter, value);
1135  modifiedAny = true;
1136  goto restart;
1137  }
1138  }
1139  else if (callInst->getCalledFunction() == avxMaskedLoad32 ||
1140  callInst->getCalledFunction() == avxMaskedLoad64) {
1141  llvm::Value *factor = callInst->getArgOperand(1);
1142  uint64_t mask;
1143  if (lGetMask(factor, &mask) == true) {
1144  if (mask == 0) {
1145  // nothing being loaded, replace with undef value
1146  llvm::Type *returnType = callInst->getType();
1147  Assert(llvm::isa<llvm::VectorType>(returnType));
1148  llvm::Value *undefValue = llvm::UndefValue::get(returnType);
1149  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
1150  iter, undefValue);
1151  modifiedAny = true;
1152  goto restart;
1153  }
1154  else if (mask == 0xff) {
1155  // all lanes active; replace with a regular load
1156  llvm::Type *returnType = callInst->getType();
1157  Assert(llvm::isa<llvm::VectorType>(returnType));
1158  // cast the i8 * to the appropriate type
1159  const char *name = LLVMGetName(callInst->getArgOperand(0), "_cast");
1160  llvm::Value *castPtr =
1161  new llvm::BitCastInst(callInst->getArgOperand(0),
1162  llvm::PointerType::get(returnType, 0),
1163  name, callInst);
1164  lCopyMetadata(castPtr, callInst);
1165  int align;
1166  if (g->opt.forceAlignedMemory)
1167  align = g->target->getNativeVectorAlignment();
1168  else
1169  align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
1170  name = LLVMGetName(callInst->getArgOperand(0), "_load");
1171  llvm::Instruction *loadInst =
1172  new llvm::LoadInst(castPtr, name, false /* not volatile */,
1173  align, (llvm::Instruction *)NULL);
1174  lCopyMetadata(loadInst, callInst);
1175  llvm::ReplaceInstWithInst(callInst, loadInst);
1176  modifiedAny = true;
1177  goto restart;
1178  }
1179  }
1180  }
1181  else if (callInst->getCalledFunction() == avxMaskedStore32 ||
1182  callInst->getCalledFunction() == avxMaskedStore64) {
1183  // NOTE: mask is the 2nd parameter, not the 3rd one!!
1184  llvm::Value *factor = callInst->getArgOperand(1);
1185  uint64_t mask;
1186  if (lGetMask(factor, &mask) == true) {
1187  if (mask == 0) {
1188  // nothing actually being stored, just remove the inst
1189  callInst->eraseFromParent();
1190  modifiedAny = true;
1191  goto restart;
1192  }
1193  else if (mask == 0xff) {
1194  // all lanes storing, so replace with a regular store
1195  llvm::Value *rvalue = callInst->getArgOperand(2);
1196  llvm::Type *storeType = rvalue->getType();
1197  const char *name = LLVMGetName(callInst->getArgOperand(0),
1198  "_ptrcast");
1199  llvm::Value *castPtr =
1200  new llvm::BitCastInst(callInst->getArgOperand(0),
1201  llvm::PointerType::get(storeType, 0),
1202  name, callInst);
1203  lCopyMetadata(castPtr, callInst);
1204 
1205  llvm::StoreInst *storeInst =
1206  new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
1207  int align;
1208  if (g->opt.forceAlignedMemory)
1209  align = g->target->getNativeVectorAlignment();
1210  else
1211  align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
1212  storeInst->setAlignment(align);
1213  lCopyMetadata(storeInst, callInst);
1214  llvm::ReplaceInstWithInst(callInst, storeInst);
1215 
1216  modifiedAny = true;
1217  goto restart;
1218  }
1219  }
1220  }
1221  }
1222 
1223  DEBUG_END_PASS("IntrinsicsOpt");
1224 
1225  return modifiedAny;
1226 }
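// Worked example (illustrative): for an 8-wide blendvps call whose factor is
// the constant all-on mask (mask == allOnMask == 0xff), the loop above
// replaces the call outright with its second blend operand; with an all-zero
// factor it keeps the first operand instead.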
1227 
1228 
1229 bool
1230 IntrinsicsOpt::matchesMaskInstruction(llvm::Function *function) {
1231  for (unsigned int i = 0; i < maskInstructions.size(); ++i) {
1232  if (maskInstructions[i].function != NULL &&
1233  function == maskInstructions[i].function) {
1234  return true;
1235  }
1236  }
1237  return false;
1238 }
1239 
1240 
1241 IntrinsicsOpt::BlendInstruction *
1242 IntrinsicsOpt::matchingBlendInstruction(llvm::Function *function) {
1243  for (unsigned int i = 0; i < blendInstructions.size(); ++i) {
1244  if (blendInstructions[i].function != NULL &&
1245  function == blendInstructions[i].function) {
1246  return &blendInstructions[i];
1247  }
1248  }
1249  return NULL;
1250 }
1251 
1252 
1253 static llvm::Pass *
1254 CreateIntrinsicsOptPass() {
1255  return new IntrinsicsOpt;
1256 }
1257 
1258 
1259 ///////////////////////////////////////////////////////////////////////////
1260 
1261 /** This simple optimization pass looks for a vector select instruction
1262  with an all-on or all-off constant mask, simplifying it to the
1263  appropriate operand if so.
1264 
1265  @todo The better thing to do would be to submit a patch to LLVM to get
1266  these; they're presumably pretty simple patterns to match.
1267 */
1268 class InstructionSimplifyPass : public llvm::BasicBlockPass {
1269 public:
1270  InstructionSimplifyPass()
1271  : BasicBlockPass(ID) { }
1272 
1273 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
1274  const char *getPassName() const { return "Vector Select Optimization"; }
1275 #else // LLVM 4.0+
1276  llvm::StringRef getPassName() const { return "Vector Select Optimization"; }
1277 #endif
1278  bool runOnBasicBlock(llvm::BasicBlock &BB);
1279 
1280  static char ID;
1281 
1282 private:
1283  static bool simplifySelect(llvm::SelectInst *selectInst,
1284  llvm::BasicBlock::iterator iter);
1285  static llvm::Value *simplifyBoolVec(llvm::Value *value);
1286  static bool simplifyCall(llvm::CallInst *callInst,
1287  llvm::BasicBlock::iterator iter);
1288 };
1289 
1290 char InstructionSimplifyPass::ID = 0;
1291 
1292 
1293 llvm::Value *
1294 InstructionSimplifyPass::simplifyBoolVec(llvm::Value *value) {
1295  llvm::TruncInst *trunc = llvm::dyn_cast<llvm::TruncInst>(value);
1296  if (trunc != NULL) {
1297  // Convert trunc({sext,zext}(i1 vector)) -> (i1 vector)
1298  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(value);
1299  if (sext &&
1300  sext->getOperand(0)->getType() == LLVMTypes::Int1VectorType)
1301  return sext->getOperand(0);
1302 
1303  llvm::ZExtInst *zext = llvm::dyn_cast<llvm::ZExtInst>(value);
1304  if (zext &&
1305  zext->getOperand(0)->getType() == LLVMTypes::Int1VectorType)
1306  return zext->getOperand(0);
1307  }
1308  /*
1309  // This optimization has discernable benefit on the perf
1310  // suite on latest LLVM versions.
1311  // On 3.4+ (maybe even older), it can result in illegal
1312  // operations, so it's being disabled.
1313  llvm::ICmpInst *icmp = llvm::dyn_cast<llvm::ICmpInst>(value);
1314  if (icmp != NULL) {
1315  // icmp(ne, {sext,zext}(foo), zeroinitializer) -> foo
1316  if (icmp->getSignedPredicate() == llvm::CmpInst::ICMP_NE) {
1317  llvm::Value *op1 = icmp->getOperand(1);
1318  if (llvm::isa<llvm::ConstantAggregateZero>(op1)) {
1319  llvm::Value *op0 = icmp->getOperand(0);
1320  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(op0);
1321  if (sext)
1322  return sext->getOperand(0);
1323  llvm::ZExtInst *zext = llvm::dyn_cast<llvm::ZExtInst>(op0);
1324  if (zext)
1325  return zext->getOperand(0);
1326  }
1327  }
1328 
1329  }
1330  */
1331  return NULL;
1332 }
1333 
1334 
1335 bool
1336 InstructionSimplifyPass::simplifySelect(llvm::SelectInst *selectInst,
1337  llvm::BasicBlock::iterator iter) {
1338  if (selectInst->getType()->isVectorTy() == false)
1339  return false;
1340 
1341  llvm::Value *factor = selectInst->getOperand(0);
1342 
1343  // Simplify all-on or all-off mask values
1344  MaskStatus maskStatus = lGetMaskStatus(factor);
1345  llvm::Value *value = NULL;
1346  if (maskStatus == ALL_ON)
1347  // Mask all on -> replace with the first select value
1348  value = selectInst->getOperand(1);
1349  else if (maskStatus == ALL_OFF)
1350  // Mask all off -> replace with the second select value
1351  value = selectInst->getOperand(2);
1352  if (value != NULL) {
1353  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
1354  iter, value);
1355  return true;
1356  }
1357 
1358  // Sometimes earlier LLVM optimization passes generate unnecessarily
1359  // complex expressions for the selection vector, which in turn confuses
1360  // the code generators and leads to sub-optimal code (particularly for
1361  // 8 and 16-bit masks). We'll try to simplify them out here so that
1362  the code generator patterns match.
1363  if ((factor = simplifyBoolVec(factor)) != NULL) {
1364  llvm::Instruction *newSelect =
1365  llvm::SelectInst::Create(factor, selectInst->getOperand(1),
1366  selectInst->getOperand(2),
1367  selectInst->getName());
1368  llvm::ReplaceInstWithInst(selectInst, newSelect);
1369  return true;
1370  }
1371 
1372  return false;
1373 }
1374 
1375 
1376 bool
1377 InstructionSimplifyPass::simplifyCall(llvm::CallInst *callInst,
1378  llvm::BasicBlock::iterator iter) {
1379  llvm::Function *calledFunc = callInst->getCalledFunction();
1380 
1381  // Turn a __movmsk call with a compile-time constant vector into the
1382  // equivalent scalar value.
1383  if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk"))
1384  return false;
1385 
1386  uint64_t mask;
1387  if (lGetMask(callInst->getArgOperand(0), &mask) == true) {
1388  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
1389  iter, LLVMInt64(mask));
1390  return true;
1391  }
1392  return false;
1393 }
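// Worked example (illustrative): on a 4-wide target, a call
// __movmsk(< -1, 0, 0, -1 >) has a compile-time-constant operand, so the call
// is replaced with the scalar i64 constant 9 (0b1001).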
1394 
1395 
1396 bool
1397 InstructionSimplifyPass::runOnBasicBlock(llvm::BasicBlock &bb) {
1398  DEBUG_START_PASS("InstructionSimplify");
1399 
1400  bool modifiedAny = false;
1401 
1402  restart:
1403  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
1404  llvm::SelectInst *selectInst = llvm::dyn_cast<llvm::SelectInst>(&*iter);
1405  if (selectInst && simplifySelect(selectInst, iter)) {
1406  modifiedAny = true;
1407  goto restart;
1408  }
1409  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
1410  if (callInst && simplifyCall(callInst, iter)) {
1411  modifiedAny = true;
1412  goto restart;
1413  }
1414  }
1415 
1416  DEBUG_END_PASS("InstructionSimplify");
1417 
1418  return modifiedAny;
1419 }
1420 
1421 
1422 static llvm::Pass *
1423 CreateInstructionSimplifyPass() {
1424  return new InstructionSimplifyPass;
1425 }
1426 
1427 
1428 ///////////////////////////////////////////////////////////////////////////
1429 // ImproveMemoryOpsPass
1430 
1431 /** When the front-end emits gathers and scatters, it generates an array of
1432  vector-width pointers to represent the set of addresses to read from or
1433  write to. This optimization detects cases when the base pointer is a
1434  uniform pointer or when the indexing is into an array that can be
1435  converted into scatters/gathers from a single base pointer and an array
1436  of offsets.
1437 
1438  See for example the comments discussing the __pseudo_gather functions
1439  in builtins.cpp for more information about this.
1440  */
1441 class ImproveMemoryOpsPass : public llvm::BasicBlockPass {
1442 public:
1443  static char ID;
1444  ImproveMemoryOpsPass() : BasicBlockPass(ID) { }
1445 
1446 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
1447  const char *getPassName() const { return "Improve Memory Ops"; }
1448 #else // LLVM 4.0+
1449  llvm::StringRef getPassName() const { return "Improve Memory Ops"; }
1450 #endif
1451  bool runOnBasicBlock(llvm::BasicBlock &BB);
1452 };
1453 
1454 char ImproveMemoryOpsPass::ID = 0;
1455 
1456 
1457 
1458 /** Check to make sure that this value is actually a pointer in the end.
1459  We need to make sure that given an expression like vec(offset) +
1460  ptr2int(ptr), lGetBasePointer() doesn't return vec(offset) for the base
1461  pointer such that we then treat ptr2int(ptr) as an offset. This ends
1462  up being important so that we don't generate LLVM GEP instructions like
1463  "gep inttoptr 8, i64 %ptr", which in turn can lead to incorrect code
1464  since LLVM's pointer aliasing analysis assumes that operands after the
1465  first one to a GEP aren't pointers.
1466  */
1467 static llvm::Value *
1468 lCheckForActualPointer(llvm::Value *v) {
1469  if (v == NULL) {
1470  return NULL;
1471  }
1472  else if (llvm::isa<llvm::PointerType>(v->getType())) {
1473  return v;
1474  }
1475  else if (llvm::isa<llvm::PtrToIntInst>(v)) {
1476  return v;
1477  }
1478 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_7
1479  // This one is tricky, as it's heuristic tuned for LLVM 3.7+, which may
1480  // optimize loading double* with consequent ptr2int to straight load of i64.
1481  // This heuristic should be good enough to catch all the cases we should
1482  // detect and nothing else.
1483  else if (llvm::isa<llvm::LoadInst>(v)) {
1484  return v;
1485  }
1486 #endif
1487  else if (llvm::CastInst *ci = llvm::dyn_cast<llvm::CastInst>(v)) {
1488  llvm::Value *t = lCheckForActualPointer(ci->getOperand(0));
1489  if (t == NULL) {
1490  return NULL;
1491  }
1492  else {
1493  return v;
1494  }
1495  }
1496  else {
1497  llvm::ConstantExpr *uce =
1498  llvm::dyn_cast<llvm::ConstantExpr>(v);
1499  if (uce != NULL &&
1500  uce->getOpcode() == llvm::Instruction::PtrToInt)
1501  return v;
1502  return NULL;
1503  }
1504 }
1505 
1506 
1507 /** Given a llvm::Value representing a varying pointer, this function
1508  checks to see if all of the elements of the vector have the same value
1509  (i.e. there's a common base pointer). If so, it returns the common
1510  pointer value; otherwise it returns NULL.
1511  */
1512 static llvm::Value *
1513 lGetBasePointer(llvm::Value *v, llvm::Instruction *insertBefore) {
1514  if (llvm::isa<llvm::InsertElementInst>(v) ||
1515  llvm::isa<llvm::ShuffleVectorInst>(v)) {
1516  llvm::Value *element = LLVMFlattenInsertChain
1517  (v, g->target->getVectorWidth(), true, false);
1518  // TODO: it's probably ok to allow undefined elements and return
1519  // the base pointer if all of the other elements have the same
1520  // value.
1521  if (element != NULL) {
1522  //all elements are the same and not NULLs
1523  return lCheckForActualPointer(element);
1524  }
1525  else {
1526  return NULL;
1527  }
1528  }
1529 
1530  // This case comes up with global/static arrays
1531  if (llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(v)) {
1532  return lCheckForActualPointer(cv->getSplatValue());
1533  }
1534  else if (llvm::ConstantDataVector *cdv = llvm::dyn_cast<llvm::ConstantDataVector>(v)) {
1535  return lCheckForActualPointer(cdv->getSplatValue());
1536  }
1537  // It is a little tricky to handle operations on pointers that have been cast to integers of a different bit size,
1538  // but it is sometimes useful, so we handle this case here.
1539  else if (llvm::CastInst *ci = llvm::dyn_cast<llvm::CastInst>(v)) {
1540  llvm::Value *t = lGetBasePointer(ci->getOperand(0), insertBefore);
1541  if (t == NULL) {
1542  return NULL;
1543  }
1544  else {
1545  return llvm::CastInst::Create(ci->getOpcode(),
1546  t, ci->getType()->getScalarType(),
1547  LLVMGetName(t, "_cast"),
1548  insertBefore);
1549  }
1550  }
1551 
1552  return NULL;
1553 }
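// Illustrative sketch (not from the original source; the exact IR shape
// depends on the target width and earlier passes): for ispc code like
// "uniform float * uniform p; ... = p[programIndex];", earlier lowering
// typically broadcasts the scalar pointer, roughly:
//     %b      = insertelement <W x i64> undef, i64 %p_as_int, i32 0
//     %ptrvec = shufflevector <W x i64> %b, <W x i64> undef, zeroinitializer
// In that case lGetBasePointer() is expected to see through the
// insert/shuffle chain and hand back the single scalar pointer value.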
1554 
1555 
1556 /** Given the two operands to a constant add expression, see if we have the
1557  form "base pointer + offset", whee op0 is the base pointer and op1 is
1558  the offset; if so return the base and the offset. */
1559 static llvm::Constant *
1560 lGetConstantAddExprBaseOffset(llvm::Constant *op0, llvm::Constant *op1,
1561  llvm::Constant **delta) {
1562  llvm::ConstantExpr *op = llvm::dyn_cast<llvm::ConstantExpr>(op0);
1563  if (op == NULL || op->getOpcode() != llvm::Instruction::PtrToInt)
1564  // the first operand isn't a pointer
1565  return NULL;
1566 
1567  llvm::ConstantInt *opDelta = llvm::dyn_cast<llvm::ConstantInt>(op1);
1568  if (opDelta == NULL)
1569  // the second operand isn't an integer operand
1570  return NULL;
1571 
1572  *delta = opDelta;
1573  return op0;
1574 }
1575 
1576 
1577 static llvm::Value *
1578 lExtractFromInserts(llvm::Value *v, unsigned int index) {
1579  llvm::InsertValueInst *iv = llvm::dyn_cast<llvm::InsertValueInst>(v);
1580  if (iv == NULL)
1581  return NULL;
1582 
1583  Assert(iv->hasIndices() && iv->getNumIndices() == 1);
1584  if (iv->getIndices()[0] == index)
1585  return iv->getInsertedValueOperand();
1586  else
1587  return lExtractFromInserts(iv->getAggregateOperand(), index);
1588 }
1589 
1590 
1591 /** Given a varying pointer in ptrs, this function checks to see if it can
1592  be determined to be indexing from a common uniform base pointer. If
1593  so, the function returns the base pointer llvm::Value and initializes
1594  *offsets with an int vector of the per-lane offsets.
1595  */
1596 static llvm::Value *
1597 lGetBasePtrAndOffsets(llvm::Value *ptrs, llvm::Value **offsets,
1598  llvm::Instruction *insertBefore) {
1599  if (g->debugPrint) {
1600  fprintf(stderr, "lGetBasePtrAndOffsets\n");
1601  LLVMDumpValue(ptrs);
1602  }
1603 
1604  llvm::Value *base = lGetBasePointer(ptrs, insertBefore);
1605  if (base != NULL) {
1606  // We have a straight up varying pointer with no indexing that's
1607  // actually all the same value.
1608  if (g->target->is32Bit())
1609  *offsets = LLVMInt32Vector(0);
1610  else
1611  *offsets = LLVMInt64Vector((int64_t)0);
1612  return base;
1613  }
1614 
1615  llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(ptrs);
1616  if (bop != NULL && bop->getOpcode() == llvm::Instruction::Add) {
1617  // If we have a common pointer plus something, then we're also
1618  // good.
1619  if ((base = lGetBasePtrAndOffsets(bop->getOperand(0),
1620  offsets, insertBefore)) != NULL) {
1621  *offsets =
1622  llvm::BinaryOperator::Create(llvm::Instruction::Add, *offsets,
1623  bop->getOperand(1), "new_offsets",
1624  insertBefore);
1625  return base;
1626  }
1627  else if ((base = lGetBasePtrAndOffsets(bop->getOperand(1),
1628  offsets, insertBefore)) != NULL) {
1629  *offsets =
1630  llvm::BinaryOperator::Create(llvm::Instruction::Add, *offsets,
1631  bop->getOperand(0), "new_offsets",
1632  insertBefore);
1633  return base;
1634  }
1635  }
1636 
1637  llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(ptrs);
1638  if (cv != NULL) {
1639  // Indexing into global arrays can lead to this form, with
1640  // ConstantVectors.
1641  llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
1642  for (int i = 0; i < (int)cv->getNumOperands(); ++i) {
1643  llvm::Constant *c =
1644  llvm::dyn_cast<llvm::Constant>(cv->getOperand(i));
1645  if (c == NULL)
1646  return NULL;
1647  elements.push_back(c);
1648  }
1649 
1650  llvm::Constant *delta[ISPC_MAX_NVEC];
1651  for (unsigned int i = 0; i < elements.size(); ++i) {
1652  // For each element, try to decompose it into either a straight
1653  // up base pointer, or a base pointer plus an integer value.
1654  llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(elements[i]);
1655  if (ce == NULL)
1656  return NULL;
1657 
1658  delta[i] = NULL;
1659  llvm::Value *elementBase = NULL; // base pointer for this element
1660  if (ce->getOpcode() == llvm::Instruction::PtrToInt) {
1661  // If the element is just a ptr to int instruction, treat
1662  // it as having an offset of zero
1663  elementBase = ce;
1664  delta[i] = g->target->is32Bit() ? LLVMInt32(0) : LLVMInt64(0);
1665  }
1666  else if (ce->getOpcode() == llvm::Instruction::Add) {
1667  // Try both orderings of the operands to see if we can get
1668  // a pointer+offset out of them.
1669  elementBase =
1670  lGetConstantAddExprBaseOffset(ce->getOperand(0),
1671  ce->getOperand(1),
1672  &delta[i]);
1673  if (elementBase == NULL)
1674  elementBase =
1675  lGetConstantAddExprBaseOffset(ce->getOperand(1),
1676  ce->getOperand(0),
1677  &delta[i]);
1678  }
1679 
1680  // We weren't able to find a base pointer in the above. (We
1681  // don't expect this to happen; if it does, it may be necessary
1682  // to handle more cases in the decomposition above.)
1683  if (elementBase == NULL)
1684  return NULL;
1685 
1686  Assert(delta[i] != NULL);
1687  if (base == NULL)
1688  // The first time we've found a base pointer
1689  base = elementBase;
1690  else if (base != elementBase)
1691  // Different program instances have different base
1692  // pointers, so no luck.
1693  return NULL;
1694  }
1695 
1696  Assert(base != NULL);
1697  llvm::ArrayRef<llvm::Constant *> deltas(&delta[0],
1698  &delta[elements.size()]);
1699  *offsets = llvm::ConstantVector::get(deltas);
1700  return base;
1701  }
1702 
1703  llvm::ExtractValueInst *ev = llvm::dyn_cast<llvm::ExtractValueInst>(ptrs);
1704  if (ev != NULL) {
1705  Assert(ev->getNumIndices() == 1);
1706  int index = ev->getIndices()[0];
1707  ptrs = lExtractFromInserts(ev->getAggregateOperand(), index);
1708  if (ptrs != NULL)
1709  return lGetBasePtrAndOffsets(ptrs, offsets, insertBefore);
1710  }
1711 
1712  return NULL;
1713 }
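// Illustrative sketch (an assumption about typical input, not taken from the
// original comments): a varying pointer that indexes off a uniform base,
// e.g. "p + programIndex", usually reaches this routine as
//     %ptrs = add <W x i64> %base_splat, %per_lane_offsets
// lGetBasePtrAndOffsets() then returns the scalar base recovered from
// %base_splat and stores %per_lane_offsets into *offsets, which is what the
// gather/scatter rewrites further below consume.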
1714 
1715 
1716 /** Given a vector expression in vec, separate it into a compile-time
1717  constant component and a variable component, returning the two parts in
1718  *constOffset and *variableOffset. (It should be the case that the sum
1719  of these two is exactly equal to the original vector.)
1720 
1721  This routine only handles some (important) patterns; in some cases it
1722  will fail and return components that are actually compile-time
1723  constants in *variableOffset.
1724 
1725  Finally, if there aren't any constant (or, respectively, variable)
1726  components, the corresponding return value may be set to NULL.
1727  */
1728 static void
1729 lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset,
1730  llvm::Value **variableOffset,
1731  llvm::Instruction *insertBefore) {
1732  if (llvm::isa<llvm::ConstantVector>(vec) ||
1733  llvm::isa<llvm::ConstantDataVector>(vec) ||
1734  llvm::isa<llvm::ConstantAggregateZero>(vec)) {
1735  *constOffset = vec;
1736  *variableOffset = NULL;
1737  return;
1738  }
1739 
1740  llvm::CastInst *cast = llvm::dyn_cast<llvm::CastInst>(vec);
1741  if (cast != NULL) {
1742  // Check the cast target.
1743  llvm::Value *co, *vo;
1744  lExtractConstantOffset(cast->getOperand(0), &co, &vo, insertBefore);
1745 
1746  // make new cast instructions for the two parts
1747  if (co == NULL)
1748  *constOffset = NULL;
1749  else
1750  *constOffset = llvm::CastInst::Create(cast->getOpcode(),
1751  co, cast->getType(),
1752  LLVMGetName(co, "_cast"),
1753  insertBefore);
1754  if (vo == NULL)
1755  *variableOffset = NULL;
1756  else
1757  *variableOffset = llvm::CastInst::Create(cast->getOpcode(),
1758  vo, cast->getType(),
1759  LLVMGetName(vo, "_cast"),
1760  insertBefore);
1761  return;
1762  }
1763 
1764  llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(vec);
1765  if (bop != NULL) {
1766  llvm::Value *op0 = bop->getOperand(0);
1767  llvm::Value *op1 = bop->getOperand(1);
1768  llvm::Value *c0, *v0, *c1, *v1;
1769 
1770  if (bop->getOpcode() == llvm::Instruction::Add) {
1771  lExtractConstantOffset(op0, &c0, &v0, insertBefore);
1772  lExtractConstantOffset(op1, &c1, &v1, insertBefore);
1773 
1774  if (c0 == NULL || llvm::isa<llvm::ConstantAggregateZero>(c0))
1775  *constOffset = c1;
1776  else if (c1 == NULL || llvm::isa<llvm::ConstantAggregateZero>(c1))
1777  *constOffset = c0;
1778  else
1779  *constOffset =
1780  llvm::BinaryOperator::Create(llvm::Instruction::Add, c0, c1,
1781  LLVMGetName("add", c0, c1),
1782  insertBefore);
1783 
1784  if (v0 == NULL || llvm::isa<llvm::ConstantAggregateZero>(v0))
1785  *variableOffset = v1;
1786  else if (v1 == NULL || llvm::isa<llvm::ConstantAggregateZero>(v1))
1787  *variableOffset = v0;
1788  else
1789  *variableOffset =
1790  llvm::BinaryOperator::Create(llvm::Instruction::Add, v0, v1,
1791  LLVMGetName("add", v0, v1),
1792  insertBefore);
1793  return;
1794  }
1795  else if (bop->getOpcode() == llvm::Instruction::Shl) {
1796  lExtractConstantOffset(op0, &c0, &v0, insertBefore);
1797  lExtractConstantOffset(op1, &c1, &v1, insertBefore);
1798 
1799  // Given the product of constant and variable terms, we have:
1800  // (c0 + v0) * (2^(c1 + v1)) = c0 * 2^c1 * 2^v1 + v0 * 2^c1 * 2^v1
1801  // We can optimize only if v1 == NULL.
1802  if ((v1 != NULL) || (c0 == NULL) || (c1 == NULL)) {
1803  *constOffset = NULL;
1804  *variableOffset = vec;
1805  }
1806  else if (v0 == NULL) {
1807  *constOffset = vec;
1808  *variableOffset = NULL;
1809  }
1810  else {
1811  *constOffset =
1812  llvm::BinaryOperator::Create(llvm::Instruction::Shl, c0, c1,
1813  LLVMGetName("shl", c0, c1),
1814  insertBefore);
1815  *variableOffset =
1816  llvm::BinaryOperator::Create(llvm::Instruction::Shl, v0, c1,
1817  LLVMGetName("shl", v0, c1),
1818  insertBefore);
1819  }
1820  return;
1821  }
1822  else if (bop->getOpcode() == llvm::Instruction::Mul) {
1823  lExtractConstantOffset(op0, &c0, &v0, insertBefore);
1824  lExtractConstantOffset(op1, &c1, &v1, insertBefore);
1825 
1826  // Given the product of constant and variable terms, we have:
1827  // (c0 + v0) * (c1 + v1) == (c0 c1) + (v0 c1 + c0 v1 + v0 v1)
1828  // Note that the first term is a constant and the last three are
1829  // variable.
1830  if (c0 != NULL && c1 != NULL)
1831  *constOffset =
1832  llvm::BinaryOperator::Create(llvm::Instruction::Mul, c0, c1,
1833  LLVMGetName("mul", c0, c1),
1834  insertBefore);
1835  else
1836  *constOffset = NULL;
1837 
1838  llvm::Value *va = NULL, *vb = NULL, *vc = NULL;
1839  if (v0 != NULL && c1 != NULL)
1840  va = llvm::BinaryOperator::Create(llvm::Instruction::Mul, v0, c1,
1841  LLVMGetName("mul", v0, c1), insertBefore);
1842  if (c0 != NULL && v1 != NULL)
1843  vb = llvm::BinaryOperator::Create(llvm::Instruction::Mul, c0, v1,
1844  LLVMGetName("mul", c0, v1), insertBefore);
1845  if (v0 != NULL && v1 != NULL)
1846  vc = llvm::BinaryOperator::Create(llvm::Instruction::Mul, v0, v1,
1847  LLVMGetName("mul", v0, v1), insertBefore);
1848 
1849 
1850  llvm::Value *vab = NULL;
1851  if (va != NULL && vb != NULL)
1852  vab = llvm::BinaryOperator::Create(llvm::Instruction::Add, va, vb,
1853  LLVMGetName("add", va, vb), insertBefore);
1854  else if (va != NULL)
1855  vab = va;
1856  else
1857  vab = vb;
1858 
1859  if (vab != NULL && vc != NULL)
1860  *variableOffset =
1861  llvm::BinaryOperator::Create(llvm::Instruction::Add, vab, vc,
1862  LLVMGetName("add", vab, vc), insertBefore);
1863  else if (vab != NULL)
1864  *variableOffset = vab;
1865  else
1866  *variableOffset = vc;
1867 
1868  return;
1869  }
1870  }
1871 
1872  // Nothing matched, just return what we have as a variable component
1873  *constOffset = NULL;
1874  *variableOffset = vec;
1875 }
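// Worked example (illustrative only): for an offset expression such as
//     offsets = <4, 4, 4, 4> + (index << <2, 2, 2, 2>)
// the decomposition above should yield
//     *constOffset    = <4, 4, 4, 4>
//     *variableOffset = index << <2, 2, 2, 2>
// so that the constant part can later be folded into the addressing of the
// gather/scatter call. The concrete shapes depend on what earlier passes
// produced.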
1876 
1877 
1878 /* Returns true if the given value is a constant vector of integers with
1879  the same value in all of the elements. (Returns the splatted value in
1880  *splat, if so). */
1881 static bool
1882 lIsIntegerSplat(llvm::Value *v, int *splat) {
1883  llvm::ConstantDataVector *cvec =
1884  llvm::dyn_cast<llvm::ConstantDataVector>(v);
1885  if (cvec == NULL)
1886  return false;
1887 
1888  llvm::Constant *splatConst = cvec->getSplatValue();
1889  if (splatConst == NULL)
1890  return false;
1891 
1892  llvm::ConstantInt *ci =
1893  llvm::dyn_cast<llvm::ConstantInt>(splatConst);
1894  if (ci == NULL)
1895  return false;
1896 
1897  int64_t splatVal = ci->getSExtValue();
1898  *splat = (int)splatVal;
1899  return true;
1900 }
1901 
1902 
1903 static llvm::Value *
1904 lExtract248Scale(llvm::Value *splatOperand, int splatValue,
1905  llvm::Value *otherOperand, llvm::Value **result) {
1906  if (splatValue == 2 || splatValue == 4 || splatValue == 8) {
1907  *result = otherOperand;
1908  return LLVMInt32(splatValue);
1909  }
1910  // Even if we don't have a common scale by exactly 2, 4, or 8, we'll
1911  // see if we can pull out that much of the scale anyway; this may in
1912  // turn allow other optimizations later.
1913  for (int scale = 8; scale >= 2; scale /= 2) {
1914  llvm::Instruction *insertBefore =
1915  llvm::dyn_cast<llvm::Instruction>(*result);
1916  Assert(insertBefore != NULL);
1917 
1918  if ((splatValue % scale) == 0) {
1919  // *result = otherOperand * splatOperand / scale;
1920  llvm::Value *splatScaleVec =
1921  (splatOperand->getType() == LLVMTypes::Int32VectorType) ?
1922  LLVMInt32Vector(scale) : LLVMInt64Vector(scale);
1923  llvm::Value *splatDiv =
1924  llvm::BinaryOperator::Create(llvm::Instruction::SDiv,
1925  splatOperand, splatScaleVec,
1926  "div", insertBefore);
1927  *result =
1928  llvm::BinaryOperator::Create(llvm::Instruction::Mul,
1929  splatDiv, otherOperand,
1930  "mul", insertBefore);
1931  return LLVMInt32(scale);
1932  }
1933  }
1934  return LLVMInt32(1);
1935 }
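// Illustrative example (not from the original source): with a splat scale of
// 12 and otherOperand == %idx, the loop above peels off the largest 2/4/8
// factor: it returns 4 and rewrites *result to (12 / 4) * %idx, leaving the
// remaining x3 in the offset computation while the x4 can later be folded
// into an x86 addressing mode.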
1936 
1937 
1938 /** Given a vector of integer offsets to a base pointer being used for a
1939  gather or a scatter, see if its root operation is a multiply of a
1940  vector of some value by a vector of all 2s/4s/8s. If not, an i32 value
1941  of 1 is returned.
1942 
1943  If it is, return an i32 value of 2, 4, or 8 from the function and modify
1943  *vec so that it points to the operand that is being multiplied by
1944  2/4/8.
1945 
1946  We go through all this trouble so that we can pass the i32 scale factor
1947  to the {gather,scatter}_base_offsets function as a separate scale
1948  factor for the offsets. This in turn is used in a way so that the LLVM
1949  x86 code generator matches it to apply x86's free scale by 2x, 4x, or
1950  8x to one of two registers being added together for an addressing
1951  calculation.
1952  */
1953 static llvm::Value *
1954 lExtractOffsetVector248Scale(llvm::Value **vec) {
1955  llvm::CastInst *cast = llvm::dyn_cast<llvm::CastInst>(*vec);
1956  if (cast != NULL) {
1957  llvm::Value *castOp = cast->getOperand(0);
1958  // Check the cast target.
1959  llvm::Value *scale = lExtractOffsetVector248Scale(&castOp);
1960  if (scale == NULL)
1961  return NULL;
1962 
1963  // make a new cast instruction so that we end up with the right
1964  // type
1965  *vec = llvm::CastInst::Create(cast->getOpcode(), castOp, cast->getType(), "offset_cast", cast);
1966  return scale;
1967  }
1968 
1969 
1970  // If we don't have a binary operator, then just give up
1971  llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(*vec);
1972  if (bop == NULL)
1973  return LLVMInt32(1);
1974 
1975  llvm::Value *op0 = bop->getOperand(0), *op1 = bop->getOperand(1);
1976  if (bop->getOpcode() == llvm::Instruction::Add) {
1977  if (llvm::isa<llvm::ConstantAggregateZero>(op0)) {
1978  *vec = op1;
1979  return lExtractOffsetVector248Scale(vec);
1980  }
1981  else if (llvm::isa<llvm::ConstantAggregateZero>(op1)) {
1982  *vec = op0;
1983  return lExtractOffsetVector248Scale(vec);
1984  }
1985  else {
1986  llvm::Value *s0 = lExtractOffsetVector248Scale(&op0);
1987  llvm::Value *s1 = lExtractOffsetVector248Scale(&op1);
1988  if (s0 == s1) {
1989  *vec = llvm::BinaryOperator::Create(llvm::Instruction::Add,
1990  op0, op1, "new_add", bop);
1991  return s0;
1992  }
1993  else
1994  return LLVMInt32(1);
1995  }
1996  }
1997  else if (bop->getOpcode() == llvm::Instruction::Mul) {
1998  // Check each operand for being one of the scale factors we care about.
1999  int splat;
2000  if (lIsIntegerSplat(op0, &splat))
2001  return lExtract248Scale(op0, splat, op1, vec);
2002  else if (lIsIntegerSplat(op1, &splat))
2003  return lExtract248Scale(op1, splat, op0, vec);
2004  else
2005  return LLVMInt32(1);
2006  }
2007  else
2008  return LLVMInt32(1);
2009 }
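// Illustrative example (an assumption about typical input): if the offsets
// reaching a gather are "%idx * <8, 8, ..., 8>", the routine above returns
// the i32 constant 8 and rewrites *vec to just %idx; the caller then passes
// the 8 as the separate scale operand of the *_base_offsets call so the x86
// backend can use its free scaled-index addressing mode.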
2010 
2011 #if 0
2012 static llvm::Value *
2013 lExtractUniforms(llvm::Value **vec, llvm::Instruction *insertBefore) {
2014  fprintf(stderr, " lextract: ");
2015  (*vec)->dump();
2016  fprintf(stderr, "\n");
2017 
2018  if (llvm::isa<llvm::ConstantVector>(*vec) ||
2019  llvm::isa<llvm::ConstantDataVector>(*vec) ||
2020  llvm::isa<llvm::ConstantAggregateZero>(*vec))
2021  return NULL;
2022 
2023  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(*vec);
2024  if (sext != NULL) {
2025  llvm::Value *sextOp = sext->getOperand(0);
2026  // Check the sext target.
2027  llvm::Value *unif = lExtractUniforms(&sextOp, insertBefore);
2028  if (unif == NULL)
2029  return NULL;
2030 
2031  // make a new sext instruction so that we end up with the right
2032  // type
2033  *vec = new llvm::SExtInst(sextOp, sext->getType(), "offset_sext", sext);
2034  return unif;
2035  }
2036 
2037  if (LLVMVectorValuesAllEqual(*vec)) {
2038  // FIXME: we may want to redo all of the expression here, in scalar
2039  // form (if at all possible), for code quality...
2040  llvm::Value *unif =
2041  llvm::ExtractElementInst::Create(*vec, LLVMInt32(0),
2042  "first_uniform", insertBefore);
2043  *vec = NULL;
2044  return unif;
2045  }
2046 
2047  llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(*vec);
2048  if (bop == NULL)
2049  return NULL;
2050 
2051  llvm::Value *op0 = bop->getOperand(0), *op1 = bop->getOperand(1);
2052  if (bop->getOpcode() == llvm::Instruction::Add) {
2053  llvm::Value *s0 = lExtractUniforms(&op0, insertBefore);
2054  llvm::Value *s1 = lExtractUniforms(&op1, insertBefore);
2055  if (s0 == NULL && s1 == NULL)
2056  return NULL;
2057 
2058  if (op0 == NULL)
2059  *vec = op1;
2060  else if (op1 == NULL)
2061  *vec = op0;
2062  else
2063  *vec = llvm::BinaryOperator::Create(llvm::Instruction::Add,
2064  op0, op1, "new_add", insertBefore);
2065 
2066  if (s0 == NULL)
2067  return s1;
2068  else if (s1 == NULL)
2069  return s0;
2070  else
2071  return llvm::BinaryOperator::Create(llvm::Instruction::Add, s0, s1,
2072  "add_unif", insertBefore);
2073  }
2074 #if 0
2075  else if (bop->getOpcode() == llvm::Instruction::Mul) {
2076  // Check each operand for being one of the scale factors we care about.
2077  int splat;
2078  if (lIs248Splat(op0, &splat)) {
2079  *vec = op1;
2080  return LLVMInt32(splat);
2081  }
2082  else if (lIs248Splat(op1, &splat)) {
2083  *vec = op0;
2084  return LLVMInt32(splat);
2085  }
2086  else
2087  return LLVMInt32(1);
2088  }
2089 #endif
2090  else
2091  return NULL;
2092 }
2093 
2094 
2095 static void
2096 lExtractUniformsFromOffset(llvm::Value **basePtr, llvm::Value **offsetVector,
2097  llvm::Value *offsetScale,
2098  llvm::Instruction *insertBefore) {
2099 #if 1
2100  (*basePtr)->dump();
2101  printf("\n");
2102  (*offsetVector)->dump();
2103  printf("\n");
2104  offsetScale->dump();
2105  printf("-----\n");
2106 #endif
2107 
2108  llvm::Value *uniformDelta = lExtractUniforms(offsetVector, insertBefore);
2109  if (uniformDelta == NULL)
2110  return;
2111 
2112  *basePtr = lGEPInst(*basePtr, arrayRef, "new_base", insertBefore);
2113 
2114  // this should only happen if we have only uniforms, but that in turn
2115  // shouldn't be a gather/scatter!
2116  Assert(*offsetVector != NULL);
2117 }
2118 #endif
2119 
2120 
2121 static bool
2122 lVectorIs32BitInts(llvm::Value *v) {
2123  int nElts;
2124  int64_t elts[ISPC_MAX_NVEC];
2125  if (!LLVMExtractVectorInts(v, elts, &nElts))
2126  return false;
2127 
2128  for (int i = 0; i < nElts; ++i)
2129  if ((int32_t)elts[i] != elts[i])
2130  return false;
2131 
2132  return true;
2133 }
2134 
2135 
2136 /** Check to see if the two offset vectors can safely be represented with
2137  32-bit values. If so, return true and update the pointed-to
2138  llvm::Value *s to be the 32-bit equivalents. */
2139 static bool
2140 lOffsets32BitSafe(llvm::Value **variableOffsetPtr,
2141  llvm::Value **constOffsetPtr,
2142  llvm::Instruction *insertBefore) {
2143  llvm::Value *variableOffset = *variableOffsetPtr;
2144  llvm::Value *constOffset = *constOffsetPtr;
2145 
2146  if (variableOffset->getType() != LLVMTypes::Int32VectorType) {
2147  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(variableOffset);
2148  if (sext != NULL &&
2149  sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType)
2150  // sext of a 32-bit vector -> the 32-bit vector is good
2151  variableOffset = sext->getOperand(0);
2152  else if (lVectorIs32BitInts(variableOffset))
2153  // The only constant vector we should have here is a vector of
2154  // all zeros (i.e. a ConstantAggregateZero), but just in case,
2155  // do the more general check with lVectorIs32BitInts().
2156  variableOffset =
2157  new llvm::TruncInst(variableOffset, LLVMTypes::Int32VectorType,
2158  LLVMGetName(variableOffset, "_trunc"),
2159  insertBefore);
2160  else
2161  return false;
2162  }
2163 
2164  if (constOffset->getType() != LLVMTypes::Int32VectorType) {
2165  if (lVectorIs32BitInts(constOffset)) {
2166  // Truncate them so we have a 32-bit vector type for them.
2167  constOffset =
2168  new llvm::TruncInst(constOffset, LLVMTypes::Int32VectorType,
2169  LLVMGetName(constOffset, "_trunc"), insertBefore);
2170  }
2171  else {
2172  // FIXME: otherwise we just assume that all constant offsets
2173  // can actually always fit into 32-bits... (This could be
2174  // wrong, but it should be only in pretty esoteric cases). We
2175  // make this assumption for now since we sometimes generate
2176  // constants that need constant folding before we really have a
2177  // constant vector out of them, and
2178  // llvm::ConstantFoldInstruction() doesn't seem to be doing
2179  // enough for us in some cases if we call it from here.
2180  constOffset =
2181  new llvm::TruncInst(constOffset, LLVMTypes::Int32VectorType,
2182  LLVMGetName(constOffset, "_trunc"), insertBefore);
2183  }
2184  }
2185 
2186  *variableOffsetPtr = variableOffset;
2187  *constOffsetPtr = constOffset;
2188  return true;
2189 }
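// Illustrative note (an assumption about the common case): with factored
// base+offsets, the variable part is usually a "sext <W x i32> ... to <W x i64>"
// produced by 32-bit index arithmetic, so the sext is simply peeled off,
// while the constant part is truncated; see the FIXME above about constant
// offsets that might not actually fit in 32 bits.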
2190 
2191 /** Check to see if the offset value is composed of a string of Adds,
2192  SExts, and Constant Vectors that are 32-bit safe. Recursively
2193  explores the operands of Add instructions (as they might themselves
2194  be adds that eventually terminate in constant vectors or a SExt).
2195  */
2196 
2197 static bool
2198 lIs32BitSafeHelper(llvm::Value *v) {
2199  // handle Adds, SExts, Constant Vectors
2200  if (llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(v)) {
2201  if (bop->getOpcode() == llvm::Instruction::Add) {
2202  return lIs32BitSafeHelper(bop->getOperand(0))
2203  && lIs32BitSafeHelper(bop->getOperand(1));
2204  }
2205  return false;
2206  }
2207  else if (llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(v)) {
2208  return sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType;
2209  }
2210  else return lVectorIs32BitInts(v);
2211 }
2212 
2213 /** Check to see if the single offset vector can safely be represented with
2214  32-bit values. If so, return true and update the pointed-to
2215  llvm::Value * to be the 32-bit equivalent. */
2216 static bool
2217 lOffsets32BitSafe(llvm::Value **offsetPtr,
2218  llvm::Instruction *insertBefore) {
2219  llvm::Value *offset = *offsetPtr;
2220 
2221  if (offset->getType() == LLVMTypes::Int32VectorType)
2222  return true;
2223 
2224  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(offset);
2225  if (sext != NULL &&
2226  sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType) {
2227  // sext of a 32-bit vector -> the 32-bit vector is good
2228  *offsetPtr = sext->getOperand(0);
2229  return true;
2230  }
2231  else if (lIs32BitSafeHelper(offset)) {
2232  // The only constant vector we should have here is a vector of
2233  // all zeros (i.e. a ConstantAggregateZero), but just in case,
2234  // do the more general check with lVectorIs32BitInts().
2235 
2236  // Alternatively, offset could be a sequence of adds terminating
2237  // in safe constant vectors or a SExt.
2238  *offsetPtr =
2239  new llvm::TruncInst(offset, LLVMTypes::Int32VectorType,
2240  LLVMGetName(offset, "_trunc"),
2241  insertBefore);
2242  return true;
2243  }
2244  else
2245  return false;
2246 }
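// Illustrative note (not from the original source): with 32-bit addressing
// forced on a 64-bit target, a 64-bit offset vector that is just
// "sext <W x i32> %off to <W x i64>" is replaced by %off itself, and a
// 64-bit offset built only from adds of such sexts and 32-bit-safe constant
// vectors is truncated back to <W x i32>; anything else keeps the 64-bit
// *_base_offsets64 pseudo call.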
2247 
2248 
2249 static bool
2250 lGSToGSBaseOffsets(llvm::CallInst *callInst) {
2251  struct GSInfo {
2252  GSInfo(const char *pgFuncName, const char *pgboFuncName,
2253  const char *pgbo32FuncName, bool ig, bool ip)
2254  : isGather(ig), isPrefetch(ip) {
2255  func = m->module->getFunction(pgFuncName);
2256  baseOffsetsFunc = m->module->getFunction(pgboFuncName);
2257  baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
2258  }
2259  llvm::Function *func;
2260  llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
2261  const bool isGather;
2262  const bool isPrefetch;
2263  };
2264 
2265  GSInfo gsFuncs[] = {
2266  GSInfo("__pseudo_gather32_i8",
2267  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
2268  "__pseudo_gather_factored_base_offsets32_i8",
2269  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
2270  "__pseudo_gather_factored_base_offsets32_i8",
2271  true, false),
2272  GSInfo("__pseudo_gather32_i16",
2273  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
2274  "__pseudo_gather_factored_base_offsets32_i16",
2275  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
2276  "__pseudo_gather_factored_base_offsets32_i16",
2277  true, false),
2278  GSInfo("__pseudo_gather32_i32",
2279  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
2280  "__pseudo_gather_factored_base_offsets32_i32",
2281  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
2282  "__pseudo_gather_factored_base_offsets32_i32",
2283  true, false),
2284  GSInfo("__pseudo_gather32_float",
2285  g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
2286  "__pseudo_gather_factored_base_offsets32_float",
2287  g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
2288  "__pseudo_gather_factored_base_offsets32_float",
2289  true, false),
2290  GSInfo("__pseudo_gather32_i64",
2291  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
2292  "__pseudo_gather_factored_base_offsets32_i64",
2293  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
2294  "__pseudo_gather_factored_base_offsets32_i64",
2295  true, false),
2296  GSInfo("__pseudo_gather32_double",
2297  g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
2298  "__pseudo_gather_factored_base_offsets32_double",
2299  g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
2300  "__pseudo_gather_factored_base_offsets32_double",
2301  true, false),
2302 
2303  GSInfo("__pseudo_scatter32_i8",
2304  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
2305  "__pseudo_scatter_factored_base_offsets32_i8",
2306  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
2307  "__pseudo_scatter_factored_base_offsets32_i8",
2308  false, false),
2309  GSInfo("__pseudo_scatter32_i16",
2310  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
2311  "__pseudo_scatter_factored_base_offsets32_i16",
2312  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
2313  "__pseudo_scatter_factored_base_offsets32_i16",
2314  false, false),
2315  GSInfo("__pseudo_scatter32_i32",
2316  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
2317  "__pseudo_scatter_factored_base_offsets32_i32",
2318  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
2319  "__pseudo_scatter_factored_base_offsets32_i32",
2320  false, false),
2321  GSInfo("__pseudo_scatter32_float",
2322  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
2323  "__pseudo_scatter_factored_base_offsets32_float",
2324  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
2325  "__pseudo_scatter_factored_base_offsets32_float",
2326  false, false),
2327  GSInfo("__pseudo_scatter32_i64",
2328  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
2329  "__pseudo_scatter_factored_base_offsets32_i64",
2330  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
2331  "__pseudo_scatter_factored_base_offsets32_i64",
2332  false, false),
2333  GSInfo("__pseudo_scatter32_double",
2334  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
2335  "__pseudo_scatter_factored_base_offsets32_double",
2336  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
2337  "__pseudo_scatter_factored_base_offsets32_double",
2338  false, false),
2339 
2340  GSInfo("__pseudo_gather64_i8",
2341  g->target->hasGather() ? "__pseudo_gather_base_offsets64_i8" :
2342  "__pseudo_gather_factored_base_offsets64_i8",
2343  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
2344  "__pseudo_gather_factored_base_offsets32_i8",
2345  true, false),
2346  GSInfo("__pseudo_gather64_i16",
2347  g->target->hasGather() ? "__pseudo_gather_base_offsets64_i16" :
2348  "__pseudo_gather_factored_base_offsets64_i16",
2349  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
2350  "__pseudo_gather_factored_base_offsets32_i16",
2351  true, false),
2352  GSInfo("__pseudo_gather64_i32",
2353  g->target->hasGather() ? "__pseudo_gather_base_offsets64_i32" :
2354  "__pseudo_gather_factored_base_offsets64_i32",
2355  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
2356  "__pseudo_gather_factored_base_offsets32_i32",
2357  true, false),
2358  GSInfo("__pseudo_gather64_float",
2359  g->target->hasGather() ? "__pseudo_gather_base_offsets64_float" :
2360  "__pseudo_gather_factored_base_offsets64_float",
2361  g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
2362  "__pseudo_gather_factored_base_offsets32_float",
2363  true, false),
2364  GSInfo("__pseudo_gather64_i64",
2365  g->target->hasGather() ? "__pseudo_gather_base_offsets64_i64" :
2366  "__pseudo_gather_factored_base_offsets64_i64",
2367  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
2368  "__pseudo_gather_factored_base_offsets32_i64",
2369  true, false),
2370  GSInfo("__pseudo_gather64_double",
2371  g->target->hasGather() ? "__pseudo_gather_base_offsets64_double" :
2372  "__pseudo_gather_factored_base_offsets64_double",
2373  g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
2374  "__pseudo_gather_factored_base_offsets32_double",
2375  true, false),
2376 
2377  GSInfo("__pseudo_scatter64_i8",
2378  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i8" :
2379  "__pseudo_scatter_factored_base_offsets64_i8",
2380  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
2381  "__pseudo_scatter_factored_base_offsets32_i8",
2382  false, false),
2383  GSInfo("__pseudo_scatter64_i16",
2384  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i16" :
2385  "__pseudo_scatter_factored_base_offsets64_i16",
2386  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
2387  "__pseudo_scatter_factored_base_offsets32_i16",
2388  false, false),
2389  GSInfo("__pseudo_scatter64_i32",
2390  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i32" :
2391  "__pseudo_scatter_factored_base_offsets64_i32",
2392  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
2393  "__pseudo_scatter_factored_base_offsets32_i32",
2394  false, false),
2395  GSInfo("__pseudo_scatter64_float",
2396  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_float" :
2397  "__pseudo_scatter_factored_base_offsets64_float",
2398  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
2399  "__pseudo_scatter_factored_base_offsets32_float",
2400  false, false),
2401  GSInfo("__pseudo_scatter64_i64",
2402  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i64" :
2403  "__pseudo_scatter_factored_base_offsets64_i64",
2404  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
2405  "__pseudo_scatter_factored_base_offsets32_i64",
2406  false, false),
2407  GSInfo("__pseudo_scatter64_double",
2408  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_double" :
2409  "__pseudo_scatter_factored_base_offsets64_double",
2410  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
2411  "__pseudo_scatter_factored_base_offsets32_double",
2412  false, false),
2413  GSInfo("__pseudo_prefetch_read_varying_1",
2414  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
2415  "__prefetch_read_varying_1",
2416  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
2417  "__prefetch_read_varying_1",
2418  false, true),
2419 
2420  GSInfo("__pseudo_prefetch_read_varying_2",
2421  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
2422  "__prefetch_read_varying_2",
2423  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
2424  "__prefetch_read_varying_2",
2425  false, true),
2426 
2427  GSInfo("__pseudo_prefetch_read_varying_3",
2428  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
2429  "__prefetch_read_varying_3",
2430  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
2431  "__prefetch_read_varying_3",
2432  false, true),
2433 
2434  GSInfo("__pseudo_prefetch_read_varying_nt",
2435  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
2436  "__prefetch_read_varying_nt",
2437  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
2438  "__prefetch_read_varying_nt",
2439  false, true),
2440  };
2441 
2442  int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
2443  for (int i = 0; i < numGSFuncs; ++i)
2444  Assert(gsFuncs[i].func != NULL && gsFuncs[i].baseOffsetsFunc != NULL &&
2445  gsFuncs[i].baseOffsets32Func != NULL);
2446 
2447  GSInfo *info = NULL;
2448  for (int i = 0; i < numGSFuncs; ++i)
2449  if (gsFuncs[i].func != NULL &&
2450  callInst->getCalledFunction() == gsFuncs[i].func) {
2451  info = &gsFuncs[i];
2452  break;
2453  }
2454  if (info == NULL)
2455  return false;
2456 
2457  // Try to transform the array of pointers to a single base pointer
2458  // and an array of int32 offsets. (All the hard work is done by
2459  // lGetBasePtrAndOffsets).
2460  llvm::Value *ptrs = callInst->getArgOperand(0);
2461  llvm::Value *offsetVector = NULL;
2462  llvm::Value *basePtr = lGetBasePtrAndOffsets(ptrs, &offsetVector,
2463  callInst);
2464 
2465  if (basePtr == NULL || offsetVector == NULL ||
2466  (info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch() == false))
2467  // It's actually a fully general gather/scatter with a varying
2468  // set of base pointers, so leave it as is and continue onward
2469  // to the next instruction...
2470  return false;
2471 
2472  // Cast the base pointer to a void *, since that's what the
2473  // __pseudo_*_base_offsets_* functions want.
2474  basePtr = new llvm::IntToPtrInst(basePtr, LLVMTypes::VoidPointerType,
2475  LLVMGetName(basePtr, "_2void"), callInst);
2476  lCopyMetadata(basePtr, callInst);
2477 
2478  llvm::Function *gatherScatterFunc = info->baseOffsetsFunc;
2479 
2480  if ((info->isGather == true && g->target->hasGather()) ||
2481  (info->isGather == false && info->isPrefetch == false && g->target->hasScatter()) ||
2482  (info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch())) {
2483 
2484  // See if the offsets are scaled by 2, 4, or 8. If so,
2485  // extract that scale factor and rewrite the offsets to remove
2486  // it.
2487  llvm::Value *offsetScale = lExtractOffsetVector248Scale(&offsetVector);
2488 
2489  // If we're doing 32-bit addressing on a 64-bit target, here we
2490  // will see if we can call one of the 32-bit variants of the pseudo
2491  // gather/scatter functions.
2492  if (g->opt.force32BitAddressing &&
2493  lOffsets32BitSafe(&offsetVector, callInst)) {
2494  gatherScatterFunc = info->baseOffsets32Func;
2495  }
2496 
2497  if (info->isGather || info->isPrefetch) {
2498  llvm::Value *mask = callInst->getArgOperand(1);
2499 
2500  // Generate a new function call to the next pseudo gather
2501  // base+offsets instruction. Note that we're passing a NULL
2502  // llvm::Instruction to llvm::CallInst::Create; this means that
2503  // the instruction isn't inserted into a basic block and that
2504  // way we can then call ReplaceInstWithInst().
2505  llvm::Instruction *newCall =
2506  lCallInst(gatherScatterFunc, basePtr, offsetScale, offsetVector,
2507  mask, callInst->getName().str().c_str(),
2508  NULL);
2509  lCopyMetadata(newCall, callInst);
2510  llvm::ReplaceInstWithInst(callInst, newCall);
2511  }
2512  else {
2513  llvm::Value *storeValue = callInst->getArgOperand(1);
2514  llvm::Value *mask = callInst->getArgOperand(2);
2515 
2516  // Generate a new function call to the next pseudo scatter
2517  // base+offsets instruction. See above for why passing NULL
2518  // for the Instruction * is intended.
2519  llvm::Instruction *newCall =
2520  lCallInst(gatherScatterFunc, basePtr, offsetScale,
2521  offsetVector, storeValue, mask, "", NULL);
2522  lCopyMetadata(newCall, callInst);
2523  llvm::ReplaceInstWithInst(callInst, newCall);
2524  }
2525  }
2526  else {
2527  // Try to decompose the offset vector into a compile time constant
2528  // component and a varying component. The constant component is
2529  // passed as a separate parameter to the gather/scatter functions,
2530  // which in turn allows their implementations to end up emitting
2531  // x86 instructions with constant offsets encoded in them.
2532  llvm::Value *constOffset, *variableOffset;
2533  lExtractConstantOffset(offsetVector, &constOffset, &variableOffset,
2534  callInst);
2535  if (constOffset == NULL)
2536  constOffset = LLVMIntAsType(0, offsetVector->getType());
2537  if (variableOffset == NULL)
2538  variableOffset = LLVMIntAsType(0, offsetVector->getType());
2539 
2540  // See if the varying component is scaled by 2, 4, or 8. If so,
2541  // extract that scale factor and rewrite variableOffset to remove
2542  // it. (This also is pulled out so that we can match the scales by
2543  // 2/4/8 offered by x86 addressing operators.)
2544  llvm::Value *offsetScale = lExtractOffsetVector248Scale(&variableOffset);
2545 
2546  // If we're doing 32-bit addressing on a 64-bit target, here we
2547  // will see if we can call one of the 32-bit variants of the pseudo
2548  // gather/scatter functions.
2549  if (g->opt.force32BitAddressing &&
2550  lOffsets32BitSafe(&variableOffset, &constOffset, callInst)) {
2551  gatherScatterFunc = info->baseOffsets32Func;
2552  }
2553 
2554  if (info->isGather || info->isPrefetch) {
2555  llvm::Value *mask = callInst->getArgOperand(1);
2556 
2557  // Generate a new function call to the next pseudo gather
2558  // base+offsets instruction. Note that we're passing a NULL
2559  // llvm::Instruction to llvm::CallInst::Create; this means that
2560  // the instruction isn't inserted into a basic block and that
2561  // way we can then call ReplaceInstWithInst().
2562  llvm::Instruction *newCall =
2563  lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale,
2564  constOffset, mask, callInst->getName().str().c_str(),
2565  NULL);
2566  lCopyMetadata(newCall, callInst);
2567  llvm::ReplaceInstWithInst(callInst, newCall);
2568  }
2569  else {
2570  llvm::Value *storeValue = callInst->getArgOperand(1);
2571  llvm::Value *mask = callInst->getArgOperand(2);
2572 
2573  // Generate a new function call to the next pseudo scatter
2574  // base+offsets instruction. See above for why passing NULL
2575  // for the Instruction * is intended.
2576  llvm::Instruction *newCall =
2577  lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale,
2578  constOffset, storeValue, mask, "", NULL);
2579  lCopyMetadata(newCall, callInst);
2580  llvm::ReplaceInstWithInst(callInst, newCall);
2581  }
2582  }
2583  return true;
2584 }
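// Illustrative sketch of the rewrite above (hedged; the exact pseudo-function
// signatures are defined in builtins.cpp, not here): a gather call of the form
//     __pseudo_gather64_i32(ptrs, mask)
// whose lane pointers decompose into a common base plus offsets is replaced,
// on targets with a native gather, by roughly
//     __pseudo_gather_base_offsets64_i32(base, scale, offsets, mask)
// and, on targets without one, by the "factored" variant that additionally
// carries the constant offset component split out by lExtractConstantOffset().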
2585 
2586 
2587 /** Try to improve the decomposition between compile-time constant and
2588  compile-time unknown offsets in calls to the __pseudo_*_base_offsets*
2589  functions. After other optimizations have run, we will sometimes be
2590  able to pull more terms out of the unknown part and add them into the
2591  compile-time-known part.
2592  */
2593 static bool
2594 lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) {
2595  struct GSBOInfo {
2596  GSBOInfo(const char *pgboFuncName, const char *pgbo32FuncName, bool ig, bool ip)
2597  : isGather(ig), isPrefetch(ip) {
2598  baseOffsetsFunc = m->module->getFunction(pgboFuncName);
2599  baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
2600  }
2601  llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
2602  const bool isGather;
2603  const bool isPrefetch;
2604  };
2605 
2606  GSBOInfo gsFuncs[] = {
2607  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
2608  "__pseudo_gather_factored_base_offsets32_i8",
2609  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
2610  "__pseudo_gather_factored_base_offsets32_i8",
2611  true, false),
2612  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
2613  "__pseudo_gather_factored_base_offsets32_i16",
2614  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
2615  "__pseudo_gather_factored_base_offsets32_i16",
2616  true, false),
2617  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
2618  "__pseudo_gather_factored_base_offsets32_i32",
2619  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
2620  "__pseudo_gather_factored_base_offsets32_i32",
2621  true, false),
2622  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
2623  "__pseudo_gather_factored_base_offsets32_float",
2624  g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
2625  "__pseudo_gather_factored_base_offsets32_float",
2626  true, false),
2627  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
2628  "__pseudo_gather_factored_base_offsets32_i64",
2629  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
2630  "__pseudo_gather_factored_base_offsets32_i64",
2631  true, false),
2632  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
2633  "__pseudo_gather_factored_base_offsets32_double",
2634  g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
2635  "__pseudo_gather_factored_base_offsets32_double",
2636  true, false),
2637 
2638  GSBOInfo( g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
2639  "__pseudo_scatter_factored_base_offsets32_i8",
2640  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
2641  "__pseudo_scatter_factored_base_offsets32_i8",
2642  false, false),
2643  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
2644  "__pseudo_scatter_factored_base_offsets32_i16",
2645  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
2646  "__pseudo_scatter_factored_base_offsets32_i16",
2647  false, false),
2648  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
2649  "__pseudo_scatter_factored_base_offsets32_i32",
2650  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
2651  "__pseudo_scatter_factored_base_offsets32_i32",
2652  false, false),
2653  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
2654  "__pseudo_scatter_factored_base_offsets32_float",
2655  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
2656  "__pseudo_scatter_factored_base_offsets32_float",
2657  false, false),
2658  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
2659  "__pseudo_scatter_factored_base_offsets32_i64",
2660  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
2661  "__pseudo_scatter_factored_base_offsets32_i64",
2662  false, false),
2663  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
2664  "__pseudo_scatter_factored_base_offsets32_double",
2665  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
2666  "__pseudo_scatter_factored_base_offsets32_double",
2667  false, false),
2668 
2669  GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
2670  "__prefetch_read_varying_1",
2671  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
2672  "__prefetch_read_varying_1",
2673  false, true),
2674 
2675  GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
2676  "__prefetch_read_varying_2",
2677  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
2678  "__prefetch_read_varying_2",
2679  false, true),
2680 
2681  GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
2682  "__prefetch_read_varying_3",
2683  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
2684  "__prefetch_read_varying_3",
2685  false, true),
2686 
2687  GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
2688  "__prefetch_read_varying_nt",
2689  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
2690  "__prefetch_read_varying_nt",
2691  false, true),
2692  };
2693 
2694  int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
2695  for (int i = 0; i < numGSFuncs; ++i)
2696  Assert(gsFuncs[i].baseOffsetsFunc != NULL &&
2697  gsFuncs[i].baseOffsets32Func != NULL);
2698 
2699  llvm::Function *calledFunc = callInst->getCalledFunction();
2700  Assert(calledFunc != NULL);
2701 
2702  // Is one of the gather/scatter functions that decompose into
2703  // base+offsets being called?
2704  GSBOInfo *info = NULL;
2705  for (int i = 0; i < numGSFuncs; ++i)
2706  if (calledFunc == gsFuncs[i].baseOffsetsFunc ||
2707  calledFunc == gsFuncs[i].baseOffsets32Func) {
2708  info = &gsFuncs[i];
2709  break;
2710  }
2711  if (info == NULL)
2712  return false;
2713 
2714  // Grab the old variable offset
2715  llvm::Value *origVariableOffset = callInst->getArgOperand(1);
2716 
2717  // If it's zero, we're done. Don't go and think that we're clever by
2718  // adding these zeros to the constant offsets.
2719  if (llvm::isa<llvm::ConstantAggregateZero>(origVariableOffset))
2720  return false;
2721 
2722  // Try to decompose the old variable offset
2723  llvm::Value *constOffset, *variableOffset;
2724  lExtractConstantOffset(origVariableOffset, &constOffset, &variableOffset,
2725  callInst);
2726 
2727  // No luck
2728  if (constOffset == NULL)
2729  return false;
2730 
2731  // Total luck: everything could be moved to the constant offset
2732  if (variableOffset == NULL)
2733  variableOffset = LLVMIntAsType(0, origVariableOffset->getType());
2734 
2735  // We need to scale the value we add to the constant offset by the
2736  // 2/4/8 scale for the variable offset, if present.
2737  llvm::ConstantInt *varScale =
2738  llvm::dyn_cast<llvm::ConstantInt>(callInst->getArgOperand(2));
2739  Assert(varScale != NULL);
2740 
2741  llvm::Value *scaleSmear;
2742  if (origVariableOffset->getType() == LLVMTypes::Int64VectorType)
2743  scaleSmear = LLVMInt64Vector((int64_t)varScale->getZExtValue());
2744  else
2745  scaleSmear = LLVMInt32Vector((int32_t)varScale->getZExtValue());
2746 
2747  constOffset = llvm::BinaryOperator::Create(llvm::Instruction::Mul, constOffset,
2748  scaleSmear, constOffset->getName(),
2749  callInst);
2750 
2751  // And add the additional offset to the original constant offset
2752  constOffset = llvm::BinaryOperator::Create(llvm::Instruction::Add, constOffset,
2753  callInst->getArgOperand(3),
2754  callInst->getArgOperand(3)->getName(),
2755  callInst);
2756 
2757  // Finally, update the values of the operands to the gather/scatter
2758  // function.
2759  callInst->setArgOperand(1, variableOffset);
2760  callInst->setArgOperand(3, constOffset);
2761 
2762  return true;
2763 }
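// Worked example (illustrative): if a base+offsets gather currently has
// variable offsets "(<1,1,...,1> + %idx)", scale 4, and constant offsets %c,
// the pass above multiplies the splat through the scale and rewrites the
// call to use variable offsets %idx and constant offsets "%c + <4,4,...,4>".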
2764 
2765 
2766 static llvm::Value *
2767 lComputeCommonPointer(llvm::Value *base, llvm::Value *offsets,
2768  llvm::Instruction *insertBefore) {
2769  llvm::Value *firstOffset = LLVMExtractFirstVectorElement(offsets);
2770  return lGEPInst(base, firstOffset, "ptr", insertBefore);
2771 }
2772 
2773 
2774 static llvm::Constant *
2775 lGetOffsetScaleVec(llvm::Value *offsetScale, llvm::Type *vecType) {
2776  llvm::ConstantInt *offsetScaleInt =
2777  llvm::dyn_cast<llvm::ConstantInt>(offsetScale);
2778  Assert(offsetScaleInt != NULL);
2779  uint64_t scaleValue = offsetScaleInt->getZExtValue();
2780 
2781  std::vector<llvm::Constant *> scales;
2782  for (int i = 0; i < g->target->getVectorWidth(); ++i) {
2783  if (vecType == LLVMTypes::Int64VectorType)
2784  scales.push_back(LLVMInt64(scaleValue));
2785  else {
2786  Assert(vecType == LLVMTypes::Int32VectorType);
2787  scales.push_back(LLVMInt32((int32_t)scaleValue));
2788  }
2789  }
2790  return llvm::ConstantVector::get(scales);
2791 }
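// Brief note (illustrative): lGetOffsetScaleVec() just splats the scalar
// scale operand (e.g. i32 4) across the vector width so that the full
// per-lane offsets can be reconstructed as scale * varyingOffsets (plus the
// constant offsets, in the factored case) in lGSToLoadStore() below.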
2792 
2793 
2794 /** After earlier optimization passes have run, we are sometimes able to
2795  determine that gathers/scatters are actually accessing memory in a more
2796  regular fashion and then change the operation to something simpler and
2797  more efficient. For example, if all of the lanes in a gather are
2798  reading from the same location, we can instead do a scalar load and
2799  broadcast. This pass examines gathers and scatters and tries to
2800  simplify them if at all possible.
2801 
2802  @todo Currently, this only looks for all program instances going to the
2803  same location and all going to a linear sequence of locations in
2804  memory. There are a number of other cases that might make sense to
2805  look for, including things that could be handled with a vector load +
2806  shuffle or things that could be handled with hybrids of e.g. 2 4-wide
2807  vector loads with AVX, etc.
2808 */
2809 static bool
2810 lGSToLoadStore(llvm::CallInst *callInst) {
2811  struct GatherImpInfo {
2812  GatherImpInfo(const char *pName, const char *lmName, llvm::Type *st,
2813  int a)
2814  : align(a), isFactored(!g->target->hasGather()) {
2815  pseudoFunc = m->module->getFunction(pName);
2816  loadMaskedFunc = m->module->getFunction(lmName);
2817  Assert(pseudoFunc != NULL && loadMaskedFunc != NULL);
2818  scalarType = st;
2819  }
2820 
2821  llvm::Function *pseudoFunc;
2822  llvm::Function *loadMaskedFunc;
2823  llvm::Type *scalarType;
2824  const int align;
2825  const bool isFactored;
2826  };
2827 
2828  GatherImpInfo gInfo[] = {
2829  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
2830  "__pseudo_gather_factored_base_offsets32_i8",
2831  "__masked_load_i8", LLVMTypes::Int8Type, 1),
2832  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
2833  "__pseudo_gather_factored_base_offsets32_i16",
2834  "__masked_load_i16", LLVMTypes::Int16Type, 2),
2835  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
2836  "__pseudo_gather_factored_base_offsets32_i32",
2837  "__masked_load_i32", LLVMTypes::Int32Type, 4),
2838  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
2839  "__pseudo_gather_factored_base_offsets32_float",
2840  "__masked_load_float", LLVMTypes::FloatType, 4),
2841  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
2842  "__pseudo_gather_factored_base_offsets32_i64",
2843  "__masked_load_i64", LLVMTypes::Int64Type, 8),
2844  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
2845  "__pseudo_gather_factored_base_offsets32_double",
2846  "__masked_load_double", LLVMTypes::DoubleType, 8),
2847  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i8" :
2848  "__pseudo_gather_factored_base_offsets64_i8",
2849  "__masked_load_i8", LLVMTypes::Int8Type, 1),
2850  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i16" :
2851  "__pseudo_gather_factored_base_offsets64_i16",
2852  "__masked_load_i16", LLVMTypes::Int16Type, 2),
2853  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i32" :
2854  "__pseudo_gather_factored_base_offsets64_i32",
2855  "__masked_load_i32", LLVMTypes::Int32Type, 4),
2856  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_float" :
2857  "__pseudo_gather_factored_base_offsets64_float",
2858  "__masked_load_float", LLVMTypes::FloatType, 4),
2859  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i64" :
2860  "__pseudo_gather_factored_base_offsets64_i64",
2861  "__masked_load_i64", LLVMTypes::Int64Type, 8),
2862  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_double" :
2863  "__pseudo_gather_factored_base_offsets64_double",
2864  "__masked_load_double", LLVMTypes::DoubleType, 8),
2865  };
2866 
2867  struct ScatterImpInfo {
2868  ScatterImpInfo(const char *pName, const char *msName,
2869  llvm::Type *vpt, int a)
2870  : align(a), isFactored(!g->target->hasScatter()) {
2871  pseudoFunc = m->module->getFunction(pName);
2872  maskedStoreFunc = m->module->getFunction(msName);
2873  vecPtrType = vpt;
2874  Assert(pseudoFunc != NULL && maskedStoreFunc != NULL);
2875  }
2876  llvm::Function *pseudoFunc;
2877  llvm::Function *maskedStoreFunc;
2878  llvm::Type *vecPtrType;
2879  const int align;
2880  const bool isFactored;
2881  };
2882 
2883  ScatterImpInfo sInfo[] = {
2884  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
2885  "__pseudo_scatter_factored_base_offsets32_i8",
2886  "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1),
2887  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
2888  "__pseudo_scatter_factored_base_offsets32_i16",
2889  "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2),
2890  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
2891  "__pseudo_scatter_factored_base_offsets32_i32",
2892  "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4),
2893  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
2894  "__pseudo_scatter_factored_base_offsets32_float",
2895  "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4),
2896  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
2897  "__pseudo_scatter_factored_base_offsets32_i64",
2898  "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8),
2899  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
2900  "__pseudo_scatter_factored_base_offsets32_double",
2901  "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8),
2902  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i8" :
2903  "__pseudo_scatter_factored_base_offsets64_i8",
2904  "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1),
2905  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i16" :
2906  "__pseudo_scatter_factored_base_offsets64_i16",
2907  "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2),
2908  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i32" :
2909  "__pseudo_scatter_factored_base_offsets64_i32",
2910  "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4),
2911  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_float" :
2912  "__pseudo_scatter_factored_base_offsets64_float",
2913  "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4),
2914  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i64" :
2915  "__pseudo_scatter_factored_base_offsets64_i64",
2916  "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8),
2917  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_double" :
2918  "__pseudo_scatter_factored_base_offsets64_double",
2919  "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8),
2920  };
2921 
2922  llvm::Function *calledFunc = callInst->getCalledFunction();
2923 
2924  GatherImpInfo *gatherInfo = NULL;
2925  ScatterImpInfo *scatterInfo = NULL;
2926  for (unsigned int i = 0; i < sizeof(gInfo) / sizeof(gInfo[0]); ++i) {
2927  if (gInfo[i].pseudoFunc != NULL &&
2928  calledFunc == gInfo[i].pseudoFunc) {
2929  gatherInfo = &gInfo[i];
2930  break;
2931  }
2932  }
2933  for (unsigned int i = 0; i < sizeof(sInfo) / sizeof(sInfo[0]); ++i) {
2934  if (sInfo[i].pseudoFunc != NULL &&
2935  calledFunc == sInfo[i].pseudoFunc) {
2936  scatterInfo = &sInfo[i];
2937  break;
2938  }
2939  }
2940  if (gatherInfo == NULL && scatterInfo == NULL)
2941  return false;
2942 
2943  SourcePos pos;
2944  lGetSourcePosFromMetadata(callInst, &pos);
2945 
2946  llvm::Value *base = callInst->getArgOperand(0);
2947  llvm::Value *fullOffsets = NULL;
2948  llvm::Value *storeValue = NULL;
2949  llvm::Value *mask = NULL;
2950 
2951  if ((gatherInfo != NULL && gatherInfo->isFactored) ||
2952  (scatterInfo != NULL && scatterInfo->isFactored)) {
2953  llvm::Value *varyingOffsets = callInst->getArgOperand(1);
2954  llvm::Value *offsetScale = callInst->getArgOperand(2);
2955  llvm::Value *constOffsets = callInst->getArgOperand(3);
2956  if (scatterInfo)
2957  storeValue = callInst->getArgOperand(4);
2958  mask = callInst->getArgOperand((gatherInfo != NULL) ? 4 : 5);
2959 
2960  // Compute the full offset vector: offsetScale * varyingOffsets + constOffsets
2961  llvm::Constant *offsetScaleVec =
2962  lGetOffsetScaleVec(offsetScale, varyingOffsets->getType());
2963 
2964  llvm::Value *scaledVarying =
2965  llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec,
2966  varyingOffsets, "scaled_varying", callInst);
2967  fullOffsets =
2968  llvm::BinaryOperator::Create(llvm::Instruction::Add, scaledVarying,
2969  constOffsets, "varying+const_offsets",
2970  callInst);
2971  }
2972  else {
2973  if (scatterInfo)
2974  storeValue = callInst->getArgOperand(3);
2975  mask = callInst->getArgOperand((gatherInfo != NULL) ? 3 : 4);
2976 
2977  llvm::Value *offsetScale = callInst->getArgOperand(1);
2978  llvm::Value *offsets = callInst->getArgOperand(2);
2979  llvm::Value *offsetScaleVec =
2980  lGetOffsetScaleVec(offsetScale, offsets->getType());
2981 
2982  fullOffsets =
2983  llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec,
2984  offsets, "scaled_offsets", callInst);
2985  }
2986 
2987  Debug(SourcePos(), "GSToLoadStore: %s.",
2988  fullOffsets->getName().str().c_str());
2989 
2990  if (LLVMVectorValuesAllEqual(fullOffsets)) {
2991  // If all the offsets are equal, then compute the single
2992  // pointer they all represent based on the first one of them
2993  // (arbitrarily).
2994  llvm::Value *ptr = lComputeCommonPointer(base, fullOffsets, callInst);
2995  lCopyMetadata(ptr, callInst);
2996 
2997  if (gatherInfo != NULL) {
2998  // A gather with everyone going to the same location is
2999  // handled as a scalar load and broadcast across the lanes.
3000  Debug(pos, "Transformed gather to scalar load and broadcast!");
3001 
3002  ptr = new llvm::BitCastInst(ptr, llvm::PointerType::get(gatherInfo->scalarType, 0),
3003  ptr->getName(), callInst);
3004  llvm::Value *scalarValue = new llvm::LoadInst(ptr, callInst->getName(), callInst);
3005 
3006  // Generate the following sequence:
3007  // %name123 = insertelement <4 x i32> undef, i32 %val, i32 0
3008  // %name124 = shufflevector <4 x i32> %name123, <4 x i32> undef,
3009  // <4 x i32> zeroinitializer
3010  llvm::Value *undef1Value = llvm::UndefValue::get(callInst->getType());
3011  llvm::Value *undef2Value = llvm::UndefValue::get(callInst->getType());
3012  llvm::Value *insertVec = llvm::InsertElementInst::Create(
3013  undef1Value, scalarValue, LLVMInt32(0), callInst->getName(), callInst);
3014  llvm::Value *zeroMask = llvm::ConstantVector::getSplat(
3015  callInst->getType()->getVectorNumElements(),
3016  llvm::Constant::getNullValue(llvm::Type::getInt32Ty(*g->ctx)));
3017  llvm::Value *shufValue = new llvm::ShuffleVectorInst(
3018  insertVec, undef2Value, zeroMask, callInst->getName());
3019 
3020  lCopyMetadata(shufValue, callInst);
3021  llvm::ReplaceInstWithInst(callInst,
3022  llvm::dyn_cast<llvm::Instruction>(shufValue));
3023  return true;
3024  }
3025  else {
3026  // A scatter with everyone going to the same location is
3027  // undefined (if there's more than one program instance in
3028  // the gang). Issue a warning.
3029  if (g->target->getVectorWidth() > 1)
3030  Warning(pos, "Undefined behavior: all program instances are "
3031  "writing to the same location!");
3032 
3033  // We could do something similar to the gather case, where
3034  // we arbitrarily write one of the values, but we need to
3035  // a) check to be sure the mask isn't all off and b) pick
3036  // the value from an executing program instance in that
3037  // case. We'll just let a bunch of the program instances
3038  // do redundant writes, since this isn't important to make
3039  // fast anyway...
3040  return false;
3041  }
3042  }
3043  else {
3044  int step = gatherInfo ? gatherInfo->align : scatterInfo->align;
3045 
3046  if (step > 0 && LLVMVectorIsLinear(fullOffsets, step)) {
3047  // We have a linear sequence of memory locations being accessed
3048  // starting with the location given by the offset from
3049  // offsetElements[0], with stride of 4 or 8 bytes (for 32 bit
3050  // and 64 bit gather/scatters, respectively.)
3051  llvm::Value *ptr = lComputeCommonPointer(base, fullOffsets, callInst);
3052  lCopyMetadata(ptr, callInst);
3053 
3054  if (gatherInfo != NULL) {
3055  Debug(pos, "Transformed gather to unaligned vector load!");
3056  llvm::Instruction *newCall =
3057  lCallInst(gatherInfo->loadMaskedFunc, ptr, mask,
3058  LLVMGetName(ptr, "_masked_load"));
3059  lCopyMetadata(newCall, callInst);
3060  llvm::ReplaceInstWithInst(callInst, newCall);
3061  return true;
3062  }
3063  else {
3064  Debug(pos, "Transformed scatter to unaligned vector store!");
3065  ptr = new llvm::BitCastInst(ptr, scatterInfo->vecPtrType, "ptrcast",
3066  callInst);
3067  llvm::Instruction *newCall =
3068  lCallInst(scatterInfo->maskedStoreFunc, ptr, storeValue,
3069  mask, "");
3070  lCopyMetadata(newCall, callInst);
3071  llvm::ReplaceInstWithInst(callInst, newCall);
3072  return true;
3073  }
3074  }
3075  return false;
3076  }
3077 }
3078 
3079 
3080 ///////////////////////////////////////////////////////////////////////////
3081 // MaskedStoreOptPass
3082 
3083 /** Masked stores are generally more complex than regular stores; for
3084  example, they require multiple instructions to simulate under SSE.
3085  This optimization detects cases where masked stores can be replaced
3086  with regular stores or removed entirely, for the cases of an 'all on'
3087  mask and an 'all off' mask, respectively.
3088 */
3089 static bool
3090 lImproveMaskedStore(llvm::CallInst *callInst) {
3091  struct MSInfo {
3092  MSInfo(const char *name, const int a)
3093  : align(a) {
3094  func = m->module->getFunction(name);
3095  Assert(func != NULL);
3096  }
3097  llvm::Function *func;
3098  const int align;
3099  };
3100 
3101  MSInfo msInfo[] = {
3102  MSInfo("__pseudo_masked_store_i8", 1),
3103  MSInfo("__pseudo_masked_store_i16", 2),
3104  MSInfo("__pseudo_masked_store_i32", 4),
3105  MSInfo("__pseudo_masked_store_float", 4),
3106  MSInfo("__pseudo_masked_store_i64", 8),
3107  MSInfo("__pseudo_masked_store_double", 8),
3108  MSInfo("__masked_store_blend_i8", 1),
3109  MSInfo("__masked_store_blend_i16", 2),
3110  MSInfo("__masked_store_blend_i32", 4),
3111  MSInfo("__masked_store_blend_float", 4),
3112  MSInfo("__masked_store_blend_i64", 8),
3113  MSInfo("__masked_store_blend_double", 8),
3114  MSInfo("__masked_store_i8", 1),
3115  MSInfo("__masked_store_i16", 2),
3116  MSInfo("__masked_store_i32", 4),
3117  MSInfo("__masked_store_float", 4),
3118  MSInfo("__masked_store_i64", 8),
3119  MSInfo("__masked_store_double", 8)
3120  };
3121 
3122  llvm::Function *called = callInst->getCalledFunction();
3123 
3124  int nMSFuncs = sizeof(msInfo) / sizeof(msInfo[0]);
3125  MSInfo *info = NULL;
3126  for (int i = 0; i < nMSFuncs; ++i) {
3127  if (msInfo[i].func != NULL && called == msInfo[i].func) {
3128  info = &msInfo[i];
3129  break;
3130  }
3131  }
3132  if (info == NULL)
3133  return false;
3134 
3135  // Got one; grab the operands
3136  llvm::Value *lvalue = callInst->getArgOperand(0);
3137  llvm::Value *rvalue = callInst->getArgOperand(1);
3138  llvm::Value *mask = callInst->getArgOperand(2);
3139 
3140  MaskStatus maskStatus = lGetMaskStatus(mask);
3141  if (maskStatus == ALL_OFF) {
3142  // Zero mask - no-op, so remove the store completely. (This
3143  // may in turn lead to being able to optimize out instructions
3144  // that compute the rvalue...)
3145  callInst->eraseFromParent();
3146  return true;
3147  }
3148  else if (maskStatus == ALL_ON) {
3149  // The mask is all on, so turn this into a regular store
3150  llvm::Type *rvalueType = rvalue->getType();
3151  llvm::Type *ptrType = llvm::PointerType::get(rvalueType, 0);
3152 
3153  lvalue = new llvm::BitCastInst(lvalue, ptrType, "lvalue_to_ptr_type", callInst);
3154  lCopyMetadata(lvalue, callInst);
3155  llvm::Instruction *store =
3156  new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
3157  g->opt.forceAlignedMemory ?
3158  g->target->getNativeVectorAlignment() : info->align);
3159  lCopyMetadata(store, callInst);
3160  llvm::ReplaceInstWithInst(callInst, store);
3161  return true;
3162  }
3163 
3164  return false;
3165 }
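// Illustrative sketch (not part of ispc): a standalone scalar model of the
// specialization performed by lImproveMaskedStore() above. The names used
// here (lMaskedStoreModel, the 8-lane width) are hypothetical and exist only
// for this example. With an all-off mask the store is a no-op and can simply
// be deleted; with an all-on mask it degenerates into a regular store; only
// the mixed case needs a per-lane blend.
static void lMaskedStoreModel(int *dst, const int *src, const bool *mask) {
    const int width = 8;
    bool allOn = true, allOff = true;
    for (int i = 0; i < width; ++i) {
        allOn  = allOn  && mask[i];
        allOff = allOff && !mask[i];
    }
    if (allOff)
        return;                           // mask all off -> drop the store
    if (allOn) {
        for (int i = 0; i < width; ++i)   // mask all on -> regular store
            dst[i] = src[i];
        return;
    }
    for (int i = 0; i < width; ++i)       // mixed mask -> per-lane blend
        if (mask[i])
            dst[i] = src[i];
}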
3166 
3167 
3168 static bool
3169 lImproveMaskedLoad(llvm::CallInst *callInst,
3170  llvm::BasicBlock::iterator iter) {
3171  struct MLInfo {
3172  MLInfo(const char *name, const int a)
3173  : align(a) {
3174  func = m->module->getFunction(name);
3175  Assert(func != NULL);
3176  }
3177  llvm::Function *func;
3178  const int align;
3179  };
3180 
3181  MLInfo mlInfo[] = {
3182  MLInfo("__masked_load_i8", 1),
3183  MLInfo("__masked_load_i16", 2),
3184  MLInfo("__masked_load_i32", 4),
3185  MLInfo("__masked_load_float", 4),
3186  MLInfo("__masked_load_i64", 8),
3187  MLInfo("__masked_load_double", 8)
3188  };
3189 
3190  llvm::Function *called = callInst->getCalledFunction();
3191 
3192  int nFuncs = sizeof(mlInfo) / sizeof(mlInfo[0]);
3193  MLInfo *info = NULL;
3194  for (int i = 0; i < nFuncs; ++i) {
3195  if (mlInfo[i].func != NULL && called == mlInfo[i].func) {
3196  info = &mlInfo[i];
3197  break;
3198  }
3199  }
3200  if (info == NULL)
3201  return false;
3202 
3203  // Got one; grab the operands
3204  llvm::Value *ptr = callInst->getArgOperand(0);
3205  llvm::Value *mask = callInst->getArgOperand(1);
3206 
3207  MaskStatus maskStatus = lGetMaskStatus(mask);
3208  if (maskStatus == ALL_OFF) {
3209  // Zero mask - no-op, so replace the load with an undef value
3210  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
3211  iter, llvm::UndefValue::get(callInst->getType()));
3212  return true;
3213  }
3214  else if (maskStatus == ALL_ON) {
3215  // The mask is all on, so turn this into a regular load
3216  llvm::Type *ptrType = llvm::PointerType::get(callInst->getType(), 0);
3217  ptr = new llvm::BitCastInst(ptr, ptrType, "ptr_cast_for_load",
3218  callInst);
3219  llvm::Instruction *load =
3220  new llvm::LoadInst(ptr, callInst->getName(), false /* not volatile */,
3221  g->opt.forceAlignedMemory ?
3222  g->target->getNativeVectorAlignment() : info->align,
3223  (llvm::Instruction *)NULL);
3224  lCopyMetadata(load, callInst);
3225  llvm::ReplaceInstWithInst(callInst, load);
3226  return true;
3227  }
3228  else
3229  return false;
3230 }
3231 
3232 
3233 bool
3234 ImproveMemoryOpsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
3235  DEBUG_START_PASS("ImproveMemoryOps");
3236 
3237  bool modifiedAny = false;
3238  restart:
3239  // Iterate through all of the instructions in the basic block.
3240  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
3241  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
3242  // If we don't have a call to one of the
3243  // __pseudo_{gather,scatter}_* functions, then just go on to the
3244  // next instruction.
3245  if (callInst == NULL ||
3246  callInst->getCalledFunction() == NULL)
3247  continue;
3248 
3249  if (lGSToGSBaseOffsets(callInst)) {
3250  modifiedAny = true;
3251  goto restart;
3252  }
3253  if (lGSBaseOffsetsGetMoreConst(callInst)) {
3254  modifiedAny = true;
3255  goto restart;
3256  }
3257  if (lGSToLoadStore(callInst)) {
3258  modifiedAny = true;
3259  goto restart;
3260  }
3261  if (lImproveMaskedStore(callInst)) {
3262  modifiedAny = true;
3263  goto restart;
3264  }
3265  if (lImproveMaskedLoad(callInst, iter)) {
3266  modifiedAny = true;
3267  goto restart;
3268  }
3269  }
3270 
3271  DEBUG_END_PASS("ImproveMemoryOps");
3272 
3273  return modifiedAny;
3274 }
3275 
3276 
3277 static llvm::Pass *
3278 CreateImproveMemoryOpsPass() {
3279  return new ImproveMemoryOpsPass;
3280 }
3281 
3282 
3283 ///////////////////////////////////////////////////////////////////////////
3284 // GatherCoalescePass
3285 
3286 // This pass implements two optimizations to improve the performance of
3287 // gathers; currently only gathers of 32-bit values where it can be
3288 // determined at compile time that the mask is all on are supported, though
3289 // both of those limitations may be generalized in the future.
3290 //
3291 // First, for any single gather, see if it's worthwhile to break it into
3292 // any of scalar, 2-wide (i.e. 64-bit), 4-wide, or 8-wide loads. Further,
3293 // we generate code that shuffles these loads around. Doing fewer, larger
3294 // loads in this manner, when possible, can be more efficient.
3295 //
3296 // Second, this pass can coalesce memory accesses across multiple
3297 // gathers. If we have a series of gathers without any memory writes in
3298 // the middle, then we try to analyze their reads collectively and choose
3299 // an efficient set of loads for them. Not only does this help if
3300 // different gathers reuse values from the same location in memory, but
3301 // it's specifically helpful when data with AOS layout is being accessed;
3302 // in this case, we're often able to generate wide vector loads and
3303 // appropriate shuffles automatically.
3304 
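// Illustrative sketch (not part of ispc): a scalar model of the coalescing
// idea described above for a 4-wide gang reading AOS data. Gathers of x, y
// and z from struct { float x, y, z; } pts[4] touch element offsets
// {0,3,6,9}, {1,4,7,10} and {2,5,8,11} from a common base; instead of twelve
// scalar accesses, the pass can cover the whole range [0,12) with three
// contiguous 4-wide loads and then "shuffle" each result out of them. The
// helper names below are hypothetical and exist only for this example.
static void lCoalesceModelLoad4(const float *base, int start, float out[4]) {
    for (int i = 0; i < 4; ++i)
        out[i] = base[start + i];          // one contiguous 4-wide load
}

static void lCoalesceModelGatherXYZ(const float *base, float x[4], float y[4],
                                    float z[4]) {
    float chunk[3][4];
    for (int c = 0; c < 3; ++c)            // three wide loads cover [0,12)
        lCoalesceModelLoad4(base, 4 * c, chunk[c]);
    for (int lane = 0; lane < 4; ++lane) {
        // Element offset of field f for this lane is 3*lane + f; picking it
        // out of the chunks models the shuffles the real pass emits.
        x[lane] = chunk[(3 * lane + 0) / 4][(3 * lane + 0) % 4];
        y[lane] = chunk[(3 * lane + 1) / 4][(3 * lane + 1) % 4];
        z[lane] = chunk[(3 * lane + 2) / 4][(3 * lane + 2) % 4];
    }
}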
3305 class GatherCoalescePass : public llvm::BasicBlockPass {
3306 public:
3307  static char ID;
3308  GatherCoalescePass() : BasicBlockPass(ID) { }
3309 
3310 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
3311  const char *getPassName() const { return "Gather Coalescing"; }
3312 #else // LLVM 4.0+
3313  llvm::StringRef getPassName() const { return "Gather Coalescing"; }
3314 #endif
3315  bool runOnBasicBlock(llvm::BasicBlock &BB);
3316 };
3317 
3318 char GatherCoalescePass::ID = 0;
3319 
3320 
3321 /** Representation of a memory load that the gather coalescing code has
3322  decided to generate.
3323  */
3324 struct CoalescedLoadOp {
3325  CoalescedLoadOp(int64_t s, int c) {
3326  start = s;
3327  count = c;
3328  load = element0 = element1 = NULL;
3329  }
3330 
3331  /** Starting offset of the load from the common base pointer (in terms
3332  of numbers of items of the underlying element type--*not* in terms
3333  of bytes). */
3334  int64_t start;
3335 
3336  /** Number of elements to load at this location */
3337  int count;
3338 
3339  /** Value loaded from memory for this load op */
3340  llvm::Value *load;
3341 
3342  /** For 2-wide loads (i.e. 64-bit loads), these store the lower and
3343  upper 32 bits of the result, respectively. */
3344  llvm::Value *element0, *element1;
3345 };
3346 
3347 
3348 /** This function determines whether it makes sense (and is safe) to
3349  generate a vector load of width vectorWidth, starting at *iter. It
3350  returns true if so, setting *newIter to point to the next element in
3351  the set that isn't taken care of by the generated load. If a vector
3352  load of the given width doesn't make sense, then false is returned.
3353  */
3354 static bool
3355 lVectorLoadIsEfficient(std::set<int64_t>::iterator iter,
3356  std::set<int64_t>::iterator end,
3357  std::set<int64_t>::iterator *newIter, int vectorWidth) {
3358  // We're considering a vector load of width vectorWidth, starting at
3359  // the offset "start".
3360  int64_t start = *iter;
3361 
3362  // The basic idea is that we'll look at the subsequent elements in the
3363  // load set after the initial one at start. As long as subsequent
3364  // elements:
3365  //
3366  // 1. Aren't so far separated that they no longer fit into the range
3367  // [start, start+vectorWidth)
3368  //
3369  // 2. And don't have too large a gap in between them (e.g., it's not
3370  // worth generating an 8-wide load for two elements with offsets 0
3371  // and 7, but no loads requested in between).
3372  //
3373  // Then we continue moving forward through the elements until we either
3374  // fill up the vector or run out of elements.
3375 
3376  // lastAccepted holds the last offset we've processed and accepted as
3377  // valid for the vector load under consideration
3378  int64_t lastAccepted = start;
3379 
3380  while (iter != end) {
3381  // What is the separation in offset values from the last element we
3382  // added to the set for this load?
3383  int64_t delta = *iter - lastAccepted;
3384  if (delta > 3)
3385  // If there's too big a gap, then we won't issue the load
3386  return false;
3387 
3388  int64_t span = *iter - start + 1;
3389 
3390  if (span == vectorWidth) {
3391  // We've extended far enough that we have exactly filled up the
3392  // entire vector width; we can't go any further, so return with
3393  // success. (Update *newIter to point at the next element
3394  // after the last one accepted here.)
3395  *newIter = ++iter;
3396  return true;
3397  }
3398  else if (span > vectorWidth) {
3399  // The current offset won't fit into a vectorWidth-wide load
3400  // starting from start. It's still generally worthwhile
3401  // issuing the load we've been considering, though, since it
3402  // will provide values for a number of previous offsets. This
3403  // load will have one or more elements at the end of its range
3404  // that is not needed by any of the offsets under
3405  // consideration. As such, there are three cases where issuing
3406  // this load is a bad idea:
3407  //
3408  // 1. 2-wide loads: we know that we haven't completely filled
3409  // the 2-wide vector, since otherwise the if() test above
3410  // would have succeeded previously. Therefore, we must have
3411  // a situation with offsets like (4,6,...); it would be a
3412  // silly idea to issue a 2-wide load to get the value for
3413  // the 4 offset, versus failing here and issuing a scalar
3414  // load instead.
3415  //
3416  // 2. If there are too many unnecessary values at the end of
3417  // the load extent (defined as more than half of them)--in
3418  // this case, it'd be better to issue a vector load of
3419  // smaller width anyway.
3420  //
3421  // 3. If the gap between the last accepted offset and the
3422  // current one under consideration is more than the page
3423  // size. In this case we can't be sure whether or not some
3424  // of the unused elements at the end of the load will
3425  // straddle a page boundary and thus lead to an undesirable
3426  // fault. (It's hard to imagine this happening in practice,
3427  // except under contrived circumstances, but better safe
3428  // than sorry.)
3429  const int pageSize = 4096;
3430  if (vectorWidth != 2 &&
3431  (lastAccepted - start) > (vectorWidth / 2) &&
3432  (*iter - lastAccepted) < pageSize) {
3433  *newIter = iter;
3434  return true;
3435  }
3436  else
3437  return false;
3438  }
3439 
3440  // Continue moving forward
3441  lastAccepted = *iter;
3442  ++iter;
3443  }
3444 
3445  return false;
3446 }
3447 
3448 
3449 /** Given a set of offsets from a common base pointer that we need to get
3450  loaded into memory, determine a reasonable set of load operations that
3451  gets all of the corresponding values in memory (ideally, including as
3452  many as possible wider vector loads rather than scalar loads). Return
3453  a CoalescedLoadOp for each one in the *loads array.
3454  */
3455 static void
3456 lSelectLoads(const std::vector<int64_t> &loadOffsets,
3457  std::vector<CoalescedLoadOp> *loads) {
3458  // First, get a sorted set of unique offsets to load from.
3459  std::set<int64_t> allOffsets;
3460  for (unsigned int i = 0; i < loadOffsets.size(); ++i)
3461  allOffsets.insert(loadOffsets[i]);
3462 
3463  std::set<int64_t>::iterator iter = allOffsets.begin();
3464  while (iter != allOffsets.end()) {
3465  Debug(SourcePos(), "Load needed at %" PRId64 ".", *iter);
3466  ++iter;
3467  }
3468 
3469  // Now, iterate over the offsets from low to high. Starting at the
3470  // current offset, we see if a vector load starting from that offset
3471  // will cover loads at subsequent offsets as well.
3472  iter = allOffsets.begin();
3473  while (iter != allOffsets.end()) {
3474  // Consider vector loads with each of the widths in
3475  // vectorWidths[], in order.
3476  int vectorWidths[] = { 8, 4, 2 };
3477  int nVectorWidths = sizeof(vectorWidths) / sizeof(vectorWidths[0]);
3478  bool gotOne = false;
3479  for (int i = 0; i < nVectorWidths; ++i) {
3480  // See if a load of vector with width vectorWidths[i] would be
3481  // effective (i.e. would cover a reasonable number of the
3482  // offsets that need to be loaded from).
3483  std::set<int64_t>::iterator newIter;
3484  if (lVectorLoadIsEfficient(iter, allOffsets.end(), &newIter,
3485  vectorWidths[i])) {
3486  // Yes: create the corresponding coalesced load and update
3487  // the iterator to the returned iterator; doing so skips
3488  // over the additional offsets that are taken care of by
3489  // this load.
3490  loads->push_back(CoalescedLoadOp(*iter, vectorWidths[i]));
3491  iter = newIter;
3492  gotOne = true;
3493  break;
3494  }
3495  }
3496 
3497  if (gotOne == false) {
3498  // We couldn't find a vector load starting from this offset
3499  // that made sense, so emit a scalar load and continue onward.
3500  loads->push_back(CoalescedLoadOp(*iter, 1));
3501  ++iter;
3502  }
3503  }
3504 }
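// Illustrative usage sketch (not part of ispc): what a call to lSelectLoads()
// looks like and the result it produces for one concrete offset set. With
// element offsets { 0, 1, 2, 3, 8, 9, 10, 11 }, the 8-wide candidate is
// rejected (the gap between 3 and 8 exceeds the limit in
// lVectorLoadIsEfficient()), so the offsets are covered by two 4-wide loads
// instead of eight scalar ones. The function name is hypothetical.
static void lSelectLoadsExample() {
    int64_t rawOffsets[] = { 0, 1, 2, 3, 8, 9, 10, 11 };
    std::vector<int64_t> offsets(rawOffsets, rawOffsets + 8);
    std::vector<CoalescedLoadOp> loads;
    lSelectLoads(offsets, &loads);
    // loads now holds two entries: { start = 0, count = 4 } and
    // { start = 8, count = 4 }.
}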
3505 
3506 
3507 /** Print a performance message with the details of the result of
3508  coalescing over a group of gathers. */
3509 static void
3510 lCoalescePerfInfo(const std::vector<llvm::CallInst *> &coalesceGroup,
3511  const std::vector<CoalescedLoadOp> &loadOps) {
3512  SourcePos pos;
3513  lGetSourcePosFromMetadata(coalesceGroup[0], &pos);
3514 
3515  // Create a string that indicates the line numbers of the subsequent
3516  // gathers from the first one that were coalesced here.
3517  char otherPositions[512];
3518  otherPositions[0] = '\0';
3519  if (coalesceGroup.size() > 1) {
3520  const char *plural = (coalesceGroup.size() > 2) ? "s" : "";
3521  char otherBuf[32];
3522  sprintf(otherBuf, "(other%s at line%s ", plural, plural);
3523  strcat(otherPositions, otherBuf);
3524 
3525  for (int i = 1; i < (int)coalesceGroup.size(); ++i) {
3526  SourcePos p;
3527  bool ok = lGetSourcePosFromMetadata(coalesceGroup[i], &p);
3528  if (ok) {
3529  char buf[32];
3530  sprintf(buf, "%d", p.first_line);
3531  strcat(otherPositions, buf);
3532  if (i < (int)coalesceGroup.size() - 1)
3533  strcat(otherPositions, ", ");
3534  }
3535  }
3536  strcat(otherPositions, ") ");
3537  }
3538 
3539  // Count how many loads of each size there were.
3540  std::map<int, int> loadOpsCount;
3541  for (int i = 0; i < (int)loadOps.size(); ++i)
3542  ++loadOpsCount[loadOps[i].count];
3543 
3544  // Generate a string that describes the mix of load ops
3545  char loadOpsInfo[512];
3546  loadOpsInfo[0] = '\0';
3547  std::map<int, int>::const_iterator iter = loadOpsCount.begin();
3548  while (iter != loadOpsCount.end()) {
3549  char buf[32];
3550  sprintf(buf, "%d x %d-wide", iter->second, iter->first);
3551  strcat(loadOpsInfo, buf);
3552  ++iter;
3553  if (iter != loadOpsCount.end())
3554  strcat(loadOpsInfo, ", ");
3555  }
3556 
3557  if (coalesceGroup.size() == 1)
3558  PerformanceWarning(pos, "Coalesced gather into %d load%s (%s).",
3559  (int)loadOps.size(),
3560  (loadOps.size() > 1) ? "s" : "", loadOpsInfo);
3561  else
3562  PerformanceWarning(pos, "Coalesced %d gathers starting here %sinto %d "
3563  "load%s (%s).", (int)coalesceGroup.size(),
3564  otherPositions,(int)loadOps.size(),
3565  (loadOps.size() > 1) ? "s" : "", loadOpsInfo);
3566 }
3567 
3568 
3569 /** Utility routine that computes an offset from a base pointer and then
3570  returns the result of a load of the given type from the resulting
3571  location:
3572 
3573  return *((type *)(basePtr + offset))
3574  */
3575 llvm::Value *
3576 lGEPAndLoad(llvm::Value *basePtr, int64_t offset, int align,
3577  llvm::Instruction *insertBefore, llvm::Type *type) {
3578  llvm::Value *ptr = lGEPInst(basePtr, LLVMInt64(offset), "new_base",
3579  insertBefore);
3580  ptr = new llvm::BitCastInst(ptr, llvm::PointerType::get(type, 0),
3581  "ptr_cast", insertBefore);
3582  return new llvm::LoadInst(ptr, "gather_load", false /* not volatile */,
3583  align, insertBefore);
3584 }
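// Illustrative sketch (not part of ispc): the plain-C equivalent of the
// GEP + bitcast + load sequence emitted above, assuming basePtr behaves like
// an i8 * and offset is a byte offset, here specialized to float. The name is
// hypothetical.
static inline float lGEPAndLoadModel(const unsigned char *basePtr,
                                     int64_t offset) {
    // return *((float *)(basePtr + offset))
    return *reinterpret_cast<const float *>(basePtr + offset);
}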
3585 
3586 
3587 /* Having decided that we're going to emit a series of loads, as encoded in
3588  the loadOps array, this function emits the corresponding load
3589  instructions.
3590  */
3591 static void
3592 lEmitLoads(llvm::Value *basePtr, std::vector<CoalescedLoadOp> &loadOps,
3593  int elementSize, llvm::Instruction *insertBefore) {
3594  Debug(SourcePos(), "Coalesce doing %d loads.", (int)loadOps.size());
3595  for (int i = 0; i < (int)loadOps.size(); ++i) {
3596  Debug(SourcePos(), "Load #%d @ %" PRId64 ", %d items", i, loadOps[i].start,
3597  loadOps[i].count);
3598 
3599  // basePtr is an i8 *, so the offset from it should be in terms of
3600  // bytes, not underlying i32 elements.
3601  int64_t start = loadOps[i].start * elementSize;
3602 
3603  int align = 4;
3604  switch (loadOps[i].count) {
3605  case 1:
3606  // Single 32-bit scalar load
3607  loadOps[i].load = lGEPAndLoad(basePtr, start, align, insertBefore,
3608  LLVMTypes::Int32Type);
3609  break;
3610  case 2: {
3611  // Emit 2 x i32 loads as i64 loads and then break the result
3612  // into two 32-bit parts.
3613  loadOps[i].load = lGEPAndLoad(basePtr, start, align, insertBefore,
3614  LLVMTypes::Int64Type);
3615  // element0 = (int32)value;
3616  loadOps[i].element0 =
3617  new llvm::TruncInst(loadOps[i].load, LLVMTypes::Int32Type,
3618  "load64_elt0", insertBefore);
3619  // element1 = (int32)(value >> 32)
3620  llvm::Value *shift =
3621  llvm::BinaryOperator::Create(llvm::Instruction::LShr,
3622  loadOps[i].load, LLVMInt64(32),
3623  "load64_shift", insertBefore);
3624  loadOps[i].element1 =
3625  new llvm::TruncInst(shift, LLVMTypes::Int32Type,
3626  "load64_elt1", insertBefore);
3627  break;
3628  }
3629  case 4: {
3630  // 4-wide vector load
3631  if (g->opt.forceAlignedMemory) {
3632  align = g->target->getNativeVectorAlignment();
3633  }
3634  llvm::VectorType *vt =
3635  llvm::VectorType::get(LLVMTypes::Int32Type, 4);
3636  loadOps[i].load = lGEPAndLoad(basePtr, start, align,
3637  insertBefore, vt);
3638  break;
3639  }
3640  case 8: {
3641  // 8-wide vector load
3642  if (g->opt.forceAlignedMemory) {
3643  align = g->target->getNativeVectorAlignment();
3644  }
3645  llvm::VectorType *vt =
3646  llvm::VectorType::get(LLVMTypes::Int32Type, 8);
3647  loadOps[i].load = lGEPAndLoad(basePtr, start, align,
3648  insertBefore, vt);
3649  break;
3650  }
3651  default:
3652  FATAL("Unexpected load count in lEmitLoads()");
3653  }
3654  }
3655 }
3656 
3657 
3658 /** Convert any loads of 8-wide vectors into two 4-wide vectors
3659  (logically). This allows the assembly code below to always operate on
3660  4-wide vectors, which leads to better code. Returns a new vector of
3661  load operations.
3662  */
3663 static std::vector<CoalescedLoadOp>
3664 lSplit8WideLoads(const std::vector<CoalescedLoadOp> &loadOps,
3665  llvm::Instruction *insertBefore) {
3666  std::vector<CoalescedLoadOp> ret;
3667  for (unsigned int i = 0; i < loadOps.size(); ++i) {
3668  if (loadOps[i].count == 8) {
3669  // Create fake CoalescedLoadOps, where the load llvm::Value is
3670  // actually a shuffle that pulls either the first 4 or the last
3671  // 4 values out of the original 8-wide loaded value.
3672  int32_t shuf[2][4] = { { 0, 1, 2, 3 }, { 4, 5, 6, 7 } };
3673 
3674  ret.push_back(CoalescedLoadOp(loadOps[i].start, 4));
3675  ret.back().load = LLVMShuffleVectors(loadOps[i].load, loadOps[i].load,
3676  shuf[0], 4, insertBefore);
3677 
3678  ret.push_back(CoalescedLoadOp(loadOps[i].start+4, 4));
3679  ret.back().load = LLVMShuffleVectors(loadOps[i].load, loadOps[i].load,
3680  shuf[1], 4, insertBefore);
3681  }
3682  else
3683  ret.push_back(loadOps[i]);
3684  }
3685 
3686  return ret;
3687 }
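// Illustrative sketch (not part of ispc): the data movement performed by the
// two shuffles above, written out on plain arrays. An 8-wide loaded value is
// split into a low half (elements 0-3) and a high half (elements 4-7), so the
// vector-assembly code that follows only ever has to deal with 4-wide values.
// The name is hypothetical.
static void lSplit8WideModel(const int in8[8], int lo4[4], int hi4[4]) {
    for (int i = 0; i < 4; ++i) {
        lo4[i] = in8[i];        // shuffle mask { 0, 1, 2, 3 }
        hi4[i] = in8[4 + i];    // shuffle mask { 4, 5, 6, 7 }
    }
}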
3688 
3689 
3690 /** Given a 1-wide load of a 32-bit value, merge its value into the result
3691  vector for any and all elements for which it applies.
3692  */
3693 static llvm::Value *
3694 lApplyLoad1(llvm::Value *result, const CoalescedLoadOp &load,
3695  const int64_t offsets[4], bool set[4],
3696  llvm::Instruction *insertBefore) {
3697  for (int elt = 0; elt < 4; ++elt) {
3698  if (offsets[elt] >= load.start &&
3699  offsets[elt] < load.start + load.count) {
3700  Debug(SourcePos(), "Load 1 @ %" PRId64 " matches for element #%d "
3701  "(value %" PRId64 ")", load.start, elt, offsets[elt]);
3702  // If this load gives one of the values that we need, then we
3703  // can just insert it in directly
3704  Assert(set[elt] == false);
3705  result =
3706  llvm::InsertElementInst::Create(result, load.load, LLVMInt32(elt),
3707  "insert_load", insertBefore);
3708  set[elt] = true;
3709  }
3710  }
3711 
3712  return result;
3713 }
3714 
3715 
3716 /** Similarly, incorporate the values from a 2-wide load into any vector
3717  elements that they apply to. */
3718 static llvm::Value *
3719 lApplyLoad2(llvm::Value *result, const CoalescedLoadOp &load,
3720  const int64_t offsets[4], bool set[4],
3721  llvm::Instruction *insertBefore) {
3722  for (int elt = 0; elt < 4; ++elt) {
3723  // First, try to do a 64-bit-wide insert into the result vector.
3724  // We can do this when we're currently at an even element, when the
3725  // current and next element have consecutive values, and where the
3726  // original 64-bit load is at the offset needed by the current
3727  // element.
3728  if ((elt & 1) == 0 &&
3729  offsets[elt] + 1 == offsets[elt+1] &&
3730  offsets[elt] == load.start) {
3731  Debug(SourcePos(), "Load 2 @ %" PRId64 " matches for elements #%d,%d "
3732  "(values %" PRId64 ",%" PRId64 ")", load.start, elt, elt+1,
3733  offsets[elt], offsets[elt+1]);
3734  Assert(set[elt] == false && set[elt+1] == false);
3735 
3736  // In this case, we bitcast from a 4xi32 to a 2xi64 vector
3737  llvm::Type *vec2x64Type =
3738  llvm::VectorType::get(LLVMTypes::Int64Type, 2);
3739  result = new llvm::BitCastInst(result, vec2x64Type, "to2x64",
3740  insertBefore);
3741 
3742  // And now we can insert the 64-bit wide value into the
3743  // appropriate element
3744  result = llvm::InsertElementInst::Create(result, load.load,
3745  LLVMInt32(elt/2),
3746  "insert64", insertBefore);
3747 
3748  // And back to 4xi32.
3749  llvm::Type *vec4x32Type =
3750  llvm::VectorType::get(LLVMTypes::Int32Type, 4);
3751  result = new llvm::BitCastInst(result, vec4x32Type, "to4x32",
3752  insertBefore);
3753 
3754  set[elt] = set[elt+1] = true;
3755  // Advance elt one extra time, since we just took care of two
3756  // elements
3757  ++elt;
3758  }
3759  else if (offsets[elt] >= load.start &&
3760  offsets[elt] < load.start + load.count) {
3761  Debug(SourcePos(), "Load 2 @ %" PRId64 " matches for element #%d "
3762  "(value %" PRId64 ")", load.start, elt, offsets[elt]);
3763  // Otherwise, insert one of the 32-bit pieces into an element
3764  // of the final vector
3765  Assert(set[elt] == false);
3766  llvm::Value *toInsert = (offsets[elt] == load.start) ?
3767  load.element0 : load.element1;
3768  result =
3769  llvm::InsertElementInst::Create(result, toInsert, LLVMInt32(elt),
3770  "insert_load", insertBefore);
3771  set[elt] = true;
3772  }
3773  }
3774 
3775  return result;
3776 }
3777 
3778 
3779 #if 1
3780 /* This approach works better with AVX, while the #else path generates
3781  slightly better code with SSE. Need to continue to dig into performance
3782  details with this stuff in general... */
3783 
3784 /** And handle a 4-wide load */
3785 static llvm::Value *
3786 lApplyLoad4(llvm::Value *result, const CoalescedLoadOp &load,
3787  const int64_t offsets[4], bool set[4],
3788  llvm::Instruction *insertBefore) {
3789  // Conceptually, we're going to consider doing a shuffle vector with
3790  // the 4-wide load and the 4-wide result we have so far to generate a
3791  // new 4-wide vector. We'll start with shuffle indices that just
3792  // select each element of the result so far for the result.
3793  int32_t shuf[4] = { 4, 5, 6, 7 };
3794 
3795  for (int elt = 0; elt < 4; ++elt) {
3796  if (offsets[elt] >= load.start &&
3797  offsets[elt] < load.start + load.count) {
3798  Debug(SourcePos(), "Load 4 @ %" PRId64 " matches for element #%d "
3799  "(value %" PRId64 ")", load.start, elt, offsets[elt]);
3800 
3801  // If the current element falls within the range of locations
3802  // that the 4-wide load covers, then compute the appropriate
3803  // shuffle index that extracts the appropriate element from the
3804  // load.
3805  Assert(set[elt] == false);
3806  shuf[elt] = int32_t(offsets[elt] - load.start);
3807  set[elt] = true;
3808  }
3809  }
3810 
3811  // Now, issue a shufflevector instruction if any of the values from the
3812  // load we just considered were applicable.
3813  if (shuf[0] != 4 || shuf[1] != 5 || shuf[2] != 6 || shuf[3] != 7)
3814  result = LLVMShuffleVectors(load.load, result, shuf, 4, insertBefore);
3815 
3816  return result;
3817 }
3818 
3819 
3820 /** We need to fill in the values for a 4-wide result vector. This
3821  function looks at all of the generated loads and extracts the
3822  appropriate elements from the appropriate loads to assemble the result.
3823  Here the offsets[] parameter gives the 4 offsets from the base pointer
3824  for the four elements of the result.
3825 */
3826 static llvm::Value *
3827 lAssemble4Vector(const std::vector<CoalescedLoadOp> &loadOps,
3828  const int64_t offsets[4], llvm::Instruction *insertBefore) {
3829  llvm::Type *returnType =
3830  llvm::VectorType::get(LLVMTypes::Int32Type, 4);
3831  llvm::Value *result = llvm::UndefValue::get(returnType);
3832 
3833  Debug(SourcePos(), "Starting search for loads [%" PRId64 " %" PRId64 " %"
3834  PRId64 " %" PRId64 "].", offsets[0], offsets[1], offsets[2], offsets[3]);
3835 
3836  // Track whether we have found a valid value for each of the four
3837  // elements of the result
3838  bool set[4] = { false, false, false, false };
3839 
3840  // Loop over all of the loads and check each one to see if it provides
3841  // a value that's applicable to the result
3842  for (int load = 0; load < (int)loadOps.size(); ++load) {
3843  const CoalescedLoadOp &li = loadOps[load];
3844 
3845  switch (li.count) {
3846  case 1:
3847  result = lApplyLoad1(result, li, offsets, set, insertBefore);
3848  break;
3849  case 2:
3850  result = lApplyLoad2(result, li, offsets, set, insertBefore);
3851  break;
3852  case 4:
3853  result = lApplyLoad4(result, li, offsets, set, insertBefore);
3854  break;
3855  default:
3856  FATAL("Unexpected load count in lAssemble4Vector()");
3857  }
3858  }
3859 
3860  Debug(SourcePos(), "Done with search for loads [%" PRId64 " %" PRId64 " %"
3861  PRId64 " %" PRId64 "].", offsets[0], offsets[1], offsets[2], offsets[3]);
3862 
3863  for (int i = 0; i < 4; ++i)
3864  Assert(set[i] == true);
3865 
3866  return result;
3867 }
3868 
3869 #else
3870 
3871 static llvm::Value *
3872 lApplyLoad4s(llvm::Value *result, const std::vector<CoalescedLoadOp> &loadOps,
3873  const int64_t offsets[4], bool set[4],
3874  llvm::Instruction *insertBefore) {
3875  int32_t firstMatchElements[4] = { -1, -1, -1, -1 };
3876  const CoalescedLoadOp *firstMatch = NULL;
3877 
3878  Assert(llvm::isa<llvm::UndefValue>(result));
3879 
3880  for (int load = 0; load < (int)loadOps.size(); ++load) {
3881  const CoalescedLoadOp &loadop = loadOps[load];
3882  if (loadop.count != 4)
3883  continue;
3884 
3885  int32_t matchElements[4] = { -1, -1, -1, -1 };
3886  bool anyMatched = false;
3887  for (int elt = 0; elt < 4; ++elt) {
3888  if (offsets[elt] >= loadop.start &&
3889  offsets[elt] < loadop.start + loadop.count) {
3890  Debug(SourcePos(), "Load 4 @ %" PRId64 " matches for element #%d "
3891  "(value %" PRId64 ")", loadop.start, elt, offsets[elt]);
3892  anyMatched = true;
3893  Assert(set[elt] == false);
3894  matchElements[elt] = offsets[elt] - loadop.start;
3895  set[elt] = true;
3896  }
3897  }
3898 
3899  if (anyMatched) {
3900  if (llvm::isa<llvm::UndefValue>(result)) {
3901  if (firstMatch == NULL) {
3902  firstMatch = &loadop;
3903  for (int i = 0; i < 4; ++i)
3904  firstMatchElements[i] = matchElements[i];
3905  }
3906  else {
3907  int32_t shuffle[4] = { -1, -1, -1, -1 };
3908  for (int i = 0; i < 4; ++i) {
3909  if (firstMatchElements[i] != -1)
3910  shuffle[i] = firstMatchElements[i];
3911  else
3912  shuffle[i] = 4 + matchElements[i];
3913  }
3914  result = LLVMShuffleVectors(firstMatch->load, loadop.load, shuffle,
3915  4, insertBefore);
3916  firstMatch = NULL;
3917  }
3918  }
3919  else {
3920  int32_t shuffle[4] = { -1, -1, -1, -1 };
3921  for (int i = 0; i < 4; ++i) {
3922  if (matchElements[i] != -1)
3923  shuffle[i] = 4 + matchElements[i];
3924  else
3925  shuffle[i] = i;
3926  }
3927  result = LLVMShuffleVectors(result, loadop.load, shuffle, 4,
3928  insertBefore);
3929  }
3930  }
3931  }
3932 
3933  if (firstMatch != NULL && llvm::isa<llvm::UndefValue>(result))
3934  return LLVMShuffleVectors(firstMatch->load, result, firstMatchElements,
3935  4, insertBefore);
3936  else
3937  return result;
3938 }
3939 
3940 
3941 static llvm::Value *
3942 lApplyLoad12s(llvm::Value *result, const std::vector<CoalescedLoadOp> &loadOps,
3943  const int64_t offsets[4], bool set[4],
3944  llvm::Instruction *insertBefore) {
3945  // Loop over all of the loads and check each one to see if it provides
3946  // a value that's applicable to the result
3947  for (int load = 0; load < (int)loadOps.size(); ++load) {
3948  const CoalescedLoadOp &loadop = loadOps[load];
3949  Assert(loadop.count == 1 || loadop.count == 2 || loadop.count == 4);
3950 
3951  if (loadop.count == 1)
3952  result = lApplyLoad1(result, loadop, offsets, set, insertBefore);
3953  else if (loadop.count == 2)
3954  result = lApplyLoad2(result, loadop, offsets, set, insertBefore);
3955  }
3956  return result;
3957 }
3958 
3959 
3960 /** We need to fill in the values for a 4-wide result vector. This
3961  function looks at all of the generated loads and extracts the
3962  appropriate elements from the appropriate loads to assemble the result.
3963  Here the offsets[] parameter gives the 4 offsets from the base pointer
3964  for the four elements of the result.
3965 */
3966 static llvm::Value *
3967 lAssemble4Vector(const std::vector<CoalescedLoadOp> &loadOps,
3968  const int64_t offsets[4], llvm::Instruction *insertBefore) {
3969  llvm::Type *returnType =
3970  llvm::VectorType::get(LLVMTypes::Int32Type, 4);
3971  llvm::Value *result = llvm::UndefValue::get(returnType);
3972 
3973  Debug(SourcePos(), "Starting search for loads [%" PRId64 " %" PRId64 " %"
3974  PRId64 " %" PRId64 "].", offsets[0], offsets[1], offsets[2], offsets[3]);
3975 
3976  // Track whether we have found a valid value for each of the four
3977  // elements of the result
3978  bool set[4] = { false, false, false, false };
3979 
3980  result = lApplyLoad4s(result, loadOps, offsets, set, insertBefore);
3981  result = lApplyLoad12s(result, loadOps, offsets, set, insertBefore);
3982 
3983  Debug(SourcePos(), "Done with search for loads [%" PRId64 " %" PRId64 " %"
3984  PRId64 " %" PRId64 "].", offsets[0], offsets[1], offsets[2], offsets[3]);
3985 
3986  for (int i = 0; i < 4; ++i)
3987  Assert(set[i] == true);
3988 
3989  return result;
3990 }
3991 #endif
3992 
3993 
3994 /** Given the set of loads that we've done and the set of result values to
3995  be computed, this function computes the final llvm::Value *s for each
3996  result vector.
3997  */
3998 static void
3999 lAssembleResultVectors(const std::vector<CoalescedLoadOp> &loadOps,
4000  const std::vector<int64_t> &constOffsets,
4001  std::vector<llvm::Value *> &results,
4002  llvm::Instruction *insertBefore) {
4003  // We work on 4-wide chunks of the final values, even when we're
4004  // computing 8-wide or 16-wide vectors. This gives better code from
4005  // LLVM's SSE/AVX code generators.
4006  Assert((constOffsets.size() % 4) == 0);
4007  std::vector<llvm::Value *> vec4s;
4008  for (int i = 0; i < (int)constOffsets.size(); i += 4)
4009  vec4s.push_back(lAssemble4Vector(loadOps, &constOffsets[i],
4010  insertBefore));
4011 
4012  // And now concatenate 1, 2, or 4 of the 4-wide vectors computed above
4013  // into 4, 8, or 16-wide final result vectors.
4014  int numGathers = constOffsets.size() / g->target->getVectorWidth();
4015  for (int i = 0; i < numGathers; ++i) {
4016  llvm::Value *result = NULL;
4017  switch (g->target->getVectorWidth()) {
4018  case 4:
4019  result = vec4s[i];
4020  break;
4021  case 8:
4022  result = LLVMConcatVectors(vec4s[2*i], vec4s[2*i+1], insertBefore);
4023  break;
4024  case 16: {
4025  llvm::Value *v1 = LLVMConcatVectors(vec4s[4*i], vec4s[4*i+1],
4026  insertBefore);
4027  llvm::Value *v2 = LLVMConcatVectors(vec4s[4*i+2], vec4s[4*i+3],
4028  insertBefore);
4029  result = LLVMConcatVectors(v1, v2, insertBefore);
4030  break;
4031  }
4032  default:
4033  FATAL("Unhandled vector width in lAssembleResultVectors()");
4034  }
4035 
4036  results.push_back(result);
4037  }
4038 }
4039 
4040 
4041 /** Given a call to a gather function, extract the base pointer, the 2/4/8
4042  scale, and the first varying offsets value, and use them to compute the
4043  scalar base pointer that is shared by all of the gathers in the group.
4044  (Thus, this base pointer plus the constant offsets term for each gather
4045  gives the set of addresses to use for each gather.)
4046  */
4047 static llvm::Value *
4048 lComputeBasePtr(llvm::CallInst *gatherInst, llvm::Instruction *insertBefore) {
4049  llvm::Value *basePtr = gatherInst->getArgOperand(0);
4050  llvm::Value *variableOffsets = gatherInst->getArgOperand(1);
4051  llvm::Value *offsetScale = gatherInst->getArgOperand(2);
4052 
4053  // All of the variable offsets values should be the same, due to
4054  // checking for this in GatherCoalescePass::runOnBasicBlock(). Thus,
4055  // extract the first value and use that as a scalar.
4056  llvm::Value *variable = LLVMExtractFirstVectorElement(variableOffsets);
4057  if (variable->getType() == LLVMTypes::Int64Type)
4058  offsetScale = new llvm::ZExtInst(offsetScale, LLVMTypes::Int64Type,
4059  "scale_to64", insertBefore);
4060  llvm::Value *offset =
4061  llvm::BinaryOperator::Create(llvm::Instruction::Mul, variable,
4062  offsetScale, "offset", insertBefore);
4063 
4064  return lGEPInst(basePtr, offset, "new_base", insertBefore);
4065 }
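// Illustrative sketch (not part of ispc): the address arithmetic built by
// lComputeBasePtr() above, written out in scalar form. Because the varying
// offsets were already checked to be uniform across the gang, only the first
// lane's value is needed. The name is hypothetical.
static inline const unsigned char *
lComputeBasePtrModel(const unsigned char *base, int64_t firstVaryingOffset,
                     int64_t offsetScale) {
    // basePtr = base + offsetScale * varyingOffset[0]
    return base + offsetScale * firstVaryingOffset;
}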
4066 
4067 
4068 /** Extract the constant offsets (from the common base pointer) from each
4069  of the gathers in a set to be coalesced. These come in as byte
4070  offsets, but we'll transform them into offsets in terms of the size of
4071  the base scalar type being gathered. (e.g. for an i32 gather, we might
4072  have offsets like <0,4,16,20>, which would be transformed to <0,1,4,5>
4073  here.)
4074  */
4075 static void
4076 lExtractConstOffsets(const std::vector<llvm::CallInst *> &coalesceGroup,
4077  int elementSize, std::vector<int64_t> *constOffsets) {
4078  int width = g->target->getVectorWidth();
4079  *constOffsets = std::vector<int64_t>(coalesceGroup.size() * width, 0);
4080 
4081  int64_t *endPtr = &((*constOffsets)[0]);
4082  for (int i = 0; i < (int)coalesceGroup.size(); ++i, endPtr += width) {
4083  llvm::Value *offsets = coalesceGroup[i]->getArgOperand(3);
4084  int nElts;
4085  bool ok = LLVMExtractVectorInts(offsets, endPtr, &nElts);
4086  Assert(ok && nElts == width);
4087  }
4088 
4089  for (int i = 0; i < (int)constOffsets->size(); ++i)
4090  (*constOffsets)[i] /= elementSize;
4091 }
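// Illustrative sketch (not part of ispc): the byte-to-element conversion done
// by lExtractConstOffsets(), using the example from its comment. For an i32
// gather (elementSize == 4), byte offsets <0,4,16,20> become element offsets
// <0,1,4,5>. The function name is hypothetical.
static void lByteToElementOffsetsExample() {
    int64_t offsets[4] = { 0, 4, 16, 20 };    // byte offsets from the base
    const int elementSize = 4;                // gathering 32-bit elements
    for (int i = 0; i < 4; ++i)
        offsets[i] /= elementSize;            // now { 0, 1, 4, 5 }
}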
4092 
4093 
4094 /** Actually do the coalescing. We have a set of gathers all accessing
4095  addresses of the form:
4096 
4097  (ptr + {1,2,4,8} * varyingOffset) + constOffset, a.k.a.
4098  basePtr + constOffset
4099 
4100  where varyingOffset actually has the same value across all of the SIMD
4101  lanes and where the part in parentheses has the same value for all of
4102  the gathers in the group.
4103  */
4104 static bool
4105 lCoalesceGathers(const std::vector<llvm::CallInst *> &coalesceGroup) {
4106  llvm::Instruction *insertBefore = coalesceGroup[0];
4107 
4108  // First, compute the shared base pointer for all of the gathers
4109  llvm::Value *basePtr = lComputeBasePtr(coalesceGroup[0], insertBefore);
4110 
4111  int elementSize = 0;
4112  if (coalesceGroup[0]->getType() == LLVMTypes::Int32VectorType ||
4113  coalesceGroup[0]->getType() == LLVMTypes::FloatVectorType)
4114  elementSize = 4;
4115  else if (coalesceGroup[0]->getType() == LLVMTypes::Int64VectorType ||
4116  coalesceGroup[0]->getType() == LLVMTypes::DoubleVectorType)
4117  elementSize = 8;
4118  else
4119  FATAL("Unexpected gather type in lCoalesceGathers");
4120 
4121  // Extract the constant offsets from the gathers into the constOffsets
4122  // vector: the first vectorWidth elements will be those for the first
4123  // gather, the next vectorWidth those for the next gather, and so
4124  // forth.
4125  std::vector<int64_t> constOffsets;
4126  lExtractConstOffsets(coalesceGroup, elementSize, &constOffsets);
4127 
4128  // Determine a set of loads to perform to get all of the values we need
4129  // loaded.
4130  std::vector<CoalescedLoadOp> loadOps;
4131  lSelectLoads(constOffsets, &loadOps);
4132 
4133  lCoalescePerfInfo(coalesceGroup, loadOps);
4134 
4135  // Actually emit load instructions for them
4136  lEmitLoads(basePtr, loadOps, elementSize, insertBefore);
4137 
4138  // Now, for any loads that give us <8 x i32> vectors, split their
4139  // values into two <4 x i32> vectors; it turns out that LLVM gives us
4140  // better code on AVX when we assemble the pieces from 4-wide vectors.
4141  loadOps = lSplit8WideLoads(loadOps, insertBefore);
4142 
4143  // Given all of these chunks of values, shuffle together a vector that
4144  // gives us each result value; the i'th element of results[] gives the
4145  // result for the i'th gather in coalesceGroup.
4146  std::vector<llvm::Value *> results;
4147  lAssembleResultVectors(loadOps, constOffsets, results, insertBefore);
4148 
4149  // Finally, replace each of the original gathers with the instruction
4150  // that gives the value from the coalescing process.
4151  Assert(results.size() == coalesceGroup.size());
4152  for (int i = 0; i < (int)results.size(); ++i) {
4153  llvm::Instruction *ir = llvm::dyn_cast<llvm::Instruction>(results[i]);
4154  Assert(ir != NULL);
4155 
4156  llvm::Type *origType = coalesceGroup[i]->getType();
4157  if (origType != ir->getType())
4158  ir = new llvm::BitCastInst(ir, origType, ir->getName(),
4159  coalesceGroup[i]);
4160 
4161  // Previously, all of the instructions to compute the final result
4162  // were inserted into the basic block here; now we remove the very last
4163  // one of them (the one that holds the final result) from the basic block.
4164  // This way, the following ReplaceInstWithInst() call will operate
4165  // successfully. (It expects that the second argument not be in any
4166  // basic block.)
4167  ir->removeFromParent();
4168 
4169  llvm::ReplaceInstWithInst(coalesceGroup[i], ir);
4170  }
4171 
4172  return true;
4173 }
4174 
4175 
4176 /** Given an instruction, returns true if the instruction may write to
4177  memory. This is a conservative test in that it may return true for
4178  some instructions that don't actually end up writing to memory, but
4179  should never return false for an instruction that does write to
4180  memory. */
4181 static bool
4182 lInstructionMayWriteToMemory(llvm::Instruction *inst) {
4183  if (llvm::isa<llvm::StoreInst>(inst) ||
4184  llvm::isa<llvm::AtomicRMWInst>(inst) ||
4185  llvm::isa<llvm::AtomicCmpXchgInst>(inst))
4186  // FIXME: we could be less conservative and try to allow stores if
4187  // we are sure that the pointers don't overlap..
4188  return true;
4189 
4190  // Otherwise, any call instruction that doesn't have an attribute
4191  // indicating it won't write to memory has to be treated as a potential
4192  // store.
4193  llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst);
4194  if (ci != NULL) {
4195  llvm::Function *calledFunc = ci->getCalledFunction();
4196  if (calledFunc == NULL)
4197  return true;
4198 
4199  if (calledFunc->onlyReadsMemory() || calledFunc->doesNotAccessMemory())
4200  return false;
4201  return true;
4202  }
4203 
4204  return false;
4205 }
4206 
4207 
4208 bool
4210  DEBUG_START_PASS("GatherCoalescePass");
4211 
4212  llvm::Function *gatherFuncs[] = {
4213  m->module->getFunction("__pseudo_gather_factored_base_offsets32_i32"),
4214  m->module->getFunction("__pseudo_gather_factored_base_offsets32_float"),
4215  m->module->getFunction("__pseudo_gather_factored_base_offsets64_i32"),
4216  m->module->getFunction("__pseudo_gather_factored_base_offsets64_float"),
4217  };
4218  int nGatherFuncs = sizeof(gatherFuncs) / sizeof(gatherFuncs[0]);
4219 
4220  bool modifiedAny = false;
4221 
4222  restart:
4223  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e;
4224  ++iter) {
4225  // Iterate over all of the instructions and look for calls to
4226  // __pseudo_gather_factored_base_offsets{32,64}_{i32,float} calls.
4227  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
4228  if (callInst == NULL)
4229  continue;
4230 
4231  llvm::Function *calledFunc = callInst->getCalledFunction();
4232  if (calledFunc == NULL)
4233  continue;
4234 
4235  int i;
4236  for (i = 0; i < nGatherFuncs; ++i)
4237  if (gatherFuncs[i] != NULL && calledFunc == gatherFuncs[i])
4238  break;
4239  if (i == nGatherFuncs)
4240  // Doesn't match any of the types of gathers we care about
4241  continue;
4242 
4243  SourcePos pos;
4244  lGetSourcePosFromMetadata(callInst, &pos);
4245  Debug(pos, "Checking for coalescable gathers starting here...");
4246 
4247  llvm::Value *base = callInst->getArgOperand(0);
4248  llvm::Value *variableOffsets = callInst->getArgOperand(1);
4249  llvm::Value *offsetScale = callInst->getArgOperand(2);
4250  llvm::Value *mask = callInst->getArgOperand(4);
4251 
4252  // To apply this optimization, we need a set of one or more gathers
4253  // that fulfill the following conditions:
4254  //
4255  // - Mask all on
4256  // - The variable offsets to all have the same value (i.e., to be
4257  // uniform).
4258  // - Same base pointer, variable offsets, and offset scale (for
4259  // more than one gather)
4260  //
4261  // Then and only then do we have a common base pointer with all
4262  // offsets from it being constants (in which case we can potentially
4263  // coalesce).
4264  if (lGetMaskStatus(mask) != ALL_ON)
4265  continue;
4266 
4267  if (!LLVMVectorValuesAllEqual(variableOffsets))
4268  continue;
4269 
4270  // coalesceGroup stores the set of gathers that we're going to try to
4271  // coalesce over
4272  std::vector<llvm::CallInst *> coalesceGroup;
4273  coalesceGroup.push_back(callInst);
4274 
4275  // Start iterating at the instruction after the initial gather;
4276  // look at the remainder of instructions in the basic block (up
4277  // until we reach a write to memory) to try to find any other
4278  // gathers that can coalesce with this one.
4279  llvm::BasicBlock::iterator fwdIter = iter;
4280  ++fwdIter;
4281  for (; fwdIter != bb.end(); ++fwdIter) {
4282  // Must stop once we come to an instruction that may write to
4283  // memory; otherwise we could end up moving a read before this
4284  // write.
4285  if (lInstructionMayWriteToMemory(&*fwdIter))
4286  break;
4287 
4288  llvm::CallInst *fwdCall = llvm::dyn_cast<llvm::CallInst>(&*fwdIter);
4289  if (fwdCall == NULL ||
4290  fwdCall->getCalledFunction() != calledFunc)
4291  continue;
4292 
4293  SourcePos fwdPos;
4294  // TODO: need to redesign metadata attached to pseudo calls,
4295  // LLVM drops metadata frequently and it results in bad diagnostics.
4296  lGetSourcePosFromMetadata(fwdCall, &fwdPos);
4297 
4298  if (g->debugPrint) {
4299  if (base != fwdCall->getArgOperand(0)) {
4300  Debug(fwdPos, "base pointers mismatch");
4301  LLVMDumpValue(base);
4302  LLVMDumpValue(fwdCall->getArgOperand(0));
4303  }
4304  if (variableOffsets != fwdCall->getArgOperand(1)) {
4305  Debug(fwdPos, "varying offsets mismatch");
4306  LLVMDumpValue(variableOffsets);
4307  LLVMDumpValue(fwdCall->getArgOperand(1));
4308  }
4309  if (offsetScale != fwdCall->getArgOperand(2)) {
4310  Debug(fwdPos, "offset scales mismatch");
4311  LLVMDumpValue(offsetScale);
4312  LLVMDumpValue(fwdCall->getArgOperand(2));
4313  }
4314  if (mask != fwdCall->getArgOperand(4)) {
4315  Debug(fwdPos, "masks mismatch");
4316  LLVMDumpValue(mask);
4317  LLVMDumpValue(fwdCall->getArgOperand(4));
4318  }
4319  }
4320 
4321  if (base == fwdCall->getArgOperand(0) &&
4322  variableOffsets == fwdCall->getArgOperand(1) &&
4323  offsetScale == fwdCall->getArgOperand(2) &&
4324  mask == fwdCall->getArgOperand(4)) {
4325  Debug(fwdPos, "This gather can be coalesced.");
4326  coalesceGroup.push_back(fwdCall);
4327 
4328  if (coalesceGroup.size() == 4)
4329  // FIXME: untested heuristic: don't try to coalesce
4330  // over a window of more than 4 gathers, so that we
4331  // don't cause too much register pressure and end up
4332  // spilling to memory anyway.
4333  break;
4334  }
4335  else
4336  Debug(fwdPos, "This gather doesn't match the initial one.");
4337  }
4338 
4339  Debug(pos, "Done with checking for matching gathers");
4340 
4341  // Now that we have a group of gathers, see if we can coalesce them
4342  // into something more efficient than the original set of gathers.
4343  if (lCoalesceGathers(coalesceGroup)) {
4344  modifiedAny = true;
4345  goto restart;
4346  }
4347  }
4348 
4349  DEBUG_END_PASS("GatherCoalescePass");
4350 
4351  return modifiedAny;
4352 }
4353 
4354 
4355 static llvm::Pass *
4356 CreateGatherCoalescePass() {
4357  return new GatherCoalescePass;
4358 }
4359 
4360 
4361 ///////////////////////////////////////////////////////////////////////////
4362 // ReplacePseudoMemoryOpsPass
4363 
4364 /** For any gathers and scatters remaining after the GSToLoadStorePass
4365  runs, we need to turn them into actual native gathers and scatters.
4366  This task is handled by the ReplacePseudoMemoryOpsPass here.
4367  */
4368 class ReplacePseudoMemoryOpsPass : public llvm::BasicBlockPass {
4369 public:
4370  static char ID;
4371  ReplacePseudoMemoryOpsPass() : BasicBlockPass(ID) { }
4372 
4373 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
4374  const char *getPassName() const { return "Replace Pseudo Memory Ops"; }
4375 #else // LLVM 4.0+
4376  llvm::StringRef getPassName() const { return "Replace Pseudo Memory Ops"; }
4377 #endif
4378  bool runOnBasicBlock(llvm::BasicBlock &BB);
4379 };
4380 
4381 
4382 char ReplacePseudoMemoryOpsPass::ID = 0;
4383 
4384 /** This routine attempts to determine if the given pointer in lvalue is
4385  pointing to stack-allocated memory. It's conservative in that it
4386  should never return true for non-stack allocated memory, but may return
4387  false for memory that actually is stack allocated. The basic strategy
4388  is to traverse through the operands and see if the pointer originally
4389  comes from an AllocaInst.
4390 */
4391 static bool
4392 lIsSafeToBlend(llvm::Value *lvalue) {
4393  llvm::BitCastInst *bc = llvm::dyn_cast<llvm::BitCastInst>(lvalue);
4394  if (bc != NULL)
4395  return lIsSafeToBlend(bc->getOperand(0));
4396  else {
4397  llvm::AllocaInst *ai = llvm::dyn_cast<llvm::AllocaInst>(lvalue);
4398  if (ai) {
4399  llvm::Type *type = ai->getType();
4400  llvm::PointerType *pt =
4401  llvm::dyn_cast<llvm::PointerType>(type);
4402  assert(pt != NULL);
4403  type = pt->getElementType();
4404  llvm::ArrayType *at;
4405  while ((at = llvm::dyn_cast<llvm::ArrayType>(type))) {
4406  type = at->getElementType();
4407  }
4408  llvm::VectorType *vt =
4409  llvm::dyn_cast<llvm::VectorType>(type);
4410  return (vt != NULL &&
4411  (int)vt->getNumElements() == g->target->getVectorWidth());
4412  }
4413  else {
4414  llvm::GetElementPtrInst *gep =
4415  llvm::dyn_cast<llvm::GetElementPtrInst>(lvalue);
4416  if (gep != NULL)
4417  return lIsSafeToBlend(gep->getOperand(0));
4418  else
4419  return false;
4420  }
4421  }
4422 }
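// Worked example (editorial addition, not from the original source), showing
// the traversal lIsSafeToBlend() performs for IR along these lines:
//
//     %buf = alloca [4 x <W x float>]      ; W == target vector width
//     %p   = getelementptr ... %buf, ...
//     %q   = bitcast ... %p ...
//
// lIsSafeToBlend(%q) follows %q -> %p -> %buf, strips the array dimensions,
// and returns true because the innermost element type is a vector of the
// target's width; a pointer whose chain does not end at such an alloca
// conservatively yields false.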
4423 
4424 
4425 static bool
4426 lReplacePseudoMaskedStore(llvm::CallInst *callInst) {
4427  struct LMSInfo {
4428  LMSInfo(const char *pname, const char *bname, const char *msname) {
4429  pseudoFunc = m->module->getFunction(pname);
4430  blendFunc = m->module->getFunction(bname);
4431  maskedStoreFunc = m->module->getFunction(msname);
4432  Assert(pseudoFunc != NULL && blendFunc != NULL &&
4433  maskedStoreFunc != NULL);
4434  }
4435  llvm::Function *pseudoFunc;
4436  llvm::Function *blendFunc;
4437  llvm::Function *maskedStoreFunc;
4438  };
4439 
4440  LMSInfo msInfo[] = {
4441  LMSInfo("__pseudo_masked_store_i8", "__masked_store_blend_i8",
4442  "__masked_store_i8"),
4443  LMSInfo("__pseudo_masked_store_i16", "__masked_store_blend_i16",
4444  "__masked_store_i16"),
4445  LMSInfo("__pseudo_masked_store_i32", "__masked_store_blend_i32",
4446  "__masked_store_i32"),
4447  LMSInfo("__pseudo_masked_store_float", "__masked_store_blend_float",
4448  "__masked_store_float"),
4449  LMSInfo("__pseudo_masked_store_i64", "__masked_store_blend_i64",
4450  "__masked_store_i64"),
4451  LMSInfo("__pseudo_masked_store_double", "__masked_store_blend_double",
4452  "__masked_store_double")
4453  };
4454 
4455  LMSInfo *info = NULL;
4456  for (unsigned int i = 0; i < sizeof(msInfo) / sizeof(msInfo[0]); ++i) {
4457  if (msInfo[i].pseudoFunc != NULL &&
4458  callInst->getCalledFunction() == msInfo[i].pseudoFunc) {
4459  info = &msInfo[i];
4460  break;
4461  }
4462  }
4463  if (info == NULL)
4464  return false;
4465 
4466  llvm::Value *lvalue = callInst->getArgOperand(0);
4467  llvm::Value *rvalue = callInst->getArgOperand(1);
4468  llvm::Value *mask = callInst->getArgOperand(2);
4469 
4470  // We need to choose between doing the load + blend + store trick,
4471  // or serializing the masked store. Even on targets with a native
4472  // masked store instruction, the blend approach is preferable when it
4473  // is safe, since it lets us keep values in registers rather than going out to the stack.
4474  bool doBlend = (!g->opt.disableBlendedMaskedStores &&
4475  lIsSafeToBlend(lvalue));
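// Editorial note (not from the original source): the "load + blend + store
// trick" mentioned above amounts to roughly
//     old = load *lvalue;  new = select(mask, rvalue, old);  store new, *lvalue;
// which rewrites even the masked-off lanes with their old values. That is
// only safe when no one else can observe the store, which is why it is
// restricted to stack-allocated destinations via lIsSafeToBlend() above.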
4476 
4477  // Generate the call to the appropriate masked store function and
4478  // replace the __pseudo_* one with it.
4479  llvm::Function *fms = doBlend ? info->blendFunc : info->maskedStoreFunc;
4480  llvm::Instruction *inst = lCallInst(fms, lvalue, rvalue, mask, "", callInst);
4481  lCopyMetadata(inst, callInst);
4482 
4483  callInst->eraseFromParent();
4484  return true;
4485 }
4486 
4487 
4488 static bool
4489 lReplacePseudoGS(llvm::CallInst *callInst) {
4490  struct LowerGSInfo {
4491  LowerGSInfo(const char *pName, const char *aName, bool ig, bool ip)
4492  : isGather(ig), isPrefetch(ip) {
4493  pseudoFunc = m->module->getFunction(pName);
4494  actualFunc = m->module->getFunction(aName);
4495  }
4496  llvm::Function *pseudoFunc;
4497  llvm::Function *actualFunc;
4498  const bool isGather;
4499  const bool isPrefetch;
4500  };
4501 
4502  LowerGSInfo lgsInfo[] = {
4503  LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true, false),
4504  LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true, false),
4505  LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true, false),
4506  LowerGSInfo("__pseudo_gather32_float", "__gather32_float", true, false),
4507  LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true, false),
4508  LowerGSInfo("__pseudo_gather32_double", "__gather32_double", true, false),
4509 
4510  LowerGSInfo("__pseudo_gather64_i8", "__gather64_i8", true, false),
4511  LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true, false),
4512  LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true, false),
4513  LowerGSInfo("__pseudo_gather64_float", "__gather64_float", true, false),
4514  LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true, false),
4515  LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true, false),
4516 
4517  LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8",
4518  "__gather_factored_base_offsets32_i8", true, false),
4519  LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16",
4520  "__gather_factored_base_offsets32_i16", true, false),
4521  LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32",
4522  "__gather_factored_base_offsets32_i32", true, false),
4523  LowerGSInfo("__pseudo_gather_factored_base_offsets32_float",
4524  "__gather_factored_base_offsets32_float", true, false),
4525  LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64",
4526  "__gather_factored_base_offsets32_i64", true, false),
4527  LowerGSInfo("__pseudo_gather_factored_base_offsets32_double",
4528  "__gather_factored_base_offsets32_double", true, false),
4529 
4530  LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8",
4531  "__gather_factored_base_offsets64_i8", true, false),
4532  LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16",
4533  "__gather_factored_base_offsets64_i16", true, false),
4534  LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32",
4535  "__gather_factored_base_offsets64_i32", true, false),
4536  LowerGSInfo("__pseudo_gather_factored_base_offsets64_float",
4537  "__gather_factored_base_offsets64_float", true, false),
4538  LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64",
4539  "__gather_factored_base_offsets64_i64", true, false),
4540  LowerGSInfo("__pseudo_gather_factored_base_offsets64_double",
4541  "__gather_factored_base_offsets64_double", true, false),
4542 
4543  LowerGSInfo("__pseudo_gather_base_offsets32_i8",
4544  "__gather_base_offsets32_i8", true, false),
4545  LowerGSInfo("__pseudo_gather_base_offsets32_i16",
4546  "__gather_base_offsets32_i16", true, false),
4547  LowerGSInfo("__pseudo_gather_base_offsets32_i32",
4548  "__gather_base_offsets32_i32", true, false),
4549  LowerGSInfo("__pseudo_gather_base_offsets32_float",
4550  "__gather_base_offsets32_float", true, false),
4551  LowerGSInfo("__pseudo_gather_base_offsets32_i64",
4552  "__gather_base_offsets32_i64", true, false),
4553  LowerGSInfo("__pseudo_gather_base_offsets32_double",
4554  "__gather_base_offsets32_double", true, false),
4555 
4556  LowerGSInfo("__pseudo_gather_base_offsets64_i8",
4557  "__gather_base_offsets64_i8", true, false),
4558  LowerGSInfo("__pseudo_gather_base_offsets64_i16",
4559  "__gather_base_offsets64_i16", true, false),
4560  LowerGSInfo("__pseudo_gather_base_offsets64_i32",
4561  "__gather_base_offsets64_i32", true, false),
4562  LowerGSInfo("__pseudo_gather_base_offsets64_float",
4563  "__gather_base_offsets64_float", true, false),
4564  LowerGSInfo("__pseudo_gather_base_offsets64_i64",
4565  "__gather_base_offsets64_i64", true, false),
4566  LowerGSInfo("__pseudo_gather_base_offsets64_double",
4567  "__gather_base_offsets64_double", true, false),
4568 
4569  LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false, false),
4570  LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false, false),
4571  LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false, false),
4572  LowerGSInfo("__pseudo_scatter32_float", "__scatter32_float", false, false),
4573  LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false, false),
4574  LowerGSInfo("__pseudo_scatter32_double", "__scatter32_double", false, false),
4575 
4576  LowerGSInfo("__pseudo_scatter64_i8", "__scatter64_i8", false, false),
4577  LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false, false),
4578  LowerGSInfo("__pseudo_scatter64_i32", "__scatter64_i32", false, false),
4579  LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false, false),
4580  LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false, false),
4581  LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false, false),
4582 
4583  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8",
4584  "__scatter_factored_base_offsets32_i8", false, false),
4585  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16",
4586  "__scatter_factored_base_offsets32_i16", false, false),
4587  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32",
4588  "__scatter_factored_base_offsets32_i32", false, false),
4589  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float",
4590  "__scatter_factored_base_offsets32_float", false, false),
4591  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64",
4592  "__scatter_factored_base_offsets32_i64", false, false),
4593  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double",
4594  "__scatter_factored_base_offsets32_double", false, false),
4595 
4596  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8",
4597  "__scatter_factored_base_offsets64_i8", false, false),
4598  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16",
4599  "__scatter_factored_base_offsets64_i16", false, false),
4600  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32",
4601  "__scatter_factored_base_offsets64_i32", false, false),
4602  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float",
4603  "__scatter_factored_base_offsets64_float", false, false),
4604  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64",
4605  "__scatter_factored_base_offsets64_i64", false, false),
4606  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double",
4607  "__scatter_factored_base_offsets64_double", false, false),
4608 
4609 
4610  LowerGSInfo("__pseudo_scatter_base_offsets32_i8",
4611  "__scatter_base_offsets32_i8", false, false),
4612  LowerGSInfo("__pseudo_scatter_base_offsets32_i16",
4613  "__scatter_base_offsets32_i16", false, false),
4614  LowerGSInfo("__pseudo_scatter_base_offsets32_i32",
4615  "__scatter_base_offsets32_i32", false, false),
4616  LowerGSInfo("__pseudo_scatter_base_offsets32_float",
4617  "__scatter_base_offsets32_float", false, false),
4618  LowerGSInfo("__pseudo_scatter_base_offsets32_i64",
4619  "__scatter_base_offsets32_i64", false, false),
4620  LowerGSInfo("__pseudo_scatter_base_offsets32_double",
4621  "__scatter_base_offsets32_double", false, false),
4622 
4623  LowerGSInfo("__pseudo_scatter_base_offsets64_i8",
4624  "__scatter_base_offsets64_i8", false, false),
4625  LowerGSInfo("__pseudo_scatter_base_offsets64_i16",
4626  "__scatter_base_offsets64_i16", false, false),
4627  LowerGSInfo("__pseudo_scatter_base_offsets64_i32",
4628  "__scatter_base_offsets64_i32", false, false),
4629  LowerGSInfo("__pseudo_scatter_base_offsets64_float",
4630  "__scatter_base_offsets64_float", false, false),
4631  LowerGSInfo("__pseudo_scatter_base_offsets64_i64",
4632  "__scatter_base_offsets64_i64", false, false),
4633  LowerGSInfo("__pseudo_scatter_base_offsets64_double",
4634  "__scatter_base_offsets64_double", false, false),
4635 
4636  LowerGSInfo("__pseudo_prefetch_read_varying_1",
4637  "__prefetch_read_varying_1", false, true),
4638  LowerGSInfo("__pseudo_prefetch_read_varying_1_native",
4639  "__prefetch_read_varying_1_native", false, true),
4640 
4641  LowerGSInfo("__pseudo_prefetch_read_varying_2",
4642  "__prefetch_read_varying_2", false, true),
4643  LowerGSInfo("__pseudo_prefetch_read_varying_2_native",
4644  "__prefetch_read_varying_2_native", false, true),
4645 
4646  LowerGSInfo("__pseudo_prefetch_read_varying_3",
4647  "__prefetch_read_varying_3", false, true),
4648  LowerGSInfo("__pseudo_prefetch_read_varying_3_native",
4649  "__prefetch_read_varying_3_native", false, true),
4650 
4651  LowerGSInfo("__pseudo_prefetch_read_varying_nt",
4652  "__prefetch_read_varying_nt", false, true),
4653  LowerGSInfo("__pseudo_prefetch_read_varying_nt_native",
4654  "__prefetch_read_varying_nt_native", false, true),
4655  };
4656 
4657  llvm::Function *calledFunc = callInst->getCalledFunction();
4658 
4659  LowerGSInfo *info = NULL;
4660  for (unsigned int i = 0; i < sizeof(lgsInfo) / sizeof(lgsInfo[0]); ++i) {
4661  if (lgsInfo[i].pseudoFunc != NULL &&
4662  calledFunc == lgsInfo[i].pseudoFunc) {
4663  info = &lgsInfo[i];
4664  break;
4665  }
4666  }
4667  if (info == NULL)
4668  return false;
4669 
4670  Assert(info->actualFunc != NULL);
4671 
4672  // Get the source position from the metadata attached to the call
4673  // instruction so that we can issue PerformanceWarning()s below.
4674  SourcePos pos;
4675  bool gotPosition = lGetSourcePosFromMetadata(callInst, &pos);
4676 
4677  callInst->setCalledFunction(info->actualFunc);
4678  if (gotPosition && g->target->getVectorWidth() > 1) {
4679  if (info->isGather)
4680  PerformanceWarning(pos, "Gather required to load value.");
4681  else if (!info->isPrefetch)
4682  PerformanceWarning(pos, "Scatter required to store value.");
4683  }
4684  return true;
4685 }
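// Editorial example (not from the original source): the rewrite performed by
// lReplacePseudoGS() is just a change of callee. A call such as
//     call @__pseudo_gather64_float(<operands>, <mask>)
// becomes
//     call @__gather64_float(<operands>, <mask>)
// via setCalledFunction(); the operands are left untouched, and a
// PerformanceWarning() is issued when the result is a real gather or scatter.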
4686 
4687 
4688 bool
4689 ReplacePseudoMemoryOpsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
4690  DEBUG_START_PASS("ReplacePseudoMemoryOpsPass");
4691 
4692  bool modifiedAny = false;
4693 
4694  restart:
4695  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
4696  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
4697  if (callInst == NULL ||
4698  callInst->getCalledFunction() == NULL)
4699  continue;
4700 
4701  if (lReplacePseudoGS(callInst)) {
4702  modifiedAny = true;
4703  goto restart;
4704  }
4705  else if (lReplacePseudoMaskedStore(callInst)) {
4706  modifiedAny = true;
4707  goto restart;
4708  }
4709  }
4710 
4711  DEBUG_END_PASS("ReplacePseudoMemoryOpsPass");
4712 
4713  return modifiedAny;
4714 }
4715 
4716 
4717 static llvm::Pass *
4718 CreateReplacePseudoMemoryOpsPass() {
4719  return new ReplacePseudoMemoryOpsPass;
4720 }
4721 
4722 
4723 ///////////////////////////////////////////////////////////////////////////
4724 // IsCompileTimeConstantPass
4725 
4726 /** LLVM IR implementations of target-specific functions may include calls
4727  to the functions "bool __is_compile_time_constant_*(...)"; these allow
4728  them to have specialized code paths for cases where the corresponding value
4729  is known at compile time. For masks, for example, this lets them avoid the
4730  cost of a MOVMSK call at runtime to compute the mask's value when the mask
4731  is in fact known at compile time.
4732 
4733  This pass resolves these calls into either 'true' or 'false' values so
4734  that later optimization passes can operate with these as constants.
4735 
4736  See stdlib.m4 for a number of uses of this idiom.
4737  */
4738 
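// Editorial sketch (hypothetical, simplified from the idiom used in
// stdlib.m4; not copied from the original source): the calls this pass
// resolves are used roughly like
//
//     if (__is_compile_time_constant_mask(mask)) {
//         // mask is known at compile time: take a specialized path, e.g.
//         // skip the runtime "are all lanes on?" test entirely
//     } else {
//         // general path: compute the mask's value at runtime
//     }
//
// Once the call is folded to a constant true or false below, later passes
// delete whichever branch becomes dead.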
4739 class IsCompileTimeConstantPass : public llvm::BasicBlockPass {
4740 public:
4741  static char ID;
4742  IsCompileTimeConstantPass(bool last = false) : BasicBlockPass(ID) {
4743  isLastTry = last;
4744  }
4745 
4746 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
4747  const char *getPassName() const { return "Resolve \"is compile time constant\""; }
4748 #else // LLVM 4.0+
4749  llvm::StringRef getPassName() const { return "Resolve \"is compile time constant\""; }
4750 #endif
4751  bool runOnBasicBlock(llvm::BasicBlock &BB);
4752 
4753  bool isLastTry;
4754 };
4755 
4757 char IsCompileTimeConstantPass::ID = 0;
4758 
4759 bool
4761  DEBUG_START_PASS("IsCompileTimeConstantPass");
4762 
4763  llvm::Function *funcs[] = {
4764  m->module->getFunction("__is_compile_time_constant_mask"),
4765  m->module->getFunction("__is_compile_time_constant_uniform_int32"),
4766  m->module->getFunction("__is_compile_time_constant_varying_int32")
4767  };
4768 
4769  bool modifiedAny = false;
4770  restart:
4771  for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
4772  // Iterate through the instructions looking for calls to the
4773  // __is_compile_time_constant_*() functions
4774  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
4775  if (callInst == NULL)
4776  continue;
4777 
4778  int j;
4779  int nFuncs = sizeof(funcs) / sizeof(funcs[0]);
4780  for (j = 0; j < nFuncs; ++j) {
4781  if (funcs[j] != NULL && callInst->getCalledFunction() == funcs[j])
4782  break;
4783  }
4784  if (j == nFuncs)
4785  // not a __is_compile_time_constant_* function
4786  continue;
4787 
4788  // This optimization pass can be disabled with either the (poorly
4789  // named) disableGatherScatterFlattening option or with
4790  // disableMaskAllOnOptimizations.
4791  if (g->opt.disableGatherScatterFlattening ||
4792  g->opt.disableMaskAllOnOptimizations) {
4793  llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMFalse);
4794  modifiedAny = true;
4795  goto restart;
4796  }
4797 
4798  // Is it a constant? Bingo, turn the call's value into a constant
4799  // true value.
4800  llvm::Value *operand = callInst->getArgOperand(0);
4801  if (llvm::isa<llvm::Constant>(operand)) {
4802  llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMTrue);
4803  modifiedAny = true;
4804  goto restart;
4805  }
4806 
4807  // This pass runs multiple times during optimization. Up until the
4808  // very last time, it only replaces the call with a 'true' if the
4809  // value is known to be constant and otherwise leaves the call
4810  // alone, in case further optimization passes can help resolve its
4811  // value. The last time through, it eventually has to give up, and
4812  // replaces any remaining ones with 'false' constants.
4813  if (isLastTry) {
4814  llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMFalse);
4815  modifiedAny = true;
4816  goto restart;
4817  }
4818  }
4819 
4820  DEBUG_END_PASS("IsCompileTimeConstantPass");
4821 
4822  return modifiedAny;
4823 }
4824 
4825 
4826 static llvm::Pass *
4827 CreateIsCompileTimeConstantPass(bool isLastTry) {
4828  return new IsCompileTimeConstantPass(isLastTry);
4829 }
4830 
4831 //////////////////////////////////////////////////////////////////////////
4832 // DebugPass
4833 
4834 /** This pass is inserted after each optimization pass that we want to
4835  debug; it dumps the LLVM IR to stderr, preceded by the name and number
4836  of the optimization pass that just ran.
4837  */
4838 class DebugPass : public llvm::ModulePass {
4839 public:
4840  static char ID;
4841  DebugPass(char * output) : ModulePass(ID) {
4842  sprintf(str_output, "%s", output);
4843  }
4844 
4845 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
4846  const char *getPassName() const { return "Dump LLVM IR"; }
4847 #else // LLVM 4.0+
4848  llvm::StringRef getPassName() const { return "Dump LLVM IR"; }
4849 #endif
4850  bool runOnModule(llvm::Module &m);
4851 
4852 private:
4853  char str_output[100];
4854 };
4855 
4856 char DebugPass::ID = 0;
4857 
4858 bool
4859 DebugPass::runOnModule(llvm::Module &module) {
4860  fprintf(stderr, "%s", str_output);
4861  fflush(stderr);
4862  module.dump();
4863  return true;
4864 }
4865 
4866 static llvm::Pass *
4867 CreateDebugPass(char * output) {
4868  return new DebugPass(output);
4869 }
4870 
4871 ///////////////////////////////////////////////////////////////////////////
4872 // MakeInternalFuncsStaticPass
4873 
4874 /** There are a number of target-specific functions that we use during
4875  these optimization passes. By the time we are done with optimization,
4876  any uses of these should be inlined and no calls to these functions
4877  should remain. This pass marks all of these functions as having
4878  private linkage so that subsequent passes can eliminate them as dead
4879  code, thus cleaning up the final code output by the compiler. We can't
4880  just declare these as static from the start, however, since then they
4881  end up being eliminated as dead code during early optimization passes
4882  even though we may need to generate calls to them during later
4883  optimization passes.
4884  */
4885 class MakeInternalFuncsStaticPass : public llvm::ModulePass {
4886 public:
4887  static char ID;
4888  MakeInternalFuncsStaticPass(bool last = false) : ModulePass(ID) {
4889  }
4890 
4891  void getAnalysisUsage(llvm::AnalysisUsage &AU) const {
4892  AU.setPreservesCFG();
4893  }
4894 
4895 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
4896  const char *getPassName() const { return "Make internal funcs \"static\""; }
4897 #else // LLVM 4.0+
4898  llvm::StringRef getPassName() const { return "Make internal funcs \"static\""; }
4899 #endif
4900  bool runOnModule(llvm::Module &m);
4901 };
4902 
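// Editorial note (illustrative, not from the original source; the exact
// signature is elided): this pass only changes linkage, e.g. a leftover
// builtin definition such as
//     define <W x i32> @__masked_load_i32(...) { ... }
// becomes
//     define internal <W x i32> @__masked_load_i32(...) { ... }
// so that, once every call to it has been inlined away, ordinary dead-code
// elimination can drop it from the final module.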
4904 char MakeInternalFuncsStaticPass::ID = 0;
4905 bool
4906 MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
4907  const char *names[] = {
4908  "__avg_up_uint8",
4909  "__avg_up_int8",
4910  "__avg_up_uint16",
4911  "__avg_up_int16",
4912  "__avg_down_uint8",
4913  "__avg_down_int8",
4914  "__avg_down_uint16",
4915  "__avg_down_int16",
4916  "__fast_masked_vload",
4917  "__gather_factored_base_offsets32_i8", "__gather_factored_base_offsets32_i16",
4918  "__gather_factored_base_offsets32_i32", "__gather_factored_base_offsets32_i64",
4919  "__gather_factored_base_offsets32_float", "__gather_factored_base_offsets32_double",
4920  "__gather_factored_base_offsets64_i8", "__gather_factored_base_offsets64_i16",
4921  "__gather_factored_base_offsets64_i32", "__gather_factored_base_offsets64_i64",
4922  "__gather_factored_base_offsets64_float", "__gather_factored_base_offsets64_double",
4923  "__gather_base_offsets32_i8", "__gather_base_offsets32_i16",
4924  "__gather_base_offsets32_i32", "__gather_base_offsets32_i64",
4925  "__gather_base_offsets32_float", "__gather_base_offsets32_double",
4926  "__gather_base_offsets64_i8", "__gather_base_offsets64_i16",
4927  "__gather_base_offsets64_i32", "__gather_base_offsets64_i64",
4928  "__gather_base_offsets64_float", "__gather_base_offsets64_double",
4929  "__gather32_i8", "__gather32_i16",
4930  "__gather32_i32", "__gather32_i64",
4931  "__gather32_float", "__gather32_double",
4932  "__gather64_i8", "__gather64_i16",
4933  "__gather64_i32", "__gather64_i64",
4934  "__gather64_float", "__gather64_double",
4935  "__gather_elt32_i8", "__gather_elt32_i16",
4936  "__gather_elt32_i32", "__gather_elt32_i64",
4937  "__gather_elt32_float", "__gather_elt32_double",
4938  "__gather_elt64_i8", "__gather_elt64_i16",
4939  "__gather_elt64_i32", "__gather_elt64_i64",
4940  "__gather_elt64_float", "__gather_elt64_double",
4941  "__masked_load_i8", "__masked_load_i16",
4942  "__masked_load_i32", "__masked_load_i64",
4943  "__masked_load_float", "__masked_load_double",
4944  "__masked_store_i8", "__masked_store_i16",
4945  "__masked_store_i32", "__masked_store_i64",
4946  "__masked_store_float", "__masked_store_double",
4947  "__masked_store_blend_i8", "__masked_store_blend_i16",
4948  "__masked_store_blend_i32", "__masked_store_blend_i64",
4949  "__masked_store_blend_float", "__masked_store_blend_double",
4950  "__scatter_factored_base_offsets32_i8", "__scatter_factored_base_offsets32_i16",
4951  "__scatter_factored_base_offsets32_i32", "__scatter_factored_base_offsets32_i64",
4952  "__scatter_factored_base_offsets32_float", "__scatter_factored_base_offsets32_double",
4953  "__scatter_factored_base_offsets64_i8", "__scatter_factored_base_offsets64_i16",
4954  "__scatter_factored_base_offsets64_i32", "__scatter_factored_base_offsets64_i64",
4955  "__scatter_factored_base_offsets64_float", "__scatter_factored_base_offsets64_double",
4956  "__scatter_base_offsets32_i8", "__scatter_base_offsets32_i16",
4957  "__scatter_base_offsets32_i32", "__scatter_base_offsets32_i64",
4958  "__scatter_base_offsets32_float", "__scatter_base_offsets32_double",
4959  "__scatter_base_offsets64_i8", "__scatter_base_offsets64_i16",
4960  "__scatter_base_offsets64_i32", "__scatter_base_offsets64_i64",
4961  "__scatter_base_offsets64_float", "__scatter_base_offsets64_double",
4962  "__scatter_elt32_i8", "__scatter_elt32_i16",
4963  "__scatter_elt32_i32", "__scatter_elt32_i64",
4964  "__scatter_elt32_float", "__scatter_elt32_double",
4965  "__scatter_elt64_i8", "__scatter_elt64_i16",
4966  "__scatter_elt64_i32", "__scatter_elt64_i64",
4967  "__scatter_elt64_float", "__scatter_elt64_double",
4968  "__scatter32_i8", "__scatter32_i16",
4969  "__scatter32_i32", "__scatter32_i64",
4970  "__scatter32_float", "__scatter32_double",
4971  "__scatter64_i8", "__scatter64_i16",
4972  "__scatter64_i32", "__scatter64_i64",
4973  "__scatter64_float", "__scatter64_double",
4974  "__prefetch_read_varying_1", "__prefetch_read_varying_2",
4975  "__prefetch_read_varying_3", "__prefetch_read_varying_nt",
4976  "__keep_funcs_live",
4977  };
4978 
4979  bool modifiedAny = false;
4980  int count = sizeof(names) / sizeof(names[0]);
4981  for (int i = 0; i < count; ++i) {
4982  llvm::Function *f = m->module->getFunction(names[i]);
4983  if (f != NULL && f->empty() == false) {
4984  f->setLinkage(llvm::GlobalValue::InternalLinkage);
4985  modifiedAny = true;
4986  }
4987  }
4988 
4989  return modifiedAny;
4990 }
4991 
4992 
4993 static llvm::Pass *
4994 CreateMakeInternalFuncsStaticPass() {
4995  return new MakeInternalFuncsStaticPass;
4996 }
4997 
4998 
4999 ///////////////////////////////////////////////////////////////////////////
5000 // PeepholePass
5001 
5002 class PeepholePass : public llvm::BasicBlockPass {
5003 public:
5004  PeepholePass();
5005 
5006 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
5007  const char *getPassName() const { return "Peephole Optimizations"; }
5008 #else // LLVM 4.0+
5009  llvm::StringRef getPassName() const { return "Peephole Optimizations"; }
5010 #endif
5011  bool runOnBasicBlock(llvm::BasicBlock &BB);
5012 
5013  static char ID;
5014 };
5015 
5016 char PeepholePass::ID = 0;
5017 
5019  : BasicBlockPass(ID) {
5020 }
5021 
5022 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_3
5023 
5024 using namespace llvm::PatternMatch;
5025 
5026 template<typename Op_t, unsigned Opcode>
5027 struct CastClassTypes_match {
5028  Op_t Op;
5029  const llvm::Type *fromType, *toType;
5030 
5031  CastClassTypes_match(const Op_t &OpMatch, const llvm::Type *f,
5032  const llvm::Type *t)
5033  : Op(OpMatch), fromType(f), toType(t) {}
5034 
5035  template<typename OpTy>
5036  bool match(OpTy *V) {
5037  if (llvm::Operator *O = llvm::dyn_cast<llvm::Operator>(V))
5038  return (O->getOpcode() == Opcode && Op.match(O->getOperand(0)) &&
5039  O->getType() == toType &&
5040  O->getOperand(0)->getType() == fromType);
5041  return false;
5042  }
5043 };
5044 
5045 template<typename OpTy>
5046 inline CastClassTypes_match<OpTy, llvm::Instruction::SExt>
5047 m_SExt8To16(const OpTy &Op) {
5048  return CastClassTypes_match<OpTy, llvm::Instruction::SExt>(
5049  Op,
5050  LLVMTypes::Int8VectorType,
5051  LLVMTypes::Int16VectorType);
5052 }
5053 
5054 template<typename OpTy>
5055 inline CastClassTypes_match<OpTy, llvm::Instruction::ZExt>
5056 m_ZExt8To16(const OpTy &Op) {
5057  return CastClassTypes_match<OpTy, llvm::Instruction::ZExt>(
5058  Op,
5059  LLVMTypes::Int8VectorType,
5060  LLVMTypes::Int16VectorType);
5061 }
5062 
5063 
5064 template<typename OpTy>
5065 inline CastClassTypes_match<OpTy, llvm::Instruction::Trunc>
5066 m_Trunc16To8(const OpTy &Op) {
5067  return CastClassTypes_match<OpTy, llvm::Instruction::Trunc>(
5068  Op,
5069  LLVMTypes::Int16VectorType,
5070  LLVMTypes::Int8VectorType);
5071 }
5072 
5073 template<typename OpTy>
5074 inline CastClassTypes_match<OpTy, llvm::Instruction::SExt>
5075 m_SExt16To32(const OpTy &Op) {
5076  return CastClassTypes_match<OpTy, llvm::Instruction::SExt>(
5077  Op,
5078  LLVMTypes::Int16VectorType,
5079  LLVMTypes::Int32VectorType);
5080 }
5081 
5082 template<typename OpTy>
5083 inline CastClassTypes_match<OpTy, llvm::Instruction::ZExt>
5084 m_ZExt16To32(const OpTy &Op) {
5085  return CastClassTypes_match<OpTy, llvm::Instruction::ZExt>(
5086  Op,
5087  LLVMTypes::Int16VectorType,
5088  LLVMTypes::Int32VectorType);
5089 }
5090 
5091 
5092 template<typename OpTy>
5093 inline CastClassTypes_match<OpTy, llvm::Instruction::Trunc>
5094 m_Trunc32To16(const OpTy &Op) {
5095  return CastClassTypes_match<OpTy, llvm::Instruction::Trunc>(
5096  Op,
5097  LLVMTypes::Int32VectorType,
5098  LLVMTypes::Int16VectorType);
5099 }
5100 
5101 template<typename Op_t>
5102 struct UDiv2_match {
5103  Op_t Op;
5104 
5105  UDiv2_match(const Op_t &OpMatch)
5106  : Op(OpMatch) {}
5107 
5108  template<typename OpTy>
5109  bool match(OpTy *V) {
5110  llvm::BinaryOperator *bop;
5111  llvm::ConstantDataVector *cdv;
5112  if ((bop = llvm::dyn_cast<llvm::BinaryOperator>(V)) &&
5113  (cdv = llvm::dyn_cast<llvm::ConstantDataVector>(bop->getOperand(1))) &&
5114  cdv->getSplatValue() != NULL) {
5115  const llvm::APInt &apInt = cdv->getUniqueInteger();
5116 
5117  switch (bop->getOpcode()) {
5118  case llvm::Instruction::UDiv:
5119  // divide by 2
5120  return (apInt.isIntN(2) && Op.match(bop->getOperand(0)));
5121  case llvm::Instruction::LShr:
5122  // logical shift right by 1, i.e. unsigned divide by 2
5123  return (apInt.isIntN(1) && Op.match(bop->getOperand(0)));
5124  default:
5125  return false;
5126  }
5127  }
5128  return false;
5129  }
5130 };
5131 
5132 template<typename V>
5133 inline UDiv2_match<V>
5134 m_UDiv2(const V &v) {
5135  return UDiv2_match<V>(v);
5136 }
5137 
5138 template<typename Op_t>
5139 struct SDiv2_match {
5140  Op_t Op;
5141 
5142  SDiv2_match(const Op_t &OpMatch)
5143  : Op(OpMatch) {}
5144 
5145  template<typename OpTy>
5146  bool match(OpTy *V) {
5147  llvm::BinaryOperator *bop;
5148  llvm::ConstantDataVector *cdv;
5149  if ((bop = llvm::dyn_cast<llvm::BinaryOperator>(V)) &&
5150  (cdv = llvm::dyn_cast<llvm::ConstantDataVector>(bop->getOperand(1))) &&
5151  cdv->getSplatValue() != NULL) {
5152  const llvm::APInt &apInt = cdv->getUniqueInteger();
5153 
5154  switch (bop->getOpcode()) {
5155  case llvm::Instruction::SDiv:
5156  // divide by 2
5157  return (apInt.isIntN(2) && Op.match(bop->getOperand(0)));
5158  case llvm::Instruction::AShr:
5159  // arithmetic shift right by 1, i.e. signed divide by 2
5160  return (apInt.isIntN(1) && Op.match(bop->getOperand(0)));
5161  default:
5162  return false;
5163  }
5164  }
5165  return false;
5166  }
5167 };
5168 
5169 template<typename V>
5170 inline SDiv2_match<V>
5171 m_SDiv2(const V &v) {
5172  return SDiv2_match<V>(v);
5173 }
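// Editorial summary (not from the original source): m_UDiv2 / m_SDiv2 above
// both recognize a vector "divide by two", whether it is written directly as
// a udiv/sdiv by a splat constant 2 or has already been strength-reduced to
// an lshr/ashr by a splat constant 1. The lMatchAvg*() helpers further below
// combine them with the widening/narrowing cast matchers to recognize
// averaging idioms such as (uint8)(((uint16)a + (uint16)b + 1) / 2).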
5174 
5175 // Returns true if the given function has a call to an intrinsic function
5176 // in its definition.
5177 static bool
5178 lHasIntrinsicInDefinition(llvm::Function *func) {
5179  llvm::Function::iterator bbiter = func->begin();
5180  for (; bbiter != func->end(); ++bbiter) {
5181  for (llvm::BasicBlock::iterator institer = bbiter->begin();
5182  institer != bbiter->end(); ++institer) {
5183  if (llvm::isa<llvm::IntrinsicInst>(institer))
5184  return true;
5185  }
5186  }
5187  return false;
5188 }
5189 
5190 static llvm::Instruction *
5191 lGetBinaryIntrinsic(const char *name, llvm::Value *opa, llvm::Value *opb) {
5192  llvm::Function *func = m->module->getFunction(name);
5193  Assert(func != NULL);
5194 
5195  // Make sure that the definition of the llvm::Function has a call to an
5196  // intrinsic function in its instructions; otherwise we will generate
5197  // infinite loops where we "helpfully" turn the default implementations
5198  // of target builtins like __avg_up_uint8 that are implemented with plain
5199  // arithmetic ops into recursive calls to themselves.
5200  if (lHasIntrinsicInDefinition(func))
5201  return lCallInst(func, opa, opb, name);
5202  else
5203  return NULL;
5204 }
5205 
5206 //////////////////////////////////////////////////
5207 
5208 static llvm::Instruction *
5209 lMatchAvgUpUInt8(llvm::Value *inst) {
5210  // (unsigned int8)(((unsigned int16)a + (unsigned int16)b + 1)/2)
5211  llvm::Value *opa, *opb;
5212  const llvm::APInt *delta;
5213  if (match(inst, m_Trunc16To8(m_UDiv2(m_CombineOr(
5214  m_CombineOr(
5215  m_Add(m_ZExt8To16(m_Value(opa)),
5216  m_Add(m_ZExt8To16(m_Value(opb)), m_APInt(delta))),
5217  m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_APInt(delta)),
5218  m_ZExt8To16(m_Value(opb)))),
5219  m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_ZExt8To16(m_Value(opb))),
5220  m_APInt(delta))))))) {
5221  if (delta->isIntN(1) == false)
5222  return NULL;
5223 
5224  return lGetBinaryIntrinsic("__avg_up_uint8", opa, opb);
5225  }
5226  return NULL;
5227 }
5228 
5229 
5230 static llvm::Instruction *
5231 lMatchAvgDownUInt8(llvm::Value *inst) {
5232  // (unsigned int8)(((unsigned int16)a + (unsigned int16)b)/2)
5233  llvm::Value *opa, *opb;
5234  if (match(inst, m_Trunc16To8(m_UDiv2(
5235  m_Add(m_ZExt8To16(m_Value(opa)),
5236  m_ZExt8To16(m_Value(opb))))))) {
5237  return lGetBinaryIntrinsic("__avg_down_uint8", opa, opb);
5238  }
5239  return NULL;
5240 }
5241 
5242 static llvm::Instruction *
5243 lMatchAvgUpUInt16(llvm::Value *inst) {
5244  // (unsigned int16)(((unsigned int32)a + (unsigned int32)b + 1)/2)
5245  llvm::Value *opa, *opb;
5246  const llvm::APInt *delta;
5247  if (match(inst, m_Trunc32To16(m_UDiv2(m_CombineOr(
5248  m_CombineOr(
5249  m_Add(m_ZExt16To32(m_Value(opa)),
5250  m_Add(m_ZExt16To32(m_Value(opb)), m_APInt(delta))),
5251  m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_APInt(delta)),
5252  m_ZExt16To32(m_Value(opb)))),
5253  m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_ZExt16To32(m_Value(opb))),
5254  m_APInt(delta))))))) {
5255  if (delta->isIntN(1) == false)
5256  return NULL;
5257 
5258  return lGetBinaryIntrinsic("__avg_up_uint16", opa, opb);
5259  }
5260  return NULL;
5261 }
5262 
5263 
5264 static llvm::Instruction *
5265 lMatchAvgDownUInt16(llvm::Value *inst) {
5266  // (unsigned int16)(((unsigned int32)a + (unsigned int32)b)/2)
5267  llvm::Value *opa, *opb;
5268  if (match(inst, m_Trunc32To16(m_UDiv2(
5269  m_Add(m_ZExt16To32(m_Value(opa)),
5270  m_ZExt16To32(m_Value(opb))))))) {
5271  return lGetBinaryIntrinsic("__avg_down_uint16", opa, opb);
5272  }
5273  return NULL;
5274 }
5275 
5276 
5277 static llvm::Instruction *
5278 lMatchAvgUpInt8(llvm::Value *inst) {
5279  // (int8)(((int16)a + (int16)b + 1)/2)
5280  llvm::Value *opa, *opb;
5281  const llvm::APInt *delta;
5282  if (match(inst, m_Trunc16To8(m_SDiv2(m_CombineOr(
5283  m_CombineOr(
5284  m_Add(m_SExt8To16(m_Value(opa)),
5285  m_Add(m_SExt8To16(m_Value(opb)), m_APInt(delta))),
5286  m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_APInt(delta)),
5287  m_SExt8To16(m_Value(opb)))),
5288  m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_SExt8To16(m_Value(opb))),
5289  m_APInt(delta))))))) {
5290  if (delta->isIntN(1) == false)
5291  return NULL;
5292 
5293  return lGetBinaryIntrinsic("__avg_up_int8", opa, opb);
5294  }
5295  return NULL;
5296 }
5297 
5298 
5299 static llvm::Instruction *
5300 lMatchAvgDownInt8(llvm::Value *inst) {
5301  // (int8)(((int16)a + (int16)b)/2)
5302  llvm::Value *opa, *opb;
5303  if (match(inst, m_Trunc16To8(m_SDiv2(
5304  m_Add(m_SExt8To16(m_Value(opa)),
5305  m_SExt8To16(m_Value(opb))))))) {
5306  return lGetBinaryIntrinsic("__avg_down_int8", opa, opb);
5307  }
5308  return NULL;
5309 }
5310 
5311 static llvm::Instruction *
5312 lMatchAvgUpInt16(llvm::Value *inst) {
5313  // (int16)(((int32)a + (int32)b + 1)/2)
5314  llvm::Value *opa, *opb;
5315  const llvm::APInt *delta;
5316  if (match(inst, m_Trunc32To16(m_SDiv2(m_CombineOr(
5317  m_CombineOr(
5318  m_Add(m_SExt16To32(m_Value(opa)),
5319  m_Add(m_SExt16To32(m_Value(opb)), m_APInt(delta))),
5320  m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_APInt(delta)),
5321  m_SExt16To32(m_Value(opb)))),
5322  m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_SExt16To32(m_Value(opb))),
5323  m_APInt(delta))))))) {
5324  if (delta->isIntN(1) == false)
5325  return NULL;
5326 
5327  return lGetBinaryIntrinsic("__avg_up_int16", opa, opb);
5328  }
5329  return NULL;
5330 }
5331 
5332 static llvm::Instruction *
5333 lMatchAvgDownInt16(llvm::Value *inst) {
5334  // (int16)(((int32)a + (int32)b)/2)
5335  llvm::Value *opa, *opb;
5336  if (match(inst, m_Trunc32To16(m_SDiv2(
5337  m_Add(m_SExt16To32(m_Value(opa)),
5338  m_SExt16To32(m_Value(opb))))))) {
5339  return lGetBinaryIntrinsic("__avg_down_int16", opa, opb);
5340  }
5341  return NULL;
5342 }
5343 #endif // !LLVM_3_2
5344 
5345 
5346 bool
5347 PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) {
5348  DEBUG_START_PASS("PeepholePass");
5349 
5350  bool modifiedAny = false;
5351  restart:
5352  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
5353  llvm::Instruction *inst = &*iter;
5354 
5355  llvm::Instruction *builtinCall = NULL;
5356 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_3
5357  if (!builtinCall)
5358  builtinCall = lMatchAvgUpUInt8(inst);
5359  if (!builtinCall)
5360  builtinCall = lMatchAvgUpUInt16(inst);
5361  if (!builtinCall)
5362  builtinCall = lMatchAvgDownUInt8(inst);
5363  if (!builtinCall)
5364  builtinCall = lMatchAvgDownUInt16(inst);
5365  if (!builtinCall)
5366  builtinCall = lMatchAvgUpInt8(inst);
5367  if (!builtinCall)
5368  builtinCall = lMatchAvgUpInt16(inst);
5369  if (!builtinCall)
5370  builtinCall = lMatchAvgDownInt8(inst);
5371  if (!builtinCall)
5372  builtinCall = lMatchAvgDownInt16(inst);
5373 #endif // !LLVM_3_2
5374  if (builtinCall != NULL) {
5375  llvm::ReplaceInstWithInst(inst, builtinCall);
5376  modifiedAny = true;
5377  goto restart;
5378  }
5379  }
5380 
5381  DEBUG_END_PASS("PeepholePass");
5382 
5383  return modifiedAny;
5384 }
5385 
5386 static llvm::Pass *
5387 CreatePeepholePass() {
5388  return new PeepholePass;
5389 }
5390 
5391 /** Given an llvm::Value known to be an integer, return its value as
5392  an int64_t.
5393 */
5394 static int64_t
5395 lGetIntValue(llvm::Value *offset) {
5396  llvm::ConstantInt *intOffset = llvm::dyn_cast<llvm::ConstantInt>(offset);
5397  Assert(intOffset && (intOffset->getBitWidth() == 32 ||
5398  intOffset->getBitWidth() == 64));
5399  return intOffset->getSExtValue();
5400 }
5401 
5402 ///////////////////////////////////////////////////////////////////////////
5403 // ReplaceStdlibShiftPass
5404 
5405 class ReplaceStdlibShiftPass : public llvm::BasicBlockPass {
5406 public:
5407  static char ID;
5408  ReplaceStdlibShiftPass() : BasicBlockPass(ID) {
5409  }
5410 
5411 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
5412  const char *getPassName() const { return "Resolve \"replace extract insert chains\""; }
5413 #else // LLVM 4.0+
5414  llvm::StringRef getPassName() const { return "Resolve \"replace extract insert chains\""; }
5415 #endif
5416  bool runOnBasicBlock(llvm::BasicBlock &BB);
5417 
5418 };
5419 
5421 char ReplaceStdlibShiftPass::ID = 0;
5422 bool
5423 ReplaceStdlibShiftPass::runOnBasicBlock(llvm::BasicBlock &bb) {
5424  DEBUG_START_PASS("ReplaceStdlibShiftPass");
5425  bool modifiedAny = false;
5426 
5427  llvm::Function *shifts[6];
5428  shifts[0] = m->module->getFunction("__shift_i8");
5429  shifts[1] = m->module->getFunction("__shift_i16");
5430  shifts[2] = m->module->getFunction("__shift_i32");
5431  shifts[3] = m->module->getFunction("__shift_i64");
5432  shifts[4] = m->module->getFunction("__shift_float");
5433  shifts[5] = m->module->getFunction("__shift_double");
5434 
5435  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
5436  llvm::Instruction *inst = &*iter;
5437 
5438  if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst)) {
5439  llvm::Function *func = ci->getCalledFunction();
5440  for (int i = 0; i < 6; i++) {
5441  if (shifts[i] && (shifts[i] == func)) {
5442  // we matched a call
5443  llvm::Value *shiftedVec = ci->getArgOperand(0);
5444  llvm::Value *shiftAmt = ci->getArgOperand(1);
5445  if (llvm::isa<llvm::Constant>(shiftAmt)) {
5446  int vectorWidth = g->target->getVectorWidth();
5447  int * shuffleVals = new int[vectorWidth];
5448  int shiftInt = lGetIntValue(shiftAmt);
5449  for (int i = 0; i < vectorWidth; i++) {
5450  int s = i + shiftInt;
5451  s = (s < 0) ? vectorWidth : s;
5452  s = (s >= vectorWidth) ? vectorWidth : s;
5453  shuffleVals[i] = s;
5454  }
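// Editorial worked example (not from the original source): with
// vectorWidth == 8 and shiftInt == 1, shuffleVals is {1,2,3,4,5,6,7,8}.
// Indices 0..7 pick lanes of shiftedVec and index 8 (== vectorWidth) picks
// lane 0 of zeroVec, so output lane i receives input lane i+1 and the
// vacated lane is filled with zero.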
5455  llvm::Value *shuffleIdxs = LLVMInt32Vector(shuffleVals);
5456  llvm::Value *zeroVec = llvm::ConstantAggregateZero::get(shiftedVec->getType());
5457  llvm::Value *shuffle = new llvm::ShuffleVectorInst(shiftedVec, zeroVec,
5458  shuffleIdxs, "vecShift", ci);
5459  ci->replaceAllUsesWith(shuffle);
5460  modifiedAny = true;
5461  delete [] shuffleVals;
5462  } else {
5463  PerformanceWarning(SourcePos(), "Stdlib shift() called without constant shift amount.");
5464  }
5465  }
5466  }
5467  }
5468  }
5469 
5470  DEBUG_END_PASS("ReplaceStdlibShiftPass");
5471 
5472  return modifiedAny;
5473 }
5474 
5475 
5476 static llvm::Pass *
5477 CreateReplaceStdlibShiftPass() {
5478  return new ReplaceStdlibShiftPass();
5479 }
5480 
5481 
5482 
5483 ///////////////////////////////////////////////////////////////////////////////
5484 // FixBooleanSelect
5485 //
5486 // The problem is that in LLVM 3.3 the optimizer doesn't like
5487 // the following instruction sequence:
5488 // %cmp = fcmp olt <8 x float> %a, %b
5489 // %sext_cmp = sext <8 x i1> %cmp to <8 x i32>
5490 // %new_mask = and <8 x i32> %sext_cmp, %mask
5491 // and optimizes it to the following:
5492 // %cmp = fcmp olt <8 x float> %a, %b
5493 // %cond = select <8 x i1> %cmp, <8 x i32> %mask, <8 x i32> zeroinitializer
5494 //
5495 // It wouldn't be a problem if codegen produced good code for it. But it
5496 // doesn't, especially for vectors larger than native vectors.
5497 //
5498 // This optimization reverts this pattern and should be the last one before
5499 // code gen.
5500 //
5501 // Note that this problem was introduced in LLVM 3.3. But in LLVM 3.4 it was
5502 // fixed. See commit r194542.
5503 //
5504 // After LLVM 3.3 this optimization should probably stay around for experimental
5505 // purposes, and code should be compared with and without it from time to time
5506 // to make sure that LLVM does the right thing.
5507 ///////////////////////////////////////////////////////////////////////////////
5508 
5509 class FixBooleanSelectPass : public llvm::FunctionPass {
5510 public:
5511  static char ID;
5512  FixBooleanSelectPass() :FunctionPass(ID) {}
5513 
5514 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
5515  const char *getPassName() const { return "Resolve \"replace extract insert chains\""; }
5516 #else // LLVM 4.0+
5517  llvm::StringRef getPassName() const { return "Resolve \"replace extract insert chains\""; }
5518 #endif
5519  bool runOnFunction(llvm::Function &F);
5520 
5521 private:
5522  llvm::Instruction* fixSelect(llvm::SelectInst* sel, llvm::SExtInst* sext);
5523 };
5524 
5525 char FixBooleanSelectPass::ID = 0;
5526 
5527 llvm::Instruction* FixBooleanSelectPass::fixSelect(llvm::SelectInst* sel, llvm::SExtInst* sext) {
5528  // Select instruction result type and its integer equivalent
5529  llvm::VectorType *orig_type = llvm::dyn_cast<llvm::VectorType>(sel->getType());
5530  llvm::VectorType *int_type = llvm::VectorType::getInteger(orig_type);
5531 
5532  // Result value and optional pointer to instruction to delete
5533  llvm::Instruction *result = 0, *optional_to_delete = 0;
5534 
5535  // It can be a vector of integers or a vector of floating point values.
5536  if (orig_type->getElementType()->isIntegerTy()) {
5537  // Generate sext+and, remove select.
5538  result = llvm::BinaryOperator::CreateAnd(sext, sel->getTrueValue(), "and_mask", sel);
5539  } else {
5540  llvm::BitCastInst* bc = llvm::dyn_cast<llvm::BitCastInst>(sel->getTrueValue());
5541 
5542  if (bc && bc->hasOneUse() && bc->getSrcTy()->isIntOrIntVectorTy() && bc->getSrcTy()->isVectorTy() &&
5543  llvm::isa<llvm::Instruction>(bc->getOperand(0)) &&
5544  llvm::dyn_cast<llvm::Instruction>(bc->getOperand(0))->getParent() == sel->getParent()) {
5545  // The bitcast is casting from an integer type and its operand is an instruction in the same basic block (otherwise it's unsafe to use it).
5546  // bitcast+select => sext+and+bitcast
5547  // Create and
5548  llvm::BinaryOperator* and_inst = llvm::BinaryOperator::CreateAnd(sext, bc->getOperand(0), "and_mask", sel);
5549  // Bitcast back to original type
5550  result = new llvm::BitCastInst(and_inst, sel->getType(), "bitcast_mask_out", sel);
5551  // Original bitcast will be removed
5552  optional_to_delete = bc;
5553  } else {
5554  // General case: select => bitcast+sext+and+bitcast
5555  // Bitcast
5556  llvm::BitCastInst* bc_in = new llvm::BitCastInst(sel->getTrueValue(), int_type, "bitcast_mask_in", sel);
5557  // And
5558  llvm::BinaryOperator* and_inst = llvm::BinaryOperator::CreateAnd(sext, bc_in, "and_mask", sel);
5559  // Bitcast back to original type
5560  result = new llvm::BitCastInst(and_inst, sel->getType(), "bitcast_mask_out", sel);
5561  }
5562  }
5563 
5564  // Done, finalize.
5565  sel->replaceAllUsesWith(result);
5566  sel->eraseFromParent();
5567  if (optional_to_delete) {
5568  optional_to_delete->eraseFromParent();
5569  }
5570 
5571  return result;
5572 }
5573 
5574 bool
5575 FixBooleanSelectPass::runOnFunction(llvm::Function &F) {
5576  bool modifiedAny = false;
5577 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_3 // LLVM 3.3 only
5578 
5579  // Don't optimize generic targets.
5580  if (g->target->getISA() == Target::GENERIC) {
5581  return false;
5582  }
5583 
5584  for (llvm::Function::iterator I = F.begin(), E = F.end();
5585  I != E; ++I) {
5586  llvm::BasicBlock* bb = &*I;
5587  for (llvm::BasicBlock::iterator iter = bb->begin(), e = bb->end(); iter != e; ++iter) {
5588  llvm::Instruction *inst = &*iter;
5589 
5590  llvm::CmpInst *cmp = llvm::dyn_cast<llvm::CmpInst>(inst);
5591 
5592  if (cmp &&
5593  cmp->getType()->isVectorTy() &&
5594  cmp->getType()->getVectorElementType()->isIntegerTy(1)) {
5595 
5596  // Search for select instruction uses.
5597  int selects = 0;
5598  llvm::VectorType* sext_type = 0;
5599  for (llvm::Instruction::use_iterator it=cmp->use_begin(); it!=cmp->use_end(); ++it ) {
5600  llvm::SelectInst* sel = llvm::dyn_cast<llvm::SelectInst>(*it);
5601  if (sel &&
5602  sel->getType()->isVectorTy() &&
5603  sel->getType()->getScalarSizeInBits() > 1) {
5604  selects++;
5605  // We pick the first one; in the typical case all of the select types are the same.
5606  sext_type = llvm::dyn_cast<llvm::VectorType>(sel->getType());
5607  break;
5608  }
5609  }
5610  if (selects == 0) {
5611  continue;
5612  }
5613  // Get an integer equivalent, if it's not yet an integer.
5614  sext_type = llvm::VectorType::getInteger(sext_type);
5615 
5616  // Do transformation
5617  llvm::BasicBlock::iterator iter_copy=iter;
5618  llvm::Instruction* next_inst = &*(++iter_copy);
5619  // Create or reuse sext
5620  llvm::SExtInst* sext = llvm::dyn_cast<llvm::SExtInst>(next_inst);
5621  if (sext &&
5622  sext->getOperand(0) == cmp &&
5623  sext->getDestTy() == sext_type) {
5624  // This sext can be reused
5625  } else {
5626  if (next_inst) {
5627  sext = new llvm::SExtInst(cmp, sext_type, "sext_cmp", next_inst);
5628  } else {
5629  sext = new llvm::SExtInst(cmp, sext_type, "sext_cmp", bb);
5630  }
5631  }
5632 
5633  // Walk and fix selects
5634  std::vector<llvm::SelectInst*> sel_uses;
5635  for (llvm::Instruction::use_iterator it=cmp->use_begin(); it!=cmp->use_end(); ++it) {
5636  llvm::SelectInst* sel = llvm::dyn_cast<llvm::SelectInst>(*it);
5637  if (sel &&
5638  sel->getType()->getScalarSizeInBits() == sext_type->getScalarSizeInBits()) {
5639 
5640  // Check that second operand is zero.
5641  llvm::Constant* false_cond = llvm::dyn_cast<llvm::Constant>(sel->getFalseValue());
5642  if (false_cond &&
5643  false_cond->isZeroValue()) {
5644  sel_uses.push_back(sel);
5645  modifiedAny = true;
5646  }
5647  }
5648  }
5649 
5650  for (int i=0; i<sel_uses.size(); i++) {
5651  fixSelect(sel_uses[i], sext);
5652  }
5653  }
5654  }
5655  }
5656 
5657 #endif // LLVM 3.3
5658 
5659  return modifiedAny;
5660 }
5661 
5662 
5663 static llvm::Pass *
5664 CreateFixBooleanSelectPass() {
5665  return new FixBooleanSelectPass();
5666 }
5667 
5668 #ifdef ISPC_NVPTX_ENABLED
5669 ///////////////////////////////////////////////////////////////////////////////
5670 // Detect addrspace(3)
5671 ///////////////////////////////////////////////////////////////////////////////
5672 
5673 class PromoteLocalToPrivatePass: public llvm::BasicBlockPass
5674 {
5675  public:
5676  static char ID; // Pass identification, replacement for typeid
5677  PromoteLocalToPrivatePass() : BasicBlockPass(ID) {}
5678 
5679  bool runOnBasicBlock(llvm::BasicBlock &BB);
5680 };
5681 
5682 char PromoteLocalToPrivatePass::ID = 0;
5683 
5684 bool
5685 PromoteLocalToPrivatePass::runOnBasicBlock(llvm::BasicBlock &BB)
5686 {
5687  std::vector<llvm::AllocaInst*> Allocas;
5688 
5689  bool modifiedAny = false;
5690 
5691 #if 1
5692 restart:
5693  for (llvm::BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
5694  {
5695  llvm::Instruction *inst = &*I;
5696  if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst))
5697  {
5698  llvm::Function *func = ci->getCalledFunction();
5699  if (func && func->getName() == "llvm.trap")
5700  {
5701  std::vector<llvm::Type*> funcTyArgs;
5702  llvm::FunctionType *funcTy = llvm::FunctionType::get(
5703  /*Result=*/llvm::Type::getVoidTy(*g->ctx),
5704  /*Params=*/funcTyArgs,
5705  /*isVarArg=*/false);
5706  llvm::InlineAsm *trap_ptx = llvm::InlineAsm::get(funcTy, "trap;", "", false);
5707  assert(trap_ptx != NULL);
5708  llvm::Instruction *trap_call = llvm::CallInst::Create(trap_ptx);
5709  assert(trap_call != NULL);
5710  llvm::ReplaceInstWithInst(ci, trap_call);
5711  modifiedAny = true;
5712  goto restart;
5713  }
5714  }
5715  }
5716 #endif
5717 
5718 #if 0
5719  llvm::Function *cvtFunc = m->module->getFunction("__cvt_loc2gen_var");
5720 
5721  // Find allocas that are safe to promote, by looking at all instructions in
5722  // the entry node
5723  for (llvm::BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
5724  {
5725  llvm::Instruction *inst = &*I;
5726  if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst))
5727  {
5728  llvm::Function *func = ci->getCalledFunction();
5729  if (cvtFunc && (cvtFunc == func))
5730  {
5731 #if 0
5732  fprintf(stderr , "--found cvt-- name= %s \n",
5733  I->getName().str().c_str());
5734 #endif
5735  llvm::AllocaInst *alloca = new llvm::AllocaInst(LLVMTypes::Int64Type, "opt_loc2var", ci);
5736  assert(alloca != NULL);
5737 #if 0
5738  const int align = 8; // g->target->getNativeVectorAlignment();
5739  alloca->setAlignment(align);
5740 #endif
5741  ci->replaceAllUsesWith(alloca);
5742  modifiedAny = true;
5743  }
5744  }
5745  }
5746 #endif
5747  return modifiedAny;
5748 }
5749 
5750 static llvm::Pass *
5751 CreatePromoteLocalToPrivatePass() {
5752  return new PromoteLocalToPrivatePass();
5753 }
5754 
5755 
5756 
5757 #endif /* ISPC_NVPTX_ENABLED */
5758 