Intel SPMD Program Compiler  1.10.0
opt.cpp
1 /*
2  Copyright (c) 2010-2016, Intel Corporation
3  All rights reserved.
4 
5  Redistribution and use in source and binary forms, with or without
6  modification, are permitted provided that the following conditions are
7  met:
8 
9  * Redistributions of source code must retain the above copyright
10  notice, this list of conditions and the following disclaimer.
11 
12  * Redistributions in binary form must reproduce the above copyright
13  notice, this list of conditions and the following disclaimer in the
14  documentation and/or other materials provided with the distribution.
15 
16  * Neither the name of Intel Corporation nor the names of its
17  contributors may be used to endorse or promote products derived from
18  this software without specific prior written permission.
19 
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
22  IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
25  OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33 
34 /** @file opt.cpp
35  @brief Implementations of various ispc optimization passes that operate
36  on the LLVM IR.
37 */
38 
39 #include "opt.h"
40 #include "ctx.h"
41 #include "sym.h"
42 #include "module.h"
43 #include "util.h"
44 #include "llvmutil.h"
45 
46 #include <stdio.h>
47 #include <map>
48 #include <set>
49 
50 #include <llvm/Pass.h>
51 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
52  #include <llvm/Module.h>
53  #include <llvm/Instructions.h>
54  #include <llvm/Intrinsics.h>
55  #include <llvm/Function.h>
56  #include <llvm/BasicBlock.h>
57  #include <llvm/Constants.h>
58 #ifdef ISPC_NVPTX_ENABLED
59  #include <llvm/InlineAsm.h>
60 #endif /* ISPC_NVPTX_ENABLED */
61 #else // LLVM 3.3+
62  #include <llvm/IR/Module.h>
63  #include <llvm/IR/Instructions.h>
64  #include <llvm/IR/Intrinsics.h>
65  #include <llvm/IR/Function.h>
66  #include <llvm/IR/BasicBlock.h>
67  #include <llvm/IR/Constants.h>
68 #ifdef ISPC_NVPTX_ENABLED
69  #include <llvm/IR/InlineAsm.h>
70 #endif /* ISPC_NVPTX_ENABLED */
71 #endif
72 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_4 // LLVM 3.4+
73  #include <llvm/Transforms/Instrumentation.h>
74 #endif
75 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
76  #include "llvm/PassManager.h"
77 #else // LLVM 3.7+
78  #include "llvm/IR/LegacyPassManager.h"
79 #endif
80 #include <llvm/PassRegistry.h>
81 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_5 // LLVM 3.5+
82  #include <llvm/IR/Verifier.h>
83  #include <llvm/IR/IRPrintingPasses.h>
84  #include <llvm/IR/PatternMatch.h>
85  #include <llvm/IR/DebugInfo.h>
86 #else // < 3.5
87  #include <llvm/Analysis/Verifier.h>
88  #include <llvm/Assembly/PrintModulePass.h>
89  #include <llvm/Support/PatternMatch.h>
90  #include <llvm/DebugInfo.h>
91 #endif
92 #include <llvm/Analysis/ConstantFolding.h>
93 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
94  #include <llvm/Target/TargetLibraryInfo.h>
95 #else // LLVM 3.7+
96  #include <llvm/Analysis/TargetLibraryInfo.h>
97 #endif
98 #include <llvm/ADT/Triple.h>
99 #include <llvm/ADT/SmallSet.h>
100 #include <llvm/Transforms/Scalar.h>
101 #include <llvm/Transforms/IPO.h>
102 #if ISPC_LLVM_VERSION >= ISPC_LLVM_7_0
103  #include "llvm/Transforms/Utils.h"
104  #include "llvm/Transforms/InstCombine/InstCombine.h"
105 #endif
106 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
107 #include <llvm/Target/TargetOptions.h>
108 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
109  #include <llvm/DataLayout.h>
110 #else // LLVM 3.3+
111  #include <llvm/IR/DataLayout.h>
112  #include <llvm/Analysis/TargetTransformInfo.h>
113 #endif
114 #include <llvm/Target/TargetMachine.h>
115 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_8 // LLVM 3.8+
116  #include <llvm/Analysis/BasicAliasAnalysis.h>
117  #include "llvm/Analysis/TypeBasedAliasAnalysis.h"
118 #endif
119 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_9 // LLVM 3.9+
120  #include "llvm/Transforms/IPO/FunctionAttrs.h"
121  #include "llvm/Transforms/Scalar/GVN.h"
122 #endif
123 #include <llvm/Analysis/Passes.h>
124 #include <llvm/Support/raw_ostream.h>
125 #if ISPC_LLVM_VERSION >= ISPC_LLVM_5_0 // LLVM 5.0+
126  #include <llvm/BinaryFormat/Dwarf.h>
127 #else // LLVM up to 4.x
128  #include <llvm/Support/Dwarf.h>
129 #endif
130 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_6
131  #include <llvm/IR/IntrinsicInst.h>
132 #endif
133 #ifdef ISPC_IS_LINUX
134  #include <alloca.h>
135 #elif defined(ISPC_IS_WINDOWS)
136  #include <malloc.h>
137  #ifndef __MINGW32__
138  #define alloca _alloca
139  #endif
140 #endif // ISPC_IS_WINDOWS
141 
142 #ifndef PRId64
143 #define PRId64 "lld"
144 #endif
145 #ifndef PRIu64
146 #define PRIu64 "llu"
147 #endif
148 
149 static llvm::Pass *CreateIntrinsicsOptPass();
150 static llvm::Pass *CreateInstructionSimplifyPass();
151 static llvm::Pass *CreatePeepholePass();
152 
153 static llvm::Pass *CreateImproveMemoryOpsPass();
154 static llvm::Pass *CreateGatherCoalescePass();
155 static llvm::Pass *CreateReplacePseudoMemoryOpsPass();
156 
157 static llvm::Pass *CreateIsCompileTimeConstantPass(bool isLastTry);
158 static llvm::Pass *CreateMakeInternalFuncsStaticPass();
159 
160 static llvm::Pass *CreateDebugPass(char * output);
161 
162 static llvm::Pass *CreateReplaceStdlibShiftPass();
163 
164 static llvm::Pass *CreateFixBooleanSelectPass();
165 #ifdef ISPC_NVPTX_ENABLED
166 static llvm::Pass *CreatePromoteLocalToPrivatePass();
167 #endif /* ISPC_NVPTX_ENABLED */
168 
169 #define DEBUG_START_PASS(NAME) \
170  if (g->debugPrint && \
171  (getenv("FUNC") == NULL || \
172  !strncmp(bb.getParent()->getName().str().c_str(), getenv("FUNC"), \
173  strlen(getenv("FUNC"))))) { \
174  fprintf(stderr, "Start of " NAME "\n"); \
175  fprintf(stderr, "---------------\n"); \
176  bb.dump(); \
177  fprintf(stderr, "---------------\n\n"); \
178  } else /* eat semicolon */
179 
180 #define DEBUG_END_PASS(NAME) \
181  if (g->debugPrint && \
182  (getenv("FUNC") == NULL || \
183  !strncmp(bb.getParent()->getName().str().c_str(), getenv("FUNC"), \
184  strlen(getenv("FUNC"))))) { \
185  fprintf(stderr, "End of " NAME " %s\n", modifiedAny ? "** CHANGES **" : ""); \
186  fprintf(stderr, "---------------\n"); \
187  bb.dump(); \
188  fprintf(stderr, "---------------\n\n"); \
189  } else /* eat semicolon */
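// The two macros above bracket each optimization pass below; when g->debugPrint
// is set they dump the basic block before and after the pass runs. Setting the
// FUNC environment variable restricts the dumps to functions whose names start
// with that prefix (the strncmp() call is a prefix match). A minimal usage
// sketch, assuming --debug is the command-line flag that enables g->debugPrint
// and that foo.ispc defines a function whose name starts with "simple":
//
//     FUNC=simple ispc --debug foo.ispc -o foo.o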
190 
191 
192 
193 ///////////////////////////////////////////////////////////////////////////
194 
195 
196 /** This utility routine copies the metadata (if any) attached to the
197  'from' instruction in the IR to the 'to' instruction.
198 
199  For flexibility, this function takes an llvm::Value rather than an
200  llvm::Instruction for the 'to' parameter; at some places in the code
201  below, we sometimes use an llvm::Value to start out storing a value and
202  then later store instructions. If an llvm::Value is passed to this, the
203  routine just returns without doing anything; if it is in fact an
204  llvm::Instruction, then the metadata can be copied to it.
205  */
206 static void
207 lCopyMetadata(llvm::Value *vto, const llvm::Instruction *from) {
208  llvm::Instruction *to = llvm::dyn_cast<llvm::Instruction>(vto);
209  if (!to)
210  return;
211 
212  llvm::SmallVector<std::pair<unsigned int, llvm::MDNode *>, 8> metadata;
213 
214  from->getAllMetadata(metadata);
215  for (unsigned int i = 0; i < metadata.size(); ++i)
216  to->setMetadata(metadata[i].first, metadata[i].second);
217 }
218 
219 
220 /** We have a protocol with the front-end LLVM IR code generation process
221  that allows us to encode the source file position that corresponds with
222  instructions. (For example, this allows us to issue performance
223  warnings related to things like scatter and gather after optimization
224  has been performed, so that we aren't warning about scatters and
225  gathers that have been improved to stores and loads by optimization
226  passes.) Note that this is slightly redundant with the source file
227  position encoding generated for debugging symbols; while we don't
228  always generate debugging information, we do always generate this
229  position data.
230 
231  This function finds the SourcePos that the metadata in the instruction
232  (if present) corresponds to. See the implementation of
233  FunctionEmitContext::addGSMetadata(), which encodes the source position during
234  code generation.
235 
236  @param inst Instruction to try to find the source position of
237  @param pos Output variable in which to store the position
238  @returns True if source file position metadata was present and *pos
239  has been set. False otherwise.
240 */
241 static bool
242 lGetSourcePosFromMetadata(const llvm::Instruction *inst, SourcePos *pos) {
243  llvm::MDNode *filename = inst->getMetadata("filename");
244  llvm::MDNode *first_line = inst->getMetadata("first_line");
245  llvm::MDNode *first_column = inst->getMetadata("first_column");
246  llvm::MDNode *last_line = inst->getMetadata("last_line");
247  llvm::MDNode *last_column = inst->getMetadata("last_column");
248 
249  if (!filename || !first_line || !first_column || !last_line || !last_column)
250  return false;
251 
252  // All of these asserts are things that FunctionEmitContext::addGSMetadata() is
253  // expected to have done in its operation
254  llvm::MDString *str = llvm::dyn_cast<llvm::MDString>(filename->getOperand(0));
255  Assert(str);
256  llvm::ConstantInt *first_lnum =
257 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5
258  llvm::dyn_cast<llvm::ConstantInt>(first_line->getOperand(0));
259 #else /* LLVM 3.6+ */
260  llvm::mdconst::extract<llvm::ConstantInt>(first_line->getOperand(0));
261 #endif
262  Assert(first_lnum);
263 
264  llvm::ConstantInt *first_colnum =
265 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5
266  llvm::dyn_cast<llvm::ConstantInt>(first_column->getOperand(0));
267 #else /* LLVM 3.6+ */
268  llvm::mdconst::extract<llvm::ConstantInt>(first_column->getOperand(0));
269 #endif
270  Assert(first_colnum);
271 
272  llvm::ConstantInt *last_lnum =
273 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5
274  llvm::dyn_cast<llvm::ConstantInt>(last_line->getOperand(0));
275 #else /* LLVM 3.6+ */
276  llvm::mdconst::extract<llvm::ConstantInt>(last_line->getOperand(0));
277 #endif
278  Assert(last_lnum);
279 
280  llvm::ConstantInt *last_colnum =
281 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5
282  llvm::dyn_cast<llvm::ConstantInt>(last_column->getOperand(0));
283 #else /* LLVM 3.6+ */
284  llvm::mdconst::extract<llvm::ConstantInt>(last_column->getOperand(0));
285 #endif
286  Assert(last_colnum);
287 
288  *pos = SourcePos(str->getString().data(), (int)first_lnum->getZExtValue(),
289  (int)first_colnum->getZExtValue(), (int)last_lnum->getZExtValue(),
290  (int)last_colnum->getZExtValue());
291  return true;
292 }
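// For reference, the metadata this routine expects FunctionEmitContext::addGSMetadata()
// to have attached looks roughly like the following. The call, file name, and numbers
// are purely illustrative, and the exact textual IR form varies across LLVM versions:
//
//     %g = call <8 x i32> @__pseudo_gather64_i32(...), !filename !0, !first_line !1,
//              !first_column !2, !last_line !3, !last_column !4
//     !0 = !{!"foo.ispc"}   ; an MDString operand
//     !1 = !{i32 42}        ; line/column numbers carried as ConstantInt operands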
293 
294 
295 static llvm::Instruction *
296 lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
297  const char *name, llvm::Instruction *insertBefore = NULL) {
298  llvm::Value *args[2] = { arg0, arg1 };
299  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[2]);
300  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
301 }
302 
303 
304 static llvm::Instruction *
305 lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
306  llvm::Value *arg2, const char *name,
307  llvm::Instruction *insertBefore = NULL) {
308  llvm::Value *args[3] = { arg0, arg1, arg2 };
309  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[3]);
310  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
311 }
312 
313 
314 static llvm::Instruction *
315 lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
316  llvm::Value *arg2, llvm::Value *arg3, const char *name,
317  llvm::Instruction *insertBefore = NULL) {
318  llvm::Value *args[4] = { arg0, arg1, arg2, arg3 };
319  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[4]);
320  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
321 }
322 
323 static llvm::Instruction *
324 lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
325  llvm::Value *arg2, llvm::Value *arg3, llvm::Value *arg4,
326  const char *name, llvm::Instruction *insertBefore = NULL) {
327  llvm::Value *args[5] = { arg0, arg1, arg2, arg3, arg4 };
328  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[5]);
329  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
330 }
331 
332 
333 static llvm::Instruction *
334 lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
335  llvm::Value *arg2, llvm::Value *arg3, llvm::Value *arg4,
336  llvm::Value *arg5, const char *name,
337  llvm::Instruction *insertBefore = NULL) {
338  llvm::Value *args[6] = { arg0, arg1, arg2, arg3, arg4, arg5 };
339  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[6]);
340  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
341 }
342 
343 
344 static llvm::Instruction *
345 lGEPInst(llvm::Value *ptr, llvm::Value *offset, const char *name,
346  llvm::Instruction *insertBefore) {
347  llvm::Value *index[1] = { offset };
348  llvm::ArrayRef<llvm::Value *> arrayRef(&index[0], &index[1]);
349 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
350  return llvm::GetElementPtrInst::Create(ptr, arrayRef, name,
351  insertBefore);
352 #else // LLVM 3.7+
353  return llvm::GetElementPtrInst::Create(PTYPE(ptr), ptr, arrayRef,
354  name, insertBefore);
355 #endif
356 }
357 
358 
359 /** Given a vector of constant values (int, float, or bool) representing an
360  execution mask, convert it to a bitvector where the 0th bit corresponds
361  to the first vector value and so forth.
362 */
363 static uint64_t
364 lConstElementsToMask(const llvm::SmallVector<llvm::Constant *,
365  ISPC_MAX_NVEC> &elements) {
366  Assert(elements.size() <= 64);
367 
368  uint64_t mask = 0;
369  for (unsigned int i = 0; i < elements.size(); ++i) {
370  llvm::APInt intMaskValue;
371  // SSE has the "interesting" approach of encoding blending
372  // masks as <n x float>.
373  llvm::ConstantFP *cf = llvm::dyn_cast<llvm::ConstantFP>(elements[i]);
374  if (cf != NULL) {
375  llvm::APFloat apf = cf->getValueAPF();
376  intMaskValue = apf.bitcastToAPInt();
377  }
378  else {
379  // Otherwise get it as an int
380  llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(elements[i]);
381  Assert(ci != NULL); // vs return -1 if NULL?
382  intMaskValue = ci->getValue();
383  }
384  // Is the high-bit set? If so, OR in the appropriate bit in
385  // the result mask
386  if (intMaskValue.countLeadingOnes() > 0)
387  mask |= (1ull << i);
388  }
389  return mask;
390 }
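// For example, an SSE blend mask of type <4 x float> whose elements have the bit
// patterns <0xffffffff, 0x0, 0x0, 0xffffffff> yields the mask bits 0b1001: only
// the high (sign) bit of each element decides the corresponding bit of the result.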
391 
392 
393 /** Given an llvm::Value representing a vector mask, see if the value is a
394  constant. If so, return true and set *bits to be the integer mask
395  found by taking the high bits of the mask values in turn and
396  concatenating them into a single integer. In other words, given the
397  4-wide mask: < 0xffffffff, 0, 0, 0xffffffff >, we have 0b1001 = 9.
398  */
399 static bool
400 lGetMask(llvm::Value *factor, uint64_t *mask) {
401  llvm::ConstantDataVector *cdv = llvm::dyn_cast<llvm::ConstantDataVector>(factor);
402  if (cdv != NULL) {
403  llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
404  for (int i = 0; i < (int)cdv->getNumElements(); ++i)
405  elements.push_back(cdv->getElementAsConstant(i));
406  *mask = lConstElementsToMask(elements);
407  return true;
408  }
409 
410  llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(factor);
411  if (cv != NULL) {
412  llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
413  for (int i = 0; i < (int)cv->getNumOperands(); ++i) {
414  llvm::Constant *c =
415  llvm::dyn_cast<llvm::Constant>(cv->getOperand(i));
416  if (c == NULL)
417  return false;
418  if (llvm::isa<llvm::ConstantExpr>(cv->getOperand(i)) )
419  return false; // We can not handle constant expressions here
420  elements.push_back(c);
421  }
422  *mask = lConstElementsToMask(elements);
423  return true;
424  }
425  else if (llvm::isa<llvm::ConstantAggregateZero>(factor)) {
426  *mask = 0;
427  return true;
428  }
429  else {
430 #if 0
431  llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(factor);
432  if (ce != NULL) {
433  llvm::TargetMachine *targetMachine = g->target->GetTargetMachine();
434  const llvm::TargetData *td = targetMachine->getTargetData();
435  llvm::Constant *c = llvm::ConstantFoldConstantExpression(ce, td);
436  c->dump();
437  factor = c;
438  }
439  // else we should be able to handle it above...
440  Assert(!llvm::isa<llvm::Constant>(factor));
441 #endif
442  return false;
443  }
444 }
445 
446 
447 enum MaskStatus { ALL_ON, ALL_OFF, MIXED, UNKNOWN };
448 
449 /** Determines if the given mask value is all on, all off, mixed, or
450  unknown at compile time.
451 */
452 static MaskStatus
453 lGetMaskStatus(llvm::Value *mask, int vecWidth = -1) {
454  uint64_t bits;
455  if (lGetMask(mask, &bits) == false)
456  return UNKNOWN;
457 
458  if (bits == 0)
459  return ALL_OFF;
460 
461  if (vecWidth == -1)
462  vecWidth = g->target->getVectorWidth();
463  Assert(vecWidth <= 64);
464 
465  for (int i = 0; i < vecWidth; ++i) {
466  if ((bits & (1ull << i)) == 0)
467  return MIXED;
468  }
469  return ALL_ON;
470 }
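// For example, on a 4-wide target, mask bits 0b1111 give ALL_ON, 0b0000 gives
// ALL_OFF, and any other pattern such as 0b0101 gives MIXED; if lGetMask() cannot
// prove the mask to be a compile-time constant, the result is UNKNOWN.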
471 
472 
473 ///////////////////////////////////////////////////////////////////////////
474 // This is a wrapper over the llvm::PassManager class. It duplicates the
475 // PassManager run() function and extends its add() function with some checks
476 // and debug passes. This wrapper can control:
477 // - whether we want to switch off the optimization with a given number;
478 // - whether we want to dump the LLVM IR after the optimization with a given number;
479 // - whether we want to generate LLVM IR debug info for gdb after the optimization with a given number.
480 class DebugPassManager {
481 public:
482  DebugPassManager(): number(0) { }
483  void add(llvm::Pass * P, int stage);
484  bool run(llvm::Module& M) {return PM.run(M);}
485 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
486  llvm::PassManager& getPM() {return PM;}
487 #else /* LLVM 3.7+ */
488  llvm::legacy::PassManager& getPM() {return PM;}
489 #endif
490 private:
491 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
492  llvm::PassManager PM;
493 #else /* LLVM 3.7+ */
494  llvm::legacy::PassManager PM;
495 #endif
496  int number;
497 };
498 
499 void
500 DebugPassManager::add(llvm::Pass * P, int stage = -1) {
501  // determine the number of this optimization stage
502  if (stage == -1) {
503  number++;
504  }
505  else {
506  number = stage;
507  }
508  if (g->off_stages.find(number) == g->off_stages.end()) {
509  // add the optimization (it has not been switched off)
510  PM.add(P);
511  if (g->debug_stages.find(number) != g->debug_stages.end()) {
512  // add a dump of the LLVM IR after this optimization
513  char buf[100];
514 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
515  sprintf(buf, "\n\n*****LLVM IR after phase %d: %s*****\n\n",
516  number, P->getPassName());
517 #else // LLVM 4.0+
518  sprintf(buf, "\n\n*****LLVM IR after phase %d: %s*****\n\n",
519  number, P->getPassName().data());
520 #endif
521  PM.add(CreateDebugPass(buf));
522  }
523 
524 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_4 || ISPC_LLVM_VERSION == ISPC_LLVM_3_5 // only 3.4 and 3.5
525  if (g->debugIR == number) {
526  // add generation of LLVM IR debug info after this optimization
527  char buf[100];
528  sprintf(buf, "Debug_IR_after_%d_phase.bc", number);
529  PM.add(llvm::createDebugIRPass(true, true, ".", buf));
530  }
531 #endif
532  }
533 }
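// The stage numbers passed as the second argument to optPM.add() in Optimize()
// below are the numbers checked against g->off_stages and g->debug_stages here.
// As a sketch (assuming the --off-phase and --debug-phase command-line options
// are what populate those sets, and that they accept a single phase number),
// one could skip phase 255 and dump the IR after phase 260 with something like:
//
//     ispc --off-phase=255 --debug-phase=260 foo.ispc -o foo.o
//
// The exact option syntax is an assumption; see the option-parsing code.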
534 ///////////////////////////////////////////////////////////////////////////
535 
536 void
537 Optimize(llvm::Module *module, int optLevel) {
538  if (g->debugPrint) {
539  printf("*** Code going into optimization ***\n");
540  module->dump();
541  }
542  DebugPassManager optPM;
543  optPM.add(llvm::createVerifierPass(),0);
544 
545 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
546  llvm::TargetLibraryInfo *targetLibraryInfo =
547  new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple()));
548  optPM.add(targetLibraryInfo);
549 #else // LLVM 3.7+
550  optPM.add(new llvm::TargetLibraryInfoWrapperPass(llvm::Triple(module->getTargetTriple())));
551 #endif
552 
553 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_4
554  optPM.add(new llvm::DataLayout(*g->target->getDataLayout()));
555 #elif ISPC_LLVM_VERSION == ISPC_LLVM_3_5
556  optPM.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
557 #elif ISPC_LLVM_VERSION == ISPC_LLVM_3_6
558  llvm::DataLayoutPass *dlp= new llvm::DataLayoutPass();
559  dlp->doInitialization(*module);
560  optPM.add(dlp);
561 #endif // LLVM 3.7+ doesn't have DataLayoutPass anymore.
562 
563  llvm::TargetMachine *targetMachine = g->target->GetTargetMachine();
564 
565 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
566  optPM.add(new llvm::TargetTransformInfo(targetMachine->getScalarTargetTransformInfo(),
567  targetMachine->getVectorTargetTransformInfo()));
568 #elif ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
569  targetMachine->addAnalysisPasses(optPM.getPM());
570 #else // LLVM 3.7+
571  optPM.getPM().add(createTargetTransformInfoWrapperPass(targetMachine->getTargetIRAnalysis()));
572 #endif
573 
574  optPM.add(llvm::createIndVarSimplifyPass());
575 
576  if (optLevel == 0) {
577  // This is more or less the minimum set of optimizations that we
578  // need to do to generate code that will actually run. (We can't
579  // run absolutely no optimizations, since the front-end needs us to
580  // take the various __pseudo_* functions it has emitted and turn
581  // them into something that can actually execute.)
582  optPM.add(CreateImproveMemoryOpsPass(), 100);
583 #ifdef ISPC_NVPTX_ENABLED
584  if (g->opt.disableGatherScatterOptimizations == false &&
585  g->target->getVectorWidth() > 1)
586 #endif /* ISPC_NVPTX_ENABLED */
587  optPM.add(CreateImproveMemoryOpsPass(), 100);
588 
589  if (g->opt.disableHandlePseudoMemoryOps == false)
590  optPM.add(CreateReplacePseudoMemoryOpsPass());
591 
592  optPM.add(CreateIntrinsicsOptPass(), 102);
593  optPM.add(CreateIsCompileTimeConstantPass(true));
594  optPM.add(llvm::createFunctionInliningPass());
595  optPM.add(CreateMakeInternalFuncsStaticPass());
596  optPM.add(llvm::createCFGSimplificationPass());
597  optPM.add(llvm::createGlobalDCEPass());
598  }
599  else {
600  llvm::PassRegistry *registry = llvm::PassRegistry::getPassRegistry();
601  llvm::initializeCore(*registry);
602  llvm::initializeScalarOpts(*registry);
603  llvm::initializeIPO(*registry);
604  llvm::initializeAnalysis(*registry);
605 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_7
606  llvm::initializeIPA(*registry);
607 #endif
608  llvm::initializeTransformUtils(*registry);
609  llvm::initializeInstCombine(*registry);
610  llvm::initializeInstrumentation(*registry);
611  llvm::initializeTarget(*registry);
612 
613  optPM.add(llvm::createGlobalDCEPass(), 185);
614 
615  // Set up to use LLVM's default alias analysis.
616  // Ideally, we would like to call:
617  // llvm::PassManagerBuilder pm_Builder;
618  // pm_Builder.OptLevel = optLevel;
619  // pm_Builder.addInitialAliasAnalysisPasses(optPM);
620  // but addInitialAliasAnalysisPasses() is a private function,
621  // so we explicitly enable these passes here.
622  // This needs to be kept in sync with future LLVM changes.
623  // An alternative is to call populateFunctionPassManager()
624 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_7
625  optPM.add(llvm::createTypeBasedAliasAnalysisPass(), 190);
626  optPM.add(llvm::createBasicAliasAnalysisPass());
627 #else
628  optPM.add(llvm::createTypeBasedAAWrapperPass(), 190);
629  optPM.add(llvm::createBasicAAWrapperPass());
630 #endif
631  optPM.add(llvm::createCFGSimplificationPass());
632 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
633  optPM.add(llvm::createScalarReplAggregatesPass());
634 #else
635  optPM.add(llvm::createSROAPass());
636 #endif
637  optPM.add(llvm::createEarlyCSEPass());
638  optPM.add(llvm::createLowerExpectIntrinsicPass());
639 
640  // Early optimizations to try to reduce the total amount of code to
641  // work with if we can
642  optPM.add(llvm::createReassociatePass(), 200);
643  optPM.add(llvm::createConstantPropagationPass());
644  optPM.add(llvm::createDeadInstEliminationPass());
645  optPM.add(llvm::createCFGSimplificationPass());
646 
647  optPM.add(llvm::createPromoteMemoryToRegisterPass());
648  optPM.add(llvm::createAggressiveDCEPass());
649 
650 
651  if (g->opt.disableGatherScatterOptimizations == false &&
652  g->target->getVectorWidth() > 1) {
653  optPM.add(llvm::createInstructionCombiningPass(), 210);
654  optPM.add(CreateImproveMemoryOpsPass());
655  }
656  if (!g->opt.disableMaskAllOnOptimizations) {
657  optPM.add(CreateIntrinsicsOptPass(), 215);
658  optPM.add(CreateInstructionSimplifyPass());
659  }
660  optPM.add(llvm::createDeadInstEliminationPass(), 220);
661 
662  // Max struct size threshold for scalar replacement is
663  // 1) 4 fields (r,g,b,w)
664  // 2) field size: vectorWidth * sizeof(float)
665 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
666  const int field_limit = 4;
667  int sr_threshold = g->target->getVectorWidth() * sizeof(float) * field_limit;
668 #endif
669 
670  // On to more serious optimizations
671 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
672  optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold));
673 #else
674  optPM.add(llvm::createSROAPass());
675 #endif
676  optPM.add(llvm::createInstructionCombiningPass());
677  optPM.add(llvm::createCFGSimplificationPass());
678  optPM.add(llvm::createPromoteMemoryToRegisterPass());
679  optPM.add(llvm::createGlobalOptimizerPass());
680  optPM.add(llvm::createReassociatePass());
681  optPM.add(llvm::createIPConstantPropagationPass());
682 
683 #ifdef ISPC_NVPTX_ENABLED
684  if (g->target->getISA() != Target::NVPTX)
685 #endif /* ISPC_NVPTX_ENABLED */
686  optPM.add(CreateReplaceStdlibShiftPass(),229);
687 
688  optPM.add(llvm::createDeadArgEliminationPass(),230);
689  optPM.add(llvm::createInstructionCombiningPass());
690  optPM.add(llvm::createCFGSimplificationPass());
691  optPM.add(llvm::createPruneEHPass());
692 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_9 // 3.9+
693  optPM.add(llvm::createPostOrderFunctionAttrsLegacyPass());
694  optPM.add(llvm::createReversePostOrderFunctionAttrsPass());
695 #elif ISPC_LLVM_VERSION == ISPC_LLVM_3_8 // 3.8
696  optPM.add(llvm::createPostOrderFunctionAttrsPass());
697  optPM.add(llvm::createReversePostOrderFunctionAttrsPass());
698 #else // 3.7 and earlier
699  optPM.add(llvm::createFunctionAttrsPass());
700 #endif
701  optPM.add(llvm::createFunctionInliningPass());
702  optPM.add(llvm::createConstantPropagationPass());
703  optPM.add(llvm::createDeadInstEliminationPass());
704  optPM.add(llvm::createCFGSimplificationPass());
705 
706  optPM.add(llvm::createArgumentPromotionPass());
707 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_3
708  // Starting from 3.4 this functionality was moved to
709  // InstructionCombiningPass. See r184459 for details.
710  optPM.add(llvm::createSimplifyLibCallsPass(), 240);
711 #endif
712  optPM.add(llvm::createAggressiveDCEPass());
713  optPM.add(llvm::createInstructionCombiningPass(), 241);
714  optPM.add(llvm::createJumpThreadingPass());
715  optPM.add(llvm::createCFGSimplificationPass());
716 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
717  optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold));
718 #else
719  optPM.add(llvm::createSROAPass());
720 #endif
721  optPM.add(llvm::createInstructionCombiningPass());
722  optPM.add(llvm::createTailCallEliminationPass());
723 
724  if (!g->opt.disableMaskAllOnOptimizations) {
725  optPM.add(CreateIntrinsicsOptPass(), 250);
726  optPM.add(CreateInstructionSimplifyPass());
727  }
728 
729  if (g->opt.disableGatherScatterOptimizations == false &&
730  g->target->getVectorWidth() > 1) {
731  optPM.add(llvm::createInstructionCombiningPass(), 255);
732  optPM.add(CreateImproveMemoryOpsPass());
733 
734  if (g->opt.disableCoalescing == false &&
735  g->target->getISA() != Target::GENERIC) {
736  // It is important to run this here to make it easier to
737  // find matching gathers that we can coalesce.
738  optPM.add(llvm::createEarlyCSEPass(), 260);
739  optPM.add(CreateGatherCoalescePass());
740  }
741  }
742 
743  optPM.add(llvm::createFunctionInliningPass(), 265);
744  optPM.add(llvm::createConstantPropagationPass());
745  optPM.add(CreateIntrinsicsOptPass());
746  optPM.add(CreateInstructionSimplifyPass());
747 
748  if (g->opt.disableGatherScatterOptimizations == false &&
749  g->target->getVectorWidth() > 1) {
750  optPM.add(llvm::createInstructionCombiningPass(), 270);
751  optPM.add(CreateImproveMemoryOpsPass());
752  }
753 
754  optPM.add(llvm::createIPSCCPPass(), 275);
755  optPM.add(llvm::createDeadArgEliminationPass());
756  optPM.add(llvm::createAggressiveDCEPass());
757  optPM.add(llvm::createInstructionCombiningPass());
758  optPM.add(llvm::createCFGSimplificationPass());
759 
760  if (g->opt.disableHandlePseudoMemoryOps == false) {
761  optPM.add(CreateReplacePseudoMemoryOpsPass());
762  }
763  optPM.add(CreateIntrinsicsOptPass(),281);
764  optPM.add(CreateInstructionSimplifyPass());
765 
766  optPM.add(llvm::createFunctionInliningPass());
767  optPM.add(llvm::createArgumentPromotionPass());
768 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
769  optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold, false));
770 #else
771  optPM.add(llvm::createSROAPass());
772 #endif
773  optPM.add(llvm::createInstructionCombiningPass());
774  optPM.add(CreateInstructionSimplifyPass());
775  optPM.add(llvm::createCFGSimplificationPass());
776  optPM.add(llvm::createReassociatePass());
777  optPM.add(llvm::createLoopRotatePass());
778  optPM.add(llvm::createLICMPass());
779  optPM.add(llvm::createLoopUnswitchPass(false));
780  optPM.add(llvm::createInstructionCombiningPass());
781  optPM.add(CreateInstructionSimplifyPass());
782  optPM.add(llvm::createIndVarSimplifyPass());
783  optPM.add(llvm::createLoopIdiomPass());
784  optPM.add(llvm::createLoopDeletionPass());
785  if (g->opt.unrollLoops) {
786  optPM.add(llvm::createLoopUnrollPass(), 300);
787  }
788  optPM.add(llvm::createGVNPass(), 301);
789 
790  optPM.add(CreateIsCompileTimeConstantPass(true));
791  optPM.add(CreateIntrinsicsOptPass());
792  optPM.add(CreateInstructionSimplifyPass());
793 
794  optPM.add(llvm::createMemCpyOptPass());
795  optPM.add(llvm::createSCCPPass());
796  optPM.add(llvm::createInstructionCombiningPass());
797  optPM.add(CreateInstructionSimplifyPass());
798  optPM.add(llvm::createJumpThreadingPass());
799  optPM.add(llvm::createCorrelatedValuePropagationPass());
800  optPM.add(llvm::createDeadStoreEliminationPass());
801  optPM.add(llvm::createAggressiveDCEPass());
802  optPM.add(llvm::createCFGSimplificationPass());
803  optPM.add(llvm::createInstructionCombiningPass());
804  optPM.add(CreateInstructionSimplifyPass());
805  optPM.add(CreatePeepholePass());
806  optPM.add(llvm::createFunctionInliningPass());
807  optPM.add(llvm::createAggressiveDCEPass());
808  optPM.add(llvm::createStripDeadPrototypesPass());
809  optPM.add(CreateMakeInternalFuncsStaticPass());
810  optPM.add(llvm::createGlobalDCEPass());
811  optPM.add(llvm::createConstantMergePass());
812 
813  // Should be the last
814  optPM.add(CreateFixBooleanSelectPass(), 400);
815 #ifdef ISPC_NVPTX_ENABLED
816  if (g->target->getISA() == Target::NVPTX)
817  {
818  optPM.add(CreatePromoteLocalToPrivatePass());
819  optPM.add(llvm::createGlobalDCEPass());
820 
821  optPM.add(llvm::createTypeBasedAliasAnalysisPass());
822  optPM.add(llvm::createBasicAliasAnalysisPass());
823  optPM.add(llvm::createCFGSimplificationPass());
824  // Here clang has an experimental pass SROAPass instead of
825  // ScalarReplAggregatesPass. We should add it in the future.
826 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
827  optPM.add(llvm::createScalarReplAggregatesPass());
828 #else
829  optPM.add(llvm::createSROAPass());
830 #endif
831  optPM.add(llvm::createEarlyCSEPass());
832  optPM.add(llvm::createLowerExpectIntrinsicPass());
833  optPM.add(llvm::createTypeBasedAliasAnalysisPass());
834  optPM.add(llvm::createBasicAliasAnalysisPass());
835 
836  // Early optimizations to try to reduce the total amount of code to
837  // work with if we can
838  optPM.add(llvm::createReassociatePass());
839  optPM.add(llvm::createConstantPropagationPass());
840  optPM.add(llvm::createDeadInstEliminationPass());
841  optPM.add(llvm::createCFGSimplificationPass());
842 
843  optPM.add(llvm::createPromoteMemoryToRegisterPass());
844  optPM.add(llvm::createAggressiveDCEPass());
845 
846 
847  optPM.add(llvm::createInstructionCombiningPass());
848  optPM.add(llvm::createDeadInstEliminationPass());
849 
850  // On to more serious optimizations
851  optPM.add(llvm::createInstructionCombiningPass());
852  optPM.add(llvm::createCFGSimplificationPass());
853  optPM.add(llvm::createPromoteMemoryToRegisterPass());
854  optPM.add(llvm::createGlobalOptimizerPass());
855  optPM.add(llvm::createReassociatePass());
856  optPM.add(llvm::createIPConstantPropagationPass());
857 
858  optPM.add(llvm::createDeadArgEliminationPass());
859  optPM.add(llvm::createInstructionCombiningPass());
860  optPM.add(llvm::createCFGSimplificationPass());
861  optPM.add(llvm::createPruneEHPass());
862  optPM.add(llvm::createFunctionAttrsPass());
863  optPM.add(llvm::createFunctionInliningPass());
864  optPM.add(llvm::createConstantPropagationPass());
865  optPM.add(llvm::createDeadInstEliminationPass());
866  optPM.add(llvm::createCFGSimplificationPass());
867 
868  optPM.add(llvm::createArgumentPromotionPass());
869 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_3
870  // Starting from 3.4 this functionality was moved to
871  // InstructionCombiningPass. See r184459 for details.
872  optPM.add(llvm::createSimplifyLibCallsPass());
873 #endif
874  optPM.add(llvm::createAggressiveDCEPass());
875  optPM.add(llvm::createInstructionCombiningPass());
876  optPM.add(llvm::createJumpThreadingPass());
877  optPM.add(llvm::createCFGSimplificationPass());
878  optPM.add(llvm::createInstructionCombiningPass());
879  optPM.add(llvm::createTailCallEliminationPass());
880 
881  optPM.add(llvm::createInstructionCombiningPass());
882 
883  optPM.add(llvm::createFunctionInliningPass());
884  optPM.add(llvm::createConstantPropagationPass());
885 
886  optPM.add(llvm::createInstructionCombiningPass());
887 
888  optPM.add(llvm::createIPSCCPPass());
889  optPM.add(llvm::createDeadArgEliminationPass());
890  optPM.add(llvm::createAggressiveDCEPass());
891  optPM.add(llvm::createInstructionCombiningPass());
892  optPM.add(llvm::createCFGSimplificationPass());
893 
894  optPM.add(llvm::createFunctionInliningPass());
895  optPM.add(llvm::createArgumentPromotionPass());
896  optPM.add(llvm::createInstructionCombiningPass());
897  optPM.add(llvm::createCFGSimplificationPass());
898  optPM.add(llvm::createReassociatePass());
899  optPM.add(llvm::createLoopRotatePass());
900  optPM.add(llvm::createLICMPass());
901 // optPM.add(llvm::createLoopUnswitchPass(false));
902 #if 1
903  optPM.add(llvm::createInstructionCombiningPass());
904  optPM.add(llvm::createIndVarSimplifyPass());
905  optPM.add(llvm::createLoopIdiomPass());
906  optPM.add(llvm::createLoopDeletionPass());
907  optPM.add(llvm::createLoopUnrollPass());
908  optPM.add(llvm::createGVNPass());
909  optPM.add(llvm::createMemCpyOptPass());
910  optPM.add(llvm::createSCCPPass());
911  optPM.add(llvm::createInstructionCombiningPass());
912  optPM.add(llvm::createJumpThreadingPass());
913  optPM.add(llvm::createCorrelatedValuePropagationPass());
914  optPM.add(llvm::createDeadStoreEliminationPass());
915  optPM.add(llvm::createAggressiveDCEPass());
916  optPM.add(llvm::createCFGSimplificationPass());
917  optPM.add(llvm::createInstructionCombiningPass());
918  optPM.add(llvm::createFunctionInliningPass());
919  optPM.add(llvm::createAggressiveDCEPass());
920  optPM.add(llvm::createStripDeadPrototypesPass());
921  optPM.add(llvm::createGlobalDCEPass());
922  optPM.add(llvm::createConstantMergePass());
923 #endif
924  }
925 #endif /* ISPC_NVPTX_ENABLED */
926  }
927 
928  // Finish up by making sure we didn't mess anything up in the IR along
929  // the way.
930  optPM.add(llvm::createVerifierPass(), LAST_OPT_NUMBER);
931  optPM.run(*module);
932 
933  if (g->debugPrint) {
934  printf("\n*****\nFINAL OUTPUT\n*****\n");
935  module->dump();
936  }
937 
938 }
939 
940 
941 
942 ///////////////////////////////////////////////////////////////////////////
943 // IntrinsicsOpt
944 
945 /** This is a relatively simple optimization pass that does a few small
946  optimizations that LLVM's x86 optimizer doesn't currently handle.
947  (Specifically, MOVMSK of a constant can be replaced with the
948  corresponding constant value, and BLENDVPS and AVX masked load/store with
949  either an 'all on' or 'all off' mask can be replaced with simpler
950  operations.)
951 
952  @todo The better thing to do would be to submit a patch to LLVM to get
953  these; they're presumably pretty simple patterns to match.
954 */
955 class IntrinsicsOpt : public llvm::BasicBlockPass {
956 public:
957  IntrinsicsOpt() : BasicBlockPass(ID) {};
958 
959 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
960  const char *getPassName() const { return "Intrinsics Cleanup Optimization"; }
961 #else // LLVM 4.0+
962  llvm::StringRef getPassName() const { return "Intrinsics Cleanup Optimization"; }
963 #endif
964  bool runOnBasicBlock(llvm::BasicBlock &BB);
965 
966  static char ID;
967 
968 private:
969  struct MaskInstruction {
970  MaskInstruction(llvm::Function *f) { function = f; }
971  llvm::Function *function;
972  };
973  std::vector<MaskInstruction> maskInstructions;
974 
975  /** Structure that records everything we need to know about a blend
976  instruction for this optimization pass.
977  */
978  struct BlendInstruction {
979  BlendInstruction(llvm::Function *f, uint64_t ao, int o0, int o1, int of)
980  : function(f), allOnMask(ao), op0(o0), op1(o1), opFactor(of) { }
981  /** Function pointer for the blend instruction */
982  llvm::Function *function;
983  /** Mask value for an "all on" mask for this instruction */
984  uint64_t allOnMask;
985  /** The operand number in the llvm CallInst corresponding to the
986  first operand to blend with. */
987  int op0;
988  /** The operand number in the CallInst corresponding to the second
989  operand to blend with. */
990  int op1;
991  /** The operand in the call inst where the blending factor is
992  found. */
993  int opFactor;
994  };
995  std::vector<BlendInstruction> blendInstructions;
996 
997  bool matchesMaskInstruction(llvm::Function *function);
998  BlendInstruction *matchingBlendInstruction(llvm::Function *function);
999 };
1000 
1001 char IntrinsicsOpt::ID = 0;
1002 
1003 
1004 /** Given an llvm::Value, return true if we can determine that it's an
1005  undefined value. This only makes a weak attempt at chasing this down,
1006  only detecting flat-out undef values, and bitcasts of undef values.
1007 
1008  @todo Is it worth working harder to find more of these? It starts to
1009  get tricky, since having an undef operand doesn't necessarily mean that
1010  the result will be undefined. (And for that matter, is there an LLVM
1011  call that will do this for us?)
1012  */
1013 static bool
1014 lIsUndef(llvm::Value *value) {
1015  if (llvm::isa<llvm::UndefValue>(value))
1016  return true;
1017 
1018  llvm::BitCastInst *bci = llvm::dyn_cast<llvm::BitCastInst>(value);
1019  if (bci)
1020  return lIsUndef(bci->getOperand(0));
1021 
1022  return false;
1023 }
1024 
1025 
1026 bool
1027 IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
1028  DEBUG_START_PASS("IntrinsicsOpt");
1029 
1030  // We can't initialize mask/blend function vector during pass initialization,
1031  // as they may be optimized out by the time the pass is invoked.
1032 
1033  // All of the mask instructions we may encounter. Note that even if
1034  // compiling for AVX, we may still encounter the regular 4-wide SSE
1035  // MOVMSK instruction.
1036  if (llvm::Function *ssei8Movmsk =
1037  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_sse2_pmovmskb_128))) {
1038  maskInstructions.push_back(ssei8Movmsk);
1039  }
1040  if (llvm::Function *sseFloatMovmsk =
1041  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_sse_movmsk_ps))) {
1042  maskInstructions.push_back(sseFloatMovmsk);
1043  }
1044  if (llvm::Function *__movmsk =
1045  m->module->getFunction("__movmsk")) {
1046  maskInstructions.push_back(__movmsk);
1047  }
1048  if (llvm::Function *avxFloatMovmsk =
1049  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_movmsk_ps_256))) {
1050  maskInstructions.push_back(avxFloatMovmsk);
1051  }
1052 
1053  // And all of the blend instructions
1054  blendInstructions.push_back(BlendInstruction(
1055  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_sse41_blendvps)),
1056  0xf, 0, 1, 2));
1057  blendInstructions.push_back(BlendInstruction(
1058  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_blendv_ps_256)),
1059  0xff, 0, 1, 2));
1060 
1061  llvm::Function *avxMaskedLoad32 =
1062  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskload_ps_256));
1063  llvm::Function *avxMaskedLoad64 =
1064  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskload_pd_256));
1065  llvm::Function *avxMaskedStore32 =
1066  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskstore_ps_256));
1067  llvm::Function *avxMaskedStore64 =
1068  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskstore_pd_256));
1069 
1070  bool modifiedAny = false;
1071  restart:
1072  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
1073  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
1074  if (callInst == NULL || callInst->getCalledFunction() == NULL)
1075  continue;
1076 
1077  BlendInstruction *blend = matchingBlendInstruction(callInst->getCalledFunction());
1078  if (blend != NULL) {
1079  llvm::Value *v[2] = { callInst->getArgOperand(blend->op0),
1080  callInst->getArgOperand(blend->op1) };
1081  llvm::Value *factor = callInst->getArgOperand(blend->opFactor);
1082 
1083  // If the values are the same, then no need to blend..
1084  if (v[0] == v[1]) {
1085  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
1086  iter, v[0]);
1087  modifiedAny = true;
1088  goto restart;
1089  }
1090 
1091  // If one of the two is undefined, we're allowed to replace
1092  // with the value of the other. (In other words, the only
1093  // valid case is that the blend factor ends up having a value
1094  // that only selects from the defined one of the two operands,
1095  // otherwise the result is undefined and any value is fine,
1096  // ergo the defined one is an acceptable result.)
1097  if (lIsUndef(v[0])) {
1098  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
1099  iter, v[1]);
1100  modifiedAny = true;
1101  goto restart;
1102  }
1103  if (lIsUndef(v[1])) {
1104  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
1105  iter, v[0]);
1106  modifiedAny = true;
1107  goto restart;
1108  }
1109 
1110  uint64_t mask;
1111  if (lGetMask(factor, &mask) == true) {
1112  llvm::Value *value = NULL;
1113  if (mask == 0)
1114  // Mask all off -> replace with the first blend value
1115  value = v[0];
1116  else if (mask == blend->allOnMask)
1117  // Mask all on -> replace with the second blend value
1118  value = v[1];
1119 
1120  if (value != NULL) {
1121  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
1122  iter, value);
1123  modifiedAny = true;
1124  goto restart;
1125  }
1126  }
1127  }
1128  else if (matchesMaskInstruction(callInst->getCalledFunction())) {
1129  llvm::Value *factor = callInst->getArgOperand(0);
1130  uint64_t mask;
1131  if (lGetMask(factor, &mask) == true) {
1132  // If the vector-valued mask has a known value, replace it
1133  // with the corresponding integer mask from its elements
1134  // high bits.
1135  llvm::Value *value = (callInst->getType() == LLVMTypes::Int32Type) ?
1136  LLVMInt32(mask) : LLVMInt64(mask);
1137  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
1138  iter, value);
1139  modifiedAny = true;
1140  goto restart;
1141  }
1142  }
1143  else if (callInst->getCalledFunction() == avxMaskedLoad32 ||
1144  callInst->getCalledFunction() == avxMaskedLoad64) {
1145  llvm::Value *factor = callInst->getArgOperand(1);
1146  uint64_t mask;
1147  if (lGetMask(factor, &mask) == true) {
1148  if (mask == 0) {
1149  // nothing being loaded, replace with undef value
1150  llvm::Type *returnType = callInst->getType();
1151  Assert(llvm::isa<llvm::VectorType>(returnType));
1152  llvm::Value *undefValue = llvm::UndefValue::get(returnType);
1153  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
1154  iter, undefValue);
1155  modifiedAny = true;
1156  goto restart;
1157  }
1158  else if (mask == 0xff) {
1159  // all lanes active; replace with a regular load
1160  llvm::Type *returnType = callInst->getType();
1161  Assert(llvm::isa<llvm::VectorType>(returnType));
1162  // cast the i8 * to the appropriate type
1163  const char *name = LLVMGetName(callInst->getArgOperand(0), "_cast");
1164  llvm::Value *castPtr =
1165  new llvm::BitCastInst(callInst->getArgOperand(0),
1166  llvm::PointerType::get(returnType, 0),
1167  name, callInst);
1168  lCopyMetadata(castPtr, callInst);
1169  int align;
1170  if (g->opt.forceAlignedMemory)
1171  align = g->target->getNativeVectorAlignment();
1172  else
1173  align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
1174  name = LLVMGetName(callInst->getArgOperand(0), "_load");
1175  llvm::Instruction *loadInst =
1176  new llvm::LoadInst(castPtr, name, false /* not volatile */,
1177  align, (llvm::Instruction *)NULL);
1178  lCopyMetadata(loadInst, callInst);
1179  llvm::ReplaceInstWithInst(callInst, loadInst);
1180  modifiedAny = true;
1181  goto restart;
1182  }
1183  }
1184  }
1185  else if (callInst->getCalledFunction() == avxMaskedStore32 ||
1186  callInst->getCalledFunction() == avxMaskedStore64) {
1187  // NOTE: mask is the 2nd parameter, not the 3rd one!!
1188  llvm::Value *factor = callInst->getArgOperand(1);
1189  uint64_t mask;
1190  if (lGetMask(factor, &mask) == true) {
1191  if (mask == 0) {
1192  // nothing actually being stored, just remove the inst
1193  callInst->eraseFromParent();
1194  modifiedAny = true;
1195  goto restart;
1196  }
1197  else if (mask == 0xff) {
1198  // all lanes storing, so replace with a regular store
1199  llvm::Value *rvalue = callInst->getArgOperand(2);
1200  llvm::Type *storeType = rvalue->getType();
1201  const char *name = LLVMGetName(callInst->getArgOperand(0),
1202  "_ptrcast");
1203  llvm::Value *castPtr =
1204  new llvm::BitCastInst(callInst->getArgOperand(0),
1205  llvm::PointerType::get(storeType, 0),
1206  name, callInst);
1207  lCopyMetadata(castPtr, callInst);
1208 
1209  llvm::StoreInst *storeInst =
1210  new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
1211  int align;
1212  if (g->opt.forceAlignedMemory)
1213  align = g->target->getNativeVectorAlignment();
1214  else
1215  align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
1216  storeInst->setAlignment(align);
1217  lCopyMetadata(storeInst, callInst);
1218  llvm::ReplaceInstWithInst(callInst, storeInst);
1219 
1220  modifiedAny = true;
1221  goto restart;
1222  }
1223  }
1224  }
1225  }
1226 
1227  DEBUG_END_PASS("IntrinsicsOpt");
1228 
1229  return modifiedAny;
1230 }
1231 
1232 
1233 bool
1234 IntrinsicsOpt::matchesMaskInstruction(llvm::Function *function) {
1235  for (unsigned int i = 0; i < maskInstructions.size(); ++i) {
1236  if (maskInstructions[i].function != NULL &&
1237  function == maskInstructions[i].function) {
1238  return true;
1239  }
1240  }
1241  return false;
1242 }
1243 
1244 
1245 IntrinsicsOpt::BlendInstruction *
1246 IntrinsicsOpt::matchingBlendInstruction(llvm::Function *function) {
1247  for (unsigned int i = 0; i < blendInstructions.size(); ++i) {
1248  if (blendInstructions[i].function != NULL &&
1249  function == blendInstructions[i].function) {
1250  return &blendInstructions[i];
1251  }
1252  }
1253  return NULL;
1254 }
1255 
1256 
1257 static llvm::Pass *
1258 CreateIntrinsicsOptPass() {
1259  return new IntrinsicsOpt;
1260 }
1261 
1262 
1263 ///////////////////////////////////////////////////////////////////////////
1264 
1265 /** This simple optimization pass looks for a vector select instruction
1266  with an all-on or all-off constant mask, simplifying it to the
1267  appropriate operand if so.
1268 
1269  @todo The better thing to do would be to submit a patch to LLVM to get
1270  these; they're presumably pretty simple patterns to match.
1271 */
1272 class InstructionSimplifyPass : public llvm::BasicBlockPass {
1273 public:
1274  InstructionSimplifyPass()
1275  : BasicBlockPass(ID) { }
1276 
1277 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
1278  const char *getPassName() const { return "Vector Select Optimization"; }
1279 #else // LLVM 4.0+
1280  llvm::StringRef getPassName() const { return "Vector Select Optimization"; }
1281 #endif
1282  bool runOnBasicBlock(llvm::BasicBlock &BB);
1283 
1284  static char ID;
1285 
1286 private:
1287  static bool simplifySelect(llvm::SelectInst *selectInst,
1288  llvm::BasicBlock::iterator iter);
1289  static llvm::Value *simplifyBoolVec(llvm::Value *value);
1290  static bool simplifyCall(llvm::CallInst *callInst,
1291  llvm::BasicBlock::iterator iter);
1292 };
1293 
1294 char InstructionSimplifyPass::ID = 0;
1295 
1296 
1297 llvm::Value *
1298 InstructionSimplifyPass::simplifyBoolVec(llvm::Value *value) {
1299  llvm::TruncInst *trunc = llvm::dyn_cast<llvm::TruncInst>(value);
1300  if (trunc != NULL) {
1301  // Convert trunc({sext,zext}(i1 vector)) -> (i1 vector)
1302  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(value);
1303  if (sext &&
1304  sext->getOperand(0)->getType() == LLVMTypes::Int1VectorType)
1305  return sext->getOperand(0);
1306 
1307  llvm::ZExtInst *zext = llvm::dyn_cast<llvm::ZExtInst>(value);
1308  if (zext &&
1309  zext->getOperand(0)->getType() == LLVMTypes::Int1VectorType)
1310  return zext->getOperand(0);
1311  }
1312  /*
1313  // This optimization has discernable benefit on the perf
1314  // suite on latest LLVM versions.
1315  // On 3.4+ (maybe even older), it can result in illegal
1316  // operations, so it's being disabled.
1317  llvm::ICmpInst *icmp = llvm::dyn_cast<llvm::ICmpInst>(value);
1318  if (icmp != NULL) {
1319  // icmp(ne, {sext,zext}(foo), zeroinitializer) -> foo
1320  if (icmp->getSignedPredicate() == llvm::CmpInst::ICMP_NE) {
1321  llvm::Value *op1 = icmp->getOperand(1);
1322  if (llvm::isa<llvm::ConstantAggregateZero>(op1)) {
1323  llvm::Value *op0 = icmp->getOperand(0);
1324  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(op0);
1325  if (sext)
1326  return sext->getOperand(0);
1327  llvm::ZExtInst *zext = llvm::dyn_cast<llvm::ZExtInst>(op0);
1328  if (zext)
1329  return zext->getOperand(0);
1330  }
1331  }
1332 
1333  }
1334  */
1335  return NULL;
1336 }
1337 
1338 
1339 bool
1340 InstructionSimplifyPass::simplifySelect(llvm::SelectInst *selectInst,
1341  llvm::BasicBlock::iterator iter) {
1342  if (selectInst->getType()->isVectorTy() == false)
1343  return false;
1344 
1345  llvm::Value *factor = selectInst->getOperand(0);
1346 
1347  // Simplify all-on or all-off mask values
1348  MaskStatus maskStatus = lGetMaskStatus(factor);
1349  llvm::Value *value = NULL;
1350  if (maskStatus == ALL_ON)
1351  // Mask all on -> replace with the first select value
1352  value = selectInst->getOperand(1);
1353  else if (maskStatus == ALL_OFF)
1354  // Mask all off -> replace with the second select value
1355  value = selectInst->getOperand(2);
1356  if (value != NULL) {
1357  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
1358  iter, value);
1359  return true;
1360  }
1361 
1362  // Sometimes earlier LLVM optimization passes generate unnecessarily
1363  // complex expressions for the selection vector, which in turn confuses
1364  // the code generators and leads to sub-optimal code (particularly for
1365  // 8 and 16-bit masks). We'll try to simplify them out here so that
1366  // the code generator patterns match..
1367  if ((factor = simplifyBoolVec(factor)) != NULL) {
1368  llvm::Instruction *newSelect =
1369  llvm::SelectInst::Create(factor, selectInst->getOperand(1),
1370  selectInst->getOperand(2),
1371  selectInst->getName());
1372  llvm::ReplaceInstWithInst(selectInst, newSelect);
1373  return true;
1374  }
1375 
1376  return false;
1377 }
1378 
1379 
1380 bool
1381 InstructionSimplifyPass::simplifyCall(llvm::CallInst *callInst,
1382  llvm::BasicBlock::iterator iter) {
1383  llvm::Function *calledFunc = callInst->getCalledFunction();
1384 
1385  // Turn a __movmsk call with a compile-time constant vector into the
1386  // equivalent scalar value.
1387  if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk"))
1388  return false;
1389 
1390  uint64_t mask;
1391  if (lGetMask(callInst->getArgOperand(0), &mask) == true) {
1392  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
1393  iter, LLVMInt64(mask));
1394  return true;
1395  }
1396  return false;
1397 }
1398 
1399 
1400 bool
1401 InstructionSimplifyPass::runOnBasicBlock(llvm::BasicBlock &bb) {
1402  DEBUG_START_PASS("InstructionSimplify");
1403 
1404  bool modifiedAny = false;
1405 
1406  restart:
1407  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
1408  llvm::SelectInst *selectInst = llvm::dyn_cast<llvm::SelectInst>(&*iter);
1409  if (selectInst && simplifySelect(selectInst, iter)) {
1410  modifiedAny = true;
1411  goto restart;
1412  }
1413  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
1414  if (callInst && simplifyCall(callInst, iter)) {
1415  modifiedAny = true;
1416  goto restart;
1417  }
1418  }
1419 
1420  DEBUG_END_PASS("InstructionSimplify");
1421 
1422  return modifiedAny;
1423 }
1424 
1425 
1426 static llvm::Pass *
1427 CreateInstructionSimplifyPass() {
1428  return new InstructionSimplifyPass;
1429 }
1430 
1431 
1432 ///////////////////////////////////////////////////////////////////////////
1433 // ImproveMemoryOpsPass
1434 
1435 /** When the front-end emits gathers and scatters, it generates an array of
1436  vector-width pointers to represent the set of addresses to read from or
1437  write to. This optimization detects cases when the base pointer is a
1438  uniform pointer or when the indexing is into an array that can be
1439  converted into scatters/gathers from a single base pointer and an array
1440  of offsets.
1441 
1442  See for example the comments discussing the __pseudo_gather functions
1443  in builtins.cpp for more information about this.
1444  */
1445 class ImproveMemoryOpsPass : public llvm::BasicBlockPass {
1446 public:
1447  static char ID;
1448  ImproveMemoryOpsPass() : BasicBlockPass(ID) { }
1449 
1450 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
1451  const char *getPassName() const { return "Improve Memory Ops"; }
1452 #else // LLVM 4.0+
1453  llvm::StringRef getPassName() const { return "Improve Memory Ops"; }
1454 #endif
1455  bool runOnBasicBlock(llvm::BasicBlock &BB);
1456 };
1457 
1458 char ImproveMemoryOpsPass::ID = 0;
1459 
1460 
1461 
1462 /** Check to make sure that this value is actually a pointer in the end.
1463  We need to make sure that given an expression like vec(offset) +
1464  ptr2int(ptr), lGetBasePointer() doesn't return vec(offset) for the base
1465  pointer such that we then treat ptr2int(ptr) as an offset. This ends
1466  up being important so that we don't generate LLVM GEP instructions like
1467  "gep inttoptr 8, i64 %ptr", which in turn can lead to incorrect code
1468  since LLVM's pointer aliasing analysis assumes that operands after the
1469  first one to a GEP aren't pointers.
1470  */
1471 static llvm::Value *
1472 lCheckForActualPointer(llvm::Value *v) {
1473  if (v == NULL) {
1474  return NULL;
1475  }
1476  else if (llvm::isa<llvm::PointerType>(v->getType())) {
1477  return v;
1478  }
1479  else if (llvm::isa<llvm::PtrToIntInst>(v)) {
1480  return v;
1481  }
1482 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_7
1483  // This one is tricky, as it's a heuristic tuned for LLVM 3.7+, which may
1484  // optimize loading double* with consequent ptr2int to straight load of i64.
1485  // This heuristic should be good enough to catch all the cases we should
1486  // detect and nothing else.
1487  else if (llvm::isa<llvm::LoadInst>(v)) {
1488  return v;
1489  }
1490 #endif
1491  else if (llvm::CastInst *ci = llvm::dyn_cast<llvm::CastInst>(v)) {
1492  llvm::Value *t = lCheckForActualPointer(ci->getOperand(0));
1493  if (t == NULL) {
1494  return NULL;
1495  }
1496  else {
1497  return v;
1498  }
1499  }
1500  else {
1501  llvm::ConstantExpr *uce =
1502  llvm::dyn_cast<llvm::ConstantExpr>(v);
1503  if (uce != NULL &&
1504  uce->getOpcode() == llvm::Instruction::PtrToInt)
1505  return v;
1506  return NULL;
1507  }
1508 }
1509 
1510 
1511 /** Given an llvm::Value representing a varying pointer, this function
1512  checks to see if all of the elements of the vector have the same value
1513  (i.e. there's a common base pointer). If a broadcast has already been detected,
1514  it checks that the first element of the vector is not undef. If one of these conditions
1515  is true, it returns the common pointer value; otherwise it returns NULL.
1516  */
1517 static llvm::Value *
1518 lGetBasePointer(llvm::Value *v, llvm::Instruction *insertBefore, bool broadcastDetected) {
1519  if (llvm::isa<llvm::InsertElementInst>(v) ||
1520  llvm::isa<llvm::ShuffleVectorInst>(v)) {
1521  // If we have already detected broadcast we want to look for
1522  // the vector with the first not-undef element
1523  llvm::Value *element = LLVMFlattenInsertChain
1524  (v, g->target->getVectorWidth(), true, false, broadcastDetected);
1525  // TODO: it's probably ok to allow undefined elements and return
1526  // the base pointer if all of the other elements have the same
1527  // value.
1528  if (element != NULL) {
1529  // all of the elements are the same and are not NULL
1530  return lCheckForActualPointer(element);
1531  }
1532  else {
1533  return NULL;
1534  }
1535  }
1536 
1537  // This case comes up with global/static arrays
1538  if (llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(v)) {
1539  return lCheckForActualPointer(cv->getSplatValue());
1540  }
1541  else if (llvm::ConstantDataVector *cdv = llvm::dyn_cast<llvm::ConstantDataVector>(v)) {
1542  return lCheckForActualPointer(cdv->getSplatValue());
1543  }
1544  // It is a little tricky to handle operations on pointers that have been cast to integers of a different bit width,
1545  // but doing so is sometimes useful, so we handle that case here.
1546  else if (llvm::CastInst *ci = llvm::dyn_cast<llvm::CastInst>(v)) {
1547  llvm::Value *t = lGetBasePointer(ci->getOperand(0), insertBefore, broadcastDetected);
1548  if (t == NULL) {
1549  return NULL;
1550  }
1551  else {
1552  return llvm::CastInst::Create(ci->getOpcode(),
1553  t, ci->getType()->getScalarType(),
1554  LLVMGetName(t, "_cast"),
1555  insertBefore);
1556  }
1557  }
1558 
1559  return NULL;
1560 }
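// Illustrative sketch (hypothetical value names): a broadcast of a scalar
// pointer typically looks like
//     %pi  = ptrtoint float* %base to i64
//     %ins = insertelement <W x i64> undef, i64 %pi, i32 0
//     %vec = shufflevector <W x i64> %ins, <W x i64> undef, <W x i32> zeroinitializer
// For such a %vec, lGetBasePointer() flattens the insert/shuffle chain, finds
// %pi in every lane, and returns it as the common base pointer.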
1561 
1562 
1563 /** Given the two operands to a constant add expression, see if we have the
1564  form "base pointer + offset", where op0 is the base pointer and op1 is
1565  the offset; if so return the base and the offset. */
1566 static llvm::Constant *
1567 lGetConstantAddExprBaseOffset(llvm::Constant *op0, llvm::Constant *op1,
1568  llvm::Constant **delta) {
1569  llvm::ConstantExpr *op = llvm::dyn_cast<llvm::ConstantExpr>(op0);
1570  if (op == NULL || op->getOpcode() != llvm::Instruction::PtrToInt)
1571  // the first operand isn't a pointer
1572  return NULL;
1573 
1574  llvm::ConstantInt *opDelta = llvm::dyn_cast<llvm::ConstantInt>(op1);
1575  if (opDelta == NULL)
1576  // the second operand isn't an integer constant
1577  return NULL;
1578 
1579  *delta = opDelta;
1580  return op0;
1581 }
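// Illustrative sketch (the global @arr is hypothetical): for a constant
// expression of the form
//     add (i64 ptrtoint ([16 x float]* @arr to i64), i64 8)
// lGetConstantAddExprBaseOffset() returns the ptrtoint constant as the base
// and stores the i64 8 constant into *delta; with the operands swapped it
// returns NULL, which is why callers try both orderings.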
1582 
1583 
1584 static llvm::Value *
1585 lExtractFromInserts(llvm::Value *v, unsigned int index) {
1586  llvm::InsertValueInst *iv = llvm::dyn_cast<llvm::InsertValueInst>(v);
1587  if (iv == NULL)
1588  return NULL;
1589 
1590  Assert(iv->hasIndices() && iv->getNumIndices() == 1);
1591  if (iv->getIndices()[0] == index)
1592  return iv->getInsertedValueOperand();
1593  else
1594  return lExtractFromInserts(iv->getAggregateOperand(), index);
1595 }
1596 
1597 
1598 /** Given a varying pointer in ptrs, this function checks to see if it can
1599  be determined to be indexing from a common uniform base pointer. If
1600  so, the function returns the base pointer llvm::Value and initializes
1601  *offsets with an int vector of the per-lane offsets
1602  */
1603 static llvm::Value *
1604 lGetBasePtrAndOffsets(llvm::Value *ptrs, llvm::Value **offsets,
1605  llvm::Instruction *insertBefore) {
1606  if (g->debugPrint) {
1607  fprintf(stderr, "lGetBasePtrAndOffsets\n");
1608  LLVMDumpValue(ptrs);
1609  }
1610 
1611  bool broadcastDetected = false;
1612  // Looking for %gep_offset = shufflevector <8 x i64> %0, <8 x i64> undef, <8 x i32> zeroinitializer
1613  llvm::ShuffleVectorInst *shuffle = llvm::dyn_cast<llvm::ShuffleVectorInst>(ptrs);
1614  if (shuffle != NULL) {
1615  llvm::Value *indices = shuffle->getOperand(2);
1616  llvm::Value *vec = shuffle->getOperand(1);
1617  if (lIsUndef(vec) && llvm::isa<llvm::ConstantAggregateZero>(indices)) {
1618  broadcastDetected = true;
1619  }
1620  }
1621  llvm::Value *base = lGetBasePointer(ptrs, insertBefore, broadcastDetected);
1622  if (base != NULL) {
1623  // We have a straight up varying pointer with no indexing that's
1624  // actually all the same value.
1625  if (g->target->is32Bit())
1626  *offsets = LLVMInt32Vector(0);
1627  else
1628  *offsets = LLVMInt64Vector((int64_t)0);
1629 
1630  if (broadcastDetected){
1631  llvm::Value *op = shuffle->getOperand(0);
1632  llvm::BinaryOperator * bop_var = llvm::dyn_cast<llvm::BinaryOperator>(op);
1633  if (bop_var != NULL && bop_var->getOpcode() == llvm::Instruction::Add) {
1634  // Here we expect a ConstantVector such as
1635  // <i64 4, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>
1636  llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(bop_var->getOperand(1));
1637  if (cv!= NULL) {
1638  llvm::Value *zeroMask = llvm::ConstantVector::getSplat(
1639  cv->getType()->getVectorNumElements(),
1640  llvm::Constant::getNullValue(llvm::Type::getInt32Ty(*g->ctx)));
1641  // Create offset
1642  llvm::Value *shuffle_offset = new llvm::ShuffleVectorInst(cv, llvm::UndefValue::get(cv->getType()),
1643  zeroMask, "shuffle", bop_var);
1644  *offsets = llvm::BinaryOperator::Create(llvm::Instruction::Add, *offsets,
1645  shuffle_offset, "new_offsets", insertBefore);
1646  }
1647  }
1648  }
1649  return base;
1650  }
1651 
1652  llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(ptrs);
1653  if (bop != NULL && bop->getOpcode() == llvm::Instruction::Add) {
1654  // If we have a common pointer plus something, then we're also
1655  // good.
1656  if ((base = lGetBasePtrAndOffsets(bop->getOperand(0),
1657  offsets, insertBefore)) != NULL) {
1658  *offsets =
1659  llvm::BinaryOperator::Create(llvm::Instruction::Add, *offsets,
1660  bop->getOperand(1), "new_offsets",
1661  insertBefore);
1662  return base;
1663  }
1664  else if ((base = lGetBasePtrAndOffsets(bop->getOperand(1),
1665  offsets, insertBefore)) != NULL) {
1666  *offsets =
1667  llvm::BinaryOperator::Create(llvm::Instruction::Add, *offsets,
1668  bop->getOperand(0), "new_offsets",
1669  insertBefore);
1670  return base;
1671  }
1672  }
1673  llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(ptrs);
1674  if (cv != NULL) {
1675  // Indexing into global arrays can lead to this form, with
1676  // ConstantVectors..
1677  llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
1678  for (int i = 0; i < (int)cv->getNumOperands(); ++i) {
1679  llvm::Constant *c =
1680  llvm::dyn_cast<llvm::Constant>(cv->getOperand(i));
1681  if (c == NULL)
1682  return NULL;
1683  elements.push_back(c);
1684  }
1685 
1686  llvm::Constant *delta[ISPC_MAX_NVEC];
1687  for (unsigned int i = 0; i < elements.size(); ++i) {
1688  // For each element, try to decompose it into either a straight
1689  // up base pointer, or a base pointer plus an integer value.
1690  llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(elements[i]);
1691  if (ce == NULL)
1692  return NULL;
1693 
1694  delta[i] = NULL;
1695  llvm::Value *elementBase = NULL; // base pointer for this element
1696  if (ce->getOpcode() == llvm::Instruction::PtrToInt) {
1697  // If the element is just a ptr to int instruction, treat
1698  // it as having an offset of zero
1699  elementBase = ce;
1700  delta[i] = g->target->is32Bit() ? LLVMInt32(0) : LLVMInt64(0);
1701  }
1702  else if (ce->getOpcode() == llvm::Instruction::Add) {
1703  // Try both orderings of the operands to see if we can get
1704  // a pointer+offset out of them.
1705  elementBase =
1706  lGetConstantAddExprBaseOffset(ce->getOperand(0),
1707  ce->getOperand(1),
1708  &delta[i]);
1709  if (elementBase == NULL)
1710  elementBase =
1711  lGetConstantAddExprBaseOffset(ce->getOperand(1),
1712  ce->getOperand(0),
1713  &delta[i]);
1714  }
1715 
1716  // We weren't able to find a base pointer in the above. (We
1717  // don't expect this to happen; if it does, it may be necessary
1718  // to handle more cases in the decomposition above.)
1719  if (elementBase == NULL)
1720  return NULL;
1721 
1722  Assert(delta[i] != NULL);
1723  if (base == NULL)
1724  // The first time we've found a base pointer
1725  base = elementBase;
1726  else if (base != elementBase)
1727  // Different program instances have different base
1728  // pointers, so no luck.
1729  return NULL;
1730  }
1731 
1732  Assert(base != NULL);
1733  llvm::ArrayRef<llvm::Constant *> deltas(&delta[0],
1734  &delta[elements.size()]);
1735  *offsets = llvm::ConstantVector::get(deltas);
1736  return base;
1737  }
1738 
1739  llvm::ExtractValueInst *ev = llvm::dyn_cast<llvm::ExtractValueInst>(ptrs);
1740  if (ev != NULL) {
1741  Assert(ev->getNumIndices() == 1);
1742  int index = ev->getIndices()[0];
1743  ptrs = lExtractFromInserts(ev->getAggregateOperand(), index);
1744  if (ptrs != NULL)
1745  return lGetBasePtrAndOffsets(ptrs, offsets, insertBefore);
1746  }
1747 
1748  return NULL;
1749 }
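// Illustrative sketch (hypothetical global and widths): indexing a global
// array with per-lane constants can yield a ConstantVector of addresses like
//     <i64 add (i64 ptrtoint ([16 x float]* @arr to i64), i64 0),
//      i64 add (i64 ptrtoint ([16 x float]* @arr to i64), i64 4),
//      i64 add (i64 ptrtoint ([16 x float]* @arr to i64), i64 8),
//      i64 add (i64 ptrtoint ([16 x float]* @arr to i64), i64 12)>
// which lGetBasePtrAndOffsets() decomposes into the common base (the
// ptrtoint of @arr) and the offset vector <i64 0, i64 4, i64 8, i64 12>.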
1750 
1751 
1752 /** Given a vector expression in vec, separate it into a compile-time
1753  constant component and a variable component, returning the two parts in
1754  *constOffset and *variableOffset. (It should be the case that the sum
1755  of these two is exactly equal to the original vector.)
1756 
1757  This routine only handles some (important) patterns; in some cases it
1758  will fail and return components that are actually compile-time
1759  constants in *variableOffset.
1760 
1761  Finally, if there aren't any constant (or, respectively, variable)
1762  components, the corresponding return value may be set to NULL.
1763  */
1764 static void
1765 lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset,
1766  llvm::Value **variableOffset,
1767  llvm::Instruction *insertBefore) {
1768  if (llvm::isa<llvm::ConstantVector>(vec) ||
1769  llvm::isa<llvm::ConstantDataVector>(vec) ||
1770  llvm::isa<llvm::ConstantAggregateZero>(vec)) {
1771  *constOffset = vec;
1772  *variableOffset = NULL;
1773  return;
1774  }
1775 
1776  llvm::CastInst *cast = llvm::dyn_cast<llvm::CastInst>(vec);
1777  if (cast != NULL) {
1778  // Check the cast target.
1779  llvm::Value *co, *vo;
1780  lExtractConstantOffset(cast->getOperand(0), &co, &vo, insertBefore);
1781 
1782  // make new cast instructions for the two parts
1783  if (co == NULL)
1784  *constOffset = NULL;
1785  else
1786  *constOffset = llvm::CastInst::Create(cast->getOpcode(),
1787  co, cast->getType(),
1788  LLVMGetName(co, "_cast"),
1789  insertBefore);
1790  if (vo == NULL)
1791  *variableOffset = NULL;
1792  else
1793  *variableOffset = llvm::CastInst::Create(cast->getOpcode(),
1794  vo, cast->getType(),
1795  LLVMGetName(vo, "_cast"),
1796  insertBefore);
1797  return;
1798  }
1799 
1800  llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(vec);
1801  if (bop != NULL) {
1802  llvm::Value *op0 = bop->getOperand(0);
1803  llvm::Value *op1 = bop->getOperand(1);
1804  llvm::Value *c0, *v0, *c1, *v1;
1805 
1806  if (bop->getOpcode() == llvm::Instruction::Add) {
1807  lExtractConstantOffset(op0, &c0, &v0, insertBefore);
1808  lExtractConstantOffset(op1, &c1, &v1, insertBefore);
1809 
1810  if (c0 == NULL || llvm::isa<llvm::ConstantAggregateZero>(c0))
1811  *constOffset = c1;
1812  else if (c1 == NULL || llvm::isa<llvm::ConstantAggregateZero>(c1))
1813  *constOffset = c0;
1814  else
1815  *constOffset =
1816  llvm::BinaryOperator::Create(llvm::Instruction::Add, c0, c1,
1817  LLVMGetName("add", c0, c1),
1818  insertBefore);
1819 
1820  if (v0 == NULL || llvm::isa<llvm::ConstantAggregateZero>(v0))
1821  *variableOffset = v1;
1822  else if (v1 == NULL || llvm::isa<llvm::ConstantAggregateZero>(v1))
1823  *variableOffset = v0;
1824  else
1825  *variableOffset =
1826  llvm::BinaryOperator::Create(llvm::Instruction::Add, v0, v1,
1827  LLVMGetName("add", v0, v1),
1828  insertBefore);
1829  return;
1830  }
1831  else if (bop->getOpcode() == llvm::Instruction::Shl) {
1832  lExtractConstantOffset(op0, &c0, &v0, insertBefore);
1833  lExtractConstantOffset(op1, &c1, &v1, insertBefore);
1834 
1835  // Given the product of constant and variable terms, we have:
1836  // (c0 + v0) * (2^(c1 + v1)) = c0 * 2^c1 * 2^v1 + v0 * 2^c1 * 2^v1
1837  // We can optimize only if v1 == NULL.
1838  if ((v1 != NULL) || (c0 == NULL) || (c1 == NULL)) {
1839  *constOffset = NULL;
1840  *variableOffset = vec;
1841  }
1842  else if (v0 == NULL) {
1843  *constOffset = vec;
1844  *variableOffset = NULL;
1845  }
1846  else {
1847  *constOffset =
1848  llvm::BinaryOperator::Create(llvm::Instruction::Shl, c0, c1,
1849  LLVMGetName("shl", c0, c1),
1850  insertBefore);
1851  *variableOffset =
1852  llvm::BinaryOperator::Create(llvm::Instruction::Shl, v0, c1,
1853  LLVMGetName("shl", v0, c1),
1854  insertBefore);
1855  }
1856  return;
1857  }
1858  else if (bop->getOpcode() == llvm::Instruction::Mul) {
1859  lExtractConstantOffset(op0, &c0, &v0, insertBefore);
1860  lExtractConstantOffset(op1, &c1, &v1, insertBefore);
1861 
1862  // Given the product of constant and variable terms, we have:
1863  // (c0 + v0) * (c1 + v1) == (c0 c1) + (v0 c1 + c0 v1 + v0 v1)
1864  // Note that the first term is a constant and the last three are
1865  // variable.
1866  if (c0 != NULL && c1 != NULL)
1867  *constOffset =
1868  llvm::BinaryOperator::Create(llvm::Instruction::Mul, c0, c1,
1869  LLVMGetName("mul", c0, c1),
1870  insertBefore);
1871  else
1872  *constOffset = NULL;
1873 
1874  llvm::Value *va = NULL, *vb = NULL, *vc = NULL;
1875  if (v0 != NULL && c1 != NULL)
1876  va = llvm::BinaryOperator::Create(llvm::Instruction::Mul, v0, c1,
1877  LLVMGetName("mul", v0, c1), insertBefore);
1878  if (c0 != NULL && v1 != NULL)
1879  vb = llvm::BinaryOperator::Create(llvm::Instruction::Mul, c0, v1,
1880  LLVMGetName("mul", c0, v1), insertBefore);
1881  if (v0 != NULL && v1 != NULL)
1882  vc = llvm::BinaryOperator::Create(llvm::Instruction::Mul, v0, v1,
1883  LLVMGetName("mul", v0, v1), insertBefore);
1884 
1885 
1886  llvm::Value *vab = NULL;
1887  if (va != NULL && vb != NULL)
1888  vab = llvm::BinaryOperator::Create(llvm::Instruction::Add, va, vb,
1889  LLVMGetName("add", va, vb), insertBefore);
1890  else if (va != NULL)
1891  vab = va;
1892  else
1893  vab = vb;
1894 
1895  if (vab != NULL && vc != NULL)
1896  *variableOffset =
1897  llvm::BinaryOperator::Create(llvm::Instruction::Add, vab, vc,
1898  LLVMGetName("add", vab, vc), insertBefore);
1899  else if (vab != NULL)
1900  *variableOffset = vab;
1901  else
1902  *variableOffset = vc;
1903 
1904  return;
1905  }
1906  }
1907 
1908  // Nothing matched, just return what we have as a variable component
1909  *constOffset = NULL;
1910  *variableOffset = vec;
1911 }
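// Illustrative sketch (hypothetical value names): for an offset expression
//     %off = add <W x i32> %varying_idx, <i32 16, i32 16, ...>
// lExtractConstantOffset() returns the splatted constant vector in
// *constOffset and %varying_idx in *variableOffset. For a multiply such as
//     %off = mul <W x i32> (add %v, %c), %c1
// it distributes the product, keeping c*c1 as the constant part and folding
// the remaining v*c1 (+ c*v1 + v*v1) terms into the variable part.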
1912 
1913 
1914 /* Returns true if the given value is a constant vector of integers with
1915  the same value in all of the elements. (Returns the splatted value in
1916  *splat, if so). */
1917 static bool
1918 lIsIntegerSplat(llvm::Value *v, int *splat) {
1919  llvm::ConstantDataVector *cvec =
1920  llvm::dyn_cast<llvm::ConstantDataVector>(v);
1921  if (cvec == NULL)
1922  return false;
1923 
1924  llvm::Constant *splatConst = cvec->getSplatValue();
1925  if (splatConst == NULL)
1926  return false;
1927 
1928  llvm::ConstantInt *ci =
1929  llvm::dyn_cast<llvm::ConstantInt>(splatConst);
1930  if (ci == NULL)
1931  return false;
1932 
1933  int64_t splatVal = ci->getSExtValue();
1934  *splat = (int)splatVal;
1935  return true;
1936 }
1937 
1938 
1939 static llvm::Value *
1940 lExtract248Scale(llvm::Value *splatOperand, int splatValue,
1941  llvm::Value *otherOperand, llvm::Value **result) {
1942  if (splatValue == 2 || splatValue == 4 || splatValue == 8) {
1943  *result = otherOperand;
1944  return LLVMInt32(splatValue);
1945  }
1946  // Even if we don't have a common scale by exactly 2, 4, or 8, we'll
1947  // see if we can pull out that much of the scale anyway; this may in
1948  // turn allow other optimizations later.
1949  for (int scale = 8; scale >= 2; scale /= 2) {
1950  llvm::Instruction *insertBefore =
1951  llvm::dyn_cast<llvm::Instruction>(*result);
1952  Assert(insertBefore != NULL);
1953 
1954  if ((splatValue % scale) == 0) {
1955  // *result = otherOperand * splatOperand / scale;
1956  llvm::Value *splatScaleVec =
1957  (splatOperand->getType() == LLVMTypes::Int32VectorType) ?
1958  LLVMInt32Vector(scale) : LLVMInt64Vector(scale);
1959  llvm::Value *splatDiv =
1960  llvm::BinaryOperator::Create(llvm::Instruction::SDiv,
1961  splatOperand, splatScaleVec,
1962  "div", insertBefore);
1963  *result =
1964  llvm::BinaryOperator::Create(llvm::Instruction::Mul,
1965  splatDiv, otherOperand,
1966  "mul", insertBefore);
1967  return LLVMInt32(scale);
1968  }
1969  }
1970  return LLVMInt32(1);
1971 }
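// Illustrative sketch (values chosen for illustration): if the offsets are
// scaled by a splat of 12, there is no exact 2/4/8 match, but 12 is divisible
// by 4, so lExtract248Scale() rewrites
//     offsets = vec * 12      as      offsets = (vec * 3) * 4
// returning i32 4 as the scale to fold into the addressing mode and leaving
// *result pointing at the (vec * 3) product.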
1972 
1973 
1974 /** Given a vector of integer offsets to a base pointer being used for a
1975  gather or a scatter, see if its root operation is a multiply of some
1976  vector value by a splat of 2, 4, or 8. If not, return NULL.
1977 
1978  If it is, return an i32 with the value 2, 4, or 8, and modify
1979  *vec so that it points to the operand that is being multiplied by
1980  2/4/8.
1981 
1982  We go through all this trouble so that we can pass the i32 scale factor
1983  to the {gather,scatter}_base_offsets function as a separate scale
1984  factor for the offsets. This in turn is used so that the LLVM
1985  x86 code generator can apply x86's free scaling by 2, 4, or
1986  8 to one of the two registers being added together in an addressing
1987  calculation.
1988  */
1989 static llvm::Value *
1990 lExtractOffsetVector248Scale(llvm::Value **vec) {
1991  llvm::CastInst *cast = llvm::dyn_cast<llvm::CastInst>(*vec);
1992  if (cast != NULL) {
1993  llvm::Value *castOp = cast->getOperand(0);
1994  // Check the cast target.
1995  llvm::Value *scale = lExtractOffsetVector248Scale(&castOp);
1996  if (scale == NULL)
1997  return NULL;
1998 
1999  // make a new cast instruction so that we end up with the right
2000  // type
2001  *vec = llvm::CastInst::Create(cast->getOpcode(), castOp, cast->getType(), "offset_cast", cast);
2002  return scale;
2003  }
2004 
2005 
2006  // If we don't have a binary operator, then just give up
2007  llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(*vec);
2008  if (bop == NULL)
2009  return LLVMInt32(1);
2010 
2011  llvm::Value *op0 = bop->getOperand(0), *op1 = bop->getOperand(1);
2012  if (bop->getOpcode() == llvm::Instruction::Add) {
2013  if (llvm::isa<llvm::ConstantAggregateZero>(op0)) {
2014  *vec = op1;
2015  return lExtractOffsetVector248Scale(vec);
2016  }
2017  else if (llvm::isa<llvm::ConstantAggregateZero>(op1)) {
2018  *vec = op0;
2019  return lExtractOffsetVector248Scale(vec);
2020  }
2021  else {
2022  llvm::Value *s0 = lExtractOffsetVector248Scale(&op0);
2023  llvm::Value *s1 = lExtractOffsetVector248Scale(&op1);
2024  if (s0 == s1) {
2025  *vec = llvm::BinaryOperator::Create(llvm::Instruction::Add,
2026  op0, op1, "new_add", bop);
2027  return s0;
2028  }
2029  else
2030  return LLVMInt32(1);
2031  }
2032  }
2033  else if (bop->getOpcode() == llvm::Instruction::Mul) {
2034  // Check each operand for being one of the scale factors we care about.
2035  int splat;
2036  if (lIsIntegerSplat(op0, &splat))
2037  return lExtract248Scale(op0, splat, op1, vec);
2038  else if (lIsIntegerSplat(op1, &splat))
2039  return lExtract248Scale(op1, splat, op0, vec);
2040  else
2041  return LLVMInt32(1);
2042  }
2043  else
2044  return LLVMInt32(1);
2045 }
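// Illustrative sketch (hypothetical value names): for offsets computed as
//     %off = mul <W x i64> %idx, <i64 8, i64 8, ...>
// lExtractOffsetVector248Scale() rewrites *vec to %idx and returns i32 8, so
// that a later base+offsets gather/scatter can let the x86 addressing mode
// apply the *8 scale for free (i.e. an effective address of base + idx*8).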
2046 
2047 #if 0
2048 static llvm::Value *
2049 lExtractUniforms(llvm::Value **vec, llvm::Instruction *insertBefore) {
2050  fprintf(stderr, " lextract: ");
2051  (*vec)->dump();
2052  fprintf(stderr, "\n");
2053 
2054  if (llvm::isa<llvm::ConstantVector>(*vec) ||
2055  llvm::isa<llvm::ConstantDataVector>(*vec) ||
2056  llvm::isa<llvm::ConstantAggregateZero>(*vec))
2057  return NULL;
2058 
2059  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(*vec);
2060  if (sext != NULL) {
2061  llvm::Value *sextOp = sext->getOperand(0);
2062  // Check the sext target.
2063  llvm::Value *unif = lExtractUniforms(&sextOp, insertBefore);
2064  if (unif == NULL)
2065  return NULL;
2066 
2067  // make a new sext instruction so that we end up with the right
2068  // type
2069  *vec = new llvm::SExtInst(sextOp, sext->getType(), "offset_sext", sext);
2070  return unif;
2071  }
2072 
2073  if (LLVMVectorValuesAllEqual(*vec)) {
2074  // FIXME: we may want to redo all of the expression here, in scalar
2075  // form (if at all possible), for code quality...
2076  llvm::Value *unif =
2077  llvm::ExtractElementInst::Create(*vec, LLVMInt32(0),
2078  "first_uniform", insertBefore);
2079  *vec = NULL;
2080  return unif;
2081  }
2082 
2083  llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(*vec);
2084  if (bop == NULL)
2085  return NULL;
2086 
2087  llvm::Value *op0 = bop->getOperand(0), *op1 = bop->getOperand(1);
2088  if (bop->getOpcode() == llvm::Instruction::Add) {
2089  llvm::Value *s0 = lExtractUniforms(&op0, insertBefore);
2090  llvm::Value *s1 = lExtractUniforms(&op1, insertBefore);
2091  if (s0 == NULL && s1 == NULL)
2092  return NULL;
2093 
2094  if (op0 == NULL)
2095  *vec = op1;
2096  else if (op1 == NULL)
2097  *vec = op0;
2098  else
2099  *vec = llvm::BinaryOperator::Create(llvm::Instruction::Add,
2100  op0, op1, "new_add", insertBefore);
2101 
2102  if (s0 == NULL)
2103  return s1;
2104  else if (s1 == NULL)
2105  return s0;
2106  else
2107  return llvm::BinaryOperator::Create(llvm::Instruction::Add, s0, s1,
2108  "add_unif", insertBefore);
2109  }
2110 #if 0
2111  else if (bop->getOpcode() == llvm::Instruction::Mul) {
2112  // Check each operand for being one of the scale factors we care about.
2113  int splat;
2114  if (lIs248Splat(op0, &splat)) {
2115  *vec = op1;
2116  return LLVMInt32(splat);
2117  }
2118  else if (lIs248Splat(op1, &splat)) {
2119  *vec = op0;
2120  return LLVMInt32(splat);
2121  }
2122  else
2123  return LLVMInt32(1);
2124  }
2125 #endif
2126  else
2127  return NULL;
2128 }
2129 
2130 
2131 static void
2132 lExtractUniformsFromOffset(llvm::Value **basePtr, llvm::Value **offsetVector,
2133  llvm::Value *offsetScale,
2134  llvm::Instruction *insertBefore) {
2135 #if 1
2136  (*basePtr)->dump();
2137  printf("\n");
2138  (*offsetVector)->dump();
2139  printf("\n");
2140  offsetScale->dump();
2141  printf("-----\n");
2142 #endif
2143 
2144  llvm::Value *uniformDelta = lExtractUniforms(offsetVector, insertBefore);
2145  if (uniformDelta == NULL)
2146  return;
2147 
2148  *basePtr = lGEPInst(*basePtr, arrayRef, "new_base", insertBefore);
2149 
2150  // this should only happen if we have only uniforms, but that in turn
2151  // shouldn't be a gather/scatter!
2152  Assert(*offsetVector != NULL);
2153 }
2154 #endif
2155 
2156 
2157 static bool
2158 lVectorIs32BitInts(llvm::Value *v) {
2159  int nElts;
2160  int64_t elts[ISPC_MAX_NVEC];
2161  if (!LLVMExtractVectorInts(v, elts, &nElts))
2162  return false;
2163 
2164  for (int i = 0; i < nElts; ++i)
2165  if ((int32_t)elts[i] != elts[i])
2166  return false;
2167 
2168  return true;
2169 }
2170 
2171 
2172 /** Check to see if the two offset vectors can safely be represented with
2173  32-bit values. If so, return true and update the pointed-to
2174  llvm::Value *s to be the 32-bit equivalents. */
2175 static bool
2176 lOffsets32BitSafe(llvm::Value **variableOffsetPtr,
2177  llvm::Value **constOffsetPtr,
2178  llvm::Instruction *insertBefore) {
2179  llvm::Value *variableOffset = *variableOffsetPtr;
2180  llvm::Value *constOffset = *constOffsetPtr;
2181 
2182  if (variableOffset->getType() != LLVMTypes::Int32VectorType) {
2183  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(variableOffset);
2184  if (sext != NULL &&
2185  sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType)
2186  // sext of a 32-bit vector -> the 32-bit vector is good
2187  variableOffset = sext->getOperand(0);
2188  else if (lVectorIs32BitInts(variableOffset))
2189  // The only constant vector we should have here is a vector of
2190  // all zeros (i.e. a ConstantAggregateZero), but just in case,
2191  // do the more general check with lVectorIs32BitInts().
2192  variableOffset =
2193  new llvm::TruncInst(variableOffset, LLVMTypes::Int32VectorType,
2194  LLVMGetName(variableOffset, "_trunc"),
2195  insertBefore);
2196  else
2197  return false;
2198  }
2199 
2200  if (constOffset->getType() != LLVMTypes::Int32VectorType) {
2201  if (lVectorIs32BitInts(constOffset)) {
2202  // Truncate them so we have a 32-bit vector type for them.
2203  constOffset =
2204  new llvm::TruncInst(constOffset, LLVMTypes::Int32VectorType,
2205  LLVMGetName(constOffset, "_trunc"), insertBefore);
2206  }
2207  else {
2208  // FIXME: otherwise we just assume that all constant offsets
2209  // can actually always fit into 32-bits... (This could be
2210  // wrong, but it should be only in pretty esoteric cases). We
2211  // make this assumption for now since we sometimes generate
2212  // constants that need constant folding before we really have a
2213  // constant vector out of them, and
2214  // llvm::ConstantFoldInstruction() doesn't seem to be doing
2215  // enough for us in some cases if we call it from here.
2216  constOffset =
2217  new llvm::TruncInst(constOffset, LLVMTypes::Int32VectorType,
2218  LLVMGetName(constOffset, "_trunc"), insertBefore);
2219  }
2220  }
2221 
2222  *variableOffsetPtr = variableOffset;
2223  *constOffsetPtr = constOffset;
2224  return true;
2225 }
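// Illustrative sketch (hypothetical value names): with 32-bit addressing
// forced on a 64-bit target, a variable offset of the form
//     %off64 = sext <W x i32> %off32 to <W x i64>
// is replaced by %off32 directly, and a constant offset whose elements all
// fit in 32 bits is narrowed with a TruncInst; the caller can then switch to
// the 32-bit *_base_offsets32_* variant of the pseudo gather/scatter.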
2226 
2227 /** Check to see if the offset value is composed of a string of Adds,
2228  SExts, and Constant Vectors that are 32-bit safe. Recursively
2229  explores the operands of Add instructions (as they might themselves
2230  be adds that eventually terminate in constant vectors or a SExt.)
2231  */
2232 
2233 static bool
2234 lIs32BitSafeHelper(llvm::Value *v) {
2235  // handle Adds, SExts, Constant Vectors
2236  if (llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(v)) {
2237  if (bop->getOpcode() == llvm::Instruction::Add) {
2238  return lIs32BitSafeHelper(bop->getOperand(0))
2239  && lIs32BitSafeHelper(bop->getOperand(1));
2240  }
2241  return false;
2242  }
2243  else if (llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(v)) {
2244  return sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType;
2245  }
2246  else return lVectorIs32BitInts(v);
2247 }
2248 
2249 /** Check to see if the single offset vector can safely be represented with
2250  32-bit values. If so, return true and update the pointed-to
2251  llvm::Value * to be the 32-bit equivalent. */
2252 static bool
2253 lOffsets32BitSafe(llvm::Value **offsetPtr,
2254  llvm::Instruction *insertBefore) {
2255  llvm::Value *offset = *offsetPtr;
2256 
2257  if (offset->getType() == LLVMTypes::Int32VectorType)
2258  return true;
2259 
2260  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(offset);
2261  if (sext != NULL &&
2262  sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType) {
2263  // sext of a 32-bit vector -> the 32-bit vector is good
2264  *offsetPtr = sext->getOperand(0);
2265  return true;
2266  }
2267  else if (lIs32BitSafeHelper(offset)) {
2268  // The only constant vector we should have here is a vector of
2269  // all zeros (i.e. a ConstantAggregateZero), but just in case,
2270  // do the more general check with lVectorIs32BitInts().
2271 
2272  // Alternatively, offset could be a sequence of adds terminating
2273  // in safe constant vectors or a SExt.
2274  *offsetPtr =
2275  new llvm::TruncInst(offset, LLVMTypes::Int32VectorType,
2276  LLVMGetName(offset, "_trunc"),
2277  insertBefore);
2278  return true;
2279  }
2280  else
2281  return false;
2282 }
2283 
2284 
2285 static bool
2286 lGSToGSBaseOffsets(llvm::CallInst *callInst) {
2287  struct GSInfo {
2288  GSInfo(const char *pgFuncName, const char *pgboFuncName,
2289  const char *pgbo32FuncName, bool ig, bool ip)
2290  : isGather(ig), isPrefetch(ip) {
2291  func = m->module->getFunction(pgFuncName);
2292  baseOffsetsFunc = m->module->getFunction(pgboFuncName);
2293  baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
2294  }
2295  llvm::Function *func;
2296  llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
2297  const bool isGather;
2298  const bool isPrefetch;
2299  };
2300 
2301  GSInfo gsFuncs[] = {
2302  GSInfo("__pseudo_gather32_i8",
2303  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
2304  "__pseudo_gather_factored_base_offsets32_i8",
2305  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
2306  "__pseudo_gather_factored_base_offsets32_i8",
2307  true, false),
2308  GSInfo("__pseudo_gather32_i16",
2309  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
2310  "__pseudo_gather_factored_base_offsets32_i16",
2311  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
2312  "__pseudo_gather_factored_base_offsets32_i16",
2313  true, false),
2314  GSInfo("__pseudo_gather32_i32",
2315  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
2316  "__pseudo_gather_factored_base_offsets32_i32",
2317  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
2318  "__pseudo_gather_factored_base_offsets32_i32",
2319  true, false),
2320  GSInfo("__pseudo_gather32_float",
2321  g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
2322  "__pseudo_gather_factored_base_offsets32_float",
2323  g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
2324  "__pseudo_gather_factored_base_offsets32_float",
2325  true, false),
2326  GSInfo("__pseudo_gather32_i64",
2327  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
2328  "__pseudo_gather_factored_base_offsets32_i64",
2329  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
2330  "__pseudo_gather_factored_base_offsets32_i64",
2331  true, false),
2332  GSInfo("__pseudo_gather32_double",
2333  g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
2334  "__pseudo_gather_factored_base_offsets32_double",
2335  g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
2336  "__pseudo_gather_factored_base_offsets32_double",
2337  true, false),
2338 
2339  GSInfo("__pseudo_scatter32_i8",
2340  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
2341  "__pseudo_scatter_factored_base_offsets32_i8",
2342  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
2343  "__pseudo_scatter_factored_base_offsets32_i8",
2344  false, false),
2345  GSInfo("__pseudo_scatter32_i16",
2346  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
2347  "__pseudo_scatter_factored_base_offsets32_i16",
2348  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
2349  "__pseudo_scatter_factored_base_offsets32_i16",
2350  false, false),
2351  GSInfo("__pseudo_scatter32_i32",
2352  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
2353  "__pseudo_scatter_factored_base_offsets32_i32",
2354  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
2355  "__pseudo_scatter_factored_base_offsets32_i32",
2356  false, false),
2357  GSInfo("__pseudo_scatter32_float",
2358  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
2359  "__pseudo_scatter_factored_base_offsets32_float",
2360  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
2361  "__pseudo_scatter_factored_base_offsets32_float",
2362  false, false),
2363  GSInfo("__pseudo_scatter32_i64",
2364  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
2365  "__pseudo_scatter_factored_base_offsets32_i64",
2366  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
2367  "__pseudo_scatter_factored_base_offsets32_i64",
2368  false, false),
2369  GSInfo("__pseudo_scatter32_double",
2370  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
2371  "__pseudo_scatter_factored_base_offsets32_double",
2372  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
2373  "__pseudo_scatter_factored_base_offsets32_double",
2374  false, false),
2375 
2376  GSInfo("__pseudo_gather64_i8",
2377  g->target->hasGather() ? "__pseudo_gather_base_offsets64_i8" :
2378  "__pseudo_gather_factored_base_offsets64_i8",
2379  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
2380  "__pseudo_gather_factored_base_offsets32_i8",
2381  true, false),
2382  GSInfo("__pseudo_gather64_i16",
2383  g->target->hasGather() ? "__pseudo_gather_base_offsets64_i16" :
2384  "__pseudo_gather_factored_base_offsets64_i16",
2385  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
2386  "__pseudo_gather_factored_base_offsets32_i16",
2387  true, false),
2388  GSInfo("__pseudo_gather64_i32",
2389  g->target->hasGather() ? "__pseudo_gather_base_offsets64_i32" :
2390  "__pseudo_gather_factored_base_offsets64_i32",
2391  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
2392  "__pseudo_gather_factored_base_offsets32_i32",
2393  true, false),
2394  GSInfo("__pseudo_gather64_float",
2395  g->target->hasGather() ? "__pseudo_gather_base_offsets64_float" :
2396  "__pseudo_gather_factored_base_offsets64_float",
2397  g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
2398  "__pseudo_gather_factored_base_offsets32_float",
2399  true, false),
2400  GSInfo("__pseudo_gather64_i64",
2401  g->target->hasGather() ? "__pseudo_gather_base_offsets64_i64" :
2402  "__pseudo_gather_factored_base_offsets64_i64",
2403  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
2404  "__pseudo_gather_factored_base_offsets32_i64",
2405  true, false),
2406  GSInfo("__pseudo_gather64_double",
2407  g->target->hasGather() ? "__pseudo_gather_base_offsets64_double" :
2408  "__pseudo_gather_factored_base_offsets64_double",
2409  g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
2410  "__pseudo_gather_factored_base_offsets32_double",
2411  true, false),
2412 
2413  GSInfo("__pseudo_scatter64_i8",
2414  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i8" :
2415  "__pseudo_scatter_factored_base_offsets64_i8",
2416  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
2417  "__pseudo_scatter_factored_base_offsets32_i8",
2418  false, false),
2419  GSInfo("__pseudo_scatter64_i16",
2420  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i16" :
2421  "__pseudo_scatter_factored_base_offsets64_i16",
2422  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
2423  "__pseudo_scatter_factored_base_offsets32_i16",
2424  false, false),
2425  GSInfo("__pseudo_scatter64_i32",
2426  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i32" :
2427  "__pseudo_scatter_factored_base_offsets64_i32",
2428  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
2429  "__pseudo_scatter_factored_base_offsets32_i32",
2430  false, false),
2431  GSInfo("__pseudo_scatter64_float",
2432  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_float" :
2433  "__pseudo_scatter_factored_base_offsets64_float",
2434  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
2435  "__pseudo_scatter_factored_base_offsets32_float",
2436  false, false),
2437  GSInfo("__pseudo_scatter64_i64",
2438  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i64" :
2439  "__pseudo_scatter_factored_base_offsets64_i64",
2440  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
2441  "__pseudo_scatter_factored_base_offsets32_i64",
2442  false, false),
2443  GSInfo("__pseudo_scatter64_double",
2444  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_double" :
2445  "__pseudo_scatter_factored_base_offsets64_double",
2446  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
2447  "__pseudo_scatter_factored_base_offsets32_double",
2448  false, false),
2449  GSInfo("__pseudo_prefetch_read_varying_1",
2450  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
2451  "__prefetch_read_varying_1",
2452  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
2453  "__prefetch_read_varying_1",
2454  false, true),
2455 
2456  GSInfo("__pseudo_prefetch_read_varying_2",
2457  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
2458  "__prefetch_read_varying_2",
2459  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
2460  "__prefetch_read_varying_2",
2461  false, true),
2462 
2463  GSInfo("__pseudo_prefetch_read_varying_3",
2464  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
2465  "__prefetch_read_varying_3",
2466  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
2467  "__prefetch_read_varying_3",
2468  false, true),
2469 
2470  GSInfo("__pseudo_prefetch_read_varying_nt",
2471  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
2472  "__prefetch_read_varying_nt",
2473  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
2474  "__prefetch_read_varying_nt",
2475  false, true),
2476  };
2477 
2478  int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
2479  for (int i = 0; i < numGSFuncs; ++i)
2480  Assert(gsFuncs[i].func != NULL && gsFuncs[i].baseOffsetsFunc != NULL &&
2481  gsFuncs[i].baseOffsets32Func != NULL);
2482 
2483  GSInfo *info = NULL;
2484  for (int i = 0; i < numGSFuncs; ++i)
2485  if (gsFuncs[i].func != NULL &&
2486  callInst->getCalledFunction() == gsFuncs[i].func) {
2487  info = &gsFuncs[i];
2488  break;
2489  }
2490  if (info == NULL)
2491  return false;
2492 
2493  // Try to transform the array of pointers to a single base pointer
2494  // and an array of int32 offsets. (All the hard work is done by
2495  // lGetBasePtrAndOffsets).
2496  llvm::Value *ptrs = callInst->getArgOperand(0);
2497  llvm::Value *offsetVector = NULL;
2498  llvm::Value *basePtr = lGetBasePtrAndOffsets(ptrs, &offsetVector,
2499  callInst);
2500 
2501  if (basePtr == NULL || offsetVector == NULL ||
2502  (info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch() == false))
2503  // It's actually a fully general gather/scatter with a varying
2504  // set of base pointers, so leave it as is and continue onward
2505  // to the next instruction...
2506  return false;
2507 
2508  // Cast the base pointer to a void *, since that's what the
2509  // __pseudo_*_base_offsets_* functions want.
2510  basePtr = new llvm::IntToPtrInst(basePtr, LLVMTypes::VoidPointerType,
2511  LLVMGetName(basePtr, "_2void"), callInst);
2512  lCopyMetadata(basePtr, callInst);
2513 
2514  llvm::Function *gatherScatterFunc = info->baseOffsetsFunc;
2515 
2516  if ((info->isGather == true && g->target->hasGather()) ||
2517  (info->isGather == false && info->isPrefetch == false && g->target->hasScatter()) ||
2518  (info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch())) {
2519 
2520  // See if the offsets are scaled by 2, 4, or 8. If so,
2521  // extract that scale factor and rewrite the offsets to remove
2522  // it.
2523  llvm::Value *offsetScale = lExtractOffsetVector248Scale(&offsetVector);
2524 
2525  // If we're doing 32-bit addressing on a 64-bit target, here we
2526  // will see if we can call one of the 32-bit variants of the pseudo
2527  // gather/scatter functions.
2528  if (g->opt.force32BitAddressing &&
2529  lOffsets32BitSafe(&offsetVector, callInst)) {
2530  gatherScatterFunc = info->baseOffsets32Func;
2531  }
2532 
2533  if (info->isGather || info->isPrefetch) {
2534  llvm::Value *mask = callInst->getArgOperand(1);
2535 
2536  // Generate a new function call to the next pseudo gather
2537  // base+offsets instruction. Note that we're passing a NULL
2538  // llvm::Instruction to llvm::CallInst::Create; this means that
2539  // the instruction isn't inserted into a basic block and that
2540  // way we can then call ReplaceInstWithInst().
2541  llvm::Instruction *newCall =
2542  lCallInst(gatherScatterFunc, basePtr, offsetScale, offsetVector,
2543  mask, callInst->getName().str().c_str(),
2544  NULL);
2545  lCopyMetadata(newCall, callInst);
2546  llvm::ReplaceInstWithInst(callInst, newCall);
2547  }
2548  else {
2549  llvm::Value *storeValue = callInst->getArgOperand(1);
2550  llvm::Value *mask = callInst->getArgOperand(2);
2551 
2552  // Generate a new function call to the next pseudo scatter
2553  // base+offsets instruction. See above for why passing NULL
2554  // for the Instruction * is intended.
2555  llvm::Instruction *newCall =
2556  lCallInst(gatherScatterFunc, basePtr, offsetScale,
2557  offsetVector, storeValue, mask, "", NULL);
2558  lCopyMetadata(newCall, callInst);
2559  llvm::ReplaceInstWithInst(callInst, newCall);
2560  }
2561  }
2562  else {
2563  // Try to decompose the offset vector into a compile time constant
2564  // component and a varying component. The constant component is
2565  // passed as a separate parameter to the gather/scatter functions,
2566  // which in turn allows their implementations to end up emitting
2567  // x86 instructions with constant offsets encoded in them.
2568  llvm::Value *constOffset, *variableOffset;
2569  lExtractConstantOffset(offsetVector, &constOffset, &variableOffset,
2570  callInst);
2571  if (constOffset == NULL)
2572  constOffset = LLVMIntAsType(0, offsetVector->getType());
2573  if (variableOffset == NULL)
2574  variableOffset = LLVMIntAsType(0, offsetVector->getType());
2575 
2576  // See if the varying component is scaled by 2, 4, or 8. If so,
2577  // extract that scale factor and rewrite variableOffset to remove
2578  // it. (This also is pulled out so that we can match the scales by
2579  // 2/4/8 offered by x86 addressing operators.)
2580  llvm::Value *offsetScale = lExtractOffsetVector248Scale(&variableOffset);
2581 
2582  // If we're doing 32-bit addressing on a 64-bit target, here we
2583  // will see if we can call one of the 32-bit variants of the pseudo
2584  // gather/scatter functions.
2585  if (g->opt.force32BitAddressing &&
2586  lOffsets32BitSafe(&variableOffset, &constOffset, callInst)) {
2587  gatherScatterFunc = info->baseOffsets32Func;
2588  }
2589 
2590  if (info->isGather || info->isPrefetch) {
2591  llvm::Value *mask = callInst->getArgOperand(1);
2592 
2593  // Generate a new function call to the next pseudo gather
2594  // base+offsets instruction. Note that we're passing a NULL
2595  // llvm::Instruction to llvm::CallInst::Create; this means that
2596  // the instruction isn't inserted into a basic block and that
2597  // way we can then call ReplaceInstWithInst().
2598  llvm::Instruction *newCall =
2599  lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale,
2600  constOffset, mask, callInst->getName().str().c_str(),
2601  NULL);
2602  lCopyMetadata(newCall, callInst);
2603  llvm::ReplaceInstWithInst(callInst, newCall);
2604  }
2605  else {
2606  llvm::Value *storeValue = callInst->getArgOperand(1);
2607  llvm::Value *mask = callInst->getArgOperand(2);
2608 
2609  // Generate a new function call to the next pseudo scatter
2610  // base+offsets instruction. See above for why passing NULL
2611  // for the Instruction * is intended.
2612  llvm::Instruction *newCall =
2613  lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale,
2614  constOffset, storeValue, mask, "", NULL);
2615  lCopyMetadata(newCall, callInst);
2616  llvm::ReplaceInstWithInst(callInst, newCall);
2617  }
2618  }
2619  return true;
2620 }
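// Illustrative sketch of the rewrite performed above (hypothetical names;
// exact signatures depend on the target's gather/scatter support and mask type):
//   before:  %v = call <W x float> @__pseudo_gather64_float(<W x i64> %ptrs,
//                                                           <W x MASK> %mask)
//   after:   %b = inttoptr i64 %base to i8*
//            %v = call <W x float> @__pseudo_gather_base_offsets64_float(
//                     i8* %b, i32 %scale, <W x i64> %offsets, <W x MASK> %mask)
// On targets without a native gather, the factored variant is used instead,
// which carries the split variable and constant offsets as separate operands.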
2621 
2622 
2623 /** Try to improve the decomposition between compile-time constant and
2624  compile-time unknown offsets in calls to the __pseudo_*_base_offsets*
2625  functions. After other optimizations have run, we will sometimes be
2626  able to pull more terms out of the unknown part and add them into the
2627  compile-time-known part.
2628  */
2629 static bool
2630 lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) {
2631  struct GSBOInfo {
2632  GSBOInfo(const char *pgboFuncName, const char *pgbo32FuncName, bool ig, bool ip)
2633  : isGather(ig), isPrefetch(ip) {
2634  baseOffsetsFunc = m->module->getFunction(pgboFuncName);
2635  baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
2636  }
2637  llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
2638  const bool isGather;
2639  const bool isPrefetch;
2640  };
2641 
2642  GSBOInfo gsFuncs[] = {
2643  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
2644  "__pseudo_gather_factored_base_offsets32_i8",
2645  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
2646  "__pseudo_gather_factored_base_offsets32_i8",
2647  true, false),
2648  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
2649  "__pseudo_gather_factored_base_offsets32_i16",
2650  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
2651  "__pseudo_gather_factored_base_offsets32_i16",
2652  true, false),
2653  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
2654  "__pseudo_gather_factored_base_offsets32_i32",
2655  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
2656  "__pseudo_gather_factored_base_offsets32_i32",
2657  true, false),
2658  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
2659  "__pseudo_gather_factored_base_offsets32_float",
2660  g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
2661  "__pseudo_gather_factored_base_offsets32_float",
2662  true, false),
2663  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
2664  "__pseudo_gather_factored_base_offsets32_i64",
2665  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
2666  "__pseudo_gather_factored_base_offsets32_i64",
2667  true, false),
2668  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
2669  "__pseudo_gather_factored_base_offsets32_double",
2670  g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
2671  "__pseudo_gather_factored_base_offsets32_double",
2672  true, false),
2673 
2674  GSBOInfo( g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
2675  "__pseudo_scatter_factored_base_offsets32_i8",
2676  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
2677  "__pseudo_scatter_factored_base_offsets32_i8",
2678  false, false),
2679  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
2680  "__pseudo_scatter_factored_base_offsets32_i16",
2681  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
2682  "__pseudo_scatter_factored_base_offsets32_i16",
2683  false, false),
2684  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
2685  "__pseudo_scatter_factored_base_offsets32_i32",
2686  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
2687  "__pseudo_scatter_factored_base_offsets32_i32",
2688  false, false),
2689  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
2690  "__pseudo_scatter_factored_base_offsets32_float",
2691  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
2692  "__pseudo_scatter_factored_base_offsets32_float",
2693  false, false),
2694  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
2695  "__pseudo_scatter_factored_base_offsets32_i64",
2696  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
2697  "__pseudo_scatter_factored_base_offsets32_i64",
2698  false, false),
2699  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
2700  "__pseudo_scatter_factored_base_offsets32_double",
2701  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
2702  "__pseudo_scatter_factored_base_offsets32_double",
2703  false, false),
2704 
2705  GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
2706  "__prefetch_read_varying_1",
2707  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
2708  "__prefetch_read_varying_1",
2709  false, true),
2710 
2711  GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
2712  "__prefetch_read_varying_2",
2713  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
2714  "__prefetch_read_varying_2",
2715  false, true),
2716 
2717  GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
2718  "__prefetch_read_varying_3",
2719  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
2720  "__prefetch_read_varying_3",
2721  false, true),
2722 
2723  GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
2724  "__prefetch_read_varying_nt",
2725  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
2726  "__prefetch_read_varying_nt",
2727  false, true),
2728  };
2729 
2730  int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
2731  for (int i = 0; i < numGSFuncs; ++i)
2732  Assert(gsFuncs[i].baseOffsetsFunc != NULL &&
2733  gsFuncs[i].baseOffsets32Func != NULL);
2734 
2735  llvm::Function *calledFunc = callInst->getCalledFunction();
2736  Assert(calledFunc != NULL);
2737 
2738  // Is one of the gather/scatter functions that decompose into
2739  // base+offsets being called?
2740  GSBOInfo *info = NULL;
2741  for (int i = 0; i < numGSFuncs; ++i)
2742  if (calledFunc == gsFuncs[i].baseOffsetsFunc ||
2743  calledFunc == gsFuncs[i].baseOffsets32Func) {
2744  info = &gsFuncs[i];
2745  break;
2746  }
2747  if (info == NULL)
2748  return false;
2749 
2750  // Grab the old variable offset
2751  llvm::Value *origVariableOffset = callInst->getArgOperand(1);
2752 
2753  // If it's zero, we're done. Don't go and think that we're clever by
2754  // adding these zeros to the constant offsets.
2755  if (llvm::isa<llvm::ConstantAggregateZero>(origVariableOffset))
2756  return false;
2757 
2758  // Try to decompose the old variable offset
2759  llvm::Value *constOffset, *variableOffset;
2760  lExtractConstantOffset(origVariableOffset, &constOffset, &variableOffset,
2761  callInst);
2762 
2763  // No luck
2764  if (constOffset == NULL)
2765  return false;
2766 
2767  // Total luck: everything could be moved to the constant offset
2768  if (variableOffset == NULL)
2769  variableOffset = LLVMIntAsType(0, origVariableOffset->getType());
2770 
2771  // We need to scale the value we add to the constant offset by the
2772  // 2/4/8 scale for the variable offset, if present.
2773  llvm::ConstantInt *varScale =
2774  llvm::dyn_cast<llvm::ConstantInt>(callInst->getArgOperand(2));
2775  Assert(varScale != NULL);
2776 
2777  llvm::Value *scaleSmear;
2778  if (origVariableOffset->getType() == LLVMTypes::Int64VectorType)
2779  scaleSmear = LLVMInt64Vector((int64_t)varScale->getZExtValue());
2780  else
2781  scaleSmear = LLVMInt32Vector((int32_t)varScale->getZExtValue());
2782 
2783  constOffset = llvm::BinaryOperator::Create(llvm::Instruction::Mul, constOffset,
2784  scaleSmear, constOffset->getName(),
2785  callInst);
2786 
2787  // And add the additional offset to the original constant offset
2788  constOffset = llvm::BinaryOperator::Create(llvm::Instruction::Add, constOffset,
2789  callInst->getArgOperand(3),
2790  callInst->getArgOperand(3)->getName(),
2791  callInst);
2792 
2793  // Finally, update the values of the operands to the gather/scatter
2794  // function.
2795  callInst->setArgOperand(1, variableOffset);
2796  callInst->setArgOperand(3, constOffset);
2797 
2798  return true;
2799 }
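// Illustrative sketch (values for illustration only): if a call's variable
// offset operand is currently
//     varying = idx + <8, 8, ...>     with scale = 4 and constant offset C
// the code above rewrites the operands to
//     varying = idx                   and constant offset = C + <32, 32, ...>
// i.e. the extracted constant part is pre-multiplied by the 2/4/8 scale before
// being added into the existing constant offset operand.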
2800 
2801 
2802 static llvm::Value *
2803 lComputeCommonPointer(llvm::Value *base, llvm::Value *offsets,
2804  llvm::Instruction *insertBefore) {
2805  llvm::Value *firstOffset = LLVMExtractFirstVectorElement(offsets);
2806  return lGEPInst(base, firstOffset, "ptr", insertBefore);
2807 }
2808 
2809 
2810 static llvm::Constant *
2811 lGetOffsetScaleVec(llvm::Value *offsetScale, llvm::Type *vecType) {
2812  llvm::ConstantInt *offsetScaleInt =
2813  llvm::dyn_cast<llvm::ConstantInt>(offsetScale);
2814  Assert(offsetScaleInt != NULL);
2815  uint64_t scaleValue = offsetScaleInt->getZExtValue();
2816 
2817  std::vector<llvm::Constant *> scales;
2818  for (int i = 0; i < g->target->getVectorWidth(); ++i) {
2819  if (vecType == LLVMTypes::Int64VectorType)
2820  scales.push_back(LLVMInt64(scaleValue));
2821  else {
2822  Assert(vecType == LLVMTypes::Int32VectorType);
2823  scales.push_back(LLVMInt32((int32_t)scaleValue));
2824  }
2825  }
2826  return llvm::ConstantVector::get(scales);
2827 }
2828 
2829 
2830 /** After earlier optimization passes have run, we are sometimes able to
2831  determine that gathers/scatters are actually accessing memory in a more
2832  regular fashion and then change the operation to something simpler and
2833  more efficient. For example, if all of the lanes in a gather are
2834  reading from the same location, we can instead do a scalar load and
2835  broadcast. This pass examines gathers and scatters and tries to
2836  simplify them if at all possible.
2837 
2838  @todo Currently, this only looks for all program instances going to the
2839  same location and all going to a linear sequence of locations in
2840  memory. There are a number of other cases that might make sense to
2841  look for, including things that could be handled with a vector load +
2842  shuffle or things that could be handled with hybrids of e.g. 2 4-wide
2843  vector loads with AVX, etc.
2844 */
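// Illustrative sketch (hypothetical names): if all lanes end up with the same
// offset, a gather such as
//     call <W x float> @__pseudo_gather_base_offsets32_float(i8* %b, i32 4,
//              <W x i32> %same_offsets, <W x MASK> %mask)
// can be replaced by a scalar load of the single common address followed by a
// broadcast; if the offsets form the linear sequence 0, 1, 2, ..., it can be
// turned into an (optionally masked) vector load instead.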
2845 static bool
2846 lGSToLoadStore(llvm::CallInst *callInst) {
2847  struct GatherImpInfo {
2848  GatherImpInfo(const char *pName, const char *lmName, llvm::Type *st,
2849  int a)
2850  : align(a), isFactored(!g->target->hasGather()) {
2851  pseudoFunc = m->module->getFunction(pName);
2852  loadMaskedFunc = m->module->getFunction(lmName);
2853  Assert(pseudoFunc != NULL && loadMaskedFunc != NULL);
2854  scalarType = st;
2855  }
2856 
2857  llvm::Function *pseudoFunc;
2858  llvm::Function *loadMaskedFunc;
2859  llvm::Type *scalarType;
2860  const int align;
2861  const bool isFactored;
2862  };
2863 
2864  GatherImpInfo gInfo[] = {
2865  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
2866  "__pseudo_gather_factored_base_offsets32_i8",
2867  "__masked_load_i8", LLVMTypes::Int8Type, 1),
2868  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
2869  "__pseudo_gather_factored_base_offsets32_i16",
2870  "__masked_load_i16", LLVMTypes::Int16Type, 2),
2871  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
2872  "__pseudo_gather_factored_base_offsets32_i32",
2873  "__masked_load_i32", LLVMTypes::Int32Type, 4),
2874  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
2875  "__pseudo_gather_factored_base_offsets32_float",
2876  "__masked_load_float", LLVMTypes::FloatType, 4),
2877  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
2878  "__pseudo_gather_factored_base_offsets32_i64",
2879  "__masked_load_i64", LLVMTypes::Int64Type, 8),
2880  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
2881  "__pseudo_gather_factored_base_offsets32_double",
2882  "__masked_load_double", LLVMTypes::DoubleType, 8),
2883  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i8" :
2884  "__pseudo_gather_factored_base_offsets64_i8",
2885  "__masked_load_i8", LLVMTypes::Int8Type, 1),
2886  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i16" :
2887  "__pseudo_gather_factored_base_offsets64_i16",
2888  "__masked_load_i16", LLVMTypes::Int16Type, 2),
2889  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i32" :
2890  "__pseudo_gather_factored_base_offsets64_i32",
2891  "__masked_load_i32", LLVMTypes::Int32Type, 4),
2892  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_float" :
2893  "__pseudo_gather_factored_base_offsets64_float",
2894  "__masked_load_float", LLVMTypes::FloatType, 4),
2895  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i64" :
2896  "__pseudo_gather_factored_base_offsets64_i64",
2897  "__masked_load_i64", LLVMTypes::Int64Type, 8),
2898  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_double" :
2899  "__pseudo_gather_factored_base_offsets64_double",
2900  "__masked_load_double", LLVMTypes::DoubleType, 8),
2901  };
2902 
2903  struct ScatterImpInfo {
2904  ScatterImpInfo(const char *pName, const char *msName,
2905  llvm::Type *vpt, int a)
2906  : align(a), isFactored(!g->target->hasScatter()) {
2907  pseudoFunc = m->module->getFunction(pName);
2908  maskedStoreFunc = m->module->getFunction(msName);
2909  vecPtrType = vpt;
2910  Assert(pseudoFunc != NULL && maskedStoreFunc != NULL);
2911  }
2912  llvm::Function *pseudoFunc;
2913  llvm::Function *maskedStoreFunc;
2914  llvm::Type *vecPtrType;
2915  const int align;
2916  const bool isFactored;
2917  };
2918 
2919  ScatterImpInfo sInfo[] = {
2920  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
2921  "__pseudo_scatter_factored_base_offsets32_i8",
2922  "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1),
2923  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
2924  "__pseudo_scatter_factored_base_offsets32_i16",
2925  "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2),
2926  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
2927  "__pseudo_scatter_factored_base_offsets32_i32",
2928  "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4),
2929  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
2930  "__pseudo_scatter_factored_base_offsets32_float",
2931  "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4),
2932  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
2933  "__pseudo_scatter_factored_base_offsets32_i64",
2934  "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8),
2935  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
2936  "__pseudo_scatter_factored_base_offsets32_double",
2937  "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8),
2938  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i8" :
2939  "__pseudo_scatter_factored_base_offsets64_i8",
2940  "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1),
2941  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i16" :
2942  "__pseudo_scatter_factored_base_offsets64_i16",
2943  "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2),
2944  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i32" :
2945  "__pseudo_scatter_factored_base_offsets64_i32",
2946  "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4),
2947  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_float" :
2948  "__pseudo_scatter_factored_base_offsets64_float",
2949  "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4),
2950  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i64" :
2951  "__pseudo_scatter_factored_base_offsets64_i64",
2952  "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8),
2953  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_double" :
2954  "__pseudo_scatter_factored_base_offsets64_double",
2955  "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8),
2956  };
2957 
2958  llvm::Function *calledFunc = callInst->getCalledFunction();
2959 
2960  GatherImpInfo *gatherInfo = NULL;
2961  ScatterImpInfo *scatterInfo = NULL;
2962  for (unsigned int i = 0; i < sizeof(gInfo) / sizeof(gInfo[0]); ++i) {
2963  if (gInfo[i].pseudoFunc != NULL &&
2964  calledFunc == gInfo[i].pseudoFunc) {
2965  gatherInfo = &gInfo[i];
2966  break;
2967  }
2968  }
2969  for (unsigned int i = 0; i < sizeof(sInfo) / sizeof(sInfo[0]); ++i) {
2970  if (sInfo[i].pseudoFunc != NULL &&
2971  calledFunc == sInfo[i].pseudoFunc) {
2972  scatterInfo = &sInfo[i];
2973  break;
2974  }
2975  }
2976  if (gatherInfo == NULL && scatterInfo == NULL)
2977  return false;
2978 
2979  SourcePos pos;
2980  lGetSourcePosFromMetadata(callInst, &pos);
2981 
2982  llvm::Value *base = callInst->getArgOperand(0);
2983  llvm::Value *fullOffsets = NULL;
2984  llvm::Value *storeValue = NULL;
2985  llvm::Value *mask = NULL;
2986 
2987  if ((gatherInfo != NULL && gatherInfo->isFactored) ||
2988  (scatterInfo != NULL && scatterInfo->isFactored)) {
2989  llvm::Value *varyingOffsets = callInst->getArgOperand(1);
2990  llvm::Value *offsetScale = callInst->getArgOperand(2);
2991  llvm::Value *constOffsets = callInst->getArgOperand(3);
2992  if (scatterInfo)
2993  storeValue = callInst->getArgOperand(4);
2994  mask = callInst->getArgOperand((gatherInfo != NULL) ? 4 : 5);
2995 
2996  // Compute the full offset vector: offsetScale * varyingOffsets + constOffsets
2997  llvm::Constant *offsetScaleVec =
2998  lGetOffsetScaleVec(offsetScale, varyingOffsets->getType());
2999 
3000  llvm::Value *scaledVarying =
3001  llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec,
3002  varyingOffsets, "scaled_varying", callInst);
3003  fullOffsets =
3004  llvm::BinaryOperator::Create(llvm::Instruction::Add, scaledVarying,
3005  constOffsets, "varying+const_offsets",
3006  callInst);
3007  }
3008  else {
3009  if (scatterInfo)
3010  storeValue = callInst->getArgOperand(3);
3011  mask = callInst->getArgOperand((gatherInfo != NULL) ? 3 : 4);
3012 
3013  llvm::Value *offsetScale = callInst->getArgOperand(1);
3014  llvm::Value *offsets = callInst->getArgOperand(2);
3015  llvm::Value *offsetScaleVec =
3016  lGetOffsetScaleVec(offsetScale, offsets->getType());
3017 
3018  fullOffsets =
3019  llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec,
3020  offsets, "scaled_offsets", callInst);
3021  }
3022 
3023  Debug(SourcePos(), "GSToLoadStore: %s.",
3024  fullOffsets->getName().str().c_str());
3025 
3026  if (LLVMVectorValuesAllEqual(fullOffsets)) {
3027  // If all the offsets are equal, then compute the single
3028  // pointer they all represent based on the first one of them
3029  // (arbitrarily).
3030  llvm::Value *ptr = lComputeCommonPointer(base, fullOffsets, callInst);
3031  lCopyMetadata(ptr, callInst);
3032 
3033  if (gatherInfo != NULL) {
3034  // A gather with everyone going to the same location is
3035  // handled as a scalar load and broadcast across the lanes.
3036  Debug(pos, "Transformed gather to scalar load and broadcast!");
3037 
3038  ptr = new llvm::BitCastInst(ptr, llvm::PointerType::get(gatherInfo->scalarType, 0),
3039  ptr->getName(), callInst);
3040  llvm::Value *scalarValue = new llvm::LoadInst(ptr, callInst->getName(), callInst);
3041 
3042  // Generate the following sequence:
3043  // %name123 = insertelement <4 x i32> undef, i32 %val, i32 0
3044  // %name124 = shufflevector <4 x i32> %name123, <4 x i32> undef,
3045  // <4 x i32> zeroinitializer
3046  llvm::Value *undef1Value = llvm::UndefValue::get(callInst->getType());
3047  llvm::Value *undef2Value = llvm::UndefValue::get(callInst->getType());
3048  llvm::Value *insertVec = llvm::InsertElementInst::Create(
3049  undef1Value, scalarValue, LLVMInt32(0), callInst->getName(), callInst);
3050  llvm::Value *zeroMask = llvm::ConstantVector::getSplat(
3051  callInst->getType()->getVectorNumElements(),
3052  llvm::Constant::getNullValue(llvm::Type::getInt32Ty(*g->ctx)));
3053  llvm::Value *shufValue = new llvm::ShuffleVectorInst(
3054  insertVec, undef2Value, zeroMask, callInst->getName());
3055 
3056  lCopyMetadata(shufValue, callInst);
3057  llvm::ReplaceInstWithInst(callInst,
3058  llvm::dyn_cast<llvm::Instruction>(shufValue));
3059  return true;
3060  }
3061  else {
3062  // A scatter with everyone going to the same location is
3063  // undefined (if there's more than one program instance in
3064  // the gang). Issue a warning.
3065  if (g->target->getVectorWidth() > 1)
3066  Warning(pos, "Undefined behavior: all program instances are "
3067  "writing to the same location!");
3068 
3069  // We could do something similar to the gather case, where
3070  // we arbitrarily write one of the values, but we need to
3071  // a) check to be sure the mask isn't all off and b) pick
3072  // the value from an executing program instance in that
3073  // case. We'll just let a bunch of the program instances
3074  // do redundant writes, since this isn't important to make
3075  // fast anyway...
3076  return false;
3077  }
3078  }
3079  else {
3080  int step = gatherInfo ? gatherInfo->align : scatterInfo->align;
3081 
3082  if (step > 0 && LLVMVectorIsLinear(fullOffsets, step)) {
3083  // We have a linear sequence of memory locations being accessed
3084  // starting with the location given by the offset from
3085  // offsetElements[0], with a stride of 4 or 8 bytes (for 32-bit
3086  // and 64-bit gathers/scatters, respectively).
3087  llvm::Value *ptr = lComputeCommonPointer(base, fullOffsets, callInst);
3088  lCopyMetadata(ptr, callInst);
3089 
3090  if (gatherInfo != NULL) {
3091  Debug(pos, "Transformed gather to unaligned vector load!");
3092  llvm::Instruction *newCall =
3093  lCallInst(gatherInfo->loadMaskedFunc, ptr, mask,
3094  LLVMGetName(ptr, "_masked_load"));
3095  lCopyMetadata(newCall, callInst);
3096  llvm::ReplaceInstWithInst(callInst, newCall);
3097  return true;
3098  }
3099  else {
3100  Debug(pos, "Transformed scatter to unaligned vector store!");
3101  ptr = new llvm::BitCastInst(ptr, scatterInfo->vecPtrType, "ptrcast",
3102  callInst);
3103  llvm::Instruction *newCall =
3104  lCallInst(scatterInfo->maskedStoreFunc, ptr, storeValue,
3105  mask, "");
3106  lCopyMetadata(newCall, callInst);
3107  llvm::ReplaceInstWithInst(callInst, newCall);
3108  return true;
3109  }
3110  }
3111  return false;
3112  }
3113 }
3114 
3115 
3116 ///////////////////////////////////////////////////////////////////////////
3117 // MaskedStoreOptPass
3118 
3119 /** Masked stores are generally more complex than regular stores; for
3120  example, they require multiple instructions to simulate under SSE.
3121  This optimization detects cases where masked stores can be replaced
3122  with regular stores or removed entirely, for the cases of an 'all on'
3123  mask and an 'all off' mask, respectively.
3124 */
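/* An illustrative sketch of the all-on rewrite (hypothetical IR; the exact
   pointer and mask types depend on the target): a call such as
       call void @__pseudo_masked_store_i32(%ptr, <4 x i32> %val, %mask)
   whose mask is statically known to be all on becomes a regular store
   through the pointer,
       %lvalue_to_ptr_type = bitcast %ptr to <4 x i32>*
       store <4 x i32> %val, <4 x i32>* %lvalue_to_ptr_type
   while the same call with an all-off mask is simply erased. */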
3125 static bool
3126 lImproveMaskedStore(llvm::CallInst *callInst) {
3127  struct MSInfo {
3128  MSInfo(const char *name, const int a)
3129  : align(a) {
3130  func = m->module->getFunction(name);
3131  Assert(func != NULL);
3132  }
3133  llvm::Function *func;
3134  const int align;
3135  };
3136 
3137  MSInfo msInfo[] = {
3138  MSInfo("__pseudo_masked_store_i8", 1),
3139  MSInfo("__pseudo_masked_store_i16", 2),
3140  MSInfo("__pseudo_masked_store_i32", 4),
3141  MSInfo("__pseudo_masked_store_float", 4),
3142  MSInfo("__pseudo_masked_store_i64", 8),
3143  MSInfo("__pseudo_masked_store_double", 8),
3144  MSInfo("__masked_store_blend_i8", 1),
3145  MSInfo("__masked_store_blend_i16", 2),
3146  MSInfo("__masked_store_blend_i32", 4),
3147  MSInfo("__masked_store_blend_float", 4),
3148  MSInfo("__masked_store_blend_i64", 8),
3149  MSInfo("__masked_store_blend_double", 8),
3150  MSInfo("__masked_store_i8", 1),
3151  MSInfo("__masked_store_i16", 2),
3152  MSInfo("__masked_store_i32", 4),
3153  MSInfo("__masked_store_float", 4),
3154  MSInfo("__masked_store_i64", 8),
3155  MSInfo("__masked_store_double", 8)
3156  };
3157 
3158  llvm::Function *called = callInst->getCalledFunction();
3159 
3160  int nMSFuncs = sizeof(msInfo) / sizeof(msInfo[0]);
3161  MSInfo *info = NULL;
3162  for (int i = 0; i < nMSFuncs; ++i) {
3163  if (msInfo[i].func != NULL && called == msInfo[i].func) {
3164  info = &msInfo[i];
3165  break;
3166  }
3167  }
3168  if (info == NULL)
3169  return false;
3170 
3171  // Got one; grab the operands
3172  llvm::Value *lvalue = callInst->getArgOperand(0);
3173  llvm::Value *rvalue = callInst->getArgOperand(1);
3174  llvm::Value *mask = callInst->getArgOperand(2);
3175 
3176  MaskStatus maskStatus = lGetMaskStatus(mask);
3177  if (maskStatus == ALL_OFF) {
3178  // Zero mask - no-op, so remove the store completely. (This
3179  // may in turn lead to being able to optimize out instructions
3180  // that compute the rvalue...)
3181  callInst->eraseFromParent();
3182  return true;
3183  }
3184  else if (maskStatus == ALL_ON) {
3185  // The mask is all on, so turn this into a regular store
3186  llvm::Type *rvalueType = rvalue->getType();
3187  llvm::Type *ptrType = llvm::PointerType::get(rvalueType, 0);
3188 
3189  lvalue = new llvm::BitCastInst(lvalue, ptrType, "lvalue_to_ptr_type", callInst);
3190  lCopyMetadata(lvalue, callInst);
3191  llvm::Instruction *store =
3192  new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
3193  g->opt.forceAlignedMemory ?
3194  g->target->getNativeVectorAlignment() : info->align);
3195  lCopyMetadata(store, callInst);
3196  llvm::ReplaceInstWithInst(callInst, store);
3197  return true;
3198  }
3199 
3200  return false;
3201 }
3202 
3203 
3204 static bool
3205 lImproveMaskedLoad(llvm::CallInst *callInst,
3206  llvm::BasicBlock::iterator iter) {
3207  struct MLInfo {
3208  MLInfo(const char *name, const int a)
3209  : align(a) {
3210  func = m->module->getFunction(name);
3211  Assert(func != NULL);
3212  }
3213  llvm::Function *func;
3214  const int align;
3215  };
3216 
3217  MLInfo mlInfo[] = {
3218  MLInfo("__masked_load_i8", 1),
3219  MLInfo("__masked_load_i16", 2),
3220  MLInfo("__masked_load_i32", 4),
3221  MLInfo("__masked_load_float", 4),
3222  MLInfo("__masked_load_i64", 8),
3223  MLInfo("__masked_load_double", 8)
3224  };
3225 
3226  llvm::Function *called = callInst->getCalledFunction();
3227 
3228  int nFuncs = sizeof(mlInfo) / sizeof(mlInfo[0]);
3229  MLInfo *info = NULL;
3230  for (int i = 0; i < nFuncs; ++i) {
3231  if (mlInfo[i].func != NULL && called == mlInfo[i].func) {
3232  info = &mlInfo[i];
3233  break;
3234  }
3235  }
3236  if (info == NULL)
3237  return false;
3238 
3239  // Got one; grab the operands
3240  llvm::Value *ptr = callInst->getArgOperand(0);
3241  llvm::Value *mask = callInst->getArgOperand(1);
3242 
3243  MaskStatus maskStatus = lGetMaskStatus(mask);
3244  if (maskStatus == ALL_OFF) {
3245  // Zero mask - no-op, so replace the load with an undef value
3246  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
3247  iter, llvm::UndefValue::get(callInst->getType()));
3248  return true;
3249  }
3250  else if (maskStatus == ALL_ON) {
3251  // The mask is all on, so turn this into a regular load
3252  llvm::Type *ptrType = llvm::PointerType::get(callInst->getType(), 0);
3253  ptr = new llvm::BitCastInst(ptr, ptrType, "ptr_cast_for_load",
3254  callInst);
3255  llvm::Instruction *load =
3256  new llvm::LoadInst(ptr, callInst->getName(), false /* not volatile */,
3257  g->opt.forceAlignedMemory ?
3258  g->target->getNativeVectorAlignment() : info->align,
3259  (llvm::Instruction *)NULL);
3260  lCopyMetadata(load, callInst);
3261  llvm::ReplaceInstWithInst(callInst, load);
3262  return true;
3263  }
3264  else
3265  return false;
3266 }
3267 
3268 
3269 bool
3270 ImproveMemoryOpsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
3271  DEBUG_START_PASS("ImproveMemoryOps");
3272 
3273  bool modifiedAny = false;
3274  restart:
3275  // Iterate through all of the instructions in the basic block.
3276  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
3277  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
3278  // If we don't have a call to one of the
3279  // __pseudo_{gather,scatter}_* functions, then just go on to the
3280  // next instruction.
3281  if (callInst == NULL ||
3282  callInst->getCalledFunction() == NULL)
3283  continue;
3284 
3285  if (lGSToGSBaseOffsets(callInst)) {
3286  modifiedAny = true;
3287  goto restart;
3288  }
3289  if (lGSBaseOffsetsGetMoreConst(callInst)) {
3290  modifiedAny = true;
3291  goto restart;
3292  }
3293  if (lGSToLoadStore(callInst)) {
3294  modifiedAny = true;
3295  goto restart;
3296  }
3297  if (lImproveMaskedStore(callInst)) {
3298  modifiedAny = true;
3299  goto restart;
3300  }
3301  if (lImproveMaskedLoad(callInst, iter)) {
3302  modifiedAny = true;
3303  goto restart;
3304  }
3305  }
3306 
3307  DEBUG_END_PASS("ImproveMemoryOps");
3308 
3309  return modifiedAny;
3310 }
3311 
3312 
3313 static llvm::Pass *
3314 CreateImproveMemoryOpsPass() {
3315  return new ImproveMemoryOpsPass;
3316 }
3317 
3318 
3319 ///////////////////////////////////////////////////////////////////////////
3320 // GatherCoalescePass
3321 
3322 // This pass implements two optimizations to improve the performance of
3323 // gathers; currently only gathers of 32-bit values where it can be
3324 // determined at compile time that the mask is all on are supported, though
3325 // both of those limitations may be generalized in the future.
3326 //
3327 // First, for any single gather, see if it's worthwhile to break it into
3328 // any of scalar, 2-wide (i.e. 64-bit), 4-wide, or 8-wide loads, and then
3329 // generate code that shuffles the loaded values into place. Doing fewer,
3330 // larger loads in this manner, when possible, can be more efficient.
3331 //
3332 // Second, this pass can coalesce memory accesses across multiple
3333 // gathers. If we have a series of gathers without any memory writes in
3334 // the middle, then we try to analyze their reads collectively and choose
3335 // an efficient set of loads for them. Not only does this help if
3336 // different gathers reuse values from the same location in memory, but
3337 // it's specifically helpful when data with AOS layout is being accessed;
3338 // in this case, we're often able to generate wide vector loads and
3339 // appropriate shuffles automatically.
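// As a hypothetical example of the second case, consider ispc code along
// these lines, reading from an AOS array of structs:
//
//     struct Point { float x, y, z; };
//     uniform Point pts[128];
//     uniform int base = ...;
//     float x = pts[base + programIndex].x;    // gather #1
//     float y = pts[base + programIndex].y;    // gather #2
//     float z = pts[base + programIndex].z;    // gather #3
//
// Each lane's address differs from its neighbors' only by compile-time
// constant offsets (programIndex * sizeof(Point) plus the field offset),
// the varying-offset operand is the same for every lane, and the mask is
// all on--exactly the situation this pass looks for. When it fires, the
// three gathers become a few wide vector loads from the shared base
// pointer plus shuffles that assemble each result.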
3340 
3341 class GatherCoalescePass : public llvm::BasicBlockPass {
3342 public:
3343  static char ID;
3344  GatherCoalescePass() : BasicBlockPass(ID) { }
3345 
3346 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
3347  const char *getPassName() const { return "Gather Coalescing"; }
3348 #else // LLVM 4.0+
3349  llvm::StringRef getPassName() const { return "Gather Coalescing"; }
3350 #endif
3351  bool runOnBasicBlock(llvm::BasicBlock &BB);
3352 };
3353 
3354 char GatherCoalescePass::ID = 0;
3355 
3356 
3357 /** Representation of a memory load that the gather coalescing code has
3358  decided to generate.
3359  */
3360 struct CoalescedLoadOp {
3361  CoalescedLoadOp(int64_t s, int c) {
3362  start = s;
3363  count = c;
3364  load = element0 = element1 = NULL;
3365  }
3366 
3367  /** Starting offset of the load from the common base pointer (in terms
3368  of numbers of items of the underlying element type--*not* in terms
3369  of bytes). */
3370  int64_t start;
3371 
3372  /** Number of elements to load at this location */
3373  int count;
3374 
3375  /** Value loaded from memory for this load op */
3376  llvm::Value *load;
3377 
3378  /** For 2-wide loads (i.e. 64-bit loads), these store the lower and
3379  upper 32 bits of the result, respectively. */
3380  llvm::Value *element0, *element1;
3381 };
3382 
3383 
3384 /** This function determines whether it makes sense (and is safe) to
3385  generate a vector load of width vectorWidth, starting at *iter. It
3386  returns true if so, setting *newIter to point to the next element in
3387  the set that isn't taken care of by the generated load. If a vector
3388  load of the given width doesn't make sense, then false is returned.
3389  */
3390 static bool
3391 lVectorLoadIsEfficient(std::set<int64_t>::iterator iter,
3392  std::set<int64_t>::iterator end,
3393  std::set<int64_t>::iterator *newIter, int vectorWidth) {
3394  // We're considering a vector load of width vectorWidth, starting at
3395  // the offset "start".
3396  int64_t start = *iter;
3397 
3398  // The basic idea is that we'll look at the subsequent elements in the
3399  // load set after the initial one at start. As long as subsequent
3400  // elements:
3401  //
3402  // 1. Aren't so far separated that they no longer fit into the range
3403  // [start, start+vectorWidth)
3404  //
3405  // 2. And don't have too large a gap in between them (e.g., it's not
3406  // worth generating an 8-wide load for two elements with offsets 0
3407  // and 7, but no loads requested in between).
3408  //
3409  // Then we continue moving forward through the elements until we either
3410  // fill up the vector or run out of elements.
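 // A worked example with hypothetical offsets: for the set {2, 3, 5, 10}
 // and vectorWidth == 4, starting at 2 we accept 3 (delta 1, span 2) and
 // then 5 (delta 2, span 4 == vectorWidth), so a 4-wide load starting at
 // offset 2 is chosen and *newIter is left pointing at 10. For the set
 // {0, 7} with vectorWidth == 8, the gap of 7 between the two offsets
 // exceeds the allowed delta of 3, so no vector load is issued from 0.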
3411 
3412  // lastAccepted holds the last offset we've processed and accepted as
3413  // valid for the vector load under consideration.
3414  int64_t lastAccepted = start;
3415 
3416  while (iter != end) {
3417  // What is the separation in offset values from the last element we
3418  // added to the set for this load?
3419  int64_t delta = *iter - lastAccepted;
3420  if (delta > 3)
3421  // If there's too big a gap, then we won't issue the load
3422  return false;
3423 
3424  int64_t span = *iter - start + 1;
3425 
3426  if (span == vectorWidth) {
3427  // We've extended far enough that we have exactly filled up the
3428  // entire vector width; we can't go any further, so return with
3429  // success. (Update *newIter to point at the next element
3430  // after the last one accepted here.)
3431  *newIter = ++iter;
3432  return true;
3433  }
3434  else if (span > vectorWidth) {
3435  // The current offset won't fit into a vectorWidth-wide load
3436  // starting from start. It's still generally worthwhile
3437  // issuing the load we've been considering, though, since it
3438  // will provide values for a number of previous offsets. This
3439  // load will have one or more elements at the end of its range
3440  // that are not needed by any of the offsets under
3441  // consideration. As such, there are three cases where issuing
3442  // this load is a bad idea:
3443  //
3444  // 1. 2-wide loads: we know that we haven't completely filled
3445  // the 2-wide vector, since otherwise the if() test above
3446  // would have succeeded previously. Therefore, we must have
3447  // a situation with offsets like (4,6,...); it would be a
3448  // silly idea to issue a 2-wide load to get the value for
3449  // the 4 offset, versus failing here and issuing a scalar
3450  // load instead.
3451  //
3452  // 2. If there are too many unnecessary values at the end of
3453  // the load extent (defined as more than half of them)--in
3454  // this case, it'd be better to issue a vector load of
3455  // smaller width anyway.
3456  //
3457  // 3. If the gap between the last accepted offset and the
3458  // current one under consideration is more than the page
3459  // size. In this case we can't be sure whether or not some
3460  // of the unused elements at the end of the load will
3461  // straddle a page boundary and thus lead to an undesirable
3462  // fault. (It's hard to imagine this happening in practice,
3463  // except under contrived circumstances, but better safe
3464  // than sorry.)
3465  const int pageSize = 4096;
3466  if (vectorWidth != 2 &&
3467  (lastAccepted - start) > (vectorWidth / 2) &&
3468  (*iter - lastAccepted) < pageSize) {
3469  *newIter = iter;
3470  return true;
3471  }
3472  else
3473  return false;
3474  }
3475 
3476  // Continue moving forward
3477  lastAccepted = *iter;
3478  ++iter;
3479  }
3480 
3481  return false;
3482 }
3483 
3484 
3485 /** Given a set of offsets from a common base pointer whose values we need
3486  to load, determine a reasonable set of load operations that covers all
3487  of the corresponding memory locations (ideally using as many wider
3488  vector loads as possible rather than scalar loads). Return a
3489  CoalescedLoadOp for each one in the *loads array.
3490  */
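/* For instance, with the hypothetical offset set {0, 1, 2, 3, 9}, a 4-wide
   load starting at offset 0 covers the first four offsets and the leftover
   offset 9 gets a scalar load, so *loads ends up holding
   CoalescedLoadOp(0, 4) followed by CoalescedLoadOp(9, 1). */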
3491 static void
3492 lSelectLoads(const std::vector<int64_t> &loadOffsets,
3493  std::vector<CoalescedLoadOp> *loads) {
3494  // First, get a sorted set of unique offsets to load from.
3495  std::set<int64_t> allOffsets;
3496  for (unsigned int i = 0; i < loadOffsets.size(); ++i)
3497  allOffsets.insert(loadOffsets[i]);
3498 
3499  std::set<int64_t>::iterator iter = allOffsets.begin();
3500  while (iter != allOffsets.end()) {
3501  Debug(SourcePos(), "Load needed at %" PRId64 ".", *iter);
3502  ++iter;
3503  }
3504 
3505  // Now, iterate over the offsets from low to high. Starting at the
3506  // current offset, we see if a vector load starting from that offset
3507  // will cover loads at subsequent offsets as well.
3508  iter = allOffsets.begin();
3509  while (iter != allOffsets.end()) {
3510  // Consider vector loads of width of each of the elements of
3511  // spanSizes[], in order.
3512  int vectorWidths[] = { 8, 4, 2 };
3513  int nVectorWidths = sizeof(vectorWidths) / sizeof(vectorWidths[0]);
3514  bool gotOne = false;
3515  for (int i = 0; i < nVectorWidths; ++i) {
3516  // See if a load of vector with width vectorWidths[i] would be
3517  // effective (i.e. would cover a reasonable number of the
3518  // offsets that need to be loaded from).
3519  std::set<int64_t>::iterator newIter;
3520  if (lVectorLoadIsEfficient(iter, allOffsets.end(), &newIter,
3521  vectorWidths[i])) {
3522  // Yes: create the corresponding coalesced load and update
3523  // the iterator to the returned iterator; doing so skips
3524  // over the additional offsets that are taken care of by
3525  // this load.
3526  loads->push_back(CoalescedLoadOp(*iter, vectorWidths[i]));
3527  iter = newIter;
3528  gotOne = true;
3529  break;
3530  }
3531  }
3532 
3533  if (gotOne == false) {
3534  // We couldn't find a vector load starting from this offset
3535  // that made sense, so emit a scalar load and continue onward.
3536  loads->push_back(CoalescedLoadOp(*iter, 1));
3537  ++iter;
3538  }
3539  }
3540 }
3541 
3542 
3543 /** Print a performance message with the details of the result of
3544  coalescing over a group of gathers. */
3545 static void
3546 lCoalescePerfInfo(const std::vector<llvm::CallInst *> &coalesceGroup,
3547  const std::vector<CoalescedLoadOp> &loadOps) {
3548  SourcePos pos;
3549  lGetSourcePosFromMetadata(coalesceGroup[0], &pos);
3550 
3551  // Create a string that indicates the line numbers of the subsequent
3552  // gathers from the first one that were coalesced here.
3553  char otherPositions[512];
3554  otherPositions[0] = '\0';
3555  if (coalesceGroup.size() > 1) {
3556  const char *plural = (coalesceGroup.size() > 2) ? "s" : "";
3557  char otherBuf[32];
3558  sprintf(otherBuf, "(other%s at line%s ", plural, plural);
3559  strcat(otherPositions, otherBuf);
3560 
3561  for (int i = 1; i < (int)coalesceGroup.size(); ++i) {
3562  SourcePos p;
3563  bool ok = lGetSourcePosFromMetadata(coalesceGroup[i], &p);
3564  if (ok) {
3565  char buf[32];
3566  sprintf(buf, "%d", p.first_line);
3567  strcat(otherPositions, buf);
3568  if (i < (int)coalesceGroup.size() - 1)
3569  strcat(otherPositions, ", ");
3570  }
3571  }
3572  strcat(otherPositions, ") ");
3573  }
3574 
3575  // Count how many loads of each size there were.
3576  std::map<int, int> loadOpsCount;
3577  for (int i = 0; i < (int)loadOps.size(); ++i)
3578  ++loadOpsCount[loadOps[i].count];
3579 
3580  // Generate a string that describes the mix of load ops
3581  char loadOpsInfo[512];
3582  loadOpsInfo[0] = '\0';
3583  std::map<int, int>::const_iterator iter = loadOpsCount.begin();
3584  while (iter != loadOpsCount.end()) {
3585  char buf[32];
3586  sprintf(buf, "%d x %d-wide", iter->second, iter->first);
3587  strcat(loadOpsInfo, buf);
3588  ++iter;
3589  if (iter != loadOpsCount.end())
3590  strcat(loadOpsInfo, ", ");
3591  }
3592 
3593  if (coalesceGroup.size() == 1)
3594  PerformanceWarning(pos, "Coalesced gather into %d load%s (%s).",
3595  (int)loadOps.size(),
3596  (loadOps.size() > 1) ? "s" : "", loadOpsInfo);
3597  else
3598  PerformanceWarning(pos, "Coalesced %d gathers starting here %sinto %d "
3599  "load%s (%s).", (int)coalesceGroup.size(),
3600  otherPositions,(int)loadOps.size(),
3601  (loadOps.size() > 1) ? "s" : "", loadOpsInfo);
3602 }
3603 
3604 
3605 /** Utility routine that computes an offset from a base pointer and then
3606  returns the result of a load of the given type from the resulting
3607  location:
3608 
3609  return *((type *)(basePtr + offset))
3610  */
3611 llvm::Value *
3612 lGEPAndLoad(llvm::Value *basePtr, int64_t offset, int align,
3613  llvm::Instruction *insertBefore, llvm::Type *type) {
3614  llvm::Value *ptr = lGEPInst(basePtr, LLVMInt64(offset), "new_base",
3615  insertBefore);
3616  ptr = new llvm::BitCastInst(ptr, llvm::PointerType::get(type, 0),
3617  "ptr_cast", insertBefore);
3618  return new llvm::LoadInst(ptr, "gather_load", false /* not volatile */,
3619  align, insertBefore);
3620 }
3621 
3622 
3623 /* Having decided that we're going to emit a series of loads, as encoded in
3624  the loadOps array, this function emits the corresponding load
3625  instructions.
3626  */
3627 static void
3628 lEmitLoads(llvm::Value *basePtr, std::vector<CoalescedLoadOp> &loadOps,
3629  int elementSize, llvm::Instruction *insertBefore) {
3630  Debug(SourcePos(), "Coalesce doing %d loads.", (int)loadOps.size());
3631  for (int i = 0; i < (int)loadOps.size(); ++i) {
3632  Debug(SourcePos(), "Load #%d @ %" PRId64 ", %d items", i, loadOps[i].start,
3633  loadOps[i].count);
3634 
3635  // basePtr is an i8 *, so the offset from it should be in terms of
3636  // bytes, not underlying i32 elements.
3637  int64_t start = loadOps[i].start * elementSize;
3638 
3639  int align = 4;
3640  switch (loadOps[i].count) {
3641  case 1:
3642  // Single 32-bit scalar load
3643  loadOps[i].load = lGEPAndLoad(basePtr, start, align, insertBefore,
3644  LLVMTypes::Int32Type);
3645  break;
3646  case 2: {
3647  // Emit 2 x i32 loads as i64 loads and then break the result
3648  // into two 32-bit parts.
3649  loadOps[i].load = lGEPAndLoad(basePtr, start, align, insertBefore,
3650  LLVMTypes::Int64Type);
3651  // element0 = (int32)value;
3652  loadOps[i].element0 =
3653  new llvm::TruncInst(loadOps[i].load, LLVMTypes::Int32Type,
3654  "load64_elt0", insertBefore);
3655  // element1 = (int32)(value >> 32)
3656  llvm::Value *shift =
3657  llvm::BinaryOperator::Create(llvm::Instruction::LShr,
3658  loadOps[i].load, LLVMInt64(32),
3659  "load64_shift", insertBefore);
3660  loadOps[i].element1 =
3661  new llvm::TruncInst(shift, LLVMTypes::Int32Type,
3662  "load64_elt1", insertBefore);
3663  break;
3664  }
3665  case 4: {
3666  // 4-wide vector load
3667  if (g->opt.forceAlignedMemory) {
3668  align = g->target->getNativeVectorAlignment();
3669  }
3670  llvm::VectorType *vt =
3671  llvm::VectorType::get(LLVMTypes::Int32Type, 4);
3672  loadOps[i].load = lGEPAndLoad(basePtr, start, align,
3673  insertBefore, vt);
3674  break;
3675  }
3676  case 8: {
3677  // 8-wide vector load
3678  if (g->opt.forceAlignedMemory) {
3679  align = g->target->getNativeVectorAlignment();
3680  }
3681  llvm::VectorType *vt =
3682  llvm::VectorType::get(LLVMTypes::Int32Type, 8);
3683  loadOps[i].load = lGEPAndLoad(basePtr, start, align,
3684  insertBefore, vt);
3685  break;
3686  }
3687  default:
3688  FATAL("Unexpected load count in lEmitLoads()");
3689  }
3690  }
3691 }
3692 
3693 
3694 /** Convert any loads of 8-wide vectors into two 4-wide vectors
3695  (logically). This allows the assembly code below to always operate on
3696  4-wide vectors, which leads to better code. Returns a new vector of
3697  load operations.
3698  */
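/* For example (hypothetical values): an 8-wide CoalescedLoadOp starting at
   offset 16 becomes two 4-wide entries, one at offset 16 whose load value
   is a shuffle selecting elements <0,1,2,3> of the original 8-wide loaded
   value and one at offset 20 selecting elements <4,5,6,7>; 1-, 2-, and
   4-wide entries are passed through unchanged. */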
3699 static std::vector<CoalescedLoadOp>
3700 lSplit8WideLoads(const std::vector<CoalescedLoadOp> &loadOps,
3701  llvm::Instruction *insertBefore) {
3702  std::vector<CoalescedLoadOp> ret;
3703  for (unsigned int i = 0; i < loadOps.size(); ++i) {
3704  if (loadOps[i].count == 8) {
3705  // Create fake CoalescedLoadOps, where the load llvm::Value is
3706  // actually a shuffle that pulls either the first 4 or the last
3707  // 4 values out of the original 8-wide loaded value.
3708  int32_t shuf[2][4] = { { 0, 1, 2, 3 }, { 4, 5, 6, 7 } };
3709 
3710  ret.push_back(CoalescedLoadOp(loadOps[i].start, 4));
3711  ret.back().load = LLVMShuffleVectors(loadOps[i].load, loadOps[i].load,
3712  shuf[0], 4, insertBefore);
3713 
3714  ret.push_back(CoalescedLoadOp(loadOps[i].start+4, 4));
3715  ret.back().load = LLVMShuffleVectors(loadOps[i].load, loadOps[i].load,
3716  shuf[1], 4, insertBefore);
3717  }
3718  else
3719  ret.push_back(loadOps[i]);
3720  }
3721 
3722  return ret;
3723 }
3724 
3725 
3726 /** Given a 1-wide load of a 32-bit value, merge its value into the result
3727  vector for any and all elements for which it applies.
3728  */
3729 static llvm::Value *
3730 lApplyLoad1(llvm::Value *result, const CoalescedLoadOp &load,
3731  const int64_t offsets[4], bool set[4],
3732  llvm::Instruction *insertBefore) {
3733  for (int elt = 0; elt < 4; ++elt) {
3734  if (offsets[elt] >= load.start &&
3735  offsets[elt] < load.start + load.count) {
3736  Debug(SourcePos(), "Load 1 @ %" PRId64 " matches for element #%d "
3737  "(value %" PRId64 ")", load.start, elt, offsets[elt]);
3738  // If this load gives one of the values that we need, then we
3739  // can just insert it in directly
3740  Assert(set[elt] == false);
3741  result =
3742  llvm::InsertElementInst::Create(result, load.load, LLVMInt32(elt),
3743  "insert_load", insertBefore);
3744  set[elt] = true;
3745  }
3746  }
3747 
3748  return result;
3749 }
3750 
3751 
3752 /** Similarly, incorporate the values from a 2-wide load into any vector
3753  elements that they apply to. */
3754 static llvm::Value *
3755 lApplyLoad2(llvm::Value *result, const CoalescedLoadOp &load,
3756  const int64_t offsets[4], bool set[4],
3757  llvm::Instruction *insertBefore) {
3758  for (int elt = 0; elt < 4; ++elt) {
3759  // First, try to do a 64-bit-wide insert into the result vector.
3760  // We can do this when we're currently at an even element, when the
3761  // current and next element have consecutive values, and where the
3762  // original 64-bit load is at the offset needed by the current
3763  // element.
3764  if ((elt & 1) == 0 &&
3765  offsets[elt] + 1 == offsets[elt+1] &&
3766  offsets[elt] == load.start) {
3767  Debug(SourcePos(), "Load 2 @ %" PRId64 " matches for elements #%d,%d "
3768  "(values %" PRId64 ",%" PRId64 ")", load.start, elt, elt+1,
3769  offsets[elt], offsets[elt+1]);
3770  Assert(set[elt] == false && set[elt+1] == false);
3771 
3772  // In this case, we bitcast from a 4xi32 to a 2xi64 vector
3773  llvm::Type *vec2x64Type =
3774  llvm::VectorType::get(LLVMTypes::Int64Type, 2);
3775  result = new llvm::BitCastInst(result, vec2x64Type, "to2x64",
3776  insertBefore);
3777 
3778  // And now we can insert the 64-bit wide value into the
3779  // appropriate element.
3780  result = llvm::InsertElementInst::Create(result, load.load,
3781  LLVMInt32(elt/2),
3782  "insert64", insertBefore);
3783 
3784  // And back to 4xi32.
3785  llvm::Type *vec4x32Type =
3786  llvm::VectorType::get(LLVMTypes::Int32Type, 4);
3787  result = new llvm::BitCastInst(result, vec4x32Type, "to4x32",
3788  insertBefore);
3789 
3790  set[elt] = set[elt+1] = true;
3791  // Advance elt one extra time, since we just took care of two
3792  // elements
3793  ++elt;
3794  }
3795  else if (offsets[elt] >= load.start &&
3796  offsets[elt] < load.start + load.count) {
3797  Debug(SourcePos(), "Load 2 @ %" PRId64 " matches for element #%d "
3798  "(value %" PRId64 ")", load.start, elt, offsets[elt]);
3799  // Otherwise, insert one of the 32-bit pieces into an element
3800  // of the final vector
3801  Assert(set[elt] == false);
3802  llvm::Value *toInsert = (offsets[elt] == load.start) ?
3803  load.element0 : load.element1;
3804  result =
3805  llvm::InsertElementInst::Create(result, toInsert, LLVMInt32(elt),
3806  "insert_load", insertBefore);
3807  set[elt] = true;
3808  }
3809  }
3810 
3811  return result;
3812 }
3813 
3814 
3815 #if 1
3816 /* This approach works better with AVX, while the #else path generates
3817  slightly better code with SSE. Need to continue to dig into performance
3818  details with this stuff in general... */
3819 
3820 /** And handle a 4-wide load */
3821 static llvm::Value *
3822 lApplyLoad4(llvm::Value *result, const CoalescedLoadOp &load,
3823  const int64_t offsets[4], bool set[4],
3824  llvm::Instruction *insertBefore) {
3825  // Conceptually, we're going to do a shufflevector with
3826  // the 4-wide load and the 4-wide result we have so far to generate a
3827  // new 4-wide vector. We'll start with shuffle indices that just
3828  // select each element of the result so far for the result.
3829  int32_t shuf[4] = { 4, 5, 6, 7 };
3830 
3831  for (int elt = 0; elt < 4; ++elt) {
3832  if (offsets[elt] >= load.start &&
3833  offsets[elt] < load.start + load.count) {
3834  Debug(SourcePos(), "Load 4 @ %" PRId64 " matches for element #%d "
3835  "(value %" PRId64 ")", load.start, elt, offsets[elt]);
3836 
3837  // If the current element falls within the range of locations
3838  // that the 4-wide load covers, then compute the appropriate
3839  // shuffle index that extracts the appropriate element from the
3840  // load.
3841  Assert(set[elt] == false);
3842  shuf[elt] = int32_t(offsets[elt] - load.start);
3843  set[elt] = true;
3844  }
3845  }
3846 
3847  // Now, issue a shufflevector instruction if any of the values from the
3848  // load we just considered were applicable.
3849  if (shuf[0] != 4 || shuf[1] != 5 || shuf[2] != 6 || shuf[3] != 7)
3850  result = LLVMShuffleVectors(load.load, result, shuf, 4, insertBefore);
3851 
3852  return result;
3853 }
3854 
3855 
3856 /** We need to fill in the values for a 4-wide result vector. This
3857  function looks at all of the generated loads and extracts the
3858  appropriate elements from the appropriate loads to assemble the result.
3859  Here the offsets[] parameter gives the 4 offsets from the base pointer
3860  for the four elements of the result.
3861 */
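/* A hypothetical example: for offsets = {0, 1, 4, 5} with a 4-wide load at
   offset 0 and a 2-wide load at offset 4, elements 0 and 1 come from the
   4-wide load via a shufflevector, and elements 2 and 3 are filled in a
   single step by bitcasting the partial result to <2 x i64> and inserting
   the 64-bit loaded value as its second element. */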
3862 static llvm::Value *
3863 lAssemble4Vector(const std::vector<CoalescedLoadOp> &loadOps,
3864  const int64_t offsets[4], llvm::Instruction *insertBefore) {
3865  llvm::Type *returnType =
3866  llvm::VectorType::get(LLVMTypes::Int32Type, 4);
3867  llvm::Value *result = llvm::UndefValue::get(returnType);
3868 
3869  Debug(SourcePos(), "Starting search for loads [%" PRId64 " %" PRId64 " %"
3870  PRId64 " %" PRId64 "].", offsets[0], offsets[1], offsets[2], offsets[3]);
3871 
3872  // Track whether we have found a valid value for each of the four
3873  // elements of the result
3874  bool set[4] = { false, false, false, false };
3875 
3876  // Loop over all of the loads and check each one to see if it provides
3877  // a value that's applicable to the result
3878  for (int load = 0; load < (int)loadOps.size(); ++load) {
3879  const CoalescedLoadOp &li = loadOps[load];
3880 
3881  switch (li.count) {
3882  case 1:
3883  result = lApplyLoad1(result, li, offsets, set, insertBefore);
3884  break;
3885  case 2:
3886  result = lApplyLoad2(result, li, offsets, set, insertBefore);
3887  break;
3888  case 4:
3889  result = lApplyLoad4(result, li, offsets, set, insertBefore);
3890  break;
3891  default:
3892  FATAL("Unexpected load count in lAssemble4Vector()");
3893  }
3894  }
3895 
3896  Debug(SourcePos(), "Done with search for loads [%" PRId64 " %" PRId64 " %"
3897  PRId64 " %" PRId64 "].", offsets[0], offsets[1], offsets[2], offsets[3]);
3898 
3899  for (int i = 0; i < 4; ++i)
3900  Assert(set[i] == true);
3901 
3902  return result;
3903 }
3904 
3905 #else
3906 
3907 static llvm::Value *
3908 lApplyLoad4s(llvm::Value *result, const std::vector<CoalescedLoadOp> &loadOps,
3909  const int64_t offsets[4], bool set[4],
3910  llvm::Instruction *insertBefore) {
3911  int32_t firstMatchElements[4] = { -1, -1, -1, -1 };
3912  const CoalescedLoadOp *firstMatch = NULL;
3913 
3914  Assert(llvm::isa<llvm::UndefValue>(result));
3915 
3916  for (int load = 0; load < (int)loadOps.size(); ++load) {
3917  const CoalescedLoadOp &loadop = loadOps[load];
3918  if (loadop.count != 4)
3919  continue;
3920 
3921  int32_t matchElements[4] = { -1, -1, -1, -1 };
3922  bool anyMatched = false;
3923  for (int elt = 0; elt < 4; ++elt) {
3924  if (offsets[elt] >= loadop.start &&
3925  offsets[elt] < loadop.start + loadop.count) {
3926  Debug(SourcePos(), "Load 4 @ %" PRId64 " matches for element #%d "
3927  "(value %" PRId64 ")", loadop.start, elt, offsets[elt]);
3928  anyMatched = true;
3929  Assert(set[elt] == false);
3930  matchElements[elt] = offsets[elt] - loadop.start;
3931  set[elt] = true;
3932  }
3933  }
3934 
3935  if (anyMatched) {
3936  if (llvm::isa<llvm::UndefValue>(result)) {
3937  if (firstMatch == NULL) {
3938  firstMatch = &loadop;
3939  for (int i = 0; i < 4; ++i)
3940  firstMatchElements[i] = matchElements[i];
3941  }
3942  else {
3943  int32_t shuffle[4] = { -1, -1, -1, -1 };
3944  for (int i = 0; i < 4; ++i) {
3945  if (firstMatchElements[i] != -1)
3946  shuffle[i] = firstMatchElements[i];
3947  else
3948  shuffle[i] = 4 + matchElements[i];
3949  }
3950  result = LLVMShuffleVectors(firstMatch->load, loadop.load, shuffle,
3951  4, insertBefore);
3952  firstMatch = NULL;
3953  }
3954  }
3955  else {
3956  int32_t shuffle[4] = { -1, -1, -1, -1 };
3957  for (int i = 0; i < 4; ++i) {
3958  if (matchElements[i] != -1)
3959  shuffle[i] = 4 + matchElements[i];
3960  else
3961  shuffle[i] = i;
3962  }
3963  result = LLVMShuffleVectors(result, loadop.load, shuffle, 4,
3964  insertBefore);
3965  }
3966  }
3967  }
3968 
3969  if (firstMatch != NULL && llvm::isa<llvm::UndefValue>(result))
3970  return LLVMShuffleVectors(firstMatch->load, result, firstMatchElements,
3971  4, insertBefore);
3972  else
3973  return result;
3974 }
3975 
3976 
3977 static llvm::Value *
3978 lApplyLoad12s(llvm::Value *result, const std::vector<CoalescedLoadOp> &loadOps,
3979  const int64_t offsets[4], bool set[4],
3980  llvm::Instruction *insertBefore) {
3981  // Loop over all of the loads and check each one to see if it provides
3982  // a value that's applicable to the result
3983  for (int load = 0; load < (int)loadOps.size(); ++load) {
3984  const CoalescedLoadOp &loadop = loadOps[load];
3985  Assert(loadop.count == 1 || loadop.count == 2 || loadop.count == 4);
3986 
3987  if (loadop.count == 1)
3988  result = lApplyLoad1(result, loadop, offsets, set, insertBefore);
3989  else if (loadop.count == 2)
3990  result = lApplyLoad2(result, loadop, offsets, set, insertBefore);
3991  }
3992  return result;
3993 }
3994 
3995 
3996 /** We need to fill in the values for a 4-wide result vector. This
3997  function looks at all of the generated loads and extracts the
3998  appropriate elements from the appropriate loads to assemble the result.
3999  Here the offsets[] parameter gives the 4 offsets from the base pointer
4000  for the four elements of the result.
4001 */
4002 static llvm::Value *
4003 lAssemble4Vector(const std::vector<CoalescedLoadOp> &loadOps,
4004  const int64_t offsets[4], llvm::Instruction *insertBefore) {
4005  llvm::Type *returnType =
4006  llvm::VectorType::get(LLVMTypes::Int32Type, 4);
4007  llvm::Value *result = llvm::UndefValue::get(returnType);
4008 
4009  Debug(SourcePos(), "Starting search for loads [%" PRId64 " %" PRId64 " %"
4010  PRId64 " %" PRId64 "].", offsets[0], offsets[1], offsets[2], offsets[3]);
4011 
4012  // Track whether we have found a valid value for each of the four
4013  // elements of the result
4014  bool set[4] = { false, false, false, false };
4015 
4016  result = lApplyLoad4s(result, loadOps, offsets, set, insertBefore);
4017  result = lApplyLoad12s(result, loadOps, offsets, set, insertBefore);
4018 
4019  Debug(SourcePos(), "Done with search for loads [%" PRId64 " %" PRId64 " %"
4020  PRId64 " %" PRId64 "].", offsets[0], offsets[1], offsets[2], offsets[3]);
4021 
4022  for (int i = 0; i < 4; ++i)
4023  Assert(set[i] == true);
4024 
4025  return result;
4026 }
4027 #endif
4028 
4029 
4030 /** Given the set of loads that we've done and the set of result values to
4031  be computed, this function computes the final llvm::Value *s for each
4032  result vector.
4033  */
4034 static void
4035 lAssembleResultVectors(const std::vector<CoalescedLoadOp> &loadOps,
4036  const std::vector<int64_t> &constOffsets,
4037  std::vector<llvm::Value *> &results,
4038  llvm::Instruction *insertBefore) {
4039  // We work on 4-wide chunks of the final values, even when we're
4040  // computing 8-wide or 16-wide vectors. This gives better code from
4041  // LLVM's SSE/AVX code generators.
4042  Assert((constOffsets.size() % 4) == 0);
4043  std::vector<llvm::Value *> vec4s;
4044  for (int i = 0; i < (int)constOffsets.size(); i += 4)
4045  vec4s.push_back(lAssemble4Vector(loadOps, &constOffsets[i],
4046  insertBefore));
4047 
4048  // And now concatenate 1, 2, or 4 of the 4-wide vectors computed above
4049  // into 4, 8, or 16-wide final result vectors.
4050  int numGathers = constOffsets.size() / g->target->getVectorWidth();
4051  for (int i = 0; i < numGathers; ++i) {
4052  llvm::Value *result = NULL;
4053  switch (g->target->getVectorWidth()) {
4054  case 4:
4055  result = vec4s[i];
4056  break;
4057  case 8:
4058  result = LLVMConcatVectors(vec4s[2*i], vec4s[2*i+1], insertBefore);
4059  break;
4060  case 16: {
4061  llvm::Value *v1 = LLVMConcatVectors(vec4s[4*i], vec4s[4*i+1],
4062  insertBefore);
4063  llvm::Value *v2 = LLVMConcatVectors(vec4s[4*i+2], vec4s[4*i+3],
4064  insertBefore);
4065  result = LLVMConcatVectors(v1, v2, insertBefore);
4066  break;
4067  }
4068  default:
4069  FATAL("Unhandled vector width in lAssembleResultVectors()");
4070  }
4071 
4072  results.push_back(result);
4073  }
4074 }
4075 
4076 
4077 /** Given a call to a gather function, extract the base pointer, the 2/4/8
4078  scale, and the first varying offsets value, and use them to compute the
4079  scalar base pointer that is shared by all of the gathers in the group.
4080  (Thus, this base pointer plus the constant offsets term for each gather
4081  gives the set of addresses to use for that gather.)
4082  */
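/* In effect (a sketch, with names standing in for the values computed
   below):
       offset     = extract(variableOffsets, 0) * offsetScale;
       sharedBase = gep(basePtr, offset);
   and each gather's addresses are then sharedBase plus its per-lane
   constant offsets. */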
4083 static llvm::Value *
4084 lComputeBasePtr(llvm::CallInst *gatherInst, llvm::Instruction *insertBefore) {
4085  llvm::Value *basePtr = gatherInst->getArgOperand(0);
4086  llvm::Value *variableOffsets = gatherInst->getArgOperand(1);
4087  llvm::Value *offsetScale = gatherInst->getArgOperand(2);
4088 
4089  // All of the variable offsets values should be the same, due to
4090  // checking for this in GatherCoalescePass::runOnBasicBlock(). Thus,
4091  // extract the first value and use that as a scalar.
4092  llvm::Value *variable = LLVMExtractFirstVectorElement(variableOffsets);
4093  if (variable->getType() == LLVMTypes::Int64Type)
4094  offsetScale = new llvm::ZExtInst(offsetScale, LLVMTypes::Int64Type,
4095  "scale_to64", insertBefore);
4096  llvm::Value *offset =
4097  llvm::BinaryOperator::Create(llvm::Instruction::Mul, variable,
4098  offsetScale, "offset", insertBefore);
4099 
4100  return lGEPInst(basePtr, offset, "new_base", insertBefore);
4101 }
4102 
4103 
4104 /** Extract the constant offsets (from the common base pointer) from each
4105  of the gathers in a set to be coalesced. These come in as byte
4106  offsets, but we'll transform them into offsets in terms of the size of
4107  the base scalar type being gathered. (e.g. for an i32 gather, we might
4108  have offsets like <0,4,16,20>, which would be transformed to <0,1,4,5>
4109  here.)
4110  */
4111 static void
4112 lExtractConstOffsets(const std::vector<llvm::CallInst *> &coalesceGroup,
4113  int elementSize, std::vector<int64_t> *constOffsets) {
4114  int width = g->target->getVectorWidth();
4115  *constOffsets = std::vector<int64_t>(coalesceGroup.size() * width, 0);
4116 
4117  int64_t *endPtr = &((*constOffsets)[0]);
4118  for (int i = 0; i < (int)coalesceGroup.size(); ++i, endPtr += width) {
4119  llvm::Value *offsets = coalesceGroup[i]->getArgOperand(3);
4120  int nElts;
4121  bool ok = LLVMExtractVectorInts(offsets, endPtr, &nElts);
4122  Assert(ok && nElts == width);
4123  }
4124 
4125  for (int i = 0; i < (int)constOffsets->size(); ++i)
4126  (*constOffsets)[i] /= elementSize;
4127 }
4128 
4129 
4130 /** Actually do the coalescing. We have a set of gathers all accessing
4131  addresses of the form:
4132 
4133  (ptr + {1,2,4,8} * varyingOffset) + constOffset, a.k.a.
4134  basePtr + constOffset
4135 
4136  where varyingOffset actually has the same value across all of the SIMD
4137  lanes and where the part in parentheses has the same value for all of
4138  the gathers in the group.
4139  */
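/* For example (made-up numbers): with a scale of 4 and a varyingOffset
   whose lanes are all 10, basePtr is ptr + 40; a gather whose constant
   byte offsets are <0, 4, 8, 12> then reads the four i32 values at
   ptr + 40 through ptr + 52, which the load-selection logic below can
   cover with a single 4-wide vector load. */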
4140 static bool
4141 lCoalesceGathers(const std::vector<llvm::CallInst *> &coalesceGroup) {
4142  llvm::Instruction *insertBefore = coalesceGroup[0];
4143 
4144  // First, compute the shared base pointer for all of the gathers
4145  llvm::Value *basePtr = lComputeBasePtr(coalesceGroup[0], insertBefore);
4146 
4147  int elementSize = 0;
4148  if (coalesceGroup[0]->getType() == LLVMTypes::Int32VectorType ||
4149  coalesceGroup[0]->getType() == LLVMTypes::FloatVectorType)
4150  elementSize = 4;
4151  else if (coalesceGroup[0]->getType() == LLVMTypes::Int64VectorType ||
4152  coalesceGroup[0]->getType() == LLVMTypes::DoubleVectorType)
4153  elementSize = 8;
4154  else
4155  FATAL("Unexpected gather type in lCoalesceGathers");
4156 
4157  // Extract the constant offsets from the gathers into the constOffsets
4158  // vector: the first vectorWidth elements will be those for the first
4159  // gather, the next vectorWidth those for the next gather, and so
4160  // forth.
4161  std::vector<int64_t> constOffsets;
4162  lExtractConstOffsets(coalesceGroup, elementSize, &constOffsets);
4163 
4164  // Determine a set of loads to perform to get all of the values we need
4165  // loaded.
4166  std::vector<CoalescedLoadOp> loadOps;
4167  lSelectLoads(constOffsets, &loadOps);
4168 
4169  lCoalescePerfInfo(coalesceGroup, loadOps);
4170 
4171  // Actually emit load instructions for them
4172  lEmitLoads(basePtr, loadOps, elementSize, insertBefore);
4173 
4174  // Now, for any loads that give us <8 x i32> vectors, split their
4175  // values into two <4 x i32> vectors; it turns out that LLVM gives us
4176  // better code on AVX when we assemble the pieces from 4-wide vectors.
4177  loadOps = lSplit8WideLoads(loadOps, insertBefore);
4178 
4179  // Given all of these chunks of values, shuffle together a vector that
4180  // gives us each result value; the i'th element of results[] gives the
4181  // result for the i'th gather in coalesceGroup.
4182  std::vector<llvm::Value *> results;
4183  lAssembleResultVectors(loadOps, constOffsets, results, insertBefore);
4184 
4185  // Finally, replace each of the original gathers with the instruction
4186  // that gives the value from the coalescing process.
4187  Assert(results.size() == coalesceGroup.size());
4188  for (int i = 0; i < (int)results.size(); ++i) {
4189  llvm::Instruction *ir = llvm::dyn_cast<llvm::Instruction>(results[i]);
4190  Assert(ir != NULL);
4191 
4192  llvm::Type *origType = coalesceGroup[i]->getType();
4193  if (origType != ir->getType())
4194  ir = new llvm::BitCastInst(ir, origType, ir->getName(),
4195  coalesceGroup[i]);
4196 
4197  // Previously, all of the instructions to compute the final result
4198  // were inserted into the basic block; here we remove the very last one
4199  // of them (the one that holds the final result) from the basic block.
4200  // This way, the following ReplaceInstWithInst() call will operate
4201  // successfully. (It expects that the second argument not be in any
4202  // basic block.)
4203  ir->removeFromParent();
4204 
4205  llvm::ReplaceInstWithInst(coalesceGroup[i], ir);
4206  }
4207 
4208  return true;
4209 }
4210 
4211 
4212 /** Given an instruction, returns true if the instruction may write to
4213  memory. This is a conservative test in that it may return true for
4214  some instructions that don't actually end up writing to memory, but
4215  should never return false for an instruction that does write to
4216  memory. */
4217 static bool
4218 lInstructionMayWriteToMemory(llvm::Instruction *inst) {
4219  if (llvm::isa<llvm::StoreInst>(inst) ||
4220  llvm::isa<llvm::AtomicRMWInst>(inst) ||
4221  llvm::isa<llvm::AtomicCmpXchgInst>(inst))
4222  // FIXME: we could be less conservative and try to allow stores if
4223  // we are sure that the pointers don't overlap..
4224  return true;
4225 
4226  // Otherwise, any call instruction that doesn't have an attribute
4227  // indicating it won't write to memory has to be treated as a potential
4228  // store.
4229  llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst);
4230  if (ci != NULL) {
4231  llvm::Function *calledFunc = ci->getCalledFunction();
4232  if (calledFunc == NULL)
4233  return true;
4234 
4235  if (calledFunc->onlyReadsMemory() || calledFunc->doesNotAccessMemory())
4236  return false;
4237  return true;
4238  }
4239 
4240  return false;
4241 }
4242 
4243 
4244 bool
4245 GatherCoalescePass::runOnBasicBlock(llvm::BasicBlock &bb) {
4246  DEBUG_START_PASS("GatherCoalescePass");
4247 
4248  llvm::Function *gatherFuncs[] = {
4249  m->module->getFunction("__pseudo_gather_factored_base_offsets32_i32"),
4250  m->module->getFunction("__pseudo_gather_factored_base_offsets32_float"),
4251  m->module->getFunction("__pseudo_gather_factored_base_offsets64_i32"),
4252  m->module->getFunction("__pseudo_gather_factored_base_offsets64_float"),
4253  };
4254  int nGatherFuncs = sizeof(gatherFuncs) / sizeof(gatherFuncs[0]);
4255 
4256  bool modifiedAny = false;
4257 
4258  restart:
4259  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e;
4260  ++iter) {
4261  // Iterate over all of the instructions and look for calls to
4262  // __pseudo_gather_factored_base_offsets{32,64}_{i32,float} calls.
4263  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
4264  if (callInst == NULL)
4265  continue;
4266 
4267  llvm::Function *calledFunc = callInst->getCalledFunction();
4268  if (calledFunc == NULL)
4269  continue;
4270 
4271  int i;
4272  for (i = 0; i < nGatherFuncs; ++i)
4273  if (gatherFuncs[i] != NULL && calledFunc == gatherFuncs[i])
4274  break;
4275  if (i == nGatherFuncs)
4276  // Doesn't match any of the types of gathers we care about
4277  continue;
4278 
4279  SourcePos pos;
4280  lGetSourcePosFromMetadata(callInst, &pos);
4281  Debug(pos, "Checking for coalescable gathers starting here...");
4282 
4283  llvm::Value *base = callInst->getArgOperand(0);
4284  llvm::Value *variableOffsets = callInst->getArgOperand(1);
4285  llvm::Value *offsetScale = callInst->getArgOperand(2);
4286  llvm::Value *mask = callInst->getArgOperand(4);
4287 
4288  // To apply this optimization, we need a set of one or more gathers
4289  // that fulfill the following conditions:
4290  //
4291  // - Mask all on
4292  // - The variable offsets to all have the same value (i.e., to be
4293  // uniform).
4294  // - Same base pointer, variable offsets, and offset scale (for
4295  // more than one gather)
4296  //
4297  // Then and only then do we have a common base pointer with all
4298  // offsets from it being constants (in which case we can potentially
4299  // coalesce).
4300  if (lGetMaskStatus(mask) != ALL_ON)
4301  continue;
4302 
4303  if (!LLVMVectorValuesAllEqual(variableOffsets))
4304  continue;
4305 
4306  // coalesceGroup stores the set of gathers that we're going to try to
4307  // coalesce over
4308  std::vector<llvm::CallInst *> coalesceGroup;
4309  coalesceGroup.push_back(callInst);
4310 
4311  // Start iterating at the instruction after the initial gather;
4312  // look at the remainder of instructions in the basic block (up
4313  // until we reach a write to memory) to try to find any other
4314  // gathers that can coalesce with this one.
4315  llvm::BasicBlock::iterator fwdIter = iter;
4316  ++fwdIter;
4317  for (; fwdIter != bb.end(); ++fwdIter) {
4318  // Must stop once we come to an instruction that may write to
4319  // memory; otherwise we could end up moving a read before this
4320  // write.
4321  if (lInstructionMayWriteToMemory(&*fwdIter))
4322  break;
4323 
4324  llvm::CallInst *fwdCall = llvm::dyn_cast<llvm::CallInst>(&*fwdIter);
4325  if (fwdCall == NULL ||
4326  fwdCall->getCalledFunction() != calledFunc)
4327  continue;
4328 
4329  SourcePos fwdPos;
4330  // TODO: need to redesign metadata attached to pseudo calls,
4331  // LLVM drops metadata frequently and it results in bad diagnostics.
4332  lGetSourcePosFromMetadata(fwdCall, &fwdPos);
4333 
4334  if (g->debugPrint) {
4335  if (base != fwdCall->getArgOperand(0)) {
4336  Debug(fwdPos, "base pointers mismatch");
4337  LLVMDumpValue(base);
4338  LLVMDumpValue(fwdCall->getArgOperand(0));
4339  }
4340  if (variableOffsets != fwdCall->getArgOperand(1)) {
4341  Debug(fwdPos, "varying offsets mismatch");
4342  LLVMDumpValue(variableOffsets);
4343  LLVMDumpValue(fwdCall->getArgOperand(1));
4344  }
4345  if (offsetScale != fwdCall->getArgOperand(2)) {
4346  Debug(fwdPos, "offset scales mismatch");
4347  LLVMDumpValue(offsetScale);
4348  LLVMDumpValue(fwdCall->getArgOperand(2));
4349  }
4350  if (mask != fwdCall->getArgOperand(4)) {
4351  Debug(fwdPos, "masks mismatch");
4352  LLVMDumpValue(mask);
4353  LLVMDumpValue(fwdCall->getArgOperand(4));
4354  }
4355  }
4356 
4357  if (base == fwdCall->getArgOperand(0) &&
4358  variableOffsets == fwdCall->getArgOperand(1) &&
4359  offsetScale == fwdCall->getArgOperand(2) &&
4360  mask == fwdCall->getArgOperand(4)) {
4361  Debug(fwdPos, "This gather can be coalesced.");
4362  coalesceGroup.push_back(fwdCall);
4363 
4364  if (coalesceGroup.size() == 4)
4365  // FIXME: untested heuristic: don't try to coalesce
4366  // over a window of more than 4 gathers, so that we
4367  // don't cause too much register pressure and end up
4368  // spilling to memory anyway.
4369  break;
4370  }
4371  else
4372  Debug(fwdPos, "This gather doesn't match the initial one.");
4373  }
4374 
4375  Debug(pos, "Done with checking for matching gathers");
4376 
4377  // Now that we have a group of gathers, see if we can coalesce them
4378  // into something more efficient than the original set of gathers.
4379  if (lCoalesceGathers(coalesceGroup)) {
4380  modifiedAny = true;
4381  goto restart;
4382  }
4383  }
4384 
4385  DEBUG_END_PASS("GatherCoalescePass");
4386 
4387  return modifiedAny;
4388 }
4389 
4390 
4391 static llvm::Pass *
4392 CreateGatherCoalescePass() {
4393  return new GatherCoalescePass;
4394 }
4395 
4396 
4397 ///////////////////////////////////////////////////////////////////////////
4398 // ReplacePseudoMemoryOpsPass
4399 
4400 /** For any gathers and scatters remaining after the GSToLoadStorePass
4401  runs, we need to turn them into actual native gathers and scatters.
4402  This task is handled by the ReplacePseudoMemoryOpsPass here.
4403  */
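// Illustrative sketch of the rewrite this pass performs (schematic IR, not
// verbatim from the builtins): a remaining pseudo op such as
//     call void @__pseudo_scatter32_i32(<W x i32> %ptrs, <W x i32> %vals, <W x MASK> %mask)
// simply has its callee swapped for the corresponding target routine,
//     call void @__scatter32_i32(<W x i32> %ptrs, <W x i32> %vals, <W x MASK> %mask)
// while pseudo masked stores may instead be lowered to a load/blend/store
// sequence; see lReplacePseudoGS() and lReplacePseudoMaskedStore() below.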
4404 class ReplacePseudoMemoryOpsPass : public llvm::BasicBlockPass {
4405 public:
4406  static char ID;
4407  ReplacePseudoMemoryOpsPass() : BasicBlockPass(ID) { }
4408 
4409 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
4410  const char *getPassName() const { return "Replace Pseudo Memory Ops"; }
4411 #else // LLVM 4.0+
4412  llvm::StringRef getPassName() const { return "Replace Pseudo Memory Ops"; }
4413 #endif
4414  bool runOnBasicBlock(llvm::BasicBlock &BB);
4415 };
4416 
4417 
4418 char ReplacePseudoMemoryOpsPass::ID = 0;
4419 
4420 /** This routine attempts to determine if the given pointer in lvalue is
4421  pointing to stack-allocated memory. It's conservative in that it
4422  should never return true for non-stack allocated memory, but may return
4423  false for memory that actually is stack allocated. The basic strategy
4424  is to traverse through the operands and see if the pointer originally
4425  comes from an AllocaInst.
4426 */
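// A quick illustration (hypothetical, not from the original source): a
// pointer traced back through bitcasts/GEPs to
//     %v = alloca <8 x float>
// (a full-width vector on the stack, assuming an 8-wide target) is reported
// as safe to blend, while a pointer that arrives as a function argument
// can't be traced to an AllocaInst, so the routine conservatively returns
// false for it.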
4427 static bool
4428 lIsSafeToBlend(llvm::Value *lvalue) {
4429  llvm::BitCastInst *bc = llvm::dyn_cast<llvm::BitCastInst>(lvalue);
4430  if (bc != NULL)
4431  return lIsSafeToBlend(bc->getOperand(0));
4432  else {
4433  llvm::AllocaInst *ai = llvm::dyn_cast<llvm::AllocaInst>(lvalue);
4434  if (ai) {
4435  llvm::Type *type = ai->getType();
4436  llvm::PointerType *pt =
4437  llvm::dyn_cast<llvm::PointerType>(type);
4438  assert(pt != NULL);
4439  type = pt->getElementType();
4440  llvm::ArrayType *at;
4441  while ((at = llvm::dyn_cast<llvm::ArrayType>(type))) {
4442  type = at->getElementType();
4443  }
4444  llvm::VectorType *vt =
4445  llvm::dyn_cast<llvm::VectorType>(type);
4446  return (vt != NULL &&
4447  (int)vt->getNumElements() == g->target->getVectorWidth());
4448  }
4449  else {
4450  llvm::GetElementPtrInst *gep =
4451  llvm::dyn_cast<llvm::GetElementPtrInst>(lvalue);
4452  if (gep != NULL)
4453  return lIsSafeToBlend(gep->getOperand(0));
4454  else
4455  return false;
4456  }
4457  }
4458 }
4459 
4460 
4461 static bool
4462 lReplacePseudoMaskedStore(llvm::CallInst *callInst) {
4463  struct LMSInfo {
4464  LMSInfo(const char *pname, const char *bname, const char *msname) {
4465  pseudoFunc = m->module->getFunction(pname);
4466  blendFunc = m->module->getFunction(bname);
4467  maskedStoreFunc = m->module->getFunction(msname);
4468  Assert(pseudoFunc != NULL && blendFunc != NULL &&
4469  maskedStoreFunc != NULL);
4470  }
4471  llvm::Function *pseudoFunc;
4472  llvm::Function *blendFunc;
4473  llvm::Function *maskedStoreFunc;
4474  };
4475 
4476  LMSInfo msInfo[] = {
4477  LMSInfo("__pseudo_masked_store_i8", "__masked_store_blend_i8",
4478  "__masked_store_i8"),
4479  LMSInfo("__pseudo_masked_store_i16", "__masked_store_blend_i16",
4480  "__masked_store_i16"),
4481  LMSInfo("__pseudo_masked_store_i32", "__masked_store_blend_i32",
4482  "__masked_store_i32"),
4483  LMSInfo("__pseudo_masked_store_float", "__masked_store_blend_float",
4484  "__masked_store_float"),
4485  LMSInfo("__pseudo_masked_store_i64", "__masked_store_blend_i64",
4486  "__masked_store_i64"),
4487  LMSInfo("__pseudo_masked_store_double", "__masked_store_blend_double",
4488  "__masked_store_double")
4489  };
4490 
4491  LMSInfo *info = NULL;
4492  for (unsigned int i = 0; i < sizeof(msInfo) / sizeof(msInfo[0]); ++i) {
4493  if (msInfo[i].pseudoFunc != NULL &&
4494  callInst->getCalledFunction() == msInfo[i].pseudoFunc) {
4495  info = &msInfo[i];
4496  break;
4497  }
4498  }
4499  if (info == NULL)
4500  return false;
4501 
4502  llvm::Value *lvalue = callInst->getArgOperand(0);
4503  llvm::Value *rvalue = callInst->getArgOperand(1);
4504  llvm::Value *mask = callInst->getArgOperand(2);
4505 
4506  // We need to choose between doing the load + blend + store trick,
4507  // or serializing the masked store. Even on targets with a native
4508  // masked store instruction, this is preferable since it lets us
4509  // keep values in registers rather than going out to the stack.
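 // Schematically (for illustration only), the blend path rewrites
 //     __pseudo_masked_store(ptr, value, mask)
 // into the equivalent of
 //     old = *ptr;
 //     *ptr = (mask & value) | (~mask & old);
 // i.e., it reads and rewrites every lane of *ptr, which is why it is only
 // used when lIsSafeToBlend() shows the destination is private stack storage
 // where the extra full-width access is harmless.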
4510  bool doBlend = (!g->opt.disableBlendedMaskedStores &&
4511  lIsSafeToBlend(lvalue));
4512 
4513  // Generate the call to the appropriate masked store function and
4514  // replace the __pseudo_* one with it.
4515  llvm::Function *fms = doBlend ? info->blendFunc : info->maskedStoreFunc;
4516  llvm::Instruction *inst = lCallInst(fms, lvalue, rvalue, mask, "", callInst);
4517  lCopyMetadata(inst, callInst);
4518 
4519  callInst->eraseFromParent();
4520  return true;
4521 }
4522 
4523 
4524 static bool
4525 lReplacePseudoGS(llvm::CallInst *callInst) {
4526  struct LowerGSInfo {
4527  LowerGSInfo(const char *pName, const char *aName, bool ig, bool ip)
4528  : isGather(ig), isPrefetch(ip) {
4529  pseudoFunc = m->module->getFunction(pName);
4530  actualFunc = m->module->getFunction(aName);
4531  }
4532  llvm::Function *pseudoFunc;
4533  llvm::Function *actualFunc;
4534  const bool isGather;
4535  const bool isPrefetch;
4536  };
4537 
4538  LowerGSInfo lgsInfo[] = {
4539  LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true, false),
4540  LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true, false),
4541  LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true, false),
4542  LowerGSInfo("__pseudo_gather32_float", "__gather32_float", true, false),
4543  LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true, false),
4544  LowerGSInfo("__pseudo_gather32_double", "__gather32_double", true, false),
4545 
4546  LowerGSInfo("__pseudo_gather64_i8", "__gather64_i8", true, false),
4547  LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true, false),
4548  LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true, false),
4549  LowerGSInfo("__pseudo_gather64_float", "__gather64_float", true, false),
4550  LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true, false),
4551  LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true, false),
4552 
4553  LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8",
4554  "__gather_factored_base_offsets32_i8", true, false),
4555  LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16",
4556  "__gather_factored_base_offsets32_i16", true, false),
4557  LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32",
4558  "__gather_factored_base_offsets32_i32", true, false),
4559  LowerGSInfo("__pseudo_gather_factored_base_offsets32_float",
4560  "__gather_factored_base_offsets32_float", true, false),
4561  LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64",
4562  "__gather_factored_base_offsets32_i64", true, false),
4563  LowerGSInfo("__pseudo_gather_factored_base_offsets32_double",
4564  "__gather_factored_base_offsets32_double", true, false),
4565 
4566  LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8",
4567  "__gather_factored_base_offsets64_i8", true, false),
4568  LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16",
4569  "__gather_factored_base_offsets64_i16", true, false),
4570  LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32",
4571  "__gather_factored_base_offsets64_i32", true, false),
4572  LowerGSInfo("__pseudo_gather_factored_base_offsets64_float",
4573  "__gather_factored_base_offsets64_float", true, false),
4574  LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64",
4575  "__gather_factored_base_offsets64_i64", true, false),
4576  LowerGSInfo("__pseudo_gather_factored_base_offsets64_double",
4577  "__gather_factored_base_offsets64_double", true, false),
4578 
4579  LowerGSInfo("__pseudo_gather_base_offsets32_i8",
4580  "__gather_base_offsets32_i8", true, false),
4581  LowerGSInfo("__pseudo_gather_base_offsets32_i16",
4582  "__gather_base_offsets32_i16", true, false),
4583  LowerGSInfo("__pseudo_gather_base_offsets32_i32",
4584  "__gather_base_offsets32_i32", true, false),
4585  LowerGSInfo("__pseudo_gather_base_offsets32_float",
4586  "__gather_base_offsets32_float", true, false),
4587  LowerGSInfo("__pseudo_gather_base_offsets32_i64",
4588  "__gather_base_offsets32_i64", true, false),
4589  LowerGSInfo("__pseudo_gather_base_offsets32_double",
4590  "__gather_base_offsets32_double", true, false),
4591 
4592  LowerGSInfo("__pseudo_gather_base_offsets64_i8",
4593  "__gather_base_offsets64_i8", true, false),
4594  LowerGSInfo("__pseudo_gather_base_offsets64_i16",
4595  "__gather_base_offsets64_i16", true, false),
4596  LowerGSInfo("__pseudo_gather_base_offsets64_i32",
4597  "__gather_base_offsets64_i32", true, false),
4598  LowerGSInfo("__pseudo_gather_base_offsets64_float",
4599  "__gather_base_offsets64_float", true, false),
4600  LowerGSInfo("__pseudo_gather_base_offsets64_i64",
4601  "__gather_base_offsets64_i64", true, false),
4602  LowerGSInfo("__pseudo_gather_base_offsets64_double",
4603  "__gather_base_offsets64_double", true, false),
4604 
4605  LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false, false),
4606  LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false, false),
4607  LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false, false),
4608  LowerGSInfo("__pseudo_scatter32_float", "__scatter32_float", false, false),
4609  LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false, false),
4610  LowerGSInfo("__pseudo_scatter32_double", "__scatter32_double", false, false),
4611 
4612  LowerGSInfo("__pseudo_scatter64_i8", "__scatter64_i8", false, false),
4613  LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false, false),
4614  LowerGSInfo("__pseudo_scatter64_i32", "__scatter64_i32", false, false),
4615  LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false, false),
4616  LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false, false),
4617  LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false, false),
4618 
4619  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8",
4620  "__scatter_factored_base_offsets32_i8", false, false),
4621  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16",
4622  "__scatter_factored_base_offsets32_i16", false, false),
4623  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32",
4624  "__scatter_factored_base_offsets32_i32", false, false),
4625  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float",
4626  "__scatter_factored_base_offsets32_float", false, false),
4627  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64",
4628  "__scatter_factored_base_offsets32_i64", false, false),
4629  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double",
4630  "__scatter_factored_base_offsets32_double", false, false),
4631 
4632  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8",
4633  "__scatter_factored_base_offsets64_i8", false, false),
4634  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16",
4635  "__scatter_factored_base_offsets64_i16", false, false),
4636  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32",
4637  "__scatter_factored_base_offsets64_i32", false, false),
4638  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float",
4639  "__scatter_factored_base_offsets64_float", false, false),
4640  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64",
4641  "__scatter_factored_base_offsets64_i64", false, false),
4642  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double",
4643  "__scatter_factored_base_offsets64_double", false, false),
4644 
4645 
4646  LowerGSInfo("__pseudo_scatter_base_offsets32_i8",
4647  "__scatter_base_offsets32_i8", false, false),
4648  LowerGSInfo("__pseudo_scatter_base_offsets32_i16",
4649  "__scatter_base_offsets32_i16", false, false),
4650  LowerGSInfo("__pseudo_scatter_base_offsets32_i32",
4651  "__scatter_base_offsets32_i32", false, false),
4652  LowerGSInfo("__pseudo_scatter_base_offsets32_float",
4653  "__scatter_base_offsets32_float", false, false),
4654  LowerGSInfo("__pseudo_scatter_base_offsets32_i64",
4655  "__scatter_base_offsets32_i64", false, false),
4656  LowerGSInfo("__pseudo_scatter_base_offsets32_double",
4657  "__scatter_base_offsets32_double", false, false),
4658 
4659  LowerGSInfo("__pseudo_scatter_base_offsets64_i8",
4660  "__scatter_base_offsets64_i8", false, false),
4661  LowerGSInfo("__pseudo_scatter_base_offsets64_i16",
4662  "__scatter_base_offsets64_i16", false, false),
4663  LowerGSInfo("__pseudo_scatter_base_offsets64_i32",
4664  "__scatter_base_offsets64_i32", false, false),
4665  LowerGSInfo("__pseudo_scatter_base_offsets64_float",
4666  "__scatter_base_offsets64_float", false, false),
4667  LowerGSInfo("__pseudo_scatter_base_offsets64_i64",
4668  "__scatter_base_offsets64_i64", false, false),
4669  LowerGSInfo("__pseudo_scatter_base_offsets64_double",
4670  "__scatter_base_offsets64_double", false, false),
4671 
4672  LowerGSInfo("__pseudo_prefetch_read_varying_1",
4673  "__prefetch_read_varying_1", false, true),
4674  LowerGSInfo("__pseudo_prefetch_read_varying_1_native",
4675  "__prefetch_read_varying_1_native", false, true),
4676 
4677  LowerGSInfo("__pseudo_prefetch_read_varying_2",
4678  "__prefetch_read_varying_2", false, true),
4679  LowerGSInfo("__pseudo_prefetch_read_varying_2_native",
4680  "__prefetch_read_varying_2_native", false, true),
4681 
4682  LowerGSInfo("__pseudo_prefetch_read_varying_3",
4683  "__prefetch_read_varying_3", false, true),
4684  LowerGSInfo("__pseudo_prefetch_read_varying_3_native",
4685  "__prefetch_read_varying_3_native", false, true),
4686 
4687  LowerGSInfo("__pseudo_prefetch_read_varying_nt",
4688  "__prefetch_read_varying_nt", false, true),
4689  LowerGSInfo("__pseudo_prefetch_read_varying_nt_native",
4690  "__prefetch_read_varying_nt_native", false, true),
4691  };
4692 
4693  llvm::Function *calledFunc = callInst->getCalledFunction();
4694 
4695  LowerGSInfo *info = NULL;
4696  for (unsigned int i = 0; i < sizeof(lgsInfo) / sizeof(lgsInfo[0]); ++i) {
4697  if (lgsInfo[i].pseudoFunc != NULL &&
4698  calledFunc == lgsInfo[i].pseudoFunc) {
4699  info = &lgsInfo[i];
4700  break;
4701  }
4702  }
4703  if (info == NULL)
4704  return false;
4705 
4706  Assert(info->actualFunc != NULL);
4707 
4708  // Get the source position from the metadata attached to the call
4709  // instruction so that we can issue PerformanceWarning()s below.
4710  SourcePos pos;
4711  bool gotPosition = lGetSourcePosFromMetadata(callInst, &pos);
4712 
4713  callInst->setCalledFunction(info->actualFunc);
4714  if (gotPosition && g->target->getVectorWidth() > 1) {
4715  if (info->isGather)
4716  PerformanceWarning(pos, "Gather required to load value.");
4717  else if (!info->isPrefetch)
4718  PerformanceWarning(pos, "Scatter required to store value.");
4719  }
4720  return true;
4721 }
4722 
4723 
4724 bool
4725 ReplacePseudoMemoryOpsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
4726  DEBUG_START_PASS("ReplacePseudoMemoryOpsPass");
4727 
4728  bool modifiedAny = false;
4729 
4730  restart:
4731  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
4732  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
4733  if (callInst == NULL ||
4734  callInst->getCalledFunction() == NULL)
4735  continue;
4736 
4737  if (lReplacePseudoGS(callInst)) {
4738  modifiedAny = true;
4739  goto restart;
4740  }
4741  else if (lReplacePseudoMaskedStore(callInst)) {
4742  modifiedAny = true;
4743  goto restart;
4744  }
4745  }
4746 
4747  DEBUG_END_PASS("ReplacePseudoMemoryOpsPass");
4748 
4749  return modifiedAny;
4750 }
4751 
4752 
4753 static llvm::Pass *
4754 CreateReplacePseudoMemoryOpsPass() {
4755  return new ReplacePseudoMemoryOpsPass;
4756 }
4757 
4758 
4759 ///////////////////////////////////////////////////////////////////////////
4760 // IsCompileTimeConstantPass
4761 
4762 /** LLVM IR implementations of target-specific functions may include calls
4763  to the functions "bool __is_compile_time_constant_*(...)"; these allow
4764  them to have specialized code paths for when the corresponding value is
4765  known at compile time. For masks, for example, this lets them avoid the
4766  cost of a MOVMSK call at runtime to compute the mask's value when the
4767  mask is in fact known at compile time.
4768 
4769  This pass resolves these calls into either 'true' or 'false' values so
4770  that later optimization passes can operate with these as constants.
4771 
4772  See stdlib.m4 for a number of uses of this idiom.
4773  */
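// A typical use of the idiom in the builtins looks roughly like this (a
// sketch, not quoted verbatim from stdlib.m4):
//     if (__is_compile_time_constant_mask(mask)) {
//         if (__all(mask)) { /* all-on fast path, no runtime mask check */ }
//         else             { /* ... */ }
//     } else {
//         /* general path that computes the mask value at runtime */
//     }
// Once this pass folds the outer call to true or false, later passes can
// remove whichever branch is dead.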
4774 
4775 class IsCompileTimeConstantPass : public llvm::BasicBlockPass {
4776 public:
4777  static char ID;
4778  IsCompileTimeConstantPass(bool last = false) : BasicBlockPass(ID) {
4779  isLastTry = last;
4780  }
4781 
4782 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
4783  const char *getPassName() const { return "Resolve \"is compile time constant\""; }
4784 #else // LLVM 4.0+
4785  llvm::StringRef getPassName() const { return "Resolve \"is compile time constant\""; }
4786 #endif
4787  bool runOnBasicBlock(llvm::BasicBlock &BB);
4788  bool runOnBasicBlock(llvm::BasicBlock &BB);
4789  bool isLastTry;
4790 };
4791 
4792 char IsCompileTimeConstantPass::ID = 0;
4793 
4794 
4795 bool
4796 IsCompileTimeConstantPass::runOnBasicBlock(llvm::BasicBlock &bb) {
4797  DEBUG_START_PASS("IsCompileTimeConstantPass");
4798 
4799  llvm::Function *funcs[] = {
4800  m->module->getFunction("__is_compile_time_constant_mask"),
4801  m->module->getFunction("__is_compile_time_constant_uniform_int32"),
4802  m->module->getFunction("__is_compile_time_constant_varying_int32")
4803  };
4804 
4805  bool modifiedAny = false;
4806  restart:
4807  for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
4808  // Iterate through the instructions looking for calls to the
4809  // __is_compile_time_constant_*() functions
4810  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
4811  if (callInst == NULL)
4812  continue;
4813 
4814  int j;
4815  int nFuncs = sizeof(funcs) / sizeof(funcs[0]);
4816  for (j = 0; j < nFuncs; ++j) {
4817  if (funcs[j] != NULL && callInst->getCalledFunction() == funcs[j])
4818  break;
4819  }
4820  if (j == nFuncs)
4821  // not a __is_compile_time_constant_* function
4822  continue;
4823 
4824  // This optimization pass can be disabled with both the (poorly
4825  // named) disableGatherScatterFlattening option and
4826  // disableMaskAllOnOptimizations.
4827  if (g->opt.disableGatherScatterFlattening ||
4828  g->opt.disableMaskAllOnOptimizations) {
4829  llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMFalse);
4830  modifiedAny = true;
4831  goto restart;
4832  }
4833 
4834  // Is it a constant? Bingo, turn the call's value into a constant
4835  // true value.
4836  llvm::Value *operand = callInst->getArgOperand(0);
4837  if (llvm::isa<llvm::Constant>(operand)) {
4838  llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMTrue);
4839  modifiedAny = true;
4840  goto restart;
4841  }
4842 
4843  // This pass runs multiple times during optimization. Up until the
4844  // very last time, it only replaces the call with a 'true' if the
4845  // value is known to be constant and otherwise leaves the call
4846  // alone, in case further optimization passes can help resolve its
4847  // value. The last time through, it eventually has to give up, and
4848  // replaces any remaining ones with 'false' constants.
4849  if (isLastTry) {
4850  llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMFalse);
4851  modifiedAny = true;
4852  goto restart;
4853  }
4854  }
4855 
4856  DEBUG_END_PASS("IsCompileTimeConstantPass");
4857 
4858  return modifiedAny;
4859 }
4860 
4861 
4862 static llvm::Pass *
4863 CreateIsCompileTimeConstantPass(bool isLastTry) {
4864  return new IsCompileTimeConstantPass(isLastTry);
4865 }
4866 
4867 //////////////////////////////////////////////////////////////////////////
4868 // DebugPass
4869 
4870 /** This pass is added to the list of passes after each optimization that
4871  we want to debug; it dumps the LLVM IR to stderr and also prints the
4872  name and number of the preceding optimization.
4873  */
4874 class DebugPass : public llvm::ModulePass {
4875 public:
4876  static char ID;
4877  DebugPass(char * output) : ModulePass(ID) {
4878  sprintf(str_output, "%s", output);
4879  }
4880 
4881 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
4882  const char *getPassName() const { return "Dump LLVM IR"; }
4883 #else // LLVM 4.0+
4884  llvm::StringRef getPassName() const { return "Dump LLVM IR"; }
4885 #endif
4886  bool runOnModule(llvm::Module &m);
4887 
4888 private:
4889  char str_output[100];
4890 };
4891 
4892 char DebugPass::ID = 0;
4893 
4894 bool
4895 DebugPass::runOnModule(llvm::Module &module) {
4896  fprintf(stderr, "%s", str_output);
4897  fflush(stderr);
4898  module.dump();
4899  return true;
4900 }
4901 
4902 static llvm::Pass *
4903 CreateDebugPass(char * output) {
4904  return new DebugPass(output);
4905 }
4906 
4907 ///////////////////////////////////////////////////////////////////////////
4908 // MakeInternalFuncsStaticPass
4909 
4910 /** There are a number of target-specific functions that we use during
4911  these optimization passes. By the time we are done with optimization,
4912  any uses of these should be inlined and no calls to these functions
4913  should remain. This pass marks all of these functions as having
4914  private linkage so that subsequent passes can eliminate them as dead
4915  code, thus cleaning up the final code output by the compiler. We can't
4916  just declare these as static from the start, however, since then they
4917  end up being eliminated as dead code during early optimization passes
4918  even though we may need to generate calls to them during later
4919  optimization passes.
4920  */
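// Sketch of the effect on one of the functions listed below (illustrative):
//     define <W x float> @__masked_load_float(...)            ; before
//     define internal <W x float> @__masked_load_float(...)   ; after
// With internal linkage, a definition whose calls have all been inlined
// becomes dead and is dropped by the subsequent dead-code elimination.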
4921 class MakeInternalFuncsStaticPass : public llvm::ModulePass {
4922 public:
4923  static char ID;
4924  MakeInternalFuncsStaticPass(bool last = false) : ModulePass(ID) {
4925  }
4926 
4927  void getAnalysisUsage(llvm::AnalysisUsage &AU) const {
4928  AU.setPreservesCFG();
4929  }
4930 
4931 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
4932  const char *getPassName() const { return "Make internal funcs \"static\""; }
4933 #else // LLVM 4.0+
4934  llvm::StringRef getPassName() const { return "Make internal funcs \"static\""; }
4935 #endif
4936  bool runOnModule(llvm::Module &m);
4937 };
4938 
4939 char MakeInternalFuncsStaticPass::ID = 0;
4940 
4941 bool
4942 MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
4943  const char *names[] = {
4944  "__avg_up_uint8",
4945  "__avg_up_int8",
4946  "__avg_up_uint16",
4947  "__avg_up_int16",
4948  "__avg_down_uint8",
4949  "__avg_down_int8",
4950  "__avg_down_uint16",
4951  "__avg_down_int16",
4952  "__fast_masked_vload",
4953  "__gather_factored_base_offsets32_i8", "__gather_factored_base_offsets32_i16",
4954  "__gather_factored_base_offsets32_i32", "__gather_factored_base_offsets32_i64",
4955  "__gather_factored_base_offsets32_float", "__gather_factored_base_offsets32_double",
4956  "__gather_factored_base_offsets64_i8", "__gather_factored_base_offsets64_i16",
4957  "__gather_factored_base_offsets64_i32", "__gather_factored_base_offsets64_i64",
4958  "__gather_factored_base_offsets64_float", "__gather_factored_base_offsets64_double",
4959  "__gather_base_offsets32_i8", "__gather_base_offsets32_i16",
4960  "__gather_base_offsets32_i32", "__gather_base_offsets32_i64",
4961  "__gather_base_offsets32_float", "__gather_base_offsets32_double",
4962  "__gather_base_offsets64_i8", "__gather_base_offsets64_i16",
4963  "__gather_base_offsets64_i32", "__gather_base_offsets64_i64",
4964  "__gather_base_offsets64_float", "__gather_base_offsets64_double",
4965  "__gather32_i8", "__gather32_i16",
4966  "__gather32_i32", "__gather32_i64",
4967  "__gather32_float", "__gather32_double",
4968  "__gather64_i8", "__gather64_i16",
4969  "__gather64_i32", "__gather64_i64",
4970  "__gather64_float", "__gather64_double",
4971  "__gather_elt32_i8", "__gather_elt32_i16",
4972  "__gather_elt32_i32", "__gather_elt32_i64",
4973  "__gather_elt32_float", "__gather_elt32_double",
4974  "__gather_elt64_i8", "__gather_elt64_i16",
4975  "__gather_elt64_i32", "__gather_elt64_i64",
4976  "__gather_elt64_float", "__gather_elt64_double",
4977  "__masked_load_i8", "__masked_load_i16",
4978  "__masked_load_i32", "__masked_load_i64",
4979  "__masked_load_float", "__masked_load_double",
4980  "__masked_store_i8", "__masked_store_i16",
4981  "__masked_store_i32", "__masked_store_i64",
4982  "__masked_store_float", "__masked_store_double",
4983  "__masked_store_blend_i8", "__masked_store_blend_i16",
4984  "__masked_store_blend_i32", "__masked_store_blend_i64",
4985  "__masked_store_blend_float", "__masked_store_blend_double",
4986  "__scatter_factored_base_offsets32_i8", "__scatter_factored_base_offsets32_i16",
4987  "__scatter_factored_base_offsets32_i32", "__scatter_factored_base_offsets32_i64",
4988  "__scatter_factored_base_offsets32_float", "__scatter_factored_base_offsets32_double",
4989  "__scatter_factored_base_offsets64_i8", "__scatter_factored_base_offsets64_i16",
4990  "__scatter_factored_base_offsets64_i32", "__scatter_factored_base_offsets64_i64",
4991  "__scatter_factored_base_offsets64_float", "__scatter_factored_base_offsets64_double",
4992  "__scatter_base_offsets32_i8", "__scatter_base_offsets32_i16",
4993  "__scatter_base_offsets32_i32", "__scatter_base_offsets32_i64",
4994  "__scatter_base_offsets32_float", "__scatter_base_offsets32_double",
4995  "__scatter_base_offsets64_i8", "__scatter_base_offsets64_i16",
4996  "__scatter_base_offsets64_i32", "__scatter_base_offsets64_i64",
4997  "__scatter_base_offsets64_float", "__scatter_base_offsets64_double",
4998  "__scatter_elt32_i8", "__scatter_elt32_i16",
4999  "__scatter_elt32_i32", "__scatter_elt32_i64",
5000  "__scatter_elt32_float", "__scatter_elt32_double",
5001  "__scatter_elt64_i8", "__scatter_elt64_i16",
5002  "__scatter_elt64_i32", "__scatter_elt64_i64",
5003  "__scatter_elt64_float", "__scatter_elt64_double",
5004  "__scatter32_i8", "__scatter32_i16",
5005  "__scatter32_i32", "__scatter32_i64",
5006  "__scatter32_float", "__scatter32_double",
5007  "__scatter64_i8", "__scatter64_i16",
5008  "__scatter64_i32", "__scatter64_i64",
5009  "__scatter64_float", "__scatter64_double",
5010  "__prefetch_read_varying_1", "__prefetch_read_varying_2",
5011  "__prefetch_read_varying_3", "__prefetch_read_varying_nt",
5012  "__keep_funcs_live",
5013  };
5014 
5015  bool modifiedAny = false;
5016  int count = sizeof(names) / sizeof(names[0]);
5017  for (int i = 0; i < count; ++i) {
5018  llvm::Function *f = m->module->getFunction(names[i]);
5019  if (f != NULL && f->empty() == false) {
5020  f->setLinkage(llvm::GlobalValue::InternalLinkage);
5021  modifiedAny = true;
5022  }
5023  }
5024 
5025  return modifiedAny;
5026 }
5027 
5028 
5029 static llvm::Pass *
5030 CreateMakeInternalFuncsStaticPass() {
5031  return new MakeInternalFuncsStaticPass;
5032 }
5033 
5034 
5035 ///////////////////////////////////////////////////////////////////////////
5036 // PeepholePass
5037 
5038 class PeepholePass : public llvm::BasicBlockPass {
5039 public:
5040  PeepholePass();
5041 
5042 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
5043  const char *getPassName() const { return "Peephole Optimizations"; }
5044 #else // LLVM 4.0+
5045  llvm::StringRef getPassName() const { return "Peephole Optimizations"; }
5046 #endif
5047  bool runOnBasicBlock(llvm::BasicBlock &BB);
5048 
5049  static char ID;
5050 };
5051 
5052 char PeepholePass::ID = 0;
5053 
5054 PeepholePass::PeepholePass()
5055  : BasicBlockPass(ID) {
5056 }
5057 
5058 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_3
5059 
5060 using namespace llvm::PatternMatch;
5061 
5062 template<typename Op_t, unsigned Opcode>
5063 struct CastClassTypes_match {
5064  Op_t Op;
5065  const llvm::Type *fromType, *toType;
5066 
5067  CastClassTypes_match(const Op_t &OpMatch, const llvm::Type *f,
5068  const llvm::Type *t)
5069  : Op(OpMatch), fromType(f), toType(t) {}
5070 
5071  template<typename OpTy>
5072  bool match(OpTy *V) {
5073  if (llvm::Operator *O = llvm::dyn_cast<llvm::Operator>(V))
5074  return (O->getOpcode() == Opcode && Op.match(O->getOperand(0)) &&
5075  O->getType() == toType &&
5076  O->getOperand(0)->getType() == fromType);
5077  return false;
5078  }
5079 };
5080 
5081 template<typename OpTy>
5082 inline CastClassTypes_match<OpTy, llvm::Instruction::SExt>
5083 m_SExt8To16(const OpTy &Op) {
5084  return CastClassTypes_match<OpTy, llvm::Instruction::SExt>(
5085  Op,
5086  LLVMTypes::Int8VectorType,
5087  LLVMTypes::Int16VectorType);
5088 }
5089 
5090 template<typename OpTy>
5091 inline CastClassTypes_match<OpTy, llvm::Instruction::ZExt>
5092 m_ZExt8To16(const OpTy &Op) {
5093  return CastClassTypes_match<OpTy, llvm::Instruction::ZExt>(
5094  Op,
5095  LLVMTypes::Int8VectorType,
5096  LLVMTypes::Int16VectorType);
5097 }
5098 
5099 
5100 template<typename OpTy>
5101 inline CastClassTypes_match<OpTy, llvm::Instruction::Trunc>
5102 m_Trunc16To8(const OpTy &Op) {
5103  return CastClassTypes_match<OpTy, llvm::Instruction::Trunc>(
5104  Op,
5105  LLVMTypes::Int16VectorType,
5106  LLVMTypes::Int8VectorType);
5107 }
5108 
5109 template<typename OpTy>
5110 inline CastClassTypes_match<OpTy, llvm::Instruction::SExt>
5111 m_SExt16To32(const OpTy &Op) {
5112  return CastClassTypes_match<OpTy, llvm::Instruction::SExt>(
5113  Op,
5114  LLVMTypes::Int16VectorType,
5115  LLVMTypes::Int32VectorType);
5116 }
5117 
5118 template<typename OpTy>
5119 inline CastClassTypes_match<OpTy, llvm::Instruction::ZExt>
5120 m_ZExt16To32(const OpTy &Op) {
5121  return CastClassTypes_match<OpTy, llvm::Instruction::ZExt>(
5122  Op,
5123  LLVMTypes::Int16VectorType,
5124  LLVMTypes::Int32VectorType);
5125 }
5126 
5127 
5128 template<typename OpTy>
5129 inline CastClassTypes_match<OpTy, llvm::Instruction::Trunc>
5130 m_Trunc32To16(const OpTy &Op) {
5131  return CastClassTypes_match<OpTy, llvm::Instruction::Trunc>(
5132  Op,
5133  LLVMTypes::Int32VectorType,
5134  LLVMTypes::Int16VectorType);
5135 }
5136 
5137 template<typename Op_t>
5138 struct UDiv2_match {
5139  Op_t Op;
5140 
5141  UDiv2_match(const Op_t &OpMatch)
5142  : Op(OpMatch) {}
5143 
5144  template<typename OpTy>
5145  bool match(OpTy *V) {
5146  llvm::BinaryOperator *bop;
5147  llvm::ConstantDataVector *cdv;
5148  if ((bop = llvm::dyn_cast<llvm::BinaryOperator>(V)) &&
5149  (cdv = llvm::dyn_cast<llvm::ConstantDataVector>(bop->getOperand(1))) &&
5150  cdv->getSplatValue() != NULL) {
5151  const llvm::APInt &apInt = cdv->getUniqueInteger();
5152 
5153  switch (bop->getOpcode()) {
5154  case llvm::Instruction::UDiv:
5155  // divide by 2
5156  return (apInt.isIntN(2) && Op.match(bop->getOperand(0)));
5157  case llvm::Instruction::LShr:
5158  // shift right by 1
5159  return (apInt.isIntN(1) && Op.match(bop->getOperand(0)));
5160  default:
5161  return false;
5162  }
5163  }
5164  return false;
5165  }
5166 };
5167 
5168 template<typename V>
5169 inline UDiv2_match<V>
5170 m_UDiv2(const V &v) {
5171  return UDiv2_match<V>(v);
5172 }
5173 
5174 template<typename Op_t>
5175 struct SDiv2_match {
5176  Op_t Op;
5177 
5178  SDiv2_match(const Op_t &OpMatch)
5179  : Op(OpMatch) {}
5180 
5181  template<typename OpTy>
5182  bool match(OpTy *V) {
5183  llvm::BinaryOperator *bop;
5184  llvm::ConstantDataVector *cdv;
5185  if ((bop = llvm::dyn_cast<llvm::BinaryOperator>(V)) &&
5186  (cdv = llvm::dyn_cast<llvm::ConstantDataVector>(bop->getOperand(1))) &&
5187  cdv->getSplatValue() != NULL) {
5188  const llvm::APInt &apInt = cdv->getUniqueInteger();
5189 
5190  switch (bop->getOpcode()) {
5191  case llvm::Instruction::SDiv:
5192  // divide by 2
5193  return (apInt.isIntN(2) && Op.match(bop->getOperand(0)));
5194  case llvm::Instruction::AShr:
5195  // shift right by 1
5196  return (apInt.isIntN(1) && Op.match(bop->getOperand(0)));
5197  default:
5198  return false;
5199  }
5200  }
5201  return false;
5202  }
5203 };
5204 
5205 template<typename V>
5206 inline SDiv2_match<V>
5207 m_SDiv2(const V &v) {
5208  return SDiv2_match<V>(v);
5209 }
5210 
5211 // Returns true if the given function has a call to an intrinsic function
5212 // in its definition.
5213 static bool
5214 lHasIntrinsicInDefinition(llvm::Function *func) {
5215  llvm::Function::iterator bbiter = func->begin();
5216  for (; bbiter != func->end(); ++bbiter) {
5217  for (llvm::BasicBlock::iterator institer = bbiter->begin();
5218  institer != bbiter->end(); ++institer) {
5219  if (llvm::isa<llvm::IntrinsicInst>(institer))
5220  return true;
5221  }
5222  }
5223  return false;
5224 }
5225 
5226 static llvm::Instruction *
5227 lGetBinaryIntrinsic(const char *name, llvm::Value *opa, llvm::Value *opb) {
5228  llvm::Function *func = m->module->getFunction(name);
5229  Assert(func != NULL);
5230 
5231  // Make sure that the definition of the llvm::Function has a call to an
5232  // intrinsic function in its instructions; otherwise we will generate
5233  // infinite loops where we "helpfully" turn the default implementations
5234  // of target builtins like __avg_up_uint8 that are implemented with plain
5235  // arithmetic ops into recursive calls to themselves.
5236  if (lHasIntrinsicInDefinition(func))
5237  return lCallInst(func, opa, opb, name);
5238  else
5239  return NULL;
5240 }
5241 
5242 //////////////////////////////////////////////////
5243 
5244 static llvm::Instruction *
5245 lMatchAvgUpUInt8(llvm::Value *inst) {
5246  // (unsigned int8)(((unsigned int16)a + (unsigned int16)b + 1)/2)
5247  llvm::Value *opa, *opb;
5248  const llvm::APInt *delta;
5249  if (match(inst, m_Trunc16To8(m_UDiv2(m_CombineOr(
5250  m_CombineOr(
5251  m_Add(m_ZExt8To16(m_Value(opa)),
5252  m_Add(m_ZExt8To16(m_Value(opb)), m_APInt(delta))),
5253  m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_APInt(delta)),
5254  m_ZExt8To16(m_Value(opb)))),
5255  m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_ZExt8To16(m_Value(opb))),
5256  m_APInt(delta))))))) {
5257  if (delta->isIntN(1) == false)
5258  return NULL;
5259 
5260  return lGetBinaryIntrinsic("__avg_up_uint8", opa, opb);
5261  }
5262  return NULL;
5263 }
5264 
5265 
5266 static llvm::Instruction *
5267 lMatchAvgDownUInt8(llvm::Value *inst) {
5268  // (unsigned int8)(((unsigned int16)a + (unsigned int16)b)/2)
5269  llvm::Value *opa, *opb;
5270  if (match(inst, m_Trunc16To8(m_UDiv2(
5271  m_Add(m_ZExt8To16(m_Value(opa)),
5272  m_ZExt8To16(m_Value(opb))))))) {
5273  return lGetBinaryIntrinsic("__avg_down_uint8", opa, opb);
5274  }
5275  return NULL;
5276 }
5277 
5278 static llvm::Instruction *
5279 lMatchAvgUpUInt16(llvm::Value *inst) {
5280  // (unsigned int16)(((unsigned int32)a + (unsigned int32)b + 1)/2)
5281  llvm::Value *opa, *opb;
5282  const llvm::APInt *delta;
5283  if (match(inst, m_Trunc32To16(m_UDiv2(m_CombineOr(
5284  m_CombineOr(
5285  m_Add(m_ZExt16To32(m_Value(opa)),
5286  m_Add(m_ZExt16To32(m_Value(opb)), m_APInt(delta))),
5287  m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_APInt(delta)),
5288  m_ZExt16To32(m_Value(opb)))),
5289  m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_ZExt16To32(m_Value(opb))),
5290  m_APInt(delta))))))) {
5291  if (delta->isIntN(1) == false)
5292  return NULL;
5293 
5294  return lGetBinaryIntrinsic("__avg_up_uint16", opa, opb);
5295  }
5296  return NULL;
5297 }
5298 
5299 
5300 static llvm::Instruction *
5301 lMatchAvgDownUInt16(llvm::Value *inst) {
5302  // (unsigned int16)(((unsigned int32)a + (unsigned int32)b)/2)
5303  llvm::Value *opa, *opb;
5304  if (match(inst, m_Trunc32To16(m_UDiv2(
5305  m_Add(m_ZExt16To32(m_Value(opa)),
5306  m_ZExt16To32(m_Value(opb))))))) {
5307  return lGetBinaryIntrinsic("__avg_down_uint16", opa, opb);
5308  }
5309  return NULL;
5310 }
5311 
5312 
5313 static llvm::Instruction *
5314 lMatchAvgUpInt8(llvm::Value *inst) {
5315  // (int8)(((int16)a + (int16)b + 1)/2)
5316  llvm::Value *opa, *opb;
5317  const llvm::APInt *delta;
5318  if (match(inst, m_Trunc16To8(m_SDiv2(m_CombineOr(
5319  m_CombineOr(
5320  m_Add(m_SExt8To16(m_Value(opa)),
5321  m_Add(m_SExt8To16(m_Value(opb)), m_APInt(delta))),
5322  m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_APInt(delta)),
5323  m_SExt8To16(m_Value(opb)))),
5324  m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_SExt8To16(m_Value(opb))),
5325  m_APInt(delta))))))) {
5326  if (delta->isIntN(1) == false)
5327  return NULL;
5328 
5329  return lGetBinaryIntrinsic("__avg_up_int8", opa, opb);
5330  }
5331  return NULL;
5332 }
5333 
5334 
5335 static llvm::Instruction *
5336 lMatchAvgDownInt8(llvm::Value *inst) {
5337  // (int8)(((int16)a + (int16)b)/2)
5338  llvm::Value *opa, *opb;
5339  if (match(inst, m_Trunc16To8(m_SDiv2(
5340  m_Add(m_SExt8To16(m_Value(opa)),
5341  m_SExt8To16(m_Value(opb))))))) {
5342  return lGetBinaryIntrinsic("__avg_down_int8", opa, opb);
5343  }
5344  return NULL;
5345 }
5346 
5347 static llvm::Instruction *
5348 lMatchAvgUpInt16(llvm::Value *inst) {
5349  // (int16)(((int32)a + (int32)b + 1)/2)
5350  llvm::Value *opa, *opb;
5351  const llvm::APInt *delta;
5352  if (match(inst, m_Trunc32To16(m_SDiv2(m_CombineOr(
5353  m_CombineOr(
5354  m_Add(m_SExt16To32(m_Value(opa)),
5355  m_Add(m_SExt16To32(m_Value(opb)), m_APInt(delta))),
5356  m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_APInt(delta)),
5357  m_SExt16To32(m_Value(opb)))),
5358  m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_SExt16To32(m_Value(opb))),
5359  m_APInt(delta))))))) {
5360  if (delta->isIntN(1) == false)
5361  return NULL;
5362 
5363  return lGetBinaryIntrinsic("__avg_up_int16", opa, opb);
5364  }
5365  return NULL;
5366 }
5367 
5368 static llvm::Instruction *
5369 lMatchAvgDownInt16(llvm::Value *inst) {
5370  // (int16)(((int32)a + (int32)b)/2)
5371  llvm::Value *opa, *opb;
5372  if (match(inst, m_Trunc32To16(m_SDiv2(
5373  m_Add(m_SExt16To32(m_Value(opa)),
5374  m_SExt16To32(m_Value(opb))))))) {
5375  return lGetBinaryIntrinsic("__avg_down_int16", opa, opb);
5376  }
5377  return NULL;
5378 }
5379 #endif // !LLVM_3_2
5380 
5381 
5382 bool
5383 PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) {
5384  DEBUG_START_PASS("PeepholePass");
5385 
5386  bool modifiedAny = false;
5387  restart:
5388  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
5389  llvm::Instruction *inst = &*iter;
5390 
5391  llvm::Instruction *builtinCall = NULL;
5392 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_3
5393  if (!builtinCall)
5394  builtinCall = lMatchAvgUpUInt8(inst);
5395  if (!builtinCall)
5396  builtinCall = lMatchAvgUpUInt16(inst);
5397  if (!builtinCall)
5398  builtinCall = lMatchAvgDownUInt8(inst);
5399  if (!builtinCall)
5400  builtinCall = lMatchAvgDownUInt16(inst);
5401  if (!builtinCall)
5402  builtinCall = lMatchAvgUpInt8(inst);
5403  if (!builtinCall)
5404  builtinCall = lMatchAvgUpInt16(inst);
5405  if (!builtinCall)
5406  builtinCall = lMatchAvgDownInt8(inst);
5407  if (!builtinCall)
5408  builtinCall = lMatchAvgDownInt16(inst);
5409 #endif // !LLVM_3_2
5410  if (builtinCall != NULL) {
5411  llvm::ReplaceInstWithInst(inst, builtinCall);
5412  modifiedAny = true;
5413  goto restart;
5414  }
5415  }
5416 
5417  DEBUG_END_PASS("PeepholePass");
5418 
5419  return modifiedAny;
5420 }
5421 
5422 static llvm::Pass *
5423 CreatePeepholePass() {
5424  return new PeepholePass;
5425 }
5426 
5427 /** Given an llvm::Value known to be an integer, return its value as
5428  an int64_t.
5429 */
5430 static int64_t
5431 lGetIntValue(llvm::Value *offset) {
5432  llvm::ConstantInt *intOffset = llvm::dyn_cast<llvm::ConstantInt>(offset);
5433  Assert(intOffset && (intOffset->getBitWidth() == 32 ||
5434  intOffset->getBitWidth() == 64));
5435  return intOffset->getSExtValue();
5436 }
5437 
5438 ///////////////////////////////////////////////////////////////////////////
5439 // ReplaceStdlibShiftPass
5440 
5441 class ReplaceStdlibShiftPass : public llvm::BasicBlockPass {
5442 public:
5443  static char ID;
5444  ReplaceStdlibShiftPass() : BasicBlockPass(ID) {
5445  }
5446 
5447 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
5448  const char *getPassName() const { return "Resolve \"replace extract insert chains\""; }
5449 #else // LLVM 4.0+
5450  llvm::StringRef getPassName() const { return "Resolve \"replace extract insert chains\""; }
5451 #endif
5452  bool runOnBasicBlock(llvm::BasicBlock &BB);
5453 
5454 };
5455 
5456 char ReplaceStdlibShiftPass::ID = 0;
5457 
5458 // This pass replaces shift() with a ShuffleVector when the shift amount is a
5459 // constant. rotate(), which is similar in functionality, has a slightly
5460 // different implementation; this is because LLVM (createInstructionCombiningPass)
5461 // optimizes the rotate() implementation better when similar implementations
5462 // are used for both. This is a hack to produce similarly optimized code for
5463 // shift().
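// For example (illustrative, assuming a 4-wide target), shift(v, 1) becomes
//     shufflevector <4 x T> %v, <4 x T> zeroinitializer,
//                   <4 x i32> <i32 1, i32 2, i32 3, i32 4>
// where index 4 selects from the zero vector, so the lane shifted in is zero.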
5464 bool
5465 ReplaceStdlibShiftPass::runOnBasicBlock(llvm::BasicBlock &bb) {
5466  DEBUG_START_PASS("ReplaceStdlibShiftPass");
5467  bool modifiedAny = false;
5468 
5469  llvm::Function *shifts[6];
5470  shifts[0] = m->module->getFunction("shift___vytuni");
5471  shifts[1] = m->module->getFunction("shift___vysuni");
5472  shifts[2] = m->module->getFunction("shift___vyiuni");
5473  shifts[3] = m->module->getFunction("shift___vyIuni");
5474  shifts[4] = m->module->getFunction("shift___vyfuni");
5475  shifts[5] = m->module->getFunction("shift___vyduni");
5476 
5477  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
5478  llvm::Instruction *inst = &*iter;
5479 
5480  if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst)) {
5481  llvm::Function *func = ci->getCalledFunction();
5482  for (int i = 0; i < 6; i++) {
5483  if (shifts[i] && (shifts[i] == func)) {
5484  // we matched a call
5485  llvm::Value *shiftedVec = ci->getArgOperand(0);
5486  llvm::Value *shiftAmt = ci->getArgOperand(1);
5487  if (llvm::isa<llvm::Constant>(shiftAmt)) {
5488  int vectorWidth = g->target->getVectorWidth();
5489  int * shuffleVals = new int[vectorWidth];
5490  int shiftInt = lGetIntValue(shiftAmt);
5491  for (int i = 0; i < vectorWidth; i++) {
5492  int s = i + shiftInt;
5493  s = (s < 0) ? vectorWidth : s;
5494  s = (s >= vectorWidth) ? vectorWidth : s;
5495  shuffleVals[i] = s;
5496  }
5497  llvm::Value *shuffleIdxs = LLVMInt32Vector(shuffleVals);
5498  llvm::Value *zeroVec = llvm::ConstantAggregateZero::get(shiftedVec->getType());
5499  llvm::Value *shuffle = new llvm::ShuffleVectorInst(shiftedVec, zeroVec,
5500  shuffleIdxs, "vecShift", ci);
5501  ci->replaceAllUsesWith(shuffle);
5502  modifiedAny = true;
5503  delete [] shuffleVals;
5504  } else {
5505  PerformanceWarning(SourcePos(), "Stdlib shift() called without constant shift amount.");
5506  }
5507  }
5508  }
5509  }
5510  }
5511 
5512  DEBUG_END_PASS("ReplaceStdlibShiftPass");
5513 
5514  return modifiedAny;
5515 }
5516 
5517 
5518 static llvm::Pass *
5519 CreateReplaceStdlibShiftPass() {
5520  return new ReplaceStdlibShiftPass();
5521 }
5522 
5523 
5524 
5525 ///////////////////////////////////////////////////////////////////////////////
5526 // FixBooleanSelect
5527 //
5528 // The problem is that in LLVM 3.3 the optimizer doesn't like
5529 // the following instruction sequence:
5530 // %cmp = fcmp olt <8 x float> %a, %b
5531 // %sext_cmp = sext <8 x i1> %cmp to <8 x i32>
5532 // %new_mask = and <8 x i32> %sext_cmp, %mask
5533 // and optimizes it to the following:
5534 // %cmp = fcmp olt <8 x float> %a, %b
5535 // %cond = select <8 x i1> %cmp, <8 x i32> %mask, <8 x i32> zeroinitializer
5536 //
5537 // It wouldn't be a problem if codegen produced good code for it. But it
5538 // doesn't, especially for vectors larger than native vectors.
5539 //
5540 // This optimization reverts this pattern and should be the last one before
5541 // code gen.
5542 //
5543 // Note that this problem was introduced in LLVM 3.3. But in LLVM 3.4 it was
5544 // fixed. See commit r194542.
5545 //
5546 // After LLVM 3.3 this optimization should probably stay for experimental
5547 // purposes, and code should be compared with and without it from time to
5548 // time to make sure that LLVM does the right thing.
5549 ///////////////////////////////////////////////////////////////////////////////
5550 
5551 class FixBooleanSelectPass : public llvm::FunctionPass {
5552 public:
5553  static char ID;
5554  FixBooleanSelectPass() :FunctionPass(ID) {}
5555 
5556 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
5557  const char *getPassName() const { return "Resolve \"replace extract insert chains\""; }
5558 #else // LLVM 4.0+
5559  llvm::StringRef getPassName() const { return "Resolve \"replace extract insert chains\""; }
5560 #endif
5561  bool runOnFunction(llvm::Function &F);
5562 
5563 private:
5564  llvm::Instruction* fixSelect(llvm::SelectInst* sel, llvm::SExtInst* sext);
5565 };
5566 
5567 char FixBooleanSelectPass::ID = 0;
5568 
5569 llvm::Instruction* FixBooleanSelectPass::fixSelect(llvm::SelectInst* sel, llvm::SExtInst* sext) {
5570  // Select instruction result type and its integer equivalent
5571  llvm::VectorType *orig_type = llvm::dyn_cast<llvm::VectorType>(sel->getType());
5572  llvm::VectorType *int_type = llvm::VectorType::getInteger(orig_type);
5573 
5574  // Result value and optional pointer to instruction to delete
5575  llvm::Instruction *result = 0, *optional_to_delete = 0;
5576 
5577  // It can be vector of integers or vector of floating point values.
5578  if (orig_type->getElementType()->isIntegerTy()) {
5579  // Generate sext+and, remove select.
5580  result = llvm::BinaryOperator::CreateAnd(sext, sel->getTrueValue(), "and_mask", sel);
5581  } else {
5582  llvm::BitCastInst* bc = llvm::dyn_cast<llvm::BitCastInst>(sel->getTrueValue());
5583 
5584  if (bc && bc->hasOneUse() && bc->getSrcTy()->isIntOrIntVectorTy() && bc->getSrcTy()->isVectorTy() &&
5585  llvm::isa<llvm::Instruction>(bc->getOperand(0)) &&
5586  llvm::dyn_cast<llvm::Instruction>(bc->getOperand(0))->getParent() == sel->getParent()) {
5587  // The bitcast is casting from an integer type, and its operand is an instruction located in the same basic block (otherwise it's unsafe to use it).
5588  // bitcast+select => sext+and+bitcast
5589  // Create and
5590  llvm::BinaryOperator* and_inst = llvm::BinaryOperator::CreateAnd(sext, bc->getOperand(0), "and_mask", sel);
5591  // Bitcast back to original type
5592  result = new llvm::BitCastInst(and_inst, sel->getType(), "bitcast_mask_out", sel);
5593  // Original bitcast will be removed
5594  optional_to_delete = bc;
5595  } else {
5596  // General case: select => bitcast+sext+and+bitcast
5597  // Bitcast
5598  llvm::BitCastInst* bc_in = new llvm::BitCastInst(sel->getTrueValue(), int_type, "bitcast_mask_in", sel);
5599  // And
5600  llvm::BinaryOperator* and_inst = llvm::BinaryOperator::CreateAnd(sext, bc_in, "and_mask", sel);
5601  // Bitcast back to original type
5602  result = new llvm::BitCastInst(and_inst, sel->getType(), "bitcast_mask_out", sel);
5603  }
5604  }
5605 
5606  // Done, finalize.
5607  sel->replaceAllUsesWith(result);
5608  sel->eraseFromParent();
5609  if (optional_to_delete) {
5610  optional_to_delete->eraseFromParent();
5611  }
5612 
5613  return result;
5614 }
5615 
5616 bool
5617 FixBooleanSelectPass::runOnFunction(llvm::Function &F) {
5618  bool modifiedAny = false;
5619 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_3 // LLVM 3.3 only
5620 
5621  // Don't optimize generic targets.
5622  if (g->target->getISA() == Target::GENERIC) {
5623  return false;
5624  }
5625 
5626  for (llvm::Function::iterator I = F.begin(), E = F.end();
5627  I != E; ++I) {
5628  llvm::BasicBlock* bb = &*I;
5629  for (llvm::BasicBlock::iterator iter = bb->begin(), e = bb->end(); iter != e; ++iter) {
5630  llvm::Instruction *inst = &*iter;
5631 
5632  llvm::CmpInst *cmp = llvm::dyn_cast<llvm::CmpInst>(inst);
5633 
5634  if (cmp &&
5635  cmp->getType()->isVectorTy() &&
5636  cmp->getType()->getVectorElementType()->isIntegerTy(1)) {
5637 
5638  // Search for select instruction uses.
5639  int selects = 0;
5640  llvm::VectorType* sext_type = 0;
5641  for (llvm::Instruction::use_iterator it=cmp->use_begin(); it!=cmp->use_end(); ++it ) {
5642  llvm::SelectInst* sel = llvm::dyn_cast<llvm::SelectInst>(*it);
5643  if (sel &&
5644  sel->getType()->isVectorTy() &&
5645  sel->getType()->getScalarSizeInBits() > 1) {
5646  selects++;
5647  // We pick the first one, though in the typical case all select types are the same.
5648  sext_type = llvm::dyn_cast<llvm::VectorType>(sel->getType());
5649  break;
5650  }
5651  }
5652  if (selects == 0) {
5653  continue;
5654  }
5655  // Get an integer equivalent, if it's not yet an integer.
5656  sext_type = llvm::VectorType::getInteger(sext_type);
5657 
5658  // Do transformation
5659  llvm::BasicBlock::iterator iter_copy=iter;
5660  llvm::Instruction* next_inst = &*(++iter_copy);
5661  // Create or reuse sext
5662  llvm::SExtInst* sext = llvm::dyn_cast<llvm::SExtInst>(next_inst);
5663  if (sext &&
5664  sext->getOperand(0) == cmp &&
5665  sext->getDestTy() == sext_type) {
5666  // This sext can be reused
5667  } else {
5668  if (next_inst) {
5669  sext = new llvm::SExtInst(cmp, sext_type, "sext_cmp", next_inst);
5670  } else {
5671  sext = new llvm::SExtInst(cmp, sext_type, "sext_cmp", bb);
5672  }
5673  }
5674 
5675  // Walk and fix selects
5676  std::vector<llvm::SelectInst*> sel_uses;
5677  for (llvm::Instruction::use_iterator it=cmp->use_begin(); it!=cmp->use_end(); ++it) {
5678  llvm::SelectInst* sel = llvm::dyn_cast<llvm::SelectInst>(*it);
5679  if (sel &&
5680  sel->getType()->getScalarSizeInBits() == sext_type->getScalarSizeInBits()) {
5681 
5682  // Check that second operand is zero.
5683  llvm::Constant* false_cond = llvm::dyn_cast<llvm::Constant>(sel->getFalseValue());
5684  if (false_cond &&
5685  false_cond->isZeroValue()) {
5686  sel_uses.push_back(sel);
5687  modifiedAny = true;
5688  }
5689  }
5690  }
5691 
5692  for (int i=0; i<sel_uses.size(); i++) {
5693  fixSelect(sel_uses[i], sext);
5694  }
5695  }
5696  }
5697  }
5698 
5699 #endif // LLVM 3.3
5700 
5701  return modifiedAny;
5702 }
5703 
5704 
5705 static llvm::Pass *
5706 CreateFixBooleanSelectPass() {
5707  return new FixBooleanSelectPass();
5708 }
5709 
5710 #ifdef ISPC_NVPTX_ENABLED
5711 ///////////////////////////////////////////////////////////////////////////////
5712 // Detect addrspace(3)
5713 ///////////////////////////////////////////////////////////////////////////////
5714 
5715 class PromoteLocalToPrivatePass: public llvm::BasicBlockPass
5716 {
5717  public:
5718  static char ID; // Pass identification, replacement for typeid
5719  PromoteLocalToPrivatePass() : BasicBlockPass(ID) {}
5720 
5721  bool runOnBasicBlock(llvm::BasicBlock &BB);
5722 };
5723 
5724 char PromoteLocalToPrivatePass::ID = 0;
5725 
5726 bool
5727 PromoteLocalToPrivatePass::runOnBasicBlock(llvm::BasicBlock &BB)
5728 {
5729  std::vector<llvm::AllocaInst*> Allocas;
5730 
5731  bool modifiedAny = false;
5732 
5733 #if 1
5734 restart:
5735  for (llvm::BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
5736  {
5737  llvm::Instruction *inst = &*I;
5738  if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst))
5739  {
5740  llvm::Function *func = ci->getCalledFunction();
5741  if (func && func->getName() == "llvm.trap")
5742  {
5743  std::vector<llvm::Type*> funcTyArgs;
5744  llvm::FunctionType *funcTy = llvm::FunctionType::get(
5745  /*Result=*/llvm::Type::getVoidTy(*g->ctx),
5746  /*Params=*/funcTyArgs,
5747  /*isVarArg=*/false);
5748  llvm::InlineAsm *trap_ptx = llvm::InlineAsm::get(funcTy, "trap;", "", false);
5749  assert(trap_ptx != NULL);
5750  llvm::Instruction *trap_call = llvm::CallInst::Create(trap_ptx);
5751  assert(trap_call != NULL);
5752  llvm::ReplaceInstWithInst(ci, trap_call);
5753  modifiedAny = true;
5754  goto restart;
5755  }
5756  }
5757  }
5758 #endif
5759 
5760 #if 0
5761  llvm::Function *cvtFunc = m->module->getFunction("__cvt_loc2gen_var");
5762 
5763  // Find allocas that are safe to promote, by looking at all instructions in
5764  // the entry node
5765  for (llvm::BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
5766  {
5767  llvm::Instruction *inst = &*I;
5768  if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst))
5769  {
5770  llvm::Function *func = ci->getCalledFunction();
5771  if (cvtFunc && (cvtFunc == func))
5772  {
5773 #if 0
5774  fprintf(stderr , "--found cvt-- name= %s \n",
5775  I->getName().str().c_str());
5776 #endif
5777  llvm::AllocaInst *alloca = new llvm::AllocaInst(LLVMTypes::Int64Type, "opt_loc2var", ci);
5778  assert(alloca != NULL);
5779 #if 0
5780  const int align = 8; // g->target->getNativeVectorAlignment();
5781  alloca->setAlignment(align);
5782 #endif
5783  ci->replaceAllUsesWith(alloca);
5784  modifiedAny = true;
5785  }
5786  }
5787  }
5788 #endif
5789  return modifiedAny;
5790 }
5791 
5792 static llvm::Pass *
5793 CreatePromoteLocalToPrivatePass() {
5794  return new PromoteLocalToPrivatePass();
5795 }
5796 
5797 
5798 
5799 #endif /* ISPC_NVPTX_ENABLED */
Definition: llvmutil.h:94