Intel SPMD Program Compiler  1.11.0
opt.cpp
1 /*
2  Copyright (c) 2010-2019, Intel Corporation
3  All rights reserved.
4 
5  Redistribution and use in source and binary forms, with or without
6  modification, are permitted provided that the following conditions are
7  met:
8 
9  * Redistributions of source code must retain the above copyright
10  notice, this list of conditions and the following disclaimer.
11 
12  * Redistributions in binary form must reproduce the above copyright
13  notice, this list of conditions and the following disclaimer in the
14  documentation and/or other materials provided with the distribution.
15 
16  * Neither the name of Intel Corporation nor the names of its
17  contributors may be used to endorse or promote products derived from
18  this software without specific prior written permission.
19 
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
22  IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
25  OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33 
34 /** @file opt.cpp
35  @brief Implementations of various ispc optimization passes that operate
36  on the LLVM IR.
37 */
38 
39 #include "opt.h"
40 #include "ctx.h"
41 #include "llvmutil.h"
42 #include "module.h"
43 #include "sym.h"
44 #include "util.h"
45 
46 #include <map>
47 #include <set>
48 #include <stdio.h>
49 
50 #include <llvm/Pass.h>
51 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
52 #include <llvm/BasicBlock.h>
53 #include <llvm/Constants.h>
54 #include <llvm/Function.h>
55 #include <llvm/Instructions.h>
56 #include <llvm/Intrinsics.h>
57 #include <llvm/Module.h>
58 #ifdef ISPC_NVPTX_ENABLED
59 #include <llvm/InlineAsm.h>
60 #endif /* ISPC_NVPTX_ENABLED */
61 #else // LLVM 3.3+
62 #include <llvm/IR/BasicBlock.h>
63 #include <llvm/IR/Constants.h>
64 #include <llvm/IR/Function.h>
65 #include <llvm/IR/Instructions.h>
66 #include <llvm/IR/Intrinsics.h>
67 #include <llvm/IR/Module.h>
68 #ifdef ISPC_NVPTX_ENABLED
69 #include <llvm/IR/InlineAsm.h>
70 #endif /* ISPC_NVPTX_ENABLED */
71 #endif
72 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_4 // LLVM 3.4+
73 #include <llvm/Transforms/Instrumentation.h>
74 #endif
75 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
76 #include "llvm/PassManager.h"
77 #else // LLVM 3.7+
78 #include "llvm/IR/LegacyPassManager.h"
79 #endif
80 #include <llvm/PassRegistry.h>
81 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_5 // LLVM 3.5+
82 #include <llvm/IR/DebugInfo.h>
83 #include <llvm/IR/IRPrintingPasses.h>
84 #include <llvm/IR/PatternMatch.h>
85 #include <llvm/IR/Verifier.h>
86 #else // < 3.5
87 #include <llvm/Analysis/Verifier.h>
88 #include <llvm/Assembly/PrintModulePass.h>
89 #include <llvm/DebugInfo.h>
90 #include <llvm/Support/PatternMatch.h>
91 #endif
92 #include <llvm/Analysis/ConstantFolding.h>
93 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
94 #include <llvm/Target/TargetLibraryInfo.h>
95 #else // LLVM 3.7+
96 #include <llvm/Analysis/TargetLibraryInfo.h>
97 #endif
98 #include <llvm/ADT/SmallSet.h>
99 #include <llvm/ADT/Triple.h>
100 #include <llvm/Transforms/IPO.h>
101 #include <llvm/Transforms/Scalar.h>
102 #if ISPC_LLVM_VERSION >= ISPC_LLVM_7_0
103 #include "llvm/Transforms/InstCombine/InstCombine.h"
104 #include "llvm/Transforms/Utils.h"
105 #endif
106 #include <llvm/Target/TargetOptions.h>
107 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
108 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
109 #include <llvm/DataLayout.h>
110 #else // LLVM 3.3+
111 #include <llvm/Analysis/TargetTransformInfo.h>
112 #include <llvm/IR/DataLayout.h>
113 #endif
114 #include <llvm/Target/TargetMachine.h>
115 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_8 // LLVM 3.8+
116 #include "llvm/Analysis/TypeBasedAliasAnalysis.h"
117 #include <llvm/Analysis/BasicAliasAnalysis.h>
118 #endif
119 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_9 // LLVM 3.9+
120 #include "llvm/Transforms/IPO/FunctionAttrs.h"
121 #include "llvm/Transforms/Scalar/GVN.h"
122 #endif
123 #include <llvm/Analysis/Passes.h>
124 #include <llvm/Support/raw_ostream.h>
125 #if ISPC_LLVM_VERSION >= ISPC_LLVM_5_0 // LLVM 5.0+
126 #include <llvm/BinaryFormat/Dwarf.h>
127 #else // LLVM up to 4.x
128 #include <llvm/Support/Dwarf.h>
129 #endif
130 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_6
131 #include <llvm/IR/IntrinsicInst.h>
132 #endif
133 #ifdef ISPC_IS_LINUX
134 #include <alloca.h>
135 #elif defined(ISPC_IS_WINDOWS)
136 #include <malloc.h>
137 #ifndef __MINGW32__
138 #define alloca _alloca
139 #endif
140 #endif // ISPC_IS_WINDOWS
141 
142 #ifndef PRId64
143 #define PRId64 "lld"
144 #endif
145 #ifndef PRIu64
146 #define PRIu64 "llu"
147 #endif
148 
149 static llvm::Pass *CreateIntrinsicsOptPass();
150 static llvm::Pass *CreateInstructionSimplifyPass();
151 static llvm::Pass *CreatePeepholePass();
152 
153 static llvm::Pass *CreateImproveMemoryOpsPass();
154 static llvm::Pass *CreateGatherCoalescePass();
155 static llvm::Pass *CreateReplacePseudoMemoryOpsPass();
156 
157 static llvm::Pass *CreateIsCompileTimeConstantPass(bool isLastTry);
158 static llvm::Pass *CreateMakeInternalFuncsStaticPass();
159 
160 #ifndef ISPC_NO_DUMPS
161 static llvm::Pass *CreateDebugPass(char *output);
162 #endif
163 
164 static llvm::Pass *CreateReplaceStdlibShiftPass();
165 
166 static llvm::Pass *CreateFixBooleanSelectPass();
167 #ifdef ISPC_NVPTX_ENABLED
168 static llvm::Pass *CreatePromoteLocalToPrivatePass();
169 #endif /* ISPC_NVPTX_ENABLED */
170 
171 #ifndef ISPC_NO_DUMPS
172 #define DEBUG_START_PASS(NAME) \
173  if (g->debugPrint && \
174  (getenv("FUNC") == NULL || (getenv("FUNC") != NULL && !strncmp(bb.getParent()->getName().str().c_str(), \
175  getenv("FUNC"), strlen(getenv("FUNC")))))) { \
176  fprintf(stderr, "Start of " NAME "\n"); \
177  fprintf(stderr, "---------------\n"); \
178  bb.dump(); \
179  fprintf(stderr, "---------------\n\n"); \
180  } else /* eat semicolon */
181 
182 #define DEBUG_END_PASS(NAME) \
183  if (g->debugPrint && \
184  (getenv("FUNC") == NULL || (getenv("FUNC") != NULL && !strncmp(bb.getParent()->getName().str().c_str(), \
185  getenv("FUNC"), strlen(getenv("FUNC")))))) { \
186  fprintf(stderr, "End of " NAME " %s\n", modifiedAny ? "** CHANGES **" : ""); \
187  fprintf(stderr, "---------------\n"); \
188  bb.dump(); \
189  fprintf(stderr, "---------------\n\n"); \
190  } else /* eat semicolon */
191 #else
192 #define DEBUG_START_PASS(NAME)
193 #define DEBUG_END_PASS(NAME)
194 #endif
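// Usage note (illustrative, not part of the original source): these dumps only
// fire when ispc's debug output is enabled (g->debugPrint, set by the --debug
// flag).  Setting the FUNC environment variable to a function-name prefix,
// for example
//
//     FUNC=simple_program ispc --debug simple.ispc -o simple.o
//
// restricts the per-pass basic-block dumps to functions whose names start with
// that prefix; the macros compare it against bb.getParent()->getName() using
// strncmp().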
195 
196 ///////////////////////////////////////////////////////////////////////////
197 
198 /** This utility routine copies the metadata (if any) attached to the
199  'from' instruction in the IR to the 'to' instruction.
200 
201  For flexibility, this function takes an llvm::Value rather than an
202  llvm::Instruction for the 'to' parameter; in some places in the code
203  below, we use an llvm::Value to start out storing a plain value and
204  only later store instructions in it. If a value that isn't an
205  instruction is passed to this routine, it just returns without doing
206  anything; if it is in fact an llvm::Instruction, the metadata is copied to it.
207  */
208 static void lCopyMetadata(llvm::Value *vto, const llvm::Instruction *from) {
209  llvm::Instruction *to = llvm::dyn_cast<llvm::Instruction>(vto);
210  if (!to)
211  return;
212 
213  llvm::SmallVector<std::pair<unsigned int, llvm::MDNode *>, 8> metadata;
214 
215  from->getAllMetadata(metadata);
216  for (unsigned int i = 0; i < metadata.size(); ++i)
217  to->setMetadata(metadata[i].first, metadata[i].second);
218 }
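// Illustrative sketch (not part of the original source): the typical way the
// passes below use lCopyMetadata() when they replace one instruction with
// another and want to preserve the ispc source-position metadata attached to
// the original.  lReplacePreservingMetadata() is a hypothetical helper, not a
// function defined in this file.
#if 0
static void lReplacePreservingMetadata(llvm::Instruction *oldInst, llvm::Instruction *newInst) {
    lCopyMetadata(newInst, oldInst);             // carry "filename", "first_line", ... over
    llvm::ReplaceInstWithInst(oldInst, newInst); // then swap the instructions
}
#endif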
219 
220 /** We have a protocol with the front-end LLVM IR code generation process
221  that allows us to encode the source file position that corresponds with
222  instructions. (For example, this allows us to issue performance
223  warnings related to things like scatter and gather after optimization
224  has been performed, so that we aren't warning about scatters and
225  gathers that have been improved to stores and loads by optimization
226  passes.) Note that this is slightly redundant with the source file
227  position encoding generated for debugging symbols; we don't always
228  generate debugging information, but we do always generate this
229  position data.
230 
231  This function finds the SourcePos that the metadata in the instruction
232  (if present) corresponds to. See the implementation of
233  FunctionEmitContext::addGSMetadata(), which encodes the source position during
234  code generation.
235 
236  @param inst Instruction to try to find the source position of
237  @param pos Output variable in which to store the position
238  @returns True if source file position metadata was present and *pos
239  has been set. False otherwise.
240 */
241 static bool lGetSourcePosFromMetadata(const llvm::Instruction *inst, SourcePos *pos) {
242  llvm::MDNode *filename = inst->getMetadata("filename");
243  llvm::MDNode *first_line = inst->getMetadata("first_line");
244  llvm::MDNode *first_column = inst->getMetadata("first_column");
245  llvm::MDNode *last_line = inst->getMetadata("last_line");
246  llvm::MDNode *last_column = inst->getMetadata("last_column");
247 
248  if (!filename || !first_line || !first_column || !last_line || !last_column)
249  return false;
250 
251  // All of these asserts are things that FunctionEmitContext::addGSMetadata() is
252  // expected to have done in its operation
253  llvm::MDString *str = llvm::dyn_cast<llvm::MDString>(filename->getOperand(0));
254  Assert(str);
255  llvm::ConstantInt *first_lnum =
256 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5
257  llvm::dyn_cast<llvm::ConstantInt>(first_line->getOperand(0));
258 #else /* LLVM 3.6+ */
259  llvm::mdconst::extract<llvm::ConstantInt>(first_line->getOperand(0));
260 #endif
261  Assert(first_lnum);
262 
263  llvm::ConstantInt *first_colnum =
264 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5
265  llvm::dyn_cast<llvm::ConstantInt>(first_column->getOperand(0));
266 #else /* LLVM 3.6+ */
267  llvm::mdconst::extract<llvm::ConstantInt>(first_column->getOperand(0));
268 #endif
269  Assert(first_colnum);
270 
271  llvm::ConstantInt *last_lnum =
272 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5
273  llvm::dyn_cast<llvm::ConstantInt>(last_line->getOperand(0));
274 #else /* LLVM 3.6+ */
275  llvm::mdconst::extract<llvm::ConstantInt>(last_line->getOperand(0));
276 #endif
277  Assert(last_lnum);
278 
279  llvm::ConstantInt *last_colnum =
280 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5
281  llvm::dyn_cast<llvm::ConstantInt>(last_column->getOperand(0));
282 #else /* LLVM 3.6+ */
283  llvm::mdconst::extract<llvm::ConstantInt>(last_column->getOperand(0));
284 #endif
285  Assert(last_colnum);
286 
287  *pos = SourcePos(str->getString().data(), (int)first_lnum->getZExtValue(), (int)first_colnum->getZExtValue(),
288  (int)last_lnum->getZExtValue(), (int)last_colnum->getZExtValue());
289  return true;
290 }
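// Illustrative sketch (not part of the original source): the producer side of
// this protocol.  FunctionEmitContext::addGSMetadata() (in ctx.cpp) attaches
// five named metadata nodes per instruction; for LLVM 3.6+ the encoding that
// the code above expects looks roughly like this (lAttachSourcePos() is a
// hypothetical helper):
#if 0
static void lAttachSourcePos(llvm::Instruction *inst, const SourcePos &pos) {
    llvm::LLVMContext &ctx = inst->getContext();
    // "filename" carries an MDString; the line/column nodes carry ConstantInts
    // wrapped as metadata, which lGetSourcePosFromMetadata() unwraps with
    // llvm::mdconst::extract<llvm::ConstantInt>().
    inst->setMetadata("filename", llvm::MDNode::get(ctx, llvm::MDString::get(ctx, pos.name)));
    inst->setMetadata("first_line",
                      llvm::MDNode::get(ctx, llvm::ConstantAsMetadata::get(LLVMInt32(pos.first_line))));
    // ... and likewise for "first_column", "last_line", and "last_column".
}
#endif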
291 
292 static llvm::Instruction *lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, const char *name,
293  llvm::Instruction *insertBefore = NULL) {
294  llvm::Value *args[2] = {arg0, arg1};
295  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[2]);
296  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
297 }
298 
299 static llvm::Instruction *lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::Value *arg2,
300  const char *name, llvm::Instruction *insertBefore = NULL) {
301  llvm::Value *args[3] = {arg0, arg1, arg2};
302  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[3]);
303  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
304 }
305 
306 static llvm::Instruction *lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::Value *arg2,
307  llvm::Value *arg3, const char *name, llvm::Instruction *insertBefore = NULL) {
308  llvm::Value *args[4] = {arg0, arg1, arg2, arg3};
309  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[4]);
310  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
311 }
312 
313 static llvm::Instruction *lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::Value *arg2,
314  llvm::Value *arg3, llvm::Value *arg4, const char *name,
315  llvm::Instruction *insertBefore = NULL) {
316  llvm::Value *args[5] = {arg0, arg1, arg2, arg3, arg4};
317  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[5]);
318  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
319 }
320 
321 static llvm::Instruction *lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::Value *arg2,
322  llvm::Value *arg3, llvm::Value *arg4, llvm::Value *arg5, const char *name,
323  llvm::Instruction *insertBefore = NULL) {
324  llvm::Value *args[6] = {arg0, arg1, arg2, arg3, arg4, arg5};
325  llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[6]);
326  return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
327 }
328 
329 static llvm::Instruction *lGEPInst(llvm::Value *ptr, llvm::Value *offset, const char *name,
330  llvm::Instruction *insertBefore) {
331  llvm::Value *index[1] = {offset};
332  llvm::ArrayRef<llvm::Value *> arrayRef(&index[0], &index[1]);
333 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
334  return llvm::GetElementPtrInst::Create(ptr, arrayRef, name, insertBefore);
335 #else // LLVM 3.7+
336  return llvm::GetElementPtrInst::Create(PTYPE(ptr), ptr, arrayRef, name, insertBefore);
337 #endif
338 }
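// Illustrative usage (not part of the original source): these overloads just
// keep the pass code below terse.  The names basePtr, offsetVal, someFunc,
// maskVal, and insertBefore are placeholders, not symbols from this file.
#if 0
    llvm::Instruction *addr = lGEPInst(basePtr, offsetVal, "ptr_offset", insertBefore);
    lCallInst(someFunc, addr, maskVal, "masked_op", insertBefore);
#endif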
339 
340 /** Given a vector of constant values (int, float, or bool) representing an
341  execution mask, convert it to a bitvector where the 0th bit corresponds
342  to the first vector value and so forth.
343 */
344 static uint64_t lConstElementsToMask(const llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> &elements) {
345  Assert(elements.size() <= 64);
346 
347  uint64_t mask = 0;
348  for (unsigned int i = 0; i < elements.size(); ++i) {
349  llvm::APInt intMaskValue;
350  // SSE has the "interesting" approach of encoding blending
351  // masks as <n x float>.
352  llvm::ConstantFP *cf = llvm::dyn_cast<llvm::ConstantFP>(elements[i]);
353  if (cf != NULL) {
354  llvm::APFloat apf = cf->getValueAPF();
355  intMaskValue = apf.bitcastToAPInt();
356  } else {
357  // Otherwise get it as an int
358  llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(elements[i]);
359  Assert(ci != NULL); // vs return -1 if NULL?
360  intMaskValue = ci->getValue();
361  }
362  // Is the high-bit set? If so, OR in the appropriate bit in
363  // the result mask
364  if (intMaskValue.countLeadingOnes() > 0)
365  mask |= (1ull << i);
366  }
367  return mask;
368 }
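// Worked example (illustrative): for the 4-wide SSE-style float mask
// < 0xffffffff, 0, 0, 0xffffffff > (lanes 0 and 3 on), each element is
// bitcast to an integer and its high bit is tested in turn, giving
// mask = (1 << 0) | (1 << 3) = 0b1001 = 9.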
369 
370 /** Given an llvm::Value representing a vector mask, see if the value is a
371  constant. If so, return true and set *bits to be the integer mask
372  found by taking the high bits of the mask values in turn and
373  concatenating them into a single integer. In other words, given the
374  4-wide mask: < 0xffffffff, 0, 0, 0xffffffff >, we have 0b1001 = 9.
375  */
376 static bool lGetMask(llvm::Value *factor, uint64_t *mask) {
377  llvm::ConstantDataVector *cdv = llvm::dyn_cast<llvm::ConstantDataVector>(factor);
378  if (cdv != NULL) {
379  llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
380  for (int i = 0; i < (int)cdv->getNumElements(); ++i)
381  elements.push_back(cdv->getElementAsConstant(i));
382  *mask = lConstElementsToMask(elements);
383  return true;
384  }
385 
386  llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(factor);
387  if (cv != NULL) {
388  llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
389  for (int i = 0; i < (int)cv->getNumOperands(); ++i) {
390  llvm::Constant *c = llvm::dyn_cast<llvm::Constant>(cv->getOperand(i));
391  if (c == NULL)
392  return false;
393  if (llvm::isa<llvm::ConstantExpr>(cv->getOperand(i)))
394  return false; // We can not handle constant expressions here
395  elements.push_back(c);
396  }
397  *mask = lConstElementsToMask(elements);
398  return true;
399  } else if (llvm::isa<llvm::ConstantAggregateZero>(factor)) {
400  *mask = 0;
401  return true;
402  } else {
403 #if 0
404  llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(factor);
405  if (ce != NULL) {
406  llvm::TargetMachine *targetMachine = g->target->GetTargetMachine();
407  const llvm::TargetData *td = targetMachine->getTargetData();
408  llvm::Constant *c = llvm::ConstantFoldConstantExpression(ce, td);
409  c->dump();
410  factor = c;
411  }
412  // else we should be able to handle it above...
413  Assert(!llvm::isa<llvm::Constant>(factor));
414 #endif
415  return false;
416  }
417 }
418 
419 enum MaskStatus { ALL_ON, ALL_OFF, MIXED, UNKNOWN };
420 
421 /** Determines if the given mask value is all on, all off, mixed, or
422  unknown at compile time.
423 */
424 static MaskStatus lGetMaskStatus(llvm::Value *mask, int vecWidth = -1) {
425  uint64_t bits;
426  if (lGetMask(mask, &bits) == false)
427  return UNKNOWN;
428 
429  if (bits == 0)
430  return ALL_OFF;
431 
432  if (vecWidth == -1)
433  vecWidth = g->target->getVectorWidth();
434  Assert(vecWidth <= 64);
435 
436  for (int i = 0; i < vecWidth; ++i) {
437  if ((bits & (1ull << i)) == 0)
438  return MIXED;
439  }
440  return ALL_ON;
441 }
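// Illustrative sketch (not part of the original source): how the passes below
// typically act on lGetMaskStatus().  lExampleSimplifyMaskedOp() is a
// hypothetical helper, not a function defined in this file.
#if 0
static bool lExampleSimplifyMaskedOp(llvm::Value *mask) {
    switch (lGetMaskStatus(mask)) {
    case ALL_ON:
        // The mask can be dropped and the unmasked operation emitted instead.
        return true;
    case ALL_OFF:
        // The operation is dead and can simply be removed.
        return true;
    case MIXED:
    case UNKNOWN:
        // Leave the masked form alone.
        return false;
    }
    return false;
}
#endif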
442 
443 ///////////////////////////////////////////////////////////////////////////
444 // This is a wrapper over the llvm::PassManager class. It duplicates the PassManager run()
445 // function and changes the add() function by adding some checks and debug passes.
446 // This wrapper can control:
447 // - whether to switch off the optimization pass with a given number;
448 // - whether to dump the LLVM IR after the optimization pass with a given number;
449 // - whether to generate LLVM IR debug info for gdb after the optimization pass with a given number.
450 class DebugPassManager {
451  public:
452  DebugPassManager() : number(0) {}
453  void add(llvm::Pass *P, int stage);
454  bool run(llvm::Module &M) { return PM.run(M); }
455 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
456  llvm::PassManager &getPM() { return PM; }
457 #else /* LLVM 3.7+ */
458  llvm::legacy::PassManager &getPM() { return PM; }
459 #endif
460  private:
461 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
462  llvm::PassManager PM;
463 #else /* LLVM 3.7+ */
464  llvm::legacy::PassManager PM;
465 #endif
466  int number;
467 };
468 
469 void DebugPassManager::add(llvm::Pass *P, int stage = -1) {
470  // pick the number for this optimization: the next sequential one, or the explicitly given stage
471  if (stage == -1) {
472  number++;
473  } else {
474  number = stage;
475  }
476  if (g->off_stages.find(number) == g->off_stages.end()) {
477  // adding optimization (not switched off)
478  PM.add(P);
479 #ifndef ISPC_NO_DUMPS
480  if (g->debug_stages.find(number) != g->debug_stages.end()) {
481  // adding dump of LLVM IR after optimization
482  char buf[100];
483 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
484  snprintf(buf, sizeof(buf), "\n\n*****LLVM IR after phase %d: %s*****\n\n", number, P->getPassName());
485 #else // LLVM 4.0+
486  snprintf(buf, sizeof(buf), "\n\n*****LLVM IR after phase %d: %s*****\n\n", number, P->getPassName().data());
487 #endif
488  PM.add(CreateDebugPass(buf));
489  }
490 #endif
491 
492 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_4 || ISPC_LLVM_VERSION == ISPC_LLVM_3_5 // only 3.4 and 3.5
493  if (g->debugIR == number) {
494  // add generation of LLVM IR debug info for gdb after the optimization
495  char buf[100];
496  snprintf(buf, sizeof(buf), "Debug_IR_after_%d_phase.bc", number);
497  PM.add(llvm::createDebugIRPass(true, true, ".", buf));
498  }
499 #endif
500  }
501 }
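// Usage note (illustrative): g->off_stages and g->debug_stages are filled in
// from ispc's command line elsewhere (the --off-phase= and --debug-phase=
// options in this version), so passing one of the phase numbers used below to
// --off-phase= skips the corresponding optPM.add() call, while --debug-phase=
// prints the IR right after it via CreateDebugPass().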
502 ///////////////////////////////////////////////////////////////////////////
503 
504 void Optimize(llvm::Module *module, int optLevel) {
505 #ifndef ISPC_NO_DUMPS
506  if (g->debugPrint) {
507  printf("*** Code going into optimization ***\n");
508  module->dump();
509  }
510 #endif
511  DebugPassManager optPM;
512  optPM.add(llvm::createVerifierPass(), 0);
513 
514 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
515  llvm::TargetLibraryInfo *targetLibraryInfo = new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple()));
516  optPM.add(targetLibraryInfo);
517 #else // LLVM 3.7+
518  optPM.add(new llvm::TargetLibraryInfoWrapperPass(llvm::Triple(module->getTargetTriple())));
519 #endif
520 
521 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_4
522  optPM.add(new llvm::DataLayout(*g->target->getDataLayout()));
523 #elif ISPC_LLVM_VERSION == ISPC_LLVM_3_5
524  optPM.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
525 #elif ISPC_LLVM_VERSION == ISPC_LLVM_3_6
526  llvm::DataLayoutPass *dlp = new llvm::DataLayoutPass();
527  dlp->doInitialization(*module);
528  optPM.add(dlp);
529 #endif // LLVM 3.7+ doesn't have DataLayoutPass anymore.
530 
531  llvm::TargetMachine *targetMachine = g->target->GetTargetMachine();
532 
533 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
534  optPM.add(new llvm::TargetTransformInfo(targetMachine->getScalarTargetTransformInfo(),
535  targetMachine->getVectorTargetTransformInfo()));
536 #elif ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
537  targetMachine->addAnalysisPasses(optPM.getPM());
538 #else // LLVM 3.7+
539  optPM.getPM().add(createTargetTransformInfoWrapperPass(targetMachine->getTargetIRAnalysis()));
540 #endif
541 
542  optPM.add(llvm::createIndVarSimplifyPass());
543 
544  if (optLevel == 0) {
545  // This is more or less the minimum set of optimizations that we
546  // need to do to generate code that will actually run. (We can't
547  // run absolutely no optimizations, since the front-end needs us to
548  // take the various __pseudo_* functions it has emitted and turn
549  // them into something that can actually execute.)
550  optPM.add(CreateImproveMemoryOpsPass(), 100);
551 #ifdef ISPC_NVPTX_ENABLED
552  if (g->target->getISA() != Target::NVPTX)
553 #endif /* ISPC_NVPTX_ENABLED */
554  optPM.add(CreateImproveMemoryOpsPass(), 100);
555 
556  if (g->opt.disableHandlePseudoMemoryOps == false)
557  optPM.add(CreateReplacePseudoMemoryOpsPass());
558 
559  optPM.add(CreateIntrinsicsOptPass(), 102);
560  optPM.add(CreateIsCompileTimeConstantPass(true));
561  optPM.add(llvm::createFunctionInliningPass());
562  optPM.add(CreateMakeInternalFuncsStaticPass());
563  optPM.add(llvm::createCFGSimplificationPass());
564  optPM.add(llvm::createGlobalDCEPass());
565  } else {
566  llvm::PassRegistry *registry = llvm::PassRegistry::getPassRegistry();
567  llvm::initializeCore(*registry);
568  llvm::initializeScalarOpts(*registry);
569  llvm::initializeIPO(*registry);
570  llvm::initializeAnalysis(*registry);
571 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_7
572  llvm::initializeIPA(*registry);
573 #endif
574  llvm::initializeTransformUtils(*registry);
575  llvm::initializeInstCombine(*registry);
576  llvm::initializeInstrumentation(*registry);
577  llvm::initializeTarget(*registry);
578 
579  optPM.add(llvm::createGlobalDCEPass(), 185);
580 
581  // Setup to use LLVM default AliasAnalysis
582  // Ideally, we would like to call:
583  // llvm::PassManagerBuilder pm_Builder;
584  // pm_Builder.OptLevel = optLevel;
585  // pm_Builder.addInitialAliasAnalysisPasses(optPM);
586  // but the addInitialAliasAnalysisPasses() is a private function
587  // so we explicitly enable them here.
588  // This needs to be kept in sync with future LLVM changes.
589  // An alternative is to call populateFunctionPassManager()
590 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_7
591  optPM.add(llvm::createTypeBasedAliasAnalysisPass(), 190);
592  optPM.add(llvm::createBasicAliasAnalysisPass());
593 #else
594  optPM.add(llvm::createTypeBasedAAWrapperPass(), 190);
595  optPM.add(llvm::createBasicAAWrapperPass());
596 #endif
597  optPM.add(llvm::createCFGSimplificationPass());
598 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
599  optPM.add(llvm::createScalarReplAggregatesPass());
600 #else
601  optPM.add(llvm::createSROAPass());
602 #endif
603  optPM.add(llvm::createEarlyCSEPass());
604  optPM.add(llvm::createLowerExpectIntrinsicPass());
605 
606  // Early optimizations to try to reduce the total amount of code to
607  // work with if we can
608  optPM.add(llvm::createReassociatePass(), 200);
609  optPM.add(llvm::createConstantPropagationPass());
610  optPM.add(llvm::createDeadInstEliminationPass());
611  optPM.add(llvm::createCFGSimplificationPass());
612 
613  optPM.add(llvm::createPromoteMemoryToRegisterPass());
614  optPM.add(llvm::createAggressiveDCEPass());
615 
616  if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) {
617  optPM.add(llvm::createInstructionCombiningPass(), 210);
618  optPM.add(CreateImproveMemoryOpsPass());
619  }
620  if (!g->opt.disableMaskAllOnOptimizations) {
621  optPM.add(CreateIntrinsicsOptPass(), 215);
622  optPM.add(CreateInstructionSimplifyPass());
623  }
624  optPM.add(llvm::createDeadInstEliminationPass(), 220);
625 
626  // Max struct size threshold for scalar replacement is
627  // 1) 4 fields (r,g,b,w)
628  // 2) field size: vectorWidth * sizeof(float)
629 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
630  const int field_limit = 4;
631  int sr_threshold = g->target->getVectorWidth() * sizeof(float) * field_limit;
632 #endif
633 
634  // On to more serious optimizations
635 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
636  optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold));
637 #else
638  optPM.add(llvm::createSROAPass());
639 #endif
640  optPM.add(llvm::createInstructionCombiningPass());
641  optPM.add(llvm::createCFGSimplificationPass());
642  optPM.add(llvm::createPromoteMemoryToRegisterPass());
643  optPM.add(llvm::createGlobalOptimizerPass());
644  optPM.add(llvm::createReassociatePass());
645  optPM.add(llvm::createIPConstantPropagationPass());
646 
647 #ifdef ISPC_NVPTX_ENABLED
648  if (g->target->getISA() != Target::NVPTX)
649 #endif /* ISPC_NVPTX_ENABLED */
650  optPM.add(CreateReplaceStdlibShiftPass(), 229);
651 
652  optPM.add(llvm::createDeadArgEliminationPass(), 230);
653  optPM.add(llvm::createInstructionCombiningPass());
654  optPM.add(llvm::createCFGSimplificationPass());
655  optPM.add(llvm::createPruneEHPass());
656 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_9 // 3.9+
657  optPM.add(llvm::createPostOrderFunctionAttrsLegacyPass());
658  optPM.add(llvm::createReversePostOrderFunctionAttrsPass());
659 #elif ISPC_LLVM_VERSION == ISPC_LLVM_3_8 // 3.8
660  optPM.add(llvm::createPostOrderFunctionAttrsPass());
661  optPM.add(llvm::createReversePostOrderFunctionAttrsPass());
662 #else // 3.7 and earlier
663  optPM.add(llvm::createFunctionAttrsPass());
664 #endif
665  optPM.add(llvm::createFunctionInliningPass());
666  optPM.add(llvm::createConstantPropagationPass());
667  optPM.add(llvm::createDeadInstEliminationPass());
668  optPM.add(llvm::createCFGSimplificationPass());
669 
670  optPM.add(llvm::createArgumentPromotionPass());
671 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_3
672  // Starting from 3.4 this functionality was moved to
673  // InstructionCombiningPass. See r184459 for details.
674  optPM.add(llvm::createSimplifyLibCallsPass(), 240);
675 #endif
676  optPM.add(llvm::createAggressiveDCEPass());
677  optPM.add(llvm::createInstructionCombiningPass(), 241);
678  optPM.add(llvm::createJumpThreadingPass());
679  optPM.add(llvm::createCFGSimplificationPass());
680 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
681  optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold));
682 #else
683  optPM.add(llvm::createSROAPass());
684 #endif
685  optPM.add(llvm::createInstructionCombiningPass());
686  optPM.add(llvm::createTailCallEliminationPass());
687 
688  if (!g->opt.disableMaskAllOnOptimizations) {
689  optPM.add(CreateIntrinsicsOptPass(), 250);
690  optPM.add(CreateInstructionSimplifyPass());
691  }
692 
693  if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) {
694  optPM.add(llvm::createInstructionCombiningPass(), 255);
695  optPM.add(CreateImproveMemoryOpsPass());
696 
697  if (g->opt.disableCoalescing == false && g->target->getISA() != Target::GENERIC) {
698  // It is important to run this here to make it easier to
699  // find matching gathers that we can coalesce.
700  optPM.add(llvm::createEarlyCSEPass(), 260);
701  optPM.add(CreateGatherCoalescePass());
702  }
703  }
704 
705  optPM.add(llvm::createFunctionInliningPass(), 265);
706  optPM.add(llvm::createConstantPropagationPass());
707  optPM.add(CreateIntrinsicsOptPass());
708  optPM.add(CreateInstructionSimplifyPass());
709 
710  if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) {
711  optPM.add(llvm::createInstructionCombiningPass(), 270);
712  optPM.add(CreateImproveMemoryOpsPass());
713  }
714 
715  optPM.add(llvm::createIPSCCPPass(), 275);
716  optPM.add(llvm::createDeadArgEliminationPass());
717  optPM.add(llvm::createAggressiveDCEPass());
718  optPM.add(llvm::createInstructionCombiningPass());
719  optPM.add(llvm::createCFGSimplificationPass());
720 
721  if (g->opt.disableHandlePseudoMemoryOps == false) {
722  optPM.add(CreateReplacePseudoMemoryOpsPass(), 280);
723  }
724  optPM.add(CreateIntrinsicsOptPass(), 281);
725  optPM.add(CreateInstructionSimplifyPass());
726 
727  optPM.add(llvm::createFunctionInliningPass());
728  optPM.add(llvm::createArgumentPromotionPass());
729 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
730  optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold, false));
731 #else
732  optPM.add(llvm::createSROAPass());
733 #endif
734  optPM.add(llvm::createInstructionCombiningPass());
735  optPM.add(CreateInstructionSimplifyPass());
736  optPM.add(llvm::createCFGSimplificationPass());
737  optPM.add(llvm::createReassociatePass());
738  optPM.add(llvm::createLoopRotatePass());
739  optPM.add(llvm::createLICMPass());
740  optPM.add(llvm::createLoopUnswitchPass(false));
741  optPM.add(llvm::createInstructionCombiningPass());
742  optPM.add(CreateInstructionSimplifyPass());
743  optPM.add(llvm::createIndVarSimplifyPass());
744  optPM.add(llvm::createLoopIdiomPass());
745  optPM.add(llvm::createLoopDeletionPass());
746  if (g->opt.unrollLoops) {
747  optPM.add(llvm::createLoopUnrollPass(), 300);
748  }
749  optPM.add(llvm::createGVNPass(), 301);
750 
751  optPM.add(CreateIsCompileTimeConstantPass(true));
752  optPM.add(CreateIntrinsicsOptPass());
753  optPM.add(CreateInstructionSimplifyPass());
754 
755  optPM.add(llvm::createMemCpyOptPass());
756  optPM.add(llvm::createSCCPPass());
757  optPM.add(llvm::createInstructionCombiningPass());
758  optPM.add(CreateInstructionSimplifyPass());
759  optPM.add(llvm::createJumpThreadingPass());
760  optPM.add(llvm::createCorrelatedValuePropagationPass());
761  optPM.add(llvm::createDeadStoreEliminationPass());
762  optPM.add(llvm::createAggressiveDCEPass());
763  optPM.add(llvm::createCFGSimplificationPass());
764  optPM.add(llvm::createInstructionCombiningPass());
765  optPM.add(CreateInstructionSimplifyPass());
766  optPM.add(CreatePeepholePass());
767  optPM.add(llvm::createFunctionInliningPass());
768  optPM.add(llvm::createAggressiveDCEPass());
769  optPM.add(llvm::createStripDeadPrototypesPass());
770  optPM.add(CreateMakeInternalFuncsStaticPass());
771  optPM.add(llvm::createGlobalDCEPass());
772  optPM.add(llvm::createConstantMergePass());
773 
774  // Should be the last
775  optPM.add(CreateFixBooleanSelectPass(), 400);
776 #ifdef ISPC_NVPTX_ENABLED
777  if (g->target->getISA() == Target::NVPTX) {
778  optPM.add(CreatePromoteLocalToPrivatePass());
779  optPM.add(llvm::createGlobalDCEPass());
780 
781  optPM.add(llvm::createTypeBasedAliasAnalysisPass());
782  optPM.add(llvm::createBasicAliasAnalysisPass());
783  optPM.add(llvm::createCFGSimplificationPass());
784  // Here clang has an experimental pass SROAPass instead of
785  // ScalarReplAggregatesPass. We should add it in the future.
786 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
787  optPM.add(llvm::createScalarReplAggregatesPass());
788 #else
789  optPM.add(llvm::createSROAPass());
790 #endif
791  optPM.add(llvm::createEarlyCSEPass());
792  optPM.add(llvm::createLowerExpectIntrinsicPass());
793  optPM.add(llvm::createTypeBasedAliasAnalysisPass());
794  optPM.add(llvm::createBasicAliasAnalysisPass());
795 
796  // Early optimizations to try to reduce the total amount of code to
797  // work with if we can
798  optPM.add(llvm::createReassociatePass());
799  optPM.add(llvm::createConstantPropagationPass());
800  optPM.add(llvm::createDeadInstEliminationPass());
801  optPM.add(llvm::createCFGSimplificationPass());
802 
803  optPM.add(llvm::createPromoteMemoryToRegisterPass());
804  optPM.add(llvm::createAggressiveDCEPass());
805 
806  optPM.add(llvm::createInstructionCombiningPass());
807  optPM.add(llvm::createDeadInstEliminationPass());
808 
809  // On to more serious optimizations
810  optPM.add(llvm::createInstructionCombiningPass());
811  optPM.add(llvm::createCFGSimplificationPass());
812  optPM.add(llvm::createPromoteMemoryToRegisterPass());
813  optPM.add(llvm::createGlobalOptimizerPass());
814  optPM.add(llvm::createReassociatePass());
815  optPM.add(llvm::createIPConstantPropagationPass());
816 
817  optPM.add(llvm::createDeadArgEliminationPass());
818  optPM.add(llvm::createInstructionCombiningPass());
819  optPM.add(llvm::createCFGSimplificationPass());
820  optPM.add(llvm::createPruneEHPass());
821  optPM.add(llvm::createFunctionAttrsPass());
822  optPM.add(llvm::createFunctionInliningPass());
823  optPM.add(llvm::createConstantPropagationPass());
824  optPM.add(llvm::createDeadInstEliminationPass());
825  optPM.add(llvm::createCFGSimplificationPass());
826 
827  optPM.add(llvm::createArgumentPromotionPass());
828 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_3
829  // Starting from 3.4 this functionality was moved to
830  // InstructionCombiningPass. See r184459 for details.
831  optPM.add(llvm::createSimplifyLibCallsPass());
832 #endif
833  optPM.add(llvm::createAggressiveDCEPass());
834  optPM.add(llvm::createInstructionCombiningPass());
835  optPM.add(llvm::createJumpThreadingPass());
836  optPM.add(llvm::createCFGSimplificationPass());
837  optPM.add(llvm::createInstructionCombiningPass());
838  optPM.add(llvm::createTailCallEliminationPass());
839 
840  optPM.add(llvm::createInstructionCombiningPass());
841 
842  optPM.add(llvm::createFunctionInliningPass());
843  optPM.add(llvm::createConstantPropagationPass());
844 
845  optPM.add(llvm::createInstructionCombiningPass());
846 
847  optPM.add(llvm::createIPSCCPPass());
848  optPM.add(llvm::createDeadArgEliminationPass());
849  optPM.add(llvm::createAggressiveDCEPass());
850  optPM.add(llvm::createInstructionCombiningPass());
851  optPM.add(llvm::createCFGSimplificationPass());
852 
853  optPM.add(llvm::createFunctionInliningPass());
854  optPM.add(llvm::createArgumentPromotionPass());
855  optPM.add(llvm::createInstructionCombiningPass());
856  optPM.add(llvm::createCFGSimplificationPass());
857  optPM.add(llvm::createReassociatePass());
858  optPM.add(llvm::createLoopRotatePass());
859  optPM.add(llvm::createLICMPass());
860 // optPM.add(llvm::createLoopUnswitchPass(false));
861 #if 1
862  optPM.add(llvm::createInstructionCombiningPass());
863  optPM.add(llvm::createIndVarSimplifyPass());
864  optPM.add(llvm::createLoopIdiomPass());
865  optPM.add(llvm::createLoopDeletionPass());
866  optPM.add(llvm::createLoopUnrollPass());
867  optPM.add(llvm::createGVNPass());
868  optPM.add(llvm::createMemCpyOptPass());
869  optPM.add(llvm::createSCCPPass());
870  optPM.add(llvm::createInstructionCombiningPass());
871  optPM.add(llvm::createJumpThreadingPass());
872  optPM.add(llvm::createCorrelatedValuePropagationPass());
873  optPM.add(llvm::createDeadStoreEliminationPass());
874  optPM.add(llvm::createAggressiveDCEPass());
875  optPM.add(llvm::createCFGSimplificationPass());
876  optPM.add(llvm::createInstructionCombiningPass());
877  optPM.add(llvm::createFunctionInliningPass());
878  optPM.add(llvm::createAggressiveDCEPass());
879  optPM.add(llvm::createStripDeadPrototypesPass());
880  optPM.add(llvm::createGlobalDCEPass());
881  optPM.add(llvm::createConstantMergePass());
882 #endif
883  }
884 #endif /* ISPC_NVPTX_ENABLED */
885  }
886 
887  // Finish up by making sure we didn't mess anything up in the IR along
888  // the way.
889  optPM.add(llvm::createVerifierPass(), LAST_OPT_NUMBER);
890  optPM.run(*module);
891 
892 #ifndef ISPC_NO_DUMPS
893  if (g->debugPrint) {
894  printf("\n*****\nFINAL OUTPUT\n*****\n");
895  module->dump();
896  }
897 #endif
898 }
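// Usage note (illustrative, based on how the rest of ispc is structured): the
// front end calls this once per module after IR generation, roughly as
// Optimize(module, g->opt.level); optLevel 0 corresponds to -O0 and takes the
// minimal pipeline above, while anything else takes the full pipeline.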
899 
900 ///////////////////////////////////////////////////////////////////////////
901 // IntrinsicsOpt
902 
903 /** This is a relatively simple optimization pass that does a few small
904  optimizations that LLVM's x86 optimizer doesn't currently handle.
905  (Specifically, MOVMSK of a constant can be replaced with the
906  corresponding constant value; BLENDVPS and AVX masked loads/stores with
907  either an 'all on' or an 'all off' mask can be replaced with simpler
908  operations.)
909 
910  @todo The better thing to do would be to submit a patch to LLVM to get
911  these; they're presumably pretty simple patterns to match.
912 */
913 class IntrinsicsOpt : public llvm::BasicBlockPass {
914  public:
915  IntrinsicsOpt() : BasicBlockPass(ID){};
916 
917 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
918  const char *getPassName() const { return "Intrinsics Cleanup Optimization"; }
919 #else // LLVM 4.0+
920  llvm::StringRef getPassName() const { return "Intrinsics Cleanup Optimization"; }
921 #endif
922  bool runOnBasicBlock(llvm::BasicBlock &BB);
923 
924  static char ID;
925 
926  private:
927  struct MaskInstruction {
928  MaskInstruction(llvm::Function *f) { function = f; }
929  llvm::Function *function;
930  };
931  std::vector<MaskInstruction> maskInstructions;
932 
933  /** Structure that records everything we need to know about a blend
934  instruction for this optimization pass.
935  */
936  struct BlendInstruction {
937  BlendInstruction(llvm::Function *f, uint64_t ao, int o0, int o1, int of)
938  : function(f), allOnMask(ao), op0(o0), op1(o1), opFactor(of) {}
939  /** Function pointer for the blend instruction */
940  llvm::Function *function;
941  /** Mask value for an "all on" mask for this instruction */
942  uint64_t allOnMask;
943  /** The operand number in the llvm CallInst corresponds to the
944  first operand to blend with. */
945  int op0;
946  /** The operand number in the CallInst corresponding to the second
947  operand to blend with. */
948  int op1;
949  /** The operand in the call inst where the blending factor is
950  found. */
951  int opFactor;
952  };
953  std::vector<BlendInstruction> blendInstructions;
954 
955  bool matchesMaskInstruction(llvm::Function *function);
956  BlendInstruction *matchingBlendInstruction(llvm::Function *function);
957 };
958 
959 char IntrinsicsOpt::ID = 0;
960 
961 /** Given an llvm::Value, return true if we can determine that it's an
962  undefined value. This only makes a weak attempt at chasing this down,
963  only detecting flat-out undef values, and bitcasts of undef values.
964 
965  @todo Is it worth working harder to find more of these? It starts to
966  get tricky, since having an undef operand doesn't necessarily mean that
967  the result will be undefined. (And for that matter, is there an LLVM
968  call that will do this for us?)
969  */
970 static bool lIsUndef(llvm::Value *value) {
971  if (llvm::isa<llvm::UndefValue>(value))
972  return true;
973 
974  llvm::BitCastInst *bci = llvm::dyn_cast<llvm::BitCastInst>(value);
975  if (bci)
976  return lIsUndef(bci->getOperand(0));
977 
978  return false;
979 }
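// For example (illustrative): lIsUndef() returns true both for a plain undef
// value and for something like "bitcast <8 x float> undef to <8 x i32>", but
// false for an insertelement chain that mixes undef and defined lanes.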
980 
981 bool IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
982  DEBUG_START_PASS("IntrinsicsOpt");
983 
984  // We can't initialize the mask/blend function vectors during pass initialization,
985  // as the functions may be optimized out by the time the pass is invoked.
986 
987  // All of the mask instructions we may encounter. Note that even if
988  // compiling for AVX, we may still encounter the regular 4-wide SSE
989  // MOVMSK instruction.
990  if (llvm::Function *ssei8Movmsk =
991  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_sse2_pmovmskb_128))) {
992  maskInstructions.push_back(ssei8Movmsk);
993  }
994  if (llvm::Function *sseFloatMovmsk =
995  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_sse_movmsk_ps))) {
996  maskInstructions.push_back(sseFloatMovmsk);
997  }
998  if (llvm::Function *__movmsk = m->module->getFunction("__movmsk")) {
999  maskInstructions.push_back(__movmsk);
1000  }
1001  if (llvm::Function *avxFloatMovmsk =
1002  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_movmsk_ps_256))) {
1003  maskInstructions.push_back(avxFloatMovmsk);
1004  }
1005 
1006  // And all of the blend instructions
1007  blendInstructions.push_back(BlendInstruction(
1008  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_sse41_blendvps)), 0xf, 0, 1, 2));
1009  blendInstructions.push_back(BlendInstruction(
1010  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_blendv_ps_256)), 0xff, 0, 1, 2));
1011 
1012  llvm::Function *avxMaskedLoad32 =
1013  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskload_ps_256));
1014  llvm::Function *avxMaskedLoad64 =
1015  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskload_pd_256));
1016  llvm::Function *avxMaskedStore32 =
1017  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskstore_ps_256));
1018  llvm::Function *avxMaskedStore64 =
1019  m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskstore_pd_256));
1020 
1021  bool modifiedAny = false;
1022 restart:
1023  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
1024  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
1025  if (callInst == NULL || callInst->getCalledFunction() == NULL)
1026  continue;
1027 
1028  BlendInstruction *blend = matchingBlendInstruction(callInst->getCalledFunction());
1029  if (blend != NULL) {
1030  llvm::Value *v[2] = {callInst->getArgOperand(blend->op0), callInst->getArgOperand(blend->op1)};
1031  llvm::Value *factor = callInst->getArgOperand(blend->opFactor);
1032 
1033  // If the values are the same, then there's no need to blend.
1034  if (v[0] == v[1]) {
1035  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, v[0]);
1036  modifiedAny = true;
1037  goto restart;
1038  }
1039 
1040  // If one of the two is undefined, we're allowed to replace
1041  // with the value of the other. (In other words, the only
1042  // valid case is that the blend factor ends up having a value
1043  // that only selects from the defined one of the two operands,
1044  // otherwise the result is undefined and any value is fine,
1045  // ergo the defined one is an acceptable result.)
1046  if (lIsUndef(v[0])) {
1047  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, v[1]);
1048  modifiedAny = true;
1049  goto restart;
1050  }
1051  if (lIsUndef(v[1])) {
1052  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, v[0]);
1053  modifiedAny = true;
1054  goto restart;
1055  }
1056 
1057  uint64_t mask;
1058  if (lGetMask(factor, &mask) == true) {
1059  llvm::Value *value = NULL;
1060  if (mask == 0)
1061  // Mask all off -> replace with the first blend value
1062  value = v[0];
1063  else if (mask == blend->allOnMask)
1064  // Mask all on -> replace with the second blend value
1065  value = v[1];
1066 
1067  if (value != NULL) {
1068  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, value);
1069  modifiedAny = true;
1070  goto restart;
1071  }
1072  }
1073  } else if (matchesMaskInstruction(callInst->getCalledFunction())) {
1074  llvm::Value *factor = callInst->getArgOperand(0);
1075  uint64_t mask;
1076  if (lGetMask(factor, &mask) == true) {
1077  // If the vector-valued mask has a known value, replace it
1078  // with the corresponding integer mask from its elements
1079  // high bits.
1080  llvm::Value *value = (callInst->getType() == LLVMTypes::Int32Type) ? LLVMInt32(mask) : LLVMInt64(mask);
1081  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, value);
1082  modifiedAny = true;
1083  goto restart;
1084  }
1085  } else if (callInst->getCalledFunction() == avxMaskedLoad32 ||
1086  callInst->getCalledFunction() == avxMaskedLoad64) {
1087  llvm::Value *factor = callInst->getArgOperand(1);
1088  uint64_t mask;
1089  if (lGetMask(factor, &mask) == true) {
1090  if (mask == 0) {
1091  // nothing being loaded, replace with undef value
1092  llvm::Type *returnType = callInst->getType();
1093  Assert(llvm::isa<llvm::VectorType>(returnType));
1094  llvm::Value *undefValue = llvm::UndefValue::get(returnType);
1095  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, undefValue);
1096  modifiedAny = true;
1097  goto restart;
1098  } else if (mask == 0xff) {
1099  // all lanes active; replace with a regular load
1100  llvm::Type *returnType = callInst->getType();
1101  Assert(llvm::isa<llvm::VectorType>(returnType));
1102  // cast the i8 * to the appropriate type
1103  const char *name = LLVMGetName(callInst->getArgOperand(0), "_cast");
1104  llvm::Value *castPtr = new llvm::BitCastInst(callInst->getArgOperand(0),
1105  llvm::PointerType::get(returnType, 0), name, callInst);
1106  lCopyMetadata(castPtr, callInst);
1107  int align;
1108  if (g->opt.forceAlignedMemory)
1109  align = g->target->getNativeVectorAlignment();
1110  else
1111  align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
1112  name = LLVMGetName(callInst->getArgOperand(0), "_load");
1113  llvm::Instruction *loadInst =
1114  new llvm::LoadInst(castPtr, name, false /* not volatile */, align, (llvm::Instruction *)NULL);
1115  lCopyMetadata(loadInst, callInst);
1116  llvm::ReplaceInstWithInst(callInst, loadInst);
1117  modifiedAny = true;
1118  goto restart;
1119  }
1120  }
1121  } else if (callInst->getCalledFunction() == avxMaskedStore32 ||
1122  callInst->getCalledFunction() == avxMaskedStore64) {
1123  // NOTE: mask is the 2nd parameter, not the 3rd one!!
1124  llvm::Value *factor = callInst->getArgOperand(1);
1125  uint64_t mask;
1126  if (lGetMask(factor, &mask) == true) {
1127  if (mask == 0) {
1128  // nothing actually being stored, just remove the inst
1129  callInst->eraseFromParent();
1130  modifiedAny = true;
1131  goto restart;
1132  } else if (mask == 0xff) {
1133  // all lanes storing, so replace with a regular store
1134  llvm::Value *rvalue = callInst->getArgOperand(2);
1135  llvm::Type *storeType = rvalue->getType();
1136  const char *name = LLVMGetName(callInst->getArgOperand(0), "_ptrcast");
1137  llvm::Value *castPtr = new llvm::BitCastInst(callInst->getArgOperand(0),
1138  llvm::PointerType::get(storeType, 0), name, callInst);
1139  lCopyMetadata(castPtr, callInst);
1140 
1141  llvm::StoreInst *storeInst = new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
1142  int align;
1143  if (g->opt.forceAlignedMemory)
1144  align = g->target->getNativeVectorAlignment();
1145  else
1146  align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
1147  storeInst->setAlignment(align);
1148  lCopyMetadata(storeInst, callInst);
1149  llvm::ReplaceInstWithInst(callInst, storeInst);
1150 
1151  modifiedAny = true;
1152  goto restart;
1153  }
1154  }
1155  }
1156  }
1157 
1158  DEBUG_END_PASS("IntrinsicsOpt");
1159 
1160  return modifiedAny;
1161 }
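// Illustrative before/after (not taken from the original source): a blend whose
// factor is a compile-time constant collapses to one of its operands.  For an
// 8-wide AVX blend such as
//
//     %r = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a,
//                                                       <8 x float> %b,
//                                                       <8 x float> %factor)
//
// where lGetMask() on %factor yields 0xff (all lanes on), the whole call is
// replaced by %b; with a mask of 0 it is replaced by %a.  Likewise, a movmsk
// call with a constant vector argument is folded to the corresponding scalar
// integer constant, an AVX masked load with an all-off mask becomes an undef
// value, a masked store with an all-off mask is removed, and both turn into
// regular loads/stores when the mask is all-on.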
1162 
1163 bool IntrinsicsOpt::matchesMaskInstruction(llvm::Function *function) {
1164  for (unsigned int i = 0; i < maskInstructions.size(); ++i) {
1165  if (maskInstructions[i].function != NULL && function == maskInstructions[i].function) {
1166  return true;
1167  }
1168  }
1169  return false;
1170 }
1171 
1172 IntrinsicsOpt::BlendInstruction *IntrinsicsOpt::matchingBlendInstruction(llvm::Function *function) {
1173  for (unsigned int i = 0; i < blendInstructions.size(); ++i) {
1174  if (blendInstructions[i].function != NULL && function == blendInstructions[i].function) {
1175  return &blendInstructions[i];
1176  }
1177  }
1178  return NULL;
1179 }
1180 
1181 static llvm::Pass *CreateIntrinsicsOptPass() { return new IntrinsicsOpt; }
1182 
1183 ///////////////////////////////////////////////////////////////////////////
1184 
1185 /** This simple optimization pass looks for a vector select instruction
1186  with an all-on or all-off constant mask, simplifying it to the
1187  appropriate operand if so.
1188 
1189  @todo The better thing to do would be to submit a patch to LLVM to get
1190  these; they're presumably pretty simple patterns to match.
1191 */
1192 class InstructionSimplifyPass : public llvm::BasicBlockPass {
1193  public:
1194  InstructionSimplifyPass() : BasicBlockPass(ID) {}
1195 
1196 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
1197  const char *getPassName() const { return "Vector Select Optimization"; }
1198 #else // LLVM 4.0+
1199  llvm::StringRef getPassName() const { return "Vector Select Optimization"; }
1200 #endif
1201  bool runOnBasicBlock(llvm::BasicBlock &BB);
1202 
1203  static char ID;
1204 
1205  private:
1206  static bool simplifySelect(llvm::SelectInst *selectInst, llvm::BasicBlock::iterator iter);
1207  static llvm::Value *simplifyBoolVec(llvm::Value *value);
1208  static bool simplifyCall(llvm::CallInst *callInst, llvm::BasicBlock::iterator iter);
1209 };
1210 
1211 char InstructionSimplifyPass::ID = 0;
1212 
1213 llvm::Value *InstructionSimplifyPass::simplifyBoolVec(llvm::Value *value) {
1214  llvm::TruncInst *trunc = llvm::dyn_cast<llvm::TruncInst>(value);
1215  if (trunc != NULL) {
1216  // Convert trunc({sext,zext}(i1 vector)) -> (i1 vector)
1217  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(value);
1218  if (sext && sext->getOperand(0)->getType() == LLVMTypes::Int1VectorType)
1219  return sext->getOperand(0);
1220 
1221  llvm::ZExtInst *zext = llvm::dyn_cast<llvm::ZExtInst>(value);
1222  if (zext && zext->getOperand(0)->getType() == LLVMTypes::Int1VectorType)
1223  return zext->getOperand(0);
1224  }
1225  /*
1226  // This optimization has discernible benefit on the perf
1227  // suite with the latest LLVM versions, but on 3.4+ (and maybe
1228  // even older versions) it can result in illegal operations,
1229  // so it's being disabled.
1230  llvm::ICmpInst *icmp = llvm::dyn_cast<llvm::ICmpInst>(value);
1231  if (icmp != NULL) {
1232  // icmp(ne, {sext,zext}(foo), zeroinitializer) -> foo
1233  if (icmp->getSignedPredicate() == llvm::CmpInst::ICMP_NE) {
1234  llvm::Value *op1 = icmp->getOperand(1);
1235  if (llvm::isa<llvm::ConstantAggregateZero>(op1)) {
1236  llvm::Value *op0 = icmp->getOperand(0);
1237  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(op0);
1238  if (sext)
1239  return sext->getOperand(0);
1240  llvm::ZExtInst *zext = llvm::dyn_cast<llvm::ZExtInst>(op0);
1241  if (zext)
1242  return zext->getOperand(0);
1243  }
1244  }
1245 
1246  }
1247  */
1248  return NULL;
1249 }
1250 
1251 bool InstructionSimplifyPass::simplifySelect(llvm::SelectInst *selectInst, llvm::BasicBlock::iterator iter) {
1252  if (selectInst->getType()->isVectorTy() == false)
1253  return false;
1254 
1255  llvm::Value *factor = selectInst->getOperand(0);
1256 
1257  // Simplify all-on or all-off mask values
1258  MaskStatus maskStatus = lGetMaskStatus(factor);
1259  llvm::Value *value = NULL;
1260  if (maskStatus == ALL_ON)
1261  // Mask all on -> replace with the first select value
1262  value = selectInst->getOperand(1);
1263  else if (maskStatus == ALL_OFF)
1264  // Mask all off -> replace with the second select value
1265  value = selectInst->getOperand(2);
1266  if (value != NULL) {
1267  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, value);
1268  return true;
1269  }
1270 
1271  // Sometimes earlier LLVM optimization passes generate unnecessarily
1272  // complex expressions for the selection vector, which in turn confuses
1273  // the code generators and leads to sub-optimal code (particularly for
1274  // 8 and 16-bit masks). We'll try to simplify them out here so that
1275  // the code generator patterns match..
1276  if ((factor = simplifyBoolVec(factor)) != NULL) {
1277  llvm::Instruction *newSelect = llvm::SelectInst::Create(factor, selectInst->getOperand(1),
1278  selectInst->getOperand(2), selectInst->getName());
1279  llvm::ReplaceInstWithInst(selectInst, newSelect);
1280  return true;
1281  }
1282 
1283  return false;
1284 }
1285 
1286 bool InstructionSimplifyPass::simplifyCall(llvm::CallInst *callInst, llvm::BasicBlock::iterator iter) {
1287  llvm::Function *calledFunc = callInst->getCalledFunction();
1288 
1289  // Turn a __movmsk call with a compile-time constant vector into the
1290  // equivalent scalar value.
1291  if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk"))
1292  return false;
1293 
1294  uint64_t mask;
1295  if (lGetMask(callInst->getArgOperand(0), &mask) == true) {
1296  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, LLVMInt64(mask));
1297  return true;
1298  }
1299  return false;
1300 }
1301 
1302 bool InstructionSimplifyPass::runOnBasicBlock(llvm::BasicBlock &bb) {
1303  DEBUG_START_PASS("InstructionSimplify");
1304 
1305  bool modifiedAny = false;
1306 
1307 restart:
1308  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
1309  llvm::SelectInst *selectInst = llvm::dyn_cast<llvm::SelectInst>(&*iter);
1310  if (selectInst && simplifySelect(selectInst, iter)) {
1311  modifiedAny = true;
1312  goto restart;
1313  }
1314  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
1315  if (callInst && simplifyCall(callInst, iter)) {
1316  modifiedAny = true;
1317  goto restart;
1318  }
1319  }
1320 
1321  DEBUG_END_PASS("InstructionSimplify");
1322 
1323  return modifiedAny;
1324 }
1325 
1326 static llvm::Pass *CreateInstructionSimplifyPass() { return new InstructionSimplifyPass; }
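// Illustrative before/after (not taken from the original source): with a
// compile-time constant selection mask such as
//
//     %v = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>,
//                 <4 x float> %a, <4 x float> %b
//
// lGetMaskStatus() reports ALL_ON, so the select is replaced outright by %a;
// an all-zero mask would yield %b instead.  A __movmsk call with a constant
// argument is folded to the corresponding i64 constant.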
1327 
1328 ///////////////////////////////////////////////////////////////////////////
1329 // ImproveMemoryOpsPass
1330 
1331 /** When the front-end emits gathers and scatters, it generates an array of
1332  vector-width pointers to represent the set of addresses to read from or
1333  write to. This optimization detects cases when the base pointer is a
1334  uniform pointer or when the indexing is into an array that can be
1335  converted into scatters/gathers from a single base pointer and an array
1336  of offsets.
1337 
1338  See for example the comments discussing the __pseudo_gather functions
1339  in builtins.cpp for more information about this.
1340  */
1341 class ImproveMemoryOpsPass : public llvm::BasicBlockPass {
1342  public:
1343  static char ID;
1344  ImproveMemoryOpsPass() : BasicBlockPass(ID) {}
1345 
1346 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
1347  const char *getPassName() const { return "Improve Memory Ops"; }
1348 #else // LLVM 4.0+
1349  llvm::StringRef getPassName() const { return "Improve Memory Ops"; }
1350 #endif
1351  bool runOnBasicBlock(llvm::BasicBlock &BB);
1352 };
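// Illustrative sketch (not part of the original source): the shape of the
// transformation this pass aims for.  Given ispc source along the lines of
//
//     uniform float a[1024];
//     float x = a[programIndex + j];
//
// the front end emits a __pseudo_gather from a vector of full per-lane
// pointers; the routines below recover the single uniform base pointer plus a
// vector of per-lane offsets, so the gather can be emitted in "base + offsets"
// form (and, elsewhere in this pass, gathers over consecutive offsets can
// become regular vector loads).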
1353 
1354 char ImproveMemoryOpsPass::ID = 0;
1355 
1356 /** Check to make sure that this value is actually a pointer in the end.
1357  We need to make sure that given an expression like vec(offset) +
1358  ptr2int(ptr), lGetBasePointer() doesn't return vec(offset) for the base
1359  pointer such that we then treat ptr2int(ptr) as an offset. This ends
1360  up being important so that we don't generate LLVM GEP instructions like
1361  "gep inttoptr 8, i64 %ptr", which in turn can lead to incorrect code
1362  since LLVM's pointer aliasing analysis assumes that operands after the
1363  first one to a GEP aren't pointers.
1364  */
1365 static llvm::Value *lCheckForActualPointer(llvm::Value *v) {
1366  if (v == NULL) {
1367  return NULL;
1368  } else if (llvm::isa<llvm::PointerType>(v->getType())) {
1369  return v;
1370  } else if (llvm::isa<llvm::PtrToIntInst>(v)) {
1371  return v;
1372  }
1373 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_7
1374  // This one is tricky, as it's a heuristic tuned for LLVM 3.7+, which may
1375  // optimize a load of a double* followed by a ptrtoint into a straight load of an i64.
1376  // This heuristic should be good enough to catch all the cases we should
1377  // detect and nothing else.
1378  else if (llvm::isa<llvm::LoadInst>(v)) {
1379  return v;
1380  }
1381 #endif
1382  else if (llvm::CastInst *ci = llvm::dyn_cast<llvm::CastInst>(v)) {
1383  llvm::Value *t = lCheckForActualPointer(ci->getOperand(0));
1384  if (t == NULL) {
1385  return NULL;
1386  } else {
1387  return v;
1388  }
1389  } else {
1390  llvm::ConstantExpr *uce = llvm::dyn_cast<llvm::ConstantExpr>(v);
1391  if (uce != NULL && uce->getOpcode() == llvm::Instruction::PtrToInt)
1392  return v;
1393  return NULL;
1394  }
1395 }
1396 
1397 /** Given a llvm::Value representing a varying pointer, this function
1398  checks to see if all of the elements of the vector have the same value
1399  (i.e. there's a common base pointer). If a broadcast has already been detected,
1400  it checks that the first element of the vector is not undef. If either condition
1401  holds, it returns the common pointer value; otherwise it returns NULL.
1402  */
1403 static llvm::Value *lGetBasePointer(llvm::Value *v, llvm::Instruction *insertBefore, bool broadcastDetected) {
1404  if (llvm::isa<llvm::InsertElementInst>(v) || llvm::isa<llvm::ShuffleVectorInst>(v)) {
1405  // If we have already detected broadcast we want to look for
1406  // the vector with the first not-undef element
1407  llvm::Value *element = LLVMFlattenInsertChain(v, g->target->getVectorWidth(), true, false, broadcastDetected);
1408  // TODO: it's probably ok to allow undefined elements and return
1409  // the base pointer if all of the other elements have the same
1410  // value.
1411  if (element != NULL) {
1412  // all elements are the same and not NULLs
1413  return lCheckForActualPointer(element);
1414  } else {
1415  return NULL;
1416  }
1417  }
1418 
1419  // This case comes up with global/static arrays
1420  if (llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(v)) {
1421  return lCheckForActualPointer(cv->getSplatValue());
1422  } else if (llvm::ConstantDataVector *cdv = llvm::dyn_cast<llvm::ConstantDataVector>(v)) {
1423  return lCheckForActualPointer(cdv->getSplatValue());
1424  }
1425  // It's a bit tricky to handle operations on pointers that have been cast to integers of a different bit width,
1426  // but it is sometimes useful, so we handle that case here.
1427  else if (llvm::CastInst *ci = llvm::dyn_cast<llvm::CastInst>(v)) {
1428  llvm::Value *t = lGetBasePointer(ci->getOperand(0), insertBefore, broadcastDetected);
1429  if (t == NULL) {
1430  return NULL;
1431  } else {
1432  return llvm::CastInst::Create(ci->getOpcode(), t, ci->getType()->getScalarType(), LLVMGetName(t, "_cast"),
1433  insertBefore);
1434  }
1435  }
1436 
1437  return NULL;
1438 }
1439 
1440 /** Given the two operands to a constant add expression, see if we have the
1441  the form "base pointer + offset", where op0 is the base pointer and op1 is
1442  the offset; if so return the base and the offset. */
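// Illustrative example (hypothetical constant expression): given the operands
//   op0 = ptrtoint ([16 x float]* @array to i64)
//   op1 = i64 8
// this helper returns op0 as the base and sets *delta to the i64 constant 8.
// If op0 isn't a ptrtoint constant expression or op1 isn't a ConstantInt, it
// returns NULL and the caller tries the operands in the opposite order.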
1443 static llvm::Constant *lGetConstantAddExprBaseOffset(llvm::Constant *op0, llvm::Constant *op1, llvm::Constant **delta) {
1444  llvm::ConstantExpr *op = llvm::dyn_cast<llvm::ConstantExpr>(op0);
1445  if (op == NULL || op->getOpcode() != llvm::Instruction::PtrToInt)
1446  // the first operand isn't a pointer
1447  return NULL;
1448 
1449  llvm::ConstantInt *opDelta = llvm::dyn_cast<llvm::ConstantInt>(op1);
1450  if (opDelta == NULL)
1451  // the second operand isn't an integer constant
1452  return NULL;
1453 
1454  *delta = opDelta;
1455  return op0;
1456 }
1457 
1458 static llvm::Value *lExtractFromInserts(llvm::Value *v, unsigned int index) {
1459  llvm::InsertValueInst *iv = llvm::dyn_cast<llvm::InsertValueInst>(v);
1460  if (iv == NULL)
1461  return NULL;
1462 
1463  Assert(iv->hasIndices() && iv->getNumIndices() == 1);
1464  if (iv->getIndices()[0] == index)
1465  return iv->getInsertedValueOperand();
1466  else
1467  return lExtractFromInserts(iv->getAggregateOperand(), index);
1468 }
1469 
1470 /** Given a varying pointer in ptrs, this function checks to see if it can
1471  be determined to be indexing from a common uniform base pointer. If
1472  so, the function returns the base pointer llvm::Value and initializes
1473  *offsets with an int vector of the per-lane offsets
1474  */
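// Illustrative example (hypothetical IR; vector width 8 assumed): a varying
// pointer that indexes off a common base, e.g.
//   %b    = ptrtoint float* %p to i64
//   %bvec = broadcast of %b to all lanes (insertelement + shufflevector)
//   %ptrs = add <8 x i64> %bvec, <i64 0, i64 4, i64 8, ...>
// decomposes here into the uniform base (the ptrtoint of %p) and the per-lane
// offset vector <0, 4, 8, ...>; the Add case below recurses into each operand
// to find that split.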
1475 static llvm::Value *lGetBasePtrAndOffsets(llvm::Value *ptrs, llvm::Value **offsets, llvm::Instruction *insertBefore) {
1476 #ifndef ISPC_NO_DUMPS
1477  if (g->debugPrint) {
1478  fprintf(stderr, "lGetBasePtrAndOffsets\n");
1479  LLVMDumpValue(ptrs);
1480  }
1481 #endif
1482 
1483  bool broadcastDetected = false;
1484  // Looking for %gep_offset = shufflevector <8 x i64> %0, <8 x i64> undef, <8 x i32> zeroinitializer
1485  llvm::ShuffleVectorInst *shuffle = llvm::dyn_cast<llvm::ShuffleVectorInst>(ptrs);
1486  if (shuffle != NULL) {
1487  llvm::Value *indices = shuffle->getOperand(2);
1488  llvm::Value *vec = shuffle->getOperand(1);
1489  if (lIsUndef(vec) && llvm::isa<llvm::ConstantAggregateZero>(indices)) {
1490  broadcastDetected = true;
1491  }
1492  }
1493  llvm::Value *base = lGetBasePointer(ptrs, insertBefore, broadcastDetected);
1494  if (base != NULL) {
1495  // We have a straight up varying pointer with no indexing that's
1496  // actually all the same value.
1497  if (g->target->is32Bit())
1498  *offsets = LLVMInt32Vector(0);
1499  else
1500  *offsets = LLVMInt64Vector((int64_t)0);
1501 
1502  if (broadcastDetected) {
1503  llvm::Value *op = shuffle->getOperand(0);
1504  llvm::BinaryOperator *bop_var = llvm::dyn_cast<llvm::BinaryOperator>(op);
1505  if (bop_var != NULL && bop_var->getOpcode() == llvm::Instruction::Add) {
1506  // Here we expect a ConstantVector such as
1507  // <i64 4, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>
1508  llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(bop_var->getOperand(1));
1509  if (cv != NULL) {
1510  llvm::Value *zeroMask =
1511  llvm::ConstantVector::getSplat(cv->getType()->getVectorNumElements(),
1512  llvm::Constant::getNullValue(llvm::Type::getInt32Ty(*g->ctx)));
1513  // Create offset
1514  llvm::Value *shuffle_offset = new llvm::ShuffleVectorInst(cv, llvm::UndefValue::get(cv->getType()),
1515  zeroMask, "shuffle", bop_var);
1516  *offsets = llvm::BinaryOperator::Create(llvm::Instruction::Add, *offsets, shuffle_offset,
1517  "new_offsets", insertBefore);
1518  }
1519  }
1520  }
1521  return base;
1522  }
1523 
1524  llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(ptrs);
1525  if (bop != NULL && bop->getOpcode() == llvm::Instruction::Add) {
1526  // If we have a common pointer plus something, then we're also
1527  // good.
1528  if ((base = lGetBasePtrAndOffsets(bop->getOperand(0), offsets, insertBefore)) != NULL) {
1529  *offsets = llvm::BinaryOperator::Create(llvm::Instruction::Add, *offsets, bop->getOperand(1), "new_offsets",
1530  insertBefore);
1531  return base;
1532  } else if ((base = lGetBasePtrAndOffsets(bop->getOperand(1), offsets, insertBefore)) != NULL) {
1533  *offsets = llvm::BinaryOperator::Create(llvm::Instruction::Add, *offsets, bop->getOperand(0), "new_offsets",
1534  insertBefore);
1535  return base;
1536  }
1537  }
1538  llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(ptrs);
1539  if (cv != NULL) {
1540  // Indexing into global arrays can lead to this form, with
1541  // ConstantVectors..
1542  llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
1543  for (int i = 0; i < (int)cv->getNumOperands(); ++i) {
1544  llvm::Constant *c = llvm::dyn_cast<llvm::Constant>(cv->getOperand(i));
1545  if (c == NULL)
1546  return NULL;
1547  elements.push_back(c);
1548  }
1549 
1550  llvm::Constant *delta[ISPC_MAX_NVEC];
1551  for (unsigned int i = 0; i < elements.size(); ++i) {
1552  // For each element, try to decompose it into either a straight
1553  // up base pointer, or a base pointer plus an integer value.
1554  llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(elements[i]);
1555  if (ce == NULL)
1556  return NULL;
1557 
1558  delta[i] = NULL;
1559  llvm::Value *elementBase = NULL; // base pointer for this element
1560  if (ce->getOpcode() == llvm::Instruction::PtrToInt) {
1561  // If the element is just a ptr to int instruction, treat
1562  // it as having an offset of zero
1563  elementBase = ce;
1564  delta[i] = g->target->is32Bit() ? LLVMInt32(0) : LLVMInt64(0);
1565  } else if (ce->getOpcode() == llvm::Instruction::Add) {
1566  // Try both orderings of the operands to see if we can get
1567  // a pointer+offset out of them.
1568  elementBase = lGetConstantAddExprBaseOffset(ce->getOperand(0), ce->getOperand(1), &delta[i]);
1569  if (elementBase == NULL)
1570  elementBase = lGetConstantAddExprBaseOffset(ce->getOperand(1), ce->getOperand(0), &delta[i]);
1571  }
1572 
1573  // We weren't able to find a base pointer in the above. (We
1574  // don't expect this to happen; if it does, it may be necessary
1575  // to handle more cases in the decomposition above.)
1576  if (elementBase == NULL)
1577  return NULL;
1578 
1579  Assert(delta[i] != NULL);
1580  if (base == NULL)
1581  // The first time we've found a base pointer
1582  base = elementBase;
1583  else if (base != elementBase)
1584  // Different program instances have different base
1585  // pointers, so no luck.
1586  return NULL;
1587  }
1588 
1589  Assert(base != NULL);
1590  llvm::ArrayRef<llvm::Constant *> deltas(&delta[0], &delta[elements.size()]);
1591  *offsets = llvm::ConstantVector::get(deltas);
1592  return base;
1593  }
1594 
1595  llvm::ExtractValueInst *ev = llvm::dyn_cast<llvm::ExtractValueInst>(ptrs);
1596  if (ev != NULL) {
1597  Assert(ev->getNumIndices() == 1);
1598  int index = ev->getIndices()[0];
1599  ptrs = lExtractFromInserts(ev->getAggregateOperand(), index);
1600  if (ptrs != NULL)
1601  return lGetBasePtrAndOffsets(ptrs, offsets, insertBefore);
1602  }
1603 
1604  return NULL;
1605 }
1606 
1607 /** Given a vector expression in vec, separate it into a compile-time
1608  constant component and a variable component, returning the two parts in
1609  *constOffset and *variableOffset. (It should be the case that the sum
1610  of these two is exactly equal to the original vector.)
1611 
1612  This routine only handles some (important) patterns; in some cases it
1613  will fail and return components that are actually compile-time
1614  constants in *variableOffset.
1615 
1616  Finally, if there aren't any constant (or, respectively, variable)
1617  components, the corresponding return value may be set to NULL.
1618  */
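// Illustrative example (hypothetical values): for an offset vector computed as
//   %off = add <8 x i32> %varying_index, <i32 16, i32 16, ..., i32 16>
// this routine returns *variableOffset = %varying_index and
// *constOffset = <16, 16, ..., 16>, so the constant part can later be folded
// into the constant displacement of the gather/scatter.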
1619 static void lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset, llvm::Value **variableOffset,
1620  llvm::Instruction *insertBefore) {
1621  if (llvm::isa<llvm::ConstantVector>(vec) || llvm::isa<llvm::ConstantDataVector>(vec) ||
1622  llvm::isa<llvm::ConstantAggregateZero>(vec)) {
1623  *constOffset = vec;
1624  *variableOffset = NULL;
1625  return;
1626  }
1627 
1628  llvm::CastInst *cast = llvm::dyn_cast<llvm::CastInst>(vec);
1629  if (cast != NULL) {
1630  // Check the cast target.
1631  llvm::Value *co, *vo;
1632  lExtractConstantOffset(cast->getOperand(0), &co, &vo, insertBefore);
1633 
1634  // make new cast instructions for the two parts
1635  if (co == NULL)
1636  *constOffset = NULL;
1637  else
1638  *constOffset =
1639  llvm::CastInst::Create(cast->getOpcode(), co, cast->getType(), LLVMGetName(co, "_cast"), insertBefore);
1640  if (vo == NULL)
1641  *variableOffset = NULL;
1642  else
1643  *variableOffset =
1644  llvm::CastInst::Create(cast->getOpcode(), vo, cast->getType(), LLVMGetName(vo, "_cast"), insertBefore);
1645  return;
1646  }
1647 
1648  llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(vec);
1649  if (bop != NULL) {
1650  llvm::Value *op0 = bop->getOperand(0);
1651  llvm::Value *op1 = bop->getOperand(1);
1652  llvm::Value *c0, *v0, *c1, *v1;
1653 
1654  if (bop->getOpcode() == llvm::Instruction::Add) {
1655  lExtractConstantOffset(op0, &c0, &v0, insertBefore);
1656  lExtractConstantOffset(op1, &c1, &v1, insertBefore);
1657 
1658  if (c0 == NULL || llvm::isa<llvm::ConstantAggregateZero>(c0))
1659  *constOffset = c1;
1660  else if (c1 == NULL || llvm::isa<llvm::ConstantAggregateZero>(c1))
1661  *constOffset = c0;
1662  else
1663  *constOffset = llvm::BinaryOperator::Create(llvm::Instruction::Add, c0, c1, LLVMGetName("add", c0, c1),
1664  insertBefore);
1665 
1666  if (v0 == NULL || llvm::isa<llvm::ConstantAggregateZero>(v0))
1667  *variableOffset = v1;
1668  else if (v1 == NULL || llvm::isa<llvm::ConstantAggregateZero>(v1))
1669  *variableOffset = v0;
1670  else
1671  *variableOffset = llvm::BinaryOperator::Create(llvm::Instruction::Add, v0, v1,
1672  LLVMGetName("add", v0, v1), insertBefore);
1673  return;
1674  } else if (bop->getOpcode() == llvm::Instruction::Shl) {
1675  lExtractConstantOffset(op0, &c0, &v0, insertBefore);
1676  lExtractConstantOffset(op1, &c1, &v1, insertBefore);
1677 
1678  // Given the product of constant and variable terms, we have:
1679  // (c0 + v0) * (2^(c1 + v1)) = c0 * 2^c1 * 2^v1 + v0 * 2^c1 * 2^v1
1680  // We can optimize only if v1 == NULL.
1681  if ((v1 != NULL) || (c0 == NULL) || (c1 == NULL)) {
1682  *constOffset = NULL;
1683  *variableOffset = vec;
1684  } else if (v0 == NULL) {
1685  *constOffset = vec;
1686  *variableOffset = NULL;
1687  } else {
1688  *constOffset = llvm::BinaryOperator::Create(llvm::Instruction::Shl, c0, c1, LLVMGetName("shl", c0, c1),
1689  insertBefore);
1690  *variableOffset = llvm::BinaryOperator::Create(llvm::Instruction::Shl, v0, c1,
1691  LLVMGetName("shl", v0, c1), insertBefore);
1692  }
1693  return;
1694  } else if (bop->getOpcode() == llvm::Instruction::Mul) {
1695  lExtractConstantOffset(op0, &c0, &v0, insertBefore);
1696  lExtractConstantOffset(op1, &c1, &v1, insertBefore);
1697 
1698  // Given the product of constant and variable terms, we have:
1699  // (c0 + v0) * (c1 + v1) == (c0 c1) + (v0 c1 + c0 v1 + v0 v1)
1700  // Note that the first term is a constant and the last three are
1701  // variable.
1702  if (c0 != NULL && c1 != NULL)
1703  *constOffset = llvm::BinaryOperator::Create(llvm::Instruction::Mul, c0, c1, LLVMGetName("mul", c0, c1),
1704  insertBefore);
1705  else
1706  *constOffset = NULL;
1707 
1708  llvm::Value *va = NULL, *vb = NULL, *vc = NULL;
1709  if (v0 != NULL && c1 != NULL)
1710  va = llvm::BinaryOperator::Create(llvm::Instruction::Mul, v0, c1, LLVMGetName("mul", v0, c1),
1711  insertBefore);
1712  if (c0 != NULL && v1 != NULL)
1713  vb = llvm::BinaryOperator::Create(llvm::Instruction::Mul, c0, v1, LLVMGetName("mul", c0, v1),
1714  insertBefore);
1715  if (v0 != NULL && v1 != NULL)
1716  vc = llvm::BinaryOperator::Create(llvm::Instruction::Mul, v0, v1, LLVMGetName("mul", v0, v1),
1717  insertBefore);
1718 
1719  llvm::Value *vab = NULL;
1720  if (va != NULL && vb != NULL)
1721  vab = llvm::BinaryOperator::Create(llvm::Instruction::Add, va, vb, LLVMGetName("add", va, vb),
1722  insertBefore);
1723  else if (va != NULL)
1724  vab = va;
1725  else
1726  vab = vb;
1727 
1728  if (vab != NULL && vc != NULL)
1729  *variableOffset = llvm::BinaryOperator::Create(llvm::Instruction::Add, vab, vc,
1730  LLVMGetName("add", vab, vc), insertBefore);
1731  else if (vab != NULL)
1732  *variableOffset = vab;
1733  else
1734  *variableOffset = vc;
1735 
1736  return;
1737  }
1738  }
1739 
1740  // Nothing matched, just return what we have as a variable component
1741  *constOffset = NULL;
1742  *variableOffset = vec;
1743 }
1744 
1745 /* Returns true if the given value is a constant vector of integers with
1746  the same value in all of the elements. (Returns the splatted value in
1747  *splat, if so). */
1748 static bool lIsIntegerSplat(llvm::Value *v, int *splat) {
1749  llvm::ConstantDataVector *cvec = llvm::dyn_cast<llvm::ConstantDataVector>(v);
1750  if (cvec == NULL)
1751  return false;
1752 
1753  llvm::Constant *splatConst = cvec->getSplatValue();
1754  if (splatConst == NULL)
1755  return false;
1756 
1757  llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(splatConst);
1758  if (ci == NULL)
1759  return false;
1760 
1761  int64_t splatVal = ci->getSExtValue();
1762  *splat = (int)splatVal;
1763  return true;
1764 }
1765 
1766 static llvm::Value *lExtract248Scale(llvm::Value *splatOperand, int splatValue, llvm::Value *otherOperand,
1767  llvm::Value **result) {
1768  if (splatValue == 2 || splatValue == 4 || splatValue == 8) {
1769  *result = otherOperand;
1770  return LLVMInt32(splatValue);
1771  }
1772  // Even if we don't have a common scale by exactly 2, 4, or 8, we'll
1773  // see if we can pull out that much of the scale anyway; this may in
1774  // turn allow other optimizations later.
1775  for (int scale = 8; scale >= 2; scale /= 2) {
1776  llvm::Instruction *insertBefore = llvm::dyn_cast<llvm::Instruction>(*result);
1777  Assert(insertBefore != NULL);
1778 
1779  if ((splatValue % scale) == 0) {
1780  // *result = otherOperand * splatOperand / scale;
1781  llvm::Value *splatScaleVec = (splatOperand->getType() == LLVMTypes::Int32VectorType)
1782  ? LLVMInt32Vector(scale)
1783  : LLVMInt64Vector(scale);
1784  llvm::Value *splatDiv =
1785  llvm::BinaryOperator::Create(llvm::Instruction::SDiv, splatOperand, splatScaleVec, "div", insertBefore);
1786  *result = llvm::BinaryOperator::Create(llvm::Instruction::Mul, splatDiv, otherOperand, "mul", insertBefore);
1787  return LLVMInt32(scale);
1788  }
1789  }
1790  return LLVMInt32(1);
1791 }
1792 
1793 /** Given a vector of integer offsets to a base pointer being used for a
1794  gather or a scatter, see if its root operation is a multiply of some
1795  vector of values by a splat of 2s, 4s, or 8s. If not, an i32 value of 1 is returned.
1796 
1797  If it is, return an i32 value of 2, 4, or 8 from the function and modify
1798  *vec so that it points to the operand that is being multiplied by
1799  2/4/8.
1800 
1801  We go through all this trouble so that we can pass the i32 scale factor
1802  to the {gather,scatter}_base_offsets function as a separate scale
1803  factor for the offsets. This in turn lets the LLVM
1804  x86 code generator match it and apply x86's free scale by 2x, 4x, or
1805  8x to one of two registers being added together for an addressing
1806  calculation.
1807  */
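// Illustrative example (hypothetical values): if the offsets were computed as
//   %offsets = mul <8 x i32> %index, <i32 4, i32 4, ..., i32 4>
// then this returns the i32 constant 4 and rewrites *vec to point at %index,
// so the eventual address computation can map onto x86 addressing of the form
// [base + index*4 + disp] without an explicit multiply.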
1808 static llvm::Value *lExtractOffsetVector248Scale(llvm::Value **vec) {
1809  llvm::CastInst *cast = llvm::dyn_cast<llvm::CastInst>(*vec);
1810  if (cast != NULL) {
1811  llvm::Value *castOp = cast->getOperand(0);
1812  // Check the cast target.
1813  llvm::Value *scale = lExtractOffsetVector248Scale(&castOp);
1814  if (scale == NULL)
1815  return NULL;
1816 
1817  // make a new cast instruction so that we end up with the right
1818  // type
1819  *vec = llvm::CastInst::Create(cast->getOpcode(), castOp, cast->getType(), "offset_cast", cast);
1820  return scale;
1821  }
1822 
1823  // If we don't have a binary operator, then just give up
1824  llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(*vec);
1825  if (bop == NULL)
1826  return LLVMInt32(1);
1827 
1828  llvm::Value *op0 = bop->getOperand(0), *op1 = bop->getOperand(1);
1829  if (bop->getOpcode() == llvm::Instruction::Add) {
1830  if (llvm::isa<llvm::ConstantAggregateZero>(op0)) {
1831  *vec = op1;
1832  return lExtractOffsetVector248Scale(vec);
1833  } else if (llvm::isa<llvm::ConstantAggregateZero>(op1)) {
1834  *vec = op0;
1835  return lExtractOffsetVector248Scale(vec);
1836  } else {
1837  llvm::Value *s0 = lExtractOffsetVector248Scale(&op0);
1838  llvm::Value *s1 = lExtractOffsetVector248Scale(&op1);
1839  if (s0 == s1) {
1840  *vec = llvm::BinaryOperator::Create(llvm::Instruction::Add, op0, op1, "new_add", bop);
1841  return s0;
1842  } else
1843  return LLVMInt32(1);
1844  }
1845  } else if (bop->getOpcode() == llvm::Instruction::Mul) {
1846  // Check each operand for being one of the scale factors we care about.
1847  int splat;
1848  if (lIsIntegerSplat(op0, &splat))
1849  return lExtract248Scale(op0, splat, op1, vec);
1850  else if (lIsIntegerSplat(op1, &splat))
1851  return lExtract248Scale(op1, splat, op0, vec);
1852  else
1853  return LLVMInt32(1);
1854  } else
1855  return LLVMInt32(1);
1856 }
1857 
1858 #if 0
1859 static llvm::Value *
1860 lExtractUniforms(llvm::Value **vec, llvm::Instruction *insertBefore) {
1861  fprintf(stderr, " lextract: ");
1862  (*vec)->dump();
1863  fprintf(stderr, "\n");
1864 
1865  if (llvm::isa<llvm::ConstantVector>(*vec) ||
1866  llvm::isa<llvm::ConstantDataVector>(*vec) ||
1867  llvm::isa<llvm::ConstantAggregateZero>(*vec))
1868  return NULL;
1869 
1870  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(*vec);
1871  if (sext != NULL) {
1872  llvm::Value *sextOp = sext->getOperand(0);
1873  // Check the sext target.
1874  llvm::Value *unif = lExtractUniforms(&sextOp, insertBefore);
1875  if (unif == NULL)
1876  return NULL;
1877 
1878  // make a new sext instruction so that we end up with the right
1879  // type
1880  *vec = new llvm::SExtInst(sextOp, sext->getType(), "offset_sext", sext);
1881  return unif;
1882  }
1883 
1884  if (LLVMVectorValuesAllEqual(*vec)) {
1885  // FIXME: we may want to redo all of the expression here, in scalar
1886  // form (if at all possible), for code quality...
1887  llvm::Value *unif =
1888  llvm::ExtractElementInst::Create(*vec, LLVMInt32(0),
1889  "first_uniform", insertBefore);
1890  *vec = NULL;
1891  return unif;
1892  }
1893 
1894  llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(*vec);
1895  if (bop == NULL)
1896  return NULL;
1897 
1898  llvm::Value *op0 = bop->getOperand(0), *op1 = bop->getOperand(1);
1899  if (bop->getOpcode() == llvm::Instruction::Add) {
1900  llvm::Value *s0 = lExtractUniforms(&op0, insertBefore);
1901  llvm::Value *s1 = lExtractUniforms(&op1, insertBefore);
1902  if (s0 == NULL && s1 == NULL)
1903  return NULL;
1904 
1905  if (op0 == NULL)
1906  *vec = op1;
1907  else if (op1 == NULL)
1908  *vec = op0;
1909  else
1910  *vec = llvm::BinaryOperator::Create(llvm::Instruction::Add,
1911  op0, op1, "new_add", insertBefore);
1912 
1913  if (s0 == NULL)
1914  return s1;
1915  else if (s1 == NULL)
1916  return s0;
1917  else
1918  return llvm::BinaryOperator::Create(llvm::Instruction::Add, s0, s1,
1919  "add_unif", insertBefore);
1920  }
1921 #if 0
1922  else if (bop->getOpcode() == llvm::Instruction::Mul) {
1923  // Check each operand for being one of the scale factors we care about.
1924  int splat;
1925  if (lIs248Splat(op0, &splat)) {
1926  *vec = op1;
1927  return LLVMInt32(splat);
1928  }
1929  else if (lIs248Splat(op1, &splat)) {
1930  *vec = op0;
1931  return LLVMInt32(splat);
1932  }
1933  else
1934  return LLVMInt32(1);
1935  }
1936 #endif
1937  else
1938  return NULL;
1939 }
1940 
1941 
1942 static void
1943 lExtractUniformsFromOffset(llvm::Value **basePtr, llvm::Value **offsetVector,
1944  llvm::Value *offsetScale,
1945  llvm::Instruction *insertBefore) {
1946 #if 1
1947  (*basePtr)->dump();
1948  printf("\n");
1949  (*offsetVector)->dump();
1950  printf("\n");
1951  offsetScale->dump();
1952  printf("-----\n");
1953 #endif
1954 
1955  llvm::Value *uniformDelta = lExtractUniforms(offsetVector, insertBefore);
1956  if (uniformDelta == NULL)
1957  return;
1958 
1959  *basePtr = lGEPInst(*basePtr, arrayRef, "new_base", insertBefore);
1960 
1961  // this should only happen if we have only uniforms, but that in turn
1962  // shouldn't be a gather/scatter!
1963  Assert(*offsetVector != NULL);
1964 }
1965 #endif
1966 
1967 static bool lVectorIs32BitInts(llvm::Value *v) {
1968  int nElts;
1969  int64_t elts[ISPC_MAX_NVEC];
1970  if (!LLVMExtractVectorInts(v, elts, &nElts))
1971  return false;
1972 
1973  for (int i = 0; i < nElts; ++i)
1974  if ((int32_t)elts[i] != elts[i])
1975  return false;
1976 
1977  return true;
1978 }
1979 
1980 /** Check to see if the two offset vectors can safely be represented with
1981  32-bit values. If so, return true and update the pointed-to
1982  llvm::Value *s to be the 32-bit equivalents. */
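// Illustrative example (hypothetical values): under 32-bit addressing, a
// variable offset of the form
//   %v64 = sext <8 x i32> %v32 to <8 x i64>
// is replaced by %v32 directly, and a constant offset such as
// <i64 8, i64 8, ...> is truncated to <i32 8, i32 8, ...>, which allows the
// 32-bit base+offsets variants of the pseudo gather/scatter calls to be used.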
1983 static bool lOffsets32BitSafe(llvm::Value **variableOffsetPtr, llvm::Value **constOffsetPtr,
1984  llvm::Instruction *insertBefore) {
1985  llvm::Value *variableOffset = *variableOffsetPtr;
1986  llvm::Value *constOffset = *constOffsetPtr;
1987 
1988  if (variableOffset->getType() != LLVMTypes::Int32VectorType) {
1989  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(variableOffset);
1990  if (sext != NULL && sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType)
1991  // sext of a 32-bit vector -> the 32-bit vector is good
1992  variableOffset = sext->getOperand(0);
1993  else if (lVectorIs32BitInts(variableOffset))
1994  // The only constant vector we should have here is a vector of
1995  // all zeros (i.e. a ConstantAggregateZero), but just in case,
1996  // do the more general check with lVectorIs32BitInts().
1997  variableOffset = new llvm::TruncInst(variableOffset, LLVMTypes::Int32VectorType,
1998  LLVMGetName(variableOffset, "_trunc"), insertBefore);
1999  else
2000  return false;
2001  }
2002 
2003  if (constOffset->getType() != LLVMTypes::Int32VectorType) {
2004  if (lVectorIs32BitInts(constOffset)) {
2005  // Truncate them so we have a 32-bit vector type for them.
2006  constOffset = new llvm::TruncInst(constOffset, LLVMTypes::Int32VectorType,
2007  LLVMGetName(constOffset, "_trunc"), insertBefore);
2008  } else {
2009  // FIXME: otherwise we just assume that all constant offsets
2010  // can actually always fit into 32-bits... (This could be
2011  // wrong, but it should be only in pretty esoteric cases). We
2012  // make this assumption for now since we sometimes generate
2013  // constants that need constant folding before we really have a
2014  // constant vector out of them, and
2015  // llvm::ConstantFoldInstruction() doesn't seem to be doing
2016  // enough for us in some cases if we call it from here.
2017  constOffset = new llvm::TruncInst(constOffset, LLVMTypes::Int32VectorType,
2018  LLVMGetName(constOffset, "_trunc"), insertBefore);
2019  }
2020  }
2021 
2022  *variableOffsetPtr = variableOffset;
2023  *constOffsetPtr = constOffset;
2024  return true;
2025 }
2026 
2027 /** Check to see if the offset value is composed of a string of Adds,
2028  SExts, and Constant Vectors that are 32-bit safe. Recursively
2029  explores the operands of Add instructions (as they might themselves
2030  be adds that eventually terminate in constant vectors or a SExt.)
2031  */
2032 
2033 static bool lIs32BitSafeHelper(llvm::Value *v) {
2034  // handle Adds, SExts, Constant Vectors
2035  if (llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(v)) {
2036  if (bop->getOpcode() == llvm::Instruction::Add) {
2037  return lIs32BitSafeHelper(bop->getOperand(0)) && lIs32BitSafeHelper(bop->getOperand(1));
2038  }
2039  return false;
2040  } else if (llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(v)) {
2041  return sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType;
2042  } else
2043  return lVectorIs32BitInts(v);
2044 }
2045 
2046 /** Check to see if the single offset vector can safely be represented with
2047  32-bit values. If so, return true and update the pointed-to
2048  llvm::Value * to be the 32-bit equivalent. */
2049 static bool lOffsets32BitSafe(llvm::Value **offsetPtr, llvm::Instruction *insertBefore) {
2050  llvm::Value *offset = *offsetPtr;
2051 
2052  if (offset->getType() == LLVMTypes::Int32VectorType)
2053  return true;
2054 
2055  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(offset);
2056  if (sext != NULL && sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType) {
2057  // sext of a 32-bit vector -> the 32-bit vector is good
2058  *offsetPtr = sext->getOperand(0);
2059  return true;
2060  } else if (lIs32BitSafeHelper(offset)) {
2061  // The only constant vector we should have here is a vector of
2062  // all zeros (i.e. a ConstantAggregateZero), but just in case,
2063  // do the more general check with lVectorIs32BitInts().
2064 
2065  // Alternatively, offset could be a sequence of adds terminating
2066  // in safe constant vectors or a SExt.
2067  *offsetPtr =
2068  new llvm::TruncInst(offset, LLVMTypes::Int32VectorType, LLVMGetName(offset, "_trunc"), insertBefore);
2069  return true;
2070  } else
2071  return false;
2072 }
2073 
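// Rewrite calls to the fully general __pseudo_gather*, __pseudo_scatter*, and
// varying-prefetch pseudo functions (which take a full vector of pointers)
// into the corresponding "base + offsets" forms, which take a single uniform
// base pointer plus a vector of integer offsets. Returns true if the call was
// transformed; the heavy lifting is done by lGetBasePtrAndOffsets() above.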
2074 static bool lGSToGSBaseOffsets(llvm::CallInst *callInst) {
2075  struct GSInfo {
2076  GSInfo(const char *pgFuncName, const char *pgboFuncName, const char *pgbo32FuncName, bool ig, bool ip)
2077  : isGather(ig), isPrefetch(ip) {
2078  func = m->module->getFunction(pgFuncName);
2079  baseOffsetsFunc = m->module->getFunction(pgboFuncName);
2080  baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
2081  }
2082  llvm::Function *func;
2083  llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
2084  const bool isGather;
2085  const bool isPrefetch;
2086  };
2087 
2088  GSInfo gsFuncs[] = {
2089  GSInfo(
2090  "__pseudo_gather32_i8",
2091  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8",
2092  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8",
2093  true, false),
2094  GSInfo("__pseudo_gather32_i16",
2095  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16"
2096  : "__pseudo_gather_factored_base_offsets32_i16",
2097  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16"
2098  : "__pseudo_gather_factored_base_offsets32_i16",
2099  true, false),
2100  GSInfo("__pseudo_gather32_i32",
2101  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32"
2102  : "__pseudo_gather_factored_base_offsets32_i32",
2103  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32"
2104  : "__pseudo_gather_factored_base_offsets32_i32",
2105  true, false),
2106  GSInfo("__pseudo_gather32_float",
2107  g->target->hasGather() ? "__pseudo_gather_base_offsets32_float"
2108  : "__pseudo_gather_factored_base_offsets32_float",
2109  g->target->hasGather() ? "__pseudo_gather_base_offsets32_float"
2110  : "__pseudo_gather_factored_base_offsets32_float",
2111  true, false),
2112  GSInfo("__pseudo_gather32_i64",
2113  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64"
2114  : "__pseudo_gather_factored_base_offsets32_i64",
2115  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64"
2116  : "__pseudo_gather_factored_base_offsets32_i64",
2117  true, false),
2118  GSInfo("__pseudo_gather32_double",
2119  g->target->hasGather() ? "__pseudo_gather_base_offsets32_double"
2120  : "__pseudo_gather_factored_base_offsets32_double",
2121  g->target->hasGather() ? "__pseudo_gather_base_offsets32_double"
2122  : "__pseudo_gather_factored_base_offsets32_double",
2123  true, false),
2124 
2125  GSInfo("__pseudo_scatter32_i8",
2126  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8"
2127  : "__pseudo_scatter_factored_base_offsets32_i8",
2128  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8"
2129  : "__pseudo_scatter_factored_base_offsets32_i8",
2130  false, false),
2131  GSInfo("__pseudo_scatter32_i16",
2132  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16"
2133  : "__pseudo_scatter_factored_base_offsets32_i16",
2134  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16"
2135  : "__pseudo_scatter_factored_base_offsets32_i16",
2136  false, false),
2137  GSInfo("__pseudo_scatter32_i32",
2138  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32"
2139  : "__pseudo_scatter_factored_base_offsets32_i32",
2140  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32"
2141  : "__pseudo_scatter_factored_base_offsets32_i32",
2142  false, false),
2143  GSInfo("__pseudo_scatter32_float",
2144  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float"
2145  : "__pseudo_scatter_factored_base_offsets32_float",
2146  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float"
2147  : "__pseudo_scatter_factored_base_offsets32_float",
2148  false, false),
2149  GSInfo("__pseudo_scatter32_i64",
2150  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64"
2151  : "__pseudo_scatter_factored_base_offsets32_i64",
2152  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64"
2153  : "__pseudo_scatter_factored_base_offsets32_i64",
2154  false, false),
2155  GSInfo("__pseudo_scatter32_double",
2156  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double"
2157  : "__pseudo_scatter_factored_base_offsets32_double",
2158  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double"
2159  : "__pseudo_scatter_factored_base_offsets32_double",
2160  false, false),
2161 
2162  GSInfo(
2163  "__pseudo_gather64_i8",
2164  g->target->hasGather() ? "__pseudo_gather_base_offsets64_i8" : "__pseudo_gather_factored_base_offsets64_i8",
2165  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8",
2166  true, false),
2167  GSInfo("__pseudo_gather64_i16",
2168  g->target->hasGather() ? "__pseudo_gather_base_offsets64_i16"
2169  : "__pseudo_gather_factored_base_offsets64_i16",
2170  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16"
2171  : "__pseudo_gather_factored_base_offsets32_i16",
2172  true, false),
2173  GSInfo("__pseudo_gather64_i32",
2174  g->target->hasGather() ? "__pseudo_gather_base_offsets64_i32"
2175  : "__pseudo_gather_factored_base_offsets64_i32",
2176  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32"
2177  : "__pseudo_gather_factored_base_offsets32_i32",
2178  true, false),
2179  GSInfo("__pseudo_gather64_float",
2180  g->target->hasGather() ? "__pseudo_gather_base_offsets64_float"
2181  : "__pseudo_gather_factored_base_offsets64_float",
2182  g->target->hasGather() ? "__pseudo_gather_base_offsets32_float"
2183  : "__pseudo_gather_factored_base_offsets32_float",
2184  true, false),
2185  GSInfo("__pseudo_gather64_i64",
2186  g->target->hasGather() ? "__pseudo_gather_base_offsets64_i64"
2187  : "__pseudo_gather_factored_base_offsets64_i64",
2188  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64"
2189  : "__pseudo_gather_factored_base_offsets32_i64",
2190  true, false),
2191  GSInfo("__pseudo_gather64_double",
2192  g->target->hasGather() ? "__pseudo_gather_base_offsets64_double"
2193  : "__pseudo_gather_factored_base_offsets64_double",
2194  g->target->hasGather() ? "__pseudo_gather_base_offsets32_double"
2195  : "__pseudo_gather_factored_base_offsets32_double",
2196  true, false),
2197 
2198  GSInfo("__pseudo_scatter64_i8",
2199  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i8"
2200  : "__pseudo_scatter_factored_base_offsets64_i8",
2201  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8"
2202  : "__pseudo_scatter_factored_base_offsets32_i8",
2203  false, false),
2204  GSInfo("__pseudo_scatter64_i16",
2205  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i16"
2206  : "__pseudo_scatter_factored_base_offsets64_i16",
2207  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16"
2208  : "__pseudo_scatter_factored_base_offsets32_i16",
2209  false, false),
2210  GSInfo("__pseudo_scatter64_i32",
2211  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i32"
2212  : "__pseudo_scatter_factored_base_offsets64_i32",
2213  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32"
2214  : "__pseudo_scatter_factored_base_offsets32_i32",
2215  false, false),
2216  GSInfo("__pseudo_scatter64_float",
2217  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_float"
2218  : "__pseudo_scatter_factored_base_offsets64_float",
2219  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float"
2220  : "__pseudo_scatter_factored_base_offsets32_float",
2221  false, false),
2222  GSInfo("__pseudo_scatter64_i64",
2223  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i64"
2224  : "__pseudo_scatter_factored_base_offsets64_i64",
2225  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64"
2226  : "__pseudo_scatter_factored_base_offsets32_i64",
2227  false, false),
2228  GSInfo("__pseudo_scatter64_double",
2229  g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_double"
2230  : "__pseudo_scatter_factored_base_offsets64_double",
2231  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double"
2232  : "__pseudo_scatter_factored_base_offsets32_double",
2233  false, false),
2234  GSInfo("__pseudo_prefetch_read_varying_1",
2235  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : "__prefetch_read_varying_1",
2236  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : "__prefetch_read_varying_1",
2237  false, true),
2238 
2239  GSInfo("__pseudo_prefetch_read_varying_2",
2240  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : "__prefetch_read_varying_2",
2241  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : "__prefetch_read_varying_2",
2242  false, true),
2243 
2244  GSInfo("__pseudo_prefetch_read_varying_3",
2245  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : "__prefetch_read_varying_3",
2246  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : "__prefetch_read_varying_3",
2247  false, true),
2248 
2249  GSInfo("__pseudo_prefetch_read_varying_nt",
2250  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" : "__prefetch_read_varying_nt",
2251  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" : "__prefetch_read_varying_nt",
2252  false, true),
2253  };
2254 
2255  int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
2256  for (int i = 0; i < numGSFuncs; ++i)
2257  Assert(gsFuncs[i].func != NULL && gsFuncs[i].baseOffsetsFunc != NULL && gsFuncs[i].baseOffsets32Func != NULL);
2258 
2259  GSInfo *info = NULL;
2260  for (int i = 0; i < numGSFuncs; ++i)
2261  if (gsFuncs[i].func != NULL && callInst->getCalledFunction() == gsFuncs[i].func) {
2262  info = &gsFuncs[i];
2263  break;
2264  }
2265  if (info == NULL)
2266  return false;
2267 
2268  // Try to transform the array of pointers to a single base pointer
2269  // and an array of int32 offsets. (All the hard work is done by
2270  // lGetBasePtrAndOffsets).
2271  llvm::Value *ptrs = callInst->getArgOperand(0);
2272  llvm::Value *offsetVector = NULL;
2273  llvm::Value *basePtr = lGetBasePtrAndOffsets(ptrs, &offsetVector, callInst);
2274 
2275  if (basePtr == NULL || offsetVector == NULL ||
2276  (info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch() == false))
2277  // It's actually a fully general gather/scatter with a varying
2278  // set of base pointers, so leave it as is and continue onward
2279  // to the next instruction...
2280  return false;
2281 
2282  // Cast the base pointer to a void *, since that's what the
2283  // __pseudo_*_base_offsets_* functions want.
2284  basePtr = new llvm::IntToPtrInst(basePtr, LLVMTypes::VoidPointerType, LLVMGetName(basePtr, "_2void"), callInst);
2285  lCopyMetadata(basePtr, callInst);
2286 
2287  llvm::Function *gatherScatterFunc = info->baseOffsetsFunc;
2288 
2289  if ((info->isGather == true && g->target->hasGather()) ||
2290  (info->isGather == false && info->isPrefetch == false && g->target->hasScatter()) ||
2291  (info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch())) {
2292 
2293  // See if the offsets are scaled by 2, 4, or 8. If so,
2294  // extract that scale factor and rewrite the offsets to remove
2295  // it.
2296  llvm::Value *offsetScale = lExtractOffsetVector248Scale(&offsetVector);
2297 
2298  // If we're doing 32-bit addressing on a 64-bit target, here we
2299  // will see if we can call one of the 32-bit variants of the pseudo
2300  // gather/scatter functions.
2301  if (g->opt.force32BitAddressing && lOffsets32BitSafe(&offsetVector, callInst)) {
2302  gatherScatterFunc = info->baseOffsets32Func;
2303  }
2304 
2305  if (info->isGather || info->isPrefetch) {
2306  llvm::Value *mask = callInst->getArgOperand(1);
2307 
2308  // Generate a new function call to the next pseudo gather
2309  // base+offsets instruction. Note that we're passing a NULL
2310  // llvm::Instruction to llvm::CallInst::Create; this means that
2311  // the instruction isn't inserted into a basic block and that
2312  // way we can then call ReplaceInstWithInst().
2313  llvm::Instruction *newCall = lCallInst(gatherScatterFunc, basePtr, offsetScale, offsetVector, mask,
2314  callInst->getName().str().c_str(), NULL);
2315  lCopyMetadata(newCall, callInst);
2316  llvm::ReplaceInstWithInst(callInst, newCall);
2317  } else {
2318  llvm::Value *storeValue = callInst->getArgOperand(1);
2319  llvm::Value *mask = callInst->getArgOperand(2);
2320 
2321  // Generate a new function call to the next pseudo scatter
2322  // base+offsets instruction. See above for why passing NULL
2323  // for the Instruction * is intended.
2324  llvm::Instruction *newCall =
2325  lCallInst(gatherScatterFunc, basePtr, offsetScale, offsetVector, storeValue, mask, "", NULL);
2326  lCopyMetadata(newCall, callInst);
2327  llvm::ReplaceInstWithInst(callInst, newCall);
2328  }
2329  } else {
2330  // Try to decompose the offset vector into a compile time constant
2331  // component and a varying component. The constant component is
2332  // passed as a separate parameter to the gather/scatter functions,
2333  // which in turn allows their implementations to end up emitting
2334  // x86 instructions with constant offsets encoded in them.
2335  llvm::Value *constOffset = NULL;
2336  llvm::Value *variableOffset = NULL;
2337  lExtractConstantOffset(offsetVector, &constOffset, &variableOffset, callInst);
2338  if (constOffset == NULL)
2339  constOffset = LLVMIntAsType(0, offsetVector->getType());
2340  if (variableOffset == NULL)
2341  variableOffset = LLVMIntAsType(0, offsetVector->getType());
2342 
2343  // See if the varying component is scaled by 2, 4, or 8. If so,
2344  // extract that scale factor and rewrite variableOffset to remove
2345  // it. (This is also pulled out so that we can match the scales by
2346  // 2/4/8 offered by x86 addressing operators.)
2347  llvm::Value *offsetScale = lExtractOffsetVector248Scale(&variableOffset);
2348 
2349  // If we're doing 32-bit addressing on a 64-bit target, here we
2350  // will see if we can call one of the 32-bit variants of the pseudo
2351  // gather/scatter functions.
2352  if (g->opt.force32BitAddressing && lOffsets32BitSafe(&variableOffset, &constOffset, callInst)) {
2353  gatherScatterFunc = info->baseOffsets32Func;
2354  }
2355 
2356  if (info->isGather || info->isPrefetch) {
2357  llvm::Value *mask = callInst->getArgOperand(1);
2358 
2359  // Generate a new function call to the next pseudo gather
2360  // base+offsets instruction. Note that we're passing a NULL
2361  // llvm::Instruction to llvm::CallInst::Create; this means that
2362  // the instruction isn't inserted into a basic block and that
2363  // way we can then call ReplaceInstWithInst().
2364  llvm::Instruction *newCall = lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, constOffset,
2365  mask, callInst->getName().str().c_str(), NULL);
2366  lCopyMetadata(newCall, callInst);
2367  llvm::ReplaceInstWithInst(callInst, newCall);
2368  } else {
2369  llvm::Value *storeValue = callInst->getArgOperand(1);
2370  llvm::Value *mask = callInst->getArgOperand(2);
2371 
2372  // Generate a new function call to the next pseudo scatter
2373  // base+offsets instruction. See above for why passing NULL
2374  // for the Instruction * is intended.
2375  llvm::Instruction *newCall = lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, constOffset,
2376  storeValue, mask, "", NULL);
2377  lCopyMetadata(newCall, callInst);
2378  llvm::ReplaceInstWithInst(callInst, newCall);
2379  }
2380  }
2381  return true;
2382 }
2383 
2384 /** Try to improve the decomposition between compile-time constant and
2385  compile-time unknown offsets in calls to the __pseudo_*_base_offsets*
2386  functions. After other optimizations have run, we will sometimes be
2387  able to pull more terms out of the unknown part and add them into the
2388  compile-time-known part.
2389  */
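// Illustrative example (hypothetical values): if the call currently has
//   variableOffset = add <8 x i32> %i, <i32 6, i32 6, ...>, offsetScale = 4,
//   constOffset    = <i32 0, i32 0, ...>
// then after this transformation variableOffset becomes %i and constOffset
// becomes <i32 24, i32 24, ...> (the extracted constant term, multiplied by
// the scale and added to the old constant offset).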
2390 static bool lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) {
2391  struct GSBOInfo {
2392  GSBOInfo(const char *pgboFuncName, const char *pgbo32FuncName, bool ig, bool ip)
2393  : isGather(ig), isPrefetch(ip) {
2394  baseOffsetsFunc = m->module->getFunction(pgboFuncName);
2395  baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
2396  }
2397  llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
2398  const bool isGather;
2399  const bool isPrefetch;
2400  };
2401 
2402  GSBOInfo gsFuncs[] = {
2403  GSBOInfo(
2404  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8",
2405  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8",
2406  true, false),
2407  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16"
2408  : "__pseudo_gather_factored_base_offsets32_i16",
2409  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16"
2410  : "__pseudo_gather_factored_base_offsets32_i16",
2411  true, false),
2412  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32"
2413  : "__pseudo_gather_factored_base_offsets32_i32",
2414  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32"
2415  : "__pseudo_gather_factored_base_offsets32_i32",
2416  true, false),
2417  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_float"
2418  : "__pseudo_gather_factored_base_offsets32_float",
2419  g->target->hasGather() ? "__pseudo_gather_base_offsets32_float"
2420  : "__pseudo_gather_factored_base_offsets32_float",
2421  true, false),
2422  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64"
2423  : "__pseudo_gather_factored_base_offsets32_i64",
2424  g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64"
2425  : "__pseudo_gather_factored_base_offsets32_i64",
2426  true, false),
2427  GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_double"
2428  : "__pseudo_gather_factored_base_offsets32_double",
2429  g->target->hasGather() ? "__pseudo_gather_base_offsets32_double"
2430  : "__pseudo_gather_factored_base_offsets32_double",
2431  true, false),
2432 
2433  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8"
2434  : "__pseudo_scatter_factored_base_offsets32_i8",
2435  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8"
2436  : "__pseudo_scatter_factored_base_offsets32_i8",
2437  false, false),
2438  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16"
2439  : "__pseudo_scatter_factored_base_offsets32_i16",
2440  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16"
2441  : "__pseudo_scatter_factored_base_offsets32_i16",
2442  false, false),
2443  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32"
2444  : "__pseudo_scatter_factored_base_offsets32_i32",
2445  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32"
2446  : "__pseudo_scatter_factored_base_offsets32_i32",
2447  false, false),
2448  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float"
2449  : "__pseudo_scatter_factored_base_offsets32_float",
2450  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float"
2451  : "__pseudo_scatter_factored_base_offsets32_float",
2452  false, false),
2453  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64"
2454  : "__pseudo_scatter_factored_base_offsets32_i64",
2455  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64"
2456  : "__pseudo_scatter_factored_base_offsets32_i64",
2457  false, false),
2458  GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double"
2459  : "__pseudo_scatter_factored_base_offsets32_double",
2460  g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double"
2461  : "__pseudo_scatter_factored_base_offsets32_double",
2462  false, false),
2463 
2464  GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : "__prefetch_read_varying_1",
2465  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : "__prefetch_read_varying_1",
2466  false, true),
2467 
2468  GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : "__prefetch_read_varying_2",
2469  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : "__prefetch_read_varying_2",
2470  false, true),
2471 
2472  GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : "__prefetch_read_varying_3",
2473  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : "__prefetch_read_varying_3",
2474  false, true),
2475 
2476  GSBOInfo(
2477  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" : "__prefetch_read_varying_nt",
2478  g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" : "__prefetch_read_varying_nt",
2479  false, true),
2480  };
2481 
2482  int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
2483  for (int i = 0; i < numGSFuncs; ++i)
2484  Assert(gsFuncs[i].baseOffsetsFunc != NULL && gsFuncs[i].baseOffsets32Func != NULL);
2485 
2486  llvm::Function *calledFunc = callInst->getCalledFunction();
2487  Assert(calledFunc != NULL);
2488 
2489  // Is one of the gather/scatter functions that decompose into
2490  // base+offsets being called?
2491  GSBOInfo *info = NULL;
2492  for (int i = 0; i < numGSFuncs; ++i)
2493  if (calledFunc == gsFuncs[i].baseOffsetsFunc || calledFunc == gsFuncs[i].baseOffsets32Func) {
2494  info = &gsFuncs[i];
2495  break;
2496  }
2497  if (info == NULL)
2498  return false;
2499 
2500  // Grab the old variable offset
2501  llvm::Value *origVariableOffset = callInst->getArgOperand(1);
2502 
2503  // If it's zero, we're done. Don't go and think that we're clever by
2504  // adding these zeros to the constant offsets.
2505  if (llvm::isa<llvm::ConstantAggregateZero>(origVariableOffset))
2506  return false;
2507 
2508  // Try to decompose the old variable offset
2509  llvm::Value *constOffset = NULL;
2510  llvm::Value *variableOffset = NULL;
2511  lExtractConstantOffset(origVariableOffset, &constOffset, &variableOffset, callInst);
2512 
2513  // No luck
2514  if (constOffset == NULL)
2515  return false;
2516 
2517  // Total luck: everything could be moved to the constant offset
2518  if (variableOffset == NULL)
2519  variableOffset = LLVMIntAsType(0, origVariableOffset->getType());
2520 
2521  // We need to scale the value we add to the constant offset by the
2522  // 2/4/8 scale for the variable offset, if present.
2523  llvm::ConstantInt *varScale = llvm::dyn_cast<llvm::ConstantInt>(callInst->getArgOperand(2));
2524  Assert(varScale != NULL);
2525 
2526  llvm::Value *scaleSmear;
2527  if (origVariableOffset->getType() == LLVMTypes::Int64VectorType)
2528  scaleSmear = LLVMInt64Vector((int64_t)varScale->getZExtValue());
2529  else
2530  scaleSmear = LLVMInt32Vector((int32_t)varScale->getZExtValue());
2531 
2532  constOffset =
2533  llvm::BinaryOperator::Create(llvm::Instruction::Mul, constOffset, scaleSmear, constOffset->getName(), callInst);
2534 
2535  // And add the additional offset to the original constant offset
2536  constOffset = llvm::BinaryOperator::Create(llvm::Instruction::Add, constOffset, callInst->getArgOperand(3),
2537  callInst->getArgOperand(3)->getName(), callInst);
2538 
2539  // Finally, update the values of the operands to the gather/scatter
2540  // function.
2541  callInst->setArgOperand(1, variableOffset);
2542  callInst->setArgOperand(3, constOffset);
2543 
2544  return true;
2545 }
2546 
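// Given a uniform base pointer and a vector of per-lane offsets that are known
// to all be equal, compute the single pointer they represent by GEP'ing the
// base with the first element of the offset vector.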
2547 static llvm::Value *lComputeCommonPointer(llvm::Value *base, llvm::Value *offsets, llvm::Instruction *insertBefore) {
2548  llvm::Value *firstOffset = LLVMExtractFirstVectorElement(offsets);
2549  return lGEPInst(base, firstOffset, "ptr", insertBefore);
2550 }
2551 
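// Smear the scalar offset scale (which must be a ConstantInt) out into a
// constant vector of the given 32- or 64-bit integer vector type, one copy of
// the scale per program instance.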
2552 static llvm::Constant *lGetOffsetScaleVec(llvm::Value *offsetScale, llvm::Type *vecType) {
2553  llvm::ConstantInt *offsetScaleInt = llvm::dyn_cast<llvm::ConstantInt>(offsetScale);
2554  Assert(offsetScaleInt != NULL);
2555  uint64_t scaleValue = offsetScaleInt->getZExtValue();
2556 
2557  std::vector<llvm::Constant *> scales;
2558  for (int i = 0; i < g->target->getVectorWidth(); ++i) {
2559  if (vecType == LLVMTypes::Int64VectorType)
2560  scales.push_back(LLVMInt64(scaleValue));
2561  else {
2562  Assert(vecType == LLVMTypes::Int32VectorType);
2563  scales.push_back(LLVMInt32((int32_t)scaleValue));
2564  }
2565  }
2566  return llvm::ConstantVector::get(scales);
2567 }
2568 
2569 /** After earlier optimization passes have run, we are sometimes able to
2570  determine that gathers/scatters are actually accessing memory in a more
2571  regular fashion and then change the operation to something simpler and
2572  more efficient. For example, if all of the lanes in a gather are
2573  reading from the same location, we can instead do a scalar load and
2574  broadcast. This pass examines gathers and scatters and tries to
2575  simplify them if at all possible.
2576 
2577  @todo Currently, this only looks for the cases where all program instances
2578  go to the same location or to a linear sequence of locations in
2579  memory. There are a number of other cases that might make sense to
2580  look for, including things that could be handled with a vector load +
2581  shuffle or things that could be handled with hybrids of e.g. 2 4-wide
2582  vector loads with AVX, etc.
2583 */
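// For illustration (hypothetical offsets, 4-byte elements): a fullOffsets
// value of <0, 0, ..., 0> means every lane reads the same address, so the
// gather can become a scalar load plus a broadcast; <0, 4, 8, 12, ...> means
// the lanes read consecutive elements, so the gather can instead become a
// single vector load via the __masked_load_* functions listed above.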
2584 static bool lGSToLoadStore(llvm::CallInst *callInst) {
2585  struct GatherImpInfo {
2586  GatherImpInfo(const char *pName, const char *lmName, llvm::Type *st, int a)
2587  : align(a), isFactored(!g->target->hasGather()) {
2588  pseudoFunc = m->module->getFunction(pName);
2589  loadMaskedFunc = m->module->getFunction(lmName);
2590  Assert(pseudoFunc != NULL && loadMaskedFunc != NULL);
2591  scalarType = st;
2592  }
2593 
2594  llvm::Function *pseudoFunc;
2595  llvm::Function *loadMaskedFunc;
2596  llvm::Type *scalarType;
2597  const int align;
2598  const bool isFactored;
2599  };
2600 
2601  GatherImpInfo gInfo[] = {
2602  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8"
2603  : "__pseudo_gather_factored_base_offsets32_i8",
2604  "__masked_load_i8", LLVMTypes::Int8Type, 1),
2605  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16"
2606  : "__pseudo_gather_factored_base_offsets32_i16",
2607  "__masked_load_i16", LLVMTypes::Int16Type, 2),
2608  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32"
2609  : "__pseudo_gather_factored_base_offsets32_i32",
2610  "__masked_load_i32", LLVMTypes::Int32Type, 4),
2611  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_float"
2612  : "__pseudo_gather_factored_base_offsets32_float",
2613  "__masked_load_float", LLVMTypes::FloatType, 4),
2614  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64"
2615  : "__pseudo_gather_factored_base_offsets32_i64",
2616  "__masked_load_i64", LLVMTypes::Int64Type, 8),
2617  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_double"
2618  : "__pseudo_gather_factored_base_offsets32_double",
2619  "__masked_load_double", LLVMTypes::DoubleType, 8),
2620  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i8"
2621  : "__pseudo_gather_factored_base_offsets64_i8",
2622  "__masked_load_i8", LLVMTypes::Int8Type, 1),
2623  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i16"
2624  : "__pseudo_gather_factored_base_offsets64_i16",
2625  "__masked_load_i16", LLVMTypes::Int16Type, 2),
2626  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i32"
2627  : "__pseudo_gather_factored_base_offsets64_i32",
2628  "__masked_load_i32", LLVMTypes::Int32Type, 4),
2629  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_float"
2630  : "__pseudo_gather_factored_base_offsets64_float",
2631  "__masked_load_float", LLVMTypes::FloatType, 4),
2632  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i64"
2633  : "__pseudo_gather_factored_base_offsets64_i64",
2634  "__masked_load_i64", LLVMTypes::Int64Type, 8),
2635  GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_double"
2636  : "__pseudo_gather_factored_base_offsets64_double",
2637  "__masked_load_double", LLVMTypes::DoubleType, 8),
2638  };
2639 
2640  struct ScatterImpInfo {
2641  ScatterImpInfo(const char *pName, const char *msName, llvm::Type *vpt, int a)
2642  : align(a), isFactored(!g->target->hasScatter()) {
2643  pseudoFunc = m->module->getFunction(pName);
2644  maskedStoreFunc = m->module->getFunction(msName);
2645  vecPtrType = vpt;
2646  Assert(pseudoFunc != NULL && maskedStoreFunc != NULL);
2647  }
2648  llvm::Function *pseudoFunc;
2649  llvm::Function *maskedStoreFunc;
2650  llvm::Type *vecPtrType;
2651  const int align;
2652  const bool isFactored;
2653  };
2654 
2655  ScatterImpInfo sInfo[] = {
2656  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8"
2657  : "__pseudo_scatter_factored_base_offsets32_i8",
2658  "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1),
2659  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16"
2660  : "__pseudo_scatter_factored_base_offsets32_i16",
2661  "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2),
2662  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32"
2663  : "__pseudo_scatter_factored_base_offsets32_i32",
2664  "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4),
2665  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float"
2666  : "__pseudo_scatter_factored_base_offsets32_float",
2667  "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4),
2668  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64"
2669  : "__pseudo_scatter_factored_base_offsets32_i64",
2670  "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8),
2671  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double"
2672  : "__pseudo_scatter_factored_base_offsets32_double",
2673  "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8),
2674  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i8"
2675  : "__pseudo_scatter_factored_base_offsets64_i8",
2676  "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1),
2677  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i16"
2678  : "__pseudo_scatter_factored_base_offsets64_i16",
2679  "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2),
2680  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i32"
2681  : "__pseudo_scatter_factored_base_offsets64_i32",
2682  "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4),
2683  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_float"
2684  : "__pseudo_scatter_factored_base_offsets64_float",
2685  "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4),
2686  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i64"
2687  : "__pseudo_scatter_factored_base_offsets64_i64",
2688  "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8),
2689  ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_double"
2690  : "__pseudo_scatter_factored_base_offsets64_double",
2691  "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8),
2692  };
2693 
2694  llvm::Function *calledFunc = callInst->getCalledFunction();
2695 
2696  GatherImpInfo *gatherInfo = NULL;
2697  ScatterImpInfo *scatterInfo = NULL;
2698  for (unsigned int i = 0; i < sizeof(gInfo) / sizeof(gInfo[0]); ++i) {
2699  if (gInfo[i].pseudoFunc != NULL && calledFunc == gInfo[i].pseudoFunc) {
2700  gatherInfo = &gInfo[i];
2701  break;
2702  }
2703  }
2704  for (unsigned int i = 0; i < sizeof(sInfo) / sizeof(sInfo[0]); ++i) {
2705  if (sInfo[i].pseudoFunc != NULL && calledFunc == sInfo[i].pseudoFunc) {
2706  scatterInfo = &sInfo[i];
2707  break;
2708  }
2709  }
2710  if (gatherInfo == NULL && scatterInfo == NULL)
2711  return false;
2712 
2713  SourcePos pos;
2714  lGetSourcePosFromMetadata(callInst, &pos);
2715 
2716  llvm::Value *base = callInst->getArgOperand(0);
2717  llvm::Value *fullOffsets = NULL;
2718  llvm::Value *storeValue = NULL;
2719  llvm::Value *mask = NULL;
2720 
2721  if ((gatherInfo != NULL && gatherInfo->isFactored) || (scatterInfo != NULL && scatterInfo->isFactored)) {
2722  llvm::Value *varyingOffsets = callInst->getArgOperand(1);
2723  llvm::Value *offsetScale = callInst->getArgOperand(2);
2724  llvm::Value *constOffsets = callInst->getArgOperand(3);
2725  if (scatterInfo)
2726  storeValue = callInst->getArgOperand(4);
2727  mask = callInst->getArgOperand((gatherInfo != NULL) ? 4 : 5);
2728 
2729  // Compute the full offset vector: offsetScale * varyingOffsets + constOffsets
2730  llvm::Constant *offsetScaleVec = lGetOffsetScaleVec(offsetScale, varyingOffsets->getType());
2731 
2732  llvm::Value *scaledVarying = llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec,
2733  varyingOffsets, "scaled_varying", callInst);
2734  fullOffsets = llvm::BinaryOperator::Create(llvm::Instruction::Add, scaledVarying, constOffsets,
2735  "varying+const_offsets", callInst);
2736  } else {
2737  if (scatterInfo)
2738  storeValue = callInst->getArgOperand(3);
2739  mask = callInst->getArgOperand((gatherInfo != NULL) ? 3 : 4);
2740 
2741  llvm::Value *offsetScale = callInst->getArgOperand(1);
2742  llvm::Value *offsets = callInst->getArgOperand(2);
2743  llvm::Value *offsetScaleVec = lGetOffsetScaleVec(offsetScale, offsets->getType());
2744 
2745  fullOffsets =
2746  llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec, offsets, "scaled_offsets", callInst);
2747  }
2748 
2749  Debug(SourcePos(), "GSToLoadStore: %s.", fullOffsets->getName().str().c_str());
2750 
2751  if (LLVMVectorValuesAllEqual(fullOffsets)) {
2752  // If all the offsets are equal, then compute the single
2753  // pointer they all represent based on the first one of them
2754  // (arbitrarily).
2755  llvm::Value *ptr = lComputeCommonPointer(base, fullOffsets, callInst);
2756  lCopyMetadata(ptr, callInst);
2757 
2758  if (gatherInfo != NULL) {
2759  // A gather with everyone going to the same location is
2760  // handled as a scalar load and broadcast across the lanes.
2761  Debug(pos, "Transformed gather to scalar load and broadcast!");
2762 
2763  ptr =
2764  new llvm::BitCastInst(ptr, llvm::PointerType::get(gatherInfo->scalarType, 0), ptr->getName(), callInst);
2765  llvm::Value *scalarValue = new llvm::LoadInst(ptr, callInst->getName(), callInst);
2766 
2767  // Generate the following sequence:
2768  // %name123 = insertelement <4 x i32> undef, i32 %val, i32 0
2769  // %name124 = shufflevector <4 x i32> %name123, <4 x i32> undef,
2770  // <4 x i32> zeroinitializer
2771  llvm::Value *undef1Value = llvm::UndefValue::get(callInst->getType());
2772  llvm::Value *undef2Value = llvm::UndefValue::get(callInst->getType());
2773  llvm::Value *insertVec =
2774  llvm::InsertElementInst::Create(undef1Value, scalarValue, LLVMInt32(0), callInst->getName(), callInst);
2775  llvm::Value *zeroMask =
2776  llvm::ConstantVector::getSplat(callInst->getType()->getVectorNumElements(),
2777  llvm::Constant::getNullValue(llvm::Type::getInt32Ty(*g->ctx)));
2778  llvm::Value *shufValue = new llvm::ShuffleVectorInst(insertVec, undef2Value, zeroMask, callInst->getName());
2779 
2780  lCopyMetadata(shufValue, callInst);
2781  llvm::ReplaceInstWithInst(callInst, llvm::dyn_cast<llvm::Instruction>(shufValue));
2782  return true;
2783  } else {
2784  // A scatter with everyone going to the same location is
2785  // undefined (if there's more than one program instance in
2786  // the gang). Issue a warning.
2787  if (g->target->getVectorWidth() > 1)
2788  Warning(pos, "Undefined behavior: all program instances are "
2789  "writing to the same location!");
2790 
2791  // We could do something similar to the gather case, where
2792  // we arbitrarily write one of the values, but we need to
2793  // a) check to be sure the mask isn't all off and b) pick
2794  // the value from an executing program instance in that
2795  // case. We'll just let a bunch of the program instances
2796  // do redundant writes, since this isn't important to make
2797  // fast anyway...
2798  return false;
2799  }
2800  } else {
2801  int step = gatherInfo ? gatherInfo->align : scatterInfo->align;
2802 
2803  if (step > 0 && LLVMVectorIsLinear(fullOffsets, step)) {
2804  // We have a linear sequence of memory locations being accessed
2805  // starting with the location given by the offset from
2806  // offsetElements[0], with a stride equal to the element size (e.g., 4
2807  // or 8 bytes for 32-bit and 64-bit gathers/scatters, respectively).
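 // Illustrative example (hypothetical values): for an i32 gather with
 // fullOffsets = <0, 4, 8, 12>, the offsets are linear with step 4, so the
 // code below rewrites the call into a single __masked_load_i32 from
 // base + 0; the analogous scatter case becomes a __pseudo_masked_store_i32
 // through a bitcast pointer.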
2808  llvm::Value *ptr = lComputeCommonPointer(base, fullOffsets, callInst);
2809  lCopyMetadata(ptr, callInst);
2810 
2811  if (gatherInfo != NULL) {
2812  Debug(pos, "Transformed gather to unaligned vector load!");
2813  llvm::Instruction *newCall =
2814  lCallInst(gatherInfo->loadMaskedFunc, ptr, mask, LLVMGetName(ptr, "_masked_load"));
2815  lCopyMetadata(newCall, callInst);
2816  llvm::ReplaceInstWithInst(callInst, newCall);
2817  return true;
2818  } else {
2819  Debug(pos, "Transformed scatter to unaligned vector store!");
2820  ptr = new llvm::BitCastInst(ptr, scatterInfo->vecPtrType, "ptrcast", callInst);
2821  llvm::Instruction *newCall = lCallInst(scatterInfo->maskedStoreFunc, ptr, storeValue, mask, "");
2822  lCopyMetadata(newCall, callInst);
2823  llvm::ReplaceInstWithInst(callInst, newCall);
2824  return true;
2825  }
2826  }
2827  return false;
2828  }
2829 }
2830 
2831 ///////////////////////////////////////////////////////////////////////////
2832 // MaskedStoreOptPass
2833 
2834 /** Masked stores are generally more complex than regular stores; for
2835  example, they require multiple instructions to simulate under SSE.
2836  This optimization detects cases where masked stores can be replaced
2837  with regular stores or removed entirely, for the cases of an 'all on'
2838  mask and an 'all off' mask, respectively.
2839 */
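 // A sketch of the transformation (illustrative IR; the exact vector width
 // and mask type depend on the compilation target):
 //    call void @__pseudo_masked_store_i32(<4 x i32>* %p, <4 x i32> %v,
 //                                         <4 x i32> %all_on_mask)
 // becomes a plain
 //    store <4 x i32> %v, <4 x i32>* %p, align 4
 // when the mask is statically all-on, and the call is erased outright when
 // the mask is statically all-off.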
2840 static bool lImproveMaskedStore(llvm::CallInst *callInst) {
2841  struct MSInfo {
2842  MSInfo(const char *name, const int a) : align(a) {
2843  func = m->module->getFunction(name);
2844  Assert(func != NULL);
2845  }
2846  llvm::Function *func;
2847  const int align;
2848  };
2849 
2850  MSInfo msInfo[] = {MSInfo("__pseudo_masked_store_i8", 1), MSInfo("__pseudo_masked_store_i16", 2),
2851  MSInfo("__pseudo_masked_store_i32", 4), MSInfo("__pseudo_masked_store_float", 4),
2852  MSInfo("__pseudo_masked_store_i64", 8), MSInfo("__pseudo_masked_store_double", 8),
2853  MSInfo("__masked_store_blend_i8", 1), MSInfo("__masked_store_blend_i16", 2),
2854  MSInfo("__masked_store_blend_i32", 4), MSInfo("__masked_store_blend_float", 4),
2855  MSInfo("__masked_store_blend_i64", 8), MSInfo("__masked_store_blend_double", 8),
2856  MSInfo("__masked_store_i8", 1), MSInfo("__masked_store_i16", 2),
2857  MSInfo("__masked_store_i32", 4), MSInfo("__masked_store_float", 4),
2858  MSInfo("__masked_store_i64", 8), MSInfo("__masked_store_double", 8)};
2859 
2860  llvm::Function *called = callInst->getCalledFunction();
2861 
2862  int nMSFuncs = sizeof(msInfo) / sizeof(msInfo[0]);
2863  MSInfo *info = NULL;
2864  for (int i = 0; i < nMSFuncs; ++i) {
2865  if (msInfo[i].func != NULL && called == msInfo[i].func) {
2866  info = &msInfo[i];
2867  break;
2868  }
2869  }
2870  if (info == NULL)
2871  return false;
2872 
2873  // Got one; grab the operands
2874  llvm::Value *lvalue = callInst->getArgOperand(0);
2875  llvm::Value *rvalue = callInst->getArgOperand(1);
2876  llvm::Value *mask = callInst->getArgOperand(2);
2877 
2878  MaskStatus maskStatus = lGetMaskStatus(mask);
2879  if (maskStatus == ALL_OFF) {
2880  // Zero mask - no-op, so remove the store completely. (This
2881  // may in turn lead to being able to optimize out instructions
2882  // that compute the rvalue...)
2883  callInst->eraseFromParent();
2884  return true;
2885  } else if (maskStatus == ALL_ON) {
2886  // The mask is all on, so turn this into a regular store
2887  llvm::Type *rvalueType = rvalue->getType();
2888  llvm::Type *ptrType = llvm::PointerType::get(rvalueType, 0);
2889 
2890  lvalue = new llvm::BitCastInst(lvalue, ptrType, "lvalue_to_ptr_type", callInst);
2891  lCopyMetadata(lvalue, callInst);
2892  llvm::Instruction *store =
2893  new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
2894  g->opt.forceAlignedMemory ? g->target->getNativeVectorAlignment() : info->align);
2895  lCopyMetadata(store, callInst);
2896  llvm::ReplaceInstWithInst(callInst, store);
2897  return true;
2898  }
2899 
2900  return false;
2901 }
2902 
2903 static bool lImproveMaskedLoad(llvm::CallInst *callInst, llvm::BasicBlock::iterator iter) {
2904  struct MLInfo {
2905  MLInfo(const char *name, const int a) : align(a) {
2906  func = m->module->getFunction(name);
2907  Assert(func != NULL);
2908  }
2909  llvm::Function *func;
2910  const int align;
2911  };
2912 
2913  MLInfo mlInfo[] = {MLInfo("__masked_load_i8", 1), MLInfo("__masked_load_i16", 2),
2914  MLInfo("__masked_load_i32", 4), MLInfo("__masked_load_float", 4),
2915  MLInfo("__masked_load_i64", 8), MLInfo("__masked_load_double", 8)};
2916 
2917  llvm::Function *called = callInst->getCalledFunction();
2918 
2919  int nFuncs = sizeof(mlInfo) / sizeof(mlInfo[0]);
2920  MLInfo *info = NULL;
2921  for (int i = 0; i < nFuncs; ++i) {
2922  if (mlInfo[i].func != NULL && called == mlInfo[i].func) {
2923  info = &mlInfo[i];
2924  break;
2925  }
2926  }
2927  if (info == NULL)
2928  return false;
2929 
2930  // Got one; grab the operands
2931  llvm::Value *ptr = callInst->getArgOperand(0);
2932  llvm::Value *mask = callInst->getArgOperand(1);
2933 
2934  MaskStatus maskStatus = lGetMaskStatus(mask);
2935  if (maskStatus == ALL_OFF) {
2936  // Zero mask - no-op, so replace the load with an undef value
2937  llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, llvm::UndefValue::get(callInst->getType()));
2938  return true;
2939  } else if (maskStatus == ALL_ON) {
2940  // The mask is all on, so turn this into a regular load
2941  llvm::Type *ptrType = llvm::PointerType::get(callInst->getType(), 0);
2942  ptr = new llvm::BitCastInst(ptr, ptrType, "ptr_cast_for_load", callInst);
2943  llvm::Instruction *load = new llvm::LoadInst(
2944  ptr, callInst->getName(), false /* not volatile */,
2945  g->opt.forceAlignedMemory ? g->target->getNativeVectorAlignment() : info->align, (llvm::Instruction *)NULL);
2946  lCopyMetadata(load, callInst);
2947  llvm::ReplaceInstWithInst(callInst, load);
2948  return true;
2949  } else
2950  return false;
2951 }
2952 
2953 bool ImproveMemoryOpsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
2954  DEBUG_START_PASS("ImproveMemoryOps");
2955 
2956  bool modifiedAny = false;
2957 restart:
2958  // Iterate through all of the instructions in the basic block.
2959  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
2960  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
2961  // If we don't have a call to one of the
2962  // __pseudo_{gather,scatter}_* functions, then just go on to the
2963  // next instruction.
2964  if (callInst == NULL || callInst->getCalledFunction() == NULL)
2965  continue;
2966 
2967  if (lGSToGSBaseOffsets(callInst)) {
2968  modifiedAny = true;
2969  goto restart;
2970  }
2971  if (lGSBaseOffsetsGetMoreConst(callInst)) {
2972  modifiedAny = true;
2973  goto restart;
2974  }
2975  if (lGSToLoadStore(callInst)) {
2976  modifiedAny = true;
2977  goto restart;
2978  }
2979  if (lImproveMaskedStore(callInst)) {
2980  modifiedAny = true;
2981  goto restart;
2982  }
2983  if (lImproveMaskedLoad(callInst, iter)) {
2984  modifiedAny = true;
2985  goto restart;
2986  }
2987  }
2988 
2989  DEBUG_END_PASS("ImproveMemoryOps");
2990 
2991  return modifiedAny;
2992 }
2993 
2994 static llvm::Pass *CreateImproveMemoryOpsPass() { return new ImproveMemoryOpsPass; }
2995 
2996 ///////////////////////////////////////////////////////////////////////////
2997 // GatherCoalescePass
2998 
2999 // This pass implements two optimizations to improve the performance of
3000 // gathers; currently only gathers of 32-bit values where it can be
3001 // determined at compile time that the mask is all on are supported, though
3002 // both of those limitations may be generalized in the future.
3003 //
3004 // First, for any single gather, see if it's worthwhile to break it into
3005 // any of scalar, 2-wide (i.e. 64-bit), 4-wide, or 8-wide loads. Further,
3006 // we generate code that shuffles these loads around. Doing fewer, larger
3007 // loads in this manner, when possible, can be more efficient.
3008 //
3009 // Second, this pass can coalesce memory accesses across multiple
3010 // gathers. If we have a series of gathers without any memory writes in
3011 // the middle, then we try to analyze their reads collectively and choose
3012 // an efficient set of loads for them. Not only does this help if
3013 // different gathers reuse values from the same location in memory, but
3014 // it's specifically helpful when data with AOS layout is being accessed;
3015 // in this case, we're often able to generate wide vector loads and
3016 // appropriate shuffles automatically.
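 // Illustrative example (hypothetical offsets): a gather of four i32 values
 // at constant offsets <0,1,2,3> (in elements) from a common base pointer can
 // be served by one 4-wide vector load plus a shuffle, and a group of gathers
 // whose offsets together cover <0..7> can often be served by a single 8-wide
 // load that is then split and shuffled into each gather's result.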
3017 
3018 class GatherCoalescePass : public llvm::BasicBlockPass {
3019  public:
3020  static char ID;
3021  GatherCoalescePass() : BasicBlockPass(ID) {}
3022 
3023 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
3024  const char *getPassName() const { return "Gather Coalescing"; }
3025 #else // LLVM 4.0+
3026  llvm::StringRef getPassName() const { return "Gather Coalescing"; }
3027 #endif
3028  bool runOnBasicBlock(llvm::BasicBlock &BB);
3029 };
3030 
3031 char GatherCoalescePass::ID = 0;
3032 
3033 /** Representation of a memory load that the gather coalescing code has
3034  decided to generate.
3035  */
3036 struct CoalescedLoadOp {
3037  CoalescedLoadOp(int64_t s, int c) {
3038  start = s;
3039  count = c;
3040  load = element0 = element1 = NULL;
3041  }
3042 
3043  /** Starting offset of the load from the common base pointer (in terms
3044  of numbers of items of the underlying element type--*not* in terms
3045  of bytes). */
3046  int64_t start;
3047 
3048  /** Number of elements to load at this location */
3049  int count;
3050 
3051  /** Value loaded from memory for this load op */
3052  llvm::Value *load;
3053 
3054  /** For 2-wide loads (i.e. 64-bit loads), these store the lower and
3055  upper 32 bits of the result, respectively. */
3056  llvm::Value *element0, *element1;
3057 };
3058 
3059 /** This function determines whether it makes sense (and is safe) to
3060  generate a vector load of width vectorWidth, starting at *iter. It
3061  returns true if so, setting *newIter to point to the next element in
3062  the set that isn't taken care of by the generated load. If a vector
3063  load of the given width doesn't make sense, then false is returned.
3064  */
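 // Illustrative examples (hypothetical offset sets, in element-sized units):
 //  - {0,1,2,3} with vectorWidth == 4: the span exactly fills the vector, so
 //    this returns true and a single 4-wide load covers all four offsets.
 //  - {0,1,8,9} with vectorWidth == 4: the gap from 1 to 8 is larger than 3,
 //    so this returns false and lSelectLoads() below falls back to narrower
 //    loads (here, two 2-wide loads).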
3065 static bool lVectorLoadIsEfficient(std::set<int64_t>::iterator iter, std::set<int64_t>::iterator end,
3066  std::set<int64_t>::iterator *newIter, int vectorWidth) {
3067  // We're considering a vector load of width vectorWidth, starting at
3068  // the offset "start".
3069  int64_t start = *iter;
3070 
3071  // The basic idea is that we'll look at the subsequent elements in the
3072  // load set after the initial one at start. As long as subsequent
3073  // elements:
3074  //
3075  // 1. Aren't so far separated that they no longer fit into the range
3076  // [start, start+vectorWidth)
3077  //
3078  // 2. And don't have too large a gap in between them (e.g., it's not
3079  // worth generating an 8-wide load for two elements with offsets 0
3080  // and 7, but no loads requested in between).
3081  //
3082  // Then we continue moving forward through the elements until we either
3083  // fill up the vector or run out of elements.
3084 
3085  // lastAccepted holds the last offset we've processed and accepted as
3086  // valid for the vector load under consideration
3087  int64_t lastAccepted = start;
3088 
3089  while (iter != end) {
3090  // What is the separation in offset values from the last element we
3091  // added to the set for this load?
3092  int64_t delta = *iter - lastAccepted;
3093  if (delta > 3)
3094  // If there's too big a gap, then we won't issue the load
3095  return false;
3096 
3097  int64_t span = *iter - start + 1;
3098 
3099  if (span == vectorWidth) {
3100  // We've extended far enough that we have exactly filled up the
3101  // entire vector width; we can't go any further, so return with
3102  // success. (Update *newIter to point at the next element
3103  // after the last one accepted here.)
3104  *newIter = ++iter;
3105  return true;
3106  } else if (span > vectorWidth) {
3107  // The current offset won't fit into a vectorWidth-wide load
3108  // starting from start. It's still generally worthwhile
3109  // issuing the load we've been considering, though, since it
3110  // will provide values for a number of previous offsets. This
3111  // load will have one or more elements at the end of its range
3112  // that are not needed by any of the offsets under
3113  // consideration. As such, there are three cases where issuing
3114  // this load is a bad idea:
3115  //
3116  // 1. 2-wide loads: we know that we haven't completely filled
3117  // the 2-wide vector, since otherwise the if() test above
3118  // would have succeeded previously. Therefore, we must have
3119  // a situation with offsets like (4,6,...); it would be a
3120  // silly idea to issue a 2-wide load to get the value for
3121  // the 4 offset, versus failing here and issuing a scalar
3122  // load instead.
3123  //
3124  // 2. If there are too many unnecessary values at the end of
3125  // the load extent (defined as more than half of them)--in
3126  // this case, it'd be better to issue a vector load of
3127  // smaller width anyway.
3128  //
3129  // 3. If the gap between the last accepted offset and the
3130  // current one under consideration is more than the page
3131  // size. In this case we can't be sure whether or not some
3132  // of the unused elements at the end of the load will
3133  // straddle a page boundary and thus lead to an undesirable
3134  // fault. (It's hard to imagine this happening in practice,
3135  // except under contrived circumstances, but better safe
3136  // than sorry.)
3137  const int pageSize = 4096;
3138  if (vectorWidth != 2 && (lastAccepted - start) > (vectorWidth / 2) && (*iter - lastAccepted) < pageSize) {
3139  *newIter = iter;
3140  return true;
3141  } else
3142  return false;
3143  }
3144 
3145  // Continue moving forward
3146  lastAccepted = *iter;
3147  ++iter;
3148  }
3149 
3150  return false;
3151 }
3152 
3153 /** Given a set of offsets from a common base pointer that we need to get
3154  loaded into memory, determine a reasonable set of load operations that
3155  gets all of the corresponding values in memory (ideally, including as
3156  many as possible wider vector loads rather than scalar loads). Return
3157  a CoalescedLoadOp for each one in the *loads array.
3158  */
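 // Worked example (hypothetical offsets): for the sorted set {0,1,2,3,4,5}
 // the loop below first accepts a 4-wide load at offset 0 (covering 0..3)
 // and then a 2-wide load at offset 4 (covering 4..5), so *loads ends up
 // holding CoalescedLoadOp(0, 4) followed by CoalescedLoadOp(4, 2).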
3159 static void lSelectLoads(const std::vector<int64_t> &loadOffsets, std::vector<CoalescedLoadOp> *loads) {
3160  // First, get a sorted set of unique offsets to load from.
3161  std::set<int64_t> allOffsets;
3162  for (unsigned int i = 0; i < loadOffsets.size(); ++i)
3163  allOffsets.insert(loadOffsets[i]);
3164 
3165  std::set<int64_t>::iterator iter = allOffsets.begin();
3166  while (iter != allOffsets.end()) {
3167  Debug(SourcePos(), "Load needed at %" PRId64 ".", *iter);
3168  ++iter;
3169  }
3170 
3171  // Now, iterate over the offsets from low to high. Starting at the
3172  // current offset, we see if a vector load starting from that offset
3173  // will cover loads at subsequent offsets as well.
3174  iter = allOffsets.begin();
3175  while (iter != allOffsets.end()) {
3176  // Consider vector loads of each of the widths in vectorWidths[],
3177  // in order.
3178  int vectorWidths[] = {8, 4, 2};
3179  int nVectorWidths = sizeof(vectorWidths) / sizeof(vectorWidths[0]);
3180  bool gotOne = false;
3181  for (int i = 0; i < nVectorWidths; ++i) {
3182  // See if a load of vector with width vectorWidths[i] would be
3183  // effective (i.e. would cover a reasonable number of the
3184  // offsets that need to be loaded from).
3185  std::set<int64_t>::iterator newIter;
3186  if (lVectorLoadIsEfficient(iter, allOffsets.end(), &newIter, vectorWidths[i])) {
3187  // Yes: create the corresponding coalesced load and update
3188  // the iterator to the returned iterator; doing so skips
3189  // over the additional offsets that are taken care of by
3190  // this load.
3191  loads->push_back(CoalescedLoadOp(*iter, vectorWidths[i]));
3192  iter = newIter;
3193  gotOne = true;
3194  break;
3195  }
3196  }
3197 
3198  if (gotOne == false) {
3199  // We couldn't find a vector load starting from this offset
3200  // that made sense, so emit a scalar load and continue onward.
3201  loads->push_back(CoalescedLoadOp(*iter, 1));
3202  ++iter;
3203  }
3204  }
3205 }
3206 
3207 /** Print a performance message with the details of the result of
3208  coalescing over a group of gathers. */
3209 static void lCoalescePerfInfo(const std::vector<llvm::CallInst *> &coalesceGroup,
3210  const std::vector<CoalescedLoadOp> &loadOps) {
3211  SourcePos pos;
3212  lGetSourcePosFromMetadata(coalesceGroup[0], &pos);
3213 
3214  // Create a string that indicates the line numbers of the subsequent
3215  // gathers from the first one that were coalesced here.
3216  char otherPositions[512];
3217  otherPositions[0] = '\0';
3218  if (coalesceGroup.size() > 1) {
3219  const char *plural = (coalesceGroup.size() > 2) ? "s" : "";
3220  char otherBuf[32];
3221  snprintf(otherBuf, sizeof(otherBuf), "(other%s at line%s ", plural, plural);
3222  strncat(otherPositions, otherBuf, sizeof(otherPositions) - strlen(otherPositions) - 1);
3223 
3224  for (int i = 1; i < (int)coalesceGroup.size(); ++i) {
3225  SourcePos p;
3226  bool ok = lGetSourcePosFromMetadata(coalesceGroup[i], &p);
3227  if (ok) {
3228  char buf[32];
3229  snprintf(buf, sizeof(buf), "%d", p.first_line);
3230  strncat(otherPositions, buf, sizeof(otherPositions) - strlen(otherPositions) - 1);
3231  if (i < (int)coalesceGroup.size() - 1)
3232  strncat(otherPositions, ", ", sizeof(otherPositions) - strlen(otherPositions) - 1);
3233  }
3234  }
3235  strncat(otherPositions, ") ", sizeof(otherPositions) - strlen(otherPositions) - 1);
3236  }
3237 
3238  // Count how many loads of each size there were.
3239  std::map<int, int> loadOpsCount;
3240  for (int i = 0; i < (int)loadOps.size(); ++i)
3241  ++loadOpsCount[loadOps[i].count];
3242 
3243  // Generate a string the describes the mix of load ops
3244  char loadOpsInfo[512];
3245  loadOpsInfo[0] = '\0';
3246  std::map<int, int>::const_iterator iter = loadOpsCount.begin();
3247  while (iter != loadOpsCount.end()) {
3248  char buf[32];
3249  snprintf(buf, sizeof(buf), "%d x %d-wide", iter->second, iter->first);
3250  if ((strlen(loadOpsInfo) + strlen(buf)) >= 512) {
3251  break;
3252  }
3253  strncat(loadOpsInfo, buf, sizeof(loadOpsInfo) - strlen(loadOpsInfo) - 1);
3254  ++iter;
3255  if (iter != loadOpsCount.end())
3256  strncat(loadOpsInfo, ", ", sizeof(loadOpsInfo) - strlen(loadOpsInfo) - 1);
3257  }
3258 
3259  if (coalesceGroup.size() == 1)
3260  PerformanceWarning(pos, "Coalesced gather into %d load%s (%s).", (int)loadOps.size(),
3261  (loadOps.size() > 1) ? "s" : "", loadOpsInfo);
3262  else
3263  PerformanceWarning(pos,
3264  "Coalesced %d gathers starting here %sinto %d "
3265  "load%s (%s).",
3266  (int)coalesceGroup.size(), otherPositions, (int)loadOps.size(),
3267  (loadOps.size() > 1) ? "s" : "", loadOpsInfo);
3268 }
3269 
3270 /** Utility routine that computes an offset from a base pointer and then
3271  returns the result of a load of the given type from the resulting
3272  location:
3273 
3274  return *((type *)(basePtr + offset))
3275  */
3276 llvm::Value *lGEPAndLoad(llvm::Value *basePtr, int64_t offset, int align, llvm::Instruction *insertBefore,
3277  llvm::Type *type) {
3278  llvm::Value *ptr = lGEPInst(basePtr, LLVMInt64(offset), "new_base", insertBefore);
3279  ptr = new llvm::BitCastInst(ptr, llvm::PointerType::get(type, 0), "ptr_cast", insertBefore);
3280  return new llvm::LoadInst(ptr, "gather_load", false /* not volatile */, align, insertBefore);
3281 }
3282 
3283 /* Having decided that we're going to emit a series of loads, as encoded in
3284  the loadOps array, this function emits the corresponding load
3285  instructions.
3286  */
3287 static void lEmitLoads(llvm::Value *basePtr, std::vector<CoalescedLoadOp> &loadOps, int elementSize,
3288  llvm::Instruction *insertBefore) {
3289  Debug(SourcePos(), "Coalesce doing %d loads.", (int)loadOps.size());
3290  for (int i = 0; i < (int)loadOps.size(); ++i) {
3291  Debug(SourcePos(), "Load #%d @ %" PRId64 ", %d items", i, loadOps[i].start, loadOps[i].count);
3292 
3293  // basePtr is an i8 *, so the offset from it should be in terms of
3294  // bytes, not underlying i32 elements.
3295  int64_t start = loadOps[i].start * elementSize;
3296 
3297  int align = 4;
3298  switch (loadOps[i].count) {
3299  case 1:
3300  // Single 32-bit scalar load
3301  loadOps[i].load = lGEPAndLoad(basePtr, start, align, insertBefore, LLVMTypes::Int32Type);
3302  break;
3303  case 2: {
3304  // Emit 2 x i32 loads as i64 loads and then break the result
3305  // into two 32-bit parts.
3306  loadOps[i].load = lGEPAndLoad(basePtr, start, align, insertBefore, LLVMTypes::Int64Type);
3307  // element0 = (int32)value;
3308  loadOps[i].element0 =
3309  new llvm::TruncInst(loadOps[i].load, LLVMTypes::Int32Type, "load64_elt0", insertBefore);
3310  // element1 = (int32)(value >> 32)
3311  llvm::Value *shift = llvm::BinaryOperator::Create(llvm::Instruction::LShr, loadOps[i].load, LLVMInt64(32),
3312  "load64_shift", insertBefore);
3313  loadOps[i].element1 = new llvm::TruncInst(shift, LLVMTypes::Int32Type, "load64_elt1", insertBefore);
3314  break;
3315  }
3316  case 4: {
3317  // 4-wide vector load
3318  if (g->opt.forceAlignedMemory) {
3319  align = g->target->getNativeVectorAlignment();
3320  }
3321  llvm::VectorType *vt = llvm::VectorType::get(LLVMTypes::Int32Type, 4);
3322  loadOps[i].load = lGEPAndLoad(basePtr, start, align, insertBefore, vt);
3323  break;
3324  }
3325  case 8: {
3326  // 8-wide vector load
3327  if (g->opt.forceAlignedMemory) {
3328  align = g->target->getNativeVectorAlignment();
3329  }
3330  llvm::VectorType *vt = llvm::VectorType::get(LLVMTypes::Int32Type, 8);
3331  loadOps[i].load = lGEPAndLoad(basePtr, start, align, insertBefore, vt);
3332  break;
3333  }
3334  default:
3335  FATAL("Unexpected load count in lEmitLoads()");
3336  }
3337  }
3338 }
3339 
3340 /** Convert any loads of 8-wide vectors into two 4-wide vectors
3341  (logically). This allows the assembly code below to always operate on
3342  4-wide vectors, which leads to better code. Returns a new vector of
3343  load operations.
3344  */
3345 static std::vector<CoalescedLoadOp> lSplit8WideLoads(const std::vector<CoalescedLoadOp> &loadOps,
3346  llvm::Instruction *insertBefore) {
3347  std::vector<CoalescedLoadOp> ret;
3348  for (unsigned int i = 0; i < loadOps.size(); ++i) {
3349  if (loadOps[i].count == 8) {
3350  // Create fake CoalescedLoadOps, where the load llvm::Value is
3351  // actually a shuffle that pulls either the first 4 or the last
3352  // 4 values out of the original 8-wide loaded value.
3353  int32_t shuf[2][4] = {{0, 1, 2, 3}, {4, 5, 6, 7}};
3354 
3355  ret.push_back(CoalescedLoadOp(loadOps[i].start, 4));
3356  ret.back().load = LLVMShuffleVectors(loadOps[i].load, loadOps[i].load, shuf[0], 4, insertBefore);
3357 
3358  ret.push_back(CoalescedLoadOp(loadOps[i].start + 4, 4));
3359  ret.back().load = LLVMShuffleVectors(loadOps[i].load, loadOps[i].load, shuf[1], 4, insertBefore);
3360  } else
3361  ret.push_back(loadOps[i]);
3362  }
3363 
3364  return ret;
3365 }
3366 
3367 /** Given a 1-wide load of a 32-bit value, merge its value into the result
3368  vector for any and all elements for which it applies.
3369  */
3370 static llvm::Value *lApplyLoad1(llvm::Value *result, const CoalescedLoadOp &load, const int64_t offsets[4], bool set[4],
3371  llvm::Instruction *insertBefore) {
3372  for (int elt = 0; elt < 4; ++elt) {
3373  if (offsets[elt] >= load.start && offsets[elt] < load.start + load.count) {
3374  Debug(SourcePos(),
3375  "Load 1 @ %" PRId64 " matches for element #%d "
3376  "(value %" PRId64 ")",
3377  load.start, elt, offsets[elt]);
3378  // If this load gives one of the values that we need, then we
3379  // can just insert it in directly
3380  Assert(set[elt] == false);
3381  result = llvm::InsertElementInst::Create(result, load.load, LLVMInt32(elt), "insert_load", insertBefore);
3382  set[elt] = true;
3383  }
3384  }
3385 
3386  return result;
3387 }
3388 
3389 /** Similarly, incorporate the values from a 2-wide load into any vector
3390  elements that they apply to. */
3391 static llvm::Value *lApplyLoad2(llvm::Value *result, const CoalescedLoadOp &load, const int64_t offsets[4], bool set[4],
3392  llvm::Instruction *insertBefore) {
3393  int elt = 0;
3394  while (elt < 4) {
3395  // First, try to do a 64-bit-wide insert into the result vector.
3396  // We can do this when we're currently at an even element, when the
3397  // current and next element have consecutive values, and where the
3398  // original 64-bit load is at the offset needed by the current
3399  // element.
3400  if ((elt & 1) == 0 && offsets[elt] + 1 == offsets[elt + 1] && offsets[elt] == load.start) {
3401  Debug(SourcePos(),
3402  "Load 2 @ %" PRId64 " matches for elements #%d,%d "
3403  "(values %" PRId64 ",%" PRId64 ")",
3404  load.start, elt, elt + 1, offsets[elt], offsets[elt + 1]);
3405  Assert(set[elt] == false && ((elt < 3) && set[elt + 1] == false));
3406 
3407  // In this case, we bitcast from a 4xi32 to a 2xi64 vector
3408  llvm::Type *vec2x64Type = llvm::VectorType::get(LLVMTypes::Int64Type, 2);
3409  result = new llvm::BitCastInst(result, vec2x64Type, "to2x64", insertBefore);
3410 
3411  // And now we can insert the 64-bit wide value into the
3412  // appropriate element
3413  result = llvm::InsertElementInst::Create(result, load.load, LLVMInt32(elt / 2), "insert64", insertBefore);
3414 
3415  // And back to 4xi32.
3416  llvm::Type *vec4x32Type = llvm::VectorType::get(LLVMTypes::Int32Type, 4);
3417  result = new llvm::BitCastInst(result, vec4x32Type, "to4x32", insertBefore);
3418 
3419  set[elt] = true;
3420  if (elt < 3) {
3421  set[elt + 1] = true;
3422  }
3423  // Advance elt one extra time, since we just took care of two
3424  // elements
3425  ++elt;
3426  } else if (offsets[elt] >= load.start && offsets[elt] < load.start + load.count) {
3427  Debug(SourcePos(),
3428  "Load 2 @ %" PRId64 " matches for element #%d "
3429  "(value %" PRId64 ")",
3430  load.start, elt, offsets[elt]);
3431  // Otherwise, insert one of the 32-bit pieces into an element
3432  // of the final vector
3433  Assert(set[elt] == false);
3434  llvm::Value *toInsert = (offsets[elt] == load.start) ? load.element0 : load.element1;
3435  result = llvm::InsertElementInst::Create(result, toInsert, LLVMInt32(elt), "insert_load", insertBefore);
3436  set[elt] = true;
3437  }
3438  ++elt;
3439  }
3440 
3441  return result;
3442 }
3443 
3444 #if 1
3445 /* This approach works better with AVX, while the #else path generates
3446  slightly better code with SSE. Need to continue to dig into performance
3447  details with this stuff in general... */
3448 
3449 /** And handle a 4-wide load */
3450 static llvm::Value *lApplyLoad4(llvm::Value *result, const CoalescedLoadOp &load, const int64_t offsets[4], bool set[4],
3451  llvm::Instruction *insertBefore) {
3452  // Conceptually, we're going to consider doing a shufflevector with
3453  // the 4-wide load and the 4-wide result we have so far to generate a
3454  // new 4-wide vector. We'll start with shuffle indices that just
3455  // select each element of the result so far for the result.
3456  int32_t shuf[4] = {4, 5, 6, 7};
3457 
3458  for (int elt = 0; elt < 4; ++elt) {
3459  if (offsets[elt] >= load.start && offsets[elt] < load.start + load.count) {
3460  Debug(SourcePos(),
3461  "Load 4 @ %" PRId64 " matches for element #%d "
3462  "(value %" PRId64 ")",
3463  load.start, elt, offsets[elt]);
3464 
3465  // If the current element falls within the range of locations
3466  // that the 4-wide load covers, then compute the appropriate
3467  // shuffle index that extracts the appropriate element from the
3468  // load.
3469  Assert(set[elt] == false);
3470  shuf[elt] = int32_t(offsets[elt] - load.start);
3471  set[elt] = true;
3472  }
3473  }
3474 
3475  // Now, issue a shufflevector instruction if any of the values from the
3476  // load we just considered were applicable.
3477  if (shuf[0] != 4 || shuf[1] != 5 || shuf[2] != 6 || shuf[3] != 7)
3478  result = LLVMShuffleVectors(load.load, result, shuf, 4, insertBefore);
3479 
3480  return result;
3481 }
3482 
3483 /** We need to fill in the values for a 4-wide result vector. This
3484  function looks at all of the generated loads and extracts the
3485  appropriate elements from the appropriate loads to assemble the result.
3486  Here the offsets[] parameter gives the 4 offsets from the base pointer
3487  for the four elements of the result.
3488 */
3489 static llvm::Value *lAssemble4Vector(const std::vector<CoalescedLoadOp> &loadOps, const int64_t offsets[4],
3490  llvm::Instruction *insertBefore) {
3491  llvm::Type *returnType = llvm::VectorType::get(LLVMTypes::Int32Type, 4);
3492  llvm::Value *result = llvm::UndefValue::get(returnType);
3493 
3494  Debug(SourcePos(), "Starting search for loads [%" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 "].", offsets[0],
3495  offsets[1], offsets[2], offsets[3]);
3496 
3497  // Track whether we have found a valid value for each of the four
3498  // elements of the result
3499  bool set[4] = {false, false, false, false};
3500 
3501  // Loop over all of the loads and check each one to see if it provides
3502  // a value that's applicable to the result
3503  for (int load = 0; load < (int)loadOps.size(); ++load) {
3504  const CoalescedLoadOp &li = loadOps[load];
3505 
3506  switch (li.count) {
3507  case 1:
3508  result = lApplyLoad1(result, li, offsets, set, insertBefore);
3509  break;
3510  case 2:
3511  result = lApplyLoad2(result, li, offsets, set, insertBefore);
3512  break;
3513  case 4:
3514  result = lApplyLoad4(result, li, offsets, set, insertBefore);
3515  break;
3516  default:
3517  FATAL("Unexpected load count in lAssemble4Vector()");
3518  }
3519  }
3520 
3521  Debug(SourcePos(), "Done with search for loads [%" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 "].", offsets[0],
3522  offsets[1], offsets[2], offsets[3]);
3523 
3524  for (int i = 0; i < 4; ++i)
3525  Assert(set[i] == true);
3526 
3527  return result;
3528 }
3529 
3530 #else
3531 
3532 static llvm::Value *lApplyLoad4s(llvm::Value *result, const std::vector<CoalescedLoadOp> &loadOps,
3533  const int64_t offsets[4], bool set[4], llvm::Instruction *insertBefore) {
3534  int32_t firstMatchElements[4] = {-1, -1, -1, -1};
3535  const CoalescedLoadOp *firstMatch = NULL;
3536 
3537  Assert(llvm::isa<llvm::UndefValue>(result));
3538 
3539  for (int load = 0; load < (int)loadOps.size(); ++load) {
3540  const CoalescedLoadOp &loadop = loadOps[load];
3541  if (loadop.count != 4)
3542  continue;
3543 
3544  int32_t matchElements[4] = {-1, -1, -1, -1};
3545  bool anyMatched = false;
3546  for (int elt = 0; elt < 4; ++elt) {
3547  if (offsets[elt] >= loadop.start && offsets[elt] < loadop.start + loadop.count) {
3548  Debug(SourcePos(),
3549  "Load 4 @ %" PRId64 " matches for element #%d "
3550  "(value %" PRId64 ")",
3551  loadop.start, elt, offsets[elt]);
3552  anyMatched = true;
3553  Assert(set[elt] == false);
3554  matchElements[elt] = offsets[elt] - loadop.start;
3555  set[elt] = true;
3556  }
3557  }
3558 
3559  if (anyMatched) {
3560  if (llvm::isa<llvm::UndefValue>(result)) {
3561  if (firstMatch == NULL) {
3562  firstMatch = &loadop;
3563  for (int i = 0; i < 4; ++i)
3564  firstMatchElements[i] = matchElements[i];
3565  } else {
3566  int32_t shuffle[4] = {-1, -1, -1, -1};
3567  for (int i = 0; i < 4; ++i) {
3568  if (firstMatchElements[i] != -1)
3569  shuffle[i] = firstMatchElements[i];
3570  else
3571  shuffle[i] = 4 + matchElements[i];
3572  }
3573  result = LLVMShuffleVectors(firstMatch->load, loadop.load, shuffle, 4, insertBefore);
3574  firstMatch = NULL;
3575  }
3576  } else {
3577  int32_t shuffle[4] = {-1, -1, -1, -1};
3578  for (int i = 0; i < 4; ++i) {
3579  if (matchElements[i] != -1)
3580  shuffle[i] = 4 + matchElements[i];
3581  else
3582  shuffle[i] = i;
3583  }
3584  result = LLVMShuffleVectors(result, loadop.load, shuffle, 4, insertBefore);
3585  }
3586  }
3587  }
3588 
3589  if (firstMatch != NULL && llvm::isa<llvm::UndefValue>(result))
3590  return LLVMShuffleVectors(firstMatch->load, result, firstMatchElements, 4, insertBefore);
3591  else
3592  return result;
3593 }
3594 
3595 static llvm::Value *lApplyLoad12s(llvm::Value *result, const std::vector<CoalescedLoadOp> &loadOps,
3596  const int64_t offsets[4], bool set[4], llvm::Instruction *insertBefore) {
3597  // Loop over all of the loads and check each one to see if it provides
3598  // a value that's applicable to the result
3599  for (int load = 0; load < (int)loadOps.size(); ++load) {
3600  const CoalescedLoadOp &loadop = loadOps[load];
3601  Assert(loadop.count == 1 || loadop.count == 2 || loadop.count == 4);
3602 
3603  if (loadop.count == 1)
3604  result = lApplyLoad1(result, loadop, offsets, set, insertBefore);
3605  else if (loadop.count == 2)
3606  result = lApplyLoad2(result, loadop, offsets, set, insertBefore);
3607  }
3608  return result;
3609 }
3610 
3611 /** We need to fill in the values for a 4-wide result vector. This
3612  function looks at all of the generated loads and extracts the
3613  appropriate elements from the appropriate loads to assemble the result.
3614  Here the offsets[] parameter gives the 4 offsets from the base pointer
3615  for the four elements of the result.
3616 */
3617 static llvm::Value *lAssemble4Vector(const std::vector<CoalescedLoadOp> &loadOps, const int64_t offsets[4],
3618  llvm::Instruction *insertBefore) {
3619  llvm::Type *returnType = llvm::VectorType::get(LLVMTypes::Int32Type, 4);
3620  llvm::Value *result = llvm::UndefValue::get(returnType);
3621 
3622  Debug(SourcePos(), "Starting search for loads [%" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 "].", offsets[0],
3623  offsets[1], offsets[2], offsets[3]);
3624 
3625  // Track whether we have found a valid value for each of the four
3626  // elements of the result
3627  bool set[4] = {false, false, false, false};
3628 
3629  result = lApplyLoad4s(result, loadOps, offsets, set, insertBefore);
3630  result = lApplyLoad12s(result, loadOps, offsets, set, insertBefore);
3631 
3632  Debug(SourcePos(), "Done with search for loads [%" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 "].", offsets[0],
3633  offsets[1], offsets[2], offsets[3]);
3634 
3635  for (int i = 0; i < 4; ++i)
3636  Assert(set[i] == true);
3637 
3638  return result;
3639 }
3640 #endif
3641 
3642 /** Given the set of loads that we've done and the set of result values to
3643  be computed, this function computes the final llvm::Value *s for each
3644  result vector.
3645  */
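 // For example (illustrative), on an 8-wide target with two coalesced
 // gathers, constOffsets has 16 entries; the code below builds four 4-wide
 // chunks and then concatenates pairs of them into the two 8-wide results.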
3646 static void lAssembleResultVectors(const std::vector<CoalescedLoadOp> &loadOps,
3647  const std::vector<int64_t> &constOffsets, std::vector<llvm::Value *> &results,
3648  llvm::Instruction *insertBefore) {
3649  // We work on 4-wide chunks of the final values, even when we're
3650  // computing 8-wide or 16-wide vectors. This gives better code from
3651  // LLVM's SSE/AVX code generators.
3652  Assert((constOffsets.size() % 4) == 0);
3653  std::vector<llvm::Value *> vec4s;
3654  for (int i = 0; i < (int)constOffsets.size(); i += 4)
3655  vec4s.push_back(lAssemble4Vector(loadOps, &constOffsets[i], insertBefore));
3656 
3657  // And now concatenate 1, 2, or 4 of the 4-wide vectors computed above
3658  // into 4, 8, or 16-wide final result vectors.
3659  int numGathers = constOffsets.size() / g->target->getVectorWidth();
3660  for (int i = 0; i < numGathers; ++i) {
3661  llvm::Value *result = NULL;
3662  switch (g->target->getVectorWidth()) {
3663  case 4:
3664  result = vec4s[i];
3665  break;
3666  case 8:
3667  result = LLVMConcatVectors(vec4s[2 * i], vec4s[2 * i + 1], insertBefore);
3668  break;
3669  case 16: {
3670  llvm::Value *v1 = LLVMConcatVectors(vec4s[4 * i], vec4s[4 * i + 1], insertBefore);
3671  llvm::Value *v2 = LLVMConcatVectors(vec4s[4 * i + 2], vec4s[4 * i + 3], insertBefore);
3672  result = LLVMConcatVectors(v1, v2, insertBefore);
3673  break;
3674  }
3675  default:
3676  FATAL("Unhandled vector width in lAssembleResultVectors()");
3677  }
3678 
3679  results.push_back(result);
3680  }
3681 }
3682 
3683 /** Given a call to a gather function, extract the base pointer, the 2/4/8
3684  scale, and the first varying offsets value, and use them to compute the
3685  scalar base pointer that is shared by all of the gathers in the group.
3686  (Thus, this base pointer plus the constant offsets term for each gather
3687  gives the set of addresses to use for each gather.)
3688  */
3689 static llvm::Value *lComputeBasePtr(llvm::CallInst *gatherInst, llvm::Instruction *insertBefore) {
3690  llvm::Value *basePtr = gatherInst->getArgOperand(0);
3691  llvm::Value *variableOffsets = gatherInst->getArgOperand(1);
3692  llvm::Value *offsetScale = gatherInst->getArgOperand(2);
3693 
3694  // All of the variable offsets values should be the same, due to
3695  // checking for this in GatherCoalescePass::runOnBasicBlock(). Thus,
3696  // extract the first value and use that as a scalar.
3697  llvm::Value *variable = LLVMExtractFirstVectorElement(variableOffsets);
3698  if (variable->getType() == LLVMTypes::Int64Type)
3699  offsetScale = new llvm::ZExtInst(offsetScale, LLVMTypes::Int64Type, "scale_to64", insertBefore);
3700  llvm::Value *offset =
3701  llvm::BinaryOperator::Create(llvm::Instruction::Mul, variable, offsetScale, "offset", insertBefore);
3702 
3703  return lGEPInst(basePtr, offset, "new_base", insertBefore);
3704 }
3705 
3706 /** Extract the constant offsets (from the common base pointer) from each
3707  of the gathers in a set to be coalesced. These come in as byte
3708  offsets, but we'll transform them into offsets in terms of the size of
3709  the base scalar type being gathered. (e.g. for an i32 gather, we might
3710  have offsets like <0,4,16,20>, which would be transformed to <0,1,4,5>
3711  here.)
3712  */
3713 static void lExtractConstOffsets(const std::vector<llvm::CallInst *> &coalesceGroup, int elementSize,
3714  std::vector<int64_t> *constOffsets) {
3715  int width = g->target->getVectorWidth();
3716  *constOffsets = std::vector<int64_t>(coalesceGroup.size() * width, 0);
3717 
3718  int64_t *endPtr = &((*constOffsets)[0]);
3719  for (int i = 0; i < (int)coalesceGroup.size(); ++i, endPtr += width) {
3720  llvm::Value *offsets = coalesceGroup[i]->getArgOperand(3);
3721  int nElts;
3722  bool ok = LLVMExtractVectorInts(offsets, endPtr, &nElts);
3723  Assert(ok && nElts == width);
3724  }
3725 
3726  for (int i = 0; i < (int)constOffsets->size(); ++i)
3727  (*constOffsets)[i] /= elementSize;
3728 }
3729 
3730 /** Actually do the coalescing. We have a set of gathers all accessing
3731  addresses of the form:
3732 
3733  (ptr + {1,2,4,8} * varyingOffset) + constOffset, a.k.a.
3734  basePtr + constOffset
3735 
3736  where varyingOffset actually has the same value across all of the SIMD
3737  lanes and where the part in parenthesis has the same value for all of
3738  the gathers in the group.
3739  */
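 // For instance (hypothetical, 4-wide gang): two i32 gathers that share
 //     basePtr = ptr + 4 * varyingOffset
 // and carry per-lane constant byte offsets <0,4,8,12> and <16,20,24,28>
 // reduce, after dividing by the element size, to the combined offset set
 // {0,...,7}, which lSelectLoads() covers with a single 8-wide load.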
3740 static bool lCoalesceGathers(const std::vector<llvm::CallInst *> &coalesceGroup) {
3741  llvm::Instruction *insertBefore = coalesceGroup[0];
3742 
3743  // First, compute the shared base pointer for all of the gathers
3744  llvm::Value *basePtr = lComputeBasePtr(coalesceGroup[0], insertBefore);
3745 
3746  int elementSize = 0;
3747  if (coalesceGroup[0]->getType() == LLVMTypes::Int32VectorType ||
3748  coalesceGroup[0]->getType() == LLVMTypes::FloatVectorType)
3749  elementSize = 4;
3750  else if (coalesceGroup[0]->getType() == LLVMTypes::Int64VectorType ||
3751  coalesceGroup[0]->getType() == LLVMTypes::DoubleVectorType)
3752  elementSize = 8;
3753  else
3754  FATAL("Unexpected gather type in lCoalesceGathers");
3755 
3756  // Extract the constant offsets from the gathers into the constOffsets
3757  // vector: the first vectorWidth elements will be those for the first
3758  // gather, the next vectorWidth those for the next gather, and so
3759  // forth.
3760  std::vector<int64_t> constOffsets;
3761  lExtractConstOffsets(coalesceGroup, elementSize, &constOffsets);
3762 
3763  // Determine a set of loads to perform to get all of the values we need
3764  // loaded.
3765  std::vector<CoalescedLoadOp> loadOps;
3766  lSelectLoads(constOffsets, &loadOps);
3767 
3768  lCoalescePerfInfo(coalesceGroup, loadOps);
3769 
3770  // Actually emit load instructions for them
3771  lEmitLoads(basePtr, loadOps, elementSize, insertBefore);
3772 
3773  // Now, for any loads that give us <8 x i32> vectors, split their
3774  // values into two <4 x i32> vectors; it turns out that LLVM gives us
3775  // better code on AVX when we assemble the pieces from 4-wide vectors.
3776  loadOps = lSplit8WideLoads(loadOps, insertBefore);
3777 
3778  // Given all of these chunks of values, shuffle together a vector that
3779  // gives us each result value; the i'th element of results[] gives the
3780  // result for the i'th gather in coalesceGroup.
3781  std::vector<llvm::Value *> results;
3782  lAssembleResultVectors(loadOps, constOffsets, results, insertBefore);
3783 
3784  // Finally, replace each of the original gathers with the instruction
3785  // that gives the value from the coalescing process.
3786  Assert(results.size() == coalesceGroup.size());
3787  for (int i = 0; i < (int)results.size(); ++i) {
3788  llvm::Instruction *ir = llvm::dyn_cast<llvm::Instruction>(results[i]);
3789  Assert(ir != NULL);
3790 
3791  llvm::Type *origType = coalesceGroup[i]->getType();
3792  if (origType != ir->getType())
3793  ir = new llvm::BitCastInst(ir, origType, ir->getName(), coalesceGroup[i]);
3794 
3795  // Previously, all of the instructions to compute the final result
3796  // were inserted into the basic block here; now we remove the very last one
3797  // of them (that holds the final result) from the basic block.
3798  // This way, the following ReplaceInstWithInst() call will operate
3799  // successfully. (It expects that the second argument not be in any
3800  // basic block.)
3801  ir->removeFromParent();
3802 
3803  llvm::ReplaceInstWithInst(coalesceGroup[i], ir);
3804  }
3805 
3806  return true;
3807 }
3808 
3809 /** Given an instruction, returns true if the instruction may write to
3810  memory. This is a conservative test in that it may return true for
3811  some instructions that don't actually end up writing to memory, but
3812  should never return false for an instruction that does write to
3813  memory. */
3814 static bool lInstructionMayWriteToMemory(llvm::Instruction *inst) {
3815  if (llvm::isa<llvm::StoreInst>(inst) || llvm::isa<llvm::AtomicRMWInst>(inst) ||
3816  llvm::isa<llvm::AtomicCmpXchgInst>(inst))
3817  // FIXME: we could be less conservative and try to allow stores if
3818  // we are sure that the pointers don't overlap..
3819  return true;
3820 
3821  // Otherwise, any call instruction that doesn't have an attribute
3822  // indicating it won't write to memory has to be treated as a potential
3823  // store.
3824  llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst);
3825  if (ci != NULL) {
3826  llvm::Function *calledFunc = ci->getCalledFunction();
3827  if (calledFunc == NULL)
3828  return true;
3829 
3830  if (calledFunc->onlyReadsMemory() || calledFunc->doesNotAccessMemory())
3831  return false;
3832  return true;
3833  }
3834 
3835  return false;
3836 }
3837 
3838 bool GatherCoalescePass::runOnBasicBlock(llvm::BasicBlock &bb) {
3839  DEBUG_START_PASS("GatherCoalescePass");
3840 
3841  llvm::Function *gatherFuncs[] = {
3842  m->module->getFunction("__pseudo_gather_factored_base_offsets32_i32"),
3843  m->module->getFunction("__pseudo_gather_factored_base_offsets32_float"),
3844  m->module->getFunction("__pseudo_gather_factored_base_offsets64_i32"),
3845  m->module->getFunction("__pseudo_gather_factored_base_offsets64_float"),
3846  };
3847  int nGatherFuncs = sizeof(gatherFuncs) / sizeof(gatherFuncs[0]);
3848 
3849  bool modifiedAny = false;
3850 
3851 restart:
3852  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
3853  // Iterate over all of the instructions and look for calls to
3854  // __pseudo_gather_factored_base_offsets{32,64}_{i32,float} calls.
3855  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
3856  if (callInst == NULL)
3857  continue;
3858 
3859  llvm::Function *calledFunc = callInst->getCalledFunction();
3860  if (calledFunc == NULL)
3861  continue;
3862 
3863  int i;
3864  for (i = 0; i < nGatherFuncs; ++i)
3865  if (gatherFuncs[i] != NULL && calledFunc == gatherFuncs[i])
3866  break;
3867  if (i == nGatherFuncs)
3868  // Doesn't match any of the types of gathers we care about
3869  continue;
3870 
3871  SourcePos pos;
3872  lGetSourcePosFromMetadata(callInst, &pos);
3873  Debug(pos, "Checking for coalescable gathers starting here...");
3874 
3875  llvm::Value *base = callInst->getArgOperand(0);
3876  llvm::Value *variableOffsets = callInst->getArgOperand(1);
3877  llvm::Value *offsetScale = callInst->getArgOperand(2);
3878  llvm::Value *mask = callInst->getArgOperand(4);
3879 
3880  // To apply this optimization, we need a set of one or more gathers
3881  // that fulfill the following conditions:
3882  //
3883  // - Mask all on
3884  // - The variable offsets to all have the same value (i.e., to be
3885  // uniform).
3886  // - Same base pointer, variable offsets, and offset scale (for
3887  // more than one gather)
3888  //
3889  // Then and only then do we have a common base pointer with all
3890  // offsets from that constants (in which case we can potentially
3891  // coalesce).
3892  if (lGetMaskStatus(mask) != ALL_ON)
3893  continue;
3894 
3895  if (!LLVMVectorValuesAllEqual(variableOffsets))
3896  continue;
3897 
3898  // coalesceGroup stores the set of gathers that we're going to try to
3899  // coalesce over
3900  std::vector<llvm::CallInst *> coalesceGroup;
3901  coalesceGroup.push_back(callInst);
3902 
3903  // Start iterating at the instruction after the initial gather;
3904  // look at the remainder of instructions in the basic block (up
3905  // until we reach a write to memory) to try to find any other
3906  // gathers that can coalesce with this one.
3907  llvm::BasicBlock::iterator fwdIter = iter;
3908  ++fwdIter;
3909  for (; fwdIter != bb.end(); ++fwdIter) {
3910  // Must stop once we come to an instruction that may write to
3911  // memory; otherwise we could end up moving a read before this
3912  // write.
3913  if (lInstructionMayWriteToMemory(&*fwdIter))
3914  break;
3915 
3916  llvm::CallInst *fwdCall = llvm::dyn_cast<llvm::CallInst>(&*fwdIter);
3917  if (fwdCall == NULL || fwdCall->getCalledFunction() != calledFunc)
3918  continue;
3919 
3920  SourcePos fwdPos;
3921  // TODO: need to redesign metadata attached to pseudo calls,
3922  // LLVM drops metadata frequently and it results in bad diagnostics.
3923  lGetSourcePosFromMetadata(fwdCall, &fwdPos);
3924 
3925 #ifndef ISPC_NO_DUMPS
3926  if (g->debugPrint) {
3927  if (base != fwdCall->getArgOperand(0)) {
3928  Debug(fwdPos, "base pointers mismatch");
3929  LLVMDumpValue(base);
3930  LLVMDumpValue(fwdCall->getArgOperand(0));
3931  }
3932  if (variableOffsets != fwdCall->getArgOperand(1)) {
3933  Debug(fwdPos, "varying offsets mismatch");
3934  LLVMDumpValue(variableOffsets);
3935  LLVMDumpValue(fwdCall->getArgOperand(1));
3936  }
3937  if (offsetScale != fwdCall->getArgOperand(2)) {
3938  Debug(fwdPos, "offset scales mismatch");
3939  LLVMDumpValue(offsetScale);
3940  LLVMDumpValue(fwdCall->getArgOperand(2));
3941  }
3942  if (mask != fwdCall->getArgOperand(4)) {
3943  Debug(fwdPos, "masks mismatch");
3944  LLVMDumpValue(mask);
3945  LLVMDumpValue(fwdCall->getArgOperand(4));
3946  }
3947  }
3948 #endif
3949 
3950  if (base == fwdCall->getArgOperand(0) && variableOffsets == fwdCall->getArgOperand(1) &&
3951  offsetScale == fwdCall->getArgOperand(2) && mask == fwdCall->getArgOperand(4)) {
3952  Debug(fwdPos, "This gather can be coalesced.");
3953  coalesceGroup.push_back(fwdCall);
3954 
3955  if (coalesceGroup.size() == 4)
3956  // FIXME: untested heuristic: don't try to coalesce
3957  // over a window of more than 4 gathers, so that we
3958  // don't cause too much register pressure and end up
3959  // spilling to memory anyway.
3960  break;
3961  } else
3962  Debug(fwdPos, "This gather doesn't match the initial one.");
3963  }
3964 
3965  Debug(pos, "Done with checking for matching gathers");
3966 
3967  // Now that we have a group of gathers, see if we can coalesce them
3968  // into something more efficient than the original set of gathers.
3969  if (lCoalesceGathers(coalesceGroup)) {
3970  modifiedAny = true;
3971  goto restart;
3972  }
3973  }
3974 
3975  DEBUG_END_PASS("GatherCoalescePass");
3976 
3977  return modifiedAny;
3978 }
3979 
3980 static llvm::Pass *CreateGatherCoalescePass() { return new GatherCoalescePass; }
3981 
3982 ///////////////////////////////////////////////////////////////////////////
3983 // ReplacePseudoMemoryOpsPass
3984 
3985 /** For any gathers and scatters remaining after the GSToLoadStorePass
3986  runs, we need to turn them into actual native gathers and scatters.
3987  This task is handled by the ReplacePseudoMemoryOpsPass here.
3988  */
3989 class ReplacePseudoMemoryOpsPass : public llvm::BasicBlockPass {
3990  public:
3991  static char ID;
3992  ReplacePseudoMemoryOpsPass() : BasicBlockPass(ID) {}
3993 
3994 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
3995  const char *getPassName() const { return "Replace Pseudo Memory Ops"; }
3996 #else // LLVM 4.0+
3997  llvm::StringRef getPassName() const { return "Replace Pseudo Memory Ops"; }
3998 #endif
3999  bool runOnBasicBlock(llvm::BasicBlock &BB);
4000 };
4001 
4002 char ReplacePseudoMemoryOpsPass::ID = 0;
4003 
4004 /** This routine attempts to determine if the given pointer in lvalue is
4005  pointing to stack-allocated memory. It's conservative in that it
4006  should never return true for non-stack allocated memory, but may return
4007  false for memory that actually is stack allocated. The basic strategy
4008  is to traverse through the operands and see if the pointer originally
4009  comes from an AllocaInst.
4010 */
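 // For example (a sketch): a local `varying float` typically lowers to
 // something like
 //     %v = alloca <WIDTH x float>
 // and stores through bitcasts/GEPs of %v pass the test below, whereas a
 // pointer passed in from outside the function cannot be proven to be
 // stack-allocated, so this routine conservatively returns false for it.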
4011 static bool lIsSafeToBlend(llvm::Value *lvalue) {
4012  llvm::BitCastInst *bc = llvm::dyn_cast<llvm::BitCastInst>(lvalue);
4013  if (bc != NULL)
4014  return lIsSafeToBlend(bc->getOperand(0));
4015  else {
4016  llvm::AllocaInst *ai = llvm::dyn_cast<llvm::AllocaInst>(lvalue);
4017  if (ai) {
4018  llvm::Type *type = ai->getType();
4019  llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(type);
4020  assert(pt != NULL);
4021  type = pt->getElementType();
4022  llvm::ArrayType *at;
4023  while ((at = llvm::dyn_cast<llvm::ArrayType>(type))) {
4024  type = at->getElementType();
4025  }
4026  llvm::VectorType *vt = llvm::dyn_cast<llvm::VectorType>(type);
4027  return (vt != NULL && (int)vt->getNumElements() == g->target->getVectorWidth());
4028  } else {
4029  llvm::GetElementPtrInst *gep = llvm::dyn_cast<llvm::GetElementPtrInst>(lvalue);
4030  if (gep != NULL)
4031  return lIsSafeToBlend(gep->getOperand(0));
4032  else
4033  return false;
4034  }
4035  }
4036 }
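// [Editor's illustration -- not part of opt.cpp.] Roughly what lIsSafeToBlend()
// accepts: it walks back through bitcasts and GEPs and answers "true" only when
// the pointer originates in an alloca of a full-vector-width type, e.g. for an
// 8-wide target:
//
//     %v = alloca <8 x float>                      ; true
//     %p = bitcast <8 x float>* %v to i8*          ; true (traced back to %v)
//     %q = getelementptr [4 x <8 x float>], ...    ; true if it leads back to such an alloca
//     %r = load i8*, i8** @some_global_ptr         ; false -- origin unknown
//
// The conservative "false" matters because a blended store rewrites masked-off
// lanes with their old values (a read-modify-write), which is only safe for
// memory this program instance owns, such as its own stack.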
4037 
4038 static bool lReplacePseudoMaskedStore(llvm::CallInst *callInst) {
4039  struct LMSInfo {
4040  LMSInfo(const char *pname, const char *bname, const char *msname) {
4041  pseudoFunc = m->module->getFunction(pname);
4042  blendFunc = m->module->getFunction(bname);
4043  maskedStoreFunc = m->module->getFunction(msname);
4044  Assert(pseudoFunc != NULL && blendFunc != NULL && maskedStoreFunc != NULL);
4045  }
4046  llvm::Function *pseudoFunc;
4047  llvm::Function *blendFunc;
4048  llvm::Function *maskedStoreFunc;
4049  };
4050 
4051  LMSInfo msInfo[] = {
4052  LMSInfo("__pseudo_masked_store_i8", "__masked_store_blend_i8", "__masked_store_i8"),
4053  LMSInfo("__pseudo_masked_store_i16", "__masked_store_blend_i16", "__masked_store_i16"),
4054  LMSInfo("__pseudo_masked_store_i32", "__masked_store_blend_i32", "__masked_store_i32"),
4055  LMSInfo("__pseudo_masked_store_float", "__masked_store_blend_float", "__masked_store_float"),
4056  LMSInfo("__pseudo_masked_store_i64", "__masked_store_blend_i64", "__masked_store_i64"),
4057  LMSInfo("__pseudo_masked_store_double", "__masked_store_blend_double", "__masked_store_double")};
4058 
4059  LMSInfo *info = NULL;
4060  for (unsigned int i = 0; i < sizeof(msInfo) / sizeof(msInfo[0]); ++i) {
4061  if (msInfo[i].pseudoFunc != NULL && callInst->getCalledFunction() == msInfo[i].pseudoFunc) {
4062  info = &msInfo[i];
4063  break;
4064  }
4065  }
4066  if (info == NULL)
4067  return false;
4068 
4069  llvm::Value *lvalue = callInst->getArgOperand(0);
4070  llvm::Value *rvalue = callInst->getArgOperand(1);
4071  llvm::Value *mask = callInst->getArgOperand(2);
4072 
4073  // We need to choose between doing the load + blend + store trick,
4074  // or serializing the masked store. Even on targets with a native
4075  // masked store instruction, this is preferable since it lets us
4076  // keep values in registers rather than going out to the stack.
4077  bool doBlend = (!g->opt.disableBlendedMaskedStores && lIsSafeToBlend(lvalue));
4078 
4079  // Generate the call to the appropriate masked store function and
4080  // replace the __pseudo_* one with it.
4081  llvm::Function *fms = doBlend ? info->blendFunc : info->maskedStoreFunc;
4082  llvm::Instruction *inst = lCallInst(fms, lvalue, rvalue, mask, "", callInst);
4083  lCopyMetadata(inst, callInst);
4084 
4085  callInst->eraseFromParent();
4086  return true;
4087 }
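// [Editor's illustration -- not part of opt.cpp.] The rewrite above is a plain
// callee/operand-preserving replacement. For an 8-wide target a call such as
//
//     call void @__pseudo_masked_store_float(<8 x float>* %ptr, <8 x float> %val, <8 x i32> %mask)
//
// becomes either
//
//     call void @__masked_store_blend_float(%ptr, %val, %mask)   ; load+blend+store path
// or
//     call void @__masked_store_float(%ptr, %val, %mask)         ; general masked store
//
// depending on lIsSafeToBlend(%ptr) and opt.disableBlendedMaskedStores, with the
// source-position metadata carried over by lCopyMetadata().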
4088 
4089 static bool lReplacePseudoGS(llvm::CallInst *callInst) {
4090  struct LowerGSInfo {
4091  LowerGSInfo(const char *pName, const char *aName, bool ig, bool ip) : isGather(ig), isPrefetch(ip) {
4092  pseudoFunc = m->module->getFunction(pName);
4093  actualFunc = m->module->getFunction(aName);
4094  }
4095  llvm::Function *pseudoFunc;
4096  llvm::Function *actualFunc;
4097  const bool isGather;
4098  const bool isPrefetch;
4099  };
4100 
4101  LowerGSInfo lgsInfo[] = {
4102  LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true, false),
4103  LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true, false),
4104  LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true, false),
4105  LowerGSInfo("__pseudo_gather32_float", "__gather32_float", true, false),
4106  LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true, false),
4107  LowerGSInfo("__pseudo_gather32_double", "__gather32_double", true, false),
4108 
4109  LowerGSInfo("__pseudo_gather64_i8", "__gather64_i8", true, false),
4110  LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true, false),
4111  LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true, false),
4112  LowerGSInfo("__pseudo_gather64_float", "__gather64_float", true, false),
4113  LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true, false),
4114  LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true, false),
4115 
4116  LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8", "__gather_factored_base_offsets32_i8", true, false),
4117  LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16", "__gather_factored_base_offsets32_i16", true, false),
4118  LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32", "__gather_factored_base_offsets32_i32", true, false),
4119  LowerGSInfo("__pseudo_gather_factored_base_offsets32_float", "__gather_factored_base_offsets32_float", true,
4120  false),
4121  LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64", "__gather_factored_base_offsets32_i64", true, false),
4122  LowerGSInfo("__pseudo_gather_factored_base_offsets32_double", "__gather_factored_base_offsets32_double", true,
4123  false),
4124 
4125  LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8", "__gather_factored_base_offsets64_i8", true, false),
4126  LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16", "__gather_factored_base_offsets64_i16", true, false),
4127  LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32", "__gather_factored_base_offsets64_i32", true, false),
4128  LowerGSInfo("__pseudo_gather_factored_base_offsets64_float", "__gather_factored_base_offsets64_float", true,
4129  false),
4130  LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64", "__gather_factored_base_offsets64_i64", true, false),
4131  LowerGSInfo("__pseudo_gather_factored_base_offsets64_double", "__gather_factored_base_offsets64_double", true,
4132  false),
4133 
4134  LowerGSInfo("__pseudo_gather_base_offsets32_i8", "__gather_base_offsets32_i8", true, false),
4135  LowerGSInfo("__pseudo_gather_base_offsets32_i16", "__gather_base_offsets32_i16", true, false),
4136  LowerGSInfo("__pseudo_gather_base_offsets32_i32", "__gather_base_offsets32_i32", true, false),
4137  LowerGSInfo("__pseudo_gather_base_offsets32_float", "__gather_base_offsets32_float", true, false),
4138  LowerGSInfo("__pseudo_gather_base_offsets32_i64", "__gather_base_offsets32_i64", true, false),
4139  LowerGSInfo("__pseudo_gather_base_offsets32_double", "__gather_base_offsets32_double", true, false),
4140 
4141  LowerGSInfo("__pseudo_gather_base_offsets64_i8", "__gather_base_offsets64_i8", true, false),
4142  LowerGSInfo("__pseudo_gather_base_offsets64_i16", "__gather_base_offsets64_i16", true, false),
4143  LowerGSInfo("__pseudo_gather_base_offsets64_i32", "__gather_base_offsets64_i32", true, false),
4144  LowerGSInfo("__pseudo_gather_base_offsets64_float", "__gather_base_offsets64_float", true, false),
4145  LowerGSInfo("__pseudo_gather_base_offsets64_i64", "__gather_base_offsets64_i64", true, false),
4146  LowerGSInfo("__pseudo_gather_base_offsets64_double", "__gather_base_offsets64_double", true, false),
4147 
4148  LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false, false),
4149  LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false, false),
4150  LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false, false),
4151  LowerGSInfo("__pseudo_scatter32_float", "__scatter32_float", false, false),
4152  LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false, false),
4153  LowerGSInfo("__pseudo_scatter32_double", "__scatter32_double", false, false),
4154 
4155  LowerGSInfo("__pseudo_scatter64_i8", "__scatter64_i8", false, false),
4156  LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false, false),
4157  LowerGSInfo("__pseudo_scatter64_i32", "__scatter64_i32", false, false),
4158  LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false, false),
4159  LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false, false),
4160  LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false, false),
4161 
4162  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8", "__scatter_factored_base_offsets32_i8", false,
4163  false),
4164  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16", "__scatter_factored_base_offsets32_i16", false,
4165  false),
4166  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32", "__scatter_factored_base_offsets32_i32", false,
4167  false),
4168  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float", "__scatter_factored_base_offsets32_float", false,
4169  false),
4170  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64", "__scatter_factored_base_offsets32_i64", false,
4171  false),
4172  LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double", "__scatter_factored_base_offsets32_double",
4173  false, false),
4174 
4175  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8", "__scatter_factored_base_offsets64_i8", false,
4176  false),
4177  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16", "__scatter_factored_base_offsets64_i16", false,
4178  false),
4179  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32", "__scatter_factored_base_offsets64_i32", false,
4180  false),
4181  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float", "__scatter_factored_base_offsets64_float", false,
4182  false),
4183  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64", "__scatter_factored_base_offsets64_i64", false,
4184  false),
4185  LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double", "__scatter_factored_base_offsets64_double",
4186  false, false),
4187 
4188  LowerGSInfo("__pseudo_scatter_base_offsets32_i8", "__scatter_base_offsets32_i8", false, false),
4189  LowerGSInfo("__pseudo_scatter_base_offsets32_i16", "__scatter_base_offsets32_i16", false, false),
4190  LowerGSInfo("__pseudo_scatter_base_offsets32_i32", "__scatter_base_offsets32_i32", false, false),
4191  LowerGSInfo("__pseudo_scatter_base_offsets32_float", "__scatter_base_offsets32_float", false, false),
4192  LowerGSInfo("__pseudo_scatter_base_offsets32_i64", "__scatter_base_offsets32_i64", false, false),
4193  LowerGSInfo("__pseudo_scatter_base_offsets32_double", "__scatter_base_offsets32_double", false, false),
4194 
4195  LowerGSInfo("__pseudo_scatter_base_offsets64_i8", "__scatter_base_offsets64_i8", false, false),
4196  LowerGSInfo("__pseudo_scatter_base_offsets64_i16", "__scatter_base_offsets64_i16", false, false),
4197  LowerGSInfo("__pseudo_scatter_base_offsets64_i32", "__scatter_base_offsets64_i32", false, false),
4198  LowerGSInfo("__pseudo_scatter_base_offsets64_float", "__scatter_base_offsets64_float", false, false),
4199  LowerGSInfo("__pseudo_scatter_base_offsets64_i64", "__scatter_base_offsets64_i64", false, false),
4200  LowerGSInfo("__pseudo_scatter_base_offsets64_double", "__scatter_base_offsets64_double", false, false),
4201 
4202  LowerGSInfo("__pseudo_prefetch_read_varying_1", "__prefetch_read_varying_1", false, true),
4203  LowerGSInfo("__pseudo_prefetch_read_varying_1_native", "__prefetch_read_varying_1_native", false, true),
4204 
4205  LowerGSInfo("__pseudo_prefetch_read_varying_2", "__prefetch_read_varying_2", false, true),
4206  LowerGSInfo("__pseudo_prefetch_read_varying_2_native", "__prefetch_read_varying_2_native", false, true),
4207 
4208  LowerGSInfo("__pseudo_prefetch_read_varying_3", "__prefetch_read_varying_3", false, true),
4209  LowerGSInfo("__pseudo_prefetch_read_varying_3_native", "__prefetch_read_varying_3_native", false, true),
4210 
4211  LowerGSInfo("__pseudo_prefetch_read_varying_nt", "__prefetch_read_varying_nt", false, true),
4212  LowerGSInfo("__pseudo_prefetch_read_varying_nt_native", "__prefetch_read_varying_nt_native", false, true),
4213  };
4214 
4215  llvm::Function *calledFunc = callInst->getCalledFunction();
4216 
4217  LowerGSInfo *info = NULL;
4218  for (unsigned int i = 0; i < sizeof(lgsInfo) / sizeof(lgsInfo[0]); ++i) {
4219  if (lgsInfo[i].pseudoFunc != NULL && calledFunc == lgsInfo[i].pseudoFunc) {
4220  info = &lgsInfo[i];
4221  break;
4222  }
4223  }
4224  if (info == NULL)
4225  return false;
4226 
4227  Assert(info->actualFunc != NULL);
4228 
4229  // Get the source position from the metadata attached to the call
4230  // instruction so that we can issue PerformanceWarning()s below.
4231  SourcePos pos;
4232  bool gotPosition = lGetSourcePosFromMetadata(callInst, &pos);
4233 
4234  callInst->setCalledFunction(info->actualFunc);
4235  if (gotPosition && g->target->getVectorWidth() > 1) {
4236  if (info->isGather)
4237  PerformanceWarning(pos, "Gather required to load value.");
4238  else if (!info->isPrefetch)
4239  PerformanceWarning(pos, "Scatter required to store value.");
4240  }
4241  return true;
4242 }
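// [Editor's illustration -- not part of opt.cpp.] lReplacePseudoGS() only swaps
// the callee, so for example
//
//     %r = call <8 x i32> @__pseudo_gather64_i32(<8 x i64> %ptrs, <8 x i32> %mask)
//
// becomes
//
//     %r = call <8 x i32> @__gather64_i32(<8 x i64> %ptrs, <8 x i32> %mask)
//
// with the operands untouched; a PerformanceWarning is emitted at the source
// position stored in the call's metadata for gathers and scatters (but not for
// the prefetch variants, which are expected to be there).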
4243 
4244 bool ReplacePseudoMemoryOpsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
4245  DEBUG_START_PASS("ReplacePseudoMemoryOpsPass");
4246 
4247  bool modifiedAny = false;
4248 
4249 restart:
4250  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
4251  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
4252  if (callInst == NULL || callInst->getCalledFunction() == NULL)
4253  continue;
4254 
4255  if (lReplacePseudoGS(callInst)) {
4256  modifiedAny = true;
4257  goto restart;
4258  } else if (lReplacePseudoMaskedStore(callInst)) {
4259  modifiedAny = true;
4260  goto restart;
4261  }
4262  }
4263 
4264  DEBUG_END_PASS("ReplacePseudoMemoryOpsPass");
4265 
4266  return modifiedAny;
4267 }
4268 
4269 static llvm::Pass *CreateReplacePseudoMemoryOpsPass() { return new ReplacePseudoMemoryOpsPass; }
4270 
4271 ///////////////////////////////////////////////////////////////////////////
4272 // IsCompileTimeConstantPass
4273 
4274 /** LLVM IR implementations of target-specific functions may include calls
4275  to the functions "bool __is_compile_time_constant_*(...)"; these allow
4276  them to have specialized code paths for cases where the corresponding value
4277  is known at compile time. For masks, for example, this lets them skip the
4278  cost of a MOVMSK call at runtime to compute the mask's value when that value
4279  is already known at compile time.
4280 
4281  This pass resolves these calls into either 'true' or 'false' values so
4282  that later optimization passes can operate with these as constants.
4283 
4284  See stdlib.m4 for a number of uses of this idiom.
4285  */
4286 
4287 class IsCompileTimeConstantPass : public llvm::BasicBlockPass {
4288  public:
4289  static char ID;
4290  IsCompileTimeConstantPass(bool last = false) : BasicBlockPass(ID) { isLastTry = last; }
4291 
4292 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
4293  const char *getPassName() const { return "Resolve \"is compile time constant\""; }
4294 #else // LLVM 4.0+
4295  llvm::StringRef getPassName() const { return "Resolve \"is compile time constant\""; }
4296 #endif
4297  bool runOnBasicBlock(llvm::BasicBlock &BB);
4298 
4299  bool isLastTry;
4300 };
4301 
4302 char IsCompileTimeConstantPass::ID = 0;
4303 
4304 bool IsCompileTimeConstantPass::runOnBasicBlock(llvm::BasicBlock &bb) {
4305  DEBUG_START_PASS("IsCompileTimeConstantPass");
4306 
4307  llvm::Function *funcs[] = {m->module->getFunction("__is_compile_time_constant_mask"),
4308  m->module->getFunction("__is_compile_time_constant_uniform_int32"),
4309  m->module->getFunction("__is_compile_time_constant_varying_int32")};
4310 
4311  bool modifiedAny = false;
4312 restart:
4313  for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
4314  // Iterate through the instructions looking for calls to the
4315  // __is_compile_time_constant_*() functions
4316  llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
4317  if (callInst == NULL)
4318  continue;
4319 
4320  int j;
4321  int nFuncs = sizeof(funcs) / sizeof(funcs[0]);
4322  for (j = 0; j < nFuncs; ++j) {
4323  if (funcs[j] != NULL && callInst->getCalledFunction() == funcs[j])
4324  break;
4325  }
4326  if (j == nFuncs)
4327  // not a __is_compile_time_constant_* function
4328  continue;
4329 
4330  // This optimization pass can be disabled with both the (poorly
4331  // named) disableGatherScatterFlattening option and
4332  // disableMaskAllOnOptimizations.
4333  if (g->opt.disableGatherScatterFlattening || g->opt.disableMaskAllOnOptimizations) {
4334  llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMFalse);
4335  modifiedAny = true;
4336  goto restart;
4337  }
4338 
4339  // Is it a constant? Bingo, turn the call's value into a constant
4340  // true value.
4341  llvm::Value *operand = callInst->getArgOperand(0);
4342  if (llvm::isa<llvm::Constant>(operand)) {
4343  llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMTrue);
4344  modifiedAny = true;
4345  goto restart;
4346  }
4347 
4348  // This pass runs multiple times during optimization. Up until the
4349  // very last time, it only replaces the call with a 'true' if the
4350  // value is known to be constant and otherwise leaves the call
4351  // alone, in case further optimization passes can help resolve its
4352  // value. The last time through, it eventually has to give up, and
4353  // replaces any remaining ones with 'false' constants.
4354  if (isLastTry) {
4355  llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMFalse);
4356  modifiedAny = true;
4357  goto restart;
4358  }
4359  }
4360 
4361  DEBUG_END_PASS("IsCompileTimeConstantPass");
4362 
4363  return modifiedAny;
4364 }
4365 
4366 static llvm::Pass *CreateIsCompileTimeConstantPass(bool isLastTry) { return new IsCompileTimeConstantPass(isLastTry); }
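// [Editor's illustration -- not part of opt.cpp.] The stdlib.m4 idiom this pass
// resolves looks roughly like the following (simplified):
//
//     if (__is_compile_time_constant_mask(mask)) {
//         // specialized path: the mask bits are known, no MOVMSK needed
//     } else {
//         // general path: compute the mask bits at runtime
//     }
//
// Early runs of the pass only fold the call to 'true' (when the argument is a
// constant), leaving other calls for later passes to resolve; the final run
// (isLastTry) folds whatever remains to 'false' so the dead branch disappears.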
4367 
4368 //////////////////////////////////////////////////////////////////////////
4369 // DebugPass
4370 
4371 /** This pass is added to the list of passes after each optimization that
4372  we want to debug; it prints a dump of the LLVM IR to stderr, along with
4373  the name and number of the preceding optimization pass.
4374  */
4375 #ifndef ISPC_NO_DUMPS
4376 class DebugPass : public llvm::ModulePass {
4377  public:
4378  static char ID;
4379  DebugPass(char *output) : ModulePass(ID) { snprintf(str_output, sizeof(str_output), "%s", output); }
4380 
4381 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
4382  const char *getPassName() const { return "Dump LLVM IR"; }
4383 #else // LLVM 4.0+
4384  llvm::StringRef getPassName() const { return "Dump LLVM IR"; }
4385 #endif
4386  bool runOnModule(llvm::Module &m);
4387 
4388  private:
4389  char str_output[100];
4390 };
4391 
4392 char DebugPass::ID = 0;
4393 
4394 bool DebugPass::runOnModule(llvm::Module &module) {
4395  fprintf(stderr, "%s", str_output);
4396  fflush(stderr);
4397  module.dump();
4398  return true;
4399 }
4400 
4401 static llvm::Pass *CreateDebugPass(char *output) { return new DebugPass(output); }
4402 #endif
4403 
4404 ///////////////////////////////////////////////////////////////////////////
4405 // MakeInternalFuncsStaticPass
4406 
4407 /** There are a number of target-specific functions that we use during
4408  these optimization passes. By the time we are done with optimization,
4409  any uses of these should be inlined and no calls to these functions
4410  should remain. This pass marks all of these functions as having
4411  private linkage so that subsequent passes can eliminate them as dead
4412  code, thus cleaning up the final code output by the compiler. We can't
4413  just declare these as static from the start, however, since then they
4414  end up being eliminated as dead code during early optimization passes
4415  even though we may need to generate calls to them during later
4416  optimization passes.
4417  */
4418 class MakeInternalFuncsStaticPass : public llvm::ModulePass {
4419  public:
4420  static char ID;
4421  MakeInternalFuncsStaticPass(bool last = false) : ModulePass(ID) {}
4422 
4423  void getAnalysisUsage(llvm::AnalysisUsage &AU) const { AU.setPreservesCFG(); }
4424 
4425 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
4426  const char *getPassName() const { return "Make internal funcs \"static\""; }
4427 #else // LLVM 4.0+
4428  llvm::StringRef getPassName() const { return "Make internal funcs \"static\""; }
4429 #endif
4430  bool runOnModule(llvm::Module &m);
4431 };
4432 
4433 char MakeInternalFuncsStaticPass::ID = 0;
4434 
4435 bool MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
4436  const char *names[] = {
4437  "__avg_up_uint8",
4438  "__avg_up_int8",
4439  "__avg_up_uint16",
4440  "__avg_up_int16",
4441  "__avg_down_uint8",
4442  "__avg_down_int8",
4443  "__avg_down_uint16",
4444  "__avg_down_int16",
4445  "__fast_masked_vload",
4446  "__gather_factored_base_offsets32_i8",
4447  "__gather_factored_base_offsets32_i16",
4448  "__gather_factored_base_offsets32_i32",
4449  "__gather_factored_base_offsets32_i64",
4450  "__gather_factored_base_offsets32_float",
4451  "__gather_factored_base_offsets32_double",
4452  "__gather_factored_base_offsets64_i8",
4453  "__gather_factored_base_offsets64_i16",
4454  "__gather_factored_base_offsets64_i32",
4455  "__gather_factored_base_offsets64_i64",
4456  "__gather_factored_base_offsets64_float",
4457  "__gather_factored_base_offsets64_double",
4458  "__gather_base_offsets32_i8",
4459  "__gather_base_offsets32_i16",
4460  "__gather_base_offsets32_i32",
4461  "__gather_base_offsets32_i64",
4462  "__gather_base_offsets32_float",
4463  "__gather_base_offsets32_double",
4464  "__gather_base_offsets64_i8",
4465  "__gather_base_offsets64_i16",
4466  "__gather_base_offsets64_i32",
4467  "__gather_base_offsets64_i64",
4468  "__gather_base_offsets64_float",
4469  "__gather_base_offsets64_double",
4470  "__gather32_i8",
4471  "__gather32_i16",
4472  "__gather32_i32",
4473  "__gather32_i64",
4474  "__gather32_float",
4475  "__gather32_double",
4476  "__gather64_i8",
4477  "__gather64_i16",
4478  "__gather64_i32",
4479  "__gather64_i64",
4480  "__gather64_float",
4481  "__gather64_double",
4482  "__gather_elt32_i8",
4483  "__gather_elt32_i16",
4484  "__gather_elt32_i32",
4485  "__gather_elt32_i64",
4486  "__gather_elt32_float",
4487  "__gather_elt32_double",
4488  "__gather_elt64_i8",
4489  "__gather_elt64_i16",
4490  "__gather_elt64_i32",
4491  "__gather_elt64_i64",
4492  "__gather_elt64_float",
4493  "__gather_elt64_double",
4494  "__masked_load_i8",
4495  "__masked_load_i16",
4496  "__masked_load_i32",
4497  "__masked_load_i64",
4498  "__masked_load_float",
4499  "__masked_load_double",
4500  "__masked_store_i8",
4501  "__masked_store_i16",
4502  "__masked_store_i32",
4503  "__masked_store_i64",
4504  "__masked_store_float",
4505  "__masked_store_double",
4506  "__masked_store_blend_i8",
4507  "__masked_store_blend_i16",
4508  "__masked_store_blend_i32",
4509  "__masked_store_blend_i64",
4510  "__masked_store_blend_float",
4511  "__masked_store_blend_double",
4512  "__scatter_factored_base_offsets32_i8",
4513  "__scatter_factored_base_offsets32_i16",
4514  "__scatter_factored_base_offsets32_i32",
4515  "__scatter_factored_base_offsets32_i64",
4516  "__scatter_factored_base_offsets32_float",
4517  "__scatter_factored_base_offsets32_double",
4518  "__scatter_factored_base_offsets64_i8",
4519  "__scatter_factored_base_offsets64_i16",
4520  "__scatter_factored_base_offsets64_i32",
4521  "__scatter_factored_base_offsets64_i64",
4522  "__scatter_factored_base_offsets64_float",
4523  "__scatter_factored_base_offsets64_double",
4524  "__scatter_base_offsets32_i8",
4525  "__scatter_base_offsets32_i16",
4526  "__scatter_base_offsets32_i32",
4527  "__scatter_base_offsets32_i64",
4528  "__scatter_base_offsets32_float",
4529  "__scatter_base_offsets32_double",
4530  "__scatter_base_offsets64_i8",
4531  "__scatter_base_offsets64_i16",
4532  "__scatter_base_offsets64_i32",
4533  "__scatter_base_offsets64_i64",
4534  "__scatter_base_offsets64_float",
4535  "__scatter_base_offsets64_double",
4536  "__scatter_elt32_i8",
4537  "__scatter_elt32_i16",
4538  "__scatter_elt32_i32",
4539  "__scatter_elt32_i64",
4540  "__scatter_elt32_float",
4541  "__scatter_elt32_double",
4542  "__scatter_elt64_i8",
4543  "__scatter_elt64_i16",
4544  "__scatter_elt64_i32",
4545  "__scatter_elt64_i64",
4546  "__scatter_elt64_float",
4547  "__scatter_elt64_double",
4548  "__scatter32_i8",
4549  "__scatter32_i16",
4550  "__scatter32_i32",
4551  "__scatter32_i64",
4552  "__scatter32_float",
4553  "__scatter32_double",
4554  "__scatter64_i8",
4555  "__scatter64_i16",
4556  "__scatter64_i32",
4557  "__scatter64_i64",
4558  "__scatter64_float",
4559  "__scatter64_double",
4560  "__prefetch_read_varying_1",
4561  "__prefetch_read_varying_2",
4562  "__prefetch_read_varying_3",
4563  "__prefetch_read_varying_nt",
4564  "__keep_funcs_live",
4565  };
4566 
4567  bool modifiedAny = false;
4568  int count = sizeof(names) / sizeof(names[0]);
4569  for (int i = 0; i < count; ++i) {
4570  llvm::Function *f = m->module->getFunction(names[i]);
4571  if (f != NULL && f->empty() == false) {
4572  f->setLinkage(llvm::GlobalValue::InternalLinkage);
4573  modifiedAny = true;
4574  }
4575  }
4576 
4577  return modifiedAny;
4578 }
4579 
4580 static llvm::Pass *CreateMakeInternalFuncsStaticPass() { return new MakeInternalFuncsStaticPass; }
4581 
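// [Editor's illustration -- not part of opt.cpp.] In IR terms, the pass turns
// each still-defined helper from
//
//     define <8 x i32> @__gather32_i32(...) { ... }
// into
//     define internal <8 x i32> @__gather32_i32(...) { ... }
//
// so that, once all calls have been inlined or were never emitted, the usual
// global dead-code elimination can drop the unused helper bodies from the
// final module.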
4582 ///////////////////////////////////////////////////////////////////////////
4583 // PeepholePass
4584 
4585 class PeepholePass : public llvm::BasicBlockPass {
4586  public:
4587  PeepholePass();
4588 
4589 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
4590  const char *getPassName() const { return "Peephole Optimizations"; }
4591 #else // LLVM 4.0+
4592  llvm::StringRef getPassName() const { return "Peephole Optimizations"; }
4593 #endif
4594  bool runOnBasicBlock(llvm::BasicBlock &BB);
4595 
4596  static char ID;
4597 };
4598 
4599 char PeepholePass::ID = 0;
4600 
4601 PeepholePass::PeepholePass() : BasicBlockPass(ID) {}
4602 
4603 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_3
4604 
4605 using namespace llvm::PatternMatch;
4606 
4607 template <typename Op_t, unsigned Opcode> struct CastClassTypes_match {
4608  Op_t Op;
4609  const llvm::Type *fromType, *toType;
4610 
4611  CastClassTypes_match(const Op_t &OpMatch, const llvm::Type *f, const llvm::Type *t)
4612  : Op(OpMatch), fromType(f), toType(t) {}
4613 
4614  template <typename OpTy> bool match(OpTy *V) {
4615  if (llvm::Operator *O = llvm::dyn_cast<llvm::Operator>(V))
4616  return (O->getOpcode() == Opcode && Op.match(O->getOperand(0)) && O->getType() == toType &&
4617  O->getOperand(0)->getType() == fromType);
4618  return false;
4619  }
4620 };
4621 
4622 template <typename OpTy> inline CastClassTypes_match<OpTy, llvm::Instruction::SExt> m_SExt8To16(const OpTy &Op) {
4623  return CastClassTypes_match<OpTy, llvm::Instruction::SExt>(Op, LLVMTypes::Int8VectorType,
4624  LLVMTypes::Int16VectorType);
4625 }
4626 
4627 template <typename OpTy> inline CastClassTypes_match<OpTy, llvm::Instruction::ZExt> m_ZExt8To16(const OpTy &Op) {
4628  return CastClassTypes_match<OpTy, llvm::Instruction::ZExt>(Op, LLVMTypes::Int8VectorType,
4629  LLVMTypes::Int16VectorType);
4630 }
4631 
4632 template <typename OpTy> inline CastClassTypes_match<OpTy, llvm::Instruction::Trunc> m_Trunc16To8(const OpTy &Op) {
4633  return CastClassTypes_match<OpTy, llvm::Instruction::Trunc>(Op, LLVMTypes::Int16VectorType,
4634  LLVMTypes::Int8VectorType);
4635 }
4636 
4637 template <typename OpTy> inline CastClassTypes_match<OpTy, llvm::Instruction::SExt> m_SExt16To32(const OpTy &Op) {
4638  return CastClassTypes_match<OpTy, llvm::Instruction::SExt>(Op, LLVMTypes::Int16VectorType,
4639  LLVMTypes::Int32VectorType);
4640 }
4641 
4642 template <typename OpTy> inline CastClassTypes_match<OpTy, llvm::Instruction::ZExt> m_ZExt16To32(const OpTy &Op) {
4643  return CastClassTypes_match<OpTy, llvm::Instruction::ZExt>(Op, LLVMTypes::Int16VectorType,
4644  LLVMTypes::Int32VectorType);
4645 }
4646 
4647 template <typename OpTy> inline CastClassTypes_match<OpTy, llvm::Instruction::Trunc> m_Trunc32To16(const OpTy &Op) {
4648  return CastClassTypes_match<OpTy, llvm::Instruction::Trunc>(Op, LLVMTypes::Int32VectorType,
4649  LLVMTypes::Int16VectorType);
4650 }
4651 
4652 template <typename Op_t> struct UDiv2_match {
4653  Op_t Op;
4654 
4655  UDiv2_match(const Op_t &OpMatch) : Op(OpMatch) {}
4656 
4657  template <typename OpTy> bool match(OpTy *V) {
4658  llvm::BinaryOperator *bop;
4659  llvm::ConstantDataVector *cdv;
4660  if ((bop = llvm::dyn_cast<llvm::BinaryOperator>(V)) &&
4661  (cdv = llvm::dyn_cast<llvm::ConstantDataVector>(bop->getOperand(1))) && cdv->getSplatValue() != NULL) {
4662  const llvm::APInt &apInt = cdv->getUniqueInteger();
4663 
4664  switch (bop->getOpcode()) {
4665  case llvm::Instruction::UDiv:
4666  // divide by 2
4667  return (apInt.isIntN(2) && Op.match(bop->getOperand(0)));
4668  case llvm::Instruction::LShr:
4669  // logical shift right by 1
4670  return (apInt.isIntN(1) && Op.match(bop->getOperand(0)));
4671  default:
4672  return false;
4673  }
4674  }
4675  return false;
4676  }
4677 };
4678 
4679 template <typename V> inline UDiv2_match<V> m_UDiv2(const V &v) { return UDiv2_match<V>(v); }
4680 
4681 template <typename Op_t> struct SDiv2_match {
4682  Op_t Op;
4683 
4684  SDiv2_match(const Op_t &OpMatch) : Op(OpMatch) {}
4685 
4686  template <typename OpTy> bool match(OpTy *V) {
4687  llvm::BinaryOperator *bop;
4688  llvm::ConstantDataVector *cdv;
4689  if ((bop = llvm::dyn_cast<llvm::BinaryOperator>(V)) &&
4690  (cdv = llvm::dyn_cast<llvm::ConstantDataVector>(bop->getOperand(1))) && cdv->getSplatValue() != NULL) {
4691  const llvm::APInt &apInt = cdv->getUniqueInteger();
4692 
4693  switch (bop->getOpcode()) {
4694  case llvm::Instruction::SDiv:
4695  // divide by 2
4696  return (apInt.isIntN(2) && Op.match(bop->getOperand(0)));
4697  case llvm::Instruction::AShr:
4698  // arithmetic shift right by 1
4699  return (apInt.isIntN(1) && Op.match(bop->getOperand(0)));
4700  default:
4701  return false;
4702  }
4703  }
4704  return false;
4705  }
4706 };
4707 
4708 template <typename V> inline SDiv2_match<V> m_SDiv2(const V &v) { return SDiv2_match<V>(v); }
4709 
4710 // Returns true if the given function has a call to an intrinsic function
4711 // in its definition.
4712 static bool lHasIntrinsicInDefinition(llvm::Function *func) {
4713  llvm::Function::iterator bbiter = func->begin();
4714  for (; bbiter != func->end(); ++bbiter) {
4715  for (llvm::BasicBlock::iterator institer = bbiter->begin(); institer != bbiter->end(); ++institer) {
4716  if (llvm::isa<llvm::IntrinsicInst>(institer))
4717  return true;
4718  }
4719  }
4720  return false;
4721 }
4722 
4723 static llvm::Instruction *lGetBinaryIntrinsic(const char *name, llvm::Value *opa, llvm::Value *opb) {
4724  llvm::Function *func = m->module->getFunction(name);
4725  Assert(func != NULL);
4726 
4727  // Make sure that the definition of the llvm::Function has a call to an
4728  // intrinsic function in its instructions; otherwise we will generate
4729  // infinite loops where we "helpfully" turn the default implementations
4730  // of target builtins like __avg_up_uint8 that are implemented with plain
4731  // arithmetic ops into recursive calls to themselves.
4732  if (lHasIntrinsicInDefinition(func))
4733  return lCallInst(func, opa, opb, name);
4734  else
4735  return NULL;
4736 }
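// [Editor's illustration -- not part of opt.cpp.] Why the guard above matters:
// if a target's builtins implement, say, __avg_up_uint8 with plain arithmetic,
//
//     define <N x i8> @__avg_up_uint8(<N x i8> %a, <N x i8> %b) {
//         ; zext to i16, add, add 1, lshr 1, trunc back to i8 --
//         ; exactly the pattern lMatchAvgUpUInt8() below recognizes
//     }
//
// then rewriting that body into "call @__avg_up_uint8(%a, %b)" would make the
// function call itself forever. lHasIntrinsicInDefinition() restricts the
// rewrite to definitions that bottom out in a genuine llvm.* intrinsic.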
4737 
4738 //////////////////////////////////////////////////
4739 
4740 static llvm::Instruction *lMatchAvgUpUInt8(llvm::Value *inst) {
4741  // (unsigned int8)(((unsigned int16)a + (unsigned int16)b + 1)/2)
4742  llvm::Value *opa, *opb;
4743  const llvm::APInt *delta;
4744  if (match(inst, m_Trunc16To8(m_UDiv2(m_CombineOr(
4745  m_CombineOr(m_Add(m_ZExt8To16(m_Value(opa)), m_Add(m_ZExt8To16(m_Value(opb)), m_APInt(delta))),
4746  m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_APInt(delta)), m_ZExt8To16(m_Value(opb)))),
4747  m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_ZExt8To16(m_Value(opb))), m_APInt(delta))))))) {
4748  if (delta->isIntN(1) == false)
4749  return NULL;
4750 
4751  return lGetBinaryIntrinsic("__avg_up_uint8", opa, opb);
4752  }
4753  return NULL;
4754 }
4755 
4756 static llvm::Instruction *lMatchAvgDownUInt8(llvm::Value *inst) {
4757  // (unsigned int8)(((unsigned int16)a + (unsigned int16)b)/2)
4758  llvm::Value *opa, *opb;
4759  if (match(inst, m_Trunc16To8(m_UDiv2(m_Add(m_ZExt8To16(m_Value(opa)), m_ZExt8To16(m_Value(opb))))))) {
4760  return lGetBinaryIntrinsic("__avg_down_uint8", opa, opb);
4761  }
4762  return NULL;
4763 }
4764 
4765 static llvm::Instruction *lMatchAvgUpUInt16(llvm::Value *inst) {
4766  // (unsigned int16)(((unsigned int32)a + (unsigned int32)b + 1)/2)
4767  llvm::Value *opa, *opb;
4768  const llvm::APInt *delta;
4769  if (match(inst,
4770  m_Trunc32To16(m_UDiv2(m_CombineOr(
4771  m_CombineOr(m_Add(m_ZExt16To32(m_Value(opa)), m_Add(m_ZExt16To32(m_Value(opb)), m_APInt(delta))),
4772  m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_APInt(delta)), m_ZExt16To32(m_Value(opb)))),
4773  m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_ZExt16To32(m_Value(opb))), m_APInt(delta))))))) {
4774  if (delta->isIntN(1) == false)
4775  return NULL;
4776 
4777  return lGetBinaryIntrinsic("__avg_up_uint16", opa, opb);
4778  }
4779  return NULL;
4780 }
4781 
4782 static llvm::Instruction *lMatchAvgDownUInt16(llvm::Value *inst) {
4783  // (unsigned int16)(((unsigned int32)a + (unsigned int32)b)/2)
4784  llvm::Value *opa, *opb;
4785  if (match(inst, m_Trunc32To16(m_UDiv2(m_Add(m_ZExt16To32(m_Value(opa)), m_ZExt16To32(m_Value(opb))))))) {
4786  return lGetBinaryIntrinsic("__avg_down_uint16", opa, opb);
4787  }
4788  return NULL;
4789 }
4790 
4791 static llvm::Instruction *lMatchAvgUpInt8(llvm::Value *inst) {
4792  // (int8)(((int16)a + (int16)b + 1)/2)
4793  llvm::Value *opa, *opb;
4794  const llvm::APInt *delta;
4795  if (match(inst, m_Trunc16To8(m_SDiv2(m_CombineOr(
4796  m_CombineOr(m_Add(m_SExt8To16(m_Value(opa)), m_Add(m_SExt8To16(m_Value(opb)), m_APInt(delta))),
4797  m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_APInt(delta)), m_SExt8To16(m_Value(opb)))),
4798  m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_SExt8To16(m_Value(opb))), m_APInt(delta))))))) {
4799  if (delta->isIntN(1) == false)
4800  return NULL;
4801 
4802  return lGetBinaryIntrinsic("__avg_up_int8", opa, opb);
4803  }
4804  return NULL;
4805 }
4806 
4807 static llvm::Instruction *lMatchAvgDownInt8(llvm::Value *inst) {
4808  // (int8)(((int16)a + (int16)b)/2)
4809  llvm::Value *opa, *opb;
4810  if (match(inst, m_Trunc16To8(m_SDiv2(m_Add(m_SExt8To16(m_Value(opa)), m_SExt8To16(m_Value(opb))))))) {
4811  return lGetBinaryIntrinsic("__avg_down_int8", opa, opb);
4812  }
4813  return NULL;
4814 }
4815 
4816 static llvm::Instruction *lMatchAvgUpInt16(llvm::Value *inst) {
4817  // (int16)(((int32)a + (int32)b + 1)/2)
4818  llvm::Value *opa, *opb;
4819  const llvm::APInt *delta;
4820  if (match(inst,
4821  m_Trunc32To16(m_SDiv2(m_CombineOr(
4822  m_CombineOr(m_Add(m_SExt16To32(m_Value(opa)), m_Add(m_SExt16To32(m_Value(opb)), m_APInt(delta))),
4823  m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_APInt(delta)), m_SExt16To32(m_Value(opb)))),
4824  m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_SExt16To32(m_Value(opb))), m_APInt(delta))))))) {
4825  if (delta->isIntN(1) == false)
4826  return NULL;
4827 
4828  return lGetBinaryIntrinsic("__avg_up_int16", opa, opb);
4829  }
4830  return NULL;
4831 }
4832 
4833 static llvm::Instruction *lMatchAvgDownInt16(llvm::Value *inst) {
4834  // (int16)(((int32)a + (int32)b)/2)
4835  llvm::Value *opa, *opb;
4836  if (match(inst, m_Trunc32To16(m_SDiv2(m_Add(m_SExt16To32(m_Value(opa)), m_SExt16To32(m_Value(opb))))))) {
4837  return lGetBinaryIntrinsic("__avg_down_int16", opa, opb);
4838  }
4839  return NULL;
4840 }
4841 #endif // !LLVM_3_2
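// [Editor's illustration -- not part of opt.cpp.] In source terms the matchers
// above recognize the widening-average idiom, e.g. (hypothetical C helpers):
//
//     uint8_t avg_up(uint8_t a, uint8_t b)   { return (uint8_t)(((uint16_t)a + (uint16_t)b + 1) / 2); }
//     uint8_t avg_down(uint8_t a, uint8_t b) { return (uint8_t)(((uint16_t)a + (uint16_t)b) / 2); }
//
// After vectorization this lowers to zext/add/{udiv 2 | lshr 1}/trunc chains
// that match m_Trunc16To8(m_UDiv2(...)) and friends, and runOnBasicBlock()
// below replaces the whole chain with a single __avg_up_uint8 /
// __avg_down_uint8 call, which can map to a hardware average instruction.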
4842 
4843 bool PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) {
4844  DEBUG_START_PASS("PeepholePass");
4845 
4846  bool modifiedAny = false;
4847 restart:
4848  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
4849  llvm::Instruction *inst = &*iter;
4850 
4851  llvm::Instruction *builtinCall = NULL;
4852 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_3
4853  if (!builtinCall)
4854  builtinCall = lMatchAvgUpUInt8(inst);
4855  if (!builtinCall)
4856  builtinCall = lMatchAvgUpUInt16(inst);
4857  if (!builtinCall)
4858  builtinCall = lMatchAvgDownUInt8(inst);
4859  if (!builtinCall)
4860  builtinCall = lMatchAvgDownUInt16(inst);
4861  if (!builtinCall)
4862  builtinCall = lMatchAvgUpInt8(inst);
4863  if (!builtinCall)
4864  builtinCall = lMatchAvgUpInt16(inst);
4865  if (!builtinCall)
4866  builtinCall = lMatchAvgDownInt8(inst);
4867  if (!builtinCall)
4868  builtinCall = lMatchAvgDownInt16(inst);
4869 #endif // !LLVM_3_2
4870  if (builtinCall != NULL) {
4871  llvm::ReplaceInstWithInst(inst, builtinCall);
4872  modifiedAny = true;
4873  goto restart;
4874  }
4875  }
4876 
4877  DEBUG_END_PASS("PeepholePass");
4878 
4879  return modifiedAny;
4880 }
4881 
4882 static llvm::Pass *CreatePeepholePass() { return new PeepholePass; }
4883 
4884 /** Given an llvm::Value known to be an integer, return its value as
4885  an int64_t.
4886 */
4887 static int64_t lGetIntValue(llvm::Value *offset) {
4888  llvm::ConstantInt *intOffset = llvm::dyn_cast<llvm::ConstantInt>(offset);
4889  Assert(intOffset && (intOffset->getBitWidth() == 32 || intOffset->getBitWidth() == 64));
4890  return intOffset->getSExtValue();
4891 }
4892 
4893 ///////////////////////////////////////////////////////////////////////////
4894 // ReplaceStdlibShiftPass
4895 
4896 class ReplaceStdlibShiftPass : public llvm::BasicBlockPass {
4897  public:
4898  static char ID;
4899  ReplaceStdlibShiftPass() : BasicBlockPass(ID) {}
4900 
4901 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
4902  const char *getPassName() const { return "Resolve \"replace extract insert chains\""; }
4903 #else // LLVM 4.0+
4904  llvm::StringRef getPassName() const { return "Resolve \"replace extract insert chains\""; }
4905 #endif
4906  bool runOnBasicBlock(llvm::BasicBlock &BB);
4907 };
4908 
4909 char ReplaceStdlibShiftPass::ID = 0;
4910 
4911 // This pass replaces shift() with a ShuffleVector when the offset is a constant.
4912 // rotate(), which is similar in functionality, has a slightly different
4913 // implementation, because LLVM (createInstructionCombiningPass) optimizes the
4914 // rotate() implementation better when similar implementations are used for
4915 // both. This pass is a hack to produce similarly well-optimized code for
4916 // shift().
4917 bool ReplaceStdlibShiftPass::runOnBasicBlock(llvm::BasicBlock &bb) {
4918  DEBUG_START_PASS("ReplaceStdlibShiftPass");
4919  bool modifiedAny = false;
4920 
4921  llvm::Function *shifts[6];
4922  shifts[0] = m->module->getFunction("shift___vytuni");
4923  shifts[1] = m->module->getFunction("shift___vysuni");
4924  shifts[2] = m->module->getFunction("shift___vyiuni");
4925  shifts[3] = m->module->getFunction("shift___vyIuni");
4926  shifts[4] = m->module->getFunction("shift___vyfuni");
4927  shifts[5] = m->module->getFunction("shift___vyduni");
4928 
4929  for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
4930  llvm::Instruction *inst = &*iter;
4931 
4932  if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst)) {
4933  llvm::Function *func = ci->getCalledFunction();
4934  for (int i = 0; i < 6; i++) {
4935  if (shifts[i] && (shifts[i] == func)) {
4936  // we matched a call
4937  llvm::Value *shiftedVec = ci->getArgOperand(0);
4938  llvm::Value *shiftAmt = ci->getArgOperand(1);
4939  if (llvm::isa<llvm::Constant>(shiftAmt)) {
4940  int vectorWidth = g->target->getVectorWidth();
4941  int *shuffleVals = new int[vectorWidth];
4942  int shiftInt = lGetIntValue(shiftAmt);
4943  for (int i = 0; i < vectorWidth; i++) {
4944  int s = i + shiftInt;
4945  s = (s < 0) ? vectorWidth : s;
4946  s = (s >= vectorWidth) ? vectorWidth : s;
4947  shuffleVals[i] = s;
4948  }
4949  llvm::Value *shuffleIdxs = LLVMInt32Vector(shuffleVals);
4950  llvm::Value *zeroVec = llvm::ConstantAggregateZero::get(shiftedVec->getType());
4951  llvm::Value *shuffle =
4952  new llvm::ShuffleVectorInst(shiftedVec, zeroVec, shuffleIdxs, "vecShift", ci);
4953  ci->replaceAllUsesWith(shuffle);
4954  modifiedAny = true;
4955  delete[] shuffleVals;
4956  } else {
4957  PerformanceWarning(SourcePos(), "Stdlib shift() called without constant shift amount.");
4958  }
4959  }
4960  }
4961  }
4962  }
4963 
4964  DEBUG_END_PASS("ReplaceStdlibShiftPass");
4965 
4966  return modifiedAny;
4967 }
4968 
4969 static llvm::Pass *CreateReplaceStdlibShiftPass() { return new ReplaceStdlibShiftPass(); }
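// [Editor's illustration -- not part of opt.cpp.] For an 8-wide target, a
// constant-offset call like shift(v, 1) on a varying int32 is rewritten into
//
//     %vecShift = shufflevector <8 x i32> %v, <8 x i32> zeroinitializer,
//                               <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
//
// Index 8 selects from the zero vector, so the lane shifted past the end reads
// as 0; indices that fall below 0 or beyond the width are clamped to the zero
// vector the same way. A non-constant offset is left as a call and only gets
// the PerformanceWarning above.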
4970 
4971 ///////////////////////////////////////////////////////////////////////////////
4972 // FixBooleanSelect
4973 //
4974 // The problem is that in LLVM 3.3 the optimizer doesn't like
4975 // the following instruction sequence:
4976 // %cmp = fcmp olt <8 x float> %a, %b
4977 // %sext_cmp = sext <8 x i1> %cmp to <8 x i32>
4978 // %new_mask = and <8 x i32> %sext_cmp, %mask
4979 // and optimizes it to the following:
4980 // %cmp = fcmp olt <8 x float> %a, %b
4981 // %cond = select <8 x i1> %cmp, <8 x i32> %mask, <8 x i32> zeroinitializer
4982 //
4983 // It wouldn't be a problem if codegen produced good code for it. But it
4984 // doesn't, especially for vectors larger than native vectors.
4985 //
4986 // This optimization reverts this pattern and should be the last one before
4987 // code gen.
4988 //
4989 // Note that this problem was introduced in LLVM 3.3. But in LLVM 3.4 it was
4990 // fixed. See commit r194542.
4991 //
4992 // After LLVM 3.3 this optimization should probably stay in for experimental
4993 // purposes, and the code should be compared with and without it from time to
4994 // time to make sure that LLVM does the right thing.
4995 ///////////////////////////////////////////////////////////////////////////////
4996 
4997 class FixBooleanSelectPass : public llvm::FunctionPass {
4998  public:
4999  static char ID;
5000  FixBooleanSelectPass() : FunctionPass(ID) {}
5001 
5002 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
5003  const char *getPassName() const { return "Resolve \"replace extract insert chains\""; }
5004 #else // LLVM 4.0+
5005  llvm::StringRef getPassName() const { return "Resolve \"replace extract insert chains\""; }
5006 #endif
5007  bool runOnFunction(llvm::Function &F);
5008 
5009  private:
5010  llvm::Instruction *fixSelect(llvm::SelectInst *sel, llvm::SExtInst *sext);
5011 };
5012 
5013 char FixBooleanSelectPass::ID = 0;
5014 
5015 llvm::Instruction *FixBooleanSelectPass::fixSelect(llvm::SelectInst *sel, llvm::SExtInst *sext) {
5016  // Select instruction result type and its integer equivalent
5017  llvm::VectorType *orig_type = llvm::dyn_cast<llvm::VectorType>(sel->getType());
5018  llvm::VectorType *int_type = llvm::VectorType::getInteger(orig_type);
5019 
5020  // Result value and optional pointer to instruction to delete
5021  llvm::Instruction *result = 0, *optional_to_delete = 0;
5022 
5023  // It can be a vector of integers or a vector of floating-point values.
5024  if (orig_type->getElementType()->isIntegerTy()) {
5025  // Generate sext+and, remove select.
5026  result = llvm::BinaryOperator::CreateAnd(sext, sel->getTrueValue(), "and_mask", sel);
5027  } else {
5028  llvm::BitCastInst *bc = llvm::dyn_cast<llvm::BitCastInst>(sel->getTrueValue());
5029 
5030  if (bc && bc->hasOneUse() && bc->getSrcTy()->isIntOrIntVectorTy() && bc->getSrcTy()->isVectorTy() &&
5031  llvm::isa<llvm::Instruction>(bc->getOperand(0)) &&
5032  llvm::dyn_cast<llvm::Instruction>(bc->getOperand(0))->getParent() == sel->getParent()) {
5033  // The bitcast is casting from an integer type, and its operand is an instruction located in the same
5034  // basic block (otherwise it's unsafe to use it): bitcast+select => sext+and+bitcast. Create the "and":
5035  llvm::BinaryOperator *and_inst = llvm::BinaryOperator::CreateAnd(sext, bc->getOperand(0), "and_mask", sel);
5036  // Bitcast back to original type
5037  result = new llvm::BitCastInst(and_inst, sel->getType(), "bitcast_mask_out", sel);
5038  // Original bitcast will be removed
5039  optional_to_delete = bc;
5040  } else {
5041  // General case: select => bitcast+sext+and+bitcast
5042  // Bitcast
5043  llvm::BitCastInst *bc_in = new llvm::BitCastInst(sel->getTrueValue(), int_type, "bitcast_mask_in", sel);
5044  // And
5045  llvm::BinaryOperator *and_inst = llvm::BinaryOperator::CreateAnd(sext, bc_in, "and_mask", sel);
5046  // Bitcast back to original type
5047  result = new llvm::BitCastInst(and_inst, sel->getType(), "bitcast_mask_out", sel);
5048  }
5049  }
5050 
5051  // Done, finalize.
5052  sel->replaceAllUsesWith(result);
5053  sel->eraseFromParent();
5054  if (optional_to_delete) {
5055  optional_to_delete->eraseFromParent();
5056  }
5057 
5058  return result;
5059 }
5060 
5061 bool FixBooleanSelectPass::runOnFunction(llvm::Function &F) {
5062  bool modifiedAny = false;
5063 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_3 // LLVM 3.3 only
5064 
5065  // Don't optimize generic targets.
5066  if (g->target->getISA() == Target::GENERIC) {
5067  return false;
5068  }
5069 
5070  for (llvm::Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
5071  llvm::BasicBlock *bb = &*I;
5072  for (llvm::BasicBlock::iterator iter = bb->begin(), e = bb->end(); iter != e; ++iter) {
5073  llvm::Instruction *inst = &*iter;
5074 
5075  llvm::CmpInst *cmp = llvm::dyn_cast<llvm::CmpInst>(inst);
5076 
5077  if (cmp && cmp->getType()->isVectorTy() && cmp->getType()->getVectorElementType()->isIntegerTy(1)) {
5078 
5079  // Search for select instruction uses.
5080  int selects = 0;
5081  llvm::VectorType *sext_type = 0;
5082  for (llvm::Instruction::use_iterator it = cmp->use_begin(); it != cmp->use_end(); ++it) {
5083  llvm::SelectInst *sel = llvm::dyn_cast<llvm::SelectInst>(*it);
5084  if (sel && sel->getType()->isVectorTy() && sel->getType()->getScalarSizeInBits() > 1) {
5085  selects++;
5086  // We pick the first one; in the typical case all of the select types are the same anyway.
5087  sext_type = llvm::dyn_cast<llvm::VectorType>(sel->getType());
5088  break;
5089  }
5090  }
5091  if (selects == 0) {
5092  continue;
5093  }
5094  // Get an integer equivalent, if it's not yet an integer.
5095  sext_type = llvm::VectorType::getInteger(sext_type);
5096 
5097  // Do transformation
5098  llvm::BasicBlock::iterator iter_copy = iter;
5099  llvm::Instruction *next_inst = &*(++iter_copy);
5100  // Create or reuse sext
5101  llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(next_inst);
5102  if (sext && sext->getOperand(0) == cmp && sext->getDestTy() == sext_type) {
5103  // This sext can be reused
5104  } else {
5105  if (next_inst) {
5106  sext = new llvm::SExtInst(cmp, sext_type, "sext_cmp", next_inst);
5107  } else {
5108  sext = new llvm::SExtInst(cmp, sext_type, "sext_cmp", bb);
5109  }
5110  }
5111 
5112  // Walk and fix selects
5113  std::vector<llvm::SelectInst *> sel_uses;
5114  for (llvm::Instruction::use_iterator it = cmp->use_begin(); it != cmp->use_end(); ++it) {
5115  llvm::SelectInst *sel = llvm::dyn_cast<llvm::SelectInst>(*it);
5116  if (sel && sel->getType()->getScalarSizeInBits() == sext_type->getScalarSizeInBits()) {
5117 
5118  // Check that second operand is zero.
5119  llvm::Constant *false_cond = llvm::dyn_cast<llvm::Constant>(sel->getFalseValue());
5120  if (false_cond && false_cond->isZeroValue()) {
5121  sel_uses.push_back(sel);
5122  modifiedAny = true;
5123  }
5124  }
5125  }
5126 
5127  for (int i = 0; i < sel_uses.size(); i++) {
5128  fixSelect(sel_uses[i], sext);
5129  }
5130  }
5131  }
5132  }
5133 
5134 #endif // LLVM 3.3
5135 
5136  return modifiedAny;
5137 }
5138 
5139 static llvm::Pass *CreateFixBooleanSelectPass() { return new FixBooleanSelectPass(); }
5140 
5141 #ifdef ISPC_NVPTX_ENABLED
5142 ///////////////////////////////////////////////////////////////////////////////
5143 // Detect addrspace(3)
5144 ///////////////////////////////////////////////////////////////////////////////
5145 
5146 class PromoteLocalToPrivatePass : public llvm::BasicBlockPass {
5147  public:
5148  static char ID; // Pass identification, replacement for typeid
5149  PromoteLocalToPrivatePass() : BasicBlockPass(ID) {}
5150 
5151  bool runOnBasicBlock(llvm::BasicBlock &BB);
5152 };
5153 
5154 char PromoteLocalToPrivatePass::ID = 0;
5155 
5156 bool PromoteLocalToPrivatePass::runOnBasicBlock(llvm::BasicBlock &BB) {
5157  std::vector<llvm::AllocaInst *> Allocas;
5158 
5159  bool modifiedAny = false;
5160 
5161 #if 1
5162 restart:
5163  for (llvm::BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) {
5164  llvm::Instruction *inst = &*I;
5165  if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst)) {
5166  llvm::Function *func = ci->getCalledFunction();
5167  if (func && func->getName() == "llvm.trap") {
5168  std::vector<llvm::Type *> funcTyArgs;
5169  llvm::FunctionType *funcTy = llvm::FunctionType::get(
5170  /*Result=*/llvm::Type::getVoidTy(*g->ctx),
5171  /*Params=*/funcTyArgs,
5172  /*isVarArg=*/false);
5173  llvm::InlineAsm *trap_ptx = llvm::InlineAsm::get(funcTy, "trap;", "", false);
5174  assert(trap_ptx != NULL);
5175  llvm::Instruction *trap_call = llvm::CallInst::Create(trap_ptx);
5176  assert(trap_call != NULL);
5177  llvm::ReplaceInstWithInst(ci, trap_call);
5178  modifiedAny = true;
5179  goto restart;
5180  }
5181  }
5182  }
5183 #endif
5184 
5185 #if 0
5186  llvm::Function *cvtFunc = m->module->getFunction("__cvt_loc2gen_var");
5187 
5188  // Find allocas that are safe to promote, by looking at all instructions in
5189  // the entry node
5190  for (llvm::BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
5191  {
5192  llvm::Instruction *inst = &*I;
5193  if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst))
5194  {
5195  llvm::Function *func = ci->getCalledFunction();
5196  if (cvtFunc && (cvtFunc == func))
5197  {
5198 #if 0
5199  fprintf(stderr , "--found cvt-- name= %s \n",
5200  I->getName().str().c_str());
5201 #endif
5202  llvm::AllocaInst *alloca = new llvm::AllocaInst(LLVMTypes::Int64Type, "opt_loc2var", ci);
5203  assert(alloca != NULL);
5204 #if 0
5205  const int align = 8; // g->target->getNativeVectorAlignment();
5206  alloca->setAlignment(align);
5207 #endif
5208  ci->replaceAllUsesWith(alloca);
5209  modifiedAny = true;
5210  }
5211  }
5212  }
5213 #endif
5214  return modifiedAny;
5215 }
5216 
5217 static llvm::Pass *CreatePromoteLocalToPrivatePass() { return new PromoteLocalToPrivatePass(); }
5218 
5219 #endif /* ISPC_NVPTX_ENABLED */
static llvm::Pass * CreateFixBooleanSelectPass()
Definition: opt.cpp:5139
static void lExtractConstOffsets(const std::vector< llvm::CallInst *> &coalesceGroup, int elementSize, std::vector< int64_t > *constOffsets)
Definition: opt.cpp:3713
static llvm::Type * FloatType
Definition: llvmutil.h:79
char str_output[100]
Definition: opt.cpp:4389
static llvm::Type * Int32VectorPointerType
Definition: llvmutil.h:102
const char * getPassName() const
Definition: opt.cpp:5003
const char * getPassName() const
Definition: opt.cpp:918
llvm::Value * LLVMShuffleVectors(llvm::Value *v1, llvm::Value *v2, int32_t shuf[], int shufSize, llvm::Instruction *insertBefore)
Definition: llvmutil.cpp:1502
Opt opt
Definition: ispc.h:535
llvm::Constant * LLVMInt64Vector(int64_t i)
Definition: llvmutil.cpp:368
llvm::Instruction * fixSelect(llvm::SelectInst *sel, llvm::SExtInst *sext)
Definition: opt.cpp:5015
static bool lIsSafeToBlend(llvm::Value *lvalue)
Definition: opt.cpp:4011
static bool lCoalesceGathers(const std::vector< llvm::CallInst *> &coalesceGroup)
Definition: opt.cpp:3740
Declaration of the FunctionEmitContext class
void PerformanceWarning(SourcePos p, const char *format,...) PRINTF_FUNC
Definition: util.cpp:394
bool hasVecPrefetch() const
Definition: ispc.h:296
static llvm::Type * DoubleType
Definition: llvmutil.h:80
static llvm::Value * lExtractFromInserts(llvm::Value *v, unsigned int index)
Definition: opt.cpp:1458
bool disableBlendedMaskedStores
Definition: ispc.h:480
static llvm::Value * lExtractOffsetVector248Scale(llvm::Value **vec)
Definition: opt.cpp:1808
static bool simplifyCall(llvm::CallInst *callInst, llvm::BasicBlock::iterator iter)
Definition: opt.cpp:1286
Module * m
Definition: ispc.cpp:102
void Optimize(llvm::Module *module, int optLevel)
Definition: opt.cpp:504
static llvm::Pass * CreateImproveMemoryOpsPass()
Definition: opt.cpp:2994
int first_line
Definition: ispc.h:133
bool runOnModule(llvm::Module &m)
Definition: opt.cpp:4435
Target * target
Definition: ispc.h:537
static llvm::Instruction * lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, const char *name, llvm::Instruction *insertBefore=NULL)
Definition: opt.cpp:292
static bool lVectorLoadIsEfficient(std::set< int64_t >::iterator iter, std::set< int64_t >::iterator end, std::set< int64_t >::iterator *newIter, int vectorWidth)
Definition: opt.cpp:3065
int getNativeVectorAlignment() const
Definition: ispc.h:268
static void lSelectLoads(const std::vector< int64_t > &loadOffsets, std::vector< CoalescedLoadOp > *loads)
Definition: opt.cpp:3159
static llvm::Constant * lGetConstantAddExprBaseOffset(llvm::Constant *op0, llvm::Constant *op1, llvm::Constant **delta)
Definition: opt.cpp:1443
static llvm::Value * lApplyLoad1(llvm::Value *result, const CoalescedLoadOp &load, const int64_t offsets[4], bool set[4], llvm::Instruction *insertBefore)
Definition: opt.cpp:3370
llvm::Value * lGEPAndLoad(llvm::Value *basePtr, int64_t offset, int align, llvm::Instruction *insertBefore, llvm::Type *type)
Definition: opt.cpp:3276
#define DEBUG_START_PASS(NAME)
Definition: opt.cpp:172
static char ID
Definition: opt.cpp:4596
static char ID
Definition: opt.cpp:4999
bool LLVMExtractVectorInts(llvm::Value *v, int64_t ret[], int *nElts)
Definition: llvmutil.cpp:690
llvm::Constant * LLVMInt32Vector(int32_t i)
Definition: llvmutil.cpp:308
static llvm::Value * lAssemble4Vector(const std::vector< CoalescedLoadOp > &loadOps, const int64_t offsets[4], llvm::Instruction *insertBefore)
Definition: opt.cpp:3489
static llvm::VectorType * Int32VectorType
Definition: llvmutil.h:95
Declarations related to optimization passes.
llvm::Value * element0
Definition: opt.cpp:3056
static llvm::Pass * CreateReplaceStdlibShiftPass()
Definition: opt.cpp:4969
std::vector< MaskInstruction > maskInstructions
Definition: opt.cpp:931
bool forceAlignedMemory
Definition: ispc.h:459
static void lCoalescePerfInfo(const std::vector< llvm::CallInst *> &coalesceGroup, const std::vector< CoalescedLoadOp > &loadOps)
Definition: opt.cpp:3209
static llvm::Type * FloatVectorPointerType
Definition: llvmutil.h:104
BlendInstruction * matchingBlendInstruction(llvm::Function *function)
Definition: opt.cpp:1172
static char ID
Definition: opt.cpp:3020
static bool lVectorIs32BitInts(llvm::Value *v)
Definition: opt.cpp:1967
static bool lGetSourcePosFromMetadata(const llvm::Instruction *inst, SourcePos *pos)
Definition: opt.cpp:241
IsCompileTimeConstantPass(bool last=false)
Definition: opt.cpp:4290
static bool lGetMask(llvm::Value *factor, uint64_t *mask)
Definition: opt.cpp:376
static llvm::Type * Int16VectorPointerType
Definition: llvmutil.h:101
bool runOnBasicBlock(llvm::BasicBlock &BB)
Definition: opt.cpp:1302
const char * getPassName() const
Definition: opt.cpp:4902
static bool lIsIntegerSplat(llvm::Value *v, int *splat)
Definition: opt.cpp:1748
MakeInternalFuncsStaticPass(bool last=false)
Definition: opt.cpp:4421
static llvm::Pass * CreateReplacePseudoMemoryOpsPass()
Definition: opt.cpp:4269
static llvm::Type * Int16Type
Definition: llvmutil.h:76
static llvm::Type * DoubleVectorPointerType
Definition: llvmutil.h:105
bool run(llvm::Module &M)
Definition: opt.cpp:454
llvm::Constant * LLVMFalse
Definition: llvmutil.cpp:91
static bool lInstructionMayWriteToMemory(llvm::Instruction *inst)
Definition: opt.cpp:3814
bool runOnBasicBlock(llvm::BasicBlock &BB)
Definition: opt.cpp:2953
static llvm::Pass * CreateIntrinsicsOptPass()
Definition: opt.cpp:1181
bool disableCoalescing
Definition: ispc.h:520
static llvm::Pass * CreatePeepholePass()
Definition: opt.cpp:4882
llvm::PassManager PM
Definition: opt.cpp:462
static bool lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst)
Definition: opt.cpp:2390
#define Assert(expr)
Definition: ispc.h:161
static llvm::VectorType * Int1VectorType
Definition: llvmutil.h:92
static llvm::Instruction * lGEPInst(llvm::Value *ptr, llvm::Value *offset, const char *name, llvm::Instruction *insertBefore)
Definition: opt.cpp:329
static bool lIsUndef(llvm::Value *value)
Definition: opt.cpp:970
static void lCopyMetadata(llvm::Value *vto, const llvm::Instruction *from)
Definition: opt.cpp:208
header file with declarations for symbol and symbol table classes.
std::set< int > debug_stages
Definition: ispc.h:567
bool disableMaskAllOnOptimizations
Definition: ispc.h:464
llvm::ConstantInt * LLVMInt32(int32_t i)
Definition: llvmutil.cpp:228
llvm::Module * module
Definition: module.h:155
bool matchesMaskInstruction(llvm::Function *function)
Definition: opt.cpp:1163
const char * getPassName() const
Definition: opt.cpp:4382
static llvm::Type * Int8VectorPointerType
Definition: llvmutil.h:100
Globals * g
Definition: ispc.cpp:101
Definition: opt.cpp:419
bool disableGatherScatterOptimizations
Definition: ispc.h:498
bool debugPrint
Definition: ispc.h:558
static llvm::VectorType * Int8VectorType
Definition: llvmutil.h:93
bool LLVMVectorValuesAllEqual(llvm::Value *v, llvm::Value **splat=NULL)
Definition: llvmutil.cpp:1061
static uint64_t lConstElementsToMask(const llvm::SmallVector< llvm::Constant *, ISPC_MAX_NVEC > &elements)
Definition: opt.cpp:344
llvm::Constant * LLVMTrue
Definition: llvmutil.cpp:90
static llvm::Pass * CreateInstructionSimplifyPass()
Definition: opt.cpp:1326
static llvm::Pass * CreateDebugPass(char *output)
Definition: opt.cpp:4401
static llvm::Value * simplifyBoolVec(llvm::Value *value)
Definition: opt.cpp:1213
static llvm::Pass * CreateIsCompileTimeConstantPass(bool isLastTry)
Definition: opt.cpp:4366
llvm::Value * element1
Definition: opt.cpp:3056
static llvm::VectorType * FloatVectorType
Definition: llvmutil.h:97
const char * getPassName() const
Definition: opt.cpp:4293
static bool lReplacePseudoGS(llvm::CallInst *callInst)
Definition: opt.cpp:4089
static bool lReplacePseudoMaskedStore(llvm::CallInst *callInst)
Definition: opt.cpp:4038
static llvm::Type * Int64Type
Definition: llvmutil.h:78
static llvm::Type * Int8Type
Definition: llvmutil.h:75
static llvm::VectorType * Int64VectorType
Definition: llvmutil.h:96
Header file with declarations for various LLVM utility stuff.
static bool simplifySelect(llvm::SelectInst *selectInst, llvm::BasicBlock::iterator iter)
Definition: opt.cpp:1251
static llvm::Value * lGetBasePointer(llvm::Value *v, llvm::Instruction *insertBefore, bool broadcastDetected)
Definition: opt.cpp:1403
static llvm::Value * lComputeBasePtr(llvm::CallInst *gatherInst, llvm::Instruction *insertBefore)
Definition: opt.cpp:3689
const char * getPassName() const
Definition: opt.cpp:4426
bool hasScatter() const
Definition: ispc.h:286
bool runOnModule(llvm::Module &m)
Definition: opt.cpp:4394
BlendInstruction(llvm::Function *f, uint64_t ao, int o0, int o1, int of)
Definition: opt.cpp:937
bool unrollLoops
Definition: ispc.h:439
DebugPass(char *output)
Definition: opt.cpp:4379
llvm::Value * LLVMExtractFirstVectorElement(llvm::Value *v)
Definition: llvmutil.cpp:1472
const char * getPassName() const
Definition: opt.cpp:3995
Representation of a range of positions in a source file.
Definition: ispc.h:129
void getAnalysisUsage(llvm::AnalysisUsage &AU) const
Definition: opt.cpp:4423
static char ID
Definition: opt.cpp:4378
static char ID
Definition: opt.cpp:924
static char ID
Definition: opt.cpp:1343
static std::vector< CoalescedLoadOp > lSplit8WideLoads(const std::vector< CoalescedLoadOp > &loadOps, llvm::Instruction *insertBefore)
Definition: opt.cpp:3345
const char * LLVMGetName(llvm::Value *v, const char *)
Definition: llvmutil.cpp:1518
static llvm::Pass * CreateGatherCoalescePass()
Definition: opt.cpp:3980
void LLVMDumpValue(llvm::Value *v)
Definition: llvmutil.cpp:1362
bool disableHandlePseudoMemoryOps
Definition: ispc.h:470
bool force32BitAddressing
Definition: ispc.h:445
static char ID
Definition: opt.cpp:1203
static llvm::Pass * CreateMakeInternalFuncsStaticPass()
Definition: opt.cpp:4580
bool hasGather() const
Definition: ispc.h:284
bool runOnFunction(llvm::Function &F)
Definition: opt.cpp:5061
void Warning(SourcePos p, const char *format,...) PRINTF_FUNC
Definition: util.cpp:375
static llvm::PointerType * VoidPointerType
Definition: llvmutil.h:71
int getVectorWidth() const
Definition: ispc.h:272
llvm::Value * load
Definition: opt.cpp:3052
llvm::Function * function
Definition: opt.cpp:940
#define FATAL(message)
Definition: util.h:112
const char * getPassName() const
Definition: opt.cpp:4590
bool runOnBasicBlock(llvm::BasicBlock &BB)
Definition: opt.cpp:3838
static llvm::Type * Int64VectorPointerType
Definition: llvmutil.h:103
static llvm::Type * Int32Type
Definition: llvmutil.h:77
const llvm::DataLayout * getDataLayout() const
Definition: ispc.h:251
#define PTYPE(p)
Definition: llvmutil.h:55
#define ISPC_MAX_NVEC
Definition: ispc.h:69
bool runOnBasicBlock(llvm::BasicBlock &BB)
Definition: opt.cpp:4304
static bool lOffsets32BitSafe(llvm::Value **variableOffsetPtr, llvm::Value **constOffsetPtr, llvm::Instruction *insertBefore)
Definition: opt.cpp:1983
static char ID
Definition: opt.cpp:4898
#define DEBUG_END_PASS(NAME)
Definition: opt.cpp:182
bool runOnBasicBlock(llvm::BasicBlock &BB)
Definition: opt.cpp:981
void add(llvm::Pass *P, int stage)
Definition: opt.cpp:469
ISA getISA() const
Definition: ispc.h:256
llvm::Value * LLVMFlattenInsertChain(llvm::Value *inst, int vectorWidth, bool compare=true, bool undef=true, bool searchFirstUndef=false)
Definition: llvmutil.cpp:565
static bool lGSToGSBaseOffsets(llvm::CallInst *callInst)
Definition: opt.cpp:2074
const char * getPassName() const
Definition: opt.cpp:1347
IntrinsicsOpt()
Definition: opt.cpp:915
CoalescedLoadOp(int64_t s, int c)
Definition: opt.cpp:3037
int64_t start
Definition: opt.cpp:3046
bool runOnBasicBlock(llvm::BasicBlock &BB)
Definition: opt.cpp:4917
static void lEmitLoads(llvm::Value *basePtr, std::vector< CoalescedLoadOp > &loadOps, int elementSize, llvm::Instruction *insertBefore)
Definition: opt.cpp:3287
static llvm::VectorType * DoubleVectorType
Definition: llvmutil.h:98
static void lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset, llvm::Value **variableOffset, llvm::Instruction *insertBefore)
Definition: opt.cpp:1619
MaskStatus
Definition: opt.cpp:419
bool runOnBasicBlock(llvm::BasicBlock &BB)
Definition: opt.cpp:4244
std::set< int > off_stages
Definition: ispc.h:574
llvm::Constant * LLVMIntAsType(int64_t, llvm::Type *t)
Definition: llvmutil.cpp:441
static llvm::Value * lCheckForActualPointer(llvm::Value *v)
Definition: opt.cpp:1365
PeepholePass()
Definition: opt.cpp:4601
bool runOnBasicBlock(llvm::BasicBlock &BB)
Definition: opt.cpp:4843
static llvm::VectorType * Int16VectorType
Definition: llvmutil.h:94
static llvm::Constant * lGetOffsetScaleVec(llvm::Value *offsetScale, llvm::Type *vecType)
Definition: opt.cpp:2552
llvm::Value * LLVMConcatVectors(llvm::Value *v1, llvm::Value *v2, llvm::Instruction *insertBefore)
Definition: llvmutil.cpp:1483
static llvm::Value * lExtract248Scale(llvm::Value *splatOperand, int splatValue, llvm::Value *otherOperand, llvm::Value **result)
Definition: opt.cpp:1766
bool LLVMVectorIsLinear(llvm::Value *v, int stride)
Definition: llvmutil.cpp:1326
static llvm::Value * lApplyLoad2(llvm::Value *result, const CoalescedLoadOp &load, const int64_t offsets[4], bool set[4], llvm::Instruction *insertBefore)
Definition: opt.cpp:3391
#define PRId64
Definition: opt.cpp:143
llvm::PassManager & getPM()
Definition: opt.cpp:456
static void lAssembleResultVectors(const std::vector< CoalescedLoadOp > &loadOps, const std::vector< int64_t > &constOffsets, std::vector< llvm::Value *> &results, llvm::Instruction *insertBefore)
Definition: opt.cpp:3646
bool is32Bit() const
Definition: ispc.h:262
static llvm::Value * lGetBasePtrAndOffsets(llvm::Value *ptrs, llvm::Value **offsets, llvm::Instruction *insertBefore)
Definition: opt.cpp:1475
Declaration of the Module class, which is the ispc-side representation of the results of compiling a ...
llvm::ConstantInt * LLVMInt64(int64_t i)
Definition: llvmutil.cpp:236
llvm::LLVMContext * ctx
Definition: ispc.h:632
const char * getPassName() const
Definition: opt.cpp:3024
static bool lImproveMaskedLoad(llvm::CallInst *callInst, llvm::BasicBlock::iterator iter)
Definition: opt.cpp:2903
static bool lIs32BitSafeHelper(llvm::Value *v)
Definition: opt.cpp:2033
DebugPassManager()
Definition: opt.cpp:452
static bool lGSToLoadStore(llvm::CallInst *callInst)
Definition: opt.cpp:2584
#define LAST_OPT_NUMBER
Definition: ispc.h:72
MaskInstruction(llvm::Function *f)
Definition: opt.cpp:928
static MaskStatus lGetMaskStatus(llvm::Value *mask, int vecWidth=-1)
Definition: opt.cpp:424
const char * getPassName() const
Definition:
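The cross references above describe a small pass-scheduling API that opt.cpp uses internally: each CreateXxxPass() factory returns an llvm::Pass, and the file-local DebugPassManager (add(), run(), getPM()) wraps an llvm::PassManager, consulting the debug_stages and off_stages sets to dump or skip individual stages. The sketch below is illustrative only: it shows how these documented pieces fit together, but the helper name lRunIspcPassesSketch, the stage numbers, and the pass ordering are assumptions and do not reproduce the actual optimization pipeline in opt.cpp.

// Illustrative sketch (not the real optimization pipeline): schedules a few of
// the factory-created passes through the file-local DebugPassManager declared
// in opt.cpp.  Stage numbers and ordering are placeholders.
static void lRunIspcPassesSketch(llvm::Module *m) {
    DebugPassManager optPM;                                 // wraps llvm::PassManager PM
    optPM.add(CreateIntrinsicsOptPass(), 100);              // opt.cpp:1181
    optPM.add(CreateInstructionSimplifyPass(), 110);        // opt.cpp:1326
    optPM.add(CreateGatherCoalescePass(), 120);             // opt.cpp:3980
    optPM.add(CreateReplacePseudoMemoryOpsPass(), 130);     // opt.cpp:4269
    optPM.add(CreateIsCompileTimeConstantPass(true), 140);  // "last try" variant, opt.cpp:4366
    optPM.add(CreateMakeInternalFuncsStaticPass(), 150);    // opt.cpp:4580
    optPM.add(CreatePeepholePass(), 160);                   // opt.cpp:4882
    optPM.run(*m);                                          // runs the underlying PassManager
}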