
//===- llvm/Support/Parallel.h - Parallel algorithms ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_SUPPORT_PARALLEL_H
#define LLVM_SUPPORT_PARALLEL_H

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Threading.h"

#include <algorithm>
#include <condition_variable>
#include <functional>
#include <mutex>
#include <vector>

namespace llvm {

namespace parallel {

// Strategy for the default executor used by the parallel routines provided by
// this file. It defaults to using all hardware threads and should be
// initialized before the first use of parallel routines.
extern ThreadPoolStrategy strategy;
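//
// For example, a tool might cap the default executor before issuing any
// parallel work (an illustrative sketch; hardware_concurrency() is declared
// in llvm/Support/Threading.h):
//
//   llvm::parallel::strategy = llvm::hardware_concurrency(4);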

#if LLVM_ENABLE_THREADS
#ifdef _WIN32
// Direct access to thread_local variables from a different DLL isn't
// possible with Windows Native TLS.
unsigned getThreadIndex();
#else
// Don't access this directly; use the getThreadIndex wrapper.
extern thread_local unsigned threadIndex;

inline unsigned getThreadIndex() { return threadIndex; }
#endif
#else
inline unsigned getThreadIndex() { return 0; }
#endif
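
// getThreadIndex() returns a small dense index identifying the calling worker
// thread, which makes it handy for indexing per-thread scratch storage. An
// illustrative sketch (Scratch, NumWorkers, and cost are hypothetical names,
// not part of this header):
//
//   std::vector<uint64_t> Scratch(NumWorkers);
//   parallelFor(0, N, [&](size_t I) {
//     Scratch[parallel::getThreadIndex()] += cost(I);
//   });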

namespace detail {
class Latch {
  uint32_t Count;
  mutable std::mutex Mutex;
  mutable std::condition_variable Cond;

public:
  explicit Latch(uint32_t Count = 0) : Count(Count) {}
  ~Latch() {
    // Ensure at least that sync() was called before destruction.
    assert(Count == 0);
  }

  void inc() {
    std::lock_guard<std::mutex> lock(Mutex);
    ++Count;
  }

  void dec() {
    std::lock_guard<std::mutex> lock(Mutex);
    if (--Count == 0)
      Cond.notify_all();
  }

  void sync() const {
    std::unique_lock<std::mutex> lock(Mutex);
    Cond.wait(lock, [&] { return Count == 0; });
  }
};
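
// Typical Latch pattern (an illustrative sketch; Executor and doWork are
// hypothetical): inc() before scheduling a task, dec() when it completes,
// and sync() to block until the count drops back to zero.
//
//   detail::Latch L;
//   L.inc();
//   Executor->add([&] { doWork(); L.dec(); });
//   L.sync(); // Returns once doWork() has finished.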
} // namespace detail

class TaskGroup {
  detail::Latch L;
  bool Parallel;

public:
  TaskGroup();
  ~TaskGroup();

  // Spawns a task but does not wait for it to finish.
  void spawn(std::function<void()> f);

  // Similar to spawn, but executes the task immediately when
  // ThreadsRequested == 1. This gives the following pattern a more intuitive
  // execution order when single threading is requested:
  //
  // for (size_t begin = 0, i = 0, taskSize = 0;;) {
  //   taskSize += ...
  //   bool done = ++i == end;
  //   if (done || taskSize >= taskSizeLimit) {
  //     tg.execute([=] { fn(begin, i); });
  //     if (done)
  //       break;
  //     begin = i;
  //     taskSize = 0;
  //   }
  // }
  void execute(std::function<void()> f);

  void sync() const { L.sync(); }
};
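
// Example use of TaskGroup (an illustrative sketch; the two helpers are
// hypothetical): run two independent pieces of work concurrently, then wait
// for both.
//
//   parallel::TaskGroup TG;
//   TG.spawn([] { parseInputFiles(); });
//   TG.spawn([] { writeMapFile(); });
//   TG.sync(); // Blocks until both tasks have finished.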

namespace detail {

#if LLVM_ENABLE_THREADS
const ptrdiff_t MinParallelSize = 1024;

/// Inclusive median.
template <class RandomAccessIterator, class Comparator>
RandomAccessIterator medianOf3(RandomAccessIterator Start,
                               RandomAccessIterator End,
                               const Comparator &Comp) {
  RandomAccessIterator Mid = Start + (std::distance(Start, End) / 2);
  return Comp(*Start, *(End - 1))
             ? (Comp(*Mid, *(End - 1)) ? (Comp(*Start, *Mid) ? Mid : Start)
                                       : End - 1)
             : (Comp(*Mid, *Start) ? (Comp(*(End - 1), *Mid) ? Mid : End - 1)
                                   : Start);
}
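
// For example, with std::less and the sampled values {3, 9, 1} at Start, Mid,
// and End - 1: 3 < 1 fails and 9 < 3 fails, so Start is returned; 3 is indeed
// the median of the three samples.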

template <class RandomAccessIterator, class Comparator>
void parallel_quick_sort(RandomAccessIterator Start, RandomAccessIterator End,
                         const Comparator &Comp, TaskGroup &TG, size_t Depth) {
  // Do a sequential sort for small inputs.
  if (std::distance(Start, End) < detail::MinParallelSize || Depth == 0) {
    llvm::sort(Start, End, Comp);
    return;
  }

  // Partition.
  auto Pivot = medianOf3(Start, End, Comp);
  // Move Pivot to End.
  std::swap(*(End - 1), *Pivot);
  Pivot = std::partition(Start, End - 1, [&Comp, End](decltype(*Start) V) {
    return Comp(V, *(End - 1));
  });
  // Move Pivot to middle of partition.
  std::swap(*Pivot, *(End - 1));

  // Recurse.
  TG.spawn([=, &Comp, &TG] {
    parallel_quick_sort(Start, Pivot, Comp, TG, Depth - 1);
  });
  parallel_quick_sort(Pivot + 1, End, Comp, TG, Depth - 1);
}

template <class RandomAccessIterator, class Comparator>
void parallel_sort(RandomAccessIterator Start, RandomAccessIterator End,
                   const Comparator &Comp) {
  TaskGroup TG;
  parallel_quick_sort(Start, End, Comp, TG,
                      llvm::Log2_64(std::distance(Start, End)) + 1);
}
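
// The depth limit keeps the recursion bounded even for adversarial inputs:
// for N = 1'000'000 elements, Log2_64(N) + 1 == 20, so after at most 20
// levels of partitioning any remaining range is handed to the sequential
// llvm::sort fallback above.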

// TaskGroup has a relatively high overhead, so we want to reduce
// the number of spawn() calls. We'll create up to 1024 tasks here.
// (Note that 1024 is an arbitrary number. This code probably needs
// improving to take the number of available cores into account.)
enum { MaxTasksPerGroup = 1024 };

template <class IterTy, class ResultTy, class ReduceFuncTy,
          class TransformFuncTy>
ResultTy parallel_transform_reduce(IterTy Begin, IterTy End, ResultTy Init,
                                   ReduceFuncTy Reduce,
                                   TransformFuncTy Transform) {
  // Limit the number of tasks to MaxTasksPerGroup to bound job-scheduling
  // overhead on large inputs.
  size_t NumInputs = std::distance(Begin, End);
  if (NumInputs == 0)
    return std::move(Init);
  size_t NumTasks = std::min(static_cast<size_t>(MaxTasksPerGroup), NumInputs);
  std::vector<ResultTy> Results(NumTasks, Init);
  {
    // Each task processes either TaskSize or TaskSize + 1 inputs. Any inputs
    // left over after dividing them equally amongst the tasks are distributed
    // one extra input each to the first RemainingInputs tasks.
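    // For example: NumInputs = 10 and NumTasks = 4 gives TaskSize = 2 and
    // RemainingInputs = 2, so the four tasks cover 3, 3, 2, and 2 inputs
    // respectively.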
    TaskGroup TG;
    size_t TaskSize = NumInputs / NumTasks;
    size_t RemainingInputs = NumInputs % NumTasks;
    IterTy TBegin = Begin;
    for (size_t TaskId = 0; TaskId < NumTasks; ++TaskId) {
      IterTy TEnd = TBegin + TaskSize + (TaskId < RemainingInputs ? 1 : 0);
      TG.spawn([=, &Transform, &Reduce, &Results] {
        // Reduce the result of transformation eagerly within each task.
        ResultTy R = Init;
        for (IterTy It = TBegin; It != TEnd; ++It)
          R = Reduce(R, Transform(*It));
        Results[TaskId] = R;
      });
      TBegin = TEnd;
    }
    assert(TBegin == End);
  }

  // Do a final reduction. There are at most 1024 tasks, so this only adds
  // constant single-threaded overhead for large inputs. Hopefully most
  // reductions are cheaper than the transformation.
  ResultTy FinalResult = std::move(Results.front());
  for (ResultTy &PartialResult :
       MutableArrayRef(Results.data() + 1, Results.size() - 1))
    FinalResult = Reduce(FinalResult, std::move(PartialResult));
  return std::move(FinalResult);
}

#endif

} // namespace detail
} // namespace parallel

template <class RandomAccessIterator,
          class Comparator = std::less<
              typename std::iterator_traits<RandomAccessIterator>::value_type>>
void parallelSort(RandomAccessIterator Start, RandomAccessIterator End,
                  const Comparator &Comp = Comparator()) {
#if LLVM_ENABLE_THREADS
  if (parallel::strategy.ThreadsRequested != 1) {
    parallel::detail::parallel_sort(Start, End, Comp);
    return;
  }
#endif
  llvm::sort(Start, End, Comp);
}
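
// Example (an illustrative sketch; getValues is hypothetical):
//
//   std::vector<int> V = getValues();
//   parallelSort(V.begin(), V.end());                    // ascending order
//   parallelSort(V.begin(), V.end(), std::greater<>()); // descending order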

void parallelFor(size_t Begin, size_t End, function_ref<void(size_t)> Fn);

template <class IterTy, class FuncTy>
void parallelForEach(IterTy Begin, IterTy End, FuncTy Fn) {
  parallelFor(0, End - Begin, [&](size_t I) { Fn(Begin[I]); });
}
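
// Example (an illustrative sketch): increment every element in parallel. The
// callback receives a reference to each element, so mutating it is fine as
// long as iterations don't touch shared state.
//
//   std::vector<int> V(1000, 0);
//   parallelForEach(V.begin(), V.end(), [](int &X) { ++X; });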

template <class IterTy, class ResultTy, class ReduceFuncTy,
          class TransformFuncTy>
ResultTy parallelTransformReduce(IterTy Begin, IterTy End, ResultTy Init,
                                 ReduceFuncTy Reduce,
                                 TransformFuncTy Transform) {
#if LLVM_ENABLE_THREADS
  if (parallel::strategy.ThreadsRequested != 1) {
    return parallel::detail::parallel_transform_reduce(Begin, End, Init, Reduce,
                                                       Transform);
  }
#endif
  for (IterTy I = Begin; I != End; ++I)
    Init = Reduce(std::move(Init), Transform(*I));
  return std::move(Init);
}
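
// Example (an illustrative sketch): total length of a list of strings. The
// reduction should be associative, since partial results are combined in a
// grouping that depends on how the inputs were split across tasks.
//
//   std::vector<std::string> Strings = ...;
//   size_t Total = parallelTransformReduce(
//       Strings.begin(), Strings.end(), size_t(0),
//       [](size_t A, size_t B) { return A + B; },
//       [](const std::string &S) { return S.size(); });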

// Range wrappers.
template <class RangeTy,
          class Comparator = std::less<decltype(*std::begin(RangeTy()))>>
void parallelSort(RangeTy &&R, const Comparator &Comp = Comparator()) {
  parallelSort(std::begin(R), std::end(R), Comp);
}

template <class RangeTy, class FuncTy>
void parallelForEach(RangeTy &&R, FuncTy Fn) {
  parallelForEach(std::begin(R), std::end(R), Fn);
}

template <class RangeTy, class ResultTy, class ReduceFuncTy,
          class TransformFuncTy>
ResultTy parallelTransformReduce(RangeTy &&R, ResultTy Init,
                                 ReduceFuncTy Reduce,
                                 TransformFuncTy Transform) {
  return parallelTransformReduce(std::begin(R), std::end(R), Init, Reduce,
                                 Transform);
}

// Parallel for-each, but with error handling.
template <class RangeTy, class FuncTy>
Error parallelForEachError(RangeTy &&R, FuncTy Fn) {
  // The transform_reduce algorithm requires that the initial value be
  // copyable. Error objects are uncopyable. We only need to copy initial
  // success values, so work around this mismatch via the C API. The C API
  // represents success values with a null pointer, and joinErrors discards
  // null values and joins multiple errors into an ErrorList.
  return unwrap(parallelTransformReduce(
      std::begin(R), std::end(R), wrap(Error::success()),
      [](LLVMErrorRef Lhs, LLVMErrorRef Rhs) {
        return wrap(joinErrors(unwrap(Lhs), unwrap(Rhs)));
      },
      [&Fn](auto &&V) { return wrap(Fn(V)); }));
}
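
// Example (an illustrative sketch; Input and its parse() are hypothetical):
// fail the loop as a whole if any element fails.
//
//   Error E = parallelForEachError(Inputs, [](Input &In) -> Error {
//     if (!In.parse())
//       return createStringError(inconvertibleErrorCode(),
//                                "cannot parse input");
//     return Error::success();
//   });
//   if (E)
//     ... // At least one task failed; E may be an ErrorList.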

} // namespace llvm

#endif // LLVM_SUPPORT_PARALLEL_H