Where Online Learning is simpler!

The C and C++ Include Header Files

/usr/include/c++/13/pstl/parallel_backend_tbb.h

$ cat -n /usr/include/c++/13/pstl/parallel_backend_tbb.h

     1	// -*- C++ -*-
     2	//===-- parallel_backend_tbb.h --------------------------------------------===//
     3	//
     4	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
     5	// See https://llvm.org/LICENSE.txt for license information.
     6	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
     7	//
     8	//===----------------------------------------------------------------------===//
     9	
    10	#ifndef _PSTL_PARALLEL_BACKEND_TBB_H
    11	#define _PSTL_PARALLEL_BACKEND_TBB_H
    12	
    13	#include <algorithm>
    14	#include <type_traits>
    15	
    16	#include "parallel_backend_utils.h"
    17	
    18	// Bring in minimal required subset of Intel TBB
    19	#include <tbb/blocked_range.h>
    20	#include <tbb/parallel_for.h>
    21	#include <tbb/parallel_reduce.h>
    22	#include <tbb/parallel_scan.h>
    23	#include <tbb/parallel_invoke.h>
    24	#include <tbb/task_arena.h>
    25	#include <tbb/tbb_allocator.h>
    26	#include <tbb/task.h>
    27	
    28	#if TBB_INTERFACE_VERSION < 10000
    29	#    error Intel(R) Threading Building Blocks 2018 is required; older versions are not supported.
    30	#endif
    31	
    32	namespace __pstl
    33	{
    34	namespace __tbb_backend
    35	{
    36	
    37	//! Raw memory buffer with automatic freeing and no exceptions.
    38	/** Some of our algorithms need to start with raw memory buffer,
    39	not an initialize array, because initialization/destruction
    40	would make the span be at least O(N). */
    41	// tbb::allocator can improve performance in some cases.
    42	template <typename _Tp>
    43	class __buffer
    44	{
    45	    tbb::tbb_allocator<_Tp> _M_allocator;
    46	    _Tp* _M_ptr;
    47	    const std::size_t _M_buf_size;
    48	    __buffer(const __buffer&) = delete;
    49	    void
    50	    operator=(const __buffer&) = delete;
    51	
    52	  public:
    53	    //! Try to obtain buffer of given size to store objects of _Tp type
    54	    __buffer(std::size_t n) : _M_allocator(), _M_ptr(_M_allocator.allocate(n)), _M_buf_size(n) {}
    55	    //! True if buffer was successfully obtained, zero otherwise.
    56	    operator bool() const { return _M_ptr != NULL; }
    57	    //! Return pointer to buffer, or  NULL if buffer could not be obtained.
    58	    _Tp*
    59	    get() const
    60	    {
    61	        return _M_ptr;
    62	    }
    63	    //! Destroy buffer
    64	    ~__buffer() { _M_allocator.deallocate(_M_ptr, _M_buf_size); }
    65	};
    66	
    67	// Wrapper for tbb::task
    68	inline void
    69	__cancel_execution()
    70	{
    71	#if TBB_INTERFACE_VERSION <= 12000
    72	    tbb::task::self().group()->cancel_group_execution();
    73	#else
    74	    tbb::task::current_context()->cancel_group_execution();
    75	#endif
    76	}
    77	
    78	//------------------------------------------------------------------------
    79	// parallel_for
    80	//------------------------------------------------------------------------
    81	
    82	template <class _Index, class _RealBody>
    83	class __parallel_for_body
    84	{
    85	  public:
    86	    __parallel_for_body(const _RealBody& __body) : _M_body(__body) {}
    87	    __parallel_for_body(const __parallel_for_body& __body) : _M_body(__body._M_body) {}
    88	    void
    89	    operator()(const tbb::blocked_range<_Index>& __range) const
    90	    {
    91	        _M_body(__range.begin(), __range.end());
    92	    }
    93	
    94	  private:
    95	    _RealBody _M_body;
    96	};
    97	
    98	//! Evaluation of brick f[i,j) for each subrange [i,j) of [first,last)
    99	// wrapper over tbb::parallel_for
   100	template <class _ExecutionPolicy, class _Index, class _Fp>
   101	void
   102	__parallel_for(_ExecutionPolicy&&, _Index __first, _Index __last, _Fp __f)
   103	{
   104	    tbb::this_task_arena::isolate([=]() {
   105	        tbb::parallel_for(tbb::blocked_range<_Index>(__first, __last), __parallel_for_body<_Index, _Fp>(__f));
   106	    });
   107	}
   108	
   109	//! Evaluation of brick f[i,j) for each subrange [i,j) of [first,last)
   110	// wrapper over tbb::parallel_reduce
   111	template <class _ExecutionPolicy, class _Value, class _Index, typename _RealBody, typename _Reduction>
   112	_Value
   113	__parallel_reduce(_ExecutionPolicy&&, _Index __first, _Index __last, const _Value& __identity,
   114	                  const _RealBody& __real_body, const _Reduction& __reduction)
   115	{
   116	    return tbb::this_task_arena::isolate([__first, __last, &__identity, &__real_body, &__reduction]() -> _Value {
   117	        return tbb::parallel_reduce(
   118	            tbb::blocked_range<_Index>(__first, __last), __identity,
   119	            [__real_body](const tbb::blocked_range<_Index>& __r, const _Value& __value) -> _Value {
   120	                return __real_body(__r.begin(), __r.end(), __value);
   121	            },
   122	            __reduction);
   123	    });
   124	}
   125	
   126	//------------------------------------------------------------------------
   127	// parallel_transform_reduce
   128	//
   129	// Notation:
   130	//      r(i,j,init) returns reduction of init with reduction over [i,j)
   131	//      u(i) returns f(i,i+1,identity) for a hypothetical left identity element of r
   132	//      c(x,y) combines values x and y that were the result of r or u
   133	//------------------------------------------------------------------------
   134	
   135	template <class _Index, class _Up, class _Tp, class _Cp, class _Rp>
   136	struct __par_trans_red_body
   137	{
   138	    alignas(_Tp) char _M_sum_storage[sizeof(_Tp)]; // Holds generalized non-commutative sum when has_sum==true
   139	    _Rp _M_brick_reduce;                           // Most likely to have non-empty layout
   140	    _Up _M_u;
   141	    _Cp _M_combine;
   142	    bool _M_has_sum; // Put last to minimize size of class
   143	    _Tp&
   144	    sum()
   145	    {
   146	        _PSTL_ASSERT_MSG(_M_has_sum, "sum expected");
   147	        return *(_Tp*)_M_sum_storage;
   148	    }
   149	    __par_trans_red_body(_Up __u, _Tp __init, _Cp __c, _Rp __r)
   150	        : _M_brick_reduce(__r), _M_u(__u), _M_combine(__c), _M_has_sum(true)
   151	    {
   152	        new (_M_sum_storage) _Tp(__init);
   153	    }
   154	
   155	    __par_trans_red_body(__par_trans_red_body& __left, tbb::split)
   156	        : _M_brick_reduce(__left._M_brick_reduce), _M_u(__left._M_u), _M_combine(__left._M_combine), _M_has_sum(false)
   157	    {
   158	    }
   159	
   160	    ~__par_trans_red_body()
   161	    {
   162	        // 17.6.5.12 tells us to not worry about catching exceptions from destructors.
   163	        if (_M_has_sum)
   164	            sum().~_Tp();
   165	    }
   166	
   167	    void
   168	    join(__par_trans_red_body& __rhs)
   169	    {
   170	        sum() = _M_combine(sum(), __rhs.sum());
   171	    }
   172	
   173	    void
   174	    operator()(const tbb::blocked_range<_Index>& __range)
   175	    {
   176	        _Index __i = __range.begin();
   177	        _Index __j = __range.end();
   178	        if (!_M_has_sum)
   179	        {
   180	            _PSTL_ASSERT_MSG(__range.size() > 1, "there should be at least 2 elements");
   181	            new (&_M_sum_storage)
   182	                _Tp(_M_combine(_M_u(__i), _M_u(__i + 1))); // The condition i+1 < j is provided by the grain size of 3
   183	            _M_has_sum = true;
   184	            std::advance(__i, 2);
   185	            if (__i == __j)
   186	                return;
   187	        }
   188	        sum() = _M_brick_reduce(__i, __j, sum());
   189	    }
   190	};
   191	
   192	template <class _ExecutionPolicy, class _Index, class _Up, class _Tp, class _Cp, class _Rp>
   193	_Tp
   194	__parallel_transform_reduce(_ExecutionPolicy&&, _Index __first, _Index __last, _Up __u, _Tp __init, _Cp __combine,
   195	                            _Rp __brick_reduce)
   196	{
   197	    __tbb_backend::__par_trans_red_body<_Index, _Up, _Tp, _Cp, _Rp> __body(__u, __init, __combine, __brick_reduce);
   198	    // The grain size of 3 is used in order to provide mininum 2 elements for each body
   199	    tbb::this_task_arena::isolate(
   200	        [__first, __last, &__body]() { tbb::parallel_reduce(tbb::blocked_range<_Index>(__first, __last, 3), __body); });
   201	    return __body.sum();
   202	}
   203	
   204	//------------------------------------------------------------------------
   205	// parallel_scan
   206	//------------------------------------------------------------------------
   207	
   208	template <class _Index, class _Up, class _Tp, class _Cp, class _Rp, class _Sp>
   209	class __trans_scan_body
   210	{
   211	    alignas(_Tp) char _M_sum_storage[sizeof(_Tp)]; // Holds generalized non-commutative sum when has_sum==true
   212	    _Rp _M_brick_reduce;                           // Most likely to have non-empty layout
   213	    _Up _M_u;
   214	    _Cp _M_combine;
   215	    _Sp _M_scan;
   216	    bool _M_has_sum; // Put last to minimize size of class
   217	  public:
   218	    __trans_scan_body(_Up __u, _Tp __init, _Cp __combine, _Rp __reduce, _Sp __scan)
   219	        : _M_brick_reduce(__reduce), _M_u(__u), _M_combine(__combine), _M_scan(__scan), _M_has_sum(true)
   220	    {
   221	        new (_M_sum_storage) _Tp(__init);
   222	    }
   223	
   224	    __trans_scan_body(__trans_scan_body& __b, tbb::split)
   225	        : _M_brick_reduce(__b._M_brick_reduce), _M_u(__b._M_u), _M_combine(__b._M_combine), _M_scan(__b._M_scan),
   226	          _M_has_sum(false)
   227	    {
   228	    }
   229	
   230	    ~__trans_scan_body()
   231	    {
   232	        // 17.6.5.12 tells us to not worry about catching exceptions from destructors.
   233	        if (_M_has_sum)
   234	            sum().~_Tp();
   235	    }
   236	
   237	    _Tp&
   238	    sum() const
   239	    {
   240	        _PSTL_ASSERT_MSG(_M_has_sum, "sum expected");
   241	        return *const_cast<_Tp*>(reinterpret_cast<_Tp const*>(_M_sum_storage));
   242	    }
   243	
   244	    void
   245	    operator()(const tbb::blocked_range<_Index>& __range, tbb::pre_scan_tag)
   246	    {
   247	        _Index __i = __range.begin();
   248	        _Index __j = __range.end();
   249	        if (!_M_has_sum)
   250	        {
   251	            new (&_M_sum_storage) _Tp(_M_u(__i));
   252	            _M_has_sum = true;
   253	            ++__i;
   254	            if (__i == __j)
   255	                return;
   256	        }
   257	        sum() = _M_brick_reduce(__i, __j, sum());
   258	    }
   259	
   260	    void
   261	    operator()(const tbb::blocked_range<_Index>& __range, tbb::final_scan_tag)
   262	    {
   263	        sum() = _M_scan(__range.begin(), __range.end(), sum());
   264	    }
   265	
   266	    void
   267	    reverse_join(__trans_scan_body& __a)
   268	    {
   269	        if (_M_has_sum)
   270	        {
   271	            sum() = _M_combine(__a.sum(), sum());
   272	        }
   273	        else
   274	        {
   275	            new (&_M_sum_storage) _Tp(__a.sum());
   276	            _M_has_sum = true;
   277	        }
   278	    }
   279	
   280	    void
   281	    assign(__trans_scan_body& __b)
   282	    {
   283	        sum() = __b.sum();
   284	    }
   285	};
   286	
   287	template <typename _Index>
   288	_Index
   289	__split(_Index __m)
   290	{
   291	    _Index __k = 1;
   292	    while (2 * __k < __m)
   293	        __k *= 2;
   294	    return __k;
   295	}
   296	
   297	//------------------------------------------------------------------------
   298	// __parallel_strict_scan
   299	//------------------------------------------------------------------------
   300	
   301	template <typename _Index, typename _Tp, typename _Rp, typename _Cp>
   302	void
   303	__upsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Rp __reduce, _Cp __combine)
   304	{
   305	    if (__m == 1)
   306	        __r[0] = __reduce(__i * __tilesize, __lastsize);
   307	    else
   308	    {
   309	        _Index __k = __split(__m);
   310	        tbb::parallel_invoke(
   311	            [=] { __tbb_backend::__upsweep(__i, __k, __tilesize, __r, __tilesize, __reduce, __combine); },
   312	            [=] {
   313	                __tbb_backend::__upsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize, __reduce, __combine);
   314	            });
   315	        if (__m == 2 * __k)
   316	            __r[__m - 1] = __combine(__r[__k - 1], __r[__m - 1]);
   317	    }
   318	}
   319	
   320	template <typename _Index, typename _Tp, typename _Cp, typename _Sp>
   321	void
   322	__downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Tp __initial, _Cp __combine,
   323	            _Sp __scan)
   324	{
   325	    if (__m == 1)
   326	        __scan(__i * __tilesize, __lastsize, __initial);
   327	    else
   328	    {
   329	        const _Index __k = __split(__m);
   330	        tbb::parallel_invoke(
   331	            [=] { __tbb_backend::__downsweep(__i, __k, __tilesize, __r, __tilesize, __initial, __combine, __scan); },
   332	            // Assumes that __combine never throws.
   333	            //TODO: Consider adding a requirement for user functors to be constant.
   334	            [=, &__combine] {
   335	                __tbb_backend::__downsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize,
   336	                                           __combine(__initial, __r[__k - 1]), __combine, __scan);
   337	            });
   338	    }
   339	}
   340	
   341	// Adapted from Intel(R) Cilk(TM) version from cilkpub.
   342	// Let i:len denote a counted interval of length n starting at i.  s denotes a generalized-sum value.
   343	// Expected actions of the functors are:
   344	//     reduce(i,len) -> s  -- return reduction value of i:len.
   345	//     combine(s1,s2) -> s -- return merged sum
   346	//     apex(s) -- do any processing necessary between reduce and scan.
   347	//     scan(i,len,initial) -- perform scan over i:len starting with initial.
   348	// The initial range 0:n is partitioned into consecutive subranges.
   349	// reduce and scan are each called exactly once per subrange.
   350	// Thus callers can rely upon side effects in reduce.
   351	// combine must not throw an exception.
   352	// apex is called exactly once, after all calls to reduce and before all calls to scan.
   353	// For example, it's useful for allocating a __buffer used by scan but whose size is the sum of all reduction values.
   354	// T must have a trivial constructor and destructor.
   355	template <class _ExecutionPolicy, typename _Index, typename _Tp, typename _Rp, typename _Cp, typename _Sp, typename _Ap>
   356	void
   357	__parallel_strict_scan(_ExecutionPolicy&&, _Index __n, _Tp __initial, _Rp __reduce, _Cp __combine, _Sp __scan,
   358	                       _Ap __apex)
   359	{
   360	    tbb::this_task_arena::isolate([=, &__combine]() {
   361	        if (__n > 1)
   362	        {
   363	            _Index __p = tbb::this_task_arena::max_concurrency();
   364	            const _Index __slack = 4;
   365	            _Index __tilesize = (__n - 1) / (__slack * __p) + 1;
   366	            _Index __m = (__n - 1) / __tilesize;
   367	            __buffer<_Tp> __buf(__m + 1);
   368	            _Tp* __r = __buf.get();
   369	            __tbb_backend::__upsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __reduce,
   370	                                     __combine);
   371	
   372	            // When __apex is a no-op and __combine has no side effects, a good optimizer
   373	            // should be able to eliminate all code between here and __apex.
   374	            // Alternatively, provide a default value for __apex that can be
   375	            // recognized by metaprogramming that conditionlly executes the following.
   376	            size_t __k = __m + 1;
   377	            _Tp __t = __r[__k - 1];
   378	            while ((__k &= __k - 1))
   379	                __t = __combine(__r[__k - 1], __t);
   380	            __apex(__combine(__initial, __t));
   381	            __tbb_backend::__downsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __initial,
   382	                                       __combine, __scan);
   383	            return;
   384	        }
   385	        // Fewer than 2 elements in sequence, or out of memory.  Handle has single block.
   386	        _Tp __sum = __initial;
   387	        if (__n)
   388	            __sum = __combine(__sum, __reduce(_Index(0), __n));
   389	        __apex(__sum);
   390	        if (__n)
   391	            __scan(_Index(0), __n, __initial);
   392	    });
   393	}
   394	
   395	template <class _ExecutionPolicy, class _Index, class _Up, class _Tp, class _Cp, class _Rp, class _Sp>
   396	_Tp
   397	__parallel_transform_scan(_ExecutionPolicy&&, _Index __n, _Up __u, _Tp __init, _Cp __combine, _Rp __brick_reduce,
   398	                          _Sp __scan)
   399	{
   400	    __trans_scan_body<_Index, _Up, _Tp, _Cp, _Rp, _Sp> __body(__u, __init, __combine, __brick_reduce, __scan);
   401	    auto __range = tbb::blocked_range<_Index>(0, __n);
   402	    tbb::this_task_arena::isolate([__range, &__body]() { tbb::parallel_scan(__range, __body); });
   403	    return __body.sum();
   404	}
   405	
   406	//------------------------------------------------------------------------
   407	// parallel_stable_sort
   408	//------------------------------------------------------------------------
   409	
   410	//------------------------------------------------------------------------
   411	// stable_sort utilities
   412	//
   413	// These are used by parallel implementations but do not depend on them.
   414	//------------------------------------------------------------------------
   415	#define _PSTL_MERGE_CUT_OFF 2000
   416	
   417	template <typename _Func>
   418	class __func_task;
   419	template <typename _Func>
   420	class __root_task;
   421	
   422	#if TBB_INTERFACE_VERSION <= 12000
   423	class __task : public tbb::task
   424	{
   425	  public:
   426	    template <typename _Fn>
   427	    __task*
   428	    make_continuation(_Fn&& __f)
   429	    {
   430	        return new (allocate_continuation()) __func_task<typename std::decay<_Fn>::type>(std::forward<_Fn>(__f));
   431	    }
   432	
   433	    template <typename _Fn>
   434	    __task*
   435	    make_child_of(__task* parent, _Fn&& __f)
   436	    {
   437	        return new (parent->allocate_child()) __func_task<typename std::decay<_Fn>::type>(std::forward<_Fn>(__f));
   438	    }
   439	
   440	    template <typename _Fn>
   441	    __task*
   442	    make_additional_child_of(tbb::task* parent, _Fn&& __f)
   443	    {
   444	        return new (tbb::task::allocate_additional_child_of(*parent))
   445	            __func_task<typename std::decay<_Fn>::type>(std::forward<_Fn>(__f));
   446	    }
   447	
   448	    inline void
   449	    recycle_as_continuation()
   450	    {
   451	        tbb::task::recycle_as_continuation();
   452	    }
   453	
   454	    inline void
   455	    recycle_as_child_of(__task* parent)
   456	    {
   457	        tbb::task::recycle_as_child_of(*parent);
   458	    }
   459	
   460	    inline void
   461	    spawn(__task* __t)
   462	    {
   463	        tbb::task::spawn(*__t);
   464	    }
   465	
   466	    template <typename _Fn>
   467	    static inline void
   468	    spawn_root_and_wait(__root_task<_Fn>& __root)
   469	    {
   470	        tbb::task::spawn_root_and_wait(*__root._M_task);
   471	    }
   472	};
   473	
   474	template <typename _Func>
   475	class __func_task : public __task
   476	{
   477	    _Func _M_func;
   478	
   479	    tbb::task*
   480	    execute()
   481	    {
   482	        return _M_func(this);
   483	    };
   484	
   485	  public:
   486	    template <typename _Fn>
   487	    __func_task(_Fn&& __f) : _M_func{std::forward<_Fn>(__f)}
   488	    {
   489	    }
   490	
   491	    _Func&
   492	    body()
   493	    {
   494	        return _M_func;
   495	    }
   496	};
   497	
   498	template <typename _Func>
   499	class __root_task
   500	{
   501	    tbb::task* _M_task;
   502	
   503	  public:
   504	    template <typename... Args>
   505	    __root_task(Args&&... args)
   506	        : _M_task{new (tbb::task::allocate_root()) __func_task<_Func>{_Func(std::forward<Args>(args)...)}}
   507	    {
   508	    }
   509	
   510	    friend class __task;
   511	    friend class __func_task<_Func>;
   512	};
   513	
   514	#else  // TBB_INTERFACE_VERSION <= 12000
   515	class __task : public tbb::detail::d1::task
   516	{
   517	  protected:
   518	    tbb::detail::d1::small_object_allocator _M_allocator{};
   519	    tbb::detail::d1::execution_data* _M_execute_data{};
   520	    __task* _M_parent{};
   521	    std::atomic<int> _M_refcount{};
   522	    bool _M_recycle{};
   523	
   524	    template <typename _Fn>
   525	    __task*
   526	    allocate_func_task(_Fn&& __f)
   527	    {
   528	        _PSTL_ASSERT(_M_execute_data != nullptr);
   529	        tbb::detail::d1::small_object_allocator __alloc{};
   530	        auto __t =
   531	            __alloc.new_object<__func_task<typename std::decay<_Fn>::type>>(*_M_execute_data, std::forward<_Fn>(__f));
   532	        __t->_M_allocator = __alloc;
   533	        return __t;
   534	    }
   535	
   536	  public:
   537	    __task*
   538	    parent()
   539	    {
   540	        return _M_parent;
   541	    }
   542	
   543	    void
   544	    set_ref_count(int __n)
   545	    {
   546	        _M_refcount.store(__n, std::memory_order_release);
   547	    }
   548	
   549	    template <typename _Fn>
   550	    __task*
   551	    make_continuation(_Fn&& __f)
   552	    {
   553	        auto __t = allocate_func_task(std::forward<_Fn&&>(__f));
   554	        __t->_M_parent = _M_parent;
   555	        _M_parent = nullptr;
   556	        return __t;
   557	    }
   558	
   559	    template <typename _Fn>
   560	    __task*
   561	    make_child_of(__task* __parent, _Fn&& __f)
   562	    {
   563	        auto __t = allocate_func_task(std::forward<_Fn&&>(__f));
   564	        __t->_M_parent = __parent;
   565	        return __t;
   566	    }
   567	
   568	    template <typename _Fn>
   569	    __task*
   570	    make_additional_child_of(__task* __parent, _Fn&& __f)
   571	    {
   572	        auto __t = make_child_of(__parent, std::forward<_Fn>(__f));
   573	        _PSTL_ASSERT(__parent->_M_refcount.load(std::memory_order_relaxed) > 0);
   574	        ++__parent->_M_refcount;
   575	        return __t;
   576	    }
   577	
   578	    inline void
   579	    recycle_as_continuation()
   580	    {
   581	        _M_recycle = true;
   582	    }
   583	
   584	    inline void
   585	    recycle_as_child_of(__task* parent)
   586	    {
   587	        _M_recycle = true;
   588	        _M_parent = parent;
   589	    }
   590	
   591	    inline void
   592	    spawn(__task* __t)
   593	    {
   594	        _PSTL_ASSERT(_M_execute_data != nullptr);
   595	        tbb::detail::d1::spawn(*__t, *_M_execute_data->context);
   596	    }
   597	
   598	    template <typename _Fn>
   599	    static inline void
   600	    spawn_root_and_wait(__root_task<_Fn>& __root)
   601	    {
   602	        tbb::detail::d1::execute_and_wait(*__root._M_func_task, __root._M_context, __root._M_wait_object,
   603	                                          __root._M_context);
   604	    }
   605	
   606	    template <typename _Func>
   607	    friend class __func_task;
   608	};
   609	
   610	template <typename _Func>
   611	class __func_task : public __task
   612	{
   613	    _Func _M_func;
   614	
   615	    __task*
   616	    execute(tbb::detail::d1::execution_data& __ed) override
   617	    {
   618	        _M_execute_data = &__ed;
   619	        _M_recycle = false;
   620	        __task* __next = _M_func(this);
   621	        return finalize(__next);
   622	    };
   623	
   624	    __task*
   625	    cancel(tbb::detail::d1::execution_data& __ed) override
   626	    {
   627	        return finalize(nullptr);
   628	    }
   629	
   630	    __task*
   631	    finalize(__task* __next)
   632	    {
   633	        bool __recycle = _M_recycle;
   634	        _M_recycle = false;
   635	
   636	        if (__recycle)
   637	        {
   638	            return __next;
   639	        }
   640	
   641	        auto __parent = _M_parent;
   642	        auto __alloc = _M_allocator;
   643	        auto __ed = _M_execute_data;
   644	
   645	        this->~__func_task();
   646	
   647	        _PSTL_ASSERT(__parent != nullptr);
   648	        _PSTL_ASSERT(__parent->_M_refcount.load(std::memory_order_relaxed) > 0);
   649	        if (--__parent->_M_refcount == 0)
   650	        {
   651	            _PSTL_ASSERT(__next == nullptr);
   652	            __alloc.deallocate(this, *__ed);
   653	            return __parent;
   654	        }
   655	
   656	        return __next;
   657	    }
   658	
   659	    friend class __root_task<_Func>;
   660	
   661	  public:
   662	    template <typename _Fn>
   663	    __func_task(_Fn&& __f) : _M_func(std::forward<_Fn>(__f))
   664	    {
   665	    }
   666	
   667	    _Func&
   668	    body()
   669	    {
   670	        return _M_func;
   671	    }
   672	};
   673	
   674	template <typename _Func>
   675	class __root_task : public __task
   676	{
   677	    __task*
   678	    execute(tbb::detail::d1::execution_data& __ed) override
   679	    {
   680	        _M_wait_object.release();
   681	        return nullptr;
   682	    };
   683	
   684	    __task*
   685	    cancel(tbb::detail::d1::execution_data& __ed) override
   686	    {
   687	        _M_wait_object.release();
   688	        return nullptr;
   689	    }
   690	
   691	    __func_task<_Func>* _M_func_task{};
   692	    tbb::detail::d1::wait_context _M_wait_object{0};
   693	    tbb::task_group_context _M_context{};
   694	
   695	  public:
   696	    template <typename... Args>
   697	    __root_task(Args&&... args) : _M_wait_object{1}
   698	    {
   699	        tbb::detail::d1::small_object_allocator __alloc{};
   700	        _M_func_task = __alloc.new_object<__func_task<_Func>>(_Func(std::forward<Args>(args)...));
   701	        _M_func_task->_M_allocator = __alloc;
   702	        _M_func_task->_M_parent = this;
   703	        _M_refcount.store(1, std::memory_order_relaxed);
   704	    }
   705	
   706	    friend class __task;
   707	};
   708	#endif // TBB_INTERFACE_VERSION <= 12000
   709	
   710	template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Compare, typename _Cleanup,
   711	          typename _LeafMerge>
   712	class __merge_func
   713	{
   714	    typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
   715	    typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
   716	    typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
   717	    typedef typename std::iterator_traits<_RandomAccessIterator1>::value_type _ValueType;
   718	
   719	    _RandomAccessIterator1 _M_x_beg;
   720	    _RandomAccessIterator2 _M_z_beg;
   721	
   722	    _SizeType _M_xs, _M_xe;
   723	    _SizeType _M_ys, _M_ye;
   724	    _SizeType _M_zs;
   725	    _Compare _M_comp;
   726	    _LeafMerge _M_leaf_merge;
   727	    _SizeType _M_nsort; //number of elements to be sorted for partial_sort alforithm
   728	
   729	    static const _SizeType __merge_cut_off = _PSTL_MERGE_CUT_OFF;
   730	
   731	    bool _root;   //means a task is merging root task
   732	    bool _x_orig; //"true" means X(or left ) subrange is in the original container; false - in the buffer
   733	    bool _y_orig; //"true" means Y(or right) subrange is in the original container; false - in the buffer
   734	    bool _split; //"true" means a merge task is a split task for parallel merging, the execution logic differs
   735	
   736	    bool
   737	    is_partial() const
   738	    {
   739	        return _M_nsort > 0;
   740	    }
   741	
   742	    struct __move_value
   743	    {
   744	        template <typename Iterator1, typename Iterator2>
   745	        void
   746	        operator()(Iterator1 __x, Iterator2 __z)
   747	        {
   748	            *__z = std::move(*__x);
   749	        }
   750	    };
   751	
   752	    struct __move_value_construct
   753	    {
   754	        template <typename Iterator1, typename Iterator2>
   755	        void
   756	        operator()(Iterator1 __x, Iterator2 __z)
   757	        {
   758	            ::new (std::addressof(*__z)) _ValueType(std::move(*__x));
   759	        }
   760	    };
   761	
   762	    struct __move_range
   763	    {
   764	        template <typename Iterator1, typename Iterator2>
   765	        Iterator2
   766	        operator()(Iterator1 __first1, Iterator1 __last1, Iterator2 __first2)
   767	        {
   768	            if (__last1 - __first1 < __merge_cut_off)
   769	                return std::move(__first1, __last1, __first2);
   770	
   771	            auto __n = __last1 - __first1;
   772	            tbb::parallel_for(tbb::blocked_range<_SizeType>(0, __n, __merge_cut_off),
   773	                              [__first1, __first2](const tbb::blocked_range<_SizeType>& __range) {
   774	                                  std::move(__first1 + __range.begin(), __first1 + __range.end(),
   775	                                            __first2 + __range.begin());
   776	                              });
   777	            return __first2 + __n;
   778	        }
   779	    };
   780	
   781	    struct __move_range_construct
   782	    {
   783	        template <typename Iterator1, typename Iterator2>
   784	        Iterator2
   785	        operator()(Iterator1 __first1, Iterator1 __last1, Iterator2 __first2)
   786	        {
   787	            if (__last1 - __first1 < __merge_cut_off)
   788	            {
   789	                for (; __first1 != __last1; ++__first1, ++__first2)
   790	                    __move_value_construct()(__first1, __first2);
   791	                return __first2;
   792	            }
   793	
   794	            auto __n = __last1 - __first1;
   795	            tbb::parallel_for(tbb::blocked_range<_SizeType>(0, __n, __merge_cut_off),
   796	                              [__first1, __first2](const tbb::blocked_range<_SizeType>& __range) {
   797	                                  for (auto i = __range.begin(); i != __range.end(); ++i)
   798	                                      __move_value_construct()(__first1 + i, __first2 + i);
   799	                              });
   800	            return __first2 + __n;
   801	        }
   802	    };
   803	
   804	    struct __cleanup_range
   805	    {
   806	        template <typename _Iterator>
   807	        void
   808	        operator()(_Iterator __first, _Iterator __last)
   809	        {
   810	            if (__last - __first < __merge_cut_off)
   811	                _Cleanup()(__first, __last);
   812	            else
   813	            {
   814	                auto __n = __last - __first;
   815	                tbb::parallel_for(tbb::blocked_range<_SizeType>(0, __n, __merge_cut_off),
   816	                                  [__first](const tbb::blocked_range<_SizeType>& __range) {
   817	                                      _Cleanup()(__first + __range.begin(), __first + __range.end());
   818	                                  });
   819	            }
   820	        }
   821	    };
   822	
   823	  public:
   824	    __merge_func(_SizeType __xs, _SizeType __xe, _SizeType __ys, _SizeType __ye, _SizeType __zs, _Compare __comp,
   825	                 _Cleanup, _LeafMerge __leaf_merge, _SizeType __nsort, _RandomAccessIterator1 __x_beg,
   826	                 _RandomAccessIterator2 __z_beg, bool __x_orig, bool __y_orig, bool __root)
   827	        : _M_xs(__xs), _M_xe(__xe), _M_ys(__ys), _M_ye(__ye), _M_zs(__zs), _M_x_beg(__x_beg), _M_z_beg(__z_beg),
   828	          _M_comp(__comp), _M_leaf_merge(__leaf_merge), _M_nsort(__nsort), _root(__root),
   829	          _x_orig(__x_orig), _y_orig(__y_orig), _split(false)
   830	    {
   831	    }
   832	
   833	    bool
   834	    is_left(_SizeType __idx) const
   835	    {
   836	        return _M_xs == __idx;
   837	    }
   838	
   839	    template <typename IndexType>
   840	    void
   841	    set_odd(IndexType __idx, bool __on_off)
   842	    {
   843	        if (is_left(__idx))
   844	            _x_orig = __on_off;
   845	        else
   846	            _y_orig = __on_off;
   847	    }
   848	
   849	    __task*
   850	    operator()(__task* __self);
   851	
   852	  private:
   853	    __merge_func*
   854	    parent_merge(__task* __self) const
   855	    {
   856	        return _root ? nullptr : &static_cast<__func_task<__merge_func>*>(__self->parent())->body();
   857	    }
   858	    bool
   859	    x_less_y()
   860	    {
   861	        const auto __nx = (_M_xe - _M_xs);
   862	        const auto __ny = (_M_ye - _M_ys);
   863	        _PSTL_ASSERT(__nx > 0 && __ny > 0);
   864	
   865	        _PSTL_ASSERT(_x_orig == _y_orig);
   866	        _PSTL_ASSERT(!is_partial());
   867	
   868	        if (_x_orig)
   869	        {
   870	            _PSTL_ASSERT(std::is_sorted(_M_x_beg + _M_xs, _M_x_beg + _M_xe, _M_comp));
   871	            _PSTL_ASSERT(std::is_sorted(_M_x_beg + _M_ys, _M_x_beg + _M_ye, _M_comp));
   872	            return !_M_comp(*(_M_x_beg + _M_ys), *(_M_x_beg + _M_xe - 1));
   873	        }
   874	
   875	        _PSTL_ASSERT(std::is_sorted(_M_z_beg + _M_xs, _M_z_beg + _M_xe, _M_comp));
   876	        _PSTL_ASSERT(std::is_sorted(_M_z_beg + _M_ys, _M_z_beg + _M_ye, _M_comp));
   877	        return !_M_comp(*(_M_z_beg + _M_zs + __nx), *(_M_z_beg + _M_zs + __nx - 1));
   878	    }
   879	    void
   880	    move_x_range()
   881	    {
   882	        const auto __nx = (_M_xe - _M_xs);
   883	        const auto __ny = (_M_ye - _M_ys);
   884	        _PSTL_ASSERT(__nx > 0 && __ny > 0);
   885	
   886	        if (_x_orig)
   887	            __move_range_construct()(_M_x_beg + _M_xs, _M_x_beg + _M_xe, _M_z_beg + _M_zs);
   888	        else
   889	        {
   890	            __move_range()(_M_z_beg + _M_zs, _M_z_beg + _M_zs + __nx, _M_x_beg + _M_xs);
   891	            __cleanup_range()(_M_z_beg + _M_zs, _M_z_beg + _M_zs + __nx);
   892	        }
   893	
   894	        _x_orig = !_x_orig;
   895	    }
   896	    void
   897	    move_y_range()
   898	    {
   899	        const auto __nx = (_M_xe - _M_xs);
   900	        const auto __ny = (_M_ye - _M_ys);
   901	
   902	        if (_y_orig)
   903	            __move_range_construct()(_M_x_beg + _M_ys, _M_x_beg + _M_ye, _M_z_beg + _M_zs + __nx);
   904	        else
   905	        {
   906	            __move_range()(_M_z_beg + _M_zs + __nx, _M_z_beg + _M_zs + __nx + __ny, _M_x_beg + _M_ys);
   907	            __cleanup_range()(_M_z_beg + _M_zs + __nx, _M_z_beg + _M_zs + __nx + __ny);
   908	        }
   909	
   910	        _y_orig = !_y_orig;
   911	    }
   912	    __task*
   913	    merge_ranges(__task* __self)
   914	    {
   915	        _PSTL_ASSERT(_x_orig == _y_orig); //two merged subrange must be lie into the same buffer
   916	
   917	        const auto __nx = (_M_xe - _M_xs);
   918	        const auto __ny = (_M_ye - _M_ys);
   919	        const auto __n = __nx + __ny;
   920	
   921	        // need to merge {x} and {y}
   922	        if (__n > __merge_cut_off)
   923	            return split_merging(__self);
   924	
   925	        //merge to buffer
   926	        if (_x_orig)
   927	        {
   928	            _M_leaf_merge(_M_x_beg + _M_xs, _M_x_beg + _M_xe, _M_x_beg + _M_ys, _M_x_beg + _M_ye, _M_z_beg + _M_zs,
   929	                          _M_comp, __move_value_construct(), __move_value_construct(), __move_range_construct(),
   930	                          __move_range_construct());
   931	            _PSTL_ASSERT(parent_merge(__self)); //not root merging task
   932	        }
   933	        //merge to "origin"
   934	        else
   935	        {
   936	            _PSTL_ASSERT(_x_orig == _y_orig);
   937	
   938	            _PSTL_ASSERT(is_partial() || std::is_sorted(_M_z_beg + _M_xs, _M_z_beg + _M_xe, _M_comp));
   939	            _PSTL_ASSERT(is_partial() || std::is_sorted(_M_z_beg + _M_ys, _M_z_beg + _M_ye, _M_comp));
   940	
   941	            const auto __nx = (_M_xe - _M_xs);
   942	            const auto __ny = (_M_ye - _M_ys);
   943	
   944	            _M_leaf_merge(_M_z_beg + _M_xs, _M_z_beg + _M_xe, _M_z_beg + _M_ys, _M_z_beg + _M_ye, _M_x_beg + _M_zs,
   945	                          _M_comp, __move_value(), __move_value(), __move_range(), __move_range());
   946	
   947	            __cleanup_range()(_M_z_beg + _M_xs, _M_z_beg + _M_xe);
   948	            __cleanup_range()(_M_z_beg + _M_ys, _M_z_beg + _M_ye);
   949	        }
   950	        return nullptr;
   951	    }
   952	
   953	    __task*
   954	    process_ranges(__task* __self)
   955	    {
   956	        _PSTL_ASSERT(_x_orig == _y_orig);
   957	        _PSTL_ASSERT(!_split);
   958	
   959	        auto p = parent_merge(__self);
   960	
   961	        if (!p)
   962	        { //root merging task
   963	
   964	            //optimization, just for sort algorithm, //{x} <= {y}
   965	            if (!is_partial() && x_less_y()) //we have a solution
   966	            {
   967	                if (!_x_orig)
   968	                {                   //we have to move the solution to the origin
   969	                    move_x_range(); //parallel moving
   970	                    move_y_range(); //parallel moving
   971	                }
   972	                return nullptr;
   973	            }
   974	            //else: if we have data in the origin,
   975	            //we have to move data to the buffer for final merging into the origin.
   976	            if (_x_orig)
   977	            {
   978	                move_x_range(); //parallel moving
   979	                move_y_range(); //parallel moving
   980	            }
   981	            // need to merge {x} and {y}.
   982	            return merge_ranges(__self);
   983	        }
   984	        //else: not root merging task (parent_merge() == NULL)
   985	        //optimization, just for sort algorithm, //{x} <= {y}
   986	        if (!is_partial() && x_less_y())
   987	        {
   988	            const auto id_range = _M_zs;
   989	            p->set_odd(id_range, _x_orig);
   990	            return nullptr;
   991	        }
   992	        //else: we have to revert "_x(y)_orig" flag of the parent merging task
   993	        const auto id_range = _M_zs;
   994	        p->set_odd(id_range, !_x_orig);
   995	
   996	        return merge_ranges(__self);
   997	    }
   998	
   999	    //splitting as merge task into 2 of the same level
  1000	    __task*
  1001	    split_merging(__task* __self)
  1002	    {
  1003	        _PSTL_ASSERT(_x_orig == _y_orig);
  1004	        const auto __nx = (_M_xe - _M_xs);
  1005	        const auto __ny = (_M_ye - _M_ys);
  1006	
  1007	        _SizeType __xm{};
  1008	        _SizeType __ym{};
  1009	        if (__nx < __ny)
  1010	        {
  1011	            __ym = _M_ys + __ny / 2;
  1012	
  1013	            if (_x_orig)
  1014	                __xm = std::upper_bound(_M_x_beg + _M_xs, _M_x_beg + _M_xe, *(_M_x_beg + __ym), _M_comp) - _M_x_beg;
  1015	            else
  1016	                __xm = std::upper_bound(_M_z_beg + _M_xs, _M_z_beg + _M_xe, *(_M_z_beg + __ym), _M_comp) - _M_z_beg;
  1017	        }
  1018	        else
  1019	        {
  1020	            __xm = _M_xs + __nx / 2;
  1021	
  1022	            if (_y_orig)
  1023	                __ym = std::lower_bound(_M_x_beg + _M_ys, _M_x_beg + _M_ye, *(_M_x_beg + __xm), _M_comp) - _M_x_beg;
  1024	            else
  1025	                __ym = std::lower_bound(_M_z_beg + _M_ys, _M_z_beg + _M_ye, *(_M_z_beg + __xm), _M_comp) - _M_z_beg;
  1026	        }
  1027	
  1028	        auto __zm = _M_zs + ((__xm - _M_xs) + (__ym - _M_ys));
  1029	        __merge_func __right_func(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _Cleanup(), _M_leaf_merge, _M_nsort,
  1030	                                  _M_x_beg, _M_z_beg, _x_orig, _y_orig, _root);
  1031	        __right_func._split = true;
  1032	        auto __merge_task = __self->make_additional_child_of(__self->parent(), std::move(__right_func));
  1033	        __self->spawn(__merge_task);
  1034	        __self->recycle_as_continuation();
  1035	
  1036	        _M_xe = __xm;
  1037	        _M_ye = __ym;
  1038	        _split = true;
  1039	
  1040	        return __self;
  1041	    }
  1042	};
  1043	
  1044	template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename __M_Compare, typename _Cleanup,
  1045	          typename _LeafMerge>
  1046	__task*
  1047	__merge_func<_RandomAccessIterator1, _RandomAccessIterator2, __M_Compare, _Cleanup, _LeafMerge>::
  1048	operator()(__task* __self)
  1049	{
  1050	    //a. split merge task into 2 of the same level; the special logic,
  1051	    //without processing(process_ranges) adjacent sub-ranges x and y
  1052	    if (_split)
  1053	        return merge_ranges(__self);
  1054	
  1055	    //b. General merging of adjacent sub-ranges x and y (with optimization in case of {x} <= {y} )
  1056	
  1057	    //1. x and y are in the even buffer
  1058	    //2. x and y are in the odd buffer
  1059	    if (_x_orig == _y_orig)
  1060	        return process_ranges(__self);
  1061	
  1062	    //3. x is in even buffer, y is in the odd buffer
  1063	    //4. x is in odd buffer, y is in the even buffer
  1064	    if (!parent_merge(__self))
  1065	    { //root merge task
  1066	        if (_x_orig)
  1067	            move_x_range();
  1068	        else
  1069	            move_y_range();
  1070	    }
  1071	    else
  1072	    {
  1073	        const _SizeType __nx = (_M_xe - _M_xs);
  1074	        const _SizeType __ny = (_M_ye - _M_ys);
  1075	        _PSTL_ASSERT(__nx > 0);
  1076	        _PSTL_ASSERT(__nx > 0);
  1077	
  1078	        if (__nx < __ny)
  1079	            move_x_range();
  1080	        else
  1081	            move_y_range();
  1082	    }
  1083	
  1084	    return process_ranges(__self);
  1085	}
  1086	
  1087	template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Compare, typename _LeafSort>
  1088	class __stable_sort_func
  1089	{
  1090	  public:
  1091	    typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
  1092	    typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
  1093	    typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
  1094	
  1095	  private:
  1096	    _RandomAccessIterator1 _M_xs, _M_xe, _M_x_beg;
  1097	    _RandomAccessIterator2 _M_zs, _M_z_beg;
  1098	    _Compare _M_comp;
  1099	    _LeafSort _M_leaf_sort;
  1100	    bool _M_root;
  1101	    _SizeType _M_nsort; //zero or number of elements to be sorted for partial_sort alforithm
  1102	
  1103	  public:
  1104	    __stable_sort_func(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __zs,
  1105	                       bool __root, _Compare __comp, _LeafSort __leaf_sort, _SizeType __nsort,
  1106	                       _RandomAccessIterator1 __x_beg, _RandomAccessIterator2 __z_beg)
  1107	        : _M_xs(__xs), _M_xe(__xe), _M_x_beg(__x_beg), _M_zs(__zs), _M_z_beg(__z_beg), _M_comp(__comp),
  1108	          _M_leaf_sort(__leaf_sort), _M_root(__root), _M_nsort(__nsort)
  1109	    {
  1110	    }
  1111	
  1112	    __task*
  1113	    operator()(__task* __self);
  1114	};
  1115	
  1116	#define _PSTL_STABLE_SORT_CUT_OFF 500
  1117	
  1118	template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Compare, typename _LeafSort>
  1119	__task*
  1120	__stable_sort_func<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, _LeafSort>::operator()(__task* __self)
  1121	{
  1122	    typedef __merge_func<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, __utils::__serial_destroy,
  1123	                         __utils::__serial_move_merge>
  1124	        _MergeTaskType;
  1125	
  1126	    const _SizeType __n = _M_xe - _M_xs;
  1127	    const _SizeType __nmerge = _M_nsort > 0 ? _M_nsort : __n;
  1128	    const _SizeType __sort_cut_off = _PSTL_STABLE_SORT_CUT_OFF;
  1129	    if (__n <= __sort_cut_off)
  1130	    {
  1131	        _M_leaf_sort(_M_xs, _M_xe, _M_comp);
  1132	        _PSTL_ASSERT(!_M_root);
  1133	        return nullptr;
  1134	    }
  1135	
  1136	    const _RandomAccessIterator1 __xm = _M_xs + __n / 2;
  1137	    const _RandomAccessIterator2 __zm = _M_zs + (__xm - _M_xs);
  1138	    const _RandomAccessIterator2 __ze = _M_zs + __n;
  1139	    _MergeTaskType __m(_MergeTaskType(_M_xs - _M_x_beg, __xm - _M_x_beg, __xm - _M_x_beg, _M_xe - _M_x_beg,
  1140	                                      _M_zs - _M_z_beg, _M_comp, __utils::__serial_destroy(),
  1141	                                      __utils::__serial_move_merge(__nmerge), _M_nsort, _M_x_beg, _M_z_beg,
  1142	                                      /*x_orig*/ true, /*y_orig*/ true, /*root*/ _M_root));
  1143	    auto __parent = __self->make_continuation(std::move(__m));
  1144	    __parent->set_ref_count(2);
  1145	    auto __right = __self->make_child_of(
  1146	        __parent, __stable_sort_func(__xm, _M_xe, __zm, false, _M_comp, _M_leaf_sort, _M_nsort, _M_x_beg, _M_z_beg));
  1147	    __self->spawn(__right);
  1148	    __self->recycle_as_child_of(__parent);
  1149	    _M_root = false;
  1150	    _M_xe = __xm;
  1151	
  1152	    return __self;
  1153	}
  1154	
  1155	template <class _ExecutionPolicy, typename _RandomAccessIterator, typename _Compare, typename _LeafSort>
  1156	void
  1157	__parallel_stable_sort(_ExecutionPolicy&&, _RandomAccessIterator __xs, _RandomAccessIterator __xe, _Compare __comp,
  1158	                       _LeafSort __leaf_sort, std::size_t __nsort = 0)
  1159	{
  1160	    tbb::this_task_arena::isolate([=, &__nsort]() {
  1161	        //sorting based on task tree and parallel merge
  1162	        typedef typename std::iterator_traits<_RandomAccessIterator>::value_type _ValueType;
  1163	        typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type _DifferenceType;
  1164	        const _DifferenceType __n = __xe - __xs;
  1165	        if (__nsort == __n)
  1166	            __nsort = 0; // 'partial_sort' becames 'sort'
  1167	
  1168	        const _DifferenceType __sort_cut_off = _PSTL_STABLE_SORT_CUT_OFF;
  1169	        if (__n > __sort_cut_off)
  1170	        {
  1171	            __buffer<_ValueType> __buf(__n);
  1172	            __root_task<__stable_sort_func<_RandomAccessIterator, _ValueType*, _Compare, _LeafSort>> __root{
  1173	                __xs, __xe, __buf.get(), true, __comp, __leaf_sort, __nsort, __xs, __buf.get()};
  1174	            __task::spawn_root_and_wait(__root);
  1175	            return;
  1176	        }
  1177	        //serial sort
  1178	        __leaf_sort(__xs, __xe, __comp);
  1179	    });
  1180	}
  1181	
  1182	//------------------------------------------------------------------------
  1183	// parallel_merge
  1184	//------------------------------------------------------------------------
  1185	template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _RandomAccessIterator3,
  1186	          typename _Compare, typename _LeafMerge>
  1187	class __merge_func_static
  1188	{
  1189	    _RandomAccessIterator1 _M_xs, _M_xe;
  1190	    _RandomAccessIterator2 _M_ys, _M_ye;
  1191	    _RandomAccessIterator3 _M_zs;
  1192	    _Compare _M_comp;
  1193	    _LeafMerge _M_leaf_merge;
  1194	
  1195	  public:
  1196	    __merge_func_static(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __ys,
  1197	                        _RandomAccessIterator2 __ye, _RandomAccessIterator3 __zs, _Compare __comp,
  1198	                        _LeafMerge __leaf_merge)
  1199	        : _M_xs(__xs), _M_xe(__xe), _M_ys(__ys), _M_ye(__ye), _M_zs(__zs), _M_comp(__comp), _M_leaf_merge(__leaf_merge)
  1200	    {
  1201	    }
  1202	
  1203	    __task*
  1204	    operator()(__task* __self);
  1205	};
  1206	
  1207	//TODO: consider usage of parallel_for with a custom blocked_range
  1208	template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _RandomAccessIterator3,
  1209	          typename __M_Compare, typename _LeafMerge>
  1210	__task*
  1211	__merge_func_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3, __M_Compare, _LeafMerge>::
  1212	operator()(__task* __self)
  1213	{
  1214	    typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
  1215	    typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
  1216	    typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
  1217	    const _SizeType __n = (_M_xe - _M_xs) + (_M_ye - _M_ys);
  1218	    const _SizeType __merge_cut_off = _PSTL_MERGE_CUT_OFF;
  1219	    if (__n <= __merge_cut_off)
  1220	    {
  1221	        _M_leaf_merge(_M_xs, _M_xe, _M_ys, _M_ye, _M_zs, _M_comp);
  1222	        return nullptr;
  1223	    }
  1224	
  1225	    _RandomAccessIterator1 __xm;
  1226	    _RandomAccessIterator2 __ym;
  1227	    if (_M_xe - _M_xs < _M_ye - _M_ys)
  1228	    {
  1229	        __ym = _M_ys + (_M_ye - _M_ys) / 2;
  1230	        __xm = std::upper_bound(_M_xs, _M_xe, *__ym, _M_comp);
  1231	    }
  1232	    else
  1233	    {
  1234	        __xm = _M_xs + (_M_xe - _M_xs) / 2;
  1235	        __ym = std::lower_bound(_M_ys, _M_ye, *__xm, _M_comp);
  1236	    }
  1237	    const _RandomAccessIterator3 __zm = _M_zs + ((__xm - _M_xs) + (__ym - _M_ys));
  1238	    auto __right = __self->make_additional_child_of(
  1239	        __self->parent(), __merge_func_static(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _M_leaf_merge));
  1240	    __self->spawn(__right);
  1241	    __self->recycle_as_continuation();
  1242	    _M_xe = __xm;
  1243	    _M_ye = __ym;
  1244	
  1245	    return __self;
  1246	}
  1247	
  1248	template <class _ExecutionPolicy, typename _RandomAccessIterator1, typename _RandomAccessIterator2,
  1249	          typename _RandomAccessIterator3, typename _Compare, typename _LeafMerge>
  1250	void
  1251	__parallel_merge(_ExecutionPolicy&&, _RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe,
  1252	                 _RandomAccessIterator2 __ys, _RandomAccessIterator2 __ye, _RandomAccessIterator3 __zs, _Compare __comp,
  1253	                 _LeafMerge __leaf_merge)
  1254	{
  1255	    typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
  1256	    typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
  1257	    typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
  1258	    const _SizeType __n = (__xe - __xs) + (__ye - __ys);
  1259	    const _SizeType __merge_cut_off = _PSTL_MERGE_CUT_OFF;
  1260	    if (__n <= __merge_cut_off)
  1261	    {
  1262	        // Fall back on serial merge
  1263	        __leaf_merge(__xs, __xe, __ys, __ye, __zs, __comp);
  1264	    }
  1265	    else
  1266	    {
  1267	        tbb::this_task_arena::isolate([=]() {
  1268	            typedef __merge_func_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3,
  1269	                                        _Compare, _LeafMerge>
  1270	                _TaskType;
  1271	            __root_task<_TaskType> __root{__xs, __xe, __ys, __ye, __zs, __comp, __leaf_merge};
  1272	            __task::spawn_root_and_wait(__root);
  1273	        });
  1274	    }
  1275	}
  1276	
  1277	//------------------------------------------------------------------------
  1278	// parallel_invoke
  1279	//------------------------------------------------------------------------
  1280	template <class _ExecutionPolicy, typename _F1, typename _F2>
  1281	void
  1282	__parallel_invoke(_ExecutionPolicy&&, _F1&& __f1, _F2&& __f2)
  1283	{
  1284	    //TODO: a version of tbb::this_task_arena::isolate with variadic arguments pack should be added in the future
  1285	    tbb::this_task_arena::isolate([&]() { tbb::parallel_invoke(std::forward<_F1>(__f1), std::forward<_F2>(__f2)); });
  1286	}
  1287	
  1288	} // namespace __tbb_backend
  1289	} // namespace __pstl
  1290	
  1291	#endif /* _PSTL_PARALLEL_BACKEND_TBB_H */
Welcome to MyWebUniversity on April 15, 2025.