45 template <target_system system>
62 template <
typename InputIterator,
typename UnaryFunction>
63 static inline void for_each(InputIterator begin,
78 template <
typename T,
typename UnaryFunction>
93 template <
typename UnaryFunction>
94 static inline void for_each(uint2 range,
108 template <
typename UnaryFunction>
133 template <
typename InputIterator,
typename OutputIterator,
typename Predicate>
136 OutputIterator result,
157 template <
typename InputIterator,
typename OutputIterator,
typename Predicate>
158 static inline size_t copy_if(InputIterator first,
160 OutputIterator result,
183 template <
typename InputIterator,
typename FlagIterator,
typename OutputIterator>
186 OutputIterator result,
203 template <
typename InputIterator>
204 static inline auto sum(InputIterator first,
227 template <
typename Key,
typename Value>
233 int num_key_bits =
sizeof(Key) * 8);
246 template <
typename Key>
275 template <
typename KeyIterator,
typename ValueIterator,
typename ReductionOp>
277 KeyIterator keys_end,
278 ValueIterator values_begin,
279 KeyIterator output_keys,
280 ValueIterator output_values,
282 ReductionOp reduction_op);
314 template <
typename Key,
typename Value,
typename ReductionOp>
320 ReductionOp reduction_op);
347 template <
typename InputIterator,
typename UniqueOutputIterator,
typename LengthOutputIterator>
350 UniqueOutputIterator unique_keys_output,
351 LengthOutputIterator run_lengths_output,
374 #include "parallel/parallel_cuda.inl"
375 #include "parallel/parallel_host.inl"
static size_t copy_flagged(InputIterator first, size_t len, OutputIterator result, FlagIterator flags, allocation< system, uint8 > &temp_storage)
Performs stream compaction based on a buffer with boolean flags.
static void synchronize()
Synchronizes the compute device.
static size_t run_length_encode(InputIterator keys_input, size_t num_keys, UniqueOutputIterator unique_keys_output, LengthOutputIterator run_lengths_output, allocation< system, uint8 > &temp_storage)
Compute a run-length encoding of the input key buffer.
static void sort(allocation< system, Key > &keys, allocation< system, Key > &temp_keys, allocation< system, uint8 > &temp_storage)
Sort a buffer of keys.
static size_t copy_if(InputIterator first, size_t len, OutputIterator result, Predicate op, allocation< system, uint8 > &temp_storage)
Performs stream compaction based on a predicate.
static void inclusive_scan(InputIterator first, size_t len, OutputIterator result, Predicate op)
Performs a parallel inclusive scan on the half-open range [first, first + len), using op as the scan operator.
static void check_errors(void)
Check for compute device errors.
static void sort_by_key(pointer< system, Key > &keys, pointer< system, Value > &values, allocation< system, Key > &temp_keys, allocation< system, Value > &temp_values, allocation< system, uint8 > &temp_storage, int num_key_bits=sizeof(Key)*8)
Perform a sort-by-key on a key + value buffer pair.
int2 launch_parameters(T kernel, size_t elements, int dynamic_smem_size=0)
Lift's tagged pointer class.
static void for_each(InputIterator begin, InputIterator end, UnaryFunction f, int2 launch_parameters={0, 0})
Parallel for-each implementation.
static auto sum(InputIterator first, size_t len, allocation< system, uint8 > &temp_storage) -> typename std::iterator_traits< InputIterator >::value_type
Computes the arithmetic sum of a buffer.
static size_t reduce_by_key(KeyIterator keys_begin, KeyIterator keys_end, ValueIterator values_begin, KeyIterator output_keys, ValueIterator output_values, allocation< system, uint8 > &temp_storage, ReductionOp reduction_op)
Perform a reduction by key on a key/value buffer pair.
Dispatch structure for parallel primitives.