Lift
Library of parallel computing primitives for GPUs and multi-core CPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
parallel.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2014-2015, NVIDIA CORPORATION
3  * Copyright (c) 2015, Nuno Subtil <subtil@gmail.com>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  * * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * * Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * * Neither the name of the copyright holders nor the names of its
14  * contributors may be used to endorse or promote products derived from
15  * this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
23  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
25  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #pragma once
30 
31 #include <iterator>
32 
33 #include "types.h"
34 
35 #include "decorators.h"
36 #include "backends.h"
37 #include "memory.h"
38 
39 namespace lift {
40 
45 template <target_system system>
46 struct parallel
47 {
62  template <typename InputIterator, typename UnaryFunction>
63  static inline void for_each(InputIterator begin,
64  InputIterator end,
65  UnaryFunction f,
66  int2 launch_parameters = { 0, 0 });
67 
78  template <typename T, typename UnaryFunction>
79  static inline void for_each(pointer<system, T>& vector,
80  UnaryFunction f,
81  int2 launch_parameters = { 0, 0 });
82 
93  template <typename UnaryFunction>
94  static inline void for_each(uint2 range,
95  UnaryFunction f,
96  int2 launch_parameters = { 0, 0 });
97 
108  template <typename UnaryFunction>
109  static inline void for_each(uint32 end,
110  UnaryFunction f,
111  int2 launch_parameters = { 0, 0 });
112 
133  template <typename InputIterator, typename OutputIterator, typename Predicate>
134  static inline void inclusive_scan(InputIterator first,
135  size_t len,
136  OutputIterator result,
137  Predicate op);
138 
157  template <typename InputIterator, typename OutputIterator, typename Predicate>
158  static inline size_t copy_if(InputIterator first,
159  size_t len,
160  OutputIterator result,
161  Predicate op,
162  allocation<system, uint8>& temp_storage);
163 
183  template <typename InputIterator, typename FlagIterator, typename OutputIterator>
184  static inline size_t copy_flagged(InputIterator first,
185  size_t len,
186  OutputIterator result,
187  FlagIterator flags,
188  allocation<system, uint8>& temp_storage);
189 
203  template <typename InputIterator>
204  static inline auto sum(InputIterator first,
205  size_t len,
206  allocation<system, uint8>& temp_storage) -> typename std::iterator_traits<InputIterator>::value_type;
207 
227  template <typename Key, typename Value>
228  static inline void sort_by_key(pointer<system, Key>& keys,
229  pointer<system, Value>& values,
230  allocation<system, Key>& temp_keys,
231  allocation<system, Value>& temp_values,
232  allocation<system, uint8>& temp_storage,
233  int num_key_bits = sizeof(Key) * 8);
234 
246  template <typename Key>
247  static inline void sort(allocation<system, Key>& keys,
248  allocation<system, Key>& temp_keys,
249  allocation<system, uint8>& temp_storage);
250 
275  template <typename KeyIterator, typename ValueIterator, typename ReductionOp>
276  static inline size_t reduce_by_key(KeyIterator keys_begin,
277  KeyIterator keys_end,
278  ValueIterator values_begin,
279  KeyIterator output_keys,
280  ValueIterator output_values,
281  allocation<system, uint8>& temp_storage,
282  ReductionOp reduction_op);
283 
284 
314  template <typename Key, typename Value, typename ReductionOp>
315  static inline size_t reduce_by_key(pointer<system, Key>& keys,
316  pointer<system, Value>& values,
317  allocation<system, Key>& output_keys,
318  allocation<system, Value>& output_values,
319  allocation<system, uint8>& temp_storage,
320  ReductionOp reduction_op);
321 
322  // computes a run length encoding
323  // returns the number of runs
347  template <typename InputIterator, typename UniqueOutputIterator, typename LengthOutputIterator>
348  static inline size_t run_length_encode(InputIterator keys_input,
349  size_t num_keys,
350  UniqueOutputIterator unique_keys_output,
351  LengthOutputIterator run_lengths_output,
352  allocation<system, uint8>& temp_storage);
353 
360  static inline void synchronize();
361 
369  static inline void check_errors(void);
370 };
371 
372 } // namespace lift
373 
374 #include "parallel/parallel_cuda.inl"
375 #include "parallel/parallel_host.inl"
static size_t copy_flagged(InputIterator first, size_t len, OutputIterator result, FlagIterator flags, allocation< system, uint8 > &temp_storage)
Performs stream compaction based on a buffer with boolean flags.
static void synchronize()
Synchronizes the compute device.
static size_t run_length_encode(InputIterator keys_input, size_t num_keys, UniqueOutputIterator unique_keys_output, LengthOutputIterator run_lengths_output, allocation< system, uint8 > &temp_storage)
Compute a run-length encoding of the input key buffer.
uint32_t uint32
Definition: types.h:43
static void sort(allocation< system, Key > &keys, allocation< system, Key > &temp_keys, allocation< system, uint8 > &temp_storage)
Sort a buffer of keys.
static size_t copy_if(InputIterator first, size_t len, OutputIterator result, Predicate op, allocation< system, uint8 > &temp_storage)
Performs stream compaction based on a predicate.
static void inclusive_scan(InputIterator first, size_t len, OutputIterator result, Predicate op)
Performs a parallel inclusive scan on [first, first + len[, using op as the scan operator.
static void check_errors(void)
Check for compute device errors.
static void sort_by_key(pointer< system, Key > &keys, pointer< system, Value > &values, allocation< system, Key > &temp_keys, allocation< system, Value > &temp_values, allocation< system, uint8 > &temp_storage, int num_key_bits=sizeof(Key)*8)
Perform a sort-by-key on a key + value buffer pair.
int2 launch_parameters(T kernel, size_t elements, int dynamic_smem_size=0)
Lift's tagged pointer class.
Definition: pointer.h:276
static void for_each(InputIterator begin, InputIterator end, UnaryFunction f, int2 launch_parameters={0, 0})
Parallel for-each implementation.
static auto sum(InputIterator first, size_t len, allocation< system, uint8 > &temp_storage) -> typename std::iterator_traits< InputIterator >::value_type
Computes the arithmetic sum of a buffer.
static size_t reduce_by_key(KeyIterator keys_begin, KeyIterator keys_end, ValueIterator values_begin, KeyIterator output_keys, ValueIterator output_values, allocation< system, uint8 > &temp_storage, ReductionOp reduction_op)
Perform a reduction by key on a key/value buffer pair.
Dispatch structure for parallel primitives.
Definition: parallel.h:46