Lift
Library of parallel computing primitives for GPUs and multi-core CPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
launch_parameters.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2014-2015, NVIDIA CORPORATION
3  * Copyright (c) 2015, Nuno Subtil <subtil@gmail.com>
4  * Copyright (c) 2015, Roche Molecular Systems Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  * * Redistributions of source code must retain the above copyright
10  * notice, this list of conditions and the following disclaimer.
11  * * Redistributions in binary form must reproduce the above copyright
12  * notice, this list of conditions and the following disclaimer in the
13  * documentation and/or other materials provided with the distribution.
14  * * Neither the name of the copyright holders nor the names of its
15  * contributors may be used to endorse or promote products derived from
16  * this software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29 
30 #pragma once
31 
namespace lift {

// Compute a 1D CUDA launch configuration for a __global__ entry point.
//
// Given a kernel and the number of elements it has to process, this asks the
// occupancy API for a block size and then sizes the grid so that
// grid_size * block_size covers every input element.
// http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-occupancy-api-simplifies-launch-configuration/
// Note that this is not necessarily the most efficient launch configuration, just "not too bad".
//
// Parameters:
//   kernel            - the __global__ function that will be launched
//   elements          - number of input elements the launch must cover
//   dynamic_smem_size - dynamic shared memory per block, in bytes, forwarded to the occupancy API
//
// Returns an int2 where .x is the grid size (number of blocks) and .y is the block size.
// elements == 0 produces a grid size of 0, which callers are expected to handle
// (e.g. by skipping the launch).
//
// Any failure (occupancy query error, unusable block size, grid size that does not
// fit in an int) is reported on stderr and the process is aborted.
template <class T>
int2 launch_parameters(T kernel, size_t elements, int dynamic_smem_size = 0)
{
    int block_size = 0;     // block size returned by the launch configurator
    int min_grid_size = 0;  // minimum grid required for full occupancy (reported by the API, unused here)

    // figure out the largest potential block size for the function
    const cudaError_t err = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size,
                                                               kernel, dynamic_smem_size, 0);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "ERROR: cudaOccupancyMaxPotentialBlockSize failed (%d)\n", static_cast<int>(err));
        abort();
    }

    // a non-positive block size would make the division below undefined
    if (block_size <= 0)
    {
        fprintf(stderr, "ERROR: cudaOccupancyMaxPotentialBlockSize returned an unusable block size (%d)\n", block_size);
        abort();
    }

    // round up the grid size according to the number of input elements;
    // written as quotient + remainder so that the sum cannot wrap around for huge inputs
    const size_t threads_per_block = static_cast<size_t>(block_size);
    const size_t blocks = elements / threads_per_block
                        + (elements % threads_per_block != 0 ? 1 : 0);

    // the result is handed back as an int: refuse to silently truncate a block count
    // that does not survive the round trip through int
    const int grid_size = static_cast<int>(blocks);
    if (grid_size < 0 || static_cast<size_t>(grid_size) != blocks)
    {
        fprintf(stderr, "ERROR: launch_parameters: grid size for %llu elements does not fit in an int\n",
                static_cast<unsigned long long>(elements));
        abort();
    }

    return make_int2(grid_size, block_size);
}

} // namespace lift
int2 launch_parameters(T kernel, size_t elements, int dynamic_smem_size=0)