latest/doxygen/launch__parameters_8h_source.html

 /*

  * Copyright (c) 2014-2015, NVIDIA CORPORATION

  * Copyright (c) 2015, Nuno Subtil <subtil@gmail.com>

  * Copyright (c) 2015, Roche Molecular Systems Inc.

  * All rights reserved.

  *

  * Redistribution and use in source and binary forms, with or without

  * modification, are permitted provided that the following conditions are met:

  *    * Redistributions of source code must retain the above copyright

  *      notice, this list of conditions and the following disclaimer.

  *    * Redistributions in binary form must reproduce the above copyright

  *      notice, this list of conditions and the following disclaimer in the

  *      documentation and/or other materials provided with the distribution.

  *    * Neither the name of the copyright holders nor the names of its

  *      contributors may be used to endorse or promote products derived from

  *      this software without specific prior written permission.

  *

  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND

  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE

  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL

  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR

  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER

  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,

  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

  */


 #pragma once


 namespace lift {

 // Compute a 1D CUDA launch configuration for `kernel` that covers `elements`
 // input items.
 //
 // The block size is the one suggested by cudaOccupancyMaxPotentialBlockSize
 // (see http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-occupancy-api-simplifies-launch-configuration/),
 // and the grid size is ceil(elements / block size). Note that this is not
 // necessarily the most efficient launch configuration, just "not too bad".
 //
 // Parameters:
 //   kernel            - the __global__ entry point that will be launched
 //   elements          - number of input elements the launch must cover
 //   dynamic_smem_size - dynamic shared memory size forwarded to the occupancy API
 //
 // Returns:
 //   int2 with .x = grid size (number of blocks) and .y = block size (threads
 //   per block). When `elements` is 0 the grid size is 0; callers should skip
 //   the launch in that case.
 //
 // Failure handling:
 //   Prints a message to stderr and calls abort() if the occupancy query
 //   fails, if it reports a non-positive block size, or if the required
 //   number of blocks cannot be represented as a non-negative int.
 template <class T>
 int2 launch_parameters(T kernel, size_t elements, int dynamic_smem_size = 0)
 {
     int block_size = 0;     // block size returned by launch configurator
     int min_grid_size = 0;  // minimum grid required for full occupancy (required out-parameter, otherwise unused)

     // figure out the largest potential block size for the function
     const cudaError_t err = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size,
                                                                kernel, dynamic_smem_size, 0);

     if (err != cudaSuccess)
     {
         fprintf(stderr, "ERROR: cudaOccupancyMaxPotentialBlockSize failed (%d): %s\n",
                 static_cast<int>(err), cudaGetErrorString(err));
         abort();
     }

     // guard the division below: a non-positive block size means the kernel
     // cannot be launched with the requested resources
     if (block_size <= 0)
     {
         fprintf(stderr, "ERROR: cudaOccupancyMaxPotentialBlockSize returned an unusable block size (%d)\n",
                 block_size);
         abort();
     }

     // round up the grid size according to the number of input elements;
     // written as quotient + remainder test so that it cannot wrap around
     // for very large element counts
     const size_t threads = static_cast<size_t>(block_size);
     const size_t blocks = elements / threads + ((elements % threads) != 0 ? 1 : 0);

     // the result is returned in an int: refuse to silently truncate
     const int grid_size = static_cast<int>(blocks);
     if (grid_size < 0 || static_cast<size_t>(grid_size) != blocks)
     {
         fprintf(stderr, "ERROR: launch_parameters: grid size %llu does not fit in an int\n",
                 static_cast<unsigned long long>(blocks));
         abort();
     }

     return make_int2(grid_size, block_size);
 }

 } // namespace lift

lift::launch_parameters
int2 launch_parameters(T kernel, size_t elements, int dynamic_smem_size=0)
Definition: launch_parameters.h:39