This repository was archived by the owner on Mar 20, 2023. It is now read-only.
Merged
4 changes: 2 additions & 2 deletions coreneuron/apps/corenrn_parameters.hpp
@@ -46,8 +46,8 @@ struct corenrn_parameters {
unsigned ms_subint = 2; /// Number of multisend interval. 1 or 2
unsigned spkcompress = 0; /// Spike Compression
unsigned cell_interleave_permute = 0; /// Cell interleaving permutation
- unsigned nwarp = 1024; /// Number of warps to balance for cell_interleave_permute == 2
- unsigned num_gpus = 0; /// Number of gpus to use per node
+ unsigned nwarp = 65536; /// Number of warps to balance for cell_interleave_permute == 2
+ unsigned num_gpus = 0; /// Number of gpus to use per node
unsigned report_buff_size = report_buff_size_default; /// Size in MB of the report buffer.
int seed = -1; /// Initialization seed for random number generator (int)

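A back-of-the-envelope sketch of why the default moved (an interpretation for illustration, not CoreNEURON's actual balancing code): with cell_interleave_permute == 2 the cells of a thread are spread over nwarp warps, so the old default of 1024 packs several cells per warp for the benchmark circuits quoted in the cellorder.cu comment below, while 65536 leaves room for at least one warp per cell. The helper name and the exact balancing rule here are assumptions.

#include <cstdio>

// Hypothetical helper: upper bound on how many cells land in one warp when ncell
// cells are distributed over nwarp warps (ceiling division).
static unsigned cells_per_warp(unsigned ncell, unsigned nwarp) {
    return (ncell + nwarp - 1) / nwarp;
}

int main() {
    // Benchmark sizes mentioned in the cellorder.cu change below: 1440 and 8800 cells.
    std::printf("old default (1024):  up to %u cells per warp for 8800 cells\n",
                cells_per_warp(8800, 1024));   // 9
    std::printf("new default (65536): up to %u cells per warp for 8800 cells\n",
                cells_per_warp(8800, 65536));  // 1
    return 0;
}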
17 changes: 11 additions & 6 deletions coreneuron/permute/cellorder.cu
@@ -92,12 +92,17 @@ __global__ void solve_interleaved2_kernel(NrnThread* nt, InterleaveInfo* ii, int
void solve_interleaved2_launcher(NrnThread* nt, InterleaveInfo* info, int ncore, void* stream) {
auto cuda_stream = static_cast<cudaStream_t>(stream);

- // the selection of these parameters has been done after running the channel-benchmark for typical production runs, i.e.
- // 1 MPI task with 1440 cells & 6 MPI tasks with 8800 cells.
- // The main idea is to have multiple warps per SM and sufficient blocks to fill the GPU.
- // In our case, given that multiple threads share the available GPUs, we "guarantee" a sufficient occupancy of the GPUs.
- int threadsPerBlock = 128;
- int blocksPerGrid = 512;
+ /// These parameters were selected after running the channel-benchmark for typical
+ /// production runs, i.e. 1 MPI task with 1440 cells & 6 MPI tasks with 8800 cells.
+ /// In the OpenACC/OpenMP implementations threadsPerBlock is set to 32; profiling the
+ /// channel-benchmark circuits mentioned above showed that the best performance is
+ /// achieved with this configuration.
+ int threadsPerBlock = warpsize;
+ /// The maximum blocksPerGrid for NVIDIA GPUs is 65535, so make sure that the
+ /// blocksPerGrid used to launch the CUDA kernel does not exceed this number.
+ const auto maxBlocksPerGrid = 65535;
+ int provisionalBlocksPerGrid = (ncore + threadsPerBlock - 1) / threadsPerBlock;
+ int blocksPerGrid = provisionalBlocksPerGrid <= maxBlocksPerGrid ? provisionalBlocksPerGrid : maxBlocksPerGrid;

solve_interleaved2_kernel<<<blocksPerGrid, threadsPerBlock, 0, cuda_stream>>>(nt, info, ncore);

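A minimal, self-contained CUDA sketch (an illustration, not the actual solve_interleaved2_kernel) of the kind of kernel loop such a clamp typically pairs with: once the grid is capped at 65535 blocks, a kernel that needs to touch every core iterates with a grid-wide stride instead of assuming one thread per core. Names such as visit_cores_kernel are made up for this example.

#include <cstdio>
#include <cuda_runtime.h>

__global__ void visit_cores_kernel(int* visited, int ncore) {
    // Grid-stride loop: every index below ncore is visited even if gridDim.x was clamped.
    for (int icore = blockIdx.x * blockDim.x + threadIdx.x; icore < ncore;
         icore += gridDim.x * blockDim.x) {
        visited[icore] = 1;
    }
}

int main() {
    const int warpsize = 32;          // one warp per block, matching the new launcher
    const int ncore = 4000000;        // deliberately larger than 65535 * 32 = 2097120
    const int threadsPerBlock = warpsize;
    const int maxBlocksPerGrid = 65535;
    int blocksPerGrid = (ncore + threadsPerBlock - 1) / threadsPerBlock;
    if (blocksPerGrid > maxBlocksPerGrid) {
        blocksPerGrid = maxBlocksPerGrid;  // same clamp as in the change above
    }

    int* visited = nullptr;
    cudaMalloc(&visited, ncore * sizeof(int));
    cudaMemset(visited, 0, ncore * sizeof(int));
    visit_cores_kernel<<<blocksPerGrid, threadsPerBlock>>>(visited, ncore);
    cudaDeviceSynchronize();
    cudaFree(visited);
    std::printf("launched %d blocks of %d threads for %d cores\n",
                blocksPerGrid, threadsPerBlock, ncore);
    return 0;
}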