# *****************************************************************************
# * CP2K: A general program to perform molecular dynamics simulations         *
# * Copyright (C) 2000 - 2018  CP2K developers group                          *
# *****************************************************************************

[
  Kernel_dnt_medium(m=4, n=4, k=4, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=9) , # 146.542 GFlop/s
  Kernel_dnt_medium(m=4, n=4, k=5, tile_m=1, tile_n=1, threads=32, grouping=18, minblocks=10) , # 177.041 GFlop/s
  Kernel_dnt_medium(m=4, n=4, k=6, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=16) , # 206.475 GFlop/s
  Kernel_dnt_medium(m=4, n=4, k=7, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=9) , # 219.254 GFlop/s
  Kernel_dnt_tiny(m=4, n=4, k=8, threads=32, grouping=12, minblocks=28) , # 224.843 GFlop/s
  Kernel_dnt_medium(m=4, n=4, k=9, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=2) , # 212.125 GFlop/s
  Kernel_dnt_medium(m=4, n=4, k=10, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=23) , # 230.568 GFlop/s
  Kernel_dnt_medium(m=4, n=4, k=13, tile_m=1, tile_n=1, threads=32, grouping=12, minblocks=26) , # 277.241 GFlop/s
  Kernel_dnt_medium(m=4, n=4, k=15, tile_m=1, tile_n=1, threads=32, grouping=12, minblocks=18) , # 306.278 GFlop/s
  Kernel_dnt_medium(m=4, n=4, k=25, tile_m=1, tile_n=1, threads=32, grouping=18, minblocks=17) , # 296.162 GFlop/s
  Kernel_dnt_medium(m=4, n=4, k=26, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=18) , # 298.901 GFlop/s
  Kernel_dnt_medium(m=4, n=4, k=28, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=11) , # 299.878 GFlop/s
  Kernel_dnt_small(m=4, n=4, k=32, tile_m=1, tile_n=1, threads=32, grouping=3, minblocks=14) , # 303.953 GFlop/s
  Kernel_dnt_medium(m=4, n=4, k=45, tile_m=1, tile_n=1, threads=64, grouping=3, minblocks=2) , # 294.358 GFlop/s
  Kernel_dnt_medium(m=4, n=5, k=4, tile_m=1, tile_n=1, threads=32, grouping=18, minblocks=11) , # 177.692 GFlop/s
  Kernel_dnt_medium(m=4, n=5, k=5, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=11) , # 217.191 GFlop/s
  Kernel_dnt_medium(m=4, n=5, k=6, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=22) , # 236.563 GFlop/s
  Kernel_dnt_medium(m=4, n=5, k=7, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=21) , # 211.342 GFlop/s
  Kernel_dnt_medium(m=4, n=5, k=8, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=1) , # 221.036 GFlop/s
  Kernel_dnt_medium(m=4, n=5, k=9, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=4) , # 229.753 GFlop/s
  Kernel_dnt_medium(m=4, n=5, k=13, tile_m=1, tile_n=1, threads=32, grouping=13, minblocks=9) , # 289.348 GFlop/s
  Kernel_dnt_medium(m=4, n=5, k=25, tile_m=1, tile_n=1, threads=32, grouping=3, minblocks=13) , # 321.429 GFlop/s
  Kernel_dnt_medium(m=4, n=5, k=26, tile_m=1, tile_n=1, threads=32, grouping=3, minblocks=10) , # 314.511 GFlop/s
  Kernel_dnt_medium(m=4, n=5, k=28, tile_m=1, tile_n=1, threads=32, grouping=4, minblocks=1) , # 320.416 GFlop/s
  Kernel_dnt_medium(m=4, n=5, k=32, tile_m=1, tile_n=1, threads=64, grouping=3, minblocks=21) , # 329.232 GFlop/s
  Kernel_dnt_medium(m=4, n=5, k=45, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=9) , # 321.273 GFlop/s
  Kernel_dnt_medium(m=4, n=6, k=4, tile_m=1, tile_n=1, threads=32, grouping=18, minblocks=15) , # 212.803 GFlop/s
  Kernel_dnt_medium(m=4, n=6, k=5, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=5) , # 249.115 GFlop/s
  Kernel_dnt_medium(m=4, n=6, k=6, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=7) , # 220.182 GFlop/s
  Kernel_dnt_medium(m=4, n=6, k=7, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=25) , # 249.712 GFlop/s
  Kernel_dnt_medium(m=4, n=6, k=8, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=11) , # 261.366 GFlop/s
  Kernel_dnt_medium(m=4, n=6, k=9, tile_m=1, tile_n=1, threads=32, grouping=13, minblocks=7) , # 270.512 GFlop/s
  Kernel_dnt_medium(m=4, n=7, k=4, tile_m=1, tile_n=1, threads=32, grouping=18, minblocks=3) , # 247.865 GFlop/s
  Kernel_dnt_medium(m=4, n=7, k=5, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=10) , # 221.348 GFlop/s
  Kernel_dnt_medium(m=4, n=7, k=6, tile_m=1, tile_n=1, threads=32, grouping=15, minblocks=21) , # 253.675 GFlop/s
  Kernel_dnt_medium(m=4, n=7, k=7, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=26) , # 283.879 GFlop/s
  Kernel_dnt_medium(m=4, n=7, k=8, tile_m=1, tile_n=1, threads=32, grouping=13, minblocks=27) , # 296.012 GFlop/s
  Kernel_dnt_medium(m=4, n=7, k=9, tile_m=1, tile_n=1, threads=32, grouping=13, minblocks=1) , # 308.449 GFlop/s
  Kernel_dnt_medium(m=4, n=7, k=13, tile_m=1, tile_n=1, threads=32, grouping=11, minblocks=20) , # 367.159 GFlop/s
  Kernel_dnt_medium(m=4, n=7, k=25, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=3) , # 359.977 GFlop/s
  Kernel_dnt_medium(m=4, n=7, k=26, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=18) , # 366.138 GFlop/s
  Kernel_dnt_medium(m=4, n=7, k=28, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=9) , # 366.881 GFlop/s
  Kernel_dnt_medium(m=4, n=7, k=32, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=17) , # 372.257 GFlop/s
  Kernel_dnt_medium(m=4, n=7, k=45, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=1) , # 364.346 GFlop/s
  Kernel_dnt_medium(m=4, n=8, k=4, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=27) , # 238.702 GFlop/s
  Kernel_dnt_medium(m=4, n=8, k=5, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=2) , # 250.59 GFlop/s
  Kernel_dnt_medium(m=4, n=8, k=6, tile_m=1, tile_n=1, threads=32, grouping=15, minblocks=19) , # 289.589 GFlop/s
  Kernel_dnt_medium(m=4, n=8, k=7, tile_m=1, tile_n=1, threads=32, grouping=13, minblocks=13) , # 321.553 GFlop/s
  Kernel_dnt_medium(m=4, n=8, k=8, tile_m=1, tile_n=1, threads=32, grouping=11, minblocks=25) , # 316.767 GFlop/s
  Kernel_dnt_medium(m=4, n=8, k=9, tile_m=1, tile_n=1, threads=32, grouping=13, minblocks=27) , # 338.79 GFlop/s
  Kernel_dnt_medium(m=4, n=9, k=4, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=2) , # 227.781 GFlop/s
  Kernel_dnt_medium(m=4, n=9, k=5, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=7) , # 266.956 GFlop/s
  Kernel_dnt_medium(m=4, n=9, k=6, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=7) , # 312.537 GFlop/s
  Kernel_dnt_medium(m=4, n=9, k=7, tile_m=2, tile_n=1, threads=32, grouping=12, minblocks=8) , # 337.427 GFlop/s
  Kernel_dnt_medium(m=4, n=9, k=8, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=19) , # 339.161 GFlop/s
  Kernel_dnt_medium(m=4, n=9, k=9, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=21) , # 345.165 GFlop/s
  Kernel_dnt_medium(m=4, n=9, k=13, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=5) , # 387.107 GFlop/s
  Kernel_dnt_medium(m=4, n=9, k=25, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=6) , # 386.259 GFlop/s
  Kernel_dnt_medium(m=4, n=9, k=26, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=17) , # 388.032 GFlop/s
  Kernel_dnt_medium(m=4, n=9, k=28, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=16) , # 393.694 GFlop/s
  Kernel_dnt_medium(m=4, n=9, k=32, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=8) , # 394.752 GFlop/s
  Kernel_dnt_medium(m=4, n=9, k=45, tile_m=1, tile_n=1, threads=128, grouping=4, minblocks=4) , # 391.427 GFlop/s
  Kernel_dnt_medium(m=4, n=10, k=4, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=2) , # 250.088 GFlop/s
  Kernel_dnt_medium(m=4, n=10, k=10, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=11) , # 391.438 GFlop/s
  Kernel_dnt_medium(m=4, n=10, k=15, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=12) , # 397.304 GFlop/s
  Kernel_dnt_medium(m=4, n=13, k=4, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=6) , # 311.366 GFlop/s
  Kernel_dnt_medium(m=4, n=13, k=5, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=10) , # 351.076 GFlop/s
  Kernel_dnt_medium(m=4, n=13, k=7, tile_m=1, tile_n=2, threads=32, grouping=12, minblocks=17) , # 428.77 GFlop/s
  Kernel_dnt_medium(m=4, n=13, k=9, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=19) , # 418.275 GFlop/s
  Kernel_dnt_medium(m=4, n=13, k=13, tile_m=1, tile_n=1, threads=64, grouping=6, minblocks=5) , # 420.747 GFlop/s
  Kernel_dnt_medium(m=4, n=13, k=25, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=13) , # 427.859 GFlop/s
  Kernel_dnt_medium(m=4, n=13, k=26, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=9) , # 434.909 GFlop/s
  Kernel_dnt_medium(m=4, n=13, k=28, tile_m=1, tile_n=1, threads=128, grouping=4, minblocks=2) , # 436.522 GFlop/s
  Kernel_dnt_medium(m=4, n=13, k=32, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=8) , # 438.394 GFlop/s
  Kernel_dnt_medium(m=4, n=13, k=45, tile_m=1, tile_n=1, threads=128, grouping=4, minblocks=2) , # 436.72 GFlop/s
  Kernel_dnt_medium(m=4, n=15, k=4, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=5) , # 354.445 GFlop/s
  Kernel_dnt_medium(m=4, n=15, k=10, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=14) , # 431.464 GFlop/s
  Kernel_dnt_medium(m=4, n=15, k=15, tile_m=1, tile_n=1, threads=64, grouping=26, minblocks=12) , # 442.867 GFlop/s
  Kernel_dnt_medium(m=4, n=25, k=4, tile_m=5, tile_n=1, threads=32, grouping=13, minblocks=7) , # 445.525 GFlop/s
  Kernel_dnt_medium(m=4, n=25, k=5, tile_m=6, tile_n=1, threads=32, grouping=13, minblocks=2) , # 455.615 GFlop/s
  Kernel_dnt_medium(m=4, n=25, k=7, tile_m=4, tile_n=1, threads=32, grouping=16, minblocks=8) , # 454.983 GFlop/s
  Kernel_dnt_medium(m=4, n=25, k=9, tile_m=2, tile_n=1, threads=64, grouping=24, minblocks=7) , # 458.769 GFlop/s
  Kernel_dnt_medium(m=4, n=25, k=13, tile_m=2, tile_n=1, threads=64, grouping=4, minblocks=13) , # 471.674 GFlop/s
  Kernel_dnt_medium(m=4, n=25, k=25, tile_m=2, tile_n=1, threads=128, grouping=3, minblocks=5) , # 481.216 GFlop/s
  Kernel_dnt_medium(m=4, n=25, k=26, tile_m=2, tile_n=1, threads=128, grouping=4, minblocks=6) , # 484.075 GFlop/s
  Kernel_dnt_medium(m=4, n=25, k=28, tile_m=1, tile_n=1, threads=128, grouping=3, minblocks=2) , # 486.834 GFlop/s
  Kernel_dnt_medium(m=4, n=25, k=32, tile_m=1, tile_n=2, threads=128, grouping=3, minblocks=2) , # 492.058 GFlop/s
  Kernel_dnt_medium(m=4, n=25, k=45, tile_m=1, tile_n=1, threads=192, grouping=3, minblocks=3) , # 490.296 GFlop/s
  Kernel_dnt_medium(m=4, n=26, k=4, tile_m=4, tile_n=1, threads=32, grouping=12, minblocks=2) , # 450.52 GFlop/s
  Kernel_dnt_medium(m=4, n=26, k=5, tile_m=4, tile_n=1, threads=32, grouping=13, minblocks=3) , # 452.479 GFlop/s
  Kernel_dnt_medium(m=4, n=26, k=7, tile_m=2, tile_n=2, threads=32, grouping=18, minblocks=4) , # 462.349 GFlop/s
  Kernel_dnt_medium(m=4, n=26, k=9, tile_m=2, tile_n=1, threads=64, grouping=26, minblocks=1) , # 458.638 GFlop/s
  Kernel_dnt_medium(m=4, n=26, k=13, tile_m=1, tile_n=2, threads=64, grouping=26, minblocks=7) , # 475.833 GFlop/s
  Kernel_dnt_medium(m=4, n=26, k=25, tile_m=1, tile_n=1, threads=128, grouping=3, minblocks=5) , # 485.815 GFlop/s
  Kernel_dnt_medium(m=4, n=26, k=26, tile_m=2, tile_n=1, threads=128, grouping=3, minblocks=6) , # 486.965 GFlop/s
  Kernel_dnt_medium(m=4, n=26, k=28, tile_m=1, tile_n=1, threads=128, grouping=3, minblocks=5) , # 488.784 GFlop/s
  Kernel_dnt_medium(m=4, n=26, k=32, tile_m=1, tile_n=1, threads=160, grouping=2, minblocks=3) , # 494.485 GFlop/s
  Kernel_dnt_medium(m=4, n=26, k=45, tile_m=1, tile_n=1, threads=192, grouping=3, minblocks=1) , # 494.086 GFlop/s
  Kernel_dnt_medium(m=4, n=28, k=4, tile_m=1, tile_n=5, threads=32, grouping=13, minblocks=6) , # 442.777 GFlop/s
  Kernel_dnt_medium(m=4, n=28, k=5, tile_m=6, tile_n=1, threads=32, grouping=13, minblocks=23) , # 450.581 GFlop/s
  Kernel_dnt_medium(m=4, n=28, k=7, tile_m=4, tile_n=1, threads=64, grouping=22, minblocks=8) , # 462.427 GFlop/s
  Kernel_dnt_medium(m=4, n=28, k=9, tile_m=2, tile_n=1, threads=64, grouping=26, minblocks=17) , # 474.408 GFlop/s
  Kernel_dnt_medium(m=4, n=28, k=13, tile_m=2, tile_n=1, threads=64, grouping=32, minblocks=7) , # 484.113 GFlop/s
  Kernel_dnt_medium(m=4, n=28, k=25, tile_m=1, tile_n=1, threads=128, grouping=5, minblocks=7) , # 487.596 GFlop/s
  Kernel_dnt_medium(m=4, n=28, k=26, tile_m=2, tile_n=1, threads=128, grouping=4, minblocks=3) , # 487.843 GFlop/s
  Kernel_dnt_medium(m=4, n=28, k=28, tile_m=1, tile_n=1, threads=128, grouping=4, minblocks=5) , # 489.847 GFlop/s
  Kernel_dnt_medium(m=4, n=28, k=32, tile_m=2, tile_n=1, threads=192, grouping=4, minblocks=3) , # 497.128 GFlop/s
  Kernel_dnt_medium(m=4, n=28, k=45, tile_m=1, tile_n=1, threads=192, grouping=3, minblocks=2) , # 502.934 GFlop/s
  Kernel_dnt_medium(m=4, n=32, k=4, tile_m=4, tile_n=1, threads=32, grouping=13, minblocks=2) , # 460.973 GFlop/s
  Kernel_dnt_medium(m=4, n=32, k=5, tile_m=5, tile_n=1, threads=32, grouping=17, minblocks=5) , # 463.913 GFlop/s
  Kernel_dnt_medium(m=4, n=32, k=7, tile_m=2, tile_n=1, threads=64, grouping=4, minblocks=21) , # 475.538 GFlop/s
  Kernel_dnt_medium(m=4, n=32, k=9, tile_m=1, tile_n=2, threads=64, grouping=4, minblocks=8) , # 481.514 GFlop/s
  Kernel_dnt_medium(m=4, n=32, k=13, tile_m=1, tile_n=2, threads=64, grouping=32, minblocks=5) , # 491.05 GFlop/s
  Kernel_dnt_medium(m=4, n=32, k=25, tile_m=1, tile_n=1, threads=128, grouping=5, minblocks=6) , # 495.767 GFlop/s
  Kernel_dnt_largeDB2(m=4, n=32, k=26, tile_m=1, tile_n=2, w=8, v=16, threads=64, grouping=16, minblocks=1) , # 500.984 GFlop/s
  Kernel_dnt_medium(m=4, n=32, k=28, tile_m=1, tile_n=1, threads=256, grouping=5, minblocks=6) , # 501.158 GFlop/s
  Kernel_dnt_medium(m=4, n=32, k=32, tile_m=1, tile_n=1, threads=256, grouping=4, minblocks=1) , # 508.453 GFlop/s
  Kernel_dnt_medium(m=4, n=32, k=45, tile_m=1, tile_n=1, threads=192, grouping=3, minblocks=3) , # 513.575 GFlop/s
  Kernel_dnt_medium(m=4, n=45, k=4, tile_m=1, tile_n=3, threads=64, grouping=19, minblocks=2) , # 451.602 GFlop/s
  Kernel_dnt_medium(m=4, n=45, k=5, tile_m=1, tile_n=6, threads=64, grouping=26, minblocks=12) , # 455.413 GFlop/s
  Kernel_dnt_medium(m=4, n=45, k=7, tile_m=4, tile_n=1, threads=64, grouping=26, minblocks=4) , # 473.951 GFlop/s
  Kernel_dnt_medium(m=4, n=45, k=9, tile_m=1, tile_n=2, threads=96, grouping=29, minblocks=12) , # 476.94 GFlop/s
  Kernel_dnt_medium(m=4, n=45, k=13, tile_m=2, tile_n=2, threads=64, grouping=26, minblocks=5) , # 489.857 GFlop/s
  Kernel_dnt_medium(m=4, n=45, k=25, tile_m=2, tile_n=1, threads=128, grouping=3, minblocks=2) , # 508.404 GFlop/s
  Kernel_dnt_medium(m=4, n=45, k=26, tile_m=1, tile_n=1, threads=192, grouping=4, minblocks=4) , # 512.756 GFlop/s
  Kernel_dnt_medium(m=4, n=45, k=28, tile_m=2, tile_n=1, threads=128, grouping=3, minblocks=4) , # 516.097 GFlop/s
  Kernel_dnt_medium(m=4, n=45, k=32, tile_m=1, tile_n=2, threads=192, grouping=3, minblocks=2) , # 523.467 GFlop/s
  Kernel_dnt_medium(m=4, n=45, k=45, tile_m=1, tile_n=1, threads=256, grouping=3, minblocks=1) , # 529.993 GFlop/s
  Kernel_dnt_medium(m=5, n=4, k=4, tile_m=1, tile_n=1, threads=32, grouping=18, minblocks=15) , # 177.029 GFlop/s
  Kernel_dnt_medium(m=5, n=4, k=5, tile_m=1, tile_n=1, threads=32, grouping=18, minblocks=3) , # 214.145 GFlop/s
  Kernel_dnt_medium(m=5, n=4, k=6, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=1) , # 233.353 GFlop/s
  Kernel_dnt_medium(m=5, n=4, k=7, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=24) , # 211.838 GFlop/s
  Kernel_dnt_medium(m=5, n=4, k=8, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=4) , # 220.845 GFlop/s
  Kernel_dnt_medium(m=5, n=4, k=9, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=14) , # 228.292 GFlop/s
  Kernel_dnt_medium(m=5, n=4, k=13, tile_m=1, tile_n=1, threads=32, grouping=13, minblocks=20) , # 276.437 GFlop/s
  Kernel_dnt_medium(m=5, n=4, k=25, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=8) , # 312.504 GFlop/s
  Kernel_dnt_medium(m=5, n=4, k=26, tile_m=1, tile_n=1, threads=32, grouping=9, minblocks=17) , # 310.699 GFlop/s
  Kernel_dnt_medium(m=5, n=4, k=28, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=23) , # 314.448 GFlop/s
  Kernel_dnt_medium(m=5, n=4, k=32, tile_m=1, tile_n=1, threads=64, grouping=3, minblocks=21) , # 327.177 GFlop/s
  Kernel_dnt_medium(m=5, n=4, k=45, tile_m=1, tile_n=1, threads=64, grouping=3, minblocks=5) , # 314.251 GFlop/s
  Kernel_dnt_medium(m=5, n=5, k=4, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=1) , # 225.089 GFlop/s
  Kernel_dnt_medium(m=5, n=5, k=5, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=15) , # 260.59 GFlop/s
  Kernel_dnt_medium(m=5, n=5, k=6, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=10) , # 285.889 GFlop/s
  Kernel_dnt_medium(m=5, n=5, k=7, tile_m=1, tile_n=1, threads=32, grouping=13, minblocks=17) , # 262.967 GFlop/s
  Kernel_dnt_medium(m=5, n=5, k=8, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=14) , # 288.785 GFlop/s
  Kernel_dnt_medium(m=5, n=5, k=9, tile_m=1, tile_n=1, threads=32, grouping=13, minblocks=17) , # 311.74 GFlop/s
  Kernel_dnt_medium(m=5, n=5, k=12, tile_m=1, tile_n=1, threads=32, grouping=12, minblocks=3) , # 375.736 GFlop/s
  Kernel_dnt_medium(m=5, n=5, k=13, tile_m=1, tile_n=1, threads=32, grouping=13, minblocks=17) , # 363.798 GFlop/s
  Kernel_dnt_medium(m=5, n=5, k=16, tile_m=1, tile_n=1, threads=32, grouping=13, minblocks=13) , # 384.724 GFlop/s
  Kernel_dnt_medium(m=5, n=5, k=24, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=22) , # 372.44 GFlop/s
  Kernel_dnt_medium(m=5, n=5, k=25, tile_m=1, tile_n=1, threads=64, grouping=3, minblocks=23) , # 364.992 GFlop/s
  Kernel_dnt_medium(m=5, n=5, k=26, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=4) , # 358.249 GFlop/s
  Kernel_dnt_medium(m=5, n=5, k=28, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=12) , # 359.923 GFlop/s
  Kernel_dnt_medium(m=5, n=5, k=32, tile_m=1, tile_n=1, threads=32, grouping=2, minblocks=17) , # 370.681 GFlop/s
  Kernel_dnt_medium(m=5, n=5, k=45, tile_m=1, tile_n=1, threads=64, grouping=3, minblocks=12) , # 356.319 GFlop/s
  Kernel_dnt_medium(m=5, n=6, k=4, tile_m=1, tile_n=1, threads=32, grouping=18, minblocks=23) , # 264.288 GFlop/s
  Kernel_dnt_medium(m=5, n=6, k=5, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=6) , # 293.115 GFlop/s
  Kernel_dnt_medium(m=5, n=6, k=6, tile_m=1, tile_n=1, threads=32, grouping=15, minblocks=7) , # 274.264 GFlop/s
  Kernel_dnt_medium(m=5, n=6, k=7, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=20) , # 273.457 GFlop/s
  Kernel_dnt_medium(m=5, n=6, k=8, tile_m=1, tile_n=1, threads=32, grouping=13, minblocks=7) , # 305.918 GFlop/s
  Kernel_dnt_medium(m=5, n=6, k=9, tile_m=1, tile_n=1, threads=32, grouping=13, minblocks=16) , # 328.197 GFlop/s
  Kernel_dnt_medium(m=5, n=7, k=4, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=20) , # 262.885 GFlop/s
  Kernel_dnt_medium(m=5, n=7, k=5, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=2) , # 256.737 GFlop/s
  Kernel_dnt_medium(m=5, n=7, k=6, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=20) , # 291.538 GFlop/s
  Kernel_dnt_medium(m=5, n=7, k=7, tile_m=1, tile_n=2, threads=32, grouping=16, minblocks=10) , # 295.189 GFlop/s
  Kernel_dnt_medium(m=5, n=7, k=8, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=25) , # 327.718 GFlop/s
  Kernel_dnt_medium(m=5, n=7, k=9, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=16) , # 349.89 GFlop/s
  Kernel_dnt_medium(m=5, n=7, k=13, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=18) , # 396.99 GFlop/s
  Kernel_dnt_medium(m=5, n=7, k=25, tile_m=1, tile_n=1, threads=64, grouping=26, minblocks=10) , # 395.03 GFlop/s
  Kernel_dnt_medium(m=5, n=7, k=26, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=3) , # 400.328 GFlop/s
  Kernel_dnt_medium(m=5, n=7, k=28, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=4) , # 400.111 GFlop/s
  Kernel_dnt_medium(m=5, n=7, k=32, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=4) , # 408.48 GFlop/s
  Kernel_dnt_medium(m=5, n=7, k=45, tile_m=1, tile_n=1, threads=96, grouping=3, minblocks=7) , # 402.851 GFlop/s
  Kernel_dnt_medium(m=5, n=8, k=4, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=9) , # 260.853 GFlop/s
  Kernel_dnt_medium(m=5, n=8, k=5, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=20) , # 290.836 GFlop/s
  Kernel_dnt_medium(m=5, n=8, k=6, tile_m=1, tile_n=2, threads=32, grouping=16, minblocks=3) , # 332.611 GFlop/s
  Kernel_dnt_medium(m=5, n=8, k=7, tile_m=1, tile_n=2, threads=32, grouping=16, minblocks=14) , # 332.852 GFlop/s
  Kernel_dnt_medium(m=5, n=8, k=8, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=14) , # 364.813 GFlop/s
  Kernel_dnt_medium(m=5, n=8, k=9, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=14) , # 383.158 GFlop/s
  Kernel_dnt_medium(m=5, n=9, k=4, tile_m=1, tile_n=2, threads=32, grouping=16, minblocks=16) , # 273.232 GFlop/s
  Kernel_dnt_medium(m=5, n=9, k=5, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=7) , # 321.78 GFlop/s
  Kernel_dnt_medium(m=5, n=9, k=6, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=5) , # 370.575 GFlop/s
  Kernel_dnt_medium(m=5, n=9, k=7, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=21) , # 372.988 GFlop/s
  Kernel_dnt_medium(m=5, n=9, k=8, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=2) , # 397.604 GFlop/s
  Kernel_dnt_medium(m=5, n=9, k=9, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=24) , # 420.547 GFlop/s
  Kernel_dnt_medium(m=5, n=9, k=13, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=26) , # 428.436 GFlop/s
  Kernel_dnt_medium(m=5, n=9, k=25, tile_m=1, tile_n=1, threads=64, grouping=26, minblocks=7) , # 437.524 GFlop/s
  Kernel_dnt_medium(m=5, n=9, k=26, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=12) , # 442.884 GFlop/s
  Kernel_dnt_medium(m=5, n=9, k=28, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=11) , # 446.122 GFlop/s
  Kernel_dnt_medium(m=5, n=9, k=32, tile_m=1, tile_n=1, threads=96, grouping=4, minblocks=2) , # 451.393 GFlop/s
  Kernel_dnt_medium(m=5, n=9, k=45, tile_m=1, tile_n=1, threads=128, grouping=4, minblocks=4) , # 448.595 GFlop/s
  Kernel_dnt_medium(m=5, n=12, k=5, tile_m=1, tile_n=2, threads=32, grouping=12, minblocks=12) , # 403.762 GFlop/s
  Kernel_dnt_medium(m=5, n=12, k=12, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=2) , # 482.763 GFlop/s
  Kernel_dnt_medium(m=5, n=12, k=13, tile_m=1, tile_n=2, threads=32, grouping=4, minblocks=9) , # 472.41 GFlop/s
  Kernel_dnt_medium(m=5, n=12, k=26, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=5) , # 493.318 GFlop/s
  Kernel_dnt_medium(m=5, n=12, k=32, tile_m=1, tile_n=1, threads=96, grouping=3, minblocks=2) , # 499.203 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=4, tile_m=3, tile_n=1, threads=32, grouping=13, minblocks=14) , # 371.523 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=5, tile_m=3, tile_n=1, threads=32, grouping=12, minblocks=3) , # 420.528 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=7, tile_m=3, tile_n=1, threads=32, grouping=13, minblocks=15) , # 463.048 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=9, tile_m=3, tile_n=1, threads=32, grouping=16, minblocks=15) , # 476.293 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=12, tile_m=1, tile_n=3, threads=64, grouping=22, minblocks=17) , # 487.729 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=13, tile_m=1, tile_n=3, threads=32, grouping=5, minblocks=1) , # 469.163 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=16, tile_m=1, tile_n=3, threads=96, grouping=29, minblocks=8) , # 486.36 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=24, tile_m=3, tile_n=1, threads=64, grouping=3, minblocks=4) , # 491.279 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=25, tile_m=1, tile_n=1, threads=128, grouping=5, minblocks=12) , # 485.413 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=26, tile_m=3, tile_n=1, threads=64, grouping=4, minblocks=5) , # 487.003 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=28, tile_m=1, tile_n=1, threads=96, grouping=4, minblocks=8) , # 491.337 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=32, tile_m=1, tile_n=1, threads=128, grouping=5, minblocks=2) , # 504.563 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=45, tile_m=1, tile_n=1, threads=128, grouping=3, minblocks=3) , # 499.154 GFlop/s
  Kernel_dnt_medium(m=5, n=16, k=5, tile_m=3, tile_n=1, threads=32, grouping=11, minblocks=16) , # 500.621 GFlop/s
  Kernel_dnt_medium(m=5, n=16, k=13, tile_m=1, tile_n=3, threads=64, grouping=4, minblocks=20) , # 504.808 GFlop/s
  Kernel_dnt_medium(m=5, n=16, k=16, tile_m=1, tile_n=1, threads=96, grouping=6, minblocks=14) , # 518.964 GFlop/s
  Kernel_dnt_medium(m=5, n=16, k=32, tile_m=1, tile_n=1, threads=128, grouping=4, minblocks=1) , # 532.529 GFlop/s
  Kernel_dnt_medium(m=5, n=24, k=5, tile_m=1, tile_n=4, threads=32, grouping=13, minblocks=19) , # 529.873 GFlop/s
  Kernel_dnt_medium(m=5, n=24, k=13, tile_m=1, tile_n=2, threads=96, grouping=32, minblocks=12) , # 547.828 GFlop/s
  Kernel_dnt_largeDB2(m=5, n=24, k=24, tile_m=1, tile_n=2, w=8, v=24, threads=64, grouping=16, minblocks=4) , # 573.313 GFlop/s
  Kernel_dnt_medium(m=5, n=24, k=26, tile_m=1, tile_n=2, threads=160, grouping=3, minblocks=8) , # 570.406 GFlop/s
  Kernel_dnt_medium(m=5, n=24, k=32, tile_m=1, tile_n=1, threads=192, grouping=2, minblocks=5) , # 579.512 GFlop/s
  Kernel_dnt_medium(m=5, n=25, k=4, tile_m=5, tile_n=1, threads=32, grouping=13, minblocks=1) , # 498.541 GFlop/s
  Kernel_dnt_medium(m=5, n=25, k=5, tile_m=6, tile_n=1, threads=32, grouping=15, minblocks=8) , # 512.665 GFlop/s
  Kernel_dnt_medium(m=5, n=25, k=7, tile_m=4, tile_n=1, threads=64, grouping=22, minblocks=3) , # 511.578 GFlop/s
  Kernel_dnt_medium(m=5, n=25, k=9, tile_m=3, tile_n=1, threads=64, grouping=26, minblocks=10) , # 542.861 GFlop/s
  Kernel_dnt_medium(m=5, n=25, k=13, tile_m=1, tile_n=1, threads=128, grouping=6, minblocks=12) , # 540.652 GFlop/s
  Kernel_dnt_medium(m=5, n=25, k=25, tile_m=1, tile_n=1, threads=128, grouping=3, minblocks=5) , # 567.148 GFlop/s
  Kernel_dnt_medium(m=5, n=25, k=26, tile_m=1, tile_n=1, threads=160, grouping=5, minblocks=1) , # 571.256 GFlop/s
  Kernel_dnt_medium(m=5, n=25, k=28, tile_m=1, tile_n=1, threads=192, grouping=4, minblocks=7) , # 571.421 GFlop/s
  Kernel_dnt_medium(m=5, n=25, k=32, tile_m=1, tile_n=1, threads=224, grouping=3, minblocks=6) , # 584.779 GFlop/s
  Kernel_dnt_medium(m=5, n=25, k=45, tile_m=1, tile_n=1, threads=256, grouping=3, minblocks=2) , # 583.507 GFlop/s
  Kernel_dnt_medium(m=5, n=26, k=4, tile_m=5, tile_n=1, threads=32, grouping=13, minblocks=11) , # 507.889 GFlop/s
  Kernel_dnt_medium(m=5, n=26, k=5, tile_m=5, tile_n=1, threads=32, grouping=16, minblocks=1) , # 520.031 GFlop/s
  Kernel_dnt_medium(m=5, n=26, k=7, tile_m=3, tile_n=1, threads=64, grouping=22, minblocks=15) , # 521.285 GFlop/s
  Kernel_dnt_medium(m=5, n=26, k=9, tile_m=3, tile_n=1, threads=64, grouping=26, minblocks=9) , # 549.571 GFlop/s
  Kernel_dnt_medium(m=5, n=26, k=12, tile_m=3, tile_n=1, threads=64, grouping=32, minblocks=2) , # 554.52 GFlop/s
  Kernel_dnt_medium(m=5, n=26, k=13, tile_m=1, tile_n=3, threads=96, grouping=32, minblocks=12) , # 549.535 GFlop/s
  Kernel_dnt_medium(m=5, n=26, k=24, tile_m=1, tile_n=2, threads=128, grouping=4, minblocks=6) , # 571.163 GFlop/s
  Kernel_dnt_medium(m=5, n=26, k=25, tile_m=1, tile_n=2, threads=128, grouping=4, minblocks=7) , # 568.459 GFlop/s
  Kernel_dnt_medium(m=5, n=26, k=26, tile_m=1, tile_n=2, threads=160, grouping=4, minblocks=2) , # 572.317 GFlop/s
  Kernel_dnt_medium(m=5, n=26, k=28, tile_m=1, tile_n=2, threads=160, grouping=5, minblocks=7) , # 573.92 GFlop/s
  Kernel_dnt_medium(m=5, n=26, k=32, tile_m=1, tile_n=2, threads=224, grouping=3, minblocks=6) , # 584.452 GFlop/s
  Kernel_dnt_medium(m=5, n=26, k=45, tile_m=1, tile_n=1, threads=256, grouping=3, minblocks=4) , # 587.258 GFlop/s
  Kernel_dnt_medium(m=5, n=28, k=4, tile_m=1, tile_n=5, threads=32, grouping=16, minblocks=21) , # 507.593 GFlop/s
  Kernel_dnt_medium(m=5, n=28, k=5, tile_m=5, tile_n=1, threads=32, grouping=19, minblocks=10) , # 526.317 GFlop/s
  Kernel_dnt_medium(m=5, n=28, k=7, tile_m=3, tile_n=1, threads=64, grouping=26, minblocks=13) , # 538.63 GFlop/s
  Kernel_dnt_medium(m=5, n=28, k=9, tile_m=3, tile_n=1, threads=64, grouping=26, minblocks=2) , # 561.006 GFlop/s
  Kernel_dnt_medium(m=5, n=28, k=13, tile_m=1, tile_n=3, threads=96, grouping=32, minblocks=11) , # 561.153 GFlop/s
  Kernel_dnt_medium(m=5, n=28, k=25, tile_m=3, tile_n=1, threads=128, grouping=4, minblocks=6) , # 577.262 GFlop/s
  Kernel_dnt_medium(m=5, n=28, k=26, tile_m=1, tile_n=1, threads=160, grouping=4, minblocks=5) , # 578.939 GFlop/s
  Kernel_dnt_medium(m=5, n=28, k=28, tile_m=2, tile_n=1, threads=160, grouping=4, minblocks=5) , # 581.324 GFlop/s
  Kernel_dnt_medium(m=5, n=28, k=32, tile_m=3, tile_n=1, threads=128, grouping=3, minblocks=2) , # 589.399 GFlop/s
  Kernel_dnt_medium(m=5, n=28, k=45, tile_m=1, tile_n=1, threads=256, grouping=3, minblocks=3) , # 595.274 GFlop/s
  Kernel_dnt_medium(m=5, n=32, k=4, tile_m=6, tile_n=1, threads=32, grouping=16, minblocks=20) , # 523.602 GFlop/s
  Kernel_dnt_medium(m=5, n=32, k=5, tile_m=6, tile_n=1, threads=32, grouping=16, minblocks=15) , # 535.714 GFlop/s
  Kernel_dnt_medium(m=5, n=32, k=7, tile_m=3, tile_n=1, threads=64, grouping=26, minblocks=4) , # 554.629 GFlop/s
  Kernel_dnt_medium(m=5, n=32, k=9, tile_m=3, tile_n=1, threads=64, grouping=30, minblocks=1) , # 565.992 GFlop/s
  Kernel_dnt_medium(m=5, n=32, k=12, tile_m=2, tile_n=1, threads=96, grouping=32, minblocks=10) , # 574.485 GFlop/s
  Kernel_dnt_medium(m=5, n=32, k=13, tile_m=1, tile_n=3, threads=96, grouping=32, minblocks=10) , # 572.183 GFlop/s
  Kernel_dnt_medium(m=5, n=32, k=16, tile_m=2, tile_n=1, threads=128, grouping=5, minblocks=4) , # 583.958 GFlop/s
  Kernel_dnt_medium(m=5, n=32, k=24, tile_m=2, tile_n=1, threads=160, grouping=4, minblocks=4) , # 593.51 GFlop/s
  Kernel_dnt_medium(m=5, n=32, k=25, tile_m=3, tile_n=1, threads=128, grouping=5, minblocks=1) , # 593.335 GFlop/s
  Kernel_dnt_medium(m=5, n=32, k=26, tile_m=1, tile_n=1, threads=256, grouping=4, minblocks=6) , # 593.116 GFlop/s
  Kernel_dnt_medium(m=5, n=32, k=28, tile_m=3, tile_n=1, threads=128, grouping=4, minblocks=3) , # 596.857 GFlop/s
  Kernel_dnt_medium(m=5, n=32, k=32, tile_m=1, tile_n=1, threads=256, grouping=4, minblocks=5) , # 610.053 GFlop/s
  Kernel_dnt_medium(m=5, n=32, k=45, tile_m=1, tile_n=1, threads=256, grouping=3, minblocks=2) , # 612.697 GFlop/s
  Kernel_dnt_medium(m=5, n=45, k=4, tile_m=5, tile_n=1, threads=64, grouping=22, minblocks=15) , # 510.03 GFlop/s
  Kernel_dnt_medium(m=5, n=45, k=5, tile_m=1, tile_n=4, threads=64, grouping=26, minblocks=15) , # 528.041 GFlop/s
  Kernel_dnt_medium(m=5, n=45, k=7, tile_m=1, tile_n=4, threads=64, grouping=26, minblocks=4) , # 553.915 GFlop/s
  Kernel_dnt_medium(m=5, n=45, k=9, tile_m=3, tile_n=1, threads=96, grouping=29, minblocks=3) , # 564.82 GFlop/s
  Kernel_dnt_medium(m=5, n=45, k=13, tile_m=1, tile_n=2, threads=128, grouping=5, minblocks=8) , # 579.618 GFlop/s
  Kernel_dnt_medium(m=5, n=45, k=25, tile_m=3, tile_n=1, threads=128, grouping=4, minblocks=2) , # 612.003 GFlop/s
  Kernel_dnt_medium(m=5, n=45, k=26, tile_m=1, tile_n=2, threads=192, grouping=3, minblocks=4) , # 612.137 GFlop/s
  Kernel_dnt_medium(m=5, n=45, k=28, tile_m=3, tile_n=1, threads=160, grouping=3, minblocks=1) , # 623.505 GFlop/s
  Kernel_dnt_medium(m=5, n=45, k=32, tile_m=3, tile_n=1, threads=192, grouping=4, minblocks=2) , # 630.854 GFlop/s
  Kernel_dnt_medium(m=5, n=45, k=45, tile_m=1, tile_n=1, threads=256, grouping=3, minblocks=1) , # 635.739 GFlop/s
  Kernel_dnt_medium(m=6, n=4, k=4, tile_m=1, tile_n=1, threads=32, grouping=18, minblocks=3) , # 212.84 GFlop/s
  Kernel_dnt_medium(m=6, n=4, k=5, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=25) , # 250.003 GFlop/s
  Kernel_dnt_medium(m=6, n=4, k=6, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=24) , # 227.238 GFlop/s
  Kernel_dnt_medium(m=6, n=4, k=7, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=5) , # 253.726 GFlop/s
  Kernel_dnt_medium(m=6, n=4, k=8, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=13) , # 260.467 GFlop/s
  Kernel_dnt_medium(m=6, n=4, k=9, tile_m=1, tile_n=1, threads=32, grouping=13, minblocks=12) , # 268.788 GFlop/s
  Kernel_dnt_medium(m=6, n=5, k=4, tile_m=1, tile_n=1, threads=32, grouping=18, minblocks=25) , # 265.113 GFlop/s
  Kernel_dnt_medium(m=6, n=5, k=5, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=4) , # 292.379 GFlop/s
  Kernel_dnt_medium(m=6, n=5, k=6, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=20) , # 278.719 GFlop/s
  Kernel_dnt_medium(m=6, n=5, k=7, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=13) , # 273.867 GFlop/s
  Kernel_dnt_medium(m=6, n=5, k=8, tile_m=1, tile_n=1, threads=32, grouping=13, minblocks=13) , # 306.256 GFlop/s
  Kernel_dnt_medium(m=6, n=5, k=9, tile_m=1, tile_n=1, threads=32, grouping=12, minblocks=2) , # 330.458 GFlop/s
  Kernel_dnt_medium(m=6, n=6, k=4, tile_m=1, tile_n=2, threads=32, grouping=16, minblocks=20) , # 279.038 GFlop/s
  Kernel_dnt_medium(m=6, n=6, k=5, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=13) , # 324.604 GFlop/s
  Kernel_dnt_medium(m=6, n=6, k=6, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=26) , # 309.661 GFlop/s
  Kernel_dnt_medium(m=6, n=6, k=7, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=11) , # 347.979 GFlop/s
  Kernel_dnt_medium(m=6, n=6, k=8, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=4) , # 388.615 GFlop/s
  Kernel_dnt_medium(m=6, n=6, k=9, tile_m=1, tile_n=2, threads=32, grouping=11, minblocks=15) , # 414.229 GFlop/s
  Kernel_dnt_medium(m=6, n=7, k=4, tile_m=1, tile_n=2, threads=32, grouping=16, minblocks=22) , # 310.207 GFlop/s
  Kernel_dnt_medium(m=6, n=7, k=5, tile_m=1, tile_n=2, threads=32, grouping=16, minblocks=2) , # 307.915 GFlop/s
  Kernel_dnt_medium(m=6, n=7, k=6, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=27) , # 311.923 GFlop/s
  Kernel_dnt_medium(m=6, n=7, k=7, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=18) , # 348.666 GFlop/s
  Kernel_dnt_medium(m=6, n=7, k=8, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=3) , # 391.768 GFlop/s
  Kernel_dnt_medium(m=6, n=7, k=9, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=16) , # 421.422 GFlop/s
  Kernel_dnt_medium(m=6, n=8, k=4, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=13) , # 306.933 GFlop/s
  Kernel_dnt_medium(m=6, n=8, k=5, tile_m=1, tile_n=2, threads=32, grouping=16, minblocks=27) , # 346.794 GFlop/s
  Kernel_dnt_medium(m=6, n=8, k=6, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=2) , # 356.722 GFlop/s
  Kernel_dnt_medium(m=6, n=8, k=7, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=1) , # 397.923 GFlop/s
  Kernel_dnt_medium(m=6, n=8, k=8, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=2) , # 434.94 GFlop/s
  Kernel_dnt_medium(m=6, n=8, k=9, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=19) , # 449.288 GFlop/s
  Kernel_dnt_medium(m=6, n=9, k=4, tile_m=1, tile_n=2, threads=32, grouping=16, minblocks=26) , # 326.343 GFlop/s
  Kernel_dnt_medium(m=6, n=9, k=5, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=17) , # 387.121 GFlop/s
  Kernel_dnt_medium(m=6, n=9, k=6, tile_m=1, tile_n=2, threads=32, grouping=12, minblocks=13) , # 390.83 GFlop/s
  Kernel_dnt_medium(m=6, n=9, k=7, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=4) , # 440.029 GFlop/s
  Kernel_dnt_medium(m=6, n=9, k=8, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=27) , # 468.53 GFlop/s
  Kernel_dnt_medium(m=6, n=9, k=9, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=13) , # 476.022 GFlop/s
  Kernel_dnt_medium(m=7, n=4, k=4, tile_m=1, tile_n=1, threads=32, grouping=18, minblocks=9) , # 247.91 GFlop/s
  Kernel_dnt_medium(m=7, n=4, k=5, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=11) , # 222.094 GFlop/s
  Kernel_dnt_medium(m=7, n=4, k=6, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=3) , # 256.908 GFlop/s
  Kernel_dnt_medium(m=7, n=4, k=7, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=20) , # 286.148 GFlop/s
  Kernel_dnt_medium(m=7, n=4, k=8, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=15) , # 296.218 GFlop/s
  Kernel_dnt_medium(m=7, n=4, k=9, tile_m=1, tile_n=1, threads=32, grouping=13, minblocks=8) , # 306.346 GFlop/s
  Kernel_dnt_medium(m=7, n=4, k=13, tile_m=1, tile_n=1, threads=32, grouping=12, minblocks=25) , # 361.319 GFlop/s
  Kernel_dnt_medium(m=7, n=4, k=25, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=21) , # 356.58 GFlop/s
  Kernel_dnt_medium(m=7, n=4, k=26, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=21) , # 362.674 GFlop/s
  Kernel_dnt_medium(m=7, n=4, k=28, tile_m=1, tile_n=1, threads=64, grouping=6, minblocks=5) , # 363.829 GFlop/s
  Kernel_dnt_medium(m=7, n=4, k=32, tile_m=1, tile_n=1, threads=64, grouping=3, minblocks=8) , # 371.107 GFlop/s
  Kernel_dnt_medium(m=7, n=4, k=45, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=9) , # 363.482 GFlop/s
  Kernel_dnt_medium(m=7, n=5, k=4, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=19) , # 270.919 GFlop/s
  Kernel_dnt_medium(m=7, n=5, k=5, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=27) , # 263.9 GFlop/s
  Kernel_dnt_medium(m=7, n=5, k=6, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=23) , # 304.548 GFlop/s
  Kernel_dnt_medium(m=7, n=5, k=7, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=20) , # 302.41 GFlop/s
  Kernel_dnt_medium(m=7, n=5, k=8, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=14) , # 332.161 GFlop/s
  Kernel_dnt_medium(m=7, n=5, k=9, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=25) , # 349.89 GFlop/s
  Kernel_dnt_medium(m=7, n=5, k=13, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=21) , # 397.427 GFlop/s
  Kernel_dnt_medium(m=7, n=5, k=25, tile_m=2, tile_n=1, threads=64, grouping=4, minblocks=20) , # 400.125 GFlop/s
  Kernel_dnt_medium(m=7, n=5, k=26, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=12) , # 397.999 GFlop/s
  Kernel_dnt_medium(m=7, n=5, k=28, tile_m=2, tile_n=1, threads=64, grouping=4, minblocks=7) , # 402.797 GFlop/s
  Kernel_dnt_medium(m=7, n=5, k=32, tile_m=1, tile_n=1, threads=64, grouping=3, minblocks=11) , # 409.525 GFlop/s
  Kernel_dnt_medium(m=7, n=5, k=45, tile_m=1, tile_n=1, threads=96, grouping=3, minblocks=5) , # 401.97 GFlop/s
  Kernel_dnt_medium(m=7, n=6, k=4, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=24) , # 313.286 GFlop/s
  Kernel_dnt_medium(m=7, n=6, k=5, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=19) , # 312.621 GFlop/s
  Kernel_dnt_medium(m=7, n=6, k=6, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=3) , # 321.111 GFlop/s
  Kernel_dnt_medium(m=7, n=6, k=7, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=12) , # 350.795 GFlop/s
  Kernel_dnt_medium(m=7, n=6, k=8, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=13) , # 385.441 GFlop/s
  Kernel_dnt_medium(m=7, n=6, k=9, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=15) , # 418.272 GFlop/s
  Kernel_dnt_medium(m=7, n=7, k=4, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=1) , # 364.566 GFlop/s
  Kernel_dnt_medium(m=7, n=7, k=5, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=11) , # 358.669 GFlop/s
  Kernel_dnt_medium(m=7, n=7, k=6, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=3) , # 406.525 GFlop/s
  Kernel_dnt_medium(m=7, n=7, k=7, tile_m=2, tile_n=1, threads=32, grouping=11, minblocks=27) , # 449.927 GFlop/s
  Kernel_dnt_medium(m=7, n=7, k=8, tile_m=2, tile_n=1, threads=32, grouping=11, minblocks=12) , # 495.144 GFlop/s
  Kernel_dnt_medium(m=7, n=7, k=9, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=5) , # 495.72 GFlop/s
  Kernel_dnt_medium(m=7, n=7, k=13, tile_m=2, tile_n=1, threads=32, grouping=18, minblocks=27) , # 488.334 GFlop/s
  Kernel_dnt_medium(m=7, n=7, k=25, tile_m=2, tile_n=1, threads=64, grouping=4, minblocks=10) , # 489.312 GFlop/s
  Kernel_dnt_medium(m=7, n=7, k=26, tile_m=2, tile_n=1, threads=64, grouping=4, minblocks=1) , # 490.132 GFlop/s
  Kernel_dnt_medium(m=7, n=7, k=28, tile_m=2, tile_n=1, threads=64, grouping=4, minblocks=9) , # 490.192 GFlop/s
  Kernel_dnt_medium(m=7, n=7, k=32, tile_m=1, tile_n=1, threads=128, grouping=4, minblocks=11) , # 499.565 GFlop/s
  Kernel_dnt_medium(m=7, n=7, k=45, tile_m=1, tile_n=1, threads=160, grouping=3, minblocks=7) , # 491.99 GFlop/s
  Kernel_dnt_medium(m=7, n=8, k=4, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=12) , # 354.633 GFlop/s
  Kernel_dnt_medium(m=7, n=8, k=5, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=9) , # 359.16 GFlop/s
  Kernel_dnt_medium(m=7, n=8, k=6, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=19) , # 411.24 GFlop/s
  Kernel_dnt_medium(m=7, n=8, k=7, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=18) , # 455.7 GFlop/s
  Kernel_dnt_medium(m=7, n=8, k=8, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=23) , # 500.053 GFlop/s
  Kernel_dnt_medium(m=7, n=8, k=9, tile_m=1, tile_n=2, threads=32, grouping=11, minblocks=3) , # 501.545 GFlop/s
  Kernel_dnt_medium(m=7, n=9, k=4, tile_m=3, tile_n=1, threads=32, grouping=16, minblocks=25) , # 357.248 GFlop/s
  Kernel_dnt_medium(m=7, n=9, k=5, tile_m=3, tile_n=1, threads=32, grouping=16, minblocks=5) , # 380.106 GFlop/s
  Kernel_dnt_medium(m=7, n=9, k=6, tile_m=1, tile_n=3, threads=32, grouping=13, minblocks=1) , # 421.294 GFlop/s
  Kernel_dnt_medium(m=7, n=9, k=7, tile_m=3, tile_n=1, threads=32, grouping=13, minblocks=26) , # 474.232 GFlop/s
  Kernel_dnt_medium(m=7, n=9, k=8, tile_m=3, tile_n=1, threads=32, grouping=13, minblocks=25) , # 498.036 GFlop/s
  Kernel_dnt_medium(m=7, n=9, k=9, tile_m=1, tile_n=3, threads=32, grouping=13, minblocks=19) , # 512.04 GFlop/s
  Kernel_dnt_medium(m=7, n=9, k=13, tile_m=1, tile_n=1, threads=64, grouping=5, minblocks=22) , # 509.15 GFlop/s
  Kernel_dnt_medium(m=7, n=9, k=25, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=9) , # 538.393 GFlop/s
  Kernel_dnt_medium(m=7, n=9, k=26, tile_m=1, tile_n=1, threads=64, grouping=32, minblocks=2) , # 539.44 GFlop/s
  Kernel_dnt_largeDB2(m=7, n=9, k=28, tile_m=1, tile_n=1, w=14, v=8, threads=64, grouping=16, minblocks=8) , # 544.807 GFlop/s
  Kernel_dnt_medium(m=7, n=9, k=32, tile_m=1, tile_n=1, threads=64, grouping=3, minblocks=10) , # 550.841 GFlop/s
  Kernel_dnt_medium(m=7, n=9, k=45, tile_m=1, tile_n=1, threads=128, grouping=3, minblocks=8) , # 548.66 GFlop/s
  Kernel_dnt_medium(m=7, n=13, k=4, tile_m=4, tile_n=1, threads=32, grouping=15, minblocks=24) , # 473.871 GFlop/s
  Kernel_dnt_medium(m=7, n=13, k=5, tile_m=2, tile_n=2, threads=32, grouping=13, minblocks=20) , # 484.652 GFlop/s
  Kernel_dnt_medium(m=7, n=13, k=7, tile_m=4, tile_n=1, threads=32, grouping=14, minblocks=22) , # 568.233 GFlop/s
  Kernel_dnt_medium(m=7, n=13, k=9, tile_m=2, tile_n=2, threads=32, grouping=12, minblocks=25) , # 575.698 GFlop/s
  Kernel_dnt_medium(m=7, n=13, k=13, tile_m=2, tile_n=1, threads=64, grouping=26, minblocks=17) , # 576.55 GFlop/s
  Kernel_dnt_medium(m=7, n=13, k=25, tile_m=2, tile_n=1, threads=64, grouping=6, minblocks=2) , # 602.965 GFlop/s
  Kernel_dnt_medium(m=7, n=13, k=26, tile_m=2, tile_n=1, threads=96, grouping=5, minblocks=1) , # 607.199 GFlop/s
  Kernel_dnt_medium(m=7, n=13, k=28, tile_m=2, tile_n=1, threads=96, grouping=4, minblocks=8) , # 615.391 GFlop/s
  Kernel_dnt_medium(m=7, n=13, k=32, tile_m=1, tile_n=1, threads=96, grouping=4, minblocks=2) , # 622.982 GFlop/s
  Kernel_dnt_medium(m=7, n=13, k=45, tile_m=1, tile_n=1, threads=128, grouping=3, minblocks=2) , # 626.534 GFlop/s
  Kernel_dnt_medium(m=7, n=25, k=4, tile_m=4, tile_n=2, threads=32, grouping=16, minblocks=10) , # 580.16 GFlop/s
  Kernel_dnt_medium(m=7, n=25, k=5, tile_m=4, tile_n=1, threads=64, grouping=22, minblocks=18) , # 597.952 GFlop/s
  Kernel_dnt_medium(m=7, n=25, k=7, tile_m=4, tile_n=1, threads=64, grouping=22, minblocks=12) , # 658.065 GFlop/s
  Kernel_dnt_medium(m=7, n=25, k=9, tile_m=1, tile_n=3, threads=64, grouping=22, minblocks=2) , # 683.511 GFlop/s
  Kernel_dnt_medium(m=7, n=25, k=13, tile_m=1, tile_n=3, threads=96, grouping=29, minblocks=11) , # 697.002 GFlop/s
  Kernel_dnt_medium(m=7, n=25, k=25, tile_m=1, tile_n=2, threads=192, grouping=4, minblocks=3) , # 721.815 GFlop/s
  Kernel_dnt_medium(m=7, n=25, k=26, tile_m=1, tile_n=2, threads=192, grouping=5, minblocks=3) , # 726.291 GFlop/s
  Kernel_dnt_medium(m=7, n=25, k=28, tile_m=1, tile_n=2, threads=224, grouping=4, minblocks=6) , # 733.469 GFlop/s
  Kernel_dnt_medium(m=7, n=25, k=32, tile_m=1, tile_n=3, threads=128, grouping=3, minblocks=4) , # 738.173 GFlop/s
  Kernel_dnt_largeDB2(m=7, n=25, k=45, tile_m=1, tile_n=2, w=22, v=24, threads=96, grouping=16, minblocks=4) , # 744.304 GFlop/s
  Kernel_dnt_medium(m=7, n=26, k=4, tile_m=4, tile_n=2, threads=32, grouping=16, minblocks=7) , # 597.715 GFlop/s
  Kernel_dnt_medium(m=7, n=26, k=5, tile_m=4, tile_n=2, threads=64, grouping=22, minblocks=14) , # 612.13 GFlop/s
  Kernel_dnt_medium(m=7, n=26, k=7, tile_m=4, tile_n=1, threads=64, grouping=22, minblocks=9) , # 669.6 GFlop/s
  Kernel_dnt_medium(m=7, n=26, k=9, tile_m=1, tile_n=3, threads=64, grouping=26, minblocks=14) , # 695 GFlop/s
  Kernel_dnt_medium(m=7, n=26, k=13, tile_m=1, tile_n=3, threads=96, grouping=32, minblocks=10) , # 708.859 GFlop/s
  Kernel_dnt_medium(m=7, n=26, k=25, tile_m=1, tile_n=2, threads=192, grouping=4, minblocks=7) , # 732.407 GFlop/s
  Kernel_dnt_medium(m=7, n=26, k=26, tile_m=1, tile_n=2, threads=192, grouping=5, minblocks=2) , # 739.029 GFlop/s
  Kernel_dnt_medium(m=7, n=26, k=28, tile_m=1, tile_n=2, threads=224, grouping=4, minblocks=6) , # 743.234 GFlop/s
  Kernel_dnt_medium(m=7, n=26, k=32, tile_m=1, tile_n=2, threads=256, grouping=4, minblocks=5) , # 753.794 GFlop/s
  Kernel_dnt_largeDB2(m=7, n=26, k=45, tile_m=2, tile_n=2, w=16, v=26, threads=64, grouping=16, minblocks=8) , # 753.616 GFlop/s
  Kernel_dnt_medium(m=7, n=28, k=4, tile_m=4, tile_n=2, threads=32, grouping=16, minblocks=16) , # 621.956 GFlop/s
  Kernel_dnt_medium(m=7, n=28, k=5, tile_m=4, tile_n=1, threads=64, grouping=22, minblocks=12) , # 630.557 GFlop/s
  Kernel_dnt_medium(m=7, n=28, k=7, tile_m=4, tile_n=1, threads=64, grouping=26, minblocks=8) , # 687.436 GFlop/s
  Kernel_dnt_medium(m=7, n=28, k=9, tile_m=2, tile_n=2, threads=64, grouping=26, minblocks=11) , # 709.918 GFlop/s
  Kernel_dnt_medium(m=7, n=28, k=13, tile_m=2, tile_n=2, threads=96, grouping=12, minblocks=7) , # 710.614 GFlop/s
  Kernel_dnt_medium(m=7, n=28, k=25, tile_m=1, tile_n=2, threads=192, grouping=5, minblocks=3) , # 746.919 GFlop/s
  Kernel_dnt_medium(m=7, n=28, k=26, tile_m=2, tile_n=1, threads=192, grouping=4, minblocks=6) , # 752.82 GFlop/s
  Kernel_dnt_medium(m=7, n=28, k=28, tile_m=2, tile_n=1, threads=224, grouping=4, minblocks=6) , # 754.704 GFlop/s
  Kernel_dnt_medium(m=7, n=28, k=32, tile_m=2, tile_n=1, threads=128, grouping=4, minblocks=1) , # 766.488 GFlop/s
  Kernel_dnt_largeDB2(m=7, n=28, k=45, tile_m=2, tile_n=1, w=12, v=24, threads=128, grouping=16, minblocks=2) , # 770.322 GFlop/s
  Kernel_dnt_medium(m=7, n=32, k=4, tile_m=4, tile_n=2, threads=32, grouping=19, minblocks=15) , # 627.174 GFlop/s
  Kernel_dnt_medium(m=7, n=32, k=5, tile_m=4, tile_n=1, threads=64, grouping=21, minblocks=3) , # 656.782 GFlop/s
  Kernel_dnt_medium(m=7, n=32, k=7, tile_m=2, tile_n=2, threads=64, grouping=22, minblocks=11) , # 701.471 GFlop/s
  Kernel_dnt_medium(m=7, n=32, k=9, tile_m=4, tile_n=1, threads=64, grouping=32, minblocks=2) , # 721.47 GFlop/s
  Kernel_dnt_medium(m=7, n=32, k=13, tile_m=4, tile_n=1, threads=96, grouping=6, minblocks=10) , # 736.094 GFlop/s
  Kernel_dnt_medium(m=7, n=32, k=25, tile_m=2, tile_n=1, threads=192, grouping=4, minblocks=1) , # 769.295 GFlop/s
  Kernel_dnt_medium(m=7, n=32, k=26, tile_m=2, tile_n=1, threads=192, grouping=4, minblocks=1) , # 776.776 GFlop/s
  Kernel_dnt_medium(m=7, n=32, k=28, tile_m=2, tile_n=1, threads=256, grouping=4, minblocks=5) , # 778.504 GFlop/s
  Kernel_dnt_largeDB2(m=7, n=32, k=32, tile_m=2, tile_n=2, w=16, v=32, threads=64, grouping=16, minblocks=2) , # 790.893 GFlop/s
  Kernel_dnt_largeDB2(m=7, n=32, k=45, tile_m=2, tile_n=1, w=12, v=32, threads=128, grouping=16, minblocks=2) , # 802.202 GFlop/s
  Kernel_dnt_medium(m=7, n=45, k=4, tile_m=2, tile_n=3, threads=64, grouping=28, minblocks=2) , # 631.988 GFlop/s
  Kernel_dnt_medium(m=7, n=45, k=5, tile_m=1, tile_n=5, threads=64, grouping=27, minblocks=5) , # 663.525 GFlop/s
  Kernel_dnt_medium(m=7, n=45, k=7, tile_m=1, tile_n=5, threads=64, grouping=32, minblocks=7) , # 709.225 GFlop/s
  Kernel_dnt_medium(m=7, n=45, k=9, tile_m=4, tile_n=1, threads=128, grouping=32, minblocks=10) , # 727.897 GFlop/s
  Kernel_dnt_medium(m=7, n=45, k=13, tile_m=4, tile_n=1, threads=128, grouping=29, minblocks=3) , # 755.771 GFlop/s
  Kernel_dnt_medium(m=7, n=45, k=25, tile_m=2, tile_n=1, threads=192, grouping=4, minblocks=1) , # 801.315 GFlop/s
  Kernel_dnt_medium(m=7, n=45, k=26, tile_m=2, tile_n=1, threads=224, grouping=3, minblocks=3) , # 810.209 GFlop/s
  Kernel_dnt_medium(m=7, n=45, k=28, tile_m=2, tile_n=1, threads=256, grouping=3, minblocks=4) , # 818.919 GFlop/s
  Kernel_dnt_medium(m=7, n=45, k=32, tile_m=2, tile_n=1, threads=256, grouping=3, minblocks=3) , # 832.676 GFlop/s
  Kernel_dnt_largeDB2(m=7, n=45, k=45, tile_m=4, tile_n=1, w=12, v=32, threads=96, grouping=16, minblocks=2) , # 841.043 GFlop/s
  Kernel_dnt_medium(m=8, n=4, k=4, tile_m=1, tile_n=1, threads=32, grouping=18, minblocks=27) , # 241.107 GFlop/s
  Kernel_dnt_medium(m=8, n=4, k=5, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=12) , # 253.28 GFlop/s
  Kernel_dnt_medium(m=8, n=4, k=6, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=8) , # 292.754 GFlop/s
  Kernel_dnt_medium(m=8, n=4, k=7, tile_m=1, tile_n=1, threads=32, grouping=16, minblocks=5) , # 322.969 GFlop/s
  Kernel_dnt_small(m=8, n=4, k=8, tile_m=1, tile_n=1, threads=32, grouping=9, minblocks=7) , # 312.182 GFlop/s
  Kernel_dnt_medium(m=8, n=4, k=9, tile_m=1, tile_n=1, threads=32, grouping=13, minblocks=25) , # 336.929 GFlop/s
  Kernel_dnt_medium(m=8, n=5, k=4, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=27) , # 261.066 GFlop/s
  Kernel_dnt_medium(m=8, n=5, k=5, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=25) , # 297.369 GFlop/s
  Kernel_dnt_medium(m=8, n=5, k=6, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=25) , # 344.888 GFlop/s
  Kernel_dnt_medium(m=8, n=5, k=7, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=21) , # 335.855 GFlop/s
  Kernel_dnt_medium(m=8, n=5, k=8, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=4) , # 362.704 GFlop/s
  Kernel_dnt_medium(m=8, n=5, k=9, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=27) , # 388.028 GFlop/s
  Kernel_dnt_medium(m=8, n=6, k=4, tile_m=1, tile_n=2, threads=32, grouping=15, minblocks=24) , # 306.873 GFlop/s
  Kernel_dnt_medium(m=8, n=6, k=5, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=19) , # 351.046 GFlop/s
  Kernel_dnt_medium(m=8, n=6, k=6, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=20) , # 357.061 GFlop/s
  Kernel_dnt_medium(m=8, n=6, k=7, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=25) , # 398.063 GFlop/s
  Kernel_dnt_medium(m=8, n=6, k=8, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=27) , # 432.142 GFlop/s
  Kernel_dnt_medium(m=8, n=6, k=9, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=2) , # 451.4 GFlop/s
  Kernel_dnt_medium(m=8, n=7, k=4, tile_m=1, tile_n=2, threads=32, grouping=15, minblocks=11) , # 354.317 GFlop/s
  Kernel_dnt_medium(m=8, n=7, k=5, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=6) , # 358.106 GFlop/s
  Kernel_dnt_medium(m=8, n=7, k=6, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=17) , # 412.399 GFlop/s
  Kernel_dnt_medium(m=8, n=7, k=7, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=11) , # 463.962 GFlop/s
  Kernel_dnt_medium(m=8, n=7, k=8, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=25) , # 495.994 GFlop/s
  Kernel_dnt_medium(m=8, n=7, k=9, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=26) , # 501.068 GFlop/s
  Kernel_dnt_medium(m=8, n=8, k=4, tile_m=1, tile_n=2, threads=32, grouping=16, minblocks=11) , # 423.915 GFlop/s
  Kernel_dnt_medium(m=8, n=8, k=5, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=12) , # 468.053 GFlop/s
  Kernel_dnt_medium(m=8, n=8, k=6, tile_m=2, tile_n=1, threads=32, grouping=11, minblocks=17) , # 531.999 GFlop/s
  Kernel_dnt_medium(m=8, n=8, k=7, tile_m=2, tile_n=1, threads=32, grouping=11, minblocks=1) , # 571.337 GFlop/s
  Kernel_dnt_medium(m=8, n=8, k=8, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=25) , # 605.152 GFlop/s
  Kernel_dnt_medium(m=8, n=8, k=9, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=5) , # 571.958 GFlop/s
  Kernel_dnt_medium(m=8, n=9, k=4, tile_m=1, tile_n=3, threads=32, grouping=15, minblocks=25) , # 378.51 GFlop/s
  Kernel_dnt_medium(m=8, n=9, k=5, tile_m=1, tile_n=3, threads=32, grouping=16, minblocks=12) , # 433.932 GFlop/s
  Kernel_dnt_medium(m=8, n=9, k=6, tile_m=1, tile_n=3, threads=32, grouping=13, minblocks=12) , # 500.839 GFlop/s
  Kernel_dnt_medium(m=8, n=9, k=7, tile_m=1, tile_n=3, threads=32, grouping=11, minblocks=25) , # 544.839 GFlop/s
  Kernel_dnt_medium(m=8, n=9, k=8, tile_m=1, tile_n=3, threads=32, grouping=13, minblocks=9) , # 572.229 GFlop/s
  Kernel_dnt_medium(m=8, n=9, k=9, tile_m=1, tile_n=3, threads=32, grouping=16, minblocks=20) , # 567.681 GFlop/s
  Kernel_dnt_medium(m=9, n=4, k=4, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=27) , # 223.697 GFlop/s
  Kernel_dnt_medium(m=9, n=4, k=5, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=12) , # 264.552 GFlop/s
  Kernel_dnt_medium(m=9, n=4, k=6, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=12) , # 305.331 GFlop/s
  Kernel_dnt_medium(m=9, n=4, k=7, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=25) , # 335.175 GFlop/s
  Kernel_dnt_medium(m=9, n=4, k=8, tile_m=1, tile_n=2, threads=32, grouping=16, minblocks=26) , # 321.571 GFlop/s
  Kernel_dnt_medium(m=9, n=4, k=9, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=6) , # 341.86 GFlop/s
  Kernel_dnt_medium(m=9, n=4, k=13, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=5) , # 381.012 GFlop/s
  Kernel_dnt_medium(m=9, n=4, k=25, tile_m=1, tile_n=1, threads=64, grouping=5, minblocks=4) , # 380.088 GFlop/s
  Kernel_dnt_medium(m=9, n=4, k=26, tile_m=1, tile_n=1, threads=64, grouping=29, minblocks=1) , # 384.187 GFlop/s
  Kernel_dnt_medium(m=9, n=4, k=28, tile_m=1, tile_n=1, threads=64, grouping=7, minblocks=5) , # 389.01 GFlop/s
  Kernel_dnt_medium(m=9, n=4, k=32, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=11) , # 391.946 GFlop/s
  Kernel_dnt_medium(m=9, n=4, k=45, tile_m=1, tile_n=1, threads=96, grouping=3, minblocks=9) , # 387.535 GFlop/s
  Kernel_dnt_medium(m=9, n=5, k=4, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=2) , # 274.355 GFlop/s
  Kernel_dnt_medium(m=9, n=5, k=5, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=7) , # 325.528 GFlop/s
  Kernel_dnt_medium(m=9, n=5, k=6, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=3) , # 371.15 GFlop/s
  Kernel_dnt_medium(m=9, n=5, k=7, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=1) , # 371.567 GFlop/s
  Kernel_dnt_medium(m=9, n=5, k=8, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=19) , # 394.72 GFlop/s
  Kernel_dnt_medium(m=9, n=5, k=9, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=2) , # 417.155 GFlop/s
  Kernel_dnt_medium(m=9, n=5, k=13, tile_m=1, tile_n=2, threads=32, grouping=16, minblocks=13) , # 424.207 GFlop/s
  Kernel_dnt_medium(m=9, n=5, k=25, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=15) , # 441.803 GFlop/s
  Kernel_dnt_medium(m=9, n=5, k=26, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=3) , # 441.181 GFlop/s
  Kernel_dnt_medium(m=9, n=5, k=28, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=13) , # 447.972 GFlop/s
  Kernel_dnt_medium(m=9, n=5, k=32, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=5) , # 455.083 GFlop/s
  Kernel_dnt_medium(m=9, n=5, k=45, tile_m=1, tile_n=1, threads=128, grouping=5, minblocks=2) , # 447.72 GFlop/s
  Kernel_dnt_medium(m=9, n=6, k=4, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=16) , # 324.725 GFlop/s
  Kernel_dnt_medium(m=9, n=6, k=5, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=2) , # 383.299 GFlop/s
  Kernel_dnt_medium(m=9, n=6, k=6, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=9) , # 384.354 GFlop/s
  Kernel_dnt_medium(m=9, n=6, k=7, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=15) , # 435.21 GFlop/s
  Kernel_dnt_medium(m=9, n=6, k=8, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=14) , # 469.311 GFlop/s
  Kernel_dnt_medium(m=9, n=6, k=9, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=9) , # 476.433 GFlop/s
  Kernel_dnt_medium(m=9, n=7, k=4, tile_m=3, tile_n=1, threads=32, grouping=16, minblocks=3) , # 360.754 GFlop/s
  Kernel_dnt_medium(m=9, n=7, k=5, tile_m=3, tile_n=1, threads=32, grouping=16, minblocks=5) , # 379.473 GFlop/s
  Kernel_dnt_medium(m=9, n=7, k=6, tile_m=1, tile_n=3, threads=32, grouping=13, minblocks=20) , # 420.882 GFlop/s
  Kernel_dnt_medium(m=9, n=7, k=7, tile_m=3, tile_n=1, threads=32, grouping=13, minblocks=27) , # 478.365 GFlop/s
  Kernel_dnt_medium(m=9, n=7, k=8, tile_m=3, tile_n=1, threads=32, grouping=13, minblocks=1) , # 493.848 GFlop/s
  Kernel_dnt_medium(m=9, n=7, k=9, tile_m=3, tile_n=1, threads=32, grouping=13, minblocks=2) , # 516.64 GFlop/s
  Kernel_dnt_medium(m=9, n=7, k=13, tile_m=1, tile_n=3, threads=32, grouping=13, minblocks=1) , # 510.27 GFlop/s
  Kernel_dnt_medium(m=9, n=7, k=25, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=12) , # 537.082 GFlop/s
  Kernel_dnt_medium(m=9, n=7, k=26, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=2) , # 540.279 GFlop/s
  Kernel_dnt_medium(m=9, n=7, k=28, tile_m=1, tile_n=1, threads=96, grouping=4, minblocks=11) , # 545.866 GFlop/s
  Kernel_dnt_medium(m=9, n=7, k=32, tile_m=1, tile_n=1, threads=128, grouping=4, minblocks=11) , # 551.821 GFlop/s
  Kernel_dnt_medium(m=9, n=7, k=45, tile_m=1, tile_n=1, threads=128, grouping=4, minblocks=3) , # 550.203 GFlop/s
  Kernel_dnt_medium(m=9, n=8, k=4, tile_m=3, tile_n=1, threads=32, grouping=15, minblocks=3) , # 375.31 GFlop/s
  Kernel_dnt_medium(m=9, n=8, k=5, tile_m=3, tile_n=1, threads=32, grouping=16, minblocks=26) , # 428.946 GFlop/s
  Kernel_dnt_medium(m=9, n=8, k=6, tile_m=1, tile_n=3, threads=32, grouping=13, minblocks=22) , # 484.981 GFlop/s
  Kernel_dnt_medium(m=9, n=8, k=7, tile_m=3, tile_n=1, threads=32, grouping=13, minblocks=9) , # 540.555 GFlop/s
  Kernel_dnt_medium(m=9, n=8, k=8, tile_m=3, tile_n=1, threads=32, grouping=13, minblocks=2) , # 563.409 GFlop/s
  Kernel_dnt_medium(m=9, n=8, k=9, tile_m=1, tile_n=3, threads=32, grouping=16, minblocks=20) , # 561.368 GFlop/s
  Kernel_dnt_medium(m=9, n=9, k=4, tile_m=3, tile_n=1, threads=32, grouping=13, minblocks=16) , # 449.812 GFlop/s
  Kernel_dnt_medium(m=9, n=9, k=5, tile_m=3, tile_n=1, threads=32, grouping=12, minblocks=25) , # 520.817 GFlop/s
  Kernel_dnt_medium(m=9, n=9, k=6, tile_m=3, tile_n=1, threads=32, grouping=12, minblocks=11) , # 584.233 GFlop/s
  Kernel_dnt_medium(m=9, n=9, k=7, tile_m=3, tile_n=1, threads=32, grouping=13, minblocks=25) , # 593.141 GFlop/s
  Kernel_dnt_medium(m=9, n=9, k=8, tile_m=3, tile_n=1, threads=32, grouping=16, minblocks=2) , # 609.153 GFlop/s
  Kernel_dnt_medium(m=9, n=9, k=9, tile_m=1, tile_n=3, threads=32, grouping=13, minblocks=20) , # 596.262 GFlop/s
  Kernel_dnt_medium(m=9, n=9, k=12, tile_m=3, tile_n=1, threads=32, grouping=18, minblocks=16) , # 608.863 GFlop/s
  Kernel_dnt_medium(m=9, n=9, k=13, tile_m=1, tile_n=3, threads=32, grouping=21, minblocks=3) , # 598.854 GFlop/s
  Kernel_dnt_medium(m=9, n=9, k=22, tile_m=1, tile_n=2, threads=64, grouping=26, minblocks=2) , # 602.914 GFlop/s
  Kernel_dnt_medium(m=9, n=9, k=25, tile_m=1, tile_n=1, threads=128, grouping=5, minblocks=11) , # 604.486 GFlop/s
  Kernel_dnt_medium(m=9, n=9, k=26, tile_m=1, tile_n=1, threads=128, grouping=5, minblocks=9) , # 606.896 GFlop/s
  Kernel_dnt_medium(m=9, n=9, k=28, tile_m=1, tile_n=1, threads=128, grouping=4, minblocks=12) , # 623.854 GFlop/s
  Kernel_dnt_medium(m=9, n=9, k=32, tile_m=1, tile_n=1, threads=96, grouping=4, minblocks=2) , # 619.696 GFlop/s
  Kernel_dnt_medium(m=9, n=9, k=45, tile_m=1, tile_n=1, threads=224, grouping=4, minblocks=6) , # 621.925 GFlop/s
  Kernel_dnt_medium(m=9, n=12, k=9, tile_m=1, tile_n=4, threads=32, grouping=13, minblocks=25) , # 636.186 GFlop/s
  Kernel_dnt_medium(m=9, n=12, k=12, tile_m=1, tile_n=2, threads=64, grouping=22, minblocks=15) , # 653.32 GFlop/s
  Kernel_dnt_medium(m=9, n=13, k=4, tile_m=5, tile_n=1, threads=32, grouping=12, minblocks=8) , # 520.652 GFlop/s
  Kernel_dnt_medium(m=9, n=13, k=5, tile_m=5, tile_n=1, threads=32, grouping=13, minblocks=13) , # 570.99 GFlop/s
  Kernel_dnt_medium(m=9, n=13, k=7, tile_m=1, tile_n=5, threads=32, grouping=16, minblocks=2) , # 619.584 GFlop/s
  Kernel_dnt_medium(m=9, n=13, k=9, tile_m=5, tile_n=1, threads=32, grouping=12, minblocks=21) , # 635.789 GFlop/s
  Kernel_dnt_medium(m=9, n=13, k=13, tile_m=1, tile_n=2, threads=64, grouping=26, minblocks=18) , # 675.011 GFlop/s
  Kernel_dnt_medium(m=9, n=13, k=25, tile_m=1, tile_n=2, threads=64, grouping=26, minblocks=6) , # 695.157 GFlop/s
  Kernel_dnt_largeDB2(m=9, n=13, k=26, tile_m=1, tile_n=2, w=12, v=12, threads=64, grouping=16, minblocks=2) , # 706.349 GFlop/s
  Kernel_dnt_largeDB2(m=9, n=13, k=28, tile_m=1, tile_n=2, w=12, v=8, threads=64, grouping=16, minblocks=2) , # 709.773 GFlop/s
  Kernel_dnt_largeDB2(m=9, n=13, k=32, tile_m=1, tile_n=2, w=14, v=8, threads=64, grouping=16, minblocks=1) , # 720.317 GFlop/s
  Kernel_dnt_medium(m=9, n=13, k=45, tile_m=1, tile_n=1, threads=160, grouping=3, minblocks=6) , # 722.283 GFlop/s
  Kernel_dnt_medium(m=9, n=22, k=9, tile_m=2, tile_n=2, threads=96, grouping=32, minblocks=5) , # 751.681 GFlop/s
  Kernel_dnt_medium(m=9, n=22, k=22, tile_m=1, tile_n=2, threads=128, grouping=5, minblocks=7) , # 814.968 GFlop/s
  Kernel_dnt_largeDB2(m=9, n=22, k=32, tile_m=2, tile_n=2, w=16, v=22, threads=64, grouping=16, minblocks=1) , # 851.65 GFlop/s
  Kernel_dnt_medium(m=9, n=25, k=4, tile_m=1, tile_n=4, threads=64, grouping=22, minblocks=19) , # 627.071 GFlop/s
  Kernel_dnt_medium(m=9, n=25, k=5, tile_m=1, tile_n=4, threads=64, grouping=22, minblocks=11) , # 697.048 GFlop/s
  Kernel_dnt_medium(m=9, n=25, k=7, tile_m=1, tile_n=4, threads=64, grouping=27, minblocks=11) , # 759.221 GFlop/s
  Kernel_dnt_medium(m=9, n=25, k=9, tile_m=1, tile_n=4, threads=96, grouping=29, minblocks=10) , # 780.691 GFlop/s
  Kernel_dnt_medium(m=9, n=25, k=13, tile_m=1, tile_n=4, threads=128, grouping=9, minblocks=9) , # 806.204 GFlop/s
  Kernel_dnt_medium(m=9, n=25, k=25, tile_m=1, tile_n=2, threads=128, grouping=4, minblocks=5) , # 850.306 GFlop/s
  Kernel_dnt_medium(m=9, n=25, k=26, tile_m=1, tile_n=2, threads=128, grouping=4, minblocks=5) , # 860.479 GFlop/s
  Kernel_dnt_medium(m=9, n=25, k=28, tile_m=2, tile_n=1, threads=160, grouping=4, minblocks=6) , # 868.917 GFlop/s
  Kernel_dnt_medium(m=9, n=25, k=32, tile_m=2, tile_n=1, threads=160, grouping=4, minblocks=4) , # 876.897 GFlop/s
  Kernel_dnt_largeDB2(m=9, n=25, k=45, tile_m=1, tile_n=2, w=14, v=14, threads=128, grouping=16, minblocks=2) , # 889.424 GFlop/s
  Kernel_dnt_medium(m=9, n=26, k=4, tile_m=1, tile_n=4, threads=64, grouping=22, minblocks=20) , # 651.805 GFlop/s
  Kernel_dnt_medium(m=9, n=26, k=5, tile_m=1, tile_n=4, threads=64, grouping=26, minblocks=19) , # 709.356 GFlop/s
  Kernel_dnt_medium(m=9, n=26, k=7, tile_m=1, tile_n=4, threads=64, grouping=26, minblocks=11) , # 779.501 GFlop/s
  Kernel_dnt_medium(m=9, n=26, k=9, tile_m=1, tile_n=4, threads=96, grouping=26, minblocks=11) , # 790.313 GFlop/s
  Kernel_dnt_medium(m=9, n=26, k=13, tile_m=1, tile_n=4, threads=128, grouping=32, minblocks=9) , # 820.423 GFlop/s
  Kernel_dnt_medium(m=9, n=26, k=25, tile_m=1, tile_n=2, threads=128, grouping=4, minblocks=1) , # 859.973 GFlop/s
  Kernel_dnt_largeDB2(m=9, n=26, k=26, tile_m=1, tile_n=5, w=10, v=26, threads=64, grouping=16, minblocks=1) , # 873.449 GFlop/s
  Kernel_dnt_largeDB2(m=9, n=26, k=28, tile_m=1, tile_n=3, w=10, v=14, threads=96, grouping=16, minblocks=2) , # 879.817 GFlop/s
  Kernel_dnt_medium(m=9, n=26, k=32, tile_m=1, tile_n=2, threads=128, grouping=3, minblocks=3) , # 884.616 GFlop/s
  Kernel_dnt_largeDB2(m=9, n=26, k=45, tile_m=1, tile_n=2, w=12, v=16, threads=128, grouping=16, minblocks=1) , # 907.729 GFlop/s
  Kernel_dnt_medium(m=9, n=28, k=4, tile_m=1, tile_n=4, threads=64, grouping=22, minblocks=14) , # 687.488 GFlop/s
  Kernel_dnt_medium(m=9, n=28, k=5, tile_m=1, tile_n=4, threads=64, grouping=21, minblocks=10) , # 735.829 GFlop/s
  Kernel_dnt_medium(m=9, n=28, k=7, tile_m=1, tile_n=4, threads=64, grouping=26, minblocks=14) , # 806.204 GFlop/s
  Kernel_dnt_medium(m=9, n=28, k=9, tile_m=3, tile_n=1, threads=96, grouping=26, minblocks=11) , # 821.996 GFlop/s
  Kernel_dnt_medium(m=9, n=28, k=13, tile_m=1, tile_n=4, threads=128, grouping=21, minblocks=7) , # 840.047 GFlop/s
  Kernel_dnt_medium(m=9, n=28, k=25, tile_m=1, tile_n=2, threads=128, grouping=5, minblocks=1) , # 890.192 GFlop/s
  Kernel_dnt_medium(m=9, n=28, k=26, tile_m=1, tile_n=2, threads=256, grouping=4, minblocks=6) , # 897.077 GFlop/s
  Kernel_dnt_medium(m=9, n=28, k=28, tile_m=1, tile_n=2, threads=256, grouping=4, minblocks=5) , # 904.627 GFlop/s
  Kernel_dnt_medium(m=9, n=28, k=32, tile_m=1, tile_n=2, threads=160, grouping=3, minblocks=4) , # 915.991 GFlop/s
  Kernel_dnt_largeDB2(m=9, n=28, k=45, tile_m=1, tile_n=2, w=10, v=28, threads=128, grouping=16, minblocks=2) , # 927.821 GFlop/s
  Kernel_dnt_medium(m=9, n=32, k=4, tile_m=5, tile_n=1, threads=64, grouping=22, minblocks=14) , # 721.352 GFlop/s
  Kernel_dnt_medium(m=9, n=32, k=5, tile_m=3, tile_n=2, threads=64, grouping=26, minblocks=3) , # 765.332 GFlop/s
  Kernel_dnt_medium(m=9, n=32, k=7, tile_m=1, tile_n=5, threads=64, grouping=26, minblocks=6) , # 832 GFlop/s
  Kernel_dnt_medium(m=9, n=32, k=9, tile_m=2, tile_n=2, threads=96, grouping=32, minblocks=3) , # 851.008 GFlop/s
  Kernel_dnt_medium(m=9, n=32, k=13, tile_m=5, tile_n=1, threads=128, grouping=5, minblocks=10) , # 869.42 GFlop/s
  Kernel_dnt_largeDB2(m=9, n=32, k=22, tile_m=3, tile_n=2, w=8, v=32, threads=64, grouping=16, minblocks=1) , # 910.237 GFlop/s
  Kernel_dnt_largeDB2(m=9, n=32, k=25, tile_m=3, tile_n=1, w=10, v=26, threads=96, grouping=16, minblocks=2) , # 918.498 GFlop/s
  Kernel_dnt_largeDB2(m=9, n=32, k=26, tile_m=3, tile_n=1, w=10, v=18, threads=96, grouping=16, minblocks=2) , # 925 GFlop/s
  Kernel_dnt_largeDB2(m=9, n=32, k=28, tile_m=3, tile_n=1, w=10, v=26, threads=96, grouping=16, minblocks=1) , # 928.218 GFlop/s
  Kernel_dnt_largeDB2(m=9, n=32, k=32, tile_m=1, tile_n=5, w=12, v=32, threads=64, grouping=16, minblocks=4) , # 944.428 GFlop/s
  Kernel_dnt_largeDB2(m=9, n=32, k=45, tile_m=3, tile_n=1, w=10, v=28, threads=96, grouping=16, minblocks=4) , # 965.569 GFlop/s
  Kernel_dnt_medium(m=9, n=45, k=4, tile_m=6, tile_n=2, threads=64, grouping=30, minblocks=10) , # 732.692 GFlop/s
  Kernel_dnt_medium(m=9, n=45, k=5, tile_m=5, tile_n=2, threads=64, grouping=30, minblocks=11) , # 771.67 GFlop/s
  Kernel_dnt_medium(m=9, n=45, k=7, tile_m=1, tile_n=5, threads=96, grouping=29, minblocks=10) , # 833.778 GFlop/s
  Kernel_dnt_medium(m=9, n=45, k=9, tile_m=1, tile_n=5, threads=128, grouping=32, minblocks=9) , # 859.465 GFlop/s
  Kernel_dnt_medium(m=9, n=45, k=13, tile_m=1, tile_n=3, threads=160, grouping=5, minblocks=7) , # 903.628 GFlop/s
  Kernel_dnt_medium(m=9, n=45, k=25, tile_m=5, tile_n=1, threads=256, grouping=24, minblocks=4) , # 967.166 GFlop/s
  Kernel_dnt_largeDB2(m=9, n=45, k=26, tile_m=5, tile_n=1, w=10, v=18, threads=96, grouping=16, minblocks=4) , # 977.295 GFlop/s
  Kernel_dnt_medium(m=9, n=45, k=28, tile_m=5, tile_n=1, threads=256, grouping=4, minblocks=4) , # 990.414 GFlop/s
  Kernel_dnt_medium(m=9, n=45, k=32, tile_m=5, tile_n=1, threads=128, grouping=3, minblocks=3) , # 1004.63 GFlop/s
  Kernel_dnt_largeDB2(m=9, n=45, k=45, tile_m=2, tile_n=2, w=12, v=16, threads=128, grouping=16, minblocks=1) , # 1022.05 GFlop/s
  Kernel_dnt_medium(m=10, n=4, k=4, tile_m=1, tile_n=2, threads=32, grouping=16, minblocks=1) , # 247.284 GFlop/s
  Kernel_dnt_medium(m=10, n=4, k=10, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=22) , # 389.557 GFlop/s
  Kernel_dnt_medium(m=10, n=4, k=15, tile_m=1, tile_n=1, threads=64, grouping=6, minblocks=27) , # 396.818 GFlop/s
  Kernel_dnt_medium(m=10, n=10, k=4, tile_m=2, tile_n=2, threads=32, grouping=13, minblocks=22) , # 526.19 GFlop/s
  Kernel_dnt_medium(m=10, n=10, k=10, tile_m=1, tile_n=4, threads=32, grouping=18, minblocks=22) , # 663.631 GFlop/s
  Kernel_dnt_medium(m=10, n=10, k=15, tile_m=1, tile_n=2, threads=64, grouping=24, minblocks=4) , # 666.283 GFlop/s
  Kernel_dnt_medium(m=10, n=15, k=4, tile_m=5, tile_n=1, threads=32, grouping=13, minblocks=7) , # 610.102 GFlop/s
  Kernel_dnt_medium(m=10, n=15, k=10, tile_m=1, tile_n=3, threads=64, grouping=24, minblocks=17) , # 729.268 GFlop/s
  Kernel_dnt_medium(m=10, n=15, k=15, tile_m=1, tile_n=3, threads=64, grouping=22, minblocks=6) , # 749.635 GFlop/s
  Kernel_dnt_medium(m=11, n=11, k=11, tile_m=2, tile_n=2, threads=64, grouping=26, minblocks=5) , # 702.059 GFlop/s
  Kernel_dnt_medium(m=12, n=5, k=5, tile_m=2, tile_n=1, threads=32, grouping=11, minblocks=6) , # 414.313 GFlop/s
  Kernel_dnt_medium(m=12, n=5, k=12, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=1) , # 489.381 GFlop/s
  Kernel_dnt_medium(m=12, n=5, k=13, tile_m=2, tile_n=1, threads=32, grouping=4, minblocks=8) , # 472.929 GFlop/s
  Kernel_dnt_medium(m=12, n=5, k=26, tile_m=1, tile_n=1, threads=96, grouping=5, minblocks=6) , # 492.364 GFlop/s
  Kernel_dnt_medium(m=12, n=5, k=32, tile_m=1, tile_n=1, threads=64, grouping=3, minblocks=1) , # 501.048 GFlop/s
  Kernel_dnt_medium(m=12, n=9, k=9, tile_m=4, tile_n=1, threads=32, grouping=13, minblocks=25) , # 630.854 GFlop/s
  Kernel_dnt_medium(m=12, n=9, k=12, tile_m=1, tile_n=2, threads=64, grouping=22, minblocks=5) , # 663.387 GFlop/s
  Kernel_dnt_medium(m=12, n=12, k=5, tile_m=6, tile_n=1, threads=32, grouping=15, minblocks=3) , # 704.113 GFlop/s
  Kernel_dnt_medium(m=12, n=12, k=9, tile_m=1, tile_n=3, threads=64, grouping=22, minblocks=10) , # 766.851 GFlop/s
  Kernel_dnt_medium(m=12, n=12, k=12, tile_m=1, tile_n=3, threads=64, grouping=26, minblocks=1) , # 787.849 GFlop/s
  Kernel_dnt_medium(m=12, n=12, k=13, tile_m=1, tile_n=3, threads=64, grouping=32, minblocks=1) , # 783.946 GFlop/s
  Kernel_dnt_medium(m=12, n=12, k=25, tile_m=1, tile_n=2, threads=128, grouping=6, minblocks=5) , # 798.948 GFlop/s
  Kernel_dnt_medium(m=12, n=12, k=26, tile_m=1, tile_n=3, threads=160, grouping=5, minblocks=8) , # 794.647 GFlop/s
  Kernel_dnt_medium(m=12, n=12, k=32, tile_m=1, tile_n=2, threads=128, grouping=4, minblocks=5) , # 821.891 GFlop/s
  Kernel_dnt_medium(m=12, n=13, k=5, tile_m=3, tile_n=2, threads=32, grouping=14, minblocks=12) , # 671.755 GFlop/s
  Kernel_dnt_medium(m=12, n=13, k=12, tile_m=1, tile_n=3, threads=64, grouping=24, minblocks=17) , # 785.316 GFlop/s
  Kernel_dnt_medium(m=12, n=13, k=13, tile_m=2, tile_n=2, threads=64, grouping=26, minblocks=12) , # 777.453 GFlop/s
  Kernel_dnt_medium(m=12, n=13, k=26, tile_m=1, tile_n=2, threads=96, grouping=5, minblocks=3) , # 821.309 GFlop/s
  Kernel_dnt_medium(m=12, n=13, k=32, tile_m=1, tile_n=2, threads=128, grouping=4, minblocks=6) , # 841.375 GFlop/s
  Kernel_dnt_medium(m=12, n=25, k=12, tile_m=1, tile_n=5, threads=96, grouping=32, minblocks=10) , # 953.604 GFlop/s
  Kernel_dnt_medium(m=12, n=25, k=25, tile_m=1, tile_n=2, threads=192, grouping=4, minblocks=6) , # 1042.81 GFlop/s
  Kernel_dnt_medium(m=12, n=26, k=5, tile_m=1, tile_n=6, threads=64, grouping=26, minblocks=12) , # 857.411 GFlop/s
  Kernel_dnt_medium(m=12, n=26, k=12, tile_m=1, tile_n=2, threads=160, grouping=9, minblocks=3) , # 961.121 GFlop/s
  Kernel_dnt_medium(m=12, n=26, k=13, tile_m=1, tile_n=6, threads=96, grouping=5, minblocks=11) , # 975.617 GFlop/s
  Kernel_dnt_medium(m=12, n=26, k=26, tile_m=1, tile_n=2, threads=192, grouping=4, minblocks=1) , # 1066.53 GFlop/s
  Kernel_dnt_medium(m=12, n=26, k=32, tile_m=1, tile_n=2, threads=192, grouping=4, minblocks=1) , # 1099.3 GFlop/s
  Kernel_dnt_medium(m=12, n=32, k=5, tile_m=6, tile_n=1, threads=64, grouping=29, minblocks=12) , # 905.659 GFlop/s
  Kernel_dnt_medium(m=12, n=32, k=12, tile_m=1, tile_n=4, threads=96, grouping=4, minblocks=11) , # 1037.69 GFlop/s
  Kernel_dnt_medium(m=12, n=32, k=13, tile_m=1, tile_n=4, threads=160, grouping=5, minblocks=8) , # 1052.61 GFlop/s
  Kernel_dnt_medium(m=12, n=32, k=26, tile_m=1, tile_n=2, threads=192, grouping=4, minblocks=3) , # 1139.65 GFlop/s
  Kernel_dnt_medium(m=12, n=32, k=32, tile_m=1, tile_n=2, threads=192, grouping=3, minblocks=4) , # 1180.17 GFlop/s
  Kernel_dnt_medium(m=13, n=4, k=4, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=6) , # 309.329 GFlop/s
  Kernel_dnt_medium(m=13, n=4, k=5, tile_m=1, tile_n=2, threads=32, grouping=16, minblocks=19) , # 351.076 GFlop/s
  Kernel_dnt_medium(m=13, n=4, k=7, tile_m=2, tile_n=1, threads=32, grouping=11, minblocks=12) , # 430.38 GFlop/s
  Kernel_dnt_medium(m=13, n=4, k=9, tile_m=1, tile_n=2, threads=32, grouping=13, minblocks=7) , # 412.394 GFlop/s
  Kernel_dnt_medium(m=13, n=4, k=13, tile_m=1, tile_n=1, threads=64, grouping=4, minblocks=27) , # 421.461 GFlop/s
  Kernel_dnt_medium(m=13, n=4, k=25, tile_m=1, tile_n=1, threads=128, grouping=4, minblocks=1) , # 422.942 GFlop/s
  Kernel_dnt_medium(m=13, n=4, k=26, tile_m=1, tile_n=1, threads=128, grouping=4, minblocks=10) , # 428.057 GFlop/s
  Kernel_dnt_medium(m=13, n=4, k=28, tile_m=1, tile_n=1, threads=128, grouping=4, minblocks=2) , # 436.195 GFlop/s
  Kernel_dnt_medium(m=13, n=4, k=32, tile_m=1, tile_n=1, threads=160, grouping=4, minblocks=5) , # 440.449 GFlop/s
  Kernel_dnt_medium(m=13, n=4, k=45, tile_m=1, tile_n=1, threads=192, grouping=3, minblocks=5) , # 436.899 GFlop/s
  Kernel_dnt_medium(m=13, n=5, k=4, tile_m=1, tile_n=3, threads=32, grouping=16, minblocks=13) , # 362.493 GFlop/s
  Kernel_dnt_medium(m=13, n=5, k=5, tile_m=1, tile_n=3, threads=32, grouping=13, minblocks=1) , # 414.824 GFlop/s
  Kernel_dnt_medium(m=13, n=5, k=7, tile_m=1, tile_n=3, threads=32, grouping=13, minblocks=16) , # 451.746 GFlop/s
  Kernel_dnt_medium(m=13, n=5, k=9, tile_m=3, tile_n=1, threads=32, grouping=13, minblocks=18) , # 474.015 GFlop/s
  Kernel_dnt_medium(m=13, n=5, k=12, tile_m=1, tile_n=3, threads=64, grouping=18, minblocks=16) , # 489.219 GFlop/s
  Kernel_dnt_medium(m=13, n=5, k=13, tile_m=1, tile_n=3, threads=32, grouping=6, minblocks=12) , # 460.079 GFlop/s
  Kernel_dnt_medium(m=13, n=5, k=16, tile_m=1, tile_n=3, threads=96, grouping=30, minblocks=3) , # 482.507 GFlop/s
  Kernel_dnt_medium(m=13, n=5, k=24, tile_m=1, tile_n=2, threads=64, grouping=5, minblocks=1) , # 491.453 GFlop/s
  Kernel_dnt_medium(m=13, n=5, k=25, tile_m=1, tile_n=2, threads=128, grouping=5, minblocks=11) , # 487.722 GFlop/s
  Kernel_dnt_medium(m=13, n=5, k=26, tile_m=1, tile_n=2, threads=64, grouping=4, minblocks=1) , # 485.067 GFlop/s
  Kernel_dnt_medium(m=13, n=5, k=28, tile_m=1, tile_n=1, threads=96, grouping=3, minblocks=7) , # 493.982 GFlop/s
  Kernel_dnt_medium(m=13, n=5, k=32, tile_m=1, tile_n=1, threads=96, grouping=3, minblocks=4) , # 501.16 GFlop/s
  Kernel_dnt_medium(m=13, n=5, k=45, tile_m=1, tile_n=1, threads=128, grouping=4, minblocks=5) , # 501.227 GFlop/s
  Kernel_dnt_medium(m=13, n=7, k=4, tile_m=1, tile_n=4, threads=32, grouping=13, minblocks=12) , # 476.167 GFlop/s
  Kernel_dnt_medium(m=13, n=7, k=5, tile_m=2, tile_n=2, threads=32, grouping=13, minblocks=13) , # 472.557 GFlop/s
  Kernel_dnt_medium(m=13, n=7, k=7, tile_m=4, tile_n=1, threads=32, grouping=13, minblocks=22) , # 566.505 GFlop/s
  Kernel_dnt_medium(m=13, n=7, k=9, tile_m=2, tile_n=2, threads=32, grouping=16, minblocks=23) , # 562.908 GFlop/s
  Kernel_dnt_medium(m=13, n=7, k=13, tile_m=1, tile_n=2, threads=64, grouping=6, minblocks=9) , # 575.653 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=7, k=25, tile_m=1, tile_n=2, w=12, v=6, threads=64, grouping=16, minblocks=2) , # 598.682 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=7, k=26, tile_m=1, tile_n=2, w=12, v=4, threads=64, grouping=16, minblocks=2) , # 609.262 GFlop/s
  Kernel_dnt_medium(m=13, n=7, k=28, tile_m=1, tile_n=1, threads=128, grouping=5, minblocks=4) , # 611.772 GFlop/s
  Kernel_dnt_medium(m=13, n=7, k=32, tile_m=1, tile_n=1, threads=128, grouping=6, minblocks=5) , # 622.583 GFlop/s
  Kernel_dnt_medium(m=13, n=7, k=45, tile_m=1, tile_n=1, threads=128, grouping=3, minblocks=6) , # 627.27 GFlop/s
  Kernel_dnt_medium(m=13, n=9, k=4, tile_m=5, tile_n=1, threads=32, grouping=13, minblocks=27) , # 500.98 GFlop/s
  Kernel_dnt_medium(m=13, n=9, k=5, tile_m=1, tile_n=5, threads=32, grouping=13, minblocks=12) , # 558.304 GFlop/s
  Kernel_dnt_medium(m=13, n=9, k=7, tile_m=1, tile_n=5, threads=32, grouping=14, minblocks=25) , # 614.166 GFlop/s
  Kernel_dnt_medium(m=13, n=9, k=9, tile_m=2, tile_n=1, threads=64, grouping=26, minblocks=19) , # 633.01 GFlop/s
  Kernel_dnt_medium(m=13, n=9, k=13, tile_m=2, tile_n=1, threads=64, grouping=22, minblocks=3) , # 674.039 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=9, k=25, tile_m=2, tile_n=1, w=12, v=8, threads=64, grouping=16, minblocks=2) , # 696.232 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=9, k=26, tile_m=2, tile_n=1, w=10, v=8, threads=64, grouping=16, minblocks=12) , # 700.495 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=9, k=28, tile_m=2, tile_n=1, w=14, v=6, threads=64, grouping=16, minblocks=2) , # 716.029 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=9, k=32, tile_m=2, tile_n=1, w=14, v=8, threads=64, grouping=16, minblocks=2) , # 717.374 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=9, k=45, tile_m=2, tile_n=1, w=14, v=6, threads=64, grouping=16, minblocks=12) , # 720.997 GFlop/s
  Kernel_dnt_medium(m=13, n=12, k=5, tile_m=1, tile_n=6, threads=32, grouping=13, minblocks=23) , # 669.522 GFlop/s
  Kernel_dnt_medium(m=13, n=12, k=12, tile_m=1, tile_n=3, threads=64, grouping=26, minblocks=17) , # 782.744 GFlop/s
  Kernel_dnt_medium(m=13, n=12, k=13, tile_m=3, tile_n=1, threads=64, grouping=22, minblocks=6) , # 773.206 GFlop/s
  Kernel_dnt_medium(m=13, n=12, k=26, tile_m=1, tile_n=3, threads=128, grouping=5, minblocks=9) , # 802.307 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=12, k=32, tile_m=4, tile_n=1, w=16, v=12, threads=64, grouping=16, minblocks=2) , # 828.44 GFlop/s
  Kernel_dnt_medium(m=13, n=13, k=4, tile_m=4, tile_n=2, threads=32, grouping=15, minblocks=15) , # 645.445 GFlop/s
  Kernel_dnt_medium(m=13, n=13, k=5, tile_m=3, tile_n=3, threads=32, grouping=17, minblocks=13) , # 687.167 GFlop/s
  Kernel_dnt_medium(m=13, n=13, k=7, tile_m=4, tile_n=2, threads=32, grouping=20, minblocks=14) , # 753.739 GFlop/s
  Kernel_dnt_medium(m=13, n=13, k=9, tile_m=4, tile_n=1, threads=64, grouping=26, minblocks=5) , # 798.832 GFlop/s
  Kernel_dnt_medium(m=13, n=13, k=12, tile_m=2, tile_n=2, threads=64, grouping=26, minblocks=4) , # 819.816 GFlop/s
  Kernel_dnt_medium(m=13, n=13, k=13, tile_m=4, tile_n=1, threads=64, grouping=32, minblocks=6) , # 808.211 GFlop/s
  Kernel_dnt_medium(m=13, n=13, k=14, tile_m=2, tile_n=2, threads=96, grouping=32, minblocks=10) , # 828.516 GFlop/s
  Kernel_dnt_medium(m=13, n=13, k=16, tile_m=2, tile_n=2, threads=64, grouping=32, minblocks=7) , # 829.325 GFlop/s
  Kernel_dnt_medium(m=13, n=13, k=24, tile_m=1, tile_n=2, threads=128, grouping=5, minblocks=4) , # 846.028 GFlop/s
  Kernel_dnt_medium(m=13, n=13, k=25, tile_m=2, tile_n=1, threads=128, grouping=5, minblocks=5) , # 843.072 GFlop/s
  Kernel_dnt_medium(m=13, n=13, k=26, tile_m=2, tile_n=1, threads=128, grouping=4, minblocks=1) , # 847.219 GFlop/s
  Kernel_dnt_medium(m=13, n=13, k=28, tile_m=2, tile_n=1, threads=128, grouping=4, minblocks=7) , # 855.86 GFlop/s
  Kernel_dnt_medium(m=13, n=13, k=32, tile_m=2, tile_n=1, threads=160, grouping=5, minblocks=2) , # 869.632 GFlop/s
  Kernel_dnt_medium(m=13, n=13, k=45, tile_m=2, tile_n=1, threads=128, grouping=3, minblocks=3) , # 862.482 GFlop/s
  Kernel_dnt_medium(m=13, n=14, k=13, tile_m=2, tile_n=2, threads=64, grouping=26, minblocks=3) , # 818.092 GFlop/s
  Kernel_dnt_medium(m=13, n=14, k=14, tile_m=1, tile_n=2, threads=96, grouping=32, minblocks=6) , # 824.853 GFlop/s
  Kernel_dnt_medium(m=13, n=14, k=25, tile_m=1, tile_n=2, threads=128, grouping=6, minblocks=1) , # 867.323 GFlop/s
  Kernel_dnt_medium(m=13, n=14, k=26, tile_m=1, tile_n=2, threads=128, grouping=5, minblocks=7) , # 875.867 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=14, k=32, tile_m=2, tile_n=2, w=16, v=14, threads=64, grouping=16, minblocks=4) , # 894.079 GFlop/s
  Kernel_dnt_medium(m=13, n=16, k=5, tile_m=4, tile_n=2, threads=32, grouping=17, minblocks=14) , # 721.863 GFlop/s
  Kernel_dnt_medium(m=13, n=16, k=13, tile_m=2, tile_n=2, threads=64, grouping=26, minblocks=1) , # 866.035 GFlop/s
  Kernel_dnt_medium(m=13, n=16, k=16, tile_m=4, tile_n=1, threads=96, grouping=4, minblocks=13) , # 894.502 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=16, k=32, tile_m=4, tile_n=1, w=16, v=16, threads=64, grouping=16, minblocks=4) , # 942.893 GFlop/s
  Kernel_dnt_medium(m=13, n=24, k=5, tile_m=2, tile_n=3, threads=64, grouping=26, minblocks=10) , # 834.262 GFlop/s
  Kernel_dnt_medium(m=13, n=24, k=13, tile_m=4, tile_n=1, threads=96, grouping=5, minblocks=12) , # 976.165 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=24, k=24, tile_m=3, tile_n=2, w=12, v=20, threads=64, grouping=16, minblocks=1) , # 1079.09 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=24, k=26, tile_m=2, tile_n=3, w=10, v=22, threads=64, grouping=16, minblocks=8) , # 1075.11 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=24, k=32, tile_m=2, tile_n=3, w=16, v=22, threads=64, grouping=16, minblocks=4) , # 1104.28 GFlop/s
  Kernel_dnt_medium(m=13, n=25, k=4, tile_m=2, tile_n=3, threads=64, grouping=25, minblocks=9) , # 775.128 GFlop/s
  Kernel_dnt_medium(m=13, n=25, k=5, tile_m=2, tile_n=3, threads=64, grouping=27, minblocks=5) , # 818.248 GFlop/s
  Kernel_dnt_medium(m=13, n=25, k=7, tile_m=2, tile_n=2, threads=96, grouping=32, minblocks=10) , # 895.315 GFlop/s
  Kernel_dnt_medium(m=13, n=25, k=9, tile_m=2, tile_n=2, threads=96, grouping=32, minblocks=9) , # 937.417 GFlop/s
  Kernel_dnt_medium(m=13, n=25, k=13, tile_m=1, tile_n=3, threads=128, grouping=7, minblocks=9) , # 972.388 GFlop/s
  Kernel_dnt_medium(m=13, n=25, k=14, tile_m=1, tile_n=4, threads=96, grouping=32, minblocks=9) , # 993.182 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=25, k=25, tile_m=2, tile_n=3, w=10, v=14, threads=64, grouping=16, minblocks=2) , # 1071.67 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=25, k=26, tile_m=2, tile_n=3, w=10, v=22, threads=64, grouping=16, minblocks=8) , # 1087.2 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=25, k=28, tile_m=2, tile_n=3, w=14, v=14, threads=64, grouping=16, minblocks=4) , # 1098.76 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=25, k=32, tile_m=2, tile_n=3, w=16, v=20, threads=64, grouping=16, minblocks=8) , # 1113.24 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=25, k=45, tile_m=2, tile_n=3, w=10, v=14, threads=64, grouping=16, minblocks=1) , # 1134.6 GFlop/s
  Kernel_dnt_medium(m=13, n=26, k=4, tile_m=2, tile_n=3, threads=64, grouping=24, minblocks=3) , # 796.636 GFlop/s
  Kernel_dnt_medium(m=13, n=26, k=5, tile_m=2, tile_n=3, threads=64, grouping=26, minblocks=10) , # 799.952 GFlop/s
  Kernel_dnt_medium(m=13, n=26, k=7, tile_m=2, tile_n=2, threads=96, grouping=32, minblocks=9) , # 916.63 GFlop/s
  Kernel_dnt_medium(m=13, n=26, k=9, tile_m=1, tile_n=3, threads=128, grouping=32, minblocks=12) , # 954.576 GFlop/s
  Kernel_dnt_medium(m=13, n=26, k=12, tile_m=1, tile_n=3, threads=160, grouping=6, minblocks=9) , # 999.327 GFlop/s
  Kernel_dnt_medium(m=13, n=26, k=13, tile_m=1, tile_n=4, threads=96, grouping=32, minblocks=10) , # 1007.42 GFlop/s
  Kernel_dnt_medium(m=13, n=26, k=14, tile_m=1, tile_n=3, threads=128, grouping=5, minblocks=10) , # 1015.55 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=26, k=24, tile_m=2, tile_n=2, w=8, v=26, threads=96, grouping=16, minblocks=1) , # 1094.14 GFlop/s
  Kernel_dnt_medium(m=13, n=26, k=25, tile_m=1, tile_n=3, threads=128, grouping=4, minblocks=4) , # 1087.67 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=26, k=26, tile_m=2, tile_n=2, w=10, v=10, threads=96, grouping=16, minblocks=1) , # 1091.97 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=26, k=28, tile_m=2, tile_n=2, w=14, v=26, threads=96, grouping=16, minblocks=1) , # 1110.06 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=26, k=32, tile_m=2, tile_n=2, w=16, v=26, threads=96, grouping=16, minblocks=1) , # 1133.28 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=26, k=45, tile_m=2, tile_n=3, w=12, v=26, threads=64, grouping=16, minblocks=8) , # 1150.11 GFlop/s
  Kernel_dnt_medium(m=13, n=28, k=4, tile_m=4, tile_n=4, threads=64, grouping=24, minblocks=12) , # 824.545 GFlop/s
  Kernel_dnt_medium(m=13, n=28, k=5, tile_m=1, tile_n=4, threads=96, grouping=9, minblocks=14) , # 835.315 GFlop/s
  Kernel_dnt_medium(m=13, n=28, k=7, tile_m=1, tile_n=4, threads=96, grouping=32, minblocks=9) , # 942.428 GFlop/s
  Kernel_dnt_medium(m=13, n=28, k=9, tile_m=1, tile_n=4, threads=128, grouping=29, minblocks=10) , # 986.164 GFlop/s
  Kernel_dnt_medium(m=13, n=28, k=13, tile_m=1, tile_n=4, threads=96, grouping=32, minblocks=9) , # 1028.12 GFlop/s
  Kernel_dnt_medium(m=13, n=28, k=25, tile_m=4, tile_n=2, threads=64, grouping=4, minblocks=2) , # 1096.2 GFlop/s
  Kernel_dnt_medium(m=13, n=28, k=26, tile_m=4, tile_n=2, threads=64, grouping=4, minblocks=2) , # 1109.32 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=28, k=28, tile_m=2, tile_n=4, w=14, v=26, threads=64, grouping=16, minblocks=2) , # 1123.07 GFlop/s
  Kernel_dnt_medium(m=13, n=28, k=32, tile_m=4, tile_n=2, threads=64, grouping=30, minblocks=1) , # 1134.83 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=28, k=45, tile_m=4, tile_n=2, w=14, v=20, threads=64, grouping=16, minblocks=1) , # 1175.77 GFlop/s
  Kernel_dnt_medium(m=13, n=32, k=4, tile_m=2, tile_n=4, threads=64, grouping=26, minblocks=6) , # 873.764 GFlop/s
  Kernel_dnt_medium(m=13, n=32, k=5, tile_m=2, tile_n=4, threads=64, grouping=32, minblocks=6) , # 870.626 GFlop/s
  Kernel_dnt_medium(m=13, n=32, k=7, tile_m=1, tile_n=5, threads=96, grouping=32, minblocks=9) , # 981.108 GFlop/s
  Kernel_dnt_medium(m=13, n=32, k=9, tile_m=1, tile_n=5, threads=96, grouping=32, minblocks=9) , # 1008.75 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=32, k=12, tile_m=4, tile_n=2, w=6, v=20, threads=64, grouping=16, minblocks=1) , # 1074.65 GFlop/s
  Kernel_dnt_medium(m=13, n=32, k=13, tile_m=4, tile_n=1, threads=128, grouping=5, minblocks=9) , # 1073.99 GFlop/s
  Kernel_dnt_medium(m=13, n=32, k=14, tile_m=4, tile_n=1, threads=128, grouping=5, minblocks=9) , # 1087.04 GFlop/s
  Kernel_dnt_medium(m=13, n=32, k=16, tile_m=4, tile_n=1, threads=128, grouping=4, minblocks=8) , # 1122.91 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=32, k=24, tile_m=4, tile_n=2, w=12, v=28, threads=64, grouping=16, minblocks=1) , # 1168.27 GFlop/s
  Kernel_dnt_medium(m=13, n=32, k=25, tile_m=4, tile_n=2, threads=64, grouping=4, minblocks=2) , # 1155.19 GFlop/s
  Kernel_dnt_medium(m=13, n=32, k=26, tile_m=4, tile_n=2, threads=64, grouping=4, minblocks=4) , # 1166.22 GFlop/s
  Kernel_dnt_medium(m=13, n=32, k=28, tile_m=4, tile_n=2, threads=64, grouping=3, minblocks=1) , # 1176.77 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=32, k=32, tile_m=2, tile_n=2, w=12, v=14, threads=128, grouping=16, minblocks=1) , # 1205.05 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=32, k=45, tile_m=2, tile_n=2, w=12, v=32, threads=128, grouping=16, minblocks=1) , # 1231.88 GFlop/s
  Kernel_dnt_medium(m=13, n=45, k=4, tile_m=2, tile_n=5, threads=64, grouping=32, minblocks=5) , # 879.247 GFlop/s
  Kernel_dnt_medium(m=13, n=45, k=5, tile_m=1, tile_n=5, threads=128, grouping=32, minblocks=9) , # 909.587 GFlop/s
  Kernel_dnt_medium(m=13, n=45, k=7, tile_m=1, tile_n=5, threads=128, grouping=32, minblocks=9) , # 1015.1 GFlop/s
  Kernel_dnt_medium(m=13, n=45, k=9, tile_m=1, tile_n=5, threads=128, grouping=32, minblocks=9) , # 1091.53 GFlop/s
  Kernel_dnt_medium(m=13, n=45, k=13, tile_m=1, tile_n=5, threads=192, grouping=29, minblocks=5) , # 1152.46 GFlop/s
  Kernel_dnt_medium(m=13, n=45, k=25, tile_m=4, tile_n=3, threads=64, grouping=29, minblocks=4) , # 1240.6 GFlop/s
  Kernel_dnt_medium(m=13, n=45, k=26, tile_m=4, tile_n=3, threads=64, grouping=4, minblocks=4) , # 1247.15 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=45, k=28, tile_m=2, tile_n=4, w=14, v=20, threads=96, grouping=16, minblocks=4) , # 1271.38 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=45, k=32, tile_m=4, tile_n=3, w=8, v=12, threads=64, grouping=16, minblocks=4) , # 1291.85 GFlop/s
  Kernel_dnt_largeDB2(m=13, n=45, k=45, tile_m=2, tile_n=5, w=12, v=26, threads=64, grouping=16, minblocks=4) , # 1331.96 GFlop/s
  Kernel_dnt_medium(m=14, n=13, k=13, tile_m=2, tile_n=2, threads=64, grouping=26, minblocks=4) , # 819.111 GFlop/s
  Kernel_dnt_medium(m=14, n=13, k=14, tile_m=1, tile_n=4, threads=64, grouping=32, minblocks=3) , # 829.012 GFlop/s
  Kernel_dnt_medium(m=14, n=13, k=25, tile_m=2, tile_n=1, threads=128, grouping=5, minblocks=7) , # 861.359 GFlop/s
  Kernel_dnt_medium(m=14, n=13, k=26, tile_m=1, tile_n=2, threads=128, grouping=5, minblocks=7) , # 876.462 GFlop/s
  Kernel_dnt_medium(m=14, n=13, k=32, tile_m=1, tile_n=2, threads=128, grouping=4, minblocks=6) , # 900.065 GFlop/s
  Kernel_dnt_medium(m=14, n=14, k=13, tile_m=1, tile_n=4, threads=96, grouping=32, minblocks=9) , # 876.026 GFlop/s
  Kernel_dnt_medium(m=14, n=14, k=14, tile_m=2, tile_n=2, threads=64, grouping=26, minblocks=7) , # 878.284 GFlop/s
  Kernel_dnt_medium(m=14, n=14, k=16, tile_m=1, tile_n=2, threads=128, grouping=32, minblocks=4) , # 902.555 GFlop/s
  Kernel_dnt_medium(m=14, n=14, k=25, tile_m=1, tile_n=2, threads=128, grouping=6, minblocks=3) , # 915.718 GFlop/s
  Kernel_dnt_medium(m=14, n=14, k=26, tile_m=1, tile_n=2, threads=128, grouping=5, minblocks=5) , # 924.538 GFlop/s
  Kernel_dnt_medium(m=14, n=14, k=29, tile_m=1, tile_n=2, threads=128, grouping=4, minblocks=3) , # 924.968 GFlop/s
  Kernel_dnt_medium(m=14, n=14, k=32, tile_m=1, tile_n=2, threads=128, grouping=4, minblocks=5) , # 948.827 GFlop/s
  Kernel_dnt_medium(m=14, n=16, k=14, tile_m=1, tile_n=4, threads=64, grouping=29, minblocks=1) , # 920.926 GFlop/s
  Kernel_dnt_medium(m=14, n=16, k=16, tile_m=1, tile_n=4, threads=64, grouping=24, minblocks=11) , # 934.112 GFlop/s
  Kernel_dnt_medium(m=14, n=16, k=29, tile_m=1, tile_n=2, threads=128, grouping=4, minblocks=5) , # 985.695 GFlop/s
  Kernel_dnt_medium(m=14, n=25, k=13, tile_m=1, tile_n=3, threads=128, grouping=32, minblocks=9) , # 1036.53 GFlop/s
  Kernel_dnt_medium(m=14, n=25, k=14, tile_m=1, tile_n=3, threads=128, grouping=6, minblocks=10) , # 1064.78 GFlop/s
  Kernel_dnt_medium(m=14, n=25, k=25, tile_m=1, tile_n=2, threads=192, grouping=4, minblocks=5) , # 1135.93 GFlop/s
  Kernel_dnt_medium(m=14, n=25, k=26, tile_m=1, tile_n=2, threads=192, grouping=4, minblocks=1) , # 1146.21 GFlop/s
  Kernel_dnt_medium(m=14, n=25, k=32, tile_m=1, tile_n=2, threads=224, grouping=3, minblocks=2) , # 1180.89 GFlop/s
  Kernel_dnt_medium(m=14, n=26, k=13, tile_m=1, tile_n=3, threads=128, grouping=6, minblocks=10) , # 1059.97 GFlop/s
  Kernel_dnt_medium(m=14, n=26, k=14, tile_m=1, tile_n=3, threads=128, grouping=4, minblocks=10) , # 1080.3 GFlop/s
  Kernel_dnt_medium(m=14, n=26, k=25, tile_m=1, tile_n=3, threads=128, grouping=3, minblocks=2) , # 1164.2 GFlop/s
  Kernel_dnt_medium(m=14, n=26, k=26, tile_m=1, tile_n=2, threads=192, grouping=4, minblocks=5) , # 1170.35 GFlop/s
  Kernel_dnt_medium(m=14, n=26, k=32, tile_m=1, tile_n=2, threads=224, grouping=4, minblocks=4) , # 1204.18 GFlop/s
  Kernel_dnt_medium(m=14, n=29, k=14, tile_m=1, tile_n=5, threads=128, grouping=5, minblocks=8) , # 1099.41 GFlop/s
  Kernel_dnt_medium(m=14, n=29, k=16, tile_m=1, tile_n=5, threads=128, grouping=4, minblocks=7) , # 1136.55 GFlop/s
  Kernel_dnt_medium(m=14, n=29, k=29, tile_m=1, tile_n=3, threads=160, grouping=4, minblocks=2) , # 1205.08 GFlop/s
  Kernel_dnt_medium(m=14, n=29, k=32, tile_m=1, tile_n=3, threads=160, grouping=4, minblocks=3) , # 1227.39 GFlop/s
  Kernel_dnt_medium(m=14, n=32, k=13, tile_m=1, tile_n=4, threads=128, grouping=4, minblocks=9) , # 1144.56 GFlop/s
  Kernel_dnt_medium(m=14, n=32, k=14, tile_m=1, tile_n=4, threads=128, grouping=5, minblocks=8) , # 1157.34 GFlop/s
  Kernel_dnt_medium(m=14, n=32, k=25, tile_m=1, tile_n=3, threads=224, grouping=4, minblocks=5) , # 1247.65 GFlop/s
  Kernel_dnt_medium(m=14, n=32, k=26, tile_m=1, tile_n=2, threads=224, grouping=4, minblocks=4) , # 1256.51 GFlop/s
  Kernel_dnt_medium(m=14, n=32, k=29, tile_m=1, tile_n=3, threads=160, grouping=3, minblocks=4) , # 1258.33 GFlop/s
  Kernel_dnt_medium(m=14, n=32, k=32, tile_m=1, tile_n=3, threads=224, grouping=3, minblocks=4) , # 1281.96 GFlop/s
  Kernel_dnt_medium(m=15, n=4, k=4, tile_m=2, tile_n=1, threads=32, grouping=16, minblocks=21) , # 349.797 GFlop/s
  Kernel_dnt_medium(m=15, n=4, k=10, tile_m=2, tile_n=1, threads=32, grouping=13, minblocks=5) , # 430.134 GFlop/s
  Kernel_dnt_medium(m=15, n=4, k=15, tile_m=2, tile_n=1, threads=64, grouping=6, minblocks=11) , # 443.27 GFlop/s
  Kernel_dnt_medium(m=15, n=10, k=4, tile_m=5, tile_n=1, threads=32, grouping=12, minblocks=22) , # 594.569 GFlop/s
  Kernel_dnt_medium(m=15, n=10, k=10, tile_m=2, tile_n=2, threads=64, grouping=22, minblocks=6) , # 728.427 GFlop/s
  Kernel_dnt_medium(m=15, n=10, k=15, tile_m=2, tile_n=2, threads=64, grouping=29, minblocks=9) , # 747.245 GFlop/s
  Kernel_dnt_medium(m=15, n=15, k=4, tile_m=2, tile_n=2, threads=64, grouping=19, minblocks=6) , # 738.168 GFlop/s
  Kernel_dnt_medium(m=15, n=15, k=10, tile_m=2, tile_n=2, threads=64, grouping=29, minblocks=10) , # 904.696 GFlop/s
  Kernel_dnt_medium(m=15, n=15, k=15, tile_m=2, tile_n=2, threads=64, grouping=32, minblocks=2) , # 928.082 GFlop/s
  Kernel_dnt_medium(m=16, n=5, k=5, tile_m=1, tile_n=3, threads=32, grouping=12, minblocks=23) , # 501.288 GFlop/s
  Kernel_dnt_medium(m=16, n=5, k=13, tile_m=1, tile_n=3, threads=64, grouping=4, minblocks=19) , # 503.261 GFlop/s
  Kernel_dnt_medium(m=16, n=5, k=16, tile_m=1, tile_n=3, threads=96, grouping=29, minblocks=11) , # 524.682 GFlop/s
  Kernel_dnt_medium(m=16, n=5, k=32, tile_m=1, tile_n=1, threads=128, grouping=32, minblocks=3) , # 534.391 GFlop/s
  Kernel_dnt_medium(m=16, n=13, k=5, tile_m=2, tile_n=4, threads=32, grouping=16, minblocks=4) , # 723.746 GFlop/s
  Kernel_dnt_medium(m=16, n=13, k=13, tile_m=2, tile_n=2, threads=64, grouping=26, minblocks=9) , # 871.757 GFlop/s
  Kernel_dnt_medium(m=16, n=13, k=16, tile_m=2, tile_n=2, threads=64, grouping=32, minblocks=1) , # 897.477 GFlop/s
  Kernel_dnt_medium(m=16, n=13, k=32, tile_m=1, tile_n=2, threads=160, grouping=4, minblocks=1) , # 953.971 GFlop/s
  Kernel_dnt_medium(m=16, n=14, k=14, tile_m=2, tile_n=2, threads=64, grouping=32, minblocks=12) , # 927.004 GFlop/s
  Kernel_dnt_medium(m=16, n=14, k=16, tile_m=1, tile_n=4, threads=96, grouping=5, minblocks=11) , # 936.063 GFlop/s
  Kernel_dnt_medium(m=16, n=14, k=29, tile_m=1, tile_n=2, threads=128, grouping=4, minblocks=3) , # 985.795 GFlop/s
  Kernel_dnt_medium(m=16, n=16, k=5, tile_m=2, tile_n=5, threads=32, grouping=19, minblocks=14) , # 857.219 GFlop/s
  Kernel_dnt_medium(m=16, n=16, k=13, tile_m=2, tile_n=2, threads=64, grouping=26, minblocks=9) , # 1001.92 GFlop/s
  Kernel_dnt_medium(m=16, n=16, k=14, tile_m=2, tile_n=2, threads=64, grouping=32, minblocks=12) , # 1018.98 GFlop/s
  Kernel_dnt_small(m=16, n=16, k=16, tile_m=2, tile_n=2, threads=64, grouping=29, minblocks=8) , # 1035.54 GFlop/s
  Kernel_dnt_medium(m=16, n=16, k=29, tile_m=1, tile_n=2, threads=160, grouping=4, minblocks=2) , # 1062.17 GFlop/s
  Kernel_dnt_medium(m=16, n=16, k=32, tile_m=1, tile_n=2, threads=192, grouping=4, minblocks=3) , # 1086.81 GFlop/s
  Kernel_dnt_medium(m=16, n=16, k=55, tile_m=1, tile_n=2, threads=128, grouping=3, minblocks=1) , # 1104.86 GFlop/s
  Kernel_dnt_medium(m=16, n=29, k=14, tile_m=1, tile_n=5, threads=128, grouping=4, minblocks=9) , # 1221.28 GFlop/s
  Kernel_dnt_medium(m=16, n=29, k=16, tile_m=1, tile_n=4, threads=128, grouping=4, minblocks=7) , # 1236.07 GFlop/s
  Kernel_dnt_medium(m=16, n=29, k=29, tile_m=1, tile_n=3, threads=160, grouping=4, minblocks=4) , # 1318.25 GFlop/s
  Kernel_dnt_largeDB2(m=16, n=29, k=55, tile_m=1, tile_n=3, w=16, v=16, threads=160, grouping=16, minblocks=1) , # 1388.25 GFlop/s
  Kernel_dnt_medium(m=16, n=32, k=5, tile_m=1, tile_n=6, threads=96, grouping=32, minblocks=9) , # 1015.49 GFlop/s
  Kernel_dnt_medium(m=16, n=32, k=13, tile_m=1, tile_n=4, threads=128, grouping=5, minblocks=8) , # 1239.79 GFlop/s
  Kernel_dnt_medium(m=16, n=32, k=16, tile_m=1, tile_n=4, threads=160, grouping=4, minblocks=7) , # 1304.45 GFlop/s
  Kernel_dnt_medium(m=16, n=32, k=32, tile_m=1, tile_n=3, threads=192, grouping=3, minblocks=3) , # 1401.94 GFlop/s
  Kernel_dnt_medium(m=16, n=55, k=16, tile_m=1, tile_n=5, threads=192, grouping=4, minblocks=5) , # 1477.07 GFlop/s
  Kernel_dnt_medium(m=16, n=55, k=29, tile_m=1, tile_n=5, threads=192, grouping=32, minblocks=2) , # 1548.78 GFlop/s
  Kernel_dnt_largeDB2(m=16, n=55, k=55, tile_m=2, tile_n=2, w=14, v=40, threads=224, grouping=16, minblocks=1) , # 1648.17 GFlop/s
  Kernel_dnt_medium(m=17, n=17, k=17, tile_m=1, tile_n=6, threads=128, grouping=6, minblocks=9) , # 1025.78 GFlop/s
  Kernel_dnt_medium(m=18, n=18, k=18, tile_m=1, tile_n=4, threads=128, grouping=4, minblocks=8) , # 1117.66 GFlop/s
  Kernel_dnt_medium(m=19, n=19, k=19, tile_m=1, tile_n=4, threads=160, grouping=4, minblocks=7) , # 1155.59 GFlop/s
  Kernel_dnt_medium(m=20, n=20, k=20, tile_m=1, tile_n=4, threads=160, grouping=4, minblocks=7) , # 1246.17 GFlop/s
  Kernel_dnt_medium(m=21, n=21, k=21, tile_m=3, tile_n=3, threads=64, grouping=5, minblocks=4) , # 1259.48 GFlop/s
  Kernel_dnt_medium(m=22, n=9, k=9, tile_m=1, tile_n=5, threads=64, grouping=22, minblocks=14) , # 745.394 GFlop/s
  Kernel_dnt_medium(m=22, n=9, k=22, tile_m=1, tile_n=2, threads=128, grouping=4, minblocks=4) , # 816.342 GFlop/s
  Kernel_dnt_medium(m=22, n=9, k=32, tile_m=1, tile_n=2, threads=160, grouping=4, minblocks=3) , # 851.574 GFlop/s
  Kernel_dnt_medium(m=22, n=22, k=9, tile_m=1, tile_n=6, threads=96, grouping=32, minblocks=9) , # 1180.32 GFlop/s
  Kernel_dnt_medium(m=22, n=22, k=22, tile_m=3, tile_n=3, threads=64, grouping=4, minblocks=5) , # 1354.56 GFlop/s
  Kernel_dnt_medium(m=22, n=22, k=32, tile_m=3, tile_n=3, threads=64, grouping=3, minblocks=2) , # 1423.15 GFlop/s
  Kernel_dnt_medium(m=22, n=32, k=9, tile_m=3, tile_n=4, threads=64, grouping=30, minblocks=4) , # 1281.35 GFlop/s
  Kernel_dnt_medium(m=22, n=32, k=22, tile_m=1, tile_n=4, threads=192, grouping=4, minblocks=5) , # 1571.65 GFlop/s
  Kernel_dnt_largeDB2(m=22, n=32, k=32, tile_m=3, tile_n=4, w=16, v=20, threads=64, grouping=16, minblocks=1) , # 1641.78 GFlop/s
  Kernel_dnt_medium(m=23, n=23, k=23, tile_m=3, tile_n=2, threads=96, grouping=4, minblocks=4) , # 1399.21 GFlop/s
  Kernel_dnt_medium(m=24, n=5, k=5, tile_m=1, tile_n=5, threads=32, grouping=13, minblocks=16) , # 537.802 GFlop/s
  Kernel_dnt_medium(m=24, n=5, k=13, tile_m=1, tile_n=3, threads=96, grouping=32, minblocks=3) , # 547.371 GFlop/s
  Kernel_dnt_medium(m=24, n=5, k=24, tile_m=1, tile_n=1, threads=128, grouping=32, minblocks=7) , # 570.165 GFlop/s
  Kernel_dnt_medium(m=24, n=5, k=26, tile_m=1, tile_n=1, threads=192, grouping=4, minblocks=8) , # 574.898 GFlop/s
  Kernel_dnt_medium(m=24, n=5, k=32, tile_m=1, tile_n=1, threads=192, grouping=3, minblocks=1) , # 586.204 GFlop/s
  Kernel_dnt_medium(m=24, n=13, k=5, tile_m=3, tile_n=2, threads=64, grouping=26, minblocks=9) , # 828.944 GFlop/s
  Kernel_dnt_medium(m=24, n=13, k=13, tile_m=1, tile_n=5, threads=96, grouping=6, minblocks=11) , # 980.411 GFlop/s
  Kernel_dnt_largeDB2(m=24, n=13, k=24, tile_m=3, tile_n=2, w=12, v=12, threads=64, grouping=16, minblocks=4) , # 1080.39 GFlop/s
  Kernel_dnt_medium(m=24, n=13, k=26, tile_m=1, tile_n=3, threads=128, grouping=3, minblocks=2) , # 1074.3 GFlop/s
  Kernel_dnt_medium(m=24, n=13, k=32, tile_m=1, tile_n=3, threads=128, grouping=3, minblocks=2) , # 1107.01 GFlop/s
  Kernel_dnt_medium(m=24, n=24, k=5, tile_m=3, tile_n=3, threads=64, grouping=32, minblocks=8) , # 1126.34 GFlop/s
  Kernel_dnt_medium(m=24, n=24, k=13, tile_m=1, tile_n=6, threads=160, grouping=4, minblocks=8) , # 1384.19 GFlop/s
  Kernel_dnt_medium(m=24, n=24, k=24, tile_m=1, tile_n=3, threads=192, grouping=3, minblocks=5) , # 1523.95 GFlop/s
  Kernel_dnt_medium(m=24, n=24, k=26, tile_m=1, tile_n=3, threads=224, grouping=4, minblocks=1) , # 1513.76 GFlop/s
  Kernel_dnt_medium(m=24, n=24, k=32, tile_m=1, tile_n=3, threads=192, grouping=3, minblocks=1) , # 1578.31 GFlop/s
  Kernel_dnt_medium(m=24, n=26, k=5, tile_m=1, tile_n=6, threads=160, grouping=12, minblocks=8) , # 1069.19 GFlop/s
  Kernel_dnt_medium(m=24, n=26, k=13, tile_m=1, tile_n=6, threads=128, grouping=5, minblocks=8) , # 1390.67 GFlop/s
  Kernel_dnt_medium(m=24, n=26, k=24, tile_m=3, tile_n=4, threads=64, grouping=3, minblocks=4) , # 1507.26 GFlop/s
  Kernel_dnt_medium(m=24, n=26, k=26, tile_m=1, tile_n=3, threads=256, grouping=32, minblocks=3) , # 1507.58 GFlop/s
  Kernel_dnt_largeDB2(m=24, n=26, k=32, tile_m=3, tile_n=4, w=16, v=18, threads=64, grouping=16, minblocks=1) , # 1559.19 GFlop/s
  Kernel_dnt_medium(m=24, n=32, k=5, tile_m=1, tile_n=4, threads=192, grouping=10, minblocks=7) , # 1142.19 GFlop/s
  Kernel_dnt_medium(m=24, n=32, k=13, tile_m=3, tile_n=4, threads=64, grouping=24, minblocks=5) , # 1488.17 GFlop/s
  Kernel_dnt_medium(m=24, n=32, k=24, tile_m=3, tile_n=2, threads=128, grouping=32, minblocks=2) , # 1672.01 GFlop/s
  Kernel_dnt_medium(m=24, n=32, k=26, tile_m=1, tile_n=4, threads=224, grouping=3, minblocks=4) , # 1681.57 GFlop/s
  Kernel_dnt_largeDB2(m=24, n=32, k=32, tile_m=3, tile_n=5, w=8, v=32, threads=64, grouping=16, minblocks=2) , # 1724.89 GFlop/s
  Kernel_dnt_medium(m=25, n=4, k=4, tile_m=1, tile_n=4, threads=32, grouping=13, minblocks=20) , # 445.461 GFlop/s
  Kernel_dnt_medium(m=25, n=4, k=5, tile_m=1, tile_n=5, threads=32, grouping=12, minblocks=11) , # 455.068 GFlop/s
  Kernel_dnt_medium(m=25, n=4, k=7, tile_m=1, tile_n=4, threads=32, grouping=16, minblocks=15) , # 457.448 GFlop/s
  Kernel_dnt_medium(m=25, n=4, k=9, tile_m=1, tile_n=5, threads=64, grouping=26, minblocks=10) , # 453.929 GFlop/s
  Kernel_dnt_medium(m=25, n=4, k=13, tile_m=1, tile_n=2, threads=64, grouping=4, minblocks=12) , # 469.649 GFlop/s
  Kernel_dnt_medium(m=25, n=4, k=25, tile_m=1, tile_n=2, threads=128, grouping=3, minblocks=1) , # 480.107 GFlop/s
  Kernel_dnt_medium(m=25, n=4, k=26, tile_m=1, tile_n=2, threads=128, grouping=3, minblocks=8) , # 482.817 GFlop/s
  Kernel_dnt_medium(m=25, n=4, k=28, tile_m=1, tile_n=2, threads=128, grouping=3, minblocks=1) , # 487.851 GFlop/s
  Kernel_dnt_medium(m=25, n=4, k=32, tile_m=1, tile_n=2, threads=160, grouping=3, minblocks=2) , # 489.45 GFlop/s
  Kernel_dnt_medium(m=25, n=4, k=45, tile_m=1, tile_n=1, threads=192, grouping=2, minblocks=2) , # 489.336 GFlop/s
  Kernel_dnt_medium(m=25, n=5, k=4, tile_m=1, tile_n=6, threads=32, grouping=13, minblocks=24) , # 501.601 GFlop/s
  Kernel_dnt_medium(m=25, n=5, k=5, tile_m=1, tile_n=5, threads=32, grouping=16, minblocks=16) , # 516.776 GFlop/s
  Kernel_dnt_medium(m=25, n=5, k=7, tile_m=1, tile_n=5, threads=64, grouping=22, minblocks=13) , # 515.647 GFlop/s
  Kernel_dnt_medium(m=25, n=5, k=9, tile_m=1, tile_n=6, threads=64, grouping=26, minblocks=15) , # 540.279 GFlop/s
  Kernel_dnt_medium(m=25, n=5, k=13, tile_m=3, tile_n=1, threads=96, grouping=32, minblocks=10) , # 542.948 GFlop/s
  Kernel_dnt_medium(m=25, n=5, k=25, tile_m=1, tile_n=1, threads=128, grouping=3, minblocks=8) , # 570.33 GFlop/s
  Kernel_dnt_medium(m=25, n=5, k=26, tile_m=1, tile_n=1, threads=192, grouping=4, minblocks=4) , # 569.355 GFlop/s
  Kernel_dnt_medium(m=25, n=5, k=28, tile_m=1, tile_n=1, threads=192, grouping=4, minblocks=2) , # 571.981 GFlop/s
  Kernel_dnt_medium(m=25, n=5, k=32, tile_m=1, tile_n=1, threads=192, grouping=3, minblocks=2) , # 588.39 GFlop/s
  Kernel_dnt_medium(m=25, n=5, k=45, tile_m=1, tile_n=1, threads=256, grouping=3, minblocks=1) , # 585.125 GFlop/s
  Kernel_dnt_medium(m=25, n=7, k=4, tile_m=3, tile_n=3, threads=32, grouping=15, minblocks=2) , # 580.754 GFlop/s
  Kernel_dnt_medium(m=25, n=7, k=5, tile_m=3, tile_n=1, threads=64, grouping=21, minblocks=19) , # 598.77 GFlop/s
  Kernel_dnt_medium(m=25, n=7, k=7, tile_m=3, tile_n=1, threads=64, grouping=22, minblocks=3) , # 660.441 GFlop/s
  Kernel_dnt_medium(m=25, n=7, k=9, tile_m=3, tile_n=1, threads=64, grouping=26, minblocks=10) , # 684.274 GFlop/s
  Kernel_dnt_medium(m=25, n=7, k=13, tile_m=3, tile_n=1, threads=96, grouping=29, minblocks=1) , # 698.671 GFlop/s
  Kernel_dnt_medium(m=25, n=7, k=25, tile_m=1, tile_n=3, threads=192, grouping=4, minblocks=7) , # 718.262 GFlop/s
  Kernel_dnt_medium(m=25, n=7, k=26, tile_m=3, tile_n=1, threads=192, grouping=29, minblocks=3) , # 721.455 GFlop/s
  Kernel_dnt_medium(m=25, n=7, k=28, tile_m=1, tile_n=2, threads=128, grouping=4, minblocks=2) , # 724.329 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=7, k=32, tile_m=3, tile_n=1, w=16, v=6, threads=64, grouping=16, minblocks=4) , # 735.427 GFlop/s
  Kernel_dnt_medium(m=25, n=7, k=45, tile_m=3, tile_n=1, threads=128, grouping=30, minblocks=4) , # 735.753 GFlop/s
  Kernel_dnt_medium(m=25, n=9, k=4, tile_m=1, tile_n=5, threads=64, grouping=24, minblocks=3) , # 615.95 GFlop/s
  Kernel_dnt_medium(m=25, n=9, k=5, tile_m=1, tile_n=5, threads=64, grouping=24, minblocks=2) , # 696.734 GFlop/s
  Kernel_dnt_medium(m=25, n=9, k=7, tile_m=1, tile_n=5, threads=64, grouping=26, minblocks=13) , # 767.41 GFlop/s
  Kernel_dnt_medium(m=25, n=9, k=9, tile_m=1, tile_n=5, threads=96, grouping=32, minblocks=10) , # 772.896 GFlop/s
  Kernel_dnt_medium(m=25, n=9, k=13, tile_m=1, tile_n=3, threads=128, grouping=32, minblocks=9) , # 812.472 GFlop/s
  Kernel_dnt_medium(m=25, n=9, k=25, tile_m=1, tile_n=2, threads=128, grouping=4, minblocks=1) , # 854.308 GFlop/s
  Kernel_dnt_medium(m=25, n=9, k=26, tile_m=1, tile_n=2, threads=128, grouping=4, minblocks=5) , # 860.057 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=9, k=28, tile_m=1, tile_n=2, w=10, v=8, threads=128, grouping=16, minblocks=4) , # 875.354 GFlop/s
  Kernel_dnt_medium(m=25, n=9, k=32, tile_m=1, tile_n=2, threads=160, grouping=3, minblocks=4) , # 882.376 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=9, k=45, tile_m=1, tile_n=2, w=14, v=8, threads=128, grouping=16, minblocks=1) , # 892.009 GFlop/s
  Kernel_dnt_medium(m=25, n=12, k=12, tile_m=1, tile_n=4, threads=96, grouping=32, minblocks=9) , # 945.839 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=12, k=25, tile_m=3, tile_n=2, w=10, v=12, threads=64, grouping=16, minblocks=1) , # 1042.28 GFlop/s
  Kernel_dnt_medium(m=25, n=13, k=4, tile_m=3, tile_n=2, threads=64, grouping=26, minblocks=7) , # 772.213 GFlop/s
  Kernel_dnt_medium(m=25, n=13, k=5, tile_m=3, tile_n=2, threads=64, grouping=26, minblocks=9) , # 811.652 GFlop/s
  Kernel_dnt_medium(m=25, n=13, k=7, tile_m=3, tile_n=2, threads=64, grouping=29, minblocks=1) , # 894.47 GFlop/s
  Kernel_dnt_medium(m=25, n=13, k=9, tile_m=3, tile_n=2, threads=64, grouping=32, minblocks=7) , # 937.943 GFlop/s
  Kernel_dnt_medium(m=25, n=13, k=13, tile_m=1, tile_n=3, threads=128, grouping=7, minblocks=10) , # 975.605 GFlop/s
  Kernel_dnt_medium(m=25, n=13, k=14, tile_m=1, tile_n=3, threads=128, grouping=7, minblocks=10) , # 992.46 GFlop/s
  Kernel_dnt_medium(m=25, n=13, k=25, tile_m=1, tile_n=3, threads=128, grouping=4, minblocks=5) , # 1066.09 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=13, k=26, tile_m=3, tile_n=2, w=12, v=4, threads=64, grouping=16, minblocks=2) , # 1073.76 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=13, k=28, tile_m=3, tile_n=2, w=12, v=12, threads=64, grouping=16, minblocks=1) , # 1099.51 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=13, k=32, tile_m=3, tile_n=2, w=14, v=12, threads=64, grouping=16, minblocks=2) , # 1111.26 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=13, k=45, tile_m=3, tile_n=2, w=12, v=10, threads=64, grouping=16, minblocks=1) , # 1133.67 GFlop/s
  Kernel_dnt_medium(m=25, n=14, k=13, tile_m=1, tile_n=3, threads=128, grouping=6, minblocks=9) , # 1028.26 GFlop/s
  Kernel_dnt_medium(m=25, n=14, k=14, tile_m=1, tile_n=3, threads=128, grouping=6, minblocks=9) , # 1048.08 GFlop/s
  Kernel_dnt_medium(m=25, n=14, k=25, tile_m=1, tile_n=3, threads=128, grouping=3, minblocks=1) , # 1119.23 GFlop/s
  Kernel_dnt_medium(m=25, n=14, k=26, tile_m=1, tile_n=3, threads=128, grouping=3, minblocks=3) , # 1118.61 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=14, k=32, tile_m=3, tile_n=2, w=16, v=14, threads=64, grouping=16, minblocks=2) , # 1161.79 GFlop/s
  Kernel_dnt_medium(m=25, n=25, k=4, tile_m=3, tile_n=4, threads=64, grouping=32, minblocks=4) , # 976.486 GFlop/s
  Kernel_dnt_medium(m=25, n=25, k=5, tile_m=1, tile_n=5, threads=128, grouping=32, minblocks=9) , # 1070.36 GFlop/s
  Kernel_dnt_medium(m=25, n=25, k=7, tile_m=1, tile_n=5, threads=128, grouping=32, minblocks=9) , # 1194.1 GFlop/s
  Kernel_dnt_medium(m=25, n=25, k=9, tile_m=1, tile_n=5, threads=128, grouping=32, minblocks=9) , # 1269.67 GFlop/s
  Kernel_dnt_medium(m=25, n=25, k=12, tile_m=1, tile_n=5, threads=128, grouping=32, minblocks=9) , # 1365.39 GFlop/s
  Kernel_dnt_medium(m=25, n=25, k=13, tile_m=1, tile_n=5, threads=128, grouping=29, minblocks=5) , # 1367.05 GFlop/s
  Kernel_dnt_medium(m=25, n=25, k=14, tile_m=1, tile_n=5, threads=192, grouping=29, minblocks=5) , # 1390.72 GFlop/s
  Kernel_dnt_medium(m=25, n=25, k=25, tile_m=3, tile_n=4, threads=64, grouping=3, minblocks=4) , # 1504.46 GFlop/s
  Kernel_dnt_medium(m=25, n=25, k=26, tile_m=3, tile_n=4, threads=64, grouping=30, minblocks=1) , # 1498.12 GFlop/s
  Kernel_dnt_medium(m=25, n=25, k=28, tile_m=3, tile_n=4, threads=64, grouping=4, minblocks=2) , # 1526.33 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=25, k=32, tile_m=3, tile_n=4, w=8, v=24, threads=64, grouping=16, minblocks=2) , # 1548.35 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=25, k=45, tile_m=3, tile_n=4, w=20, v=8, threads=64, grouping=16, minblocks=2) , # 1587.51 GFlop/s
  Kernel_dnt_medium(m=25, n=26, k=4, tile_m=3, tile_n=4, threads=64, grouping=32, minblocks=5) , # 974.726 GFlop/s
  Kernel_dnt_medium(m=25, n=26, k=5, tile_m=1, tile_n=6, threads=160, grouping=14, minblocks=8) , # 1032.18 GFlop/s
  Kernel_dnt_medium(m=25, n=26, k=7, tile_m=1, tile_n=6, threads=192, grouping=10, minblocks=8) , # 1153.33 GFlop/s
  Kernel_dnt_medium(m=25, n=26, k=9, tile_m=3, tile_n=4, threads=64, grouping=27, minblocks=3) , # 1238.92 GFlop/s
  Kernel_dnt_medium(m=25, n=26, k=13, tile_m=1, tile_n=6, threads=128, grouping=24, minblocks=6) , # 1358.07 GFlop/s
  Kernel_dnt_medium(m=25, n=26, k=14, tile_m=1, tile_n=6, threads=128, grouping=24, minblocks=6) , # 1398.79 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=26, k=25, tile_m=3, tile_n=4, w=8, v=26, threads=64, grouping=16, minblocks=2) , # 1510.79 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=26, k=26, tile_m=3, tile_n=4, w=8, v=26, threads=64, grouping=16, minblocks=8) , # 1521.72 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=26, k=28, tile_m=3, tile_n=4, w=8, v=26, threads=64, grouping=16, minblocks=2) , # 1547.14 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=26, k=32, tile_m=3, tile_n=4, w=8, v=26, threads=64, grouping=16, minblocks=4) , # 1582.6 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=26, k=45, tile_m=3, tile_n=4, w=18, v=26, threads=64, grouping=16, minblocks=2) , # 1621.87 GFlop/s
  Kernel_dnt_medium(m=25, n=28, k=4, tile_m=3, tile_n=4, threads=64, grouping=32, minblocks=3) , # 1038.23 GFlop/s
  Kernel_dnt_medium(m=25, n=28, k=5, tile_m=1, tile_n=6, threads=160, grouping=9, minblocks=8) , # 1102.36 GFlop/s
  Kernel_dnt_medium(m=25, n=28, k=7, tile_m=1, tile_n=6, threads=128, grouping=7, minblocks=8) , # 1200.18 GFlop/s
  Kernel_dnt_medium(m=25, n=28, k=9, tile_m=1, tile_n=6, threads=128, grouping=6, minblocks=8) , # 1319.69 GFlop/s
  Kernel_dnt_medium(m=25, n=28, k=13, tile_m=1, tile_n=6, threads=128, grouping=4, minblocks=8) , # 1432.84 GFlop/s
  Kernel_dnt_medium(m=25, n=28, k=25, tile_m=3, tile_n=4, threads=64, grouping=3, minblocks=2) , # 1590.64 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=28, k=26, tile_m=3, tile_n=4, w=8, v=28, threads=64, grouping=16, minblocks=2) , # 1589.92 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=28, k=28, tile_m=3, tile_n=4, w=8, v=28, threads=64, grouping=16, minblocks=1) , # 1612.35 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=28, k=32, tile_m=3, tile_n=4, w=8, v=28, threads=64, grouping=16, minblocks=4) , # 1653.53 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=28, k=45, tile_m=3, tile_n=4, w=18, v=28, threads=64, grouping=16, minblocks=4) , # 1684.81 GFlop/s
  Kernel_dnt_medium(m=25, n=32, k=4, tile_m=5, tile_n=1, threads=160, grouping=12, minblocks=7) , # 1025.19 GFlop/s
  Kernel_dnt_medium(m=25, n=32, k=5, tile_m=5, tile_n=1, threads=192, grouping=16, minblocks=7) , # 1111.36 GFlop/s
  Kernel_dnt_medium(m=25, n=32, k=7, tile_m=3, tile_n=5, threads=64, grouping=24, minblocks=3) , # 1256.28 GFlop/s
  Kernel_dnt_medium(m=25, n=32, k=9, tile_m=5, tile_n=3, threads=64, grouping=29, minblocks=2) , # 1364.67 GFlop/s
  Kernel_dnt_medium(m=25, n=32, k=13, tile_m=3, tile_n=5, threads=64, grouping=29, minblocks=1) , # 1484.69 GFlop/s
  Kernel_dnt_medium(m=25, n=32, k=14, tile_m=3, tile_n=5, threads=64, grouping=29, minblocks=4) , # 1517.54 GFlop/s
  Kernel_dnt_medium(m=25, n=32, k=25, tile_m=5, tile_n=3, threads=64, grouping=29, minblocks=1) , # 1655.28 GFlop/s
  Kernel_dnt_medium(m=25, n=32, k=26, tile_m=3, tile_n=5, threads=64, grouping=3, minblocks=3) , # 1680.88 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=32, k=28, tile_m=5, tile_n=3, w=14, v=32, threads=64, grouping=16, minblocks=1) , # 1689.82 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=32, k=32, tile_m=5, tile_n=3, w=8, v=32, threads=64, grouping=16, minblocks=1) , # 1750.62 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=32, k=45, tile_m=5, tile_n=3, w=8, v=32, threads=64, grouping=16, minblocks=4) , # 1784.12 GFlop/s
  Kernel_dnt_medium(m=25, n=45, k=4, tile_m=3, tile_n=5, threads=96, grouping=30, minblocks=4) , # 1026.08 GFlop/s
  Kernel_dnt_medium(m=25, n=45, k=5, tile_m=5, tile_n=2, threads=128, grouping=24, minblocks=2) , # 1149.65 GFlop/s
  Kernel_dnt_medium(m=25, n=45, k=7, tile_m=3, tile_n=5, threads=96, grouping=24, minblocks=2) , # 1315.18 GFlop/s
  Kernel_dnt_medium(m=25, n=45, k=9, tile_m=5, tile_n=2, threads=128, grouping=32, minblocks=3) , # 1432.35 GFlop/s
  Kernel_dnt_medium(m=25, n=45, k=13, tile_m=3, tile_n=5, threads=96, grouping=24, minblocks=1) , # 1592.59 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=45, k=25, tile_m=5, tile_n=4, w=10, v=22, threads=64, grouping=16, minblocks=1) , # 1834.49 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=45, k=26, tile_m=5, tile_n=4, w=10, v=44, threads=64, grouping=16, minblocks=4) , # 1856.86 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=45, k=28, tile_m=5, tile_n=4, w=10, v=44, threads=64, grouping=16, minblocks=1) , # 1880.32 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=45, k=32, tile_m=5, tile_n=4, w=8, v=22, threads=64, grouping=16, minblocks=1) , # 1931.76 GFlop/s
  Kernel_dnt_largeDB2(m=25, n=45, k=45, tile_m=5, tile_n=4, w=10, v=14, threads=64, grouping=16, minblocks=4) , # 2021.91 GFlop/s
  Kernel_dnt_medium(m=26, n=4, k=4, tile_m=1, tile_n=4, threads=32, grouping=13, minblocks=8) , # 455.983 GFlop/s
  Kernel_dnt_medium(m=26, n=4, k=5, tile_m=1, tile_n=4, threads=32, grouping=13, minblocks=13) , # 453.07 GFlop/s
  Kernel_dnt_medium(m=26, n=4, k=7, tile_m=1, tile_n=4, threads=32, grouping=16, minblocks=8) , # 462.367 GFlop/s
  Kernel_dnt_medium(m=26, n=4, k=9, tile_m=1, tile_n=2, threads=64, grouping=26, minblocks=7) , # 461.91 GFlop/s
  Kernel_dnt_medium(m=26, n=4, k=13, tile_m=1, tile_n=2, threads=64, grouping=32, minblocks=1) , # 478.188 GFlop/s
  Kernel_dnt_medium(m=26, n=4, k=25, tile_m=1, tile_n=2, threads=128, grouping=3, minblocks=2) , # 483.153 GFlop/s
  Kernel_dnt_medium(m=26, n=4, k=26, tile_m=1, tile_n=2, threads=128, grouping=3, minblocks=4) , # 487.755 GFlop/s
  Kernel_dnt_medium(m=26, n=4, k=28, tile_m=1, tile_n=2, threads=128, grouping=3, minblocks=3) , # 489.503 GFlop/s
  Kernel_dnt_medium(m=26, n=4, k=32, tile_m=1, tile_n=1, threads=192, grouping=3, minblocks=4) , # 493.24 GFlop/s
  Kernel_dnt_medium(m=26, n=4, k=45, tile_m=1, tile_n=1, threads=256, grouping=2, minblocks=2) , # 495.988 GFlop/s
  Kernel_dnt_medium(m=26, n=5, k=4, tile_m=1, tile_n=5, threads=32, grouping=13, minblocks=27) , # 513.002 GFlop/s
  Kernel_dnt_medium(m=26, n=5, k=5, tile_m=5, tile_n=1, threads=32, grouping=18, minblocks=10) , # 517.591 GFlop/s
  Kernel_dnt_medium(m=26, n=5, k=7, tile_m=1, tile_n=3, threads=64, grouping=22, minblocks=1) , # 525.335 GFlop/s
  Kernel_dnt_medium(m=26, n=5, k=9, tile_m=1, tile_n=3, threads=64, grouping=26, minblocks=12) , # 550.268 GFlop/s
  Kernel_dnt_medium(m=26, n=5, k=12, tile_m=1, tile_n=3, threads=64, grouping=32, minblocks=11) , # 560.898 GFlop/s
  Kernel_dnt_medium(m=26, n=5, k=13, tile_m=1, tile_n=3, threads=96, grouping=29, minblocks=9) , # 549.663 GFlop/s
  Kernel_dnt_medium(m=26, n=5, k=24, tile_m=1, tile_n=2, threads=128, grouping=4, minblocks=4) , # 573.419 GFlop/s
  Kernel_dnt_medium(m=26, n=5, k=25, tile_m=1, tile_n=2, threads=128, grouping=4, minblocks=7) , # 570.821 GFlop/s
  Kernel_dnt_medium(m=26, n=5, k=26, tile_m=1, tile_n=2, threads=192, grouping=4, minblocks=7) , # 573.903 GFlop/s
  Kernel_dnt_medium(m=26, n=5, k=28, tile_m=1, tile_n=3, threads=160, grouping=24, minblocks=3) , # 573.438 GFlop/s
  Kernel_dnt_medium(m=26, n=5, k=32, tile_m=1, tile_n=1, threads=224, grouping=3, minblocks=6) , # 587.174 GFlop/s
  Kernel_dnt_medium(m=26, n=5, k=45, tile_m=1, tile_n=1, threads=256, grouping=3, minblocks=1) , # 590.001 GFlop/s
  Kernel_dnt_medium(m=26, n=7, k=4, tile_m=2, tile_n=4, threads=32, grouping=16, minblocks=1) , # 594.471 GFlop/s
  Kernel_dnt_medium(m=26, n=7, k=5, tile_m=3, tile_n=1, threads=64, grouping=22, minblocks=17) , # 613.113 GFlop/s
  Kernel_dnt_medium(m=26, n=7, k=7, tile_m=1, tile_n=4, threads=64, grouping=22, minblocks=6) , # 672.9 GFlop/s
  Kernel_dnt_medium(m=26, n=7, k=9, tile_m=3, tile_n=1, threads=64, grouping=26, minblocks=1) , # 697.663 GFlop/s
  Kernel_dnt_medium(m=26, n=7, k=13, tile_m=3, tile_n=1, threads=96, grouping=32, minblocks=7) , # 710.135 GFlop/s
  Kernel_dnt_medium(m=26, n=7, k=25, tile_m=1, tile_n=2, threads=192, grouping=4, minblocks=7) , # 730.136 GFlop/s
  Kernel_dnt_medium(m=26, n=7, k=26, tile_m=1, tile_n=2, threads=192, grouping=5, minblocks=3) , # 735.317 GFlop/s
  Kernel_dnt_medium(m=26, n=7, k=28, tile_m=1, tile_n=2, threads=224, grouping=4, minblocks=6) , # 737.392 GFlop/s
  Kernel_dnt_medium(m=26, n=7, k=32, tile_m=1, tile_n=2, threads=256, grouping=4, minblocks=5) , # 751.816 GFlop/s
  Kernel_dnt_medium(m=26, n=7, k=45, tile_m=1, tile_n=1, threads=192, grouping=3, minblocks=4) , # 754.379 GFlop/s
  Kernel_dnt_medium(m=26, n=9, k=4, tile_m=1, tile_n=5, threads=64, grouping=24, minblocks=4) , # 645.89 GFlop/s
  Kernel_dnt_medium(m=26, n=9, k=5, tile_m=1, tile_n=5, threads=64, grouping=26, minblocks=16) , # 708.797 GFlop/s
  Kernel_dnt_medium(m=26, n=9, k=7, tile_m=1, tile_n=5, threads=64, grouping=25, minblocks=15) , # 783.549 GFlop/s
  Kernel_dnt_medium(m=26, n=9, k=9, tile_m=1, tile_n=5, threads=96, grouping=32, minblocks=11) , # 797.252 GFlop/s
  Kernel_dnt_medium(m=26, n=9, k=13, tile_m=1, tile_n=3, threads=128, grouping=18, minblocks=4) , # 821.531 GFlop/s
  Kernel_dnt_medium(m=26, n=9, k=25, tile_m=1, tile_n=3, threads=128, grouping=4, minblocks=6) , # 862.508 GFlop/s
  Kernel_dnt_medium(m=26, n=9, k=26, tile_m=1, tile_n=3, threads=256, grouping=5, minblocks=5) , # 871.198 GFlop/s
  Kernel_dnt_medium(m=26, n=9, k=28, tile_m=1, tile_n=3, threads=128, grouping=4, minblocks=3) , # 877.813 GFlop/s
  Kernel_dnt_medium(m=26, n=9, k=32, tile_m=1, tile_n=3, threads=128, grouping=3, minblocks=5) , # 895.509 GFlop/s
  Kernel_dnt_largeDB2(m=26, n=9, k=45, tile_m=3, tile_n=1, w=10, v=6, threads=96, grouping=16, minblocks=8) , # 902.055 GFlop/s
  Kernel_dnt_medium(m=26, n=12, k=5, tile_m=1, tile_n=6, threads=64, grouping=24, minblocks=11) , # 853.834 GFlop/s
  Kernel_dnt_medium(m=26, n=12, k=12, tile_m=1, tile_n=2, threads=160, grouping=6, minblocks=11) , # 965.748 GFlop/s
  Kernel_dnt_medium(m=26, n=12, k=13, tile_m=1, tile_n=2, threads=160, grouping=6, minblocks=1) , # 969.942 GFlop/s
  Kernel_dnt_medium(m=26, n=12, k=26, tile_m=1, tile_n=2, threads=192, grouping=4, minblocks=6) , # 1059.98 GFlop/s
  Kernel_dnt_medium(m=26, n=12, k=32, tile_m=1, tile_n=2, threads=192, grouping=3, minblocks=3) , # 1091.93 GFlop/s
  Kernel_dnt_medium(m=26, n=13, k=4, tile_m=3, tile_n=2, threads=64, grouping=26, minblocks=8) , # 799.987 GFlop/s
  Kernel_dnt_medium(m=26, n=13, k=5, tile_m=3, tile_n=2, threads=64, grouping=26, minblocks=2) , # 797.258 GFlop/s
  Kernel_dnt_medium(m=26, n=13, k=7, tile_m=1, tile_n=5, threads=96, grouping=26, minblocks=13) , # 922.13 GFlop/s
  Kernel_dnt_medium(m=26, n=13, k=9, tile_m=1, tile_n=5, threads=128, grouping=32, minblocks=10) , # 949.735 GFlop/s
  Kernel_dnt_medium(m=26, n=13, k=12, tile_m=1, tile_n=5, threads=96, grouping=32, minblocks=10) , # 997.362 GFlop/s
  Kernel_dnt_medium(m=26, n=13, k=13, tile_m=1, tile_n=5, threads=96, grouping=6, minblocks=11) , # 991.96 GFlop/s
  Kernel_dnt_medium(m=26, n=13, k=14, tile_m=1, tile_n=4, threads=128, grouping=24, minblocks=6) , # 1008.41 GFlop/s
  Kernel_dnt_largeDB2(m=26, n=13, k=24, tile_m=3, tile_n=2, w=12, v=12, threads=64, grouping=16, minblocks=1) , # 1092.08 GFlop/s
  Kernel_dnt_medium(m=26, n=13, k=25, tile_m=1, tile_n=2, threads=192, grouping=4, minblocks=5) , # 1088.18 GFlop/s
  Kernel_dnt_medium(m=26, n=13, k=26, tile_m=1, tile_n=2, threads=192, grouping=4, minblocks=3) , # 1101.37 GFlop/s
  Kernel_dnt_largeDB2(m=26, n=13, k=28, tile_m=3, tile_n=2, w=14, v=8, threads=64, grouping=16, minblocks=2) , # 1105.61 GFlop/s
  Kernel_dnt_largeDB2(m=26, n=13, k=32, tile_m=3, tile_n=2, w=14, v=12, threads=64, grouping=16, minblocks=8) , # 1128.15 GFlop/s
  Kernel_dnt_largeDB2(m=26, n=13, k=45, tile_m=3, tile_n=2, w=12, v=8, threads=64, grouping=16, minblocks=1) , # 1156.84 GFlop/s
  Kernel_dnt_medium(m=26, n=14, k=13, tile_m=1, tile_n=5, threads=96, grouping=4, minblocks=11) , # 1046.07 GFlop/s
  Kernel_dnt_medium(m=26, n=14, k=14, tile_m=1, tile_n=5, threads=128, grouping=5, minblocks=9) , # 1076.05 GFlop/s
  Kernel_dnt_medium(m=26, n=14, k=25, tile_m=1, tile_n=2, threads=192, grouping=4, minblocks=2) , # 1153.33 GFlop/s
  Kernel_dnt_medium(m=26, n=14, k=26, tile_m=1, tile_n=2, threads=192, grouping=4, minblocks=2) , # 1159.6 GFlop/s
  Kernel_dnt_largeDB2(m=26, n=14, k=32, tile_m=3, tile_n=2, w=16, v=14, threads=64, grouping=16, minblocks=4) , # 1196.64 GFlop/s
  Kernel_dnt_medium(m=26, n=24, k=5, tile_m=1, tile_n=4, threads=160, grouping=32, minblocks=9) , # 1078.8 GFlop/s
  Kernel_dnt_medium(m=26, n=24, k=13, tile_m=1, tile_n=6, threads=128, grouping=4, minblocks=7) , # 1387.5 GFlop/s
  Kernel_dnt_medium(m=26, n=24, k=24, tile_m=1, tile_n=4, threads=160, grouping=3, minblocks=5) , # 1530.31 GFlop/s
  Kernel_dnt_medium(m=26, n=24, k=26, tile_m=1, tile_n=4, threads=192, grouping=24, minblocks=4) , # 1507.25 GFlop/s
  Kernel_dnt_medium(m=26, n=24, k=32, tile_m=1, tile_n=4, threads=256, grouping=32, minblocks=3) , # 1570.75 GFlop/s
  Kernel_dnt_medium(m=26, n=25, k=4, tile_m=3, tile_n=4, threads=64, grouping=32, minblocks=8) , # 973.176 GFlop/s
  Kernel_dnt_medium(m=26, n=25, k=5, tile_m=1, tile_n=5, threads=160, grouping=13, minblocks=8) , # 1040.55 GFlop/s
  Kernel_dnt_medium(m=26, n=25, k=7, tile_m=1, tile_n=4, threads=192, grouping=18, minblocks=8) , # 1170.93 GFlop/s
  Kernel_dnt_medium(m=26, n=25, k=9, tile_m=3, tile_n=4, threads=64, grouping=26, minblocks=3) , # 1241.28 GFlop/s
  Kernel_dnt_medium(m=26, n=25, k=13, tile_m=1, tile_n=5, threads=160, grouping=6, minblocks=6) , # 1351.85 GFlop/s
  Kernel_dnt_medium(m=26, n=25, k=14, tile_m=1, tile_n=5, threads=192, grouping=29, minblocks=6) , # 1386.15 GFlop/s
  Kernel_dnt_medium(m=26, n=25, k=25, tile_m=3, tile_n=4, threads=64, grouping=30, minblocks=4) , # 1508.73 GFlop/s
  Kernel_dnt_medium(m=26, n=25, k=26, tile_m=3, tile_n=5, threads=64, grouping=29, minblocks=2) , # 1524.65 GFlop/s
  Kernel_dnt_medium(m=26, n=25, k=28, tile_m=1, tile_n=5, threads=192, grouping=24, minblocks=4) , # 1536.17 GFlop/s
  Kernel_dnt_largeDB2(m=26, n=25, k=32, tile_m=3, tile_n=4, w=8, v=22, threads=64, grouping=16, minblocks=8) , # 1588.12 GFlop/s
  Kernel_dnt_largeDB2(m=26, n=25, k=45, tile_m=3, tile_n=4, w=18, v=20, threads=64, grouping=16, minblocks=4) , # 1619.09 GFlop/s
  Kernel_dnt_medium(m=26, n=26, k=4, tile_m=3, tile_n=4, threads=64, grouping=32, minblocks=5) , # 1064.38 GFlop/s
  Kernel_dnt_medium(m=26, n=26, k=5, tile_m=3, tile_n=2, threads=160, grouping=9, minblocks=8) , # 1092.54 GFlop/s
  Kernel_dnt_medium(m=26, n=26, k=7, tile_m=1, tile_n=4, threads=192, grouping=9, minblocks=8) , # 1229.79 GFlop/s
  Kernel_dnt_medium(m=26, n=26, k=9, tile_m=1, tile_n=5, threads=160, grouping=9, minblocks=7) , # 1308.23 GFlop/s
  Kernel_dnt_medium(m=26, n=26, k=12, tile_m=1, tile_n=5, threads=160, grouping=5, minblocks=7) , # 1442.33 GFlop/s
  Kernel_dnt_medium(m=26, n=26, k=13, tile_m=1, tile_n=5, threads=160, grouping=5, minblocks=6) , # 1449.82 GFlop/s
  Kernel_dnt_medium(m=26, n=26, k=14, tile_m=1, tile_n=5, threads=160, grouping=5, minblocks=6) , # 1481.84 GFlop/s
  Kernel_dnt_medium(m=26, n=26, k=24, tile_m=3, tile_n=4, threads=64, grouping=3, minblocks=4) , # 1576.45 GFlop/s
  Kernel_dnt_medium(m=26, n=26, k=25, tile_m=1, tile_n=4, threads=192, grouping=32, minblocks=1) , # 1587.9 GFlop/s
  Kernel_dnt_medium(m=26, n=26, k=26, tile_m=1, tile_n=4, threads=192, grouping=4, minblocks=4) , # 1597.11 GFlop/s
  Kernel_dnt_medium(m=26, n=26, k=28, tile_m=3, tile_n=4, threads=64, grouping=3, minblocks=2) , # 1611.35 GFlop/s
  Kernel_dnt_medium(m=26, n=26, k=32, tile_m=3, tile_n=4, threads=64, grouping=3, minblocks=2) , # 1634.06 GFlop/s
  Kernel_dnt_largeDB2(m=26, n=26, k=45, tile_m=3, tile_n=4, w=18, v=14, threads=64, grouping=16, minblocks=4) , # 1658.01 GFlop/s
  Kernel_dnt_medium(m=26, n=28, k=4, tile_m=3, tile_n=4, threads=64, grouping=32, minblocks=2) , # 1060.2 GFlop/s
  Kernel_dnt_medium(m=26, n=28, k=5, tile_m=1, tile_n=5, threads=160, grouping=9, minblocks=8) , # 1126.12 GFlop/s
  Kernel_dnt_medium(m=26, n=28, k=7, tile_m=1, tile_n=4, threads=224, grouping=27, minblocks=6) , # 1233.99 GFlop/s
  Kernel_dnt_medium(m=26, n=28, k=9, tile_m=3, tile_n=4, threads=64, grouping=25, minblocks=3) , # 1333.66 GFlop/s
  Kernel_dnt_medium(m=26, n=28, k=13, tile_m=1, tile_n=5, threads=160, grouping=5, minblocks=6) , # 1462.23 GFlop/s
  Kernel_dnt_largeDB2(m=26, n=28, k=25, tile_m=3, tile_n=4, w=8, v=28, threads=64, grouping=16, minblocks=1) , # 1614.7 GFlop/s
  Kernel_dnt_largeDB2(m=26, n=28, k=26, tile_m=3, tile_n=4, w=8, v=28, threads=64, grouping=16, minblocks=1) , # 1634.02 GFlop/s
  Kernel_dnt_largeDB2(m=26, n=28, k=28, tile_m=3, tile_n=4, w=8, v=28, threads=64, grouping=16, minblocks=2) , # 1644.78 GFlop/s
  Kernel_dnt_largeDB2(m=26, n=28, k=32, tile_m=3, tile_n=4, w=8, v=28, threads=64, grouping=16, minblocks=1) , # 1687.3 GFlop/s
  Kernel_dnt_largeDB2(m=26, n=28, k=45, tile_m=3, tile_n=3, w=16, v=28, threads=96, grouping=16, minblocks=1) , # 1724.14 GFlop/s
  Kernel_dnt_medium(m=26, n=32, k=4, tile_m=1, tile_n=6, threads=160, grouping=12, minblocks=7) , # 1067.11 GFlop/s
  Kernel_dnt_medium(m=26, n=32, k=5, tile_m=1, tile_n=6, threads=192, grouping=12, minblocks=6) , # 1162.16 GFlop/s
  Kernel_dnt_medium(m=26, n=32, k=7, tile_m=3, tile_n=5, threads=64, grouping=26, minblocks=6) , # 1279.84 GFlop/s
  Kernel_dnt_medium(m=26, n=32, k=9, tile_m=1, tile_n=6, threads=160, grouping=10, minblocks=7) , # 1389.5 GFlop/s
  Kernel_dnt_largeDB2(m=26, n=32, k=12, tile_m=7, tile_n=2, w=6, v=32, threads=64, grouping=16, minblocks=1) , # 1501.3 GFlop/s
  Kernel_dnt_medium(m=26, n=32, k=13, tile_m=1, tile_n=6, threads=160, grouping=5, minblocks=6) , # 1539.69 GFlop/s
  Kernel_dnt_medium(m=26, n=32, k=14, tile_m=1, tile_n=6, threads=160, grouping=5, minblocks=6) , # 1590.16 GFlop/s
  Kernel_dnt_medium(m=26, n=32, k=24, tile_m=3, tile_n=5, threads=64, grouping=3, minblocks=3) , # 1705.11 GFlop/s
  Kernel_dnt_medium(m=26, n=32, k=25, tile_m=3, tile_n=5, threads=64, grouping=3, minblocks=2) , # 1687.93 GFlop/s
  Kernel_dnt_medium(m=26, n=32, k=26, tile_m=3, tile_n=5, threads=64, grouping=3, minblocks=2) , # 1732.07 GFlop/s
  Kernel_dnt_largeDB2(m=26, n=32, k=28, tile_m=3, tile_n=5, w=14, v=26, threads=64, grouping=16, minblocks=1) , # 1728.22 GFlop/s
  Kernel_dnt_largeDB2(m=26, n=32, k=32, tile_m=3, tile_n=5, w=8, v=32, threads=64, grouping=16, minblocks=4) , # 1778.97 GFlop/s
  Kernel_dnt_largeDB2(m=26, n=32, k=45, tile_m=3, tile_n=5, w=16, v=30, threads=64, grouping=16, minblocks=1) , # 1831.59 GFlop/s
  Kernel_dnt_medium(m=26, n=45, k=4, tile_m=1, tile_n=6, threads=224, grouping=29, minblocks=5) , # 1073.89 GFlop/s
  Kernel_dnt_medium(m=26, n=45, k=5, tile_m=1, tile_n=5, threads=256, grouping=29, minblocks=5) , # 1204.58 GFlop/s
  Kernel_dnt_medium(m=26, n=45, k=7, tile_m=3, tile_n=5, threads=96, grouping=25, minblocks=3) , # 1351.98 GFlop/s
  Kernel_dnt_medium(m=26, n=45, k=9, tile_m=1, tile_n=5, threads=256, grouping=29, minblocks=5) , # 1490.07 GFlop/s
  Kernel_dnt_medium(m=26, n=45, k=13, tile_m=3, tile_n=5, threads=96, grouping=24, minblocks=2) , # 1646.86 GFlop/s
  Kernel_dnt_medium(m=26, n=45, k=25, tile_m=1, tile_n=5, threads=256, grouping=32, minblocks=3) , # 1858.51 GFlop/s
  Kernel_dnt_largeDB2(m=26, n=45, k=26, tile_m=7, tile_n=3, w=8, v=42, threads=64, grouping=16, minblocks=2) , # 1878.25 GFlop/s
  Kernel_dnt_largeDB2(m=26, n=45, k=28, tile_m=7, tile_n=3, w=8, v=38, threads=64, grouping=16, minblocks=2) , # 1924.64 GFlop/s
  Kernel_dnt_largeDB2(m=26, n=45, k=32, tile_m=7, tile_n=3, w=8, v=22, threads=64, grouping=16, minblocks=1) , # 1985.28 GFlop/s
  Kernel_dnt_largeDB2(m=26, n=45, k=45, tile_m=7, tile_n=3, w=14, v=36, threads=64, grouping=16, minblocks=4) , # 2060.31 GFlop/s
  Kernel_dnt_medium(m=27, n=27, k=27, tile_m=3, tile_n=4, threads=64, grouping=3, minblocks=3) , # 1627.88 GFlop/s
  Kernel_dnt_medium(m=28, n=4, k=4, tile_m=1, tile_n=5, threads=32, grouping=13, minblocks=23) , # 461.401 GFlop/s
  Kernel_dnt_medium(m=28, n=4, k=5, tile_m=1, tile_n=5, threads=32, grouping=13, minblocks=21) , # 448.695 GFlop/s
  Kernel_dnt_medium(m=28, n=4, k=7, tile_m=1, tile_n=2, threads=64, grouping=24, minblocks=7) , # 463.739 GFlop/s
  Kernel_dnt_medium(m=28, n=4, k=9, tile_m=1, tile_n=2, threads=64, grouping=5, minblocks=2) , # 475.527 GFlop/s
  Kernel_dnt_medium(m=28, n=4, k=13, tile_m=1, tile_n=2, threads=64, grouping=29, minblocks=6) , # 481.891 GFlop/s
  Kernel_dnt_medium(m=28, n=4, k=25, tile_m=1, tile_n=2, threads=128, grouping=4, minblocks=4) , # 486.812 GFlop/s
  Kernel_dnt_medium(m=28, n=4, k=26, tile_m=1, tile_n=2, threads=128, grouping=32, minblocks=1) , # 486.174 GFlop/s
  Kernel_dnt_medium(m=28, n=4, k=28, tile_m=1, tile_n=2, threads=128, grouping=6, minblocks=3) , # 487.386 GFlop/s
  Kernel_dnt_medium(m=28, n=4, k=32, tile_m=1, tile_n=1, threads=192, grouping=4, minblocks=1) , # 493.569 GFlop/s
  Kernel_dnt_medium(m=28, n=4, k=45, tile_m=1, tile_n=1, threads=256, grouping=2, minblocks=3) , # 499.71 GFlop/s
  Kernel_dnt_medium(m=28, n=5, k=4, tile_m=1, tile_n=5, threads=32, grouping=15, minblocks=13) , # 512.554 GFlop/s
  Kernel_dnt_medium(m=28, n=5, k=5, tile_m=1, tile_n=5, threads=32, grouping=16, minblocks=24) , # 527.04 GFlop/s
  Kernel_dnt_medium(m=28, n=5, k=7, tile_m=1, tile_n=5, threads=64, grouping=22, minblocks=17) , # 539.965 GFlop/s
  Kernel_dnt_medium(m=28, n=5, k=9, tile_m=3, tile_n=1, threads=64, grouping=26, minblocks=12) , # 563.57 GFlop/s
  Kernel_dnt_medium(m=28, n=5, k=13, tile_m=1, tile_n=3, threads=96, grouping=29, minblocks=3) , # 557.175 GFlop/s
  Kernel_dnt_medium(m=28, n=5, k=25, tile_m=1, tile_n=3, threads=192, grouping=4, minblocks=7) , # 578.247 GFlop/s
  Kernel_dnt_medium(m=28, n=5, k=26, tile_m=1, tile_n=2, threads=160, grouping=5, minblocks=3) , # 579.676 GFlop/s
  Kernel_dnt_medium(m=28, n=5, k=28, tile_m=1, tile_n=1, threads=160, grouping=4, minblocks=1) , # 584.008 GFlop/s
  Kernel_dnt_medium(m=28, n=5, k=32, tile_m=1, tile_n=2, threads=192, grouping=4, minblocks=3) , # 594.508 GFlop/s
  Kernel_dnt_medium(m=28, n=5, k=45, tile_m=1, tile_n=1, threads=256, grouping=3, minblocks=3) , # 598.534 GFlop/s
  Kernel_dnt_medium(m=28, n=7, k=4, tile_m=2, tile_n=4, threads=32, grouping=17, minblocks=10) , # 622.45 GFlop/s
  Kernel_dnt_medium(m=28, n=7, k=5, tile_m=1, tile_n=4, threads=64, grouping=22, minblocks=7) , # 627.062 GFlop/s
  Kernel_dnt_medium(m=28, n=7, k=7, tile_m=1, tile_n=5, threads=64, grouping=26, minblocks=9) , # 683.291 GFlop/s
  Kernel_dnt_medium(m=28, n=7, k=9, tile_m=1, tile_n=4, threads=64, grouping=24, minblocks=1) , # 714.393 GFlop/s
  Kernel_dnt_medium(m=28, n=7, k=13, tile_m=1, tile_n=3, threads=96, grouping=29, minblocks=9) , # 722.039 GFlop/s
  Kernel_dnt_medium(m=28, n=7, k=25, tile_m=1, tile_n=2, threads=192, grouping=4, minblocks=6) , # 745.291 GFlop/s
  Kernel_dnt_medium(m=28, n=7, k=26, tile_m=1, tile_n=2, threads=192, grouping=4, minblocks=4) , # 747.448 GFlop/s
  Kernel_dnt_medium(m=28, n=7, k=28, tile_m=1, tile_n=2, threads=224, grouping=3, minblocks=6) , # 754.047 GFlop/s
  Kernel_dnt_medium(m=28, n=7, k=32, tile_m=1, tile_n=2, threads=256, grouping=4, minblocks=5) , # 768.773 GFlop/s
  Kernel_dnt_medium(m=28, n=7, k=45, tile_m=1, tile_n=2, threads=192, grouping=3, minblocks=2) , # 768.667 GFlop/s
  Kernel_dnt_medium(m=28, n=9, k=4, tile_m=1, tile_n=5, threads=64, grouping=24, minblocks=11) , # 686.434 GFlop/s
  Kernel_dnt_medium(m=28, n=9, k=5, tile_m=1, tile_n=5, threads=64, grouping=22, minblocks=19) , # 738.32 GFlop/s
  Kernel_dnt_medium(m=28, n=9, k=7, tile_m=1, tile_n=5, threads=64, grouping=26, minblocks=16) , # 808.579 GFlop/s
  Kernel_dnt_medium(m=28, n=9, k=9, tile_m=1, tile_n=3, threads=96, grouping=32, minblocks=3) , # 825.886 GFlop/s
  Kernel_dnt_medium(m=28, n=9, k=13, tile_m=1, tile_n=3, threads=128, grouping=32, minblocks=9) , # 844.884 GFlop/s
  Kernel_dnt_medium(m=28, n=9, k=25, tile_m=1, tile_n=3, threads=256, grouping=4, minblocks=6) , # 895.124 GFlop/s
  Kernel_dnt_medium(m=28, n=9, k=26, tile_m=1, tile_n=3, threads=256, grouping=4, minblocks=6) , # 891.424 GFlop/s
  Kernel_dnt_medium(m=28, n=9, k=28, tile_m=1, tile_n=2, threads=256, grouping=5, minblocks=5) , # 897.496 GFlop/s
  Kernel_dnt_medium(m=28, n=9, k=32, tile_m=1, tile_n=3, threads=128, grouping=3, minblocks=3) , # 918.668 GFlop/s
  Kernel_dnt_largeDB2(m=28, n=9, k=45, tile_m=1, tile_n=3, w=10, v=4, threads=96, grouping=16, minblocks=4) , # 930.129 GFlop/s
  Kernel_dnt_medium(m=28, n=13, k=4, tile_m=2, tile_n=4, threads=64, grouping=26, minblocks=7) , # 836.442 GFlop/s
  Kernel_dnt_medium(m=28, n=13, k=5, tile_m=1, tile_n=5, threads=96, grouping=12, minblocks=12) , # 827.406 GFlop/s
  Kernel_dnt_medium(m=28, n=13, k=7, tile_m=1, tile_n=5, threads=96, grouping=26, minblocks=11) , # 960.632 GFlop/s
  Kernel_dnt_medium(m=28, n=13, k=9, tile_m=1, tile_n=5, threads=128, grouping=32, minblocks=9) , # 996.46 GFlop/s
  Kernel_dnt_medium(m=28, n=13, k=13, tile_m=1, tile_n=5, threads=96, grouping=4, minblocks=10) , # 1030.59 GFlop/s
  Kernel_dnt_medium(m=28, n=13, k=25, tile_m=1, tile_n=3, threads=192, grouping=5, minblocks=5) , # 1115.36 GFlop/s
  Kernel_dnt_medium(m=28, n=13, k=26, tile_m=1, tile_n=3, threads=160, grouping=4, minblocks=4) , # 1114.24 GFlop/s
  Kernel_dnt_largeDB2(m=28, n=13, k=28, tile_m=1, tile_n=4, w=10, v=12, threads=128, grouping=16, minblocks=2) , # 1122.82 GFlop/s
  Kernel_dnt_medium(m=28, n=13, k=32, tile_m=1, tile_n=5, threads=128, grouping=24, minblocks=4) , # 1145.54 GFlop/s
  Kernel_dnt_largeDB2(m=28, n=13, k=45, tile_m=1, tile_n=7, w=12, v=8, threads=64, grouping=16, minblocks=8) , # 1176.78 GFlop/s
  Kernel_dnt_medium(m=28, n=25, k=4, tile_m=4, tile_n=3, threads=64, grouping=32, minblocks=3) , # 1029.26 GFlop/s
  Kernel_dnt_medium(m=28, n=25, k=5, tile_m=1, tile_n=5, threads=160, grouping=9, minblocks=8) , # 1103.4 GFlop/s
  Kernel_dnt_medium(m=28, n=25, k=7, tile_m=5, tile_n=3, threads=64, grouping=24, minblocks=2) , # 1185.57 GFlop/s
  Kernel_dnt_medium(m=28, n=25, k=9, tile_m=1, tile_n=5, threads=160, grouping=10, minblocks=7) , # 1283.85 GFlop/s
  Kernel_dnt_medium(m=28, n=25, k=13, tile_m=1, tile_n=5, threads=160, grouping=29, minblocks=5) , # 1423.58 GFlop/s
  Kernel_dnt_medium(m=28, n=25, k=25, tile_m=3, tile_n=5, threads=64, grouping=3, minblocks=3) , # 1582.11 GFlop/s
  Kernel_dnt_medium(m=28, n=25, k=26, tile_m=3, tile_n=5, threads=64, grouping=3, minblocks=4) , # 1581.68 GFlop/s
  Kernel_dnt_medium(m=28, n=25, k=28, tile_m=1, tile_n=5, threads=160, grouping=24, minblocks=4) , # 1601.02 GFlop/s
  Kernel_dnt_medium(m=28, n=25, k=32, tile_m=3, tile_n=3, threads=128, grouping=32, minblocks=1) , # 1631.38 GFlop/s
  Kernel_dnt_largeDB2(m=28, n=25, k=45, tile_m=3, tile_n=5, w=12, v=14, threads=64, grouping=16, minblocks=4) , # 1683.73 GFlop/s
  Kernel_dnt_medium(m=28, n=26, k=4, tile_m=4, tile_n=3, threads=64, grouping=32, minblocks=2) , # 1046.4 GFlop/s
  Kernel_dnt_medium(m=28, n=26, k=5, tile_m=1, tile_n=6, threads=160, grouping=9, minblocks=8) , # 1116.46 GFlop/s
  Kernel_dnt_medium(m=28, n=26, k=7, tile_m=5, tile_n=3, threads=64, grouping=25, minblocks=4) , # 1206.91 GFlop/s
  Kernel_dnt_medium(m=28, n=26, k=9, tile_m=5, tile_n=3, threads=64, grouping=21, minblocks=2) , # 1308.19 GFlop/s
  Kernel_dnt_medium(m=28, n=26, k=13, tile_m=3, tile_n=5, threads=64, grouping=5, minblocks=5) , # 1440.64 GFlop/s
  Kernel_dnt_medium(m=28, n=26, k=25, tile_m=3, tile_n=5, threads=64, grouping=3, minblocks=2) , # 1615.94 GFlop/s
  Kernel_dnt_medium(m=28, n=26, k=26, tile_m=3, tile_n=5, threads=64, grouping=3, minblocks=3) , # 1620.78 GFlop/s
  Kernel_dnt_medium(m=28, n=26, k=28, tile_m=3, tile_n=3, threads=96, grouping=3, minblocks=1) , # 1637.82 GFlop/s
  Kernel_dnt_medium(m=28, n=26, k=32, tile_m=3, tile_n=3, threads=128, grouping=32, minblocks=2) , # 1665.27 GFlop/s
  Kernel_dnt_largeDB2(m=28, n=26, k=45, tile_m=3, tile_n=3, w=16, v=26, threads=96, grouping=16, minblocks=2) , # 1738.17 GFlop/s
  Kernel_dnt_medium(m=28, n=28, k=4, tile_m=2, tile_n=4, threads=128, grouping=24, minblocks=3) , # 1053.5 GFlop/s
  Kernel_dnt_medium(m=28, n=28, k=5, tile_m=1, tile_n=6, threads=160, grouping=9, minblocks=7) , # 1153.08 GFlop/s
  Kernel_dnt_medium(m=28, n=28, k=7, tile_m=5, tile_n=3, threads=64, grouping=25, minblocks=3) , # 1291.9 GFlop/s
  Kernel_dnt_medium(m=28, n=28, k=9, tile_m=5, tile_n=3, threads=64, grouping=29, minblocks=3) , # 1407.79 GFlop/s
  Kernel_dnt_medium(m=28, n=28, k=13, tile_m=1, tile_n=6, threads=160, grouping=5, minblocks=6) , # 1560.57 GFlop/s
  Kernel_dnt_medium(m=28, n=28, k=25, tile_m=3, tile_n=5, threads=64, grouping=3, minblocks=3) , # 1712.97 GFlop/s
  Kernel_dnt_medium(m=28, n=28, k=26, tile_m=3, tile_n=5, threads=64, grouping=3, minblocks=2) , # 1708.21 GFlop/s
  Kernel_dnt_medium(m=28, n=28, k=28, tile_m=3, tile_n=5, threads=64, grouping=3, minblocks=3) , # 1700.13 GFlop/s
  Kernel_dnt_medium(m=28, n=28, k=32, tile_m=3, tile_n=5, threads=96, grouping=3, minblocks=2) , # 1753.32 GFlop/s
  Kernel_dnt_largeDB2(m=28, n=28, k=45, tile_m=5, tile_n=3, w=8, v=14, threads=64, grouping=16, minblocks=4) , # 1784.75 GFlop/s
  Kernel_dnt_medium(m=28, n=32, k=4, tile_m=3, tile_n=2, threads=160, grouping=29, minblocks=5) , # 1074.89 GFlop/s
  Kernel_dnt_medium(m=28, n=32, k=5, tile_m=1, tile_n=4, threads=224, grouping=9, minblocks=6) , # 1179.11 GFlop/s
  Kernel_dnt_medium(m=28, n=32, k=7, tile_m=1, tile_n=4, threads=256, grouping=29, minblocks=5) , # 1331.58 GFlop/s
  Kernel_dnt_medium(m=28, n=32, k=9, tile_m=4, tile_n=4, threads=64, grouping=29, minblocks=2) , # 1423.42 GFlop/s
  Kernel_dnt_medium(m=28, n=32, k=13, tile_m=3, tile_n=2, threads=160, grouping=29, minblocks=5) , # 1552.86 GFlop/s
  Kernel_dnt_medium(m=28, n=32, k=25, tile_m=1, tile_n=4, threads=224, grouping=4, minblocks=4) , # 1787.13 GFlop/s
  Kernel_dnt_medium(m=28, n=32, k=26, tile_m=1, tile_n=4, threads=256, grouping=32, minblocks=3) , # 1777.2 GFlop/s
  Kernel_dnt_medium(m=28, n=32, k=28, tile_m=5, tile_n=2, threads=96, grouping=3, minblocks=2) , # 1795.81 GFlop/s
  Kernel_dnt_largeDB2(m=28, n=32, k=32, tile_m=5, tile_n=2, w=16, v=18, threads=96, grouping=16, minblocks=4) , # 1849.83 GFlop/s
  Kernel_dnt_largeDB2(m=28, n=32, k=45, tile_m=3, tile_n=6, w=10, v=16, threads=64, grouping=16, minblocks=1) , # 1909.3 GFlop/s
  Kernel_dnt_medium(m=28, n=45, k=4, tile_m=3, tile_n=5, threads=96, grouping=29, minblocks=1) , # 1163.62 GFlop/s
  Kernel_dnt_medium(m=28, n=45, k=5, tile_m=1, tile_n=5, threads=256, grouping=16, minblocks=4) , # 1262.25 GFlop/s
  Kernel_dnt_medium(m=28, n=45, k=7, tile_m=3, tile_n=5, threads=96, grouping=25, minblocks=4) , # 1441 GFlop/s
  Kernel_dnt_medium(m=28, n=45, k=9, tile_m=3, tile_n=4, threads=128, grouping=32, minblocks=1) , # 1569.32 GFlop/s
  Kernel_dnt_medium(m=28, n=45, k=13, tile_m=3, tile_n=5, threads=96, grouping=24, minblocks=4) , # 1754.86 GFlop/s
  Kernel_dnt_medium(m=28, n=45, k=25, tile_m=3, tile_n=4, threads=128, grouping=32, minblocks=2) , # 1976.24 GFlop/s
  Kernel_dnt_medium(m=28, n=45, k=26, tile_m=3, tile_n=4, threads=128, grouping=32, minblocks=3) , # 1998.55 GFlop/s
  Kernel_dnt_largeDB2(m=28, n=45, k=28, tile_m=7, tile_n=3, w=8, v=20, threads=64, grouping=16, minblocks=4) , # 2023.14 GFlop/s
  Kernel_dnt_largeDB2(m=28, n=45, k=32, tile_m=7, tile_n=3, w=8, v=20, threads=64, grouping=16, minblocks=1) , # 2082.87 GFlop/s
  Kernel_dnt_largeDB2(m=28, n=45, k=45, tile_m=7, tile_n=3, w=8, v=40, threads=64, grouping=16, minblocks=2) , # 2164.99 GFlop/s
  Kernel_dnt_medium(m=29, n=14, k=14, tile_m=1, tile_n=5, threads=128, grouping=6, minblocks=9) , # 1094.19 GFlop/s
  Kernel_dnt_medium(m=29, n=14, k=16, tile_m=1, tile_n=5, threads=128, grouping=5, minblocks=8) , # 1132.98 GFlop/s
  Kernel_dnt_largeDB2(m=29, n=14, k=29, tile_m=3, tile_n=3, w=12, v=14, threads=64, grouping=16, minblocks=8) , # 1191.43 GFlop/s
  Kernel_dnt_medium(m=29, n=14, k=32, tile_m=3, tile_n=3, threads=64, grouping=29, minblocks=1) , # 1211.15 GFlop/s
  Kernel_dnt_medium(m=29, n=16, k=14, tile_m=1, tile_n=4, threads=128, grouping=4, minblocks=9) , # 1200.6 GFlop/s
  Kernel_dnt_medium(m=29, n=16, k=16, tile_m=1, tile_n=4, threads=128, grouping=4, minblocks=8) , # 1237.1 GFlop/s
  Kernel_dnt_largeDB2(m=29, n=16, k=29, tile_m=3, tile_n=1, w=10, v=14, threads=160, grouping=16, minblocks=1) , # 1302.52 GFlop/s
  Kernel_dnt_largeDB2(m=29, n=16, k=55, tile_m=3, tile_n=3, w=14, v=16, threads=64, grouping=16, minblocks=1) , # 1381.48 GFlop/s
  Kernel_dnt_medium(m=29, n=29, k=14, tile_m=3, tile_n=2, threads=160, grouping=32, minblocks=1) , # 1562.65 GFlop/s
  Kernel_dnt_medium(m=29, n=29, k=16, tile_m=3, tile_n=2, threads=160, grouping=32, minblocks=1) , # 1616.32 GFlop/s
  Kernel_dnt_medium(m=29, n=29, k=29, tile_m=3, tile_n=5, threads=96, grouping=24, minblocks=3) , # 1736.13 GFlop/s
  Kernel_dnt_medium(m=29, n=29, k=32, tile_m=3, tile_n=5, threads=96, grouping=3, minblocks=2) , # 1807.51 GFlop/s
  Kernel_dnt_largeDB2(m=29, n=29, k=55, tile_m=3, tile_n=5, w=12, v=26, threads=64, grouping=16, minblocks=1) , # 1881.61 GFlop/s
  Kernel_dnt_medium(m=29, n=32, k=14, tile_m=3, tile_n=2, threads=192, grouping=29, minblocks=5) , # 1630.9 GFlop/s
  Kernel_dnt_largeDB2(m=29, n=32, k=29, tile_m=5, tile_n=2, w=12, v=28, threads=96, grouping=16, minblocks=1) , # 1818.21 GFlop/s
  Kernel_dnt_medium(m=29, n=32, k=32, tile_m=3, tile_n=3, threads=128, grouping=32, minblocks=2) , # 1865.52 GFlop/s
  Kernel_dnt_largeDB2(m=29, n=32, k=55, tile_m=3, tile_n=6, w=10, v=32, threads=64, grouping=16, minblocks=4) , # 1977.23 GFlop/s
  Kernel_dnt_medium(m=29, n=55, k=16, tile_m=3, tile_n=5, threads=128, grouping=32, minblocks=1) , # 1958.07 GFlop/s
  Kernel_dnt_medium(m=29, n=55, k=29, tile_m=3, tile_n=5, threads=128, grouping=32, minblocks=1) , # 2173.7 GFlop/s
  Kernel_dnt_largeDB2(m=29, n=55, k=32, tile_m=4, tile_n=7, w=8, v=36, threads=64, grouping=16, minblocks=4) , # 2227.18 GFlop/s
  Kernel_dnt_largeDB2(m=29, n=55, k=55, tile_m=5, tile_n=6, w=6, v=36, threads=64, grouping=16, minblocks=4) , # 2364.15 GFlop/s
  Kernel_dnt_medium(m=30, n=30, k=30, tile_m=5, tile_n=3, threads=64, grouping=3, minblocks=1) , # 1838.1 GFlop/s
  Kernel_dnt_medium(m=31, n=31, k=31, tile_m=4, tile_n=4, threads=64, grouping=24, minblocks=1) , # 1830.67 GFlop/s
  Kernel_dnt_medium(m=32, n=4, k=4, tile_m=1, tile_n=6, threads=32, grouping=13, minblocks=2) , # 463.773 GFlop/s
  Kernel_dnt_medium(m=32, n=4, k=5, tile_m=1, tile_n=4, threads=32, grouping=16, minblocks=13) , # 469.138 GFlop/s
  Kernel_dnt_medium(m=32, n=4, k=7, tile_m=1, tile_n=2, threads=64, grouping=24, minblocks=7) , # 479.728 GFlop/s
  Kernel_dnt_medium(m=32, n=4, k=9, tile_m=1, tile_n=2, threads=64, grouping=6, minblocks=15) , # 482.038 GFlop/s
  Kernel_dnt_medium(m=32, n=4, k=13, tile_m=1, tile_n=2, threads=64, grouping=26, minblocks=11) , # 493.254 GFlop/s
  Kernel_dnt_medium(m=32, n=4, k=25, tile_m=1, tile_n=1, threads=160, grouping=4, minblocks=5) , # 498.236 GFlop/s
  Kernel_dnt_medium(m=32, n=4, k=26, tile_m=1, tile_n=1, threads=192, grouping=3, minblocks=5) , # 498.907 GFlop/s
  Kernel_dnt_medium(m=32, n=4, k=28, tile_m=1, tile_n=2, threads=256, grouping=4, minblocks=6) , # 503.858 GFlop/s
  Kernel_dnt_medium(m=32, n=4, k=32, tile_m=1, tile_n=1, threads=256, grouping=3, minblocks=1) , # 510.352 GFlop/s
  Kernel_dnt_medium(m=32, n=4, k=45, tile_m=1, tile_n=1, threads=256, grouping=3, minblocks=3) , # 514.431 GFlop/s
  Kernel_dnt_medium(m=32, n=5, k=4, tile_m=1, tile_n=5, threads=32, grouping=17, minblocks=26) , # 521.537 GFlop/s
  Kernel_dnt_medium(m=32, n=5, k=5, tile_m=1, tile_n=5, threads=32, grouping=16, minblocks=14) , # 535.38 GFlop/s
  Kernel_dnt_medium(m=32, n=5, k=7, tile_m=1, tile_n=5, threads=64, grouping=26, minblocks=13) , # 556.583 GFlop/s
  Kernel_dnt_medium(m=32, n=5, k=9, tile_m=1, tile_n=3, threads=64, grouping=29, minblocks=9) , # 568.533 GFlop/s
  Kernel_dnt_medium(m=32, n=5, k=12, tile_m=1, tile_n=3, threads=96, grouping=32, minblocks=5) , # 578.088 GFlop/s
  Kernel_dnt_medium(m=32, n=5, k=13, tile_m=1, tile_n=2, threads=96, grouping=32, minblocks=10) , # 571.398 GFlop/s
  Kernel_dnt_medium(m=32, n=5, k=16, tile_m=1, tile_n=2, threads=128, grouping=32, minblocks=7) , # 590.624 GFlop/s
  Kernel_dnt_medium(m=32, n=5, k=24, tile_m=1, tile_n=1, threads=192, grouping=5, minblocks=5) , # 595.227 GFlop/s
  Kernel_dnt_medium(m=32, n=5, k=25, tile_m=1, tile_n=1, threads=192, grouping=3, minblocks=1) , # 593.569 GFlop/s
  Kernel_dnt_medium(m=32, n=5, k=26, tile_m=1, tile_n=2, threads=160, grouping=4, minblocks=4) , # 597.402 GFlop/s
  Kernel_dnt_medium(m=32, n=5, k=28, tile_m=1, tile_n=1, threads=192, grouping=3, minblocks=1) , # 604.614 GFlop/s
  Kernel_dnt_medium(m=32, n=5, k=32, tile_m=1, tile_n=1, threads=256, grouping=3, minblocks=2) , # 611.583 GFlop/s
  Kernel_dnt_medium(m=32, n=5, k=45, tile_m=1, tile_n=1, threads=256, grouping=3, minblocks=3) , # 618.945 GFlop/s
  Kernel_dnt_medium(m=32, n=7, k=4, tile_m=2, tile_n=4, threads=32, grouping=19, minblocks=13) , # 635.452 GFlop/s
  Kernel_dnt_medium(m=32, n=7, k=5, tile_m=1, tile_n=4, threads=64, grouping=22, minblocks=4) , # 657.342 GFlop/s
  Kernel_dnt_medium(m=32, n=7, k=7, tile_m=1, tile_n=4, threads=64, grouping=26, minblocks=7) , # 710.551 GFlop/s
  Kernel_dnt_medium(m=32, n=7, k=9, tile_m=1, tile_n=4, threads=64, grouping=26, minblocks=12) , # 727.544 GFlop/s
  Kernel_dnt_medium(m=32, n=7, k=13, tile_m=1, tile_n=2, threads=128, grouping=32, minblocks=6) , # 739.552 GFlop/s
  Kernel_dnt_medium(m=32, n=7, k=25, tile_m=1, tile_n=2, threads=192, grouping=4, minblocks=4) , # 775.794 GFlop/s
  Kernel_dnt_medium(m=32, n=7, k=26, tile_m=1, tile_n=2, threads=192, grouping=4, minblocks=3) , # 776.845 GFlop/s
  Kernel_dnt_medium(m=32, n=7, k=28, tile_m=1, tile_n=2, threads=224, grouping=4, minblocks=3) , # 780.958 GFlop/s
  Kernel_dnt_medium(m=32, n=7, k=32, tile_m=1, tile_n=2, threads=256, grouping=4, minblocks=4) , # 799.863 GFlop/s
  Kernel_dnt_medium(m=32, n=7, k=45, tile_m=1, tile_n=1, threads=256, grouping=3, minblocks=3) , # 803.204 GFlop/s
  Kernel_dnt_medium(m=32, n=9, k=4, tile_m=1, tile_n=5, threads=64, grouping=24, minblocks=11) , # 719.416 GFlop/s
  Kernel_dnt_medium(m=32, n=9, k=5, tile_m=1, tile_n=6, threads=64, grouping=23, minblocks=13) , # 768.086 GFlop/s
  Kernel_dnt_medium(m=32, n=9, k=7, tile_m=1, tile_n=6, threads=64, grouping=25, minblocks=14) , # 830.63 GFlop/s
  Kernel_dnt_medium(m=32, n=9, k=9, tile_m=1, tile_n=5, threads=96, grouping=32, minblocks=10) , # 853.67 GFlop/s
  Kernel_dnt_medium(m=32, n=9, k=13, tile_m=1, tile_n=3, threads=128, grouping=4, minblocks=5) , # 873.679 GFlop/s
  Kernel_dnt_medium(m=32, n=9, k=22, tile_m=1, tile_n=3, threads=224, grouping=4, minblocks=5) , # 922.783 GFlop/s
  Kernel_dnt_medium(m=32, n=9, k=25, tile_m=1, tile_n=3, threads=256, grouping=4, minblocks=5) , # 926.612 GFlop/s
  Kernel_dnt_medium(m=32, n=9, k=26, tile_m=1, tile_n=3, threads=256, grouping=4, minblocks=5) , # 937.269 GFlop/s
  Kernel_dnt_medium(m=32, n=9, k=28, tile_m=1, tile_n=2, threads=160, grouping=3, minblocks=1) , # 943.269 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=9, k=32, tile_m=1, tile_n=3, w=8, v=8, threads=96, grouping=16, minblocks=2) , # 954.006 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=9, k=45, tile_m=1, tile_n=3, w=10, v=4, threads=96, grouping=16, minblocks=1) , # 967.274 GFlop/s
  Kernel_dnt_medium(m=32, n=12, k=5, tile_m=1, tile_n=6, threads=64, grouping=26, minblocks=12) , # 908.801 GFlop/s
  Kernel_dnt_medium(m=32, n=12, k=12, tile_m=1, tile_n=3, threads=160, grouping=5, minblocks=9) , # 1047.84 GFlop/s
  Kernel_dnt_medium(m=32, n=12, k=13, tile_m=1, tile_n=4, threads=160, grouping=4, minblocks=9) , # 1058.78 GFlop/s
  Kernel_dnt_medium(m=32, n=12, k=26, tile_m=1, tile_n=2, threads=192, grouping=4, minblocks=5) , # 1144.79 GFlop/s
  Kernel_dnt_medium(m=32, n=12, k=32, tile_m=1, tile_n=2, threads=192, grouping=3, minblocks=4) , # 1176.59 GFlop/s
  Kernel_dnt_medium(m=32, n=13, k=4, tile_m=2, tile_n=4, threads=64, grouping=26, minblocks=1) , # 880.033 GFlop/s
  Kernel_dnt_medium(m=32, n=13, k=5, tile_m=1, tile_n=5, threads=96, grouping=24, minblocks=11) , # 897.283 GFlop/s
  Kernel_dnt_medium(m=32, n=13, k=7, tile_m=1, tile_n=5, threads=96, grouping=29, minblocks=11) , # 1013.1 GFlop/s
  Kernel_dnt_medium(m=32, n=13, k=9, tile_m=1, tile_n=5, threads=128, grouping=32, minblocks=9) , # 1040.81 GFlop/s
  Kernel_dnt_medium(m=32, n=13, k=12, tile_m=1, tile_n=3, threads=160, grouping=5, minblocks=9) , # 1085.48 GFlop/s
  Kernel_dnt_medium(m=32, n=13, k=13, tile_m=1, tile_n=5, threads=128, grouping=4, minblocks=8) , # 1085.19 GFlop/s
  Kernel_dnt_medium(m=32, n=13, k=14, tile_m=1, tile_n=5, threads=128, grouping=4, minblocks=7) , # 1105.19 GFlop/s
  Kernel_dnt_medium(m=32, n=13, k=16, tile_m=1, tile_n=4, threads=128, grouping=4, minblocks=7) , # 1140.15 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=13, k=24, tile_m=1, tile_n=4, w=12, v=10, threads=128, grouping=16, minblocks=1) , # 1184.2 GFlop/s
  Kernel_dnt_medium(m=32, n=13, k=25, tile_m=1, tile_n=3, threads=224, grouping=4, minblocks=5) , # 1177.19 GFlop/s
  Kernel_dnt_medium(m=32, n=13, k=26, tile_m=1, tile_n=2, threads=224, grouping=4, minblocks=3) , # 1192.35 GFlop/s
  Kernel_dnt_medium(m=32, n=13, k=28, tile_m=1, tile_n=2, threads=224, grouping=4, minblocks=2) , # 1204.64 GFlop/s
  Kernel_dnt_medium(m=32, n=13, k=32, tile_m=1, tile_n=3, threads=192, grouping=3, minblocks=4) , # 1221.29 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=13, k=45, tile_m=1, tile_n=3, w=12, v=8, threads=160, grouping=16, minblocks=2) , # 1240.12 GFlop/s
  Kernel_dnt_medium(m=32, n=14, k=13, tile_m=1, tile_n=5, threads=128, grouping=4, minblocks=9) , # 1143.59 GFlop/s
  Kernel_dnt_medium(m=32, n=14, k=14, tile_m=1, tile_n=4, threads=128, grouping=5, minblocks=8) , # 1160.77 GFlop/s
  Kernel_dnt_medium(m=32, n=14, k=25, tile_m=1, tile_n=3, threads=224, grouping=3, minblocks=5) , # 1240.88 GFlop/s
  Kernel_dnt_medium(m=32, n=14, k=26, tile_m=1, tile_n=2, threads=224, grouping=4, minblocks=5) , # 1258.77 GFlop/s
  Kernel_dnt_medium(m=32, n=14, k=29, tile_m=1, tile_n=3, threads=160, grouping=3, minblocks=4) , # 1262.69 GFlop/s
  Kernel_dnt_medium(m=32, n=14, k=32, tile_m=1, tile_n=2, threads=224, grouping=3, minblocks=1) , # 1286.87 GFlop/s
  Kernel_dnt_medium(m=32, n=16, k=5, tile_m=1, tile_n=6, threads=96, grouping=32, minblocks=10) , # 1010.78 GFlop/s
  Kernel_dnt_medium(m=32, n=16, k=13, tile_m=1, tile_n=5, threads=128, grouping=4, minblocks=9) , # 1239.79 GFlop/s
  Kernel_dnt_medium(m=32, n=16, k=16, tile_m=1, tile_n=4, threads=160, grouping=5, minblocks=7) , # 1289.33 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=16, k=32, tile_m=1, tile_n=4, w=12, v=16, threads=128, grouping=16, minblocks=2) , # 1400.34 GFlop/s
  Kernel_dnt_medium(m=32, n=22, k=9, tile_m=1, tile_n=6, threads=128, grouping=6, minblocks=8) , # 1313.03 GFlop/s
  Kernel_dnt_medium(m=32, n=22, k=22, tile_m=1, tile_n=4, threads=192, grouping=5, minblocks=4) , # 1573.89 GFlop/s
  Kernel_dnt_medium(m=32, n=22, k=32, tile_m=1, tile_n=3, threads=256, grouping=32, minblocks=2) , # 1639.19 GFlop/s
  Kernel_dnt_medium(m=32, n=24, k=5, tile_m=1, tile_n=4, threads=192, grouping=9, minblocks=7) , # 1147.88 GFlop/s
  Kernel_dnt_medium(m=32, n=24, k=13, tile_m=1, tile_n=6, threads=160, grouping=29, minblocks=5) , # 1516.34 GFlop/s
  Kernel_dnt_medium(m=32, n=24, k=24, tile_m=1, tile_n=6, threads=128, grouping=3, minblocks=4) , # 1673.42 GFlop/s
  Kernel_dnt_medium(m=32, n=24, k=26, tile_m=1, tile_n=4, threads=224, grouping=3, minblocks=4) , # 1689.58 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=24, k=32, tile_m=1, tile_n=4, w=16, v=24, threads=192, grouping=16, minblocks=1) , # 1736.99 GFlop/s
  Kernel_dnt_medium(m=32, n=25, k=4, tile_m=1, tile_n=5, threads=192, grouping=12, minblocks=7) , # 1051.58 GFlop/s
  Kernel_dnt_medium(m=32, n=25, k=5, tile_m=1, tile_n=5, threads=192, grouping=9, minblocks=7) , # 1163.15 GFlop/s
  Kernel_dnt_medium(m=32, n=25, k=7, tile_m=1, tile_n=5, threads=256, grouping=16, minblocks=5) , # 1273.22 GFlop/s
  Kernel_dnt_medium(m=32, n=25, k=9, tile_m=1, tile_n=5, threads=160, grouping=6, minblocks=7) , # 1389.05 GFlop/s
  Kernel_dnt_medium(m=32, n=25, k=13, tile_m=1, tile_n=5, threads=160, grouping=4, minblocks=7) , # 1543.97 GFlop/s
  Kernel_dnt_medium(m=32, n=25, k=14, tile_m=1, tile_n=5, threads=160, grouping=29, minblocks=5) , # 1565.97 GFlop/s
  Kernel_dnt_medium(m=32, n=25, k=25, tile_m=1, tile_n=5, threads=160, grouping=3, minblocks=4) , # 1683.41 GFlop/s
  Kernel_dnt_medium(m=32, n=25, k=26, tile_m=1, tile_n=5, threads=192, grouping=4, minblocks=4) , # 1695.86 GFlop/s
  Kernel_dnt_medium(m=32, n=25, k=28, tile_m=1, tile_n=5, threads=224, grouping=32, minblocks=3) , # 1709.63 GFlop/s
  Kernel_dnt_medium(m=32, n=25, k=32, tile_m=1, tile_n=5, threads=256, grouping=32, minblocks=3) , # 1751.37 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=25, k=45, tile_m=3, tile_n=5, w=16, v=20, threads=64, grouping=16, minblocks=1) , # 1795.15 GFlop/s
  Kernel_dnt_medium(m=32, n=26, k=4, tile_m=1, tile_n=6, threads=160, grouping=10, minblocks=7) , # 1059.64 GFlop/s
  Kernel_dnt_medium(m=32, n=26, k=5, tile_m=1, tile_n=6, threads=192, grouping=24, minblocks=6) , # 1158.06 GFlop/s
  Kernel_dnt_medium(m=32, n=26, k=7, tile_m=5, tile_n=3, threads=64, grouping=24, minblocks=3) , # 1280.12 GFlop/s
  Kernel_dnt_medium(m=32, n=26, k=9, tile_m=1, tile_n=6, threads=160, grouping=6, minblocks=7) , # 1392.26 GFlop/s
  Kernel_dnt_medium(m=32, n=26, k=12, tile_m=5, tile_n=3, threads=64, grouping=24, minblocks=1) , # 1502.2 GFlop/s
  Kernel_dnt_medium(m=32, n=26, k=13, tile_m=1, tile_n=6, threads=160, grouping=5, minblocks=6) , # 1540.86 GFlop/s
  Kernel_dnt_medium(m=32, n=26, k=14, tile_m=1, tile_n=6, threads=160, grouping=4, minblocks=6) , # 1592.55 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=26, k=24, tile_m=5, tile_n=3, w=12, v=26, threads=64, grouping=16, minblocks=1) , # 1704.87 GFlop/s
  Kernel_dnt_medium(m=32, n=26, k=25, tile_m=5, tile_n=3, threads=64, grouping=3, minblocks=3) , # 1702.43 GFlop/s
  Kernel_dnt_medium(m=32, n=26, k=26, tile_m=1, tile_n=4, threads=224, grouping=32, minblocks=3) , # 1710.53 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=26, k=28, tile_m=2, tile_n=5, w=14, v=24, threads=96, grouping=16, minblocks=4) , # 1736.93 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=26, k=32, tile_m=1, tile_n=5, w=16, v=26, threads=192, grouping=16, minblocks=2) , # 1778.77 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=26, k=45, tile_m=2, tile_n=7, w=12, v=26, threads=64, grouping=16, minblocks=1) , # 1845.15 GFlop/s
  Kernel_dnt_medium(m=32, n=28, k=4, tile_m=1, tile_n=6, threads=160, grouping=29, minblocks=5) , # 1093.69 GFlop/s
  Kernel_dnt_medium(m=32, n=28, k=5, tile_m=1, tile_n=5, threads=192, grouping=10, minblocks=6) , # 1203.06 GFlop/s
  Kernel_dnt_medium(m=32, n=28, k=7, tile_m=1, tile_n=4, threads=256, grouping=24, minblocks=6) , # 1352.54 GFlop/s
  Kernel_dnt_medium(m=32, n=28, k=9, tile_m=1, tile_n=6, threads=160, grouping=6, minblocks=6) , # 1453.34 GFlop/s
  Kernel_dnt_medium(m=32, n=28, k=13, tile_m=1, tile_n=6, threads=160, grouping=29, minblocks=5) , # 1612.39 GFlop/s
  Kernel_dnt_medium(m=32, n=28, k=25, tile_m=1, tile_n=4, threads=224, grouping=4, minblocks=4) , # 1803.09 GFlop/s
  Kernel_dnt_medium(m=32, n=28, k=26, tile_m=1, tile_n=4, threads=224, grouping=32, minblocks=3) , # 1791.09 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=28, k=28, tile_m=2, tile_n=7, w=10, v=26, threads=64, grouping=16, minblocks=1) , # 1810.27 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=28, k=32, tile_m=1, tile_n=5, w=16, v=28, threads=192, grouping=16, minblocks=1) , # 1860.59 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=28, k=45, tile_m=2, tile_n=7, w=12, v=8, threads=64, grouping=16, minblocks=1) , # 1916.4 GFlop/s
  Kernel_dnt_medium(m=32, n=29, k=14, tile_m=1, tile_n=6, threads=160, grouping=4, minblocks=6) , # 1690.58 GFlop/s
  Kernel_dnt_medium(m=32, n=29, k=29, tile_m=1, tile_n=5, threads=192, grouping=32, minblocks=3) , # 1821.2 GFlop/s
  Kernel_dnt_medium(m=32, n=29, k=32, tile_m=1, tile_n=5, threads=192, grouping=32, minblocks=2) , # 1879.76 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=29, k=55, tile_m=6, tile_n=3, w=14, v=18, threads=64, grouping=16, minblocks=2) , # 1972.82 GFlop/s
  Kernel_dnt_medium(m=32, n=32, k=4, tile_m=2, tile_n=4, threads=128, grouping=29, minblocks=1) , # 1153.41 GFlop/s
  Kernel_dnt_medium(m=32, n=32, k=5, tile_m=1, tile_n=6, threads=192, grouping=29, minblocks=5) , # 1275.62 GFlop/s
  Kernel_dnt_medium(m=32, n=32, k=7, tile_m=4, tile_n=4, threads=64, grouping=29, minblocks=1) , # 1445.13 GFlop/s
  Kernel_dnt_medium(m=32, n=32, k=9, tile_m=4, tile_n=4, threads=64, grouping=30, minblocks=2) , # 1569.35 GFlop/s
  Kernel_dnt_medium(m=32, n=32, k=12, tile_m=1, tile_n=6, threads=192, grouping=29, minblocks=5) , # 1716.9 GFlop/s
  Kernel_dnt_medium(m=32, n=32, k=13, tile_m=1, tile_n=6, threads=192, grouping=5, minblocks=5) , # 1747.15 GFlop/s
  Kernel_dnt_medium(m=32, n=32, k=14, tile_m=1, tile_n=6, threads=192, grouping=5, minblocks=5) , # 1782.16 GFlop/s
  Kernel_dnt_medium(m=32, n=32, k=16, tile_m=1, tile_n=6, threads=192, grouping=4, minblocks=5) , # 1837.55 GFlop/s
  Kernel_dnt_medium(m=32, n=32, k=22, tile_m=1, tile_n=4, threads=256, grouping=4, minblocks=4) , # 1920.38 GFlop/s
  Kernel_dnt_medium(m=32, n=32, k=24, tile_m=4, tile_n=4, threads=64, grouping=3, minblocks=3) , # 1928.13 GFlop/s
  Kernel_dnt_medium(m=32, n=32, k=25, tile_m=1, tile_n=4, threads=256, grouping=32, minblocks=3) , # 1922.43 GFlop/s
  Kernel_dnt_medium(m=32, n=32, k=26, tile_m=1, tile_n=4, threads=256, grouping=32, minblocks=3) , # 1932.91 GFlop/s
  Kernel_dnt_medium(m=32, n=32, k=28, tile_m=4, tile_n=4, threads=64, grouping=3, minblocks=1) , # 1945.61 GFlop/s
  Kernel_dnt_medium(m=32, n=32, k=29, tile_m=1, tile_n=4, threads=256, grouping=3, minblocks=3) , # 1953.05 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=32, k=32, tile_m=4, tile_n=4, w=8, v=32, threads=64, grouping=16, minblocks=2) , # 1988.65 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=32, k=45, tile_m=2, tile_n=8, w=10, v=16, threads=64, grouping=16, minblocks=1) , # 2049.85 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=32, k=55, tile_m=2, tile_n=8, w=10, v=32, threads=64, grouping=16, minblocks=2) , # 2095.89 GFlop/s
  Kernel_dnt_medium(m=32, n=45, k=4, tile_m=2, tile_n=6, threads=192, grouping=32, minblocks=1) , # 1189.96 GFlop/s
  Kernel_dnt_medium(m=32, n=45, k=5, tile_m=1, tile_n=6, threads=256, grouping=24, minblocks=4) , # 1353.3 GFlop/s
  Kernel_dnt_medium(m=32, n=45, k=7, tile_m=1, tile_n=6, threads=256, grouping=26, minblocks=4) , # 1511.94 GFlop/s
  Kernel_dnt_medium(m=32, n=45, k=9, tile_m=4, tile_n=4, threads=96, grouping=32, minblocks=2) , # 1646.51 GFlop/s
  Kernel_dnt_medium(m=32, n=45, k=13, tile_m=2, tile_n=6, threads=128, grouping=32, minblocks=1) , # 1808.85 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=45, k=25, tile_m=5, tile_n=5, w=8, v=40, threads=64, grouping=16, minblocks=4) , # 2094.65 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=45, k=26, tile_m=5, tile_n=5, w=6, v=24, threads=64, grouping=16, minblocks=4) , # 2121.03 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=45, k=28, tile_m=5, tile_n=5, w=8, v=34, threads=64, grouping=16, minblocks=1) , # 2167.72 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=45, k=32, tile_m=5, tile_n=5, w=8, v=30, threads=64, grouping=16, minblocks=4) , # 2242.67 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=45, k=45, tile_m=5, tile_n=5, w=8, v=16, threads=64, grouping=16, minblocks=4) , # 2335 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=55, k=29, tile_m=4, tile_n=7, w=14, v=36, threads=64, grouping=16, minblocks=2) , # 2308.45 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=55, k=32, tile_m=4, tile_n=7, w=8, v=38, threads=64, grouping=16, minblocks=1) , # 2402.8 GFlop/s
  Kernel_dnt_largeDB2(m=32, n=55, k=55, tile_m=6, tile_n=6, w=12, v=20, threads=64, grouping=16, minblocks=4) , # 2483.08 GFlop/s
  Kernel_dnt_medium(m=45, n=4, k=4, tile_m=6, tile_n=1, threads=64, grouping=19, minblocks=16) , # 447.443 GFlop/s
  Kernel_dnt_medium(m=45, n=4, k=5, tile_m=6, tile_n=1, threads=64, grouping=25, minblocks=16) , # 452.508 GFlop/s
  Kernel_dnt_medium(m=45, n=4, k=7, tile_m=3, tile_n=1, threads=64, grouping=24, minblocks=2) , # 473.476 GFlop/s
  Kernel_dnt_medium(m=45, n=4, k=9, tile_m=1, tile_n=4, threads=64, grouping=29, minblocks=6) , # 479.338 GFlop/s
  Kernel_dnt_medium(m=45, n=4, k=13, tile_m=1, tile_n=2, threads=128, grouping=6, minblocks=4) , # 488.528 GFlop/s
  Kernel_dnt_medium(m=45, n=4, k=25, tile_m=1, tile_n=2, threads=128, grouping=3, minblocks=2) , # 509.978 GFlop/s
  Kernel_dnt_medium(m=45, n=4, k=26, tile_m=1, tile_n=2, threads=192, grouping=3, minblocks=1) , # 511.579 GFlop/s
  Kernel_dnt_medium(m=45, n=4, k=28, tile_m=1, tile_n=2, threads=128, grouping=3, minblocks=2) , # 517.616 GFlop/s
  Kernel_dnt_medium(m=45, n=4, k=32, tile_m=1, tile_n=2, threads=256, grouping=3, minblocks=3) , # 524.404 GFlop/s
  Kernel_dnt_medium(m=45, n=4, k=45, tile_m=1, tile_n=2, threads=256, grouping=3, minblocks=1) , # 530.028 GFlop/s
  Kernel_dnt_medium(m=45, n=5, k=4, tile_m=1, tile_n=6, threads=64, grouping=26, minblocks=10) , # 511.613 GFlop/s
  Kernel_dnt_medium(m=45, n=5, k=5, tile_m=1, tile_n=5, threads=64, grouping=26, minblocks=10) , # 529.849 GFlop/s
  Kernel_dnt_medium(m=45, n=5, k=7, tile_m=1, tile_n=5, threads=64, grouping=32, minblocks=7) , # 554.528 GFlop/s
  Kernel_dnt_medium(m=45, n=5, k=9, tile_m=1, tile_n=5, threads=64, grouping=26, minblocks=11) , # 565.503 GFlop/s
  Kernel_dnt_medium(m=45, n=5, k=13, tile_m=1, tile_n=3, threads=128, grouping=4, minblocks=9) , # 581.404 GFlop/s
  Kernel_dnt_medium(m=45, n=5, k=25, tile_m=1, tile_n=3, threads=192, grouping=3, minblocks=2) , # 614.32 GFlop/s
  Kernel_dnt_medium(m=45, n=5, k=26, tile_m=1, tile_n=2, threads=192, grouping=3, minblocks=4) , # 613.164 GFlop/s
  Kernel_dnt_medium(m=45, n=5, k=28, tile_m=1, tile_n=2, threads=256, grouping=3, minblocks=3) , # 622.314 GFlop/s
  Kernel_dnt_medium(m=45, n=5, k=32, tile_m=1, tile_n=2, threads=256, grouping=3, minblocks=1) , # 630.744 GFlop/s
  Kernel_dnt_medium(m=45, n=5, k=45, tile_m=1, tile_n=1, threads=256, grouping=3, minblocks=1) , # 638.115 GFlop/s
  Kernel_dnt_medium(m=45, n=7, k=4, tile_m=5, tile_n=1, threads=64, grouping=25, minblocks=14) , # 640.697 GFlop/s
  Kernel_dnt_medium(m=45, n=7, k=5, tile_m=5, tile_n=1, threads=64, grouping=26, minblocks=17) , # 669.324 GFlop/s
  Kernel_dnt_medium(m=45, n=7, k=7, tile_m=6, tile_n=1, threads=64, grouping=32, minblocks=11) , # 712.064 GFlop/s
  Kernel_dnt_medium(m=45, n=7, k=9, tile_m=1, tile_n=4, threads=96, grouping=32, minblocks=2) , # 727.757 GFlop/s
  Kernel_dnt_medium(m=45, n=7, k=13, tile_m=1, tile_n=4, threads=128, grouping=24, minblocks=4) , # 753.039 GFlop/s
  Kernel_dnt_medium(m=45, n=7, k=25, tile_m=1, tile_n=2, threads=192, grouping=3, minblocks=4) , # 807.2 GFlop/s
  Kernel_dnt_medium(m=45, n=7, k=26, tile_m=1, tile_n=2, threads=192, grouping=3, minblocks=3) , # 815.315 GFlop/s
  Kernel_dnt_medium(m=45, n=7, k=28, tile_m=1, tile_n=2, threads=256, grouping=3, minblocks=3) , # 821.715 GFlop/s
  Kernel_dnt_medium(m=45, n=7, k=32, tile_m=1, tile_n=2, threads=256, grouping=3, minblocks=3) , # 837.239 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=7, k=45, tile_m=1, tile_n=2, w=12, v=4, threads=192, grouping=16, minblocks=4) , # 833.545 GFlop/s
  Kernel_dnt_medium(m=45, n=9, k=4, tile_m=3, tile_n=3, threads=64, grouping=32, minblocks=9) , # 735.448 GFlop/s
  Kernel_dnt_medium(m=45, n=9, k=5, tile_m=3, tile_n=3, threads=64, grouping=32, minblocks=3) , # 771.229 GFlop/s
  Kernel_dnt_medium(m=45, n=9, k=7, tile_m=1, tile_n=5, threads=96, grouping=32, minblocks=12) , # 843.763 GFlop/s
  Kernel_dnt_medium(m=45, n=9, k=9, tile_m=1, tile_n=5, threads=96, grouping=32, minblocks=11) , # 864.971 GFlop/s
  Kernel_dnt_medium(m=45, n=9, k=13, tile_m=1, tile_n=3, threads=160, grouping=5, minblocks=3) , # 908.669 GFlop/s
  Kernel_dnt_medium(m=45, n=9, k=25, tile_m=1, tile_n=3, threads=256, grouping=4, minblocks=1) , # 980.24 GFlop/s
  Kernel_dnt_medium(m=45, n=9, k=26, tile_m=1, tile_n=3, threads=256, grouping=3, minblocks=4) , # 988.361 GFlop/s
  Kernel_dnt_medium(m=45, n=9, k=28, tile_m=1, tile_n=2, threads=256, grouping=3, minblocks=3) , # 997.761 GFlop/s
  Kernel_dnt_medium(m=45, n=9, k=32, tile_m=1, tile_n=3, threads=160, grouping=3, minblocks=2) , # 1009.32 GFlop/s
  Kernel_dnt_medium(m=45, n=9, k=45, tile_m=1, tile_n=3, threads=256, grouping=3, minblocks=1) , # 1014.45 GFlop/s
  Kernel_dnt_medium(m=45, n=13, k=4, tile_m=5, tile_n=2, threads=64, grouping=32, minblocks=1) , # 885.5 GFlop/s
  Kernel_dnt_medium(m=45, n=13, k=5, tile_m=5, tile_n=1, threads=128, grouping=32, minblocks=9) , # 912.85 GFlop/s
  Kernel_dnt_medium(m=45, n=13, k=7, tile_m=5, tile_n=1, threads=128, grouping=32, minblocks=9) , # 1016.11 GFlop/s
  Kernel_dnt_medium(m=45, n=13, k=9, tile_m=5, tile_n=1, threads=128, grouping=32, minblocks=9) , # 1076.14 GFlop/s
  Kernel_dnt_medium(m=45, n=13, k=13, tile_m=1, tile_n=5, threads=192, grouping=5, minblocks=6) , # 1153.04 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=13, k=25, tile_m=3, tile_n=4, w=8, v=10, threads=64, grouping=16, minblocks=8) , # 1237.7 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=13, k=26, tile_m=3, tile_n=4, w=8, v=12, threads=64, grouping=16, minblocks=4) , # 1247.69 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=13, k=28, tile_m=5, tile_n=2, w=14, v=12, threads=64, grouping=16, minblocks=2) , # 1268.39 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=13, k=32, tile_m=5, tile_n=2, w=12, v=10, threads=64, grouping=16, minblocks=2) , # 1290.64 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=13, k=45, tile_m=5, tile_n=2, w=12, v=12, threads=64, grouping=16, minblocks=1) , # 1334.66 GFlop/s
  Kernel_dnt_medium(m=45, n=25, k=4, tile_m=3, tile_n=5, threads=96, grouping=29, minblocks=4) , # 1027.95 GFlop/s
  Kernel_dnt_medium(m=45, n=25, k=5, tile_m=5, tile_n=2, threads=128, grouping=24, minblocks=4) , # 1152.55 GFlop/s
  Kernel_dnt_medium(m=45, n=25, k=7, tile_m=5, tile_n=3, threads=96, grouping=24, minblocks=1) , # 1310.14 GFlop/s
  Kernel_dnt_medium(m=45, n=25, k=9, tile_m=5, tile_n=2, threads=128, grouping=32, minblocks=3) , # 1429.19 GFlop/s
  Kernel_dnt_medium(m=45, n=25, k=13, tile_m=3, tile_n=5, threads=96, grouping=24, minblocks=4) , # 1595.31 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=25, k=25, tile_m=3, tile_n=7, w=10, v=14, threads=64, grouping=16, minblocks=2) , # 1830.75 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=25, k=26, tile_m=5, tile_n=4, w=10, v=12, threads=64, grouping=16, minblocks=2) , # 1854.31 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=25, k=28, tile_m=5, tile_n=4, w=12, v=22, threads=64, grouping=16, minblocks=4) , # 1879.95 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=25, k=32, tile_m=5, tile_n=4, w=8, v=12, threads=64, grouping=16, minblocks=4) , # 1929.59 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=25, k=45, tile_m=5, tile_n=4, w=12, v=16, threads=64, grouping=16, minblocks=2) , # 2025.82 GFlop/s
  Kernel_dnt_medium(m=45, n=26, k=4, tile_m=3, tile_n=5, threads=96, grouping=29, minblocks=4) , # 1069.89 GFlop/s
  Kernel_dnt_medium(m=45, n=26, k=5, tile_m=3, tile_n=3, threads=160, grouping=24, minblocks=4) , # 1144.5 GFlop/s
  Kernel_dnt_medium(m=45, n=26, k=7, tile_m=3, tile_n=5, threads=96, grouping=24, minblocks=4) , # 1348.87 GFlop/s
  Kernel_dnt_medium(m=45, n=26, k=9, tile_m=5, tile_n=2, threads=128, grouping=32, minblocks=2) , # 1480.85 GFlop/s
  Kernel_dnt_medium(m=45, n=26, k=13, tile_m=3, tile_n=5, threads=96, grouping=24, minblocks=3) , # 1643.06 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=26, k=25, tile_m=5, tile_n=4, w=12, v=26, threads=64, grouping=16, minblocks=1) , # 1877.75 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=26, k=26, tile_m=5, tile_n=4, w=12, v=26, threads=64, grouping=16, minblocks=4) , # 1901.17 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=26, k=28, tile_m=3, tile_n=7, w=8, v=16, threads=64, grouping=16, minblocks=1) , # 1935.75 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=26, k=32, tile_m=5, tile_n=4, w=8, v=26, threads=64, grouping=16, minblocks=2) , # 1991.83 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=26, k=45, tile_m=5, tile_n=4, w=12, v=26, threads=64, grouping=16, minblocks=1) , # 2075.74 GFlop/s
  Kernel_dnt_medium(m=45, n=28, k=4, tile_m=3, tile_n=5, threads=96, grouping=29, minblocks=3) , # 1161.39 GFlop/s
  Kernel_dnt_medium(m=45, n=28, k=5, tile_m=5, tile_n=2, threads=128, grouping=24, minblocks=4) , # 1229.89 GFlop/s
  Kernel_dnt_medium(m=45, n=28, k=7, tile_m=3, tile_n=5, threads=96, grouping=24, minblocks=4) , # 1432.27 GFlop/s
  Kernel_dnt_medium(m=45, n=28, k=9, tile_m=5, tile_n=2, threads=128, grouping=32, minblocks=2) , # 1563.84 GFlop/s
  Kernel_dnt_medium(m=45, n=28, k=13, tile_m=3, tile_n=5, threads=96, grouping=24, minblocks=4) , # 1719.62 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=28, k=25, tile_m=5, tile_n=4, w=12, v=20, threads=64, grouping=16, minblocks=4) , # 1977.98 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=28, k=26, tile_m=5, tile_n=4, w=12, v=28, threads=64, grouping=16, minblocks=1) , # 1994.7 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=28, k=28, tile_m=3, tile_n=7, w=14, v=28, threads=64, grouping=16, minblocks=2) , # 2029.85 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=28, k=32, tile_m=5, tile_n=4, w=8, v=28, threads=64, grouping=16, minblocks=2) , # 2089.27 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=28, k=45, tile_m=3, tile_n=7, w=8, v=16, threads=64, grouping=16, minblocks=2) , # 2179.89 GFlop/s
  Kernel_dnt_medium(m=45, n=32, k=4, tile_m=6, tile_n=2, threads=192, grouping=32, minblocks=2) , # 1191.44 GFlop/s
  Kernel_dnt_medium(m=45, n=32, k=5, tile_m=3, tile_n=4, threads=128, grouping=25, minblocks=2) , # 1315.4 GFlop/s
  Kernel_dnt_medium(m=45, n=32, k=7, tile_m=3, tile_n=4, threads=128, grouping=32, minblocks=1) , # 1521.68 GFlop/s
  Kernel_dnt_medium(m=45, n=32, k=9, tile_m=3, tile_n=4, threads=128, grouping=32, minblocks=1) , # 1691.63 GFlop/s
  Kernel_dnt_medium(m=45, n=32, k=13, tile_m=3, tile_n=4, threads=128, grouping=32, minblocks=1) , # 1848.13 GFlop/s
  Kernel_dnt_medium(m=45, n=32, k=25, tile_m=3, tile_n=4, threads=128, grouping=32, minblocks=1) , # 2107.65 GFlop/s
  Kernel_dnt_medium(m=45, n=32, k=26, tile_m=3, tile_n=4, threads=128, grouping=32, minblocks=1) , # 2137 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=32, k=28, tile_m=6, tile_n=4, w=8, v=8, threads=64, grouping=16, minblocks=1) , # 2172 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=32, k=32, tile_m=6, tile_n=4, w=8, v=16, threads=64, grouping=16, minblocks=2) , # 2249.68 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=32, k=45, tile_m=6, tile_n=4, w=8, v=16, threads=64, grouping=16, minblocks=2) , # 2341.8 GFlop/s
  Kernel_dnt_medium(m=45, n=45, k=4, tile_m=6, tile_n=2, threads=192, grouping=32, minblocks=2) , # 1271.92 GFlop/s
  Kernel_dnt_medium(m=45, n=45, k=5, tile_m=6, tile_n=2, threads=256, grouping=29, minblocks=1) , # 1313.41 GFlop/s
  Kernel_dnt_medium(m=45, n=45, k=7, tile_m=3, tile_n=4, threads=192, grouping=29, minblocks=1) , # 1559.15 GFlop/s
  Kernel_dnt_medium(m=45, n=45, k=9, tile_m=3, tile_n=4, threads=192, grouping=29, minblocks=2) , # 1743.9 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=45, k=13, tile_m=6, tile_n=6, w=6, v=28, threads=64, grouping=16, minblocks=2) , # 1951.48 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=45, k=25, tile_m=6, tile_n=6, w=12, v=26, threads=64, grouping=16, minblocks=2) , # 2366.95 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=45, k=26, tile_m=6, tile_n=6, w=10, v=26, threads=64, grouping=16, minblocks=1) , # 2409.7 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=45, k=28, tile_m=7, tile_n=5, w=14, v=18, threads=64, grouping=16, minblocks=1) , # 2479.64 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=45, k=32, tile_m=7, tile_n=5, w=12, v=22, threads=64, grouping=16, minblocks=4) , # 2550.51 GFlop/s
  Kernel_dnt_largeDB2(m=45, n=45, k=45, tile_m=6, tile_n=6, w=12, v=18, threads=64, grouping=16, minblocks=2) , # 2690.9 GFlop/s
  Kernel_dnt_medium(m=55, n=16, k=16, tile_m=1, tile_n=4, threads=256, grouping=4, minblocks=5) , # 1448.91 GFlop/s
  Kernel_dnt_largeDB2(m=55, n=16, k=29, tile_m=2, tile_n=8, w=8, v=8, threads=64, grouping=16, minblocks=8) , # 1544.22 GFlop/s
  Kernel_dnt_largeDB2(m=55, n=16, k=55, tile_m=2, tile_n=8, w=8, v=12, threads=64, grouping=16, minblocks=4) , # 1652.51 GFlop/s
  Kernel_dnt_largeDB2(m=55, n=29, k=16, tile_m=7, tile_n=4, w=8, v=22, threads=64, grouping=16, minblocks=2) , # 1916.32 GFlop/s
  Kernel_dnt_largeDB2(m=55, n=29, k=29, tile_m=3, tile_n=6, w=10, v=18, threads=96, grouping=16, minblocks=4) , # 2161.11 GFlop/s
  Kernel_dnt_largeDB2(m=55, n=29, k=32, tile_m=7, tile_n=4, w=8, v=22, threads=64, grouping=16, minblocks=2) , # 2241.72 GFlop/s
  Kernel_dnt_largeDB2(m=55, n=29, k=55, tile_m=3, tile_n=6, w=14, v=16, threads=96, grouping=16, minblocks=2) , # 2370.43 GFlop/s
  Kernel_dnt_largeDB2(m=55, n=32, k=29, tile_m=7, tile_n=4, w=14, v=20, threads=64, grouping=16, minblocks=1) , # 2315.88 GFlop/s
  Kernel_dnt_largeDB2(m=55, n=32, k=32, tile_m=7, tile_n=4, w=8, v=24, threads=64, grouping=16, minblocks=1) , # 2400.75 GFlop/s
  Kernel_dnt_largeDB2(m=55, n=32, k=55, tile_m=7, tile_n=4, w=6, v=12, threads=64, grouping=16, minblocks=4) , # 2531.29 GFlop/s
  Kernel_dnt_largeDB2(m=55, n=55, k=16, tile_m=7, tile_n=4, w=8, v=32, threads=128, grouping=16, minblocks=1) , # 2371.04 GFlop/s
  Kernel_dnt_largeDB2(m=55, n=55, k=29, tile_m=7, tile_n=7, w=10, v=28, threads=64, grouping=16, minblocks=1) , # 2634.65 GFlop/s
  Kernel_dnt_largeDB2(m=55, n=55, k=32, tile_m=7, tile_n=7, w=10, v=28, threads=64, grouping=16, minblocks=2) , # 2696.5 GFlop/s
  Kernel_dnt_largeDB2(m=55, n=55, k=55, tile_m=7, tile_n=7, w=10, v=24, threads=64, grouping=16, minblocks=1) , # 2934.2 GFlop/s
  Kernel_dnt_largeDB2(m=64, n=64, k=64, tile_m=9, tile_n=4, w=22, v=32, threads=128, grouping=16, minblocks=2) , # 3072.53 GFlop/s
  Kernel_dnt_largeDB1(m=78, n=78, k=78, tile_m=10, tile_n=5, w=8, v=30, threads=128, grouping=16, minblocks=1) , # 3319.77 GFlop/s
]

#EOF
