/*******************************************************************************
* Copyright (C) 2014 Intel Corporation
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
*******************************************************************************/

//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra     (dongarra@eecs.utk.edu)
// Piotr Luszczek    (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER

/*!
@file OptimizeProblem.cpp

HPCG routine
*/

#include "OptimizeProblem.hpp"
#include "PermuteGraph.hpp"
#include "PrefixSum.hpp"
#include "Helpers.hpp"
#include "VeryBasicProfiler.hpp"

#include <iomanip> // std::setw, std::setprecision
#include <immintrin.h>

#include <oneapi/dpl/algorithm>
#include <oneapi/dpl/execution>

// turn this on to print out stages in OptimizeProblem()
//#define USE_PRINTF
// DEBUG_PRINTF(fmt, ...) is a printf-style trace macro:
//  - with USE_PRINTF defined it prints the message and then calls fflush(0),
//    i.e. flushes every open output stream, so the trace is not lost in a buffer;
//  - otherwise it expands to an empty statement and its arguments are not evaluated.
// `##__VA_ARGS__` is the GNU extension that drops the preceding comma when no
// variadic arguments are passed.
#ifdef USE_PRINTF
#define DEBUG_PRINTF(fmt, ...) do { printf(fmt, ##__VA_ARGS__); fflush(0); } while (0)
#else
#define DEBUG_PRINTF(fmt, ...) do {} while(0)
#endif



#ifdef BASIC_PROFILING

// OptimizeProblem() macros nonstandard because the profiler
// is not inside optData yet
//
// All three macros expect a pointer named `profiler` to be in scope at the
// point of use. They already end in ';', and END_OP_PROFILE_WAIT expands to
// TWO statements (wait on `event`, then stop the timer named `n`), so never
// use them as the sole body of an unbraced if/else/for.
#define BEGIN_OP_PROFILE(n) profiler->begin(n);
#define END_OP_PROFILE(n) profiler->end(n);
#define END_OP_PROFILE_WAIT(n, event) event.wait(); profiler->end(n);

#else

// Profiling disabled: the macros expand to nothing. In particular
// END_OP_PROFILE_WAIT does NOT wait on `event` in this configuration.
#define BEGIN_OP_PROFILE(n)
#define END_OP_PROFILE(n)
#define END_OP_PROFILE_WAIT(n, event)

#endif

// HPCG_FORCE_MV_LLC_REORDER can be set in setup/Make.<MPI>_<HW> to override the local options below

// Skips l3 cache reordering
//#define HPCG_NO_L3_CACHE_REORDER

// Skips l3 cache reordering for trmv only (better perf for n=320 if we skip trmv)
//#define HPCG_NO_TRMV_L3_CACHE_REORDER

// Select which L3 cache reordering strategy is used
//#define HPCG_L3_CACHE_REORDER_STRATEGY_OPT
#define HPCG_L3_CACHE_REORDER_STRATEGY_SORT


/*!
Optimizes the data structures used for CG iteration to increase the
performance of the benchmark version of the preconditioned CG algorithm.

@param[inout] A      The known system matrix, also contains the MG hierarchy in attributes Ac and mgData.
@param[inout] data   The data structure with all necessary CG vectors preallocated
@param[inout] b      The known right hand side vector
@param[inout] x      The solution vector to be computed in future CG iteration
@param[inout] xexact The exact solution vector

@return returns 0 upon success and non-zero otherwise

@see GenerateGeometry
@see GenerateProblem
*/
#include "stdio.h"
#include "math.h"
#include <complex>

#include "kernels/insertionsort_kernels.hxx"
#include "kernels/bitonic_exchange_kernels.hxx"

namespace {

/*!
  Copies \c count entries of \c src into a temporary host buffer and writes them
  to the text file \c fname: the first line is \c header followed by \c count,
  then one entry per line.

  @param[in] queue         queue used for the host allocation and for the copy
  @param[in] fname         name of the file to create
  @param[in] header        text written in front of the entry count on the first line
  @param[in] src           array to dump; must be readable through queue.memcpy
  @param[in] count         number of entries in \c src
  @param[in] dependencies  events the copy has to wait for

  @return the event of the copy; it has already been waited on when this returns
*/
template <typename T>
sycl::event dump_array_to_text_file(sycl::queue &queue, const char *fname, const char *header,
                                    const T *src, local_int_t count,
                                    const std::vector<sycl::event> &dependencies)
{
    T *host = sycl::malloc_host<T>(count, queue);
    sycl::event copied = queue.memcpy(host, src, count * sizeof(T), dependencies);
    copied.wait();

    std::ofstream fout(fname);
    if (!fout.is_open()) {
        fprintf(stderr, "print_csr_matrix_to_file: cannot open %s for writing\n", fname);
    }

    // '\n' rather than std::endl: the file content is identical, but the stream
    // is flushed once on close instead of once per matrix entry.
    fout << header << count << '\n';
    for (local_int_t i = 0; i < count; ++i) {
        fout << host[i] << '\n';
    }
    fout.close();

    sycl::free(host, queue);
    return copied;
}

} // namespace

/*!
  Debug helper: dumps the three CSR arrays of a matrix to the text files
  pia<nrow>.txt, pja<nrow>.txt and pa<nrow>.txt in the current working directory
  (\c nrow is zero-padded to at least four digits).

  Each array is copied to host memory with a blocking copy before it is written,
  so the function only returns once all three files have been written.

  @param[in] queue         queue used for host allocations and copies
  @param[in] nrow          number of rows; \c pia holds nrow+1 entries
  @param[in] nnz           number of entries in \c pja and in \c pa
  @param[in] pia           row pointer array
  @param[in] pja           column index array
  @param[in] pa            value array
  @param[in] dependencies  events every copy has to wait for

  @return the event of the last copy (the values array); already completed on return
*/
sycl::event print_csr_matrix_to_file(sycl::queue &queue,
                             local_int_t nrow, local_int_t nnz,
                             local_int_t *pia, local_int_t *pja, double *pa,
                             const std::vector<sycl::event> &dependencies)
{
    // Bounded formatting; the explicit long long conversion keeps the format
    // specifier correct whatever the width of local_int_t is.
    char fname[80];
    const long long nrow_ll = static_cast<long long>(nrow);

    // row pointers
    snprintf(fname, sizeof(fname), "pia%04lld.txt", nrow_ll);
    dump_array_to_text_file(queue, fname, "permuted HPCG matrix ia array (0-based), length: ",
                            pia, nrow + 1, dependencies);

    // column indices
    snprintf(fname, sizeof(fname), "pja%04lld.txt", nrow_ll);
    dump_array_to_text_file(queue, fname, "permuted HPCG matrix ja array (0-based), length: ",
                            pja, nnz, dependencies);

    // values
    snprintf(fname, sizeof(fname), "pa%04lld.txt", nrow_ll);
    return dump_array_to_text_file(queue, fname, "permuted HPCG matrix a array (double), length: ",
                                   pa, nnz, dependencies);
}

/*!
  Debug helper: dumps the arrays of an ESB-format matrix to three text files in
  the current working directory:

    esb<name_addon_str>_blockptr<nrow>_rank<rank>.txt   one blockptr entry per line
    esb<name_addon_str>_colind<nrow>_rank<rank>.txt     column indices, block by block
    esb<name_addon_str>_values<nrow>_rank<rank>.txt     values, block by block

  (\c nrow is zero-padded to at least four digits, \c rank to at least two).
  Every array is copied to host memory with a blocking copy before it is
  written, so the function only returns once all files have been written.

  @param[in] queue           queue used for host allocations and copies
  @param[in] name_addon_str  free-form tag inserted into the file names
  @param[in] rank            rank number used in the file names
  @param[in] nrow            number of rows, used in the file names
  @param[in] nBlocks         number of blocks; \c blockptr holds nBlocks+1 entries
  @param[in] nVectors        number of vectors; colind/values hold nVectors*block_size entries
  @param[in] block_size      number of rows per block (entries per vector)
  @param[in] blockptr        per-block vector range [blockptr[b], blockptr[b+1])
  @param[in] colind          column indices, entry (vec, loc) at vec*block_size+loc
  @param[in] values          matrix values, same layout as \c colind
  @param[in] lastLower       per-block marker printed next to each block (nBlocks entries)
  @param[in] firstUpper      per-block marker printed next to each block (nBlocks entries)
  @param[in] lastUpper       per-block marker printed next to each block (nBlocks entries)
  @param[in] firstNonloc     per-block marker printed next to each block (nBlocks entries)
  @param[in] dependencies    events every copy has to wait for

  @return the event of the last copy (the values array); already completed on return
*/
sycl::event print_esb_matrix_to_file(sycl::queue &queue, const std::string &name_addon_str, int rank,
                             local_int_t nrow, local_int_t nBlocks, local_int_t nVectors, local_int_t block_size,
                             local_int_t *blockptr, local_int_t *colind, double *values,
                             local_int_t *lastLower, local_int_t *firstUpper, local_int_t *lastUpper, local_int_t *firstNonloc,
                             const std::vector<sycl::event> &dependencies)
{

    const local_int_t nnz = nVectors * block_size;

    // File names are assembled in a std::string: name_addon_str has arbitrary
    // length, so formatting it into a fixed-size char buffer could overflow.
    // Only the numeric tail, whose length is bounded, goes through snprintf.
    char fname_tail[64];
    snprintf(fname_tail, sizeof(fname_tail), "%04lld_rank%02d.txt", static_cast<long long>(nrow), rank);
    const auto make_fname = [&name_addon_str, &fname_tail](const char *what) {
        return "esb" + name_addon_str + "_" + what + fname_tail;
    };
    const auto warn_if_not_open = [](const std::ofstream &fout, const std::string &fname) {
        if (!fout.is_open()) {
            fprintf(stderr, "print_esb_matrix_to_file: cannot open %s for writing\n", fname.c_str());
        }
    };

    // Lines are terminated with '\n' instead of std::endl throughout: the file
    // content is identical, but each stream is flushed once on close instead
    // of once per line.

    // print permuted rowptr
    local_int_t *blockptr_host = sycl::malloc_host<local_int_t>(nBlocks+1, queue);
    queue.memcpy(blockptr_host, blockptr, (nBlocks+1)*sizeof(local_int_t), dependencies).wait();
    {
        const std::string fname_blockptr = make_fname("blockptr");
        std::ofstream fout_blockptr(fname_blockptr);
        warn_if_not_open(fout_blockptr, fname_blockptr);
        fout_blockptr << "HPCG matrix ESB blockptr array (0-based), length: " << nBlocks+1 << '\n';
        for (local_int_t block = 0; block < (nBlocks+1); ++block) {
            fout_blockptr << blockptr_host[block] << '\n';
        }
    }


    // print ESB colind
    local_int_t *colind_host = sycl::malloc_host<local_int_t>(nnz, queue);
    queue.memcpy(colind_host, colind, (nnz)*sizeof(local_int_t), dependencies).wait();

    local_int_t *lastLower_host = sycl::malloc_host<local_int_t>(nBlocks, queue);
    queue.memcpy(lastLower_host, lastLower, (nBlocks)*sizeof(local_int_t), dependencies).wait();
    local_int_t *firstUpper_host = sycl::malloc_host<local_int_t>(nBlocks, queue);
    queue.memcpy(firstUpper_host, firstUpper, (nBlocks)*sizeof(local_int_t), dependencies).wait();
    local_int_t *lastUpper_host = sycl::malloc_host<local_int_t>(nBlocks, queue);
    queue.memcpy(lastUpper_host, lastUpper, (nBlocks)*sizeof(local_int_t), dependencies).wait();
    local_int_t *firstNonloc_host = sycl::malloc_host<local_int_t>(nBlocks, queue);
    queue.memcpy(firstNonloc_host, firstNonloc, (nBlocks)*sizeof(local_int_t), dependencies).wait();

    {
        const std::string fname_colind = make_fname("colind");
        std::ofstream fout_colind(fname_colind);
        warn_if_not_open(fout_colind, fname_colind);
        fout_colind << "HPCG matrix ESB colind array (0-based), nBlocks: " << nBlocks << ", nVectors: " << nVectors << ", block_size: " << block_size << ", length: " << nnz << '\n';
        for (local_int_t block = 0; block < nBlocks; ++block) {
            const local_int_t vec_st = blockptr_host[block];
            const local_int_t vec_en = blockptr_host[block+1];
            fout_colind << "block: " << std::setw(8) << block << ", nvecs: " << std::setw(3) << (vec_en - vec_st) << '\n';

            // header row listing the vector ids that belong to this block
            fout_colind << "vecs        : ";
            for (local_int_t vec = vec_st; vec < vec_en; ++vec) {
                fout_colind << std::setw(8) << vec << " ";
            }
            fout_colind << '\n';

            // one output row per matrix row of the block, one column per vector
            for (local_int_t loc = 0; loc < block_size; ++loc) {
                fout_colind << "row " << std::setw(8) << block * block_size + loc << ": ";
                for (local_int_t vec = vec_st; vec < vec_en; ++vec) {
                    fout_colind << std::setw(8) << colind_host[vec * block_size + loc] << " ";
                }
                fout_colind << '\n';
            }

            fout_colind << "firstLower = " << blockptr_host[block] << ", lastLower = " << lastLower_host[block] << ", firstUpper = " << firstUpper_host[block] << ", lastUpper = " << lastUpper_host[block] << ", firstNonloc = " << firstNonloc_host[block] << ", lastNonloc = " << blockptr_host[block+1] << '\n';
            fout_colind << '\n'; // add space between blocks
        }
    }
    sycl::free(colind_host, queue);
    sycl::free(lastLower_host, queue);
    sycl::free(firstUpper_host, queue);
    sycl::free(lastUpper_host, queue);
    sycl::free(firstNonloc_host, queue);


    // print ESB values
    double *values_host = sycl::malloc_host<double>(nnz, queue);
    auto last = queue.memcpy(values_host, values, (nnz)*sizeof(double), dependencies);
    last.wait();
    {
        const std::string fname_values = make_fname("values");
        std::ofstream fout_values(fname_values);
        warn_if_not_open(fout_values, fname_values);
        fout_values << "HPCG matrix ESB values array (double), nBlocks: " << nBlocks << ", nVectors: " << nVectors << ", block_size: " << block_size << ", length: " << nnz << '\n';
        // precision is sticky and only affects the floating-point values, so set it once
        fout_values << std::setprecision(4);
        for (local_int_t block = 0; block < nBlocks; ++block) {
            const local_int_t vec_st = blockptr_host[block];
            const local_int_t vec_en = blockptr_host[block+1];
            fout_values << "block: " << std::setw(8) << block << ", nvecs: " << std::setw(3) << (vec_en - vec_st) << '\n';
            for (local_int_t loc = 0; loc < block_size; ++loc) {
                fout_values << "row " << std::setw(8) << block * block_size + loc << ": ";
                for (local_int_t vec = vec_st; vec < vec_en; ++vec) {
                    fout_values << values_host[vec * block_size + loc] << " ";
                }
                fout_values << '\n';
            }
            fout_values << '\n'; // add space between blocks
        }
    }
    sycl::free(values_host, queue);
    sycl::free(blockptr_host, queue); // kept until here: the values dump needs the block ranges too

    return last;

}


#if defined(HPCG_L3_CACHE_REORDER_STRATEGY_OPT)
enum ESB_KERNEL_TYPE {MV, TRMV_L, TRMV_U};

/**
 *  Desc: Take the sequence [0,...,nBlocks-1] and partition it into [P_1, P_2, ..., P_n] such that
 *  the number of distinct x elements accessed by processing blocks in P_i is <= cacheLimit for all i = 1,..,n
 *  (the block that crosses the limit is still added, see the loop below).
 *
 *  Alg:
 *    reorder_h  - contains reordered block IDs (host-copy). Not all of the array may be used;
 *                 every partition is written starting at index 0 and then copied out
 *    reorder_d  - contains reordered block IDs (device-copy) copied from reorder_h
 *    xvalMap    - array used to track which x values have already been accessed;
 *                 xvalMap[ind] == pass_count means x[ind] was counted in the current pass
 *    blkMap     - contains block IDs that have not been assigned to a partition yet, terminated by -1.
 *    pass_count - counts the number of passes that have completed. This value changes across L3CacheReordering calls
 *
 *  Pseudocode:
 *
 *    set reorder_h = {}
 *    set blkMap = {0,....,nBlocks-1}
 *
 *    while ( reorder_d is not full) {
 *
 *      set xValWorkingSize = 0    <- this is the number of x accessed by the current partition
 *
 *      for (block in blkMap) {
 *
 *        xValWorkingSize += number of new x values accessed by block
 *
 *        if (xValWorkingSize < cacheLimit OR no new x values accessed)
 *          add block to reorder_h, and remove from blkMap
 *      }
 *      copy newly added blocks in reorder_h to reorder_d     <- This corresponds to creating partition P_i
 *    }
 *
 *  Parameters:
 *    kerType      - selects which column indices count as "accessed" (MV, TRMV_L or TRMV_U masks below);
 *                   TRMV_L walks the blocks from nBlocks-1 down to 0 in the first pass, the others from 0 up
 *    queue        - queue used for the host->device copies into reorder_d
 *    nBlocks      - number of blocks to order
 *    nrow         - bound separating local (< nrow) from nonlocal (>= nrow) column indices
 *    colind_h     - column indices readable from the host, block_size entries per vector
 *    blockptr_st  - per block, first vector to inspect
 *    blockptr_en  - per block, one past the last vector to inspect
 *    block_size   - must be 16 (asserted): one vector is loaded as a single 512-bit register
 *    cacheLimit   - threshold on xValWorkingSize after which the current partition counts as full
 *    nonloc_st/en - per block vector range of the nonlocal part; only read for TRMV_L
 *                   when HPCG_NO_MPI is not defined
 *
 *  Returns the event of the last host->device copy that was submitted. It is NOT waited on here,
 *  so the source buffer of that copy (reorder_h or blkMap) has to stay valid until it completes.
 *
*/
template <ESB_KERNEL_TYPE kerType>
sycl::event L3CacheReordering(sycl::queue &queue, local_int_t nBlocks, local_int_t nrow, local_int_t *reorder_h, const local_int_t *colind_h,
                              const local_int_t *blockptr_st, const local_int_t *blockptr_en,
                              local_int_t *xvalMap, local_int_t *blkMap,
                              local_int_t block_size, local_int_t &pass_count, local_int_t cacheLimit, local_int_t *reorder_d,
                              const local_int_t *nonloc_st = NULL, const local_int_t *nonloc_en = NULL)
{

    assert(block_size == 16); // assumes 32b integers and avx512

    local_int_t block_ind = 0; // Tracks number of blocks added to reordered list

    // Number of passes so far (including from previous calls)
    // We keep running count between calls to avoid having to zero-out xvalMap
    local_int_t pass_count_st = pass_count;

    sycl::event evt;
    local_int_t min_partition_size = nBlocks; // This is used to early exit

    // One iteration of this loop == one pass == one partition P_i
    while (block_ind < nBlocks) {
        local_int_t block_prev = block_ind; // offset in reorder_d where this partition starts

        // c iterates through blkMap to check if we can add blkMap[c] to current partition
        // i iterates through blkMap storing block IDs we can't add to the start of blkMap
        // i <= c so we can overwrite early entries of blkMap with block IDs not in a partition yet
        local_int_t c = 0, i = 0;

        // Becomes true once xValWorkingSize has reached cacheLimit in this pass
        local_int_t setFull = false;

        local_int_t initBlockId = 0;
        bool whileCondition = false;

        local_int_t xValWorkingSize = 0;

        // During the first pass we don't check blkMap for the block ID values instead we
        // set block = initBlockId and update initBlockId. This saves on initializing blkMap between
        // function calls.
        if constexpr (kerType == MV || kerType == TRMV_U) {
            initBlockId = 0;
            whileCondition = pass_count == pass_count_st ? initBlockId < nBlocks : blkMap[c] != -1;
        }
        else if constexpr (kerType == TRMV_L) {
            initBlockId = nBlocks - 1;
            whileCondition = pass_count == pass_count_st ? initBlockId >= 0 : blkMap[c] != -1;
        }

        // reorder_h is about to be overwritten from index 0, so the copy of the
        // previous partition out of it must have finished.
        evt.wait(); // wait for prev copies (if any) to complete

        // If remaining blocks in blkMap is less than the minimum size of all previous partitions
        // we just copy the remaining blocks to reorder_d instead of count the number of x value accesses.
        // (min_partition_size starts at nBlocks, so the first pass always takes this branch.)
        if (nBlocks - block_ind >= min_partition_size) {
            while (whileCondition) {

                // If this is first pass, block is tracked by initBlockId
                // otherwise block is tracked by values in blkMap
                auto block = pass_count == pass_count_st ? initBlockId : blkMap[c];

                auto start_row  = block * block_size;
                auto st_vec = blockptr_st[block];
                auto en_vec = blockptr_en[block];

                // Indicates whether this block access values of x already accessed by other blocks
                // in the current partition
                bool blockXValsOverlap = true;

                // Helper function which counts the number of x elements accessed by a specific colind indexed at j
                // setFull is captured BY VALUE: the lambda is rebuilt for every block and therefore
                // sees the state left by the previous block; it cannot change between the vectors
                // of one block.
                auto countNumXAccesses = [=, &xValWorkingSize, &blockXValsOverlap] (local_int_t j) {
                    // the 16 column indices of vector j, one lane per row of the block
                    __m512i ind_ = _mm512_loadu_epi32(colind_h + j * block_size);
                    __mmask16 m1; // lanes whose column index counts as an x access for this kernel type

                    if constexpr (kerType == MV) { // ind >= 0 && ind < nrow
                        __m512i nrow_zmm = _mm512_set1_epi32 (nrow);
                        m1 = _mm512_cmp_epi32_mask (ind_, _mm512_setzero(), _MM_CMPINT_NLT); // Omit negative index
                        __mmask16 m2 = _mm512_cmp_epi32_mask (ind_, nrow_zmm, _MM_CMPINT_LT);
                        m1 = _mm512_kand (m1, m2);
                    }
                    else if constexpr (kerType == TRMV_L) { //indexMask = ind >= 0 && ind < start_row + l;
                        m1 = _mm512_cmp_epi32_mask (ind_, _mm512_setzero(), _MM_CMPINT_NLT); // Omit negative index
                        // sr[l] = start_row + l, i.e. the row number handled by lane l
                        __m512i offset = _mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
                        __m512i sr = _mm512_set1_epi32(start_row) + offset;

                        __mmask16 m2 = _mm512_cmp_epi32_mask (ind_, sr, _MM_CMPINT_LT);
                        m1 = _mm512_kand (m1, m2);
#ifndef HPCG_NO_MPI
                        // Count column indices corresponding to nonlocal values
                        // NOTE(review): m3 selects ind >= nrow, but it is AND-ed with m1, which already
                        // requires ind < start_row + l. Whenever start_row + l <= nrow this leaves the
                        // mask empty, which does not match the comment above -- presumably an OR was
                        // intended. Before changing it, confirm that xvalMap is large enough to be
                        // indexed by nonlocal column indices.
                        __m512i nrow_zmm = _mm512_set1_epi32 (nrow);
                        __mmask16 m3 = _mm512_cmp_epi32_mask (ind_, nrow_zmm, _MM_CMPINT_NLT);
                        m1 = _mm512_kand (m1, m3);
#endif
                    }
                    else if constexpr (kerType == TRMV_U) { //indexMask = ind > start_row + l;
                        __m512i offset = _mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
                        __m512i sr = _mm512_set1_epi32(start_row) + offset;
                        m1 = _mm512_cmp_epi32_mask(ind_, sr, _MM_CMPINT_NLE); // ind > row, which also omits negative index
                    }

                    // Get current values in xvalMap to see if they were accessed by a previous block
                    // xvalMap[ind] == pass_count indicates that the x[ind] has already been accessed previously
                    // Lanes outside m1 are not gathered; they receive pass_count and so never look "new".
                    __m512i xvals_zmm = _mm512_mask_i32gather_epi32 (_mm512_set1_epi32(pass_count), m1, ind_, xvalMap, 4);

                    // m masks elements which are new (may have duplicates)
                    __mmask16 m = _mm512_cmp_epi32_mask(xvals_zmm, _mm512_set1_epi32(pass_count), _MM_CMPINT_NE);

                    // Find and mask duplicates
                    // A lane in m gets a zero conflict word only if no lower lane holds the same index;
                    // lanes outside m get the non-zero filler 0xff and are dropped by the == 0 test.
                    __m512i a = _mm512_mask_conflict_epi32(_mm512_set1_epi32(0xff), m, ind_);
                    m = _mm512_cmp_epi32_mask (a, _mm512_setzero(), _MM_CMPINT_EQ);

                    int numNewXVals = _popcnt32(_cvtmask16_u32(m));
                    if (!setFull) {
                        xValWorkingSize += numNewXVals;
                        // update xvalMap to mark indices as accessed
                        _mm512_mask_i32scatter_epi32 (xvalMap, m1, ind_, _mm512_set1_epi32(pass_count), 4);
                    }
                    else
                        // NOTE(review): plain assignment -- after the loops below the flag reflects only
                        // the LAST vector processed for this block, not all of them. Confirm whether an
                        // accumulating "&=" was intended.
                        blockXValsOverlap = (numNewXVals == 0); // If all values were already accessed add this block
                };

                // Iterate through each vector and count corresponding number of x accesses
                for (local_int_t j = st_vec; j < en_vec; j++)
                    countNumXAccesses(j);

#ifndef HPCG_NO_MPI
                if constexpr (kerType == TRMV_L) {
                    // Extra case for TRMV_L to check for accesses corresponding to nonlocal values (B)
                    // nonloc_st / nonloc_en default to NULL and are dereferenced here without a check.
                    for (local_int_t j = nonloc_st[block]; j < nonloc_en[block]; j++)
                        countNumXAccesses(j);
                }
#endif
                // Evaluated after the block was counted: the block that pushes the working
                // set over cacheLimit is still accepted below (blockXValsOverlap is still true).
                setFull = xValWorkingSize >= cacheLimit; // Set is full now.

                if (blockXValsOverlap)
                    // Add block to reordered list
                    // We store to front of the list for better cache reuse
                    reorder_h[block_ind++ - block_prev] = block;
                else
                    // Add block to a different partition, to be processed next pass
                    // Store to front of the blkMap for better cache reuse.
                    // Don't need to worry about overwrite current values since i <= c
                    blkMap[i++] = block;
                c++;

                // Update conditions for next iteration of while loop
                // mark blkMap with -1 to indicate end of blockIDs
                if constexpr (kerType == MV || kerType == TRMV_U) {
                    initBlockId++;
                    whileCondition = pass_count == pass_count_st ? initBlockId < nBlocks : blkMap[c] != -1;
                }
                else if constexpr (kerType == TRMV_L) {
                    initBlockId--;
                    whileCondition = pass_count == pass_count_st ? initBlockId >= 0 : blkMap[c] != -1;
                }
            }

            min_partition_size = std::min(min_partition_size, block_ind-block_prev);
            // Overlap host->device copies with host code, after this copy, beginning of reorder_h can be reused
            evt = queue.memcpy(reorder_d + block_prev, reorder_h, sizeof(local_int_t) * (block_ind - block_prev), {evt});

            blkMap[i] = -1; // Mark end of block IDs
        }
        else {
            // Remaining blocks less than min_partition_size, just copy instead of counting x accesses
            // The source of this asynchronous copy is blkMap itself.
            evt = queue.memcpy(reorder_d + block_prev, blkMap, sizeof(local_int_t) * (nBlocks - block_ind), {evt});
            block_ind = nBlocks; // indicate reorder_d is full
        }
        // A new pass_count value invalidates all marks in xvalMap without clearing the array
        pass_count++;
    }
    return evt;
}
#endif // HPCG_L3_CACHE_REORDER_STRATEGY_OPT


void PermuteMKLSparseMatrix(SparseMatrix *Ac, struct optData *optData, const int A_geom_size,
                            local_int_t *perm, local_int_t *invperm,
#ifdef BASIC_PROFILING
                            VeryBasicProfiler * profiler,
#endif
                            sycl::queue &main_queue)
{
    DEBUG_PRINTF("OptimizeProblem PermuteMKLSparseMatrix 1\n");
    const local_int_t nrow = Ac->localNumberOfRows;
    const local_int_t ncol = Ac->localNumberOfColumns;
    const int  rank = Ac->geom->rank;
    double ** matrixValues = Ac->matrixValues;
    local_int_t ** mtxIndL = Ac->mtxIndL;
    char * nonzerosInRow   = Ac->nonzerosInRow;

    // Extract (local) csrA and (nonlocal)csrB arrays from SparseMatrix
    //  along with bmap and diag if necessary

    BEGIN_OP_PROFILE("PermuteMKL:csr_arrays");
    local_int_t *ia     = (local_int_t *)sparse_malloc_device(sizeof(local_int_t)*(nrow+1), main_queue);
    local_int_t *ib     = (local_int_t *)sparse_malloc_device(sizeof(local_int_t)*(nrow+1), main_queue);
    local_int_t *bmap   = (local_int_t *)sparse_malloc_device(sizeof(local_int_t)*nrow, main_queue);

    size_t mem_width = 16; // 16 local_int_t == 64 bytes is recommended cache alignment for GPU
    local_int_t *nnz    = (local_int_t *)sparse_malloc_shared(sizeof(local_int_t)*mem_width*3, main_queue); // used only in this file

    if (  ia == NULL || ib == NULL || bmap == NULL || nnz == NULL ) {
        std::cerr << "Error in allocation in OptimizeProblem::PermuteMKLSparseMatrix 1" << std::endl;
        return;
    }
    local_int_t *nnz_b  = nnz + 1*mem_width;
    local_int_t *nrow_b = nnz + 2*mem_width;

    // calculate oneMKL csr arrays from hpcg matrix representation
    // need to permute rows for writing to ia/ib

    DEBUG_PRINTF("OptimizeProblem PermuteMKLSparseMatrix 2\n");
    auto ev_separate_ia_ib = main_queue.submit([&](sycl::handler &cgh) {
        auto kernel = [=](sycl::item<1> item) {
            const local_int_t prow = item.get_id(0);
            const local_int_t row = invperm[prow];

            const double * const cur_vals = matrixValues[row];
            const local_int_t *  const cur_inds = mtxIndL[row];
            ib[prow+1] = 0;
            ia[prow+1] = 0;

            for (local_int_t j = 0; j < nonzerosInRow[row]; j++)
            {
                if ( cur_inds[j] < nrow ) ia[prow+1] += 1;
                else                      ib[prow+1] += 1;
            }

            // prepare for next prefix sums on these arrays
            if (prow == 0 ) {
                ia[0] = 0;
                ib[0] = 0;
            }
        };
        cgh.parallel_for<class OptimizeProblem_separate_local_nonlocal_ia>(sycl::range<1>(nrow), kernel);
    });
    END_OP_PROFILE_WAIT("PermuteMKL:csr_arrays", ev_separate_ia_ib);

#ifdef USE_PRINTF
    ev_separate_ia_ib.wait();
    printf("OptimizeProblem PermuteMKLSparseMatrix 3\n");  fflush(0);
#endif
    
    BEGIN_OP_PROFILE("PermuteMKL:count_nrow_b");
    auto ev_prefix_sum = prefix_sum(main_queue, nrow+1, ia, {ev_separate_ia_ib});

#ifdef USE_PRINTF
    ev_prefix_sum.wait();
    printf("OptimizeProblem PermuteMKLSparseMatrix 4\n");  fflush(0);
#endif
    auto ev_count_nrow_b = main_queue.submit([&](sycl::handler &cgh) {
        cgh.depends_on(ev_prefix_sum);
        constexpr local_int_t SIMDLEN = HPCG_BLOCK_SIZE;
        constexpr local_int_t TG_SIZE = SIMDLEN;
        const local_int_t nrows_per_thread = ceil_div(nrow, TG_SIZE);

        auto kernel = [=](sycl::nd_item<1> item) SYCL_ESIMD_KERNEL {

            esimd::slm_init(sizeof(local_int_t) * 2 * TG_SIZE);
            local_int_t nnz_b_loc = 0;
            local_int_t nrow_b_loc = 0;

            local_int_t thr_id = item.get_local_linear_id();
            local_int_t start_row = thr_id * nrows_per_thread;
            local_int_t end_row = esimd::min((thr_id+1) * nrows_per_thread, nrow);

            local_int_t i = start_row;
            for (; i < end_row - SIMDLEN; i += SIMDLEN) {
                esimd::simd<local_int_t, SIMDLEN> ibVals = esimd_lsc_block_load<local_int_t, local_int_t, SIMDLEN, nc, nc>(ib, i+1);
                esimd::simd_mask<SIMDLEN> mask = (ibVals > 0);
                int n = esimd::pack_mask(mask);
                if (n > 0 ) {
                    nnz_b_loc  += esimd::reduce<local_int_t>(ibVals, std::plus());
                    nrow_b_loc += esimd::cbit(n);
                }
            }

            if (i < end_row) {
                esimd::simd<local_int_t, SIMDLEN> rows(i, 1);
                esimd::simd_mask<SIMDLEN> mask = rows < end_row;
                esimd::simd<local_int_t, SIMDLEN> ibVals = esimd_lsc_gather<local_int_t, local_int_t, SIMDLEN, nc, nc>(ib, rows+1, mask);
                esimd::simd_mask<SIMDLEN> mask2 = mask & (ibVals > 0);
                int n = esimd::pack_mask(mask2);
                if (n > 0) {
                    nnz_b_loc  += esimd::reduce<local_int_t>(ibVals, std::plus());
                    nrow_b_loc += esimd::cbit(n);
                }
            }

            // write to SLM
            esimd_lsc_slm_scalar_store(thr_id, nrow_b_loc);
            esimd_lsc_slm_scalar_store(TG_SIZE + thr_id, nnz_b_loc);

            esimd::barrier();
            if ( thr_id == 0) {
                esimd::simd<local_int_t, TG_SIZE> tg_nrows_b = esimd_lsc_slm_block_load<local_int_t, local_int_t, TG_SIZE>(0);
                esimd::simd<local_int_t, TG_SIZE> tg_nnz_b   = esimd_lsc_slm_block_load<local_int_t, local_int_t, TG_SIZE>(TG_SIZE);

                nnz_b_loc  = esimd::reduce<local_int_t>(tg_nnz_b, std::plus());
                nrow_b_loc = esimd::reduce<local_int_t>(tg_nrows_b, std::plus());

                esimd_lsc_scalar_store<local_int_t, local_int_t, nc, nc>(nnz_b, 0, nnz_b_loc);
                esimd_lsc_scalar_store<local_int_t, local_int_t, nc, nc>(nrow_b, 0, nrow_b_loc);
                esimd_lsc_scalar_store<local_int_t, local_int_t, nc, nc>(nnz, 0, esimd_lsc_scalar_load<local_int_t, local_int_t>(ia, nrow));
            }
        };
        cgh.parallel_for<class OptimizeProblem_count_nrow_b>(sycl::nd_range<1>(TG_SIZE, TG_SIZE), kernel);
    });
    ev_count_nrow_b.wait();
    END_OP_PROFILE("PermuteMKL:count_nrow_b");

    BEGIN_OP_PROFILE("PermuteMKL:fill");
    DEBUG_PRINTF("OptimizeProblem PermuteMKLSparseMatrix 5\n");
    // allocate and fill ja/a arrays -- writing to permuted row and applying permutation (and eventually sort)
    // to colinds on each row
    local_int_t *ja = (local_int_t *)sparse_malloc_device(sizeof(local_int_t)*(nnz[0]), main_queue);
    double       *a = (double      *)sparse_malloc_device(sizeof(double)*(nnz[0]), main_queue);
    if ( ja == NULL || a == NULL ) {
        std::cerr << "Error in allocation in OptimizeProblem::PermuteMKLSparseMatrix 2" << std::endl;
        return;
    }

    DEBUG_PRINTF("OptimizeProblem PermuteMKLSparseMatrix 6\n");
    auto ev_fill_A = main_queue.submit([&](sycl::handler &cgh) {

        constexpr local_int_t BLOCKSIZE = HPCG_BLOCK_SIZE; // num rows to be grouped together and processed at a time
        constexpr std::uint32_t MAXROWSIZE = 32; // max num elements on a row
        const local_int_t nBlocks = ceil_div(nrow, BLOCKSIZE);

        auto kernel = [=](sycl::item<1> item) SYCL_ESIMD_KERNEL {
            const local_int_t block = item.get_id(0);
            local_int_t prow = block * BLOCKSIZE;
            const local_int_t prow_en = prow + BLOCKSIZE > nrow ? nrow : prow + BLOCKSIZE;
            const local_int_t nloc  = prow_en - prow;

            esimd::simd<std::uint32_t, MAXROWSIZE> iota(0,1); // 0, 1, 2, ... MAXROWSIZE-1
            // lower_wis is a vector of bitmasks for each i -- all 1 bits strictly less than i -- [0 1 3 5, 7 15  31 63, 127 255 511 1023, 2047 ... ]
#if defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20240712  // date after 2024.2.1 compiler
            esimd::simd<std::uint32_t, MAXROWSIZE> lower_wis 
                = esimd::rol<std::uint32_t, std::uint32_t, MAXROWSIZE>(esimd::simd<std::uint32_t, MAXROWSIZE>(1), iota) - 1u;
#else
            esimd::simd<std::uint32_t, MAXROWSIZE> lower_wis
                = esimd_exp::rol<std::uint32_t, std::uint32_t, MAXROWSIZE>(esimd::simd<std::uint32_t, MAXROWSIZE>(1), iota) - 1u;
#endif

            // collect details for block start/stop
            esimd::simd<local_int_t, BLOCKSIZE> rows;
            esimd::simd<char, BLOCKSIZE> rowlengths;
            esimd::simd<local_int_t, BLOCKSIZE> iabs;
            esimd::simd<local_int_t, BLOCKSIZE> iaes;
            if (prow_en < nrow) {
                rows = esimd_lsc_block_load<local_int_t, local_int_t, BLOCKSIZE, nc,nc>(invperm, prow);
                rowlengths = esimd_lsc_gather<char, local_int_t, BLOCKSIZE, nc,nc>(nonzerosInRow, rows);
                iabs = esimd_lsc_block_load<local_int_t, local_int_t, BLOCKSIZE, nc,nc>(ia, prow);
                iaes = esimd_lsc_block_load<local_int_t, local_int_t, BLOCKSIZE, nc,nc>(ia, prow+1);
            }
            else {
                esimd::simd<local_int_t, BLOCKSIZE> bs_iota(0,1);
                esimd::simd_mask<BLOCKSIZE> mask(prow + bs_iota < nrow);
                // don't need pass_through here since always used with mask or nloc protection
                rows = esimd_lsc_gather<local_int_t, local_int_t, BLOCKSIZE, nc,nc>(invperm, prow+bs_iota, mask);
                rowlengths = esimd_lsc_gather<char, local_int_t, BLOCKSIZE, nc,nc>(nonzerosInRow, rows, mask);
                iabs = esimd_lsc_gather<local_int_t, local_int_t, BLOCKSIZE, nc,nc>(ia, prow+bs_iota, mask);
                iaes = esimd_lsc_gather<local_int_t, local_int_t, BLOCKSIZE, nc,nc>(ia, prow+1+bs_iota, mask);
            }

            // process block row by row
            for (std::uint32_t loc = 0; loc < nloc; ++loc) {
                const double * const cur_vals = matrixValues[rows[loc]];
                const local_int_t * const cur_inds = mtxIndL[rows[loc]];
                esimd::simd_mask<MAXROWSIZE> rowmask(iota < static_cast<std::uint32_t>(rowlengths[loc]));
                // load row's column indices and values
                esimd::simd<local_int_t, MAXROWSIZE> inds = esimd_lsc_gather<local_int_t, local_int_t, MAXROWSIZE, nc,nc>(cur_inds, iota, rowmask);
                esimd::simd<double, MAXROWSIZE> vals      = esimd_lsc_gather<double, local_int_t, MAXROWSIZE, nc,nc>(cur_vals, iota, rowmask);
                // apply column permutation to local indices
                esimd::simd_mask<MAXROWSIZE> localrowmask((inds < nrow) & rowmask);
                inds.merge(esimd_lsc_gather<local_int_t, local_int_t, MAXROWSIZE, nc,nc>(perm, inds, localrowmask), localrowmask); // apply permutation here to column inds

                // sort local indices in this row, update mask
                // insertionsort_kernel(inds, vals, static_cast<local_int_t>(rowlengths[loc]));
                bitonic_exchange_simd<MAXROWSIZE>(inds, vals, static_cast<local_int_t>(rowlengths[loc]));
                localrowmask = ((inds < nrow) & rowmask);

                // we only keep local indices, so do a local prefix sum on localrowmask to determine local offset where to write local inds to in row
                esimd::simd<std::uint32_t, MAXROWSIZE> offsets = esimd::cbit<std::uint32_t, MAXROWSIZE>(lower_wis & esimd::pack_mask(localrowmask));
                //offset.merge(esimd::simd<local_int_t, MAXROWSIZE>(MAXROWSIZE-1), !localrowmask); // change offsets for non locals to last index

                // note that the previous step can also add in later the movement of backfilling for esb formats
                esimd::simd<local_int_t, MAXROWSIZE> tmpCols(-1);
                esimd::simd<double, MAXROWSIZE> tmpVals(0.0);

                //
                // scatter local inds/vals into local registers tmpCols/tmpVals  (tried vectorized approach iupdate(offset, vals) member function, but so far didn't work right)
                //
                for (int i = 0; i < MAXROWSIZE; ++i) {
                    if (localrowmask[i]) {
                        tmpCols[offsets[i]] = inds[i];
                        tmpVals[offsets[i]] = vals[i];
                    }
                }

                //
                // scatter modified row back to a/ja arrays
                //
                const local_int_t nnzloc = iaes[loc] - iabs[loc];
                esimd::simd_mask<MAXROWSIZE> finalmask(iota < nnzloc);
                esimd_lsc_scatter<local_int_t, local_int_t, MAXROWSIZE, nc, nc>(ja+iabs[loc], iota, tmpCols, finalmask);
                esimd_lsc_scatter<double, local_int_t, MAXROWSIZE, nc, nc>(a+iabs[loc], iota, tmpVals, finalmask);

            } // for loc < nloc

        };
        cgh.parallel_for<class OptimizeProblem_fill_local_ja_a>(sycl::range<1>(nBlocks),kernel);
    });

#ifdef USE_PRINTF
    ev_fill_A.wait();
    printf("[rank %ld]: OptimizeProblem PermuteMKLSparseMatrix 7\n", rank);  fflush(0);
#endif
    // allocate and fill jb/b arrays
    local_int_t *jb = (local_int_t *)sparse_malloc_device(sizeof(local_int_t)*(nnz_b[0]), main_queue);
    double *b = (double *)sparse_malloc_device(sizeof(double)*(nnz_b[0]), main_queue);
    if ( (jb == NULL || b == NULL) && nnz_b[0] > 0 ) {
        std::cerr << "Error in allocation in OptimizeProblem::PermuteMKLSparseMatrix 3" << std::endl;
        return;
    }

    DEBUG_PRINTF("[rank %ld]: OptimizeProblem PermuteMKLSparseMatrix 8, rank = %ld\n", rank);
    auto ev_prefix_sum_b = prefix_sum(main_queue, nrow+1, ib);

    DEBUG_PRINTF("[rank %ld]: OptimizeProblem PermuteMKLSparseMatrix 8.1\n", rank);
    if (nrow_b[0] > 0) {

        DEBUG_PRINTF("[rank %ld]: OptimizeProblem PermuteMKLSparseMatrix 8.2\n", rank);
        local_int_t *new_ib =
            (local_int_t *) sparse_malloc_device(sizeof(local_int_t)*(nrow_b[0]+1), main_queue);

        // bmap_inv is *almost* the inverse of bmap, in particular, for each row
        // that is included in b:
        //     bmap[bmap_inv[row] - 1] == row
        local_int_t *bmap_inv = (local_int_t*)sparse_malloc_device(sizeof(local_int_t) * nrow, main_queue);
        auto ev_fill_bmap_inv = main_queue.submit([&](sycl::handler &cgh) {
            cgh.depends_on(ev_prefix_sum_b);
            auto kernel = [=](sycl::item<1> item) {
                const local_int_t i = item.get_id(0);
                if ( ib[i+1] - ib[i] > 0 ) {
                    bmap_inv[i] = 1;
                }
                else {
                    bmap_inv[i] = 0;
                }
            };
            cgh.parallel_for(sycl::range<1>(nrow), kernel);
        });

        auto ev_prefix_sum_bmap_inv = prefix_sum(main_queue, rank, nrow, bmap_inv, {ev_fill_bmap_inv});

        auto ev_fill_B = main_queue.submit([&](sycl::handler &cgh) {
            cgh.depends_on(ev_prefix_sum_bmap_inv);

            auto kernel = [=](sycl::item<1> item) {
                const local_int_t pi = item.get_id(0);
                const local_int_t i = invperm[pi];
                // p counts where we are in nonzero array
                // k counts where we are in rows of b
                local_int_t p = ib[pi];
                const double * const cur_vals = matrixValues[i];
                const local_int_t *  const cur_inds = mtxIndL[i];

                if ( ib[pi+1] - ib[pi] > 0 ) { // shifting the B data down from scattered through
                    const local_int_t k = bmap_inv[pi] - 1;
                    for (local_int_t j = 0; j < nonzerosInRow[i]; j++) {
                        if ( cur_inds[j] >= nrow ) {
                            b [p] = cur_vals[j];
                            jb[p] = cur_inds[j]; // use identity perm for nonlocal colinds
                            p++;
                        }
                    }
                    bmap[k] = pi;
                    new_ib[k+1] = ib[pi+1];
                }
            };
            cgh.parallel_for<class OptimizeProblem_fill_nonlocal_jb_b>(sycl::range<1>(nrow), kernel);
        });
        ev_fill_B.wait();
        sycl::free(bmap_inv, main_queue);
        sycl::free(ib, main_queue);
        ib = new_ib;
    } // nrow_b > 0

    main_queue.wait();
    DEBUG_PRINTF("OptimizeProblem PermuteMKLSparseMatrix 10\n");

    optData->ia    = ia;
    optData->ja    = ja;
    optData->a     = a;

    optData->ib    = ib;
    optData->jb    = jb;
    optData->b     = b;

    optData->nrow_b  = nrow_b[0];
    optData->bmap    = bmap;

    // cleanup any locally used variables
    sycl::free(nnz, main_queue);

    END_OP_PROFILE("PermuteMKL:fill");
}



//
// ExtractAndPermuteSparseMatrix()
//
// Converts one multigrid level of the HPCG matrix from the reference
// nonzerosInRow/mtxIndL/matrixValues layout into the blocked ESB arrays used by
// the custom kernels, and records the result in optData.
//
// Stages, in the order they are submitted:
//   1. PermuteGraph() fills perm/invperm, nColors and xcolors_dev.
//   2. A per-block "max row length" kernel followed by a prefix sum builds
//      esbblockptr; the total (nVectors) is copied back to size esbcolind/esbvalues.
//   3. An ESIMD kernel (one work-item per block) loads each permuted row, applies
//      perm to the local column indices, sorts the row, pads it with (-1, 0.0) and
//      scatters it column-major into esbcolind/esbvalues. It also writes diags and
//      the per-block esblastLower/esbfirstUpper/esblastUpper/esbfirstNonloc markers.
//   4. Optionally computes block orderings (mv/trmv_l/trmv_u reorder arrays) with
//      the strategy selected at compile time (..._STRATEGY_SORT or ..._STRATEGY_OPT).
//   5. Builds the custom::sparseMatrix, optionally calls PermuteMKLSparseMatrix(),
//      frees the original matrix arrays of Ac and stores all pointers in optData.
//
// Parameters:
//   Ac            matrix level to convert; its mtxL/mtxA/nonzerosInRow/matrixValues/
//                 mtxIndL arrays are freed and Ac->optimizationData is set on success
//   optData       destination for the extracted arrays and metadata
//   A_geom_size   only forwarded to PermuteMKLSparseMatrix()
//   mkl_matrices  when true, PermuteMKLSparseMatrix() is also called
//   profiler      (BASIC_PROFILING only) used by the BEGIN/END_OP_PROFILE macros
//   main_queue    queue used for every allocation, copy and kernel
//
// On an allocation failure the function prints a message and returns early; in
// that case earlier allocations are not released and Ac->optimizationData is not
// set.
//
void ExtractAndPermuteSparseMatrix(SparseMatrix *Ac, struct optData *optData, const int A_geom_size,
                                   bool mkl_matrices,
#ifdef BASIC_PROFILING
                                   VeryBasicProfiler * profiler,
#endif
                                   sycl::queue &main_queue)
{

    const local_int_t nrow = Ac->localNumberOfRows;
    const local_int_t ncol = Ac->localNumberOfColumns;
    const int  rank = Ac->geom->rank;

    // 1. Implement a graph permutation for local part of nonzerosInRow/mtxIndL/matrixValues format
    //    of HPCG matrix

    double ** matrixValues = Ac->matrixValues;
    local_int_t ** mtxIndL = Ac->mtxIndL;
    char * nonzerosInRow   = Ac->nonzerosInRow;

    local_int_t *perm        = (local_int_t *)sparse_malloc_device(sizeof(local_int_t)*(nrow), main_queue);
    local_int_t *invperm     = (local_int_t *)sparse_malloc_device(sizeof(local_int_t)*(nrow), main_queue);
    local_int_t *xcolors_dev = (local_int_t *)sparse_malloc_device(sizeof(local_int_t)*(nrow+1), main_queue);
    if ( perm == NULL || invperm == NULL || xcolors_dev == NULL ) {
        std::cerr << "Error in allocation in OptimizeProblem::ExtractAndPermuteSparseMatrix 0" << std::endl;
        return;
    }

    // nColors is passed to PermuteGraph() and used right afterwards to size xcolors_host
    local_int_t nColors = 0;
    BEGIN_OP_PROFILE("ExtractAndPermute:PermuteGraph");
    auto ev_graph = PermuteGraph(main_queue, rank,  Ac->geom->nx, Ac->geom->ny, Ac->geom->nz,
                                 nrow, nonzerosInRow, mtxIndL, matrixValues, perm, invperm,
                                 nColors, xcolors_dev, {});
    END_OP_PROFILE("ExtractAndPermute:PermuteGraph");



    // 2. Extract custom Matrix/Matrices from original matrix format along with diags and other
    // associated arrays.  We apply the row/column permutations as we extract into final form. We sort
    // the matrix and then finally  we get rid of original matrix format arrays.

    double *diags        = (double      *)sparse_malloc_device(sizeof(double)*nrow, main_queue);
    if ( diags == NULL) {
        std::cerr << "Error in allocation in OptimizeProblem::ExtractAndPermuteSparseMatrix 1" << std::endl;
        return;
    }

    //
    // Extract esbM = local + nonlocal parts of original matrix in esb Format along with
    // diags.
    //

    BEGIN_OP_PROFILE("ExtractAndPermute:esb_arrays");
    // Currently only support block_size of 16.
    constexpr local_int_t block_size = HPCG_BLOCK_SIZE; // 16 or 32 this sets value to be used for nearly every other kernel
    constexpr local_int_t MAXROWNNZ = 27; // max nonzero per row
    constexpr std::uint32_t MAXROWSIZE = 32; // min power of two above MAXROWNNZ
    local_int_t nBlocks = ceil_div(nrow, block_size);
    local_int_t nVectors = -1;

    local_int_t *nVec_host = (local_int_t *)sparse_malloc_host(sizeof(local_int_t)*1, main_queue); // used only in this file

    local_int_t *xcolors_host   = (local_int_t *)sparse_malloc_host(sizeof(local_int_t)*(nColors+1), main_queue);
    local_int_t *esblastLower   = (local_int_t *)sparse_malloc_device(sizeof(local_int_t)*(nBlocks ), main_queue);
    local_int_t *esbfirstUpper  = (local_int_t *)sparse_malloc_device(sizeof(local_int_t)*(nBlocks), main_queue);
    local_int_t *esblastUpper   = (local_int_t *)sparse_malloc_device(sizeof(local_int_t)*(nBlocks), main_queue);
    local_int_t *esbfirstNonloc = (local_int_t *)sparse_malloc_device(sizeof(local_int_t)*(nBlocks), main_queue);
    local_int_t *esbblockptr    = (local_int_t *)sparse_malloc_device(sizeof(local_int_t)*(nBlocks + 1), main_queue);

    //
    // Note that these cache limits should be updated in each architecture
    //
    local_int_t esb_mv_cache_limit = 10'000'000;
    local_int_t esb_trmv_l_cache_limit = 10'000'000;
    local_int_t esb_trmv_u_cache_limit = 10'000'000;

    bool applyL3CacheMVReorder = nrow > 16'384'000; // 256*256*256 is 16,777,216
    bool applyL3CacheTRMVLReorder = nrow > 16'384'000;
    bool applyL3CacheTRMVUReorder = nrow > 16'384'000;

#ifdef HPCG_FORCE_MV_LLC_REORDER // use global options
    applyL3CacheMVReorder = true;
    applyL3CacheTRMVLReorder = true;
    applyL3CacheTRMVUReorder = true;
#else // use local options set in file
#ifdef HPCG_NO_L3_CACHE_REORDER
    applyL3CacheMVReorder = false;
    applyL3CacheTRMVLReorder = false;
    applyL3CacheTRMVUReorder = false;
#elif defined(HPCG_NO_TRMV_L3_CACHE_REORDER)
    applyL3CacheTRMVLReorder = false;
    applyL3CacheTRMVUReorder = false;
#endif
#endif


#ifdef HPCG_DEBUG
   if (rank == 0) {
       HPCG_fout << "Using Reordering for MV kernels:" << std::endl;
       HPCG_fout << "  applyL3CacheMVReorder    = " << applyL3CacheMVReorder << std::endl;
       HPCG_fout << "  applyL3CacheTRMVLReorder = " << applyL3CacheTRMVLReorder << std::endl;
       HPCG_fout << "  applyL3CacheTRMVUReorder = " << applyL3CacheTRMVUReorder << std::endl;
       HPCG_fout << "  nBlocks = " << nBlocks << std::endl;
   }
#endif


    // ================  Start Assumption Validation =========================
    // some validation of assumptions in optimized kernels (diagnostic only: the
    // function keeps going after printing)
    //
    // 1. CustomAXPBY and ComputeDotProduct assumes nrows % block_size == 0
    if (nrow % block_size != 0) {
        std::cerr << " Error in OptimizeProblem():  nrow=" << nrow
                  << ", computed from problem_size input does not divide block_size="
                  << block_size << ", for AXPBY and DotProduct kernels!" << std::endl;
    }
    // 2. HPCG_L3_CACHE_REORDER_STRATEGY_OPT case currently assumes block_size == 16, so if block_size is not 16, this can't be used
#ifdef HPCG_L3_CACHE_REORDER_STRATEGY_OPT
    if (block_size != 16 && (applyL3CacheMVReorder || applyL3CacheTRMVLReorder || applyL3CacheTRMVUReorder) ) {
        std::cerr << "Error in OptimizeProblem(): using HPCG_L3_CACHE_REORDER_STRATEGY_OPT algorithm with block_size != 16, which is currently unsupported!" << std::endl;
    }
#endif
    // ================  End Assumption Validation ===========================


    // reorder arrays are only allocated for the kernels whose reordering is enabled
    local_int_t *mv_reorder = NULL, *trmv_l_reorder = NULL, *trmv_u_reorder = NULL;

    if (applyL3CacheMVReorder) mv_reorder   = (local_int_t *)sparse_malloc_device(sizeof(local_int_t)*nBlocks, main_queue);
    if (applyL3CacheTRMVLReorder) trmv_l_reorder   = (local_int_t *)sparse_malloc_device(sizeof(local_int_t)*nBlocks, main_queue);
    if (applyL3CacheTRMVUReorder) trmv_u_reorder   = (local_int_t *)sparse_malloc_device(sizeof(local_int_t)*nBlocks, main_queue);

    if ( xcolors_host == NULL || esblastLower == NULL || esbfirstUpper == NULL || esblastUpper == NULL ||
            esbfirstNonloc == NULL || esbblockptr == NULL || nVec_host == NULL ) {
        std::cerr << "Error in allocation in OptimizeProblem::ExtractAndPermuteSparseMatrix 0.5" << std::endl;
        return;
    }

    if ( (applyL3CacheMVReorder && mv_reorder == NULL) || (applyL3CacheTRMVLReorder && trmv_l_reorder == NULL)
                                                       || (applyL3CacheTRMVUReorder && trmv_u_reorder == NULL) ) {
        std::cerr << "Error in allocation in OptimizeProblem::ExtractAndPermuteSparseMatrix 0.5.1" << std::endl;
        return;
    }


    // do pull data to host for SpTRSV; the event is kept so the copy can be
    // waited on before xcolors_host is handed to the sparse matrix object
    auto ev_xcolors_host = main_queue.memcpy(xcolors_host, xcolors_dev, (nColors + 1) * sizeof(local_int_t), {ev_graph});

    // esbblockptr[block+1] = longest row (in nonzeros) of the block; turned into
    // vector offsets by the prefix sum that follows
    auto ev_fill_esbblockptr = main_queue.submit([&](sycl::handler &cgh) {
        cgh.depends_on(ev_graph);
        auto kernel = [=](sycl::item<1> item) {
            const local_int_t block = item.get_id(0);
            const local_int_t prow_st = block * block_size;
            const local_int_t prow_en = sycl::min(prow_st + block_size, nrow);

            local_int_t nvecs = 0;
            for ( local_int_t prow = prow_st; prow < prow_en; ++prow) {
                local_int_t row = invperm[prow];
                nvecs = sycl::max(nvecs, static_cast<local_int_t>(nonzerosInRow[row]));
            }

            esbblockptr[block+1] = nvecs;

            if (block == 0) esbblockptr[0] = 0;
        };
        cgh.parallel_for<class OptimizeProblem_fill_esbblockptr>(sycl::range<1>(nBlocks), kernel);
    });
    auto ev_prefix_sum_blockptr = prefix_sum(main_queue, nBlocks+1, esbblockptr, {ev_fill_esbblockptr});

    // last entry of the prefix sum is the total number of vectors over all blocks
    main_queue.memcpy(nVec_host, esbblockptr + nBlocks , 1 * sizeof(local_int_t), {ev_prefix_sum_blockptr}).wait();
    nVectors = nVec_host[0];
    sycl::free(nVec_host, main_queue); // done with nVec_host now

    local_int_t *esbcolind = (local_int_t *)sparse_malloc_device(sizeof(local_int_t)*(nVectors * block_size), main_queue);
    double      *esbvalues = (double      *)sparse_malloc_device(sizeof(double)*(nVectors * block_size), main_queue);
    if ( esbcolind == NULL || esbvalues == NULL) {
        std::cerr << "Error in allocation in OptimizeProblem::ExtractAndPermuteSparseMatrix 0.75" << std::endl;
        return;
    }

    auto ev_fill_M = main_queue.submit([&](sycl::handler &cgh) {
        cgh.depends_on(ev_prefix_sum_blockptr);

        auto kernel = [=](sycl::item<1> item) SYCL_ESIMD_KERNEL {
            const local_int_t block = item.get_id(0);
            const local_int_t vec_st = esimd_lsc_scalar_load<local_int_t, local_int_t, ca, uc>(esbblockptr, block);
            const local_int_t vec_en = esimd_lsc_scalar_load<local_int_t, local_int_t, ca, uc>(esbblockptr, block+1);
            const local_int_t prow_st  = block * block_size;
            const local_int_t prow_en  = esimd::min(prow_st + block_size, nrow);
            const local_int_t nprow    = prow_en - prow_st;
            const local_int_t nvecs    = vec_en - vec_st;
            const local_int_t block_st = vec_st * block_size; // start of block values

            esimd::simd<std::uint32_t, MAXROWSIZE> iota(0,1); // 0, 1, 2, ... MAXROWSIZE-1

            esimd::simd<double, block_size> diagsLoc(0.0);

            // collect details for block start/stop
            esimd::simd<local_int_t, block_size> rows;
            esimd::simd<char, block_size> rowlengths;
            if (nprow == block_size) {
                rows = esimd_lsc_block_load<local_int_t, local_int_t, block_size, nc,nc>(invperm, prow_st);
                rowlengths = esimd_lsc_gather<char, local_int_t, block_size, nc,nc>(nonzerosInRow, rows);
            }
            else { // last block if it extends beyond nrows (unlikely)
                esimd::simd<local_int_t, block_size> bs_iota(0,1);
                esimd::simd_mask<block_size> mask(prow_st + bs_iota < nrow);
                // don't need pass_through since always accessed with mask or nprow protection
                rows = esimd_lsc_gather<local_int_t, local_int_t, block_size, nc,nc>(invperm, prow_st+bs_iota, mask);
                rowlengths = esimd_lsc_gather<char, local_int_t, block_size, nc,nc>(nonzerosInRow, rows, mask);
                for ( std::uint32_t loc = nprow; loc < block_size; ++loc) {
                    //
                    // scatter (-1, 0.0)  back to esbcolind/esbvalues arrays for nonlocal rows
                    //
                    esimd::simd<local_int_t, MAXROWSIZE> offsets(loc, block_size); // loc, loc + block_size,  loc + 2*block_size ...
                    esimd::simd_mask<MAXROWSIZE> fillmask( iota < nvecs );
                    esimd::simd<local_int_t, MAXROWSIZE> negone(-1);
                    esimd::simd<double, MAXROWSIZE> zero(0.0);
                    esimd_lsc_scatter<local_int_t, local_int_t, MAXROWSIZE, nc, nc>(esbcolind + block_st, offsets, negone, fillmask);
                    esimd_lsc_scatter<double, local_int_t, MAXROWSIZE, nc, nc>(esbvalues + block_st, offsets, zero, fillmask);
                }
            }

            // per-block markers, expressed as vector offsets relative to vec_st
            local_int_t lastLowerVec = 0;
            local_int_t firstUpperVec = nvecs;
            local_int_t lastUpperVec = 0;
            local_int_t firstNonlocVec = nvecs;

            // process block row by row
            for (std::uint32_t loc = 0; loc < nprow; ++loc) {
                const double * const cur_vals = matrixValues[rows[loc]];
                const local_int_t * const cur_inds = mtxIndL[rows[loc]];
                esimd::simd_mask<MAXROWSIZE> rowmask(iota < static_cast<std::uint32_t>(rowlengths[loc]));

                // load row's column indices and values
                esimd::simd<local_int_t, MAXROWSIZE> inds = esimd_lsc_gather<local_int_t, local_int_t, MAXROWSIZE,nc,nc>(cur_inds, iota, rowmask);
                esimd::simd<double, MAXROWSIZE>      vals = esimd_lsc_gather<double, local_int_t, MAXROWSIZE,nc,nc>(cur_vals, iota, rowmask);

                // apply column permutation to local indices
                esimd::simd_mask<MAXROWSIZE> localrowmask((inds < nrow) & rowmask);
                inds.merge(esimd_lsc_gather<local_int_t, local_int_t, MAXROWSIZE,nc,nc>(perm, inds, localrowmask), localrowmask); // apply permutation here to column inds

                //
                // sort inds/vals in place
                //
                // insertionsort_kernel(inds, vals, static_cast<local_int_t>(rowlengths[loc]));
                bitonic_exchange_simd<MAXROWSIZE>(inds, vals, static_cast<local_int_t>(rowlengths[loc]));

                // add (-1, 0.0) as (ind, val) to !rowmask so we have correct esb backfill at the end
                inds.merge(esimd::simd<local_int_t, MAXROWSIZE>(-1), !rowmask);
                vals.merge(esimd::simd<double, MAXROWSIZE>(0.0), !rowmask);


                // find diagonal and store it
                local_int_t diagloc = esimd::fbl(esimd::pack_mask(inds == prow_st+loc));
                double diagVal = vals[diagloc];
                diagsLoc[loc] = diagVal;

                // update lastLowerVec/firstUpperVec/lastUpperVec local
                local_int_t lowLoc     = esimd::fbl(esimd::pack_mask(inds >= prow_st+loc)); // one past lastLower
                local_int_t firstupLoc = esimd::fbl(esimd::pack_mask((inds > prow_st+loc) && rowmask)); // first Upper
                local_int_t lastupLoc  = esimd::fbl(esimd::pack_mask((inds >= nrow) )); // last Upper and firstNonloc
                lastLowerVec  = esimd::max(lastLowerVec, lowLoc > 0 ? lowLoc : 0);
                firstUpperVec = esimd::min(firstUpperVec, firstupLoc > 0 ? firstupLoc : nvecs); // fbl/fbh return -1 if input is 0
                lastUpperVec  = esimd::max(lastUpperVec, lastupLoc > 0 ? lastupLoc : static_cast<local_int_t>(rowlengths[loc]) );
                firstNonlocVec = esimd::min(firstNonlocVec, lastupLoc > 0 ? lastupLoc : nvecs); // fbl/fbh return -1 if input is 0

                // work out column major scattering offsets of row loc based from start of block:
                esimd::simd<local_int_t, MAXROWSIZE> offsets(loc, block_size); // loc, loc + block_size,  loc + 2*block_size ...

                //
                // scatter modified row back to esbcolind/esbvalues arrays
                //
                esimd::simd_mask<MAXROWSIZE> fillmask( iota < nvecs );
                esimd_lsc_scatter<local_int_t, local_int_t, MAXROWSIZE, nc, nc>(esbcolind + block_st, offsets, inds, fillmask);
                esimd_lsc_scatter<double, local_int_t, MAXROWSIZE, nc, nc>(esbvalues + block_st, offsets, vals, fillmask);

            } // for loc < nprow

            // store diagonals of block
            // NOTE(review): this store is not masked by nprow; for a partial last block
            // (nrow % block_size != 0) it presumably writes past the nrow-sized diags
            // allocation -- the validation above only warns about that case. TODO confirm.
            esimd_lsc_block_store<double, local_int_t, block_size, nc, nc>(diags, prow_st, diagsLoc);

            // store lower/upper data for block
            esimd_lsc_scalar_store<local_int_t, local_int_t, nc, nc>(esblastLower, block, vec_st + lastLowerVec);
            esimd_lsc_scalar_store<local_int_t, local_int_t, nc, nc>(esbfirstUpper, block, vec_st + firstUpperVec);
            esimd_lsc_scalar_store<local_int_t, local_int_t, nc, nc>(esblastUpper, block, vec_st + lastUpperVec);
            esimd_lsc_scalar_store<local_int_t, local_int_t, nc, nc>(esbfirstNonloc, block, vec_st + firstNonlocVec);

        };
        cgh.parallel_for<class OptimizeProblem_esbM>(sycl::range<1>(nBlocks),kernel);
    });

    // Reordering for L3 cache-locality in ESB kernels
    if (applyL3CacheMVReorder || applyL3CacheTRMVLReorder || applyL3CacheTRMVUReorder) {
#if defined(HPCG_L3_CACHE_REORDER_STRATEGY_SORT) // Sub-optimal ordering strategy, but much faster

        auto min_ind_d = sycl::malloc_device<local_int_t>(nBlocks, main_queue);

        if ( min_ind_d == NULL) {
            std::cerr << "Error in allocation in OptimizeProblem::GEMV L3 Cache reorder (sort)" << std::endl;
            return;
        }

        // Find the minimum non-negative colind that each block accesses
        // TODO: Possibly unroll this kernel for more performance if needed
        auto ev_find_min_ind = main_queue.submit([&](sycl::handler &cgh) {
            cgh.depends_on(ev_fill_M);
            const local_int_t nWG = 16;

            auto min_ind_kernel = [=](sycl::nd_item<1> item) SYCL_ESIMD_KERNEL
                {
                    local_int_t block = item.get_global_id(0);
                    local_int_t offset = block * block_size;

                    // the nd_range is rounded up to a multiple of nWG
                    if (block >= nBlocks) return;

                    auto st_vec = esbblockptr[block];
                    auto en_vec = esbblockptr[block + 1];

                    esimd::simd<local_int_t, block_size> min_vec(nrow);
                    esimd::simd<local_int_t, block_size> nrow_vec(nrow);
                    for (local_int_t j = st_vec; j < en_vec; j++) {
                        auto indices = esimd_lsc_block_load<local_int_t, local_int_t, block_size, st, uc>(esbcolind, j * block_size);
                        indices.merge(nrow_vec, indices < 0); // Ignore indices less than 0.
                        min_vec = esimd::min(min_vec, indices);
                    }

                    local_int_t min_ind = nrow;
                    for (local_int_t j = 0; j < block_size; j++)
                        min_ind = std::min<local_int_t>(min_ind, min_vec[j]);

                    min_ind_d[block] = min_ind;

                    // Default ordering of blocks (0,...,nBlocks-1) to be reordered by sort_by_key below
                    mv_reorder[block] = block;
                };

            cgh.parallel_for<class min_ind_kernel>(
                sycl::nd_range<1>(ceil_div(nBlocks, nWG) * nWG, nWG), min_ind_kernel);
        });
        ev_find_min_ind.wait();

        auto policy = oneapi::dpl::execution::make_device_policy(main_queue);
        oneapi::dpl::sort_by_key(policy, min_ind_d, min_ind_d + nBlocks, mv_reorder, std::less<void>());

        // default-constructed events; only replaced when the matching copy is issued
        sycl::event cpy1, cpy2;

        // Use same ordering as gemv
        if (applyL3CacheTRMVLReorder)
            cpy1 = main_queue.memcpy(trmv_l_reorder, mv_reorder, sizeof(local_int_t) * nBlocks);

        // Use same ordering as gemv
        if (applyL3CacheTRMVUReorder)
            cpy2 = main_queue.memcpy(trmv_u_reorder, mv_reorder, sizeof(local_int_t) * nBlocks);

        cpy1.wait();
        cpy2.wait();

        sycl::free(min_ind_d, main_queue);

#elif defined(HPCG_L3_CACHE_REORDER_STRATEGY_OPT) // More optimal L3 cache reordering strategy
        // Needed by all cases
        auto colind_h = sycl::malloc_host<local_int_t>(nVectors * block_size, main_queue);
        auto esbblockptr_h = sycl::malloc_host<local_int_t>(nBlocks + 1, main_queue);

        // arrays use on host
        auto reorder_h = sycl::malloc_host<local_int_t>(nBlocks, main_queue);
        auto blkMap = sycl::malloc_host<local_int_t>(nBlocks + 1, main_queue);
        auto xvalsMap = sycl::malloc_host<int>(ncol, main_queue);

        if ( colind_h == NULL || esbblockptr_h == NULL || reorder_h == NULL ||
            blkMap == NULL || xvalsMap == NULL) {
            std::cerr << "Error in allocation in OptimizeProblem::GEMV L3 Cache reorder (opt) 1" << std::endl;
            return;
        }

        auto cpy0 = main_queue.memcpy(colind_h, esbcolind, sizeof(local_int_t) * (nVectors * block_size), {ev_fill_M});
        auto cpy1 = main_queue.memcpy(esbblockptr_h, esbblockptr, sizeof(local_int_t) * (nBlocks + 1), {ev_fill_M});
        auto setzero = main_queue.memset(xvalsMap, 0, sizeof(int) * ncol);

        auto esblastLower_h = sycl::malloc_host<local_int_t>(nBlocks, main_queue);
        auto esbfirstUpper_h = sycl::malloc_host<local_int_t>(nBlocks, main_queue);

        if ( esblastLower_h == NULL || esbfirstUpper_h == NULL) {
            std::cerr << "Error in allocation in OptimizeProblem::GEMV L3 Cache reorder (opt) 2" << std::endl;
            return;
        }

        local_int_t *esbfirstNonloc_h = NULL;
#ifndef HPCG_NO_MPI
        esbfirstNonloc_h = sycl::malloc_host<local_int_t>(nBlocks, main_queue);

        if ( esbfirstNonloc_h == NULL) {
            std::cerr << "Error in allocation in OptimizeProblem::GEMV L3 Cache reorder (opt) 3" << std::endl;
            return;
        }
#endif

        local_int_t pass_count = 1;
        sycl::event reorder_ev0, reorder_ev1;

        cpy0.wait(); // colind_h
        cpy1.wait(); // esbblockptr_h
        setzero.wait(); // xvalsMap

        // Reorder for gemv
        if (applyL3CacheMVReorder) {
            //const double start_time = mytimer();
            reorder_ev0 = L3CacheReordering<MV>(
                main_queue, nBlocks, nrow, reorder_h, colind_h, esbblockptr_h, esbblockptr_h + 1,
                xvalsMap, blkMap, block_size, pass_count, esb_mv_cache_limit, mv_reorder);
            reorder_ev0.wait();
            //const double wall_time = mytimer() - start_time;
            //printf("Wall-clock timer (cache-reorder): %lf\n", wall_time);
        }

        // Reorder for trmv_l
        if (applyL3CacheTRMVLReorder) {
            // Used by TRMV_L
            auto cpy = main_queue.memcpy(esblastLower_h, esblastLower, sizeof(local_int_t) * nBlocks, {ev_fill_M});
#ifndef HPCG_NO_MPI
            auto cpy_nonloc = main_queue.memcpy(esbfirstNonloc_h, esbfirstNonloc, sizeof(local_int_t) * nBlocks, {ev_fill_M});
            cpy_nonloc.wait();
#endif
            cpy.wait();

            reorder_ev1 = L3CacheReordering<TRMV_L>(
                main_queue, nBlocks, nrow, reorder_h, colind_h, esbblockptr_h, esblastLower_h,
                xvalsMap, blkMap, block_size, pass_count, esb_trmv_l_cache_limit, trmv_l_reorder, esbfirstNonloc_h, esbblockptr_h + 1);
            reorder_ev1.wait();
        }

        // Reorder for trmv_u
        if (applyL3CacheTRMVUReorder) {
            auto cpy = main_queue.memcpy(esbfirstUpper_h, esbfirstUpper, sizeof(local_int_t) * nBlocks, {ev_fill_M});
            cpy.wait();

            auto reorder_ev2 = L3CacheReordering<TRMV_U>(
                main_queue, nBlocks, nrow, reorder_h, colind_h, esbfirstUpper_h, esbblockptr_h + 1,
                xvalsMap, blkMap, block_size, pass_count, esb_trmv_u_cache_limit, trmv_u_reorder);
            reorder_ev2.wait();
        }

        sycl::free(colind_h, main_queue);
        sycl::free(esblastLower_h, main_queue);
        sycl::free(esbfirstUpper_h, main_queue);
        sycl::free(esbblockptr_h, main_queue);
#ifndef HPCG_NO_MPI
        if (esbfirstNonloc_h) sycl::free(esbfirstNonloc_h, main_queue);
#endif

        sycl::free(xvalsMap, main_queue);
        sycl::free(blkMap, main_queue);
        sycl::free(reorder_h, main_queue);
#endif // HPCG_L3_CACHE_REORDER_STRATEGY_OPT
    }

    // The ESB fill kernel reads nonzerosInRow/matrixValues/mtxIndL, which are freed
    // further down, so it has to be complete on every path. Both strategies above
    // already wait on work that depends on ev_fill_M, but when a reorder flag is set
    // and no strategy macro is defined nothing has waited on it yet.
    ev_fill_M.wait();

    END_OP_PROFILE("ExtractAndPermute:esb_arrays");

    // xcolors_host must hold the color offsets before it is handed over below
    ev_xcolors_host.wait();

    custom::sparseMatrix *sparseM = new custom::sparseMatrix();
    custom::deviceInfo   *devInfo = new custom::deviceInfo(main_queue);

    sparseM->setDevInfo(devInfo);
    sparseM->initMatrix(nrow, ncol, block_size, nVectors, nBlocks, esbblockptr, esbcolind, esbvalues);
    sparseM->initDiagonal(diags, esblastLower, esbfirstUpper, esblastUpper, esbfirstNonloc);
    sparseM->initColoring(nColors, xcolors_host, xcolors_dev);

    sparseM->mv_reorder = mv_reorder;
    sparseM->trmv_l_reorder = trmv_l_reorder;
    sparseM->trmv_u_reorder = trmv_u_reorder;
    sparseM->applyL3CacheMVReorder = applyL3CacheMVReorder;
    sparseM->applyL3CacheTRMVLReorder = applyL3CacheTRMVLReorder;
    sparseM->applyL3CacheTRMVUReorder = applyL3CacheTRMVUReorder;

#ifdef HPCG_DEBUG
    if (rank==0) HPCG_fout << "Graph nColors = " << nColors << std::endl;
#endif

    if (mkl_matrices) {
        //
        // In case, we are testing custom kernels against oneMKL, we
        // extract the csrA (local portion) and csrB (nonlocal portion) and bmap so that it can
        // be used in oneMKL.  This is not the common path for optimized HPCG.
        //
        PermuteMKLSparseMatrix(Ac, optData, A_geom_size, perm, invperm,
#ifdef BASIC_PROFILING
                               profiler,
#endif
                               main_queue);
    }

    //
    // clean up mtx data arrays now that we have extracted into another format
    //

    if(Ac->mtxL)         { sycl::free(Ac->mtxL, main_queue);          Ac->mtxL          = NULL; }
    if(Ac->mtxA)         { sycl::free(Ac->mtxA, main_queue);          Ac->mtxA          = NULL; }
    if(Ac->nonzerosInRow){ sycl::free(Ac->nonzerosInRow, main_queue); Ac->nonzerosInRow = NULL; }
    if(Ac->matrixValues) { sycl::free(Ac->matrixValues, main_queue);  Ac->matrixValues  = NULL; }
    if(Ac->mtxIndL)      { sycl::free(Ac->mtxIndL, main_queue);       Ac->mtxIndL       = NULL; }


    //
    // store final format of data arrays into optData
    //

    optData->nColors      = nColors;
    optData->xcolors_dev  = xcolors_dev;
    optData->xcolors_host = xcolors_host;

    optData->target_block_size = block_size;

    optData->nBlocks     = nBlocks;
    optData->nVectors    = nVectors;
    optData->esbblockptr = esbblockptr;
    optData->esbcolind   = esbcolind;
    optData->esbvalues   = esbvalues;

    optData->esblastLower   = esblastLower;
    optData->esbfirstUpper  = esbfirstUpper;
    optData->esblastUpper   = esblastUpper;
    optData->esbfirstNonloc = esbfirstNonloc;

    optData->esbM         = sparseM;
    optData->devInfo      = devInfo;

#ifdef HPCG_REORDER_BLOCKS
    optData->mv_reorder   = mv_reorder;
    optData->trmv_l_reorder   = trmv_l_reorder;
    optData->trmv_u_reorder   = trmv_u_reorder;
#endif

    optData->invperm = invperm;
    optData->perm    = perm;
    optData->diags   = diags;

    Ac->optimizationData = optData;
}


/*!
  Prepares optimized data structures for every level of the multigrid
  hierarchy and permutes the right-hand side accordingly.

  For each level reached by following the Ac->Ac chain starting at A:
    - allocates an optData record and initializes it with init_optData(),
    - releases Ac->mtxIndG,
    - calls ExtractAndPermuteSparseMatrix() for that level,
    - allocates work vectors (dtmp..dtmp4), scalar slots and, when
      ncol > nrow, a host buffer of (ncol - nrow) doubles,
    - in MPI builds, replaces Ac->elementsToSend_d by a copy whose entries
      are mapped through optData->perm,
    - attaches the record to Ac->optimizationData.
  Afterwards perm_coarse of each level is pointed at the perm array of the
  next coarser level, and rhs->values is replaced by a newly allocated array
  with new_rhs[perm[i]] = old_rhs[i]; the old array is freed.

  When HPCG_LOCAL_LONG_LONG is defined the function only sets t7 to zero.

  @param[inout] A             finest-level matrix; its optimizationData (and that
                              of every coarser level) is set by this routine
  @param[inout] rhs           right-hand side; rhs->values is replaced by the
                              permuted copy
  @param[out]   t7            set to 0.0 on entry, then increased by the elapsed
                              mytimer() time on successful completion; stays 0.0
                              if the routine returns early on an allocation failure
  @param[in]    mkl_matrices  forwarded unchanged to ExtractAndPermuteSparseMatrix()
  @param[in]    main_queue    SYCL queue used for all allocations and kernels here

  NOTE(review): on an allocation failure the routine prints a diagnostic and
  returns without releasing buffers already allocated for the current level;
  the caller has no status to inspect because the function returns void.
*/
void OptimizeProblem(SparseMatrix *A, Vector *rhs, double & t7, bool mkl_matrices,
                     sycl::queue & main_queue)
{
    t7 = 0.0;
#ifndef HPCG_LOCAL_LONG_LONG
    // Only referenced by the disabled ESB debug dump at the end of this function.
    [[maybe_unused]] int rank = A->geom->rank;

    SparseMatrix *Ac = A;
#ifdef BASIC_PROFILING
#ifdef HPCG_NO_MPI
    auto profiler = new VeryBasicProfiler();
#else
    auto profiler = new VeryBasicProfiler(MPI_COMM_WORLD);
#endif
#endif // BASIC_PROFILING

    const double t1 = mytimer();
    BEGIN_OP_PROFILE("OptimizeProblem");
    while (Ac != nullptr) { // do optimization for all levels of multigrid
        struct optData *optData = (struct optData *)sparse_malloc_host(sizeof(struct optData), main_queue);
        if (optData == nullptr) {
            // Report the failure like every other allocation in this routine
            // instead of returning silently.
            std::cerr << "Error in allocation in OptimizeProblem 0" << std::endl;
            return;
        }
        init_optData(optData);

        // remove mtxIndG as we are done with it, before conversions to leave more room
        if( Ac->mtxIndG ) { sycl::free(Ac->mtxIndG, main_queue); Ac->mtxIndG = nullptr; }

        BEGIN_OP_PROFILE("OptimizeProblem:ExtractAndPermute");
        ExtractAndPermuteSparseMatrix(Ac, optData, A->geom->size, mkl_matrices,
#ifdef BASIC_PROFILING
                                      profiler,
#endif
                                      main_queue);
        END_OP_PROFILE("OptimizeProblem:ExtractAndPermute");

        BEGIN_OP_PROFILE("OptimizeProblem:alloc");
        const local_int_t nrow = Ac->localNumberOfRows;
        const local_int_t ncol = Ac->localNumberOfColumns;

        const size_t mem_width = 8; // 8 doubles == 64 bytes is recommended cache alignment for GPU
        // round up nrow to nearest multiple of mem_width so each sub array in dtmp is
        // cache_line_size aligned.
        const size_t nrow_offset = round_up_next_multiple((size_t)nrow, mem_width);

        // One allocation that is carved into four work vectors of nrow_offset doubles each.
        double *dtmp           = (double *)sparse_malloc_device(sizeof(double) * 4 * nrow_offset, main_queue);
        if ( dtmp == nullptr ) {
            std::cerr << "Error in allocation in OptimizeProblem 1" << std::endl;
            return;
        }

        // Scalar slots, each mem_width doubles apart. Six device slots and four
        // host slots are handed out below; the device buffer has room for seven.
        double *device_scalars = (double *)sparse_malloc_device(sizeof(double) * 7 * mem_width, main_queue);
        double *host_scalars   = (double *)malloc(sizeof(double) * 4 * mem_width);
        if ( device_scalars == nullptr || host_scalars == nullptr ) {
            std::cerr << "Error in allocation in OptimizeProblem 2" << std::endl;
            return;
        }

        // Host buffer sized for the columns beyond the local rows; stays null
        // when there are none.
        double *halo_host_vector = nullptr;
        if (ncol > nrow) {
            halo_host_vector = (double *) malloc(sizeof(double) * (ncol - nrow));
            if (halo_host_vector == nullptr) {
                std::cerr << "Error in allocation in OptimizeProblem 3" << std::endl;
                return;
            }
        }
        END_OP_PROFILE("OptimizeProblem:alloc");

#ifndef HPCG_NO_MPI
        if (Ac->totalToBeSent > 0) {
            BEGIN_OP_PROFILE("OptimizeProblem:permute_elementsToSend");
            // modify elementsToSend into permuted form
            local_int_t *pelementsToSend = (local_int_t *)sparse_malloc_device(sizeof(local_int_t)*(Ac->totalToBeSent), main_queue);
            if ( pelementsToSend == nullptr ) {
                // Distinct label so this failure is not confused with the
                // scalar-buffer allocation ("OptimizeProblem 2") above.
                std::cerr << "Error in allocation in OptimizeProblem 4" << std::endl;
                return;
            }

            auto ev = main_queue.submit([&](sycl::handler &cgh) {
                // Copy everything the kernel needs into locals so the kernel
                // lambda captures plain values rather than host structures.
                local_int_t *perm = optData->perm;
                local_int_t totalToBeSent = Ac->totalToBeSent;
                local_int_t *elementsToSend_d = Ac->elementsToSend_d;
                auto kernel = [=] (sycl::item<1> item) {
                    local_int_t i = item.get_id(0);
                    pelementsToSend[i] = perm[elementsToSend_d[i]];
                };
                cgh.parallel_for<class permute_elementsToSend>(sycl::range<1>(totalToBeSent), kernel);
            });
            // The old index array is freed right below, so the kernel must be done.
            ev.wait();

            sycl::free(Ac->elementsToSend_d, main_queue);
            Ac->elementsToSend_d = pelementsToSend;
            END_OP_PROFILE("OptimizeProblem:permute_elementsToSend");
        }
#endif // ifndef HPCG_NO_MPI

        // store other

        optData->dtmp    = dtmp;
        optData->dtmp2   = dtmp + 1 * nrow_offset;
        optData->dtmp3   = dtmp + 2 * nrow_offset;
        optData->dtmp4   = dtmp + 3 * nrow_offset;

        optData->normr_dev    = device_scalars;
        optData->pAp_loc_dev  = device_scalars + 1 * mem_width;
        optData->pAp_dev      = device_scalars + 2 * mem_width;
        optData->rtz_loc_dev  = device_scalars + 3 * mem_width;
        optData->rtz_dev      = device_scalars + 4 * mem_width;
        optData->oldrtz_dev   = device_scalars + 5 * mem_width;

        optData->normr_host         = host_scalars;
        optData->rtz_loc_host       = host_scalars + 1 * mem_width;
        optData->pAp_loc_host       = host_scalars + 2 * mem_width;
        optData->global_result_host = host_scalars + 3 * mem_width;

        optData->halo_host_vector = halo_host_vector;

#ifdef BASIC_PROFILING
        optData->profiler = profiler;  // same profiler object for all multigrid levels
#endif

        Ac->optimizationData = optData;
        Ac = Ac->Ac;

    }//while Ac!=NULL

    // store perm/perm_coarse for restriction/prolongation
    Ac = A;
    while (Ac != nullptr) { // traverse multigrid to store pointers for perm
        if (Ac->Ac != nullptr) {
            struct optData *optData = (struct optData *)Ac->optimizationData;
            struct optData *optData_coarse = (struct optData *)Ac->Ac->optimizationData;

            optData->perm_coarse = optData_coarse->perm;
        }

        Ac = Ac->Ac;
    }

    // permute right-hand side vector
    // (in principle we should permute the exact solution xexact as well,
    //  but since it is a vector of all ones the order does not matter)
    {
        struct optData *optData = (struct optData *)A->optimizationData;
        const local_int_t * perm = optData->perm;
        double * old_rhs = rhs->values;
        const local_int_t nrow = A->localNumberOfRows;
        auto new_rhs = sycl::malloc_device<double>(nrow, main_queue);
        // A null result is only an error when something was actually requested;
        // without this check the kernel below would write through a null pointer.
        if (new_rhs == nullptr && nrow > 0) {
            std::cerr << "Error in allocation in OptimizeProblem 5" << std::endl;
            return;
        }
        auto ev_fill_rhs = main_queue.submit([&](sycl::handler &cgh) {
            auto kernel = [=](sycl::item<1> item) {
                const local_int_t i = item.get_id(0);
                const local_int_t pi = perm[i];

                // scatter: entry i of the old ordering lands at position perm[i]
                new_rhs[pi] = old_rhs[i];
            };
            cgh.parallel_for<class OptimizeProblem_fill_rhs>(sycl::range<1>(nrow), kernel);
        });
        // old_rhs is read by the kernel, so wait before releasing it.
        ev_fill_rhs.wait();
        rhs->values = new_rhs;
        sycl::free(old_rhs, main_queue);
    }

    t7 += (mytimer() - t1);
    END_OP_PROFILE("OptimizeProblem");

//    //
//    // print top level esb format
//    //
//    {
//        struct optData *optData = (struct optData *)A->optimizationData;
//
//        local_int_t nrow = A->localNumberOfRows;
//        local_int_t nBlocks = optData->nBlocks;
//        local_int_t nVectors = optData->nVectors;
//        local_int_t block_size = optData->target_block_size;
//        local_int_t *blockptr = optData->esbblockptr;
//        local_int_t *colind = optData->esbcolind;
//        double *values = optData->esbvalues;
//        local_int_t *lastLower = optData->esblastLower;
//        local_int_t *firstUpper = optData->esbfirstUpper;
//        local_int_t *lastUpper = optData->esblastUpper;
//        local_int_t *firstNonloc = optData->esbfirstNonloc;
//
//        print_esb_matrix_to_file(main_queue,"A", rank, nrow, nBlocks, nVectors, block_size,
//                blockptr, colind, values, lastLower, firstUpper, lastUpper, firstNonloc, {}).wait();
//
//    }

#else
    return;
#endif
}

/*!
  Helper function (see OptimizeProblem.hpp for details).

  @param[in] A  sparse matrix; this implementation does not inspect it
  @return this implementation always returns 0.0
*/
double OptimizeProblemMemoryUse(const SparseMatrix & A) {
  const double memory_use = 0.0;
  return memory_use;
}
