/* 
Copyright 2009, 2011 William Hart. All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are
permitted provided that the following conditions are met:

   1. Redistributions of source code must retain the above copyright notice, this list of
      conditions and the following disclaimer.

   2. Redistributions in binary form must reproduce the above copyright notice, this list
      of conditions and the following disclaimer in the documentation and/or other materials
      provided with the distribution.

THIS SOFTWARE IS PROVIDED BY William Hart ``AS IS'' AND ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL William Hart OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

The views and conclusions contained in the software and documentation are those of the
authors and should not be interpreted as representing official policies, either expressed
or implied, of William Hart. 
*/

/******************************************************************************

    Copyright (C) 2011 William Hart

******************************************************************************/

*******************************************************************************

    Split/combine FFT coefficients

*******************************************************************************

mp_size_t fft_split_limbs(mp_limb_t ** poly, mp_srcptr limbs, 
         mp_size_t total_limbs, mp_size_t coeff_limbs, mp_size_t output_limbs)

    Split an integer \code{(limbs, total_limbs)} into coefficients of length
    \code{coeff_limbs} limbs and store as the coefficients of \code{poly}
    which are assumed to have space for \code{output_limbs + 1} limbs per
    coefficient. The coefficients of the polynomial do not need to be zeroed
    before calling this function, however the number of coefficients written
    is returned by the function and any coefficients beyond this point are
    not touched.

mp_size_t fft_split_bits(mp_limb_t ** poly, mp_srcptr limbs, 
              mp_size_t total_limbs, mp_bitcnt_t bits, mp_size_t output_limbs)

    Split an integer \code{(limbs, total_limbs)} into coefficients of the 
    given number of \code{bits} and store as the coefficients of \code{poly} 
    which are assumed to have space for \code{output_limbs + 1} limbs per 
    coefficient. The coefficients of the polynomial do not need to be zeroed 
    before calling this function, however the number of coefficients written 
    is returned by the function and any coefficients beyond this point are 
    not touched. 

void fft_combine_limbs(mp_limb_t * res, mp_limb_t ** poly, slong length, 
         mp_size_t coeff_limbs, mp_size_t output_limbs, mp_size_t total_limbs)

    Evaluate the polynomial \code{poly} of the given \code{length} at 
    \code{B^coeff_limbs}, where \code{B = 2^FLINT_BITS}, and add the 
    result to the integer \code{(res, total_limbs)} throwing away any bits 
    that exceed the given number of limbs. The polynomial coefficients are 
    assumed to have at least \code{output_limbs} limbs each, however any 
    additional limbs are ignored.

    If the integer is initially zero the result will just be the evaluation 
    of the polynomial.

void fft_combine_bits(mp_limb_t * res, mp_limb_t ** poly, slong length, 
              mp_bitcnt_t bits, mp_size_t output_limbs, mp_size_t total_limbs)

    Evaluate the polynomial \code{poly} of the given \code{length} at
    \code{2^bits} and add the result to the integer
    \code{(res, total_limbs)} throwing away any bits that exceed the given
    number of limbs. The polynomial coefficients are assumed to have at least
    \code{output_limbs} limbs each, however any additional limbs are ignored.
    If the integer is initially zero the result will just be the evaluation
    of the polynomial.

*******************************************************************************

    Test helper functions

*******************************************************************************

void fermat_to_mpz(mpz_t m, mp_limb_t * i, mp_size_t limbs)

    Convert the Fermat number \code{(i, limbs)} modulo \code{B^limbs + 1} to
    an \code{mpz_t m}. Assumes \code{m} has been initialised. This function 
    is used only in test code.

*******************************************************************************

    Arithmetic modulo a generalised Fermat number

*******************************************************************************

void mpn_addmod_2expp1_1(mp_limb_t * r, mp_size_t limbs, mp_limb_signed_t c)

    Adds the signed limb \code{c} to the generalised fermat number \code{r}
    modulo \code{B^limbs + 1}. The compiler should be able to inline
    this for the case that there is no overflow from the first limb.

void mpn_normmod_2expp1(mp_limb_t * t, mp_size_t limbs)

    Given \code{t} a signed integer of \code{limbs + 1} limbs in twos
    complement format, reduce \code{t} to the corresponding value modulo the
    generalised Fermat number \code{B^limbs + 1}, where
    \code{B = 2^FLINT_BITS}.

void mpn_mul_2expmod_2expp1(mp_limb_t * t,
                               mp_limb_t * i1, mp_size_t limbs, mp_bitcnt_t d)

    Given \code{i1} a signed integer of \code{limbs + 1} limbs in twos
    complement format reduced modulo \code{B^limbs + 1} up to some
    overflow, compute \code{t = i1*2^d} modulo $p$. The result will not
    necessarily be fully reduced. The number of bits \code{d} must be
    nonnegative and less than \code{FLINT_BITS}. Aliasing is permitted.

void mpn_div_2expmod_2expp1(mp_limb_t * t,
                              mp_limb_t * i1, mp_size_t limbs, mp_bitcnt_t d)

    Given \code{i1} a signed integer of \code{limbs + 1} limbs in twos
    complement format reduced modulo \code{B^limbs + 1} up to some
    overflow, compute \code{t = i1/2^d} modulo $p$. The result will not
    necessarily be fully reduced. The number of bits \code{d} must be
    nonnegative and less than \code{FLINT_BITS}. Aliasing is permitted.


*******************************************************************************

    Generic butterflies

*******************************************************************************

void fft_adjust(mp_limb_t * r, mp_limb_t * i1,
                                 mp_size_t i, mp_size_t limbs, mp_bitcnt_t w)

    Set \code{r} to \code{i1} times $z^i$ modulo \code{B^limbs + 1} where
    $z$ corresponds to multiplication by $2^w$. This can be thought of as part
    of a butterfly operation. We require $0 \leq i < n$ where $nw =$
    \code{limbs*FLINT_BITS}. Aliasing is not supported.

void fft_adjust_sqrt2(mp_limb_t * r, mp_limb_t * i1,
               mp_size_t i, mp_size_t limbs, mp_bitcnt_t w, mp_limb_t * temp)

    Set \code{r} to \code{i1} times $z^i$ modulo \code{B^limbs + 1} where
    $z$ corresponds to multiplication by $\sqrt{2}^w$. This can be thought of
    as part of a butterfly operation. We require $0 \leq i < 2*n$ and odd
    where $nw =$ \code{limbs*FLINT_BITS}.

void butterfly_lshB(mp_limb_t * t, mp_limb_t * u, mp_limb_t * i1,
                    mp_limb_t * i2, mp_size_t limbs, mp_size_t x, mp_size_t y)

    We are given two integers \code{i1} and \code{i2} modulo
    \code{B^limbs + 1} which are not necessarily normalised. We compute
    \code{t = (i1 + i2)*B^x} and \code{u = (i1 - i2)*B^y} modulo $p$. Aliasing
    between inputs and outputs is not permitted. We require \code{x} and
    \code{y} to be less than \code{limbs} and nonnegative.

void butterfly_rshB(mp_limb_t * t, mp_limb_t * u, mp_limb_t * i1,
                    mp_limb_t * i2, mp_size_t limbs, mp_size_t x, mp_size_t y)

    We are given two integers \code{i1} and \code{i2} modulo
    \code{B^limbs + 1} which are not necessarily normalised. We compute
    \code{t = (i1 + i2)/B^x} and \code{u = (i1 - i2)/B^y} modulo $p$. Aliasing
    between inputs and outputs is not permitted. We require \code{x} and
    \code{y} to be less than \code{limbs} and nonnegative.

*******************************************************************************

    Radix 2 transforms

*******************************************************************************

void fft_butterfly(mp_limb_t * s, mp_limb_t * t, mp_limb_t * i1,
                  mp_limb_t * i2, mp_size_t i, mp_size_t limbs, mp_bitcnt_t w)

    Set \code{s = i1 + i2}, \code{t = z1^i*(i1 - i2)} modulo
    \code{B^limbs + 1} where \code{z1 = exp(Pi*I/n)} corresponds to
    multiplication by $2^w$. Requires $0 \leq i < n$ where $nw =$
    \code{limbs*FLINT_BITS}.

void ifft_butterfly(mp_limb_t * s, mp_limb_t * t, mp_limb_t * i1, 
                  mp_limb_t * i2, mp_size_t i, mp_size_t limbs, mp_bitcnt_t w)

    Set \code{s = i1 + z1^i*i2}, \code{t = i1 -  z1^i*i2} modulo
    \code{B^limbs + 1} where\\ \code{z1 = exp(-Pi*I/n)} corresponds to
    division by $2^w$. Requires $0 \leq i < 2n$ where $nw =$
    \code{limbs*FLINT_BITS}.

void fft_radix2(mp_limb_t ** ii,
                 mp_size_t n, mp_bitcnt_t w, mp_limb_t ** t1, mp_limb_t ** t2)

    The radix 2 DIF FFT works as follows:

    Input: \code{[i0, i1, ..., i(m-1)]}, for $m = 2n$ a power of $2$.

    Output: \code{[r0, r1, ..., r(m-1)]}\\ \code{ = FFT[i0, i1, ..., i(m-1)]}.

    Algorithm:

    $\bullet$ Recursively compute \code{[r0, r2, r4, ...., r(m-2)]}\\
         \code{= FFT[i0+i(m/2), i1+i(m/2+1), ..., i(m/2-1)+i(m-1)]}

    $\bullet$ Let \code{[t0, t1, ..., t(m/2-1)]}\\
         \code{= [i0-i(m/2), i1-i(m/2+1), ..., i(m/2-1)-i(m-1)]}

    $\bullet$ Let \code{[u0, u1, ..., u(m/2-1)]}\\
         \code{= [z1^0*t0, z1^1*t1, ..., z1^(m/2-1)*t(m/2-1)]}
         where \code{z1 = exp(2*Pi*I/m)} corresponds to multiplication
         by $2^w$.

    $\bullet$ Recursively compute \code{[r1, r3, ..., r(m-1)]}\\
         \code{= FFT[u0, u1, ..., u(m/2-1)]}

    The parameters are as follows:

    $\bullet$ \code{2*n} is the length of the input and output
         arrays

    $\bullet$ $w$ is such that $2^w$ is an $2n$-th root of unity
         in the ring $\mathbf{Z}/p\mathbf{Z}$ that we are working in,
         i.e. $p = 2^{wn} + 1$ (here $n$ is divisible by
         \code{GMP_LIMB_BITS})

    $\bullet$ \code{ii} is the array of inputs (each input is an
         array of limbs of length \code{wn/GMP_LIMB_BITS + 1} (the
         extra limbs being a "carry limb"). Outputs are written
         in-place.

    We require $nw$ to be at least 64 and the two temporary space pointers to
    point to blocks of size \code{n*w + FLINT_BITS} bits.

void fft_truncate(mp_limb_t ** ii,  mp_size_t n, mp_bitcnt_t w,
                            mp_limb_t ** t1, mp_limb_t ** t2, mp_size_t trunc)

    As for \code{fft_radix2} except that only the first \code{trunc}
    coefficients of the output are computed and the input is regarded as
    having (implied) zero coefficients from coefficient \code{trunc} onwards.
    The coefficients must exist as the algorithm needs to use this extra
    space, but their value is irrelevant. The value of \code{trunc} must be
    divisible by 2.

void fft_truncate1(mp_limb_t ** ii, mp_size_t n, mp_bitcnt_t w,
                            mp_limb_t ** t1, mp_limb_t ** t2, mp_size_t trunc)

    As for \code{fft_radix2} except that only the first \code{trunc}
    coefficients of the output are computed. The transform still needs all
    $2n$ input coefficients to be specified.

void ifft_radix2(mp_limb_t ** ii,
                 mp_size_t n, mp_bitcnt_t w, mp_limb_t ** t1, mp_limb_t ** t2)

    The radix 2 DIF IFFT works as follows:

    Input: \code{[i0, i1, ..., i(m-1)]}, for $m = 2n$ a power of $2$.

    Output: \code{[r0, r1, ..., r(m-1)]}\\
            \code{ = IFFT[i0, i1, ..., i(m-1)]}.

    Algorithm:

    $\bullet$ Recursively compute \code{[s0, s1, ...., s(m/2-1)]}\\
         \code{= IFFT[i0, i2, ..., i(m-2)]}

    $\bullet$ Recursively compute \code{[t(m/2), t(m/2+1), ..., t(m-1)]}\\
         \code{= IFFT[i1, i3, ..., i(m-1)]}

    $\bullet$ Let \code{[r0, r1, ..., r(m/2-1)]}\\
         \code{= [s0+z1^0*t0, s1+z1^1*t1, ..., s(m/2-1)+z1^(m/2-1)*t(m/2-1)]}
         where \code{z1 = exp(-2*Pi*I/m)} corresponds to division by $2^w$.

    $\bullet$ Let \code{[r(m/2), r(m/2+1), ..., r(m-1)]}\\
        \code{= [s0-z1^0*t0, s1-z1^1*t1, ..., s(m/2-1)-z1^(m/2-1)*t(m/2-1)]}

    The parameters are as follows:

    $\bullet$ \code{2*n} is the length of the input and output
        arrays

    $\bullet$ $w$ is such that $2^w$ is an $2n$-th root of unity
        in the ring $\mathbf{Z}/p\mathbf{Z}$ that we are working in,
        i.e. $p = 2^{wn} + 1$ (here $n$ is divisible by
        \code{GMP_LIMB_BITS})

    $\bullet$ \code{ii} is the array of inputs (each input is an
        array of limbs of length \code{wn/GMP_LIMB_BITS + 1} (the
        extra limbs being a "carry limb"). Outputs are written
        in-place.

    We require $nw$ to be at least 64 and the two temporary space pointers
    to point to blocks of size \code{n*w + FLINT_BITS} bits.

void ifft_truncate(mp_limb_t ** ii, mp_size_t n, mp_bitcnt_t w,
                            mp_limb_t ** t1, mp_limb_t ** t2, mp_size_t trunc)

    As for \code{ifft_radix2} except that the output is assumed to have
    zeros from coefficient trunc onwards and only the first trunc
    coefficients of the input are specified. The remaining coefficients need
    to exist as the extra space is needed, but their value is irrelevant.
    The value of \code{trunc} must be divisible by 2.

    Although the implementation does not require it, we assume for simplicity
    that \code{trunc} is greater than $n$. The algorithm begins by computing
    the inverse transform of the first $n$ coefficients of the input array.
    The unspecified coefficients of the second half of the array are then
    written: coefficient \code{trunc + i} is computed as a twist of
    coefficient \code{i} by a root of unity. The values of these coefficients
    are then equal to what they would have been if the inverse transform of
    the right hand side of the input array had been computed with full data
    from the start. The function \code{ifft_truncate1} is then called on the
    entire right half of the input array with this auxilliary data filled in.
    Finally a single layer of the IFFT is completed on all the coefficients
    up to \code{trunc} being careful to note that this involves doubling the
    coefficients from \code{trunc - n} up to \code{n}.

void ifft_truncate1(mp_limb_t ** ii, mp_size_t n, mp_bitcnt_t w,
                            mp_limb_t ** t1, mp_limb_t ** t2, mp_size_t trunc)

    Computes the first \code{trunc} coefficients of the radix 2 inverse
    transform assuming the first \code{trunc} coefficients are given and that
    the remaining coefficients have been set to the value they would have if
    an inverse transform had already been applied with full data.

    The algorithm is the same as for \code{ifft_truncate} except that the
    coefficients from \code{trunc} onwards after the inverse transform are
    not inferred to be zero but the supplied values.

void fft_butterfly_sqrt2(mp_limb_t * s, mp_limb_t * t,
                         mp_limb_t * i1, mp_limb_t * i2, mp_size_t i, 
                             mp_size_t limbs, mp_bitcnt_t w, mp_limb_t * temp)

    Let $w = 2k + 1$, $i = 2j + 1$. Set \code{s = i1 + i2}, 
    \code{t = z1^i*(i1 - i2)} modulo \code{B^limbs + 1} where 
    \code{z1^2 = exp(Pi*I/n)} corresponds to multiplication by $2^w$. Requires 
    $0 \leq i < 2n$ where $nw =$ \code{limbs*FLINT_BITS}.

    Here \code{z1} corresponds to multiplication by $2^k$ then multiplication
    by\\ \code{(2^(3nw/4) - 2^(nw/4))}. We see \code{z1^i} corresponds to
    multiplication by \code{(2^(3nw/4) - 2^(nw/4))*2^(j+ik)}.

    We first multiply by \code{2^(j + ik + wn/4)} then multiply by an
    additional \code{2^(nw/2)} and subtract.

void ifft_butterfly_sqrt2(mp_limb_t * s, mp_limb_t * t, mp_limb_t * i1, 
                              mp_limb_t * i2, mp_size_t i, mp_size_t limbs, 
                                              mp_bitcnt_t w, mp_limb_t * temp)

    Let $w = 2k + 1$, $i = 2j + 1$. Set \code{s = i1 + z1^i*i2},
    \code{t = i1 - z1^i*i2} modulo \code{B^limbs + 1} where 
    \code{z1^2 = exp(-Pi*I/n)} corresponds to division by $2^w$. Requires 
    $0 \leq i < 2n$ where $nw =$ \code{limbs*FLINT_BITS}.

    Here \code{z1} corresponds to division by $2^k$ then division by 
    \code{(2^(3nw/4) - 2^(nw/4))}. We see \code{z1^i} corresponds to division 
    by \code{(2^(3nw/4) - 2^(nw/4))*2^(j+ik)} which is the same as division 
    by \code{2^(j+ik + 1)} then multiplication by 
    \code{(2^(3nw/4) - 2^(nw/4))}.

    Of course, division by \code{2^(j+ik + 1)} is the same as multiplication 
    by \code{2^(2*wn - j - ik - 1)}. The exponent is positive as 
    $i \leq 2*n$, $j < n$, $k < w/2$.

    We first multiply by \code{2^(2*wn - j - ik - 1 + wn/4)} then multiply by 
    an additional \code{2^(nw/2)} and subtract.

void fft_truncate_sqrt2(mp_limb_t ** ii, mp_size_t n, mp_bitcnt_t w, 
         mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp, mp_size_t trunc)

    As per \code{fft_truncate} except that the transform is twice the usual 
    length, i.e. length $4n$ rather than $2n$. This is achieved by making use 
    of twiddles by powers of a square root of 2, not powers of 2 in the first 
    layer of the transform.  

    We require $nw$ to be at least 64 and the three temporary space pointers 
    to point to blocks of size \code{n*w + FLINT_BITS} bits.

void ifft_truncate_sqrt2(mp_limb_t ** ii, mp_size_t n, mp_bitcnt_t w, 
         mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp, mp_size_t trunc)

    As per \code{ifft_truncate} except that the transform is twice the usual
    length, i.e. length $4n$ instead of $2n$. This is achieved by making use 
    of twiddles by powers of a square root of 2, not powers of 2 in the final 
    layer of the transform. 

    We require $nw$ to be at least 64 and the three temporary space pointers 
    to point to blocks of size \code{n*w + FLINT_BITS} bits.

*******************************************************************************

    Matrix Fourier Transforms

*******************************************************************************

void fft_butterfly_twiddle(mp_limb_t * u, mp_limb_t * v, 
                        mp_limb_t * s, mp_limb_t * t, mp_size_t limbs,
                                               mp_bitcnt_t b1, mp_bitcnt_t b2)

    Set \code{u = 2^b1*(s + t)}, \code{v = 2^b2*(s - t)} modulo 
    \code{B^limbs + 1}. This is used to compute 
    \code{u = 2^(ws*tw1)*(s + t)},\\ \code{v = 2^(w+ws*tw2)*(s - t)} in the 
    matrix fourier algorithm, i.e. effectively computing an ordinary butterfly 
    with additional twiddles by \code{z1^rc} for row $r$ and column $c$ of the 
    matrix of coefficients. Aliasing is not allowed.

void ifft_butterfly_twiddle(mp_limb_t * u, mp_limb_t * v, 
                        mp_limb_t * s, mp_limb_t * t, mp_size_t limbs, 
                                               mp_bitcnt_t b1, mp_bitcnt_t b2)

    Set \code{u = s/2^b1 + t/2^b1)}, \code{v = s/2^b1 - t/2^b1} modulo 
    \code{B^limbs + 1}. This is used to compute 
    \code{u = 2^(-ws*tw1)*s + 2^(-ws*tw2)*t)},\\
    \code{v = 2^(-ws*tw1)*s + 2^(-ws*tw2)*t)} in the matrix fourier algorithm, 
    i.e. effectively computing an ordinary butterfly with additional twiddles 
    by \code{z1^(-rc)} for row $r$ and column $c$ of the matrix of 
    coefficients. Aliasing is not allowed.

void fft_radix2_twiddle(mp_limb_t ** ii, mp_size_t is,
       mp_size_t n, mp_bitcnt_t w, mp_limb_t ** t1, mp_limb_t ** t2,
                         mp_size_t ws, mp_size_t r, mp_size_t c, mp_size_t rs)

    As for \code{fft_radix2} except that the coefficients are spaced by 
    \code{is} in the array \code{ii} and an additional twist by \code{z^c*i}
    is applied to each coefficient where $i$ starts at $r$ and increases by
    \code{rs} as one moves from one coefficient to the next. Here \code{z} 
    corresponds to multiplication by \code{2^ws}. 

void ifft_radix2_twiddle(mp_limb_t ** ii, mp_size_t is,
        mp_size_t n, mp_bitcnt_t w, mp_limb_t ** t1, mp_limb_t ** t2,
                         mp_size_t ws, mp_size_t r, mp_size_t c, mp_size_t rs)

    As for \code{ifft_radix2} except that the coefficients are spaced by 
    \code{is} in the array \code{ii} and an additional twist by 
    \code{z^(-c*i)} is applied to each coefficient where $i$ starts at $r$ 
    and increases by \code{rs} as one moves from one coefficient to the next. 
    Here \code{z} corresponds to multiplication by \code{2^ws}. 

void fft_truncate1_twiddle(mp_limb_t ** ii, mp_size_t is,
     mp_size_t n, mp_bitcnt_t w, mp_limb_t ** t1, mp_limb_t ** t2,
        mp_size_t ws, mp_size_t r, mp_size_t c, mp_size_t rs, mp_size_t trunc)

    As per \code{fft_radix2_twiddle} except that the transform is truncated 
    as per\\ \code{fft_truncate1}.

void ifft_truncate1_twiddle(mp_limb_t ** ii, mp_size_t is,
     mp_size_t n, mp_bitcnt_t w, mp_limb_t ** t1, mp_limb_t ** t2,
        mp_size_t ws, mp_size_t r, mp_size_t c, mp_size_t rs, mp_size_t trunc)

    As per \code{ifft_radix2_twiddle} except that the transform is truncated
    as per\\ \code{ifft_truncate1}.

void fft_mfa_truncate_sqrt2(mp_limb_t ** ii, mp_size_t n,
                     mp_bitcnt_t w, mp_limb_t ** t1, mp_limb_t ** t2,
                             mp_limb_t ** temp, mp_size_t n1, mp_size_t trunc)

    This is as per the \code{fft_truncate_sqrt2} function except that the 
    matrix fourier algorithm is used for the left and right FFTs. The total 
    transform length is $4n$ where \code{n = 2^depth} so that the left and
    right transforms are both length $2n$. We require \code{trunc > 2*n} and
    that \code{trunc} is divisible by \code{2*n1} (explained below).

    The matrix fourier algorithm, which is applied to each transform of length
    $2n$, works as follows. We set \code{n1} to a power of 2 about the square
    root of $n$. The data is then thought of as a set of \code{n2} rows each
    with \code{n1} columns (so that \code{n1*n2 = 2n}). 

    The length $2n$ transform is then computed using a whole pile of short 
    transforms. These comprise \code{n1} column transforms of length \code{n2}
    followed by some twiddles by roots of unity (namely \code{z^rc} where $r$ 
    is the row and $c$ the column within the data) followed by \code{n2}
    row transforms of length \code{n1}. Along the way the data needs to be
    rearranged due to the fact that the short transforms output the data in 
    binary reversed order compared with what is needed.

    The matrix fourier algorithm provides better cache locality by decomposing
    the long length $2n$ transforms into many transforms of about the square 
    root of the original length. 

    For better cache locality the sqrt2 layer of the full length $4n$ 
    transform is folded in with the column FFTs performed as part of the first 
    matrix fourier algorithm on the left half of the data.

    The second half of the data requires a truncated version of the matrix
    fourier algorithm. This is achieved by truncating to an exact multiple of 
    the row length so that the row transforms are full length. Moreover, the 
    column transforms will then be truncated transforms and their truncated 
    length needs to be a multiple of 2. This explains the condition on 
    \code{trunc} given above. 

    To improve performance, the extra twiddles by roots of unity are combined
    with the butterflies performed at the last layer of the column transforms.

    We require $nw$ to be at least 64 and the three temporary space pointers 
    to point to blocks of size \code{n*w + FLINT_BITS} bits.

void ifft_mfa_truncate_sqrt2(mp_limb_t ** ii, mp_size_t n,
                      mp_bitcnt_t w, mp_limb_t ** t1, mp_limb_t ** t2,
                             mp_limb_t ** temp, mp_size_t n1, mp_size_t trunc)

    This is as per the \code{ifft_truncate_sqrt2} function except that the 
    matrix fourier algorithm is used for the left and right IFFTs. The total 
    transform length is $4n$ where \code{n = 2^depth} so that the left and
    right transforms are both length $2n$. We require \code{trunc > 2*n} and
    that \code{trunc} is divisible by \code{2*n1}.

    We set \code{n1} to a power of 2 about the square root of $n$.

    As per the matrix fourier FFT the sqrt2 layer is folded into the the 
    final column IFFTs for better cache locality and the extra twiddles that 
    occur in the matrix fourier algorithm are combined with the butterflied 
    performed at the first layer of the final column transforms.

    We require $nw$ to be at least 64 and the three temporary space pointers 
    to point to blocks of size \code{n*w + FLINT_BITS} bits.

void fft_mfa_truncate_sqrt2_outer(mp_limb_t ** ii, mp_size_t n,
                      mp_bitcnt_t w, mp_limb_t ** t1, mp_limb_t ** t2,
                             mp_limb_t ** temp, mp_size_t n1, mp_size_t trunc)

    Just the outer layers of \code{fft_mfa_truncate_sqrt2}.

void fft_mfa_truncate_sqrt2_inner(mp_limb_t ** ii, mp_limb_t ** jj,
          mp_size_t n, mp_bitcnt_t w, mp_limb_t ** t1, mp_limb_t ** t2,
             mp_limb_t ** temp, mp_size_t n1, mp_size_t trunc, mp_limb_t * tt)

    The inner layers of \code{fft_mfa_truncate_sqrt2} and 
    \code{ifft_mfa_truncate_sqrt2} combined with pointwise mults.

void ifft_mfa_truncate_sqrt2_outer(mp_limb_t ** ii, mp_size_t n,
                      mp_bitcnt_t w, mp_limb_t ** t1, mp_limb_t ** t2,
                             mp_limb_t ** temp, mp_size_t n1, mp_size_t trunc)

    The outer layers of \code{ifft_mfa_truncate_sqrt2} combined with
    normalisation.

*******************************************************************************

    Negacyclic multiplication

*******************************************************************************

void fft_negacyclic(mp_limb_t ** ii, mp_size_t n, mp_bitcnt_t w, 
                          mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp)

    As per \code{fft_radix2} except that it performs a sqrt2 negacyclic 
    transform of length $2n$. This is the same as the radix 2 transform 
    except that the $i$-th coefficient of the input is first multiplied by 
    $\sqrt{2}^{iw}$.

    We require $nw$ to be at least 64 and the two temporary space pointers to
    point to blocks of size \code{n*w + FLINT_BITS} bits.

void ifft_negacyclic(mp_limb_t ** ii, mp_size_t n, mp_bitcnt_t w, 
                          mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp)

    As per \code{ifft_radix2} except that it performs a sqrt2 negacyclic 
    inverse transform of length $2n$. This is the same as the radix 2 inverse
    transform except that the $i$-th coefficient of the output is finally 
    divided by $\sqrt{2}^{iw}$.

    We require $nw$ to be at least 64 and the two temporary space pointers to
    point to blocks of size \code{n*w + FLINT_BITS} bits.

void fft_naive_convolution_1(mp_limb_t * r, mp_limb_t * ii, 
                                                 mp_limb_t * jj, mp_size_t m)

    Performs a naive negacyclic convolution of \code{ii} with \code{jj}, 
    both of length $m$ and sets $r$ to the result. This is essentially 
    multiplication of polynomials modulo $x^m + 1$.

void _fft_mulmod_2expp1(mp_limb_t * r1, mp_limb_t * i1, mp_limb_t * i2, 
                         mp_size_t r_limbs, mp_bitcnt_t depth, mp_bitcnt_t w)

    Multiply \code{i1} by \code{i2} modulo \code{B^r_limbs + 1} where 
    \code{r_limbs = nw/FLINT_BITS} with \code{n = 2^depth}. Uses the 
    negacyclic FFT convolution CRT'd with a 1 limb naive convolution. We 
    require that \code{depth} and \code{w} have been selected as per the 
    wrapper \code{fft_mulmod_2expp1} below.

slong fft_adjust_limbs(mp_size_t limbs)

    Given a number of limbs, returns a new number of limbs (no more than 
    the next power of 2) which will work with the Nussbaumer code. It is only 
    necessary to make this adjustment if 
    \code{limbs > FFT_MULMOD_2EXPP1_CUTOFF}.

void fft_mulmod_2expp1(mp_limb_t * r, mp_limb_t * i1, mp_limb_t * i2, 
                                    mp_size_t n, mp_size_t w, mp_limb_t * tt)

    As per \code{_fft_mulmod_2expp1} but with a tuned cutoff below which more 
    classical methods are used for the convolution. The temporary space is 
    required to fit \code{n*w + FLINT_BITS} bits. There are no restrictions 
    on $n$, but if \code{limbs = n*w/FLINT_BITS} then if \code{limbs} exceeds 
    \code{FFT_MULMOD_2EXPP1_CUTOFF} the function \code{fft_adjust_limbs} must
    be called to increase the number of limbs to an appropriate value.

*******************************************************************************

    Integer multiplication

*******************************************************************************

void mul_truncate_sqrt2(mp_ptr r1, mp_srcptr i1, mp_size_t n1,
                        mp_srcptr i2, mp_size_t n2, mp_bitcnt_t depth, mp_bitcnt_t w)

    Integer multiplication using the radix 2 truncated sqrt2 transforms. 

    Set \code{(r1, n1 + n2)} to the product of \code{(i1, n1)} by 
    \code{(i2, n2)}. This is achieved through an FFT convolution of length at 
    most \code{2^(depth + 2)} with coefficients of size $nw$ bits where 
    \code{n = 2^depth}. We require \code{depth >= 6}. The input data is 
    broken into chunks of data not exceeding \code{(nw - (depth + 1))/2} 
    bits. If breaking the first integer into chunks of this size results in 
    \code{j1} coefficients and breaking the second integer results in 
    \code{j2} chunks then \code{j1 + j2 - 1 <= 2^(depth + 2)}.

    If \code{n = 2^depth} then we require $nw$ to be at least 64.

void mul_mfa_truncate_sqrt2(mp_ptr r1, mp_srcptr i1, mp_size_t n1,
                        mp_srcptr i2, mp_size_t n2, mp_bitcnt_t depth, mp_bitcnt_t w)

    As for \code{mul_truncate_sqrt2} except that the cache friendly matrix
    fourier algorithm is used.

    If \code{n = 2^depth} then we require $nw$ to be at least 64. Here we
    also require $w$ to be $2^i$ for some $i \geq 0$. 

void flint_mpn_mul_fft_main(mp_ptr r1, mp_srcptr i1, mp_size_t n1,
                        mp_srcptr i2, mp_size_t n2)

    The main integer multiplication routine. Sets \code{(r1, n1 + n2)} to
    \code{(i1, n1)} times \code{(i2, n2)}. We require \code{n1 >= n2 > 0}.

*******************************************************************************

    Convolution

*******************************************************************************

void fft_convolution(mp_limb_t ** ii, mp_limb_t ** jj, slong depth, 
                              slong limbs, slong trunc, mp_limb_t ** t1, 
                             mp_limb_t ** t2, mp_limb_t ** s1, mp_limb_t * tt)

    Perform an FFT convolution of \code{ii} with \code{jj}, both of length 
    \code{4*n} where \code{n = 2^depth}. Assume that all but the first 
    \code{trunc} coefficients of the output (placed in \code{ii}) are zero.
    Each coefficient is taken modulo \code{B^limbs + 1}. The temporary 
    spaces \code{t1}, \code{t2} and \code{s1} must have \code{limbs + 1} 
    limbs of space and \code{tt} must have \code{2*(limbs + 1)} of free 
    space.
