#ifndef SOLVERS_INCLUDED
#define SOLVERS_INCLUDED

#define USE_CHOLMOD 0
#define USE_EIGEN 0
#define USE_DIRECT_SOLVER ( USE_CHOLMOD || USE_EIGEN )

#include <Util/fftw3.h>
#include <Util/SoRMetric.h>
#include <Util/Array.h>
#include <Util/Util.h>

#if USE_EIGEN
#include <Eigen/Sparse>
#endif // USE_EIGEN

#if USE_CHOLMOD
#pragma message( "[WARNING] Need to explicitly exclude VCOMP.lib" )
#include <Cholmod/cholmod.h>
#pragma comment( lib , "CHOLMOD.lib" )
#define DLONG
#ifdef DLONG
typedef long long SOLVER_LONG;
#define CHOLMOD( name ) cholmod_l_ ## name
#else // !DLONG
typedef       int SOLVER_LONG;
#define CHOLMOD( name ) cholmod_ ## name
#endif // DLONG
#endif // USE_CHOLMOD

// Abstract interface of a linear solver: given the right-hand side b, write the solution into x.
template< class Real >
class Solver
{
public:
	// Virtual destructor: derived solvers are owned and deleted through Solver< Real >* pointers
	// (e.g. by PoissonSolver), which is undefined behavior without it.
	virtual ~Solver( void ){}
	// Solves the system for the right-hand side b and stores the result in x.
	virtual void solve( ConstPointer( Real ) b , Pointer( Real ) x ) = 0;
	// Number of non-zero entries reported by the solver; this default implementation returns 0.
	virtual int nonZeros( void ) const { return 0; }
};
#if USE_EIGEN
// Solver backed by Eigen's SimplicialLLT factorization of a double-precision sparse matrix.
// The matrix is factored once at construction; solve() re-uses the factorization.
template< class Real >
class EigenSolver : public Solver< Real >
{
	// Dimension of the (square) system
	int _dim;
	// The factorization computed by the constructors
	Eigen::SimplicialLLT< Eigen::SparseMatrix< double > > _solver;
	// Double-precision work vector used to convert to/from Real in solve()
	Eigen::Matrix< double , Eigen::Dynamic , 1 > _temp;

public:
	// Factor the system given as a sparse matrix
	template< class _Real >
	EigenSolver( const SparseMatrix< _Real , int >& M );

	// Factor the system given as a banded matrix
	template< class _Real , unsigned int Radius >
	EigenSolver( const BandedMatrix< _Real , Radius >& M );

	// Solve for the right-hand side b, writing the solution to x
	void solve( ConstPointer( Real ) b , Pointer( Real ) x );

	// Number of non-zeros in the factor returned by matrixU()
	int nonZeros( void ) const
	{
		Eigen::SparseMatrix< double > m = _solver.matrixU();
		return m.nonZeros();
	}

	// Convenience: factor M and solve a single system
	template< class _Real >
	static void Solve( const SparseMatrix< _Real , int >&M , ConstPointer( Real ) b , Pointer( Real ) x )
	{
		EigenSolver solver( M );
		solver.solve( b , x );
	}
};
#endif // USE_EIGEN
// Solver for tridiagonal systems whose first and last rows may also carry a wrap-around (corner) entry.
// The matrix is eliminated once in _init(); the solve methods then only perform the two sweeps.
template< class Real >
class TridiagonalSolver : public Solver< Real >
{
	// Dimension of the system
	int _dim;
	// Corner coefficients taken out of the matrix by _init()
	Real _alpha , _beta;
	// _a / _b / _c: sub-diagonal, diagonal and super-diagonal coefficients (overwritten by _init()).
	// _w: auxiliary vector filled by _init() when a corner coefficient is non-zero.
	std::vector< Real > _a , _b , _c , _w;

	// Sweeps for a single right-hand side
	template< class Data >
	void _solve ( ConstPointer( Data ) b , Pointer( Data ) x );
	// Sweeps for two right-hand sides stored interleaved (entries 2*i and 2*i+1)
	template< class Data >
	void _solve2( ConstPointer( Data ) b , Pointer( Data ) x );
	// Eliminates the coefficients stored in _a , _b , _c
	void _init( void );

public:
	template< class _Real >
	TridiagonalSolver( const SparseMatrix< _Real , int >& M );
	template< class _Real >
	TridiagonalSolver( const BandedMatrix< _Real , 1 >& M );

	// Solver< Real > interface and its interleaved two-system counterpart
	void solve ( ConstPointer( Real ) b , Pointer( Real ) x );
	void solve2( ConstPointer( Real ) b , Pointer( Real ) x );

	// The same solves for an arbitrary data type
	template< class Data >
	void solve ( ConstPointer( Data ) b , Pointer( Data ) x );
	template< class Data >
	void solve2( ConstPointer( Data ) b , Pointer( Data ) x );

	// Convenience: build a solver for M and solve a single system
	template< class _Real , class Data >
	static void Solve( const SparseMatrix< _Real , int >& M , ConstPointer( Data ) b , Pointer( Data ) x )
	{
		TridiagonalSolver solver(M);
		solver.solve( b , x );
	}
	template< class _Real , class Data >
	static void Solve( const BandedMatrix< _Real , 1 >& M , ConstPointer( Data ) b , Pointer( Data ) x )
	{
		TridiagonalSolver solver(M);
		solver.solve( b , x );
	}
};

#if USE_EIGEN
// Builds the Eigen sparse matrix from the entries of M and factors it.
// Prints an error and exits if the factorization does not succeed.
template< class Real >
template< class _Real >
EigenSolver< Real >::EigenSolver( const SparseMatrix< _Real , int >& M )
{
	_dim = (int)M.Rows();

	// Gather the entries of M as (row , column , value) triplets
	std::vector< Eigen::Triplet< double > > entries;
	entries.reserve( M.Entries() );
	for( int row=0 ; row<_dim ; row++ )
		for( int k=0 ; k<M.rowSizes[row] ; k++ )
			entries.push_back( Eigen::Triplet< double >( row , M[row][k].N , (double)M[row][k].Value ) );

	Eigen::SparseMatrix< double > eigenM( _dim , _dim );
	eigenM.setFromTriplets( entries.begin() , entries.end() );

	_solver.compute( eigenM );
	if( _solver.info()!=Eigen::Success )
	{
		fprintf( stderr , "[ERROR] EigenSolver::EigenSolver: Failed to factorize matrix\n" );
		exit(0);
	}
	_temp.resize( _dim );
}
// Builds the Eigen sparse matrix from a banded matrix and factors it.
// Band entry j of row i is placed in column (i-Radius+j) taken modulo the dimension.
// Prints an error and exits if the factorization does not succeed.
template< class Real >
template< class _Real , unsigned int Radius >
EigenSolver< Real >::EigenSolver( const BandedMatrix< _Real , Radius >& M )
{
	_dim = (int)M.rows();

	// Gather the band entries as (row , column , value) triplets
	std::vector< Eigen::Triplet< double > > entries;
	entries.reserve( M.entries() );
	for( int row=0 ; row<_dim ; row++ )
		for( int j=0 ; j<=2*Radius ; j++ )
			entries.push_back( Eigen::Triplet< double >( row , (row+_dim-Radius+j) % _dim , (double)M[row][j] ) );

	Eigen::SparseMatrix< double > eigenM( _dim , _dim );
	eigenM.setFromTriplets( entries.begin() , entries.end() );

	_solver.compute( eigenM );
	if( _solver.info()!=Eigen::Success )
	{
		fprintf( stderr , "[ERROR] EigenSolver::EigenSolver: Failed to factorize matrix\n" );
		exit(0);
	}
	_temp.resize( _dim );
}
// Solves the factored system: b is converted to double, solved in place in _temp, and converted back into x.
template< class Real >
void EigenSolver< Real >::solve( ConstPointer( Real ) b , Pointer( Real ) x )
{
	// Real -> double
	for( int k=0 ; k<_dim ; k++ )
	{
		_temp[k] = (double)b[k];
	}

	_temp = _solver.solve( _temp );

	// double -> Real
	for( int k=0 ; k<_dim ; k++ )
	{
		x[k] = (Real)_temp[k];
	}
}
#endif // USE_EIGEN
#if USE_CHOLMOD
// Solver backed by a CHOLMOD factorization of a sparse matrix.
// _init() sets up the sparsity pattern and the symbolic analysis, _update() copies the values and factors.
template< class Real >
class CholmodSolver : public Solver< Real >
{
	// When true, only the entries M[i][j] whose index N is >= i are handed to CHOLMOD
	const static bool LOWER_TRIANGULAR = true;
	// Dimension of the system
	int dim;
	// Factorization, right-hand-side buffer and system matrix owned by this solver
	cholmod_factor* cholmod_L;
	cholmod_dense*  cholmod_b;
	cholmod_sparse* cholmod_M;
	std::vector< bool > flaggedValues;

	// Allocates the CHOLMOD matrix, fills in its pattern and runs the analysis
	template< class _Real > void   _init( const SparseMatrix< _Real , int >& M );
	// Copies the values of M and factors; returns false if CHOLMOD reports a non-OK status
	template< class _Real > bool _update( const SparseMatrix< _Real , int >& M );

public:
	// CHOLMOD workspace shared by all solvers of this Real type, and whether it has been started
	static cholmod_common cholmod_C;
	static bool cholmod_C_set;

	template< class _Real > CholmodSolver( const SparseMatrix< _Real , int >& M );
	~CholmodSolver( void );

	// Solve for the right-hand side b, writing the solution to x
	void solve ( ConstPointer( Real ) b , Pointer( Real ) x );
	// Number of non-zeros in the factor
	int nonZeros( void ) const;
};
// Definitions of the static members: one CHOLMOD workspace (and its "started" flag) per Real type.
// The flag starts out false so that the first _init() call starts the workspace.
template< class Real > bool CholmodSolver< Real >::cholmod_C_set = false;
template< class Real > cholmod_common CholmodSolver< Real >::cholmod_C;

// Sets up the CHOLMOD structures for M and factors it.
// Note that the status returned by _update() is not inspected here.
template< class Real >
template< class _Real >
CholmodSolver< Real >::CholmodSolver( const SparseMatrix< _Real , int >& M )
{
	// Pattern + symbolic analysis
	this->_init( M );
	// Values + numeric factorization
	this->_update( M );
}
// Allocates the CHOLMOD sparse matrix for M, fills in its sparsity pattern (column pointers and
// row indices), runs the symbolic analysis and allocates the right-hand-side buffer.
// The numerical values are filled in separately by _update().
template< class Real >
template< class _Real >
void CholmodSolver< Real >::_init( const SparseMatrix< _Real , int >& M )
{
	// Start the shared CHOLMOD workspace the first time a solver is initialized
	if( !cholmod_C_set ) CHOLMOD(start)( &cholmod_C );
	cholmod_C_set = true;

	dim = M.rows;

	// allocate_sparse() already reserves the index and value arrays for maxEntries entries.
	// They must not be replaced with separately malloc'ed buffers: doing so leaks the arrays CHOLMOD allocated.
	int maxEntries;
	if( LOWER_TRIANGULAR )
	{
		// One triangle of the matrix: half of the off-diagonal entries plus the diagonal
		maxEntries = (int)( ( M.Entries()-M.rows ) / 2 + M.rows );
		cholmod_M = CHOLMOD(allocate_sparse)( dim , dim , maxEntries , 0 , 1 , -1 , CHOLMOD_REAL , &cholmod_C );
	}
	else
	{
		maxEntries = (int)M.Entries();
		cholmod_M = CHOLMOD(allocate_sparse)( dim , dim , maxEntries , 0 , 1 ,  0 , CHOLMOD_REAL , &cholmod_C );
	}

	SOLVER_LONG *_p = (SOLVER_LONG*)cholmod_M->p;
	SOLVER_LONG *_i = (SOLVER_LONG*)cholmod_M->i;

	// Record where each row of M starts and the indices of the entries that are kept
	int off = 0;
	for( int i=0 ; i<M.rows ; i++ )
	{
		_p[i] = off;
		for( int j=0 ; j<M.rowSizes[i] ; j++ ) if( !LOWER_TRIANGULAR || M[i][j].N>=i ) _i[off++] = M[i][j].N;
	}
	_p[dim] = off;

	cholmod_L = CHOLMOD(analyze)( cholmod_M , &cholmod_C );
	cholmod_b = CHOLMOD(allocate_dense)( dim , 1 , dim , cholmod_M->xtype , &cholmod_C );
}
// Copies the values of M into the CHOLMOD matrix (using the pattern laid out by _init()) and
// computes the numeric factorization.
// Returns true on success; on any non-OK CHOLMOD status a warning is printed and false is returned.
template< class Real >
template< class _Real >
bool CholmodSolver< Real >::_update( const SparseMatrix< _Real , int >& M )
{
	double *_x = (double*)cholmod_M->x;
	SOLVER_LONG *_p = (SOLVER_LONG*)cholmod_M->p;

	// Row i writes only to the slice of the value array starting at _p[i], so rows are independent
#pragma omp parallel for
	for( int i=0 ; i<M.rows ; i++ )
	{
		int off = (int)_p[i];
		for( int j=0 ; j<M.rowSizes[i] ; j++ ) if( !LOWER_TRIANGULAR || M[i][j].N>=i ) _x[off++] = double( M[i][j].Value );
	}

	cholmod_C.print = 0;
	CHOLMOD(factorize)( cholmod_M , cholmod_L , &cholmod_C );
	if( cholmod_C.status==CHOLMOD_NOT_POSDEF )
	{
		fprintf( stderr , "[WARNING] Matrix not positive-definite\n" );
		return false;
	}
	else if( cholmod_C.status==CHOLMOD_OUT_OF_MEMORY )
	{
		fprintf( stderr , "[WARNING] CHOLMOD ran out of memory\n" );
		return false;
	}
	else if( cholmod_C.status!=CHOLMOD_OK )
	{
		fprintf( stderr , "[WARNING] CHOLMOD status not OK: %d\n" , cholmod_C.status );
		return false;
	}
	return true;
}
// Releases the factor, the right-hand-side buffer and the system matrix owned by this solver.
// The shared workspace cholmod_C is left untouched.
template< class Real >
CholmodSolver< Real >::~CholmodSolver( void )
{
	if( cholmod_L )
	{
		CHOLMOD(free_factor)( &cholmod_L , &cholmod_C );
		cholmod_L = NULL;
	}
	if( cholmod_b )
	{
		CHOLMOD(free_dense )( &cholmod_b , &cholmod_C );
		cholmod_b = NULL;
	}
	if( cholmod_M )
	{
		CHOLMOD(free_sparse)( &cholmod_M , &cholmod_C );
		cholmod_M = NULL;
	}
}

// Solves the factored system for the right-hand side b and writes the solution to x.
// b is converted to double in the pre-allocated buffer cholmod_b; the dense result returned by
// CHOLMOD is converted back to Real and then released.
// Prints an error and exits if CHOLMOD does not return a solution.
template< class Real >
void CholmodSolver< Real >::solve( ConstPointer( Real ) b , Pointer( Real ) x )
{
	double* _b = (double*)cholmod_b->x;
	for( int i=0 ; i<dim ; i++ ) _b[i] = (double)b[i];

	cholmod_dense* cholmod_x = CHOLMOD(solve)( CHOLMOD_A , cholmod_L , cholmod_b , &cholmod_C );
	// A failed solve returns NULL; report it instead of dereferencing the null pointer
	if( !cholmod_x ) fprintf( stderr , "[ERROR] CholmodSolver::solve: CHOLMOD failed to solve the system\n" ) , exit(0);
	double* _x = (double*)cholmod_x->x;
	for( int i=0 ; i<dim ; i++ ) x[i] = (Real)_x[i];

	CHOLMOD(free_dense)( &cholmod_x , &cholmod_C );
}
// Returns the number of non-zero entries in the factor cholmod_L.
// - Simplicial numeric factor: sum of the per-column counts stored in nz.
// - Supernodal factor: for every supernode with nscol columns and nsrow rows, the entries of its
//   dense block on or below the diagonal: nscol*nsrow - (nscol*nscol-nscol)/2.
// The integer arrays of the factor are read as SOLVER_LONG, matching the CHOLMOD interface selected by
// the CHOLMOD( name ) macro; the count is accumulated in 64 bits and narrowed to int on return.
template< class Real >
int CholmodSolver< Real >::nonZeros( void ) const
{
	long long nz = 0;
	if( cholmod_L->xtype!=CHOLMOD_PATTERN && !cholmod_L->is_super )
	{
		const SOLVER_LONG* colCounts = (const SOLVER_LONG*)cholmod_L->nz;
		for( size_t i=0 ; i<cholmod_L->n ; i++ ) nz += colCounts[i];
	}
	// The supernodal arrays only exist for supernodal factors
	if( cholmod_L->is_super )
	{
		bool examine_super;
		if( cholmod_L->xtype!=CHOLMOD_PATTERN ) examine_super = true;
		else                                    examine_super = ( ((const SOLVER_LONG*)cholmod_L->s)[0]!=(-1) );
		if( examine_super )
		{
			const SOLVER_LONG* super = (const SOLVER_LONG*)cholmod_L->super;
			const SOLVER_LONG* pi    = (const SOLVER_LONG*)cholmod_L->pi;
			for( size_t s=0 ; s<cholmod_L->nsuper ; s++ )
			{
				long long nscol = super[s+1] - super[s];
				long long nsrow = pi[s+1] - pi[s];
				nz += nscol * nsrow - ( nscol*nscol - nscol )/2;
			}
		}
	}
	return (int)nz;
}
#endif // USE_CHOLMOD

// Pre-processes the coefficients in _a (sub-diagonal), _b (diagonal) and _c (super-diagonal) so that
// _solve() only has to run a forward and a backward sweep:
//  1. The two corner (wrap-around) entries are removed and remembered in _alpha / _beta.
//  2. The remaining tridiagonal matrix is eliminated in place.
//  3. If a corner entry was non-zero, the auxiliary vector _w used by solve()/solve2() is computed.
template< class Real >
void TridiagonalSolver< Real >::_init( void )
{
	// Remove the cyclic (corner) entries so that the remaining matrix M is purely tridiagonal:
	// _a[0] (first row) is stored in _beta and _c.back() (last row) in _alpha, both are zeroed, and the
	// first/last diagonal entries are adjusted so that the original matrix equals M + u v^t (see below).
	_beta = _a[0] , _alpha = _c.back();
	_a[0] -= _beta , _c.back() -= _alpha;
	_b[0] -= _alpha , _b.back() -= _beta;

	// Rescale the first row so that the diagonal is one
	//     { _b[0] , _c[0] , 0  , ... }
	// - > {  -> 1 , _c[0] / _b[0] , 0 , ... = 1 , _c'[0] , 0 , ... }
	_c[0] /= _b[0];
	// Subtract off _a[i] times row i-1:
	// { 0 , ... , 0 ,   1   , _c'[i-1] ,    0  , 0 , ... }
	// { 0 , ... , 0 , _a[i] , _b [i]   , _c[i] , 0 , ... }
	// ->
	// { 0 , ... , 0 , 0 , _b[i]-_a[i]*_c'[i-1] , _c[i] , 0 , ... }
	// Rescale the i-th row so that the diagonal is one:
	// { 0 , ... , 0 , 0 , _b[i]-_a[i]*_c'[i-1] , _c[i] , 0 , ... }
	// ->
	// { 0 , ... , 0 , 0 , 1 , _c[i] / ( _b[i]-_a[i]*_c'[i-1] ) , 0 , ... } = { 0 , ... , 0 , 0 , 1 , _c'[i] , 0 , ... }
	// After this step _b[i] holds the reciprocal of the i-th pivot, while _a[i] and _c[i] hold the
	// sub-/super-diagonal coefficients divided by that pivot.
	_b[0] = (Real)1. / _b[0];
	for( int i=1 ; i<_dim ; i++ )
	{
		Real temp = _b[i] - _a[i] * _c[i-1];
		_b[i] = (Real)1. / temp  , _a[i] /= temp , _c[i] /= temp;
	}
	if( _alpha || _beta )
	{
		// In this case, the matrix that we want to invert is:
		//		A = M + u v^t
		// with:
		//		u = (1,0,...,0,1)
		//		v = (_alpha,0,...,0,_beta)
		// Using the Sherman-Morrison formula we have:
		//		A^{-1}b = M^{-1}b - ( M^{-1} u v^t M^{-1}b ) /( 1 + v^t M^{-1} u )
		// Setting w = M^{-1} u this gives:
		//		A^{-1}b = M^{-1}b - ( w v^t M^{-1}b ) /( 1 + v^t w )
		// Here w is computed once (with the sweeps of _solve) and cached in _w.
		std::vector< Real > u( _dim );
		_w.resize( _dim );
		u[0] = u[_dim-1] = (Real)1.;
		_solve( ( ConstPointer(Real) )GetPointer( u ) , ( Pointer(Real) )GetPointer( _w ) );
	}
}
// Builds the solver from a sparse matrix.
// Every entry of row i must sit on the diagonal, in column i-1 or in column i+1 (indices taken modulo
// the dimension); entries are accumulated into _b , _a and _c respectively. Any other entry is reported
// as an error and the program exits.
template< class Real >
template< class _Real >
TridiagonalSolver< Real >::TridiagonalSolver( const SparseMatrix< _Real , int >& M )
{
	_dim = M.rows;
	_a.resize( _dim , (Real)0. );
	_b.resize( _dim , (Real)0. );
	_c.resize( _dim , (Real)0. );

	for( int row=0 ; row<M.rows ; row++ )
	{
		// Cyclic neighbors of the diagonal
		const int prev = ( row-1+_dim ) % _dim;
		const int next = ( row+1      ) % _dim;
		for( int k=0 ; k<M.rowSizes[row] ; k++ )
		{
			const int col = M[row][k].N;
			if     ( col==row  ) _b[row] += (Real)M[row][k].Value;
			else if( col==prev ) _a[row] += (Real)M[row][k].Value;
			else if( col==next ) _c[row] += (Real)M[row][k].Value;
			else
			{
				fprintf( stderr , "[ERROR] Matrix is not tridiagonal: %d %d\n" , row , col );
				exit(0);
			}
		}
	}
	_init();
}
// Builds the solver from a radius-1 banded matrix: band entries 0 , 1 , 2 of each row are taken as the
// sub-diagonal, diagonal and super-diagonal coefficients.
template< class Real >
template< class _Real >
TridiagonalSolver< Real >::TridiagonalSolver( const BandedMatrix< _Real , 1 >& M )
{
	_dim = (int)M.rows();
	_a.resize( _dim , (Real)0. );
	_b.resize( _dim , (Real)0. );
	_c.resize( _dim , (Real)0. );

	for( int row=0 ; row<_dim ; row++ )
	{
		_a[row] = (Real)M[row][0];
		_b[row] = (Real)M[row][1];
		_c[row] = (Real)M[row][2];
	}
	_init();
}
// Runs the forward and backward sweeps on the coefficients prepared by _init().
// This solves the system without the corner terms; solve() applies the correction for them.
template< class Real >
template< class Data >
void TridiagonalSolver< Real >::_solve( ConstPointer( Data ) b , Pointer( Data ) x )
{
	// Forward sweep ( _b holds the reciprocals of the pivots )
	x[0] = b[0] * _b[0];
	for( int row=1 ; row<_dim ; row++ )
	{
		x[row] = b[row] * _b[row] - _a[row] * x[row-1];
	}
	// Backward sweep
	for( int row=_dim-1 ; row>0 ; row-- )
	{
		x[row-1] -= _c[row-1] * x[row];
	}
}
// Same sweeps as _solve(), applied to two systems at once: b and x hold the two right-hand sides /
// solutions interleaved, with the i-th unknowns at positions 2*i and 2*i+1.
template< class Real >
template< class Data >
void TridiagonalSolver< Real >::_solve2( ConstPointer( Data ) b , Pointer( Data ) x )
{
	// Forward sweep
	x[0] = b[0] * _b[0];
	x[1] = b[1] * _b[0];
	for( int row=1 ; row<_dim ; row++ )
	{
		const int cur = row<<1 , prev = (row-1)<<1;
		x[cur  ] = b[cur  ] * _b[row] - _a[row] * x[prev  ];
		x[cur|1] = b[cur|1] * _b[row] - _a[row] * x[prev|1];
	}
	// Backward sweep
	for( int row=_dim-2 ; row>=0 ; row-- )
	{
		const int cur = row<<1 , next = (row+1)<<1;
		x[cur  ] -= _c[row] * x[next  ];
		x[cur|1] -= _c[row] * x[next|1];
	}
}
// Solver< Real > interface: forwards to the templated solve with Data = Real.
template< class Real >
void TridiagonalSolver< Real >::solve ( ConstPointer( Real ) b , Pointer( Real ) x )
{
	// The explicit template argument selects the member template rather than this function
	solve < Real >( b , x );
}
// Interleaved two-system solve for Real data: forwards to the templated solve2 with Data = Real.
template< class Real >
void TridiagonalSolver< Real >::solve2( ConstPointer( Real ) b , Pointer( Real ) x )
{
	// The explicit template argument selects the member template rather than this function
	solve2< Real >( b , x );
}
// Solves the full system, including the corner terms.
// The tridiagonal part is solved with _solve(); if a corner coefficient is non-zero, the result is
// corrected with the cached vector _w:  x -= _w * ( v^t x ) / ( 1 + v^t _w ) , v = (_alpha,0,...,0,_beta).
template< class Real >
template< class Data >
void TridiagonalSolver< Real >::solve( ConstPointer( Data ) b , Pointer( Data ) x )
{
	_solve( b , x );
	if( !( _alpha || _beta ) ) return;

	const int last = _dim-1;
	const Real denominator = (Real)( 1. + _w[0] * _alpha + _w[last] * _beta );
	Data scale = ( x[0] * _alpha + x[last] * _beta ) / denominator;
	for( int k=0 ; k<_dim ; k++ )
	{
		x[k] -= scale * _w[k];
	}
}
// Solves two interleaved systems (unknown i at positions 2*i and 2*i+1), including the corner terms.
// The tridiagonal parts are solved with _solve2(); if a corner coefficient is non-zero, each of the two
// solutions is corrected with the cached vector _w, exactly as in solve().
template< class Real >
template< class Data >
void TridiagonalSolver< Real >::solve2( ConstPointer( Data ) b , Pointer( Data ) x )
{
	_solve2( b , x );
	if( !( _alpha || _beta ) ) return;

	const int last = _dim-1;
	// The denominator is the same for both systems
	const Real denominator = (Real)( 1. + _w[0] * _alpha + _w[last] * _beta );
	Data scale0 = ( x[0] * _alpha + x[ last<<1   ] * _beta ) / denominator;
	Data scale1 = ( x[1] * _alpha + x[(last<<1)|1] * _beta ) / denominator;
	for( int k=0 ; k<_dim ; k++ )
	{
		x[ k<<1   ] -= scale0 * _w[k];
		x[(k<<1)|1] -= scale1 * _w[k];
	}
}

#if USE_DIRECT_SOLVER
// Direct (sparse factorization) solver for the screened-Poisson system
//     ( stiffnessWeight * stiffness + massWeight * mass ) x = b
// with an optional second solve against ( diffusionWeight * stiffness + mass ).
template< class Real >
struct PoissonSolver
{
protected:
	// Number of threads used for the matrix products
	int _threads;
	// System matrices obtained from the parameterization
	SparseMatrix< double , int > _mass , _stiffness;
	// Factorizations owned by this object ( _diffusionSolver is NULL when diffusion is disabled )
	Solver< Real > *_poissonSolver , *_diffusionSolver;
	// Scratch vector holding the right-hand side
	std::vector< Real > _temp;

public:
	// Construction parameters and their defaults
	struct Params
	{
		Real massWeight , stiffnessWeight , diffusionWeight;
		int threads;
		bool verbose;
		Params( void )
		{
			massWeight = (Real)1e-8;
			stiffnessWeight = (Real)1.;
			diffusionWeight = (Real)0.;
			threads = 1;
			verbose = false;
		}
	};

	PoissonSolver( void );
	template< class Parameterization >
	PoissonSolver( const Parameterization& param , Params params = Params() );
	~PoissonSolver( void );

	// Builds the factorizations from the matrices already stored in _mass / _stiffness
	void _set( Params params );
	// Queries the parameterization for the system matrices and builds the factorizations
	template< class Parameterization >
	void set( const Parameterization& param , Params params );
	// Solves in place; optionally multiplies the input by the mass matrix first and/or runs the diffusion solve
	void solve( Pointer( Real ) inOut , bool solveDiffusion=true , bool preMassMultiply=false );

	// Rebuilds ( or removes , when the weight is zero ) the diffusion factorization
	void resetDiffusion( Real diffusionWeight );
};
#endif // USE_DIRECT_SOLVER
// Applies a 1D FFTW transform to every row of a _resX x _resY grid, in place, using one plan and one
// scratch row per thread. Only the float and double specializations are functional; the generic
// versions of the constructor and of runForward / runBackward print an error and exit.
template< class Real >
struct FourierRows
{
public:
	// Construction parameters and their defaults
	struct Params
	{
		int threads , planType;
		Params( void )
		{
			threads = 1;
			planType = FFTW_PATIENT;
		}
	};

protected:
	int _threads;
	// Grid resolution and the value _resX/2
	int _resX , _resY , _bw;
	RegularGridFEM::GridType _gridType;
	// Type-erased arrays of forward / backward plans ( one entry per thread )
	void *_fPlans , *_bPlans;
	// Scratch memory , 2*(_bw+1) values per thread
	Real* _coefficients;

	// Row-by-row transforms , parameterized over the FFTW precision through the execute function
	template< class FFTWPlan >
	void _runForward ( Pointer( Real ) inOut , void (*FFTWExecutePlan)( FFTWPlan ) );
	template< class FFTWPlan >
	void _runBackward( Pointer( Real ) inOut , void (*FFTWExecutePlan)( FFTWPlan ) );

	// Allocates the scratch memory and creates the plans with the supplied FFTW planner functions
	template< class FFTWComplex , class FFTWPlan , class FFTWR2RKind >
	void _set( int resX , int resY , RegularGridFEM::GridType gridType , Params params ,
		FFTWPlan (*FFTWDFTR2C1D)( int , Real* , FFTWComplex* , unsigned int ) ,
		FFTWPlan (*FFTWDFTC2R1D)( int , FFTWComplex* , Real* , unsigned int ) ,
		FFTWPlan (*FFTWDFTR2R1D)( int , Real* , Real* , FFTWR2RKind , unsigned int ) );

public:
	FourierRows( int resX , int resY , RegularGridFEM::GridType gridType , Params params=Params() );
	~FourierRows( void );
	void runForward ( Pointer( Real ) inOut );
	void runBackward( Pointer( Real ) inOut );
};
// Poisson solver for SoRParameterization grids: the rows are transformed with FourierRows and every
// resulting frequency is solved with its own tridiagonal system.
template< class Real >
struct SoRPoissonSolver
{
protected:
	int _threads;
	// Grid resolution and the value _resX/2
	int _resX , _resY , _bw;
	RegularGridFEM::GridType _gridType;
	// Per-frequency mass / stiffness matrices ( arrays allocated with new[] , may be NULL )
	BandedMatrix< double , 1 > *_mass , *_stiffness;

public:
	// Construction parameters and their defaults
	struct Params
	{
		Real massWeight , stiffnessWeight , diffusionWeight;
		int threads , planType;
		bool supportPreMultiply , supportDiffusion;
		bool verbose;
		Params( void )
		{
			massWeight = (Real)1e-8;
			stiffnessWeight = (Real)1.;
			diffusionWeight = (Real)0.;
			threads = 1;
			planType = FFTW_PATIENT;
			verbose = false;
			supportPreMultiply = true;
			supportDiffusion = true;
		}
	};

	// One tridiagonal solver per frequency system ( owned by this object )
	std::vector< TridiagonalSolver< Real >* > poissonSolvers , diffusionSolvers;
	// Per-thread scratch vectors ( owned by this object )
	std::vector< Pointer( Real ) > tVectors1 , tVectors2;
	// Row transform ( owned by this object )
	FourierRows< Real >* fourierRows;

	~SoRPoissonSolver( void );
	SoRPoissonSolver( void );
	SoRPoissonSolver( const SoRParameterization& sparam , Params params = Params() );
	// (Re-)builds the transform , the system matrices and the solvers for the given parameterization
	void set( const SoRParameterization& sparam , Params params );

	// Rebuilds ( or removes , when the weight is zero ) the per-frequency diffusion solvers
	void resetDiffusion( Real diffusionWeight );
	// Solves in place: forward transform , spectral solve , backward transform
	void solve( Pointer( Real ) inOut , bool solveDiffusion=true , bool preMassMultiply=false );
	// Solves in place on data that is already in the transformed domain
	void solveSpectral( Pointer( Real ) inOut , bool solveDiffusion=true , bool preMassMultiply=false );
};

#if USE_DIRECT_SOLVER
///////////////////
// PoissonSolver //
///////////////////
// Creates an empty solver: no factorizations , one thread.
template< class Real >
PoissonSolver< Real >::PoissonSolver( void )
	: _threads( 1 ) , _poissonSolver( NULL ) , _diffusionSolver( NULL )
{
}
// Creates the solver and immediately builds it for the given parameterization.
template< class Real >
template< class Parameterization >
PoissonSolver< Real >::PoissonSolver( const Parameterization& param , Params params )
{
	// The solver pointers must be null before set() runs , since it deletes any existing solver
	_poissonSolver = NULL;
	_diffusionSolver = NULL;
	set( param , params );
}
// Destroys the two factorizations owned by this object.
template< class Real >
PoissonSolver< Real >::~PoissonSolver( void )
{
	// Deleting a null pointer is a no-op , so no test is needed
	delete _poissonSolver;
	delete _diffusionSolver;
	_poissonSolver = NULL;
	_diffusionSolver = NULL;
}
// Assembles  stiffnessWeight * stiffness + massWeight * mass  from the stored system matrices, factors
// it (replacing any previous Poisson solver) and then (re-)builds the diffusion solver.
// The mass matrix is indexed with the sparsity pattern of the stiffness matrix, so both are expected to
// share the same pattern.
// With params.verbose set, the number of non-zeros in the factor and the factorization time are printed.
template< class Real >
void PoissonSolver< Real >::_set( Params params )
{
	SparseMatrix< double , int > M;
	M.resize( _mass.rows );
	for( int i=0 ; i<_mass.rows ; i++ )
	{
		M.SetRowSize( i , _stiffness.rowSizes[i] );
		for( int j=0 ; j<_stiffness.rowSizes[i] ; j++ ) M[i][j] = MatrixEntry< double , int >( _stiffness[i][j].N , _stiffness[i][j].Value * params.stiffnessWeight + _mass[i][j].Value * params.massWeight );
	}
	if( _poissonSolver ) delete _poissonSolver;
	double t = Time();
#if USE_CHOLMOD
	_poissonSolver = new CholmodSolver< Real >( M );
#else // !USE_CHOLMOD
	_poissonSolver = new EigenSolver< Real >( M );
#endif // USE_CHOLMOD
	if( params.verbose )
	{
		// Non-zeros of one triangle of the input (half of the off-diagonal entries plus the diagonal)
		size_t inNonZeros = ( M.Entries()-M.rows ) / 2 + M.rows;
		size_t outNonZeros = _poissonSolver->nonZeros();
		// The arguments are cast so that they match the conversion specifiers exactly
		printf( "NZ: %llu (%d x %f) [%f]  %.3f(s)\n" , (unsigned long long)outNonZeros , (int)_mass.rows , ( (float)outNonZeros )/_mass.rows , ( (float)outNonZeros ) / inNonZeros , Time()-t );
	}
	resetDiffusion( params.diffusionWeight );
}
// Discards the current diffusion solver and, if diffusionWeight is non-zero, factors
//     diffusionWeight * stiffness + mass
// to create a new one. The mass matrix is indexed with the sparsity pattern of the stiffness matrix.
template< class Real >
void PoissonSolver< Real >::resetDiffusion( Real diffusionWeight )
{
	if( _diffusionSolver )
	{
		delete _diffusionSolver;
		_diffusionSolver = NULL;
	}
	if( !diffusionWeight ) return;

	SparseMatrix< double , int > M;
	M.resize( _mass.rows );
	for( int row=0 ; row<_mass.rows ; row++ )
	{
		M.SetRowSize( row , _stiffness.rowSizes[row] );
		for( int k=0 ; k<_stiffness.rowSizes[row] ; k++ )
		{
			M[row][k] = MatrixEntry< double , int >( _stiffness[row][k].N , _stiffness[row][k].Value*diffusionWeight + _mass[row][k].Value );
		}
	}
	double t = Time();
#if USE_CHOLMOD
	_diffusionSolver = new CholmodSolver< Real >( M );
#else // !USE_CHOLMOD
	_diffusionSolver = new EigenSolver< Real >( M );
#endif // USE_CHOLMOD
}
// Queries the parameterization for the mass and stiffness matrices and builds the solvers from them.
template< class Real >
template< class Parameterization >
void PoissonSolver< Real >::set( const Parameterization& param , Params params )
{
	// Always use at least one thread
	_threads = std::max< int >( 1 , params.threads );

	param.poissonSystem( _mass , _stiffness , _threads );

	_set( params );
}
// Solves the Poisson system in place.
//   preMassMultiply: the input is first multiplied by the mass matrix; otherwise it is used as is.
//   solveDiffusion:  if a diffusion solver exists, the solution is multiplied by the mass matrix and
//                    passed through the diffusion solver as well.
template< class Real >
void PoissonSolver< Real >::solve( Pointer( Real ) inOut , bool solveDiffusion , bool preMassMultiply )
{
	_temp.resize( _mass.rows );

	// Build the right-hand side in _temp
	if( preMassMultiply )
	{
		_mass.MultiplyParallel( inOut , GetPointer( _temp ) , _threads , 0 );
	}
	else
	{
#pragma omp parallel for num_threads( _threads )
		for( int i=0 ; i<_mass.rows ; i++ ) _temp[i] = inOut[i];
	}
	_poissonSolver->solve( GetPointer( _temp ) , inOut );

	if( !solveDiffusion || !_diffusionSolver ) return;

	// Diffusion step: right-hand side is the mass matrix applied to the Poisson solution
	_mass.MultiplyParallel( ( ConstPointer( Real ) )inOut , GetPointer( _temp ) , _threads , 0 );
	_diffusionSolver->solve( GetPointer( _temp ) , inOut );
}
#endif // USE_DIRECT_SOLVER
//////////////////////
// SoRPoissonSolver //
//////////////////////
// Creates an empty solver. All owning pointers start out null: both set() and the destructor test
// fourierRows , _mass and _stiffness before deleting them, so none of them may be left uninitialized.
template< class Real >
SoRPoissonSolver< Real >::SoRPoissonSolver( void )
{
	_mass = _stiffness = NULL;
	fourierRows = NULL;
	_threads = 1;
	_resX = _resY = _bw = 0;
}
// Creates the solver and immediately builds it for the given parameterization.
template< class Real >
SoRPoissonSolver< Real >::SoRPoissonSolver( const SoRParameterization& sparam , Params params )
{
	// The owning pointers must be null before set() runs , since it releases whatever they point to
	_mass = NULL;
	_stiffness = NULL;
	fourierRows = NULL;
	set( sparam , params );
}
// Releases everything owned by the solver: the per-frequency solvers , the system matrices , the row
// transform and the per-thread scratch vectors.
template< class Real >
SoRPoissonSolver< Real >::~SoRPoissonSolver( void )
{
	for( int i=0 ; i<poissonSolvers.size() ; i++ ) delete poissonSolvers[i];
	for( int i=0 ; i<diffusionSolvers.size() ; i++ ) delete diffusionSolvers[i];
	poissonSolvers.resize( 0 );
	diffusionSolvers.resize( 0 );

	if( _mass )
	{
		delete[] _mass;
		_mass = NULL;
	}
	if( _stiffness )
	{
		delete[] _stiffness;
		_stiffness = NULL;
	}
	if( fourierRows )
	{
		delete fourierRows;
		fourierRows = NULL;
	}

	for( int i=0 ; i<tVectors1.size() ; i++ )
	{
		if( tVectors1[i] ) FreePointer( tVectors1[i] );
	}
	for( int i=0 ; i<tVectors2.size() ; i++ )
	{
		if( tVectors2[i] ) FreePointer( tVectors2[i] );
	}
}
// (Re-)builds the solver for the given parameterization:
//  1. releases the solvers , matrices and scratch vectors of a previous call,
//  2. creates the row transform (FourierRows),
//  3. queries the per-frequency mass / stiffness systems,
//  4. builds one tridiagonal solver per system for  stiffnessWeight * stiffness + massWeight * mass,
//  5. builds the diffusion solvers (if supported) and drops the matrices that are not needed any more.
// With params.verbose set, the time taken by each stage is printed.
template< class Real >
void SoRPoissonSolver< Real >::set( const SoRParameterization& sParam , Params params )
{
	_threads = std::max< int >( params.threads , 1 );
	for( int i=0 ; i<poissonSolvers.size()   ; i++ ) delete poissonSolvers[i];
	for( int i=0 ; i<diffusionSolvers.size() ; i++ ) delete diffusionSolvers[i];
	poissonSolvers.resize( 0 ) , diffusionSolvers.resize( 0 );
	if( _mass ) delete[] _mass , _mass = NULL;
	if( _stiffness ) delete[] _stiffness , _stiffness = NULL;

	sParam.resolution( _resX , _resY );
	_gridType = sParam.gridType();
	// Number of rows that take part in the solve (boundary rows are dropped under the conditions below)
	int sz = _resY - ( _gridType.yDirichlet0() || ( _gridType.yPole0() && _gridType.xDirichlet() ) ? 1 : 0 ) - ( _gridType.yDirichlet1() || ( _gridType.yPole1() && _gridType.xDirichlet() ) ? 1 : 0 );
	_bw = _resX / 2;

	double tt;
	tt = Time();
	if( fourierRows ) delete fourierRows;
	typename FourierRows< Real >::Params fParams;
	fParams.threads = params.threads , fParams.planType = params.planType;
	fourierRows = new FourierRows< Real >( _resX , _resY , _gridType , fParams );
	if( params.verbose ) printf( "\tInitialized FFTW: %.2f(s)\n" , Time()-tt ) , tt = Time();

	// Range of frequency systems for the horizontal boundary type.
	// Initialized so that the values are well-defined even if none of the three cases applies.
	int start = 0 , systems = 0;
	if     ( _gridType.xPeriodic()  ) start = 0 , systems = _bw+1;
	else if( _gridType.xNeumann()   ) start = 0 , systems = _resX;
	else if( _gridType.xDirichlet() ) start = 1 , systems = _resX-2;
	_mass = new BandedMatrix< double , 1 >[systems];
	_stiffness = new BandedMatrix< double , 1 >[systems];
	sParam.poissonFrequencySystems( _mass , _stiffness , start , start+systems , _threads );
	poissonSolvers.resize( systems , NULL );

	if( params.verbose ) printf( "\tGot matrices: %.2f(s)\n" , Time()-tt ) , tt = Time();

	// Per-thread scratch vectors and per-thread system matrices
	for( int i=0 ; i<tVectors1.size() ; i++ ) if( tVectors1[i] ) FreePointer( tVectors1[i] );
	for( int i=0 ; i<tVectors2.size() ; i++ ) if( tVectors2[i] ) FreePointer( tVectors2[i] );
	tVectors1.resize( _threads ) , tVectors2.resize( _threads );
	for( int t=0 ; t<_threads ; t++ ) tVectors1[t] = AllocPointer< Real >( sz*2 ) , tVectors2[t] = AllocPointer< Real >( sz*2 );
	std::vector< BandedMatrix< double , 1 > > _M( _threads );
	for( int t=0 ; t<_threads ; t++ ) _M[t].resize( sz );
	// Each iteration only touches its own thread's matrix and its own slot of poissonSolvers.
	// No shared state is written inside the loop: an earlier per-solver timing accumulator was updated
	// here by all threads without synchronization (and never read), so it has been removed.
#pragma omp parallel for num_threads( _threads )
	for( int b=0 ; b<systems ; b++ )
	{
		BandedMatrix< double , 1 >& M = _M[ omp_get_thread_num() ];
		for( int i=0 ; i<_mass[b].rows() ; i++ ) for( int j=0 ; j<3 ; j++ ) M[i][j] = _stiffness[b][i][j] * params.stiffnessWeight + _mass[b][i][j] * params.massWeight;
		poissonSolvers[b] = new TridiagonalSolver< Real >( M );
	}
	if( params.verbose ) printf( "\tGot solvers: %.2f(s)\n" , Time()-tt ) , tt = Time();
	if( !params.supportDiffusion )
	{
		delete[] _stiffness;
		_stiffness = NULL;
	}
	else resetDiffusion( params.diffusionWeight );
	if( !params.supportPreMultiply )
	{
		delete[] _mass;
		_mass = NULL;
	}
}

// Discards the current diffusion solvers and, if diffusionWeight is non-zero, builds one tridiagonal
// solver per frequency system for  mass + diffusionWeight * stiffness.
// If either system matrix array is null a warning is printed and nothing is changed.
template< class Real >
void SoRPoissonSolver< Real >::resetDiffusion( Real diffusionWeight )
{
	if( !_stiffness || !_mass )
	{
		fprintf( stderr , "[WARNING] SoRPoissonSolver::resetDiffusion: system matrices are null: %d %d\n" , _mass!=NULL , _stiffness!=NULL );
		return;
	}

	for( int i=0 ; i<diffusionSolvers.size() ; i++ ) delete diffusionSolvers[i];
	diffusionSolvers.resize( 0 );
	if( !diffusionWeight ) return;

	// Number of frequency systems for the horizontal boundary type
	int systems;
	if     ( _gridType.xPeriodic()  ) systems = _bw+1;
	else if( _gridType.xNeumann()   ) systems = _resX;
	else if( _gridType.xDirichlet() ) systems = _resX-2;

	diffusionSolvers.resize( systems , NULL );
	for( int b=0 ; b<systems ; b++ )
	{
		BandedMatrix< double , 1 > M;
		M.resize( _mass[b].rows() );
		for( int i=0 ; i<_mass[b].rows() ; i++ )
		{
			for( int j=0 ; j<3 ; j++ ) M[i][j] = _mass[b][i][j] + _stiffness[b][i][j] * diffusionWeight;
		}
		diffusionSolvers[b] = new TridiagonalSolver< Real >( M );
	}
}

// Releases the scratch memory and the two arrays that hold the plans with fftw_free.
// Only the arrays are freed here; the plans stored in them are not passed to any FFTW function.
template< class Real >
FourierRows< Real >::~FourierRows( void )
{
	if( _coefficients )
	{
		fftw_free( _coefficients );
		_coefficients = NULL;
	}
	if( _fPlans )
	{
		fftw_free( _fPlans );
		_fPlans = NULL;
	}
	if( _bPlans )
	{
		fftw_free( _bPlans );
		_bPlans = NULL;
	}
}
// Generic constructor: there is no FFTW interface for this Real type, so report the error and exit.
// (float and double are handled by explicit specializations.)
template< class Real >
FourierRows< Real >::FourierRows( int resX , int resY , RegularGridFEM::GridType gridType , Params params )
{
	fprintf( stderr , "[ERROR] FourierRows only supported for float and double types\n" );
	exit( 0 );
}
// Generic forward transform: there is no FFTW interface for this Real type, so report the error and exit.
// (float and double are handled by explicit specializations.)
template< class Real >
void FourierRows< Real >::runForward( Pointer( Real ) inOut )
{
	fprintf( stderr , "[ERROR] FourierRows::runForward only supported for float and double types\n" );
	exit( 0 );
}
// Generic backward transform: there is no FFTW interface for this Real type, so report the error and exit.
// (float and double are handled by explicit specializations.)
template< class Real >
void FourierRows< Real >::runBackward( Pointer( Real ) inOut )
{
	fprintf( stderr , "[ERROR] FourierRows::runBackward only supported for float and double types\n" );
	exit( 0 );
}
// Stores the grid description, allocates the per-thread scratch memory and plan arrays, and creates one
// forward and one backward plan per thread with the supplied FFTW planner functions:
//   xPeriodic:  real-to-complex / complex-to-real transforms of length _resX,
//   xNeumann:   a single REDFT00 transform of length _resX   used in both directions,
//   xDirichlet: a single RODFT00 transform of length _resX-2 used in both directions.
// All plans operate in place on the thread's slice of _coefficients.
// Prints an error and exits if a plan could not be created.
template< class Real >
template< class FFTWComplex , class FFTWPlan , class FFTWR2RKind >
void FourierRows< Real >::_set( int resX , int resY , RegularGridFEM::GridType gridType , Params params ,
		FFTWPlan (*FFTWDFTR2C1D)( int , Real* , FFTWComplex* , unsigned int ) ,
		FFTWPlan (*FFTWDFTC2R1D)( int , FFTWComplex* , Real* , unsigned int ) ,
		FFTWPlan (*FFTWDFTR2R1D)( int , Real* , Real* , FFTWR2RKind , unsigned int )
	)
{
	_resX = resX , _resY = resY , _gridType = gridType , _threads = std::max< int >( 1 , params.threads );
	_bw = _resX / 2;
	// (_bw+1) complex values of scratch per thread
	_coefficients = (Real*)fftw_malloc( sizeof( FFTWComplex ) * _threads * ( _bw+1 ) );
	_fPlans = fftw_malloc( sizeof( FFTWPlan ) * _threads );
	_bPlans = fftw_malloc( sizeof( FFTWPlan ) * _threads );
	FFTWPlan *__fPlans = (FFTWPlan*) _fPlans , *__bPlans = (FFTWPlan*) _bPlans;
	for( int t=0 ; t<_threads ; t++ )
	{
		// Slice of the scratch memory that belongs to thread t
		Real* __coefficients = _coefficients + 2 * t * ( _bw+1 );
		if( _gridType.xPeriodic() )
		{
			__fPlans[t] = FFTWDFTR2C1D( _resX , __coefficients , (FFTWComplex*)__coefficients , params.planType );
			__bPlans[t] = FFTWDFTC2R1D( _resX , (FFTWComplex*)__coefficients , __coefficients , params.planType );
		}
		else if( _gridType.xNeumann() )   __bPlans[t] = __fPlans[t] = FFTWDFTR2R1D( _resX   , __coefficients , __coefficients , FFTW_REDFT00 , params.planType );
		else if( _gridType.xDirichlet() ) __bPlans[t] = __fPlans[t] = FFTWDFTR2R1D( _resX-2 , __coefficients , __coefficients , FFTW_RODFT00 , params.planType );
		if( !__fPlans[t] ) fprintf( stderr , "[ERROR] Failed to create forward plan[%d]\n" , t ) , exit( 0 );
		if( !__bPlans[t] ) fprintf( stderr , "[ERROR] Failed to create backward plan[%d]\n" , t ) , exit( 0 );
	}
}

// Applies the forward plan to every row of the grid, in place.
//   inOut:           the grid values; rows are dimX values long, a pole row is stored as a single value.
//   FFTWExecutePlan: the FFTW execute function matching the plan type.
// Rows are processed in parallel; every thread uses its own plan and its own scratch slice.
template< class Real >
template< class FFTWPlan >
void FourierRows< Real >::_runForward( Pointer( Real ) inOut , void (*FFTWExecutePlan)( FFTWPlan ) )
{
	FFTWPlan *__fPlans = (FFTWPlan*)_fPlans;
	// Number of values stored per (non-pole) row
	int dimX = _gridType.xDirichlet() ? _resX-2 : _resX;
	// Number of values occupied by the pole row at j==0 (used to offset the rows that follow it)
	int poleDim = _gridType.xDirichlet() ? 0 : 1;
	// Divisor applied to the input before the transform
	int logicalDim = _gridType.xPeriodic() ? _resX : 2 * (_resX-1);
#pragma omp parallel for num_threads( _threads )
	for( int j=0 ; j<_resY ; j++ )
	{
		// The first / last row is skipped when it is a Dirichlet row, or a pole row with x-Dirichlet boundaries
		if( j==0       && ( _gridType.yDirichlet0() || ( _gridType.yPole0() && _gridType.xDirichlet() ) ) ) continue;
		if( j==_resY-1 && ( _gridType.yDirichlet1() || ( _gridType.yPole1() && _gridType.xDirichlet() ) ) ) continue;
		int _thread = omp_get_thread_num();
		// Scratch slice of this thread: 2*(_bw+1) values
		Real* _scratch = _coefficients + 2 * _thread * (_bw+1);
		// Locate the start of row j inside inOut
		Pointer( Real ) _inOut;
		if     ( _gridType.yPole0() )      _inOut = inOut + poleDim + (j-1) * dimX;
		else if( _gridType.yDirichlet0() ) _inOut = inOut +           (j-1) * dimX;
		else                               _inOut = inOut +            j    * dimX;

		// The pole row at j==0 is the very first value
		if( j==0 && _gridType.yPole0() )   _inOut = inOut;

		// Load the (scaled) row into the scratch; a pole row replicates its single value across the row
		if     ( j==0       && _gridType.yPole0() ) for( int i=0 ; i<dimX ; i++ ) _scratch[i] = _inOut[0] / logicalDim;
		else if( j==_resY-1 && _gridType.yPole1() ) for( int i=0 ; i<dimX ; i++ ) _scratch[i] = _inOut[0] / logicalDim;
		else                                        for( int i=0 ; i<dimX ; i++ ) _scratch[i] = _inOut[i] / logicalDim;
		FFTWExecutePlan( __fPlans[_thread] );
		if( _gridType.xPeriodic() )
		{
			// Pole rows keep only the first output value. Other rows are packed into dimX values:
			// _scratch[0] , _scratch[2*_bw] , followed by _scratch[2] ... _scratch[2*_bw-1]
			// ( _scratch[1] and _scratch[2*_bw+1] are not stored )
			if     ( j==0       && _gridType.yPole0() ) for( int i=0 ; i<dimX ; i++ ) _inOut[0] = _scratch[0];
			else if( j==_resY-1 && _gridType.yPole1() ) for( int i=0 ; i<dimX ; i++ ) _inOut[0] = _scratch[0];
			else _inOut[0] = _scratch[0] , _inOut[1] = _scratch[2*_bw] , memcpy( _inOut + 2 , _scratch+2 , sizeof(Real) * 2 * (_bw-1) );
		}
		else
		{
			// Pole rows keep only the first output value; other rows are copied back unchanged
			if     ( j==0       && _gridType.yPole0() ) for( int i=0 ; i<dimX ; i++ ) _inOut[0] = _scratch[0];
			else if( j==_resY-1 && _gridType.yPole1() ) for( int i=0 ; i<dimX ; i++ ) _inOut[0] = _scratch[0];
			else memcpy( _inOut , _scratch , sizeof(Real) * dimX );
		}
	}
}
// Applies the backward plan to every row of the grid, in place; the row layout is the one written by
// _runForward().
//   inOut:           the transformed grid values; a pole row is stored as a single value.
//   FFTWExecutePlan: the FFTW execute function matching the plan type.
// Rows are processed in parallel; every thread uses its own plan and its own scratch slice.
template< class Real >
template< class FFTWPlan >
void FourierRows< Real >::_runBackward( Pointer( Real ) inOut , void (*FFTWExecutePlan)( FFTWPlan ) )
{
	FFTWPlan *__bPlans = (FFTWPlan*)_bPlans;
	// Number of values stored per (non-pole) row
	int dimX = _gridType.xDirichlet() ? _resX-2 : _resX;
	// Number of values occupied by the pole row at j==0 (used to offset the rows that follow it)
	int poleDim = _gridType.xDirichlet() ? 0 : 1;
#pragma omp parallel for num_threads( _threads )
	for( int j=0 ; j<_resY ; j++ )
	{
		// The first / last row is skipped when it is a Dirichlet row, or a pole row with x-Dirichlet boundaries
		if( j==0       && ( _gridType.yDirichlet0() || ( _gridType.yPole0() && _gridType.xDirichlet() ) ) ) continue;
		if( j==_resY-1 && ( _gridType.yDirichlet1() || ( _gridType.yPole1() && _gridType.xDirichlet() ) ) ) continue;
		int _thread = omp_get_thread_num();
		// Scratch slice of this thread: 2*(_bw+1) values
		Real* _scratch = _coefficients + 2 * _thread * (_bw+1);
		// Locate the start of row j inside inOut
		Pointer( Real ) _inOut;
		if     ( _gridType.yPole0() )      _inOut = inOut + poleDim + (j-1) * dimX;
		else if( _gridType.yDirichlet0() ) _inOut = inOut +           (j-1) * dimX;
		else                               _inOut = inOut +            j    * dimX;

		// The pole row at j==0 is the very first value
		if( j==0 && _gridType.yPole0() )   _inOut = inOut;

		if( _gridType.xPeriodic() )
		{
			// Pole rows: only the first coefficient is non-zero. Other rows: undo the packing of
			// _runForward() , setting the two values that were not stored to zero
			if     ( j==0       && _gridType.yPole0() ) memset( _scratch , 0 , sizeof(Real)*2*(_bw+1) ) , _scratch[0] = _inOut[0];
			else if( j==_resY-1 && _gridType.yPole1() ) memset( _scratch , 0 , sizeof(Real)*2*(_bw+1) ) , _scratch[0] = _inOut[0];
			else _scratch[0] = _inOut[0] , _scratch[2*_bw] = _inOut[1] , _scratch[1] = _scratch[2*_bw+1] = 0.f , memcpy( _scratch + 2 , _inOut + 2 , sizeof(Real) * 2 * (_bw-1) );
		}
		else
		{
			// Pole rows: only the first coefficient is non-zero; other rows are copied as they are
			if     ( j==0       && _gridType.yPole0() ) memset( _scratch , 0 , sizeof(Real) * dimX ) , _scratch[0] = _inOut[0];
			else if( j==_resY-1 && _gridType.yPole1() ) memset( _scratch , 0 , sizeof(Real) * dimX ) , _scratch[0] = _inOut[0];
			else memcpy( _scratch , _inOut , sizeof(Real) * dimX );
		}
		FFTWExecutePlan( __bPlans[_thread] );
		// A pole row stores the average of the transformed row; other rows are copied back
		if     ( j==0       && _gridType.yPole0() ){ _inOut[0] = 0 ; for( int i=0 ; i<dimX ; i++ ) _inOut[0] += _scratch[i] ; _inOut[0] /= dimX; }
		else if( j==_resY-1 && _gridType.yPole1() ){ _inOut[0] = 0 ; for( int i=0 ; i<dimX ; i++ ) _inOut[0] += _scratch[i] ; _inOut[0] /= dimX; }
		else memcpy( _inOut , _scratch , sizeof(Real)*dimX );
	}
}

// runForward for Real=float: delegates to _runForward, passing fftwf_execute as the routine that executes the plans.
// Declared inline: an explicit specialization defined in a header is otherwise an ordinary (non-inline) function
// and would be multiply defined as soon as this header is included from more than one translation unit.
template< > inline void FourierRows< float >::runForward( Pointer( float ) inOut ){ _runForward( inOut , fftwf_execute ); }
// runForward for Real=double: delegates to _runForward, passing fftw_execute as the routine that executes the plans.
// Declared inline: an explicit specialization defined in a header is otherwise an ordinary (non-inline) function
// and would be multiply defined as soon as this header is included from more than one translation unit.
template< > inline void FourierRows< double >::runForward( Pointer( double ) inOut ){ _runForward( inOut , fftw_execute ); }
// runBackward for Real=float: delegates to _runBackward, passing fftwf_execute as the routine that executes the plans.
// Declared inline: an explicit specialization defined in a header is otherwise an ordinary (non-inline) function
// and would be multiply defined as soon as this header is included from more than one translation unit.
template< > inline void FourierRows< float >::runBackward( Pointer( float ) inOut ){ _runBackward( inOut , fftwf_execute ); }
// runBackward for Real=double: delegates to _runBackward, passing fftw_execute as the routine that executes the plans.
// Declared inline: an explicit specialization defined in a header is otherwise an ordinary (non-inline) function
// and would be multiply defined as soon as this header is included from more than one translation unit.
template< > inline void FourierRows< double >::runBackward( Pointer( double ) inOut ){ _runBackward( inOut , fftw_execute ); }
// Constructor for Real=float: forwards all arguments to _set, instantiated with the single-precision FFTW
// types ( fftwf_complex , fftwf_plan , fftwf_r2r_kind ) and given the single-precision planner routines.
// Declared inline: an explicit specialization defined in a header is otherwise an ordinary (non-inline) function
// and would be multiply defined as soon as this header is included from more than one translation unit.
template< > inline FourierRows< float >::FourierRows( int resX , int resY , RegularGridFEM::GridType gridType , Params params )
{
	_set< fftwf_complex , fftwf_plan , fftwf_r2r_kind >( resX , resY , gridType , params , fftwf_plan_dft_r2c_1d , fftwf_plan_dft_c2r_1d , fftwf_plan_r2r_1d );
}
// Constructor for Real=double: forwards all arguments to _set, instantiated with the double-precision FFTW
// types ( fftw_complex , fftw_plan , fftw_r2r_kind ) and given the double-precision planner routines.
// Declared inline: an explicit specialization defined in a header is otherwise an ordinary (non-inline) function
// and would be multiply defined as soon as this header is included from more than one translation unit.
template< > inline FourierRows< double >::FourierRows( int resX , int resY , RegularGridFEM::GridType gridType , Params params )
{
	_set< fftw_complex , fftw_plan , fftw_r2r_kind >( resX , resY , gridType , params , fftw_plan_dft_r2c_1d , fftw_plan_dft_c2r_1d , fftw_plan_r2r_1d );
}

// Runs the full in-place solve on inOut:
//   1. unless preMassMultiply is set, passes inOut through SoRParameterization::ToDoubleCoveringConstraints
//   2. applies fourierRows->runForward
//   3. solves the per-frequency systems (solveSpectral)
//   4. applies fourierRows->runBackward
// solveDiffusion and preMassMultiply are handed to solveSpectral unchanged.
template< class Real >
void SoRPoissonSolver< Real >::solve( Pointer( Real ) inOut , bool solveDiffusion , bool preMassMultiply )
{
	const bool convertConstraints = !preMassMultiply;
	if( convertConstraints )
	{
		SoRParameterization::ToDoubleCoveringConstraints< Real , Real >( inOut , _resX , _resY , _gridType , _threads );
	}

	fourierRows->runForward( inOut );
	this->solveSpectral( inOut , solveDiffusion , preMassMultiply );
	fourierRows->runBackward( inOut );
}
// Solves, in place, one system per x-frequency on data that has already been transformed along the rows
// (solve() calls this between fourierRows->runForward and fourierRows->runBackward).
//
//   inOut           -- transformed coefficients; overwritten with the solution coefficients
//   solveDiffusion  -- if set (and diffusion solvers exist) the Poisson solution is multiplied by the mass
//                      matrix and passed through the matching diffusion solver
//   preMassMultiply -- if set, the right-hand side is multiplied by the mass matrix before the Poisson solve;
//                      requires _mass, otherwise the process is terminated with a failure status
//
// Layout of inOut as indexed here: a pole row that is an unknown is stored as a single value, every other
// row stores vDimX values. For periodic x a row holds frequency 0 at entry 0, the band-width frequency at
// entry 1, and the pair for frequency b ( 0<b<_bw ) at entries 2b , 2b+1 (presumably real/imaginary parts;
// they are solved together through multiply2/solve2). For Neumann/Dirichlet x entry b belongs to frequency b.
template< class Real > void SoRPoissonSolver< Real >::solveSpectral( Pointer( Real ) inOut , bool solveDiffusion , bool preMassMultiply )
{
	// The first/last grid rows carry no unknown when they are Dirichlet, or when they are poles and x is Dirichlet
	const bool skipRow0 = _gridType.yDirichlet0() || ( _gridType.yPole0() && _gridType.xDirichlet() );
	const bool skipRow1 = _gridType.yDirichlet1() || ( _gridType.yPole1() && _gridType.xDirichlet() );
	const int sz = _resY - ( skipRow0 ? 1 : 0 ) - ( skipRow1 ? 1 : 0 );
	const int vDimX = _gridType.xDirichlet() ? _resX-2 : _resX;
	// A missing mass matrix is a fatal error: report it and terminate with a failure status
	if( preMassMultiply && !_mass ) { fprintf( stderr , "[ERROR] SoRPoissonSolver::solveSpectral: Cannot pre-multiply by mass matrix\n" ) ; exit( EXIT_FAILURE ); }

	// NOTE(review): the diffusion step reads _mass without a null check (only preMassMultiply is guarded) --
	// confirm that _mass is always set whenever diffusionSolvers is non-empty.
	const bool doDiffusion = solveDiffusion && diffusionSolvers.size();

	// Pole bookkeeping shared by the periodic and Neumann branches.
	// Regular (non-pole) rows occupy the entries [jBegin,jEnd) of the per-frequency vector. A pole at row 0 takes
	// a single value at inOut[0], so regular row j starts at  jBegin + (j-jBegin)*vDimX  and a pole at the last
	// row (entry sz-1) sits at pole1Idx.
	const bool pole0 = _gridType.yPole0();
	const bool pole1 = _gridType.yPole1();
	const int jBegin = pole0 ? 1 : 0;
	const int jEnd = pole1 ? sz-1 : sz;
	const int pole1Idx = jBegin + (sz-1-jBegin) * vDimX;

	if( _gridType.xPeriodic() )
	{
		{
			Pointer( Real ) tVector1 = tVectors1[0];
			Pointer( Real ) tVector2 = tVectors2[0];

			// First process frequency 0 (the only frequency to which the pole values contribute)
			if( pole0 ) tVector1[0]    = inOut[0];
			if( pole1 ) tVector1[sz-1] = inOut[ pole1Idx ];
			for( int j=jBegin ; j<jEnd ; j++ ) tVector1[j] = inOut[ jBegin + (j-jBegin) * vDimX ];

			if( preMassMultiply )
			{
				_mass[0].multiply( ( ConstPointer( Real ) )tVector1 , tVector2 , _threads );
				poissonSolvers[0]->solve( tVector2 , tVector1 );
			}
			else poissonSolvers[0]->solve( tVector1 , tVector1 );
			if( doDiffusion )
			{
				_mass[0].multiply( ( ConstPointer( Real ) )tVector1 , tVector2 , _threads );
				diffusionSolvers[0]->solve( tVector2 , tVector1 );
			}

			if( pole0 ) inOut[0]          = tVector1[0];
			if( pole1 ) inOut[ pole1Idx ] = tVector1[sz-1];
			for( int j=jBegin ; j<jEnd ; j++ ) inOut[ jBegin + (j-jBegin) * vDimX ] = tVector1[j];

			// Next process the band-width frequency (stored at entry 1 of each row; pole entries are zero)
			if( pole0 ) tVector1[0]    = 0;
			if( pole1 ) tVector1[sz-1] = 0;
			for( int j=jBegin ; j<jEnd ; j++ ) tVector1[j] = inOut[ jBegin + (j-jBegin) * vDimX + 1 ];

			if( preMassMultiply )
			{
				_mass[_bw].multiply( ( ConstPointer( Real ) )tVector1 , tVector2 , _threads );
				poissonSolvers[_bw]->solve( tVector2 , tVector1 );
			}
			else poissonSolvers[_bw]->solve( tVector1 , tVector1 );
			if( doDiffusion )
			{
				_mass[_bw].multiply( ( ConstPointer( Real ) )tVector1 , tVector2 , _threads );
				diffusionSolvers[_bw]->solve( tVector2 , tVector1 );
			}

			// Only the regular rows are written back; the pole entries were forced to zero above
			for( int j=jBegin ; j<jEnd ; j++ ) inOut[ jBegin + (j-jBegin) * vDimX + 1 ] = tVector1[j];
		}
		// Now process all the other frequencies: each has a pair of values per row, interleaved in tVector1
#pragma omp parallel for num_threads( _threads )
		for( int b=1 ; b<_bw ; b++ )
		{
			int _thread = omp_get_thread_num();
			Pointer( Real ) tVector1 = tVectors1[_thread];
			Pointer( Real ) tVector2 = tVectors2[_thread];

			// Pole entries are zero for these frequencies
			if( pole0 ) tVector1[0] = tVector1[1] = 0;
			if( pole1 ) tVector1[(sz-1)<<1] = tVector1[((sz-1)<<1)|1] = 0;
			for( int j=jBegin ; j<jEnd ; j++ )
			{
				const int idx = jBegin + (j-jBegin) * vDimX + 2*b;
				tVector1[ j<<1   ] = inOut[idx  ];
				tVector1[(j<<1)|1] = inOut[idx+1];
			}

			// Single-threaded multiplies: the parallelism is across frequencies here
			if( preMassMultiply )
			{
				_mass[b].multiply2( ( ConstPointer( Real ) )tVector1 , tVector2 , 1 );
				poissonSolvers[b]->solve2( tVector2 , tVector1 );
			}
			else poissonSolvers[b]->solve2( tVector1 , tVector1 );
			if( doDiffusion )
			{
				_mass[b].multiply2( ( ConstPointer( Real ) )tVector1 , tVector2 , 1 );
				diffusionSolvers[b]->solve2( tVector2 , tVector1 );
			}

			for( int j=jBegin ; j<jEnd ; j++ )
			{
				const int idx = jBegin + (j-jBegin) * vDimX + 2*b;
				inOut[idx  ] = tVector1[ j<<1   ];
				inOut[idx+1] = tVector1[(j<<1)|1];
			}
		}
	}
	else if( _gridType.xNeumann() )
	{
#pragma omp parallel for num_threads( _threads )
		for( int b=0 ; b<vDimX ; b++ )
		{
			int _thread = omp_get_thread_num();
			Pointer( Real ) tVector1 = tVectors1[_thread];
			Pointer( Real ) tVector2 = tVectors2[_thread];

			// The pole values only contribute to frequency 0; they are zero for every other frequency
			if( pole0 ) tVector1[0]    = b!=0 ? 0 : inOut[0];
			if( pole1 ) tVector1[sz-1] = b!=0 ? 0 : inOut[ pole1Idx ];
			for( int j=jBegin ; j<jEnd ; j++ ) tVector1[j] = inOut[ jBegin + (j-jBegin) * vDimX + b ];

			if( preMassMultiply )
			{
				_mass[b].multiply( ( ConstPointer( Real ) )tVector1 , tVector2 , 1 );
				poissonSolvers[b]->solve( tVector2 , tVector1 );
			}
			else poissonSolvers[b]->solve( tVector1 , tVector1 );
			if( doDiffusion )
			{
				_mass[b].multiply( ( ConstPointer( Real ) )tVector1 , tVector2 , 1 );
				diffusionSolvers[b]->solve( tVector2 , tVector1 );
			}

			// Only frequency 0 owns the pole slots of inOut
			if( !b )
			{
				if( pole0 ) inOut[0]          = tVector1[0];
				if( pole1 ) inOut[ pole1Idx ] = tVector1[sz-1];
			}
			for( int j=jBegin ; j<jEnd ; j++ ) inOut[ jBegin + (j-jBegin) * vDimX + b ] = tVector1[j];
		}
	}
	else if( _gridType.xDirichlet() )
	{
		// With Dirichlet x the pole rows are not unknowns (they are excluded from sz), so every row is regular
#pragma omp parallel for num_threads( _threads )
		for( int b=0 ; b<vDimX ; b++ )
		{
			int _thread = omp_get_thread_num();
			Pointer( Real ) tVector1 = tVectors1[_thread];
			Pointer( Real ) tVector2 = tVectors2[_thread];

			for( int j=0 ; j<sz ; j++ ) tVector1[j] = inOut[ j * vDimX + b ];

			if( preMassMultiply )
			{
				_mass[b].multiply( ( ConstPointer( Real ) )tVector1 , tVector2 , 1 );
				poissonSolvers[b]->solve( tVector2 , tVector1 );
			}
			else poissonSolvers[b]->solve( tVector1 , tVector1 );
			if( doDiffusion )
			{
				_mass[b].multiply( ( ConstPointer( Real ) )tVector1 , tVector2 , 1 );
				diffusionSolvers[b]->solve( tVector2 , tVector1 );
			}

			for( int j=0 ; j<sz ; j++ ) inOut[ j * vDimX + b ] = tVector1[j];
		}
	}
}
#endif // SOLVERS_INCLUDED