#ifndef PI
#define PI 3.1415926535897932384
#endif // PI
#include <algorithm>
#include <Util/solvers.h>

#define MY_ATAN_2 1

// SetOMPParallel( threads ): expands to an "omp parallel for num_threads( threads )" pragma
// that applies to the loop written immediately after the macro invocation.
#ifndef SetOMPParallel
#ifdef _WIN32
#define SetOMPParallel( threads ) __pragma( omp parallel for num_threads( threads ) )
#else // !_WIN32
// _Pragma takes a string literal, and macro parameters are not substituted inside string literals.
// The directive is therefore passed as tokens through a helper that stringizes it, so that the
// argument given to SetOMPParallel (rather than the literal text "threads") ends up in the pragma.
#define SetOMPParallelPragma_( directive ) _Pragma( #directive )
#define SetOMPParallel( threads ) SetOMPParallelPragma_( omp parallel for num_threads( threads ) )
#endif // _WIN32
#endif // SetOMPParallel


/////////////////////////
// SoRParameterization //
/////////////////////////

// Writes the stored grid resolution (_resX , _resY) into the two output arguments.
void SoRParameterization::resolution( int& resX , int& resY ) const
{
	resX = _resX;
	resY = _resY;
}
// Rotates the vector v attached to the parameter position (x,y):
// the vector is converted with _globalToLocal, its components are mapped (a,b) -> (-b,a),
// and the result is converted back with _localToGlobal.
template< class Real >
Point2D< Real > SoRParameterization::rotate90( double x , double y , Point2D< Real > v ) const
{
	// The position is first handed to RemapParameter; the three flags it fills in are not consulted here.
	bool flipX , flipY , flipSign;
	RegularGridFEM::RemapParameter( x , y , _resX , _resY , _gridType , flipX , flipY , flipSign );

	// Band index and fractional offsets of the (possibly remapped) position
	const int band = (int)floor(y);
	const double fracX = x-floor(x);
	const double fracY = y-floor(y);

	Point2D< Real > local = _globalToLocal( band , fracX , fracY , v );
	Point2D< Real > turned = Point2D< Real >( -local[1] , local[0] );
	return _localToGlobal( band , fracX , fracY , turned );
}
// Converts the 2D vector d in band j, at offset (x,y) within the band, from the global to the local frame.
// This is the inverse of the Point2D overload of _localToGlobal.
//   conical geometry:     each component is divided by the corresponding cone scale factor
//   trapezoidal geometry: the sheared/scaled trapezoid map is inverted explicitly
template< class Real >
Point2D< Real > SoRParameterization::_globalToLocal( int j , double x , double y , Point2D< Real > d ) const
{
	if( !_conicalGeometry )
	{
		const double w0 = _trapData[j].width0;
		const double dw = _trapData[j].width1 - _trapData[j].width0;
		const double height = _trapData[j].height;
		// Determinant of the forward map: (width at offset y) * height
		const double det = ( w0 + dw * y ) * height;
		Real lx = (Real)( ( d[0] * height + d[1] * dw * ( 0.5 - x ) ) / det );
		Real ly = (Real)( d[1] * ( w0 + dw*y ) / det );
		return Point2D< Real >( lx , ly );
	}
	Real lx = (Real)( d[0] / ( _coneData[j].a0 + _coneData[j].a1 * y ) );
	Real ly = (Real)( d[1] / _coneData[j].b0 );
	return Point2D< Real >( lx , ly );
}
// Same conversion as the Point2D overload of _globalToLocal, applied to a pair of Data values:
// the pair is combined using Real-valued coefficients.
template< class Data , class Real >
std::pair< Data , Data > SoRParameterization::_globalToLocal( int j , double x , double y , std::pair< Data , Data > d ) const
{
	if( !_conicalGeometry )
	{
		const double w0 = _trapData[j].width0;
		const double dw = _trapData[j].width1 - _trapData[j].width0;
		const double height = _trapData[j].height;
		const double det = ( w0 + dw * y ) * height;
		// Coefficients of the inverse trapezoid map
		const Real c00 = (Real)( height/det );
		const Real c01 = (Real)( dw * ( 0.5 - x ) / det );
		const Real c11 = (Real)( ( w0 + dw*y ) / det );
		return std::pair< Data , Data >( d.first * c00 + d.second * c01 , d.second * c11 );
	}
	const Real scaleX = (Real)( _coneData[j].a0 + _coneData[j].a1 * y );
	const Real scaleY = (Real)( _coneData[j].b0 );
	return std::pair< Data , Data >( d.first / scaleX , d.second / scaleY );
}
// Converts the 2D vector d in band j, at offset (x,y) within the band, from the local to the global frame.
//   conical geometry:     each component is multiplied by the corresponding cone scale factor
//   trapezoidal geometry: the first component is scaled by the width at offset y and sheared by the
//                         second component; the second component is scaled by the height
template< class Real >
Point2D< Real > SoRParameterization::_localToGlobal( int j , double x , double y , Point2D< Real > d ) const
{
	if( !_conicalGeometry )
	{
		const double w0 = _trapData[j].width0;
		const double dw = _trapData[j].width1 - _trapData[j].width0;
		const double height = _trapData[j].height;
		Real gx = (Real)( d[0] * ( w0 + dw*y ) + d[1] * ( dw * ( x - 0.5 ) ) );
		Real gy = (Real)( d[1]*height );
		return Point2D< Real >( gx , gy );
	}
	Real gx = (Real)( d[0] * ( _coneData[j].a0 + _coneData[j].a1 * y ) );
	Real gy = (Real)( d[1] * _coneData[j].b0 );
	return Point2D< Real >( gx , gy );
}
// Same conversion as the Point2D overload of _localToGlobal, applied to a pair of Data values:
// the pair is combined using Real-valued coefficients.
template< class Data , class Real >
std::pair< Data , Data > SoRParameterization::_localToGlobal( int j , double x , double y , std::pair< Data , Data > d ) const
{
	if( !_conicalGeometry )
	{
		const double w0 = _trapData[j].width0;
		const double dw = _trapData[j].width1 - _trapData[j].width0;
		const double height = _trapData[j].height;
		// Coefficients of the forward trapezoid map
		const Real c00 = (Real)( w0 + dw*y );
		const Real c01 = (Real)( dw * ( x - 0.5 ) );
		const Real c11 = (Real)( height );
		return std::pair< Data , Data >( d.first * c00 + d.second * c01 , d.second * c11 );
	}
	const Real scaleX = (Real)( _coneData[j].a0 + _coneData[j].a1 * y );
	const Real scaleY = (Real)_coneData[j].b0;
	return std::pair< Data , Data >( d.first * scaleX , d.second * scaleY );
}
// Returns the angle associated with the horizontal parameter x:
// x in [0,_resX] is mapped linearly onto [ -_angleOfRevolution/2 , _angleOfRevolution/2 ].
// The vertical parameter y does not influence the result.
double SoRParameterization::_theta( double x , double y ) const
{
	(void)y; // kept in the signature for callers; the angle depends on x only
	double theta = ( _angleOfRevolution * x ) / _resX;
	theta -= _angleOfRevolution / 2.;
	return theta;
}
// Evaluates the metric root of band b at the offset (x,y) and returns it as a 2x2 matrix:
//   M(0,0) = x_dx evaluated at y      M(0,1) = 0
//   M(1,0) = y_dx evaluated at x      M(1,1) = y_dy
SquareMatrix< double , 2 > SoRParameterization::_metricRoot( int b , double x , double y ) const
{
	MetricRoot root = _metricRoot( b );
	SquareMatrix< double , 2 > M;
	M(0,0) = root.x_dx( y );
	M(1,1) = root.y_dy;
	M(1,0) = root.y_dx( x );
	M(0,1) = 0.;
	return M;
}
// Builds the metric root of band b.
// The result holds two linear polynomials (x_dx , y_dx), set through their [0]/[1] coefficients,
// and one scalar (y_dy).
//   - If the grid is not y-periodic and b lies outside [0,bands()), all three entries are zero.
//   - Otherwise b is wrapped with MOD( b , _resY ) and the entries are read from the cone data
//     (conical geometry) or the trapezoid data (otherwise).
SoRParameterization::MetricRoot SoRParameterization::_metricRoot( int b ) const
{
	MetricRoot root;

	// Out-of-range band on a non-periodic grid: nothing to measure
	if( !_gridType.yPeriodic() && ( b<0 || b>=(int)bands() ) )
	{
		root.x_dx = 0.;
		root.y_dx = 0.;
		root.y_dy = 0.;
		return root;
	}
	b = MOD( b , _resY );

	if( !_conicalGeometry )
	{
		// Trapezoid: width varies linearly from width0 to width1 over the band
		const double w0 = _trapData[b].width0;
		const double dw = _trapData[b].width1 - _trapData[b].width0;
		const double height = _trapData[b].height;
		root.x_dx[0] = w0;
		root.x_dx[1] = dw;
		root.y_dx[0] = -dw/2.;
		root.y_dx[1] = dw;
		root.y_dy = height;
		return root;
	}

	if( _coneData[b].ratio )
	{
		// Non-zero ratio: entries are derived from the two radii and the per-cell angle
		const double r0 = _coneData[b].r0;
		const double r1 = _coneData[b].r1;
		const double theta = ( _angleOfRevolution / _coneData[b].ratio ) / _resX;
		root.x_dx[0] = r0*theta;
		root.x_dx[1] = ( r1 - r0 ) * theta;
		root.y_dx[0] = root.y_dx[1] = 0;
		root.y_dy = fabs( r1 - r0 );
	}
	else
	{
		// Zero ratio: constant width and height taken directly from the cone data
		root.x_dx[0] = _coneData[b].width;
		root.y_dy = _coneData[b].height;
		root.x_dx[1] = root.y_dx[0] = root.y_dx[1] = 0;
	}
	return root;
}
// Returns the area polynomial of band b: the x_dx polynomial of the band's metric root scaled by its y_dy entry.
Polynomial< 1 > SoRParameterization::_area( int b ) const
{
	MetricRoot root = _metricRoot(b);
	Polynomial< 1 > area = root.x_dx * root.y_dy;
	return area;
}
// Numerically integrates the polynomial over [sMin,sMax] by summing its values at the midpoints
// of `samples` equal sub-intervals and scaling by the sub-interval width.
template< unsigned int Degree >
double MCIntegral( const Polynomial< Degree >& numerator , double sMin , double sMax , int samples )
{
	double sum = 0;
	int k = 0;
	while( k<samples )
	{
		// Midpoint of the k-th sub-interval
		sum += numerator( sMin + (k+0.5)/samples * (sMax-sMin) );
		k++;
	}
	return sum * (sMax-sMin) / samples;
}
// Integrates the polynomial over [sMin,sMax] using the polynomial's own integral() method.
template< unsigned int Degree >
double AnalyticIntegral( const Polynomial< Degree >& numerator , double sMin , double sMax )
{
	const double value = numerator.integral( sMin , sMax );
	return value;
}
// Numerically integrates numerator/denominator over [sMin,sMax] by summing the quotient at the
// midpoints of `samples` equal sub-intervals and scaling by the sub-interval width.
template< unsigned int Degree >
double MCQuotientIntegral( const Polynomial< Degree >& numerator , const Polynomial< 1 >& denominator , double sMin , double sMax , int samples )
{
	double sum = 0;
	int k = 0;
	while( k<samples )
	{
		// Midpoint of the k-th sub-interval
		double t = sMin + (k+0.5)/samples * (sMax-sMin);
		sum += numerator(t) / denominator(t);
		k++;
	}
	return sum * (sMax-sMin) / samples;
}
template< unsigned int Degree >
double AnalyticQuotientIntegral( const Polynomial< Degree >& numerator , const Polynomial< 1 >& denominator , double sMin , double sMax , bool noLog )
{
#pragma message( "[WARNING] Soft equality testing (needed for trapezoidal)" )
//	if( fabs(denominator[1])>1e-6 )
	if( fabs(denominator[1])>1e-6 || noLog )
	{
		Polynomial< Degree > n = numerator;
		n /= denominator[1];
		double s = denominator[0] / denominator[1];
		// \int_[sMin,sMax] P(x) / (ax+b) = \int_[sMin,sMax]  [P(x)/a] / (x+b/a)
		//                                = \int_[sMin+b/a,sMax+b/a]  [P(x-b/a)/a] / x
		n = n.shift( s );
		sMin += s , sMax += s;
		Polynomial< ( Degree>0 ? Degree-1 : 0 ) > _n;
		for( int i=0 ; i<Degree ; i++ ) _n[i] = n[i+1];
		if( noLog ) return _n.integral( sMin , sMax );
		else        return _n.integral( sMin , sMax ) + n[0] * ( log(sMax/sMin) );
	}
	else return numerator.integral( sMin , sMax ) / denominator[0];
}
// Integrates the polynomial over [sMin,sMax]; dispatches to the analytic implementation.
template< unsigned int Degree >
double Integral( const Polynomial< Degree >& numerator , double sMin , double sMax )
{
	const double value = AnalyticIntegral( numerator , sMin , sMax );
	return value;
}
// Integrates numerator/denominator over [sMin,sMax]; dispatches to the analytic implementation.
// The noLog flag is passed through unchanged.
template< unsigned int Degree >
double QuotientIntegral( const Polynomial< Degree >& numerator , const Polynomial< 1 >& denominator , double sMin , double sMax , bool noLog )
{
	const double value = AnalyticQuotientIntegral( numerator , denominator , sMin , sMax , noLog );
	return value;
}
double SoRParameterization::_vIntegral( int b , int type1 , int type2 , const Polynomial< 1 >& area ) const
{
	Polynomial< 1 > p0 , p1 , unit( 1. );
	p0[0] = 0. , p0[1] = 1. , p1[0] = 1. , p1[1] = -1.;
	Polynomial< 1 > x1 = ( __R(type1) ? p0 : p1 ) , x2 = ( __R(type2) ? p0 : p1 ) , y1 = ( __T(type1) ? p0 : p1 ) , y2 = ( __T(type2) ? p0 : p1 );
	if( !_gridType.yPeriodic() && ( b<0 || b>=(int)bands() ) ) return 0.;
	else b = MOD( b , _resY );
	if( b==0 )
	{
		if( _gridType.yPole0() )
		{
			if( __B(type1) ) x1 = unit;
			if( __B(type2) ) x2 = unit;
		}
		else if( _gridType.yDirichlet0() && ( __B(type1) || __B(type2) ) ) return 0;
	}
	if( b==bands()-1 )
	{
		if( _gridType.yPole1() )
		{
			if( __T(type1) ) x1 = unit;
			if( __T(type2) ) x2 = unit;
		}
		else if( _gridType.yDirichlet1() && ( __T(type1) || __T(type2) ) ) return 0;
	}
	return Integral( x1 * x2 , 0. , 1. ) * Integral( y1 * y2 * area , 0. , 1. );
}
// Integrates the product of the two edge functions selected by type1 and type2 over band b,
// using the supplied metric root. rotate1 / rotate2 indicate whether the respective function is rotated.
// Returns 0 for bands outside the grid (non-periodic case) and for edges lying on a pole or Dirichlet boundary.
double SoRParameterization::_dIntegral( int b , int type1 , int type2 , bool rotate1 , bool rotate2 , const SoRParameterization::MetricRoot& metricRoot ) const
{
	if( !_gridType.yPeriodic() && ( b<0 || b>=(int)bands() ) ) return 0.;
	b = MOD( b , _resY );

	const bool firstBand = ( b==0 );
	const bool lastBand  = ( b==bands()-1 );
	const bool usesB = ( type1==_B || type2==_B );
	const bool usesT = ( type1==_T || type2==_T );

	// Edges on a pole or on a Dirichlet boundary do not contribute
	if( firstBand && ( _gridType.yPole0() || _gridType.yDirichlet0() ) && usesB ) return 0.;
	if( lastBand  && ( _gridType.yPole1() || _gridType.yDirichlet1() ) && usesT ) return 0.;

	// In a pole band, integrals involving the edge opposite the pole skip the logarithmic term
	const bool noLog = ( _gridType.yPole0() && firstBand && usesT ) || ( _gridType.yPole1() && lastBand && usesB );

	// Linear profile of an edge along one axis: 1-t for the `falling` edge, t for the `rising` edge, 1 otherwise
	auto profile = [] ( int e , int falling , int rising )
	{
		Polynomial< 1 > p( 1. );
		if     ( e==falling ) p[0] = 1.0 , p[1] = -1.0;
		else if( e==rising  ) p[0] = 0.0 , p[1] =  1.0;
		return p;
	};
	// Rotations act by x->y , y->-x
	Polynomial< 1 > x1 = profile( type1 , _L , _R ) , x2 = profile( type2 , _L , _R );
	Polynomial< 1 > y1 = profile( type1 , _B , _T ) , y2 = profile( type2 , _B , _T );

	// The metric root is:
	// | x_dx(y)  y_dx(x) |
	// |   0      y_dy    |
	// The inverse transpose is:
	// |  y_dy       0     |  /
	// | -y_dx(x)  x_dx(y) | /  [ x_dx(y) * y_dy ]
	// Then
	// [ X(x) * Y(y) , 0 ] -> [   X(x) * Y(y) * y_dy , - X(x) * Y(y) * y_dx(x) ] / [ x_dx(y) * y_dy ]
	// [ 0 , X(x) * Y(y) ] -> [                    0 ,   X(x) * Y(y) * x_dx(y) ] / [ x_dx(y) * y_dy ]
	const Polynomial< 1 >& y_dx = metricRoot.y_dx;
	const Polynomial< 1 >& x_dx = metricRoot.x_dx;
	const double y_dy = metricRoot.y_dy;
	const bool vertical1 = __V(type1) , vertical2 = __V(type2);

	if( rotate1!=rotate2 )
	{
		// Exactly one function is rotated: only mixed (vertical/non-vertical) pairs contribute
		if( vertical1==vertical2 ) return 0;
		const double product = Integral( x1 * x2 , 0. , 1. ) * Integral( y1 * y2 , 0. , 1. );
		// Positive when the rotated function is the non-vertical one
		const bool positive = ( rotate1 == !vertical1 );
		return positive ? product : -product;
	}

	// Both or neither rotated
	if( vertical1 && vertical2 ) return  Integral( x1 * x2        , 0. , 1. ) * Integral( y1 * y2 * x_dx , 0. , 1. ) / y_dy;
	if( vertical1 || vertical2 ) return -Integral( x1 * x2 * y_dx , 0. , 1. ) * Integral( y1 * y2        , 0. , 1. ) / y_dy;

	const double quotient = QuotientIntegral( y1 * y2 , x_dx , 0. , 1. , noLog );
	return
		Integral( x1 * x2               , 0. , 1. ) * quotient * y_dy +
		Integral( x1 * x2 * y_dx * y_dx , 0. , 1. ) * quotient / y_dy ;
}
// Computes the value-value stencil for row j.
// The 3x3 stencil is first cleared and then filled from _vIntegral evaluations on the two bands
// adjacent to the row: band jj = j-1 (with area polynomial area0) and band j (with area polynomial area1).
// Rows that sit on a pole get only a center entry (scaled by fDimX) and one entry toward the interior.
void SoRParameterization::__mStencil( int j , Stencil< 3 , 3 >& stencil ) const
{
	memset( stencil.values , 0 , sizeof(double)*3*3 );
	// fDimX is _resX when the grid is periodic in x and _resX-1 otherwise
	int fDimX = _gridType.xPeriodic() ? _resX : _resX-1;
	int jj = j-1;
  	Polynomial< 1 > area0 = _area(jj) , area1 = _area(j);
	// Each macro integrates a fixed corner function against the corner function passed in:
	//   V_INTEGRAL_00 / V_INTEGRAL_10 use band jj with fixed corner _T_R / _T_L
	//   V_INTEGRAL_01 / V_INTEGRAL_11 use band j  with fixed corner _B_R / _B_L
#define V_INTEGRAL_00( corner ) ( _vIntegral( jj , _T_R , (corner) , (area0) ) )
#define V_INTEGRAL_10( corner ) ( _vIntegral( jj , _T_L , (corner) , (area0) ) )
#define V_INTEGRAL_01( corner ) ( _vIntegral( j  , _B_R , (corner) , (area1) ) )
#define V_INTEGRAL_11( corner ) ( _vIntegral( j  , _B_L , (corner) , (area1) ) )
	// Row that is not on a pole
	if( !( _gridType.yPole0() && j==0 ) && !( _gridType.yPole1() && j==_resY-1 ) )
	{
		stencil[1][1]                 = V_INTEGRAL_00( _T_R ) + V_INTEGRAL_10( _T_L ) + V_INTEGRAL_01( _B_R ) + V_INTEGRAL_11( _B_L );
		stencil[1][0]                 = V_INTEGRAL_00( _B_R ) + V_INTEGRAL_10( _B_L );
		stencil[1][2]                 =                                                 V_INTEGRAL_01( _T_R ) + V_INTEGRAL_11( _T_L );
		// Entries with first index 0 and 2 are assigned the same value
		stencil[0][1] = stencil[2][1] =                         V_INTEGRAL_10( _T_R )                         + V_INTEGRAL_11( _B_R );
		stencil[0][0] = stencil[2][0] = V_INTEGRAL_00( _B_L );
		stencil[0][2] = stencil[2][2] =                                                 V_INTEGRAL_01( _T_L );

		// Next to a pole row the off-center entries pointing toward the pole are zeroed
		if( j==1       && _gridType.yPole0() ) stencil[0][0] = stencil[2][0] = 0.;
		if( j==_resY-2 && _gridType.yPole1() ) stencil[0][2] = stencil[2][2] = 0.;
	}
	// Pole rows: the center entry is a single integral multiplied by fDimX
	else if( _gridType.yPole0() && j==      0 ) stencil[1][1] = V_INTEGRAL_01( _B_R ) * fDimX , stencil[1][2] = V_INTEGRAL_01( _T_R ) + V_INTEGRAL_11( _T_L );
	else if( _gridType.yPole1() && j==_resY-1 ) stencil[1][1] = V_INTEGRAL_00( _T_R ) * fDimX , stencil[1][0] = V_INTEGRAL_00( _B_R ) + V_INTEGRAL_10( _B_L );
#undef V_INTEGRAL_00
#undef V_INTEGRAL_10
#undef V_INTEGRAL_01
#undef V_INTEGRAL_11
}
// Computes the derivative-derivative stencil for row j.
// The 3x3 stencil is first cleared and then filled from _dIntegral evaluations (with both rotation
// flags false) on the two bands adjacent to the row: band jj = j-1 (metric root mr0) and band j (metric root mr1).
// Which entries are filled depends on whether the row is on / next to a Dirichlet boundary or a pole.
void SoRParameterization::__sStencil( int j , Stencil< 3 , 3 >& stencil ) const
{
	memset( stencil.values , 0 , sizeof(double)*3*3 );
	int jj = j-1;
	// fDimX is _resX when the grid is periodic in x and _resX-1 otherwise
	int fDimX = _gridType.xPeriodic() ? _resX : _resX-1;
	MetricRoot mr0 = _metricRoot(jj) , mr1 = _metricRoot(j);
	// Plain edge-edge integrals on band jj (D_INTEGRAL_0) and band j (D_INTEGRAL_1)
#define D_INTEGRAL_0( edge1 , edge2 ) ( _dIntegral( jj , (edge1) , (edge2) , false , false , mr0 ) )
#define D_INTEGRAL_1( edge1 , edge2 ) ( _dIntegral( j  , (edge1) , (edge2) , false , false , mr1 ) )


	// Signed sums of two edge integrals against the given edge:
	//   D_INTEGRAL_00 / D_INTEGRAL_10 use band jj, D_INTEGRAL_01 / D_INTEGRAL_11 use band j
#define D_INTEGRAL_00( edge ) (   _dIntegral( jj , _T , edge , false , false , mr0 ) + _dIntegral( jj , _R , edge , false , false , mr0 ) )
#define D_INTEGRAL_10( edge ) ( - _dIntegral( jj , _T , edge , false , false , mr0 ) + _dIntegral( jj , _L , edge , false , false , mr0 ) )
#define D_INTEGRAL_01( edge ) (   _dIntegral( j  , _B , edge , false , false , mr1 ) - _dIntegral( j  , _R , edge , false , false , mr1 ) )
#define D_INTEGRAL_11( edge ) ( - _dIntegral( j  , _B , edge , false , false , mr1 ) - _dIntegral( j  , _L , edge , false , false , mr1 ) )
	// Rows on a Dirichlet boundary keep the all-zero stencil
	if     ( _gridType.yDirichlet0() && j==0       ) ;
	else if( _gridType.yDirichlet1() && j==_resY-1 ) ;
	// Row next to the first Dirichlet boundary: the entries with second index 0 are left at zero
	else if( _gridType.yDirichlet0() && j==1       )
	{
		stencil[1][1]                 =   D_INTEGRAL_00( _T ) + D_INTEGRAL_00( _R ) - D_INTEGRAL_10( _T ) + D_INTEGRAL_10( _L ) + D_INTEGRAL_01( _B ) - D_INTEGRAL_01( _R ) - D_INTEGRAL_11( _B ) - D_INTEGRAL_11( _L );
		stencil[0][1] = stencil[2][1] = - D_INTEGRAL_00( _T ) + D_INTEGRAL_00( _L ) - D_INTEGRAL_01( _B ) - D_INTEGRAL_01( _L );
		stencil[1][2]                 =   D_INTEGRAL_01( _T ) + D_INTEGRAL_01( _R ) - D_INTEGRAL_11( _T ) + D_INTEGRAL_11( _L );
		stencil[0][2] = stencil[2][2] = - D_INTEGRAL_01( _T ) + D_INTEGRAL_01( _L );
	}
	// Row next to the second Dirichlet boundary: the entries with second index 2 are left at zero
	else if( _gridType.yDirichlet1() && j==_resY-2 )
	{
		stencil[1][1]                 =   D_INTEGRAL_00( _T ) + D_INTEGRAL_00( _R ) - D_INTEGRAL_10( _T ) + D_INTEGRAL_10( _L ) + D_INTEGRAL_01( _B ) - D_INTEGRAL_01( _R ) - D_INTEGRAL_11( _B ) - D_INTEGRAL_11( _L );
		stencil[0][1] = stencil[2][1] = - D_INTEGRAL_00( _T ) + D_INTEGRAL_00( _L ) - D_INTEGRAL_01( _B ) - D_INTEGRAL_01( _L );
		stencil[1][0]                 =   D_INTEGRAL_00( _B ) - D_INTEGRAL_00( _R ) - D_INTEGRAL_10( _B ) - D_INTEGRAL_10( _L );
		stencil[0][0] = stencil[2][0] = - D_INTEGRAL_00( _B ) - D_INTEGRAL_00( _L );
	}
	// Row on the first pole: only band j contributes; the center entry is multiplied by fDimX
	else if( _gridType.yPole0() && j==0 )
	{
		stencil[1][1] =
			/* Quadrant (*,1) */ (   D_INTEGRAL_1( _R , _R ) + D_INTEGRAL_1( _L , _L ) + D_INTEGRAL_1( _L , _R ) + D_INTEGRAL_1( _R , _L ) ) * fDimX;
		stencil[1][2] =
			/* Quadrant (0,1) */ ( - D_INTEGRAL_1( _R , _R ) - D_INTEGRAL_1( _L , _T ) - D_INTEGRAL_1( _L , _R ) - D_INTEGRAL_1( _R , _T ) ) +
			/* Quadrant (1,1) */ ( - D_INTEGRAL_1( _R , _L ) + D_INTEGRAL_1( _L , _T ) - D_INTEGRAL_1( _L , _L ) + D_INTEGRAL_1( _R , _T ) ) ;
	}
	// Row on the second pole: only band jj contributes; the center entry is multiplied by fDimX
	else if( _gridType.yPole1() && j==_resY-1 )
	{
		stencil[1][1] =
			/* Quadrant (*,0) */ (   D_INTEGRAL_0( _R , _R ) + D_INTEGRAL_0( _L , _L ) + D_INTEGRAL_0( _L , _R ) + D_INTEGRAL_0( _R , _L ) ) * fDimX;
		stencil[1][0] =
			/* Quadrant (0,0) */ ( - D_INTEGRAL_0( _R , _R ) + D_INTEGRAL_0( _L , _B ) - D_INTEGRAL_0( _L , _R ) + D_INTEGRAL_0( _R , _B ) ) +
			/* Quadrant (1,0) */ ( - D_INTEGRAL_0( _R , _L ) - D_INTEGRAL_0( _L , _B ) - D_INTEGRAL_0( _L , _L ) - D_INTEGRAL_0( _R , _B ) ) ;
	}
	// General row: all nine entries are filled (entries with first index 0 and 2 share a value)
	else
	{
		stencil[1][1]                 =   D_INTEGRAL_00( _T ) + D_INTEGRAL_00( _R ) - D_INTEGRAL_10( _T ) + D_INTEGRAL_10( _L ) + D_INTEGRAL_01( _B ) - D_INTEGRAL_01( _R ) - D_INTEGRAL_11( _B ) - D_INTEGRAL_11( _L );
		stencil[0][1] = stencil[2][1] = - D_INTEGRAL_00( _T ) + D_INTEGRAL_00( _L ) - D_INTEGRAL_01( _B ) - D_INTEGRAL_01( _L );
		stencil[1][0]                 =   D_INTEGRAL_00( _B ) - D_INTEGRAL_00( _R ) - D_INTEGRAL_10( _B ) - D_INTEGRAL_10( _L );
		stencil[0][0] = stencil[2][0] = - D_INTEGRAL_00( _B ) - D_INTEGRAL_00( _L );
		stencil[1][2]                 =   D_INTEGRAL_01( _T ) + D_INTEGRAL_01( _R ) - D_INTEGRAL_11( _T ) + D_INTEGRAL_11( _L );
		stencil[0][2] = stencil[2][2] = - D_INTEGRAL_01( _T ) + D_INTEGRAL_01( _L );

		// Next to a pole row: the diagonal entries toward the pole are zeroed and the entry pointing at the pole is recomputed
		if( _gridType.yPole0() && j==1       ) stencil[0][0] = stencil[2][0] = 0. , stencil[1][0] = - D_INTEGRAL_00( _R ) - D_INTEGRAL_00( _L ) - D_INTEGRAL_10( _R ) - D_INTEGRAL_10( _L );
		if( _gridType.yPole1() && j==_resY-2 ) stencil[0][2] = stencil[2][2] = 0. , stencil[1][2] =   D_INTEGRAL_01( _R ) + D_INTEGRAL_01( _L ) + D_INTEGRAL_11( _R ) + D_INTEGRAL_11( _L );
	}
#undef D_INTEGRAL_0
#undef D_INTEGRAL_1
#undef D_INTEGRAL_00
#undef D_INTEGRAL_01
#undef D_INTEGRAL_10
#undef D_INTEGRAL_11
}
void SoRParameterization::_dStencil( int b , double stencil[4][4] ) const
{
	for( int e1=0 ; e1<4 ; e1++ ) for( int e2=0 ; e2<4 ; e2++ ) stencil[e1][e2] = _dIntegral( b , e1 , e2 , false , false , _metricRoot(b) );
}
// Computes the value-derivative stencils for row j.
// Both stencils are first cleared and then filled from _dIntegral evaluations on the two bands
// adjacent to the row: band b0 = j-1 (metric root mr0) and band b1 = j (metric root mr1).
//   hStencil is filled when DType is DERIVATIVE_BOTH or DERIVATIVE_X (integrals against _B / _T edges)
//   vStencil is filled when DType is DERIVATIVE_BOTH or DERIVATIVE_Y (integrals against _L / _R edges)
// `rotate` is forwarded to _dIntegral as the rotation flag of the second edge function.
template< int DType >
void SoRParameterization::__dStencils( int j , Stencil< 2 , 3 >& hStencil , Stencil< 3 , 2 >& vStencil , bool rotate ) const
{
	memset( hStencil.values , 0 , sizeof(double)*2*3 ) , memset( vStencil.values , 0 , sizeof(double)*3*2 );
	const int b0 = j-1 , b1 = j;
	MetricRoot mr0 = _metricRoot(b0) , mr1 = _metricRoot(b1);
	const bool fillX = ( DType==RegularGridFEM::DERIVATIVE_BOTH || DType==RegularGridFEM::DERIVATIVE_X );
	const bool fillY = ( DType==RegularGridFEM::DERIVATIVE_BOTH || DType==RegularGridFEM::DERIVATIVE_Y );
	// Signed sums of two edge integrals against the given edge:
	//   D_INTEGRAL_00 / D_INTEGRAL_10 use band b0, D_INTEGRAL_01 / D_INTEGRAL_11 use band b1
#define D_INTEGRAL_00( edge ) (   _dIntegral( b0 , _R , (edge) , false , rotate , mr0 ) + _dIntegral( b0 , _T , (edge) , false , rotate , mr0 ) )
#define D_INTEGRAL_10( edge ) (   _dIntegral( b0 , _L , (edge) , false , rotate , mr0 ) - _dIntegral( b0 , _T , (edge) , false , rotate , mr0 ) )
#define D_INTEGRAL_01( edge ) ( - _dIntegral( b1 , _R , (edge) , false , rotate , mr1 ) + _dIntegral( b1 , _B , (edge) , false , rotate , mr1 ) )
#define D_INTEGRAL_11( edge ) ( - _dIntegral( b1 , _L , (edge) , false , rotate , mr1 ) - _dIntegral( b1 , _B , (edge) , false , rotate , mr1 ) )
	const bool onPole0 = ( _gridType.yPole0() && j==0 );
	const bool onPole1 = ( _gridType.yPole1() && j==_resY-1 );
	// Row that is not on a pole
	if( !onPole0 && !onPole1 )
	{
		if( fillX )
		{
			hStencil[0][0] = D_INTEGRAL_00( _B );
			hStencil[1][0] = D_INTEGRAL_10( _B );
			hStencil[0][1] = D_INTEGRAL_00( _T ) + D_INTEGRAL_01( _B );
			hStencil[1][1] = D_INTEGRAL_10( _T ) + D_INTEGRAL_11( _B );
			hStencil[0][2] =                       D_INTEGRAL_01( _T );
			hStencil[1][2] =                       D_INTEGRAL_11( _T );
		}

		if( fillY )
		{
			vStencil[0][0] = D_INTEGRAL_00( _L );
			vStencil[0][1] = D_INTEGRAL_01( _L );
			vStencil[1][0] = D_INTEGRAL_00( _R ) + D_INTEGRAL_10( _L );
			vStencil[1][1] = D_INTEGRAL_01( _R ) + D_INTEGRAL_11( _L );
			vStencil[2][0] =                       D_INTEGRAL_10( _R );
			vStencil[2][1] =                       D_INTEGRAL_11( _R );
		}
	}
	// Row on the first pole: only band b1 contributes.
	// The pole at j==0 is the one reported by yPole0(), matching the tests used by __mStencil and __sStencil.
	else if( onPole0 )
	{
		if( fillX )
			hStencil[0][2] = hStencil[1][2] = - _dIntegral( b1 , _R , _T , false , rotate , mr1 ) - _dIntegral( b1 , _L , _T , false , rotate , mr1 );
		if( fillY )
			vStencil[0][1] = vStencil[1][1] = vStencil[2][1]  =
				- _dIntegral( b1 , _R , _L , false , rotate , mr1 ) - _dIntegral( b1 , _R , _R , false , rotate , mr1 )
				- _dIntegral( b1 , _L , _L , false , rotate , mr1 ) - _dIntegral( b1 , _L , _R , false , rotate , mr1 );
	}
	// Row on the second pole: only band b0 contributes
	else if( onPole1 )
	{
		if( fillX )
			hStencil[0][0] = hStencil[1][0] =   _dIntegral( b0 , _R , _B , false , rotate , mr0 ) + _dIntegral( b0 , _L , _B , false , rotate , mr0 );
		if( fillY )
			vStencil[0][0] = vStencil[1][0] = vStencil[2][0]  =
				+ _dIntegral( b0 , _R , _L , false , rotate , mr0 ) + _dIntegral( b0 , _R , _R , false , rotate , mr0 )
				+ _dIntegral( b0 , _L , _L , false , rotate , mr0 ) + _dIntegral( b0 , _L , _R , false , rotate , mr0 );
	}
#undef D_INTEGRAL_00
#undef D_INTEGRAL_10
#undef D_INTEGRAL_01
#undef D_INTEGRAL_11
}
// Differentiates the signal sf into the derivative field vf by forwarding to RegularGridFEM::Differentiate.
template< class Data , class Real >
void SoRParameterization::gradient( const RegularGridFEM::template Signal< Data , Real >& sf , RegularGridFEM::template Derivative< Data , Real >& vf , int threads ) const
{
	RegularGridFEM::Differentiate( sf , vf , threads );
}
// Computes the divergence of the derivative field v into div: _div_curl with the rotate flag off.
template< class Data , class Real , int DType >
void SoRParameterization::divergence( const RegularGridFEM::template Derivative< Data , Real , DType >& v , RegularGridFEM::template Signal< Data , Real >& div , int threads ) const
{
	const bool rotate = false;
	_div_curl( v , div , rotate , threads );
}
// Computes the curl of the derivative field v into crl: _div_curl with the rotate flag on.
template< class Data , class Real >
void SoRParameterization::curl( const RegularGridFEM::template Derivative< Data , Real >& v , RegularGridFEM::template Signal< Data , Real >& crl , int threads ) const
{
	const bool rotate = true;
	_div_curl( v , crl , rotate , threads );
}
template< class Data , class Real , int DType >
void SoRParameterization::_div_curl( const RegularGridFEM::template Derivative< Data , Real , DType >& vf , RegularGridFEM::template Signal< Data , Real >& sf , bool rotate , int threads ) const
{
	_assertValidity( vf , "SoRParameterization::_div_curl" );

	if( DType==RegularGridFEM::DERIVATIVE_BOTH ) sf.resize( _resX , _resY , _gridType , true );
	else _assertValidity( sf , "SoRParameterization::_div_curl" );

	threads = std::max< int >( 1 , threads );
#define SUM_X_00 (                                                                                                             _dx1[ i1 ] * (Real)hStencil[1][1] +                                     _dx2[ i1 ] * (Real)hStencil[1][2] )
#define SUM_X_10 (                                                                         _dx1[ i0 ] * (Real)hStencil[0][1] + _dx1[ i1 ] * (Real)hStencil[1][1] + _dx2[ i0 ] * (Real)hStencil[0][2] + _dx2[ i1 ] * (Real)hStencil[1][2] )
#define SUM_X_20 (                                                                         _dx1[ i0 ] * (Real)hStencil[0][1] +                                     _dx2[ i0 ] * (Real)hStencil[0][2]                                     )
#define SUM_X_01 (                                     _dx0[ i1 ] * (Real)hStencil[1][0] +                                     _dx1[ i1 ] * (Real)hStencil[1][1] +                                     _dx2[ i1 ] * (Real)hStencil[1][2] )
#define SUM_X_11 ( _dx0[ i0 ] * (Real)hStencil[0][0] + _dx0[ i1 ] * (Real)hStencil[1][0] + _dx1[ i0 ] * (Real)hStencil[0][1] + _dx1[ i1 ] * (Real)hStencil[1][1] + _dx2[ i0 ] * (Real)hStencil[0][2] + _dx2[ i1 ] * (Real)hStencil[1][2] )
#define SUM_X_21 ( _dx0[ i0 ] * (Real)hStencil[0][0] +                                     _dx1[ i0 ] * (Real)hStencil[0][1] +                                     _dx2[ i0 ] * (Real)hStencil[0][2]                                     )
#define SUM_X_02 (                                     _dx0[ i1 ] * (Real)hStencil[1][0] +                                     _dx1[ i1 ] * (Real)hStencil[1][1]                                                                         )
#define SUM_X_12 ( _dx0[ i0 ] * (Real)hStencil[0][0] + _dx0[ i1 ] * (Real)hStencil[1][0] + _dx1[ i0 ] * (Real)hStencil[0][1] + _dx1[ i1 ] * (Real)hStencil[1][1]                                                                         )
#define SUM_X_22 ( _dx0[ i0 ] * (Real)hStencil[0][0] +                                     _dx1[ i0 ] * (Real)hStencil[0][1]                                                                                                             )

#define SUM_Y_00 (                                                                                                             _dy1[ i1 ] * (Real)vStencil[1][1] +                                     _dy1[ i2 ] * (Real)vStencil[2][1] )
#define SUM_Y_10 (                                     _dy1[ i0 ] * (Real)vStencil[0][1] +                                     _dy1[ i1 ] * (Real)vStencil[1][1] +                                     _dy1[ i2 ] * (Real)vStencil[2][1] )
#define SUM_Y_20 (                                     _dy1[ i0 ] * (Real)vStencil[0][1] +                                     _dy1[ i1 ] * (Real)vStencil[1][1]                                                                         )
#define SUM_Y_01 (                                                                         _dy0[ i1 ] * (Real)vStencil[1][0] + _dy1[ i1 ] * (Real)vStencil[1][1] + _dy0[ i2 ] * (Real)vStencil[2][0] + _dy1[ i2 ] * (Real)vStencil[2][1] )
#define SUM_Y_11 ( _dy0[ i0 ] * (Real)vStencil[0][0] + _dy1[ i0 ] * (Real)vStencil[0][1] + _dy0[ i1 ] * (Real)vStencil[1][0] + _dy1[ i1 ] * (Real)vStencil[1][1] + _dy0[ i2 ] * (Real)vStencil[2][0] + _dy1[ i2 ] * (Real)vStencil[2][1] )
#define SUM_Y_21 ( _dy0[ i0 ] * (Real)vStencil[0][0] + _dy1[ i0 ] * (Real)vStencil[0][1] + _dy0[ i1 ] * (Real)vStencil[1][0] + _dy1[ i1 ] * (Real)vStencil[1][1]                                                                         )
#define SUM_Y_02 (                                                                         _dy0[ i1 ] * (Real)vStencil[1][0] +                                     _dy0[ i2 ] * (Real)vStencil[2][0]                                     )
#define SUM_Y_12 ( _dy0[ i0 ] * (Real)vStencil[0][0] +                                     _dy0[ i1 ] * (Real)vStencil[1][0] +                                     _dy0[ i2 ] * (Real)vStencil[2][0]                                     )
#define SUM_Y_22 ( _dy0[ i0 ] * (Real)vStencil[0][0] +                                     _dy0[ i1 ] * (Real)vStencil[1][0]                                                                                                             )

#define SUM_Y_N_00 (                                                                                                                         _dy1[ i1 ] * (Real)vStencil[1][1] * (Real)0.5 +                                     _dy1[ i2 ] * (Real)vStencil[2][1] )
#define SUM_Y_N_20 (                                     _dy1[ i0 ] * (Real)vStencil[0][1] +                                                 _dy1[ i1 ] * (Real)vStencil[1][1] * (Real)0.5                                                                         )
#define SUM_Y_N_01 (                                                                         _dy0[ i1 ] * (Real)vStencil[1][0] * (Real)0.5 + _dy1[ i1 ] * (Real)vStencil[1][1] * (Real)0.5 + _dy0[ i2 ] * (Real)vStencil[2][0] + _dy1[ i2 ] * (Real)vStencil[2][1] )
#define SUM_Y_N_21 ( _dy0[ i0 ] * (Real)vStencil[0][0] + _dy1[ i0 ] * (Real)vStencil[0][1] + _dy0[ i1 ] * (Real)vStencil[1][0] * (Real)0.5 + _dy1[ i1 ] * (Real)vStencil[1][1] * (Real)0.5                                                                         )
#define SUM_Y_N_02 (                                                                         _dy0[ i1 ] * (Real)vStencil[1][0] * (Real)0.5 +                                                 _dy0[ i2 ] * (Real)vStencil[2][0]                                     )
#define SUM_Y_N_22 ( _dy0[ i0 ] * (Real)vStencil[0][0] +                                     _dy0[ i1 ] * (Real)vStencil[1][0] * (Real)0.5                                                                                                                         )

#define SUM( WHICHX , WHICHY )\
	{ \
		if( _gridType.xPeriodic() ) \
			for( int i=0 ; i<_resX ; i++ ) \
			{ \
				int i0 = (i-1+_resX) % _resX , i1 = i , i2 = (i+1) % _resX; \
				if( DType==RegularGridFEM::DERIVATIVE_BOTH || DType==RegularGridFEM::DERIVATIVE_X ) _sf[i] += SUM_X_1 ## WHICHX; \
				if( DType==RegularGridFEM::DERIVATIVE_BOTH || DType==RegularGridFEM::DERIVATIVE_Y ) _sf[i] += SUM_Y_1 ## WHICHY; \
			} \
		else if( _gridType.xNeumann() ) \
		{ \
			{ \
				int i=0; \
				int i0 = (i-1+_resX) % _resX , i1 = i , i2 = (i+1) % _resX; \
				if( DType==RegularGridFEM::DERIVATIVE_BOTH || DType==RegularGridFEM::DERIVATIVE_X ) _sf[i] += SUM_X_0   ## WHICHX; \
				if( DType==RegularGridFEM::DERIVATIVE_BOTH || DType==RegularGridFEM::DERIVATIVE_Y ) _sf[i] += SUM_Y_N_0 ## WHICHY; \
			} \
			for( int i=1 ; i<_resX-1 ; i++ ) \
			{ \
				int i0 = (i-1+_resX) % _resX , i1 = i , i2 = (i+1) % _resX; \
				if( DType==RegularGridFEM::DERIVATIVE_BOTH || DType==RegularGridFEM::DERIVATIVE_X ) _sf[i] += SUM_X_1 ## WHICHX; \
				if( DType==RegularGridFEM::DERIVATIVE_BOTH || DType==RegularGridFEM::DERIVATIVE_Y ) _sf[i] += SUM_Y_1 ## WHICHY; \
			} \
			{ \
				int i=_resX-1; \
				int i0 = (i-1+_resX) % _resX , i1 = i , i2 = (i+1) % _resX; \
				if( DType==RegularGridFEM::DERIVATIVE_BOTH || DType==RegularGridFEM::DERIVATIVE_X ) _sf[i] += SUM_X_2   ## WHICHX; \
				if( DType==RegularGridFEM::DERIVATIVE_BOTH || DType==RegularGridFEM::DERIVATIVE_Y ) _sf[i] += SUM_Y_N_2 ## WHICHY; \
			} \
		} \
		else if( _gridType.xDirichlet() ) \
		{ \
			{ \
				int i=1; \
				int i0 = (i-1+_resX) % _resX , i1 = i , i2 = (i+1) % _resX; \
				if( DType==RegularGridFEM::DERIVATIVE_BOTH || DType==RegularGridFEM::DERIVATIVE_X ) _sf[i] += SUM_X_1 ## WHICHX; \
				if( DType==RegularGridFEM::DERIVATIVE_BOTH || DType==RegularGridFEM::DERIVATIVE_Y ) _sf[i] += SUM_Y_0 ## WHICHY; \
			} \
			for( int i=2 ; i<_resX-2 ; i++ ) \
			{ \
				int i0 = (i-1+_resX) % _resX , i1 = i , i2 = (i+1) % _resX; \
				if( DType==RegularGridFEM::DERIVATIVE_BOTH || DType==RegularGridFEM::DERIVATIVE_X ) _sf[i] += SUM_X_1 ## WHICHX; \
				if( DType==RegularGridFEM::DERIVATIVE_BOTH || DType==RegularGridFEM::DERIVATIVE_Y ) _sf[i] += SUM_Y_1 ## WHICHY; \
			} \
			{ \
				int i = _resX-2; \
				int i0 = (i-1+_resX) % _resX , i1 = i , i2 = (i+1) % _resX; \
				if( DType==RegularGridFEM::DERIVATIVE_BOTH || DType==RegularGridFEM::DERIVATIVE_X ) _sf[i] += SUM_X_1 ## WHICHX; \
				if( DType==RegularGridFEM::DERIVATIVE_BOTH || DType==RegularGridFEM::DERIVATIVE_Y ) _sf[i] += SUM_Y_2 ## WHICHY; \
			} \
		} \
	}

	int poleDim = _gridType.xDirichlet() ? 0 : 1;
	int vDimX = _gridType.xDirichlet() ? _resX-2 : _resX;
	int fDimX = _gridType.xPeriodic() ? _resX : _resX-1;
#pragma omp parallel for num_threads( threads )
	for( int j=0 ; j<_resY ; j++ )
	{
		if     ( j==0       && ( _gridType.yDirichlet0() || ( _gridType.yPole0() && _gridType.xDirichlet() ) ) ) continue;
		else if( j==_resY-1 && ( _gridType.yDirichlet1() || ( _gridType.yPole1() && _gridType.xDirichlet() ) ) ) continue;

		int j0 = (j-1+_resY) % _resY , j1 = j , j2 = (j+1) % _resY;
		const Stencil< 2 , 3 >& hStencil = rotate ? _drStencils[j].hStencil : _dStencils[j].hStencil;
		const Stencil< 3 , 2 >& vStencil = rotate ? _drStencils[j].vStencil : _dStencils[j].vStencil;
		Pointer( Data ) _sf;
		ConstPointer( Data ) _dx0;
		ConstPointer( Data ) _dx1;
		ConstPointer( Data ) _dx2;
		ConstPointer( Data ) _dy0 = vf.dy() + j0 * vDimX;
		ConstPointer( Data ) _dy1 = vf.dy() + j1 * vDimX;
		if     ( _gridType.yPole0     () ) _sf = sf() + poleDim + (j-1) * vDimX , _dx0 = vf.dx() + (j0-1) * fDimX , _dx1 = vf.dx() + (j1-1) * fDimX , _dx2 = vf.dx() + (j2-1) * fDimX;
		else if( _gridType.yDirichlet0() ) _sf = sf() +           (j-1) * vDimX , _dx0 = vf.dx() + (j0-1) * fDimX , _dx1 = vf.dx() + (j1-1) * fDimX , _dx2 = vf.dx() + (j2-1) * fDimX;
		else                               _sf = sf() +            j    * vDimX , _dx0 = vf.dx() +  j0    * fDimX , _dx1 = vf.dx() +  j1    * fDimX , _dx2 = vf.dx() +  j2    * fDimX;

		if( _gridType.xDirichlet() ) _sf-- , _dy0-- , _dy1--;

		if( j==0       && _gridType.yPole0() )
		{
			_sf = sf();
			if( _gridType.xPeriodic() )
			{
				for( int i=0 ; i<_resX ; i++ )
				{
					if( DType==RegularGridFEM::DERIVATIVE_BOTH || DType==RegularGridFEM::DERIVATIVE_X ) _sf[0] += _dx2[ i ] * (Real)hStencil[0][2];
					if( DType==RegularGridFEM::DERIVATIVE_BOTH || DType==RegularGridFEM::DERIVATIVE_Y ) _sf[0] += _dy1[ i ] * (Real)vStencil[0][1];
				}
			}
			else if( _gridType.xNeumann() )
			{
				if( DType==RegularGridFEM::DERIVATIVE_BOTH || DType==RegularGridFEM::DERIVATIVE_X )
					for( int i=0 ; i<fDimX ; i++ ) _sf[0] += _dx2[ i ] * (Real)hStencil[0][2];
				if( DType==RegularGridFEM::DERIVATIVE_BOTH || DType==RegularGridFEM::DERIVATIVE_Y )
				{
					{ int i=0;                       _sf[0] += _dy1[ i ] * (Real)( vStencil[0][1] * 0.5 ) ; }
					for( int i=1 ; i<_resX-1 ; i++ ) _sf[0] += _dy1[ i ] * (Real)vStencil[0][1];
					{ int i=_resX-1;                 _sf[0] += _dy1[ i ] * (Real)( vStencil[0][1] * 0.5 ) ; }
				}
			}
		}
		else if( j==_resY-1 && _gridType.yPole1() )
		{
			if( _gridType.xPeriodic() )
			{
				for( int i=0 ; i<_resX ; i++ )
				{
					if( DType==RegularGridFEM::DERIVATIVE_BOTH || DType==RegularGridFEM::DERIVATIVE_X ) _sf[0] += _dx0[ i ] * (Real)hStencil[0][0];
					if( DType==RegularGridFEM::DERIVATIVE_BOTH || DType==RegularGridFEM::DERIVATIVE_Y ) _sf[0] += _dy0[ i ] * (Real)vStencil[0][0];
				}
			}
			else if( _gridType.xNeumann() )
			{
				if( DType==RegularGridFEM::DERIVATIVE_BOTH || DType==RegularGridFEM::DERIVATIVE_X )
					for( int i=0 ; i<fDimX ; i++ ) _sf[0] += _dx0[ i ] * (Real)hStencil[0][0];
				if( DType==RegularGridFEM::DERIVATIVE_BOTH || DType==RegularGridFEM::DERIVATIVE_Y )
				{
					{ int i=0;                       _sf[0] += _dy0[ i ] * (Real)( vStencil[0][0] * 0.5 ) ; }
					for( int i=1 ; i<_resX-1 ; i++ ) _sf[0] += _dy0[ i ] * (Real)vStencil[0][0];
					{ int i=_resX-1;                 _sf[0] += _dy0[ i ] * (Real)( vStencil[0][0] * 0.5 ) ; }
				}
			}
		}
		else if( j==0       && _gridType.yNeumann0() )                             SUM( 0 , 0 )
		else if( j==1       && ( _gridType.yDirichlet0() || _gridType.yPole0() ) ) SUM( 0 , 1 )
		else if( j==_resY-1 && _gridType.yNeumann1() )                             SUM( 2 , 2 )
		else if( j==_resY-2 && ( _gridType.yDirichlet1() || _gridType.yPole1() ) ) SUM( 2 , 1 )
		else                                                                       SUM( 1 , 1 )
	}
#undef SUM_X_00
#undef SUM_X_10
#undef SUM_X_20
#undef SUM_X_01
#undef SUM_X_11
#undef SUM_X_21
#undef SUM_X_02
#undef SUM_X_12
#undef SUM_X_22
#undef SUM_Y_00
#undef SUM_Y_10
#undef SUM_Y_20
#undef SUM_Y_01
#undef SUM_Y_11
#undef SUM_Y_21
#undef SUM_Y_02
#undef SUM_Y_12
#undef SUM_Y_22
#undef SUM_Y_N_00
#undef SUM_Y_N_20
#undef SUM_Y_N_01
#undef SUM_Y_N_21
#undef SUM_Y_N_02
#undef SUM_Y_N_22
#undef SUM
}
// Returns the stencil-weighted inner product of two scalar signals:
//     sum over stored grid vertices v of ( M * sf1 )[v] * sf2[v]
// where row j of M is described by the 3x3 stencil _mStencils[j].stencil
// (presumably the mass matrix -- confirm against where _mStencils is built).
//
//   sf1 , sf2 : signals to pair; both are checked with _assertValidity.
//   threads   : requested OpenMP thread count, clamped to at least 1.
//
// Storage layout, as implied by the pointer arithmetic below:
//   * yPole0      : the pole value is entry 0, row j>=1 starts at poleDim + (j-1)*vDimX.
//   * yDirichlet0 : row 0 is not stored, row j>=1 starts at (j-1)*vDimX.
//   * otherwise   : row j starts at j*vDimX.
//   * xDirichlet  : only columns 1.._resX-2 are stored (vDimX = _resX-2); row pointers are
//                   decremented by one so that a column index i addresses stored entry i-1.
template< class Real >
double SoRParameterization::dotProduct( const RegularGridFEM::template Signal< Real , Real >& sf1 , const RegularGridFEM::template Signal< Real , Real >& sf2 , int threads ) const
{
// SUM_<X><Y> evaluates one row of the stencil applied to sf1 at column i1.
//   X selects the horizontal neighbours used: 0 -> only i2 , 1 -> i0 and i2 , 2 -> only i0.
//   Y selects the rows used: 0 -> rows j and j+1 , 1 -> rows j-1 , j and j+1 , 2 -> rows j-1 and j.
// The off-centre columns i0 and i2 share the weight mStencil[0][*], i.e. the stencil is treated as
// left/right symmetric.
#define SUM_00 (                                                                          (           _sf1_1[i2]) * mStencil[0][1] + _sf1_1[i1] * mStencil[1][1] + (           _sf1_2[i2]) * mStencil[0][2] + _sf1_2[i1] * mStencil[1][2] )
#define SUM_10 (                                                                          (_sf1_1[i0]+_sf1_1[i2]) * mStencil[0][1] + _sf1_1[i1] * mStencil[1][1] + (_sf1_2[i0]+_sf1_2[i2]) * mStencil[0][2] + _sf1_2[i1] * mStencil[1][2] )
#define SUM_20 (                                                                          (_sf1_1[i0]           ) * mStencil[0][1] + _sf1_1[i1] * mStencil[1][1] + (_sf1_2[i0]           ) * mStencil[0][2] + _sf1_2[i1] * mStencil[1][2] )
#define SUM_01 ( (           _sf1_0[i2]) * mStencil[0][0] + _sf1_0[i1] * mStencil[1][0] + (           _sf1_1[i2]) * mStencil[0][1] + _sf1_1[i1] * mStencil[1][1] + (           _sf1_2[i2]) * mStencil[0][2] + _sf1_2[i1] * mStencil[1][2] )
#define SUM_11 ( (_sf1_0[i0]+_sf1_0[i2]) * mStencil[0][0] + _sf1_0[i1] * mStencil[1][0] + (_sf1_1[i0]+_sf1_1[i2]) * mStencil[0][1] + _sf1_1[i1] * mStencil[1][1] + (_sf1_2[i0]+_sf1_2[i2]) * mStencil[0][2] + _sf1_2[i1] * mStencil[1][2] )
#define SUM_21 ( (_sf1_0[i0]           ) * mStencil[0][0] + _sf1_0[i1] * mStencil[1][0] + (_sf1_1[i0]           ) * mStencil[0][1] + _sf1_1[i1] * mStencil[1][1] + (_sf1_2[i0]           ) * mStencil[0][2] + _sf1_2[i1] * mStencil[1][2] )
#define SUM_02 ( (           _sf1_0[i2]) * mStencil[0][0] + _sf1_0[i1] * mStencil[1][0] + (           _sf1_1[i2]) * mStencil[0][1] + _sf1_1[i1] * mStencil[1][1]                                                                          )
#define SUM_12 ( (_sf1_0[i0]+_sf1_0[i2]) * mStencil[0][0] + _sf1_0[i1] * mStencil[1][0] + (_sf1_1[i0]+_sf1_1[i2]) * mStencil[0][1] + _sf1_1[i1] * mStencil[1][1]                                                                          )
#define SUM_22 ( (_sf1_0[i0]           ) * mStencil[0][0] + _sf1_0[i1] * mStencil[1][0] + (_sf1_1[i0]           ) * mStencil[0][1] + _sf1_1[i1] * mStencil[1][1]                                                                          )

// SUM_N_<X><Y>: same as SUM_<X><Y> for the one-sided cases X=0 / X=2, but with the centre-column
// weight halved. Used on the first / last column when the x-boundary is Neumann.
#define SUM_N_00 (                                                                                (           _sf1_1[i2]) * mStencil[0][1] + _sf1_1[i1] * mStencil[1][1] * 0.5 + (           _sf1_2[i2]) * mStencil[0][2] + _sf1_2[i1] * mStencil[1][2] * 0.5 )
#define SUM_N_20 (                                                                                (_sf1_1[i0]           ) * mStencil[0][1] + _sf1_1[i1] * mStencil[1][1] * 0.5 + (_sf1_2[i0]           ) * mStencil[0][2] + _sf1_2[i1] * mStencil[1][2] * 0.5 )
#define SUM_N_01 ( (           _sf1_0[i2]) * mStencil[0][0] + _sf1_0[i1] * mStencil[1][0] * 0.5 + (           _sf1_1[i2]) * mStencil[0][1] + _sf1_1[i1] * mStencil[1][1] * 0.5 + (           _sf1_2[i2]) * mStencil[0][2] + _sf1_2[i1] * mStencil[1][2] * 0.5 )
#define SUM_N_21 ( (_sf1_0[i0]           ) * mStencil[0][0] + _sf1_0[i1] * mStencil[1][0] * 0.5 + (_sf1_1[i0]           ) * mStencil[0][1] + _sf1_1[i1] * mStencil[1][1] * 0.5 + (_sf1_2[i0]           ) * mStencil[0][2] + _sf1_2[i1] * mStencil[1][2] * 0.5 )
#define SUM_N_02 ( (           _sf1_0[i2]) * mStencil[0][0] + _sf1_0[i1] * mStencil[1][0] * 0.5 + (           _sf1_1[i2]) * mStencil[0][1] + _sf1_1[i1] * mStencil[1][1] * 0.5                                                                                )
#define SUM_N_22 ( (_sf1_0[i0]           ) * mStencil[0][0] + _sf1_0[i1] * mStencil[1][0] * 0.5 + (_sf1_1[i0]           ) * mStencil[0][1] + _sf1_1[i1] * mStencil[1][1] * 0.5                                                                                )

// SUM( WHICH ): accumulates one full row into `dot`; WHICH is the row selector Y described above.
// The column variant is chosen from the x-boundary type.
#define SUM( WHICH ) \
	{ \
		if( _gridType.xPeriodic() ) \
			for( int i=0 ; i<_resX ; i++ ){ int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; dot += ( SUM_1 ## WHICH ) * _sf2[i]; } \
		else if( _gridType.xNeumann() ) \
		{ \
			{ int i=0;                        int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; dot += ( SUM_N_0 ## WHICH ) * _sf2[i]; } \
			for( int i=1 ; i<_resX-1 ; i++ ){ int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; dot += ( SUM_1   ## WHICH ) * _sf2[i]; } \
			{ int i=_resX-1;                  int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; dot += ( SUM_N_2 ## WHICH ) * _sf2[i]; } \
		} \
		else if( _gridType.xDirichlet() ) \
		{ \
			{ int i=1;                        int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; dot += ( SUM_0 ## WHICH ) * _sf2[i] ; } \
			for( int i=2 ; i<_resX-2 ; i++ ){ int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; dot += ( SUM_1 ## WHICH ) * _sf2[i] ; } \
			{ int i=_resX-2;                  int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; dot += ( SUM_2 ## WHICH ) * _sf2[i] ; } \
		} \
	}
// SUM_P( WHICH ): row adjacent to a pole. WHICH (0 or 2) names the pole side; the regular row sum
// skips that side and the single pole value _sf1_<WHICH>[0] is added with weight mStencil[1][WHICH]
// (halved on the Neumann boundary columns).
#define SUM_P( WHICH ) \
	{ \
		if( _gridType.xPeriodic() ) \
			for( int i=0 ; i<_resX ; i++ ){ int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; dot += ( ( SUM_1 ## WHICH ) + ( _sf1_ ## WHICH [0] * mStencil[1][WHICH] ) ) * _sf2[i] ; } \
		else if( _gridType.xNeumann() ) \
		{ \
			{ int i=0;                        int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; dot += ( ( SUM_N_0 ## WHICH ) + ( _sf1_ ## WHICH [0] * mStencil[1][WHICH] * 0.5 ) ) * _sf2[i]; } \
			for( int i=1 ; i<_resX-1 ; i++ ){ int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; dot += ( ( SUM_1   ## WHICH ) + ( _sf1_ ## WHICH [0] * mStencil[1][WHICH]       ) ) * _sf2[i]; } \
			{ int i=_resX-1;                  int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; dot += ( ( SUM_N_2 ## WHICH ) + ( _sf1_ ## WHICH [0] * mStencil[1][WHICH] * 0.5 ) ) * _sf2[i]; } \
		} \
		else if( _gridType.xDirichlet() ) \
		{ \
			{ int i=1;                        int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; dot += ( ( SUM_0 ## WHICH ) + ( _sf1_ ## WHICH [0] * mStencil[1][WHICH] ) ) * _sf2[i]; } \
			for( int i=2 ; i<_resX-2 ; i++ ){ int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; dot += ( ( SUM_1 ## WHICH ) + ( _sf1_ ## WHICH [0] * mStencil[1][WHICH] ) ) * _sf2[i]; } \
			{ int i=_resX-2;                  int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; dot += ( ( SUM_2 ## WHICH ) + ( _sf1_ ## WHICH [0] * mStencil[1][WHICH] ) ) * _sf2[i]; } \
		} \
	}

	_assertValidity( sf1 , "SoRParameterization::dotProduct" ) , _assertValidity( sf2 , "SoRParameterization::dotProduct" );

	threads = std::max< int >( 1 , threads );
	double dot = 0.;
	// Number of entries used to store a pole (none when the x-boundary is Dirichlet)
	int poleDim = _gridType.xDirichlet() ? 0 : 1;
	// Number of stored vertices per regular row
	int vDimX = _gridType.xDirichlet() ? _resX-2 : _resX;
#pragma omp parallel for num_threads( threads ) reduction( + : dot )
	for( int j=0 ; j<_resY ; j++ )
	{
		// Rows that carry no stored values contribute nothing
		if     ( j==0       && ( _gridType.yDirichlet0() || ( _gridType.yPole0() && _gridType.xDirichlet() ) ) ) continue;
		else if( j==_resY-1 && ( _gridType.yDirichlet1() || ( _gridType.yPole1() && _gridType.xDirichlet() ) ) ) continue;

		// Previous / current / next row, wrapped cyclically; boundary rows pick a macro variant that
		// never dereferences the wrapped pointer
		int j0 = (j+_resY-1) % _resY , j1 = j , j2 = (j+1)%_resY;
		const Stencil< 3 , 3 >& mStencil = _mStencils[j].stencil;
		const Real *_sf1_0 , *_sf1_1 , *_sf1_2 , *_sf2;
		if     ( _gridType.yPole0()      ) _sf2 = sf2() + poleDim + (j-1) * vDimX , _sf1_0 = sf1() + poleDim + (j0-1) * vDimX , _sf1_1 = sf1() + poleDim + (j1-1) * vDimX , _sf1_2 = sf1() + poleDim + (j2-1) * vDimX;
		else if( _gridType.yDirichlet0() ) _sf2 = sf2() +           (j-1) * vDimX , _sf1_0 = sf1() +           (j0-1) * vDimX , _sf1_1 = sf1() +           (j1-1) * vDimX , _sf1_2 = sf1() +           (j2-1) * vDimX;
		else                               _sf2 = sf2() +            j    * vDimX , _sf1_0 = sf1() +            j0    * vDimX , _sf1_1 = sf1() +            j1    * vDimX , _sf1_2 = sf1() +            j2    * vDimX;

		// With Dirichlet x-boundaries column 0 is not stored: shift so that index i maps to entry i-1
		if( _gridType.xDirichlet() ) _sf2-- , _sf1_0-- , _sf1_1-- , _sf1_2--;

		// The bottom pole is the very first stored value
		if( _gridType.yPole0() )
		{
			if     ( j==0 ) _sf1_1 = sf1() , _sf2 = sf2();
			else if( j==1 ) _sf1_0 = sf1();
		}

		if( j==0 && _gridType.yPole0() )
		{
			// Bottom pole: its own diagonal term plus the contribution of every vertex in row 1.
			// NOTE(review): this uses stencil columns 0, 1 and 2 of the pole row, whereas
			// screenedLaplacian only uses column 1 for the same row -- the two agree only if
			// mStencil[0][2] and mStencil[2][2] vanish for pole rows; confirm.
			double temp = _sf1_1[0] * mStencil[1][1];
			if( _gridType.xPeriodic() )
				for( int i=0 ; i<_resX ; i++ )
				{
					int i0 = (i+_resX-1) % _resX , i1 = i , i2 = (i+1) % _resX;
					temp += _sf1_2[i0] * mStencil[0][2] + _sf1_2[i1] * mStencil[1][2] + _sf1_2[i2] * mStencil[2][2] ;
				}
			else if( _gridType.xNeumann() )
			{
				{
					// First column: no left neighbour, centre weight halved
					int i=0;
					int i1 = i , i2 = (i+1) % _resX;
					temp += _sf1_2[i1] * mStencil[1][2] * 0.5 + _sf1_2[i2] * mStencil[2][2];
				}
				for( int i=1 ; i<_resX-1 ; i++ )
				{
					int i0 = (i+_resX-1) % _resX , i1 = i , i2 = (i+1) % _resX;
					temp += _sf1_2[i0] * mStencil[0][2] + _sf1_2[i1] * mStencil[1][2] + _sf1_2[i2] * mStencil[2][2];
				}
				{
					// Last column: no right neighbour, centre weight halved
					int i = _resX-1;
					int i0 = (i+_resX-1) % _resX , i1 = i;
					temp += _sf1_2[i0] * mStencil[0][2] + _sf1_2[i1] * mStencil[1][2] * 0.5;
				}
			}
			dot += temp * _sf2[0];
		}
		else if( j==_resY-1 && _gridType.yPole1() )
		{
			// Top pole: mirror image of the bottom-pole case, reading the row below (_sf1_0)
			double temp = _sf1_1[0] * mStencil[1][1];
			if( _gridType.xPeriodic() )
				for( int i=0 ; i<_resX ; i++ )
				{
					int i0 = (i+_resX-1) % _resX , i1 = i , i2 = (i+1) % _resX;
					temp += _sf1_0[i0] * mStencil[0][0] + _sf1_0[i1] * mStencil[1][0] + _sf1_0[i2] * mStencil[2][0] ;
				}
			else if( _gridType.xNeumann() )
			{
				{
					int i=0;
					int i1 = i , i2 = (i+1) % _resX;
					temp += _sf1_0[i1] * mStencil[1][0] * 0.5 + _sf1_0[i2] * mStencil[2][0];
				}
				for( int i=1 ; i<_resX-1 ; i++ )
				{
					int i0 = (i+_resX-1) % _resX , i1 = i , i2 = (i+1) % _resX;
					temp += _sf1_0[i0] * mStencil[0][0] + _sf1_0[i1] * mStencil[1][0] + _sf1_0[i2] * mStencil[2][0];
				}
				{
					int i = _resX-1;
					int i0 = (i+_resX-1) % _resX , i1 = i;
					temp += _sf1_0[i0] * mStencil[0][0] + _sf1_0[i1] * mStencil[1][0] * 0.5;
				}
			}
			dot += temp * _sf2[0];
		}
		// Rows next to an unstored boundary row drop that side of the stencil
		else if( j==1       && ( _gridType.yDirichlet0() || ( _gridType.yPole0() && _gridType.xDirichlet() ) ) ) SUM( 0 )
		else if( j==_resY-2 && ( _gridType.yDirichlet1() || ( _gridType.yPole1() && _gridType.xDirichlet() ) ) ) SUM( 2 )
		// Rows next to a stored pole couple to the single pole value
		else if( j==1       && _gridType.yPole0()      ) SUM_P( 0 )
		else if( j==_resY-2 && _gridType.yPole1()      ) SUM_P( 2 )
		// Neumann boundary rows only have a neighbour on one side
		else if( j==0       && _gridType.yNeumann0()   ) SUM  ( 0 )
		else if( j==_resY-1 && _gridType.yNeumann1()   ) SUM  ( 2 )
		// Interior row: full 3x3 stencil
		else                                             SUM  ( 1 )
	}
	return dot;
#undef SUM_00
#undef SUM_10
#undef SUM_20
#undef SUM_01
#undef SUM_11
#undef SUM_21
#undef SUM_02
#undef SUM_12
#undef SUM_22
#undef SUM_N_00
#undef SUM_N_20
#undef SUM_N_01
#undef SUM_N_21
#undef SUM_N_02
#undef SUM_N_22
#undef SUM
#undef SUM_P
}
// Returns the stencil-weighted inner product of two edge-based (derivative) fields.
// Every edge coefficient of vf1 is multiplied by a weighted sum of the vf2 coefficients on the
// edges of the band(s) incident on that edge; the weights come from the 4x4 per-band stencils
// produced by _dStencil and indexed by _T , _B , _L , _R (presumably the top / bottom / left /
// right edge of a cell -- confirm against their definition).
//
//   vf1 , vf2 : fields to pair; both are checked with _assertValidity.
//   threads   : requested OpenMP thread count, clamped to at least 1.
//
// dx() holds the horizontal-edge coefficients (fDimX per row) and dy() the vertical-edge
// coefficients (vDimX per band). With a pole or Dirichlet boundary at y=0 the dx rows are
// addressed with (row-1), i.e. no horizontal edges are stored for row 0.
template< class Real >
double SoRParameterization::dotProduct( const RegularGridFEM::template Derivative< Real , Real >& vf1 , const RegularGridFEM::template Derivative< Real , Real >& vf2 , int threads ) const
{
	_assertValidity( vf1 , "SoRParameterization::dotProduct" ) , _assertValidity( vf2 , "SoRParameterization::dotProduct" );

	double dot = 0;
	threads = std::max< int >( 1 , threads );
	// Row of zeros substituted for edge rows that do not exist, so that one formula serves every row.
	// Deliberately a per-call local rather than a function-level static: a static buffer resized on
	// each call is shared mutable state and would race if this const method ran concurrently.
	std::vector< Real > zeros( _resX , (Real)0. );

	int vDimX = _gridType.xDirichlet() ? _resX-2 : _resX;
	int fDimX = _gridType.xPeriodic() ? _resX : _resX-1;
	// The horizontal edges
#pragma omp parallel for num_threads( threads ) reduction( + : dot )
	for( int j=0 ; j<_resY ; j++ )
	{
		// No horizontal edges are stored on a pole / Dirichlet row
		if     ( j==0       && ( _gridType.yPole0() || _gridType.yDirichlet0() ) ) continue;
		else if( j==_resY-1 && ( _gridType.yPole1() || _gridType.yDirichlet1() ) ) continue;

		int j0 = (j+_resY-1) % _resY , j1 = j , j2 = (j+1) % _resY;
		// stencil0: band j0 (the row's edges enter it with index _T) ; stencil1: band j1 (index _B)
		double stencil0[4][4] , stencil1[4][4];
		_dStencil( j0 , stencil0 ) , _dStencil( j1 , stencil1 );

		ConstPointer( Real ) dx1;
		ConstPointer( Real ) dx2_0;
		ConstPointer( Real ) dx2_1;
		ConstPointer( Real ) dx2_2;
		ConstPointer( Real ) dy2_0 = vf2.dy() + j0 * vDimX;
		ConstPointer( Real ) dy2_1 = vf2.dy() + j1 * vDimX;
		if( _gridType.yPole0() || _gridType.yDirichlet0() ) dx1 = vf1.dx() + (j-1) * fDimX , dx2_0 = vf2.dx() + (j0-1) * fDimX , dx2_1 = vf2.dx() + (j1-1) * fDimX , dx2_2 = vf2.dx() + (j2-1) * fDimX;
		else                                                dx1 = vf1.dx() +  j    * fDimX , dx2_0 = vf2.dx() +  j0    * fDimX , dx2_1 = vf2.dx() +  j1    * fDimX , dx2_2 = vf2.dx() +  j2    * fDimX;

		// With Dirichlet x-boundaries vertical-edge column 0 is not stored: index i maps to entry i-1
		if( _gridType.xDirichlet() ) dy2_0-- , dy2_1--;

		// Replace rows that fall outside the stored data by zeros
		if     ( j==0       &&   _gridType.yNeumann0() )                           dx2_0 = dy2_0 = GetPointer( zeros );
		else if( j==_resY-1 &&   _gridType.yNeumann1() )                           dx2_2 = dy2_1 = GetPointer( zeros );
		else if( j==1       && ( _gridType.yPole0() || _gridType.yDirichlet0() ) ) dx2_0 =         GetPointer( zeros );
		else if( j==_resY-2 && ( _gridType.yPole1() || _gridType.yDirichlet1() ) ) dx2_2 =         GetPointer( zeros );
		if( _gridType.xPeriodic() )
		{
			for( int i=0 ; i<_resX ; i++ )
			{
				int i1 = i , i2 = (i+1) % _resX;
				double d = 0;
				d += stencil0[_T][_B] * dx2_0[i1] + ( stencil0[_T][_T] + stencil1[_B][_B] ) * dx2_1[i1] + stencil1[_B][_T] * dx2_2[i1];
				d += stencil0[_T][_L] * dy2_0[i1] + stencil0[_T][_R] * dy2_0[i2] + stencil1[_B][_L] * dy2_1[i1] + stencil1[_B][_R] * dy2_1[i2];
				dot += d * dx1[i];
			}

		}
		else if( _gridType.xNeumann() )
		{
			for( int i=0 ; i<_resX-1 ; i++ )
			{
				int i1 = i , i2 = (i+1) % _resX;
				double d = 0;
				d += stencil0[_T][_B] * dx2_0[i1] + ( stencil0[_T][_T] + stencil1[_B][_B] ) * dx2_1[i1] + stencil1[_B][_T] * dx2_2[i1];
				d += stencil0[_T][_L] * dy2_0[i1] + stencil0[_T][_R] * dy2_0[i2] + stencil1[_B][_L] * dy2_1[i1] + stencil1[_B][_R] * dy2_1[i2];
				dot += d * dx1[i];
			}
		}
		else if( _gridType.xDirichlet() )
		{
			{
				// First cell: the vertical edge on its _L side is not stored
				int i=0;
				int i1 = i , i2 = (i+1) % _resX;
				double d = 0;
				d += stencil0[_T][_B] * dx2_0[i1] + ( stencil0[_T][_T] + stencil1[_B][_B] ) * dx2_1[i1] + stencil1[_B][_T] * dx2_2[i1];
				d += stencil0[_T][_R] * dy2_0[i2] + stencil1[_B][_R] * dy2_1[i2];
				dot += d * dx1[i];
			}
			for( int i=1 ; i<_resX-2 ; i++ )
			{
				int i1 = i , i2 = (i+1) % _resX;
				double d = 0;
				d += stencil0[_T][_B] * dx2_0[i1] + ( stencil0[_T][_T] + stencil1[_B][_B] ) * dx2_1[i1] + stencil1[_B][_T] * dx2_2[i1];
				d += stencil0[_T][_L] * dy2_0[i1] + stencil0[_T][_R] * dy2_0[i2] + stencil1[_B][_L] * dy2_1[i1] + stencil1[_B][_R] * dy2_1[i2];
				dot += d * dx1[i];
			}
			{
				// Last cell: the vertical edge on its _R side is not stored
				int i=_resX-2;
				int i1 = i;
				double d = 0;
				d += stencil0[_T][_B] * dx2_0[i1] + ( stencil0[_T][_T] + stencil1[_B][_B] ) * dx2_1[i1] + stencil1[_B][_T] * dx2_2[i1];
				d += stencil0[_T][_L] * dy2_0[i1] + stencil1[_B][_L] * dy2_1[i1];
				dot += d * dx1[i];
			}
		}
	}
	// The vertical edges
#pragma omp parallel for num_threads( threads ) reduction( + : dot )
	for( int b=0 ; b<(int)bands() ; b++ )
	{
		// Rows of horizontal edges bounding band b
		int j0 = b , j1 = (b+1) % _resY;
		double stencil[4][4];
		_dStencil( b , stencil );

		ConstPointer( Real ) dy1 = vf1.dy() + b * vDimX;
		ConstPointer( Real ) dy2 = vf2.dy() + b * vDimX;
		ConstPointer( Real ) dx2_0;
		ConstPointer( Real ) dx2_1;

		if( _gridType.yPole0() || _gridType.yDirichlet0() ) dx2_0 = vf2.dx() + (j0-1) * fDimX , dx2_1 = vf2.dx() + (j1-1) * fDimX;
		else                                                dx2_0 = vf2.dx() +  j0    * fDimX , dx2_1 = vf2.dx() +  j1    * fDimX;

		// With Dirichlet x-boundaries vertical-edge column 0 is not stored: index i maps to entry i-1
		if( _gridType.xDirichlet() ) dy1-- , dy2--;


		// The first / last band has no stored horizontal edges on its pole / Dirichlet side
		if     ( b==0         && ( _gridType.yPole0() || _gridType.yDirichlet0() ) ) dx2_0 = GetPointer( zeros );
		else if( b==bands()-1 && ( _gridType.yPole1() || _gridType.yDirichlet1() ) ) dx2_1 = GetPointer( zeros );
		if( _gridType.xPeriodic() )
		{
			for( int i=0 ; i<_resX ; i++ )
			{
				int i0 = (i+_resX-1) % _resX , i1 = i , i2 = (i+1) % _resX;
				double d = 0;
				d += stencil[_R][_L] * dy2[i0] + ( stencil[_R][_R] + stencil[_L][_L] ) * dy2[i1] + stencil[_L][_R] * dy2[i2];
				d += stencil[_R][_B] * dx2_0[i0] + stencil[_R][_T] * dx2_1[i0] + stencil[_L][_B] * dx2_0[i1] + stencil[_L][_T] * dx2_1[i1];
				dot += d * dy1[i];
			}
		}
		else if( _gridType.xNeumann() )
		{
			{
				// First column: only the cell in which this edge has index _L exists
				int i=0;
				int i1 = i , i2 = (i+1) % _resX;
				double d = 0;
				d += stencil[_L][_L] * dy2[i1] + stencil[_L][_R] * dy2[i2];
				d += stencil[_L][_B] * dx2_0[i1] + stencil[_L][_T] * dx2_1[i1];
				dot += d * dy1[i];
			}
			for( int i=1 ; i<_resX-1 ; i++ )
			{
				int i0 = (i+_resX-1) % _resX , i1 = i , i2 = (i+1) % _resX;
				double d = 0;
				d += stencil[_R][_L] * dy2[i0] + ( stencil[_R][_R] + stencil[_L][_L] ) * dy2[i1] + stencil[_L][_R] * dy2[i2];
				d += stencil[_R][_B] * dx2_0[i0] + stencil[_R][_T] * dx2_1[i0] + stencil[_L][_B] * dx2_0[i1] + stencil[_L][_T] * dx2_1[i1];
				dot += d * dy1[i];
			}
			{
				// Last column: only the cell in which this edge has index _R exists
				int i=_resX-1;
				int i0 = (i+_resX-1) % _resX , i1 = i;
				double d = 0;
				d += stencil[_R][_L] * dy2[i0] + stencil[_R][_R] * dy2[i1];
				d += stencil[_R][_B] * dx2_0[i0] + stencil[_R][_T] * dx2_1[i0];
				dot += d * dy1[i];
			}
		}
		else if( _gridType.xDirichlet() )
		{
			{
				// First stored column: the vertical edge at i0 is not stored
				int i=1;
				int i0 = (i+_resX-1) % _resX , i1 = i , i2 = (i+1) % _resX;
				double d = 0;
				d += ( stencil[_R][_R] + stencil[_L][_L] ) * dy2[i1] + stencil[_L][_R] * dy2[i2];
				d += stencil[_R][_B] * dx2_0[i0] + stencil[_R][_T] * dx2_1[i0] + stencil[_L][_B] * dx2_0[i1] + stencil[_L][_T] * dx2_1[i1];
				dot += d * dy1[i];
			}
			for( int i=2 ; i<_resX-2 ; i++ )
			{
				int i0 = (i+_resX-1) % _resX , i1 = i , i2 = (i+1) % _resX;
				double d = 0;
				d += stencil[_R][_L] * dy2[i0] + ( stencil[_R][_R] + stencil[_L][_L] ) * dy2[i1] + stencil[_L][_R] * dy2[i2];
				d += stencil[_R][_B] * dx2_0[i0] + stencil[_R][_T] * dx2_1[i0] + stencil[_L][_B] * dx2_0[i1] + stencil[_L][_T] * dx2_1[i1];
				dot += d * dy1[i];
			}
			{
				// Last stored column: the vertical edge at i+1 is not stored
				int i=_resX-2;
				int i0 = (i+_resX-1) % _resX , i1 = i;
				double d = 0;
				d += stencil[_R][_L] * dy2[i0] + ( stencil[_R][_R] + stencil[_L][_L] ) * dy2[i1];
				d += stencil[_R][_B] * dx2_0[i0] + stencil[_R][_T] * dx2_1[i0] + stencil[_L][_B] * dx2_0[i1] + stencil[_L][_T] * dx2_1[i1];
				dot += d * dy1[i];
			}
		}
	}
	return dot;
}
// Applies the combined per-row stencil  mWeight * _mStencils[j] + sWeight * _sStencils[j]  to the
// scalar signal `in` and adds the result into `out` (presumably mass and stiffness stencils, so
// that the operator is a screened Laplacian -- confirm against where the stencils are built).
//
//   in      : input signal; checked with _assertValidity.
//   out     : output signal; resized to the grid with `!add` passed as the last resize argument
//             (presumably "clear existing contents" -- confirm against Signal::resize).
//   mWeight : weight applied to the _mStencils entries.
//   sWeight : weight applied to the _sStencils entries.
//   add     : when true the result is accumulated on top of the existing contents of `out`.
//   threads : requested OpenMP thread count, clamped to at least 1.
//
// Storage layout, as implied by the pointer arithmetic below:
//   * yPole0      : the pole value is entry 0, row j>=1 starts at poleDim + (j-1)*vDimX.
//   * yDirichlet0 : row 0 is not stored, row j>=1 starts at (j-1)*vDimX.
//   * otherwise   : row j starts at j*vDimX.
//   * xDirichlet  : only columns 1.._resX-2 are stored (vDimX = _resX-2); row pointers are
//                   decremented by one so that a column index i addresses stored entry i-1.
template< class Data , class Real >
void SoRParameterization::screenedLaplacian( const RegularGridFEM::template Signal< Data , Real >& in , RegularGridFEM::template Signal< Data , Real >& out , double mWeight , double sWeight , bool add , int threads ) const
{
// SUM_<X><Y> evaluates one row of the combined stencil at column i1.
//   X selects the horizontal neighbours used: 0 -> only i2 , 1 -> i0 and i2 , 2 -> only i0.
//   Y selects the rows used: 0 -> rows j and j+1 , 1 -> rows j-1 , j and j+1 , 2 -> rows j-1 and j.
// The off-centre columns i0 and i2 share the weight stencil[0][*], i.e. the stencil is treated as
// left/right symmetric.
#define SUM_00 (                                                                  (         _in1[i2]) * stencil[0][1] + _in1[i1] * stencil[1][1] + (         _in2[i2]) * stencil[0][2] + _in2[i1] * stencil[1][2] )
#define SUM_10 (                                                                  (_in1[i0]+_in1[i2]) * stencil[0][1] + _in1[i1] * stencil[1][1] + (_in2[i0]+_in2[i2]) * stencil[0][2] + _in2[i1] * stencil[1][2] )
#define SUM_20 (                                                                  (_in1[i0]         ) * stencil[0][1] + _in1[i1] * stencil[1][1] + (_in2[i0]         ) * stencil[0][2] + _in2[i1] * stencil[1][2] )
#define SUM_01 ( (         _in0[i2]) * stencil[0][0] + _in0[i1] * stencil[1][0] + (         _in1[i2]) * stencil[0][1] + _in1[i1] * stencil[1][1] + (         _in2[i2]) * stencil[0][2] + _in2[i1] * stencil[1][2] )
#define SUM_11 ( (_in0[i0]+_in0[i2]) * stencil[0][0] + _in0[i1] * stencil[1][0] + (_in1[i0]+_in1[i2]) * stencil[0][1] + _in1[i1] * stencil[1][1] + (_in2[i0]+_in2[i2]) * stencil[0][2] + _in2[i1] * stencil[1][2] )
#define SUM_21 ( (_in0[i0]         ) * stencil[0][0] + _in0[i1] * stencil[1][0] + (_in1[i0]         ) * stencil[0][1] + _in1[i1] * stencil[1][1] + (_in2[i0]         ) * stencil[0][2] + _in2[i1] * stencil[1][2] )
#define SUM_02 ( (         _in0[i2]) * stencil[0][0] + _in0[i1] * stencil[1][0] + (         _in1[i2]) * stencil[0][1] + _in1[i1] * stencil[1][1]                                                                  )
#define SUM_12 ( (_in0[i0]+_in0[i2]) * stencil[0][0] + _in0[i1] * stencil[1][0] + (_in1[i0]+_in1[i2]) * stencil[0][1] + _in1[i1] * stencil[1][1]                                                                  )
#define SUM_22 ( (_in0[i0]         ) * stencil[0][0] + _in0[i1] * stencil[1][0] + (_in1[i0]         ) * stencil[0][1] + _in1[i1] * stencil[1][1]                                                                  )

// SUM_N_<X><Y>: same as SUM_<X><Y> for the one-sided cases X=0 / X=2, but with the centre-column
// weight halved. Used on the first / last column when the x-boundary is Neumann.
#define SUM_N_00 (                                                                              (         _in1[i2]) * stencil[0][1] + _in1[i1] * stencil[1][1] * (Real)0.5 + (         _in2[i2]) * stencil[0][2] + _in2[i1] * stencil[1][2] * (Real)0.5 )
#define SUM_N_20 (                                                                              (_in1[i0]         ) * stencil[0][1] + _in1[i1] * stencil[1][1] * (Real)0.5 + (_in2[i0]         ) * stencil[0][2] + _in2[i1] * stencil[1][2] * (Real)0.5 )
#define SUM_N_01 ( (         _in0[i2]) * stencil[0][0] + _in0[i1] * stencil[1][0] * (Real)0.5 + (         _in1[i2]) * stencil[0][1] + _in1[i1] * stencil[1][1] * (Real)0.5 + (         _in2[i2]) * stencil[0][2] + _in2[i1] * stencil[1][2] * (Real)0.5 )
#define SUM_N_21 ( (_in0[i0]         ) * stencil[0][0] + _in0[i1] * stencil[1][0] * (Real)0.5 + (_in1[i0]         ) * stencil[0][1] + _in1[i1] * stencil[1][1] * (Real)0.5 + (_in2[i0]         ) * stencil[0][2] + _in2[i1] * stencil[1][2] * (Real)0.5 )
#define SUM_N_02 ( (         _in0[i2]) * stencil[0][0] + _in0[i1] * stencil[1][0] * (Real)0.5 + (         _in1[i2]) * stencil[0][1] + _in1[i1] * stencil[1][1] * (Real)0.5                                                                              )
#define SUM_N_22 ( (_in0[i0]         ) * stencil[0][0] + _in0[i1] * stencil[1][0] * (Real)0.5 + (_in1[i0]         ) * stencil[0][1] + _in1[i1] * stencil[1][1] * (Real)0.5                                                                              )

// SUM( WHICH ): accumulates one full row into `_out`; WHICH is the row selector Y described above.
// The column variant is chosen from the x-boundary type.
#define SUM( WHICH ) \
	{ \
		if( _gridType.xPeriodic() ) \
			for( int i=0 ; i<_resX ; i++ ){ int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; _out[i] += SUM_1 ## WHICH; } \
		else if( _gridType.xNeumann() ) \
		{ \
			{ int i=0;                        int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; _out[i] += SUM_N_0 ## WHICH; } \
			for( int i=1 ; i<_resX-1 ; i++ ){ int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; _out[i] += SUM_1   ## WHICH; } \
			{ int i=_resX-1;                  int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; _out[i] += SUM_N_2 ## WHICH; } \
		} \
		else if( _gridType.xDirichlet() ) \
		{ \
			{ int i=1;                        int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; _out[i] += SUM_0 ## WHICH ; } \
			for( int i=2 ; i<_resX-2 ; i++ ){ int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; _out[i] += SUM_1 ## WHICH ; } \
			{ int i=_resX-2;                  int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; _out[i] += SUM_2 ## WHICH ; } \
		} \
	}
// SUM_P( WHICH ): row adjacent to a pole. WHICH (0 or 2) names the pole side; the regular row sum
// skips that side and the single pole value _in<WHICH>[0] is added with weight stencil[1][WHICH]
// (halved on the Neumann boundary columns).
#define SUM_P( WHICH ) \
	{ \
		if( _gridType.xPeriodic() ) \
			for( int i=0 ; i<_resX ; i++ ){ int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; _out[i] += SUM_1 ## WHICH + ( _in ## WHICH [0] * stencil[1][WHICH] ) ; } \
		else if( _gridType.xNeumann() ) \
		{ \
			{ int i=0;                        int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; _out[i] += SUM_N_0 ## WHICH + ( _in ## WHICH [0] * stencil[1][WHICH] * (Real)0.5 ) ; } \
			for( int i=1 ; i<_resX-1 ; i++ ){ int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; _out[i] += SUM_1   ## WHICH + ( _in ## WHICH [0] * stencil[1][WHICH]             ) ; } \
			{ int i = _resX-1;                int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; _out[i] += SUM_N_2 ## WHICH + ( _in ## WHICH [0] * stencil[1][WHICH] * (Real)0.5 ) ; } \
		} \
		else if( _gridType.xDirichlet() ) \
		{ \
			{ int i=1;                        int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; _out[i] += SUM_0 ## WHICH + ( _in ## WHICH [0] * stencil[1][WHICH] ) ; } \
			for( int i=2 ; i<_resX-2 ; i++ ){ int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; _out[i] += SUM_1 ## WHICH + ( _in ## WHICH [0] * stencil[1][WHICH] ) ; } \
			{ int i=_resX-2;                  int i0 = (i-1+_resX)%_resX , i1 = i , i2 = (i+1)%_resX ; _out[i] += SUM_2 ## WHICH + ( _in ## WHICH [0] * stencil[1][WHICH] ) ; } \
		} \
	}

	_assertValidity( in , "SoRParameterization::screenedLaplacian" );

	out.resize( _resX , _resY , in.gridType() , !add );
	// num_threads requires a positive value; clamp as the dotProduct overloads do
	threads = std::max< int >( 1 , threads );
	// Number of entries used to store a pole (none when the x-boundary is Dirichlet)
	int poleDim = _gridType.xDirichlet() ? 0 : 1;
	// Number of stored vertices per regular row
	int vDimX = _gridType.xDirichlet() ? _resX-2 : _resX;
#pragma omp parallel for num_threads( threads )
	for( int j=0 ; j<_resY ; j++ )
	{
		// Rows that carry no stored values produce no output
		if     ( j==0       && ( _gridType.yDirichlet0() || ( _gridType.yPole0() && _gridType.xDirichlet() ) ) ) continue;
		else if( j==_resY-1 && ( _gridType.yDirichlet1() || ( _gridType.yPole1() && _gridType.xDirichlet() ) )) continue;

		// Previous / current / next row, wrapped cyclically; boundary rows pick a macro variant that
		// never dereferences the wrapped pointer
		int j0 = (j+_resY-1) % _resY , j1 = j , j2 = (j+1) % _resY;
		Pointer( Data ) _out;
		ConstPointer( Data ) _in0;
		ConstPointer( Data ) _in1;
		ConstPointer( Data ) _in2;
		if     ( _gridType.yPole0()      ) _out = out() + poleDim + (j-1) * vDimX , _in0 = in() + poleDim + (j0-1) * vDimX , _in1 = in() + poleDim + (j1-1) * vDimX , _in2 = in() + poleDim + (j2-1) * vDimX;
		else if( _gridType.yDirichlet0() ) _out = out() +           (j-1) * vDimX , _in0 = in() +           (j0-1) * vDimX , _in1 = in() +           (j1-1) * vDimX , _in2 = in() +           (j2-1) * vDimX;
		else                               _out = out() +            j    * vDimX , _in0 = in() +            j0    * vDimX , _in1 = in() +            j1    * vDimX , _in2 = in() +            j2    * vDimX;

		// With Dirichlet x-boundaries column 0 is not stored: shift so that index i maps to entry i-1
		if( _gridType.xDirichlet() ) _out-- , _in0-- , _in1-- , _in2--;

		// The bottom pole is the very first stored value
		if( _gridType.yPole0() )
		{
			if     ( j==0 ) _out = out() , _in1 = in();
			else if( j==1 ) _in0 = in();
		}

		// Blend the two per-row stencils into the one actually applied
		Stencil< 3 , 3 , Real > stencil;
		const Stencil< 3 , 3 >& mStencil = _mStencils[j].stencil;
		const Stencil< 3 , 3 >& sStencil = _sStencils[j].stencil;
		for( int r=0 ; r<3 ; r++ ) for( int c=0 ; c<3 ; c++ ) stencil[r][c] = (Real)( mStencil[r][c]*mWeight + sStencil[r][c]*sWeight );

		if( j==0 && _gridType.yPole0() )
		{
			// Bottom pole: its own diagonal term plus every vertex of row 1 (boundary columns halved
			// under Neumann)
			Data temp = _in1[0] * stencil[1][1];
			if( _gridType.xPeriodic() )
				for( int i=0 ; i<_resX ; i++ ) temp += _in2[i] * stencil[1][2];
			else if( _gridType.xNeumann() )
			{
				{ int i=0;                       temp += _in2[i] * stencil[1][2]*(Real)0.5; }
				for( int i=1 ; i<_resX-1 ; i++ ) temp += _in2[i] * stencil[1][2];
				{ int i=_resX-1;                 temp += _in2[i] * stencil[1][2]*(Real)0.5; }
			}
			_out[0] += temp;
		}
		else if( j==_resY-1 && _gridType.yPole1() )
		{
			// Top pole: mirror image of the bottom-pole case, reading the row below (_in0)
			Data temp = _in1[0] * stencil[1][1];
			if( _gridType.xPeriodic() )
				for( int i=0 ; i<_resX ; i++ ) temp += _in0[i] * stencil[1][0];
			else if( _gridType.xNeumann() )
			{
				{ int i=0;                       temp += _in0[i] * stencil[1][0]*(Real)0.5; }
				for( int i=1 ; i<_resX-1 ; i++ ) temp += _in0[i] * stencil[1][0];
				{ int i=_resX-1;                 temp += _in0[i] * stencil[1][0]*(Real)0.5; }
			}
			_out[0] += temp;
		}
		// Rows next to an unstored boundary row drop that side of the stencil
		else if( j==1       && ( _gridType.yDirichlet0() || ( _gridType.yPole0() && _gridType.xDirichlet() ) ) ) SUM( 0 )
		else if( j==_resY-2 && ( _gridType.yDirichlet1() || ( _gridType.yPole1() && _gridType.xDirichlet() ) ) ) SUM( 2 )
		// Rows next to a stored pole couple to the single pole value
		else if( j==1       && _gridType.yPole0()      ) SUM_P( 0 )
		else if( j==_resY-2 && _gridType.yPole1()      ) SUM_P( 2 )
		// Neumann boundary rows only have a neighbour on one side
		else if( j==0       && _gridType.yNeumann0()   ) SUM  ( 0 )
		else if( j==_resY-1 && _gridType.yNeumann1()   ) SUM  ( 2 )
		// Interior row: full 3x3 stencil
		else                                             SUM  ( 1 )
	}
#undef SUM_00
#undef SUM_10
#undef SUM_20
#undef SUM_01
#undef SUM_11
#undef SUM_21
#undef SUM_02
#undef SUM_12
#undef SUM_22
#undef SUM_N_00
#undef SUM_N_20
#undef SUM_N_01
#undef SUM_N_21
#undef SUM_N_02
#undef SUM_N_22
#undef SUM
#undef SUM_P
}

// Dual of a scalar signal: forwards to screenedLaplacian() with the weight pair (1,0).
// The two weights are presumably the mass and stiffness weights, which would make this a pure mass-matrix product -- confirm against screenedLaplacian().
template< class Data , class Real >
void SoRParameterization::dual( const RegularGridFEM::template Signal< Data , Real >& in , RegularGridFEM::template Signal< Data , Real >& out , int threads ) const
{
	const double firstWeight = 1.;
	const double secondWeight = 0.;
	screenedLaplacian( in , out , firstWeight , secondWeight , false , threads );
}
// Applies the derivative stencils returned by _dStencil() to the edge-sampled field "in" and stores the result in "out".
//   in      : input derivative; _assertValidity() checks it against this object's resolution and grid type
//   out     : resized to _resX x _resY with in's grid type; its dx() values are written by the first loop, its dy() values by the second
//   threads : OpenMP thread count used by both loops (clamped to at least 1)
template< class Data , class Real >
void SoRParameterization::dual( const RegularGridFEM::template Derivative< Data , Real >& in , RegularGridFEM::template Derivative< Data , Real >& out , int threads ) const
{
	_assertValidity( in , "SoRParameterization::dual" );
	out.resize( _resX , _resY , in.gridType() );

	threads = std::max< int >( 1 , threads );
	// A row of zeros that stands in for neighboring rows that are not available (see the pointer fix-ups below).
	// NOTE(review): this buffer is a function-local static that is resized on every call; concurrent calls with
	// different _resX would race on it -- confirm dual() is never invoked concurrently.
	static std::vector< Real > zeros;
	zeros.resize( _resX , (Real)0. );

	// Row strides used to address dy() (vDimX) and dx() (fDimX)
	int vDimX = _gridType.xDirichlet() ? _resX-2 : _resX;
	int fDimX = _gridType.xPeriodic() ? _resX : _resX-1;
	// The horizontal edges: one pass per grid row, writing out.dx()
#pragma omp parallel for num_threads( threads )
	for( int j=0 ; j<_resY ; j++ )
	{
		// No dx values are computed for a first/last row that is a pole or a y-Dirichlet boundary
		if     ( j==0       && ( _gridType.yPole0() || _gridType.yDirichlet0() ) ) continue;
		else if( j==_resY-1 && ( _gridType.yPole1() || _gridType.yDirichlet1() ) ) continue;

		// Previous / current / next row (with wrap-around) and the stencils fetched for indices j0 and j1
		int j0 = (j+_resY-1) % _resY , j1 = j , j2 = (j+1) % _resY;
		double stencil0[4][4] , stencil1[4][4];
		_dStencil( j0 , stencil0 ) , _dStencil( j1 , stencil1 );

		Pointer( Real ) dx1;
		ConstPointer( Real ) dx2_0;
		ConstPointer( Real ) dx2_1;
		ConstPointer( Real ) dx2_2;
		ConstPointer( Real ) dy2_0 = in.dy() + j0 * vDimX;
		ConstPointer( Real ) dy2_1 = in.dy() + j1 * vDimX;
		// With a pole or Dirichlet boundary at y0, dx row j is addressed at row offset j-1
		if( _gridType.yPole0() || _gridType.yDirichlet0() ) dx1 = out.dx() + (j-1) * fDimX , dx2_0 = in.dx() + (j0-1) * fDimX , dx2_1 = in.dx() + (j1-1) * fDimX , dx2_2 = in.dx() + (j2-1) * fDimX;
		else                                                dx1 = out.dx() +  j    * fDimX , dx2_0 = in.dx() +  j0    * fDimX , dx2_1 = in.dx() +  j1    * fDimX , dx2_2 = in.dx() +  j2    * fDimX;

		// Under x-Dirichlet the dy pointers are shifted back by one so they can be indexed by column number;
		// the x-Dirichlet branch below only reads columns 1 through _resX-2
		if( _gridType.xDirichlet() ) dy2_0-- , dy2_1--;

		// Neighboring rows beyond a Neumann boundary, or on a skipped pole/Dirichlet row, are replaced by zeros
		if     ( j==0       &&   _gridType.yNeumann0() )                           dx2_0 = dy2_0 = GetPointer( zeros );
		else if( j==_resY-1 &&   _gridType.yNeumann1() )                           dx2_2 = dy2_1 = GetPointer( zeros );
		else if( j==1       && ( _gridType.yPole0() || _gridType.yDirichlet0() ) ) dx2_0 =         GetPointer( zeros );
		else if( j==_resY-2 && ( _gridType.yPole1() || _gridType.yDirichlet1() ) ) dx2_2 =         GetPointer( zeros );
		if( _gridType.xPeriodic() )
		{
			for( int i=0 ; i<_resX ; i++ )
			{
				int i1 = i , i2 = (i+1) % _resX;
				double d = 0;
				// dx contributions from the same column in rows j0, j1 and j2
				d += stencil0[_T][_B] * dx2_0[i1] + ( stencil0[_T][_T] + stencil1[_B][_B] ) * dx2_1[i1] + stencil1[_B][_T] * dx2_2[i1];
				// dy contributions from columns i1 and i2 of the two dy rows
				d += stencil0[_T][_L] * dy2_0[i1] + stencil0[_T][_R] * dy2_0[i2] + stencil1[_B][_L] * dy2_1[i1] + stencil1[_B][_R] * dy2_1[i2];
				dx1[i] = (Real)d;
			}

		}
		else if( _gridType.xNeumann() )
		{
			// Same formula as the periodic case, but only _resX-1 values per row so i2 never wraps
			for( int i=0 ; i<_resX-1 ; i++ )
			{
				int i1 = i , i2 = (i+1) % _resX;
				double d = 0;
				d += stencil0[_T][_B] * dx2_0[i1] + ( stencil0[_T][_T] + stencil1[_B][_B] ) * dx2_1[i1] + stencil1[_B][_T] * dx2_2[i1];
				d += stencil0[_T][_L] * dy2_0[i1] + stencil0[_T][_R] * dy2_0[i2] + stencil1[_B][_L] * dy2_1[i1] + stencil1[_B][_R] * dy2_1[i2];
				dx1[i] = (Real)d;
			}
		}
		else if( _gridType.xDirichlet() )
		{
			// First value of the row: the _L dy terms (column 0) are omitted
			{
				int i=0;
				int i1 = i , i2 = (i+1) % _resX;
				double d = 0;
				d += stencil0[_T][_B] * dx2_0[i1] + ( stencil0[_T][_T] + stencil1[_B][_B] ) * dx2_1[i1] + stencil1[_B][_T] * dx2_2[i1];
				d += stencil0[_T][_R] * dy2_0[i2] + stencil1[_B][_R] * dy2_1[i2];
				dx1[i] = (Real)d;
			}
			for( int i=1 ; i<_resX-2 ; i++ )
			{
				int i1 = i , i2 = (i+1) % _resX;
				double d = 0;
				d += stencil0[_T][_B] * dx2_0[i1] + ( stencil0[_T][_T] + stencil1[_B][_B] ) * dx2_1[i1] + stencil1[_B][_T] * dx2_2[i1];
				d += stencil0[_T][_L] * dy2_0[i1] + stencil0[_T][_R] * dy2_0[i2] + stencil1[_B][_L] * dy2_1[i1] + stencil1[_B][_R] * dy2_1[i2];
				dx1[i] = (Real)d;
			}
			// Last value of the row: the _R dy terms (column _resX-1) are omitted
			{
				int i=_resX-2;
				int i1 = i , i2 = (i+1) % _resX;
				double d = 0;
				d += stencil0[_T][_B] * dx2_0[i1] + ( stencil0[_T][_T] + stencil1[_B][_B] ) * dx2_1[i1] + stencil1[_B][_T] * dx2_2[i1];
				d += stencil0[_T][_L] * dy2_0[i1] + stencil1[_B][_L] * dy2_1[i1];
				dx1[i] = (Real)d;
			}
		}
	}
	// The vertical edges: one pass per band, writing out.dy()
#pragma omp parallel for num_threads( threads )
	for( int b=0 ; b<(int)bands() ; b++ )
	{
		// The two dx rows read for this band (j1 wraps around) and the band's stencil
		int j0 = b , j1 = (b+1) % _resY;
		double stencil[4][4];
		_dStencil( b , stencil );

		Pointer( Real ) dy1 = out.dy() + b * vDimX;
		ConstPointer( Real ) dy2 = in.dy() + b * vDimX;
		ConstPointer( Real ) dx2_0;
		ConstPointer( Real ) dx2_1;

		// Same one-row offset for dx as in the horizontal pass
		if( _gridType.yPole0() || _gridType.yDirichlet0() ) dx2_0 = in.dx() + (j0-1) * fDimX , dx2_1 = in.dx() + (j1-1) * fDimX;
		else                                                dx2_0 = in.dx() +  j0    * fDimX , dx2_1 = in.dx() +  j1    * fDimX;

		// Under x-Dirichlet, shift the dy pointers so they can be indexed by column number (columns 1 through _resX-2)
		if( _gridType.xDirichlet() ) dy1-- , dy2--;


		// dx rows on a pole or y-Dirichlet boundary are replaced by zeros
		if     ( b==0         && ( _gridType.yPole0() || _gridType.yDirichlet0() ) ) dx2_0 = GetPointer( zeros );
		else if( b==bands()-1 && ( _gridType.yPole1() || _gridType.yDirichlet1() ) ) dx2_1 = GetPointer( zeros );
		if( _gridType.xPeriodic() )
		{
			for( int i=0 ; i<_resX ; i++ )
			{
				int i0 = (i+_resX-1) % _resX , i1 = i , i2 = (i+1) % _resX;
				double d = 0;
				// dy contributions from columns i0, i1 and i2 of this band
				d += stencil[_R][_L] * dy2[i0] + ( stencil[_R][_R] + stencil[_L][_L] ) * dy2[i1] + stencil[_L][_R] * dy2[i2];
				// dx contributions from entries i0 and i1 of the two dx rows
				d += stencil[_R][_B] * dx2_0[i0] + stencil[_R][_T] * dx2_1[i0] + stencil[_L][_B] * dx2_0[i1] + stencil[_L][_T] * dx2_1[i1];
				dy1[i] = (Real)d;
			}
		}
		else if( _gridType.xNeumann() )
		{
			// First column: only the terms indexed by i1 and i2 are used (nothing is read at i0)
			{
				int i=0;
				int i0 = (i+_resX-1) % _resX , i1 = i , i2 = (i+1) % _resX;
				double d = 0;
				d += stencil[_L][_L] * dy2[i1] + stencil[_L][_R] * dy2[i2];
				d += stencil[_L][_B] * dx2_0[i1] + stencil[_L][_T] * dx2_1[i1];
				dy1[i] = (Real)d;
			}
			for( int i=1 ; i<_resX-1 ; i++ )
			{
				int i0 = (i+_resX-1) % _resX , i1 = i , i2 = (i+1) % _resX;
				double d = 0;
				d += stencil[_R][_L] * dy2[i0] + ( stencil[_R][_R] + stencil[_L][_L] ) * dy2[i1] + stencil[_L][_R] * dy2[i2];
				d += stencil[_R][_B] * dx2_0[i0] + stencil[_R][_T] * dx2_1[i0] + stencil[_L][_B] * dx2_0[i1] + stencil[_L][_T] * dx2_1[i1];
				dy1[i] = (Real)d;
			}
			// Last column: only the terms indexed by i0 and i1 are used (nothing is read at i2)
			{
				int i=_resX-1;
				int i0 = (i+_resX-1) % _resX , i1 = i , i2 = (i+1) % _resX;
				double d = 0;
				d += stencil[_R][_L] * dy2[i0] + stencil[_R][_R] * dy2[i1];
				d += stencil[_R][_B] * dx2_0[i0] + stencil[_R][_T] * dx2_1[i0];
				dy1[i] = (Real)d;
			}
		}
		else if( _gridType.xDirichlet() )
		{
			// Column 1: the dy term at column 0 is omitted
			{
				int i=1;
				int i0 = (i+_resX-1) % _resX , i1 = i , i2 = (i+1) % _resX;
				double d = 0;
				d += ( stencil[_R][_R] + stencil[_L][_L] ) * dy2[i1] + stencil[_L][_R] * dy2[i2];
				d += stencil[_R][_B] * dx2_0[i0] + stencil[_R][_T] * dx2_1[i0] + stencil[_L][_B] * dx2_0[i1] + stencil[_L][_T] * dx2_1[i1];
				dy1[i] = (Real)d;
			}
			for( int i=2 ; i<_resX-2 ; i++ )
			{
				int i0 = (i+_resX-1) % _resX , i1 = i , i2 = (i+1) % _resX;
				double d = 0;
				d += stencil[_R][_L] * dy2[i0] + ( stencil[_R][_R] + stencil[_L][_L] ) * dy2[i1] + stencil[_L][_R] * dy2[i2];
				d += stencil[_R][_B] * dx2_0[i0] + stencil[_R][_T] * dx2_1[i0] + stencil[_L][_B] * dx2_0[i1] + stencil[_L][_T] * dx2_1[i1];
				dy1[i] = (Real)d;
			}
			// Column _resX-2: the dy term at column _resX-1 is omitted
			{
				int i=_resX-2;
				int i0 = (i+_resX-1) % _resX , i1 = i , i2 = (i+1) % _resX;
				double d = 0;
				d += stencil[_R][_L] * dy2[i0] + ( stencil[_R][_R] + stencil[_L][_L] ) * dy2[i1];
				d += stencil[_R][_B] * dx2_0[i0] + stencil[_R][_T] * dx2_1[i0] + stencil[_L][_B] * dx2_0[i1] + stencil[_L][_T] * dx2_1[i1];
				dy1[i] = (Real)d;
			}
		}
	}
}
// Assembles the sparse mass and stiffness matrices from the per-row 3x3 stencils (_mStencils / _sStencils).
//   mass , stiffness : resized to RegularGridFEM::Dim( _resX , _resY , _gridType ) rows; both receive the same sparsity pattern
//   threads          : OpenMP thread count for the loop over grid rows (clamped to at least 1)
// Every matrix row is sized and filled exactly once; different grid rows j touch disjoint matrix rows.
template< class Real >
void SoRParameterization::poissonSystem( SparseMatrix< Real , int >& mass , SparseMatrix< Real , int >& stiffness , int threads ) const
{
	int sz = RegularGridFEM::Dim( _resX , _resY , _gridType );
	mass.resize( sz ) , stiffness.resize( sz );

	// A non-positive thread count is not valid in a num_threads clause
	threads = std::max< int >( 1 , threads );

	// The SET* macros fill the matrix row of vertex (i,j) over the stencil rows STARTJ..ENDJ (relative to j).
	// They rely on i, idx[], mStencil and sStencil being in scope at the point of expansion.

	// SET0: stencil columns i and i+1 only
#define SET0( STARTJ , ENDJ )                                                                                           \
	{                                                                                                                   \
		int N = idx[1] + i;                                                                                             \
		mass.SetRowSize( N , 2 * ( ENDJ - STARTJ + 1 ) ) , stiffness.SetRowSize( N , 2 * ( ENDJ - STARTJ + 1 ) );       \
		for( int index=0 , jj=STARTJ ; jj<=ENDJ ; jj++ ) for( int ii=0 ; ii<=1 ; ii++ , index++ )                       \
		{                                                                                                               \
			mass[N][index].N = stiffness[N][index].N = idx[jj+1] + ( i + ii + _resX ) % _resX;                          \
			mass[N][index].Value = (Real)mStencil[ii+1][jj+1] , stiffness[N][index].Value = (Real)sStencil[ii+1][jj+1]; \
		}                                                                                                               \
	}
	// SET1: stencil columns i-1, i and i+1 (column index wraps modulo _resX)
#define SET1( STARTJ , ENDJ )                                                                                           \
	{                                                                                                                   \
		int N = idx[1] + i;                                                                                             \
		mass.SetRowSize( N , 3 * ( ENDJ - STARTJ + 1 ) ) , stiffness.SetRowSize( N , 3 * ( ENDJ - STARTJ + 1 ) );       \
		for( int index=0 , jj=STARTJ ; jj<=ENDJ ; jj++ ) for( int ii=-1 ; ii<=1 ; ii++ , index++ )                      \
		{                                                                                                               \
			mass[N][index].N = stiffness[N][index].N = idx[jj+1] + ( i + ii + _resX ) % _resX;                          \
			mass[N][index].Value = (Real)mStencil[ii+1][jj+1] , stiffness[N][index].Value = (Real)sStencil[ii+1][jj+1]; \
		}                                                                                                               \
	}
	// SET2: stencil columns i-1 and i only
#define SET2( STARTJ , ENDJ )                                                                                           \
	{                                                                                                                   \
		int N = idx[1] + i;                                                                                             \
		mass.SetRowSize( N , 2 * ( ENDJ - STARTJ + 1 ) ) , stiffness.SetRowSize( N , 2 * ( ENDJ - STARTJ + 1 ) );       \
		for( int index=0 , jj=STARTJ ; jj<=ENDJ ; jj++ ) for( int ii=-1 ; ii<=0 ; ii++ , index++ )                      \
		{                                                                                                               \
			mass[N][index].N = stiffness[N][index].N = idx[jj+1] + ( i + ii + _resX ) % _resX;                          \
			mass[N][index].Value = (Real)mStencil[ii+1][jj+1] , stiffness[N][index].Value = (Real)sStencil[ii+1][jj+1]; \
		}                                                                                                               \
	}
	// SET0_N: like SET0, but the entries of the center stencil column are halved (used for the first column under x-Neumann)
#define SET0_N( STARTJ , ENDJ )                                                                                                   \
	{                                                                                                                             \
		int N = idx[1] + i;                                                                                                       \
		mass.SetRowSize( N , 2 * ( ENDJ - STARTJ + 1 ) ) , stiffness.SetRowSize( N , 2 * ( ENDJ - STARTJ + 1 ) );                 \
		for( int index=0 , jj=STARTJ ; jj<=ENDJ ; jj++ )                                                                          \
		{                                                                                                                         \
			mass[N][index].N = stiffness[N][index].N = idx[jj+1] + ( i + 0 + _resX ) % _resX;                                     \
			mass[N][index].Value = (Real)( mStencil[1][jj+1]*0.5 ) , stiffness[N][index].Value = (Real)( sStencil[1][jj+1]*0.5 ); \
			index++;                                                                                                              \
			mass[N][index].N = stiffness[N][index].N = idx[jj+1] + ( i + 1 + _resX ) % _resX;                                     \
			mass[N][index].Value = (Real)mStencil[2][jj+1] , stiffness[N][index].Value = (Real)sStencil[2][jj+1];                 \
			index++;                                                                                                              \
		}                                                                                                                         \
	}
	// SET2_N: like SET2, but the entries of the center stencil column are halved (used for the last column under x-Neumann)
#define SET2_N( STARTJ , ENDJ )                                                                                                   \
	{                                                                                                                             \
		int N = idx[1] + i;                                                                                                       \
		mass.SetRowSize( N , 2 * ( ENDJ - STARTJ + 1 ) ) , stiffness.SetRowSize( N , 2 * ( ENDJ - STARTJ + 1 ) );                 \
		for( int index=0 , jj=STARTJ ; jj<=ENDJ ; jj++ )                                                                          \
		{                                                                                                                         \
			mass[N][index].N = stiffness[N][index].N = idx[jj+1] + ( i - 1 + _resX ) % _resX;                                     \
			mass[N][index].Value = (Real)mStencil[0][jj+1] , stiffness[N][index].Value = (Real)sStencil[0][jj+1];                 \
			index++;                                                                                                              \
			mass[N][index].N = stiffness[N][index].N = idx[jj+1] + ( i + 0 + _resX ) % _resX;                                     \
			mass[N][index].Value = (Real)( mStencil[1][jj+1]*0.5 ) , stiffness[N][index].Value = (Real)( sStencil[1][jj+1]*0.5 ); \
			index++;                                                                                                              \
		}                                                                                                                         \
	}

	// SET_ROW: fills all matrix rows of grid row j, picking the column variant that matches the x boundary type
#define SET_ROW( STARTJ , ENDJ )                                    \
	if( _gridType.xPeriodic() )                                     \
		for( int i=0 ; i<_resX ; i++ ) { SET1( STARTJ , ENDJ ); }   \
	else if( _gridType.xNeumann() )                                 \
	{                                                               \
		{ int i=0       ; SET0_N( STARTJ , ENDJ ) ; }               \
		for( int i=1 ; i<_resX-1 ; i++ ) { SET1( STARTJ , ENDJ ); } \
		{ int i=_resX-1 ; SET2_N( STARTJ , ENDJ ) ; }               \
	}                                                               \
	else if( _gridType.xDirichlet() )                               \
	{                                                               \
		{ int i=      1 ;  SET0( STARTJ , ENDJ ) ; }                \
		for( int i=2 ; i<_resX-2 ; i++ ) { SET1( STARTJ , ENDJ ); } \
		{ int i=_resX-2 ;  SET2( STARTJ , ENDJ ) ; }                \
	}

	int vDimX = _gridType.xDirichlet() ? _resX-2 : _resX;
	int fDimX = _gridType.xPeriodic() ? _resX : _resX-1;
	// Number of matrix rows taken by the pole at y0 (none when x is Dirichlet)
	int poleDim = _gridType.xDirichlet() ? 0 : 1;
#pragma omp parallel for num_threads( threads )
	for( int j=0 ; j<_resY ; j++ )
	{
		// Boundary rows that contribute no matrix rows
		if     ( j==0       && ( _gridType.yDirichlet0() || ( _gridType.yPole0() && _gridType.xDirichlet() ) ) ) continue;
		else if( j==_resY-1 && ( _gridType.yDirichlet1() || ( _gridType.yPole1() && _gridType.xDirichlet() ) ) ) continue;

		int j0 = (j+_resY-1) %_resY , j1 = j , j2 = (j+1) % _resY;
		const Stencil< 3 , 3 >& mStencil = _mStencils[j].stencil;
		const Stencil< 3 , 3 >& sStencil = _sStencils[j].stencil;

		// idx[0..2]: matrix index corresponding to column 0 of grid rows j0, j1 and j2
		int idx[3];
		if     ( _gridType.yPole0() )      idx[0] = poleDim + (j0-1) * vDimX , idx[1] = poleDim + (j1-1) * vDimX , idx[2] = poleDim + (j2-1) * vDimX;
		else if( _gridType.yDirichlet0() ) idx[0] =           (j0-1) * vDimX , idx[1] =           (j1-1) * vDimX , idx[2] =           (j2-1) * vDimX;
		else                               idx[0] =            j0    * vDimX , idx[1] =            j1    * vDimX , idx[2] =            j2    * vDimX;
		// Under x-Dirichlet column 0 is not stored, so shift to keep "idx + i" valid for i>=1
		if( _gridType.xDirichlet() ) idx[0]-- , idx[1]-- , idx[2]--;

		// The pole at y0 is stored as the single matrix row 0
		if( _gridType.yPole0() )
		{
			if     ( j==0 ) idx[1] = 0;
			else if( j==1 ) idx[0] = 0;
		}

		if( j==0       && _gridType.yPole0() )
		{
			// Pole row: the diagonal entry followed by one entry per vertex of the next grid row
			int N = idx[1];
			mass.SetRowSize( N , _resX+1 ) , stiffness.SetRowSize( N , _resX+1 );
			mass[N][0].N = stiffness[N][0].N = N;
			mass[N][0].Value = (Real)mStencil[1][1] , stiffness[N][0].Value = (Real)sStencil[1][1];
			if( _gridType.xPeriodic() )
				for( int i=0 ; i<_resX ; i++ )
				{
					mass[N][i+1].N = stiffness[N][i+1].N = idx[2]+i;
					mass[N][i+1].Value = (Real)mStencil[1][2] , stiffness[N][i+1].Value = (Real)sStencil[1][2];
				}
			else if( _gridType.xNeumann() )
			{
				// The first and last columns get half weight
				{
					int i=0;
					mass[N][i+1].N = stiffness[N][i+1].N = idx[2]+i;
					mass[N][i+1].Value = (Real)( mStencil[1][2] * 0.5 ) , stiffness[N][i+1].Value = (Real)( sStencil[1][2] * 0.5 );
				}
				for( int i=1 ; i<_resX-1 ; i++ )
				{
					mass[N][i+1].N = stiffness[N][i+1].N = idx[2]+i;
					mass[N][i+1].Value = (Real)mStencil[1][2] , stiffness[N][i+1].Value = (Real)sStencil[1][2];
				}
				{
					int i=_resX-1;
					mass[N][i+1].N = stiffness[N][i+1].N = idx[2]+i;
					mass[N][i+1].Value = (Real)( mStencil[1][2] * 0.5 ) , stiffness[N][i+1].Value = (Real)( sStencil[1][2] * 0.5 );
				}
			}
		}
		else if( j==_resY-1 && _gridType.yPole1() )
		{
			// Pole row: the diagonal entry followed by one entry per vertex of the previous grid row
			int N = idx[1];
			mass.SetRowSize( N , _resX+1 ) , stiffness.SetRowSize( N , _resX+1 );
			mass[N][0].N = stiffness[N][0].N = N;
			mass[N][0].Value = (Real)mStencil[1][1] , stiffness[N][0].Value = (Real)sStencil[1][1];
			if( _gridType.xPeriodic() )
				for( int i=0 ; i<_resX ; i++ )
				{
					mass[N][i+1].N = stiffness[N][i+1].N = idx[0] + i;
					mass[N][i+1].Value = (Real)mStencil[1][0] , stiffness[N][i+1].Value = (Real)sStencil[1][0];
				}
			else if( _gridType.xNeumann() )
			{
				// The first and last columns get half weight
				{
					int i=0;
					mass[N][i+1].N = stiffness[N][i+1].N = idx[0]+i;
					mass[N][i+1].Value = (Real)( mStencil[1][0] * 0.5 ) , stiffness[N][i+1].Value = (Real)( sStencil[1][0] * 0.5 );
				}
				for( int i=1 ; i<_resX-1 ; i++ )
				{
					mass[N][i+1].N = stiffness[N][i+1].N = idx[0]+i;
					mass[N][i+1].Value = (Real)mStencil[1][0] , stiffness[N][i+1].Value = (Real)sStencil[1][0];
				}
				{
					int i=_resX-1;
					mass[N][i+1].N = stiffness[N][i+1].N = idx[0]+i;
					mass[N][i+1].Value = (Real)( mStencil[1][0] * 0.5 ) , stiffness[N][i+1].Value = (Real)( sStencil[1][0] * 0.5 );
				}
			}
		}
		else if( j==0       && _gridType.yNeumann0() ) { SET_ROW(  0 , 1 ); }
		else if( j==_resY-1 && _gridType.yNeumann1() ) { SET_ROW( -1 , 0 ); }
		else if( j==1       && ( _gridType.yDirichlet0() || ( _gridType.yPole0() && _gridType.xDirichlet() ) ) ) { SET_ROW(  0 , 1 ); }
		else if( j==_resY-2 && ( _gridType.yDirichlet1() || ( _gridType.yPole1() && _gridType.xDirichlet() ) ) ) { SET_ROW( -1 , 0 ); }
		else if( j==1       && _gridType.yPole0() )
		{
			// Grid row next to the pole at y0: entry 0 (where present) couples to the pole, the rest to rows j and j+1
			if( _gridType.xPeriodic() )
			{
				for( int i=0 ; i<_resX ; i++ )
				{
					int N = idx[1] + i;
					mass.SetRowSize( N , 7 ) , stiffness.SetRowSize( N , 7 );
					mass[N][0].N = stiffness[N][0].N = idx[0];
					mass[N][0].Value = (Real)mStencil[1][0] , stiffness[N][0].Value = (Real)sStencil[1][0];
					for( int jj=0 ; jj<=1 ; jj++ ) for( int ii=-1 ; ii<=1 ; ii++ )
					{
						int index = 1+jj*3+(ii+1);
						mass[N][index].N = stiffness[N][index].N = idx[jj+1] + ( i + ii + _resX ) % _resX;
						mass[N][index].Value = (Real)mStencil[ii+1][jj+1] , stiffness[N][index].Value = (Real)sStencil[ii+1][jj+1];
					}
				}
			}
			else if( _gridType.xNeumann() )
			{
				{
					int i=0;
					int N = idx[1] + i;
					mass.SetRowSize( N , 5 ) , stiffness.SetRowSize( N , 5 );
					mass[N][0].N = stiffness[N][0].N = idx[0];
					mass[N][0].Value = (Real)( mStencil[1][0] * 0.5 ) , stiffness[N][0].Value = (Real)( sStencil[1][0] * 0.5 );
					for( int jj=0 ; jj<=1 ; jj++ ) for( int ii=0 ; ii<=1 ; ii++ )
					{
						int index = 1+jj*2+(ii);
						mass[N][index].N = stiffness[N][index].N = idx[jj+1] + ( i + ii + _resX ) % _resX;
						mass     [N][index].Value = (Real)mStencil[ii+1][jj+1] * ( ii==0 ? (Real)0.5 : (Real)1. );
						stiffness[N][index].Value = (Real)sStencil[ii+1][jj+1] * ( ii==0 ? (Real)0.5 : (Real)1. );
					}
				}
				for( int i=1 ; i<_resX-1 ; i++ )
				{
					int N = idx[1] + i;
					mass.SetRowSize( N , 7 ) , stiffness.SetRowSize( N , 7 );
					mass[N][0].N = stiffness[N][0].N = idx[0];
					mass[N][0].Value = (Real)mStencil[1][0] , stiffness[N][0].Value = (Real)sStencil[1][0];
					for( int jj=0 ; jj<=1 ; jj++ ) for( int ii=-1 ; ii<=1 ; ii++ )
					{
						int index = 1+jj*3+(ii+1);
						mass[N][index].N = stiffness[N][index].N = idx[jj+1] + ( i + ii + _resX ) % _resX;
						mass[N][index].Value = (Real)mStencil[ii+1][jj+1] , stiffness[N][index].Value = (Real)sStencil[ii+1][jj+1];
					}
				}
				{
					int i=_resX-1;
					int N = idx[1] + i;
					mass.SetRowSize( N , 5 ) , stiffness.SetRowSize( N , 5 );
					mass[N][0].N = stiffness[N][0].N = idx[0];
					mass[N][0].Value = (Real)( mStencil[1][0] * 0.5 ) , stiffness[N][0].Value = (Real)( sStencil[1][0] * 0.5 );
					for( int jj=0 ; jj<=1 ; jj++ ) for( int ii=-1 ; ii<=0 ; ii++ )
					{
						int index = 1+jj*2+(ii+1);
						mass[N][index].N = stiffness[N][index].N = idx[jj+1] + ( i + ii + _resX ) % _resX;
						mass     [N][index].Value = (Real)mStencil[ii+1][jj+1] * ( ii==0 ? (Real)0.5 : (Real)1. );
						stiffness[N][index].Value = (Real)sStencil[ii+1][jj+1] * ( ii==0 ? (Real)0.5 : (Real)1. );
					}
				}
			}
			else if( _gridType.xDirichlet() )
			{
				{
					int i=1;
					int N = idx[1] + i;
					mass.SetRowSize( N , 4 ) , stiffness.SetRowSize( N , 4 );
					for( int jj=0 ; jj<=1 ; jj++ ) for( int ii=0 ; ii<=1 ; ii++ )
					{
						int index = jj*2+(ii);
						mass[N][index].N = stiffness[N][index].N = idx[jj+1] + ( i + ii + _resX ) % _resX;
						mass     [N][index].Value = (Real)mStencil[ii+1][jj+1];
						stiffness[N][index].Value = (Real)sStencil[ii+1][jj+1];
					}
				}
				for( int i=2 ; i<_resX-2 ; i++ )
				{
					int N = idx[1] + i;
					mass.SetRowSize( N , 6 ) , stiffness.SetRowSize( N , 6 );
					for( int jj=0 ; jj<=1 ; jj++ ) for( int ii=-1 ; ii<=1 ; ii++ )
					{
						int index = jj*3+(ii+1);
						mass[N][index].N = stiffness[N][index].N = idx[jj+1] + ( i + ii + _resX ) % _resX;
						mass[N][index].Value = (Real)mStencil[ii+1][jj+1] , stiffness[N][index].Value = (Real)sStencil[ii+1][jj+1];
					}
				}
				{
					int i=_resX-2;
					int N = idx[1] + i;
					mass.SetRowSize( N , 4 ) , stiffness.SetRowSize( N , 4 );
					for( int jj=0 ; jj<=1 ; jj++ ) for( int ii=-1 ; ii<=0 ; ii++ )
					{
						int index = jj*2+(ii+1);
						mass[N][index].N = stiffness[N][index].N = idx[jj+1] + ( i + ii + _resX ) % _resX;
						mass     [N][index].Value = (Real)mStencil[ii+1][jj+1];
						stiffness[N][index].Value = (Real)sStencil[ii+1][jj+1];
					}
				}
			}
		}
		else if( j==_resY-2 && _gridType.yPole1() )
		{
			// Grid row next to the pole at y1: entry 0 (where present) couples to the pole, the rest to rows j-1 and j
			if( _gridType.xPeriodic() )
			{
				for( int i=0 ; i<_resX ; i++ )
				{
					int N = idx[1] + i;
					mass.SetRowSize( N , 7 ) , stiffness.SetRowSize( N , 7 );
					mass[N][0].N = stiffness[N][0].N = idx[2];
					mass[N][0].Value = (Real)mStencil[1][2] , stiffness[N][0].Value = (Real)sStencil[1][2];
					for( int jj=-1 ; jj<=0 ; jj++ ) for( int ii=-1 ; ii<=1 ; ii++ )
					{
						int index = 1+(jj+1)*3+(ii+1);
						mass[N][index].N = stiffness[N][index].N = idx[jj+1] + ( i + ii + _resX ) % _resX;
						mass[N][index].Value = (Real)mStencil[ii+1][jj+1] , stiffness[N][index].Value = (Real)sStencil[ii+1][jj+1];
					}
				}
			}
			else if( _gridType.xNeumann() )
			{
				{
					int i=0;
					int N = idx[1] + i;
					mass.SetRowSize( N , 5 ) , stiffness.SetRowSize( N , 5 );
					mass[N][0].N = stiffness[N][0].N = idx[2];
					mass[N][0].Value = (Real)( mStencil[1][2] * 0.5 ) , stiffness[N][0].Value = (Real)( sStencil[1][2] * 0.5 );
					for( int jj=-1 ; jj<=0 ; jj++ ) for( int ii=0 ; ii<=1 ; ii++ )
					{
						int index = 1+(jj+1)*2+(ii);
						mass[N][index].N = stiffness[N][index].N = idx[jj+1] + ( i + ii );
						mass     [N][index].Value = (Real)mStencil[ii+1][jj+1] * ( ii==0 ? (Real)0.5 : (Real)1. );
						stiffness[N][index].Value = (Real)sStencil[ii+1][jj+1] * ( ii==0 ? (Real)0.5 : (Real)1. );
					}
				}
				// No nested parallel region here: this code already runs inside the parallel loop over j
				for( int i=1 ; i<_resX-1 ; i++ )
				{
					int N = idx[1] + i;
					mass.SetRowSize( N , 7 ) , stiffness.SetRowSize( N , 7 );
					mass[N][0].N = stiffness[N][0].N = idx[2];
					mass[N][0].Value = (Real)mStencil[1][2] , stiffness[N][0].Value = (Real)sStencil[1][2];
					for( int jj=-1 ; jj<=0 ; jj++ ) for( int ii=-1 ; ii<=1 ; ii++ )
					{
						int index = 1+(jj+1)*3+(ii+1);
						mass[N][index].N = stiffness[N][index].N = idx[jj+1] + ( i + ii );
						mass[N][index].Value = (Real)mStencil[ii+1][jj+1] , stiffness[N][index].Value = (Real)sStencil[ii+1][jj+1];
					}
				}
				{
					int i=_resX-1;
					int N = idx[1] + i;
					mass.SetRowSize( N , 5 ) , stiffness.SetRowSize( N , 5 );
					mass[N][0].N = stiffness[N][0].N = idx[2];
					mass[N][0].Value = (Real)( mStencil[1][2] * 0.5 ) , stiffness[N][0].Value = (Real)( sStencil[1][2] * 0.5 );
					for( int jj=-1 ; jj<=0 ; jj++ ) for( int ii=-1 ; ii<=0 ; ii++ )
					{
						int index = 1+(jj+1)*2+(ii+1);
						mass[N][index].N = stiffness[N][index].N = idx[jj+1] + ( i + ii );
						mass     [N][index].Value = (Real)mStencil[ii+1][jj+1] * ( ii==0 ? (Real)0.5 : (Real)1. );
						stiffness[N][index].Value = (Real)sStencil[ii+1][jj+1] * ( ii==0 ? (Real)0.5 : (Real)1. );
					}
				}
			}
			else if( _gridType.xDirichlet() )
			{
				{
					int i=1;
					int N = idx[1] + i;
					mass.SetRowSize( N , 4 ) , stiffness.SetRowSize( N , 4 );
					for( int jj=-1 ; jj<=0 ; jj++ ) for( int ii=0 ; ii<=1 ; ii++ )
					{
						int index = (jj+1)*2+(ii);
						mass[N][index].N = stiffness[N][index].N = idx[jj+1] + ( i + ii );
						mass     [N][index].Value = (Real)mStencil[ii+1][jj+1];
						stiffness[N][index].Value = (Real)sStencil[ii+1][jj+1];
					}
				}
				for( int i=2 ; i<_resX-2 ; i++ )
				{
					int N = idx[1] + i;
					mass.SetRowSize( N , 6 ) , stiffness.SetRowSize( N , 6 );
					for( int jj=-1 ; jj<=0 ; jj++ ) for( int ii=-1 ; ii<=1 ; ii++ )
					{
						int index = (jj+1)*3+(ii+1);
						mass[N][index].N = stiffness[N][index].N = idx[jj+1] + ( i + ii );
						mass[N][index].Value = (Real)mStencil[ii+1][jj+1] , stiffness[N][index].Value = (Real)sStencil[ii+1][jj+1];
					}
				}
				{
					int i=_resX-2;
					int N = idx[1] + i;
					mass.SetRowSize( N , 4 ) , stiffness.SetRowSize( N , 4 );
					for( int jj=-1 ; jj<=0 ; jj++ ) for( int ii=-1 ; ii<=0 ; ii++ )
					{
						int index = (jj+1)*2+(ii+1);
						mass[N][index].N = stiffness[N][index].N = idx[jj+1] + ( i + ii );
						mass     [N][index].Value = (Real)mStencil[ii+1][jj+1];
						stiffness[N][index].Value = (Real)sStencil[ii+1][jj+1];
					}
				}
			}
		}
		else{ SET_ROW( -1 , 1 ); }
	}
#undef SET_ROW
#undef SET0
#undef SET1
#undef SET2
#undef SET0_N
#undef SET2_N
}
// Builds the banded mass and stiffness systems for every frequency f in [minFrequency,maxFrequency).
//   mass , stiffness : arrays of (maxFrequency-minFrequency) matrices; element k receives the system for frequency minFrequency+k
//   minFrequency     : first frequency (inclusive)
//   maxFrequency     : last frequency (exclusive); must exceed minFrequency, otherwise an error is printed and the process exits
//   threads          : OpenMP thread count for the loop over grid rows (clamped to at least 1)
// For a regular row, each band entry is center + ( left + right ) * cos( 2*pi*f / logicalXDim ) of the row's 3x3 stencil.
template< class Real >
void SoRParameterization::_poissonFrequencySystems( BandedMatrix< Real , 1 >* mass , BandedMatrix< Real , 1 >* stiffness , unsigned int minFrequency , unsigned int maxFrequency , int threads ) const
{
	if( minFrequency>=maxFrequency ) fprintf( stderr , "[ERROR] Frequencies out of order: %u >= %u\n" , minFrequency , maxFrequency ) , exit( 0 );
	// A non-positive thread count is not valid in a num_threads clause
	threads = std::max< int >( 1 , threads );
	int logicalXDim = _gridType.xPeriodic() ? _resX : 2 * ( _resX - 1 );
	const unsigned int fCount = maxFrequency - minFrequency;

	// Number of rows in each system: grid rows minus the boundary rows that carry no unknowns
	int sz = _resY - ( _gridType.yDirichlet0() || ( _gridType.yPole0() && _gridType.xDirichlet() )? 1 : 0 ) - ( _gridType.yDirichlet1() || ( _gridType.yPole1() && _gridType.xDirichlet() ) ? 1 : 0 );

	// cs[k] is the cosine factor for frequency minFrequency+k; all arrays are indexed from zero
	std::vector< double > cs( fCount );
	for( unsigned int k=0 ; k<fCount ; k++ )
	{
		const unsigned int f = minFrequency + k;
		mass[k].resize( sz , (Real)0. ) , stiffness[k].resize( sz , (Real)0. );
		cs[k] = cos( 2 * f * M_PI / logicalXDim );
	}
	// Fills band entries JSTART..JEND of row _j in system k (relies on k, _j, mStencil, sStencil and cs being in scope)
#define SET_ROW( JSTART , JEND )                                                                              \
	{                                                                                                         \
		for( int jj=JSTART ; jj<=JEND ; jj++ )                                                                \
		{                                                                                                     \
			mass     [k][_j][jj] = (Real)( mStencil[1][jj] + ( mStencil[0][jj] + mStencil[2][jj] ) * cs[k] ); \
			stiffness[k][_j][jj] = (Real)( sStencil[1][jj] + ( sStencil[0][jj] + sStencil[2][jj] ) * cs[k] ); \
		}                                                                                                     \
	}

	int poleScale = _gridType.xNeumann() ? 2 : 1;
#pragma omp parallel for num_threads( threads )
	for( int j=0 ; j<_resY ; j++ )
	{
		// Boundary rows that contribute no matrix rows
		if     ( j==0       && ( _gridType.yDirichlet0() || ( _gridType.yPole0() && _gridType.xDirichlet() ) ) ) continue;
		else if( j==_resY-1 && ( _gridType.yDirichlet1() || ( _gridType.yPole1() && _gridType.xDirichlet() ) ) ) continue;

		// Matrix row of grid row j (shifted by one when the first grid row was dropped)
		int _j = ( _gridType.yDirichlet0() || ( _gridType.yPole0() && _gridType.xDirichlet() ) ) ? j-1 : j;

		const Stencil< 3 , 3 >& mStencil = _mStencils[j].stencil;
		const Stencil< 3 , 3 >& sStencil = _sStencils[j].stencil;

		if( j==0       && _gridType.yPole0() )
		{
			// Pole row: only frequency 0 gets stencil values, every other frequency gets a unit diagonal
			for( unsigned int k=0 ; k<fCount ; k++ )
			{
				if( minFrequency+k==0 )
				{
					mass[k][_j][1] = (Real)mStencil[1][1]*poleScale   , stiffness[k][_j][1] = (Real)sStencil[1][1]*poleScale  ;
					mass[k][_j][2] = (Real)mStencil[1][2]*logicalXDim , stiffness[k][_j][2] = (Real)sStencil[1][2]*logicalXDim;
				}
				else mass[k][_j][1] = stiffness[k][_j][1] = (Real)1;
			}
		}
		else if( j==_resY-1 && _gridType.yPole1() )
		{
			// Pole row: only frequency 0 gets stencil values, every other frequency gets a unit diagonal
			for( unsigned int k=0 ; k<fCount ; k++ )
			{
				if( minFrequency+k==0 )
				{
					mass[k][_j][0] = (Real)mStencil[1][0]*logicalXDim , stiffness[k][_j][0] = (Real)sStencil[1][0]*logicalXDim;
					mass[k][_j][1] = (Real)mStencil[1][1]*poleScale   , stiffness[k][_j][1] = (Real)sStencil[1][1]*poleScale  ;
				}
				else mass[k][_j][1] = stiffness[k][_j][1] = (Real)1;
			}
		}
		else if( j==1 && _gridType.yPole0() )
		{
			// Row next to the pole at y0: the entry toward the pole is only set for frequency 0 (and never under x-Dirichlet)
			for( unsigned int k=0 ; k<fCount ; k++ )
			{
				if( minFrequency+k==0 && !_gridType.xDirichlet() ) mass[k][_j][0] = (Real)mStencil[1][0] , stiffness[k][_j][0] = (Real)sStencil[1][0];
				SET_ROW( 1 , 2 );
			}
		}
		else if( j==_resY-2 && _gridType.yPole1() )
		{
			// Row next to the pole at y1: the entry toward the pole is only set for frequency 0 (and never under x-Dirichlet)
			for( unsigned int k=0 ; k<fCount ; k++ )
			{
				SET_ROW( 0 , 1 );
				if( minFrequency+k==0 && !_gridType.xDirichlet() ) mass[k][_j][2] = (Real)mStencil[1][2] , stiffness[k][_j][2] = (Real)sStencil[1][2];
			}
		}
		else for( unsigned int k=0 ; k<fCount ; k++ ){ SET_ROW( 0 , 2 ); }
	}
#undef SET_ROW
}
// Builds the mass and stiffness systems for one frequency by forwarding the
// half-open range [frequency,frequency+1) to _poissonFrequencySystems().
template< class Real >
void SoRParameterization::poissonFrequencySystem( BandedMatrix< Real , 1 >& mass , BandedMatrix< Real , 1 >& stiffness , int frequency , int threads ) const
{
	BandedMatrix< Real , 1 >* massPtr = &mass;
	BandedMatrix< Real , 1 >* stiffnessPtr = &stiffness;
	_poissonFrequencySystems( massPtr , stiffnessPtr , frequency , frequency+1 , threads );
}
// Builds the mass and stiffness systems for the frequencies [minFrequency,maxFrequency);
// the signed bounds are handed unchanged to _poissonFrequencySystems(), which takes them as unsigned.
template< class Real >
void SoRParameterization::poissonFrequencySystems( BandedMatrix< Real , 1 >* mass , BandedMatrix< Real , 1 >* stiffness , int minFrequency , int maxFrequency , int threads ) const
{
	_poissonFrequencySystems( mass , stiffness , minFrequency , maxFrequency , threads );
}

// Checks that the signal f has the same resolution and grid type as this parameterization.
// On a mismatch, an error naming methodName is written to stderr and the process exits.
template< class Data , class Real >
void SoRParameterization::_assertValidity( const RegularGridFEM::template Signal< Data , Real >& f , const char* methodName ) const
{
	unsigned int signalResX , signalResY;
	f.resolution( signalResX , signalResY );

	if( _resX!=signalResX || signalResY!=_resY )
	{
		fprintf( stderr , "[ERROR] %s: Resolutions don't match: %d x %d != %d x %d\n" , methodName , _resX , _resY , signalResX , signalResY );
		exit( 0 );
	}
	if( _gridType!=f.gridType() )
	{
		fprintf( stderr , "[ERROR] %s: Grid types don't match: (%s %s) <-> (%s %s)\n" , methodName , _gridType.xName() , _gridType.yName() , f.gridType().xName() , f.gridType().yName() );
		exit( 0 );
	}
}
// Checks that the derivative field f has the same resolution and grid type as this parameterization.
// On a mismatch, an error naming methodName is written to stderr and the process exits.
template< class Data , class Real , int DType >
void SoRParameterization::_assertValidity( const RegularGridFEM::template Derivative< Data , Real , DType >& f , const char* methodName ) const
{
	unsigned int fieldResX , fieldResY;
	f.resolution( fieldResX , fieldResY );

	if( _resX!=fieldResX || fieldResY!=_resY )
	{
		fprintf( stderr , "[ERROR] %s: Resolutions don't match: %d x %d != %d x %d\n" , methodName , _resX , _resY , fieldResX , fieldResY );
		exit( 0 );
	}
	if( _gridType!=f.gridType() )
	{
		fprintf( stderr , "[ERROR] %s: Grid types don't match: (%s %s) <-> (%s %s)\n" , methodName , _gridType.xName() , _gridType.yName() , f.gridType().xName() , f.gridType().yName() );
		exit( 0 );
	}
}
// Serializes the parameterization to fp in binary form.
// Field order: x-resolution, y-resolution, grid type, conical-geometry flag (one char),
// angle of revolution, and the _resY generating samples.
void SoRParameterization::write( FILE* fp ) const
{
	// Grid dimensions and topology
	fwrite( &_resX , sizeof( unsigned int ) , 1 , fp );
	fwrite( &_resY , sizeof( unsigned int ) , 1 , fp );
	_gridType.write( fp );

	// The geometry flag is stored as a single byte (1 = conical, 0 = trapezoidal)
	char conicalFlag;
	if( _conicalGeometry ) conicalFlag = 1;
	else                   conicalFlag = 0;
	fwrite( &conicalFlag , sizeof( char ) , 1 , fp );

	// Generating curve
	fwrite( &_angleOfRevolution , sizeof( double ) , 1 , fp );
	fwrite( _samples , sizeof( Point2D< double > ) , _resY , fp );
}
// Deserializes a parameterization from fp.
// The fields are read in the order in which write() emits them: x-resolution, y-resolution,
// grid type, conical-geometry flag (one char), angle of revolution, and rY generating samples.
// A short read (truncated or corrupt file) is reported to stderr and terminates the process,
// rather than allocating and initializing from indeterminate values.
SoRParameterization::SoRParameterization( FILE* fp )
{
	static const char* const MethodName = "SoRParameterization::SoRParameterization";
	unsigned int rX , rY;
	RegularGridFEM::GridType gridType;
	Pointer( Point2D< double > ) samples;
	double angleOfRevolution;
	char conicalFlag;

	if( fread( &rX , sizeof( unsigned int ) , 1 , fp )!=1 ) fprintf( stderr , "[ERROR] %s: Failed to read x-resolution\n" , MethodName ) , exit( 0 );
	if( fread( &rY , sizeof( unsigned int ) , 1 , fp )!=1 ) fprintf( stderr , "[ERROR] %s: Failed to read y-resolution\n" , MethodName ) , exit( 0 );
	// _init unconditionally inspects samples[0], so an empty sample list cannot be accepted
	if( !rY ) fprintf( stderr , "[ERROR] %s: y-resolution must be positive\n" , MethodName ) , exit( 0 );
	gridType.read( fp );
	if( fread( &conicalFlag , sizeof( char ) , 1 , fp )!=1 ) fprintf( stderr , "[ERROR] %s: Failed to read geometry flag\n" , MethodName ) , exit( 0 );
	if( fread( &angleOfRevolution , sizeof( double ) , 1 , fp )!=1 ) fprintf( stderr , "[ERROR] %s: Failed to read angle of revolution\n" , MethodName ) , exit( 0 );

	samples = AllocPointer< Point2D< double > >( rY );
	if( fread( samples , sizeof( Point2D< double > ) , rY , fp )!=rY ) fprintf( stderr , "[ERROR] %s: Failed to read %u samples\n" , MethodName , rY ) , exit( 0 );

	// _init copies the samples, so the temporary buffer can be released afterwards
	_init( rX , rY , gridType , conicalFlag!=0 , samples , angleOfRevolution );
	FreePointer( samples );
}
// Constructs the parameterization from an in-memory description.
// All arguments are handed unchanged to _init, which validates and copies them.
SoRParameterization::SoRParameterization( int rX , int rY , RegularGridFEM::GridType gridType , bool conicalGeometry , ConstPointer( Point2D< double > ) samples , double angleOfRevolution )
{
	_init
	(
		rX , rY ,
		gridType ,
		conicalGeometry ,
		samples ,
		angleOfRevolution
	);
}
// Initializes the parameterization from the generating curve.
//   rX , rY:           grid resolution (stored in _resX , _resY); rY is also the number of entries read from samples
//   gridType:          grid type; queried here for x/y periodicity and for poles at the first/last row
//   conicalGeometry:   selects whether per-band cone data (true) or trapezoid data (false) is built
//   samples:           rY points of the generating curve; samples[j][0] is validated as the x-coordinate
//                      (zero at a pole, strictly positive elsewhere) and samples[j][1] is used as the height
//   angleOfRevolution: must lie in (0,2*PI]; it is replaced by 2*PI when the grid is x-periodic
// Validation failures are reported to stderr and terminate the process via exit( 0 ).
// The samples are copied, so the caller keeps ownership of its buffer.
void SoRParameterization::_init( int rX , int rY , RegularGridFEM::GridType gridType , bool conicalGeometry , ConstPointer( Point2D< double > ) samples , double angleOfRevolution )
{
	_gridType = gridType;
	_angleOfRevolution = angleOfRevolution;
	// A periodic grid always covers the full revolution, regardless of the requested angle
	if( _gridType.xPeriodic() ) _angleOfRevolution = 2. * PI;
	else if( _angleOfRevolution<=0 || _angleOfRevolution>2.*PI ) fprintf( stderr , "[ERROR] SoRParameterization::SoRParameterization: angle out of bounds: %d <= %g <= %g\n" , 0 , _angleOfRevolution , 2. * PI ) , exit( 0 );
	_conicalGeometry = conicalGeometry;
	// Validate the first sample: on the axis if there is a pole at row 0, strictly off the axis otherwise
	{
		int j=0;
		if( _gridType.yPole0() )
		{
			if( samples[j][0]!=0 ) fprintf( stderr , "[ERROR] X-coordinate must equal zero: [%s %s][%d / %d](%f %f)\n" , _gridType.xName() , _gridType.yName() , j , rY , samples[j][0] , samples[j][1] ) , exit( 0 );
		}
		else
		{
			if( samples[j][0]<=0 ) fprintf( stderr , "[ERROR] X-coordinates must be positive: [%s %s][%d / %d](%f %f)\n" , _gridType.xName() , _gridType.yName() , j , rY , samples[j][0] , samples[j][1] ) , exit( 0 );
		}
	}
	// Validate the interior samples: all must be strictly off the axis
	for( int j=1 ; j<rY-1 ; j++ )
		if( samples[j][0]<=0 ) fprintf( stderr , "[ERROR] X-coordinates must be positive: [%s %s][%d / %d](%f %f)\n" , _gridType.xName() , _gridType.yName() , j , rY , samples[j][0] , samples[j][1] ) , exit( 0 );
	// Validate the last sample: on the axis if there is a pole at the last row, strictly off the axis otherwise
	{
		int j=rY-1;
		if( _gridType.yPole1() )
		{
			if( samples[j][0]!=0 ) fprintf( stderr , "[ERROR] X-coordinate must equal zero: [%s %s][%d / %d](%f %f)\n" , _gridType.xName() , _gridType.yName() , j , rY , samples[j][0] , samples[j][1] ) , exit( 0 );
		}
		else
		{
			if( samples[j][0]<=0 ) fprintf( stderr , "[ERROR] X-coordinates must be positive: [%s %s][%d / %d](%f %f)\n" , _gridType.xName() , _gridType.yName() , j , rY , samples[j][0] , samples[j][1] ) , exit( 0 );
		}
	}
	// Store the resolution and take a private copy of the samples
	_resX = rX , _resY = rY;
	_samples = AllocPointer< Point2D< double > >( _resY );
	memcpy( _samples , samples , sizeof( Point2D< double > ) * _resY );


	// Exactly one of the two per-band tables is allocated; the other is set to null
	if( _conicalGeometry ) _coneData = AllocPointer< ConeData >( bands() ) , _trapData = NullPointer< TrapData >();
	else                   _trapData = AllocPointer< TrapData >( bands() ) , _coneData = NullPointer< ConeData >();
	
	if( _conicalGeometry )
	{
		for( int j=0 ; j<(int)bands() ; j++ )
		{
			// jj is the next sample row, wrapping around to row 0 after the last row
			int jj = (j+1)%_resY;
			double r0 = fabs( _samples[j][0] ) , r1 = fabs( _samples[jj][0] ) , h0 = _samples[j][1] , h1 = _samples[jj][1];
			double dr = r1-r0 , dh = h1-h0;
#pragma message( "[WARNING] Soft equality testing" )
			// Bands whose two radii differ by more than 1e-4 are treated as cones; the rest as cylinders
			if( fabs(r0-r1)>1e-4 )
			{
				// Compute the height at the apex of the cone
				// r(h) = m*h + b, w/ r(h0) = r0 , r(h1) = r1 
				// <=> b = r0 - dr / dh * h0 , m  = dr / dh
				// r(h) = dr / dh * h + r0 - dr / dh * h0
				// => r(h) = 0 <=> h = - r0 * dh / dr + h0
				{
					double center = -r0 * dh / dr + h0;
					h0 -= center , h1 -= center;
				}
				// With heights measured from the apex, these are the distances from the apex to the two sample circles
				_coneData[j].r0 = sqrt( r0*r0 + h0*h0 ) , _coneData[j].r1 = sqrt( r1*r1 + h1*h1 );
				// Compute the angle multiplier (>=1):
				// (the ratio is taken at whichever end has a non-zero radius)
				_coneData[j].ratio = r0==0 ? _coneData[j].r1/r1 : _coneData[j].r0/r0;

				// Note: this dr shadows the radial difference declared above; here it is the difference of the apex distances
				double r = _coneData[j].r0 , dr = _coneData[j].r1 - _coneData[j].r0 , theta = _angleOfRevolution / _resX / _coneData[j].ratio;
				_coneData[j].a0  = theta * r , _coneData[j].a1 = theta * dr , _coneData[j].b0 = fabs(dr);

			}
			else
			{
				// Cylindrical band: a ratio of zero marks the band as cylindrical
				_coneData[j].ratio = 0;
				_coneData[j].width  = fabs( _samples[j][0] + _samples[jj][0] ) / 2. * _angleOfRevolution / _resX;
				_coneData[j].height = fabs( _samples[j][1] - _samples[jj][1] );
				_coneData[j].a0 = _coneData[j].width , _coneData[j].a1 = 0 , _coneData[j].b0 = _coneData[j].height;
			}
		}
	}
	else
	{
		// The faces of the mesh are (possibly degenerate) trapezoids
		// The coordinates of the vertices associated to sample s are:
		//		( s[0] , s[1] , 0 ) and ( s[0]*cos(dTheta) , s[1] , s[0]*sin(dTheta) )
		// The distance between the two samples on a parallel is:
		//		p(s) = ( [ s[0] - s[0] * cos(dTheta) ]^2 + [ s[0] * sin(dTheta) ]^2 )^0.5
		//		     = |s[0]| * [ ( 1-cos(dTheta) )^2 + sin^2(dTheta) ]^0.5
		//		     = |s[0]| * [ 1-2*cos(dTheta) + cos^2(dTheta) + sin^2(dTheta) ]^0.5
		//			 = |s[0]| * [ 2-2*cos(dTheta) ]^0.5
		// The height of the meridian is the distance between the mid-points of the two samples s and t
		//		m_s = [ ( s[0] * ( 1 + cos(dTheta) )/2 , s[1] , s[0]*sin(dTheta)/2 )
		//		m_t = [ ( t[0] * ( 1 + cos(dTheta) )/2 , t[1] , t[0]*sin(dTheta)/2 )
		//		m(s,t) = || m_s - m_t ||
		//		       = || ( (s[0]-t[0]) * (1+cos(dTheta) ) / 2 , s[1]-t[1] , (s[0]-t[0])*sin(dTheta)/2 ) ||
		//			   = [ (s[0]-t[0])^2 * ( (1+cos(dTheta))^2 + sin^2(dTheta) )/4 + (s[1]-t[1])^2 ]^0.5
		//			   = [ (s[0]-t[0])^2 * ( 1+cos(dTheta) )/2 + (s[1]-t[1])^2 ]^0.5
		double dTheta = _angleOfRevolution / _resX , cos_dTheta = cos( dTheta );
		for( int j=0 ; j<(int)bands() ; j++ )
		{
			// jj is the next sample row, wrapping around to row 0 after the last row
			int jj = (j+1)%_resY;
			Point2D< double > dSample = _samples[jj]-_samples[j];
			// The std::max calls clamp the radicands at zero before taking the square root
			_trapData[j].width0 = fabs( _samples[j ][0] ) * sqrt( std::max< double >( 0. , 2. - 2. * cos_dTheta ) );
			_trapData[j].width1 = fabs( _samples[jj][0] ) * sqrt( std::max< double >( 0. , 2. - 2. * cos_dTheta ) );
			_trapData[j].height = sqrt( std::max< double >( 0 , dSample[0]*dSample[0] * (1.+cos_dTheta)/2. + dSample[1]*dSample[1] ) );
			// Compute the angle at vertex (0,0)
			double width0 = _trapData[j].width0 , width1 = _trapData[j].width1 , height = _trapData[j].height;
			double alpha = atan2( height , (width0-width1)/2 );
			// The rotation should take angle alpha to angle PI-alpha
			alpha = PI - 2. * alpha;
			// Cache the cosine and sine of the rotation (read back by _trapReflectTangent)
			_trapData[j].reflectCos = cos(alpha) , _trapData[j].reflectSin = sin(alpha);
		}
	}
	// Pre-compute the per-row mass, stiffness, and (non-rotated / rotated) derivative stencils
	_mStencils = AllocPointer< ValueStencil >( _resY );
	_sStencils = AllocPointer< ValueStencil >( _resY );
	_dStencils = AllocPointer< DerivativeStencil >( _resY );
	_drStencils = AllocPointer< DerivativeStencil >( _resY );
	for( int j=0 ; j<_resY ; j++ ) __mStencil( j , _mStencils[j].stencil ) , __sStencil( j , _sStencils[j].stencil ) , __dStencils( j , _dStencils[j].hStencil , _dStencils[j].vStencil , false ) , __dStencils( j , _drStencils[j].hStencil , _drStencils[j].vStencil , true );
}
// Releases every buffer allocated by _init.
SoRParameterization::~SoRParameterization( void )
{
	// Per-row stencil tables
	FreePointer( _mStencils );
	FreePointer( _sStencils );
	FreePointer( _dStencils );
	FreePointer( _drStencils );

	// Per-band geometry: only one of the two tables is non-null
	if( _coneData ){ FreePointer( _coneData ); }
	if( _trapData ){ FreePointer( _trapData ); }

	// Copy of the generating curve
	FreePointer( _samples );
}
// Returns the integral of _area(b) over the unit interval [0,1].
double SoRParameterization::area( int b ) const
{
	const double lower = 0. , upper = 1.;
	return Integral( _area(b) , lower , upper );
}
// Returns the sum, over all bands, of the per-band area multiplied by the number of faces in a band.
double SoRParameterization::area( void ) const
{
	// A periodic row has as many faces as vertices; otherwise there is one fewer
	const int facesPerBand = _gridType.xPeriodic() ? _resX : _resX-1;
	const int bandCount = (int)bands();
	double total = 0;
	for( int band=0 ; band<bandCount ; band++ )
	{
		total += area( band ) * facesPerBand;
	}
	return total;
}
// Number of bands (rows of faces): _resY when the grid is y-periodic, _resY-1 otherwise.
unsigned int SoRParameterization::bands( void ) const
{
	if( _gridType.yPeriodic() ) return _resY;
	return _resY-1;
}
// Total number of faces: faces per band (_resX if x-periodic, _resX-1 otherwise) times the number of bands.
unsigned int SoRParameterization::faces( void ) const
{
	const auto facesPerBand = _gridType.xPeriodic() ? _resX : _resX-1;
	return facesPerBand * bands();
}
// Total number of vertices: _resX for each of the _resY-2 interior rows, plus the first and
// last rows, each of which contributes a single vertex if it is a pole and _resX vertices otherwise.
unsigned int SoRParameterization::vertices( void ) const
{
	const auto interiorCount = _resX * ( _resY-2 );
	const auto firstRowCount = _gridType.yPole0() ? 1 : _resX;
	const auto lastRowCount  = _gridType.yPole1() ? 1 : _resX;
	return interiorCount + firstRowCount + lastRowCount;
}
unsigned int SoRParameterization::faceVertices( unsigned int face , unsigned int* vertices ) const
{
	int dimX = _gridType.xPeriodic() ? _resX : _resX-1;
	int x = face % dimX , y = face / dimX , xx = (x+1)%_resX , yy = (y+1)%_resY;
	if( !_gridType.yPole0() && !_gridType.yPole1() )
	{
		vertices[0] = y * _resX + x , vertices[1] = y * _resX + xx , vertices[2] = yy * _resX + xx , vertices[3] = yy * _resX + x;
		return 4;
	}
	else if( !_gridType.yPole0() || !_gridType.yPole1() )
	{
		if( y==_resY-2 )
		{
			vertices[0] = y * _resX + x , vertices[1] = y * _resX + xx , vertices[2] = _resX * ( _resY-1 );
			return 3;
		}
		else
		{
			vertices[0] = y * _resX + x , vertices[1] = y * _resX + xx , vertices[2] = yy * _resX + xx , vertices[3] = yy * _resX + x;
			return 4;
		}
	}
	else if( _gridType.yPole0() && _gridType.yPole1() )
	{
		if( y==0 )
		{
			vertices[0] = 0 , vertices[1] = 1 + (yy-1)*_resX + xx , vertices[2] = 1 + (yy-1)*_resX + x;
			return 3;
		}
		else if( y==_resY-2 )
		{
			vertices[0] = 1 + (y-1) * _resX + x , vertices[1] = 1 + (y-1) * _resX + xx , vertices[2] = 1 + (_resY-2) * _resX;
			return 3;
		}
		else
		{
			vertices[0] = 1 + (y-1) * _resX + x , vertices[1] = 1 + (y-1) * _resX + xx , vertices[2] = 1 + (yy-1) * _resX + xx , vertices[3] = 1 + (yy-1) * _resX + x;
			return 4;
		}
	}
	return 0;
}
// Splits a linear face index into its column (x0) and band (y0), using the number of faces per
// band (_resX if the grid is x-periodic, _resX-1 otherwise) as the row length.
void SoRParameterization::faceIndices( unsigned int face , unsigned int& x0 , unsigned int& y0 ) const
{
	const auto facesPerBand = _gridType.xPeriodic() ? _resX : _resX-1;
	x0 = face % facesPerBand;
	y0 = face / facesPerBand;
}
// Returns the 3D position of vertex v by converting it to grid coordinates (vertexIndices)
// and evaluating position() at those integer coordinates.
template< class Real >
Point3D< Real > SoRParameterization::vertexPosition( unsigned int v ) const
{
	unsigned int gridX , gridY;
	vertexIndices( v , gridX , gridY );
	const double px = (double)gridX , py = (double)gridY;
	return position< Real >( px , py );
}
// Converts a linear vertex index v into grid coordinates (x,y); this is the inverse of vertexIndex().
// When there is a pole at row 0, vertexIndex() assigns it index 0 and shifts every later row by one
// (index 1 + (y-1)*_resX + x), whether or not there is also a pole at the last row. The shifted layout
// must therefore be inverted whenever yPole0() holds; previously this was only done when both poles were
// present, giving wrong coordinates for grids with a pole at row 0 only.
// A pole vertex is reported with x = 0.
void SoRParameterization::vertexIndices( unsigned int v , unsigned int& x , unsigned int& y ) const
{
	if( _gridType.yPole0() )
	{
		if( v==0 ) x = y = 0;
		else x = (v-1) % _resX , y = ( (v-1) / _resX ) + 1;
	}
	else x = v % _resX , y = v / _resX;
}
// Converts grid coordinates (x,y) into a linear vertex index.
// A pole at row 0 is vertex 0 and shifts all later rows by one; a pole at the last row is the
// single vertex that follows the last full row. The x-coordinate is ignored at a pole.
unsigned int SoRParameterization::vertexIndex( unsigned int x , unsigned int y ) const
{
	const bool shifted = _gridType.yPole0();

	// Pole at the first row
	if( shifted && y==0 ) return 0;

	// Pole at the last row
	if( y==_resY-1 && _gridType.yPole1() )
	{
		if( shifted ) return 1 + (_resY-2) * _resX;
		return (_resY-1) * _resX;
	}

	// Regular vertex
	if( shifted ) return 1 + (y-1)*_resX + x;
	return y * _resX + x;
}


// Returns the 3D position of the surface point with parameter coordinates (x,y).
// The coordinates are first remapped by RegularGridFEM::RemapParameter.
//   - Conical geometry: the generating curve is linearly interpolated in y and revolved by _theta(x,y).
//   - Trapezoidal geometry: the four corners of the containing cell are bilinearly interpolated.
template< class Real >
Point3D< Real > SoRParameterization::position( double x , double y ) const
{
	bool reflectX , reflectY , negate;
	RegularGridFEM::RemapParameter( x , y , _resX , _resY , _gridType , reflectX , reflectY , negate );

	const int j = (int)floor(y);
	const double dy = y-j;
	// Row of the second sample: wraps around for y-periodic grids, is clamped to the last row otherwise
	int jNext;
	if( _gridType.yPeriodic() ) jNext = (j+1)%_resY;
	else                        jNext = std::min< int >( j+1 , _resY-1 );

	if( _conicalGeometry )
	{
		const double theta = _theta( x , y );
		Point2D< double > onCurve = _samples[j] * (1.-dy) + _samples[jNext] * dy;
		return Point3D< Real >( (Real)( onCurve[0] * cos(theta) ) , (Real)onCurve[1] , (Real)( onCurve[0] * sin(theta) ) );
	}

	const int i = (int)floor(x);
	const double dx = x-i;
	// Angles of the two meridians bounding the cell, centered about zero
	double theta0 = ( _angleOfRevolution * i ) / _resX , theta1 = ( _angleOfRevolution * (i+1) ) / _resX;
	theta0 -= _angleOfRevolution / 2. , theta1 -= _angleOfRevolution / 2.;
	const double cos0 = cos(theta0) , cos1 = cos(theta1) , sin0 = sin(theta0) , sin1 = sin(theta1);

	// Revolves a curve sample to the meridian with the given cosine / sine
	auto revolve = []( Point2D< double > s , double c , double sn ){ return Point3D< Real >(  (Real)( s[0] * c ) , (Real)s[1] , (Real)( s[0] * sn ) ); };
	Point3D< Real > low0  = revolve( _samples[j]     , cos0 , sin0 ) , low1  = revolve( _samples[j]     , cos1 , sin1 );
	Point3D< Real > high0 = revolve( _samples[jNext] , cos0 , sin0 ) , high1 = revolve( _samples[jNext] , cos1 , sin1 );

	return ( low0 * (Real)(1.-dx) + low1 * (Real)dx ) * ( (Real)(1.-dy) ) + ( high0 * (Real)(1.-dx) + high1 * (Real)dx ) * ( (Real)dy );
}
// Returns the 2x2 metric tensor at parameter coordinates (x,y).
// The coordinates are first remapped by RegularGridFEM::RemapParameter. The tensor is assembled as
// root^T * root, where root is built from the MetricRoot of band j = floor(y), evaluated at the
// fractional cell coordinates (dx,dy).
// Fix: the (0,1) entry used to be "assigned" through a cast, (Real)temp(0,1) = 0, which targets the
// temporary produced by the cast (and is ill-formed for arithmetic Real) instead of the matrix entry.
// The entry is now set directly.
template< class Real >
SquareMatrix< Real , 2 > SoRParameterization::metricTensor( double x , double y ) const
{
	bool reflectX , reflectY , negate;
	RegularGridFEM::RemapParameter( x , y , _resX , _resY , _gridType , reflectX , reflectY , negate );
	int i = (int)floor(x) , j = (int)floor(y);
	double dx = x-i , dy = y-j;
	SquareMatrix< Real , 2 > temp;
	MetricRoot mr = _metricRoot( j );
	temp(0,0) = (Real)mr.x_dx(dy) , temp(1,1) = (Real)mr.y_dy , temp(1,0) = (Real)mr.y_dx(dx) , temp(0,1) = (Real)0.;
	return SquareMatrix< Real , 2 >( temp.transpose() ) * temp;
}
double SoRParameterization::area( double x , double y ) const { return Length( normal< double >( x , y ) ); }
template< class Real >
void SoRParameterization::cotangents( double x , double y , Point3D< Real >& dX , Point3D< Real >& dY ) const
{
	Point3D< Real > _dX , _dY , N;
	tangents( x , y , _dX , _dY );
	N = Point3D< Real >::CrossProduct( _dX , _dY );
	dX = Point3D< Real >::CrossProduct( _dY , N ) , dX /= Point3D< Real >::Dot( dX , _dX );
	dY = Point3D< Real >::CrossProduct( _dX , N ) , dY /= Point3D< Real >::Dot( dY , _dY );
}
template< class Real >
void SoRParameterization::tangents( double x , double y , Point3D< Real >& dX , Point3D< Real >& dY ) const
{
	bool reflectX , reflectY , negate;
	RegularGridFEM::RemapParameter( x , y , _resX , _resY , _gridType , reflectX , reflectY , negate );

	int j = (int)floor(y);
	double dy = y-j;
	Point2D< double > sample0 = _samples[j] , sample1 = _gridType.yPeriodic() ? _samples[(j+1)%_resY] : _samples[ std::min< int >( j+1 , _resY-1 ) ];
	if( _conicalGeometry )
	{
		Point2D< double > dSample = sample1 - sample0;
		double theta = _theta( x , y );
		double cos_theta = cos(theta) , sin_theta = sin(theta);
		Point2D< double > sample = sample0 + dSample* dy;
		dX = Point3D< Real >( (Real)( -sample[0] * sin_theta ) , (Real)0.         , (Real)(  sample[0] * cos_theta ) ) * (Real)( ( _angleOfRevolution ) / _resX );
		dY = Point3D< Real >( (Real)( dSample[0] * cos_theta ) , (Real)dSample[1] , (Real)( dSample[0] * sin_theta ) );
	}
	else
	{
		int i = (int)floor(x);
		double dx = x-i;
		Point3D< Real > corners[2][2];
		double theta0 = ( _angleOfRevolution * i ) / _resX , theta1 = ( _angleOfRevolution * (i+1) ) / _resX;
		theta0 -= _angleOfRevolution / 2. , theta1 -= _angleOfRevolution / 2.;
		double cos0 = cos(theta0) , cos1 = cos(theta1) , sin0 = sin(theta0) , sin1 = sin(theta1);
		corners[0][0] = Point3D< Real >(  (Real)( sample0[0] * cos0 ) , (Real)sample0[1] , (Real)( sample0[0] * sin0 ) );
		corners[1][0] = Point3D< Real >(  (Real)( sample0[0] * cos1 ) , (Real)sample0[1] , (Real)( sample0[0] * sin1 ) );
		corners[0][1] = Point3D< Real >(  (Real)( sample1[0] * cos0 ) , (Real)sample1[1] , (Real)( sample1[0] * sin0 ) );
		corners[1][1] = Point3D< Real >(  (Real)( sample1[0] * cos1 ) , (Real)sample1[1] , (Real)( sample1[0] * sin1 ) );
		dX = ( corners[1][0]-corners[0][0] ) * ( (Real)(1.-dy) ) + ( corners[1][1] - corners[0][1] ) * ( (Real)dy );
		dY = ( corners[0][1] * (Real)(1.-dx) + corners[1][1] * (Real)dx ) - ( corners[0][0] * (Real)(1.-dx) + corners[1][0] * (Real)dx );
	}
}
// Returns the un-normalized surface normal at (x,y): the cross product of the x- and y-tangents.
template< class Real >
Point3D< Real > SoRParameterization::normal( double x , double y ) const
{
	Point3D< Real > tanX , tanY;
	tangents( x , y , tanX , tanY );
	Point3D< Real > n = Point3D< Real >::CrossProduct( tanX , tanY );
	return n;
}

void SoRParameterization::_parameterToTrapezoid( int j , double& x , double& y ) const
{
	double width0 , width1 , height;
	width0 = _trapData[j].width0 , width1 = _trapData[j].width1 , height = _trapData[j].height;
	double width = width0 * ( 1.-y ) + width1 * y;
	x = - width/2 +  width*x;
	y = -height/2 + height*y;
}
void SoRParameterization::_trapezoidToParameter( int j , double& x , double& y ) const
{
	double width0 , width1 , height;
	width0 = _trapData[j].width0 , width1 = _trapData[j].width1 , height = _trapData[j].height;
	y = ( y + height/2 ) / height;
	double width = width0 * ( 1.-y ) + width1 * y;
	x = ( x + width/2 ) / width;
}
// Intersects the ray (x,y) + s*(dx,dy) with the boundary of the origin-centered trapezoid with bottom
// width width0, top width width1, and the given height.
// On return s holds the ray parameter of the selected intersection (-1 if none was found) and the
// return value identifies the edge that was hit (NO_INTERSECTION if none).
// The edge named by invalidIsectType is not tested. The bottom / top edge is also skipped when its
// width is zero.
int SoRParameterization::_trapezoidIntersect( double& s , double width0 , double width1 , double height , double x , double y , double dx , double dy , int invalidIsectType ) const
{
	int isectType = NO_INTERSECTION;
	s = -1;

	// Bottom and top edges:
	//     y + s * dy = (-/+)height/2  <=>  s = ( (-/+)height/2-y ) / dy
	if     ( dy<0 && width0 && invalidIsectType!=Y0_INTERSECTION ) s = (-height/2-y) / dy , isectType = Y0_INTERSECTION;
	else if( dy>0 && width1 && invalidIsectType!=Y1_INTERSECTION ) s = ( height/2-y) / dy , isectType = Y1_INTERSECTION;
	if( s<0 ) isectType = NO_INTERSECTION;

	// Slanted sides:
	//     (x,y) + s(dx,dy) = ( [-/+]width0/2 , -height/2 ) + t ([-/+](width1-width0)/2 , height )
	// <=> s(dx,dy) - t( [-/+](width1-width0)/2 , height ) = ( [-/+]width0/2 - x , -height/2 - y )
	// slantX is the (1,0) entry of the system and baseX the x-coordinate of the side's bottom corner.
	// A side replaces the current candidate if its s is positive and either smaller or the first one found.
	auto testSide = [&]( double slantX , double baseX , int sideType )
	{
		SquareMatrix< double , 2 > M;
		Point2D< double > p;
		M(0,0) = dx , M(0,1) = dy;
		M(1,0) = slantX , M(1,1) = -height;
		p[0] = baseX - x , p[1] = -height/2-y;
		p = M.inverse() * p;
		if( p[0]>0 && (p[0]<s || isectType==NO_INTERSECTION ) ) s = p[0] , isectType = sideType;
	};
	if( invalidIsectType!=X0_INTERSECTION ) testSide(  (width1-width0)/2 , -width0/2 , X0_INTERSECTION );
	if( invalidIsectType!=X1_INTERSECTION ) testSide( -(width1-width0)/2 ,  width0/2 , X1_INTERSECTION );

	return isectType;
}
void SoRParameterization::_trapReflectTangent( int j , int isectType , double& dx , double& dy ) const
{
	switch( isectType )
	{
	case Y0_INTERSECTION:
	case Y1_INTERSECTION:
		break;
	case X0_INTERSECTION:
	case X1_INTERSECTION:
		{
			double cos_alpha = _trapData[j].reflectCos , sin_alpha = _trapData[j].reflectSin;
			double _dx , _dy;
			// For left intersections, we want a CCW rotation
			if( isectType==X0_INTERSECTION ) _dx = cos_alpha * dx - sin_alpha * dy , _dy =  sin_alpha * dx + cos_alpha * dy;
			else                             _dx = cos_alpha * dx + sin_alpha * dy , _dy = -sin_alpha * dx + cos_alpha * dy;
			dx = _dx , dy = _dy;
		}
		break;
	default:
		fprintf( stderr , "[ERROR] Unrecognized intersection type: %d\n" , isectType );
	}
}

// Traces a straight path of length len across the trapezoidal faces, starting at (x,y) in direction (dx,dy).
// Input/Output:
// (x,y): On input, the global parameter coordinates of the start; the cell (i,j) being traced through is
//        ( floor(x) , floor(y) ) and the fractional parts are the coordinates within the cell, in [0,1]x[0,1].
//        On output, the global parameter coordinates of the end-point.
// (dx,dy): The coordinates of the tangent vector, in the local frame of the cell (updated as sides are crossed)
// Input:
// len: The distance to travel, in the same units as the trapezoid widths/heights
// The function terminates the process if the start band is out of range, if no intersection with the
// current trapezoid is found, or if the path is not completed within 100000 cell crossings.
void SoRParameterization::_trapGeodesic( double& x , double& y , double& dx , double& dy , double len ) const
{
	int i = (int)floor( x ) , j = (int)floor( y );
	x -= i , y -= j;
	// Copies of the initial state, used only for error reporting
	int _i = i , _j = j;
	double _x = x , _y = y , _dx = dx , _dy = dy , _len = len;

	if( j<0 || ( _gridType.yPeriodic() && j>=_resY ) || ( !_gridType.yPeriodic() && j>=_resY-1  ) )
		fprintf( stderr , "[ERROR] j-index is out of bounds: 0<=%d<%d\n" , j , _gridType.yPeriodic() ? _resY : _resY-1 ) , exit( 0 );
	// The edge through which the current cell was entered; it is excluded from the intersection test
	int invalidIsectType = NO_INTERSECTION;
	for( int count=0 ; count<100000 ; count++ )
	{
		// Bring the face index back into the grid; the flags report whether the cell was mirrored in x and/or y
		bool reflectX , reflectY , negate;
		RegularGridFEM::RemapFaceIndex( i , j , _resX , _resY , _gridType , reflectX , reflectY , negate );
		if( reflectX )
		{
			// Mirror the position, the direction, and the entry edge in x
			x = 1. - x;
			if( invalidIsectType==X0_INTERSECTION || invalidIsectType==X1_INTERSECTION ) _trapReflectTangent( j , invalidIsectType , dx , dy );
			dx = -dx;
			if     ( invalidIsectType==X0_INTERSECTION ) invalidIsectType = X1_INTERSECTION;
			else if( invalidIsectType==X1_INTERSECTION ) invalidIsectType = X0_INTERSECTION;
		}
		if( reflectY )
		{
			// Mirror the position, the direction, and the entry edge in y
			y = 1. - y;
			dy = -dy;
			if     ( invalidIsectType==Y0_INTERSECTION ) invalidIsectType = Y1_INTERSECTION;
			else if( invalidIsectType==Y1_INTERSECTION ) invalidIsectType = Y0_INTERSECTION;
		}
		// Remap the parameterization coordinates into the trapezoid coordinates
		_parameterToTrapezoid( j , x , y );
		double width0 , width1 , height;
		width0 = _trapData[j].width0 , width1 = _trapData[j].width1 , height = _trapData[j].height;
		// Compute the intersection of the ray with the sides of the trapezoid.
		double s=-1.;
		int isectType = _trapezoidIntersect( s , width0 , width1 , height , x , y , dx , dy , invalidIsectType );
		if( s<0 || isectType==NO_INTERSECTION )
		{
			// Report both the initial state and the current state, then give up
			fprintf( stderr , "[ERROR] Failed to find ray-trapezoid intersection: s=%f , i=%d , j=%d , x=%g , y=%g , dx=%g , dy=%g , len=%g\n" , s , _i , _j , _x , _y , _dx , _dy , _len );
			fprintf( stderr , "                                                   s=%f , i=%d , j=%d , x=%g , y=%g , dx=%g , dy=%g , len=%g\n" , s ,  i ,  j ,  x ,  y ,  dx ,  dy ,  len );
			printf( "\t[%d] Width: [%g , %g] ; Height: [%g]\n" , j , width0 , width1 , height );
			printf( "\t(x,y) = (%g,%g)\n" , x , y );
			exit( 0 );
		}
		if( s>len )
		{
			// The path ends inside this cell: advance by the remaining length and return global coordinates
			x += dx*len , y += dy*len;
			_trapezoidToParameter( j , x , y );
			x += i , y += j;
			return;
		}
		else
		{
			// The path leaves this cell: advance to the boundary and reduce the remaining length
			x += dx*s , y += dy*s;
			len -= s;
			_trapezoidToParameter( j , x , y );
		}
		// Advance the ray
		{
			// Step into the neighboring cell, pin the crossed coordinate to the shared edge, and record
			// the edge (as seen from the new cell) through which the ray entered
			switch( isectType )
			{
			case Y0_INTERSECTION: j-- , y = 1.0 , invalidIsectType = Y1_INTERSECTION ; break;
			case Y1_INTERSECTION: j++ , y = 0.0 , invalidIsectType = Y0_INTERSECTION ; break;
			case X0_INTERSECTION: i-- , x = 1.0 , invalidIsectType = X1_INTERSECTION ; break;
			case X1_INTERSECTION: i++ , x = 0.0 , invalidIsectType = X0_INTERSECTION ; break;
			}
			// Crossing a side changes the local frame, so the tangent has to be rotated
			if( isectType==X0_INTERSECTION || isectType==X1_INTERSECTION ) _trapReflectTangent( j , isectType , dx , dy );
		}
	}
	fprintf( stderr , "[ERROR] Failed to complete flat geodesic: i=%d , j=%d , x=%f , y=%f , dx=%f , dy=%f , len=%f\n" , _i , _j , _x , _y , _dx , _dy , _len ) , exit( 0 );
}
// Intersects the ray ( s*dx , y + s*dy ) with the origin-centered circle of the given radius and
// stores the ray parameter in s (-1 if the discriminant is not positive).
// Assuming a unit direction, the intersection satisfies
//     s^2 + 2*s*y*dy + ( y^2 - radius^2 ) = 0
// If the start point is already on the circle (onCircle) the constant term vanishes and the other
// root is s = -2*y*dy. Otherwise the smaller root is returned if it is positive, and the larger one if not.
void SoRParameterization::_circleIntersect( double& s , double radius , double y , double dx , double dy , bool onCircle ) const
{
	if( onCircle )
	{
		s = - 2. * dy * y;
		return;
	}

	const double halfB = dy * y;
	const double c = y * y - radius * radius;
	const double discriminant = halfB*halfB-c;
	if( !( discriminant>0 ) )
	{
		s = -1;
		return;
	}

	const double root = sqrt( discriminant );
	s = -halfB - root;
	if( s<=0 ) s = -halfB + root;
}
// Traces a geodesic of length len across the conical / cylindrical bands, starting at (x,y) in direction (dx,dy).
// (x,y): On input, the global parameter coordinates of the start; the band is j = floor(y).
//        On output, the global parameter coordinates of the end-point.
// (dx,dy): The tangent in the local frame of the band; updated in place as the path advances.
// Assume that the fractional part of y is in [0,1]
// Assume that || (dx,dy) || = 1
// The function terminates the process if an intersection cannot be found or if the path is not completed
// within 100000 band crossings.
void SoRParameterization::_coneGeodesic( double&x , double& y , double& dx , double& dy , double len ) const
{
	// Angle covered by one unit of the x-parameter
	const double X_SCALE = _angleOfRevolution / _resX;
	double theta = x * X_SCALE;
	int j = (int)floor( y );
	y -= j;
#if MY_ATAN_2
	// Table-driven replacement for atan2 that also produces the cosine and sine of the angle.
	// aTanTable1[i] holds ( angle , cos , sin ) for atan2( t , 1 ) and aTanTable2[i] for atan2( 1 , t ),
	// with t = i / (A_TAN_TABLE_RES-1).
	// NOTE(review): the tables are filled lazily and guarded by a plain static bool; confirm that the
	// first call cannot happen concurrently from several threads.
	static const int A_TAN_TABLE_RES = 1000;
	static double aTanTable1[A_TAN_TABLE_RES+1][3];
	static double aTanTable2[A_TAN_TABLE_RES+1][3];
	static bool aTanTableSet = false;
	if( !aTanTableSet )
	{
		for( int i=0 ; i<A_TAN_TABLE_RES+1 ; i++ )
		{
			double theta1 = atan2( (double)i/(A_TAN_TABLE_RES-1) , 1. );
			aTanTable1[i][0] = theta1;
			aTanTable1[i][1] = cos(theta1);
			aTanTable1[i][2] = sin(theta1);
			double theta2 = atan2( 1. , (double)i/(A_TAN_TABLE_RES-1) );
			aTanTable2[i][0] = theta2;
			aTanTable2[i][1] = cos(theta2);
			aTanTable2[i][2] = sin(theta2);
		}
		aTanTableSet = true;
	}
	// Returns an approximation of atan2( y , x ) and writes the matching cosine / sine.
	// The slope |y/x| (or its reciprocal, if larger than one) is looked up with linear interpolation
	// between neighboring table entries, and the result is then moved to the correct quadrant.
	auto my_atan_2 = [&] ( double y , double x , double& cos_theta , double& sin_theta )
	{
		double slope = fabs( y / x );
		bool flip = slope > 1.;
		if( flip ) slope = 1./slope;
		double ii = slope * (A_TAN_TABLE_RES-1);
		int i1 = (int)(ii) , i2 = i1+1;
		double d2 = ii - i1 , d1 = 1.-d2;
		double theta;
		if( flip )
		{
			theta     = aTanTable2[i1][0] * d1 + aTanTable2[i2][0] * d2;
			cos_theta = aTanTable2[i1][1] * d1 + aTanTable2[i2][1] * d2;
			sin_theta = aTanTable2[i1][2] * d1 + aTanTable2[i2][2] * d2;
		}
		else
		{
			theta     = aTanTable1[i1][0] * d1 + aTanTable1[i2][0] * d2;
			cos_theta = aTanTable1[i1][1] * d1 + aTanTable1[i2][1] * d2;
			sin_theta = aTanTable1[i1][2] * d1 + aTanTable1[i2][2] * d2;
		}
		if( x<0 ) cos_theta = -cos_theta , theta = PI - theta;
		if( y<0 ) sin_theta = -sin_theta , theta =     -theta;
		return theta;
	};
#endif // MY_ATAN_2
	// Which of the band's two circles (0 or 1) the current point lies on, or -1 if it lies on neither
	int onCircle = -1;
	// Copies of the initial state, used only for error reporting
	int __j = j;
	double __theta = theta , __y = y , __dx = dx , __dy = dy , __len = len;
	// The annulus with radius s0 , s1 is parameterized by:
	// \Phi(theta,y) = [ cos( theta ) , sin( theta ) ] * [ s0 * (1.-y) + s1 * y ]
	for( int count=0 ; count<100000 ; count++ )
	{
		int i = (int)floor( theta / X_SCALE );
		double dTheta = theta - i * X_SCALE;
		// Bring the face index back into the grid; the flags report whether the cell was mirrored in x and/or y
		bool reflectX , reflectY , negate;
		RegularGridFEM::RemapFaceIndex( i , j , _resX , _resY , _gridType , reflectX , reflectY , negate );
		if( reflectX )
		{
			// NOTE(review): dTheta is an angle in [0,X_SCALE), so mirroring within the cell would be
			// expected to use ( X_SCALE - dTheta ) rather than ( 1. - dTheta ) -- confirm.
			theta = i  * X_SCALE + ( 1. - dTheta );
			dx = -dx;
		}
		if( reflectY )
		{
			// Mirror the position, the direction, and the circle the point lies on
			y = 1. - y;
			dy = -dy;
			if     ( onCircle==0 ) onCircle = 1;
			else if( onCircle==1 ) onCircle = 0;
		}
		// which: the circle / height (0 or 1) through which the path leaves the band, or -1
		// minS: the distance to that exit
		int which = -1;
		double minS = -1;
		// Cylindrical case
		if( _coneData[j].ratio==0 || _coneData[j].ratio>1e4 )
		{
			double r = fabs( _samples[j][0] + _samples[(j+1)%_resY][0] ) / 2;
			double h0 = _samples[j][1] , h1 = _samples[(j+1)%_resY][1] , dh = h1-h0;
			// Negate the heights if necessary so that h1 > h0
			if( dh<0 ) h0 = -h0 , h1 = -h1 , dh = -dh;
			if( !dh ) fprintf( stderr , "[ERROR] No height difference in cylindrical region\n" ) , exit( 0 );
			double _y =  h0 * (1.-y) + h1 * y;
			// _y + s * dy = h
			// s = (h-_y)/dy
			if( dy>0 ) minS = (h1-_y)/dy , which = 1;
			else if( dy<0 ) minS = (h0-_y)/dy , which = 0;
			else minS = len , which = -1;
			if( minS<0 ) fprintf( stderr , "[ERROR] Failed to find intersection in cylindrical region: %f \\in[%f , %f] (%f %f) ... %f %f\n" , _y , h0 , h1 , dx , dy , (h0-_y)/dh , (h1-_y)/dh ) , exit( 0 );

			// On the unrolled cylinder the path is a straight line: convert the distance travelled along x into an angle
			theta += std::min< double >( minS , len ) * dx / r;

			if( minS>=len )
			{
				// The path ends inside this band: convert back to global parameter coordinates
				minS = len;
				_y += dy * len;
				y = (_y-h0) / (h1-h0);
				x = theta / X_SCALE , y += j;
				return;
			}
		}
		// Conical case
		else
		{
			// C_k(x,y):
			// => x^2 + y^2 = s_k^2
			// C_k( s*dx , y + s*dy ) = 0
			// => s^2 * dx^2 + y^2 + 2*s*y*dy + s^2*dy^2 - s_k^2 = 0
			// => s^2 * ( dx^2 + dy^2 ) + 2*s*y*dy + ( y^2 - s_k^2 ) = 0
			// _y is the distance from the apex of the current point
			double _y = _coneData[j].r0 * (1.-y) + _coneData[j].r1 * y;
			// Despite its name, dr is a flag: true if circle 1 is the larger of the two
			bool dr = _coneData[j].r1>_coneData[j].r0;

			// Flip the direction if the radius decreases with y
			double _dy = dy * ( dr ? 1 : -1 );
			double _dx = dx * ( dr ? 1 : -1 );

			// Intersections:
			// -- If we start on the interior circle, we don't need to test intersections with the interior circle
			// -- If we intersect the interior circle, we don't need to test intersections with the exterior circle
			// -- If we are on the circle, the other point of intersection can be found by solving a linear equation, rather than a quadratic one
#define CIRCLE_INTERSECT( WHICH )                                                                 \
			{                                                                                     \
				double s;                                                                         \
				_circleIntersect( s , _coneData[j].r##WHICH , _y , _dx , _dy , onCircle==WHICH ); \
				if( s>=0 ) which = WHICH , minS = s;                                              \
			}

			minS = -1;
			if( dr ) // circle 1 is bigger than circle 0
			{
				// Intersect with the smaller circle
				if( !( ( _gridType.yPole0() && j==0 ) || onCircle==0 ) ) CIRCLE_INTERSECT( 0 );
				// Intersect with the larger circle
				if( minS<0 && !( _gridType.yPole1() && j==_resY-2 ) ) CIRCLE_INTERSECT( 1 );
			}
			else // circle 0 is bigger than circle 1
			{
				// Intersect with the smaller circle
				if( !( ( _gridType.yPole1() && j==_resY-2 ) || onCircle==1 ) ) CIRCLE_INTERSECT( 1 );
				// Intersect with the larger circle
				if( minS<0 && !( _gridType.yPole0() && j==0 ) ) CIRCLE_INTERSECT( 0 );
			}
#undef CIRCLE_INTERSECT
			if( minS<0 )
			{
				fprintf( stderr , "[ERROR] Failed to intersect cone geodesic(%d)[%d][%d]: %f \\in [%f,%f][%f] (%f %f)\n" , which , j , onCircle , _y , _coneData[j].r0 , _coneData[j].r1 , _coneData[j].ratio , _dx , _dy );
				fprintf( stderr , "\tj=%d , theta=%g , y=%g , dx=%g , dy=%g , len=%g\n" , __j , __theta , __y , __dx , __dy , __len );
				fprintf( stderr , "\tSamples: (%f %f) (%f %f)\n" , _samples[j][0] , _samples[j][1] , _samples[(j+1)%_resY][0] , _samples[(j+1)%_resY][1] );
				exit( 0 );
			}
			// Do not travel further than the remaining length
			minS = std::min< double >( minS , len );
			// End-point of this step in the plane of the flattened cone (this __y shadows the debugging copy declared above)
			double __x = minS*_dx , __y = _y+minS*_dy;
#if MY_ATAN_2
			double cos_dTheta , sin_dTheta;
#else // !MY_ATAN_2
			double __l = sqrt( __x*__x + __y*__y );
#endif // MY_ATAN_2
			{
#if MY_ATAN_2
				double dTheta = PI/2 - my_atan_2( __y , __x , cos_dTheta , sin_dTheta );
				// Swapping the cosine and the sine gives the cosine and sine of PI/2 minus the angle
				{ double temp = cos_dTheta ; cos_dTheta = sin_dTheta , sin_dTheta = temp; }
#else // !MY_ATAN_2
				double dTheta = PI/2 - atan2( __y , __x );
				if( dTheta<-PI ) dTheta += 2.*PI;
				if( dTheta> PI ) dTheta -= 2.*PI;
#endif // MY_ATAN_2
				// cos( dTheta ) = cos( PI/2 - atan2( __y , __x ) )
				//               = cos( PI/2 ) * cos( -atan2( __y , __x ) ) - sin( PI/2 ) * sin( -atan2( __y , __x ) )
				//               = sin( atan2( __y , __x ) )
				//               = __y / sqrt( __x^2 + __y^2 )
				// sin( dTheta ) = sin( PI/2 - atan2( __y , __x ) )
				//               = sin( PI/2 ) * cos( -atan2( __y , __x ) ) + cos( PI/2 ) * sin( -atan2( __y , __x ) )
				//               = cos( atan2( __y , __x ) )
				//               = __x / sqrt( __x^2 + __y^2 )
				{
#if !MY_ATAN_2
					double cos_dTheta = __y / __l , sin_dTheta = __x / __l;
#endif // !MY_ATAN_2
					// Rotate the tangent by dTheta so that it is expressed in the frame at the new point
					double _dx =  dx*cos_dTheta - dy*sin_dTheta;
					double _dy =  dx*sin_dTheta + dy*cos_dTheta;
					dx = _dx , dy = _dy;
				}
				// Convert the angle in the flattened cone into an angle about the axis of revolution
				theta += dTheta * _coneData[j].ratio * ( dr ? 1 : -1 );
			}

			if( minS==len )
			{
				// The path ends inside this band: recover the distance from the apex of the end-point
#if MY_ATAN_2
				_y = __x * sin_dTheta + __y * cos_dTheta;
#else // !MY_ATAN_2
				_y = __l;
#endif // MY_ATAN_2
				//    _y = s0 * (1.-y) + s1 * y
				// =>    = s0 + y*(s1-s0)
				// =>  y = (_y-s0) / (s1-s0)
				y = ( _y - _coneData[j].r0 ) / (_coneData[j].r1-_coneData[j].r0);
				x = theta / X_SCALE , y += j;
				return;
			}
		}
		// Move into the neighboring band: leaving through circle 0 enters the band below on its circle 1, and vice versa
		onCircle = 1-which;
		if     ( which==0 ) j-- , y = 1.;
		else if( which==1 ) j++ , y = 0.;
		else fprintf( stderr , "[ERROR] Unrecognized which=%d: j=%d , theta=%f , y=%f , dx=%f , dy=%f , len=%f\n" , which , __j , __theta , __y , __dx , __dy , __len ) , exit( 0 );
		len -= minS;
	}
	fprintf( stderr , "[ERROR] Failed to complete cone geodesic: j=%d , theta=%f , y=%f , dx=%f , dy=%f , len=%f\n" , __j , __theta , __y , __dx , __dy , __len ) , exit( 0 );
}
// Walks from the parameter-space point (x,y) along the step (dx,dy).
// The step is split into a unit direction and a length, which are handed to the cone or trapezoid
// geodesic tracer (selected by _conicalGeometry). Afterwards the direction is multiplied back by the
// length variable, the position is passed through RegularGridFEM::RemapParameter, and the direction
// components are negated according to the reflection flags that remap reports.
// A step of exactly zero length skips the tracing and only performs the remap.
void SoRParameterization::_geodesic( double& x , double& y , double& dx , double& dy ) const
{
	double stepLength = dx*dx + dy*dy;
	if( stepLength!=0 )
	{
		stepLength = sqrt( stepLength );

		// Normalize the direction before tracing
		dx /= stepLength;
		dy /= stepLength;

		if( _conicalGeometry ) _coneGeodesic( x , y , dx , dy , stepLength );
		else                   _trapGeodesic( x , y , dx , dy , stepLength );

		// Undo the normalization on the direction returned by the tracer
		dx *= stepLength;
		dy *= stepLength;
	}

	bool flipX , flipY , negate;
	RegularGridFEM::RemapParameter( x , y , _resX , _resY , _gridType , flipX , flipY , negate );
	if( flipX ) dx = -dx;
	if( flipY ) dy = -dy;
}
void SoRParameterization::geodesic( double& x , double& y , double& dx , double& dy ) const
{
	bool reflectX , reflectY , negate;
	RegularGridFEM::RemapParameter( x , y , _resX , _resY , _gridType , reflectX , reflectY , negate );
	if( reflectX ) dx = -dx;
	if( reflectY ) dy = -dy;
	Point2D< double > d;
	d = _globalToLocal( (int)floor(y) , x-floor(x) , y-floor(y) , Point2D< double >( dx , dy ) );
	dx = d[0] , dy = d[1];
	_geodesic( x , y , dx , dy );
	d = _localToGlobal( (int)floor(y) , x-floor(x) , y-floor(y) , Point2D< double >( dx , dy ) );
	dx = d[0] , dy = d[1];
}
// Builds the 2x2 matrix with x in both diagonal entries, y in entry (0,1) and -y in entry (1,0).
template< class Real >
SquareMatrix< Real , 2 > SimilarityMatrix2D( Real x , Real y )
{
	SquareMatrix< Real , 2 > similarity;
	similarity(0,0) = x;
	similarity(1,1) = x;
	similarity(1,0) = -y;
	similarity(0,1) = y;
	return similarity;
}
// Advects a signal and/or a flow field backward along the vector field `vf`.
// For the center (i+0.5,j+0.5) of every face, geodesic steps are taken in the direction -v (with v re-sampled
// from `vf` along the way); `inOutF` / `inOutFlow` are then sampled at the end point of the trace and the
// samples become that face's value. The face values are finally handed to setFromFaceValues.
//   vf          : the advecting vector field
//   rotate      : if true, each velocity v is replaced by ( -v[1] , v[0] ) before stepping
//   inOutFlow   : optional (may be null) flow field, overwritten from the traced face values
//   inOutF      : optional (may be null) signal, overwritten from the traced face values
//   delta       : total advection time; it is divided evenly among the `subSteps` sub-steps
//   maxStepSize : if positive, upper bound on the length of a single geodesic step; otherwise every sub-step is one step
//   subSteps    : number of sub-steps
//   threads     : number of OpenMP threads
// Returns the number of inner-loop iterations averaged over the faces.
// NOTE(review): the scratch buffers are function-local statics, so concurrent calls would share them -- confirm callers never overlap.
template< class VectorData , class SignalData , class Real >
double SoRParameterization::advectBackward( const RegularGridFEM::template Derivative< Real , Real >& vf , bool rotate ,
	RegularGridFEM::template Derivative< VectorData , Real >* inOutFlow , RegularGridFEM::template Signal< SignalData , Real >* inOutF ,
	Real delta , Real maxStepSize , int subSteps , int threads ) const
{
	static std::vector< Real > zeros;
	zeros.resize( _resX , (Real)0. );
	static std::vector< SignalData > signalFaceValues;
	static std::vector< std::pair< VectorData , VectorData > > derivativeFaceValues;
	// Number of faces per band, and number of dy coefficients per band
	unsigned int fDimX = _gridType.xPeriodic() ? _resX : _resX-1;
	unsigned int vDimX = _gridType.xDirichlet() ? _resX-2 : _resX;

	signalFaceValues.resize( fDimX * bands() );
	derivativeFaceValues.resize( fDimX * bands() );
	int countSum = 0;
	delta /= subSteps;
#pragma omp parallel for num_threads( threads ) reduction( + : countSum ) schedule( dynamic )
	for( int j=0 ; j<(int)bands() ; j++ )
	{
		SquareMatrix< Real , 2 > transportM;
		Pointer( SignalData ) _signalFaceValues = GetPointer( signalFaceValues ) + j*fDimX;
		Pointer( std::pair< VectorData , VectorData > ) _derivativeFaceValues = GetPointer( derivativeFaceValues ) + j*fDimX;

		// The two rows of dx coefficients that are averaged for band j.
		// A row of zeros is substituted when the band touches a pole / Dirichlet boundary in y.
		ConstPointer( Real ) _dx0;
		ConstPointer( Real ) _dx1;
		if( j==0 &&         ( _gridType.yPole0() || _gridType.yDirichlet0() ) ) _dx0 = GetPointer( zeros );
		else if(              _gridType.yPole0() || _gridType.yDirichlet0() )   _dx0 = vf.dx() + (j-1) * fDimX;
		else                                                                    _dx0 = vf.dx() +  j    * fDimX;
		if( j==bands()-1 && ( _gridType.yPole1() || _gridType.yDirichlet1() ) ) _dx1 = GetPointer( zeros );
		else if(              _gridType.yPole0() || _gridType.yDirichlet0() )   _dx1 = vf.dx() +  j    * fDimX;
		else                                                                    _dx1 = vf.dx() + ( (j+1)%_resY ) * fDimX;
		ConstPointer( Real ) _dy = vf.dy() + j*vDimX;
		// With Dirichlet x-boundaries column 0 stores no coefficient, so shift the row to keep indexing by column
		if( _gridType.xDirichlet() ) _dy--;

		for( int i=0 ; i<(int)fDimX ; i++ )
		{
			transportM = SquareMatrix< Real , 2 >::Identity();
			double x = i+0.5 , y = j+0.5;
			int ii = (i+1)%_resX;

			// The y-component at the face center averages the dy coefficients of the face's two bounding columns.
			// With Dirichlet x-boundaries the outermost columns store no coefficient and contribute zero.
			// (Previously the test for the last face compared against fDimX, which the loop never reaches, so
			//  interior faces only picked up one of their two columns.)
			bool hasColumn0 = !( _gridType.xDirichlet() && i==0 );
			bool hasColumn1 = !( _gridType.xDirichlet() && i==(int)fDimX-1 );
			Real dySum = (Real)0;
			if( hasColumn0 ) dySum += _dy[i];
			if( hasColumn1 ) dySum += _dy[ii];
			Point2D< Real > _v = Point2D< Real >( (_dx0[i]+_dx1[i]) * (Real)0.5 , dySum * (Real)0.5 );
			Point2D< Real > v = _globalToLocal( j , 0.5 , 0.5 , _v );

			// Set once the velocity vanishes: the trace cannot move any further
			bool done = false;
			for( int s=0 ; s<subSteps && !done ; s++ )
			{
				double _delta = delta;
				while( 1 )
				{
					countSum++;
					double len = v.squareNorm();
					if( !len ){ done = true ;  break; }

					if( rotate ) v = Point2D< Real >( -v[1] , v[0] );
					// Largest time step for which the step length || t * v || stays within maxStepSize
					double _maxStepSize = maxStepSize<=0 ? _delta*2 : maxStepSize / sqrt( len );
					double dx = -v[0] * std::min< double >( _maxStepSize , _delta ) , dy = -v[1] * std::min< double >( _maxStepSize , _delta ) , _dx = dx , _dy = dy;
					_geodesic( x , y , _dx , _dy );
					if( inOutFlow )
					{
						// Accumulate the (normalized) similarity relating the step direction before and after the trace
						Real l2 = (Real)( dx*dx+dy*dy );
						if( l2>1e-12 )
						{
							transportM = transportM * SimilarityMatrix2D< Real >( (Real) dx , (Real) dy ) * SquareMatrix< Real , 2 >( SimilarityMatrix2D< Real >( (Real)_dx , (Real)_dy ).transpose() );
							transportM /= l2;
						}
					}
					// Re-sample the velocity only if another step will actually be taken
					if( _maxStepSize<_delta || s<subSteps-1 )
					{
						std::pair< Real , Real > _v = vf.sample( x , y );
						v = _globalToLocal( (int)floor(y) , x-floor(x) , y-floor(y) , Point2D< Real >( _v.first , _v.second ) );
					}
					if( _maxStepSize<_delta ) _delta -= _maxStepSize;
					else break;
				}
			}
			if( inOutF ) _signalFaceValues[i] = inOutF->sample( x , y );
			if( inOutFlow )
			{
				std::pair< VectorData , VectorData > temp = inOutFlow->sample( x , y );
				temp = _globalToLocal< VectorData , Real >( (int)floor(y) , x-floor(x) , y-floor(y) , temp );
				_derivativeFaceValues[i] = _localToGlobal< VectorData , Real >( j , 0.5 , 0.5 , std::pair< VectorData , VectorData >( temp.first * transportM(0,0) + temp.second * transportM(1,0) , temp.first * transportM(0,1) + temp.second * transportM(1,1) ) );
			}
		}
	}
	if( inOutF ) inOutF->setFromFaceValues( GetPointer( signalFaceValues ) , false , threads );
	if( inOutFlow ) inOutFlow->setFromFaceValues( ( ConstPointer( Real ) )GetPointer( derivativeFaceValues ) , false , threads );
	return ( (double)countSum ) / ( fDimX * bands() );
}
// Advects a signal and/or a flow field forward along the vector field `vf`.
// For the center (i+0.5,j+0.5) of every face, geodesic steps are taken in the direction +v (with v re-sampled
// from `vf` along the way); `inOutF` / `inOutFlow` are then sampled at the end point of the trace and the
// samples become that face's value. The face values are finally handed to setFromFaceValues.
//   vf          : the advecting vector field
//   rotate      : if true, each velocity v is replaced by ( -v[1] , v[0] ) before stepping
//   inOutFlow   : optional (may be null) flow field, overwritten from the traced face values
//   inOutF      : optional (may be null) signal, overwritten from the traced face values
//   delta       : total advection time; it is divided evenly among the `subSteps` sub-steps
//   maxStepSize : if positive, upper bound on the length of a single geodesic step; otherwise every sub-step is one step
//   subSteps    : number of sub-steps
//   threads     : number of OpenMP threads
// Returns the number of inner-loop iterations averaged over the _resX * bands() faces.
// NOTE(review): every row stride here is _resX, which looks like it assumes a grid that is periodic in x -- confirm against callers.
template< class VectorData , class SignalData , class Real >
double SoRParameterization::advectForward( const RegularGridFEM::template Derivative< Real , Real >& vf , bool rotate ,
	typename RegularGridFEM::template Derivative< VectorData , Real >* inOutFlow , RegularGridFEM::template Signal< SignalData , Real >* inOutF ,
	Real delta , Real maxStepSize , int subSteps , int threads ) const
{
	static std::vector< Real > zeros;
	zeros.resize( _resX , (Real)0. );
	static std::vector< SignalData > signalFaceValues;
	static std::vector< std::pair< VectorData , VectorData > > derivativeFaceValues;

	// The per-face scratch buffers must be sized before the parallel loop writes into them
	// (they were previously left at whatever size the statics happened to have, i.e. empty on first use).
	signalFaceValues.resize( _resX * bands() );
	derivativeFaceValues.resize( _resX * bands() );

	int countSum = 0;
	delta /= subSteps;
#pragma omp parallel for num_threads( threads ) reduction( + : countSum ) schedule( dynamic )
	for( int j=0 ; j<bands() ; j++ )
	{
		SquareMatrix< Real , 2 > transportM;
		Pointer( SignalData ) _signalFaceValues = GetPointer( signalFaceValues ) + j*_resX;
		Pointer( std::pair< VectorData , VectorData > ) _derivativeFaceValues = GetPointer( derivativeFaceValues ) + j*_resX;

		// The two rows of dx coefficients that are averaged for band j.
		// A row of zeros is substituted when the band touches a pole / Dirichlet boundary in y.
		ConstPointer( Real ) _dx0;
		ConstPointer( Real ) _dx1;
		if( j==0 &&         ( _gridType.yPole0() || _gridType.yDirichlet0() ) ) _dx0 = GetPointer( zeros );
		else if(              _gridType.yPole0() || _gridType.yDirichlet0() )   _dx0 = vf.dx() + (j-1) * _resX;
		else                                                                    _dx0 = vf.dx() +  j    * _resX;
		if( j==bands()-1 && ( _gridType.yPole1() || _gridType.yDirichlet1() ) ) _dx1 = GetPointer( zeros );
		else if(              _gridType.yPole0() || _gridType.yDirichlet0() )   _dx1 = vf.dx() +  j    * _resX;
		else                                                                    _dx1 = vf.dx() + ( (j+1)%_resY ) * _resX;
		ConstPointer( Real ) _dy = vf.dy() + j*_resX;

		for( int i=0 ; i<_resX ; i++ )
		{
			transportM = SquareMatrix< Real , 2 >::Identity();
			double x = i+0.5 , y = j+0.5;
			int ii = (i+1)%_resX;
			// Velocity at the face center: average of the two dx rows and of the two bounding dy columns
			Point2D< Real > _v( (_dx0[i]+_dx1[i]) * (Real)0.5  , (_dy[i]+_dy[ii]) * (Real)0.5 );
			Point2D< Real > v = _globalToLocal( j , 0.5 , 0.5 , _v );
			for( int s=0 ; s<subSteps ; s++ )
			{
				double _delta = delta;
				while( 1 )
				{
					countSum++;
					double len = v.squareNorm();
					if( !len ) break;

					if( rotate ) v = Point2D< Real >( -v[1] , v[0] );
					// Largest time step for which the step length || t * v || stays within maxStepSize
					double _maxStepSize = maxStepSize<=0 ? _delta*2 : maxStepSize / sqrt( len );
					double dx = v[0] * std::min< double >( _maxStepSize , _delta ) , dy = v[1] * std::min< double >( _maxStepSize , _delta ) , _dx = dx , _dy = dy;
					_geodesic( x , y , _dx , _dy );
					if( inOutFlow )
					{
						// Accumulate the (normalized) similarity relating the step direction before and after the trace
						Real l2 = (Real)( dx*dx+dy*dy );
						if( l2>1e-12 )
						{
							transportM = SimilarityMatrix2D< Real >( (Real) _dx , (Real) _dy ) * SquareMatrix< Real , 2 >( SimilarityMatrix2D< Real >( (Real)dx , (Real)dy ).transpose() ) * transportM;
							transportM /= l2;
						}
					}
					// Re-sample the velocity only if another step will actually be taken
					if( _maxStepSize<_delta || s<subSteps-1 )
					{
						std::pair< Real , Real > _v = vf.sample( x , y );
						v = _globalToLocal( (int)floor(y) , x-floor(x) , y-floor(y) , Point2D< Real >( _v.first , _v.second ) );
					}
					if( _maxStepSize<_delta ) _delta -= _maxStepSize;
					else break;
				}
			}
			if( inOutF ) _signalFaceValues[i] = inOutF->sample( x , y );
			if( inOutFlow )
			{
				std::pair< VectorData , VectorData > temp = inOutFlow->sample( x , y );
				temp = _globalToLocal< VectorData , Real >( (int)floor(y) , x-floor(x) , y-floor(y) , temp );
				_derivativeFaceValues[i] = _localToGlobal< VectorData , Real >( j , 0.5 , 0.5 , std::pair< VectorData , VectorData >( temp.first * transportM(0,0) + temp.second * transportM(1,0) , temp.first * transportM(0,1) + temp.second * transportM(1,1) ) );
			}
		}
	}
	if( inOutF ) inOutF->setFromFaceValues( GetPointer( signalFaceValues ) , false , threads );
	if( inOutFlow ) inOutFlow->setFromFaceValues( GetPointer( derivativeFaceValues ) , false , threads );
	return ( (double)countSum ) / ( _resX * bands() );
}

// Advects a single point forward along the vector field `vf`.
//   vf          : the advecting vector field
//   rotate      : if true, each sampled velocity v is replaced by ( -v[1] , v[0] ) before stepping
//   position    : in/out parameter-space position of the point
//   delta       : total advection time; it is divided evenly among the `subSteps` sub-steps
//   maxStepSize : if positive, upper bound on the length of a single geodesic step; otherwise every sub-step is one step
//   subSteps    : number of sub-steps
// Returns the number of inner-loop iterations that were executed.
template< class Real >
int SoRParameterization::advectForward( const RegularGridFEM::template Derivative< Real , Real >& vf , bool rotate , Point2D< double >& position , Real delta , Real maxStepSize , int subSteps ) const
{
	int iterations = 0;

	// Time budget of one sub-step
	delta /= subSteps;

	double x = position[0] , y = position[1];
	std::pair< Real , Real > startSample = vf.sample( x , y );
	Point2D< Real > v = _globalToLocal( (int)floor(y) , x-floor(x) , y-floor(y) , Point2D< Real >( startSample.first , startSample.second ) );

	for( int s=0 ; s<subSteps ; s++ )
	{
		const bool moreSubSteps = s<subSteps-1;
		double remaining = delta;
		for( ; ; )
		{
			iterations++;
			double speed2 = v.squareNorm();
			// A vanishing velocity ends this sub-step
			if( speed2==0 ) break;

			if( rotate ) v = Point2D< Real >( -v[1] , v[0] );

			// Largest time step whose geodesic length stays within maxStepSize (no bound if maxStepSize is not positive)
			double timeCap = maxStepSize<=0 ? remaining*2 : maxStepSize / sqrt( speed2 );
			const bool capped = timeCap<remaining;
			double t = std::min< double >( timeCap , remaining );
			double stepX = v[0] * t , stepY = v[1] * t;
			_geodesic( x , y , stepX , stepY );

			// Only re-sample the velocity when another step is going to be taken
			if( capped || moreSubSteps )
			{
				std::pair< Real , Real > sample = vf.sample( x , y );
				v = _globalToLocal( (int)floor(y) , x-floor(x) , y-floor(y) , Point2D< Real >( sample.first , sample.second ) );
			}

			if( !capped ) break;
			remaining -= timeCap;
		}
	}
	position = Point2D< double >( x ,  y );
	return iterations;
}

// Runs randomized consistency checks on the discretized operators and prints every discrepancy that reaches `cutOff`.
// For `tests` pairs of random signals / derivative fields it compares:
//   - the mass inner product computed via the mass matrix, screenedLaplacian( 1 , 0 ), dual() and dotProduct()
//   - the stiffness inner product computed via the stiffness matrix, divergence( gradient ), screenedLaplacian( 0 , 1 ) and dotProduct() of the gradients
//   - the 1-form inner product computed via dual() and dotProduct()
// and checks each of them for symmetry in its two arguments.
// If no Dirichlet boundary is present it additionally checks that the constant function has vanishing
// gradient / Laplacian and that the mass matrix applied to it sums to area().
//   tests   : number of random trials
//   threads : number of threads passed on to the operators
//   cutOff  : errors greater than or equal to this value are reported
template< class Real >
void SoRParameterization::sanityCheckSystem( int tests , int threads , double cutOff ) const
{
	SparseMatrix< Real , int > mass , stiffness;

	poissonSystem( mass , stiffness , threads );
	RegularGridFEM::template     Signal< Real , Real > s1( _resX , _resY , _gridType );
	RegularGridFEM::template     Signal< Real , Real > s2( _resX , _resY , _gridType );
	RegularGridFEM::template     Signal< Real , Real > _s( _resX , _resY , _gridType );
	RegularGridFEM::template Derivative< Real , Real > v1( _resX , _resY , _gridType );
	RegularGridFEM::template Derivative< Real , Real > v2( _resX , _resY , _gridType );
	RegularGridFEM::template Derivative< Real , Real > _v( _resX , _resY , _gridType );

	for( int t=0 ; t<tests ; t++ )
	{
		for( int i=0 ; i<(int)s1.dim() ; i++ ) s1[i] = Random< Real >() , s2[i] = Random< Real >();

		// *Error1 / *Error2: deviation of each evaluation from the reference dotProduct, for (s1,s2) and (s2,s1) respectively
		// *Error: asymmetry of each evaluation (including the reference) under swapping the arguments
		double massError1[3] , massError2[3] , massError[4] , stiffnessError1[3] , stiffnessError2[3] , stiffnessError[4] , dualError1[1] , dualError2[1];

		// Test the mass matrix
		{
			double dot1[2] , dot2[2] , dot3[2] , dot[2];

			// < mass * s1 , s2 >
			dot1[0] = 0;
			mass.MultiplyParallel( s1() , _s() , threads , 0 );
			for( unsigned int i=0 ; i<_s.dim() ; i++ ) dot1[0] += _s[i] * s2[i];
			// < ( mass * 1 + stiffness * 0 ) * s1 , s2 >
			dot2[0] = 0;
			screenedLaplacian( s1 , _s , 1. , 0. , 0 , threads );
			for( unsigned int i=0 ; i<_s.dim() ; i++ ) dot2[0] += _s[i] * s2[i];
			// < Dual(s1) , s2 >
			dot3[0] = 0;
			dual( s1 , _s , threads );
			for( unsigned int i=0 ; i<_s.dim() ; i++ ) dot3[0] += _s[i] * s2[i];
			// s1^t * M_0 * s2
			dot[0] = dotProduct( s1 , s2 );
			massError1[0] = fabs( dot1[0] - dot[0] ) , massError1[1] = fabs( dot2[0] - dot[0] ) , massError1[2] = fabs( dot3[0] - dot[0] );

			// < mass * s2 , s1 >
			dot1[1] = 0;
			mass.MultiplyParallel( s2() , _s() , threads , 0 );
			for( unsigned int i=0 ; i<_s.dim() ; i++ ) dot1[1] += _s[i] * s1[i];
			// < ( mass * 1 + stiffness * 0 ) * s2 , s1 >
			dot2[1] = 0;
			screenedLaplacian( s2 , _s , 1. , 0. , 0 , threads );
			for( unsigned int i=0 ; i<_s.dim() ; i++ ) dot2[1] += _s[i] * s1[i];
			// < Dual(s2) , s1 >
			dot3[1] = 0;
			dual( s2 , _s , threads );
			for( unsigned int i=0 ; i<_s.dim() ; i++ ) dot3[1] += _s[i] * s1[i];
			// s2^t * M_0 * s1
			dot[1] = dotProduct( s2 , s1 );
			massError2[0] = fabs( dot1[1] - dot[1] ) , massError2[1] = fabs( dot2[1] - dot[1] ) , massError2[2] = fabs( dot3[1] - dot[1] );

			massError[0] = fabs( dot[0]-dot[1] ) , massError[1] = fabs( dot1[0]-dot1[1] ) , massError[2] = fabs( dot2[0]-dot2[1] ) , massError[3] = fabs( dot3[0]-dot3[1] );
		}

		// Test the stiffness matrix
		{
			gradient( s1 , v1 ) , gradient( s2 , v2 );
			double dot1[2] , dot2[2] , dot3[2] , dot[2];

			// < stiffness * s1 , s2 >
			dot1[0] = 0;
			stiffness.MultiplyParallel( s1() , _s() , threads , 0 );
			for( unsigned int i=0 ; i<_s.dim() ; i++ ) dot1[0] += _s[i] * s2[i];
			// < div( grad( s1 ) ) , s2 >
			dot2[0] = 0;
			divergence( v1 , _s , threads );
			for( unsigned int i=0 ; i<_s.dim() ; i++ ) dot2[0] += _s[i] * s2[i];
			// < ( mass * 0 + stiffness * 1 ) * s1 , s2 >
			dot3[0] = 0;
			screenedLaplacian( s1 , _s , 0. , 1. , 0 , threads );
			for( unsigned int i=0 ; i<_s.dim() ; i++ ) dot3[0] += _s[i] * s2[i];
			// s1^T * M_1 * s2
			dot[0] = dotProduct( v1 , v2 );
			// Only three evaluations exist, so only three entries are written (the array has exactly three slots)
			stiffnessError1[0] = fabs( dot1[0] - dot[0] ) , stiffnessError1[1] = fabs( dot2[0] - dot[0] ) , stiffnessError1[2] = fabs( dot3[0] - dot[0] );

			// < stiffness * s2 , s1 >
			dot1[1] = 0;
			stiffness.MultiplyParallel( s2() , _s() , threads , 0 );
			for( unsigned int i=0 ; i<_s.dim() ; i++ ) dot1[1] += _s[i] * s1[i];
			// < div( grad( s2 ) ) , s1 >
			dot2[1] = 0;
			divergence( v2 , _s , threads );
			for( unsigned int i=0 ; i<_s.dim() ; i++ ) dot2[1] += _s[i] * s1[i];
			// < ( mass * 0 + stiffness * 1 ) * s2 , s1 >
			dot3[1] = 0;
			screenedLaplacian( s2 , _s , 0. , 1. , 0 , threads );
			for( unsigned int i=0 ; i<_s.dim() ; i++ ) dot3[1] += _s[i] * s1[i];
			// s2^T * M_1 * s1
			dot[1] = dotProduct( v2 , v1 );
			stiffnessError2[0] = fabs( dot1[1] - dot[1] ) , stiffnessError2[1] = fabs( dot2[1] - dot[1] ) , stiffnessError2[2] = fabs( dot3[1] - dot[1] );

			// Symmetry of all four evaluations, mirroring the mass test (the fourth entry is read and printed below)
			stiffnessError[0] = fabs( dot[0]-dot[1] ) , stiffnessError[1] = fabs( dot1[0]-dot1[1] ) , stiffnessError[2] = fabs( dot2[0]-dot2[1] ) , stiffnessError[3] = fabs( dot3[0]-dot3[1] );
		}


		for( int i=0 ; i<(int)v1.dim() ; i++ ) v1[i] = Random< Real >() , v2[i] = Random< Real >();
		// Test the 1-form mass-matrix
		{
			double dot , dot1 , dot2;
			dot = dotProduct( v1 , v2 );

			// < Dual(v1) , v2 >
			dot1 = 0;
			dual( v1 , _v , threads );
			for( unsigned int i=0 ; i<_v.dim() ; i++ ) dot1 += _v[i] * v2[i];
			// < Dual(v2) , v1 >
			dot2 = 0;
			dual( v2 , _v , threads );
			for( unsigned int i=0 ; i<_v.dim() ; i++ ) dot2 += _v[i] * v1[i];

			dualError1[0] = fabs( dot-dot1 );
			dualError2[0] = fabs( dot-dot2 );
		}
		bool mError1 = massError1[0]>=cutOff || massError1[1]>=cutOff || massError1[2]>=cutOff;
		bool mError2 = massError2[0]>=cutOff || massError2[1]>=cutOff || massError2[2]>=cutOff;
		bool mError  = massError [0]>=cutOff || massError [1]>=cutOff || massError [2]>=cutOff || massError[3]>=cutOff;
		bool sError1 = stiffnessError1[0]>=cutOff || stiffnessError1[1]>=cutOff || stiffnessError1[2]>=cutOff;
		bool sError2 = stiffnessError2[0]>=cutOff || stiffnessError2[1]>=cutOff || stiffnessError2[2]>=cutOff;
		bool dError1 = dualError1[0]>=cutOff;
		bool dError2 = dualError2[0]>=cutOff;
		bool sError  = stiffnessError [0]>=cutOff || stiffnessError [1]>=cutOff || stiffnessError [2]>=cutOff || stiffnessError[3]>=cutOff;
		// Print the header whenever any of the detail lines below will be printed
		if( mError || mError1 || mError2 || sError || sError1 || sError2 || dError1 || dError2 ) printf( "Test %d:\n" , t );
		if( mError  ) printf( "\tMass: %g %g %g %g\n" , massError[0] , massError[1] , massError[2] , massError[3] );
		if( mError1 ) printf( "\tMass(1): %g %g %g\n" , massError1[0] , massError1[1] , massError1[2] );
		if( mError2 ) printf( "\tMass(2): %g %g %g\n" , massError2[0] , massError2[1] , massError2[2] );
		if( sError  ) printf( "\tStiffness: %g %g %g %g\n" , stiffnessError[0] , stiffnessError[1] , stiffnessError[2] , stiffnessError[3] );
		if( sError1 ) printf( "\tStiffness(1): %g %g %g\n" , stiffnessError1[0] , stiffnessError1[1] , stiffnessError1[2] );
		if( sError2 ) printf( "\tStiffness(2): %g %g %g\n" , stiffnessError2[0] , stiffnessError2[1] , stiffnessError2[2] );
		if( dError1 ) printf( "\tDual(1): %g\n" , dualError1[0] );
		if( dError2 ) printf( "\tDual(2): %g\n" , dualError2[0] );
	}

	// Test that the constant functions have zero gradients/laplacians and evaluate areas
	if( !( _gridType.yDirichlet0() || _gridType.yDirichlet1() || _gridType.xDirichlet() ) )
	{
		for( int i=0 ; i<(int)s1.dim() ; i++ ) s1[i] = (Real)1.;
		gradient( s1 , v1 , threads );
		stiffness.MultiplyParallel( s1() , _s() , threads , 0 );
		screenedLaplacian( s1 , s2 , 0. , 1. , 0 , threads );
		double n1 = 0 , n2 = 0;
		for( int i=0 ; i<(int)v1.dim() ; i++ ) n1 += v1[i] * v1[i];
		for( int i=0 ; i<(int)_s.dim() ; i++ ) n2 += _s[i] * _s[i];
		double errors[] = { dotProduct(v1,v1) , dotProduct(_s,_s) , dotProduct(s2,s2) , n1 , n2 };
		if( errors[0]>=cutOff || errors[1]>=cutOff || errors[2]>=cutOff || errors[3]>=cutOff || errors[4]>=cutOff )
			printf( "Constant: %g -> %g , %g , %g : %g , %g\n" , dotProduct(s1,s1) , errors[0] , errors[1] , errors[2] , errors[3] , errors[4] );

		mass.MultiplyParallel( s1() , _s() , threads , 0 );
		double a = 0;
		for( int i=0 ; i<(int)_s.dim() ; i++ ) a += _s[i];
		double err = fabs( a - area() );
		if( err>=cutOff ) printf( "Area: %g %g -> %g\n" , a , area() , err );
	}
}
// Fills h1 (and, if a second one exists, h2) with normalized harmonic derivative fields and returns how many were produced (0, 1 or 2).
// Both outputs are resized to the grid and zeroed first; the fields are written to h1, then h2, in the order they are found.
template< class Real >
int SoRParameterization::harmonics( RegularGridFEM::template Derivative< Real , Real >& h1 , RegularGridFEM::template Derivative< Real , Real >& h2 ) const
{
	h1.resize( _resX , _resY , _gridType );
	h2.resize( _resX , _resY , _gridType );
	for( unsigned int i=0 ; i<h1.dim() ; i++ )
	{
		h2[i] = 0.;
		h1[i] = h2[i];
	}

	int found = 0;
	const bool xDirichlet = _gridType.xDirichlet();
	const bool yDirichlet = _gridType.yDirichlet0() && _gridType.yDirichlet1();

	// Candidate 1: the gradient of a function that is constant on the parallels.
	// Locally this is a gradient but it is not divergence-free, so the divergence part is projected out.
	if( !xDirichlet && ( _gridType.yPeriodic() || yDirichlet ) )
	{
		RegularGridFEM::template Derivative< Real , Real >& field = found ? h2 : h1;
		const unsigned int rowStride = _gridType.xDirichlet() ? _resX-2 : _resX;

		for( int i=0 ; i<(int)field.dyDim() ; i++ ) field.dy()[i] = (Real)1.;

		RegularGridFEM::template Signal< Real , Real > potential;
		RegularGridFEM::template Derivative< Real , Real > potentialGradient;
		divergence( field , potential );

		// Solve a (slightly mass-regularized) tridiagonal system per row, using the first entry of each row as the constraint
		BandedMatrix< Real , 1 > M , stiffness;
		poissonFrequencySystem( M , stiffness , 0 );
		std::vector< Real > rhs( M.rows() ) , solution( M.rows() );
		for( int j=0 ; j<(int)rhs.size() ; j++ ) rhs[j] = potential[j*rowStride];
		for( int i=0 ; i<M.rows() ; i++ )
			for( int k=0 ; k<3 ; k++ )
				M[i][k] = (Real)( stiffness[i][k] + M[i][k]*1e-8 );
		TridiagonalSolver< Real >::Solve( M , ( ConstPointer(Real) )GetPointer( rhs ) , ( Pointer(Real) )GetPointer( solution ) );

		// Spread the per-row solution across the row and subtract its gradient from the candidate
		for( int j=0 ; j<(int)solution.size() ; j++ )
			for( int i=0 ; i<(int)rowStride ; i++ )
				potential[j*rowStride+i] = solution[j];
		gradient( potential , potentialGradient );
		for( int i=0 ; i<(int)field.dim() ; i++ ) field[i] -= potentialGradient[i];

		Real scale = (Real)( 1./sqrt( dotProduct( field , field ) ) );
		for( int i=0 ; i<(int)field.dim() ; i++ ) field[i] *= scale;
		found++;
	}

	// Candidate 2: the gradient of a function that is constant on the meridians.
	// This one is already divergence-free, so no projection is required.
	if( !yDirichlet && ( _gridType.xPeriodic() || xDirichlet ) )
	{
		RegularGridFEM::template Derivative< Real , Real >& field = found ? h2 : h1;
		for( int i=0 ; i<(int)field.dxDim() ; i++ ) field.dx()[i] = (Real)1.;
		Real scale = (Real)( 1./sqrt( squareNorm( field ) ) );
		for( int i=0 ; i<(int)field.dim() ; i++ ) field[i] *= scale;
		found++;
	}

	return found;
}
// For grids with Neumann boundaries in x, doubles the constraint stored at the two ends of every row
// (entries 0 and resX-1), or the single entry of a pole row. Dirichlet rows (j==0 with yDirichlet0,
// j==resY-1 with yDirichlet1) are skipped. Does nothing for other x-boundary types.
//   constraints : constraint coefficients, modified in place
//   resX , resY : grid resolution
//   gridType    : boundary type of the grid
//   threads     : number of OpenMP threads
// The row pointer is typed on Data (the element type of `constraints`) rather than on Real, so that the
// scaling acts on whole Data elements when the two template parameters differ.
template< class Data , class Real >
void SoRParameterization::ToDoubleCoveringConstraints( Pointer( Data ) constraints , int resX , int resY , RegularGridFEM::GridType gridType , int threads )
{
	if( gridType.xNeumann() )
#pragma omp parallel for num_threads( threads )
		for( int j=0 ; j<resY ; j++ )
		{
			if     ( j==0      && gridType.yDirichlet0() ) continue;
			else if( j==resY-1 && gridType.yDirichlet1() ) continue;

			// Offset of row j: a pole at y=0 occupies a single entry, a Dirichlet row at y=0 occupies none.
			// The offset is resolved before forming the pointer so that no out-of-range pointer is ever created.
			int offset;
			if     ( j==0 && gridType.yPole0() ) offset = 0;
			else if( gridType.yPole0() )         offset = 1 + (j-1) * resX;
			else if( gridType.yDirichlet0() )    offset =     (j-1) * resX;
			else                                 offset =      j    * resX;
			Pointer( Data ) _constraints = constraints + offset;

			if     ( j==0      && gridType.yPole0() ) _constraints[0] *= (Real)2.;
			else if( j==resY-1 && gridType.yPole1() ) _constraints[0] *= (Real)2.;
			else _constraints[0] *= (Real)2. , _constraints[resX-1] *= (Real)2.;
		}
}
// For grids with Neumann boundaries in x, halves the constraint stored at the two ends of every row
// (entries 0 and resX-1), or the single entry of a pole row. Dirichlet rows (j==0 with yDirichlet0,
// j==resY-1 with yDirichlet1) are skipped. Does nothing for other x-boundary types.
//   constraints : constraint coefficients, modified in place
//   resX , resY : grid resolution
//   gridType    : boundary type of the grid
//   threads     : number of OpenMP threads
// The row pointer is typed on Data (the element type of `constraints`) rather than on Real, so that the
// scaling acts on whole Data elements when the two template parameters differ.
template< class Data , class Real >
void SoRParameterization::FromDoubleCoveringConstraints( Pointer( Data ) constraints , int resX , int resY , RegularGridFEM::GridType gridType , int threads )
{
	if( gridType.xNeumann() )
#pragma omp parallel for num_threads( threads )
		for( int j=0 ; j<resY ; j++ )
		{
			if     ( j==0      && gridType.yDirichlet0() ) continue;
			else if( j==resY-1 && gridType.yDirichlet1() ) continue;

			// Offset of row j: a pole at y=0 occupies a single entry, a Dirichlet row at y=0 occupies none.
			// The offset is resolved before forming the pointer so that no out-of-range pointer is ever created.
			int offset;
			if     ( j==0 && gridType.yPole0() ) offset = 0;
			else if( gridType.yPole0() )         offset = 1 + (j-1) * resX;
			else if( gridType.yDirichlet0() )    offset =     (j-1) * resX;
			else                                 offset =      j    * resX;
			Pointer( Data ) _constraints = constraints + offset;

			if     ( j==0      && gridType.yPole0() ) _constraints[0] /= (Real)2.;
			else if( j==resY-1 && gridType.yPole1() ) _constraints[0] /= (Real)2.;
			else _constraints[0] /= (Real)2. , _constraints[resX-1] /= (Real)2.;
		}
}
