/*
Copyright (c) 2011, Michael Kazhdan and Ming Chuang
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

Redistributions of source code must retain the above copyright notice, this list of
conditions and the following disclaimer. Redistributions in binary form must reproduce
the above copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the distribution. 

Neither the name of the Johns Hopkins University nor the names of its contributors
may be used to endorse or promote products derived from this software without specific
prior written permission. 

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO THE IMPLIED WARRANTIES 
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE  GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
DAMAGE.
*/
// Sizes the per-level storage of the solver.
//   blockSize , dimensions: forwarded unchanged to ParallelMultigridSolver::Init.
//   setDSFromElements:      if true, DS is resized to match the restriction operators and
//                           DS[1..] is rebuilt with BasisDownSampleMatrix (DS[0] is not written here).
// For every level the element matrix is sized, the sparsity pattern of _M is laid out and
// each row of _M is then shortened by one entry.  resizeBasisMatrix places a row's own
// (diagonal) entry last, so the shortened row holds the off-diagonal entries only, while the
// diagonal stays addressable one past the end (ResetMatrix reads it from there into _D).
template< class Real , bool Primal >
template< class C >
void MeshOctree< Real , Primal >::ParallelSolver< C >::Init( int blockSize , int dimensions , bool setDSFromElements )
{
	if( setDSFromElements )
	{
		DS.resize( restriction.size() );
		int level = 1;
		while( level<restriction.size() )
		{
			BasisDownSampleMatrix( e2b[level].e2b , restriction[level] , e2b[level-1].e2b , DS[level] , threads );
			level++;
		}
	}

	ParallelMultigridSolver< Real , C , int >::Init( blockSize , dimensions );

	_eM.resize( DS.size() );
	for( int level=0 ; level<DS.size() ; level++ )
	{
		_eM[level].resize( e2b[level].elementNum() );
		e2b[level].resizeBasisMatrix( _M[level] );
		// Hide the trailing diagonal entry of every row
		for( int row=0 ; row<_M[level].groups ; row++ ) _M[level].groupSizes[row] -= 1;
		_D[level].Resize( _M[level].groups );
	}
}
// Re-assembles the system matrix of every level from the element matrices.
// The finest element matrix ( _eM.back() ) is read as-is; the coarser ones are produced here by
// applying the restriction operators level by level.  Afterwards every row is divided by the
// entry stored one past its end, and that entry is saved in _D.
//   threads: number of OpenMP threads to use.
template< class Real , bool Primal >
template< class C >
void MeshOctree< Real , Primal >::ParallelSolver< C >::ResetMatrix( int threads )
{
	// Finest level
	e2b.back().elementToBasisMatrix( _eM.back() , _M.back() , threads , false );

	// Remaining levels, from fine to coarse
	for( int coarse=DS.size()-2 ; coarse>=0 ; coarse-- )
	{
		const int fine = coarse+1;
		restriction[fine].downSample( _eM[fine] , _eM[coarse] , threads , false );
		e2b[coarse].elementToBasisMatrix( _eM[coarse] , _M[coarse] , threads , false );
	}

	// Pull the diagonal out of each row and normalize the row by it
	for( int level=0 ; level<DS.size() ; level++ )
	{
#pragma omp parallel for num_threads( threads )
		for( int r=0 ; r<_M[level].groups ; r++ )
		{
			Pointer( MatrixEntry< Real , int > ) entries = _M[level][r];
			const int count = _M[level].groupSizes[r];
			_D[level][r] = entries[count].Value;
			for( int e=0 ; e<count ; e++ ) entries[e].Value /= _D[level][r];
		}
	}
}

// Static tables shared by all ElementRestrictionOperator instances; they are filled in by the
// first constructor call (guarded by _upSampleValuesSet).
//   UpSampleValues[j][c][k]:           weight relating coarse element function j to element function k of the child at corner c.
//                                      The last dimension is 28 rather than 27 in the non-primal case; the extra slot is never
//                                      written by the constructor (presumably padding so that a row is a whole number of SSE vectors).
//   NonZeroUpSampleValues[j][c]:       heap-allocated list of the ( k , weight ) pairs of UpSampleValues[j][c] with non-zero weight.
//   NonZeroUpSampleValueCount[j][c]:   length of that list.
template< class Real , bool Primal > __declspec( align( 16 ) ) Real MeshOctree< Real , Primal >::ElementRestrictionOperator::UpSampleValues[Primal?8:27][8][Primal?8:28];
template< class Real , bool Primal >                           bool MeshOctree< Real , Primal >::ElementRestrictionOperator::_upSampleValuesSet = false;
template< class Real , bool Primal > std::pair< int , Real >*       MeshOctree< Real , Primal >::ElementRestrictionOperator::NonZeroUpSampleValues[Primal?8:27][8];
template< class Real , bool Primal > int                            MeshOctree< Real , Primal >::ElementRestrictionOperator::NonZeroUpSampleValueCount[Primal?8:27][8];

// Constructor.  The first call populates the static up-sampling tables; later calls find
// _upSampleValuesSet already set and do nothing.  The whole body runs inside an OpenMP critical section.
// The compact non-zero lists are allocated with new[] and are not released anywhere in this function.
template< class Real , bool Primal >
MeshOctree< Real , Primal >::ElementRestrictionOperator::ElementRestrictionOperator( void )
{
#pragma omp critical
	{
		if( !_upSampleValuesSet )
		{
			const int Res   = Primal ? 2 : 3;	// element functions along one axis
			const int Count = Primal ? 8 : 27;	// element functions per element

			// One-dimensional weights, indexed as [coarse function][corner coordinate][fine function]
			Real primal1D[2][2][2] =
			{
				{ { 1.0 , 0.5 } , { 0.5 , 0.0 } } ,
				{ { 0.0 , 0.5 } , { 0.5 , 1.0 } }
			};
			Real dual1D[3][2][3] =
			{
				{ { 0.75 , 0.25 , 0.00 } , { 0.25 , 0.00 , 0.00 } } ,
				{ { 0.25 , 0.75 , 0.75 } , { 0.75 , 0.75 , 0.25 } } ,
				{ { 0.00 , 0.00 , 0.25 } , { 0.00 , 0.25 , 0.75 } }
			};

			// The 3D weights are products of the 1D weights along x, y and z
			int coarse = 0;
			for( int px=0 ; px<Res ; px++ ) for( int py=0 ; py<Res ; py++ ) for( int pz=0 ; pz<Res ; pz++ , coarse++ )
				for( int corner=0 ; corner<8 ; corner++ )
				{
					int cx , cy , cz;
					Cube::FactorCornerIndex( corner , cx , cy , cz );
					int fine = 0;
					for( int fx=0 ; fx<Res ; fx++ ) for( int fy=0 ; fy<Res ; fy++ ) for( int fz=0 ; fz<Res ; fz++ , fine++ )
					{
						if( Primal ) UpSampleValues[coarse][corner][fine] = primal1D[px][cx][fx] * primal1D[py][cy][fy] * primal1D[pz][cz][fz];
						else         UpSampleValues[coarse][corner][fine] =   dual1D[px][cx][fx] *   dual1D[py][cy][fy] *   dual1D[pz][cz][fz];
					}
				}

			// Build the compact lists of non-zero weights
			for( int f=0 ; f<Count ; f++ ) for( int corner=0 ; corner<8 ; corner++ )
			{
				const Real* weights = UpSampleValues[f][corner];

				int nonZero = 0;
				for( int k=0 ; k<Count ; k++ ) if( weights[k]!=0 ) nonZero++;

				std::pair< int , Real >* compact = new std::pair< int , Real >[ nonZero ];
				int next = 0;
				for( int k=0 ; k<Count ; k++ ) if( weights[k]!=0 ) compact[ next++ ] = std::pair< int , Real >( k , weights[k] );

				NonZeroUpSampleValues[f][corner] = compact;
				NonZeroUpSampleValueCount[f][corner] = next;
			}
		}
		_upSampleValuesSet = true;
	}
}
// Destructor: releases every per-parent child list and empties both bookkeeping containers.
template< class Real , bool Primal >
MeshOctree< Real , Primal >::ElementRestrictionOperator::~ElementRestrictionOperator( void )
{
	for( size_t parent=0 ; parent<childInfo.size() ; parent++ )
	{
		// delete[] on a null pointer is a no-op, so no explicit test is required
		delete[] childInfo[parent];
	}
	childInfo.clear();
	childCount.clear();
}

// Zeroes every entry by multiplying it with Real( 0 ), so C only has to support scaling by a Real.
template< class Real , bool Primal >
template< class C >
void MeshOctree< Real , Primal >::VectorIntegrals< C >::clear( void )
{
	const int Count = Primal ? 8 : 27;
	for( int k=0 ; k<Count ; k++ ) values[k] *= Real( 0. );
}
// Adds s * integrals to this vector, entry by entry, and returns *this.
template< class Real , bool Primal >
template< class C >
typename MeshOctree< Real , Primal >::VectorIntegrals< C >& MeshOctree< Real , Primal >::VectorIntegrals< C >::addScaled( const typename MeshOctree< Real , Primal >::VectorIntegrals< C >& integrals , Real s )
{
	const int Count = Primal ? 8 : 27;
	for( int k=0 ; k<Count ; k++ )
	{
		values[k] += integrals.values[k] * s;
	}
	return *this;
}
// Overwrites this vector with s * integrals, entry by entry, and returns *this.
template< class Real , bool Primal >
template< class C >
typename MeshOctree< Real , Primal >::VectorIntegrals< C >& MeshOctree< Real , Primal >::VectorIntegrals< C >::setScaled( const typename MeshOctree< Real , Primal >::VectorIntegrals< C >& integrals , Real s )
{
	const int Count = Primal ? 8 : 27;
	for( int k=0 ; k<Count ; k++ )
	{
		values[k] = integrals.values[k] * s;
	}
	return *this;
}
// Sets the whole matrix to zero bytes: 8x8 entries in the primal case, 27x28 otherwise.
template< class Real , bool Primal >
void MeshOctree< Real , Primal >::MatrixIntegrals::clear( void )
{
	const size_t entries = Primal ? 8*8 : 27*28;
	memset( values , 0 , sizeof( Real ) * entries );
}
// Generic (non-SIMD) dot product of row `row` of i1 with the array i2.
template< class Real , bool Primal >
Real MeshOctree< Real , Primal >::MatrixIntegrals::RowDot( const typename MeshOctree< Real , Primal >::MatrixIntegrals& i1 , const Real* i2 , int row )
{
	const int Count = Primal ? 8 : 27;
	Real sum = Real( 0. );
	for( int k=0 ; k<Count ; k++ )
	{
		sum += i1[row][k] * i2[k];
	}
	return sum;
}
// SSE version for single-precision primal elements: a row of 8 floats is processed as two 4-wide vectors.
// i1[row] and i2 are dereferenced as __m128 and therefore have to be 16-byte aligned.
template< >
float MeshOctree< float , true >::MatrixIntegrals::RowDot( const typename MeshOctree< float , true >::MatrixIntegrals& i1 , const float* i2 , int row )
{
	const __m128* lhs = (const __m128*)i1[row];
	const __m128* rhs = (const __m128*)i2;
	__m128 acc = _mm_setzero_ps();
	for( int v=0 ; v<2 ; v++ )
	{
		const __m128 product = _mm_mul_ps( lhs[v] , rhs[v] );
		acc = _mm_add_ps( acc , product );
	}
	// Horizontal sum of the four lanes
	const float* lanes = (const float*)(&acc);
	return lanes[0] + lanes[1] + lanes[2] + lanes[3];
}
// SSE2 version for double-precision primal elements: a row of 8 doubles is processed as four 2-wide vectors.
// i1[row] and i2 are dereferenced as __m128d and therefore have to be 16-byte aligned.
template< >
double MeshOctree< double , true >::MatrixIntegrals::RowDot( const typename MeshOctree< double , true >::MatrixIntegrals& i1 , const double* i2 , int row )
{
	const __m128d* lhs = (const __m128d*)i1[row];
	const __m128d* rhs = (const __m128d*)i2;
	__m128d acc = _mm_setzero_pd();
	for( int v=0 ; v<4 ; v++ )
	{
		const __m128d product = _mm_mul_pd( lhs[v] , rhs[v] );
		acc = _mm_add_pd( acc , product );
	}
	// Horizontal sum of the two lanes
	const double* lanes = (const double*)(&acc);
	return lanes[0] + lanes[1];
}
// SSE version for single-precision non-primal elements: seven 4-wide vectors, i.e. 28 floats per operand
// (one more than the 27 element functions, so both operands must provide the extra slot).
// i1[row] and i2 are dereferenced as __m128 and therefore have to be 16-byte aligned.
template< >
float MeshOctree< float , false >::MatrixIntegrals::RowDot( const typename MeshOctree< float , false >::MatrixIntegrals& i1 , const float* i2 , int row )
{
	const __m128* lhs = (const __m128*)i1[row];
	const __m128* rhs = (const __m128*)i2;
	__m128 acc = _mm_setzero_ps();
	for( int v=0 ; v<7 ; v++ )
	{
		const __m128 product = _mm_mul_ps( lhs[v] , rhs[v] );
		acc = _mm_add_ps( acc , product );
	}
	// Horizontal sum of the four lanes
	const float* lanes = (const float*)(&acc);
	return lanes[0] + lanes[1] + lanes[2] + lanes[3];
}
// SSE2 version for double-precision non-primal elements: fourteen 2-wide vectors, i.e. 28 doubles per operand
// (one more than the 27 element functions, so both operands must provide the extra slot).
// i1[row] and i2 are dereferenced as __m128d and therefore have to be 16-byte aligned.
template< >
double MeshOctree< double , false >::MatrixIntegrals::RowDot( const typename MeshOctree< double , false >::MatrixIntegrals& i1 , const double* i2 , int row )
{
	const __m128d* lhs = (const __m128d*)i1[row];
	const __m128d* rhs = (const __m128d*)i2;
	__m128d acc = _mm_setzero_pd();
	for( int v=0 ; v<14 ; v++ )
	{
		const __m128d product = _mm_mul_pd( lhs[v] , rhs[v] );
		acc = _mm_add_pd( acc , product );
	}
	// Horizontal sum of the two lanes
	const double* lanes = (const double*)(&acc);
	return lanes[0] + lanes[1];
}

// Generic row update: row outRow of this matrix += s * row inRow of integrals.  Returns *this.
template< class Real , bool Primal >
typename MeshOctree< Real , Primal >::MatrixIntegrals& MeshOctree< Real , Primal >::MatrixIntegrals::addScaled( const typename MeshOctree< Real , Primal >::MatrixIntegrals& integrals , Real s , int outRow , int inRow )
{
	const int Count = Primal ? 8 : 27;
	for( int col=0 ; col<Count ; col++ )
	{
		values[outRow][col] += integrals.values[inRow][col] * s;
	}
	return *this;
}
// Generic full-matrix update: this += s * integrals.  Returns *this.
template< class Real , bool Primal >
typename MeshOctree< Real , Primal >::MatrixIntegrals& MeshOctree< Real , Primal >::MatrixIntegrals::addScaled( const typename MeshOctree< Real , Primal >::MatrixIntegrals& integrals , Real s )
{
	const int Count = Primal ? 8 : 27;
	for( int row=0 ; row<Count ; row++ )
		for( int col=0 ; col<Count ; col++ )
			values[row][col] += integrals.values[row][col] * s;
	return *this;
}
// Generic full-matrix assignment: this = s * integrals.  Returns *this.
template< class Real , bool Primal >
typename MeshOctree< Real , Primal >::MatrixIntegrals& MeshOctree< Real , Primal >::MatrixIntegrals::setScaled( const typename MeshOctree< Real , Primal >::MatrixIntegrals& integrals , Real s )
{
	const int Count = Primal ? 8 : 27;
	for( int row=0 ; row<Count ; row++ )
		for( int col=0 ; col<Count ; col++ )
			values[row][col] = integrals.values[row][col] * s;
	return *this;
}
// SSE row update for single-precision primal elements (two 4-wide vectors per row):
// row outRow of this matrix += s * row inRow of integrals.  Rows must be 16-byte aligned.
template< >
typename MeshOctree< float , true >::MatrixIntegrals& MeshOctree< float , true >::MatrixIntegrals::addScaled( const typename MeshOctree< float , true >::MatrixIntegrals& integrals , float s , int outRow , int inRow )
{
	const __m128 scale = _mm_set1_ps( s );
	__m128* dst = (__m128*)(*this)[outRow];
	const __m128* src = (const __m128*)integrals[inRow];
	for( int v=0 ; v<2 ; v++ )
	{
		dst[v] = _mm_add_ps( dst[v] , _mm_mul_ps( src[v] , scale ) );
	}
	return *this;
}
// SSE full-matrix update for single-precision primal elements: this += s * integrals.
// The 8x8 floats are treated as 16 contiguous 4-wide vectors; storage must be 16-byte aligned.
template< >
typename MeshOctree< float , true >::MatrixIntegrals& MeshOctree< float , true >::MatrixIntegrals::addScaled( const typename MeshOctree< float , true >::MatrixIntegrals& integrals , float s )
{
	const __m128 scale = _mm_set1_ps( s );
	__m128* dst = (__m128*)values;
	const __m128* src = (const __m128*)integrals.values;
	for( int v=0 ; v<16 ; v++ )
	{
		dst[v] = _mm_add_ps( dst[v] , _mm_mul_ps( src[v] , scale ) );
	}
	return *this;
}
// SSE full-matrix assignment for single-precision primal elements: this = s * integrals.
// The 8x8 floats are treated as 16 contiguous 4-wide vectors; storage must be 16-byte aligned.
template< >
typename MeshOctree< float , true >::MatrixIntegrals& MeshOctree< float , true >::MatrixIntegrals::setScaled( const typename MeshOctree< float , true >::MatrixIntegrals& integrals , float s )
{
	const __m128 scale = _mm_set1_ps( s );
	__m128* dst = (__m128*)values;
	const __m128* src = (const __m128*)integrals.values;
	for( int v=0 ; v<16 ; v++ )
	{
		dst[v] = _mm_mul_ps( src[v] , scale );
	}
	return *this;
}
// SSE2 row update for double-precision primal elements (four 2-wide vectors per row):
// row outRow of this matrix += s * row inRow of integrals.  Rows must be 16-byte aligned.
template< >
typename MeshOctree< double , true >::MatrixIntegrals& MeshOctree< double , true >::MatrixIntegrals::addScaled( const typename MeshOctree< double , true >::MatrixIntegrals& integrals , double s , int outRow , int inRow )
{
	const __m128d scale = _mm_set1_pd( s );
	__m128d* dst = (__m128d*)(*this)[outRow];
	const __m128d* src = (const __m128d*)integrals[inRow];
	for( int v=0 ; v<4 ; v++ )
	{
		dst[v] = _mm_add_pd( dst[v] , _mm_mul_pd( src[v] , scale ) );
	}
	return *this;
}
// SSE2 full-matrix update for double-precision primal elements: this += s * integrals.
// The 8x8 doubles are treated as 32 contiguous 2-wide vectors; storage must be 16-byte aligned.
template< >
typename MeshOctree< double , true >::MatrixIntegrals& MeshOctree< double , true >::MatrixIntegrals::addScaled( const typename MeshOctree< double , true >::MatrixIntegrals& integrals , double s )
{
	const __m128d scale = _mm_set1_pd( s );
	__m128d* dst = (__m128d*)values;
	const __m128d* src = (const __m128d*)integrals.values;
	for( int v=0 ; v<32 ; v++ )
	{
		dst[v] = _mm_add_pd( dst[v] , _mm_mul_pd( src[v] , scale ) );
	}
	return *this;
}
// SSE2 full-matrix assignment for double-precision primal elements: this = s * integrals.
// The 8x8 doubles are treated as 32 contiguous 2-wide vectors; storage must be 16-byte aligned.
template< >
typename MeshOctree< double , true >::MatrixIntegrals& MeshOctree< double , true >::MatrixIntegrals::setScaled( const typename MeshOctree< double , true >::MatrixIntegrals& integrals , double s )
{
	const __m128d scale = _mm_set1_pd( s );
	__m128d* dst = (__m128d*)values;
	const __m128d* src = (const __m128d*)integrals.values;
	for( int v=0 ; v<32 ; v++ )
	{
		dst[v] = _mm_mul_pd( src[v] , scale );
	}
	return *this;
}
// SSE row update for single-precision non-primal elements (seven 4-wide vectors = 28 floats per row):
// row outRow of this matrix += s * row inRow of integrals.  Rows must be 16-byte aligned.
template< >
typename MeshOctree< float , false >::MatrixIntegrals& MeshOctree< float , false >::MatrixIntegrals::addScaled( const typename MeshOctree< float , false >::MatrixIntegrals& integrals , float s , int outRow , int inRow )
{
	const __m128 scale = _mm_set1_ps( s );
	__m128* dst = (__m128*)(*this)[outRow];
	const __m128* src = (const __m128*)integrals[inRow];
	for( int v=0 ; v<7 ; v++ )
	{
		dst[v] = _mm_add_ps( dst[v] , _mm_mul_ps( src[v] , scale ) );
	}
	return *this;
}
// SSE full-matrix update for single-precision non-primal elements: this += s * integrals.
// The 27x28 floats are treated as 7*27 contiguous 4-wide vectors; storage must be 16-byte aligned.
template< >
typename MeshOctree< float , false >::MatrixIntegrals& MeshOctree< float , false >::MatrixIntegrals::addScaled( const typename MeshOctree< float , false >::MatrixIntegrals& integrals , float s )
{
	const __m128 scale = _mm_set1_ps( s );
	__m128* dst = (__m128*)values;
	const __m128* src = (const __m128*)integrals.values;
	for( int v=0 ; v<7*27 ; v++ )
	{
		dst[v] = _mm_add_ps( dst[v] , _mm_mul_ps( src[v] , scale ) );
	}
	return *this;
}
// SSE full-matrix assignment for single-precision non-primal elements: this = s * integrals.
// The 27x28 floats are treated as 7*27 contiguous 4-wide vectors; storage must be 16-byte aligned.
template< >
typename MeshOctree< float , false >::MatrixIntegrals& MeshOctree< float , false >::MatrixIntegrals::setScaled( const typename MeshOctree< float , false >::MatrixIntegrals& integrals , float s )
{
	const __m128 scale = _mm_set1_ps( s );
	__m128* dst = (__m128*)values;
	const __m128* src = (const __m128*)integrals.values;
	for( int v=0 ; v<7*27 ; v++ )
	{
		dst[v] = _mm_mul_ps( src[v] , scale );
	}
	return *this;
}
// SSE2 row update for double-precision non-primal elements (fourteen 2-wide vectors = 28 doubles per row):
// row outRow of this matrix += s * row inRow of integrals.  Rows must be 16-byte aligned.
template< >
typename MeshOctree< double , false >::MatrixIntegrals& MeshOctree< double , false >::MatrixIntegrals::addScaled( const typename MeshOctree< double , false >::MatrixIntegrals& integrals , double s , int outRow , int inRow )
{
	const __m128d scale = _mm_set1_pd( s );
	__m128d* dst = (__m128d*)(*this)[outRow];
	const __m128d* src = (const __m128d*)integrals[inRow];
	for( int v=0 ; v<14 ; v++ )
	{
		dst[v] = _mm_add_pd( dst[v] , _mm_mul_pd( src[v] , scale ) );
	}
	return *this;
}
// SSE2 full-matrix update for double-precision non-primal elements: this += s * integrals.
// The 27x28 doubles are treated as 14*27 contiguous 2-wide vectors; storage must be 16-byte aligned.
template< >
typename MeshOctree< double , false >::MatrixIntegrals& MeshOctree< double , false >::MatrixIntegrals::addScaled( const typename MeshOctree< double , false >::MatrixIntegrals& integrals , double s )
{
	const __m128d scale = _mm_set1_pd( s );
	__m128d* dst = (__m128d*)values;
	const __m128d* src = (const __m128d*)integrals.values;
	for( int v=0 ; v<14*27 ; v++ )
	{
		dst[v] = _mm_add_pd( dst[v] , _mm_mul_pd( src[v] , scale ) );
	}
	return *this;
}
// SSE2 full-matrix assignment for double-precision non-primal elements: this = s * integrals.
// The 27x28 doubles are treated as 14*27 contiguous 2-wide vectors; storage must be 16-byte aligned.
template< >
typename MeshOctree< double , false >::MatrixIntegrals& MeshOctree< double , false >::MatrixIntegrals::setScaled( const typename MeshOctree< double , false >::MatrixIntegrals& integrals , double s )
{
	const __m128d scale = _mm_set1_pd( s );
	__m128d* dst = (__m128d*)values;
	const __m128d* src = (const __m128d*)integrals.values;
	for( int v=0 ; v<14*27 ; v++ )
	{
		dst[v] = _mm_mul_pd( src[v] , scale );
	}
	return *this;
}

// Restricts per-element matrices from the fine level ( high ) to the coarse level ( low ).
// For every coarse element the contribution of each child is U * H * U^t, where H is the child's
// matrix and U = UpSampleValues[.][corner][.].  Only the lower triangle is accumulated; the upper
// triangle is filled in by symmetry at the end.
//   threads: number of OpenMP threads; the coarse elements are split into `threads` contiguous ranges.
//   resize:  if true, low is first resized to the number of coarse elements.
template< class Real , bool Primal >
void MeshOctree< Real , Primal >::ElementRestrictionOperator::downSample( const ElementMatrix& high , ElementMatrix& low , int threads , bool resize )
{
	if( resize ) low.resize( childCount.size() );

#pragma omp parallel for num_threads( threads )
	for( int t=0 ; t<threads ; t++ )
	{
		for( int parent=(low.size()*t) / threads ; parent<(low.size()*(t+1)) / threads ; parent++ )
		{
			memset( low[parent].values , 0 , sizeof( MatrixIntegrals ) );

			for( int n=0 ; n<childCount[parent] ; n++ )
			{
				const int child  = childInfo[parent][n].first;
				const int corner = childInfo[parent][n].second;

				// First product: rows of the result are weighted sums of the rows of the child matrix
				MatrixIntegrals halfProduct;
				memset( halfProduct.values , 0 , sizeof( MatrixIntegrals ) );
				for( int row=0 ; row<(Primal?8:27) ; row++ )
				{
					const std::pair< int , Real >* nonZero = NonZeroUpSampleValues[row][corner];
					const int nonZeroCount = NonZeroUpSampleValueCount[row][corner];
					for( int k=0 ; k<nonZeroCount ; k++ )
						halfProduct.addScaled( high[child] , nonZero[k].second , row , nonZero[k].first );
				}

				// Second product, lower triangle only
				for( int row=0 ; row<(Primal?8:27) ; row++ )
					for( int col=0 ; col<=row ; col++ )
						low[parent][row][col] += MatrixIntegrals::RowDot( halfProduct , &UpSampleValues[row][corner][0] , col );
			}

			// Mirror the lower triangle into the upper one
			for( int row=0 ; row<(Primal?8:27) ; row++ )
				for( int col=0 ; col<=row ; col++ )
					low[parent][col][row] = low[parent][row][col];
		}
	}
}
// Restricts per-element vectors from the fine level ( high ) to the coarse level ( low ):
//     low[parent][j] = sum over children ( idx , c ) and fine functions k of UpSampleValues[j][c][k] * high[idx][k]
// The previous body was a copy of the ElementMatrix overload (it referred to an undeclared
// `IntegralValues`, passed a vector element to MatrixIntegrals::addScaled and double-indexed a
// vector entry), so it could not be instantiated for any ElementVector.
// NOTE(review): the formula above is the vector counterpart of the matrix restriction U * H * U^t
// implemented by the other overload -- confirm this is the intended operator before relying on it.
//   threads: number of OpenMP threads; the coarse elements are split into `threads` contiguous ranges.
//   resize:  if true, low is first resized to the number of coarse elements.
template< class Real , bool Primal >
template< class C >
void MeshOctree< Real , Primal >::ElementRestrictionOperator::downSample( const ElementVector< C >& high , ElementVector< C >& low , int threads , bool resize )
{
	if( resize ) low.resize( childCount.size() );

#pragma omp parallel for num_threads( threads )
	for( int t=0 ; t<threads ; t++ )
		for( int l=(low.size()*t) / threads ; l<(low.size()*(t+1)) / threads ; l++ )
		{
			// Clear the output by scaling with zero, as done for element vectors elsewhere in this file
			for( int j=0 ; j<(Primal?8:27) ; j++ ) low[l][j] *= 0;

			for( int i=0 ; i<childCount[l] ; i++ )
			{
				int idx = childInfo[l][i].first , c = childInfo[l][i].second;

				// Accumulate the weighted child values, visiting the non-zero weights only
				for( int j=0 ; j<(Primal?8:27) ; j++ )
				{
					int count = NonZeroUpSampleValueCount[j][c];
					const std::pair< int , Real >* values = NonZeroUpSampleValues[j][c];
					for( int jj=0 ; jj<count ; jj++ ) low[l][j] += high[idx][ values[jj].first ] * values[jj].second;
				}
			}
		}
}
// Builds the basis-level down-sampling matrix high2LowB = low * high2LowE * highT, where highT is
// the transpose of `high` with every entry divided by the squared norm of the row of `high` it came from.
//   high , low: element-to-basis matrices of the fine and the coarse level.
//   high2LowE:  element restriction operator between the two levels.
//   high2LowB:  output.
//   threads:    number of OpenMP threads.
// The squared row norms are accumulated as Real.  They used to be stored in a std::vector< int >,
// which truncated the running sum after every addition: fractional weights collapsed to zero and
// the subsequent division produced infinities.
template< class Real , bool Primal >
void MeshOctree< Real , Primal >::BasisDownSampleMatrix( const SparseMatrix< Real , int >& high , const typename MeshOctree< Real , Primal >::ElementRestrictionOperator& high2LowE , const SparseMatrix< Real , int >& low , SparseMatrix< Real , int >& high2LowB , int threads )
{
	SparseMatrix< Real , int > temp , highT;
	std::vector< Real > sums( high.groups );
	SparseMatrix< Real , int >::Transpose( high , highT );

	// Squared norm of every row of the fine element-to-basis matrix
#pragma omp parallel for num_threads( threads )
	for( int i=0 ; i<high.groups ; i++ )
	{
		Real sum = Real( 0. );
		for( int j=0 ; j<high.groupSizes[i] ; j++ ) sum += high[i][j].Value * high[i][j].Value;
		sums[i] = sum;
	}

	// Normalize the transpose: the column index N of an entry of highT is the row of `high` it came from
#pragma omp parallel for num_threads( threads )
	for( int i=0 ; i<highT.groups ; i++ )
		for( int j=0 ; j<highT.groupSizes[i] ; j++ ) highT[i][j].Value /= sums[ highT[i][j].N ];

	SparseMatrix< Real , int >::Multiply( high2LowE , highT , temp , threads );
	SparseMatrix< Real , int >::Multiply( low , temp , high2LowB , threads );
}

// Derives the lookup tables used to assemble basis matrices from element matrices:
//   _elementNum:            number of elements, from the largest element-function index in e2b.
//   _e2b[f]:                ( basis function , weight ) owning element function f, or first==-1 if none.
//   _rowSize[b]:            number of distinct basis functions sharing an element with basis function b
//                           (b itself included; it always gets the last position).
//   _offset[e][o][k]:       position, within the row of the basis function owning element function ( e , o ),
//                           of the basis function owning element function ( e , k ); -1 if there is none.
//   threads: number of OpenMP threads used for the per-basis-function pass.
template< class Real , bool Primal >
template< bool UnitEntries >
void MeshOctree< Real , Primal >::ElementToBasisOperator< UnitEntries >::init( int threads )
{
	const int PerElement = Primal ? 8 : 27;

	// Largest element-function index referenced by any basis function
	int maxIndex = 0;
	for( int b=0 ; b<e2b.groups ; b++ )
		for( int j=0 ; j<e2b.groupSizes[b] ; j++ )
			if( e2b[b][j].N>maxIndex ) maxIndex = e2b[b][j].N;
	_elementNum = maxIndex/PerElement + 1;
	const int functionNum = _elementNum * PerElement;

	// Inverse map: element function -> ( basis function , weight )
	_e2b.resize( functionNum );
	for( int f=0 ; f<functionNum ; f++ ) _e2b[f].first = -1;
	for( int b=0 ; b<e2b.groups ; b++ )
		for( int j=0 ; j<e2b.groupSizes[b] ; j++ )
			_e2b[ e2b[b][j].N ] = std::pair< int , Real >( b , e2b[b][j].Value );

	_offset.resize( functionNum / PerElement );
	_rowSize.resize( e2b.groups );
#pragma omp parallel for num_threads( threads )
	for( int b=0 ; b<e2b.groups ; b++ )
	{
		// Hand out a unique row position to every other basis function sharing an element with b
		stdext::hash_map< int , int > position;
		_rowSize[b] = 0;
		for( int j=0 ; j<e2b.groupSizes[b] ; j++ )
		{
			const int first = ( e2b[b][j].N / PerElement ) * PerElement;
			for( int k=0 ; k<PerElement ; k++ )
			{
				const int other = _e2b[ first + k ].first;
				if( other==-1 || other==b ) continue;
				if( position.find( other )==position.end() ) position[other] = _rowSize[b]++;
			}
		}
		// The basis function itself comes last
		position[b] = _rowSize[b]++;

		// Record the positions in the offset table
		for( int j=0 ; j<e2b.groupSizes[b] ; j++ )
		{
			const int element = e2b[b][j].N / PerElement;
			const int local   = e2b[b][j].N % PerElement;
			const int first   = element * PerElement;
			for( int k=0 ; k<PerElement ; k++ )
			{
				const int other = _e2b[ first + k ].first;
				if( other==-1 ) _offset[element][local][k] = -1;
				else            _offset[element][local][k] = position[ other ];
			}
		}
	}
}
// Lays out the sparsity pattern of a basis matrix: one row per basis function, _rowSize[i] entries
// per row, with the column index of every entry taken from the offset table built by init().
// Entry values are not written here.  The matrix is made contiguous at the end.
template< class Real , bool Primal >
template< bool UnitEntries >
void MeshOctree< Real , Primal >::ElementToBasisOperator< UnitEntries >::resizeBasisMatrix( SparseMatrix< Real , int >& matrix ) const
{
	const int PerElement = Primal ? 8 : 27;

	matrix.Resize( e2b.groups );
	for( int row=0 ; row<e2b.groups ; row++ )
	{
		matrix.SetGroupSize( row , _rowSize[row] );
		for( int j=0 ; j<e2b.groupSizes[row] ; j++ )
		{
			const int element = e2b[row][j].N / PerElement;
			const int local   = e2b[row][j].N % PerElement;
			const int first   = element * PerElement;
			for( int k=0 ; k<PerElement ; k++ )
			{
				const int slot = _offset[element][local][k];
				if( slot==-1 ) continue;
				matrix[row][slot].N = _e2b[ first + k ].first;
			}
		}
	}
	matrix.MakeContiguous();
}
// Assembles the basis matrix from the per-element matrices.
// Every row is cleared ( _rowSize[i] entries ) and then receives, for each element function owned by
// basis function i, the matching row of the element matrix scattered through the offset table.
// With UnitEntries the values are added unweighted; otherwise each one is multiplied by the weight
// of the column's element function and by the weight of the row's element function.
//   threads: number of OpenMP threads; the rows are split into `threads` contiguous ranges.
//   resize:  if true, the sparsity pattern is (re)built first via resizeBasisMatrix.
template< class Real , bool Primal >
template< bool UnitEntries >
void MeshOctree< Real , Primal >::ElementToBasisOperator< UnitEntries >::elementToBasisMatrix( const ElementMatrix& eMatrix , SparseMatrix< Real , int >& matrix , int threads , bool resize ) const
{
	const int PerElement = Primal ? 8 : 27;

	if( resize ) resizeBasisMatrix( matrix );

#pragma omp parallel for num_threads( threads )
	for( int t=0 ; t<threads ; t++ )
	{
		for( int r=(matrix.Rows()*t) / threads ; r<(matrix.Rows()*(t+1)) / threads ; r++ )
		{
			Pointer( MatrixEntry< Real , int > ) entries = matrix[r];
			for( int e=0 ; e<_rowSize[r] ; e++ ) entries[e].Value = Real( 0. );

			for( int j=0 ; j<e2b.groupSizes[r] ; j++ )
			{
				const int element = e2b[r][j].N / PerElement;
				const int local   = e2b[r][j].N % PerElement;
				const Real* source = eMatrix[element][local];
				const int* slots = _offset[element][local];
				if( UnitEntries )
				{
					for( int k=0 ; k<PerElement ; k++ ) if( slots[k]!=-1 ) entries[ slots[k] ].Value += source[k];
				}
				else
				{
					const int first = element * PerElement;
					const Real rowWeight = e2b[r][j].Value;
					for( int k=0 ; k<PerElement ; k++ ) if( slots[k]!=-1 ) entries[ slots[k] ].Value += source[k] * _e2b[ first+k ].second * rowWeight;
				}
			}
		}
	}
}
// Distributes basis coefficients to the element functions:
//     out[ element ][ local ] = in[ owning basis function ]   ( times the stored weight unless UnitEntries )
// Element functions that have no owning basis function ( _e2b[i].first==-1 , as set up by init() )
// keep the zero written by the clearing pass.  Without the test below such slots were used to read
// in[-1], i.e. out of bounds.
//   threads: number of OpenMP threads.
//   resize:  if true, out is first resized to the number of elements.
template< class Real , bool Primal >
template< bool UnitEntries >
template< class C >
void MeshOctree< Real , Primal >::ElementToBasisOperator< UnitEntries >::MultiplyTranspose( const Vector< C >& in , ElementVector< C >& out , int threads , bool resize ) const
{
	if( resize ) out.resize( _elementNum );
#pragma omp parallel for num_threads( threads )
	for( int i=0 ; i<_elementNum ; i++ )
		for( int j=0 ; j<(Primal?8:27) ; j++ ) out[i][j] *=0;
#pragma omp parallel for num_threads( threads )
	for( int i=0 ; i<_e2b.size() ; i++ )
	{
		const int basis = _e2b[i].first;
		// Skip element functions that are not attached to any basis function
		if( basis==-1 ) continue;
		if( UnitEntries ) out[ i / (Primal?8:27) ][ i % (Primal?8:27) ] = in[ basis ];
		else              out[ i / (Primal?8:27) ][ i % (Primal?8:27) ] = in[ basis ] * _e2b[i].second;
	}
}
// Gathers element values into basis coefficients: out[i] is the sum, over the element functions
// owned by basis function i, of the element value (times the stored weight unless UnitEntries).
// Each output is first cleared by scaling it with zero.
//   threads: number of OpenMP threads.
//   resize:  if true, out is first resized to the number of basis functions.
template< class Real , bool Primal >
template< bool UnitEntries >
template< class C >
void MeshOctree< Real , Primal >::ElementToBasisOperator< UnitEntries >::Multiply( const ElementVector< C >& in , Vector< C >& out , int threads , bool resize ) const
{
	const int PerElement = Primal ? 8 : 27;

	if( resize ) out.Resize( e2b.groups );
#pragma omp parallel for num_threads( threads )
	for( int b=0 ; b<e2b.groups ; b++ )
	{
		out[b] *= 0;
		for( int j=0 ; j<e2b.groupSizes[b] ; j++ )
		{
			const int element = e2b[b][j].N / PerElement;
			const int local   = e2b[b][j].N % PerElement;
			if( UnitEntries ) out[b] += in[element][local];
			else              out[b] += in[element][local] * e2b[b][j].Value;
		}
	}
}
// Multiplies every per-element matrix with the matching per-element vector.
//   multiplyFlag selects what happens to the product:
//     MULTIPLY_CLEAR    -> out  = product
//     MULTIPLY_ADD      -> out += product
//     MULTIPLY_SUBTRACT -> out -= product
//   Any other flag value leaves out untouched.
// The accumulator of type C is zeroed with memset, so C is treated as a plain bag of bytes here.
template< class Real , bool Primal >
template< class C >
void MeshOctree< Real , Primal >::ElementMatrix::MultiplyParallel( const ElementVector< C >& in , ElementVector< C >& out , int threads , int multiplyFlag ) const
{
	const int PerElement = Primal ? 8 : 27;

#pragma omp parallel for num_threads( threads )
	for( int e=0 ; e<size() ; e++ )
	{
		const C* source = &in[e][0];
		for( int r=0 ; r<PerElement ; r++ )
		{
			C product;
			memset( &product , 0 , sizeof(C) );
			const Real* coefficients = (*this)[e][r];
			for( int k=0 ; k<PerElement ; k++ ) product += source[k] * coefficients[k];

			if     ( multiplyFlag==MULTIPLY_CLEAR    ) out[e][r]  = product;
			else if( multiplyFlag==MULTIPLY_ADD      ) out[e][r] += product;
			else if( multiplyFlag==MULTIPLY_SUBTRACT ) out[e][r] -= product;
		}
	}
}
// Stores the per-axis function values and derivatives of a sample.
// Only the first (Primal?2:3)-1 functions per axis are kept; getSubValues reconstructs the last one.
// Derivatives are stored pre-divided by the function value (zero where the value is zero).
//   sValues , sDValues: per-axis values and derivatives, indexed as [function][axis].
//   preDivided:         true if sDValues has already been divided by sValues, in which case it is stored as is.
// Previously nothing at all was written to subDValues when preDivided was true, leaving the
// stored derivatives at whatever they held before the call.
template< class Real , bool Primal >
void MeshOctree< Real , Primal >::MatrixIntegrationSample::setSubValues( const double sValues[Primal?2:3][3] , const  double sDValues[Primal?2:3][3] , bool preDivided )

{
	for( int i=0 ; i<(Primal?2:3)-1 ; i++ ) for( int c=0 ; c<3 ; c++ )
	{
		subValues[i][c] = Real( sValues[i][c] );
		if( preDivided ) subDValues[i][c] = Real( sDValues[i][c] );
		else
		{
			// Pre-divide the derivative values so that gradients are faster to compute.
			if( subValues[i][c]!=0 ) subDValues[i][c] = sDValues[i][c]/subValues[i][c];
			else                     subDValues[i][c] = Real( 0. );
		}
	}
}
// Recovers the per-axis function values and derivatives of a sample.
// The last function along each axis is not stored; it is rebuilt from the others:
//   its value is 1 minus the sum of the stored values, and
//   its (undivided) derivative is minus the sum of the stored (undivided) derivatives.
//   preDivided: if true, all returned derivatives are divided by the matching value (zero where the
//               value is zero); if false, all returned derivatives are undivided.
template< class Real , bool Primal >
void MeshOctree< Real , Primal >::MatrixIntegrationSample::getSubValues( double sValues[Primal?2:3][3] , double sDValues[Primal?2:3][3] , bool preDivided ) const
{
	const int last = (Primal?2:3)-1;
	for( int axis=0 ; axis<3 ; axis++ )
	{
		double& lastValue = sValues [last][axis];
		double& lastDeriv = sDValues[last][axis];
		lastValue = 1.;
		lastDeriv = 0.;
		for( int f=0 ; f<last ; f++ )
		{
			sValues[f][axis] = double( subValues[f][axis] );
			lastValue -= sValues[f][axis];
			sDValues[f][axis] = double( subDValues[f][axis] );
			// The stored derivative is pre-divided; multiply the value back in before subtracting
			lastDeriv -= sDValues[f][axis] * sValues[f][axis];
		}
		if( !preDivided )
		{
			for( int f=0 ; f<last ; f++ ) sDValues[f][axis] *= sValues[f][axis];
		}
		else if( lastValue ) lastDeriv /= lastValue;
		else                 lastDeriv = 0.;
	}
}
// Builds, for every element, a combined matrix eM and vector eV from the integration samples.
// For each pair ( x , y ) of element functions two sums over the element's samples are formed:
//     temp  = sum_s values[x][s] * values[y][s] * newWeights[s]
//     dTemp = sum_s < ab1[x][s] , ab2[y][s] >
// and then
//     eM[idx][x][y] = eM[idx][y][x] = temp*dotM + dTemp*lapM
//     eV[idx][x]   += coefficients[idx][y] * ( temp*dotV + dTemp*lapV )     (and symmetrically for y)
//   embedding:    per-element vertex positions used to evaluate the differential dF at each sample.
//   coefficients: per-element coefficients the matrix is applied to when building eV.
//   eM , eV:      outputs; both are zeroed first.
//   flowType:     FLOW_CONFORMAL_LAPLACIAN and FLOW_AUTHALIC_LAPLACIAN select alternative sample weightings;
//                 any other value takes the branch that uses the inverse of dF*dF^t scaled by sqrt(det).
//   threads:      number of OpenMP threads; the elements are split into `threads` contiguous ranges.
// NOTE(review): `degenerateSamples` is written from inside the parallel region without synchronization
// (it is only ever set to true).
// NOTE(review): a NaN determinant terminates the whole process through exit( 0 ).
template< class Real , bool Primal >
template< class Vertex >
template< class C >
void MeshOctree< Real , Primal >::TrackingData< Vertex >::setConstrainedLaplacianAndVector( const typename MeshOctree< Real , Primal >::ElementVector< Vertex >& embedding ,
																						   const typename MeshOctree< Real , Primal >::ElementVector< C >& coefficients ,
																						   typename MeshOctree< Real , Primal >::ElementMatrix&      eM , 
																						   typename MeshOctree< Real , Primal >::ElementVector< C >& eV , 
																						   Real dotM , Real lapM , Real dotV , Real lapV , int flowType , int threads ) const
{
	// If the conformal flag is enabled, we switch the flow from:
	//		dX/dt = - \Delta_t X = - 1/\sqrt{|g_t|}\sum_{ij} \partial_i\left(\sqrt{|g_t|} g^{ij}\partial_j X(0)\right)
	// to 
	//		dX/dt = - 1/\sqrt{|g_t|} \Delta_0 X
	// If the authalic flag is enabled, we switch to:
	//		dX/dt = - \sum_{ij} \partial_i\left(g^{ij}\partial_j X(0)\right)
	// Given X(p) = p, under what conditions is the map: X+F:M -> IR^3 conformal?
	// If we consider the differential of the map dX+dF:
	// => dF + dF^t = \lambda Id
	// => dF = | \lambda/2     a    |
	//         |    -a    \lambda/2 |

	// Suppose we have a domain \Omega \subset IR^2
	// Under what conditions is the map F:\Omega -> IR^3 conformal?
	// If   < \partial F / \partial x , \partial F / \partial y> =0
	// and || \partial F / \partial x ||^2 = || \partial F / \partial y ||^2
	// What is the projection of F onto the space of conformal maps?

	// Suppose we have a domain \Omega \subset IR^2
	// Under what conditions is the map F:\Omega -> IR^2 conformal?
	// If  dF_x / dx =  dF_y / dy
	// and dF_x / dy = -dF_y / dx
	// What is the projection of F onto the space of conformal maps?
	// set T to be the matrix:
	// |  dF_y/dy - dF_x/dx  -dF_y/dx - dF_x/dy | / 2
	// | -dF_y/dx - dF_x/dy   dF_x/dx - dF_y/dy |
	// Then T is the smallest matrix such that dF + T = \lambda Id.
	// So, in theory, we would like to integrate T to get a function G and then subtract off G from F?

	bool degenerateSamples = false;
	// Largest number of samples in any single element; used to size the per-thread scratch buffers
	int samples = 0;
	for( int i=0 ; i<embedding.size() ; i++ ) samples = std::max< int >( samples , sampleStart[i+1]-sampleStart[i] );
	// Clear the outputs
#pragma omp parallel for num_threads( threads )
	for( int i=0 ; i<eV.size() ; i++ ) for( int j=0 ; j<(Primal?8:27) ; j++ ) eV[i][j] *= 0;
#pragma omp parallel for num_threads( threads )
	for( int i=0 ; i<eM.size() ; i++ ) memset( eM[i][0] , 0 , sizeof( typename MeshOctree< Real , Primal >::MatrixIntegrals ) );

#pragma omp parallel for num_threads( threads )
	for( int thread=0 ; thread<threads ; thread++ )
	{
		// Per-thread scratch, indexed by element function x and sample s:
		//   values[x][s]         : value of function x at sample s
		//   ab1[x][2*s+l]        : derivative term of function x at sample s along tangent l ( l = 0 , 1 )
		//   ab2[x][2*s+l]        : ab1 after the flow-dependent transformation and weighting
		//   newWeights[s]        : flow-dependent weight of sample s
		Pointer( double ) ab1[(Primal?8:27)];
		Pointer( double ) ab2[(Primal?8:27)];
		Pointer( double ) values[(Primal?8:27)];
		Pointer( double ) newWeights;

		// 16-byte aligned because ab1 / ab2 are later read as __m128d
		newWeights = AlignedAllocPointer< double >( samples , 16 );
		for( int x=0 ; x<(Primal?8:27) ; x++ ) values[x] = AlignedAllocPointer< double >( samples , 16 );
		for( int x=0 ; x<(Primal?8:27) ; x++ ) ab1[x] = AlignedAllocPointer< double >( 2*samples , 16 ) , ab2[x] = AlignedAllocPointer< double >( 2*samples , 16 );
		for( int idx=(embedding.size()*thread)/threads ; idx<(embedding.size()*(thread+1))/threads ; idx++ )
		{
			// Range of samples belonging to this element (these locals shadow the member sampleStart)
			int sampleStart = this->sampleStart[idx  ];
			int sampleCount = this->sampleStart[idx+1] - sampleStart;

			// For every sample, compute the derivative of the parameterization function that takes
			// the initial surface to the new one.
			for( int s=0 ; s<sampleCount ; s++ )
			{
				Matrix< double , 2 , 3 > dF;
				dF *= 0;
				// NOTE(review): `parent` is never used
				int parent;
				double sValues[Primal?2:3][3] , sDValues[Primal?2:3][3];
				// Derivatives are requested pre-divided by the values, so multiplying by the full
				// product v below yields the derivative of the tensor-product function
				mSamples[sampleStart+s].getSubValues( sValues , sDValues , true );
				Point3D< double > t1 = Point3D< double >( tangents[ mSamples[sampleStart+s].index ].first  );
				Point3D< double > t2 = Point3D< double >( tangents[ mSamples[sampleStart+s].index ].second );
				// Walk over the tensor-product functions; x is the flattened ( i , j , k ) index
				for( int i=0 , x=0 ; i<(Primal?2:3) ; i++ )
				{
					double dX = sDValues[i][0];
					double _d1 = t1[0]*dX , _d2 = t2[0]*dX;
					for( int j=0 ; j<(Primal?2:3) ; j++ )
					{
						double _v = sValues[i][0] * sValues[j][1];
						double dY = sDValues[j][1];
						double d1 = _d1 + t1[1]*dY , d2 = _d2 + t2[1]*dY;
						for( int k=0 ; k<(Primal?2:3) ; k++ , x++ )
						{
							double v =  _v * sValues[k][2];
							double dZ = sDValues[k][2];
							values[x][s] = v;
							ab1[x][2*s  ] = ( d1 + t1[2]*dZ ) * v;
							ab1[x][2*s+1] = ( d2 + t2[2]*dZ ) * v;
							const Vertex& vertex = embedding[idx][x];
							// Accumulate the 2x3 differential of the embedding at this sample
							for( int l=0 ; l<2 ; l++ ) for( int c=0 ; c<3 ; c++ ) dF(l,c) += ab1[x][2*s+l]*vertex[c];
						}
					}
				}
				// D = dF * dF^t (symmetric 2x2) and its determinant
				SquareMatrix< double , 2 > D;
				D(0,0)          = dF(0,0)*dF(0,0) + dF(0,1)*dF(0,1) + dF(0,2)*dF(0,2);
				D(1,1)          = dF(1,0)*dF(1,0) + dF(1,1)*dF(1,1) + dF(1,2)*dF(1,2);
				D(0,1) = D(1,0) = dF(0,0)*dF(1,0) + dF(0,1)*dF(1,1) + dF(0,2)*dF(1,2);
				double det = D(0,0)*D(1,1)-D(0,1)*D(1,0);
				if( flowType==FLOW_CONFORMAL_LAPLACIAN )
				{
					// Uniformly scaling [0,1]x[0,1] to [0,a]x[0,b] takes
					// If_xx + If_yy -> a * b * ( If_xx / a^2 + If_yy / b^2 ) = < (If_xx , If_yy) , ( b/a , a/b ) >
					// Derivative terms keep the original sample weight; only the value term is scaled by sqrt(det).
					// NOTE(review): unlike the branch below, det is not checked for NaN or non-positive values here.
					double newWeight = mSamples[sampleStart+s].weight;
					newWeights[s] = mSamples[sampleStart+s].weight * sqrt(det);
					for( int x=0 ; x<(Primal?8:27) ; x++ )
					{
						ab2[x][2*s  ] = ab1[x][2*s  ] * newWeight;
						ab2[x][2*s+1] = ab1[x][2*s+1] * newWeight;
					}
				}
				else
				{
					// det!=det is true only for NaN
					if( det!=det )
					{
						printf( "Warning: Degenerate! (det=%f)\n" , det );
						printf( "%f %f\n%f %f\n%f %f\n" , dF(0,0) , dF(1,0) , dF(0,1) , dF(1,1) , dF(0,2) , dF(1,2) );
						exit( 0 );
					}
					else if( det<=0 )
					{
						// Degenerate sample: it contributes nothing
						degenerateSamples = true;
						D *= 0;
						det = 0;
					}
					else
					{
						// Replace D by its inverse
						double X = D(1,1)/det;
						double Y = D(0,0)/det;
						double Z =-D(0,1)/det;
						D( 0 , 0 ) = X;
						D( 1 , 0 ) = D( 0 , 1 ) = Z;
						D( 1 , 1 ) = Y;
					}
					// The authalic flow drops the sqrt(det) factor from the sample weight
					if( flowType==FLOW_AUTHALIC_LAPLACIAN ) det = 1;
					double newWeight = mSamples[sampleStart+s].weight * sqrt(det);
					newWeights[s] = newWeight;
					for( int x=0 ; x<(Primal?8:27) ; x++ )
					{
						ab2[x][2*s  ] = ( D( 0 , 0 ) * ab1[x][2*s] + D( 0 , 1 ) * ab1[x][2*s+1] ) * newWeight;
						ab2[x][2*s+1] = ( D( 1 , 0 ) * ab1[x][2*s] + D( 1 , 1 ) * ab1[x][2*s+1] ) * newWeight;
					}
				}
			}
			// Accumulate the element matrix and vector over all pairs x <= y
			for( int x=0 ; x<(Primal?8:27) ; x++ )
			{
				ConstPointer( double ) valuesX = values[x];
				// Each __m128d holds the two derivative components of one sample
				ConstPointer( __m128d ) dValuesX = ( ConstPointer( __m128d ) ) ab1[x];
				C& _v = eV[idx][x];
				const C& _x = coefficients[idx][x];
				for( int y=x ; y<(Primal?8:27) ; y++ )
				{
					ConstPointer( double ) valuesY = values[y];
					double temp = 0;
					ConstPointer( __m128d ) dValuesY = ( ConstPointer( __m128d ) ) ab2[y];
					__declspec (align(16)) __m128d _dTemp = _mm_set1_pd( 0 );
					for( int s=0 ; s<sampleCount ; s++ ) temp += valuesX[s] * valuesY[s] * newWeights[s] , _dTemp = _mm_add_pd( _dTemp , _mm_mul_pd( dValuesX[s] , dValuesY[s] ) );
					// Sum the two lanes to finish the dot products
					double dTemp = ((double*)&_dTemp)[0] + ((double*)&_dTemp)[1];
					eM[idx][y][x] = eM[idx][x][y] = temp*dotM + dTemp*lapM;
					Real __temp = Real( temp*dotV + dTemp*lapV );
					_v += coefficients[idx][y] * __temp;
					if( x!=y ) eV[idx][y] += _x * __temp;
				}
			}
		}
		// Release the per-thread scratch
		AlignedFreePointer( newWeights );
		for( int x=0 ; x<(Primal?8:27) ; x++ ) AlignedFreePointer( values[x] );
		for( int x=0 ; x<(Primal?8:27) ; x++ ) AlignedFreePointer( ab1[x] );
		for( int x=0 ; x<(Primal?8:27) ; x++ ) AlignedFreePointer( ab2[x] );
	}
	if( degenerateSamples ) fprintf( stderr , "Warning: Degenerate samples found!\n" );
}
// Builds, for every element, the local mass ("dot") and stiffness ("lap") matrices together with
// the vectors obtained by applying those matrices to the supplied per-element coefficients.
//
// Parameters:
//   embedding    - per-element vertex positions; embedding[idx][x][c] (c=0,1,2) is read to evaluate the
//                  differential of the current embedding at every sample.
//   coefficients - per-element coefficients that are multiplied against the element matrices.
//   dotM , lapM  - [out] per-element matrices; cleared and then filled symmetrically.
//   dotV , lapV  - [out] per-element vectors; cleared and then set to (element matrix) x (coefficients).
//                  dotV/lapV and dotM/lapM are assumed to have matching sizes (only dotV.size() and
//                  dotM.size() are consulted when clearing).
//   flowType     - FLOW_CONFORMAL_LAPLACIAN and FLOW_AUTHALIC_LAPLACIAN select the special cases described
//                  below; any other value takes the default path (inverse metric, weight scaled by sqrt(det)).
//   threads      - number of OpenMP threads; the element range is split into this many contiguous slices.
//
// A single warning is written to stderr if any sample had a non-positive metric determinant on the
// non-conformal path. A NaN determinant on that path prints a diagnostic to stdout and terminates the process.
template< class Real , bool Primal >
template< class Vertex >
template< class C >
void MeshOctree< Real , Primal >::TrackingData< Vertex >::setMatricesAndVectors( const typename MeshOctree< Real , Primal >::ElementVector< Vertex >& embedding ,
																				 const typename MeshOctree< Real , Primal >::ElementVector< C >& coefficients ,
																				 typename MeshOctree< Real , Primal >::ElementMatrix& dotM , 
																				 typename MeshOctree< Real , Primal >::ElementMatrix& lapM , 
																				 typename MeshOctree< Real , Primal >::ElementVector< C >& dotV , 
																				 typename MeshOctree< Real , Primal >::ElementVector< C >& lapV , 
																				 int flowType , int threads ) const
{
	// If the conformal flag is enabled, we switch the flow from:
	//		dX/dt = - \Delta_t X = - 1/\sqrt{|g_t|}\sum_{ij} \partial_i\left(\sqrt{|g_t|} g^{ij}\partial_j X(0)\right)
	// to 
	//		dX/dt = - 1/\sqrt{|g_t|} \Delta_0 X
	// If the authalic flag is enabled, we switch to:
	//		dX/dt = - \sum_{ij} \partial_i\left(g^{ij}\partial_j X(0)\right)
	// Given X(p) = p, under what conditions is the map: X+F:M -> IR^3 conformal?
	// If we consider the differential of the map dX+dF:
	// => dF + dF^t = \lambda Id
	// => dF = | \lambda/2     a    |
	//         |    -a    \lambda/2 |

	// Suppose we have a domain \Omega \subset IR^2
	// Under what conditions is the map F:\Omega -> IR^3 conformal?
	// If   < \partial F / \partial x , \partial F / \partial y> =0
	// and || \partial F / \partial x ||^2 = || \partial F / \partial y ||^2
	// What is the projection of F onto the space of conformal maps?

	// Suppose we have a domain \Omega \subset IR^2
	// Under what conditions is the map F:\Omega -> IR^2 conformal?
	// If  dF_x / dx =  dF_y / dy
	// and dF_x / dy = -dF_y / dx
	// What is the projection of F onto the space of conformal maps?
	// set T to be the matrix:
	// |  dF_y/dy - dF_x/dx  -dF_y/dx - dF_x/dy | / 2
	// | -dF_y/dx - dF_x/dy   dF_x/dx - dF_y/dy |
	// Then T is the smallest matrix such that dF + T = \lambda Id.
	// So, in theory, we would like to integrate T to get a function G and then subtract off G from F?

	bool degenerateSamples = false;

	// The per-thread scratch buffers are sized for the element with the most samples.
	int samples = 0;
	for( int i=0 ; i<embedding.size() ; i++ ) samples = std::max< int >( samples , sampleStart[i+1]-sampleStart[i] );

	// Clear the outputs.
#pragma omp parallel for num_threads( threads )
	for( int i=0 ; i<dotV.size() ; i++ ) for( int j=0 ; j<(Primal?8:27) ; j++ ) dotV[i][j] *= 0 , lapV[i][j] *= 0;
#pragma omp parallel for num_threads( threads )
	for( int i=0 ; i<dotM.size() ; i++ ) memset( dotM[i][0] , 0 , sizeof( typename MeshOctree< Real , Primal >::MatrixIntegrals ) ) , memset( lapM[i][0] , 0 , sizeof( typename MeshOctree< Real , Primal >::MatrixIntegrals ) );

	// Each iteration handles one contiguous slice of the elements with its own scratch buffers.
	// The degeneracy flag is OR-reduced so that the threads do not race on a shared bool.
#pragma omp parallel for num_threads( threads ) reduction( || : degenerateSamples )
	for( int thread=0 ; thread<threads ; thread++ )
	{
		Pointer( double ) ab1[(Primal?8:27)];		// Per basis function: the two tangential derivative values at every sample
		Pointer( double ) ab2[(Primal?8:27)];		// ab1, transformed by the (inverse) metric and scaled by the sample weight
		Pointer( double ) values[(Primal?8:27)];	// Per basis function: the function value at every sample
		Pointer( double ) newWeights;				// Per sample: the weight used for the mass integral

		newWeights = AlignedAllocPointer< double >( samples , 16 );
		for( int x=0 ; x<(Primal?8:27) ; x++ ) values[x] = AlignedAllocPointer< double >( samples , 16 );
		for( int x=0 ; x<(Primal?8:27) ; x++ ) ab1[x] = AlignedAllocPointer< double >( 2*samples , 16 ) , ab2[x] = AlignedAllocPointer< double >( 2*samples , 16 );
		for( int idx=(embedding.size()*thread)/threads ; idx<(embedding.size()*(thread+1))/threads ; idx++ )
		{
			// The samples of element idx occupy [ firstSample , firstSample+sampleCount ) in mSamples.
			int firstSample = this->sampleStart[idx  ];
			int sampleCount = this->sampleStart[idx+1] - firstSample;

			// For every sample, compute the derivative of the parameterization function that takes
			// the initial surface to the new one.
			for( int s=0 ; s<sampleCount ; s++ )
			{
				Matrix< double , 2 , 3 > dF;
				dF *= 0;
				double sValues[Primal?2:3][3] , sDValues[Primal?2:3][3];
				mSamples[firstSample+s].getSubValues( sValues , sDValues , true );
				Point3D< double > t1 = Point3D< double >( tangents[ mSamples[firstSample+s].index ].first  );
				Point3D< double > t2 = Point3D< double >( tangents[ mSamples[firstSample+s].index ].second );

				// Walk the tensor-product basis functions; x is the flattened (i,j,k) index.
				for( int i=0 , x=0 ; i<(Primal?2:3) ; i++ )
				{
					double dX = sDValues[i][0];
					double _d1 = t1[0]*dX , _d2 = t2[0]*dX;
					for( int j=0 ; j<(Primal?2:3) ; j++ )
					{
						double _v = sValues[i][0] * sValues[j][1];
						double dY = sDValues[j][1];
						double d1 = _d1 + t1[1]*dY , d2 = _d2 + t2[1]*dY;
						for( int k=0 ; k<(Primal?2:3) ; k++ , x++ )
						{
							double v =  _v * sValues[k][2];
							double dZ = sDValues[k][2];
							values[x][s] = v;
							// < tangent , ( dX , dY , dZ ) > * v for each of the two tangents
							ab1[x][2*s  ] = ( d1 + t1[2]*dZ ) * v;
							ab1[x][2*s+1] = ( d2 + t2[2]*dZ ) * v;
							// Accumulate the differential of the embedding along the two tangents
							const Vertex& vertex = embedding[idx][x];
							for( int l=0 ; l<2 ; l++ ) for( int c=0 ; c<3 ; c++ ) dF(l,c) += ab1[x][2*s+l]*vertex[c];
						}
					}
				}

				// D = dF * dF^t is the 2x2 Gram matrix of the embedding differential at this sample.
				SquareMatrix< double , 2 > D;
				D(0,0)          = dF(0,0)*dF(0,0) + dF(0,1)*dF(0,1) + dF(0,2)*dF(0,2);
				D(1,1)          = dF(1,0)*dF(1,0) + dF(1,1)*dF(1,1) + dF(1,2)*dF(1,2);
				D(0,1) = D(1,0) = dF(0,0)*dF(1,0) + dF(0,1)*dF(1,1) + dF(0,2)*dF(1,2);
				double det = D(0,0)*D(1,1)-D(0,1)*D(1,0);
				if( flowType==FLOW_CONFORMAL_LAPLACIAN )
				{
					// Uniformly scaling [0,1]x[0,1] to [0,a]x[0,b] takes
					// If_xx + If_yy -> a * b * ( If_xx / a^2 + If_yy / b^2 ) = < (If_xx , If_yy) , ( b/a , a/b ) >

					// The gradients keep the original sample weight; only the mass weight is scaled by sqrt(det).
					// NOTE(review): unlike the branch below, det is not checked here, so a negative or NaN
					// determinant would give a NaN mass weight -- confirm whether that can occur.
					double newWeight = mSamples[firstSample+s].weight;
					newWeights[s] = mSamples[firstSample+s].weight * sqrt(det);
					for( int x=0 ; x<(Primal?8:27) ; x++ )
					{
						ab2[x][2*s  ] = ab1[x][2*s  ] * newWeight;
						ab2[x][2*s+1] = ab1[x][2*s+1] * newWeight;
					}
				}
				else
				{
					if( det!=det )
					{
						// det is NaN: report the differential and terminate
						printf( "Warning: Degenerate! (det=%f)\n" , det );
						printf( "%f %f\n%f %f\n%f %f\n" , dF(0,0) , dF(1,0) , dF(0,1) , dF(1,1) , dF(0,2) , dF(1,2) );
						exit( 0 );
					}
					else if( det<=0 )
					{
						// Singular metric: the sample contributes nothing
						degenerateSamples = true;
						D *= 0;
						det = 0;
					}
					else
					{
						// Replace D by its inverse
						double X = D(1,1)/det;
						double Y = D(0,0)/det;
						double Z =-D(0,1)/det;
						D( 0 , 0 ) = X;
						D( 1 , 0 ) = D( 0 , 1 ) = Z;
						D( 1 , 1 ) = Y;
					}
					// The authalic flow drops the sqrt(det) area factor from the weight
					if( flowType==FLOW_AUTHALIC_LAPLACIAN ) det = 1;
					double newWeight = mSamples[firstSample+s].weight * sqrt(det);
					newWeights[s] = newWeight;
					for( int x=0 ; x<(Primal?8:27) ; x++ )
					{
						ab2[x][2*s  ] = ( D( 0 , 0 ) * ab1[x][2*s] + D( 0 , 1 ) * ab1[x][2*s+1] ) * newWeight;
						ab2[x][2*s+1] = ( D( 1 , 0 ) * ab1[x][2*s] + D( 1 , 1 ) * ab1[x][2*s+1] ) * newWeight;
					}
				}
			}

			// Integrate over the samples. Only the y>=x entries are computed and then mirrored.
			for( int x=0 ; x<(Primal?8:27) ; x++ )
			{
				ConstPointer( double ) valuesX = values[x];
				// Viewed as __m128d, entry s holds the two tangential components of sample s
				ConstPointer( __m128d ) dValuesX = ( ConstPointer( __m128d ) ) ab1[x];
				C& _dotV = dotV[idx][x];
				C& _lapV = lapV[idx][x];
				const C& _x = coefficients[idx][x];
				for( int y=x ; y<(Primal?8:27) ; y++ )
				{
					ConstPointer( double ) valuesY = values[y];
					double tDot = 0;
					ConstPointer( __m128d ) dValuesY = ( ConstPointer( __m128d ) ) ab2[y];
					__declspec (align(16)) __m128d _tLap = _mm_set1_pd( 0 );
					for( int s=0 ; s<sampleCount ; s++ ) tDot += valuesX[s] * valuesY[s] * newWeights[s] , _tLap = _mm_add_pd( _tLap , _mm_mul_pd( dValuesX[s] , dValuesY[s] ) );
					// Sum the two SSE lanes to finish the 2D dot-products
					double tLap = ((double*)&_tLap)[0] + ((double*)&_tLap)[1];
					dotM[idx][y][x] = dotM[idx][x][y] = tDot;
					lapM[idx][y][x] = lapM[idx][x][y] = tLap;
					_dotV += coefficients[idx][y] * tDot;
					_lapV += coefficients[idx][y] * tLap;
					// The mirrored entry contributes to row y (skipped on the diagonal to avoid double-counting)
					if( x!=y ) dotV[idx][y] += _x * tDot , lapV[idx][y] += _x * tLap;
				}
			}
		}
		AlignedFreePointer( newWeights );
		for( int x=0 ; x<(Primal?8:27) ; x++ ) AlignedFreePointer( values[x] );
		for( int x=0 ; x<(Primal?8:27) ; x++ ) AlignedFreePointer( ab1[x] );
		for( int x=0 ; x<(Primal?8:27) ; x++ ) AlignedFreePointer( ab2[x] );
	}
	if( degenerateSamples ) fprintf( stderr , "Warning: Degenerate samples found!\n" );
}
