/*
*                               imass.c
*
*       Copyright 1999 by The University at Stony Brook, All rights reserved.
*/

#include <tri/trilocaldecs.h>

#if defined (PETSC)
#include "petscksp.h"
#include "petsc.h"
#endif /* if defined (PETSC) */

#if defined(PETSC)
/*
*       File-scope state that exists only in PETSC builds.
*       NOTE(review): neither K nor comm_size is referenced in the part of
*       the file reviewed here; their meaning should be confirmed at the
*       point of use.
*/
LOCAL   float     K[2] = {0.5,0.5};
LOCAL   int       comm_size = 1;
#endif /* if defined(PETSC) */

#if defined(TWOD)

/*
*       Exchange the values of a and b.  The macro does not declare its
*       scratch variable: a variable named `temp`, of a type compatible with
*       a and b, must already be in scope wherever SWAP is used.  The
*       expansion is a bare brace block, so a trailing semicolon after the
*       invocation produces an extra empty statement.
*/
#define     SWAP(a,b)    {temp=(a); (a) = (b); (b) = temp;}

/*
*       Declarations of the two LAPACK routines used for matrix inversion.
*       They are given C linkage when the file is compiled as C++.  All
*       arguments are passed by address, following the Fortran convention.
*/
#if defined(__cplusplus)
extern "C" {
#endif /* defined(__cplusplus) */
    FORTRAN int    FORTRAN_NAME(dgetri)(int*, double*, int*, int*, double*, int *, int *);
                   /* dgetri: computes the inverse of a matrix from the LU */
                   /* factorization produced by dgetrf.  In LAPACK's       */
                   /* argument order: N, A, LDA, IPIV, WORK, LWORK, INFO.  */
    FORTRAN int    FORTRAN_NAME(dgetrf)(int*, int*, double*, int*, int*, int*);
                   /* dgetrf: LU factorization of a general m x n matrix.  */
                   /* In LAPACK's argument order: M, N, A, LDA, IPIV, INFO. */
#if defined(__cplusplus)
}
#endif /* defined(__cplusplus) */

// IMPORT void      matrix_inv(long double**,int,long double**);
// IMPORT void      comp_mass_matrix(int,TRI*,int,long double**);
// IMPORT void      comp_mass_matrix_1st_row(int,TRI*,int,long double**);
// IMPORT void      inverse_matrix(long double**,int,long double**);
// IMPORT void      print_ldb_matrix(const char*,int,int,long double**,const char*);

/*
*       Forward declarations of two file-local helpers.  Their definitions
*       are not in the portion of the file reviewed here.
*/
LOCAL long double     integral_eta_xi(int,int);
LOCAL int             factorial(int);
//IMPORT long double     int_x(float*,float*,float*,long double);
//IMPORT long double     int_y(float*,float*,float*,long double);
//IMPORT long double     int_x2(float*,float*,float*,long double);
//IMPORT long double     int_y2(float*,float*,float*,long double);
//IMPORT long double     int_xy(float*,float*,float*,long double);
//IMPORT long double     int_x3(float*,float*,float*,long double);
//IMPORT long double     int_y3(float*,float*,float*,long double);
//IMPORT long double     int_x2y(float*,float*,float*,long double);
//IMPORT long double     int_xy2(float*,float*,float*,long double);
//IMPORT long double     int_x4(float*,float*,float*,long double);
//IMPORT long double     int_y4(float*,float*,float*,long double);
//IMPORT long double     int_x3y(float*,float*,float*,long double);
//IMPORT long double     int_x2y2(float*,float*,float*,long double);
//IMPORT long double     int_xy3(float*,float*,float*,long double);

/*
*       Forward declarations of file-local routines.
*
*       Linear-algebra helpers.  Their definitions are not in the portion of
*       the file reviewed here.
*/
LOCAL void      lubksb(long double**,int,int*,long double*);
LOCAL void      ludcmp(long double**,int,int*,long double*);
LOCAL void      gaussj(long double**,int,long double**,int);
LOCAL void      inverse_matrix_gj(double**,int,double**);
/*
*       Whole-triangle mass matrix builders.  comp_mass_matrix_p1() and
*       comp_mass_matrix_p2() are defined EXPORT further down, which is why
*       the LOCAL prototypes of the p2/p3 variants are commented out.
*/
// LOCAL void      comp_mass_matrix_p2(int,TRI*,double*,int,double**);
// LOCAL void      comp_mass_matrix_p3(int,TRI*,double*,int,double**);
LOCAL void      comp_mass_matrix_p4(int,TRI*,int,double**);
LOCAL void      comp_mass_matrix_1st_row_p1(int,TRI*,int,double*,double**);
LOCAL void      comp_mass_matrix_1st_row_p2(int,TRI*,int,double*,double**);
LOCAL void      comp_mass_matrix_1st_row_p3(int,TRI*,int,double*,double**);
LOCAL void      comp_mass_matrix_1st_row_p4(int,TRI*,int,double*,double**);
/*
*       Control-volume (sub-triangle) mass rows.  Arguments of the first
*       three, in order: n_coeff, reference centre, the three sub-triangle
*       vertices, output row.
*/
LOCAL void      comp_CV_mass_matrix_p1(int,float*,float*,float*,float*,float*);
LOCAL void      comp_CV_mass_matrix_p2(int,float*,float*,float*,float*,float*);
LOCAL void      comp_CV_mass_matrix_p3(int,float*,float*,float*,float*,float*);
LOCAL void      comp_CV_mass_matrix_1st_row_p3(int,TRI*,int,int,float*,float**);
LOCAL void      comp_CV_mass_matrix_1st_row_p2(int,TRI*,int,int,float*,float**);


/*
*       Point-wise evaluation of basis monomials at tabulated coordinates.
*       B_val() is defined outside the portion of the file reviewed here.
*/
LOCAL long double B_val(float crds[][2],double*,int,int);
LOCAL long double normalized_B_val(double crds[][2],double*,double,int,int);

/*
*                       normalized_B_val():
*
*       Evaluates one scaled monomial basis function at a tabulated point.
*
*       With  X = (crds[pos][0] - cent[0])/sqrt_area  and
*             Y = (crds[pos][1] - cent[1])/sqrt_area,
*       the value returned for each indx is
*
*           0: 1
*           1: X        2: Y
*           3: X^2      4: X*Y      5: Y^2
*           6: X^3      7: X^2*Y    8: X*Y^2    9: Y^3
*          10: X^4     11: X^3*Y   12: X^2*Y^2 13: X*Y^3  14: Y^4
*
*       crds        table of point coordinates (x,y per row)
*       cent        centre the monomials are taken about
*       sqrt_area   length scale dividing both coordinate offsets
*       pos         row of crds to evaluate at
*       indx        basis function index, 0..14
*
*       Any other indx prints an error and calls clean_up(ERROR).  The
*       trailing return only guarantees that the function never falls off
*       its end should clean_up() ever return.
*
*       The arithmetic of each case is intentionally left exactly as it was
*       (some cases divide a long double offset, others divide in double
*       first) so that results stay bit-for-bit the same.
*/
LOCAL long double normalized_B_val(
        double   crds[][2],
        double   *cent,
        double   sqrt_area,
        int      pos,
        int      indx)
{
        long double tmpx, tmpy;

        switch(indx)
        {
        case 0:
            return 1.0;
        case 1:
            tmpx = crds[pos][0] - cent[0];
            return tmpx/sqrt_area;
        case 2:
            tmpy = crds[pos][1] - cent[1];
            return tmpy/sqrt_area;
        case 3:
            tmpx = crds[pos][0] - cent[0];
            return sqr(tmpx/sqrt_area);
        case 4:
            tmpx = (crds[pos][0] - cent[0])/sqrt_area;
            tmpy = (crds[pos][1] - cent[1])/sqrt_area;
            return tmpx*tmpy;
        case 5:
            tmpy = crds[pos][1] - cent[1];
            return sqr(tmpy/sqrt_area);
        case 6:
            tmpx = crds[pos][0] - cent[0];
            return cub(tmpx/sqrt_area);
        case 7:
            tmpx = (crds[pos][0] - cent[0])/sqrt_area;
            tmpy = (crds[pos][1] - cent[1])/sqrt_area;
            return (sqr(tmpx)*tmpy);
        case 8:
            tmpx = (crds[pos][0] - cent[0])/sqrt_area;
            tmpy = (crds[pos][1] - cent[1])/sqrt_area;
            return (tmpx*sqr(tmpy));
        case 9:
            tmpy = (crds[pos][1] - cent[1])/sqrt_area;
            return cub(tmpy);
        case 10:
            tmpx = (crds[pos][0] - cent[0])/sqrt_area;
            return sqr(tmpx)*sqr(tmpx);
        case 11:
            tmpx = (crds[pos][0] - cent[0])/sqrt_area;
            tmpy = (crds[pos][1] - cent[1])/sqrt_area;
            return cub(tmpx)*tmpy;
        case 12:
            tmpx = (crds[pos][0] - cent[0])/sqrt_area;
            tmpy = (crds[pos][1] - cent[1])/sqrt_area;
            return sqr(tmpx)*sqr(tmpy);
        case 13:
            tmpx = (crds[pos][0] - cent[0])/sqrt_area;
            tmpy = (crds[pos][1] - cent[1])/sqrt_area;
            return (tmpx)*cub(tmpy);
        case 14:
            tmpy = (crds[pos][1] - cent[1])/sqrt_area;
            return sqr(tmpy)*sqr(tmpy);
        default:
            break;
        }

        printf("ERROR: normalized_B_val()\n");
        clean_up(ERROR);
        return 0.0; /* reached only if clean_up() returns */
}


/*
*                       Mag_CV_mass_row_16pt():
*
*       Helper of comp_Mag_mass_matrix().  Fills everything that belongs to
*       control volume k, the sub-triangle with vertices v0, v1, v2:
*
*           tri->CVcent[k]            its centroid, via comput_tri_cent()
*           tri->CVmass_matrix[k][i]  for i = 0 .. MAX_N_COEF-1, the 16-point
*                                     quadrature of  B_i * B_0  over the
*                                     sub-triangle, times the signed area
*                                     0.5*((v1-v0) x (v2-v0)).
*
*       The basis functions are evaluated by normalized_B_val() about cent
*       and scaled by sqrt_area, i.e. about the centre and scale handed in
*       by the caller, not those of the sub-triangle itself.
*
*       w   the 16 quadrature weights, in the point order produced by
*           tri_quadrature_16_pts()
*/
LOCAL void Mag_CV_mass_row_16pt(
        TRI      *tri,
        int      k,
        double   *v0,
        double   *v1,
        double   *v2,
        double   *cent,
        double   sqrt_area,
        double   *w)
{
        double       crds[20][2];
        long double  tmpans[20];
        double       tmp_area;
        int          i, j;

        tri_quadrature_16_pts(v0, v1, v2, crds);
        tmp_area = (v1[0]-v0[0])*(v2[1]-v0[1]) -
                   (v2[0]-v0[0])*(v1[1]-v0[1]);
        tmp_area *= 0.5;
        comput_tri_cent(2, v0, v1, v2, tri->CVcent[k]);
        for(i = 0; i < MAX_N_COEF; i++)
        {
            for(j = 0; j < 16; j++)
                tmpans[j] = normalized_B_val(crds, cent, sqrt_area, j, i)*
                            normalized_B_val(crds, cent, sqrt_area, j, 0);
            tri->CVmass_matrix[k][i] = 0.0;
            for(j = 0; j < 16; j++)
                tri->CVmass_matrix[k][i] += tmpans[j]*w[j];
            tri->CVmass_matrix[k][i] *= tmp_area;
        }
}

/*
*                       comp_Mag_mass_matrix():
*
*       Ref: A note on the approximation properties of the locally
*       divergence-free finite elements.
*       Int. J. Numer. Anal. and Modeling. 5(4):693-703, 2008
*
*       Builds the mass matrix of the normalized monomial basis (see
*       normalized_B_val()) on triangle tri.
*
*       MAX_N_COEF == 1:            mass_m[0][0] = 0.5 * signed determinant
*                                   of the vertex coordinates.
*       MAX_N_COEF == 3,6,10,15:    mass_m[k][i] = fg_area(tri) * sum_j
*                                   w[j]*B_i(q_j)*B_k(q_j) over the 16 points
*                                   q_j given by tri_quadrature_16_pts().
*       any other value:            error message and clean_up(ERROR).
*
*       When tri->CVmass_matrix is not NULL the triangle is additionally
*       split by its edge midpoints into four sub-triangles
*           CV0 = (p0, m0, m2),  CV1 = (m0, p1, m1),
*           CV2 = (m1, p2, m2),  CV3 = (m0, m1, m2),
*       where m_i is the midpoint of edge (p_i, p_(i+1)%3), and
*       tri->CVcent[k] / tri->CVmass_matrix[k][] are filled for each one.
*
*       n_coeff     not used by this routine
*       tri         triangle to process
*       dim         number of coordinate components used for the midpoints
*                   (midpt holds two per point)
*       mass_m      output matrix, at least MAX_N_COEF x MAX_N_COEF; unlike
*                   comp_mass_matrix() this routine does not test it for NULL
*
*       NOTE(review): the weights add up to one; the rule looks like a
*       Dunavant-type 16-point rule, to be confirmed against
*       tri_quadrature_16_pts().
*/
EXPORT void comp_Mag_mass_matrix(
        int     n_coeff,
        TRI     *tri, 
        int     dim,
        double   **mass_m)
{
        static double w[16] ={0.144315607677787,0.095091634267285,0.095091634267285,0.095091634267285,
                             0.103217370534718, 0.103217370534718,0.103217370534718,
                             0.032458497623198,0.032458497623198,0.032458497623198,
                             0.027230314174435,0.027230314174435,0.027230314174435,
                             0.027230314174435,0.027230314174435,0.027230314174435};
        POINT    *p[3];
        int      i, j, k;
        double   *pcrds[3], midpt[3][2];
        long double det;
        double   *cent;
        double   crds[20][2];  
        long  double tmpans[20];
        double   sqrt_area;

        sqrt_area = sqrt(fg_area(tri));
        cent = fg_centroid(tri);
        
        for(i = 0; i < 3; i++)
        {
            p[i] = Point_of_tri(tri)[i];
            pcrds[i] = Coords(p[i]);
        }   

        tri_quadrature_16_pts(pcrds[0], pcrds[1], pcrds[2], crds);

        if(MAX_N_COEF == 1)
        {
            det = (long double)(pcrds[1][0]-pcrds[0][0])*(pcrds[2][1]-pcrds[0][1]) -
                  (long double)(pcrds[2][0]-pcrds[0][0])*(pcrds[1][1]-pcrds[0][1]);
            mass_m[0][0] = det*0.5;
        }
        else if(MAX_N_COEF == 3 || MAX_N_COEF == 6 || MAX_N_COEF == 10 ||  MAX_N_COEF == 15)
        {
            for(k = 0; k < MAX_N_COEF; k++)
            {
                for(i = 0; i < MAX_N_COEF; i++)
                {
                    for(j = 0; j < 16; j++)
                        tmpans[j] = normalized_B_val(crds, cent, sqrt_area, j, i)*
                                normalized_B_val(crds, cent, sqrt_area, j, k);
                    mass_m[k][i] = 0.0;
                    for(j = 0; j < 16; j++)
                        mass_m[k][i] += tmpans[j]*w[j];
                    mass_m[k][i] *= fg_area(tri);
                }
            }
        }
        else
        {
            printf("ERROR: implement comp_Mag_mass_matrix for MAX_N_COEF = %d\n",
                MAX_N_COEF);
            clean_up(ERROR);
        }

        if(tri->CVmass_matrix != NULL)
        {
            /* midpt[i] is the midpoint of the edge from vertex i to vertex i+1 */
            for(i = 0; i < 3; i++)
            {       
                for(j = 0; j < dim; j++)
                    midpt[i][j] = 0.5*(pcrds[i][j] + pcrds[(i+1)%3][j]);
            }       

            /* three corner sub-triangles, then the central one */
            Mag_CV_mass_row_16pt(tri, 0, pcrds[0], midpt[0], midpt[2],
                                 cent, sqrt_area, w);
            Mag_CV_mass_row_16pt(tri, 1, midpt[0], pcrds[1], midpt[1],
                                 cent, sqrt_area, w);
            Mag_CV_mass_row_16pt(tri, 2, midpt[1], pcrds[2], midpt[2],
                                 cent, sqrt_area, w);
            Mag_CV_mass_row_16pt(tri, 3, midpt[0], midpt[1], midpt[2],
                                 cent, sqrt_area, w);
        }
}

/*
*                       comp_mass_matrix():
*
*       Dispatches on MAX_N_COEF to build the mass matrix of triangle tri
*       and the per-control-volume data tri->CVcent[] / tri->CVmass_matrix[].
*
*       (Original note: transform onto the right triangle -- the reference
*       triangle in Xi-Eta coords -- and compute coeff.)
*
*       With p_i the vertices, m_i the midpoint of edge (p_i, p_(i+1)%3) and
*       c = fg_centroid(tri):
*
*       MAX_N_COEF == 1    mass_m[0][0] = 0.5 * signed vertex determinant.
*       MAX_N_COEF == 3    comp_mass_matrix_p1(); four sub-triangles
*                          (p0,m0,m2) (m0,p1,m1) (m1,p2,m2) (m0,m1,m2),
*                          CV rows taken about the triangle centre c.
*       MAX_N_COEF == 6    same layout with the p2 routines.
*       MAX_N_COEF == 10   comp_mass_matrix_p3(), then depending on N_PART:
*                            6: (p_i,m_i,c) -> CV 2i, (m_i,p_(i+1),c) -> CV 2i+1
*                            4: the four sub-triangles listed above
*                            7: for i = 0..2, CV 4+i is the triangle formed by
*                               the midpoint of the median from p_i and the
*                               two quarter points of the opposite edge
*                          In all three, CV rows are taken about the
*                          sub-triangle's own centroid tri->CVcent[k].
*                          Other N_PART values fill nothing.
*       MAX_N_COEF == 15   comp_mass_matrix_p4() only.
*       otherwise          error message and clean_up(ERROR).
*
*       tri->CVcent[k] is always written; tri->CVmass_matrix[k] is written
*       only when tri->CVmass_matrix is not NULL.  The P1/P2 branches already
*       had that guard; it is now applied to the P3 branches as well, which
*       previously dereferenced the pointer unconditionally.
*
*       NOTE(review): for N_PART == 7 only CV 4..6 are filled here; CV 0..3
*       are not touched by this branch -- confirm that this is intended.
*
*       n_coeff     forwarded unchanged to the order-specific routines
*       dim         number of coordinate components (midpt holds two)
*       mass_m      output matrix; the routine returns at once if it is NULL
*/
EXPORT void comp_mass_matrix(
        int     n_coeff,
        TRI     *tri,
        int     dim,
        double   **mass_m)
{
        POINT   *p[3];
        int     i, j, k;
        int     has_CV_mass;
        double   *pcrds[3], midpt[3][2], tmp[2], *cent;
        double   *sub[4][3];
        long double det;

        if(NULL == mass_m) return;

        cent = fg_centroid(tri);
        has_CV_mass = (tri->CVmass_matrix != NULL);

        for(i = 0; i < 3; i++)
        {
            p[i] = Point_of_tri(tri)[i];
            pcrds[i] = Coords(p[i]);
        }
        for(i = 0; i < 3; i++)
        {
            for(j = 0; j < dim; j++)
                midpt[i][j] = 0.5*(pcrds[i][j] + pcrds[(i+1)%3][j]);
        }

        /* vertex triples of the 4 self-similar sub-triangles */
        sub[0][0] = pcrds[0]; sub[0][1] = midpt[0]; sub[0][2] = midpt[2];
        sub[1][0] = midpt[0]; sub[1][1] = pcrds[1]; sub[1][2] = midpt[1];
        sub[2][0] = midpt[1]; sub[2][1] = pcrds[2]; sub[2][2] = midpt[2];
        sub[3][0] = midpt[0]; sub[3][1] = midpt[1]; sub[3][2] = midpt[2];

        if(MAX_N_COEF == 1)
        {
            det = (long double)(pcrds[1][0]-pcrds[0][0])*(pcrds[2][1]-pcrds[0][1]) -
                  (long double)(pcrds[2][0]-pcrds[0][0])*(pcrds[1][1]-pcrds[0][1]);
            mass_m[0][0] = det*0.5;
        }
        else if(MAX_N_COEF == 3)
        {
            comp_mass_matrix_p1(n_coeff, tri, cent, dim, mass_m);
            //// NOTE that CV is integrated with respect to triangle center.
            //// NOT the partial cell center.
            for(k = 0; k < 4; k++)
            {
                comput_tri_cent(2, sub[k][0], sub[k][1], sub[k][2], tri->CVcent[k]);
                if(has_CV_mass)
                    comp_CV_mass_matrix_p1(n_coeff, cent, sub[k][0], sub[k][1],
                                           sub[k][2], tri->CVmass_matrix[k]);
            }
        }
        else if(MAX_N_COEF == 6)
        {
            comp_mass_matrix_p2(n_coeff, tri, cent, dim, mass_m);
            //// NOTE that CV is integrated with respect to triangle center.
            //// NOT the partial cell center.
            for(k = 0; k < 4; k++)
            {
                comput_tri_cent(2, sub[k][0], sub[k][1], sub[k][2], tri->CVcent[k]);
                if(has_CV_mass)
                    comp_CV_mass_matrix_p2(n_coeff, cent, sub[k][0], sub[k][1],
                                           sub[k][2], tri->CVmass_matrix[k]);
            }
        }
        else if(MAX_N_COEF == 10)
        {
            comp_mass_matrix_p3(n_coeff, tri, cent, dim, mass_m);
            if(N_PART == 6)
            {
                /// partition suggested by Yingjie
                for(i = 0; i < 3; i++)
                {
                    /* first half of edge i, joined to the centroid */
                    k = 2*i;
                    comput_tri_cent(2, pcrds[i], midpt[i], cent, tri->CVcent[k]);
                    if(has_CV_mass)
                        comp_CV_mass_matrix_p3(n_coeff, tri->CVcent[k], pcrds[i],
                                               midpt[i], cent, tri->CVmass_matrix[k]);

                    /* second half of edge i, joined to the centroid */
                    k = 2*i + 1;
                    comput_tri_cent(2, midpt[i], pcrds[(i+1)%3], cent, tri->CVcent[k]);
                    if(has_CV_mass)
                        comp_CV_mass_matrix_p3(n_coeff, tri->CVcent[k], midpt[i],
                                               pcrds[(i+1)%3], cent, tri->CVmass_matrix[k]);
                }
            }
            else if(N_PART == 4)
            {
                //// 4 self-similar sub-triangles
                for(k = 0; k < 4; k++)
                {
                    comput_tri_cent(2, sub[k][0], sub[k][1], sub[k][2], tri->CVcent[k]);
                    if(has_CV_mass)
                        comp_CV_mass_matrix_p3(n_coeff, tri->CVcent[k], sub[k][0],
                                               sub[k][1], sub[k][2], tri->CVmass_matrix[k]);
                }
            }
            else if(N_PART == 7)
            {
                /* midpt[] is reused as scratch for the three vertices of CV 4+i */
                for(i = 0; i < 3; i++)
                {
                    for(j = 0; j < dim; j++)
                    {
                        tmp[j] = 0.5*(pcrds[(i+1)%3][j] + pcrds[(i+2)%3][j]);
                        midpt[0][j] = 0.5*(pcrds[i][j] + tmp[j]);
                        midpt[1][j] = 0.75*pcrds[(i+1)%3][j] + 0.25*pcrds[(i+2)%3][j];
                        midpt[2][j] = 0.25*pcrds[(i+1)%3][j] + 0.75*pcrds[(i+2)%3][j];
                    }
                    comput_tri_cent(2, midpt[0],  midpt[1], midpt[2], tri->CVcent[4+i]);
                    if(has_CV_mass)
                        comp_CV_mass_matrix_p3(n_coeff, tri->CVcent[4+i], midpt[0],
                                               midpt[1], midpt[2], tri->CVmass_matrix[4+i]);
                } 
            }
        }
        else if(MAX_N_COEF == 15)
        {
            comp_mass_matrix_p4(n_coeff, tri, dim, mass_m);
        }
        else
        {
            printf("ERROR: implement comp_mass_matrix for MAX_N_COEF = %d\n",
                MAX_N_COEF);
            clean_up(ERROR);
        }
}

/*
*                       comp_CV_mass_matrix_p3():
*
*       Fills the ten moments of the sub-triangle (p0, p1, p2) about cent:
*
*           mass_m[0]       0.5*det, the signed area
*           mass_m[1], [2]  integrals of (x-cx) and (y-cy)
*           mass_m[3..5]    integrals of (x-cx)^2, (x-cx)(y-cy), (y-cy)^2
*           mass_m[6..9]    13-point quadrature of B_val(crds, cent, j, i),
*                           scaled by the signed area
*
*       Entries 1..5 are assembled from the raw moments returned by
*       int_x(), int_y(), int_x2(), int_xy() and int_y2().  int_x() and
*       int_y() are now evaluated once and reused; they were previously
*       recomputed for every entry (this assumes they depend only on their
*       arguments).
*
*       n_coeff     not used by this routine
*       cent        centre the moments are taken about
*       p0,p1,p2    vertices of the sub-triangle
*       mass_m      output row, at least ten entries
*
*       NOTE(review): the grouping of the weights w1..w4 (1+3+3+6 points)
*       must match the point order of tri_quadrature_13_pts(); B_val() is
*       presumably the un-normalized counterpart of normalized_B_val() --
*       confirm both.
*/
LOCAL void comp_CV_mass_matrix_p3(
        int     n_coeff,
        float   *cent,
        float   *p0,
        float   *p1,
        float   *p2,
        float   *mass_m)
{
        static long double w1 =-0.149570044467670, w2 = 0.053347235608839,
                    w3 = 0.175615257433204,  w4 = 0.077113760890257;
        float   det, area;
        float   crds[13][2];
        long double   tmpans[13];
        long double   ix, iy;
        int    i, j;

        tri_quadrature_13_pts(p0, p1, p2, crds);

        det = (float)(p1[0]-p0[0])*(p2[1]-p0[1]) -
              (float)(p2[0]-p0[0])*(p1[1]-p0[1]);

        area = mass_m[0] = det*0.5;

        /* raw first moments, shared by entries 1..5 */
        ix = int_x(p0,p1,p2,det);
        iy = int_y(p0,p1,p2,det);

        mass_m[1] = ix - cent[0]*mass_m[0];

        mass_m[2] = iy - cent[1]*mass_m[0];

        mass_m[3] = int_x2(p0,p1,p2,det) - 2.0*cent[0]*
                       ix + sqr(cent[0])*mass_m[0];

        mass_m[4] = int_xy(p0,p1,p2,det) -
                       cent[1]*ix-
                       cent[0]*iy +
                       cent[0]*cent[1]*mass_m[0];

        mass_m[5] = int_y2(p0,p1,p2,det) - 2.0*cent[1]*
                       iy + sqr(cent[1])*mass_m[0];

        /* cubic entries by numerical quadrature */
        for(i = 6; i< 10; i++)
        {
            for(j = 0; j < 13; j++)
                tmpans[j] = B_val(crds, cent, j, i);
            mass_m[i] =  (w1*tmpans[0] + w2*(tmpans[1] + tmpans[2] + tmpans[3]) +
                             w3*(tmpans[4] + tmpans[5] + tmpans[6]) +
                             w4*(tmpans[7] + tmpans[8] + tmpans[9] +
                                 tmpans[10] + tmpans[11] + tmpans[12]))*area;
        }
}

/*
*                       comp_CV_mass_matrix_p2():
*
*       Fills the six moments of the sub-triangle (p0, p1, p2) about cent:
*
*           mass_m[0]       0.5*det, the signed area
*           mass_m[1], [2]  integrals of (x-cx) and (y-cy)
*           mass_m[3..5]    integrals of (x-cx)^2, (x-cx)(y-cy), (y-cy)^2
*
*       Each entry expands the shifted monomial into the raw moments given
*       by int_x(), int_y(), int_x2(), int_xy() and int_y2().  int_x() and
*       int_y() are now evaluated once and reused; they were previously
*       recomputed for every entry (this assumes they depend only on their
*       arguments).
*
*       n_coeff     not used by this routine
*       cent        centre the moments are taken about; it is supplied by
*                   the caller and is not recomputed from the sub-triangle
*       p0,p1,p2    vertices of the sub-triangle
*       mass_m      output row, at least six entries
*/
LOCAL void comp_CV_mass_matrix_p2(
        int     n_coeff,
        float   *cent,
        float   *p0,
        float   *p1,
        float   *p2,
        float   *mass_m)
{
        double        det;
        long double   ix, iy;

        det = (p1[0]-p0[0])*(p2[1]-p0[1]) -
              (p2[0]-p0[0])*(p1[1]-p0[1]);

        mass_m[0] = det*0.5;

        /* raw first moments, shared by the entries below */
        ix = int_x(p0,p1,p2,det);
        iy = int_y(p0,p1,p2,det);

        mass_m[1] = ix - cent[0]*mass_m[0];

        mass_m[2] = iy - cent[1]*mass_m[0];

        mass_m[3] = int_x2(p0,p1,p2,det) - 2.0*cent[0]*
                       ix + sqr(cent[0])*mass_m[0];

        mass_m[4] = int_xy(p0,p1,p2,det) -
                       cent[1]*ix-
                       cent[0]*iy +
                       cent[0]*cent[1]*mass_m[0];

        mass_m[5] = int_y2(p0,p1,p2,det) - 2.0*cent[1]*
                       iy + sqr(cent[1])*mass_m[0];
}

/*
*                       comp_CV_mass_matrix_p1():
*
*       Linear-order moments of the sub-triangle (p0, p1, p2) about cent:
*
*           mass_m[0]   half the signed vertex determinant (the area)
*           mass_m[1]   int_x() minus cent[0] times the area
*           mass_m[2]   int_y() minus cent[1] times the area
*
*       n_coeff is accepted for signature parity with the higher-order
*       variants and is not used.
*/
LOCAL void comp_CV_mass_matrix_p1(
        int     n_coeff,
        float   *cent,
        float   *p0,
        float   *p1,
        float   *p2,
        float   *mass_m)
{
        double   jac;

        /* cross product of the two edge vectors leaving p0 */
        jac = (p1[0]-p0[0])*(p2[1]-p0[1]) -
              (p2[0]-p0[0])*(p1[1]-p0[1]);

        /* zeroth moment */
        mass_m[0] = jac*0.5;

        /* first moments, shifted to the reference centre */
        mass_m[1] = int_x(p0,p1,p2,jac) - cent[0]*mass_m[0];
        mass_m[2] = int_y(p0,p1,p2,jac) - cent[1]*mass_m[0];
}

/*
*                       comp_mass_matrix_p1():
*
*       Builds the symmetric 3 x 3 mass matrix of the basis
*       { 1, x - cent[0], y - cent[1] } over triangle tri:
*
*           mass_m[a][b] = integral over tri of basis_a * basis_b.
*
*       Every entry expands the shifted monomials into the raw moments
*       returned by int_x(), int_y(), int_x2(), int_xy() and int_y2(); det is
*       the signed determinant of the vertex coordinates, so mass_m[0][0] is
*       the signed area.  int_x() and int_y() are evaluated once and reused;
*       they were previously recomputed three times each (this assumes they
*       depend only on their arguments).  The lower triangle is copied from
*       the upper one.
*
*       n_coeff, dim    not used by this routine
*       cent            centre the basis is taken about
*       mass_m          output, at least 3 x 3
*/
EXPORT void comp_mass_matrix_p1(
        int     n_coeff,
        TRI     *tri,
        double   *cent,
        int     dim,
        double   **mass_m)
{
        int     i;
        POINT   *p[3];
        double   *pcrds[3]; 
        long double   det;
        long double   ix, iy;
        
        for(i = 0; i < 3; i++)
        {
            p[i] = Point_of_tri(tri)[i];
            pcrds[i] = Coords(p[i]);
        }

        det = (long double)(pcrds[1][0]-pcrds[0][0])*(pcrds[2][1]-pcrds[0][1]) - 
              (long double)(pcrds[2][0]-pcrds[0][0])*(pcrds[1][1]-pcrds[0][1]);

        /* raw first moments, shared by several entries */
        ix = int_x(pcrds[0],pcrds[1],pcrds[2],det);
        iy = int_y(pcrds[0],pcrds[1],pcrds[2],det);

        /* row 0: <1,1>, <1,x'>, <1,y'> */
        mass_m[0][0] = det*0.5;
        mass_m[0][1] = ix - (long double)cent[0]*mass_m[0][0];
        mass_m[0][2] = iy - (long double)cent[1]*mass_m[0][0]; 

        /* row 1: <x',x'>, <x',y'> */
        mass_m[1][0] = mass_m[0][1];
        mass_m[1][1] = int_x2(pcrds[0],pcrds[1],pcrds[2],det) - (long double)(2.0*cent[0])*
                       ix + (long double)sqr(cent[0])*mass_m[0][0];
        mass_m[1][2] = int_xy(pcrds[0],pcrds[1],pcrds[2],det) -
                       (long double)cent[1]*ix-
                       (long double)cent[0]*iy +
                       (long double)(cent[0]*cent[1])*mass_m[0][0];

        /* row 2: <y',y'> */
        mass_m[2][0] = mass_m[0][2];
        mass_m[2][1] = mass_m[1][2];
        mass_m[2][2] = int_y2(pcrds[0],pcrds[1],pcrds[2],det) - (long double)(2.0*cent[1])*
                       iy + (long double)sqr(cent[1])*mass_m[0][0];

} 

EXPORT void comp_mass_matrix_p2(
        int     n_coeff,
        TRI     *tri,
        double   *cent,
        int     dim,
        double   **mass_m)
{
        int     i, j;
        // double   *cent = fg_centroid(tri);
        POINT   *p[3];
        float   *pcrds[3]; 
        long double   det;
        
        for(i = 0; i < 3; i++)
        {
            p[i] = Point_of_tri(tri)[i];
            pcrds[i] = Coords(p[i]);
        }

        det = (long double)(pcrds[1][0]-pcrds[0][0])*(pcrds[2][1]-pcrds[0][1]) - 
              (long double)(pcrds[2][0]-pcrds[0][0])*(pcrds[1][1]-pcrds[0][1]);

        mass_m[0][0] = det*0.5;

        mass_m[0][1] = int_x(pcrds[0],pcrds[1],pcrds[2],det) - (long double)cent[0]*mass_m[0][0];
        // mass_m[0][1] = 0.0;

        // TMP
        // printf("cent[%g %g]\n", cent[0], cent[1]);
        // printf("int_x = %g\n", int_x(pcrds[0],pcrds[1],pcrds[2],det) );

        mass_m[0][2] = int_y(pcrds[0],pcrds[1],pcrds[2],det) - (long double)cent[1]*mass_m[0][0]; 
        // mass_m[0][2] = 0.0; 

        mass_m[0][3] = int_x2(pcrds[0],pcrds[1],pcrds[2],det) - (long double)(2.0*cent[0])*
                       int_x(pcrds[0],pcrds[1],pcrds[2],det) + (long double)sqr(cent[0])*mass_m[0][0];

        mass_m[0][4] = int_xy(pcrds[0],pcrds[1],pcrds[2],det) - 
                       (long double)cent[1]*int_x(pcrds[0],pcrds[1],pcrds[2],det)-
                       (long double)cent[0]*int_y(pcrds[0],pcrds[1],pcrds[2],det) +
                       (long double)(cent[0]*cent[1])*mass_m[0][0];

        mass_m[0][5] = int_y2(pcrds[0],pcrds[1],pcrds[2],det) - (long double)(2.0*cent[1])*
                       int_y(pcrds[0],pcrds[1],pcrds[2],det) + (long double)sqr(cent[1])*mass_m[0][0];

        mass_m[1][0] = mass_m[0][1];
        mass_m[1][1] = mass_m[0][3];
        mass_m[1][2] = mass_m[0][4];
        mass_m[1][3] = int_x3(pcrds[0],pcrds[1],pcrds[2],det) - (long double)cub(cent[0])*mass_m[0][0] -
                        (long double)3.0*cent[0]*int_x2(pcrds[0],pcrds[1],pcrds[2],det) + 
                        (long double)3.0*sqr(cent[0])*int_x(pcrds[0],pcrds[1],pcrds[2],det);

        mass_m[1][4] = int_x2y(pcrds[0],pcrds[1],pcrds[2],det) - 
                       (long double)cent[1]*int_x2(pcrds[0],pcrds[1],pcrds[2],det) - 
                       (long double)(2.0*cent[0])*int_xy(pcrds[0],pcrds[1],pcrds[2],det) + 
                       (long double)(2.0*cent[0]*cent[1])*int_x(pcrds[0],pcrds[1],pcrds[2],det) +
                       (long double)sqr(cent[0])*int_y(pcrds[0],pcrds[1],pcrds[2],det) -
                       (long double)(sqr(cent[0])*cent[1])*mass_m[0][0];

        /**
        if(debug_flag == YES)
        {
            printf("IN comp_mass_matrix(), Tri(%d) debug mass_m[1][5]\n", tri->id);
            printf("int_x2y = %22.20Lg\n", int_x2y(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_x2 = %22.20Lg\n", int_x2(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_xy = %22.20Lg\n", int_xy(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_y = %22.20Lg\n", int_y(pcrds[0],pcrds[1],pcrds[2],det));
            // clean_up(0);
        }
        **/

        mass_m[1][5] = int_xy2(pcrds[0],pcrds[1],pcrds[2],det) - 
                       (long double)(2.0*cent[1])*int_xy(pcrds[0],pcrds[1],pcrds[2],det) +
                       (long double)(sqr(cent[1]))*int_x(pcrds[0],pcrds[1],pcrds[2],det) -
                       (long double)cent[0]*int_y2(pcrds[0],pcrds[1],pcrds[2],det) + 
                       (long double)(2.0*cent[0]*cent[1])*int_y(pcrds[0],pcrds[1],pcrds[2],det) -
                       (long double)(sqr(cent[1])*cent[0])*mass_m[0][0];  

        /**
        if(debug_flag == YES)
        {
            printf("IN comp_mass_matrix(), Tri(%d) debug mass_m[1][4]\n", tri->id);
            printf("int_xy2 = %22.20Lg\n", int_xy2(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_y2 = %22.20Lg\n", int_y2(pcrds[0],pcrds[1],pcrds[2],det));
            // clean_up(0);
        }
        **/

        mass_m[2][0] = mass_m[0][2];
        mass_m[2][1] = mass_m[0][4];
        mass_m[2][2] = mass_m[0][5];
        mass_m[2][3] = mass_m[1][4];
        mass_m[2][4] = mass_m[1][5];      
        mass_m[2][5] = int_y3(pcrds[0],pcrds[1],pcrds[2],det) - 
                       (long double)cub(cent[1])*mass_m[0][0] - 
                       (long double)(3.0*cent[1])*int_y2(pcrds[0],pcrds[1],pcrds[2],det) + 
                       (long double)(3.0*sqr(cent[1]))*int_y(pcrds[0],pcrds[1],pcrds[2],det);

        mass_m[3][0] = mass_m[0][3];
        mass_m[3][1] = mass_m[1][3];
        mass_m[3][2] = mass_m[2][3];
        mass_m[3][3] = int_x4(pcrds[0],pcrds[1],pcrds[2],det) + 
                   (long double)(sqr(cent[0])*sqr(cent[0]))*mass_m[0][0] +
                   (long double)(6.0*sqr(cent[0]))*int_x2(pcrds[0],pcrds[1],pcrds[2],det) -
                   (long double)(4.0*cent[0])*int_x3(pcrds[0],pcrds[1],pcrds[2],det) - 
                   (long double)(4.0*cub(cent[0]))*int_x(pcrds[0],pcrds[1],pcrds[2],det); 

        mass_m[3][4] = int_x3y(pcrds[0],pcrds[1],pcrds[2],det) -
                       (long double)cent[1]*int_x3(pcrds[0],pcrds[1],pcrds[2],det) -
                       (long double)(3.0*cent[0])*int_x2y(pcrds[0],pcrds[1],pcrds[2],det) +
                       (long double)(3.0*cent[0]*cent[1])*int_x2(pcrds[0],pcrds[1],pcrds[2],det) +
                       (long double)(3.0*sqr(cent[0]))*int_xy(pcrds[0],pcrds[1],pcrds[2],det) -
                       (long double)(3.0*sqr(cent[0])*cent[1])*int_x(pcrds[0],pcrds[1],pcrds[2],det) -
                       (long double)cub(cent[0])*int_y(pcrds[0],pcrds[1],pcrds[2],det) +
                       (long double)(cub(cent[0])*cent[1])*mass_m[0][0];
        // TMP
        /**
        if(debug_flag == YES)
        {
            printf("IN comp_mass_matrix(), Tri(%d) debug mass_m[3][4]\n", tri->id);
            printf("int_x3y = %Lg\n", int_x3y(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_x3 = %22.20Lg\n",  int_x3(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_x2y = %Lg\n", int_x2y(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_x2 = %Lg\n", int_x2(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_xy = %Lg\n", int_xy(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_x = %Lg\n", int_x(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_y = %Lg\n", int_y(pcrds[0],pcrds[1],pcrds[2],det));
            // clean_up(0);
        }
        **/

        mass_m[3][5] = int_x2y2(pcrds[0],pcrds[1],pcrds[2],det)-
                       (long double)(2.0*cent[1])*int_x2y(pcrds[0],pcrds[1],pcrds[2],det)+
                       (long double)(sqr(cent[1]))*int_x2(pcrds[0],pcrds[1],pcrds[2],det)-
                       (long double)(2.0*cent[0])*int_xy2(pcrds[0],pcrds[1],pcrds[2],det)+
                       (long double)(4.0*cent[0]*cent[1])*int_xy(pcrds[0],pcrds[1],pcrds[2],det)-
                       (long double)(2.0*cent[0]*sqr(cent[1]))*int_x(pcrds[0],pcrds[1],pcrds[2],det)+
                       (long double)sqr(cent[0])*int_y2(pcrds[0],pcrds[1],pcrds[2],det)-
                       (long double)(2.0*cent[1]*sqr(cent[0]))*int_y(pcrds[0],pcrds[1],pcrds[2],det) +
                       (long double)(sqr(cent[0])*sqr(cent[1]))*mass_m[0][0];

        /**
        if(debug_flag == YES)
        {
            printf("IN comp_mass_matrix(), Tri(%d) debug mass_m[3][4]\n", tri->id);
            printf("int_x2y2 = %22.20Lg\n", int_x2y2(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_x2y = %22.20Lg\n",  int_x2y(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_x2 = %22.20Lg\n", int_x2(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_xy2 = %22.20Lg\n", int_xy2(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_xy = %22.20Lg\n", int_xy(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_y2 = %22.20Lg\n", int_y2(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_y = %22.20Lg\n", int_y(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_x = %22.20Lg\n", int_x(pcrds[0],pcrds[1],pcrds[2],det));
            // clean_up(0);
        }
        **/

        mass_m[4][0] = mass_m[0][4];
        mass_m[4][1] = mass_m[1][4];
        mass_m[4][2] = mass_m[2][4];
        mass_m[4][3] = mass_m[3][4];
        mass_m[4][4] = mass_m[3][5];
        mass_m[4][5] = int_xy3(pcrds[0],pcrds[1],pcrds[2],det) -
                       (long double)(3.0*cent[1])*int_xy2(pcrds[0],pcrds[1],pcrds[2],det)+
                       (long double)(3.0*sqr(cent[1]))*int_xy(pcrds[0],pcrds[1],pcrds[2],det)-
                       (long double)(cub(cent[1]))*int_x(pcrds[0],pcrds[1],pcrds[2],det)-
                       (long double)(cent[0])*int_y3(pcrds[0],pcrds[1],pcrds[2],det) +
                       (long double)(3.0*cent[0]*cent[1])*int_y2(pcrds[0],pcrds[1],pcrds[2],det)-
                       (long double)(3.0*cent[0]*sqr(cent[1]))*int_y(pcrds[0],pcrds[1],pcrds[2],det)+
                       (long double)(cent[0]*cub(cent[1]))*mass_m[0][0];
        /**
        if(tri->id == 61)
        {
            printf("IN comp_mass_matrix(), Tri(%d) debug mass_m[5][4]\n", tri->id);
            printf("int_xy3 = %22.20Lg\n", int_xy3(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_xy2 = %22.20Lg\n",  int_xy2(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_xy = %22.20Lg\n", int_xy(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_y2 = %22.20Lg\n", int_y2(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_y3 = %22.20Lg\n", int_y3(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_y = %22.20Lg\n", int_y(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_x = %22.20Lg\n", int_x(pcrds[0],pcrds[1],pcrds[2],det));
            // clean_up(0);
        }
        **/

        mass_m[5][0] = mass_m[0][5];
        mass_m[5][1] = mass_m[1][5];
        mass_m[5][2] = mass_m[2][5];
        mass_m[5][3] = mass_m[3][5];
        mass_m[5][4] = mass_m[4][5];
        mass_m[5][5] = int_y4(pcrds[0],pcrds[1],pcrds[2],det) +
                   (long double)(sqr(cent[1])*sqr(cent[1]))*mass_m[0][0] +
                   (long double)(6.0*sqr(cent[1]))*int_y2(pcrds[0],pcrds[1],pcrds[2],det) -
                   (long double)(4.0*cent[1])*int_y3(pcrds[0],pcrds[1],pcrds[2],det) -
                   (long double)(4.0*cub(cent[1]))*int_y(pcrds[0],pcrds[1],pcrds[2],det);

        /**
        if(debug_flag == YES)
        {
            printf("IN comp_mass_matrix(), Tri(%d) debug mass_m[5][4]\n", tri->id);
            printf("int_y4 = %22.20Lg\n", int_y4(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_y3 = %22.20Lg\n",  int_y3(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_y2 = %22.20Lg\n", int_y2(pcrds[0],pcrds[1],pcrds[2],det));
            printf("int_y = %22.20Lg\n", int_y(pcrds[0],pcrds[1],pcrds[2],det));
            // clean_up(0);
        }
        **/


        /***
        for(i = 0; i < MAX_N_COEF; i++)
        {
            for(j = 0; j < MAX_N_COEF; j++)
            {
                if(fabs(mass_m[i][j]) < dmach)
                    mass_m[i][j] = 0.0;
            }
        }
        **/
} 

/*
*       quad13_weighted_sum():
*
*       Folds 13 integrand samples f[0..12] into one value using the four
*       weight groups (1 + 3 + 3 + 6 samples) and scales the result by area.
*       The samples must be ordered the way tri_quadrature_13_pts() orders
*       its points.
*
*       NOTE(review): the weights look like those of the classical 13-point,
*       degree-7 symmetric rule on a triangle; confirm against
*       tri_quadrature_13_pts().
*/
static long double quad13_weighted_sum(
        const long double *f,
        long double       area)
{
        /* The literals are deliberately left unsuffixed (double precision) */
        /* so the weights keep exactly the values they had when this        */
        /* formula was written inline.                                      */
        const long double w1 = -0.149570044467670, w2 = 0.053347235608839,
                          w3 =  0.175615257433204, w4 = 0.077113760890257;

        return (w1*f[0] + w2*(f[1] + f[2] + f[3]) +
                w3*(f[4] + f[5] + f[6]) +
                w4*(f[7] + f[8] + f[9] +
                    f[10] + f[11] + f[12]))*area;
}

/*
*       quad13_basis_product():
*
*       Samples B_val(.,col)*B_val(.,row) at the 13 points stored in crds
*       and returns the weighted sum computed by quad13_weighted_sum().
*/
static long double quad13_basis_product(
        float       crds[13][2],
        double      *cent,
        int         col,
        int         row,
        long double area)
{
        long double f[13];
        int         q;

        for (q = 0; q < 13; q++)
            f[q] = B_val(crds, cent, q, col)*B_val(crds, cent, q, row);
        return quad13_weighted_sum(f, area);
}

/*
*       comp_mass_matrix_p3():
*
*       Fills the leading 10 x 10 part of mass_m for the triangle tri.
*
*       Entries whose two indices are both < 6 are assembled in closed form
*       from the int_*() routines, combined with powers of cent[0] and
*       cent[1] (presumably the expansion of the moments of (x - cent[0])
*       and (y - cent[1]) in terms of raw monomial moments; confirm against
*       the int_*() definitions).  Entries that involve an index in 6..9 are
*       obtained from B_val() sampled at the points returned by
*       tri_quadrature_13_pts(), scaled by fg_area(tri).  Entries below the
*       diagonal are copied from their mirror entry above it.
*
*       n_coeff         not used by this routine
*       tri             triangle whose three vertices define the element
*       cent            reference point; cent[0] and cent[1] are read here
*                       and cent is forwarded to B_val()
*       dim             not used by this routine
*       mass_m          output; rows 0..9, columns 0..9 are written
*
*       The statement order matters: mirror copies and the shortcut
*       mass_m[2][6] = mass_m[1][7] read entries written earlier.
*/
EXPORT void comp_mass_matrix_p3(
        int     n_coeff,
        TRI     *tri,
        double  *cent,
        int     dim,
        double   **mass_m)
{
        int           r, c, q;
        float         *pcrds[3];
        float         crds[13][2];
        long double   f[13];
        long double   det, area;
        long double   ix, iy;
        long double   ix2, ixy, iy2;
        long double   ix3, ix2y, ixy2, iy3;
        long double   ix4, ix3y, ix2y2, ixy3, iy4;
        const double  cx = cent[0], cy = cent[1];

        area = fg_area(tri);

        for (q = 0; q < 3; q++)
            pcrds[q] = Coords(Point_of_tri(tri)[q]);

        tri_quadrature_13_pts(pcrds[0], pcrds[1], pcrds[2], crds);

        /* Cross product of the edge vectors (p1 - p0) and (p2 - p0). */
        det = (long double)(pcrds[1][0]-pcrds[0][0])*(pcrds[2][1]-pcrds[0][1]) -
              (long double)(pcrds[2][0]-pcrds[0][0])*(pcrds[1][1]-pcrds[0][1]);

        /* Evaluate every int_*() value once; each one is reused in */
        /* several of the entries below.                            */
        ix    = int_x(pcrds[0],pcrds[1],pcrds[2],det);
        iy    = int_y(pcrds[0],pcrds[1],pcrds[2],det);
        ix2   = int_x2(pcrds[0],pcrds[1],pcrds[2],det);
        ixy   = int_xy(pcrds[0],pcrds[1],pcrds[2],det);
        iy2   = int_y2(pcrds[0],pcrds[1],pcrds[2],det);
        ix3   = int_x3(pcrds[0],pcrds[1],pcrds[2],det);
        ix2y  = int_x2y(pcrds[0],pcrds[1],pcrds[2],det);
        ixy2  = int_xy2(pcrds[0],pcrds[1],pcrds[2],det);
        iy3   = int_y3(pcrds[0],pcrds[1],pcrds[2],det);
        ix4   = int_x4(pcrds[0],pcrds[1],pcrds[2],det);
        ix3y  = int_x3y(pcrds[0],pcrds[1],pcrds[2],det);
        ix2y2 = int_x2y2(pcrds[0],pcrds[1],pcrds[2],det);
        ixy3  = int_xy3(pcrds[0],pcrds[1],pcrds[2],det);
        iy4   = int_y4(pcrds[0],pcrds[1],pcrds[2],det);

        /* ---- row 0 ---- */
        /* mass_m[0][0] is re-read from the (double) matrix below on */
        /* purpose, so the rounded value is the one that is reused.  */
        mass_m[0][0] = det*0.5;

        mass_m[0][1] = ix - (long double)cx*mass_m[0][0];

        mass_m[0][2] = iy - (long double)cy*mass_m[0][0];

        mass_m[0][3] = ix2 - (long double)(2.0*cx)*
                       ix + (long double)sqr(cx)*mass_m[0][0];

        mass_m[0][4] = ixy -
                       (long double)cy*ix-
                       (long double)cx*iy +
                       (long double)(cx*cy)*mass_m[0][0];

        mass_m[0][5] = iy2 - (long double)(2.0*cy)*
                       iy + (long double)sqr(cy)*mass_m[0][0];

        /* Row 0 integrates basis function c alone (no product). */
        for (c = 6; c < 10; c++)
        {
            for (q = 0; q < 13; q++)
                f[q] = B_val(crds, cent, q, c);
            mass_m[0][c] = quad13_weighted_sum(f, area);
        }

        /* ---- row 1 ---- */
        mass_m[1][0] = mass_m[0][1];
        mass_m[1][1] = mass_m[0][3];
        mass_m[1][2] = mass_m[0][4];
        mass_m[1][3] = ix3 - (long double)cub(cx)*mass_m[0][0] -
                        (long double)3.0*cx*ix2 +
                        (long double)3.0*sqr(cx)*ix;

        mass_m[1][4] = ix2y -
                       (long double)cy*ix2 -
                       (long double)(2.0*cx)*ixy +
                       (long double)(2.0*cx*cy)*ix +
                       (long double)sqr(cx)*iy -
                       (long double)(sqr(cx)*cy)*mass_m[0][0];

        mass_m[1][5] = ixy2 -
                       (long double)(2.0*cy)*ixy +
                       (long double)(sqr(cy))*ix -
                       (long double)cx*iy2 +
                       (long double)(2.0*cx*cy)*iy -
                       (long double)(sqr(cy)*cx)*mass_m[0][0];

        for (c = 6; c < 10; c++)
            mass_m[1][c] = quad13_basis_product(crds, cent, c, 1, area);

        /* ---- row 2 ---- */
        mass_m[2][0] = mass_m[0][2];
        mass_m[2][1] = mass_m[0][4];
        mass_m[2][2] = mass_m[0][5];
        mass_m[2][3] = mass_m[1][4];
        mass_m[2][4] = mass_m[1][5];
        mass_m[2][5] = iy3 -
                       (long double)cub(cy)*mass_m[0][0] -
                       (long double)(3.0*cy)*iy2 +
                       (long double)(3.0*sqr(cy))*iy;
        /* Reuses the row-1 result; presumably basis 2 * basis 6 equals */
        /* basis 1 * basis 7 pointwise -- confirm against B_val().       */
        mass_m[2][6] = mass_m[1][7];
        for (c = 7; c < 10; c++)
            mass_m[2][c] = quad13_basis_product(crds, cent, c, 2, area);

        /* ---- row 3 ---- */
        mass_m[3][0] = mass_m[0][3];
        mass_m[3][1] = mass_m[1][3];
        mass_m[3][2] = mass_m[2][3];
        mass_m[3][3] = ix4 +
                   (long double)(sqr(cx)*sqr(cx))*mass_m[0][0] +
                   (long double)(6.0*sqr(cx))*ix2 -
                   (long double)(4.0*cx)*ix3 -
                   (long double)(4.0*cub(cx))*ix;

        mass_m[3][4] = ix3y -
                       (long double)cy*ix3 -
                       (long double)(3.0*cx)*ix2y +
                       (long double)(3.0*cx*cy)*ix2 +
                       (long double)(3.0*sqr(cx))*ixy -
                       (long double)(3.0*sqr(cx)*cy)*ix -
                       (long double)cub(cx)*iy +
                       (long double)(cub(cx)*cy)*mass_m[0][0];

        mass_m[3][5] = ix2y2-
                       (long double)(2.0*cy)*ix2y+
                       (long double)(sqr(cy))*ix2-
                       (long double)(2.0*cx)*ixy2+
                       (long double)(4.0*cx*cy)*ixy-
                       (long double)(2.0*cx*sqr(cy))*ix+
                       (long double)sqr(cx)*iy2-
                       (long double)(2.0*cy*sqr(cx))*iy +
                       (long double)(sqr(cx)*sqr(cy))*mass_m[0][0];
        for (c = 6; c < 10; c++)
            mass_m[3][c] = quad13_basis_product(crds, cent, c, 3, area);

        /* ---- row 4 ---- */
        mass_m[4][0] = mass_m[0][4];
        mass_m[4][1] = mass_m[1][4];
        mass_m[4][2] = mass_m[2][4];
        mass_m[4][3] = mass_m[3][4];
        mass_m[4][4] = mass_m[3][5];
        mass_m[4][5] = ixy3 -
                       (long double)(3.0*cy)*ixy2+
                       (long double)(3.0*sqr(cy))*ixy-
                       (long double)(cub(cy))*ix-
                       (long double)(cx)*iy3 +
                       (long double)(3.0*cx*cy)*iy2-
                       (long double)(3.0*cx*sqr(cy))*iy+
                       (long double)(cx*cub(cy))*mass_m[0][0];
        for (c = 6; c < 10; c++)
            mass_m[4][c] = quad13_basis_product(crds, cent, c, 4, area);

        /* ---- row 5 ---- */
        mass_m[5][0] = mass_m[0][5];
        mass_m[5][1] = mass_m[1][5];
        mass_m[5][2] = mass_m[2][5];
        mass_m[5][3] = mass_m[3][5];
        mass_m[5][4] = mass_m[4][5];
        mass_m[5][5] = iy4 +
                   (long double)(sqr(cy)*sqr(cy))*mass_m[0][0] +
                   (long double)(6.0*sqr(cy))*iy2 -
                   (long double)(4.0*cy)*iy3 -
                   (long double)(4.0*cub(cy))*iy;
        for (c = 6; c < 10; c++)
            mass_m[5][c] = quad13_basis_product(crds, cent, c, 5, area);

        /* ---- rows 6..9 ---- */
        /* Columns left of the diagonal mirror entries already computed; */
        /* the diagonal and everything right of it is integrated.        */
        for (r = 6; r < 10; r++)
        {
            for (c = 0; c < r; c++)
                mass_m[r][c] = mass_m[c][r];
            for (c = r; c < 10; c++)
                mass_m[r][c] = quad13_basis_product(crds, cent, c, r, area);
        }
}

LOCAL void comp_mass_matrix_p4(
        int     n_coeff,
        TRI     *tri,
        int     dim,
        double   **mass_m)
{
        int     i, j;
        double   *cent = fg_centroid(tri);
        POINT   *p[3];
        float   *pcrds[3]; 
        long double   det;
        float   crds[13][2]; 
        long double   tmpans[13];
        long double w1 =-0.149570044467670, w2 = 0.053347235608839,
                    w3 = 0.175615257433204,  w4 = 0.077113760890257;
        long double   area;

        area = fg_area(tri);
        
        for(i = 0; i < 3; i++)
        {
            p[i] = Point_of_tri(tri)[i];
            pcrds[i] = Coords(p[i]);
        }

        tri_quadrature_13_pts(pcrds[0], pcrds[1], pcrds[2], crds);

        det = (long double)(pcrds[1][0]-pcrds[0][0])*(pcrds[2][1]-pcrds[0][1]) - 
              (long double)(pcrds[2][0]-pcrds[0][0])*(pcrds[1][1]-pcrds[0][1]);

        mass_m[0][0] = det*0.5;

        mass_m[0][1] = int_x(pcrds[0],pcrds[1],pcrds[2],det) - (long double)cent[0]*mass_m[0][0];

        mass_m[0][2] = int_y(pcrds[0],pcrds[1],pcrds[2],det) - (long double)cent[1]*mass_m[0][0]; 

        mass_m[0][3] = int_x2(pcrds[0],pcrds[1],pcrds[2],det) - (long double)(2.0*cent[0])*
                       int_x(pcrds[0],pcrds[1],pcrds[2],det) + (long double)sqr(cent[0])*mass_m[0][0];

        mass_m[0][4] = int_xy(pcrds[0],pcrds[1],pcrds[2],det) - 
                       (long double)cent[1]*int_x(pcrds[0],pcrds[1],pcrds[2],det)-
                       (long double)cent[0]*int_y(pcrds[0],pcrds[1],pcrds[2],det) +
                       (long double)(cent[0]*cent[1])*mass_m[0][0];

        mass_m[0][5] = int_y2(pcrds[0],pcrds[1],pcrds[2],det) - (long double)(2.0*cent[1])*
                       int_y(pcrds[0],pcrds[1],pcrds[2],det) + (long double)sqr(cent[1])*mass_m[0][0];

        for(i = 6; i< 15; i++)
        {
            for(j = 0; j < 13; j++)
                tmpans[j] = B_val(crds, cent, j, i);
            mass_m[0][i] =  (w1*tmpans[0] + w2*(tmpans[1] + tmpans[2] + tmpans[3]) +
                             w3*(tmpans[4] + tmpans[5] + tmpans[6]) +
                             w4*(tmpans[7] + tmpans[8] + tmpans[9] +
                                 tmpans[10] + tmpans[11] + tmpans[12]))*area;
        }

        // (x-x_i)*all bases functions
        mass_m[1][0] = mass_m[0][1];
        mass_m[1][1] = mass_m[0][3];
        mass_m[1][2] = mass_m[0][4];
        mass_m[1][3] = int_x3(pcrds[0],pcrds[1],pcrds[2],det) - (long double)cub(cent[0])*mass_m[0][0] -
                        (long double)3.0*cent[0]*int_x2(pcrds[0],pcrds[1],pcrds[2],det) + 
                        (long double)3.0*sqr(cent[0])*int_x(pcrds[0],pcrds[1],pcrds[2],det);

        mass_m[1][4] = int_x2y(pcrds[0],pcrds[1],pcrds[2],det) - 
                       (long double)cent[1]*int_x2(pcrds[0],pcrds[1],pcrds[2],det) - 
                       (long double)(2.0*cent[0])*int_xy(pcrds[0],pcrds[1],pcrds[2],det) + 
                       (long double)(2.0*cent[0]*cent[1])*int_x(pcrds[0],pcrds[1],pcrds[2],det) +
                       (long double)sqr(cent[0])*int_y(pcrds[0],pcrds[1],pcrds[2],det) -
                       (long double)(sqr(cent[0])*cent[1])*mass_m[0][0];

        mass_m[1][5] = int_xy2(pcrds[0],pcrds[1],pcrds[2],det) - 
                       (long double)(2.0*cent[1])*int_xy(pcrds[0],pcrds[1],pcrds[2],det) +
                       (long double)(sqr(cent[1]))*int_x(pcrds[0],pcrds[1],pcrds[2],det) -
                       (long double)cent[0]*int_y2(pcrds[0],pcrds[1],pcrds[2],det) + 
                       (long double)(2.0*cent[0]*cent[1])*int_y(pcrds[0],pcrds[1],pcrds[2],det) -
                       (long double)(sqr(cent[1])*cent[0])*mass_m[0][0];  
        
        for(i = 6; i< 15; i++)
        {
            for(j = 0; j < 13; j++)
                tmpans[j] = B_val(crds, cent, j, i)*B_val(crds, cent, j, 1);
            mass_m[1][i] =  (w1*tmpans[0] + w2*(tmpans[1] + tmpans[2] + tmpans[3]) +
                             w3*(tmpans[4] + tmpans[5] + tmpans[6]) +
                             w4*(tmpans[7] + tmpans[8] + tmpans[9] +
                                 tmpans[10] + tmpans[11] + tmpans[12]))*area;
        }

        mass_m[2][0] = mass_m[0][2];
        mass_m[2][1] = mass_m[0][4];
        mass_m[2][2] = mass_m[0][5];
        mass_m[2][3] = mass_m[1][4];
        mass_m[2][4] = mass_m[1][5];      
        mass_m[2][5] = int_y3(pcrds[0],pcrds[1],pcrds[2],det) - 
                       (long double)cub(cent[1])*mass_m[0][0] - 
                       (long double)(3.0*cent[1])*int_y2(pcrds[0],pcrds[1],pcrds[2],det) + 
                       (long double)(3.0*sqr(cent[1]))*int_y(pcrds[0],pcrds[1],pcrds[2],det);
        mass_m[2][6] = mass_m[1][7];
        mass_m[2][7] = mass_m[0][12];
        mass_m[2][8] = mass_m[0][13];
        mass_m[2][9] = mass_m[0][14];
        for(i = 10; i< 15; i++)
        {
            for(j = 0; j < 13; j++)
                tmpans[j] = B_val(crds, cent, j, i)*B_val(crds, cent, j, 2);
            mass_m[2][i] =  (w1*tmpans[0] + w2*(tmpans[1] + tmpans[2] + tmpans[3]) +
                             w3*(tmpans[4] + tmpans[5] + tmpans[6]) +
                             w4*(tmpans[7] + tmpans[8] + tmpans[9] +
                                 tmpans[10] + tmpans[11] + tmpans[12]))*area;
        }

        mass_m[3][0] = mass_m[0][3];
        mass_m[3][1] = mass_m[1][3];
        mass_m[3][2] = mass_m[2][3];
        mass_m[3][3] = int_x4(pcrds[0],pcrds[1],pcrds[2],det) + 
                   (long double)(sqr(cent[0])*sqr(cent[0]))*mass_m[0][0] +
                   (long double)(6.0*sqr(cent[0]))*int_x2(pcrds[0],pcrds[1],pcrds[2],det) -
                   (long double)(4.0*cent[0])*int_x3(pcrds[0],pcrds[1],pcrds[2],det) - 
                   (long double)(4.0*cub(cent[0]))*int_x(pcrds[0],pcrds[1],pcrds[2],det); 

        mass_m[3][4] = int_x3y(pcrds[0],pcrds[1],pcrds[2],det) -
                       (long double)cent[1]*int_x3(pcrds[0],pcrds[1],pcrds[2],det) -
                       (long double)(3.0*cent[0])*int_x2y(pcrds[0],pcrds[1],pcrds[2],det) +
                       (long double)(3.0*cent[0]*cent[1])*int_x2(pcrds[0],pcrds[1],pcrds[2],det) +
                       (long double)(3.0*sqr(cent[0]))*int_xy(pcrds[0],pcrds[1],pcrds[2],det) -
                       (long double)(3.0*sqr(cent[0])*cent[1])*int_x(pcrds[0],pcrds[1],pcrds[2],det) -
                       (long double)cub(cent[0])*int_y(pcrds[0],pcrds[1],pcrds[2],det) +
                       (long double)(cub(cent[0])*cent[1])*mass_m[0][0];

        mass_m[3][5] = int_x2y2(pcrds[0],pcrds[1],pcrds[2],det)-
                       (long double)(2.0*cent[1])*int_x2y(pcrds[0],pcrds[1],pcrds[2],det)+
                       (long double)(sqr(cent[1]))*int_x2(pcrds[0],pcrds[1],pcrds[2],det)-
                       (long double)(2.0*cent[0])*int_xy2(pcrds[0],pcrds[1],pcrds[2],det)+
                       (long double)(4.0*cent[0]*cent[1])*int_xy(pcrds[0],pcrds[1],pcrds[2],det)-
                       (long double)(2.0*cent[0]*sqr(cent[1]))*int_x(pcrds[0],pcrds[1],pcrds[2],det)+
                       (long double)sqr(cent[0])*int_y2(pcrds[0],pcrds[1],pcrds[2],det)-
                       (long double)(2.0*cent[1]*sqr(cent[0]))*int_y(pcrds[0],pcrds[1],pcrds[2],det) +
                       (long double)(sqr(cent[0])*sqr(cent[1]))*mass_m[0][0];
        mass_m[3][6] = mass_m[1][10];
        mass_m[3][7] = mass_m[2][10];
        mass_m[3][8] = mass_m[2][11];
        mass_m[3][9] = mass_m[1][13];
        for(i = 10; i< 15; i++)
        {
            for(j = 0; j < 13; j++)
                tmpans[j] = B_val(crds, cent, j, i)*B_val(crds, cent, j, 3);
            mass_m[3][i] =  (w1*tmpans[0] + w2*(tmpans[1] + tmpans[2] + tmpans[3]) +
                             w3*(tmpans[4] + tmpans[5] + tmpans[6]) +
                             w4*(tmpans[7] + tmpans[8] + tmpans[9] +
                                 tmpans[10] + tmpans[11] + tmpans[12]))*area;
        }

        mass_m[4][0] = mass_m[0][4];
        mass_m[4][1] = mass_m[1][4];
        mass_m[4][2] = mass_m[2][4];
        mass_m[4][3] = mass_m[3][4];
        mass_m[4][4] = mass_m[3][5];
        mass_m[4][5] = int_xy3(pcrds[0],pcrds[1],pcrds[2],det) -
                       (long double)(3.0*cent[1])*int_xy2(pcrds[0],pcrds[1],pcrds[2],det)+
                       (long double)(3.0*sqr(cent[1]))*int_xy(pcrds[0],pcrds[1],pcrds[2],det)-
                       (long double)(cub(cent[1]))*int_x(pcrds[0],pcrds[1],pcrds[2],det)-
                       (long double)(cent[0])*int_y3(pcrds[0],pcrds[1],pcrds[2],det) +
                       (long double)(3.0*cent[0]*cent[1])*int_y2(pcrds[0],pcrds[1],pcrds[2],det)-
                       (long double)(3.0*cent[0]*sqr(cent[1]))*int_y(pcrds[0],pcrds[1],pcrds[2],det)+
                       (long double)(cent[0]*cub(cent[1]))*mass_m[0][0];
        mass_m[4][6] = mass_m[2][10];
        mass_m[4][7] = mass_m[1][12];
        mass_m[4][8] = mass_m[1][13];
        mass_m[4][9] = mass_m[1][14];
        mass_m[4][10] = mass_m[3][11];
        mass_m[4][11] = mass_m[3][12];
        mass_m[4][12] = mass_m[3][13];
        for(i = 13; i< 15; i++)
        {
            for(j = 0; j < 13; j++)
                tmpans[j] = B_val(crds, cent, j, i)*B_val(crds, cent, j, 4);
            mass_m[4][i] =  (w1*tmpans[0] + w2*(tmpans[1] + tmpans[2] + tmpans[3]) +
                             w3*(tmpans[4] + tmpans[5] + tmpans[6]) +
                             w4*(tmpans[7] + tmpans[8] + tmpans[9] +
                                 tmpans[10] + tmpans[11] + tmpans[12]))*area;
        }

        mass_m[5][0] = mass_m[0][5];
        mass_m[5][1] = mass_m[1][5];
        mass_m[5][2] = mass_m[2][5];
        mass_m[5][3] = mass_m[3][5];
        mass_m[5][4] = mass_m[4][5];
        mass_m[5][5] = int_y4(pcrds[0],pcrds[1],pcrds[2],det) +
                   (long double)(sqr(cent[1])*sqr(cent[1]))*mass_m[0][0] +
                   (long double)(6.0*sqr(cent[1]))*int_y2(pcrds[0],pcrds[1],pcrds[2],det) -
                   (long double)(4.0*cent[1])*int_y3(pcrds[0],pcrds[1],pcrds[2],det) -
                   (long double)(4.0*cub(cent[1]))*int_y(pcrds[0],pcrds[1],pcrds[2],det);
        mass_m[5][6] = mass_m[1][13];
        mass_m[5][7] = mass_m[3][9];
        mass_m[5][8] = mass_m[1][14];
        mass_m[5][9] = mass_m[2][14];
        mass_m[5][10] = mass_m[4][11];
        mass_m[5][11] = mass_m[4][12];
        mass_m[5][12] = mass_m[4][13];
        mass_m[5][13] = mass_m[4][14];
        for(i = 14; i< 15; i++)
        {
            for(j = 0; j < 13; j++)
                tmpans[j] = B_val(crds, cent, j, i)*B_val(crds, cent, j, 5);
            mass_m[5][i] =  (w1*tmpans[0] + w2*(tmpans[1] + tmpans[2] + tmpans[3]) +
                             w3*(tmpans[4] + tmpans[5] + tmpans[6]) +
                             w4*(tmpans[7] + tmpans[8] + tmpans[9] +
                                 tmpans[10] + tmpans[11] + tmpans[12]))*area;
        }

        mass_m[6][0] = mass_m[0][6];
        mass_m[6][1] = mass_m[1][6];
        mass_m[6][2] = mass_m[2][6];
        mass_m[6][3] = mass_m[3][6];
        mass_m[6][4] = mass_m[4][6];
        mass_m[6][5] = mass_m[5][6];
        for(i = 6; i< 15; i++)
        {
            for(j = 0; j < 13; j++)
                tmpans[j] = B_val(crds, cent, j, i)*B_val(crds, cent, j, 6);
            mass_m[6][i] =  (w1*tmpans[0] + w2*(tmpans[1] + tmpans[2] + tmpans[3]) +
                             w3*(tmpans[4] + tmpans[5] + tmpans[6]) +
                             w4*(tmpans[7] + tmpans[8] + tmpans[9] +
                                 tmpans[10] + tmpans[11] + tmpans[12]))*area;
        }

        mass_m[7][0] = mass_m[0][7];
        mass_m[7][1] = mass_m[1][7];
        mass_m[7][2] = mass_m[2][7];
        mass_m[7][3] = mass_m[3][7];
        mass_m[7][4] = mass_m[4][7];
        mass_m[7][5] = mass_m[5][7];
        mass_m[7][6] = mass_m[6][7];
        for(i = 7; i< 15; i++)
        {
            for(j = 0; j < 13; j++)
                tmpans[j] = B_val(crds, cent, j, i)*B_val(crds, cent, j, 7);
            mass_m[7][i] =  (w1*tmpans[0] + w2*(tmpans[1] + tmpans[2] + tmpans[3]) +
                             w3*(tmpans[4] + tmpans[5] + tmpans[6]) +
                             w4*(tmpans[7] + tmpans[8] + tmpans[9] +
                                 tmpans[10] + tmpans[11] + tmpans[12]))*area;
        }
        mass_m[8][0] = mass_m[0][8];
        mass_m[8][1] = mass_m[1][8];
        mass_m[8][2] = mass_m[2][8];
        mass_m[8][3] = mass_m[3][8];
        mass_m[8][4] = mass_m[4][8];
        mass_m[8][5] = mass_m[5][8];
        mass_m[8][6] = mass_m[6][8];
        mass_m[8][7] = mass_m[7][8];
        for(i = 8; i< 15; i++)
        {
            for(j = 0; j < 13; j++)
                tmpans[j] = B_val(crds, cent, j, i)*B_val(crds, cent, j, 8);
            mass_m[8][i] =  (w1*tmpans[0] + w2*(tmpans[1] + tmpans[2] + tmpans[3]) +
                             w3*(tmpans[4] + tmpans[5] + tmpans[6]) +
                             w4*(tmpans[7] + tmpans[8] + tmpans[9] +
                                 tmpans[10] + tmpans[11] + tmpans[12]))*area;
        }
        mass_m[9][0] = mass_m[0][9];
        mass_m[9][1] = mass_m[1][9];
        mass_m[9][2] = mass_m[2][9];
        mass_m[9][3] = mass_m[3][9];
        mass_m[9][4] = mass_m[4][9];
        mass_m[9][5] = mass_m[5][9];
        mass_m[9][6] = mass_m[6][9];
        mass_m[9][7] = mass_m[7][9];
        mass_m[9][8] = mass_m[8][9];
        for(i = 9; i< 15; i++)
        {
            for(j = 0; j < 13; j++)
                tmpans[j] = B_val(crds, cent, j, i)*B_val(crds, cent, j, 9);
            mass_m[9][i] =  (w1*tmpans[0] + w2*(tmpans[1] + tmpans[2] + tmpans[3]) +
                             w3*(tmpans[4] + tmpans[5] + tmpans[6]) +
                             w4*(tmpans[7] + tmpans[8] + tmpans[9] +
                                 tmpans[10] + tmpans[11] + tmpans[12]))*area;
        }
        mass_m[10][0] = mass_m[0][10];
        mass_m[10][1] = mass_m[1][10];
        mass_m[10][2] = mass_m[2][10];
        mass_m[10][3] = mass_m[3][10];
        mass_m[10][4] = mass_m[4][10];
        mass_m[10][5] = mass_m[5][10];
        mass_m[10][6] = mass_m[6][10];
        mass_m[10][7] = mass_m[7][10];
        mass_m[10][8] = mass_m[8][10];
        mass_m[10][9] = mass_m[9][10];
        for(i = 10; i< 15; i++)
        {
            for(j = 0; j < 13; j++)
                tmpans[j] = B_val(crds, cent, j, i)*B_val(crds, cent, j, 9);
            mass_m[10][i] =  (w1*tmpans[0] + w2*(tmpans[1] + tmpans[2] + tmpans[3]) +
                             w3*(tmpans[4] + tmpans[5] + tmpans[6]) +
                             w4*(tmpans[7] + tmpans[8] + tmpans[9] +
                                 tmpans[10] + tmpans[11] + tmpans[12]))*area;
        }

        mass_m[11][0] = mass_m[0][11];
        mass_m[11][1] = mass_m[1][11];
        mass_m[11][2] = mass_m[2][11];
        mass_m[11][3] = mass_m[3][11];
        mass_m[11][4] = mass_m[4][11];
        mass_m[11][5] = mass_m[5][11];
        mass_m[11][6] = mass_m[6][11];
        mass_m[11][7] = mass_m[7][11];
        mass_m[11][8] = mass_m[8][11];
        mass_m[11][9] = mass_m[9][11];
        mass_m[11][10] = mass_m[10][11];
        for(i = 11; i< 15; i++)
        {
            for(j = 0; j < 13; j++)
                tmpans[j] = B_val(crds, cent, j, i)*B_val(crds, cent, j, 9);
            mass_m[11][i] =  (w1*tmpans[0] + w2*(tmpans[1] + tmpans[2] + tmpans[3]) +
                             w3*(tmpans[4] + tmpans[5] + tmpans[6]) +
                             w4*(tmpans[7] + tmpans[8] + tmpans[9] +
                                 tmpans[10] + tmpans[11] + tmpans[12]))*area;
        }

        mass_m[12][0] = mass_m[0][12];
        mass_m[12][1] = mass_m[1][12];
        mass_m[12][2] = mass_m[2][12];
        mass_m[12][3] = mass_m[3][12];
        mass_m[12][4] = mass_m[4][12];
        mass_m[12][5] = mass_m[5][12];
        mass_m[12][6] = mass_m[6][12];
        mass_m[12][7] = mass_m[7][12];
        mass_m[12][8] = mass_m[8][12];
        mass_m[12][9] = mass_m[9][12];
        mass_m[12][10] = mass_m[10][12];
        mass_m[12][11] = mass_m[11][12];
        for(i = 12; i< 15; i++)
        {
            for(j = 0; j < 13; j++)
                tmpans[j] = B_val(crds, cent, j, i)*B_val(crds, cent, j, 9);
            mass_m[12][i] =  (w1*tmpans[0] + w2*(tmpans[1] + tmpans[2] + tmpans[3]) +
                             w3*(tmpans[4] + tmpans[5] + tmpans[6]) +
                             w4*(tmpans[7] + tmpans[8] + tmpans[9] +
                                 tmpans[10] + tmpans[11] + tmpans[12]))*area;
        }

        mass_m[13][0] = mass_m[0][13];
        mass_m[13][1] = mass_m[1][13];
        mass_m[13][2] = mass_m[2][13];
        mass_m[13][3] = mass_m[3][13];
        mass_m[13][4] = mass_m[4][13];
        mass_m[13][5] = mass_m[5][13];
        mass_m[13][6] = mass_m[6][13];
        mass_m[13][7] = mass_m[7][13];
        mass_m[13][8] = mass_m[8][13];
        mass_m[13][9] = mass_m[9][13];
        mass_m[13][10] = mass_m[10][13];
        mass_m[13][11] = mass_m[11][13];
        mass_m[13][12] = mass_m[12][13];
        for(i = 13; i< 15; i++)
        {
            for(j = 0; j < 13; j++)
                tmpans[j] = B_val(crds, cent, j, i)*B_val(crds, cent, j, 9);
            mass_m[13][i] =  (w1*tmpans[0] + w2*(tmpans[1] + tmpans[2] + tmpans[3]) +
                             w3*(tmpans[4] + tmpans[5] + tmpans[6]) +
                             w4*(tmpans[7] + tmpans[8] + tmpans[9] +
                                 tmpans[10] + tmpans[11] + tmpans[12]))*area;
        }

        mass_m[14][0] = mass_m[0][14];
        mass_m[14][1] = mass_m[1][14];
        mass_m[14][2] = mass_m[2][14];
        mass_m[14][3] = mass_m[3][14];
        mass_m[14][4] = mass_m[4][14];
        mass_m[14][5] = mass_m[5][14];
        mass_m[14][6] = mass_m[6][14];
        mass_m[14][7] = mass_m[7][14];
        mass_m[14][8] = mass_m[8][14];
        mass_m[14][9] = mass_m[9][14];
        mass_m[14][10] = mass_m[10][14];
        mass_m[14][11] = mass_m[11][14];
        mass_m[14][12] = mass_m[12][14];
        mass_m[14][13] = mass_m[13][14];
        for(i = 14; i< 15; i++)
        {
            for(j = 0; j < 13; j++)
                tmpans[j] = B_val(crds, cent, j, i)*B_val(crds, cent, j, 9);
            mass_m[14][i] =  (w1*tmpans[0] + w2*(tmpans[1] + tmpans[2] + tmpans[3]) +
                             w3*(tmpans[4] + tmpans[5] + tmpans[6]) +
                             w4*(tmpans[7] + tmpans[8] + tmpans[9] +
                                 tmpans[10] + tmpans[11] + tmpans[12]))*area;
        }
} 


EXPORT void comp_CV_Mag_mass_matrix_1st_row(
        int      n_coeff,
        TRI      *tri,
        int      dim,
        int      cv_indx,
        double   *cent,
        double   sqrt_area,
        double   **mass_m)
{
        POINT   *p[3];
        int     i, j;
        float   *pcrds[3], midpt[3][2];
        double   crds[20][2], tmp_area;
        long  double tmpans[20];
        static double w[16] ={0.144315607677787,0.095091634267285,0.095091634267285,0.095091634267285,
                             0.103217370534718, 0.103217370534718,0.103217370534718,
                             0.032458497623198,0.032458497623198,0.032458497623198,
                             0.027230314174435,0.027230314174435,0.027230314174435,
                             0.027230314174435,0.027230314174435,0.027230314174435};

        for(i = 0; i < 3; i++)
        {
            p[i] = Point_of_tri(tri)[i];
            pcrds[i] = Coords(p[i]);
        }
        for(i = 0; i < 3; i++)
        {
            for(j = 0; j < dim; j++)
                midpt[i][j] = 0.5*(pcrds[i][j] + pcrds[(i+1)%3][j]);
        }

        switch(cv_indx)
        {
        case 0:
            tri_quadrature_16_pts(pcrds[0], midpt[0],  midpt[2], crds);
            tmp_area = (midpt[0][0]-pcrds[0][0])*(midpt[2][1]-pcrds[0][1]) -
                       (midpt[2][0]-pcrds[0][0])*(midpt[0][1]-pcrds[0][1]);
            tmp_area *= 0.5;
        break;
        case 1:
            tri_quadrature_16_pts(midpt[0],  pcrds[1], midpt[1], crds);
            tmp_area = (pcrds[1][0]-midpt[0][0])*(midpt[1][1]-midpt[0][1]) -
                       (midpt[1][0]-midpt[0][0])*(pcrds[1][1]-midpt[0][1]);
            tmp_area *= 0.5;
        break;
        case 2:
            tri_quadrature_16_pts(midpt[1],  pcrds[2], midpt[2], crds);
            tmp_area = (pcrds[2][0]-midpt[1][0])*(midpt[2][1]-midpt[1][1]) -
                       (midpt[2][0]-midpt[1][0])*(pcrds[2][1]-midpt[1][1]);
            tmp_area *= 0.5;
        break;
        case 3:
            tri_quadrature_16_pts(midpt[0],  midpt[1], midpt[2], crds);
            tmp_area = (midpt[1][0]-midpt[0][0])*(midpt[2][1]-midpt[0][1]) -
                       (midpt[2][0]-midpt[0][0])*(midpt[1][1]-midpt[0][1]);
            tmp_area *= 0.5;
        break;
        default:
        printf("ERROR: comp_CV_Mag_mass_matrix_1st_row(), cv_indx = %d\n", cv_indx);
        clean_up(ERROR);
        }
        for(i = 0; i < MAX_N_COEF; i++)
        {
            for(j = 0; j < 16; j++)
                tmpans[j] = normalized_B_val(crds, cent, sqrt_area, j, i)*
                            normalized_B_val(crds, cent, sqrt_area, j, 0);
            mass_m[0][i] = 0.0;
            for(j = 0; j < 16; j++)
                mass_m[0][i] += tmpans[j]*w[j];
            mass_m[0][i] *= tmp_area;
        }
}

/*
*       comp_CV_mass_matrix_1st_row():
*
*       Dispatches on MAX_N_COEF to the routine that fills the first row of
*       the control-volume mass matrix mass_m for sub-cell cv_indx of tri:
*           MAX_N_COEF == 6     comp_CV_mass_matrix_1st_row_p2()
*           MAX_N_COEF == 10    comp_CV_mass_matrix_1st_row_p3()
*       All arguments are forwarded unchanged.
*
*       Any other MAX_N_COEF is reported and clean_up(ERROR) is called, in
*       the same way as comp_mass_matrix_1st_row(); previously such a call
*       returned silently with mass_m left unwritten.
*/
EXPORT void comp_CV_mass_matrix_1st_row(
        int     n_coeff,
        TRI     *tri,
        int     dim,
        int     cv_indx,
        float   *cent,
        float   **mass_m)
{
        if(MAX_N_COEF == 6)
        {
            comp_CV_mass_matrix_1st_row_p2(n_coeff,tri,dim,cv_indx,cent,mass_m);
        }
        else if(MAX_N_COEF == 10)
        {
            comp_CV_mass_matrix_1st_row_p3(n_coeff,tri,dim,cv_indx,cent,mass_m);
        }
        else
        {
            printf("ERROR: implement comp_CV_mass_matrix_1st_row for MAX_N_COEF = %d\n",
                MAX_N_COEF);
            clean_up(ERROR);
        }
}

/*
*       comp_CV_mass_matrix_1st_row_p2():
*
*       Selects one sub-triangle of tri and hands its three corner points,
*       together with n_coeff, cent and the row mass_m[0], to
*       comp_CV_mass_matrix_p2().
*
*       cv_indx selects the sub-triangle (midpt[k] is the midpoint of
*       vertex k and vertex k+1):
*           0, 1, 2   corner piece at vertex 0, 1, 2
*           3         triangle of the three edge midpoints
*       Any other value prints an error and calls clean_up(ERROR); nothing
*       is written in that case.
*
*       midpt holds two coordinates per point, so dim must not exceed 2.
*/
LOCAL void comp_CV_mass_matrix_1st_row_p2(
        int     n_coeff,
        TRI     *tri,
        int     dim,
        int     cv_indx,
        float   *cent,
        float   **mass_m)
{
        POINT   *vtx[3];
        float   *pcrds[3], midpt[3][2];
        float   *a, *b, *c;        /* corners of the chosen sub-triangle */
        int     k, d;

        for(k = 0; k < 3; k++)
        {
            vtx[k] = Point_of_tri(tri)[k];
            pcrds[k] = Coords(vtx[k]);
        }
        for(k = 0; k < 3; k++)
        {
            for(d = 0; d < dim; d++)
                midpt[k][d] = 0.5*(pcrds[k][d] + pcrds[(k+1)%3][d]);
        }

        switch(cv_indx)
        {
        case 0:
            a = pcrds[0];   b = midpt[0];   c = midpt[2];
            break;
        case 1:
            a = midpt[0];   b = pcrds[1];   c = midpt[1];
            break;
        case 2:
            a = midpt[1];   b = pcrds[2];   c = midpt[2];
            break;
        case 3:
            a = midpt[0];   b = midpt[1];   c = midpt[2];
            break;
        default:
            printf("ERROR: comp_CV_mass_matrix_1st_row_p2(), cv_indx = %d\n", cv_indx);
            clean_up(ERROR);
            return;
        }

        comp_CV_mass_matrix_p2(n_coeff, cent, a, b, c, mass_m[0]);
}


/*
*       comp_CV_mass_matrix_1st_row_p3():
*
*       Selects one sub-triangle of tri, lets comp_CV_mass_matrix_p3() fill
*       the scratch row tmpmass for it, and copies the first MAX_N_COEF
*       entries of tmpmass into mass_m[0].
*
*       N_PART == 6: the six sub-triangles all share the point returned by
*       fg_centroid(tri); cv_indx 0..5 walks around the boundary
*       (vertex 0, midpoint 0), (midpoint 0, vertex 1), ... ,
*       (midpoint 2, vertex 0), where midpoint k lies between vertex k and
*       vertex k+1.
*
*       Otherwise (self-similar partition):
*           0, 1, 2   corner piece at vertex 0, 1, 2
*           3         triangle of the three edge midpoints
*           4, 5, 6   for vertex i = cv_indx - 4: the triangle formed by the
*                     midpoint between vertex i and the middle of the
*                     opposite edge, and the two quarter points of that
*                     opposite edge
*
*       An out-of-range cv_indx prints an error and calls clean_up(ERROR) in
*       both partitions.  The N_PART == 6 switch previously had no default
*       branch, so a bad index copied the uninitialised tmpmass into mass_m.
*
*       midpt and tmp hold two coordinates per point, so dim must not
*       exceed 2.
*/
LOCAL void comp_CV_mass_matrix_1st_row_p3(
        int     n_coeff,
        TRI     *tri,
        int     dim,
        int     cv_indx,
        float   *cent,
        float   **mass_m)
{
        POINT   *p[3];
        int     i, j;
        float   *pcrds[3], midpt[3][2], tmp[2], *tri_cent;
        double  tmpmass[50];

        tri_cent = fg_centroid(tri);
        for(i = 0; i < 3; i++)
        {
            p[i] = Point_of_tri(tri)[i];
            pcrds[i] = Coords(p[i]);
        }
        for(i = 0; i < 3; i++)
        {
            for(j = 0; j < dim; j++)
                midpt[i][j] = 0.5*(pcrds[i][j] + pcrds[(i+1)%3][j]);
        }

        if(N_PART == 6)
        {
            /* six pieces fanned around the centroid */
            switch(cv_indx)
            {
            case 0:
                comp_CV_mass_matrix_p3(n_coeff, cent, pcrds[0], midpt[0], tri_cent, tmpmass);
            break;
            case 1:
                comp_CV_mass_matrix_p3(n_coeff, cent, midpt[0], pcrds[1], tri_cent, tmpmass);
            break;
            case 2:
                comp_CV_mass_matrix_p3(n_coeff, cent, pcrds[1], midpt[1], tri_cent, tmpmass);
            break;
            case 3:
                comp_CV_mass_matrix_p3(n_coeff, cent, midpt[1], pcrds[2], tri_cent, tmpmass);
            break;
            case 4:
                comp_CV_mass_matrix_p3(n_coeff, cent, pcrds[2], midpt[2], tri_cent, tmpmass);
            break;
            case 5:
                comp_CV_mass_matrix_p3(n_coeff, cent, midpt[2], pcrds[0], tri_cent, tmpmass);
            break;
            default:
                printf("ERROR: comp_CV_mass_matrix_1st_row_p3(), cv_indx = %d\n", cv_indx);
                clean_up(ERROR);
                return;     /* never copy an unfilled tmpmass */
            }
        }
        else
        {
            ///// for self-similar partition case
            switch(cv_indx)
            {
            case 0:
                comp_CV_mass_matrix_p3(n_coeff, cent, pcrds[0], midpt[0],  midpt[2], tmpmass);
            break;
            case 1:
                comp_CV_mass_matrix_p3(n_coeff, cent, midpt[0],  pcrds[1], midpt[1], tmpmass);
            break;
            case 2:
                comp_CV_mass_matrix_p3(n_coeff, cent, midpt[1],  pcrds[2], midpt[2], tmpmass);
            break;
            case 3:
                comp_CV_mass_matrix_p3(n_coeff, cent, midpt[0],  midpt[1], midpt[2], tmpmass);
            break;
            case 4:
            case 5:
            case 6:
                /* the edge midpoints are no longer needed; reuse midpt
                   for the corners of the piece tied to vertex i */
                i = cv_indx - 4;
                for(j = 0; j < dim; j++)
                {
                    tmp[j] = 0.5*(pcrds[(i+1)%3][j] + pcrds[(i+2)%3][j]);
                    midpt[0][j] = 0.5*(pcrds[i][j] + tmp[j]);
                    midpt[1][j] = 0.75*pcrds[(i+1)%3][j] + 0.25*pcrds[(i+2)%3][j];
                    midpt[2][j] = 0.25*pcrds[(i+1)%3][j] + 0.75*pcrds[(i+2)%3][j];
                }
                comp_CV_mass_matrix_p3(n_coeff, cent, midpt[0],  midpt[1], midpt[2], tmpmass);
            break;
            default:
                printf("ERROR: comp_CV_mass_matrix_1st_row_p3(), cv_indx = %d\n", cv_indx);
                clean_up(ERROR);
                return;     /* never copy an unfilled tmpmass */
            }
        }

        for(j = 0; j < MAX_N_COEF; j++)
            mass_m[0][j] = tmpmass[j];
}

/*
*       comp_Mag_mass_matrix_1st_row():
*
*       Fills mass_m[0][i], 0 <= i < MAX_N_COEF, with a 16 point quadrature
*       of normalized_B_val(...,i)*normalized_B_val(...,0) over the whole
*       triangle tri, scaled by fg_area(tri).
*
*       n_coeff and dim are not used.
*/
EXPORT void comp_Mag_mass_matrix_1st_row(
        int      n_coeff,
        TRI      *tri,
        int      dim,
        double   *cent,
        double   sqrt_area,
        double   **mass_m)
{
        /* quadrature weights, one per point produced by tri_quadrature_16_pts() */
        static double w[16] ={0.144315607677787,0.095091634267285,0.095091634267285,0.095091634267285,
                             0.103217370534718, 0.103217370534718,0.103217370534718,
                             0.032458497623198,0.032458497623198,0.032458497623198,
                             0.027230314174435,0.027230314174435,0.027230314174435,
                             0.027230314174435,0.027230314174435,0.027230314174435};
        POINT    *vtx[3];
        double   *pcrds[3];
        double   crds[20][2];
        long double term;
        int      v, q, col;

        for(v = 0; v < 3; v++)
        {
            vtx[v] = Point_of_tri(tri)[v];
            pcrds[v] = Coords(vtx[v]);
        }

        tri_quadrature_16_pts(pcrds[0], pcrds[1], pcrds[2], crds);

        for(col = 0; col < MAX_N_COEF; col++)
        {
             mass_m[0][col] = 0.0;
             for(q = 0; q < 16; q++)
             {
                 term = normalized_B_val(crds, cent, sqrt_area, q, col)*
                        normalized_B_val(crds, cent, sqrt_area, q, 0);
                 mass_m[0][col] += term*w[q];
             }
             mass_m[0][col] *= fg_area(tri);
        }
}


/*
*       comp_mass_matrix_1st_row():
*
*       Dispatches on MAX_N_COEF to the routine that fills the first row of
*       the mass matrix mass_m for triangle tri:
*           3   comp_mass_matrix_1st_row_p1()
*           6   comp_mass_matrix_1st_row_p2()
*           10  comp_mass_matrix_1st_row_p3()
*           15  comp_mass_matrix_1st_row_p4()
*       Any other value prints an error and calls clean_up(ERROR).
*       All arguments are forwarded unchanged.
*/
EXPORT void comp_mass_matrix_1st_row(
        int     n_coeff,
        TRI     *tri,
        int     dim,
        double   *cent,
        double   **mass_m)
{
        switch(MAX_N_COEF)
        {
        case 3:
            comp_mass_matrix_1st_row_p1(n_coeff, tri, dim, cent, mass_m);
            break;
        case 6:
            comp_mass_matrix_1st_row_p2(n_coeff, tri, dim, cent, mass_m);
            break;
        case 10:
            comp_mass_matrix_1st_row_p3(n_coeff, tri, dim, cent, mass_m);
            break;
        case 15:
            comp_mass_matrix_1st_row_p4(n_coeff, tri, dim, cent, mass_m);
            break;
        default:
            printf("ERROR: implement comp_mass_matrix_1st_row for MAX_N_COEF = %d\n",
                MAX_N_COEF);
            clean_up(ERROR);
            break;
        }
}

/*
*       comp_mass_matrix_1st_row_p1():
*
*       Writes the three entries of the first mass matrix row for tri:
*           mass_m[0][0] = det/2
*           mass_m[0][1] = int_x(...) - cent[0]*mass_m[0][0]
*           mass_m[0][2] = int_y(...) - cent[1]*mass_m[0][0]
*       where det is the cross product of the edge vectors
*       (vertex 1 - vertex 0) and (vertex 2 - vertex 0), so det/2 is the
*       signed area of tri.
*
*       n_coeff and dim are not used.
*/
LOCAL void comp_mass_matrix_1st_row_p1(
        int     n_coeff,
        TRI     *tri,
        int     dim,
        double   *cent,
        double   **mass_m)
{
        POINT   *vtx[3];
        float   *pcrds[3];
        float   *a, *b, *c;             /* vertex coordinates */
        double  *row0 = mass_m[0];
        long double   det;
        int     k;

        for(k = 0; k < 3; k++)
        {
            vtx[k] = Point_of_tri(tri)[k];
            pcrds[k] = Coords(vtx[k]);
        }
        a = pcrds[0];
        b = pcrds[1];
        c = pcrds[2];

        det = (long double)(b[0]-a[0])*(c[1]-a[1]) -
              (long double)(c[0]-a[0])*(b[1]-a[1]);

        row0[0] = det*0.5;
        row0[1] = int_x(a,b,c,det) - (long double)cent[0]*row0[0];
        row0[2] = int_y(a,b,c,det) - (long double)cent[1]*row0[0];
}

/*
*       comp_mass_matrix_1st_row_p2():
*
*       Writes the six entries of the first mass matrix row for tri.  With
*       det the cross product of the edge vectors (vertex 1 - vertex 0) and
*       (vertex 2 - vertex 0), and A = mass_m[0][0] = det/2:
*           [1] = int_x  - cent[0]*A
*           [2] = int_y  - cent[1]*A
*           [3] = int_x2 - 2*cent[0]*int_x + cent[0]^2*A
*           [4] = int_xy - cent[1]*int_x - cent[0]*int_y + cent[0]*cent[1]*A
*           [5] = int_y2 - 2*cent[1]*int_y + cent[1]^2*A
*       These are the expansions of the moments of (x - cent[0]) and
*       (y - cent[1]) in terms of the int_*() values.
*
*       n_coeff and dim are not used.
*/
LOCAL void comp_mass_matrix_1st_row_p2(
        int     n_coeff,
        TRI     *tri,
        int     dim,
        double   *cent,
        double   **mass_m)
{
        POINT   *vtx[3];
        float   *pcrds[3];
        float   *a, *b, *c;             /* vertex coordinates */
        double  *row0 = mass_m[0];
        long double   det, ix, iy;
        int     k;

        for(k = 0; k < 3; k++)
        {
            vtx[k] = Point_of_tri(tri)[k];
            pcrds[k] = Coords(vtx[k]);
        }
        a = pcrds[0];
        b = pcrds[1];
        c = pcrds[2];

        det = (long double)(b[0]-a[0])*(c[1]-a[1]) -
              (long double)(c[0]-a[0])*(b[1]-a[1]);

        /* first moments are reused by several entries below */
        ix = int_x(a,b,c,det);
        iy = int_y(a,b,c,det);

        row0[0] = det*0.5;

        row0[1] = ix - (long double)cent[0]*row0[0];

        row0[2] = iy - (long double)cent[1]*row0[0];

        row0[3] = int_x2(a,b,c,det) - (long double)2.0*cent[0]*
                  ix + (long double)sqr(cent[0])*row0[0];

        row0[4] = int_xy(a,b,c,det) -
                  (long double)cent[1]*ix-
                  (long double)cent[0]*iy +
                  (long double)cent[0]*cent[1]*row0[0];

        row0[5] = int_y2(a,b,c,det) - (long double)2.0*cent[1]*
                  iy + (long double)sqr(cent[1])*row0[0];
}

/*
*       comp_mass_matrix_1st_row_p3():
*
*       Writes the ten entries of the first mass matrix row for tri.
*
*       Entries 0..5 use the closed-form int_*() values: with det the cross
*       product of the edge vectors (vertex 1 - vertex 0) and
*       (vertex 2 - vertex 0), and A = mass_m[0][0] = det/2:
*           [1] = int_x  - cent[0]*A
*           [2] = int_y  - cent[1]*A
*           [3] = int_x2 - 2*cent[0]*int_x + cent[0]^2*A
*           [4] = int_xy - cent[1]*int_x - cent[0]*int_y + cent[0]*cent[1]*A
*           [5] = int_y2 - 2*cent[1]*int_y + cent[1]^2*A
*
*       Entries 6..9 are a 13 point quadrature of B_val(crds, cent, q, i)
*       with weight w1 on point 0, w2 on points 1-3, w3 on points 4-6 and
*       w4 on points 7-12, scaled by fg_area(tri).
*
*       n_coeff and dim are not used.
*/
LOCAL void comp_mass_matrix_1st_row_p3(
        int     n_coeff,
        TRI     *tri,
        int     dim,
        double   *cent,
        double   **mass_m)
{
        static long double w1 =-0.149570044467670, w2 = 0.053347235608839,
                           w3 = 0.175615257433204,  w4 = 0.077113760890257;
        POINT   *vtx[3];
        float   *pcrds[3];
        float   *a, *b, *c;             /* vertex coordinates */
        float   crds[13][2];
        double  *row0 = mass_m[0];
        long double   det, area, ix, iy;
        long double   tmpans[13];
        long double   grp2, grp3, grp4; /* point sums sharing one weight */
        int     k, q, col;

        area = fg_area(tri);

        for(k = 0; k < 3; k++)
        {
            vtx[k] = Point_of_tri(tri)[k];
            pcrds[k] = Coords(vtx[k]);
        }
        a = pcrds[0];
        b = pcrds[1];
        c = pcrds[2];

        tri_quadrature_13_pts(a, b, c, crds);

        det = (long double)(b[0]-a[0])*(c[1]-a[1]) -
              (long double)(c[0]-a[0])*(b[1]-a[1]);

        /* first moments are reused by several entries below */
        ix = int_x(a,b,c,det);
        iy = int_y(a,b,c,det);

        row0[0] = det*0.5;

        row0[1] = ix - (long double)cent[0]*row0[0];

        row0[2] = iy - (long double)cent[1]*row0[0];

        row0[3] = int_x2(a,b,c,det) - (long double)2.0*cent[0]*
                  ix + (long double)sqr(cent[0])*row0[0];

        row0[4] = int_xy(a,b,c,det) -
                  (long double)cent[1]*ix-
                  (long double)cent[0]*iy +
                  (long double)cent[0]*cent[1]*row0[0];

        row0[5] = int_y2(a,b,c,det) - (long double)2.0*cent[1]*
                  iy + (long double)sqr(cent[1])*row0[0];

        for(col = 6; col < 10; col++)
        {
            for(q = 0; q < 13; q++)
                tmpans[q] = B_val(crds, cent, q, col);

            grp2 = tmpans[1] + tmpans[2] + tmpans[3];
            grp3 = tmpans[4] + tmpans[5] + tmpans[6];
            grp4 = tmpans[7] + tmpans[8] + tmpans[9] +
                   tmpans[10] + tmpans[11] + tmpans[12];

            row0[col] = (w1*tmpans[0] + w2*grp2 + w3*grp3 + w4*grp4)*area;
        }
}

/*
*       comp_mass_matrix_1st_row_p4():
*
*       Writes the fifteen entries of the first mass matrix row for tri.
*
*       Entries 0..5 use the closed-form int_*() values: with det the cross
*       product of the edge vectors (vertex 1 - vertex 0) and
*       (vertex 2 - vertex 0), and A = mass_m[0][0] = det/2:
*           [1] = int_x  - cent[0]*A
*           [2] = int_y  - cent[1]*A
*           [3] = int_x2 - 2*cent[0]*int_x + cent[0]^2*A
*           [4] = int_xy - cent[1]*int_x - cent[0]*int_y + cent[0]*cent[1]*A
*           [5] = int_y2 - 2*cent[1]*int_y + cent[1]^2*A
*
*       Entries 6..14 are a 13 point quadrature of B_val(crds, cent, q, i)
*       with weight w1 on point 0, w2 on points 1-3, w3 on points 4-6 and
*       w4 on points 7-12, scaled by fg_area(tri).
*
*       n_coeff and dim are not used.
*/
LOCAL void comp_mass_matrix_1st_row_p4(
        int     n_coeff,
        TRI     *tri,
        int     dim,
        double   *cent,
        double   **mass_m)
{
        long double w1 =-0.149570044467670, w2 = 0.053347235608839,
                    w3 = 0.175615257433204,  w4 = 0.077113760890257;
        POINT   *vtx[3];
        float   *pcrds[3];
        float   *a, *b, *c;             /* vertex coordinates */
        float   crds[13][2];
        double  *row0 = mass_m[0];
        long double   det, area, ix, iy;
        long double   tmpans[13];
        long double   grp2, grp3, grp4; /* point sums sharing one weight */
        int     k, q, col;

        area = fg_area(tri);

        for(k = 0; k < 3; k++)
        {
            vtx[k] = Point_of_tri(tri)[k];
            pcrds[k] = Coords(vtx[k]);
        }
        a = pcrds[0];
        b = pcrds[1];
        c = pcrds[2];

        tri_quadrature_13_pts(a, b, c, crds);

        det = (long double)(b[0]-a[0])*(c[1]-a[1]) -
              (long double)(c[0]-a[0])*(b[1]-a[1]);

        /* first moments are reused by several entries below */
        ix = int_x(a,b,c,det);
        iy = int_y(a,b,c,det);

        row0[0] = det*0.5;

        row0[1] = ix - (long double)cent[0]*row0[0];

        row0[2] = iy - (long double)cent[1]*row0[0];

        row0[3] = int_x2(a,b,c,det) - (long double)2.0*cent[0]*
                  ix + (long double)sqr(cent[0])*row0[0];

        row0[4] = int_xy(a,b,c,det) -
                  (long double)cent[1]*ix-
                  (long double)cent[0]*iy +
                  (long double)cent[0]*cent[1]*row0[0];

        row0[5] = int_y2(a,b,c,det) - (long double)2.0*cent[1]*
                  iy + (long double)sqr(cent[1])*row0[0];

        for(col = 6; col < 15; col++)
        {
            for(q = 0; q < 13; q++)
                tmpans[q] = B_val(crds, cent, q, col);

            grp2 = tmpans[1] + tmpans[2] + tmpans[3];
            grp3 = tmpans[4] + tmpans[5] + tmpans[6];
            grp4 = tmpans[7] + tmpans[8] + tmpans[9] +
                   tmpans[10] + tmpans[11] + tmpans[12];

            row0[col] = (w1*tmpans[0] + w2*grp2 + w3*grp3 + w4*grp4)*area;
        }
}


/*
*       int_x():
*
*       Returns det*( (x1-x0)*I(1,0) + (x2-x0)*I(0,1) + x0/2 ) where
*       xk = crdsk[0] and I(p,q) = integral_eta_xi(p,q).
*
*       NOTE(review): presumably I(p,q) is the moment of eta^p xi^q over the
*       reference triangle and det the Jacobian of the map
*       x = x0 + (x1-x0)*eta + (x2-x0)*xi, which makes this the integral of
*       x over the triangle crds0, crds1, crds2 -- confirm against
*       integral_eta_xi().
*/
EXPORT long double int_x(
      float *crds0,
      float *crds1,
      float *crds2,
      long double det)
{
      const long double m_eta = integral_eta_xi(1,0);
      const long double m_xi  = integral_eta_xi(0,1);
      long double sum;

      sum  = (long double)(crds1[0]-crds0[0])*m_eta;
      sum += (long double)(crds2[0]-crds0[0])*m_xi;
      sum += (long double)crds0[0]*0.5;
      return det*sum;
}

/*
*       int_y():
*
*       Returns det*( (y1-y0)*I(1,0) + (y2-y0)*I(0,1) + y0/2 ) where
*       yk = crdsk[1] and I(p,q) = integral_eta_xi(p,q).
*
*       NOTE(review): presumably I(p,q) is the moment of eta^p xi^q over the
*       reference triangle and det the Jacobian of the map
*       y = y0 + (y1-y0)*eta + (y2-y0)*xi, which makes this the integral of
*       y over the triangle crds0, crds1, crds2 -- confirm against
*       integral_eta_xi().
*/
EXPORT long double int_y(
      float *crds0,
      float *crds1,
      float *crds2,
      long double det)
{
      const long double m_eta = integral_eta_xi(1,0);
      const long double m_xi  = integral_eta_xi(0,1);
      long double sum;

      sum  = (long double)(crds1[1]-crds0[1])*m_eta;
      sum += (long double)(crds2[1]-crds0[1])*m_xi;
      sum += (long double)crds0[1]*0.5;
      return det*sum;
}

/*
*       int_x2():
*
*       Returns det times the term-by-term expansion of (a*eta + b*xi + c)^2
*       with a = x1-x0, b = x2-x0, c = x0 (xk = crdsk[0]), each monomial
*       eta^p xi^q replaced by integral_eta_xi(p,q) and the constant by 0.5.
*
*       NOTE(review): presumably the integral of x^2 over the triangle
*       crds0, crds1, crds2 -- confirm against integral_eta_xi().
*/
EXPORT long double int_x2(
      float *crds0,
      float *crds1,
      float *crds2,
      long double det)
{
      long double sum;

      /* squares */
      sum  = (long double)sqr(crds1[0]-crds0[0])*integral_eta_xi(2,0);
      sum += (long double)sqr(crds2[0]-crds0[0])*integral_eta_xi(0,2);
      sum += (long double)sqr(crds0[0])*0.5;
      /* cross terms */
      sum += (long double)2.0*(crds1[0]-crds0[0])*(crds2[0]-crds0[0])*integral_eta_xi(1,1);
      sum += (long double)2.0*(crds1[0]-crds0[0])*crds0[0]*integral_eta_xi(1,0);
      sum += (long double)2.0*(crds2[0]-crds0[0])*crds0[0]*integral_eta_xi(0,1);
      return det*sum;
}

/*
*       int_y2():
*
*       Returns det times the term-by-term expansion of (a*eta + b*xi + c)^2
*       with a = y1-y0, b = y2-y0, c = y0 (yk = crdsk[1]), each monomial
*       eta^p xi^q replaced by integral_eta_xi(p,q) and the constant by 0.5.
*
*       NOTE(review): presumably the integral of y^2 over the triangle
*       crds0, crds1, crds2 -- confirm against integral_eta_xi().
*/
EXPORT long double int_y2(
      float *crds0,
      float *crds1,
      float *crds2,
      long double det)
{
      long double sum;

      /* squares */
      sum  = (long double)sqr(crds1[1]-crds0[1])*integral_eta_xi(2,0);
      sum += (long double)sqr(crds2[1]-crds0[1])*integral_eta_xi(0,2);
      sum += (long double)sqr(crds0[1])*0.5;
      /* cross terms */
      sum += (long double)2.0*(crds1[1]-crds0[1])*(crds2[1]-crds0[1])*integral_eta_xi(1,1);
      sum += (long double)2.0*(crds1[1]-crds0[1])*crds0[1]*integral_eta_xi(1,0);
      sum += (long double)2.0*(crds2[1]-crds0[1])*crds0[1]*integral_eta_xi(0,1);
      return det*sum;
}

/*
*       int_xy():
*
*       Returns det times the term-by-term expansion of
*       (a*eta + b*xi + c)*(d*eta + e*xi + f) with a = x1-x0, b = x2-x0,
*       c = x0, d = y1-y0, e = y2-y0, f = y0, each monomial eta^p xi^q
*       replaced by integral_eta_xi(p,q) and the constant by 0.5.
*
*       NOTE(review): presumably the integral of x*y over the triangle
*       crds0, crds1, crds2 -- confirm against integral_eta_xi().
*/
EXPORT long double int_xy(
      float *crds0,
      float *crds1,
      float *crds2,
      long double det)
{
      long double sum;

      /* a*eta times (d*eta + e*xi + f) */
      sum  = (long double)(crds1[0]-crds0[0])*(crds1[1]-crds0[1])*integral_eta_xi(2,0);
      sum += (long double)(crds1[0]-crds0[0])*(crds2[1]-crds0[1])*integral_eta_xi(1,1);
      sum += (long double)crds0[1]*(crds1[0]-crds0[0])*integral_eta_xi(1,0);
      /* b*xi times (d*eta + e*xi + f) */
      sum += (long double)(crds2[0]-crds0[0])*(crds1[1]-crds0[1])*integral_eta_xi(1,1);
      sum += (long double)(crds2[0]-crds0[0])*(crds2[1]-crds0[1])*integral_eta_xi(0,2);
      sum += (long double)crds0[1]*(crds2[0]-crds0[0])*integral_eta_xi(0,1);
      /* c times (d*eta + e*xi + f) */
      sum += (long double)crds0[0]*(crds1[1]-crds0[1])*integral_eta_xi(1,0);
      sum += (long double)crds0[0]*(crds2[1]-crds0[1])*integral_eta_xi(0,1);
      sum += (long double)crds0[0]*crds0[1]*0.5;
      return det*sum;
}

/*
*       int_x3():
*
*       Returns det times the term-by-term expansion of (a*eta + b*xi + c)^3
*       with a = x1-x0, b = x2-x0, c = x0 (xk = crdsk[0]), each monomial
*       eta^p xi^q replaced by integral_eta_xi(p,q) and the constant by 0.5.
*       The multinomial factors 3 and 6 appear as one plain term plus one or
*       more terms carrying a factor 2.
*
*       NOTE(review): presumably the integral of x^3 over the triangle
*       crds0, crds1, crds2 -- confirm against integral_eta_xi().
*/
EXPORT long double int_x3(
      float *crds0,
      float *crds1,
      float *crds2,
      long double det)
{
      long double sum;

      /* cubes plus one copy of every mixed monomial except eta*xi */
      sum  = (long double)cub(crds1[0]-crds0[0])*integral_eta_xi(3,0);
      sum += (long double)sqr(crds1[0]-crds0[0])*(crds2[0]-crds0[0])*integral_eta_xi(2,1);
      sum += (long double)sqr(crds1[0]-crds0[0])*crds0[0]*integral_eta_xi(2,0);
      sum += (long double)sqr(crds2[0]-crds0[0])*(crds1[0]-crds0[0])*integral_eta_xi(1,2);
      sum += (long double)cub(crds2[0]-crds0[0])*integral_eta_xi(0,3);
      sum += (long double)crds0[0]*sqr(crds2[0]-crds0[0])*integral_eta_xi(0,2);
      sum += (long double)sqr(crds0[0])*(crds1[0]-crds0[0])*integral_eta_xi(1,0);
      sum += (long double)sqr(crds0[0])*(crds2[0]-crds0[0])*integral_eta_xi(0,1);
      sum += (long double)cub(crds0[0])*0.5;
      /* doubled copies completing the factors 3 and 6 */
      sum += (long double)2.0*sqr(crds1[0]-crds0[0])*(crds2[0]-crds0[0])*integral_eta_xi(2,1);
      sum += (long double)2.0*sqr(crds2[0]-crds0[0])*(crds1[0]-crds0[0])*integral_eta_xi(1,2);
      sum += (long double)2.0*crds0[0]*(crds1[0]-crds0[0])*(crds2[0]-crds0[0])*integral_eta_xi(1,1);
      sum += (long double)2.0*sqr(crds1[0]-crds0[0])*crds0[0]*integral_eta_xi(2,0);
      sum += (long double)2.0*crds0[0]*(crds1[0]-crds0[0])*(crds2[0]-crds0[0])*integral_eta_xi(1,1);
      sum += (long double)2.0*(crds1[0]-crds0[0])*sqr(crds0[0])*integral_eta_xi(1,0);
      sum += (long double)2.0*crds0[0]*(crds1[0]-crds0[0])*(crds2[0]-crds0[0])*integral_eta_xi(1,1);
      sum += (long double)2.0*crds0[0]*sqr(crds2[0]-crds0[0])*integral_eta_xi(0,2);
      sum += (long double)2.0*sqr(crds0[0])*(crds2[0]-crds0[0])*integral_eta_xi(0,1);
      return det*sum;
}

/*
*       int_y3():
*
*       Returns det times the term-by-term expansion of (a*eta + b*xi + c)^3
*       with a = y1-y0, b = y2-y0, c = y0 (yk = crdsk[1]), each monomial
*       eta^p xi^q replaced by integral_eta_xi(p,q) and the constant by 0.5.
*       The multinomial factors 3 and 6 appear as one plain term plus one or
*       more terms carrying a factor 2.
*
*       Each term promotes its leading factor to long double before the
*       remaining factors are multiplied in, exactly as int_x3() does.  The
*       earlier form wrapped the whole coefficient product in parentheses
*       and cast afterwards, so the products were formed in the narrower
*       operand type and the two routines rounded differently.
*
*       NOTE(review): presumably the integral of y^3 over the triangle
*       crds0, crds1, crds2 -- confirm against integral_eta_xi().
*/
EXPORT long double int_y3(
      float *crds0,
      float *crds1,
      float *crds2,
      long double det)
{
      long double sum;

      /* cubes plus one copy of every mixed monomial except eta*xi */
      sum  = (long double)cub(crds1[1]-crds0[1])*integral_eta_xi(3,0);
      sum += (long double)sqr(crds1[1]-crds0[1])*(crds2[1]-crds0[1])*integral_eta_xi(2,1);
      sum += (long double)sqr(crds1[1]-crds0[1])*crds0[1]*integral_eta_xi(2,0);
      sum += (long double)sqr(crds2[1]-crds0[1])*(crds1[1]-crds0[1])*integral_eta_xi(1,2);
      sum += (long double)cub(crds2[1]-crds0[1])*integral_eta_xi(0,3);
      sum += (long double)crds0[1]*sqr(crds2[1]-crds0[1])*integral_eta_xi(0,2);
      sum += (long double)sqr(crds0[1])*(crds1[1]-crds0[1])*integral_eta_xi(1,0);
      sum += (long double)sqr(crds0[1])*(crds2[1]-crds0[1])*integral_eta_xi(0,1);
      sum += (long double)cub(crds0[1])*0.5;
      /* doubled copies completing the factors 3 and 6 */
      sum += (long double)2.0*sqr(crds1[1]-crds0[1])*(crds2[1]-crds0[1])*integral_eta_xi(2,1);
      sum += (long double)2.0*sqr(crds2[1]-crds0[1])*(crds1[1]-crds0[1])*integral_eta_xi(1,2);
      sum += (long double)2.0*crds0[1]*(crds1[1]-crds0[1])*(crds2[1]-crds0[1])*integral_eta_xi(1,1);
      sum += (long double)2.0*sqr(crds1[1]-crds0[1])*crds0[1]*integral_eta_xi(2,0);
      sum += (long double)2.0*crds0[1]*(crds1[1]-crds0[1])*(crds2[1]-crds0[1])*integral_eta_xi(1,1);
      sum += (long double)2.0*(crds1[1]-crds0[1])*sqr(crds0[1])*integral_eta_xi(1,0);
      sum += (long double)2.0*crds0[1]*(crds1[1]-crds0[1])*(crds2[1]-crds0[1])*integral_eta_xi(1,1);
      sum += (long double)2.0*crds0[1]*sqr(crds2[1]-crds0[1])*integral_eta_xi(0,2);
      sum += (long double)2.0*sqr(crds0[1])*(crds2[1]-crds0[1])*integral_eta_xi(0,1);
      return det*sum;
}

/*
*       int_x2y():
*
*       Returns det times the term-by-term expansion of
*       (a*eta + b*xi + c)^2 * (d*eta + e*xi + f) with a = x1-x0, b = x2-x0,
*       c = x0, d = y1-y0, e = y2-y0, f = y0, each monomial eta^p xi^q
*       replaced by integral_eta_xi(p,q) and the constant by 0.5.
*
*       NOTE(review): presumably the integral of x^2*y over the triangle
*       crds0, crds1, crds2 -- confirm against integral_eta_xi().
*/
EXPORT long double int_x2y(
      float *crds0,
      float *crds1,
      float *crds2,
      long double det)
{
      long double sum;

      /* a^2 eta^2 times (d*eta + e*xi + f) */
      sum  = (long double)sqr(crds1[0]-crds0[0])*(crds1[1]-crds0[1])*integral_eta_xi(3,0);
      sum += (long double)sqr(crds1[0]-crds0[0])*(crds2[1]-crds0[1])*integral_eta_xi(2,1);
      sum += (long double)sqr(crds1[0]-crds0[0])*crds0[1]*integral_eta_xi(2,0);
      /* 2ab eta*xi times (d*eta + e*xi + f) */
      sum += (long double)2.0*(crds1[0]-crds0[0])*(crds2[0]-crds0[0])*(crds1[1]-crds0[1])*integral_eta_xi(2,1);
      sum += (long double)2.0*(crds1[0]-crds0[0])*(crds2[0]-crds0[0])*(crds2[1]-crds0[1])*integral_eta_xi(1,2);
      sum += (long double)2.0*(crds1[0]-crds0[0])*(crds2[0]-crds0[0])*crds0[1]*integral_eta_xi(1,1);
      /* b^2 xi^2 times (d*eta + e*xi + f) */
      sum += (long double)sqr(crds2[0]-crds0[0])*(crds1[1]-crds0[1])*integral_eta_xi(1,2);
      sum += (long double)sqr(crds2[0]-crds0[0])*(crds2[1]-crds0[1])*integral_eta_xi(0,3);
      sum += (long double)sqr(crds2[0]-crds0[0])*crds0[1]*integral_eta_xi(0,2);
      /* 2ac eta times (d*eta + e*xi + f) */
      sum += (long double)2.0*(crds1[0]-crds0[0])*(crds1[1]-crds0[1])*crds0[0]*integral_eta_xi(2,0);
      sum += (long double)2.0*(crds1[0]-crds0[0])*(crds2[1]-crds0[1])*crds0[0]*integral_eta_xi(1,1);
      sum += (long double)2.0*crds0[0]*(crds1[0]-crds0[0])*crds0[1]*integral_eta_xi(1,0);
      /* 2bc xi times (d*eta + e*xi + f) */
      sum += (long double)2.0*crds0[0]*(crds2[0]-crds0[0])*(crds1[1]-crds0[1])*integral_eta_xi(1,1);
      sum += (long double)2.0*crds0[0]*(crds2[0]-crds0[0])*(crds2[1]-crds0[1])*integral_eta_xi(0,2);
      sum += (long double)2.0*crds0[0]*crds0[1]*(crds2[0]-crds0[0])*integral_eta_xi(0,1);
      /* c^2 times (d*eta + e*xi + f) */
      sum += (long double)sqr(crds0[0])*(crds1[1]-crds0[1])*integral_eta_xi(1,0);
      sum += (long double)sqr(crds0[0])*(crds2[1]-crds0[1])*integral_eta_xi(0,1);
      sum += (long double)sqr(crds0[0])*crds0[1]*0.5;
      return det*sum;
}

/*
*       int_xy2():
*
*       Returns det times the term-by-term expansion of
*       (a*eta + b*xi + c) * (d*eta + e*xi + f)^2 with a = x1-x0, b = x2-x0,
*       c = x0, d = y1-y0, e = y2-y0, f = y0, each monomial eta^p xi^q
*       replaced by integral_eta_xi(p,q) and the constant by 0.5.
*
*       NOTE(review): presumably the integral of x*y^2 over the triangle
*       crds0, crds1, crds2 -- confirm against integral_eta_xi().
*/
EXPORT long double int_xy2(
      float *crds0,
      float *crds1,
      float *crds2,
      long double det)
{
      long double sum;

      /* a*eta times the expanded square */
      sum  = (long double)((long double)sqr(crds1[1]-crds0[1])*(long double)(crds1[0]-crds0[0]))*integral_eta_xi(3,0);
      sum += (long double)((long double)sqr(crds2[1]-crds0[1])*(long double)(crds1[0]-crds0[0]))*integral_eta_xi(1,2);
      sum += (long double)(crds1[0]-crds0[0])*(long double)sqr(crds0[1])*integral_eta_xi(1,0);
      sum += (long double)(2.0*(crds1[0]-crds0[0])*(long double)(crds2[1]-crds0[1])*(crds1[1]-crds0[1]))*integral_eta_xi(2,1);
      sum += (long double)2.0*(crds1[0]-crds0[0])*(long double)crds0[1]*(long double)(crds1[1]-crds0[1])*integral_eta_xi(2,0);
      sum += (long double)2.0*(crds1[0]-crds0[0])*(long double)(crds2[1]-crds0[1])*(long double)crds0[1]*integral_eta_xi(1,1);
      /* b*xi times the expanded square */
      sum += (long double)sqr(crds1[1]-crds0[1])*(long double)(crds2[0]-crds0[0])*integral_eta_xi(2,1);
      sum += (long double)sqr(crds2[1]-crds0[1])*(long double)(crds2[0]-crds0[0])*integral_eta_xi(0,3);
      sum += (long double)sqr(crds0[1])*(long double)(crds2[0]-crds0[0])*integral_eta_xi(0,1);
      sum += (long double)2.0*(crds2[0]-crds0[0])*(long double)(crds1[1]-crds0[1])*(long double)(crds2[1]-crds0[1])*integral_eta_xi(1,2);
      sum += (long double)2.0*(crds2[0]-crds0[0])*(crds1[1]-crds0[1])*crds0[1]*integral_eta_xi(1,1);
      sum += (long double)2.0*crds0[1]*(crds2[0]-crds0[0])*(crds2[1]-crds0[1])*integral_eta_xi(0,2);
      /* c times the expanded square */
      sum += (long double)crds0[0]*sqr(crds1[1]-crds0[1])*integral_eta_xi(2,0);
      sum += (long double)crds0[0]*sqr(crds2[1]-crds0[1])*integral_eta_xi(0,2);
      sum += (long double)crds0[0]*sqr(crds0[1])*0.5;
      sum += (long double)2.0*crds0[0]*(crds1[1]-crds0[1])*(crds2[1]-crds0[1])*integral_eta_xi(1,1);
      sum += (long double)2.0*crds0[0]*(crds1[1]-crds0[1])*crds0[1]*integral_eta_xi(1,0);
      sum += (long double)2.0*crds0[0]*crds0[1]*(crds2[1]-crds0[1])*integral_eta_xi(0,1);
      return det*sum;
}

/*
*       int_x4():
*
*       Returns det times the term-by-term expansion of (a*eta + b*xi + c)^4
*       with a = x1-x0, b = x2-x0, c = x0 (xk = crdsk[0]), each monomial
*       eta^p xi^q replaced by integral_eta_xi(p,q) and the constant by 0.5.
*       Terms are grouped by their multinomial factor 1, 6, 4 and 12.
*
*       NOTE(review): presumably the integral of x^4 over the triangle
*       crds0, crds1, crds2 -- confirm against integral_eta_xi().
*/
EXPORT long double int_x4(
      float *crds0,
      float *crds1,
      float *crds2,
      long double det)
{
      long double sum;

      /* fourth powers */
      sum  = (long double)sqr(crds1[0]-crds0[0])*sqr(crds1[0]-crds0[0])*integral_eta_xi(4,0);
      sum += (long double)sqr(crds2[0]-crds0[0])*sqr(crds2[0]-crds0[0])*integral_eta_xi(0,4);
      sum += (long double)sqr(crds0[0])*sqr(crds0[0])*0.5;
      /* square times square */
      sum += (long double)6.0*sqr(crds1[0]-crds0[0])*sqr(crds2[0]-crds0[0])*integral_eta_xi(2,2);
      sum += (long double)6.0*sqr(crds1[0]-crds0[0])*sqr(crds0[0])*integral_eta_xi(2,0);
      sum += (long double)6.0*sqr(crds2[0]-crds0[0])*sqr(crds0[0])*integral_eta_xi(0,2);
      /* cube times first power */
      sum += (long double)4.0*cub(crds1[0]-crds0[0])*(crds2[0]-crds0[0])*integral_eta_xi(3,1);
      sum += (long double)4.0*cub(crds1[0]-crds0[0])*(crds0[0])*integral_eta_xi(3,0);
      sum += (long double)4.0*(crds1[0]-crds0[0])*cub(crds2[0]-crds0[0])*integral_eta_xi(1,3);
      sum += (long double)4.0*cub(crds2[0]-crds0[0])*crds0[0]*integral_eta_xi(0,3);
      sum += (long double)4.0*(crds1[0]-crds0[0])*cub(crds0[0])*integral_eta_xi(1,0);
      sum += (long double)4.0*(crds2[0]-crds0[0])*cub(crds0[0])*integral_eta_xi(0,1);
      /* square times two first powers */
      sum += (long double)12.0*(crds1[0]-crds0[0])*(crds2[0]-crds0[0])*sqr(crds0[0])*integral_eta_xi(1,1);
      sum += (long double)12.0*sqr(crds1[0]-crds0[0])*(crds2[0]-crds0[0])*crds0[0]*integral_eta_xi(2,1);
      sum += (long double)12.0*(crds1[0]-crds0[0])*sqr(crds2[0]-crds0[0])*crds0[0]*integral_eta_xi(1,2);
      return det*sum;
}

/*
*       int_y4():
*
*       Same expansion as int_x4() but built from component [1] of the
*       vertex arrays: returns det times the expansion of
*       (a*xi + b*eta + c)^4 with
*           a = crds1[1]-crds0[1],  b = crds2[1]-crds0[1],  c = crds0[1],
*       each monomial xi^p*eta^q being replaced by integral_eta_xi(p,q)
*       and the constant term by the literal 0.5.
*
*       NOTE(review): det is presumably the Jacobian determinant of the
*       reference-to-physical triangle map -- confirm against the callers.
*/
EXPORT long double int_y4(
      float *crds0,
      float *crds1,
      float *crds2,
      long double det)
{
      long double ans;
      ans = det*(
           /* pure fourth powers: a^4, b^4, c^4 */
           (long double)sqr(crds1[1]-crds0[1])*sqr(crds1[1]-crds0[1])*integral_eta_xi(4,0) +
           (long double)sqr(crds2[1]-crds0[1])*sqr(crds2[1]-crds0[1])*integral_eta_xi(0,4) +
           (long double)sqr(crds0[1])*sqr(crds0[1])*0.5+
           /* 6 * (square)(square) terms */
           (long double)6.0*sqr(crds1[1]-crds0[1])*sqr(crds2[1]-crds0[1])*integral_eta_xi(2,2) +
           (long double)6.0*sqr(crds1[1]-crds0[1])*sqr(crds0[1])*integral_eta_xi(2,0) +
           (long double)6.0*sqr(crds2[1]-crds0[1])*sqr(crds0[1])*integral_eta_xi(0,2) +
           /* 4 * (cube)(linear) terms */
           (long double)4.0*cub(crds1[1]-crds0[1])*(crds2[1]-crds0[1])*integral_eta_xi(3,1) +
           (long double)4.0*cub(crds1[1]-crds0[1])*(crds0[1])*integral_eta_xi(3,0) +
           (long double)4.0*(crds1[1]-crds0[1])*cub(crds2[1]-crds0[1])*integral_eta_xi(1,3) +
           (long double)4.0*cub(crds2[1]-crds0[1])*crds0[1]*integral_eta_xi(0,3) +
           (long double)4.0*(crds1[1]-crds0[1])*cub(crds0[1])*integral_eta_xi(1,0) +
           (long double)4.0*(crds2[1]-crds0[1])*cub(crds0[1])*integral_eta_xi(0,1) +
           /* 12 * (square)(linear)(linear) terms */
           (long double)12.0*(crds1[1]-crds0[1])*(crds2[1]-crds0[1])*sqr(crds0[1])*integral_eta_xi(1,1) +
           (long double)12.0*sqr(crds1[1]-crds0[1])*(crds2[1]-crds0[1])*crds0[1]*integral_eta_xi(2,1) +
           (long double)12.0*(crds1[1]-crds0[1])*sqr(crds2[1]-crds0[1])*crds0[1]*integral_eta_xi(1,2)
                );
      return ans;
}

/*
*       int_x3y():
*
*       Returns det times the expansion of
*           (a*xi + b*eta + c)^3 * (d*xi + e*eta + f)
*       with
*           a = crds1[0]-crds0[0],  b = crds2[0]-crds0[0],  c = crds0[0],
*           d = crds1[1]-crds0[1],  e = crds2[1]-crds0[1],  f = crds0[1],
*       where each monomial xi^p*eta^q is replaced by integral_eta_xi(p,q)
*       and the constant term c^3*f is weighted by the literal 0.5.
*
*       The terms are grouped by the factor coming from the cubic: each
*       group of three lines multiplies one cubic term by d*xi, e*eta and f.
*       NOTE(review): det is presumably the Jacobian determinant of the
*       reference-to-physical triangle map -- confirm against the callers.
*/
EXPORT long double int_x3y(
      float *crds0,
      float *crds1,
      float *crds2,
      long double det)
{
      long double ans;
      ans = det*(
           /* a^3 xi^3, b^3 eta^3 and c^3, each times (d xi, e eta, f) */
           (long double)cub(crds1[0]-crds0[0])*(crds1[1]-crds0[1])*integral_eta_xi(4,0) +
           (long double)cub(crds1[0]-crds0[0])*(crds2[1]-crds0[1])*integral_eta_xi(3,1) + 
           (long double)cub(crds1[0]-crds0[0])*crds0[1]*integral_eta_xi(3,0) + 
           (long double)cub(crds2[0]-crds0[0])*(crds1[1]-crds0[1])*integral_eta_xi(1,3) +
           (long double)cub(crds2[0]-crds0[0])*(crds2[1]-crds0[1])*integral_eta_xi(0,4) +
           (long double)cub(crds2[0]-crds0[0])*crds0[1]*integral_eta_xi(0,3) +
           (long double)cub(crds0[0])*(crds1[1]-crds0[1])*integral_eta_xi(1,0) +
           (long double)cub(crds0[0])*(crds2[1]-crds0[1])*integral_eta_xi(0,1) +
           (long double)cub(crds0[0])*crds0[1]*0.5 +

           /* 3 a^2 b xi^2 eta */
           (long double)3.0*sqr(crds1[0]-crds0[0])*(crds2[0]-crds0[0])*(crds1[1]-crds0[1])*integral_eta_xi(3,1) +
           (long double)3.0*sqr(crds1[0]-crds0[0])*(crds2[0]-crds0[0])*(crds2[1]-crds0[1])*integral_eta_xi(2,2) +
           (long double)3.0*sqr(crds1[0]-crds0[0])*(crds2[0]-crds0[0])*crds0[1]*integral_eta_xi(2,1) +

           /* 3 a^2 c xi^2 */
           (long double)3.0*sqr(crds1[0]-crds0[0])*crds0[0]*(crds1[1]-crds0[1])*integral_eta_xi(3,0) +
           (long double)3.0*sqr(crds1[0]-crds0[0])*crds0[0]*(crds2[1]-crds0[1])*integral_eta_xi(2,1) +
           (long double)3.0*sqr(crds1[0]-crds0[0])*crds0[0]*crds0[1]*integral_eta_xi(2,0) +

           /* 3 a b^2 xi eta^2 */
           (long double)3.0*(crds1[0]-crds0[0])*sqr(crds2[0]-crds0[0])*(crds1[1]-crds0[1])*integral_eta_xi(2,2) +
           (long double)3.0*(crds1[0]-crds0[0])*sqr(crds2[0]-crds0[0])*(crds2[1]-crds0[1])*integral_eta_xi(1,3) +
           (long double)3.0*(crds1[0]-crds0[0])*sqr(crds2[0]-crds0[0])*crds0[1]*integral_eta_xi(1,2)+
           
           /* 3 b^2 c eta^2 */
           (long double)3.0*sqr(crds2[0]-crds0[0])*crds0[0]*(crds1[1]-crds0[1])*integral_eta_xi(1,2) +
           (long double)3.0*sqr(crds2[0]-crds0[0])*crds0[0]*(crds2[1]-crds0[1])*integral_eta_xi(0,3) +
           (long double)3.0*sqr(crds2[0]-crds0[0])*crds0[0]*crds0[1]*integral_eta_xi(0,2) +

           /* 3 a c^2 xi */
           (long double)3.0*(crds1[0]-crds0[0])*sqr(crds0[0])*(crds1[1]-crds0[1])*integral_eta_xi(2,0) +
           (long double)3.0*(crds1[0]-crds0[0])*sqr(crds0[0])*(crds2[1]-crds0[1])*integral_eta_xi(1,1) +
           (long double)3.0*(crds1[0]-crds0[0])*sqr(crds0[0])*crds0[1]*integral_eta_xi(1,0) +

           /* 3 b c^2 eta */
           (long double)3.0*(crds2[0]-crds0[0])*sqr(crds0[0])*(crds1[1]-crds0[1])*integral_eta_xi(1,1) +
           (long double)3.0*(crds2[0]-crds0[0])*sqr(crds0[0])*(crds2[1]-crds0[1])*integral_eta_xi(0,2) +
           (long double)3.0*(crds2[0]-crds0[0])*sqr(crds0[0])*crds0[1]*integral_eta_xi(0,1) +

      /* 6 a b c xi eta */
      (long double)6.0*(crds1[0]-crds0[0])*(crds2[0]-crds0[0])*crds0[0]*(crds1[1]-crds0[1])*integral_eta_xi(2,1) +
      (long double)6.0*(crds1[0]-crds0[0])*(crds2[0]-crds0[0])*crds0[0]*(crds2[1]-crds0[1])*integral_eta_xi(1,2) + 
      (long double)6.0*(crds1[0]-crds0[0])*(crds2[0]-crds0[0])*crds0[0]*crds0[1]*integral_eta_xi(1,1) 
                );
      return ans;
}


/*
*       int_x2y2():
*
*       Returns det times the expansion of
*           (a*xi + b*eta + c)^2 * (d*xi + e*eta + f)^2
*       with
*           a = crds1[0]-crds0[0],  b = crds2[0]-crds0[0],  c = crds0[0],
*           d = crds1[1]-crds0[1],  e = crds2[1]-crds0[1],  f = crds0[1],
*       where each monomial xi^p*eta^q is replaced by integral_eta_xi(p,q)
*       and the constant term c^2*f^2 is weighted by the literal 0.5.
*
*       The sum is organised as six parenthesised groups, one per term of
*       the first square (a^2, b^2, c^2, 2ab, 2ac, 2bc); inside each group
*       the six terms of the second square (d^2, e^2, f^2, 2de, 2df, 2ef)
*       appear in the same order.
*       NOTE(review): det is presumably the Jacobian determinant of the
*       reference-to-physical triangle map -- confirm against the callers.
*/
EXPORT long double int_x2y2(
      float *crds0,
      float *crds1,
      float *crds2,
      long double det)
{
      long double ans;
      ans = det*(
        /* a^2 xi^2 * (...) */
        (long double)sqr(crds1[0]-crds0[0])*(
        (long double)sqr(crds1[1]-crds0[1])*integral_eta_xi(4,0)+
        (long double)sqr(crds2[1]-crds0[1])*integral_eta_xi(2,2)+
        (long double)sqr(crds0[1])*integral_eta_xi(2,0)+
        (long double)2.0*(crds1[1]-crds0[1])*(crds2[1]-crds0[1])*integral_eta_xi(3,1)+
        (long double)2.0*(crds1[1]-crds0[1])*crds0[1]*integral_eta_xi(3,0)+
        (long double)2.0*(crds2[1]-crds0[1])*crds0[1]*integral_eta_xi(2,1))+

        /* b^2 eta^2 * (...) */
        (long double)sqr(crds2[0]-crds0[0])*(
        (long double)sqr(crds1[1]-crds0[1])*integral_eta_xi(2,2)+
        (long double)sqr(crds2[1]-crds0[1])*integral_eta_xi(0,4)+
        (long double)sqr(crds0[1])*integral_eta_xi(0,2)+
        (long double)2.0*(crds1[1]-crds0[1])*(crds2[1]-crds0[1])*integral_eta_xi(1,3)+
        (long double)2.0*(crds1[1]-crds0[1])*crds0[1]*integral_eta_xi(1,2)+
        (long double)2.0*(crds2[1]-crds0[1])*crds0[1]*integral_eta_xi(0,3))+

        /* c^2 * (...) */
        (long double)sqr(crds0[0])*(sqr(crds1[1]-crds0[1])*integral_eta_xi(2,0)+
        (long double)sqr(crds2[1]-crds0[1])*integral_eta_xi(0,2)+
        (long double)sqr(crds0[1])*0.5+
        (long double)2.0*(crds1[1]-crds0[1])*(crds2[1]-crds0[1])*integral_eta_xi(1,1)+
        (long double)2.0*(crds1[1]-crds0[1])*crds0[1]*integral_eta_xi(1,0)+
        (long double)2.0*(crds2[1]-crds0[1])*crds0[1]*integral_eta_xi(0,1))+

        /* 2 a b xi eta * (...) */
        (long double)2.0*(crds1[0]-crds0[0])*(crds2[0]-crds0[0])*(
        (long double)sqr(crds1[1]-crds0[1])*integral_eta_xi(3,1)+
        (long double)sqr(crds2[1]-crds0[1])*integral_eta_xi(1,3)+
        (long double)sqr(crds0[1])*integral_eta_xi(1,1)+
   (long double)2.0*(crds1[1]-crds0[1])*(crds2[1]-crds0[1])*integral_eta_xi(2,2)+
   (long double)2.0*(crds1[1]-crds0[1])*crds0[1]*integral_eta_xi(2,1)+
   (long double)2.0*(crds2[1]-crds0[1])*crds0[1]*integral_eta_xi(1,2))+

        /* 2 a c xi * (...) */
        (long double)2.0*(crds1[0]-crds0[0])*crds0[0]*(
        (long double)sqr(crds1[1]-crds0[1])*integral_eta_xi(3,0)+
        (long double)sqr(crds2[1]-crds0[1])*integral_eta_xi(1,2)+
        (long double)sqr(crds0[1])*integral_eta_xi(1,0)+
        (long double)2.0*(crds1[1]-crds0[1])*(crds2[1]-crds0[1])*integral_eta_xi(2,1)+
        (long double)2.0*(crds1[1]-crds0[1])*crds0[1]*integral_eta_xi(2,0)+
        (long double)2.0*(crds2[1]-crds0[1])*crds0[1]*integral_eta_xi(1,1))+

        /* 2 b c eta * (...) */
        (long double)2.0*(crds2[0]-crds0[0])*crds0[0]*(
        (long double)sqr(crds1[1]-crds0[1])*integral_eta_xi(2,1)+
        (long double)sqr(crds2[1]-crds0[1])*integral_eta_xi(0,3)+
        (long double)sqr(crds0[1])*integral_eta_xi(0,1)+
        (long double)2.0*(crds1[1]-crds0[1])*(crds2[1]-crds0[1])*integral_eta_xi(1,2)+ 
        (long double)2.0*(crds1[1]-crds0[1])*crds0[1]*integral_eta_xi(1,1)+
        (long double)2.0*(crds2[1]-crds0[1])*crds0[1]*integral_eta_xi(0,2))
         );
  
      /* debugging output, left disabled */
      // printf("print x2y2 result long db = %23.21Lg\n", ans); 
      // printf("print x2y2 result db      = %23.21g\n", (double)ans); 
      return ans;
}

/*
*       int_xy3():
*
*       Mirror image of int_x3y() with the roles of components [0] and [1]
*       exchanged: returns det times the expansion of
*           (d*xi + e*eta + f)^3 * (a*xi + b*eta + c)
*       with
*           a = crds1[0]-crds0[0],  b = crds2[0]-crds0[0],  c = crds0[0],
*           d = crds1[1]-crds0[1],  e = crds2[1]-crds0[1],  f = crds0[1],
*       where each monomial xi^p*eta^q is replaced by integral_eta_xi(p,q)
*       and the constant term f^3*c is weighted by the literal 0.5.
*
*       Each group of three lines multiplies one term of the cubic by
*       a*xi, b*eta and c.
*       NOTE(review): det is presumably the Jacobian determinant of the
*       reference-to-physical triangle map -- confirm against the callers.
*/
EXPORT long double int_xy3(
      float *crds0,
      float *crds1,
      float *crds2,
      long double det)
{
      long double ans;
      ans = det*(
           /* d^3 xi^3 */
           (long double)((long double)cub(crds1[1]-crds0[1])*(crds1[0]-crds0[0]))*integral_eta_xi(4,0) +
           (long double)((long double)cub(crds1[1]-crds0[1])*(crds2[0]-crds0[0]))*integral_eta_xi(3,1) +
           (long double)((long double)cub(crds1[1]-crds0[1])*crds0[0])*integral_eta_xi(3,0) +

           /* e^3 eta^3 */
           (long double)((long double)cub(crds2[1]-crds0[1])*(crds1[0]-crds0[0]))*integral_eta_xi(1,3) +
           (long double)((long double)cub(crds2[1]-crds0[1])*(crds2[0]-crds0[0]))*integral_eta_xi(0,4) +
           (long double)((long double)cub(crds2[1]-crds0[1])*crds0[0])*integral_eta_xi(0,3) +

           /* f^3 */
           (long double)((long double)cub(crds0[1])*(crds1[0]-crds0[0]))*integral_eta_xi(1,0) +
           (long double)((long double)cub(crds0[1])*(crds2[0]-crds0[0]))*integral_eta_xi(0,1) +
           (long double)((long double)cub(crds0[1])*crds0[0]*0.5) +

           /* 3 d^2 e xi^2 eta */
           (long double)(3.0*(long double)sqr(crds1[1]-crds0[1])*(crds2[1]-crds0[1])*(crds1[0]-crds0[0]))*integral_eta_xi(3,1) +
           (long double)(3.0*(long double)sqr(crds1[1]-crds0[1])*(crds2[1]-crds0[1])*(crds2[0]-crds0[0]))*integral_eta_xi(2,2) +
           (long double)(3.0*(long double)sqr(crds1[1]-crds0[1])*(crds2[1]-crds0[1])*crds0[0])*integral_eta_xi(2,1) +

           /* 3 d^2 f xi^2 */
           (long double)(3.0*(long double)sqr(crds1[1]-crds0[1])*crds0[1]*(crds1[0]-crds0[0]))*integral_eta_xi(3,0) +
           (long double)(3.0*(long double)sqr(crds1[1]-crds0[1])*crds0[1]*(crds2[0]-crds0[0]))*integral_eta_xi(2,1) +
           (long double)(3.0*(long double)sqr(crds1[1]-crds0[1])*crds0[1]*crds0[0])*integral_eta_xi(2,0) +

           /* 3 d e^2 xi eta^2 */
           (long double)(3.0*(crds1[1]-crds0[1])*(long double)sqr(crds2[1]-crds0[1])*(crds1[0]-crds0[0]))*integral_eta_xi(2,2) +
           (long double)(3.0*(crds1[1]-crds0[1])*(long double)sqr(crds2[1]-crds0[1])*(crds2[0]-crds0[0]))*integral_eta_xi(1,3) +
           (long double)(3.0*(crds1[1]-crds0[1])*(long double)sqr(crds2[1]-crds0[1])*crds0[0])*integral_eta_xi(1,2)+

           /* 3 e^2 f eta^2 */
           (long double)(3.0*(long double)sqr(crds2[1]-crds0[1])*crds0[1]*(crds1[0]-crds0[0]))*integral_eta_xi(1,2) +
           (long double)(3.0*(long double)sqr(crds2[1]-crds0[1])*crds0[1]*(crds2[0]-crds0[0]))*integral_eta_xi(0,3) +
           (long double)(3.0*(long double)sqr(crds2[1]-crds0[1])*crds0[1]*crds0[0])*integral_eta_xi(0,2) +

           /* 3 d f^2 xi */
           (long double)(3.0*(crds1[1]-crds0[1])*(long double)sqr(crds0[1])*(crds1[0]-crds0[0]))*integral_eta_xi(2,0) +
           (long double)(3.0*(crds1[1]-crds0[1])*(long double)sqr(crds0[1])*(crds2[0]-crds0[0]))*integral_eta_xi(1,1) +
           (long double)(3.0*(crds1[1]-crds0[1])*(long double)sqr(crds0[1])*crds0[0])*integral_eta_xi(1,0) +

           /* 3 e f^2 eta */
           (long double)(3.0*(crds2[1]-crds0[1])*(long double)sqr(crds0[1])*(crds1[0]-crds0[0]))*integral_eta_xi(1,1) +
           (long double)(3.0*(crds2[1]-crds0[1])*(long double)sqr(crds0[1])*(crds2[0]-crds0[0]))*integral_eta_xi(0,2) +
           (long double)(3.0*(crds2[1]-crds0[1])*(long double)sqr(crds0[1])*crds0[0])*integral_eta_xi(0,1) +

      /* 6 d e f xi eta */
      (long double)(6.0*(crds1[1]-crds0[1])*(crds2[1]-crds0[1])*crds0[1]*(crds1[0]-crds0[0]))*integral_eta_xi(2,1) +
      (long double)(6.0*(crds1[1]-crds0[1])*(crds2[1]-crds0[1])*crds0[1]*(crds2[0]-crds0[0]))*integral_eta_xi(1,2) +
      (long double)(6.0*(crds1[1]-crds0[1])*(crds2[1]-crds0[1])*crds0[1]*crds0[0])*integral_eta_xi(1,1)
                );
      return ans;
}

/*
*       integral_eta_xi():
*
*       Returns
*
*           1/(n_eta+1) * sum_{l=0}^{n_eta+1} (-1)^l C(n_eta+1,l)/(n_xi+l+1)
*
*       which is the integral of xi^n_xi * eta^n_eta over the reference
*       triangle {xi >= 0, eta >= 0, xi + eta <= 1} (integrate eta from 0
*       to 1-xi, then expand (1-xi)^(n_eta+1) with the binomial theorem).
*       For example integral_eta_xi(0,0) is 0.5, the triangle's area.
*
*       n_xi, n_eta: non-negative exponents of xi and eta.
*
*       The binomial coefficient is advanced incrementally,
*       C(m,l) = C(m,l-1)*(m-l+1)/l, instead of being formed from int
*       factorials; the factorial form overflowed int as soon as
*       n_eta+1 reached 13.  Every intermediate value is an exactly
*       representable integer for the exponents used by the moment
*       routines, so results for small exponents are unchanged.
*/
LOCAL long double integral_eta_xi(
        int n_xi,
        int n_eta)
{
        int         l;
        int         top = n_eta + 1;
        long double binom = 1.0;        /* C(top,0) */
        long double ans = 0.0;

        for(l = 0; l <= top; l++)
        {
            int sign = (l%2 == 0 ? 1 : -1);

            if(l > 0)
                binom = binom*(top-l+1)/l;      /* C(top,l) from C(top,l-1) */
            ans += binom*sign/(n_xi+l+1);
        }
        return ans/top;
}

/*
*       factorial():
*
*       Returns n! as an int.  Any n below 2 -- including negative values,
*       which previously recursed without end -- yields 1.  The result
*       only fits in a 32 bit int for n <= 12.
*/
LOCAL int factorial(int n)
{
    int result = 1;
    int k;

    for(k = 2; k <= n; k++)
        result *= k;
    return result;
}

EXPORT double matrix_det(
	double     **mat,
        int        size)
{
        int       i, j, order;
        static int size_ = 0;
        static int *IPIV = NULL;
        int       info, job;
                  /*IPIV: (output) INTEGER array, dimension (min(M,N))
                     The pivot indices; for 1 <= i <= min(M,N), row i of the
                     matrix was interchanged with row IPIV(i).
                   info: = 0 --- successful exit
                         < 0:  if INFO = -i, the i-th argument had an illegal value
                         > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                          has been completed, but the factor U is exactly
                          singular, and division by zero will occur if it is used
                          to solve a system of equations.
                   */
        static double     *A, *work, *z;
        double            rcond, det; 
        if(size_ != size)
        {
            size_ = size;
            if(IPIV != NULL)
                free_these(4,A, work, IPIV, z);

            vector(&A,size*size,sizeof(double));
            vector(&work,size*size,sizeof(double));
            vector(&IPIV, size,sizeof(int));
            vector(&z, size,sizeof(double));
        }

        for(j = 0; j < size; j++)
        {
            for(i = 0; i < size; i++)
            {
                A[j*size+i] = mat[i][j];
            }
        }

        order = size_*size_;  
        job = 10; 
        // FORTRAN_NAME(dgeco)(A, &size_, &order, IPIV, &rcond, &z);
        // FORTRAN_NAME(dgedi)(A, &size_, &order, IPIV, &det, &work, &job);

        return det;
}

/*
*       matrix_inv():
*
*       Inverse of the size x size matrix mat[row][col], written to
*       inv[row][col], using LAPACK dgetrf (LU factorization) followed by
*       dgetri (inverse from the LU factors).
*
*       The matrix is transposed into the column-major array A on the way
*       in and transposed back on the way out.  Either LAPACK routine
*       reporting info != 0 is fatal.
*
*       dgetrf/dgetri are declared with double* arguments, so the work
*       arrays are double; they were previously declared and sized as
*       float.
*
*       The work arrays are static and are only reallocated when size
*       differs from the previous call.
*/
EXPORT void  matrix_inv(
	double     **mat,
        int        size,
        double     **inv)
{
        int       i, j;
        static int size_ = 0;
        static int *IPIV = NULL;
        int       info;
                  /*IPIV: (output) INTEGER array, dimension (min(M,N))
                     The pivot indices; for 1 <= i <= min(M,N), row i of the
                     matrix was interchanged with row IPIV(i).
                   info: = 0 --- successful exit
                         < 0:  if INFO = -i, the i-th argument had an illegal value
                         > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                          has been completed, but the factor U is exactly
                          singular, and division by zero will occur if it is used
                          to solve a system of equations.
                   */
        static double    *A, *work;

        if(size_ != size)
        {
            size_ = size;
            if(IPIV != NULL)
                free_these(3,A, work, IPIV);

            vector(&A,size*size,sizeof(double));
            vector(&work,size*size,sizeof(double));
            vector(&IPIV, size,sizeof(int));
        }

        /* column-major copy for the Fortran routines */
        for(j = 0; j < size; j++)
        {
            for(i = 0; i < size; i++)
            {
                A[j*size+i] = mat[i][j];
            }
        }

        FORTRAN_NAME(dgetrf)(&size_, &size_, A, &size_, IPIV, &info);
        if(info != 0)
        {
            printf("ERROR: matrix_inv, dgetrf failed, info = %d\n", info);
            clean_up(ERROR);
        }
        /* the sixth argument is LWORK; size_ meets dgetri's minimum of max(1,N) */
        FORTRAN_NAME(dgetri)(&size_, A, &size_, IPIV, work, &size_, &info);
        if(info != 0)
        {
            printf("ERROR: matrix_inv, dgetri failed, info = %d\n", info);
            clean_up(ERROR);
        }

        for(j = 0; j < size; j++)
        {
            for(i = 0; i < size; i++)
            {
                inv[i][j] = A[j*size+i];
            }
        }
}

/*
*       inverse_matrix():
*
*       Inverse of the size x size matrix mat (indices 0..size-1), written
*       to inv.  Does nothing when inv is NULL.
*
*       The work is delegated to inverse_matrix_gj().  An older LU
*       decomposition with iterative refinement used to follow the
*       delegating call but could never execute, and the call itself was
*       written as "return <void expression>", which is not valid C; both
*       have been removed.
*/
EXPORT void  inverse_matrix(
        double      **mat,
        int        size,
        double      **inv)
{
        if(NULL == inv) return;

        inverse_matrix_gj(mat, size, inv);
}

/*
*       solve_by_gj():
*
*       Solves mat * x = inb for a size x size system by handing a long
*       double copy of the data to gaussj().
*
*       mat, inb and x are indexed from 0; the work arrays a and b are
*       indexed from 1, which is what gaussj() expects.  The right hand
*       side occupies column 1 of b and the solution is read back from
*       the same column.
*
*       The work arrays are static and are reallocated only when size
*       exceeds the largest size seen so far.
*/
EXPORT void solve_by_gj(
        double      **mat,
        int         size,
        double      *inb,
        double      *x)
{
        static long double **a, **b = NULL;
        static int         size_ = 0;
        int                row, col;

        if(size_ < size)
        {
            if(NULL != b)
            {
                free(a);
                free(b);
            }
            size_ = size;
            matrix(&a, (size+1), (size+1), sizeof(long double));
            matrix(&b, (size+1), (size+1), sizeof(long double));
        }

        /* load the shifted copies of the matrix and the right hand side */
        for(row = 1; row <= size; row++)
        {
            for(col = 1; col <= size; col++)
                a[row][col] = mat[row-1][col-1];
            b[row][1] = inb[row-1];
        }

        gaussj(a,size,b,1);

        for(row = 1; row <= size; row++)
            x[row-1] = b[row][1];
}


/*
*       solve_by_LU():
*
*       Solves mat * x = b for a size x size system with ludcmp()/lubksb()
*       followed by `improve` (3) passes of iterative refinement.
*
*       mat, b and x are indexed from 0 to size-1; the long double work
*       arrays handed to ludcmp()/lubksb() are indexed from 1 to size.
*
*       Each refinement pass forms the residual r = A*col - rhs from the
*       untouched copy of the matrix, solves for the correction with the
*       existing LU factors and subtracts it.  x is written only inside
*       the refinement loop.  The updated solution is stored to x first
*       and copied from there back into col, so col carries the value as
*       rounded to double.
*
*       The work arrays are static and are reallocated whenever size
*       differs from the previous call.
*/

EXPORT void solve_by_LU(
	double     **mat,
	int        size,
        double     *b,
        double     *x)
{
        static int         size_ = 0, *indx, improve = 3;
        static long double *col = NULL, **a, **copya, *colcp, *r;
        long double        d, resid;
        int                row, k, pass;

        if(size_ != size)
        {
            if(NULL != col)
            {
                free(col);
                free(a);
                free(copya);
                free(indx);
                free(colcp);
                free(r);
            }
            vector(&col, (size+1), sizeof(long double));
            vector(&colcp, (size+1), sizeof(long double));
            vector(&r, (size+1), sizeof(long double));
            vector(&indx, (size+1), sizeof(int));
            matrix(&a, (size+1), (size+1), sizeof(long double));
            matrix(&copya, (size+1), (size+1), sizeof(long double));
            size_ = size;
        }

        /* shifted copies: a is factored in place, copya/colcp stay intact */
        for(k = 1; k <= size; k++)
        {
            colcp[k] = col[k] = b[k-1];
            for(row = 1; row <= size; row++)
                copya[row][k] = a[row][k] = mat[row-1][k-1];
        }

        ludcmp(a, size, indx, &d);
        lubksb(a, size, indx, col);

        /* iterative refinement */
        for(pass = 0; pass < improve; pass++)
        {
            for(row = 1; row <= size; row++)
            {
                resid = -colcp[row];
                for(k = 1; k <= size; k++)
                    resid += copya[row][k]*col[k];
                r[row] = resid;
            }
            lubksb(a,size,indx,r);
            for(row = 1; row <= size; row++)
            {
                x[row-1] = (col[row] - r[row]);
                col[row] = x[row-1];
            }
        }
}

/*
*       lubksb():
*
*       Forward and back substitution for a system whose matrix has been
*       factored in place into unit-lower L (below the diagonal) and
*       upper U (diagonal and above) in a[1..n][1..n].
*
*       a:    combined LU factors, indexed from 1.
*       n:    order of the system.
*       indx: indx[i] is the row that was exchanged with row i; the
*             exchanges are replayed on b during the forward pass.
*       b:    right hand side on entry, solution on return (indexed
*             from 1).
*
*       Leading zero entries of the right hand side are skipped in the
*       forward pass: the inner sum only starts at the first row whose
*       entry was nonzero.
*/
LOCAL void lubksb(
	long double **a,
        int         n,
        int         *indx,
        long double *b)
{
        int          row, col, swapped;
        int          first_nz = 0;      /* 0 until a nonzero entry is met */
        long double  acc;

        /* forward substitution with the row exchanges undone on the fly */
        for(row = 1; row <= n; ++row)
        {
            swapped = indx[row];
            acc = b[swapped];
            b[swapped] = b[row];
            if(first_nz != 0)
            {
                for(col = first_nz; col < row; ++col)
                    acc -= a[row][col]*b[col];
            }
            else if(acc != 0.0)
            {
                first_nz = row;
            }
            b[row] = acc;
        }

        /* back substitution */
        for(row = n; row > 0; --row)
        {
            acc = b[row];
            for(col = row+1; col <= n; ++col)
                acc -= a[row][col]*b[col];
            b[row] = acc/a[row][row];
        }
}

/*
*       ludcmp():
*
*       In-place LU decomposition of a[1..n][1..n] with implicit row
*       scaling and partial pivoting (Crout's method).  On return the
*       strict lower triangle of a holds L (unit diagonal implied) and the
*       upper triangle holds U of a row-permuted copy of the input.
*
*       a:    matrix on entry, LU factors on return (indexed from 1).
*       n:    order of the matrix.
*       indx: indx[j] receives the row exchanged with row j at step j.
*       d:    receives +1.0 or -1.0 for an even or odd number of row
*             exchanges.
*
*       A row that is entirely zero is reported as singular and is fatal.
*       A pivot that comes out exactly zero is replaced by a tiny nonzero
*       value so the divisions here and in lubksb() stay finite; the
*       statement that was meant to do this assigned 0.0 and therefore
*       had no effect.
*
*       The scaling vector vv is static and reallocated when n changes.
*/
LOCAL void ludcmp(
	long double **a,
        int         n,
        int         *indx,
        long double *d)
{
        const long double tiny = 1.0e-20L;  /* stand-in for a zero pivot */
        int i, imax, j, k;
        long double big, dum, sum, temp;
        static long double *vv = NULL;
        static int       size_ = 0;

        if(n != size_)
        {
            if(vv != NULL)
            {
                free(vv);
            }
            vector(&vv, (n+1), sizeof(long double));
            size_ = n;
        }

        /* implicit scaling: vv[i] = 1/(largest magnitude in row i) */
        *d = 1.0;
        for(i = 1; i <= n; i++)
        {
            big = 0.0;
            for(j = 1; j <= n; j++)
            {
                if( (temp = fabsl(a[i][j])) > big)
                    big = temp;
            }
            if(big == 0.0)
            {
                printf("ERROR: Singular matrix in ludcmp\n");
                clean_up(ERROR);
            }
            vv[i] = 1.0/big;
        }

        for(j = 1; j <= n; j++)
        {
            /* entries of U above the diagonal in column j */
            for(i = 1; i < j; i++)
            {
                sum = a[i][j];
                for(k = 1; k < i; k++)
                    sum -= a[i][k]*a[k][j];
                a[i][j] = sum;
            }

            /* remaining entries and search for the scaled pivot */
            big = 0.0;
            imax = j;
            for(i = j; i <= n; i++)
            {
                sum = a[i][j];
                for(k = 1; k < j; k++)
                    sum -= a[i][k]*a[k][j];
                a[i][j] = sum;
                if( (dum = vv[i]*fabsl(sum)) >= big )
                {
                    big = dum;
                    imax = i;
                }
            }

            /* bring the pivot row into place */
            if(j != imax)
            {
                for(k = 1; k <= n; k++)
                {
                    dum = a[imax][k];
                    a[imax][k] = a[j][k];
                    a[j][k] = dum;
                }
                *d = -(*d);
                vv[imax] = vv[j];
            }
            indx[j] = imax;
            if(a[j][j] == 0.0) a[j][j] = tiny;

            /* divide the sub-diagonal part of column j by the pivot */
            if(j != n)
            {
                dum = 1.0/a[j][j];
                for(i = j+1; i <= n; i++)
                    a[i][j] *= dum;
            }
        }
}

/*
*       print_ldb_matrix():
*
*       Prints a rows x cols matrix of doubles to standard output, one
*       matrix row per line starting with row 0, framed by blank lines.
*
*       title:  optional heading printed on its own line; skipped when
*               NULL.
*       format: printf format applied to every element; each element is
*               passed as a double, so the conversion must accept one.
*/
EXPORT  void    print_ldb_matrix(
        const char      *title,
        int             rows,
        int             cols,
        double     **matrix,
        const char      *format)
{
        int             r, c;

        (void) printf("\n\n");
        if (NULL != title)
        {
            (void) printf("%s\n",title);
        }

        r = 0;
        while (r < rows)
        {
            c = 0;
            while (c < cols)
            {
                (void) printf(format,matrix[r][c]);
                ++c;
            }
            (void) printf("\n");
            ++r;
        }

        (void) printf("\n\n");
}

/*
*       matrix_vec_mult():
*
*       Computes ans = mat * vec, where mat has row rows and col columns,
*       vec has col entries and ans receives row entries.  Each ans[i] is
*       cleared and then accumulated in place, column by column.
*/
EXPORT void matrix_vec_mult(
        double    **mat,
        double    *vec,
        int      row,
        int      col,
        double    *ans)
{
        int      r, c;

        r = 0;
        while(r < row)
        {
            ans[r] = 0.0;
            c = 0;
            while(c < col)
            {
                ans[r] += mat[r][c]*vec[c];
                ++c;
            }
            ++r;
        }
}

/*
*       matrix_matrix_mult():
*
*       Computes ans = mat * matr.  The summation index runs over col, so
*       mat is read as row x col, matr as col x col and ans is written as
*       row x col.  Each ans[i][j] is cleared and then accumulated in
*       place.
*/
EXPORT void matrix_matrix_mult(
        double    **mat,
        double    **matr,
        int      row,
        int      col,
        double    **ans)
{
        int      r, c, inner;

        for(r = 0; r < row; ++r)
        {
            c = 0;
            while(c < col)
            {
                ans[r][c] = 0.0;
                for(inner = 0; inner < col; ++inner)
                {
                    ans[r][c] += mat[r][inner]*matr[inner][c];
                }
                ++c;
            }
        }
}

/*
*       trans_matrix():
*
*       Writes the transpose of the row x col matrix mat into tra, which
*       is addressed as col x row:  tra[j][i] = mat[i][j].
*/
EXPORT void trans_matrix(
            double **mat,
            int  row,
            int  col,
            double **tra)
{
            int r, c;

            r = 0;
            while(r < row)
            {
                c = 0;
                while(c < col)
                {
                    tra[c][r] = mat[r][c];
                    ++c;
                }
                ++r;
            }
}

/*
*       d_trans_matrix():
*
*       Transpose of a row x col matrix of doubles: every mat[i][j] is
*       copied to tra[j][i], so tra must be addressable as col x row.
*       The body performs the same copy as trans_matrix().
*/
EXPORT void d_trans_matrix(
            double **mat,
            int  row,
            int  col,
            double **tra)
{
            int src_row, src_col;

            for(src_row = 0; src_row < row; ++src_row)
            {
                double *from = mat[src_row];

                for(src_col = 0; src_col < col; ++src_col)
                    tra[src_col][src_row] = from[src_col];
            }
}


/*
*       vh_val():
*
*       Value at the point crds of the indx-th monomial in the offsets
*       dx = crds[0]-cent[0], dy = crds[1]-cent[1]:
*
*           indx:  0   1   2   3     4      5     6     7        8        9
*           value: 1   dx  dy  dx^2  dx*dy  dy^2  dx^3  dx^2*dy  dx*dy^2  dy^3
*
*       Any other indx prints an error and calls clean_up(ERROR).  ans is
*       initialized so that a defined value (0) is returned should
*       clean_up() ever return; it used to be returned uninitialized on
*       that path.
*/
EXPORT float vh_val(
        float *crds,
        double *cent,
        int   indx)
{
        float  ans = 0.0;
        double dx = crds[0]-cent[0];
        double dy = crds[1]-cent[1];

        switch(indx)
        {
        case 0:
            ans = 1.0;
            break;
        case 1:
            ans = dx;
            break;
        case 2:
            ans = dy;
            break;
        case 3:
            ans = sqr(dx);
            break;
        case 4:
            ans = (dx)*(dy);
            break;
        case 5:
            ans = sqr(dy);
            break;
        case 6:
            ans = cub(dx);
            break;
        case 7:
            ans = sqr(dx)*(dy);
            break;
        case 8:
            ans = (dx)*sqr(dy);
            break;
        case 9:
            ans = cub(dy);
            break;
        default:
            printf("ERROR vh_val, implement 2D degree %d\n", indx);
            clean_up(ERROR);
        }
        return ans;
}

/*
*       vh_val_loc_div_free_basis():
*
*       Same monomial table as vh_val(), but in the scaled offsets
*       xs = (crds[0]-cent[0])/sqrt_area, ys = (crds[1]-cent[1])/sqrt_area:
*
*           indx:  0   1   2   3     4      5     6     7        8        9
*           value: 1   xs  ys  xs^2  xs*ys  ys^2  xs^3  xs^2*ys  xs*ys^2  ys^3
*
*       The scaled offsets are only formed for indx 1..9, so indx 0 and
*       the error path perform no division.  Any other indx prints an
*       error and calls clean_up(ERROR).  ans is initialized so that a
*       defined value (0) is returned should clean_up() ever return; it
*       used to be returned uninitialized on that path.
*/
EXPORT float vh_val_loc_div_free_basis(
        float *crds,
        double *cent,
        double sqrt_area,
        int   indx)
{
        double ans = 0.0;
        double xs = 0.0, ys = 0.0;

        if(indx >= 1 && indx <= 9)
        {
            xs = (crds[0]-cent[0])/sqrt_area;
            ys = (crds[1]-cent[1])/sqrt_area;
        }

        switch(indx)
        {
        case 0:
            ans = 1.0;
            break;
        case 1:
            ans = xs;
            break;
        case 2:
            ans = ys;
            break;
        case 3:
            ans = sqr(xs);
            break;
        case 4:
            ans = (xs)*(ys);
            break;
        case 5:
            ans = sqr(ys);
            break;
        case 6:
            ans = cub(xs);
            break;
        case 7:
            ans = sqr(xs)*(ys);
            break;
        case 8:
            ans = (xs)*sqr(ys);
            break;
        case 9:
            ans = cub(ys);
            break;
        default:
            printf("ERROR vh_val_loc_div_free_basis, implement 2D degree %d\n", indx);
            clean_up(ERROR);
        }
        return ans;
}


/*
*       find_corres_tri():
*
*       Walks every triangle of every surface of mesh and returns the
*       first one whose fg_centroid() lies within
*       0.0001*min(h[0],h[1]) of cn in both coordinates, where h is the
*       spacing array of the mesh's computational grid.  Returns NULL
*       when no triangle matches.
*/
EXPORT TRI *find_corres_tri(
        float     *cn,
        INTERFACE *mesh)
{
        TRI      *tri;
        SURFACE  **surf = mesh->surfaces;
        float    *h = computational_grid(mesh)->h;
        double    *cent;

        while(surf && *surf)
        {
            tri = first_tri(*surf);
            while(!at_end_of_tri_list(tri,*surf))
            {
                cent = fg_centroid(tri);
                if(fabs(cent[0]-cn[0]) < min(h[0],h[1])*0.0001)
                {
                    if(fabs(cent[1]-cn[1]) < min(h[0],h[1])*0.0001)
                        return tri;
                }
                tri = tri->next;
            }
            surf++;
        }

        return NULL;
}

// gauss-Jordan elimination with pivoting
/*
*       gaussj():
*
*       Gauss-Jordan elimination with full pivoting, working in place on
*       arrays indexed from 1:
*
*       a: n x n matrix, a[1..n][1..n].  The pivot entry is overwritten
*          with 1 before its row is scaled and each eliminated entry with
*          0 before its row is reduced, which builds the inverse in place;
*          inverse_matrix_gj() reads the inverse back from a.
*       b: n x m right hand sides, b[1..n][1..m].  They undergo the same
*          row operations; solve_by_gj() reads the solution back from b.
*
*       A zero pivot prints an error and calls clean_up(ERROR).
*
*       The bookkeeping arrays indxc, indxr and ipiv are static and are
*       reallocated whenever n differs from the previous call.  SWAP()
*       relies on the local variable temp.
*
*       NOTE(review): there is no check for a column being chosen as
*       pivot a second time (ipiv[k] > 1) -- confirm this is intended.
*/
LOCAL void gaussj(
	long double **a,
        int         n,
        long double **b,
        int         m)
{
	static int size_ = 0,  *indxc = NULL, *indxr, *ipiv;
	int   i, icol, irow, j, k , l, ll;
        long double big, dum, pivinv, temp;

        if(n != size_)
        {
            if(indxc != NULL)
            {
                free(indxc); free(indxr); free(ipiv);
            }
            vector(&indxc, (n+1),  sizeof(int));
            vector(&indxr, (n+1),  sizeof(int));
            vector(&ipiv, (n+1),  sizeof(int));
            size_ = n;
        }

        /* ipiv[k] counts how often column k has been used as a pivot */
        for(i = 1; i <= n; i++) ipiv[i] = 0;
        for(i = 1; i <= n; i++)
        {
            /* search the not-yet-reduced rows and columns for the entry
               of largest magnitude; with >= the last of equal candidates
               wins */
            big = 0.0;
            for(j = 1; j <= n; j++)
            {
                if(ipiv[j] != 1)
                {
                    for(k =1; k<=n; k++)
                    {
                        if(ipiv[k] == 0)
                        {
                            if(fabsl(a[j][k]) >= big)
                            {
                                big = fabsl(a[j][k]);
                                irow = j;
                                icol = k;
                            }  
                        }
                    } 
                }
            }
            ++(ipiv[icol]);

            /* move the pivot onto the diagonal by exchanging rows */
            if(irow != icol)
            {
                for(l = 1; l <= n; l++) SWAP(a[irow][l], a[icol][l]);
                for(l = 1; l <= m; l++) SWAP(b[irow][l], b[icol][l]);
            } 
            /* remember the exchange so the columns of a can be put back
               in order at the end */
            indxr[i] = irow;
            indxc[i] = icol;
            if(a[icol][icol] == 0.0)
            {
                printf("ERROR() gaussj, main ele = 0.0 at icol %d\n", icol);
                clean_up(ERROR);
            }
            /* scale the pivot row; the pivot itself is first set to 1 so
               that it ends up holding 1/pivot */
            pivinv = 1.0/a[icol][icol];
            a[icol][icol] = 1.0;
            for(l = 1; l <= n; l++) a[icol][l] *= pivinv;
            for(l = 1; l <= m; l++) b[icol][l] *= pivinv;
            /* subtract multiples of the pivot row from every other row */
            for(ll= 1; ll <= n; ll++)
            {
                if(ll != icol)
                {
                    dum = a[ll][icol];
                    a[ll][icol] = 0.0;
                    for(l = 1; l <= n; l++) a[ll][l] -= a[icol][l]*dum;
                    for(l = 1; l <= m; l++) b[ll][l] -= b[icol][l]*dum;
                }
            }
        }

        /* undo the row exchanges on a by exchanging its columns in the
           reverse order */
        for(l =n; l >=1; l--)
        {
            if(indxr[l] != indxc[l])
            {
                for(k =1; k <= n; k++)
                    SWAP(a[k][indxr[l]], a[k][indxc[l]]);
            }
        } 
}

// Inverse by Gauss-Jordan elimination, delegating the work to gaussj().
//
//   mat  : size x size input matrix, 0-based; only read here
//   size : dimension of mat and inv
//   inv  : size x size output, 0-based; receives the contents gaussj()
//          leaves in its first (matrix) argument
//
// The scratch arrays a and b are static, 1-based (hence size+1) and are
// reallocated only when a different size is requested.  b merely supplies
// the two right-hand-side columns handed to gaussj(); the result is read
// back from a.
LOCAL void  inverse_matrix_gj(
        double      **mat,
        int        size,
        double      **inv)
{
        static int          cur_size = 0;
        static long double  **b = NULL, **a;
        int                 r, c;

        /* Scalar case: invert directly, no scratch storage required. */
        if(size == 1)
        {
            inv[0][0] = 1.0/mat[0][0];
            return;
        }

        /* Resize the cached work arrays when the dimension changes. */
        if(cur_size != size)
        {
            if(NULL != b)
            {
                free(a); free(b);
            }
            cur_size = size;
            matrix(&a, (size+1), (size+1), sizeof(long double));
            matrix(&b, (size+1), (size+1), sizeof(long double));
        }

        /* Load the 1-based copy of mat and clear both right-hand-side columns. */
        for(r = 1; r <= size; r++)
        {
            for(c = 1; c <= size; c++)
                a[r][c] = mat[r-1][c-1];
            b[r][1] = 0.0;
            b[r][2] = 0.0;
        }
        b[1][1] = 1.0;
        b[2][2] = 1.0;

        gaussj(a,size,b,2);

        /* Copy the result back into the caller's 0-based storage. */
        for(r = 0; r < size; r++)
        {
            for(c = 0; c < size; c++)
                inv[r][c] = a[r+1][c+1];
        }
}


/*
 * 16-point quadrature nodes on a triangle (noted by the original author as
 * exact for polynomials of degree 8).
 * Source of the rule: http://electromagnetics.biz/Integration.htm (Gaussian)
 *
 *   pcrds0, pcrds1, pcrds2 : (x,y) of the three triangle vertices
 *   crds                   : output, at least 16 rows; row n receives the
 *                            physical (x,y) of node n
 *
 * Each reference node (s,t) is mapped to  p0 + (p1-p0)*s + (p2-p0)*t.
 * Only node locations are produced here; no weights are returned.
 */
EXPORT void tri_quadrature_16_pts(
        float       *pcrds0,
        float       *pcrds1,
        float       *pcrds2,
        float       crds[][2])
{
        /* Reference-triangle nodes stored as {s, t} pairs. */
        static const float ref_pt[16][2] = {
            {0.33333333333333333,  0.33333333333333333},
            {0.081414823414554,    0.459292588292723},
            {0.459292588292723,    0.459292588292723},
            {0.459292588292723,    0.081414823414554},
            {0.65886138449648,     0.17056930775176},
            {0.17056930775176,     0.17056930775176},
            {0.17056930775176,     0.65886138449648},
            {0.898905543365938,    0.050547228317031},
            {0.050547228317031,    0.050547228317031},
            {0.050547228317031,    0.898905543365938},
            {0.008394777409958001, 0.263112829634638},
            {0.263112829634638,    0.728492392955404},
            {0.728492392955404,    0.008394777409958001},
            {0.263112829634638,    0.008394777409958001},
            {0.728492392955404,    0.263112829634638},
            {0.008394777409958001, 0.728492392955404}
        };
        int n, k;

        for(n = 0; n < 16; n++)
        {
            for(k = 0; k < 2; k++)
            {
                crds[n][k] = pcrds0[k] + (pcrds1[k]-pcrds0[k])*ref_pt[n][0]
                                       + (pcrds2[k]-pcrds0[k])*ref_pt[n][1];
            }
        }
}

/*
 * 13-point quadrature nodes on the triangle (pcrds0, pcrds1, pcrds2).
 *
 *   pcrds0..2 : (x,y) of the three vertices
 *   crds      : output, at least 13 rows of (x,y)
 *
 * Node layout:
 *   0      centroid
 *   1-3    weight b on one vertex, a on the other two
 *   4-6    weight g on one vertex, f on the other two
 *   7-12   the six arrangements of (c, d, e) over the three vertices
 *
 * Only locations are produced.  The companion weights recorded by the
 * original author (not applied here) are
 *   w1 = -0.149570044467670  (node 0)
 *   w2 =  0.053347235608839  (nodes 1-3)
 *   w3 =  0.175615257433204  (nodes 4-6)
 *   w4 =  0.077113760890257  (nodes 7-12)
 * i.e. the quadrature is w1*f(0) + w2*sum f(1..3) + w3*sum f(4..6)
 * + w4*sum f(7..12).
 */
EXPORT void tri_quadrature_13_pts(
	float       *pcrds0,
        float       *pcrds1,
        float       *pcrds2,
        float       crds[][2])
{
        static const float a = 0.065130102902216, b = 0.869739794195568;
        static const float c = 0.312865496004875, d = 0.638444188569809;
        static const float e = 0.048690315425316, f = 0.260345966079038, g = 0.479308067841923;
        const float third = 1.0/3.0;
        int k;

        /* Same formulas for x (k = 0) and y (k = 1). */
        for(k = 0; k < 2; k++)
        {
            crds[0][k]  = (pcrds0[k] + pcrds1[k] + pcrds2[k]) * third;

            crds[1][k]  = b*pcrds0[k] + a*(pcrds1[k] + pcrds2[k]);
            crds[2][k]  = b*pcrds1[k] + a*(pcrds0[k] + pcrds2[k]);
            crds[3][k]  = b*pcrds2[k] + a*(pcrds0[k] + pcrds1[k]);

            crds[4][k]  = g*pcrds0[k] + f*(pcrds1[k] + pcrds2[k]);
            crds[5][k]  = g*pcrds1[k] + f*(pcrds0[k] + pcrds2[k]);
            crds[6][k]  = g*pcrds2[k] + f*(pcrds1[k] + pcrds0[k]);

            crds[7][k]  = c*pcrds0[k] + d*pcrds1[k] + e*pcrds2[k];
            crds[8][k]  = d*pcrds0[k] + c*pcrds1[k] + e*pcrds2[k];
            crds[9][k]  = d*pcrds0[k] + e*pcrds1[k] + c*pcrds2[k];
            crds[10][k] = e*pcrds0[k] + d*pcrds1[k] + c*pcrds2[k];
            crds[11][k] = e*pcrds0[k] + c*pcrds1[k] + d*pcrds2[k];
            crds[12][k] = c*pcrds0[k] + e*pcrds1[k] + d*pcrds2[k];
        }
}

// 7 sample points on a triangle, stored in this order:
//   rows 0-2 : the vertices pcrds0, pcrds1, pcrds2
//   rows 3-5 : midpoints of edges (0,1), (1,2), (2,0)
//   row  6   : the point supplied in cent (the centroid, per the caller)
// crds must provide at least 7 rows of (x,y).
EXPORT void tri_quadrature_7_pts(
        float       *pcrds0,
        float       *pcrds1,
        float       *pcrds2,
        double *cent,
        float       crds[][2])
{
        int k;

        /* Handle the x (k = 0) and y (k = 1) components in one pass. */
        for(k = 0; k < 2; k++)
        {
            crds[0][k] = pcrds0[k];
            crds[1][k] = pcrds1[k];
            crds[2][k] = pcrds2[k];

            crds[3][k] = 0.5*(pcrds0[k] + pcrds1[k]);
            crds[4][k] = 0.5*(pcrds1[k] + pcrds2[k]);
            crds[5][k] = 0.5*(pcrds2[k] + pcrds0[k]);

            crds[6][k] = cent[k];
        }
}


// 3 sample points on a triangle: the edge midpoints, in the order
//   row 0 : edge (pcrds0, pcrds1)
//   row 1 : edge (pcrds1, pcrds2)
//   row 2 : edge (pcrds2, pcrds0)
// cent is accepted for signature parity with the 7-point variant but is
// not read.  crds must provide at least 3 rows of (x,y).
EXPORT void tri_quadrature_3_pts(
        float       *pcrds0,
        float       *pcrds1,
        float       *pcrds2,
        double      *cent,
        float       crds[][2])
{
        float *vtx[3];
        int   edge, k;

        vtx[0] = pcrds0;
        vtx[1] = pcrds1;
        vtx[2] = pcrds2;

        /* Edge number `edge` joins vertex edge to vertex (edge+1) mod 3. */
        for(edge = 0; edge < 3; edge++)
        {
            for(k = 0; k < 2; k++)
                crds[edge][k] = 0.5*(vtx[edge][k] + vtx[(edge+1)%3][k]);
        }
}

/*
 * Print a one-line summary of a triangle (id, centroid, boundary-condition
 * type name) followed by the coordinates of its three vertices.
 */
EXPORT void print_tri_crds(
        TRI *tri)
{
        int v;

        printf("Triangle(%d) ceontriod (%13.12g, %13.12g), BC type = %s\n", tri->id,
                fg_centroid(tri)[0], fg_centroid(tri)[1], print_TRI_BC_TYPE(tri->BC_type));

        /* The last vertex line gets a different terminator than the first two. */
        for(v = 0; v < 3; v++)
        {
            print_general_vector("Tri_pt", Coords(Point_of_tri(tri)[v]), 2,
                                 (v == 2) ? ";\n\n" : "\n");
        }
}

/*
 * Return YES when the triangle's BC_type is one of NEUMANN, IN_FLOW,
 * OUT_FLOW or CONST_P; return NO for every other value.
 */
EXPORT int tri_on_phy_bdry(
        TRI      *tri)
{
        switch(tri->BC_type)
        {
        case NEUMANN:
        case IN_FLOW:
        case OUT_FLOW:
        case CONST_P:
            return YES;
        default:
            return NO;
        }
}

/*
 * Map a triangle boundary-condition code to its printable name.
 * Codes not listed below yield "UNKNOWN TYPE".  The returned pointer
 * refers to a string literal and must not be modified or freed.
 */
EXPORT char *print_TRI_BC_TYPE(int BC_type)
{
     char *label;

     switch(BC_type)
     {
     case NEUMANN:
         label = "NEUMANN";
         break;
     case IN_FLOW:
         label = "IN_FLOW";
         break;
     case OUT_FLOW:
         label = "OUT_FLOW";
         break;
     case CONST_P:
         label = "CONST_P";
         break;
     case SUBDOMAIN:
         label = "SUBDOMAIN";
         break;
     case ON_SUBDOMAIN:
         label = "ON_SUBDOMAIN";
         break;
     default:
         label = "UNKNOWN TYPE";
         break;
     }

     return label;
}

/*
 * Print a one-line summary of a polygon (id, centroid, closed flag)
 * followed by one labelled coordinate line per vertex and a blank line.
 */
EXPORT void print_polyg_crds(
        POLYGON *polyg)
{
        char  label[1024];
        int   side = 0;

        printf("Polygon(%d) ceontriod (%g, %g), closed = %d\n", polyg->id,
                polyg_centroid(polyg)[0], polyg_centroid(polyg)[1], polyg->closed);

        while(side < polyg->n_sides)
        {
            /* Label each vertex with its index, e.g. "Polyg_pt[3]". */
            sprintf(label, "Polyg_pt[%d]", side);
            print_general_vector(label, Coords(Point_of_polyg(polyg)[side]), 2, "\n");
            side++;
        }

        printf("\n");
}

/*
 * Evaluate one monomial of the centred polynomial basis at a sample point.
 *
 *   crds : table of sample points, crds[pos] = (x, y)
 *   cent : (x, y) of the centre the monomials are expanded about
 *   pos  : row of crds to evaluate at
 *   indx : basis index, 0..14
 *
 * With dx = x - cent[0] and dy = y - cent[1] the basis is ordered as
 *    0: 1
 *    1: dx       2: dy
 *    3: dx^2     4: dx*dy     5: dy^2
 *    6: dx^3     7: dx^2*dy   8: dx*dy^2      9: dy^3
 *   10: dx^4    11: dx^3*dy  12: dx^2*dy^2   13: dx*dy^3   14: dy^4
 *
 * Any other indx prints "ERROR: B_val" and calls clean_up(ERROR).  A value
 * is still returned afterwards so that control never runs off the end of
 * this non-void function should clean_up() ever return.
 */
LOCAL long double B_val(
        float crds[][2],
        double   *cent,
        int   pos,
        int   indx)
{
        long double dx, dy;

        /* The constant basis function does not depend on the sample point. */
        if(indx == 0)
            return 1.0;

        dx = crds[pos][0] - cent[0];
        dy = crds[pos][1] - cent[1];

	switch(indx)
        {
        case 1:
            return dx;
        case 2:
            return dy;
        case 3:
            return sqr(dx);
        case 4:
            return dx*dy;
        case 5:
            return sqr(dy);
        case 6:
            return cub(dx);
        case 7:
            return (sqr(dx)*dy);
        case 8:
            return (dx*sqr(dy));
        case 9:
            return cub(dy);
        case 10:
            return sqr(dx)*sqr(dx);
        case 11:
            return cub(dx)*dy;
        case 12:
            return sqr(dx)*sqr(dy);
        case 13:
            return (dx)*cub(dy);
        case 14:
            return sqr(dy)*sqr(dy);
        default:
            break;
        }

        /* Unsupported basis index. */
        printf("ERROR: B_val\n");
        clean_up(ERROR);
        return 0.0;
}

/*
 * Return YES when the triangle's centroid lies strictly outside the
 * axis-aligned rectangle [L[0],U[0]] x [L[1],U[1]]; NO otherwise
 * (points on the rectangle boundary count as inside).
 */
EXPORT bool tri_out_rect(
        TRI             *tri,
        float           *L,
        float           *U)
{
        float           *cent = fg_centroid(tri);
        int             k;

        /* Test x first, then y. */
        for(k = 0; k < 2; k++)
        {
            if(cent[k] < L[k] || cent[k] > U[k])
                return YES;
        }
        return NO;
}

/*
 * Return YES when the polygon's centroid lies strictly outside the
 * axis-aligned rectangle [L[0],U[0]] x [L[1],U[1]]; NO otherwise
 * (points on the rectangle boundary count as inside).
 */
EXPORT bool polyg_out_rect(
        POLYGON         *polyg,
        float           *L,
        float           *U)
{
        float           *cent = polyg_centroid(polyg);
        int             k;

        /* Test x first, then y. */
        for(k = 0; k < 2; k++)
        {
            if(cent[k] < L[k] || cent[k] > U[k])
                return YES;
        }
        return NO;
}

/*
 * Return YES if point lies on any edge of the closed polygon whose N
 * vertices are pt[0..N-1] (edge i joins pt[i] to pt[(i+1) mod N]),
 * as decided by point_on_line_seg_2d(); NO otherwise.
 */
EXPORT int  Point_on_polygon_bdry(
        POINT      **pt,
        int        N,
        double     *point)
{
        int        cur, nxt;
        double     *seg_start, *seg_end;

        for(cur = 0; cur < N; cur++)
        {
            /* Wrap around so the last vertex connects back to the first. */
            nxt = (cur + 1) % N;

            seg_start = Coords(pt[cur]);
            seg_end   = Coords(pt[nxt]);

            if(YES == point_on_line_seg_2d(seg_start, seg_end, point, NO))
                return YES;
        }

        return NO;
}

/*
 * Crossing-count point-in-polygon test.
 *
 *   pt    : the N polygon vertices, in order
 *   N     : number of vertices
 *   point : (x, y) of the query point
 *
 * For every edge (prev, cur) a crossing is counted when the query point's
 * y lies in the half-open range (min y, max y] of the edge, its x does not
 * exceed the larger edge x, the edge is not horizontal, and the point is
 * at or to the left of the edge at that height.  Returns YES for an odd
 * number of crossings, NO for an even number.
 */
EXPORT int  Point_in_polygon_2d(
        POINT      **pt,
        int        N,
        double     *point)
{
        int     crossings = 0;
        int     i;
        double  xinters;
        POINT   *prev, *cur;

        prev = pt[0];
        for (i = 1; i <= N; i++)
        {
            cur = pt[i % N];

            if (point[1] >  min(Coords(prev)[1],Coords(cur)[1]) &&
                point[1] <= max(Coords(prev)[1],Coords(cur)[1]) &&
                point[0] <= max(Coords(prev)[0],Coords(cur)[0]) &&
                Coords(prev)[1] != Coords(cur)[1])
            {
                /* x where the edge meets the horizontal line through point. */
                xinters = (point[1]-Coords(prev)[1])*(Coords(cur)[0]-Coords(prev)[0])/
                          (Coords(cur)[1]-Coords(prev)[1])+Coords(prev)[0];
                if (Coords(prev)[0] == Coords(cur)[0] || point[0] <= xinters)
                    crossings++;
            }

            prev = cur;
        }

        return (crossings % 2 == 0) ? NO : YES;
}

/// Compute local P1 basis for triangle
///
///   x, y             : coordinates of the 3 triangle nodes
///   phi0, phi1, phi2 : outputs, 3 entries each; phiK receives column K of
///                      invA
///
/// A has rows (1, x[i], y[i]).  Assuming inverse_matrix() stores A^{-1} in
/// invA, column K holds the coefficients (c0, c1, c2) of the linear function
/// c0 + c1*x + c2*y that is 1 at node K and 0 at the other two nodes.
/// The work matrices are allocated once and reused across calls.
EXPORT void C0_conformal_basis_tri(
        double     *x,
        double     *y,
        double     *phi0,
        double     *phi1,
        double     *phi2)
{
        static double  **A = NULL, **invA;
        int        r;

        if(A == NULL)
        {
            matrix(&A, 3, 3, sizeof(double));
            matrix(&invA, 3, 3, sizeof(double));
            /* The constant column never changes; fill it only once. */
            for(r = 0; r < 3; r++)
                A[r][0] = 1.0;
        }

        for(r = 0; r < 3; r++)
        {
            A[r][1] = x[r];
            A[r][2] = y[r];
        }

        inverse_matrix(A, 3, invA);

        for(r = 0; r < 3; r++)
        {
            phi0[r] = invA[r][0];
            phi1[r] = invA[r][1];
            phi2[r] = invA[r][2];
        }
}

/// Compute local P2 basis for triangle
///
///   x, y : coordinates of the 6 nodes
///   phi  : 6 x 6 output; phi[node][m] = invA[m][node]
///
/// A has rows (1, x, y, x^2, x*y, y^2) evaluated at each node.  Assuming
/// inverse_matrix() stores A^{-1} in invA, phi[node][0..5] are the
/// coefficients, in that monomial order, of the quadratic that is 1 at
/// `node` and 0 at the other five nodes.
/// The work matrices are allocated once and reused across calls.
EXPORT void C0_conformal_basis_tri_P2(
        double     *x,
        double     *y,
        double     **phi)
{
        static double  **A = NULL, **invA;
        int        node, m;

        if(A == NULL)
        {
            matrix(&A, 6, 6, sizeof(double));
            matrix(&invA, 6, 6, sizeof(double));
            /* The constant column never changes; fill it only once. */
            for(node = 0; node < 6; node++)
                A[node][0] = 1.0;
        }

        for(node = 0; node < 6; node++)
        {
            A[node][1] = x[node];
            A[node][2] = y[node];
            A[node][3] = sqr(x[node]);
            A[node][4] = x[node]*y[node];
            A[node][5] = sqr(y[node]);
        }

        inverse_matrix(A, 6, invA);

        /* Store the transpose so each row of phi belongs to one node. */
        for(node = 0; node < 6; node++)
        {
            for(m = 0; m < 6; m++)
                phi[node][m] = invA[m][node];
        }
}

/// Fill polyg->conformal_basis_tri with the local C0 basis coefficients of
/// every triangular sub-region of a dual cell.  Sub-region i is spanned by
/// the cell centroid and edge i of the polygon, i.e. the vertices
/// Point_of_polyg(polyg)[i] and Point_of_polyg(polyg)[(i+1) % n_sides].
///
/// Local node order within a sub-region:
///   P1 (C0_MAX_N_COEF == 3):
///     0 - dual-cell centroid
///     1 - starting node of edge i
///     2 - starting node of edge i+1
///   P2 (C0_MAX_N_COEF == 6):
///     0 - dual-cell centroid
///     2 - starting node of edge i
///     4 - starting node of edge i+1
///     1, 3, 5 - midpoints between nodes (0,2), (2,4) and (4,0)
///
/// On return conformal_basis_tri[i][node][k] is coefficient k of the basis
/// function attached to local node `node` of sub-region i.  Nothing is
/// written when C0_MAX_N_COEF is neither 3 nor 6.
///
/// The scratch buffers hold one sub-region at a time, so the number of
/// polygon sides is not limited by any local array size.
EXPORT void comp_C0_conformal_basis_polygon(
        POLYGON   *polyg)
{
        int        i, j, k;
        POINT      *p0, *p1;
        double     x_crds[6], y_crds[6];
        double     phi0[C0_MAX_N_COEF], phi1[C0_MAX_N_COEF], phi2[C0_MAX_N_COEF];
        double     ***conformal_basis_tri = polyg->conformal_basis_tri;
        double     **tmp_phi;

        if(C0_MAX_N_COEF == 3)
        {
            /* Node 0 of every sub-region is the cell centroid. */
            x_crds[0] = polyg_centroid(polyg)[0];
            y_crds[0] = polyg_centroid(polyg)[1];

            for(i = 0; i < polyg->n_sides; i++)
            {
                p0 = Point_of_polyg(polyg)[i];
                p1 = Point_of_polyg(polyg)[(i+1)%polyg->n_sides];

                x_crds[1] = Coords(p0)[0];
                y_crds[1] = Coords(p0)[1];

                x_crds[2] = Coords(p1)[0];
                y_crds[2] = Coords(p1)[1];

                C0_conformal_basis_tri(x_crds,y_crds,phi0,phi1,phi2);

                for(k = 0; k < C0_MAX_N_COEF; k++)
                {
                    conformal_basis_tri[i][0][k] = phi0[k];
                    conformal_basis_tri[i][1][k] = phi1[k];
                    conformal_basis_tri[i][2][k] = phi2[k];
                }
            }
        }
        else if(C0_MAX_N_COEF == 6)
        {
            matrix(&tmp_phi, C0_MAX_N_COEF, C0_MAX_N_COEF, sizeof(double));

            /* Node 0 of every sub-region is the cell centroid. */
            x_crds[0] = polyg_centroid(polyg)[0];
            y_crds[0] = polyg_centroid(polyg)[1];

            for(i = 0; i < polyg->n_sides; i++)
            {
                p0 = Point_of_polyg(polyg)[i];
                p1 = Point_of_polyg(polyg)[(i+1)%polyg->n_sides];

                /* Corner nodes 2 and 4. */
                x_crds[2] = Coords(p0)[0];
                y_crds[2] = Coords(p0)[1];

                x_crds[4] = Coords(p1)[0];
                y_crds[4] = Coords(p1)[1];

                /* Midpoint nodes 1, 3 and 5. */
                x_crds[1] = 0.5*(x_crds[0] + x_crds[2]);
                y_crds[1] = 0.5*(y_crds[0] + y_crds[2]);

                x_crds[3] = 0.5*(x_crds[4] + x_crds[2]);
                y_crds[3] = 0.5*(y_crds[4] + y_crds[2]);

                x_crds[5] = 0.5*(x_crds[4] + x_crds[0]);
                y_crds[5] = 0.5*(y_crds[4] + y_crds[0]);

                C0_conformal_basis_tri_P2(x_crds,y_crds, tmp_phi);

                for(j = 0; j < 6; j++) // nodes
                {
                    for(k = 0; k < C0_MAX_N_COEF; k++)
                        conformal_basis_tri[i][j][k] = tmp_phi[j][k];
                }
            }

            free(tmp_phi);
        }
}


#endif /* #if defined(TWOD) */




#if defined(PETSC)
/// This is to solve ax = B on local machine. No parallel is needed.
///
/// Solves the dim x dim linear system a*x = B through PETSc's KSP interface
/// and copies the solution into ans.
///
///   a   : system matrix, read as a[i][j] with i the row and j the column
///   B   : right-hand side, dim entries
///   dim : number of unknowns
///   ans : output, receives the dim solution entries
///
/// Strategy: a first solve uses an LU preconditioner (with the shift options
/// set below).  If KSP reports a negative converged reason, a second solve is
/// attempted with CG + ICC, started from the x left by the first attempt.
/// If that also fails, diagnostics are printed and clean_up(ERROR) is called.
/// When debugging("petsc_solver") is on, the matrix, right-hand side,
/// residual norm / iteration count and the product A*x are printed.
///
/// NOTE(review): every PETSc object here is created on PETSC_COMM_WORLD even
/// though the line above says no parallelism is needed -- confirm whether
/// PETSC_COMM_SELF was intended.
/// NOTE(review): the Destroy/KSPSetOperators/PCFactorSetShift* call forms
/// look like an older PETSc API -- confirm the PETSc version this targets.
EXPORT void  petsc_linear_solver(
        double    **a,
        double    *B,
        int       dim,
        double    *ans)
{
        KSP                  ksp;
        Mat                  A; // linear system matrix
        Vec                  x, b, u;  /* approx solution, RHS, exact solution */
                                       /* (u is only used below to hold A*x in debug output) */
        PetscScalar          *xx;
        MatNullSpace         nullsp;   /* declared but not used in this function */
        PC                   pc;
        int                  its, i, j;
        PetscReal            norm;
        KSPConvergedReason   reason;
        double               tmp;

        /* Create the solution vector and clone it for the RHS and the debug product. */
        VecCreate(PETSC_COMM_WORLD,&x);
        PetscObjectSetName((PetscObject) x, "Solution");
        VecSetSizes(x,PETSC_DECIDE,dim);
        VecSetFromOptions(x);
        VecDuplicate(x,&b);
        VecDuplicate(x,&u);

        /*  Create matrix.  */
        MatCreate(PETSC_COMM_WORLD,&A);
        MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,dim,dim);
        MatSetFromOptions(A);

        /* set matrix value */
        /* Every entry is inserted individually, zeros included. */
        for(j = 0; j < dim; j++) // col
        {
            for(i = 0; i < dim; i++) // row
            {
                tmp = a[i][j];
                MatSetValues(A,1,&i,1,&j,&tmp,INSERT_VALUES);
            }
        }
        MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
        MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);

        // TMP
        if(debugging("petsc_solver"))
        {
            printf("\n********************Show Matrix A\n\n");
            MatView(A,PETSC_VIEWER_STDOUT_WORLD);
        }

        /* set RHS */
        for(j = 0; j < dim; j++)
        {
            tmp = B[j];
            VecSetValues(b,1,&j,&tmp,INSERT_VALUES);
        }
        VecAssemblyBegin(b);
        VecAssemblyEnd(b);

        // TMP
        if(debugging("petsc_solver"))
        {
            printf("\n********************Show Right side B\n\n");
            VecView(b,PETSC_VIEWER_STDOUT_WORLD);
        }


        /* First attempt: KSP with an LU preconditioner built from A itself. */
        KSPCreate(PETSC_COMM_WORLD,&ksp);
        KSPSetOperators(ksp,A,A,DIFFERENT_NONZERO_PATTERN);

        KSPGetPC(ksp,&pc);
        PCSetType(pc,PCLU);
        PCFactorSetShiftPd(pc,PETSC_TRUE);
        PCFactorSetShiftNonzero(pc,1.e-13);

        // KSPGetPC(ksp,&pc);
        // PCSetType(pc,PCSOR);

        /* Very tight tolerances; the last two arguments keep PETSc's defaults. */
        KSPSetTolerances(ksp,1.e-25,1.e-50,PETSC_DEFAULT,(PETSC_DEFAULT));
        KSPSetFromOptions(ksp);
        // KSPSetUp(ksp);

        KSPSolve(ksp,b,x);

        KSPGetIterationNumber(ksp,&its);
        KSPGetResidualNorm(ksp,&norm);

        /* A negative reason is treated as failure of the first attempt. */
        KSPGetConvergedReason(ksp,&reason);
        if(reason < 0)
        {
            printf("WARNING: petsc_linear_solver()***** failed, try again\n");
            printf("Norm of error %g, Iterations %d\n", norm,its);
            // KSPGetPC(ksp,&pc);
            // PCSetType(pc,PCNONE);
            // PCSetType(pc,PCILU);
            // PCFactorSetShiftPd(pc,PETSC_TRUE);
            // PCFactorSetShiftNonzero(pc,1.e-14);
            /* Second attempt: CG with ICC, starting from the current x. */
            KSPSetType(ksp,KSPCG);
            KSPSetInitialGuessNonzero(ksp,PETSC_TRUE);
 
            KSPGetPC(ksp,&pc);
            PCSetType(pc,PCICC);
            PCFactorSetShiftPd(pc,PETSC_TRUE);
 
            KSPSetTolerances(ksp,1.e-25,1.e-50,PETSC_DEFAULT,PETSC_DEFAULT);
            KSPSetFromOptions(ksp);

            KSPSetUp(ksp);

            KSPSolve(ksp,b,x);

            KSPGetIterationNumber(ksp,&its);
            KSPGetResidualNorm(ksp,&norm);

            KSPGetConvergedReason(ksp,&reason);

            /* Both attempts failed: report and abort through clean_up(). */
            if(reason < 0)
            {
                printf("ERROR: petsc_linear_solver()\n");
                printf("Ksp diverges, reason = %d\n", reason);
                PetscPrintf(PETSC_COMM_WORLD,"Failure to converge\n");
                PetscPrintf(PETSC_COMM_WORLD,"Norm of error %A, Iterations %D\n",
                      norm,its);
                printf("Norm of error %g, Iterations %d\n", norm,its);
                clean_up(ERROR);
            }
        }

        // TMP
        if(debugging("petsc_solver"))
        {
            PetscPrintf(PETSC_COMM_WORLD,"Norm of error %A, Iterations %D\n",
                          norm,its);
            printf("Norm of error %g, Iterations %d\n", norm,its);
        }

        /* Copy the solution out of the PETSc vector into the caller's array. */
        VecGetArray(x,&xx);
        for(j = 0; j < dim; j++)
            ans[j] = xx[j];
        VecRestoreArray(x,&xx);

        // TMP
        /* Debug check: print A*x next to the original right-hand side. */
        if(debugging("petsc_solver"))
        {
            MatMult(A,x,u);
            printf("\n********************Show Ax =\n");
            VecGetArray(u,&xx);
            for(j = 0; j < dim; j++)
                printf("%15.14g; old = %15.14g\n", xx[j], B[j]);
            VecRestoreArray(u,&xx);
        }

        /* Release all PETSc objects created above. */
        VecDestroy(x); VecDestroy(u);
        VecDestroy(b); MatDestroy(A);
        KSPDestroy(ksp);
}

// Build the inverse of A one column at a time: for each unit vector e_i
// solve A x_i = e_i with petsc_linear_solver(); x_i is column i of the
// inverse and is stored in invA[0..dim-1][i].
//
//   A    : dim x dim input matrix
//   invA : dim x dim output
//   dim  : matrix dimension
//
// The two work vectors are static and are reallocated only when dim differs
// from the size used on the previous call.
EXPORT void inverse_by_petsc(
	double    **A, 
        double    **invA,
	int       dim)
{
        int       col, row;
        static double  *e = NULL, *ans;
        static int     vsize;

        /* Drop work vectors left over from a call with a different size. */
        if(e != NULL && vsize != dim)
        {
            free(e); free(ans);
            e = NULL;
        }
        if(e == NULL)
        {
            vector(&e,dim,sizeof(double));
            vector(&ans,dim,sizeof(double));
            vsize = dim;
        }

        /* Start from the zero vector; exactly one entry is raised per solve. */
        for(row = 0; row < dim; row++)
            e[row] = 0.0;

        for(col = 0; col < dim; col++)
        {
            e[col] = 1.0;
            petsc_linear_solver(A,e,dim,ans);
            for(row = 0; row < dim; row++)
                invA[row][col] = ans[row];
            e[col] = 0.0;   /* restore the zero vector for the next column */
        }
}

#endif /* if defined(PETSC) */


