/*
*				toblas:
*
*	Copyright 1999 by The University at Stony Brook, All rights reserved.
*
*	These routines provide a set of low-level efficient operations
*	on vectors.     
*
*	These routines are single-precision - all vector and scalar
*	arguments are single precision quantities.   In addition for
*	each routine there is a double-precision version, with name
*	prefixed by  'd' .  For these routines all scalar and vector
*	arguments are double precision.  The latter routines are not
*	very efficient.   There are also a few routines that mix
*	the precision of their arguments.
*
*	On each machine, these routines will be as efficient as possible,
*	taking advantage of parallel or vector instructions, or other
*	special features.    For each routine there is a corresponding
*	version, with name preceded by  c_  which is written in
*	straight C and may be regarded as the definition of the routine.
*
*	There is also a test_program which calls the fast and slow versions
*	of each of the routines, and prints the norm of the differences.
*
*
*	Currently supported routines are:
*
*
*	zero_vector(n,x)
*		x[i] = 0.	
*
*	vector_equals_scalar(n,v,a)
*		v[i] = a
*
*	sum_elements_of_vector(n,v,sum)
*		float *sum = sum of v[i]
*
*	float norm_vector(n,v)
*		return sqrt(v.v)
*
*	add_scalar_to_vector(n,a,v)
*		v[i] = v[i] + a
*
*	multiply_vector_by_scalar(n,v,a)
*		v[i] = a*v[i]
*
*	min_max_vector(n,v,vmin,vmax)
*		float *vmin = min(v[i])
*		float *vmax = max(v[i])
*
*	copy_vector_to_vector(n,v1,v2)
*		v2[i] = v1[i]
*
*	vector_equals_scalar_plus_vector(n,v1,a,v2)
*		v1[i] = v2[i] + a
*
*	vector_equals_scalar_times_vector(n,v1,a,v2)
*		v1[i] = v2[i] * a
*
*	float inner_product(n,v1,v2)
*		return v1.v2
*
*	multiply_vector_by_vector(n,v1,v2)
*		v1[i] = v1[i]*v2[i]
*
*	divide_vector_by_vector(n,v1,v2)
*		v1[i] = v1[i]/v2[i]
*
*	subtract_vector_from_vector(n,v1,v2)		
*		v2[i] = v2[i] - v1[i]
*
*	add_scalar_times_vector_to_vector(n,a,v1,v2)
*		v2[i] = v2[i] + a*v1[i]
*
*	vector_equals_vector_minus_vector(n,v1,v2,v3)
*		v1[i] = v2[i] - v3[i]
*
*	add_vector_times_vector_to_vector(n,v1,v2,v3)
*		v3[i] = v3[i] + v1[i]*v2[i]
*
*	vector_equals_vector_plus_scalar_times_vector(n,v1,v2,a,v3)
*		v1[i] = v2[i] + a*v3[i]
*
*	
*	The double precision version of each routine has the
*	same name preceded by a d.
*
*	Some mixed precision routines are:
*
*	copy_fvector_to_dvector(n,fv,dv)
*		dv[i] = fv[i]
*
*	copy_dvector_to_fvector(n,dv,fv)
*		fv[i] = dv[i]
*
*	add_fvector_times_dvector_to_dvector(n,fv,dv1,dv2)
*		dv2[i] = dv2[i] + fv[i]*dv1[i]
*
*	Mixed precision routines added April 12, 1988 (Brent Lindquist)
*
*	multiply_dvector_by_fvector(n,dv,fv)
*		dv[i] = dv[i]*fv[i]
*
*	divide_dvector_by_fvector(n,dv,fv)
*		dv[i] = dv[i]/fv[i]
*
*
*	Oliver McBryan
*	Courant Institute
*	New York University
*
*/


#include <cdecs.h>	/* includes stdio.h, string.h, ctype.h, math.h */
#include <vector/vectorprotos.h>

#ifdef __cplusplus
extern "C" {
#endif

#if defined(FOR)
/*
*	File-local constants used only when FOR is defined.  minus_one and
*	ione are passed by address to the FORTRAN(...) calls in the
*	TOBLAS_ASSEMBLER section of this file; one and zero are not
*	referenced in the code visible here.
*/
LOCAL float one = 1.0, minus_one = -1.0, zero = 0.;
LOCAL int   ione = 1;	/* presumably the unit stride argument -- confirm against the Fortran routines */
#endif /* defined(FOR) */

#if !defined(TOBLAS_ASSEMBLER)



/*
*	subtract_vector_from_vector:
*
*	Subtracts v1 from v2 in place:  v2[i] = v2[i] - v1[i],  0 <= i < n.
*	v1 is only read.  Nothing is done when n <= 0.
*/
EXPORT void subtract_vector_from_vector(int n, float *v1, float *v2)
{
	int k;

	for (k = 0; k < n; ++k) {
		v2[k] = v2[k] - v1[k];
	}
}
	


/*
*	add_scalar_times_vector_to_vector:
*
*	Accumulates a scaled copy of v1 into v2:
*		v2[i] = v2[i] + a*v1[i],  0 <= i < n.
*/
EXPORT void add_scalar_times_vector_to_vector(int n, float a, float *v1, float *v2)
{
	int k;

	for (k = 0; k < n; ++k) {
		v2[k] = v2[k] + a*v1[k];
	}
}



/*
*	add_vector_times_vector_to_vector:
*
*	Accumulates the element-wise product of v1 and v2 into v3:
*		v3[i] = v3[i] + v1[i]*v2[i],  0 <= i < n.
*/
EXPORT void add_vector_times_vector_to_vector(int n, float *v1, float *v2, float *v3)
{
	int k;

	for (k = 0; k < n; ++k) {
		v3[k] = v3[k] + v1[k]*v2[k];
	}
}




/*
*	copy_vector_to_vector:
*
*	Copies the first n elements of v1 into v2, lowest index first:
*		v2[i] = v1[i],  0 <= i < n.
*/
EXPORT void copy_vector_to_vector(int n, float *v1, float *v2)
{
	int k = 0;

	while (k < n) {
		v2[k] = v1[k];
		++k;
	}
}




/*
*	multiply_vector_by_scalar:
*
*	Scales v in place:  v[i] = v[i]*a,  0 <= i < n.
*/
EXPORT void multiply_vector_by_scalar(int n, float *v, float a)
{
	int k;

	for (k = 0; k < n; ++k) {
		v[k] = v[k] * a;
	}
}




/*
*	multiply_vector_by_vector:
*
*	Element-wise product stored back into v1:
*		v1[i] = v1[i]*v2[i],  0 <= i < n.
*/
EXPORT void multiply_vector_by_vector(int n, float *v1, float *v2)
{
	int k;

	for (k = 0; k < n; ++k) {
		v1[k] = v1[k] * v2[k];
	}
}


/*
*	divide_vector_by_vector:
*
*	Element-wise quotient stored back into v1:
*		v1[i] = v1[i]/v2[i],  0 <= i < n.
*	No check is made for zero entries in v2.
*/
EXPORT void divide_vector_by_vector(int n, float *v1, float *v2)
{
	int k;

	for (k = 0; k < n; ++k) {
		v1[k] = v1[k] / v2[k];
	}
}



/*
*	vector_equals_scalar:
*
*	Fills the first n elements of v with a:  v[i] = a,  0 <= i < n.
*/
EXPORT void vector_equals_scalar(int n, float *v, float a)
{
	int k = 0;

	while (k < n) {
		v[k] = a;
		++k;
	}
}



/*
*	add_scalar_to_vector:
*
*	Shifts every element of v by a:  v[i] = v[i] + a,  0 <= i < n.
*/
EXPORT void add_scalar_to_vector(int n, float a, float *v)
{
	int k;

	for (k = 0; k < n; ++k) {
		v[k] = v[k] + a;
	}
}





/*
*	min_max_vector:
*
*	Stores the smallest of v[0..n-1] in *minv and the largest in *maxv.
*
*	When n <= 0 there is nothing to scan: v is not read and *minv and
*	*maxv are left untouched.  (Only n == 0 used to be rejected, so a
*	negative n still read v[0]; every other routine in this file is a
*	no-op for n <= 0.)
*/
EXPORT void min_max_vector(int n, float *v, float *minv, float *maxv)
{
	float lo, hi;
	int k;

	if (n <= 0)
		return;

	lo = hi = v[0];
	for (k = 1; k < n; ++k) {
		/* lo <= hi always holds, so an element can update at most one */
		if (v[k] > hi) {
			hi = v[k];
		} else if (v[k] < lo) {
			lo = v[k];
		}
	}
	*minv = lo;
	*maxv = hi;
}





/*
*	vector_equals_scalar_plus_vector:
*
*	Writes a shifted copy of v2 into v1:
*		v1[i] = a + v2[i],  0 <= i < n.
*/
EXPORT void vector_equals_scalar_plus_vector(int n, float *v1, float a, float *v2)
{
	int k;

	for (k = 0; k < n; ++k) {
		v1[k] = a + v2[k];
	}
}



/*
*	vector_equals_scalar_times_vector:
*
*	Writes a scaled copy of v2 into v1:
*		v1[i] = a*v2[i],  0 <= i < n.
*/
EXPORT void vector_equals_scalar_times_vector(int n, float *v1, float a, float *v2)
{
	int k;

	for (k = 0; k < n; ++k) {
		v1[k] = a * v2[k];
	}
}



/*
*	sum_elements_of_vector:
*
*	Stores v[0] + ... + v[n-1] in *sum (0 when n <= 0).  The running
*	total is kept in *sum itself and built in single precision,
*	lowest index first.
*/
EXPORT void sum_elements_of_vector(int n, float *v, float *sum)
{
	int k;

	*sum = 0.0f;
	for (k = 0; k < n; ++k) {
		*sum = *sum + v[k];
	}
}




/*
*	vector_equals_vector_plus_scalar_times_vector:
*
*	v1[i] = v2[i] + a*v3[i],  0 <= i < n.
*/
EXPORT void vector_equals_vector_plus_scalar_times_vector(int n, float *v1, float *v2, float a, float *v3)
{
	int k;

	for (k = 0; k < n; ++k) {
		float scaled = a*v3[k];

		v1[k] = v2[k] + scaled;
	}
}


/*
*	vector_equals_vector_minus_vector:
*
*	v1[i] = v2[i] - v3[i],  0 <= i < n.
*/
EXPORT void vector_equals_vector_minus_vector(int n, float *v1, float *v2, float *v3)
{
	int k = 0;

	while (k < n) {
		v1[k] = v2[k] - v3[k];
		++k;
	}
}



/*
*	inner_product:
*
*	Returns the dot product v1.v2 over the first n elements, summed in
*	single precision from index 0 upwards.  Returns 0 when n <= 0.
*/
EXPORT float inner_product(int n, float *v1, float *v2)
{
	float acc = 0.0f;
	int k;

	for (k = 0; k < n; ++k) {
		acc = acc + v1[k]*v2[k];
	}
	return acc;
}



EXPORT float norm_vector(int n, float *v)
{
	float sum;
	int i;
	sum = 0.;
	for (i=0; i<n; i++) sum += v[i]*v[i];
	return (float)sqrt(sum);
}




/*
*	izero_vector:
*
*	Integer counterpart of zero_vector:  x[i] = 0,  0 <= i < n.
*/
EXPORT void izero_vector(int n, int *x)
{
	int k = 0;

	while (k < n) {
		x[k] = 0;
		++k;
	}
}


/*
*	zero_vector:
*
*	Clears the first n elements of x:  x[i] = 0,  0 <= i < n.
*/
EXPORT void zero_vector(int n, float *x)
{
	int k = 0;

	while (k < n) {
		x[k] = 0.0f;
		++k;
	}
}

#else /* !defined(TOBLAS_ASSEMBLER) */




/*
*	subtract_vector_from_vector (TOBLAS_ASSEMBLER build):
*
*	v2[i] = v2[i] - v1[i],  0 <= i < n.
*
*	With FOR defined the whole operation is delegated to FORTRAN(saxpy)
*	with minus_one as the multiplier.  Otherwise the last n%4 elements
*	are processed here and the leading n - n%4 elements are handed to
*	the external routine svlv().
*/
EXPORT void subtract_vector_from_vector(int n, float *v1, float *v2)
{
#if defined(FOR)
	FORTRAN(saxpy)(&n,&minus_one,v1,&ione,v2,&ione);
#else /* defined(FOR) */
	int k;
	int nbulk = n - n%4;

	for (k = nbulk; k < n; ++k) {
		v2[k] = v2[k] - v1[k];
	}
	svlv(nbulk,v1,v2);
#endif /* defined(FOR) */
}
	


/*
*	add_scalar_times_vector_to_vector (TOBLAS_ASSEMBLER build):
*
*	v2[i] = v2[i] + a*v1[i],  0 <= i < n.
*
*	With FOR defined this is a single FORTRAN(saxpy) call.  Otherwise
*	the last n%4 elements are processed here and the leading n - n%4
*	elements are handed to the external routine svpsv().
*/
EXPORT void add_scalar_times_vector_to_vector(int n, float a, float *v1, float *v2)
{
#if defined(FOR)
	FORTRAN(saxpy)(&n,&a,v1,&ione,v2,&ione);
#else /* defined(FOR) */
	int k;
	int nbulk = n - n%4;

	for (k = nbulk; k < n; ++k) {
		v2[k] = v2[k] + a*v1[k];
	}
	svpsv(nbulk,a,v1,v2);
#endif /* defined(FOR) */
}



/*
*	add_vector_times_vector_to_vector (TOBLAS_ASSEMBLER build):
*
*	v3[i] = v3[i] + v1[i]*v2[i],  0 <= i < n.
*
*	With FOR defined this is a single FORTRAN(sypax) call.  Otherwise
*	the last n%4 elements are processed here and the leading n - n%4
*	elements are handed to the external routine svpvv().
*/
EXPORT void add_vector_times_vector_to_vector(int n, float *v1, float *v2, float *v3)
{
#if defined(FOR)
	FORTRAN(sypax)(&n,v1,&ione,v2,&ione,v3,&ione);
#else /* defined(FOR) */
	int k;
	int nbulk = n - n%4;

	for (k = nbulk; k < n; ++k) {
		v3[k] = v3[k] + v1[k]*v2[k];
	}
	svpvv(nbulk,v1,v2,v3);
#endif /* defined(FOR) */
}




/*
*	copy_vector_to_vector (TOBLAS_ASSEMBLER build):
*
*	v2[i] = v1[i],  0 <= i < n.
*
*	With FOR defined this is a single FORTRAN(scopy) call.  Otherwise
*	the last n%4 elements are copied here first (while v1 and v2 still
*	point at the start of the vectors), and the leading n - n%4 elements
*	are passed to the external routine svev() in pieces of at most
*	16000 elements.  The reason for that limit is not recorded here.
*/
EXPORT void copy_vector_to_vector(int n, float *v1, float *v2)
{
#if defined(FOR)
	FORTRAN(scopy)(&n,v1,&ione,v2,&ione);
#else /* defined(FOR) */
	const int chunk = 16000;
	int k;
	int nbulk = n - n%4;

	for (k = nbulk; k < n; ++k) {
		v2[k] = v1[k];
	}
	for (; nbulk > chunk; nbulk -= chunk, v1 += chunk, v2 += chunk) {
		svev(chunk,v1,v2);
	}
	svev(nbulk,v1,v2);
#endif /* defined(FOR) */
}


/*
*	multiply_vector_by_scalar (TOBLAS_ASSEMBLER build):
*
*	v[i] = v[i]*a,  0 <= i < n.
*
*	With FOR defined this is a single FORTRAN(sscal) call.  Otherwise
*	the last n%4 elements are processed here and the leading n - n%4
*	elements are handed to the external routine svms().
*/
EXPORT void multiply_vector_by_scalar(int n, float *v, float a)
{
#if defined(FOR)
	FORTRAN(sscal)(&n,&a,v,&ione);
#else /* defined(FOR) */
	int k;
	int nbulk = n - n%4;

	for (k = nbulk; k < n; ++k) {
		v[k] = v[k] * a;
	}
	svms(nbulk,v,a);
#endif /* defined(FOR) */
}

/*
*	multiply_vector_by_vector (TOBLAS_ASSEMBLER build):
*
*	v1[i] = v1[i]*v2[i],  0 <= i < n.
*
*	With FOR defined this is a single FORTRAN(svvm) call.  Otherwise
*	the last n%4 elements are processed here and the leading n - n%4
*	elements are handed to the external routine svmv().
*/
EXPORT void multiply_vector_by_vector(int n, float *v1, float *v2)
{
#if defined(FOR)
	FORTRAN(svvm)(&n,v1,&ione,v2,&ione);
#else /* defined(FOR) */
	int k;
	int nbulk = n - n%4;

	for (k = nbulk; k < n; ++k) {
		v1[k] = v1[k] * v2[k];
	}
	svmv(nbulk,v1,v2);
#endif /* defined(FOR) */
}


/*
*	divide_vector_by_vector (TOBLAS_ASSEMBLER build):
*
*	v1[i] = v1[i]/v2[i],  0 <= i < n.
*
*	With FOR defined this is a single FORTRAN(svvd) call.  Otherwise
*	the last n%4 elements are processed here and the leading n - n%4
*	elements are handed to the external routine svdv().
*/
EXPORT void divide_vector_by_vector(int n, float *v1, float *v2)
{
#if defined(FOR)
	FORTRAN(svvd)(&n,v1,&ione,v2,&ione);
#else /* defined(FOR) */
	int k;
	int nbulk = n - n%4;

	for (k = nbulk; k < n; ++k) {
		v1[k] = v1[k] / v2[k];
	}
	svdv(nbulk,v1,v2);
#endif /* defined(FOR) */
}



/*
*	vector_equals_scalar (TOBLAS_ASSEMBLER build):
*
*	v[i] = a,  0 <= i < n.
*
*	With FOR defined this is a single FORTRAN(svesf) call.  Otherwise
*	the last n%4 elements are set here and the leading n - n%4 elements
*	are handed to the external routine sves().
*/
EXPORT void vector_equals_scalar(int n, float *v, float a)
{
#if defined(FOR)
	FORTRAN(svesf)(&n,v,&ione,&a);
#else /* defined(FOR) */
	int k;
	int nbulk = n - n%4;

	for (k = nbulk; k < n; ++k) {
		v[k] = a;
	}
	sves(nbulk,v,a);
#endif /* defined(FOR) */
}



/*
*	add_scalar_to_vector (TOBLAS_ASSEMBLER build):
*
*	v[i] = v[i] + a,  0 <= i < n.
*
*	With FOR defined this is a single FORTRAN(sasv) call.  Otherwise
*	the last n%4 elements are processed here and the leading n - n%4
*	elements are handed to the external routine svas().
*/
EXPORT void add_scalar_to_vector(int n, float a, float *v)
{
#if defined(FOR)
	FORTRAN(sasv)(&n,&a,v,&ione);
#else /* defined(FOR) */
	int k;
	int nbulk = n - n%4;

	for (k = nbulk; k < n; ++k) {
		v[k] = v[k] + a;
	}
	svas(nbulk,a,v);
#endif /* defined(FOR) */
}





/*
*	min_max_vector (TOBLAS_ASSEMBLER build):
*
*	Stores the smallest of v[0..n-1] in *minv and the largest in *maxv.
*	This routine is plain C in this build as well.
*
*	When n <= 0 there is nothing to scan: v is not read and *minv and
*	*maxv are left untouched.  (Only n == 0 used to be rejected, so a
*	negative n still read v[0].)
*/
EXPORT void min_max_vector(int n, float *v, float *minv, float *maxv)
{
	float lo, hi;
	int k;

	if (n <= 0)
		return;

	lo = hi = v[0];
	for (k = 1; k < n; ++k) {
		/* lo <= hi always holds, so an element can update at most one */
		if (v[k] > hi) {
			hi = v[k];
		} else if (v[k] < lo) {
			lo = v[k];
		}
	}
	*minv = lo;
	*maxv = hi;
}





/*
*	vector_equals_scalar_plus_vector (TOBLAS_ASSEMBLER build):
*
*	v1[i] = a + v2[i],  0 <= i < n.
*
*	With FOR defined this is a single FORTRAN(svspvf) call.  Otherwise
*	the last n%4 elements are processed here and the leading n - n%4
*	elements are handed to the external routine svespv().
*/
EXPORT void vector_equals_scalar_plus_vector(int n, float *v1, float a, float *v2)
{
#if defined(FOR)
	FORTRAN(svspvf)(&n,v1,&ione,&a,v2,&ione);
#else /* defined(FOR) */
	int k;
	int nbulk = n - n%4;

	for (k = nbulk; k < n; ++k) {
		v1[k] = a + v2[k];
	}
	svespv(nbulk,v1,a,v2);
#endif /* defined(FOR) */
}



/*
*	vector_equals_scalar_times_vector (TOBLAS_ASSEMBLER build):
*
*	v1[i] = a*v2[i],  0 <= i < n.
*
*	With FOR defined this is a single FORTRAN(svsmvf) call.  Otherwise
*	the last n%4 elements are processed here and the leading n - n%4
*	elements are handed to the external routine svesmv().
*/
EXPORT void vector_equals_scalar_times_vector(int n, float *v1, float a, float *v2)
{
#if defined(FOR)
	FORTRAN(svsmvf)(&n,v1,&ione,&a,v2,&ione);
#else /* defined(FOR) */
	int k;
	int nbulk = n - n%4;

	for (k = nbulk; k < n; ++k) {
		v1[k] = a * v2[k];
	}
	svesmv(nbulk,v1,a,v2);
#endif /* defined(FOR) */
}



/*
*	sum_elements_of_vector (TOBLAS_ASSEMBLER build):
*
*	Stores the sum of v[0..n-1] in *sum.
*
*	With FOR defined the result comes from FORTRAN_NAME(ssum).
*	Otherwise *sum is cleared, the last n%4 elements are added to it
*	here, and the value returned by the external routine svsum() for
*	the leading n - n%4 elements is added last.
*/
EXPORT void sum_elements_of_vector(int n, float *v, float *sum)
{
#if defined(FOR)
	FORTRAN float FORTRAN_NAME(ssum)();

	*sum = FORTRAN_NAME(ssum)(&n,v,&ione);
#else /* defined(FOR) */
	IMPORT float svsum(int,float*);
	int k;
	int nbulk = n - n%4;

	*sum = 0.0f;
	for (k = nbulk; k < n; ++k) {
		*sum = *sum + v[k];
	}
	*sum = *sum + svsum(nbulk,v);
#endif /* defined(FOR) */
}




/*
*	vector_equals_vector_plus_scalar_times_vector (TOBLAS_ASSEMBLER build):
*
*	v1[i] = v2[i] + a*v3[i],  0 <= i < n.
*
*	With FOR defined this is a single FORTRAN(sxayz) call.  Otherwise
*	the last n%4 elements are processed here and the leading n - n%4
*	elements are handed to the external routine svevpsv().
*/
EXPORT void vector_equals_vector_plus_scalar_times_vector(int n, float *v1, float *v2, float a, float *v3)
{
#if defined(FOR)
	FORTRAN(sxayz)(&n,v2,&ione,&a,v3,&ione,v1,&ione);
#else /* defined(FOR) */
	int k;
	int nbulk = n - n%4;

	for (k = nbulk; k < n; ++k) {
		v1[k] = v2[k] + a*v3[k];
	}
	svevpsv(nbulk,v1,v2,a,v3);
#endif /* defined(FOR) */
}


/*
*	vector_equals_vector_minus_vector (TOBLAS_ASSEMBLER build):
*
*	v1[i] = v2[i] - v3[i],  0 <= i < n.
*
*	With FOR defined this is FORTRAN(sxayz) called with minus_one as
*	the multiplier.  Otherwise the last n%4 elements are processed here
*	and the leading n - n%4 elements are handed to the external routine
*	svevlv().
*/
EXPORT void vector_equals_vector_minus_vector(int n, float *v1, float *v2, float *v3)
{
#if defined(FOR)
	FORTRAN(sxayz)(&n,v2,&ione,&minus_one,v3,&ione,v1,&ione);
#else /* defined(FOR) */
	int k;
	int nbulk = n - n%4;

	for (k = nbulk; k < n; ++k) {
		v1[k] = v2[k] - v3[k];
	}
	svevlv(nbulk,v1,v2,v3);
#endif /* defined(FOR) */
}



/*
*	inner_product (TOBLAS_ASSEMBLER build):
*
*	Returns v1.v2 over the first n elements.
*
*	With FOR defined the result comes from FORTRAN(sdot).  Otherwise
*	the products of the last n%4 elements are summed here and added to
*	the value returned by the external routine svdotv() for the leading
*	n - n%4 elements.
*/
EXPORT float inner_product(int n, float *v1, float *v2)
{
#if defined(FOR)
	IMPORT float FORTRAN(sdot)();

	return FORTRAN(sdot)(&n,v1,&ione,v2,&ione);
#else /* defined(FOR) */
	IMPORT float svdotv();
	float	tail = 0.0f;
	int k;
	int nbulk = n - n%4;

	for (k = nbulk; k < n; ++k) {
		tail = tail + v1[k]*v2[k];
	}
	return (tail + svdotv(nbulk,v1,v2));
#endif /* defined(FOR) */
}



/*
*	norm_vector (TOBLAS_ASSEMBLER build):
*
*	Returns sqrt(v.v) over the first n elements.
*
*	With FOR defined v.v comes from FORTRAN(sdot).  Otherwise the
*	squares of the last n%4 elements are summed here and added to the
*	value returned by the external routine svdotv() for the leading
*	n - n%4 elements before the square root is taken.
*/
EXPORT float norm_vector(int n, float *v)
{
#if defined(FOR)
	IMPORT float FORTRAN(sdot)();

	return sqrt(FORTRAN(sdot)(&n,v,&ione,v,&ione));
#else /* defined(FOR) */
	IMPORT float svdotv();
	float	tail = 0.0f;
	int k;
	int nbulk = n - n%4;

	for (k = nbulk; k < n; ++k) {
		tail = tail + v[k]*v[k];
	}
	return sqrt(tail + svdotv(nbulk,v,v));
#endif /* defined(FOR) */
}


/*
*	izero_vector (TOBLAS_ASSEMBLER build):
*
*	Clears the first n ints of x with memset().
*
*	Nothing is done when n <= 0.  Without this guard a negative n is
*	converted to a huge size_t byte count and memset() writes far past
*	the array; the loop-based version of this routine is a no-op in
*	that case.
*/
EXPORT void izero_vector(int n, int *x)
{
	if (n <= 0)
		return;
	memset(x,0,(size_t)n*sizeof(int));
}


/*
*	zero_vector (TOBLAS_ASSEMBLER build):
*
*	Clears the first n floats of x by setting their bytes to zero with
*	memset().
*
*	Nothing is done when n <= 0.  Without this guard a negative n is
*	converted to a huge size_t byte count and memset() writes far past
*	the array; the loop-based version of this routine is a no-op in
*	that case.
*/
EXPORT void zero_vector(int n, float *x)
{
	if (n <= 0)
		return;
	memset(x,0,(size_t)n*sizeof(float));
}




#endif /* !defined(TOBLAS_ASSEMBLER) */






		/* Double Precision Versions of Routines Above: */



/*
*	dzero_vector:
*
*	Double precision zero_vector:  x[i] = 0,  0 <= i < n.
*/
EXPORT void dzero_vector(int n, double *x)
{
	int k = 0;

	while (k < n) {
		x[k] = 0.0;
		++k;
	}
}



/*
*	dsubtract_vector_from_vector:
*
*	Double precision:  v2[i] = v2[i] - v1[i],  0 <= i < n.
*/
EXPORT void dsubtract_vector_from_vector(int n, double *v1, double *v2)
{
	int k;

	for (k = 0; k < n; ++k) {
		v2[k] = v2[k] - v1[k];
	}
}
	


/*
*	dadd_scalar_times_vector_to_vector:
*
*	Double precision:  v2[i] = v2[i] + a*v1[i],  0 <= i < n.
*/
EXPORT void dadd_scalar_times_vector_to_vector(int n, double a, double *v1, double *v2)
{
	int k;

	for (k = 0; k < n; ++k) {
		v2[k] = v2[k] + a*v1[k];
	}
}



/*
*	dadd_vector_times_vector_to_vector:
*
*	Double precision:  v3[i] = v3[i] + v1[i]*v2[i],  0 <= i < n.
*/
EXPORT void dadd_vector_times_vector_to_vector(int n, double *v1, double *v2, double *v3)
{
	int k;

	for (k = 0; k < n; ++k) {
		v3[k] = v3[k] + v1[k]*v2[k];
	}
}




/*
*	dcopy_vector_to_vector:
*
*	Double precision:  v2[i] = v1[i],  0 <= i < n,  lowest index first.
*/
EXPORT void dcopy_vector_to_vector(int n, double *v1, double *v2)
{
	int k = 0;

	while (k < n) {
		v2[k] = v1[k];
		++k;
	}
}




/*
*	dmultiply_vector_by_scalar:
*
*	Double precision:  v[i] = v[i]*a,  0 <= i < n.
*/
EXPORT void dmultiply_vector_by_scalar(int n, double *v, double a)
{
	int k;

	for (k = 0; k < n; ++k) {
		v[k] = v[k] * a;
	}
}




/*
*	dmultiply_vector_by_vector:
*
*	Double precision:  v1[i] = v1[i]*v2[i],  0 <= i < n.
*/
EXPORT void dmultiply_vector_by_vector(int n, double *v1, double *v2)
{
	int k;

	for (k = 0; k < n; ++k) {
		v1[k] = v1[k] * v2[k];
	}
}


/*
*	ddivide_vector_by_vector:
*
*	Double precision:  v1[i] = v1[i]/v2[i],  0 <= i < n.
*	No check is made for zero entries in v2.
*/
EXPORT void ddivide_vector_by_vector(int n, double *v1, double *v2)
{
	int k;

	for (k = 0; k < n; ++k) {
		v1[k] = v1[k] / v2[k];
	}
}



/*
*	dvector_equals_scalar:
*
*	Double precision:  v[i] = a,  0 <= i < n.
*/
EXPORT void dvector_equals_scalar(int n, double *v, double a)
{
	int k = 0;

	while (k < n) {
		v[k] = a;
		++k;
	}
}



/*
*	dadd_scalar_to_vector:
*
*	Double precision:  v[i] = v[i] + a,  0 <= i < n.
*/
EXPORT void dadd_scalar_to_vector(int n, double a, double *v)
{
	int k;

	for (k = 0; k < n; ++k) {
		v[k] = v[k] + a;
	}
}





/*
*	dmin_max_vector:
*
*	Double precision min_max_vector: stores the smallest of v[0..n-1]
*	in *minv and the largest in *maxv.
*
*	When n <= 0 there is nothing to scan: v is not read and *minv and
*	*maxv are left untouched.  (Only n == 0 used to be rejected, so a
*	negative n still read v[0].)
*/
EXPORT void dmin_max_vector(int n, double *v, double *minv, double *maxv)
{
	double lo, hi;
	int k;

	if (n <= 0)
		return;

	lo = hi = v[0];
	for (k = 1; k < n; ++k) {
		/* lo <= hi always holds, so an element can update at most one */
		if (v[k] > hi) {
			hi = v[k];
		} else if (v[k] < lo) {
			lo = v[k];
		}
	}
	*minv = lo;
	*maxv = hi;
}





/*
*	dvector_equals_scalar_plus_vector:
*
*	Double precision:  v1[i] = a + v2[i],  0 <= i < n.
*/
EXPORT void dvector_equals_scalar_plus_vector(int n, double *v1, double a, double *v2)
{
	int k;

	for (k = 0; k < n; ++k) {
		v1[k] = a + v2[k];
	}
}



/*
*	dvector_equals_scalar_times_vector:
*
*	Double precision:  v1[i] = a*v2[i],  0 <= i < n.
*/
EXPORT void dvector_equals_scalar_times_vector(int n, double *v1, double a, double *v2)
{
	int k;

	for (k = 0; k < n; ++k) {
		v1[k] = a * v2[k];
	}
}



/*
*	dsum_elements_of_vector:
*
*	Double precision: stores v[0] + ... + v[n-1] in *dsum (0 when
*	n <= 0).  The running total is kept in *dsum itself.
*/
EXPORT void dsum_elements_of_vector(int n, double *v, double *dsum)
{
	int k;

	*dsum = 0.0;
	for (k = 0; k < n; ++k) {
		*dsum = *dsum + v[k];
	}
}




/*
*	dvector_equals_vector_plus_scalar_times_vector:
*
*	Double precision:  v1[i] = v2[i] + a*v3[i],  0 <= i < n.
*/
EXPORT void dvector_equals_vector_plus_scalar_times_vector(int n, double *v1, double *v2, double a, double *v3)
{
	int k = 0;

	while (k < n) {
		v1[k] = v2[k] + a*v3[k];
		++k;
	}
}


/*
*	dvector_equals_vector_minus_vector:
*
*	Double precision:  v1[i] = v2[i] - v3[i],  0 <= i < n.
*/
EXPORT void dvector_equals_vector_minus_vector(int n, double *v1, double *v2, double *v3)
{
	int k = 0;

	while (k < n) {
		v1[k] = v2[k] - v3[k];
		++k;
	}
}



/*
*	dinner_product:
*
*	Double precision: returns v1.v2 over the first n elements, summed
*	from index 0 upwards.  Returns 0 when n <= 0.
*/
EXPORT double dinner_product(int n, double *v1, double *v2)
{
	double acc = 0.0;
	int k;

	for (k = 0; k < n; ++k) {
		acc = acc + v1[k]*v2[k];
	}
	return acc;
}



EXPORT double dnorm_vector(int n, double *v)
{
	double dsum;
	int i;
	dsum = 0.;
	for (i=0; i<n; i++) dsum += v[i]*v[i];
	return sqrt(dsum);
}






/*
*				Mixed Precision Routines: 
*/



/*
*	copy_fvector_to_dvector:
*
*	Widening copy:  dw[i] = v[i],  0 <= i < n.
*/
EXPORT void copy_fvector_to_dvector(int n, float *v, double *dw)
{
	int k = 0;

	while (k < n) {
		dw[k] = v[k];
		++k;
	}
}


/*
*	copy_dvector_to_fvector:
*
*	Narrowing copy:  w[i] = dv[i],  0 <= i < n.  Each value is
*	converted from double to float on assignment.
*/
EXPORT void copy_dvector_to_fvector(int n, double *dv, float *w)
{
	int k = 0;

	while (k < n) {
		w[k] = dv[k];
		++k;
	}
}




/*
*	add_fvector_times_dvector_to_dvector:
*
*	Mixed precision:  v3[i] = v3[i] + v1[i]*v2[i],  0 <= i < n,
*	with v1 single precision and v2, v3 double precision.
*/
EXPORT void add_fvector_times_dvector_to_dvector(int n, float *v1, double *v2, double *v3)
{
	int k;

	for (k = 0; k < n; ++k) {
		v3[k] = v3[k] + v1[k]*v2[k];
	}
}



/*
*	multiply_dvector_by_fvector:
*
*	Mixed precision:  v1[i] = v1[i]*v2[i],  0 <= i < n,
*	with v1 double precision and v2 single precision.
*/
EXPORT void multiply_dvector_by_fvector(int n, double *v1, float *v2)
{
	int k;

	for (k = 0; k < n; ++k) {
		v1[k] = v1[k] * v2[k];
	}
}


/*
*	divide_dvector_by_fvector:
*
*	Mixed precision:  v1[i] = v1[i]/v2[i],  0 <= i < n,
*	with v1 double precision and v2 single precision.
*	No check is made for zero entries in v2.
*/
EXPORT void divide_dvector_by_fvector(int n, double *v1, float *v2)
{
	int k;

	for (k = 0; k < n; ++k) {
		v1[k] = v1[k] / v2[k];
	}
}



#if defined(TEST_BLAS)





/*
*	c_subtract_vector_from_vector:
*
*	Straight C reference version used by the test program:
*		v2[i] = v2[i] - v1[i],  0 <= i < n.
*/
EXPORT void c_subtract_vector_from_vector(int n, float *v1, float *v2)
{
	int k;

	for (k = 0; k < n; ++k) {
		v2[k] = v2[k] - v1[k];
	}
}
	


/*
*	c_add_scalar_times_vector_to_vector:
*
*	Straight C reference version used by the test program:
*		v2[i] = v2[i] + a*v1[i],  0 <= i < n.
*/
EXPORT void c_add_scalar_times_vector_to_vector(int n, float a, float *v1, float *v2)
{
	int k;

	for (k = 0; k < n; ++k) {
		v2[k] = v2[k] + a*v1[k];
	}
}



/*
*	c_add_vector_times_vector_to_vector:
*
*	Straight C reference version used by the test program:
*		v3[i] = v3[i] + v1[i]*v2[i],  0 <= i < n.
*/
EXPORT void c_add_vector_times_vector_to_vector(int n, float *v1, float *v2, float *v3)
{
	int k;

	for (k = 0; k < n; ++k) {
		v3[k] = v3[k] + v1[k]*v2[k];
	}
}



/*
*	c_copy_vector_to_vector:
*
*	Straight C reference version used by the test program:
*		v2[i] = v1[i],  0 <= i < n.
*/
EXPORT void c_copy_vector_to_vector(int n, float *v1, float *v2)
{
	int k = 0;

	while (k < n) {
		v2[k] = v1[k];
		++k;
	}
}


/*
*	c_multiply_vector_by_scalar:
*
*	Straight C reference version used by the test program:
*		v[i] = v[i]*a,  0 <= i < n.
*/
EXPORT void c_multiply_vector_by_scalar(int n, float *v, float a)
{
	int k;

	for (k = 0; k < n; ++k) {
		v[k] = v[k] * a;
	}
}

/*
*	c_multiply_vector_by_vector:
*
*	Straight C reference version used by the test program:
*		v1[i] = v1[i]*v2[i],  0 <= i < n.
*/
EXPORT void c_multiply_vector_by_vector(int n, float *v1, float *v2)
{
	int k;

	for (k = 0; k < n; ++k) {
		v1[k] = v1[k] * v2[k];
	}
}


/*
*	c_divide_vector_by_vector:
*
*	Straight C reference version used by the test program:
*		v1[i] = v1[i]/v2[i],  0 <= i < n.
*/
EXPORT void c_divide_vector_by_vector(int n, float *v1, float *v2)
{
	int k;

	for (k = 0; k < n; ++k) {
		v1[k] = v1[k] / v2[k];
	}
}


/*
*	c_add_scalar_to_vector:
*
*	Straight C reference version used by the test program:
*		v[i] = v[i] + a,  0 <= i < n.
*/
EXPORT void c_add_scalar_to_vector(int n, float a, float *v)
{
	int k;

	for (k = 0; k < n; ++k) {
		v[k] = v[k] + a;
	}
}


/*
*	c_sum_elements_of_vector:
*
*	Straight C reference version used by the test program: stores
*	v[0] + ... + v[n-1] in *sum, accumulating through *sum itself.
*/
EXPORT void c_sum_elements_of_vector(int n, float *v, float *sum)
{
	int k;

	*sum = 0.0f;
	for (k = 0; k < n; ++k) {
		*sum = *sum + v[k];
	}
}




/*
*	c_vector_equals_vector_plus_scalar_times_vector:
*
*	Straight C reference version used by the test program:
*		v1[i] = v2[i] + a*v3[i],  0 <= i < n.
*/
EXPORT void c_vector_equals_vector_plus_scalar_times_vector(int n, float *v1, float *v2, float a, float *v3)
{
	int k = 0;

	while (k < n) {
		v1[k] = v2[k] + a*v3[k];
		++k;
	}
}


/*
*	c_vector_equals_vector_minus_vector:
*
*	Straight C reference version used by the test program:
*		v1[i] = v2[i] - v3[i],  0 <= i < n.
*/
EXPORT void c_vector_equals_vector_minus_vector(int n, float *v1, float *v2, float *v3)
{
	int k = 0;

	while (k < n) {
		v1[k] = v2[k] - v3[k];
		++k;
	}
}



/*
*	c_inner_product:
*
*	Straight C reference version used by the test program: returns
*	v1.v2 over the first n elements, summed in single precision.
*/
EXPORT float c_inner_product(int n, float *v1, float *v2)
{
	float acc;
	int k;

	acc = 0.0f;
	for (k = 0; k < n; ++k) {
		acc = acc + v1[k]*v2[k];
	}
	return acc;
}




EXPORT float c_norm_vector(int n, float *v)
{
	int i;
	float sum = 0.;

	for (i=0; i<n; i++)  sum += v[i]*v[i];
	return sqrt(sum);
}


/*
*	c_zero_vector:
*
*	Straight C reference version used by the test program:
*		x[i] = 0,  0 <= i < n.
*/
EXPORT void c_zero_vector(register int n, register float *x)
{
	int k = 0;

	while (k < n) {
		x[k] = 0.0f;
		++k;
	}
}



/*
*	c_vector_equals_scalar:
*
*	Straight C reference version used by the test program:
*		v[i] = a,  0 <= i < n.
*/
EXPORT void c_vector_equals_scalar(int n, float *v, float a)
{
	int k = 0;

	while (k < n) {
		v[k] = a;
		++k;
	}
}


/*
*	c_vector_equals_scalar_plus_vector:
*
*	Straight C reference version used by the test program:
*		v[i] = a + v1[i],  0 <= i < n.
*/
EXPORT void	c_vector_equals_scalar_plus_vector(int n, float *v, float a, float *v1)
{
	int k;

	for (k = 0; k < n; ++k) {
		v[k] = a + v1[k];
	}
}


/*
*	c_vector_equals_scalar_times_vector:
*
*	Straight C reference version used by the test program:
*		v[i] = a*v1[i],  0 <= i < n.
*/
EXPORT void	c_vector_equals_scalar_times_vector(int n, float *v, float a, float *v1)
{
	int k;

	for (k = 0; k < n; ++k) {
		v[k] = a * v1[k];
	}
}




/* Prototypes for the test-program helpers defined after main(). */
LOCAL	float	diff_vector(int,float*,float*);
LOCAL	void	init(int,float*,float*,float*,float*,float*);



/* Number of elements in each of the static test arrays below. */
#define LEN 10000

/*
*	Work arrays for the test program.  main() passes v, w, v1, v2 and v3
*	to init() and to the routines under test; unit[] is filled with 1 by
*	init() and used by main() in the multiply/divide timing loops.
*/
LOCAL float v1[LEN],v2[LEN],v3[LEN],v[LEN],w[LEN],unit[LEN];

/*
*	main (TEST_BLAS only):
*
*	Test and timing driver.  Reads the vector length n and the number
*	of timing repetitions ntimes, then for each routine:
*
*	  - re-initialises the work arrays with init(),
*	  - runs the routine and its c_ reference on identical data and
*	    prints the difference (diff_vector(), or y - x for the routines
*	    that produce a scalar),
*	  - times ntimes calls of the routine and of the c_ reference
*	    between start_clock()/stop_clock().
*
*	n is clamped to LEN, the size of the static work arrays.  The
*	"Lowering n" message used to be printed without n actually being
*	changed, so every later call ran off the end of the arrays.
*
*	argc and argv are not used.  Always returns 0.
*/
int main(int argc, char **argv)
{
	float x,y,diff,a;
	int i,n,ntimes;

	a = .7136;

	init_debug();
	screen("Enter Length n of Vectors, and Number of Timing Loops: ");
	Scanf("%d %d\n",&n,&ntimes);
	if (n>LEN) {
		screen("Lowering n to %d\n",LEN);
		n = LEN;	/* keep every access inside the LEN-element arrays */
	}
	printf("Test of toblas routines with n = %d\n\n",n);
	init(n,v,w,v1,v2,v3);
	printf("a = %f |v|=%f |w|=%f |v1|=%f |v2|=%f |v3|=%f\n\n\n", a,
		c_norm_vector(n,v), c_norm_vector(n,w),
		c_norm_vector(n,v1), c_norm_vector(n,v2),
		c_norm_vector(n,v3)  );


	/* zero_vector */
	init(n,v,w,v1,v2,v3);
	zero_vector(n,v);
	c_zero_vector(n,w);
	diff = diff_vector(n,v,w);
	printf("zero_vector(): 				%f\n",diff);
	start_clock("zero_vector");
	for (i=0; i<ntimes; i++)
		zero_vector(n,v);
	stop_clock("zero_vector");
	start_clock("c_zero_vector");
	for (i=0; i<ntimes; i++)
		c_zero_vector(n,v);
	stop_clock("c_zero_vector");
	printf("\n");

	/* vector_equals_scalar */
	init(n,v,w,v1,v2,v3);
	vector_equals_scalar(n,v,a);
	c_vector_equals_scalar(n,w,a);
	diff = diff_vector(n,v,w);
	printf("vector_equals_scalar(): 		%f\n",diff);
	start_clock("vector_equals_scalar");
	for (i=0; i<ntimes; i++)
		vector_equals_scalar(n,v,a);
	stop_clock("vector_equals_scalar");
	start_clock("c_vector_equals_scalar");
	for (i=0; i<ntimes; i++)
		c_vector_equals_scalar(n,v,a);
	stop_clock("c_vector_equals_scalar");
	printf("\n");

	/* subtract_vector_from_vector */
	init(n,v,w,v1,v2,v3);
	subtract_vector_from_vector(n,v1,v);
	c_subtract_vector_from_vector(n,v1,w);
	diff = diff_vector(n,v,w);
	printf("subtract_vector_from_vector(): 		%f\n",diff);
	start_clock("subtract_vector_from_vector");
	for (i=0; i<ntimes; i++)
		subtract_vector_from_vector(n,v1,v);
	stop_clock("subtract_vector_from_vector");
	start_clock("c_subtract_vector_from_vector");
	for (i=0; i<ntimes; i++)
		c_subtract_vector_from_vector(n,v1,v);
	stop_clock("c_subtract_vector_from_vector");
	printf("\n");

	/* add_scalar_times_vector_to_vector */
	init(n,v,w,v1,v2,v3);
	add_scalar_times_vector_to_vector(n,a,v1,v);
	c_add_scalar_times_vector_to_vector(n,a,v1,w);
	diff = diff_vector(n,v,w);
	printf("add_scalar_times_vector_to_vector():	%f\n",diff);
	start_clock("add_scalar_times_vector_to_vector");
	for (i=0; i<ntimes; i++)
		add_scalar_times_vector_to_vector(n,a,v1,v);
	stop_clock("add_scalar_times_vector_to_vector");
	start_clock("c_add_scalar_times_vector_to_vector");
	for (i=0; i<ntimes; i++)
		c_add_scalar_times_vector_to_vector(n,a,v1,v);
	stop_clock("c_add_scalar_times_vector_to_vector");
	printf("\n");

	/* vector_equals_scalar_plus_vector */
	init(n,v,w,v1,v2,v3);
	vector_equals_scalar_plus_vector(n,v,a,v1);
	c_vector_equals_scalar_plus_vector(n,w,a,v1);
	diff = diff_vector(n,v,w);
	printf("vector_equals_scalar_plus_vector():	%f\n",diff);
	start_clock("vector_equals_scalar_plus_vector");
	for (i=0; i<ntimes; i++)
		vector_equals_scalar_plus_vector(n,v,a,v1);
	stop_clock("vector_equals_scalar_plus_vector");
	start_clock("c_vector_equals_scalar_plus_vector");
	for (i=0; i<ntimes; i++)
		c_vector_equals_scalar_plus_vector(n,v,a,v1);
	stop_clock("c_vector_equals_scalar_plus_vector");
	printf("\n");

	/* vector_equals_scalar_times_vector */
	init(n,v,w,v1,v2,v3);
	vector_equals_scalar_times_vector(n,v,a,v1);
	c_vector_equals_scalar_times_vector(n,w,a,v1);
	diff = diff_vector(n,v,w);
	printf("vector_equals_scalar_times_vector():	%f\n",diff);
	start_clock("vector_equals_scalar_times_vector");
	for (i=0; i<ntimes; i++)
		vector_equals_scalar_times_vector(n,v,a,v1);
	stop_clock("vector_equals_scalar_times_vector");
	start_clock("c_vector_equals_scalar_times_vector");
	for (i=0; i<ntimes; i++)
		c_vector_equals_scalar_times_vector(n,v,a,v1);
	stop_clock("c_vector_equals_scalar_times_vector");
	printf("\n");

	/* add_vector_times_vector_to_vector */
	init(n,v,w,v1,v2,v3);
	add_vector_times_vector_to_vector(n,v1,v2,v);
	c_add_vector_times_vector_to_vector(n,v1,v2,w);
	diff = diff_vector(n,v,w);
	printf("add_vector_times_vector_to_vector():	%f\n",diff);
	start_clock("add_vector_times_vector_to_vector");
	for (i=0; i<ntimes; i++)
		add_vector_times_vector_to_vector(n,v1,v2,v);
	stop_clock("add_vector_times_vector_to_vector");
	start_clock("c_add_vector_times_vector_to_vector");
	for (i=0; i<ntimes; i++)
		c_add_vector_times_vector_to_vector(n,v1,v2,v);
	stop_clock("c_add_vector_times_vector_to_vector");
	printf("\n");

	/* copy_vector_to_vector */
	init(n,v,w,v1,v2,v3);
	copy_vector_to_vector(n,v1,v);
	c_copy_vector_to_vector(n,v1,w);
	diff = diff_vector(n,v,w);
	printf("copy_vector_to_vector(): 		%f\n",diff);
	start_clock("copy_vector_to_vector");
	for (i=0; i<ntimes; i++)
		copy_vector_to_vector(n,v1,v);
	stop_clock("copy_vector_to_vector");
	start_clock("c_copy_vector_to_vector");
	for (i=0; i<ntimes; i++)
		c_copy_vector_to_vector(n,v1,v);
	stop_clock("c_copy_vector_to_vector");
	printf("\n");

	/* multiply_vector_by_scalar */
	init(n,v,w,v1,v2,v3);
	multiply_vector_by_scalar(n,v,a);
	c_multiply_vector_by_scalar(n,w,a);
	diff = diff_vector(n,v,w);
	printf("multiply_vector_by_scalar():		%f\n",diff);
	start_clock("multiply_vector_by_scalar");
	for (i=0; i<ntimes; i++)
		multiply_vector_by_scalar(n,v,a);
	stop_clock("multiply_vector_by_scalar");
	start_clock("c_multiply_vector_by_scalar");
	for (i=0; i<ntimes; i++)
		c_multiply_vector_by_scalar(n,v,a);
	stop_clock("c_multiply_vector_by_scalar");
	printf("\n");

	/* multiply_vector_by_vector: the timing loops multiply by unit[] */
	init(n,v,w,v1,v2,v3);
	multiply_vector_by_vector(n,v,v1);
	c_multiply_vector_by_vector(n,w,v1);
	diff = diff_vector(n,v,w);
	printf("multiply_vector_by_vector():		%f\n",diff);
	start_clock("multiply_vector_by_vector");
	for (i=0; i<ntimes; i++)
		multiply_vector_by_vector(n,v,unit);
	stop_clock("multiply_vector_by_vector");
	start_clock("c_multiply_vector_by_vector");
	for (i=0; i<ntimes; i++)
		c_multiply_vector_by_vector(n,v,unit);
	stop_clock("c_multiply_vector_by_vector");
	printf("\n");

	/* divide_vector_by_vector: the timing loops divide by unit[] */
	init(n,v,w,v1,v2,v3);
	divide_vector_by_vector(n,v,v1);
	c_divide_vector_by_vector(n,w,v1);
	diff = diff_vector(n,v,w);
	printf("divide_vector_by_vector():		%f\n",diff);
	start_clock("divide_vector_by_vector");
	for (i=0; i<ntimes; i++)
		divide_vector_by_vector(n,v,unit);
	stop_clock("divide_vector_by_vector");
	start_clock("c_divide_vector_by_vector");
	for (i=0; i<ntimes; i++)
		c_divide_vector_by_vector(n,v,unit);
	stop_clock("c_divide_vector_by_vector");
	printf("\n");

	/* add_scalar_to_vector */
	init(n,v,w,v1,v2,v3);
	add_scalar_to_vector(n,a,v);
	c_add_scalar_to_vector(n,a,w);
	diff = diff_vector(n,v,w);
	printf("add_scalar_to_vector(): 		%f\n",diff);
	start_clock("add_scalar_to_vector");
	for (i=0; i<ntimes; i++)
		add_scalar_to_vector(n,a,v);
	stop_clock("add_scalar_to_vector");
	start_clock("c_add_scalar_to_vector");
	for (i=0; i<ntimes; i++)
		c_add_scalar_to_vector(n,a,v);
	stop_clock("c_add_scalar_to_vector");
	printf("\n");

	/* sum_elements_of_vector: scalar result, compared directly */
	init(n,v,w,v1,v2,v3);
	sum_elements_of_vector(n,v,&x);
	c_sum_elements_of_vector(n,w,&y);
	diff = y-x;
	printf("sum_of_elements_of_vector():		%f\n",diff);
	start_clock("sum_elements_of_vector");
	for (i=0; i<ntimes; i++)
		sum_elements_of_vector(n,v,&x);
	stop_clock("sum_elements_of_vector");
	start_clock("c_sum_elements_of_vector");
	for (i=0; i<ntimes; i++)
		c_sum_elements_of_vector(n,v,&x);
	stop_clock("c_sum_elements_of_vector");
	printf("\n");

	/* vector_equals_vector_plus_scalar_times_vector */
	init(n,v,w,v1,v2,v3);
	vector_equals_vector_plus_scalar_times_vector(n,v,v1,a,v2);
	c_vector_equals_vector_plus_scalar_times_vector(n,w,v1,a,v2);
	diff = diff_vector(n,v,w);
	printf("vector_equals_vector_plus_scalar_t..(): %f\n",diff);
	start_clock("vector_equals_vector_plus_scalar_times_vector");
	for (i=0; i<ntimes; i++)
		vector_equals_vector_plus_scalar_times_vector(n,v,v1,a,v2);
	stop_clock("vector_equals_vector_plus_scalar_times_vector");
	start_clock("c_vector_equals_vector_plus_scalar_times_vector");
	for (i=0; i<ntimes; i++)
		c_vector_equals_vector_plus_scalar_times_vector(n,v,v1,a,v2);
	stop_clock("c_vector_equals_vector_plus_scalar_times_vector");
	printf("\n");

	/* vector_equals_vector_minus_vector */
	init(n,v,w,v1,v2,v3);
	vector_equals_vector_minus_vector(n,v,v1,v2);
	c_vector_equals_vector_minus_vector(n,w,v1,v2);
	diff = diff_vector(n,v,w);
	printf("vector_equals_vector_minus_vector(): 	%f\n",diff);
	start_clock("vector_equals_vector_minus_vector");
	for (i=0; i<ntimes; i++)
		vector_equals_vector_minus_vector(n,v,v1,v2);
	stop_clock("vector_equals_vector_minus_vector");
	start_clock("c_vector_equals_vector_minus_vector");
	for (i=0; i<ntimes; i++)
		c_vector_equals_vector_minus_vector(n,v,v1,v2);
	stop_clock("c_vector_equals_vector_minus_vector");
	printf("\n");

	/* inner_product: scalar result, compared directly */
	init(n,v,w,v1,v2,v3);
	x = inner_product(n,v1,v2);
	y = c_inner_product(n,v1,v2);
	diff = y - x;
	printf("inner_product(): 			%f\n",diff);
	start_clock("inner_product");
	for (i=0; i<ntimes; i++)
		inner_product(n,v1,v2);
	stop_clock("inner_product");
	start_clock("c_inner_product");
	for (i=0; i<ntimes; i++)
		c_inner_product(n,v1,v2);
	stop_clock("c_inner_product");
	printf("\n");

	/* norm_vector: scalar result, compared directly */
	init(n,v,w,v1,v2,v3);
	x = norm_vector(n,v);
	y = c_norm_vector(n,w);
	diff = y - x;
	printf("norm_vector(): 				%f\n",diff);
	start_clock("norm_vector");
	for (i=0; i<ntimes; i++)
		norm_vector(n,v);
	stop_clock("norm_vector");
	start_clock("c_norm_vector");
	for (i=0; i<ntimes; i++)
		c_norm_vector(n,v);
	stop_clock("c_norm_vector");
	printf("\n");
	return 0;
}


/*
*	init:
*
*	Fills the first n elements of the test vectors with fixed,
*	index-dependent values so every test starts from the same data:
*
*		v[i] = w[i] = sqrt(1.3 + i)
*		v1[i] = sqrt(2 + 1.13*i)
*		v2[i] = sqrt(4 + .91*i)
*		v3[i] = sqrt(5 + .5*i)
*
*	The file-level array unit[] is set to 1 over the same range.
*/
LOCAL void init(int n,float *v,float *w,float *v1,float *v2,float *v3)
{
	int k = 0;

	while (k < n) {
		w[k] = sqrt(1.3+k);
		v[k] = w[k];
		v1[k] = sqrt(2.+1.13*k);
		v2[k] = sqrt(4.+.91*k);
		v3[k] = sqrt(5.+.5*k);
		unit[k] = 1.;
		++k;
	}
}



/*
*	diff_vector:
*
*	Returns the sum over 0 <= i < n of |v[i] - w[i]|.  The sum is
*	accumulated in double precision and converted to float on return.
*/
LOCAL float diff_vector(int n,float *v,float *w)
{
	double total = 0.0;
	int k;

	for (k = 0; k < n; ++k) {
		total = total + fabs(v[k]-w[k]);
	}
	return (total);
}


#endif /* defined(TEST_BLAS) */
#ifdef __cplusplus
}
#endif
