gem5/splash2/codes/apps/ocean/non_contiguous_partitions/multi.C
Sanchayan Maity 2fcc51c2c1 Commit splash2 benchmark
While at it also add the libpthread static library amd m5op_x86
for matrix multiplication test code as well.

Note that the splash2 benchmark code does not comply with gem5
coding guidelines. Academic guys never seem to follow 80 columns
and no whitespace guideline :(.
2017-04-26 20:50:15 +05:30

453 lines
14 KiB
C

/*************************************************************************/
/* */
/* Copyright (c) 1994 Stanford University */
/* */
/* All rights reserved. */
/* */
/* Permission is given to use, copy, and modify this software for any */
/* non-commercial purpose as long as this copyright notice is not */
/* removed. All other uses, including redistribution in whole or in */
/* part, are forbidden without prior written permission. */
/* */
/* This software is provided with absolutely no warranty and no */
/* support. */
/* */
/*************************************************************************/
/* shared memory implementation of the multigrid method
implementation uses red-black gauss-seidel relaxation
iterations, w cycles, and the method of half-injection for
residual computation */
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include "decs.h"
/* perform multigrid (w cycles) */
void multig(long my_id)
{
long iter;
double wu;
double errp;
long m;
long minlevel;
long flag1;
long flag2;
long k;
long my_num;
double wmax;
double local_err;
double red_local_err;
double black_local_err;
double g_error;
flag1 = 0;
flag2 = 0;
iter = 0;
m = numlev-1;
wmax = maxwork;
minlevel = minlev;
my_num = my_id;
wu = 0.0;
k = m;
g_error = 1.0e30;
while ((!flag1) && (!flag2)) {
errp = g_error;
iter++;
if (my_num == MASTER) {
multi->err_multi = 0.0;
}
/* barrier to make sure all procs have finished intadd or rescal */
/* before proceeding with relaxation */
#if defined(MULTIPLE_BARRIERS)
BARRIER(bars->error_barrier,nprocs)
#else
BARRIER(bars->barrier,nprocs)
#endif
relax(k,&red_local_err,RED_ITER,my_num);
/* barrier to make sure all red computations have been performed */
#if defined(MULTIPLE_BARRIERS)
BARRIER(bars->error_barrier,nprocs)
#else
BARRIER(bars->barrier,nprocs)
#endif
relax(k,&black_local_err,BLACK_ITER,my_num);
/* compute max local error from red_local_err and black_local_err */
if (red_local_err > black_local_err) {
local_err = red_local_err;
} else {
local_err = black_local_err;
}
/* update the global error if necessary */
LOCK(locks->error_lock)
if (local_err > multi->err_multi) {
multi->err_multi = local_err;
}
UNLOCK(locks->error_lock)
/* a single relaxation sweep at the finest level is one unit of */
/* work */
wu+=pow((double)4.0,(double)k-m);
/* barrier to make sure all processors have checked local error */
#if defined(MULTIPLE_BARRIERS)
BARRIER(bars->error_barrier,nprocs)
#else
BARRIER(bars->barrier,nprocs)
#endif
g_error = multi->err_multi;
/* barrier to make sure master does not cycle back to top of loop */
/* and reset global->err before we read it and decide what to do */
#if defined(MULTIPLE_BARRIERS)
BARRIER(bars->error_barrier,nprocs)
#else
BARRIER(bars->barrier,nprocs)
#endif
if (g_error >= lev_tol[k]) {
if (wu > wmax) {
/* max work exceeded */
flag1 = 1;
fprintf(stderr,"ERROR: Maximum work limit %0.5f exceeded\n",wmax);
exit(-1);
} else {
/* if we have not converged */
if ((k != 1) && (g_error/errp >= 0.6) && (k > minlevel)) {
/* if need to go to coarser grid */
rescal(k,my_num);
/* transfer residual to rhs of coarser grid */
lev_tol[k-1] = 0.3 * g_error;
k = k-1;
putz(k,my_num);
/* make initial guess on coarser grid zero */
g_error = 1.0e30;
}
}
} else {
/* if we have converged at this level */
if (k == m) {
/* if finest grid, we are done */
flag2 = 1;
} else {
/* else go to next finest grid */
intadd(k,my_num);
k++;
g_error = 1.0e30;
}
}
}
if (do_output) {
if (my_num == MASTER) {
printf("iter %ld, level %ld, residual norm %12.8e, work = %7.3f\n", iter,k,multi->err_multi,wu);
}
}
}
/* perform red or black iteration (not both) */
void relax(long k, double *err, long color, long my_num)
{
long i;
long j;
long iend;
long jend;
long oddistart;
long oddjstart;
long evenistart;
long evenjstart;
long oddiendst;
long eveniendst;
long oddjendst;
long evenjendst;
double a;
double h;
double factor;
double maxerr;
double newerr;
double oldval;
double newval;
i = 0;
j = 0;
*err = 0.0;
h = lev_res[k];
/* points whose sum of row and col index is even do a red iteration, */
/* others do a black */
evenistart = gp[my_num].eist[k];
evenjstart = gp[my_num].ejst[k];
oddistart = gp[my_num].oist[k];
oddjstart = gp[my_num].ojst[k];
eveniendst = gp[my_num].eiest[k];
evenjendst = gp[my_num].ejest[k];
oddiendst = gp[my_num].oiest[k];
oddjendst = gp[my_num].ojest[k];
iend = gp[my_num].rel_start_y[k] + gp[my_num].rel_num_y[k];
jend = gp[my_num].rel_start_x[k] + gp[my_num].rel_num_x[k];
factor = 4.0 - eig2 * h * h ;
maxerr = 0.0;
if (color == RED_ITER) {
for (i=evenistart;i<iend;i+=2) {
for (j=evenjstart;j<jend;j+=2) {
a = multi->q_multi[k][i][j+1] + multi->q_multi[k][i][j-1] +
multi->q_multi[k][i-1][j] + multi->q_multi[k][i+1][j] -
multi->rhs_multi[k][i][j] ;
oldval = multi->q_multi[k][i][j];
newval = a / factor;
newerr = oldval - newval;
multi->q_multi[k][i][j] = newval;
if (fabs(newerr) > maxerr) {
maxerr = fabs(newerr);
}
}
}
for (i=oddistart;i<iend;i+=2) {
for (j=oddjstart;j<jend;j+=2) {
a = multi->q_multi[k][i][j+1] + multi->q_multi[k][i][j-1] +
multi->q_multi[k][i-1][j] + multi->q_multi[k][i+1][j] -
multi->rhs_multi[k][i][j] ;
oldval = multi->q_multi[k][i][j];
newval = a / factor;
newerr = oldval - newval;
multi->q_multi[k][i][j] = newval;
if (fabs(newerr) > maxerr) {
maxerr = fabs(newerr);
}
}
}
} else if (color == BLACK_ITER) {
for (i=evenistart;i<iend;i+=2) {
for (j=oddjstart;j<jend;j+=2) {
a = multi->q_multi[k][i][j+1] + multi->q_multi[k][i][j-1] +
multi->q_multi[k][i-1][j] + multi->q_multi[k][i+1][j] -
multi->rhs_multi[k][i][j] ;
oldval = multi->q_multi[k][i][j];
newval = a / factor;
newerr = oldval - newval;
multi->q_multi[k][i][j] = newval;
if (fabs(newerr) > maxerr) {
maxerr = fabs(newerr);
}
}
}
for (i=oddistart;i<iend;i+=2) {
for (j=evenjstart;j<jend;j+=2) {
a = multi->q_multi[k][i][j+1] + multi->q_multi[k][i][j-1] +
multi->q_multi[k][i-1][j] + multi->q_multi[k][i+1][j] -
multi->rhs_multi[k][i][j] ;
oldval = multi->q_multi[k][i][j];
newval = a / factor;
newerr = oldval - newval;
multi->q_multi[k][i][j] = newval;
if (fabs(newerr) > maxerr) {
maxerr = fabs(newerr);
}
}
}
}
*err = maxerr;
}
/* perform half-injection to next coarsest level */
void rescal(long kf, long my_num)
{
long ic;
long if17;
long jf;
long jc;
long krc;
long istart;
long iend;
long jstart;
long jend;
double hf;
double hc;
double s;
double s1;
double s2;
double s3;
double s4;
double factor;
double int1;
double int2;
double i_int_factor;
double j_int_factor;
double int_val;
krc = kf - 1;
hc = lev_res[krc];
hf = lev_res[kf];
istart = gp[my_num].rlist[krc];
jstart = gp[my_num].rljst[krc];
iend = gp[my_num].rlien[krc];
jend = gp[my_num].rljen[krc];
iend = gp[my_num].rel_start_y[krc] + gp[my_num].rel_num_y[krc] - 1;
jend = gp[my_num].rel_start_x[krc] + gp[my_num].rel_num_x[krc] - 1;
factor = 4.0 - eig2 * hf * hf;
if17=2*(istart-1);
for (ic=istart;ic<=iend;ic++) {
if17+=2;
i_int_factor = ic * i_int_coeff[krc] * 0.5;
jf = 2 * (jstart - 1);
for (jc=jstart;jc<=jend;jc++) {
jf+=2;
j_int_factor = jc*j_int_coeff[krc] * 0.5;
/* method of half-injection uses 2.0 instead of 4.0 */
s = multi->q_multi[kf][if17][jf+1] + multi->q_multi[kf][if17][jf-1] +
multi->q_multi[kf][if17-1][jf] + multi->q_multi[kf][if17+1][jf];
s1 = 2.0 * (multi->rhs_multi[kf][if17][jf] - s +
factor * multi->q_multi[kf][if17][jf]);
if ((if17 == 2) || (jf ==2)) {
s2 = 0;
} else {
s = multi->q_multi[kf][if17][jf-1] + multi->q_multi[kf][if17][jf-3] +
multi->q_multi[kf][if17-1][jf-2] + multi->q_multi[kf][if17+1][jf-2];
s2 = 2.0 * (multi->rhs_multi[kf][if17][jf-2] - s +
factor * multi->q_multi[kf][if17][jf-2]);
}
if ((if17 == 2) || (jf ==2)) {
s3 = 0;
} else {
s = multi->q_multi[kf][if17-2][jf+1] + multi->q_multi[kf][if17-2][jf-1] +
multi->q_multi[kf][if17-3][jf] + multi->q_multi[kf][if17-1][jf];
s3 = 2.0 * (multi->rhs_multi[kf][if17-2][jf] - s +
factor * multi->q_multi[kf][if17-2][jf]);
}
if ((if17 == 2) || (jf ==2)) {
s4 = 0;
} else {
s = multi->q_multi[kf][if17-2][jf-1] + multi->q_multi[kf][if17-2][jf-3] +
multi->q_multi[kf][if17-3][jf-2] + multi->q_multi[kf][if17-1][jf-2];
s4 = 2.0 * (multi->rhs_multi[kf][if17-2][jf-2] - s +
factor * multi->q_multi[kf][if17-2][jf-2]);
}
int1 = j_int_factor*s4 + (1.0-j_int_factor)*s3;
int2 = j_int_factor*s2 + (1.0-j_int_factor)*s1;
int_val = i_int_factor*int1+(1.0-i_int_factor)*int2;
multi->rhs_multi[krc][ic][jc] = i_int_factor*int1+(1.0-i_int_factor)*int2;
}
}
}
/* perform interpolation and addition to next finest grid */
void intadd(long kc, long my_num)
{
long ic;
long if17;
long jf;
long jc;
long kf;
long istart;
long jstart;
long iend;
long jend;
double hc;
double hf;
long ifine1;
long ifine2;
long jfine1;
long jfine2;
double int1;
double int2;
double i_int_factor1;
double j_int_factor1;
double i_int_factor2;
double j_int_factor2;
kf = kc + 1;
hc = lev_res[kc];
hf = lev_res[kf];
istart = gp[my_num].iist[kc];
jstart = gp[my_num].ijst[kc];
iend = gp[my_num].iien[kc];
jend = gp[my_num].ijen[kc];
istart = gp[my_num].rel_start_y[kc];
jstart = gp[my_num].rel_start_x[kc];
iend = gp[my_num].rel_start_y[kc] + gp[my_num].rel_num_y[kc] - 1;
jend = gp[my_num].rel_start_x[kc] + gp[my_num].rel_num_x[kc] - 1;
if17 = 2*(istart-1);
for (ic=istart;ic<=iend;ic++) {
if17+=2;
ifine1 = if17-1;
ifine2 = if17;
i_int_factor1= ((imx[kc]-2)-(ic-1)) * (i_int_coeff[kf]);
i_int_factor2= ic * i_int_coeff[kf];
jf = 2*(jstart-1);
for (jc=jstart;jc<=jend;jc++) {
jf+=2;
jfine1 = jf-1;
jfine2 = jf;
j_int_factor1= ((jmx[kc]-2)-(jc-1)) * (j_int_coeff[kf]);
j_int_factor2= jc * j_int_coeff[kf];
int1 = j_int_factor1*multi->q_multi[kc][ic][jc-1] +
(1.0-j_int_factor1)*multi->q_multi[kc][ic][jc];
int2 = j_int_factor1*multi->q_multi[kc][ic-1][jc-1] +
(1.0-j_int_factor1)*multi->q_multi[kc][ic-1][jc];
multi->q_multi[kf][if17-1][jf-1] += i_int_factor1*int2 +
(1.0-i_int_factor1)*int1;
int2 = j_int_factor1*multi->q_multi[kc][ic+1][jc-1] +
(1.0-j_int_factor1)*multi->q_multi[kc][ic+1][jc];
multi->q_multi[kf][if17][jf-1] += i_int_factor2*int2 +
(1.0-i_int_factor2)*int1;
int1 = j_int_factor2*multi->q_multi[kc][ic][jc+1] +
(1.0-j_int_factor2)*multi->q_multi[kc][ic][jc];
int2 = j_int_factor2*multi->q_multi[kc][ic-1][jc+1] +
(1.0-j_int_factor2)*multi->q_multi[kc][ic-1][jc];
multi->q_multi[kf][if17-1][jf] += i_int_factor1*int2 +
(1.0-i_int_factor1)*int1;
int2 = j_int_factor2*multi->q_multi[kc][ic+1][jc+1] +
(1.0-j_int_factor2)*multi->q_multi[kc][ic+1][jc];
multi->q_multi[kf][if17][jf] += i_int_factor2*int2 +
(1.0-i_int_factor2)*int1;
}
}
}
/* initialize a grid to zero in parallel */
void putz(long k, long my_num)
{
long i;
long j;
long istart;
long jstart;
long iend;
long jend;
istart = gp[my_num].pist[k];
jstart = gp[my_num].pjst[k];
iend = gp[my_num].pien[k];
jend = gp[my_num].pjen[k];
for (i=istart;i<=iend;i++) {
for (j=jstart;j<=jend;j++) {
multi->q_multi[k][i][j] = 0.0;
}
}
}