/* gem5/splash2/codes/apps/ocean/non_contiguous_partitions/multi.C */

/*************************************************************************/
/*                                                                       */
/*  Copyright (c) 1994 Stanford University                               */
/*                                                                       */
/*  All rights reserved.                                                 */
/*                                                                       */
/*  Permission is given to use, copy, and modify this software for any   */
/*  non-commercial purpose as long as this copyright notice is not       */
/*  removed.  All other uses, including redistribution in whole or in    */
/*  part, are forbidden without prior written permission.                */
/*                                                                       */
/*  This software is provided with absolutely no warranty and no         */
/*  support.                                                             */
/*                                                                       */
/*************************************************************************/

/* shared memory implementation of the multigrid method
   implementation uses red-black gauss-seidel relaxation
   iterations, w cycles, and the method of half-injection for
   residual computation */

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <ctime>

#include "decs.h"

/* perform multigrid (w cycles)                                     */
/*                                                                  */
/* my_id is the calling processor's id; it is compared against      */
/* MASTER and passed on to relax/rescal/putz/intadd.                */
/*                                                                  */
/* Starting on level numlev-1, every pass of the loop does one red  */
/* and one black relaxation sweep on the current level k, folds the */
/* per-processor errors into multi->err_multi, and then either      */
/* stays on level k, moves to level k-1 (rescal + putz), or moves   */
/* to level k+1 (intadd).  The loop ends once the error on level    */
/* numlev-1 is below lev_tol[numlev-1]; if the accumulated work     */
/* exceeds maxwork before that, the process exits with status -1.   */
void multig(long my_id)
{
   long iter;
   double wu;
   double errp;
   long m;
   long minlevel;
   long flag1;
   long flag2;
   long k;
   long my_num;
   double wmax;
   double local_err;
   double red_local_err;
   double black_local_err;
   double g_error;

/* flag1: work limit exceeded, flag2: converged on level m          */
   flag1 = 0;
   flag2 = 0;
   iter = 0;
   m = numlev-1;
   wmax = maxwork;
   minlevel = minlev;
   my_num = my_id;
   wu = 0.0;

/* start on level m with a huge "previous" error so that the first  */
/* ratio test g_error/errp below cannot trigger a move to level k-1 */
   k = m;
   g_error = 1.0e30;
   while ((!flag1) && (!flag2)) {
/* remember the error of the previous pass for the ratio test       */
     errp = g_error;
     iter++;
     if (my_num == MASTER) {
       multi->err_multi = 0.0;
     }

/* barrier to make sure all procs have finished intadd or rescal   */
/* before proceeding with relaxation                               */

#if defined(MULTIPLE_BARRIERS)
     BARRIER(bars->error_barrier,nprocs)
#else
     BARRIER(bars->barrier,nprocs)
#endif
     relax(k,&red_local_err,RED_ITER,my_num);

/* barrier to make sure all red computations have been performed   */

#if defined(MULTIPLE_BARRIERS)
     BARRIER(bars->error_barrier,nprocs)
#else
     BARRIER(bars->barrier,nprocs)
#endif
     relax(k,&black_local_err,BLACK_ITER,my_num);

/* compute max local error from red_local_err and black_local_err  */

     if (red_local_err > black_local_err) {
       local_err = red_local_err;
     } else {
       local_err = black_local_err;
     }

/* update the global error if necessary                            */

     LOCK(locks->error_lock)
     if (local_err > multi->err_multi) {
       multi->err_multi = local_err;
     }
     UNLOCK(locks->error_lock)

/* a single relaxation sweep at the finest level is one unit of    */
/* work; a sweep on level k adds 4^(k-m), i.e. 1 when k == m and   */
/* a factor of 4 less for every level below m                      */

     wu+=pow((double)4.0,(double)k-m);

/* barrier to make sure all processors have checked local error    */

#if defined(MULTIPLE_BARRIERS)
     BARRIER(bars->error_barrier,nprocs)
#else
     BARRIER(bars->barrier,nprocs)
#endif
     g_error = multi->err_multi;

/* barrier to make sure master does not cycle back to top of loop  */
/* and reset multi->err_multi before we read it and decide what to */
/* do                                                              */

#if defined(MULTIPLE_BARRIERS)
     BARRIER(bars->error_barrier,nprocs)
#else
     BARRIER(bars->barrier,nprocs)
#endif
     if (g_error >= lev_tol[k]) {
       if (wu > wmax) {
/* max work exceeded; exit() ends the process here, so the flag1   */
/* assignment is never observed by the loop condition              */
         flag1 = 1;
         fprintf(stderr,"ERROR: Maximum work limit %0.5f exceeded\n",wmax);
         exit(-1);
       } else {
/* if we have not converged: go coarser only when the error shrank */
/* by less than a factor of 0.6 since the previous pass and level  */
/* k is above both level 1 and minlevel; otherwise relax on k again */
         if ((k != 1) && (g_error/errp >= 0.6) && (k > minlevel)) {
/* if need to go to coarser grid                                   */
           rescal(k,my_num);
/* transfer residual to rhs of coarser grid; the coarser level's   */
/* tolerance is set to 0.3 times the current global error          */
           lev_tol[k-1] = 0.3 * g_error;
           k = k-1;
           putz(k,my_num);
/* make initial guess on coarser grid zero; reset g_error so the   */
/* next pass starts with a huge "previous" error again             */
           g_error = 1.0e30;
         }
       }
     } else {
/* if we have converged at this level                              */
       if (k == m) {
/* if finest grid, we are done                                     */
         flag2 = 1;
       } else {
/* else go to next finest grid                                     */
         intadd(k,my_num);
         k++;
         g_error = 1.0e30;
       }
     }
   }
/* only MASTER reports, and only when do_output is set             */
   if (do_output) {
     if (my_num == MASTER) {
       printf("iter %ld, level %ld, residual norm %12.8e, work = %7.3f\n", iter,k,multi->err_multi,wu);
     }
   }
}

/* Relax one interleaved sub-lattice of level k: rows i0, i0+2, ...   */
/* below iend and columns j0, j0+2, ... below jend.  Every visited    */
/* point is replaced by (sum of its four neighbours - rhs) / factor.  */
/* Returns the larger of maxerr and the largest |old - new| produced  */
/* by this sweep.                                                     */
static double relax_sublattice(long k, long i0, long j0, long iend,
                               long jend, double factor, double maxerr)
{
   long i;
   long j;
   double a;
   double oldval;
   double newval;
   double abserr;

   for (i=i0;i<iend;i+=2) {
     for (j=j0;j<jend;j+=2) {
/* keep this summation order: it fixes the floating-point result      */
       a = multi->q_multi[k][i][j+1] + multi->q_multi[k][i][j-1] +
           multi->q_multi[k][i-1][j] + multi->q_multi[k][i+1][j] -
           multi->rhs_multi[k][i][j] ;
       oldval = multi->q_multi[k][i][j];
       newval = a / factor;
       multi->q_multi[k][i][j] = newval;
       abserr = fabs(oldval - newval);
       if (abserr > maxerr) {
         maxerr = abserr;
       }
     }
   }
   return maxerr;
}

/* perform red or black iteration (not both)                          */
/*                                                                    */
/*   k       grid level to relax                                      */
/*   err     out: largest |old - new| over the points this call       */
/*           updated; 0.0 if color is neither RED_ITER nor BLACK_ITER */
/*   color   RED_ITER or BLACK_ITER                                   */
/*   my_num  index into gp[] selecting the caller's start/extent data */
/*                                                                    */
/* A red iteration sweeps the (eist,ejst) and (oist,ojst) lattices, a */
/* black iteration the (eist,ojst) and (oist,ejst) lattices, each     */
/* with stride 2 in both directions.                                  */
void relax(long k, double *err, long color, long my_num)
{
   long evenistart;
   long evenjstart;
   long oddistart;
   long oddjstart;
   long iend;
   long jend;
   double h;
   double factor;
   double maxerr;

   h = lev_res[k];

   evenistart = gp[my_num].eist[k];
   evenjstart = gp[my_num].ejst[k];
   oddistart = gp[my_num].oist[k];
   oddjstart = gp[my_num].ojst[k];

/* exclusive upper bounds of the caller's index range on this level   */
   iend = gp[my_num].rel_start_y[k] + gp[my_num].rel_num_y[k];
   jend = gp[my_num].rel_start_x[k] + gp[my_num].rel_num_x[k];

   factor = 4.0 - eig2 * h * h ;
   maxerr = 0.0;
   if (color == RED_ITER) {
     maxerr = relax_sublattice(k,evenistart,evenjstart,iend,jend,factor,maxerr);
     maxerr = relax_sublattice(k,oddistart,oddjstart,iend,jend,factor,maxerr);
   } else if (color == BLACK_ITER) {
     maxerr = relax_sublattice(k,evenistart,oddjstart,iend,jend,factor,maxerr);
     maxerr = relax_sublattice(k,oddistart,evenjstart,iend,jend,factor,maxerr);
   }
   *err = maxerr;
}

/* Residual of fine level kf at point (i,j), scaled by 2.0 (the       */
/* method of half-injection uses 2.0 instead of 4.0).                 */
static double half_injection_residual(long kf, long i, long j, double factor)
{
   double s;

/* keep this summation order: it fixes the floating-point result      */
   s = multi->q_multi[kf][i][j+1] + multi->q_multi[kf][i][j-1] +
       multi->q_multi[kf][i-1][j] + multi->q_multi[kf][i+1][j];
   return 2.0 * (multi->rhs_multi[kf][i][j] - s +
                 factor * multi->q_multi[kf][i][j]);
}

/* perform half-injection to next coarsest level                      */
/*                                                                    */
/*   kf      fine level; the result is stored in rhs_multi[kf-1]      */
/*   my_num  index into gp[] selecting the caller's index range       */
/*                                                                    */
/* For each coarse point (ic,jc) in the caller's range the scaled     */
/* fine-level residuals at (2ic,2jc), (2ic,2jc-2), (2ic-2,2jc) and    */
/* (2ic-2,2jc-2) are blended with weights derived from i_int_coeff    */
/* and j_int_coeff of the coarse level.                               */
void rescal(long kf, long my_num)
{
   long ic;
   long jc;
   long if17;
   long jf;
   long krc;
   long istart;
   long jstart;
   long iend;
   long jend;
   double hf;
   double factor;
   double s1;
   double s2;
   double s3;
   double s4;
   double int1;
   double int2;
   double i_int_factor;
   double j_int_factor;

   krc = kf - 1;
   hf = lev_res[kf];

/* start indices come from rlist/rljst, inclusive end indices from    */
/* the rel_start/rel_num extents of the coarse level                  */
   istart = gp[my_num].rlist[krc];
   jstart = gp[my_num].rljst[krc];
   iend = gp[my_num].rel_start_y[krc] + gp[my_num].rel_num_y[krc] - 1;
   jend = gp[my_num].rel_start_x[krc] + gp[my_num].rel_num_x[krc] - 1;

   factor = 4.0 - eig2 * hf * hf;

   for (ic=istart;ic<=iend;ic++) {
/* fine-level row that coincides with coarse row ic                   */
     if17 = 2 * ic;
     i_int_factor = ic * i_int_coeff[krc] * 0.5;
     for (jc=jstart;jc<=jend;jc++) {
/* fine-level column that coincides with coarse column jc             */
       jf = 2 * jc;
       j_int_factor = jc*j_int_coeff[krc] * 0.5;

       s1 = half_injection_residual(kf,if17,jf,factor);
       if ((if17 == 2) || (jf ==2)) {
/* on fine row 2 or fine column 2 at least one of the shifted         */
/* stencils would index row or column -1; all three shifted           */
/* residuals are taken as zero there                                  */
         s2 = 0;
         s3 = 0;
         s4 = 0;
       } else {
         s2 = half_injection_residual(kf,if17,jf-2,factor);
         s3 = half_injection_residual(kf,if17-2,jf,factor);
         s4 = half_injection_residual(kf,if17-2,jf-2,factor);
       }

/* blend along j within fine rows if17-2 (int1) and if17 (int2),      */
/* then blend the two rows along i                                    */
       int1 = j_int_factor*s4 + (1.0-j_int_factor)*s3;
       int2 = j_int_factor*s2 + (1.0-j_int_factor)*s1;
       multi->rhs_multi[krc][ic][jc] = i_int_factor*int1+(1.0-i_int_factor)*int2;
     }
   }
}

/* perform interpolation and addition to next finest grid             */
/*                                                                    */
/*   kc      coarse level; the correction is added to q_multi[kc+1]   */
/*   my_num  index into gp[] selecting the caller's index range       */
/*                                                                    */
/* Every coarse point (ic,jc) in the caller's range adds an           */
/* interpolated value to the four fine points (2ic-1,2jc-1),          */
/* (2ic,2jc-1), (2ic-1,2jc) and (2ic,2jc).  Factor "1" weights the    */
/* neighbour at index-1, factor "2" the neighbour at index+1.         */
void intadd(long kc, long my_num)
{
   long ic;
   long jc;
   long if17;
   long jf;
   long kf;
   long istart;
   long jstart;
   long iend;
   long jend;
   double here;
   double above;
   double below;
   double i_int_factor1;
   double j_int_factor1;
   double i_int_factor2;
   double j_int_factor2;

   kf = kc + 1;

/* inclusive index range of the caller on the coarse level            */
   istart = gp[my_num].rel_start_y[kc];
   jstart = gp[my_num].rel_start_x[kc];
   iend = gp[my_num].rel_start_y[kc] + gp[my_num].rel_num_y[kc] - 1;
   jend = gp[my_num].rel_start_x[kc] + gp[my_num].rel_num_x[kc] - 1;

   for (ic=istart;ic<=iend;ic++) {
/* fine-level row that coincides with coarse row ic                   */
     if17 = 2 * ic;
     i_int_factor1= ((imx[kc]-2)-(ic-1)) * (i_int_coeff[kf]);
     i_int_factor2= ic * i_int_coeff[kf];

     for (jc=jstart;jc<=jend;jc++) {
/* fine-level column that coincides with coarse column jc             */
       jf = 2 * jc;
       j_int_factor1= ((jmx[kc]-2)-(jc-1)) * (j_int_coeff[kf]);
       j_int_factor2= jc * j_int_coeff[kf];

/* fine column jf-1: blend coarse columns jc-1 and jc                 */
       here = j_int_factor1*multi->q_multi[kc][ic][jc-1] +
              (1.0-j_int_factor1)*multi->q_multi[kc][ic][jc];
       above = j_int_factor1*multi->q_multi[kc][ic-1][jc-1] +
               (1.0-j_int_factor1)*multi->q_multi[kc][ic-1][jc];
       multi->q_multi[kf][if17-1][jf-1] += i_int_factor1*above +
              (1.0-i_int_factor1)*here;
       below = j_int_factor1*multi->q_multi[kc][ic+1][jc-1] +
               (1.0-j_int_factor1)*multi->q_multi[kc][ic+1][jc];
       multi->q_multi[kf][if17][jf-1] += i_int_factor2*below +
              (1.0-i_int_factor2)*here;

/* fine column jf: blend coarse columns jc+1 and jc                   */
       here = j_int_factor2*multi->q_multi[kc][ic][jc+1] +
              (1.0-j_int_factor2)*multi->q_multi[kc][ic][jc];
       above = j_int_factor2*multi->q_multi[kc][ic-1][jc+1] +
               (1.0-j_int_factor2)*multi->q_multi[kc][ic-1][jc];
       multi->q_multi[kf][if17-1][jf] += i_int_factor1*above +
              (1.0-i_int_factor1)*here;
       below = j_int_factor2*multi->q_multi[kc][ic+1][jc+1] +
               (1.0-j_int_factor2)*multi->q_multi[kc][ic+1][jc];
       multi->q_multi[kf][if17][jf] += i_int_factor2*below +
              (1.0-i_int_factor2)*here;
     }
   }
}

/* Zero the part of the level-k solution grid (multi->q_multi[k])     */
/* that gp[my_num] assigns to the caller: rows pist[k]..pien[k] and   */
/* columns pjst[k]..pjen[k], both ends inclusive.                     */
void putz(long k, long my_num)
{
   const long row_first = gp[my_num].pist[k];
   const long col_first = gp[my_num].pjst[k];
   const long row_last = gp[my_num].pien[k];
   const long col_last = gp[my_num].pjen[k];
   long row = row_first;

   while (row <= row_last) {
     long col = col_first;

     while (col <= col_last) {
       multi->q_multi[k][row][col] = 0.0;
       col++;
     }
     row++;
   }
}