#include "tools.h"
void CpdasymvU(N, alpha, A, IA, JA, descA, X0, IX0, JX0, descX0, incX0, beta, 
               Y0, IY0, JY0, descY0, incY0)
int N;
double alpha;
double *A;
int IA;
int JA;
int *descA;
double *X0;
int IX0;
int JX0;
int *descX0;
int incX0;
double beta;
double *Y0;
int IY0;
int JY0;
int *descY0;
int  incY0;
/*
 *
 *             ======(Yr)======
 *             ======(Xr)======
 *                    N
 *              ---------------
 *  |    |     |\_             |          A - N x N
 *  |    |     |  \_           |          Y - N x 1
 *  |    |     |    \_   (A)   |          X - 1 x N
 * (Yc) (Xr) N |      \_       |
 *  |    |     |        \_     |          X will be replicated on every
 *  |    |     |          \_   |            process row and column
 *  |    |     |            \  |          Y will have space on every
 *  |    |     |             \_|            process row and column
 *              ---------------
 */
{
/*
 * .. External routines ..
 */
   char *ptop();
   void pchkmat();
   void pchkvec();
   void pberror_();
   void Cinfog2l();
   int Cnumroc2();
   void Cblacs_gridinfo();
   void Cdgebs2d();
   void Cdgebr2d();
   void Cdgsum2d();
   F_INTG_FCT dgemv_();
   void Cpdscal1();
   void Cpdcopy1();
   void Cpdaxpy1();

   F_CHAR trans;
   char *top;
   int ctxt, nprow, npcol, myrow, mycol;
   int Ir, Ic, Jr, Jc, inc;
   int descXr[DLEN_], descXc[DLEN_], descYr[DLEN_], descYc[DLEN_];
   int i, j, k, kb, nb, lld, LOCp, LOCq, LOCp2, LOCq2, info=0, one=1, I=0;
   int arow, acol, currow, curcol;
   double *Xr, *Xc, *Yr, *Yc, *xr, *xc, *yr, *yc, *yr2, *yc2;
   double *absA, *a, *ar, *ac, *aa;
   double zero=0;

   ctxt = descA[CTXT_];
   Cblacs_gridinfo(descA[CTXT_], &nprow, &npcol, &myrow, &mycol);
/*
 * Scale Y0 by beta: Y0 = beta * Y0; this allows us to later add in
 * alpha*A*x to get Y0 = alpha*A*x + Y0*beta
 */
   Cpdscal1(N, beta, Y0, IY0, JY0, descY0, incY0);
/*
 * Get local information about our matrix
 */
   Cinfog2l(IA, JA, descA, nprow, npcol, myrow, mycol, &i, &j, &arow, &acol);
   a = &A[ i+j*descA[LLD_] ];
   nb = descA[NB_];
   lld = descA[LLD_];
   LOCp2 = LOCp = Cnumroc2(N, IA, nb, myrow, descA[RSRC_], nprow);
   LOCq2 = LOCq = Cnumroc2(N, JA, nb, mycol, descA[CSRC_], npcol);

/*
 * Set up Xr and Xc, and copy abs( X ) to one row/col.
 */
   inc = 1;
   Ir = Jc = 0;
   Jr = JA % nb;
   Ic = IA % nb;
   i = Jr + LOCq;
   j = Ic + LOCp;
   Mmalloc(Xr, double, i+j, info, ctxt);
   Xc = &Xr[i];
   Mdescset(descXr, 1, N + Jr, nb, nb,
            MCindxg2p(IX0, descX0[MB_], descX0[RSRC_], nprow), acol, ctxt, 1);
   Mdescset(descXc, N + Ic, 1, nb, nb, arow,
            MCindxg2p(JX0, descX0[NB_], descX0[CSRC_], npcol), ctxt, MAX(j,1));
   Cpdcopy1(N, X0, IX0, JX0, descX0, incX0, Xr, Ir, Jr, descXr, inc);
   Cpdcopy1(N, X0, IX0, JX0, descX0, incX0, Xc, Ic, Jc, descXc, inc);
/*
 * Set up Yr and Yc
 */
   Mmalloc(Yr, double, i+j, info, ctxt);
   Yc = &Yr[i];
   Mdescset(descYc, N + Ic, 1, nb, nb, arow,
            MCindxg2p(JY0, descY0[NB_], descY0[CSRC_], npcol), ctxt, MAX(j,1));
   Mdescset(descYr, 1, N + Jr, nb, nb,
            MCindxg2p(IY0, descY0[MB_], descY0[RSRC_], nprow), acol, ctxt, 1);
/*
 * Get memory for absA
 */
   Mmalloc(absA, double, nb*MAX(LOCp, LOCq), info, ctxt);

/*
 * Set local pointers
 */
   if (myrow == arow)
   {
      xc = &Xc[Ic];
      yc2 = yc = &Yc[Ic];
   }
   else
   {
      xc = Xc;
      yc2 = yc = Yc;
   }
   if (mycol == acol)
   {
      xr = &Xr[Jr];
      yr2 = yr = &Yr[Jr];
   }
   else
   {
      xr = Xr;
      yr2 = yr = Yr;
   }
   ac = ar = a;

/*
 * Broadcast X so all rows and columns have a copy
 */
   top = ptop("B", "C", "!");
   if (myrow == descXr[RSRC_])
   {
      for (i=0; i < LOCq; i++) xr[i] = ABS( xr[i] );
      Cdgebs2d(ctxt, "c", top, LOCq, 1, xr, LOCq);
   }
   else Cdgebr2d(ctxt, "c", top, LOCq, 1, xr, LOCq, descXr[RSRC_], mycol);
   top = ptop("B", "R", "!");
   if (mycol == descXc[CSRC_])
   {
      for (i=0; i < LOCp; i++) xc[i] = ABS( xc[i] );
      Cdgebs2d(ctxt, "r", top, LOCp, 1, xc, LOCp);
   }
   else Cdgebr2d(ctxt, "r", top, LOCp, 1, xc, LOCp, myrow, descXc[CSRC_]);
/*
 * Loop over global matrix
 */
   kb = nb - Ic;
   kb = MIN(kb, N);
   currow = arow;
   curcol = acol;
   LOCp = 0;
   do
   {
      if (mycol == curcol)
      {
         trans = C2F_CHAR("T");
         aa = absA;
         a = ac;
         if (myrow == currow)  /* I have diagonal blocks */
         {
/*
 *          Do block along column
 */
            for (j=0; j != kb; j++)
            {
               k = LOCp + j;
               for (i=0; i != k; i++) aa[i] = ABS( a[i] );
               for (k=LOCp+kb; i != k; i++) aa[i] = 0.0;
               aa += k;
               a += lld;
            }
            LOCp += kb;
            dgemv_(trans, &LOCp, &kb, &alpha, absA, &LOCp, xc, &one, &zero,
                   yr, &one);
/*
 *          Do block along row
 */
            trans = C2F_CHAR("N");
            aa = absA;
            a = ar;
            for (j=0; j != kb; j++)  /* copy diagonal block */
            {
               for (i=0; i <= j; i++) aa[i] = ABS( a[i] );
               for (; i != kb; i++) aa[i] = 0.0;
               aa += kb;
               a += lld;
            }
            for (j=LOCq-kb; j; j--)
            {
               for (i=0; i != kb; i++) aa[i] = ABS( a[i] );
               aa += kb;
               a += lld;
            }
            dgemv_(trans, &kb, &LOCq, &alpha, absA, &kb, xr, &one, &zero,
                   yc, &one);
            ar += kb*(lld + 1);
            yc += kb;
         }
         else
         {
            ar += kb*lld;
            if (LOCp)
            {
               for (j=kb; j; j--)
               {
                  for (i=0; i != LOCp; i++) aa[i] = ABS( a[i] );
                  aa += LOCp;
                  a += lld;
               }
               dgemv_(trans, &LOCp, &kb, &alpha, absA, &LOCp, xc, &one, &zero,
                      yr, &one);
            }
            else for (i=0; i != kb; i++) yr[i] = 0.0;
         }
         LOCq -= kb;
         xr += kb;
         yr += kb;
         ac += kb*lld;
      }
      else if (myrow == currow)
      {
         LOCp += kb;
         if (LOCq)
         {
            trans = C2F_CHAR("N");
            a = ar;
            aa = absA;
            for (j=LOCq; j; j--)
            {
               for (i=0; i != kb; i++) aa[i] = ABS( a[i] );
               aa += kb;
               a += lld;
            }
            dgemv_(trans, &kb, &LOCq, &alpha, absA, &kb, xr, &one, &zero,
                   yc, &one);
         }
         else for (i=0; i != kb; i++) yc[i] = 0.0;
         ar += kb;
         yc += kb;
      }
      I += kb;
      kb = MIN(nb, N-I);
      if (++currow == nprow) currow=0;
      if (++curcol == npcol) curcol=0;
   }
   while (I != N);
   free(Xr);
   free(absA);
/*
 * Collect distributed Yr and Yc
 */
   if (LOCq2 != 0)
   {
      top = ptop("C", "C", "!");
      Cdgsum2d(ctxt, "col", top, LOCq2, 1, yr2, LOCq2, descYr[RSRC_], mycol);
   }
   if (LOCp2 != 0)
   {
      top = ptop("C", "R", "!");
      Cdgsum2d(ctxt, "row", top, LOCp2, 1, yc2, LOCp2, myrow, descYr[CSRC_]);
   }
/*
 * Set y = yc + yr
 */
   if (incY0 == descY0[M_])
   {
      Cpdaxpy1(N, 1.0, Yc, Ic, Jc, descYc, inc, Yr, Ir, Jr, descYr, inc);
      Cpdaxpy1(N, 1.0, Yr, Ir, Jr, descYr, inc, Y0, IY0, JY0, descY0, incY0);
   }
   else
   {
      Cpdaxpy1(N, 1.0, Yr, Ir, Jr, descYr, inc, Yc, Ic, Jc, descYc, inc);
      Cpdaxpy1(N, 1.0, Yc, Ic, Jc, descYc, inc, Y0, IY0, JY0, descY0, incY0);
   }
   free(Yr);
}
