#include "tools.h"
void CpsatrmvLN(UNIT, N, alpha, A, IA, JA, descA, X, IX, JX, descX, incX, beta, 
                Y, IY, JY, descY, incY)
int UNIT;
int N;
float alpha;
float *A;
int IA;
int JA;
int *descA;
float *X;
int IX;
int JX;
int *descX;
int incX;
float beta;
float *Y;
int IY;
int JY;
int *descY;
int  incY;
/*
 *
 *             ======(Xr)======
 *
 *                    N
 *              ---------------
 *       |     |\_             |          A - N x N
 *       |     |  \_           |          Y - N x 1
 *       |     |    \_         |          X - 1 x N
 *      (Yc) N |      \_       |
 *       |     |        \_     |          X will be replicated on every
 *       |     |  (A)     \_   |            process row
 *       |     |            \  |          Y will have space on every
 *       |     |             \_|            process column
 *              ---------------
 */
{
/*
 * .. External routines ..
 */
   char *ptop();
   void pberror_();
   void Cinfog2l();
   int Cnumroc2();
   void Cblacs_gridinfo();
   void Csgebs2d();
   void Csgebr2d();
   void Csgsum2d();
   F_INTG_FCT sgemv_();
   void Cpsscal1();
   void Cpscopy1();
   void Cpsaxpy1();

   F_CHAR trans;
   char *top;
   int Ir, Jr, Ic, Jc, i, j, kb, nb, lld, LOCp, LOCq, one=1, I=0;
   int ctxt, nprow, npcol, myrow, mycol, arow, acol, currow, curcol;
   int descXr[DLEN_], descYc[DLEN_];
   float *a, *aa, *ar, *absA, *Xr, *Yc, *xr, *yc, *yc2, zero=0.0;

   trans = C2F_CHAR("N");
   ctxt = descA[CTXT_];
   Cblacs_gridinfo(descA[CTXT_], &nprow, &npcol, &myrow, &mycol);
/*
 * Scale Y by beta: Y = beta * Y; this allows us to later add in
 * alpha*A*x to get Y = alpha*A*x + Y*beta
 */
   Cpsscal1(N, beta, Y, IY, JY, descY, incY);
/*
 * Get local information about our matrix
 */
   Cinfog2l(IA, JA, descA, nprow, npcol, myrow, mycol, &i, &j, &arow, &acol);
   ar = &A[ i+j*descA[LLD_] ];
   nb = descA[NB_];
   lld = descA[LLD_];
   LOCp = Cnumroc2(N, IA, nb, myrow, descA[RSRC_], nprow);
   LOCq = Cnumroc2(N, JA, nb, mycol, descA[CSRC_], npcol);
/*
 * Set up Xr, Yc, and absA
 */
   Ir = Jc = 0;
   Ic = IA % nb;
   Jr = JA % nb;
   i = Ic + LOCp;
   j = Jr + LOCq;
   Mmalloc(Yc, float, i+j+nb*LOCq, kb, ctxt);
   Xr = &Yc[i];
   absA = &Xr[j];
   Mdescset(descXr, 1, N+Jr, nb, nb,
            MCindxg2p(IX, descX[MB_], descX[RSRC_], nprow), acol, ctxt, 1);
   Mdescset(descYc, N+Ic, 1, nb, nb, arow,
            MCindxg2p(JY, descY[NB_], descY[CSRC_], npcol), ctxt, MAX(1,i));
   if (mycol == acol) xr = &Xr[Jr];
   else xr = Xr;
   if (myrow == arow) yc2 = yc = &Yc[Ic];
   else yc2 = yc = Yc;
/*
 * Copy X and broadcast it to all process rows
 */
   Cpscopy1(N, X, IX, JX, descX, incX, Xr, Ir, Jr, descXr, one);
   top = ptop("B", "C", "!");
   if (myrow == descXr[RSRC_])
   {
      for (i=0; i != LOCq; i++) xr[i] = ABS( xr[i] );
      Csgebs2d(ctxt, "c", top, LOCq, 1, xr, LOCq);
   }
   else Csgebr2d(ctxt, "c", top, LOCq, 1, xr, LOCq, descXr[RSRC_], mycol);

   kb = nb - Ic;
   kb = MIN(kb, N);
   currow = arow;
   curcol = acol;
   LOCq = 0;
   do
   {
      if (myrow == currow)
      {
         aa = absA;
         a = ar;
         for (j=LOCq; j; j--)   /* copy all but diagonal block */
         {
            for (i=0; i != kb; i++) aa[i] = ABS( a[i] );
            a += lld;
            aa += kb;
         }
         if (mycol == curcol) /* I have diagonal block */
         {
            for (j=0; j != kb; j++)
            {
               for (i=0; i != j; i++) aa[i] = 0.0;
               if (UNIT) aa[i] = 1.0;
               else aa[i] = ABS( a[i] );
               for (i++; i < kb; i++) aa[i] = ABS( a[i] );
               a += lld;
               aa += kb;
            }
            LOCq += kb;
         }
         else if (!LOCq) for (i=0; i != kb; i++) yc[i] = 0.0;

         sgemv_(trans, &kb, &LOCq, &alpha, absA, &kb, xr, &one, &zero,
                yc, &one);
         ar += kb;
         yc += kb;
      }
      else if (mycol == curcol) LOCq += kb;

      I += kb;
      kb = MIN(nb, N-I);
      if (++currow == nprow) currow=0;
      if (++curcol == npcol) curcol=0;
   }
   while (I != N);
/*
 * Collect distributed Y
 */
   if (LOCp)
   {
      top = ptop("C", "R", "!");
      Csgsum2d(ctxt, "row", top, LOCp, 1, yc2, LOCp, myrow, descYc[CSRC_]);
   }
/*
 * Set y = A*x + beta*y (Y has beta*y, and Yc has A*x)
 */
   Cpsaxpy1(N, 1.0, Yc, Ic, Jc, descYc, one, Y, IY, JY, descY, incY);
   if (Yc) free(Yc);
}
