public class MM
{
  // Unblocked matrix multiply: C = C + A * B
  // Using border, this only requires that A, B, C have the same size and strides
  void MMunblocked(double array<2> C, double array<2> A, double array<2> B)
  {
    RectDomain<2> d = C.domain;
    Point<2> upperleft = d.min();
    RectDomain<2> Arow0 = A.domain.border(1, -1, 0);
    RectDomain<2> Bcolumn0 = B.domain.border(1, -2, 0);

    foreach (c within d)
      {
	double sum = C[c];

	// Do the dot product of a row of A with a column of B
	foreach (a within Arow0 + Point<2>.direction(0, c[0] - upperleft[0]),
		 b within Bcolumn0 + Point<2>.direction(1, c[1] - upperleft[1]))
	  sum += A[a] * B[b];
	C[c] = sum;
      }
  }

  // Parallel blocked matrix multipy: C = A * B
  void all_MM(ParallelMatrix C, ParallelMatrix A, ParallelMatrix B)
  {
    RectDomain<2> d = C.domain();
    Point<2> upperleft = d.min();
    RectDomain<2> Arow0 = A.blockDomain().border(1, -1, 0);
    RectDomain<2> Bcolumn0 = B.blockDomain().border(1, -2, 0);
    double array<2> blk1, blk2;
    RectDomain<2> blkDomain = C.singleBlockDomain();

    // Storage for local copies of remote blocks
    blk1 = new double array[blkDomain];
    blk2 = new double array[blkDomain];

    foreach (c within C.localBlockDomain())
      {
	double array<2> Cblock = C.fetchBlock(c, null);

	foreach (cblk within Cblock.domain) Cblock[cblk] = 0;

	// Sum the product of a row of blocks of A with a column of blocks of B
	// (remote blocks are copied locally first)
	foreach (a within Arow0 + Point<2>.direction(0, c[0] - upperleft[0]),
		 b within Bcolumn0 + Point<2>.direction(1, c[1] - upperleft[1]))
	  MMunblocked(Cblock, A.fetchBlock(a, blk1), B.fetchBlock(b, blk2));
      }
  }
}

// A blocked, parallel matrix. Uses a blocked row distribution, but that
// is easily changed.
public final class ParallelMatrix
{
  final public int blksize = 16;
  final Point<2> blksizep = Point<2>.all(blksize);

  private int n, m;
  private RectDomain<2> fullDomain, blockDomain, localDomain, localBlockDomain;
  private double array<2> array<2> blocks, localBlocks;

  static private int min(int i1, int i2) { return i1 < i2 ? i1 : i2; }
  static private int max(int i1, int i2) { return i1 > i2 ? i1 : i2; }

  // Return the point formed by component-wise max of each component
  static private Point<2> limit(Point<2> p1, Point<2> p2)
  {
    return [ max(p1[0], p2[0]), max(p1[1], p2[1]) ];
  }

  // Return the domain for the block of d that starts at base
  static private RectDomain<2> makeBlockDomain(RectDomain<2> d, Point<2> base)
  {
    Point<2> stride = d.stride();

    return [ base : 
	     limit(d.max(), base + stride * (blksize - 1)) :
	     stride ];
  }

  // Allocate the n x m parallel array
  public void all_allocate(int _n, int _m)
  {
    int array<1> team = Ti.myTeam();
    // Should probably be a Ti.nprocs() method
    int nprocs = team.domain.max()[0] - team.domain.min()[0] + 1;

    n = _n; m = _m;

    // Compute the global domains
    fullDomain = [ [0, 0] : [n - 1, m - 1] ];
    blockDomain = (fullDomain / blksizep) * blksizep;
    blocks = new double array<2> array[blockDomain];

    // Divide the blocks amongst processes in a blocked-row distribution
    int nblocks = (n + blksize - 1) / blksize;
    int localnblocks = nblocks / nprocs;
    int missing = nblocks * - localnblocks * nprocs;
    int firstlocal;
    if (Ti.thisProc() < missing)
      {
	localnblocks++;
	firstlocal = Ti.thisProc() * localnblocks;
      }
    else
      firstlocal = Ti.thisProc() * localnblocks + missing;

    // And compute the local domains
    localDomain = [ [ firstlocal * blksize, 0] :
		    [ min((firstlocal + localnblocks) * blksize, n - 1), m ] ];
    localBlockDomain = (localDomain / blksizep) * blksizep;
    localBlocks = new double array<2> array[localBlockDomain];

    // Allocate the local blocks. For a blocked row domain, we could
    // allocate one local block instead, and construct the blocks
    // either here, or on the fly in block/fetchBlock. This code is
    // left intentionally simple and general.
    foreach (b within localBlockDomain)
      localBlocks[b] = new double array[makeBlockDomain(fullDomain, b)];

    // Everyone tells everyone else about their blocks
    double array<2> array<2> array<1> allBlocks = 
      new double array<2> array<2> array[Ti.myTeam().domain];
    //exchange(allBlocks, localBlocks);

    // Initialise the array of all blocks
    foreach (t within allBlocks.domain)
      foreach (b within allBlocks[t].domain)
        blocks[b] = allBlocks[t][b];
  }

  // Return a potentially remote pointer to the block at base
  public double array<2> block(Point<2> base)
  {
    return blocks[base];
  }

  // If the block at base is remote, copy it into local and return local
  // Otherwise return a reference to the local block at base
  public double array<2> fetchBlock(Point<2> base, double array<2> local)
  {
    if (localBlockDomain.contains(base)) return localBlocks[base];

    // Make local look like block, then copy block there
    double array<2> block = blocks[base];
    local = local.translate(block.domain.min() - local.domain.min()).restrict(block.domain);
    // Assumes that copy copies into 'local' here (check)
    local.copy(block, local.domain.asDomain());

    return local;
  }

  // Update the block at base from newBlock. newBlock should be the
  // result of fetchBlock (so it will be an actual reference to the
  // block if the block was local)
  public void updateBlock(Point<2> base, double array<2> newBlock)
  {
    if (!localBlockDomain.contains(base))
      // check direction of copy...
      blocks[base].copy(newBlock, newBlock.domain.asDomain());
  }

  // Return various useful domains
  public RectDomain<2> domain() { return fullDomain; }
  public RectDomain<2> blockDomain() { return blockDomain; }
  public RectDomain<2> localDomain() { return localDomain; }
  public RectDomain<2> localBlockDomain() { return localBlockDomain; }
  public RectDomain<2> singleBlockDomain()
  {
    // no stride...
    return [ [0, 0] : [blksize - 1, blksize - 1] ];
  }
}