public class MM { // Unblocked matrix multiply: C = C + A * B // Using border, this only requires that A, B, C have the same size and strides void MMunblocked(double array<2> C, double array<2> A, double array<2> B) { RectDomain<2> d = C.domain; Point<2> upperleft = d.min(); RectDomain<2> Arow0 = A.domain.border(1, -1, 0); RectDomain<2> Bcolumn0 = B.domain.border(1, -2, 0); foreach (c within d) { double sum = C[c]; // Do the dot product of a row of A with a column of B foreach (a within Arow0 + Point<2>.direction(0, c[0] - upperleft[0]), b within Bcolumn0 + Point<2>.direction(1, c[1] - upperleft[1])) sum += A[a] * B[b]; C[c] = sum; } } // Parallel blocked matrix multipy: C = A * B void all_MM(ParallelMatrix C, ParallelMatrix A, ParallelMatrix B) { RectDomain<2> d = C.domain(); Point<2> upperleft = d.min(); RectDomain<2> Arow0 = A.blockDomain().border(1, -1, 0); RectDomain<2> Bcolumn0 = B.blockDomain().border(1, -2, 0); double array<2> blk1, blk2; RectDomain<2> blkDomain = C.singleBlockDomain(); // Storage for local copies of remote blocks blk1 = new double array[blkDomain]; blk2 = new double array[blkDomain]; foreach (c within C.localBlockDomain()) { double array<2> Cblock = C.fetchBlock(c, null); foreach (cblk within Cblock.domain) Cblock[cblk] = 0; // Sum the product of a row of blocks of A with a column of blocks of B // (remote blocks are copied locally first) foreach (a within Arow0 + Point<2>.direction(0, c[0] - upperleft[0]), b within Bcolumn0 + Point<2>.direction(1, c[1] - upperleft[1])) MMunblocked(Cblock, A.fetchBlock(a, blk1), B.fetchBlock(b, blk2)); } } } // A blocked, parallel matrix. Uses a blocked row distribution, but that // is easily changed. public final class ParallelMatrix { final public int blksize = 16; final Point<2> blksizep = Point<2>.all(blksize); private int n, m; private RectDomain<2> fullDomain, blockDomain, localDomain, localBlockDomain; private double array<2> array<2> blocks, localBlocks; static private int min(int i1, int i2) { return i1 < i2 ? i1 : i2; } static private int max(int i1, int i2) { return i1 > i2 ? i1 : i2; } // Return the point formed by component-wise max of each component static private Point<2> limit(Point<2> p1, Point<2> p2) { return [ max(p1[0], p2[0]), max(p1[1], p2[1]) ]; } // Return the domain for the block of d that starts at base static private RectDomain<2> makeBlockDomain(RectDomain<2> d, Point<2> base) { Point<2> stride = d.stride(); return [ base : limit(d.max(), base + stride * (blksize - 1)) : stride ]; } // Allocate the n x m parallel array public void all_allocate(int _n, int _m) { int array<1> team = Ti.myTeam(); // Should probably be a Ti.nprocs() method int nprocs = team.domain.max()[0] - team.domain.min()[0] + 1; n = _n; m = _m; // Compute the global domains fullDomain = [ [0, 0] : [n - 1, m - 1] ]; blockDomain = (fullDomain / blksizep) * blksizep; blocks = new double array<2> array[blockDomain]; // Divide the blocks amongst processes in a blocked-row distribution int nblocks = (n + blksize - 1) / blksize; int localnblocks = nblocks / nprocs; int missing = nblocks * - localnblocks * nprocs; int firstlocal; if (Ti.thisProc() < missing) { localnblocks++; firstlocal = Ti.thisProc() * localnblocks; } else firstlocal = Ti.thisProc() * localnblocks + missing; // And compute the local domains localDomain = [ [ firstlocal * blksize, 0] : [ min((firstlocal + localnblocks) * blksize, n - 1), m ] ]; localBlockDomain = (localDomain / blksizep) * blksizep; localBlocks = new double array<2> array[localBlockDomain]; // Allocate the local blocks. For a blocked row domain, we could // allocate one local block instead, and construct the blocks // either here, or on the fly in block/fetchBlock. This code is // left intentionally simple and general. foreach (b within localBlockDomain) localBlocks[b] = new double array[makeBlockDomain(fullDomain, b)]; // Everyone tells everyone else about their blocks double array<2> array<2> array<1> allBlocks = new double array<2> array<2> array[Ti.myTeam().domain]; //exchange(allBlocks, localBlocks); // Initialise the array of all blocks foreach (t within allBlocks.domain) foreach (b within allBlocks[t].domain) blocks[b] = allBlocks[t][b]; } // Return a potentially remote pointer to the block at base public double array<2> block(Point<2> base) { return blocks[base]; } // If the block at base is remote, copy it into local and return local // Otherwise return a reference to the local block at base public double array<2> fetchBlock(Point<2> base, double array<2> local) { if (localBlockDomain.contains(base)) return localBlocks[base]; // Make local look like block, then copy block there double array<2> block = blocks[base]; local = local.translate(block.domain.min() - local.domain.min()).restrict(block.domain); // Assumes that copy copies into 'local' here (check) local.copy(block, local.domain.asDomain()); return local; } // Update the block at base from newBlock. newBlock should be the // result of fetchBlock (so it will be an actual reference to the // block if the block was local) public void updateBlock(Point<2> base, double array<2> newBlock) { if (!localBlockDomain.contains(base)) // check direction of copy... blocks[base].copy(newBlock, newBlock.domain.asDomain()); } // Return various useful domains public RectDomain<2> domain() { return fullDomain; } public RectDomain<2> blockDomain() { return blockDomain; } public RectDomain<2> localDomain() { return localDomain; } public RectDomain<2> localBlockDomain() { return localBlockDomain; } public RectDomain<2> singleBlockDomain() { // no stride... return [ [0, 0] : [blksize - 1, blksize - 1] ]; } }