// Titanium micro-benchmarker
// Dan Bonachea
// 10/2/2001

// Target object for the latency tests. The tests below read and write the
// byte, short, int and long fields (1, 2, 4 and 8 bytes) through a reference
// to the Box contributed by the peer process; the float and double fields are
// not touched by any test in this file.
class Box {
  public byte byteval;
  public short shortval;
  public int intval;
  public long longval;
  public float floatval;
  public double doubleval;

  public Box() {}
}

// Pairwise latency/bandwidth benchmark. Each process is assigned a peer
// (peerid); only processes with activeproc set run the timed loops and print
// results, while every process takes part in the barriers and exchanges.
public class perf {
  // index of the process whose Box / array this process reads and writes
  public int peerid;
  // true if this process runs the timed loops and reports results
  public boolean activeproc;

  // print s to stdout, prefixed with "P<this process>-P<peer process>: "
  public void report(String s) {
    System.out.println("P"+Ti.thisProc()+"-P" + peerid +": " + s);
  }

  // used to keep reads from being loop optimized away
  public static void donothing(byte b) {}
  public static void donothing(short s) {}
  public static void donothing(int i) {}
  public static void donothing(long l) {}

  // Number of copies to time for a chunk of sz bytes: enough to move
  // MBpertest megabytes in total, capped at maxiters. The byte total is
  // computed in long arithmetic because MBpertest * 1048576 no longer fits
  // in an int once MBpertest reaches 2048.
  private static int itersForChunk(int MBpertest, int sz, int maxiters) {
    long wanted = ((long)MBpertest * 1048576L) / sz;
    return (wanted > maxiters) ? maxiters : (int)wanted;
  }

  // Parse args[index] as an int. Returns dflt when the argument is absent or
  // is not a valid integer; in the latter case process 0 (the process whose
  // values main broadcasts) prints a warning instead of failing silently.
  private static int parseArg(String [] args, int index, int dflt) {
    if (args.length > index) {
      try {
        return Integer.parseInt(args[index]);
      } catch (NumberFormatException exn) {
        if (Ti.thisProc() == 0)
          System.err.println("perf: ignoring non-numeric argument \""
                             + args[index] + "\", using " + dflt);
      }
    }
    return dflt;
  }

  // run a read latency test of iters iterations and report microsecond latency
  // Every process passes a new Box to exchange and then picks the entry at
  // index peerid. For each field width the timer is reset, all processes meet
  // at a barrier, and active processes time iters reads of that field.
  public sglobal void readlatency(int iters) {
    Timer t = new Timer();
    Box [1d] single allBoxes = new Box[0:Ti.numProcs()-1];
    allBoxes.exchange(new Box());
    Box peerbox = allBoxes[peerid];

    // 1-byte reads
    t.reset();
    Ti.barrier();
    if (activeproc) {
      t.start();
      for (int i = 0; i < iters; i++) {
        donothing(peerbox.byteval);
      }
      t.stop();
      report("read latency (1 byte) = " + (t.micros()/iters) + " us");
    }

    // 2-byte reads
    t.reset();
    Ti.barrier();
    if (activeproc) {
      t.start();
      for (int i = 0; i < iters; i++) {
        donothing(peerbox.shortval);
      }
      t.stop();
      report("read latency (2 byte) = " + (t.micros()/iters) + " us");
    }

    // 4-byte reads
    t.reset();
    Ti.barrier();
    if (activeproc) {
      t.start();
      for (int i = 0; i < iters; i++) {
        donothing(peerbox.intval);
      }
      t.stop();
      report("read latency (4 byte) = " + (t.micros()/iters) + " us");
    }

    // 8-byte reads
    t.reset();
    Ti.barrier();
    if (activeproc) {
      t.start();
      for (int i = 0; i < iters; i++) {
        donothing(peerbox.longval);
      }
      t.stop();
      report("read latency (8 byte) = " + (t.micros()/iters) + " us");
    }
  }

  // run a write latency test of iters iterations and report microsecond latency
  // Same structure as readlatency, but each timed loop stores the loop counter
  // (narrowed to the field's type) into the peer's Box.
  public sglobal void writelatency(int iters) {
    Timer t = new Timer();
    Box [1d] single allBoxes = new Box[0:Ti.numProcs()-1];
    allBoxes.exchange(new Box());
    Box peerbox = allBoxes[peerid];

    // 1-byte writes
    t.reset();
    Ti.barrier();
    if (activeproc) {
      t.start();
      for (int i = 0; i < iters; i++) {
        peerbox.byteval = (byte)i;
      }
      t.stop();
      report("write latency (1 byte) = " + (t.micros()/iters) + " us");
    }

    // 2-byte writes
    t.reset();
    Ti.barrier();
    if (activeproc) {
      t.start();
      for (int i = 0; i < iters; i++) {
        peerbox.shortval = (short)i;
      }
      t.stop();
      report("write latency (2 byte) = " + (t.micros()/iters) + " us");
    }

    // 4-byte writes
    t.reset();
    Ti.barrier();
    if (activeproc) {
      t.start();
      for (int i = 0; i < iters; i++) {
        peerbox.intval = (int)i;
      }
      t.stop();
      report("write latency (4 byte) = " + (t.micros()/iters) + " us");
    }

    // 8-byte writes
    t.reset();
    Ti.barrier();
    if (activeproc) {
      t.start();
      for (int i = 0; i < iters; i++) {
        peerbox.longval = (long)i;
      }
      t.stop();
      report("write latency (8 byte) = " + (t.micros()/iters) + " us");
    }
  }

  // run a read bandwidth test and report bandwidth
  // Chunk sizes double from 1 byte up to maxchunkMB megabytes. For each size,
  // every process allocates a byte array of that size and exchanges it; active
  // processes then time repeated copies of the peer's array into their own.
  //   MBpertest  - total megabytes to move per chunk size (before capping)
  //   maxchunkMB - largest chunk size, in megabytes
  //   maxiters   - upper bound on the number of copies per chunk size
  // NOTE(review): for maxchunkMB >= 1024 the doubling of sz appears to wrap
  // past the int range before the loop condition fails - confirm and guard.
  public sglobal void readbandwidth(int MBpertest, int single maxchunkMB, int maxiters) {
    Timer t = new Timer();
    for (int single sz = 1; sz <= 1048576*maxchunkMB; sz*=2) {
      byte [1d] single [1d] allArrs = new byte[0:Ti.numProcs()-1][1d];
      byte [1d] myArr = new byte[1:sz];
      int iters = itersForChunk(MBpertest, sz, maxiters);
      allArrs.exchange(myArr);
      byte [1d] peerArr = allArrs[peerid];

      t.reset();
      Ti.barrier();
      if (activeproc) {
        t.start();
        for (int i = 0; i < iters; i++) {
          myArr.copy(peerArr);
        }
        t.stop();
        // megabytes moved = copies * bytes per copy / bytes per MB
        double totalMB = ((double)iters) * sz / 1048576.0;
        report("read bandwidth ("+sz+" byte chunks) = " + (totalMB/t.secs()) + " MB/sec");
      }
    }
  }

  // run a write bandwidth test and report bandwidth
  // Same structure and parameters as readbandwidth, but the timed loop copies
  // this process's array into the peer's array.
  // NOTE(review): for maxchunkMB >= 1024 the doubling of sz appears to wrap
  // past the int range before the loop condition fails - confirm and guard.
  public sglobal void writebandwidth(int MBpertest, int single maxchunkMB, int maxiters) {
    Timer t = new Timer();
    for (int single sz = 1; sz <= 1048576*maxchunkMB; sz*=2) {
      byte [1d] single [1d] allArrs = new byte[0:Ti.numProcs()-1][1d];
      byte [1d] myArr = new byte[1:sz];
      int iters = itersForChunk(MBpertest, sz, maxiters);
      allArrs.exchange(myArr);
      byte [1d] peerArr = allArrs[peerid];

      t.reset();
      Ti.barrier();
      if (activeproc) {
        t.start();
        for (int i = 0; i < iters; i++) {
          peerArr.copy(myArr);
        }
        t.stop();
        // megabytes moved = copies * bytes per copy / bytes per MB
        double totalMB = ((double)iters) * sz / 1048576.0;
        report("write bandwidth ("+sz+" byte chunks) = " + (totalMB/t.secs()) + " MB/sec");
      }
    }
  }

  // usage: perf [numiterations] [maxMBperiteration] [maxMBchunk]
  // Defaults are 10000 iterations, 10 MB per test and 1 MB maximum chunk.
  // Every process parses its own arguments, but the values actually used are
  // the ones broadcast from process 0.
  public static void main(String [] args) {
    int iters;
    int single maxMB;
    int single maxchunkMB;
    {
      int iters0 = parseArg(args, 0, 10000);
      int maxMB0 = parseArg(args, 1, 10);
      int maxchunkMB0 = parseArg(args, 2, 1);
      iters = broadcast iters0 from 0;
      maxMB = broadcast maxMB0 from 0;
      maxchunkMB = broadcast maxchunkMB0 from 0;
    }

    perf single p = new perf();
    // each process targets its successor (wrapping around); even-numbered
    // processes are the ones that run the timed loops
    p.peerid = ( Ti.thisProc() + 1 ) % Ti.numProcs();
    p.activeproc = (Ti.thisProc() % 2 == 0);

    if (Ti.thisProc() == 0)
      System.out.println("Running latency tests (" + iters + " iterations)");
    p.readlatency(iters);
    p.writelatency(iters);

    if (Ti.thisProc() == 0)
      System.out.println("Running bandwidth test (iterations at each chunk size: MAX("+iters+" iterations, "+maxMB+" MB))");
    p.readbandwidth(maxMB, maxchunkMB, iters);
    p.writebandwidth(maxMB, maxchunkMB, iters);
  }
}