/* Storage optimizations */ #include "stoptifu.h" #include "free_map.h" #include "l2s.h" // #undef DEBUG_STORAGE // #define DEBUG_STORAGE 2 #ifndef DEBUG_STORAGE #define DEBUG_STORAGE DEBUG_AC #endif /* Macros for debugging output */ #undef DBG #undef DBG2 #undef DPRINT #undef DPRINT2 #undef DPRINTLN #undef DPRINTLN2 #define DPRINT(x) do { if (DEBUG_STORAGE) cout << (x); } while (0) #define DPRINTLN(x) DPRINT(string(x) + '\n') #define DBG(x) do { if (DEBUG_STORAGE) { x; } } while (0) #define DPRINT2(x) do { if (DEBUG_STORAGE >= 2) cout << (x); } while (0) #define DPRINTLN2(x) DPRINT2(string(x) + '\n') #define DBG2(x) do { if (DEBUG_STORAGE >= 2) { x; } } while (0) #undef XBG #undef XBG2 #undef XPRINT #undef XPRINTLN #define XBG(x) do { if (1) { x; } } while (0) #define XBG2(x) do { if (1) { x; } } while (0) #define XPRINT(x) do { if (1) cout << (x); } while (0) #define XPRINTLN(x) XPRINT(string(x) + '\n') #ifndef MAXINT #define MAXINT (0x7fffffff) #endif #ifndef MININT #define MININT (-1 - MAXINT) #endif /* which opts to do (set by args to storage_opt()) */ bool enable_storage_opt_schedule_read, enable_storage_opt_read_from_reg, enable_storage_opt_usib, enable_storage_opt_ts; /**************************************************************************** * utils ****************************************************************************/ static map access_to_smap_element_memo; smap_element *access_to_smap_element(int access, smap &s) { smap_element *&result = access_to_smap_element_memo[access]; if (result != NULL) return result; for (smap::iterator i = s.begin(); i != s.end(); ++i) foreach (l, llist, *((*i).second)) if ((*l)->access == access) return (result = *l); fatal("error in access_to_smap_element(int access, smap &s)"); return 0; } static string (*arrayname_fn)(int); string arrayname(int array) { string s; return ((arrayname_fn == NULL || (s = ((*arrayname_fn)(array))) == "") ? "an array" : s); } static map string_to_ACvar_map; static inline ACvar *get_ACvar(const string &s, Ty *t, Block *b) { ACvar *&result = string_to_ACvar_map[s]; if (result == NULL) result = b->temp(s, t); return result; } /* x is a Relation from f's iteration space to array indices. Compute a Relation that maps from f's megatiled space to array indices. */ Relation relation_from_tilespace_to_arrayindex(Relation x, const Foreach *f, const intvector *v) { DPRINT("remap(f=" + f->source_code_location + "): " + relation_to_str(x) + " -> "); int n = f->arity(); Relation r(n, n); F_And *root = r.add_and(); for (int i = 0; i < n; i++) { EQ_Handle eqh = root->add_EQ(); eqh.update_coef(r.output_var(i + 1), -1); eqh.update_const((*v)[i]); for (int j = 0; j < n; j++) eqh.update_coef(r.input_var(j + 1), (*(*f->steps)[j])[i]); } r = Join(r, copy(x)); DPRINTLN(relation_to_str(r)); return r; } static map > tilespace_to_arrayindex_memo; /* Memoized version of the above. Memo is on e and k, even though k is not used to calculate the answer. */ template Relation relation_from_tilespace_to_arrayindex(T *e, int k, const Foreach *f, const intvector *v) { map &m = tilespace_to_arrayindex_memo[(void *) e]; map::iterator i = m.find(k); if (i != m.end()) return (*i).second; else return (m[k] = relation_from_tilespace_to_arrayindex(e->R, f, v)); } /* Helper for optimize_read(). The argument r is a map from tile space to an array index for the read we're trying to schedule, in the context specified by the other arguments. See below for explanation of args. Returns true if it performs any optimization. */ /* currently does nothing */ static bool schedule_read(const smap_element *e, smap &s, rwmap &w, Megatile *m, int k, const Foreach **ff, const intvector **vv, int size, const Relation r) { if (enable_storage_opt_schedule_read) { DBG({ string r0 = singleton_set_to_string(eval_relation_at_zero(r)); cout << "schedule_read of arr " << e->array << " in, e.g., tile (0, " << k << ") @ " << r0 << endl; }); } return false; } /**************************************************************************** * read-related storage opts/utils ****************************************************************************/ /* The read described by e reads an array elt specified by r, a map from tile space to an array index. For each element of l, compute a map, w, from tile space (offset by -tile_offset in the last dimension) to array index space. Return the first element of l for which w and r map are the same map, i.e., a write that always writes the same array location that is read in e OR a read that always reads the same array location that is read in e. And set prevr to that map. If none found, return NULL. */ static smap_element *smap_find_elt(llist *l, const Foreach *f, const intvector *v, int k, int tile_offset, const smap_element *e, Relation r, Relation &prevr) { #ifdef DEBUG_STORAGE string r0; if (DEBUG_STORAGE) r0 = singleton_set_to_string(eval_relation_at_zero(r)); #endif smap_element *result = NULL; #define FOUND(x) do { result = (x); goto done; } while (0) int IS_arity = r.n_inp(); intvector *tile_offset_v = NULL; while (l != NULL) { smap_element *x = l->front(); if (e->must_be_same_array(x)) { Relation w = relation_from_tilespace_to_arrayindex(x, k, f, v); prevr = copy(w); if (tile_offset != 0) { if (tile_offset_v == NULL) tile_offset_v = unit_vector(IS_arity - 1, IS_arity)-> destructive_product(-tile_offset); w = translate_domain(copy(w), tile_offset_v); } if (same_relation(w, r)) FOUND(x); else DBG2({ cout << "read at, e.g., " << r0 << " doesn't match " << (x->is_read() ? "read" : "write") << " at, e.g., " << singleton_set_to_string(eval_relation_at_zero(w)) << endl; }); } else DBG({ cout << "Not necessarily same array: " << x->to_string() << endl; }); l = l->tail(); } done: delete tile_offset_v; return result; #undef FOUND } /* The read specified by e reads an array specified by r, a map from tile space to an array index. For each element of l, compute a map, w, from tile space (offset by -tile_offset in the last dimension) to array index space (offset by v). Return true if there exists a tile t s.t. w(t) and r(t) intersect and they may be accessing the same array. */ static bool rwmap_find_write(llist *l, const Foreach *f, const intvector *v, int k, int tile_offset, const smap_element *e, Relation r) { bool result = false; #define YES do { result = true; goto done; } while (0) int IS_arity = r.n_inp(); intvector *tile_offset_v = NULL; while (l != NULL) { rwmap_element *x = l->front(); if (x->is_write()) { if (e->must_be_same_array(x)) { Relation w = relation_from_tilespace_to_arrayindex(x, k, f, v); if (tile_offset != 0) { if (tile_offset_v == NULL) tile_offset_v = unit_vector(IS_arity - 1, IS_arity)-> destructive_product(-tile_offset); w = translate_domain(copy(w), tile_offset_v); } if (exists_range_overlap(w, r)) YES; } else if (e->might_be_same_array(x)) YES; } l = l->tail(); } done: delete tile_offset_v; return result; #undef YES } /* Helper for optimize_read(). The argument r is a map from tile space to an array index for the read we're trying to optimize, in the context specified by the other arguments. See below for explanation of args. Returns true if it performs any optimization. */ static bool read_from_reg(const smap_element *e, smap &s, rwmap &w, Megatile *m, int k, const Foreach **ff, const intvector **vv, int size, const Relation r) { if (enable_storage_opt_read_from_reg) { DPRINTLN("read_from_reg(k=" + i2s(k) + ")"); int max_dist; // try this many steps prior to step k (must not exceed size) max_dist = size; for (int cur_dist = 1; cur_dist <= max_dist; cur_dist++) { int try_k = k - cur_dist; bool prev_tile = (try_k < 0); if (prev_tile) try_k += size; DBG(cout << "check k=" << try_k << ", prev_tile=" << prev_tile << endl); assert(try_k >= 0 && try_k < size); Foreach *f = const_cast(ff[try_k]); // f is not modified const intvector *v = vv[try_k]; Relation prevr; smap_element *prev = smap_find_elt(s[f], f, v, try_k, (prev_tile ? -1 : 0), e, r, prevr); if (prev != NULL) { DBG({ string r0 = singleton_set_to_string(eval_relation_at_zero(r)); cout << "Read of arr " << e->array << " in, e.g., tile (0, " << k << ") @ " << r0 << " can instead use\n value from tile (" << (prev_tile ? -1 : 0) << ", " << try_k << ")" << endl; }); m->optimize_read_from_reg(e, r, prev, prevr, k, cur_dist); return true; } if (rwmap_find_write(w[f], f, v, try_k, (prev_tile ? -1 : 0), e, r)) break; } } return false; } /* optimize_read(): Returns true if any optimization or scheduling was done. E is the read to try to optimize/schedule S specifies optimizable reads and writes W specifies the writes that may conflict with this read (includes all of S) M is the megatile we're optimizing K is which step of m we're optimizing FF is an array of Foreaches; ff[i] is the Foreach used at step i of m VV is an array of vectors; vv[i] is the offset used at step i of m SIZE is the number of steps in m. */ static bool optimize_read(smap_element *e, smap &s, rwmap &w, Megatile *m, int k, const Foreach **ff, const intvector **vv, int size) { DPRINTLN("optimize_read(" + e->to_string() + ")"); Relation r = relation_from_tilespace_to_arrayindex(e, k, ff[k], vv[k]); DBG({ cout << "offset for k=" << k << " is " << vv[k]->to_string() << endl; printrel(r, "map from tile space to array index:"); }); return read_from_reg(e, s, w, m, k, ff, vv, size, r) || schedule_read(e, s, w, m, k, ff, vv, size, r); } /* The next few routines are for filtering the read_from_reg candidates. */ /* Helper for Megatile::filter_read_from_reg(). If o is suitable then bump all steps for which its associated variable is live and set pick to o and return true. Otherwise return false. o is suitable if doing that bump would result in step_to_num_live[] having no element greater than max_live. */ static bool suitable(int write_k, int size, int max_live, opt_read *& pick, map &step_to_num_live, opt_read *o) { int read_k = write_k + o->delta_k; DPRINT("suitable(write_k=" + i2s(write_k) + ", read_k=" + i2s(read_k) + ", ...): "); /* do the bump */ if (read_k >= size) { bump_all_in_range(step_to_num_live, 0, read_k - size); bump_all_in_range(step_to_num_live, write_k, size - 1); } else bump_all_in_range(step_to_num_live, write_k, read_k); if (max_second(step_to_num_live) > max_live) { DPRINTLN("no (" + i2s(max_second(step_to_num_live)) + " > " + i2s(max_live) + ")"); /* undo the bump */ if (read_k >= size) { bump_all_in_range(step_to_num_live, 0, read_k - size, -1); bump_all_in_range(step_to_num_live, write_k, size - 1, -1); } else bump_all_in_range(step_to_num_live, write_k, read_k, -1); return false; } DPRINTLN("yes"); pick = o; return true; } /* Helper for Megatile::filter_read_from_reg(). */ static bool find_suitable(int write_k, int size, int max_live, opt_read *& pick, set &use_opt, map &step_to_num_live, llist *l) { while (l != NULL) if (use_opt.find(l->front()) == use_opt.end() && suitable(write_k, size, max_live, pick, step_to_num_live, l->front())) return true; else l = l->tail(); return false; } /* Helper for Megatile::filter_read_from_reg(). */ static void select_rfrs(int max_live, int size, set &use_opt, map *> &write_k_to_opt, map &step_to_num_live) { while (true) { int k = 0; /* for (; k < size; k++) if (step_to_num_live[k] <= max_live) break; if (k == size) { DPRINTLN("select_rfrs: all steps have num live > " + i2s(max_live)); return; } */ int start = k; /* search for a write that starts at or after this step */ DPRINTLN("select_rfrs: start=" + i2s(start)); opt_read *pick = NULL; int i = 0; int *a = new int [size]; // ordered by best k to worst for (int seeking = 0; seeking < max_live && i < size; seeking++) for (k = 0; k < size; k++) if (step_to_num_live[k] == seeking) { DPRINTLN("a[" + i2s(i) + "]=" + i2s(k) + ", with num_live=" + i2s(seeking)); a[i++] = k; } int asize = i; i = 0; while (i < asize && !find_suitable(a[i], size, max_live, pick, use_opt, step_to_num_live, write_k_to_opt[a[i]])) ++i; delete[] a; if (i == asize) { DPRINTLN("select_rfrs: nothing suitable at or after step " + i2s(start)); return; } assert(pick != NULL); use_opt.insert(pick); } } void Megatile::filter_read_from_reg() { int s = size(), max_live_if_no_filter, total = 0; map step_to_num_live; for (int k = 0; k < s; k++) for (llist *l = array_read_elisions[k]; l != NULL; l = l->tail()) { ++total; opt_read *f = l->front(); int write_k = k - f->delta_k; if (write_k < 0) { bump_all_in_range(step_to_num_live, 0, k); bump_all_in_range(step_to_num_live, write_k + s, s - 1); } else bump_all_in_range(step_to_num_live, write_k, k); } if (total == 0) return; max_live_if_no_filter = max_second(step_to_num_live); int max_live = AC::rr_aggressiveness; if (max_live < 0) max_live = get_parameter_in_range("Aggressiveness in avoiding re-reading array values from memory", "At any program point, the number of live variables that are used to avoiding re-reading array values from memory will not exceed this number.", 0, max_live_if_no_filter, 1); if (max_live >= max_live_if_no_filter) return; step_to_num_live.clear(); set use_opt; map *> write_k_to_opt; if (max_live > 0) { for (int k = 0; k < s; k++) for (llist *l = array_read_elisions[k]; l != NULL; l = l->tail()) { opt_read *f = l->front(); int write_k = k - f->delta_k; if (write_k < 0) write_k += s; push(write_k_to_opt[write_k], f); } for (int i = 0; i < max_live; ) select_rfrs(++i, s, use_opt, write_k_to_opt, step_to_num_live); } /* Remove opt_reads that aren't pointed to by a member of use_opt. */ for (int k = 0; k < s; k++) { llist *x = NULL; for (llist *l = array_read_elisions[k]; l != NULL; l = l->tail()) if (use_opt.find(l->front()) != use_opt.end()) push(x, l->front()); #if 0 else free(l->front()); #endif ::free_all(array_read_elisions[k]); array_read_elisions[k] = x; } } /* Each element of l contains a Relation that maps from f's iteration space to array indices. Replace it with a Relation that maps from m's tiled space to array indices. */ /* unused */ #if 0 static void remap_smap(llist *l, Foreach *f) { DPRINTLN("remap_smap(" + f->source_code_location + ")"); int n = f->arity(); while (l != NULL) { smap_element *e = l->front(); DPRINT("remap smap elt " + e->to_string() + " -> "); Relation r(n, n); F_And *root = r.add_and(); for (int i = 0; i < n; i++) { EQ_Handle eqh = root->add_EQ(); eqh.update_coef(r.output_var(i + 1), -1); for (int j = 0; j < n; j++) eqh.update_coef(r.input_var(j + 1), (*(*f->steps)[j])[i]); } e->R = Join(r, e->R); DPRINTLN(e->to_string()); l = l->tail(); } } #endif /**************************************************************************** * set_shrink_mods() and related utils ****************************************************************************/ static map< int, map< map< intpair, st_source * > *, map * > > memo_relk0; static map< int, map< map< smap_at_k, string > *, map * > > memo_relk1; static map< int, map< map< smap_at_k, intvector * > *, map * > > memo_relk2; ACvar *backup_by_n(const string &s, size_t n, Ty *t, Block *b) { return get_ACvar(s + string(n, 'p'), t, b); } /* If maxdist is v, then how many temps do we need? */ int maxdist_count(const intvector *v) { const int y = v->find_non_zero(); return y < 0 ? 1 : (1 - (*v)[y]); } int maxdist_to_array_dim(const intvector *v) { const int y = v->find_non_zero(); assert(y >= 0); return ((int) v->size()) - 1 - y; } /* Create an array index for a temp array. l specifies the point in tile space that must be converted to a 1D array index. Does not modify l. */ static string create_opt_array_index(llist *l, int loop_arity) { extern ACvar *temp_array_size(int); string result; int array_arity = (int) l->size(); DPRINT("create_opt_array_index(" + l2s(l) + ", "); result = l->front(); for (int i = loop_arity - 1; --i >= loop_arity - array_arity; ) { l = l->tail(); result += " + ((" + l->front() + ") * " + temp_array_size(i + 1)->to_string() + ")"; } DPRINTLN(i2s(loop_arity) + "): " + result); return result; } /* access is what we're optimizing; src is the writer. */ void UpdateAndExecute:: shrink_opt_read(const int access, const st_source *src, map< smap_at_k, string > *writer_to_name_all_k, map< smap_at_k, intvector * > *writer_to_maxdist_all_k, ACvar **ti, bool parallel) { const smap_at_k w(src->k, src->e); const string &name = (*writer_to_name_all_k)[w]; const intvector *maxdist = (*writer_to_maxdist_all_k)[w]; const intvector *dist = src->v; /* dist for this read */ int arity = maxdist->size(), y = maxdist->find_non_zero(); ACvar *readvar; if (y < 0 || y == arity - 1) { /* scalar case */ readvar = backup_by_n(name, y < 0 ? 0 : -(*dist)[y]); _replace_read[access] = rpair(readvar, ""); DPRINTLN("shrink_opt read (" + i2s(access) + ") in " + (y < 0 ? "tile" : "bundle") + " to " + readvar->to_string()); } else { /* array case */ readvar = backup_by_n(name, (parallel || y < 0) ? 0 : -(*dist)[y]); llist *l = NULL; if (parallel) { y = -1; arity = f->arity(); } while (++y < arity) { int offset; string s = ti[y]->to_string(); if ((offset = (*dist)[y]) > 0) s += " + " + i2s(offset); else if (offset < 0) s += " - " + i2s(-offset); push(l, s); // cout << "y=" << y << ": " << l->front() << endl; } _replace_read[access] = rpair(readvar, create_opt_array_index(l, arity)); ::free_all(l); DPRINTLN("shrink_opt read (" + i2s(access) + ") in bundle to " + readvar->to_string() + '[' + _replace_read[access].second + ']'); } } void UpdateAndExecute:: shrink_opt_write(const smap_element *write, const string &name, const intvector *maxdist, ACvar **ti, bool parallel) { const int access = write->access; int arity = maxdist->size(), y = maxdist->find_non_zero(); ACvar *writevar = backup_by_n(name, 0); if (y < 0 || y == arity - 1) { _replace_write[access] = rpair(writevar, ""); DPRINTLN("shrink_opt write (" + i2s(access) + ") in " + (y < 0 ? "tile" : "bundle") + " to " + writevar->to_string()); } else { llist *l = NULL; if (parallel) { y = -1; arity = f->arity(); } while (++y < arity) { push(l, ti[y]->to_string()); // cout << "y=" << y << ": " << l->front() << endl; } _replace_write[access] = rpair(writevar, create_opt_array_index(l, arity)); DPRINTLN("shrink_opt write (" + i2s(access) + ") in bundle to " + writevar->to_string() + '[' + _replace_write[access].second + ']'); } } /* Append to b statements that are required by ts when moving from one tile in a bundle to the next. */ void Megatile::ts_next_tile_within_bundle(Block *b) { DPRINTLN("ts_next_tile_within_bundle()"); map< smap_at_k, string >::const_iterator i; for (i = writer_to_name.begin(); i != writer_to_name.end(); ++i) { const smap_at_k &writer = (*i).first; const string &name = (*i).second; const intvector *v = writer_to_maxdist[writer]; DBG(cout << " Write " << writer.second->to_string() << ",k=" << writer.first << " uses name " << name << " " << v->to_string() << endl); const int y = v->find_non_zero(), arity = v->size(); if (y < 0 || y == arity - 1) for (int n = maxdist_count(v) - 1; --n >= 0; ) b->append(new Assign(backup_by_n(name, n + 1), backup_by_n(name, n))); } } UpdateAndExecute *UpdateAndExecute:: set_shrink_mods(int k, map< intpair, st_source * > *reader_to_writer_all_k, map< smap_at_k, string > *writer_to_name_all_k, map< smap_at_k, intvector * > *writer_to_maxdist_all_k, ACvar **ti, bool parallel) { assert(!shrink_mods_are_set); shrink_mods_are_set = true; if (enable_storage_opt_ts) { /* Maps supplied as args contain data for all values of k. Select out the parts that are relevant for the given k. */ DPRINTLN("set_shrink_mods(k=" + i2s(k) + ", par=" + b2s(parallel)+ ")"); { map< intpair, st_source * >::const_iterator i; for (i = reader_to_writer_all_k->begin(); i != reader_to_writer_all_k->end(); ++i) { const intpair &read = (*i).first; int acc = read.second; const st_source *write = (*i).second; if (read.first == k) shrink_opt_read(acc, write, writer_to_name_all_k, writer_to_maxdist_all_k, ti, parallel); } } map< smap_at_k, string >::const_iterator i; for (i = writer_to_name_all_k->begin(); i != writer_to_name_all_k->end(); ++i) { const smap_at_k &writer = (*i).first; const string &name = (*i).second; const intvector *maxdist = (*writer_to_maxdist_all_k)[writer]; if (writer.first == k) shrink_opt_write(writer.second, name, maxdist, ti, parallel); } } return this; } /**************************************************************************** * analysis related to shrinking temp arrays ****************************************************************************/ /* Suppose the write specified by e, taking place as step k of some tile T in tilespace, writes to array_elt. (er is the relation from tilespace to array indices for e when executed in step k.) Is it possible that such a T could occur before or at position cutoff_TS_position (step cutoff_k) in tilespace? */ static bool could_touch_before_or_at(rwmap_element *e, int k, Relation er, Relation cutoff_TS_position, int cutoff_k, Relation array_elt) { Relation T = Join(copy(array_elt), Inverse(copy(er))); Relation pre = preceding(cutoff_TS_position, (cutoff_k >= k)); DBG({ printrel(T, "could_touch_before_or_at(): T:"); printrel(pre, "pre:"); printrel(cutoff_TS_position, "cutoff:"); }); return Intersection(T, pre).is_lower_bound_satisfiable(); } /* Analogous to could_touch_before_or_at(), above. */ static bool could_touch_after_or_at(rwmap_element *e, int k, Relation er, Relation cutoff_TS_position, int cutoff_k, Relation array_elt) { Relation T = Join(copy(array_elt), Inverse(copy(er))); Relation pre = preceding(T, (cutoff_k <= k)); DBG({ printrel(T, "could_touch_after_or_at(): T:"); printrel(pre, "pre:"); printrel(cutoff_TS_position, "cutoff:"); }); return Intersection(cutoff_TS_position, pre).is_lower_bound_satisfiable(); } /* If read in m always reads a datum supplied by the same statement, specified by the first three args, then return an appropriate st_source. Otherwise return NULL. */ static st_source *always_is_source_for_read(intvector *write_TS_offset, int write_k, int write_access, rwmap_element *read, int k, Megatile *m, smap &s, rwmap &w, const Foreach **ff, const intvector **vv, int size) { #define FAIL(x) do { DBG(x); return NULL; } while (0) smap_element *write = access_to_smap_element(write_access, s); if (write != NULL) { int arity = m->arity(); // TS (and iteration space) arity Relation rt = relation_from_tilespace_to_arrayindex(read, k, ff[k], vv[k]), wt = relation_from_tilespace_to_arrayindex(write, write_k, ff[write_k], vv[write_k]); /* In a generic position in TS, what array element does read touch? */ llist *generic = NULL; /* Set generic to be a generic point in space. */ for (int i = arity; i > 0; --i) push(generic, freevar(i)); Relation array_elt = apply_to_generic(rt, generic); Relation read_TS = apply_to_generic(identity_relation(arity), generic); Relation write_TS = translate_set(read_TS, write_TS_offset); DBG({ printrel(array_elt, "array_elt:"); printrel(read_TS, "read_TS:"); cout << "Read " << read->to_string() << " occurs in step " << k << endl; printrel(write_TS, "write_TS:"); cout << "Write " << write->to_string() << " occurs in step " << write_k << " of tile with offset " << write_TS_offset->to_string() << endl; }); Relation warray_elt = apply_to_generic(translate_domain(wt, write_TS_offset->times(-1)), generic); if (!same_relation(warray_elt, array_elt)) FAIL(printrel(warray_elt, "warray_elt does not match:")); /* Go through every write in m and check if any might write to array_elt in between WRITE and READ. */ Megatile::iterator i; int pk = 0; for (i = m->begin(); !i.isDone(); i.next(), pk++) { DPRINTLN("pk = " + i2s(pk)); foreach (x, llist, *w[i.loop()]) { rwmap_element *e = *x; if (e->array == read->array && e->is_write() && e->access != write_access) { Relation er = relation_from_tilespace_to_arrayindex(e, pk, ff[pk], vv[pk]); if (could_touch_before_or_at(e, pk, er, read_TS, k, array_elt) && could_touch_after_or_at(e, pk, er, write_TS, write_k, array_elt)) FAIL(cout << "always_is_source(): no, because of " << e->to_string() << endl); } } } /* Success! */ DPRINTLN("always_is_source(): yes"); return new st_source(write_TS_offset, write_k, write); } return NULL; #undef FAIL } /* If read in m always reads a datum supplied by the same statement, and that statement occurs a fixed number of steps in m prior to read, then return an appropriate st_source. Otherwise return NULL. */ static st_source *find_source_for_read(rwmap_element *read, int k, Megatile *m, smap &s, rwmap &w, const Foreach **ff, const intvector **vv, int size) { DPRINTLN("find_source_for_read(" + read->to_string() + ") k=" + i2s(k)); Relation r = relation_from_tilespace_to_arrayindex(read, k, ff[k], vv[k]); int TS_arity = m->arity(); // tile space arity intvector *zero = zero_vector(TS_arity); Relation lex_neg = lexicographically_negative(TS_arity); // best guess as to a source that is a fixed number of steps prior to read intvector *best = NULL; int best_k, best_access; Megatile::iterator i; int sk = 0; best_k = best_access = 0; for (i = m->begin(); !i.isDone(); i.next(), sk++) { DPRINTLN("sk = " + i2s(sk)); foreach (x, llist, *w[i.loop()]) { rwmap_element *e = *x; if (e->array == read->array && e->is_write() && e->every_iter) { DPRINTLN("possible source: " + e->to_string()); Relation sr = relation_from_tilespace_to_arrayindex(e, sk, ff[sk], vv[sk]); Relation tilespace_to_tilespace = Join(copy(r), Inverse(copy(sr))); Relation at_zero = eval_relation_at_zero(tilespace_to_tilespace); /* We're only interested in lexicographically nonpositive elements of at_zero; lex. positive values correspond to writes after the read in question. */ llist *possibles = NULL; if (sk < k && set_contains_intvector(at_zero, zero)) { DPRINTLN("possibles=" + zero->to_string()); possibles = cons(zero); } else { DBG(printrel(at_zero, "at_zero:")); at_zero = Intersection(at_zero, copy(lex_neg)); bool finite = set_to_llist(at_zero, possibles); if (!finite) { DBG(printrel(at_zero, "fail because not finite:")); return NULL; } DBG({ printrel(r, "r:"); printrel(sr, "sr:"); printrel(tilespace_to_tilespace, "tilespace_to_tilespace:"); }); if (possibles == NULL) { DPRINTLN("possibles=NULL"); continue; } } /* Choose the most recent of possibles. */ intvector *p = (*possibles)[lexicographically_last(possibles)]; free_all(possibles); if (best == NULL || best->lexicographically_precedes(p) || best->equals(p)) { best = p; best_k = sk; best_access = e->access; DPRINTLN("best=p=" + p->to_string()); } else { DPRINTLN("p=" + p->to_string() + " (discarded)"); } } } } if (best != NULL) return always_is_source_for_read(best, best_k, best_access, read, k, m, s, w, ff, vv, size); return NULL; } /* add code to m that causes it to fail unless the tile space of rk translated by d is a subset of the tile space of wk. rk and wk are "step numbers" within m. */ static void require_subset(Megatile *m, int rk, const intvector *d, int wk) { int rl = m->step(rk).loop_number(), wl = m->step(wk).loop_number(), rindex = m->which_node_from_loop(rl, rk), windex = m->which_node_from_loop(wl, wk); DBG(cout << "require_subset(rl=" << rl << ", rindex=" << rindex << ", wl=" << wl << ", windex=" << windex << ", d=" << d->to_string() << ")\n"); m->subset_or_fail(rl, rindex, wl, windex, d); } /* Look for and perhaps shrink storage for temporary arrays. Optimized access are added (in the form of (k, access) pairs) to ignore to prevent later opts from processing them. */ static void shrink_temp_arrays(Megatile *m, smap &s, rwmap &w, const Foreach **ff, const intvector **vv, int size, set &ignore) { /* If this is not the first opt, ignore might be non-empty. That is no problem, but it would require additional logic below, to make sure there is no conflict with any previous opts. */ assert(ignore.empty()); Megatile::iterator i; set arrays; map *> writes, reads; // arrays written/read in loop l // map array number to first/last loop using it in the original program order map firstloopw, lastloopw; // writes map firstloopr, lastloopr; // reads for (i = m->begin(); !i.isDone(); i.next()) { int l = i.loop_number(); if (writes[l] == NULL) { writes[l] = new set; reads[l] = new set; foreach (x, llist, *w[i.loop()]) { int array = (*x)->array; if (array == 0) { DPRINTLN("shrink_temp_arrays() stymied by possible " "unknown op(s) on array(s)"); return; } ((*x)->is_read() ? reads[l] : writes[l])->insert(array); if (arrays.find(array) == arrays.end()) { arrays.insert(array); firstloopr[array] = firstloopw[array] = MAXINT; lastloopr[array] = lastloopw[array] = MININT; } if ((*x)->is_read()) { firstloopr[array] = std::min(l, firstloopr[array]); lastloopr[array] = std::max(l, lastloopr[array]); } else { firstloopw[array] = std::min(l, firstloopw[array]); lastloopw[array] = std::max(l, lastloopw[array]); } } } } for (set::iterator a = arrays.begin(); a != arrays.end(); a++) { int arr = *a, fr = firstloopr[arr], lr = lastloopr[arr], fw = firstloopw[arr], lw = lastloopw[arr]; DBG({ cout << "Array " << arr << " (" << arrayname(arr) << "):"; if (lr >= 0) cout << " fr=" << fr << " lr=" << lr; if (lw >= 0) cout << " fw=" << fw << " lw=" << lw; cout << endl; }); /* Array must be both written and read or we're not interested; also, ignore cases where it is read before it is first written or written after it is last read. */ if (lr < 0 || lw < 0 || fr < fw || lw > lr) continue; /* Elements must not be read again after last loop to read arr. */ set const &junk = m->nth_loop(lr)->junk_post; if (junk.find(arr) == junk.end()) { DPRINTLN("junk_post for loop " + i2s(lr) + " doesn't contain " + i2s(arr)); continue; } DPRINTLN(" could be a temp array..."); /* For simplicity, we require that for each read or possible read, there must be a (intvector v in tile space, int k, smap_element *e) that points to the source of the datum. If we can't find one, then don't optimize away this array. */ int k = 0, namecount = 0; string name = no_spaces(arrayname(arr) + "_temp"); map< intpair, st_source * > reader_to_writer; map< smap_at_k, string > writer_to_name; map< smap_at_k, intvector * > writer_to_maxdist; for (i = m->begin(); !i.isDone(); i.next(), k++) { foreach (x, llist, *w[i.loop()]) { rwmap_element *rw = *x; if (rw->is_read() && rw->array == arr) { st_source *v = find_source_for_read(rw, k, m, s, w, ff, vv, size); if (v == NULL) { DPRINTLN("find_source_for_read(...) -> NULL"); goto next_array; } reader_to_writer[intpair(k, rw->access)] = v; smap_at_k writer(v->k, v->e); if (writer_to_name[writer].empty()) { writer_to_name[writer] = name + '_' + i2s(namecount++); writer_to_maxdist[writer] = v->v; } else { intvector *& max = writer_to_maxdist[writer]; if (v->v->lexicographically_precedes(max)) max = v->v; } } } } /* Success */ DPRINTLN(arrayname(arr) + " is a temp array!"); if (get_bool_parameter("Use compiler generated scratch space instead of temporary array " + arrayname(arr), "If true, the compiler will automatically replace loads and stores to this array with loads and stores to a lower-dimension compiler-generated temporary array (or to scalar variables).", true)) { // m->rw = w; for (map< intpair, st_source * >::iterator g = reader_to_writer.begin(); g != reader_to_writer.end(); ++g) { intpair reader = (*g).first; st_source *v = (*g).second; m->reader_to_writer[reader] = v; ignore.insert(reader); ignore.insert(intpair(v->k, v->e->access)); DPRINTLN("shrink_temp_array(): opt acc=" + i2s(reader.second) + ",k=" + i2s(reader.first) + " from " + v->e->to_string() + ",k=" + i2s(v->k)); assert(arr == v->e->array); if (!m->step(v->k).loop()->was_junk_before_this(arr)) require_subset(m, reader.first, v->v, v->k); } for (map< smap_at_k, string >::iterator g = writer_to_name.begin(); g != writer_to_name.end(); ++g) m->writer_to_name[(*g).first] = (*g).second; for (map< smap_at_k, intvector * >::iterator g = writer_to_maxdist.begin(); g != writer_to_maxdist.end(); ++g) m->writer_to_maxdist[(*g).first] = (*g).second; } next_array: ; } } /**************************************************************************** * waw_storage_opt() and related utils ****************************************************************************/ /* At step k of m, e writes to the same array element in each tile in a bundle. If we can locate all reads and writes in m to that element, then we can use a register to hold the value instead. */ static void written_every_tile(smap_element *e, int k, Megatile *m, smap &s, rwmap &w, const Foreach **ff, const intvector **vv, int size, set &ignore) { #define TAG "written_every_tile(e=" + e->to_string() + ", k=" + i2s(k) + ")" #define FAIL do { DPRINTLN(TAG + ": fail"); goto free_everything; } while (0) DPRINTLN(TAG); int arity = m->arity(); // iteration space arity llist *generic = NULL; /* Set generic to be a generic point in space. */ for (int i = arity; i > 0; --i) push(generic, freevar(i)); Relation x = apply_to_generic(relation_from_tilespace_to_arrayindex(e, k, ff[k], vv[k]), generic); /* Change generic to be a generic point in the same bundle as before. */ // Actually, unnecessary since x won't contain any uses of freevar(arity) // generic = dreverse(cons(freevar(0), dreverse(generic)->free())); DBG(printrel(x, "e's tilespace to array index:")); int sk = 0; set *touches = new set; for (Megatile::iterator i = m->begin(); !i.isDone(); i.next(), sk++) { DBG(cout << "From " << i.loop()->source_code_location << " do " << i.v()->to_string() << endl); foreach (rw, llist, *w[i.loop()]) { if (ignore.find(intpair(sk, (*rw)->access)) != ignore.end()) continue; if ((*rw)->might_be_same_array(e)) { if (!(*rw)->must_be_same_array(e)) FAIL; } else continue; /* If we get here *rw and e refer to the same array. */ Relation y = apply_to_generic(relation_from_tilespace_to_arrayindex(*rw, sk, ff[sk], vv[sk]), generic); DBG(printrel(y, "tilespace to array index:")); /* Imagine points p and q are two generic points in the same bundle in tile space. Then x and y are the array indices touched at p (by e) and at q (by *rw). In order to do the optimization we require that either x and y do not overlap or that x and y are identical. */ Relation intersection = Intersection(copy(x), copy(y)); DBG(printrel(intersection, "intersection:")); if (intersection.is_lower_bound_satisfiable()) if (same_relation(x, y)) touches->insert(touch(sk, *rw)); else FAIL; } } assert(!touches->empty()); DPRINTLN(TAG + ": success"); for (set::iterator i = touches->begin(); i != touches->end(); i++) { int k = (*i).first; rwmap_element *rw = (*i).second; DPRINTLN(" k=" + i2s(k) + ' ' + rw->to_string()); ignore.insert(intpair(k, rw->access)); } m->use_scalar_in_bundle(touches); goto free_all_but_touches; free_everything: delete touches; free_all_but_touches: free_all(generic); #undef FAIL #undef TAG } static bool same_loc_every_tile(smap_element *e, int k, const Foreach *f, const intvector *v) { const int n = f->arity(); Relation r = relation_from_tilespace_to_arrayindex(e, k, f, v); Relation *line = wholeline(unit_vector(n - 1, n), zero_vector(n)); r = Join(copy(*line), r); bool result = is_singleton_set(r); DPRINTLN("same_loc_every_tile(" + relation_to_str(r) + "): " + (result ? '1' : '0')); return result; } /* Look for and optimize: 1. writes to same location in every tile in a bundle 2. writes to same location multiple times in a tile (unimplemented!) In both cases, all possible reads and writes to the location of interest must be known. As a side-effect, insert pairs into ignore for all optimized accesses. */ static void waw_storage_opt(Megatile *m, smap &s, rwmap &w, const Foreach **ff, const intvector **vv, int size, set &ignore) { DPRINTLN("waw_storage_opt()"); // map (array, index) to list of touches that only write to that location // in tile 0. map< int, map< string, llist * > > z; int k = 0; Megatile::iterator i; set unknown; // k's for which the kth step has an unknown array op /* Compute z and unknown. */ for (i = m->begin(); !i.isDone(); i.next(), k++) { DPRINTLN("k = " + i2s(k)); foreach (x, llist, *w[i.loop()]) if ((*x)->array == 0) unknown.insert(k); foreach (x, llist, *s[i.loop()]) { smap_element *e = *x; int array = e->array; if (array == 0) { DPRINTLN("unknown array op: " + e->to_string()); unknown.insert(k); break; } if (!e->is_read()) { Relation r = relation_from_tilespace_to_arrayindex(e, k, ff[k], vv[k]); Relation at_zero = eval_relation_at_zero(r); intvector *v; if (is_singleton_set(at_zero, &v)) { string s = v->to_string(); DPRINTLN("Write " + e->to_string() + ' ' + s); push(z[array][s], smap_at_k(k, e)); } else DPRINTLN("Write " + e->to_string() + ' ' + relation_to_str(at_zero)); } } } /* z and unknown have been computed. If unknown is empty then if every tile in a bundle writes to the same location we may be able to optimize that. */ if (unknown.empty()) { map< int, map< string, llist * > >::iterator j; for (j = z.begin(); j != z.end(); ++j) { map< string, llist * >::iterator jj; for (jj = (*j).second.begin(); jj != (*j).second.end(); ++jj) { const string &index = (*jj).first; DPRINTLN("Check location " + index); llist *l = (*jj).second; if (l != NULL) { k = l->front().first; DPRINT("k=" + i2s(k) + ' '); smap_element *e = l->front().second; if (same_loc_every_tile(e, k, ff[k], vv[k])) { while ((l = l->tail()) != NULL) { int tk = l->front().first; if (!same_loc_every_tile(l->front().second, tk, ff[tk], vv[tk])) goto next_loc; } /* every elt of the list ol touches the same elt every iteration */ written_every_tile(e, k, m, s, w, ff, vv, size, ignore); } } next_loc: ; } } } /* Delete cons cells in z. */ map< int, map< string, llist * > >::iterator j; for (j = z.begin(); j != z.end(); ++j) { map< string, llist * >::iterator k; for (k = (*j).second.begin(); k != (*j).second.end(); ++k) free_all((*k).second); } } /**************************************************************************** * storage_opt() (high level driver) and related utils ****************************************************************************/ /* s is optimizable accesses; w is all writes, including those in s. Try to replace multiple writes to the same location with one write. Try to optimize/schedule the array reads specified in s. May destroy/modify s and w (?). */ static void storage_opt(Megatile *m, smap &s, rwmap &w) { DPRINTLNV("\nstorage_opt()\n"); Megatile::iterator i; int size, k = 0; const Foreach **ff; const intvector **vv; m->summarize(ff, vv, size); set ignore; if (enable_storage_opt_ts) { shrink_temp_arrays(m, s, w, ff, vv, size, ignore); DBG(m->summarize_ts(cout)); } if (enable_storage_opt_usib) { waw_storage_opt(m, s, w, ff, vv, size, ignore); DBG(m->summarize_usib(cout)); } for (i = m->begin(); !i.isDone(); i.next(), k++) { DBG(cout << "From " << i.loop()->source_code_location << " do " << i.v()->to_string() << endl); foreach (e, llist, *s[i.loop()]) if ((*e)->is_read() && ignore.find(intpair(k, (*e)->access)) == ignore.end()) optimize_read(*e, s, w, m, k, ff, vv, size); } if (enable_storage_opt_read_from_reg) m->filter_read_from_reg(); DBG(m->summarize_rr(cout)); delete[] ff; delete[] vv; } /* Clean out memoized results, etc. */ static void reset() { tilespace_to_arrayindex_memo.clear(); freevar_map_clear(); string_to_ACvar_map.clear(); access_to_smap_element_memo.clear(); memo_relk0.clear(); memo_relk1.clear(); memo_relk2.clear(); } /* s is optimizable accesses; w is all writes, including those in s. For every Megatile in m, try to optimize/schedule the array reads specified in s. May destroy/modify s and w. */ void storage_opt(llist *m, smap &s, rwmap &w, bool sr, bool rr, bool usib, bool ts, string (*_arrayname_fn)(int)) { arrayname_fn = _arrayname_fn; enable_storage_opt_schedule_read = sr; enable_storage_opt_read_from_reg = rr; enable_storage_opt_usib = usib; enable_storage_opt_ts = ts; reset(); if (!s.empty()) while (m != NULL) { storage_opt(m->front(), s, w); m = m->tail(); } }