Commit 3244e5da authored by Sebastian Hack's avatar Sebastian Hack
Browse files

Added the fabulous Gauss-Seidel linear equation solver by Mr. Grund

Adapted execfreqs
Now an order of magnitude faster... at least in c-lex.c

[r15816]
parent 2daaa258
#ifndef MATRIX_H_
#define MATRIX_H_
#include <stdio.h>
typedef struct _gs_matrix_t gs_matrix_t;
/**
* Allocate a new matrix and init internal data for a matrix of size
* row_init X col_init. Matrix cannot grow beyond these init values.
* All elements are initially (implicitly) set to 0.
*/
gs_matrix_t *gs_new_matrix(int n_init_rows, int n_init_cols);
/**
* Free space used by matrix m
*/
void gs_delete_matrix(gs_matrix_t *m);
void gs_matrix_assure_row_capacity(gs_matrix_t *m, int row, int min_capacity);
void gs_matrix_trim_row_capacities(gs_matrix_t *m);
void gs_matrix_delete_zero_entries(gs_matrix_t *m);
/**
* Sets m[row, col] to val
* @p increase If non-zero @p val is added to the existing
*/
void gs_matrix_set(gs_matrix_t *m, int row, int col, double val);
/**
* Returns the value stored in m[row, col].
*/
double gs_matrix_get(const gs_matrix_t *m, int row, int col);
/**
* Performs one step of the Gauss-Seidel algorithm
* @p m The iteration matrix
* @p x The iteration vector
* @p a The dimension of the matrix (axa matrix)
*/
double gs_matrix_gauss_seidel(const gs_matrix_t *m, double *x, int n);
unsigned gs_matrix_get_n_entries(const gs_matrix_t *m);
/**
* Dumps the matrix factor*m to the stream @p out.
*/
void gs_matrix_dump(const gs_matrix_t *m, int a, int b, FILE *out);
int gs_matrix_get_sizeof_allocated_memory(const gs_matrix_t *m);
void gs_matrix_export(const gs_matrix_t *m, double *nw, int size);
#endif /*MATRIX_H_*/
#include <assert.h>
#include <math.h>
#include <string.h>
#include "xmalloc.h"
#include "gaussseidel.h"
#include "firm_config.h"
#include "util.h"
#define MAX(x,y) ((x) > (y) ? (x) : (y))
#define MIN(x,y) ((x) < (y) ? (x) : (y))
/**
* The number of newly allocated rows (realloc)
* when there is no more room. Must be >= 1.
*/
#define ROW_INCREASE_FACTOR 1.2
/**
* The number of newly allocated cols (realloc)
* when there is no more room. Must be >= 1.
*/
#define COL_INCREASE 2
typedef struct _col_val_t {
double v;
int col_idx;
} col_val_t;
typedef struct _row_col_t {
int c_cols;
int n_cols;
double diag;
col_val_t *cols;
} row_col_t;
struct _gs_matrix_t {
int initial_col_increase;
int c_rows;
int n_zero_entries; ///< Upper bound on number of entries equal to 0.0
row_col_t *rows;
};
static INLINE void alloc_cols(row_col_t *row, int c_cols) {
assert(c_cols > row->c_cols);
row->c_cols = c_cols;
row->cols = xrealloc(row->cols, c_cols * sizeof(*row->cols));
}
static INLINE void alloc_rows(gs_matrix_t *m, int c_rows, int c_cols, int begin_init) {
int i;
assert(c_rows > m->c_rows);
m->c_rows = c_rows;
m->rows = xrealloc(m->rows, c_rows * sizeof(*m->rows));
for (i = begin_init; i < c_rows; ++i) {
m->rows[i].c_cols = 0;
m->rows[i].n_cols = 0;
m->rows[i].diag = 0.0;
m->rows[i].cols = NULL;
if (c_cols > 0)
alloc_cols(&m->rows[i], c_cols);
}
}
gs_matrix_t *gs_new_matrix(int n_init_rows, int n_init_cols) {
gs_matrix_t *res = xmalloc(sizeof(*res));
memset(res, 0, sizeof(*res));
if (n_init_rows < 16)
n_init_rows = 16;
res->initial_col_increase = n_init_cols;
alloc_rows(res, n_init_rows, n_init_cols, 0);
return res;
}
void gs_delete_matrix(gs_matrix_t *m) {
int i;
for (i = 0; i < m->c_rows; ++i) {
if (m->rows[i].c_cols)
xfree(m->rows[i].cols);
}
if (m->c_rows)
xfree(m->rows);
xfree(m);
}
unsigned gs_matrix_get_n_entries(const gs_matrix_t *m) {
int i;
unsigned n_entries = 0;
for (i = 0; i < m->c_rows; ++i) {
n_entries += m->rows[i].n_cols;
n_entries += (m->rows[i].diag != 0.0) ? 1 : 0;
}
return n_entries - m->n_zero_entries;
}
int gs_matrix_get_sizeof_allocated_memory(const gs_matrix_t *m) {
int i, n_col_val_ts = 0;
for (i = 0; i < m->c_rows; ++i)
n_col_val_ts += m->rows[i].c_cols;
return n_col_val_ts * sizeof(col_val_t) + m->c_rows * sizeof(row_col_t) + sizeof(gs_matrix_t);
}
void gs_matrix_assure_row_capacity(gs_matrix_t *m, int row, int min_capacity) {
row_col_t *the_row = &m->rows[row];
if (the_row->c_cols < min_capacity)
alloc_cols(the_row, min_capacity);
}
void gs_matrix_trim_row_capacities(gs_matrix_t *m) {
int i;
for (i = 0; i < m->c_rows; ++i) {
row_col_t *the_row = &m->rows[i];
if (the_row->c_cols) {
the_row->c_cols = the_row->n_cols;
if (the_row->c_cols)
the_row->cols = xrealloc(the_row->cols, the_row->c_cols * sizeof(*the_row->cols));
else
xfree(the_row->cols);
}
}
}
void gs_matrix_delete_zero_entries(gs_matrix_t *m) {
int i, read_pos;
for (i = 0; i < m->c_rows; ++i) {
row_col_t *the_row = &m->rows[i];
int write_pos = 0;
for (read_pos = 0; read_pos < the_row->n_cols; ++read_pos)
if (the_row->cols[read_pos].v != 0.0 && read_pos != write_pos)
the_row->cols[write_pos++] = the_row->cols[read_pos];
the_row->n_cols = write_pos;
}
m->n_zero_entries = 0;
}
void gs_matrix_set(gs_matrix_t *m, int row, int col, double val) {
row_col_t *the_row;
col_val_t *cols;
int min, max, c, i;
if (row >= m->c_rows) {
int new_c_rows = ROW_INCREASE_FACTOR * row;
alloc_rows(m, new_c_rows, m->initial_col_increase, m->c_rows);
}
the_row = &m->rows[row];
if (row == col) {
/* Note that we store the diagonal inverted to turn divisions to mults in
* matrix_gauss_seidel(). */
assert(val != 0.0);
the_row->diag = 1.0 / val;
return;
}
// Search for correct column
cols = the_row->cols;
min = 0;
max = the_row->n_cols;
c = (max+min)/2;
while (min < max) {
int idx = cols[c].col_idx;
if (idx < col)
min = MAX(c, min+1);
else if (idx > col)
max = MIN(c, max-1);
else
break;
c = (max+min)/2;
}
// Have we found the entry?
if (c < the_row->n_cols && the_row->cols[c].col_idx == col) {
the_row->cols[c].v = val;
if (val == 0.0)
m->n_zero_entries++;
return;
}
// We haven't found the entry, so we must create a new one.
// Is there enough space?
if (the_row->c_cols == the_row->n_cols)
alloc_cols(the_row, the_row->c_cols + COL_INCREASE);
// Shift right-most entries to the right by one
for (i = the_row->n_cols; i > c; --i)
the_row->cols[i] = the_row->cols[i-1];
// Finally insert the new entry
the_row->n_cols++;
the_row->cols[c].col_idx = col;
the_row->cols[c].v = val;
// Check that the entries are sorted
assert(c==0 || the_row->cols[c-1].col_idx < the_row->cols[c].col_idx);
assert(c>=the_row->n_cols-1 || the_row->cols[c].col_idx < the_row->cols[c+1].col_idx);
}
double gs_matrix_get(const gs_matrix_t *m, int row, int col) {
int c;
if (row >= m->c_rows)
return 0.0;
row_col_t *the_row = &m->rows[row];
if (row == col)
return the_row->diag != 0.0 ? 1.0 / the_row->diag : 0.0;
// Search for correct column
for (c = 0; c < the_row->n_cols && the_row->cols[c].col_idx < col; ++c);
if (c >= the_row->n_cols || the_row->cols[c].col_idx > col)
return 0.0;
assert(the_row->cols[c].col_idx == col);
return the_row->cols[c].v;
}
/* NOTE: You can slice out miss_rate and weights.
* This does ONE step of gauss_seidel. Termination must be checked outside!
* This solves m*x=0. You must add stuff for m*x=b. See wikipedia german and english article. Should be simple.
* param a is the number of rows in the matrix that should be considered.
*
* Note that the diagonal element is stored separately in this matrix implementation.
* */
double gs_matrix_gauss_seidel(const gs_matrix_t *m, double *x, int n) {
double res = 0.0;
int r;
assert(n <= m->c_rows);
for (r = 0; r < n; ++r) {
row_col_t *row = &m->rows[r];
col_val_t *cols = row->cols;
double sum, old, nw;
int c;
sum = 0.0;
for (c = 0; c < row->n_cols; ++c) {
int col_idx = cols[c].col_idx;
sum += cols[c].v * x[col_idx];
}
old = x[r];
nw = - sum * row->diag;
// nw = old - overdrive * (old + sum * row->diag);
res += fabs(old - nw);
x[r] = nw;
}
return res;
}
void gs_matrix_export(const gs_matrix_t *m, double *nw, int size)
{
int effective_rows = MIN(size, m->c_rows);
int c, r;
memset(nw, 0, size * size * sizeof(*nw));
for (r=0; r < effective_rows; ++r) {
row_col_t *row = &m->rows[r];
int base = r * size;
assert(row->diag != 0.0);
nw[base + r] = 1.0 / row->diag;
for (c = 0; c < row->n_cols; ++c) {
int col_idx = row->cols[c].col_idx;
nw[base + col_idx] = row->cols[c].v;
}
}
}
void gs_matrix_dump(const gs_matrix_t *m, int a, int b, FILE *out) {
int effective_rows = MIN(a, m->c_rows);
int r, c, i;
double *elems = xmalloc(b * sizeof(*elems));
// The rows which have some content
for (r=0; r < effective_rows; ++r) {
memset(elems, 0, b * sizeof(*elems));
row_col_t *row = &m->rows[r];
for (c = 0; c < row->n_cols; ++c) {
int col_idx = row->cols[c].col_idx;
elems[col_idx] = row->cols[c].v;
}
elems[r] = row->diag != 0.0 ? 1.0 / row->diag : 0.0;
for (i = 0; i < b; ++i)
if (elems[i] != 0.0)
fprintf(out, "%+4.4f ", elems[i]);
else
fprintf(out, " ");
fprintf(out, "\n");
}
// Append 0-rows to fit height of matrix
for (r=effective_rows; r < a; ++r) {
for (c=0; c < b; ++c)
fprintf(out, " ");
fprintf(out, "\n");
}
xfree(elems);
}
void gs_matrix_self_test(int d) {
int i, o;
gs_matrix_t *m = gs_new_matrix(10, 10);
for (i=0; i<d; ++i)
for (o=0; o<d; ++o)
gs_matrix_set(m, i, o, i*o);
for (i=0; i<d; ++i)
for (o=0; o<d; ++o)
assert(gs_matrix_get(m, i, o) == i*o);
gs_delete_matrix(m);
}
...@@ -28,24 +28,20 @@ ...@@ -28,24 +28,20 @@
#include "config.h" #include "config.h"
#endif #endif
#undef USE_GSL
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include <limits.h> #include <limits.h>
#include <math.h> #include <math.h>
#ifdef USE_GSL #include "gaussseidel.h"
#include <gsl/gsl_linalg.h>
#include <gsl/gsl_vector.h>
#else
#include "gaussjordan.h"
#endif
#include "firm_common_t.h" #include "firm_common_t.h"
#include "set.h" #include "set.h"
#include "hashptr.h" #include "hashptr.h"
#include "debug.h" #include "debug.h"
#include "statev.h"
#include "dfs_t.h"
#include "absgraph.h"
#include "irprog_t.h" #include "irprog_t.h"
#include "irgraph_t.h" #include "irgraph_t.h"
...@@ -59,21 +55,28 @@ ...@@ -59,21 +55,28 @@
#include "execfreq.h" #include "execfreq.h"
#define set_foreach(s,i) for((i)=set_first((s)); (i); (i)=set_next((s))) /* enable to also solve the equations with Gauss-Jordan */
#undef COMPARE_AGAINST_GAUSSJORDAN
#ifdef COMPARE_AGAINST_GAUSSJORDAN
#include "gaussjordan.h"
#endif
#define EPSILON 1e-5
#define UNDEF(x) (fabs(x) < EPSILON)
#define SEIDEL_TOLERANCE 1e-7
#define MAX_INT_FREQ 1000000 #define MAX_INT_FREQ 1000000
#define set_foreach(s,i) for((i)=set_first((s)); (i); (i)=set_next((s)))
typedef struct _freq_t { typedef struct _freq_t {
const ir_node *irn; const ir_node *irn;
int idx;
double freq; double freq;
} freq_t; } freq_t;
typedef struct _walkerdata_t {
set *set;
size_t idx;
} walkerdata_t;
struct ir_exec_freq { struct ir_exec_freq {
set *set; set *set;
hook_entry_t hook; hook_entry_t hook;
...@@ -134,58 +137,48 @@ get_block_execfreq_ulong(const ir_exec_freq *ef, const ir_node *bb) ...@@ -134,58 +137,48 @@ get_block_execfreq_ulong(const ir_exec_freq *ef, const ir_node *bb)
{ {
double f = get_block_execfreq(ef, bb); double f = get_block_execfreq(ef, bb);
int res = (int) (f > ef->min_non_zero ? ef->m * f + ef->b : 1.0); int res = (int) (f > ef->min_non_zero ? ef->m * f + ef->b : 1.0);
// printf("%20.6f %10d\n", f, res);
return res; return res;
} }
#define EPSILON 0.0001
#define UNDEF(x) !(x > EPSILON)
static void
block_walker(ir_node * bb, void * data)
{
walkerdata_t *wd = data;
set_insert_freq(wd->set, bb);
set_irn_link(bb, (void*)wd->idx++);
}
#ifdef USE_GSL
static gsl_vector *
solve_lgs(double * a_data, double * b_data, size_t size)
{
gsl_matrix_view m
= gsl_matrix_view_array (a_data, size, size);
gsl_vector_view b
= gsl_vector_view_array (b_data, size);
gsl_vector *x = gsl_vector_alloc (size);
int s;
gsl_permutation * p = gsl_permutation_alloc (size);
gsl_linalg_LU_decomp (&m.matrix, p, &s);
gsl_linalg_LU_solve (&m.matrix, p, &b.vector, x);
gsl_permutation_free (p);
return x;
}
#else
static double * static double *
solve_lgs(double * A, double * b, size_t size) solve_lgs(gs_matrix_t *mat, double *x, int size)
{ {
if(firm_gaussjordansolve(A,b,size) == 0) { double init = 1.0 / size;
return b; double dev;
} else { int i, iter = 0;
return NULL;
} /* better convergence. */
for (i = 0; i < size; ++i)
x[i] = init;
stat_ev_dbl("execfreq_matrix_size", size);
stat_ev_tim_push();
do {
++iter;
dev = gs_matrix_gauss_seidel(mat, x, size);
} while(fabs(dev) > SEIDEL_TOLERANCE);
stat_ev_tim_pop("execfreq_seidel_time");
stat_ev_dbl("execfreq_seidel_iter", iter);
#ifdef COMPARE_AGAINST_GAUSSJORDAN
{
double *nw = xmalloc(size * size * sizeof(*nw));
double *nx = xmalloc(size * sizeof(*nx));
memset(nx, 0, size * sizeof(*nx));
gs_matrix_export(mat, nw, size);
stat_ev_tim_push();
firm_gaussjordansolve(nw, nx, size);
stat_ev_tim_pop("execfreq_jordan_time");
xfree(nw);
xfree(nx);
}
#endif
return x;
} }
#endif /* USE_GSL */
static double static double
get_cf_probability(ir_node *bb, int pos, double loop_weight) get_cf_probability(ir_node *bb, int pos, double loop_weight)
...@@ -241,23 +234,26 @@ void set_execfreq(ir_exec_freq *execfreq, const ir_node *block, double freq) ...@@ -241,23 +234,26 @@ void set_execfreq(ir_exec_freq *execfreq, const ir_node *block, double freq)
ir_exec_freq * ir_exec_freq *
compute_execfreq(ir_graph * irg, double loop_weight) compute_execfreq(ir_graph * irg, double loop_weight)
{ {
size_t size; gs_matrix_t *mat;
double *matrix; int size;
double *rhs; int idx;
int i; freq_t *freq, *s, *e;
freq_t *freq; ir_exec_freq *ef;
walkerdata_t wd;
ir_exec_freq *ef;
set *freqs; set *freqs;
#ifdef USE_GSL dfs_t *dfs;
gsl_vector *x;
#else
double *x; double *x;
#endif double norm;
/*
* compute a DFS.
* using a toposort on the CFG (without back edges) will propagate
* the values better for the gauss/seidel iteration.
* => they can "flow" from start to end.
*/
dfs = dfs_new(&absgraph_irg_cfg_succ, irg);
ef = xmalloc(sizeof(ef[0])); ef = xmalloc(sizeof(ef[0]));
memset(ef, 0, sizeof(ef[0])); memset(ef, 0, sizeof(ef[0]));
ef->min_non_zero = 1e50; /* initialize with a reasonable large number. */ ef->min_non_zero = HUGE_VAL; /* initialize with a reasonable large number. */
freqs = ef->set = new_set(cmp_freq, 32); freqs = ef->set = new_set(cmp_freq, 32);
construct_cf_backedges(irg); construct_cf_backedges(irg);
...@@ -267,123 +263,118 @@ compute_execfreq(ir_graph * irg, double loop_weight) ...@@ -267,123 +263,118 @@ compute_execfreq(ir_graph * irg, double loop_weight)
edges_activate(irg); edges_activate(irg);
/* edges_assure(irg); */ /* edges_assure(irg); */
wd.idx = 0; size = dfs_get_n_nodes(dfs);
wd.set = freqs;