Commit c1106f5b authored by Matthias Braun
Browse files

ia32: use builtin lowerer for popcount instead of handcrafted transformer

parent 173b3a62
......@@ -33,6 +33,7 @@
#include "lower_calls.h"
#include "lower_mode_b.h"
#include "lower_softfloat.h"
#include "lower_builtins.h"
#include "firmstat_t.h"
#include "beabi.h"
......@@ -1779,7 +1780,7 @@ static int ia32_is_valid_clobber(const char *clobber)
static void ia32_lower_for_target(void)
{
ir_mode *mode_gp = ia32_reg_classes[CLASS_ia32_gp].mode;
size_t i, n_irgs = get_irp_n_irgs();
size_t n_irgs = get_irp_n_irgs();
/* perform doubleword lowering */
lwrdw_param_t lower_dw_params = {
......@@ -1803,7 +1804,28 @@ static void ia32_lower_for_target(void)
lower_floating_point();
}
for (i = 0; i < n_irgs; ++i) {
ir_builtin_kind supported[32];
size_t s = 0;
supported[s++] = ir_bk_trap;
supported[s++] = ir_bk_debugbreak;
supported[s++] = ir_bk_return_address;
supported[s++] = ir_bk_frame_address;
supported[s++] = ir_bk_prefetch;
supported[s++] = ir_bk_ffs;
supported[s++] = ir_bk_clz;
supported[s++] = ir_bk_ctz;
supported[s++] = ir_bk_parity;
supported[s++] = ir_bk_bswap;
supported[s++] = ir_bk_outport;
supported[s++] = ir_bk_inport;
supported[s++] = ir_bk_inner_trampoline;
supported[s++] = ir_bk_saturating_increment;
if (ia32_cg_config.use_popcnt)
supported[s++] = ir_bk_popcount;
assert(s < ARRAY_SIZE(supported));
lower_builtins(s, supported);
for (size_t i = 0; i < n_irgs; ++i) {
ir_graph *irg = get_irp_irg(i);
/* break up switches with wide ranges */
lower_switch(irg, 4, 256, mode_gp);
......@@ -1812,13 +1834,13 @@ static void ia32_lower_for_target(void)
ir_prepare_dw_lowering(&lower_dw_params);
ir_lower_dw_ops();
for (i = 0; i < n_irgs; ++i) {
for (size_t i = 0; i < n_irgs; ++i) {
ir_graph *irg = get_irp_irg(i);
/* lower for mode_b stuff */
ir_lower_mode_b(irg, mode_Iu);
}
for (i = 0; i < n_irgs; ++i) {
for (size_t i = 0; i < n_irgs; ++i) {
ir_graph *irg = get_irp_irg(i);
/* Turn all small CopyBs into loads/stores, keep medium-sized CopyBs,
* so we can generate rep movs later, and turn all big CopyBs into
......
......@@ -4695,6 +4695,10 @@ static ir_node *gen_be_Call(ir_node *node)
am.new_op2, sp, fpcw, eax, ecx, edx, pop, call_tp);
ir_set_throws_exception(call, throws_exception);
set_am_attributes(call, &am);
arch_set_irn_register_out(call, pn_ia32_Call_stack,
&ia32_registers[REG_ESP]);
arch_set_irn_register_out(call, pn_ia32_Call_fpcw,
&ia32_registers[REG_FPCW]);
call = fix_mem_proj(call, &am);
if (get_irn_pinned(node) == op_pin_state_pinned)
......@@ -5035,109 +5039,26 @@ static ir_node *gen_parity(ir_node *node)
*/
static ir_node *gen_popcount(ir_node *node)
{
ir_node *param = get_Builtin_param(node, 0);
dbg_info *dbgi = get_irn_dbg_info(node);
ir_node *block = get_nodes_block(node);
ir_node *new_block = be_transform_node(block);
ir_node *new_param;
ir_node *imm, *simm, *m1, *s1, *s2, *s3, *s4, *s5, *m2, *m3, *m4, *m5, *m6, *m7, *m8, *m9, *m10, *m11, *m12, *m13;
/* check for SSE4.2 or SSE4a and use the popcnt instruction */
if (ia32_cg_config.use_popcnt) {
ia32_address_mode_t am;
ia32_address_t *addr = &am.addr;
ir_node *cnt;
match_arguments(&am, block, NULL, param, NULL, match_am | match_16bit_am | match_upconv);
cnt = new_bd_ia32_Popcnt(dbgi, new_block, addr->base, addr->index, addr->mem, am.new_op2);
set_am_attributes(cnt, &am);
set_ia32_ls_mode(cnt, get_irn_mode(param));
SET_IA32_ORIG_NODE(cnt, node);
return fix_mem_proj(cnt, &am);
}
new_param = be_transform_node(param);
/* do the standard popcount algo */
/* TODO: This is stupid, we should transform this before the backend,
* to get CSE, localopts, etc. for the operations
* TODO: This is also not the optimal algorithm (it is just the starting
* example in hackers delight, they optimize it more on the following page)
* But I'm too lazy to fix this now, as the code should get lowered before
* the backend anyway.
*/
ir_graph *const irg = get_Block_irg(new_block);
/* m1 = x & 0x55555555 */
imm = ia32_create_Immediate(irg, NULL, 0, 0x55555555);
m1 = new_bd_ia32_And(dbgi, new_block, noreg_GP, noreg_GP, nomem, new_param, imm);
/* s1 = x >> 1 */
simm = ia32_create_Immediate(irg, NULL, 0, 1);
s1 = new_bd_ia32_Shr(dbgi, new_block, new_param, simm);
/* m2 = s1 & 0x55555555 */
m2 = new_bd_ia32_And(dbgi, new_block, noreg_GP, noreg_GP, nomem, s1, imm);
/* m3 = m1 + m2 */
m3 = new_bd_ia32_Lea(dbgi, new_block, m2, m1);
/* m4 = m3 & 0x33333333 */
imm = ia32_create_Immediate(irg, NULL, 0, 0x33333333);
m4 = new_bd_ia32_And(dbgi, new_block, noreg_GP, noreg_GP, nomem, m3, imm);
/* s2 = m3 >> 2 */
simm = ia32_create_Immediate(irg, NULL, 0, 2);
s2 = new_bd_ia32_Shr(dbgi, new_block, m3, simm);
/* m5 = s2 & 0x33333333 */
m5 = new_bd_ia32_And(dbgi, new_block, noreg_GP, noreg_GP, nomem, s2, imm);
/* builtin lowerer should have replaced the popcount if !use_popcnt */
assert(ia32_cg_config.use_popcnt);
/* m6 = m4 + m5 */
m6 = new_bd_ia32_Lea(dbgi, new_block, m4, m5);
ir_node *param = get_Builtin_param(node, 0);
ir_node *block = get_nodes_block(node);
ia32_address_mode_t am;
match_arguments(&am, block, NULL, param, NULL,
match_am | match_16bit_am | match_upconv);
/* m7 = m6 & 0x0F0F0F0F */
imm = ia32_create_Immediate(irg, NULL, 0, 0x0F0F0F0F);
m7 = new_bd_ia32_And(dbgi, new_block, noreg_GP, noreg_GP, nomem, m6, imm);
/* s3 = m6 >> 4 */
simm = ia32_create_Immediate(irg, NULL, 0, 4);
s3 = new_bd_ia32_Shr(dbgi, new_block, m6, simm);
/* m8 = s3 & 0x0F0F0F0F */
m8 = new_bd_ia32_And(dbgi, new_block, noreg_GP, noreg_GP, nomem, s3, imm);
/* m9 = m7 + m8 */
m9 = new_bd_ia32_Lea(dbgi, new_block, m7, m8);
/* m10 = m9 & 0x00FF00FF */
imm = ia32_create_Immediate(irg, NULL, 0, 0x00FF00FF);
m10 = new_bd_ia32_And(dbgi, new_block, noreg_GP, noreg_GP, nomem, m9, imm);
/* s4 = m9 >> 8 */
simm = ia32_create_Immediate(irg, NULL, 0, 8);
s4 = new_bd_ia32_Shr(dbgi, new_block, m9, simm);
/* m11 = s4 & 0x00FF00FF */
m11 = new_bd_ia32_And(dbgi, new_block, noreg_GP, noreg_GP, nomem, s4, imm);
/* m12 = m10 + m11 */
m12 = new_bd_ia32_Lea(dbgi, new_block, m10, m11);
/* m13 = m12 & 0x0000FFFF */
imm = ia32_create_Immediate(irg, NULL, 0, 0x0000FFFF);
m13 = new_bd_ia32_And(dbgi, new_block, noreg_GP, noreg_GP, nomem, m12, imm);
/* s5 = m12 >> 16 */
simm = ia32_create_Immediate(irg, NULL, 0, 16);
s5 = new_bd_ia32_Shr(dbgi, new_block, m12, simm);
ia32_address_t *addr = &am.addr;
dbg_info *dbgi = get_irn_dbg_info(node);
ir_node *new_block = be_transform_node(block);
ir_node *cnt = new_bd_ia32_Popcnt(dbgi, new_block, addr->base,
addr->index, addr->mem,
am.new_op2);
set_am_attributes(cnt, &am);
set_ia32_ls_mode(cnt, get_irn_mode(param));
/* res = m13 + s5 */
return new_bd_ia32_Lea(dbgi, new_block, m13, s5);
SET_IA32_ORIG_NODE(cnt, node);
return fix_mem_proj(cnt, &am);
}
/**
......@@ -5466,18 +5387,6 @@ found:;
}
res = new_rd_Proj(dbgi, new_call, mode, proj);
/* TODO arch_set_irn_register() only operates on Projs, need variant with index */
switch (proj) {
case pn_ia32_Call_stack:
arch_set_irn_register(res, &ia32_registers[REG_ESP]);
break;
case pn_ia32_Call_fpcw:
arch_set_irn_register(res, &ia32_registers[REG_FPCW]);
break;
}
return res;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment