ia32_architecture.c 32.3 KB
Newer Older
/*
 * This file is part of libFirm.
 * Copyright (C) 2012 University of Karlsruhe.
 */

/**
 * @file
 * @brief       ia32 architecture variants
 * @author      Michael Beck, Matthias Braun
 */
#include "config.h"

Matthias Braun's avatar
Matthias Braun committed
13
#include <stdbool.h>
14
15

#include "irmode_t.h"
Matthias Braun's avatar
Matthias Braun committed
16
17
#include "lc_opts.h"
#include "lc_opts_enum.h"
18
19
#include "irtools.h"
#include "ia32_architecture.h"
20
#include "tv.h"
21

22
23
#undef NATIVE_X86

24
#ifdef _MSC_VER
25
#if defined(_M_IX86) || defined(_M_X64)
26
#include <intrin.h>
27
28
29
30
31
32
#define NATIVE_X86
#endif
#else
#if defined(__i386__) || defined(__x86_64__)
#define NATIVE_X86
#endif
33
34
#endif

35
36
37
ia32_code_gen_config_t  ia32_cg_config;

/**
 * CPU architectures and features.
 *
 * The low 14 bits (see arch_mask) select exactly one micro-architecture,
 * the bits above them are instruction set feature flags.  The arch_*_insn
 * values bundle a feature with everything it implies, and the cpu_* values
 * combine one architecture bit with the features of a concrete CPU model.
 */
typedef enum cpu_arch_features {
	/* --- architecture bits --- */
	arch_generic32        = 1 << 0,  /**< no specific architecture */

	arch_i386             = 1 << 1,  /**< i386 architecture */
	arch_i486             = 1 << 2,  /**< i486 architecture */
	arch_pentium          = 1 << 3,  /**< Pentium architecture */
	arch_ppro             = 1 << 4,  /**< PentiumPro architecture */
	arch_netburst         = 1 << 5,  /**< Netburst architecture */
	arch_nocona           = 1 << 6,  /**< Nocona architecture */
	arch_core2            = 1 << 7,  /**< Core2 architecture */
	arch_atom             = 1 << 8,  /**< Atom architecture */

	arch_k6               = 1 << 9,  /**< k6 architecture */
	arch_geode            = 1 << 10, /**< Geode architecture */
	arch_athlon           = 1 << 11, /**< Athlon architecture */
	arch_k8               = 1 << 12, /**< K8/Opteron architecture */
	arch_k10              = 1 << 13, /**< K10/Barcelona architecture */

	arch_mask             = (1 << 14) - 1, /**< all architecture bits */

	arch_athlon_plus      = arch_athlon | arch_k8 | arch_k10,
	arch_all_amd          = arch_k6 | arch_geode | arch_athlon_plus,

	/* --- single feature bits --- */
	arch_feature_mmx      = 1 << 14, /**< MMX instructions */
	arch_feature_cmov     = 1 << 15, /**< cmov instructions */
	arch_feature_p6_insn  = 1 << 16, /**< PentiumPro instructions */
	arch_feature_sse1     = 1 << 17, /**< SSE1 instructions */
	arch_feature_sse2     = 1 << 18, /**< SSE2 instructions */
	arch_feature_sse3     = 1 << 19, /**< SSE3 instructions */
	arch_feature_ssse3    = 1 << 20, /**< SSSE3 instructions */
	arch_feature_3DNow    = 1 << 21, /**< 3DNow! instructions */
	arch_feature_3DNowE   = 1 << 22, /**< Enhanced 3DNow! instructions */
	arch_feature_64bit    = 1 << 23, /**< x86_64 support */
	arch_feature_sse4_1   = 1 << 24, /**< SSE4.1 instructions */
	arch_feature_sse4_2   = 1 << 25, /**< SSE4.2 instructions */
	arch_feature_sse4a    = 1 << 26, /**< SSE4a instructions */
	arch_feature_popcnt   = 1 << 27, /**< popcnt instruction */

	/* --- feature bundles: a feature plus everything it includes --- */
	arch_mmx_insn     = arch_feature_mmx,                         /**< MMX instructions */
	arch_sse1_insn    = arch_feature_sse1   | arch_mmx_insn,      /**< SSE1 instructions, include MMX */
	arch_sse2_insn    = arch_feature_sse2   | arch_sse1_insn,     /**< SSE2 instructions, include SSE1 */
	arch_sse3_insn    = arch_feature_sse3   | arch_sse2_insn,     /**< SSE3 instructions, include SSE2 */
	arch_ssse3_insn   = arch_feature_ssse3  | arch_sse3_insn,     /**< SSSE3 instructions, include SSE3 */
	arch_sse4_1_insn  = arch_feature_sse4_1 | arch_ssse3_insn,    /**< SSE4.1 instructions, include SSSE3 */
	arch_sse4_2_insn  = arch_feature_sse4_2 | arch_sse4_1_insn,   /**< SSE4.2 instructions, include SSE4.1 */
	arch_sse4a_insn   = arch_feature_sse4a  | arch_ssse3_insn,    /**< SSE4a instructions, include SSSE3 */

	arch_3DNow_insn   = arch_feature_3DNow  | arch_feature_mmx,   /**< 3DNow! instructions, including MMX */
	arch_3DNowE_insn  = arch_feature_3DNowE | arch_3DNow_insn,    /**< Enhanced 3DNow! instructions */
	arch_64bit_insn   = arch_feature_64bit  | arch_sse2_insn,     /**< x86_64 support, includes SSE2 */

	cpu_generic             = arch_generic32,

	/* intel CPUs */
	cpu_i386                = arch_i386,
	cpu_i486                = arch_i486,
	cpu_pentium             = arch_pentium,
	cpu_pentium_mmx         = arch_pentium | arch_mmx_insn,
	cpu_pentium_pro_generic = arch_ppro | arch_feature_p6_insn,
	cpu_pentium_pro         = arch_ppro | arch_feature_cmov | arch_feature_p6_insn,
	cpu_pentium_2           = arch_ppro | arch_feature_cmov | arch_feature_p6_insn | arch_mmx_insn,
	cpu_pentium_3           = arch_ppro | arch_feature_cmov | arch_feature_p6_insn | arch_sse1_insn,
	cpu_pentium_m           = arch_ppro | arch_feature_cmov | arch_feature_p6_insn | arch_sse2_insn,
	cpu_netburst_generic    = arch_netburst | arch_feature_p6_insn,
	cpu_pentium_4           = arch_netburst | arch_feature_cmov | arch_feature_p6_insn | arch_sse2_insn,
	cpu_prescott            = arch_nocona | arch_feature_cmov | arch_feature_p6_insn | arch_sse3_insn,
	cpu_nocona              = arch_nocona | arch_feature_cmov | arch_feature_p6_insn | arch_64bit_insn | arch_sse3_insn,
	cpu_core2_generic       = arch_core2 | arch_feature_p6_insn,
	cpu_core2               = arch_core2 | arch_feature_cmov | arch_feature_p6_insn | arch_64bit_insn | arch_ssse3_insn,
	cpu_penryn              = arch_core2 | arch_feature_cmov | arch_feature_p6_insn | arch_64bit_insn | arch_sse4_1_insn,
	cpu_atom_generic        = arch_atom | arch_feature_p6_insn,
	cpu_atom                = arch_atom | arch_feature_cmov | arch_feature_p6_insn | arch_ssse3_insn,

	/* AMD CPUs */
	cpu_k6_generic     = arch_k6,
	cpu_k6             = arch_k6 | arch_mmx_insn,
	cpu_k6_PLUS        = arch_k6 | arch_3DNow_insn,
	cpu_geode_generic  = arch_geode,
	cpu_geode          = arch_geode  | arch_sse1_insn | arch_3DNowE_insn,
	cpu_athlon_generic = arch_athlon | arch_feature_p6_insn,
	cpu_athlon_old     = arch_athlon | arch_3DNowE_insn | arch_feature_cmov | arch_feature_p6_insn,
	cpu_athlon         = arch_athlon | arch_sse1_insn | arch_3DNowE_insn | arch_feature_cmov | arch_feature_p6_insn,
	cpu_athlon64       = arch_athlon | arch_sse2_insn | arch_3DNowE_insn | arch_feature_cmov | arch_feature_p6_insn | arch_64bit_insn,
	cpu_k8_generic     = arch_k8  | arch_feature_p6_insn,
	cpu_k8             = arch_k8  | arch_3DNowE_insn | arch_feature_cmov | arch_feature_p6_insn | arch_64bit_insn,
	cpu_k8_sse3        = arch_k8  | arch_3DNowE_insn | arch_feature_cmov | arch_feature_p6_insn | arch_64bit_insn | arch_sse3_insn,
	cpu_k10_generic    = arch_k10 | arch_feature_p6_insn,
	cpu_k10            = arch_k10 | arch_3DNowE_insn | arch_feature_cmov | arch_feature_p6_insn | arch_feature_popcnt | arch_64bit_insn | arch_sse4a_insn,

	/* other CPUs */
	cpu_winchip_c6  = arch_i486 | arch_feature_mmx,
	cpu_winchip2    = arch_i486 | arch_feature_mmx | arch_feature_3DNow,
	cpu_c3          = arch_i486 | arch_feature_mmx | arch_feature_3DNow,
	cpu_c3_2        = arch_ppro | arch_feature_cmov | arch_feature_p6_insn | arch_sse1_insn, /* really no 3DNow! */

	/* no architecture bit at all: used by the "native" option entry */
	cpu_autodetect  = 0,
} cpu_arch_features;
Matthias Braun's avatar
Matthias Braun committed
137
ENUM_BITSET(cpu_arch_features)
138

139
140
static int               opt_size             = 0;
static int               emit_machcode        = 0;
141
static int               use_softfloat        = 0;
142
static cpu_arch_features arch                 = cpu_generic;
143
static cpu_arch_features opt_arch             = 0;
144
145
146
static int               fpu_arch             = 0;
static int               opt_cc               = 1;
static int               opt_unsafe_floatconv = 0;
147
148
149

/* instruction set architectures. */
static const lc_opt_enum_int_items_t arch_items[] = {
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
	{ "i386",         cpu_i386 },
	{ "i486",         cpu_i486 },
	{ "i586",         cpu_pentium },
	{ "pentium",      cpu_pentium },
	{ "pentium-mmx",  cpu_pentium_mmx },
	{ "i686",         cpu_pentium_pro },
	{ "pentiumpro",   cpu_pentium_pro },
	{ "pentium2",     cpu_pentium_2 },
	{ "p2",           cpu_pentium_2 },
	{ "pentium3",     cpu_pentium_3 },
	{ "pentium3m",    cpu_pentium_3 },
	{ "p3",           cpu_pentium_3 },
	{ "pentium-m",    cpu_pentium_m },
	{ "pm",           cpu_pentium_m },
	{ "pentium4",     cpu_pentium_4 },
	{ "pentium4m",    cpu_pentium_4 },
	{ "p4",           cpu_pentium_4 },
	{ "prescott",     cpu_prescott },
	{ "nocona",       cpu_nocona },
	{ "merom",        cpu_core2 },
	{ "core2",        cpu_core2 },
171
	{ "penryn",       cpu_penryn },
yb9976's avatar
yb9976 committed
172
	{ "atom",         cpu_atom },
173
174
175
176
177

	{ "k6",           cpu_k6 },
	{ "k6-2",         cpu_k6_PLUS },
	{ "k6-3",         cpu_k6_PLUS },
	{ "geode",        cpu_geode },
178
	{ "athlon",       cpu_athlon_old },
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
	{ "athlon-tbird", cpu_athlon },
	{ "athlon-4",     cpu_athlon },
	{ "athlon-xp",    cpu_athlon },
	{ "athlon-mp",    cpu_athlon },
	{ "athlon64",     cpu_athlon64 },
	{ "k8",           cpu_k8 },
	{ "opteron",      cpu_k8 },
	{ "athlon-fx",    cpu_k8 },
	{ "k8-sse3",      cpu_k8_sse3 },
	{ "opteron-sse3", cpu_k8_sse3 },
	{ "k10",          cpu_k10 },
	{ "barcelona",    cpu_k10 },
	{ "amdfam10",     cpu_k10 },

	{ "winchip-c6",   cpu_winchip_c6, },
	{ "winchip2",     cpu_winchip2 },
	{ "c3",           cpu_c3 },
	{ "c3-2",         cpu_c3_2 },

	{ "generic",      cpu_generic },
	{ "generic32",    cpu_generic },
Andreas Zwinkau's avatar
Andreas Zwinkau committed
200

201
#ifdef NATIVE_X86
Andreas Zwinkau's avatar
Andreas Zwinkau committed
202
	{ "native",       cpu_autodetect },
203
204
#endif

205
	{ NULL,           0 }
206
207
208
209
210
211
212
213
214
215
216
};

static lc_opt_enum_int_var_t arch_var = {
	(int*) &arch, arch_items
};

static lc_opt_enum_int_var_t opt_arch_var = {
	(int*) &opt_arch, arch_items
};

static const lc_opt_enum_int_items_t fp_unit_items[] = {
217
218
	{ "387" ,      IA32_FPU_ARCH_X87 },
	{ "sse",       IA32_FPU_ARCH_SSE2 },
219
220
	{ "softfloat", IA32_FPU_ARCH_SOFTFLOAT },
	{ NULL,        IA32_FPU_ARCH_NONE }
221
222
223
};

static lc_opt_enum_int_var_t fp_unit_var = {
224
	&fpu_arch, fp_unit_items
225
226
227
};

static const lc_opt_table_entry_t ia32_architecture_options[] = {
228
229
	LC_OPT_ENT_BOOL    ("size",             "optimize for size",                                  &opt_size),
	LC_OPT_ENT_ENUM_INT("arch",             "select the instruction architecture",                &arch_var),
230
231
	LC_OPT_ENT_ENUM_INT("tune",             "optimize for instruction architecture",              &opt_arch_var),
	LC_OPT_ENT_ENUM_INT("fpmath",           "select the floating point unit",                     &fp_unit_var),
232
	LC_OPT_ENT_BOOL    ("optcc",            "optimize calling convention",                        &opt_cc),
233
234
	LC_OPT_ENT_BOOL    ("unsafe_floatconv", "do unsafe floating point controlword optimisations", &opt_unsafe_floatconv),
	LC_OPT_ENT_BOOL    ("machcode",         "output machine code instead of assembler",           &emit_machcode),
235
	LC_OPT_ENT_BOOL    ("soft-float",       "equivalent to fpmath=softfloat",                     &use_softfloat),
236
237
238
239
	LC_OPT_LAST
};

/**
 * Per-architecture instruction costs and alignment parameters.
 */
typedef struct insn_const {
	int      add_cost;                 /**< cost of an add instruction */
	int      lea_cost;                 /**< cost of a lea instruction */
	int      const_shf_cost;           /**< cost of a constant shift instruction */
	int      cost_mul_start;           /**< starting cost of a multiply instruction */
	int      cost_mul_bit;             /**< cost of multiply for every set bit */
	unsigned function_alignment;       /**< logarithm for alignment of function labels */
	unsigned label_alignment;          /**< logarithm for alignment of loops labels */
	unsigned label_alignment_max_skip; /**< maximum skip for alignment of loops labels */
} insn_const;

/* costs for optimizing for size */
static const insn_const size_cost = {
	.add_cost                 = 2,
	.lea_cost                 = 3,
	.const_shf_cost           = 3,
	.cost_mul_start           = 4,
	.cost_mul_bit             = 0,
	.function_alignment       = 0,
	.label_alignment          = 0,
	.label_alignment_max_skip = 0,
};

/* costs for the i386 */
static const insn_const i386_cost = {
	.add_cost                 = 1,
	.lea_cost                 = 1,
	.const_shf_cost           = 3,
	.cost_mul_start           = 9,
	.cost_mul_bit             = 1,
	.function_alignment       = 2,
	.label_alignment          = 2,
	.label_alignment_max_skip = 3,
};

/* costs for the i486 */
static const insn_const i486_cost = {
	.add_cost                 = 1,
	.lea_cost                 = 1,
	.const_shf_cost           = 2,
	.cost_mul_start           = 12,
	.cost_mul_bit             = 1,
	.function_alignment       = 4,
	.label_alignment          = 4,
	.label_alignment_max_skip = 15,
};

/* costs for the Pentium */
static const insn_const pentium_cost = {
	.add_cost                 = 1,
	.lea_cost                 = 1,
	.const_shf_cost           = 1,
	.cost_mul_start           = 11,
	.cost_mul_bit             = 0,
	.function_alignment       = 4,
	.label_alignment          = 4,
	.label_alignment_max_skip = 7,
};

/* costs for the Pentium Pro */
static const insn_const pentiumpro_cost = {
	.add_cost                 = 1,
	.lea_cost                 = 1,
	.const_shf_cost           = 1,
	.cost_mul_start           = 4,
	.cost_mul_bit             = 0,
	.function_alignment       = 4,
	.label_alignment          = 4,
	.label_alignment_max_skip = 10,
};

/* costs for the K6 */
static const insn_const k6_cost = {
	.add_cost                 = 1,
	.lea_cost                 = 2,
	.const_shf_cost           = 1,
	.cost_mul_start           = 3,
	.cost_mul_bit             = 0,
	.function_alignment       = 5,
	.label_alignment          = 5,
	.label_alignment_max_skip = 7,
};

/* costs for the Geode */
static const insn_const geode_cost = {
	.add_cost                 = 1,
	.lea_cost                 = 1,
	.const_shf_cost           = 1,
	.cost_mul_start           = 7,
	.cost_mul_bit             = 0,
	.function_alignment       = 0,
	.label_alignment          = 0,
	.label_alignment_max_skip = 0,
};

/* costs for the Athlon */
static const insn_const athlon_cost = {
	.add_cost                 = 1,
	.lea_cost                 = 2,
	.const_shf_cost           = 1,
	.cost_mul_start           = 5,
	.cost_mul_bit             = 0,
	.function_alignment       = 4,
	.label_alignment          = 4,
	.label_alignment_max_skip = 7,
};

/* costs for the Opteron/K8 */
static const insn_const k8_cost = {
	.add_cost                 = 1,
	.lea_cost                 = 2,
	.const_shf_cost           = 1,
	.cost_mul_start           = 3,
	.cost_mul_bit             = 0,
	.function_alignment       = 4,
	.label_alignment          = 4,
	.label_alignment_max_skip = 7,
};

/* costs for the K10 */
static const insn_const k10_cost = {
	.add_cost                 = 1,
	.lea_cost                 = 2,
	.const_shf_cost           = 1,
	.cost_mul_start           = 3,
	.cost_mul_bit             = 0,
	.function_alignment       = 5,
	.label_alignment          = 5,
	.label_alignment_max_skip = 7,
};

/* costs for the Pentium 4 */
static const insn_const netburst_cost = {
	.add_cost                 = 1,
	.lea_cost                 = 3,
	.const_shf_cost           = 4,
	.cost_mul_start           = 15,
	.cost_mul_bit             = 0,
	.function_alignment       = 4,
	.label_alignment          = 4,
	.label_alignment_max_skip = 7,
};

/* costs for the Nocona and Core */
static const insn_const nocona_cost = {
	.add_cost                 = 1,
	.lea_cost                 = 1,
	.const_shf_cost           = 1,
	.cost_mul_start           = 10,
	.cost_mul_bit             = 0,
	.function_alignment       = 4,
	.label_alignment          = 4,
	.label_alignment_max_skip = 7,
};

/* costs for the Core2 */
static const insn_const core2_cost = {
	.add_cost                 = 1,
	.lea_cost                 = 1,
	.const_shf_cost           = 1,
	.cost_mul_start           = 3,
	.cost_mul_bit             = 0,
	.function_alignment       = 4,
	.label_alignment          = 4,
	.label_alignment_max_skip = 10,
};

/* costs for the generic32 */
static const insn_const generic32_cost = {
	.add_cost                 = 1,
	.lea_cost                 = 2,
	.const_shf_cost           = 1,
	.cost_mul_start           = 4,
	.cost_mul_bit             = 0,
	.function_alignment       = 4,
	.label_alignment          = 4,
	.label_alignment_max_skip = 7,
};

/* currently active cost table; starts out as the generic32 costs */
static const insn_const *arch_costs = &generic32_cost;
419
420
421

static void set_arch_costs(void)
{
Michael Beck's avatar
Michael Beck committed
422
423
424
425
	if (opt_size) {
		arch_costs = &size_cost;
		return;
	}
426
	switch (opt_arch & arch_mask) {
427
428
429
430
431
432
433
434
435
436
437
438
	case arch_i386:      arch_costs = &i386_cost;       break;
	case arch_i486:      arch_costs = &i486_cost;       break;
	case arch_pentium:   arch_costs = &pentium_cost;    break;
	case arch_ppro:      arch_costs = &pentiumpro_cost; break;
	case arch_netburst:  arch_costs = &netburst_cost;   break;
	case arch_nocona:    arch_costs = &nocona_cost;     break;
	case arch_core2:     arch_costs = &core2_cost;      break;
	case arch_k6:        arch_costs = &k6_cost;         break;
	case arch_geode:     arch_costs = &geode_cost;      break;
	case arch_athlon:    arch_costs = &athlon_cost;     break;
	case arch_k8:        arch_costs = &k8_cost;         break;
	case arch_k10:       arch_costs = &k10_cost;        break;
439
	default:
440
	case arch_generic32: arch_costs = &generic32_cost;  break;
441
442
443
	}
}

Michael Beck's avatar
Michael Beck committed
444
/* Evaluate the costs of an instruction. */
445
int ia32_evaluate_insn(insn_kind kind, const ir_mode *mode, ir_tarval *tv)
446
{
447
448
449
450
	int cost;

	switch (kind) {
	case MUL:
451
		cost = arch_costs->cost_mul_start;
452
453
454
455
456
457
458
459
460
461
462
		if (arch_costs->cost_mul_bit > 0) {
			char *bitstr = get_tarval_bitpattern(tv);
			int i;

			for (i = 0; bitstr[i] != '\0'; ++i) {
				if (bitstr[i] == '1') {
					cost += arch_costs->cost_mul_bit;
				}
			}
			free(bitstr);
		}
463
464
465
466
		if (get_mode_size_bits(mode) <= 32)
			return cost;
		/* 64bit mul supported, approx 4times of a 32bit mul*/
		return 4 * cost;
467
	case LEA:
468
469
470
		/* lea is only supported for 32 bit */
		if (get_mode_size_bits(mode) <= 32)
			return arch_costs->lea_cost;
471
472
		/* in 64bit mode, the Lea cost are at wort 2 shifts and one add */
		return 2 * arch_costs->add_cost + 2 * (2 * arch_costs->const_shf_cost);
473
474
	case ADD:
	case SUB:
475
476
477
478
		if (get_mode_size_bits(mode) <= 32)
			return arch_costs->add_cost;
		/* 64bit add/sub supported, double the cost */
		return 2 * arch_costs->add_cost;
479
	case SHIFT:
480
481
482
483
		if (get_mode_size_bits(mode) <= 32)
			return arch_costs->const_shf_cost;
		/* 64bit shift supported, double the cost */
		return 2 * arch_costs->const_shf_cost;
484
485
486
	case ZERO:
		return arch_costs->add_cost;
	default:
487
		return 1;
488
489
490
	}
}

491
492
493
/* auto detection code only works if we're on an x86 cpu obviously */
#ifdef NATIVE_X86
/**
 * Decoded result of the CPUID "processor info and feature bits" query,
 * as filled in by autodetect_arch().
 */
typedef struct x86_cpu_info_t {
	unsigned char cpu_stepping;   /**< EAX bits 0..3 */
	unsigned char cpu_model;      /**< EAX bits 4..7 */
	unsigned char cpu_family;     /**< EAX bits 8..11 */
	unsigned char cpu_type;       /**< EAX bits 12..13 */
	unsigned char cpu_ext_model;  /**< EAX bits 16..19 */
	unsigned char cpu_ext_family; /**< EAX bits 20..27 */
	unsigned      edx_features;   /**< raw EDX value */
	unsigned      ecx_features;   /**< raw ECX value */
	unsigned      add_features;   /**< raw EBX value */
} x86_cpu_info_t;
Andreas Zwinkau's avatar
Andreas Zwinkau committed
504

505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
/**
 * Bit masks for the feature flags the CPUID instruction reports in ECX
 * (CPUID_FEAT_ECX_*) and EDX (CPUID_FEAT_EDX_*).
 */
enum {
	CPUID_FEAT_ECX_SSE3      = 1 << 0,
	CPUID_FEAT_ECX_PCLMUL    = 1 << 1,
	CPUID_FEAT_ECX_DTES64    = 1 << 2,
	CPUID_FEAT_ECX_MONITOR   = 1 << 3,
	CPUID_FEAT_ECX_DS_CPL    = 1 << 4,
	CPUID_FEAT_ECX_VMX       = 1 << 5,
	CPUID_FEAT_ECX_SMX       = 1 << 6,
	CPUID_FEAT_ECX_EST       = 1 << 7,
	CPUID_FEAT_ECX_TM2       = 1 << 8,
	CPUID_FEAT_ECX_SSSE3     = 1 << 9,
	CPUID_FEAT_ECX_CID       = 1 << 10,
	CPUID_FEAT_ECX_FMA       = 1 << 12,
	CPUID_FEAT_ECX_CX16      = 1 << 13,
	CPUID_FEAT_ECX_ETPRD     = 1 << 14,
	CPUID_FEAT_ECX_PDCM      = 1 << 15,
	CPUID_FEAT_ECX_DCA       = 1 << 18,
	CPUID_FEAT_ECX_SSE4_1    = 1 << 19,
	CPUID_FEAT_ECX_SSE4_2    = 1 << 20,
	CPUID_FEAT_ECX_x2APIC    = 1 << 21,
	CPUID_FEAT_ECX_MOVBE     = 1 << 22,
	CPUID_FEAT_ECX_POPCNT    = 1 << 23,
	CPUID_FEAT_ECX_AES       = 1 << 25,
	CPUID_FEAT_ECX_XSAVE     = 1 << 26,
	CPUID_FEAT_ECX_OSXSAVE   = 1 << 27,
	CPUID_FEAT_ECX_AVX       = 1 << 28,

	CPUID_FEAT_EDX_FPU       = 1 << 0,
	CPUID_FEAT_EDX_VME       = 1 << 1,
	CPUID_FEAT_EDX_DE        = 1 << 2,
	CPUID_FEAT_EDX_PSE       = 1 << 3,
	CPUID_FEAT_EDX_TSC       = 1 << 4,
	CPUID_FEAT_EDX_MSR       = 1 << 5,
	CPUID_FEAT_EDX_PAE       = 1 << 6,
	CPUID_FEAT_EDX_MCE       = 1 << 7,
	CPUID_FEAT_EDX_CX8       = 1 << 8,
	CPUID_FEAT_EDX_APIC      = 1 << 9,
	CPUID_FEAT_EDX_SEP       = 1 << 11,
	CPUID_FEAT_EDX_MTRR      = 1 << 12,
	CPUID_FEAT_EDX_PGE       = 1 << 13,
	CPUID_FEAT_EDX_MCA       = 1 << 14,
	CPUID_FEAT_EDX_CMOV      = 1 << 15,
	CPUID_FEAT_EDX_PAT       = 1 << 16,
	CPUID_FEAT_EDX_PSE36     = 1 << 17,
	CPUID_FEAT_EDX_PSN       = 1 << 18,
	CPUID_FEAT_EDX_CLF       = 1 << 19,
	CPUID_FEAT_EDX_DTES      = 1 << 21,
	CPUID_FEAT_EDX_ACPI      = 1 << 22,
	CPUID_FEAT_EDX_MMX       = 1 << 23,
	CPUID_FEAT_EDX_FXSR      = 1 << 24,
	CPUID_FEAT_EDX_SSE       = 1 << 25,
	CPUID_FEAT_EDX_SSE2      = 1 << 26,
	CPUID_FEAT_EDX_SS        = 1 << 27,
	CPUID_FEAT_EDX_HTT       = 1 << 28,
	CPUID_FEAT_EDX_TM1       = 1 << 29,
	CPUID_FEAT_EDX_IA64      = 1 << 30,
	/* Shifting a signed 1 into the sign bit is undefined behaviour, so
	 * build the mask unsigned and convert it back to the enumerator's
	 * int type; the resulting bit pattern is the same. */
	CPUID_FEAT_EDX_PBE       = (int)(1U << 31)
};

564
/**
 * Map the CPUID family/model signature of an Intel CPU to an architecture.
 *
 * The extended family is only added when the base family is 0x0F, and the
 * extended model is only merged in for base family 6 or 0x0F, which is how
 * Intel defines the displayed family and model.
 *
 * @param info  the decoded CPUID signature
 * @return the matching cpu_* value, or cpu_generic for an unknown
 *         family/model combination
 */
static cpu_arch_features auto_detect_Intel(x86_cpu_info_t const *info)
{
	cpu_arch_features auto_arch = cpu_generic;

	unsigned family = info->cpu_family;
	unsigned model  = info->cpu_model;

	if (info->cpu_family == 0x0F)
		family += info->cpu_ext_family;
	if (info->cpu_family == 0x06 || info->cpu_family == 0x0F)
		model |= (unsigned)info->cpu_ext_model << 4;

	switch (family) {
	case 4:
		auto_arch = cpu_i486;
		break;
	case 5:
		auto_arch = cpu_pentium;
		break;
	case 6:
		switch (model) {
		case 0x01: /* PentiumPro */
		case 0x03: /* Pentium II Model 3 */
		case 0x05: /* Pentium II Model 5 */
		case 0x06: /* Celeron Model 6 */
		case 0x07: /* Pentium III Model 7 */
		case 0x08: /* Pentium III Model 8 */
		case 0x09: /* Pentium M Model 9 */
		case 0x0A: /* Pentium III Model 0A */
		case 0x0B: /* Pentium III Model 0B */
		case 0x0D: /* Pentium M Model 0D */
		case 0x0E: /* Core Model 0E */
			auto_arch = cpu_pentium_pro_generic;
			break;
		case 0x0F: /* Core2 Model 0F */
		case 0x15: /* Intel EP80579 */
		case 0x16: /* Celeron Model 16 */
		case 0x17: /* Core2 Model 17 */
		/* The next two are family 6 parts; they used to be listed under
		 * family 15 where they could never match. */
		case 0x1A: /* Core i7 */
		case 0x1D: /* Xeon MP */
			auto_arch = cpu_core2_generic;
			break;
		case 0x1C: /* Atom (family 6 as well) */
			auto_arch = cpu_atom_generic;
			break;
		default:
			/* unknown */
			break;
		}
		break;
	case 15:
		switch (model) {
		case 0x00: /* Pentium 4 Model 00 */
		case 0x01: /* Pentium 4 Model 01 */
		case 0x02: /* Pentium 4 Model 02 */
		case 0x03: /* Pentium 4 Model 03 */
		case 0x04: /* Pentium 4 Model 04 */
		case 0x06: /* Pentium 4 Model 06 */
			auto_arch = cpu_netburst_generic;
			break;
		default:
			/* unknown */
			break;
		}
		break;
	default:
		/* unknown */
		break;
	}

	return auto_arch;
}

636
637
638
/**
 * Map the CPUID family/model signature of an AMD CPU to an architecture.
 *
 * The extended family and model fields are only taken into account when
 * the base family is 0x0F.
 *
 * @param info  the decoded CPUID signature
 * @return the matching cpu_* value, or cpu_generic for an unknown family
 */
static cpu_arch_features auto_detect_AMD(x86_cpu_info_t const *info)
{
	unsigned family = info->cpu_family;
	unsigned model  = info->cpu_model;

	if (info->cpu_family == 0x0F) {
		family += info->cpu_ext_family;
		model  |= info->cpu_ext_model << 4;
	}

	switch (family) {
	case 0x04:
		return cpu_i486;

	case 0x05:
		/* K5 Model 0 .. Model 3 */
		if (model <= 0x03)
			return cpu_pentium;
		/* Geode LX */
		if (model == 0x0A)
			return cpu_geode_generic;
		/* K6 Model 6/7, K6-2 Model 8, K6-III Model 9, K6-2+/K6-III+
		 * (0x0D) and any unknown K6 */
		return cpu_k6_generic;

	case 0x06:
		/* every Athlon/Duron model, known or not, is rated as K7 */
		return cpu_athlon_generic;

	case 0x0F:
		return cpu_k8_generic;

	case 0x10:
	case 0x11: /* AMD Family 11h */
	case 0x12: /* AMD Family 12h */
	case 0x14: /* AMD Family 14h */
	case 0x15: /* AMD Family 15h */
		return cpu_k10_generic;

	default:
		/* unknown */
		return cpu_generic;
	}
}

/**
 * The four registers written by CPUID, accessible either by name or as
 * the flat array the MSVC intrinsic expects.
 */
typedef union {
	struct {
		unsigned eax;
		unsigned ebx;
		unsigned ecx;
		unsigned edx;
	} r;
	int bulk[4];
} cpuid_registers;

/**
 * Execute the CPUID instruction.
 *
 * @param regs   receives the EAX, EBX, ECX and EDX results
 * @param level  the value loaded into EAX before the instruction runs
 */
static void x86_cpuid(cpuid_registers *regs, unsigned level)
{
#if defined(__GNUC__)
	unsigned out_eax;
	unsigned out_ebx;
	unsigned out_ecx;
	unsigned out_edx;

#	if defined(__PIC__) && !defined(__amd64) // GCC cannot handle EBX in PIC
	/* save EBX around the instruction and hand its value out through
	 * a scratch register instead */
	__asm (
		"pushl %%ebx\n\t"
		"cpuid\n\t"
		"movl %%ebx, %1\n\t"
		"popl %%ebx"
	: "=a" (out_eax), "=r" (out_ebx), "=c" (out_ecx), "=d" (out_edx)
	: "a" (level)
	);
#	else
	__asm ("cpuid\n\t"
	: "=a" (out_eax), "=b" (out_ebx), "=c" (out_ecx), "=d" (out_edx)
	: "a" (level)
	);
#	endif

	regs->r.eax = out_eax;
	regs->r.ebx = out_ebx;
	regs->r.ecx = out_ecx;
	regs->r.edx = out_edx;
#elif defined(_MSC_VER)
	__cpuid(regs->bulk, level);
#else
#	error CPUID is missing
#endif
}

Matthias Braun's avatar
Matthias Braun committed
746
/**
 * Check whether the cpuid instruction is available.
 *
 * On 32 bit x86 this tries to flip bit 21 of EFLAGS and reports whether
 * the change stuck.  On the other GCC and MSVC targets no probe is run
 * and the result is always true.
 *
 * @return true if cpuid can be used
 */
static bool x86_toogle_cpuid(void)
{
	unsigned eflags_before = 0;
	unsigned eflags_after  = 0;

#if defined(__GNUC__) && defined(__i386__)
	/* If bit 21 of the EFLAGS register can be changed, the cpuid instruction is available */
	__asm__(
		"pushf\n\t"
		"popl %0\n\t"
		"movl %0, %1\n\t"
		"xorl $0x00200000, %1\n\t"
		"pushl %1\n\t"
		"popf\n\t"
		"pushf\n\t"
		"popl %1"
		: "=r" (eflags_before), "=r" (eflags_after) :: "cc"
		);
#elif defined(__GNUC__)
	/* no probe here: pretend the bit was toggled */
	eflags_after = 0x00200000;
#elif defined(_MSC_VER) && defined(_M_IX86)
	__asm {
		pushfd
		pop eax
		mov eflags_before, eax
		xor eax, 0x00200000
		push eax
		popfd
		pushfd
		pop eax
		mov eflags_after, eax
	}
#elif defined(_MSC_VER)
	/* no probe here: pretend the bit was toggled */
	eflags_after = 0x00200000;
#endif

	return ((eflags_before ^ eflags_after) & 0x00200000) != 0;
}

static void autodetect_arch(void)
{
790
	cpu_arch_features auto_arch = cpu_generic;
791
792
793
794
795

	/* We use the cpuid instruction to detect the CPU features */
	if (x86_toogle_cpuid()) {
		cpuid_registers   regs;
		char              vendorid[13];
796
		x86_cpu_info_t    cpu_info;
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821

		/* get vendor ID */
		x86_cpuid(&regs, 0);
		memcpy(&vendorid[0], &regs.r.ebx, 4);
		memcpy(&vendorid[4], &regs.r.edx, 4);
		memcpy(&vendorid[8], &regs.r.ecx, 4);
		vendorid[12] = '\0';

		/* get processor info and feature bits */
		x86_cpuid(&regs, 1);

		cpu_info.cpu_stepping   = (regs.r.eax >>  0) & 0x0F;
		cpu_info.cpu_model      = (regs.r.eax >>  4) & 0x0F;
		cpu_info.cpu_family     = (regs.r.eax >>  8) & 0x0F;
		cpu_info.cpu_type       = (regs.r.eax >> 12) & 0x03;
		cpu_info.cpu_ext_model  = (regs.r.eax >> 16) & 0x0F;
		cpu_info.cpu_ext_family = (regs.r.eax >> 20) & 0xFF;
		cpu_info.edx_features   = regs.r.edx;
		cpu_info.ecx_features   = regs.r.ecx;
		cpu_info.add_features   = regs.r.ebx;

		if        (0 == strcmp(vendorid, "GenuineIntel")) {
			auto_arch = auto_detect_Intel(&cpu_info);
		} else if (0 == strcmp(vendorid, "AuthenticAMD")) {
			auto_arch = auto_detect_AMD(&cpu_info);
822
		} else if (0 == strcmp(vendorid, "Geode by NSC")) {
yb9976's avatar
yb9976 committed
823
			auto_arch = cpu_geode_generic;
824
		}
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844

		if (cpu_info.edx_features & CPUID_FEAT_EDX_CMOV)
			auto_arch |= arch_feature_cmov;
		if (cpu_info.edx_features & CPUID_FEAT_EDX_MMX)
			auto_arch |= arch_feature_mmx;
		if (cpu_info.edx_features & CPUID_FEAT_EDX_SSE)
			auto_arch |= arch_feature_sse1;
		if (cpu_info.edx_features & CPUID_FEAT_EDX_SSE2)
			auto_arch |= arch_feature_sse2;

		if (cpu_info.ecx_features & CPUID_FEAT_ECX_SSE3)
			auto_arch |= arch_feature_sse3;
		if (cpu_info.ecx_features & CPUID_FEAT_ECX_SSSE3)
			auto_arch |= arch_feature_ssse3;
		if (cpu_info.ecx_features & CPUID_FEAT_ECX_SSE4_1)
			auto_arch |= arch_feature_sse4_1;
		if (cpu_info.ecx_features & CPUID_FEAT_ECX_SSE4_2)
			auto_arch |= arch_feature_sse4_2;
		if (cpu_info.ecx_features & CPUID_FEAT_ECX_POPCNT)
			auto_arch |= arch_feature_popcnt;
845
846
847
	}

	arch     = auto_arch;
Andreas Zwinkau's avatar
Andreas Zwinkau committed
848
849
	opt_arch = auto_arch;
}
850
#endif  /* NATIVE_X86 */
Andreas Zwinkau's avatar
Andreas Zwinkau committed
851

852
853
854
855
856
/**
 * Tests whether two architecture/feature sets overlap.
 *
 * @param features  the set to examine
 * @param flags     the bits to look for
 * @return true if at least one bit of @p flags is set in @p features
 */
static bool flags(cpu_arch_features features, cpu_arch_features flags)
{
	if ((features & flags) == 0)
		return false;
	return true;
}

857
858
void ia32_setup_cg_config(void)
{
859
860
	if (use_softfloat)
		fpu_arch = IA32_FPU_ARCH_SOFTFLOAT;
861

862
863
#ifdef NATIVE_X86
	if (arch == cpu_autodetect)
864
		autodetect_arch();
865
#endif
866
867
	if (opt_arch == 0)
		opt_arch = arch;
Andreas Zwinkau's avatar
Andreas Zwinkau committed
868

869
870
871
872
	set_arch_costs();

	ia32_code_gen_config_t *const c = &ia32_cg_config;
	memset(c, 0, sizeof(*c));
873
	c->optimize_size        = opt_size != 0;
Christoph Mallon's avatar
Typos.    
Christoph Mallon committed
874
	/* on newer intel cpus mov, pop is often faster than leave although it has a
875
	 * longer opcode */
876
	c->use_leave            = flags(opt_arch, arch_i386 | arch_all_amd | arch_core2) || opt_size;
877
	/* P4s don't like inc/decs because they only partially write the flags
Christoph Mallon's avatar
Typos.    
Christoph Mallon committed
878
	 * register which produces false dependencies */
879
880
881
882
	c->use_incdec           = !flags(opt_arch, arch_netburst | arch_nocona | arch_core2 | arch_geode) || opt_size;
	c->use_softfloat        = (fpu_arch & IA32_FPU_ARCH_SOFTFLOAT) != 0;
	c->use_sse2             = (fpu_arch & IA32_FPU_ARCH_SSE2) != 0 && flags(arch, arch_feature_sse2);
	c->use_ffreep           = flags(opt_arch, arch_athlon_plus);
883
	c->use_femms            = flags(opt_arch, arch_athlon_plus) && flags(arch, arch_feature_3DNow);
884
885
886
887
888
889
890
891
892
893
	c->use_fucomi           = flags(arch, arch_feature_p6_insn);
	c->use_cmov             = flags(arch, arch_feature_cmov);
	c->use_modeD_moves      = flags(opt_arch, arch_generic32 | arch_athlon_plus | arch_netburst | arch_nocona | arch_core2 | arch_ppro | arch_geode);
	c->use_add_esp_4        = flags(opt_arch, arch_generic32 | arch_athlon_plus | arch_netburst | arch_nocona | arch_core2 |             arch_geode)                         && !opt_size;
	c->use_add_esp_8        = flags(opt_arch, arch_generic32 | arch_athlon_plus | arch_netburst | arch_nocona | arch_core2 | arch_ppro | arch_geode | arch_i386 | arch_i486) && !opt_size;
	c->use_sub_esp_4        = flags(opt_arch, arch_generic32 | arch_athlon_plus | arch_netburst | arch_nocona | arch_core2 | arch_ppro)                                      && !opt_size;
	c->use_sub_esp_8        = flags(opt_arch, arch_generic32 | arch_athlon_plus | arch_netburst | arch_nocona | arch_core2 | arch_ppro |              arch_i386 | arch_i486) && !opt_size;
	c->use_imul_mem_imm32   = !flags(opt_arch, arch_k8 | arch_k10) || opt_size;
	c->use_pxor             = flags(opt_arch, arch_netburst);
	c->use_mov_0            = flags(opt_arch, arch_k6) && !opt_size;
894
	c->use_short_sex_eax    = !flags(opt_arch, arch_k6) || opt_size;
895
896
897
898
899
900
	c->use_pad_return       = flags(opt_arch, arch_athlon_plus) && !opt_size;
	c->use_bt               = flags(opt_arch, arch_core2 | arch_athlon_plus) || opt_size;
	c->use_fisttp           = flags(opt_arch & arch, arch_feature_sse3);
	c->use_sse_prefetch     = flags(arch, (arch_feature_3DNowE | arch_feature_sse1));
	c->use_3dnow_prefetch   = flags(arch, arch_feature_3DNow);
	c->use_popcnt           = flags(arch, arch_feature_popcnt);
901
	c->use_bswap            = (arch & arch_mask) >= arch_i486;
902
	c->use_cmpxchg          = (arch & arch_mask) != arch_i386;
903
904
	c->optimize_cc          = opt_cc;
	c->use_unsafe_floatconv = opt_unsafe_floatconv;
905
	c->emit_machcode        = emit_machcode;
906
907
908
909
910
911

	c->function_alignment       = arch_costs->function_alignment;
	c->label_alignment          = arch_costs->label_alignment;
	c->label_alignment_max_skip = arch_costs->label_alignment_max_skip;

	c->label_alignment_factor =
912
		flags(opt_arch, arch_i386 | arch_i486) || opt_size ? 0 :
913
914
		opt_arch & arch_all_amd                            ? 3 :
		2;
915
916
917
918
}

void ia32_init_architecture(void)
{
Michael Beck's avatar
Michael Beck committed
919
920
	lc_opt_entry_t *be_grp, *ia32_grp;

921
922
	memset(&ia32_cg_config, 0, sizeof(ia32_cg_config));

Michael Beck's avatar
Michael Beck committed
923
924
	be_grp   = lc_opt_get_grp(firm_opt_get_root(), "be");
	ia32_grp = lc_opt_get_grp(be_grp, "ia32");
925
926
927

	lc_opt_add_table(ia32_grp, ia32_architecture_options);
}