Trust, But Verify
Pretty much all engineering is based on abstractions. We write software, but it's really an abstraction. The compiler converts it to assembly language. But that's an abstraction too. The assembler converts that to machine code. Of course, machine code is an abstraction of how to set up a very sophisticated state machine (the CPU). The CPU designer drew a schematic or used Verilog to design the CPU to abstract himself from the raw logic gates. The logic gates are an abstraction for the transistors. You get the idea.
Our projects would get nowhere if we started with electron flow through silicon and worked up from there. So hooray for abstractions!
The problem is, one of the hardest problems to find is when your abstraction is broken. I have seen more than one time when a compiler or other tool took something that should have worked and made it into something that didn't work. That's tough to find because you have to go down at least one level to see where it is broken.
The same is true when you try to reason about abstractions. Consider one of my favorite abstractions: C. What's faster? x++ or x+=1? Is it faster if x is unsigned instead of an integer? Who knows? You should know, but the answer will vary based on your underlying architecture, the compiler you use, and the optimization level you select. Granted, you can just profile the gross behavior of little programs to intuit the behavior of constructs like that, but if you break the abstraction, it's often easier to see what the compiler is doing for you.
If you are using the standard gcc tools try this:
gcc -c -g -Wa,-a,-ahl=somefile.s [[any other options you need]] somefile.c
And yes, you need -g. Trust me.
So let's start with this:
#include <stdio.h>
struct test_it
{
int id;
int a;
char z;
int b;
int c;
} test[100];
void foo(struct test_it *p)
{
// just do something so the optimizer doesn't
// decide there's no reason to do anything
printf("%d\n",p->id);
}
// set up the array
void init(void)
{
int i;
for (i=0;i
// call foo with an array index
for (i=0;i<sizeof(test)/sizeof(test[0]);i++)
{
foo(&test[i]);
}
// just add an offset
for (tp=test;tp->id;tp++)
{
foo(tp);
}
}
With no specific optimization options you get (excerpt):
37:t.c **** // call foo with an array index
38:t.c **** for (i=0;i
118 00a1 EB26 jmp .L6
119 .L7:
39:t.c **** {
40:t.c **** foo(&test[i]);
120 .loc 1 40 0 discriminator 2
121 00a3 8B45FC movl -4(%rbp), %eax
122 00a6 4863D0 movslq %eax, %rdx
123 00a9 4889D0 movq %rdx, %rax
124 00ac 48C1E002 salq $2, %rax
125 00b0 4801D0 addq %rdx, %rax
126 00b3 48C1E002 salq $2, %rax
127 00b7 48050000 addq $test, %rax
127 0000
128 00bd 4889C7 movq %rax, %rdi
129 00c0 E8000000 call foo
129 00
38:t.c **** for (i=0;i<sizeof(test)/sizeof(test[0]);i++)
116 .loc 1 38 0
117 009a C745FC00 movl $0, -4(%rbp)
117 000000
118 00a1 EB26 jmp .L6
119 .L7:
39:t.c **** {
40:t.c **** foo(&test[i]);
120 .loc 1 40 0 discriminator 2
121 00a3 8B45FC movl -4(%rbp), %eax
122 00a6 4863D0 movslq %eax, %rdx
123 00a9 4889D0 movq %rdx, %rax
124 00ac 48C1E002 salq $2, %rax
125 00b0 4801D0 addq %rdx, %rax
126 00b3 48C1E002 salq $2, %rax
127 00b7 48050000 addq $test, %rax
127 0000
128 00bd 4889C7 movq %rax, %rdi
129 00c0 E8000000 call foo
129 00
38:t.c **** for (i=0;i<sizeof(test)/sizeof(test[0]);i++)
130 .loc 1 38 0 discriminator 2
131 00c5 8345FC01 addl $1, -4(%rbp)
132 .L6:
38:t.c **** for (i=0;i<sizeof(test)/sizeof(test[0]);i++)
133 .loc 1 38 0 is_stmt 0 discriminator 1
134 00c9 8B45FC movl -4(%rbp), %eax
135 00cc 83F863 cmpl $99, %eax
136 00cf 76D2 jbe .L7
41:t.c **** }
42:t.c ****
43:t.c **** // just add an offset
44:t.c **** for (tp=test;tp->id;tp++)
137 .loc 1 44 0 is_stmt 1
138 00d1 48C745F0 movq $test, -16(%rbp)
138 00000000
139 00d9 EB11 jmp .L8
140 .L9:
45:t.c **** {
46:t.c **** foo(tp);
141 .loc 1 46 0 discriminator 2
142 00db 488B45F0 movq -16(%rbp), %rax
143 00df 4889C7 movq %rax, %rdi
144 00e2 E8000000 call foo
144 00
44:t.c **** for (tp=test;tp->id;tp++)
145 .loc 1 44 0 discriminator 2
146 00e7 488345F0 addq $20, -16(%rbp)
146 14
147 .L8:
44:t.c **** for (tp=test;tp->id;tp++)
148 .loc 1 44 0 is_stmt 0 discriminator 1
149 00ec 488B45F0 movq -16(%rbp), %rax
150 00f0 8B00 movl (%rax), %eax
151 00f2 85C0 testl %eax, %eax
152 00f4 75E5 jne .L9
47:t.c **** }
That's with default optimization on an x86 CPU. You can see there is a definite difference. If you took the Intel data sheets you could do clock counts for both, but for this example, I'll leave that to your imagination. But you can see the first method has plenty of multiplies (well, shifts and adds, anyway). Whereas the second loop is only a small piece of code.
Of course, the compiler can optimize harder. What happens with -O3?
105 .loc 2 105 0 discriminator 2
106 0080 8B930000 movl test(%rbx), %edx
106 0000
107 0086 31C0 xorl %eax, %eax
108 0088 BE000000 movl $.LC0, %esi
108 00
109 008d BF010000 movl $1, %edi
109 00
110 0092 4883C314 addq $20, %rbx
111 0096 E8000000 call __printf_chk
111 00
37:t.c **** // call foo with an array index
38:t.c **** for (i=0;i<sizeof(test)/sizeof(test[0]);i++)
116 .loc 1 38 0 discriminator 2
117 009b 4881FBD0 cmpq $2000, %rbx
117 070000
118 00a2 75DC jne .L8
119 .LVL8:
39:t.c **** {
40:t.c **** foo(&test[i]);
41:t.c **** }
42:t.c ****
43:t.c **** // just add an offset
44:t.c **** for (tp=test;tp->id;tp++)
120 .loc 1 44 0 discriminator 1
121 00a4 8B150000 movl test(%rip), %edx
121 0000
122 00aa 85D2 testl %edx, %edx
123 00ac 7425 je .L11
124 .loc 1 44 0 is_stmt 0
125 00ae BB000000 movl $test, %ebx
125 00
126 .LVL9:
127 .p2align 4,,10
128 00b3 0F1F4400 .p2align 3
128 00
129 .L10:
130 .LBB28:
131 .LBB29:
132 .LBB30:
133 .loc 2 105 0 is_stmt 1
134 00b8 31C0 xorl %eax, %eax
135 .LBE30:
136 .LBE29:
137 .LBE28:
138 .loc 1 44 0
139 00ba 4883C314 addq $20, %rbx
140 .LVL10:
141 .LBB33:
142 .LBB32:
143 .LBB31:
144 .loc 2 105 0
145 00be BE000000 movl $.LC0, %esi
145 00
146 00c3 BF010000 movl $1, %edi
146 00
147 00c8 E8000000 call __printf_chk
147 00
148 .LVL11:
149 .LBE31:
150 .LBE32:
151 .LBE33:
152 .loc 1 44 0
153 00cd 8B13 movl (%rbx), %edx
154 00cf 85D2 testl %edx, %edx
155 00d1 75E5 jne .L10
156 .LVL12:
157 .L11:
45:t.c **** {
46:t.c **** foo(tp);
47:t.c **** }
Note how the loops were significantly changed to turn the loops into decrements of ebx and all the math was abstracted out more or less in the first loop.
By using these options you can see exactly what the compiler does with your code and how it optimizes it. This is good for timing, answering "what if" questions, and on the rare occasion when you have a bug in the optimizer. Trust but verify.

