前言
优化你的代码, 有时要在深度思考问题, 编译器的角度看待你的程序的效率
正文
一, 优化你的代码
jle 汇编语言中的条件转移指令。小于或等于,或者不大于则转移。
typedef long data_t;
// create abstract data type for vector
typedef struct {
long len;
data_t *data;
} vec_rec, *vec_ptr;
// create vector of specified length
vec_ptr new_vec(long len)
{
// Allocate header structure
vec_ptr result = (vec_ptr)malloc(sizeof(vec_rec));
data_t * data = NULL;
if (!result)
{
return NULL; /// couldn't allocate storage
}
result->len = len;
// Allocate array
if (len > 0)
{
data = (data_t *)calloc(len, sizeof(data_t));
if (!data)
{
free((void *)result);
return NULL; // couldn't allocate storage
}
}
// Data will either be NULL or allocated array
result->data = data;
return result;
}
/// retrieve vector element and store at dest
// return 0 (out of bounds ) or 1 (sucessful)
int get_vec_element(vec_ptr v, long index, data_t *dest)
{
if (index < 0 || index >= v->len)
{
return 0;
}
*dest = v->data[index];
return 1;
}
// return length or vector
long vec_length(vec_ptr v)
{
return v->len;
}
两种方式比较 * 和 + 比较
// first
#define IDENT 0
#define OP +
// second
#define IDENT 1
#define OP *
案例一:
// implementation with maximum use of data abstraction
void combine1(vec_ptr v, data_t *dest)
{
long i;
*dest = IDENT;
for (i = 0; i < vec_length(v); ++i)
{
data_t val;
get_vec_element(v, i, &val);
*dest = *dest OP val;
}
}
案例二:消除循环的低效率
术语:代码转移
// move call to vec_length out of loop
void combine2(vec_ptr v, data_t *dest)
{
long i;
long length = vec_length(v); // 术语:代码转移
*dest = IDENT;
for (i = 0; i < length; ++i)
{
data_t val;
get_vec_element(v, i, &val);
*dest = *dest OP val;
}
}
案例三: 减少过程调用
data_t *get_vec_start(vec_ptr v)
{
return v->data;
}
// direct access to vector data
void combine3(vec_ptr v, data_t *dest)
{
long i;
long length = vec_length(v);
data_t *data = get_vec_start(v);
*dest = IDENT;
for (i = 0; i < length; ++i)
{
*dest = *dest OP data[i];
}
}
案例 四: 消除不必要的内存引用
void combine4(vec_ptr v, data_t *dest)
{
long i;
long length = vec_length(v);
data_t *data = get_vec_start(v);
data_t acc = IDENT;
for (i = 0; i < length; ++i)
{
acc = acc OP data[i];
}
*dest = acc;
}
gcc -O1 -S cpu.c
combine4:
.LFB23:
.cfi_startproc
movq (%rdi), %rdx
testq %rdx, %rdx
jle .L8
movl $0, %eax
.L9:
addq $1, %rax
cmpq %rdx, %rax
jne .L9
.L8:
movq $0, (%rsi)
ret
.cfi_endproc
.LFE23:
.size combine4, .-combine4
.globl combine5
.type combine5, @function
二, cpu
现代处理器的结构
案例 五: 循环展开
合并代码使用”2 * 1循环展开”的版本 – [k == 2 , 这种变换为”k * 1 循环展开”]
// 2 * 1 loop unrolling
void combine5(vec_ptr v, data_t *dest)
{
long i;
long length = vec_length(v);
long limit = length - 1;
data_t *data = get_vec_start(v);
data_t acc = IDENT;
// Combine 2 element at a time
for (i = 0; i < limit; i += 2)
{ //这种变换能减少循环开销的影响 [在v字节数很小时用, 多了开销寄存器要的多了]
acc = (acc OP data[i]) OP data[ i + 1];
}
// finish any remaining elements
for (; i < length; ++i)
{
acc = acc OP data[i];
}
*dest = acc;
}
gcc -O0 -S cpu.c
combine5:
.LFB6:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $64, %rsp
movq %rdi, -56(%rbp)
movq %rsi, -64(%rbp)
movq -56(%rbp), %rax
movq %rax, %rdi
call vec_length
movq %rax, -24(%rbp)
movq -24(%rbp), %rax
subq $1, %rax
movq %rax, -32(%rbp)
movq -56(%rbp), %rax
movq %rax, %rdi
call get_vec_start
movq %rax, -40(%rbp)
movq $0, -16(%rbp)
movq $0, -8(%rbp)
jmp .L13
// inner loop of combine5. data_t = double , OP = *
// i in %rdx, data %rax, limit in %rbp, --acc in %xmmO
.L14:
movq -8(%rbp), %rax
leaq 0(,%rax,8), %rdx
movq -40(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rax
imulq -16(%rbp), %rax // imulq(带符号整数四字相乘)
movq %rax, %rdx
movq -8(%rbp), %rax
addq $1, %rax
leaq 0(,%rax,8), %rcx
movq -40(%rbp), %rax
addq %rcx, %rax
movq (%rax), %rax
imulq %rdx, %rax
movq %rax, -16(%rbp)
addq $2, -8(%rbp)
.L13:
movq -8(%rbp), %rax
cmpq -32(%rbp), %rax // if > --> loop
jl .L14 // loop
jmp .L15
.L16:
movq -8(%rbp), %rax
leaq 0(,%rax,8), %rdx
movq -40(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rax
movq -16(%rbp), %rdx
imulq %rdx, %rax
movq %rax, -16(%rbp)
addq $1, -8(%rbp)
.L15:
movq -8(%rbp), %rax
cmpq -24(%rbp), %rax
jl .L16
movq -64(%rbp), %rax
movq -16(%rbp), %rdx
movq %rdx, (%rax)
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE6:
.size combine5, .-combine5
.globl main
.type main, @function
案例六: 提高并行性 “2 * 2 循环展开”
使用两路并行在acc0中
// 2 * 2 loop unrolling
void combine6(vec_ptr v, data_t *dest)
{
long i;
long length = vec_length(v);
long limit = length - 1;
data_t * data = get_vec_start(v);
data_t acc0 = IDENT;
data_t acc1 = IDENT;
// combine 2 elements at a time
for (i = 0; i < limit; i += 2)
{
acc0 = acc0 OP data[i];
acc1 = acc1 OP data[i + 1];
}
// finish any remaining elements
for (; i < length; ++i)
{
acc0 = acc0 OP data[i];
}
*dest = acc0 OP acc1;
}
编译指令:
gcc -o1 -S cpu.c
combine6:
.LFB25:
.cfi_startproc
movq (%rdi), %rdx
leaq -1(%rdx), %rcx
testq %rcx, %rcx
jle .L21
movl $0, %eax
.L18:
addq $2, %rax
cmpq %rax, %rcx
jg .L18
jmp .L17
.L21:
movl $0, %eax
.L17:
cmpq %rdx, %rax
jge .L19
.L20:
addq $1, %rax
cmpq %rdx, %rax
jne .L20
.L19:
movq $0, (%rsi)
ret
.cfi_endproc
.LFE25:
.size combine6, .-combine6
.globl combine7
.type combine7, @function
案例七: 重新结合变换
术语: 重新结合变换 [“2 * 1a”]
acc = acc OP (data[i] OP data[i + 1]);
// 2 * 1a loop unrolling
void combine7(vec_ptr v, data_t *dest)
{
long i;
long length = vec_length(v);
long limit = length - 1;
data_t *data = get_vec_start(v);
data_t acc = IDENT;
// combine 2 elements at a tims
for (i = 0; i < limit; i += 2)
{
acc = acc OP (data[i] OP data[i + 1]);
}
// finish any remaining elements
for (; i < length; ++i)
{
acc = acc OP data[i];
}
*dest = acc;
}
combine7:
.LFB26:
.cfi_startproc
movq (%rdi), %rdx
leaq -1(%rdx), %rcx
testq %rcx, %rcx
jle .L27
movl $0, %eax
.L24:
addq $2, %rax
cmpq %rax, %rcx
jg .L24
jmp .L23
.L27:
movl $0, %eax
.L23:
cmpq %rdx, %rax
jge .L25
.L26:
addq $1, %rax
cmpq %rdx, %rax
jne .L26
.L25:
movq $0, (%rsi)
ret
.cfi_endproc
.LFE26:
.size combine7, .-combine7
.globl main
.type main, @function
结语
减少过程调用