3,优化程序的性能

高质量的代码是有思考深度的优化

Posted by chensong on 2018-12-26 01::18::00

前言

优化你的代码, 有时要在深度思考问题, 编译器的角度看待你的程序的效率

正文

一, 优化你的代码

jle 汇编语言中的条件转移指令。小于或等于,或者不大于则转移。


typedef long data_t;

// create abstract data type for vector
typedef struct {
	long len;
	data_t *data;
} vec_rec, *vec_ptr;

// create vector of specified length

vec_ptr new_vec(long len)
{
	// Allocate header structure
	vec_ptr result = (vec_ptr)malloc(sizeof(vec_rec));
	data_t * data = NULL;
	if (!result)
	{
		return NULL; /// couldn't  allocate storage
	}
	result->len = len;
	// Allocate array
	if (len > 0)
	{
		data = (data_t *)calloc(len, sizeof(data_t));
		if (!data)
		{
			free((void *)result);
			return NULL; // couldn't allocate storage
		}
	}
	// Data will either be NULL or allocated array
	result->data = data;
	return result;
}

/// retrieve vector element and store at dest
// return 0 (out of bounds ) or 1 (sucessful)
int get_vec_element(vec_ptr v, long index, data_t *dest)
{
	if (index < 0 || index >= v->len)
	{
		return 0;
	}
	*dest = v->data[index];
	return 1;
}

// return length or vector 
long vec_length(vec_ptr v)
{
	return v->len;
}


两种方式比较 * 和 + 比较

// first
#define IDENT 0
#define OP +

// second
#define IDENT 1
#define OP *

案例一:

// implementation with maximum use of data abstraction
void combine1(vec_ptr v, data_t *dest)
{
	long i;
	*dest = IDENT;
	for (i = 0; i < vec_length(v); ++i)
	{
		data_t val;
		get_vec_element(v, i, &val);
		*dest = *dest OP val;
	}
}

案例二:消除循环的低效率

术语:代码转移

// move call to vec_length out of loop
void combine2(vec_ptr v, data_t *dest)
{
	long i;
	long length = vec_length(v); // 术语:代码转移
	
	*dest = IDENT;
	for (i = 0; i < length; ++i)
	{
		data_t val;
		get_vec_element(v, i, &val);
		*dest = *dest OP val;
	}
}

案例三: 减少过程调用

data_t *get_vec_start(vec_ptr v)
{
	return v->data;
}

// direct access to vector data
void combine3(vec_ptr v, data_t *dest)
{
	long i;
	long length = vec_length(v);
	data_t *data = get_vec_start(v);
	
	*dest = IDENT;
	for (i = 0; i < length; ++i)
	{
		*dest = *dest OP data[i];
	}
}

案例 四: 消除不必要的内存引用


void combine4(vec_ptr v, data_t *dest)
{
	long i;
	long length = vec_length(v);
	data_t *data = get_vec_start(v);
	data_t acc = IDENT;
	
	for (i = 0; i < length; ++i)
	{
		acc = acc OP data[i];
	}
	*dest = acc;
}

gcc -O1 -S cpu.c

combine4:
.LFB23:
        .cfi_startproc
        movq    (%rdi), %rdx
        testq   %rdx, %rdx
        jle     .L8
        movl    $0, %eax
.L9:
        addq    $1, %rax
        cmpq    %rdx, %rax
        jne     .L9
.L8:
        movq    $0, (%rsi)
        ret
        .cfi_endproc
.LFE23:
        .size   combine4, .-combine4
        .globl  combine5
        .type   combine5, @function

二, cpu

现代处理器的结构

案例 五: 循环展开

合并代码使用”2 * 1循环展开”的版本 – [k == 2 , 这种变换为”k * 1 循环展开”]

// 2 * 1 loop unrolling
void combine5(vec_ptr v, data_t *dest)
{
	long i;
	long length = vec_length(v);
	long limit = length - 1;
	data_t *data = get_vec_start(v);
	data_t acc = IDENT;
	
	// Combine 2 element at a time
	for (i = 0; i < limit; i += 2)
	{   //这种变换能减少循环开销的影响 [在v字节数很小时用, 多了开销寄存器要的多了]
		acc = (acc OP data[i]) OP data[ i + 1];
	}
	
	// finish any remaining elements
	for (; i < length; ++i)
	{
		acc = acc OP data[i];
	}
	*dest = acc;
}

gcc -O0 -S cpu.c

combine5:
.LFB6:
        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        subq    $64, %rsp
        movq    %rdi, -56(%rbp)
        movq    %rsi, -64(%rbp)
        movq    -56(%rbp), %rax
        movq    %rax, %rdi
        call    vec_length
        movq    %rax, -24(%rbp)
        movq    -24(%rbp), %rax
        subq    $1, %rax
        movq    %rax, -32(%rbp)
        movq    -56(%rbp), %rax
        movq    %rax, %rdi
        call    get_vec_start
        movq    %rax, -40(%rbp)
        movq    $0, -16(%rbp)
        movq    $0, -8(%rbp)
        jmp     .L13
		// inner loop of combine5. data_t = double , OP = * 
		// i in %rdx,  data %rax, limit in %rbp, --acc in %xmmO
.L14:
        movq    -8(%rbp), %rax
        leaq    0(,%rax,8), %rdx
        movq    -40(%rbp), %rax
        addq    %rdx, %rax
        movq    (%rax), %rax
        imulq   -16(%rbp), %rax  // imulq(带符号整数四字相乘) 
        movq    %rax, %rdx
        movq    -8(%rbp), %rax
        addq    $1, %rax
        leaq    0(,%rax,8), %rcx
        movq    -40(%rbp), %rax
        addq    %rcx, %rax
        movq    (%rax), %rax
        imulq   %rdx, %rax
        movq    %rax, -16(%rbp)
        addq    $2, -8(%rbp)
.L13:
        movq    -8(%rbp), %rax
        cmpq    -32(%rbp), %rax  // if > --> loop
        jl      .L14 // loop 
        jmp     .L15
.L16:
        movq    -8(%rbp), %rax
        leaq    0(,%rax,8), %rdx
        movq    -40(%rbp), %rax
        addq    %rdx, %rax
        movq    (%rax), %rax
        movq    -16(%rbp), %rdx
        imulq   %rdx, %rax
        movq    %rax, -16(%rbp)
        addq    $1, -8(%rbp)
.L15:
        movq    -8(%rbp), %rax
        cmpq    -24(%rbp), %rax
        jl      .L16
        movq    -64(%rbp), %rax
        movq    -16(%rbp), %rdx
        movq    %rdx, (%rax)
        leave
        .cfi_def_cfa 7, 8
        ret
        .cfi_endproc
.LFE6:
        .size   combine5, .-combine5
        .globl  main
        .type   main, @function

案例六: 提高并行性 “2 * 2 循环展开”

使用两路并行在acc0中

// 2 * 2 loop unrolling
void combine6(vec_ptr v, data_t *dest)
{
	long i;
	long length = vec_length(v);
	long limit = length - 1;
	data_t * data = get_vec_start(v);
	data_t acc0 = IDENT;
	data_t acc1 = IDENT;
	
	// combine 2 elements at a time
	for (i = 0; i < limit; i += 2)
	{
		acc0 = acc0 OP data[i];
		acc1 = acc1 OP data[i + 1];
	}
	
	// finish any remaining elements
	for (; i < length; ++i)
	{
		acc0 = acc0 OP data[i];
	}
	*dest = acc0 OP acc1;
}

编译指令:

gcc -o1 -S cpu.c

combine6:
.LFB25:
        .cfi_startproc
        movq    (%rdi), %rdx
        leaq    -1(%rdx), %rcx
        testq   %rcx, %rcx
        jle     .L21
        movl    $0, %eax
.L18:
        addq    $2, %rax
        cmpq    %rax, %rcx
        jg      .L18
        jmp     .L17
.L21:
        movl    $0, %eax
.L17:
        cmpq    %rdx, %rax
        jge     .L19
.L20:
        addq    $1, %rax
        cmpq    %rdx, %rax
        jne     .L20
.L19:
        movq    $0, (%rsi)
        ret
        .cfi_endproc
.LFE25:
        .size   combine6, .-combine6
        .globl  combine7
        .type   combine7, @function

案例七: 重新结合变换

术语: 重新结合变换 [“2 * 1a”]

acc = acc OP (data[i] OP data[i + 1]);

// 2 * 1a loop unrolling
void combine7(vec_ptr v, data_t *dest)
{
	long i;
	long length = vec_length(v);
	long limit = length - 1;
	data_t *data = get_vec_start(v);
	data_t acc = IDENT;
	
	// combine 2 elements at a tims
	for (i = 0; i < limit; i += 2)
	{
		acc = acc OP (data[i] OP data[i + 1]);
	}
	
	// finish any remaining elements
	for (; i < length; ++i)
	{
		acc = acc OP data[i];
	}
	*dest = acc;
}
combine7:
.LFB26:
        .cfi_startproc
        movq    (%rdi), %rdx
        leaq    -1(%rdx), %rcx
        testq   %rcx, %rcx
        jle     .L27
        movl    $0, %eax
.L24:
        addq    $2, %rax
        cmpq    %rax, %rcx
        jg      .L24
        jmp     .L23
.L27:
        movl    $0, %eax
.L23:
        cmpq    %rdx, %rax
        jge     .L25
.L26:
        addq    $1, %rax
        cmpq    %rdx, %rax
        jne     .L26
.L25:
        movq    $0, (%rsi)
        ret
        .cfi_endproc
.LFE26:
        .size   combine7, .-combine7
        .globl  main
        .type   main, @function

结语

减少过程调用