This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
Memory buffer handling - additional optimization proposal
- From: daniel at poradnik-webmastera dot com
- To: gcc at gcc dot gnu dot org
- Date: Sat, 21 Jan 2017 16:24:50 +0100
- Subject: Memory buffer handling - additional optimization proposal
- Authentication-results: sourceware.org; auth=none
Hi
I have checked how gcc treats temporary buffers allocated in different
ways (local buffer on stack, malloced locally in function, malloced
outside and passed via argument) and found that gcc could do better work
there. I have few proposals how to make things better:
1. Introduce __builtin_assume_temporary(ptr) and/or
__attribute__((temporary)) which will tell gcc that given function
argument or variable is a temporary buffer and its contents after
function execution will not be used. By doing so gcc could treat it as
if it was allocated locally in function and optimize out unnecessary
memory writes.
2. One of function versions used malloc() to allocate buffer of constant
size at beginning of function, and free() to delete it at the end. When
this function was inlined and called twice, its body was inserted twice
in my code. Other function versions with buffer on stack or malloced
outside and passed via argument were inserted once. I am not sure if
this is some bug or feature, someone familiar with this should check it.
3. If there is valid reason to actually duplicate this code (e.g. loop
unrolling), gcc could detect that code calls malloc and free in loop and
repeatedly allocate/free buffer of the same size. In such case gcc could
optimize this to call malloc and free once. If someone will use calloc,
gcc could simply clear buffer before each loop iteration.
I tested this with gcc 5.4.0 shipped by default in Cygwin 64 bit. Test
code used by me is below.
Regards,
Daniel
#include <stdio.h>
#include <stdlib.h>
#define CNT 4
#define ATTR_INLINE __attribute__((hot, always_inline)) static inline
//#define ATTR_INLINE
ATTR_INLINE int func_1(int* data)
{
int buff[CNT * CNT];
for (int i = 0; i < CNT; ++i)
{
for (int j = 0; j < CNT; ++j)
{
buff[i * CNT + j] = data[i];
}
}
for (int n = 0; n < CNT; ++n)
{
for (int i = 0; i < n; ++i)
{
for (int j = 0; j < n; ++j)
{
buff[i * CNT + j] = buff[(i+1) * CNT + j] + buff[i * CNT
+ j+1];
}
}
}
return buff[0];
}
ATTR_INLINE int func_2(int* data)
{
int* buff = (int*)malloc(CNT * CNT * sizeof(int));
for (int i = 0; i < CNT; ++i)
{
for (int j = 0; j < CNT; ++j)
{
buff[i * CNT + j] = data[i];
}
}
for (int n = 0; n < CNT; ++n)
{
for (int i = 0; i < n; ++i)
{
for (int j = 0; j < n; ++j)
{
buff[i * CNT + j] = buff[(i+1) * CNT + j] + buff[i * CNT
+ j+1];
}
}
}
int result = buff[0];
free(buff);
return result;
}
ATTR_INLINE int func_3(int* __restrict__ data, int* __restrict__ buff)
{
for (int i = 0; i < CNT; ++i)
{
for (int j = 0; j < CNT; ++j)
{
buff[i * CNT + j] = data[i];
}
}
for (int n = 0; n < CNT; ++n)
{
for (int i = 0; i < n; ++i)
{
for (int j = 0; j < n; ++j)
{
buff[i * CNT + j] = buff[(i+1) * CNT + j] + buff[i * CNT
+ j+1];
}
}
}
return buff[0];
}
ATTR_INLINE int func_4(int* data, int* buff)
{
for (int i = 0; i < CNT; ++i)
{
for (int j = 0; j < CNT; ++j)
{
buff[i * CNT + j] = data[i];
}
}
for (int n = 0; n < CNT; ++n)
{
for (int i = 0; i < n; ++i)
{
for (int j = 0; j < n; ++j)
{
buff[i * CNT + j] = buff[(i+1) * CNT + j] + buff[i * CNT
+ j+1];
}
}
}
return buff[0];
}
int main()
{
int* data = (int*)malloc(CNT * sizeof(int));
int* buff = (int*)malloc(CNT * sizeof(int));
for (int n = 0; n < CNT; ++n)
data[n] = rand();
int sum = 0;
printf("1");
sum += func_1(data);
sum += func_1(data);
printf("2");
sum += func_2(data);
sum += func_2(data);
printf("3");
sum += func_3(data, buff);
sum += func_3(data, buff);
printf("4");
sum += func_4(data, buff);
sum += func_4(data, buff);
printf("4");
return sum;
}