This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: gcc -O1 performs better than gcc -O2


Richard Guenther <richard.guenther@gmail.com> wrote on 2010/02/14 19:05:24:
>
> On Sun, Feb 14, 2010 at 5:51 PM, Joakim Tjernlund
> <joakim.tjernlund@transmode.se> wrote:
> >
> > Noticed while optimizing crc16 that gcc -O performed much better
> > than gcc -O2 while doing crc16:
>
> Reducing the noise by adding a loop with trip count 64, making sure
> my powersaving model is fixed at performance I see
>
> -O1:
> crc1:f532 crc2:f532
> crc16 tv_res:1 :387072
> CRC16 tv_res:1 :100397
>
> -O2:
> crc1:f532 crc2:f532
> crc16 tv_res:1 :301706
> CRC16 tv_res:1 :77103

The new CRC16 seems a lot faster :)

>
> so it's faster, with GCC 4.4.3.
>
> It's indeed slower with GCC 4.3.4 though.
>
> But your benchmark seems artificial enough that GCC 4.5 optimizes
> it away - it manages to see that CRC16 and crc16 are pure functions,
> thus it only retains their last calls.  At least at -O1, at -O2 it inlines
> all functions into main and isn't that clever anymore in the end.
>
> So - beware of benchmarks.

The warmup isn't really needed after I added the memset, and whether the
functions are inlined or not should not matter that much since they are only
used once, so I think my conclusion still stands: gcc 4.3.4 is slower with -O2
than with -O1.

Glad to hear that newer versions are back on track. Does that mean
that this won't be fixed in the gcc 4.3.x series?

>
> Richard.
>
> > # > gcc -O1 CRC16.c ;./a.out
> > crc1:f532 crc2:f532
> > crc16 tv_res:0 :12768
> > CRC16 tv_res:0 :10795
> > # > gcc -O2 CRC16.c ;./a.out
> > crc1:f532 crc2:f532
> > crc16 tv_res:0 :17092
> > CRC16 tv_res:0 :11581
> >
> > #> gcc --version
> > gcc (Gentoo 4.3.4 p1.0, pie-10.1.5) 4.3.4
> >
> > cpu:
> > vendor_id       : GenuineIntel
> > cpu family      : 6
> > model           : 23
> > model name      : Intel(R) Core(TM)2 Duo CPU     E8500  @ 3.16GHz
> > stepping        : 10
> > cpu MHz : 3159.236
> >
> > Here is the CRC16.c:
> >
> > #define u32 unsigned long
> > #define u16 unsigned short
> > #define u8 unsigned char
> >
> > #include <sys/time.h>
> >
> > #include <stdio.h>
> > #include <stdlib.h>
> >
> > /** CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1) */
> > u16 const crc16_table[256] = {
> > ? ? ? ?0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
> > ? ? ? ?0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
> > ? ? ? ?0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
> > ? ? ? ?0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
> > ? ? ? ?0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
> > ? ? ? ?0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
> > ? ? ? ?0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
> > ? ? ? ?0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
> > ? ? ? ?0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
> > ? ? ? ?0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
> > ? ? ? ?0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
> > ? ? ? ?0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
> > ? ? ? ?0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
> > ? ? ? ?0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
> > ? ? ? ?0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
> > ? ? ? ?0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
> > ? ? ? ?0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
> > ? ? ? ?0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
> > ? ? ? ?0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
> > ? ? ? ?0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
> > ? ? ? ?0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
> > ? ? ? ?0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
> > ? ? ? ?0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
> > ? ? ? ?0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
> > ? ? ? ?0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
> > ? ? ? ?0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
> > ? ? ? ?0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
> > ? ? ? ?0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
> > ? ? ? ?0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
> > ? ? ? ?0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
> > ? ? ? ?0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
> > ? ? ? ?0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
> > };
> > #include <asm/byteorder.h>
> > #define tole(x) __constant_cpu_to_le16(x)
> > u16 const crc16_table_le[256] = {
> > ? ? ? ?tole(0x0000), tole(0xC0C1), tole(0xC181), tole(0x0140), tole(0xC301),
> > ? ? ? ?tole(0x03C0), tole(0x0280), tole(0xC241), tole(0xC601), tole(0x06C0),
> > ? ? ? ?tole(0x0780), tole(0xC741), tole(0x0500), tole(0xC5C1), tole(0xC481),
> > ? ? ? ?tole(0x0440), tole(0xCC01), tole(0x0CC0), tole(0x0D80), tole(0xCD41),
> > ? ? ? ?tole(0x0F00), tole(0xCFC1), tole(0xCE81), tole(0x0E40), tole(0x0A00),
> > ? ? ? ?tole(0xCAC1), tole(0xCB81), tole(0x0B40), tole(0xC901), tole(0x09C0),
> > ? ? ? ?tole(0x0880), tole(0xC841), tole(0xD801), tole(0x18C0), tole(0x1980),
> > ? ? ? ?tole(0xD941), tole(0x1B00), tole(0xDBC1), tole(0xDA81), tole(0x1A40),
> > ? ? ? ?tole(0x1E00), tole(0xDEC1), tole(0xDF81), tole(0x1F40), tole(0xDD01),
> > ? ? ? ?tole(0x1DC0), tole(0x1C80), tole(0xDC41), tole(0x1400), tole(0xD4C1),
> > ? ? ? ?tole(0xD581), tole(0x1540), tole(0xD701), tole(0x17C0), tole(0x1680),
> > ? ? ? ?tole(0xD641), tole(0xD201), tole(0x12C0), tole(0x1380), tole(0xD341),
> > ? ? ? ?tole(0x1100), tole(0xD1C1), tole(0xD081), tole(0x1040), tole(0xF001),
> > ? ? ? ?tole(0x30C0), tole(0x3180), tole(0xF141), tole(0x3300), tole(0xF3C1),
> > ? ? ? ?tole(0xF281), tole(0x3240), tole(0x3600), tole(0xF6C1), tole(0xF781),
> > ? ? ? ?tole(0x3740), tole(0xF501), tole(0x35C0), tole(0x3480), tole(0xF441),
> > ? ? ? ?tole(0x3C00), tole(0xFCC1), tole(0xFD81), tole(0x3D40), tole(0xFF01),
> > ? ? ? ?tole(0x3FC0), tole(0x3E80), tole(0xFE41), tole(0xFA01), tole(0x3AC0),
> > ? ? ? ?tole(0x3B80), tole(0xFB41), tole(0x3900), tole(0xF9C1), tole(0xF881),
> > ? ? ? ?tole(0x3840), tole(0x2800), tole(0xE8C1), tole(0xE981), tole(0x2940),
> > ? ? ? ?tole(0xEB01), tole(0x2BC0), tole(0x2A80), tole(0xEA41), tole(0xEE01),
> > ? ? ? ?tole(0x2EC0), tole(0x2F80), tole(0xEF41), tole(0x2D00), tole(0xEDC1),
> > ? ? ? ?tole(0xEC81), tole(0x2C40), tole(0xE401), tole(0x24C0), tole(0x2580),
> > ? ? ? ?tole(0xE541), tole(0x2700), tole(0xE7C1), tole(0xE681), tole(0x2640),
> > ? ? ? ?tole(0x2200), tole(0xE2C1), tole(0xE381), tole(0x2340), tole(0xE101),
> > ? ? ? ?tole(0x21C0), tole(0x2080), tole(0xE041), tole(0xA001), tole(0x60C0),
> > ? ? ? ?tole(0x6180), tole(0xA141), tole(0x6300), tole(0xA3C1), tole(0xA281),
> > ? ? ? ?tole(0x6240), tole(0x6600), tole(0xA6C1), tole(0xA781), tole(0x6740),
> > ? ? ? ?tole(0xA501), tole(0x65C0), tole(0x6480), tole(0xA441), tole(0x6C00),
> > ? ? ? ?tole(0xACC1), tole(0xAD81), tole(0x6D40), tole(0xAF01), tole(0x6FC0),
> > ? ? ? ?tole(0x6E80), tole(0xAE41), tole(0xAA01), tole(0x6AC0), tole(0x6B80),
> > ? ? ? ?tole(0xAB41), tole(0x6900), tole(0xA9C1), tole(0xA881), tole(0x6840),
> > ? ? ? ?tole(0x7800), tole(0xB8C1), tole(0xB981), tole(0x7940), tole(0xBB01),
> > ? ? ? ?tole(0x7BC0), tole(0x7A80), tole(0xBA41), tole(0xBE01), tole(0x7EC0),
> > ? ? ? ?tole(0x7F80), tole(0xBF41), tole(0x7D00), tole(0xBDC1), tole(0xBC81),
> > ? ? ? ?tole(0x7C40), tole(0xB401), tole(0x74C0), tole(0x7580), tole(0xB541),
> > ? ? ? ?tole(0x7700), tole(0xB7C1), tole(0xB681), tole(0x7640), tole(0x7200),
> > ? ? ? ?tole(0xB2C1), tole(0xB381), tole(0x7340), tole(0xB101), tole(0x71C0),
> > ? ? ? ?tole(0x7080), tole(0xB041), tole(0x5000), tole(0x90C1), tole(0x9181),
> > ? ? ? ?tole(0x5140), tole(0x9301), tole(0x53C0), tole(0x5280), tole(0x9241),
> > ? ? ? ?tole(0x9601), tole(0x56C0), tole(0x5780), tole(0x9741), tole(0x5500),
> > ? ? ? ?tole(0x95C1), tole(0x9481), tole(0x5440), tole(0x9C01), tole(0x5CC0),
> > ? ? ? ?tole(0x5D80), tole(0x9D41), tole(0x5F00), tole(0x9FC1), tole(0x9E81),
> > ? ? ? ?tole(0x5E40), tole(0x5A00), tole(0x9AC1), tole(0x9B81), tole(0x5B40),
> > ? ? ? ?tole(0x9901), tole(0x59C0), tole(0x5880), tole(0x9841), tole(0x8801),
> > ? ? ? ?tole(0x48C0), tole(0x4980), tole(0x8941), tole(0x4B00), tole(0x8BC1),
> > ? ? ? ?tole(0x8A81), tole(0x4A40), tole(0x4E00), tole(0x8EC1), tole(0x8F81),
> > ? ? ? ?tole(0x4F40), tole(0x8D01), tole(0x4DC0), tole(0x4C80), tole(0x8C41),
> > ? ? ? ?tole(0x4400), tole(0x84C1), tole(0x8581), tole(0x4540), tole(0x8701),
> > ? ? ? ?tole(0x47C0), tole(0x4680), tole(0x8641), tole(0x8201), tole(0x42C0),
> > ? ? ? ?tole(0x4380), tole(0x8341), tole(0x4100), tole(0x81C1), tole(0x8081),
> > ? ? ? ?tole(0x4040)
> > };
> >
> > extern u16 const crc16_table[256];
> >
> > extern u16 crc16(u16 crc, const u8 *buffer, u32 len);
> >
> > static inline u16 crc16_byte(u16 crc, const u8 data)
> > {
> > ? ? ? ?return (crc >> 8) ^ crc16_table[(crc ^ data) & 0xff];
> > }
> >
> > /**
> > ?* crc16 - compute the CRC-16 for the data buffer
> > ?* @crc: ? ? ? ?previous CRC value
> > ?* @buffer: ? ? data pointer
> > ?* @len: ? ? ? ?number of bytes in the buffer
> > ?*
> > ?* Returns the updated CRC value.
> > ?*/
> > u16 crc16(u16 crc, u8 const *buffer, u32 len)
> > {
> > ? ? ? ?while (len--)
> > ? ? ? ? ? ? ? ?crc = crc16_byte(crc, *buffer++);
> > ? ? ? ?return crc;
> > }
> > #include <endian.h>
> > # if __BYTE_ORDER == __LITTLE_ENDIAN
> > # ?define DO_CRC16(x) crc = tab[(crc ^ (x)) & 255] ^ (crc >> 8)
> > # else
> > # ?define DO_CRC16(x) crc = tab[((crc >> 8) ^ (x))] ^ (crc << 8)
> > # endif
> >
> > static inline u16 CRC16_byte(u16 crc, const u8 data)
> > {
> > ? ? ? ?const u16 *tab = crc16_table_le;
> >
> > ? ? ? ?crc = __cpu_to_le16(crc);
> > ? ? ? ?DO_CRC16(data);
> > ? ? ? ?return __le16_to_cpu(crc);
> > }
> >
> > u16 CRC16(u16 crc, u8 const *buffer, u32 len)
> > {
> > ? ? ? ?u32 loops;
> > ? ? ? ?u16 *b;
> > ? ? ? ?const u16 *tab = crc16_table_le;;
> >
> > ? ? ? ?crc = __cpu_to_le16(crc);
> > ? ? ? ?/* Align */
> > ? ? ? ?if ((long)buffer & 1 && len) {
> > ? ? ? ? ? ? ? ?DO_CRC16(*buffer++);
> > ? ? ? ? ? ? ? ?--len;
> > ? ? ? ?}
> > ? ? ? ?loops = len >> 1;
> > ? ? ? ?b = (u16 *)buffer;
> > ? ? ? ?for(--b; loops; --loops) {
> > ? ? ? ? ? ? ? ?crc ^= *++b; /* use pre increment for speed */
> > ? ? ? ? ? ? ? ?DO_CRC16(0);
> > ? ? ? ? ? ? ? ?DO_CRC16(0);
> > ? ? ? ?}
> > ? ? ? ?if (len & 1) {
> > ? ? ? ? ? ? ? ?u8 *p = (u8 *)(b + 1);
> > ? ? ? ? ? ? ? ?DO_CRC16(*p);
> > ? ? ? ?}
> > ? ? ? ?return __le16_to_cpu(crc);
> > }
> > #define TST "1234567890"
> > #define BUF_SIZ 5*1024*1024
> > main()
> > {
> > ? ? ? ?u16 crc1, crc2;
> > ? ? ? ?char *buffer = ?malloc(BUF_SIZ);
> > ? ? ? ?struct timeval tv, tv2, tv3, tv_res, tv2_res;
> >
> > ? ? ? ?memset(buffer, -1, BUF_SIZ);
> > ? ? ? ?memcpy (buffer, TST, sizeof(TST));
> > ? ? ? ?/*warm up */
> > ? ? ? ?crc1 = crc16(~0, buffer, BUF_SIZ);
> > ? ? ? ?crc2 = CRC16(~0, buffer, BUF_SIZ);
> >
> > ? ? ? ?gettimeofday(&tv, NULL);
> > ? ? ? ?crc1 = crc16(~0, buffer, BUF_SIZ);
> > ? ? ? ?gettimeofday(&tv2, NULL);
> > ? ? ? ?crc2 = CRC16(~0, buffer, BUF_SIZ);
> > ? ? ? ?gettimeofday(&tv3, NULL);
> > ? ? ? ?timersub(&tv2, &tv, &tv_res);
> > ? ? ? ?printf("crc1:%x crc2:%x\n", crc1, crc2);
> > ? ? ? ?printf("crc16 tv_res:%d :%d\n", (int)tv_res.tv_sec, (int)tv_res.tv_usec);
> > ? ? ? ?timersub(&tv3, &tv2, &tv2_res);
> > ? ? ? ?printf("CRC16 tv_res:%d :%d\n", (int)tv2_res.tv_sec, (int)tv2_res.tv_usec);
> > }
> >
> >
>


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]