This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

optimize slowdown problems on longest_match func detect.


Hello 

longest_match is a function that is used in compression to find out same
bytes.
I notice GCC 4.5.0(but on GGC series 4 happen too) use 0x64 bytes from stack
gcc3.4.0 only 8.
gcc4.5.0 is btw faster as 4.4.1.

Can somebody tell me, how this optimization can switch off in GCC ?.

GCC4.5.0 move data on this 0x64 bytes in this function.gcc3.4 do this not
and the asm optimized version of longest_match do this too not.I also dont
understand wy there must data copy.this func only compare data as far i see
and return length of same values.

here is this function optimized in asm and X86 asm 

http://hermes.ibert.com/source/foreign/gzip/match.s

this stack usage happen when i use -O1.

So i can see if it run faster then.
I think thats not only a backend problem, and happen on all platforms.

Here is C source and below is asm Output from gcc 4.5.0 and GCC 3.4.0 in 68k
asm.
its compile with -g2 there are stabd in.To see what sourceline it is, look
in attached file.

uInt longest_match(s, cur_match)
    deflate_state *s;
    IPos cur_match;                             /* current match */
{ 
    unsigned chain_length = s->max_chain_length;/* max hash chain length */
    register Bytef *scan = s->window + s->strstart; /* current string */
    register Bytef *match;                       /* matched string */
    register int len;                           /* length of current match
*/
     
    int best_len = s->prev_length;              /* best match length so far
*/
    int nice_match = s->nice_match;             /* stop if match long enough
*/
    IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
        s->strstart - (IPos)MAX_DIST(s) : NIL;
    /* Stop when cur_match becomes <= limit. To simplify the code,
     * we prevent matches with the string of window index 0.
     */ 
    Posf *prev = s->prev;
    uInt wmask = s->w_mask;
   
#ifdef UNALIGNED_OK
    /* Compare two bytes at a time. Note: this is not always beneficial.
     * Try with and without -DUNALIGNED_OK to check.
     */
    register Bytef *strend = s->window + s->strstart + MAX_MATCH - 1;
    register ush scan_start = *(ushf*)scan;
    register ush scan_end   = *(ushf*)(scan+best_len-1);
#else
    register Bytef *strend = s->window + s->strstart + MAX_MATCH;
    register Byte scan_end1  = scan[best_len-1];
    register Byte scan_end   = scan[best_len];
#endif
  
    /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of
16.
     * It is easy to get rid of this optimization if necessary.
     */
    
    Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");

    /* Do not waste too much time if we already have a good match: */
    if (s->prev_length >= s->good_match) {
        chain_length >>= 2;
    }
    /* Do not look for matches beyond the end of the input. This is
necessary
     * to make deflate deterministic.
     */
    if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;

    Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need
lookahead");

    do {
        Assert(cur_match < s->strstart, "no future");
        match = s->window + cur_match;

        /* Skip to next match if the match length cannot increase
         * or if the match length is less than 2.  Note that the checks
below
         * for insufficient lookahead only occur occasionally for
performance
         * reasons.  Therefore uninitialized memory will be accessed, and
         * conditional jumps will be made that depend on those values.
         * However the length of the match is limited to the lookahead, so
         * the output of deflate is not affected by the uninitialized
values.
         */
#if (defined(UNALIGNED_OK) && MAX_MATCH == 258)
        /* This code assumes sizeof(unsigned short) == 2. Do not use
         * UNALIGNED_OK if your compiler uses a different size.
         */
        if (*(ushf*)(match+best_len-1) != scan_end ||
            *(ushf*)match != scan_start) continue;

        /* It is not necessary to compare scan[2] and match[2] since they
are
         * always equal when the other bytes match, given that the hash keys
         * are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at
         * strstart+3, +5, ... up to strstart+257. We check for insufficient
         * lookahead only every 4th comparison; the 128th check will be made
         * at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is
         * necessary to put more guard bytes at the end of the window, or
         * to check more often for insufficient lookahead.
         */
        Assert(scan[2] == match[2], "scan[2]?");
        scan++, match++;
        do {
        } while (*(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
                 *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
                 *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
                 *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
                 scan < strend);
        /* The funny "do {}" generates better code on most compilers */

        /* Here, scan <= window+strstart+257 */
        Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
        if (*scan == *match) scan++;

        len = (MAX_MATCH - 1) - (int)(strend-scan);
        scan = strend - (MAX_MATCH-1);

#else /* UNALIGNED_OK */

        if (match[best_len]   != scan_end  ||
            match[best_len-1] != scan_end1 ||
            *match            != *scan     ||
            *++match          != scan[1])      continue;

        /* The check at best_len-1 can be removed because it will be made
         * again later. (This heuristic is not always a win.)
         * It is not necessary to compare scan[2] and match[2] since they
         * are always equal when the other bytes match, given that
         * the hash keys are equal and that HASH_BITS >= 8.
         */
        scan += 2, match++;
        Assert(*scan == *match, "match[2]?");

        /* We check for insufficient lookahead only every 8th comparison;
         * the 256th check will be made at strstart+258.
         */
        do {
        } while (*++scan == *++match && *++scan == *++match &&
                 *++scan == *++match && *++scan == *++match &&
                 *++scan == *++match && *++scan == *++match &&
                 *++scan == *++match && *++scan == *++match &&
                 scan < strend);

        Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");

        len = MAX_MATCH - (int)(strend - scan);
        scan = strend - MAX_MATCH;

#endif /* UNALIGNED_OK */

        if (len > best_len) {
            s->match_start = cur_match;
            best_len = len;
            if (len >= nice_match) break;
#ifdef UNALIGNED_OK
            scan_end = *(ushf*)(scan+best_len-1);
#else
            scan_end1  = scan[best_len-1];
            scan_end   = scan[best_len];
#endif
        }
    } while ((cur_match = prev[cur_match & wmask]) > limit
             && --chain_length != 0);

    if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
    return s->lookahead;
}
#endif /* ASMV */
#endif /* FASTEST */



GCC 4.5.0

_longest_match:
    .stabd  68,0,1069
LFBB1:
    lea (-64,sp),sp
    movem.l #16190,-(sp)
    move.l 112(sp),a1
    move.l 116(sp),d1
    .stabd  68,0,1070
    move.l 122(a1),d2
    .stabd  68,0,1071
    move.l 54(a1),a2
    move.l 106(a1),d3
    lea (a2,d3.l),a4
    .stabd  68,0,1075
    move.l 118(a1),d0
    move.l d0,d7
    .stabd  68,0,1076
    move.l 142(a1),a6
    .stabd  68,0,1077
    move.l 42(a1),d5
    .stabd  68,0,1078
    move.l d5,d4
    add.l #-262,d4
    cmp.l d3,d4
    jcc L12
    move.l d3,d4
    add.l #262,d4
    sub.l d5,d4
L2:
    .stabd  68,0,1082
    move.l 62(a1),a3
    .stabd  68,0,1083
    move.l 50(a1),d5
    .stabd  68,0,1093
    lea 258(a2,d3.l),a5
    .stabd  68,0,1094
    move.b -1(a4,d0.l),50(sp)
    .stabd  68,0,1095
    move.b (a4,d0.l),d3
    .stabd  68,0,1101
    cmp.l 138(a1),d0
    jcs L3
    .stabd  68,0,1106
    lsr.l #2,d2
L3:
    .stabd  68,0,1111
    move.l 114(a1),d6
    cmp.l a6,d6
    jcc L10
    move.l d6,a6
L10:
    .stabd  68,0,1188
    move.l #258,d0
    sub.l a5,d0
    move.l d0,62(sp)
    .stabd  68,0,1189
    lea (-258,a5),a0
    move.l a0,58(sp)
    move.l d7,a0
    move.l a1,66(sp)
    move.l d6,54(sp)
    move.l a6,46(sp)
    move.l a5,a6
    move.w d3,a5
L25:
    .stabd  68,0,1117
    lea (a2,d1.l),a1
    .stabd  68,0,1162
    move.l a0,d0
    move.b (a1,a0.l),d3
    move.w a5,d6
    cmp.b d3,d6
    jeq L29
L5:
    .stabd  68,0,1204
    and.l d5,d1
    .stabd  68,0,1205
    move.w (a3,d1.l*2),d1
    and.l #65535,d1
    cmp.l d4,d1
    jls L27
    subq.l #1,d2
    jne L25
L27:
    move.l 54(sp),d6
L9:
    .stabd  68,0,1207
    cmp.l d0,d6
    jcc L11
    move.l d6,d0
L11:
    .stabd  68,0,1209
    movem.l (sp)+,#31996
    lea (64,sp),sp
    rts
L29:
    .stabd  68,0,1163
    move.b -1(a1,a0.l),d7
    .stabd  68,0,1162
    cmp.b 50(sp),d7
    jne L5
    .stabd  68,0,1164
    move.b (a1),d6
    cmp.b (a4),d6
    jne L5
    addq.l #1,a1
    .stabd  68,0,1165
    move.b (a1),d6
    cmp.b 1(a4),d6
    jne L5
    .stabd  68,0,1173
    lea (2,a4),a5
    move.l a1,d6
    addq.l #1,d6
    lea (3,a4),a1
    move.l a1,84(sp)
    lea (9,a4),a1
    move.l a1,88(sp)
    lea (8,a4),a1
    move.l a1,92(sp)
    lea (7,a4),a1
    move.l a1,96(sp)
    lea (6,a4),a1
    move.l a1,100(sp)
    lea (5,a4),a1
    move.l a1,104(sp)
    addq.l #4,a4
    move.l d6,a1
    move.l 46(sp),d6
    move.l d5,50(sp)
    move.l a0,d5
    move.l a0,70(sp)
    move.l 88(sp),d0
    move.b d3,74(sp)
    move.l d2,88(sp)
    move.l a6,d2
    move.b d7,75(sp)
    move.l a3,d7
    move.l 100(sp),a6
    move.l 104(sp),a0
    move.l d1,76(sp)
    move.l 92(sp),d1
    move.l a2,80(sp)
    move.l a4,a2
    move.l 84(sp),a4
L7:
    .stabd  68,0,1180
    move.b (a4),d3
    cmp.b 1(a1),d3
    jne L16
    move.b (a2),d3
    cmp.b 2(a1),d3
    jne L17
    .stabd  68,0,1181
    move.b (a0),d3
    cmp.b 3(a1),d3
    jne L18
    move.b (a6),d3
    cmp.b 4(a1),d3
    jne L19
    .stabd  68,0,1182
    move.l 96(sp),a3
    move.b (a3),d3
    cmp.b 5(a1),d3
    jne L20
    move.l d1,a3
    move.b (a3),d3
    cmp.b 6(a1),d3
    jne L21
    .stabd  68,0,1183
    move.l d0,a3
    move.b (a3),d3
    cmp.b 7(a1),d3
    jne L22
    .stabd  68,0,1066
    addq.l #8,a5
    addq.l #8,a1
    .stabd  68,0,1183
    move.b (a5),d3
    cmp.b (a1),d3
    jne L26
    addq.l #8,a4
    addq.l #8,d0
    addq.l #8,d1
    addq.l #8,96(sp)
    addq.l #8,a6
    addq.l #8,a0
    addq.l #8,a2
    cmp.l d2,a5
    jcs L7
L26:
    move.l d5,a0
    move.l d6,46(sp)
    move.l d7,a3
    move.l 50(sp),d5
    move.l d2,a6
    move.l 88(sp),d2
    move.l 70(sp),d0
    move.b 74(sp),d3
    move.b 75(sp),d7
    move.l 76(sp),d1
    move.l 80(sp),a2
L6:
    .stabd  68,0,1188
    add.l 62(sp),a5
    .stabd  68,0,1189
    move.l 58(sp),a4
    .stabd  68,0,1193
    cmp.l a0,a5
    jle L23
    .stabd  68,0,1194
    move.l 66(sp),a0
    move.l d1,110(a0)
    .stabd  68,0,1196
    cmp.l 46(sp),a5
    jge L30
    .stabd  68,0,1200
    move.l a5,d0
    move.l 58(sp),a1
    move.b -1(a1,a5.l),50(sp)
    .stabd  68,0,1201
    move.b (a1,a5.l),d3
    move.w d3,a5
    move.l d0,a0
    jra L5
L12:
    .stabd  68,0,1078
    clr.l d4
    jra L2
L23:
    .stabd  68,0,1193
    move.w d3,a5
    move.b d7,50(sp)
    jra L5
L16:
    move.l d5,a0
    move.l a4,84(sp)
    move.l d6,46(sp)
    move.l d7,a3
    move.l 50(sp),d5
    move.l d2,a6
    move.l 88(sp),d2
    move.l 70(sp),d0
    move.b 74(sp),d3
    move.b 75(sp),d7
    move.l 76(sp),d1
    move.l 80(sp),a2
    .stabd  68,0,1184
    move.l a4,a5
    jra L6
L17:
    move.l d5,a0
    move.l d6,46(sp)
    move.l d7,a3
    move.l 50(sp),d5
    move.l d2,a6
    move.l 88(sp),d2
    move.l 70(sp),d0
    move.b 74(sp),d3
    move.b 75(sp),d7
    move.l a2,a4
    move.l 76(sp),d1
    move.l 80(sp),a2
    .stabd  68,0,1180
    move.l a4,a5
    jra L6
L19:
    move.l d5,a0
    move.l d6,46(sp)
    move.l d7,a3
    move.l 50(sp),d5
    move.l 70(sp),d0
    move.b 75(sp),d7
    move.l a6,100(sp)
    move.l d2,a6
    move.l 88(sp),d2
    move.b 74(sp),d3
    move.l 76(sp),d1
    move.l 80(sp),a2
    .stabd  68,0,1181
    move.l 100(sp),a5
    jra L6
L18:
    move.l d6,46(sp)
    move.l d7,a3
    move.l d2,a6
    move.l 88(sp),d2
    move.l 70(sp),d0
    move.b 74(sp),d3
    move.b 75(sp),d7
    move.l a0,104(sp)
    move.l d5,a0
    move.l 50(sp),d5
    move.l 76(sp),d1
    move.l 80(sp),a2
    .stabd  68,0,1180
    move.l 104(sp),a5
    jra L6
L21:
    move.l d5,a0
    move.l d6,46(sp)
    move.l d7,a3
    move.l 50(sp),d5
    move.l d2,a6
    move.l 88(sp),d2
    move.l 70(sp),d0
    move.b 74(sp),d3
    move.b 75(sp),d7
    move.l d1,92(sp)
    move.l 76(sp),d1
    move.l 80(sp),a2
    .stabd  68,0,1182
    move.l 92(sp),a5
    jra L6
L20:
    move.l d5,a0
    move.l d6,46(sp)
    move.l 50(sp),d5
    move.l d2,a6
    move.l 88(sp),d2
    move.l 70(sp),d0
    move.b 74(sp),d3
    move.l d7,a3
    move.b 75(sp),d7
    move.l 76(sp),d1
    move.l 80(sp),a2
    .stabd  68,0,1181
    move.l 96(sp),a5
    jra L6
L22:
    move.l d5,a0
    move.l d6,46(sp)
    move.l d7,a3
    move.l 50(sp),d5
    move.l d2,a6
    move.l 88(sp),d2
    move.l d0,88(sp)
    move.l 70(sp),d0
    move.b 74(sp),d3
    move.b 75(sp),d7
    move.l 76(sp),d1
    move.l 80(sp),a2
    .stabd  68,0,1182
    move.l 88(sp),a5
    jra L6
L30:
    move.l a5,d0
    move.l 54(sp),d6
    jra L9
    .stabs  "longest_match:f17",36,0,1066,_longest_match
    .stabs  "s:p189=*181",160,0,1067,112
    .stabs  "cur_match:p171",160,0,1068,116
    .stabs  "chain_length:r4",64,0,1070,2
    .stabs  "len:r1",64,0,1073,0
    .stabs  "best_len:r1",64,0,1075,7
    .stabs  "nice_match:r1",64,0,1076,14
    .stabs  "limit:r171",64,0,1077,4
    .stabs  "prev:r172",64,0,1082,11
    .stabs  "wmask:r17",64,0,1083,5
    .stabs  "strend:r127",64,0,1093,13
    .stabs  "s:r189",64,0,1067,9
    .stabs  "cur_match:r171",64,0,1068,1
    .stabn  192,0,0,LFBB1
    .stabn  224,0,0,Lscope1
Lscope1:
    .even

---------------------- GCC 3.4.0 ----------------------------------------

    .even
_longest_match:
    .stabd  68,0,1069
    subqw #8,sp
    moveml #0x3f3e,sp@-
    movel sp@(56),a4
    movel sp@(60),d1
    .stabd  68,0,1070
    movel a4@(122),d2
    .stabd  68,0,1071
    movel a4@(54),a2
    movel a4@(106),a1
    lea a2@(0,a1:l),a3
    .stabd  68,0,1075
    movel a4@(118),d3
    .stabd  68,0,1076
    movel a4@(142),d7
    .stabd  68,0,1077
    movel a4@(42),a0
    movel a0,d0
    addl #-262,d0
    lea 0:w,a6
    cmpl a1,d0
    jcc L378
    .stabd  68,0,1077
    movel a1,a5
    subl a0,a5
    lea a5@(262),a6
L378:
    .stabd  68,0,1082
    movel a4@(62),a5
    .stabd  68,0,1083
    movel a4@(50),d6
    .stabd  68,0,1093
    lea a1@(258,a2:l),a1
    movel a1,sp@(48)
    .stabd  68,0,1094
    moveb a3@(-1,d3:l),sp@(47)
    .stabd  68,0,1095
    moveb a3@(d3:l),d4
    .stabd  68,0,1104
    cmpl a4@(138),d3
    jcs L379
    .stabd  68,0,1105
    lsrl #2,d2
L379:
    .stabd  68,0,1110
    movel a4@(114),d5
    cmpl d7,d5
    jcc L392
    movel d5,d7
    .even
L392:
    .stabd  68,0,1116
    lea a2@(0,d1:l),a0
    .stabd  68,0,1161
    cmpb a0@(d3:l),d4
    jeq L396
L383:
    .stabd  68,0,1203
    movel d1,d0
    andl d6,d0
    clrl d1
    movew a5@(d0:l:2),d1
    cmpl d1,a6
    jcc L382
    subql #1,d2
    jne L392
L382:
    .stabd  68,0,1206
    movel d3,d0
    cmpl d3,d5
    jcc L376
    .stabd  68,0,1207
    movel d5,d0
    jra L376
    .even
L396:
    .stabd  68,0,1161
    moveb sp@(47),d0
    cmpb a0@(-1,d3:l),d0
    jne L383
    moveb a0@,d0
    cmpb a3@,d0
    jne L383
    lea a0@(1),a1
    moveb a1@,d0
    cmpb a3@(1),d0
    jne L383
    .stabd  68,0,1172
    lea a3@(2),a0
    addql #1,a1
L386:
    .stabd  68,0,1178
    addql #1,a0
    addql #1,a1
    moveb a0@,d0
    cmpb a1@,d0
    jne L387
    addql #1,a0
    addql #1,a1
    moveb a0@,d0
    cmpb a1@,d0
    jne L387
    addql #1,a0
    addql #1,a1
    moveb a0@,d0
    cmpb a1@,d0
    jne L387
    addql #1,a0
    addql #1,a1
    moveb a0@,d0
    cmpb a1@,d0
    jne L387
    addql #1,a0
    addql #1,a1
    moveb a0@,d0
    cmpb a1@,d0
    jne L387
    addql #1,a0
    addql #1,a1
    moveb a0@,d0
    cmpb a1@,d0
    jne L387
    addql #1,a0
    addql #1,a1
    moveb a0@,d0
    cmpb a1@,d0
    jne L387
    addql #1,a0
    addql #1,a1
    moveb a0@,d0
    cmpb a1@,d0
    jne L387
    cmpl sp@(48),a0
    jcs L386
L387:
    .stabd  68,0,1187
    movel sp@(48),a1
    subl a0,a1
    movel #258,d0
    subl a1,d0
    .stabd  68,0,1188
    movel sp@(48),a3
    lea a3@(-258),a3
    .stabd  68,0,1192
    cmpl d0,d3
    jge L383
    .stabd  68,0,1193
    movel d1,a4@(110)
    .stabd  68,0,1194
    movel d0,d3
    .stabd  68,0,1195
    cmpl d0,d7
    jle L382
    .stabd  68,0,1199
    moveb a3@(-1,d0:l),sp@(47)
    .stabd  68,0,1200
    moveb a3@(d0:l),d4
    jra L383
    .even
L376:
    moveml sp@+,#0x7cfc
    addqw #8,sp
    rts
    .stabs  "longest_match:f23",36,0,1069,_longest_match
    .stabs  "s:p198",160,0,1067,56
    .stabs  "cur_match:p176",160,0,1068,60
    .stabs  "chain_length:r4",64,0,1070,2
    .stabs  "scan:r132",64,0,1071,11
    .stabs  "match:r132",64,0,1072,8
    .stabs  "len:r1",64,0,1073,0
    .stabs  "best_len:r1",64,0,1075,3
    .stabs  "nice_match:r1",64,0,1076,7
    .stabs  "limit:r176",64,0,1077,14
    .stabs  "prev:r177",64,0,1082,13
    .stabs  "wmask:r23",64,0,1083,6
    .stabs  "strend:132",128,0,1093,48
    .stabs  "scan_end1:22",128,0,1094,47
    .stabs  "scan_end:r22",64,0,1095,4
    .stabs  "s:r198",64,0,1067,12
    .stabs  "cur_match:r176",64,0,1068,1
    .even
_fill_window:




> HSMathLibs V.46.00 beta 1 (17.07.2007):
> 
>    * the functions "IEEEDPFloor", "IEEEDPCeil" (mathieeedoubbas.library),
> "IEEESPFloor", "IEEESPCeil" (mathieeesingbas.library and
> mathieeesingbas-Patch), "SPFloor" and "SPCeil" (mathffp.library and
> mathffp-Patch) new written
>    * Bugfixing of the functions "IEEEDPPow" (mathieeedoubtrans.library),
> "IEEESPPow" (mathieeesingtrans.library), "SPCos", "SPSincos", "SPCosh",
> "SPExp" and "SPPow" (mathtrans.library)
>    * changed the init-function of the mathffp-Patch and
> mathieeesingbas-Patch
>    * changed the init-function of all libraries
>    * revised the manual 
Regards

Attachment: deflate.c
Description: Text document


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]