This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.
[Bug target/64306] New: [SH] Improve unaligned loads
- From: "olegendo at gcc dot gnu.org" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Sun, 14 Dec 2014 15:32:01 +0000
- Subject: [Bug target/64306] New: [SH] Improve unaligned loads
- Auto-submitted: auto-generated
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64306
Bug ID: 64306
Summary: [SH] Improve unaligned loads
Product: gcc
Version: 5.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: olegendo at gcc dot gnu.org
Target: sh*-*-*
On SH4A the movua.l insn can be used to do 32-bit unaligned loads (currently
defunct, see PR 52480).  It could also be used to do 16-bit unaligned loads
with fewer insns, if over-reading the adjacent bytes is acceptable (usually
it's not safe, but this could be relaxed and enabled by a -m option).
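For illustration, a rough C sketch of what such a movua.l based 16-bit load
would compute (little endian assumed; the helper name and the use of
__builtin_memcpy are just for the sketch, this is not what the compiler
actually expands):

// Models a movua.l based 16-bit load on little endian.  4 bytes are read
// even though only 2 are needed, so bytes beyond the accessed halfword can
// be touched -- hence the idea of guarding this behind a -m option.
static inline int load_s16_via_movua (const unsigned char* p)
{
  unsigned int w;
  __builtin_memcpy (&w, p, 4);        // movua.l @r4,r0: unaligned 4-byte read
  return (short)(unsigned short) w;   // exts.w r0,r0: sign-extend the low half
}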
struct __attribute__((packed)) x
{
  int val32;
  short val_s16;
  unsigned short val_u16;
};

int load_unaligned_s16 (const x& xx)
{
  return xx.val_s16;
}
currently compiles to
little endian:
mov.b @(4,r4),r0
extu.b r0,r1
mov.b @(5,r4),r0
extu.b r0,r4
swap.b r4,r4
or r1,r4
exts.w r4,r0
big endian:
mov.b @(4,r4),r0
mov r0,r1
mov.b @(5,r4),r0
extu.b r0,r4
extu.b r1,r0
swap.b r0,r0
or r4,r0
exts.w r0,r0
better:
mov.b @({4|5},r4),r0
extu.b r0,r1
mov.b @({5|4},r4),r0
shll8 r0
or r1,r0
SH4A little endian (unsafe):
movua.l @r4,r0
exts.w r0,r0
SH4A big endian (unsafe):
movua.l @r4,r0
shlr16 r0
exts.w r0,r0
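For reference, written as C, the value the byte-wise sequence above computes
is roughly this (sketch only, little-endian byte order; the unsigned variant
below is the same with a final zero extension):

// Rough C equivalent of the proposed byte-wise sequence: zero-extended low
// byte or'd with the high byte shifted up, then sign-extended to 32 bits.
static inline int load_s16_bytewise (const unsigned char* p)
{
  return (short)((p[1] << 8) | p[0]);
}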
int load_unaligned_u16 (const x& xx)
{
  return xx.val_u16;
}
currently compiles to
little endian:
mov.b @(6,r4),r0
extu.b r0,r1
mov.b @(7,r4),r0
extu.b r0,r4
swap.b r4,r0
or r1,r0
big endian:
mov.b @(6,r4),r0
mov r0,r1
mov.b @(7,r4),r0
extu.b r0,r4
extu.b r1,r0
swap.b r0,r0
or r4,r0
better (uses fewer regs):
mov.b @({6|7},r4),r0
extu.b r0,r1
mov.b @({7|6},r4),r0
shll8 r0
or r1,r0
extu.w r0,r0
int load_unaligned32 (const x& xx)
{
  return xx.val32;
}
currently compiles to
little endian:
mov.b @(1,r4),r0
mov.b @r4,r2
extu.b r0,r1
mov.b @(2,r4),r0
extu.b r2,r3
swap.b r1,r2
or r3,r2
extu.b r0,r3
mov r3,r0
shll16 r0
mov r0,r1
mov.b @(3,r4),r0
or r2,r1
shll16 r0
shll8 r0
or r1,r0
better:
mov.b @r4+,r0 ! r0 = xx.xx.xx.aa
mov.b @r4+,r1 ! r1 = xx.xx.xx.bb
extu.b r0,r0 ! r0 = 00.00.00.aa
mov.b @r4+,r2 ! r2 = xx.xx.xx.cc
shll8 r1 ! r1 = xx.xx.bb.00
or r1,r0 ! r0 = xx.xx.bb.aa
mov.b @r4+,r3 ! r3 = xx.xx.xx.dd
extu.b r2,r2 ! r2 = 00.00.00.cc
shll16 r0 ! r0 = bb.aa.00.00
shll8 r3 ! r3 = xx.xx.dd.00
or r3,r2 ! r2 = xx.xx.dd.cc
xtrct r2,r0 ! r0 = dd.cc.bb.aa
which is two unaligned signed 16-bit loads + shll16 + xtrct.
If the (mis)alignment offset value is known, it can be even more compact.
x0.x1.aa.bb.cc.dd.y0.y1
      ^^^^^^^^^^^
add #-2,r4
mov.l @r4,r0 ! r0 = bb.aa.x1.x0
mov.l @(4,r4),r1 ! r1 = y1.y0.dd.cc
xtrct r1,r0 ! r0 = dd.cc.bb.aa
x0.aa.bb.cc.dd.y0.y1.y2
   ^^^^^^^^^^^
add #-1,r4
mov.l @r4,r0 ! r0 = cc.bb.aa.x0
mov.l @(4,r4),r1 ! r1 = y2.y1.y0.dd
! r1:r0 = y2.y1.y0.dd : cc.bb.aa.x0
mov r0,r2
xtrct r1,r2 ! r2 = y0.dd.cc.bb
shlr8 r2 ! r2 = 00.y0.dd.cc
shll8 r0 ! r0 = bb.aa.x0.00
xtrct r1,r0 ! r0 = dd.cc.bb.aa
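In C terms, the first (offset 2) case above amounts to roughly this sketch
(it only illustrates what the insns compute; as plain C it is neither
alignment- nor aliasing-clean, and the helper name is made up):

// Known misalignment of 2, little endian: two aligned 32-bit loads around
// the value, then an xtrct-style merge of the middle 32 bits.
// 'base' points at the unaligned value (the aa byte above).
static inline unsigned int load_u32_off2 (const unsigned char* base)
{
  const unsigned int* p = (const unsigned int*)(base - 2);  // add #-2,r4
  unsigned int lo = p[0];             // mov.l @r4,r0     -> bb.aa.x1.x0
  unsigned int hi = p[1];             // mov.l @(4,r4),r1 -> y1.y0.dd.cc
  return (hi << 16) | (lo >> 16);     // xtrct r1,r0      -> dd.cc.bb.aa
}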
void store_unaligned16 (x& xx, int val)
{
  xx.val_s16 = val;
}
currently compiles to
little endian:
extu.w r5,r0
mov.b r0,@(4,r4)
shlr8 r0
mov.b r0,@(5,r4)
big endian:
extu.w r5,r5
mov r5,r1
shlr8 r1
mov r1,r0
mov.b r0,@(4,r4)
mov r5,r0
mov.b r0,@(5,r4)
better (eliminate unnecessary extu.w):
mov r5,r0
mov.b r0,@({4|5},r4)
shlr8 r0
mov.b r0,@({5|4},r4)
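Byte-wise, the proposed 16-bit store is roughly the following (sketch,
little-endian offsets; p points at the misaligned halfword):

// No extu.w is needed: each mov.b stores only the low 8 bits anyway.
static inline void store_u16_bytewise (unsigned char* p, unsigned int val)
{
  p[0] = val;         // mov.b r0,@(4,r4): low byte
  p[1] = val >> 8;    // shlr8 + mov.b r0,@(5,r4): high byte
}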
void store_unaligned32 (x& xx, int val)
{
  xx.val32 = val;
}
currently compiles to
little endian:
mov r5,r0
shlr8 r0
mov.b r5,@r4
mov.b r0,@(1,r4)
mov r5,r0
shlr16 r0
mov.b r0,@(2,r4)
mov r5,r0
shlr16 r0
shlr8 r0
mov.b r0,@(3,r4)
big endian:
mov r5,r1
mov r5,r0
shlr16 r1
shlr16 r0
shlr8 r1
mov.b r0,@(1,r4)
mov r5,r0
shlr8 r0
mov.b r0,@(2,r4)
mov r5,r0
mov.b r1,@r4
mov.b r0,@(3,r4)
better:
mov r5,r0
mov.b r0,@({0|3},r4)
shlr8 r0
mov.b r0,@({1|2},r4)
shlr8 r0
mov.b r0,@({2|1},r4)
shlr8 r0
mov.b r0,@({3|0},r4)
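And the 32-bit store, byte-wise (sketch, little-endian offsets):

// Little-endian byte-wise store matching the mov.b / shlr8 chain above.
static inline void store_u32_bytewise (unsigned char* p, unsigned int val)
{
  p[0] = val;          // LSB, struct offset 0
  p[1] = val >> 8;
  p[2] = val >> 16;
  p[3] = val >> 24;    // MSB, struct offset 3
}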