[Bug target/100926] New: PPCLE: Inefficient code for vec_xl_be(unsigned short *) < P9
jens.seifert at de dot ibm.com
gcc-bugzilla@gcc.gnu.org
Sat Jun 5 18:19:16 GMT 2021
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100926
Bug ID: 100926
Summary: PPCLE: Inefficient code for vec_xl_be(unsigned short
*) < P9
Product: gcc
Version: 8.3.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: jens.seifert at de dot ibm.com
Target Milestone: ---
Input:
// Reproducer for the report: loads a vector of unsigned short from c
// through vec_xl_be with a zero offset.
vector unsigned short load_be(unsigned short *c)
{
return vec_xl_be(0L, c);
}
creates:
_Z7load_bePt:
.LFB6:
.cfi_startproc
.LCF6:
0: addis 2,12,.TOC.-.LCF6@ha
addi 2,2,.TOC.-.LCF6@l
.localentry _Z7load_bePt,.-_Z7load_bePt
addis 9,2,.LC4@toc@ha
lxvw4x 34,0,3
addi 9,9,.LC4@toc@l
lvx 0,0,9
vperm 2,2,2,0
blr
Optimal sequence:
// Hand-written alternative to load_be: performs the vec_xl_be load on
// unsigned int elements and then rotates every word with a splatted
// count of -16 before reinterpreting the result as unsigned short elements.
vector unsigned short load_be_opt2(unsigned short *c)
{
vector signed int vneg16;
// Inline asm pins the constant to a single vspltisw with immediate -16;
// it has no inputs and writes only vneg16.
__asm__("vspltisw %0,-16":"=v"(vneg16));
// Same vec_xl_be load as load_be, but through an unsigned int pointer.
vector unsigned int tmp = vec_xl_be(0L, (unsigned int *)c);
// Rotate-left with a count of -16 in every word.
tmp = vec_rl(tmp, (vector unsigned int)vneg16);
return (vector unsigned short)tmp;
}
creates:
_Z12load_be_opt2Pt:
.LFB8:
.cfi_startproc
lxvw4x 34,0,3
#APP
# 77 "vec.C" 1
vspltisw 0,-16
# 0 "" 2
#NO_APP
vrlw 2,2,0
blr
Rotating left by -16 is equivalent to rotating right by +16 (and to rotating left by 16), because only the low-order 5 bits of each rotate count are evaluated.
Please note that the inline assembly is required because vec_splats(-16) is
expanded into a very inefficient constant-generation sequence, as the following variant shows:
// Variant of the hand-written sequence that builds the rotate count with
// vec_splats(-16) instead of inline asm; otherwise it performs the same
// unsigned int vec_xl_be load, vec_rl rotate and cast to unsigned short.
vector unsigned short load_be_opt(unsigned short *c)
{
// Rotate count produced through the intrinsic rather than inline asm.
vector signed int vneg16 = vec_splats(-16);
vector unsigned int tmp = vec_xl_be(0L, (unsigned int *)c);
tmp = vec_rl(tmp, (vector unsigned int)vneg16);
return (vector unsigned short)tmp;
}
creates:
_Z11load_be_optPt:
.LFB7:
.cfi_startproc
li 9,48
lxvw4x 34,0,3
vspltisw 0,0
mtvsrd 33,9
xxspltw 33,33,1
vsubuwm 0,0,1
vrlw 2,2,0
blr
More information about the Gcc-bugs
mailing list