This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug target/83479] Register spilling in AVX code
- From: "bugzilla at poradnik-webmastera dot com" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Tue, 19 Dec 2017 12:50:08 +0000
- Subject: [Bug target/83479] Register spilling in AVX code
- Auto-submitted: auto-generated
- References: <bug-83479-4@http.gcc.gnu.org/bugzilla/>
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83479
--- Comment #5 from Daniel Fruzynski <bugzilla@poradnik-webmastera.com> ---
Here is also a valid AVX version; it also spills a bit. Compiled with "-O3
-march=haswell -Wall -Werror".
[code]
#include "immintrin.h"
/*
 * Straight-line AVX arithmetic over a 5x4 matrix of doubles.
 *
 * Rows 0-3 are loaded into v1..v4 and row 4 becomes the first vLastRow.
 * Every numbered step below applies the same update to each remaining row:
 *
 *     v = (v - vLastRow * vLastCol) * vSqrtRow * vSqrtCol
 *
 * where vLastCol is one element of vLastRow broadcast to all four lanes,
 * vSqrtRow is the element-wise square root of vLastRow and vSqrtCol is the
 * element-wise square root of vLastCol. After a step, the highest-numbered
 * row that was just updated is used as vLastRow for the next step.
 *
 * data: 5 rows of 4 doubles. Rows are read with _mm256_load_pd (the aligned
 *       load), so each row has to sit on a 32-byte boundary.
 * Returns element 0 of v1 after the last step.
 *
 * The v[i] subscripts and the '-' / '*' operators applied directly to
 * __m256d values rely on the GNU C vector extensions.
 *
 * NOTE(review): this is the sequence that was compiled for the spilling
 * report; reordering or restructuring it may change the generated code.
 */
double test(const double data[5][4])
{
__m256d vLastRow, vLastCol, vSqrtRow, vSqrtCol;
/* Load rows 0..3, one 256-bit vector (4 doubles) per row. */
__m256d v1 = _mm256_load_pd (&data[0][0]);
__m256d v2 = _mm256_load_pd (&data[1][0]);
__m256d v3 = _mm256_load_pd (&data[2][0]);
__m256d v4 = _mm256_load_pd (&data[3][0]);
// Step 4: vLastRow is row 4 of the input; update v1..v4 using its
// elements 0..3 respectively.
vLastRow = _mm256_load_pd (&data[4][0]);
vSqrtRow = _mm256_sqrt_pd(vLastRow);
vLastCol = _mm256_set1_pd(vLastRow[0]);
vSqrtCol = _mm256_sqrt_pd(vLastCol);
v1 = (v1 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;
vLastCol = _mm256_set1_pd(vLastRow[1]);
vSqrtCol = _mm256_sqrt_pd(vLastCol);
v2 = (v2 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;
vLastCol = _mm256_set1_pd(vLastRow[2]);
vSqrtCol = _mm256_sqrt_pd(vLastCol);
v3 = (v3 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;
vLastCol = _mm256_set1_pd(vLastRow[3]);
vSqrtCol = _mm256_sqrt_pd(vLastCol);
v4 = (v4 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;
// Step 3: vLastRow is the v4 computed above; update v1..v3 using its
// elements 0..2 respectively.
vLastRow = v4;
vSqrtRow = _mm256_sqrt_pd(vLastRow);
vLastCol = _mm256_set1_pd(vLastRow[0]);
vSqrtCol = _mm256_sqrt_pd(vLastCol);
v1 = (v1 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;
vLastCol = _mm256_set1_pd(vLastRow[1]);
vSqrtCol = _mm256_sqrt_pd(vLastCol);
v2 = (v2 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;
vLastCol = _mm256_set1_pd(vLastRow[2]);
vSqrtCol = _mm256_sqrt_pd(vLastCol);
v3 = (v3 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;
// Step 2: vLastRow is the v3 computed above; update v1 and v2 using its
// elements 0 and 1 respectively.
vLastRow = v3;
vSqrtRow = _mm256_sqrt_pd(vLastRow);
vLastCol = _mm256_set1_pd(vLastRow[0]);
vSqrtCol = _mm256_sqrt_pd(vLastCol);
v1 = (v1 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;
vLastCol = _mm256_set1_pd(vLastRow[1]);
vSqrtCol = _mm256_sqrt_pd(vLastCol);
v2 = (v2 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;
// Step 1: vLastRow is the v2 computed above; update v1 using its element 0.
vLastRow = v2;
vSqrtRow = _mm256_sqrt_pd(vLastRow);
vLastCol = _mm256_set1_pd(vLastRow[0]);
vSqrtCol = _mm256_sqrt_pd(vLastCol);
v1 = (v1 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;
/* Only lane 0 of the final v1 is returned. */
return v1[0];
}
[/code]