This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
Re: Alpha CPU-specific builtins
Richard Henderson <rth@redhat.com> writes:
> On Sat, May 25, 2002 at 10:41:02PM +0200, Falk Hueffner wrote:
> > Is anybody interested in this? If so, I could polish it a bit more
> > and post it here...
>
> Sure.
OK. I'm a total beginner with gcc, so it would be nice if you could
point me to any problems. The instructions currently implemented are
somewhat random (well, mostly those I needed :-). Other candidates
are:
umulh
ext*
ins*
msk*
ctlz
ctpop
cttz
and maybe some volatile ones like mb, although scheduling usually isn't
really important for them.
Some of the instructions can be generated by gcc (like bic), but it
sometimes misses them because of CSE.
If anybody thinks this is worthwhile, I could add the missing
instructions and write some documentation and tests...
A question: I would have thought that adding a % to the constraints
would allow gcc to optimize __builtin_alpha_minub8(1, a) to minub8
a0,1,v0, but that doesn't happen. Why?
I've also attached a small test file...
Falk
diff -upr -X X.gcc ../cvs/gcc/gcc/config/alpha/alpha-protos.h gcc-alpha-builtins/gcc/config/alpha/alpha-protos.h
--- ../cvs/gcc/gcc/config/alpha/alpha-protos.h Sun May 19 20:25:04 2002
+++ gcc-alpha-builtins/gcc/config/alpha/alpha-protos.h Sat May 25 19:37:44 2002
@@ -167,6 +167,12 @@ extern void alpha_output_mi_thunk_osf PA
HOST_WIDE_INT, tree));
#endif /* TREE CODE */
+extern void alpha_init_builtins PARAMS ((void));
+#if defined (TREE_CODE) && defined (RTX_CODE)
+extern rtx alpha_expand_builtin PARAMS ((tree, rtx, rtx,
+ enum machine_mode, int));
+#endif
+
#ifdef RTX_CODE
extern rtx unicosmk_add_call_info_word PARAMS ((rtx));
#endif
diff -upr -X X.gcc ../cvs/gcc/gcc/config/alpha/alpha.c gcc-alpha-builtins/gcc/config/alpha/alpha.c
--- ../cvs/gcc/gcc/config/alpha/alpha.c Sat May 25 19:18:12 2002
+++ gcc-alpha-builtins/gcc/config/alpha/alpha.c Sun May 26 16:48:44 2002
@@ -266,6 +266,12 @@ static void unicosmk_unique_section PARA
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
alpha_multipass_dfa_lookahead
+#undef TARGET_INIT_BUILTINS
+#define TARGET_INIT_BUILTINS alpha_init_builtins
+
+#undef TARGET_EXPAND_BUILTIN
+#define TARGET_EXPAND_BUILTIN alpha_expand_builtin
+
struct gcc_target targetm = TARGET_INITIALIZER;
/* Parse target option strings. */
@@ -5821,6 +5827,147 @@ alpha_va_arg (valist, type)
return addr;
}
+
+
+/* Builtins. */
+void
+alpha_init_builtins ()
+{
+ tree di_ftype
+ = build_function_type (long_integer_type_node, void_list_node);
+ tree di_ftype_di
+ = build_function_type (long_integer_type_node,
+ tree_cons (NULL_TREE,
+ long_integer_type_node,
+ void_list_node));
+ tree di_ftype_di_di
+ = build_function_type (long_integer_type_node,
+ tree_cons (NULL_TREE,
+ long_integer_type_node,
+ tree_cons (NULL_TREE,
+ long_integer_type_node,
+ void_list_node)));
+
+#define def_builtin(name, type, code) \
+ builtin_function ((name), (type), (code), BUILT_IN_MD, NULL)
+
+ def_builtin ("__builtin_alpha_bic", di_ftype_di_di, ALPHA_BUILTIN_BIC);
+ def_builtin ("__builtin_alpha_eqv", di_ftype_di_di, ALPHA_BUILTIN_EQV);
+ def_builtin ("__builtin_alpha_ornot", di_ftype_di_di, ALPHA_BUILTIN_ORNOT);
+ def_builtin ("__builtin_alpha_cmpbge", di_ftype_di_di, ALPHA_BUILTIN_CMPBGE);
+ def_builtin ("__builtin_alpha_extql", di_ftype_di_di, ALPHA_BUILTIN_EXTQL);
+ def_builtin ("__builtin_alpha_extqh", di_ftype_di_di, ALPHA_BUILTIN_EXTQH);
+ def_builtin ("__builtin_alpha_zap", di_ftype_di_di, ALPHA_BUILTIN_ZAP);
+ def_builtin ("__builtin_alpha_zapnot", di_ftype_di_di, ALPHA_BUILTIN_ZAPNOT);
+ def_builtin ("__builtin_alpha_amask", di_ftype_di, ALPHA_BUILTIN_AMASK);
+ def_builtin ("__builtin_alpha_implver", di_ftype, ALPHA_BUILTIN_IMPLVER);
+ def_builtin ("__builtin_alpha_rpcc", di_ftype, ALPHA_BUILTIN_RPCC);
+ def_builtin ("__builtin_alpha_minub8", di_ftype_di_di, ALPHA_BUILTIN_MINUB8);
+ def_builtin ("__builtin_alpha_minsb8", di_ftype_di_di, ALPHA_BUILTIN_MINSB8);
+ def_builtin ("__builtin_alpha_minuw4", di_ftype_di_di, ALPHA_BUILTIN_MINUW4);
+ def_builtin ("__builtin_alpha_minsw4", di_ftype_di_di, ALPHA_BUILTIN_MINSW4);
+ def_builtin ("__builtin_alpha_maxub8", di_ftype_di_di, ALPHA_BUILTIN_MAXUB8);
+ def_builtin ("__builtin_alpha_maxsb8", di_ftype_di_di, ALPHA_BUILTIN_MAXSB8);
+ def_builtin ("__builtin_alpha_maxuw4", di_ftype_di_di, ALPHA_BUILTIN_MAXUW4);
+ def_builtin ("__builtin_alpha_maxsw4", di_ftype_di_di, ALPHA_BUILTIN_MAXSW4);
+ def_builtin ("__builtin_alpha_perr", di_ftype_di_di, ALPHA_BUILTIN_PERR);
+ def_builtin ("__builtin_alpha_pklb", di_ftype_di, ALPHA_BUILTIN_PKLB);
+ def_builtin ("__builtin_alpha_pkwb", di_ftype_di, ALPHA_BUILTIN_PKWB);
+ def_builtin ("__builtin_alpha_unpkbl", di_ftype_di, ALPHA_BUILTIN_UNPKBL);
+ def_builtin ("__builtin_alpha_unpkbw", di_ftype_di, ALPHA_BUILTIN_UNPKBW);
+
+#undef def_builtin
+}
+
+/* Expand an expression EXP that calls a built-in function,
+ with result going to TARGET if that's convenient
+ (and in mode MODE if that's convenient).
+ SUBTARGET may be used as the target for computing one of EXP's operands.
+ IGNORE is nonzero if the value is to be ignored. */
+
+rtx
+alpha_expand_builtin (exp, target, subtarget, mode, ignore)
+ tree exp;
+ rtx target;
+ rtx subtarget ATTRIBUTE_UNUSED;
+ enum machine_mode mode ATTRIBUTE_UNUSED;
+ int ignore ATTRIBUTE_UNUSED;
+{
+ tree fndecl = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);
+ unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
+ enum insn_code icode;
+ tree arglist = TREE_OPERAND (exp, 1);
+ int arity = 0, width = 0;
+#define MAX_ARGS 2
+ rtx op[MAX_ARGS], pat;
+ enum machine_mode tmode;
+
+ switch (fcode)
+ {
+ case ALPHA_BUILTIN_BIC: icode = CODE_FOR_bic; break;
+ case ALPHA_BUILTIN_EQV: icode = CODE_FOR_eqv; break;
+ case ALPHA_BUILTIN_ORNOT: icode = CODE_FOR_ornot; break;
+ case ALPHA_BUILTIN_CMPBGE: icode = CODE_FOR_cmpbge; break;
+ case ALPHA_BUILTIN_EXTQL: icode = CODE_FOR_extxl_le; width = 64; break;
+ case ALPHA_BUILTIN_EXTQH: icode = CODE_FOR_extqh_le; break;
+ case ALPHA_BUILTIN_ZAP: icode = CODE_FOR_zap; break;
+ case ALPHA_BUILTIN_ZAPNOT: icode = CODE_FOR_zapnot; break;
+ case ALPHA_BUILTIN_AMASK: icode = CODE_FOR_amask; break;
+ case ALPHA_BUILTIN_IMPLVER: icode = CODE_FOR_implver; break;
+ case ALPHA_BUILTIN_RPCC: icode = CODE_FOR_rpcc; break;
+ case ALPHA_BUILTIN_MINUB8: icode = CODE_FOR_minub8; break;
+ case ALPHA_BUILTIN_MINSB8: icode = CODE_FOR_minsb8; break;
+ case ALPHA_BUILTIN_MINUW4: icode = CODE_FOR_minuw4; break;
+ case ALPHA_BUILTIN_MINSW4: icode = CODE_FOR_minsw4; break;
+ case ALPHA_BUILTIN_MAXUB8: icode = CODE_FOR_maxub8; break;
+ case ALPHA_BUILTIN_MAXSB8: icode = CODE_FOR_maxsb8; break;
+ case ALPHA_BUILTIN_MAXUW4: icode = CODE_FOR_maxuw4; break;
+ case ALPHA_BUILTIN_MAXSW4: icode = CODE_FOR_maxsw4; break;
+ case ALPHA_BUILTIN_PERR: icode = CODE_FOR_perr; break;
+ case ALPHA_BUILTIN_PKLB: icode = CODE_FOR_pklb; break;
+ case ALPHA_BUILTIN_PKWB: icode = CODE_FOR_pkwb; break;
+ case ALPHA_BUILTIN_UNPKBL: icode = CODE_FOR_unpkbl; break;
+ case ALPHA_BUILTIN_UNPKBW: icode = CODE_FOR_unpkbw; break;
+ default: internal_error("bad builtin fcode");
+ }
+
+ for (arglist = TREE_OPERAND (exp, 1); arglist;
+ arglist = TREE_CHAIN (arglist))
+ {
+ enum machine_mode opmode;
+ tree arg = TREE_VALUE (arglist);
+
+ if (arg == error_mark_node)
+ return NULL_RTX;
+ op[arity] = expand_expr (arg, NULL_RTX, VOIDmode, 0);
+ opmode = insn_data[icode].operand[1 + arity].mode;
+ if (!(*insn_data[icode].operand[1 + arity].predicate) (op[arity], opmode))
+ op[arity] = copy_to_mode_reg (opmode, op[arity]);
+ arity++;
+ }
+
+ tmode = insn_data[icode].operand[0].mode;
+ if (!target
+ || GET_MODE (target) != tmode
+ || !(*insn_data[icode].operand[0].predicate) (target, tmode))
+ target = gen_reg_rtx (tmode);
+
+ if (width)
+ pat = GEN_FCN (icode) (target, op[0], GEN_INT (width), op[1]);
+ else
+ switch (arity)
+ {
+ case 0: pat = GEN_FCN (icode) (target); break;
+ case 1: pat = GEN_FCN (icode) (target, op[0]); break;
+ case 2: pat = GEN_FCN (icode) (target, op[0], op[1]); break;
+ default: internal_error("bad builtin arity");
+ }
+ if (!pat)
+ return 0;
+ emit_insn (pat);
+
+ return target;
+ }
/* This page contains routines that are used to determine what the function
prologue and epilogue code will do and write them out. */
diff -upr -X X.gcc ../cvs/gcc/gcc/config/alpha/alpha.h gcc-alpha-builtins/gcc/config/alpha/alpha.h
--- ../cvs/gcc/gcc/config/alpha/alpha.h Tue May 21 22:59:48 2002
+++ gcc-alpha-builtins/gcc/config/alpha/alpha.h Sat May 25 19:37:44 2002
@@ -2114,3 +2114,30 @@ do { \
Used for C++ multiple inheritance. */
#define ASM_OUTPUT_MI_THUNK(FILE, THUNK_FNDECL, DELTA, FUNCTION) \
alpha_output_mi_thunk_osf (FILE, THUNK_FNDECL, DELTA, FUNCTION)
+
+enum alpha_builtins {
+ ALPHA_BUILTIN_BIC,
+ ALPHA_BUILTIN_EQV,
+ ALPHA_BUILTIN_ORNOT,
+ ALPHA_BUILTIN_CMPBGE,
+ ALPHA_BUILTIN_EXTQL,
+ ALPHA_BUILTIN_EXTQH,
+ ALPHA_BUILTIN_ZAP,
+ ALPHA_BUILTIN_ZAPNOT,
+ ALPHA_BUILTIN_AMASK,
+ ALPHA_BUILTIN_IMPLVER,
+ ALPHA_BUILTIN_RPCC,
+ ALPHA_BUILTIN_MINUB8,
+ ALPHA_BUILTIN_MINSB8,
+ ALPHA_BUILTIN_MINUW4,
+ ALPHA_BUILTIN_MINSW4,
+ ALPHA_BUILTIN_MAXUB8,
+ ALPHA_BUILTIN_MAXSB8,
+ ALPHA_BUILTIN_MAXUW4,
+ ALPHA_BUILTIN_MAXSW4,
+ ALPHA_BUILTIN_PERR,
+ ALPHA_BUILTIN_PKLB,
+ ALPHA_BUILTIN_PKWB,
+ ALPHA_BUILTIN_UNPKBL,
+ ALPHA_BUILTIN_UNPKBW
+};
diff -upr -X X.gcc ../cvs/gcc/gcc/config/alpha/alpha.md gcc-alpha-builtins/gcc/config/alpha/alpha.md
--- ../cvs/gcc/gcc/config/alpha/alpha.md Thu May 9 15:32:01 2002
+++ gcc-alpha-builtins/gcc/config/alpha/alpha.md Sat May 25 22:07:07 2002
@@ -40,6 +40,27 @@
(UNSPEC_LITUSE 12)
(UNSPEC_SIBCALL 13)
(UNSPEC_SYMBOL 14)
+ (UNSPEC_BIC 15)
+ (UNSPEC_EQV 16)
+ (UNSPEC_ORNOT 17)
+ (UNSPEC_CMPBGE 18)
+ (UNSPEC_ZAP 19)
+ (UNSPEC_ZAPNOT 20)
+ (UNSPEC_AMASK 21)
+ (UNSPEC_IMPLVER 22)
+ (UNSPEC_MINUB8 23)
+ (UNSPEC_MINSB8 24)
+ (UNSPEC_MINUW4 25)
+ (UNSPEC_MINSW4 26)
+ (UNSPEC_MAXUB8 27)
+ (UNSPEC_MAXSB8 28)
+ (UNSPEC_MAXUW4 29)
+ (UNSPEC_MAXSW4 30)
+ (UNSPEC_PERR 31)
+ (UNSPEC_PKLB 32)
+ (UNSPEC_PKWB 33)
+ (UNSPEC_UNPKBL 34)
+ (UNSPEC_UNPKBW 35)
])
;; UNSPEC_VOLATILE:
@@ -57,6 +78,7 @@
(UNSPECV_FORCE_MOV 9)
(UNSPECV_LDGP1 10)
(UNSPECV_PLDGP2 11) ; prologue ldgp
+ (UNSPECV_RPCC 12)
])
;; Where necessary, the suffixes _le and _be are used to distinguish between
@@ -6608,6 +6630,197 @@ fadd,fmul,fcpys,fdiv,fsqrt,misc,mvi,ftoi
[(const_int 2)]
""
"ldq_u $31,0($30)")
+
+;; Instructions to be emitted from __builtins.
+
+(define_insn "bic"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:DI 1 "reg_or_0_operand" "rJ")
+ (match_operand:DI 2 "reg_or_8bit_operand" "rI")]
+ UNSPEC_BIC))]
+ ""
+ "bic %r1,%2,%0"
+ [(set_attr "type" "ilog")])
+
+(define_insn "eqv"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:DI 1 "reg_or_0_operand" "rJ")
+ (match_operand:DI 2 "reg_or_8bit_operand" "rI")]
+ UNSPEC_EQV))]
+ ""
+ "eqv %r1,%2,%0"
+ [(set_attr "type" "ilog")])
+
+(define_insn "ornot"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:DI 1 "reg_or_0_operand" "rJ")
+ (match_operand:DI 2 "reg_or_8bit_operand" "rI")]
+ UNSPEC_ORNOT))]
+ ""
+ "ornot %r1,%2,%0"
+ [(set_attr "type" "ilog")])
+
+(define_insn "cmpbge"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:DI 1 "reg_or_0_operand" "rJ")
+ (match_operand:DI 2 "reg_or_8bit_operand" "rI")]
+ UNSPEC_CMPBGE))]
+ ""
+ "cmpbge %r1,%2,%0"
+ [(set_attr "type" "icmp")]) ; ??? on ev6, it's ilog
+
+(define_insn "zap"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:DI 1 "reg_or_0_operand" "rJ")
+ (match_operand:DI 2 "reg_or_8bit_operand" "rI")]
+ UNSPEC_ZAP))]
+ ""
+ "zap %r1,%2,%0"
+ [(set_attr "type" "shift")])
+
+(define_insn "zapnot"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:DI 1 "reg_or_0_operand" "rJ")
+ (match_operand:DI 2 "reg_or_8bit_operand" "rI")]
+ UNSPEC_ZAPNOT))]
+ ""
+ "zapnot %r1,%2,%0"
+ [(set_attr "type" "shift")])
+
+(define_insn "amask"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:DI 1 "reg_or_8bit_operand" "rI")]
+ UNSPEC_AMASK))]
+ ""
+ "amask %1,%0"
+ [(set_attr "type" "misc")]) ; ??? can't find the correct class
+
+(define_insn "implver"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(const_int 0)] UNSPEC_IMPLVER))]
+ ""
+ "implver %0"
+ [(set_attr "type" "misc")]) ; ??? can't find the correct class
+
+(define_insn "rpcc"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec_volatile:DI [(const_int 0)] UNSPECV_RPCC))]
+ ""
+ "rpcc %0"
+ [(set_attr "type" "misc")]) ; ??? can't find the correct class
+
+(define_insn "minub8"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:DI 1 "reg_or_0_operand" "%rJ")
+ (match_operand:DI 2 "reg_or_8bit_operand" "rI")]
+ UNSPEC_MINUB8))]
+ ""
+ "minub8 %r1,%2,%0"
+ [(set_attr "type" "mvi")])
+
+(define_insn "minsb8"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:DI 1 "reg_or_0_operand" "%rJ")
+ (match_operand:DI 2 "reg_or_8bit_operand" "rI")]
+ UNSPEC_MINSB8))]
+ ""
+ "minsb8 %r1,%2,%0"
+ [(set_attr "type" "mvi")])
+
+(define_insn "minuw4"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:DI 1 "reg_or_0_operand" "%rJ")
+ (match_operand:DI 2 "reg_or_8bit_operand" "rI")]
+ UNSPEC_MINUW4))]
+ ""
+ "minuw4 %r1,%2,%0"
+ [(set_attr "type" "mvi")])
+
+(define_insn "minsw4"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:DI 1 "reg_or_0_operand" "%rJ")
+ (match_operand:DI 2 "reg_or_8bit_operand" "rI")]
+ UNSPEC_MINSW4))]
+ ""
+ "minsw4 %r1,%2,%0"
+ [(set_attr "type" "mvi")])
+
+(define_insn "maxub8"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:DI 1 "reg_or_0_operand" "%rJ")
+ (match_operand:DI 2 "reg_or_8bit_operand" "rI")]
+ UNSPEC_MAXUB8))]
+ ""
+ "maxub8 %r1,%2,%0"
+ [(set_attr "type" "mvi")])
+
+(define_insn "maxsb8"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:DI 1 "reg_or_0_operand" "%rJ")
+ (match_operand:DI 2 "reg_or_8bit_operand" "rI")]
+ UNSPEC_MAXSB8))]
+ ""
+ "maxsb8 %r1,%2,%0"
+ [(set_attr "type" "mvi")])
+
+(define_insn "maxuw4"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:DI 1 "reg_or_0_operand" "%rJ")
+ (match_operand:DI 2 "reg_or_8bit_operand" "rI")]
+ UNSPEC_MAXUW4))]
+ ""
+ "maxuw4 %r1,%2,%0"
+ [(set_attr "type" "mvi")])
+
+(define_insn "maxsw4"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:DI 1 "reg_or_0_operand" "%rJ")
+ (match_operand:DI 2 "reg_or_8bit_operand" "rI")]
+ UNSPEC_MAXSW4))]
+ ""
+ "maxsw4 %r1,%2,%0"
+ [(set_attr "type" "mvi")])
+
+(define_insn "perr"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:DI 1 "reg_or_0_operand" "%rJ")
+ (match_operand:DI 2 "reg_or_8bit_operand" "rJ")]
+ UNSPEC_PERR))]
+ ""
+ "perr %r1,%r2,%0"
+ [(set_attr "type" "mvi")])
+
+(define_insn "pklb"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:DI 1 "reg_or_0_operand" "rJ")]
+ UNSPEC_PKLB))]
+ ""
+ "pklb %r1,%0"
+ [(set_attr "type" "mvi")])
+
+(define_insn "pkwb"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:DI 1 "reg_or_0_operand" "rJ")]
+ UNSPEC_PKWB))]
+ ""
+ "pkwb %r1,%0"
+ [(set_attr "type" "mvi")])
+
+(define_insn "unpkbl"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:DI 1 "reg_or_0_operand" "rJ")]
+ UNSPEC_UNPKBL))]
+ ""
+ "unpkbl %r1,%0"
+ [(set_attr "type" "mvi")])
+
+(define_insn "unpkbw"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:DI 1 "reg_or_0_operand" "rJ")]
+ UNSPEC_UNPKBW))]
+ ""
+ "unpkbw %r1,%0"
+ [(set_attr "type" "mvi")])
;; On Unicos/Mk we use a macro for aligning code.
#include <stdint.h>
/* Raw memory accessors used by add_pixels_clamped below.  Each one casts P
   to a pointer to the named fixed-width type and dereferences it.
   NOTE(review): presumably callers guarantee P is suitably aligned for the
   access width -- confirm.  */
/* Read a 64-bit unsigned value from P.  */
#define ldq(p) (*(const uint64_t *) (p))
/* Read a 32-bit signed value from P.  */
#define ldl(p) (*(const int32_t *) (p))
/* Write L to P as a 32-bit unsigned value.  */
#define stl(l, p) do { *(uint32_t *) (p) = (l); } while (0)
/* Two interchangeable sets of instruction wrappers.  With the condition
   below set to 1 the first set is compiled: plain C for bic and GNU inline
   asm for everything else.  Changing it to 0 selects the second set, which
   maps the same names onto the __builtin_alpha_* functions.  */
#if 1
/* A with every bit that is set in B cleared.  */
#define bic(a, b) ((a) & ~(b))
/* Each macro below is a GNU statement expression that emits the single
   instruction named in its asm template and evaluates to the uint64_t
   output operand.  The first input uses constraint "rJ" (with a leading
   "%" for minsw4/maxsw4); the second input, where present, uses "rI".  */
#define zap(a, b) ({ uint64_t __r; asm ("zap %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
#define minsw4(a, b) ({ uint64_t __r; asm ("minsw4 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
#define maxsw4(a, b) ({ uint64_t __r; asm ("maxsw4 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
#define pkwb(a) ({ uint64_t __r; asm ("pkwb %r1,%0" : "=r" (__r) : "rJ" (a)); __r; })
#define unpkbw(a) ({ uint64_t __r; asm ("unpkbw %r1,%0" : "=r" (__r) : "rJ" (a)); __r; })
#else
/* Same wrapper names, expressed through the compiler builtins instead.  */
#define bic __builtin_alpha_bic
#define zap __builtin_alpha_zap
#define minsw4 __builtin_alpha_minsw4
#define maxsw4 __builtin_alpha_maxsw4
#define pkwb __builtin_alpha_pkwb
#define unpkbw __builtin_alpha_unpkbw
#endif
/* Add signed 16-bit words to unsigned bytes, clamping each result.

   Processes 8 rows.  In every row, 8 words are read from BLOCK (as two
   64-bit loads of 4 words each) and added to 8 bytes read from PIXELS (as
   two 32-bit loads of 4 bytes each); the clamped sums are stored back over
   those 8 bytes.  After each row PIXELS advances by LINE_SIZE bytes and
   BLOCK advances by 8 words.

   NOTE(review): BLOCK and PIXELS are accessed through the ldq/ldl/stl cast
   macros, so callers presumably guarantee 8- and 4-byte alignment
   respectively -- confirm.  */
void add_pixels_clamped(const int16_t *block, uint8_t *pixels,
int line_size)
{
int row, col;
const uint64_t lane_max = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
uint64_t lane_sign = zap(-1, 0x33);
lane_sign ^= lane_sign >> 1; /* 0x8000800080008000 */
for (row = 8; row > 0; row--) {
/* Handle the row as two groups of four lanes: offsets 0 and 4.  */
for (col = 0; col < 8; col += 4) {
uint64_t coeffs = ldq(block + col);
uint64_t bytes = unpkbw(ldl(pixels + col));
/* Set the sign bit of each lane aside and clear it before adding, so
   that a lane's sum cannot carry into its neighbour; then put the
   sign bits back.  */
uint64_t sign_bits = coeffs & lane_sign;
uint64_t sum = bic(coeffs, lane_sign) + bytes;
sum ^= sign_bits;
/* Clamp: lower bound 0, upper bound LANE_MAX.  */
sum = maxsw4(sum, 0);
sum = minsw4(sum, lane_max);
stl(pkwb(sum), pixels + col);
}
pixels += line_size;
block += 8;
}
}