/* Subroutines used to expand string and block move, clear,
   compare and other operations for PowerPC.
   Copyright (C) 1991-2024 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
20
#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "tm_p.h"
#include "ira.h"
#include "print-tree.h"
#include "varasm.h"
#include "explow.h"
#include "expr.h"
#include "output.h"
#include "target.h"
#include "profile-count.h"
#include "predict.h"

41/* Expand a block clear operation, and return 1 if successful. Return 0
42 if we should let the compiler generate normal code.
43
44 operands[0] is the destination
45 operands[1] is the length
46 operands[3] is the alignment */
47
48int
49expand_block_clear (rtx operands[])
50{
51 rtx orig_dest = operands[0];
52 rtx bytes_rtx = operands[1];
53 rtx align_rtx = operands[3];
2e42a52f 54 bool constp = CONST_INT_P (bytes_rtx);
8845cb37
AS
55 HOST_WIDE_INT align;
56 HOST_WIDE_INT bytes;
57 int offset;
58 int clear_bytes;
59 int clear_step;
60
61 /* If this is not a fixed size move, just call memcpy */
62 if (! constp)
63 return 0;
64
65 /* This must be a fixed size alignment */
2e42a52f 66 gcc_assert (CONST_INT_P (align_rtx));
8845cb37
AS
67 align = INTVAL (align_rtx) * BITS_PER_UNIT;
68
69 /* Anything to clear? */
70 bytes = INTVAL (bytes_rtx);
71 if (bytes <= 0)
72 return 1;
73
74 /* Use the builtin memset after a point, to avoid huge code bloat.
75 When optimize_size, avoid any significant code bloat; calling
76 memset is about 4 instructions, so allow for one instruction to
77 load zero and three to do clearing. */
3b0cb1a5 78 if (TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
8845cb37
AS
79 clear_step = 16;
80 else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
81 clear_step = 8;
82 else
83 clear_step = 4;
84
85 if (optimize_size && bytes > 3 * clear_step)
86 return 0;
87 if (! optimize_size && bytes > 8 * clear_step)
88 return 0;
89
645eee74
AS
90 bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX);
91
8845cb37
AS
92 for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
93 {
94 machine_mode mode = BLKmode;
95 rtx dest;
96
31369f5a 97 if (TARGET_ALTIVEC
645eee74 98 && (bytes >= 16 && (align >= 128 || unaligned_vsx_ok)))
8845cb37
AS
99 {
100 clear_bytes = 16;
101 mode = V4SImode;
102 }
103 else if (bytes >= 8 && TARGET_POWERPC64
104 && (align >= 64 || !STRICT_ALIGNMENT))
105 {
106 clear_bytes = 8;
107 mode = DImode;
108 if (offset == 0 && align < 64)
109 {
110 rtx addr;
111
112 /* If the address form is reg+offset with offset not a
113 multiple of four, reload into reg indirect form here
114 rather than waiting for reload. This way we get one
115 reload, not one per store. */
116 addr = XEXP (orig_dest, 0);
117 if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
2e42a52f 118 && CONST_INT_P (XEXP (addr, 1))
8845cb37
AS
119 && (INTVAL (XEXP (addr, 1)) & 3) != 0)
120 {
121 addr = copy_addr_to_reg (addr);
122 orig_dest = replace_equiv_address (orig_dest, addr);
123 }
124 }
125 }
126 else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
127 { /* move 4 bytes */
128 clear_bytes = 4;
129 mode = SImode;
130 }
131 else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
132 { /* move 2 bytes */
133 clear_bytes = 2;
134 mode = HImode;
135 }
136 else /* move 1 byte at a time */
137 {
138 clear_bytes = 1;
139 mode = QImode;
140 }
141
142 dest = adjust_address (orig_dest, mode, offset);
143
144 emit_move_insn (dest, CONST0_RTX (mode));
145 }
146
147 return 1;
148}
149
150/* Figure out the correct instructions to generate to load data for
151 block compare. MODE is used for the read from memory, and
152 data is zero extended if REG is wider than MODE. If LE code
153 is being generated, bswap loads are used.
154
155 REG is the destination register to move the data into.
156 MEM is the memory block being read.
157 MODE is the mode of memory to use for the read. */
158static void
159do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
160{
161 switch (GET_MODE (reg))
162 {
9d36bd3b
AS
163 case E_V16QImode:
164 switch (mode)
165 {
166 case E_V16QImode:
167 if (!BYTES_BIG_ENDIAN)
168 {
169 if (TARGET_P9_VECTOR)
170 emit_insn (gen_vsx_ld_elemrev_v16qi_internal (reg, mem));
171 else
172 {
173 rtx reg_v2di = simplify_gen_subreg (V2DImode, reg,
174 V16QImode, 0);
175 gcc_assert (MEM_P (mem));
176 rtx addr = XEXP (mem, 0);
177 rtx mem_v2di = gen_rtx_MEM (V2DImode, addr);
178 MEM_COPY_ATTRIBUTES (mem_v2di, mem);
179 set_mem_size (mem, GET_MODE_SIZE (V2DImode));
180 emit_insn (gen_vsx_ld_elemrev_v2di (reg_v2di, mem_v2di));
181 }
182 }
183 else
184 emit_insn (gen_vsx_movv2di_64bit (reg, mem));
185 break;
186 default:
187 gcc_unreachable ();
188 }
189 break;
4e10a5a7 190 case E_DImode:
8845cb37
AS
191 switch (mode)
192 {
4e10a5a7 193 case E_QImode:
8845cb37
AS
194 emit_insn (gen_zero_extendqidi2 (reg, mem));
195 break;
4e10a5a7 196 case E_HImode:
8845cb37
AS
197 {
198 rtx src = mem;
199 if (!BYTES_BIG_ENDIAN)
200 {
201 src = gen_reg_rtx (HImode);
202 emit_insn (gen_bswaphi2 (src, mem));
203 }
204 emit_insn (gen_zero_extendhidi2 (reg, src));
205 break;
206 }
4e10a5a7 207 case E_SImode:
8845cb37
AS
208 {
209 rtx src = mem;
210 if (!BYTES_BIG_ENDIAN)
211 {
212 src = gen_reg_rtx (SImode);
213 emit_insn (gen_bswapsi2 (src, mem));
214 }
215 emit_insn (gen_zero_extendsidi2 (reg, src));
216 }
217 break;
4e10a5a7 218 case E_DImode:
8845cb37
AS
219 if (!BYTES_BIG_ENDIAN)
220 emit_insn (gen_bswapdi2 (reg, mem));
221 else
222 emit_insn (gen_movdi (reg, mem));
223 break;
224 default:
225 gcc_unreachable ();
226 }
227 break;
228
4e10a5a7 229 case E_SImode:
8845cb37
AS
230 switch (mode)
231 {
4e10a5a7 232 case E_QImode:
8845cb37
AS
233 emit_insn (gen_zero_extendqisi2 (reg, mem));
234 break;
4e10a5a7 235 case E_HImode:
8845cb37
AS
236 {
237 rtx src = mem;
238 if (!BYTES_BIG_ENDIAN)
239 {
240 src = gen_reg_rtx (HImode);
241 emit_insn (gen_bswaphi2 (src, mem));
242 }
243 emit_insn (gen_zero_extendhisi2 (reg, src));
244 break;
245 }
4e10a5a7 246 case E_SImode:
8845cb37
AS
247 if (!BYTES_BIG_ENDIAN)
248 emit_insn (gen_bswapsi2 (reg, mem));
249 else
250 emit_insn (gen_movsi (reg, mem));
251 break;
4e10a5a7 252 case E_DImode:
8845cb37
AS
253 /* DImode is larger than the destination reg so is not expected. */
254 gcc_unreachable ();
255 break;
256 default:
257 gcc_unreachable ();
258 }
259 break;
9d36bd3b
AS
260
261 case E_QImode:
262 gcc_assert (mode == E_QImode);
263 emit_move_insn (reg, mem);
264 break;
ef4adf1f 265
8845cb37
AS
266 default:
267 gcc_unreachable ();
268 break;
269 }
270}
271
272/* Select the mode to be used for reading the next chunk of bytes
273 in the compare.
274
275 OFFSET is the current read offset from the beginning of the block.
276 BYTES is the number of bytes remaining to be read.
74f9986e 277 ALIGN is the minimum alignment of the memory blocks being compared in bytes. */
8845cb37
AS
278static machine_mode
279select_block_compare_mode (unsigned HOST_WIDE_INT offset,
280 unsigned HOST_WIDE_INT bytes,
74f9986e 281 unsigned HOST_WIDE_INT align)
8845cb37
AS
282{
283 /* First see if we can do a whole load unit
284 as that will be more efficient than a larger load + shift. */
285
286 /* If big, use biggest chunk.
287 If exactly chunk size, use that size.
288 If remainder can be done in one piece with shifting, do that.
289 Do largest chunk possible without violating alignment rules. */
290
291 /* The most we can read without potential page crossing. */
292 unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);
293
74f9986e
AS
294 /* If we have an LE target without ldbrx and word_mode is DImode,
295 then we must avoid using word_mode. */
296 int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
297 && word_mode == DImode);
298
8845cb37
AS
299 if (word_mode_ok && bytes >= UNITS_PER_WORD)
300 return word_mode;
301 else if (bytes == GET_MODE_SIZE (SImode))
302 return SImode;
303 else if (bytes == GET_MODE_SIZE (HImode))
304 return HImode;
305 else if (bytes == GET_MODE_SIZE (QImode))
306 return QImode;
307 else if (bytes < GET_MODE_SIZE (SImode)
78bd9e25 308 && !targetm.slow_unaligned_access (SImode, align * BITS_PER_UNIT)
8845cb37
AS
309 && offset >= GET_MODE_SIZE (SImode) - bytes)
310 /* This matches the case were we have SImode and 3 bytes
311 and offset >= 1 and permits us to move back one and overlap
312 with the previous read, thus avoiding having to shift
313 unwanted bytes off of the input. */
314 return SImode;
315 else if (word_mode_ok && bytes < UNITS_PER_WORD
78bd9e25 316 && !targetm.slow_unaligned_access (word_mode, align * BITS_PER_UNIT)
8845cb37
AS
317 && offset >= UNITS_PER_WORD-bytes)
318 /* Similarly, if we can use DImode it will get matched here and
319 can do an overlapping read that ends at the end of the block. */
320 return word_mode;
321 else if (word_mode_ok && maxread >= UNITS_PER_WORD)
322 /* It is safe to do all remaining in one load of largest size,
323 possibly with a shift to get rid of unwanted bytes. */
324 return word_mode;
325 else if (maxread >= GET_MODE_SIZE (SImode))
326 /* It is safe to do all remaining in one SImode load,
327 possibly with a shift to get rid of unwanted bytes. */
328 return SImode;
329 else if (bytes > GET_MODE_SIZE (SImode))
330 return SImode;
331 else if (bytes > GET_MODE_SIZE (HImode))
332 return HImode;
333
334 /* final fallback is do one byte */
335 return QImode;
336}
337
338/* Compute the alignment of pointer+OFFSET where the original alignment
339 of pointer was BASE_ALIGN. */
340static unsigned HOST_WIDE_INT
341compute_current_alignment (unsigned HOST_WIDE_INT base_align,
342 unsigned HOST_WIDE_INT offset)
343{
344 if (offset == 0)
345 return base_align;
346 return MIN (base_align, offset & -offset);
347}
348
5ec3397e
AS
349/* Prepare address and then do a load.
350
351 MODE is the mode to use for the load.
352 DEST is the destination register for the data.
353 ADDR is the address to be loaded.
354 ORIG_ADDR is the original address expression. */
355static void
356do_load_for_compare_from_addr (machine_mode mode, rtx dest, rtx addr,
357 rtx orig_addr)
358{
359 rtx mem = gen_rtx_MEM (mode, addr);
360 MEM_COPY_ATTRIBUTES (mem, orig_addr);
361 set_mem_size (mem, GET_MODE_SIZE (mode));
362 do_load_for_compare (dest, mem, mode);
363 return;
364}
365
366/* Do a branch for an if/else decision.
367
368 CMPMODE is the mode to use for the comparison.
369 COMPARISON is the rtx code for the compare needed.
370 A is the first thing to be compared.
371 B is the second thing to be compared.
372 CR is the condition code reg input, or NULL_RTX.
373 TRUE_LABEL is the label to branch to if the condition is true.
faaeebd6 374 P is the estimated branch probability for the branch.
5ec3397e
AS
375
376 The return value is the CR used for the comparison.
377 If CR is null_rtx, then a new register of CMPMODE is generated.
378 If A and B are both null_rtx, then CR must not be null, and the
379 compare is not generated so you can use this with a dot form insn. */
380
381static void
382do_ifelse (machine_mode cmpmode, rtx_code comparison,
faaeebd6 383 rtx a, rtx b, rtx cr, rtx true_label, profile_probability br_prob)
5ec3397e
AS
384{
385 gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX)
386 || (a != NULL_RTX && b != NULL_RTX));
387
388 if (cr != NULL_RTX)
389 gcc_assert (GET_MODE (cr) == cmpmode);
390 else
391 cr = gen_reg_rtx (cmpmode);
392
393 rtx label_ref = gen_rtx_LABEL_REF (VOIDmode, true_label);
394
395 if (a != NULL_RTX)
396 emit_move_insn (cr, gen_rtx_COMPARE (cmpmode, a, b));
397
398 rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx);
399
400 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx);
faaeebd6
AS
401 rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
402 add_reg_br_prob_note (j, br_prob);
5ec3397e
AS
403 JUMP_LABEL (j) = true_label;
404 LABEL_NUSES (true_label) += 1;
405}
406
407/* Emit an isel of the proper mode for DEST.
408
409 DEST is the isel destination register.
410 SRC1 is the isel source if CR is true.
411 SRC2 is the isel source if CR is false.
412 CR is the condition for the isel. */
413static void
414do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr)
415{
416 if (GET_MODE (dest) == DImode)
4ba3902e 417 emit_insn (gen_isel_cc_di (dest, cmp, src_t, src_f, cr));
5ec3397e 418 else
4ba3902e 419 emit_insn (gen_isel_cc_si (dest, cmp, src_t, src_f, cr));
5ec3397e
AS
420}
421
422/* Emit a subtract of the proper mode for DEST.
423
424 DEST is the destination register for the subtract.
425 SRC1 is the first subtract input.
426 SRC2 is the second subtract input.
427
428 Computes DEST = SRC1-SRC2. */
429static void
430do_sub3 (rtx dest, rtx src1, rtx src2)
431{
432 if (GET_MODE (dest) == DImode)
433 emit_insn (gen_subdi3 (dest, src1, src2));
434 else
435 emit_insn (gen_subsi3 (dest, src1, src2));
436}
437
438/* Emit an add of the proper mode for DEST.
439
440 DEST is the destination register for the add.
441 SRC1 is the first add input.
442 SRC2 is the second add input.
443
444 Computes DEST = SRC1+SRC2. */
445static void
446do_add3 (rtx dest, rtx src1, rtx src2)
447{
448 if (GET_MODE (dest) == DImode)
449 emit_insn (gen_adddi3 (dest, src1, src2));
450 else
451 emit_insn (gen_addsi3 (dest, src1, src2));
452}
453
f7e94dfb
AS
454/* Emit an and of the proper mode for DEST.
455
456 DEST is the destination register for the and.
457 SRC1 is the first and input.
458 SRC2 is the second and input.
459
460 Computes DEST = SRC1&SRC2. */
461static void
462do_and3 (rtx dest, rtx src1, rtx src2)
463{
464 if (GET_MODE (dest) == DImode)
465 emit_insn (gen_anddi3 (dest, src1, src2));
466 else
467 emit_insn (gen_andsi3 (dest, src1, src2));
468}
469
470/* Emit an cmpb of the proper mode for DEST.
471
472 DEST is the destination register for the cmpb.
473 SRC1 is the first input.
474 SRC2 is the second input.
475
476 Computes cmpb of SRC1, SRC2. */
477static void
478do_cmpb3 (rtx dest, rtx src1, rtx src2)
479{
480 if (GET_MODE (dest) == DImode)
481 emit_insn (gen_cmpbdi3 (dest, src1, src2));
482 else
483 emit_insn (gen_cmpbsi3 (dest, src1, src2));
484}
485
486/* Emit a rotl of the proper mode for DEST.
487
488 DEST is the destination register for the and.
489 SRC1 is the first and input.
490 SRC2 is the second and input.
491
492 Computes DEST = SRC1 rotated left by SRC2. */
493static void
494do_rotl3 (rtx dest, rtx src1, rtx src2)
495{
496 if (GET_MODE (dest) == DImode)
497 emit_insn (gen_rotldi3 (dest, src1, src2));
498 else
499 emit_insn (gen_rotlsi3 (dest, src1, src2));
500}
501
5ec3397e
AS
502/* Generate rtl for a load, shift, and compare of less than a full word.
503
504 LOAD_MODE is the machine mode for the loads.
505 DIFF is the reg for the difference.
506 CMP_REM is the reg containing the remaining bytes to compare.
507 DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
508 SRC1_ADDR is the first source address.
509 SRC2_ADDR is the second source address.
510 ORIG_SRC1 is the original first source block's address rtx.
511 ORIG_SRC2 is the original second source block's address rtx. */
512static void
513do_load_mask_compare (const machine_mode load_mode, rtx diff, rtx cmp_rem, rtx dcond,
514 rtx src1_addr, rtx src2_addr, rtx orig_src1, rtx orig_src2)
515{
516 HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
517 rtx shift_amount = gen_reg_rtx (word_mode);
518 rtx d1 = gen_reg_rtx (word_mode);
519 rtx d2 = gen_reg_rtx (word_mode);
520
521 do_load_for_compare_from_addr (load_mode, d1, src1_addr, orig_src1);
522 do_load_for_compare_from_addr (load_mode, d2, src2_addr, orig_src2);
523 do_sub3 (shift_amount, GEN_INT (load_mode_size), cmp_rem);
524
525 if (word_mode == DImode)
526 {
527 emit_insn (gen_ashldi3 (shift_amount, shift_amount,
528 GEN_INT (LOG2_BITS_PER_UNIT)));
529 emit_insn (gen_lshrdi3 (d1, d1,
530 gen_lowpart (SImode, shift_amount)));
531 emit_insn (gen_lshrdi3 (d2, d2,
532 gen_lowpart (SImode, shift_amount)));
533 }
534 else
535 {
536 emit_insn (gen_ashlsi3 (shift_amount, shift_amount,
537 GEN_INT (LOG2_BITS_PER_UNIT)));
538 emit_insn (gen_lshrsi3 (d1, d1, shift_amount));
539 emit_insn (gen_lshrsi3 (d2, d2, shift_amount));
540 }
541
542 if (TARGET_P9_MISC)
543 {
544 /* Generate a compare, and convert with a setb later. */
545 rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
546 emit_insn (gen_rtx_SET (dcond, cmp));
547 }
548 else
549 {
550 if (word_mode == DImode)
551 emit_insn (gen_subfdi3_carry (diff, d2, d1));
552 else
553 emit_insn (gen_subfsi3_carry (diff, d2, d1));
554 }
555}
556
557/* Generate rtl for an overlapping load and compare of less than a
558 full load_mode. This assumes that the previous word is part of the
559 block being compared so it's ok to back up part of a word so we can
560 compare the last unaligned full word that ends at the end of the block.
561
562 LOAD_MODE is the machine mode for the loads.
563 ISCONST tells whether the remaining length is a constant or in a register.
564 BYTES_REM is the remaining length if ISCONST is true.
565 DIFF is the reg for the difference.
566 CMP_REM is the reg containing the remaining bytes to compare if !ISCONST.
567 DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
568 SRC1_ADDR is the first source address.
569 SRC2_ADDR is the second source address.
570 ORIG_SRC1 is the original first source block's address rtx.
571 ORIG_SRC2 is the original second source block's address rtx. */
572static void
573do_overlap_load_compare (machine_mode load_mode, bool isConst,
574 HOST_WIDE_INT bytes_rem, rtx diff,
575 rtx cmp_rem, rtx dcond, rtx src1_addr, rtx src2_addr,
576 rtx orig_src1, rtx orig_src2)
577{
578 HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
579 HOST_WIDE_INT addr_adj = load_mode_size - bytes_rem;
580 rtx d1 = gen_reg_rtx (word_mode);
581 rtx d2 = gen_reg_rtx (word_mode);
582
583 rtx addr1, addr2;
584 if (!isConst || addr_adj)
585 {
586 rtx adj_reg = gen_reg_rtx (word_mode);
587 if (isConst)
588 emit_move_insn (adj_reg, GEN_INT (-addr_adj));
589 else
590 {
591 rtx reg_lms = gen_reg_rtx (word_mode);
592 emit_move_insn (reg_lms, GEN_INT (load_mode_size));
593 do_sub3 (adj_reg, cmp_rem, reg_lms);
594 }
595
596 addr1 = gen_rtx_PLUS (word_mode, src1_addr, adj_reg);
597 addr2 = gen_rtx_PLUS (word_mode, src2_addr, adj_reg);
598 }
599 else
600 {
601 addr1 = src1_addr;
602 addr2 = src2_addr;
603 }
604
605 do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1);
606 do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2);
607
608 if (TARGET_P9_MISC)
609 {
610 /* Generate a compare, and convert with a setb later. */
611 rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
612 emit_insn (gen_rtx_SET (dcond, cmp));
613 }
614 else
615 {
616 if (word_mode == DImode)
617 emit_insn (gen_subfdi3_carry (diff, d2, d1));
618 else
619 emit_insn (gen_subfsi3_carry (diff, d2, d1));
620 }
621}
622
37ae4739
AS
623/* Generate the sequence of compares for strcmp/strncmp using vec/vsx
624 instructions.
625
626 BYTES_TO_COMPARE is the number of bytes to be compared.
627 ORIG_SRC1 is the unmodified rtx for the first string.
628 ORIG_SRC2 is the unmodified rtx for the second string.
629 S1ADDR is the register to use for the base address of the first string.
630 S2ADDR is the register to use for the base address of the second string.
631 OFF_REG is the register to use for the string offset for loads.
632 S1DATA is the register for loading the first string.
633 S2DATA is the register for loading the second string.
634 VEC_RESULT is the rtx for the vector result indicating the byte difference.
635 EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
636 to strcmp/strncmp if we have equality at the end of the inline comparison.
637 P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
638 to clean up and generate the final comparison result.
639 FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
640 set the final result.
641 CHECKZERO indicates whether the sequence should check for zero bytes
642 for use doing strncmp, or not (for use doing memcmp). */
643static void
644expand_cmp_vec_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
645 rtx orig_src1, rtx orig_src2,
646 rtx s1addr, rtx s2addr, rtx off_reg,
647 rtx s1data, rtx s2data, rtx vec_result,
648 bool equality_compare_rest, rtx *p_cleanup_label,
649 rtx final_move_label, bool checkzero)
650{
651 machine_mode load_mode;
652 unsigned int load_mode_size;
653 unsigned HOST_WIDE_INT cmp_bytes = 0;
654 unsigned HOST_WIDE_INT offset = 0;
655 rtx zero_reg = NULL;
656
657 gcc_assert (p_cleanup_label != NULL);
658 rtx cleanup_label = *p_cleanup_label;
659
660 emit_move_insn (s1addr, force_reg (Pmode, XEXP (orig_src1, 0)));
661 emit_move_insn (s2addr, force_reg (Pmode, XEXP (orig_src2, 0)));
662
663 if (checkzero && !TARGET_P9_VECTOR)
664 {
665 zero_reg = gen_reg_rtx (V16QImode);
666 emit_move_insn (zero_reg, CONST0_RTX (V16QImode));
667 }
668
669 while (bytes_to_compare > 0)
670 {
671 /* VEC/VSX compare sequence for P8:
672 check each 16B with:
673 lxvd2x 32,28,8
674 lxvd2x 33,29,8
675 vcmpequb 2,0,1 # compare strings
676 vcmpequb 4,0,3 # compare w/ 0
677 xxlorc 37,36,34 # first FF byte is either mismatch or end of string
678 vcmpequb. 7,5,3 # reg 7 contains 0
679 bnl 6,.Lmismatch
680
681 For the P8 LE case, we use lxvd2x and compare full 16 bytes
700d4cb0 682 but then use vgbbd and a shift to get two bytes with the
37ae4739
AS
683 information we need in the correct order.
684
685 VEC/VSX compare sequence if TARGET_P9_VECTOR:
686 lxvb16x/lxvb16x # load 16B of each string
687 vcmpnezb. # produces difference location or zero byte location
688 bne 6,.Lmismatch
689
690 Use the overlapping compare trick for the last block if it is
691 less than 16 bytes.
692 */
693
694 load_mode = V16QImode;
695 load_mode_size = GET_MODE_SIZE (load_mode);
696
697 if (bytes_to_compare >= load_mode_size)
698 cmp_bytes = load_mode_size;
699 else
700 {
701 /* Move this load back so it doesn't go past the end. P8/P9
702 can do this efficiently. This is never called with less
703 than 16 bytes so we should always be able to do this. */
704 unsigned int extra_bytes = load_mode_size - bytes_to_compare;
705 cmp_bytes = bytes_to_compare;
706 gcc_assert (offset > extra_bytes);
707 offset -= extra_bytes;
708 cmp_bytes = load_mode_size;
709 bytes_to_compare = cmp_bytes;
710 }
711
712 /* The offset currently used is always kept in off_reg so that the
713 cleanup code on P8 can use it to extract the differing byte. */
714 emit_move_insn (off_reg, GEN_INT (offset));
715
716 rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg);
717 do_load_for_compare_from_addr (load_mode, s1data, addr1, orig_src1);
718 rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg);
719 do_load_for_compare_from_addr (load_mode, s2data, addr2, orig_src2);
720
721 /* Cases to handle. A and B are chunks of the two strings.
722 1: Not end of comparison:
723 A != B: branch to cleanup code to compute result.
724 A == B: next block
725 2: End of the inline comparison:
726 A != B: branch to cleanup code to compute result.
727 A == B: call strcmp/strncmp
728 3: compared requested N bytes:
729 A == B: branch to result 0.
730 A != B: cleanup code to compute result. */
731
732 unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;
733
734 if (checkzero)
735 {
736 if (TARGET_P9_VECTOR)
737 emit_insn (gen_vcmpnezb_p (vec_result, s1data, s2data));
738 else
739 {
740 /* Emit instructions to do comparison and zero check. */
741 rtx cmp_res = gen_reg_rtx (load_mode);
742 rtx cmp_zero = gen_reg_rtx (load_mode);
743 rtx cmp_combined = gen_reg_rtx (load_mode);
744 emit_insn (gen_altivec_eqv16qi (cmp_res, s1data, s2data));
745 emit_insn (gen_altivec_eqv16qi (cmp_zero, s1data, zero_reg));
746 emit_insn (gen_orcv16qi3 (vec_result, cmp_zero, cmp_res));
747 emit_insn (gen_altivec_vcmpequb_p (cmp_combined, vec_result, zero_reg));
748 }
749 }
750 else
751 emit_insn (gen_altivec_vcmpequb_p (vec_result, s1data, s2data));
752
753 bool branch_to_cleanup = (remain > 0 || equality_compare_rest);
754 rtx cr6 = gen_rtx_REG (CCmode, CR6_REGNO);
755 rtx dst_label;
756 rtx cmp_rtx;
757 if (branch_to_cleanup)
758 {
759 /* Branch to cleanup code, otherwise fall through to do more
760 compares. P8 and P9 use different CR bits because on P8
761 we are looking at the result of a comparsion vs a
762 register of zeroes so the all-true condition means no
763 difference or zero was found. On P9, vcmpnezb sets a byte
764 to 0xff if there is a mismatch or zero, so the all-false
765 condition indicates we found no difference or zero. */
766 if (!cleanup_label)
767 cleanup_label = gen_label_rtx ();
768 dst_label = cleanup_label;
769 if (TARGET_P9_VECTOR && checkzero)
770 cmp_rtx = gen_rtx_NE (VOIDmode, cr6, const0_rtx);
771 else
772 cmp_rtx = gen_rtx_GE (VOIDmode, cr6, const0_rtx);
773 }
774 else
775 {
776 /* Branch to final return or fall through to cleanup,
777 result is already set to 0. */
778 dst_label = final_move_label;
779 if (TARGET_P9_VECTOR && checkzero)
780 cmp_rtx = gen_rtx_EQ (VOIDmode, cr6, const0_rtx);
781 else
782 cmp_rtx = gen_rtx_LT (VOIDmode, cr6, const0_rtx);
783 }
784
785 rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
786 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
787 lab_ref, pc_rtx);
faaeebd6
AS
788 rtx_insn *j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
789 add_reg_br_prob_note (j2, profile_probability::likely ());
37ae4739
AS
790 JUMP_LABEL (j2) = dst_label;
791 LABEL_NUSES (dst_label) += 1;
792
793 offset += cmp_bytes;
794 bytes_to_compare -= cmp_bytes;
795 }
796 *p_cleanup_label = cleanup_label;
797 return;
798}
799
800/* Generate the final sequence that identifies the differing
801 byte and generates the final result, taking into account
802 zero bytes:
803
804 P8:
805 vgbbd 0,0
806 vsldoi 0,0,0,9
807 mfvsrd 9,32
808 addi 10,9,-1 # count trailing zero bits
809 andc 9,10,9
810 popcntd 9,9
811 lbzx 10,28,9 # use that offset to load differing byte
812 lbzx 3,29,9
813 subf 3,3,10 # subtract for final result
814
815 P9:
816 vclzlsbb # counts trailing bytes with lsb=0
817 vextublx # extract differing byte
818
819 STR1 is the reg rtx for data from string 1.
820 STR2 is the reg rtx for data from string 2.
821 RESULT is the reg rtx for the comparison result.
822 S1ADDR is the register to use for the base address of the first string.
823 S2ADDR is the register to use for the base address of the second string.
824 ORIG_SRC1 is the unmodified rtx for the first string.
825 ORIG_SRC2 is the unmodified rtx for the second string.
826 OFF_REG is the register to use for the string offset for loads.
827 VEC_RESULT is the rtx for the vector result indicating the byte difference. */
828
829static void
830emit_final_compare_vec (rtx str1, rtx str2, rtx result,
831 rtx s1addr, rtx s2addr,
832 rtx orig_src1, rtx orig_src2,
833 rtx off_reg, rtx vec_result)
834{
835
836 if (TARGET_P9_VECTOR)
837 {
838 rtx diffix = gen_reg_rtx (SImode);
839 rtx chr1 = gen_reg_rtx (SImode);
840 rtx chr2 = gen_reg_rtx (SImode);
841 rtx chr1_di = simplify_gen_subreg (DImode, chr1, SImode, 0);
842 rtx chr2_di = simplify_gen_subreg (DImode, chr2, SImode, 0);
843 emit_insn (gen_vclzlsbb_v16qi (diffix, vec_result));
844 emit_insn (gen_vextublx (chr1, diffix, str1));
845 emit_insn (gen_vextublx (chr2, diffix, str2));
846 do_sub3 (result, chr1_di, chr2_di);
847 }
848 else
849 {
850 gcc_assert (TARGET_P8_VECTOR);
851 rtx diffix = gen_reg_rtx (DImode);
852 rtx result_gbbd = gen_reg_rtx (V16QImode);
853 /* Since each byte of the input is either 00 or FF, the bytes in
854 dw0 and dw1 after vgbbd are all identical to each other. */
855 emit_insn (gen_p8v_vgbbd (result_gbbd, vec_result));
856 /* For LE, we shift by 9 and get BA in the low two bytes then CTZ.
857 For BE, we shift by 7 and get AB in the high two bytes then CLZ. */
858 rtx result_shifted = gen_reg_rtx (V16QImode);
859 int shift_amt = (BYTES_BIG_ENDIAN) ? 7 : 9;
860 emit_insn (gen_altivec_vsldoi_v16qi (result_shifted, result_gbbd,
861 result_gbbd, GEN_INT (shift_amt)));
862
863 rtx diffix_df = simplify_gen_subreg (DFmode, diffix, DImode, 0);
864 emit_insn (gen_p8_mfvsrd_3_v16qi (diffix_df, result_shifted));
865 rtx count = gen_reg_rtx (DImode);
866
867 if (BYTES_BIG_ENDIAN)
868 emit_insn (gen_clzdi2 (count, diffix));
869 else
870 emit_insn (gen_ctzdi2 (count, diffix));
871
872 /* P8 doesn't have a good solution for extracting one byte from
873 a vsx reg like vextublx on P9 so we just compute the offset
874 of the differing byte and load it from each string. */
875 do_add3 (off_reg, off_reg, count);
876
877 rtx chr1 = gen_reg_rtx (QImode);
878 rtx chr2 = gen_reg_rtx (QImode);
879 rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg);
880 do_load_for_compare_from_addr (QImode, chr1, addr1, orig_src1);
881 rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg);
882 do_load_for_compare_from_addr (QImode, chr2, addr2, orig_src2);
883 machine_mode rmode = GET_MODE (result);
884 rtx chr1_rm = simplify_gen_subreg (rmode, chr1, QImode, 0);
885 rtx chr2_rm = simplify_gen_subreg (rmode, chr2, QImode, 0);
886 do_sub3 (result, chr1_rm, chr2_rm);
887 }
888
889 return;
890}
891
/* Expand a block compare operation using loop code, and return true
   if successful.  Return false if we should let the compiler generate
   normal code, probably a memcmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_compare_loop (rtx operands[])
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];

  /* This case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit
     carry and so we must emit code to calculate it ourselves.
     We choose not to implement this yet.  */
  if (TARGET_32BIT && TARGET_POWERPC64)
    return false;

  /* Allow non-const length.  */
  int bytes_is_const = CONST_INT_P (bytes_rtx);

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  HOST_WIDE_INT align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
  HOST_WIDE_INT align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
  HOST_WIDE_INT minalign = MIN (align1, align2);

  /* P7 does not handle overlapping unaligned loads well (see the
     remainder-strategy selection below), so remember it here.  */
  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to move?  */
  HOST_WIDE_INT bytes = 0;
  if (bytes_is_const)
    bytes = INTVAL (bytes_rtx);

  if (bytes_is_const && bytes == 0)
    return true;

  /* Limit the amount we compare, if known statically.  */
  HOST_WIDE_INT max_bytes;
  switch (rs6000_tune)
    {
    case PROCESSOR_POWER7:
      if (!bytes_is_const)
	if (minalign < 8)
	  max_bytes = 0;
	else
	  max_bytes = 128;
      else
	if (minalign < 8)
	  max_bytes = 32;
	else
	  max_bytes = 128;
      break;
    case PROCESSOR_POWER8:
      if (!bytes_is_const)
	max_bytes = 0;
      else
	if (minalign < 8)
	  max_bytes = 128;
	else
	  max_bytes = 64;
      break;
    case PROCESSOR_POWER9:
    case PROCESSOR_POWER10:
    case PROCESSOR_POWER11:
      if (bytes_is_const)
	max_bytes = 191;
      else
	max_bytes = 0;
      break;
    default:
      max_bytes = 128;
    }

  /* Allow the option to override the default.  */
  if (rs6000_block_compare_inline_loop_limit >= 0)
    max_bytes = (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_loop_limit;

  if (max_bytes == 0)
    return false;

  rtx cmp_rem = gen_reg_rtx (word_mode);  /* Remainder for library call.  */
  rtx loop_cmp = gen_reg_rtx (word_mode); /* Actual amount compared by loop.  */
  HOST_WIDE_INT niter;
  rtx iter = gen_reg_rtx (word_mode);
  rtx iv1 = gen_reg_rtx (word_mode);
  rtx iv2 = gen_reg_rtx (word_mode);
  rtx d1_1 = gen_reg_rtx (word_mode);  /* Addr expression src1+iv1 */
  rtx d1_2 = gen_reg_rtx (word_mode);  /* Addr expression src1+iv2 */
  rtx d2_1 = gen_reg_rtx (word_mode);  /* Addr expression src2+iv1 */
  rtx d2_2 = gen_reg_rtx (word_mode);  /* Addr expression src2+iv2 */

  /* Strip unneeded subreg from length if there is one.  */
  if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx))
    bytes_rtx = SUBREG_REG (bytes_rtx);
  /* Extend bytes_rtx to word_mode if needed.  But, we expect only to
     maybe have to deal with the case where bytes_rtx is SImode and
     word_mode is DImode.  */
  if (!bytes_is_const)
    {
      if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
	/* Do not expect length longer than word_mode.  */
	return false;
      else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode))
	{
	  bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
	  bytes_rtx = force_reg (word_mode,
				 gen_rtx_fmt_e (ZERO_EXTEND, word_mode,
						bytes_rtx));
	}
      else
	/* Make sure it's in a register before we get started.  */
	bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
    }

  machine_mode load_mode = word_mode;
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);

  /* Number of bytes per iteration of the unrolled loop.  */
  HOST_WIDE_INT loop_bytes = 2 * load_mode_size;
  /* max iters and bytes compared in the loop.  */
  HOST_WIDE_INT max_loop_iter = max_bytes / loop_bytes;
  HOST_WIDE_INT max_loop_bytes = max_loop_iter * loop_bytes;
  int l2lb = floor_log2 (loop_bytes);

  if (bytes_is_const && (max_bytes < load_mode_size
			 || !IN_RANGE (bytes, load_mode_size, max_bytes)))
    return false;

  bool no_remainder_code = false;
  rtx final_label = gen_label_rtx ();
  rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
  rtx diff_label = gen_label_rtx ();
  rtx library_call_label = NULL;
  rtx cleanup_label = gen_label_rtx ();

  rtx cr;

  rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0));
  rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0));

  /* Difference found is stored here before jump to diff_label.  */
  rtx diff = gen_reg_rtx (word_mode);
  rtx_insn *j;

  /* Example of generated code for 35 bytes aligned 1 byte.

	     mtctr 8
	     li 6,0
	     li 5,8
     .L13:
	     ldbrx 7,3,6
	     ldbrx 9,10,6
	     ldbrx 0,3,5
	     ldbrx 4,10,5
	     addi 6,6,16
	     addi 5,5,16
	     subfc. 9,9,7
	     bne 0,.L10
	     subfc. 9,4,0
	     bdnzt 2,.L13
	     bne 0,.L10
	     add 3,3,6
	     add 10,10,6
	     addi 9,3,-5
	     ldbrx 7,0,9
	     addi 9,10,-5
	     ldbrx 9,0,9
	     subfc 9,9,7
	     .p2align 4,,15
     .L10:
	     popcntd 9,9
	     subfe 10,10,10
	     or 9,9,10

     Compiled with -fno-reorder-blocks for clarity.  */

  /* Structure of what we're going to do:
     Two separate lengths: what we will compare before bailing to library
	call (max_bytes), and the total length to be checked.
     if length <= 16, branch to linear cleanup code starting with
	remainder length check (length not known at compile time)
     set up 2 iv's and load count reg, compute remainder length
     unrollx2 compare loop
     if loop exit due to a difference, branch to difference handling code
     if remainder length < 8, branch to final cleanup compare
     load and compare 8B
     final cleanup comparison (depends on alignment and length)
	load 8B, shift off bytes past length, compare
	load 8B ending at last byte and compare
	load/compare 1 byte at a time (short block abutting 4k boundary)
     difference handling, 64->32 conversion
     final result
     branch around memcmp call
     memcmp library call
  */

  /* If bytes is not const, compare length and branch directly
     to the cleanup code that can handle 0-16 bytes if length
     is >= 16.  Stash away bytes-max_bytes for the library call.  */
  if (bytes_is_const)
    {
      /* These need to be set for some of the places we may jump to.  */
      if (bytes > max_bytes)
	{
	  no_remainder_code = true;
	  niter = max_loop_iter;
	  library_call_label = gen_label_rtx ();
	}
      else
	{
	  niter = bytes / loop_bytes;
	}
      emit_move_insn (iter, GEN_INT (niter));
      emit_move_insn (loop_cmp, GEN_INT (niter * loop_bytes));
      emit_move_insn (cmp_rem, GEN_INT (bytes - niter * loop_bytes));
    }
  else
    {
      library_call_label = gen_label_rtx ();

      /* If we go to the cleanup code, it expects length to be in cmp_rem.  */
      emit_move_insn (cmp_rem, bytes_rtx);

      /* Check for > max_bytes bytes.  We want to bail out as quickly as
	 possible if we have to go over to memcmp.  */
      do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes),
		 NULL_RTX, library_call_label, profile_probability::even ());

      /* Check for < loop_bytes bytes.  */
      do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes),
		 NULL_RTX, cleanup_label, profile_probability::even ());

      /* Loop compare bytes and iterations if bytes>max_bytes.  */
      rtx mb_reg = gen_reg_rtx (word_mode);
      emit_move_insn (mb_reg, GEN_INT (max_loop_bytes));
      rtx mi_reg = gen_reg_rtx (word_mode);
      emit_move_insn (mi_reg, GEN_INT (max_loop_iter));

      /* Compute number of loop iterations if bytes <= max_bytes.  */
      if (word_mode == DImode)
	emit_insn (gen_lshrdi3 (iter, bytes_rtx, GEN_INT (l2lb)));
      else
	emit_insn (gen_lshrsi3 (iter, bytes_rtx, GEN_INT (l2lb)));

      /* Compute bytes to compare in loop if bytes <= max_bytes.  */
      rtx mask = GEN_INT (HOST_WIDE_INT_M1U << l2lb);
      if (word_mode == DImode)
	{
	  emit_insn (gen_anddi3 (loop_cmp, bytes_rtx, mask));
	}
      else
	{
	  emit_insn (gen_andsi3 (loop_cmp, bytes_rtx, mask));
	}

      /* Check for bytes <= max_bytes.  */
      if (TARGET_ISEL)
	{
	  /* P9 has fast isel so we use one compare and two isel.  */
	  cr = gen_reg_rtx (CCmode);
	  rtx compare_rtx = gen_rtx_COMPARE (CCmode, bytes_rtx,
					     GEN_INT (max_bytes));
	  emit_move_insn (cr, compare_rtx);
	  rtx cmp_rtx = gen_rtx_LE (VOIDmode, cr, const0_rtx);
	  do_isel (loop_cmp, cmp_rtx, loop_cmp, mb_reg, cr);
	  do_isel (iter, cmp_rtx, iter, mi_reg, cr);
	}
      else
	{
	  rtx lab_after = gen_label_rtx ();
	  do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes),
		     NULL_RTX, lab_after, profile_probability::even ());
	  emit_move_insn (loop_cmp, mb_reg);
	  emit_move_insn (iter, mi_reg);
	  emit_label (lab_after);
	}

      /* Now compute remainder bytes which isn't used until after the loop.  */
      do_sub3 (cmp_rem, bytes_rtx, loop_cmp);
    }

  rtx dcond = NULL_RTX; /* Used for when we jump to diff_label.  */
  /* For p9 we need to have just one of these as multiple places define
     it and it gets used by the setb at the end.  */
  if (TARGET_P9_MISC)
    dcond = gen_reg_rtx (CCUNSmode);

  if (!bytes_is_const || bytes >= loop_bytes)
    {
      /* It should not be possible to come here if remaining bytes is
	 < 16 in the runtime case either.  Compute number of loop
	 iterations.  We compare 2*word_mode per iteration so 16B for
	 64-bit code and 8B for 32-bit.  Set up two induction
	 variables and load count register.  */

      /* HACK ALERT: create hard reg for CTR here.  If we just use a
	 pseudo, cse will get rid of it and then the allocator will
	 see it used in the lshr above and won't give us ctr.  */
      rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
      emit_move_insn (ctr, iter);
      emit_move_insn (diff, GEN_INT (0));
      emit_move_insn (iv1, GEN_INT (0));
      emit_move_insn (iv2, GEN_INT (load_mode_size));

      /* inner loop to compare 2*word_mode */
      rtx loop_top_label = gen_label_rtx ();
      emit_label (loop_top_label);

      rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1);
      rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1);

      do_load_for_compare_from_addr (load_mode, d1_1,
				     src1_ix1, orig_src1);
      do_load_for_compare_from_addr (load_mode, d2_1,
				     src2_ix1, orig_src2);
      do_add3 (iv1, iv1, GEN_INT (loop_bytes));

      rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2);
      rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2);

      do_load_for_compare_from_addr (load_mode, d1_2,
				     src1_ix2, orig_src1);
      do_load_for_compare_from_addr (load_mode, d2_2,
				     src2_ix2, orig_src2);
      do_add3 (iv2, iv2, GEN_INT (loop_bytes));

      if (TARGET_P9_MISC)
	{
	  /* Generate a compare, and convert with a setb later.  */
	  rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
	  emit_insn (gen_rtx_SET (dcond, cmp));
	}
      else
	{
	  dcond = gen_reg_rtx (CCmode);
	  if (word_mode == DImode)
	    emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	  else
	    emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	}

      do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
		 dcond, diff_label, profile_probability::unlikely ());

      if (TARGET_P9_MISC)
	{
	  /* Generate a compare, and convert with a setb later.  */
	  rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_2, d2_2);
	  emit_insn (gen_rtx_SET (dcond, cmp));
	}
      else
	{
	  dcond = gen_reg_rtx (CCmode);
	  if (word_mode == DImode)
	    emit_insn (gen_subfdi3_carry_dot2 (diff, d2_2, d1_2, dcond));
	  else
	    emit_insn (gen_subfsi3_carry_dot2 (diff, d2_2, d1_2, dcond));
	}

      rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2);
      if (TARGET_64BIT)
	j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr,
					   eqrtx, dcond));
      else
	j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr,
					   eqrtx, dcond));
      add_reg_br_prob_note (j, profile_probability::likely ());
      JUMP_LABEL (j) = loop_top_label;
      LABEL_NUSES (loop_top_label) += 1;
    }

  HOST_WIDE_INT bytes_remaining = 0;
  if (bytes_is_const)
    bytes_remaining = (bytes % loop_bytes);

  /* If diff is nonzero, branch to difference handling
     code.  If we exit here with a nonzero diff, it is
     because the second word differed.  */
  if (TARGET_P9_MISC)
    do_ifelse (CCUNSmode, NE, NULL_RTX, NULL_RTX, dcond,
	       diff_label, profile_probability::unlikely ());
  else
    do_ifelse (CCmode, NE, diff, const0_rtx, NULL_RTX,
	       diff_label, profile_probability::unlikely ());

  if (library_call_label != NULL && bytes_is_const && bytes > max_bytes)
    {
      /* If the length is known at compile time, then we will always
	 have a remainder to go to the library call with.  */
      rtx library_call_ref = gen_rtx_LABEL_REF (VOIDmode, library_call_label);
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, library_call_ref));
      JUMP_LABEL (j) = library_call_label;
      LABEL_NUSES (library_call_label) += 1;
      emit_barrier ();
    }

  if (bytes_is_const && bytes_remaining == 0)
    {
      /* No remainder and if we are here then diff is 0 so just return 0 */
      if (TARGET_64BIT)
	emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
      else
	emit_move_insn (target, diff);
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
      JUMP_LABEL (j) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
    }
  else if (!no_remainder_code)
    {
      /* Update addresses to point to the next word to examine.  */
      do_add3 (src1_addr, src1_addr, iv1);
      do_add3 (src2_addr, src2_addr, iv1);

      emit_label (cleanup_label);

      if (!bytes_is_const)
	{
	  /* If we're dealing with runtime length, we have to check if
	     it's zero after the loop.  When length is known at compile
	     time the no-remainder condition is dealt with above.  By
	     doing this after cleanup_label, we also deal with the
	     case where length is 0 at the start and we bypass the
	     loop with a branch to cleanup_label.  */
	  emit_move_insn (target, const0_rtx);
	  do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
		     NULL_RTX, final_label, profile_probability::unlikely ());
	}

      rtx final_cleanup = gen_label_rtx ();
      rtx cmp_rem_before = gen_reg_rtx (word_mode);
      /* Compare one more word_mode chunk if needed.  */
      if (!bytes_is_const || bytes_remaining >= load_mode_size)
	{
	  /* If remainder length < word length, branch to final
	     cleanup compare.  */

	  if (!bytes_is_const)
	    {
	      do_ifelse (CCmode, LT, cmp_rem, GEN_INT (load_mode_size),
			 NULL_RTX, final_cleanup, profile_probability::even ());
	    }

	  /* load and compare 8B */
	  do_load_for_compare_from_addr (load_mode, d1_1,
					 src1_addr, orig_src1);
	  do_load_for_compare_from_addr (load_mode, d2_1,
					 src2_addr, orig_src2);

	  /* Compare the word, see if we need to do the last partial.  */
	  if (TARGET_P9_MISC)
	    {
	      /* Generate a compare, and convert with a setb later.  */
	      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
	      emit_insn (gen_rtx_SET (dcond, cmp));
	    }
	  else
	    {
	      dcond = gen_reg_rtx (CCmode);
	      if (word_mode == DImode)
		emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	      else
		emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	    }

	  do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
		     dcond, diff_label, profile_probability::even ());

	  do_add3 (src1_addr, src1_addr, GEN_INT (load_mode_size));
	  do_add3 (src2_addr, src2_addr, GEN_INT (load_mode_size));
	  /* Snapshot of the remaining length before the decrement below.  */
	  emit_move_insn (cmp_rem_before, cmp_rem);
	  do_add3 (cmp_rem, cmp_rem, GEN_INT (-load_mode_size));
	  if (bytes_is_const)
	    bytes_remaining -= load_mode_size;
	  else
	    /* See if remaining length is now zero.  We previously set
	       target to 0 so we can just jump to the end.  */
	    do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, NULL_RTX,
		       final_label, profile_probability::unlikely ());
	}

      /* Cases:
	 bytes_is_const
	   We can always shift back to do an overlapping compare
	   of the last chunk because we know length >= 8.

	 !bytes_is_const
	   align>=load_mode_size
	     Read word_mode and mask
	   align<load_mode_size
	     avoid stepping past end

	  Three strategies:
	  * decrement address and do overlapping compare
	  * read word_mode and mask
	  * carefully avoid crossing 4k boundary
       */

      if ((!bytes_is_const || (bytes_is_const && bytes_remaining && isP7))
	  && align1 >= load_mode_size && align2 >= load_mode_size)
	{
	  /* Alignment is larger than word_mode so we do not need to be
	     concerned with extra page crossings.  But, we do not know
	     that the length is larger than load_mode_size so we might
	     end up comparing against data before the block if we try
	     an overlapping compare.  Also we use this on P7 for fixed length
	     remainder because P7 doesn't like overlapping unaligned.
	     Strategy: load 8B, shift off bytes past length, and compare.  */
	  emit_label (final_cleanup);
	  do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
				src1_addr, src2_addr, orig_src1, orig_src2);
	}
      else if (bytes_remaining && bytes_is_const)
	{
	  /* We do not do loop expand if length < 32 so we know at the
	     end we can do an overlapping compare.
	     Strategy: shift address back and do word_mode load that
	     ends at the end of the block.  */
	  emit_label (final_cleanup);
	  do_overlap_load_compare (load_mode, true, bytes_remaining, diff,
				   cmp_rem, dcond, src1_addr, src2_addr,
				   orig_src1, orig_src2);
	}
      else if (!bytes_is_const)
	{
	  rtx handle4k_label = gen_label_rtx ();
	  rtx nonconst_overlap = gen_label_rtx ();
	  emit_label (nonconst_overlap);

	  /* Here we have to handle the case where we have runtime
	     length which may be too short for overlap compare, and
	     alignment is not at least load_mode_size so we have to
	     tread carefully to avoid stepping across 4k boundaries.  */

	  /* If the length after the loop was larger than word_mode
	     size, we can just do an overlapping compare and we're
	     done.  We fall through to this code from the word_mode
	     compare that precedes this.  */
	  do_overlap_load_compare (load_mode, false, 0, diff,
				   cmp_rem, dcond, src1_addr, src2_addr,
				   orig_src1, orig_src2);

	  rtx diff_ref = gen_rtx_LABEL_REF (VOIDmode, diff_label);
	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
	  JUMP_LABEL (j) = diff_label;
	  LABEL_NUSES (diff_label) += 1;
	  emit_barrier ();

	  /* If we couldn't do the overlap compare we have to be more
	     careful of the 4k boundary.  Test to see if either
	     address is less than word_mode_size away from a 4k
	     boundary.  If not, then we can do a load/shift/compare
	     and we are done.  We come to this code if length was less
	     than word_mode_size.  */

	  emit_label (final_cleanup);

	  /* We can still avoid the slow case if the length was larger
	     than one loop iteration, in which case go do the overlap
	     load compare path.  */
	  do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (loop_bytes),
		     NULL_RTX, nonconst_overlap, profile_probability::even ());

	  rtx rem4k = gen_reg_rtx (word_mode);
	  rtx dist1 = gen_reg_rtx (word_mode);
	  rtx dist2 = gen_reg_rtx (word_mode);
	  do_sub3 (rem4k, GEN_INT (4096), cmp_rem);
	  if (word_mode == SImode)
	    emit_insn (gen_andsi3 (dist1, src1_addr, GEN_INT (0xfff)));
	  else
	    emit_insn (gen_anddi3 (dist1, src1_addr, GEN_INT (0xfff)));
	  do_ifelse (CCmode, LE, dist1, rem4k, NULL_RTX,
		     handle4k_label, profile_probability::very_unlikely ());
	  if (word_mode == SImode)
	    emit_insn (gen_andsi3 (dist2, src2_addr, GEN_INT (0xfff)));
	  else
	    emit_insn (gen_anddi3 (dist2, src2_addr, GEN_INT (0xfff)));
	  do_ifelse (CCmode, LE, dist2, rem4k, NULL_RTX,
		     handle4k_label, profile_probability::very_unlikely ());

	  /* We don't have a 4k boundary to deal with, so do
	     a load/shift/compare and jump to diff.  */

	  do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
				src1_addr, src2_addr, orig_src1, orig_src2);

	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
	  JUMP_LABEL (j) = diff_label;
	  LABEL_NUSES (diff_label) += 1;
	  emit_barrier ();

	  /* Finally in the unlikely case we are inching up to a
	     4k boundary we use a compact lbzx/compare loop to do
	     it a byte at a time.  */

	  emit_label (handle4k_label);

	  rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
	  emit_move_insn (ctr, cmp_rem);
	  rtx ixreg = gen_reg_rtx (Pmode);
	  emit_move_insn (ixreg, const0_rtx);

	  rtx src1_ix = gen_rtx_PLUS (word_mode, src1_addr, ixreg);
	  rtx src2_ix = gen_rtx_PLUS (word_mode, src2_addr, ixreg);
	  rtx d1 = gen_reg_rtx (word_mode);
	  rtx d2 = gen_reg_rtx (word_mode);

	  rtx fc_loop = gen_label_rtx ();
	  emit_label (fc_loop);

	  do_load_for_compare_from_addr (QImode, d1, src1_ix, orig_src1);
	  do_load_for_compare_from_addr (QImode, d2, src2_ix, orig_src2);

	  do_add3 (ixreg, ixreg, const1_rtx);

	  rtx cond = gen_reg_rtx (CCmode);
	  rtx subexpr = gen_rtx_MINUS (word_mode, d1, d2);
	  rs6000_emit_dot_insn (diff, subexpr, 2, cond);

	  rtx eqrtx = gen_rtx_EQ (VOIDmode, d1, d2);
	  if (TARGET_64BIT)
	    j = emit_jump_insn (gen_bdnztf_di (fc_loop, ctr, ctr,
					       eqrtx, cond));
	  else
	    j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr,
					       eqrtx, cond));
	  add_reg_br_prob_note (j, profile_probability::likely ());
	  JUMP_LABEL (j) = fc_loop;
	  LABEL_NUSES (fc_loop) += 1;

	  if (TARGET_64BIT)
	    emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
	  else
	    emit_move_insn (target, diff);

	  /* Since we are comparing bytes, the difference can be used
	     as the final result and we are done here.  */
	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
	  JUMP_LABEL (j) = final_label;
	  LABEL_NUSES (final_label) += 1;
	  emit_barrier ();
	}
    }

  emit_label (diff_label);
  /* difference handling, 64->32 conversion */

  /* We need to produce DI result from sub, then convert to target SI
     while maintaining <0 / ==0 / >0 properties.  This sequence works:
     subfc L,A,B
     subfe H,H,H
     popcntd L,L
     rldimi L,H,6,0

     This is an alternate one Segher cooked up if somebody
     wants to expand this for something that doesn't have popcntd:
     subfc L,a,b
     subfe H,x,x
     addic t,L,-1
     subfe v,t,L
     or z,v,H

     And finally, p9 can just do this:
     cmpld A,B
     setb r */

  if (TARGET_P9_MISC)
    emit_insn (gen_setb_unsigned (target, dcond));
  else
    {
      if (TARGET_64BIT)
	{
	  rtx tmp_reg_ca = gen_reg_rtx (DImode);
	  emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
	  emit_insn (gen_popcntddi2 (diff, diff));
	  emit_insn (gen_iordi3 (diff, diff, tmp_reg_ca));
	  emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
	}
      else
	{
	  rtx tmp_reg_ca = gen_reg_rtx (SImode);
	  emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
	  emit_insn (gen_popcntdsi2 (diff, diff));
	  emit_insn (gen_iorsi3 (target, diff, tmp_reg_ca));
	}
    }

  if (library_call_label != NULL)
    {
      /* Branch around memcmp call.  */
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
      JUMP_LABEL (j) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();

      /* Make memcmp library call.  cmp_rem is the remaining bytes that
	 were compared and cmp_rem is the expected amount to be compared
	 by memcmp.  If we don't find a difference in the loop compare, do
	 the library call directly instead of doing a small compare just
	 to get to an arbitrary boundary before calling it anyway.
	 Also, update addresses to point to the next word to examine.  */
      emit_label (library_call_label);

      rtx len_rtx = gen_reg_rtx (word_mode);
      if (bytes_is_const)
	{
	  emit_move_insn (len_rtx, cmp_rem);
	  do_add3 (src1_addr, src1_addr, iv1);
	  do_add3 (src2_addr, src2_addr, iv1);
	}
      else
	emit_move_insn (len_rtx, bytes_rtx);

      tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP);
      emit_library_call_value (XEXP (DECL_RTL (fun), 0),
			       target, LCT_NORMAL, GET_MODE (target),
			       src1_addr, Pmode,
			       src2_addr, Pmode,
			       len_rtx, GET_MODE (len_rtx));
    }

  /* emit final_label */
  emit_label (final_label);
  return true;
}
1629
37ae4739
AS
1630/* Generate code to convert a DImode-plus-carry subtract result into
1631 a SImode result that has the same <0 / ==0 / >0 properties to
1632 produce the final result from memcmp.
8845cb37 1633
37ae4739
AS
1634 TARGET is the rtx for the register to receive the memcmp result.
1635 SUB_RESULT is the rtx for the register contining the subtract result. */
8845cb37 1636
37ae4739
AS
1637void
1638generate_6432_conversion(rtx target, rtx sub_result)
1639{
1640 /* We need to produce DI result from sub, then convert to target SI
1641 while maintaining <0 / ==0 / >0 properties. This sequence works:
1642 subfc L,A,B
1643 subfe H,H,H
1644 popcntd L,L
1645 rldimi L,H,6,0
8845cb37 1646
37ae4739
AS
1647 This is an alternate one Segher cooked up if somebody
1648 wants to expand this for something that doesn't have popcntd:
1649 subfc L,a,b
1650 subfe H,x,x
1651 addic t,L,-1
1652 subfe v,t,L
1653 or z,v,H
8845cb37 1654
37ae4739
AS
1655 And finally, p9 can just do this:
1656 cmpld A,B
1657 setb r */
8845cb37 1658
37ae4739
AS
1659 if (TARGET_64BIT)
1660 {
1661 rtx tmp_reg_ca = gen_reg_rtx (DImode);
1662 emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
1663 rtx popcnt = gen_reg_rtx (DImode);
1664 emit_insn (gen_popcntddi2 (popcnt, sub_result));
1665 rtx tmp2 = gen_reg_rtx (DImode);
1666 emit_insn (gen_iordi3 (tmp2, popcnt, tmp_reg_ca));
1667 emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp2)));
1668 }
8845cb37 1669 else
37ae4739
AS
1670 {
1671 rtx tmp_reg_ca = gen_reg_rtx (SImode);
1672 emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
1673 rtx popcnt = gen_reg_rtx (SImode);
1674 emit_insn (gen_popcntdsi2 (popcnt, sub_result));
1675 emit_insn (gen_iorsi3 (target, popcnt, tmp_reg_ca));
1676 }
1677}
8845cb37 1678
37ae4739
AS
1679/* Generate memcmp expansion using in-line non-loop GPR instructions.
1680 The bool return indicates whether code for a 64->32 conversion
1681 should be generated.
1682
1683 BYTES is the number of bytes to be compared.
1684 BASE_ALIGN is the minimum alignment for both blocks to compare.
1685 ORIG_SRC1 is the original pointer to the first block to compare.
1686 ORIG_SRC2 is the original pointer to the second block to compare.
1687 SUB_RESULT is the reg rtx for the result from the final subtract.
1688 COND is rtx for a condition register that will be used for the final
1689 compare on power9 or better.
1690 FINAL_RESULT is the reg rtx for the final memcmp result.
1691 P_CONVERT_LABEL is a pointer to rtx that will be used to store the
1692 label generated for a branch to the 64->32 code, if such a branch
1693 is needed.
1694 P_FINAL_LABEL is a pointer to rtx that will be used to store the label
1695 for the end of the memcmp if a branch there is needed.
1696*/
8845cb37 1697
37ae4739
AS
1698bool
1699expand_block_compare_gpr(unsigned HOST_WIDE_INT bytes, unsigned int base_align,
1700 rtx orig_src1, rtx orig_src2,
1701 rtx sub_result, rtx cond, rtx final_result,
1702 rtx *p_convert_label, rtx *p_final_label)
1703{
8845cb37
AS
1704 /* Example of generated code for 18 bytes aligned 1 byte.
1705 Compiled with -fno-reorder-blocks for clarity.
1706 ldbrx 10,31,8
1707 ldbrx 9,7,8
1708 subfc. 9,9,10
1709 bne 0,.L6487
1710 addi 9,12,8
1711 addi 5,11,8
1712 ldbrx 10,0,9
1713 ldbrx 9,0,5
1714 subfc. 9,9,10
1715 bne 0,.L6487
1716 addi 9,12,16
1717 lhbrx 10,0,9
1718 addi 9,11,16
1719 lhbrx 9,0,9
1720 subf 9,9,10
1721 b .L6488
1722 .p2align 4,,15
1723 .L6487: #convert_label
1724 popcntd 9,9
1725 subfe 10,10,10
1726 or 9,9,10
1727 .L6488: #final_label
1728 extsw 10,9
1729
1730 We start off with DImode for two blocks that jump to the DI->SI conversion
1731 if the difference is found there, then a final block of HImode that skips
1732 the DI->SI conversion. */
1733
37ae4739
AS
1734 unsigned HOST_WIDE_INT offset = 0;
1735 unsigned int load_mode_size;
1736 HOST_WIDE_INT cmp_bytes = 0;
1737 rtx src1 = orig_src1;
1738 rtx src2 = orig_src2;
1739 rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
1740 rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
1741 bool need_6432_conv = false;
1742 rtx convert_label = NULL;
1743 rtx final_label = NULL;
1744 machine_mode load_mode;
1745
8845cb37
AS
1746 while (bytes > 0)
1747 {
1748 unsigned int align = compute_current_alignment (base_align, offset);
74f9986e 1749 load_mode = select_block_compare_mode (offset, bytes, align);
8845cb37
AS
1750 load_mode_size = GET_MODE_SIZE (load_mode);
1751 if (bytes >= load_mode_size)
1752 cmp_bytes = load_mode_size;
78bd9e25
HG
1753 else if (!targetm.slow_unaligned_access (load_mode,
1754 align * BITS_PER_UNIT))
8845cb37
AS
1755 {
1756 /* Move this load back so it doesn't go past the end.
1757 P8/P9 can do this efficiently. */
1758 unsigned int extra_bytes = load_mode_size - bytes;
1759 cmp_bytes = bytes;
1760 if (extra_bytes < offset)
1761 {
1762 offset -= extra_bytes;
1763 cmp_bytes = load_mode_size;
1764 bytes = cmp_bytes;
1765 }
1766 }
1767 else
1768 /* P7 and earlier can't do the overlapping load trick fast,
1769 so this forces a non-overlapping load and a shift to get
1770 rid of the extra bytes. */
1771 cmp_bytes = bytes;
1772
1773 src1 = adjust_address (orig_src1, load_mode, offset);
1774 src2 = adjust_address (orig_src2, load_mode, offset);
1775
1776 if (!REG_P (XEXP (src1, 0)))
1777 {
1778 rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
1779 src1 = replace_equiv_address (src1, src1_reg);
1780 }
f4f867ca 1781 set_mem_size (src1, load_mode_size);
8845cb37
AS
1782
1783 if (!REG_P (XEXP (src2, 0)))
1784 {
1785 rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
1786 src2 = replace_equiv_address (src2, src2_reg);
1787 }
f4f867ca 1788 set_mem_size (src2, load_mode_size);
8845cb37
AS
1789
1790 do_load_for_compare (tmp_reg_src1, src1, load_mode);
1791 do_load_for_compare (tmp_reg_src2, src2, load_mode);
1792
1793 if (cmp_bytes < load_mode_size)
1794 {
1795 /* Shift unneeded bytes off. */
1796 rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes));
1797 if (word_mode == DImode)
1798 {
1799 emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh));
1800 emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh));
1801 }
1802 else
1803 {
1804 emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh));
1805 emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh));
1806 }
1807 }
1808
1809 int remain = bytes - cmp_bytes;
37ae4739 1810 if (GET_MODE_SIZE (GET_MODE (final_result)) > GET_MODE_SIZE (load_mode))
8845cb37 1811 {
37ae4739 1812 /* Final_result is larger than load size so we don't need to
8845cb37
AS
1813 reduce result size. */
1814
1815 /* We previously did a block that need 64->32 conversion but
1816 the current block does not, so a label is needed to jump
1817 to the end. */
37ae4739 1818 if (need_6432_conv && !final_label)
8845cb37
AS
1819 final_label = gen_label_rtx ();
1820
1821 if (remain > 0)
1822 {
1823 /* This is not the last block, branch to the end if the result
1824 of this subtract is not zero. */
1825 if (!final_label)
1826 final_label = gen_label_rtx ();
1827 rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
1828 rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
1829 rtx cr = gen_reg_rtx (CCmode);
1830 rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
37ae4739 1831 emit_insn (gen_movsi (final_result,
8845cb37
AS
1832 gen_lowpart (SImode, tmp_reg_src2)));
1833 rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
1834 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
1835 fin_ref, pc_rtx);
faaeebd6
AS
1836 rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
1837 add_reg_br_prob_note (j, profile_probability::unlikely ());
8845cb37
AS
1838 JUMP_LABEL (j) = final_label;
1839 LABEL_NUSES (final_label) += 1;
1840 }
1841 else
1842 {
1843 if (word_mode == DImode)
1844 {
1845 emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
1846 tmp_reg_src2));
37ae4739 1847 emit_insn (gen_movsi (final_result,
8845cb37
AS
1848 gen_lowpart (SImode, tmp_reg_src2)));
1849 }
1850 else
37ae4739 1851 emit_insn (gen_subsi3 (final_result, tmp_reg_src1, tmp_reg_src2));
8845cb37
AS
1852
1853 if (final_label)
1854 {
1855 rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
1856 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
5ec3397e 1857 JUMP_LABEL (j) = final_label;
8845cb37
AS
1858 LABEL_NUSES (final_label) += 1;
1859 emit_barrier ();
1860 }
1861 }
1862 }
1863 else
1864 {
1865 /* Do we need a 64->32 conversion block? We need the 64->32
37ae4739 1866 conversion even if final_result size == load_mode size because
8845cb37 1867 the subtract generates one extra bit. */
37ae4739 1868 need_6432_conv = true;
8845cb37
AS
1869
1870 if (remain > 0)
1871 {
1872 if (!convert_label)
1873 convert_label = gen_label_rtx ();
1874
1875 /* Compare to zero and branch to convert_label if not zero. */
1876 rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
1877 if (TARGET_P9_MISC)
1878 {
37ae4739
AS
1879 /* Generate a compare, and convert with a setb later.
1880 Use cond that is passed in because the caller needs
1881 to use it for the 64->32 conversion later. */
8845cb37
AS
1882 rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
1883 tmp_reg_src2);
1884 emit_insn (gen_rtx_SET (cond, cmp));
1885 }
1886 else
37ae4739
AS
1887 {
1888 /* Generate a subfc. and use the longer sequence for
1889 conversion. Cond is not used outside this
1890 function in this case. */
1891 cond = gen_reg_rtx (CCmode);
1892 if (TARGET_64BIT)
1893 emit_insn (gen_subfdi3_carry_dot2 (sub_result, tmp_reg_src2,
1894 tmp_reg_src1, cond));
1895 else
1896 emit_insn (gen_subfsi3_carry_dot2 (sub_result, tmp_reg_src2,
1897 tmp_reg_src1, cond));
1898 }
1899
8845cb37
AS
1900 rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
1901 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
1902 cvt_ref, pc_rtx);
5585759f
AS
1903 rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
1904 add_reg_br_prob_note (j, profile_probability::likely ());
5ec3397e 1905 JUMP_LABEL (j) = convert_label;
8845cb37
AS
1906 LABEL_NUSES (convert_label) += 1;
1907 }
1908 else
1909 {
1910 /* Just do the subtract/compare. Since this is the last block
1911 the convert code will be generated immediately following. */
1912 if (TARGET_P9_MISC)
1913 {
1914 rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
1915 tmp_reg_src2);
1916 emit_insn (gen_rtx_SET (cond, cmp));
1917 }
1918 else
1919 if (TARGET_64BIT)
37ae4739 1920 emit_insn (gen_subfdi3_carry (sub_result, tmp_reg_src2,
8845cb37
AS
1921 tmp_reg_src1));
1922 else
37ae4739 1923 emit_insn (gen_subfsi3_carry (sub_result, tmp_reg_src2,
8845cb37
AS
1924 tmp_reg_src1));
1925 }
1926 }
1927
1928 offset += cmp_bytes;
1929 bytes -= cmp_bytes;
1930 }
1931
37ae4739
AS
1932 if (convert_label)
1933 *p_convert_label = convert_label;
1934 if (final_label)
1935 *p_final_label = final_label;
1936 return need_6432_conv;
1937}
1938
/* Expand a block compare operation, and return true if successful.
   Return false if we should let the compiler generate normal code,
   probably a memcmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_block_compare (rtx operands[])
{
  /* TARGET_POPCNTD is already guarded at expand cmpmemsi.  */
  gcc_assert (TARGET_POPCNTD);

  /* For P8, this case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit carry and so
     we must emit code to calculate it ourselves.  We skip it on P8
     but setb works well on P9.  */
  if (TARGET_32BIT
      && TARGET_POWERPC64
      && !TARGET_P9_MISC)
    return false;

  /* Allow this param to shut off all expansion.  */
  if (rs6000_block_compare_inline_limit == 0)
    return false;

  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];

  /* targetm.slow_unaligned_access -- don't do unaligned stuff.  */
  if (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1))
      || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2)))
    return false;

  /* If this is not a fixed size compare, try generating loop code and
     if that fails just call memcmp.  */
  if (!CONST_INT_P (bytes_rtx))
    return expand_compare_loop (operands);

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  /* The alignment operand is in bits; convert to bytes for the
     block-compare helpers below.  */
  unsigned int align_by_bits = UINTVAL (align_rtx);
  unsigned int base_align = align_by_bits / BITS_PER_UNIT;

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to move?  A zero-length compare trivially succeeds with
     whatever is already in the (uninitialized) result.  */
  unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
  if (bytes == 0)
    return true;

  /* P7/P8 code uses cond for subfc. but P9 uses
     it for cmpld which needs CCUNSmode.  */
  rtx cond = NULL;
  if (TARGET_P9_MISC)
    cond = gen_reg_rtx (CCUNSmode);

  /* Is it OK to use vec/vsx for this.  TARGET_VSX means we have at
     least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is
     at least POWER8.  That way we can rely on overlapping compares to
     do the final comparison of less than 16 bytes.  Also I do not
     want to deal with making this work for 32 bits.  In addition, we
     have to make sure that we have at least P8_VECTOR (we don't allow
     P9_VECTOR without P8_VECTOR).  */
  int use_vec = (bytes >= 33 && !TARGET_32BIT
		 && TARGET_EFFICIENT_UNALIGNED_VSX && TARGET_P8_VECTOR);

  /* We don't want to generate too much code.  The loop code can take
     over for lengths greater than 31 bytes.  */
  unsigned HOST_WIDE_INT max_bytes = rs6000_block_compare_inline_limit;

  /* Don't generate too much code if vsx was disabled.  */
  if (!use_vec && max_bytes > 1)
    max_bytes = ((max_bytes + 1) / 2) - 1;

  if (!IN_RANGE (bytes, 1, max_bytes))
    return expand_compare_loop (operands);

  /* Label at the very end of the expansion; created lazily by the GPR
     path when an early-exit branch is needed.  */
  rtx final_label = NULL;

  if (use_vec)
    {
      /* VSX path: compare 16 bytes at a time, then compute the final
	 signed result from the first differing byte.  */
      rtx final_move_label = gen_label_rtx ();
      rtx s1addr = gen_reg_rtx (Pmode);
      rtx s2addr = gen_reg_rtx (Pmode);
      rtx off_reg = gen_reg_rtx (Pmode);
      rtx cleanup_label = NULL;
      rtx vec_result = gen_reg_rtx (V16QImode);
      rtx s1data = gen_reg_rtx (V16QImode);
      rtx s2data = gen_reg_rtx (V16QImode);
      rtx result_reg = gen_reg_rtx (word_mode);
      /* Result defaults to zero (equal) unless a difference is found.  */
      emit_move_insn (result_reg, GEN_INT (0));

      expand_cmp_vec_sequence (bytes, orig_src1, orig_src2,
			       s1addr, s2addr, off_reg, s1data, s2data,
			       vec_result, false,
			       &cleanup_label, final_move_label, false);

      if (cleanup_label)
	emit_label (cleanup_label);

      /* Invert the vcmpequb result so differing byte lanes are nonzero
	 for the final-compare helper.  */
      emit_insn (gen_one_cmplv16qi2 (vec_result, vec_result));

      emit_final_compare_vec (s1data, s2data, result_reg,
			      s1addr, s2addr, orig_src1, orig_src2,
			      off_reg, vec_result);

      emit_label (final_move_label);
      emit_insn (gen_movsi (target,
			    gen_lowpart (SImode, result_reg)));
    }
  else
    { /* generate GPR code */

      rtx convert_label = NULL;
      rtx sub_result = gen_reg_rtx (word_mode);
      /* Returns true when the subtract result must be narrowed from
	 64 to 32 bits to form the SImode result.  */
      bool need_6432_conversion =
	expand_block_compare_gpr(bytes, base_align,
				 orig_src1, orig_src2,
				 sub_result, cond, target,
				 &convert_label, &final_label);

      if (need_6432_conversion)
	{
	  if (convert_label)
	    emit_label (convert_label);
	  if (TARGET_P9_MISC)
	    /* P9: setb materializes -1/0/1 directly from the CC.  */
	    emit_insn (gen_setb_unsigned (target, cond));
	  else
	    generate_6432_conversion(target, sub_result);
	}
    }

  if (final_label)
    emit_label (final_label);

  return true;
}
2084
/* Generate page crossing check and branch code to set up for
   strncmp when we don't have DI alignment.
   STRNCMP_LABEL is the label to branch if there is a page crossing.
   SRC_ADDR is the string address to be examined.
   BYTES is the max number of bytes to compare.  */
static void
expand_strncmp_align_check (rtx strncmp_label, rtx src_addr, HOST_WIDE_INT bytes)
{
  rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label);
  /* Low 12 bits of the address = offset within a 4K page.  */
  rtx src_pgoff = gen_reg_rtx (GET_MODE (src_addr));
  do_and3 (src_pgoff, src_addr, GEN_INT (0xfff));
  rtx cond = gen_reg_rtx (CCmode);
  /* If the page offset is >= 4096 - BYTES, reading BYTES bytes from
     SRC_ADDR could cross into the next page.  */
  emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_pgoff,
					 GEN_INT (4096 - bytes)));

  rtx cmp_rtx = gen_rtx_GE (VOIDmode, cond, const0_rtx);

  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
				     lab_ref, pc_rtx);
  /* Branch to the library-call fallback; mark it unlikely since page
     crossings are rare.  */
  rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
  add_reg_br_prob_note (j, profile_probability::unlikely ());
  JUMP_LABEL (j) = strncmp_label;
  LABEL_NUSES (strncmp_label) += 1;
}
2109
74f9986e
AS
/* Generate the sequence of compares for strcmp/strncmp using gpr instructions.
   BYTES_TO_COMPARE is the number of bytes to be compared.
   BASE_ALIGN is the smaller of the alignment of the two strings.
   ORIG_SRC1 is the unmodified rtx for the first string.
   ORIG_SRC2 is the unmodified rtx for the second string.
   TMP_REG_SRC1 is the register for loading the first string.
   TMP_REG_SRC2 is the register for loading the second string.
   RESULT_REG is the rtx for the result register.
   EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
   to strcmp/strncmp if we have equality at the end of the inline comparison.
   P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
   to clean up and generate the final comparison result.
   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
   set the final result.  */
static void
expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
			     unsigned int base_align,
			     rtx orig_src1, rtx orig_src2,
			     rtx tmp_reg_src1, rtx tmp_reg_src2, rtx result_reg,
			     bool equality_compare_rest, rtx *p_cleanup_label,
			     rtx final_move_label)
{
  unsigned int word_mode_size = GET_MODE_SIZE (word_mode);
  machine_mode load_mode;
  unsigned int load_mode_size;
  unsigned HOST_WIDE_INT cmp_bytes = 0;
  unsigned HOST_WIDE_INT offset = 0;
  rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
  rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));
  gcc_assert (p_cleanup_label != NULL);
  rtx cleanup_label = *p_cleanup_label;

  /* Walk the strings one chunk at a time; LOAD_MODE (and hence chunk
     size) is re-selected each iteration from the remaining length and
     the alignment achievable at the current offset.  */
  while (bytes_to_compare > 0)
    {
      /* GPR compare sequence:
	 check each 8B with: ld/ld/cmpb/cmpb/orc./bne

	 cleanup code at end:
	 cntlzd		get bit of first zero/diff byte
	 subfic		convert for rldcl use
	 rldcl rldcl	extract diff/zero byte
	 subf		subtract for final result

	 The last compare can branch around the cleanup code if the
	 result is zero because the strings are exactly equal.  */

      unsigned int align = compute_current_alignment (base_align, offset);
      load_mode = select_block_compare_mode (offset, bytes_to_compare, align);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes_to_compare >= load_mode_size)
	cmp_bytes = load_mode_size;
      else if (!targetm.slow_unaligned_access (load_mode,
					       align * BITS_PER_UNIT))
	{
	  /* Move this load back so it doesn't go past the end.
	     P8/P9 can do this efficiently.  */
	  unsigned int extra_bytes = load_mode_size - bytes_to_compare;
	  cmp_bytes = bytes_to_compare;
	  if (extra_bytes < offset)
	    {
	      offset -= extra_bytes;
	      cmp_bytes = load_mode_size;
	      bytes_to_compare = cmp_bytes;
	    }
	}
      else
	/* P7 and earlier can't do the overlapping load trick fast,
	   so this forces a non-overlapping load and a shift to get
	   rid of the extra bytes.  */
	cmp_bytes = bytes_to_compare;

      /* On little-endian without AVOID_XFORM, put the offset in a
	 register so an indexed-form load can be used.  */
      rtx offset_rtx;
      if (BYTES_BIG_ENDIAN || TARGET_AVOID_XFORM)
	offset_rtx = GEN_INT (offset);
      else
	{
	  offset_rtx = gen_reg_rtx (Pmode);
	  emit_move_insn (offset_rtx, GEN_INT (offset));
	}
      rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, offset_rtx);
      rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, offset_rtx);

      do_load_for_compare_from_addr (load_mode, tmp_reg_src1, addr1, orig_src1);
      do_load_for_compare_from_addr (load_mode, tmp_reg_src2, addr2, orig_src2);

      /* We must always left-align the data we read, and
	 clear any bytes to the right that are beyond the string.
	 Otherwise the cmpb sequence won't produce the correct
	 results.  However if there is only one byte left, we
	 can just subtract to get the final result so the shifts
	 and clears are not needed.  */

      unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;

      /* Loading just a single byte is a special case.  If we are
	 loading more than that, we have to check whether we are
	 looking at the entire chunk of data.  If not, rotate left and
	 clear right so that bytes we aren't supposed to look at are
	 zeroed, and the first byte we are supposed to compare is
	 leftmost.  */
      if (load_mode_size != 1)
	{
	  if (load_mode_size < word_mode_size)
	    {
	      /* Rotate left first.  */
	      rtx sh = GEN_INT (BITS_PER_UNIT
				* (word_mode_size - load_mode_size));
	      do_rotl3 (tmp_reg_src1, tmp_reg_src1, sh);
	      do_rotl3 (tmp_reg_src2, tmp_reg_src2, sh);
	    }

	  if (cmp_bytes < word_mode_size)
	    {
	      /* Now clear right.  This plus the rotate can be
		 turned into a rldicr instruction.  */
	      HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
	      rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
	      do_and3 (tmp_reg_src1, tmp_reg_src1, mask);
	      do_and3 (tmp_reg_src2, tmp_reg_src2, mask);
	    }
	}

      /* Cases to handle.  A and B are chunks of the two strings.
	 1: Not end of comparison:
	 A != B: branch to cleanup code to compute result.
	 A == B: check for 0 byte, next block if not found.
	 2: End of the inline comparison:
	 A != B: branch to cleanup code to compute result.
	 A == B: check for 0 byte, call strcmp/strncmp
	 3: compared requested N bytes:
	 A == B: branch to result 0.
	 A != B: cleanup code to compute result.  */

      rtx dst_label;
      if (remain > 0 || equality_compare_rest)
	{
	  /* Branch to cleanup code, otherwise fall through to do
	     more compares.  */
	  if (!cleanup_label)
	    cleanup_label = gen_label_rtx ();
	  dst_label = cleanup_label;
	}
      else
	/* Branch to end and produce result of 0.  */
	dst_label = final_move_label;

      if (load_mode_size == 1)
	{
	  /* Special case for comparing just single byte.  */
	  if (equality_compare_rest)
	    {
	      /* Use subf./bne to branch to final_move_label if the
		 byte differs, otherwise fall through to the strncmp
		 call.  We must also check for a zero byte here as we
		 must not make the library call if this is the end of
		 the string.  */

	      rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
	      rtx cond = gen_reg_rtx (CCmode);
	      rtx diff_rtx = gen_rtx_MINUS (word_mode,
					    tmp_reg_src1, tmp_reg_src2);
	      rs6000_emit_dot_insn (result_reg, diff_rtx, 2, cond);
	      rtx cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);

	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
						 lab_ref, pc_rtx);
	      rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	      add_reg_br_prob_note (j, profile_probability::unlikely ());
	      JUMP_LABEL (j) = final_move_label;
	      LABEL_NUSES (final_move_label) += 1;

	      /* Check for zero byte here before fall through to
		 library call.  This catches the case where the
		 strings are equal and end in a zero byte at this
		 position.  */

	      rtx cond0 = gen_reg_rtx (CCmode);
	      emit_move_insn (cond0, gen_rtx_COMPARE (CCmode, tmp_reg_src1,
						      const0_rtx));

	      rtx cmp0eq_rtx = gen_rtx_EQ (VOIDmode, cond0, const0_rtx);

	      rtx ifelse0 = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp0eq_rtx,
						  lab_ref, pc_rtx);
	      rtx_insn *j0 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse0));
	      add_reg_br_prob_note (j0, profile_probability::unlikely ());
	      JUMP_LABEL (j0) = final_move_label;
	      LABEL_NUSES (final_move_label) += 1;
	    }
	  else
	    {
	      /* This is the last byte to be compared so we can use
		 subf to compute the final result and branch
		 unconditionally to final_move_label.  */

	      do_sub3 (result_reg, tmp_reg_src1, tmp_reg_src2);

	      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
	      JUMP_LABEL (j) = final_move_label;
	      LABEL_NUSES (final_move_label) += 1;
	      emit_barrier ();
	    }
	}
      else
	{
	  /* Multi-byte chunk: cmpb against the other string finds
	     differing bytes, cmpb against zero finds the terminator;
	     orc. of the two sets CR nonzero if either was found.  */
	  rtx cmpb_zero = gen_reg_rtx (word_mode);
	  rtx cmpb_diff = gen_reg_rtx (word_mode);
	  rtx zero_reg = gen_reg_rtx (word_mode);
	  rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
	  rtx cond = gen_reg_rtx (CCmode);

	  emit_move_insn (zero_reg, GEN_INT (0));
	  do_cmpb3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2);
	  do_cmpb3 (cmpb_zero, tmp_reg_src1, zero_reg);
	  rtx not_diff = gen_rtx_NOT (word_mode, cmpb_diff);
	  rtx orc_rtx = gen_rtx_IOR (word_mode, not_diff, cmpb_zero);

	  rs6000_emit_dot_insn (result_reg, orc_rtx, 2, cond);

	  rtx cmp_rtx;
	  if (remain == 0 && !equality_compare_rest)
	    /* Case 3 above: branch to "result 0" when all bytes match.  */
	    cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
	  else
	    cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);

	  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
					     lab_ref, pc_rtx);
	  rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	  add_reg_br_prob_note (j, profile_probability::unlikely ());
	  JUMP_LABEL (j) = dst_label;
	  LABEL_NUSES (dst_label) += 1;
	}

      offset += cmp_bytes;
      bytes_to_compare -= cmp_bytes;
    }

  /* Hand the (possibly newly created) cleanup label back to the caller.  */
  *p_cleanup_label = cleanup_label;
  return;
}
2351
f7e94dfb
AS
/* Generate the final sequence that identifies the differing
   byte and generates the final result, taking into account
   zero bytes:

   cntlzd            get bit of first zero/diff byte
   addi              convert for rldcl use
   rldcl rldcl       extract diff/zero byte
   subf              subtract for final result

   STR1 is the reg rtx for data from string 1.
   STR2 is the reg rtx for data from string 2.
   RESULT is the reg rtx for the comparison result.

   On entry RESULT holds the cmpb-derived mask whose leftmost set bit
   marks the first zero/differing byte; on exit it holds the signed
   byte difference.  */

static void
emit_final_str_compare_gpr (rtx str1, rtx str2, rtx result)
{
  machine_mode m = GET_MODE (str1);
  rtx rot_amt = gen_reg_rtx (m);

  rtx rot1_1 = gen_reg_rtx (m);
  rtx rot1_2 = gen_reg_rtx (m);
  rtx rot2_1 = gen_reg_rtx (m);
  rtx rot2_2 = gen_reg_rtx (m);

  if (m == SImode)
    {
      /* clz of the mask gives the bit index of the interesting byte;
	 adding 8 turns it into a left-rotate count that brings that
	 byte into the low-order position.  */
      emit_insn (gen_clzsi2 (rot_amt, result));
      emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8)));
      emit_insn (gen_rotlsi3 (rot1_1, str1,
			      gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
      emit_insn (gen_rotlsi3 (rot2_1, str2,
			      gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
      /* Difference of the two extracted bytes is the final result.  */
      emit_insn (gen_subsi3 (result, rot1_2, rot2_2));
    }
  else if (m == DImode)
    {
      /* Same sequence in 64-bit; the rotate patterns take an SImode
	 shift amount, hence the lowpart.  */
      emit_insn (gen_clzdi2 (rot_amt, result));
      emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8)));
      emit_insn (gen_rotldi3 (rot1_1, str1,
			      gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
      emit_insn (gen_rotldi3 (rot2_1, str2,
			      gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
      emit_insn (gen_subdi3 (result, rot1_2, rot2_2));
    }
  else
    /* Only SImode/DImode string data is ever passed here.  */
    gcc_unreachable ();

  return;
}
2405
/* Expand a string compare operation with length, and return
   true if successful.  Return false if we should let the
   compiler generate normal code, probably a strncmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   If NO_LENGTH is zero, then:
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment in bytes.
   If NO_LENGTH is nonzero, then:
   OPERANDS[3] is the alignment in bytes.  */
bool
expand_strn_compare (rtx operands[], int no_length)
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx, align_rtx;
  if (no_length)
    {
      /* strcmp: no length operand, alignment is operand 3.  */
      bytes_rtx = NULL;
      align_rtx = operands[3];
    }
  else
    {
      bytes_rtx = operands[3];
      align_rtx = operands[4];
    }

  rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
  rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));

  /* If we have a length, it must be constant.  This simplifies things
     a bit as we don't have to generate code to check if we've exceeded
     the length.  Later this could be expanded to handle this case.  */
  if (!no_length && !CONST_INT_P (bytes_rtx))
    return false;

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  unsigned int base_align = UINTVAL (align_rtx);
  unsigned int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
  unsigned int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;

  /* targetm.slow_unaligned_access -- don't do unaligned stuff.  */
  if (targetm.slow_unaligned_access (word_mode, align1)
      || targetm.slow_unaligned_access (word_mode, align2))
    return false;

  gcc_assert (GET_MODE (target) == SImode);

  /* Alignment (in bytes) below which we must emit a 4K page-crossing
     check; 8 for GPR loads, raised to 16 below for vector loads.  */
  unsigned int required_align = 8;

  unsigned HOST_WIDE_INT offset = 0;
  unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available.  */
  unsigned HOST_WIDE_INT compare_length; /* How much to compare inline.  */

  if (no_length)
    bytes = rs6000_string_compare_inline_limit;
  else
    bytes = UINTVAL (bytes_rtx);

  /* Is it OK to use vec/vsx for this.  TARGET_VSX means we have at
     least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is
     at least POWER8.  That way we can rely on overlapping compares to
     do the final comparison of less than 16 bytes.  Also I do not
     want to deal with making this work for 32 bits.  In addition, we
     have to make sure that we have at least P8_VECTOR (we don't allow
     P9_VECTOR without P8_VECTOR).  */
  int use_vec = (bytes >= 16 && !TARGET_32BIT
		 && TARGET_EFFICIENT_UNALIGNED_VSX && TARGET_P8_VECTOR);

  if (use_vec)
    required_align = 16;

  machine_mode load_mode;
  rtx tmp_reg_src1, tmp_reg_src2;
  if (use_vec)
    {
      load_mode = V16QImode;
      tmp_reg_src1 = gen_reg_rtx (V16QImode);
      tmp_reg_src2 = gen_reg_rtx (V16QImode);
    }
  else
    {
      load_mode = select_block_compare_mode (0, bytes, base_align);
      tmp_reg_src1 = gen_reg_rtx (word_mode);
      tmp_reg_src2 = gen_reg_rtx (word_mode);
    }

  compare_length = rs6000_string_compare_inline_limit;

  /* If we have equality at the end of the last compare and we have not
     found the end of the string, we need to call strcmp/strncmp to
     compare the remainder.  */
  bool equality_compare_rest = false;

  if (no_length)
    {
      bytes = compare_length;
      equality_compare_rest = true;
    }
  else
    {
      if (bytes <= compare_length)
	compare_length = bytes;
      else
	equality_compare_rest = true;
    }

  rtx result_reg = gen_reg_rtx (word_mode);
  rtx final_move_label = gen_label_rtx ();
  rtx final_label = gen_label_rtx ();
  rtx begin_compare_label = NULL;

  if (base_align < required_align)
    {
      /* Generate code that checks distance to 4k boundary for this case.  */
      begin_compare_label = gen_label_rtx ();
      rtx strncmp_label = gen_label_rtx ();
      rtx jmp;

      /* Strncmp for power8 in glibc does this:
	 rldicl r8,r3,0,52
	 cmpldi cr7,r8,4096-16
	 bgt cr7,L(pagecross) */

      /* Make sure that the length we use for the alignment test and
	 the subsequent code generation are in agreement so we do not
	 go past the length we tested for a 4k boundary crossing.  */
      unsigned HOST_WIDE_INT align_test = compare_length;
      if (align_test < required_align)
	{
	  align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test);
	  base_align = align_test;
	}
      else
	{
	  align_test = ROUND_UP (align_test, required_align);
	  base_align = required_align;
	}

      if (align1 < required_align)
	expand_strncmp_align_check (strncmp_label, src1_addr, align_test);
      if (align2 < required_align)
	expand_strncmp_align_check (strncmp_label, src2_addr, align_test);

      /* Now generate the following sequence:
	 - branch to begin_compare
	 - strncmp_label
	 - call to strncmp
	 - branch to final_label
	 - begin_compare_label */

      rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref));
      JUMP_LABEL (jmp) = begin_compare_label;
      LABEL_NUSES (begin_compare_label) += 1;
      emit_barrier ();

      emit_label (strncmp_label);

      /* Page-cross path: fall back to the library routine for the
	 whole comparison.  */
      if (no_length)
	{
	  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   force_reg (Pmode, src1_addr), Pmode,
				   force_reg (Pmode, src2_addr), Pmode);
	}
      else
	{
	  /* -m32 -mpowerpc64 results in word_mode being DImode even
	     though otherwise it is 32-bit.  The length arg to strncmp
	     is a size_t which will be the same size as pointers.  */
	  rtx len_rtx = gen_reg_rtx (Pmode);
	  emit_move_insn (len_rtx, gen_int_mode (bytes, Pmode));

	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   force_reg (Pmode, src1_addr), Pmode,
				   force_reg (Pmode, src2_addr), Pmode,
				   len_rtx, Pmode);
	}

      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
      JUMP_LABEL (jmp) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
      emit_label (begin_compare_label);
    }

  rtx cleanup_label = NULL;
  rtx s1addr = NULL, s2addr = NULL, off_reg = NULL, vec_result = NULL;

  /* Generate a sequence of GPR or VEC/VSX instructions to compare out
     to the length specified.  */
  if (use_vec)
    {
      s1addr = gen_reg_rtx (Pmode);
      s2addr = gen_reg_rtx (Pmode);
      off_reg = gen_reg_rtx (Pmode);
      vec_result = gen_reg_rtx (load_mode);
      emit_move_insn (result_reg, GEN_INT (0));
      expand_cmp_vec_sequence (compare_length,
			       orig_src1, orig_src2,
			       s1addr, s2addr, off_reg,
			       tmp_reg_src1, tmp_reg_src2,
			       vec_result,
			       equality_compare_rest,
			       &cleanup_label, final_move_label, true);
    }
  else
    expand_strncmp_gpr_sequence (compare_length, base_align,
				 orig_src1, orig_src2,
				 tmp_reg_src1, tmp_reg_src2,
				 result_reg,
				 equality_compare_rest,
				 &cleanup_label, final_move_label);

  offset = compare_length;

  if (equality_compare_rest)
    {
      /* Update pointers past what has been compared already.  */
      rtx src1 = force_reg (Pmode,
			    gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (offset)));
      rtx src2 = force_reg (Pmode,
			    gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (offset)));

      /* Construct call to strcmp/strncmp to compare the rest of the string.  */
      if (no_length)
	{
	  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   src1, Pmode, src2, Pmode);
	}
      else
	{
	  /* Remaining length for the tail call.  */
	  rtx len_rtx = gen_reg_rtx (Pmode);
	  emit_move_insn (len_rtx, gen_int_mode (bytes - compare_length, Pmode));
	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   src1, Pmode, src2, Pmode, len_rtx, Pmode);
	}

      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
      JUMP_LABEL (jmp) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
    }

  /* Cleanup code: compute the signed result from the first zero or
     differing byte found by the inline sequence above.  */
  if (cleanup_label)
    emit_label (cleanup_label);

  if (use_vec)
    emit_final_compare_vec (tmp_reg_src1, tmp_reg_src2, result_reg,
			    s1addr, s2addr, orig_src1, orig_src2,
			    off_reg, vec_result);
  else
    emit_final_str_compare_gpr (tmp_reg_src1, tmp_reg_src2, result_reg);

  emit_label (final_move_label);
  emit_insn (gen_movsi (target,
			gen_lowpart (SImode, result_reg)));
  emit_label (final_label);
  return true;
}
2682
19db0ebb
AS
2683/* Generate loads and stores for a move of v4si mode using lvx/stvx.
2684 This uses altivec_{l,st}vx_<mode>_internal which use unspecs to
2685 keep combine from changing what instruction gets used.
2686
2687 DEST is the destination for the data.
2688 SRC is the source of the data for the move. */
2689
2690static rtx
2691gen_lvx_v4si_move (rtx dest, rtx src)
2692{
2693 gcc_assert (MEM_P (dest) ^ MEM_P (src));
2694 gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode);
2695
2696 if (MEM_P (dest))
2697 return gen_altivec_stvx_v4si_internal (dest, src);
2698 else
2699 return gen_altivec_lvx_v4si_internal (dest, src);
2700}
2701
afd97163
AS
2702static rtx
2703gen_lxvl_stxvl_move (rtx dest, rtx src, int length)
2704{
2705 gcc_assert (MEM_P (dest) ^ MEM_P (src));
2706 gcc_assert (GET_MODE (dest) == V16QImode && GET_MODE (src) == V16QImode);
2707 gcc_assert (length <= 16);
2708
2709 bool is_store = MEM_P (dest);
2710 rtx addr;
2711
2712 /* If the address form is not a simple register, make it so. */
2713 if (is_store)
2714 addr = XEXP (dest, 0);
2715 else
2716 addr = XEXP (src, 0);
2717
2718 if (!REG_P (addr))
2719 addr = force_reg (Pmode, addr);
2720
2721 rtx len = force_reg (DImode, gen_int_mode (length, DImode));
2722 if (is_store)
2723 return gen_stxvl (src, addr, len);
2724 else
2725 return gen_lxvl (dest, addr, len);
2726}
2727
8845cb37
AS
/* Expand a block move operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the source
   operands[2] is the length
   operands[3] is the alignment

   MIGHT_OVERLAP indicates the source and destination may overlap
   (a memmove-style move); in that case the expansion must complete in
   a single batched pass or it gives up and returns 0.  */

/* Maximum number of load/store pairs batched up before they are
   emitted: all loads are emitted before any store so overlapping
   regions within one batch are handled correctly.  */
#define MAX_MOVE_REG 4

int
expand_block_move (rtx operands[], bool might_overlap)
{
  rtx orig_dest = operands[0];
  rtx orig_src	= operands[1];
  rtx bytes_rtx	= operands[2];
  rtx align_rtx = operands[3];
  int constp	= CONST_INT_P (bytes_rtx);
  int align;
  int bytes;
  int offset;
  int move_bytes;
  rtx loads[MAX_MOVE_REG];
  rtx stores[MAX_MOVE_REG];
  int num_reg = 0;

  /* If this is not a fixed size move, just call memcpy */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment */
  gcc_assert (CONST_INT_P (align_rtx));
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to move? */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  /* Past this size, let the library routine handle it.  */
  if (bytes > rs6000_block_move_inline_limit)
    return 0;

  /* Remember the full size: the lxvl/stxvl case below only fires for a
     tail of a larger copy (orig_bytes > 16), never for a whole small one.  */
  int orig_bytes = bytes;
  for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
    {
      union {
	rtx (*mov) (rtx, rtx);
	rtx (*movlen) (rtx, rtx, int);
      } gen_func;
      machine_mode mode = BLKmode;
      rtx src, dest;
      bool move_with_length = false;

      /* Use OOmode for paired vsx load/store.  Use V2DI for single
	 unaligned vsx load/store, for consistency with what other
	 expansions (compare) already do, and so we can use lxvd2x on
	 p8.  Order is VSX pair unaligned, VSX unaligned, Altivec, VSX
	 with length < 16 (if allowed), then gpr load/store.  */

      if (TARGET_MMA && TARGET_BLOCK_OPS_UNALIGNED_VSX
	  && TARGET_BLOCK_OPS_VECTOR_PAIR
	  && bytes >= 32
	  && (align >= 256 || !STRICT_ALIGNMENT))
	{
	  /* 32-byte VSX register pair (lxvp/stxvp).  */
	  move_bytes = 32;
	  mode = OOmode;
	  gen_func.mov = gen_movoo;
	}
      else if (TARGET_POWERPC64 && TARGET_BLOCK_OPS_UNALIGNED_VSX
	       && VECTOR_MEM_VSX_P (V2DImode)
	       && bytes >= 16 && (align >= 128 || !STRICT_ALIGNMENT))
	{
	  /* Single unaligned 16-byte VSX move.  */
	  move_bytes = 16;
	  mode = V2DImode;
	  gen_func.mov = gen_vsx_movv2di_64bit;
	}
      else if (TARGET_BLOCK_OPS_UNALIGNED_VSX
	       /* Only use lxvl/stxvl on 64bit POWER10.  */
	       && TARGET_POWER10
	       && TARGET_64BIT
	       && bytes < 16
	       && orig_bytes > 16
	       /* A power-of-two tail is handled as cheaply by a single
		  gpr load/store, so exclude those sizes.  */
	       && !(bytes == 1
		    || bytes == 2
		    || bytes == 4
		    || bytes == 8)
	       && (align >= 128
		   || !STRICT_ALIGNMENT))
	{
	  /* Only use lxvl/stxvl if it could replace multiple ordinary
	     loads+stores.  Also don't use it unless we likely already
	     did one vsx copy so we aren't mixing gpr and vsx.  */
	  move_bytes = bytes;
	  mode = V16QImode;
	  gen_func.movlen = gen_lxvl_stxvl_move;
	  move_with_length = true;
	}
      else if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
	{
	  /* Aligned 16-byte Altivec move (lvx/stvx).  */
	  move_bytes = 16;
	  mode = V4SImode;
	  gen_func.mov = gen_lvx_v4si_move;
	}
      else if (bytes >= 8 && TARGET_POWERPC64
	       && (align >= 64 || !STRICT_ALIGNMENT))
	{
	  move_bytes = 8;
	  mode = DImode;
	  gen_func.mov = gen_movdi;
	  if (offset == 0 && align < 64)
	    {
	      rtx addr;

	      /* If the address form is reg+offset with offset not a
		 multiple of four, reload into reg indirect form here
		 rather than waiting for reload.  This way we get one
		 reload, not one per load and/or store.  */
	      addr = XEXP (orig_dest, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && CONST_INT_P (XEXP (addr, 1))
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_dest = replace_equiv_address (orig_dest, addr);
		}
	      addr = XEXP (orig_src, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && CONST_INT_P (XEXP (addr, 1))
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_src = replace_equiv_address (orig_src, addr);
		}
	    }
	}
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
	{			/* move 4 bytes */
	  move_bytes = 4;
	  mode = SImode;
	  gen_func.mov = gen_movsi;
	}
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
	{			/* move 2 bytes */
	  move_bytes = 2;
	  mode = HImode;
	  gen_func.mov = gen_movhi;
	}
      else /* move 1 byte at a time */
	{
	  move_bytes = 1;
	  mode = QImode;
	  gen_func.mov = gen_movqi;
	}

      /* If we can't succeed in doing the move in one pass, we can't
	 do it in the might_overlap case.  Bail out and return
	 failure.  We test num_reg + 1 >= MAX_MOVE_REG here to check
	 the same condition as the test of num_reg >= MAX_MOVE_REG
	 that is done below after the increment of num_reg.  */
      if (might_overlap && num_reg + 1 >= MAX_MOVE_REG
	  && bytes > move_bytes)
	return 0;

      /* Mode is always set to something other than BLKmode by one of the
	 cases of the if statement above.  */
      gcc_assert (mode != BLKmode);

      src = adjust_address (orig_src, mode, offset);
      dest = adjust_address (orig_dest, mode, offset);

      rtx tmp_reg = gen_reg_rtx (mode);

      /* Queue the load/store pair; the movlen variant additionally
	 carries the explicit byte count for lxvl/stxvl.  */
      if (move_with_length)
	{
	  loads[num_reg] = (*gen_func.movlen) (tmp_reg, src, move_bytes);
	  stores[num_reg++] = (*gen_func.movlen) (dest, tmp_reg, move_bytes);
	}
      else
	{
	  loads[num_reg] = (*gen_func.mov) (tmp_reg, src);
	  stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
	}

      /* Emit loads and stores saved up.  All loads precede all stores
	 so an overlapping batch still reads the original bytes.  */
      if (num_reg >= MAX_MOVE_REG || bytes == move_bytes)
	{
	  int i;
	  for (i = 0; i < num_reg; i++)
	    emit_insn (loads[i]);
	  for (i = 0; i < num_reg; i++)
	    emit_insn (stores[i]);
	  num_reg = 0;
	}

    }

  return 1;
}
This page took 2.767119 seconds and 5 git commands to generate.