This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH] fold builtin_tolower, builtin_toupper
- From: Bernhard Reutner-Fischer <rep dot dot dot nop at gmail dot com>
- To: gcc-patches at gcc dot gnu dot org
- Cc: Bernhard Reutner-Fischer <rep dot dot dot nop at gmail dot com>, "Joseph S . Myers" <joseph at codesourcery dot com>, Richard Biener <rguenther at suse dot de>
- Date: Thu, 9 Jul 2015 14:58:09 +0200
- Subject: [PATCH] fold builtin_tolower, builtin_toupper
- Authentication-results: sourceware.org; auth=none
gcc/ChangeLog
2015-07-09 Bernhard Reutner-Fischer <aldot@gcc.gnu.org>
* builtins.c (fold_builtin_tolower, fold_builtin_toupper): New
static functions.
(fold_builtin_1): Handle BUILT_IN_TOLOWER, BUILT_IN_TOUPPER.
Signed-off-by: Bernhard Reutner-Fischer <rep.dot.nop@gmail.com>
---
gcc/builtins.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 99 insertions(+)
Using the three testcases attached to PR66741 (the -1.c one uses
builtins):
$ for i in 0 1 2;do gcc -o tolower_strcpy-$i tolower_strcpy-$i.c -Ofast -W -Wall -Wextra -pedantic -DMAIN -msse4.2;done
pristine (trunk@225368):
# tolower_strcpy-0
real 0m6.068s
user 0m3.204s
sys 0m2.840s
# tolower_strcpy-1
real 0m8.097s
user 0m5.548s
sys 0m2.528s
# tolower_strcpy-2
real 0m3.568s
user 0m0.804s
sys 0m2.748s
trunk@225368 + fold tolower/toupper below
# tolower_strcpy-0
real 0m6.055s
user 0m3.212s
sys 0m2.832s
# tolower_strcpy-1
real 0m5.383s
user 0m2.464s
sys 0m2.900s
# tolower_strcpy-2
real 0m3.605s
user 0m0.668s
sys 0m2.924s
The tolower loop now ends up as
.L5:
movsbl (%rbx), %edx
leal 32(%rdx), %ecx
movl %edx, %eax
subl $65, %edx
cmpl $25, %edx
cmovbe %ecx, %eax
addq $1, %rbx
movb %al, -1(%rbx)
cmpq %rsi, %rbx
jne .L5
instead of the former call
.L5:
movsbl (%rbx), %edi
addq $1, %rbx
call tolower
movb %al, -1(%rbx)
cmpq %rbp, %rbx
jne .L5
Would something like the attached be OK for trunk after proper testing?
Any advice on the inline questions regarding caching the lang_hooks
intermediate results?
Any hints on further steps towards fixing the PR?
I think the next step would be to try to teach graphite to fuse the two
loops in tolower_strcpy-0.c. Need to look at graphite..
Then see how to classify builtins that could be expanded early and what
breaks if doing so. This sounds like a potential disaster, fun.
Next, see why the vectorizer (or something else) does not pave the way
to using SSE instructions the way tolower_strcpy-2.c does.
thanks,
diff --git a/gcc/builtins.c b/gcc/builtins.c
index 5f53342..421c908 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -204,6 +204,9 @@ static tree fold_builtin_strrchr (location_t, tree, tree, tree);
static tree fold_builtin_strspn (location_t, tree, tree);
static tree fold_builtin_strcspn (location_t, tree, tree);
+static tree fold_builtin_tolower (location_t, tree);
+static tree fold_builtin_toupper (location_t, tree);
+
static rtx expand_builtin_object_size (tree);
static rtx expand_builtin_memory_chk (tree, rtx, machine_mode,
enum built_in_function);
@@ -10285,6 +10288,12 @@ fold_builtin_1 (location_t loc, tree fndecl, tree arg0)
case BUILT_IN_ISDIGIT:
return fold_builtin_isdigit (loc, arg0);
+ case BUILT_IN_TOLOWER:
+ return fold_builtin_tolower (loc, arg0);
+
+ case BUILT_IN_TOUPPER:
+ return fold_builtin_toupper (loc, arg0);
+
CASE_FLT_FN (BUILT_IN_FINITE):
case BUILT_IN_FINITED32:
case BUILT_IN_FINITED64:
@@ -11208,6 +11217,96 @@ fold_builtin_strcspn (location_t loc, tree s1, tree s2)
}
}
+
+/* Simplify a call to the tolower builtin. ARG is the argument to the call;
+ LOC is the location handed to the *_loc folders used below.
+ Return NULL_TREE if no simplification was possible, otherwise return the
+ simplified form of the call as a tree. */
+
+static tree
+fold_builtin_tolower (location_t loc, tree arg)
+{
+ if (!validate_arg (arg, INTEGER_TYPE))
+ return NULL_TREE;
+
+ /* Transform tolower(c) into an unsigned range check plus a conditional add,
+ using the target charset values of 'A', 'Z' and 'a' (give up if any is 0).
+ More specifically:
+ unsigned tem = arg - 'A';
+ if (tem <= ('Z' - 'A'))
+ arg += 'a' - 'A';
+ return arg;
+ NOTE(review): assumes 'A'..'Z' is contiguous and ignores the locale -- confirm. */
+ unsigned HOST_WIDE_INT target_A = lang_hooks.to_target_charset ('A');
+ unsigned HOST_WIDE_INT target_Z = lang_hooks.to_target_charset ('Z');
+ unsigned HOST_WIDE_INT target_a = lang_hooks.to_target_charset ('a');
+ if (target_A == 0
+ || target_Z == 0
+ || target_a == 0)
+ return NULL_TREE;
+
+ arg = fold_convert_loc (loc, unsigned_type_node, arg);
+ tree tem = fold_build2 (MINUS_EXPR, unsigned_type_node, arg,
+ build_int_cst (unsigned_type_node, target_A));
+ /* x19 is the span of the upper-case range ('Z' - 'A', 0x19 in ASCII) and x20
+ is the case offset ('a' - 'A', 0x20 in ASCII). ??? They would better live in static
+ storage; Think: static struct static_fold_tolower {uHWI x19, x20; unsigned probe_done}; */
+ unsigned HOST_WIDE_INT x19 = target_Z - target_A;
+ unsigned HOST_WIDE_INT x20 = target_a - target_A;
+ tem = fold_build2_loc (loc, LE_EXPR, integer_type_node, tem,
+ build_int_cst (unsigned_type_node, x19));
+ tem = fold_build3_loc (loc, COND_EXPR, unsigned_type_node, tem,
+ fold_build2 (PLUS_EXPR, unsigned_type_node, arg,
+ build_int_cst (unsigned_type_node, x20)),
+ arg);
+ return fold_convert_loc (loc, integer_type_node, tem);
+}
+
+/* Simplify a call to the toupper builtin. ARG is the argument to the call;
+ LOC is the location handed to the *_loc folders used below.
+ Return NULL_TREE if no simplification was possible, otherwise return the
+ simplified form of the call as a tree. */
+
+static tree
+fold_builtin_toupper (location_t loc, tree arg)
+{
+ if (!validate_arg (arg, INTEGER_TYPE))
+ return NULL_TREE;
+
+ /* Transform toupper(c) into an unsigned range check plus a conditional subtract,
+ using the target charset values of 'A', 'z' and 'a' (give up if any is 0).
+ More specifically:
+ unsigned tem = arg - 'a';
+ if (tem <= ('z' - 'a'))
+ arg -= 'a' - 'A';
+ return arg;
+ NOTE(review): assumes 'a'..'z' is contiguous and ignores the locale -- confirm. */
+ unsigned HOST_WIDE_INT target_A = lang_hooks.to_target_charset ('A');
+ unsigned HOST_WIDE_INT target_z = lang_hooks.to_target_charset ('z');
+ unsigned HOST_WIDE_INT target_a = lang_hooks.to_target_charset ('a');
+ if (target_A == 0
+ || target_z == 0
+ || target_a == 0)
+ return NULL_TREE;
+
+ arg = fold_convert_loc (loc, unsigned_type_node, arg);
+ tree tem = fold_build2 (MINUS_EXPR, unsigned_type_node, arg,
+ build_int_cst (unsigned_type_node, target_a));
+ /* x19 is the span of the lower-case range ('z' - 'a', 0x19 in ASCII) and x20
+ is the case offset ('a' - 'A', 0x20 in ASCII). ??? They would better live in static
+ storage; Think: static struct static_fold_toupper {uHWI x19, x20; unsigned probe_done}; */
+ unsigned HOST_WIDE_INT x19 = target_z - target_a;
+ unsigned HOST_WIDE_INT x20 = target_a - target_A;
+ tem = fold_build2_loc (loc, LE_EXPR, integer_type_node, tem,
+ build_int_cst (unsigned_type_node, x19));
+ tem = fold_build3_loc (loc, COND_EXPR, unsigned_type_node, tem,
+ fold_build2 (MINUS_EXPR, unsigned_type_node, arg,
+ build_int_cst (unsigned_type_node, x20)),
+ arg);
+ return fold_convert_loc (loc, integer_type_node, tem);
+}
+
+
/* Fold the next_arg or va_start call EXP. Returns true if there was an error
produced. False otherwise. This is done so that we don't output the error
or warning twice or three times. */
--
2.1.4