This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] fold builtin_tolower, builtin_toupper


gcc/ChangeLog

2015-07-09  Bernhard Reutner-Fischer  <aldot@gcc.gnu.org>

	* builtins.c (fold_builtin_tolower, fold_builtin_toupper): New
	static functions.
	(fold_builtin_1): Handle BUILT_IN_TOLOWER, BUILT_IN_TOUPPER.

Signed-off-by: Bernhard Reutner-Fischer <rep.dot.nop@gmail.com>
---
 gcc/builtins.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)

Using the three testcases attached to PR66741 where the -1.c one is using
builtins
$ for i in 0 1 2;do gcc -o tolower_strcpy-$i tolower_strcpy-$i.c -Ofast -W -Wall -Wextra -pedantic -DMAIN -msse4.2;done

pristine (trunk@225368):
# tolower_strcpy-0

real	0m6.068s
user	0m3.204s
sys	0m2.840s
# tolower_strcpy-1

real	0m8.097s
user	0m5.548s
sys	0m2.528s
# tolower_strcpy-2

real	0m3.568s
user	0m0.804s
sys	0m2.748s

trunk@225368 + fold tolower/toupper below

# tolower_strcpy-0

real	0m6.055s
user	0m3.212s
sys	0m2.832s
# tolower_strcpy-1

real	0m5.383s
user	0m2.464s
sys	0m2.900s
# tolower_strcpy-2

real	0m3.605s
user	0m0.668s
sys	0m2.924s

The tolower loop now ends up as
.L5:
        movsbl  (%rbx), %edx
        leal    32(%rdx), %ecx
        movl    %edx, %eax
        subl    $65, %edx
        cmpl    $25, %edx
        cmovbe  %ecx, %eax
        addq    $1, %rbx
        movb    %al, -1(%rbx)
        cmpq    %rsi, %rbx
        jne     .L5

instead of the former call

.L5:
        movsbl  (%rbx), %edi
        addq    $1, %rbx
        call    tolower
        movb    %al, -1(%rbx)
        cmpq    %rbp, %rbx
        jne     .L5

Would something like the attached be OK for trunk after proper testing?
Any advice on the inline questions about caching the lang_hooks
intermediate results?
Any hints on further steps towards fixing the PR?

I think the next step would be to try to teach graphite to fuse the two
loops in tolower_strcpy-0.c. Need to look at graphite..
Then see how to classify builtins that could be expanded early and what
breaks if doing so. This sounds like a potential disaster, fun.
Next, see why the vectorizer (or something else) does not pave the way
to using SSE instructions as tolower_strcpy-2.c does.

thanks,

diff --git a/gcc/builtins.c b/gcc/builtins.c
index 5f53342..421c908 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -204,6 +204,9 @@ static tree fold_builtin_strrchr (location_t, tree, tree, tree);
 static tree fold_builtin_strspn (location_t, tree, tree);
 static tree fold_builtin_strcspn (location_t, tree, tree);
 
+static tree fold_builtin_tolower (location_t, tree);
+static tree fold_builtin_toupper (location_t, tree);
+
 static rtx expand_builtin_object_size (tree);
 static rtx expand_builtin_memory_chk (tree, rtx, machine_mode,
 				      enum built_in_function);
@@ -10285,6 +10288,12 @@ fold_builtin_1 (location_t loc, tree fndecl, tree arg0)
     case BUILT_IN_ISDIGIT:
       return fold_builtin_isdigit (loc, arg0);
 
+    case BUILT_IN_TOLOWER:
+      return fold_builtin_tolower (loc, arg0);
+
+    case BUILT_IN_TOUPPER:
+      return fold_builtin_toupper (loc, arg0);
+
     CASE_FLT_FN (BUILT_IN_FINITE):
     case BUILT_IN_FINITED32:
     case BUILT_IN_FINITED64:
@@ -11208,6 +11217,96 @@ fold_builtin_strcspn (location_t loc, tree s1, tree s2)
     }
 }
 
+
+/* Simplify a call to the tolower builtin.  ARG is the argument to the call.
+
+   Return NULL_TREE if no simplification was possible, otherwise return the
+   simplified form of the call as a tree.  */
+
+static tree
+fold_builtin_tolower (location_t loc, tree arg)
+{
+  if (!validate_arg (arg, INTEGER_TYPE))
+    return NULL_TREE;
+
+  /* Transform tolower (c) into a range check and a conditional add:
+       unsigned tem = arg - 'A';
+       return tem <= 'Z' - 'A' ? arg + ('a' - 'A') : arg;
+     NOTE(review): ISO C makes tolower locale-dependent; this folding
+     hard-codes the "C" locale mapping -- confirm that is acceptable.  */
+  unsigned HOST_WIDE_INT target_A = lang_hooks.to_target_charset ('A');
+  unsigned HOST_WIDE_INT target_Z = lang_hooks.to_target_charset ('Z');
+  unsigned HOST_WIDE_INT target_a = lang_hooks.to_target_charset ('a');
+  if (target_A == 0 || target_Z == 0 || target_a == 0)
+    return NULL_TREE;
+
+  /* ??? Should x19 and x20 be cached in static storage?  A single range
+     check is only valid if 'A'..'Z' are contiguous in the target charset
+     (they are not in e.g. EBCDIC), so punt otherwise.  */
+  unsigned HOST_WIDE_INT x19 = target_Z - target_A;
+  unsigned HOST_WIDE_INT x20 = target_a - target_A;
+  if (x19 != 25)
+    return NULL_TREE;
+
+  /* ARG is referenced three times below; save it first so that side
+     effects, as in tolower (*p++), are evaluated only once.  */
+  arg = fold_convert_loc (loc, unsigned_type_node, builtin_save_expr (arg));
+  tree tem = fold_build2_loc (loc, MINUS_EXPR, unsigned_type_node, arg,
+			      build_int_cst (unsigned_type_node, target_A));
+  tem = fold_build2_loc (loc, LE_EXPR, integer_type_node, tem,
+			 build_int_cst (unsigned_type_node, x19));
+  tree lowered = fold_build2_loc (loc, PLUS_EXPR, unsigned_type_node, arg,
+				  build_int_cst (unsigned_type_node, x20));
+  tem = fold_build3_loc (loc, COND_EXPR, unsigned_type_node, tem,
+			 lowered, arg);
+  return fold_convert_loc (loc, integer_type_node, tem);
+}
+
+/* Simplify a call to the toupper builtin.  ARG is the argument to the call.
+
+   Return NULL_TREE if no simplification was possible, otherwise return the
+   simplified form of the call as a tree.  */
+
+static tree
+fold_builtin_toupper (location_t loc, tree arg)
+{
+  if (!validate_arg (arg, INTEGER_TYPE))
+    return NULL_TREE;
+
+  /* Transform toupper (c) into a range check and a conditional subtract:
+       unsigned tem = arg - 'a';
+       return tem <= 'z' - 'a' ? arg - ('a' - 'A') : arg;
+     NOTE(review): ISO C makes toupper locale-dependent; this folding
+     hard-codes the "C" locale mapping -- confirm that is acceptable.  */
+  unsigned HOST_WIDE_INT target_A = lang_hooks.to_target_charset ('A');
+  unsigned HOST_WIDE_INT target_z = lang_hooks.to_target_charset ('z');
+  unsigned HOST_WIDE_INT target_a = lang_hooks.to_target_charset ('a');
+  if (target_A == 0 || target_z == 0 || target_a == 0)
+    return NULL_TREE;
+
+  /* ??? Should x19 and x20 be cached in static storage?  A single range
+     check is only valid if 'a'..'z' are contiguous in the target charset
+     (they are not in e.g. EBCDIC), so punt otherwise.  */
+  unsigned HOST_WIDE_INT x19 = target_z - target_a;
+  unsigned HOST_WIDE_INT x20 = target_a - target_A;
+  if (x19 != 25)
+    return NULL_TREE;
+
+  /* ARG is referenced three times below; save it first so that side
+     effects, as in toupper (*p++), are evaluated only once.  */
+  arg = fold_convert_loc (loc, unsigned_type_node, builtin_save_expr (arg));
+  tree tem = fold_build2_loc (loc, MINUS_EXPR, unsigned_type_node, arg,
+			      build_int_cst (unsigned_type_node, target_a));
+  tem = fold_build2_loc (loc, LE_EXPR, integer_type_node, tem,
+			 build_int_cst (unsigned_type_node, x19));
+  tree raised = fold_build2_loc (loc, MINUS_EXPR, unsigned_type_node, arg,
+				 build_int_cst (unsigned_type_node, x20));
+  tem = fold_build3_loc (loc, COND_EXPR, unsigned_type_node, tem,
+			 raised, arg);
+  return fold_convert_loc (loc, integer_type_node, tem);
+}
+
+
 /* Fold the next_arg or va_start call EXP. Returns true if there was an error
    produced.  False otherwise.  This is done so that we don't output the error
    or warning twice or three times.  */
-- 
2.1.4


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]