This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH 23/27] Documentation: the "intro" subdirectory


The "intro" subdirectory of gcc/jit/docs consists of a 4-part tutorial
aimed at experienced developers who are new users of the library, taking
them from beginning use all the way up to adding JIT-compilation to an
interpreter.

gcc/jit/
	* docs/intro/factorial.png: New.
	* docs/intro/index.rst: New.
	* docs/intro/sum-of-squares.png: New.
	* docs/intro/tutorial01.rst: New.
	* docs/intro/tutorial02.rst: New.
	* docs/intro/tutorial03.rst: New.
	* docs/intro/tutorial04.rst: New.
---
 gcc/jit/docs/intro/factorial.png      |  Bin 0 -> 183838 bytes
 gcc/jit/docs/intro/index.rst          |   27 +
 gcc/jit/docs/intro/sum-of-squares.png |  Bin 0 -> 22839 bytes
 gcc/jit/docs/intro/tutorial01.rst     |   52 ++
 gcc/jit/docs/intro/tutorial02.rst     |  349 +++++++++++
 gcc/jit/docs/intro/tutorial03.rst     |  378 +++++++++++
 gcc/jit/docs/intro/tutorial04.rst     | 1108 +++++++++++++++++++++++++++++++++
 7 files changed, 1914 insertions(+)
 create mode 100644 gcc/jit/docs/intro/factorial.png
 create mode 100644 gcc/jit/docs/intro/index.rst
 create mode 100644 gcc/jit/docs/intro/sum-of-squares.png
 create mode 100644 gcc/jit/docs/intro/tutorial01.rst
 create mode 100644 gcc/jit/docs/intro/tutorial02.rst
 create mode 100644 gcc/jit/docs/intro/tutorial03.rst
 create mode 100644 gcc/jit/docs/intro/tutorial04.rst

diff --git a/gcc/jit/docs/intro/factorial.png b/gcc/jit/docs/intro/factorial.png
new file mode 100644
index 0000000..dff47ce
Binary files /dev/null and b/gcc/jit/docs/intro/factorial.png differ
diff --git a/gcc/jit/docs/intro/index.rst b/gcc/jit/docs/intro/index.rst
new file mode 100644
index 0000000..d3bcec9
--- /dev/null
+++ b/gcc/jit/docs/intro/index.rst
@@ -0,0 +1,27 @@
+.. Copyright (C) 2014 Free Software Foundation, Inc.
+   Originally contributed by David Malcolm <dmalcolm@redhat.com>
+
+   This is free software: you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see
+   <http://www.gnu.org/licenses/>.
+
+Tutorial
+========
+
+.. toctree::
+   :maxdepth: 2
+
+   tutorial01.rst
+   tutorial02.rst
+   tutorial03.rst
+   tutorial04.rst
diff --git a/gcc/jit/docs/intro/sum-of-squares.png b/gcc/jit/docs/intro/sum-of-squares.png
new file mode 100644
index 0000000..7a3b4af
Binary files /dev/null and b/gcc/jit/docs/intro/sum-of-squares.png differ
diff --git a/gcc/jit/docs/intro/tutorial01.rst b/gcc/jit/docs/intro/tutorial01.rst
new file mode 100644
index 0000000..b1a5128
--- /dev/null
+++ b/gcc/jit/docs/intro/tutorial01.rst
@@ -0,0 +1,52 @@
+.. Copyright (C) 2014 Free Software Foundation, Inc.
+   Originally contributed by David Malcolm <dmalcolm@redhat.com>
+
+   This is free software: you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see
+   <http://www.gnu.org/licenses/>.
+
+.. default-domain:: c
+
+Tutorial part 1: "Hello world"
+==============================
+
+Before we look at the details of the API, let's look at building and
+running programs that use the library.
+
+Here's a toy "hello world" program that uses the library to synthesize
+a call to `printf` and uses it to write a message to stdout.
+
+Don't worry about the content of the program for now; we'll cover
+the details in later parts of this tutorial.
+
+   .. literalinclude:: ../examples/tut01-hello-world.c
+    :language: c
+
+Copy the above to `tut01-hello-world.c`.
+
+Assuming you have the jit library installed, build the test program
+using:
+
+.. code-block:: console
+
+  $ gcc \
+      tut01-hello-world.c \
+      -o tut01-hello-world \
+      -lgccjit
+
+You should then be able to run the built program:
+
+.. code-block:: console
+
+  $ ./tut01-hello-world
+  hello world
diff --git a/gcc/jit/docs/intro/tutorial02.rst b/gcc/jit/docs/intro/tutorial02.rst
new file mode 100644
index 0000000..f52499e
--- /dev/null
+++ b/gcc/jit/docs/intro/tutorial02.rst
@@ -0,0 +1,349 @@
+.. Copyright (C) 2014 Free Software Foundation, Inc.
+   Originally contributed by David Malcolm <dmalcolm@redhat.com>
+
+   This is free software: you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see
+   <http://www.gnu.org/licenses/>.
+
+.. default-domain:: c
+
+Tutorial part 2: Creating a trivial machine code function
+---------------------------------------------------------
+
+Consider this C function:
+
+.. code-block:: c
+
+   int square (int i)
+   {
+     return i * i;
+   }
+
+How can we construct this at run-time using libgccjit?
+
+First we need to include the relevant header:
+
+.. code-block:: c
+
+  #include <libgccjit.h>
+
+All state associated with compilation is associated with a
+:c:type:`gcc_jit_context *`.
+
+Create one using :c:func:`gcc_jit_context_acquire`:
+
+.. code-block:: c
+
+  gcc_jit_context *ctxt;
+  ctxt = gcc_jit_context_acquire ();
+
+The JIT library has a system of types.  It is statically-typed: every
+expression is of a specific type, fixed at compile-time.  In our example,
+all of the expressions are of the C `int` type, so let's obtain this from
+the context, as a :c:type:`gcc_jit_type *`, using
+:c:func:`gcc_jit_context_get_type`:
+
+.. code-block:: c
+
+  gcc_jit_type *int_type =
+    gcc_jit_context_get_type (ctxt, GCC_JIT_TYPE_INT);
+
+:c:type:`gcc_jit_type *` is an example of a "contextual" object: every
+entity in the API is associated with a :c:type:`gcc_jit_context *`.
+
+Memory management is easy: all such "contextual" objects are automatically
+cleaned up for you when the context is released, using
+:c:func:`gcc_jit_context_release`:
+
+.. code-block:: c
+
+  gcc_jit_context_release (ctxt);
+
+so you don't need to manually track and cleanup all objects, just the
+contexts.
+
+Although the API is C-based, there is a form of class hierarchy, which
+looks like this::
+
+  +- gcc_jit_object
+      +- gcc_jit_location
+      +- gcc_jit_type
+         +- gcc_jit_struct
+      +- gcc_jit_field
+      +- gcc_jit_function
+      +- gcc_jit_block
+      +- gcc_jit_rvalue
+          +- gcc_jit_lvalue
+             +- gcc_jit_param
+
+There are casting methods for upcasting from subclasses to parent classes.
+For example, :c:func:`gcc_jit_type_as_object`:
+
+.. code-block:: c
+
+   gcc_jit_object *obj = gcc_jit_type_as_object (int_type);
+
+One thing you can do with a :c:type:`gcc_jit_object *` is
+to ask it for a human-readable description, using
+:c:func:`gcc_jit_object_get_debug_string`:
+
+.. code-block:: c
+
+   printf ("obj: %s\n", gcc_jit_object_get_debug_string (obj));
+
+giving this text on stdout:
+
+.. code-block:: bash
+
+   obj: int
+
+This is invaluable when debugging.
+
+Let's create the function.  To do so, we first need to construct
+its single parameter, specifying its type and giving it a name,
+using :c:func:`gcc_jit_context_new_param`:
+
+.. code-block:: c
+
+  gcc_jit_param *param_i =
+    gcc_jit_context_new_param (ctxt, NULL, int_type, "i");
+
+Now we can create the function, using
+:c:func:`gcc_jit_context_new_function`:
+
+.. code-block:: c
+
+  gcc_jit_function *func =
+    gcc_jit_context_new_function (ctxt, NULL,
+                                  GCC_JIT_FUNCTION_EXPORTED,
+                                  int_type,
+                                  "square",
+                                  1, &param_i,
+                                  0);
+
+To define the code within the function, we must create basic blocks
+containing statements.
+
+Every basic block contains a list of statements, eventually terminated
+by a statement that either returns, or jumps to another basic block.
+
+Our function has no control-flow, so we just need one basic block:
+
+.. code-block:: c
+
+  gcc_jit_block *block = gcc_jit_function_new_block (func, NULL);
+
+Our basic block is relatively simple: it immediately terminates by
+returning the value of an expression.
+
+We can build the expression using :c:func:`gcc_jit_context_new_binary_op`:
+
+.. code-block:: c
+
+   gcc_jit_rvalue *expr =
+     gcc_jit_context_new_binary_op (
+       ctxt, NULL,
+       GCC_JIT_BINARY_OP_MULT, int_type,
+       gcc_jit_param_as_rvalue (param_i),
+       gcc_jit_param_as_rvalue (param_i));
+
+A :c:type:`gcc_jit_rvalue *` is another example of a
+:c:type:`gcc_jit_object *` subclass.  We can upcast it using
+:c:func:`gcc_jit_rvalue_as_object` and as before print it with
+:c:func:`gcc_jit_object_get_debug_string`.
+
+.. code-block:: c
+
+   printf ("expr: %s\n",
+           gcc_jit_object_get_debug_string (
+             gcc_jit_rvalue_as_object (expr)));
+
+giving this output:
+
+.. code-block:: bash
+
+   expr: i * i
+
+Creating the expression in itself doesn't do anything; we have to add
+this expression to a statement within the block.  In this case, we use it
+to build a return statement, which terminates the basic block:
+
+.. code-block:: c
+
+  gcc_jit_block_end_with_return (block, NULL, expr);
+
+OK, we've populated the context.  We can now compile it using
+:c:func:`gcc_jit_context_compile`:
+
+.. code-block:: c
+
+   gcc_jit_result *result;
+   result = gcc_jit_context_compile (ctxt);
+
+and get a :c:type:`gcc_jit_result *`.
+
+We can now use :c:func:`gcc_jit_result_get_code` to look up a specific
+machine code routine within the result, in this case, the function we
+created above.
+
+.. code-block:: c
+
+   void *fn_ptr = gcc_jit_result_get_code (result, "square");
+   if (!fn_ptr)
+     {
+       fprintf (stderr, "NULL fn_ptr");
+       goto error;
+     }
+
+We can now cast the pointer to an appropriate function pointer type, and
+then call it:
+
+.. code-block:: c
+
+  typedef int (*fn_type) (int);
+  fn_type square = (fn_type)fn_ptr;
+  printf ("result: %d", square (5));
+
+.. code-block:: bash
+
+  result: 25
+
+
+Options
+*******
+
+To get more information on what's going on, you can set debugging flags
+on the context using :c:func:`gcc_jit_context_set_bool_option`.
+
+.. (I'm deliberately not mentioning
+    :c:macro:`GCC_JIT_BOOL_OPTION_DUMP_INITIAL_TREE` here since I think
+    it's probably more of use to implementors than to users)
+
+Setting :c:macro:`GCC_JIT_BOOL_OPTION_DUMP_INITIAL_GIMPLE` will dump a
+C-like representation to stderr when you compile (GCC's "GIMPLE"
+representation):
+
+.. code-block:: c
+
+   gcc_jit_context_set_bool_option (
+     ctxt,
+     GCC_JIT_BOOL_OPTION_DUMP_INITIAL_GIMPLE,
+     1);
+   result = gcc_jit_context_compile (ctxt);
+
+.. code-block:: c
+
+  square (signed int i)
+  {
+    signed int D.260;
+
+    entry:
+    D.260 = i * i;
+    return D.260;
+  }
+
+We can see the generated machine code in assembler form (on stderr) by
+setting :c:macro:`GCC_JIT_BOOL_OPTION_DUMP_GENERATED_CODE` on the context
+before compiling:
+
+.. code-block:: c
+
+  gcc_jit_context_set_bool_option (
+    ctxt,
+    GCC_JIT_BOOL_OPTION_DUMP_GENERATED_CODE,
+    1);
+  result = gcc_jit_context_compile (ctxt);
+
+.. code-block:: gas
+
+        .file   "fake.c"
+        .text
+        .globl  square
+        .type   square, @function
+  square:
+  .LFB6:
+        .cfi_startproc
+        pushq   %rbp
+        .cfi_def_cfa_offset 16
+        .cfi_offset 6, -16
+        movq    %rsp, %rbp
+        .cfi_def_cfa_register 6
+        movl    %edi, -4(%rbp)
+  .L14:
+        movl    -4(%rbp), %eax
+        imull   -4(%rbp), %eax
+        popq    %rbp
+        .cfi_def_cfa 7, 8
+        ret
+        .cfi_endproc
+  .LFE6:
+        .size   square, .-square
+        .ident  "GCC: (GNU) 4.9.0 20131023 (Red Hat 0.2-0.5.1920c315ff984892399893b380305ab36e07b455.fc20)"
+        .section       .note.GNU-stack,"",@progbits
+
+By default, no optimizations are performed, the equivalent of GCC's
+`-O0` option.  We can turn things up to e.g. `-O3` by calling
+:c:func:`gcc_jit_context_set_int_option` with
+:c:macro:`GCC_JIT_INT_OPTION_OPTIMIZATION_LEVEL`:
+
+.. code-block:: c
+
+  gcc_jit_context_set_int_option (
+    ctxt,
+    GCC_JIT_INT_OPTION_OPTIMIZATION_LEVEL,
+    3);
+
+.. code-block:: gas
+
+        .file   "fake.c"
+        .text
+        .p2align 4,,15
+        .globl  square
+        .type   square, @function
+  square:
+  .LFB7:
+        .cfi_startproc
+  .L16:
+        movl    %edi, %eax
+        imull   %edi, %eax
+        ret
+        .cfi_endproc
+  .LFE7:
+        .size   square, .-square
+        .ident  "GCC: (GNU) 4.9.0 20131023 (Red Hat 0.2-0.5.1920c315ff984892399893b380305ab36e07b455.fc20)"
+        .section        .note.GNU-stack,"",@progbits
+
+Naturally this has only a small effect on such a trivial function.
+
+
+Full example
+************
+
+Here's what the above looks like as a complete program:
+
+   .. literalinclude:: ../examples/tut02-square.c
+    :lines: 1-
+    :language: c
+
+Building and running it:
+
+.. code-block:: console
+
+  $ gcc \
+      tut02-square.c \
+      -o tut02-square \
+      -lgccjit
+
+  # Run the built program:
+  $ ./tut02-square
+  result: 25
diff --git a/gcc/jit/docs/intro/tutorial03.rst b/gcc/jit/docs/intro/tutorial03.rst
new file mode 100644
index 0000000..0845150
--- /dev/null
+++ b/gcc/jit/docs/intro/tutorial03.rst
@@ -0,0 +1,378 @@
+.. Copyright (C) 2014 Free Software Foundation, Inc.
+   Originally contributed by David Malcolm <dmalcolm@redhat.com>
+
+   This is free software: you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see
+   <http://www.gnu.org/licenses/>.
+
+Tutorial part 3: Loops and variables
+------------------------------------
+Consider this C function:
+
+ .. code-block:: c
+
+  int loop_test (int n)
+  {
+    int sum = 0;
+    for (int i = 0; i < n; i++)
+      sum += i * i;
+    return sum;
+  }
+
+This example demonstrates some more features of libgccjit, with local
+variables and a loop.
+
+To break this down into libgccjit terms, it's usually easier to reword
+the `for` loop as a `while` loop, giving:
+
+ .. code-block:: c
+
+  int loop_test (int n)
+  {
+    int sum = 0;
+    int i = 0;
+    while (i < n)
+    {
+      sum += i * i;
+      i++;
+    }
+    return sum;
+  }
+
+Here's what the final control flow graph will look like:
+
+    .. figure:: sum-of-squares.png
+      :alt: image of a control flow graph
+
+As before, we include the libgccjit header and make a
+:c:type:`gcc_jit_context *`.
+
+.. code-block:: c
+
+  #include <libgccjit.h>
+
+  void test (void)
+  {
+    gcc_jit_context *ctxt;
+    ctxt = gcc_jit_context_acquire ();
+
+The function works with the C `int` type:
+
+.. code-block:: c
+
+  gcc_jit_type *the_type =
+    gcc_jit_context_get_type (ctxt, GCC_JIT_TYPE_INT);
+  gcc_jit_type *return_type = the_type;
+
+though we could equally well make it work on, say, `double`:
+
+.. code-block:: c
+
+  gcc_jit_type *the_type =
+    gcc_jit_context_get_type (ctxt, GCC_JIT_TYPE_DOUBLE);
+
+Let's build the function:
+
+.. code-block:: c
+
+  gcc_jit_param *n =
+    gcc_jit_context_new_param (ctxt, NULL, the_type, "n");
+  gcc_jit_param *params[1] = {n};
+  gcc_jit_function *func =
+    gcc_jit_context_new_function (ctxt, NULL,
+				  GCC_JIT_FUNCTION_EXPORTED,
+				  return_type,
+				  "loop_test",
+				  1, params, 0);
+
+Expressions: lvalues and rvalues
+********************************
+
+The base class of expression is the :c:type:`gcc_jit_rvalue *`,
+representing an expression that can be on the *right*-hand side of
+an assignment: a value that can be computed somehow, and assigned
+*to* a storage area (such as a variable).  It has a specific
+:c:type:`gcc_jit_type *`.
+
+Anothe important class is :c:type:`gcc_jit_lvalue *`.
+A :c:type:`gcc_jit_lvalue *`. is something that can of the *left*-hand
+side of an assignment: a storage area (such as a variable).
+
+In other words, every assignment can be thought of as:
+
+.. code-block:: c
+
+   LVALUE = RVALUE;
+
+Note that :c:type:`gcc_jit_lvalue *` is a subclass of
+:c:type:`gcc_jit_rvalue *`, where in an assignment of the form:
+
+.. code-block:: c
+
+   LVALUE_A = LVALUE_B;
+
+the `LVALUE_B` implies reading the current value of that storage
+area, assigning it into the `LVALUE_A`.
+
+So far the only expressions we've seen are `i * i`:
+
+.. code-block:: c
+
+   gcc_jit_rvalue *expr =
+     gcc_jit_context_new_binary_op (
+       ctxt, NULL,
+       GCC_JIT_BINARY_OP_MULT, int_type,
+       gcc_jit_param_as_rvalue (param_i),
+       gcc_jit_param_as_rvalue (param_i));
+
+which is a :c:type:`gcc_jit_rvalue *`, and the various function
+parameters: `param_i` and `param_n`, instances of
+:c:type:`gcc_jit_param *`, which is a subclass of
+:c:type:`gcc_jit_lvalue *` (and, in turn, of :c:type:`gcc_jit_rvalue *`):
+we can both read from and write to function parameters within the
+body of a function.
+
+Our new example has a couple of local variables.  We create them by
+calling :c:func:`gcc_jit_function_new_local`, supplying a type and a
+name:
+
+.. code-block:: c
+
+  /* Build locals:  */
+  gcc_jit_lvalue *i =
+    gcc_jit_function_new_local (func, NULL, the_type, "i");
+  gcc_jit_lvalue *sum =
+    gcc_jit_function_new_local (func, NULL, the_type, "sum");
+
+These are instances of :c:type:`gcc_jit_lvalue *` - they can be read from
+and written to.
+
+Note that there is no precanned way to create *and* initialize a variable
+like in C:
+
+.. code-block:: c
+
+   int i = 0;
+
+Instead, having added the local to the function, we have to separately add
+an assignment of `0` to `local_i` at the beginning of the function.
+
+Control flow
+************
+
+This function has a loop, so we need to build some basic blocks to
+handle the control flow.  In this case, we need 4 blocks:
+
+1. before the loop (initializing the locals)
+2. the conditional at the top of the loop (comparing `i < n`)
+3. the body of the loop
+4. after the loop terminates (`return sum`)
+
+so we create these as :c:type:`gcc_jit_block *` instances within the
+:c:type:`gcc_jit_function *`:
+
+.. code-block:: c
+
+  gcc_jit_block *b_initial =
+    gcc_jit_function_new_block (func, "initial");
+  gcc_jit_block *b_loop_cond =
+    gcc_jit_function_new_block (func, "loop_cond");
+  gcc_jit_block *b_loop_body =
+    gcc_jit_function_new_block (func, "loop_body");
+  gcc_jit_block *b_after_loop =
+    gcc_jit_function_new_block (func, "after_loop");
+
+We now populate each block with statements.
+
+The entry block `b_initial` consists of initializations followed by a jump
+to the conditional.  We assign `0` to `i` and to `sum`, using
+:c:func:`gcc_jit_block_add_assignment` to add
+an assignment statement, and using :c:func:`gcc_jit_context_zero` to get
+the constant value `0` for the relevant type for the right-hand side of
+the assignment:
+
+.. code-block:: c
+
+  /* sum = 0; */
+  gcc_jit_block_add_assignment (
+    b_initial, NULL,
+    sum,
+    gcc_jit_context_zero (ctxt, the_type));
+
+  /* i = 0; */
+  gcc_jit_block_add_assignment (
+    b_initial, NULL,
+    i,
+    gcc_jit_context_zero (ctxt, the_type));
+
+We can then terminate the entry block by jumping to the conditional:
+
+.. code-block:: c
+
+  gcc_jit_block_end_with_jump (b_initial, NULL, b_loop_cond);
+
+The conditional block is equivalent to the line `while (i < n)` from our
+C example. It contains a single statement: a conditional, which jumps to
+one of two destination blocks depending on a boolean
+:c:type:`gcc_jit_rvalue *`, in this case the comparison of `i` and `n`.
+We build the comparison using :c:func:`gcc_jit_context_new_comparison`:
+
+.. code-block:: c
+
+   gcc_jit_rvalue *guard =
+     gcc_jit_context_new_comparison (
+       ctxt, NULL,
+       GCC_JIT_COMPARISON_GE,
+       gcc_jit_lvalue_as_rvalue (i),
+       gcc_jit_param_as_rvalue (n));
+
+and can then use this to add `b_loop_cond`'s sole statement, via
+:c:func:`gcc_jit_block_end_with_conditional`:
+
+.. code-block:: c
+
+  gcc_jit_block_end_with_conditional (b_loop_cond, NULL, guard);
+
+Next, we populate the body of the loop.
+
+The C statement `sum += i * i;` is an assignment operation, where an
+lvalue is modified "in-place".  We use
+:c:func:`gcc_jit_block_add_assignment_op` to handle these operations:
+
+.. code-block:: c
+
+  /* sum += i * i */
+  gcc_jit_block_add_assignment_op (
+    b_loop_body, NULL,
+    sum,
+    GCC_JIT_BINARY_OP_PLUS,
+    gcc_jit_context_new_binary_op (
+      ctxt, NULL,
+      GCC_JIT_BINARY_OP_MULT, the_type,
+      gcc_jit_lvalue_as_rvalue (i),
+      gcc_jit_lvalue_as_rvalue (i)));
+
+The `i++` can be thought of as `i += 1`, and can thus be handled in
+a similar way.  We use :c:func:`gcc_jit_context_one` to get the constant
+value `1` (for the relevant type) for the right-hand side
+of the assignment.
+
+.. code-block:: c
+
+  /* i++ */
+  gcc_jit_block_add_assignment_op (
+    b_loop_body, NULL,
+    i,
+    GCC_JIT_BINARY_OP_PLUS,
+    gcc_jit_context_one (ctxt, the_type));
+
+.. note::
+
+  For numeric constants other than 0 or 1, we could use
+  :c:func:`gcc_jit_context_new_rvalue_from_int` and
+  :c:func:`gcc_jit_context_new_rvalue_from_double`.
+
+The loop body completes by jumping back to the conditional:
+
+.. code-block:: c
+
+  gcc_jit_block_end_with_jump (b_loop_body, NULL, b_loop_cond);
+
+Finally, we populate the `b_after_loop` block, reached when the loop
+conditional is false.  We want to generate the equivalent of:
+
+.. code-block:: c
+
+   return sum;
+
+so the block is just one statement:
+
+.. code-block:: c
+
+  /* return sum */
+  gcc_jit_block_end_with_return (
+    b_after_loop,
+    NULL,
+    gcc_jit_lvalue_as_rvalue (sum));
+
+.. note::
+
+   You can intermingle block creation with statement creation,
+   but given that the terminator statements generally include references
+   to other blocks, I find it's clearer to create all the blocks,
+   *then* all the statements.
+
+We've finished populating the function.  As before, we can now compile it
+to machine code:
+
+.. code-block:: c
+
+   gcc_jit_result *result;
+   result = gcc_jit_context_compile (ctxt);
+
+   typedef int (*loop_test_fn_type) (int);
+   loop_test_fn_type loop_test =
+    (loop_test_fn_type)gcc_jit_result_get_code (result, "loop_test");
+   if (!loop_test)
+     goto error;
+   printf ("result: %d", loop_test (10));
+
+.. code-block:: bash
+
+   result: 285
+
+
+Visualizing the control flow graph
+**********************************
+
+You can see the control flow graph of a function using
+:c:func:`gcc_jit_function_dump_to_dot`:
+
+.. code-block:: c
+
+  gcc_jit_function_dump_to_dot (func, "/tmp/sum-of-squares.dot");
+
+giving a .dot file in GraphViz format.
+
+You can convert this to an image using `dot`:
+
+.. code-block:: bash
+
+   $ dot -Tpng /tmp/sum-of-squares.dot -o /tmp/sum-of-squares.png
+
+or use a viewer (my preferred one is xdot.py; see
+https://github.com/jrfonseca/xdot.py; on Fedora you can
+install it with `yum install python-xdot`):
+
+    .. figure:: sum-of-squares.png
+      :alt: image of a control flow graph
+
+Full example
+************
+
+   .. literalinclude:: ../examples/tut03-sum-of-squares.c
+    :lines: 1-
+    :language: c
+
+Building and running it:
+
+.. code-block:: console
+
+  $ gcc \
+      tut03-sum-of-squares.c \
+      -o tut03-sum-of-squares \
+      -lgccjit
+
+  # Run the built program:
+  $ ./tut03-sum-of-squares
+  loop_test returned: 285
diff --git a/gcc/jit/docs/intro/tutorial04.rst b/gcc/jit/docs/intro/tutorial04.rst
new file mode 100644
index 0000000..cafdddb
--- /dev/null
+++ b/gcc/jit/docs/intro/tutorial04.rst
@@ -0,0 +1,1108 @@
+.. Copyright (C) 2014 Free Software Foundation, Inc.
+   Originally contributed by David Malcolm <dmalcolm@redhat.com>
+
+   This is free software: you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see
+   <http://www.gnu.org/licenses/>.
+
+Tutorial part 4: Adding JIT-compilation to a toy interpreter
+------------------------------------------------------------
+In this example we construct a "toy" interpreter, and add JIT-compilation
+to it.
+
+Our toy interpreter
+*******************
+
+It's a stack-based interpreter, and is intended as a (very simple) example
+of the kind of bytecode interpreter seen in dynamic languages such as
+Python, Ruby etc.
+
+For the sake of simplicity, our toy virtual machine is very limited:
+
+  * The only data type is `int`
+
+  * It can only work on one function at a time (so that the only
+    function call that can be made is to recurse).
+
+  * Functions can only take one parameter.
+
+  * Functions have a stack of `int` values.
+
+  * We'll implement function call within the interpreter by calling a
+    function in our implementation, rather than implementing our own
+    frame stack.
+
+  * The parser is only good enough to get the examples to work.
+
+Naturally, a real interpreter would be much more complicated that this.
+
+The following operations are supported:
+
+====================== ======================== =============== ==============
+Operation              Meaning                  Old Stack       New Stack
+====================== ======================== =============== ==============
+DUP                    Duplicate top of stack.  ``[..., x]``    ``[..., x, x]``
+ROT                    Swap top two elements    ``[..., x, y]`` ``[..., y, x]``
+                       of stack.
+BINARY_ADD             Add the top two elements ``[..., x, y]`` ``[..., (x+y)]``
+                       on the stack.
+BINARY_SUBTRACT        Likewise, but subtract.  ``[..., x, y]`` ``[..., (x-y)]``
+BINARY_MULT            Likewise, but multiply.  ``[..., x, y]`` ``[..., (x*y)]``
+BINARY_COMPARE_LT      Compare the top two      ``[..., x, y]`` ``[..., (x<y)]``
+                       elements on the stack
+                       and push a nonzero/zero
+                       if (x<y).
+RECURSE                Recurse, passing the top ``[..., x]``    ``[..., fn(x)]``
+                       of the stack, and
+                       popping the result.
+RETURN                 Return the top of the    ``[x]``         ``[]``
+                       stack.
+PUSH_CONST `arg`       Push an int const.       ``[...]``       ``[..., arg]``
+JUMP_ABS_IF_TRUE `arg` Pop; if top of stack was ``[..., x]``    ``[...]``
+                       nonzero, jump to
+                       ``arg``.
+====================== ======================== =============== ==============
+
+Programs can be interpreted, disassembled, and compiled to machine code.
+
+The interpreter reads ``.toy`` scripts.  Here's what a simple recursive
+factorial program looks like, the script ``factorial.toy``.
+The parser ignores lines beginning with a `#`.
+
+   .. literalinclude:: ../examples/tut04-toyvm/factorial.toy
+    :lines: 1-
+    :language: c
+
+The interpreter is a simple infinite loop with a big ``switch`` statement
+based on what the next opcode is:
+
+   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
+    :start-after: /* Execute the given function.  */
+    :end-before: /* JIT compilation.  */
+    :language: c
+
+Compiling to machine code
+*************************
+We want to generate machine code that can be cast to this type and
+then directly executed in-process:
+
+   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
+    :start-after: /* Functions are compiled to this function ptr type.  */
+    :end-before: enum opcode
+    :language: c
+
+Our compiler isn't very sophisticated; it takes the implementation of
+each opcode above, and maps it directly to the operations supported by
+the libgccjit API.
+
+How should we handle the stack?  In theory we could calculate what the
+stack depth will be at each opcode, and optimize away the stack
+manipulation "by hand".  We'll see below that libgccjit is able to do
+this for us, so we'll implement stack manipulation
+in a direct way, by creating a ``stack`` array and ``stack_depth``
+variables, local within the generated function, equivalent to this C code:
+
+.. code-block:: c
+
+  int stack_depth;
+  int stack[MAX_STACK_DEPTH];
+
+We'll also have local variables ``x`` and ``y`` for use when implementing
+the opcodes, equivalent to this:
+
+.. code-block:: c
+
+  int x;
+  int y;
+
+This means our compiler has the following state:
+
+   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
+    :start-after: /* JIT compilation.  */
+    :end-before: /* Stack manipulation.  */
+    :language: c
+
+Setting things up
+*****************
+
+First we create our types:
+
+   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
+    :start-after: /* Create types.  */
+    :end-before: /* The constant value 1.  */
+    :language: c
+
+along with extracting a useful `int` constant:
+
+   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
+    :start-after: /* The constant value 1.  */
+    :end-before: /* Create locations.  */
+    :language: c
+
+We'll implement push and pop in terms of the ``stack`` array and
+``stack_depth``.  Here are helper functions for adding statements to
+a block, implementing pushing and popping values:
+
+   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
+    :start-after: /* Stack manipulation.  */
+    :end-before: /* The main compilation hook.  */
+    :language: c
+
+We will support single-stepping through the generated code in the
+debugger, so we need to create :c:type:`gcc_jit_location` instances, one
+per operation in the source code.  These will reference the lines of
+e.g. ``factorial.toy``.
+
+   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
+    :start-after: /* Create locations.  */
+    :end-before: /* Creating the function.  */
+    :language: c
+
+Let's create the function itself.  As usual, we create its parameter
+first, then use the parameter to create the function:
+
+   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
+    :start-after: /* Creating the function.  */
+    :end-before: /* Create stack lvalues.  */
+    :language: c
+
+We create the locals within the function.
+
+   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
+    :start-after: /* Create stack lvalues.  */
+    :end-before: /* 1st pass: create blocks, one per opcode.
+    :language: c
+
+Populating the function
+***********************
+
+There's some one-time initialization, and the API treats the first block
+you create as the entrypoint of the function, so we need to create that
+block first:
+
+   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
+    :start-after: first.  */
+    :end-before: /* Create a block per operation.  */
+    :language: c
+
+We can now create blocks for each of the operations.  Most of these will
+be consolidated into larger blocks when the optimizer runs.
+
+   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
+    :start-after: /* Create a block per operation.  */
+    :end-before: /* Populate the initial block.  */
+    :language: c
+
+Now that we have a block it can jump to when it's done, we can populate
+the initial block:
+
+   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
+    :start-after: /* Populate the initial block.  */
+    :end-before: /* 2nd pass: fill in instructions.  */
+    :language: c
+
+We can now populate the blocks for the individual operations.  We loop
+through them, adding instructions to their blocks:
+
+   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
+    :start-after: /* 2nd pass: fill in instructions.  */
+    :end-before: /* Helper macros.  */
+    :language: c
+
+We're going to have another big ``switch`` statement for implementing
+the opcodes, this time for compiling them, rather than interpreting
+them.  It's helpful to have macros for implementing push and pop, so that
+we can make the ``switch`` statement that's coming up look as much as
+possible like the one above within the interpreter:
+
+.. literalinclude:: ../examples/tut04-toyvm/toyvm.c
+    :start-after: /* Helper macros.  */
+    :end-before: gcc_jit_block_add_comment
+    :language: c
+
+.. note::
+
+   A particularly clever implementation would have an *identical*
+   ``switch`` statement shared by the interpreter and the compiler, with
+   some preprocessor "magic".  We're not doing that here, for the sake
+   of simplicity.
+
+When I first implemented this compiler, I accidentally missed an edit
+when copying and pasting the ``Y_EQUALS_POP`` macro, so that popping the
+stack into ``y`` instead erroneously assigned it to ``x``, leaving ``y``
+uninitialized.
+
+To track this kind of thing down, we can use
+:c:func:`gcc_jit_block_add_comment` to add descriptive comments
+to the internal representation.  This is invaluable when looking through
+the generated IR for, say ``factorial``:
+
+   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
+    :start-after: PUSH_RVALUE (gcc_jit_lvalue_as_rvalue (state.y))
+    :end-before: /* Handle the individual opcodes.  */
+    :language: c
+
+We can now write the big ``switch`` statement that implements the
+individual opcodes, populating the relevant block with statements:
+
+   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
+    :start-after: /* Handle the individual opcodes.  */
+    :end-before: /* Go to the next block.  */
+    :language: c
+
+Every block must be terminated, via a call to one of the
+``gcc_jit_block_end_with_`` entrypoints.  This has been done for two
+of the opcodes, but we need to do it for the other ones, by jumping
+to the next block.
+
+   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
+    :start-after: /* Go to the next block.  */
+    :end-before: /* end of loop on PC locations.  */
+    :language: c
+
+This is analogous to simply incrementing the program counter.
+
+Verifying the control flow graph
+********************************
+Having finished looping over the blocks, the context is complete.
+
+As before, we can verify that the control flow and statements are sane by
+using :c:func:`gcc_jit_function_dump_to_dot`:
+
+.. code-block:: c
+
+  gcc_jit_function_dump_to_dot (state.fn, "/tmp/factorial.dot");
+
+and viewing the result.  Note how the label names, comments, and
+variable names show up in the dump, to make it easier to spot
+errors in our compiler.
+
+  .. figure:: factorial.png
+    :alt: image of a control flow graph
+
+Compiling the context
+*********************
+Having finished looping over the blocks and populating them with
+statements, the context is complete.
+
+We can now compile it, and extract machine code from the result:
+
+   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
+    :start-after: /* We've now finished populating the context.  Compile it.  */
+    :end-before: /* (this leaks "result" and "funcname") */
+    :language: c
+
+We can now run the result:
+
+   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
+    :start-after: /* JIT-compilation.  */
+    :end-before: return 0;
+    :language: c
+
+Single-stepping through the generated code
+******************************************
+
+It's possible to debug the generated code.  To do this we need to both:
+
+  * Set up source code locations for our statements, so that we can
+    meaningfully step through the code.  We did this above by
+    calling :c:func:`gcc_jit_context_new_location` and using the
+    results.
+
+  * Enable the generation of debugging information, by setting
+    :c:macro:`GCC_JIT_BOOL_OPTION_DEBUGINFO` on the
+    :c:type:`gcc_jit_context` via
+    :c:func:`gcc_jit_context_set_bool_option`:
+
+    .. code-block:: c
+
+     gcc_jit_context_set_bool_option (
+       ctxt,
+       GCC_JIT_BOOL_OPTION_DEBUGINFO,
+       1);
+
+Having done this, we can put a breakpoint on the generated function:
+
+.. code-block:: console
+
+  $ gdb --args ./toyvm factorial.toy 10
+  (gdb) break factorial
+  Function "factorial" not defined.
+  Make breakpoint pending on future shared library load? (y or [n]) y
+  Breakpoint 1 (factorial) pending.
+  (gdb) run
+  Breakpoint 1, factorial (arg=10) at factorial.toy:14
+  14	DUP
+
+We've set up location information, which references ``factorial.toy``.
+This allows us to use e.g. ``list`` to see where we are in the script:
+
+.. code-block:: console
+
+  (gdb) list
+  9
+  10    # Initial state:
+  11    # stack: [arg]
+  12
+  13    # 0:
+  14    DUP
+  15    # stack: [arg, arg]
+  16
+  17    # 1:
+  18    PUSH_CONST 2
+
+and to step through the function, examining the data:
+
+.. code-block:: console
+
+  (gdb) n
+  18    PUSH_CONST 2
+  (gdb) n
+  22    BINARY_COMPARE_LT
+  (gdb) print stack
+  $5 = {10, 10, 2, 0, -7152, 32767, 0, 0}
+  (gdb) print stack_depth
+  $6 = 3
+
+You'll see that the parts of the ``stack`` array that haven't been
+touched yet are uninitialized.
+
+.. note::
+
+   Turning on optimizations may lead to unpredictable results when
+   stepping through the generated code: the execution may appear to
+   "jump around" the source code.  This is analogous to turning up the
+   optimization level in a regular compiler.
+
+Examining the generated code
+****************************
+
+How good is the optimized code?
+
+We can turn up optimizations, by calling
+:c:func:`gcc_jit_context_set_int_option` with
+:c:macro:`GCC_JIT_INT_OPTION_OPTIMIZATION_LEVEL`:
+
+.. code-block:: c
+
+  gcc_jit_context_set_int_option (
+    ctxt,
+    GCC_JIT_INT_OPTION_OPTIMIZATION_LEVEL,
+    3);
+
+One of GCC's internal representations is called "gimple".  A dump of the
+initial gimple representation of the code can be seen by setting:
+
+.. code-block:: c
+
+  gcc_jit_context_set_bool_option (ctxt,
+                                   GCC_JIT_BOOL_OPTION_DUMP_INITIAL_GIMPLE,
+                                   1);
+
+With optimization on and source locations displayed, this gives:
+
+.. We'll use "c" for gimple dumps
+
+.. code-block:: c
+
+  factorial (signed int arg)
+  {
+    <unnamed type> D.80;
+    signed int D.81;
+    signed int D.82;
+    signed int D.83;
+    signed int D.84;
+    signed int D.85;
+    signed int y;
+    signed int x;
+    signed int stack_depth;
+    signed int stack[8];
+
+    try
+      {
+        initial:
+        stack_depth = 0;
+        stack[stack_depth] = arg;
+        stack_depth = stack_depth + 1;
+        goto instr0;
+        instr0:
+        /* DUP */:
+        stack_depth = stack_depth + -1;
+        x = stack[stack_depth];
+        stack[stack_depth] = x;
+        stack_depth = stack_depth + 1;
+        stack[stack_depth] = x;
+        stack_depth = stack_depth + 1;
+        goto instr1;
+        instr1:
+        /* PUSH_CONST */:
+        stack[stack_depth] = 2;
+        stack_depth = stack_depth + 1;
+        goto instr2;
+
+        /* etc */
+
+You can see the generated machine code in assembly form via:
+
+.. code-block:: c
+
+  gcc_jit_context_set_bool_option (
+    ctxt,
+    GCC_JIT_BOOL_OPTION_DUMP_GENERATED_CODE,
+    1);
+  result = gcc_jit_context_compile (ctxt);
+
+which shows that (on this x86_64 box) the compiler has unrolled the loop
+and is using MMX instructions to perform several multiplications
+simultaneously:
+
+.. code-block:: gas
+
+          .file   "fake.c"
+          .text
+  .Ltext0:
+          .p2align 4,,15
+          .globl  factorial
+          .type   factorial, @function
+  factorial:
+  .LFB0:
+          .file 1 "factorial.toy"
+          .loc 1 14 0
+          .cfi_startproc
+  .LVL0:
+  .L2:
+          .loc 1 26 0
+          cmpl    $1, %edi
+          jle     .L13
+          leal    -1(%rdi), %edx
+          movl    %edx, %ecx
+          shrl    $2, %ecx
+          leal    0(,%rcx,4), %esi
+          testl   %esi, %esi
+          je      .L14
+          cmpl    $9, %edx
+          jbe     .L14
+          leal    -2(%rdi), %eax
+          movl    %eax, -16(%rsp)
+          leal    -3(%rdi), %eax
+          movd    -16(%rsp), %xmm0
+          movl    %edi, -16(%rsp)
+          movl    %eax, -12(%rsp)
+          movd    -16(%rsp), %xmm1
+          xorl    %eax, %eax
+          movl    %edx, -16(%rsp)
+          movd    -12(%rsp), %xmm4
+          movd    -16(%rsp), %xmm6
+          punpckldq       %xmm4, %xmm0
+          movdqa  .LC1(%rip), %xmm4
+          punpckldq       %xmm6, %xmm1
+          punpcklqdq      %xmm0, %xmm1
+          movdqa  .LC0(%rip), %xmm0
+          jmp     .L5
+          # etc - edited for brevity
+
+This is clearly overkill for a function that will likely overflow the
+``int`` type before the vectorization is worthwhile - but then again, this
+is a toy example.
+
+Turning down the optimization level to 2:
+
+.. code-block:: c
+
+  gcc_jit_context_set_int_option (
+    ctxt,
+    GCC_JIT_INT_OPTION_OPTIMIZATION_LEVEL,
+    3);
+
+yields this code, which is simple enough to quote in its entirety:
+
+.. code-block:: gas
+
+          .file   "fake.c"
+          .text
+          .p2align 4,,15
+          .globl  factorial
+          .type   factorial, @function
+  factorial:
+  .LFB0:
+          .cfi_startproc
+  .L2:
+          cmpl    $1, %edi
+          jle     .L8
+          movl    $1, %edx
+          jmp     .L4
+          .p2align 4,,10
+          .p2align 3
+  .L6:
+          movl    %eax, %edi
+  .L4:
+  .L5:
+          leal    -1(%rdi), %eax
+          imull   %edi, %edx
+          cmpl    $1, %eax
+          jne     .L6
+  .L3:
+  .L7:
+          imull   %edx, %eax
+          ret
+  .L8:
+          movl    %edi, %eax
+          movl    $1, %edx
+          jmp     .L7
+          .cfi_endproc
+  .LFE0:
+          .size   factorial, .-factorial
+          .ident  "GCC: (GNU) 4.9.0 20131023 (Red Hat 0.2-%{gcc_release})"
+          .section        .note.GNU-stack,"",@progbits
+
+Note that the stack pushing and popping have been eliminated, as has the
+recursive call (in favor of an iteration).
+
+Putting it all together
+***********************
+
+The complete example can be seen in the source tree at
+``gcc/jit/docs/examples/tut04-toyvm/toyvm.c``
+
+along with a Makefile and a couple of sample .toy scripts:
+
+.. code-block:: console
+
+  $ ls -al
+  drwxrwxr-x. 2 david david   4096 Sep 19 17:46 .
+  drwxrwxr-x. 3 david david   4096 Sep 19 15:26 ..
+  -rw-rw-r--. 1 david david    615 Sep 19 12:43 factorial.toy
+  -rw-rw-r--. 1 david david    834 Sep 19 13:08 fibonacci.toy
+  -rw-rw-r--. 1 david david    238 Sep 19 14:22 Makefile
+  -rw-rw-r--. 1 david david  16457 Sep 19 17:07 toyvm.c
+
+  $ make toyvm
+  g++ -Wall -g -o toyvm toyvm.c -lgccjit
+
+  $ ./toyvm factorial.toy 10
+  interpreter result: 3628800
+  compiler result: 3628800
+
+  $ ./toyvm fibonacci.toy 10
+  interpreter result: 55
+  compiler result: 55
+
+Behind the curtain: How does our code get optimized?
+****************************************************
+
+Our example is done, but you may be wondering about exactly how the
+compiler turned what we gave it into the machine code seen above.
+
+We can examine what the compiler is doing in detail by setting:
+
+.. code-block:: c
+
+  gcc_jit_context_set_bool_option (state.ctxt,
+                                   GCC_JIT_BOOL_OPTION_DUMP_EVERYTHING,
+                                   1);
+  gcc_jit_context_set_bool_option (state.ctxt,
+                                   GCC_JIT_BOOL_OPTION_KEEP_INTERMEDIATES,
+                                   1);
+
+This will dump detailed information about the compiler's state to a
+directory under ``/tmp``, and keep it from being cleaned up.
+
+The precise names and their formats of these files is subject to change.
+Higher optimization levels lead to more files.
+Here's what I saw (edited for brevity; there were almost 200 files):
+
+.. code-block:: console
+
+  intermediate files written to /tmp/libgccjit-KPQbGw
+  $ ls /tmp/libgccjit-KPQbGw/
+  fake.c.000i.cgraph
+  fake.c.000i.type-inheritance
+  fake.c.004t.gimple
+  fake.c.007t.omplower
+  fake.c.008t.lower
+  fake.c.011t.eh
+  fake.c.012t.cfg
+  fake.c.014i.visibility
+  fake.c.015i.early_local_cleanups
+  fake.c.016t.ssa
+  # etc
+
+The gimple code is converted into Static Single Assignment form,
+with annotations for use when generating the debuginfo:
+
+.. code-block:: console
+
+  $ less /tmp/libgccjit-KPQbGw/fake.c.016t.ssa
+
+.. code-block:: c
+
+  ;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0)
+
+  factorial (signed int arg)
+  {
+    signed int stack[8];
+    signed int stack_depth;
+    signed int x;
+    signed int y;
+    <unnamed type> _20;
+    signed int _21;
+    signed int _38;
+    signed int _44;
+    signed int _51;
+    signed int _56;
+
+  initial:
+    stack_depth_3 = 0;
+    # DEBUG stack_depth => stack_depth_3
+    stack[stack_depth_3] = arg_5(D);
+    stack_depth_7 = stack_depth_3 + 1;
+    # DEBUG stack_depth => stack_depth_7
+    # DEBUG instr0 => NULL
+    # DEBUG /* DUP */ => NULL
+    stack_depth_8 = stack_depth_7 + -1;
+    # DEBUG stack_depth => stack_depth_8
+    x_9 = stack[stack_depth_8];
+    # DEBUG x => x_9
+    stack[stack_depth_8] = x_9;
+    stack_depth_11 = stack_depth_8 + 1;
+    # DEBUG stack_depth => stack_depth_11
+    stack[stack_depth_11] = x_9;
+    stack_depth_13 = stack_depth_11 + 1;
+    # DEBUG stack_depth => stack_depth_13
+    # DEBUG instr1 => NULL
+    # DEBUG /* PUSH_CONST */ => NULL
+    stack[stack_depth_13] = 2;
+
+    /* etc; edited for brevity */
+
+We can perhaps better see the code by turning off
+:c:macro:`GCC_JIT_BOOL_OPTION_DEBUGINFO` to suppress all those ``DEBUG``
+statements, giving:
+
+.. code-block:: console
+
+  $ less /tmp/libgccjit-1Hywc0/fake.c.016t.ssa
+
+.. code-block:: c
+
+  ;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0)
+
+  factorial (signed int arg)
+  {
+    signed int stack[8];
+    signed int stack_depth;
+    signed int x;
+    signed int y;
+    <unnamed type> _20;
+    signed int _21;
+    signed int _38;
+    signed int _44;
+    signed int _51;
+    signed int _56;
+
+  initial:
+    stack_depth_3 = 0;
+    stack[stack_depth_3] = arg_5(D);
+    stack_depth_7 = stack_depth_3 + 1;
+    stack_depth_8 = stack_depth_7 + -1;
+    x_9 = stack[stack_depth_8];
+    stack[stack_depth_8] = x_9;
+    stack_depth_11 = stack_depth_8 + 1;
+    stack[stack_depth_11] = x_9;
+    stack_depth_13 = stack_depth_11 + 1;
+    stack[stack_depth_13] = 2;
+    stack_depth_15 = stack_depth_13 + 1;
+    stack_depth_16 = stack_depth_15 + -1;
+    y_17 = stack[stack_depth_16];
+    stack_depth_18 = stack_depth_16 + -1;
+    x_19 = stack[stack_depth_18];
+    _20 = x_19 < y_17;
+    _21 = (signed int) _20;
+    stack[stack_depth_18] = _21;
+    stack_depth_23 = stack_depth_18 + 1;
+    stack_depth_24 = stack_depth_23 + -1;
+    x_25 = stack[stack_depth_24];
+    if (x_25 != 0)
+      goto <bb 4> (instr9);
+    else
+      goto <bb 3> (instr4);
+
+  instr4:
+  /* DUP */:
+    stack_depth_26 = stack_depth_24 + -1;
+    x_27 = stack[stack_depth_26];
+    stack[stack_depth_26] = x_27;
+    stack_depth_29 = stack_depth_26 + 1;
+    stack[stack_depth_29] = x_27;
+    stack_depth_31 = stack_depth_29 + 1;
+    stack[stack_depth_31] = 1;
+    stack_depth_33 = stack_depth_31 + 1;
+    stack_depth_34 = stack_depth_33 + -1;
+    y_35 = stack[stack_depth_34];
+    stack_depth_36 = stack_depth_34 + -1;
+    x_37 = stack[stack_depth_36];
+    _38 = x_37 - y_35;
+    stack[stack_depth_36] = _38;
+    stack_depth_40 = stack_depth_36 + 1;
+    stack_depth_41 = stack_depth_40 + -1;
+    x_42 = stack[stack_depth_41];
+    _44 = factorial (x_42);
+    stack[stack_depth_41] = _44;
+    stack_depth_46 = stack_depth_41 + 1;
+    stack_depth_47 = stack_depth_46 + -1;
+    y_48 = stack[stack_depth_47];
+    stack_depth_49 = stack_depth_47 + -1;
+    x_50 = stack[stack_depth_49];
+    _51 = x_50 * y_48;
+    stack[stack_depth_49] = _51;
+    stack_depth_53 = stack_depth_49 + 1;
+
+    # stack_depth_1 = PHI <stack_depth_24(2), stack_depth_53(3)>
+  instr9:
+  /* RETURN */:
+    stack_depth_54 = stack_depth_1 + -1;
+    x_55 = stack[stack_depth_54];
+    _56 = x_55;
+    stack ={v} {CLOBBER};
+    return _56;
+
+  }
+
+Note in the above how all the :c:type:`gcc_jit_block` instances we
+created have been consolidated into just 3 blocks in GCC's internal
+representation: ``initial``, ``instr4`` and ``instr9``.
+
+Optimizing away stack manipulation
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Recall our simple implementation of stack operations.  Let's examine
+how the stack operations are optimized away.
+
+After a pass of constant-propagation, the depth of the stack at each
+opcode can be determined at compile-time:
+
+.. code-block:: console
+
+  $ less /tmp/libgccjit-1Hywc0/fake.c.021t.ccp1
+
+.. code-block:: c
+
+  ;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0)
+
+  factorial (signed int arg)
+  {
+    signed int stack[8];
+    signed int stack_depth;
+    signed int x;
+    signed int y;
+    <unnamed type> _20;
+    signed int _21;
+    signed int _38;
+    signed int _44;
+    signed int _51;
+
+  initial:
+    stack[0] = arg_5(D);
+    x_9 = stack[0];
+    stack[0] = x_9;
+    stack[1] = x_9;
+    stack[2] = 2;
+    y_17 = stack[2];
+    x_19 = stack[1];
+    _20 = x_19 < y_17;
+    _21 = (signed int) _20;
+    stack[1] = _21;
+    x_25 = stack[1];
+    if (x_25 != 0)
+      goto <bb 4> (instr9);
+    else
+      goto <bb 3> (instr4);
+
+  instr4:
+  /* DUP */:
+    x_27 = stack[0];
+    stack[0] = x_27;
+    stack[1] = x_27;
+    stack[2] = 1;
+    y_35 = stack[2];
+    x_37 = stack[1];
+    _38 = x_37 - y_35;
+    stack[1] = _38;
+    x_42 = stack[1];
+    _44 = factorial (x_42);
+    stack[1] = _44;
+    y_48 = stack[1];
+    x_50 = stack[0];
+    _51 = x_50 * y_48;
+    stack[0] = _51;
+
+  instr9:
+  /* RETURN */:
+    x_55 = stack[0];
+    x_56 = x_55;
+    stack ={v} {CLOBBER};
+    return x_56;
+
+  }
+
+Note how, in the above, all those ``stack_depth`` values are now just
+constants: we're accessing specific stack locations at each opcode.
+
+The "esra" pass ("Early Scalar Replacement of Aggregates") breaks
+out our "stack" array into individual elements:
+
+.. code-block:: console
+
+  $ less /tmp/libgccjit-1Hywc0/fake.c.024t.esra
+
+.. code-block:: c
+
+  ;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0)
+
+  Created a replacement for stack offset: 0, size: 32: stack$0
+  Created a replacement for stack offset: 32, size: 32: stack$1
+  Created a replacement for stack offset: 64, size: 32: stack$2
+
+  Symbols to be put in SSA form
+  { D.89 D.90 D.91 }
+  Incremental SSA update started at block: 0
+  Number of blocks in CFG: 5
+  Number of blocks to update: 4 ( 80%)
+
+
+  factorial (signed int arg)
+  {
+    signed int stack$2;
+    signed int stack$1;
+    signed int stack$0;
+    signed int stack[8];
+    signed int stack_depth;
+    signed int x;
+    signed int y;
+    <unnamed type> _20;
+    signed int _21;
+    signed int _38;
+    signed int _44;
+    signed int _51;
+
+  initial:
+    stack$0_45 = arg_5(D);
+    x_9 = stack$0_45;
+    stack$0_39 = x_9;
+    stack$1_32 = x_9;
+    stack$2_30 = 2;
+    y_17 = stack$2_30;
+    x_19 = stack$1_32;
+    _20 = x_19 < y_17;
+    _21 = (signed int) _20;
+    stack$1_28 = _21;
+    x_25 = stack$1_28;
+    if (x_25 != 0)
+      goto <bb 4> (instr9);
+    else
+      goto <bb 3> (instr4);
+
+  instr4:
+  /* DUP */:
+    x_27 = stack$0_39;
+    stack$0_22 = x_27;
+    stack$1_14 = x_27;
+    stack$2_12 = 1;
+    y_35 = stack$2_12;
+    x_37 = stack$1_14;
+    _38 = x_37 - y_35;
+    stack$1_10 = _38;
+    x_42 = stack$1_10;
+    _44 = factorial (x_42);
+    stack$1_6 = _44;
+    y_48 = stack$1_6;
+    x_50 = stack$0_22;
+    _51 = x_50 * y_48;
+    stack$0_1 = _51;
+
+    # stack$0_52 = PHI <stack$0_39(2), stack$0_1(3)>
+  instr9:
+  /* RETURN */:
+    x_55 = stack$0_52;
+    x_56 = x_55;
+    stack ={v} {CLOBBER};
+    return x_56;
+
+  }
+
+Hence at this point, all those pushes and pops of the stack are now
+simply assignments to specific temporary variables.
+
+After some copy propagation, the stack manipulation has been completely
+optimized away:
+
+.. code-block:: console
+
+  $ less /tmp/libgccjit-1Hywc0/fake.c.026t.copyprop1
+
+.. code-block:: c
+
+  ;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0)
+
+  factorial (signed int arg)
+  {
+    signed int stack$2;
+    signed int stack$1;
+    signed int stack$0;
+    signed int stack[8];
+    signed int stack_depth;
+    signed int x;
+    signed int y;
+    <unnamed type> _20;
+    signed int _21;
+    signed int _38;
+    signed int _44;
+    signed int _51;
+
+  initial:
+    stack$0_39 = arg_5(D);
+    _20 = arg_5(D) <= 1;
+    _21 = (signed int) _20;
+    if (_21 != 0)
+      goto <bb 4> (instr9);
+    else
+      goto <bb 3> (instr4);
+
+  instr4:
+  /* DUP */:
+    _38 = arg_5(D) + -1;
+    _44 = factorial (_38);
+    _51 = arg_5(D) * _44;
+    stack$0_1 = _51;
+
+    # stack$0_52 = PHI <arg_5(D)(2), _51(3)>
+  instr9:
+  /* RETURN */:
+    stack ={v} {CLOBBER};
+    return stack$0_52;
+
+  }
+
+Later on, another pass finally eliminated ``stack_depth`` local and the
+unused parts of the `stack`` array altogether:
+
+.. code-block:: console
+
+  $ less /tmp/libgccjit-1Hywc0/fake.c.036t.release_ssa
+
+.. code-block:: c
+
+  ;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0)
+
+  Released 44 names, 314.29%, removed 44 holes
+  factorial (signed int arg)
+  {
+    signed int stack$0;
+    signed int mult_acc_1;
+    <unnamed type> _5;
+    signed int _6;
+    signed int _7;
+    signed int mul_tmp_10;
+    signed int mult_acc_11;
+    signed int mult_acc_13;
+
+    # arg_9 = PHI <arg_8(D)(0)>
+    # mult_acc_13 = PHI <1(0)>
+  initial:
+
+    <bb 5>:
+    # arg_4 = PHI <arg_9(2), _7(3)>
+    # mult_acc_1 = PHI <mult_acc_13(2), mult_acc_11(3)>
+    _5 = arg_4 <= 1;
+    _6 = (signed int) _5;
+    if (_6 != 0)
+      goto <bb 4> (instr9);
+    else
+      goto <bb 3> (instr4);
+
+  instr4:
+  /* DUP */:
+    _7 = arg_4 + -1;
+    mult_acc_11 = mult_acc_1 * arg_4;
+    goto <bb 5>;
+
+    # stack$0_12 = PHI <arg_4(5)>
+  instr9:
+  /* RETURN */:
+    mul_tmp_10 = mult_acc_1 * stack$0_12;
+    return mul_tmp_10;
+
+  }
+
+
+Elimination of tail recursion
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Another significant optimization is the detection that the call to
+``factorial`` is tail recursion, which can be eliminated in favor of
+an iteration:
+
+.. code-block:: console
+
+  $ less /tmp/libgccjit-1Hywc0/fake.c.030t.tailr1
+
+.. code-block:: c
+
+  ;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0)
+
+
+  Symbols to be put in SSA form
+  { D.88 }
+  Incremental SSA update started at block: 0
+  Number of blocks in CFG: 5
+  Number of blocks to update: 4 ( 80%)
+
+
+  factorial (signed int arg)
+  {
+    signed int stack$2;
+    signed int stack$1;
+    signed int stack$0;
+    signed int stack[8];
+    signed int stack_depth;
+    signed int x;
+    signed int y;
+    signed int mult_acc_1;
+    <unnamed type> _20;
+    signed int _21;
+    signed int _38;
+    signed int mul_tmp_44;
+    signed int mult_acc_51;
+
+    # arg_5 = PHI <arg_39(D)(0), _38(3)>
+    # mult_acc_1 = PHI <1(0), mult_acc_51(3)>
+  initial:
+    _20 = arg_5 <= 1;
+    _21 = (signed int) _20;
+    if (_21 != 0)
+      goto <bb 4> (instr9);
+    else
+      goto <bb 3> (instr4);
+
+  instr4:
+  /* DUP */:
+    _38 = arg_5 + -1;
+    mult_acc_51 = mult_acc_1 * arg_5;
+    goto <bb 2> (initial);
+
+    # stack$0_52 = PHI <arg_5(2)>
+  instr9:
+  /* RETURN */:
+    stack ={v} {CLOBBER};
+    mul_tmp_44 = mult_acc_1 * stack$0_52;
+    return mul_tmp_44;
+
+  }
-- 
1.8.5.3


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]