Loops in GCC preprocessor needed

Nulik Nol nuliknol@gmail.com
Tue Jan 26 18:34:00 GMT 2016


Hi,
I need looping functionality in GCC's preprocessor. Consider the
following example. It is very primitive (the loops I actually need are
more complex), but it is enough to explain the problem:

I write code like this, then I preprocess it with PHP, and after that I compile:
====================================================================================
/*
 * find_new_line - template source, not plain C.
 *
 * Everything between a "//LOOP_THREADS_START:016" marker and the following
 * "//LOOP_THREADS_END" marker is meant to be emitted 16 times by an external
 * unrolling script, with _ITERATOR_ replaced by 0, 1, ..., 15.
 *
 * For each slot the unrolled code searches one __m128i (16 bytes) of the
 * job's buffer for '\n' with _mm_cmpestri, treats a result of 16 as
 * "not found", and updates the delimiter bookkeeping in delims[].
 *
 * jobs     - per-slot job descriptors; buf_idx, length and offset are read.
 * num_jobs - not used by this function.
 *
 * bufs and delims are not declared here; they come from outside the function.
 *
 * NOTE(review): with _ITERATOR_ == 0 the index MAX_THREADS-_ITERATOR_ is
 * MAX_THREADS, one past the end of the local arrays declared with
 * MAX_THREADS elements.
 * NOTE(review): newline holds a single byte, but *needle loads a whole
 * __m128i from it.
 * NOTE(review): num_delims is loaded from delims[jobs[...].buf_idx] but
 * stored back to delims[MAX_THREADS-_ITERATOR_] - confirm this is intended.
 */
void find_new_line(nl_parse_job_t *jobs,uint num_jobs) {
    __m128i *needle;
    char newline[1]  __attribute__ ((aligned (16))) ={'\n'};
    __m128i *data_ptrs[MAX_THREADS];
    uint nl_positions[MAX_THREADS];
    uint search_results[MAX_THREADS];
    uint add_delims[MAX_THREADS];
    uint num_delims[MAX_THREADS];
    uint new_positions[MAX_THREADS];

    needle=(__m128i*) newline;

//LOOP_THREADS_START:016
    data_ptrs[MAX_THREADS-_ITERATOR_]=(__m128i*) bufs[jobs[MAX_THREADS-_ITERATOR_].buf_idx].data;
//LOOP_THREADS_END
//LOOP_THREADS_START:016
    num_delims[MAX_THREADS-_ITERATOR_]=delims[jobs[MAX_THREADS-_ITERATOR_].buf_idx].num_delims;
        // read num_delims from memory
//LOOP_THREADS_END
//LOOP_THREADS_START:016
    nl_positions[MAX_THREADS-_ITERATOR_]=_mm_cmpestri(*needle, 1,
        *data_ptrs[MAX_THREADS-_ITERATOR_], jobs[MAX_THREADS-_ITERATOR_].length,
        _SIDD_UBYTE_OPS |_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT);
//LOOP_THREADS_END
//LOOP_THREADS_START:016
    search_results[MAX_THREADS-_ITERATOR_]=(nl_positions[MAX_THREADS-_ITERATOR_]==16) ?  0 : 1;
        // determine found or not
//LOOP_THREADS_END
//LOOP_THREADS_START:016
    new_positions[MAX_THREADS-_ITERATOR_]=(!search_results[MAX_THREADS-_ITERATOR_]) ? 0 : nl_positions[MAX_THREADS-_ITERATOR_];
        // convert SSE4.2 result to index of '\n'
//LOOP_THREADS_END
//LOOP_THREADS_START:016
    num_delims[MAX_THREADS-_ITERATOR_]=num_delims[MAX_THREADS-_ITERATOR_]+search_results[MAX_THREADS-_ITERATOR_];
        // num_delims ++
//LOOP_THREADS_END
//LOOP_THREADS_START:016
    add_delims[MAX_THREADS-_ITERATOR_]=jobs[MAX_THREADS-_ITERATOR_].offset+new_positions[MAX_THREADS-_ITERATOR_];
        // calculate absolute offset to '\n'
//LOOP_THREADS_END
//LOOP_THREADS_START:016
    delims[jobs[MAX_THREADS-_ITERATOR_].buf_idx].positions[add_delims[MAX_THREADS-_ITERATOR_]]=add_delims[MAX_THREADS-_ITERATOR_];
        // write new delimiter (i.e. the '\n') position
//LOOP_THREADS_END
//LOOP_THREADS_START:016
    delims[MAX_THREADS-_ITERATOR_].num_delims=num_delims[MAX_THREADS-_ITERATOR_];
        // store the counter of delimiters to global array
//LOOP_THREADS_END
}
=======================================================================
Then I run a php script to unroll the loops, and I get this output:
=======================================================================
void find_new_line(nl_parse_job_t *jobs,uint num_jobs) {
    __m128i *needle;
    char newline[1]  __attribute__ ((aligned (16))) ={'\n'};
    __m128i *data_ptrs[MAX_THREADS];
    uint nl_positions[MAX_THREADS];
    uint search_results[MAX_THREADS];
    uint add_delims[MAX_THREADS];
    uint num_delims[MAX_THREADS];
    uint new_positions[MAX_THREADS];

    needle=(__m128i*) newline;

    data_ptrs[MAX_THREADS-0]=(__m128i*) bufs[jobs[MAX_THREADS-0].buf_idx].data;
    data_ptrs[MAX_THREADS-1]=(__m128i*) bufs[jobs[MAX_THREADS-1].buf_idx].data;
    data_ptrs[MAX_THREADS-2]=(__m128i*) bufs[jobs[MAX_THREADS-2].buf_idx].data;
    data_ptrs[MAX_THREADS-3]=(__m128i*) bufs[jobs[MAX_THREADS-3].buf_idx].data;
    data_ptrs[MAX_THREADS-4]=(__m128i*) bufs[jobs[MAX_THREADS-4].buf_idx].data;
    data_ptrs[MAX_THREADS-5]=(__m128i*) bufs[jobs[MAX_THREADS-5].buf_idx].data;
    data_ptrs[MAX_THREADS-6]=(__m128i*) bufs[jobs[MAX_THREADS-6].buf_idx].data;
    data_ptrs[MAX_THREADS-7]=(__m128i*) bufs[jobs[MAX_THREADS-7].buf_idx].data;
    data_ptrs[MAX_THREADS-8]=(__m128i*) bufs[jobs[MAX_THREADS-8].buf_idx].data;
    data_ptrs[MAX_THREADS-9]=(__m128i*) bufs[jobs[MAX_THREADS-9].buf_idx].data;
    data_ptrs[MAX_THREADS-10]=(__m128i*)
bufs[jobs[MAX_THREADS-10].buf_idx].data;
    data_ptrs[MAX_THREADS-11]=(__m128i*)
bufs[jobs[MAX_THREADS-11].buf_idx].data;
    data_ptrs[MAX_THREADS-12]=(__m128i*)
bufs[jobs[MAX_THREADS-12].buf_idx].data;
    data_ptrs[MAX_THREADS-13]=(__m128i*)
bufs[jobs[MAX_THREADS-13].buf_idx].data;
    data_ptrs[MAX_THREADS-14]=(__m128i*)
bufs[jobs[MAX_THREADS-14].buf_idx].data;
    data_ptrs[MAX_THREADS-15]=(__m128i*)
bufs[jobs[MAX_THREADS-15].buf_idx].data;

    num_delims[MAX_THREADS-0]=delims[jobs[MAX_THREADS-0].buf_idx].num_delims;
                                  // read num_delims from memory
    num_delims[MAX_THREADS-1]=delims[jobs[MAX_THREADS-1].buf_idx].num_delims;
                                  // read num_delims from memory
    num_delims[MAX_THREADS-2]=delims[jobs[MAX_THREADS-2].buf_idx].num_delims;
                                  // read num_delims from memory
    num_delims[MAX_THREADS-3]=delims[jobs[MAX_THREADS-3].buf_idx].num_delims;
                                  // read num_delims from memory
    num_delims[MAX_THREADS-4]=delims[jobs[MAX_THREADS-4].buf_idx].num_delims;
                                  // read num_delims from memory
    num_delims[MAX_THREADS-5]=delims[jobs[MAX_THREADS-5].buf_idx].num_delims;
                                  // read num_delims from memory
    num_delims[MAX_THREADS-6]=delims[jobs[MAX_THREADS-6].buf_idx].num_delims;
                                  // read num_delims from memory
    num_delims[MAX_THREADS-7]=delims[jobs[MAX_THREADS-7].buf_idx].num_delims;
                                  // read num_delims from memory
    num_delims[MAX_THREADS-8]=delims[jobs[MAX_THREADS-8].buf_idx].num_delims;
                                  // read num_delims from memory
    num_delims[MAX_THREADS-9]=delims[jobs[MAX_THREADS-9].buf_idx].num_delims;
                                  // read num_delims from memory
    num_delims[MAX_THREADS-10]=delims[jobs[MAX_THREADS-10].buf_idx].num_delims;
                                // read num_delims from memory
    num_delims[MAX_THREADS-11]=delims[jobs[MAX_THREADS-11].buf_idx].num_delims;
                                // read num_delims from memory
    num_delims[MAX_THREADS-12]=delims[jobs[MAX_THREADS-12].buf_idx].num_delims;
                                // read num_delims from memory
    num_delims[MAX_THREADS-13]=delims[jobs[MAX_THREADS-13].buf_idx].num_delims;
                                // read num_delims from memory
    num_delims[MAX_THREADS-14]=delims[jobs[MAX_THREADS-14].buf_idx].num_delims;
                                // read num_delims from memory
    num_delims[MAX_THREADS-15]=delims[jobs[MAX_THREADS-15].buf_idx].num_delims;
                                // read num_delims from memory

    nl_positions[MAX_THREADS-0]=_mm_cmpestri(*needle, 1,
*data_ptrs[MAX_THREADS-0], jobs[MAX_THREADS-0].length, _SIDD_UBYTE_OPS
|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT);
    nl_positions[MAX_THREADS-1]=_mm_cmpestri(*needle, 1,
*data_ptrs[MAX_THREADS-1], jobs[MAX_THREADS-1].length, _SIDD_UBYTE_OPS
|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT);
    nl_positions[MAX_THREADS-2]=_mm_cmpestri(*needle, 1,
*data_ptrs[MAX_THREADS-2], jobs[MAX_THREADS-2].length, _SIDD_UBYTE_OPS
|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT);
    nl_positions[MAX_THREADS-3]=_mm_cmpestri(*needle, 1,
*data_ptrs[MAX_THREADS-3], jobs[MAX_THREADS-3].length, _SIDD_UBYTE_OPS
|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT);
    nl_positions[MAX_THREADS-4]=_mm_cmpestri(*needle, 1,
*data_ptrs[MAX_THREADS-4], jobs[MAX_THREADS-4].length, _SIDD_UBYTE_OPS
|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT);
    nl_positions[MAX_THREADS-5]=_mm_cmpestri(*needle, 1,
*data_ptrs[MAX_THREADS-5], jobs[MAX_THREADS-5].length, _SIDD_UBYTE_OPS
|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT);
    nl_positions[MAX_THREADS-6]=_mm_cmpestri(*needle, 1,
*data_ptrs[MAX_THREADS-6], jobs[MAX_THREADS-6].length, _SIDD_UBYTE_OPS
|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT);
    nl_positions[MAX_THREADS-7]=_mm_cmpestri(*needle, 1,
*data_ptrs[MAX_THREADS-7], jobs[MAX_THREADS-7].length, _SIDD_UBYTE_OPS
|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT);
    nl_positions[MAX_THREADS-8]=_mm_cmpestri(*needle, 1,
*data_ptrs[MAX_THREADS-8], jobs[MAX_THREADS-8].length, _SIDD_UBYTE_OPS
|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT);
    nl_positions[MAX_THREADS-9]=_mm_cmpestri(*needle, 1,
*data_ptrs[MAX_THREADS-9], jobs[MAX_THREADS-9].length, _SIDD_UBYTE_OPS
|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT);
    nl_positions[MAX_THREADS-10]=_mm_cmpestri(*needle, 1,
*data_ptrs[MAX_THREADS-10], jobs[MAX_THREADS-10].length,
_SIDD_UBYTE_OPS |_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT);
    nl_positions[MAX_THREADS-11]=_mm_cmpestri(*needle, 1,
*data_ptrs[MAX_THREADS-11], jobs[MAX_THREADS-11].length,
_SIDD_UBYTE_OPS |_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT);
    nl_positions[MAX_THREADS-12]=_mm_cmpestri(*needle, 1,
*data_ptrs[MAX_THREADS-12], jobs[MAX_THREADS-12].length,
_SIDD_UBYTE_OPS |_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT);
    nl_positions[MAX_THREADS-13]=_mm_cmpestri(*needle, 1,
*data_ptrs[MAX_THREADS-13], jobs[MAX_THREADS-13].length,
_SIDD_UBYTE_OPS |_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT);
    nl_positions[MAX_THREADS-14]=_mm_cmpestri(*needle, 1,
*data_ptrs[MAX_THREADS-14], jobs[MAX_THREADS-14].length,
_SIDD_UBYTE_OPS |_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT);
    nl_positions[MAX_THREADS-15]=_mm_cmpestri(*needle, 1,
*data_ptrs[MAX_THREADS-15], jobs[MAX_THREADS-15].length,
_SIDD_UBYTE_OPS |_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT);
................
................
more code omitted
...............
...............
=======================================================================
As you can see, I am using comments in the C code, which are parsed by
my PHP preprocessor script to generate the real C code.
LOOP_THREADS_START:016 means the loop begins, the 016 means it has to
be unrolled 16 times, and _ITERATOR_ is the "i" variable, which goes
from 0 to 15 (one less than the unroll count). Very simple stuff, but I
can't do this with GCC.
I have seen that some people have worked around the GCC preprocessor's
limitations (https://github.com/orangeduck/CPP_COMPLETE) and got it
done, but apparently the loops are limited to about 1000 lines or
something like that. Also, this is a very dirty solution. Why can't I
just do it like in my PHP example:
LOOP BEGINS
    stuff to repeat
LOOP ENDS
Why do we have to xxxx our brains in order to have loops in GCC's
preprocessor? Or is it very difficult to implement? I wonder why I am
the first person to ask for this; doesn't anybody use loop unrolling
in their code? Or are there other, much more powerful solutions out
there? I have checked m4, but it also does not support loops, and
you have to xxxx your brain to work around that. The advantage of
having it in GCC is that the syntax can be checked quickly inside vim;
this is why a loop command is needed in the GCC preprocessor. It also
makes distributing your code cleaner: users don't need to install other
applications, just ./configure; make and it is done.

Waiting for your comments

TIA
Nulik



More information about the Gcc-help mailing list