libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009, 2010,
   3    2011, 2012 Free Software Foundation, Inc.
   4    Contributed by Per Bothner, 1994-95.
   5    Based on CCCP program by Paul Rubin, June 1986
   6    Adapted to ANSI C, Richard Stallman, Jan 1987
   7    Broken out to separate file, Zack Weinberg, Mar 2000
   8
   9 This program is free software; you can redistribute it and/or modify it
  10 under the terms of the GNU General Public License as published by the
  11 Free Software Foundation; either version 3, or (at your option) any
  12 later version.
  13
  14 This program is distributed in the hope that it will be useful,
  15 but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 GNU General Public License for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with this program; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "cpplib.h"
  26 #include "internal.h"
  27
  /* How the spelling of a token type is obtained.  One of these is
     stored as the CATEGORY of every token_spellings entry.  */
  28 enum spell_type
  29 {
  30   SPELL_OPERATOR = 0,   /* Used by every OP() row of the table.  */
  31   SPELL_IDENT,          /* Selected by TK() rows naming IDENT.  */
  32   SPELL_LITERAL,        /* Selected by TK() rows naming LITERAL.  */
  33   SPELL_NONE            /* Selected by TK() rows naming NONE.  */
  34 };
  35
  /* One row of the token_spellings table, indexed by token type.  */
  36 struct token_spelling
  37 {
  38   enum spell_type category;     /* Spelling category of this token type.  */
  39   const unsigned char *name;    /* Operator text for OP() rows, stringized
                                        enumerator name for TK() rows.  */
  40 };
  41
  /* Spellings of the six digraph tokens.  NOTE(review): the order is
     presumably the one the spelling routines index by; those use sites
     are outside this part of the file, so confirm there before
     reordering.  */
  42 static const unsigned char *const digraph_spellings[] =
  43 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  44
  /* Expand TTYPE_TABLE into the token_spellings array.  While the table
     is expanded, an OP (e, s) row becomes { SPELL_OPERATOR, s } and a
     TK (e, s) row becomes { SPELL_<s>, "<e>" }, i.e. the category named
     by S and the stringized enumerator E.  Both helper macros are
     undefined again straight afterwards.  */
  45 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  46 #define TK(e, s) { SPELL_ ## s,    UC #e },
  47 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  48 #undef OP
  49 #undef TK
  50
     /* Accessors for the table row that belongs to TOKEN's type.  */
  51 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  52 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  53
  /* Forward declarations of helpers that are private to this file.  */
  54 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  55 static int skip_line_comment (cpp_reader *);
  56 static void skip_whitespace (cpp_reader *, cppchar_t);
  57 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  58 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  59 static void store_comment (cpp_reader *, cpp_token *);
  60 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  61                             unsigned int, enum cpp_ttype);
  62 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  63 static int name_p (cpp_reader *, const cpp_string *);
  64 static tokenrun *next_tokenrun (tokenrun *);
  65
  66 static _cpp_buff *new_buff (size_t);
  67
  68
  69 /* Utility routine:
  70
  71    Compares, the token TOKEN to the NUL-terminated string STRING.
  72    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  73 int
  74 cpp_ideq (const cpp_token *token, const char *string)
  75 {
  76   if (token->type != CPP_NAME)
  77     return 0;
  78
  79   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  80 }
  81
  82 /* Record a note TYPE at byte POS into the current cleaned logical
  83    line.  */
  84 static void
  85 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  86 {
  87   if (buffer->notes_used == buffer->notes_cap)
  88     {
  89       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  90       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  91                                   buffer->notes_cap);
  92     }
  93
  94   buffer->notes[buffer->notes_used].pos = pos;
  95   buffer->notes[buffer->notes_used].type = type;
  96   buffer->notes_used++;
  97 }
  98
  99 \f
 100 /* Fast path to find line special characters using optimized character
 101    scanning algorithms.  Anything complicated falls back to the slow
 102    path below.  Since this loop is very hot it's worth doing these kinds
 103    of optimizations.
 104
 105    One of the paths through the ifdefs should provide
 106
 107      const uchar *search_line_fast (const uchar *s, const uchar *end);
 108
 109    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 110    the found character.
 111
 112    Note that the last character of the buffer is *always* a newline,
 113    as forced by _cpp_convert_input.  This fact can be used to avoid
 114    explicitly looking for the end of the buffer.  */
 115
 116 /* Configure gives us an ifdef test.  */
 117 #ifndef WORDS_BIGENDIAN
 118 #define WORDS_BIGENDIAN 0
 119 #endif
 120
 121 /* We'd like the largest integer that fits into a register.  There's nothing
 122    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 123    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 124    can get the "real" word size.  */
 125 #ifdef __GNUC__
 126 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 127 #else
 128 typedef unsigned long word_type;
 129 #endif
 130
 131 /* The code below is only expecting sizes 4 or 8.
 132    Die at compile-time if this expectation is violated.  */
 133 typedef char check_word_type_size
 134   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 135
 136 /* Return X with the first N bytes forced to values that won't match one
 137    of the interesting characters.  Note that NUL is not interesting.  */
 138
 139 static inline word_type
 140 acc_char_mask_misalign (word_type val, unsigned int n)
 141 {
 142   word_type mask = -1;
 143   if (WORDS_BIGENDIAN)
 144     mask >>= n * 8;
 145   else
 146     mask <<= n * 8;
 147   return val & mask;
 148 }
 149
 150 /* Return X replicated to all byte positions within WORD_TYPE.  */
 151
 152 static inline word_type
 153 acc_char_replicate (uchar x)
 154 {
 155   word_type ret;
 156
 157   ret = (x << 24) | (x << 16) | (x << 8) | x;
 158   if (sizeof(word_type) == 8)
 159     ret = (ret << 16 << 16) | ret;
 160   return ret;
 161 }
 162
 163 /* Return non-zero if some byte of VAL is (probably) C.  */
 164
 165 static inline word_type
 166 acc_char_cmp (word_type val, word_type c)
 167 {
 168 #if defined(__GNUC__) && defined(__alpha__)
 169   /* We can get exact results using a compare-bytes instruction.
 170      Get (val == c) via (0 >= (val ^ c)).  */
 171   return __builtin_alpha_cmpbge (0, val ^ c);
 172 #else
 173   word_type magic = 0x7efefefeU;
 174   if (sizeof(word_type) == 8)
 175     magic = (magic << 16 << 16) | 0xfefefefeU;
 176   magic |= 1;
 177
 178   val ^= c;
 179   return ((val + magic) ^ ~val) & ~magic;
 180 #endif
 181 }
 182
 183 /* Given the result of acc_char_cmp is non-zero, return the index of
 184    the found character.  If this was a false positive, return -1.  */
 185
 186 static inline int
 187 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 188                 word_type val ATTRIBUTE_UNUSED)
 189 {
 190 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 191   /* The cmpbge instruction sets *bits* of the result corresponding to
 192      matches in the bytes with no false positives.  */
 193   return __builtin_ctzl (cmp);
 194 #else
 195   unsigned int i;
 196
 197   /* ??? It would be nice to force unrolling here,
 198      and have all of these constants folded.  */
 199   for (i = 0; i < sizeof(word_type); ++i)
 200     {
 201       uchar c;
 202       if (WORDS_BIGENDIAN)
 203         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 204       else
 205         c = (val >> i * 8) & 0xff;
 206
 207       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 208         return i;
 209     }
 210
 211   return -1;
 212 #endif
 213 }
 214
 215 /* A version of the fast scanner using bit fiddling techniques.
 216
 217    For 32-bit words, one would normally perform 16 comparisons and
 218    16 branches.  With this algorithm one performs 24 arithmetic
 219    operations and one branch.  Whether this is faster with a 32-bit
 220    word size is going to be somewhat system dependent.
 221
 222    For 64-bit words, we eliminate twice the number of comparisons
 223    and branches without increasing the number of arithmetic operations.
 224    It's almost certainly going to be a win with 64-bit word size.  */
 225
 226 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 227   ATTRIBUTE_UNUSED;
 228
 229 static const uchar *
 230 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 231 {
 232   const word_type repl_nl = acc_char_replicate ('\n');
 233   const word_type repl_cr = acc_char_replicate ('\r');
 234   const word_type repl_bs = acc_char_replicate ('\\');
 235   const word_type repl_qm = acc_char_replicate ('?');
 236
 237   unsigned int misalign;
 238   const word_type *p;
 239   word_type val, t;
 240
 241   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 242   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 243   val = *p;
 244   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 245   if (misalign)
 246     val = acc_char_mask_misalign (val, misalign);
 247
 248   /* Main loop.  */
 249   while (1)
 250     {
 251       t  = acc_char_cmp (val, repl_nl);
 252       t |= acc_char_cmp (val, repl_cr);
 253       t |= acc_char_cmp (val, repl_bs);
 254       t |= acc_char_cmp (val, repl_qm);
 255
 256       if (__builtin_expect (t != 0, 0))
 257         {
 258           int i = acc_char_index (t, val);
 259           if (i >= 0)
 260             return (const uchar *)p + i;
 261         }
 262
 263       val = *++p;
 264     }
 265 }
 266
 267 /* Disable on Solaris 2/x86 until the following problems can be properly
 268    autoconfed:
 269
 270    The Solaris 9 assembler cannot assemble SSE4.2 insns.
 271    Before Solaris 9 Update 6, SSE insns cannot be executed.
 272    The Solaris 10+ assembler tags objects with the instruction set
 273    extensions used, so SSE4.2 executables cannot run on machines that
 274    don't support that extension.  */
 275
 276 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 277
 278 /* Replicated character data to be shared between implementations.
 279    Recall that outside of a context with vector support we can't
 280    define compatible vector types, therefore these are all defined
 281    in terms of raw characters.  */
 282 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 283   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 284     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 285   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 286     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 287   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 288     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 289   { '?', '?', '?', '?', '?', '?', '?', '?',
 290     '?', '?', '?', '?', '?', '?', '?', '?' },
 291 };
 292
 293 /* A version of the fast scanner using MMX vectorized byte compare insns.
 294
 295    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 296    which was packaged into SSE1; it is also present in the AMD MMX
 297    extension.  Mark the function as using "sse" so that we emit a real
 298    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
 299
 300 static const uchar *
 301 #ifndef __SSE__
 302 __attribute__((__target__("sse")))
 303 #endif
 304 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 305 {
 306   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 307   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 308
 309   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 310   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 311   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 312   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 313
 314   unsigned int misalign, found, mask;
 315   const v8qi *p;
 316   v8qi data, t, c;
 317
 318   /* Align the source pointer.  While MMX doesn't generate unaligned data
 319      faults, this allows us to safely scan to the end of the buffer without
 320      reading beyond the end of the last page.  */
 321   misalign = (uintptr_t)s & 7;
 322   p = (const v8qi *)((uintptr_t)s & -8);
 323   data = *p;
 324
 325   /* Create a mask for the bytes that are valid within the first
 326      16-byte block.  The Idea here is that the AND with the mask
 327      within the loop is "free", since we need some AND or TEST
 328      insn in order to set the flags for the branch anyway.  */
 329   mask = -1u << misalign;
 330
 331   /* Main loop processing 8 bytes at a time.  */
 332   goto start;
 333   do
 334     {
 335       data = *++p;
 336       mask = -1;
 337
 338     start:
 339       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 340       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 341       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 342       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 343       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 344       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 345       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 346       found = __builtin_ia32_pmovmskb (t);
 347       found &= mask;
 348     }
 349   while (!found);
 350
 351   __builtin_ia32_emms ();
 352
 353   /* FOUND contains 1 in bits for which we matched a relevant
 354      character.  Conversion to the byte index is trivial.  */
 355   found = __builtin_ctz(found);
 356   return (const uchar *)p + found;
 357 }
 358
 359 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 360
 361 static const uchar *
 362 #ifndef __SSE2__
 363 __attribute__((__target__("sse2")))
 364 #endif
 365 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 366 {
 367   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 368
 369   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 370   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 371   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 372   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 373
 374   unsigned int misalign, found, mask;
 375   const v16qi *p;
 376   v16qi data, t;
 377
 378   /* Align the source pointer.  */
 379   misalign = (uintptr_t)s & 15;
 380   p = (const v16qi *)((uintptr_t)s & -16);
 381   data = *p;
 382
 383   /* Create a mask for the bytes that are valid within the first
 384      16-byte block.  The Idea here is that the AND with the mask
 385      within the loop is "free", since we need some AND or TEST
 386      insn in order to set the flags for the branch anyway.  */
 387   mask = -1u << misalign;
 388
 389   /* Main loop processing 16 bytes at a time.  */
 390   goto start;
 391   do
 392     {
 393       data = *++p;
 394       mask = -1;
 395
 396     start:
 397       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
 398       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
 399       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
 400       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
 401       found = __builtin_ia32_pmovmskb128 (t);
 402       found &= mask;
 403     }
 404   while (!found);
 405
 406   /* FOUND contains 1 in bits for which we matched a relevant
 407      character.  Conversion to the byte index is trivial.  */
 408   found = __builtin_ctz(found);
 409   return (const uchar *)p + found;
 410 }
 411
 412 #ifdef HAVE_SSE4
 413 /* A version of the fast scanner using SSE 4.2 vectorized string insns.

        Scans forward from S for the first of '\n', '\r', '?' or '\\' and
        returns a pointer to it.  END is only used to decide whether an
        unaligned 16-byte read at S might cross into the page after the
        buffer; the scan itself does not stop at END.

        NOTE(review): the operand roles relied on by the asm statements
        (EAX = 4 valid bytes in SEARCH, EDX = 16 valid bytes of data,
        match index returned in ECX, carry flag set when a byte matched)
        come from the PCMPESTRI definition in the Intel manual, not from
        this file -- confirm them there before changing the
        constraints.  */
 414
 415 static const uchar *
 416 #ifndef __SSE4_2__
 417 __attribute__((__target__("sse4.2")))
 418 #endif
 419 search_line_sse42 (const uchar *s, const uchar *end)
 420 {
 421   typedef char v16qi __attribute__ ((__vector_size__ (16)));
       /* Only the first four elements are given; the rest are zero.  */
 422   static const v16qi search = { '\n', '\r', '?', '\\' };
 423
 424   uintptr_t si = (uintptr_t)s;
 425   uintptr_t index;
 426
 427   /* Check for unaligned input.  */
 428   if (si & 15)
 429     {
 430       if (__builtin_expect (end - s < 16, 0)
 431           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 432         {
 433           /* There are less than 16 bytes left in the buffer, and less
 434              than 16 bytes left on the page.  Reading 16 bytes at this
 435              point might generate a spurious page fault.  Defer to the
 436              SSE2 implementation, which already handles alignment.  */
 437           return search_line_sse2 (s, end);
 438         }
 439
 440       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 441          memory need not be aligned.  */
 442       __asm ("%vpcmpestri $0, (%1), %2"
 443              : "=c"(index) : "r"(s), "x"(search), "a"(4), "d"(16));
           /* An index below 16 is treated as a match inside this block.  */
 444       if (__builtin_expect (index < 16, 0))
 445         goto found;
 446
 447       /* Advance the pointer to an aligned address.  We will re-scan a
 448          few bytes, but we no longer need care for reading past the
 449          end of a page, since we're guaranteed a match.  */
 450       s = (const uchar *)((si + 16) & -16);
 451     }
 452
 453   /* Main loop, processing 16 bytes at a time.  By doing the whole loop
 454      in inline assembly, we can make proper use of the flags set.  */
       /* S is biased by -16 up front because the loop body adds 16 before
          every compare; the loop repeats for as long as the compare
          leaves the carry flag clear (jnc).  */
 455   __asm (      "sub $16, %1\n"
 456         "       .balign 16\n"
 457         "0:     add $16, %1\n"
 458         "       %vpcmpestri $0, (%1), %2\n"
 459         "       jnc 0b"
 460         : "=&c"(index), "+r"(s)
 461         : "x"(search), "a"(4), "d"(16));
 462
 463  found:
 464   return s + index;
 465 }
 466
 467 #else
 468 /* Work around out-dated assemblers without sse4 support.  */
 469 #define search_line_sse42 search_line_sse2
 470 #endif
 471
 472 /* Check the CPU capabilities.  */
 473
 474 #include "../gcc/config/i386/cpuid.h"
 475
 /* Signature shared by the search_line_* scanners, and the pointer
    through which the scanner selected by init_vectorized_lexer is
    called.  */
 476 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 477 static search_line_fast_type search_line_fast;
 478
 479 #define HAVE_init_vectorized_lexer 1
 480 static inline void
 481 init_vectorized_lexer (void)
 482 {
 483   unsigned dummy, ecx = 0, edx = 0;
 484   search_line_fast_type impl = search_line_acc_char;
 485   int minimum = 0;
 486
 487 #if defined(__SSE4_2__)
 488   minimum = 3;
 489 #elif defined(__SSE2__)
 490   minimum = 2;
 491 #elif defined(__SSE__)
 492   minimum = 1;
 493 #endif
 494
 495   if (minimum == 3)
 496     impl = search_line_sse42;
 497   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 498     {
 499       if (minimum == 3 || (ecx & bit_SSE4_2))
 500         impl = search_line_sse42;
 501       else if (minimum == 2 || (edx & bit_SSE2))
 502         impl = search_line_sse2;
 503       else if (minimum == 1 || (edx & bit_SSE))
 504         impl = search_line_mmx;
 505     }
 506   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 507     {
 508       if (minimum == 1
 509           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 510         impl = search_line_mmx;
 511     }
 512
 513   search_line_fast = impl;
 514 }
 515
 516 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__)
 517
 518 /* A version of the fast scanner using AltiVec vectorized byte compares.

        Returns a pointer to the first '\n', '\r', '\\' or '?' at or after
        S.  END is not consulted.  */
 519 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 520    so we can't compile this function without -maltivec on the command line
 521    (or implied by some other switch).  */
 522
 523 static const uchar *
 524 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 525 {
 526   typedef __attribute__((altivec(vector))) unsigned char vc;
 527
 528   const vc repl_nl = {
 529     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 530     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 531   };
 532   const vc repl_cr = {
 533     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 534     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 535   };
 536   const vc repl_bs = {
 537     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 538     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 539   };
 540   const vc repl_qm = {
 541     '?', '?', '?', '?', '?', '?', '?', '?',
 542     '?', '?', '?', '?', '?', '?', '?', '?',
 543   };
 544   const vc ones = {
 545     -1, -1, -1, -1, -1, -1, -1, -1,
 546     -1, -1, -1, -1, -1, -1, -1, -1,
 547   };
 548   const vc zero = { 0 };
 549
 550   vc data, mask, t;
 551
 552   /* Altivec loads automatically mask addresses with -16.  This lets us
 553      issue the first load as early as possible.  */
 554   data = __builtin_vec_ld(0, (const vc *)s);
 555
 556   /* Discard bytes before the beginning of the buffer.  Do this by
 557      beginning with all ones and shifting in zeros according to the
 558      mis-alignment.  The LVSR instruction pulls the exact shift we
 559      want from the address.  */
 560   mask = __builtin_vec_lvsr(0, s);
 561   mask = __builtin_vec_perm(zero, ones, mask);
 562   data &= mask;
 563
 564   /* While altivec loads mask addresses, we still need to align S so
 565      that the offset we compute at the end is correct.  */
 566   s = (const uchar *)((uintptr_t)s & -16);
 567
 568   /* Main loop processing 16 bytes at a time.  */
       /* The first block is already loaded and masked, so enter the loop
          at the compare step.  */
 569   goto start;
 570   do
 571     {
 572       vc m_nl, m_cr, m_bs, m_qm;
 573
 574       s += 16;
 575       data = __builtin_vec_ld(0, (const vc *)s);
 576
 577     start:
 578       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 579       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 580       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 581       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 582       t = (m_nl | m_cr) | (m_bs | m_qm);
 583
 584       /* T now contains 0xff in bytes for which we matched one of the relevant
 585          characters.  We want to exit the loop if any byte in T is non-zero.
 586          Below is the expansion of vec_any_ne(t, zero).  */
 587     }
 588   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 589
 590   {
       /* N is the number of unsigned longs that make up one vector.  */
 591 #define N  (sizeof(vc) / sizeof(long))
 592
       /* Compile-time check that N is 2 or 4, the only cases the switch
          below handles.  */
 593     typedef char check_count[(N == 2 || N == 4) * 2 - 1];
 594     union {
 595       vc v;
 596       unsigned long l[N];
 597     } u;
 598     unsigned long l, i = 0;
 599
 600     u.v = t;
 601
 602     /* Find the first word of T that is non-zero.  */
       /* S is advanced past every all-zero word that is skipped.  */
 603     switch (N)
 604       {
 605       case 4:
 606         l = u.l[i++];
 607         if (l != 0)
 608           break;
 609         s += sizeof(unsigned long);
 610         l = u.l[i++];
 611         if (l != 0)
 612           break;
 613         s += sizeof(unsigned long);
             /* Fall through: the last two words are examined exactly as
                in the two-word case.  */
 614       case 2:
 615         l = u.l[i++];
 616         if (l != 0)
 617           break;
 618         s += sizeof(unsigned long);
 619         l = u.l[i];
 620       }
 621
 622     /* L now contains 0xff in bytes for which we matched one of the
 623        relevant characters.  We can find the byte index by finding
 624        its bit index and dividing by 8.  */
       /* NOTE(review): counting leading zeros treats the most significant
          byte of L as the first byte in memory, which looks like a
          big-endian assumption -- confirm before using this on a
          little-endian target.  */
 625     l = __builtin_clzl(l) >> 3;
 626     return s + l;
 627
 628 #undef N
 629   }
 630 }
 631
 632 #elif defined (__ARM_NEON__)
 633 #include "arm_neon.h"
 634
 635 static const uchar *
 636 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 637 {
 638   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 639   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 640   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 641   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
 642   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 643
 644   unsigned int misalign, found, mask;
 645   const uint8_t *p;
 646   uint8x16_t data;
 647
 648   /* Align the source pointer.  */
 649   misalign = (uintptr_t)s & 15;
 650   p = (const uint8_t *)((uintptr_t)s & -16);
 651   data = vld1q_u8 (p);
 652
 653   /* Create a mask for the bytes that are valid within the first
 654      16-byte block.  The Idea here is that the AND with the mask
 655      within the loop is "free", since we need some AND or TEST
 656      insn in order to set the flags for the branch anyway.  */
 657   mask = (-1u << misalign) & 0xffff;
 658
 659   /* Main loop, processing 16 bytes at a time.  */
 660   goto start;
 661
 662   do
 663     {
 664       uint8x8_t l;
 665       uint16x4_t m;
 666       uint32x2_t n;
 667       uint8x16_t t, u, v, w;
 668
 669       p += 16;
 670       data = vld1q_u8 (p);
 671       mask = 0xffff;
 672
 673     start:
 674       t = vceqq_u8 (data, repl_nl);
 675       u = vceqq_u8 (data, repl_cr);
 676       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 677       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 678       t = vandq_u8 (vorrq_u8 (v, w), xmask);
 679       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 680       m = vpaddl_u8 (l);
 681       n = vpaddl_u16 (m);
 682
 683       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 684               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 685       found &= mask;
 686     }
 687   while (!found);
 688
 689   /* FOUND contains 1 in bits for which we matched a relevant
 690      character.  Conversion to the byte index is trivial.  */
 691   found = __builtin_ctz (found);
 692   return (const uchar *)p + found;
 693 }
 694
 695 #else
 696
 697 /* We only have one accelerated alternative.  Use a direct call so that
 698    we encourage inlining.  */
 699
 700 #define search_line_fast  search_line_acc_char
 701
 702 #endif
 703
 704 /* Initialize the lexer if needed.  */
 705
 706 void
 707 _cpp_init_lexer (void)
 708 {
 709 #ifdef HAVE_init_vectorized_lexer
 710   init_vectorized_lexer ();
 711 #endif
 712 }
 713
 714 /* Returns with a logical line that contains no escaped newlines or
 715    trigraphs.  This is a time-critical inner loop.

        The line starts at PFILE's buffer->next_line.  The buffer's note
        list is reset, and cur and line_base are set to the start of the
        line.  On return the byte that ends the cleaned line has been
        overwritten with '\n', a sentinel note has been added just after
        it, and next_line points past the physical line ending that was
        consumed.

        Unless the buffer is marked from_stage3, backslash-newline pairs
        are removed in place and, when the trigraphs option is on,
        trigraphs are replaced in place.  Each such spot is recorded with
        add_line_note: ' ' for a backslash separated from the newline by
        whitespace, '\\' for a plain escaped newline, and the third
        character of the trigraph for a trigraph (recorded whether or not
        it is converted).  For from_stage3 buffers only the end of the
        line is located.  */
 716 void
 717 _cpp_clean_line (cpp_reader *pfile)
 718 {
 719   cpp_buffer *buffer;
 720   const uchar *s;
 721   uchar c, *d, *p;
 722
 723   buffer = pfile->buffer;
 724   buffer->cur_note = buffer->notes_used = 0;
 725   buffer->cur = buffer->line_base = buffer->next_line;
 726   buffer->need_line = false;
 727   s = buffer->next_line;
 728
 729   if (!buffer->from_stage3)
 730     {
           /* Most recent backslash seen by the fast path, if any.  */
 731       const uchar *pbackslash = NULL;
 732
 733       /* Fast path.  This is the common case of an un-escaped line with
 734          no trigraphs.  The primary win here is by not writing any
 735          data back to memory until we have to.  */
 736       while (1)
 737         {
 738           /* Perform an optimized search for \n, \r, \\, ?.  */
 739           s = search_line_fast (s, buffer->rlimit);
 740
 741           c = *s;
 742           if (c == '\\')
 743             {
 744               /* Record the location of the backslash and continue.  */
 745               pbackslash = s++;
 746             }
 747           else if (__builtin_expect (c == '?', 0))
 748             {
 749               if (__builtin_expect (s[1] == '?', false)
 750                    && _cpp_trigraph_map[s[2]])
 751                 {
 752                   /* Have a trigraph.  We may or may not have to convert
 753                      it.  Add a line note regardless, for -Wtrigraphs.  */
 754                   add_line_note (buffer, s, s[2]);
 755                   if (CPP_OPTION (pfile, trigraphs))
 756                     {
 757                       /* We do, and that means we have to switch to the
 758                          slow path.  */
 759                       d = (uchar *) s;
 760                       *d = _cpp_trigraph_map[s[2]];
 761                       s += 2;
 762                       goto slow_path;
 763                     }
 764                 }
 765               /* Not a trigraph.  Continue on fast-path.  */
 766               s++;
 767             }
 768           else
 769             break;
 770         }
 771
 772       /* This must be \r or \n.  We're either done, or we'll be forced
 773          to write back to the buffer and continue on the slow path.  */
 774       d = (uchar *) s;
 775
 776       if (__builtin_expect (s == buffer->rlimit, false))
 777         goto done;
 778
 779       /* DOS line ending? */
 780       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
 781         {
 782           s++;
 783           if (s == buffer->rlimit)
 784             goto done;
 785         }
 786
 787       if (__builtin_expect (pbackslash == NULL, true))
 788         goto done;
 789
 790       /* Check for escaped newline.  */
           /* Step back over horizontal whitespace; the newline is escaped
              only if the character before it is the last backslash
              seen.  */
 791       p = d;
 792       while (is_nvspace (p[-1]))
 793         p--;
 794       if (p - 1 != pbackslash)
 795         goto done;
 796
 797       /* Have an escaped newline; process it and proceed to
 798          the slow path.  */
 799       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
           /* The slow path pre-increments D before storing, so starting
              at p - 2 makes the next byte overwrite the backslash.
              next_line is reused as the lower bound of the slow path's
              backwards whitespace scan.  */
 800       d = p - 2;
 801       buffer->next_line = p - 1;
 802
 803     slow_path:
           /* Copy bytes from S down to D, dropping escaped newlines and
              converting trigraphs as they are met.  */
 804       while (1)
 805         {
 806           c = *++s;
 807           *++d = c;
 808
 809           if (c == '\n' || c == '\r')
 810             {
 811               /* Handle DOS line endings.  */
 812               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 813                 s++;
 814               if (s == buffer->rlimit)
 815                 break;
 816
 817               /* Escaped?  */
 818               p = d;
 819               while (p != buffer->next_line && is_nvspace (p[-1]))
 820                 p--;
 821               if (p == buffer->next_line || p[-1] != '\\')
 822                 break;
 823
 824               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
 825               d = p - 2;
 826               buffer->next_line = p - 1;
 827             }
 828           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
 829             {
 830               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
 831               add_line_note (buffer, d, s[2]);
 832               if (CPP_OPTION (pfile, trigraphs))
 833                 {
 834                   *d = _cpp_trigraph_map[s[2]];
 835                   s += 2;
 836                 }
 837             }
 838         }
 839     }
 840   else
 841     {
           /* from_stage3 buffer: just locate the end of the line.  */
 842       while (*s != '\n' && *s != '\r')
 843         s++;
 844       d = (uchar *) s;
 845
 846       /* Handle DOS line endings.  */
 847       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
 848         s++;
 849     }
 850
 851  done:
 852   *d = '\n';
 853   /* A sentinel note that should never be processed.  */
 854   add_line_note (buffer, d + 1, '\n');
 855   buffer->next_line = s + 1;
 856 }
 857
 858 /* Return true if the trigraph indicated by NOTE should be warned
 859    about in a comment.  */
 860 static bool
 861 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 862 {
 863   const uchar *p;
 864
 865   /* Within comments we don't warn about trigraphs, unless the
 866      trigraph forms an escaped newline, as that may change
 867      behavior.  */
 868   if (note->type != '/')
 869     return false;
 870
 871   /* If -trigraphs, then this was an escaped newline iff the next note
 872      is coincident.  */
 873   if (CPP_OPTION (pfile, trigraphs))
 874     return note[1].pos == note->pos;
 875
 876   /* Otherwise, see if this forms an escaped newline.  */
 877   p = note->pos + 3;
 878   while (is_nvspace (*p))
 879     p++;
 880
 881   /* There might have been escaped newlines between the trigraph and the
 882      newline we found.  Hence the position test.  */
 883   return (*p == '\n' && p < note[1].pos);
 884 }
 885
 886 /* Process the notes created by add_line_note as far as the current
 887    location.  */
 888 void
 889 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 890 {
 891   cpp_buffer *buffer = pfile->buffer;
 892
 893   for (;;)
 894     {
 895       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 896       unsigned int col;
 897
 898       if (note->pos > buffer->cur)
 899         break;
 900
 901       buffer->cur_note++;
 902       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 903
 904       if (note->type == '\\' || note->type == ' ')
 905         {
 906           if (note->type == ' ' && !in_comment)
 907             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 908                                  "backslash and newline separated by space");
 909
 910           if (buffer->next_line > buffer->rlimit)
 911             {
 912               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 913                                    "backslash-newline at end of file");
 914               /* Prevent "no newline at end of file" warning.  */
 915               buffer->next_line = buffer->rlimit;
 916             }
 917
 918           buffer->line_base = note->pos;
 919           CPP_INCREMENT_LINE (pfile, 0);
 920         }
 921       else if (_cpp_trigraph_map[note->type])
 922         {
 923           if (CPP_OPTION (pfile, warn_trigraphs)
 924               && (!in_comment || warn_in_comment (pfile, note)))
 925             {
 926               if (CPP_OPTION (pfile, trigraphs))
 927                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
 928                                        pfile->line_table->highest_line, col,
 929                                        "trigraph ??%c converted to %c",
 930                                        note->type,
 931                                        (int) _cpp_trigraph_map[note->type]);
 932               else
 933                 {
 934                   cpp_warning_with_line
 935                     (pfile, CPP_W_TRIGRAPHS,
 936                      pfile->line_table->highest_line, col,
 937                      "trigraph ??%c ignored, use -trigraphs to enable",
 938                      note->type);
 939                 }
 940             }
 941         }
 942       else if (note->type == 0)
 943         /* Already processed in lex_raw_string.  */;
 944       else
 945         abort ();
 946     }
 947 }
 948
 949 /* Skip a C-style block comment.  We find the end of the comment by
 950    seeing if an asterisk is before every '/' we encounter.  Returns
 951    nonzero if comment terminated by EOF, zero otherwise.
 952
 953    Buffer->cur points to the initial asterisk of the comment.  */
 954 bool
 955 _cpp_skip_block_comment (cpp_reader *pfile)
 956 {
 957   cpp_buffer *buffer = pfile->buffer;
 958   const uchar *cur = buffer->cur;
 959   uchar c;
 960
 961   cur++;
 962   if (*cur == '/')
 963     cur++;
 964
 965   for (;;)
 966     {
 967       /* People like decorating comments with '*', so check for '/'
 968          instead for efficiency.  */
 969       c = *cur++;
 970
 971       if (c == '/')
 972         {
 973           if (cur[-2] == '*')
 974             break;
 975
 976           /* Warn about potential nested comments, but not if the '/'
 977              comes immediately before the true comment delimiter.
 978              Don't bother to get it right across escaped newlines.  */
 979           if (CPP_OPTION (pfile, warn_comments)
 980               && cur[0] == '*' && cur[1] != '/')
 981             {
 982               buffer->cur = cur;
 983               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
 984                                      pfile->line_table->highest_line,
 985                                      CPP_BUF_COL (buffer),
 986                                      "\"/*\" within comment");
 987             }
 988         }
 989       else if (c == '\n')
 990         {
 991           unsigned int cols;
 992           buffer->cur = cur - 1;
 993           _cpp_process_line_notes (pfile, true);
 994           if (buffer->next_line >= buffer->rlimit)
 995             return true;
 996           _cpp_clean_line (pfile);
 997
 998           cols = buffer->next_line - buffer->line_base;
 999           CPP_INCREMENT_LINE (pfile, cols);
1000
1001           cur = buffer->cur;
1002         }
1003     }
1004
1005   buffer->cur = cur;
1006   _cpp_process_line_notes (pfile, true);
1007   return false;
1008 }
1009
1010 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1011    terminating newline.  Handles escaped newlines.  Returns nonzero
1012    if a multiline comment.  */
1013 static int
1014 skip_line_comment (cpp_reader *pfile)
1015 {
1016   cpp_buffer *buffer = pfile->buffer;
1017   source_location orig_line = pfile->line_table->highest_line;
1018
1019   while (*buffer->cur != '\n')
1020     buffer->cur++;
1021
1022   _cpp_process_line_notes (pfile, true);
1023   return orig_line != pfile->line_table->highest_line;
1024 }
1025
1026 /* Skips whitespace, saving the next non-whitespace character.  */
1027 static void
1028 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1029 {
1030   cpp_buffer *buffer = pfile->buffer;
1031   bool saw_NUL = false;
1032
1033   do
1034     {
1035       /* Horizontal space always OK.  */
1036       if (c == ' ' || c == '\t')
1037         ;
1038       /* Just \f \v or \0 left.  */
1039       else if (c == '\0')
1040         saw_NUL = true;
1041       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1042         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1043                              CPP_BUF_COL (buffer),
1044                              "%s in preprocessing directive",
1045                              c == '\f' ? "form feed" : "vertical tab");
1046
1047       c = *buffer->cur++;
1048     }
1049   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1050   while (is_nvspace (c));
1051
1052   if (saw_NUL)
1053     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1054
1055   buffer->cur--;
1056 }
1057
1058 /* See if the characters of a number token are valid in a name (no
1059    '.', '+' or '-').  */
1060 static int
1061 name_p (cpp_reader *pfile, const cpp_string *string)
1062 {
1063   unsigned int i;
1064
1065   for (i = 0; i < string->len; i++)
1066     if (!is_idchar (string->text[i]))
1067       return 0;
1068
1069   return 1;
1070 }
1071
1072 /* After parsing an identifier or other sequence, produce a warning about
1073    sequences not in NFC/NFKC.  */
1074 static void
1075 warn_about_normalization (cpp_reader *pfile,
1076                           const cpp_token *token,
1077                           const struct normalize_state *s)
1078 {
1079   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1080       && !pfile->state.skipping)
1081     {
1082       /* Make sure that the token is printed using UCNs, even
1083          if we'd otherwise happily print UTF-8.  */
1084       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1085       size_t sz;
1086
1087       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1088       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1089         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1090                                "`%.*s' is not in NFKC", (int) sz, buf);
1091       else
1092         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1093                                "`%.*s' is not in NFC", (int) sz, buf);
1094     }
1095 }
1096
1097 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1098    an identifier.  FIRST is TRUE if this starts an identifier.  */
1099 static bool
1100 forms_identifier_p (cpp_reader *pfile, int first,
1101                     struct normalize_state *state)
1102 {
1103   cpp_buffer *buffer = pfile->buffer;
1104
1105   if (*buffer->cur == '$')
1106     {
1107       if (!CPP_OPTION (pfile, dollars_in_ident))
1108         return false;
1109
1110       buffer->cur++;
1111       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1112         {
1113           CPP_OPTION (pfile, warn_dollars) = 0;
1114           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1115         }
1116
1117       return true;
1118     }
1119
1120   /* Is this a syntactically valid UCN?  */
1121   if (CPP_OPTION (pfile, extended_identifiers)
1122       && *buffer->cur == '\\'
1123       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1124     {
1125       buffer->cur += 2;
1126       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1127                           state))
1128         return true;
1129       buffer->cur -= 2;
1130     }
1131
1132   return false;
1133 }
1134
1135 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1136 static cpp_hashnode *
1137 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1138 {
1139   cpp_hashnode *result;
1140   const uchar *cur;
1141   unsigned int len;
1142   unsigned int hash = HT_HASHSTEP (0, *base);
1143
1144   cur = base + 1;
1145   while (ISIDNUM (*cur))
1146     {
1147       hash = HT_HASHSTEP (hash, *cur);
1148       cur++;
1149     }
1150   len = cur - base;
1151   hash = HT_HASHFINISH (hash, len);
1152   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1153                                               base, len, hash, HT_ALLOC));
1154
1155   /* Rarely, identifiers require diagnostics when lexed.  */
1156   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1157                         && !pfile->state.skipping, 0))
1158     {
1159       /* It is allowed to poison the same identifier twice.  */
1160       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1161         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1162                    NODE_NAME (result));
1163
1164       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1165          replacement list of a variadic macro.  */
1166       if (result == pfile->spec_nodes.n__VA_ARGS__
1167           && !pfile->state.va_args_ok)
1168         cpp_error (pfile, CPP_DL_PEDWARN,
1169                    "__VA_ARGS__ can only appear in the expansion"
1170                    " of a C99 variadic macro");
1171
1172       /* For -Wc++-compat, warn about use of C++ named operators.  */
1173       if (result->flags & NODE_WARN_OPERATOR)
1174         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1175                      "identifier \"%s\" is a special operator name in C++",
1176                      NODE_NAME (result));
1177     }
1178
1179   return result;
1180 }
1181
1182 /* Get the cpp_hashnode of an identifier specified by NAME in
1183    the current cpp_reader object.  If none is found, NULL is returned.  */
1184 cpp_hashnode *
1185 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1186 {
1187   cpp_hashnode *result;
1188   result = lex_identifier_intern (pfile, (uchar *) name);
1189   return result;
1190 }
1191
1192 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1193 static cpp_hashnode *
1194 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1195                 struct normalize_state *nst)
1196 {
1197   cpp_hashnode *result;
1198   const uchar *cur;
1199   unsigned int len;
1200   unsigned int hash = HT_HASHSTEP (0, *base);
1201
1202   cur = pfile->buffer->cur;
1203   if (! starts_ucn)
1204     while (ISIDNUM (*cur))
1205       {
1206         hash = HT_HASHSTEP (hash, *cur);
1207         cur++;
1208       }
1209   pfile->buffer->cur = cur;
1210   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1211     {
1212       /* Slower version for identifiers containing UCNs (or $).  */
1213       do {
1214         while (ISIDNUM (*pfile->buffer->cur))
1215           {
1216             pfile->buffer->cur++;
1217             NORMALIZE_STATE_UPDATE_IDNUM (nst);
1218           }
1219       } while (forms_identifier_p (pfile, false, nst));
1220       result = _cpp_interpret_identifier (pfile, base,
1221                                           pfile->buffer->cur - base);
1222     }
1223   else
1224     {
1225       len = cur - base;
1226       hash = HT_HASHFINISH (hash, len);
1227
1228       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1229                                                   base, len, hash, HT_ALLOC));
1230     }
1231
1232   /* Rarely, identifiers require diagnostics when lexed.  */
1233   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1234                         && !pfile->state.skipping, 0))
1235     {
1236       /* It is allowed to poison the same identifier twice.  */
1237       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1238         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1239                    NODE_NAME (result));
1240
1241       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1242          replacement list of a variadic macro.  */
1243       if (result == pfile->spec_nodes.n__VA_ARGS__
1244           && !pfile->state.va_args_ok)
1245         cpp_error (pfile, CPP_DL_PEDWARN,
1246                    "__VA_ARGS__ can only appear in the expansion"
1247                    " of a C99 variadic macro");
1248
1249       /* For -Wc++-compat, warn about use of C++ named operators.  */
1250       if (result->flags & NODE_WARN_OPERATOR)
1251         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1252                      "identifier \"%s\" is a special operator name in C++",
1253                      NODE_NAME (result));
1254     }
1255
1256   return result;
1257 }
1258
1259 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1260 static void
1261 lex_number (cpp_reader *pfile, cpp_string *number,
1262             struct normalize_state *nst)
1263 {
1264   const uchar *cur;
1265   const uchar *base;
1266   uchar *dest;
1267
1268   base = pfile->buffer->cur - 1;
1269   do
1270     {
1271       cur = pfile->buffer->cur;
1272
1273       /* N.B. ISIDNUM does not include $.  */
1274       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
1275         {
1276           cur++;
1277           NORMALIZE_STATE_UPDATE_IDNUM (nst);
1278         }
1279
1280       pfile->buffer->cur = cur;
1281     }
1282   while (forms_identifier_p (pfile, false, nst));
1283
1284   number->len = cur - base;
1285   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1286   memcpy (dest, base, number->len);
1287   dest[number->len] = '\0';
1288   number->text = dest;
1289 }
1290
1291 /* Create a token of type TYPE with a literal spelling.  */
1292 static void
1293 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1294                 unsigned int len, enum cpp_ttype type)
1295 {
1296   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1297
1298   memcpy (dest, base, len);
1299   dest[len] = '\0';
1300   token->type = type;
1301   token->val.str.len = len;
1302   token->val.str.text = dest;
1303 }
1304
1305 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1306    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1307
1308 static void
1309 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1310                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1311 {
1312   _cpp_buff *first_buff = *first_buff_p;
1313   _cpp_buff *last_buff = *last_buff_p;
1314
1315   if (first_buff == NULL)
1316     first_buff = last_buff = _cpp_get_buff (pfile, len);
1317   else if (len > BUFF_ROOM (last_buff))
1318     {
1319       size_t room = BUFF_ROOM (last_buff);
1320       memcpy (BUFF_FRONT (last_buff), base, room);
1321       BUFF_FRONT (last_buff) += room;
1322       base += room;
1323       len -= room;
1324       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1325     }
1326
1327   memcpy (BUFF_FRONT (last_buff), base, len);
1328   BUFF_FRONT (last_buff) += len;
1329
1330   *first_buff_p = first_buff;
1331   *last_buff_p = last_buff;
1332 }
1333
1334 /* Lexes a raw string.  The stored string contains the spelling, including
1335    double quotes, delimiter string, '(' and ')', any leading
1336    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1337    literal, or CPP_OTHER if it was not properly terminated.
1338
1339    The spelling is NUL-terminated, but it is not guaranteed that this
1340    is the first NUL since embedded NULs are preserved.  */
1341
1342 static void
1343 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1344                 const uchar *cur)
1345 {
1346   const uchar *raw_prefix;
1347   unsigned int raw_prefix_len = 0;
1348   enum cpp_ttype type;
1349   size_t total_len = 0;
1350   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1351   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1352
1353   type = (*base == 'L' ? CPP_WSTRING :
1354           *base == 'U' ? CPP_STRING32 :
1355           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1356           : CPP_STRING);
1357
1358   raw_prefix = cur + 1;
1359   while (raw_prefix_len < 16)
1360     {
1361       switch (raw_prefix[raw_prefix_len])
1362         {
1363         case ' ': case '(': case ')': case '\\': case '\t':
1364         case '\v': case '\f': case '\n': default:
1365           break;
1366         /* Basic source charset except the above chars.  */
1367         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1368         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1369         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1370         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1371         case 'y': case 'z':
1372         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1373         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1374         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1375         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1376         case 'Y': case 'Z':
1377         case '0': case '1': case '2': case '3': case '4': case '5':
1378         case '6': case '7': case '8': case '9':
1379         case '_': case '{': case '}': case '#': case '[': case ']':
1380         case '<': case '>': case '%': case ':': case ';': case '.':
1381         case '?': case '*': case '+': case '-': case '/': case '^':
1382         case '&': case '|': case '~': case '!': case '=': case ',':
1383         case '"': case '\'':
1384           raw_prefix_len++;
1385           continue;
1386         }
1387       break;
1388     }
1389
1390   if (raw_prefix[raw_prefix_len] != '(')
1391     {
1392       int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len)
1393                 + 1;
1394       if (raw_prefix_len == 16)
1395         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1396                              "raw string delimiter longer than 16 characters");
1397       else
1398         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1399                              "invalid character '%c' in raw string delimiter",
1400                              (int) raw_prefix[raw_prefix_len]);
1401       pfile->buffer->cur = raw_prefix - 1;
1402       create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER);
1403       return;
1404     }
1405
1406   cur = raw_prefix + raw_prefix_len + 1;
1407   for (;;)
1408     {
1409 #define BUF_APPEND(STR,LEN)                                     \
1410       do {                                                      \
1411         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1412                         &first_buff, &last_buff);               \
1413         total_len += (LEN);                                     \
1414       } while (0);
1415
1416       cppchar_t c;
1417
1418       /* If we previously performed any trigraph or line splicing
1419          transformations, undo them within the body of the raw string.  */
1420       while (note->pos < cur)
1421         ++note;
1422       for (; note->pos == cur; ++note)
1423         {
1424           switch (note->type)
1425             {
1426             case '\\':
1427             case ' ':
1428               /* Restore backslash followed by newline.  */
1429               BUF_APPEND (base, cur - base);
1430               base = cur;
1431               BUF_APPEND ("\\", 1);
1432             after_backslash:
1433               if (note->type == ' ')
1434                 {
1435                   /* GNU backslash whitespace newline extension.  FIXME
1436                      could be any sequence of non-vertical space.  When we
1437                      can properly restore any such sequence, we should mark
1438                      this note as handled so _cpp_process_line_notes
1439                      doesn't warn.  */
1440                   BUF_APPEND (" ", 1);
1441                 }
1442
1443               BUF_APPEND ("\n", 1);
1444               break;
1445
1446             case 0:
1447               /* Already handled.  */
1448               break;
1449
1450             default:
1451               if (_cpp_trigraph_map[note->type])
1452                 {
1453                   /* Don't warn about this trigraph in
1454                      _cpp_process_line_notes, since trigraphs show up as
1455                      trigraphs in raw strings.  */
1456                   uchar type = note->type;
1457                   note->type = 0;
1458
1459                   if (!CPP_OPTION (pfile, trigraphs))
1460                     /* If we didn't convert the trigraph in the first
1461                        place, don't do anything now either.  */
1462                     break;
1463
1464                   BUF_APPEND (base, cur - base);
1465                   base = cur;
1466                   BUF_APPEND ("??", 2);
1467
1468                   /* ??/ followed by newline gets two line notes, one for
1469                      the trigraph and one for the backslash/newline.  */
1470                   if (type == '/' && note[1].pos == cur)
1471                     {
1472                       if (note[1].type != '\\'
1473                           && note[1].type != ' ')
1474                         abort ();
1475                       BUF_APPEND ("/", 1);
1476                       ++note;
1477                       goto after_backslash;
1478                     }
1479                   /* The ) from ??) could be part of the suffix.  */
1480                   else if (type == ')'
1481                            && strncmp ((const char *) cur+1,
1482                                        (const char *) raw_prefix,
1483                                        raw_prefix_len) == 0
1484                            && cur[raw_prefix_len+1] == '"')
1485                     {
1486                       BUF_APPEND (")", 1);
1487                       base++;
1488                       cur += raw_prefix_len + 2;
1489                       goto break_outer_loop;
1490                     }
1491                   else
1492                     {
1493                       /* Skip the replacement character.  */
1494                       base = ++cur;
1495                       BUF_APPEND (&type, 1);
1496                     }
1497                 }
1498               else
1499                 abort ();
1500               break;
1501             }
1502         }
1503       c = *cur++;
1504
1505       if (c == ')'
1506           && strncmp ((const char *) cur, (const char *) raw_prefix,
1507                       raw_prefix_len) == 0
1508           && cur[raw_prefix_len] == '"')
1509         {
1510           cur += raw_prefix_len + 1;
1511           break;
1512         }
1513       else if (c == '\n')
1514         {
1515           if (pfile->state.in_directive
1516               || pfile->state.parsing_args
1517               || pfile->state.in_deferred_pragma)
1518             {
1519               cur--;
1520               type = CPP_OTHER;
1521               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1522                                    "unterminated raw string");
1523               break;
1524             }
1525
1526           BUF_APPEND (base, cur - base);
1527
1528           if (pfile->buffer->cur < pfile->buffer->rlimit)
1529             CPP_INCREMENT_LINE (pfile, 0);
1530           pfile->buffer->need_line = true;
1531
1532           pfile->buffer->cur = cur-1;
1533           _cpp_process_line_notes (pfile, false);
1534           if (!_cpp_get_fresh_line (pfile))
1535             {
1536               source_location src_loc = token->src_loc;
1537               token->type = CPP_EOF;
1538               /* Tell the compiler the line number of the EOF token.  */
1539               token->src_loc = pfile->line_table->highest_line;
1540               token->flags = BOL;
1541               if (first_buff != NULL)
1542                 _cpp_release_buff (pfile, first_buff);
1543               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1544                                    "unterminated raw string");
1545               return;
1546             }
1547
1548           cur = base = pfile->buffer->cur;
1549           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1550         }
1551     }
1552  break_outer_loop:
1553
1554   if (CPP_OPTION (pfile, user_literals))
1555     {
1556       /* According to C++11 [lex.ext]p10, a ud-suffix not starting with an
1557          underscore is ill-formed.  Since this breaks programs using macros
1558          from inttypes.h, we generate a warning and treat the ud-suffix as a
1559          separate preprocessing token.  This approach is under discussion by
1560          the standards committee, and has been adopted as a conforming
1561          extension by other front ends such as clang. */
1562       if (ISALPHA (*cur))
1563         {
1564           // Raise a warning, but do not consume subsequent tokens.
1565           if (CPP_OPTION (pfile, warn_literal_suffix))
1566             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1567                                    token->src_loc, 0,
1568                                    "invalid suffix on literal; C++11 requires "
1569                                    "a space between literal and identifier");
1570         }
1571       /* Grab user defined literal suffix.  */
1572       else if (*cur == '_')
1573         {
1574           type = cpp_userdef_string_add_type (type);
1575           ++cur;
1576
1577           while (ISIDNUM (*cur))
1578             ++cur;
1579         }
1580     }
1581
1582   pfile->buffer->cur = cur;
1583   if (first_buff == NULL)
1584     create_literal (pfile, token, base, cur - base, type);
1585   else
1586     {
1587       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1588
1589       token->type = type;
1590       token->val.str.len = total_len + (cur - base);
1591       token->val.str.text = dest;
1592       last_buff = first_buff;
1593       while (last_buff != NULL)
1594         {
1595           memcpy (dest, last_buff->base,
1596                   BUFF_FRONT (last_buff) - last_buff->base);
1597           dest += BUFF_FRONT (last_buff) - last_buff->base;
1598           last_buff = last_buff->next;
1599         }
1600       _cpp_release_buff (pfile, first_buff);
1601       memcpy (dest, base, cur - base);
1602       dest[cur - base] = '\0';
1603     }
1604 }
1605
1606 /* Lexes a string, character constant, or angle-bracketed header file
1607    name.  The stored string contains the spelling, including opening
1608    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1609    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1610    if it was not properly terminated, or CPP_LESS for an unterminated
1611    header name which must be relexed as normal tokens.
1612
1613    The spelling is NUL-terminated, but it is not guaranteed that this
1614    is the first NUL since embedded NULs are preserved.  */
1615 static void
1616 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1617 {
1618   bool saw_NUL = false;
1619   const uchar *cur;
1620   cppchar_t terminator;
1621   enum cpp_ttype type;
1622
1623   cur = base;
1624   terminator = *cur++;
1625   if (terminator == 'L' || terminator == 'U')
1626     terminator = *cur++;
1627   else if (terminator == 'u')
1628     {
1629       terminator = *cur++;
1630       if (terminator == '8')
1631         terminator = *cur++;
1632     }
1633   if (terminator == 'R')
1634     {
1635       lex_raw_string (pfile, token, base, cur);
1636       return;
1637     }
1638   if (terminator == '"')
1639     type = (*base == 'L' ? CPP_WSTRING :
1640             *base == 'U' ? CPP_STRING32 :
1641             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1642                          : CPP_STRING);
1643   else if (terminator == '\'')
1644     type = (*base == 'L' ? CPP_WCHAR :
1645             *base == 'U' ? CPP_CHAR32 :
1646             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1647   else
1648     terminator = '>', type = CPP_HEADER_NAME;
1649
1650   for (;;)
1651     {
1652       cppchar_t c = *cur++;
1653
1654       /* In #include-style directives, terminators are not escapable.  */
1655       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1656         cur++;
1657       else if (c == terminator)
1658         break;
1659       else if (c == '\n')
1660         {
1661           cur--;
1662           /* Unmatched quotes always yield undefined behavior, but
1663              greedy lexing means that what appears to be an unterminated
1664              header name may actually be a legitimate sequence of tokens.  */
1665           if (terminator == '>')
1666             {
1667               token->type = CPP_LESS;
1668               return;
1669             }
1670           type = CPP_OTHER;
1671           break;
1672         }
1673       else if (c == '\0')
1674         saw_NUL = true;
1675     }
1676
1677   if (saw_NUL && !pfile->state.skipping)
1678     cpp_error (pfile, CPP_DL_WARNING,
1679                "null character(s) preserved in literal");
1680
1681   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1682     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1683                (int) terminator);
1684
1685   if (CPP_OPTION (pfile, user_literals))
1686     {
1687       /* According to C++11 [lex.ext]p10, a ud-suffix not starting with an
1688          underscore is ill-formed.  Since this breaks programs using macros
1689          from inttypes.h, we generate a warning and treat the ud-suffix as a
1690          separate preprocessing token.  This approach is under discussion by
1691          the standards committee, and has been adopted as a conforming
1692          extension by other front ends such as clang. */
1693       if (ISALPHA (*cur))
1694         {
1695           // Raise a warning, but do not consume subsequent tokens.
1696           if (CPP_OPTION (pfile, warn_literal_suffix))
1697             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1698                                    token->src_loc, 0,
1699                                    "invalid suffix on literal; C++11 requires "
1700                                    "a space between literal and identifier");
1701         }
1702       /* Grab user defined literal suffix.  */
1703       else if (*cur == '_')
1704         {
1705           type = cpp_userdef_char_add_type (type);
1706           type = cpp_userdef_string_add_type (type);
1707           ++cur;
1708
1709           while (ISIDNUM (*cur))
1710             ++cur;
1711         }
1712     }
1713
1714   pfile->buffer->cur = cur;
1715   create_literal (pfile, token, base, cur - base, type);
1716 }
1717
1718 /* Return the comment table. The client may not make any assumption
1719    about the ordering of the table.  */
1720 cpp_comment_table *
1721 cpp_get_comments (cpp_reader *pfile)
1722 {
1723   return &pfile->comments;
1724 }
1725
1726 /* Append a comment to the end of the comment table. */
1727 static void
1728 store_comment (cpp_reader *pfile, cpp_token *token)
1729 {
1730   int len;
1731
1732   if (pfile->comments.allocated == 0)
1733     {
1734       pfile->comments.allocated = 256;
1735       pfile->comments.entries = (cpp_comment *) xmalloc
1736         (pfile->comments.allocated * sizeof (cpp_comment));
1737     }
1738
1739   if (pfile->comments.count == pfile->comments.allocated)
1740     {
1741       pfile->comments.allocated *= 2;
1742       pfile->comments.entries = (cpp_comment *) xrealloc
1743         (pfile->comments.entries,
1744          pfile->comments.allocated * sizeof (cpp_comment));
1745     }
1746
1747   len = token->val.str.len;
1748
1749   /* Copy comment. Note, token may not be NULL terminated. */
1750   pfile->comments.entries[pfile->comments.count].comment =
1751     (char *) xmalloc (sizeof (char) * (len + 1));
1752   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1753           token->val.str.text, len);
1754   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1755
1756   /* Set source location. */
1757   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1758
1759   /* Increment the count of entries in the comment table. */
1760   pfile->comments.count++;
1761 }
1762
1763 /* The stored comment includes the comment start and any terminator.  */
1764 static void
1765 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1766               cppchar_t type)
1767 {
1768   unsigned char *buffer;
1769   unsigned int len, clen, i;
1770
1771   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1772
1773   /* C++ comments probably (not definitely) have moved past a new
1774      line, which we don't want to save in the comment.  */
1775   if (is_vspace (pfile->buffer->cur[-1]))
1776     len--;
1777
1778   /* If we are currently in a directive or in argument parsing, then
1779      we need to store all C++ comments as C comments internally, and
1780      so we need to allocate a little extra space in that case.
1781
1782      Note that the only time we encounter a directive here is
1783      when we are saving comments in a "#define".  */
1784   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1785           && type == '/') ? len + 2 : len;
1786
1787   buffer = _cpp_unaligned_alloc (pfile, clen);
1788
1789   token->type = CPP_COMMENT;
1790   token->val.str.len = clen;
1791   token->val.str.text = buffer;
1792
1793   buffer[0] = '/';
1794   memcpy (buffer + 1, from, len - 1);
1795
1796   /* Finish conversion to a C comment, if necessary.  */
1797   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1798     {
1799       buffer[1] = '*';
1800       buffer[clen - 2] = '*';
1801       buffer[clen - 1] = '/';
1802       /* As there can be in a C++ comments illegal sequences for C comments
1803          we need to filter them out.  */
1804       for (i = 2; i < (clen - 2); i++)
1805         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1806           buffer[i] = '|';
1807     }
1808
1809   /* Finally store this comment for use by clients of libcpp. */
1810   store_comment (pfile, token);
1811 }
1812
1813 /* Allocate COUNT tokens for RUN.  */
1814 void
1815 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1816 {
1817   run->base = XNEWVEC (cpp_token, count);
1818   run->limit = run->base + count;
1819   run->next = NULL;
1820 }
1821
1822 /* Returns the next tokenrun, or creates one if there is none.  */
1823 static tokenrun *
1824 next_tokenrun (tokenrun *run)
1825 {
1826   if (run->next == NULL)
1827     {
1828       run->next = XNEW (tokenrun);
1829       run->next->prev = run;
1830       _cpp_init_tokenrun (run->next, 250);
1831     }
1832
1833   return run->next;
1834 }
1835
1836 /* Return the number of not yet processed token in a given
1837    context.  */
1838 int
1839 _cpp_remaining_tokens_num_in_context (cpp_context *context)
1840 {
1841   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1842     return (LAST (context).token - FIRST (context).token);
1843   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1844            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1845     return (LAST (context).ptoken - FIRST (context).ptoken);
1846   else
1847       abort ();
1848 }
1849
1850 /* Returns the token present at index INDEX in a given context.  If
1851    INDEX is zero, the next token to be processed is returned.  */
1852 static const cpp_token*
1853 _cpp_token_from_context_at (cpp_context *context, int index)
1854 {
1855   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1856     return &(FIRST (context).token[index]);
1857   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1858            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1859     return FIRST (context).ptoken[index];
1860  else
1861    abort ();
1862 }
1863
1864 /* Look ahead in the input stream.  */
1865 const cpp_token *
1866 cpp_peek_token (cpp_reader *pfile, int index)
1867 {
1868   cpp_context *context = pfile->context;
1869   const cpp_token *peektok;
1870   int count;
1871
1872   /* First, scan through any pending cpp_context objects.  */
1873   while (context->prev)
1874     {
1875       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
1876
1877       if (index < (int) sz)
1878         return _cpp_token_from_context_at (context, index);
1879       index -= (int) sz;
1880       context = context->prev;
1881     }
1882
1883   /* We will have to read some new tokens after all (and do so
1884      without invalidating preceding tokens).  */
1885   count = index;
1886   pfile->keep_tokens++;
1887
1888   do
1889     {
1890       peektok = _cpp_lex_token (pfile);
1891       if (peektok->type == CPP_EOF)
1892         return peektok;
1893     }
1894   while (index--);
1895
1896   _cpp_backup_tokens_direct (pfile, count + 1);
1897   pfile->keep_tokens--;
1898
1899   return peektok;
1900 }
1901
1902 /* Allocate a single token that is invalidated at the same time as the
1903    rest of the tokens on the line.  Has its line and col set to the
1904    same as the last lexed token, so that diagnostics appear in the
1905    right place.  */
1906 cpp_token *
1907 _cpp_temp_token (cpp_reader *pfile)
1908 {
1909   cpp_token *old, *result;
1910   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
1911   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
1912
1913   old = pfile->cur_token - 1;
1914   /* Any pre-existing lookaheads must not be clobbered.  */
1915   if (la)
1916     {
1917       if (sz <= la)
1918         {
1919           tokenrun *next = next_tokenrun (pfile->cur_run);
1920
1921           if (sz < la)
1922             memmove (next->base + 1, next->base,
1923                      (la - sz) * sizeof (cpp_token));
1924
1925           next->base[0] = pfile->cur_run->limit[-1];
1926         }
1927
1928       if (sz > 1)
1929         memmove (pfile->cur_token + 1, pfile->cur_token,
1930                  MIN (la, sz - 1) * sizeof (cpp_token));
1931     }
1932
1933   if (!sz && pfile->cur_token == pfile->cur_run->limit)
1934     {
1935       pfile->cur_run = next_tokenrun (pfile->cur_run);
1936       pfile->cur_token = pfile->cur_run->base;
1937     }
1938
1939   result = pfile->cur_token++;
1940   result->src_loc = old->src_loc;
1941   return result;
1942 }
1943
1944 /* Lex a token into RESULT (external interface).  Takes care of issues
1945    like directive handling, token lookahead, multiple include
1946    optimization and skipping.  */
1947 const cpp_token *
1948 _cpp_lex_token (cpp_reader *pfile)
1949 {
1950   cpp_token *result;
1951
1952   for (;;)
1953     {
1954       if (pfile->cur_token == pfile->cur_run->limit)
1955         {
1956           pfile->cur_run = next_tokenrun (pfile->cur_run);
1957           pfile->cur_token = pfile->cur_run->base;
1958         }
1959       /* We assume that the current token is somewhere in the current
1960          run.  */
1961       if (pfile->cur_token < pfile->cur_run->base
1962           || pfile->cur_token >= pfile->cur_run->limit)
1963         abort ();
1964
1965       if (pfile->lookaheads)
1966         {
1967           pfile->lookaheads--;
1968           result = pfile->cur_token++;
1969         }
1970       else
1971         result = _cpp_lex_direct (pfile);
1972
1973       if (result->flags & BOL)
1974         {
1975           /* Is this a directive.  If _cpp_handle_directive returns
1976              false, it is an assembler #.  */
1977           if (result->type == CPP_HASH
1978               /* 6.10.3 p 11: Directives in a list of macro arguments
1979                  gives undefined behavior.  This implementation
1980                  handles the directive as normal.  */
1981               && pfile->state.parsing_args != 1)
1982             {
1983               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1984                 {
1985                   if (pfile->directive_result.type == CPP_PADDING)
1986                     continue;
1987                   result = &pfile->directive_result;
1988                 }
1989             }
1990           else if (pfile->state.in_deferred_pragma)
1991             result = &pfile->directive_result;
1992
1993           if (pfile->cb.line_change && !pfile->state.skipping)
1994             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
1995         }
1996
1997       /* We don't skip tokens in directives.  */
1998       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
1999         break;
2000
2001       /* Outside a directive, invalidate controlling macros.  At file
2002          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2003          get here and MI optimization works.  */
2004       pfile->mi_valid = false;
2005
2006       if (!pfile->state.skipping || result->type == CPP_EOF)
2007         break;
2008     }
2009
2010   return result;
2011 }
2012
2013 /* Returns true if a fresh line has been loaded.  */
2014 bool
2015 _cpp_get_fresh_line (cpp_reader *pfile)
2016 {
2017   int return_at_eof;
2018
2019   /* We can't get a new line until we leave the current directive.  */
2020   if (pfile->state.in_directive)
2021     return false;
2022
2023   for (;;)
2024     {
2025       cpp_buffer *buffer = pfile->buffer;
2026
2027       if (!buffer->need_line)
2028         return true;
2029
2030       if (buffer->next_line < buffer->rlimit)
2031         {
2032           _cpp_clean_line (pfile);
2033           return true;
2034         }
2035
2036       /* First, get out of parsing arguments state.  */
2037       if (pfile->state.parsing_args)
2038         return false;
2039
2040       /* End of buffer.  Non-empty files should end in a newline.  */
2041       if (buffer->buf != buffer->rlimit
2042           && buffer->next_line > buffer->rlimit
2043           && !buffer->from_stage3)
2044         {
2045           /* Clip to buffer size.  */
2046           buffer->next_line = buffer->rlimit;
2047         }
2048
2049       return_at_eof = buffer->return_at_eof;
2050       _cpp_pop_buffer (pfile);
2051       if (pfile->buffer == NULL || return_at_eof)
2052         return false;
2053     }
2054 }
2055
/* Set result->type to THEN_TYPE and consume one character if the next
   unread character is CHAR; otherwise set it to ELSE_TYPE and consume
   nothing.  Expects `buffer' and `result' to be in scope at the point
   of use.  Every argument is parenthesized so that any expression can
   be passed safely.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = (ELSE_TYPE);                       \
      if (*buffer->cur == (CHAR))                       \
        {                                               \
          buffer->cur++;                                \
          result->type = (THEN_TYPE);                   \
        }                                               \
    }                                                   \
  while (0)
2064
2065 /* Lex a token into pfile->cur_token, which is also incremented, to
2066    get diagnostics pointing to the correct location.
2067
2068    Does not handle issues such as token lookahead, multiple-include
2069    optimization, directives, skipping etc.  This function is only
2070    suitable for use by _cpp_lex_token, and in special cases like
2071    lex_expansion_token which doesn't care for any of these issues.
2072
2073    When meeting a newline, returns CPP_EOF if parsing a directive,
2074    otherwise returns to the start of the token buffer if permissible
2075    (that is, when pfile->keep_tokens is zero).  Returns a pointer to the lexed token.  */
2076 cpp_token *
2077 _cpp_lex_direct (cpp_reader *pfile)
2078 {
2079   cppchar_t c;
2080   cpp_buffer *buffer;
2081   const unsigned char *comment_start;
2082   cpp_token *result = pfile->cur_token++;  /* Claim the next token slot.  */
2083
2084  fresh_line:  /* Reached on entry and again after every newline.  */
2085   result->flags = 0;
2086   buffer = pfile->buffer;
2087   if (buffer->need_line)
2088     {
2089       if (pfile->state.in_deferred_pragma)
2090         {
2091           result->type = CPP_PRAGMA_EOL;  /* The deferred pragma's line is used up.  */
2092           pfile->state.in_deferred_pragma = false;
2093           if (!pfile->state.pragma_allow_expansion)
2094             pfile->state.prevent_expansion--;
2095           return result;
2096         }
2097       if (!_cpp_get_fresh_line (pfile))  /* No further line could be loaded.  */
2098         {
2099           result->type = CPP_EOF;
2100           if (!pfile->state.in_directive)
2101             {
2102               /* Tell the compiler the line number of the EOF token.  */
2103               result->src_loc = pfile->line_table->highest_line;
2104               result->flags = BOL;
2105             }
2106           return result;
2107         }
2108       if (!pfile->keep_tokens)  /* Earlier tokens need not be kept: restart at the base run.  */
2109         {
2110           pfile->cur_run = &pfile->base_run;
2111           result = pfile->base_run.base;
2112           pfile->cur_token = result + 1;
2113         }
2114       result->flags = BOL;
2115       if (pfile->state.parsing_args == 2)
2116         result->flags |= PREV_WHITE;
2117     }
2118   buffer = pfile->buffer;  /* _cpp_get_fresh_line may have popped to another buffer.  */
2119  update_tokens_line:  /* Re-entered after a comment that is not being saved.  */
2120   result->src_loc = pfile->line_table->highest_line;  /* Provisional; overwritten once C is read.  */
2121
2122  skipped_white:  /* Re-entered after a run of horizontal whitespace.  */
2123   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2124       && !pfile->overlaid_buffer)
2125     {
2126       _cpp_process_line_notes (pfile, false);
2127       result->src_loc = pfile->line_table->highest_line;
2128     }
2129   c = *buffer->cur++;  /* The character that selects the kind of token.  */
2130
2131   if (pfile->forced_token_location_p)
2132     result->src_loc = *pfile->forced_token_location_p;
2133   else
2134     result->src_loc = linemap_position_for_column (pfile->line_table,
2135                                           CPP_BUF_COLUMN (buffer, buffer->cur));
2136
2137   switch (c)
2138     {
2139     case ' ': case '\t': case '\f': case '\v': case '\0':
2140       result->flags |= PREV_WHITE;
2141       skip_whitespace (pfile, c);
2142       goto skipped_white;
2143
2144     case '\n':
2145       if (buffer->cur < buffer->rlimit)
2146         CPP_INCREMENT_LINE (pfile, 0);
2147       buffer->need_line = true;  /* Makes the fresh_line code fetch the next line.  */
2148       goto fresh_line;
2149
2150     case '0': case '1': case '2': case '3': case '4':
2151     case '5': case '6': case '7': case '8': case '9':
2152       {
2153         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2154         result->type = CPP_NUMBER;
2155         lex_number (pfile, &result->val.str, &nst);
2156         warn_about_normalization (pfile, result, &nst);
2157         break;
2158       }
2159
2160     case 'L':
2161     case 'u':
2162     case 'U':
2163     case 'R':
2164       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2165          wide strings or raw strings.  */
2166       if (c == 'L' || CPP_OPTION (pfile, rliterals)
2167           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2168         {
2169           if ((*buffer->cur == '\'' && c != 'R')
2170               || *buffer->cur == '"'
2171               || (*buffer->cur == 'R'
2172                   && c != 'R'
2173                   && buffer->cur[1] == '"'
2174                   && CPP_OPTION (pfile, rliterals))
2175               || (*buffer->cur == '8'
2176                   && c == 'u'
2177                   && (buffer->cur[1] == '"'
2178                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
2179                           && CPP_OPTION (pfile, rliterals)))))
2180             {
2181               lex_string (pfile, result, buffer->cur - 1);
2182               break;
2183             }
2184         }
2185       /* Fall through: no literal follows, so the letter starts an identifier.  */
2186
2187     case '_':
2188     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2189     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2190     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2191     case 's': case 't':           case 'v': case 'w': case 'x':  /* 'u': see above.  */
2192     case 'y': case 'z':
2193     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2194     case 'G': case 'H': case 'I': case 'J': case 'K':  /* 'L': see above.  */
2195     case 'M': case 'N': case 'O': case 'P': case 'Q':  /* 'R': see above.  */
2196     case 'S': case 'T':           case 'V': case 'W': case 'X':  /* 'U': see above.  */
2197     case 'Y': case 'Z':
2198       result->type = CPP_NAME;
2199       {
2200         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2201         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2202                                                 &nst);
2203         warn_about_normalization (pfile, result, &nst);
2204       }
2205
2206       /* Convert named operators to their proper types.  */
2207       if (result->val.node.node->flags & NODE_OPERATOR)
2208         {
2209           result->flags |= NAMED_OP;
2210           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2211         }
2212       break;
2213
2214     case '\'':
2215     case '"':
2216       lex_string (pfile, result, buffer->cur - 1);
2217       break;
2218
2219     case '/':
2220       /* A potential block or line comment.  */
2221       comment_start = buffer->cur;
2222       c = *buffer->cur;
2223
2224       if (c == '*')
2225         {
2226           if (_cpp_skip_block_comment (pfile))
2227             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2228         }
2229       else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
2230                             || cpp_in_system_header (pfile)))
2231         {
2232           /* Warn about comments only if pedantically GNUC89, and not
2233              in system headers.  */
2234           if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
2235               && ! buffer->warned_cplusplus_comments)
2236             {
2237               cpp_error (pfile, CPP_DL_PEDWARN,
2238                          "C++ style comments are not allowed in ISO C90");
2239               cpp_error (pfile, CPP_DL_PEDWARN,
2240                          "(this will be reported only once per input file)");
2241               buffer->warned_cplusplus_comments = 1;
2242             }
2243
2244           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2245             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2246         }
2247       else if (c == '=')
2248         {
2249           buffer->cur++;
2250           result->type = CPP_DIV_EQ;
2251           break;
2252         }
2253       else
2254         {
2255           result->type = CPP_DIV;
2256           break;
2257         }
2258
2259       if (!pfile->state.save_comments)
2260         {
2261           result->flags |= PREV_WHITE;  /* A dropped comment counts as whitespace.  */
2262           goto update_tokens_line;
2263         }
2264
2265       /* Save the comment as a token in its own right.  */
2266       save_comment (pfile, result, comment_start, c);  /* C is '*' or '/' here.  */
2267       break;
2268
2269     case '<':
2270       if (pfile->state.angled_headers)
2271         {
2272           lex_string (pfile, result, buffer->cur - 1);
2273           if (result->type != CPP_LESS)  /* Presumably lex_string leaves CPP_LESS when it finds no header-name.  */
2274             break;
2275         }
2276
2277       result->type = CPP_LESS;
2278       if (*buffer->cur == '=')
2279         buffer->cur++, result->type = CPP_LESS_EQ;
2280       else if (*buffer->cur == '<')
2281         {
2282           buffer->cur++;
2283           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2284         }
2285       else if (CPP_OPTION (pfile, digraphs))
2286         {
2287           if (*buffer->cur == ':')
2288             {
2289               buffer->cur++;
2290               result->flags |= DIGRAPH;
2291               result->type = CPP_OPEN_SQUARE;  /* "<:"  */
2292             }
2293           else if (*buffer->cur == '%')
2294             {
2295               buffer->cur++;
2296               result->flags |= DIGRAPH;
2297               result->type = CPP_OPEN_BRACE;  /* "<%"  */
2298             }
2299         }
2300       break;
2301
2302     case '>':
2303       result->type = CPP_GREATER;
2304       if (*buffer->cur == '=')
2305         buffer->cur++, result->type = CPP_GREATER_EQ;
2306       else if (*buffer->cur == '>')
2307         {
2308           buffer->cur++;
2309           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2310         }
2311       break;
2312
2313     case '%':
2314       result->type = CPP_MOD;
2315       if (*buffer->cur == '=')
2316         buffer->cur++, result->type = CPP_MOD_EQ;
2317       else if (CPP_OPTION (pfile, digraphs))
2318         {
2319           if (*buffer->cur == ':')
2320             {
2321               buffer->cur++;
2322               result->flags |= DIGRAPH;
2323               result->type = CPP_HASH;  /* "%:"; "%:%:" is checked next.  */
2324               if (*buffer->cur == '%' && buffer->cur[1] == ':')
2325                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2326             }
2327           else if (*buffer->cur == '>')
2328             {
2329               buffer->cur++;
2330               result->flags |= DIGRAPH;
2331               result->type = CPP_CLOSE_BRACE;  /* "%>"  */
2332             }
2333         }
2334       break;
2335
2336     case '.':
2337       result->type = CPP_DOT;
2338       if (ISDIGIT (*buffer->cur))
2339         {
2340           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2341           result->type = CPP_NUMBER;
2342           lex_number (pfile, &result->val.str, &nst);
2343           warn_about_normalization (pfile, result, &nst);
2344         }
2345       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
2346         buffer->cur += 2, result->type = CPP_ELLIPSIS;
2347       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2348         buffer->cur++, result->type = CPP_DOT_STAR;
2349       break;
2350
2351     case '+':
2352       result->type = CPP_PLUS;
2353       if (*buffer->cur == '+')
2354         buffer->cur++, result->type = CPP_PLUS_PLUS;
2355       else if (*buffer->cur == '=')
2356         buffer->cur++, result->type = CPP_PLUS_EQ;
2357       break;
2358
2359     case '-':
2360       result->type = CPP_MINUS;
2361       if (*buffer->cur == '>')
2362         {
2363           buffer->cur++;
2364           result->type = CPP_DEREF;
2365           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2366             buffer->cur++, result->type = CPP_DEREF_STAR;
2367         }
2368       else if (*buffer->cur == '-')
2369         buffer->cur++, result->type = CPP_MINUS_MINUS;
2370       else if (*buffer->cur == '=')
2371         buffer->cur++, result->type = CPP_MINUS_EQ;
2372       break;
2373
2374     case '&':
2375       result->type = CPP_AND;
2376       if (*buffer->cur == '&')
2377         buffer->cur++, result->type = CPP_AND_AND;
2378       else if (*buffer->cur == '=')
2379         buffer->cur++, result->type = CPP_AND_EQ;
2380       break;
2381
2382     case '|':
2383       result->type = CPP_OR;
2384       if (*buffer->cur == '|')
2385         buffer->cur++, result->type = CPP_OR_OR;
2386       else if (*buffer->cur == '=')
2387         buffer->cur++, result->type = CPP_OR_EQ;
2388       break;
2389
2390     case ':':
2391       result->type = CPP_COLON;
2392       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
2393         buffer->cur++, result->type = CPP_SCOPE;
2394       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
2395         {
2396           buffer->cur++;
2397           result->flags |= DIGRAPH;
2398           result->type = CPP_CLOSE_SQUARE;  /* ":>"  */
2399         }
2400       break;
2401
2402     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
2403     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
2404     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
2405     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
2406     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
2407
2408     case '?': result->type = CPP_QUERY; break;
2409     case '~': result->type = CPP_COMPL; break;
2410     case ',': result->type = CPP_COMMA; break;
2411     case '(': result->type = CPP_OPEN_PAREN; break;
2412     case ')': result->type = CPP_CLOSE_PAREN; break;
2413     case '[': result->type = CPP_OPEN_SQUARE; break;
2414     case ']': result->type = CPP_CLOSE_SQUARE; break;
2415     case '{': result->type = CPP_OPEN_BRACE; break;
2416     case '}': result->type = CPP_CLOSE_BRACE; break;
2417     case ';': result->type = CPP_SEMICOLON; break;
2418
2419       /* @ is a punctuator in Objective-C.  */
2420     case '@': result->type = CPP_ATSIGN; break;
2421
2422     case '$':
2423     case '\\':
2424       {
2425         const uchar *base = --buffer->cur;  /* Step back onto C.  */
2426         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2427
2428         if (forms_identifier_p (pfile, true, &nst))
2429           {
2430             result->type = CPP_NAME;
2431             result->val.node.node = lex_identifier (pfile, base, true, &nst);
2432             warn_about_normalization (pfile, result, &nst);
2433             break;
2434           }
2435         buffer->cur++;  /* No identifier starts here: consume C again.  */
2436       }
2437       /* Fall through.  */
2438     default:
2439       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
2440       break;
2441     }
2442
2443   return result;
2444 }
2445
2446 /* An upper bound on the number of bytes needed to spell TOKEN.
2447    Does not include preceding whitespace.  */
2448 unsigned int
2449 cpp_token_len (const cpp_token *token)
2450 {
2451   unsigned int len;
2452
2453   switch (TOKEN_SPELL (token))
2454     {
2455     default:            len = 6;                                break;
2456     case SPELL_LITERAL: len = token->val.str.len;               break;
2457     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2458     }
2459
2460   return len;
2461 }
2462
/* Decode the UTF-8 sequence starting at NAME and write the matching
   \U escape to BUFFER.  Exactly 10 bytes are written: a backslash, 'U'
   and eight lower-case hexadecimal digits, with no terminating NUL.
   Returns the number of bytes of NAME that were consumed.  Aborts if a
   continuation byte is ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char *p = name;
  unsigned long code_point;
  unsigned lead;
  int seq_len = 0;
  int k;
  int shift;

  /* Each leading one bit of the first byte stands for one byte of the
     sequence.  */
  for (lead = *p; (lead & 0x80) != 0; lead <<= 1)
    seq_len++;

  /* The bits of the first byte below the length marker carry data.  */
  code_point = *p & (0x7F >> seq_len);

  /* Every further byte contributes its six low bits.  */
  for (k = 1; k < seq_len; k++)
    {
      p++;
      code_point = (code_point << 6) | (*p & 0x3F);

      /* A continuation byte must look like 10xxxxxx.  */
      if ((*p & 0xC0) != 0x80)
        abort ();
    }

  *buffer++ = '\\';
  *buffer++ = 'U';
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
2496
2497 /* Given a token TYPE corresponding to a digraph, return a pointer to
2498    the spelling of the digraph.  */
2499 static const unsigned char *
2500 cpp_digraph2name (enum cpp_ttype type)
2501 {
2502   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2503 }
2504
2505 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2506    already contain the enough space to hold the token's spelling.
2507    Returns a pointer to the character after the last character written.
2508    FORSTRING is true if this is to be the spelling after translation
2509    phase 1 (this is different for UCNs).
2510    FIXME: Would be nice if we didn't need the PFILE argument.  */
2511 unsigned char *
2512 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2513                  unsigned char *buffer, bool forstring)
2514 {
2515   switch (TOKEN_SPELL (token))
2516     {
2517     case SPELL_OPERATOR:
2518       {
2519         const unsigned char *spelling;
2520         unsigned char c;
2521
2522         if (token->flags & DIGRAPH)
2523           spelling = cpp_digraph2name (token->type);
2524         else if (token->flags & NAMED_OP)
2525           goto spell_ident;
2526         else
2527           spelling = TOKEN_NAME (token);
2528
2529         while ((c = *spelling++) != '\0')
2530           *buffer++ = c;
2531       }
2532       break;
2533
2534     spell_ident:
2535     case SPELL_IDENT:
2536       if (forstring)
2537         {
2538           memcpy (buffer, NODE_NAME (token->val.node.node),
2539                   NODE_LEN (token->val.node.node));
2540           buffer += NODE_LEN (token->val.node.node);
2541         }
2542       else
2543         {
2544           size_t i;
2545           const unsigned char * name = NODE_NAME (token->val.node.node);
2546
2547           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2548             if (name[i] & ~0x7F)
2549               {
2550                 i += utf8_to_ucn (buffer, name + i) - 1;
2551                 buffer += 10;
2552               }
2553             else
2554               *buffer++ = NODE_NAME (token->val.node.node)[i];
2555         }
2556       break;
2557
2558     case SPELL_LITERAL:
2559       memcpy (buffer, token->val.str.text, token->val.str.len);
2560       buffer += token->val.str.len;
2561       break;
2562
2563     case SPELL_NONE:
2564       cpp_error (pfile, CPP_DL_ICE,
2565                  "unspellable token %s", TOKEN_NAME (token));
2566       break;
2567     }
2568
2569   return buffer;
2570 }
2571
2572 /* Returns TOKEN spelt as a null-terminated string.  The string is
2573    freed when the reader is destroyed.  Useful for diagnostics.  */
2574 unsigned char *
2575 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2576 {
2577   unsigned int len = cpp_token_len (token) + 1;
2578   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2579
2580   end = cpp_spell_token (pfile, token, start, false);
2581   end[0] = '\0';
2582
2583   return start;
2584 }
2585
2586 /* Returns a pointer to a string which spells the token defined by
2587    TYPE and FLAGS.  Used by C front ends, which really should move to
2588    using cpp_token_as_text.  */
2589 const char *
2590 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2591 {
2592   if (flags & DIGRAPH)
2593     return (const char *) cpp_digraph2name (type);
2594   else if (flags & NAMED_OP)
2595     return cpp_named_operator2name (type);
2596
2597   return (const char *) token_spellings[type].name;
2598 }
2599
2600 /* Writes the spelling of token to FP, without any preceding space.
2601    Separated from cpp_spell_token for efficiency - to avoid stdio
2602    double-buffering.  */
2603 void
2604 cpp_output_token (const cpp_token *token, FILE *fp)
2605 {
2606   switch (TOKEN_SPELL (token))
2607     {
2608     case SPELL_OPERATOR:
2609       {
2610         const unsigned char *spelling;
2611         int c;
2612
2613         if (token->flags & DIGRAPH)
2614           spelling = cpp_digraph2name (token->type);
2615         else if (token->flags & NAMED_OP)
2616           goto spell_ident;
2617         else
2618           spelling = TOKEN_NAME (token);
2619
2620         c = *spelling;
2621         do
2622           putc (c, fp);
2623         while ((c = *++spelling) != '\0');
2624       }
2625       break;
2626
2627     spell_ident:
2628     case SPELL_IDENT:
2629       {
2630         size_t i;
2631         const unsigned char * name = NODE_NAME (token->val.node.node);
2632
2633         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2634           if (name[i] & ~0x7F)
2635             {
2636               unsigned char buffer[10];
2637               i += utf8_to_ucn (buffer, name + i) - 1;
2638               fwrite (buffer, 1, 10, fp);
2639             }
2640           else
2641             fputc (NODE_NAME (token->val.node.node)[i], fp);
2642       }
2643       break;
2644
2645     case SPELL_LITERAL:
2646       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2647       break;
2648
2649     case SPELL_NONE:
2650       /* An error, most probably.  */
2651       break;
2652     }
2653 }
2654
2655 /* Compare two tokens.  */
2656 int
2657 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2658 {
2659   if (a->type == b->type && a->flags == b->flags)
2660     switch (TOKEN_SPELL (a))
2661       {
2662       default:                  /* Keep compiler happy.  */
2663       case SPELL_OPERATOR:
2664         /* token_no is used to track where multiple consecutive ##
2665            tokens were originally located.  */
2666         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2667       case SPELL_NONE:
2668         return (a->type != CPP_MACRO_ARG
2669                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2670       case SPELL_IDENT:
2671         return a->val.node.node == b->val.node.node;
2672       case SPELL_LITERAL:
2673         return (a->val.str.len == b->val.str.len
2674                 && !memcmp (a->val.str.text, b->val.str.text,
2675                             a->val.str.len));
2676       }
2677
2678   return 0;
2679 }
2680
2681 /* Returns nonzero if a space should be inserted to avoid an
2682    accidental token paste for output.  For simplicity, it is
2683    conservative, and occasionally advises a space where one is not
2684    needed, e.g. "." and ".2".  */
2685 int
2686 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2687                  const cpp_token *token2)
2688 {
2689   enum cpp_ttype a = token1->type, b = token2->type;
2690   cppchar_t c;
2691
2692   if (token1->flags & NAMED_OP)
2693     a = CPP_NAME;
2694   if (token2->flags & NAMED_OP)
2695     b = CPP_NAME;
2696
2697   c = EOF;
2698   if (token2->flags & DIGRAPH)
2699     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2700   else if (token_spellings[b].category == SPELL_OPERATOR)
2701     c = token_spellings[b].name[0];
2702
2703   /* Quickly get everything that can paste with an '='.  */
2704   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2705     return 1;
2706
2707   switch (a)
2708     {
2709     case CPP_GREATER:   return c == '>';
2710     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2711     case CPP_PLUS:      return c == '+';
2712     case CPP_MINUS:     return c == '-' || c == '>';
2713     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2714     case CPP_MOD:       return c == ':' || c == '>';
2715     case CPP_AND:       return c == '&';
2716     case CPP_OR:        return c == '|';
2717     case CPP_COLON:     return c == ':' || c == '>';
2718     case CPP_DEREF:     return c == '*';
2719     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2720     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2721     case CPP_NAME:      return ((b == CPP_NUMBER
2722                                  && name_p (pfile, &token2->val.str))
2723                                 || b == CPP_NAME
2724                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2725     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2726                                 || c == '.' || c == '+' || c == '-');
2727                                       /* UCNs */
2728     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2729                                  && b == CPP_NAME)
2730                                 || (CPP_OPTION (pfile, objc)
2731                                     && token1->val.str.text[0] == '@'
2732                                     && (b == CPP_NAME || b == CPP_STRING)));
2733     default:            break;
2734     }
2735
2736   return 0;
2737 }
2738
2739 /* Output all the remaining tokens on the current line, and a newline
2740    character, to FP.  Leading whitespace is removed.  If there are
2741    macros, special token padding is not performed.  */
2742 void
2743 cpp_output_line (cpp_reader *pfile, FILE *fp)
2744 {
2745   const cpp_token *token;
2746
2747   token = cpp_get_token (pfile);
2748   while (token->type != CPP_EOF)
2749     {
2750       cpp_output_token (token, fp);
2751       token = cpp_get_token (pfile);
2752       if (token->flags & PREV_WHITE)
2753         putc (' ', fp);
2754     }
2755
2756   putc ('\n', fp);
2757 }
2758
2759 /* Return a string representation of all the remaining tokens on the
2760    current line.  The result is allocated using xmalloc and must be
2761    freed by the caller.  */
2762 unsigned char *
2763 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2764 {
2765   const cpp_token *token;
2766   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2767   unsigned int alloced = 120 + out;
2768   unsigned char *result = (unsigned char *) xmalloc (alloced);
2769
2770   /* If DIR_NAME is empty, there are no initial contents.  */
2771   if (dir_name)
2772     {
2773       sprintf ((char *) result, "#%s ", dir_name);
2774       out += 2;
2775     }
2776
2777   token = cpp_get_token (pfile);
2778   while (token->type != CPP_EOF)
2779     {
2780       unsigned char *last;
2781       /* Include room for a possible space and the terminating nul.  */
2782       unsigned int len = cpp_token_len (token) + 2;
2783
2784       if (out + len > alloced)
2785         {
2786           alloced *= 2;
2787           if (out + len > alloced)
2788             alloced = out + len;
2789           result = (unsigned char *) xrealloc (result, alloced);
2790         }
2791
2792       last = cpp_spell_token (pfile, token, &result[out], 0);
2793       out = last - result;
2794
2795       token = cpp_get_token (pfile);
2796       if (token->flags & PREV_WHITE)
2797         result[out++] = ' ';
2798     }
2799
2800   result[out] = '\0';
2801   return result;
2802 }
2803
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest payload size new_buff allocates.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free-list buffer
   that _cpp_get_buff will reuse for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room still left in BUFF.
   Every macro argument is parenthesized in the expansion so that any
   expression may be passed.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2818
2819 /* Create a new allocation buffer.  Place the control block at the end
2820    of the buffer, so that buffer overflows will cause immediate chaos.  */
2821 static _cpp_buff *
2822 new_buff (size_t len)
2823 {
2824   _cpp_buff *result;
2825   unsigned char *base;
2826
2827   if (len < MIN_BUFF_SIZE)
2828     len = MIN_BUFF_SIZE;
2829   len = CPP_ALIGN (len);
2830
2831   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2832   result = (_cpp_buff *) (base + len);
2833   result->base = base;
2834   result->cur = base;
2835   result->limit = base + len;
2836   result->next = NULL;
2837   return result;
2838 }
2839
2840 /* Place a chain of unwanted allocation buffers on the free list.  */
2841 void
2842 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2843 {
2844   _cpp_buff *end = buff;
2845
2846   while (end->next)
2847     end = end->next;
2848   end->next = pfile->free_buffs;
2849   pfile->free_buffs = buff;
2850 }
2851
2852 /* Return a free buffer of size at least MIN_SIZE.  */
2853 _cpp_buff *
2854 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2855 {
2856   _cpp_buff *result, **p;
2857
2858   for (p = &pfile->free_buffs;; p = &(*p)->next)
2859     {
2860       size_t size;
2861
2862       if (*p == NULL)
2863         return new_buff (min_size);
2864       result = *p;
2865       size = result->limit - result->base;
2866       /* Return a buffer that's big enough, but don't waste one that's
2867          way too big.  */
2868       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2869         break;
2870     }
2871
2872   *p = result->next;
2873   result->next = NULL;
2874   result->cur = result->base;
2875   return result;
2876 }
2877
2878 /* Creates a new buffer with enough space to hold the uncommitted
2879    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
2880    the excess bytes to the new buffer.  Chains the new buffer after
2881    BUFF, and returns the new buffer.  */
2882 _cpp_buff *
2883 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2884 {
2885   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2886   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2887
2888   buff->next = new_buff;
2889   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2890   return new_buff;
2891 }
2892
2893 /* Creates a new buffer with enough space to hold the uncommitted
2894    remaining bytes of the buffer pointed to by BUFF, and at least
2895    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
2896    Chains the new buffer before the buffer pointed to by BUFF, and
2897    updates the pointer to point to the new buffer.  */
2898 void
2899 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
2900 {
2901   _cpp_buff *new_buff, *old_buff = *pbuff;
2902   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2903
2904   new_buff = _cpp_get_buff (pfile, size);
2905   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2906   new_buff->next = old_buff;
2907   *pbuff = new_buff;
2908 }
2909
2910 /* Free a chain of buffers starting at BUFF.  */
2911 void
2912 _cpp_free_buff (_cpp_buff *buff)
2913 {
2914   _cpp_buff *next;
2915
2916   for (; buff; buff = next)
2917     {
2918       next = buff->next;
2919       free (buff->base);
2920     }
2921 }
2922
2923 /* Allocate permanent, unaligned storage of length LEN.  */
2924 unsigned char *
2925 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
2926 {
2927   _cpp_buff *buff = pfile->u_buff;
2928   unsigned char *result = buff->cur;
2929
2930   if (len > (size_t) (buff->limit - result))
2931     {
2932       buff = _cpp_get_buff (pfile, len);
2933       buff->next = pfile->u_buff;
2934       pfile->u_buff = buff;
2935       result = buff->cur;
2936     }
2937
2938   buff->cur = result + len;
2939   return result;
2940 }
2941
2942 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2943    That buffer is used for growing allocations when saving macro
2944    replacement lists in a #define, and when parsing an answer to an
2945    assertion in #assert, #unassert or #if (and therefore possibly
2946    whilst expanding macros).  It therefore must not be used by any
2947    code that they might call: specifically the lexer and the guts of
2948    the macro expander.
2949
2950    All existing other uses clearly fit this restriction: storing
2951    registered pragmas during initialization.  */
2952 unsigned char *
2953 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
2954 {
2955   _cpp_buff *buff = pfile->a_buff;
2956   unsigned char *result = buff->cur;
2957
2958   if (len > (size_t) (buff->limit - result))
2959     {
2960       buff = _cpp_get_buff (pfile, len);
2961       buff->next = pfile->a_buff;
2962       pfile->a_buff = buff;
2963       result = buff->cur;
2964     }
2965
2966   buff->cur = result + len;
2967   return result;
2968 }
2969
2970 /* Say which field of TOK is in use.  */
2971
2972 enum cpp_token_fld_kind
2973 cpp_token_val_index (cpp_token *tok)
2974 {
2975   switch (TOKEN_SPELL (tok))
2976     {
2977     case SPELL_IDENT:
2978       return CPP_TOKEN_FLD_NODE;
2979     case SPELL_LITERAL:
2980       return CPP_TOKEN_FLD_STR;
2981     case SPELL_OPERATOR:
2982       if (tok->type == CPP_PASTE)
2983         return CPP_TOKEN_FLD_TOKEN_NO;
2984       else
2985         return CPP_TOKEN_FLD_NONE;
2986     case SPELL_NONE:
2987       if (tok->type == CPP_MACRO_ARG)
2988         return CPP_TOKEN_FLD_ARG_NO;
2989       else if (tok->type == CPP_PADDING)
2990         return CPP_TOKEN_FLD_SOURCE;
2991       else if (tok->type == CPP_PRAGMA)
2992         return CPP_TOKEN_FLD_PRAGMA;
2993       /* else fall through */
2994     default:
2995       return CPP_TOKEN_FLD_NONE;
2996     }
2997 }
2998
2999 /* All tokens lexed in R after calling this function will be forced to have
3000    their source_location the same as the location referenced by P, until
3001    cpp_stop_forcing_token_locations is called for R.  */
3002
3003 void
3004 cpp_force_token_locations (cpp_reader *r, source_location *p)
3005 {
3006   r->forced_token_location_p = p;
3007 }
3008
3009 /* Go back to assigning locations naturally for lexed tokens.  */
3010
3011 void
3012 cpp_stop_forcing_token_locations (cpp_reader *r)
3013 {
3014   r->forced_token_location_p = NULL;
3015 }