]> gcc.gnu.org Git - gcc.git/blame - gcc/d/dmd/lexer.d
d: Merge upstream dmd d579c467c1, phobos 88aa69b14.
[gcc.git] / gcc / d / dmd / lexer.d
CommitLineData
5fee5ec3
IB
1/**
2 * Implements the lexical analyzer, which converts source code into lexical tokens.
3 *
4 * Specification: $(LINK2 https://dlang.org/spec/lex.html, Lexical)
5 *
c43b5909
IB
6 * Copyright: Copyright (C) 1999-2022 by The D Language Foundation, All Rights Reserved
7 * Authors: $(LINK2 https://www.digitalmars.com, Walter Bright)
8 * License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
5fee5ec3
IB
9 * Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/lexer.d, _lexer.d)
10 * Documentation: https://dlang.org/phobos/dmd_lexer.html
11 * Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/lexer.d
12 */
13
14module dmd.lexer;
15
16import core.stdc.ctype;
17import core.stdc.errno;
18import core.stdc.stdarg;
19import core.stdc.stdio;
20import core.stdc.stdlib : getenv;
21import core.stdc.string;
22import core.stdc.time;
23
24import dmd.entity;
25import dmd.errors;
26import dmd.globals;
27import dmd.id;
28import dmd.identifier;
0fb57034 29import dmd.root.array;
5fee5ec3 30import dmd.root.ctfloat;
0fb57034 31import dmd.common.outbuffer;
5fee5ec3
IB
32import dmd.root.port;
33import dmd.root.rmem;
34import dmd.root.string;
c43b5909 35import dmd.root.utf;
5fee5ec3 36import dmd.tokens;
5fee5ec3
IB
37import dmd.utils;
38
39nothrow:
40
5fee5ec3
IB
41version (DMDLIB)
42{
43 version = LocOffset;
44}
45
46/***********************************************************
47 */
48class Lexer
49{
50 private __gshared OutBuffer stringbuffer;
51
52 Loc scanloc; // for error messages
53 Loc prevloc; // location of token before current
54
55 const(char)* p; // current character
56
57 Token token;
58
59 // For ImportC
60 bool Ccompile; /// true if compiling ImportC
61
62 // The following are valid only if (Ccompile == true)
1027dc45
IB
63 ubyte boolsize; /// size of a C _Bool, default 1
64 ubyte shortsize; /// size of a C short, default 2
65 ubyte intsize; /// size of a C int, default 4
5fee5ec3 66 ubyte longsize; /// size of C long, 4 or 8
1027dc45 67 ubyte long_longsize; /// size of a C long long, default 8
5fee5ec3
IB
68 ubyte long_doublesize; /// size of C long double, 8 or D real.sizeof
69 ubyte wchar_tsize; /// size of C wchar_t, 2 or 4
70
71 private
72 {
73 const(char)* base; // pointer to start of buffer
74 const(char)* end; // pointer to last element of buffer
75 const(char)* line; // start of current line
76
77 bool doDocComment; // collect doc comment information
78 bool anyToken; // seen at least one token
79 bool commentToken; // comments are TOK.comment's
235d5a96 80 bool tokenizeNewlines; // newlines are turned into TOK.endOfLine's
8977f4be
IB
81
82 version (DMDLIB)
83 {
84 bool whitespaceToken; // tokenize whitespaces
85 }
86
5fee5ec3
IB
87 int inTokenStringConstant; // can be larger than 1 when in nested q{} strings
88 int lastDocLine; // last line of previous doc comment
89
90 Token* tokenFreelist;
91 }
92
93 nothrow:
94
95 /*********************
96 * Creates a Lexer for the source code base[begoffset..endoffset+1].
97 * The last character, base[endoffset], must be null (0) or EOF (0x1A).
98 *
99 * Params:
100 * filename = used for error messages
101 * base = source code, must be terminated by a null (0) or EOF (0x1A) character
102 * begoffset = starting offset into base[]
103 * endoffset = the last offset to read into base[]
104 * doDocComment = handle documentation comments
105 * commentToken = comments become TOK.comment's
106 */
this(const(char)* filename, const(char)* base, size_t begoffset,
    size_t endoffset, bool doDocComment, bool commentToken) pure
{
    scanloc = Loc(filename, 1, 1);
    token = Token.init;

    // Buffer window: `p` is the read cursor, `line` the start of the current line.
    this.base = base;
    this.end = base + endoffset;
    this.p = base + begoffset;
    this.line = this.p;

    this.doDocComment = doDocComment;
    this.commentToken = commentToken;
    this.tokenizeNewlines = false;
    this.inTokenStringConstant = 0;
    this.lastDocLine = 0;

    // A first line starting with "#!" is ignored entirely.
    const hasShebang = p && p[0] == '#' && p[1] == '!';
    if (!hasShebang)
        return;

    p += 2;
    // Stop either just past the newline, or on the 0 / 0x1A terminator
    // (which is left unconsumed).
    while (*p != 0 && *p != 0x1A)
    {
        if (*p++ == '\n')
            break;
    }
    endOfLine();
}
148
610d7898
IB
149 /******************
150 * Used for unittests for a mock Lexer
151 */
this()
{
    // Intentionally empty: every field keeps its default initializer.
}
153
154 /**************************************
155 * Reset lexer to lex #define's
156 */
final void resetDefineLines(const(char)[] slice)
{
    // Re-point the scanner at `slice`; the byte following it must be 0.
    const(char)* first = slice.ptr;
    base = first;
    end = first + slice.length;
    assert(*end == 0);
    p = first;
    line = first;

    // Newlines terminate a #define, so they must surface as tokens.
    tokenizeNewlines = true;
    inTokenStringConstant = 0;
    lastDocLine = 0;
    scanloc = Loc("#defines", 1, 1);
}
169
170 /**********************************
171 * Set up for next #define line.
172 * p should be at start of next line.
173 */
final void nextDefineLine()
{
    // Re-arm newline tokenization; scan() clears the flag each time it
    // returns a TOK.endOfLine.
    this.tokenizeNewlines = true;
}
178
8977f4be
IB
version (DMDLIB)
{
    /// Same as the six-argument constructor, additionally selecting
    /// whether whitespace is returned as TOK.whitespace tokens.
    this(const(char)* filename, const(char)* base, size_t begoffset, size_t endoffset,
        bool doDocComment, bool commentToken, bool whitespaceToken)
    {
        this(filename, base, begoffset, endoffset, doDocComment, commentToken);
        this.whitespaceToken = whitespaceToken;
    }

    /// Range primitive: true once the current token is TOK.endOfFile.
    bool empty() const pure @property @nogc @safe
    {
        return token.value == TOK.endOfFile;
    }

    /// Range primitive: kind of the current token.
    TOK front() const pure @property @nogc @safe
    {
        return token.value;
    }

    /// Range primitive: advance to the next token.
    void popFront()
    {
        nextToken();
    }
}
203
5fee5ec3
IB
204 /// Returns: a newly allocated `Token`.
Token* allocateToken() pure nothrow @safe
{
    // Nothing to recycle: allocate a fresh token.
    if (tokenFreelist is null)
        return new Token();

    // Pop the head of the freelist and detach it from the chain.
    Token* recycled = tokenFreelist;
    tokenFreelist = recycled.next;
    recycled.next = null;
    return recycled;
}
216
217 /// Frees the given token by returning it to the freelist.
private void releaseToken(Token* token) pure nothrow @nogc @safe
{
    // When the GC is enabled, reset the token to its initial state
    // before it is parked on the freelist.
    if (mem.isGCEnabled)
    {
        *token = Token.init;
    }

    // Push onto the head of the freelist.
    token.next = tokenFreelist;
    tokenFreelist = token;
}
225
final TOK nextToken()
{
    prevloc = token.loc;

    Token* lookahead = token.next;
    if (lookahead is null)
    {
        // No peeked token pending: lex a new one in place.
        scan(&token);
    }
    else
    {
        // Promote the already-peeked token and recycle its storage.
        memcpy(&token, lookahead, Token.sizeof);
        releaseToken(lookahead);
    }
    return token.value;
}
242
243 /***********************
244 * Look ahead at next token's value.
245 */
final TOK peekNext()
{
    Token* ahead = peek(&token);
    return ahead.value;
}
250
251 /***********************
252 * Look 2 tokens ahead at value.
253 */
final TOK peekNext2()
{
    // Two chained peeks: the token after the token after the current one.
    return peek(peek(&token)).value;
}
259
260 /****************************
261 * Turn next token in buffer into a token.
235d5a96
IB
262 * Params:
263 * t = the token to set the resulting Token to
5fee5ec3
IB
264 */
final void scan(Token* t)
{
    // Line of the previous token and start of the current comment; both
    // feed the doc-comment association in getDocComment().
    const lastLine = scanloc.linnum;
    Loc startLoc;
    t.blockComment = null;
    t.lineComment = null;

    // Each iteration either returns a token or `continue`s after skipping
    // white space, a comment, or an invalid character.
    while (1)
    {
        t.ptr = p;
        t.loc = loc();
        switch (*p)
        {
        case 0:
        case 0x1A:
            t.value = TOK.endOfFile; // end of file
            // Intentionally not advancing `p`, such that subsequent calls keep returning TOK.endOfFile.
            return;
        case ' ':
            // Skip 4 spaces at a time after aligning 'p' to a 4-byte boundary.
            while ((cast(size_t)p) % uint.sizeof)
            {
                if (*p != ' ')
                    goto LendSkipFourSpaces;
                p++;
            }
            while (*(cast(uint*)p) == 0x20202020) // ' ' == 0x20
                p += 4;
            // Skip over any remaining space on the line.
            while (*p == ' ')
                p++;
        LendSkipFourSpaces:
            version (DMDLIB)
            {
                if (whitespaceToken)
                {
                    t.value = TOK.whitespace;
                    return;
                }
            }
            continue; // skip white space
        case '\t':
        case '\v':
        case '\f':
            p++;
            version (DMDLIB)
            {
                if (whitespaceToken)
                {
                    t.value = TOK.whitespace;
                    return;
                }
            }
            continue; // skip white space
        case '\r':
            p++;
            if (*p != '\n') // if CR stands by itself
            {
                endOfLine();
                if (tokenizeNewlines)
                {
                    t.value = TOK.endOfLine;
                    tokenizeNewlines = false;
                    return;
                }
            }
            version (DMDLIB)
            {
                if (whitespaceToken)
                {
                    t.value = TOK.whitespace;
                    return;
                }
            }
            continue; // skip white space
        case '\n':
            p++;
            endOfLine();
            if (tokenizeNewlines)
            {
                t.value = TOK.endOfLine;
                tokenizeNewlines = false;
                return;
            }
            version (DMDLIB)
            {
                if (whitespaceToken)
                {
                    t.value = TOK.whitespace;
                    return;
                }
            }
            continue; // skip white space
        case '0':
            if (!isZeroSecond(p[1])) // if numeric literal does not continue
            {
                ++p;
                t.unsvalue = 0;
                t.value = TOK.int32Literal;
                return;
            }
            goto Lnumber;

        case '1': .. case '9':
            if (!isDigitSecond(p[1])) // if numeric literal does not continue
            {
                t.unsvalue = *p - '0';
                ++p;
                t.value = TOK.int32Literal;
                return;
            }
        Lnumber:
            t.value = number(t);
            return;

        case '\'':
            if (issinglechar(p[1]) && p[2] == '\'')
            {
                t.unsvalue = p[1]; // simple one character literal
                t.value = TOK.charLiteral;
                p += 3;
            }
            else if (Ccompile)
            {
                clexerCharConstant(*t, 0);
            }
            else
            {
                t.value = charConstant(t);
            }
            return;

        case 'u':
        case 'U':
        case 'L':
            // In D these are ordinary identifier characters; only ImportC
            // treats them as character/string literal prefixes.
            if (!Ccompile)
                goto case_ident;
            if (p[1] == '\'') // C wide character constant
            {
                char c = *p;
                // Convert L to u or U according to the size of wchar_t.
                // This matches the string-literal mapping below, where a
                // 2-byte wchar_t pairs with 'u' ('w' postfix) and a 4-byte
                // one with 'U' ('d' postfix).
                if (c == 'L')
                    c = (wchar_tsize == 4) ? 'U' : 'u';
                ++p;
                clexerCharConstant(*t, c);
                return;
            }
            else if (p[1] == '\"') // C wide string literal
            {
                const c = *p;
                ++p;
                escapeStringConstant(t);
                t.postfix = c == 'L' ? (wchar_tsize == 2 ? 'w' : 'd') :
                            c == 'u' ? 'w' :
                            'd';
                return;
            }
            else if (p[1] == '8' && p[2] == '\"') // C UTF-8 string literal
            {
                p += 2;
                escapeStringConstant(t);
                return;
            }
            goto case_ident;

        case 'r':
            if (Ccompile || p[1] != '"')
                goto case_ident;
            p++; // step onto the '"', which becomes the wysiwyg terminator
            goto case '`';
        case '`':
            if (Ccompile)
                goto default;
            wysiwygStringConstant(t);
            return;
        case 'q':
            if (Ccompile)
                goto case_ident;
            if (p[1] == '"')
            {
                p++;
                delimitedStringConstant(t);
                return;
            }
            else if (p[1] == '{')
            {
                p++;
                tokenStringConstant(t);
                return;
            }
            else
                goto case_ident;
        case '"':
            escapeStringConstant(t);
            return;

        // Identifier start characters. 'q', 'r', 'u', 'L' and 'U' are
        // handled above because they may introduce string/char literals.
        case 'a': .. case 'p':
        case 's':
        case 't':
        case 'v': .. case 'z':
        case 'A': .. case 'K':
        case 'M': .. case 'T':
        case 'V': .. case 'Z':
        case '_':
        case_ident:
            {
                // Consume the remaining identifier characters.
                while (1)
                {
                    const c = *++p;
                    if (isidchar(c))
                        continue;
                    else if (c & 0x80)
                    {
                        const s = p;
                        const u = decodeUTF();
                        if (isUniAlpha(u))
                            continue;
                        error("char 0x%04x not allowed in identifier", u);
                        p = s; // the identifier ends before the bad character
                    }
                    break;
                }
                Identifier id = Identifier.idPool(cast(char*)t.ptr, cast(uint)(p - t.ptr));
                t.ident = id;
                t.value = cast(TOK)id.getValue();

                anyToken = 1;

                /* Different keywords for C and D
                 */
                if (Ccompile)
                {
                    if (t.value != TOK.identifier)
                    {
                        t.value = Ckeywords[t.value]; // filter out D keywords
                    }
                }
                else if (t.value >= FirstCKeyword)
                    t.value = TOK.identifier; // filter out C keywords

                else if (*t.ptr == '_') // if special identifier token
                {
                    // Lazy initialization
                    TimeStampInfo.initialize(t.loc);

                    if (id == Id.DATE)
                    {
                        t.ustring = TimeStampInfo.date.ptr;
                        goto Lstr;
                    }
                    else if (id == Id.TIME)
                    {
                        t.ustring = TimeStampInfo.time.ptr;
                        goto Lstr;
                    }
                    else if (id == Id.VENDOR)
                    {
                        t.ustring = global.vendor.xarraydup.ptr;
                        goto Lstr;
                    }
                    else if (id == Id.TIMESTAMP)
                    {
                        t.ustring = TimeStampInfo.timestamp.ptr;
                    Lstr:
                        t.value = TOK.string_;
                        t.postfix = 0;
                        t.len = cast(uint)strlen(t.ustring);
                    }
                    else if (id == Id.VERSIONX)
                    {
                        t.value = TOK.int64Literal;
                        t.unsvalue = global.versionNumber();
                    }
                    else if (id == Id.EOFX)
                    {
                        t.value = TOK.endOfFile;
                        // Advance scanner to end of file
                        while (!(*p == 0 || *p == 0x1A))
                            p++;
                    }
                }
                return;
            }
        case '/':
            p++;
            switch (*p)
            {
            case '=':
                p++;
                t.value = TOK.divAssign;
                return;
            case '*':
                p++;
                startLoc = loc();
                while (1)
                {
                    // Advance to the next '/', tracking line ends on the way.
                    while (1)
                    {
                        const c = *p;
                        switch (c)
                        {
                        case '/':
                            break;
                        case '\n':
                            endOfLine();
                            p++;
                            continue;
                        case '\r':
                            p++;
                            if (*p != '\n')
                                endOfLine();
                            continue;
                        case 0:
                        case 0x1A:
                            error("unterminated /* */ comment");
                            p = end;
                            t.loc = loc();
                            t.value = TOK.endOfFile;
                            return;
                        default:
                            if (c & 0x80)
                            {
                                const u = decodeUTF();
                                if (u == PS || u == LS)
                                    endOfLine();
                            }
                            p++;
                            continue;
                        }
                        break;
                    }
                    p++;
                    // The comment ends at "*/", except that the '*' of the
                    // opening "/*" does not count (so "/*/" is not closed).
                    if (p[-2] == '*' && p - 3 != t.ptr)
                        break;
                }
                if (commentToken)
                {
                    t.loc = startLoc;
                    t.value = TOK.comment;
                    return;
                }
                else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr)
                {
                    // if /** but not /**/
                    getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                    lastDocLine = scanloc.linnum;
                }
                continue;
            case '/': // do // style comments
                startLoc = loc();
                while (1)
                {
                    const c = *++p;
                    switch (c)
                    {
                    case '\n':
                        break;
                    case '\r':
                        if (p[1] == '\n')
                            p++;
                        break;
                    case 0:
                    case 0x1A:
                        if (commentToken)
                        {
                            p = end;
                            t.loc = startLoc;
                            t.value = TOK.comment;
                            return;
                        }
                        if (doDocComment && t.ptr[2] == '/')
                        {
                            getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                            lastDocLine = scanloc.linnum;
                        }
                        p = end;
                        t.loc = loc();
                        t.value = TOK.endOfFile;
                        return;
                    default:
                        if (c & 0x80)
                        {
                            const u = decodeUTF();
                            if (u == PS || u == LS)
                                break;
                        }
                        continue;
                    }
                    break;
                }
                if (commentToken)
                {
                    version (DMDLIB) {}
                    else
                    {
                        p++;
                        endOfLine();
                    }
                    t.loc = startLoc;
                    t.value = TOK.comment;
                    return;
                }
                if (doDocComment && t.ptr[2] == '/')
                {
                    getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                    lastDocLine = scanloc.linnum;
                }
                p++;
                endOfLine();
                continue;
            case '+':
                // Nesting /+ +/ comments exist only in D, not in ImportC.
                if (!Ccompile)
                {
                    int nest;
                    startLoc = loc();
                    p++;
                    nest = 1;
                    while (1)
                    {
                        char c = *p;
                        switch (c)
                        {
                        case '/':
                            p++;
                            if (*p == '+')
                            {
                                p++;
                                nest++;
                            }
                            continue;
                        case '+':
                            p++;
                            if (*p == '/')
                            {
                                p++;
                                if (--nest == 0)
                                    break;
                            }
                            continue;
                        case '\r':
                            p++;
                            if (*p != '\n')
                                endOfLine();
                            continue;
                        case '\n':
                            endOfLine();
                            p++;
                            continue;
                        case 0:
                        case 0x1A:
                            error("unterminated /+ +/ comment");
                            p = end;
                            t.loc = loc();
                            t.value = TOK.endOfFile;
                            return;
                        default:
                            if (c & 0x80)
                            {
                                uint u = decodeUTF();
                                if (u == PS || u == LS)
                                    endOfLine();
                            }
                            p++;
                            continue;
                        }
                        break;
                    }
                    if (commentToken)
                    {
                        t.loc = startLoc;
                        t.value = TOK.comment;
                        return;
                    }
                    if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr)
                    {
                        // if /++ but not /++/
                        getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                        lastDocLine = scanloc.linnum;
                    }
                    continue;
                }
                break;
            default:
                break;
            }
            t.value = TOK.div;
            return;
        case '.':
            p++;
            if (isdigit(*p))
            {
                /* Note that we don't allow ._1 and ._ as being
                 * valid floating point numbers.
                 */
                p--;
                t.value = inreal(t);
            }
            else if (p[0] == '.')
            {
                if (p[1] == '.')
                {
                    p += 2;
                    t.value = TOK.dotDotDot;
                }
                else
                {
                    p++;
                    t.value = TOK.slice;
                }
            }
            else
                t.value = TOK.dot;
            return;
        case '&':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.andAssign;
            }
            else if (*p == '&')
            {
                p++;
                t.value = TOK.andAnd;
            }
            else
                t.value = TOK.and;
            return;
        case '|':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.orAssign;
            }
            else if (*p == '|')
            {
                p++;
                t.value = TOK.orOr;
            }
            else
                t.value = TOK.or;
            return;
        case '-':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.minAssign;
            }
            else if (*p == '-')
            {
                p++;
                t.value = TOK.minusMinus;
            }
            else if (*p == '>')
            {
                ++p;
                t.value = TOK.arrow;
            }
            else
                t.value = TOK.min;
            return;
        case '+':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.addAssign;
            }
            else if (*p == '+')
            {
                p++;
                t.value = TOK.plusPlus;
            }
            else
                t.value = TOK.add;
            return;
        case '<':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.lessOrEqual; // <=
            }
            else if (*p == '<')
            {
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.leftShiftAssign; // <<=
                }
                else
                    t.value = TOK.leftShift; // <<
            }
            else if (*p == ':' && Ccompile)
            {
                ++p;
                t.value = TOK.leftBracket; // <:
            }
            else if (*p == '%' && Ccompile)
            {
                ++p;
                t.value = TOK.leftCurly; // <%
            }
            else
                t.value = TOK.lessThan; // <
            return;
        case '>':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.greaterOrEqual; // >=
            }
            else if (*p == '>')
            {
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.rightShiftAssign; // >>=
                }
                else if (*p == '>')
                {
                    p++;
                    if (*p == '=')
                    {
                        p++;
                        t.value = TOK.unsignedRightShiftAssign; // >>>=
                    }
                    else
                        t.value = TOK.unsignedRightShift; // >>>
                }
                else
                    t.value = TOK.rightShift; // >>
            }
            else
                t.value = TOK.greaterThan; // >
            return;
        case '!':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.notEqual; // !=
            }
            else
                t.value = TOK.not; // !
            return;
        case '=':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.equal; // ==
            }
            else if (*p == '>')
            {
                p++;
                t.value = TOK.goesTo; // =>
            }
            else
                t.value = TOK.assign; // =
            return;
        case '~':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.concatenateAssign; // ~=
            }
            else
                t.value = TOK.tilde; // ~
            return;
        case '^':
            p++;
            if (*p == '^')
            {
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.powAssign; // ^^=
                }
                else
                    t.value = TOK.pow; // ^^
            }
            else if (*p == '=')
            {
                p++;
                t.value = TOK.xorAssign; // ^=
            }
            else
                t.value = TOK.xor; // ^
            return;
        case '(':
            p++;
            t.value = TOK.leftParenthesis;
            return;
        case ')':
            p++;
            t.value = TOK.rightParenthesis;
            return;
        case '[':
            p++;
            t.value = TOK.leftBracket;
            return;
        case ']':
            p++;
            t.value = TOK.rightBracket;
            return;
        case '{':
            p++;
            t.value = TOK.leftCurly;
            return;
        case '}':
            p++;
            t.value = TOK.rightCurly;
            return;
        case '?':
            p++;
            t.value = TOK.question;
            return;
        case ',':
            p++;
            t.value = TOK.comma;
            return;
        case ';':
            p++;
            t.value = TOK.semicolon;
            return;
        case ':':
            p++;
            if (*p == ':')
            {
                ++p;
                t.value = TOK.colonColon;
            }
            else if (*p == '>' && Ccompile)
            {
                ++p;
                t.value = TOK.rightBracket; // :>
            }
            else
                t.value = TOK.colon;
            return;
        case '$':
            p++;
            t.value = TOK.dollar;
            return;
        case '@':
            p++;
            t.value = TOK.at;
            return;
        case '*':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.mulAssign;
            }
            else
                t.value = TOK.mul;
            return;
        case '%':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.modAssign;
            }
            else if (*p == '>' && Ccompile)
            {
                ++p;
                t.value = TOK.rightCurly; // %>
            }
            else if (*p == ':' && Ccompile)
            {
                goto case '#'; // %: means #
            }
            else
                t.value = TOK.mod;
            return;
        case '#':
            {
                // https://issues.dlang.org/show_bug.cgi?id=22825
                // Special token sequences are terminated by newlines,
                // and should not be skipped over.
                this.tokenizeNewlines = true;
                p++;
                if (parseSpecialTokenSequence())
                    continue;
                t.value = TOK.pound;
                return;
            }
        default:
            {
                dchar c = *p;
                if (c & 0x80)
                {
                    c = decodeUTF();
                    // Check for start of unicode identifier
                    if (isUniAlpha(c))
                        goto case_ident;
                    if (c == PS || c == LS)
                    {
                        endOfLine();
                        p++;
                        if (tokenizeNewlines)
                        {
                            t.value = TOK.endOfLine;
                            tokenizeNewlines = false;
                            return;
                        }
                        continue;
                    }
                }
                // Report the stray character, skip it, and keep scanning.
                if (c < 0x80 && isprint(c))
                    error("character '%c' is not a valid token", c);
                else
                    error("character 0x%02x is not a valid token", c);
                p++;
                continue;
            }
        }
    }
}
1138
final Token* peek(Token* ct)
{
    // Lex and chain a new token only if `ct` has no successor yet.
    if (ct.next is null)
    {
        Token* fresh = allocateToken();
        scan(fresh);
        ct.next = fresh;
    }
    return ct.next;
}
1152
1153 /*********************************
1154 * tk is on the opening (.
1155 * Look ahead and return token that is past the closing ).
1156 */
final Token* peekPastParen(Token* tk)
{
    int parenDepth = 1; // the opening '(' that `tk` starts on
    int braceDepth = 0;
    for (;;)
    {
        tk = peek(tk);
        const kind = tk.value;
        if (kind == TOK.leftParenthesis)
        {
            ++parenDepth;
        }
        else if (kind == TOK.rightParenthesis)
        {
            // Matching ')': the answer is the token right after it.
            if (--parenDepth == 0)
                return peek(tk);
        }
        else if (kind == TOK.leftCurly)
        {
            ++braceDepth;
        }
        else if (kind == TOK.rightCurly)
        {
            // An unbalanced '}' ends the search.
            if (--braceDepth < 0)
                return tk;
        }
        else if (kind == TOK.semicolon)
        {
            // A ';' outside any braces ends the search.
            if (braceDepth == 0)
                return tk;
        }
        else if (kind == TOK.endOfFile)
        {
            return tk;
        }
    }
}
1196
1197 /*******************************************
1198 * Parse escape sequence.
1199 */
private uint escapeSequence()
{
    // Delegate to the general overload, using the lexer's own cursor
    // (advanced by reference), current token location and C mode.
    const dchar decoded = Lexer.escapeSequence(token.loc, p, Ccompile);
    return decoded;
}
1204
1205 /********
1206 * Parse the given string literal escape sequence into a single character.
1207 * D https://dlang.org/spec/lex.html#escape_sequences
1208 * C11 6.4.4.4
1209 * Params:
1210 * loc = location to use for error messages
1211 * sequence = pointer to string with escape sequence to parse. Updated to
1212 * point past the end of the escape sequence
1213 * Ccompile = true for compile C11 escape sequences
1214 * Returns:
1215 * the escape sequence as a single character
1216 */
private dchar escapeSequence(const ref Loc loc, ref const(char)* sequence, bool Ccompile)
{
    const(char)* p = sequence; // cache sequence reference on stack
    scope(exit) sequence = p;  // hand the final position back on every exit path

    uint c = *p;
    int ndigits; // digit count for \x (2), \u (4) and \U (8)
    switch (c)
    {
    case '\'':
    case '"':
    case '?':
    case '\\':
    Lconsume:
        p++;
        break;
    case 'a':
        c = 7;
        goto Lconsume;
    case 'b':
        c = 8;
        goto Lconsume;
    case 'f':
        c = 12;
        goto Lconsume;
    case 'n':
        c = 10;
        goto Lconsume;
    case 'r':
        c = 13;
        goto Lconsume;
    case 't':
        c = 9;
        goto Lconsume;
    case 'v':
        c = 11;
        goto Lconsume;
    case 'u':
        ndigits = 4;
        goto Lhex;
    case 'U':
        ndigits = 8;
        goto Lhex;
    case 'x':
        ndigits = 2;
    Lhex:
        p++;
        c = *p;
        if (ishex(cast(char)c))
        {
            uint v = 0;
            int n = 0;
            if (Ccompile && ndigits == 2)
            {
                /* C11 6.4.4.4-7 one to infinity hex digits
                 */
                do
                {
                    if (isdigit(cast(char)c))
                        c -= '0';
                    else if (islower(c))
                        c -= 'a' - 10;
                    else
                        c -= 'A' - 10;
                    v = v * 16 + c;
                    c = *++p;
                } while (ishex(cast(char)c));
            }
            else
            {
                // Exactly `ndigits` hex digits are required.
                while (1)
                {
                    if (isdigit(cast(char)c))
                        c -= '0';
                    else if (islower(c))
                        c -= 'a' - 10;
                    else
                        c -= 'A' - 10;
                    v = v * 16 + c;
                    c = *++p;
                    if (++n == ndigits)
                        break;
                    if (!ishex(cast(char)c))
                    {
                        error(loc, "escape hex sequence has %d hex digits instead of %d", n, ndigits);
                        break;
                    }
                }
                // \u and \U must name a valid code point; \x is a raw byte.
                if (ndigits != 2 && !utf_isValidDchar(v))
                {
                    error(loc, "invalid UTF character \\U%08x", v);
                    v = '?'; // recover with valid UTF character
                }
            }
            c = v;
        }
        else
        {
            error(loc, "undefined escape hex sequence \\%c%c", sequence[0], c);
            // Skip the offending character, but never step over the
            // 0 / 0x1A end-of-file marker: the caller must still see it
            // (the dedicated end-of-file case below does not advance either).
            if (c != 0 && c != 0x1A)
                p++;
        }
        break;
    case '&':
        if (Ccompile)
            goto default;

        // named character entity
        for (const idstart = ++p; 1; p++)
        {
            switch (*p)
            {
            case ';':
                c = HtmlNamedEntity(idstart[0 .. p - idstart]);
                if (c == ~0)
                {
                    error(loc, "unnamed character entity &%.*s;", cast(int)(p - idstart), idstart);
                    c = '?';
                }
                p++;
                break;
            default:
                // Entity names are letters, with digits allowed after the first character.
                if (isalpha(*p) || (p != idstart && isdigit(*p)))
                    continue;
                error(loc, "unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart);
                c = '?';
                break;
            }
            break;
        }
        break;
    case 0:
    case 0x1A:
        // end of file: yield a backslash and leave `p` on the marker
        c = '\\';
        break;
    default:
        if (isoctal(cast(char)c))
        {
            // One to three octal digits.
            uint v = 0;
            int n = 0;
            do
            {
                v = v * 8 + (c - '0');
                c = *++p;
            }
            while (++n < 3 && isoctal(cast(char)c));
            c = v;
            if (c > 0xFF)
                error(loc, "escape octal sequence \\%03o is larger than \\377", c);
        }
        else
        {
            error(loc, "undefined escape sequence \\%c", c);
            p++;
        }
        break;
    }
    return c;
}
1376
1377 /**
1378 Lex a wysiwyg string. `p` must be pointing to the first character before the
1379 contents of the string literal. The character pointed to by `p` will be used as
1380 the terminating character (i.e. backtick or double-quote).
1381 Params:
1382 result = pointer to the token that accepts the result
1383 */
private void wysiwygStringConstant(Token* result)
{
    result.value = TOK.string_;
    Loc start = loc();
    // The character under `p` (backtick or double quote) closes the literal.
    const closer = *p++;
    stringbuffer.setsize(0);
    for (;;)
    {
        dchar ch = *p++;
        if (ch == '\n')
        {
            endOfLine();
        }
        else if (ch == '\r')
        {
            // CR LF: drop the CR, the LF is handled on the next iteration.
            if (*p == '\n')
                continue;
            ch = '\n'; // a lone CR is stored as \n
            endOfLine();
        }
        else if (ch == 0 || ch == 0x1A)
        {
            error("unterminated string constant starting at %s", start.toChars());
            result.setString();
            // rewind `p` so it points to the EOF character
            p--;
            return;
        }
        else if (ch == closer)
        {
            result.setString(stringbuffer);
            stringPostfix(result);
            return;
        }
        else if (ch & 0x80)
        {
            // Multi-byte character: decode from its first byte and store it whole.
            p--;
            const u = decodeUTF();
            p++;
            if (u == PS || u == LS)
                endOfLine();
            stringbuffer.writeUTF8(u);
            continue;
        }
        stringbuffer.writeByte(ch);
    }
}
1435
5fee5ec3
IB
1436 /**
1437 Lex a delimited string. Some examples of delimited strings are:
1438 ---
1439 q"(foo(xxx))" // "foo(xxx)"
1440 q"[foo$(LPAREN)]" // "foo$(LPAREN)"
1441 q"/foo]/" // "foo]"
1442 q"HERE
1443 foo
1444 HERE" // "foo\n"
1445 ---
1446 It is assumed that `p` points to the opening double-quote '"'.
1447 Params:
1448 result = pointer to the token that accepts the result
1449 */
private void delimitedStringConstant(Token* result)
{
    result.value = TOK.string_;
    Loc start = loc();
    dchar delimleft = 0;      // opening delimiter; 0 until the first character is read
    dchar delimright = 0;     // closing delimiter (unused for heredocs)
    uint nest = 1;            // 1 when the delimiters are a nesting bracket pair
    uint nestcount = ~0; // dead assignment, needed to suppress warning
    Identifier hereid = null; // heredoc identifier, if any
    uint blankrol = 0;        // 1 while the rest of the heredoc opening line is pending
    uint startline = 0;       // 1 when the next character starts a line
    p++;
    stringbuffer.setsize(0);
    while (1)
    {
        // First byte of the character read in this iteration. Needed to
        // rewind correctly when that character is a multi-byte UTF-8
        // sequence and must be re-lexed as the start of an identifier.
        const(char)* cstart = p;
        dchar c = *p++;
        switch (c)
        {
        case '\n':
        Lnextline:
            endOfLine();
            startline = 1;
            if (blankrol)
            {
                blankrol = 0;
                continue;
            }
            if (hereid)
            {
                stringbuffer.writeUTF8(c);
                continue;
            }
            break;
        case '\r':
            if (*p == '\n')
                continue; // ignore
            c = '\n'; // treat EndOfLine as \n character
            goto Lnextline;
        case 0:
        case 0x1A:
            error("unterminated delimited string constant starting at %s", start.toChars());
            result.setString();
            // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
            p--;
            return;
        default:
            if (c & 0x80)
            {
                p--;
                c = decodeUTF();
                p++;
                if (c == PS || c == LS)
                    goto Lnextline;
            }
            break;
        }
        if (delimleft == 0)
        {
            // The first character determines the kind of delimiter.
            delimleft = c;
            nest = 1;
            nestcount = 1;
            if (c == '(')
                delimright = ')';
            else if (c == '{')
                delimright = '}';
            else if (c == '[')
                delimright = ']';
            else if (c == '<')
                delimright = '>';
            else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
            {
                // Start of identifier; must be a heredoc
                Token tok;
                p = cstart; // rewind to the first byte of the identifier
                scan(&tok); // read in heredoc identifier
                if (tok.value != TOK.identifier)
                {
                    error("identifier expected for heredoc, not %s", tok.toChars());
                    delimright = c;
                }
                else
                {
                    hereid = tok.ident;
                    blankrol = 1;
                }
                nest = 0;
            }
            else
            {
                delimright = c;
                nest = 0;
                if (isspace(c))
                    error("delimiter cannot be whitespace");
            }
        }
        else
        {
            if (blankrol)
            {
                error("heredoc rest of line should be blank");
                blankrol = 0;
                continue;
            }
            if (nest == 1)
            {
                if (c == delimleft)
                    nestcount++;
                else if (c == delimright)
                {
                    nestcount--;
                    if (nestcount == 0)
                        goto Ldone;
                }
            }
            else if (c == delimright)
                goto Ldone;
            if (startline && (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) && hereid)
            {
                // A line starting with an identifier may be the heredoc terminator.
                Token tok;
                auto psave = p;
                p = cstart; // rewind to the first byte of the candidate identifier
                scan(&tok); // read in possible heredoc identifier
                if (tok.value == TOK.identifier && tok.ident is hereid)
                {
                    /* should check that rest of line is blank
                     */
                    goto Ldone;
                }
                p = psave; // not the terminator: resume after the current character
            }
            stringbuffer.writeUTF8(c);
            startline = 0;
        }
    }
Ldone:
    // The closing delimiter must be followed immediately by '"'.
    if (*p == '"')
        p++;
    else if (hereid)
        error("delimited string must end in `%s\"`", hereid.toChars());
    else if (isspace(delimright))
        error("delimited string must end in `\"`");
    else
        error("delimited string must end in `%c\"`", delimright);
    result.setString(stringbuffer);
    stringPostfix(result);
}
1599
/**
Lex a token string, i.e. a `q{ ... }` literal. Some examples of token strings are:
---
q{ foo(xxx) }    // " foo(xxx) "
q{foo$(LPAREN)}  // "foo$(LPAREN)"
q{{foo}"}"}      // "{foo}"}""
---
On entry `p` must point to the opening curly-brace. The contents are scanned
token by token while tracking brace nesting; the string value is the raw
source text between the outermost braces.
Params:
    result = pointer to the token that accepts the result
*/
private void tokenStringConstant(Token* result)
{
    result.value = TOK.string_;

    const start = loc();    // location of the `{`, only used for the error message
    const pstart = ++p;     // first character after the opening brace
    inTokenStringConstant++;
    scope(exit) inTokenStringConstant--;

    // `depth` counts the currently open braces, starting with the outermost one
    for (uint depth = 1;;)
    {
        Token tok;
        scan(&tok);
        if (tok.value == TOK.leftCurly)
        {
            ++depth;
        }
        else if (tok.value == TOK.rightCurly)
        {
            if (--depth == 0)
            {
                // the `- 1` keeps the closing brace out of the stored text
                result.setString(pstart, p - 1 - pstart);
                stringPostfix(result);
                return;
            }
        }
        else if (tok.value == TOK.endOfFile)
        {
            error("unterminated token string constant starting at %s", start.toChars());
            result.setString();
            return;
        }
        // every other token is simply part of the string
    }
}
1646
/**
Scan a quoted string while building the processed string value by
handling escape sequences. The result is returned in the given `t` token.
This function assumes that `p` currently points to the opening quote
of the string.

Line endings inside the literal (`\n`, `\r`, `\r\n`, and the Unicode LS/PS
separators) are stored as a single `\n`. When `Ccompile` is set, a line ending
terminates the literal with an "unterminated string constant" error and no
string postfix is read.
Params:
    t = the token to set the resulting string to
References:
    D https://dlang.org/spec/lex.html#double_quoted_strings
    ImportC C11 6.4.5
*/
private void escapeStringConstant(Token* t)
{
    t.value = TOK.string_;

    const start = loc();
    const tc = *p++; // opening quote
    stringbuffer.setsize(0);
    while (1)
    {
        dchar c = *p++;
        switch (c)
        {
        case '\\':
            switch (*p)
            {
            case '&':
                // named character entities are not an escape in C
                if (Ccompile)
                    goto default;
                goto case;

            case 'u':
            case 'U':
                // these escapes may produce a non-ASCII code point, so encode as UTF-8
                c = escapeSequence();
                stringbuffer.writeUTF8(c);
                continue;
            default:
                c = escapeSequence();
                break;
            }
            break;
        case '\n':
            endOfLine();
            if (Ccompile)
                goto Lunterminated;
            break;
        case '\r':
            if (*p == '\n')
                continue; // ignore
            c = '\n'; // treat EndOfLine as \n character
            endOfLine();
            if (Ccompile)
                goto Lunterminated;
            break;
        case '\'':
        case '"':
            // a quote that differs from the opening one is ordinary content
            if (c != tc)
                goto default;
            t.setString(stringbuffer);
            if (!Ccompile)
                stringPostfix(t);
            return;
        case 0:
        case 0x1A:
            // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
            p--;
        Lunterminated:
            error("unterminated string constant starting at %s", start.toChars());
            t.setString();
            return;
        default:
            if (c & 0x80)
            {
                p--;
                c = decodeUTF();
                /* decodeUTF() leaves `p` on the last byte of the sequence.
                 * Step past it before doing any line accounting, as the
                 * character and delimited-string lexers do, so that neither
                 * endOfLine() nor the ImportC early exit below sees `p`
                 * still inside the LS/PS sequence.
                 */
                p++;
                if (c == LS || c == PS)
                {
                    c = '\n';
                    endOfLine();
                    if (Ccompile)
                        goto Lunterminated;
                }
                stringbuffer.writeUTF8(c);
                continue;
            }
            break;
        }
        stringbuffer.writeByte(c);
    }
}
1738
/**************************************
 * Lex a D character literal. On entry `p` points to the opening `'`.
 *
 * The character value is stored in `t.unsvalue`; on any error it is set
 * to '?' after the error has been reported.
 * Params:
 *      t = token that receives the character value
 * Returns:
 *      TOK.charLiteral, TOK.wcharLiteral or TOK.dcharLiteral
 * Reference:
 *      https://dlang.org/spec/lex.html#characterliteral
 */
private TOK charConstant(Token* t)
{
    TOK kind = TOK.charLiteral;
    //printf("Lexer::charConstant\n");
    p++;                        // step over the opening quote
    dchar ch = *p++;
    bool unterminated = false;

    if (ch == '\\')
    {
        // the character after the backslash decides the literal's width
        const lead = *p;
        t.unsvalue = escapeSequence();
        if (lead == 'u')
            kind = TOK.wcharLiteral;
        else if (lead == 'U' || lead == '&')
            kind = TOK.dcharLiteral;
    }
    else if (ch == '\n')
    {
        endOfLine();
        unterminated = true;
    }
    else if (ch == '\r' || ch == '\'')
    {
        unterminated = true;
    }
    else if (ch == 0 || ch == 0x1A)
    {
        // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
        p--;
        unterminated = true;
    }
    else
    {
        if (ch & 0x80)
        {
            p--;
            ch = decodeUTF();
            p++;
            if (ch == LS || ch == PS)
            {
                // Unicode line/paragraph separators end the line like '\n'
                endOfLine();
                unterminated = true;
            }
            else if (ch < 0xD800 || (ch >= 0xE000 && ch < 0xFFFE))
                kind = TOK.wcharLiteral;
            else
                kind = TOK.dcharLiteral;
        }
        if (!unterminated)
            t.unsvalue = ch;
    }

    if (unterminated)
    {
        error("unterminated character constant");
        t.unsvalue = '?';
        return kind;
    }

    if (*p == '\'')
    {
        p++;                    // consume the closing quote
        return kind;
    }

    /* No closing quote where one was expected: skip ahead to a quote or to
     * something that plausibly ends the literal, then report what was found.
     */
    while (*p != '\'' && *p != 0x1A && *p != 0 && *p != '\n' &&
            *p != '\r' && *p != ';' && *p != ')' && *p != ']' && *p != '}')
    {
        if (*p & 0x80)
        {
            const seqStart = p;
            const decoded = decodeUTF();
            if (decoded == LS || decoded == PS)
            {
                // leave the separator for the caller to lex as a line ending
                p = seqStart;
                break;
            }
        }
        p++;
    }

    if (*p == '\'')
    {
        error("character constant has multiple characters");
        p++;
    }
    else
        error("unterminated character constant");
    t.unsvalue = '?';
    return kind;
}
1830
/***************************************
 * Lex C character constant.
 * Parser is on the opening quote.
 *
 * The quoted text is first lexed with `escapeStringConstant`, then folded
 * into an integer that is stored in `t.unsvalue`. An empty constant is
 * reported as an error and the token is turned into `TOK.semicolon`.
 * Params:
 *      t = token to fill in
 *      prefix = one of `u`, `U` or 0.
 * Reference:
 *      C11 6.4.4.4
 */
private void clexerCharConstant(ref Token t, char prefix)
{
    escapeStringConstant(&t);
    const(char)[] str = t.ustring[0 .. t.len];
    const n = str.length;
    const loc = t.loc;
    if (n == 0)
    {
        error(loc, "empty character constant");
        t.value = TOK.semicolon;
        return;
    }

    uint u;
    switch (prefix)
    {
        case 0:
            if (n == 1) // fast case
            {
                u = str[0];
            }
            else if (n > 4)
                error(loc, "max number of chars in character literal is 4, had %d",
                    cast(int)n);
            else
            {
                // pack the bytes into `u`, the last character ending up at byte offset 0
                foreach (i, c; str)
                    (cast(char*)&u)[n - 1 - i] = c;
            }
            break;

        case 'u':
            dchar d1;
            size_t idx;
            auto msg = utf_decodeChar(str, idx, d1);
            dchar d2 = 0;
            if (idx < n && !msg)
                msg = utf_decodeChar(str, idx, d2);
            if (msg)
                // `msg` is a D slice, not a C string: print it with an explicit length
                error(loc, "%.*s", cast(int)msg.length, msg.ptr);
            else if (idx < n)
                error(loc, "max number of chars in 16 bit character literal is 2, had %d",
                    cast(int)((n + 1) >> 1));
            else if (d1 >= 0x1_0000)    // 0x1_0000 itself already needs 17 bits
                error(loc, "%d does not fit in 16 bits", d1);
            else if (d2 >= 0x1_0000)
                error(loc, "%d does not fit in 16 bits", d2);
            u = d1;
            if (d2)
                u = (d1 << 16) | d2;
            break;

        case 'U':
            dchar d;
            size_t idx;
            auto msg = utf_decodeChar(str, idx, d);
            if (msg)
                // `msg` is a D slice, not a C string: print it with an explicit length
                error(loc, "%.*s", cast(int)msg.length, msg.ptr);
            else if (idx < n)
                error(loc, "max number of chars in 32 bit character literal is 1, had %d",
                    cast(int)((n + 3) >> 2));
            u = d;
            break;

        default:
            assert(0);
    }
    t.value = n == 1 ? TOK.charLiteral : TOK.int32Literal;
    t.unsvalue = u;
}
1910
/***************************************
 * Get postfix of string literal.
 *
 * If `p` is on one of the postfix characters `c`, `w` or `d`, it is
 * recorded in `t.postfix` and consumed; otherwise `t.postfix` is set to 0
 * and `p` is left untouched.
 * Params:
 *      t = string token whose `postfix` field is set
 */
private void stringPostfix(Token* t) pure @nogc
{
    const ch = *p;
    if (ch == 'c' || ch == 'w' || ch == 'd')
    {
        t.postfix = ch;
        p++;
    }
    else
    {
        t.postfix = 0;
    }
}
1929
/**************************************
 * Read in a number starting at `p`.
 *
 * Integer literals can be binary, octal, decimal or hex and may carry the
 * suffixes U, UL, LU, L, etc.; their value is stored in `t.unsvalue`.
 * As soon as the text turns out to be a floating point literal, lexing
 * restarts from the first character in `inreal`. When `Ccompile` is set,
 * the integer suffix is handled by `cnumber` instead.
 * Params:
 *      t = token that receives the value
 * Returns:
 *      the TOK kind of the literal that was read
 */
private TOK number(Token* t)
{
    int radix = 10;
    const start = p;
    uinteger_t value = 0;       // unsigned >=64 bit integer type
    int digit;
    bool err = false;
    bool overflow = false;
    bool sawDigit = false;      // any digit seen after a `0b`/`0x` prefix
    char errorDigit = 0;        // first digit that is invalid for `radix`, if any
    dchar c = *p;
    if (c == '0')
    {
        // the character after a leading zero selects the radix
        ++p;
        c = *p;
        switch (c)
        {
        case '0': .. case '7':
            radix = 8;
            break;

        case '8':
        case '9':
            errorDigit = cast(char) c;
            radix = 8;
            break;

        case 'x':
        case 'X':
            ++p;
            radix = 16;
            break;

        case 'b':
        case 'B':
            if (Ccompile)
                error("binary constants not allowed");
            ++p;
            radix = 2;
            break;

        case '.':
            if (p[1] == '.')
                goto Ldone; // if ".."
            if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)
            {
                if (Ccompile && (p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L'))
                    goto Lreal; // if `0.f` or `0.L`
                goto Ldone; // if ".identifier" or ".unicode"
            }
            goto Lreal; // '.' is part of current token

        case 'i':
        case 'f':
        case 'F':
            goto Lreal;

        case '_':
            if (Ccompile)
                error("embedded `_` not allowed");
            ++p;
            radix = 8;
            break;

        case 'L':
            if (p[1] == 'i')
                goto Lreal;
            break;

        default:
            break;
        }
    }
    while (1)
    {
        c = *p;
        switch (c)
        {
        case '0': .. case '9':
            ++p;
            digit = c - '0';
            break;

        case 'a': .. case 'f':
        case 'A': .. case 'F':
            ++p;
            // outside of hex, `e`/`f` can only belong to a floating point literal
            if (radix != 16 && (c == 'e' || c == 'E' || c == 'f' || c == 'F'))
                goto Lreal;
            digit = (c >= 'a') ? c + 10 - 'a' : c + 10 - 'A';
            break;

        case 'L':
            if (p[1] == 'i')
                goto Lreal;
            goto Ldone;

        case '.':
            if (p[1] == '.')
                goto Ldone; // if ".."
            if (radix <= 10 && value > 0 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80))
            {
                if (Ccompile && radix == 10 &&
                    (p[1] == 'e' || p[1] == 'E' || p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L'))
                    goto Lreal; // if `1.e6` or `1.f` or `1.L`
                goto Ldone; // if ".identifier" or ".unicode"
            }
            if (radix == 16 && (!ishex(p[1]) || p[1] == '_' || p[1] & 0x80))
                goto Ldone; // if ".identifier" or ".unicode"
            if (radix == 2)
                goto Ldone; // if ".identifier" or ".unicode"
            goto Lreal; // otherwise as part of a floating point literal

        case 'p':
        case 'P':
        case 'i':
        Lreal:
            // rewind and lex the whole literal again as floating point
            p = start;
            return inreal(t);

        case '_':
            if (Ccompile)
                goto default;
            ++p;
            continue;

        default:
            goto Ldone;
        }
        // got a digit here, set any necessary flags, check for errors
        sawDigit = true;
        if (!errorDigit && digit >= radix)
            errorDigit = cast(char) c;
        // Avoid expensive overflow check if we aren't at risk of overflow
        if (value <= 0x0FFF_FFFF_FFFF_FFFFUL)
            value = value * radix + digit;
        else
        {
            import core.checkedint : mulu, addu;

            value = mulu(value, radix, overflow);
            value = addu(value, digit, overflow);
        }
    }
Ldone:
    if (errorDigit)
    {
        error("%s digit expected, not `%c`", radix == 2 ? "binary".ptr :
                                             radix == 8 ? "octal".ptr :
                                             "decimal".ptr, errorDigit);
        err = true;
    }
    if (overflow && !err)
    {
        error("integer overflow");
        err = true;
    }
    // a bare `0b` / `0x` prefix without digits is not a literal
    if ((radix == 2 || radix == 16) && !sawDigit)
        error("`%.*s` isn't a valid integer literal, use `%.*s0` instead", cast(int)(p - start), start, 2, start);

    t.unsvalue = value;

    if (Ccompile)
        return cnumber(radix, value);

    enum FLAGS : int
    {
        none = 0,
        decimal = 1, // decimal
        unsigned = 2, // u or U suffix
        long_ = 4, // L suffix
    }

    FLAGS flags = (radix == 10) ? FLAGS.decimal : FLAGS.none;
    // Parse trailing 'u', 'U', 'l' or 'L' in any combination
    const psuffix = p;
    for (;;)
    {
        FLAGS f;
        const sc = *p;
        if (sc == 'U' || sc == 'u')
            f = FLAGS.unsigned;
        else if (sc == 'l' || sc == 'L')
        {
            f = FLAGS.long_;
            if (sc == 'l')
                error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
        }
        else
            break;

        p++;
        // the same suffix given twice
        if ((flags & f) && !err)
        {
            error("unrecognized token");
            err = true;
        }
        flags = cast(FLAGS)(flags | f);
    }
    if (radix == 8 && value >= 8)
    {
        if (err)
            // can't translate invalid octal value, just show a generic message
            error("octal literals larger than 7 are no longer supported");
        else
            error("octal literals `0%llo%.*s` are no longer supported, use `std.conv.octal!\"%llo%.*s\"` instead",
                value, cast(int)(p - psuffix), psuffix, value, cast(int)(p - psuffix), psuffix);
    }

    // pick the smallest token kind that can hold `value` for the given suffixes
    TOK result;
    switch (flags)
    {
    case FLAGS.none:
        /* Octal or Hexadecimal constant.
         * First that fits: int, uint, long, ulong
         */
        result = (value & 0x8000000000000000L) ? TOK.uns64Literal
               : (value & 0xFFFFFFFF00000000L) ? TOK.int64Literal
               : (value & 0x80000000)          ? TOK.uns32Literal
               :                                 TOK.int32Literal;
        break;

    case FLAGS.decimal:
        /* First that fits: int, long, long long
         */
        result = (value & 0x8000000000000000L) ? TOK.uns64Literal
               : (value & 0xFFFFFFFF80000000L) ? TOK.int64Literal
               :                                 TOK.int32Literal;
        break;

    case FLAGS.unsigned:
    case FLAGS.decimal | FLAGS.unsigned:
        /* First that fits: uint, ulong
         */
        result = (value & 0xFFFFFFFF00000000L) ? TOK.uns64Literal : TOK.uns32Literal;
        break;

    case FLAGS.decimal | FLAGS.long_:
        if (value & 0x8000000000000000L)
        {
            if (!err)
            {
                error("signed integer overflow");
                err = true;
            }
            result = TOK.uns64Literal;
        }
        else
            result = TOK.int64Literal;
        break;

    case FLAGS.long_:
        result = (value & 0x8000000000000000L) ? TOK.uns64Literal : TOK.int64Literal;
        break;

    case FLAGS.unsigned | FLAGS.long_:
    case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_:
        result = TOK.uns64Literal;
        break;

    default:
        debug
        {
            printf("%x\n", flags);
        }
        assert(0);
    }
    return result;
}
2246
/**************************************
 * Lex C integer-suffix
 *
 * Consumes the suffix characters at `p` and picks the token kind from the
 * suffixes, the number base and the magnitude of `n`.
 * Params:
 *      base = number base
 *      n = raw integer value
 * Returns:
 *      token value
 */
private TOK cnumber(int base, uinteger_t n)
{
    /* C11 6.4.4.1
     * Parse trailing suffixes:
     *   u or U
     *   l or L
     *   ll or LL
     */
    enum FLAGS : uint
    {
        octalhex = 1, // octal or hexadecimal
        decimal = 2, // decimal
        unsigned = 4, // u or U suffix
        long_ = 8, // l or L suffix
        llong = 0x10 // ll or LL
    }
    FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.octalhex;
    bool err;
    for (;;)
    {
        FLAGS f;
        const cs = *p;
        if (cs == 'U' || cs == 'u')
        {
            f = FLAGS.unsigned;
        }
        else if (cs == 'l' || cs == 'L')
        {
            f = FLAGS.long_;
            // `ll` / `LL`: both letters must have the same case
            if (cs == p[1])
            {
                f = FLAGS.long_ | FLAGS.llong;
                ++p;
            }
        }
        else
            break;

        ++p;
        if ((flags & f) && !err)
        {
            error("duplicate integer suffixes");
            err = true;
        }
        flags = cast(FLAGS)(flags | f);
    }

    // bit patterns used to find the first type `n` fits in
    enum ulong bit63 = 0x8000_0000_0000_0000;       // does not fit a signed 64 bit type
    enum ulong upper32 = 0xFFFF_FFFF_0000_0000;     // does not fit an unsigned 32 bit type
    enum ulong above31 = 0xFFFF_FFFF_8000_0000;     // does not fit a signed 32 bit type
    enum ulong bit31 = 0x8000_0000;                 // bit 31, the sign bit of a 32 bit type

    // int, unsigned, long, unsigned long in D terms
    const TOK octalHexFit = (n & bit63)   ? TOK.uns64Literal
                          : (n & upper32) ? TOK.int64Literal
                          : (n & bit31)   ? TOK.uns32Literal
                          :                 TOK.int32Literal;
    // unsigned, unsigned long in D terms
    const TOK unsignedFit = (n & upper32) ? TOK.uns64Literal : TOK.uns32Literal;
    // selects the 32 bit branches for `l`/`L` suffixed constants
    const bool narrowLong = (longsize == 4 || long_longsize == 4);

    TOK result = TOK.int32Literal; // default
    switch (flags)
    {
        /* Since D doesn't have a variable sized `long` or `unsigned long` type,
         * this code deviates from C by picking D int, uint, long, or ulong instead
         */

        case FLAGS.octalhex:
            /* Octal or Hexadecimal constant.
             * First that fits: int, unsigned, long, unsigned long,
             * long long, unsigned long long
             */
            result = octalHexFit;
            break;

        case FLAGS.decimal:
            /* First that fits: int, long, long long
             */
            result = (n & bit63)   ? TOK.uns64Literal   // unsigned long
                   : (n & above31) ? TOK.int64Literal   // long
                   :                 TOK.int32Literal;
            break;

        case FLAGS.octalhex | FLAGS.unsigned:
        case FLAGS.decimal | FLAGS.unsigned:
            /* First that fits: unsigned, unsigned long, unsigned long long
             */
            result = unsignedFit;
            break;

        case FLAGS.decimal | FLAGS.long_:
            /* First that fits: long, long long
             */
            result = (narrowLong && !(n & above31)) ? TOK.int32Literal : TOK.int64Literal;
            break;

        case FLAGS.octalhex | FLAGS.long_:
            /* First that fits: long, unsigned long, long long,
             * unsigned long long
             */
            if (narrowLong)
                result = octalHexFit;
            else
                result = (n & bit63) ? TOK.uns64Literal : TOK.int64Literal;
            break;

        case FLAGS.octalhex | FLAGS.unsigned | FLAGS.long_:
        case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_:
            /* First that fits: unsigned long, unsigned long long
             */
            result = narrowLong ? unsignedFit : TOK.uns64Literal;
            break;

        case FLAGS.octalhex | FLAGS.long_ | FLAGS.llong:
            /* First that fits: long long, unsigned long long
             */
            result = (n & bit63) ? TOK.uns64Literal : TOK.int64Literal;
            break;

        case FLAGS.decimal | FLAGS.long_ | FLAGS.llong:
            /* long long
             */
            result = TOK.int64Literal;
            break;

        case FLAGS.octalhex | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong:
        case FLAGS.decimal | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong:
            result = TOK.uns64Literal;
            break;

        default:
            debug printf("%x\n",flags);
            assert(0);
    }
    return result;
}
2433
/**************************************
 * Read in characters, converting them to real.
 *
 * On entry `p` is at the first character of the literal (a digit or `.`).
 * The literal text, with `_` separators removed, is collected in
 * `stringbuffer` and parsed into `t.floatvalue`; a malformed literal
 * yields `CTFloat.zero` after the error has been reported.
 * Params:
 *      t = token that receives the floating point value
 * Returns:
 *      one of the float32/float64/float80 or imaginary32/64/80 literal kinds
 * Bugs:
 *      Exponent overflow not detected.
 *      Too much requested precision is not detected.
 */
private TOK inreal(Token* t)
{
    //printf("Lexer::inreal()\n");
    debug
    {
        assert(*p == '.' || isdigit(*p));
    }
    bool isWellformedString = true;
    stringbuffer.setsize(0);
    auto pstart = p;
    bool hex = false;
    dchar c = *p++;
    // Leading '0x'
    if (c == '0')
    {
        c = *p++;
        if (c == 'x' || c == 'X')
        {
            hex = true;
            c = *p++;
        }
    }
    // Digits to left of '.'
    while (1)
    {
        if (c == '.')
        {
            c = *p++;
            break;
        }
        if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
        {
            c = *p++;
            continue;
        }
        break;
    }
    // Digits to right of '.'
    while (1)
    {
        if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
        {
            c = *p++;
            continue;
        }
        break;
    }
    if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P')))
    {
        c = *p++;
        if (c == '-' || c == '+')
        {
            c = *p++;
        }
        bool anyexp = false;
        while (1)
        {
            if (isdigit(c))
            {
                anyexp = true;
                c = *p++;
                continue;
            }
            if (c == '_')
            {
                if (Ccompile)
                    error("embedded `_` in numeric literals not allowed");
                c = *p++;
                continue;
            }
            if (!anyexp)
            {
                error("missing exponent");
                isWellformedString = false;
            }
            break;
        }
    }
    else if (hex)
    {
        error("exponent required for hex float");
        isWellformedString = false;
    }
    // `c` was read one character past the literal; step back onto it
    --p;
    // copy the literal text without `_` separators
    while (pstart < p)
    {
        if (*pstart != '_')
            stringbuffer.writeByte(*pstart);
        ++pstart;
    }
    stringbuffer.writeByte(0);
    auto sbufptr = cast(const(char)*)stringbuffer[].ptr;
    TOK result;
    bool isOutOfRange = false;
    t.floatvalue = (isWellformedString ? CTFloat.parse(sbufptr, isOutOfRange) : CTFloat.zero);
    switch (*p)
    {
    case 'F':
    case 'f':
        if (isWellformedString && !isOutOfRange)
            isOutOfRange = Port.isFloat32LiteralOutOfRange(sbufptr);
        result = TOK.float32Literal;
        p++;
        break;
    default:
        if (isWellformedString && !isOutOfRange)
            isOutOfRange = Port.isFloat64LiteralOutOfRange(sbufptr);
        result = TOK.float64Literal;
        break;
    case 'l':
        if (!Ccompile)
            error("use 'L' suffix instead of 'l'");
        goto case 'L';
    case 'L':
        ++p;
        // ImportC with an 8 byte `long double`: treat the literal like a double
        if (Ccompile && long_doublesize == 8)
            goto default;
        result = TOK.float80Literal;
        break;
    }
    if ((*p == 'i' || *p == 'I') && !Ccompile)
    {
        if (*p == 'I')
            error("use 'i' suffix instead of 'I'");
        p++;
        switch (result)
        {
        case TOK.float32Literal:
            result = TOK.imaginary32Literal;
            break;
        case TOK.float64Literal:
            result = TOK.imaginary64Literal;
            break;
        case TOK.float80Literal:
            result = TOK.imaginary80Literal;
            break;
        default:
            break;
        }
    }
    const isLong = (result == TOK.float80Literal || result == TOK.imaginary80Literal);
    if (isOutOfRange && !isLong && (!Ccompile || hex))
    {
        /* C11 6.4.4.2 doesn't actually care if it is not representable if it is not hex
         */
        /* `result` may already have been turned into an imaginary kind above,
         * so every 32 and 64 bit kind needs an entry here. Looking `result`
         * up in a table that only lists the real kinds would fail on an
         * out-of-range imaginary literal instead of reporting the error.
         */
        const(char)* suffix;
        const(char)* type;
        bool suggestReal = false;
        switch (result)
        {
        case TOK.float32Literal:
            suffix = "f";
            type = "`float`";
            break;
        case TOK.float64Literal:
            suffix = "";
            type = "`double`";
            suggestReal = true;
            break;
        case TOK.imaginary32Literal:
            suffix = "fi";
            type = "`ifloat`";
            break;
        case TOK.imaginary64Literal:
            suffix = "i";
            type = "`idouble`";
            suggestReal = true;
            break;
        default:
            // the 80 bit kinds are excluded by `!isLong`
            assert(0);
        }
        error(scanloc, "number `%s%s` is not representable as a %s", sbufptr, suffix, type);
        const char* extra = suggestReal ? "`real` literals can be written using the `L` suffix. " : "";
        errorSupplemental(scanloc, "%shttps://dlang.org/spec/lex.html#floatliteral", extra);
    }
    debug
    {
        switch (result)
        {
        case TOK.float32Literal:
        case TOK.float64Literal:
        case TOK.float80Literal:
        case TOK.imaginary32Literal:
        case TOK.imaginary64Literal:
        case TOK.imaginary80Literal:
            break;
        default:
            assert(0);
        }
    }
    return result;
}
2610
/**
 * Update `scanloc` to the current scan position and return it.
 *
 * The column is the 1-based distance of `p` from `line`; with
 * `version (LocOffset)` the offset of `p` from `base` is recorded as well.
 * Returns:
 *      a copy of the updated `scanloc`
 */
final Loc loc() pure @nogc
{
    const column = cast(uint)(1 + p - line);
    scanloc.charnum = column;
    version (LocOffset)
        scanloc.fileOffset = cast(uint)(p - base);
    return scanloc;
}
2618
/**
 * Report an error at `token.loc`.
 * Params:
 *      format = printf-style format string, followed by its arguments
 */
final void error(const(char)* format, ...)
{
    va_list ap;
    va_start(ap, format);
    scope (exit) va_end(ap);
    .verror(token.loc, format, ap);
}
2626
/**
 * Report an error at an explicit location.
 * Params:
 *      loc = location the error is attributed to
 *      format = printf-style format string, followed by its arguments
 */
final void error(const ref Loc loc, const(char)* format, ...)
{
    va_list ap;
    va_start(ap, format);
    scope (exit) va_end(ap);
    .verror(loc, format, ap);
}
2634
/**
 * Report a deprecation at `token.loc`.
 * Params:
 *      format = printf-style format string, followed by its arguments
 */
final void deprecation(const(char)* format, ...)
{
    va_list ap;
    va_start(ap, format);
    scope (exit) va_end(ap);
    .vdeprecation(token.loc, format, ap);
}
2642
7e287503
IB
/***************************************
 * Parse special token sequence:
 *
 * Scans the next token. `line` is handed to `poundLine`; any other
 * identifier produces a warning and `if` produces an error.
 * Returns:
 *      true if the special token sequence was handled
 * References:
 *      https://dlang.org/spec/lex.html#special-token-sequence
 */
bool parseSpecialTokenSequence()
{
    Token n;
    scan(&n);

    if (n.value == TOK.if_)
    {
        error("C preprocessor directive `#if` is not supported, use `version` or `static if`");
        return false;
    }
    if (n.value != TOK.identifier)
        return false;

    if (n.ident == Id.line)
    {
        poundLine(n, false);
        return true;
    }

    // some other identifier after `#`: warn, but report it as not handled
    const locx = loc();
    warning(locx, "C preprocessor directive `#%s` is not supported", n.ident.toChars());
    return false;
}
2673
5fee5ec3
IB
/*********************************************
 * Parse line/file preprocessor directive:
 *    #line linnum [filespec]
 * Allow __LINE__ for linnum, and __FILE__ for filespec.
 * Accept linemarker format:
 *    # linnum [filespec] {flags}
 * There can be zero or more flags, which are one of the digits 1..4, and
 * must be in ascending order. The flags are ignored.
 *
 * The new line number and file name are only stored into `scanloc` once
 * the end of the directive's line has been reached without error, and
 * only when not inside a token string.
 * Params:
 *      tok = token we're on, which is linnum of linemarker
 *      linemarker = true if line marker format and lexer is on linnum
 * References:
 *      linemarker https://gcc.gnu.org/onlinedocs/gcc-11.1.0/cpp/Preprocessor-Output.html
 */
final void poundLine(ref Token tok, bool linemarker)
{
    auto newLinnum = this.scanloc.linnum;
    const(char)* newFilename = null;
    bool sawFlags;

    // for `#line` the number still has to be scanned; a linemarker is already on it
    if (!linemarker)
        scan(&tok);

    if (tok.value == TOK.int32Literal || tok.value == TOK.int64Literal)
    {
        const lin = cast(int)(tok.unsvalue);
        if (lin != tok.unsvalue)
        {
            error(tok.loc, "line number `%lld` out of range", cast(ulong)tok.unsvalue);
            skipToNextLine();
            return;
        }
        newLinnum = lin;
    }
    else if (tok.value != TOK.line) // `#line __LINE__` keeps the current line number
    {
        error(tok.loc, "positive integer argument expected following `#line`");
        if (tok.value != TOK.endOfLine)
            skipToNextLine();
        return;
    }

    // everything after the line number; leaving this loop means a malformed directive
Lrest:
    for (;;)
    {
        scan(&tok);
        switch (tok.value)
        {
        case TOK.endOfFile:
        case TOK.endOfLine:
            if (!inTokenStringConstant)
            {
                this.scanloc.linnum = newLinnum;
                if (newFilename)
                    this.scanloc.filename = newFilename;
            }
            return;

        case TOK.file:
            // `__FILE__`: only one filespec is allowed, and it must precede any flags
            if (newFilename || sawFlags)
                break Lrest;
            newFilename = mem.xstrdup(scanloc.filename);
            continue;

        case TOK.string_:
            if (newFilename || sawFlags)
                break Lrest;
            // must be a plain double quoted string without a postfix
            if (tok.ptr[0] != '"' || tok.postfix != 0)
                break Lrest;
            newFilename = tok.ustring;
            continue;

        case TOK.int32Literal:
            // flags are only valid in a linemarker, after the filespec
            if (newFilename && linemarker && tok.unsvalue >= 1 && tok.unsvalue <= 4)
            {
                sawFlags = true; // linemarker flags seen
                continue;
            }
            break Lrest;

        default:
            break Lrest;
        }
    }

    if (newFilename is null)
        error(tok.loc, "invalid filename for `#line` directive");
    else if (linemarker)
        error(tok.loc, "invalid flag for line marker directive");
    else if (!Ccompile)
        error(tok.loc, "found `%s` when expecting new line following `#line` directive", tok.toChars());
    if (tok.value != TOK.endOfLine)
        skipToNextLine();
}
2767
0fb57034
IB
/***************************************
 * Scan forward to start of next line.
 *
 * Stops without advancing at end of file (0 or 0x1A). Otherwise the line
 * ending is consumed, `endOfLine()` is called and `tokenizeNewlines` is
 * cleared.
 * Params:
 *      defines = send characters to `defines`
 */
final void skipToNextLine(OutBuffer* defines = null)
{
    for (;;)
    {
        const ch = *p;
        if (ch == 0 || ch == 0x1A)
            return; // do not advance p

        if (ch == '\n')
        {
            ++p;
            break;
        }
        if (ch == '\r')
        {
            ++p;
            if (p[0] == '\n')
                ++p;
            break;
        }

        if (defines)
            defines.writeByte(ch); // don't care about Unicode line endings for C
        else if (ch & 0x80)
        {
            const u = decodeUTF();
            if (u == PS || u == LS)
            {
                // a Unicode line or paragraph separator also ends the line
                ++p;
                break;
            }
        }
        ++p;
    }
    endOfLine();
    tokenizeNewlines = false;
}
2813
5fee5ec3
IB
/********************************************
 * Decode UTF character.
 * Issue error messages for invalid sequences.
 * Return decoded character, advance p to last character in UTF sequence.
 *
 * On entry `p` must be on a byte with the high bit set.
 */
private uint decodeUTF()
{
    const seq = p;
    assert(*seq & 0x80);

    // Look at no more than 4 bytes, and never beyond a terminating 0
    size_t avail = 1;
    while (avail < 4 && seq[avail])
        ++avail;

    dchar decoded;
    size_t consumed = 0;
    const msg = utf_decodeChar(seq[0 .. avail], consumed, decoded);
    // leave `p` on the last byte of the sequence, not one past it
    p += consumed - 1;
    if (msg)
    {
        error("%.*s", cast(int)msg.length, msg.ptr);
    }
    return decoded;
}
2838
/***************************************************
 * Extract the doc comment lying between t.ptr and p and attach it to `t`.
 *
 * The text is canonicalized on the way:
 *  - blanks and tabs at the end of each line are dropped,
 *  - every kind of newline becomes a single \n,
 *  - the leading comment character of each line is removed.
 * The result goes to the token's lineComment or blockComment and is
 * appended to whatever doc comment is already stored there.
 *
 * Params:
 *  t = token the comment belongs to; t.ptr points at the comment opener
 *  lineComment = non-zero if the comment may qualify as a line comment
 *  newParagraph = if true, an extra newline separates adjoining doc comments
 */
private void getDocComment(Token* t, uint lineComment, bool newParagraph) pure
{
    // Third character of the opener selects the flavour: '/', '*' or '+'
    const kind = t.ptr[2];

    // Comment text begins after / * *, / + + or / / /
    const(char)* cur = t.ptr + 3;
    const(char)* end = p;
    if (kind == '*' || kind == '+')
        end -= 2; // presumably excludes the two closing characters

    // Skip an opening row of ****'s, ++++'s or ////'s
    while (cur < end && *cur == kind)
        ++cur;

    // Skip what precedes the first line of real text
    int atLineStart = 0;
    if (kind == '/')
    {
        for (; cur < end; ++cur)
        {
            if (*cur != ' ' && *cur != '\t')
                break;
        }
    }
    else if (cur < end && (*cur == '\r' || *cur == '\n'))
    {
        const sawCR = (*cur == '\r');
        ++cur;
        if (sawCR && cur < end && *cur == '\n')
            ++cur;
        atLineStart = 1;
    }

    // Drop a closing row of ****'s or ++++'s
    if (kind != '/')
    {
        while (cur < end && end[-1] == kind)
            --end;
    }

    // What remains is [cur .. end]; build the canonical form in `text`
    OutBuffer text;

    void stripLineTail()
    {
        const view = text[];
        size_t keep = view.length;
        for (; keep; --keep)
        {
            const last = view[keep - 1];
            if (last != ' ' && last != '\t')
                break;
        }
        text.setsize(keep);
    }

    for (; cur < end; cur++)
    {
        char ch = *cur;
        switch (ch)
        {
        case '*':
        case '+':
            // A comment character opening a line is decoration, not text
            if (atLineStart && ch == kind)
            {
                atLineStart = 0;
                stripLineTail(); // also removes the indentation in front of it
                continue;
            }
            break;

        case ' ':
        case '\t':
            break;

        case '\r':
            if (cur[1] == '\n')
                continue; // the \n that follows does the work
            goto Lnewline;

        default:
            // 226 128 168 / 226 128 169 are the UTF-8 forms of LS / PS
            if (ch == 226 && cur[1] == 128 && (cur[2] == 168 || cur[2] == 169))
            {
                cur += 2;
                goto Lnewline;
            }
            atLineStart = 0;
            break;

        Lnewline:
            ch = '\n'; // every newline flavour is stored as \n
            goto case;

        case '\n':
            atLineStart = 1;
            stripLineTail();
            break;
        }
        text.writeByte(ch);
    }

    // The final line may have had no newline to trigger the trim
    stripLineTail();

    // Guarantee a terminating newline
    const done = text[];
    if (done.length == 0 || done[$ - 1] != '\n')
        text.writeByte('\n');

    // Stored as a line comment only when requested and `anyToken` is set
    auto slot = (lineComment && anyToken) ? &t.lineComment : &t.blockComment;
    if (*slot)
        *slot = combineComments(*slot, text[], newParagraph).toDString();
    else
        *slot = text.extractSlice(true);
}
2980
/********************************************
 * Join two doc comments into one freshly allocated, 0-terminated string.
 *
 * If either argument is null the other one's pointer is returned as is.
 * Otherwise the result is c1, a '\n' if c1 is non-empty and does not
 * already end in one, another '\n' if newParagraph is true, then c2.
 */
static const(char)* combineComments(const(char)[] c1, const(char)[] c2, bool newParagraph) pure
{
    if (!c1)
        return c2.ptr;
    if (!c2)
        return c1.ptr;

    const bool needEol = c1.length != 0 && c1[$ - 1] != '\n';
    const size_t gap = (needEol ? 1 : 0) + (newParagraph ? 1 : 0);
    const size_t total = c1.length + gap + c2.length;

    auto dst = cast(char*)mem.xmalloc_noscan(total + 1);
    dst[0 .. c1.length] = c1[];
    size_t at = c1.length;
    if (needEol)
        dst[at++] = '\n';
    if (newParagraph)
        dst[at++] = '\n'; // the paragraph separator
    dst[at .. at + c2.length] = c2[];
    dst[total] = 0;
    return dst;
}
3008
0fb57034
IB
/**************************
 * Record that a new source line begins at `p`:
 * remember `p` as the line start and bump the line number.
 * `p` should be at start of next line
 */
private void endOfLine() pure @nogc @safe
{
    line = p;
    ++scanloc.linnum;
}
3017}
3018
6384eff5
IB
3019
3020/******************************* Private *****************************************/
3021
3022private:
3023
5fee5ec3
IB
/// Support for `__DATE__`, `__TIME__`, and `__TIMESTAMP__`
private struct TimeStampInfo
{
    private __gshared bool initdone = false;

    // Note: These buffers only hold meaningful text after `initialize`
    // has been called.
    // The API isn't safe, and quite brittle, but it was left this way
    // over performance concerns.
    // This is currently only called once, from the lexer.
    __gshared char[11 + 1] date;        // "Mmm dd yyyy" + 0
    __gshared char[8 + 1] time;         // "hh:mm:ss" + 0
    __gshared char[24 + 1] timestamp;   // "Www Mmm dd hh:mm:ss yyyy" + 0

    /**
     * Fill `date`, `time` and `timestamp` the first time it is called;
     * later calls return immediately.
     *
     * The time comes from the environment variable `SOURCE_DATE_EPOCH`
     * when it is set, otherwise from the current clock.
     *
     * Params:
     *  loc = location used if `SOURCE_DATE_EPOCH` cannot be parsed
     */
    public static void initialize(const ref Loc loc) nothrow
    {
        if (initdone)
            return;

        initdone = true;
        time_t ct;
        // https://issues.dlang.org/show_bug.cgi?id=20444
        if (auto p = getenv("SOURCE_DATE_EPOCH"))
        {
            if (!ct.parseDigits(p.toDString()))
                error(loc, "value of environment variable `SOURCE_DATE_EPOCH` should be a valid UNIX timestamp, not: `%s`", p);
        }
        else
            .time(&ct); // leading dot: the C function, not the `time` buffer
        // ctime yields the fixed layout "Www Mmm dd hh:mm:ss yyyy\n"
        const p = ctime(&ct);
        assert(p);
        // Bound every write by the destination size instead of relying
        // solely on the precisions in the format strings.
        snprintf(&date[0], date.length, "%.6s %.4s", p + 4, p + 20);
        snprintf(&time[0], time.length, "%.8s", p + 11);
        snprintf(&timestamp[0], timestamp.length, "%.24s", p);
    }
}
3059
6384eff5
IB
private enum
{
    LS = 0x2028, // U+2028, line separator
    PS = 0x2029, // U+2029, paragraph separator
}
3062
3063/********************************************
3064 * Do our own char maps
3065 */
3066private static immutable cmtable = ()
3067{
3068 ubyte[256] table;
3069 foreach (const c; 0 .. table.length)
3070 {
3071 if ('0' <= c && c <= '7')
3072 table[c] |= CMoctal;
3073 if (c_isxdigit(c))
3074 table[c] |= CMhex;
3075 if (c_isalnum(c) || c == '_')
3076 table[c] |= CMidchar;
3077
3078 switch (c)
3079 {
3080 case 'x': case 'X':
3081 case 'b': case 'B':
3082 table[c] |= CMzerosecond;
3083 break;
3084
3085 case '0': .. case '9':
3086 case 'e': case 'E':
3087 case 'f': case 'F':
3088 case 'l': case 'L':
3089 case 'p': case 'P':
3090 case 'u': case 'U':
3091 case 'i':
3092 case '.':
3093 case '_':
3094 table[c] |= CMzerosecond | CMdigitsecond;
3095 break;
3096
3097 default:
3098 break;
3099 }
3100
3101 switch (c)
3102 {
3103 case '\\':
3104 case '\n':
3105 case '\r':
3106 case 0:
3107 case 0x1A:
3108 case '\'':
3109 break;
3110 default:
3111 if (!(c & 0x80))
3112 table[c] |= CMsinglechar;
3113 break;
3114 }
3115 }
3116 return table;
3117}();
3118
// Bit flags stored in the entries of `cmtable`
private
{
    enum CMoctal       = 1 << 0; // '0' .. '7'
    enum CMhex         = 1 << 1; // 0-9, a-f, A-F
    enum CMidchar      = 1 << 2; // ASCII letter, digit or '_'
    enum CMzerosecond  = 1 << 3; // see isZeroSecond
    enum CMdigitsecond = 1 << 4; // see isDigitSecond
    enum CMsinglechar  = 1 << 5; // see issinglechar
}
3128
/// Returns: true if `cmtable` flags `c` as CMoctal
private bool isoctal(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMoctal) != 0;
}
3133
/// Returns: true if `cmtable` flags `c` as CMhex
private bool ishex(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMhex) != 0;
}
3138
/// Returns: true if `cmtable` flags `c` as CMidchar
private bool isidchar(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMidchar) != 0;
}
3143
/// Returns: true if `cmtable` flags `c` as CMzerosecond
private bool isZeroSecond(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMzerosecond) != 0;
}
3148
/// Returns: true if `cmtable` flags `c` as CMdigitsecond
private bool isDigitSecond(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMdigitsecond) != 0;
}
3153
/// Returns: true if `cmtable` flags `c` as CMsinglechar
private bool issinglechar(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMsinglechar) != 0;
}
3158
/// Returns: true if `c` is an ASCII hexadecimal digit (0-9, a-f, A-F)
private bool c_isxdigit(const int c) pure @nogc @safe
{
    switch (c)
    {
        case '0': .. case '9':
        case 'a': .. case 'f':
        case 'A': .. case 'F':
            return true;
        default:
            return false;
    }
}
3165
/// Returns: true if `c` is an ASCII letter or decimal digit
private bool c_isalnum(const int c) pure @nogc @safe
{
    switch (c)
    {
        case '0': .. case '9':
        case 'a': .. case 'z':
        case 'A': .. case 'Z':
            return true;
        default:
            return false;
    }
}
3172
3173/******************************* Unittest *****************************************/
3174
5fee5ec3
IB
// Valid escape sequences: each must decode to the expected value,
// consume the whole input, and raise no diagnostic.
unittest
{
    import dmd.console;

    // Any diagnostic raised while this handler is installed aborts the test
    nothrow bool assertDiagnosticHandler(const ref Loc loc, Color headerColor, const(char)* header,
                                   const(char)* format, va_list ap, const(char)* p1, const(char)* p2)
    {
        assert(0);
    }
    diagnosticHandler = &assertDiagnosticHandler;

    static void test(T)(string sequence, T expected, bool Ccompile = false)
    {
        const(char)* cursor = sequence.ptr;
        const(char)* stop = sequence.ptr + sequence.length;
        Lexer lexer = new Lexer();
        const decoded = lexer.escapeSequence(Loc.initial, cursor, Ccompile);
        assert(expected == decoded);
        assert(cursor == stop);
    }

    // single-character escapes
    test(`'`, '\'');
    test(`"`, '"');
    test(`?`, '?');
    test(`\`, '\\');
    test(`0`, '\0');
    test(`a`, '\a');
    test(`b`, '\b');
    test(`f`, '\f');
    test(`n`, '\n');
    test(`r`, '\r');
    test(`t`, '\t');
    test(`v`, '\v');

    // \x with two hex digits
    test(`x00`, 0x00);
    test(`xff`, 0xff);
    test(`xFF`, 0xff);
    test(`xa7`, 0xa7);
    test(`x3c`, 0x3c);
    test(`xe2`, 0xe2);

    // octal
    test(`1`, '\1');
    test(`42`, '\42');
    test(`357`, '\357');

    // \u with four hex digits
    test(`u1234`, '\u1234');
    test(`uf0e4`, '\uf0e4');

    // \U with eight hex digits
    test(`U0001f603`, '\U0001f603');

    // named character entities
    test(`&quot;`, '"');
    test(`&lt;`, '<');
    test(`&gt;`, '>');

    diagnosticHandler = null;
}
6384eff5 3228
5fee5ec3
IB
// Invalid escape sequences: each must raise exactly the expected error
// message, return the expected value, and consume the expected number
// of characters.
unittest
{
    import dmd.console;
    string expected;
    bool gotError;

    // Formats the diagnostic and compares it with `expected`
    nothrow bool expectDiagnosticHandler(const ref Loc loc, Color headerColor, const(char)* header,
                                         const(char)* format, va_list ap, const(char)* p1, const(char)* p2)
    {
        assert(cast(Classification)headerColor == Classification.error);

        gotError = true;
        char[100] buffer = void;
        // Bounded formatting: a message longer than the buffer must not
        // overrun the stack.
        const written = vsnprintf(buffer.ptr, buffer.length, format, ap);
        // vsnprintf reports the untruncated length, so clamp the slice to
        // what was actually stored (the last byte holds the terminator).
        size_t stored = 0;
        if (written > 0)
            stored = written < buffer.length ? written : buffer.length - 1;
        auto actual = buffer[0 .. stored];
        assert(expected == actual);
        return true;
    }

    diagnosticHandler = &expectDiagnosticHandler;

    void test(string sequence, string expectedError, dchar expectedReturnValue, uint expectedScanLength, bool Ccompile = false)
    {
        // Restore the global error count afterwards: these errors are intentional
        uint errors = global.errors;
        gotError = false;
        expected = expectedError;
        auto p = cast(const(char)*)sequence.ptr;
        Lexer lexer = new Lexer();
        auto actualReturnValue = lexer.escapeSequence(Loc.initial, p, Ccompile);
        assert(gotError);
        assert(expectedReturnValue == actualReturnValue);

        auto actualScanLength = p - sequence.ptr;
        assert(expectedScanLength == actualScanLength);
        global.errors = errors;
    }

    test("c", `undefined escape sequence \c`, 'c', 1);
    test("!", `undefined escape sequence \!`, '!', 1);
    test("&quot;", `undefined escape sequence \&`, '&', 1, true);

    test("x1", `escape hex sequence has 1 hex digits instead of 2`, '\x01', 2);

    test("u1"  , `escape hex sequence has 1 hex digits instead of 4`,   0x1, 2);
    test("u12" , `escape hex sequence has 2 hex digits instead of 4`,  0x12, 3);
    test("u123", `escape hex sequence has 3 hex digits instead of 4`, 0x123, 4);

    test("U0"      , `escape hex sequence has 1 hex digits instead of 8`,       0x0, 2);
    test("U00"     , `escape hex sequence has 2 hex digits instead of 8`,      0x00, 3);
    test("U000"    , `escape hex sequence has 3 hex digits instead of 8`,     0x000, 4);
    test("U0000"   , `escape hex sequence has 4 hex digits instead of 8`,    0x0000, 5);
    test("U0001f"  , `escape hex sequence has 5 hex digits instead of 8`,   0x0001f, 6);
    test("U0001f6" , `escape hex sequence has 6 hex digits instead of 8`,  0x0001f6, 7);
    test("U0001f60", `escape hex sequence has 7 hex digits instead of 8`, 0x0001f60, 8);

    test("ud800"    , `invalid UTF character \U0000d800`, '?', 5);
    test("udfff"    , `invalid UTF character \U0000dfff`, '?', 5);
    test("U00110000", `invalid UTF character \U00110000`, '?', 9);

    test("xg0"      , `undefined escape hex sequence \xg`, 'g', 2);
    test("ug000"    , `undefined escape hex sequence \ug`, 'g', 2);
    test("Ug0000000", `undefined escape hex sequence \Ug`, 'g', 2);

    test("&BAD;", `unnamed character entity &BAD;`  , '?', 5);
    test("&quot", `unterminated named entity &quot;`, '?', 5);

    test("400", `escape octal sequence \400 is larger than \377`, 0x100, 3);

    diagnosticHandler = null;
}
6384eff5
IB
3299
// Smoke test: a lone keyword, then end of file reported repeatedly.
unittest
{
    string text = "int"; // We rely on the implicit null-terminator
    scope Lexer lex1 = new Lexer(null, text.ptr, 0, text.length, 0, 0);

    assert(lex1.nextToken() == TOK.int32);

    // Once the input is exhausted every further call keeps returning endOfFile
    foreach (_; 0 .. 3)
        assert(lex1.nextToken() == TOK.endOfFile);
}
3318
// Malformed input must still end in TOK.endOfFile within a bounded
// number of nextToken calls.
unittest
{
    // We don't want to see Lexer error output during these tests.
    uint errors = global.startGagging();
    scope(exit) global.endGagging(errors);

    static immutable char[][] testcases =
    [   // Testcase must end with 0 or 0x1A.
        [0], // not malformed, but pathological
        ['\'', 0],
        ['\'', 0x1A],
        ['{', '{', 'q', '{', 0],
        [0xFF, 0],
        [0xFF, 0x80, 0],
        [0xFF, 0xFF, 0],
        [0xFF, 0xFF, 0],
        ['x', '"', 0x1A],
    ];

    foreach (testcase; testcases)
    {
        // The terminator is part of the array but not of the reported length
        scope Lexer lex2 = new Lexer(null, testcase.ptr, 0, testcase.length-1, 0, 0);

        // Allow at most testcase.length calls to reach the end of file
        TOK tok = lex2.nextToken();
        for (size_t calls = 1; tok != TOK.endOfFile && calls < testcase.length; ++calls)
            tok = lex2.nextToken();
        assert(tok == TOK.endOfFile);

        // and the lexer must stay there
        tok = lex2.nextToken();
        assert(tok == TOK.endOfFile);
    }
}
This page took 0.693601 seconds and 5 git commands to generate.