1 module zua.parser.lexer;
2 import zua.diagnostic;
3 import std.variant;
4 import std.utf;
5 import std.typecons;
6 import std.string;
7 import std.container.rbtree;
8 import std.range.primitives;
9 import std.format;
10 import std.conv;
11
12 /** Describes the type of a token */
/** Describes the type of a token */
enum TokenType {
	Eof, /// end of input; produced once the lexer has no characters left
	Identifier, /// a name made of letters, digits, and underscores
	Keyword, /// a reserved Lua word such as `and`, `function`, or `end`
	String, /// a quoted string or a long-bracket (`[[...]]`) string literal
	Number, /// a numeric literal (decimal, hexadecimal, or exponent form)
	Symbol, /// an operator or punctuation mark such as `==`, `..`, or `{`
	Comment, /// a single-line comment introduced by `--`
	BlockComment /// a long-bracket comment, `--[[ ... ]]`
}
23
24 /** A single token in a source file */
/** A single token in a source file */
struct Token {

	/** Describes the type of this token */
	TokenType type;

	/** The raw value of this token (the exact source text it was lexed from) */
	string rawValue;

	/** The parsed value of this token (e.g. string escapes resolved, comment markers stripped) */
	string value;

	/** Only used for numbers; the numeric value of this token (unspecified for other token types) */
	double numValue;

	/** The index at which this token is located (byte offset into the source string) */
	size_t index;

}
43
/** Whether comment tokens are skipped (`Yes`) or returned like any other token (`No`) when lexing */
alias IgnoreCommentTokens = Flag!"ignoreCommentTokens";
/** Whether `Lexer.consume` should still consume a mismatched token (`Yes`) or leave it for the caller (`No`) */
alias ToConsume = Flag!"toConsume";
46
/** A snapshot of lexer progress, pushed by `Lexer.save` and popped by `restore`/`discard` */
private struct LexerState {
	size_t diagnosticsLength; // diagnostics count at save time; restore() truncates back to this
	size_t index; // byte offset into the source at save time
	Nullable!Token last; // value of `Lexer.last` at save time
}
52
53 /** A lexer for Lua */
/** A lexer for Lua */
final class Lexer {

	private string code;
	private size_t index;
	private Diagnostic[]* diagnostics;

	/// Stack of snapshots created by save(); restore()/discard() pop from it
	private LexerState[] stateStack;

	/// Reserved words that lex as Keyword rather than Identifier
	private RedBlackTree!string keywords;

	/// All symbols; ordered so longer symbols are matched before their prefixes
	private string[] symbols;

	/** The last consumed token */
	Nullable!Token last;

	/** Create a new lexer from a piece of code */
	this(string code, ref Diagnostic[] diagnostics) {
		this.code = code;
		this.diagnostics = &diagnostics;

		keywords = redBlackTree!string(
			"and", "break", "do", "else", "elseif",
			"end", "false", "for", "function", "if",
			"in", "local", "nil", "not", "or",
			"repeat", "return", "then", "true", "until", "while"
		);

		// Order matters: the symbol scan takes the first match, so "..." must
		// precede ".." and ".", "==" must precede "=", and so on.
		symbols = [
			"...",

			"..",
			"==", "~=", "<=", ">=",

			"+", "-", "*", "/", "%", "^", "#",
			"<", ">", "=",
			"(", ")", "{", "}", "[", "]",
			";", ":", ",", ".",
		];
	}

	/** Get the source code that this Lexer is lexing */
	string source() {
		return code;
	}

	/** Save the current state of the lexer */
	void save() {
		const LexerState state = {
			diagnosticsLength: (*diagnostics).length,
			index: index,
			last: last
		};

		stateStack ~= state;
	}

	/** Restore the previously-saved lexer state, dropping any diagnostics emitted since the save */
	void restore() {
		const LexerState state = stateStack.back;
		(*diagnostics) = (*diagnostics)[0..state.diagnosticsLength];
		index = state.index;
		last = state.last;

		discard();
	}

	/** Discard the previously-saved lexer state */
	void discard() {
		stateStack.popBack();
	}

	/** Peek a segment of characters; returns "" if fewer than `length` characters remain */
	private dstring peekChars(size_t length) {
		dstring result;

		size_t prevIndex = index;

		foreach (i; 0..length) {
			if (prevIndex >= code.length) return "";
			result ~= code.decode!(Yes.useReplacementDchar)(prevIndex);
		}

		return result;
	}

	/** Peek the next char; returns 0 at end of input */
	private dchar peekChar() {
		if (charEof) return 0;
		size_t prevIndex = index;
		return code.decode!(Yes.useReplacementDchar)(prevIndex);
	}

	/** Consume a segment of characters; returns "" if the input runs out part-way */
	private dstring nextChars(size_t length) {
		dstring result;

		foreach (i; 0..length) {
			if (charEof) return "";
			result ~= code.decode!(Yes.useReplacementDchar)(index);
		}

		return result;
	}

	/** Consume the next char in the lexer; returns 0 at end of input */
	private dchar nextChar() {
		if (charEof) return 0;
		return code.decode!(Yes.useReplacementDchar)(index);
	}

	/** Check if we're at the end of the file */
	private bool charEof() {
		return index >= code.length;
	}

	/** Read a string for as long as the given predicate holds true */
	private string readWhile(bool delegate(dchar c) predicate) {
		string result;
		while (!charEof && predicate(peekChar())) {
			result ~= nextChar();
		}
		return result;
	}

	/** Skip spaces, tabs, carriage returns, and newlines */
	private void skipWhitespace() {
		readWhile(c => " \t\r\n"d.indexOf(c) != -1);
	}

	/** True for characters that may appear in an identifier (letters, digits, '_') */
	private bool isIdentifierCharacter(dchar c) {
		return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_';
	}

	/** True for characters that may appear in a decimal numeral (digits and '.') */
	private bool isNumeralCharacter(dchar c) {
		return (c >= '0' && c <= '9') || c == '.';
	}

	/** True for hexadecimal digits (0-9, a-f, A-F) */
	private bool isHexadecimalCharacter(dchar c) {
		return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
	}

	/**
	 * Try to read a long-bracket literal ([[...]], [=[...]=], ...) starting at the current '['.
	 * Returns a String token on success; returns null (and rewinds) if this is not a long bracket.
	 * `tokenName` is only used in diagnostic messages ("string" or "block comment").
	 */
	private Token* readLongString(string tokenName) {
		// assume char = '['
		const size_t prevIndex = index;
		nextChar();
		string open = "[";
		string closeSeq = "]";
		uint level = 0;
		// count the '='s between the brackets; the closing sequence must use
		// exactly the same number of '='s (the bracket "level")
		while (peekChar() == '=') {
			level++;
			open ~= '=';
			closeSeq ~= '=';
			nextChar();
		}
		closeSeq ~= ']';
		if (peekChar() == '[') {
			int closeLevel = -1; // '='-count of the close candidate in progress; -1 = no candidate
			bool closed = false;
			size_t closeBeginP = -1; // start of the most recent close candidate
			size_t closeBegin = -1; // span of the last complete-but-wrong-level close, for quickfixes
			size_t closeEnd = -1;
			string rawValue = open ~ readWhile(delegate(ic) {
				if (ic == ']') {
					if (closeBeginP != -1) {
						// a candidate close just ended at another ']': remember its span so
						// an "unclosed" diagnostic can offer to fix its level
						closeBegin = closeBeginP;
						closeEnd = index + 1;
					}

					if (closeLevel == level) {
						closed = true;
						return false; // stop before consuming the final ']'
					}
					else {
						// this ']' starts a new close candidate
						closeLevel = 0;
						closeBeginP = index;
					}
				}
				else if (ic == '=' && closeLevel != -1) {
					closeLevel++;
				}
				else {
					closeLevel = -1; // any other character interrupts a close candidate
				}
				return true;
			});

			if (!closed) {
				Diagnostic err;
				err.type = DiagnosticType.Error;
				err.message = "unclosed " ~ tokenName;
				err.add(index);
				const Quickfix close = {
					message: "add closing sequence to " ~ tokenName,
					ranges: [QuickfixRange(index, index, closeSeq)]
				};
				err.quickfix ~= close;
				if (closeBegin != -1) {
					const Quickfix fixExistingClose = {
						message: "fix existing closing sequence to match level of " ~ tokenName,
						ranges: [QuickfixRange(closeBegin, closeEnd, closeSeq)]
					};
					err.quickfix ~= fixExistingClose;
				}
				*diagnostics ~= err;
			}
			else {
				nextChar();
				rawValue ~= "]";
			}

			// Strip the opening/closing sequences. When unclosed, no closing
			// sequence was consumed, so only strip the front — slicing the back
			// off too could run out of bounds (e.g. for the input "[==[x").
			string value;
			if (closed) value = rawValue[2 + level .. $ - (2 + level)];
			else value = rawValue[2 + level .. $];

			// a newline immediately following the opening bracket is ignored
			if (value.length > 0 && value[0] == '\n') {
				value = value[1..$];
			}

			Token* result = new Token;
			result.value = value;
			result.rawValue = rawValue;
			result.type = TokenType.String;
			result.index = prevIndex;
			return result;
		}
		else { // turns out it wasn't a string...
			index = prevIndex;
			return null;
		}
	}

	/** Lex a single token without updating `last`; returns an Eof token at end of input */
	private Token nextInternal() {
		skipWhitespace();

		Token result;
		result.index = index;

		if (charEof) {
			result.type = TokenType.Eof;
			return result;
		}

		const dchar c = peekChar();

		if (peekChars(2) == "--"d) {
			nextChars(2);

			if (peekChar() == '[') {
				const auto res = readLongString("block comment");
				if (res !is null) {
					result = *res;
					result.type = TokenType.BlockComment;
					return result;
				}
			}

			// not a long comment: a line comment runs up to (but not including) the newline
			result.rawValue = result.value = readWhile(delegate(c) {
				return c != '\n';
			});

			result.type = TokenType.Comment;
			return result;
		}

		const dstring hexPrefix = peekChars(2);
		if (hexPrefix == "0x"d || hexPrefix == "0X"d) { // Lua accepts both "0x" and "0X"
			nextChars(2);
			const string value = readWhile(delegate(c) {
				return isHexadecimalCharacter(c);
			});

			if (value == "") { // if it's just 0x, it's clearly not a number
				index -= 2; // put it back; the decimal branch below will lex the leading '0'
			}
			else {
				ulong hexValue;
				try {
					hexValue = value.to!ulong(16);
				}
				catch (ConvOverflowException) {
					// clamp out-of-range literals instead of crashing the lexer,
					// mirroring the strtoul saturation behavior Lua relies on
					hexValue = ulong.max;
				}
				result.value = hexValue.to!string;
				result.numValue = hexValue.to!double;
				result.rawValue = hexPrefix.toUTF8 ~ value;
				result.type = TokenType.Number;
				return result;
			}
		}

		// deliberately not `else if`: the "0x" put-back above must fall through here
		// so a lone "0x" lexes as the Number 0 rather than a digit-leading Identifier
		if (isNumeralCharacter(c)) {
			bool dot = false;
			string value = readWhile(delegate(c) {
				if (c == '.') {
					if (dot) return false; // if we already have a dot, don't read two dots
					dot = true;
				}

				return isNumeralCharacter(c);
			});

			if (value == ".") { // if it's just a dot, it's clearly not a number
				index--; // put it back and continue
			}
			else {
				double numValue = value.to!double;
				const dchar expChar = peekChar();
				if (expChar == 'e' || expChar == 'E') { // Lua accepts both 'e' and 'E'
					value ~= expChar;
					nextChar();
					bool pos = true;
					if (peekChar() == '+') {
						value ~= '+';
						nextChar();
					}
					else if (peekChar() == '-') {
						value ~= '-';
						pos = false;
						nextChar();
					}
					const string exp = readWhile(delegate(c) {
						return isNumeralCharacter(c) && c != '.';
					});
					value ~= exp;
					if (exp == "") {
						// "1e" / "1e+" with no digits: report a malformed number
						// instead of crashing inside to!double("")
						Diagnostic err;
						err.type = DiagnosticType.Error;
						err.message = "malformed number";
						err.add(result.index, index);
						*diagnostics ~= err;
					}
					else {
						numValue *= 10.0 ^^ ((!pos ? -1 : 1) * exp.to!double);
					}
				}
				result.numValue = numValue;
				result.value = value;
				result.rawValue = value;
				result.type = TokenType.Number;
				return result;
			}
		}

		if (isIdentifierCharacter(c)) {
			const string value = readWhile(&isIdentifierCharacter);
			result.value = value;
			result.rawValue = value;
			result.type = value in keywords ? TokenType.Keyword : TokenType.Identifier;
			return result;
		}

		if (c == '"' || c == '\'') {
			bool escape = false;
			int digits = 0; // number of digits consumed so far for a \ddd escape (0 = none active)
			int codepoint = 0; // accumulated value of the \ddd escape
			string value;
			bool closed = false;
			string rawValue = [nextChar()].toUTF8 ~ readWhile(delegate(ic) {
				if (escape) {
					if (ic == 'a') value ~= '\a';
					else if (ic == 'b') value ~= '\b';
					else if (ic == 'f') value ~= '\f';
					else if (ic == 'n') value ~= '\n';
					else if (ic == 'r') value ~= '\r';
					else if (ic == 't') value ~= '\t';
					else if (ic == 'v') value ~= '\v';
					else if (ic >= '0' && ic <= '9') {
						// start of a \ddd decimal escape (up to three digits)
						digits = 1;
						codepoint = ic - '0';
					}
					else value ~= ic; // \\, \", \', \<newline>, etc. escape to themselves
					escape = false;
				}
				else if (ic == '\n') {
					return false; // unescaped newline: the string is unterminated
				}
				else if (ic >= '0' && ic <= '9' && digits > 0 && digits < 3) {
					digits++;
					codepoint *= 10;
					codepoint += ic - '0';
				}
				else {
					if (digits > 0) {
						// NOTE(review): values above 255 are silently truncated by this
						// cast; Lua proper rejects \ddd escapes larger than a byte
						value ~= cast(char)codepoint;
						digits = 0;
					}

					if (ic == '\\') escape = true;
					else if (ic == c) {
						closed = true;
						return false; // stop before consuming the closing quote
					}
					else value ~= ic;
				}
				return true;
			});

			if (digits > 0) value ~= cast(char)codepoint; // flush a \ddd escape at end of string

			if (!closed) {
				Diagnostic err;
				err.type = DiagnosticType.Error;
				err.message = "unclosed string";
				err.add(index);
				if (escape) {
					const Quickfix close = {
						message: "remove extraneous '\\' and add closing quote to string",
						ranges: [QuickfixRange(index - 1, index, [c].toUTF8)]
					};
					err.quickfix ~= close;
				}
				else {
					const Quickfix close = {
						message: "add closing quote to string",
						ranges: [QuickfixRange(index, index, [c].toUTF8)]
					};
					err.quickfix ~= close;
				}
				*diagnostics ~= err;
			}
			else {
				rawValue ~= [nextChar()].toUTF8;
			}

			result.value = value;
			result.rawValue = rawValue;
			result.type = TokenType.String;
			return result;
		}

		if (c == '[') {
			const auto res = readLongString("string");
			if (res !is null) return *res;
		}

		foreach (symbol; symbols) {
			const auto utf32sym = symbol.toUTF32;
			if (peekChars(utf32sym.length) == utf32sym) {
				result.value = symbol;
				result.rawValue = symbol;
				result.type = TokenType.Symbol;
				nextChars(utf32sym.length);
				return result;
			}
		}

		// nothing matched: report the stray character and lex the next token instead
		nextChar();
		Diagnostic err;
		err.type = DiagnosticType.Error;
		err.message = "unknown character";
		err.add(result.index, index);
		const Quickfix remove = {
			message: "remove unknown character",
			ranges: [QuickfixRange(result.index, index, "")]
		};
		err.quickfix ~= remove;
		*diagnostics ~= err;

		return nextInternal();
	}

	/** Consume the next token and return it */
	Token next(IgnoreCommentTokens ignoreCommentTokens = Yes.ignoreCommentTokens)() {
		static if (ignoreCommentTokens == Yes.ignoreCommentTokens) {
			Token result = next!(No.ignoreCommentTokens)();
			if (result.type == TokenType.Comment || result.type == TokenType.BlockComment) return next();
			else return result;
		}
		else {
			const Token result = nextInternal();
			last = result;
			return result;
		}
	}

	/** Peek the next token and return it, leaving the lexer in the same state as it was before */
	Token peek(IgnoreCommentTokens ignoreCommentTokens = Yes.ignoreCommentTokens)() {
		save();
		const Token result = next!ignoreCommentTokens();
		restore();
		return result;
	}

	/**
	 * Attempt to consume the next token.
	 * If the token is missing and toConsume is Yes, the following token will be consumed regardless of whether it matches the given pattern.
	 * Returns (false, Token) if the token is not of the required type; returns (true, Token) otherwise
	 */
	Tuple!(bool, Token) consume(ToConsume toConsume = Yes.toConsume, string message = "")
	(TokenType type, string value = "") {
		save();

		Token token;
		if (type == TokenType.Comment || type == TokenType.BlockComment) token = next!(No.ignoreCommentTokens);
		else token = next();

		if (token.type == type && (value == "" || token.value == value)) {
			// drop the snapshot; without this the state stack leaked one
			// entry for every successful consume
			discard();
			return tuple(true, token);
		}
		else {
			static if (toConsume == No.toConsume) restore();
			else discard();

			string finalMessage = "expected " ~ type.to!string;

			if (value != "") finalMessage ~= " '" ~ value ~ "'";
			if (message != "") finalMessage ~= " " ~ message;

			finalMessage ~= ", got " ~ token.type.to!string;

			Diagnostic err;
			err.type = DiagnosticType.Error;
			err.message = finalMessage;
			err.add(token);
			*diagnostics ~= err;

			return tuple(false, token);
		}
	}

	/** Check if the next token matches the given pattern */
	bool isNext(TokenType type, string value = "") {
		Token token;
		if (type == TokenType.Comment || type == TokenType.BlockComment) token = peek!(No.ignoreCommentTokens);
		else token = peek();

		return token.type == type && (value == "" || token.value == value);
	}

	/**
	 * Attempt to consume the next token if it matches the given pattern.
	 * Has no effect on the state of the lexer if the token is not found.
	 */
	bool tryConsume(TokenType type, string value = "") {
		if (isNext(type, value)) {
			consume(type, value);
			return true;
		}
		else return false;
	}

	/** Check if the next token is EOF */
	bool eof() {
		return isNext(TokenType.Eof);
	}

}
579
unittest {
	// isNext/peek must not consume anything: the block comment, the identifier
	// behind it, and a negative match are all visible before any consumption.
	Diagnostic[] diags;
	Lexer lexer = new Lexer(q"(--[[ hello
world ]] ident0__)", diags);

	assert(lexer.isNext(TokenType.BlockComment, " hello\nworld "));
	assert(lexer.isNext(TokenType.Identifier));
	assert(!lexer.isNext(TokenType.Identifier, "a"));

	// explicitly asking for comment tokens yields the block comment itself
	const Token comment = lexer.next!(No.ignoreCommentTokens);
	assert(comment.type == TokenType.BlockComment);
	assert(comment.value == " hello\nworld ");

	assert(diags.length == 0);
}
595
unittest {
	// next() with the default flag skips comment tokens and lands directly
	// on the identifier that follows the block comment.
	Diagnostic[] diags;
	Lexer lexer = new Lexer(q"(--[[ hello
world ]] ident0__)", diags);

	const Token tok = lexer.next();
	assert(tok.type == TokenType.Identifier);
	assert(tok.value == "ident0__");

	assert(diags.length == 0);
}
607
unittest {
	// consume() succeeds when only the expected token type is supplied.
	Diagnostic[] diags;
	Lexer lexer = new Lexer(q"(--[[ hello
world ]] ident0__)", diags);

	const Tuple!(bool, Token) res = lexer.consume(TokenType.Identifier);
	assert(res[0] == true);
	assert(res[1].type == TokenType.Identifier);
	assert(res[1].value == "ident0__");

	assert(diags.length == 0);
}
620
unittest {
	// consume() succeeds when both the expected type and exact value match.
	Diagnostic[] diags;
	Lexer lexer = new Lexer(q"(--[[ hello
world ]] ident0__)", diags);

	const Tuple!(bool, Token) res = lexer.consume(TokenType.Identifier, "ident0__");
	assert(res[0] == true);
	assert(res[1].type == TokenType.Identifier);
	assert(res[1].value == "ident0__");

	assert(diags.length == 0);
}
633
unittest {
	// A value mismatch fails the consume, still hands back the actual token,
	// and records exactly one diagnostic.
	Diagnostic[] diags;
	Lexer lexer = new Lexer(q"(--[[ hello
world ]] ident0__)", diags);

	const Tuple!(bool, Token) res = lexer.consume(TokenType.Identifier, "asd");
	assert(res[0] == false);
	assert(res[1].type == TokenType.Identifier);
	assert(res[1].value == "ident0__");

	assert(diags.length == 1);
}
646
unittest {
	// With the default Yes.toConsume, a failed consume still eats the
	// mismatched token, so the identifier is what's seen next.
	Diagnostic[] diags;
	Lexer lexer = new Lexer(q"(--[[ hello
world ]] 2. ident0__)", diags);

	const Tuple!(bool, Token) res = lexer.consume(TokenType.Identifier);
	assert(res[0] == false);
	assert(res[1].type == TokenType.Number);
	assert(res[1].value == "2.");

	assert(lexer.peek().type == TokenType.Identifier);

	assert(diags.length == 1);
}
661
unittest {
	// With No.toConsume, a failed consume leaves the lexer untouched:
	// the mismatched number is still the next token afterwards.
	Diagnostic[] diags;
	Lexer lexer = new Lexer(q"(--[[ hello
world ]] 2. ident0__)", diags);

	const Tuple!(bool, Token) res = lexer.consume!(No.toConsume)(TokenType.Identifier);
	assert(res[0] == false);
	assert(res[1].type == TokenType.Number);
	assert(res[1].value == "2.");

	assert(lexer.peek().type == TokenType.Number);

	assert(diags.length == 1);
}
676
unittest {
	// All five literal spellings of the same string must parse identically:
	// single quotes, double quotes with escapes, \ddd decimal escapes,
	// and long brackets at levels 0 and 2.
	Diagnostic[] diagnostics;
	const Token a = new Lexer(q"('alo\n123"')", diagnostics).nextInternal();
	const Token b = new Lexer(q"("alo\n123\"")", diagnostics).nextInternal();
	const Token c = new Lexer(q"('\97lo\10\04923"')", diagnostics).nextInternal();
	const Token d = new Lexer(q"([[alo
123"]])", diagnostics).nextInternal();
	const Token e = new Lexer(q"([==[
alo
123"]==])", diagnostics).nextInternal();

	// edge cases: empty long string, lower-level brackets inside the body,
	// and a full bracketed string nested inside a level-1 long string
	const Token f = new Lexer(q"([==[]==])", diagnostics).nextInternal();
	const Token g = new Lexer(q"([==[]]]==])", diagnostics).nextInternal();
	const Token h = new Lexer(q"([=[[==[]==]]=])", diagnostics).nextInternal();

	assert(a.type == TokenType.String);
	assert(b.type == TokenType.String);
	assert(c.type == TokenType.String);
	assert(d.type == TokenType.String);
	assert(e.type == TokenType.String);
	assert(f.type == TokenType.String);
	assert(g.type == TokenType.String);
	assert(h.type == TokenType.String);

	assert(a.value == "alo\n123\"");
	assert(a.value == b.value);
	assert(a.value == c.value);
	assert(a.value == d.value);
	assert(a.value == e.value);

	assert(f.value == "");
	assert(g.value == "]]");
	assert(h.value == "[==[]==]");

	assert(diagnostics.length == 0);

	// unterminated strings: each produces exactly one diagnostic, and
	// unterminated long strings additionally offer a fix for the
	// wrong-level closing sequence they do contain
	Diagnostic[] da;

	new Lexer(q"("x\")", da).nextInternal();
	new Lexer(q"([==[x]=])", da).nextInternal();
	new Lexer(q"([==[x]=]])", da).nextInternal();

	assert(da.length == 3);

	assert(da[0].quickfix.length == 1);
	assert(da[1].quickfix.length == 2);
	assert(da[2].quickfix.length == 2);

	assert(da[0].quickfix[0].apply(q"("x\")") == q"("x\"")");

	assert(da[1].quickfix[0].apply(q"([==[x]=])") == q"([==[x]=]]==])");
	assert(da[1].quickfix[1].apply(q"([==[x]=])") == q"([==[x]==])");

	assert(da[2].quickfix[0].apply(q"([==[x]=]])") == q"([==[x]=]]]==])");
	assert(da[2].quickfix[1].apply(q"([==[x]=]])") == q"([==[x]=]==])");
}
733
unittest {
	// skipWhitespace advances the byte index past blanks/CR/LF/tabs;
	// nextChar then consumes the remaining characters and returns 0 at EOF.
	Diagnostic[] diagnostics;
	Lexer l = new Lexer(" \r\r\n \thi", diagnostics);
	assert(l.index == 0);
	l.skipWhitespace();
	assert(l.index == 9);
	assert(l.peekChar() == 'h');
	assert(l.nextChar() == 'h');
	assert(l.nextChar() == 'i');
	assert(l.nextChar() == 0);

	assert(diagnostics.length == 0);
}
747
unittest {
	// A source consisting only of whitespace: skipWhitespace runs to the end
	// and peekChar reports end-of-input as 0.
	Diagnostic[] diagnostics;
	Lexer l = new Lexer(" \r\r\n \t", diagnostics);
	assert(l.index == 0);
	l.skipWhitespace();
	assert(l.index == 8);
	assert(l.peekChar() == 0);

	assert(diagnostics.length == 0);
}
758
unittest {
	// Reserved words lex as Keyword, everything else alphanumeric as
	// Identifier, and each token's index points at its first byte.
	Diagnostic[] diags;
	Lexer lexer = new Lexer(" hi and bye", diags);

	const Token first = lexer.nextInternal();
	const Token middle = lexer.nextInternal();
	const Token final_ = lexer.nextInternal();

	assert(first.type == TokenType.Identifier);
	assert(middle.type == TokenType.Keyword);
	assert(final_.type == TokenType.Identifier);

	assert(first.value == "hi");
	assert(middle.value == "and");
	assert(final_.value == "bye");

	assert(first.index == 1);
	assert(middle.index == 4);
	assert(final_.index == 8);

	assert(diags.length == 0);
}
781
unittest {
	// Keyword recognition is case-sensitive: "And" is a plain Identifier,
	// not the `and` keyword. Indices still point at each word's first byte.
	Diagnostic[] diags;
	Lexer lexer = new Lexer(" hi And bye", diags);

	const Token first = lexer.nextInternal();
	const Token middle = lexer.nextInternal();
	const Token final_ = lexer.nextInternal();

	assert(first.type == TokenType.Identifier);
	assert(middle.type == TokenType.Identifier);
	assert(final_.type == TokenType.Identifier);

	assert(first.value == "hi");
	assert(middle.value == "And");
	assert(final_.value == "bye");

	assert(first.index == 1);
	assert(middle.index == 4);
	assert(final_.index == 8);

	assert(diags.length == 0);
}