module zua.parser.lexer;
import zua.diagnostic;
import std.variant;
import std.utf;
import std.typecons;
import std.string;
import std.container.rbtree;
import std.range.primitives;
import std.format;
import std.conv;

/** Describes the type of a token */
enum TokenType {
	Eof,
	Identifier,
	Keyword,
	String,
	Number,
	Symbol,
	Comment,
	BlockComment
}

/** A single token in a source file */
struct Token {

	/** Describes the type of this token */
	TokenType type;

	/** The raw value of this token */
	string rawValue;

	/** The parsed value of this token */
	string value;

	/** Only used for numbers; the numeric value of this token */
	double numValue;

	/** The index at which this token is located */
	size_t index;

}

alias IgnoreCommentTokens = Flag!"ignoreCommentTokens";
alias ToConsume = Flag!"toConsume";

/** A snapshot of the lexer's mutable state, used by save/restore/discard */
private struct LexerState {
	size_t diagnosticsLength;
	size_t index;
	Nullable!Token last;
}

/** A lexer for Lua */
final class Lexer {

	private string code;
	private size_t index;
	private Diagnostic[]* diagnostics;

	private LexerState[] stateStack;

	private RedBlackTree!string keywords;

	private string[] symbols;

	/** The last consumed token */
	Nullable!Token last;

	/** Create a new lexer from a piece of code */
	this(string code, ref Diagnostic[] diagnostics) {
		this.code = code;
		this.diagnostics = &diagnostics;

		keywords = redBlackTree!string(
			"and", "break", "do", "else", "elseif",
			"end", "false", "for", "function", "if",
			"in", "local", "nil", "not", "or",
			"repeat", "return", "then", "true", "until", "while"
		);

		// Ordered longest-first so that greedy matching picks "..." over ".." over "."
		symbols = [
			"...",

			"..",
			"==", "~=", "<=", ">=",

			"+", "-", "*", "/", "%", "^", "#",
			"<", ">", "=",
			"(", ")", "{", "}", "[", "]",
			";", ":", ",", ".",
		];
	}

	/** Get the source code that this Lexer is lexing */
	string source() {
		return code;
	}

	/** Save the current state of the lexer */
	void save() {
		const LexerState state = {
			diagnosticsLength: (*diagnostics).length,
			index: index,
			last: last
		};

		stateStack ~= state;
	}

	/** Restore the previously-saved lexer state */
	void restore() {
		const LexerState state = stateStack.back;
		// Also roll back any diagnostics emitted since the matching save()
		(*diagnostics) = (*diagnostics)[0..state.diagnosticsLength];
		index = state.index;
		last = state.last;

		discard();
	}

	/** Discard the previously-saved lexer state */
	void discard() {
		stateStack.popBack();
	}

	/** Peek a segment of characters; returns "" if fewer than `length` remain */
	private dstring peekChars(size_t length) {
		dstring result;

		size_t prevIndex = index;

		foreach (i; 0..length) {
			if (prevIndex >= code.length) return "";
			// decode advances prevIndex by ref, leaving the real index untouched
			result ~= code.decode!(Yes.useReplacementDchar)(prevIndex);
		}

		return result;
	}

	/** Peek the next char; returns 0 at EOF */
	private dchar peekChar() {
		if (charEof) return 0;
		size_t prevIndex = index;
		return code.decode!(Yes.useReplacementDchar)(prevIndex);
	}

	/** Consume a segment of characters; returns "" if EOF is hit first */
	private dstring nextChars(size_t length) {
		dstring result;

		foreach (i; 0..length) {
			if (charEof) return "";
			result ~= code.decode!(Yes.useReplacementDchar)(index);
		}

		return result;
	}

	/** Consume the next char in the lexer; returns 0 at EOF */
	private dchar nextChar() {
		if (charEof) return 0;
		return code.decode!(Yes.useReplacementDchar)(index);
	}

	/** Check if we're at the end of the file */
	private bool charEof() {
		return index >= code.length;
	}

	/** Read a string for as long as the given predicate holds true */
	private string readWhile(bool delegate(dchar c) predicate) {
		string result;
		while (!charEof && predicate(peekChar())) {
			result ~= nextChar();
		}
		return result;
	}

	private void skipWhitespace() {
		readWhile(c => " \t\r\n"d.indexOf(c) != -1);
	}

	private bool isIdentifierCharacter(dchar c) {
		return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_';
	}

	private bool isNumeralCharacter(dchar c) {
		return (c >= '0' && c <= '9') || c == '.';
	}

	private bool isHexadecimalCharacter(dchar c) {
		return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
	}

	/**
	 * Try to read a Lua long-bracket literal ("[[...]]", "[=[...]=]", ...).
	 * Returns null (with the index rewound) if the '[' did not open a long bracket.
	 * On an unclosed literal, emits a diagnostic with quickfixes but still
	 * produces a best-effort token so lexing can continue.
	 */
	private Token* readLongString(string tokenName) {
		// assume char = '['
		const size_t prevIndex = index;
		nextChar();
		string open = "[";
		string closeSeq = "]";
		uint level = 0;
		while (peekChar() == '=') {
			level++;
			open ~= '=';
			closeSeq ~= '=';
			nextChar();
		}
		closeSeq ~= ']';
		if (peekChar() == '[') {
			// closeLevel counts '='s of a potential closing sequence; -1 = not in one.
			// closeBegin/closeEnd remember the last "]...]" seen, so an unclosed
			// literal can offer a "fix the mismatched close" quickfix.
			int closeLevel = -1;
			bool closed = false;
			size_t closeBeginP = -1;
			size_t closeBegin = -1;
			size_t closeEnd = -1;
			string rawValue = open ~ readWhile(delegate(ic) {
				if (ic == ']') {
					if (closeBeginP != -1) {
						closeBegin = closeBeginP;
						closeEnd = index + 1;
					}

					if (closeLevel == level) {
						closed = true;
						return false;
					}
					else {
						closeLevel = 0;
						closeBeginP = index;
					}
				}
				else if (ic == '=' && closeLevel != -1) {
					closeLevel++;
				}
				else {
					closeLevel = -1;
				}
				return true;
			});

			if (!closed) {
				Diagnostic err;
				err.type = DiagnosticType.Error;
				err.message = "unclosed " ~ tokenName;
				err.add(index);
				const Quickfix close = {
					message: "add closing sequence to " ~ tokenName,
					ranges: [QuickfixRange(index, index, closeSeq)]
				};
				err.quickfix ~= close;
				if (closeBegin != -1) {
					const Quickfix fixExistingClose = {
						message: "fix existing closing sequence to match level of " ~ tokenName,
						ranges: [QuickfixRange(closeBegin, closeEnd, closeSeq)]
					};
					err.quickfix ~= fixExistingClose;
				}
				*diagnostics ~= err;
			}
			else {
				nextChar();
				rawValue ~= "]";
			}

			// A closed literal carries both delimiters in rawValue; an unclosed one
			// only has the opening delimiter.  The previous unconditional
			// double-sided slice chopped real content off unclosed literals and
			// threw a RangeError for inputs like "[[" at EOF.
			string value = closed
				? rawValue[2 + level .. $ - (2 + level)]
				: rawValue[2 + level .. $];

			// Lua drops a newline immediately following the opening long bracket
			if (value.length > 0 && value[0] == '\n') {
				value = value[1..$];
			}

			Token* result = new Token;
			result.value = value;
			result.rawValue = rawValue;
			result.type = TokenType.String;
			result.index = prevIndex;
			return result;
		}
		else { // turns out it wasn't a string...
			index = prevIndex;
			return null;
		}
	}

	/** Lex a single raw token (comments included), emitting diagnostics as needed */
	private Token nextInternal() {
		skipWhitespace();

		Token result;
		result.index = index;

		if (charEof) {
			result.type = TokenType.Eof;
			return result;
		}

		const dchar c = peekChar();

		if (peekChars(2) == "--"d) {
			nextChars(2);

			if (peekChar() == '[') {
				const auto res = readLongString("block comment");
				if (res !is null) {
					result = *res;
					result.type = TokenType.BlockComment;
					return result;
				}
			}

			// NOTE(review): rawValue of comments excludes the leading "--";
			// block comments from readLongString behave the same way.
			result.rawValue = result.value = readWhile(delegate(c) {
				return c != '\n';
			});

			result.type = TokenType.Comment;
			return result;
		}

		// Lua 5.1 accepts both "0x" and "0X" hexadecimal prefixes
		const dstring hexPrefix = peekChars(2);
		if (hexPrefix == "0x"d || hexPrefix == "0X"d) {
			nextChars(2);
			const string value = readWhile(&isHexadecimalCharacter);

			if (value == "") { // if it's just 0x, it's clearly not a number
				index -= 2; // put the prefix back and continue
			}
			else {
				const ulong num = value.to!ulong(16);
				result.value = num.to!string;
				result.numValue = num.to!double;
				result.rawValue = hexPrefix.toUTF8 ~ value;
				result.type = TokenType.Number;
				return result;
			}
		}
		else if (isNumeralCharacter(c)) {
			bool dot = false;
			string value = readWhile(delegate(c) {
				if (c == '.') {
					if (dot) return false; // if we already have a dot, don't read two dots
					dot = true;
				}

				return isNumeralCharacter(c);
			});

			if (value == ".") { // if it's just a dot, it's clearly not a number
				index--; // put it back and continue
			}
			else {
				double numValue = value.to!double;
				const dchar expChar = peekChar();
				// Lua 5.1 accepts both 'e' and 'E' as exponent markers
				if (expChar == 'e' || expChar == 'E') {
					const size_t expStart = index;
					string expPart = [nextChar()].toUTF8;
					bool pos = true;
					if (peekChar() == '+') {
						expPart ~= '+';
						nextChar();
					}
					else if (peekChar() == '-') {
						expPart ~= '-';
						pos = false;
						nextChar();
					}
					const string exp = readWhile(delegate(c) {
						return isNumeralCharacter(c) && c != '.';
					});
					if (exp == "") {
						// "1e" / "2e+": the exponent digits never materialized.
						// Previously this crashed with a ConvException in to!double;
						// instead, un-read the marker and emit the plain number.
						index = expStart;
					}
					else {
						value ~= expPart ~ exp;
						numValue *= 10.0 ^^ ((!pos ? -1 : 1) * exp.to!double);
					}
				}
				result.numValue = numValue;
				result.value = value;
				result.rawValue = value;
				result.type = TokenType.Number;
				return result;
			}
		}

		if (isIdentifierCharacter(c)) {
			const string value = readWhile(&isIdentifierCharacter);
			result.value = value;
			result.rawValue = value;
			result.type = value in keywords ? TokenType.Keyword : TokenType.Identifier;
			return result;
		}

		if (c == '"' || c == '\'') {
			// Small state machine: `escape` means the previous char was '\';
			// `digits`/`codepoint` accumulate a decimal escape of up to 3 digits.
			bool escape = false;
			int digits = 0;
			int codepoint = 0;
			string value;
			bool closed = false;
			string rawValue = [nextChar()].toUTF8 ~ readWhile(delegate(ic) {
				if (escape) {
					if (ic == 'a') value ~= '\a';
					else if (ic == 'b') value ~= '\b';
					else if (ic == 'f') value ~= '\f';
					else if (ic == 'n') value ~= '\n';
					else if (ic == 'r') value ~= '\r';
					else if (ic == 't') value ~= '\t';
					else if (ic == 'v') value ~= '\v';
					else if (ic >= '0' && ic <= '9') {
						digits = 1;
						codepoint = ic - '0';
					}
					else value ~= ic; // includes '\\', quotes, and escaped newline
					escape = false;
				}
				else if (ic == '\n') {
					// unescaped newline terminates (and fails to close) the string
					return false;
				}
				else if (ic >= '0' && ic <= '9' && digits > 0 && digits < 3) {
					digits++;
					codepoint *= 10;
					codepoint += ic - '0';
				}
				else {
					if (digits > 0) {
						// flush a pending decimal escape before handling this char
						value ~= cast(char)codepoint;
						digits = 0;
					}

					if (ic == '\\') escape = true;
					else if (ic == c) {
						closed = true;
						return false;
					}
					else value ~= ic;
				}
				return true;
			});

			// a decimal escape may still be pending at the end of input
			if (digits > 0) value ~= cast(char)codepoint;

			if (!closed) {
				Diagnostic err;
				err.type = DiagnosticType.Error;
				err.message = "unclosed string";
				err.add(index);
				if (escape) {
					const Quickfix close = {
						message: "remove extraneous '\\' and add closing quote to string",
						ranges: [QuickfixRange(index - 1, index, [c].toUTF8)]
					};
					err.quickfix ~= close;
				}
				else {
					const Quickfix close = {
						message: "add closing quote to string",
						ranges: [QuickfixRange(index, index, [c].toUTF8)]
					};
					err.quickfix ~= close;
				}
				*diagnostics ~= err;
			}
			else {
				rawValue ~= [nextChar()].toUTF8;
			}

			result.value = value;
			result.rawValue = rawValue;
			result.type = TokenType.String;
			return result;
		}

		if (c == '[') {
			const auto res = readLongString("string");
			if (res !is null) return *res;
		}

		foreach (symbol; symbols) {
			const auto utf32sym = symbol.toUTF32;
			if (peekChars(utf32sym.length) == utf32sym) {
				result.value = symbol;
				result.rawValue = symbol;
				result.type = TokenType.Symbol;
				nextChars(utf32sym.length);
				return result;
			}
		}

		// Unknown character: report it, skip it, and lex whatever follows
		nextChar();
		Diagnostic err;
		err.type = DiagnosticType.Error;
		err.message = "unknown character";
		err.add(result.index, index);
		const Quickfix remove = {
			message: "remove unknown character",
			ranges: [QuickfixRange(result.index, index, "")]
		};
		err.quickfix ~= remove;
		*diagnostics ~= err;

		return nextInternal();
	}

	/** Consume the next token and return it */
	Token next(IgnoreCommentTokens ignoreCommentTokens = Yes.ignoreCommentTokens)() {
		static if (ignoreCommentTokens == Yes.ignoreCommentTokens) {
			Token result = next!(No.ignoreCommentTokens)();
			if (result.type == TokenType.Comment || result.type == TokenType.BlockComment) return next();
			else return result;
		}
		else {
			const Token result = nextInternal();
			last = result;
			return result;
		}
	}

	/** Peek the next token and return it, leaving the lexer in the same state as it was before */
	Token peek(IgnoreCommentTokens ignoreCommentTokens = Yes.ignoreCommentTokens)() {
		save();
		const Token result = next!ignoreCommentTokens();
		restore();
		return result;
	}

	/**
	 * Attempt to consume the next token.
	 * If the token is missing and toConsume is Yes, the following token will be consumed regardless of whether it matches the given pattern.
	 * Returns (false, Token) if the token is not of the required type; returns (true, Token) otherwise
	 */
	Tuple!(bool, Token) consume(ToConsume toConsume = Yes.toConsume, string message = "")
	(TokenType type, string value = "") {
		save();

		Token token;
		if (type == TokenType.Comment || type == TokenType.BlockComment) token = next!(No.ignoreCommentTokens);
		else token = next();

		if (token.type == type && (value == "" || token.value == value)) {
			// Pop the state saved above; leaking it here would desynchronize an
			// enclosing caller's save()/restore() pairing.
			discard();
			return tuple(true, token);
		}
		else {
			static if (toConsume == No.toConsume) restore();
			else discard();

			string finalMessage = "expected " ~ type.to!string;

			if (value != "") finalMessage ~= " '" ~ value ~ "'";
			if (message != "") finalMessage ~= " " ~ message;

			finalMessage ~= ", got " ~ token.type.to!string;

			Diagnostic err;
			err.type = DiagnosticType.Error;
			err.message = finalMessage;
			err.add(token);
			*diagnostics ~= err;

			return tuple(false, token);
		}
	}

	/** Check if the next token matches the given pattern */
	bool isNext(TokenType type, string value = "") {
		Token token;
		if (type == TokenType.Comment || type == TokenType.BlockComment) token = peek!(No.ignoreCommentTokens);
		else token = peek();

		return token.type == type && (value == "" || token.value == value);
	}

	/**
	 * Attempt to consume the next token if it matches the given pattern.
	 * Has no effect on the state of the lexer if the token is not found.
	 */
	bool tryConsume(TokenType type, string value = "") {
		if (isNext(type, value)) {
			consume(type, value);
			return true;
		}
		else return false;
	}

	/** Check if the next token is EOF */
	bool eof() {
		return isNext(TokenType.Eof);
	}

}

unittest {
	Diagnostic[] diagnostics;
	Lexer lex = new Lexer(q"(--[[ hello
world ]] ident0__)", diagnostics);

	assert(lex.isNext(TokenType.BlockComment, " hello\nworld "));
	assert(lex.isNext(TokenType.Identifier));
	assert(!lex.isNext(TokenType.Identifier, "a"));

	const Token t = lex.next!(No.ignoreCommentTokens);
	assert(t.type == TokenType.BlockComment);
	assert(t.value == " hello\nworld ");

	assert(diagnostics.length == 0);
}

unittest {
	Diagnostic[] diagnostics;
	Lexer lex = new Lexer(q"(--[[ hello
world ]] ident0__)", diagnostics);

	const Token t = lex.next();
	assert(t.type == TokenType.Identifier);
	assert(t.value == "ident0__");

	assert(diagnostics.length == 0);
}

unittest {
	Diagnostic[] diagnostics;
	Lexer lex = new Lexer(q"(--[[ hello
world ]] ident0__)", diagnostics);

	const Tuple!(bool, Token) t = lex.consume(TokenType.Identifier);
	assert(t[0] == true);
	assert(t[1].type == TokenType.Identifier);
	assert(t[1].value == "ident0__");

	assert(diagnostics.length == 0);
}

unittest {
	Diagnostic[] diagnostics;
	Lexer lex = new Lexer(q"(--[[ hello
world ]] ident0__)", diagnostics);

	const Tuple!(bool, Token) t = lex.consume(TokenType.Identifier, "ident0__");
	assert(t[0] == true);
	assert(t[1].type == TokenType.Identifier);
	assert(t[1].value == "ident0__");

	assert(diagnostics.length == 0);
}

unittest {
	Diagnostic[] diagnostics;
	Lexer lex = new Lexer(q"(--[[ hello
world ]] ident0__)", diagnostics);

	const Tuple!(bool, Token) t = lex.consume(TokenType.Identifier, "asd");
	assert(t[0] == false);
	assert(t[1].type == TokenType.Identifier);
	assert(t[1].value == "ident0__");

	assert(diagnostics.length == 1);
}

unittest {
	Diagnostic[] diagnostics;
	Lexer lex = new Lexer(q"(--[[ hello
world ]] 2. ident0__)", diagnostics);

	const Tuple!(bool, Token) t = lex.consume(TokenType.Identifier);
	assert(t[0] == false);
	assert(t[1].type == TokenType.Number);
	assert(t[1].value == "2.");

	assert(lex.peek().type == TokenType.Identifier);

	assert(diagnostics.length == 1);
}

unittest {
	Diagnostic[] diagnostics;
	Lexer lex = new Lexer(q"(--[[ hello
world ]] 2. ident0__)", diagnostics);

	const Tuple!(bool, Token) t = lex.consume!(No.toConsume)(TokenType.Identifier);
	assert(t[0] == false);
	assert(t[1].type == TokenType.Number);
	assert(t[1].value == "2.");

	assert(lex.peek().type == TokenType.Number);

	assert(diagnostics.length == 1);
}

unittest {
	Diagnostic[] diagnostics;
	const Token a = new Lexer(q"('alo\n123"')", diagnostics).nextInternal();
	const Token b = new Lexer(q"("alo\n123\"")", diagnostics).nextInternal();
	const Token c = new Lexer(q"('\97lo\10\04923"')", diagnostics).nextInternal();
	const Token d = new Lexer(q"([[alo
123"]])", diagnostics).nextInternal();
	const Token e = new Lexer(q"([==[
alo
123"]==])", diagnostics).nextInternal();

	const Token f = new Lexer(q"([==[]==])", diagnostics).nextInternal();
	const Token g = new Lexer(q"([==[]]]==])", diagnostics).nextInternal();
	const Token h = new Lexer(q"([=[[==[]==]]=])", diagnostics).nextInternal();

	assert(a.type == TokenType.String);
	assert(b.type == TokenType.String);
	assert(c.type == TokenType.String);
	assert(d.type == TokenType.String);
	assert(e.type == TokenType.String);
	assert(f.type == TokenType.String);
	assert(g.type == TokenType.String);
	assert(h.type == TokenType.String);

	assert(a.value == "alo\n123\"");
	assert(a.value == b.value);
	assert(a.value == c.value);
	assert(a.value == d.value);
	assert(a.value == e.value);

	assert(f.value == "");
	assert(g.value == "]]");
	assert(h.value == "[==[]==]");

	assert(diagnostics.length == 0);

	Diagnostic[] da;

	new Lexer(q"("x\")", da).nextInternal();
	new Lexer(q"([==[x]=])", da).nextInternal();
	new Lexer(q"([==[x]=]])", da).nextInternal();

	assert(da.length == 3);

	assert(da[0].quickfix.length == 1);
	assert(da[1].quickfix.length == 2);
	assert(da[2].quickfix.length == 2);

	assert(da[0].quickfix[0].apply(q"("x\")") == q"("x\"")");

	assert(da[1].quickfix[0].apply(q"([==[x]=])") == q"([==[x]=]]==])");
	assert(da[1].quickfix[1].apply(q"([==[x]=])") == q"([==[x]==])");

	assert(da[2].quickfix[0].apply(q"([==[x]=]])") == q"([==[x]=]]]==])");
	assert(da[2].quickfix[1].apply(q"([==[x]=]])") == q"([==[x]=]==])");
}

unittest {
	Diagnostic[] diagnostics;
	Lexer l = new Lexer(" \r\r\n \thi", diagnostics);
	assert(l.index == 0);
	l.skipWhitespace();
	assert(l.index == 9);
	assert(l.peekChar() == 'h');
	assert(l.nextChar() == 'h');
	assert(l.nextChar() == 'i');
	assert(l.nextChar() == 0);

	assert(diagnostics.length == 0);
}

unittest {
	Diagnostic[] diagnostics;
	Lexer l = new Lexer(" \r\r\n \t", diagnostics);
	assert(l.index == 0);
	l.skipWhitespace();
	assert(l.index == 8);
	assert(l.peekChar() == 0);

	assert(diagnostics.length == 0);
}

unittest {
	Diagnostic[] diagnostics;
	Lexer l = new Lexer(" hi and bye", diagnostics);

	const Token hi = l.nextInternal();
	const Token and = l.nextInternal();
	const Token bye = l.nextInternal();

	assert(hi.type == TokenType.Identifier);
	assert(and.type == TokenType.Keyword);
	assert(bye.type == TokenType.Identifier);

	assert(hi.value == "hi");
	assert(and.value == "and");
	assert(bye.value == "bye");

	assert(hi.index == 1);
	assert(and.index == 4);
	assert(bye.index == 8);

	assert(diagnostics.length == 0);
}

unittest {
	Diagnostic[] diagnostics;
	Lexer l = new Lexer(" hi And bye", diagnostics);

	const Token hi = l.nextInternal();
	const Token and = l.nextInternal();
	const Token bye = l.nextInternal();

	assert(hi.type == TokenType.Identifier);
	assert(and.type == TokenType.Identifier);
	assert(bye.type == TokenType.Identifier);

	assert(hi.value == "hi");
	assert(and.value == "And");
	assert(bye.value == "bye");

	assert(hi.index == 1);
	assert(and.index == 4);
	assert(bye.index == 8);

	assert(diagnostics.length == 0);
}