-
Notifications
You must be signed in to change notification settings - Fork 48
/
compiler.go
1005 lines (904 loc) · 35.7 KB
/
compiler.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/**
* TTTTTTTTTTTTTTTTTTTTTTTHHHHHHHHH HHHHHHHHHEEEEEEEEEEEEEEEEEEEEEE
* T:::::::::::::::::::::TH:::::::H H:::::::HE::::::::::::::::::::E
* T:::::::::::::::::::::TH:::::::H H:::::::HE::::::::::::::::::::E
* T:::::TT:::::::TT:::::THH::::::H H::::::HHEE::::::EEEEEEEEE::::E
* TTTTTT T:::::T TTTTTT H:::::H H:::::H E:::::E EEEEEE
* T:::::T H:::::H H:::::H E:::::E
* T:::::T H::::::HHHHH::::::H E::::::EEEEEEEEEE
* T:::::T H:::::::::::::::::H E:::::::::::::::E
* T:::::T H:::::::::::::::::H E:::::::::::::::E
* T:::::T H::::::HHHHH::::::H E::::::EEEEEEEEEE
* T:::::T H:::::H H:::::H E:::::E
* T:::::T H:::::H H:::::H E:::::E EEEEEE
* TT:::::::TT HH::::::H H::::::HHEE::::::EEEEEEEE:::::E
* T:::::::::T H:::::::H H:::::::HE::::::::::::::::::::E
* T:::::::::T H:::::::H H:::::::HE::::::::::::::::::::E
* TTTTTTTTTTT HHHHHHHHH HHHHHHHHHEEEEEEEEEEEEEEEEEEEEEE
*
* SSSSSSSSSSSSSSS UUUUUUUU UUUUUUUUPPPPPPPPPPPPPPPPP EEEEEEEEEEEEEEEEEEEEEERRRRRRRRRRRRRRRRR
* SS:::::::::::::::SU::::::U U::::::UP::::::::::::::::P E::::::::::::::::::::ER::::::::::::::::R
* S:::::SSSSSS::::::SU::::::U U::::::UP::::::PPPPPP:::::P E::::::::::::::::::::ER::::::RRRRRR:::::R
* S:::::S SSSSSSSUU:::::U U:::::UUPP:::::P P:::::PEE::::::EEEEEEEEE::::ERR:::::R R:::::R
* S:::::S U:::::U U:::::U P::::P P:::::P E:::::E EEEEEE R::::R R:::::R
* S:::::S U:::::U U:::::U P::::P P:::::P E:::::E R::::R R:::::R
* S::::SSSS U:::::U U:::::U P::::PPPPPP:::::P E::::::EEEEEEEEEE R::::RRRRRR:::::R
* SS::::::SSSSS U:::::U U:::::U P:::::::::::::PP E:::::::::::::::E R:::::::::::::RR
* SSS::::::::SS U:::::U U:::::U P::::PPPPPPPPP E:::::::::::::::E R::::RRRRRR:::::R
* SSSSSS::::S U:::::U U:::::U P::::P E::::::EEEEEEEEEE R::::R R:::::R
* S:::::S U:::::U U:::::U P::::P E:::::E R::::R R:::::R
* S:::::S U::::::U U::::::U P::::P E:::::E EEEEEE R::::R R:::::R
* SSSSSSS S:::::S U:::::::UUU:::::::U PP::::::PP EE::::::EEEEEEEE:::::ERR:::::R R:::::R
* S::::::SSSSSS:::::S UU:::::::::::::UU P::::::::P E::::::::::::::::::::ER::::::R R:::::R
* S:::::::::::::::SS UU:::::::::UU P::::::::P E::::::::::::::::::::ER::::::R R:::::R
* SSSSSSSSSSSSSSS UUUUUUUUU PPPPPPPPPP EEEEEEEEEEEEEEEEEEEEEERRRRRRRR RRRRRRR
*
* TTTTTTTTTTTTTTTTTTTTTTTIIIIIIIIIINNNNNNNN NNNNNNNNYYYYYYY YYYYYYY
* T:::::::::::::::::::::TI::::::::IN:::::::N N::::::NY:::::Y Y:::::Y
* T:::::::::::::::::::::TI::::::::IN::::::::N N::::::NY:::::Y Y:::::Y
* T:::::TT:::::::TT:::::TII::::::IIN:::::::::N N::::::NY::::::Y Y::::::Y
* TTTTTT T:::::T TTTTTT I::::I N::::::::::N N::::::NYYY:::::Y Y:::::YYY
* T:::::T I::::I N:::::::::::N N::::::N Y:::::Y Y:::::Y
* T:::::T I::::I N:::::::N::::N N::::::N Y:::::Y:::::Y
* T:::::T I::::I N::::::N N::::N N::::::N Y:::::::::Y
* T:::::T I::::I N::::::N N::::N:::::::N Y:::::::Y
* T:::::T I::::I N::::::N N:::::::::::N Y:::::Y
* T:::::T I::::I N::::::N N::::::::::N Y:::::Y
* T:::::T I::::I N::::::N N:::::::::N Y:::::Y
* TT:::::::TT II::::::IIN::::::N N::::::::N Y:::::Y
* T:::::::::T I::::::::IN::::::N N:::::::N YYYY:::::YYYY
* T:::::::::T I::::::::IN::::::N N::::::N Y:::::::::::Y
* TTTTTTTTTTT IIIIIIIIIINNNNNNNN NNNNNNN YYYYYYYYYYYYY
*
* CCCCCCCCCCCCC OOOOOOOOO MMMMMMMM MMMMMMMMPPPPPPPPPPPPPPPPP IIIIIIIIIILLLLLLLLLLL EEEEEEEEEEEEEEEEEEEEEERRRRRRRRRRRRRRRRR
* CCC::::::::::::C OO:::::::::OO M:::::::M M:::::::MP::::::::::::::::P I::::::::IL:::::::::L E::::::::::::::::::::ER::::::::::::::::R
* CC:::::::::::::::C OO:::::::::::::OO M::::::::M M::::::::MP::::::PPPPPP:::::P I::::::::IL:::::::::L E::::::::::::::::::::ER::::::RRRRRR:::::R
* C:::::CCCCCCCC::::CO:::::::OOO:::::::OM:::::::::M M:::::::::MPP:::::P P:::::PII::::::IILL:::::::LL EE::::::EEEEEEEEE::::ERR:::::R R:::::R
* C:::::C CCCCCCO::::::O O::::::OM::::::::::M M::::::::::M P::::P P:::::P I::::I L:::::L E:::::E EEEEEE R::::R R:::::R
* C:::::C O:::::O O:::::OM:::::::::::M M:::::::::::M P::::P P:::::P I::::I L:::::L E:::::E R::::R R:::::R
* C:::::C O:::::O O:::::OM:::::::M::::M M::::M:::::::M P::::PPPPPP:::::P I::::I L:::::L E::::::EEEEEEEEEE R::::RRRRRR:::::R
* C:::::C O:::::O O:::::OM::::::M M::::M M::::M M::::::M P:::::::::::::PP I::::I L:::::L E:::::::::::::::E R:::::::::::::RR
* C:::::C O:::::O O:::::OM::::::M M::::M::::M M::::::M P::::PPPPPPPPP I::::I L:::::L E:::::::::::::::E R::::RRRRRR:::::R
* C:::::C O:::::O O:::::OM::::::M M:::::::M M::::::M P::::P I::::I L:::::L E::::::EEEEEEEEEE R::::R R:::::R
* C:::::C O:::::O O:::::OM::::::M M:::::M M::::::M P::::P I::::I L:::::L E:::::E R::::R R:::::R
* C:::::C CCCCCCO::::::O O::::::OM::::::M MMMMM M::::::M P::::P I::::I L:::::L LLLLLL E:::::E EEEEEE R::::R R:::::R
* C:::::CCCCCCCC::::CO:::::::OOO:::::::OM::::::M M::::::MPP::::::PP II::::::IILL:::::::LLLLLLLLL:::::LEE::::::EEEEEEEE:::::ERR:::::R R:::::R
* CC:::::::::::::::C OO:::::::::::::OO M::::::M M::::::MP::::::::P I::::::::IL::::::::::::::::::::::LE::::::::::::::::::::ER::::::R R:::::R
* CCC::::::::::::C OO:::::::::OO M::::::M M::::::MP::::::::P I::::::::IL::::::::::::::::::::::LE::::::::::::::::::::ER::::::R R:::::R
* CCCCCCCCCCCCC OOOOOOOOO MMMMMMMM MMMMMMMMPPPPPPPPPP IIIIIIIIIILLLLLLLLLLLLLLLLLLLLLLLLEEEEEEEEEEEEEEEEEEEEEERRRRRRRR RRRRRRR
*
* =======================================================================================================================================================================
* =======================================================================================================================================================================
* =======================================================================================================================================================================
* =======================================================================================================================================================================
*/
/**
* Today we're going to write a compiler together. But not just any compiler... A
* super duper teeny tiny compiler! A compiler that is so small that if you
* remove all the comments this file would only be ~200 lines of actual code.
*
* We're going to compile some lisp-like function calls into some C-like
* function calls.
*
* If you are not familiar with one or the other. I'll just give you a quick intro.
*
* If we had two functions `add` and `subtract` they would be written like this:
*
* LISP C
*
* 2 + 2 (add 2 2) add(2, 2)
* 4 - 2 (subtract 4 2) subtract(4, 2)
* 2 + (4 - 2) (add 2 (subtract 4 2)) add(2, subtract(4, 2))
*
* Easy peezy right?
*
* Well good, because this is exactly what we are going to compile. While this
* is neither a complete LISP or C syntax, it will be enough of the syntax to
* demonstrate many of the major pieces of a modern compiler.
*/
/**
* Most compilers break down into three primary stages: Parsing, Transformation,
* and Code Generation
*
* 1. *Parsing* is taking raw code and turning it into a more abstract
* representation of the code.
*
* 2. *Transformation* takes this abstract representation and manipulates to do
* whatever the compiler wants it to.
*
* 3. *Code Generation* takes the transformed representation of the code and
* turns it into new code.
*/
/**
* Parsing
* -------
*
* Parsing typically gets broken down into two phases: Lexical Analysis and
* Syntactic Analysis.
*
* 1. *Lexical Analysis* takes the raw code and splits it apart into these things
* called tokens by a thing called a tokenizer (or lexer).
*
* Tokens are an array of tiny little objects that describe an isolated piece
* of the syntax. They could be numbers, labels, punctuation, operators,
* whatever.
*
* 2. *Syntactic Analysis* takes the tokens and reformats them into a
* representation that describes each part of the syntax and their relation
* to one another. This is known as an intermediate representation or
* Abstract Syntax Tree.
*
* An Abstract Syntax Tree, or AST for short, is a deeply nested object that
* represents code in a way that is both easy to work with and tells us a lot
* of information.
*
* For the following syntax:
*
* (add 2 (subtract 4 2))
*
* Tokens might look something like this:
*
* [
* { type: 'paren', value: '(' },
* { type: 'name', value: 'add' },
* { type: 'number', value: '2' },
* { type: 'paren', value: '(' },
* { type: 'name', value: 'subtract' },
* { type: 'number', value: '4' },
* { type: 'number', value: '2' },
* { type: 'paren', value: ')' },
* { type: 'paren', value: ')' }
* ]
*
* And an Abstract Syntax Tree (AST) might look like this:
*
* {
* type: 'Program',
* body: [{
* type: 'CallExpression',
* name: 'add',
* params: [{
* type: 'NumberLiteral',
* value: '2'
* }, {
* type: 'CallExpression',
* name: 'subtract',
* params: [{
* type: 'NumberLiteral',
* value: '4'
* }, {
* type: 'NumberLiteral',
* value: '2'
* }]
* }]
* }]
* }
*/
/**
* Transformation
* --------------
*
* The next type of stage for a compiler is transformation. Again, this just
* takes the AST from the last step and makes changes to it. It can manipulate
* the AST in the same language or it can translate it into an entirely new
* language.
*
* Let’s look at how we would transform an AST.
*
* You might notice that our AST has elements within it that look very similar.
* There are these objects with a type property. Each of these are known as an
* AST Node. These nodes have defined properties on them that describe one
* isolated part of the tree.
*
* We can have a node for a "NumberLiteral":
*
* {
* type: 'NumberLiteral',
* value: '2'
* }
*
* Or maybe a node for a "CallExpression":
*
* {
* type: 'CallExpression',
* name: 'subtract',
* params: [...nested nodes go here...]
* }
*
* When transforming the AST we can manipulate nodes by
* adding/removing/replacing properties, we can add new nodes, remove nodes, or
* we could leave the existing AST alone and create an entirely new one based
* on it.
*
* Since we’re targeting a new language, we’re going to focus on creating an
* entirely new AST that is specific to the target language.
*
* Traversal
* ---------
*
* In order to navigate through all of these nodes, we need to be able to
* traverse through them. This traversal process goes to each node in the AST
* depth-first.
*
* {
* type: 'Program',
* body: [{
* type: 'CallExpression',
* name: 'add',
* params: [{
* type: 'NumberLiteral',
* value: '2'
* }, {
* type: 'CallExpression',
* name: 'subtract',
* params: [{
* type: 'NumberLiteral',
* value: '4'
* }, {
* type: 'NumberLiteral',
* value: '2'
* }]
* }]
* }]
* }
*
* So for the above AST we would go:
*
* 1. Program - Starting at the top level of the AST
* 2. CallExpression (add) - Moving to the first element of the Program's body
* 3. NumberLiteral (2) - Moving to the first element of CallExpression's params
* 4. CallExpression (subtract) - Moving to the second element of CallExpression's params
* 5. NumberLiteral (4) - Moving to the first element of CallExpression's params
* 6. NumberLiteral (2) - Moving to the second element of CallExpression's params
*
* If we were manipulating this AST directly, instead of creating a separate AST,
* we would likely introduce all sorts of abstractions here. But just visiting
* each node in the tree is enough.
*
* The reason I use the word “visiting” is because there is this pattern of how
* to represent operations on elements of an object structure.
*
* Visitors
* --------
*
* The basic idea here is that we are going to create a “visitor” object that
* has methods that will accept different node types.
*
* var visitor = {
* NumberLiteral() {},
* CallExpression() {}
* };
*
* When we traverse our AST we will call the methods on this visitor whenever we
* encounter a node of a matching type.
*
* In order to make this useful we will also pass the node and a reference to
* the parent node.
*
* var visitor = {
* NumberLiteral(node, parent) {},
* CallExpression(node, parent) {}
* };
*/
/**
* Code Generation
* ---------------
*
* The final phase of a compiler is code generation. Sometimes compilers will do
* things that overlap with transformation, but for the most part code
* generation just means take our AST and string-ify code back out.
*
* Code generators work several different ways, some compilers will reuse the
* tokens from earlier, others will have created a separate representation of
* the code so that they can print node linearly, but from what I can tell most
* will use the same AST we just created, which is what we’re going to focus on.
*
* Effectively our code generator will know how to “print” all of the different
* node types of the AST, and it will recursively call itself to print nested
* nodes until everything is printed into one long string of code.
*/
/**
* And that's it! That's all the different pieces of a compiler.
*
* Now that isn’t to say every compiler looks exactly like I described here.
* Compilers serve many different purposes, and they might need more steps than
* I have detailed.
*
* But now you should have a general high-level idea of what most compilers look
* like.
*
* Now that I’ve explained all of this, you’re all good to go write your own
* compilers right?
*
* Just kidding, that's what I'm here to help with :P
*
* So let's begin...
*/
/**
* ============================================================================
* (/^▽^)/
* THE TOKENIZER!
* ============================================================================
*/
/**
* We're gonna start off with our first phase of parsing, lexical analysis, with
* the tokenizer.
*
* We're just going to take our string of code and break it down into an array
* of tokens.
*
* (add 2 (subtract 4 2)) => [{ type: 'paren', value: '(' }, ...]
*/
package main
import (
"fmt"
"log"
"strings"
)
type token struct {
kind string
value string
}
// We start by accepting an input string of code, and we're gonna set up two
// things...
func tokenizer(input string) []token {
// A new line is appended to the program
input += "\n"
// A `current` variable for tracking our position in the code like a cursor.
current := 0
// And a slice of our `token` type for appending tokens to.
tokens := []token{}
// We start by creating a `for` loop where we are setting up our `current`
// variable to be incremented as much as we want `inside` the loop.
//
// We do this because we may want to increment `current` many times within a
// single loop because our tokens can be any length.
for current < len([]rune(input)) {
// We're also going to store the `current` character in the `input`.
char := string([]rune(input)[current])
// The first thing we want to check for is an open parenthesis. This will
// later be used for `CallExpressions` but for now we only care about the
// character.
//
// We check to see if we have an open parenthesis:
if char == "(" {
// If we do, we append a new token to our slice with the kind `paren`
// and set the value to an open parenthesis.
tokens = append(tokens, token{
kind: "paren",
value: "(",
})
// Then we increment `current`
current++
// And we `continue` onto the next cycle of the loop.
continue
}
// Next we're going to check for a closing parenthesis. We do the same exact
// thing as before: Check for a closing parenthesis, add a new token,
// increment `current`, and `continue`.
if char == ")" {
tokens = append(tokens, token{
kind: "paren",
value: ")",
})
current++
continue
}
// Moving on, we're now going to check for whitespace. This is interesting
// because we care that whitespace exists to separate characters, but it
// isn't actually important for us to store as a token. We would only throw
// it out later.
//
// So here we're just going to test for existence and if it does exist we're
// going to just `continue` on.
if char == " " {
current++
continue
}
// The next type of token is a number. This is different than what we have
// seen before because a number could be any number of characters and we
// want to capture the entire sequence of characters as one token.
//
// (add 123 456)
// ^^^ ^^^
// Only two separate tokens
//
// So we start this off when we encounter the first number in a sequence.
if isNumber(char) {
// We're going to create a `value` string that we are going to append
// characters to.
value := ""
// Then we're going to loop through each character in the sequence until
// we encounter a character that is not a number, pushing each character
// that is a number to our `value` and incrementing `current` as we go.
for isNumber(char) {
value += char
current++
char = string([]rune(input)[current])
}
// After that we append our `number` token to the `tokens` slice.
tokens = append(tokens, token{
kind: "number",
value: value,
})
// And we continue on.
continue
}
// The last type of token will be a `name` token. This is a sequence of
// letters instead of numbers, that are the names of functions in our lisp
// syntax.
//
// (add 2 4)
// ^^^
// Name token
//
if isLetter(char) {
value := ""
// Again we're just going to loop through all the letters pushing them to
// a value.
for isLetter(char) {
value += char
current++
char = string([]rune(input)[current])
}
// And appending that value as a token with the type `name` and continuing.
tokens = append(tokens, token{
kind: "name",
value: value,
})
continue
}
break
}
// Then at the end of our `tokenizer` we simply return the tokens array.
return tokens
}
// isNumber accepts a string and will check to see whether or not what has been
// passed through is between the range of 0 - 9.
func isNumber(char string) bool {
if char == "" {
return false
}
n := []rune(char)[0]
if n >= '0' && n <= '9' {
return true
}
return false
}
// isLetter works in a similar way to isNumber, but checks the range for a
// letter in the range of a - z.
func isLetter(char string) bool {
if char == "" {
return false
}
n := []rune(char)[0]
if n >= 'a' && n <= 'z' {
return true
}
return false
}
/**
* ============================================================================
* ヽ/❀o ل͜ o\ノ
* THE PARSER!!!
* ============================================================================
*/
/**
* For our parser we're going to take our array of tokens and turn it into an
* AST.
*
* [{ type: 'paren', value: '(' }, ...] => { type: 'Program', body: [...] }
*/
// We will define our type, `node` here. Within node are pointers types to what
// would otherwise be recursive types in Go. e.g.
//
// callee node
//
// Would cause the Go compiler to complain about a recursive type. When we want
// to use one of these types to pass through to a function, for example, we'd
// use `&` as it'd be a reference. But we'll come to that a bit later on.
type node struct {
kind string
value string
name string
callee *node
expression *node
body []node
params []node
arguments *[]node
context *[]node
}
// Type `ast` is just another alias type. I find this makes part of the code
// more readable, as you'll come to see that there are a ton of references to
// `node`.
type ast node
// This is the counter variable that we'll use for parsing.
var pc int
// This variable will store our slice of `token`s inside of it.
var pt []token
// Okay, so we define a `parser` function that accepts our slice of `tokens`.
func parser(tokens []token) ast {
// Here, we initially give both the parser counter and the parser tokens a
// value.
pc = 0
pt = tokens
// Now, we're going to create our AST which will have a root which is a
// `Program` node.
ast := ast{
kind: "Program",
body: []node{},
}
// And we're going to kickstart our `walk` function, which you can find just
// below this, we'll be pushing nodes to our `ast.body` slice.
//
// The reason we are doing this inside a loop is because our program can have
// `CallExpressions` after one another instead of being nested.
//
// (add 2 2)
// (subtract 4 2)
//
for pc < len(pt) {
ast.body = append(ast.body, walk())
}
// At the end of our parser we'll return the AST.
return ast
}
// But this time we're going to use recursion instead of a `while` loop. So we
// define a `walk` function.
func walk() node {
// Inside the walk function we start by grabbing the `current` token.
token := pt[pc]
// We're going to split each type of token off into a different code path,
// starting off with `number` tokens.
//
// We test to see if we have a `number` token.
if token.kind == "number" {
// If we have one, we'll increment `current`.
pc++
// And we'll return a new AST node called `NumberLiteral` and setting its
// value to the value of our token.
return node{
kind: "NumberLiteral",
value: token.value,
}
}
// Next we're going to look for CallExpressions. We start this off when we
// encounter an open parenthesis.
if token.kind == "paren" && token.value == "(" {
// We'll increment `current` to skip the parenthesis since we don't care
// about it in our AST.
pc++
token = pt[pc]
// We create a base node with the type `CallExpression`, and we're going
// to set the name as the current token's value since the next token after
// the open parenthesis is the name of the function.
n := node{
kind: "CallExpression",
name: token.value,
params: []node{},
}
// We increment `current` *again* to skip the name token.
pc++
token = pt[pc]
// And now we want to loop through each token that will be the `params` of
// our `CallExpression` until we encounter a closing parenthesis.
//
// Now this is where recursion comes in. Instead of trying to parse a
// potentially infinitely nested set of nodes we're going to rely on
// recursion to resolve things.
//
// To explain this, let's take our Lisp code. You can see that the
// parameters of the `add` are a number and a nested `CallExpression` that
// includes its own numbers.
//
// (add 2 (subtract 4 2))
//
// You'll also notice that in our tokens array we have multiple closing
// parenthesis.
//
// [
// { type: 'paren', value: '(' },
// { type: 'name', value: 'add' },
// { type: 'number', value: '2' },
// { type: 'paren', value: '(' },
// { type: 'name', value: 'subtract' },
// { type: 'number', value: '4' },
// { type: 'number', value: '2' },
// { type: 'paren', value: ')' }, <<< Closing parenthesis
// { type: 'paren', value: ')' } <<< Closing parenthesis
// ]
//
// We're going to rely on the nested `walk` function to increment our
// `current` variable past any nested `CallExpressions`.
// So we create a `for` loop that will continue until it encounters a
// token with a `type` of `'paren'` and a `value` of a closing
// parenthesis.
for token.kind != "paren" || (token.kind == "paren" && token.value != ")") {
// we'll call the `walk` function which will return a `node` and we'll
// push it into our `node.params`.
n.params = append(n.params, walk())
token = pt[pc]
}
// Finally we will increment `current` one last time to skip the closing
// parenthesis.
pc++
// And return the node.
return n
}
// Again, if we haven't recognized the token type by now we're going to
// throw an error.
log.Fatal(token.kind)
return node{}
}
/**
* ============================================================================
* ⌒(❀>◞౪◟<❀)⌒
* THE TRAVERSER!!!
* ============================================================================
*/
/**
* So now we have our AST, and we want to be able to visit different nodes with
* a visitor. We need to be able to call the methods on the visitor whenever we
* encounter a node with a matching type.
*
* traverse(ast, {
* Program(node, parent) {
* // ...
* },
*
* CallExpression(node, parent) {
* // ...
* },
*
* NumberLiteral(node, parent) {
* // ...
* }
* });
*/
// We will define our `visitor` type here in such a way that strings can
// be tested against easily and then represent the function associoated with it.
//
// e.g.
// "NumberLiteral" : func(n *node, p node) {
// // do something
// }
//
type visitor map[string]func(n *node, p node)
// So we define a traverser function which accepts an AST and a
// visitor. Inside we're going to define two functions...
func traverser(a ast, v visitor) {
// We kickstart the traverser by calling `traverseNode` with our ast
// with no `parent` because the top level of the AST doesn't have a parent.
traverseNode(node(a), node{}, v)
}
// A `traverseArray` function that will allow us to iterate over a slice and
// call the next function that we will define: `traverseNode`.
func traverseArray(a []node, p node, v visitor) {
for _, child := range a {
traverseNode(child, p, v)
}
}
// `traverseNode` will accept a `node` and its `parent` node. So that it can
// pass both to our visitor methods.
func traverseNode(n, p node, v visitor) {
// We start by testing for the existence of a method on the visitor with a
// matching `type` and then calling it with the `node` and its `parent`.
for k, va := range v {
if k == n.kind {
va(&n, p)
}
}
// Next we are going to split things up by the current node type.
switch n.kind {
// We'll start with our top level `Program`. Since Program nodes have a
// property named body that has an array of nodes, we will call
// `traverseArray` to traverse down into them.
//
// (Remember that `traverseArray` will in turn call `traverseNode` so we
// are causing the tree to be traversed recursively)
case "Program":
traverseArray(n.body, n, v)
break
// Next we do the same with `CallExpressions` and traverse their `params`.
case "CallExpression":
traverseArray(n.params, n, v)
break
// In the case of `NumberLiterals` we don't have any child nodes to visit,
// so we'll just break.
case "NumberLiteral":
break
// And again, if we haven't recognized the node type then we'll throw an
// error.
default:
log.Fatal(n.kind)
}
}
/**
* ============================================================================
* ⁽(◍˃̵͈̑ᴗ˂̵͈̑)⁽
* THE TRANSFORMER!!!
* ============================================================================
*/
/**
* Next up, the transformer. Our transformer is going to take the AST that we
* have built and pass it to our traverser function with a visitor and will
* create a new ast.
*
* ----------------------------------------------------------------------------
* Original AST | Transformed AST
* ----------------------------------------------------------------------------
* { | {
* type: 'Program', | type: 'Program',
* body: [{ | body: [{
* type: 'CallExpression', | type: 'ExpressionStatement',
* name: 'add', | expression: {
* params: [{ | type: 'CallExpression',
* type: 'NumberLiteral', | callee: {
* value: '2' | type: 'Identifier',
* }, { | name: 'add'
* type: 'CallExpression', | },
* name: 'subtract', | arguments: [{
* params: [{ | type: 'NumberLiteral',
* type: 'NumberLiteral', | value: '2'
* value: '4' | }, {
* }, { | type: 'CallExpression',
* type: 'NumberLiteral', | callee: {
* value: '2' | type: 'Identifier',
* }] | name: 'subtract'
* }] | },
* }] | arguments: [{
* } | type: 'NumberLiteral',
* | value: '4'
* ---------------------------------- | }, {
* | type: 'NumberLiteral',
* | value: '2'
* | }]
* (sorry the other one is longer.) | }]
* | }
* | }]
* | }
* ----------------------------------------------------------------------------
*/
// So we have our transformer function which will accept the lisp ast.
func transformer(a ast) ast {
// We'll create a new ast which like our previous AST will have a program
// node.
nast := ast{
kind: "Program",
body: []node{},
}
// Next I'm going to cheat a little and create a bit of a hack. We're going to
// use a property named `context` on our parent nodes that we're going to push
// nodes to their parent's `context`. Normally you would have a better
// abstraction than this, but for our purposes this keeps things simple.
//
// Just take note that the context is a reference *from* the old ast *to* the
// new ast.
a.context = &nast.body
// We'll start by calling the traverser function with our ast and a visitor.
traverser(a, map[string]func(n *node, p node){
// The first visitor method accepts `NumberLiterals`
"NumberLiteral": func(n *node, p node) {
// We'll create a new node also named `NumberLiteral` that we will push to
// the parent context.
*p.context = append(*p.context, node{
kind: "NumberLiteral",
value: n.value,
})
},
// Next up, `CallExpressions`.
"CallExpression": func(n *node, p node) {
// We start creating a new node `CallExpression` with a nested
// `Identifier`.
e := node{
kind: "CallExpression",
callee: &node{
kind: "Identifier",
name: n.name,
},
arguments: new([]node),
}
// Next we're going to define a new context on the original
// `CallExpression` node that will reference the `expression`'s arguments
// so that we can push arguments.
n.context = e.arguments
// Then we're going to check if the parent node is a `CallExpression`.
// If it is not...
if p.kind != "CallExpression" {
// We're going to wrap our `CallExpression` node with an
// `ExpressionStatement`.
es := node{
kind: "ExpressionStatement",
expression: &e,
}
// Last, we push our (possibly wrapped) `CallExpression` to the `parent`'s
// `context`.
*p.context = append(*p.context, es)
} else {
*p.context = append(*p.context, e)
}
},
})
// At the end of our transformer function we'll return the new ast that we
// just created.
return nast
}
/**
* ============================================================================
* ヾ(〃^∇^)ノ♪
* THE CODE GENERATOR!!!!
* ============================================================================
*/
/**
* Now let's move onto our last phase: The Code Generator.
*
* Our code generator is going to recursively call itself to print each node in
* the tree into one giant string.
*/
func codeGenerator(n node) string {
// We'll break things down by the `type` of the `node`.
switch n.kind {
// If we have a `Program` node. We will map through each node in the `body`
// and run them through the code generator and join them with a newline.
case "Program":
var r []string
for _, no := range n.body {
r = append(r, codeGenerator(no))
}
return strings.Join(r, "\n")
// For `ExpressionStatements` we'll call the code generator on the nested
// expression and we'll add a semicolon...
case "ExpressionStatement":
return codeGenerator(*n.expression) + ";"
// For `CallExpressions` we will print the `callee`, add an open
// parenthesis, we'll map through each node in the `arguments` array and run
// them through the code generator, joining them with a comma, and then
// we'll add a closing parenthesis.
case "CallExpression":
var ra []string
c := codeGenerator(*n.callee)
for _, no := range *n.arguments {
ra = append(ra, codeGenerator(no))
}
r := strings.Join(ra, ", ")
return c + "(" + r + ")"
// For `Identifiers` we'll just return the `node`'s name.
case "Identifier":
return n.name
// For `NumberLiterals` we'll just return the `node`'s value.
case "NumberLiteral":
return n.value
// And if we haven't recognized the node, we'll throw an error.
default:
log.Fatal("err")
return ""
}
}
/**
* ============================================================================
* (۶* ‘ヮ’)۶”
* !!!!!!!!THE COMPILER!!!!!!!!
* ============================================================================
*/
/**
* FINALLY! We'll create our `compiler` function. Here we will link together
* every part of the pipeline.
*
* 1. input => tokenizer => tokens
* 2. tokens => parser => ast
* 3. ast => transformer => newAst
* 4. newAst => generator => output
*/
func compiler(input string) string {
tokens := tokenizer(input)
ast := parser(tokens)
nast := transformer(ast)
out := codeGenerator(node(nast))
// and simply return the output!
return out
}