From 364a7ca4b8ed6ba209a859d4f62bfe97d9db20d6 Mon Sep 17 00:00:00 2001
From: Evan Burkey
Date: Fri, 31 May 2024 14:33:33 -0700
Subject: [PATCH] basic lexing

---
 cmd/repl/main.go             |  41 ++++++++++
 go.mod                       |  11 +++
 go.sum                       |  10 +++
 internal/lexer/lexer.go      | 139 +++++++++++++++++++++++++++++++
 internal/lexer/lexer_test.go | 154 +++++++++++++++++++++++++++++++++++
 internal/token/token.go      |  61 ++++++++++++++
 6 files changed, 416 insertions(+)
 create mode 100644 cmd/repl/main.go
 create mode 100644 go.mod
 create mode 100644 go.sum
 create mode 100644 internal/lexer/lexer.go
 create mode 100644 internal/lexer/lexer_test.go
 create mode 100644 internal/token/token.go

diff --git a/cmd/repl/main.go b/cmd/repl/main.go
new file mode 100644
index 0000000..58f49f8
--- /dev/null
+++ b/cmd/repl/main.go
@@ -0,0 +1,41 @@
+package main
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+	"monkey/internal/lexer"
+	"monkey/internal/token"
+	"os"
+	"os/user"
+)
+
+const prompt = ">> "
+
+func main() {
+	user, err := user.Current()
+	if err != nil {
+		panic(err)
+	}
+	fmt.Printf("Hello %s! This is the Monkey programming language!\n",
+		user.Username)
+	fmt.Printf("Feel free to type in commands\n")
+	start(os.Stdin, os.Stdout)
+}
+
+func start(in io.Reader, out io.Writer) {
+	scanner := bufio.NewScanner(in)
+
+	for {
+		fmt.Fprintf(out, prompt)
+		scanned := scanner.Scan()
+		if !scanned {
+			return
+		}
+		line := scanner.Text()
+		l := lexer.New(line)
+		for tok := l.NextToken(); tok.Type != token.EOF; tok = l.NextToken() {
+			fmt.Fprintf(out, "%+v\n", tok)
+		}
+	}
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..341b8d4
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,11 @@
+module monkey
+
+go 1.22
+
+require github.com/stretchr/testify v1.9.0
+
+require (
+	github.com/davecgh/go-spew v1.1.1 // indirect
+	github.com/pmezard/go-difflib v1.0.0 // indirect
+	gopkg.in/yaml.v3 v3.0.1 // indirect
+)
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..60ce688
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,10 @@
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
+github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/internal/lexer/lexer.go b/internal/lexer/lexer.go
new file mode 100644
index 0000000..6d58131
--- /dev/null
+++ b/internal/lexer/lexer.go
@@ -0,0 +1,139 @@
+package lexer
+
+import (
+	"monkey/internal/token"
+)
+
+type Lexer struct {
+	input        string
+	position     int
+	readPosition int
+	ch           byte
+}
+
+func New(input string) *Lexer {
+	l := &Lexer{input: input}
+	l.readChar()
+	return l
+}
+
+func (l *Lexer) readChar() {
+	if l.readPosition >= len(l.input) {
+		l.ch = 0
+	} else {
+		l.ch = l.input[l.readPosition]
+	}
+	l.position = l.readPosition
+	l.readPosition += 1
+}
+
+func (l *Lexer) NextToken() token.Token {
+	var tok token.Token
+	l.skipWhitespace()
+	switch l.ch {
+	case '=':
+		if l.peekChar() == '=' {
+			ch := l.ch
+			l.readChar()
+			literal := string(ch) + string(l.ch)
+			tok = token.Token{Type: token.EQUAL, Literal: literal}
+		} else {
+			tok = newToken(token.ASSIGN, l.ch)
+		}
+	case '!':
+		if l.peekChar() == '=' {
+			ch := l.ch
+			l.readChar()
+			literal := string(ch) + string(l.ch)
+			tok = token.Token{Type: token.NEQUAL, Literal: literal}
+		} else {
+			tok = newToken(token.BANG, l.ch)
+		}
+	case ';':
+		tok = newToken(token.SEMICOLON, l.ch)
+	case '(':
+		tok = newToken(token.LPAREN, l.ch)
+	case ')':
+		tok = newToken(token.RPAREN, l.ch)
+	case ',':
+		tok = newToken(token.COMMA, l.ch)
+	case '+':
+		tok = newToken(token.PLUS, l.ch)
+	case '{':
+		tok = newToken(token.LBRACE, l.ch)
+	case '}':
+		tok = newToken(token.RBRACE, l.ch)
+	case '-':
+		tok = newToken(token.MINUS, l.ch)
+	case '<':
+		tok = newToken(token.LT, l.ch)
+	case '>':
+		tok = newToken(token.GT, l.ch)
+	case '/':
+		tok = newToken(token.SLASH, l.ch)
+	case '*':
+		tok = newToken(token.ASTERISK, l.ch)
+	case '[':
+		tok = newToken(token.LBRACKET, l.ch)
+	case ']':
+		tok = newToken(token.RBRACKET, l.ch)
+	case 0:
+		tok.Literal = ""
+		tok.Type = token.EOF
+	default:
+		if isLetter(l.ch) {
+			tok.Literal = l.readIdentifier()
+			tok.Type = token.LookupIdent(tok.Literal)
+			return tok
+		} else if isDigit(l.ch) {
+			tok.Literal = l.readNumber()
+			tok.Type = token.INT
+			return tok
+		} else {
+			tok = newToken(token.ILLEGAL, l.ch)
+		}
+	}
+	l.readChar()
+	return tok
+}
+
+func (l *Lexer) readIdentifier() string {
+	p := l.position
+	for isLetter(l.ch) {
+		l.readChar()
+	}
+	return l.input[p:l.position]
+}
+
+func (l *Lexer) readNumber() string {
+	pos := l.position
+	for isDigit(l.ch) {
+		l.readChar()
+	}
+	return l.input[pos:l.position]
+}
+
+func (l *Lexer) skipWhitespace() {
+	for l.ch == ' ' || l.ch == '\t' || l.ch == '\n' || l.ch == '\r' {
+		l.readChar()
+	}
+}
+
+func (l *Lexer) peekChar() byte {
+	if l.readPosition >= len(l.input) {
+		return 0
+	}
+	return l.input[l.readPosition]
+}
+
+func isLetter(ch byte) bool {
+	return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_'
+}
+
+func isDigit(ch byte) bool {
+	return '0' <= ch && ch <= '9'
+}
+
+func newToken(tokenType token.TokenType, ch byte) token.Token {
+	return token.Token{Type: tokenType, Literal: string(ch)}
+}
diff --git a/internal/lexer/lexer_test.go b/internal/lexer/lexer_test.go
new file mode 100644
index 0000000..eaee260
--- /dev/null
+++ b/internal/lexer/lexer_test.go
@@ -0,0 +1,154 @@
+package lexer
+
+import (
+	"github.com/stretchr/testify/assert"
+	"monkey/internal/token"
+	"testing"
+)
+
+func TestNextToken_Simple(t *testing.T) {
+	input := `=+(){},;`
+	tests := []struct {
+		expectedType    token.TokenType
+		expectedLiteral string
+	}{
+		{token.ASSIGN, "="},
+		{token.PLUS, "+"},
+		{token.LPAREN, "("},
+		{token.RPAREN, ")"},
+		{token.LBRACE, "{"},
+		{token.RBRACE, "}"},
+		{token.COMMA, ","},
+		{token.SEMICOLON, ";"},
+		{token.EOF, ""},
+	}
+
+	l := New(input)
+	for i, tt := range tests {
+		tok := l.NextToken()
+		assert.Equal(t, tt.expectedType, tok.Type, "[%d]: %v %v", i, tok.Type, tt.expectedType)
+		assert.Equal(t, tt.expectedLiteral, tok.Literal, "[%d]: %v %v", i, tok.Literal, tt.expectedLiteral)
+	}
+}
+
+func TestNextToken_Keywords(t *testing.T) {
+	input := `fn test() {
+		if (5 < 10) {
+			return true;
+		} else {
+			return false;
+		}
+	}`
+
+	tests := []struct {
+		expectedType    token.TokenType
+		expectedLiteral string
+	}{
+		{token.FUNCTION, "fn"},
+		{token.IDENT, "test"},
+		{token.LPAREN, "("},
+		{token.RPAREN, ")"},
+		{token.LBRACE, "{"},
+		{token.IF, "if"},
+		{token.LPAREN, "("},
+		{token.INT, "5"},
+		{token.LT, "<"},
+		{token.INT, "10"},
+		{token.RPAREN, ")"},
+		{token.LBRACE, "{"},
+		{token.RETURN, "return"},
+		{token.TRUE, "true"},
+		{token.SEMICOLON, ";"},
+		{token.RBRACE, "}"},
+		{token.ELSE, "else"},
+		{token.LBRACE, "{"},
+		{token.RETURN, "return"},
+		{token.FALSE, "false"},
+		{token.SEMICOLON, ";"},
+		{token.RBRACE, "}"},
+		{token.RBRACE, "}"},
+		{token.EOF, ""},
+	}
+
+	l := New(input)
+	for i, tt := range tests {
+		tok := l.NextToken()
+		assert.Equal(t, tt.expectedType, tok.Type, "[%d]: %v %v", i, tok.Type, tt.expectedType)
+		assert.Equal(t, tt.expectedLiteral, tok.Literal, "[%d]: %v %v", i, tok.Literal, tt.expectedLiteral)
+	}
+}
+
+func TestNextToken_Complex(t *testing.T) {
+	input := `let five = 5;
+	let ten = 10;
+	let add = fn(x, y) {
+		x + y;
+	};
+	let result = add(five, ten);
+	-/*<987>!
+	10 == 10
+	10 != 9
+	`
+	tests := []struct {
+		expectedType    token.TokenType
+		expectedLiteral string
+	}{
+		{token.LET, "let"},
+		{token.IDENT, "five"},
+		{token.ASSIGN, "="},
+		{token.INT, "5"},
+		{token.SEMICOLON, ";"},
+		{token.LET, "let"},
+		{token.IDENT, "ten"},
+		{token.ASSIGN, "="},
+		{token.INT, "10"},
+		{token.SEMICOLON, ";"},
+		{token.LET, "let"},
+		{token.IDENT, "add"},
+		{token.ASSIGN, "="},
+		{token.FUNCTION, "fn"},
+		{token.LPAREN, "("},
+		{token.IDENT, "x"},
+		{token.COMMA, ","},
+		{token.IDENT, "y"},
+		{token.RPAREN, ")"},
+		{token.LBRACE, "{"},
+		{token.IDENT, "x"},
+		{token.PLUS, "+"},
+		{token.IDENT, "y"},
+		{token.SEMICOLON, ";"},
+		{token.RBRACE, "}"},
+		{token.SEMICOLON, ";"},
+		{token.LET, "let"},
+		{token.IDENT, "result"},
+		{token.ASSIGN, "="},
+		{token.IDENT, "add"},
+		{token.LPAREN, "("},
+		{token.IDENT, "five"},
+		{token.COMMA, ","},
+		{token.IDENT, "ten"},
+		{token.RPAREN, ")"},
+		{token.SEMICOLON, ";"},
+		{token.MINUS, "-"},
+		{token.SLASH, "/"},
+		{token.ASTERISK, "*"},
+		{token.LT, "<"},
+		{token.INT, "987"},
+		{token.GT, ">"},
+		{token.BANG, "!"},
+		{token.INT, "10"},
+		{token.EQUAL, "=="},
+		{token.INT, "10"},
+		{token.INT, "10"},
+		{token.NEQUAL, "!="},
+		{token.INT, "9"},
+		{token.EOF, ""},
+	}
+
+	l := New(input)
+	for i, tt := range tests {
+		tok := l.NextToken()
+		assert.Equal(t, tt.expectedType, tok.Type, "[%d]: %v %v", i, tok.Type, tt.expectedType)
+		assert.Equal(t, tt.expectedLiteral, tok.Literal, "[%d]: %v %v", i, tok.Literal, tt.expectedLiteral)
+	}
+}
diff --git a/internal/token/token.go b/internal/token/token.go
new file mode 100644
index 0000000..5d9ccce
--- /dev/null
+++ b/internal/token/token.go
@@ -0,0 +1,61 @@
+package token
+
+const (
+	ILLEGAL = "ILLEGAL"
+	EOF     = "EOF"
+
+	IDENT = "IDENT"
+	INT   = "INT"
+
+	ASSIGN   = "="
+	PLUS     = "+"
+	MINUS    = "-"
+	BANG     = "!"
+	ASTERISK = "*"
+	SLASH    = "/"
+	EQUAL    = "=="
+	NEQUAL   = "!="
+
+	COMMA     = ","
+	SEMICOLON = ";"
+	LPAREN    = "("
+	RPAREN    = ")"
+	LBRACE    = "{"
+	RBRACE    = "}"
+	LBRACKET  = "["
+	RBRACKET  = "]"
+	LT        = "<"
+	GT        = ">"
+
+	FUNCTION = "FUNCTION"
+	LET      = "LET"
+	TRUE     = "TRUE"
+	FALSE    = "FALSE"
+	IF       = "IF"
+	ELSE     = "ELSE"
+	RETURN   = "RETURN"
+)
+
+type TokenType string
+
+type Token struct {
+	Type    TokenType
+	Literal string
+}
+
+var keywords = map[string]TokenType{
+	"fn":     FUNCTION,
+	"let":    LET,
+	"true":   TRUE,
+	"false":  FALSE,
+	"if":     IF,
+	"else":   ELSE,
+	"return": RETURN,
+}
+
+func LookupIdent(ident string) TokenType {
+	if tok, ok := keywords[ident]; ok {
+		return tok
+	}
+	return IDENT
+}