diff --git a/build.zig b/build.zig
index 5539bb1..c5fbdea 100644
--- a/build.zig
+++ b/build.zig
@@ -93,6 +93,7 @@ pub fn build(b: *std.Build) void {
     const files = [_][]const u8{
         "src/01_lexic/root.zig",
         "src/01_lexic/number.zig",
+        "src/01_lexic/identifier.zig",
         "src/01_lexic/token.zig",
         "src/01_lexic/utils.zig",
     };
diff --git a/src/01_lexic/identifier.zig b/src/01_lexic/identifier.zig
new file mode 100644
index 0000000..7201686
--- /dev/null
+++ b/src/01_lexic/identifier.zig
@@ -0,0 +1,100 @@
+const std = @import("std");
+const token = @import("./token.zig");
+const utils = @import("./utils.zig");
+
+const Token = token.Token;
+const TokenType = token.TokenType;
+const LexError = token.LexError;
+const LexReturn = token.LexReturn;
+
+pub fn lex(input: []const u8, start: usize) LexError!?LexReturn {
+    const cap = input.len;
+    var final_pos = start;
+
+    if (start >= cap) {
+        return null;
+    }
+
+    // lex lowercase or underscore
+    if (!utils.is_lowercase_underscore(input[start])) {
+        return null;
+    }
+    final_pos += 1;
+
+    // lex many lowercase/uppercase/underscore/number
+    if (utils.lex_many(utils.is_identifier_char, input, final_pos)) |new_pos| {
+        final_pos = new_pos;
+    }
+
+    return .{
+        Token.init(input[start..final_pos], TokenType.Identifier, start),
+        final_pos,
+    };
+}
+
+test "should lex single letter" {
+    const input = "a";
+    const output = try lex(input, 0);
+
+    if (output) |tuple| {
+        const t = tuple[0];
+        try std.testing.expectEqualDeep("a", t.value);
+    } else {
+        try std.testing.expect(false);
+    }
+}
+
+test "should lex single underscore" {
+    const input = "_";
+    const output = try lex(input, 0);
+
+    if (output) |tuple| {
+        const t = tuple[0];
+        try std.testing.expectEqualDeep("_", t.value);
+    } else {
+        try std.testing.expect(false);
+    }
+}
+
+test "should lex identifier 1" {
+    const input = "abc";
+    const output = try lex(input, 0);
+
+    if (output) |tuple| {
+        const t = tuple[0];
+        try std.testing.expectEqualDeep("abc", t.value);
+    } else {
+        try std.testing.expect(false);
+    }
+}
+
+test "should lex identifier 2" {
+    const input = "snake_case";
+    const output = try lex(input, 0);
+
+    if (output) |tuple| {
+        const t = tuple[0];
+        try std.testing.expectEqualDeep("snake_case", t.value);
+    } else {
+        try std.testing.expect(false);
+    }
+}
+
+test "should lex identifier 3" {
+    const input = "camelCase";
+    const output = try lex(input, 0);
+
+    if (output) |tuple| {
+        const t = tuple[0];
+        try std.testing.expectEqualDeep("camelCase", t.value);
+    } else {
+        try std.testing.expect(false);
+    }
+}
+
+test "shouldnt lex datatype" {
+    const input = "MyDatatype";
+    const output = try lex(input, 0);
+
+    try std.testing.expect(output == null);
+}
diff --git a/src/01_lexic/number.zig b/src/01_lexic/number.zig
index 9904828..fc9dcf8 100644
--- a/src/01_lexic/number.zig
+++ b/src/01_lexic/number.zig
@@ -5,11 +5,10 @@ const utils = @import("./utils.zig");
 const Token = token.Token;
 const TokenType = token.TokenType;
 const LexError = token.LexError;
+const LexReturn = token.LexReturn;
 
 const is_decimal_digit = utils.is_decimal_digit;
 
-const LexReturn = struct { Token, usize };
-
 /// Attempts to lex a number, as per the language grammar.
 ///
 /// A number is either an Int or a Float.
diff --git a/src/01_lexic/root.zig b/src/01_lexic/root.zig
index ea504e1..8e0db11 100644
--- a/src/01_lexic/root.zig
+++ b/src/01_lexic/root.zig
@@ -31,7 +31,7 @@ pub fn tokenize(input: []const u8, alloc: std.mem.Allocator) !void {
     std.debug.print("array list len: {d}", .{tokens.items.len});
 }
 
-/// Ignores all whitespace from usize,
+/// Ignores all whitespace in `input` starting at `start`,
 /// and returns the position where whitespace ends.
 ///
 /// Whitespace is: tabs, spaces
diff --git a/src/01_lexic/token.zig b/src/01_lexic/token.zig
index 4d443ad..7e7f68c 100644
--- a/src/01_lexic/token.zig
+++ b/src/01_lexic/token.zig
@@ -1,6 +1,7 @@
 pub const TokenType = enum {
     Int,
     Float,
+    Identifier,
 };
 
 pub const Token = struct {
@@ -23,3 +24,7 @@ pub const LexError = error{
     IncompleteFloatingNumber,
     IncompleteScientificNumber,
 };
+
+/// Contains the lexed token and the position
+/// from which the next lex should start.
+pub const LexReturn = struct { Token, usize };
diff --git a/src/01_lexic/utils.zig b/src/01_lexic/utils.zig
index 71fa433..17ad51e 100644
--- a/src/01_lexic/utils.zig
+++ b/src/01_lexic/utils.zig
@@ -1,3 +1,7 @@
+const token = @import("./token.zig");
+const LexError = token.LexError;
+const LexReturn = token.LexReturn;
+
 pub fn is_decimal_digit(c: u8) bool {
     return '0' <= c and c <= '9';
 }
@@ -13,3 +17,74 @@ pub fn is_binary_digit(c: u8) bool {
 pub fn is_hex_digit(c: u8) bool {
     return ('0' <= c and c <= '9') or ('a' <= c and c <= 'f') or ('A' <= c and c <= 'F');
 }
+
+pub fn is_lowercase(c: u8) bool {
+    return 'a' <= c and c <= 'z';
+}
+
+pub fn is_lowercase_underscore(c: u8) bool {
+    return c == '_' or ('a' <= c and c <= 'z');
+}
+
+/// identifier_letter = underscore | lowercase | uppercase | digit
+pub fn is_identifier_char(c: u8) bool {
+    return c == '_' or ('a' <= c and c <= 'z') or ('A' <= c and c <= 'Z') or ('0' <= c and c <= '9');
+}
+
+/// Runs a discriminator function at least once,
+/// and returns the end position of the lex.
+///
+/// If there is no more input or the lexer does not match
+/// at least once, returns null.
+pub fn lex_many_1(
+    comptime lex_fun: fn (c: u8) bool,
+    input: []const u8,
+    start: usize,
+) ?usize {
+    // assert that there is input left
+    const cap = input.len;
+    var current_pos = start;
+
+    if (current_pos >= cap) {
+        return null;
+    }
+
+    // run the lexer at least once
+    if (!lex_fun(input[current_pos])) {
+        return null;
+    }
+    current_pos += 1;
+
+    // run the lexer many times
+    while (current_pos < cap and lex_fun(input[current_pos])) {
+        current_pos += 1;
+    }
+
+    return current_pos;
+}
+
+/// Runs a discriminator function zero, one or more times
+/// and returns the end position of the lex.
+///
+/// If there is no more input left to lex,
+/// returns null.
+pub fn lex_many(
+    comptime lex_fun: fn (c: u8) bool,
+    input: []const u8,
+    start: usize,
+) ?usize {
+    // assert that there is input left
+    const cap = input.len;
+    var current_pos = start;
+
+    if (current_pos >= cap) {
+        return null;
+    }
+
+    // run the lexer many times
+    while (current_pos < cap and lex_fun(input[current_pos])) {
+        current_pos += 1;
+    }
+
+    return current_pos;
+}