feat: lex identifiers; create comptime fns for lexing many, many_1
This commit is contained in:
parent
d6a83ff46c
commit
92794cc07a
@ -93,6 +93,7 @@ pub fn build(b: *std.Build) void {
|
||||
const files = [_][]const u8{
|
||||
"src/01_lexic/root.zig",
|
||||
"src/01_lexic/number.zig",
|
||||
"src/01_lexic/identifier.zig",
|
||||
"src/01_lexic/token.zig",
|
||||
"src/01_lexic/utils.zig",
|
||||
};
|
||||
|
100
src/01_lexic/identifier.zig
Normal file
100
src/01_lexic/identifier.zig
Normal file
@ -0,0 +1,100 @@
|
||||
const std = @import("std");
|
||||
const token = @import("./token.zig");
|
||||
const utils = @import("./utils.zig");
|
||||
|
||||
const Token = token.Token;
|
||||
const TokenType = token.TokenType;
|
||||
const LexError = token.LexError;
|
||||
const LexReturn = token.LexReturn;
|
||||
|
||||
/// Attempts to lex an identifier starting at `start`.
///
/// An identifier begins with a lowercase letter or an underscore,
/// followed by any number of letters, digits or underscores.
/// Returns null when the input is exhausted or the first character
/// is not a valid identifier start.
pub fn lex(input: []const u8, start: usize) LexError!?LexReturn {
    // nothing left to lex
    if (start >= input.len) {
        return null;
    }

    // an identifier must begin with a lowercase letter or underscore
    if (!utils.is_lowercase_underscore(input[start])) {
        return null;
    }

    // consume the remaining identifier characters, if any; when the
    // input ends right after the first character, lex_many returns
    // null and the identifier is that single character
    const end_pos = utils.lex_many(utils.is_identifier_char, input, start + 1) orelse (start + 1);

    return .{
        Token.init(input[start..end_pos], TokenType.Identifier, start),
        end_pos,
    };
}
|
||||
|
||||
test "should lex single letter" {
    const output = try lex("a", 0);
    try std.testing.expect(output != null);

    const t = output.?[0];
    try std.testing.expectEqualDeep("a", t.value);
}
|
||||
|
||||
test "should lex single underscore" {
    const output = try lex("_", 0);
    try std.testing.expect(output != null);

    const t = output.?[0];
    try std.testing.expectEqualDeep("_", t.value);
}
|
||||
|
||||
test "should lex identifier 1" {
    const output = try lex("abc", 0);
    try std.testing.expect(output != null);

    const t = output.?[0];
    try std.testing.expectEqualDeep("abc", t.value);
}
|
||||
|
||||
test "should lex identifier 2" {
    const output = try lex("snake_case", 0);
    try std.testing.expect(output != null);

    const t = output.?[0];
    try std.testing.expectEqualDeep("snake_case", t.value);
}
|
||||
|
||||
test "should lex identifier 3" {
    const output = try lex("camelCase", 0);
    try std.testing.expect(output != null);

    const t = output.?[0];
    try std.testing.expectEqualDeep("camelCase", t.value);
}
|
||||
|
||||
test "shouldnt lex datatype" {
    // identifiers may not start with an uppercase letter
    const output = try lex("MyDatatype", 0);
    try std.testing.expect(output == null);
}
|
@ -5,11 +5,10 @@ const utils = @import("./utils.zig");
|
||||
const Token = token.Token;
|
||||
const TokenType = token.TokenType;
|
||||
const LexError = token.LexError;
|
||||
const LexReturn = token.LexReturn;
|
||||
|
||||
const is_decimal_digit = utils.is_decimal_digit;
|
||||
|
||||
const LexReturn = struct { Token, usize };
|
||||
|
||||
/// Attempts to lex a number, as per the language grammar.
|
||||
///
|
||||
/// A number is either an Int or a Float.
|
||||
|
@ -31,7 +31,7 @@ pub fn tokenize(input: []const u8, alloc: std.mem.Allocator) !void {
|
||||
std.debug.print("array list len: {d}", .{tokens.items.len});
|
||||
}
|
||||
|
||||
/// Ignores all whitespace from usize,
|
||||
/// Ignores all whitespace on `input` since `start`
|
||||
/// and returns the position where whitespace ends.
|
||||
///
|
||||
/// Whitespace is: tabs, spaces
|
||||
|
@ -1,6 +1,7 @@
|
||||
/// The category of a lexed token.
pub const TokenType = enum {
    Int,
    Float,
    Identifier,
};
|
||||
|
||||
pub const Token = struct {
|
||||
@ -23,3 +24,7 @@ pub const LexError = error{
|
||||
IncompleteFloatingNumber,
|
||||
IncompleteScientificNumber,
|
||||
};
|
||||
|
||||
/// Contains the lexed token and the next position
/// from which the next lex should start.
/// (Tuple: .{ token, next_start_position })
pub const LexReturn = struct { Token, usize };
|
||||
|
@ -1,3 +1,7 @@
|
||||
const token = @import("./token.zig");
|
||||
const LexError = token.LexError;
|
||||
const LexReturn = token.LexReturn;
|
||||
|
||||
/// Reports whether `c` is an ASCII decimal digit ('0'-'9').
pub fn is_decimal_digit(c: u8) bool {
    return switch (c) {
        '0'...'9' => true,
        else => false,
    };
}
|
||||
@ -13,3 +17,74 @@ pub fn is_binary_digit(c: u8) bool {
|
||||
/// Reports whether `c` is an ASCII hexadecimal digit (0-9, a-f, A-F).
pub fn is_hex_digit(c: u8) bool {
    return switch (c) {
        '0'...'9', 'a'...'f', 'A'...'F' => true,
        else => false,
    };
}
|
||||
|
||||
/// Reports whether `c` is an ASCII lowercase letter ('a'-'z').
pub fn is_lowercase(c: u8) bool {
    return switch (c) {
        'a'...'z' => true,
        else => false,
    };
}
|
||||
|
||||
/// Reports whether `c` is an underscore or an ASCII lowercase letter,
/// i.e. a valid first character of an identifier.
pub fn is_lowercase_underscore(c: u8) bool {
    return switch (c) {
        '_', 'a'...'z' => true,
        else => false,
    };
}
|
||||
|
||||
/// identifier_letter = underscore | lowercase | uppercase | digit
pub fn is_identifier_char(c: u8) bool {
    return switch (c) {
        '_', 'a'...'z', 'A'...'Z', '0'...'9' => true,
        else => false,
    };
}
|
||||
|
||||
/// Runs a discriminator function at least once,
/// and returns the end position of the lex.
///
/// If there is no more input, or `lex_fun` does not match
/// at least once, returns null.
pub fn lex_many_1(
    comptime lex_fun: fn (c: u8) bool,
    input: []const u8,
    start: usize,
    // FIX: was `usize`, but the body returns null — the return type
    // must be optional, matching the sibling `lex_many`.
) ?usize {
    // assert that there is input left
    const cap = input.len;
    var current_pos = start;

    if (current_pos >= cap) {
        return null;
    }

    // run the lexer at least once
    if (!lex_fun(input[current_pos])) {
        return null;
    }
    current_pos += 1;

    // run the lexer zero or more additional times
    while (current_pos < cap and lex_fun(input[current_pos])) {
        current_pos += 1;
    }

    return current_pos;
}
|
||||
|
||||
/// Runs a discriminator function zero, one or more times
/// and returns the position where matching stopped.
///
/// Returns null only when `start` is already at or past the end of
/// `input`; otherwise a position is always returned (equal to `start`
/// when `lex_fun` never matches).
/// (FIX: the previous doc comment was copied from `lex_many_1` and
/// wrongly claimed null is returned on zero matches.)
pub fn lex_many(
    comptime lex_fun: fn (c: u8) bool,
    input: []const u8,
    start: usize,
) ?usize {
    // assert that there is input left
    const cap = input.len;
    var current_pos = start;

    if (current_pos >= cap) {
        return null;
    }

    // run the lexer as many times as it matches
    while (current_pos < cap and lex_fun(input[current_pos])) {
        current_pos += 1;
    }

    return current_pos;
}
|
||||
|
Loading…
Reference in New Issue
Block a user