Compare commits

..

2 Commits

Author SHA1 Message Date
3f95515964 feat: lex comments, forbid CR 2024-11-24 07:35:20 -05:00
00597752da feat: lex operators 2024-11-24 06:53:58 -05:00
8 changed files with 188 additions and 5 deletions

View File

@ -95,6 +95,8 @@ pub fn build(b: *std.Build) void {
"src/01_lexic/number.zig",
"src/01_lexic/identifier.zig",
"src/01_lexic/datatype.zig",
"src/01_lexic/operator.zig",
"src/01_lexic/comment.zig",
"src/01_lexic/token.zig",
"src/01_lexic/utils.zig",
};

77
src/01_lexic/comment.zig Normal file
View File

@@ -0,0 +1,77 @@
const std = @import("std");
const assert = std.debug.assert;
const token = @import("./token.zig");
const utils = @import("./utils.zig");
const Token = token.Token;
const TokenType = token.TokenType;
const LexError = token.LexError;
const LexReturn = token.LexReturn;
/// Attempts to lex a line comment beginning at `start`.
///
/// A comment starts with `//` and runs until (but not including) the next
/// LF, or to the end of input. A CR byte anywhere inside the comment is
/// rejected with `LexError.CRLF`, since CRLF line endings are forbidden.
///
/// Returns null when the input at `start` does not begin with `//`,
/// otherwise a tuple of (comment Token, position just past the comment).
pub fn lex(input: []const u8, start: usize) LexError!?LexReturn {
    const cap = input.len;
    assert(start < cap);

    // a comment needs at least the two `//` bytes
    if (start + 1 >= cap) return null;

    // bail out early when there is no `//` prefix
    if (input[start] != '/' or input[start + 1] != '/') return null;

    var end_pos = start + 2;
    // advance until LF or end of input, rejecting any CR byte on the way
    while (end_pos < cap) : (end_pos += 1) {
        const c = input[end_pos];
        if (c == '\n') break;
        if (c == '\r') return LexError.CRLF;
    }

    return .{
        Token.init(input[start..end_pos], TokenType.Comment, start),
        end_pos,
    };
}
test "should lex comment until EOF" {
    // a comment with no trailing newline is consumed to end of input
    const output = try lex("// aea", 0);
    try std.testing.expect(output != null);
    const tuple = output.?;
    try std.testing.expectEqualDeep("// aea", tuple[0].value);
    try std.testing.expectEqual(6, tuple[1]);
}
test "should lex comment until newline (LF)" {
    // only the first line is lexed; the LF itself is not consumed
    const output = try lex("// my comment\n// other comment", 0);
    try std.testing.expect(output != null);
    const tuple = output.?;
    try std.testing.expectEqualDeep("// my comment", tuple[0].value);
    try std.testing.expectEqual(13, tuple[1]);
}
// Fixed typo in the test name: "shouldn" -> "should not".
test "should not lex incomplete comment" {
    // a single slash is not a comment; the lexer must return null
    const input = "/aa";
    const output = try lex(input, 0);
    try std.testing.expect(output == null);
}
test "should fail on CRLF" {
    // a CR byte inside a comment must be rejected with LexError.CRLF
    const input = "// my comment\x0D\x0A// other comment";
    try std.testing.expectError(LexError.CRLF, lex(input, 0));
}

View File

@ -1,4 +1,5 @@
const std = @import("std");
const assert = std.debug.assert;
const token = @import("./token.zig");
const utils = @import("./utils.zig");
@ -11,10 +12,7 @@ const LexReturn = token.LexReturn;
pub fn lex(input: []const u8, start: usize) LexError!?LexReturn {
const cap = input.len;
var final_pos = start;
if (start >= cap) {
return null;
}
assert(start < cap);
// lex lowercase or underscore
if (!utils.is_lowercase_underscore(input[start])) {

View File

@ -1,4 +1,5 @@
const std = @import("std");
const assert = std.debug.assert;
const token = @import("./token.zig");
const utils = @import("./utils.zig");
@ -13,6 +14,7 @@ const is_decimal_digit = utils.is_decimal_digit;
///
/// A number is either an Int or a Float.
pub fn lex(input: []const u8, cap: usize, start: usize) LexError!?LexReturn {
assert(start < cap);
const first_char = input[start];
// Attempt to lex a hex, octal or binary number
@ -74,6 +76,7 @@ fn prefixed(comptime prefix: u8, input: []const u8, cap: usize, start: usize) !?
/// avoid confussion with PHP literal octals.
/// Floating point numbers can.
fn integer(input: []const u8, cap: usize, start: usize) LexError!?LexReturn {
assert(start < cap);
const first_char = input[start];
if (!is_decimal_digit(first_char)) {
return null;

73
src/01_lexic/operator.zig Normal file
View File

@@ -0,0 +1,73 @@
const std = @import("std");
const assert = std.debug.assert;
const token = @import("./token.zig");
const utils = @import("./utils.zig");
const Token = token.Token;
const TokenType = token.TokenType;
const LexError = token.LexError;
const LexReturn = token.LexReturn;
/// Attempts to lex an operator token beginning at `start`.
///
/// Consumes a maximal run of one or more operator characters
/// (as decided by `utils.is_operator_char`).
///
/// Returns null when the character at `start` is not an operator char,
/// otherwise a tuple of (operator Token, position just past the operator).
pub fn lex(input: []const u8, start: usize) LexError!?LexReturn {
    assert(start < input.len);

    // consume at least one operator character, or give up
    const end_pos = utils.lex_many_1(utils.is_operator_char, input, start) orelse return null;

    return .{
        Token.init(input[start..end_pos], TokenType.Operator, start),
        end_pos,
    };
}
test "should lex single operator" {
    // a lone operator character is a valid operator token
    const output = try lex("=", 0);
    try std.testing.expect(output != null);
    const tuple = output.?;
    try std.testing.expectEqualDeep("=", tuple[0].value);
    try std.testing.expectEqual(1, tuple[1]);
}
test "should lex operator of len 2" {
    // consecutive operator characters are merged into one token
    const output = try lex("+=", 0);
    try std.testing.expect(output != null);
    const tuple = output.?;
    try std.testing.expectEqualDeep("+=", tuple[0].value);
    try std.testing.expectEqual(2, tuple[1]);
}
test "should lex operator of len 3" {
    // lexing starts mid-input and stops at the trailing space
    const output = try lex(" >>= ", 1);
    try std.testing.expect(output != null);
    const tuple = output.?;
    try std.testing.expectEqualDeep(">>=", tuple[0].value);
    try std.testing.expectEqual(4, tuple[1]);
}
test "should not lex something else" {
    // digits are not operator characters, so nothing is lexed
    const output = try lex("322", 0);
    try std.testing.expect(output == null);
}

View File

@ -1,8 +1,11 @@
const std = @import("std");
const assert = std.debug.assert;
const number = @import("./number.zig");
const identifier = @import("./identifier.zig");
const datatype = @import("./datatype.zig");
const token = @import("./token.zig");
const operator = @import("./operator.zig");
const comment = @import("./comment.zig");
const TokenType = token.TokenType;
const Token = token.Token;
@ -16,9 +19,11 @@ pub fn tokenize(input: []const u8, alloc: std.mem.Allocator) !void {
while (current_pos < input_len) {
const actual_next_pos = ignore_whitespace(input, current_pos);
assert(current_pos <= actual_next_pos);
// attempt to lex a number
if (try number.lex(input, input_len, actual_next_pos)) |tuple| {
assert(tuple[1] > current_pos);
const t = tuple[0];
current_pos = tuple[1];
@ -26,6 +31,7 @@ pub fn tokenize(input: []const u8, alloc: std.mem.Allocator) !void {
}
// attempt to lex an identifier
else if (try identifier.lex(input, actual_next_pos)) |tuple| {
assert(tuple[1] > current_pos);
const t = tuple[0];
current_pos = tuple[1];
@ -33,6 +39,23 @@ pub fn tokenize(input: []const u8, alloc: std.mem.Allocator) !void {
}
// attempt to lex a datatype
else if (try datatype.lex(input, actual_next_pos)) |tuple| {
assert(tuple[1] > current_pos);
const t = tuple[0];
current_pos = tuple[1];
try tokens.append(t);
}
// attempt to lex a comment
else if (try comment.lex(input, actual_next_pos)) |tuple| {
assert(tuple[1] > current_pos);
const t = tuple[0];
current_pos = tuple[1];
try tokens.append(t);
}
// attempt to lex an operator
else if (try operator.lex(input, actual_next_pos)) |tuple| {
assert(tuple[1] > current_pos);
const t = tuple[0];
current_pos = tuple[1];

View File

@ -2,6 +2,8 @@ pub const TokenType = enum {
Int,
Float,
Identifier,
Operator,
Comment,
};
pub const Token = struct {
@ -23,6 +25,7 @@ pub const LexError = error{
Incomplete,
IncompleteFloatingNumber,
IncompleteScientificNumber,
CRLF,
};
/// Contains the lexed token and the next position

View File

@ -35,6 +35,10 @@ pub fn is_identifier_char(c: u8) bool {
return c == '_' or ('a' <= c and c <= 'z') or ('A' <= c and c <= 'Z') or ('0' <= c and c <= '9');
}
pub fn is_operator_char(c: u8) bool {
return c == '+' or c == '-' or c == '=' or c == '*' or c == '!' or c == '/' or c == '|' or c == '@' or c == '#' or c == '$' or c == '~' or c == '%' or c == '&' or c == '?' or c == '<' or c == '>' or c == '^' or c == '.' or c == ':';
}
/// Runs a discriminator function at least once,
/// and returns the end position of the lex.
///
@ -44,7 +48,7 @@ pub fn lex_many_1(
comptime lex_fun: fn (c: u8) bool,
input: []const u8,
start: usize,
) usize {
) ?usize {
// assert that there is input left
const cap = input.len;
var current_pos = start;