From 3f9551596457a56d1727705967aeb027165ba9d0 Mon Sep 17 00:00:00 2001
From: Araozu
Date: Sun, 24 Nov 2024 07:32:37 -0500
Subject: [PATCH] feat: lex comments, forbid CR

---
 build.zig                |  1 +
 src/01_lexic/comment.zig | 77 ++++++++++++++++++++++++++++++++++++++++
 src/01_lexic/root.zig    |  9 +++++
 src/01_lexic/token.zig   |  2 ++
 4 files changed, 89 insertions(+)
 create mode 100644 src/01_lexic/comment.zig

diff --git a/build.zig b/build.zig
index cbc5822..7678f5d 100644
--- a/build.zig
+++ b/build.zig
@@ -96,6 +96,7 @@ pub fn build(b: *std.Build) void {
         "src/01_lexic/identifier.zig",
         "src/01_lexic/datatype.zig",
         "src/01_lexic/operator.zig",
+        "src/01_lexic/comment.zig",
         "src/01_lexic/token.zig",
         "src/01_lexic/utils.zig",
     };
diff --git a/src/01_lexic/comment.zig b/src/01_lexic/comment.zig
new file mode 100644
index 0000000..d0c3fcb
--- /dev/null
+++ b/src/01_lexic/comment.zig
@@ -0,0 +1,77 @@
+const std = @import("std");
+const assert = std.debug.assert;
+const token = @import("./token.zig");
+const utils = @import("./utils.zig");
+
+const Token = token.Token;
+const TokenType = token.TokenType;
+const LexError = token.LexError;
+const LexReturn = token.LexReturn;
+
+pub fn lex(input: []const u8, start: usize) LexError!?LexReturn {
+    const cap = input.len;
+    assert(start < cap);
+
+    // a comment needs at least 2 characters ("//")
+    if (start + 1 >= cap) {
+        return null;
+    }
+
+    if (input[start] == '/' and input[start + 1] == '/') {
+        var current_pos = start + 2;
+
+        // consume all bytes until a newline (LF)
+        while (current_pos < cap and input[current_pos] != '\n') {
+            // CR is forbidden: return an error instead of consuming it
+            if (input[current_pos] == '\r') {
+                return LexError.CRLF;
+            }
+            current_pos += 1;
+        }
+
+        return .{ Token.init(input[start..current_pos], TokenType.Comment, start), current_pos };
+    } else {
+        return null;
+    }
+}
+
+test "should lex comment until EOF" {
+    const input = "// aea";
+    const output = try lex(input, 0);
+
+    if (output) |tuple| {
+        const t = tuple[0];
+        try std.testing.expectEqualDeep("// aea", t.value);
+        try std.testing.expectEqual(6, tuple[1]);
+    } else {
+        try std.testing.expect(false);
+    }
+}
+
+test "should lex comment until newline (LF)" {
+    const input = "// my comment\n// other comment";
+    const output = try lex(input, 0);
+
+    if (output) |tuple| {
+        const t = tuple[0];
+        try std.testing.expectEqualDeep("// my comment", t.value);
+        try std.testing.expectEqual(13, tuple[1]);
+    } else {
+        try std.testing.expect(false);
+    }
+}
+
+test "should not lex incomplete comment" {
+    const input = "/aa";
+    const output = try lex(input, 0);
+    try std.testing.expect(output == null);
+}
+
+test "should fail on CRLF" {
+    const input = "// my comment\x0D\x0A// other comment";
+    _ = lex(input, 0) catch |err| {
+        try std.testing.expectEqual(LexError.CRLF, err);
+        return;
+    };
+    try std.testing.expect(false);
+}
diff --git a/src/01_lexic/root.zig b/src/01_lexic/root.zig
index a3f3fe1..cee5514 100644
--- a/src/01_lexic/root.zig
+++ b/src/01_lexic/root.zig
@@ -5,6 +5,7 @@ const identifier = @import("./identifier.zig");
 const datatype = @import("./datatype.zig");
 const token = @import("./token.zig");
 const operator = @import("./operator.zig");
+const comment = @import("./comment.zig");
 
 const TokenType = token.TokenType;
 const Token = token.Token;
@@ -44,6 +45,14 @@ pub fn tokenize(input: []const u8, alloc: std.mem.Allocator) !void {
 
             try tokens.append(t);
         }
+        // attempt to lex a comment
+        else if (try comment.lex(input, actual_next_pos)) |tuple| {
+            assert(tuple[1] > current_pos);
+            const t = tuple[0];
+            current_pos = tuple[1];
+
+            try tokens.append(t);
+        }
         // attempt to lex an operator
         else if (try operator.lex(input, actual_next_pos)) |tuple| {
             assert(tuple[1] > current_pos);
diff --git a/src/01_lexic/token.zig b/src/01_lexic/token.zig
index 18a8284..17b176d 100644
--- a/src/01_lexic/token.zig
+++ b/src/01_lexic/token.zig
@@ -3,6 +3,7 @@ pub const TokenType = enum {
     Float,
     Identifier,
     Operator,
+    Comment,
 };
 
 pub const Token = struct {
@@ -24,6 +25,7 @@ pub const LexError = error{
     Incomplete,
     IncompleteFloatingNumber,
     IncompleteScientificNumber,
+    CRLF,
 };
 
 /// Contains the lexed token and the next position
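
Not part of the patch above: a minimal usage sketch of the new comment.lex rule, written in the same test style the patch uses. It assumes the sketch file lives in src/01_lexic/ next to comment.zig, and it only relies on what the diff shows: LexReturn is a (Token, next position) tuple, and Token exposes the matched slice as `value`.

const std = @import("std");
const comment = @import("./comment.zig");
const LexError = @import("./token.zig").LexError;

test "usage sketch: comment.lex behavior" {
    // A comment runs from "//" up to, but not including, the LF:
    // the returned position points at the '\n' itself.
    const output = try comment.lex("// hello\nlet x", 0);
    const tuple = output.?;
    try std.testing.expectEqualDeep("// hello", tuple[0].value);
    try std.testing.expectEqual(8, tuple[1]);

    // A lone '/' is not a comment; lex signals "no match" with null
    // so the caller can try the next lexer rule.
    try std.testing.expect((try comment.lex("/ 2", 0)) == null);

    // A CR anywhere before the LF is rejected outright.
    try std.testing.expectError(LexError.CRLF, comment.lex("// bad\r\n", 0));
}

Returning an optional for "no match" while reserving errors for genuinely malformed input (here, a CR) is what lets tokenize in root.zig chain the rules as a plain else-if cascade.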