Skip to main content
python database internals building a persistent engine from scratch

The Tokenizer

6 min read Chapter 20 of 21

The Tokenizer

The tokenizer (also called a lexer) is the first stage of the parsing pipeline. It takes a raw input string and produces a sequence of Token objects, each carrying a type and a value. The tokenizer does not validate grammar — it only identifies what kind of thing each piece of text is.

Token Representation

# tokenizer.py
from dataclasses import dataclass


@dataclass
class Token:
    """A single token extracted from input text."""
    type: TokenType
    value: str          # the raw text that produced this token
    position: int       # character offset in the original input

The position field enables precise error messages: “Unexpected character at position 12.”

The Tokenizer Class

The tokenizer is a single-pass state machine. It reads the input string character by character, building tokens as it goes:

class Tokenizer:
    """Breaks an input string into a sequence of Tokens."""

    KEYWORDS: dict[str, TokenType] = {
        "INSERT": TokenType.KEYWORD_INSERT,
        "SELECT": TokenType.KEYWORD_SELECT,
    }

    def __init__(self, text: str) -> None:
        self.text = text
        self.pos = 0
        self.tokens: list[Token] = []

    def tokenize(self) -> list[Token]:
        """Process the entire input and return a list of Tokens."""
        while self.pos < len(self.text):
            ch = self.text[self.pos]

            # Skip whitespace
            if ch.isspace():
                self.pos += 1
                continue

            # Meta command: starts with '.'
            if ch == '.':
                self._read_meta_command()
                continue

            # Number: starts with a digit or negative sign
            if ch.isdigit() or (ch == '-' and self._peek_digit()):
                self._read_integer()
                continue

            # Word: keyword or string identifier
            if ch.isalpha() or ch == '_':
                self._read_word()
                continue

            raise TokenError(
                f"Unexpected character '{ch}' at position {self.pos}"
            )

        self.tokens.append(Token(TokenType.EOF, "", self.pos))
        return self.tokens

    def _read_meta_command(self) -> None:
        """Read a meta-command starting with '.'."""
        start = self.pos
        self.pos += 1  # skip the '.'
        while self.pos < len(self.text) and not self.text[self.pos].isspace():
            self.pos += 1
        value = self.text[start:self.pos]
        self.tokens.append(Token(TokenType.META_COMMAND, value, start))

    def _read_integer(self) -> None:
        """Read an integer literal (possibly negative)."""
        start = self.pos
        if self.text[self.pos] == '-':
            self.pos += 1
        while self.pos < len(self.text) and self.text[self.pos].isdigit():
            self.pos += 1
        value = self.text[start:self.pos]
        if value == '-':
            raise TokenError(f"Expected digit after '-' at position {start}")
        self.tokens.append(Token(TokenType.INTEGER, value, start))

    def _read_word(self) -> None:
        """Read a keyword or string identifier."""
        start = self.pos
        while self.pos < len(self.text) and (
            self.text[self.pos].isalnum() or self.text[self.pos] == '_'
        ):
            self.pos += 1
        value = self.text[start:self.pos]
        upper = value.upper()
        if upper in self.KEYWORDS:
            token_type = self.KEYWORDS[upper]
        else:
            token_type = TokenType.STRING
        self.tokens.append(Token(token_type, value, start))

    def _peek_digit(self) -> bool:
        """Check if the character after current position is a digit."""
        next_pos = self.pos + 1
        return next_pos < len(self.text) and self.text[next_pos].isdigit()

State Machine Walkthrough

The tokenizer processes input one character at a time. Here’s how it handles INSERT 1 alice:

PositionCharacterActionToken produced
0IStart word
1-5NSERTContinue word
6(end of word)Lookup: “INSERT” is keywordKEYWORD_INSERT("INSERT", 0)
6 Skip whitespace
71Start integer
8(end of number)Emit integerINTEGER("1", 7)
8 Skip whitespace
9aStart word
10-13liceContinue word
14(end of input)“alice” not a keywordSTRING("alice", 9)
14Append EOFEOF("", 14)

Result: [KEYWORD_INSERT, INTEGER("1"), STRING("alice"), EOF]

Testing the Tokenizer

def test_tokenizer():
    """Verify tokenization of all supported input types."""

    # INSERT statement
    tokens = Tokenizer("INSERT 42 bob").tokenize()
    assert tokens[0].type == TokenType.KEYWORD_INSERT
    assert tokens[1].type == TokenType.INTEGER
    assert tokens[1].value == "42"
    assert tokens[2].type == TokenType.STRING
    assert tokens[2].value == "bob"
    assert tokens[3].type == TokenType.EOF

    # SELECT statement
    tokens = Tokenizer("SELECT").tokenize()
    assert tokens[0].type == TokenType.KEYWORD_SELECT
    assert tokens[1].type == TokenType.EOF

    # Meta command
    tokens = Tokenizer(".exit").tokenize()
    assert tokens[0].type == TokenType.META_COMMAND
    assert tokens[0].value == ".exit"

    # Case insensitivity for keywords
    tokens = Tokenizer("select").tokenize()
    assert tokens[0].type == TokenType.KEYWORD_SELECT

    # Negative integer
    tokens = Tokenizer("INSERT -5 test_user").tokenize()
    assert tokens[1].type == TokenType.INTEGER
    assert tokens[1].value == "-5"

    # Error: unexpected character
    try:
        Tokenizer("INSERT @invalid").tokenize()
        assert False, "Should have raised TokenError"
    except TokenError as e:
        assert "position 7" in str(e)

    # Whitespace handling
    tokens = Tokenizer("  INSERT   1   alice  ").tokenize()
    assert len(tokens) == 4  # INSERT, 1, alice, EOF
    assert tokens[0].type == TokenType.KEYWORD_INSERT

    print("All tokenizer tests passed.")


test_tokenizer()

The Parser

With tokens in hand, the parser validates their sequence and produces a Statement object:

class Parser:
    """Parses a token list into a Statement (AST node)."""

    def __init__(self, tokens: list[Token]) -> None:
        self.tokens = tokens
        self.pos = 0

    def parse(self) -> InsertStatement | SelectStatement | MetaCommand:
        """Parse the token stream into a single statement."""
        token = self._current()

        if token.type == TokenType.META_COMMAND:
            return MetaCommand(command=token.value)

        if token.type == TokenType.KEYWORD_INSERT:
            return self._parse_insert()

        if token.type == TokenType.KEYWORD_SELECT:
            return SelectStatement()

        raise ParseError(
            f"Expected INSERT, SELECT, or meta-command, "
            f"got {token.type.name} at position {token.position}"
        )

    def _parse_insert(self) -> InsertStatement:
        """Parse: INSERT <id:int> <username:str>"""
        self._advance()  # consume INSERT

        # Expect integer id
        id_token = self._expect(TokenType.INTEGER)
        row_id = int(id_token.value)

        if row_id < 0:
            raise ParseError(
                f"Row id must be non-negative, got {row_id} "
                f"at position {id_token.position}"
            )

        # Expect string username
        username_token = self._expect(TokenType.STRING)

        return InsertStatement(id=row_id, username=username_token.value)

    def _current(self) -> Token:
        if self.pos >= len(self.tokens):
            raise ParseError("Unexpected end of input")
        return self.tokens[self.pos]

    def _advance(self) -> Token:
        token = self._current()
        self.pos += 1
        return token

    def _expect(self, expected_type: TokenType) -> Token:
        token = self._advance()
        if token.type != expected_type:
            raise ParseError(
                f"Expected {expected_type.name}, got {token.type.name} "
                f"('{token.value}') at position {token.position}"
            )
        return token

Recursive descent. This parser is a recursive descent parser with one level of recursion. Each grammar rule (_parse_insert) is a method that consumes tokens left to right. The _expect helper enforces the expected token sequence. If any expectation fails, a ParseError with position information is raised.

Testing the Parser

def test_parser():
    """Verify parsing of valid and invalid inputs."""

    # Valid INSERT
    tokens = Tokenizer("INSERT 1 alice").tokenize()
    stmt = Parser(tokens).parse()
    assert isinstance(stmt, InsertStatement)
    assert stmt.id == 1
    assert stmt.username == "alice"

    # Valid SELECT
    tokens = Tokenizer("SELECT").tokenize()
    stmt = Parser(tokens).parse()
    assert isinstance(stmt, SelectStatement)

    # Meta command
    tokens = Tokenizer(".btree").tokenize()
    stmt = Parser(tokens).parse()
    assert isinstance(stmt, MetaCommand)
    assert stmt.command == ".btree"

    # Error: INSERT without id
    try:
        tokens = Tokenizer("INSERT alice").tokenize()
        Parser(tokens).parse()
        assert False, "Should have raised ParseError"
    except ParseError as e:
        assert "Expected INTEGER" in str(e)

    # Error: negative id
    try:
        tokens = Tokenizer("INSERT -1 alice").tokenize()
        Parser(tokens).parse()
        assert False, "Should have raised ParseError"
    except ParseError as e:
        assert "non-negative" in str(e)

    # Error: unknown command
    try:
        tokens = Tokenizer("DELETE 1").tokenize()
        Parser(tokens).parse()
        assert False, "Should have raised ParseError"
    except ParseError:
        pass

    print("All parser tests passed.")


test_parser()

The tokenizer and parser transform text into structured Statement objects. The next section connects these statements to the B-Tree engine — the final integration step.