自定义分词器
test.c
#include <ctype.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Kinds of tokens produced by the tokenizer.
typedef enum {
  TK_PUNCT, // Punctuators
  TK_NUM,   // Numeric literals
  TK_EOF,   // End-of-file markers
} TokenKind;

// Token type
// Tokens form a singly linked list through `next`; `loc`/`len` describe the
// token's text as a span inside the input string.
typedef struct Token Token;
struct Token {
  TokenKind kind; // Token kind
  Token *next;    // Next token
  int val;        // If kind is TK_NUM, its value
  char *loc;      // Token location
  int len;        // Token length
};

// Input string
static char *current_input;

// Reports an error and exit.
// Prints the printf-style message `fmt` (with its arguments) followed by a
// newline to stderr, then terminates the process with status 1.
static void error(char *fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  vfprintf(stderr, fmt, ap);
  va_end(ap); // every va_start is paired with va_end in the same function
  fprintf(stderr, "\n");
  exit(1);
}

// Reports an error location and exit.
// Output layout on stderr:
//   <the whole input string>
//   <pos spaces>^ <formatted message>
// where pos is the offset of `loc` from the start of current_input.
// Terminates the process with status 1.
static void verror_at(char *loc, char *fmt, va_list ap) {
  int pos = (int)(loc - current_input);
  fprintf(stderr, "%s\n", current_input);
  fprintf(stderr, "%*s", pos, ""); // print pos spaces.
  fprintf(stderr, "^ ");
  vfprintf(stderr, fmt, ap);
  fprintf(stderr, "\n");
  exit(1);
}

// Reports an error at input position `loc` with a printf-style message and
// exits. verror_at() ends in exit(1), so control never comes back here.
static void error_at(char *loc, char *fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  verror_at(loc, fmt, ap);
}

// Same as error_at(), but takes the position from `tok->loc`.
static void error_tok(Token *tok, char *fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  verror_at(tok->loc, fmt, ap);
}

// Returns true if the text of the current token is exactly `op`.
// Compares the token text (tok->loc, tok->len) with the NUL-terminated string
// `op`. The token list is not advanced; only a boolean is returned.
static bool equal(Token *tok, char *op) {
  // Check the length first so memcmp never reads beyond the end of `op`
  // when the token is longer than the operator string.
  size_t op_len = strlen(op);
  return op_len == (size_t)tok->len && memcmp(tok->loc, op, op_len) == 0;
}

// Ensure that the current token is `s`.
// On a match, returns the token that follows `tok`; otherwise reports
// "expected '<s>'" at the token's location through error_tok().
static Token *skip(Token *tok, char *s) {
  if (equal(tok, s)) {
    return tok->next;
  }

  error_tok(tok, "expected '%s'", s);
  return tok->next;
}

// Ensure that the current token is TK_NUM.
// Returns the numeric value stored in `tok`; any other token kind is reported
// as "expected a number" through error_tok().
static int get_number(Token *tok) {
  if (tok->kind == TK_NUM) {
    return tok->val;
  }

  error_tok(tok, "expected a number");
  return tok->val;
}

// Create a new token.
// The token is allocated zero-filled, gets the given kind, and covers the
// bytes [start, end) of the input. The caller links it into the list.
static Token *new_token(TokenKind kind, char *start, char *end) {
  Token *tok = calloc(1, sizeof *tok);
  if (!tok) {
    // Allocation failure is fatal: message on stderr and exit status 1,
    // like the other error paths in this file.
    fprintf(stderr, "out of memory\n");
    exit(1);
  }

  tok->kind = kind;
  tok->loc = start;
  tok->len = (int)(end - start);
  return tok;
}

// Tokenizes `current_input` and returns the new token list.
static Token *tokenize(void) {char *p = current_input;Token head = {};Token *cur = &head;while (*p) {// Skip whitespace characters.if (isspace(*p)) {p++;continue;}// Numeric literalif (isdigit(*p)) {cur = cur->next = new_token(TK_NUM, p, p);char *q = p;cur->val = strtoul(p, &p, 10);cur->len = p - q;continue;}// Punctuatorif (*p == '+' || *p == '-') {cur = cur->next = new_token(TK_PUNCT, p, p + 1);p++;continue;}error_at(p, "invalid token");}cur = cur->next = new_token(TK_EOF, p, p);return head.next;
}int main(int argc, char **argv) {if (argc != 2)error("%s: invalid number of arguments", argv[0]);current_input = argv[1];Token *tok = tokenize();printf(" .globl main\n");printf("main:\n");// The first token must be a numberprintf(" mov $%d, %%rax\n", get_number(tok));tok = tok->next;// ... followed by either `+ <number>` or `- <number>`.while (tok->kind != TK_EOF) {if (equal(tok, "+")) {printf(" add $%d, %%rax\n", get_number(tok->next));tok = tok->next->next;continue;}tok = skip(tok, "-");printf(" sub $%d, %%rax\n", get_number(tok));tok = tok->next;}printf(" ret\n");return 0;
}
编译
cc -o test test.c
./test "1 +2 +3" > tmp.s
cc -o tmp tmp.s
./tmp
echo $?
错误处理
./test "1 + 2 * +3" > tmp.s
1 + 2 * +3
      ^ invalid token