better keyword tokenization system

This commit is contained in:
pommicket 2022-01-11 11:53:36 -05:00
parent 97dca844c8
commit fc96e22d4f
3 changed files with 326 additions and 484 deletions

View file

@ -53,6 +53,49 @@ function file_add
*1p = 255
return
; Look up keyword_str in the keyword table.
; The table is walked as a sequence of entries, each made of one ID byte
; followed by a null-terminated string; an ID byte of 255 ends the table.
; Returns the ID byte of the entry whose string equals keyword_str,
; or 0 if no entry matches (i.e. keyword_str is not a keyword).
function get_keyword_id
argument keyword_str
local entry
local id
local matched
entry = .keyword_table
:keyword_id_loop
id = *1entry
if id == 255 goto no_such_keyword_str
; the entry's string begins right after its ID byte
entry += 1
matched = str_equals(keyword_str, entry)
if matched != 0 goto got_keyword_id
; not this one: move to the byte after the string's null terminator
; (memchr is expected to return the address of the first 0 byte)
entry = memchr(entry, 0)
entry += 1
goto keyword_id_loop
:got_keyword_id
return id
:no_such_keyword_str
return 0
; Get the string associated with keyword_id.
; The keyword table is walked as a sequence of entries, each made of one ID
; byte followed by a null-terminated string; an ID byte of 255 ends the table.
; Returns a pointer to the string of the first entry whose ID byte equals
; keyword_id, or a pointer to "@BAD_KEYWORD_ID" if there is no such entry.
; Since the end-of-table check comes first, keyword_id = 255 never matches.
function get_keyword_str
argument keyword_id
local p
local c
p = .keyword_table
:keyword_str_loop
c = *1p
if c == 255 goto no_such_keyword_id
if c == keyword_id goto found_keyword_id
; skip this entry: find its string's null terminator and step past it.
; the search starts at the ID byte itself, which relies on ID bytes
; in the table never being 0.
p = memchr(p, 0)
p += 1
goto keyword_str_loop
:found_keyword_id
; p is at the ID byte; the string starts one byte later
return p + 1
:no_such_keyword_id
return .str_no_such_keyword_id
; fallback string returned for unknown IDs (null-terminated)
:str_no_such_keyword_id
string @BAD_KEYWORD_ID
byte 0
; turn pptokens into tokens, written to out.
; tokens are 16 bytes and have the following format:
; ushort type
@ -73,146 +116,8 @@ function tokenize
if *1in == 10 goto tokenize_newline
if *1in == 0 goto tokenize_loop_end
b = str_equals(in, .str_comma)
if b != 0 goto keyword_comma
b = str_equals(in, .str_eq)
if b != 0 goto keyword_eq
b = str_equals(in, .str_plus_eq)
if b != 0 goto keyword_plus_eq
b = str_equals(in, .str_minus_eq)
if b != 0 goto keyword_minus_eq
b = str_equals(in, .str_times_eq)
if b != 0 goto keyword_times_eq
b = str_equals(in, .str_div_eq)
if b != 0 goto keyword_div_eq
b = str_equals(in, .str_percent_eq)
if b != 0 goto keyword_percent_eq
b = str_equals(in, .str_lshift_eq)
if b != 0 goto keyword_rshift_eq
b = str_equals(in, .str_and_eq)
if b != 0 goto keyword_and_eq
b = str_equals(in, .str_or_eq)
if b != 0 goto keyword_or_eq
b = str_equals(in, .str_question)
if b != 0 goto keyword_question
b = str_equals(in, .str_or_or)
if b != 0 goto keyword_or_or
b = str_equals(in, .str_and_and)
if b != 0 goto keyword_and_and
b = str_equals(in, .str_or)
if b != 0 goto keyword_or
b = str_equals(in, .str_xor)
if b != 0 goto keyword_xor
b = str_equals(in, .str_and)
if b != 0 goto keyword_and
b = str_equals(in, .str_eq_eq)
if b != 0 goto keyword_eq_eq
b = str_equals(in, .str_not_eq)
if b != 0 goto keyword_not_eq
b = str_equals(in, .str_lt)
if b != 0 goto keyword_lt
b = str_equals(in, .str_gt)
if b != 0 goto keyword_gt
b = str_equals(in, .str_lt_eq)
if b != 0 goto keyword_lt_eq
b = str_equals(in, .str_gt_eq)
if b != 0 goto keyword_gt_eq
b = str_equals(in, .str_lshift)
if b != 0 goto keyword_lshift
b = str_equals(in, .str_rshift)
if b != 0 goto keyword_rshift
b = str_equals(in, .str_plus)
if b != 0 goto keyword_plus
b = str_equals(in, .str_minus)
if b != 0 goto keyword_minus
b = str_equals(in, .str_times)
if b != 0 goto keyword_times
b = str_equals(in, .str_div)
if b != 0 goto keyword_div
b = str_equals(in, .str_percent)
if b != 0 goto keyword_percent
b = str_equals(in, .str_plus_plus)
if b != 0 goto keyword_plus_plus
b = str_equals(in, .str_minus_minus)
if b != 0 goto keyword_minus_minus
b = str_equals(in, .str_not)
if b != 0 goto keyword_not
b = str_equals(in, .str_tilde)
if b != 0 goto keyword_tilde
b = str_equals(in, .str_arrow)
if b != 0 goto keyword_arrow
b = str_equals(in, .str_dotdotdot)
if b != 0 goto keyword_dotdotdot
b = str_equals(in, .str_colon)
if b != 0 goto keyword_colon
b = str_equals(in, .str_lbrace)
if b != 0 goto keyword_lbrace
b = str_equals(in, .str_rbrace)
if b != 0 goto keyword_rbrace
b = str_equals(in, .str_lsquare)
if b != 0 goto keyword_lsquare
b = str_equals(in, .str_rsquare)
if b != 0 goto keyword_rsquare
b = str_equals(in, .str_lparen)
if b != 0 goto keyword_lparen
b = str_equals(in, .str_rparen)
if b != 0 goto keyword_rparen
b = str_equals(in, .str_semicolon)
if b != 0 goto keyword_semicolon
b = str_equals(in, .str_double)
if b != 0 goto keyword_double
b = str_equals(in, .str_int)
if b != 0 goto keyword_int
b = str_equals(in, .str_struct)
if b != 0 goto keyword_struct
b = str_equals(in, .str_break)
if b != 0 goto keyword_break
b = str_equals(in, .str_else)
if b != 0 goto keyword_else
b = str_equals(in, .str_long)
if b != 0 goto keyword_long
b = str_equals(in, .str_switch)
if b != 0 goto keyword_switch
b = str_equals(in, .str_case)
if b != 0 goto keyword_case
b = str_equals(in, .str_enum)
if b != 0 goto keyword_enum
b = str_equals(in, .str_typedef)
if b != 0 goto keyword_typedef
b = str_equals(in, .str_char)
if b != 0 goto keyword_char
b = str_equals(in, .str_extern)
if b != 0 goto keyword_extern
b = str_equals(in, .str_return)
if b != 0 goto keyword_return
b = str_equals(in, .str_union)
if b != 0 goto keyword_union
b = str_equals(in, .str_float)
if b != 0 goto keyword_float
b = str_equals(in, .str_short)
if b != 0 goto keyword_short
b = str_equals(in, .str_unsigned)
if b != 0 goto keyword_unsigned
b = str_equals(in, .str_continue)
if b != 0 goto keyword_continue
b = str_equals(in, .str_for)
if b != 0 goto keyword_for
b = str_equals(in, .str_void)
if b != 0 goto keyword_void
b = str_equals(in, .str_default)
if b != 0 goto keyword_default
b = str_equals(in, .str_goto)
if b != 0 goto keyword_goto
b = str_equals(in, .str_sizeof)
if b != 0 goto keyword_sizeof
b = str_equals(in, .str_do)
if b != 0 goto keyword_do
b = str_equals(in, .str_if)
if b != 0 goto keyword_if
b = str_equals(in, .str_static)
if b != 0 goto keyword_static
b = str_equals(in, .str_while)
if b != 0 goto keyword_while
b = get_keyword_id(in)
if b != 0 goto tokenize_keyword
byte 0xcc
@ -243,222 +148,6 @@ function tokenize
out += 8
pptoken_skip(&in)
goto tokenize_loop
:keyword_comma
b = SYMBOL_COMMA
goto tokenize_keyword
:keyword_eq
b = SYMBOL_EQ
goto tokenize_keyword
:keyword_plus_eq
b = SYMBOL_PLUS_EQ
goto tokenize_keyword
:keyword_minus_eq
b = SYMBOL_MINUS_EQ
goto tokenize_keyword
:keyword_times_eq
b = SYMBOL_TIMES_EQ
goto tokenize_keyword
:keyword_div_eq
b = SYMBOL_DIV_EQ
goto tokenize_keyword
:keyword_percent_eq
b = SYMBOL_PERCENT_EQ
goto tokenize_keyword
:keyword_lshift_eq
b = SYMBOL_LSHIFT_EQ
goto tokenize_keyword
:keyword_rshift_eq
b = SYMBOL_RSHIFT_EQ
goto tokenize_keyword
:keyword_and_eq
b = SYMBOL_AND_EQ
goto tokenize_keyword
:keyword_xor_eq
b = SYMBOL_XOR_EQ
goto tokenize_keyword
:keyword_or_eq
b = SYMBOL_OR_EQ
goto tokenize_keyword
:keyword_question
b = SYMBOL_QUESTION
goto tokenize_keyword
:keyword_or_or
b = SYMBOL_OR_OR
goto tokenize_keyword
:keyword_and_and
b = SYMBOL_AND_AND
goto tokenize_keyword
:keyword_or
b = SYMBOL_OR
goto tokenize_keyword
:keyword_xor
b = SYMBOL_XOR
goto tokenize_keyword
:keyword_and
b = SYMBOL_AND
goto tokenize_keyword
:keyword_eq_eq
b = SYMBOL_EQ_EQ
goto tokenize_keyword
:keyword_not_eq
b = SYMBOL_NOT_EQ
goto tokenize_keyword
:keyword_lt
b = SYMBOL_LT
goto tokenize_keyword
:keyword_gt
b = SYMBOL_GT
goto tokenize_keyword
:keyword_lt_eq
b = SYMBOL_LT_EQ
goto tokenize_keyword
:keyword_gt_eq
b = SYMBOL_GT_EQ
goto tokenize_keyword
:keyword_lshift
b = SYMBOL_LSHIFT
goto tokenize_keyword
:keyword_rshift
b = SYMBOL_RSHIFT
goto tokenize_keyword
:keyword_plus
b = SYMBOL_PLUS
goto tokenize_keyword
:keyword_minus
b = SYMBOL_MINUS
goto tokenize_keyword
:keyword_times
b = SYMBOL_TIMES
goto tokenize_keyword
:keyword_div
b = SYMBOL_DIV
goto tokenize_keyword
:keyword_percent
b = SYMBOL_PERCENT
goto tokenize_keyword
:keyword_plus_plus
b = SYMBOL_PLUS_PLUS
goto tokenize_keyword
:keyword_minus_minus
b = SYMBOL_MINUS_MINUS
goto tokenize_keyword
:keyword_not
b = SYMBOL_NOT
goto tokenize_keyword
:keyword_tilde
b = SYMBOL_TILDE
goto tokenize_keyword
:keyword_arrow
b = SYMBOL_ARROW
goto tokenize_keyword
:keyword_dotdotdot
b = SYMBOL_DOTDOTDOT
goto tokenize_keyword
:keyword_colon
b = SYMBOL_COLON
goto tokenize_keyword
:keyword_lbrace
b = SYMBOL_LBRACE
goto tokenize_keyword
:keyword_rbrace
b = SYMBOL_RBRACE
goto tokenize_keyword
:keyword_lsquare
b = SYMBOL_LSQUARE
goto tokenize_keyword
:keyword_rsquare
b = SYMBOL_RSQUARE
goto tokenize_keyword
:keyword_lparen
b = SYMBOL_LPAREN
goto tokenize_keyword
:keyword_rparen
b = SYMBOL_RPAREN
goto tokenize_keyword
:keyword_semicolon
b = SYMBOL_SEMICOLON
goto tokenize_keyword
:keyword_double
b = KEYWORD_DOUBLE
goto tokenize_keyword
:keyword_int
b = KEYWORD_INT
goto tokenize_keyword
:keyword_struct
b = KEYWORD_STRUCT
goto tokenize_keyword
:keyword_break
b = KEYWORD_BREAK
goto tokenize_keyword
:keyword_else
b = KEYWORD_ELSE
goto tokenize_keyword
:keyword_long
b = KEYWORD_LONG
goto tokenize_keyword
:keyword_switch
b = KEYWORD_SWITCH
goto tokenize_keyword
:keyword_case
b = KEYWORD_CASE
goto tokenize_keyword
:keyword_enum
b = KEYWORD_ENUM
goto tokenize_keyword
:keyword_typedef
b = KEYWORD_TYPEDEF
goto tokenize_keyword
:keyword_char
b = KEYWORD_CHAR
goto tokenize_keyword
:keyword_extern
b = KEYWORD_EXTERN
goto tokenize_keyword
:keyword_return
b = KEYWORD_RETURN
goto tokenize_keyword
:keyword_union
b = KEYWORD_UNION
goto tokenize_keyword
:keyword_float
b = KEYWORD_FLOAT
goto tokenize_keyword
:keyword_short
b = KEYWORD_SHORT
goto tokenize_keyword
:keyword_unsigned
b = KEYWORD_UNSIGNED
goto tokenize_keyword
:keyword_continue
b = KEYWORD_CONTINUE
goto tokenize_keyword
:keyword_for
b = KEYWORD_FOR
goto tokenize_keyword
:keyword_void
b = KEYWORD_VOID
goto tokenize_keyword
:keyword_default
b = KEYWORD_DEFAULT
goto tokenize_keyword
:keyword_goto
b = KEYWORD_GOTO
goto tokenize_keyword
:keyword_sizeof
b = KEYWORD_SIZEOF
goto tokenize_keyword
:keyword_do
b = KEYWORD_DO
goto tokenize_keyword
:keyword_if
b = KEYWORD_IF
goto tokenize_keyword
:keyword_static
b = KEYWORD_STATIC
goto tokenize_keyword
:keyword_while
b = KEYWORD_WHILE
goto tokenize_keyword
:tokenize_loop_end
return 0
@ -466,18 +155,27 @@ function tokenize
; Print the tokens stored at `tokens`, stopping at the first token whose
; 2-byte type field is 0.
; For each token, the fields are read in order as 2, 2, 4 and 8 bytes
; (16 bytes in total), each printed as a number with separator characters
; in between, followed by a space.
; NOTE(review): the 2-byte field after the type is presumably a file index,
; the 4-byte field a line number and the 8-byte field the token's data -- confirm.
function print_tokens
argument tokens
local p
local s
p = tokens
:print_tokens_loop
; a type of 0 marks the end of the token list
if *2p == 0 goto print_tokens_loop_end
putn(*2p)
; only types greater than 20 are handled (looked up as keywords);
; anything else prints an error and exits with status 1.
; NOTE(review): the 2 passed to fputs is presumably the stderr file descriptor -- confirm.
if *2p > 20 goto print_token_keyword
fputs(2, .str_print_bad_token)
exit(1)
:print_token_keyword
; print the keyword's text form (get_keyword_str returns "@BAD_KEYWORD_ID" for unknown IDs)
s = get_keyword_str(*2p)
puts(s)
goto print_token_data
:print_token_data
; step over the 2-byte type field
p += 2
putc(':)
putc('@)
; next 2-byte field
putn(*2p)
p += 2
putc(':)
; next 4-byte field
putn(*4p)
p += 4
putc(':)
; 61 is ASCII '='
putc(61)
; final 8-byte field
putn(*8p)
p += 8
; 32 is ASCII space: separates this token from the next one
putc(32)
@ -485,3 +183,6 @@ function print_tokens
:print_tokens_loop_end
putc(10)
return
:str_print_bad_token
string Unrecognized token type in print_tokens. Aborting.
byte 10