tokenizing string literals and identifiers

This commit is contained in:
pommicket 2022-01-11 15:55:37 -05:00
parent a245a5be96
commit f8f044a7a2
5 changed files with 99 additions and 11 deletions

View file

@@ -1,3 +1,10 @@
; this is the format of the executables we produce:
; elf header + code 4MB addresses 0x400000-0x7fffff
; read-only data 4MB addresses 0x800000-0xbfffff
; read-write data 4MB addresses 0xc00000-0xffffff
#define RODATA_OFFSET 0x400000
#define RODATA_ADDR 0x800000
; C OPERATOR PRECEDENCE ; C OPERATOR PRECEDENCE
; lowest ; lowest
; 1 , ; 1 ,
@@ -69,7 +76,7 @@
#define TOKEN_CONSTANT_FLOAT 2 #define TOKEN_CONSTANT_FLOAT 2
#define TOKEN_CONSTANT_INT 3 #define TOKEN_CONSTANT_INT 3
#define TOKEN_CONSTANT_CHAR 4 #define TOKEN_CONSTANT_CHAR 4
#define TOKEN_STRING 5 #define TOKEN_STRING_LITERAL 5
; these are stored in the "info" field of the token ; these are stored in the "info" field of the token
#define NUMBER_NO_SUFFIX 0 #define NUMBER_NO_SUFFIX 0

View file

@@ -8,6 +8,8 @@ byte 0
byte 0 byte 0
goto main goto main
global output_fd
global object_macros_size global object_macros_size
global function_macros_size global function_macros_size
@@ -78,6 +80,7 @@ function main
local processed_pptokens local processed_pptokens
local tokens local tokens
dat_banned_objmacros = 255 dat_banned_objmacros = 255
dat_banned_fmacros = 255 dat_banned_fmacros = 255
@@ -104,9 +107,15 @@ function main
print_separator() print_separator()
;print_object_macros() ;print_object_macros()
;print_function_macros() ;print_function_macros()
output_fd = open_w(output_filename)
rodata_end_offset = RODATA_OFFSET
tokens = malloc(16000000) tokens = malloc(16000000)
tokenize(pptokens, tokens) tokenize(pptokens, tokens)
print_tokens(tokens) print_tokens(tokens)
; NOTE: do NOT free pptokens as identifiers still reference them.
exit(0) exit(0)
:usage_error :usage_error

View file

@@ -1,7 +1,4 @@
'\xfa' 'w' 'e' 'l' 'l' '\'' '\\' "Hello ther" "e good fellow."
sizeof(int) char * = "How are you"" d""o""i""ng today?\n";
0x332l hi
0xffffffffffffffff _TEST _ING _1
0755u
double * = &;

View file

@@ -96,6 +96,10 @@ function get_keyword_str
string @BAD_KEYWORD_ID string @BAD_KEYWORD_ID
byte 0 byte 0
; file offset to write next piece of read-only data; initialized in main.b
global rodata_end_offset
; turn pptokens into tokens, written to out. ; turn pptokens into tokens, written to out.
; tokens are 16 bytes and have the following format: ; tokens are 16 bytes and have the following format:
; uchar type ; uchar type
@@ -104,6 +108,7 @@ function get_keyword_str
; uint line ; uint line
; ulong data ; ulong data
; This corresponds to translation phases 5-6 and the first half of 7 ; This corresponds to translation phases 5-6 and the first half of 7
; IMPORTANT: this function uses pointers to pptokens, so they should NOT be freed!
function tokenize function tokenize
argument pptokens argument pptokens
argument out argument out
@@ -113,6 +118,7 @@ function tokenize
local b local b
local c local c
local n local n
local p
local data local data
in = pptokens in = pptokens
@@ -122,6 +128,7 @@ function tokenize
if c == 32 goto tokenize_skip_pptoken if c == 32 goto tokenize_skip_pptoken
if c == 10 goto tokenize_newline if c == 10 goto tokenize_newline
if c == '' goto tokenize_constant_char if c == '' goto tokenize_constant_char
if c == '" goto tokenize_string_literal
if c == 0 goto tokenize_loop_end if c == 0 goto tokenize_loop_end
b = get_keyword_id(in) b = get_keyword_id(in)
@@ -130,7 +137,22 @@ function tokenize
b = isdigit_or_dot(c) b = isdigit_or_dot(c)
if b != 0 goto tokenize_number if b != 0 goto tokenize_number
byte 0xcc ; it's an identifier. we just need to make sure it's made up of identifier characters.
p = in
b = isalpha_or_underscore(*1p)
if b == 0 goto bad_token
:ident_check_loop
b = isalnum_or_underscore(*1p)
if b == 0 goto bad_token
p += 1
if *1p != 0 goto ident_check_loop
; all good.
*1out = TOKEN_IDENTIFIER
out += 2 ; no info
data = in ; data will point to the identifier name
pptoken_skip(&in)
goto token_output
:tokenize_newline :tokenize_newline
line_number += 1 line_number += 1
@@ -217,7 +239,28 @@ function tokenize
:tokenize_float :tokenize_float
; @TODO ; @TODO
byte 0xcc byte 0xcc
:tokenize_string_literal
n = rodata_end_offset - RODATA_OFFSET
n += RODATA_ADDR ; address of string
lseek(output_fd, rodata_end_offset, SEEK_SET)
:string_literal_loop
in += 1 ; skip opening "
:string_literal_char_loop
if *1in == '" goto string_literal_char_loop_end
c = read_c_char(&in)
if c ] 255 goto bad_char_in_string
fputc(output_fd, c)
goto string_literal_char_loop
:string_literal_char_loop_end
pptoken_skip(&in) ; skip closing "
pptoken_skip_spaces(&in)
if *1in == '" goto string_literal_loop ; string concatenation, e.g. "Hello, " "world!"
fputc(output_fd, 0) ; null terminator
rodata_end_offset = lseek(output_fd, 0, SEEK_CUR)
*1out = TOKEN_STRING_LITERAL
out += 2 ; no info
data = n
goto token_output
:tokenize_loop_end :tokenize_loop_end
return 0 return 0
@@ -236,6 +279,16 @@ function tokenize
:str_bad_char_constant :str_bad_char_constant
string Bad character constant. Note that multibyte constants are not supported. string Bad character constant. Note that multibyte constants are not supported.
byte 0 byte 0
:bad_char_in_string
compile_error(file, line_number, .str_bad_char_in_string)
:str_bad_char_in_string
string Bad character in string literal.
byte 0
:bad_token
compile_error(file, line_number, .str_bad_token)
:str_bad_token
string Bad token.
byte 0
; return character or escaped character from *p_in, advancing accordingly ; return character or escaped character from *p_in, advancing accordingly
; returns -1 on bad character ; returns -1 on bad character
@@ -390,6 +443,8 @@ function print_tokens
if *1p > 20 goto print_token_keyword if *1p > 20 goto print_token_keyword
if *1p == TOKEN_CONSTANT_INT goto print_token_int if *1p == TOKEN_CONSTANT_INT goto print_token_int
if *1p == TOKEN_CONSTANT_CHAR goto print_token_char if *1p == TOKEN_CONSTANT_CHAR goto print_token_char
if *1p == TOKEN_STRING_LITERAL goto print_token_string_literal
if *1p == TOKEN_IDENTIFIER goto print_token_identifier
fputs(2, .str_print_bad_token) fputs(2, .str_print_bad_token)
exit(1) exit(1)
:print_token_keyword :print_token_keyword
@@ -402,6 +457,13 @@ function print_tokens
:print_token_char :print_token_char
puts(.str_constant_char) puts(.str_constant_char)
goto print_token_data goto print_token_data
:print_token_string_literal
puts(.str_string_literal)
goto print_token_data
:print_token_identifier
s = p + 8
puts(*8s)
goto print_token_data
:print_token_info :print_token_info
p += 1 p += 1
putc('~) putc('~)
@@ -429,6 +491,9 @@ function print_tokens
:str_constant_char :str_constant_char
string character string character
byte 0 byte 0
:str_string_literal
string string
byte 0
:str_print_bad_token :str_print_bad_token
string Unrecognized token type in print_tokens. Aborting. string Unrecognized token type in print_tokens. Aborting.
byte 10 byte 10

View file

@@ -351,6 +351,16 @@ function close
syscall(3, fd) syscall(3, fd)
return return
#define SEEK_SET 0
#define SEEK_CUR 1
#define SEEK_END 2
function lseek
argument fd
argument offset
argument whence
return syscall(8, fd, offset, whence)
function isupper function isupper
argument c argument c
if c < 'A goto return_0 if c < 'A goto return_0