tokenizing string literals and identifiers

parent a245a5be96
commit f8f044a7a2

5 changed files with 99 additions and 11 deletions

@@ -1,3 +1,10 @@
+; this is the format of the executables we produce:
+; elf header + code 4MB addresses 0x400000-0x7fffff
+; read-only data 4MB addresses 0x800000-0xbfffff
+; read-write data 4MB addresses 0xc00000-0xffffff
+#define RODATA_OFFSET 0x400000
+#define RODATA_ADDR 0x800000
+
 ; C OPERATOR PRECEDENCE
 ; lowest
 ; 1 ,
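
The new header comments pin down a fixed layout: the read-only data section starts at file offset RODATA_OFFSET (0x400000) and is loaded at address RODATA_ADDR (0x800000), so bytes written at file offset off end up at run-time address RODATA_ADDR + (off - RODATA_OFFSET). A minimal C sketch of that mapping (illustrative only; the helper name is not from the repo):

    #include <stdio.h>

    #define RODATA_OFFSET 0x400000 /* file offset where read-only data starts */
    #define RODATA_ADDR   0x800000 /* address it is mapped to at run time */

    /* hypothetical helper: load address of a byte written at file offset `off` */
    static unsigned long rodata_addr_of(unsigned long off) {
        return RODATA_ADDR + (off - RODATA_OFFSET);
    }

    int main(void) {
        /* e.g. a string written at file offset 0x400010 will live at 0x800010 */
        printf("0x%lx\n", rodata_addr_of(0x400010));
        return 0;
    }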

@@ -69,7 +76,7 @@
 #define TOKEN_CONSTANT_FLOAT 2
 #define TOKEN_CONSTANT_INT 3
 #define TOKEN_CONSTANT_CHAR 4
-#define TOKEN_STRING 5
+#define TOKEN_STRING_LITERAL 5
 
 ; these are stored in the "info" field of the token
 #define NUMBER_NO_SUFFIX 0

@@ -8,6 +8,8 @@ byte 0
 byte 0
 goto main
 
+global output_fd
+
 
 global object_macros_size
 global function_macros_size

@@ -78,6 +80,7 @@ function main
 local processed_pptokens
 local tokens
 
+
 dat_banned_objmacros = 255
 dat_banned_fmacros = 255
 

@@ -104,9 +107,15 @@ function main
 print_separator()
 ;print_object_macros()
 ;print_function_macros()
+
+output_fd = open_w(output_filename)
+rodata_end_offset = RODATA_OFFSET
+
 tokens = malloc(16000000)
 tokenize(pptokens, tokens)
 print_tokens(tokens)
+; NOTE: do NOT free pptokens as identifiers still reference them.
+
 exit(0)
 
 :usage_error
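
The NOTE above is the ownership rule this commit introduces: identifier tokens keep a pointer into the pptoken text rather than copying it, so the pptoken buffer has to stay allocated for as long as the token array is used. A tiny C sketch of that aliasing (struct and values are illustrative, not the compiler's):

    #include <stdlib.h>
    #include <string.h>

    struct token { int type; const char *name; }; /* name borrows from the pptoken buffer */

    int main(void) {
        char *pptokens = malloc(16);
        if (!pptokens) return 1;
        strcpy(pptokens, "main");
        struct token t = { 0 /* would be TOKEN_IDENTIFIER */, pptokens };
        /* free(pptokens) here would leave t.name dangling -- hence the NOTE in main.b */
        (void)t;
        free(pptokens);
        return 0;
    }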

05/main.c (11 lines changed)

@@ -1,7 +1,4 @@
-'\xfa' 'w' 'e' 'l' 'l' '\'' '\\'
-sizeof(int)
-0x332l
-0xffffffffffffffff
-0755u
-double * = &;
-
+"Hello ther" "e good fellow."
+char * = "How are you"" d""o""i""ng today?\n";
+hi
+_TEST _ING _1

@@ -96,6 +96,10 @@ function get_keyword_str
 string @BAD_KEYWORD_ID
 byte 0
 
+
+; file offset to write next piece of read-only data; initialized in main.b
+global rodata_end_offset
+
 ; turn pptokens into tokens, written to out.
 ; tokens are 16 bytes and have the following format:
 ; uchar type

@@ -104,6 +108,7 @@ function get_keyword_str
 ; uint line
 ; ulong data
 ; This corresponds to translation phases 5-6 and the first half of 7
+; IMPORTANT: this function uses pointers to pptokens, so they should NOT be freed!
 function tokenize
 argument pptokens
 argument out
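
The comments above describe each token as a fixed 16-byte record: a type byte, an "info" byte (NUMBER_NO_SUFFIX and friends), the source line, and a 64-bit data field at offset 8 (print_tokens later reads it via s = p + 8). A hedged C picture of that layout; the exact field names and the padding are assumptions, since two of the comment lines fall outside these hunks:

    #include <stdint.h>
    #include <assert.h>

    struct token {       /* assumed C view of the 16-byte token record */
        uint8_t  type;   /* TOKEN_*; values > 20 are keywords per print_tokens */
        uint8_t  info;   /* e.g. NUMBER_NO_SUFFIX; "no info" for identifiers/strings */
        uint8_t  pad[2]; /* assumed padding so that line starts at offset 4 */
        uint32_t line;   /* source line number */
        uint64_t data;   /* value, rodata address, or pointer to identifier text */
    };

    static_assert(sizeof(struct token) == 16, "tokens are 16 bytes");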

@@ -113,6 +118,7 @@ function tokenize
 local b
 local c
 local n
+local p
 local data
 
 in = pptokens

@@ -122,6 +128,7 @@ function tokenize
 if c == 32 goto tokenize_skip_pptoken
 if c == 10 goto tokenize_newline
 if c == '' goto tokenize_constant_char
+if c == '" goto tokenize_string_literal
 if c == 0 goto tokenize_loop_end
 
 b = get_keyword_id(in)

@@ -130,7 +137,22 @@ function tokenize
 b = isdigit_or_dot(c)
 if b != 0 goto tokenize_number
 
-byte 0xcc
+; it's an identifier. we just need to make sure it's made up of identifier characters.
+p = in
+b = isalpha_or_underscore(*1p)
+if b == 0 goto bad_token
+
+:ident_check_loop
+b = isalnum_or_underscore(*1p)
+if b == 0 goto bad_token
+p += 1
+if *1p != 0 goto ident_check_loop
+; all good.
+*1out = TOKEN_IDENTIFIER
+out += 2 ; no info
+data = in ; data will point to the identifier name
+pptoken_skip(&in)
+goto token_output
 
 :tokenize_newline
 line_number += 1
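
With keywords and numbers ruled out, the pptoken is treated as an identifier: the first character must be a letter or underscore, every following character a letter, digit, or underscore, and anything else jumps to the new bad_token error; the token then gets TOKEN_IDENTIFIER, no info byte, and data pointing at the NUL-terminated pptoken text. Roughly the same check in C (helper names mirror the .b routines; this is a sketch, not the compiler's code):

    #include <ctype.h>
    #include <stdbool.h>

    /* counterparts of isalpha_or_underscore / isalnum_or_underscore */
    static bool is_ident_start(unsigned char c) { return isalpha(c) || c == '_'; }
    static bool is_ident_char(unsigned char c)  { return isalnum(c) || c == '_'; }

    /* true if s is a valid identifier, mirroring ident_check_loop above */
    static bool is_identifier(const char *s) {
        if (!is_ident_start((unsigned char)*s)) return false;
        for (; *s; s++)
            if (!is_ident_char((unsigned char)*s)) return false;
        return true;
    }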

@@ -217,7 +239,28 @@ function tokenize
 :tokenize_float
 ; @TODO
 byte 0xcc
-
+:tokenize_string_literal
+n = rodata_end_offset - RODATA_OFFSET
+n += RODATA_ADDR ; address of string
+lseek(output_fd, rodata_end_offset, SEEK_SET)
+:string_literal_loop
+in += 1 ; skip opening "
+:string_literal_char_loop
+if *1in == '" goto string_literal_char_loop_end
+c = read_c_char(&in)
+if c ] 255 goto bad_char_in_string
+fputc(output_fd, c)
+goto string_literal_char_loop
+:string_literal_char_loop_end
+pptoken_skip(&in) ; skip closing "
+pptoken_skip_spaces(&in)
+if *1in == '" goto string_literal_loop ; string concatenation, e.g. "Hello, " "world!"
+fputc(output_fd, 0) ; null terminator
+rodata_end_offset = lseek(output_fd, 0, SEEK_CUR)
+*1out = TOKEN_STRING_LITERAL
+out += 2 ; no info
+data = n
+goto token_output
 :tokenize_loop_end
 
 return 0
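
tokenize_string_literal is the core of the commit: the literal's characters are decoded one at a time with read_c_char and appended to the read-only data region of the output file at rodata_end_offset, adjacent literals are concatenated, a terminating NUL is written, the new end offset is read back with lseek(output_fd, 0, SEEK_CUR), and the token's data field records the address the bytes will have once loaded (RODATA_ADDR + old offset - RODATA_OFFSET). A condensed C sketch of the same idea for a single literal, using stdio instead of raw syscalls and skipping escape decoding and concatenation; names and signature are illustrative:

    #include <stdio.h>

    #define RODATA_OFFSET 0x400000
    #define RODATA_ADDR   0x800000

    /* Append one string literal body (src points past the opening quote) to the
     * rodata region of `out` and return the address it will have at run time. */
    static unsigned long emit_string_literal(FILE *out, long *rodata_end_offset,
                                             const char *src) {
        unsigned long addr = RODATA_ADDR + (unsigned long)(*rodata_end_offset - RODATA_OFFSET);
        fseek(out, *rodata_end_offset, SEEK_SET);
        while (*src && *src != '"')
            fputc(*src++, out);
        fputc(0, out);                   /* NUL terminator */
        *rodata_end_offset = ftell(out); /* like lseek(fd, 0, SEEK_CUR) */
        return addr;
    }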

@@ -236,7 +279,17 @@ function tokenize
 :str_bad_char_constant
 string Bad character constant. Note that multibyte constants are not supported.
 byte 0
+:bad_char_in_string
+compile_error(file, line_number, .str_bad_char_in_string)
+:str_bad_char_in_string
+string Bad character in string literal.
+byte 0
+:bad_token
+compile_error(file, line_number, .str_bad_token)
+:str_bad_token
+string Bad token.
+byte 0
 
 ; return character or escaped character from *p_in, advancing accordingly
 ; returns -1 on bad character
 function read_c_char
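
read_c_char (already present before this commit) is what keeps escapes like '\n' and '\xfa' out of the string loop above: it returns the decoded character, advances the input pointer past whatever the escape consumed, and returns -1 on a bad character, which the new bad_char_in_string path reports. An illustrative C analogue handling only a small subset of escapes (not the actual implementation):

    /* read one possibly-escaped character from *p_in, advancing *p_in; -1 on error */
    static int read_c_char_sketch(const char **p_in) {
        const char *p = *p_in;
        int c = (unsigned char)*p++;
        if (c == '\\') {
            switch (*p++) {
            case 'n':  c = '\n'; break;
            case '\\': c = '\\'; break;
            case '\'': c = '\''; break;
            case '"':  c = '"';  break;
            case '0':  c = 0;    break;
            default:   return -1; /* unrecognized escape */
            }
        }
        *p_in = p;
        return c;
    }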

@@ -390,6 +443,8 @@ function print_tokens
 if *1p > 20 goto print_token_keyword
 if *1p == TOKEN_CONSTANT_INT goto print_token_int
 if *1p == TOKEN_CONSTANT_CHAR goto print_token_char
+if *1p == TOKEN_STRING_LITERAL goto print_token_string_literal
+if *1p == TOKEN_IDENTIFIER goto print_token_identifier
 fputs(2, .str_print_bad_token)
 exit(1)
 :print_token_keyword

@@ -402,6 +457,13 @@ function print_tokens
 :print_token_char
 puts(.str_constant_char)
 goto print_token_data
+:print_token_string_literal
+puts(.str_string_literal)
+goto print_token_data
+:print_token_identifier
+s = p + 8
+puts(*8s)
+goto print_token_data
 :print_token_info
 p += 1
 putc('~)

@@ -429,6 +491,9 @@ function print_tokens
 :str_constant_char
 string character
 byte 0
+:str_string_literal
+string string
+byte 0
 :str_print_bad_token
 string Unrecognized token type in print_tokens. Aborting.
 byte 10

05/util.b (10 lines changed)

@@ -351,6 +351,16 @@ function close
 syscall(3, fd)
 return
 
+#define SEEK_SET 0
+#define SEEK_CUR 1
+#define SEEK_END 2
+
+function lseek
+argument fd
+argument offset
+argument whence
+return syscall(8, fd, offset, whence)
+
 function isupper
 argument c
 if c < 'A goto return_0
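
util.b gains the SEEK_* constants and an lseek wrapper over syscall 8 (the x86-64 Linux lseek number); tokenize uses it to position the output file at rodata_end_offset before writing string bytes and to read the new offset back afterwards. The equivalent pattern in plain C with the POSIX lseek, purely as an illustration:

    #include <fcntl.h>
    #include <unistd.h>

    int main(void) {
        /* hypothetical scratch file; the compiler writes to its own output instead */
        int fd = open("out.tmp", O_RDWR | O_CREAT | O_TRUNC, 0644);
        if (fd < 0) return 1;

        off_t rodata_end_offset = 0x400000;         /* RODATA_OFFSET */
        lseek(fd, rodata_end_offset, SEEK_SET);     /* seek to the current end of rodata */
        write(fd, "hi", 3);                         /* string bytes plus NUL terminator */
        rodata_end_offset = lseek(fd, 0, SEEK_CUR); /* read back the new end offset */

        close(fd);
        return 0;
    }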