tokenizing integer literals

This commit is contained in:
pommicket 2022-01-11 14:03:13 -05:00
parent fc96e22d4f
commit b0837b367e
5 changed files with 221 additions and 31 deletions

View file

@ -71,6 +71,12 @@
#define TOKEN_CONSTANT_CHAR 4 #define TOKEN_CONSTANT_CHAR 4
#define TOKEN_STRING 5 #define TOKEN_STRING 5
; these are stored in the "info" field of the token
#define NUMBER_NO_SUFFIX 0
#define NUMBER_SUFFIX_U 1
#define NUMBER_SUFFIX_L 2
#define NUMBER_SUFFIX_UL 3
#define NUMBER_SUFFIX_F 4
; #define KEYWORD_AUTO 21 (@NONSTANDARD auto only exists in C for legacy reasons and doesn't appear in TCC's source code) ; #define KEYWORD_AUTO 21 (@NONSTANDARD auto only exists in C for legacy reasons and doesn't appear in TCC's source code)
#define KEYWORD_DOUBLE 22 #define KEYWORD_DOUBLE 22

View file

@ -21,7 +21,6 @@ function fprint_filename
argument file argument file
if file ] 65535 goto print_filename_string if file ] 65535 goto print_filename_string
file = file_get(file) file = file_get(file)
fputs(2, file)
; (fallthrough) ; (fallthrough)
:print_filename_string :print_filename_string
fputs(2, file) fputs(2, file)

View file

@ -1,4 +1,7 @@
sizeof(int) sizeof(int)
0x332l
0xffffffffffffffff
0755u
double * = &; double * = &;

View file

@ -98,7 +98,8 @@ function get_keyword_str
; turn pptokens into tokens, written to out. ; turn pptokens into tokens, written to out.
; tokens are 16 bytes and have the following format: ; tokens are 16 bytes and have the following format:
; ushort type ; uchar type
; uchar info
; ushort file ; ushort file
; uint line ; uint line
; ulong data ; ulong data
@ -109,16 +110,24 @@ function tokenize
local file local file
local line_number local line_number
local b local b
local c
local n
local data
in = pptokens in = pptokens
:tokenize_loop :tokenize_loop
if *1in == '$ goto tokenize_line_directive c = *1in
if *1in == 32 goto tokenize_skip_pptoken if c == '$ goto tokenize_line_directive
if *1in == 10 goto tokenize_newline if c == 32 goto tokenize_skip_pptoken
if *1in == 0 goto tokenize_loop_end if c == 10 goto tokenize_newline
if c == 0 goto tokenize_loop_end
b = get_keyword_id(in) b = get_keyword_id(in)
if b != 0 goto tokenize_keyword if b != 0 goto tokenize_keyword
b = isdigit_or_dot(c)
if b != 0 goto tokenize_number
byte 0xcc byte 0xcc
:tokenize_newline :tokenize_newline
@ -137,36 +146,157 @@ function tokenize
file = file_get_index(in) file = file_get_index(in)
pptoken_skip(&in) pptoken_skip(&in)
goto tokenize_loop goto tokenize_loop
:tokenize_keyword :token_no_data
*2out = b ; type data = 0
out += 2 ; (fallthrough)
:token_output ; write token location & data (see local variable data), and continue tokenizing
*2out = file *2out = file
out += 2 out += 2
*4out = line_number *4out = line_number
out += 4 out += 4
; no data *8out = data
out += 8 out += 8
pptoken_skip(&in)
goto tokenize_loop goto tokenize_loop
:tokenize_keyword
pptoken_skip(&in)
*1out = b ; type
; no info for keywords
out += 2
goto token_no_data
:tokenize_number
; first, check if it's a float
b = strchr(in, '.)
if b != 0 goto tokenize_float
b = strchr(in, 'x) ; e may appear in hex integer literals, so we need to check this
if b != 0 goto tokenize_hex_integer
b = strchr(in, 'X)
if b != 0 goto tokenize_hex_integer
b = strchr(in, 'e) ; exponent
if b != 0 goto tokenize_float
b = strchr(in, 'E) ; exponent
if b != 0 goto tokenize_float
if *1in == '0 goto tokenize_octal_integer ; fun fact: in the C89 standard, 0 is considered an octal integer
; plain ol' decimal constant
n = strtoi(&in, 10)
goto tokenize_finish_integer
:tokenize_hex_integer
if *1in != '0 goto bad_number_token
in += 1
c = *1in
c &= 223 ; 223 = ~32 -- remove case
if c != 'X goto bad_number_token
in += 1
n = strtoi(&in, 16)
goto tokenize_finish_integer
:tokenize_octal_integer
in += 1 ; skip 0
n = strtoi(&in, 8)
goto tokenize_finish_integer
:tokenize_finish_integer
c = read_number_suffix(file, line_number, &in)
if c == NUMBER_SUFFIX_F goto f_suffix_on_integer
in += 1 ; move past null separator
*1out = TOKEN_CONSTANT_INT
out += 1
*1out = c ; info = suffix
out += 1
data = n
goto token_output
:tokenize_float
; @TODO
byte 0xcc
:tokenize_loop_end :tokenize_loop_end
return 0 return 0
:f_suffix_on_integer
compile_error(file, line_number, .str_f_suffix_on_integer)
:str_f_suffix_on_integer
string Integer with f suffix.
byte 0
:bad_number_token
compile_error(file, line_number, .str_bad_number_token)
:str_bad_number_token
string Bad number literal.
byte 0
; parse the (optional) suffix which follows the digits of a number literal.
;   file, line_number - location of the literal, only used for reporting errors
;   p_s               - address of an 8-byte pointer to the text right after the digits
; returns NUMBER_NO_SUFFIX, NUMBER_SUFFIX_U, NUMBER_SUFFIX_L, NUMBER_SUFFIX_UL, or NUMBER_SUFFIX_F.
; on return, *p_s points to the null byte which terminates the literal.
; suffix letters are accepted in either case (u/U, l/L, f/F), since C allows e.g. 10UL and 0x1fL.
; anything which isn't a valid suffix followed by a null byte is reported via compile_error.
function read_number_suffix
argument file
argument line_number
argument p_s
local s
local c
local suffix
s = *8p_s
c = *1s
suffix = NUMBER_NO_SUFFIX
if c == 0 goto number_suffix_return
if c == 'u goto number_suffix_u
if c == 'U goto number_suffix_u
if c == 'l goto number_suffix_l
if c == 'L goto number_suffix_l
if c == 'f goto number_suffix_f
if c == 'F goto number_suffix_f
goto bad_number_suffix
:number_suffix_u
; seen u -- it may be followed by l (giving ul), otherwise the literal must end here
s += 1
c = *1s
if c == 'l goto number_suffix_ul
if c == 'L goto number_suffix_ul
if c != 0 goto bad_number_suffix
suffix = NUMBER_SUFFIX_U
goto number_suffix_return
:number_suffix_l
; seen l -- it may be followed by u (lu is treated the same as ul), otherwise the literal must end here
s += 1
c = *1s
if c == 'u goto number_suffix_ul
if c == 'U goto number_suffix_ul
if c != 0 goto bad_number_suffix
suffix = NUMBER_SUFFIX_L
goto number_suffix_return
:number_suffix_ul
; seen both u and l -- nothing else may follow
s += 1
c = *1s
if c != 0 goto bad_number_suffix
suffix = NUMBER_SUFFIX_UL
goto number_suffix_return
:number_suffix_f
; seen f -- nothing else may follow
s += 1
c = *1s
if c != 0 goto bad_number_suffix
suffix = NUMBER_SUFFIX_F
goto number_suffix_return
:number_suffix_return
*8p_s = s
return suffix
:bad_number_suffix
; NOTE(review): string data follows this call directly, so compile_error presumably does not return -- confirm
compile_error(file, line_number, .str_bad_number_suffix)
:str_bad_number_suffix
string Bad number suffix.
byte 0
function print_tokens function print_tokens
argument tokens argument tokens
local p local p
local s local s
p = tokens p = tokens
:print_tokens_loop :print_tokens_loop
if *2p == 0 goto print_tokens_loop_end if *1p == 0 goto print_tokens_loop_end
if *2p > 20 goto print_token_keyword if *1p > 20 goto print_token_keyword
if *1p == TOKEN_CONSTANT_INT goto print_token_int
fputs(2, .str_print_bad_token) fputs(2, .str_print_bad_token)
exit(1) exit(1)
:print_token_keyword :print_token_keyword
s = get_keyword_str(*2p) s = get_keyword_str(*1p)
puts(s) puts(s)
goto print_token_data goto print_token_data
:print_token_int
puts(.str_constant_int)
goto print_token_info
:print_token_info
p += 1
putc('~)
putn(*1p)
p -= 1
:print_token_data :print_token_data
p += 2 p += 2
putc('@) putc('@)
@ -183,6 +313,10 @@ function print_tokens
:print_tokens_loop_end :print_tokens_loop_end
putc(10) putc(10)
return return
:str_constant_int
string integer
byte 0
:str_print_bad_token :str_print_bad_token
string Unrecognized token type in print_tokens. Aborting. string Unrecognized token type in print_tokens. Aborting.
byte 10 byte 10
byte 0

View file

@ -58,25 +58,49 @@ function itos
:itos_loop_end :itos_loop_end
return p return p
; returns the number in the given base at the start of the string, advancing the string past it.
;   p_s  - address of an 8-byte pointer to the string; updated to point at the first character
;          which is not a digit of the given base
;   base - the base to parse in; digits are 0-9, then A-F / a-f (either case) for the values 10-15
; if the string doesn't start with a valid digit, 0 is returned and *p_s is left unchanged.
; no overflow checking is done.
function strtoi
argument p_s
argument base
local s
local c
local n
n = 0
s = *8p_s
:strtoi_loop
c = *1s
; work out which kind of digit c is (if any)
if c < '0 goto strtoi_loop_end
if c <= '9 goto strtoi_decimal_digit
if c < 'A goto strtoi_loop_end
if c <= 'F goto strtoi_upper_hexdigit
if c < 'a goto strtoi_loop_end
if c <= 'f goto strtoi_lower_hexdigit
goto strtoi_loop_end
:strtoi_decimal_digit
c -= '0
goto strtoi_digit
:strtoi_upper_hexdigit
c += 10 - 'A
goto strtoi_digit
:strtoi_lower_hexdigit
c += 10 - 'a
goto strtoi_digit
:strtoi_digit
; c now holds the digit's value; a value which is too large for this base ends the number
if c >= base goto strtoi_loop_end
n *= base
n += c
s += 1
goto strtoi_loop
:strtoi_loop_end
*8p_s = s
return n
; returns the decimal number at the start of the given string
function stoi function stoi
argument s argument s
local p return strtoi(&s, 10)
local n
local c
n = 0
p = s
:stoi_loop
c = *1p
if c < '0 goto stoi_loop_end
if c > '9 goto stoi_loop_end
n *= 10
n += c - '0
p += 1
goto stoi_loop
:stoi_loop_end
return n
function memchr function memchr
argument mem argument mem
@ -90,6 +114,19 @@ function memchr
:memchr_loop_end :memchr_loop_end
return p return p
; find the first occurrence of the character c in the null-terminated string str.
; returns a pointer to it, or 0 if the terminator is reached first
; (so searching for the null byte itself gives 0).
function strchr
argument str
argument c
local cursor
local ch
cursor = str
:strchr_loop
ch = *1cursor
if ch == 0 goto return_0
if ch == c goto strchr_found
cursor += 1
goto strchr_loop
:strchr_found
return cursor
; copy from *p_src to *p_dest until terminator is reached, setting both to point to their respective terminators ; copy from *p_src to *p_dest until terminator is reached, setting both to point to their respective terminators
function memccpy_advance function memccpy_advance
argument p_dest argument p_dest
@ -362,6 +399,17 @@ function isalnum_or_underscore
if c <= 'z goto return_1 if c <= 'z goto return_1
goto return_0 goto return_0
; can the character c begin a number in C?
; i.e. is it a decimal digit (0-9) or a dot (as in .5)
; jumps to return_1 if so, and to return_0 otherwise.
function isdigit_or_dot
argument c
if c == '. goto return_1
if c < '0 goto return_0
if c > '9 goto return_0
goto return_1
function exit function exit
argument status_code argument status_code
syscall(0x3c, status_code) syscall(0x3c, status_code)