preprocessing tokens

This commit is contained in:
pommicket 2022-01-08 12:15:17 -05:00
parent d48816e226
commit ac6cb985db
5 changed files with 452 additions and 28 deletions

View file

@ -30,3 +30,79 @@
#define KEYWORD_IF 130 #define KEYWORD_IF 130
#define KEYWORD_STATIC 131 #define KEYWORD_STATIC 131
#define KEYWORD_WHILE 132 #define KEYWORD_WHILE 132
; String constants for the C comment delimiters and the multi-character
; operators/punctuators. Each entry is a label, the text itself, and an
; explicit zero byte that terminates the string.
:str_comment_start
string /*
byte 0
:str_comment_end
string */
byte 0
:str_lshift_eq
string <<=
byte 0
:str_rshift_eq
string >>=
byte 0
:str_eq_eq
string ==
byte 0
:str_not_eq
string !=
byte 0
:str_gt_eq
string >=
byte 0
:str_lt_eq
string <=
byte 0
:str_plus_plus
string ++
byte 0
:str_minus_minus
string --
byte 0
:str_plus_eq
string +=
byte 0
:str_minus_eq
string -=
byte 0
:str_times_eq
string *=
byte 0
:str_div_eq
string /=
byte 0
:str_remainder_eq
string %=
byte 0
:str_and_eq
string &=
byte 0
:str_or_eq
string |=
byte 0
:str_xor_eq
string ^=
byte 0
:str_and_and
string &&
byte 0
:str_or_or
string ||
byte 0
:str_lshift
string <<
byte 0
:str_rshift
string >>
byte 0
:str_arrow
string ->
byte 0
:str_dotdotdot
string ...
byte 0
:str_hash_hash
string ##
byte 0

View file

@ -8,6 +8,23 @@ byte 0
byte 0 byte 0
goto main goto main
; Report a compile error and terminate the program.
; Writes to file descriptor 2 (standard error):
;    <file>:<line>: Error: <message>
; followed by a newline, then calls exit(1).
;   file    - written with fputs (presumably a null-terminated file name)
;   line    - written with fputn (presumably printed as a decimal number -- TODO confirm)
;   message - written with fputs (presumably a null-terminated string)
function compile_error
argument file
argument line
argument message
fputs(2, file)
fputc(2, ':)
fputn(2, line)
fputs(2, .str_error)
fputs(2, message)
; 10 = newline
fputc(2, 10)
; NOTE(review): there is no return after this call, so this relies on exit
; never returning; otherwise execution would run into the string data below.
exit(1)
; the text ": Error: " -- the trailing space (32) is given as an explicit byte,
; followed by the terminating zero byte
:str_error
string : Error:
byte 32
byte 0
#include util.b #include util.b
#include constants.b #include constants.b
#include preprocess.b #include preprocess.b
@ -19,6 +36,7 @@ function main
argument argc argument argc
local input_filename local input_filename
local output_filename local output_filename
local pptokens
input_filename = .str_default_input_filename input_filename = .str_default_input_filename
output_filename = .str_default_output_filename output_filename = .str_default_output_filename
@ -27,7 +45,8 @@ function main
input_filename = argv1 input_filename = argv1
output_filename = argv2 output_filename = argv2
:have_filenames :have_filenames
split_into_preprocessing_tokens(input_filename) pptokens = split_into_preprocessing_tokens(input_filename)
print_pptokens(pptokens)
exit(0) exit(0)
:usage_error :usage_error

View file

@ -1,6 +1,14 @@
test\ #include <stdio.h>
ing/*
I am */testing int test(int, double, ...);\
that this is working /* here is a nice
hello \ comment it is
there. here
*/
int main(void) {
printf("\"Hello, world!%c\n\"", '\'');
_X55 = Y4_C_;
a.b = c;
5 + (.3e+5+6) & 0xff | 93 -~5;
return 0;
}

View file

@ -6,10 +6,12 @@ function split_into_preprocessing_tokens
local file_contents local file_contents
local pptokens local pptokens
local p local p
local b
local c local c
local in local in
local out local out
local n local n
local line_number
fd = open_r(filename) fd = open_r(filename)
file_contents = malloc(2000000) file_contents = malloc(2000000)
@ -19,6 +21,7 @@ function split_into_preprocessing_tokens
n = syscall(0, fd, p, 4096) n = syscall(0, fd, p, 4096)
if n == 0 goto pptokens_read_loop_end if n == 0 goto pptokens_read_loop_end
p += n p += n
goto pptokens_read_loop
:pptokens_read_loop_end :pptokens_read_loop_end
; okay we read the file. first, delete every backslash-newline sequence (phase 2) ; okay we read the file. first, delete every backslash-newline sequence (phase 2)
@ -56,20 +59,304 @@ function split_into_preprocessing_tokens
:backslashnewline_loop_end :backslashnewline_loop_end
*1out = 0 *1out = 0
; split file into preprocessing tokens, remove comments (phase 3)
; we're still doing the trick with newlines, this time for ones inside comments
; this is needed because the following is legal C:
; #include/*
; */<stdio.h>
; and is not equivalent to:
; #include
; <stdio.h>
newlines = 1
in = file_contents in = file_contents
out = pptokens
fputs(1, file_contents) line_number = 1
:pptokens_loop
c = *1in
if c == 10 goto pptokens_newline_loop
if c == 0 goto pptokens_loop_end
if c == 32 goto pptoken_space
if c == 9 goto pptoken_space
b = isdigit(c)
if b != 0 goto pptoken_number
b = isalpha_or_underscore(c)
if b != 0 goto pptoken_identifier
b = str_startswith(in, .str_comment_start)
if b != 0 goto pptoken_comment
; now we check for all the various operators and symbols in C
if c == 59 goto pptoken_single_character ; semicolon
if c == '( goto pptoken_single_character
if c == ') goto pptoken_single_character
if c == '[ goto pptoken_single_character
if c == '] goto pptoken_single_character
if c == '{ goto pptoken_single_character
if c == '} goto pptoken_single_character
if c == ', goto pptoken_single_character
if c == '~ goto pptoken_single_character
if c == '? goto pptoken_single_character
if c == ': goto pptoken_single_character
if c == '" goto pptoken_string_or_char_literal
if c == '' goto pptoken_string_or_char_literal
b = str_startswith(in, .str_lshift_eq)
if b != 0 goto pptoken_3_chars
b = str_startswith(in, .str_rshift_eq)
if b != 0 goto pptoken_3_chars
b = str_startswith(in, .str_eq_eq)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_not_eq)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_gt_eq)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_lt_eq)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_plus_plus)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_minus_minus)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_plus_eq)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_minus_eq)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_times_eq)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_div_eq)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_remainder_eq)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_and_eq)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_or_eq)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_xor_eq)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_and_and)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_or_or)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_lshift)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_rshift)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_arrow)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_dotdotdot)
if b != 0 goto pptoken_3_chars
b = str_startswith(in, .str_hash_hash)
if b != 0 goto pptoken_2_chars
if c == '+ goto pptoken_single_character
if c == '- goto pptoken_single_character
if c == '* goto pptoken_single_character
if c == '/ goto pptoken_single_character
if c == '% goto pptoken_single_character
if c == '& goto pptoken_single_character
if c == '| goto pptoken_single_character
if c == '^ goto pptoken_single_character
if c == '> goto pptoken_single_character
if c == '< goto pptoken_single_character
if c == '! goto pptoken_single_character
if c == '= goto pptoken_single_character
if c == '# goto pptoken_single_character
if c == '. goto pptoken_dot
goto bad_pptoken
:pptoken_comment
; A comment starts at in (the main loop has already matched str_comment_start here).
; emit a space ("Each comment is replaced by one space character.")
*1out = 32
out += 1
*1out = 0
out += 1
; Skip the opening /* before looking for the terminator. Without this, the
; * of the opener could pair with a following / so that the three characters
; /*/ would wrongly be treated as a complete comment.
in += 2
; skip over comment
:pptoken_comment_loop
b = str_startswith(in, .str_comment_end)
if b != 0 goto pptoken_comment_loop_end
c = *1in
in += 1
; reaching the terminating zero byte means the comment was never closed
if c == 0 goto unterminated_comment
if c == 10 goto pptoken_comment_newline
goto pptoken_comment_loop
:pptoken_comment_loop_end
in += 2 ; skip */
goto pptokens_loop
:pptoken_comment_newline
; keep line numbers correct: remember this newline so that it is emitted
; together with the next newline outside the comment (see pptokens_newline_loop)
newlines += 1
goto pptoken_comment_loop
:pptoken_dot
; could just be a . or could be .3 -- we need to check if *(in+1) is a digit
p = in + 1
b = isdigit(*1p)
if b != 0 goto pptoken_number
; okay it's just a dot
goto pptoken_single_character
:pptoken_string_or_char_literal
local delimiter
local backslash
delimiter = c
backslash = 0
*1out = c
out += 1
in += 1
:pptoken_strchar_loop
c = *1in
*1out = c
in += 1
out += 1
if c == '\ goto pptoken_strchar_backslash
if c == 10 goto unterminated_string
if c == 0 goto unterminated_string
b = backslash
backslash = 0
if b == 1 goto pptoken_strchar_loop ; string can't end with an odd number of backslashes
if c == delimiter goto pptoken_strchar_loop_end
goto pptoken_strchar_loop
:pptoken_strchar_backslash
backslash ^= 1
goto pptoken_strchar_loop
:pptoken_strchar_loop_end
*1out = 0
out += 1
goto pptokens_loop
:pptoken_number
c = *1in
b = is_ppnumber_char(c)
if b == 0 goto pptoken_number_end
*1out = c
out += 1
in += 1
if c == 'e goto pptoken_number_e
if c == 'E goto pptoken_number_e
goto pptoken_number
:pptoken_number_e
c = *1in
if c == '+ goto pptoken_number_sign
if c == '- goto pptoken_number_sign
goto pptoken_number
:pptoken_number_sign
; special code to handle + - immediately following e
*1out = c
in += 1
out += 1
goto pptoken_number
:pptoken_number_end
*1out = 0
out += 1
goto pptokens_loop
:pptoken_identifier
c = *1in
b = isalnum_or_underscore(c)
if b == 0 goto pptoken_identifier_end
*1out = c
in += 1
out += 1
goto pptoken_identifier
:pptoken_identifier_end
*1out = 0
out += 1
goto pptokens_loop
:pptoken_space
; space character token
*1out = 32
in += 1
out += 1
*1out = 0
out += 1
goto pptokens_loop
:pptoken_single_character
; a single character preprocessing token, like {?}
*1out = c
in += 1
out += 1
*1out = 0
out += 1
goto pptokens_loop
:pptoken_2_chars
; two-character pptoken (e.g. ##)
*1out = c
in += 1
out += 1
*1out = *1in
in += 1
out += 1
*1out = 0
out += 1
goto pptokens_loop
:pptoken_3_chars
; three-character pptoken (e.g. >>=)
*1out = c
in += 1
out += 1
*1out = *1in
in += 1
out += 1
*1out = *1in
in += 1
out += 1
*1out = 0
out += 1
goto pptokens_loop
:pptokens_newline_loop
if newlines == 0 goto pptokens_newline_loop_end
; output a newline
*1out = 10
out += 1
*1out = 0
out += 1
line_number += 1
newlines -= 1
goto pptokens_newline_loop
:pptokens_newline_loop_end
newlines = 1
in += 1
goto pptokens_loop
:pptokens_loop_end
free(file_contents) free(file_contents)
close(fd) close(fd)
return return pptokens
:unterminated_comment :unterminated_comment
fputs(2, .str_unterminated_comment) compile_error(filename, line_number, .str_unterminated_comment)
fputs(2, filename)
fputc(2, 10)
exit(1)
:str_unterminated_comment :str_unterminated_comment
string Unterminated comment in file string Unterminated comment.
byte 32
byte 0 byte 0
:unterminated_string
compile_error(filename, line_number, .str_unterminated_string)
:str_unterminated_string
string Unterminated string or character literal.
byte 0
:bad_pptoken
compile_error(filename, line_number, .str_bad_pptoken)
:str_bad_pptoken
string Bad preprocessing token.
byte 0
; can the given character appear in a C89 ppnumber?
; Accepts '.', the digits 0-9, the letters A-Z and a-z, and '_'.
; A sign following e/E is not handled here; the tokenizer's number loop
; deals with that case itself.
; The result is produced by jumping to return_1 / return_0, which are defined
; outside this block (presumably they return 1 and 0 respectively).
function is_ppnumber_char
argument c
if c == '. goto return_1
; anything else below '0 is rejected
if c < '0 goto return_0
if c <= '9 goto return_1
; characters between '9 and 'A are rejected
if c < 'A goto return_0
if c <= 'Z goto return_1
; '_ lies between 'Z and 'a in ASCII, so it is checked before the lowercase range
if c == '_ goto return_1
if c < 'a goto return_0
if c <= 'z goto return_1
; everything above 'z
goto return_0
; Print a list of preprocessing tokens, each wrapped in braces: {token}
; pptokens points to a sequence of null-terminated strings stored one after
; another; the walk stops at the first empty string. A newline is printed
; after the last token.
; (putc/puts are defined elsewhere; presumably they write to standard output.)
function print_pptokens
argument pptokens
local p
p = pptokens
:print_pptokens_loop
; an empty token (first byte is 0) marks the end of the list
if *1p == 0 goto print_pptokens_loop_end
putc('{)
puts(p)
putc('})
; advance past this token's text and then past its terminating zero byte
p += strlen(p)
p += 1
goto print_pptokens_loop
:print_pptokens_loop_end
; 10 = newline
putc(10)
return

View file

@ -82,11 +82,9 @@ function memchr
argument mem argument mem
argument c argument c
local p local p
local a
p = mem p = mem
:memchr_loop :memchr_loop
a = *1p if *1p == c goto memchr_loop_end
if a == c goto memchr_loop_end
p += 1 p += 1
goto memchr_loop goto memchr_loop
:memchr_loop_end :memchr_loop_end
@ -94,12 +92,10 @@ function memchr
function strlen function strlen
argument s argument s
local c
local p local p
p = s p = s
:strlen_loop :strlen_loop
c = *1p if *1p == 0 goto strlen_loop_end
if c == 0 goto strlen_loop_end
p += 1 p += 1
goto strlen_loop goto strlen_loop
:strlen_loop_end :strlen_loop_end
@ -165,9 +161,7 @@ function fputn
function fputc function fputc
argument fd argument fd
argument c argument c
local p syscall(1, fd, &c, 1)
p = &c
syscall(1, fd, p, 1)
return return
function putc function putc
@ -179,10 +173,8 @@ function putc
function fgetc function fgetc
argument fd argument fd
local c local c
local p
c = 0 c = 0
p = &c syscall(0, fd, &c, 1)
syscall(0, fd, p, 1)
return c return c
; read a line from fd as a null-terminated string ; read a line from fd as a null-terminated string
@ -251,6 +243,48 @@ function isupper
if c <= 'Z goto return_1 if c <= 'Z goto return_1
goto return_0 goto return_0
; is c one of the lowercase letters a-z?
; jumps to return_1 if so, return_0 otherwise (labels defined outside this
; block; presumably they return 1 and 0)
function islower
argument c
if c < 'a goto return_0
if c <= 'z goto return_1
goto return_0
; is c one of the decimal digits 0-9?
; jumps to return_1 if so, return_0 otherwise (labels defined outside this
; block; presumably they return 1 and 0)
function isdigit
argument c
if c < '0 goto return_0
if c <= '9 goto return_1
goto return_0
; is c a letter (A-Z or a-z)?
; jumps to return_1 if so, return_0 otherwise (labels defined outside this
; block; presumably they return 1 and 0)
function isalpha
argument c
if c < 'A goto return_0
if c <= 'Z goto return_1
; the characters between 'Z and 'a are not letters
if c < 'a goto return_0
if c <= 'z goto return_1
goto return_0
; characters which can start identifiers in C
; (the letters A-Z and a-z, and the underscore)
; jumps to return_1 for such a character, return_0 otherwise (labels defined
; outside this block; presumably they return 1 and 0)
function isalpha_or_underscore
argument c
if c < 'A goto return_0
if c <= 'Z goto return_1
; '_ lies between 'Z and 'a in ASCII, so it is checked before the lowercase range
if c == '_ goto return_1
if c < 'a goto return_0
if c <= 'z goto return_1
goto return_0
; characters which can appear in identifiers in C
; (the digits 0-9, the letters A-Z and a-z, and the underscore)
; jumps to return_1 for such a character, return_0 otherwise (labels defined
; outside this block; presumably they return 1 and 0)
function isalnum_or_underscore
argument c
if c < '0 goto return_0
if c <= '9 goto return_1
; the characters between '9 and 'A are rejected
if c < 'A goto return_0
if c <= 'Z goto return_1
; '_ lies between 'Z and 'a in ASCII, so it is checked before the lowercase range
if c == '_ goto return_1
if c < 'a goto return_0
if c <= 'z goto return_1
goto return_0
function exit function exit
argument status_code argument status_code
syscall(0x3c, status_code) syscall(0x3c, status_code)