lang-bootstrap/05/preprocess.b

516 lines
13 KiB
Brainfuck
Raw Normal View History

2022-01-07 23:32:27 -05:00
; returns a string of null character-separated preprocessing tokens
; this corresponds to translation phases 1-3 in the C89 standard
function split_into_preprocessing_tokens
argument filename
local fd
local file_contents
local pptokens
local p
2022-01-08 12:15:17 -05:00
local b
2022-01-07 23:32:27 -05:00
local c
local in
local out
local n
2022-01-08 12:15:17 -05:00
local line_number
2022-01-07 23:32:27 -05:00
fd = open_r(filename)
file_contents = malloc(2000000)
pptokens = malloc(2000000)
p = file_contents
:pptokens_read_loop
n = syscall(0, fd, p, 4096)
if n == 0 goto pptokens_read_loop_end
p += n
2022-01-08 12:15:17 -05:00
goto pptokens_read_loop
2022-01-07 23:32:27 -05:00
:pptokens_read_loop_end
; okay we read the file. first, delete every backslash-newline sequence (phase 2)
local newlines ; we add more newlines to keep line numbers right
newlines = 1
in = file_contents
out = file_contents
:backslashnewline_loop
c = *1in
if c == 0 goto backslashnewline_loop_end
if c == 10 goto proper_newline_loop
if c != '\ goto not_backslashnewline
p = in + 1
c = *1p
if c != 10 goto not_backslashnewline
in += 2 ; skip backlash and newline
newlines += 1 ; add one additional newline the next time around to compensate
goto backslashnewline_loop
:not_backslashnewline
*1out = *1in
out += 1
in += 1
goto backslashnewline_loop
:proper_newline_loop
if newlines == 0 goto proper_newline_loop_end
; output a newline
*1out = 10
out += 1
newlines -= 1
goto proper_newline_loop
:proper_newline_loop_end
newlines = 1
in += 1
goto backslashnewline_loop
:backslashnewline_loop_end
*1out = 0
2022-01-08 12:15:17 -05:00
; split file into preprocessing tokens, remove comments (phase 3)
; we're still doing the trick with newlines, this time for ones inside comments
; this is needed because the following is legal C:
; #include/*
; */<stdio.h>
; and is not equivalent to:
; #include
; <stdio.h>
newlines = 1
2022-01-07 23:32:27 -05:00
in = file_contents
2022-01-08 12:15:17 -05:00
out = pptokens
line_number = 1
:pptokens_loop
c = *1in
if c == 10 goto pptokens_newline_loop
if c == 0 goto pptokens_loop_end
if c == 32 goto pptoken_space
if c == 9 goto pptoken_space
b = isdigit(c)
if b != 0 goto pptoken_number
b = isalpha_or_underscore(c)
if b != 0 goto pptoken_identifier
b = str_startswith(in, .str_comment_start)
if b != 0 goto pptoken_comment
; now we check for all the various operators and symbols in C
if c == 59 goto pptoken_single_character ; semicolon
if c == '( goto pptoken_single_character
if c == ') goto pptoken_single_character
if c == '[ goto pptoken_single_character
if c == '] goto pptoken_single_character
if c == '{ goto pptoken_single_character
if c == '} goto pptoken_single_character
if c == ', goto pptoken_single_character
if c == '~ goto pptoken_single_character
if c == '? goto pptoken_single_character
if c == ': goto pptoken_single_character
if c == '" goto pptoken_string_or_char_literal
if c == '' goto pptoken_string_or_char_literal
b = str_startswith(in, .str_lshift_eq)
if b != 0 goto pptoken_3_chars
b = str_startswith(in, .str_rshift_eq)
if b != 0 goto pptoken_3_chars
b = str_startswith(in, .str_eq_eq)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_not_eq)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_gt_eq)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_lt_eq)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_plus_plus)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_minus_minus)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_plus_eq)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_minus_eq)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_times_eq)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_div_eq)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_remainder_eq)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_and_eq)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_or_eq)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_xor_eq)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_and_and)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_or_or)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_lshift)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_rshift)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_arrow)
if b != 0 goto pptoken_2_chars
b = str_startswith(in, .str_dotdotdot)
if b != 0 goto pptoken_3_chars
b = str_startswith(in, .str_hash_hash)
if b != 0 goto pptoken_2_chars
if c == '+ goto pptoken_single_character
if c == '- goto pptoken_single_character
if c == '* goto pptoken_single_character
if c == '/ goto pptoken_single_character
if c == '% goto pptoken_single_character
if c == '& goto pptoken_single_character
if c == '| goto pptoken_single_character
if c == '^ goto pptoken_single_character
if c == '> goto pptoken_single_character
if c == '< goto pptoken_single_character
if c == '! goto pptoken_single_character
if c == '= goto pptoken_single_character
if c == '# goto pptoken_single_character
if c == '. goto pptoken_dot
goto bad_pptoken
:pptoken_comment
; emit a space ("Each comment is replaced by one space character.")
*1out = 32
out += 1
*1out = 0
out += 1
; skip over comment
:pptoken_comment_loop
b = str_startswith(in, .str_comment_end)
if b != 0 goto pptoken_comment_loop_end
c = *1in
in += 1
if c == 0 goto unterminated_comment
if c == 10 goto pptoken_comment_newline
goto pptoken_comment_loop
:pptoken_comment_loop_end
in += 2 ; skip */
goto pptokens_loop
:pptoken_comment_newline
; keep line numbers correct
newlines += 1
goto pptoken_comment_loop
:pptoken_dot
; could just be a . or could be .3 -- we need to check if *(in+1) is a digit
p = in + 1
b = isdigit(*1p)
if b != 0 goto pptoken_number
; okay it's just a dot
goto pptoken_single_character
:pptoken_string_or_char_literal
local delimiter
local backslash
delimiter = c
backslash = 0
*1out = c
out += 1
in += 1
:pptoken_strchar_loop
c = *1in
*1out = c
in += 1
out += 1
if c == '\ goto pptoken_strchar_backslash
if c == 10 goto unterminated_string
if c == 0 goto unterminated_string
b = backslash
backslash = 0
if b == 1 goto pptoken_strchar_loop ; string can't end with an odd number of backslashes
if c == delimiter goto pptoken_strchar_loop_end
goto pptoken_strchar_loop
:pptoken_strchar_backslash
backslash ^= 1
goto pptoken_strchar_loop
:pptoken_strchar_loop_end
*1out = 0
out += 1
goto pptokens_loop
:pptoken_number
c = *1in
b = is_ppnumber_char(c)
if b == 0 goto pptoken_number_end
*1out = c
out += 1
in += 1
if c == 'e goto pptoken_number_e
if c == 'E goto pptoken_number_e
goto pptoken_number
:pptoken_number_e
c = *1in
if c == '+ goto pptoken_number_sign
if c == '- goto pptoken_number_sign
goto pptoken_number
:pptoken_number_sign
; special code to handle + - immediately following e
*1out = c
in += 1
out += 1
goto pptoken_number
:pptoken_number_end
*1out = 0
out += 1
goto pptokens_loop
:pptoken_identifier
c = *1in
b = isalnum_or_underscore(c)
if b == 0 goto pptoken_identifier_end
*1out = c
in += 1
out += 1
goto pptoken_identifier
:pptoken_identifier_end
*1out = 0
out += 1
goto pptokens_loop
:pptoken_space
; space character token
*1out = 32
in += 1
out += 1
*1out = 0
out += 1
goto pptokens_loop
:pptoken_single_character
; a single character preprocessing token, like {?}
*1out = c
in += 1
out += 1
*1out = 0
out += 1
goto pptokens_loop
:pptoken_2_chars
; two-character pptoken (e.g. ##)
*1out = c
in += 1
out += 1
*1out = *1in
in += 1
out += 1
*1out = 0
out += 1
goto pptokens_loop
:pptoken_3_chars
; three-character pptoken (e.g. >>=)
*1out = c
in += 1
out += 1
*1out = *1in
in += 1
out += 1
*1out = *1in
in += 1
out += 1
*1out = 0
out += 1
goto pptokens_loop
:pptokens_newline_loop
if newlines == 0 goto pptokens_newline_loop_end
; output a newline
*1out = 10
out += 1
*1out = 0
out += 1
line_number += 1
newlines -= 1
goto pptokens_newline_loop
:pptokens_newline_loop_end
newlines = 1
in += 1
goto pptokens_loop
:pptokens_loop_end
2022-01-07 23:32:27 -05:00
free(file_contents)
close(fd)
2022-01-08 12:15:17 -05:00
return pptokens
2022-01-07 23:32:27 -05:00
:unterminated_comment
2022-01-08 12:15:17 -05:00
compile_error(filename, line_number, .str_unterminated_comment)
2022-01-07 23:32:27 -05:00
:str_unterminated_comment
2022-01-08 12:15:17 -05:00
string Unterminated comment.
byte 0
:unterminated_string
compile_error(filename, line_number, .str_unterminated_string)
:str_unterminated_string
string Unterminated string or character literal.
byte 0
:bad_pptoken
compile_error(filename, line_number, .str_bad_pptoken)
:str_bad_pptoken
string Bad preprocessing token.
2022-01-07 23:32:27 -05:00
byte 0
2022-01-08 12:15:17 -05:00
; can the given character appear in a C89 ppnumber?
function is_ppnumber_char
argument c
if c == '. goto return_1
if c < '0 goto return_0
if c <= '9 goto return_1
if c < 'A goto return_0
if c <= 'Z goto return_1
if c == '_ goto return_1
if c < 'a goto return_0
if c <= 'z goto return_1
goto return_0
function print_pptokens
argument pptokens
local p
p = pptokens
:print_pptokens_loop
if *1p == 0 goto print_pptokens_loop_end
putc('{)
puts(p)
putc('})
p += strlen(p)
p += 1
goto print_pptokens_loop
:print_pptokens_loop_end
putc(10)
return
2022-01-08 14:37:39 -05:00
function pptoken_copy_and_advance
argument p_in
argument p_out
local in
local out
in = *8p_in
out = *8p_out
out = strcpy(out, in)
in = memchr(in, 0)
*8p_in = in + 1
*8p_out = out + 1
return
function pptoken_skip
argument p_in
local in
in = *8p_in
in = memchr(in, 0)
*8p_in = in + 1
return
; skip any space tokens here
function pptoken_skip_spaces
argument p_in
local in
in = *8p_in
:pptoken_skip_spaces_loop
if *1in != 32 goto pptoken_skip_spaces_loop_end
pptoken_skip(&in)
goto pptoken_skip_spaces_loop
:pptoken_skip_spaces_loop_end
*8p_in = in
return
; phase 4:
; Preprocessing directives are executed and macro invocations are expanded.
; A #include preprocessing directive causes the named header or source file to be processed from phase 1 through phase 4, recursively.
function translation_phase_4
argument filename
argument input
local output
local in
local out
local p
local c
local b
local line_number
output = malloc(16000000)
out = output
in = input
line_number = 0
:phase4_line
line_number += 1
c = *1in
if c == 0 goto phase4_end
if c == '# goto pp_directive ; NOTE: ## cannot appear at the start of a line
:process_pptoken
c = *1in
if c == 10 goto phase4_next_line
b = isdigit(c)
if b != 0 goto phase4_next_pptoken
:phase4_next_pptoken
pptoken_copy_and_advance(&in, &out)
goto process_pptoken
:phase4_next_line
pptoken_copy_and_advance(&in, &out)
goto phase4_line
:pp_directive
pptoken_skip(&in)
c = *1in
if c == 10 goto phase4_next_line ; "null directive" C89 § 3.8.7
b = str_equals(in, .str_error)
if b != 0 goto pp_directive_error
b = str_equals(in, .str_define)
if b != 0 goto pp_directive_define
goto unrecognized_directive
:pp_directive_error
fputs(2, filename)
fputc(2, ':)
fputn(2, line_number)
fputs(2, .str_directive_error)
exit(1)
:pp_directive_define
local macro_name
pptoken_skip(&in)
pptoken_skip_spaces(&in)
macro_name = in
pptoken_skip(&in)
c = *1in
if c == '( goto function_macro_definition
; it's an object-like macro, e.g. #define X 47
p = object_macros + object_macros_size
; copy name
p = strcpy(p, macro_name)
p += 1
; copy contents
memccpy_advance(&p, &in, 10) ; copy until newline
*1p = 255 ; replace newline with special "macro end" character
p += 1
object_macros_size = p - object_macros
in += 2 ; skip newline and following null character
goto phase4_line
:function_macro_definition
; a function-like macro, e.g. #define JOIN(a,b) a##b
byte 0xcc
:str_directive_error
string : #error
byte 10
byte 0
:phase4_end
return output
:unrecognized_directive
compile_error(filename, line_number, .str_unrecognized_directive)
:str_unrecognized_directive
string Unrecognized preprocessor directive.
byte 0
function look_up_object_macro
argument name
local p
p = object_macros
:objmacro_lookup_loop
@TODO
function print_object_macros
local p
local c
p = object_macros
:print_objmacros_loop
if *1p == 0 goto return_0 ; done!
puts(p)
putc(':)
putc(32)
p = memchr(p, 0)
p += 1
:print_objreplacement_loop
putc('{)
puts(p)
putc('})
p = memchr(p, 0)
p += 1
c = *1p
if c == 255 goto print_objreplacement_loop_end
goto print_objreplacement_loop
:print_objreplacement_loop_end
p += 1
fputc(1, 10)
goto print_objmacros_loop