simple expressions

This commit is contained in:
pommicket 2022-01-13 16:12:28 -05:00
parent e43f32b932
commit 13363eac1a
6 changed files with 470 additions and 25 deletions

View file

@ -77,6 +77,7 @@
#define TOKEN_CONSTANT_INT 3
#define TOKEN_CONSTANT_CHAR 4
#define TOKEN_STRING_LITERAL 5
#define TOKEN_EOF 6
; these are stored in the "info" field of the token
#define NUMBER_NO_SUFFIX 0
@ -118,6 +119,104 @@
#define KEYWORD_STATIC 51
#define KEYWORD_WHILE 52
; the format of expression headers is:
; uchar kind (one of the constants below)
; uchar info
; ushort (padding)
; uint type (0 if expression hasn't been typed yet)
; immediately following the header in memory are the arguments of the expression
; - for constant ints, the 64-bit integral value
; - for constant floats, the 64-bit double value (even if expression has type float)
; - for string literals, a 64-bit pointer to the string (for the executable, not for the compiler)
; - for unary operators, the operand
; - for binary operators, the first operand followed by the second
; - for the operators . and ->, the first operand is an expression and the second is just a pointer to the name of the member
; - for the ternary operator ? :, the first followed by the second followed by the third
; - for function calls, the function, followed by each of the arguments to the function — info indicates the number of arguments
; Note that file/line number are not stored in expressions.
#define EXPRESSION_IDENTIFIER 200
#define EXPRESSION_CONSTANT_INT 201
#define EXPRESSION_CONSTANT_FLOAT 202
#define EXPRESSION_STRING_LITERAL 203
#define EXPRESSION_SUBSCRIPT 204
#define EXPRESSION_CALL 205
#define EXPRESSION_DOT 206
#define EXPRESSION_ARROW 207
#define EXPRESSION_POST_INCREMENT 208
#define EXPRESSION_POST_DECREMENT 209
#define EXPRESSION_PRE_INCREMENT 210
#define EXPRESSION_PRE_DECREMENT 211
#define EXPRESSION_ADDRESS_OF 212
#define EXPRESSION_DEREFERENCE 213
; this matters for promotion. if x is a char, sizeof(+x) should be sizeof(int)
#define EXPRESSION_UNARY_PLUS 214
#define EXPRESSION_UNARY_MINUS 215
#define EXPRESSION_BITWISE_NOT 216
#define EXPRESSION_NOT 217
#define EXPRESSION_SIZEOF 218
#define EXPRESSION_CAST 219
#define EXPRESSION_MUL 220
#define EXPRESSION_DIV 221
#define EXPRESSION_REMAINDER 222
#define EXPRESSION_ADD 223
#define EXPRESSION_SUB 224
#define EXPRESSION_LSHIFT 225
#define EXPRESSION_RSHIFT 226
#define EXPRESSION_LT 227
#define EXPRESSION_GT 228
#define EXPRESSION_LEQ 229
#define EXPRESSION_GEQ 230
#define EXPRESSION_EQ 231
#define EXPRESSION_NEQ 232
#define EXPRESSION_BITWISE_AND 233
#define EXPRESSION_BITWISE_XOR 234
#define EXPRESSION_BITWISE_OR 235
#define EXPRESSION_AND 236
#define EXPRESSION_OR 237
; e.g. x == 5 ? 6 : 7
#define EXPRESSION_CONDITIONAL 238
#define EXPRESSION_ASSIGN 239
#define EXPRESSION_ASSIGN_ADD 240
#define EXPRESSION_ASSIGN_SUB 241
#define EXPRESSION_ASSIGN_MUL 242
#define EXPRESSION_ASSIGN_DIV 243
#define EXPRESSION_ASSIGN_REMAINDER 244
#define EXPRESSION_ASSIGN_LSHIFT 245
#define EXPRESSION_ASSIGN_RSHIFT 246
#define EXPRESSION_ASSIGN_AND 247
#define EXPRESSION_ASSIGN_XOR 248
#define EXPRESSION_ASSIGN_OR 249
#define EXPRESSION_COMMA 250
; TYPES: A type is a 4-byte index into the global array `types`. Byte 0 in `types`
; is reserved, and bytes 1-16 contain the values 1-16. Thus TYPE_INT, etc.
; can be used as types directly.
; The format of each type is as follows:
; char, unsigned char, etc.: TYPE_CHAR, TYPE_UNSIGNED_CHAR, etc. as a single byte
; pointer to type t: TYPE_PTR t
; array of n t's: TYPE_ARRAY {n as 8 bytes} t
; struct/union: TYPE_STRUCT/TYPE_UNION {0 for incomplete types/4-byte pointer to struct/union}
; NOTE: we just treat function pointers as pointers to the function return type.
#define TYPE_VOID 1
#define TYPE_CHAR 3
#define TYPE_UNSIGNED_CHAR 4
#define TYPE_SHORT 5
#define TYPE_UNSIGNED_SHORT 6
#define TYPE_INT 7
#define TYPE_UNSIGNED_INT 8
#define TYPE_LONG 9
#define TYPE_UNSIGNED_LONG 10
#define TYPE_FLOAT 11
; note that long double is treated the same as double.
#define TYPE_DOUBLE 12
#define TYPE_POINTER 13
#define TYPE_STRUCT 14
#define TYPE_UNION 15
#define TYPE_ARRAY 16
; types will be initialized (in main) so that this refers to the type char*
#define TYPE_POINTER_TO_CHAR 20
:keyword_table
byte SYMBOL_SEMICOLON
byte 59
@ -538,3 +637,42 @@
:str___STDC__
string __STDC__
byte 0
:str_void
string void
byte 0
:str_char
string char
byte 0
:str_unsigned_char
string unsigned char
byte 0
:str_short
string short
byte 0
:str_unsigned_short
string unsigned short
byte 0
:str_int
string int
byte 0
:str_unsigned_int
string unsigned int
byte 0
:str_long
string long
byte 0
:str_unsigned_long
string unsigned long
byte 0
:str_float
string float
byte 0
:str_double
string double
byte 0
:str_struct
string struct
byte 0
:str_union
string union
byte 0

View file

@ -41,6 +41,18 @@ function compile_error
fputc(2, 10)
exit(1)
; Report a compile error at the source location recorded in a token.
;   token   - pointer to a token; the file is read from the 2 bytes at offset 2
;             and the line number from the 4 bytes at offset 4
;   message - error message, passed through unchanged to compile_error
; Control is handed to compile_error, which finishes by calling exit(1).
function token_error
argument token
argument message
local field
local src_file
local src_line
; line number: 4 bytes at offset 4
field = token + 4
src_line = *4field
; file: 2 bytes at offset 2
field = token + 2
src_file = *2field
compile_error(src_file, src_line, message)
; accepts EITHER file index OR pointer to filename
function compile_warning
argument file
@ -71,10 +83,15 @@ function compile_warning
; 10^i = significand * 2^exponent
global powers_of_10
global types
global types_end
#include util.b
#include constants.b
#include preprocess.b
#include tokenize.b
#include parse.b
function main
argument argv2
@ -86,7 +103,9 @@ function main
local pptokens
local processed_pptokens
local tokens
local ast
local p
local i
fill_in_powers_of_10()
dat_banned_objmacros = 255
@ -97,6 +116,23 @@ function main
object_macros = malloc(4000000)
function_macros = malloc(4000000)
types = malloc(16000000)
i = 0
p = types
:fill_initial_types_loop
*1p = i
p += 1
i += 1
if i <= 16 goto fill_initial_types_loop
p = types + TYPE_POINTER_TO_CHAR
*1p = TYPE_POINTER
p += 1
*1p = TYPE_CHAR
types_end = p
input_filename = .str_default_input_filename
output_filename = .str_default_output_filename
if argc == 1 goto have_filenames
@ -104,6 +140,9 @@ function main
input_filename = argv1
output_filename = argv2
:have_filenames
output_fd = open_w(output_filename)
rodata_end_offset = RODATA_OFFSET
pptokens = split_into_preprocessing_tokens(input_filename)
;print_pptokens(pptokens)
;print_separator()
@ -116,14 +155,17 @@ function main
;print_object_macros()
;print_function_macros()
output_fd = open_w(output_filename)
rodata_end_offset = RODATA_OFFSET
tokens = malloc(16000000)
tokenize(pptokens, tokens)
p = tokenize(pptokens, tokens, input_filename, 1)
print_tokens(tokens)
; NOTE: do NOT free pptokens as identifiers still reference them.
ast = malloc(56000000)
p -= 16
parse_expression(tokens, p, ast)
print_expression(ast)
putc(10)
exit(0)
:usage_error

View file

@ -1,5 +1 @@
"Hello ther" "e good fellow."
char * = "How are you"" d""o""i""ng today?\n";
hi
_TEST _ING _1
5e+307
'a'

216
05/parse.b Normal file
View file

@ -0,0 +1,216 @@
; Parse the tokens in [tokens, tokens_end) as an expression and write it to out.
; Tokens are 16 bytes each. The expression written is an 8-byte header (kind in byte 0,
; type in the 4 bytes at offset 4) followed by an 8-byte value.
; Only single-token expressions are handled so far: integer, character, floating-point
; and string literal constants. An empty token range, more than one token, or a single
; token of any other kind is reported with token_error.
;   tokens     - pointer to the first token of the expression
;   tokens_end - pointer just past the last token of the expression
;   out        - where to write the expression
; Returns a pointer to the end of the expression that was written to out.
function parse_expression
argument tokens
argument tokens_end
argument out
local in
local a
local b
local c
local p
local value
if tokens == tokens_end goto empty_expression
p = tokens + 16
if p == tokens_end goto single_token_expression
goto unrecognized_expression
:single_token_expression
in = tokens
c = *1in
if c == TOKEN_CONSTANT_INT goto expression_integer
if c == TOKEN_CONSTANT_CHAR goto expression_integer ; character constants are basically the same as integer constants
if c == TOKEN_CONSTANT_FLOAT goto expression_float
if c == TOKEN_STRING_LITERAL goto expression_string_literal
; any other kind of token can't be parsed yet: report it rather than hitting a trap byte
goto unrecognized_expression
:expression_integer
*1out = EXPRESSION_CONSTANT_INT
p = in + 8
value = *8p
p = out + 8
*8p = value
p = in + 1
a = int_suffix_to_type(*1p) ; what the suffix says the type should be
b = int_value_to_type(value) ; what the value says the type should be (if the value is too large to fit in int)
c = max_signed(a, b) ; take the maximum of the two types
; The maximum is the right type except in one case: the suffix asks for unsigned int
; but the value alone asks for long. The maximum is then TYPE_LONG, which is signed.
; A constant with a u suffix must stay unsigned, so pick the smallest unsigned type
; which can hold the value instead.
if a != TYPE_UNSIGNED_INT goto expression_integer_have_type
if b != TYPE_LONG goto expression_integer_have_type
c = TYPE_UNSIGNED_INT
if value [ 0x100000000 goto expression_integer_have_type
c = TYPE_UNSIGNED_LONG
:expression_integer_have_type
p = out + 4
*4p = c
in += 16
out += 16
return out
:expression_float
*1out = EXPRESSION_CONSTANT_FLOAT
p = in + 8
value = *8p
p = out + 8
*8p = value
p = in + 1
a = float_suffix_to_type(*1p)
p = out + 4
*4p = a
in += 16
out += 16
return out
:expression_string_literal
*1out = EXPRESSION_STRING_LITERAL
p = in + 8
value = *8p
p = out + 8
*8p = value
; we already know this is char*
p = out + 4
*4p = TYPE_POINTER_TO_CHAR
in += 16
out += 16
return out
:empty_expression
token_error(tokens, .str_empty_expression)
:str_empty_expression
string Empty expression.
byte 0
:unrecognized_expression
token_error(tokens, .str_unrecognized_expression)
:str_unrecognized_expression
string Unrecognized expression.
byte 0
; Shared return stubs. Labels are not local to a function, so other functions
; can goto these to return the corresponding type.
:return_type_int
return TYPE_INT
:return_type_long
return TYPE_LONG
:return_type_unsigned_int
return TYPE_UNSIGNED_INT
:return_type_unsigned_long
return TYPE_UNSIGNED_LONG
:return_type_float
return TYPE_FLOAT
:return_type_double
return TYPE_DOUBLE
; Map the suffix of an integer constant to the type that the suffix asks for:
;   NUMBER_SUFFIX_L  -> TYPE_LONG
;   NUMBER_SUFFIX_U  -> TYPE_UNSIGNED_INT
;   NUMBER_SUFFIX_UL -> TYPE_UNSIGNED_LONG
;   anything else    -> TYPE_INT
function int_suffix_to_type
argument suffix
if suffix == NUMBER_SUFFIX_L goto int_suffix_to_type_l
if suffix == NUMBER_SUFFIX_U goto int_suffix_to_type_u
if suffix == NUMBER_SUFFIX_UL goto int_suffix_to_type_ul
return TYPE_INT
:int_suffix_to_type_l
return TYPE_LONG
:int_suffix_to_type_u
return TYPE_UNSIGNED_INT
:int_suffix_to_type_ul
return TYPE_UNSIGNED_LONG
; Map the suffix of a floating-point constant to its type:
; TYPE_FLOAT for NUMBER_SUFFIX_F, TYPE_DOUBLE for everything else.
function float_suffix_to_type
argument suffix
if suffix != NUMBER_SUFFIX_F goto float_suffix_to_type_double
return TYPE_FLOAT
:float_suffix_to_type_double
return TYPE_DOUBLE
; Pick the type of an integer constant from its value alone: the smallest of
; int, long, unsigned long which can hold it (unsigned only when nothing else fits).
;   value below 0x80000000         -> TYPE_INT
;   value below 0x8000000000000000 -> TYPE_LONG
;   otherwise                      -> TYPE_UNSIGNED_LONG
; NOTE(review): the `[` comparisons are presumably unsigned less-than -- confirm against the language reference.
function int_value_to_type
argument value
if value [ 0x80000000 goto int_value_to_type_int
if value [ 0x8000000000000000 goto int_value_to_type_long
return TYPE_UNSIGNED_LONG
:int_value_to_type_int
return TYPE_INT
:int_value_to_type_long
return TYPE_LONG
; Print an expression as "(type)value".
;   constant int   - the value is printed with putn
;   constant float - the 64-bit value is printed with putx64
;   string literal - the 64-bit value is printed as 0x followed by putx32 output
; Any other kind of expression runs into the 0xcc byte below
; (presumably a deliberate trap for kinds that can't be printed yet).
function print_expression
argument expression
local tag
local field
; the type is the 4 bytes at offset 4 of the expression header
putc(40)
field = expression + 4
print_type(*4field)
putc(41)
tag = *1expression
; every kind handled here keeps its 8-byte value at offset 8
field = expression + 8
if tag == EXPRESSION_CONSTANT_INT goto print_expr_int
if tag == EXPRESSION_CONSTANT_FLOAT goto print_expr_float
if tag == EXPRESSION_STRING_LITERAL goto print_expr_str
byte 0xcc
:print_expr_int
putn(*8field)
return
:print_expr_float
putx64(*8field)
return
:print_expr_str
putc('0)
putc('x)
putx32(*8field)
return
; Print a type.
;   type - index into the global array `types` of the first byte of the type
; NOTE: to make things easier, the format which this outputs isn't the same as C's, specifically we have
; *int for pointer to int and [5]int for array of 5 ints
; Pointer and array prefixes are printed one after another (by looping back to print_type_top)
; until a non-pointer, non-array type is reached. Structs and unions are printed
; as just "struct" / "union".
; Prints a message to file descriptor 2 and exits with status 1 if the byte at
; `type` isn't one of the TYPE_ constants handled below.
function print_type
argument type
local c
:print_type_top
c = types + type
c = *1c
if c == TYPE_VOID goto print_type_void
if c == TYPE_CHAR goto print_type_char
if c == TYPE_UNSIGNED_CHAR goto print_type_unsigned_char
if c == TYPE_SHORT goto print_type_short
if c == TYPE_UNSIGNED_SHORT goto print_type_unsigned_short
if c == TYPE_INT goto print_type_int
if c == TYPE_UNSIGNED_INT goto print_type_unsigned_int
if c == TYPE_LONG goto print_type_long
if c == TYPE_UNSIGNED_LONG goto print_type_unsigned_long
if c == TYPE_FLOAT goto print_type_float
if c == TYPE_DOUBLE goto print_type_double
if c == TYPE_POINTER goto print_type_pointer
if c == TYPE_ARRAY goto print_type_array
if c == TYPE_STRUCT goto print_type_struct
if c == TYPE_UNION goto print_type_union
fputs(2, .str_bad_print_type)
exit(1)
:str_bad_print_type
string Bad type passed to print_type.
byte 10
byte 0
:print_type_void
return puts(.str_void)
:print_type_char
return puts(.str_char)
:print_type_unsigned_char
return puts(.str_unsigned_char)
:print_type_short
return puts(.str_short)
:print_type_unsigned_short
return puts(.str_unsigned_short)
:print_type_int
return puts(.str_int)
:print_type_unsigned_int
return puts(.str_unsigned_int)
:print_type_long
return puts(.str_long)
:print_type_unsigned_long
return puts(.str_unsigned_long)
:print_type_float
return puts(.str_float)
:print_type_double
return puts(.str_double)
:print_type_pointer
; TYPE_POINTER is followed directly by the type which is pointed to
putc('*)
type += 1
goto print_type_top
:print_type_array
; TYPE_ARRAY is followed by the 8-byte element count, then the element type
putc('[)
type += 1
; `type` is an index into types, not an address, so it has to be turned
; into a pointer before the count can be read
c = types + type
putn(*8c) ; UNALIGNED
putc('])
type += 8
goto print_type_top
:print_type_struct
return puts(.str_struct)
:print_type_union
return puts(.str_union)

View file

@ -108,10 +108,16 @@ global rodata_end_offset
; uint line
; ulong data
; This corresponds to translation phases 5-6 and the first half of 7
; IMPORTANT: this function uses pointers to pptokens, so they should NOT be freed!
; IMPORTANT: this function uses pointers to pptokens, so it should NOT be freed!
; Returns a pointer to the end of tokens.
function tokenize
argument pptokens
argument out
; you might think we wouldn't need these arguments because the pptokens array starts with
; a line directive. but we also use this function to tokenize the expression of a #if,
; where that isn't the case.
argument initial_filename
argument initial_line_number
local in
local file
local line_number
@ -129,6 +135,11 @@ function tokenize
local lower
local upper
file_add(initial_filename)
file = file_get_index(initial_filename)
line_number = initial_line_number
in = pptokens
:tokenize_loop
c = *1in
@ -301,10 +312,10 @@ function tokenize
:float_have_significand_and_exponent
if significand == 0 goto float_zero
normalize_float(&significand, &exponent)
putn(significand)
putc(32)
putn_signed(exponent)
putc(10)
; putn(significand)
; putc(32)
; putn_signed(exponent)
; putc(10)
; make number round to the nearest representable float roughly (this is what gcc does)
; this fails for 5e-100 probably because of imprecision, but mostly works
significand += 15
@ -357,8 +368,15 @@ function tokenize
data = 0x7ff0000000000000 ; double infinity
goto float_have_data
:tokenize_loop_end
; EOF token
*1out = TOKEN_EOF
out += 2
*2out = file
out += 2
*4out = line_number
out += 12
return 0
return out
:f_suffix_on_integer
compile_error(file, line_number, .str_f_suffix_on_integer)
:str_f_suffix_on_integer
@ -581,6 +599,7 @@ function print_tokens
if *1p == TOKEN_CONSTANT_FLOAT goto print_token_float
if *1p == TOKEN_STRING_LITERAL goto print_token_string_literal
if *1p == TOKEN_IDENTIFIER goto print_token_identifier
if *1p == TOKEN_EOF goto print_token_eof
fputs(2, .str_print_bad_token)
exit(1)
:print_token_keyword
@ -603,9 +622,13 @@ function print_tokens
:print_token_float
p += 8
puts(.str_constant_float)
putx(*8p)
putx64(*8p)
p += 8
putc(32)
goto print_tokens_loop
:print_token_eof
puts(.str_eof)
goto print_token_data
:print_token_info
p += 1
putc('~)
@ -643,3 +666,6 @@ function print_tokens
string Unrecognized token type in print_tokens. Aborting.
byte 10
byte 0
:str_eof
string EOF
byte 0

View file

@ -44,6 +44,14 @@ function left_shift
:left_shift_negative
n = 0 - n
return x > n
; Return the larger of a and b, compared as signed integers.
function max_signed
argument a
argument b
if b >= a goto max_signed_take_b
return a
:max_signed_take_b
return b
function file_error
argument name
@ -327,26 +335,45 @@ function fputn_signed
fputn(fd, n)
return
function fputx
:hex_digits
string 0123456789abcdef
function fputx64
argument fd
argument n
local m
local x
m = 60
:fputx_loop
:fputx64_loop
x = n > m
x &= 0xf
x += .hex_digits
fputc(fd, *1x)
m -= 4
if m >= 0 goto fputx_loop
if m >= 0 goto fputx64_loop
return
:hex_digits
string 0123456789abcdef
function putx
function putx64
argument n
fputx(1, n)
fputx64(1, n)
return
; Write the low 32 bits of n to fd as exactly 8 hexadecimal digits,
; most significant digit first, using the characters at hex_digits.
;   fd - file descriptor to write to
;   n  - number to print
function fputx32
argument fd
argument n
local shift
local digit
shift = 32
:fputx32_loop
shift -= 4
; in an expression, > is a right shift: move the wanted 4 bits to the bottom, mask them off,
; then turn them into a pointer to the corresponding character
digit = n > shift
digit &= 0xf
digit += .hex_digits
fputc(fd, *1digit)
if shift > 0 goto fputx32_loop
return
; Write n as 8 hexadecimal digits to file descriptor 1; this just calls fputx32.
function putx32
argument n
fputx32(1, n)
return
function putn