start parsing statements (not a lot yet)

This commit is contained in:
pommicket 2022-02-03 22:53:38 -05:00
parent fd02968c23
commit d1167f03d0
4 changed files with 260 additions and 65 deletions

View file

@ -230,6 +230,52 @@
; types will be initialized (in main) so that this refers to the type char* ; types will be initialized (in main) so that this refers to the type char*
#define TYPE_POINTER_TO_CHAR 20 #define TYPE_POINTER_TO_CHAR 20
; STATEMENTS
; In C, note that `if', `while', etc. always have a single statement as their body:
;     if (x) { y; z; w; }
; here {y; z; w;} is a single `compound' statement containing three statements.
; Our statements don't correspond exactly to the C89 standard's notion of statements; in particular,
; labels count as separate statements, and declarations count as statements.
; Each statement is stored as exactly 40 bytes:
;    offset  0: uchar  type (one of the STATEMENT_* constants below)
;    offset  1: uchar  padding
;    offset  2: ushort file
;    offset  4: uint   line
;    offset  8: ulong  data1
;    offset 16: ulong  data2
;    offset 24: ulong  data3
;    offset 32: ulong  data4
; A type of 0 indicates the end of the block; no STATEMENT_* constant has the value 0, so this is unambiguous.
; Data layout for particular statements (any dataN not mentioned is unused):
;    - STATEMENT_EXPRESSION - data1 is a pointer to expression data
;    - STATEMENT_LOCAL_DECLARATION - declaring a local variable (automatic/"register" storage duration);
;         data1 = total bytes used by all local variables so far in this function, including this one
;    - STATEMENT_LABEL - data1 is a pointer to the name of the label
;    - STATEMENT_BLOCK - data1 is a pointer to an array of statements (terminated by a statement of type 0)
;    - STATEMENT_IF - data1 is a pointer to the condition, data2 is a pointer to the `if' branch statement,
;         data3 is a pointer to the `else' branch statement, or 0 if there is none
;    - STATEMENT_SWITCH - data1 is a pointer to the expression, data2 is a pointer to the body statement
;    - STATEMENT_WHILE - data1 is a pointer to the condition, data2 is a pointer to the body statement
;    - STATEMENT_DO - data1 is a pointer to the body statement, data2 is a pointer to the condition
;         (note that this is the opposite order from STATEMENT_WHILE)
;    - STATEMENT_FOR - data1,2,3 are pointers to the first, second, and third expressions inside parentheses,
;         data4 is a pointer to the body statement
;    - STATEMENT_GOTO - data1 is a pointer to the name of the label
;    - STATEMENT_CONTINUE - no data
;    - STATEMENT_BREAK - no data
;    - STATEMENT_RETURN - data1 is a pointer to the expression, or 0 if there is none
#define STATEMENT_EXPRESSION 1
#define STATEMENT_LOCAL_DECLARATION 2
#define STATEMENT_LABEL 3
#define STATEMENT_BLOCK 4
#define STATEMENT_IF 5
#define STATEMENT_SWITCH 6
#define STATEMENT_WHILE 7
#define STATEMENT_DO 8
#define STATEMENT_FOR 9
#define STATEMENT_GOTO 0xa
#define STATEMENT_CONTINUE 0xb
#define STATEMENT_BREAK 0xc
#define STATEMENT_RETURN 0xd
:keyword_table :keyword_table
byte SYMBOL_SEMICOLON byte SYMBOL_SEMICOLON
byte 59 byte 59

View file

@ -42,6 +42,16 @@ global output_file_data
; ident list of global variables. each one is stored as ; ident list of global variables. each one is stored as
; (type << 32) | address ; (type << 32) | address
global global_variables global global_variables
; ident list of functions. each entry is meant to be a pointer to a single statement - which should always be a STATEMENT_BLOCK
; NOTE(review): parse_tokens currently adds the byte offset into function_stmt_data rather than a pointer - confirm which is intended.
global function_statements
; statement_datas[0] = pointer to statement data for block-nesting depth 0 (i.e. function bodies)
; statement_datas[1] = pointer to statement data for block-nesting depth 1 (blocks inside functions)
; statement_datas[2] = pointer to statement data for block-nesting depth 2 (blocks inside blocks inside functions)
; etc. up to statement_datas[15], following the C89 translation limit
;    "* 15 nesting levels of compound statements, iteration control structures, and selection control structures" C89 § 2.2.4.1
; each depth has its own buffer, presumably so that the statements of one block stay contiguous
; while the statements of a block nested inside it are being written - TODO confirm
global statement_datas
; statement_datas_ends[d] = pointer to where the next statement at block-nesting depth d will be written
global statement_datas_ends
; current block-nesting depth: parse_statement adds 1 on entering a block and subtracts 1 on leaving it,
; so it should be 0 again once a whole function body has been parsed
global parse_stmt_depth
#include util.b #include util.b
#include idents.b #include idents.b
@ -154,15 +164,32 @@ function main
local tokens local tokens
local ast local ast
local p local p
local q
local i local i
local output_fd local output_fd
; set up the per-depth statement buffers:
;   statement_datas[i]      = start of the buffer for block-nesting depth i
;   statement_datas_ends[i] = current write position in that buffer
statement_datas = malloc(4000)
statement_datas_ends = malloc(4000)
p = statement_datas
q = statement_datas_ends
i = 0
:statement_datas_loop
*8p = malloc(4000000) ; supports 100,000 statements at each level
; every buffer starts out empty, so its end is the start of the buffer itself.
; (this must be *8p, the buffer, and not p, which is the address of the slot inside
; statement_datas - otherwise statements would be written over the statement_datas table.)
*8q = *8p
p += 8
q += 8
i += 1
if i < 16 goto statement_datas_loop
fill_in_powers_of_10() fill_in_powers_of_10()
typedefs = ident_list_create(100000) typedefs = ident_list_create(100000)
enumerators = ident_list_create(4000000) enumerators = ident_list_create(4000000)
structures = ident_list_create(4000000) structures = ident_list_create(4000000)
global_variables = ident_list_create(4000000) global_variables = ident_list_create(400000)
function_statements = ident_list_create(400000)
function_stmt_data = malloc(800000) ; should be at least 40 bytes * max # of functions
dat_banned_objmacros = 255 dat_banned_objmacros = 255
dat_banned_fmacros = 255 dat_banned_fmacros = 255
@ -197,14 +224,15 @@ function main
translation_phase_4(input_filename, pptokens, processed_pptokens) translation_phase_4(input_filename, pptokens, processed_pptokens)
free(pptokens) free(pptokens)
pptokens = processed_pptokens pptokens = processed_pptokens
print_pptokens(pptokens) ;print_pptokens(pptokens)
print_separator() ;print_separator()
;print_object_macros() ;print_object_macros()
;print_function_macros() ;print_function_macros()
tokens = malloc(16000000) tokens = malloc(16000000)
p = tokenize(pptokens, tokens, input_filename, 1) p = tokenize(pptokens, tokens, input_filename, 1)
print_tokens(tokens, p) print_tokens(tokens, p)
print_separator()
; NOTE: do NOT free pptokens; identifiers still reference them. ; NOTE: do NOT free pptokens; identifiers still reference them.
parse_tokens(tokens) parse_tokens(tokens)

115
05/main.c
View file

@ -1,59 +1,64 @@
typedef struct { int f(void) {
int i[41]; blah:blah:blah:;
long double d; }
} (*x___)(void);
typedef enum X {
R,S,T
} *Foo[sizeof(unsigned long)];
typedef int A___[T];
typedef struct A { /* typedef struct { */
int x, y; /* int i[41]; */
long double c; /* long double d; */
unsigned long d; /* } (*x___)(void); */
char e[3]; /* */
long f; /* typedef enum X { */
} A; /* R,S,T */
/* } *Foo[sizeof(unsigned long)]; */
typedef union B{ /* typedef int A___[T]; */
int x; /* */
struct { /* typedef struct A { */
int y; /* int x, y; */
struct {long z; } c; /* long double c; */
} c; /* unsigned long d; */
}B; /* char e[3]; */
/* long f; */
typedef int QQQ[sizeof(A)+sizeof"hello"]; /* } A; */
typedef int RRR[sizeof(struct B)]; /* */
/* typedef union B{ */
static unsigned int x={55}; /* int x; */
static char *s = "hello"; /* struct { */
static char *t = "goodbye"; /* int y; */
static char u[8] = "hellothe"; /* struct {long z; } c; */
static char v[100] = "re my"; /* } c; */
static char w[] = "friendly"; /* }B; */
static char x_[] = "hi"; /* */
typedef int A_[sizeof x_ + sizeof u]; /* typedef int QQQ[sizeof(A)+sizeof"hello"]; */
/* typedef int RRR[sizeof(struct B)]; */
static int a[5] = {1,2,3}; /* */
static char b[6][7] = {{'a'},{'b'},{'c'},{'d'},{'e'}}; /* static unsigned int x={55}; */
static char __b[][7] = {{'a'},"hello",'r'}; /* static char *s = "hello"; */
static int _u = sizeof __b; /* static char *t = "goodbye"; */
/* static char u[8] = "hellothe"; */
struct { /* static char v[100] = "re my"; */
int a; /* static char w[] = "friendly"; */
long b; /* static char x_[] = "hi"; */
} x1[] = {0x1234567890, 1ul<<60|1ul<<3, 77}; /* typedef int A_[sizeof x_ + sizeof u]; */
int y1 = 0x12345678; /* */
/* static int a[5] = {1,2,3}; */
struct { /* static char b[6][7] = {{'a'},{'b'},{'c'},{'d'},{'e'}}; */
int x[2], y; /* static char __b[][7] = {{'a'},"hello",'r'}; */
} test[] = {3, 5,0x1234,0x4321}; /* static int _u = sizeof __b; */
typedef int Blah[sizeof((B *)0)->c.y]; /* */
unsigned marker = 0xdeadbeef; /* struct { */
/* int a; */
typedef int (*FUNCTION)(void); /* long b; */
typedef int AAAA[sizeof*****((FUNCTION)0)]; /* } x1[] = {0x1234567890, 1ul<<60|1ul<<3, 77}; */
/* int y1 = 0x12345678; */
/* */
/* struct { */
/* int x[2], y; */
/* } test[] = {3, 5,0x1234,0x4321}; */
/* typedef int Blah[sizeof((B *)0)->c.y]; */
/* unsigned marker = 0xdeadbeef; */
/* */
/* typedef int (*FUNCTION)(void); */
/* typedef int AAAA[sizeof*****((FUNCTION)0)]; */
/* typedef int X[sizeof(int)+4]; */ /* typedef int X[sizeof(int)+4]; */

View file

@ -35,7 +35,7 @@ function structure_is_union
if offset == 0 goto return_1 ; if that's 0, it's a union or 1-element struct if offset == 0 goto return_1 ; if that's 0, it's a union or 1-element struct
goto return_0 goto return_0
; parse a translation unit
function parse_tokens function parse_tokens
argument tokens argument tokens
local token local token
@ -44,6 +44,7 @@ function parse_tokens
local p local p
local b local b
local c local c
local n
local base_type local base_type
local base_type_end local base_type_end
local name local name
@ -52,6 +53,7 @@ function parse_tokens
local suffix local suffix
local suffix_end local suffix_end
local is_extern local is_extern
local out
token = tokens token = tokens
:parse_tokens_loop :parse_tokens_loop
@ -64,7 +66,7 @@ function parse_tokens
b = token_is_type(token) b = token_is_type(token)
if b != 0 goto parse_toplevel_decl if b != 0 goto parse_toplevel_decl
die(.str_bad_statement) token_error(token, .str_bad_statement)
:str_bad_statement :str_bad_statement
string Bad statement. string Bad statement.
byte 0 byte 0
@ -164,12 +166,26 @@ function parse_tokens
byte 0 byte 0
:parse_function_definition :parse_function_definition
p = types + type p = types + type
; @NOTE: remember to turn array members into pointers ; @TODO: parameters
; @NOTE: remember to turn array members into pointers
if *1p != TYPE_FUNCTION goto lbrace_after_declaration if *1p != TYPE_FUNCTION goto lbrace_after_declaration
die(.str_fdNI) ; @TODO
:str_fdNI global function_stmt_data ; initialized in main
string function definitions not implemented. global function_stmt_data_bytes_used
byte 10
n = function_stmt_data_bytes_used
out = function_stmt_data + function_stmt_data_bytes_used
parse_statement(&token, &out)
if parse_stmt_depth != 0 goto stmtdepth_internal_err
function_stmt_data_bytes_used = out - function_stmt_data
ident_list_add(function_statements, name, n)
goto parse_tokens_loop
:stmtdepth_internal_err
token_error(token, .str_stmtdepth_internal_err)
:str_stmtdepth_internal_err
string Internal compiler error: parse_stmt_depth is not 0 after parsing function body.
byte 0 byte 0
:lbrace_after_declaration :lbrace_after_declaration
token_error(token, .str_lbrace_after_declaration) token_error(token, .str_lbrace_after_declaration)
@ -240,6 +256,106 @@ function parse_tokens
:parse_tokens_eof :parse_tokens_eof
return return
; write type, file, and line info for a statement
;   out   - pointer to the start of the 40-byte statement to fill in
;   type  - one of the STATEMENT_* constants
;   token - pointer to the token whose file and line are recorded in the statement
; only the first 8 bytes of the statement (type, padding, file, line) are touched.
; the padding byte is not written. always returns 0.
function write_statement_header
; these must be declared as arguments (not locals) so that they receive the values passed by the caller
argument out
argument type
argument token
*1out = type
; skip type + padding in the statement, and the first 2 bytes of the token
out += 2
token += 2
*2out = *2token ; file
out += 2
token += 2
*4out = *4token ; line
return 0
; parses the statement starting at the token *p_token, writing statement data to *p_out.
;   p_token - pointer to a pointer to the current token; on return, it points just past the statement
;   p_out   - pointer to a pointer to where statement data should be written; on return, it points
;             just past the statement data which was written
; every statement written is exactly 40 bytes, but the amount *p_out advances varies:
;   - each `label:' prefix writes its own STATEMENT_LABEL (40 bytes)
;   - a block writes one STATEMENT_BLOCK (40 bytes); the statements inside it go to the
;     statement data for the current nesting depth (see statement_datas_ends), not to *p_out
;   - an empty statement (just `;') writes nothing
; anything else is currently reported as an error via token_error.
function parse_statement
argument p_token
argument p_out
local out
local token
local p
local c
; NOTE(review): n is declared but not used in this function
local n
out = *8p_out
token = *8p_token
:stmt_label_loop
; if second token in statement is a colon, this must be a label
; (tokens are 16 bytes each, so token + 16 is the next token)
; NOTE(review): the first token is not checked to be an identifier - confirm that is acceptable
p = token + 16
if *1p == SYMBOL_COLON goto stmt_label
goto stmt_label_loop_end
:stmt_label
write_statement_header(out, STATEMENT_LABEL, token)
; move to data1 of the statement, and to the 8 bytes of data at offset 8 of the token
out += 8
token += 8
*8out = *8token ; copy label name
out += 32 ; 8 + 32 = 40: out now points to the next statement
token += 24 ; skip ident name, and colon (8 + 24 = 32 bytes = 2 tokens)
; there may be several labels in a row (e.g. a:b:c:;), so check again
goto stmt_label_loop
:stmt_label_loop_end
; dispatch on the type of the first token of the statement proper
c = *1token
if c == SYMBOL_SEMICOLON goto stmt_empty
if c == SYMBOL_LBRACE goto stmt_block
; token_error is assumed not to return (string data follows the call) - TODO confirm
token_error(token, .str_unrecognized_statement)
:str_unrecognized_statement
string Unrecognized statement.
byte 0
:parse_statement_ret
; common exit: report the new token and output positions back to the caller
*8p_token = token
*8p_out = out
return
:stmt_block
local block_p_out
; find the appropriate statement data to use for this block's body
; block_p_out = address of the statement_datas_ends entry for the current depth
; (presumably `<' is a left shift here, making the offset parse_stmt_depth * 8 - TODO confirm)
block_p_out = statement_datas_ends
block_p_out += parse_stmt_depth < 3
write_statement_header(out, STATEMENT_BLOCK, token)
out += 8
; data1 = where this block's statements will start (the current end of this depth's statement data)
*8out = *8block_p_out
out += 32
parse_stmt_depth += 1
if parse_stmt_depth >= 16 goto too_much_nesting
token += 16 ; skip opening {
:parse_block_loop
if *1token == TOKEN_EOF goto parse_block_eof
if *1token == SYMBOL_RBRACE goto parse_block_loop_end
; parse one statement of the body; this advances both token and *block_p_out.
; nested blocks use the next depth's statement data, since parse_stmt_depth was incremented above.
parse_statement(&token, block_p_out)
goto parse_block_loop
:parse_block_loop_end
token += 16 ; skip closing }
; terminate the block's statement array with a statement of type 0
p = *8block_p_out
*1p = 0 ; probably redundant, but whatever
*8block_p_out += 8 ; add 8 and not 1 because of alignment
parse_stmt_depth -= 1
goto parse_statement_ret
:parse_block_eof
; *8p_token has not been updated yet, so this reports the error at the start of the statement
token_error(*8p_token, .str_parse_block_eof)
:str_parse_block_eof
string End of file reached while trying to parse block. Are you missing a closing brace?
byte 0
:too_much_nesting
token_error(token, .str_too_much_nesting)
:str_too_much_nesting
string Too many levels of nesting blocks.
byte 0
:stmt_empty
; empty statement, e.g. while(something)-> ; <-
; no statement data is written for it
token += 16 ; skip semicolon
goto parse_statement_ret
; parse a global variable's initializer ; parse a global variable's initializer
; e.g. int x[5] = {1+8, 2, 3, 4, 5}; ; e.g. int x[5] = {1+8, 2, 3, 4, 5};
; advances *p_token to the token right after the initializer ; advances *p_token to the token right after the initializer