markdown to HTML converter

2021-09-01 18:27:51 -04:00 · 2021-09-01 18:27:51 -04:00 · 336a793149
commit 336a793149
parent f71545c939
9 changed files with 233 additions and 15 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1,3 @@
 README.html
 out??
 markdown
--- a/00/Makefile
+++ b/00/Makefile
@ -1,5 +1,5 @@
 all: README.html out00
 %.html: %.md
 	markdown $< > $@
 out00: in00
 	./hexcompile
 %.html: %.md ../markdown
 	../markdown $<
--- a/00/README.md
+++ b/00/README.md
@ -1,7 +1,7 @@
 # stage 00
 This directory contains the file `hexcompile`, a handwritten executable. It
-takes input file `in00` containing space/newline/[any character]-separated
+takes input file `in00` containing space/newline/(any character)-separated
 hexadecimal digit pairs (e.g. `3f`) and outputs them as bytes to the file
 `out00`. On 64-bit Linux, try running `./hexcompile` from this directory (I've
 already provided an `in00` file, which you can take a look at), and you will get
@ -369,7 +369,7 @@ That's quite a lot to take in for such a simple program, but here we are! We now
 have something that will let us write individual bytes with an ordinary text
 editor and get them translated into a binary file.
-## Limitations
+## limitations
 There are many ways in which this is a bad program. It will *only* properly
 handle lowercase hexadecimal digit pairs, separated by exactly one character,
@ -381,7 +381,7 @@ Also, we only read in data *three bytes at a time*, and output one byte at a
 time. This is a very bad idea because syscalls (e.g. `read`) are slow. `read`
 might take ~3 microseconds, which doesn't sound like a lot, but it means that if
 we used code like this to process a 50 megabyte file, say, we'd be waiting for
-a long time.
+a while.
 But these problems aren't really a big deal. We'll only be running this on
 little programs and we'll be sure to check that our input is in the right
--- a/01/Makefile
+++ b/01/Makefile
@ -3,5 +3,5 @@ out01: in01 out00
 	./out00
 out00: in00
 	../00/hexcompile
-%.html: %.md
+%.html: %.md ../markdown
-	markdown $< > $@
+	../markdown $<
--- a/01/README.md
+++ b/01/README.md
@ -333,7 +333,7 @@ header. But by a lucky coincidence, all those entries actually land on 0 bytes,
 so they'll just be treated as unrecognized (as they should be). So it's all
 good.
-## Limitations
+## limitations
 Like our last program, this one will be slow for large files. Again, that isn't
 much of a problem for us. Also, if you forget a `;` at the end of a file, it'll
--- a/7
+++ b/7
@ -0,0 +1,7 @@
 # Default target: build the markdown converter and the top-level README.html,
 # then run make in the stage directories 00 and 01.
 all: markdown README.html
 	$(MAKE) -C 00
 	$(MAKE) -C 01
 # Compile the converter as C89 with conversion and shadowing warnings enabled.
 markdown: markdown.c
 	$(CC) -O2 -o markdown -Wall -Wconversion -Wshadow -std=c89 markdown.c
 # Regenerate README.html whenever the converter or README.md changes.
 README.html: markdown README.md
 	./markdown README.md
--- a/README.md
+++ b/README.md
@ -11,14 +11,15 @@ executable, and the last one will be a C compiler. Each directory has its own
 README explaining what's going on.
 You can run `bootstrap.sh` to run through and test every stage.
 To get HTML versions of all README pages, run `make`.
 ## the basics
 In this series, I want to explain *everything* that's going on. I'm going to
-need to assume some passing knowledge about computers, but here's a quick
+need to assume some passing knowledge, so here's a quick overview of what you'll
-overview of what you'll want to know before starting. I can't explain everything
+want to know before starting. I can't explain everything so you may need to do
-so you may need to do your own research. You don't need to understand each of
+your own research. You don't need to understand each of these in full, just get
-these in full, just get a general idea at least:
+a general idea at least:
 - what an operating system is
 - what memory is
@ -59,8 +60,8 @@ not right away.
 Bootstrapping a compiler is not an easy task, so we're trying to make it as easy
 as possible. We don't even necessarily need a standard-compliant C compiler, we
-only need enough to compile someone else's C compiler, specifically TCC
+only need enough to compile someone else's C compiler, specifically we'll be
-(https://bellard.org/tcc/) since that's a compiler with very few dependencies.
+using [TCC](https://bellard.org/tcc/) since it's written in standard C89.
 - efficiency is not a concern
@ -71,7 +72,7 @@ with itself, we'll get the same executable either way.
 ## reflections on trusting trust
 In 1984, Ken Thompson wrote the well-known article
-[*Reflections on Trusting Trust*](http://users.ece.cmu.edu/~ganger/712.fall02/papers/p761-thompson.pdf).
+[Reflections on Trusting Trust](http://users.ece.cmu.edu/~ganger/712.fall02/papers/p761-thompson.pdf).
 This is one of the things that inspired me to start this project. To summarize
 the article: it is possible to create a malicious C compiler which will
 replicate its own malicious functionalities (e.g. detecting password-checking
--- a/bootstrap.sh
+++ b/bootstrap.sh
@ -26,6 +26,7 @@ else
 	exit 1
 fi
 echo 'Processing stage 00...'
 cd 00
 rm -f out00
 make -s out00
@ -36,6 +37,7 @@ fi
 rm -f out00
 cd ..
 echo 'Processing stage 01...'
 cd 01
 rm -f out0[01]
 make -s out01
--- a/markdown.c
+++ b/markdown.c
@ -0,0 +1,207 @@
 /*
 a little program to convert markdown to html, for READMEs
 I was using markdown.pl but that has some annoying problems
 This doesn't support all of markdown; I'll add more as I need it.
 */
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
 /*
 Write `text` to `out` as HTML, translating inline markdown:
   *x*           -> <i>x</i>
   **x**         -> <b>x</b>
   `x`           -> <code>x</code> (nothing inside is interpreted)
   [label](url)  -> <a href="url" target="_blank">label</a>
   --            -> em dash
 Every other character is copied through unchanged.
 `*flags` holds which of italics/bold/code are currently open. It is read and
 updated here, so markup left open by one call is still open in the next call
 that is given the same flags.
 `line_number` is only used in error messages. A malformed link prints a
 message to stderr and terminates the program with exit(-1).
 */
 static void output_md_text(FILE *out, int *flags, int line_number, const char *text) {
 	enum {
 		FLAG_I = 0x01, /* inside <i> */
 		FLAG_B = 0x02, /* inside <b> */
 		FLAG_CODE = 0x04 /* inside <code> */
 	};
 	const char *cur = text;
 	while (*cur != '\0') {
 		const char c = *cur;
 		if (c == '`') {
 			/* a backtick always toggles code, whether or not we are inside it */
 			fputs((*flags & FLAG_CODE) ? "</code>" : "<code>", out);
 			*flags ^= FLAG_CODE;
 		} else if (*flags & FLAG_CODE) {
 			/* inside code: everything except the closing backtick is literal */
 			putc(c, out);
 		} else if (c == '*') {
 			if (cur[1] == '*') {
 				/* "**" toggles bold; skip the second star */
 				fputs((*flags & FLAG_B) ? "</b>" : "<b>", out);
 				*flags ^= FLAG_B;
 				++cur;
 			} else {
 				/* a lone "*" toggles italics */
 				fputs((*flags & FLAG_I) ? "</i>" : "<i>", out);
 				*flags ^= FLAG_I;
 			}
 		} else if (c == '[') {
 			/* [label](url) */
 			const char *label_start = cur + 1;
 			const char *label_stop = strchr(label_start, ']');
 			const char *url_start, *url_stop;
 			if (!label_stop) {
 				fprintf(stderr, "line %d: Unterminated link.\n", line_number);
 				exit(-1);
 			}
 			if (label_stop[1] != '(') {
 				fprintf(stderr, "line %d: Bad link syntax.\n", line_number);
 				exit(-1);
 			}
 			url_start = label_stop + 2;
 			url_stop = strchr(url_start, ')');
 			if (!url_stop) {
 				fprintf(stderr, "line %d: Unterminated URL.\n", line_number);
 				exit(-1);
 			}
 			fprintf(out, "<a href=\"%.*s\" target=\"_blank\">%.*s</a>",
 				(int)(url_stop - url_start), url_start,
 				(int)(label_stop - label_start), label_start);
 			/* resume after the ')'; the increment below steps past it */
 			cur = url_stop;
 		} else if (c == '-' && cur[1] == '-') {
 			/* "--" becomes an em dash; skip the second '-' */
 			fputs("—", out);
 			++cur;
 		} else {
 			putc(c, out);
 		}
 		++cur;
 	}
 }
 /*
 Entry point: convert the markdown file named by argv[1] to HTML.
 The input name must end in ".md". The output is written to the same path with
 ".md" replaced by ".html", and the path without its extension is used as the
 page title.
 Supported block syntax: "#" headings, "- " bullets, "```" fenced code (copied
 through verbatim) and blank lines as paragraph breaks. Inline markup on all
 other text is handled by output_md_text.
 Returns 0 on success and -1 on bad arguments, a file that cannot be opened,
 or a read/write failure. A malformed link makes output_md_text terminate the
 program directly.
 */
 int main(int argc, char **argv) {
 	FILE *in, *out;
 	char line[1024] = {0};
 	char title[256] = {0};
 	int flags = 0, txtflags = 0;
 	int line_number = 0;
 	int status = 0;
 	enum {
 		FLAG_UL = 1 /* currently inside a <ul> */
 	};
 	if (argc < 2) {
 		fprintf(stderr, "Please provide an input file.\n");
 		return -1;
 	}
 	{
 		const char *in_filename = argv[1];
 		char out_filename[256] = {0};
 		char *dot;
 		/* reject over-long names instead of silently truncating them;
 		   200 leaves room in the buffer for the longer ".html" extension */
 		if (strlen(in_filename) > 200) {
 			fprintf(stderr, "Input filename is too long.\n");
 			return -1;
 		}
 		strcpy(out_filename, in_filename);
 		dot = strrchr(out_filename, '.');
 		if (!dot || strcmp(dot, ".md") != 0) {
 			fprintf(stderr, "Input filename does not end in .md\n");
 			return -1;
 		}
 		*dot = '\0';
 		strcpy(title, out_filename);
 		strcpy(dot, ".html");
 		/* open the input first: a missing input must not create or truncate
 		   the output, and perror must see errno from the call that failed */
 		in = fopen(in_filename, "rb");
 		if (!in) {
 			perror("Couldn't open input file");
 			return -1;
 		}
 		out = fopen(out_filename, "wb");
 		if (!out) {
 			perror("Couldn't open output file");
 			fclose(in);
 			return -1;
 		}
 	}
 	fprintf(out,
 		"<!DOCTYPE html>\n"
 		"<html lang=\"en\">\n"
 		"<head>\n"
 		"<meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n"
 		"<meta charset=\"utf-8\">\n"
 		"<style>\n"
 		"body { font-family: sans-serif; }\n"
 		"</style>\n"
 		"<title>%s</title>\n"
 		"</head>\n"
 		"<body>\n"
 		"<p>\n", title
 	);
 	while (fgets(line, sizeof line, in)) {
 		++line_number;
 		/* strip the line ending (LF or CRLF) */
 		line[strcspn(line, "\r\n")] = '\0';
 		if (line[0] == '#') {
 			/* heading; the number of leading #s gives the level */
 			int n = 1;
 			while (line[n] == '#') ++n;
 			fprintf(out, "</p><h%d>", n);
 			output_md_text(out, &txtflags, line_number, line + n);
 			fprintf(out, "</h%d><p>\n", n);
 		} else if (line[0] == '\0') {
 			/* a blank line ends any open list and starts a new paragraph */
 			if (flags & FLAG_UL) {
 				fprintf(out, "</li></ul>\n");
 				flags &= ~FLAG_UL;
 			}
 			fprintf(out, "</p>\n<p>\n");
 		} else if (strncmp(line, "- ", 2) == 0) {
 			/* bullet */
 			if (flags & FLAG_UL) {
 				fprintf(out, "</li><li>");
 			} else {
 				fprintf(out, "<ul><li>");
 				flags |= FLAG_UL;
 			}
 			output_md_text(out, &txtflags, line_number, line + 2);
 			fprintf(out, "\n");
 		} else if (strncmp(line, "```", 3) == 0) {
 			/* fenced code: copy lines verbatim up to the closing fence */
 			fprintf(out, "<pre><code>\n");
 			while (fgets(line, sizeof line, in)) {
 				++line_number;
 				if (strncmp(line, "```", 3) == 0)
 					break;
 				fprintf(out, "%s", line);
 			}
 			fprintf(out, "</code></pre>\n");
 		} else {
 			output_md_text(out, &txtflags, line_number, line);
 			fprintf(out, "\n");
 		}
 	}
 	if (flags & FLAG_UL) {
 		/* the file ended while a list was still open; close it */
 		fprintf(out, "</li></ul>\n");
 	}
 	fprintf(out, "</p>\n</body>\n</html>\n");
 	if (ferror(in)) {
 		fprintf(stderr, "Error reading input file.\n");
 		status = -1;
 	}
 	fclose(in);
 	{
 		/* fclose flushes buffered output, so its result must be checked too */
 		int write_failed = ferror(out);
 		if (fclose(out) != 0)
 			write_failed = 1;
 		if (write_failed) {
 			fprintf(stderr, "Error writing output file.\n");
 			status = -1;
 		}
 	}
 	/* explicit return: in C89, falling off the end of main gives an undefined exit status */
 	return status;
 }