markdown to HTML converter

2021-09-01 18:27:51 -04:00 · 2021-09-01 18:27:51 -04:00 · 336a793149
commit 336a793149
parent f71545c939
9 changed files with 233 additions and 15 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1,3 @@
 README.html
 out??
+markdown
--- a/00/Makefile
+++ b/00/Makefile
@ -1,5 +1,5 @@
 all: README.html out00
-%.html: %.md
-	markdown $< > $@
 out00: in00
 	./hexcompile
+%.html: %.md ../markdown
+	../markdown $<
--- a/00/README.md
+++ b/00/README.md
@ -1,7 +1,7 @@
 # stage 00

 This directory contains the file `hexcompile`, a handwritten executable. It
-takes input file `in00` containing space/newline/[any character]-separated
+takes input file `in00` containing space/newline/(any character)-separated
 hexadecimal digit pairs (e.g. `3f`) and outputs them as bytes to the file
 `out00`. On 64-bit Linux, try running `./hexcompile` from this directory (I've
 already provided an `in00` file, which you can take a look at), and you will get
@ -369,7 +369,7 @@ That's quite a lot to take in for such a simple program, but here we are! We now
 have something that will let us write individual bytes with an ordinary text
 editor and get them translated into a binary file.

-## Limitations
+## limitations

 There are many ways in which this is a bad program. It will *only* properly
 handle lowercase hexadecimal digit pairs, separated by exactly one character,
@ -381,7 +381,7 @@ Also, we only read in data *three bytes at a time*, and output one byte at a
 time. This is a very bad idea because syscalls (e.g. `read`) are slow. `read`
 might take ~3 microseconds, which doesn't sound like a lot, but it means that if
 we used code like this to process a 50 megabyte file, say, we'd be waiting for
-a long time.
+a while.

 But these problems aren't really a big deal. We'll only be running this on
 little programs and we'll be sure to check that our input is in the right
--- a/01/Makefile
+++ b/01/Makefile
@ -3,5 +3,5 @@ out01: in01 out00
 	./out00
 out00: in00
 	../00/hexcompile
-%.html: %.md
-	markdown $< > $@
+%.html: %.md ../markdown
+	../markdown $<
--- a/01/README.md
+++ b/01/README.md
@ -333,7 +333,7 @@ header. But by a lucky coincidence, all those entries actually land on 0 bytes,
 so they'll just be treated as unrecognized (as they should be). So it's all
 good.

-## Limitations
+## limitations

 Like our last program, this one will be slow for large files. Again, that isn't
 much of a problem for us. Also, if you forget a `;` at the end of a file, it'll
--- a/Makefile
+++ b/Makefile
@ -0,0 +1,7 @@
+# Build the markdown converter, the top-level README.html, and then run the
+# default target of each stage directory.
+# `all` is not a real file, so declare it phony: otherwise a stray file named
+# "all" would stop the sub-makes below from ever running.
+.PHONY: all
+all: markdown README.html
+	$(MAKE) -C 00
+	$(MAKE) -C 01
+# The converter is built as strict C89 with extra warnings enabled.
+markdown: markdown.c
+	$(CC) -O2 -o markdown -Wall -Wconversion -Wshadow -std=c89 markdown.c
+# ./markdown derives the output name itself (README.md -> README.html).
+README.html: markdown README.md
+	./markdown README.md
--- a/README.md
+++ b/README.md
@ -11,14 +11,15 @@ executable, and the last one will be a C compiler. Each directory has its own
 README explaining what's going on.

 You can run `bootstrap.sh` to run through and test every stage.
+To get HTML versions of all README pages, run `make`.

 ## the basics

 In this series, I want to explain *everything* that's going on. I'm going to
-need to assume some passing knowledge about computers, but here's a quick
-overview of what you'll want to know before starting. I can't explain everything
-so you may need to do your own research. You don't need to understand each of
-these in full, just get a general idea at least:
+need to assume some passing knowledge, so here's a quick overview of what you'll
+want to know before starting. I can't explain everything so you may need to do
+your own research. You don't need to understand each of these in full, just get
+a general idea at least:

 - what an operating system is
 - what memory is
@ -59,8 +60,8 @@ not right away.

 Bootstrapping a compiler is not an easy task, so we're trying to make it as easy
 as possible. We don't even necessarily need a standard-compliant C compiler, we
-only need enough to compile someone else's C compiler, specifically TCC
-(https://bellard.org/tcc/) since that's a compiler with very few dependencies.
+only need enough to compile someone else's C compiler, specifically we'll be
+using [TCC](https://bellard.org/tcc/) since it's written in standard C89.

 - efficiency is not a concern

@ -71,7 +72,7 @@ with itself, we'll get the same executable either way.
 ## reflections on trusting trust

 In 1984, Ken Thompson wrote the well-known article
-[*Reflections on Trusting Trust*](http://users.ece.cmu.edu/~ganger/712.fall02/papers/p761-thompson.pdf).
+[Reflections on Trusting Trust](http://users.ece.cmu.edu/~ganger/712.fall02/papers/p761-thompson.pdf).
 This is one of the things that inspired me to start this project. To summarize
 the article: it is possible to create a malicious C compiler which will
 replicate its own malicious functionalities (e.g. detecting password-checking
--- a/bootstrap.sh
+++ b/bootstrap.sh
@ -26,6 +26,7 @@ else
 	exit 1
 fi

+echo 'Processing stage 00...'
 cd 00
 rm -f out00
 make -s out00
@ -36,6 +37,7 @@ fi
 rm -f out00
 cd ..

+echo 'Processing stage 01...'
 cd 01
 rm -f out0[01]
 make -s out01
--- a/markdown.c
+++ b/markdown.c
@ -0,0 +1,207 @@
+/*
+a little program to convert markdown to html, for READMEs
+I was using markdown.pl but that has some annoying problems
+This doesn't support all of markdown; I'll add more as I need it.
+*/
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+/*
+output one character that is inside a code span, escaping the characters that
+are special in HTML so that code such as `a < b` or `&x` is shown literally
+instead of being parsed as markup by the browser
+*/
+static void put_code_char(FILE *out, int c) {
+	switch (c) {
+	case '&':
+		fputs("&amp;", out);
+		break;
+	case '<':
+		fputs("&lt;", out);
+		break;
+	case '>':
+		fputs("&gt;", out);
+		break;
+	default:
+		putc(c, out);
+		break;
+	}
+}
+
+/*
+output text with *s for italics and stuff
+
+out         - stream the HTML is written to
+flags       - inline formatting state (the FLAG_ values below). It is read and
+              updated through the pointer, so a span opened on one line stays
+              open for the next call.
+line_number - only used in error messages
+text        - NUL-terminated markdown text
+
+supported syntax:
+	*italics*   **bold**   `code`   [label](url)   -- (em dash)
+Inside a code span nothing is interpreted except the closing backtick, and
+& < > are escaped. Everything else is copied through unchanged (so inline
+HTML in ordinary text still works).
+On a malformed link an error is printed to stderr and the program exits.
+*/
+static void output_md_text(FILE *out, int *flags, int line_number, const char *text) {
+	enum {
+		FLAG_I = 0x01, /* italics */
+		FLAG_B = 0x02, /* bold */
+		FLAG_CODE = 0x04 /* inside a `code` span */
+	};
+	const char *p;
+
+	for (p = text; *p; ++p) {
+		if ((*flags & FLAG_CODE) && *p != '`') {
+			/* literal code text: only HTML escaping applies */
+			put_code_char(out, *p);
+			continue;
+		}
+		switch (*p) {
+		case '*':
+			if (p[1] == '*') {
+				/* bold */
+				if (*flags & FLAG_B) {
+					fprintf(out, "</b>");
+					*flags &= ~FLAG_B;
+				} else {
+					fprintf(out, "<b>");
+					*flags |= FLAG_B;
+				}
+				++p; /* skip the second * */
+			} else {
+				/* italics */
+				if (*flags & FLAG_I) {
+					fprintf(out, "</i>");
+					*flags &= ~FLAG_I;
+				} else {
+					fprintf(out, "<i>");
+					*flags |= FLAG_I;
+				}
+			}
+			break;
+		case '`':
+			/* code */
+			if (*flags & FLAG_CODE) {
+				fprintf(out, "</code>");
+				*flags &= ~FLAG_CODE;
+			} else {
+				fprintf(out, "<code>");
+				*flags |= FLAG_CODE;
+			}
+			break;
+		case '[': {
+			/* link: [label](url), which must be entirely within text */
+			const char *label, *url, *label_end, *url_end;
+			int n_label, n_url;
+
+			label = p+1;
+			label_end = strchr(label, ']');
+			if (!label_end) {
+				fprintf(stderr, "line %d: Unterminated link.\n", line_number);
+				exit(-1);
+			}
+			if (label_end[1] != '(') {
+				fprintf(stderr, "line %d: Bad link syntax.\n", line_number);
+				exit(-1);
+			}
+			url = label_end + 2;
+			url_end = strchr(url, ')');
+			if (!url_end) {
+				fprintf(stderr, "line %d: Unterminated URL.\n", line_number);
+				exit(-1);
+			}
+
+			n_label = (int)(label_end - label);
+			n_url  = (int)(url_end  - url);
+			/* the label is written as-is; markdown inside it is not processed */
+			fprintf(out, "<a href=\"%.*s\" target=\"_blank\">%.*s</a>",
+				n_url, url, n_label, label);
+			/* continue after the closing ) (the loop's ++p steps past it) */
+			p = url_end;
+		} break;
+		case '-':
+			if (p[1] == '-') {
+				/* em dash */
+				fprintf(out, "—");
+				++p;
+			} else {
+				goto default_case;
+			}
+			break;
+		default:
+		default_case:
+			putc(*p, out);
+			break;
+		}
+	}
+}
+
+/*
+write text to out, replacing the characters that are special in HTML
+(& < >) with entities. used for the contents of ``` code blocks, which
+must be displayed literally.
+*/
+static void output_html_escaped(FILE *out, const char *text) {
+	const char *p;
+	for (p = text; *p; ++p) {
+		switch (*p) {
+		case '&':
+			fputs("&amp;", out);
+			break;
+		case '<':
+			fputs("&lt;", out);
+			break;
+		case '>':
+			fputs("&gt;", out);
+			break;
+		default:
+			putc(*p, out);
+			break;
+		}
+	}
+}
+
+/*
+usage: markdown <name>.md
+
+Converts <name>.md to <name>.html (written next to the input). <name> is also
+used as the page title. The input is processed line by line:
+	# heading        -> <hN>, where N is the number of #s
+	(empty line)     -> ends the current list, if any, and starts a new <p>
+	- item           -> <ul><li>
+	``` ... ```      -> <pre><code>, contents HTML-escaped
+	anything else    -> inline text (see output_md_text)
+Lines are read into a 1024-byte buffer; longer lines are processed in pieces.
+
+returns 0 on success, -1 if the arguments are bad or a file can't be
+opened/written.
+*/
+int main(int argc, char **argv) {
+	FILE *in, *out;
+	char line[1024] = {0};
+	char title[256] = {0};
+	int flags = 0, txtflags = 0;
+	int line_number = 0;
+	enum {
+		FLAG_UL = 1 /* currently inside a <ul> */
+	};
+
+	if (argc < 2) {
+		fprintf(stderr, "Please provide an input file.\n");
+		return -1;
+	}
+
+	{
+		const char *in_filename = argv[1];
+		char out_filename[256] = {0};
+		char *dot;
+
+		/*
+		out_filename and title are fixed-size, so reject long names outright
+		rather than truncating them (which would lose the .md extension and
+		give a misleading error).
+		*/
+		if (strlen(in_filename) > 200) {
+			fprintf(stderr, "Input filename is too long.\n");
+			return -1;
+		}
+		strcpy(out_filename, in_filename);
+		dot = strrchr(out_filename, '.');
+		if (!dot || strcmp(dot, ".md") != 0) {
+			fprintf(stderr, "Input filename does not end in .md\n");
+			return -1;
+		}
+		*dot = '\0';
+		strcpy(title, out_filename);
+		/* ".html" is 2 bytes longer than ".md"; the length check leaves room */
+		strcpy(dot, ".html");
+
+		/*
+		open (and check) the input first, so that a missing input doesn't
+		create/truncate the output, and so that perror sees the right errno.
+		*/
+		in = fopen(in_filename, "rb");
+		if (!in) {
+			perror("Couldn't open input file");
+			return -1;
+		}
+		out = fopen(out_filename, "wb");
+		if (!out) {
+			perror("Couldn't open output file");
+			fclose(in);
+			return -1;
+		}
+	}
+
+	fprintf(out,
+		"<!DOCTYPE html>\n"
+		"<html lang=\"en\">\n"
+		"<head>\n"
+		"<meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n"
+		"<meta charset=\"utf-8\">\n"
+		"<style>\n"
+		"body { font-family: sans-serif; }\n"
+		"</style>\n"
+		"<title>%s</title>\n"
+		"</head>\n"
+		"<body>\n"
+		"<p>\n", title
+	);
+	while (fgets(line, sizeof line, in)) {
+		++line_number;
+		/* strip the line ending (LF or CRLF) */
+		line[strcspn(line, "\r\n")] = '\0';
+
+		if (line[0] == '#') {
+			/* heading */
+			int n = 1;
+			if (flags & FLAG_UL) {
+				/* a heading directly after a list ends the list */
+				fprintf(out, "</li></ul>\n");
+				flags &= ~FLAG_UL;
+			}
+			while (line[n] == '#') {
+				++n;
+			}
+			fprintf(out, "</p><h%d>", n);
+			output_md_text(out, &txtflags, line_number, line + n);
+			fprintf(out, "</h%d><p>\n", n);
+		} else if (line[0] == '\0') {
+			/* empty line: end of list and/or paragraph */
+			if (flags & FLAG_UL) {
+				fprintf(out, "</li></ul>\n");
+				flags &= ~FLAG_UL;
+			}
+			fprintf(out, "</p>\n<p>\n");
+		} else if (strncmp(line, "- ", 2) == 0) {
+			/* bullet */
+			if (flags & FLAG_UL) {
+				fprintf(out, "</li><li>");
+			} else {
+				fprintf(out, "<ul><li>");
+				flags |= FLAG_UL;
+			}
+			output_md_text(out, &txtflags, line_number, line + 2);
+			fprintf(out, "\n");
+		} else if (strncmp(line, "```", 3) == 0) {
+			/* code block: copy lines (escaped) up to the closing ``` or EOF */
+			fprintf(out, "<pre><code>\n");
+
+			while (fgets(line, sizeof line, in)) {
+				++line_number;
+				if (strncmp(line, "```", 3) == 0) {
+					break;
+				}
+				/* line still has its newline here, so none is added */
+				output_html_escaped(out, line);
+			}
+
+			fprintf(out, "</code></pre>\n");
+		} else {
+			/* ordinary text (also continues a bullet from the previous line) */
+			output_md_text(out, &txtflags, line_number, line);
+			fprintf(out, "\n");
+		}
+	}
+	if (flags & FLAG_UL) {
+		/* the file ended in the middle of a list */
+		fprintf(out, "</li></ul>\n");
+	}
+	fprintf(out, "</p>\n</body>\n</html>\n");
+
+	fclose(in);
+	{
+		/* fclose flushes, so a full disk etc. may only show up here */
+		int failed = ferror(out);
+		if (fclose(out) != 0) {
+			failed = 1;
+		}
+		if (failed) {
+			fprintf(stderr, "Error writing output file.\n");
+			return -1;
+		}
+	}
+	/* C89 has no implicit "return 0" at the end of main; make needs a 0 exit status */
+	return 0;
+}