# hlhtml.py

#!/usr/bin/env python3
# vim: set ts=4 sw=4 expandtab syntax=python :

import os
import os.path
import io
import re
import html
import base64
import hashlib
import textwrap
import mimetypes
import pygments
import pygments.lexers
import pygments.formatters

from pathlib import Path
from argparse import ArgumentParser


# Matches a vim modeline that sets the syntax, e.g. "vim: set ts=4 syntax=python :".
# Group 1 captures the syntax name (the run of non-whitespace after "syntax=").
RE_VIM_SYNTAX = re.compile(r"vim\s*:\s*.*syntax=(\S+)")


def parseArguments():
    """Parse the process command line.

    Returns:
        The argparse namespace with ``lexer``, ``display_fn`` and ``output``
        (each ``None`` when the option is not given) plus the required
        positional ``fn`` (the input file).
    """
    # (flag, destination attribute, help text) for the optional string options
    option_table = (
        ('-l', 'lexer', 'source lexer (default: auto-detect)'),
        ('-f', 'display_fn', 'filename to display in output (default: input file basename)'),
        ('-o', 'output', 'output file (default: <display filename>.html)'),
    )

    cli = ArgumentParser(description='Highlight a source file for https://irys.cc')
    for flag, dest, help_text in option_table:
        cli.add_argument(flag, action='store', dest=dest, default=None, help=help_text)
    cli.add_argument('fn', action='store', metavar='FILE', type=str, help='input file')

    namespace = cli.parse_args()
    return namespace


def guessLexer(file, names=None):
    """Pick a Pygments lexer for a source file.

    A vim modeline (``vim: ... syntax=NAME``) found in the first 10 lines
    takes priority. Otherwise the candidate file names are tried in order
    and the first one Pygments recognises wins.

    Args:
        file: seekable text stream holding the source. It is rewound to
            offset 0 before scanning and again before returning.
        names: optional iterable of candidate file names, most preferred
            first; ``None`` entries are skipped.

    Returns:
        A Pygments lexer instance, or ``None`` when nothing matched.
    """
    # Imported locally so the narrowed except clauses do not rely on
    # `pygments.util` having been loaded as a side effect of another import.
    from pygments.util import ClassNotFound

    lexer = None
    file.seek(0)
    for _ in range(10):
        m = RE_VIM_SYNTAX.search(file.readline())
        if m is None:
            continue
        try:
            lexer = pygments.lexers.get_lexer_by_name(m.group(1))
        except ClassNotFound:
            # Modeline names a syntax Pygments does not know: keep scanning.
            continue
        break

    if lexer is None:
        for name in names or ():
            if name is None:
                continue
            try:
                lexer = pygments.lexers.get_lexer_for_filename(name)
            except ClassNotFound:
                continue
            # Stop at the first hit so earlier (preferred) names are not
            # overridden by later candidates.
            break

    file.seek(0)
    return lexer


def main():
    """Render one source file as a standalone highlighted HTML page.

    Reads the input file named on the command line, highlights it with
    Pygments and writes an HTML page containing the highlighted source, the
    file's SHA256 and a ``data:`` URL link to the raw content.

    Returns:
        0 on success (used as the process exit status).

    Raises:
        OSError: if the input cannot be read or the output cannot be written.
        UnicodeDecodeError: if the input is not valid UTF-8.
        pygments.util.ClassNotFound: if the lexer requested with ``-l`` is
            unknown to Pygments.
    """
    args = parseArguments()
    filename = args.display_fn or Path(args.fn).name
    output_fn = args.output or (filename + '.html')

    # Read the raw bytes once: the checksum and the "raw source" link must
    # describe the file exactly as it is on disk, not a newline-translated,
    # re-encoded copy of it.
    raw = Path(args.fn).read_bytes()
    # Decode with the same rules as open(..., 'r', encoding='utf-8'),
    # i.e. including universal-newline translation.
    source = io.TextIOWrapper(io.BytesIO(raw), encoding='utf-8').read()

    # get the lexer to use: explicit -l wins, then auto-detection, then
    # plain text so an unrecognised file still renders instead of crashing
    if args.lexer is not None:
        lexer = pygments.lexers.get_lexer_by_name(args.lexer)
    else:
        lexer = guessLexer(io.StringIO(source), [args.display_fn, args.fn])
        if lexer is None:
            lexer = pygments.lexers.get_lexer_by_name('text')

    # rough mime type guess: a lexer may declare no MIME types at all, and
    # guess_type() returns a (type, encoding) tuple whose type may be None
    if lexer.mimetypes:
        filemime = lexer.mimetypes[0]
    else:
        filemime = mimetypes.guess_type(filename)[0] or 'text/plain'

    # get sha256 of the file content
    sha256 = hashlib.sha256(raw).hexdigest()

    # base64 the entire thing
    b64content = base64.b64encode(raw).decode('ascii')
    dataurl = f"data:{filemime};base64,{b64content}"

    # TODO: render documentation, if any
    docblock = ""
    if docblock:
        docblock = f"<section class=\"sourcepage-docblock\">{docblock}</section>"
    # Newlines are turned into character references so multi-line content
    # cannot disturb the textwrap.dedent() of the page template below.
    docblock = docblock.replace("\n", "&#x0a;")

    # pygmentize
    formatter = pygments.formatters.get_formatter_by_name('html')
    hlsource = pygments.highlight(source, lexer, formatter)
    hlsource = hlsource.replace("\n", "&#x0a;")

    # The display name is arbitrary user input; escape it before embedding.
    title = html.escape(filename)

    # write the output
    output = textwrap.dedent(f"""\
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="utf-8">
            <meta name="viewport" content="initial-scale=1, width=device-width">
            <link rel="stylesheet" href="/styles.css">
            <link rel="stylesheet" href="/pygments.css">
            <title>{title}</title>
        </head>
        <body>
            <header class="sourcepage-header">
                <nav><a href="/">&laquo; back</a></nav>
                <h1>{title}</h1>
            </header>
            {docblock}
            <section class="sourcepage-hlsource">{hlsource}</section>
            <footer class="sourcepage-footer">
                <ul class="inline-list">
                    <li>SHA256: <code>{sha256}</code></li>
                    <li><a href="{dataurl}" target="_blank">raw source</a></li>
                </ul>
            </footer>
        </body>
        </html>
    """)

    # The page declares charset=utf-8, so write it as UTF-8 regardless of
    # the platform's default encoding.
    with open(output_fn, 'w', encoding='utf-8') as out_fh:
        out_fh.write(output)

    return 0


if __name__ == "__main__":
    # `exit` is a helper injected by the `site` module for interactive use
    # and is missing under `python -S`; SystemExit works everywhere and
    # carries main()'s return value as the exit status.
    raise SystemExit(main())