view abcfield.py @ 735:68f926e61d16

Revise Markdown handling of character entities. Ideally I'd like the ABC character entities to survive Markdown and then get converted. But because they start with '\', they don't. So I have no alternative but to convert them to HTML entities, which Markdown then converts to UTF-8.
author Jim Hague <jim.hague@acm.org>
date Thu, 12 Oct 2017 13:32:24 +0100
parents e896cf93fe98
children 6bd946700312
line wrap: on
line source

#!/usr/bin/env python3
#
# Extract a text field (title, by default) from a .abc file, and print it out
# with any ABC accented characters converted to HTML (default) or LaTeX.
#
# Optionally rearrange a field into display format:
# * In Title fields, change 'sort' form such as 'Exploding Potato, The'
#   to display format 'The Exploding Potato'.
# * In Key fields, translate the ABC key representation to full text,
#   e.g. G#dor becomes G# Dorian.
#
# Recognise continuation header fields and print those too. The ABC standard
# defines continuation fields as starting '+:'. Regrettably none of the tools
# I am using for the Booke recognise that syntax, so I am adopting a Booke
# convention of '<header>:+' *also* being a continuation. Note that a
# continuation is a distinct line in the field value; the value has a line
# break between it and the previous line.
#

import argparse
import pathlib
import re
import subprocess
import sys

# Map from the two characters that follow a backslash in an ABC text string
# to a pair (HTML entity, LaTeX sequence).
#
# LaTeX control words (\o, \O, \ae, \AE, \ss) are wrapped in braces so that
# letters following them in the text are not absorbed into the command name.
# NOTE(review): acute and grave i use a plain 'i' while umlaut and circumflex
# use the dotless '\i'; kept as found.
accentedletters = {
    # Acute accents
    "'A" : ("&Aacute;", "\\'{A}"),
    "'E" : ("&Eacute;", "\\'{E}"),
    "'I" : ("&Iacute;", "\\'{I}"),
    "'O" : ("&Oacute;", "\\'{O}"),
    "'U" : ("&Uacute;", "\\'{U}"),
    "'Y" : ("&Yacute;", "\\'{Y}"),
    "'a" : ("&aacute;", "\\'{a}"),
    "'e" : ("&eacute;", "\\'{e}"),
    "'i" : ("&iacute;", "\\'{i}"),
    "'o" : ("&oacute;", "\\'{o}"),
    "'u" : ("&uacute;", "\\'{u}"),
    "'y" : ("&yacute;", "\\'{y}"),

    # Grave accents
    "`A" : ("&Agrave;", "\\`{A}"),
    "`E" : ("&Egrave;", "\\`{E}"),
    "`I" : ("&Igrave;", "\\`{I}"),
    "`O" : ("&Ograve;", "\\`{O}"),
    "`U" : ("&Ugrave;", "\\`{U}"),
    "`a" : ("&agrave;", "\\`{a}"),
    "`e" : ("&egrave;", "\\`{e}"),
    "`i" : ("&igrave;", "\\`{i}"),
    "`o" : ("&ograve;", "\\`{o}"),
    "`u" : ("&ugrave;", "\\`{u}"),

    # Umlauts
    "\"A" : ("&Auml;", "\\\"{A}"),
    "\"E" : ("&Euml;", "\\\"{E}"),
    "\"I" : ("&Iuml;", "\\\"{I}"),
    "\"O" : ("&Ouml;", "\\\"{O}"),
    "\"U" : ("&Uuml;", "\\\"{U}"),
    "\"Y" : ("&Yuml;", "\\\"{Y}"),
    "\"a" : ("&auml;", "\\\"{a}"),
    "\"e" : ("&euml;", "\\\"{e}"),
    "\"i" : ("&iuml;", "\\\"{\\i}"),
    "\"o" : ("&ouml;", "\\\"{o}"),
    "\"u" : ("&uuml;", "\\\"{u}"),
    "\"y" : ("&yuml;", "\\\"{y}"),

    # Circumflexes
    "^A" : ("&Acirc;", "\\^{A}"),
    "^E" : ("&Ecirc;", "\\^{E}"),
    "^I" : ("&Icirc;", "\\^{I}"),
    "^O" : ("&Ocirc;", "\\^{O}"),
    "^U" : ("&Ucirc;", "\\^{U}"),
    "^a" : ("&acirc;", "\\^{a}"),
    "^e" : ("&ecirc;", "\\^{e}"),
    "^i" : ("&icirc;", "\\^{\\i}"),
    "^o" : ("&ocirc;", "\\^{o}"),
    "^u" : ("&ucirc;", "\\^{u}"),

    # Tilde
    "~A" : ("&Atilde;", "\\~{A}"),
    "~N" : ("&Ntilde;", "\\~{N}"),
    "~O" : ("&Otilde;", "\\~{O}"),
    "~a" : ("&atilde;", "\\~{a}"),
    "~n" : ("&ntilde;", "\\~{n}"),
    "~o" : ("&otilde;", "\\~{o}"),

    # Cedilla
    ",C" : ("&Ccedil;", "\\c{C}"),
    ",c" : ("&ccedil;", "\\c{c}"),

    # Slash
    "/O" : ("&Oslash;", "{\\O}"),
    "/o" : ("&oslash;", "{\\o}"),

    # Ring
    "AA" : ("&Aring;", "\\r{A}"),
    "aa" : ("&aring;", "\\r{a}"),

    # Ligatures
    "AE" : ("&AElig;", "{\\AE}"),
    "ae" : ("&aelig;", "{\\ae}"),
    "ss" : ("&szlig;", "{\\ss}"),

    # Quote marks
    "''" : ("&apos;", "'"),
    "'\"" : ("&quot;", "\""),
}

# Map from lower-case ABC mode abbreviation to the display name of the mode.
# Abbreviations not listed here (including the major/ionian ones) are shown
# as "Major" by convertKeyToDisplay.
abckeys = {
    "m":   "Minor",
    "min": "Minor",
    "aeo": "Aeolian",
    "mix": "Mixolydian",
    "dor": "Dorian",
    "phr": "Phrygian",
    "lyd": "Lydian",
    "loc": "Locrian",
}

# Convert ABC accented chars to HTML entities or LaTeX.
#
# Every backslash in t is taken together with the (up to) two characters
# following it. If those two characters are a key of accentedletters the
# whole escape is replaced by the HTML entity (latex false) or the LaTeX
# sequence (latex true); otherwise the backslash and the characters are
# copied through unchanged. Either way scanning resumes after them.
def convertAccents(t, latex=False):
    column = 1 if latex else 0

    def substitute(match):
        forms = accentedletters.get(match.group(1))
        if forms is None:
            return match.group(0)
        return forms[column]

    # DOTALL: the characters after the backslash may include a newline.
    return re.sub(r'\\(.{0,2})', substitute, t, flags=re.DOTALL)

# Convert Title fields from sort to display, so Bat, The->The Bat.
#
# The text after the last comma is moved to the front. A separating space
# is inserted when the moved text ends in a letter or digit, so an elided
# article such as "L'" is joined directly ("Auberge, L'"->"L'Auberge")
# while a multi-word prefix still gets its space. Titles without a comma
# are returned unchanged.
def convertTitleToDisplay(t):
    body, comma, prefix = t.rpartition(',')
    if not comma:
        return t
    prefix = prefix.strip()
    body = body.strip()
    # Look only at the last character: testing the whole prefix would drop
    # the space whenever the prefix itself contains a space or punctuation.
    separator = " " if prefix[-1:].isalnum() else ""
    return prefix + separator + body

# Convert Key field from ABC to display, so G#dor->G# Dorian.
#
# The first character is the key letter, optionally followed by '#' or 'b'.
# The mode is the first word of what remains; as in the ABC standard only
# its first three letters are significant and case is ignored, so 'minor',
# 'Min' and 'min' are all Minor. Anything after the mode word is ignored.
# A missing or unrecognised mode is displayed as Major. An empty string is
# returned unchanged.
def convertKeyToDisplay(t):
    if not t:
        return ""
    letter = t[0].upper()
    accidental = t[1:2]
    if accidental not in ('#', 'b'):
        accidental = ""
    words = t[1 + len(accidental):].split()
    mode = words[0].lower()[:3] if words else ""
    return letter + accidental + ' ' + abckeys.get(mode, "Major")

# Convert input string from Markdown to HTML or LaTeX. Fix up link
# targets so any 'foo.abc' target links to the tune with that name.
#
# The conversion is done by running pandoc, which must be on the PATH;
# subprocess.CalledProcessError is raised if pandoc exits with an error.
# In HTML output a link target 'foo.abc' becomes 'foo.html'; in LaTeX
# output '\href{foo.abc}' becomes '\hyperlink{foo}'. The result is returned
# with surrounding whitespace stripped.
def convertMarkdown(t, latex):
    target = "--to=latex" if latex else "--to=html"
    res = subprocess.check_output(['pandoc', '--from=markdown', target], input=t, universal_newlines=True)
    # The target is matched with a negated character class so one match
    # can never run from an earlier link into a later one, and the dot of
    # the '.abc' suffix is matched literally.
    if latex:
        res = re.sub(r'\\href\{([^{}\n]*?)\.abc\}', r'\\hyperlink{\1}', res)
    else:
        res = re.sub(r'href="([^"\n]*?)\.abc"', r'href="\1.html"', res)
    return res.strip()

# Implement a custom Markdown shorthand for referencing ABC files.
# <foo.abc> will expand to ['title of foo'](foo.abc).
#
# foo.abc is looked up relative to dir; an exception from opening the file
# (e.g. FileNotFoundError) propagates to the caller. Text containing no
# <name.abc> reference is returned unchanged.
def expandCustomMarkdown(t, dir):
    # Given a match to <foo.abc>, return a markdown link to the tune with the
    # title (and subtitle, if present) of the tune as the text of the link.
    # Because we're going through Markdown, character entities must be
    # HTML. Pandoc will convert them to UTF-8.
    def getTitleLink(m):
        fname = m.group(1) + ".abc"
        with pathlib.Path(dir, fname).open() as f:
            lines = f.readlines()
        return "[" + getFullTitle(lines, dir) + "](" + fname + ")"
    # The name may not contain '<' or '>' so that a match cannot start at
    # an unrelated earlier tag, and the '.abc' suffix is matched literally.
    return re.sub(r'<([^<>\n]*?)\.abc>', getTitleLink, t)

# Return the raw text for a given field, or "" if it is not found.
#
# lines is an iterable of text lines and field the single header letter to
# look for. Only lines longer than two characters whose second character is
# ':' are considered; everything else is skipped. If starts is given, a
# field line counts only when its value begins with starts, and starts is
# removed from the value. Of the lines that count, the nth is taken.
# Continuation lines ('+:' or '<field>:+') that follow the chosen line are
# appended to it, separated by newlines; the first other header line ends
# the value.
def getFieldText(lines, field, n = 1, starts = None):
    text = ""
    remaining = n
    for raw in lines:
        entry = raw.strip()
        if len(entry) <= 2 or entry[1] != ':':
            continue
        tag = entry[0]
        generic = tag == "+"
        if generic or (tag == field and entry[2] == "+"):
            # A continuation only matters once the field has been found.
            if text:
                extra = entry[2:] if generic else entry[3:]
                text = text + '\n' + extra.strip()
            continue
        if text:
            # Some other header line: the field value is complete.
            break
        if tag != field:
            continue
        value = entry[2:].strip()
        if starts:
            if not value.startswith(starts):
                continue
            value = value[len(starts):].strip()
        if remaining > 1:
            remaining -= 1
            continue
        text = value
    return text

# Return display text for a given field, or "" if the field is not found.
#
# The raw field text is fetched with getFieldText and its ABC accents are
# converted. Title (T) fields are then switched from sort to display
# order, Key (K) fields are expanded to a full key name, and History (H)
# and Notes (N) fields are run through Markdown. dir is the directory used
# to resolve <foo.abc> references in Markdown fields.
def getFieldDisplayText(lines, dir, field, n = 1, starts = None, latex = False):
    text = getFieldText(lines, field, n, starts)
    if not text:
        return text
    kind = field.upper()
    # Fields that go through Markdown must have HTML entities.
    viaMarkdown = kind in ['H', 'N']
    text = convertAccents(text, False if viaMarkdown else latex)
    if kind == "T":
        return convertTitleToDisplay(text)
    if kind == "K":
        return convertKeyToDisplay(text)
    if viaMarkdown:
        return convertMarkdown(expandCustomMarkdown(text, dir), latex)
    return text

# Return full title: the display text of the first T field, followed by
# the display text of the second T field in parentheses when there is one.
def getFullTitle(lines, dir, starts = None, latex = False):
    title, subtitle = [
        getFieldDisplayText(lines, dir, "T", n=index, starts=starts, latex=latex)
        for index in (1, 2)
    ]
    if subtitle:
        return "".join((title, " (", subtitle, ")"))
    return title

if __name__ == "__main__":
    # Read the ABC text from the open file f, print the requested field and
    # return True if anything was printed. dir is the directory holding the
    # input file. With --display the field is converted to display text;
    # otherwise the raw text is printed and the special field FT is
    # treated as T.
    def process(f, dir, args):
        lines = f.readlines()
        if args.display:
            if args.field.upper() == "FT":
                line = getFullTitle(lines, dir, args.starts, args.latex)
            else:
                line = getFieldDisplayText(lines, dir, args.field, args.index, args.starts, args.latex)
        else:
            if args.field.upper() == "FT":
                args.field = "T"
            line = getFieldText(lines, args.field, args.index, args.starts)
        if line:
            print(line)
            return True
        else:
            return False

    # execute only if run as a script
    parser = argparse.ArgumentParser(description="Extract field data from ABC file.")
    parser.add_argument("-f", "--field", dest="field", default="T",
                        help=("extract the given field [default: %(default)s]. "
                              "Field FT is special; it returns the full title "
                              "- the title followed by subtitle in () if "
                              "present - for display text, or just the title "
                              "for non-display text."))
    parser.add_argument("-l", "--latex", dest="latex",
                        action="store_true", default=False,
                        help="convert special characters for LaTeX (default HTML)")
    parser.add_argument("-d", "--display", dest="display",
                        action="store_true", default=False,
                        help=("convert to display text. Convert accents to "
                              "LaTeX or HTML, in titles convert 'Tune, The' to "
                              "'The Tune', convert keys to full key name, "
                              "and expand Markdown in notes and history."))
    parser.add_argument("-n", "--index", dest="index",
                        action="store", type=int, default=1,
                        help="report INDEXth value [default: %(default)s]")
    parser.add_argument("-s", "--starts", dest="starts",
                        action="store", default=None,
                        help=("report only if line starts with CONTENT "
                              "and remove CONTENT"),
                        metavar="CONTENT")
    parser.add_argument('input', type=argparse.FileType('r'),
                        help='input ABC file')
    args = parser.parse_args()

    # argparse.FileType has already opened the input, so read from that
    # handle (and close it when done) rather than opening the file a second
    # time by name. This also lets '-' (standard input) work.
    with args.input as f:
        res = process(f, pathlib.Path(args.input.name).parent, args)
    # Exit status 0 if the field was found and printed, 1 otherwise.
    sys.exit(int(not res))