Mercurial > dottes
view abcfield.py @ 735:68f926e61d16
Revise Markdown handling of character entities.
Ideally I'd like the ABC character entities to survive Markdown and then
get converted. But because they start with '\', they don't. So I have
no alternative but to convert them to HTML entities, which Markdown then
converts to UTF-8.
author | Jim Hague <jim.hague@acm.org> |
---|---|
date | Thu, 12 Oct 2017 13:32:24 +0100 |
parents | e896cf93fe98 |
children | 6bd946700312 |
line wrap: on
line source
#!/usr/bin/env python3
#
# Extract a text field (title, by default) from a .abc file, and print it out
# with any ABC accented characters converted to HTML (default) or Latex.
#
# Optionally rearrange a field into display format:
# * In Title fields, change 'sort' form such as 'Exploding Potato, The'
#   to display format 'The Exploding Potato'.
# * In Key fields, translate the ABC key representation to full text,
#   e.g. G#dor becomes G# Dorian.
#
# Recognise continuation header fields and print those too. The ABC standard
# defines continuation fields as starting ':+'. Regrettably none of the tools
# I am using for the Booke recognise that syntax, so I am adopting a Booke
# convention of '<header>:+' *also* being a continuation. Note that a
# continuation is a distinct line in the field value; the value has a line
# break between it and the previous line.
#

import argparse
import pathlib
import re
import subprocess
import sys

# Map from the two characters that follow a backslash in ABC text to a
# (HTML text, LaTeX text) pair.
accentedletters = {
    # Acute accents
    "'A" : ("Á", "\\'{A}"),
    "'E" : ("É", "\\'{E}"),
    "'I" : ("Í", "\\'{I}"),
    "'O" : ("Ó", "\\'{O}"),
    "'U" : ("Ú", "\\'{U}"),
    "'Y" : ("Ý", "\\'{Y}"),
    "'a" : ("á", "\\'{a}"),
    "'e" : ("é", "\\'{e}"),
    "'i" : ("í", "\\'{i}"),
    "'o" : ("ó", "\\'{o}"),
    "'u" : ("ú", "\\'{u}"),
    "'y" : ("ý", "\\'{y}"),
    # Grave accents
    "`A" : ("À", "\\`{A}"),
    "`E" : ("È", "\\`{E}"),
    "`I" : ("Ì", "\\`{I}"),
    "`O" : ("Ò", "\\`{O}"),
    "`U" : ("Ù", "\\`{U}"),
    "`a" : ("à", "\\`{a}"),
    "`e" : ("è", "\\`{e}"),
    "`i" : ("ì", "\\`{i}"),
    "`o" : ("ò", "\\`{o}"),
    "`u" : ("ù", "\\`{u}"),
    # Umlauts
    "\"A" : ("Ä", "\\\"{A}"),
    "\"E" : ("Ë", "\\\"{E}"),
    "\"I" : ("Ï", "\\\"{I}"),
    "\"O" : ("Ö", "\\\"{O}"),
    "\"U" : ("Ü", "\\\"{U}"),
    "\"Y" : ("Ÿ", "\\\"{Y}"),
    "\"a" : ("ä", "\\\"{a}"),
    "\"e" : ("ë", "\\\"{e}"),
    # '\\i' spells the same two characters the original '\i' produced, but
    # without relying on an invalid string escape.
    "\"i" : ("ï", "\\\"{\\i}"),
    "\"o" : ("ö", "\\\"{o}"),
    "\"u" : ("ü", "\\\"{u}"),
    "\"y" : ("ÿ", "\\\"{y}"),
    # Circumflexes
    "^A" : ("Â", "\\^{A}"),
    "^E" : ("Ê", "\\^{E}"),
    "^I" : ("Î", "\\^{I}"),
    "^O" : ("Ô", "\\^{O}"),
    "^U" : ("Û", "\\^{U}"),
    "^a" : ("â", "\\^{a}"),
    "^e" : ("ê", "\\^{e}"),
    "^i" : ("î", "\\^{\\i}"),
    "^o" : ("ô", "\\^{o}"),
    "^u" : ("û", "\\^{u}"),
    # Tilde
    "~A" : ("Ã", "\\~{A}"),
    "~N" : ("Ñ", "\\~{N}"),
    "~O" : ("Õ", "\\~{O}"),
    "~a" : ("ã", "\\~{a}"),
    "~n" : ("ñ", "\\~{n}"),
    "~o" : ("õ", "\\~{o}"),
    # Cedilla
    ",C" : ("Ç", "\\c{C}"),
    ",c" : ("ç", "\\c{c}"),
    # Slash
    "/O" : ("Ø", "\\O"),
    "/o" : ("ø", "\\o"),
    # Ring
    "AA" : ("Å", "\\r{A}"),
    "aa" : ("å", "\\r{a}"),
    # Ligatures
    "AE" : ("Æ", "\\AE"),
    "ae" : ("æ", "\\ae"),
    "ss" : ("ß", "\\ss"),
    # Quote marks
    "''" : ("'", "'"),
    # NOTE(review): this entry was garbled into an unterminated string in the
    # viewed source; '&quot;' is the HTML entity for a double quote - confirm
    # against the repository copy.
    "'\"" : ("&quot;", "\""),
}

# Map from lower-case ABC mode abbreviation to the mode's display name.
# Anything not listed here is displayed as "Major".
abckeys = {
    "m": "Minor",
    "min": "Minor",
    "mix": "Mixolydian",
    "dor": "Dorian",
    "phr": "Phrygian",
    "lyd": "Lydian",
    "loc": "Locrian",
}

# Convert ABC accented chars to HTML or LaTex text.
#
# Each backslash in t, together with the (up to) two characters after it, is
# looked up in accentedletters. A recognised sequence is replaced by its HTML
# text, or by its LaTeX text if latex is true. An unrecognised sequence is
# copied through unchanged, backslash included.
def convertAccents(t, latex=False):
    column = 1 if latex else 0
    parts = []
    while True:
        before, backslash, after = t.partition('\\')
        parts.append(before)
        if not backslash:
            break
        # The two characters after the backslash are always consumed,
        # whether or not they form a known sequence.
        abc, t = after[:2], after[2:]
        if abc in accentedletters:
            parts.append(accentedletters[abc][column])
        else:
            parts.append("\\" + abc)
    return "".join(parts)

# Convert Title fields from sort to display, so Bat, The->The Bat.
#
# The text after the last comma is moved to the front. A space is inserted
# after it only when it is purely alphanumeric, so a prefix ending in
# punctuation (an elided article such as "L'") joins directly onto the rest.
# Titles with no comma are returned unchanged.
def convertTitleToDisplay(t):
    rest, comma, article = t.rpartition(',')
    if not comma:
        return t
    article = article.strip()
    rest = rest.strip()
    if article.isalnum():
        article += " "
    return article + rest

# Convert Key field from ABC to display, so G#dor->G# Dorian.
#
# The first character is the tonic, optionally followed by '#' or 'b'. The
# ABC standard only parses the first three letters of a mode name and ignores
# case, so the lookup uses the first three letters of the first word after
# the tonic; 'minor' and 'min' are therefore equivalent, and trailing
# key-field modifiers (e.g. 'clef=bass') do not hide the mode. Unknown or
# absent modes display as Major. An empty key gives an empty string.
def convertKeyToDisplay(t):
    t = t.strip()
    if not t:
        return t
    letter = t[0].upper()
    rest = t[1:]
    if rest[:1] in ('#', 'b'):
        accidental = rest[0]
        mode = rest[1:]
    else:
        accidental = ""
        mode = rest
    words = mode.split()
    modekey = words[0][:3].lower() if words else ""
    return letter + accidental + ' ' + abckeys.get(modekey, "Major")

# Convert input string from Markdown to HTML or LaTeX. Fix up link
# targets so any 'foo.abc' target links to the tune with that name.
def convertMarkdown(t, latex):
    # Run the Markdown text t through pandoc, producing LaTeX if latex is
    # true and HTML otherwise, then retarget links to .abc files and strip
    # surrounding whitespace. Raises subprocess.CalledProcessError if
    # pandoc exits with an error.
    if latex:
        target = "--to=latex"
    else:
        target = "--to=html"
    res = subprocess.check_output(['pandoc', '--from=markdown', target],
                                  input=t, universal_newlines=True)
    return fixLinkTargets(res, latex).strip()

# Rewrite link targets in pandoc output so that a link to 'foo.abc' points
# at the tune of that name: \href{foo.abc} becomes \hyperlink{foo} in LaTeX,
# and href="foo.abc" becomes href="foo.html" in HTML. The '.' before 'abc'
# is matched literally and the target may not run past its closing brace or
# quote, so only genuine '.abc' targets are rewritten.
def fixLinkTargets(res, latex):
    if latex:
        return re.sub(r'\\href\{([^}]*?)\.abc\}', r'\\hyperlink{\1}', res)
    return re.sub(r'href="([^"]*?)\.abc"', r'href="\1.html"', res)

# Implement a custom Markdown shorthand for referencing ABC files.
# <foo.abc> will expand to ['title of foo'](foo.abc).
#
# dir is the directory in which the referenced .abc files are looked up.
# Raises OSError (e.g. FileNotFoundError) if a referenced file cannot be
# opened.
def expandCustomMarkdown(t, dir):
    # Given a match to <foo.abc>, return a markdown link to the tune with the
    # title (and subtitle, if present) of the tune as the text of the link.
    # Because we're going through Markdown, character entities must be
    # HTML. Pandoc will convert them to UTF-8.
    def getTitleLink(m):
        fname = m.group(1) + ".abc"
        path = pathlib.Path(dir, fname)
        with path.open() as f:
            lines = f.readlines()
        return "[" + getFullTitle(lines, dir) + "](" + fname + ")"
    # Match a literal '.abc' suffix, and keep the name within one pair of
    # angle brackets.
    return re.sub(r'<([^<>]*?)\.abc>', getTitleLink, t)

# Return the raw text for a given field. Optionally the nth field is taken,
# or the field data must start with a designated string to be recognised.
#
# lines is an iterable of lines from the ABC file and field is the
# single-letter field name. When starts is given, only fields whose value
# begins with starts count (towards n as well) and starts is removed from the
# value. Continuation lines ('+:' or '<field>:+') directly following the
# selected field are appended, each separated by a newline. Returns "" if no
# matching field is found.
def getFieldText(lines, field, n = 1, starts = None):
    res = ""
    for line in lines:
        line = line.strip()
        # Only 'X:value' header lines with a non-empty value are of interest.
        if len(line) <= 2 or line[1] != ':':
            continue
        if line[0] == "+" or (line[0] == field and line[2] == "+"):
            # A continuation only counts once the wanted field is found.
            if not res:
                continue
            text = line[2:] if line[0] == "+" else line[3:]
            res = res + '\n' + text.strip()
            continue
        # Any other header after the wanted field ends the value.
        if res:
            break
        if line[0] != field:
            continue
        text = line[2:].strip()
        if starts:
            if not text.startswith(starts):
                continue
            text = text[len(starts):].strip()
        if n > 1:
            n = n - 1
            continue
        res = text
    return res

# Return display text for a given field.
#
# The raw field text has its ABC accents converted, then Title (T) fields
# are changed from sort to display order, Key (K) fields are expanded to the
# full key name, and History (H) and Notes (N) fields are expanded as
# Markdown, with dir used to resolve <foo.abc> references. Returns "" if
# the field is not present.
def getFieldDisplayText(lines, dir, field, n = 1, starts = None, latex = False):
    res = getFieldText(lines, field, n, starts)
    if res:
        # Fields that go through Markdown must have HTML entities.
        mdfield = field.upper() in ['H', 'N']
        res = convertAccents(res, False if mdfield else latex)
        if field.upper() == "T":
            res = convertTitleToDisplay(res)
        elif field.upper() == "K":
            res = convertKeyToDisplay(res)
        elif mdfield:
            res = convertMarkdown(expandCustomMarkdown(res, dir), latex)
    return res

# Return full title (title + [" (" + subtitle + ")"] if subtitle exists).
# The title is the first T field and the subtitle the second.
def getFullTitle(lines, dir, starts = None, latex = False):
    title = getFieldDisplayText(lines, dir, "T", starts=starts, latex=latex)
    subtitle = getFieldDisplayText(lines, dir, "T", n=2, starts=starts, latex=latex)
    return title if not subtitle else title + " (" + subtitle + ")"

if __name__ == "__main__":
    # Print the requested field from the open ABC file f. dir is the
    # directory containing the file, used to resolve references to other
    # tunes. Returns True if something was printed.
    def process(f, dir, args):
        lines = f.readlines()
        if args.display:
            if args.field.upper() == "FT":
                line = getFullTitle(lines, dir, args.starts, args.latex)
            else:
                line = getFieldDisplayText(lines, dir, args.field, args.index, args.starts, args.latex)
        else:
            # Without display conversion, FT is just the plain title.
            field = "T" if args.field.upper() == "FT" else args.field
            line = getFieldText(lines, field, args.index, args.starts)
        if line:
            print(line)
            return True
        else:
            return False

    # execute only if run as a script
    parser = argparse.ArgumentParser(description="Extract field data from ABC file.")
    parser.add_argument("-f", "--field", dest="field", default="T",
                        help=("extract the given field [default: %(default)s]. "
                              "Field FT is special; it returns the full title "
                              "- the title followed by subtitle in () if "
                              "present - for display text, or just the title "
                              "for non-display text."))
    parser.add_argument("-l", "--latex", dest="latex",
                        action="store_true", default=False,
                        help="convert special characters for LaTeX (default HTML)")
    parser.add_argument("-d", "--display", dest="display",
                        action="store_true", default=False,
                        help=("convert to display text. Convert accents to "
                              "LaTeX or HTML, in titles convert 'Tune, The' to "
                              "'The Tune', convert keys to full key name, "
                              "and expand Markdown in notes and history."))
    parser.add_argument("-n", "--index", dest="index",
                        action="store", type=int, default=1,
                        help="report INDEXth value [default: %(default)s]")
    parser.add_argument("-s", "--starts", dest="starts",
                        action="store", default=None,
                        help=("report only if line starts with CONTENT "
                              "and remove CONTENT"),
                        metavar="CONTENT")
    parser.add_argument('input', type=argparse.FileType('r'),
                        help='input ABC file')
    args = parser.parse_args()

    path = pathlib.Path(args.input.name)
    # argparse has already opened the input; read from that handle and close
    # it when done, rather than opening the file a second time and leaking
    # the first handle. This also lets '-' (standard input) work.
    with args.input as f:
        res = process(f, path.parent, args)
    # Exit status 0 if a value was printed, 1 otherwise.
    sys.exit(int(not res))