The Mudcat Café TM
Thread #112265   Message #2373053
Posted By: Artful Codger
24-Jun-08 - 04:13 AM
Thread Name: Tech: htmlesc.py: Mac script to escape text
Subject: RE: Tech: htmlesc.py: Mac script to escape text

Script updated 12 Feb 2011 to remove several mnemonics which aren't well-supported: hibar, Zcaron, zcaron and bdquote. The script supplies numeric escapes for these characters instead.

-Artful Codger-



#!/usr/bin/env python
# Convert non-ASCII and HTML-special characters in clipboard text to HTML escapes.

import sys
from cStringIO import StringIO
from optparse import OptionParser
from AppKit import NSPasteboard

# Maps a Unicode code point to its named HTML character entity reference.
# Every value is plain ASCII ('&name;'); a literal character here would be
# copied to the output unescaped, defeating the purpose of the table.
# Code points that are absent from this table are handled by
# HtmlEscaper.convertChar, which emits a numeric escape for anything >= 0x80.
charDict = {
    0x22: '&quot;', 0x26: '&amp;', 0x3C: '&lt;', 0x3E: '&gt;', 0xA0: '&nbsp;',
    0xA1: '&iexcl;', 0xA2: '&cent;', 0xA3: '&pound;', 0xA4: '&curren;',
    0xA5: '&yen;', 0xA6: '&brvbar;', 0xA7: '&sect;', 0xA8: '&uml;',
    0xA9: '&copy;', 0xAA: '&ordf;', 0xAB: '&laquo;', 0xAC: '&not;',
    0xAD: '&shy;', 0xAE: '&reg;', 0xB0: '&deg;',
    0xB1: '&plusmn;', 0xB2: '&sup2;', 0xB3: '&sup3;', 0xB4: '&acute;',
    0xB5: '&micro;', 0xB6: '&para;', 0xB7: '&middot;', 0xB8: '&cedil;',
    0xB9: '&sup1;', 0xBA: '&ordm;', 0xBB: '&raquo;', 0xBC: '&frac14;',
    0xBD: '&frac12;', 0xBE: '&frac34;', 0xBF: '&iquest;', 0xC0: '&Agrave;',
    0xC1: '&Aacute;', 0xC2: '&Acirc;', 0xC3: '&Atilde;', 0xC4: '&Auml;',
    0xC5: '&Aring;', 0xC6: '&AElig;', 0xC7: '&Ccedil;', 0xC8: '&Egrave;',
    0xC9: '&Eacute;', 0xCA: '&Ecirc;', 0xCB: '&Euml;', 0xCC: '&Igrave;',
    0xCD: '&Iacute;', 0xCE: '&Icirc;', 0xCF: '&Iuml;', 0xD0: '&ETH;',
    0xD1: '&Ntilde;', 0xD2: '&Ograve;', 0xD3: '&Oacute;', 0xD4: '&Ocirc;',
    0xD5: '&Otilde;', 0xD6: '&Ouml;', 0xD7: '&times;', 0xD8: '&Oslash;',
    0xD9: '&Ugrave;', 0xDA: '&Uacute;', 0xDB: '&Ucirc;', 0xDC: '&Uuml;',
    0xDD: '&Yacute;', 0xDE: '&THORN;', 0xDF: '&szlig;', 0xE0: '&agrave;',
    0xE1: '&aacute;', 0xE2: '&acirc;', 0xE3: '&atilde;', 0xE4: '&auml;',
    0xE5: '&aring;', 0xE6: '&aelig;', 0xE7: '&ccedil;', 0xE8: '&egrave;',
    0xE9: '&eacute;', 0xEA: '&ecirc;', 0xEB: '&euml;', 0xEC: '&igrave;',
    0xED: '&iacute;', 0xEE: '&icirc;', 0xEF: '&iuml;', 0xF0: '&eth;',
    0xF1: '&ntilde;', 0xF2: '&ograve;', 0xF3: '&oacute;', 0xF4: '&ocirc;',
    0xF5: '&otilde;', 0xF6: '&ouml;', 0xF7: '&divide;', 0xF8: '&oslash;',
    0xF9: '&ugrave;', 0xFA: '&uacute;', 0xFB: '&ucirc;', 0xFC: '&uuml;',
    0xFD: '&yacute;', 0xFE: '&thorn;', 0xFF: '&yuml;', 0x152: '&OElig;',
    0x153: '&oelig;', 0x160: '&Scaron;', 0x161: '&scaron;', 0x178: '&Yuml;',
    0x192: '&fnof;', 0x2C6: '&circ;',
    0x2DC: '&tilde;', 0x3A9: '&Omega;', 0x3C0: '&pi;', 0x2013: '&ndash;',
    0x2014: '&mdash;', 0x2018: '&lsquo;', 0x2019: '&rsquo;', 0x201A: '&sbquo;',
    0x201C: '&ldquo;', 0x201D: '&rdquo;', 0x2020: '&dagger;',
    0x2021: '&Dagger;', 0x2022: '&bull;', 0x2026: '&hellip;', 0x2030: '&permil;',
    0x2039: '&lsaquo;', 0x203A: '&rsaquo;', 0x2044: '&frasl;', 0x20AC: '&euro;',
    0x2122: '&trade;', 0x2202: '&part;', 0x220F: '&prod;', 0x2211: '&sum;',
    0x221A: '&radic;', 0x221E: '&infin;', 0x222B: '&int;', 0x2248: '&asymp;',
    0x2260: '&ne;', 0x2264: '&le;', 0x2265: '&ge;', 0x25CA: '&loz;',
}

# Pasteboard type name used for every plain-text read and write below.
_textType = 'NSStringPboardType'

# ---- MacClipboard ----------------------------------------------------------

class MacClipboard(object):
    """Plain-text access to the general pasteboard through NSPasteboard."""

    def __init__(s):
        s.mClipboard = NSPasteboard.generalPasteboard()

    def getPlainText(s):
        """Return the pasteboard's plain text, or None when it holds none.

        Returning None (rather than failing on an unassigned local) lets
        callers test the result before converting it.
        """
        bestType = s.mClipboard.availableTypeFromArray_([_textType])
        if bestType is None:
            return None
        return s.mClipboard.stringForType_(bestType)

    def putPlainText(s, text):
        """Replace the pasteboard contents with `text`.

        Returns whatever setString_forType_ reports for the write.
        """
        cb = s.mClipboard
        # The type must be declared before a string can be set for it.
        cb.declareTypes_owner_([_textType], None)
        return cb.setString_forType_(text, _textType)

# ---- HtmlEscaper -----------------------------------------------------------

class HtmlEscaper(object):
    """Translate text into ASCII using HTML character escapes.

    Characters listed in charDict use the replacement given there; any
    other character at or above 0x80 becomes a hexadecimal numeric escape;
    remaining characters pass through unchanged.
    """

    def __init__(s):
        # Private copy: convertChar caches computed translations in this
        # dict, and those entries should not leak into the module table.
        s.mCharDict = dict(charDict)

    def convertChar(s, uch): # unicode (1 char) => str (ascii)
        """Return the escaped form of the single character `uch`."""
        ich = ord(uch)
        rslt = s.mCharDict.get(ich)
        if rslt is None:
            if ich >= 0x80:
                rslt = '&#x%x;' % ich
            else:
                rslt = chr(ich)
            # Cache under the integer code point, the same key used for
            # the lookup above, so the next occurrence is a dictionary hit.
            s.mCharDict[ich] = rslt
        return rslt

    def convertText(s, text): # unicode => str (ascii)
        """Return `text` with every character passed through convertChar."""
        return ''.join([s.convertChar(uch) for uch in text])

    def convertOnClipboard(s):
        """Escape the clipboard's plain text in place.

        Does nothing when the clipboard has no plain text or it is empty.
        """
        cb = MacClipboard()
        utext = cb.getPlainText()
        if not utext:
            return
        cb.putPlainText(s.convertText(utext))
# ---- main -----------------------------------------------------------------

# Usage line and description handed to OptionParser in run().
# '%prog' is optparse's placeholder for the program name.
_usage = '''Usage: %prog'''
_desc = (
    '''%prog modifies plain text on the clipboard, translating non-ASCII
    and HTML special characters in the text to equivalent HTML escape
    sequences.''')

def run():
    """Parse the command line, then escape the clipboard text in place."""
    parser = OptionParser(usage=_usage, description=_desc)
    # No options are defined and the parsed result is not used; the call
    # is kept so the command line is still processed by optparse.
    parser.parse_args()
    HtmlEscaper().convertOnClipboard()

if __name__ == '__main__':
    run()