The Mudcat Café TM
Thread #112488   Message #2380640
Posted By: Artful Codger
03-Jul-08 - 10:15 PM
Thread Name: Tech: HtmlEsc.java: Convert special chars
Subject: RE: Tech: HtmlEsc.java: Convert special chars

Script updated 12 Feb 2011 to remove several mnemonics which aren't well-supported: hibar, Zcaron, zcaron and bdquote. The script supplies numeric escapes for these characters instead.

-Artful Codger-



/*
This program converts text on the clipboard to plain-text with all the non-ASCII and HTML special
characters converted to HTML entities or character references (like &eacute; or &#x1C0;).
This includes a number of word processing characters (like quotes and dashes) which aren't ASCII,
and thus get trashed when copied into postings. It is especially useful when posting text in
languages other than English.

The character reference values correspond to Unicode values (specifically, to UTF-16 code points.)

Installation:
(1) Save this script to a plain-text file named "HtmlEsc.java".
(2) In a command window, change to the directory where the script file resides and compile it:
       javac HtmlEsc.java
    This produces a class file named "HtmlEsc.class". This is all the Java interpreter needs to
    run the program.
See further compilation notes below.

Usage:
(1) Copy the text you wish to convert to the clipboard.
(2) In a command window, change to the directory where the class file resides and run:
       java HtmlEsc            Do Not include the .class extension.
    The clipboard will now contain the converted text as plain text.
(3) Paste the modified text from the clipboard to your post.
    Please preview your message (after any further editing) before you save it.

If you wish to execute the command from any directory, you will either have to supply a full path
to the class file (either on the command line or in a shell script, expressed in UNIX forward-slash
format) or place the class file in a directory you have defined in the CLASSPATH environment
variable. See Java tutorials or documentation for details.

Compilation notes:
This script can only be compiled on a system that has the Java Development Kit installed. The
minimum version is v1.5 (JDK 5). You should be able to copy and run the resulting class file on
any system that has an equivalent or later Java Runtime Environment (JRE). Both of these are
available as free downloads from http://java.sun.com/

Author: Artful Codger
*/

import java.util.HashMap;
import java.awt.datatransfer.Clipboard;
import java.awt.datatransfer.StringSelection;
import java.awt.datatransfer.DataFlavor;

public class HtmlEsc {
    public static void main(String[] args) {
       // Get the current clipboard text.
       MyClipboard cb = new MyClipboard();
       String cbText = cb.getText();
       String outText = HtmlEscaper.convertText(cbText);
       // Append some text and write it back to the clipboard.
       cb.setText(outText);
    }
}
//----------------------------------------------------------------------------

/**
 * Thin wrapper around the system clipboard that reads and writes plain text.
 */
class MyClipboard {

    /**
     * Reads the system clipboard as plain text.
     *
     * @return the clipboard text, or null when the clipboard offers no string
     *         representation or the text could not be read (a diagnostic is
     *         printed to standard error in the latter case)
     */
    public String
    getText() {
        Clipboard cb = getClipboard();
        DataFlavor flavor = DataFlavor.stringFlavor;
        if (!cb.isDataFlavorAvailable(flavor)) {
            return null;
        }
        try {
            return (String) cb.getData(flavor);
        } catch (java.awt.datatransfer.UnsupportedFlavorException e) {
            // Unlikely after the availability check, but the clipboard content
            // can change between the check and the read.
            reportReadFailure(e);
        } catch (java.io.IOException e) {
            reportReadFailure(e);
        } catch (IllegalStateException e) {
            // The clipboard is currently unavailable (e.g. held by another application).
            reportReadFailure(e);
        }
        return null;
    }

    /**
     * Replaces the system clipboard contents with the given text.
     *
     * @param text the text to place on the clipboard
     */
    public void
    setText(String text) {
        Clipboard cb = getClipboard();
        StringSelection sel = new StringSelection(text);
        // No clipboard owner is registered (second argument is null).
        cb.setContents(sel, null);
    }

    /** Returns the system clipboard of the default AWT toolkit. */
    private Clipboard
    getClipboard() {
        return java.awt.Toolkit.getDefaultToolkit().getSystemClipboard();
    }

    /** Prints a one-line diagnostic for a failed clipboard read to standard error. */
    private static void
    reportReadFailure(Exception e) {
        System.err.println("HtmlEsc: could not read text from the clipboard: " + e);
    }
}
//----------------------------------------------------------------------------

/**
 * Converts text to a pure-ASCII form suitable for pasting into HTML: the HTML
 * special characters and many common non-ASCII characters become named
 * entities, and every other non-ASCII character becomes a hexadecimal numeric
 * character reference.
 */
class HtmlEscaper {
    private HtmlEscaper() {}    // Construction disallowed.

    /**
     * Converts a single UTF-16 code unit.
     *
     * @param ch the character to convert
     * @return the named entity if one is defined for {@code ch}; otherwise a
     *         numeric reference ({@code &#x...;}) when {@code ch >= 0x80};
     *         otherwise {@code ch} itself as a one-character string
     */
    static String convertChar(char ch) {
        return convertCodePoint(ch);
    }

    /**
     * Converts a whole string, one Unicode code point at a time.
     *
     * Characters outside the Basic Multilingual Plane (stored as a surrogate
     * pair) are emitted as ONE numeric reference to the full code point,
     * e.g. U+1F600 becomes {@code &#x1f600;}. Emitting the two surrogates
     * separately would produce invalid character references. An unpaired
     * surrogate is still emitted as a reference to the surrogate value itself.
     *
     * @param inText the text to convert; must not be null
     * @return the converted, ASCII-only text
     * @throws NullPointerException if {@code inText} is null
     */
    static String convertText(String inText) {
        int textLen = inText.length();
        StringBuilder buf = new StringBuilder(textLen);
        for (int ix = 0; ix < textLen; ) {
            int codePoint = inText.codePointAt(ix);
            buf.append(convertCodePoint(codePoint));
            // Advance by 1 or 2 code units depending on the code point.
            ix += Character.charCount(codePoint);
        }
        return buf.toString();
    }

    /**
     * Converts one Unicode code point: named entity if mapped, numeric
     * reference if non-ASCII, otherwise the character unchanged.
     */
    private static String convertCodePoint(int codePoint) {
        String named = charMap.get(Integer.valueOf(codePoint));
        if (named != null) {
            return named;
        }
        if (codePoint >= 0x80) {
            return "&#x" + Integer.toHexString(codePoint) + ";";
        }
        // Unmapped ASCII (including control characters) passes through as-is.
        return String.valueOf((char) codePoint);
    }

    // Key/value pairs to populate the charMap: each character is followed by
    // its named entity.
    static private final String[] charMapData = {
        "&", "&amp;", "<", "&lt;", ">", "&gt;",
        "\u00A0", "&nbsp;",
        "\u00A1", "&iexcl;", "\u00A2", "&cent;", "\u00A3", "&pound;", "\u00A4", "&curren;",
        "\u00A5", "&yen;", "\u00A6", "&brvbar;", "\u00A7", "&sect;", "\u00A8", "&uml;",
        "\u00A9", "&copy;", "\u00AA", "&ordf;", "\u00AB", "&laquo;", "\u00AC", "&not;",
        "\u00AD", "&shy;", "\u00AE", "&reg;", "\u00B0", "&deg;",
        "\u00B1", "&plusmn;", "\u00B2", "&sup2;", "\u00B3", "&sup3;", "\u00B4", "&acute;",
        "\u00B5", "&micro;", "\u00B6", "&para;", "\u00B7", "&middot;", "\u00B8", "&cedil;",
        "\u00B9", "&sup1;", "\u00BA", "&ordm;", "\u00BB", "&raquo;", "\u00BC", "&frac14;",
        "\u00BD", "&frac12;", "\u00BE", "&frac34;", "\u00BF", "&iquest;", "\u00C0", "&Agrave;",
        "\u00C1", "&Aacute;", "\u00C2", "&Acirc;", "\u00C3", "&Atilde;", "\u00C4", "&Auml;",
        "\u00C5", "&Aring;", "\u00C6", "&AElig;", "\u00C7", "&Ccedil;", "\u00C8", "&Egrave;",
        "\u00C9", "&Eacute;", "\u00CA", "&Ecirc;", "\u00CB", "&Euml;", "\u00CC", "&Igrave;",
        "\u00CD", "&Iacute;", "\u00CE", "&Icirc;", "\u00CF", "&Iuml;", "\u00D0", "&ETH;",
        "\u00D1", "&Ntilde;", "\u00D2", "&Ograve;", "\u00D3", "&Oacute;", "\u00D4", "&Ocirc;",
        "\u00D5", "&Otilde;", "\u00D6", "&Ouml;", "\u00D7", "&times;", "\u00D8", "&Oslash;",
        "\u00D9", "&Ugrave;", "\u00DA", "&Uacute;", "\u00DB", "&Ucirc;", "\u00DC", "&Uuml;",
        "\u00DD", "&Yacute;", "\u00DE", "&THORN;", "\u00DF", "&szlig;", "\u00E0", "&agrave;",
        "\u00E1", "&aacute;", "\u00E2", "&acirc;", "\u00E3", "&atilde;", "\u00E4", "&auml;",
        "\u00E5", "&aring;", "\u00E6", "&aelig;", "\u00E7", "&ccedil;", "\u00E8", "&egrave;",
        "\u00E9", "&eacute;", "\u00EA", "&ecirc;", "\u00EB", "&euml;", "\u00EC", "&igrave;",
        "\u00ED", "&iacute;", "\u00EE", "&icirc;", "\u00EF", "&iuml;", "\u00F0", "&eth;",
        "\u00F1", "&ntilde;", "\u00F2", "&ograve;", "\u00F3", "&oacute;", "\u00F4", "&ocirc;",
        "\u00F5", "&otilde;", "\u00F6", "&ouml;", "\u00F7", "&divide;", "\u00F8", "&oslash;",
        "\u00F9", "&ugrave;", "\u00FA", "&uacute;", "\u00FB", "&ucirc;", "\u00FC", "&uuml;",
        "\u00FD", "&yacute;", "\u00FE", "&thorn;", "\u00FF", "&yuml;",
        "\u0152", "&OElig;",
        "\u0153", "&oelig;", "\u0160", "&Scaron;", "\u0161", "&scaron;", "\u0178", "&Yuml;",
        "\u0192", "&fnof;", "\u02C6", "&circ;",
        "\u02DC", "&tilde;", "\u03A9", "&Omega;", "\u03C0", "&pi;",
        "\u2013", "&ndash;",
        "\u2014", "&mdash;", "\u2018", "&lsquo;", "\u2019", "&rsquo;", "\u201A", "&sbquo;",
        "\u201C", "&ldquo;", "\u201D", "&rdquo;", "\u2020", "&dagger;",
        "\u2021", "&Dagger;", "\u2022", "&bull;", "\u2026", "&hellip;", "\u2030", "&permil;",
        "\u2039", "&lsaquo;", "\u203A", "&rsaquo;", "\u2044", "&frasl;", "\u20AC", "&euro;",
        "\u2122", "&trade;", "\u2202", "&part;", "\u220F", "&prod;", "\u2211", "&sum;",
        "\u221A", "&radic;", "\u221E", "&infin;", "\u222B", "&int;", "\u2248", "&asymp;",
        "\u2260", "&ne;", "\u2264", "&le;", "\u2265", "&ge;", "\u25CA", "&loz;"
    };

    // A map of Unicode values to HTML/XML char "entities" (named escape seqs).
    // Populated once in the static block below and never modified afterwards.
    static private final HashMap<Integer,String> charMap;

    static {    // Static block to init the char map from the data array.
        charMap = new HashMap<Integer,String>(charMapData.length);
        for (int ix = 0; ix + 1 < charMapData.length; ix += 2) {
            String sKey = charMapData[ix];
            String val = charMapData[ix + 1];
            charMap.put(Integer.valueOf(sKey.codePointAt(0)), val);
        }
    }
}