EncodingDetector.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.jasper.compiler;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

/*
 * The BoM detection is derived from:
 * https://svn.us.apache.org/viewvc/tomcat/trunk/java/org/apache/jasper/xmlparser/XMLEncodingDetector.java?annotate=1742248
 *
 * The prolog is always at least as specific as the BOM therefore any encoding
 * specified in the prolog should take priority over the BOM.
 */
/**
 * Determines the character encoding of an input stream by inspecting an
 * optional byte order mark (BOM) / leading byte pattern and an optional XML
 * prolog.
 * <p>
 * An encoding declared in the XML prolog takes priority over the encoding
 * derived from the leading bytes. When neither source yields an encoding the
 * result is UTF-8.
 */
class EncodingDetector {

    /** Encoding reported when nothing more specific can be determined. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    /** Number of leading bytes inspected when looking for a BOM. */
    private static final int MAX_BOM_LENGTH = 4;

    private static final XMLInputFactory XML_INPUT_FACTORY;
    static {
        XMLInputFactory factory = XMLInputFactory.newInstance();
        // Readers created from this factory are only asked for the encoding
        // declared in the XML declaration and are never advanced, so DTD
        // processing and external entity resolution are not needed. Switch
        // them off as defence in depth.
        disableIfSupported(factory, XMLInputFactory.SUPPORT_DTD);
        disableIfSupported(factory, XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES);
        XML_INPUT_FACTORY = factory;
    }

    private final String encoding;
    private final int skip;
    private final boolean encodingSpecifiedInProlog;


    /*
     * TODO: Refactor Jasper InputStream creation and handling so the
     *       InputStream passed to this method is buffered and therefore saves
     *       on multiple opening and re-opening of the same file.
     */
    /**
     * Inspects the start of the given stream and records the detected
     * encoding, the number of BOM bytes to skip and whether the encoding came
     * from the XML prolog. The stream is consumed by the detection and is not
     * closed by this constructor.
     *
     * @param is the stream to inspect, positioned at its first byte
     *
     * @throws IOException if the stream cannot be repositioned after the BOM
     *                     check or the BOM bytes cannot be skipped
     */
    EncodingDetector(InputStream is) throws IOException {
        // Keep buffer size to a minimum here. BoM will be no more than 4 bytes
        // so that is the maximum we need to buffer
        BufferedInputStream bis = new BufferedInputStream(is, MAX_BOM_LENGTH);
        bis.mark(MAX_BOM_LENGTH);

        BomResult bomResult = processBom(bis);

        // Reset the stream back to the start to allow the XML prolog detection
        // to work. Skip any BoM we discovered.
        bis.reset();
        for (int i = 0; i < bomResult.skip; i++) {
            bis.read();
        }

        String prologEncoding = getPrologEncoding(bis);
        if (prologEncoding == null) {
            encodingSpecifiedInProlog = false;
            encoding = bomResult.encoding;
        } else {
            encodingSpecifiedInProlog = true;
            encoding = prologEncoding;
        }
        skip = bomResult.skip;
    }


    /**
     * @return the encoding declared in the XML prolog if there was one,
     *         otherwise the encoding derived from the leading bytes
     */
    String getEncoding() {
        return encoding;
    }


    /**
     * @return the number of BOM bytes found at the start of the stream (0 when
     *         no BOM was recognised)
     */
    int getSkip() {
        return skip;
    }


    /**
     * @return {@code true} if the reported encoding was declared in the XML
     *         prolog, {@code false} if it came from the leading bytes or the
     *         default
     */
    boolean isEncodingSpecifiedInProlog() {
        return encodingSpecifiedInProlog;
    }


    /**
     * Sets a boolean factory property to {@code false}, but only when the
     * factory reports that it supports the property.
     */
    private static void disableIfSupported(XMLInputFactory factory, String property) {
        if (factory.isPropertySupported(property)) {
            factory.setProperty(property, Boolean.FALSE);
        }
    }


    /**
     * Reads the encoding declared in the XML declaration, if any.
     *
     * @param stream the stream positioned after any BOM
     *
     * @return the declared encoding, or {@code null} if none was declared or
     *         the content could not be processed as XML
     */
    private static String getPrologEncoding(InputStream stream) {
        XMLStreamReader xmlStreamReader = null;
        try {
            xmlStreamReader = XML_INPUT_FACTORY.createXMLStreamReader(stream);
            return xmlStreamReader.getCharacterEncodingScheme();
        } catch (XMLStreamException e) {
            // Deliberately ignored: content that cannot be read as XML simply
            // has no prolog encoding.
            return null;
        } finally {
            // Release the parser's resources. Per the StAX API this does not
            // close the underlying stream.
            if (xmlStreamReader != null) {
                try {
                    xmlStreamReader.close();
                } catch (XMLStreamException ignored) {
                    // Nothing useful can be done if releasing the reader fails
                }
            }
        }
    }


    /**
     * Reads up to four bytes from the stream and derives an encoding from
     * them.
     *
     * @param stream the stream positioned at its first byte
     *
     * @return the detection result; UTF-8 with no bytes to skip if reading
     *         fails
     */
    private static BomResult processBom(InputStream stream) {
        // Read first four bytes (or as many are available) and determine
        // encoding
        try {
            final byte[] b4 = new byte[MAX_BOM_LENGTH];
            int count = 0;
            while (count < MAX_BOM_LENGTH) {
                int singleByteRead = stream.read();
                if (singleByteRead == -1) {
                    break;
                }
                b4[count] = (byte) singleByteRead;
                count++;
            }

            return parseBom(b4, count);
        } catch (IOException ioe) {
            // Best effort: fall back to the default rather than failing here.
            return new BomResult(DEFAULT_ENCODING, 0);
        }
    }


    /**
     * Maps the leading bytes of a stream to an encoding and a BOM length.
     *
     * @param b4    the leading bytes
     * @param count how many entries of {@code b4} were actually read
     *
     * @return the detected encoding and the number of BOM bytes to skip
     */
    private static BomResult parseBom(byte[] b4, int count) {

        if (count < 2) {
            return new BomResult(DEFAULT_ENCODING, 0);
        }

        // UTF-16, with BOM
        int b0 = b4[0] & 0xFF;
        int b1 = b4[1] & 0xFF;
        if (b0 == 0xFE && b1 == 0xFF) {
            // UTF-16, big-endian
            return new BomResult("UTF-16BE", 2);
        }
        if (b0 == 0xFF && b1 == 0xFE) {
            // UTF-16, little-endian
            return new BomResult("UTF-16LE", 2);
        }

        // default to UTF-8 if we don't have enough bytes to make a
        // good determination of the encoding
        if (count < 3) {
            return new BomResult(DEFAULT_ENCODING, 0);
        }

        // UTF-8 with a BOM
        int b2 = b4[2] & 0xFF;
        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
            return new BomResult(DEFAULT_ENCODING, 3);
        }

        // default to UTF-8 if we don't have enough bytes to make a
        // good determination of the encoding
        if (count < 4) {
            return new BomResult(DEFAULT_ENCODING, 0);
        }

        // Other encodings. No BOM. Try and ID encoding.
        int b3 = b4[3] & 0xFF;
        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
            // UCS-4, big endian (1234)
            return new BomResult("ISO-10646-UCS-4", 0);
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
            // UCS-4, little endian (4321)
            return new BomResult("ISO-10646-UCS-4", 0);
        }
        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
            // UCS-4, unusual octet order (2143)
            // REVISIT: What should this be?
            return new BomResult("ISO-10646-UCS-4", 0);
        }
        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
            // UCS-4, unusual octet order (3412)
            // REVISIT: What should this be?
            return new BomResult("ISO-10646-UCS-4", 0);
        }
        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
            // UTF-16, big-endian, no BOM
            // (or could turn out to be UCS-2...
            // REVISIT: What should this be?
            return new BomResult("UTF-16BE", 0);
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
            // UTF-16, little-endian, no BOM
            // (or could turn out to be UCS-2...
            return new BomResult("UTF-16LE", 0);
        }
        if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
            // EBCDIC
            // a la xerces1, return CP037 instead of EBCDIC here
            return new BomResult("CP037", 0);
        }

        // default encoding
        return new BomResult(DEFAULT_ENCODING, 0);
    }


    /**
     * Immutable pair of a detected encoding name and the number of BOM bytes
     * that precede the content.
     */
    private static final class BomResult {

        public final String encoding;
        public final int skip;

        BomResult(String encoding, int skip) {
            this.encoding = encoding;
            this.skip = skip;
        }
    }
}