001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.wicket.util.io;
018
019import java.io.IOException;
020import java.io.InputStream;
021import java.util.Arrays;
022import java.util.Comparator;
023import java.util.List;
024
025/**
026 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
027 * 
028 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
029 * first byte in the stream.
030 * 
031 * The {@link ByteOrderMark} implementation has the following pre-defined BOMs:
032 * <ul>
033 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
034 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16LE}</li>
035 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16BE}</li>
036 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32LE}</li>
037 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32BE}</li>
038 * </ul>
039 *
040 * <p> Example 1 - Detect and exclude a UTF-8 BOM
041 * 
042 * <pre>
043 * BOMInputStream bomIn = new BOMInputStream(in);
044 * if (bomIn.hasBOM()) {
045 *     // has a UTF-8 BOM
046 * }
047 * </pre>
048 * 
049 * <p> Example 2 - Detect a UTF-8 BOM (but don't exclude it)
050 * 
051 * <pre>
052 * boolean include = true;
053 * BOMInputStream bomIn = new BOMInputStream(in, include);
054 * if (bomIn.hasBOM()) {
055 *     // has a UTF-8 BOM
056 * }
057 * </pre>
058 * 
059 * <p> Example 3 - Detect Multiple BOMs
060 * 
061 * <pre>
062 * BOMInputStream bomIn = new BOMInputStream(in, 
063 *   ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
064 *   ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE
065 *   );
066 * if (bomIn.hasBOM() == false) {
067 *     // No BOM found
068 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
069 *     // has a UTF-16LE BOM
070 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
071 *     // has a UTF-16BE BOM
072 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
073 *     // has a UTF-32LE BOM
074 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
075 *     // has a UTF-32BE BOM
076 * }
077 * </pre>
078 * 
079 * @see ByteOrderMark
080 * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
081 * @version $Id$
082 * @since 2.0
083 */
084public class BOMInputStream extends ProxyInputStream
085{
086    private final boolean include;
087    /**
088     * BOMs are sorted from longest to shortest.
089     */
090    private final List<ByteOrderMark> boms;
091    private ByteOrderMark byteOrderMark;
092    private int[] firstBytes;
093    private int fbLength;
094    private int fbIndex;
095    private int markFbIndex;
096    private boolean markedAtStart;
097
098    /**
099     * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
100     * 
101     * @param delegate
102     *            the InputStream to delegate to
103     */
104    public BOMInputStream(final InputStream delegate) {
105        this(delegate, false, ByteOrderMark.UTF_8);
106    }
107
108    /**
109     * Constructs a new BOM InputStream that detects a a {@link ByteOrderMark#UTF_8} and optionally includes it.
110     * 
111     * @param delegate
112     *            the InputStream to delegate to
113     * @param include
114     *            true to include the UTF-8 BOM or false to exclude it
115     */
116    public BOMInputStream(final InputStream delegate, final boolean include) {
117        this(delegate, include, ByteOrderMark.UTF_8);
118    }
119
120    /**
121     * Constructs a new BOM InputStream that excludes the specified BOMs.
122     * 
123     * @param delegate
124     *            the InputStream to delegate to
125     * @param boms
126     *            The BOMs to detect and exclude
127     */
128    public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
129        this(delegate, false, boms);
130    }
131
132    /**
133     * Compares ByteOrderMark objects in descending length order.
134     */
135    private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = new Comparator<ByteOrderMark>() {
136
137        @Override
138                public int compare(final ByteOrderMark bom1, final ByteOrderMark bom2) {
139            final int len1 = bom1.length();
140            final int len2 = bom2.length();
141            if (len1 > len2) {
142                return -1;
143            }
144            if (len2 > len1) {
145                return 1;
146            }
147            return 0;
148        }
149    };
150
151    /**
152     * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
153     * 
154     * @param delegate
155     *            the InputStream to delegate to
156     * @param include
157     *            true to include the specified BOMs or false to exclude them
158     * @param boms
159     *            The BOMs to detect and optionally exclude
160     */
161    public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
162        super(delegate);
163        if (boms == null || boms.length == 0) {
164            throw new IllegalArgumentException("No BOMs specified");
165        }
166        this.include = include;
167        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
168        Arrays.sort(boms, ByteOrderMarkLengthComparator);
169        this.boms = Arrays.asList(boms);
170
171    }
172
173    /**
174     * Indicates whether the stream contains one of the specified BOMs.
175     * 
176     * @return true if the stream has one of the specified BOMs, otherwise false if it does not
177     * @throws IOException
178     *             if an error reading the first bytes of the stream occurs
179     */
180    public boolean hasBOM() throws IOException {
181        return getBOM() != null;
182    }
183
184    /**
185     * Indicates whether the stream contains the specified BOM.
186     * 
187     * @param bom
188     *            The BOM to check for
189     * @return true if the stream has the specified BOM, otherwise false if it does not
190     * @throws IllegalArgumentException
191     *             if the BOM is not one the stream is configured to detect
192     * @throws IOException
193     *             if an error reading the first bytes of the stream occurs
194     */
195    public boolean hasBOM(final ByteOrderMark bom) throws IOException {
196        if (!boms.contains(bom)) {
197            throw new IllegalArgumentException("Stream not configure to detect " + bom);
198        }
199        return byteOrderMark != null && getBOM().equals(bom);
200    }
201
202    /**
203     * Return the BOM (Byte Order Mark).
204     * 
205     * @return The BOM or null if none
206     * @throws IOException
207     *             if an error reading the first bytes of the stream occurs
208     */
209    public ByteOrderMark getBOM() throws IOException {
210        if (firstBytes == null) {
211            fbLength = 0;
212            // BOMs are sorted from longest to shortest
213            final int maxBomSize = boms.get(0).length();
214            firstBytes = new int[maxBomSize];
215            // Read first maxBomSize bytes
216            for (int i = 0; i < firstBytes.length; i++) {
217                firstBytes[i] = in.read();
218                fbLength++;
219                if (firstBytes[i] < 0) {
220                    break;
221                }
222            }
223            // match BOM in firstBytes
224            byteOrderMark = find();
225            if (byteOrderMark != null) {
226                if (!include) {
227                    if (byteOrderMark.length() < firstBytes.length) {
228                        fbIndex = byteOrderMark.length();
229                    } else {
230                        fbLength = 0;
231                    }
232                }
233            }
234        }
235        return byteOrderMark;
236    }
237
238    /**
239     * Return the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
240     * 
241     * @return The BOM charset Name or null if no BOM found
242     * @throws IOException
243     *             if an error reading the first bytes of the stream occurs
244     * 
245     */
246    public String getBOMCharsetName() throws IOException {
247        getBOM();
248        return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
249    }
250
251    /**
252     * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
253     * <code>read()</code> method, either returning a valid byte or -1 to indicate that the initial bytes have been
254     * processed already.
255     * 
256     * @return the byte read (excluding BOM) or -1 if the end of stream
257     * @throws IOException
258     *             if an I/O error occurs
259     */
260    private int readFirstBytes() throws IOException {
261        getBOM();
262        return fbIndex < fbLength ? firstBytes[fbIndex++] : -1;
263    }
264
265    /**
266     * Find a BOM with the specified bytes.
267     * 
268     * @return The matched BOM or null if none matched
269     */
270    private ByteOrderMark find() {
271        for (final ByteOrderMark bom : boms) {
272            if (matches(bom)) {
273                return bom;
274            }
275        }
276        return null;
277    }
278
279    /**
280     * Check if the bytes match a BOM.
281     * 
282     * @param bom
283     *            The BOM
284     * @return true if the bytes match the bom, otherwise false
285     */
286    private boolean matches(final ByteOrderMark bom) {
287        // if (bom.length() != fbLength) {
288        // return false;
289        // }
290        // firstBytes may be bigger than the BOM bytes
291        for (int i = 0; i < bom.length(); i++) {
292            if (bom.get(i) != firstBytes[i]) {
293                return false;
294            }
295        }
296        return true;
297    }
298
299    // ----------------------------------------------------------------------------
300    // Implementation of InputStream
301    // ----------------------------------------------------------------------------
302
303    /**
304     * Invokes the delegate's <code>read()</code> method, detecting and optionally skipping BOM.
305     * 
306     * @return the byte read (excluding BOM) or -1 if the end of stream
307     * @throws IOException
308     *             if an I/O error occurs
309     */
310    @Override
311    public int read() throws IOException {
312        final int b = readFirstBytes();
313        return b >= 0 ? b : in.read();
314    }
315
316    /**
317     * Invokes the delegate's <code>read(byte[], int, int)</code> method, detecting and optionally skipping BOM.
318     * 
319     * @param buf
320     *            the buffer to read the bytes into
321     * @param off
322     *            The start offset
323     * @param len
324     *            The number of bytes to read (excluding BOM)
325     * @return the number of bytes read or -1 if the end of stream
326     * @throws IOException
327     *             if an I/O error occurs
328     */
329    @Override
330    public int read(final byte[] buf, int off, int len) throws IOException {
331        int firstCount = 0;
332        int b = 0;
333        while (len > 0 && b >= 0) {
334            b = readFirstBytes();
335            if (b >= 0) {
336                buf[off++] = (byte) (b & 0xFF);
337                len--;
338                firstCount++;
339            }
340        }
341        final int secondCount = in.read(buf, off, len);
342        return secondCount < 0 ? firstCount > 0 ? firstCount : -1 : firstCount + secondCount;
343    }
344
345    /**
346     * Invokes the delegate's <code>read(byte[])</code> method, detecting and optionally skipping BOM.
347     * 
348     * @param buf
349     *            the buffer to read the bytes into
350     * @return the number of bytes read (excluding BOM) or -1 if the end of stream
351     * @throws IOException
352     *             if an I/O error occurs
353     */
354    @Override
355    public int read(final byte[] buf) throws IOException {
356        return read(buf, 0, buf.length);
357    }
358
359    /**
360     * Invokes the delegate's <code>mark(int)</code> method.
361     * 
362     * @param readlimit
363     *            read ahead limit
364     */
365    @Override
366    public synchronized void mark(final int readlimit) {
367        markFbIndex = fbIndex;
368        markedAtStart = firstBytes == null;
369        in.mark(readlimit);
370    }
371
372    /**
373     * Invokes the delegate's <code>reset()</code> method.
374     * 
375     * @throws IOException
376     *             if an I/O error occurs
377     */
378    @Override
379    public synchronized void reset() throws IOException {
380        fbIndex = markFbIndex;
381        if (markedAtStart) {
382            firstBytes = null;
383        }
384
385        in.reset();
386    }
387
388    /**
389     * Invokes the delegate's <code>skip(long)</code> method, detecting and optionallyskipping BOM.
390     * 
391     * @param n
392     *            the number of bytes to skip
393     * @return the number of bytes to skipped or -1 if the end of stream
394     * @throws IOException
395     *             if an I/O error occurs
396     */
397    @Override
398    public long skip(long n) throws IOException {
399        while (n > 0 && readFirstBytes() >= 0) {
400            n--;
401        }
402        return in.skip(n);
403    }
404}