001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.wicket.util.io; 018 019import java.io.IOException; 020import java.io.InputStream; 021import java.util.Arrays; 022import java.util.Comparator; 023import java.util.List; 024 025/** 026 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes. 027 * 028 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the 029 * first byte in the stream. 030 * 031 * The {@link ByteOrderMark} implementation has the following pre-defined BOMs: 032 * <ul> 033 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li> 034 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16LE}</li> 035 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16BE}</li> 036 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32LE}</li> 037 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32BE}</li> 038 * </ul> 039 * 040 * <p> Example 1 - Detect and exclude a UTF-8 BOM 041 * 042 * <pre> 043 * BOMInputStream bomIn = new BOMInputStream(in); 044 * if (bomIn.hasBOM()) { 045 * // has a UTF-8 BOM 046 * } 047 * </pre> 048 * 049 * <p> Example 2 - Detect a UTF-8 BOM (but don't exclude it) 050 * 051 * <pre> 052 * boolean include = true; 053 * BOMInputStream bomIn = new BOMInputStream(in, include); 054 * if (bomIn.hasBOM()) { 055 * // has a UTF-8 BOM 056 * } 057 * </pre> 058 * 059 * <p> Example 3 - Detect Multiple BOMs 060 * 061 * <pre> 062 * BOMInputStream bomIn = new BOMInputStream(in, 063 * ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, 064 * ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE 065 * ); 066 * if (bomIn.hasBOM() == false) { 067 * // No BOM found 068 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) { 069 * // has a UTF-16LE BOM 070 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) { 071 * // has a UTF-16BE BOM 072 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) { 073 * // has a UTF-32LE BOM 074 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) { 075 * // has a UTF-32BE BOM 076 * } 077 * </pre> 078 * 079 * @see ByteOrderMark 080 * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a> 081 * @version $Id$ 082 * @since 2.0 083 */ 084public class BOMInputStream extends ProxyInputStream 085{ 086 private final boolean include; 087 /** 088 * BOMs are sorted from longest to shortest. 089 */ 090 private final List<ByteOrderMark> boms; 091 private ByteOrderMark byteOrderMark; 092 private int[] firstBytes; 093 private int fbLength; 094 private int fbIndex; 095 private int markFbIndex; 096 private boolean markedAtStart; 097 098 /** 099 * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM. 100 * 101 * @param delegate 102 * the InputStream to delegate to 103 */ 104 public BOMInputStream(final InputStream delegate) { 105 this(delegate, false, ByteOrderMark.UTF_8); 106 } 107 108 /** 109 * Constructs a new BOM InputStream that detects a a {@link ByteOrderMark#UTF_8} and optionally includes it. 110 * 111 * @param delegate 112 * the InputStream to delegate to 113 * @param include 114 * true to include the UTF-8 BOM or false to exclude it 115 */ 116 public BOMInputStream(final InputStream delegate, final boolean include) { 117 this(delegate, include, ByteOrderMark.UTF_8); 118 } 119 120 /** 121 * Constructs a new BOM InputStream that excludes the specified BOMs. 122 * 123 * @param delegate 124 * the InputStream to delegate to 125 * @param boms 126 * The BOMs to detect and exclude 127 */ 128 public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) { 129 this(delegate, false, boms); 130 } 131 132 /** 133 * Compares ByteOrderMark objects in descending length order. 134 */ 135 private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = new Comparator<ByteOrderMark>() { 136 137 @Override 138 public int compare(final ByteOrderMark bom1, final ByteOrderMark bom2) { 139 final int len1 = bom1.length(); 140 final int len2 = bom2.length(); 141 if (len1 > len2) { 142 return -1; 143 } 144 if (len2 > len1) { 145 return 1; 146 } 147 return 0; 148 } 149 }; 150 151 /** 152 * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them. 153 * 154 * @param delegate 155 * the InputStream to delegate to 156 * @param include 157 * true to include the specified BOMs or false to exclude them 158 * @param boms 159 * The BOMs to detect and optionally exclude 160 */ 161 public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) { 162 super(delegate); 163 if (boms == null || boms.length == 0) { 164 throw new IllegalArgumentException("No BOMs specified"); 165 } 166 this.include = include; 167 // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes. 168 Arrays.sort(boms, ByteOrderMarkLengthComparator); 169 this.boms = Arrays.asList(boms); 170 171 } 172 173 /** 174 * Indicates whether the stream contains one of the specified BOMs. 175 * 176 * @return true if the stream has one of the specified BOMs, otherwise false if it does not 177 * @throws IOException 178 * if an error reading the first bytes of the stream occurs 179 */ 180 public boolean hasBOM() throws IOException { 181 return getBOM() != null; 182 } 183 184 /** 185 * Indicates whether the stream contains the specified BOM. 186 * 187 * @param bom 188 * The BOM to check for 189 * @return true if the stream has the specified BOM, otherwise false if it does not 190 * @throws IllegalArgumentException 191 * if the BOM is not one the stream is configured to detect 192 * @throws IOException 193 * if an error reading the first bytes of the stream occurs 194 */ 195 public boolean hasBOM(final ByteOrderMark bom) throws IOException { 196 if (!boms.contains(bom)) { 197 throw new IllegalArgumentException("Stream not configure to detect " + bom); 198 } 199 return byteOrderMark != null && getBOM().equals(bom); 200 } 201 202 /** 203 * Return the BOM (Byte Order Mark). 204 * 205 * @return The BOM or null if none 206 * @throws IOException 207 * if an error reading the first bytes of the stream occurs 208 */ 209 public ByteOrderMark getBOM() throws IOException { 210 if (firstBytes == null) { 211 fbLength = 0; 212 // BOMs are sorted from longest to shortest 213 final int maxBomSize = boms.get(0).length(); 214 firstBytes = new int[maxBomSize]; 215 // Read first maxBomSize bytes 216 for (int i = 0; i < firstBytes.length; i++) { 217 firstBytes[i] = in.read(); 218 fbLength++; 219 if (firstBytes[i] < 0) { 220 break; 221 } 222 } 223 // match BOM in firstBytes 224 byteOrderMark = find(); 225 if (byteOrderMark != null) { 226 if (!include) { 227 if (byteOrderMark.length() < firstBytes.length) { 228 fbIndex = byteOrderMark.length(); 229 } else { 230 fbLength = 0; 231 } 232 } 233 } 234 } 235 return byteOrderMark; 236 } 237 238 /** 239 * Return the BOM charset Name - {@link ByteOrderMark#getCharsetName()}. 240 * 241 * @return The BOM charset Name or null if no BOM found 242 * @throws IOException 243 * if an error reading the first bytes of the stream occurs 244 * 245 */ 246 public String getBOMCharsetName() throws IOException { 247 getBOM(); 248 return byteOrderMark == null ? null : byteOrderMark.getCharsetName(); 249 } 250 251 /** 252 * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte 253 * <code>read()</code> method, either returning a valid byte or -1 to indicate that the initial bytes have been 254 * processed already. 255 * 256 * @return the byte read (excluding BOM) or -1 if the end of stream 257 * @throws IOException 258 * if an I/O error occurs 259 */ 260 private int readFirstBytes() throws IOException { 261 getBOM(); 262 return fbIndex < fbLength ? firstBytes[fbIndex++] : -1; 263 } 264 265 /** 266 * Find a BOM with the specified bytes. 267 * 268 * @return The matched BOM or null if none matched 269 */ 270 private ByteOrderMark find() { 271 for (final ByteOrderMark bom : boms) { 272 if (matches(bom)) { 273 return bom; 274 } 275 } 276 return null; 277 } 278 279 /** 280 * Check if the bytes match a BOM. 281 * 282 * @param bom 283 * The BOM 284 * @return true if the bytes match the bom, otherwise false 285 */ 286 private boolean matches(final ByteOrderMark bom) { 287 // if (bom.length() != fbLength) { 288 // return false; 289 // } 290 // firstBytes may be bigger than the BOM bytes 291 for (int i = 0; i < bom.length(); i++) { 292 if (bom.get(i) != firstBytes[i]) { 293 return false; 294 } 295 } 296 return true; 297 } 298 299 // ---------------------------------------------------------------------------- 300 // Implementation of InputStream 301 // ---------------------------------------------------------------------------- 302 303 /** 304 * Invokes the delegate's <code>read()</code> method, detecting and optionally skipping BOM. 305 * 306 * @return the byte read (excluding BOM) or -1 if the end of stream 307 * @throws IOException 308 * if an I/O error occurs 309 */ 310 @Override 311 public int read() throws IOException { 312 final int b = readFirstBytes(); 313 return b >= 0 ? b : in.read(); 314 } 315 316 /** 317 * Invokes the delegate's <code>read(byte[], int, int)</code> method, detecting and optionally skipping BOM. 318 * 319 * @param buf 320 * the buffer to read the bytes into 321 * @param off 322 * The start offset 323 * @param len 324 * The number of bytes to read (excluding BOM) 325 * @return the number of bytes read or -1 if the end of stream 326 * @throws IOException 327 * if an I/O error occurs 328 */ 329 @Override 330 public int read(final byte[] buf, int off, int len) throws IOException { 331 int firstCount = 0; 332 int b = 0; 333 while (len > 0 && b >= 0) { 334 b = readFirstBytes(); 335 if (b >= 0) { 336 buf[off++] = (byte) (b & 0xFF); 337 len--; 338 firstCount++; 339 } 340 } 341 final int secondCount = in.read(buf, off, len); 342 return secondCount < 0 ? firstCount > 0 ? firstCount : -1 : firstCount + secondCount; 343 } 344 345 /** 346 * Invokes the delegate's <code>read(byte[])</code> method, detecting and optionally skipping BOM. 347 * 348 * @param buf 349 * the buffer to read the bytes into 350 * @return the number of bytes read (excluding BOM) or -1 if the end of stream 351 * @throws IOException 352 * if an I/O error occurs 353 */ 354 @Override 355 public int read(final byte[] buf) throws IOException { 356 return read(buf, 0, buf.length); 357 } 358 359 /** 360 * Invokes the delegate's <code>mark(int)</code> method. 361 * 362 * @param readlimit 363 * read ahead limit 364 */ 365 @Override 366 public synchronized void mark(final int readlimit) { 367 markFbIndex = fbIndex; 368 markedAtStart = firstBytes == null; 369 in.mark(readlimit); 370 } 371 372 /** 373 * Invokes the delegate's <code>reset()</code> method. 374 * 375 * @throws IOException 376 * if an I/O error occurs 377 */ 378 @Override 379 public synchronized void reset() throws IOException { 380 fbIndex = markFbIndex; 381 if (markedAtStart) { 382 firstBytes = null; 383 } 384 385 in.reset(); 386 } 387 388 /** 389 * Invokes the delegate's <code>skip(long)</code> method, detecting and optionallyskipping BOM. 390 * 391 * @param n 392 * the number of bytes to skip 393 * @return the number of bytes to skipped or -1 if the end of stream 394 * @throws IOException 395 * if an I/O error occurs 396 */ 397 @Override 398 public long skip(long n) throws IOException { 399 while (n > 0 && readFirstBytes() >= 0) { 400 n--; 401 } 402 return in.skip(n); 403 } 404}