001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.wicket.util.io;
018
019import java.io.BufferedInputStream;
020import java.io.IOException;
021import java.io.InputStream;
022import java.io.InputStreamReader;
023import java.io.Reader;
024import java.util.regex.Matcher;
025import java.util.regex.Pattern;
026
027import org.apache.wicket.util.lang.Args;
028import org.apache.wicket.util.string.Strings;
029
030
031/**
032 * This is a simple XmlReader. Its only purpose is to read the xml decl string from the input and
033 * apply proper character encoding to all subsequent characters. The xml decl string itself is
034 * removed from the output.
035 * 
036 * @author Juergen Donnerstag
037 */
038public final class XmlReader extends Reader
039{
040        /** Regex to find <?xml encoding ... ?> */
041        private static final Pattern xmlDecl = Pattern.compile("[\\s\\n\\r]*<\\?xml(\\s+.*)?\\?>");
042
043        /** Regex to find <?xml encoding ... ?> */
044        private static final Pattern encodingPattern = Pattern.compile("\\s+encoding\\s*=\\s*([\"\'](.*?)[\"\']|(\\S*)).*\\?>");
045
046        /** Null, if JVM default. Else from <?xml encoding=""> */
047        private String encoding;
048
049        /** The input stream to read the data from */
050        private final InputStream inputStream;
051
052        /** The reader which does the character encoding */
053        private Reader reader;
054
055        /**
056         * Construct.
057         * 
058         * @param inputStream
059         *            The InputStream to read the xml data from
060         * @param defaultEncoding
061         *            Default character encoding to use when not specified in XML declaration, specify
062         *            null to use JVM default
063         * @throws IOException
064         *             In case something went wrong while reading the data
065         */
066        public XmlReader(final InputStream inputStream, final String defaultEncoding)
067                throws IOException
068        {
069                Args.notNull(inputStream, "inputStream");
070
071                if (!inputStream.markSupported())
072                {
073                        this.inputStream = new BufferedInputStream(new BOMInputStream(inputStream));
074                }
075                else
076                {
077                        this.inputStream = new BOMInputStream(inputStream);
078                }
079                encoding = defaultEncoding;
080
081                init();
082        }
083
084        /**
085         * Return the encoding used while reading the markup file.
086         * 
087         * @return if null, then JVM default
088         */
089        public final String getEncoding()
090        {
091                return encoding;
092        }
093
094        /**
095         * Reads and parses markup from a resource such as file.
096         * 
097         * @throws IOException
098         */
099        public void init() throws IOException
100        {
101                // read ahead buffer required for the first line of the markup (encoding)
102                final int readAheadSize = 80;
103                inputStream.mark(readAheadSize);
104
105                // read-ahead the input stream and check if it starts with <?xml..?>.
106                String xmlDeclaration = getXmlDeclaration(inputStream, readAheadSize);
107                if (!Strings.isEmpty(xmlDeclaration))
108                {
109                        // If yes than determine the encoding from the xml decl
110                        encoding = determineEncoding(xmlDeclaration);
111                }
112                else
113                {
114                        // If not, reset the input stream to the beginning of the file
115                        inputStream.reset();
116                }
117
118                if (encoding == null)
119                {
120                        // Use JVM default
121                        reader = new InputStreamReader(inputStream);
122                }
123                else
124                {
125                        // Use the encoding provided
126                        reader = new InputStreamReader(inputStream, encoding);
127                }
128        }
129
130        /**
131         * Determine the encoding from the xml decl.
132         * 
133         * @param string
134         *            The xmlDecl string
135         * @return The encoding. Null, if not found
136         */
137        private String determineEncoding(final CharSequence string)
138        {
139                // Does the string match the <?xml .. ?> pattern
140                final Matcher matcher = encodingPattern.matcher(string);
141                if (!matcher.find())
142                {
143                        // No
144                        return null;
145                }
146
147                // Extract the encoding
148                String encoding = matcher.group(2);
149                if ((encoding == null) || (encoding.length() == 0))
150                {
151                        encoding = matcher.group(3);
152                }
153
154                if (encoding != null)
155                {
156                        encoding = encoding.trim();
157                }
158
159                return encoding;
160        }
161
162        /**
163         * Read-ahead the input stream (markup file). If the first line contains &lt;?xml...?&gt;, than
164         * remember the xml decl for later to determine the encoding.
165         * <p>
166         * The xml decl will not be forwarded to the user.
167         * 
168         * @param in
169         *            The markup file
170         * @param readAheadSize
171         *            The read ahead buffer available to read the xml encoding information
172         * @return true, if &lt;?xml ..?&gt; has been found
173         * @throws IOException
174         */
175        private String getXmlDeclaration(final InputStream in, final int readAheadSize)
176                throws IOException
177        {
178                // Max one line
179                final StringBuilder pushBack = new StringBuilder(readAheadSize);
180
181                // The current char from the markup file
182                int value;
183                while ((value = in.read()) != -1)
184                {
185                        pushBack.append((char)value);
186
187                        // Stop at the end of the first tag or end of line. If it is HTML
188                        // without newlines, stop after X bytes (= characters)
189                        if ((value == '>') || (value == '\n') || (value == '\r') ||
190                                (pushBack.length() >= (readAheadSize - 1)))
191                        {
192                                break;
193                        }
194                }
195
196                // Does the string match the <?xml .. ?> pattern
197                final Matcher matcher = xmlDecl.matcher(pushBack);
198                if (!matcher.matches())
199                {
200                        // No
201                        return null;
202                }
203
204                // Save the whole <?xml ..> string for later
205                return pushBack.toString().trim();
206        }
207
208        /**
209         * @see java.io.Reader#close()
210         */
211        @Override
212        public void close() throws IOException
213        {
214                try
215                {
216                        reader.close();
217                }
218                finally
219                {
220                        inputStream.close();
221                }
222        }
223
224        /**
225         * @see java.io.Reader#read(char[], int, int)
226         */
227        @Override
228        public int read(final char[] buf, final int from, final int to) throws IOException
229        {
230                return reader.read(buf, from, to);
231        }
232
233        /**
234         * @return The markup to be parsed
235         */
236        @Override
237        public String toString()
238        {
239                return inputStream.toString() + " (" + encoding + ")";
240        }
241}