001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.wicket.markup.parser;
018
019import java.io.BufferedInputStream;
020import java.io.IOException;
021import java.io.InputStream;
022import java.io.StringReader;
023import java.text.ParseException;
024import java.util.Locale;
025
026import org.apache.wicket.markup.parser.XmlTag.TagType;
027import org.apache.wicket.markup.parser.XmlTag.TextSegment;
028import org.apache.wicket.util.io.FullyBufferedReader;
029import org.apache.wicket.util.io.IOUtils;
030import org.apache.wicket.util.io.XmlReader;
031import org.apache.wicket.util.lang.Args;
032import org.apache.wicket.util.parse.metapattern.parsers.TagNameParser;
033import org.apache.wicket.util.parse.metapattern.parsers.VariableAssignmentParser;
034import org.apache.wicket.util.string.Strings;
035
036/**
037 * A fairly shallow markup pull parser which parses a markup string of a given type of markup (for
038 * example, html, xml, vxml or wml) into ComponentTag and RawMarkup tokens.
039 *
040 * @author Jonathan Locke
041 * @author Juergen Donnerstag
042 */
043public final class XmlPullParser implements IXmlPullParser
044{
045        /** */
046        public static final String STYLE = "style";
047
048        /** */
049        public static final String SCRIPT = "script";
050
051        /**
052         * The encoding of the XML.
053         */
054        private String encoding;
055
056        /**
057         * A XML independent reader which loads the whole source data into memory and which provides
058         * convenience methods to access the data.
059         */
060        private FullyBufferedReader input;
061
062        /** temporary variable which will hold the name of the closing tag. */
063        private String skipUntilText;
064
065        /** The last substring selected from the input */
066        private CharSequence lastText;
067
068        /** Everything in between <!DOCTYPE ... > */
069        private CharSequence doctype;
070
071        /** The type of what is in lastText */
072        private HttpTagType lastType = HttpTagType.NOT_INITIALIZED;
073
074        /** The last tag found */
075        private XmlTag lastTag;
076
077        /**
078         * Construct.
079         */
080        public XmlPullParser()
081        {
082        }
083
084        @Override
085        public final String getEncoding()
086        {
087                return encoding;
088        }
089
090        @Override
091        public final CharSequence getDoctype()
092        {
093                return doctype;
094        }
095
096        @Override
097        public final CharSequence getInputFromPositionMarker(final int toPos)
098        {
099                return input.getSubstring(toPos);
100        }
101
102        @Override
103        public final CharSequence getInput(final int fromPos, final int toPos)
104        {
105                return input.getSubstring(fromPos, toPos);
106        }
107
108        /**
109         * Whatever will be in between the current index and the closing tag, will be ignored (and thus
110         * treated as raw markup (text). This is useful for tags like 'script'.
111         *
112         * @throws ParseException
113         */
114        private void skipUntil() throws ParseException
115        {
116                // this is a tag with non-XHTML text as body - skip this until the
117                // skipUntilText is found.
118                final int startIndex = input.getPosition();
119                final int tagNameLen = skipUntilText.length();
120
121                int pos = input.getPosition() - 1;
122                String endTagText = null;
123                int lastPos = 0;
124                while (!skipUntilText.equalsIgnoreCase(endTagText))
125                {
126                        pos = input.find("</", pos + 1);
127                        if ((pos == -1) || ((pos + (tagNameLen + 2)) >= input.size()))
128                        {
129                                throw new ParseException(
130                                        skipUntilText + " tag not closed" + getLineAndColumnText(), startIndex);
131                        }
132
133                        lastPos = pos + 2;
134                        endTagText = input.getSubstring(lastPos, lastPos + tagNameLen).toString();
135                }
136
137                input.setPosition(pos);
138                lastText = input.getSubstring(startIndex, pos);
139                lastType = HttpTagType.BODY;
140
141                // Check that the tag is properly closed
142                lastPos = input.find('>', lastPos + tagNameLen);
143                if (lastPos == -1)
144                {
145                        throw new ParseException(skipUntilText + " tag not closed" + getLineAndColumnText(),
146                                startIndex);
147                }
148
149                // Reset the state variable
150                skipUntilText = null;
151        }
152
153        /**
154         *
155         * @return line and column number
156         */
157        private String getLineAndColumnText()
158        {
159                return " (line " + input.getLineNumber() + ", column " + input.getColumnNumber() + ")";
160        }
161
162        /**
163         * @return XXX
164         * @throws ParseException
165         */
166        @Override
167        public final HttpTagType next() throws ParseException
168        {
169                // Reached end of markup file?
170                if (input.getPosition() >= input.size())
171                {
172                        return HttpTagType.NOT_INITIALIZED;
173                }
174
175                if (skipUntilText != null)
176                {
177                        skipUntil();
178                        return lastType;
179                }
180
181                // Any more tags in the markup?
182                final int openBracketIndex = input.find('<');
183
184                // Tag or Body?
185                if (input.charAt(input.getPosition()) != '<')
186                {
187                        // It's a BODY
188                        if (openBracketIndex == -1)
189                        {
190                                // There is no next matching tag.
191                                lastText = input.getSubstring(-1);
192                                input.setPosition(input.size());
193                                lastType = HttpTagType.BODY;
194                                return lastType;
195                        }
196
197                        lastText = input.getSubstring(openBracketIndex);
198                        input.setPosition(openBracketIndex);
199                        lastType = HttpTagType.BODY;
200                        return lastType;
201                }
202
203                // Determine the line number
204                input.countLinesTo(openBracketIndex);
205
206                // Get index of closing tag and advance past the tag
207                int closeBracketIndex = -1;
208
209                if (openBracketIndex != -1 && openBracketIndex < input.size() - 1)
210                {
211                        char nextChar = input.charAt(openBracketIndex + 1);
212
213                        if ((nextChar == '!') || (nextChar == '?'))
214                                closeBracketIndex = input.find('>', openBracketIndex);
215                        else
216                                closeBracketIndex = input.findOutOfQuotes('>', openBracketIndex);
217                }
218
219                if (closeBracketIndex == -1)
220                {
221                        throw new ParseException("No matching close bracket at" + getLineAndColumnText(),
222                                input.getPosition());
223                }
224
225                // Get the complete tag text
226                lastText = input.getSubstring(openBracketIndex, closeBracketIndex + 1);
227
228                // Get the tagtext between open and close brackets
229                String tagText = lastText.subSequence(1, lastText.length() - 1).toString();
230                if (tagText.length() == 0)
231                {
232                        throw new ParseException("Found empty tag: '<>' at" + getLineAndColumnText(),
233                                input.getPosition());
234                }
235
236                // Type of the tag, to be determined next
237                final TagType type;
238
239                // If the tag ends in '/', it's a "simple" tag like <foo/>
240                if (tagText.endsWith("/"))
241                {
242                        type = TagType.OPEN_CLOSE;
243                        tagText = tagText.substring(0, tagText.length() - 1);
244                }
245                else if (tagText.startsWith("/"))
246                {
247                        // The tag text starts with a '/', it's a simple close tag
248                        type = TagType.CLOSE;
249                        tagText = tagText.substring(1);
250                }
251                else
252                {
253                        // It must be an open tag
254                        type = TagType.OPEN;
255
256                        // If open tag and starts with "s" like "script" or "style", than ...
257                        if ((tagText.length() > STYLE.length()) &&
258                                ((tagText.charAt(0) == 's') || (tagText.charAt(0) == 'S')))
259                        {
260                                final String lowerCase = tagText.toLowerCase(Locale.ROOT);
261                                if (lowerCase.startsWith(SCRIPT))
262                                {
263                                        String typeAttr = "type=";
264                                        int idxOfType = lowerCase.indexOf(typeAttr);
265                                        if (idxOfType > 0)
266                                        {
267                                                // +1 to remove the ' or "
268                                                String typePrefix = lowerCase.substring(idxOfType + typeAttr.length() + 1);
269                                                if (typePrefix.startsWith("text/javascript") || typePrefix.startsWith("module")
270                                                        || typePrefix.startsWith("importmap"))
271                                                {
272                                                        // prepare to skip everything between the open and close tag
273                                                        skipUntilText = SCRIPT;
274                                                }
275                                                // any other type is assumed to be a template so it can contain child nodes.
276                                                // See WICKET-5288
277                                        }
278                                        else
279                                        {
280                                                // no type attribute so it is 'text/javascript'
281                                                // prepare to skip everything between the open and close tag
282                                                skipUntilText = SCRIPT;
283                                        }
284                                }
285                                else if (lowerCase.startsWith(STYLE))
286                                {
287                                        // prepare to skip everything between the open and close tag
288                                        skipUntilText = STYLE;
289                                }
290                        }
291                }
292
293                // Handle special tags like <!-- and <![CDATA ...
294                final char firstChar = tagText.charAt(0);
295                if ((firstChar == '!') || (firstChar == '?'))
296                {
297                        specialTagHandling(tagText, openBracketIndex, closeBracketIndex);
298
299                        input.countLinesTo(openBracketIndex);
300                        TextSegment text = new TextSegment(lastText, openBracketIndex, input.getLineNumber(),
301                                input.getColumnNumber());
302                        lastTag = new XmlTag(text, type);
303
304                        return lastType;
305                }
306
307                TextSegment text = new TextSegment(lastText, openBracketIndex, input.getLineNumber(),
308                        input.getColumnNumber());
309                XmlTag tag = new XmlTag(text, type);
310                lastTag = tag;
311
312                // Parse the tag text and populate tag attributes
313                if (parseTagText(tag, tagText))
314                {
315                        // Move to position after the tag
316                        input.setPosition(closeBracketIndex + 1);
317                        lastType = HttpTagType.TAG;
318                        return lastType;
319                }
320                else
321                {
322                        throw new ParseException("Malformed tag" + getLineAndColumnText(), openBracketIndex);
323                }
324        }
325
326        /**
327         * Handle special tags like &lt;!-- --&gt; or &lt;![CDATA[..]]&gt; or &lt;?xml&gt;
328         *
329         * @param tagText
330         * @param openBracketIndex
331         * @param closeBracketIndex
332         * @throws ParseException
333         */
334        protected void specialTagHandling(String tagText, final int openBracketIndex,
335                int closeBracketIndex) throws ParseException
336        {
337                // Handle comments
338                if (tagText.startsWith("!--"))
339                {
340                        // downlevel-revealed conditional comments e.g.: <!--[if (gt IE9)|!(IE)]><!-->
341                        if (tagText.contains("![endif]--"))
342                        {
343                                lastType = HttpTagType.CONDITIONAL_COMMENT_ENDIF;
344
345                                // Move to position after the tag
346                                input.setPosition(closeBracketIndex + 1);
347                                return;
348                        }
349
350                        // Conditional comment? E.g.
351                        // "<!--[if IE]><a href='test.html'>my link</a><![endif]-->"
352                        if (tagText.startsWith("!--[if ") && tagText.endsWith("]"))
353                        {
354                                int pos = input.find("]-->", openBracketIndex + 1);
355                                if (pos == -1)
356                                {
357                                        throw new ParseException("Unclosed conditional comment beginning at" +
358                                                getLineAndColumnText(), openBracketIndex);
359                                }
360
361                                pos += 4;
362                                lastText = input.getSubstring(openBracketIndex, pos);
363
364                                // Actually it is no longer a comment. It is now
365                                // up to the browser to select the section appropriate.
366                                input.setPosition(closeBracketIndex + 1);
367                                lastType = HttpTagType.CONDITIONAL_COMMENT;
368                        }
369                        else
370                        {
371                                // Normal comment section.
372                                // Skip ahead to "-->". Note that you can not simply test for
373                                // tagText.endsWith("--") as the comment might contain a '>'
374                                // inside.
375                                int pos = input.find("-->", openBracketIndex + 1);
376                                if (pos == -1)
377                                {
378                                        throw new ParseException("Unclosed comment beginning at" +
379                                                getLineAndColumnText(), openBracketIndex);
380                                }
381
382                                pos += 3;
383                                lastText = input.getSubstring(openBracketIndex, pos);
384                                lastType = HttpTagType.COMMENT;
385                                input.setPosition(pos);
386                        }
387                        return;
388                }
389
390                // The closing tag of a conditional comment, e.g.
391                // "<!--[if IE]><a href='test.html'>my link</a><![endif]-->
392                // and also <!--<![endif]-->"
393                if (tagText.equals("![endif]--"))
394                {
395                        lastType = HttpTagType.CONDITIONAL_COMMENT_ENDIF;
396                        input.setPosition(closeBracketIndex + 1);
397                        return;
398                }
399
400                // CDATA sections might contain "<" which is not part of an XML tag.
401                // Make sure escaped "<" are treated right
402                if (tagText.startsWith("!["))
403                {
404                        final String startText = (tagText.length() <= 8 ? tagText : tagText.substring(0, 8));
405                        if (startText.toUpperCase(Locale.ROOT).equals("![CDATA["))
406                        {
407                                int pos1 = openBracketIndex;
408                                do
409                                {
410                                        // Get index of closing tag and advance past the tag
411                                        closeBracketIndex = findChar('>', pos1);
412
413                                        if (closeBracketIndex == -1)
414                                        {
415                                                throw new ParseException("No matching close bracket at" +
416                                                        getLineAndColumnText(), input.getPosition());
417                                        }
418
419                                        // Get the tagtext between open and close brackets
420                                        tagText = input.getSubstring(openBracketIndex + 1, closeBracketIndex)
421                                                .toString();
422
423                                        pos1 = closeBracketIndex + 1;
424                                }
425                                while (tagText.endsWith("]]") == false);
426
427                                // Move to position after the tag
428                                input.setPosition(closeBracketIndex + 1);
429
430                                lastText = tagText;
431                                lastType = HttpTagType.CDATA;
432                                return;
433                        }
434                }
435
436                if (tagText.charAt(0) == '?')
437                {
438                        lastType = HttpTagType.PROCESSING_INSTRUCTION;
439
440                        // Move to position after the tag
441                        input.setPosition(closeBracketIndex + 1);
442                        return;
443                }
444
445                if (tagText.startsWith("!DOCTYPE"))
446                {
447                        lastType = HttpTagType.DOCTYPE;
448
449                        // Get the tagtext between open and close brackets
450                        doctype = input.getSubstring(openBracketIndex + 1, closeBracketIndex);
451
452                        // Move to position after the tag
453                        input.setPosition(closeBracketIndex + 1);
454                        return;
455                }
456
457                // Move to position after the tag
458                lastType = HttpTagType.SPECIAL_TAG;
459                input.setPosition(closeBracketIndex + 1);
460        }
461
462        /**
463         * @return MarkupElement
464         */
465        @Override
466        public final XmlTag getElement()
467        {
468                return lastTag;
469        }
470
471        /**
472         * @return The xml string from the last element
473         */
474        @Override
475        public final CharSequence getString()
476        {
477                return lastText;
478        }
479
480        /**
481         * @return The next XML tag
482         * @throws ParseException
483         */
484        public final XmlTag nextTag() throws ParseException
485        {
486                while (next() != HttpTagType.NOT_INITIALIZED)
487                {
488                        switch (lastType)
489                        {
490                                case TAG :
491                                        return lastTag;
492
493                                case BODY :
494                                        break;
495
496                                case COMMENT :
497                                        break;
498
499                                case CONDITIONAL_COMMENT :
500                                        break;
501
502                                case CDATA :
503                                        break;
504
505                                case PROCESSING_INSTRUCTION :
506                                        break;
507
508                                case SPECIAL_TAG :
509                                        break;
510                        }
511                }
512
513                return null;
514        }
515
516        /**
517         * Find the char but ignore any text within ".." and '..'
518         *
519         * @param ch
520         *            The character to search
521         * @param startIndex
522         *            Start index
523         * @return -1 if not found, else the index
524         */
525        private int findChar(final char ch, int startIndex)
526        {
527                char quote = 0;
528
529                for (; startIndex < input.size(); startIndex++)
530                {
531                        final char charAt = input.charAt(startIndex);
532                        if (quote != 0)
533                        {
534                                if (quote == charAt)
535                                {
536                                        quote = 0;
537                                }
538                        }
539                        else if ((charAt == '"') || (charAt == '\''))
540                        {
541                                quote = charAt;
542                        }
543                        else if (charAt == ch)
544                        {
545                                return startIndex;
546                        }
547                }
548
549                return -1;
550        }
551
552        /**
553         * Parse the given string.
554         * <p>
555         * Note: xml character encoding is NOT applied. It is assumed the input provided does have the
556         * correct encoding already.
557         *
558         * @param string
559         *            The input string
560         * @throws IOException
561         *             Error while reading the resource
562         */
563        @Override
564        public void parse(final CharSequence string) throws IOException
565        {
566                Args.notNull(string, "string");
567
568                this.input = new FullyBufferedReader(new StringReader(string.toString()));
569                this.encoding = null;
570        }
571
572        /**
573         * Reads and parses markup from an input stream, using UTF-8 encoding by default when not
574         * specified in XML declaration.
575         *
576         * @param in
577         *            The input stream to read and parse
578         * @throws IOException
579         *
580         * @see #parse(InputStream, String)
581         */
582        @Override
583        public void parse(final InputStream in) throws IOException
584        {
585                // When XML declaration does not specify encoding, it defaults to UTF-8
586                parse(in, "UTF-8");
587        }
588
589        /**
590         * Reads and parses markup from an input stream.
591         * <p>
592         * Note: The input is closed after parsing.
593         *
594         * @param inputStream
595         *            The input stream to read and parse
596         * @param encoding
597         *            The default character encoding of the input
598         * @throws IOException
599         */
600        @Override
601        public void parse(final InputStream inputStream, final String encoding) throws IOException
602        {
603                Args.notNull(inputStream, "inputStream");
604
605                try
606                {
607                        XmlReader xmlReader = new XmlReader(new BufferedInputStream(inputStream, 4000),
608                                encoding);
609                        this.input = new FullyBufferedReader(xmlReader);
610                        this.encoding = xmlReader.getEncoding();
611                }
612                finally
613                {
614                        IOUtils.closeQuietly(inputStream);
615                }
616        }
617
618        @Override
619        public final void setPositionMarker()
620        {
621                input.setPositionMarker(input.getPosition());
622        }
623
624        @Override
625        public final void setPositionMarker(final int pos)
626        {
627                input.setPositionMarker(pos);
628        }
629
630        @Override
631        public String toString()
632        {
633                return input.toString();
634        }
635
636        /**
637         * Parses the text between tags. For example, "a href=foo.html".
638         *
639         * @param tag
640         * @param tagText
641         *            The text between tags
642         * @return false in case of an error
643         * @throws ParseException
644         */
645        private boolean parseTagText(final XmlTag tag, final String tagText) throws ParseException
646        {
647                // Get the length of the tagtext
648                final int tagTextLength = tagText.length();
649
650                // If we match tagname pattern
651                final TagNameParser tagnameParser = new TagNameParser(tagText);
652                if (tagnameParser.matcher().lookingAt())
653                {
654                        // Extract the tag from the pattern matcher
655                        tag.name = tagnameParser.getName();
656                        tag.namespace = tagnameParser.getNamespace();
657
658                        // Are we at the end? Then there are no attributes, so we just
659                        // return the tag
660                        int pos = tagnameParser.matcher().end(0);
661                        if (pos == tagTextLength)
662                        {
663                                return true;
664                        }
665
666                        // Extract attributes
667                        final VariableAssignmentParser attributeParser = new VariableAssignmentParser(tagText);
668                        while (attributeParser.matcher().find(pos))
669                        {
670                                // Get key and value using attribute pattern
671                                String value = attributeParser.getValue();
672
673                                // In case like <html xmlns:wicket> will the value be null
674                                if (value == null)
675                                {
676                                        value = "";
677                                }
678
679                                // Set new position to end of attribute
680                                pos = attributeParser.matcher().end(0);
681
682                                // Chop off double quotes or single quotes
683                                if (value.startsWith("\"") || value.startsWith("\'"))
684                                {
685                                        value = value.substring(1, value.length() - 1);
686                                }
687
688                                // Trim trailing whitespace
689                                value = value.trim();
690
691                                // Unescape
692                                value = Strings.unescapeMarkup(value).toString();
693
694                                // Get key
695                                final String key = attributeParser.getKey();
696
697                                // Put the attribute in the attributes hash
698                                if (null != tag.getAttributes().put(key, value))
699                                {
700                                        throw new ParseException("Same attribute found twice: " + key +
701                                                getLineAndColumnText(), input.getPosition());
702                                }
703
704                                // The input has to match exactly (no left over junk after
705                                // attributes)
706                                if (pos == tagTextLength)
707                                {
708                                        return true;
709                                }
710                        }
711
712                        return true;
713                }
714
715                return false;
716        }
717}