001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.wicket.markup.parser; 018 019import java.io.BufferedInputStream; 020import java.io.IOException; 021import java.io.InputStream; 022import java.io.StringReader; 023import java.text.ParseException; 024import java.util.Locale; 025 026import org.apache.wicket.markup.parser.XmlTag.TagType; 027import org.apache.wicket.markup.parser.XmlTag.TextSegment; 028import org.apache.wicket.util.io.FullyBufferedReader; 029import org.apache.wicket.util.io.IOUtils; 030import org.apache.wicket.util.io.XmlReader; 031import org.apache.wicket.util.lang.Args; 032import org.apache.wicket.util.parse.metapattern.parsers.TagNameParser; 033import org.apache.wicket.util.parse.metapattern.parsers.VariableAssignmentParser; 034import org.apache.wicket.util.string.Strings; 035 036/** 037 * A fairly shallow markup pull parser which parses a markup string of a given type of markup (for 038 * example, html, xml, vxml or wml) into ComponentTag and RawMarkup tokens. 039 * 040 * @author Jonathan Locke 041 * @author Juergen Donnerstag 042 */ 043public final class XmlPullParser implements IXmlPullParser 044{ 045 /** */ 046 public static final String STYLE = "style"; 047 048 /** */ 049 public static final String SCRIPT = "script"; 050 051 /** 052 * The encoding of the XML. 053 */ 054 private String encoding; 055 056 /** 057 * A XML independent reader which loads the whole source data into memory and which provides 058 * convenience methods to access the data. 059 */ 060 private FullyBufferedReader input; 061 062 /** temporary variable which will hold the name of the closing tag. */ 063 private String skipUntilText; 064 065 /** The last substring selected from the input */ 066 private CharSequence lastText; 067 068 /** Everything in between <!DOCTYPE ... > */ 069 private CharSequence doctype; 070 071 /** The type of what is in lastText */ 072 private HttpTagType lastType = HttpTagType.NOT_INITIALIZED; 073 074 /** The last tag found */ 075 private XmlTag lastTag; 076 077 /** 078 * Construct. 079 */ 080 public XmlPullParser() 081 { 082 } 083 084 @Override 085 public final String getEncoding() 086 { 087 return encoding; 088 } 089 090 @Override 091 public final CharSequence getDoctype() 092 { 093 return doctype; 094 } 095 096 @Override 097 public final CharSequence getInputFromPositionMarker(final int toPos) 098 { 099 return input.getSubstring(toPos); 100 } 101 102 @Override 103 public final CharSequence getInput(final int fromPos, final int toPos) 104 { 105 return input.getSubstring(fromPos, toPos); 106 } 107 108 /** 109 * Whatever will be in between the current index and the closing tag, will be ignored (and thus 110 * treated as raw markup (text). This is useful for tags like 'script'. 111 * 112 * @throws ParseException 113 */ 114 private void skipUntil() throws ParseException 115 { 116 // this is a tag with non-XHTML text as body - skip this until the 117 // skipUntilText is found. 118 final int startIndex = input.getPosition(); 119 final int tagNameLen = skipUntilText.length(); 120 121 int pos = input.getPosition() - 1; 122 String endTagText = null; 123 int lastPos = 0; 124 while (!skipUntilText.equalsIgnoreCase(endTagText)) 125 { 126 pos = input.find("</", pos + 1); 127 if ((pos == -1) || ((pos + (tagNameLen + 2)) >= input.size())) 128 { 129 throw new ParseException( 130 skipUntilText + " tag not closed" + getLineAndColumnText(), startIndex); 131 } 132 133 lastPos = pos + 2; 134 endTagText = input.getSubstring(lastPos, lastPos + tagNameLen).toString(); 135 } 136 137 input.setPosition(pos); 138 lastText = input.getSubstring(startIndex, pos); 139 lastType = HttpTagType.BODY; 140 141 // Check that the tag is properly closed 142 lastPos = input.find('>', lastPos + tagNameLen); 143 if (lastPos == -1) 144 { 145 throw new ParseException(skipUntilText + " tag not closed" + getLineAndColumnText(), 146 startIndex); 147 } 148 149 // Reset the state variable 150 skipUntilText = null; 151 } 152 153 /** 154 * 155 * @return line and column number 156 */ 157 private String getLineAndColumnText() 158 { 159 return " (line " + input.getLineNumber() + ", column " + input.getColumnNumber() + ")"; 160 } 161 162 /** 163 * @return XXX 164 * @throws ParseException 165 */ 166 @Override 167 public final HttpTagType next() throws ParseException 168 { 169 // Reached end of markup file? 170 if (input.getPosition() >= input.size()) 171 { 172 return HttpTagType.NOT_INITIALIZED; 173 } 174 175 if (skipUntilText != null) 176 { 177 skipUntil(); 178 return lastType; 179 } 180 181 // Any more tags in the markup? 182 final int openBracketIndex = input.find('<'); 183 184 // Tag or Body? 185 if (input.charAt(input.getPosition()) != '<') 186 { 187 // It's a BODY 188 if (openBracketIndex == -1) 189 { 190 // There is no next matching tag. 191 lastText = input.getSubstring(-1); 192 input.setPosition(input.size()); 193 lastType = HttpTagType.BODY; 194 return lastType; 195 } 196 197 lastText = input.getSubstring(openBracketIndex); 198 input.setPosition(openBracketIndex); 199 lastType = HttpTagType.BODY; 200 return lastType; 201 } 202 203 // Determine the line number 204 input.countLinesTo(openBracketIndex); 205 206 // Get index of closing tag and advance past the tag 207 int closeBracketIndex = -1; 208 209 if (openBracketIndex != -1 && openBracketIndex < input.size() - 1) 210 { 211 char nextChar = input.charAt(openBracketIndex + 1); 212 213 if ((nextChar == '!') || (nextChar == '?')) 214 closeBracketIndex = input.find('>', openBracketIndex); 215 else 216 closeBracketIndex = input.findOutOfQuotes('>', openBracketIndex); 217 } 218 219 if (closeBracketIndex == -1) 220 { 221 throw new ParseException("No matching close bracket at" + getLineAndColumnText(), 222 input.getPosition()); 223 } 224 225 // Get the complete tag text 226 lastText = input.getSubstring(openBracketIndex, closeBracketIndex + 1); 227 228 // Get the tagtext between open and close brackets 229 String tagText = lastText.subSequence(1, lastText.length() - 1).toString(); 230 if (tagText.length() == 0) 231 { 232 throw new ParseException("Found empty tag: '<>' at" + getLineAndColumnText(), 233 input.getPosition()); 234 } 235 236 // Type of the tag, to be determined next 237 final TagType type; 238 239 // If the tag ends in '/', it's a "simple" tag like <foo/> 240 if (tagText.endsWith("/")) 241 { 242 type = TagType.OPEN_CLOSE; 243 tagText = tagText.substring(0, tagText.length() - 1); 244 } 245 else if (tagText.startsWith("/")) 246 { 247 // The tag text starts with a '/', it's a simple close tag 248 type = TagType.CLOSE; 249 tagText = tagText.substring(1); 250 } 251 else 252 { 253 // It must be an open tag 254 type = TagType.OPEN; 255 256 // If open tag and starts with "s" like "script" or "style", than ... 257 if ((tagText.length() > STYLE.length()) && 258 ((tagText.charAt(0) == 's') || (tagText.charAt(0) == 'S'))) 259 { 260 final String lowerCase = tagText.toLowerCase(Locale.ROOT); 261 if (lowerCase.startsWith(SCRIPT)) 262 { 263 String typeAttr = "type="; 264 int idxOfType = lowerCase.indexOf(typeAttr); 265 if (idxOfType > 0) 266 { 267 // +1 to remove the ' or " 268 String typePrefix = lowerCase.substring(idxOfType + typeAttr.length() + 1); 269 if (typePrefix.startsWith("text/javascript") || typePrefix.startsWith("module") 270 || typePrefix.startsWith("importmap")) 271 { 272 // prepare to skip everything between the open and close tag 273 skipUntilText = SCRIPT; 274 } 275 // any other type is assumed to be a template so it can contain child nodes. 276 // See WICKET-5288 277 } 278 else 279 { 280 // no type attribute so it is 'text/javascript' 281 // prepare to skip everything between the open and close tag 282 skipUntilText = SCRIPT; 283 } 284 } 285 else if (lowerCase.startsWith(STYLE)) 286 { 287 // prepare to skip everything between the open and close tag 288 skipUntilText = STYLE; 289 } 290 } 291 } 292 293 // Handle special tags like <!-- and <![CDATA ... 294 final char firstChar = tagText.charAt(0); 295 if ((firstChar == '!') || (firstChar == '?')) 296 { 297 specialTagHandling(tagText, openBracketIndex, closeBracketIndex); 298 299 input.countLinesTo(openBracketIndex); 300 TextSegment text = new TextSegment(lastText, openBracketIndex, input.getLineNumber(), 301 input.getColumnNumber()); 302 lastTag = new XmlTag(text, type); 303 304 return lastType; 305 } 306 307 TextSegment text = new TextSegment(lastText, openBracketIndex, input.getLineNumber(), 308 input.getColumnNumber()); 309 XmlTag tag = new XmlTag(text, type); 310 lastTag = tag; 311 312 // Parse the tag text and populate tag attributes 313 if (parseTagText(tag, tagText)) 314 { 315 // Move to position after the tag 316 input.setPosition(closeBracketIndex + 1); 317 lastType = HttpTagType.TAG; 318 return lastType; 319 } 320 else 321 { 322 throw new ParseException("Malformed tag" + getLineAndColumnText(), openBracketIndex); 323 } 324 } 325 326 /** 327 * Handle special tags like <!-- --> or <![CDATA[..]]> or <?xml> 328 * 329 * @param tagText 330 * @param openBracketIndex 331 * @param closeBracketIndex 332 * @throws ParseException 333 */ 334 protected void specialTagHandling(String tagText, final int openBracketIndex, 335 int closeBracketIndex) throws ParseException 336 { 337 // Handle comments 338 if (tagText.startsWith("!--")) 339 { 340 // downlevel-revealed conditional comments e.g.: <!--[if (gt IE9)|!(IE)]><!--> 341 if (tagText.contains("![endif]--")) 342 { 343 lastType = HttpTagType.CONDITIONAL_COMMENT_ENDIF; 344 345 // Move to position after the tag 346 input.setPosition(closeBracketIndex + 1); 347 return; 348 } 349 350 // Conditional comment? E.g. 351 // "<!--[if IE]><a href='test.html'>my link</a><![endif]-->" 352 if (tagText.startsWith("!--[if ") && tagText.endsWith("]")) 353 { 354 int pos = input.find("]-->", openBracketIndex + 1); 355 if (pos == -1) 356 { 357 throw new ParseException("Unclosed conditional comment beginning at" + 358 getLineAndColumnText(), openBracketIndex); 359 } 360 361 pos += 4; 362 lastText = input.getSubstring(openBracketIndex, pos); 363 364 // Actually it is no longer a comment. It is now 365 // up to the browser to select the section appropriate. 366 input.setPosition(closeBracketIndex + 1); 367 lastType = HttpTagType.CONDITIONAL_COMMENT; 368 } 369 else 370 { 371 // Normal comment section. 372 // Skip ahead to "-->". Note that you can not simply test for 373 // tagText.endsWith("--") as the comment might contain a '>' 374 // inside. 375 int pos = input.find("-->", openBracketIndex + 1); 376 if (pos == -1) 377 { 378 throw new ParseException("Unclosed comment beginning at" + 379 getLineAndColumnText(), openBracketIndex); 380 } 381 382 pos += 3; 383 lastText = input.getSubstring(openBracketIndex, pos); 384 lastType = HttpTagType.COMMENT; 385 input.setPosition(pos); 386 } 387 return; 388 } 389 390 // The closing tag of a conditional comment, e.g. 391 // "<!--[if IE]><a href='test.html'>my link</a><![endif]--> 392 // and also <!--<![endif]-->" 393 if (tagText.equals("![endif]--")) 394 { 395 lastType = HttpTagType.CONDITIONAL_COMMENT_ENDIF; 396 input.setPosition(closeBracketIndex + 1); 397 return; 398 } 399 400 // CDATA sections might contain "<" which is not part of an XML tag. 401 // Make sure escaped "<" are treated right 402 if (tagText.startsWith("![")) 403 { 404 final String startText = (tagText.length() <= 8 ? tagText : tagText.substring(0, 8)); 405 if (startText.toUpperCase(Locale.ROOT).equals("![CDATA[")) 406 { 407 int pos1 = openBracketIndex; 408 do 409 { 410 // Get index of closing tag and advance past the tag 411 closeBracketIndex = findChar('>', pos1); 412 413 if (closeBracketIndex == -1) 414 { 415 throw new ParseException("No matching close bracket at" + 416 getLineAndColumnText(), input.getPosition()); 417 } 418 419 // Get the tagtext between open and close brackets 420 tagText = input.getSubstring(openBracketIndex + 1, closeBracketIndex) 421 .toString(); 422 423 pos1 = closeBracketIndex + 1; 424 } 425 while (tagText.endsWith("]]") == false); 426 427 // Move to position after the tag 428 input.setPosition(closeBracketIndex + 1); 429 430 lastText = tagText; 431 lastType = HttpTagType.CDATA; 432 return; 433 } 434 } 435 436 if (tagText.charAt(0) == '?') 437 { 438 lastType = HttpTagType.PROCESSING_INSTRUCTION; 439 440 // Move to position after the tag 441 input.setPosition(closeBracketIndex + 1); 442 return; 443 } 444 445 if (tagText.startsWith("!DOCTYPE")) 446 { 447 lastType = HttpTagType.DOCTYPE; 448 449 // Get the tagtext between open and close brackets 450 doctype = input.getSubstring(openBracketIndex + 1, closeBracketIndex); 451 452 // Move to position after the tag 453 input.setPosition(closeBracketIndex + 1); 454 return; 455 } 456 457 // Move to position after the tag 458 lastType = HttpTagType.SPECIAL_TAG; 459 input.setPosition(closeBracketIndex + 1); 460 } 461 462 /** 463 * @return MarkupElement 464 */ 465 @Override 466 public final XmlTag getElement() 467 { 468 return lastTag; 469 } 470 471 /** 472 * @return The xml string from the last element 473 */ 474 @Override 475 public final CharSequence getString() 476 { 477 return lastText; 478 } 479 480 /** 481 * @return The next XML tag 482 * @throws ParseException 483 */ 484 public final XmlTag nextTag() throws ParseException 485 { 486 while (next() != HttpTagType.NOT_INITIALIZED) 487 { 488 switch (lastType) 489 { 490 case TAG : 491 return lastTag; 492 493 case BODY : 494 break; 495 496 case COMMENT : 497 break; 498 499 case CONDITIONAL_COMMENT : 500 break; 501 502 case CDATA : 503 break; 504 505 case PROCESSING_INSTRUCTION : 506 break; 507 508 case SPECIAL_TAG : 509 break; 510 } 511 } 512 513 return null; 514 } 515 516 /** 517 * Find the char but ignore any text within ".." and '..' 518 * 519 * @param ch 520 * The character to search 521 * @param startIndex 522 * Start index 523 * @return -1 if not found, else the index 524 */ 525 private int findChar(final char ch, int startIndex) 526 { 527 char quote = 0; 528 529 for (; startIndex < input.size(); startIndex++) 530 { 531 final char charAt = input.charAt(startIndex); 532 if (quote != 0) 533 { 534 if (quote == charAt) 535 { 536 quote = 0; 537 } 538 } 539 else if ((charAt == '"') || (charAt == '\'')) 540 { 541 quote = charAt; 542 } 543 else if (charAt == ch) 544 { 545 return startIndex; 546 } 547 } 548 549 return -1; 550 } 551 552 /** 553 * Parse the given string. 554 * <p> 555 * Note: xml character encoding is NOT applied. It is assumed the input provided does have the 556 * correct encoding already. 557 * 558 * @param string 559 * The input string 560 * @throws IOException 561 * Error while reading the resource 562 */ 563 @Override 564 public void parse(final CharSequence string) throws IOException 565 { 566 Args.notNull(string, "string"); 567 568 this.input = new FullyBufferedReader(new StringReader(string.toString())); 569 this.encoding = null; 570 } 571 572 /** 573 * Reads and parses markup from an input stream, using UTF-8 encoding by default when not 574 * specified in XML declaration. 575 * 576 * @param in 577 * The input stream to read and parse 578 * @throws IOException 579 * 580 * @see #parse(InputStream, String) 581 */ 582 @Override 583 public void parse(final InputStream in) throws IOException 584 { 585 // When XML declaration does not specify encoding, it defaults to UTF-8 586 parse(in, "UTF-8"); 587 } 588 589 /** 590 * Reads and parses markup from an input stream. 591 * <p> 592 * Note: The input is closed after parsing. 593 * 594 * @param inputStream 595 * The input stream to read and parse 596 * @param encoding 597 * The default character encoding of the input 598 * @throws IOException 599 */ 600 @Override 601 public void parse(final InputStream inputStream, final String encoding) throws IOException 602 { 603 Args.notNull(inputStream, "inputStream"); 604 605 try 606 { 607 XmlReader xmlReader = new XmlReader(new BufferedInputStream(inputStream, 4000), 608 encoding); 609 this.input = new FullyBufferedReader(xmlReader); 610 this.encoding = xmlReader.getEncoding(); 611 } 612 finally 613 { 614 IOUtils.closeQuietly(inputStream); 615 } 616 } 617 618 @Override 619 public final void setPositionMarker() 620 { 621 input.setPositionMarker(input.getPosition()); 622 } 623 624 @Override 625 public final void setPositionMarker(final int pos) 626 { 627 input.setPositionMarker(pos); 628 } 629 630 @Override 631 public String toString() 632 { 633 return input.toString(); 634 } 635 636 /** 637 * Parses the text between tags. For example, "a href=foo.html". 638 * 639 * @param tag 640 * @param tagText 641 * The text between tags 642 * @return false in case of an error 643 * @throws ParseException 644 */ 645 private boolean parseTagText(final XmlTag tag, final String tagText) throws ParseException 646 { 647 // Get the length of the tagtext 648 final int tagTextLength = tagText.length(); 649 650 // If we match tagname pattern 651 final TagNameParser tagnameParser = new TagNameParser(tagText); 652 if (tagnameParser.matcher().lookingAt()) 653 { 654 // Extract the tag from the pattern matcher 655 tag.name = tagnameParser.getName(); 656 tag.namespace = tagnameParser.getNamespace(); 657 658 // Are we at the end? Then there are no attributes, so we just 659 // return the tag 660 int pos = tagnameParser.matcher().end(0); 661 if (pos == tagTextLength) 662 { 663 return true; 664 } 665 666 // Extract attributes 667 final VariableAssignmentParser attributeParser = new VariableAssignmentParser(tagText); 668 while (attributeParser.matcher().find(pos)) 669 { 670 // Get key and value using attribute pattern 671 String value = attributeParser.getValue(); 672 673 // In case like <html xmlns:wicket> will the value be null 674 if (value == null) 675 { 676 value = ""; 677 } 678 679 // Set new position to end of attribute 680 pos = attributeParser.matcher().end(0); 681 682 // Chop off double quotes or single quotes 683 if (value.startsWith("\"") || value.startsWith("\'")) 684 { 685 value = value.substring(1, value.length() - 1); 686 } 687 688 // Trim trailing whitespace 689 value = value.trim(); 690 691 // Unescape 692 value = Strings.unescapeMarkup(value).toString(); 693 694 // Get key 695 final String key = attributeParser.getKey(); 696 697 // Put the attribute in the attributes hash 698 if (null != tag.getAttributes().put(key, value)) 699 { 700 throw new ParseException("Same attribute found twice: " + key + 701 getLineAndColumnText(), input.getPosition()); 702 } 703 704 // The input has to match exactly (no left over junk after 705 // attributes) 706 if (pos == tagTextLength) 707 { 708 return true; 709 } 710 } 711 712 return true; 713 } 714 715 return false; 716 } 717}