001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.wicket.validation.validator;
018
019import java.util.Arrays;
020import java.util.HashSet;
021import java.util.Set;
022import java.util.regex.Matcher;
023import java.util.regex.Pattern;
024
025import org.apache.wicket.validation.IValidatable;
026import org.apache.wicket.validation.IValidationError;
027import org.apache.wicket.validation.IValidator;
028import org.apache.wicket.validation.ValidationError;
029
030/**
031 * Validator for checking URLs. The default schemes allowed are <code>http://</code>,
032 * <code>https://</code>, and <code>ftp://</code>.
033 * <p>
034 * The behavior of validation is modified by passing in one of these options:
035 * <p>
036 * <ul>
037 * <li><code>ALLOW_2_SLASHES - [FALSE]</code>: Allows double '/' characters in the path component.</li>
038 * <li><code>NO_FRAGMENT- [FALSE]</code>: By default fragments are allowed. If this option is
039 * included then fragments are flagged as illegal.</li>
040 * <li><code>ALLOW_ALL_SCHEMES - [FALSE]</code>: By default only http, https, and ftp are considered
041 * valid schemes. Enabling this option will let any scheme pass validation.</li>
042 * </ul>
043 * <p>
044 * This was originally based <code>org.apache.commons.validator.UrlValidator</code>, but the
045 * dependency on Jakarta-ORO was removed and it now uses java.util.regexp instead. Usage example:
046 * <p>
047 * 
048 * <pre>
049 * &lt;code&gt;
050 * Component.add(new UrlValidator({&quot;http&quot;, &quot;https&quot;}));
051 * &lt;/code&gt;
052 * </pre>
053 * 
054 * @author Vincent Demay
055 * @since 1.2.6
056 * @see "http://www.ietf.org/rfc/rfc2396.txt"
057 */
058public class UrlValidator implements IValidator<String>
059{
060        private static final long serialVersionUID = 1L;
061
062        /**
063         * Allows all validly-formatted schemes to pass validation instead of supplying a set of valid
064         * schemes.
065         */
066        public static final int ALLOW_ALL_SCHEMES = 1 << 0;
067
068        /**
069         * Allow two slashes in the path component of the <code>URL</code>.
070         */
071        public static final int ALLOW_2_SLASHES = 1 << 1;
072
073        /**
074         * Enabling this option disallows any <code>URL</code> fragments.
075         */
076        public static final int NO_FRAGMENTS = 1 << 2;
077
078        private static final String ALPHA_CHARS = "a-zA-Z";
079
080        private static final String ALPHA_NUMERIC_CHARS = ALPHA_CHARS + "\\d";
081
082        private static final String SPECIAL_CHARS = ";/@&=,.?:+$";
083
084        private static final String VALID_CHARS = "[^\\s" + SPECIAL_CHARS + "]";
085
086        private static final String SCHEME_CHARS = ALPHA_CHARS;
087
088        // Drop numeric, and "+-." for now
089        private static final String AUTHORITY_CHARS = ALPHA_NUMERIC_CHARS + "\\-\\.";
090
091        private static final String ATOM = VALID_CHARS + '+';
092
093        /**
094         * This expression derived/taken from the BNF for URI (RFC2396).
095         */
096        private static final String URL_PATTERN = "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?";
097
098        /**
099         * Schema / Protocol (<code>http:</code>, <code>ftp:</code>, <code>file:</code>, etc).
100         */
101        private static final int PARSE_URL_SCHEME = 2;
102        private static final int PARSE_URL_AUTHORITY = 4; // Includes hostname / ip and port number.
103        private static final int PARSE_URL_PATH = 5;
104        private static final int PARSE_URL_QUERY = 7;
105        private static final int PARSE_URL_FRAGMENT = 9;
106
107        /**
108         * Protocol (<code>http:</code>, <code>ftp:</code>, or <code>https:</code>).
109         */
110        private static final String SCHEME_PATTERN = "^[" + SCHEME_CHARS + "].*$";
111
112        private static final String AUTHORITY_PATTERN = "^(.+(:.*)?@)?([" + AUTHORITY_CHARS +
113                "]*)(:\\d*)?(.*)?";
114
115        private static final int PARSE_AUTHORITY_HOST_IP = 3;
116        private static final int PARSE_AUTHORITY_PORT = 4;
117        private static final int PARSE_AUTHORITY_EXTRA = 5; // Should always be empty.
118
119        private static final String PATH_PATTERN = "^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$";
120
121        private static final String QUERY_PATTERN = "^(.*)$";
122
123        private static final String LEGAL_ASCII_PATTERN = "^[\\x00-\\x7F]+$";
124
125        private static final String IP_V4_DOMAIN_PATTERN = "^(\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})$";
126
127        private static final String DOMAIN_PATTERN = "^" + ATOM + "(\\." + ATOM + ")*$";
128
129        private static final String PORT_PATTERN = "^:(\\d{1,5})$";
130
131        private static final String ATOM_PATTERN = "(" + ATOM + ")";
132
133        private static final String ALPHA_PATTERN = "^[" + ALPHA_CHARS + "]";
134
135        /**
136         * Holds the set of current validation options.
137         */
138        private long options = 0;
139
140        /**
141         * The set of schemes that are allowed to be in a URL.
142         */
143        private final Set<String> allowedSchemes = new HashSet<String>();
144
145        /**
146         * If no schemes are provided, default to this set of protocols.
147         */
148        protected String[] defaultSchemes = { "http", "https", "ftp" };
149
150        /**
151         * Constructs a <code>UrlValidator</code> with default properties.
152         */
153        public UrlValidator()
154        {
155                this(null);
156        }
157
158        /**
159         * Constructs a <code>UrlValidator</code> with the given <code>String</code> array of scheme
160         * options. The validation is modified by passing in options in the <code>schemes</code>
161         * argument.
162         * 
163         * @param schemes
164         *            Pass in one or more <code>URL</code> schemes to consider valid. Passing in a
165         *            <code>null</code> will default to "<code>http,https,ftp</code>" being used. If a
166         *            non-<code>null</code> scheme is specified, then all valid schemes must be
167         *            specified. Setting the <code>ALLOW_ALL_SCHEMES</code> option will ignore the
168         *            contents of <code>schemes</code>.
169         */
170        public UrlValidator(String[] schemes)
171        {
172                this(schemes, 0);
173        }
174
175        /**
176         * Constructs a <code>UrlValidator</code> with the given validation options.
177         * 
178         * @param options
179         *            The options should be set using the public constants declared in this class. To
180         *            set multiple options you simply add them together. For example,
181         *            <code>ALLOW_2_SLASHES</code> + <code>NO_FRAGMENTS</code> enables both of those
182         *            options.
183         */
184        public UrlValidator(int options)
185        {
186                this(null, options);
187        }
188
189        /**
190         * Constructs a <code>UrlValidator</code> with the given scheme and validation options (see
191         * class description).
192         * 
193         * @param schemes
194         *            Pass in one or more <code>URL</code> schemes to consider valid. Passing in a
195         *            <code>null</code> will default to "<code>http,https,ftp</code>" being used. If a
196         *            non-<code>null</code> scheme is specified, then all valid schemes must be
197         *            specified. Setting the <code>ALLOW_ALL_SCHEMES</code> option will ignore the
198         *            contents of <code>schemes</code>.
199         * @param options
200         *            The options should be set using the public constants declared in this class. To
201         *            set multiple options you simply add them together. For example,
202         *            <code>ALLOW_2_SLASHES</code> + <code>NO_FRAGMENTS</code> enables both of those
203         *            options.
204         * 
205         */
206        public UrlValidator(String[] schemes, int options)
207        {
208                this.options = options;
209
210                if (isOn(ALLOW_ALL_SCHEMES))
211                {
212                        return;
213                }
214
215                if (schemes == null)
216                {
217                        schemes = defaultSchemes;
218                }
219
220                allowedSchemes.addAll(Arrays.asList(schemes));
221        }
222
223
224        @Override
225        public void validate(IValidatable<String> validatable)
226        {
227                String url = validatable.getValue();
228                if (!isValid(url))
229                {
230                        validatable.error(decorate(new ValidationError(this), validatable));
231                }
232        }
233
234        /**
235         * Allows subclasses to decorate reported errors
236         * 
237         * @param error
238         * @return decorated error
239         */
240        protected IValidationError decorate(IValidationError error, IValidatable<String> validatable)
241        {
242                return error;
243        }
244
245        /**
246         * Checks if a field has a valid <code>URL</code>. This method is public because it is directly
247         * used in tests.
248         * 
249         * @param value
250         *            The value validation is being performed on. A <code>null</code> value is
251         *            considered invalid.
252         * @return <code>true</code> if the <code>URL</code> is valid
253         */
254        public final boolean isValid(String value)
255        {
256                if (value == null)
257                {
258                        return false;
259                }
260
261                Matcher matchAsciiPat = Pattern.compile(LEGAL_ASCII_PATTERN).matcher(value);
262                if (!matchAsciiPat.matches())
263                {
264                        return false;
265                }
266
267                // Check the whole url address structure
268                Matcher matchUrlPat = Pattern.compile(URL_PATTERN).matcher(value);
269                if (!matchUrlPat.matches())
270                {
271                        return false;
272                }
273
274                if (!isValidScheme(matchUrlPat.group(PARSE_URL_SCHEME)))
275                {
276                        return false;
277                }
278
279                if (!isValidAuthority(matchUrlPat.group(PARSE_URL_AUTHORITY)))
280                {
281                        return false;
282                }
283
284                if (!isValidPath(matchUrlPat.group(PARSE_URL_PATH)))
285                {
286                        return false;
287                }
288
289                if (!isValidQuery(matchUrlPat.group(PARSE_URL_QUERY)))
290                {
291                        return false;
292                }
293
294                if (!isValidFragment(matchUrlPat.group(PARSE_URL_FRAGMENT)))
295                {
296                        return false;
297                }
298
299                return true;
300        }
301
302        /**
303         * Validates a scheme. If schemes[] was initialized to non-<code>null</code>, then only those
304         * schemes are allowed. Note that this is slightly different than for the constructor.
305         * 
306         * @param scheme
307         *            The scheme to validate. A <code>null</code> value is considered invalid.
308         * @return <code>true</code> if the <code>URL</code> is valid
309         */
310        protected boolean isValidScheme(String scheme)
311        {
312                if (scheme == null)
313                {
314                        return false;
315                }
316
317                if (!Pattern.compile(SCHEME_PATTERN).matcher(scheme).matches())
318                {
319                        return false;
320                }
321
322                if (isOff(ALLOW_ALL_SCHEMES))
323                {
324
325                        if (!allowedSchemes.contains(scheme))
326                        {
327                                return false;
328                        }
329                }
330
331                return true;
332        }
333
334        /**
335         * Returns <code>true</code> if the authority is properly formatted. An authority is the
336         * combination of host name and port. A <code>null</code> authority value is considered invalid.
337         * 
338         * @param authority
339         *            an authority value to validate
340         * @return true if authority (host name and port) is valid.
341         */
342        protected boolean isValidAuthority(String authority)
343        {
344                if (authority == null)
345                {
346                        return false;
347                }
348
349                Matcher authorityMatcher = Pattern.compile(AUTHORITY_PATTERN).matcher(authority);
350                if (!authorityMatcher.matches())
351                {
352                        return false;
353                }
354
355                boolean ipV4Address = false;
356                boolean hostname = false;
357                // check if authority is IP address or hostname
358                String hostIP = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
359                Matcher matchIPV4Pat = Pattern.compile(IP_V4_DOMAIN_PATTERN).matcher(hostIP);
360                ipV4Address = matchIPV4Pat.matches();
361
362                if (ipV4Address)
363                {
364                        // this is an IP address so check components
365                        for (int i = 1; i <= 4; i++)
366                        {
367                                String ipSegment = matchIPV4Pat.group(i);
368                                if (ipSegment == null || ipSegment.length() <= 0)
369                                {
370                                        return false;
371                                }
372
373                                try
374                                {
375                                        if (Integer.parseInt(ipSegment) > 255)
376                                        {
377                                                return false;
378                                        }
379                                }
380                                catch (NumberFormatException e)
381                                {
382                                        return false;
383                                }
384
385                        }
386                }
387                else
388                {
389                        // Domain is hostname name
390                        hostname = Pattern.compile(DOMAIN_PATTERN).matcher(hostIP).matches();
391                }
392
393                // rightmost hostname will never start with a digit.
394                if (hostname)
395                {
396                        // LOW-TECH FIX FOR VALIDATOR-202
397                        // TODO: Rewrite to use ArrayList and .add semantics: see
398                        // VALIDATOR-203
399                        char[] chars = hostIP.toCharArray();
400                        int size = 1;
401                        for (char ch : chars)
402                        {
403                                if (ch == '.')
404                                {
405                                        size++;
406                                }
407                        }
408                        String[] domainSegment = new String[size];
409                        boolean match = true;
410                        int segmentCount = 0;
411                        int segmentLength = 0;
412
413                        while (match)
414                        {
415                                Matcher atomMatcher = Pattern.compile(ATOM_PATTERN).matcher(hostIP);
416                                match = atomMatcher.find();
417                                if (match)
418                                {
419                                        domainSegment[segmentCount] = atomMatcher.group(1);
420                                        segmentLength = domainSegment[segmentCount].length() + 1;
421                                        hostIP = (segmentLength >= hostIP.length()) ? ""
422                                                : hostIP.substring(segmentLength);
423
424                                        segmentCount++;
425                                }
426                        }
427
428                        if (segmentCount > 1)
429                        {
430                                String topLevel = domainSegment[segmentCount - 1];
431                                if (topLevel.length() < 2)
432                                {
433                                        return false;
434                                }
435
436                                // First letter of top level must be a alpha
437                                Matcher alphaMatcher = Pattern.compile(ALPHA_PATTERN).matcher(
438                                        topLevel.substring(0, 1));
439                                if (!alphaMatcher.matches())
440                                {
441                                        return false;
442                                }
443                        }
444                }
445
446                if (!hostname && !ipV4Address)
447                {
448                        return false;
449                }
450
451                String port = authorityMatcher.group(PARSE_AUTHORITY_PORT);
452                if (port != null)
453                {
454                        Matcher portMatcher = Pattern.compile(PORT_PATTERN).matcher(port);
455                        if (!portMatcher.matches())
456                        {
457                                return false;
458                        }
459                }
460
461                String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
462                if (!isBlankOrNull(extra))
463                {
464                        return false;
465                }
466
467                return true;
468        }
469
470        /**
471         * Returns <code>true</code> if the path is valid. A <code>null</code> value is considered
472         * invalid.
473         * 
474         * @param path
475         *            a path value to validate.
476         * @return <code>true</code> if path is valid.
477         */
478        protected boolean isValidPath(String path)
479        {
480                if (path == null)
481                {
482                        return false;
483                }
484
485                Matcher pathMatcher = Pattern.compile(PATH_PATTERN).matcher(path);
486
487                if (!pathMatcher.matches())
488                {
489                        return false;
490                }
491
492                int slash2Count = countToken("//", path);
493                if (isOff(ALLOW_2_SLASHES) && (slash2Count > 0))
494                {
495                        return false;
496                }
497
498                int slashCount = countToken("/", path);
499                int dot2Count = countToken("/..", path);
500                if (dot2Count > 0)
501                {
502                        if ((slashCount - slash2Count - 1) <= dot2Count)
503                        {
504                                return false;
505                        }
506                }
507
508                return true;
509        }
510
511        /**
512         * Returns <code>true</code> if the query is <code>null</code> or if it's a properly-formatted
513         * query string.
514         * 
515         * @param query
516         *            a query value to validate
517         * @return <code>true</code> if the query is valid
518         */
519        protected boolean isValidQuery(String query)
520        {
521                if (query == null)
522                {
523                        return true;
524                }
525
526                Matcher queryMatcher = Pattern.compile(QUERY_PATTERN).matcher(query);
527                return queryMatcher.matches();
528        }
529
530        /**
531         * Returns <code>true</code> if the given fragment is <code>null</code> or fragments are
532         * allowed.
533         * 
534         * @param fragment
535         *            a fragment value to validate
536         * @return <code>true</code> if the fragment is valid
537         */
538        protected boolean isValidFragment(String fragment)
539        {
540                if (fragment == null)
541                {
542                        return true;
543                }
544
545                return isOff(NO_FRAGMENTS);
546        }
547
548        /**
549         * Returns the number of times the token appears in the target.
550         * 
551         * @param token
552         *            a token value to be counted
553         * @param target
554         *            a target <code>String</code> to count tokens in
555         * @return the number of tokens
556         */
557        protected int countToken(String token, String target)
558        {
559                int tokenIndex = 0;
560                int count = 0;
561                while (tokenIndex != -1)
562                {
563                        tokenIndex = target.indexOf(token, tokenIndex);
564                        if (tokenIndex > -1)
565                        {
566                                tokenIndex++;
567                                count++;
568                        }
569                }
570                return count;
571        }
572
573        /**
574         * Checks if the field isn't <code>null</code> and if length of the field is greater than zero,
575         * not including whitespace.
576         * 
577         * @param value
578         *            the value validation is being performed on
579         * @return <code>true</code> if blank or <code>null</code>
580         */
581        public static boolean isBlankOrNull(String value)
582        {
583                return ((value == null) || (value.trim().length() == 0));
584        }
585
586        // Flag Management
587        /**
588         * Tests whether the given flag is on. If the flag is not a power of 2 (ie. 3) this tests
589         * whether the combination of flags is on.
590         * 
591         * @param flag
592         *            flag value to check
593         * @return whether the specified flag value is on
594         */
595        public boolean isOn(long flag)
596        {
597                return (options & flag) > 0;
598        }
599
600        /**
601         * Tests whether the given flag is off. If the flag is not a power of 2 (ie. 3) this tests
602         * whether the combination of flags is off.
603         * 
604         * @param flag
605         *            flag value to check.
606         * @return whether the specified flag value is off
607         */
608        public boolean isOff(long flag)
609        {
610                return (options & flag) == 0;
611        }
612
613}