001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.wicket.util.encoding;
018
019import java.io.ByteArrayOutputStream;
020import java.io.UnsupportedEncodingException;
021import java.nio.charset.Charset;
022import java.nio.charset.IllegalCharsetNameException;
023import java.nio.charset.UnsupportedCharsetException;
024
025import org.apache.wicket.util.lang.Args;
026
027/**
028 * Adapted from Spring Framework's UriUtils class, but defines instances for query string encoding versus URL path
029 * component encoding.
030 * <p/>
031 * The difference is important because a space is encoded as a + in a query string, but this is a
032 * valid value in a path component (and is therefore not decode back to a space).
033 *
034 * @author Thomas Heigl
035 * @see <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC-2396</a>
036 */
037public class UrlEncoder
038{
039
040        enum Type {
041                //@formatter:off
042                QUERY {
043                        @Override
044                        public boolean isAllowed(int c) 
045                        {
046                                return isPchar(c) ||
047                                                ' ' == c || // encoding a space to a + is done in the encode() method
048                                                '*' == c ||
049                                                '/' == c || // to allow direct passing of URL in query
050                                                ',' == c ||
051                                                ':' == c || // allowed and used in wicket interface
052                                                '@' == c ;
053                        }
054                },
055                PATH {
056                        @Override
057                        public boolean isAllowed(int c) 
058                        {
059                                return isPchar(c) ||
060                                                '*' == c ||
061                                                '&' == c ||
062                                                '+' == c ||
063                                                ',' == c ||
064                                                ';' == c || // semicolon is used in ;jsessionid=
065                                                '=' == c ||
066                                                ':' == c || // allowed and used in wicket interface
067                                                '@' == c ;
068
069                        }
070                },
071                HEADER {
072                        @Override
073                        public boolean isAllowed(int c) 
074                        {
075                                return isPchar(c) ||
076                                                '#' == c ||
077                                                '&' == c ||
078                                                '+' == c ||
079                                                '^' == c ||
080                                                '`' == c ||
081                                                '|' ==c;
082                        }
083                };
084                //@formatter:on
085
086                /**
087                 * Indicates whether the given character is allowed in this URI component.
088                 * @return {@code true} if the character is allowed; {@code false} otherwise
089                 */
090                public abstract boolean isAllowed(int c);
091
092                /**
093                 * Indicates whether the given character is in the {@code ALPHA} set.
094                 * @see <a href="https://www.ietf.org/rfc/rfc3986.txt">RFC 3986, appendix A</a>
095                 */
096                protected boolean isAlpha(int c)
097                {
098                        return (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z');
099                }
100
101                /**
102                 * Indicates whether the given character is in the {@code DIGIT} set.
103                 * @see <a href="https://www.ietf.org/rfc/rfc3986.txt">RFC 3986, appendix A</a>
104                 */
105                protected boolean isDigit(int c)
106                {
107                        return (c >= '0' && c <= '9');
108                }
109
110                /**
111                 * Indicates whether the given character is in the {@code sub-delims} set.
112                 * @see <a href="https://www.ietf.org/rfc/rfc3986.txt">RFC 3986, appendix A</a>
113                 */
114                protected boolean isSubDelimiter(int c)
115                {
116                        return ('!' == c || '$' == c);
117                }
118
119                /**
120                 * Indicates whether the given character is in the {@code unreserved} set.
121                 * @see <a href="https://www.ietf.org/rfc/rfc3986.txt">RFC 3986, appendix A</a>
122                 */
123                protected boolean isUnreserved(int c)
124                {
125                        return (isAlpha(c) || isDigit(c) || '-' == c || '.' == c || '_' == c || '~' == c);
126                }
127
128                /**
129                 * Indicates whether the given character is in the {@code pchar} set.
130                 * @see <a href="https://www.ietf.org/rfc/rfc3986.txt">RFC 3986, appendix A</a>
131                 */
132                protected boolean isPchar(int c)
133                {
134                        return (isUnreserved(c) || isSubDelimiter(c));
135                }
136        }
137
138        private final Type type;
139
140        /**
141         * Encoder used to encode name or value components of a query string.<br/>
142         * <br/>
143         *
144         * For example: http://org.acme/notthis/northis/oreventhis?buthis=isokay&amp;asis=thispart
145         */
146        public static final UrlEncoder QUERY_INSTANCE = new UrlEncoder(Type.QUERY);
147
148        /**
149         * Encoder used to encode segments of a path.<br/>
150         * <br/>
151         *
152         * For example: http://org.acme/foo/thispart/orthispart?butnot=thispart
153         */
154        public static final UrlEncoder PATH_INSTANCE = new UrlEncoder(Type.PATH);
155
156        /**
157         * Encoder used to encode a header.
158         */
159        public static final UrlEncoder HEADER_INSTANCE = new UrlEncoder(Type.HEADER);
160
161        /**
162         * Allow subclass to call constructor.
163         *
164         * @param type
165         *            encoder type
166         */
167        protected UrlEncoder(final Type type)
168        {
169                this.type = type;
170        }
171
172        /**
173         * @param s
174         *            string to encode
175         * @param charsetName
176         *            charset to use for encoding
177         * @return encoded string
178         */
179        public String encode(final String s, final String charsetName)
180        {
181                Args.notNull(charsetName, "charsetName");
182
183                try
184                {
185                        return encode(s, Charset.forName(charsetName));
186                }
187                catch (IllegalCharsetNameException | UnsupportedCharsetException e)
188                {
189                        throw new RuntimeException(new UnsupportedEncodingException(charsetName));
190                }
191        }
192
193        /**
194         * @param unsafeInput
195         *            string to encode
196         * @param charset
197         *            encoding to use
198         * @return encoded string
199         */
200        public String encode(final String unsafeInput, final Charset charset)
201        {
202                if (unsafeInput == null || unsafeInput.isEmpty())
203                {
204                        return unsafeInput;
205                }
206
207                Args.notNull(charset, "charset");
208
209                final byte[] bytes = unsafeInput.getBytes(charset);
210                boolean original = true;
211                for (final byte b : bytes)
212                {
213                        if (!type.isAllowed(b) || b == ' ' || b == '\0')
214                        {
215                                original = false;
216                                break;
217                        }
218                }
219                if (original)
220                {
221                        return unsafeInput;
222                }
223
224                final ByteArrayOutputStream bos = new ByteArrayOutputStream(bytes.length);
225                for (final byte b : bytes)
226                {
227                        if (type.isAllowed(b))
228                        {
229                                if (b == ' ')
230                                {
231                                        bos.write('+');
232                                }
233                                else
234                                {
235                                        bos.write(b);
236                                }
237                        }
238                        else
239                        {
240                                if (b == '\0')
241                                {
242                                        bos.writeBytes("NULL".getBytes(charset));
243                                }
244                                else
245                                {
246                                        bos.write('%');
247                                        bos.write(Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, 16)));
248                                        bos.write(Character.toUpperCase(Character.forDigit(b & 0xF, 16)));
249                                }
250                        }
251                }
252                return bos.toString(charset);
253        }
254
255}