001/*
002 *  Licensed to the Apache Software Foundation (ASF) under one
003 *  or more contributor license agreements.  See the NOTICE file
004 *  distributed with this work for additional information
005 *  regarding copyright ownership.  The ASF licenses this file
006 *  to you under the Apache License, Version 2.0 (the
007 *  "License"); you may not use this file except in compliance
008 *  with the License.  You may obtain a copy of the License at
009 * 
010 *    https://www.apache.org/licenses/LICENSE-2.0
011 * 
012 *  Unless required by applicable law or agreed to in writing,
013 *  software distributed under the License is distributed on an
014 *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 *  KIND, either express or implied.  See the License for the
016 *  specific language governing permissions and limitations
017 *  under the License.
018 * 
019 */
020package org.apache.directory.api.util;
021
022
023import java.io.IOException;
024import java.io.ObjectInput;
025import java.io.ObjectOutput;
026
027
/**
 * Various unicode manipulation methods that are more efficient than chaining
 * operations: all is done in the same buffer without creating a bunch of string
 * objects.
 * 
 * @author <a href="mailto:dev@directory.apache.org">Apache Directory Project</a>
 */
public final class Unicode
{
    // Masks and expected values used to classify the lead byte of an UTF-8
    // sequence : ( leadByte & XXX_MASK ) == XXX means "sequence of XXX bytes".
    private static final int UTF8_MULTI_BYTES_MASK = 0x0080;
    private static final int UTF8_TWO_BYTES_MASK = 0x00E0;
    private static final int UTF8_TWO_BYTES = 0x00C0;
    private static final int UTF8_THREE_BYTES_MASK = 0x00F0;
    private static final int UTF8_THREE_BYTES = 0x00E0;
    private static final int UTF8_FOUR_BYTES_MASK = 0x00F8;
    private static final int UTF8_FOUR_BYTES = 0x00F0;
    private static final int UTF8_FIVE_BYTES_MASK = 0x00FC;
    private static final int UTF8_FIVE_BYTES = 0x00F8;
    private static final int UTF8_SIX_BYTES_MASK = 0x00FE;
    private static final int UTF8_SIX_BYTES = 0x00FC;

    /** Mask selecting the 6 payload bits of an UTF-8 continuation byte (10xx-xxxx) */
    private static final int UTF8_CONTINUATION_PAYLOAD = 0x003F;

    /** %01-%27 %2B-%5B %5D-%7F */
    private static final boolean[] UNICODE_SUBSET =
        {
            // '\0'
            false, true,  true,  true,  true,  true,  true,  true, 
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            // '(', ')', '*'
            false, false, false, true,  true,  true,  true,  true, 
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            // '\'
            true,  true,  true,  true,  false, true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
        };

    // A char fits in N bytes when none of the bits selected by the N-bytes mask is set
    private static final int CHAR_ONE_BYTE_MASK = 0xFFFFFF80;
    private static final int CHAR_TWO_BYTES_MASK = 0xFFFFF800;
    private static final int CHAR_THREE_BYTES_MASK = 0xFFFF0000;
    private static final int CHAR_FOUR_BYTES_MASK = 0xFFE00000;

    /**
     * Maximum number of chars written by a single writeUTF call : a char needs at most
     * 3 bytes in modified UTF-8, and one call can't write more than 65535 bytes (65535/3).
     */
    private static final int UTF_CHUNK_CHARS = 21845;


    /**
     * Private constructor : this is an utility class, it is never instantiated.
     */
    private Unicode()
    {
    }


    /**
     * Count the number of bytes of the encoded char starting at the given
     * position. This can be from 1 to 6. Only the lead byte is inspected, the
     * following bytes are not checked.
     *
     * @param bytes The bytes to read
     * @param pos Position to start counting. It must be a valid start of a
     *            encoded char ! An ArrayIndexOutOfBoundsException is thrown if
     *            it is outside of the array.
     * @return The number of bytes to create a char, or -1 if the array is null or
     *         if the byte at the given position is not a valid lead byte.
     *         TODO : Should stop after the third byte, as a char is only
     *         2 bytes long.
     */
    public static int countBytesPerChar( byte[] bytes, int pos )
    {
        if ( bytes == null )
        {
            return -1;
        }

        int lead = bytes[pos];

        if ( ( lead & UTF8_MULTI_BYTES_MASK ) == 0 )
        {
            // 0xxx-xxxx : plain ASCII
            return 1;
        }
        else if ( ( lead & UTF8_TWO_BYTES_MASK ) == UTF8_TWO_BYTES )
        {
            return 2;
        }
        else if ( ( lead & UTF8_THREE_BYTES_MASK ) == UTF8_THREE_BYTES )
        {
            return 3;
        }
        else if ( ( lead & UTF8_FOUR_BYTES_MASK ) == UTF8_FOUR_BYTES )
        {
            return 4;
        }
        else if ( ( lead & UTF8_FIVE_BYTES_MASK ) == UTF8_FIVE_BYTES )
        {
            return 5;
        }
        else if ( ( lead & UTF8_SIX_BYTES_MASK ) == UTF8_SIX_BYTES )
        {
            return 6;
        }
        else
        {
            // 10xx-xxxx (continuation byte), 0xFE or 0xFF : not a lead byte
            return -1;
        }
    }


    /**
     * Return the Unicode char which is coded in the bytes at position 0.
     *
     * @param bytes The byte[] representation of an Unicode string.
     * @return The first char found.
     */
    public static char bytesToChar( byte[] bytes )
    {
        return bytesToChar( bytes, 0 );
    }


    /**
     * Return the Unicode char which is coded in the bytes at the given
     * position.
     * <p>
     * The result is a Java char, so only the 16 lower bits of the decoded value
     * are kept : sequences of 4 bytes or more are truncated. The continuation
     * bytes are not validated, and an ArrayIndexOutOfBoundsException is thrown
     * if the sequence runs past the end of the array.
     *
     * @param bytes The byte[] representation of an Unicode string.
     * @param pos The current position to start decoding the char
     * @return The decoded char, or ( char ) -1 (0xFFFF) if no char can be decoded
     *         (null array or invalid lead byte). TODO : Should
     *         stop after the third byte, as a char is only 2 bytes long.
     */
    public static char bytesToChar( byte[] bytes, int pos )
    {
        if ( bytes == null )
        {
            return ( char ) -1;
        }

        int lead = bytes[pos];

        if ( ( lead & UTF8_MULTI_BYTES_MASK ) == 0 )
        {
            // Single byte char
            return ( char ) lead;
        }
        else if ( ( lead & UTF8_TWO_BYTES_MASK ) == UTF8_TWO_BYTES )
        {
            // Two bytes char
            // 110x-xxyy 10zz-zzzz -> 0000-0xxx yyzz-zzzz
            return ( char ) ( ( ( lead & 0x1F ) << 6 )
                + ( bytes[pos + 1] & UTF8_CONTINUATION_PAYLOAD ) );
        }
        else if ( ( lead & UTF8_THREE_BYTES_MASK ) == UTF8_THREE_BYTES )
        {
            // Three bytes char
            // 1110-tttt 10xx-xxyy 10zz-zzzz -> tttt-xxxx yyzz-zzzz (FF FF)
            return ( char ) ( ( ( lead & 0x0F ) << 12 )
                + ( ( bytes[pos + 1] & UTF8_CONTINUATION_PAYLOAD ) << 6 )
                + ( bytes[pos + 2] & UTF8_CONTINUATION_PAYLOAD ) );
        }
        else if ( ( lead & UTF8_FOUR_BYTES_MASK ) == UTF8_FOUR_BYTES )
        {
            // Four bytes char
            // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 000t-ttuu vvvv-xxxx yyzz-zzzz (1FFFFF)
            // The cast to char only keeps vvvv-xxxx yyzz-zzzz
            return ( char ) ( ( ( lead & 0x07 ) << 18 )
                + ( ( bytes[pos + 1] & UTF8_CONTINUATION_PAYLOAD ) << 12 )
                + ( ( bytes[pos + 2] & UTF8_CONTINUATION_PAYLOAD ) << 6 )
                + ( bytes[pos + 3] & UTF8_CONTINUATION_PAYLOAD ) );
        }
        else if ( ( lead & UTF8_FIVE_BYTES_MASK ) == UTF8_FIVE_BYTES )
        {
            // Five bytes char
            // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
            // 0000-00tt uuuu-uuvv wwww-xxxx yyzz-zzzz (03 FF FF FF)
            // The cast to char only keeps wwww-xxxx yyzz-zzzz
            return ( char ) ( ( ( lead & 0x03 ) << 24 )
                + ( ( bytes[pos + 1] & UTF8_CONTINUATION_PAYLOAD ) << 18 )
                + ( ( bytes[pos + 2] & UTF8_CONTINUATION_PAYLOAD ) << 12 )
                + ( ( bytes[pos + 3] & UTF8_CONTINUATION_PAYLOAD ) << 6 )
                + ( bytes[pos + 4] & UTF8_CONTINUATION_PAYLOAD ) );
        }
        else if ( ( lead & UTF8_SIX_BYTES_MASK ) == UTF8_SIX_BYTES )
        {
            // Six bytes char
            // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz
            // -> 0stt-tttt uuuu-uuvv wwww-xxxx yyzz-zzzz (7F FF FF FF)
            // The cast to char only keeps wwww-xxxx yyzz-zzzz
            return ( char ) ( ( ( lead & 0x01 ) << 30 )
                + ( ( bytes[pos + 1] & UTF8_CONTINUATION_PAYLOAD ) << 24 )
                + ( ( bytes[pos + 2] & UTF8_CONTINUATION_PAYLOAD ) << 18 )
                + ( ( bytes[pos + 3] & UTF8_CONTINUATION_PAYLOAD ) << 12 )
                + ( ( bytes[pos + 4] & UTF8_CONTINUATION_PAYLOAD ) << 6 )
                + ( bytes[pos + 5] & UTF8_CONTINUATION_PAYLOAD ) );
        }
        else
        {
            // Not a valid lead byte
            return ( char ) -1;
        }
    }


    /**
     * Return the number of bytes needed to encode an Unicode char.
     * <p>
     * As a Java char never exceeds 0xFFFF, the result is always 1, 2 or 3 : the
     * 4 bytes and -1 branches are kept for completeness but can't be reached.
     *
     * @param car The character to be encoded
     * @return The number of bytes to hold the char. TODO : Should stop after
     *         the third byte, as a char is only 2 bytes long.
     */
    public static int countNbBytesPerChar( char car )
    {
        if ( ( car & CHAR_ONE_BYTE_MASK ) == 0 )
        {
            return 1;
        }
        else if ( ( car & CHAR_TWO_BYTES_MASK ) == 0 )
        {
            return 2;
        }
        else if ( ( car & CHAR_THREE_BYTES_MASK ) == 0 )
        {
            return 3;
        }
        else if ( ( car & CHAR_FOUR_BYTES_MASK ) == 0 )
        {
            return 4;
        }
        else
        {
            return -1;
        }
    }


    /**
     * Count the number of bytes needed to encode the given char[].
     * <p>
     * Each char is counted on its own, the same way {@link #charToBytes(char)}
     * encodes it : each half of a surrogate pair accounts for 3 bytes.
     *
     * @param chars The char array to encode
     * @return The number of bytes needed for the char array, 0 if it is null
     */
    public static int countBytes( char[] chars )
    {
        if ( chars == null )
        {
            return 0;
        }

        int nbBytes = 0;
        int currentPos = 0;

        while ( currentPos < chars.length )
        {
            int nbb = countNbBytesPerChar( chars[currentPos] );

            // If the number of bytes necessary to encode a character is
            // above 3, we will need two UTF-16 chars. (countNbBytesPerChar
            // currently never returns more than 3, so we always move by 1)
            currentPos += ( nbb < 4 ? 1 : 2 );
            nbBytes += nbb;
        }

        return nbBytes;
    }


    /**
     * Count the number of chars included in the given byte[].
     * <p>
     * A byte which is not a valid lead byte (for instance a stray continuation
     * byte) is counted as one char and skipped, so that a malformed input can
     * neither loop forever nor move the cursor before the start of the array.
     *
     * @param bytes The byte array to decode
     * @return The number of char in the byte array, 0 if it is null
     */
    public static int countChars( byte[] bytes )
    {
        if ( bytes == null )
        {
            return 0;
        }

        int nbChars = 0;
        int currentPos = 0;

        while ( currentPos < bytes.length )
        {
            int nbBytes = countBytesPerChar( bytes, currentPos );

            // countBytesPerChar returns -1 on an invalid lead byte : always
            // move forward by at least one byte
            currentPos += ( nbBytes > 0 ? nbBytes : 1 );
            nbChars++;
        }

        return nbChars;
    }


    /**
     * Return the UTF-8 bytes encoding the given char. The char is encoded on
     * its own : one byte up to 0x007F, two bytes up to 0x07FF, three bytes
     * otherwise (this includes each half of a surrogate pair).
     *
     * @param car The character to be transformed to an array of bytes
     *
     * @return The byte array representing the char
     *
     * TODO : Should stop after the third byte, as a char is only 2 bytes long.
     */
    public static byte[] charToBytes( char car )
    {
        if ( car <= 0x007F )
        {
            // Single byte char : 0xxx-xxxx
            return new byte[]
                { ( byte ) car };
        }
        else if ( car <= 0x07FF )
        {
            // Two bytes char : 110x-xxxx 10xx-xxxx
            return new byte[]
                {
                    ( byte ) ( 0x00C0 + ( ( car & 0x07C0 ) >> 6 ) ),
                    ( byte ) ( 0x0080 + ( car & 0x3F ) )
                };
        }
        else
        {
            // Three bytes char : 1110-xxxx 10xx-xxxx 10xx-xxxx
            return new byte[]
                {
                    ( byte ) ( 0x00E0 + ( ( car & 0xF000 ) >> 12 ) ),
                    ( byte ) ( 0x0080 + ( ( car & 0x0FC0 ) >> 6 ) ),
                    ( byte ) ( 0x0080 + ( car & 0x3F ) )
                };
        }
    }


    /**
     * Check if the current char is in the unicodeSubset : all chars but
     * '\0', '(', ')', '*' and '\'
     *
     * @param str The string to check
     * @param pos Position of the current char
     * @return True if the current char is in the unicode subset, false if it
     *         is not, if the string is null or if the position is out of the string
     */
    public static boolean isUnicodeSubset( String str, int pos )
    {
        if ( ( str == null ) || ( str.length() <= pos ) || ( pos < 0 ) )
        {
            return false;
        }

        char c = str.charAt( pos );

        // The table only covers the ASCII range, anything above is accepted
        return ( c > 127 ) || UNICODE_SUBSET[c];
    }


    /**
     * Check if the current char is in the unicodeSubset : all chars but
     * '\0', '(', ')', '*' and '\'
     *
     * @param c The char to check
     * @return True if the current char is in the unicode subset
     */
    public static boolean isUnicodeSubset( char c )
    {
        // The table only covers the ASCII range, anything above is accepted
        return ( c > 127 ) || UNICODE_SUBSET[c];
    }


    /**
     * Check if the current byte is in the unicodeSubset : all chars but
     * '\0', '(', ')', '*' and '\'
     *
     * @param b The byte to check
     * @return True if the current byte is in the unicode subset
     */
    public static boolean isUnicodeSubset( byte b )
    {
        // A negative byte has its high bit set : it is part of a multi-bytes
        // sequence, which is always accepted. A byte can't be above 127.
        return ( b < 0 ) || UNICODE_SUBSET[b];
    }


    /**
     *
     * Writes four bytes of length information to the output stream, followed by the modified UTF-8 representation
     * of every character in the string str. If str is null, the string value 'null' is written with a length of 0
     * instead of throwing an NullPointerException. Each character in the string s  is converted to a group of one,
     * two, or three bytes, depending on the value of the character.
     *
     * Due to given restrictions (total number of written bytes in a row can't exceed 65535) the total length
     * (a number of chars) is written in the length information (four bytes (writeInt)) and the string is split
     * into smaller parts if necessary and written. As each character may be converted to a group of maximum
     * 3 bytes and 65535 bytes can be written at maximum we're on the safe side when writing a chunk of only
     * 21845 (65535/3) characters at once.
     *
     * The chunks written always sum up exactly to the announced length : an empty chunk is only written for
     * an empty string, so that {@link #readUTF(ObjectInput)} consumes everything which has been written.
     *
     * See also {@link java.io.DataOutput#writeUTF(String)}.
     *
     * @param objectOutput The objectOutput to write to
     * @param str The value to write
     * @throws java.io.IOException If the value can't be written to the file
     */
    public static void writeUTF( ObjectOutput objectOutput, String str ) throws IOException
    {
        // Write a 'null' string
        if ( str == null )
        {
            objectOutput.writeInt( 0 );
            objectOutput.writeUTF( "null" );
            
            return;
        }

        int length = str.length();

        // Write length of string
        objectOutput.writeInt( length );

        // Write the string in portions not larger than 21845 characters.
        // At least one chunk is written, even if the string is empty, as
        // the reader always reads one chunk after the length.
        int start = 0;

        do
        {
            int end = start + Math.min( length - start, UTF_CHUNK_CHARS );
            
            objectOutput.writeUTF( str.substring( start, end ) );
            start = end;
        }
        while ( start < length );
    }


    /**
     *
     * Reads in a string that has been written by {@link #writeUTF(ObjectOutput, String)}.
     *
     * First, four bytes are read (readInt) : this integer value is the number of chars of the string. Then
     * the chunks, each of them encoded using the modified UTF-8 format, are read (readUTF) and concatenated
     * until this number of chars has been reached. At least one chunk is always read. A length of 0 followed
     * by the 'null' chunk stands for a null String.
     *
     * See also {@link java.io.DataInput#readUTF()}.
     *
     * @param objectInput The objectInput to read from
     * @return The read string, which may be null
     * @throws java.io.IOException If the value can't be read
     */
    public static String readUTF( ObjectInput objectInput ) throws IOException
    {
        // Read length of the string
        int strLength = objectInput.readInt();

        // Start reading the string
        String firstChunk = objectInput.readUTF();

        if ( ( strLength == 0 ) && "null".equals( firstChunk ) )
        {
            // The special case of a 'null' string
            return null;
        }

        StringBuilder strBuf = new StringBuilder( firstChunk );

        // Read the remaining chunks, if any
        while ( strLength > strBuf.length() )
        {
            strBuf.append( objectInput.readUTF() );
        }

        return strBuf.toString();
    }
}