Unicode

/*
 *  Licensed to the Apache Software Foundation (ASF) under one
 *  or more contributor license agreements.  See the NOTICE file
 *  distributed with this work for additional information
 *  regarding copyright ownership.  The ASF licenses this file
 *  to you under the Apache License, Version 2.0 (the
 *  "License"); you may not use this file except in compliance
 *  with the License.  You may obtain a copy of the License at
 *
 *    https://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing,
 *  software distributed under the License is distributed on an
 *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 *  KIND, either express or implied.  See the License for the
 *  specific language governing permissions and limitations
 *  under the License.
 *
 */
package org.apache.directory.api.util;


import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;


/**
 * Various unicode manipulation methods that are more efficient than chaining
 * operations: all is done in the same buffer without creating a bunch of string
 * objects.
 *
 * @author <a href="mailto:dev@directory.apache.org">Apache Directory Project</a>
 */
public final class Unicode
{
    private static final int UTF8_MULTI_BYTES_MASK = 0x0080;
    private static final int UTF8_TWO_BYTES_MASK = 0x00E0;
    private static final int UTF8_TWO_BYTES = 0x00C0;
    private static final int UTF8_THREE_BYTES_MASK = 0x00F0;
    private static final int UTF8_THREE_BYTES = 0x00E0;
    private static final int UTF8_FOUR_BYTES_MASK = 0x00F8;
    private static final int UTF8_FOUR_BYTES = 0x00F0;
    private static final int UTF8_FIVE_BYTES_MASK = 0x00FC;
    private static final int UTF8_FIVE_BYTES = 0x00F8;
    private static final int UTF8_SIX_BYTES_MASK = 0x00FE;
    private static final int UTF8_SIX_BYTES = 0x00FC;

    /** Mask selecting the payload bits of an UTF-8 continuation byte (10xx-xxxx) */
    private static final int UTF8_CONTINUATION_PAYLOAD = 0x003F;

    /** %01-%27 %2B-%5B %5D-%7F */
    private static final boolean[] UNICODE_SUBSET =
        {
            // '\0'
            false, true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            // '(', ')', '*'
            false, false, false, true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            // '\'
            true,  true,  true,  true,  false, true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
        };
    private static final int CHAR_ONE_BYTE_MASK = 0xFFFFFF80;
    private static final int CHAR_TWO_BYTES_MASK = 0xFFFFF800;

    /**
     * The largest number of chars written by a single call to
     * {@link java.io.DataOutput#writeUTF(String)} : that method can't emit more
     * than 65535 bytes, and a char needs at most 3 bytes in modified UTF-8.
     */
    private static final int MAX_UTF_CHUNK_CHARS = 65535 / 3;

    private Unicode()
    {
    }

    /**
     * Count the number of bytes used by the encoded char starting at the given
     * position. This can be from 1 to 6. Only the lead byte is inspected, the
     * continuation bytes are not validated.
     *
     * @param bytes The bytes to read
     * @param pos Position to start counting. It must be a valid start of a
     *            encoded char !
     * @return The number of bytes to create a char, or -1 if the array is null,
     *         the position is outside of the array, or the byte at this position
     *         is not a valid lead byte.
     */
    public static int countBytesPerChar( byte[] bytes, int pos )
    {
        if ( ( bytes == null ) || ( pos < 0 ) || ( pos >= bytes.length ) )
        {
            return -1;
        }

        int lead = bytes[pos];

        if ( ( lead & UTF8_MULTI_BYTES_MASK ) == 0 )
        {
            return 1;
        }
        else if ( ( lead & UTF8_TWO_BYTES_MASK ) == UTF8_TWO_BYTES )
        {
            return 2;
        }
        else if ( ( lead & UTF8_THREE_BYTES_MASK ) == UTF8_THREE_BYTES )
        {
            return 3;
        }
        else if ( ( lead & UTF8_FOUR_BYTES_MASK ) == UTF8_FOUR_BYTES )
        {
            return 4;
        }
        else if ( ( lead & UTF8_FIVE_BYTES_MASK ) == UTF8_FIVE_BYTES )
        {
            return 5;
        }
        else if ( ( lead & UTF8_SIX_BYTES_MASK ) == UTF8_SIX_BYTES )
        {
            return 6;
        }
        else
        {
            // A continuation byte (10xx-xxxx) or 0xFE/0xFF : not a lead byte
            return -1;
        }
    }


    /**
     * Return the Unicode char which is coded in the bytes at position 0.
     *
     * @param bytes The byte[] representation of an Unicode string.
     * @return The first char found, or ( char ) -1 if no char can be decoded.
     */
    public static char bytesToChar( byte[] bytes )
    {
        return bytesToChar( bytes, 0 );
    }


    /**
     * Return the Unicode char which is coded in the bytes at the given
     * position.
     * <p>
     * A Java char is only 16 bits wide : for sequences of four bytes or more,
     * the value returned is the 16 lower bits of the decoded code point.
     *
     * @param bytes The byte[] representation of an Unicode string.
     * @param pos The current position to start decoding the char
     * @return The decoded char, or ( char ) -1 if no char can be decoded : null
     *         array, position outside of the array, invalid lead byte or
     *         sequence truncated by the end of the array.
     */
    public static char bytesToChar( byte[] bytes, int pos )
    {
        int nbBytes = countBytesPerChar( bytes, pos );

        // The second test is written as a subtraction so that it can't overflow
        if ( ( nbBytes == -1 ) || ( nbBytes > bytes.length - pos ) )
        {
            return ( char ) -1;
        }

        if ( nbBytes == 1 )
        {
            return ( char ) bytes[pos];
        }

        // The lead byte of a N bytes sequence starts with N bits set followed
        // by a 0 : its payload is made of the remaining ( 7 - N ) lower bits.
        int codePoint = bytes[pos] & ( 0x00FF >> ( nbBytes + 1 ) );

        // Each continuation byte (10xx-xxxx) brings 6 more bits
        for ( int i = 1; i < nbBytes; i++ )
        {
            codePoint = ( codePoint << 6 ) | ( bytes[pos + i] & UTF8_CONTINUATION_PAYLOAD );
        }

        return ( char ) codePoint;
    }


    /**
     * Return the number of bytes needed to encode a single UTF-16 char. As a
     * char can't be greater than 0xFFFF, the result is always 1, 2 or 3.
     *
     * @param car The character to be encoded
     * @return The number of bytes to hold the char.
     */
    public static int countNbBytesPerChar( char car )
    {
        if ( ( car & CHAR_ONE_BYTE_MASK ) == 0 )
        {
            return 1;
        }
        else if ( ( car & CHAR_TWO_BYTES_MASK ) == 0 )
        {
            return 2;
        }
        else
        {
            return 3;
        }
    }


    /**
     * Count the number of bytes needed to encode the given char[] in UTF-8.
     * A high surrogate immediately followed by a low surrogate is a single
     * supplementary code point, which needs 4 bytes. An unpaired surrogate is
     * counted as any other char of its range (3 bytes).
     *
     * @param chars The char array to encode
     * @return The number of bytes needed by the char array, 0 if it is null
     */
    public static int countBytes( char[] chars )
    {
        if ( chars == null )
        {
            return 0;
        }

        int nbBytes = 0;
        int currentPos = 0;

        while ( currentPos < chars.length )
        {
            char current = chars[currentPos];

            if ( Character.isHighSurrogate( current )
                && ( currentPos + 1 < chars.length )
                && Character.isLowSurrogate( chars[currentPos + 1] ) )
            {
                // Two UTF-16 chars are consumed for one 4 bytes sequence
                nbBytes += 4;
                currentPos += 2;
            }
            else
            {
                nbBytes += countNbBytesPerChar( current );
                currentPos++;
            }
        }

        return nbBytes;
    }


    /**
     * Count the number of encoded chars included in the given byte[]. Each
     * sequence, whatever its length, counts for one. A byte which is not a
     * valid lead byte is counted as one char and skipped.
     *
     * @param bytes The byte array to decode
     * @return The number of char in the byte array, 0 if it is null
     */
    public static int countChars( byte[] bytes )
    {
        if ( bytes == null )
        {
            return 0;
        }

        int nbChars = 0;
        int currentPos = 0;

        while ( currentPos < bytes.length )
        {
            int nbBytes = countBytesPerChar( bytes, currentPos );

            // Always move forward : adding the -1 returned for an invalid lead
            // byte would make the position go backward and never terminate.
            currentPos += ( nbBytes > 0 ) ? nbBytes : 1;
            nbChars++;
        }

        return nbChars;
    }


    /**
     * Return the UTF-8 bytes encoding the given char. Surrogates are encoded
     * individually, like any other char greater than 0x07FF, on three bytes.
     *
     * @param car The character to be transformed to an array of bytes
     *
     * @return The byte array representing the char (1 to 3 bytes)
     */
    public static byte[] charToBytes( char car )
    {
        if ( car <= 0x007F )
        {
            // Single byte char : 0xxx-xxxx
            return new byte[]
                { ( byte ) car };
        }
        else if ( car <= 0x07FF )
        {
            // Two bytes char : 110x-xxxx 10xx-xxxx
            return new byte[]
                {
                    ( byte ) ( 0x00C0 + ( ( car & 0x07C0 ) >> 6 ) ),
                    ( byte ) ( 0x0080 + ( car & 0x3F ) )
                };
        }
        else
        {
            // Three bytes char : 1110-xxxx 10xx-xxxx 10xx-xxxx
            return new byte[]
                {
                    ( byte ) ( 0x00E0 + ( ( car & 0xF000 ) >> 12 ) ),
                    ( byte ) ( 0x0080 + ( ( car & 0x0FC0 ) >> 6 ) ),
                    ( byte ) ( 0x0080 + ( car & 0x3F ) )
                };
        }
    }


    /**
     * Check if the current char is in the unicodeSubset : all chars but
     * '\0', '(', ')', '*' and '\'
     *
     * @param str The string to check
     * @param pos Position of the current char
     * @return True if the current char is in the unicode subset, false if it is
     *         not, if the string is null or if the position is outside of it
     */
    public static boolean isUnicodeSubset( String str, int pos )
    {
        if ( ( str == null ) || ( str.length() <= pos ) || ( pos < 0 ) )
        {
            return false;
        }

        return isUnicodeSubset( str.charAt( pos ) );
    }


    /**
     * Check if the current char is in the unicodeSubset : all chars but
     * '\0', '(', ')', '*' and '\'
     *
     * @param c The char to check
     * @return True if the current char is in the unicode subset
     */
    public static boolean isUnicodeSubset( char c )
    {
        return ( c > 127 ) || UNICODE_SUBSET[c];
    }


    /**
     * Check if the current byte is in the unicodeSubset : all chars but
     * '\0', '(', ')', '*' and '\'
     *
     * @param b The byte to check
     * @return True if the current byte is in the unicode subset
     */
    public static boolean isUnicodeSubset( byte b )
    {
        // A negative byte is part of a multi-bytes sequence, which is always
        // in the subset. A byte can't be greater than 127.
        return ( b < 0 ) || UNICODE_SUBSET[b];
    }


    /**
     *
     * Writes four bytes of length information to the output stream, followed by the modified UTF-8 representation
     * of every character in the string str. If str is null, the string value 'null' is written with a length of 0
     * instead of throwing a NullPointerException. Each character in the string is converted to a group of one,
     * two, or three bytes, depending on the value of the character.
     *
     * Due to given restrictions (total number of written bytes in a row can't exceed 65535) the total length, in
     * chars, is written in the length information (four bytes (writeInt)) and the string is split into smaller
     * parts if necessary and written. As each character may be converted to a group of maximum 3 bytes and 65535
     * bytes can be written at maximum we're on the safe side when writing a chunk of only 21845 (65535/3)
     * characters at once.
     *
     * Exactly the chunks consumed by {@link #readUTF(ObjectInput)} are written : one chunk for an empty string, and
     * no trailing empty chunk when the length is a multiple of 21845, as the reader would leave it in the stream.
     *
     * See also {@link java.io.DataOutput#writeUTF(String)}.
     *
     * @param objectOutput The objectOutput to write to
     * @param str The value to write
     * @throws java.io.IOException If the value can't be written to the file
     */
    public static void writeUTF( ObjectOutput objectOutput, String str ) throws IOException
    {
        // Write a 'null' string
        if ( str == null )
        {
            objectOutput.writeInt( 0 );
            objectOutput.writeUTF( "null" );

            return;
        }

        // Write length of string
        int length = str.length();
        objectOutput.writeInt( length );

        // Write the string in portions not larger than 21845 characters. The
        // reader always reads a first chunk, so one is written even when the
        // string is empty.
        int start = 0;

        do
        {
            int end = start + Math.min( MAX_UTF_CHUNK_CHARS, length - start );
            objectOutput.writeUTF( str.substring( start, end ) );
            start = end;
        }
        while ( start < length );
    }


    /**
     *
     * Reads in a string that has been written by {@link #writeUTF(ObjectOutput, String)}.
     *
     * First, four bytes are read (readInt) : this integer value is the number of chars of the string. Then one or
     * more chunks encoded using the modified UTF-8 format are read (readUTF) and concatenated, until the expected
     * number of chars is reached. A length of 0 followed by the chunk 'null' stands for a null string.
     *
     * See also {@link java.io.DataInput#readUTF()}.
     *
     * @param objectInput The objectInput to read from
     * @return The read string, which may be null
     * @throws java.io.IOException If the value can't be read
     */
    public static String readUTF( ObjectInput objectInput ) throws IOException
    {
        // Read length of the string
        int strLength = objectInput.readInt();

        // Start reading the string : there is always a first chunk
        String firstChunk = objectInput.readUTF();

        if ( ( strLength == 0 ) && "null".equals( firstChunk ) )
        {
            // The special case of a 'null' string
            return null;
        }

        StringBuilder strBuf = new StringBuilder( firstChunk );

        while ( strLength > strBuf.length() )
        {
            strBuf.append( objectInput.readUTF() );
        }

        return strBuf.toString();
    }
}