View Javadoc
1   /*
2    *  Licensed to the Apache Software Foundation (ASF) under one
3    *  or more contributor license agreements.  See the NOTICE file
4    *  distributed with this work for additional information
5    *  regarding copyright ownership.  The ASF licenses this file
6    *  to you under the Apache License, Version 2.0 (the
7    *  "License"); you may not use this file except in compliance
8    *  with the License.  You may obtain a copy of the License at
9    * 
10   *    https://www.apache.org/licenses/LICENSE-2.0
11   * 
12   *  Unless required by applicable law or agreed to in writing,
13   *  software distributed under the License is distributed on an
14   *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   *  KIND, either express or implied.  See the License for the
16   *  specific language governing permissions and limitations
17   *  under the License.
18   * 
19   */
20  package org.apache.directory.api.util;
21  
22  
23  import java.io.IOException;
24  import java.io.ObjectInput;
25  import java.io.ObjectOutput;
26  
27  
/**
 * Various unicode manipulation methods that are more efficient than chaining
 * operations: all is done in the same buffer without creating a bunch of string
 * objects.
 * 
 * @author <a href="mailto:dev@directory.apache.org">Apache Directory Project</a>
 */
public final class Unicode
{
    /** The bit which is set in every byte of a multi-bytes sequence */
    private static final int UTF8_MULTI_BYTES_MASK = 0x0080;

    /** Mask and expected value of the lead byte of a two bytes sequence (110x-xxxx) */
    private static final int UTF8_TWO_BYTES_MASK = 0x00E0;
    private static final int UTF8_TWO_BYTES = 0x00C0;

    /** Mask and expected value of the lead byte of a three bytes sequence (1110-xxxx) */
    private static final int UTF8_THREE_BYTES_MASK = 0x00F0;
    private static final int UTF8_THREE_BYTES = 0x00E0;

    /** Mask and expected value of the lead byte of a four bytes sequence (1111-0xxx) */
    private static final int UTF8_FOUR_BYTES_MASK = 0x00F8;
    private static final int UTF8_FOUR_BYTES = 0x00F0;

    /** Mask and expected value of the lead byte of a five bytes sequence (1111-10xx) */
    private static final int UTF8_FIVE_BYTES_MASK = 0x00FC;
    private static final int UTF8_FIVE_BYTES = 0x00F8;

    /** Mask and expected value of the lead byte of a six bytes sequence (1111-110x) */
    private static final int UTF8_SIX_BYTES_MASK = 0x00FE;
    private static final int UTF8_SIX_BYTES = 0x00FC;

    /** The six payload bits carried by every byte following the lead byte (10xx-xxxx) */
    private static final int UTF8_PAYLOAD_MASK = 0x003F;

    /**
     * The maximum number of chars written by a single writeUTF call : a char needs
     * at most 3 bytes and a single call can't write more than 65535 bytes (65535/3).
     */
    private static final int MAX_CHARS_PER_CHUNK = 21845;

    /** %01-%27 %2B-%5B %5D-%7F */
    private static final boolean[] UNICODE_SUBSET =
        {
            // '\0'
            false, true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            // '(', ')', '*'
            false, false, false, true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            // '\'
            true,  true,  true,  true,  false, true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
        };

    /** No bit set in this mask : the char is encoded with one byte */
    private static final int CHAR_ONE_BYTE_MASK = 0xFFFFFF80;

    /** No bit set in this mask : the char is encoded with at most two bytes */
    private static final int CHAR_TWO_BYTES_MASK = 0xFFFFF800;


    private Unicode()
    {
    }


    /**
     * Count the number of bytes used by the encoded char starting at the given
     * position. This can be from 1 to 6. Only the lead byte is inspected, the
     * following bytes are neither read nor validated.
     *
     * @param bytes The bytes to read
     * @param pos Position to start counting. It must be a valid start of a
     *            encoded char !
     * @return The number of bytes used by the encoded char, or -1 if the array
     *         is null or if the byte at this position is not a valid lead byte
     * @throws ArrayIndexOutOfBoundsException If pos is not a valid index in bytes
     */
    public static int countBytesPerChar( byte[] bytes, int pos )
    {
        if ( bytes == null )
        {
            return -1;
        }

        int lead = bytes[pos];

        if ( ( lead & UTF8_MULTI_BYTES_MASK ) == 0 )
        {
            return 1;
        }

        if ( ( lead & UTF8_TWO_BYTES_MASK ) == UTF8_TWO_BYTES )
        {
            return 2;
        }

        if ( ( lead & UTF8_THREE_BYTES_MASK ) == UTF8_THREE_BYTES )
        {
            return 3;
        }

        if ( ( lead & UTF8_FOUR_BYTES_MASK ) == UTF8_FOUR_BYTES )
        {
            return 4;
        }

        if ( ( lead & UTF8_FIVE_BYTES_MASK ) == UTF8_FIVE_BYTES )
        {
            return 5;
        }

        if ( ( lead & UTF8_SIX_BYTES_MASK ) == UTF8_SIX_BYTES )
        {
            return 6;
        }

        // A continuation byte (10xx-xxxx), 0xFE or 0xFF : not a lead byte
        return -1;
    }


    /**
     * Return the Unicode char which is coded in the bytes at position 0.
     *
     * @param bytes The byte[] representation of an Unicode string.
     * @return The first char found.
     */
    public static char bytesToChar( byte[] bytes )
    {
        return bytesToChar( bytes, 0 );
    }


    /**
     * Return the Unicode char which is coded in the bytes at the given
     * position.
     * <p>
     * The returned value is a 16 bits char : when the sequence encodes a value
     * which does not fit in 16 bits (four bytes and more), only the 16 lower
     * bits of this value are returned. The bytes following the lead byte are
     * not validated, only their 6 lower bits are used.
     *
     * @param bytes The byte[] representation of an Unicode string.
     * @param pos The current position to start decoding the char
     * @return The decoded char, or (char)-1 if the array is null or if the byte
     *         at this position is not a valid lead byte
     * @throws ArrayIndexOutOfBoundsException If pos is not a valid index in
     *         bytes, or if the sequence is truncated
     */
    public static char bytesToChar( byte[] bytes, int pos )
    {
        int nbBytes = countBytesPerChar( bytes, pos );

        if ( nbBytes < 0 )
        {
            return ( char ) -1;
        }

        if ( nbBytes == 1 )
        {
            return ( char ) bytes[pos];
        }

        // The lead byte of a N bytes sequence starts with N bits set followed
        // by a 0 : its payload is made of the ( 8 - ( N + 1 ) ) lower bits.
        int value = bytes[pos] & ( 0xFF >> ( nbBytes + 1 ) );

        // Each following byte (10xx-xxxx) brings 6 more bits
        for ( int i = 1; i < nbBytes; i++ )
        {
            value = ( value << 6 ) | ( bytes[pos + i] & UTF8_PAYLOAD_MASK );
        }

        return ( char ) value;
    }


    /**
     * Return the number of bytes needed to encode a char. As a char is a 16
     * bits value, the result is always 1, 2 or 3.
     *
     * @param car The character to be encoded
     * @return The number of bytes needed to hold the char
     */
    public static int countNbBytesPerChar( char car )
    {
        if ( ( car & CHAR_ONE_BYTE_MASK ) == 0 )
        {
            return 1;
        }

        if ( ( car & CHAR_TWO_BYTES_MASK ) == 0 )
        {
            return 2;
        }

        // A char can't have a bit set above 0xFFFF, three bytes are always enough
        return 3;
    }


    /**
     * Count the number of bytes needed to encode the given char[], each char
     * being encoded on its own (see {@link #countNbBytesPerChar(char)}). A
     * surrogate pair is thus counted as two chars of three bytes each.
     *
     * @param chars The char array to encode
     * @return The number of bytes needed to encode the char array, 0 if it is null
     */
    public static int countBytes( char[] chars )
    {
        if ( chars == null )
        {
            return 0;
        }

        int nbBytes = 0;

        for ( char car : chars )
        {
            nbBytes += countNbBytesPerChar( car );
        }

        return nbBytes;
    }


    /**
     * Count the number of chars included in the given byte[]. Each encoded
     * sequence, whatever its length, is counted as one char.
     * <p>
     * A byte which is not a valid lead byte where one is expected is counted
     * as one char and skipped, so that the method always terminates.
     *
     * @param bytes The byte array to decode
     * @return The number of chars in the byte array, 0 if it is null
     */
    public static int countChars( byte[] bytes )
    {
        if ( bytes == null )
        {
            return 0;
        }

        int nbChars = 0;
        int currentPos = 0;

        while ( currentPos < bytes.length )
        {
            int nbBytes = countBytesPerChar( bytes, currentPos );

            // countBytesPerChar returns -1 on an invalid lead byte : adding it
            // to the position would move backward and loop forever (or read
            // before the start of the array), so we step over the faulty byte.
            currentPos += ( nbBytes > 0 ) ? nbBytes : 1;
            nbChars++;
        }

        return nbChars;
    }


    /**
     * Encode a char into the bytes representing it : one byte up to 0x007F,
     * two bytes up to 0x07FF, three bytes above.
     *
     * @param car The character to be transformed to an array of bytes
     *
     * @return The byte array representing the char
     */
    public static byte[] charToBytes( char car )
    {
        if ( car <= 0x007F )
        {
            // Single byte char : 0zzz-zzzz
            return new byte[]
                {
                    ( byte ) car
                };
        }

        if ( car <= 0x07FF )
        {
            // Two bytes char : 110x-xxyy 10zz-zzzz
            return new byte[]
                {
                    ( byte ) ( 0x00C0 + ( ( car & 0x07C0 ) >> 6 ) ),
                    ( byte ) ( 0x0080 + ( car & 0x3F ) )
                };
        }

        // Three bytes char : 1110-tttt 10xx-xxyy 10zz-zzzz
        return new byte[]
            {
                ( byte ) ( 0x00E0 + ( ( car & 0xF000 ) >> 12 ) ),
                ( byte ) ( 0x0080 + ( ( car & 0x0FC0 ) >> 6 ) ),
                ( byte ) ( 0x0080 + ( car & 0x3F ) )
            };
    }


    /**
     * Check if the current char is in the unicodeSubset : all chars but
     * '\0', '(', ')', '*' and '\'
     *
     * @param str The string to check
     * @param pos Position of the current char
     * @return True if the current char is in the unicode subset, false if it is
     *         not, if the string is null or if the position is out of the string
     */
    public static boolean isUnicodeSubset( String str, int pos )
    {
        if ( ( str == null ) || ( str.length() <= pos ) || ( pos < 0 ) )
        {
            return false;
        }

        char c = str.charAt( pos );

        return ( c > 127 ) || UNICODE_SUBSET[c];
    }


    /**
     * Check if the current char is in the unicodeSubset : all chars but
     * '\0', '(', ')', '*' and '\'
     *
     * @param c The char to check
     * @return True if the current char is in the unicode subset
     */
    public static boolean isUnicodeSubset( char c )
    {
        return ( c > 127 ) || UNICODE_SUBSET[c];
    }


    /**
     * Check if the current byte is in the unicodeSubset : all chars but
     * '\0', '(', ')', '*' and '\'
     *
     * @param b The byte to check
     * @return True if the current byte is in the unicode subset
     */
    public static boolean isUnicodeSubset( byte b )
    {
        // A negative byte has its high bit set : it belongs to a multi-bytes
        // sequence, which is always part of the subset
        return ( b < 0 ) || UNICODE_SUBSET[b];
    }


    /**
     *
     * Writes four bytes of length information to the output stream, followed by the modified UTF-8 representation
     * of every character in the string str. If str is null, the string value 'null' is written with a length of 0
     * instead of throwing an NullPointerException. Each character in the string is converted to a group of one,
     * two, or three bytes, depending on the value of the character.
     *
     * Due to given restrictions (total number of written bytes in a row can't exceed 65535) the total length (in
     * chars) is written in the length information (four bytes (writeInt)) and the string is split into smaller parts
     * if necessary and written. As each character may be converted to a group of maximum 3 bytes and 65535 bytes
     * can be written at maximum we're on the safe side when writing a chunk of at most 21845 (65535/3) characters at
     * once.
     *
     * At least one chunk is always written (an empty one for an empty string), and the chunks hold exactly the
     * announced number of chars : this is what {@link #readUTF(ObjectInput)} relies on to know when to stop.
     *
     * See also {@link java.io.DataOutput#writeUTF(String)}.
     *
     * @param objectOutput The objectOutput to write to
     * @param str The value to write
     * @throws java.io.IOException If the value can't be written to the file
     */
    public static void writeUTF( ObjectOutput objectOutput, String str ) throws IOException
    {
        // Write a 'null' string
        if ( str == null )
        {
            objectOutput.writeInt( 0 );
            objectOutput.writeUTF( "null" );

            return;
        }

        int length = str.length();

        // Write length of string
        objectOutput.writeInt( length );

        // Write the string in portions not larger than 21845 characters. The
        // loop stops as soon as every char has been written : no trailing
        // empty portion must be emitted, as the reader would never consume it.
        int start = 0;

        do
        {
            int end = ( length - start > MAX_CHARS_PER_CHUNK ) ? start + MAX_CHARS_PER_CHUNK : length;

            objectOutput.writeUTF( str.substring( start, end ) );
            start = end;
        }
        while ( start < length );
    }


    /**
     *
     * Reads in a string that has been written by {@link #writeUTF(ObjectOutput, String)}.
     *
     * First, four bytes are read (readInt) : they give the length of the string, in chars. Then a first chunk
     * encoded in modified UTF-8 format is read (readUTF). If the length is 0 and this chunk is the string 'null',
     * a null value is returned. Otherwise, chunks are read and concatenated until the number of chars read
     * reaches the announced length.
     *
     * See also {@link java.io.DataInput#readUTF()}.
     *
     * @param objectInput The objectInput to read from
     * @return The read string, which may be null
     * @throws java.io.IOException If the value can't be read
     */
    public static String readUTF( ObjectInput objectInput ) throws IOException
    {
        // Read length of the string
        int strLength = objectInput.readInt();

        // There is always at least one chunk
        String firstChunk = objectInput.readUTF();

        if ( ( strLength == 0 ) && "null".equals( firstChunk ) )
        {
            // The special case of a 'null' string
            return null;
        }

        StringBuilder strBuf = new StringBuilder( firstChunk );

        while ( strLength > strBuf.length() )
        {
            strBuf.append( objectInput.readUTF() );
        }

        return strBuf.toString();
    }
}