001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * https://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 * 019 */ 020package org.apache.directory.api.util; 021 022 023import java.io.IOException; 024import java.io.ObjectInput; 025import java.io.ObjectOutput; 026 027 028/** 029 * Various unicode manipulation methods that are more efficient than chaining 030 * operations: all is done in the same buffer without creating a bunch of string 031 * objects. 
/**
 * Static helpers that convert between UTF-8 encoded bytes and Java chars,
 * test chars against a restricted character subset, and read/write long
 * strings through {@link ObjectOutput} / {@link ObjectInput}.
 *
 * @author <a href="mailto:dev@directory.apache.org">Apache Directory Project</a>
 */
public final class Unicode
{
    private static final int UTF8_MULTI_BYTES_MASK = 0x0080;
    private static final int UTF8_TWO_BYTES_MASK = 0x00E0;
    private static final int UTF8_TWO_BYTES = 0x00C0;
    private static final int UTF8_THREE_BYTES_MASK = 0x00F0;
    private static final int UTF8_THREE_BYTES = 0x00E0;
    private static final int UTF8_FOUR_BYTES_MASK = 0x00F8;
    private static final int UTF8_FOUR_BYTES = 0x00F0;
    private static final int UTF8_FIVE_BYTES_MASK = 0x00FC;
    private static final int UTF8_FIVE_BYTES = 0x00F8;
    private static final int UTF8_SIX_BYTES_MASK = 0x00FE;
    private static final int UTF8_SIX_BYTES = 0x00FC;

    /** %01-%27 %2B-%5B %5D-%7F */
    private static final boolean[] UNICODE_SUBSET =
        {
            // '\0'
            false, true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            // '(', ')', '*'
            false, false, false, true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            // '\'
            true,  true,  true,  true,  false, true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
        };

    private static final int CHAR_ONE_BYTE_MASK = 0xFFFFFF80;
    private static final int CHAR_TWO_BYTES_MASK = 0xFFFFF800;
    private static final int CHAR_THREE_BYTES_MASK = 0xFFFF0000;
    private static final int CHAR_FOUR_BYTES_MASK = 0xFFE00000;

    /**
     * Maximum number of chars passed to a single writeUTF call. A char needs
     * at most 3 bytes in modified UTF-8 and one call may emit at most 65535
     * bytes, hence 65535 / 3.
     */
    private static final int UTF_CHUNK_SIZE = 21845;


    private Unicode()
    {
    }


    /**
     * Count the number of bytes used by the encoded char starting at the
     * given position, based on its lead byte only. This can be from 1 to 6.
     *
     * @param bytes The bytes to read
     * @param pos Position to start counting. It must be a valid start of an
     *            encoded char !
     * @return The number of bytes of the encoded char, or -1 if the array is
     *         null, the position is outside the array, or the byte at that
     *         position is not a valid lead byte.
     *         TODO : Should stop after the third byte, as a char is only
     *         2 bytes long.
     */
    public static int countBytesPerChar( byte[] bytes, int pos )
    {
        if ( ( bytes == null ) || ( pos < 0 ) || ( pos >= bytes.length ) )
        {
            return -1;
        }

        int lead = bytes[pos];

        if ( ( lead & UTF8_MULTI_BYTES_MASK ) == 0 )
        {
            return 1;
        }
        else if ( ( lead & UTF8_TWO_BYTES_MASK ) == UTF8_TWO_BYTES )
        {
            return 2;
        }
        else if ( ( lead & UTF8_THREE_BYTES_MASK ) == UTF8_THREE_BYTES )
        {
            return 3;
        }
        else if ( ( lead & UTF8_FOUR_BYTES_MASK ) == UTF8_FOUR_BYTES )
        {
            return 4;
        }
        else if ( ( lead & UTF8_FIVE_BYTES_MASK ) == UTF8_FIVE_BYTES )
        {
            return 5;
        }
        else if ( ( lead & UTF8_SIX_BYTES_MASK ) == UTF8_SIX_BYTES )
        {
            return 6;
        }
        else
        {
            // A continuation byte (10xx-xxxx) or 0xFE/0xFF : not a lead byte
            return -1;
        }
    }


    /**
     * Return the Unicode char which is coded in the bytes at position 0.
     *
     * @param bytes The byte[] representation of an Unicode string.
     * @return The first char found, or ( char ) -1 if no char can be decoded.
     */
    public static char bytesToChar( byte[] bytes )
    {
        return bytesToChar( bytes, 0 );
    }


    /**
     * Return the Unicode char which is coded in the bytes at the given
     * position. Continuation bytes are not validated : only their 6 low
     * bits are used.
     *
     * @param bytes The byte[] representation of an Unicode string.
     * @param pos The current position to start decoding the char
     * @return The decoded char, or ( char ) -1 if no char can be decoded
     *         (null array, position outside the array, invalid lead byte or
     *         sequence running past the end of the array).
     *         TODO : Should stop after the third byte, as a char is only
     *         2 bytes long : for sequences of four bytes or more only the
     *         16 low bits of the decoded value are returned.
     */
    public static char bytesToChar( byte[] bytes, int pos )
    {
        int nbBytes = countBytesPerChar( bytes, pos );

        // nbBytes < 0 also covers the null array and the out-of-range position
        if ( ( nbBytes < 0 ) || ( nbBytes > bytes.length - pos ) )
        {
            return ( char ) -1;
        }

        if ( nbBytes == 1 )
        {
            return ( char ) bytes[pos];
        }

        // The lead byte of a N bytes sequence carries ( 7 - N ) payload bits :
        // 110x-xxxx, 1110-xxxx, 1111-0xxx, 1111-10xx, 1111-110x
        int value = bytes[pos] & ( 0xFF >> ( nbBytes + 1 ) );

        // Each following byte (10xx-xxxx) contributes its 6 low bits
        for ( int i = 1; i < nbBytes; i++ )
        {
            value = ( value << 6 ) | ( bytes[pos + i] & 0x3F );
        }

        return ( char ) value;
    }


    /**
     * Return the number of bytes needed to encode an Unicode char.
     *
     * @param car The character to be encoded
     * @return The number of bytes to hold the char. As a char is 16 bits
     *         wide, the result is always 1, 2 or 3 : the remaining branches
     *         are kept for symmetry with the decoding side.
     */
    public static int countNbBytesPerChar( char car )
    {
        if ( ( car & CHAR_ONE_BYTE_MASK ) == 0 )
        {
            return 1;
        }
        else if ( ( car & CHAR_TWO_BYTES_MASK ) == 0 )
        {
            return 2;
        }
        else if ( ( car & CHAR_THREE_BYTES_MASK ) == 0 )
        {
            return 3;
        }
        else if ( ( car & CHAR_FOUR_BYTES_MASK ) == 0 )
        {
            return 4;
        }
        else
        {
            return -1;
        }
    }


    /**
     * Count the number of bytes needed to encode the given char[], each
     * char being encoded on its own as {@link #charToBytes(char)} does.
     *
     * NOTE(review): a surrogate pair is therefore counted as 3 + 3 bytes,
     * not as the 4 bytes of a standard UTF-8 supplementary character.
     * This matches the existing behavior; confirm it is what callers expect.
     *
     * @param chars The char array to measure
     * @return The number of bytes needed, 0 if the array is null
     */
    public static int countBytes( char[] chars )
    {
        if ( chars == null )
        {
            return 0;
        }

        int nbBytes = 0;

        for ( char car : chars )
        {
            nbBytes += countNbBytesPerChar( car );
        }

        return nbBytes;
    }


    /**
     * Count the number of encoded chars included in the given byte[]. Each
     * multi-byte sequence counts for one char, whatever its length. A byte
     * that is not a valid lead byte is counted as one char and skipped, so
     * that malformed input can neither loop forever nor move the cursor
     * backward.
     *
     * @param bytes The byte array to decode
     * @return The number of chars in the byte array, 0 if the array is null
     */
    public static int countChars( byte[] bytes )
    {
        if ( bytes == null )
        {
            return 0;
        }

        int nbChars = 0;
        int currentPos = 0;

        while ( currentPos < bytes.length )
        {
            int nbBytes = countBytesPerChar( bytes, currentPos );

            // Always move forward, even when the lead byte is invalid (-1)
            currentPos += ( nbBytes > 0 ) ? nbBytes : 1;
            nbChars++;
        }

        return nbChars;
    }


    /**
     * Return the UTF-8 encoding of the given char, on one, two or three
     * bytes depending on its value.
     *
     * @param car The character to be transformed to an array of bytes
     *
     * @return The byte array representing the char
     */
    public static byte[] charToBytes( char car )
    {
        if ( car <= 0x007F )
        {
            // Single byte char : 0xxx-xxxx
            return new byte[]
                { ( byte ) car };
        }
        else if ( car <= 0x07FF )
        {
            // Two bytes char : 110x-xxxx 10xx-xxxx
            return new byte[]
                {
                    ( byte ) ( 0x00C0 + ( ( car & 0x07C0 ) >> 6 ) ),
                    ( byte ) ( 0x0080 + ( car & 0x3F ) )
                };
        }
        else
        {
            // Three bytes char : 1110-xxxx 10xx-xxxx 10xx-xxxx
            return new byte[]
                {
                    ( byte ) ( 0x00E0 + ( ( car & 0xF000 ) >> 12 ) ),
                    ( byte ) ( 0x0080 + ( ( car & 0x0FC0 ) >> 6 ) ),
                    ( byte ) ( 0x0080 + ( car & 0x3F ) )
                };
        }
    }


    /**
     * Check if the current char is in the unicodeSubset : all chars but
     * '\0', '(', ')', '*' and '\'
     *
     * @param str The string to check
     * @param pos Position of the current char
     * @return True if the current char is in the unicode subset, false if
     *         it is not, if the string is null or if the position is
     *         outside the string
     */
    public static boolean isUnicodeSubset( String str, int pos )
    {
        if ( ( str == null ) || ( pos < 0 ) || ( pos >= str.length() ) )
        {
            return false;
        }

        return isUnicodeSubset( str.charAt( pos ) );
    }


    /**
     * Check if the current char is in the unicodeSubset : all chars but
     * '\0', '(', ')', '*' and '\'
     *
     * @param c The char to check
     * @return True if the current char is in the unicode subset
     */
    public static boolean isUnicodeSubset( char c )
    {
        // The table only covers the 128 first chars, anything above is accepted
        return ( c > 127 ) || UNICODE_SUBSET[c];
    }


    /**
     * Check if the current byte is in the unicodeSubset : all chars but
     * '\0', '(', ')', '*' and '\'
     *
     * @param b The byte to check
     * @return True if the current byte is in the unicode subset
     */
    public static boolean isUnicodeSubset( byte b )
    {
        // A negative byte has its high bit set : it is accepted without
        // looking at the table. A byte can't be above 127.
        return ( b < 0 ) || UNICODE_SUBSET[b];
    }


    /**
     * Writes four bytes of length information (the number of chars of the
     * string) to the output stream, followed by the modified UTF-8
     * representation of every character in the string str. If str is null,
     * the string value 'null' is written with a length of 0 instead of
     * throwing a NullPointerException.
     *
     * A single writeUTF call can't emit more than 65535 bytes and each
     * character may be converted to a group of up to 3 bytes, so the string
     * is written in chunks of at most 21845 (65535/3) characters to stay on
     * the safe side. Exactly one chunk is written for an empty string, and
     * no trailing empty chunk is written when the length is a multiple of
     * 21845 : {@link #readUTF(ObjectInput)} would not consume it and the
     * following data would be read from the wrong position.
     *
     * See also {@link java.io.DataOutput#writeUTF(String)}.
     *
     * @param objectOutput The objectOutput to write to
     * @param str The value to write
     * @throws java.io.IOException If the value can't be written to the file
     */
    public static void writeUTF( ObjectOutput objectOutput, String str ) throws IOException
    {
        // Write a 'null' string
        if ( str == null )
        {
            objectOutput.writeInt( 0 );
            objectOutput.writeUTF( "null" );

            return;
        }

        int length = str.length();

        // Write length of string
        objectOutput.writeInt( length );

        // Write the string in portions not larger than 21845 characters.
        // The do/while guarantees one (empty) portion for an empty string.
        int start = 0;

        do
        {
            int end = start + Math.min( UTF_CHUNK_SIZE, length - start );

            objectOutput.writeUTF( str.substring( start, end ) );
            start = end;
        }
        while ( start < length );
    }


    /**
     * Reads in a string that has been written by
     * {@link #writeUTF(ObjectOutput, String)}.
     *
     * First, four bytes are read (readInt) : they hold the number of chars
     * of the string. Then modified UTF-8 chunks are read with readUTF and
     * concatenated until that number of chars has been reached. A length of
     * 0 followed by the chunk 'null' stands for a null string.
     *
     * See also {@link java.io.DataInput#readUTF()}.
     *
     * @param objectInput The objectInput to read from
     * @return The read string, possibly null
     * @throws java.io.IOException If the value can't be read
     */
    public static String readUTF( ObjectInput objectInput ) throws IOException
    {
        // Read length of the string
        int strLength = objectInput.readInt();

        // Start reading the string : there is always at least one chunk
        StringBuilder strBuf = new StringBuilder( objectInput.readUTF() );

        if ( ( strLength == 0 ) && ( "null".equals( strBuf.toString() ) ) )
        {
            // The special case of a 'null' string
            return null;
        }

        // Read the remaining chunks, if any
        while ( strLength > strBuf.length() )
        {
            strBuf.append( objectInput.readUTF() );
        }

        return strBuf.toString();
    }
}