1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package org.apache.directory.api.util;
21
22
23 import java.io.IOException;
24 import java.io.ObjectInput;
25 import java.io.ObjectOutput;
26
27
28
29
30
31
32
33
34
/**
 * Static helpers for working with UTF-8 encoded bytes and UTF-16 chars:
 * counting encoded lengths, decoding/encoding a single character, testing
 * membership in a restricted ASCII subset, and (de)serializing arbitrarily
 * long strings through {@link ObjectOutput} / {@link ObjectInput}.
 *
 * The class is not instantiable; every method is static.
 */
public final class Unicode
{
    /** Bit set in every byte that is part of a multi-byte sequence. */
    private static final int UTF8_MULTI_BYTES_MASK = 0x0080;

    /** Mask / expected value pairs identifying the lead byte of an N-byte sequence. */
    private static final int UTF8_TWO_BYTES_MASK = 0x00E0;
    private static final int UTF8_TWO_BYTES = 0x00C0;
    private static final int UTF8_THREE_BYTES_MASK = 0x00F0;
    private static final int UTF8_THREE_BYTES = 0x00E0;
    private static final int UTF8_FOUR_BYTES_MASK = 0x00F8;
    private static final int UTF8_FOUR_BYTES = 0x00F0;
    private static final int UTF8_FIVE_BYTES_MASK = 0x00FC;
    private static final int UTF8_FIVE_BYTES = 0x00F8;
    private static final int UTF8_SIX_BYTES_MASK = 0x00FE;
    private static final int UTF8_SIX_BYTES = 0x00FC;

    /**
     * Lookup table indexed by ASCII code (0..127). Every entry is true except
     * NUL, '(', ')', '*' and '\'.
     * NOTE(review): this looks like the set of characters that need escaping in
     * LDAP filter values - confirm against the callers.
     */
    private static final boolean[] UNICODE_SUBSET =
        {
            // 0..39 : '\0' is excluded
            false, true, true, true, true, true, true, true,
            true, true, true, true, true, true, true, true,
            true, true, true, true, true, true, true, true,
            true, true, true, true, true, true, true, true,
            true, true, true, true, true, true, true, true,
            // 40..87 : '(', ')' and '*' are excluded
            false, false, false, true, true, true, true, true,
            true, true, true, true, true, true, true, true,
            true, true, true, true, true, true, true, true,
            true, true, true, true, true, true, true, true,
            true, true, true, true, true, true, true, true,
            true, true, true, true, true, true, true, true,
            // 88..127 : '\' is excluded
            true, true, true, true, false, true, true, true,
            true, true, true, true, true, true, true, true,
            true, true, true, true, true, true, true, true,
            true, true, true, true, true, true, true, true,
            true, true, true, true, true, true, true, true,
        };

    /** A char with none of these bits set fits in one UTF-8 byte. */
    private static final int CHAR_ONE_BYTE_MASK = 0xFFFFFF80;

    /** A char with none of these bits set fits in at most two UTF-8 bytes. */
    private static final int CHAR_TWO_BYTES_MASK = 0xFFFFF800;

    /**
     * Maximum number of chars written in one writeUTF record: a record holds at
     * most 65535 encoded bytes and a char needs at most 3 bytes (3 * 21845 = 65535).
     */
    private static final int UTF_CHUNK_SIZE = 21845;


    /**
     * Private constructor: this is a utility class.
     */
    private Unicode()
    {
    }


    /**
     * Count the number of bytes of the UTF-8 sequence whose lead byte is at
     * the given position.
     *
     * @param bytes The buffer containing the encoded data
     * @param pos The position of the lead byte in the buffer
     * @return The sequence length (1 to 6), or -1 if the buffer is null or the
     * byte at <code>pos</code> is not a valid lead byte
     * @throws ArrayIndexOutOfBoundsException if <code>pos</code> is outside the buffer
     */
    public static int countBytesPerChar( byte[] bytes, int pos )
    {
        if ( bytes == null )
        {
            return -1;
        }

        int lead = bytes[pos];

        if ( ( lead & UTF8_MULTI_BYTES_MASK ) == 0 )
        {
            return 1;
        }

        if ( ( lead & UTF8_TWO_BYTES_MASK ) == UTF8_TWO_BYTES )
        {
            return 2;
        }

        if ( ( lead & UTF8_THREE_BYTES_MASK ) == UTF8_THREE_BYTES )
        {
            return 3;
        }

        if ( ( lead & UTF8_FOUR_BYTES_MASK ) == UTF8_FOUR_BYTES )
        {
            return 4;
        }

        if ( ( lead & UTF8_FIVE_BYTES_MASK ) == UTF8_FIVE_BYTES )
        {
            return 5;
        }

        if ( ( lead & UTF8_SIX_BYTES_MASK ) == UTF8_SIX_BYTES )
        {
            return 6;
        }

        // A continuation byte (10xxxxxx) or 0xFE/0xFF : not a lead byte
        return -1;
    }


    /**
     * Decode the UTF-8 sequence starting at the beginning of the buffer.
     *
     * @param bytes The buffer containing the encoded data
     * @return The decoded char, see {@link #bytesToChar(byte[], int)}
     */
    public static char bytesToChar( byte[] bytes )
    {
        return bytesToChar( bytes, 0 );
    }


    /**
     * Decode the UTF-8 sequence starting at the given position.
     *
     * The result is a single 16-bit char: for sequences of 4 bytes or more the
     * decoded value is truncated to its 16 low-order bits.
     *
     * @param bytes The buffer containing the encoded data
     * @param pos The position of the lead byte in the buffer
     * @return The decoded char, or <code>(char) -1</code> if the buffer is null
     * or the byte at <code>pos</code> is not a valid lead byte
     * @throws ArrayIndexOutOfBoundsException if the sequence runs past the end
     * of the buffer
     */
    public static char bytesToChar( byte[] bytes, int pos )
    {
        if ( bytes == null )
        {
            return ( char ) -1;
        }

        switch ( countBytesPerChar( bytes, pos ) )
        {
            case 1:
                return ( char ) bytes[pos];

            case 2:
                // 110x-xxyy 10zz-zzzz
                return ( char ) decodeSequence( bytes, pos, 0x1F, 2 );

            case 3:
                // 1110-tttt 10xx-xxyy 10zz-zzzz
                return ( char ) decodeSequence( bytes, pos, 0x0F, 3 );

            case 4:
                return ( char ) decodeSequence( bytes, pos, 0x07, 4 );

            case 5:
                return ( char ) decodeSequence( bytes, pos, 0x03, 5 );

            case 6:
                return ( char ) decodeSequence( bytes, pos, 0x01, 6 );

            default:
                return ( char ) -1;
        }
    }


    /**
     * Assemble the payload bits of a multi-byte sequence: the bits of the lead
     * byte selected by <code>leadMask</code>, followed by the 6 low-order bits
     * of each following byte.
     */
    private static int decodeSequence( byte[] bytes, int pos, int leadMask, int nbBytes )
    {
        int value = bytes[pos] & leadMask;

        for ( int i = 1; i < nbBytes; i++ )
        {
            value = ( value << 6 ) | ( bytes[pos + i] & 0x3F );
        }

        return value;
    }


    /**
     * Return the number of bytes needed to encode a single char in UTF-8.
     *
     * @param car The char to be encoded
     * @return 1, 2 or 3 (a 16-bit char never needs more than 3 bytes)
     */
    public static int countNbBytesPerChar( char car )
    {
        if ( ( car & CHAR_ONE_BYTE_MASK ) == 0 )
        {
            return 1;
        }

        if ( ( car & CHAR_TWO_BYTES_MASK ) == 0 )
        {
            return 2;
        }

        return 3;
    }


    /**
     * Count the number of bytes needed to encode a char array in UTF-8.
     *
     * A high surrogate immediately followed by a low surrogate is one
     * supplementary code point and is counted as a single 4-byte sequence.
     * Any other char is counted with {@link #countNbBytesPerChar(char)}.
     *
     * @param chars The char array to be encoded
     * @return The number of bytes, 0 if the array is null
     */
    public static int countBytes( char[] chars )
    {
        if ( chars == null )
        {
            return 0;
        }

        int nbBytes = 0;
        int currentPos = 0;
        int length = chars.length;

        while ( currentPos < length )
        {
            char current = chars[currentPos];

            boolean surrogatePair = Character.isHighSurrogate( current )
                && ( currentPos + 1 < length )
                && Character.isLowSurrogate( chars[currentPos + 1] );

            if ( surrogatePair )
            {
                // Two UTF-16 chars, one 4-byte UTF-8 sequence
                nbBytes += 4;
                currentPos += 2;
            }
            else
            {
                nbBytes += countNbBytesPerChar( current );
                currentPos++;
            }
        }

        return nbBytes;
    }


    /**
     * Count the number of UTF-8 sequences in a byte array.
     *
     * A byte that is not a valid lead byte is counted as one char and skipped,
     * so the scan always moves forward and terminates on malformed input.
     *
     * @param bytes The byte array to scan
     * @return The number of sequences, 0 if the array is null
     */
    public static int countChars( byte[] bytes )
    {
        if ( bytes == null )
        {
            return 0;
        }

        int nbChars = 0;
        int currentPos = 0;

        while ( currentPos < bytes.length )
        {
            int nbBytes = countBytesPerChar( bytes, currentPos );

            // countBytesPerChar returns -1 on an invalid lead byte : adding it
            // would move the cursor backwards, so step over that single byte.
            currentPos += ( nbBytes > 0 ) ? nbBytes : 1;
            nbChars++;
        }

        return nbChars;
    }


    /**
     * Encode a single char in UTF-8.
     *
     * Each char is encoded on its own, so a surrogate char yields a 3-byte
     * sequence.
     *
     * @param car The char to encode
     * @return A new array of 1, 2 or 3 bytes
     */
    public static byte[] charToBytes( char car )
    {
        if ( car <= 0x007F )
        {
            // 0000-0000 0xxx-xxxx -> 0xxx-xxxx
            return new byte[]
                { ( byte ) car };
        }

        if ( car <= 0x07FF )
        {
            // 0000-0xxx xxyy-yyyy -> 110x-xxxx 10yy-yyyy
            return new byte[]
                {
                    ( byte ) ( 0x00C0 + ( ( car & 0x07C0 ) >> 6 ) ),
                    ( byte ) ( 0x0080 + ( car & 0x3F ) )
                };
        }

        // xxxx-yyyy yyzz-zzzz -> 1110-xxxx 10yy-yyyy 10zz-zzzz
        return new byte[]
            {
                ( byte ) ( 0x00E0 + ( ( car & 0xF000 ) >> 12 ) ),
                ( byte ) ( 0x0080 + ( ( car & 0x0FC0 ) >> 6 ) ),
                ( byte ) ( 0x0080 + ( car & 0x3F ) )
            };
    }


    /**
     * Check whether the char at a given position of a string belongs to the
     * accepted subset: any char above 127, or any ASCII char other than NUL,
     * '(', ')', '*' and '\'.
     *
     * @param str The string to check
     * @param pos The position of the char in the string
     * @return true if the char is accepted; false if it is not, if the string
     * is null or if the position is outside the string
     */
    public static boolean isUnicodeSubset( String str, int pos )
    {
        if ( ( str == null ) || ( pos < 0 ) || ( pos >= str.length() ) )
        {
            return false;
        }

        return isUnicodeSubset( str.charAt( pos ) );
    }


    /**
     * Check whether a char belongs to the accepted subset: any char above 127,
     * or any ASCII char other than NUL, '(', ')', '*' and '\'.
     *
     * @param c The char to check
     * @return true if the char is accepted
     */
    public static boolean isUnicodeSubset( char c )
    {
        return ( c > 127 ) || UNICODE_SUBSET[c];
    }


    /**
     * Check whether a byte belongs to the accepted subset: any negative byte
     * (high bit set), or any ASCII value other than NUL, '(', ')', '*' and '\'.
     *
     * @param b The byte to check
     * @return true if the byte is accepted
     */
    public static boolean isUnicodeSubset( byte b )
    {
        // A byte cannot exceed 127, so only the sign needs testing
        return ( b < 0 ) || UNICODE_SUBSET[b];
    }


    /**
     * Write a string of any length to an {@link ObjectOutput}.
     *
     * Format: the string length in chars as an int, followed by the string cut
     * in records of at most 21845 chars, each written with
     * {@link ObjectOutput#writeUTF(String)}. An empty string is written as
     * length 0 plus one empty record. A null string is written as length 0 plus
     * the record <code>"null"</code>.
     *
     * Exactly the records consumed by {@link #readUTF(ObjectInput)} are
     * written: no trailing empty record is emitted when the length is a
     * multiple of the chunk size.
     *
     * @param objectOutput The stream to write to
     * @param str The string to write, may be null
     * @throws IOException If the write fails
     */
    public static void writeUTF( ObjectOutput objectOutput, String str ) throws IOException
    {
        if ( str == null )
        {
            // Null marker : a zero length followed by the literal "null"
            objectOutput.writeInt( 0 );
            objectOutput.writeUTF( "null" );

            return;
        }

        int length = str.length();
        objectOutput.writeInt( length );

        int offset = 0;

        // do/while : the empty string still needs its single (empty) record
        do
        {
            int end = Math.min( offset + UTF_CHUNK_SIZE, length );
            objectOutput.writeUTF( str.substring( offset, end ) );
            offset = end;
        }
        while ( offset < length );
    }


    /**
     * Read a string written by {@link #writeUTF(ObjectOutput, String)}.
     *
     * Reads the announced length, then UTF records until that many chars have
     * been collected.
     *
     * @param objectInput The stream to read from
     * @return The string, or null if the null marker (length 0 followed by
     * <code>"null"</code>) was read
     * @throws IOException If the read fails
     */
    public static String readUTF( ObjectInput objectInput ) throws IOException
    {
        int strLength = objectInput.readInt();

        // There is always at least one record, even for null or empty strings
        String firstChunk = objectInput.readUTF();

        if ( ( strLength == 0 ) && "null".equals( firstChunk ) )
        {
            return null;
        }

        StringBuilder strBuf = new StringBuilder( firstChunk );

        while ( strLength > strBuf.length() )
        {
            strBuf.append( objectInput.readUTF() );
        }

        return strBuf.toString();
    }
}