// Utf8Encoder.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tomcat.util.buf;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.StandardCharsets;
/**
 * Encodes characters as bytes using UTF-8. Extracted from Apache Harmony with some minor bug fixes applied.
 * <p>
 * Two code paths are provided: one that works directly on the backing arrays when both buffers expose one, and one
 * that uses the relative {@code get()}/{@code put()} buffer operations otherwise.
 */
public class Utf8Encoder extends CharsetEncoder {

    /**
     * Creates an encoder for {@link StandardCharsets#UTF_8} that declares an average of 1.1 and a maximum of 4 bytes
     * per input character.
     */
    public Utf8Encoder() {
        super(StandardCharsets.UTF_8, 1.1f, 4.0f);
    }


    /**
     * Encodes as many characters as possible from {@code in} into {@code out}, picking the array based
     * implementation when both buffers are backed by an accessible array.
     *
     * @param in  the characters to encode
     * @param out the buffer receiving the UTF-8 bytes
     *
     * @return {@link CoderResult#UNDERFLOW} when more input is required (or all input was consumed),
     *             {@link CoderResult#OVERFLOW} when {@code out} cannot hold the next encoded character, or a
     *             malformed-input result of length 1 for an unpaired surrogate
     */
    @Override
    protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) {
        if (in.hasArray() && out.hasArray()) {
            return encodeHasArray(in, out);
        }
        return encodeNotHasArray(in, out);
    }


    /**
     * Array based encoding loop. Reads and writes the backing arrays directly and updates the buffer positions before
     * returning. On every early return the input position is left at the first character that was not encoded.
     */
    private CoderResult encodeHasArray(CharBuffer in, ByteBuffer out) {
        int outRemaining = out.remaining();
        int limit = in.limit();
        byte[] bArr = out.array();
        char[] cArr = in.array();
        // Buffer positions are relative to arrayOffset(); a sliced buffer has a non-zero offset.
        int inOffset = in.arrayOffset();
        int outOffset = out.arrayOffset();
        int outPos = out.position();
        int x;
        for (x = in.position(); x < limit; x++) {
            int jchar = cArr[inOffset + x];
            if (jchar <= 0x7F) {
                if (outRemaining < 1) {
                    in.position(x);
                    out.position(outPos);
                    return CoderResult.OVERFLOW;
                }
                bArr[outOffset + outPos++] = (byte) (jchar & 0xFF);
                outRemaining--;
            } else if (jchar <= 0x7FF) {
                if (outRemaining < 2) {
                    in.position(x);
                    out.position(outPos);
                    return CoderResult.OVERFLOW;
                }
                bArr[outOffset + outPos++] = (byte) (0xC0 + ((jchar >> 6) & 0x1F));
                bArr[outOffset + outPos++] = (byte) (0x80 + (jchar & 0x3F));
                outRemaining -= 2;
            } else if (jchar >= 0xD800 && jchar <= 0xDFFF) {
                // A surrogate needs a second char to be available in the input.
                if (limit <= x + 1) {
                    in.position(x);
                    out.position(outPos);
                    return CoderResult.UNDERFLOW;
                }
                if (outRemaining < 4) {
                    in.position(x);
                    out.position(outPos);
                    return CoderResult.OVERFLOW;
                }
                // The surrogate pair starts with a low-surrogate.
                if (jchar >= 0xDC00) {
                    in.position(x);
                    out.position(outPos);
                    return CoderResult.malformedForLength(1);
                }
                char low = cArr[inOffset + x + 1];
                // The high-surrogate must be followed by a low-surrogate (U+DC00..U+DFFF). Anything else,
                // including characters above U+DFFF, leaves the high-surrogate unpaired.
                if (!Character.isLowSurrogate(low)) {
                    in.position(x);
                    out.position(outPos);
                    return CoderResult.malformedForLength(1);
                }
                int n = Character.toCodePoint((char) jchar, low);
                bArr[outOffset + outPos++] = (byte) (0xF0 + ((n >> 18) & 0x07));
                bArr[outOffset + outPos++] = (byte) (0x80 + ((n >> 12) & 0x3F));
                bArr[outOffset + outPos++] = (byte) (0x80 + ((n >> 6) & 0x3F));
                bArr[outOffset + outPos++] = (byte) (0x80 + (n & 0x3F));
                outRemaining -= 4;
                // Skip the low-surrogate that has just been consumed.
                x++;
            } else {
                if (outRemaining < 3) {
                    in.position(x);
                    out.position(outPos);
                    return CoderResult.OVERFLOW;
                }
                bArr[outOffset + outPos++] = (byte) (0xE0 + ((jchar >> 12) & 0x0F));
                bArr[outOffset + outPos++] = (byte) (0x80 + ((jchar >> 6) & 0x3F));
                bArr[outOffset + outPos++] = (byte) (0x80 + (jchar & 0x3F));
                outRemaining -= 3;
            }
            if (outRemaining == 0) {
                in.position(x + 1);
                out.position(outPos);
                // If both input and output are exhausted, return UNDERFLOW
                if (x + 1 == limit) {
                    return CoderResult.UNDERFLOW;
                } else {
                    return CoderResult.OVERFLOW;
                }
            }
        }
        // With empty input nothing moved, so these calls simply re-apply the current positions.
        in.position(x);
        out.position(outPos);
        return CoderResult.UNDERFLOW;
    }


    /**
     * Buffer based encoding loop used when at least one buffer does not expose a backing array. Characters are read
     * with relative gets; {@code pos} tracks the first character not yet encoded and the {@code finally} block
     * rewinds the input to it, so characters read but not encoded are not lost.
     */
    private CoderResult encodeNotHasArray(CharBuffer in, ByteBuffer out) {
        int outRemaining = out.remaining();
        int pos = in.position();
        int limit = in.limit();
        try {
            while (pos < limit) {
                if (outRemaining == 0) {
                    return CoderResult.OVERFLOW;
                }
                int jchar = in.get();
                if (jchar <= 0x7F) {
                    // At least one byte of space is guaranteed by the check at the top of the loop.
                    out.put((byte) jchar);
                    outRemaining--;
                } else if (jchar <= 0x7FF) {
                    if (outRemaining < 2) {
                        return CoderResult.OVERFLOW;
                    }
                    out.put((byte) (0xC0 + ((jchar >> 6) & 0x1F)));
                    out.put((byte) (0x80 + (jchar & 0x3F)));
                    outRemaining -= 2;
                } else if (jchar >= 0xD800 && jchar <= 0xDFFF) {
                    // A surrogate needs a second char to be available in the input.
                    if (limit <= pos + 1) {
                        return CoderResult.UNDERFLOW;
                    }
                    if (outRemaining < 4) {
                        return CoderResult.OVERFLOW;
                    }
                    // The surrogate pair starts with a low-surrogate.
                    if (jchar >= 0xDC00) {
                        return CoderResult.malformedForLength(1);
                    }
                    char low = in.get();
                    // The high-surrogate must be followed by a low-surrogate (U+DC00..U+DFFF). Anything else,
                    // including characters above U+DFFF, leaves the high-surrogate unpaired.
                    if (!Character.isLowSurrogate(low)) {
                        return CoderResult.malformedForLength(1);
                    }
                    int n = Character.toCodePoint((char) jchar, low);
                    out.put((byte) (0xF0 + ((n >> 18) & 0x07)));
                    out.put((byte) (0x80 + ((n >> 12) & 0x3F)));
                    out.put((byte) (0x80 + ((n >> 6) & 0x3F)));
                    out.put((byte) (0x80 + (n & 0x3F)));
                    outRemaining -= 4;
                    // Account for the low-surrogate that has just been consumed.
                    pos++;
                } else {
                    if (outRemaining < 3) {
                        return CoderResult.OVERFLOW;
                    }
                    out.put((byte) (0xE0 + ((jchar >> 12) & 0x0F)));
                    out.put((byte) (0x80 + ((jchar >> 6) & 0x3F)));
                    out.put((byte) (0x80 + (jchar & 0x3F)));
                    outRemaining -= 3;
                }
                pos++;
            }
        } finally {
            in.position(pos);
        }
        return CoderResult.UNDERFLOW;
    }
}