Utf8Encoder.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tomcat.util.buf;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.StandardCharsets;

/**
 * Encodes characters as bytes using UTF-8. Extracted from Apache Harmony with some minor bug fixes applied.
 */
public class Utf8Encoder extends CharsetEncoder {

    public Utf8Encoder() {
        super(StandardCharsets.UTF_8, 1.1f, 4.0f);
    }

    @Override
    protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) {
        if (in.hasArray() && out.hasArray()) {
            return encodeHasArray(in, out);
        }
        return encodeNotHasArray(in, out);
    }

    private CoderResult encodeHasArray(CharBuffer in, ByteBuffer out) {
        int outRemaining = out.remaining();
        int pos = in.position();
        int limit = in.limit();
        byte[] bArr;
        char[] cArr;
        int x = pos;
        bArr = out.array();
        cArr = in.array();
        int outPos = out.position();
        int rem = in.remaining();
        for (x = pos; x < pos + rem; x++) {
            int jchar = (cArr[x] & 0xFFFF);

            if (jchar <= 0x7F) {
                if (outRemaining < 1) {
                    in.position(x);
                    out.position(outPos);
                    return CoderResult.OVERFLOW;
                }
                bArr[outPos++] = (byte) (jchar & 0xFF);
                outRemaining--;
            } else if (jchar <= 0x7FF) {

                if (outRemaining < 2) {
                    in.position(x);
                    out.position(outPos);
                    return CoderResult.OVERFLOW;
                }
                bArr[outPos++] = (byte) (0xC0 + ((jchar >> 6) & 0x1F));
                bArr[outPos++] = (byte) (0x80 + (jchar & 0x3F));
                outRemaining -= 2;

            } else if (jchar >= 0xD800 && jchar <= 0xDFFF) {

                // in has to have one byte more.
                if (limit <= x + 1) {
                    in.position(x);
                    out.position(outPos);
                    return CoderResult.UNDERFLOW;
                }

                if (outRemaining < 4) {
                    in.position(x);
                    out.position(outPos);
                    return CoderResult.OVERFLOW;
                }

                // The surrogate pair starts with a low-surrogate.
                if (jchar >= 0xDC00) {
                    in.position(x);
                    out.position(outPos);
                    return CoderResult.malformedForLength(1);
                }

                int jchar2 = cArr[x + 1] & 0xFFFF;

                // The surrogate pair ends with a high-surrogate.
                if (jchar2 < 0xDC00) {
                    in.position(x);
                    out.position(outPos);
                    return CoderResult.malformedForLength(1);
                }

                // Note, the Unicode scalar value n is defined
                // as follows:
                // n = (jchar-0xD800)*0x400+(jchar2-0xDC00)+0x10000
                // Where jchar is a high-surrogate,
                // jchar2 is a low-surrogate.
                int n = (jchar << 10) + jchar2 + 0xFCA02400;

                bArr[outPos++] = (byte) (0xF0 + ((n >> 18) & 0x07));
                bArr[outPos++] = (byte) (0x80 + ((n >> 12) & 0x3F));
                bArr[outPos++] = (byte) (0x80 + ((n >> 6) & 0x3F));
                bArr[outPos++] = (byte) (0x80 + (n & 0x3F));
                outRemaining -= 4;
                x++;

            } else {

                if (outRemaining < 3) {
                    in.position(x);
                    out.position(outPos);
                    return CoderResult.OVERFLOW;
                }
                bArr[outPos++] = (byte) (0xE0 + ((jchar >> 12) & 0x0F));
                bArr[outPos++] = (byte) (0x80 + ((jchar >> 6) & 0x3F));
                bArr[outPos++] = (byte) (0x80 + (jchar & 0x3F));
                outRemaining -= 3;
            }
            if (outRemaining == 0) {
                in.position(x + 1);
                out.position(outPos);
                // If both input and output are exhausted, return UNDERFLOW
                if (x + 1 == limit) {
                    return CoderResult.UNDERFLOW;
                } else {
                    return CoderResult.OVERFLOW;
                }
            }

        }
        if (rem != 0) {
            in.position(x);
            out.position(outPos);
        }
        return CoderResult.UNDERFLOW;
    }

    private CoderResult encodeNotHasArray(CharBuffer in, ByteBuffer out) {
        int outRemaining = out.remaining();
        int pos = in.position();
        int limit = in.limit();
        try {
            while (pos < limit) {
                if (outRemaining == 0) {
                    return CoderResult.OVERFLOW;
                }

                int jchar = (in.get() & 0xFFFF);

                if (jchar <= 0x7F) {

                    if (outRemaining < 1) {
                        return CoderResult.OVERFLOW;
                    }
                    out.put((byte) jchar);
                    outRemaining--;

                } else if (jchar <= 0x7FF) {

                    if (outRemaining < 2) {
                        return CoderResult.OVERFLOW;
                    }
                    out.put((byte) (0xC0 + ((jchar >> 6) & 0x1F)));
                    out.put((byte) (0x80 + (jchar & 0x3F)));
                    outRemaining -= 2;

                } else if (jchar >= 0xD800 && jchar <= 0xDFFF) {

                    // in has to have one byte more.
                    if (limit <= pos + 1) {
                        return CoderResult.UNDERFLOW;
                    }

                    if (outRemaining < 4) {
                        return CoderResult.OVERFLOW;
                    }

                    // The surrogate pair starts with a low-surrogate.
                    if (jchar >= 0xDC00) {
                        return CoderResult.malformedForLength(1);
                    }

                    int jchar2 = (in.get() & 0xFFFF);

                    // The surrogate pair ends with a high-surrogate.
                    if (jchar2 < 0xDC00) {
                        return CoderResult.malformedForLength(1);
                    }

                    // Note, the Unicode scalar value n is defined
                    // as follows:
                    // n = (jchar-0xD800)*0x400+(jchar2-0xDC00)+0x10000
                    // Where jchar is a high-surrogate,
                    // jchar2 is a low-surrogate.
                    int n = (jchar << 10) + jchar2 + 0xFCA02400;

                    out.put((byte) (0xF0 + ((n >> 18) & 0x07)));
                    out.put((byte) (0x80 + ((n >> 12) & 0x3F)));
                    out.put((byte) (0x80 + ((n >> 6) & 0x3F)));
                    out.put((byte) (0x80 + (n & 0x3F)));
                    outRemaining -= 4;
                    pos++;

                } else {

                    if (outRemaining < 3) {
                        return CoderResult.OVERFLOW;
                    }
                    out.put((byte) (0xE0 + ((jchar >> 12) & 0x0F)));
                    out.put((byte) (0x80 + ((jchar >> 6) & 0x3F)));
                    out.put((byte) (0x80 + (jchar & 0x3F)));
                    outRemaining -= 3;
                }
                pos++;
            }
        } finally {
            in.position(pos);
        }
        return CoderResult.UNDERFLOW;
    }
}