android13/external/guava/guava-tests/test/com/google/common/base/Utf8Test.java

349 lines
13 KiB
Java
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Copyright (C) 2013 The Guava Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.common.base;
import static com.google.common.truth.Truth.assertThat;
import static java.lang.Character.MAX_CODE_POINT;
import static java.lang.Character.MAX_HIGH_SURROGATE;
import static java.lang.Character.MAX_LOW_SURROGATE;
import static java.lang.Character.MIN_HIGH_SURROGATE;
import static java.lang.Character.MIN_LOW_SURROGATE;
import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;
import com.google.common.annotations.GwtCompatible;
import com.google.common.annotations.GwtIncompatible;
import com.google.common.collect.ImmutableList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Random;
import junit.framework.TestCase;
/**
* Unit tests for {@link Utf8}.
*
* @author Jon Perlow
* @author Martin Buchholz
* @author Clément Roux
*/
@GwtCompatible(emulated = true)
public class Utf8Test extends TestCase {
private static final ImmutableList<String> ILL_FORMED_STRINGS;
static {
ImmutableList.Builder<String> builder = ImmutableList.builder();
char[] surrogates = {
MAX_LOW_SURROGATE, MAX_HIGH_SURROGATE, MIN_LOW_SURROGATE, MIN_HIGH_SURROGATE,
};
for (char surrogate : surrogates) {
builder.add(newString(surrogate));
builder.add(newString(surrogate, 'n'));
builder.add(newString('n', surrogate));
builder.add(newString(surrogate, surrogate));
}
builder.add(newString(MIN_LOW_SURROGATE, MAX_HIGH_SURROGATE));
ILL_FORMED_STRINGS = builder.build();
}
public void testEncodedLength_validStrings() {
assertEquals(0, Utf8.encodedLength(""));
assertEquals(11, Utf8.encodedLength("Hello world"));
assertEquals(8, Utf8.encodedLength("Résumé"));
assertEquals(
461,
Utf8.encodedLength(
"威廉·莎士比亞William Shakespeare"
+ "1564年4月26號—1616年4月23號[1])係隻英國嗰演員、劇作家同詩人,"
+ "有時間佢簡稱莎翁;中國清末民初哈拕翻譯做舌克斯毕、沙斯皮耳、筛斯比耳、"
+ "莎基斯庇尔、索士比尔、夏克思芘尔、希哀苦皮阿、叶斯壁、沙克皮尔、"
+ "狹斯丕爾。[2]莎士比亞編寫過好多作品,佢嗰劇作響西洋文學好有影響,"
+ "哈都拕人翻譯做好多話。"));
// A surrogate pair
assertEquals(4, Utf8.encodedLength(newString(MIN_HIGH_SURROGATE, MIN_LOW_SURROGATE)));
}
public void testEncodedLength_validStrings2() {
HashMap<Integer, Integer> utf8Lengths = new HashMap<>();
utf8Lengths.put(0x00, 1);
utf8Lengths.put(0x7f, 1);
utf8Lengths.put(0x80, 2);
utf8Lengths.put(0x7ff, 2);
utf8Lengths.put(0x800, 3);
utf8Lengths.put(MIN_SUPPLEMENTARY_CODE_POINT - 1, 3);
utf8Lengths.put(MIN_SUPPLEMENTARY_CODE_POINT, 4);
utf8Lengths.put(MAX_CODE_POINT, 4);
Integer[] codePoints = utf8Lengths.keySet().toArray(new Integer[] {});
StringBuilder sb = new StringBuilder();
Random rnd = new Random();
for (int trial = 0; trial < 100; trial++) {
sb.setLength(0);
int utf8Length = 0;
for (int i = 0; i < 6; i++) {
Integer randomCodePoint = codePoints[rnd.nextInt(codePoints.length)];
sb.appendCodePoint(randomCodePoint);
utf8Length += utf8Lengths.get(randomCodePoint);
if (utf8Length != Utf8.encodedLength(sb)) {
StringBuilder repro = new StringBuilder();
for (int j = 0; j < sb.length(); j++) {
repro.append(" ").append((int) sb.charAt(j)); // GWT compatible
}
assertEquals(repro.toString(), utf8Length, Utf8.encodedLength(sb));
}
}
}
}
public void testEncodedLength_invalidStrings() {
testEncodedLengthFails(newString(MIN_HIGH_SURROGATE), 0);
testEncodedLengthFails("foobar" + newString(MIN_HIGH_SURROGATE), 6);
testEncodedLengthFails(newString(MIN_LOW_SURROGATE), 0);
testEncodedLengthFails("foobar" + newString(MIN_LOW_SURROGATE), 6);
testEncodedLengthFails(newString(MIN_HIGH_SURROGATE, MIN_HIGH_SURROGATE), 0);
}
private static void testEncodedLengthFails(String invalidString, int invalidCodePointIndex) {
try {
Utf8.encodedLength(invalidString);
fail();
} catch (IllegalArgumentException expected) {
assertThat(expected)
.hasMessageThat()
.isEqualTo("Unpaired surrogate at index " + invalidCodePointIndex);
}
}
// 128 - [chars 0x0000 to 0x007f]
private static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x007f - 0x0000 + 1;
// 128
private static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT =
ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
// 1920 [chars 0x0080 to 0x07FF]
private static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x07FF - 0x0080 + 1;
// 18,304
private static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
// Both bytes are one byte characters
(long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2)
+
// The possible number of two byte characters
TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;
// 2048
private static final long THREE_BYTE_SURROGATES = 2 * 1024;
// 61,440 [chars 0x0800 to 0xFFFF, minus surrogates]
private static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;
// 2,650,112
private static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
// All one byte characters
(long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3)
+
// One two byte character and a one byte character
2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
+
// Three byte characters
THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
// 1,048,576 [chars 0x10000L to 0x10FFFF]
private static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x10FFFF - 0x10000L + 1;
// 289,571,839
private static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
// All one byte characters
(long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4)
+
// One and three byte characters
2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
+
// Two two byte characters
TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS
+
// Permutations of one and two byte characters
3
* TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS
* ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
* ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
+
// Four byte characters
FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;
/** Tests that round tripping of all two byte permutations work. */
@GwtIncompatible // java.nio.charset.Charset
public void testIsWellFormed_1Byte() {
testBytes(1, EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT);
}
/** Tests that round tripping of all two byte permutations work. */
@GwtIncompatible // java.nio.charset.Charset
public void testIsWellFormed_2Bytes() {
testBytes(2, EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT);
}
/** Tests that round tripping of all three byte permutations work. */
@GwtIncompatible // java.nio.charset.Charset
public void testIsWellFormed_3Bytes() {
testBytes(3, EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT);
}
/**
* Tests that round tripping of a sample of four byte permutations work. All permutations are
* prohibitively expensive to test for automated runs. This method tests specific four-byte cases.
*/
public void testIsWellFormed_4BytesSamples() {
// Valid 4 byte.
assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2);
// Bad trailing bytes
assertNotWellFormed(0xF0, 0xA4, 0xAD, 0x7F);
assertNotWellFormed(0xF0, 0xA4, 0xAD, 0xC0);
// Special cases for byte2
assertNotWellFormed(0xF0, 0x8F, 0xAD, 0xA2);
assertNotWellFormed(0xF4, 0x90, 0xAD, 0xA2);
}
/** Tests some hard-coded test cases. */
public void testSomeSequences() {
// Empty
assertWellFormed();
// One-byte characters, including control characters
assertWellFormed(0x00, 0x61, 0x62, 0x63, 0x7F); // "\u0000abc\u007f"
// Two-byte characters
assertWellFormed(0xC2, 0xA2, 0xC2, 0xA2); // "\u00a2\u00a2"
// Three-byte characters
assertWellFormed(0xc8, 0x8a, 0x63, 0xc8, 0x8a, 0x63); // "\u020ac\u020ac"
// Four-byte characters
// "\u024B62\u024B62"
assertWellFormed(0xc9, 0x8b, 0x36, 0x32, 0xc9, 0x8b, 0x36, 0x32);
// Mixed string
// "a\u020ac\u00a2b\\u024B62u020acc\u00a2de\u024B62"
assertWellFormed(
0x61, 0xc8, 0x8a, 0x63, 0xc2, 0xa2, 0x62, 0x5c, 0x75, 0x30, 0x32, 0x34, 0x42, 0x36, 0x32,
0x75, 0x30, 0x32, 0x30, 0x61, 0x63, 0x63, 0xc2, 0xa2, 0x64, 0x65, 0xc9, 0x8b, 0x36, 0x32);
// Not a valid string
assertNotWellFormed(-1, 0, -1, 0);
}
public void testShardsHaveExpectedRoundTrippables() {
// A sanity check.
long actual = 0;
for (long expected : generateFourByteShardsExpectedRunnables()) {
actual += expected;
}
assertEquals(EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT, actual);
}
private static String newString(char... chars) {
return new String(chars);
}
private static byte[] toByteArray(int... bytes) {
byte[] realBytes = new byte[bytes.length];
for (int i = 0; i < bytes.length; i++) {
realBytes[i] = (byte) bytes[i];
}
return realBytes;
}
private static void assertWellFormed(int... bytes) {
assertTrue(Utf8.isWellFormed(toByteArray(bytes)));
}
private static void assertNotWellFormed(int... bytes) {
assertFalse(Utf8.isWellFormed(toByteArray(bytes)));
}
private static long[] generateFourByteShardsExpectedRunnables() {
long[] expected = new long[128];
// 0-63 are all 5300224
for (int i = 0; i <= 63; i++) {
expected[i] = 5300224;
}
// 97-111 are all 2342912
for (int i = 97; i <= 111; i++) {
expected[i] = 2342912;
}
// 113-117 are all 1048576
for (int i = 113; i <= 117; i++) {
expected[i] = 1048576;
}
// One offs
expected[112] = 786432;
expected[118] = 786432;
expected[119] = 1048576;
expected[120] = 458752;
expected[121] = 524288;
expected[122] = 65536;
// Anything not assigned was the default 0.
return expected;
}
/**
* Helper to run the loop to test all the permutations for the number of bytes specified.
*
* @param numBytes the number of bytes in the byte array
* @param expectedCount the expected number of roundtrippable permutations
*/
@GwtIncompatible // java.nio.charset.Charset
private static void testBytes(int numBytes, long expectedCount) {
testBytes(numBytes, expectedCount, 0, -1);
}
/**
* Helper to run the loop to test all the permutations for the number of bytes specified. This
* overload is useful for debugging to get the loop to start at a certain character.
*
* @param numBytes the number of bytes in the byte array
* @param expectedCount the expected number of roundtrippable permutations
* @param start the starting bytes encoded as a long as big-endian
* @param lim the limit of bytes to process encoded as a long as big-endian, or -1 to mean the max
* limit for numBytes
*/
@GwtIncompatible // java.nio.charset.Charset
private static void testBytes(int numBytes, long expectedCount, long start, long lim) {
byte[] bytes = new byte[numBytes];
if (lim == -1) {
lim = 1L << (numBytes * 8);
}
long countRoundTripped = 0;
for (long byteChar = start; byteChar < lim; byteChar++) {
long tmpByteChar = byteChar;
for (int i = 0; i < numBytes; i++) {
bytes[bytes.length - i - 1] = (byte) tmpByteChar;
tmpByteChar = tmpByteChar >> 8;
}
boolean isRoundTrippable = Utf8.isWellFormed(bytes);
assertEquals(isRoundTrippable, Utf8.isWellFormed(bytes, 0, numBytes));
String s = new String(bytes, Charsets.UTF_8);
byte[] bytesReencoded = s.getBytes(Charsets.UTF_8);
boolean bytesEqual = Arrays.equals(bytes, bytesReencoded);
if (bytesEqual != isRoundTrippable) {
fail();
}
if (isRoundTrippable) {
countRoundTripped++;
}
}
assertEquals(expectedCount, countRoundTripped);
}
}