/*
 * Copyright (C) 2017 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.common.hash;

import com.google.caliper.BeforeExperiment;
import com.google.caliper.Benchmark;
import com.google.caliper.Param;
import java.nio.charset.StandardCharsets;
import java.util.Random;

/** Benchmarks for the hashing of UTF-8 strings. */
public class HashStringBenchmark {
  static class MaxCodePoint {
    final int value;

    /**
     * Convert the input string to a code point. Accepts regular decimal numerals, hex strings, and
     * some symbolic names meaningful to humans.
     */
    private static int decode(String userFriendly) {
      try {
        return Integer.decode(userFriendly);
      } catch (NumberFormatException ignored) {
        if (userFriendly.matches("(?i)(?:American|English|ASCII)")) {
          // 1-byte UTF-8 sequences - "American" ASCII text
          return 0x80;
        } else if (userFriendly.matches("(?i)(?:French|Latin|Western.*European)")) {
          // Mostly 1-byte UTF-8 sequences, mixed with occasional 2-byte
          // sequences - "Western European" text
          return 0x90;
        } else if (userFriendly.matches("(?i)(?:Branch.*Prediction.*Hostile)")) {
          // Defeat branch predictor for: c < 0x80; branch taken 50% of the time.
          return 0x100;
        } else if (userFriendly.matches("(?i)(?:Greek|Cyrillic|European|ISO.?8859)")) {
          // Mostly 2-byte UTF-8 sequences - "European" text
          return 0x800;
        } else if (userFriendly.matches("(?i)(?:Chinese|Han|Asian|BMP)")) {
          // Mostly 3-byte UTF-8 sequences - "Asian" text
          return Character.MIN_SUPPLEMENTARY_CODE_POINT;
        } else if (userFriendly.matches("(?i)(?:Cuneiform|rare|exotic|supplementary.*)")) {
          // Mostly 4-byte UTF-8 sequences - "rare exotic" text
          return Character.MAX_CODE_POINT;
        } else {
          throw new IllegalArgumentException("Can't decode codepoint " + userFriendly);
        }
      }
    }

    public static MaxCodePoint valueOf(String userFriendly) {
      return new MaxCodePoint(userFriendly);
    }

    public MaxCodePoint(String userFriendly) {
      value = decode(userFriendly);
    }
  }

  /**
   * The default values of maxCodePoint below provide pretty good performance models of different
   * kinds of common human text.
   *
   * @see MaxCodePoint#decode
   */
  @Param({"0x80", "0x90", "0x100", "0x800", "0x10000", "0x10ffff"})
  MaxCodePoint maxCodePoint;

  @Param({"16384"})
  int charCount;

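  // HashFunctionEnum is defined alongside this benchmark in the same package; each constant
  // (MURMUR3_32, MURMUR3_128, SHA1, ...) exposes the corresponding Guava HashFunction via
  // getHashFunction().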
@Param({"MURMUR3_32", "MURMUR3_128", "SHA1"})
|
|
HashFunctionEnum hashFunctionEnum;
|
|
|
|
private String[] strings;
|
|
|
|
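  // SAMPLE_MASK is SAMPLES - 1, so "i & SAMPLE_MASK" in the benchmarks below cycles through the
  // precomputed sample strings without a modulo operation.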
  static final int SAMPLES = 0x100;
  static final int SAMPLE_MASK = 0xFF;

  /**
   * Computes an array of {@code SAMPLES} random, valid unicode Strings, each {@code charCount}
   * code points long, for the benchmark methods to hash.
   */
  @BeforeExperiment
  void setUp() {
    final long seed = 99;
    final Random rnd = new Random(seed);
    strings = new String[SAMPLES];
    for (int i = 0; i < SAMPLES; i++) {
      StringBuilder sb = new StringBuilder();
      for (int j = 0; j < charCount; j++) {
        int codePoint;
        // discard illegal surrogate "codepoints"
        do {
          codePoint = rnd.nextInt(maxCodePoint.value);
        } while (Character.isSurrogate((char) codePoint));
        sb.appendCodePoint(codePoint);
      }
      strings[i] = sb.toString();
    }
  }

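  // Each benchmark feeds the resulting HashCode through System.identityHashCode and accumulates
  // the values into res, which is returned; this keeps the hashing work from being eliminated as
  // dead code without adding measurable per-iteration cost.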
  @Benchmark
  int hashUtf8(int reps) {
    int res = 0;
    for (int i = 0; i < reps; i++) {
      res +=
          System.identityHashCode(
              hashFunctionEnum
                  .getHashFunction()
                  .hashString(strings[i & SAMPLE_MASK], StandardCharsets.UTF_8));
    }
    return res;
  }

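  // The *Hasher variants below exercise the incremental Hasher API (newHasher().putString(...))
  // rather than the one-shot HashFunction.hashString convenience method.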
  @Benchmark
  int hashUtf8Hasher(int reps) {
    int res = 0;
    for (int i = 0; i < reps; i++) {
      res +=
          System.identityHashCode(
              hashFunctionEnum
                  .getHashFunction()
                  .newHasher()
                  .putString(strings[i & SAMPLE_MASK], StandardCharsets.UTF_8)
                  .hash());
    }
    return res;
  }

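  // The *GetBytes variants first encode the string to a fresh UTF-8 byte[] with String.getBytes
  // and then hash the bytes, so they measure the encoding allocation as well as the hashing.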
  @Benchmark
  int hashUtf8GetBytes(int reps) {
    int res = 0;
    for (int i = 0; i < reps; i++) {
      res +=
          System.identityHashCode(
              hashFunctionEnum
                  .getHashFunction()
                  .hashBytes(strings[i & SAMPLE_MASK].getBytes(StandardCharsets.UTF_8)));
    }
    return res;
  }

  @Benchmark
  int hashUtf8GetBytesHasher(int reps) {
    int res = 0;
    for (int i = 0; i < reps; i++) {
      res +=
          System.identityHashCode(
              hashFunctionEnum
                  .getHashFunction()
                  .newHasher()
                  .putBytes(strings[i & SAMPLE_MASK].getBytes(StandardCharsets.UTF_8))
                  .hash());
    }
    return res;
  }
}