720 lines
28 KiB
C++
720 lines
28 KiB
C++
// Copyright 2021 Code Intelligence GmbH
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
// Modified from
|
|
// https://raw.githubusercontent.com/google/atheris/034284dc4bb1ad4f4ab6ba5d34fb4dca7c633660/fuzzed_data_provider.cc
|
|
//
|
|
// Original license and copyright notices:
|
|
//
|
|
// Copyright 2020 Google LLC
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
// Modified from
|
|
// https://github.com/llvm/llvm-project/blob/70de7e0d9a95b7fcd7c105b06bd90fdf4e01f563/compiler-rt/include/fuzzer/FuzzedDataProvider.h
|
|
//
|
|
// Original license and copyright notices:
|
|
//
|
|
//===- FuzzedDataProvider.h - Utility header for fuzz targets ---*- C++ -* ===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
|
|
#include "fuzzed_data_provider.h"

#include <algorithm>
#include <cstdint>
#include <limits>
#include <string>
#include <type_traits>
#include <vector>

#include "absl/strings/str_format.h"
|
|
|
|
namespace {
|
|
|
|
// Read cursor into the fuzzer input that is currently being consumed. It is
// (re)set by FeedFuzzedDataProvider and moved forward by Advance.
const uint8_t *gDataPtr{nullptr};
// Number of input bytes that have not been consumed yet. Bytes are taken from
// the front via Advance and from the back by ConsumeIntegralInRange.
std::size_t gRemainingBytes{0};
|
|
|
|
// Advance by `bytes` bytes in the buffer or stay at the end if it has been
|
|
// consumed.
|
|
void Advance(const std::size_t bytes) {
|
|
if (bytes > gRemainingBytes) {
|
|
gRemainingBytes = 0;
|
|
} else {
|
|
gDataPtr += bytes;
|
|
gRemainingBytes -= bytes;
|
|
}
|
|
}
|
|
|
|
// Makes a java.lang.IllegalArgumentException carrying `message` pending in
// `env`. Callers in this file return to Java right after calling this.
void ThrowIllegalArgumentException(JNIEnv &env, const std::string &message) {
  jclass illegal_argument_exception =
      env.FindClass("java/lang/IllegalArgumentException");
  // FindClass returns nullptr if the class could not be loaded, in which case
  // it has already raised an exception of its own. Handing nullptr to ThrowNew
  // is not valid, so keep that pending exception instead.
  if (illegal_argument_exception == nullptr) return;
  env.ThrowNew(illegal_argument_exception, message.c_str());
}
|
|
|
|
template <typename T>
|
|
struct JniArrayType {};
|
|
|
|
#define JNI_ARRAY_TYPE(lower_case, sentence_case) \
|
|
template <> \
|
|
struct JniArrayType<j##lower_case> { \
|
|
typedef j##lower_case type; \
|
|
typedef j##lower_case##Array array_type; \
|
|
static constexpr array_type (JNIEnv::*kNewArrayFunc)(jsize) = \
|
|
&JNIEnv::New##sentence_case##Array; \
|
|
static constexpr void (JNIEnv::*kSetArrayRegionFunc)( \
|
|
array_type array, jsize start, jsize len, \
|
|
const type *buf) = &JNIEnv::Set##sentence_case##ArrayRegion; \
|
|
};
|
|
|
|
JNI_ARRAY_TYPE(boolean, Boolean);
|
|
JNI_ARRAY_TYPE(byte, Byte);
|
|
JNI_ARRAY_TYPE(short, Short);
|
|
JNI_ARRAY_TYPE(int, Int);
|
|
JNI_ARRAY_TYPE(long, Long);
|
|
|
|
template <typename T>
|
|
typename JniArrayType<T>::array_type JNICALL
|
|
ConsumeIntegralArray(JNIEnv &env, jobject self, jint max_length) {
|
|
if (max_length < 0) {
|
|
ThrowIllegalArgumentException(env, "maxLength must not be negative");
|
|
return nullptr;
|
|
}
|
|
// Arrays of integral types are considered data and thus consumed from the
|
|
// beginning of the buffer.
|
|
std::size_t max_num_bytes = std::min(sizeof(T) * max_length, gRemainingBytes);
|
|
jsize actual_length = max_num_bytes / sizeof(T);
|
|
std::size_t actual_num_bytes = sizeof(T) * actual_length;
|
|
auto array = (env.*(JniArrayType<T>::kNewArrayFunc))(actual_length);
|
|
(env.*(JniArrayType<T>::kSetArrayRegionFunc))(
|
|
array, 0, actual_length, reinterpret_cast<const T *>(gDataPtr));
|
|
Advance(actual_num_bytes);
|
|
return array;
|
|
}
|
|
|
|
template <typename T>
|
|
jbyteArray JNICALL ConsumeRemainingAsArray(JNIEnv &env, jobject self) {
|
|
return ConsumeIntegralArray<T>(env, self, std::numeric_limits<jint>::max());
|
|
}
|
|
|
|
template <typename T>
|
|
T JNICALL ConsumeIntegralInRange(JNIEnv &env, jobject self, T min, T max) {
|
|
if (min > max) {
|
|
ThrowIllegalArgumentException(
|
|
env, absl::StrFormat(
|
|
"Consume*InRange: min must be <= max (got min: %d, max: %d)",
|
|
min, max));
|
|
return 0;
|
|
}
|
|
|
|
uint64_t range = static_cast<uint64_t>(max) - min;
|
|
uint64_t result = 0;
|
|
std::size_t offset = 0;
|
|
|
|
while (offset < 8 * sizeof(T) && (range >> offset) > 0 &&
|
|
gRemainingBytes != 0) {
|
|
--gRemainingBytes;
|
|
result = (result << 8u) | gDataPtr[gRemainingBytes];
|
|
offset += 8;
|
|
}
|
|
|
|
if (range != std::numeric_limits<T>::max())
|
|
// We accept modulo bias in favor of reading a dynamic number of bytes as
|
|
// this would make it harder for the fuzzer to mutate towards values from
|
|
// the table of recent compares.
|
|
result = result % (range + 1);
|
|
|
|
return static_cast<T>(min + result);
|
|
}
|
|
|
|
template <typename T>
|
|
T JNICALL ConsumeIntegral(JNIEnv &env, jobject self) {
|
|
// First generate an unsigned value and then (safely) cast it to a signed
|
|
// integral type. By doing this rather than calling ConsumeIntegralInRange
|
|
// with bounds [signed_min, signed_max], we ensure that there is a direct
|
|
// correspondence between the consumed raw bytes and the result (e.g., 0
|
|
// corresponds to 0 and not to signed_min). This should help mutating
|
|
// towards entries of the table of recent compares.
|
|
using UnsignedT = typename std::make_unsigned<T>::type;
|
|
static_assert(
|
|
std::numeric_limits<UnsignedT>::is_modulo,
|
|
"Unsigned to signed conversion requires modulo-based overflow handling");
|
|
return static_cast<T>(ConsumeIntegralInRange<UnsignedT>(
|
|
env, self, 0, std::numeric_limits<UnsignedT>::max()));
|
|
}
|
|
|
|
// Consumes one byte (if available) and returns its least significant bit.
bool JNICALL ConsumeBool(JNIEnv &env, jobject self) {
  const uint8_t raw_byte = ConsumeIntegral<uint8_t>(env, self);
  return (raw_byte & 1u) != 0;
}
|
|
|
|
// Consumes a jchar. If `filter_surrogates` is set, values in the surrogate
// range [0xd800, 0xe000) are shifted down by 0xd800 so that the result is
// never a surrogate.
jchar ConsumeCharInternal(JNIEnv &env, jobject self, bool filter_surrogates) {
  jchar code_unit = ConsumeIntegral<jchar>(env, self);
  if (!filter_surrogates) return code_unit;

  const bool is_surrogate = code_unit >= 0xd800 && code_unit < 0xe000;
  if (is_surrogate) code_unit -= 0xd800;
  return code_unit;
}
|
|
|
|
// Native implementation of consumeChar(): any jchar value, surrogates
// included.
jchar JNICALL ConsumeChar(JNIEnv &env, jobject self) {
  constexpr bool kFilterSurrogates = false;
  return ConsumeCharInternal(env, self, kFilterSurrogates);
}
|
|
|
|
// Native implementation of consumeCharNoSurrogates(): like ConsumeChar, but
// surrogate values are remapped by ConsumeCharInternal.
jchar JNICALL ConsumeCharNoSurrogates(JNIEnv &env, jobject self) {
  constexpr bool kFilterSurrogates = true;
  return ConsumeCharInternal(env, self, kFilterSurrogates);
}
|
|
|
|
template <typename T>
|
|
T JNICALL ConsumeProbability(JNIEnv &env, jobject self) {
|
|
using IntegralType =
|
|
typename std::conditional<(sizeof(T) <= sizeof(uint32_t)), uint32_t,
|
|
uint64_t>::type;
|
|
T result = static_cast<T>(ConsumeIntegral<IntegralType>(env, self));
|
|
result /= static_cast<T>(std::numeric_limits<IntegralType>::max());
|
|
return result;
|
|
}
|
|
|
|
template <typename T>
|
|
T JNICALL ConsumeFloatInRange(JNIEnv &env, jobject self, T min, T max) {
|
|
if (min > max) {
|
|
ThrowIllegalArgumentException(
|
|
env, absl::StrFormat(
|
|
"Consume*InRange: min must be <= max (got min: %f, max: %f)",
|
|
min, max));
|
|
return 0.0;
|
|
}
|
|
|
|
T range;
|
|
T result = min;
|
|
|
|
// Deal with overflow, in the event min and max are very far apart
|
|
if (min < 0 && max > 0 && min + std::numeric_limits<T>::max() < max) {
|
|
range = (max / 2) - (min / 2);
|
|
if (ConsumeBool(env, self)) {
|
|
result += range;
|
|
}
|
|
} else {
|
|
range = max - min;
|
|
}
|
|
|
|
T probability = ConsumeProbability<T>(env, self);
|
|
return result + range * probability;
|
|
}
|
|
|
|
template <typename T>
|
|
T JNICALL ConsumeRegularFloat(JNIEnv &env, jobject self) {
|
|
return ConsumeFloatInRange(env, self, std::numeric_limits<T>::lowest(),
|
|
std::numeric_limits<T>::max());
|
|
}
|
|
|
|
template <typename T>
|
|
T JNICALL ConsumeFloat(JNIEnv &env, jobject self) {
|
|
if (!gRemainingBytes) return 0.0;
|
|
|
|
auto type_val = ConsumeIntegral<uint8_t>(env, self);
|
|
|
|
if (type_val <= 10) {
|
|
// Consume the same amount of bytes as for a regular float/double
|
|
ConsumeRegularFloat<T>(env, self);
|
|
|
|
switch (type_val) {
|
|
case 0:
|
|
return 0.0;
|
|
case 1:
|
|
return -0.0;
|
|
case 2:
|
|
return std::numeric_limits<T>::infinity();
|
|
case 3:
|
|
return -std::numeric_limits<T>::infinity();
|
|
case 4:
|
|
return std::numeric_limits<T>::quiet_NaN();
|
|
case 5:
|
|
return std::numeric_limits<T>::denorm_min();
|
|
case 6:
|
|
return -std::numeric_limits<T>::denorm_min();
|
|
case 7:
|
|
return std::numeric_limits<T>::min();
|
|
case 8:
|
|
return -std::numeric_limits<T>::min();
|
|
case 9:
|
|
return std::numeric_limits<T>::max();
|
|
case 10:
|
|
return -std::numeric_limits<T>::max();
|
|
default:
|
|
abort();
|
|
}
|
|
}
|
|
|
|
T regular = ConsumeRegularFloat<T>(env, self);
|
|
return regular;
|
|
}
|
|
|
|
// Polyfill for C++20 std::countl_one: returns the number of consecutive one
// bits in `byte`, starting at its most significant bit.
inline __attribute__((always_inline)) uint8_t countl_one(uint8_t byte) {
  const uint8_t inverted = static_cast<uint8_t>(~byte);
  // An all-ones byte inverts to 0, for which the result of __builtin_clz is
  // undefined.
  if (inverted == 0) return 8;
  // __builtin_clz counts over a whole unsigned int, so discount the zero bits
  // that lie above the byte.
  constexpr int kBitsAboveByte = 8 * (sizeof(unsigned int) - sizeof(uint8_t));
  return __builtin_clz(inverted) - kBitsAboveByte;
}
|
|
|
|
// Rewrites `byte` in place into a valid UTF-8 continuation byte (0b10xxxxxx)
// by replacing its two highest bits and keeping the lower six.
inline __attribute__((always_inline)) void ForceContinuationByte(
    uint8_t &byte) {
  constexpr uint8_t kPayloadMask = 0x3F;
  constexpr uint8_t kContinuationHeader = 0x80;
  byte = static_cast<uint8_t>((byte & kPayloadMask) | kContinuationHeader);
}
|
|
|
|
// Leading byte of the two-byte encoding of the zero character
// (0b11000000).
constexpr uint8_t kTwoByteZeroLeadingByte = 0xC0;
// Continuation byte of the two-byte encoding of the zero character
// (0b10000000).
constexpr uint8_t kTwoByteZeroContinuationByte = 0x80;
// Leading byte of a three-byte sequence with no payload bits set
// (0b11100000).
constexpr uint8_t kThreeByteLowLeadingByte = 0xE0;
// Leading byte shared by all three-byte encoded surrogates (0b11101101).
constexpr uint8_t kSurrogateLeadingByte = 0xED;
|
|
|
|
// States of the fix-up state machine in FixUpModifiedUtf8. Each state names
// the kind of byte that is expected next.
enum class Utf8GenerationState {
  // At the start of a new character.
  LeadingByte_Generic,
  // A single backslash has just been skipped; the next byte decides whether
  // the string ends here or a literal backslash is emitted.
  LeadingByte_AfterBackslash,

  // Second byte of a two-byte sequence.
  ContinuationByte_Generic,
  // Second byte of a two-byte sequence whose leading byte has at most its
  // lowest payload bit set.
  ContinuationByte_LowLeadingByte,

  // Second byte of a three-byte sequence started by kThreeByteLowLeadingByte.
  FirstContinuationByte_LowLeadingByte,
  // Second byte of a three-byte sequence started by kSurrogateLeadingByte.
  FirstContinuationByte_SurrogateLeadingByte,
  // Second byte of any other three-byte sequence.
  FirstContinuationByte_Generic,
  // Third byte of a three-byte sequence that is not a high surrogate.
  SecondContinuationByte_Generic,

  // A high surrogate has been completed; the leading byte of the low
  // surrogate has to follow.
  LeadingByte_LowSurrogate,
  // Second byte of the low surrogate.
  FirstContinuationByte_LowSurrogate,
  // Third byte of the high surrogate.
  SecondContinuationByte_HighSurrogate,
  // Third byte of the low surrogate.
  SecondContinuationByte_LowSurrogate,
};
|
|
|
|
// Consumes up to `max_bytes` arbitrary bytes pointed to by `ptr` and returns a
|
|
// valid "modified UTF-8" string of length at most `max_length` that resembles
|
|
// the input bytes as closely as possible as well as the number of consumed
|
|
// bytes. If `stop_on_slash` is true, then the string will end on the first
|
|
// single consumed '\'.
|
|
//
|
|
// "Modified UTF-8" is the string encoding used by the JNI. It is the same as
|
|
// the legacy encoding CESU-8, but with `\0` coded on two bytes. In these
|
|
// encodings, code points requiring 4 bytes in modern UTF-8 are represented as
|
|
// two surrogates, each of which is coded on 3 bytes.
|
|
//
|
|
// This function has been designed with the following goals in mind:
|
|
// 1. The generated string should be biased towards containing ASCII characters
|
|
// as these are often the ones that affect control flow directly.
|
|
// 2. Correctly encoded data (e.g. taken from the table of recent compares)
|
|
// should be emitted unchanged.
|
|
// 3. The raw fuzzer input should be preserved as far as possible, but the
|
|
// output must always be correctly encoded.
|
|
//
|
|
// The JVM accepts string in two encodings: UTF-16 and modified UTF-8.
|
|
// Generating UTF-16 would make it harder to fulfill the first design goal and
|
|
// would potentially hinder compatibility with corpora using the much more
|
|
// widely used UTF-8 encoding, which is reasonably similar to modified UTF-8. As
|
|
// a result, this function uses modified UTF-8.
|
|
//
|
|
// See Algorithm 1 of https://arxiv.org/pdf/2010.03090.pdf for more details on
|
|
// the individual cases involved in determining the validity of a UTF-8 string.
|
|
template <bool ascii_only, bool stop_on_backslash>
|
|
std::pair<std::string, std::size_t> FixUpModifiedUtf8(const uint8_t *data,
|
|
std::size_t max_bytes,
|
|
jint max_length) {
|
|
std::string str;
|
|
// Every character in modified UTF-8 is coded on at most six bytes. Every
|
|
// consumed byte is transformed into at most one code unit, except for the
|
|
// case of a zero byte which requires two bytes.
|
|
if (max_bytes > std::numeric_limits<std::size_t>::max() / 2)
|
|
max_bytes = std::numeric_limits<std::size_t>::max() / 2;
|
|
if (ascii_only) {
|
|
str.reserve(
|
|
std::min(2 * static_cast<std::size_t>(max_length), 2 * max_bytes));
|
|
} else {
|
|
str.reserve(
|
|
std::min(6 * static_cast<std::size_t>(max_length), 2 * max_bytes));
|
|
}
|
|
|
|
Utf8GenerationState state = Utf8GenerationState::LeadingByte_Generic;
|
|
const uint8_t *pos = data;
|
|
const auto data_end = data + max_bytes;
|
|
for (std::size_t length = 0; length < max_length && pos != data_end; ++pos) {
|
|
uint8_t c = *pos;
|
|
if (ascii_only) {
|
|
// Clamp to 7-bit ASCII range.
|
|
c &= 0x7Fu;
|
|
}
|
|
// Fix up c or previously read bytes according to the value of c and the
|
|
// current state. In the end, add the fixed up code unit c to the string.
|
|
// Exception: The zero character has to be coded on two bytes and is the
|
|
// only case in which an iteration of the loop adds two code units.
|
|
switch (state) {
|
|
case Utf8GenerationState::LeadingByte_Generic: {
|
|
switch (ascii_only ? 0 : countl_one(c)) {
|
|
case 0: {
|
|
// valid - 1-byte code point (ASCII)
|
|
// The zero character has to be coded on two bytes in modified
|
|
// UTF-8.
|
|
if (c == 0) {
|
|
str += static_cast<char>(kTwoByteZeroLeadingByte);
|
|
c = kTwoByteZeroContinuationByte;
|
|
} else if (stop_on_backslash && c == '\\') {
|
|
state = Utf8GenerationState::LeadingByte_AfterBackslash;
|
|
// The slash either signals the end of the string or is skipped,
|
|
// so don't append anything.
|
|
continue;
|
|
}
|
|
// Remain in state LeadingByte.
|
|
++length;
|
|
break;
|
|
}
|
|
case 1: {
|
|
// invalid - continuation byte at leader byte position
|
|
// Fix it up to be of the form 0b110XXXXX and fall through to the
|
|
// case of a 2-byte sequence.
|
|
c |= 1u << 6u;
|
|
c &= ~(1u << 5u);
|
|
[[fallthrough]];
|
|
}
|
|
case 2: {
|
|
// (most likely) valid - start of a 2-byte sequence
|
|
// ASCII characters must be coded on a single byte, so we must
|
|
// ensure that the lower two bits combined with the six non-header
|
|
// bits of the following byte do not form a 7-bit ASCII value. This
|
|
// could only be the case if at most the lowest bit is set.
|
|
if ((c & 0b00011110u) == 0) {
|
|
state = Utf8GenerationState::ContinuationByte_LowLeadingByte;
|
|
} else {
|
|
state = Utf8GenerationState::ContinuationByte_Generic;
|
|
}
|
|
break;
|
|
}
|
|
// The default case falls through to the case of three leading ones
|
|
// coming right after.
|
|
default: {
|
|
// invalid - at least four leading ones
|
|
// In the case of exactly four leading ones, this would be valid
|
|
// UTF-8, but is not valid in the JVM's modified UTF-8 encoding.
|
|
// Fix it up by clearing the fourth leading one and falling through
|
|
// to the 3-byte case.
|
|
c &= ~(1u << 4u);
|
|
[[fallthrough]];
|
|
}
|
|
case 3: {
|
|
// valid - start of a 3-byte sequence
|
|
if (c == kThreeByteLowLeadingByte) {
|
|
state = Utf8GenerationState::FirstContinuationByte_LowLeadingByte;
|
|
} else if (c == kSurrogateLeadingByte) {
|
|
state = Utf8GenerationState::
|
|
FirstContinuationByte_SurrogateLeadingByte;
|
|
} else {
|
|
state = Utf8GenerationState::FirstContinuationByte_Generic;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case Utf8GenerationState::LeadingByte_AfterBackslash: {
|
|
if (c != '\\') {
|
|
// Mark the current byte as consumed.
|
|
++pos;
|
|
goto done;
|
|
}
|
|
// A double backslash is consumed as a single one. As we skipped the
|
|
// first one, emit the second one as usual.
|
|
state = Utf8GenerationState::LeadingByte_Generic;
|
|
++length;
|
|
break;
|
|
}
|
|
case Utf8GenerationState::ContinuationByte_LowLeadingByte: {
|
|
ForceContinuationByte(c);
|
|
// Preserve the zero character, which is coded on two bytes in modified
|
|
// UTF-8. In all other cases ensure that we are not incorrectly encoding
|
|
// an ASCII character on two bytes by setting the eigth least
|
|
// significant bit of the encoded value (second least significant bit of
|
|
// the leading byte).
|
|
auto previous_c = static_cast<uint8_t>(str.back());
|
|
if (previous_c != kTwoByteZeroLeadingByte ||
|
|
c != kTwoByteZeroContinuationByte) {
|
|
str.back() = static_cast<char>(previous_c | (1u << 1u));
|
|
}
|
|
state = Utf8GenerationState::LeadingByte_Generic;
|
|
++length;
|
|
break;
|
|
}
|
|
case Utf8GenerationState::ContinuationByte_Generic: {
|
|
ForceContinuationByte(c);
|
|
state = Utf8GenerationState::LeadingByte_Generic;
|
|
++length;
|
|
break;
|
|
}
|
|
case Utf8GenerationState::FirstContinuationByte_LowLeadingByte: {
|
|
ForceContinuationByte(c);
|
|
// Ensure that the current code point could not have been coded on two
|
|
// bytes. As two bytes encode up to 11 bits and three bytes encode up
|
|
// to 16 bits, we thus have to make it such that the five highest bits
|
|
// are not all zero. Four of these bits are the non-header bits of the
|
|
// leader byte. Thus, set the highest non-header bit in this byte (fifth
|
|
// highest in the encoded value).
|
|
c |= 1u << 5u;
|
|
state = Utf8GenerationState::SecondContinuationByte_Generic;
|
|
break;
|
|
}
|
|
case Utf8GenerationState::FirstContinuationByte_SurrogateLeadingByte: {
|
|
ForceContinuationByte(c);
|
|
if (c & (1u << 5u)) {
|
|
// Start with a high surrogate (0xD800-0xDBFF). c contains the second
|
|
// byte and the first two bits of the third byte. The first two bits
|
|
// of this second byte are fixed to 10 (in 0x8-0xB).
|
|
c |= 1u << 5u;
|
|
c &= ~(1u << 4u);
|
|
// The high surrogate must be followed by a low surrogate.
|
|
state = Utf8GenerationState::SecondContinuationByte_HighSurrogate;
|
|
} else {
|
|
state = Utf8GenerationState::SecondContinuationByte_Generic;
|
|
}
|
|
break;
|
|
}
|
|
case Utf8GenerationState::FirstContinuationByte_Generic: {
|
|
ForceContinuationByte(c);
|
|
state = Utf8GenerationState::SecondContinuationByte_Generic;
|
|
break;
|
|
}
|
|
case Utf8GenerationState::SecondContinuationByte_HighSurrogate: {
|
|
ForceContinuationByte(c);
|
|
state = Utf8GenerationState::LeadingByte_LowSurrogate;
|
|
++length;
|
|
break;
|
|
}
|
|
case Utf8GenerationState::SecondContinuationByte_LowSurrogate:
|
|
case Utf8GenerationState::SecondContinuationByte_Generic: {
|
|
ForceContinuationByte(c);
|
|
state = Utf8GenerationState::LeadingByte_Generic;
|
|
++length;
|
|
break;
|
|
}
|
|
case Utf8GenerationState::LeadingByte_LowSurrogate: {
|
|
// We have to emit a low surrogate leading byte, which is a fixed value.
|
|
// We still consume a byte from the input to make fuzzer changes more
|
|
// stable and preserve valid surrogate pairs picked up from e.g. the
|
|
// table of recent compares.
|
|
c = kSurrogateLeadingByte;
|
|
state = Utf8GenerationState::FirstContinuationByte_LowSurrogate;
|
|
break;
|
|
}
|
|
case Utf8GenerationState::FirstContinuationByte_LowSurrogate: {
|
|
ForceContinuationByte(c);
|
|
// Low surrogates are code points in the range 0xDC00-0xDFFF. c contains
|
|
// the second byte and the first two bits of the third byte. The first
|
|
// two bits of this second byte are fixed to 11 (in 0xC-0xF).
|
|
c |= (1u << 5u) | (1u << 4u);
|
|
// The second continuation byte of a low surrogate is not restricted,
|
|
// but we need to track it differently to allow for correct backtracking
|
|
// if it isn't completed.
|
|
state = Utf8GenerationState::SecondContinuationByte_LowSurrogate;
|
|
break;
|
|
}
|
|
}
|
|
str += static_cast<uint8_t>(c);
|
|
}
|
|
|
|
// Backtrack the current incomplete character.
|
|
switch (state) {
|
|
case Utf8GenerationState::SecondContinuationByte_LowSurrogate:
|
|
str.pop_back();
|
|
[[fallthrough]];
|
|
case Utf8GenerationState::FirstContinuationByte_LowSurrogate:
|
|
str.pop_back();
|
|
[[fallthrough]];
|
|
case Utf8GenerationState::LeadingByte_LowSurrogate:
|
|
str.pop_back();
|
|
[[fallthrough]];
|
|
case Utf8GenerationState::SecondContinuationByte_Generic:
|
|
case Utf8GenerationState::SecondContinuationByte_HighSurrogate:
|
|
str.pop_back();
|
|
[[fallthrough]];
|
|
case Utf8GenerationState::ContinuationByte_Generic:
|
|
case Utf8GenerationState::ContinuationByte_LowLeadingByte:
|
|
case Utf8GenerationState::FirstContinuationByte_Generic:
|
|
case Utf8GenerationState::FirstContinuationByte_LowLeadingByte:
|
|
case Utf8GenerationState::FirstContinuationByte_SurrogateLeadingByte:
|
|
str.pop_back();
|
|
[[fallthrough]];
|
|
case Utf8GenerationState::LeadingByte_Generic:
|
|
case Utf8GenerationState::LeadingByte_AfterBackslash:
|
|
// No backtracking required.
|
|
break;
|
|
}
|
|
|
|
done:
|
|
return std::make_pair(str, pos - data);
|
|
}
|
|
} // namespace
|
|
|
|
namespace jazzer {
|
|
// Exposed for testing only.
|
|
std::pair<std::string, std::size_t> FixUpModifiedUtf8(const uint8_t *data,
|
|
std::size_t max_bytes,
|
|
jint max_length,
|
|
bool ascii_only,
|
|
bool stop_on_backslash) {
|
|
if (ascii_only) {
|
|
if (stop_on_backslash) {
|
|
return ::FixUpModifiedUtf8<true, true>(data, max_bytes, max_length);
|
|
} else {
|
|
return ::FixUpModifiedUtf8<true, false>(data, max_bytes, max_length);
|
|
}
|
|
} else {
|
|
if (stop_on_backslash) {
|
|
return ::FixUpModifiedUtf8<false, true>(data, max_bytes, max_length);
|
|
} else {
|
|
return ::FixUpModifiedUtf8<false, false>(data, max_bytes, max_length);
|
|
}
|
|
}
|
|
}
|
|
} // namespace jazzer
|
|
|
|
namespace {
|
|
// Shared implementation of the string-consuming natives. Turns the remaining
// input into a Java string of at most `max_length` length via
// jazzer::FixUpModifiedUtf8 and advances past the bytes that were used.
//
// Throws IllegalArgumentException (and returns nullptr) if `max_length` is
// negative. Returns the empty string if `max_length` is 0 or at most one
// byte of input remains; a single remaining byte is consumed in that case.
jstring ConsumeStringInternal(JNIEnv &env, jint max_length, bool ascii_only,
                              bool stop_on_backslash) {
  if (max_length < 0) {
    ThrowIllegalArgumentException(env, "maxLength must not be negative");
    return nullptr;
  }

  const bool nothing_to_consume = max_length == 0 || gRemainingBytes == 0;
  if (nothing_to_consume) {
    return env.NewStringUTF("");
  }

  if (gRemainingBytes == 1) {
    Advance(1);
    return env.NewStringUTF("");
  }

  // first: the fixed-up string, second: the number of consumed bytes.
  const std::pair<std::string, std::size_t> fixed_up =
      jazzer::FixUpModifiedUtf8(gDataPtr, gRemainingBytes, max_length,
                                ascii_only, stop_on_backslash);
  Advance(fixed_up.second);
  return env.NewStringUTF(fixed_up.first.c_str());
}
|
|
|
|
// Native implementation of consumeAsciiString(int): ASCII-only string of at
// most `max_length` length that ends at a single backslash.
jstring JNICALL ConsumeAsciiString(JNIEnv &env, jobject self, jint max_length) {
  constexpr bool kAsciiOnly = true;
  constexpr bool kStopOnBackslash = true;
  return ConsumeStringInternal(env, max_length, kAsciiOnly, kStopOnBackslash);
}
|
|
|
|
// Native implementation of consumeString(int): string of at most
// `max_length` length that ends at a single backslash.
jstring JNICALL ConsumeString(JNIEnv &env, jobject self, jint max_length) {
  constexpr bool kAsciiOnly = false;
  constexpr bool kStopOnBackslash = true;
  return ConsumeStringInternal(env, max_length, kAsciiOnly, kStopOnBackslash);
}
|
|
|
|
// Native implementation of consumeRemainingAsAsciiString(): ASCII-only
// string limited only by the maximum jint value; backslashes are not treated
// specially.
jstring JNICALL ConsumeRemainingAsAsciiString(JNIEnv &env, jobject self) {
  constexpr jint kNoLengthLimit = std::numeric_limits<jint>::max();
  constexpr bool kAsciiOnly = true;
  constexpr bool kStopOnBackslash = false;
  return ConsumeStringInternal(env, kNoLengthLimit, kAsciiOnly,
                               kStopOnBackslash);
}
|
|
|
|
// Native implementation of consumeRemainingAsString(): string limited only
// by the maximum jint value; backslashes are not treated specially.
jstring JNICALL ConsumeRemainingAsString(JNIEnv &env, jobject self) {
  constexpr jint kNoLengthLimit = std::numeric_limits<jint>::max();
  constexpr bool kAsciiOnly = false;
  constexpr bool kStopOnBackslash = false;
  return ConsumeStringInternal(env, kNoLengthLimit, kAsciiOnly,
                               kStopOnBackslash);
}
|
|
|
|
std::size_t RemainingBytes(JNIEnv &env, jobject self) {
|
|
return gRemainingBytes;
|
|
}
|
|
|
|
const JNINativeMethod kFuzzedDataMethods[]{
|
|
{(char *)"consumeBoolean", (char *)"()Z", (void *)&ConsumeBool},
|
|
{(char *)"consumeByte", (char *)"()B", (void *)&ConsumeIntegral<jbyte>},
|
|
{(char *)"consumeByte", (char *)"(BB)B",
|
|
(void *)&ConsumeIntegralInRange<jbyte>},
|
|
{(char *)"consumeShort", (char *)"()S", (void *)&ConsumeIntegral<jshort>},
|
|
{(char *)"consumeShort", (char *)"(SS)S",
|
|
(void *)&ConsumeIntegralInRange<jshort>},
|
|
{(char *)"consumeInt", (char *)"()I", (void *)&ConsumeIntegral<jint>},
|
|
{(char *)"consumeInt", (char *)"(II)I",
|
|
(void *)&ConsumeIntegralInRange<jint>},
|
|
{(char *)"consumeLong", (char *)"()J", (void *)&ConsumeIntegral<jlong>},
|
|
{(char *)"consumeLong", (char *)"(JJ)J",
|
|
(void *)&ConsumeIntegralInRange<jlong>},
|
|
{(char *)"consumeFloat", (char *)"()F", (void *)&ConsumeFloat<jfloat>},
|
|
{(char *)"consumeRegularFloat", (char *)"()F",
|
|
(void *)&ConsumeRegularFloat<jfloat>},
|
|
{(char *)"consumeRegularFloat", (char *)"(FF)F",
|
|
(void *)&ConsumeFloatInRange<jfloat>},
|
|
{(char *)"consumeProbabilityFloat", (char *)"()F",
|
|
(void *)&ConsumeProbability<jfloat>},
|
|
{(char *)"consumeDouble", (char *)"()D", (void *)&ConsumeFloat<jdouble>},
|
|
{(char *)"consumeRegularDouble", (char *)"()D",
|
|
(void *)&ConsumeRegularFloat<jdouble>},
|
|
{(char *)"consumeRegularDouble", (char *)"(DD)D",
|
|
(void *)&ConsumeFloatInRange<jdouble>},
|
|
{(char *)"consumeProbabilityDouble", (char *)"()D",
|
|
(void *)&ConsumeProbability<jdouble>},
|
|
{(char *)"consumeChar", (char *)"()C", (void *)&ConsumeChar},
|
|
{(char *)"consumeChar", (char *)"(CC)C",
|
|
(void *)&ConsumeIntegralInRange<jchar>},
|
|
{(char *)"consumeCharNoSurrogates", (char *)"()C",
|
|
(void *)&ConsumeCharNoSurrogates},
|
|
{(char *)"consumeAsciiString", (char *)"(I)Ljava/lang/String;",
|
|
(void *)&ConsumeAsciiString},
|
|
{(char *)"consumeRemainingAsAsciiString", (char *)"()Ljava/lang/String;",
|
|
(void *)&ConsumeRemainingAsAsciiString},
|
|
{(char *)"consumeString", (char *)"(I)Ljava/lang/String;",
|
|
(void *)&ConsumeString},
|
|
{(char *)"consumeRemainingAsString", (char *)"()Ljava/lang/String;",
|
|
(void *)&ConsumeRemainingAsString},
|
|
{(char *)"consumeBooleans", (char *)"(I)[Z",
|
|
(void *)&ConsumeIntegralArray<jboolean>},
|
|
{(char *)"consumeBytes", (char *)"(I)[B",
|
|
(void *)&ConsumeIntegralArray<jbyte>},
|
|
{(char *)"consumeShorts", (char *)"(I)[S",
|
|
(void *)&ConsumeIntegralArray<jshort>},
|
|
{(char *)"consumeInts", (char *)"(I)[I",
|
|
(void *)&ConsumeIntegralArray<jint>},
|
|
{(char *)"consumeLongs", (char *)"(I)[J",
|
|
(void *)&ConsumeIntegralArray<jlong>},
|
|
{(char *)"consumeRemainingAsBytes", (char *)"()[B",
|
|
(void *)&ConsumeRemainingAsArray<jbyte>},
|
|
{(char *)"remainingBytes", (char *)"()I", (void *)&RemainingBytes},
|
|
};
|
|
const jint kNumFuzzedDataMethods =
|
|
sizeof(kFuzzedDataMethods) / sizeof(kFuzzedDataMethods[0]);
|
|
} // namespace
|
|
|
|
namespace jazzer {
|
|
|
|
// Registers the native methods in kFuzzedDataMethods with the class named by
// kFuzzedDataProviderImplClass.
//
// Throws std::runtime_error if the class cannot be found or the registration
// fails. A pending Java exception is printed via ExceptionDescribe first.
void SetUpFuzzedDataProvider(JNIEnv &env) {
  jclass fuzzed_data_provider_class =
      env.FindClass(kFuzzedDataProviderImplClass);
  if (fuzzed_data_provider_class == nullptr || env.ExceptionCheck()) {
    env.ExceptionDescribe();
    throw std::runtime_error("failed to find FuzzedDataProviderImpl class");
  }

  // RegisterNatives reports failure through its return value; also check for
  // a pending exception as before.
  const jint register_result = env.RegisterNatives(
      fuzzed_data_provider_class, kFuzzedDataMethods, kNumFuzzedDataMethods);
  const bool registration_failed =
      register_result != JNI_OK || env.ExceptionCheck();
  // The class reference is not needed past this point. DeleteLocalRef may be
  // called while an exception is pending.
  env.DeleteLocalRef(fuzzed_data_provider_class);
  if (registration_failed) {
    env.ExceptionDescribe();
    throw std::runtime_error(
        "could not register native callbacks for FuzzedDataProvider");
  }
}
|
|
|
|
// Points the provider at a new fuzzer input of `size` bytes starting at
// `data`. Only the pointer is stored, no copy of the input is made.
void FeedFuzzedDataProvider(const uint8_t *data, std::size_t size) {
  gRemainingBytes = size;
  gDataPtr = data;
}
|
|
} // namespace jazzer
|