321 lines
11 KiB
C++
321 lines
11 KiB
C++
// Copyright 2016 PDFium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
|
|
|
|
#include "core/fpdftext/cpdf_linkextract.h"
|
|
|
|
#include <vector>
|
|
|
|
#include "core/fpdftext/cpdf_textpage.h"
|
|
#include "core/fxcrt/fx_extension.h"
|
|
#include "core/fxcrt/fx_string.h"
|
|
#include "core/fxcrt/fx_system.h"
|
|
|
|
namespace {
|
|
|
|
// Find the end of a web link starting from offset |start| and ending at offset
|
|
// |end|. The purpose of this function is to separate url from the surrounding
|
|
// context characters, we do not intend to fully validate the url. |str|
|
|
// contains lower case characters only.
|
|
size_t FindWebLinkEnding(const WideString& str, size_t start, size_t end) {
|
|
if (str.Contains(L'/', start)) {
|
|
// When there is a path and query after '/', most ASCII chars are allowed.
|
|
// We don't sanitize in this case.
|
|
return end;
|
|
}
|
|
|
|
// When there is no path, it only has IP address or host name.
|
|
// Port is optional at the end.
|
|
if (str[start] == L'[') {
|
|
// IPv6 reference.
|
|
// Find the end of the reference.
|
|
auto result = str.Find(L']', start + 1);
|
|
if (result.has_value()) {
|
|
end = result.value();
|
|
if (end > start + 1) { // Has content inside brackets.
|
|
size_t len = str.GetLength();
|
|
size_t off = end + 1;
|
|
if (off < len && str[off] == L':') {
|
|
off++;
|
|
while (off < len && FXSYS_IsDecimalDigit(str[off]))
|
|
off++;
|
|
if (off > end + 2 &&
|
|
off <= len) // At least one digit in port number.
|
|
end = off - 1; // |off| is offset of the first invalid char.
|
|
}
|
|
}
|
|
}
|
|
return end;
|
|
}
|
|
|
|
// According to RFC1123, host name only has alphanumeric chars, hyphens,
|
|
// and periods. Hyphen should not at the end though.
|
|
// Non-ASCII chars are ignored during checking.
|
|
while (end > start && str[end] < 0x80) {
|
|
if (FXSYS_IsDecimalDigit(str[end]) ||
|
|
(str[end] >= L'a' && str[end] <= L'z') || str[end] == L'.') {
|
|
break;
|
|
}
|
|
end--;
|
|
}
|
|
return end;
|
|
}
|
|
|
|
// Remove characters from the end of |str|, delimited by |start| and |end|, up
|
|
// to and including |charToFind|. No-op if |charToFind| is not present. Updates
|
|
// |end| if characters were removed.
|
|
void TrimBackwardsToChar(const WideString& str,
|
|
wchar_t charToFind,
|
|
size_t start,
|
|
size_t* end) {
|
|
for (size_t pos = *end; pos >= start; pos--) {
|
|
if (str[pos] == charToFind) {
|
|
*end = pos - 1;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Finds opening brackets ()[]{}<> and quotes "' before the URL delimited by
|
|
// |start| and |end| in |str|. Matches a closing bracket or quote for each
|
|
// opening character and, if present, removes everything afterwards. Returns the
|
|
// new end position for the string.
|
|
size_t TrimExternalBracketsFromWebLink(const WideString& str,
|
|
size_t start,
|
|
size_t end) {
|
|
for (size_t pos = 0; pos < start; pos++) {
|
|
if (str[pos] == '(') {
|
|
TrimBackwardsToChar(str, ')', start, &end);
|
|
} else if (str[pos] == '[') {
|
|
TrimBackwardsToChar(str, ']', start, &end);
|
|
} else if (str[pos] == '{') {
|
|
TrimBackwardsToChar(str, '}', start, &end);
|
|
} else if (str[pos] == '<') {
|
|
TrimBackwardsToChar(str, '>', start, &end);
|
|
} else if (str[pos] == '"') {
|
|
TrimBackwardsToChar(str, '"', start, &end);
|
|
} else if (str[pos] == '\'') {
|
|
TrimBackwardsToChar(str, '\'', start, &end);
|
|
}
|
|
}
|
|
return end;
|
|
}
|
|
|
|
} // namespace
|
|
|
|
// Creates a link extractor bound to |pTextPage|. The pointer is stored in
// |m_pTextPage| and is read later by ExtractLinks() and GetRects().
CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
    : m_pTextPage(pTextPage) {}
|
|
|
|
// Nothing beyond member destruction is required here.
CPDF_LinkExtract::~CPDF_LinkExtract() = default;
|
|
|
|
void CPDF_LinkExtract::ExtractLinks() {
|
|
m_LinkArray.clear();
|
|
int start = 0;
|
|
int pos = 0;
|
|
bool bAfterHyphen = false;
|
|
bool bLineBreak = false;
|
|
const int nTotalChar = m_pTextPage->CountChars();
|
|
const WideString page_text = m_pTextPage->GetAllPageText();
|
|
while (pos < nTotalChar) {
|
|
const CPDF_TextPage::CharInfo& char_info = m_pTextPage->GetCharInfo(pos);
|
|
if (char_info.m_CharType != CPDF_TextPage::CharType::kGenerated &&
|
|
char_info.m_Unicode != L' ' && pos != nTotalChar - 1) {
|
|
bAfterHyphen =
|
|
(char_info.m_CharType == CPDF_TextPage::CharType::kHyphen ||
|
|
(char_info.m_CharType == CPDF_TextPage::CharType::kNormal &&
|
|
char_info.m_Unicode == L'-'));
|
|
++pos;
|
|
continue;
|
|
}
|
|
|
|
int nCount = pos - start;
|
|
if (pos == nTotalChar - 1) {
|
|
++nCount;
|
|
} else if (bAfterHyphen &&
|
|
(char_info.m_Unicode == L'\n' || char_info.m_Unicode == L'\r')) {
|
|
// Handle text breaks with a hyphen to the next line.
|
|
bLineBreak = true;
|
|
++pos;
|
|
continue;
|
|
}
|
|
|
|
WideString strBeCheck = page_text.Substr(start, nCount);
|
|
if (bLineBreak) {
|
|
strBeCheck.Remove(L'\n');
|
|
strBeCheck.Remove(L'\r');
|
|
bLineBreak = false;
|
|
}
|
|
// Replace the generated code with the hyphen char.
|
|
strBeCheck.Replace(L"\xfffe", L"-");
|
|
|
|
if (strBeCheck.GetLength() > 5) {
|
|
while (strBeCheck.GetLength() > 0) {
|
|
wchar_t ch = strBeCheck.Back();
|
|
if (ch != L')' && ch != L',' && ch != L'>' && ch != L'.')
|
|
break;
|
|
|
|
strBeCheck = strBeCheck.First(strBeCheck.GetLength() - 1);
|
|
nCount--;
|
|
}
|
|
|
|
// Check for potential web URLs and email addresses.
|
|
// Ftp address, file system links, data, blob etc. are not checked.
|
|
if (nCount > 5) {
|
|
int32_t nStartOffset;
|
|
int32_t nCountOverload;
|
|
if (CheckWebLink(&strBeCheck, &nStartOffset, &nCountOverload)) {
|
|
m_LinkArray.push_back(
|
|
{start + nStartOffset, nCountOverload, strBeCheck});
|
|
} else if (CheckMailLink(&strBeCheck)) {
|
|
m_LinkArray.push_back({start, nCount, strBeCheck});
|
|
}
|
|
}
|
|
}
|
|
start = ++pos;
|
|
}
|
|
}
|
|
|
|
// Looks for a web link inside |strBeCheck|. Matching is done on a lower-cased
// copy of the text. Two forms are recognised, in this order:
//   1. "http://..." or "https://..." with a non-empty host part.
//   2. "www...." without a scheme; "http://" is prepended to the result.
// On success returns true, replaces |strBeCheck| with the extracted URL and
// stores the offset and length of the link within the original text in
// |nStart| and |nCount|. Returns false, leaving the outputs untouched, when
// neither form matches.
bool CPDF_LinkExtract::CheckWebLink(WideString* strBeCheck,
                                    int32_t* nStart,
                                    int32_t* nCount) {
  static const wchar_t kHttpScheme[] = L"http";
  static const wchar_t kWWWAddrStart[] = L"www.";

  const size_t kHttpSchemeLen = FXSYS_len(kHttpScheme);
  const size_t kWWWAddrStartLen = FXSYS_len(kWWWAddrStart);

  WideString lowered = *strBeCheck;
  lowered.MakeLower();
  const size_t len = lowered.GetLength();

  // First, try to find the scheme.
  auto scheme_pos = lowered.Find(kHttpScheme);
  if (scheme_pos.has_value()) {
    const size_t link_start = scheme_pos.value();
    size_t cursor = link_start + kHttpSchemeLen;  // Just past "http".
    if (len > cursor + 4) {  // At least "://<char>" follows.
      if (lowered[cursor] == L's')  // "https" scheme is accepted.
        ++cursor;

      const bool has_scheme_separator = lowered[cursor] == L':' &&
                                        lowered[cursor + 1] == L'/' &&
                                        lowered[cursor + 2] == L'/';
      if (has_scheme_separator) {
        cursor += 3;
        size_t link_end =
            TrimExternalBracketsFromWebLink(lowered, link_start, len - 1);
        link_end = FindWebLinkEnding(lowered, cursor, link_end);
        if (link_end > cursor) {  // Non-empty host name.
          *nStart = link_start;
          *nCount = link_end - link_start + 1;
          *strBeCheck = strBeCheck->Substr(*nStart, *nCount);
          return true;
        }
      }
    }
  }

  // When there is no scheme, try to find url starting with "www.".
  auto www_pos = lowered.Find(kWWWAddrStart);
  if (!www_pos.has_value())
    return false;

  const size_t www_start = www_pos.value();
  if (len <= www_start + kWWWAddrStartLen)
    return false;  // Nothing follows "www.".

  size_t www_end = TrimExternalBracketsFromWebLink(lowered, www_start, len - 1);
  www_end = FindWebLinkEnding(lowered, www_start, www_end);
  if (www_end <= www_start + kWWWAddrStartLen)
    return false;  // Nothing left after "www." once the ending is trimmed.

  *nStart = www_start;
  *nCount = www_end - www_start + 1;
  *strBeCheck = L"http://" + strBeCheck->Substr(*nStart, *nCount);
  return true;
}
|
|
|
|
// Checks whether |str| contains an e-mail address around its first '@'.
// The local part is scanned backwards from the '@' and the domain part
// forwards; surrounding text that cannot belong to the address is trimmed
// from |str|. On success returns true and |str| holds the address, prefixed
// with "mailto:" unless that text is already present. Returns false when no
// acceptable address is found; |str| may already have been trimmed then.
bool CPDF_LinkExtract::CheckMailLink(WideString* str) {
  auto at_pos = str->Find(L'@');
  // Invalid when no '@' or when starts/ends with '@'.
  if (!at_pos.has_value())
    return false;
  if (at_pos.value() == 0 || at_pos == str->GetLength() - 1)
    return false;

  // Check the local part, walking from the '@' towards the beginning.
  // |next| is one past the index of the character being examined.
  const size_t local_at = at_pos.value();
  size_t last_separator = local_at;  // Position of '@' or most recent '.'.
  for (size_t next = local_at; next > 0; --next) {
    const wchar_t ch = (*str)[next - 1];
    if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch))
      continue;

    // A '.' is acceptable unless it directly precedes the previous separator
    // or is the very first character of the string.
    const bool is_valid_dot =
        ch == L'.' && next != last_separator && next != 1;
    if (is_valid_dot) {
      last_separator = next - 1;
      continue;
    }

    // There is '.' or invalid char right before '@'.
    if (next == local_at)
      return false;

    // End extracting for other invalid chars, '.' at the beginning, or
    // consecutive '.': drop everything up to the offending position.
    const size_t removed_len = (next == last_separator) ? next + 1 : next;
    *str = str->Last(str->GetLength() - removed_len);
    break;
  }

  // Check the domain name part. The '@' may have moved after trimming.
  at_pos = str->Find(L'@');
  if (!at_pos.has_value() || at_pos.value() == 0)
    return false;
  const size_t domain_at = at_pos.value();

  str->TrimRight(L'.');
  // At least one '.' in domain name, but not at the beginning.
  // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
  // Check whether we should remove this check.
  auto first_dot = str->Find(L'.', domain_at + 1);
  if (!first_dot.has_value() || first_dot.value() == domain_at + 1)
    return false;

  // Validate all other chars in domain name.
  const size_t total_len = str->GetLength();
  size_t last_dot = 0;  // Position of the most recent '.'; 0 means none yet.
  for (size_t i = domain_at + 1; i < total_len; ++i) {
    const wchar_t wch = (*str)[i];
    if (wch == L'-' || FXSYS_iswalnum(wch))
      continue;

    const bool follows_dot = i == last_dot + 1;
    if (wch == L'.' && !follows_dot) {
      last_dot = i;
      continue;
    }

    // Domain name should end before the invalid char (and before the '.'
    // that precedes it, if any).
    const size_t host_end = follows_dot ? i - 2 : i - 1;
    // Only trim when there is at least one '.' and a name; otherwise reject.
    if (last_dot == 0 || host_end - domain_at < 3)
      return false;

    *str = str->First(host_end + 1);
    break;
  }

  if (!str->Contains(L"mailto:"))
    *str = L"mailto:" + *str;

  return true;
}
|
|
|
|
// Returns the URL of the link at |index|, or an empty string when |index| is
// out of range.
WideString CPDF_LinkExtract::GetURL(size_t index) const {
  if (index >= m_LinkArray.size())
    return WideString();

  return m_LinkArray[index].m_strUrl;
}
|
|
|
|
std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const {
|
|
if (index >= m_LinkArray.size())
|
|
return std::vector<CFX_FloatRect>();
|
|
|
|
return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start,
|
|
m_LinkArray[index].m_Count);
|
|
}
|
|
|
|
bool CPDF_LinkExtract::GetTextRange(size_t index,
|
|
int* start_char_index,
|
|
int* char_count) const {
|
|
if (index >= m_LinkArray.size())
|
|
return false;
|
|
*start_char_index = m_LinkArray[index].m_Start;
|
|
*char_count = m_LinkArray[index].m_Count;
|
|
return true;
|
|
}
|