move all url's functions to FunctionsURL

This commit is contained in:
artpaul 2016-12-15 17:05:05 +05:00
parent 6694c14338
commit 785a98e624
4 changed files with 177 additions and 155 deletions

View File

@ -1,14 +0,0 @@
#pragma once
#include <DB/Common/StringView.h>
/// Percent decode of url data.
std::string decodeUrl(const StringView & url);
/// Extracts scheme from given url.
StringView getUrlScheme(const StringView & url);
/// Extracts host from given url.
StringView getUrlHost(const StringView & url);

View File

@ -3,7 +3,8 @@
#include <DB/DataTypes/DataTypeString.h>
#include <DB/Columns/ColumnString.h>
#include <DB/Columns/ColumnConst.h>
#include <DB/Common/URLUtils.h>
#include <DB/Common/StringUtils.h>
#include <DB/Common/StringView.h>
#include <DB/Functions/FunctionsString.h>
#include <DB/Functions/FunctionsStringSearch.h>
#include <DB/Functions/FunctionsStringArray.h>
@ -58,24 +59,70 @@ namespace DB
using Pos = const char *;
/// Extracts scheme from given url.
inline StringView getUrlScheme(const StringView & url)
{
// scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
const char* p = url.data();
const char* end = url.data() + url.size();
if (isAlphaASCII(*p))
{
for (++p; p < end; ++p)
{
if (!(isAlphaNumericASCII(*p) || *p == '+' || *p == '-' || *p == '.'))
{
break;
}
}
return StringView(url.data(), p - url.data());
}
return StringView();
}
/// Extracts host from given url.
inline StringView getUrlHost(const StringView & url)
{
StringView scheme = getUrlScheme(url);
const char* p = url.data() + scheme.size();
const char* end = url.data() + url.size();
// Colon must follows after scheme.
if (p == end || *p != ':')
return StringView();
// Authority component must starts with "//".
if (end - p < 2 || (p[1] != '/' || p[2] != '/'))
return StringView();
else
p += 3;
const char* st = p;
for (; p < end; ++p)
{
if (*p == '@')
{
st = p + 1;
}
else if (*p == ':' || *p == '/' || *p == '?' || *p == '#')
{
break;
}
}
return (p == st) ? StringView() : StringView(st, p - st);
}
struct ExtractProtocol
{
static size_t getReserveLengthForElement() { return makeStringView("https").size() + 1; }
static size_t getReserveLengthForElement();
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
{
res_data = data;
res_size = 0;
StringView scheme = getUrlScheme(StringView(data, size));
Pos pos = data + scheme.size();
if (scheme.empty() || (data + size) - pos < 4)
return;
if (pos[0] == ':')
res_size = pos - data;
}
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size);
};
template <bool without_www>
@ -971,44 +1018,17 @@ struct CutSubstringImpl
};
/// Percent decode of url data.
struct DecodeURLComponentImpl
{
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
{
res_data.reserve(data.size());
size_t size = offsets.size();
res_offsets.resize(size);
size_t prev_offset = 0;
size_t res_offset = 0;
for (size_t i = 0; i < size; ++i)
{
const char * current = reinterpret_cast<const char *>(&data[prev_offset]);
std::string url(decodeUrl(StringView(current, offsets[i] - prev_offset - 1)));
res_data.resize(res_data.size() + url.size() + 1);
memcpy(&res_data[res_offset], url.data(), url.size());
res_offset += url.size() + 1;
res_data[res_offset - 1] = 0;
res_offsets[i] = res_offset;
prev_offset = offsets[i];
}
}
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets);
static void constant(const std::string & data,
std::string & res_data)
{
res_data = decodeUrl(data);
}
std::string & res_data);
static void vector_fixed(const ColumnString::Chars_t & data, size_t n,
ColumnString::Chars_t & res_data)
{
throw Exception("Column of type FixedString is not supported by URL functions", ErrorCodes::ILLEGAL_COLUMN);
}
ColumnString::Chars_t & res_data);
};

View File

@ -1,94 +0,0 @@
#include <DB/Common/hex.h>
#include <DB/Common/StringUtils.h>
#include <DB/Common/URLUtils.h>
std::string decodeUrl(const StringView & url)
{
const char* p = url.data();
const char* st = url.data();
const char* end = url.data() + url.size();
std::string result;
for (; p < end; ++p)
{
if (*p != '%' || end - p < 3)
continue;
unsigned char h = char_to_digit_table[static_cast<unsigned char>(p[1])];
unsigned char l = char_to_digit_table[static_cast<unsigned char>(p[2])];
if (h != 0xFF && l != 0xFF)
{
unsigned char digit = (h << 4) + l;
if (digit < 127) {
result.append(st, p - st + 1);
result.back() = digit;
st = p + 3;
}
}
p += 2;
}
if (st == url.data())
return std::string(url.data(), url.size());
if (st < p)
result.append(st, p - st);
return result;
}
StringView getUrlScheme(const StringView & url)
{
// scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
const char* p = url.data();
const char* end = url.data() + url.size();
if (isAlphaASCII(*p))
{
for (++p; p < end; ++p)
{
if (!(isAlphaNumericASCII(*p) || *p == '+' || *p == '-' || *p == '.'))
{
break;
}
}
return StringView(url.data(), p - url.data());
}
return StringView();
}
StringView getUrlHost(const StringView & url)
{
StringView scheme = getUrlScheme(url);
const char* p = url.data() + scheme.size();
const char* end = url.data() + url.size();
// Colon must follows after scheme.
if (p == end || *p != ':')
return StringView();
// Authority component must starts with "//".
if (end - p < 2 || (p[1] != '/' || p[2] != '/'))
return StringView();
else
p += 3;
const char* st = p;
for (; p < end; ++p)
{
if (*p == '@')
{
st = p + 1;
}
else if (*p == ':' || *p == '/' || *p == '?' || *p == '#')
{
break;
}
}
return (p == st) ? StringView() : StringView(st, p - st);
}

View File

@ -1,9 +1,119 @@
#include <DB/Common/hex.h>
#include <DB/Functions/FunctionFactory.h>
#include <DB/Functions/FunctionsURL.h>
namespace DB
{
template <typename T>
static void decodeUrl(const StringView & url, T & dest, size_t & offset)
{
const char* p = url.data();
const char* st = url.data();
const char* end = url.data() + url.size();
for (; p < end; ++p)
{
if (*p != '%' || end - p < 3)
continue;
unsigned char h = char_to_digit_table[static_cast<unsigned char>(p[1])];
unsigned char l = char_to_digit_table[static_cast<unsigned char>(p[2])];
if (h != 0xFF && l != 0xFF)
{
unsigned char digit = (h << 4) + l;
if (digit < 127) {
dest.resize(dest.size() + p - st + 1);
memcpy(&dest[offset], st, p - st);
offset += p - st;
dest[offset] = digit;
offset++;
st = p + 3;
}
}
p += 2;
}
if (st == url.data())
{
dest.resize(dest.size() + url.size() + 1);
memcpy(&dest[offset], url.data(), url.size());
offset += url.size() + 1;
dest[offset - 1] = 0;
}
else if (st < p)
{
dest.resize(dest.size() + p - st);
memcpy(&dest[offset], st, p - st);
offset += p - st;
}
}
size_t ExtractProtocol::getReserveLengthForElement()
{
return makeStringView("https").size() + 1;
}
void ExtractProtocol::execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
{
res_data = data;
res_size = 0;
StringView scheme = getUrlScheme(StringView(data, size));
Pos pos = data + scheme.size();
if (scheme.empty() || (data + size) - pos < 4)
return;
if (pos[0] == ':')
res_size = pos - data;
}
void DecodeURLComponentImpl::vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
{
res_data.reserve(data.size());
size_t size = offsets.size();
res_offsets.resize(size);
size_t prev_offset = 0;
size_t res_offset = 0;
for (size_t i = 0; i < size; ++i)
{
const char * current = reinterpret_cast<const char *>(&data[prev_offset]);
const StringView url(current, offsets[i] - prev_offset - 1);
decodeUrl(url, res_data, res_offset);
res_offsets[i] = res_offset;
prev_offset = offsets[i];
}
}
void DecodeURLComponentImpl::constant(const std::string & data,
std::string & res_data)
{
size_t offset = 0;
decodeUrl(data, res_data, offset);
}
void DecodeURLComponentImpl::vector_fixed(const ColumnString::Chars_t & data, size_t n,
ColumnString::Chars_t & res_data)
{
throw Exception("Column of type FixedString is not supported by URL functions", ErrorCodes::ILLEGAL_COLUMN);
}
void registerFunctionsURL(FunctionFactory & factory)
{
factory.registerFunction<FunctionProtocol>();