mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-16 19:32:07 +00:00
164 lines
4.9 KiB
C++
164 lines
4.9 KiB
C++
#pragma once
|
|
|
|
#include <Columns/ColumnString.h>
|
|
#include <Common/memcpySmall.h>
|
|
|
|
|
|
namespace DB
|
|
{
|
|
|
|
/** URL processing functions. See implementation in separate .cpp files.
|
|
* All functions are not strictly follow RFC, instead they are maximally simplified for performance reasons.
|
|
*
|
|
* Functions for extraction parts of URL.
|
|
* If URL has nothing like, then empty string is returned.
|
|
*
|
|
* domain
|
|
* domainWithoutWWW
|
|
* topLevelDomain
|
|
* protocol
|
|
* path
|
|
* queryString
|
|
* fragment
|
|
* queryStringAndFragment
|
|
* netloc
|
|
*
|
|
* Functions, removing parts from URL.
|
|
* If URL has nothing like, then it is returned unchanged.
|
|
*
|
|
* cutWWW
|
|
* cutFragment
|
|
* cutQueryString
|
|
* cutQueryStringAndFragment
|
|
*
|
|
* Extract value of parameter in query string or in fragment identifier. Return empty string, if URL has no such parameter.
|
|
* If there are many parameters with same name - return value of first one. Value is not %-decoded.
|
|
*
|
|
* extractURLParameter(URL, name)
|
|
*
|
|
* Extract all parameters from URL in form of array of strings name=value.
|
|
* extractURLParameters(URL)
|
|
*
|
|
* Extract names of all parameters from URL in form of array of strings.
|
|
* extractURLParameterNames(URL)
|
|
*
|
|
* Remove specified parameter from URL.
|
|
* cutURLParameter(URL, name)
|
|
*
|
|
* Get array of URL 'hierarchy' as in web-analytics tree-like reports. See the docs.
|
|
* URLHierarchy(URL)
|
|
*/
|
|
|
|
namespace ErrorCodes
|
|
{
|
|
extern const int ILLEGAL_COLUMN;
|
|
}
|
|
|
|
using Pos = const char *;
|
|
|
|
|
|
/** Select part of string using the Extractor.
|
|
*/
|
|
template <typename Extractor>
|
|
struct ExtractSubstringImpl
|
|
{
|
|
static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets,
|
|
ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets)
|
|
{
|
|
size_t size = offsets.size();
|
|
res_offsets.resize(size);
|
|
res_data.reserve(size * Extractor::getReserveLengthForElement());
|
|
|
|
size_t prev_offset = 0;
|
|
size_t res_offset = 0;
|
|
|
|
/// Matched part.
|
|
Pos start;
|
|
size_t length;
|
|
|
|
for (size_t i = 0; i < size; ++i)
|
|
{
|
|
Extractor::execute(reinterpret_cast<const char *>(&data[prev_offset]), offsets[i] - prev_offset - 1, start, length);
|
|
|
|
res_data.resize(res_data.size() + length + 1);
|
|
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], start, length);
|
|
res_offset += length + 1;
|
|
res_data[res_offset - 1] = 0;
|
|
|
|
res_offsets[i] = res_offset;
|
|
prev_offset = offsets[i];
|
|
}
|
|
}
|
|
|
|
static void constant(const std::string & data,
|
|
std::string & res_data)
|
|
{
|
|
Pos start;
|
|
size_t length;
|
|
Extractor::execute(data.data(), data.size(), start, length);
|
|
res_data.assign(start, length);
|
|
}
|
|
|
|
static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
|
|
{
|
|
throw Exception("Column of type FixedString is not supported by URL functions", ErrorCodes::ILLEGAL_COLUMN);
|
|
}
|
|
};
|
|
|
|
|
|
/** Delete part of string using the Extractor.
|
|
*/
|
|
template <typename Extractor>
|
|
struct CutSubstringImpl
|
|
{
|
|
static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets,
|
|
ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets)
|
|
{
|
|
res_data.reserve(data.size());
|
|
size_t size = offsets.size();
|
|
res_offsets.resize(size);
|
|
|
|
size_t prev_offset = 0;
|
|
size_t res_offset = 0;
|
|
|
|
/// Matched part.
|
|
Pos start;
|
|
size_t length;
|
|
|
|
for (size_t i = 0; i < size; ++i)
|
|
{
|
|
const char * current = reinterpret_cast<const char *>(&data[prev_offset]);
|
|
Extractor::execute(current, offsets[i] - prev_offset - 1, start, length);
|
|
size_t start_index = start - reinterpret_cast<const char *>(data.data());
|
|
|
|
res_data.resize(res_data.size() + offsets[i] - prev_offset - length);
|
|
memcpySmallAllowReadWriteOverflow15(
|
|
&res_data[res_offset], current, start - current);
|
|
memcpySmallAllowReadWriteOverflow15(
|
|
&res_data[res_offset + start - current], start + length, offsets[i] - start_index - length);
|
|
res_offset += offsets[i] - prev_offset - length;
|
|
|
|
res_offsets[i] = res_offset;
|
|
prev_offset = offsets[i];
|
|
}
|
|
}
|
|
|
|
static void constant(const std::string & data,
|
|
std::string & res_data)
|
|
{
|
|
Pos start;
|
|
size_t length;
|
|
Extractor::execute(data.data(), data.size(), start, length);
|
|
res_data.reserve(data.size() - length);
|
|
res_data.append(data.data(), start);
|
|
res_data.append(start + length, data.data() + data.size());
|
|
}
|
|
|
|
static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
|
|
{
|
|
throw Exception("Column of type FixedString is not supported by URL functions", ErrorCodes::ILLEGAL_COLUMN);
|
|
}
|
|
};
|
|
|
|
}
|