2012-07-16 03:42:36 +00:00
|
|
|
#pragma once
|
|
|
|
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <Columns/ColumnString.h>
|
2018-11-25 23:24:26 +00:00
|
|
|
#include <Common/memcpySmall.h>
|
2012-07-16 03:42:36 +00:00
|
|
|
|
2017-03-12 11:09:25 +00:00
|
|
|
|
2012-07-16 03:42:36 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2023-07-17 09:54:34 +00:00
|
|
|
/** These helpers are used by URL processing functions. See implementation in separate .cpp files.
|
|
|
|
* None of these functions strictly follows the RFC; they are deliberately simplified as much as possible for performance reasons.
|
2012-07-16 03:42:36 +00:00
|
|
|
*
|
2016-10-20 05:21:49 +00:00
|
|
|
* Functions for extracting parts of a URL.
|
|
|
|
* If the URL does not contain the requested part, an empty string is returned.
|
2014-06-26 00:58:14 +00:00
|
|
|
*
|
2012-07-16 03:42:36 +00:00
|
|
|
* domain
|
|
|
|
* domainWithoutWWW
|
|
|
|
* topLevelDomain
|
|
|
|
* protocol
|
|
|
|
* path
|
|
|
|
* queryString
|
|
|
|
* fragment
|
|
|
|
* queryStringAndFragment
|
2020-06-02 09:10:10 +00:00
|
|
|
* netloc
|
2012-07-16 03:42:36 +00:00
|
|
|
*
|
2016-10-20 05:21:49 +00:00
|
|
|
* Functions that remove parts from a URL.
|
2019-01-22 19:56:53 +00:00
|
|
|
* If the URL does not contain the part to be removed, it is returned unchanged.
|
2014-06-26 00:58:14 +00:00
|
|
|
*
|
2012-07-16 03:42:36 +00:00
|
|
|
* cutWWW
|
|
|
|
* cutFragment
|
|
|
|
* cutQueryString
|
|
|
|
* cutQueryStringAndFragment
|
2012-07-21 03:45:48 +00:00
|
|
|
*
|
2016-10-20 05:21:49 +00:00
|
|
|
* Extract value of parameter in query string or in fragment identifier. Return empty string, if URL has no such parameter.
|
|
|
|
* If there are many parameters with same name - return value of first one. Value is not %-decoded.
|
2012-07-21 03:45:48 +00:00
|
|
|
*
|
2013-03-18 10:27:45 +00:00
|
|
|
* extractURLParameter(URL, name)
|
2014-06-26 00:58:14 +00:00
|
|
|
*
|
2016-10-20 05:21:49 +00:00
|
|
|
* Extract all parameters from URL in form of array of strings name=value.
|
2013-03-18 10:27:45 +00:00
|
|
|
* extractURLParameters(URL)
|
2013-08-02 13:55:43 +00:00
|
|
|
*
|
2016-10-20 05:21:49 +00:00
|
|
|
* Extract names of all parameters from URL in form of array of strings.
|
2013-08-05 08:40:56 +00:00
|
|
|
* extractURLParameterNames(URL)
|
2014-06-26 00:58:14 +00:00
|
|
|
*
|
2016-10-20 05:21:49 +00:00
|
|
|
* Remove specified parameter from URL.
|
2013-03-18 10:27:45 +00:00
|
|
|
* cutURLParameter(URL, name)
|
2014-06-26 00:58:14 +00:00
|
|
|
*
|
2022-04-15 22:20:47 +00:00
|
|
|
* Get array of URL 'hierarchy' as in web-analytics tree-like reports. See the docs.
|
2013-03-18 10:27:45 +00:00
|
|
|
* URLHierarchy(URL)
|
2012-07-16 03:42:36 +00:00
|
|
|
*/
|
|
|
|
|
2018-11-25 23:24:26 +00:00
|
|
|
/// Error codes referenced in this header. Only declared here (extern);
/// ILLEGAL_COLUMN is thrown by the vectorFixed() stubs below.
namespace ErrorCodes
|
2012-07-16 03:42:36 +00:00
|
|
|
{
|
2018-11-25 23:24:26 +00:00
|
|
|
extern const int ILLEGAL_COLUMN;
|
2016-12-15 12:05:05 +00:00
|
|
|
}
|
2012-07-16 03:42:36 +00:00
|
|
|
|
2018-11-25 23:24:26 +00:00
|
|
|
/// Read-only position inside a character buffer; the Extractors report the start of the matched part through it.
using Pos = const char *;
|
2014-02-11 19:18:38 +00:00
|
|
|
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/** Select part of string using the Extractor.
|
2012-07-16 03:42:36 +00:00
|
|
|
*/
|
|
|
|
template <typename Extractor>
|
|
|
|
struct ExtractSubstringImpl
|
|
|
|
{
|
2018-11-25 00:08:50 +00:00
|
|
|
static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets,
|
|
|
|
ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets)
|
2012-07-16 03:42:36 +00:00
|
|
|
{
|
|
|
|
size_t size = offsets.size();
|
|
|
|
res_offsets.resize(size);
|
2013-04-17 12:52:55 +00:00
|
|
|
res_data.reserve(size * Extractor::getReserveLengthForElement());
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-07-16 03:42:36 +00:00
|
|
|
size_t prev_offset = 0;
|
|
|
|
size_t res_offset = 0;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Matched part.
|
2012-07-16 03:42:36 +00:00
|
|
|
Pos start;
|
|
|
|
size_t length;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-07-16 03:42:36 +00:00
|
|
|
for (size_t i = 0; i < size; ++i)
|
|
|
|
{
|
|
|
|
Extractor::execute(reinterpret_cast<const char *>(&data[prev_offset]), offsets[i] - prev_offset - 1, start, length);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-07-16 03:42:36 +00:00
|
|
|
res_data.resize(res_data.size() + length + 1);
|
2016-04-15 00:33:21 +00:00
|
|
|
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], start, length);
|
2012-07-16 03:42:36 +00:00
|
|
|
res_offset += length + 1;
|
|
|
|
res_data[res_offset - 1] = 0;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-07-16 03:42:36 +00:00
|
|
|
res_offsets[i] = res_offset;
|
|
|
|
prev_offset = offsets[i];
|
|
|
|
}
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-07-16 03:42:36 +00:00
|
|
|
static void constant(const std::string & data,
|
|
|
|
std::string & res_data)
|
|
|
|
{
|
|
|
|
Pos start;
|
|
|
|
size_t length;
|
|
|
|
Extractor::execute(data.data(), data.size(), start, length);
|
|
|
|
res_data.assign(start, length);
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2020-03-23 02:12:31 +00:00
|
|
|
static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
|
2012-07-16 03:42:36 +00:00
|
|
|
{
|
2023-07-17 09:54:34 +00:00
|
|
|
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Column of type FixedString is not supported by this function");
|
2012-07-16 03:42:36 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/** Delete part of string using the Extractor.
|
2012-07-16 03:42:36 +00:00
|
|
|
*/
|
|
|
|
template <typename Extractor>
|
|
|
|
struct CutSubstringImpl
|
|
|
|
{
|
2018-11-25 00:08:50 +00:00
|
|
|
static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets,
|
|
|
|
ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets)
|
2012-07-16 03:42:36 +00:00
|
|
|
{
|
|
|
|
res_data.reserve(data.size());
|
|
|
|
size_t size = offsets.size();
|
|
|
|
res_offsets.resize(size);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-07-16 03:42:36 +00:00
|
|
|
size_t prev_offset = 0;
|
|
|
|
size_t res_offset = 0;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Matched part.
|
2012-07-16 03:42:36 +00:00
|
|
|
Pos start;
|
|
|
|
size_t length;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-07-16 03:42:36 +00:00
|
|
|
for (size_t i = 0; i < size; ++i)
|
|
|
|
{
|
|
|
|
const char * current = reinterpret_cast<const char *>(&data[prev_offset]);
|
|
|
|
Extractor::execute(current, offsets[i] - prev_offset - 1, start, length);
|
2018-09-02 03:00:04 +00:00
|
|
|
size_t start_index = start - reinterpret_cast<const char *>(data.data());
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-07-16 03:42:36 +00:00
|
|
|
res_data.resize(res_data.size() + offsets[i] - prev_offset - length);
|
2016-04-15 00:33:21 +00:00
|
|
|
memcpySmallAllowReadWriteOverflow15(
|
|
|
|
&res_data[res_offset], current, start - current);
|
|
|
|
memcpySmallAllowReadWriteOverflow15(
|
|
|
|
&res_data[res_offset + start - current], start + length, offsets[i] - start_index - length);
|
2012-07-16 03:42:36 +00:00
|
|
|
res_offset += offsets[i] - prev_offset - length;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-07-16 03:42:36 +00:00
|
|
|
res_offsets[i] = res_offset;
|
|
|
|
prev_offset = offsets[i];
|
|
|
|
}
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-07-16 03:42:36 +00:00
|
|
|
static void constant(const std::string & data,
|
|
|
|
std::string & res_data)
|
|
|
|
{
|
|
|
|
Pos start;
|
|
|
|
size_t length;
|
|
|
|
Extractor::execute(data.data(), data.size(), start, length);
|
2014-12-05 11:44:37 +00:00
|
|
|
res_data.reserve(data.size() - length);
|
|
|
|
res_data.append(data.data(), start);
|
|
|
|
res_data.append(start + length, data.data() + data.size());
|
2012-07-16 03:42:36 +00:00
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2020-03-23 02:12:31 +00:00
|
|
|
static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
|
2012-07-16 03:42:36 +00:00
|
|
|
{
|
2023-07-17 11:58:27 +00:00
|
|
|
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Column of type FixedString is not supported by this function");
|
2012-07-16 03:42:36 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
}
|