mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-16 03:12:43 +00:00
tmp
This commit is contained in:
parent
296a19129b
commit
b848189726
@ -8,6 +8,11 @@
|
||||
#include <Functions/keyvaluepair/src/KeyValuePairExtractorBuilder.h>
|
||||
#include <Common/assert_cast.h>
|
||||
|
||||
#include <chrono>
|
||||
|
||||
/* Only needed for the sake of this example. */
|
||||
#include <iostream>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
@ -41,16 +46,46 @@ String ExtractKeyValuePairs::getName() const
|
||||
|
||||
ColumnPtr ExtractKeyValuePairs::executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const
|
||||
{
|
||||
using std::chrono::high_resolution_clock;
|
||||
using std::chrono::duration_cast;
|
||||
using std::chrono::duration;
|
||||
using std::chrono::microseconds;
|
||||
|
||||
// auto t1 = high_resolution_clock::now();
|
||||
|
||||
auto [data_column, escape_character, key_value_pair_delimiter, item_delimiter, enclosing_character, value_special_characters_allow_list]
|
||||
= parseArguments(arguments);
|
||||
|
||||
auto extractor = getExtractor(
|
||||
escape_character, key_value_pair_delimiter, item_delimiter, enclosing_character, value_special_characters_allow_list);
|
||||
|
||||
// auto t2 = high_resolution_clock::now();
|
||||
auto raw_columns = extract(extractor, data_column);
|
||||
|
||||
// improve escape character..
|
||||
return escape(raw_columns, escape_character ? escape_character.value() : '\\');
|
||||
ColumnPtr keys_ptr = std::move(raw_columns.keys);
|
||||
// auto t3 = high_resolution_clock::now();
|
||||
|
||||
|
||||
auto map = ColumnMap::create(keys_ptr, std::move(raw_columns.values), std::move(raw_columns.offsets));
|
||||
// auto t4 = high_resolution_clock::now();
|
||||
|
||||
// std::cout<<"Time taken for building extractor is: "<<duration_cast<microseconds>(t2 - t1).count()<<"u\n";
|
||||
// std::cout<<"Time taken for building extracting & creating output is: "<<duration_cast<microseconds>(t3 - t2).count()<<"u\n";
|
||||
// std::cout<<"Time taken for whole process is: "<<duration_cast<microseconds>(t4 - t1).count()<<"u\n";
|
||||
return map;
|
||||
|
||||
// return raw_columns;
|
||||
|
||||
//
|
||||
// // improve escape character..
|
||||
// auto escaped_out = escape(raw_columns, escape_character ? escape_character.value() : '\\');
|
||||
// auto t3 = high_resolution_clock::now();
|
||||
//
|
||||
//
|
||||
// std::cout<<"Time taken for extraction is: "<<duration_cast<microseconds>(t2 - t1).count()<<"u\n";
|
||||
// std::cout<<"Time taken for escaping is: "<<duration_cast<microseconds>(t3 - t2).count()<<"u\n";
|
||||
//
|
||||
// return escaped_out;
|
||||
}
|
||||
|
||||
bool ExtractKeyValuePairs::isVariadic() const
|
||||
@ -183,19 +218,37 @@ std::shared_ptr<KeyValuePairExtractor<ExtractKeyValuePairs::EscapingProcessorOut
|
||||
|
||||
ExtractKeyValuePairs::RawColumns ExtractKeyValuePairs::extract(std::shared_ptr<KeyValuePairExtractor<EscapingProcessorOutput>> extractor, ColumnPtr data_column)
|
||||
{
|
||||
using std::chrono::high_resolution_clock;
|
||||
using std::chrono::duration_cast;
|
||||
using std::chrono::duration;
|
||||
using std::chrono::microseconds;
|
||||
|
||||
auto offsets = ColumnUInt64::create();
|
||||
|
||||
auto keys = ColumnString::create();
|
||||
auto values = ColumnString::create();
|
||||
|
||||
// keys->reserve(data_column->byteSize());
|
||||
// values->reserve(data_column->byteSize());
|
||||
|
||||
auto row_offset = 0u;
|
||||
|
||||
// long totalExtractionTime = 0;
|
||||
// long shortestExtractionTime = std::numeric_limits<long>::max();
|
||||
// long longestExtractionTime = std::numeric_limits<long>::min();
|
||||
//
|
||||
// long totalInsertionTime = 0;
|
||||
// long shortestInsertionTime = std::numeric_limits<long>::max();
|
||||
// long longestInsertionTime = std::numeric_limits<long>::min();
|
||||
|
||||
|
||||
for (auto i = 0u; i < data_column->size(); i++)
|
||||
{
|
||||
auto row = data_column->getDataAt(i).toString();
|
||||
auto row = data_column->getDataAt(i).toView();
|
||||
// auto t2 = high_resolution_clock::now();
|
||||
|
||||
// TODO avoid copying
|
||||
auto response = extractor->extract(row);
|
||||
// auto t3 = high_resolution_clock::now();
|
||||
|
||||
for (auto [key, value] : response)
|
||||
{
|
||||
@ -206,8 +259,46 @@ ExtractKeyValuePairs::RawColumns ExtractKeyValuePairs::extract(std::shared_ptr<K
|
||||
}
|
||||
|
||||
offsets->insert(row_offset);
|
||||
|
||||
// auto t4 = high_resolution_clock::now();
|
||||
//
|
||||
// long extractionTime = duration_cast<microseconds>(t3 - t2).count();
|
||||
// long insertionTime = duration_cast<microseconds>(t4 - t3).count();
|
||||
//
|
||||
// totalExtractionTime += extractionTime;
|
||||
// totalInsertionTime += insertionTime;
|
||||
//
|
||||
// if (extractionTime > longestExtractionTime)
|
||||
// {
|
||||
// longestExtractionTime = extractionTime;
|
||||
// }
|
||||
//
|
||||
// if (extractionTime < shortestExtractionTime)
|
||||
// {
|
||||
// shortestExtractionTime = extractionTime;
|
||||
// }
|
||||
//
|
||||
// if (insertionTime > longestInsertionTime)
|
||||
// {
|
||||
// longestInsertionTime = insertionTime;
|
||||
// }
|
||||
//
|
||||
// if (insertionTime < shortestInsertionTime)
|
||||
// {
|
||||
// shortestInsertionTime = insertionTime;
|
||||
// }
|
||||
}
|
||||
|
||||
// if (!data_column->empty())
|
||||
// {
|
||||
// auto averageInsertionTime = totalInsertionTime / data_column->size();
|
||||
// auto averageExtractionTime = totalExtractionTime / data_column->size();
|
||||
//
|
||||
// std::cout<<"Longest extraction time: "<<longestExtractionTime<<" - Shortest extraction time: "<<shortestExtractionTime<<" - Average extraction time: "<<averageExtractionTime<<" - Total: "<<totalExtractionTime<<"\n";
|
||||
// std::cout<<"Longest insertion time: "<<longestInsertionTime<<" - Shortest extraction time: "<<shortestInsertionTime<<" - Average insertion time: "<<averageInsertionTime<<" - Total: "<<totalInsertionTime<<"\n";
|
||||
//
|
||||
// }
|
||||
|
||||
return {std::move(keys), std::move(values), std::move(offsets)};
|
||||
}
|
||||
|
||||
@ -218,6 +309,9 @@ ColumnPtr ExtractKeyValuePairs::escape(RawColumns & raw_columns, char escape_cha
|
||||
auto escaped_keys = ColumnString::create();
|
||||
auto escaped_values = ColumnString::create();
|
||||
|
||||
escaped_keys->reserve(raw_keys->size());
|
||||
escaped_values->reserve(raw_values->size());
|
||||
|
||||
auto escape_character_string = std::string(1, escape_character);
|
||||
|
||||
using ReplaceString = ReplaceStringImpl<ReplaceStringTraits::Replace::All>;
|
||||
|
@ -39,6 +39,8 @@ struct KeyValuePairExtractor
|
||||
virtual ~KeyValuePairExtractor() = default;
|
||||
|
||||
virtual Response extract(const std::string & data) = 0;
|
||||
|
||||
virtual Response extract(const std::string_view & data) = 0;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -33,6 +33,12 @@ public:
|
||||
}
|
||||
|
||||
[[nodiscard]] Response extract(const std::string & file) override
|
||||
{
|
||||
auto view = std::string_view {file};
|
||||
return extract(view);
|
||||
}
|
||||
|
||||
[[nodiscard]] Response extract(const std::string_view & file) override
|
||||
{
|
||||
std::unordered_map<std::string_view, std::string_view> response_views;
|
||||
|
||||
@ -55,7 +61,7 @@ public:
|
||||
}
|
||||
|
||||
private:
|
||||
NextState processState(const std::string & file, std::size_t pos, State state,
|
||||
NextState processState(std::string_view file, std::size_t pos, State state,
|
||||
std::string_view & key, std::string_view & value,
|
||||
std::unordered_map<std::string_view, std::string_view> & response_views)
|
||||
{
|
||||
@ -84,7 +90,7 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
NextState flushPair(const std::string & file, std::size_t pos, std::string_view key,
|
||||
NextState flushPair(const std::string_view & file, std::size_t pos, std::string_view key,
|
||||
std::string_view value, std::unordered_map<std::string_view, std::string_view> & response_views)
|
||||
{
|
||||
response_views[key] = value;
|
||||
|
@ -8,7 +8,7 @@ KeyStateHandler::KeyStateHandler(char key_value_delimiter_, char escape_characte
|
||||
{
|
||||
}
|
||||
|
||||
NextState KeyStateHandler::wait(const std::string & file, size_t pos) const
|
||||
NextState KeyStateHandler::wait(std::string_view file, size_t pos)
|
||||
{
|
||||
while (pos < file.size())
|
||||
{
|
||||
@ -17,7 +17,7 @@ NextState KeyStateHandler::wait(const std::string & file, size_t pos) const
|
||||
{
|
||||
return {pos, State::READING_KEY};
|
||||
}
|
||||
else if (enclosing_character && current_character == enclosing_character)
|
||||
else if (current_character == '"')
|
||||
{
|
||||
return {pos + 1u, State::READING_ENCLOSED_KEY};
|
||||
}
|
||||
@ -30,7 +30,7 @@ NextState KeyStateHandler::wait(const std::string & file, size_t pos) const
|
||||
return {pos, State::END};
|
||||
}
|
||||
|
||||
NextState KeyStateHandler::read(const std::string & file, size_t pos, std::string_view & key)
|
||||
NextState KeyStateHandler::read(std::string_view file, size_t pos, std::string_view & key)
|
||||
{
|
||||
bool escape = false;
|
||||
|
||||
@ -45,11 +45,11 @@ NextState KeyStateHandler::read(const std::string & file, size_t pos, std::strin
|
||||
{
|
||||
escape = false;
|
||||
}
|
||||
else if (escape_character == current_character)
|
||||
else if ('\\' == current_character)
|
||||
{
|
||||
escape = true;
|
||||
}
|
||||
else if (current_character == key_value_delimiter)
|
||||
else if (current_character == '=')
|
||||
{
|
||||
// not checking for empty key because with current waitKey implementation
|
||||
// there is no way this piece of code will be reached for the very first key character
|
||||
@ -65,7 +65,7 @@ NextState KeyStateHandler::read(const std::string & file, size_t pos, std::strin
|
||||
return {pos, State::END};
|
||||
}
|
||||
|
||||
NextState KeyStateHandler::readEnclosed(const std::string & file, size_t pos, std::string_view & key)
|
||||
NextState KeyStateHandler::readEnclosed(std::string_view file, size_t pos, std::string_view & key)
|
||||
{
|
||||
auto start_index = pos;
|
||||
key = {};
|
||||
@ -74,7 +74,7 @@ NextState KeyStateHandler::readEnclosed(const std::string & file, size_t pos, st
|
||||
{
|
||||
const auto current_character = file[pos++];
|
||||
|
||||
if (enclosing_character == current_character)
|
||||
if ('"' == current_character)
|
||||
{
|
||||
auto is_key_empty = start_index == pos;
|
||||
|
||||
@ -91,7 +91,7 @@ NextState KeyStateHandler::readEnclosed(const std::string & file, size_t pos, st
|
||||
return {pos, State::END};
|
||||
}
|
||||
|
||||
NextState KeyStateHandler::readKeyValueDelimiter(const std::string & file, size_t pos) const
|
||||
NextState KeyStateHandler::readKeyValueDelimiter(std::string_view file, size_t pos)
|
||||
{
|
||||
if (pos == file.size())
|
||||
{
|
||||
@ -100,7 +100,7 @@ NextState KeyStateHandler::readKeyValueDelimiter(const std::string & file, size_
|
||||
else
|
||||
{
|
||||
const auto current_character = file[pos++];
|
||||
return {pos, current_character == key_value_delimiter ? State::WAITING_VALUE : State::WAITING_KEY};
|
||||
return {pos, current_character == '=' ? State::WAITING_VALUE : State::WAITING_KEY};
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -14,13 +14,13 @@ class KeyStateHandler : StateHandler
|
||||
public:
|
||||
KeyStateHandler(char key_value_delimiter, char escape_character, std::optional<char> enclosing_character);
|
||||
|
||||
[[nodiscard]] NextState wait(const std::string & file, size_t pos) const;
|
||||
[[nodiscard]] NextState read(const std::string & file, size_t pos, std::string_view & key);
|
||||
[[nodiscard]] NextState readEnclosed(const std::string & file, size_t pos, std::string_view & key);
|
||||
[[nodiscard]] NextState readKeyValueDelimiter(const std::string & file, size_t pos) const;
|
||||
[[nodiscard]] static NextState wait(std::string_view file, size_t pos) ;
|
||||
[[nodiscard]] static NextState read(std::string_view file, size_t pos, std::string_view & key);
|
||||
[[nodiscard]] static NextState readEnclosed(std::string_view file, size_t pos, std::string_view & key);
|
||||
[[nodiscard]] static NextState readKeyValueDelimiter(std::string_view file, size_t pos);
|
||||
|
||||
private:
|
||||
const char key_value_delimiter;
|
||||
[[maybe_unused]] const char key_value_delimiter;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -9,7 +9,7 @@ StateHandler::StateHandler(char escape_character_, std::optional<char> enclosing
|
||||
{
|
||||
}
|
||||
|
||||
std::string_view StateHandler::createElement(const std::string & file, std::size_t begin, std::size_t end)
|
||||
std::string_view StateHandler::createElement(std::string_view file, std::size_t begin, std::size_t end)
|
||||
{
|
||||
return std::string_view{file.begin() + begin, file.begin() + end};
|
||||
}
|
||||
|
@ -17,7 +17,7 @@ struct StateHandler
|
||||
const std::optional<char> enclosing_character;
|
||||
|
||||
protected:
|
||||
[[nodiscard]] static std::string_view createElement(const std::string & file, std::size_t begin, std::size_t end);
|
||||
[[nodiscard]] static std::string_view createElement(std::string_view file, std::size_t begin, std::size_t end);
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -14,7 +14,7 @@ ValueStateHandler::ValueStateHandler(
|
||||
{
|
||||
}
|
||||
|
||||
NextState ValueStateHandler::wait(const std::string & file, size_t pos) const
|
||||
NextState ValueStateHandler::wait(std::string_view file, size_t pos) const
|
||||
{
|
||||
while (pos < file.size())
|
||||
{
|
||||
@ -41,7 +41,7 @@ NextState ValueStateHandler::wait(const std::string & file, size_t pos) const
|
||||
return {pos, State::READING_EMPTY_VALUE};
|
||||
}
|
||||
|
||||
NextState ValueStateHandler::read(const std::string & file, size_t pos, std::string_view & value)
|
||||
NextState ValueStateHandler::read(const std::string_view file, size_t pos, std::string_view & value)
|
||||
{
|
||||
bool escape = false;
|
||||
|
||||
@ -73,7 +73,7 @@ NextState ValueStateHandler::read(const std::string & file, size_t pos, std::str
|
||||
return {pos, State::FLUSH_PAIR};
|
||||
}
|
||||
|
||||
NextState ValueStateHandler::readEnclosed(const std::string & file, size_t pos, std::string_view & value)
|
||||
NextState ValueStateHandler::readEnclosed(std::string_view file, size_t pos, std::string_view & value)
|
||||
{
|
||||
auto start_index = pos;
|
||||
|
||||
@ -92,7 +92,7 @@ NextState ValueStateHandler::readEnclosed(const std::string & file, size_t pos,
|
||||
return {pos, State::END};
|
||||
}
|
||||
|
||||
NextState ValueStateHandler::readEmpty(const std::string &, size_t pos, std::string_view & value)
|
||||
NextState ValueStateHandler::readEmpty(std::string_view, size_t pos, std::string_view & value)
|
||||
{
|
||||
value = {};
|
||||
return {pos + 1, State::FLUSH_PAIR};
|
||||
@ -100,7 +100,7 @@ NextState ValueStateHandler::readEmpty(const std::string &, size_t pos, std::str
|
||||
|
||||
bool ValueStateHandler::isValidCharacter(char character) const
|
||||
{
|
||||
return special_character_allowlist.contains(character) || std::isalnum(character) || character == '_';
|
||||
return /*special_character_allowlist.contains(character) ||*/ std::isalnum(character) || character == '_';
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -18,14 +18,14 @@ public:
|
||||
std::optional<char> enclosing_character,
|
||||
std::unordered_set<char> special_character_allowlist_);
|
||||
|
||||
[[nodiscard]] NextState wait(const std::string & file, size_t pos) const;
|
||||
[[nodiscard]] NextState read(const std::string & file, size_t pos, std::string_view & value);
|
||||
[[nodiscard]] NextState readEnclosed(const std::string & file, size_t pos, std::string_view & value);
|
||||
[[nodiscard]] static NextState readEmpty(const std::string & file, size_t pos, std::string_view & value);
|
||||
[[nodiscard]] NextState wait(std::string_view file, size_t pos) const;
|
||||
[[nodiscard]] NextState read(std::string_view file, size_t pos, std::string_view & value);
|
||||
[[nodiscard]] NextState readEnclosed(std::string_view file, size_t pos, std::string_view & value);
|
||||
[[nodiscard]] static NextState readEmpty(std::string_view file, size_t pos, std::string_view & value);
|
||||
|
||||
private:
|
||||
const char item_delimiter;
|
||||
std::unordered_set<char> special_character_allowlist;
|
||||
[[maybe_unused]] std::unordered_set<char> special_character_allowlist;
|
||||
|
||||
bool isValidCharacter(char character) const;
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user