From b8481897261ceb7f7e41db01f7b48f6ca258fdef Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 13 Jan 2023 15:24:14 -0300 Subject: [PATCH] tmp --- src/Functions/extractKeyValuePairs.cpp | 102 +++++++++++++++++- .../keyvaluepair/src/KeyValuePairExtractor.h | 2 + .../impl/LazyEscapingKeyValuePairExtractor.h | 10 +- .../src/impl/state/KeyStateHandler.cpp | 18 ++-- .../src/impl/state/KeyStateHandler.h | 10 +- .../src/impl/state/StateHandler.cpp | 2 +- .../src/impl/state/StateHandler.h | 2 +- .../src/impl/state/ValueStateHandler.cpp | 10 +- .../src/impl/state/ValueStateHandler.h | 10 +- 9 files changed, 134 insertions(+), 32 deletions(-) diff --git a/src/Functions/extractKeyValuePairs.cpp b/src/Functions/extractKeyValuePairs.cpp index 8e875bff1c5..6bc0086e503 100644 --- a/src/Functions/extractKeyValuePairs.cpp +++ b/src/Functions/extractKeyValuePairs.cpp @@ -8,6 +8,11 @@ #include #include +#include + +/* Only needed for the sake of this example. */ +#include + namespace DB { @@ -41,16 +46,46 @@ String ExtractKeyValuePairs::getName() const ColumnPtr ExtractKeyValuePairs::executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const { + using std::chrono::high_resolution_clock; + using std::chrono::duration_cast; + using std::chrono::duration; + using std::chrono::microseconds; + +// auto t1 = high_resolution_clock::now(); + auto [data_column, escape_character, key_value_pair_delimiter, item_delimiter, enclosing_character, value_special_characters_allow_list] = parseArguments(arguments); auto extractor = getExtractor( escape_character, key_value_pair_delimiter, item_delimiter, enclosing_character, value_special_characters_allow_list); +// auto t2 = high_resolution_clock::now(); auto raw_columns = extract(extractor, data_column); - // improve escape character.. - return escape(raw_columns, escape_character ? escape_character.value() : '\\'); + ColumnPtr keys_ptr = std::move(raw_columns.keys); +// auto t3 = high_resolution_clock::now(); + + + auto map = ColumnMap::create(keys_ptr, std::move(raw_columns.values), std::move(raw_columns.offsets)); +// auto t4 = high_resolution_clock::now(); + +// std::cout<<"Time taken for building extractor is: "<(t2 - t1).count()<<"u\n"; +// std::cout<<"Time taken for building extracting & creating output is: "<(t3 - t2).count()<<"u\n"; +// std::cout<<"Time taken for whole process is: "<(t4 - t1).count()<<"u\n"; + return map; + +// return raw_columns; + +// +// // improve escape character.. +// auto escaped_out = escape(raw_columns, escape_character ? escape_character.value() : '\\'); +// auto t3 = high_resolution_clock::now(); +// +// +// std::cout<<"Time taken for extraction is: "<(t2 - t1).count()<<"u\n"; +// std::cout<<"Time taken for escaping is: "<(t3 - t2).count()<<"u\n"; +// +// return escaped_out; } bool ExtractKeyValuePairs::isVariadic() const @@ -183,19 +218,37 @@ std::shared_ptr> extractor, ColumnPtr data_column) { + using std::chrono::high_resolution_clock; + using std::chrono::duration_cast; + using std::chrono::duration; + using std::chrono::microseconds; + auto offsets = ColumnUInt64::create(); auto keys = ColumnString::create(); auto values = ColumnString::create(); +// keys->reserve(data_column->byteSize()); +// values->reserve(data_column->byteSize()); + auto row_offset = 0u; +// long totalExtractionTime = 0; +// long shortestExtractionTime = std::numeric_limits::max(); +// long longestExtractionTime = std::numeric_limits::min(); +// +// long totalInsertionTime = 0; +// long shortestInsertionTime = std::numeric_limits::max(); +// long longestInsertionTime = std::numeric_limits::min(); + + for (auto i = 0u; i < data_column->size(); i++) { - auto row = data_column->getDataAt(i).toString(); + auto row = data_column->getDataAt(i).toView(); +// auto t2 = high_resolution_clock::now(); - // TODO avoid copying auto response = extractor->extract(row); +// auto t3 = high_resolution_clock::now(); for (auto [key, value] : response) { @@ -206,8 +259,46 @@ ExtractKeyValuePairs::RawColumns ExtractKeyValuePairs::extract(std::shared_ptrinsert(row_offset); + +// auto t4 = high_resolution_clock::now(); +// +// long extractionTime = duration_cast(t3 - t2).count(); +// long insertionTime = duration_cast(t4 - t3).count(); +// +// totalExtractionTime += extractionTime; +// totalInsertionTime += insertionTime; +// +// if (extractionTime > longestExtractionTime) +// { +// longestExtractionTime = extractionTime; +// } +// +// if (extractionTime < shortestExtractionTime) +// { +// shortestExtractionTime = extractionTime; +// } +// +// if (insertionTime > longestInsertionTime) +// { +// longestInsertionTime = insertionTime; +// } +// +// if (insertionTime < shortestInsertionTime) +// { +// shortestInsertionTime = insertionTime; +// } } +// if (!data_column->empty()) +// { +// auto averageInsertionTime = totalInsertionTime / data_column->size(); +// auto averageExtractionTime = totalExtractionTime / data_column->size(); +// +// std::cout<<"Longest extraction time: "<reserve(raw_keys->size()); + escaped_values->reserve(raw_values->size()); + auto escape_character_string = std::string(1, escape_character); using ReplaceString = ReplaceStringImpl; diff --git a/src/Functions/keyvaluepair/src/KeyValuePairExtractor.h b/src/Functions/keyvaluepair/src/KeyValuePairExtractor.h index 2ec27edc467..d811f5c6534 100644 --- a/src/Functions/keyvaluepair/src/KeyValuePairExtractor.h +++ b/src/Functions/keyvaluepair/src/KeyValuePairExtractor.h @@ -39,6 +39,8 @@ struct KeyValuePairExtractor virtual ~KeyValuePairExtractor() = default; virtual Response extract(const std::string & data) = 0; + + virtual Response extract(const std::string_view & data) = 0; }; } diff --git a/src/Functions/keyvaluepair/src/impl/LazyEscapingKeyValuePairExtractor.h b/src/Functions/keyvaluepair/src/impl/LazyEscapingKeyValuePairExtractor.h index 7536a824ab4..d65ded8c3b0 100644 --- a/src/Functions/keyvaluepair/src/impl/LazyEscapingKeyValuePairExtractor.h +++ b/src/Functions/keyvaluepair/src/impl/LazyEscapingKeyValuePairExtractor.h @@ -33,6 +33,12 @@ public: } [[nodiscard]] Response extract(const std::string & file) override + { + auto view = std::string_view {file}; + return extract(view); + } + + [[nodiscard]] Response extract(const std::string_view & file) override { std::unordered_map response_views; @@ -55,7 +61,7 @@ public: } private: - NextState processState(const std::string & file, std::size_t pos, State state, + NextState processState(std::string_view file, std::size_t pos, State state, std::string_view & key, std::string_view & value, std::unordered_map & response_views) { @@ -84,7 +90,7 @@ private: } } - NextState flushPair(const std::string & file, std::size_t pos, std::string_view key, + NextState flushPair(const std::string_view & file, std::size_t pos, std::string_view key, std::string_view value, std::unordered_map & response_views) { response_views[key] = value; diff --git a/src/Functions/keyvaluepair/src/impl/state/KeyStateHandler.cpp b/src/Functions/keyvaluepair/src/impl/state/KeyStateHandler.cpp index dbdfe099635..8b68f813293 100644 --- a/src/Functions/keyvaluepair/src/impl/state/KeyStateHandler.cpp +++ b/src/Functions/keyvaluepair/src/impl/state/KeyStateHandler.cpp @@ -8,7 +8,7 @@ KeyStateHandler::KeyStateHandler(char key_value_delimiter_, char escape_characte { } -NextState KeyStateHandler::wait(const std::string & file, size_t pos) const +NextState KeyStateHandler::wait(std::string_view file, size_t pos) { while (pos < file.size()) { @@ -17,7 +17,7 @@ NextState KeyStateHandler::wait(const std::string & file, size_t pos) const { return {pos, State::READING_KEY}; } - else if (enclosing_character && current_character == enclosing_character) + else if (current_character == '"') { return {pos + 1u, State::READING_ENCLOSED_KEY}; } @@ -30,7 +30,7 @@ NextState KeyStateHandler::wait(const std::string & file, size_t pos) const return {pos, State::END}; } -NextState KeyStateHandler::read(const std::string & file, size_t pos, std::string_view & key) +NextState KeyStateHandler::read(std::string_view file, size_t pos, std::string_view & key) { bool escape = false; @@ -45,11 +45,11 @@ NextState KeyStateHandler::read(const std::string & file, size_t pos, std::strin { escape = false; } - else if (escape_character == current_character) + else if ('\\' == current_character) { escape = true; } - else if (current_character == key_value_delimiter) + else if (current_character == '=') { // not checking for empty key because with current waitKey implementation // there is no way this piece of code will be reached for the very first key character @@ -65,7 +65,7 @@ NextState KeyStateHandler::read(const std::string & file, size_t pos, std::strin return {pos, State::END}; } -NextState KeyStateHandler::readEnclosed(const std::string & file, size_t pos, std::string_view & key) +NextState KeyStateHandler::readEnclosed(std::string_view file, size_t pos, std::string_view & key) { auto start_index = pos; key = {}; @@ -74,7 +74,7 @@ NextState KeyStateHandler::readEnclosed(const std::string & file, size_t pos, st { const auto current_character = file[pos++]; - if (enclosing_character == current_character) + if ('"' == current_character) { auto is_key_empty = start_index == pos; @@ -91,7 +91,7 @@ NextState KeyStateHandler::readEnclosed(const std::string & file, size_t pos, st return {pos, State::END}; } -NextState KeyStateHandler::readKeyValueDelimiter(const std::string & file, size_t pos) const +NextState KeyStateHandler::readKeyValueDelimiter(std::string_view file, size_t pos) { if (pos == file.size()) { @@ -100,7 +100,7 @@ NextState KeyStateHandler::readKeyValueDelimiter(const std::string & file, size_ else { const auto current_character = file[pos++]; - return {pos, current_character == key_value_delimiter ? State::WAITING_VALUE : State::WAITING_KEY}; + return {pos, current_character == '=' ? State::WAITING_VALUE : State::WAITING_KEY}; } } diff --git a/src/Functions/keyvaluepair/src/impl/state/KeyStateHandler.h b/src/Functions/keyvaluepair/src/impl/state/KeyStateHandler.h index 215700c8f09..ebc2849e10c 100644 --- a/src/Functions/keyvaluepair/src/impl/state/KeyStateHandler.h +++ b/src/Functions/keyvaluepair/src/impl/state/KeyStateHandler.h @@ -14,13 +14,13 @@ class KeyStateHandler : StateHandler public: KeyStateHandler(char key_value_delimiter, char escape_character, std::optional enclosing_character); - [[nodiscard]] NextState wait(const std::string & file, size_t pos) const; - [[nodiscard]] NextState read(const std::string & file, size_t pos, std::string_view & key); - [[nodiscard]] NextState readEnclosed(const std::string & file, size_t pos, std::string_view & key); - [[nodiscard]] NextState readKeyValueDelimiter(const std::string & file, size_t pos) const; + [[nodiscard]] static NextState wait(std::string_view file, size_t pos) ; + [[nodiscard]] static NextState read(std::string_view file, size_t pos, std::string_view & key); + [[nodiscard]] static NextState readEnclosed(std::string_view file, size_t pos, std::string_view & key); + [[nodiscard]] static NextState readKeyValueDelimiter(std::string_view file, size_t pos); private: - const char key_value_delimiter; + [[maybe_unused]] const char key_value_delimiter; }; } diff --git a/src/Functions/keyvaluepair/src/impl/state/StateHandler.cpp b/src/Functions/keyvaluepair/src/impl/state/StateHandler.cpp index b4f9cb0fd7a..67254ee4217 100644 --- a/src/Functions/keyvaluepair/src/impl/state/StateHandler.cpp +++ b/src/Functions/keyvaluepair/src/impl/state/StateHandler.cpp @@ -9,7 +9,7 @@ StateHandler::StateHandler(char escape_character_, std::optional enclosing { } -std::string_view StateHandler::createElement(const std::string & file, std::size_t begin, std::size_t end) +std::string_view StateHandler::createElement(std::string_view file, std::size_t begin, std::size_t end) { return std::string_view{file.begin() + begin, file.begin() + end}; } diff --git a/src/Functions/keyvaluepair/src/impl/state/StateHandler.h b/src/Functions/keyvaluepair/src/impl/state/StateHandler.h index 0a7f503ecf5..abc9f1ad4fc 100644 --- a/src/Functions/keyvaluepair/src/impl/state/StateHandler.h +++ b/src/Functions/keyvaluepair/src/impl/state/StateHandler.h @@ -17,7 +17,7 @@ struct StateHandler const std::optional enclosing_character; protected: - [[nodiscard]] static std::string_view createElement(const std::string & file, std::size_t begin, std::size_t end); + [[nodiscard]] static std::string_view createElement(std::string_view file, std::size_t begin, std::size_t end); }; } diff --git a/src/Functions/keyvaluepair/src/impl/state/ValueStateHandler.cpp b/src/Functions/keyvaluepair/src/impl/state/ValueStateHandler.cpp index 896ee2a6a42..dbd3fe2d495 100644 --- a/src/Functions/keyvaluepair/src/impl/state/ValueStateHandler.cpp +++ b/src/Functions/keyvaluepair/src/impl/state/ValueStateHandler.cpp @@ -14,7 +14,7 @@ ValueStateHandler::ValueStateHandler( { } -NextState ValueStateHandler::wait(const std::string & file, size_t pos) const +NextState ValueStateHandler::wait(std::string_view file, size_t pos) const { while (pos < file.size()) { @@ -41,7 +41,7 @@ NextState ValueStateHandler::wait(const std::string & file, size_t pos) const return {pos, State::READING_EMPTY_VALUE}; } -NextState ValueStateHandler::read(const std::string & file, size_t pos, std::string_view & value) +NextState ValueStateHandler::read(const std::string_view file, size_t pos, std::string_view & value) { bool escape = false; @@ -73,7 +73,7 @@ NextState ValueStateHandler::read(const std::string & file, size_t pos, std::str return {pos, State::FLUSH_PAIR}; } -NextState ValueStateHandler::readEnclosed(const std::string & file, size_t pos, std::string_view & value) +NextState ValueStateHandler::readEnclosed(std::string_view file, size_t pos, std::string_view & value) { auto start_index = pos; @@ -92,7 +92,7 @@ NextState ValueStateHandler::readEnclosed(const std::string & file, size_t pos, return {pos, State::END}; } -NextState ValueStateHandler::readEmpty(const std::string &, size_t pos, std::string_view & value) +NextState ValueStateHandler::readEmpty(std::string_view, size_t pos, std::string_view & value) { value = {}; return {pos + 1, State::FLUSH_PAIR}; @@ -100,7 +100,7 @@ NextState ValueStateHandler::readEmpty(const std::string &, size_t pos, std::str bool ValueStateHandler::isValidCharacter(char character) const { - return special_character_allowlist.contains(character) || std::isalnum(character) || character == '_'; + return /*special_character_allowlist.contains(character) ||*/ std::isalnum(character) || character == '_'; } } diff --git a/src/Functions/keyvaluepair/src/impl/state/ValueStateHandler.h b/src/Functions/keyvaluepair/src/impl/state/ValueStateHandler.h index 346cabc0f64..0dc610f01ec 100644 --- a/src/Functions/keyvaluepair/src/impl/state/ValueStateHandler.h +++ b/src/Functions/keyvaluepair/src/impl/state/ValueStateHandler.h @@ -18,14 +18,14 @@ public: std::optional enclosing_character, std::unordered_set special_character_allowlist_); - [[nodiscard]] NextState wait(const std::string & file, size_t pos) const; - [[nodiscard]] NextState read(const std::string & file, size_t pos, std::string_view & value); - [[nodiscard]] NextState readEnclosed(const std::string & file, size_t pos, std::string_view & value); - [[nodiscard]] static NextState readEmpty(const std::string & file, size_t pos, std::string_view & value); + [[nodiscard]] NextState wait(std::string_view file, size_t pos) const; + [[nodiscard]] NextState read(std::string_view file, size_t pos, std::string_view & value); + [[nodiscard]] NextState readEnclosed(std::string_view file, size_t pos, std::string_view & value); + [[nodiscard]] static NextState readEmpty(std::string_view file, size_t pos, std::string_view & value); private: const char item_delimiter; - std::unordered_set special_character_allowlist; + [[maybe_unused]] std::unordered_set special_character_allowlist; bool isValidCharacter(char character) const; };