This commit is contained in:
Arthur Passos 2023-01-13 15:24:14 -03:00
parent 296a19129b
commit b848189726
9 changed files with 134 additions and 32 deletions

View File

@ -8,6 +8,11 @@
#include <Functions/keyvaluepair/src/KeyValuePairExtractorBuilder.h>
#include <Common/assert_cast.h>
#include <chrono>
/* Only needed for the sake of this example. */
#include <iostream>
namespace DB
{
@ -41,16 +46,46 @@ String ExtractKeyValuePairs::getName() const
// Entry point of the function: parses the call arguments, builds an extractor configured
// with them, pulls the raw key/value columns out of the data column and hands them to
// escape(), whose result is returned.
//
// The second and third parameters (result type and row count) are not used here.
ColumnPtr ExtractKeyValuePairs::executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const
{
    auto [data_column, escape_character, key_value_pair_delimiter, item_delimiter, enclosing_character, value_special_characters_allow_list]
        = parseArguments(arguments);

    auto extractor = getExtractor(
        escape_character, key_value_pair_delimiter, item_delimiter, enclosing_character, value_special_characters_allow_list);

    auto raw_columns = extract(extractor, data_column);

    // When no escape character was supplied by the caller, a backslash is used.
    return escape(raw_columns, escape_character ? escape_character.value() : '\\');
}
bool ExtractKeyValuePairs::isVariadic() const
@ -183,19 +218,37 @@ std::shared_ptr<KeyValuePairExtractor<ExtractKeyValuePairs::EscapingProcessorOut
ExtractKeyValuePairs::RawColumns ExtractKeyValuePairs::extract(std::shared_ptr<KeyValuePairExtractor<EscapingProcessorOutput>> extractor, ColumnPtr data_column)
{
using std::chrono::high_resolution_clock;
using std::chrono::duration_cast;
using std::chrono::duration;
using std::chrono::microseconds;
auto offsets = ColumnUInt64::create();
auto keys = ColumnString::create();
auto values = ColumnString::create();
// keys->reserve(data_column->byteSize());
// values->reserve(data_column->byteSize());
auto row_offset = 0u;
// long totalExtractionTime = 0;
// long shortestExtractionTime = std::numeric_limits<long>::max();
// long longestExtractionTime = std::numeric_limits<long>::min();
//
// long totalInsertionTime = 0;
// long shortestInsertionTime = std::numeric_limits<long>::max();
// long longestInsertionTime = std::numeric_limits<long>::min();
for (auto i = 0u; i < data_column->size(); i++)
{
auto row = data_column->getDataAt(i).toString();
auto row = data_column->getDataAt(i).toView();
// auto t2 = high_resolution_clock::now();
// TODO avoid copying
auto response = extractor->extract(row);
// auto t3 = high_resolution_clock::now();
for (auto [key, value] : response)
{
@ -206,8 +259,46 @@ ExtractKeyValuePairs::RawColumns ExtractKeyValuePairs::extract(std::shared_ptr<K
}
offsets->insert(row_offset);
// auto t4 = high_resolution_clock::now();
//
// long extractionTime = duration_cast<microseconds>(t3 - t2).count();
// long insertionTime = duration_cast<microseconds>(t4 - t3).count();
//
// totalExtractionTime += extractionTime;
// totalInsertionTime += insertionTime;
//
// if (extractionTime > longestExtractionTime)
// {
// longestExtractionTime = extractionTime;
// }
//
// if (extractionTime < shortestExtractionTime)
// {
// shortestExtractionTime = extractionTime;
// }
//
// if (insertionTime > longestInsertionTime)
// {
// longestInsertionTime = insertionTime;
// }
//
// if (insertionTime < shortestInsertionTime)
// {
// shortestInsertionTime = insertionTime;
// }
}
// if (!data_column->empty())
// {
// auto averageInsertionTime = totalInsertionTime / data_column->size();
// auto averageExtractionTime = totalExtractionTime / data_column->size();
//
// std::cout<<"Longest extraction time: "<<longestExtractionTime<<" - Shortest extraction time: "<<shortestExtractionTime<<" - Average extraction time: "<<averageExtractionTime<<" - Total: "<<totalExtractionTime<<"\n";
// std::cout<<"Longest insertion time: "<<longestInsertionTime<<" - Shortest extraction time: "<<shortestInsertionTime<<" - Average insertion time: "<<averageInsertionTime<<" - Total: "<<totalInsertionTime<<"\n";
//
// }
return {std::move(keys), std::move(values), std::move(offsets)};
}
@ -218,6 +309,9 @@ ColumnPtr ExtractKeyValuePairs::escape(RawColumns & raw_columns, char escape_cha
auto escaped_keys = ColumnString::create();
auto escaped_values = ColumnString::create();
escaped_keys->reserve(raw_keys->size());
escaped_values->reserve(raw_values->size());
auto escape_character_string = std::string(1, escape_character);
using ReplaceString = ReplaceStringImpl<ReplaceStringTraits::Replace::All>;

View File

@ -39,6 +39,8 @@ struct KeyValuePairExtractor
virtual ~KeyValuePairExtractor() = default;
// Runs the extraction over an owning string and returns the implementation's Response
// (the Response type is declared outside this excerpt).
virtual Response extract(const std::string & data) = 0;
// Same contract as the overload above, but takes a non-owning view of the input.
// NOTE(review): the implementation visible in this change keeps string_views into the
// input, so the viewed buffer presumably must outlive the extraction — confirm.
virtual Response extract(const std::string_view & data) = 0;
};
}

View File

@ -33,6 +33,12 @@ public:
}
// Convenience overload for owning strings: wraps `file` in a std::string_view and defers
// to the string_view overload, which does the actual work.
[[nodiscard]] Response extract(const std::string & file) override
{
    return extract(std::string_view{file});
}
[[nodiscard]] Response extract(const std::string_view & file) override
{
std::unordered_map<std::string_view, std::string_view> response_views;
@ -55,7 +61,7 @@ public:
}
private:
NextState processState(const std::string & file, std::size_t pos, State state,
NextState processState(std::string_view file, std::size_t pos, State state,
std::string_view & key, std::string_view & value,
std::unordered_map<std::string_view, std::string_view> & response_views)
{
@ -84,7 +90,7 @@ private:
}
}
NextState flushPair(const std::string & file, std::size_t pos, std::string_view key,
NextState flushPair(const std::string_view & file, std::size_t pos, std::string_view key,
std::string_view value, std::unordered_map<std::string_view, std::string_view> & response_views)
{
response_views[key] = value;

View File

@ -8,7 +8,7 @@ KeyStateHandler::KeyStateHandler(char key_value_delimiter_, char escape_characte
{
}
NextState KeyStateHandler::wait(const std::string & file, size_t pos) const
NextState KeyStateHandler::wait(std::string_view file, size_t pos)
{
while (pos < file.size())
{
@ -17,7 +17,7 @@ NextState KeyStateHandler::wait(const std::string & file, size_t pos) const
{
return {pos, State::READING_KEY};
}
else if (enclosing_character && current_character == enclosing_character)
else if (current_character == '"')
{
return {pos + 1u, State::READING_ENCLOSED_KEY};
}
@ -30,7 +30,7 @@ NextState KeyStateHandler::wait(const std::string & file, size_t pos) const
return {pos, State::END};
}
NextState KeyStateHandler::read(const std::string & file, size_t pos, std::string_view & key)
NextState KeyStateHandler::read(std::string_view file, size_t pos, std::string_view & key)
{
bool escape = false;
@ -45,11 +45,11 @@ NextState KeyStateHandler::read(const std::string & file, size_t pos, std::strin
{
escape = false;
}
else if (escape_character == current_character)
else if ('\\' == current_character)
{
escape = true;
}
else if (current_character == key_value_delimiter)
else if (current_character == '=')
{
// not checking for empty key because with current waitKey implementation
// there is no way this piece of code will be reached for the very first key character
@ -65,7 +65,7 @@ NextState KeyStateHandler::read(const std::string & file, size_t pos, std::strin
return {pos, State::END};
}
NextState KeyStateHandler::readEnclosed(const std::string & file, size_t pos, std::string_view & key)
NextState KeyStateHandler::readEnclosed(std::string_view file, size_t pos, std::string_view & key)
{
auto start_index = pos;
key = {};
@ -74,7 +74,7 @@ NextState KeyStateHandler::readEnclosed(const std::string & file, size_t pos, st
{
const auto current_character = file[pos++];
if (enclosing_character == current_character)
if ('"' == current_character)
{
auto is_key_empty = start_index == pos;
@ -91,7 +91,7 @@ NextState KeyStateHandler::readEnclosed(const std::string & file, size_t pos, st
return {pos, State::END};
}
NextState KeyStateHandler::readKeyValueDelimiter(const std::string & file, size_t pos) const
NextState KeyStateHandler::readKeyValueDelimiter(std::string_view file, size_t pos)
{
if (pos == file.size())
{
@ -100,7 +100,7 @@ NextState KeyStateHandler::readKeyValueDelimiter(const std::string & file, size_
else
{
const auto current_character = file[pos++];
return {pos, current_character == key_value_delimiter ? State::WAITING_VALUE : State::WAITING_KEY};
return {pos, current_character == '=' ? State::WAITING_VALUE : State::WAITING_KEY};
}
}

View File

@ -14,13 +14,13 @@ class KeyStateHandler : StateHandler
public:
KeyStateHandler(char key_value_delimiter, char escape_character, std::optional<char> enclosing_character);
[[nodiscard]] NextState wait(const std::string & file, size_t pos) const;
[[nodiscard]] NextState read(const std::string & file, size_t pos, std::string_view & key);
[[nodiscard]] NextState readEnclosed(const std::string & file, size_t pos, std::string_view & key);
[[nodiscard]] NextState readKeyValueDelimiter(const std::string & file, size_t pos) const;
// NOTE(review): the definitions visible for these static members compare against the
// literals '"', '\\' and '=' rather than the characters passed to the constructor —
// confirm that ignoring the configured characters is intended.

// Scans forward from pos for the start of a key. Yields READING_KEY, READING_ENCLOSED_KEY
// (positioned just past an opening '"') or END once the input is exhausted.
[[nodiscard]] static NextState wait(std::string_view file, size_t pos) ;
// Reads a non-enclosed key starting at pos; a backslash flags the following character as
// escaped and '=' is treated as the key/value delimiter. Yields END if the input runs out.
// `key` is the out-parameter for the key view.
[[nodiscard]] static NextState read(std::string_view file, size_t pos, std::string_view & key);
// Reads a key that was opened by '"', starting at pos, until the closing '"'.
// `key` is reset to an empty view first. Yields END if the input runs out before the key closes.
[[nodiscard]] static NextState readEnclosed(std::string_view file, size_t pos, std::string_view & key);
// Consumes the character at pos: '=' leads to WAITING_VALUE, anything else to WAITING_KEY.
// The branch taken when pos == file.size() is not visible in this excerpt.
[[nodiscard]] static NextState readKeyValueDelimiter(std::string_view file, size_t pos);
private:
const char key_value_delimiter;
[[maybe_unused]] const char key_value_delimiter;
};
}

View File

@ -9,7 +9,7 @@ StateHandler::StateHandler(char escape_character_, std::optional<char> enclosing
{
}
std::string_view StateHandler::createElement(const std::string & file, std::size_t begin, std::size_t end)
// Returns a non-owning view of the half-open range [begin, end) of `file`.
// No bounds checking is performed; the caller must pass begin <= end <= file.size().
std::string_view StateHandler::createElement(std::string_view file, std::size_t begin, std::size_t end)
{
    const char * first = file.data() + begin;
    const std::size_t length = end - begin;
    return std::string_view{first, length};
}

View File

@ -17,7 +17,7 @@ struct StateHandler
const std::optional<char> enclosing_character;
protected:
[[nodiscard]] static std::string_view createElement(const std::string & file, std::size_t begin, std::size_t end);
// Builds a view over the characters of `file` from index `begin` up to, but not including,
// index `end`. The result refers to the same buffer as `file`; nothing is copied.
[[nodiscard]] static std::string_view createElement(std::string_view file, std::size_t begin, std::size_t end);
};
}

View File

@ -14,7 +14,7 @@ ValueStateHandler::ValueStateHandler(
{
}
NextState ValueStateHandler::wait(const std::string & file, size_t pos) const
NextState ValueStateHandler::wait(std::string_view file, size_t pos) const
{
while (pos < file.size())
{
@ -41,7 +41,7 @@ NextState ValueStateHandler::wait(const std::string & file, size_t pos) const
return {pos, State::READING_EMPTY_VALUE};
}
NextState ValueStateHandler::read(const std::string & file, size_t pos, std::string_view & value)
NextState ValueStateHandler::read(const std::string_view file, size_t pos, std::string_view & value)
{
bool escape = false;
@ -73,7 +73,7 @@ NextState ValueStateHandler::read(const std::string & file, size_t pos, std::str
return {pos, State::FLUSH_PAIR};
}
NextState ValueStateHandler::readEnclosed(const std::string & file, size_t pos, std::string_view & value)
NextState ValueStateHandler::readEnclosed(std::string_view file, size_t pos, std::string_view & value)
{
auto start_index = pos;
@ -92,7 +92,7 @@ NextState ValueStateHandler::readEnclosed(const std::string & file, size_t pos,
return {pos, State::END};
}
NextState ValueStateHandler::readEmpty(const std::string &, size_t pos, std::string_view & value)
NextState ValueStateHandler::readEmpty(std::string_view, size_t pos, std::string_view & value)
{
value = {};
return {pos + 1, State::FLUSH_PAIR};
@ -100,7 +100,7 @@ NextState ValueStateHandler::readEmpty(const std::string &, size_t pos, std::str
// Tells whether `character` may appear in a non-enclosed value: it must either be listed
// in special_character_allowlist, be alphanumeric, or be an underscore.
bool ValueStateHandler::isValidCharacter(char character) const
{
    if (special_character_allowlist.contains(character))
        return true;

    // std::isalnum has undefined behaviour for negative arguments other than EOF, and plain
    // char may be signed, so the value is converted to unsigned char first.
    return std::isalnum(static_cast<unsigned char>(character)) || character == '_';
}
}

View File

@ -18,14 +18,14 @@ public:
std::optional<char> enclosing_character,
std::unordered_set<char> special_character_allowlist_);
[[nodiscard]] NextState wait(const std::string & file, size_t pos) const;
[[nodiscard]] NextState read(const std::string & file, size_t pos, std::string_view & value);
[[nodiscard]] NextState readEnclosed(const std::string & file, size_t pos, std::string_view & value);
[[nodiscard]] static NextState readEmpty(const std::string & file, size_t pos, std::string_view & value);
// Scans forward from pos for the start of a value. When the input is exhausted the result
// is READING_EMPTY_VALUE; the other outcomes are not visible in this excerpt.
[[nodiscard]] NextState wait(std::string_view file, size_t pos) const;
// Reads a non-enclosed value starting at pos while tracking an escape flag; reaching the
// end of the input yields FLUSH_PAIR. `value` is the out-parameter for the value view.
[[nodiscard]] NextState read(std::string_view file, size_t pos, std::string_view & value);
// Reads an enclosed value starting at pos; yields END if the input runs out first.
// `value` is the out-parameter for the value view.
[[nodiscard]] NextState readEnclosed(std::string_view file, size_t pos, std::string_view & value);
// Produces an empty value: sets `value` to an empty view and yields FLUSH_PAIR at pos + 1.
// The `file` argument is not used.
[[nodiscard]] static NextState readEmpty(std::string_view file, size_t pos, std::string_view & value);
private:
const char item_delimiter;
std::unordered_set<char> special_character_allowlist;
[[maybe_unused]] std::unordered_set<char> special_character_allowlist;
bool isValidCharacter(char character) const;
};