Implemented hasTokenCaseInsensitive function

And tests.
This commit is contained in:
Vasily Nemkov 2019-08-26 11:00:48 +03:00
parent eeeaf8fbee
commit d0d63d769e
5 changed files with 123 additions and 37 deletions

View File

@ -775,6 +775,7 @@ using ASCIICaseInsensitiveStringSearcher = StringSearcher<false, true>;
using UTF8CaseSensitiveStringSearcher = StringSearcher<true, false>;
using UTF8CaseInsensitiveStringSearcher = StringSearcher<false, false>;
using ASCIICaseSensitiveTokenSearcher = TokenSearcher<ASCIICaseSensitiveStringSearcher>;
using ASCIICaseInsensitiveTokenSearcher = TokenSearcher<ASCIICaseInsensitiveStringSearcher>;
/** Uses functions from libc.

View File

@ -652,7 +652,8 @@ using VolnitskyUTF8 = VolnitskyBase<true, false, ASCIICaseSensitiveStringSearche
using VolnitskyCaseInsensitive = VolnitskyBase<false, true, ASCIICaseInsensitiveStringSearcher>; /// ignores non-ASCII bytes
using VolnitskyCaseInsensitiveUTF8 = VolnitskyBase<false, false, UTF8CaseInsensitiveStringSearcher>;
using VolnitskyToken = VolnitskyBase<true, true, ASCIICaseSensitiveTokenSearcher>;
using VolnitskyCaseSensitiveToken = VolnitskyBase<true, true, ASCIICaseSensitiveTokenSearcher>;
using VolnitskyCaseInsensitiveToken = VolnitskyBase<false, true, ASCIICaseInsensitiveTokenSearcher>;
using MultiVolnitsky = MultiVolnitskyBase<true, true, ASCIICaseSensitiveStringSearcher>;
using MultiVolnitskyUTF8 = MultiVolnitskyBase<true, false, ASCIICaseSensitiveStringSearcher>;

View File

@ -436,7 +436,7 @@ struct MultiSearchFirstIndexImpl
/** Token search in a string: the needle must be surrounded by separator characters, such as whitespace or punctuation.
*/
template <bool negate_result = false>
template <typename TokenSearcher, bool negate_result = false>
struct HasTokenImpl
{
using ResultType = UInt8;
@ -454,7 +454,7 @@ struct HasTokenImpl
/// The current index in the array of strings.
size_t i = 0;
VolnitskyToken searcher(pattern.data(), pattern.size(), end - pos);
TokenSearcher searcher(pattern.data(), pattern.size(), end - pos);
/// We will search for the next occurrence in all rows at once.
while (pos < end && end != (pos = searcher.search(pos, end - pos)))
@ -483,7 +483,7 @@ struct HasTokenImpl
static void constant_constant(const std::string & data, const std::string & pattern, UInt8 & res)
{
VolnitskyToken searcher(pattern.data(), pattern.size(), data.size());
TokenSearcher searcher(pattern.data(), pattern.size(), data.size());
const auto found = searcher.search(data.c_str(), data.size()) != data.end().base();
res = negate_result ^ found;
}
@ -589,6 +589,11 @@ struct NameHasToken
static constexpr auto name = "hasToken";
};
/// Name tag for the case-insensitive token search function.
/// Passed as the name parameter of FunctionsStringSearch when FunctionHasTokenCaseInsensitive is defined.
struct NameHasTokenCaseInsensitive
{
    static constexpr auto name = "hasTokenCaseInsensitive";
};
using FunctionPosition = FunctionsStringSearch<PositionImpl<PositionCaseSensitiveASCII>, NamePosition>;
using FunctionPositionUTF8 = FunctionsStringSearch<PositionImpl<PositionCaseSensitiveUTF8>, NamePositionUTF8>;
@ -615,7 +620,8 @@ using FunctionMultiSearchFirstPositionUTF8 = FunctionsMultiStringSearch<MultiSea
using FunctionMultiSearchFirstPositionCaseInsensitive = FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseInsensitiveASCII>, NameMultiSearchFirstPositionCaseInsensitive>;
using FunctionMultiSearchFirstPositionCaseInsensitiveUTF8 = FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchFirstPositionCaseInsensitiveUTF8>;
using FunctionHasToken = FunctionsStringSearch<HasTokenImpl<false>, NameHasToken>;
using FunctionHasToken = FunctionsStringSearch<HasTokenImpl<VolnitskyCaseSensitiveToken, false>, NameHasToken>;
using FunctionHasTokenCaseInsensitive = FunctionsStringSearch<HasTokenImpl<VolnitskyCaseInsensitiveToken, false>, NameHasTokenCaseInsensitive>;
void registerFunctionsStringSearch(FunctionFactory & factory)
{
@ -645,6 +651,7 @@ void registerFunctionsStringSearch(FunctionFactory & factory)
factory.registerFunction<FunctionMultiSearchFirstPositionCaseInsensitiveUTF8>();
factory.registerFunction<FunctionHasToken>();
factory.registerFunction<FunctionHasTokenCaseInsensitive>();
factory.registerAlias("locate", NamePosition::name, FunctionFactory::CaseInsensitive);
}

View File

@ -2,11 +2,12 @@
# encoding: utf-8
import re
from string import Template
HAYSTACKS = [
"hay hay hay hay hay hay hay hay hay needle hay hay hay hay hay hay hay hay hay",
"hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay needle",
"needle hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay",
"hay hay hay hay hay hay hay hay hay needle hay hay hay hay hay hay hay hay hay",
]
NEEDLE = "needle"
@ -48,47 +49,77 @@ def transform_needle(query, string_transformation_func):
return NEEDLE_RE.sub(replace_with_transformation, query)
def create_cases(table_row_template, table_query_template, const_query_template):
def create_cases(case_sensitive_func, case_insensitive_func, table_row_template, table_query_template, const_query_template):
const_queries = []
table_rows = []
table_queries = set()
def add_case(haystack, needle, match):
def add_case(func, haystack, needle, match):
match = int(match)
const_queries.append(const_query_template.format(haystack=haystack, needle=needle, match=match))
table_queries.add(table_query_template.format(haystack=haystack, needle=needle, match=match))
table_rows.append(table_row_template.format(haystack=haystack, needle=needle, match=match))
args = dict(
func = func,
haystack = haystack,
needle = needle,
match = match
)
const_queries.append(const_query_template.substitute(args))
table_queries.add(table_query_template.substitute(args))
table_rows.append(table_row_template.substitute(args))
def add_case_sensitive(haystack, needle, match):
    """Record a test case for the case-sensitive function.

    The case is always recorded as given. When a match is expected, a second
    case is recorded in which both the haystack and the needle are passed
    through transform_needle with str.swapcase, so the same transformation is
    applied on both sides and the match is still expected.
    """
    add_case(case_sensitive_func, haystack, needle, match)
    if not match:
        return

    swapped_haystack = transform_needle(haystack, str.swapcase)
    swapped_needle = transform_needle(needle, str.swapcase)
    add_case(case_sensitive_func, swapped_haystack, swapped_needle, match)
def add_case_insensitive(haystack, needle, match):
    """Record a test case for the case-insensitive function.

    The case is always recorded as given. When a match is expected, two more
    cases are recorded: one with only the haystack passed through
    transform_needle with str.swapcase, and one with only the needle
    transformed, so the two sides differ in letter case.
    """
    add_case(case_insensitive_func, haystack, needle, match)
    if not match:
        return

    # Haystack in swapped case, needle unchanged.
    swapped_haystack = transform_needle(haystack, str.swapcase)
    add_case(case_insensitive_func, swapped_haystack, needle, match)

    # Haystack unchanged, needle in swapped case.
    swapped_needle = transform_needle(needle, str.swapcase)
    add_case(case_insensitive_func, haystack, swapped_needle, match)
# Negative cases
add_case(remove_needle(HAYSTACKS[0]), NEEDLE, False)
add_case_sensitive(remove_needle(HAYSTACKS[0]), NEEDLE, False)
add_case_insensitive(remove_needle(HAYSTACKS[0]), NEEDLE, False)
for haystack in HAYSTACKS:
add_case(transform_needle(haystack, str.title), NEEDLE, False)
add_case_sensitive(transform_needle(haystack, str.swapcase), NEEDLE, False)
sep = ''
h = replace_separators(haystack, sep)
add_case(h, NEEDLE, False)
add_case(small_needle(h), small_needle(NEEDLE), False)
add_case(enlarge_haystack(h, 10, sep), NEEDLE, False)
add_case_sensitive(h, NEEDLE, False)
add_case_insensitive(h, NEEDLE, False)
add_case_sensitive(small_needle(h), small_needle(NEEDLE), False)
add_case_insensitive(small_needle(h), small_needle(NEEDLE), False)
add_case_sensitive(enlarge_haystack(h, 10, sep), NEEDLE, False)
add_case_insensitive(enlarge_haystack(h, 10, sep), NEEDLE, False)
# positive cases
for haystack in HAYSTACKS:
add_case(transform_needle(haystack, str.title), transform_needle(NEEDLE, str.title), True)
add_case(transform_needle(haystack, str.upper), transform_needle(NEEDLE, str.upper), True)
add_case_sensitive(haystack, NEEDLE, True)
add_case_insensitive(haystack, NEEDLE, True)
# Not checking all separators: some (like ' and \n) cause issues when coupled with
# re-based replacement and quoting in the query;
# others are rare in practice, and checking all separators makes this test too lengthy.
# r'\\\\' turns into a single '\' in query
#separators = list(''' \t`~!@#$%^&*()-=+|]}[{";:/?.>,<''') + [r'\\\\']
separators = list(''' \t;:?.,''') + [r'\\\\']
for sep in separators:
for sep in list(''' ,'''):
h = replace_separators(haystack, sep)
add_case(h, NEEDLE, True)
add_case(small_needle(h), small_needle(NEEDLE), True)
add_case(enlarge_haystack(h, 200, sep), NEEDLE, True)
add_case(replace_needle(h, 'иголка'), replace_needle(NEEDLE, 'иголка'), True)
add_case(replace_needle(h, '指针'), replace_needle(NEEDLE, '指针'), True)
add_case_sensitive(h, NEEDLE, True)
add_case_sensitive(small_needle(h), small_needle(NEEDLE), True)
add_case_sensitive(enlarge_haystack(h, 200, sep), NEEDLE, True)
add_case_insensitive(h, NEEDLE, True)
add_case_insensitive(small_needle(h), small_needle(NEEDLE), True)
add_case_insensitive(enlarge_haystack(h, 200, sep), NEEDLE, True)
# Case insensitivity works only on ASCII strings, so non-ASCII needles are checked with the case-sensitive function only.
add_case_sensitive(replace_needle(h, 'иголка'), replace_needle(NEEDLE, 'иголка'), True)
add_case_sensitive(replace_needle(h, '指针'), replace_needle(NEEDLE, '指针'), True)
for sep in list('''~!@$%^&*()-=+|]}[{";:/?.><\t''') + [r'\\\\']:
h = replace_separators(HAYSTACKS[0], sep)
add_case(case_sensitive_func, h, NEEDLE, True)
return table_rows, table_queries, const_queries
@ -97,12 +128,14 @@ def main():
def query(x):
    # Write one generated query string to stdout, followed by a newline.
    # The parenthesised single-argument form prints the same text under
    # Python 2 (where it parses as the print statement applied to `(x)`) and
    # under Python 3 (where print is a function); the bare `print x` statement
    # is a syntax error on Python 3.
    print(x)
CONST_QUERY = """SELECT hasToken('{haystack}', '{needle}'), ' expecting ', {match};"""
#SELECT hasToken(haystack, '{needle}') FROM ht WHERE needle = '{needle}' AND match = {match};"""
TABLE_QUERY = """WITH '{needle}' as n SELECT haystack, needle, hasToken(haystack, n) as result FROM ht WHERE needle = n AND result != match;"""
TABLE_ROW = """('{haystack}', '{needle}', {match})"""
CONST_QUERY = Template("""SELECT ${func}('${haystack}', '${needle}'), ' expecting ', ${match};""")
TABLE_QUERY = Template("""WITH '${needle}' as n
SELECT haystack, needle, ${func}(haystack, n) as result
FROM ht
WHERE func = '${func}' AND needle = n AND result != match;""")
TABLE_ROW = Template("""('${haystack}', '${needle}', ${match}, '${func}')""")
rows, table_queries, const_queries = create_cases(TABLE_ROW, TABLE_QUERY, CONST_QUERY)
rows, table_queries, const_queries = create_cases('hasToken', 'hasTokenCaseInsensitive', TABLE_ROW, TABLE_QUERY, CONST_QUERY)
for q in const_queries:
query(q)
@ -112,7 +145,8 @@ def main():
(
haystack String,
needle String,
match UInt8
match UInt8,
func String
)
ENGINE MergeTree()
ORDER BY haystack;
@ -120,5 +154,7 @@ INSERT INTO ht VALUES {values};""".format(values=", ".join(rows)))
for q in sorted(table_queries):
query(q)
query("""DROP TABLE ht""")
if __name__ == '__main__':
main()

View File

@ -11,6 +11,47 @@
0 expecting 0
0 expecting 0
0 expecting 0
0 expecting 0
0 expecting 0
0 expecting 0
0 expecting 0
0 expecting 0
0 expecting 0
0 expecting 0
0 expecting 0
0 expecting 0
0 expecting 0
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1
1 expecting 1