mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 15:12:02 +00:00
Implemented hasTokenCaseInsensitive function
And tests;
This commit is contained in:
parent
eeeaf8fbee
commit
d0d63d769e
@ -775,6 +775,7 @@ using ASCIICaseInsensitiveStringSearcher = StringSearcher<false, true>;
|
||||
using UTF8CaseSensitiveStringSearcher = StringSearcher<true, false>;
|
||||
using UTF8CaseInsensitiveStringSearcher = StringSearcher<false, false>;
|
||||
using ASCIICaseSensitiveTokenSearcher = TokenSearcher<ASCIICaseSensitiveStringSearcher>;
|
||||
using ASCIICaseInsensitiveTokenSearcher = TokenSearcher<ASCIICaseInsensitiveStringSearcher>;
|
||||
|
||||
|
||||
/** Uses functions from libc.
|
||||
|
@ -652,7 +652,8 @@ using VolnitskyUTF8 = VolnitskyBase<true, false, ASCIICaseSensitiveStringSearche
|
||||
using VolnitskyCaseInsensitive = VolnitskyBase<false, true, ASCIICaseInsensitiveStringSearcher>; /// ignores non-ASCII bytes
|
||||
using VolnitskyCaseInsensitiveUTF8 = VolnitskyBase<false, false, UTF8CaseInsensitiveStringSearcher>;
|
||||
|
||||
using VolnitskyToken = VolnitskyBase<true, true, ASCIICaseSensitiveTokenSearcher>;
|
||||
using VolnitskyCaseSensitiveToken = VolnitskyBase<true, true, ASCIICaseSensitiveTokenSearcher>;
|
||||
using VolnitskyCaseInsensitiveToken = VolnitskyBase<false, true, ASCIICaseInsensitiveTokenSearcher>;
|
||||
|
||||
using MultiVolnitsky = MultiVolnitskyBase<true, true, ASCIICaseSensitiveStringSearcher>;
|
||||
using MultiVolnitskyUTF8 = MultiVolnitskyBase<true, false, ASCIICaseSensitiveStringSearcher>;
|
||||
|
@ -436,7 +436,7 @@ struct MultiSearchFirstIndexImpl
|
||||
|
||||
/** Token search in a string: the needle must be surrounded by separator characters, such as whitespace or punctuation.
|
||||
*/
|
||||
template <bool negate_result = false>
|
||||
template <typename TokenSearcher, bool negate_result = false>
|
||||
struct HasTokenImpl
|
||||
{
|
||||
using ResultType = UInt8;
|
||||
@ -454,7 +454,7 @@ struct HasTokenImpl
|
||||
/// The current index in the array of strings.
|
||||
size_t i = 0;
|
||||
|
||||
VolnitskyToken searcher(pattern.data(), pattern.size(), end - pos);
|
||||
TokenSearcher searcher(pattern.data(), pattern.size(), end - pos);
|
||||
|
||||
/// We will search for the next occurrence in all rows at once.
|
||||
while (pos < end && end != (pos = searcher.search(pos, end - pos)))
|
||||
@ -483,7 +483,7 @@ struct HasTokenImpl
|
||||
|
||||
static void constant_constant(const std::string & data, const std::string & pattern, UInt8 & res)
|
||||
{
|
||||
VolnitskyToken searcher(pattern.data(), pattern.size(), data.size());
|
||||
TokenSearcher searcher(pattern.data(), pattern.size(), data.size());
|
||||
const auto found = searcher.search(data.c_str(), data.size()) != data.end().base();
|
||||
res = negate_result ^ found;
|
||||
}
|
||||
@ -589,6 +589,11 @@ struct NameHasToken
|
||||
static constexpr auto name = "hasToken";
|
||||
};
|
||||
|
||||
struct NameHasTokenCaseInsensitive
|
||||
{
|
||||
static constexpr auto name = "hasTokenCaseInsensitive";
|
||||
};
|
||||
|
||||
|
||||
using FunctionPosition = FunctionsStringSearch<PositionImpl<PositionCaseSensitiveASCII>, NamePosition>;
|
||||
using FunctionPositionUTF8 = FunctionsStringSearch<PositionImpl<PositionCaseSensitiveUTF8>, NamePositionUTF8>;
|
||||
@ -615,7 +620,8 @@ using FunctionMultiSearchFirstPositionUTF8 = FunctionsMultiStringSearch<MultiSea
|
||||
using FunctionMultiSearchFirstPositionCaseInsensitive = FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseInsensitiveASCII>, NameMultiSearchFirstPositionCaseInsensitive>;
|
||||
using FunctionMultiSearchFirstPositionCaseInsensitiveUTF8 = FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchFirstPositionCaseInsensitiveUTF8>;
|
||||
|
||||
using FunctionHasToken = FunctionsStringSearch<HasTokenImpl<false>, NameHasToken>;
|
||||
using FunctionHasToken = FunctionsStringSearch<HasTokenImpl<VolnitskyCaseSensitiveToken, false>, NameHasToken>;
|
||||
using FunctionHasTokenCaseInsensitive = FunctionsStringSearch<HasTokenImpl<VolnitskyCaseInsensitiveToken, false>, NameHasTokenCaseInsensitive>;
|
||||
|
||||
void registerFunctionsStringSearch(FunctionFactory & factory)
|
||||
{
|
||||
@ -645,6 +651,7 @@ void registerFunctionsStringSearch(FunctionFactory & factory)
|
||||
factory.registerFunction<FunctionMultiSearchFirstPositionCaseInsensitiveUTF8>();
|
||||
|
||||
factory.registerFunction<FunctionHasToken>();
|
||||
factory.registerFunction<FunctionHasTokenCaseInsensitive>();
|
||||
|
||||
factory.registerAlias("locate", NamePosition::name, FunctionFactory::CaseInsensitive);
|
||||
}
|
||||
|
@ -2,11 +2,12 @@
|
||||
# encoding: utf-8
|
||||
|
||||
import re
|
||||
from string import Template
|
||||
|
||||
HAYSTACKS = [
|
||||
"hay hay hay hay hay hay hay hay hay needle hay hay hay hay hay hay hay hay hay",
|
||||
"hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay needle",
|
||||
"needle hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay",
|
||||
"hay hay hay hay hay hay hay hay hay needle hay hay hay hay hay hay hay hay hay",
|
||||
]
|
||||
|
||||
NEEDLE = "needle"
|
||||
@ -48,47 +49,77 @@ def transform_needle(query, string_transformation_func):
|
||||
|
||||
return NEEDLE_RE.sub(replace_with_transformation, query)
|
||||
|
||||
|
||||
def create_cases(table_row_template, table_query_template, const_query_template):
|
||||
def create_cases(case_sensitive_func, case_insensitive_func, table_row_template, table_query_template, const_query_template):
|
||||
const_queries = []
|
||||
table_rows = []
|
||||
table_queries = set()
|
||||
|
||||
def add_case(haystack, needle, match):
|
||||
def add_case(func, haystack, needle, match):
|
||||
match = int(match)
|
||||
const_queries.append(const_query_template.format(haystack=haystack, needle=needle, match=match))
|
||||
table_queries.add(table_query_template.format(haystack=haystack, needle=needle, match=match))
|
||||
table_rows.append(table_row_template.format(haystack=haystack, needle=needle, match=match))
|
||||
args = dict(
|
||||
func = func,
|
||||
haystack = haystack,
|
||||
needle = needle,
|
||||
match = match
|
||||
)
|
||||
const_queries.append(const_query_template.substitute(args))
|
||||
table_queries.add(table_query_template.substitute(args))
|
||||
table_rows.append(table_row_template.substitute(args))
|
||||
|
||||
def add_case_sensitive(haystack, needle, match):
|
||||
add_case(case_sensitive_func, haystack, needle, match)
|
||||
if match:
|
||||
add_case(case_sensitive_func, transform_needle(haystack, str.swapcase), transform_needle(needle, str.swapcase), match)
|
||||
|
||||
def add_case_insensitive(haystack, needle, match):
|
||||
add_case(case_insensitive_func, haystack, needle, match)
|
||||
if match:
|
||||
add_case(case_insensitive_func, transform_needle(haystack, str.swapcase), needle, match)
|
||||
add_case(case_insensitive_func, haystack, transform_needle(needle, str.swapcase), match)
|
||||
|
||||
|
||||
# Negative cases
|
||||
add_case(remove_needle(HAYSTACKS[0]), NEEDLE, False)
|
||||
add_case_sensitive(remove_needle(HAYSTACKS[0]), NEEDLE, False)
|
||||
add_case_insensitive(remove_needle(HAYSTACKS[0]), NEEDLE, False)
|
||||
|
||||
for haystack in HAYSTACKS:
|
||||
add_case(transform_needle(haystack, str.title), NEEDLE, False)
|
||||
add_case_sensitive(transform_needle(haystack, str.swapcase), NEEDLE, False)
|
||||
|
||||
sep = ''
|
||||
h = replace_separators(haystack, sep)
|
||||
add_case(h, NEEDLE, False)
|
||||
add_case(small_needle(h), small_needle(NEEDLE), False)
|
||||
add_case(enlarge_haystack(h, 10, sep), NEEDLE, False)
|
||||
|
||||
add_case_sensitive(h, NEEDLE, False)
|
||||
add_case_insensitive(h, NEEDLE, False)
|
||||
|
||||
add_case_sensitive(small_needle(h), small_needle(NEEDLE), False)
|
||||
add_case_insensitive(small_needle(h), small_needle(NEEDLE), False)
|
||||
|
||||
add_case_sensitive(enlarge_haystack(h, 10, sep), NEEDLE, False)
|
||||
add_case_insensitive(enlarge_haystack(h, 10, sep), NEEDLE, False)
|
||||
|
||||
# positive cases
|
||||
for haystack in HAYSTACKS:
|
||||
add_case(transform_needle(haystack, str.title), transform_needle(NEEDLE, str.title), True)
|
||||
add_case(transform_needle(haystack, str.upper), transform_needle(NEEDLE, str.upper), True)
|
||||
add_case_sensitive(haystack, NEEDLE, True)
|
||||
add_case_insensitive(haystack, NEEDLE, True)
|
||||
|
||||
# Not checking all separators since some (like ' and \n) cause issues when coupled with
|
||||
# re-based replacement and quoting in query
|
||||
# others are rare in practice, and checking all separators makes this test too lengthy.
|
||||
|
||||
# r'\\\\' turns into a single '\' in query
|
||||
#separators = list(''' \t`~!@#$%^&*()-=+|]}[{";:/?.>,<''') + [r'\\\\']
|
||||
separators = list(''' \t;:?.,''') + [r'\\\\']
|
||||
for sep in separators:
|
||||
for sep in list(''' ,'''):
|
||||
h = replace_separators(haystack, sep)
|
||||
add_case(h, NEEDLE, True)
|
||||
add_case(small_needle(h), small_needle(NEEDLE), True)
|
||||
add_case(enlarge_haystack(h, 200, sep), NEEDLE, True)
|
||||
add_case(replace_needle(h, 'иголка'), replace_needle(NEEDLE, 'иголка'), True)
|
||||
add_case(replace_needle(h, '指针'), replace_needle(NEEDLE, '指针'), True)
|
||||
add_case_sensitive(h, NEEDLE, True)
|
||||
add_case_sensitive(small_needle(h), small_needle(NEEDLE), True)
|
||||
add_case_sensitive(enlarge_haystack(h, 200, sep), NEEDLE, True)
|
||||
|
||||
add_case_insensitive(h, NEEDLE, True)
|
||||
add_case_insensitive(small_needle(h), small_needle(NEEDLE), True)
|
||||
add_case_insensitive(enlarge_haystack(h, 200, sep), NEEDLE, True)
|
||||
|
||||
# case insensitivity works only on ASCII strings
|
||||
add_case_sensitive(replace_needle(h, 'иголка'), replace_needle(NEEDLE, 'иголка'), True)
|
||||
add_case_sensitive(replace_needle(h, '指针'), replace_needle(NEEDLE, '指针'), True)
|
||||
|
||||
for sep in list('''~!@$%^&*()-=+|]}[{";:/?.><\t''') + [r'\\\\']:
|
||||
h = replace_separators(HAYSTACKS[0], sep)
|
||||
add_case(case_sensitive_func, h, NEEDLE, True)
|
||||
|
||||
return table_rows, table_queries, const_queries
|
||||
|
||||
@ -97,12 +128,14 @@ def main():
|
||||
def query(x):
|
||||
print x
|
||||
|
||||
CONST_QUERY = """SELECT hasToken('{haystack}', '{needle}'), ' expecting ', {match};"""
|
||||
#SELECT hasToken(haystack, '{needle}') FROM ht WHERE needle = '{needle}' AND match = {match};"""
|
||||
TABLE_QUERY = """WITH '{needle}' as n SELECT haystack, needle, hasToken(haystack, n) as result FROM ht WHERE needle = n AND result != match;"""
|
||||
TABLE_ROW = """('{haystack}', '{needle}', {match})"""
|
||||
CONST_QUERY = Template("""SELECT ${func}('${haystack}', '${needle}'), ' expecting ', ${match};""")
|
||||
TABLE_QUERY = Template("""WITH '${needle}' as n
|
||||
SELECT haystack, needle, ${func}(haystack, n) as result
|
||||
FROM ht
|
||||
WHERE func = '${func}' AND needle = n AND result != match;""")
|
||||
TABLE_ROW = Template("""('${haystack}', '${needle}', ${match}, '${func}')""")
|
||||
|
||||
rows, table_queries, const_queries = create_cases(TABLE_ROW, TABLE_QUERY, CONST_QUERY)
|
||||
rows, table_queries, const_queries = create_cases('hasToken', 'hasTokenCaseInsensitive', TABLE_ROW, TABLE_QUERY, CONST_QUERY)
|
||||
for q in const_queries:
|
||||
query(q)
|
||||
|
||||
@ -112,7 +145,8 @@ def main():
|
||||
(
|
||||
haystack String,
|
||||
needle String,
|
||||
match UInt8
|
||||
match UInt8,
|
||||
func String
|
||||
)
|
||||
ENGINE MergeTree()
|
||||
ORDER BY haystack;
|
||||
@ -120,5 +154,7 @@ INSERT INTO ht VALUES {values};""".format(values=", ".join(rows)))
|
||||
for q in sorted(table_queries):
|
||||
query(q)
|
||||
|
||||
query("""DROP TABLE ht""")
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
@ -11,6 +11,47 @@
|
||||
0 expecting 0
|
||||
0 expecting 0
|
||||
0 expecting 0
|
||||
0 expecting 0
|
||||
0 expecting 0
|
||||
0 expecting 0
|
||||
0 expecting 0
|
||||
0 expecting 0
|
||||
0 expecting 0
|
||||
0 expecting 0
|
||||
0 expecting 0
|
||||
0 expecting 0
|
||||
0 expecting 0
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
|
Loading…
Reference in New Issue
Block a user