#!/usr/bin/env python3
# encoding: utf-8
"""Generate SQL statements that exercise ``hasToken``-style functions.

The script prints, in order: constant-argument queries, statements that
create and fill a helper table ``ht`` with the same cases, one query per
(function, needle) pair that selects rows whose result differs from the
expected ``match`` value, and finally a statement dropping the table.
"""

import re
from string import Template

HAYSTACKS = [
    "hay hay hay hay hay hay hay hay hay needle hay hay hay hay hay hay hay hay hay",
    "hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay needle",
    "needle hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay",
]

NEEDLE = "needle"

HAY_RE = re.compile(r"\bhay\b", re.IGNORECASE)
NEEDLE_RE = re.compile(r"\bneedle\b", re.IGNORECASE)
# Runs of whitespace separating the words of a haystack.
SEP_RE = re.compile(r"\s+")


def replace_follow_case(replacement):
    """Return a ``re.sub`` callback that mirrors the case of the matched text.

    The callback returns ``replacement`` lower-cased, title-cased or
    upper-cased when the matched text is respectively lower, title or upper
    case, and ``replacement`` unchanged otherwise.
    """

    def func(match):
        g = match.group()
        if g.islower():
            return replacement.lower()
        if g.istitle():
            return replacement.title()
        if g.isupper():
            return replacement.upper()
        return replacement

    return func


def replace_separators(query, new_sep):
    """Replace every run of whitespace in ``query`` with ``new_sep``.

    ``new_sep`` is handed to ``re.sub`` as a replacement template, so
    backslash escapes inside it are processed (a doubled backslash yields a
    single one). ``create_cases`` relies on that for its backslash separator.
    """
    return SEP_RE.sub(new_sep, query)


def enlarge_haystack(query, times, separator=""):
    """Replace each whole-word "hay" with ``times`` copies of "hay" + ``separator``.

    The inserted text follows the case of the word it replaces.
    """
    return HAY_RE.sub(replace_follow_case(("hay" + separator) * times), query)


def small_needle(query):
    """Shrink each whole-word "needle" to a single "n", following its case."""
    return NEEDLE_RE.sub(replace_follow_case("n"), query)


def remove_needle(query):
    """Delete each whole-word "needle" from ``query`` (separators are kept)."""
    return NEEDLE_RE.sub("", query)


def replace_needle(query, new_needle):
    """Replace each whole-word "needle" in ``query`` with ``new_needle``.

    ``new_needle`` is inserted literally: a callable replacement is used so
    that backslashes in it are not interpreted as regex template escapes or
    group references.
    """
    return NEEDLE_RE.sub(lambda _match: new_needle, query)


# Use with str.lower, str.upper, str.title and such.
def transform_needle(query, string_transformation_func):
    """Apply ``string_transformation_func`` to each whole-word "needle"."""

    def replace_with_transformation(match):
        return string_transformation_func(match.group())

    return NEEDLE_RE.sub(replace_with_transformation, query)


def create_cases(
    case_sensitive_func,
    case_insensitive_func,
    table_row_template,
    table_query_template,
    const_query_template,
):
    """Build the test cases for a case-sensitive and a case-insensitive function.

    Args:
        case_sensitive_func: name of the case-sensitive function under test.
        case_insensitive_func: name of the case-insensitive function under test.
        table_row_template: ``string.Template`` rendering one table row.
        table_query_template: ``string.Template`` rendering one table query.
        const_query_template: ``string.Template`` rendering one constant query.

    Every template is substituted with the keys ``func``, ``haystack``,
    ``needle`` and ``match`` (``match`` is 0 or 1).

    Returns:
        A tuple ``(table_rows, table_queries, const_queries)``: a list of
        rendered rows, a set of rendered table queries (duplicates collapse)
        and a list of rendered constant queries, in generation order.
    """
    const_queries = []
    table_rows = []
    table_queries = set()

    def add_case(func, haystack, needle, match):
        args = dict(func=func, haystack=haystack, needle=needle, match=int(match))
        const_queries.append(const_query_template.substitute(args))
        table_queries.add(table_query_template.substitute(args))
        table_rows.append(table_row_template.substitute(args))

    def add_case_sensitive(haystack, needle, match):
        add_case(case_sensitive_func, haystack, needle, match)
        if match:
            # Swapping the case on both sides must still match.
            add_case(
                case_sensitive_func,
                transform_needle(haystack, str.swapcase),
                transform_needle(needle, str.swapcase),
                match,
            )

    def add_case_insensitive(haystack, needle, match):
        add_case(case_insensitive_func, haystack, needle, match)
        if match:
            # Swapping the case on either side alone must still match.
            add_case(
                case_insensitive_func,
                transform_needle(haystack, str.swapcase),
                needle,
                match,
            )
            add_case(
                case_insensitive_func,
                haystack,
                transform_needle(needle, str.swapcase),
                match,
            )

    # Negative cases
    add_case_sensitive(remove_needle(HAYSTACKS[0]), NEEDLE, False)
    add_case_insensitive(remove_needle(HAYSTACKS[0]), NEEDLE, False)

    for haystack in HAYSTACKS:
        add_case_sensitive(transform_needle(haystack, str.swapcase), NEEDLE, False)
        # With an empty separator the words are glued together.
        sep = ""
        h = replace_separators(haystack, sep)

        add_case_sensitive(h, NEEDLE, False)
        add_case_insensitive(h, NEEDLE, False)

        add_case_sensitive(small_needle(h), small_needle(NEEDLE), False)
        add_case_insensitive(small_needle(h), small_needle(NEEDLE), False)

        add_case_sensitive(enlarge_haystack(h, 10, sep), NEEDLE, False)
        add_case_insensitive(enlarge_haystack(h, 10, sep), NEEDLE, False)

    # Positive cases
    for haystack in HAYSTACKS:
        add_case_sensitive(haystack, NEEDLE, True)
        add_case_insensitive(haystack, NEEDLE, True)

        for sep in """ ,""":
            h = replace_separators(haystack, sep)
            add_case_sensitive(h, NEEDLE, True)
            add_case_sensitive(small_needle(h), small_needle(NEEDLE), True)
            add_case_sensitive(enlarge_haystack(h, 200, sep), NEEDLE, True)

            add_case_insensitive(h, NEEDLE, True)
            add_case_insensitive(small_needle(h), small_needle(NEEDLE), True)
            add_case_insensitive(enlarge_haystack(h, 200, sep), NEEDLE, True)

            # Case insensitivity works only on ASCII strings
            add_case_sensitive(
                replace_needle(h, "иголка"), replace_needle(NEEDLE, "иголка"), True
            )
            add_case_sensitive(
                replace_needle(h, "指针"), replace_needle(NEEDLE, "指针"), True
            )

    # The last separator is four backslashes; replace_separators' template
    # processing halves that to two in the generated haystack.
    for sep in [*"""~!@$%^&*()-=+|]}[{";:/?.><\t""", r"\\\\"]:
        h = replace_separators(HAYSTACKS[0], sep)
        add_case(case_sensitive_func, h, NEEDLE, True)

    return table_rows, table_queries, const_queries


def main():
    """Print the generated SQL statements to standard output."""
    CONST_QUERY = Template(
        """SELECT ${func}('${haystack}', '${needle}'), ' expecting ', ${match};"""
    )
    TABLE_QUERY = Template(
        "WITH '${needle}' as n "
        "SELECT haystack, needle, ${func}(haystack, n) as result "
        "FROM ht "
        "WHERE func = '${func}' AND needle = n AND result != match;"
    )
    TABLE_ROW = Template("""('${haystack}', '${needle}', ${match}, '${func}')""")

    rows, table_queries, const_queries = create_cases(
        "hasToken", "hasTokenCaseInsensitive", TABLE_ROW, TABLE_QUERY, CONST_QUERY
    )

    for q in const_queries:
        print(q)

    print(
        (
            "DROP TABLE IF EXISTS ht; "
            "CREATE TABLE IF NOT EXISTS ht ( haystack String, needle String, match UInt8, func String ) "
            "ENGINE MergeTree() ORDER BY haystack; "
            "INSERT INTO ht VALUES {values};"
        ).format(values=", ".join(rows))
    )

    # The set is sorted so the output order is deterministic.
    for q in sorted(table_queries):
        print(q)

    print("""DROP TABLE ht""")


if __name__ == "__main__":
    main()