<!-- ClickHouse/tests/performance/re2_regex_caching.xml -->

<!-- Simple benchmark to verify the caching of compiled re2 patterns (e.g. LIKE/MATCH) -->

<test>
    <substitutions>
        <!-- Row source used by every query in this test: 1.5 million rows, read through the `number` column. -->
        <substitution>
            <name>numbers</name>
            <values>
                <value>numbers_mt(1500000)</value>
            </values>
        </substitution>
        <!-- Per-row (non-constant) LIKE patterns, substituted into the like() query as {needle_like}. -->
        <substitution>
            <name>needle_like</name>
            <values>
                <!-- simple patterns, all unique (the pattern embeds the row's own number) -->
                <value>'%' || toString(number) || '_'</value>
                <!-- simple patterns, low distinctness (10 patterns, one per value of number % 10) -->
                <value>'%' || toString(number % 10) || '_'</value>
            </values>
        </substitution>
        <!-- Per-row (non-constant) regular expressions, substituted into the match() query as {needle_match}. -->
        <substitution>
            <name>needle_match</name>
            <values>
                <!-- simple patterns, all unique (the pattern embeds the row's own number) -->
                <value>'.*' || toString(number) || '.'</value>
                <!-- simple patterns, low distinctness (10 patterns, one per value of number % 10) -->
                <value>'.*' || toString(number % 10) || '.'</value>
                <!-- complex patterns, all unique: left commented out because it is very slow (from 2 to 15 seconds) -->
                <!-- <value>'([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?([^ @]+)@([^ @]+)([0-9][0-9]?)/([0-9][0-9]?)/([0-9][0-9]([0-9][0-9])?)(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])' || toString(number)</value> -->
                <!-- complex patterns, low distinctness (10 patterns, one per value of number % 10) -->
                <value>'([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?([^ @]+)@([^ @]+)([0-9][0-9]?)/([0-9][0-9]?)/([0-9][0-9]([0-9][0-9])?)(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])' || toString(number % 10)</value>
                <!-- Note: for this benchmark, we are only interested in compilation time, not in correctness, evaluation
                     time or the result. The complex pattern is therefore a maximally expensive to compile "pattern from
                     hell": the concatenation of a URI regex, an email regex, a date regex and an IPv4 regex, followed by
                     the number (% 10). For the sub-patterns, see http://lh3lh3.users.sourceforge.net/reb.shtml and
                     https://github.com/mariomka/regex-benchmark -->
            </values>
        </substitution>
    </substitutions>

    <!-- const needle (just for reference) -->

    <!-- Reference query: like() with the constant pattern '%x_', applied to the string form of every number. -->
    <query>
        select toString(number) as haystack, like(haystack, '%x_')
        from(select * from {numbers})
        format Null
    </query>

    <!-- Reference query: match() with the constant pattern '.*x.', applied to the string form of every number. -->
    <query>
        select toString(number) as haystack, match(haystack, '.*x.')
        from(select * from {numbers})
        format Null
    </query>

    <!-- non-const needle -->

    <!-- like() where the pattern is the per-row expression substituted for {needle_like}. -->
    <query>
        select toString(number) as haystack, {needle_like} as needle, like(haystack, needle)
        from (select * from {numbers})
        format Null
    </query>

    <!-- match() where the pattern is the per-row expression substituted for {needle_match}. -->
    <query>
        select toString(number) as haystack, {needle_match} as needle, match(haystack, needle)
        from (select * from {numbers})
        format Null
    </query>

</test>