-- ClickHouse/tests/queries/0_stateless/02346_full_text_search.sql

-- Inverted (full-text) indexes are gated behind an experimental setting, and query logging must be on
-- because the checks in this file read read_rows / result_rows back from system.query_log.
SET allow_experimental_inverted_index = 1, log_queries = 1;
----------------------------------------------------
SELECT 'Test inverted(2)';

DROP TABLE IF EXISTS tab;

-- index_granularity = 2 makes every granule hold 2 rows, so read_rows in system.query_log
-- can be compared directly against the number of granules a query had to read
CREATE TABLE tab(k UInt64, s String, INDEX af(s) TYPE inverted(2))
            ENGINE = MergeTree() ORDER BY k
            SETTINGS index_granularity = 2, index_granularity_bytes = '10Mi';

INSERT INTO tab VALUES (101, 'Alick a01'), (102, 'Blick a02'), (103, 'Click a03'), (104, 'Dlick a04'), (105, 'Elick a05'), (106, 'Alick a06'), (107, 'Blick a07'), (108, 'Click a08'), (109, 'Dlick a09'), (110, 'Elick a10'), (111, 'Alick b01'), (112, 'Blick b02'), (113, 'Click b03'), (114, 'Dlick b04'), (115, 'Elick b05'), (116, 'Alick b06'), (117, 'Blick b07'), (118, 'Click b08'), (119, 'Dlick b09'), (120, 'Elick b10');

-- check inverted index was created
SELECT name, type FROM system.data_skipping_indices WHERE table == 'tab' AND database = currentDatabase() LIMIT 1;

-- throw in a random consistency check
CHECK TABLE tab;

-- search inverted index with ==
SELECT * FROM tab WHERE s == 'Alick a01';

-- check the query only read 1 granule (2 rows total; each granule has 2 rows)
-- (the endsWith() pattern has to repeat the statement above verbatim, including the trailing ';')
SYSTEM FLUSH LOGS;
SELECT read_rows == 2 FROM system.query_log
    WHERE query_kind = 'Select'
        AND current_database = currentDatabase()
        AND endsWith(trimRight(query), 'SELECT * FROM tab WHERE s == \'Alick a01\';')
        AND type = 'QueryFinish'
        AND result_rows == 1
    LIMIT 1;

-- search inverted index with LIKE
SELECT * FROM tab WHERE s LIKE '%01%' ORDER BY k;

-- check the query only read 2 granules (4 rows total; each granule has 2 rows)
SYSTEM FLUSH LOGS;
SELECT read_rows == 4 FROM system.query_log
    WHERE query_kind = 'Select'
        AND current_database = currentDatabase()
        AND endsWith(trimRight(query), 'SELECT * FROM tab WHERE s LIKE \'%01%\' ORDER BY k;')
        AND type = 'QueryFinish'
        AND result_rows == 2
    LIMIT 1;

-- search inverted index with hasToken
SELECT * FROM tab WHERE hasToken(s, 'Click') ORDER BY k;

-- check the query only read 4 granules (8 rows total; each granule has 2 rows)
SYSTEM FLUSH LOGS;
SELECT read_rows == 8 FROM system.query_log
    WHERE query_kind = 'Select'
        AND current_database = currentDatabase()
        AND endsWith(trimRight(query), 'SELECT * FROM tab WHERE hasToken(s, \'Click\') ORDER BY k;')
        AND type = 'QueryFinish'
        AND result_rows == 4
    LIMIT 1;

----------------------------------------------------
SELECT 'Test inverted()';

DROP TABLE IF EXISTS tab_x;

-- same data as the inverted(2) test, but the index is declared without parameters
CREATE TABLE tab_x(k UInt64, s String, INDEX af(s) TYPE inverted())
            ENGINE = MergeTree() ORDER BY k
            SETTINGS index_granularity = 2, index_granularity_bytes = '10Mi';

INSERT INTO tab_x VALUES (101, 'Alick a01'), (102, 'Blick a02'), (103, 'Click a03'), (104, 'Dlick a04'), (105, 'Elick a05'), (106, 'Alick a06'), (107, 'Blick a07'), (108, 'Click a08'), (109, 'Dlick a09'), (110, 'Elick a10'), (111, 'Alick b01'), (112, 'Blick b02'), (113, 'Click b03'), (114, 'Dlick b04'), (115, 'Elick b05'), (116, 'Alick b06'), (117, 'Blick b07'), (118, 'Click b08'), (119, 'Dlick b09'), (120, 'Elick b10');

-- check inverted index was created
SELECT name, type FROM system.data_skipping_indices WHERE table == 'tab_x' AND database = currentDatabase() LIMIT 1;

-- search inverted index with hasToken
SELECT * FROM tab_x WHERE hasToken(s, 'Alick') ORDER BY k;

-- check the query only read 4 granules (8 rows total; each granule has 2 rows)
-- The pattern must repeat the statement above verbatim; it previously lacked the ORDER BY clause,
-- so it matched no logged query and this check produced no output at all.
-- NOTE(review): with the corrected pattern this check emits a row; make sure the .reference file expects it.
SYSTEM FLUSH LOGS;
SELECT read_rows == 8 FROM system.query_log
    WHERE query_kind = 'Select'
        AND current_database = currentDatabase()
        AND endsWith(trimRight(query), 'SELECT * FROM tab_x WHERE hasToken(s, \'Alick\') ORDER BY k;')
        AND type = 'QueryFinish'
        AND result_rows == 4
    LIMIT 1;

-- search inverted index with IN operator
SELECT * FROM tab_x WHERE s IN ('Alick a01', 'Alick a06') ORDER BY k;

-- check the query only read 2 granules (4 rows total; each granule has 2 rows)
SYSTEM FLUSH LOGS;
SELECT read_rows == 4 FROM system.query_log
    WHERE query_kind = 'Select'
        AND current_database = currentDatabase()
        AND endsWith(trimRight(query), 'SELECT * FROM tab_x WHERE s IN (\'Alick a01\', \'Alick a06\') ORDER BY k;')
        AND type = 'QueryFinish'
        AND result_rows == 2
    LIMIT 1;

-- search inverted index with multiSearch
SELECT * FROM tab_x WHERE multiSearchAny(s, ['a01', 'b01']) ORDER BY k;

-- check the query only read 2 granules (4 rows total; each granule has 2 rows)
SYSTEM FLUSH LOGS;
SELECT read_rows == 4 FROM system.query_log
    WHERE query_kind = 'Select'
        AND current_database = currentDatabase()
        AND endsWith(trimRight(query), 'SELECT * FROM tab_x WHERE multiSearchAny(s, [\'a01\', \'b01\']) ORDER BY k;')
        AND type = 'QueryFinish'
        AND result_rows == 2
    LIMIT 1;

----------------------------------------------------
SELECT 'Test on array columns';

DROP TABLE IF EXISTS tab;

CREATE TABLE tab (k UInt64, s Array(String), INDEX af(s) TYPE inverted(2))
            ENGINE = MergeTree() ORDER BY k
            SETTINGS index_granularity = 2, index_granularity_bytes = '10Mi';

-- reuse the strings of tab_x (created by the 'Test inverted()' section), collected into one array per k % 10 group
INSERT INTO tab SELECT rowNumberInBlock(), groupArray(s) FROM tab_x GROUP BY k%10;

-- check inverted index was created
SELECT name, type FROM system.data_skipping_indices WHERE table == 'tab' AND database = currentDatabase() LIMIT 1;

-- search inverted index with has
SELECT * FROM tab WHERE has(s, 'Click a03') ORDER BY k;

-- check the query only read 1 granule (2 rows total; each granule has 2 rows)
SYSTEM FLUSH LOGS;
SELECT read_rows == 2 FROM system.query_log
    WHERE query_kind = 'Select'
        AND current_database = currentDatabase()
        AND endsWith(trimRight(query), 'SELECT * FROM tab WHERE has(s, \'Click a03\') ORDER BY k;')
        AND type = 'QueryFinish'
        AND result_rows == 1
    LIMIT 1;

----------------------------------------------------
SELECT 'Test on map columns';

DROP TABLE IF EXISTS tab;

-- the index is built over the map keys only (mapKeys(s)), not over the values
CREATE TABLE tab (k UInt64, s Map(String,String), INDEX af(mapKeys(s)) TYPE inverted(2))
            ENGINE = MergeTree() ORDER BY k
            SETTINGS index_granularity = 2, index_granularity_bytes = '10Mi';

INSERT INTO tab VALUES (101, {'Alick':'Alick a01'}), (102, {'Blick':'Blick a02'}), (103, {'Click':'Click a03'}), (104, {'Dlick':'Dlick a04'}), (105, {'Elick':'Elick a05'}), (106, {'Alick':'Alick a06'}), (107, {'Blick':'Blick a07'}), (108, {'Click':'Click a08'}), (109, {'Dlick':'Dlick a09'}), (110, {'Elick':'Elick a10'}), (111, {'Alick':'Alick b01'}), (112, {'Blick':'Blick b02'}), (113, {'Click':'Click b03'}), (114, {'Dlick':'Dlick b04'}), (115, {'Elick':'Elick b05'}), (116, {'Alick':'Alick b06'}), (117, {'Blick':'Blick b07'}), (118, {'Click':'Click b08'}), (119, {'Dlick':'Dlick b09'}), (120, {'Elick':'Elick b10'});

-- check inverted index was created
SELECT name, type FROM system.data_skipping_indices WHERE table == 'tab' AND database = currentDatabase() LIMIT 1;

-- search inverted index with mapContains
SELECT * FROM tab WHERE mapContains(s, 'Click') ORDER BY k;

-- check the query only read 4 granules (8 rows total; each granule has 2 rows)
SYSTEM FLUSH LOGS;
SELECT read_rows == 8 FROM system.query_log
    WHERE query_kind = 'Select'
        AND current_database = currentDatabase()
        AND endsWith(trimRight(query), 'SELECT * FROM tab WHERE mapContains(s, \'Click\') ORDER BY k;')
        AND type = 'QueryFinish'
        AND result_rows == 4
    LIMIT 1;

-- search inverted index with map key
SELECT * FROM tab WHERE s['Click'] = 'Click a03';

-- check the query only read 4 granules (8 rows total; each granule has 2 rows);
-- the index holds keys only, so every granule containing the key 'Click' is read although just one row matches the value
SYSTEM FLUSH LOGS;
SELECT read_rows == 8 FROM system.query_log
    WHERE query_kind = 'Select'
        AND current_database = currentDatabase()
        AND endsWith(trimRight(query), 'SELECT * FROM tab WHERE s[\'Click\'] = \'Click a03\';')
        AND type = 'QueryFinish'
        AND result_rows == 1
    LIMIT 1;

----------------------------------------------------
SELECT 'Test inverted(2) on a column with two parts';

DROP TABLE IF EXISTS tab;

CREATE TABLE tab(k UInt64, s String, INDEX af(s) TYPE inverted(2))
            ENGINE = MergeTree() ORDER BY k
            SETTINGS index_granularity = 2, index_granularity_bytes = '10Mi';

-- two separate INSERTs; going by the section title, each one is meant to end up in its own data part
INSERT INTO tab VALUES (101, 'Alick a01'), (102, 'Blick a02'), (103, 'Click a03'), (104, 'Dlick a04'), (105, 'Elick a05'), (106, 'Alick a06'), (107, 'Blick a07'), (108, 'Click a08'), (109, 'Dlick a09'), (110, 'Elick b10'), (111, 'Alick b01'), (112, 'Blick b02'), (113, 'Click b03'), (114, 'Dlick b04'), (115, 'Elick b05'), (116, 'Alick b06'), (117, 'Blick b07'), (118, 'Click b08'), (119, 'Dlick b09'), (120, 'Elick b10');
INSERT INTO tab VALUES (201, 'rick c01'), (202, 'mick c02'), (203, 'nick c03');

-- check inverted index was created
SELECT name, type FROM system.data_skipping_indices WHERE table == 'tab' AND database = currentDatabase() LIMIT 1;

-- search inverted index
SELECT * FROM tab WHERE s LIKE '%01%' ORDER BY k;

-- check the query only read 3 granules (6 rows total; each granule has 2 rows)
-- (the same statement text is also logged by the 'Test inverted(2)' section; result_rows == 3 tells the two apart)
SYSTEM FLUSH LOGS;
SELECT read_rows == 6 FROM system.query_log
    WHERE query_kind = 'Select'
        AND current_database = currentDatabase()
        AND endsWith(trimRight(query), 'SELECT * FROM tab WHERE s LIKE \'%01%\' ORDER BY k;')
        AND type = 'QueryFinish'
        AND result_rows == 3
    LIMIT 1;

----------------------------------------------------
SELECT 'Test inverted(2) on UTF-8 data';

DROP TABLE IF EXISTS tab;

CREATE TABLE tab(k UInt64, s String, INDEX af(s) TYPE inverted(2))
            ENGINE = MergeTree()
            ORDER BY k
            SETTINGS index_granularity = 2, index_granularity_bytes = '10Mi';

INSERT INTO tab VALUES (101, 'Alick 好'), (102, 'clickhouse你好'), (103, 'Click 你'), (104, 'Dlick 你a好'), (105, 'Elick 好好你你'), (106, 'Alick 好a好a你a你');

-- check inverted index was created
SELECT name, type FROM system.data_skipping_indices WHERE table == 'tab' AND database = currentDatabase() LIMIT 1;

-- search inverted index
SELECT * FROM tab WHERE s LIKE '%你好%' ORDER BY k;

-- check the query only read 1 granule (2 rows total; each granule has 2 rows)
-- The pattern must repeat the statement above verbatim; it previously read LIKE '%%' (the non-ASCII
-- needle was missing), so it matched no logged query and this check produced no output.
-- NOTE(review): with the corrected pattern this check emits a row; make sure the .reference file expects it.
SYSTEM FLUSH LOGS;
SELECT read_rows == 2 FROM system.query_log
    WHERE query_kind = 'Select'
        AND current_database = currentDatabase()
        AND endsWith(trimRight(query), 'SELECT * FROM tab WHERE s LIKE \'%你好%\' ORDER BY k;')
        AND type = 'QueryFinish'
        AND result_rows == 1
    LIMIT 1;

----------------------------------------------------
SELECT 'AST Fuzzer crash, issue #54541';

DROP TABLE IF EXISTS tab;

CREATE TABLE tab (row_id UInt32, str String, INDEX idx str TYPE inverted) ENGINE = MergeTree ORDER BY row_id;
INSERT INTO tab VALUES (0, 'a');

-- regression check: an indexed string comparison AND-ed with a floating-point constant;
-- judging by the section title, the statement only has to finish without crashing the server
SELECT * FROM tab WHERE str == 'b' AND 1.0;

-- Tests with parameter max_digestion_size_per_segment are flaky in CI, not clear why --> comment out for the time being:
-- ----------------------------------------------------
-- SELECT 'Test max_digestion_size_per_segment';
--
-- DROP TABLE IF EXISTS tab;
--
-- CREATE TABLE tab(k UInt64, s String, INDEX af(s) TYPE inverted(0))
-- Engine=MergeTree
-- ORDER BY (k)
-- SETTINGS max_digestion_size_per_segment = 1024, index_granularity = 256
-- AS
-- SELECT
-- number,
-- format('{},{},{},{}', hex(12345678), hex(87654321), hex(number/17 + 5), hex(13579012)) as s
-- FROM numbers(10240);
--
-- -- check inverted index was created
-- SELECT name, type FROM system.data_skipping_indices WHERE table == 'tab' AND database = currentDatabase() LIMIT 1;
--
-- -- search inverted index
-- SELECT s FROM tab WHERE hasToken(s, '6969696969898240');
--
-- -- check the query only read 1 granule (256 rows total; each granule has 256 rows)
-- SYSTEM FLUSH LOGS;
-- SELECT read_rows==256 from system.query_log
-- WHERE query_kind ='Select'
-- AND current_database = currentDatabase()
-- AND endsWith(trimRight(query), 'SELECT s FROM tab WHERE hasToken(s, \'6969696969898240\');')
-- AND type='QueryFinish'
-- AND result_rows==1
-- LIMIT 1;
--
----------------------------------------------------
SELECT 'Test max_rows_per_postings_list';

DROP TABLE IF EXISTS tab;

-- create table 'tab' with inverted index parameters (ngrams, max_rows_per_postings_list) which are (0, 12040)
CREATE TABLE tab(k UInt64, s String, INDEX af(s) TYPE inverted(0, 12040))
            Engine=MergeTree
            ORDER BY (k)
            AS
                SELECT
                    number,
                    format('{},{},{},{}', hex(12345678), hex(87654321), hex(number/17 + 5), hex(13579012)) as s
                FROM numbers(1024);

-- the table has to be created successfully and stay searchable through the index
SELECT count(s) FROM tab WHERE hasToken(s, '4C4B4B4B4B4B5040');

DROP TABLE IF EXISTS tab;

-- create table 'tab' with inverted index parameters (ngrams, max_rows_per_postings_list) which are (3, 123)
-- it should throw an exception since max_rows_per_postings_list (123) is less than its minimum value
-- (stated as 8196 in the original note -- NOTE(review): 8192 = 8 * 1024 seems more likely, confirm against the server code)
CREATE TABLE tab(k UInt64, s String, INDEX af(s) TYPE inverted(3, 123))
            Engine=MergeTree
            ORDER BY (k)
            AS
                SELECT
                    number,
                    format('{},{},{},{}', hex(12345678), hex(87654321), hex(number/17 + 5), hex(13579012)) as s
                FROM numbers(1024); -- { serverError 80 }