Implemented globs to select * from '<file>'

This commit is contained in:
Alexey Gerasimchuck 2023-08-31 04:20:44 +00:00
parent 9382dd90ee
commit 3a212217a3
10 changed files with 223 additions and 20 deletions

View File

@ -30,7 +30,7 @@ curl https://clickhouse.com/ | sh
The binary you just downloaded can run all sorts of ClickHouse tools and utilities. If you want to run ClickHouse as a database server, check out the [Quick Start](../../quick-start.mdx).
:::
## Query data in a CSV file using SQL
## Query data in a file using SQL {#query_data_in_file}
A common use of `clickhouse-local` is to run ad-hoc queries on files, without having to insert the data into a table. `clickhouse-local` can stream the data from a file into a temporary table and execute your SQL.
@ -57,6 +57,19 @@ The `file` table function creates a table, and you can use `DESCRIBE` to see the
./clickhouse local -q "DESCRIBE file('reviews.tsv')"
```
:::tip
You can use globs in the file name (see [glob substitutions](/docs/en/sql-reference/table-functions/file.md/#globs-in-path)).
Examples:
```bash
./clickhouse local -q "SELECT * FROM 'reviews*.jsonl'"
./clickhouse local -q "SELECT * FROM 'review_?.csv'"
./clickhouse local -q "SELECT * FROM 'review_{1..3}.csv'"
```
:::
```response
marketplace Nullable(String)
customer_id Nullable(Int64)

View File

@ -110,3 +110,42 @@ Read 186 rows, 4.15 KiB in 0.035 sec., 5302 rows/sec., 118.34 KiB/sec.
├──────────┼──────────┤
...
```
## Запрос данных в файле с помощью SQL {#query_data_in_file}
Часто `clickhouse-local` используется для выполнения специальных запросов к файлам, когда не нужно вставлять данные в таблицу. `clickhouse-local` может транслировать данные из файла во временную таблицу и выполнить ваш SQL.
Если файл находится на той же машине, что и `clickhouse-local`, то можно просто указать файл для загрузки. Следующий файл `reviews.tsv` содержит выборку отзывов о товарах Amazon:
```bash
./clickhouse local -q "SELECT * FROM 'reviews.tsv'"
```
Эта команда является сокращением команды:
```bash
./clickhouse local -q "SELECT * FROM file('reviews.tsv')"
```
ClickHouse знает, что файл использует формат, разделенный табуляцией, из расширения имени файла. Если необходимо явно указать формат, просто добавьте один из [множества входных форматов ClickHouse](../../interfaces/formats.md):
```bash
./clickhouse local -q "SELECT * FROM file('reviews.tsv', 'TabSeparated')"
```
Табличная функция `file` создает таблицу, и вы можете использовать `DESCRIBE` для просмотра предполагаемой схемы:
```bash
./clickhouse local -q "DESCRIBE file('reviews.tsv')"
```
:::tip
В имени файла разрешается использовать [шаблоны поиска](/docs/ru/sql-reference/table-functions/file.md/#globs-in-path).
Примеры:
```bash
./clickhouse local -q "SELECT * FROM 'reviews*.jsonl'"
./clickhouse local -q "SELECT * FROM 'review_?.csv'"
./clickhouse local -q "SELECT * FROM 'review_{1..3}.csv'"
```
:::

View File

@ -318,3 +318,8 @@ inline void trim(std::string & str, char c = ' ')
trimRight(str, c);
trimLeft(str, c);
}
/// Returns true if `str` contains at least one of the characters `*`, `?` or `{`,
/// i.e. the characters that introduce a glob substitution in a file path.
/// This is a plain character search: escaping is not taken into account.
///
/// The argument is a std::string_view so that std::string, string literals and views
/// are all accepted without constructing a temporary std::string, and so that the
/// function is genuinely usable in constant expressions.
constexpr bool containsGlobs(std::string_view str)
{
    return str.find_first_of("*?{") != std::string_view::npos;
}

View File

@ -81,22 +81,24 @@ bool DatabaseFilesystem::checkTableFilePath(const std::string & table_path, Cont
throw Exception(ErrorCodes::PATH_ACCESS_DENIED, "File is not inside {}", user_files_path);
}
/// Check if the corresponding file exists.
if (!fs::exists(table_path))
if (!containsGlobs(table_path))
{
if (throw_on_error)
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "File does not exist: {}", table_path);
else
return false;
}
/// Check if the corresponding file exists.
if (!fs::exists(table_path))
{
if (throw_on_error)
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "File does not exist: {}", table_path);
else
return false;
}
if (!fs::is_regular_file(table_path))
{
if (throw_on_error)
throw Exception(ErrorCodes::FILE_DOESNT_EXIST,
"File is directory, but expected a file: {}", table_path);
else
return false;
if (!fs::is_regular_file(table_path))
{
if (throw_on_error)
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "File is directory, but expected a file: {}", table_path);
else
return false;
}
}
return true;
@ -141,19 +143,18 @@ StoragePtr DatabaseFilesystem::getTableImpl(const String & name, ContextPtr cont
if (!checkTableFilePath(table_path, context_, throw_on_error))
return {};
String format = FormatFactory::instance().getFormatFromFileName(table_path, throw_on_error);
auto format = FormatFactory::instance().getFormatFromFileName(table_path, throw_on_error);
if (format.empty())
return {};
/// If the file exists, create a new table using TableFunctionFile and return it.
auto args = makeASTFunction("file", std::make_shared<ASTLiteral>(table_path), std::make_shared<ASTLiteral>(format));
auto ast_function_ptr = makeASTFunction("file", std::make_shared<ASTLiteral>(table_path), std::make_shared<ASTLiteral>(format));
auto table_function = TableFunctionFactory::instance().get(args, context_);
auto table_function = TableFunctionFactory::instance().get(ast_function_ptr, context_);
if (!table_function)
return nullptr;
/// TableFunctionFile throws exceptions, if table cannot be created.
auto table_storage = table_function->execute(args, context_, name);
auto table_storage = table_function->execute(ast_function_ptr, context_, name);
if (table_storage)
addTable(name, table_storage);

View File

@ -5,5 +5,11 @@ implicit:
4
Test 2: check Filesystem database
4
30
10
4
3
2
1
Test 3: check show database with Filesystem
test02707

View File

@ -15,6 +15,23 @@ echo '2,"def",456,"bacabaa"' >> $dir/tmp.csv
echo '3,"story",78912,"acabaab"' >> $dir/tmp.csv
echo '4,"history",21321321,"cabaaba"' >> $dir/tmp.csv
$CLICKHOUSE_LOCAL -q "insert into function file('$dir/tmp_numbers_1.csv') select * from numbers(1, 10)"
$CLICKHOUSE_LOCAL -q "insert into function file('$dir/tmp_numbers_2.csv') select * from numbers(11, 10)"
$CLICKHOUSE_LOCAL -q "insert into function file('$dir/tmp_numbers_30.csv') select * from numbers(21, 10)"
readonly nested_dir=$dir/nested
[[ -d $nested_dir ]] && rm -rd $nested_dir
mkdir $nested_dir
mkdir $nested_dir/subnested
cp ${dir}/tmp_numbers_1.csv ${nested_dir}/nested_tmp_numbers_1.csv
cp ${dir}/tmp_numbers_1.csv ${nested_dir}/subnested/subnested_tmp_numbers_1.csv
readonly other_nested_dir=$dir/other_nested
[[ -d $other_nested_dir ]] && rm -rd $other_nested_dir
mkdir $other_nested_dir
cp ${dir}/tmp_numbers_1.csv ${other_nested_dir}/tmp_numbers_1.csv
#################
echo "Test 1: check explicit and implicit call of the file table function"
@ -29,6 +46,12 @@ $CLICKHOUSE_LOCAL --multiline --multiquery -q """
DROP DATABASE IF EXISTS test;
CREATE DATABASE test ENGINE = Filesystem('${dir}');
SELECT COUNT(*) FROM test.\`tmp.csv\`;
SELECT COUNT(*) FROM test.\`tmp_numbers_*.csv\`;
SELECT COUNT(*) FROM test.\`nested/nested_tmp_numbers_1*.csv\`;
SELECT count(DISTINCT _path) FROM test.\`*.csv\`;
SELECT count(DISTINCT _path) FROM test.\`**/*.csv\`;
SELECT count(DISTINCT _path) FROM test.\`**/*.csv\` WHERE position(_path, '${nested_dir}') > 0;
SELECT count(DISTINCT _path) FROM test.\`**/*.csv\` WHERE position(_path, '${nested_dir}') = 0;
DROP DATABASE test;
"""

View File

@ -3,6 +3,14 @@ Test 1: create filesystem database and check implicit calls
test1
4
4
30
10
10
4
0
2
0
OK
4
Test 2: check DatabaseFilesystem access rights and errors handling on server
OK
@ -13,3 +21,6 @@ OK
OK
OK
OK
OK
OK
OK

View File

@ -19,11 +19,17 @@ echo '3,"story",78912,"acabaab"' >> ${user_files_tmp_dir}/tmp.csv
echo '4,"history",21321321,"cabaaba"' >> ${user_files_tmp_dir}/tmp.csv
tmp_dir=${CLICKHOUSE_TEST_UNIQUE_NAME}
$CLICKHOUSE_LOCAL -q "insert into function file('$user_files_tmp_dir/tmp_numbers_1.csv') select * from numbers(1, 10)"
$CLICKHOUSE_LOCAL -q "insert into function file('$user_files_tmp_dir/tmp_numbers_2.csv') select * from numbers(11, 10)"
$CLICKHOUSE_LOCAL -q "insert into function file('$user_files_tmp_dir/tmp_numbers_30.csv') select * from numbers(21, 10)"
[[ -d $tmp_dir ]] && rm -rd $tmp_dir
mkdir $tmp_dir
cp ${user_files_tmp_dir}/tmp.csv ${tmp_dir}/tmp.csv
cp ${user_files_tmp_dir}/tmp.csv ${user_files_tmp_dir}/tmp/tmp.csv
cp ${user_files_tmp_dir}/tmp.csv ${user_files_tmp_dir}/tmp.myext
cp ${user_files_tmp_dir}/tmp_numbers_1.csv ${user_files_tmp_dir}/tmp/tmp_numbers_1.csv
#################
echo "Test 1: create filesystem database and check implicit calls"
@ -35,6 +41,15 @@ echo $?
${CLICKHOUSE_CLIENT} --query "SHOW DATABASES" | grep "test1"
${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`${unique_name}/tmp.csv\`;"
${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`${unique_name}/tmp/tmp.csv\`;"
${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`${unique_name}/tmp_numbers_*.csv\`;"
${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`${unique_name}/tmp/*tmp_numbers_*.csv\`;"
${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`${unique_name}/*/*tmp_numbers_*.csv\`;"
${CLICKHOUSE_CLIENT} --query "SELECT count(DISTINCT _path) FROM test1.\`${unique_name}/*.csv\` WHERE startsWith(_path, '${user_files_tmp_dir}')";
${CLICKHOUSE_CLIENT} --query "SELECT count(DISTINCT _path) FROM test1.\`${unique_name}/*.csv\` WHERE not startsWith(_path, '${user_files_tmp_dir}')";
# **/* does not search in the current directory but searches recursively in nested directories.
${CLICKHOUSE_CLIENT} --query "SELECT count(DISTINCT _path) FROM test1.\`${unique_name}/**/*.csv\` WHERE startsWith(_path, '${user_files_tmp_dir}')";
${CLICKHOUSE_CLIENT} --query "SELECT count(DISTINCT _path) FROM test1.\`${unique_name}/**/*.csv\` WHERE not startsWith(_path, '${user_files_tmp_dir}')";
${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`tmp_numbers_*.csv\`;" 2>&1 | tr '\n' ' ' | grep -oF "CANNOT_EXTRACT_TABLE_STRUCTURE" > /dev/null && echo "OK" || echo 'FAIL' ||:
${CLICKHOUSE_LOCAL} -q "SELECT COUNT(*) FROM \"${tmp_dir}/tmp.csv\""
#################
@ -42,6 +57,9 @@ echo "Test 2: check DatabaseFilesystem access rights and errors handling on serv
# PATH_ACCESS_DENIED: listing files is allowed only inside user_files
${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`../tmp.csv\`;" 2>&1 | tr '\n' ' ' | grep -oF "PATH_ACCESS_DENIED" > /dev/null && echo "OK" || echo 'FAIL' ||:
${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`/tmp/tmp.csv\`;" 2>&1 | tr '\n' ' ' | grep -oF "PATH_ACCESS_DENIED" > /dev/null && echo "OK" || echo 'FAIL' ||:
${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`../*/tmp_numbers_*.csv\`;" 2>&1 | tr '\n' ' ' | grep -oF "PATH_ACCESS_DENIED" > /dev/null && echo "OK" || echo 'FAIL' ||:
${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`../tmp_numbers_*.csv\`;" 2>&1 | tr '\n' ' ' | grep -oF "PATH_ACCESS_DENIED" > /dev/null && echo "OK" || echo 'FAIL' ||:
${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`../*.csv\`;" 2>&1 | tr '\n' ' ' | grep -oF "PATH_ACCESS_DENIED" > /dev/null && echo "OK" || echo 'FAIL' ||:
${CLICKHOUSE_CLIENT} --multiline --multiquery --query """
USE test1;
SELECT COUNT(*) FROM \"../${tmp_dir}/tmp.csv\";

View File

@ -25,3 +25,37 @@ Test 3a: check literal no parsing overflow
1
Test 3b: check literal empty
1
Test 4: select using * wildcard
30
30
30
30
30
10
30
10
Test 4b: select using ? wildcard
20
10
20
10
20
Test 4c: select using '{' + '}' wildcards
20
20
1
Test 4d: select using ? and * wildcards
30
30
30
1
30
30
Test 4e: select using ?, * and '{' + '}' wildcards
10
20
20
20
Test 4f: recursive search
2
1

View File

@ -7,6 +7,8 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
dir=${CLICKHOUSE_TEST_UNIQUE_NAME}
[[ -d $dir ]] && rm -rd $dir
mkdir $dir
mkdir $dir/nested
mkdir $dir/nested/nested
# Create temporary csv file for tests
echo '"id","str","int","text"' > $dir/tmp.csv
@ -15,6 +17,14 @@ echo '2,"def",456,"bacabaa"' >> $dir/tmp.csv
echo '3,"story",78912,"acabaab"' >> $dir/tmp.csv
echo '4,"history",21321321,"cabaaba"' >> $dir/tmp.csv
$CLICKHOUSE_LOCAL -q "insert into function file('$dir/tmp_numbers_1.jsonl') select * from numbers(1, 10)"
$CLICKHOUSE_LOCAL -q "insert into function file('$dir/tmp_numbers_2.jsonl') select * from numbers(11, 10)"
$CLICKHOUSE_LOCAL -q "insert into function file('$dir/tmp_numbers_30.jsonl') select * from numbers(21, 10)"
$CLICKHOUSE_LOCAL -q "insert into function file('$dir/nested/nested_numbers.jsonl') select * from numbers(1)"
$CLICKHOUSE_LOCAL -q "insert into function file('$dir/nested/nested/nested_nested_numbers.jsonl') select * from numbers(1)"
#################
echo "Test 1: check double quotes"
@ -52,5 +62,48 @@ echo "Test 3b: check literal empty"
$CLICKHOUSE_LOCAL -q "SELECT * FROM ''" 2>&1 | grep -c "SYNTAX_ERROR"
echo "Test 4: select using * wildcard"
# Extension is required for auto table structure detection
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/tmp_numbers_*.jsonl'"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/**.jsonl'"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/**********************.jsonl'"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/*_numbers_*.jsonl'"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/*_nu*ers_*.jsonl'"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/*_nu*ers_2.jsonl'"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/*tmp_numbers_*.jsonl'"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/*tmp_numbers_1*.jsonl'"
echo "Test 4b: select using ? wildcard"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/tmp_numbers_?.jsonl'"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/tmp_numbers_??.jsonl'"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/??p_numbers??.jsonl'"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/tmp_n?mbers_1.jsonl'"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/t?p_n?mbers_?.jsonl'"
echo "Test 4c: select using '{' + '}' wildcards"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/tmp_numbers_{1..3}.jsonl'"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/tmp_numbers_{1,2}.jsonl'"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/tmp_numbers__{1,2}.jsonl'" 2>&1 | grep -c "CANNOT_EXTRACT_TABLE_STRUCTURE"
echo "Test 4d: select using ? and * wildcards"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/*?.jsonl'"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/?*.jsonl'"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/?*????.jsonl'"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/?*?***_.jsonl'" 2>&1 | grep -c "CANNOT_EXTRACT_TABLE_STRUCTURE"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/?*????_*.jsonl'"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/*?*_num*e?s_*.jsonl'"
echo "Test 4e: select using ?, * and '{' + '}' wildcards"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/*?{1,3}.jsonl'"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/*?{1..3}.jsonl'"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/*?*_num*e?s_{1..3}.jsonl'"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/*?*_num*e?s_{1,2}.jsonl'"
echo "Test 4f: recursive search"
# /**/* pattern does not look in current directory
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/**/*.jsonl'"
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/nested/**/*.jsonl'"
# Remove temporary dir with files
rm -rd $dir