mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 00:30:49 +00:00
Implemented globs to select * from '<file>'
This commit is contained in:
parent
9382dd90ee
commit
3a212217a3
@ -30,7 +30,7 @@ curl https://clickhouse.com/ | sh
|
||||
The binary you just downloaded can run all sorts of ClickHouse tools and utilities. If you want to run ClickHouse as a database server, check out the [Quick Start](../../quick-start.mdx).
|
||||
:::
|
||||
|
||||
## Query data in a CSV file using SQL
|
||||
## Query data in a file using SQL {#query_data_in_file}
|
||||
|
||||
A common use of `clickhouse-local` is to run ad-hoc queries on files without having to insert the data into a table. `clickhouse-local` can stream the data from a file into a temporary table and execute your SQL.
|
||||
|
||||
@ -57,6 +57,19 @@ The `file` table function creates a table, and you can use `DESCRIBE` to see the
|
||||
./clickhouse local -q "DESCRIBE file('reviews.tsv')"
|
||||
```
|
||||
|
||||
:::tip
|
||||
You can use globs in the file name (see [glob substitutions](/docs/en/sql-reference/table-functions/file.md/#globs-in-path)).
|
||||
|
||||
Examples:
|
||||
|
||||
```bash
|
||||
./clickhouse local -q "SELECT * FROM 'reviews*.jsonl'"
|
||||
./clickhouse local -q "SELECT * FROM 'review_?.csv'"
|
||||
./clickhouse local -q "SELECT * FROM 'review_{1..3}.csv'"
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
```response
|
||||
marketplace Nullable(String)
|
||||
customer_id Nullable(Int64)
|
||||
|
@ -110,3 +110,42 @@ Read 186 rows, 4.15 KiB in 0.035 sec., 5302 rows/sec., 118.34 KiB/sec.
|
||||
├──────────┼──────────┤
|
||||
...
|
||||
```
|
||||
|
||||
## Запрос данных в файле с помощью SQL {#query_data_in_file}
|
||||
|
||||
Часто `clickhouse-local` используется для выполнения произвольных (ad-hoc) запросов к файлам, когда не нужно вставлять данные в таблицу. `clickhouse-local` может потоково передать данные из файла во временную таблицу и выполнить ваш SQL.
|
||||
|
||||
Если файл находится на той же машине, что и `clickhouse-local`, то можно просто указать файл для загрузки. Следующий файл `reviews.tsv` содержит выборку отзывов о товарах Amazon:
|
||||
|
||||
```bash
|
||||
./clickhouse local -q "SELECT * FROM 'reviews.tsv'"
|
||||
```
|
||||
|
||||
Эта команда является сокращением команды:
|
||||
|
||||
```bash
|
||||
./clickhouse local -q "SELECT * FROM file('reviews.tsv')"
|
||||
```
|
||||
|
||||
ClickHouse определяет по расширению имени файла, что данные в нём разделены табуляцией. Если необходимо явно указать формат, просто добавьте один из [множества входных форматов ClickHouse](../../interfaces/formats.md):
|
||||
|
||||
```bash
|
||||
./clickhouse local -q "SELECT * FROM file('reviews.tsv', 'TabSeparated')"
|
||||
```
|
||||
|
||||
Табличная функция `file` создает таблицу, и вы можете использовать `DESCRIBE` для просмотра предполагаемой схемы:
|
||||
|
||||
```bash
|
||||
./clickhouse local -q "DESCRIBE file('reviews.tsv')"
|
||||
```
|
||||
|
||||
:::tip
|
||||
В имени файла разрешается использовать [шаблоны поиска](/docs/ru/sql-reference/table-functions/file.md/#globs-in-path).
|
||||
|
||||
Примеры:
|
||||
|
||||
```bash
|
||||
./clickhouse local -q "SELECT * FROM 'reviews*.jsonl'"
|
||||
./clickhouse local -q "SELECT * FROM 'review_?.csv'"
|
||||
./clickhouse local -q "SELECT * FROM 'review_{1..3}.csv'"
|
||||
```
|
||||
|
@ -318,3 +318,8 @@ inline void trim(std::string & str, char c = ' ')
|
||||
trimRight(str, c);
|
||||
trimLeft(str, c);
|
||||
}
|
||||
|
||||
/// Returns true if `str` contains at least one character that starts a glob
/// pattern: '*', '?' or '{'. Only the presence of these characters is checked;
/// the pattern itself is not validated (e.g. an unmatched '{' still counts).
///
/// Declared `inline` rather than `constexpr`: std::string::find_first_of is not
/// usable in constant expressions before C++20, so a `constexpr` declaration
/// could never be constant-evaluated under older standards.
inline bool containsGlobs(const std::string & str)
{
    return str.find_first_of("*?{") != std::string::npos;
}
|
||||
|
@ -81,6 +81,8 @@ bool DatabaseFilesystem::checkTableFilePath(const std::string & table_path, Cont
|
||||
throw Exception(ErrorCodes::PATH_ACCESS_DENIED, "File is not inside {}", user_files_path);
|
||||
}
|
||||
|
||||
if (!containsGlobs(table_path))
|
||||
{
|
||||
/// Check if the corresponding file exists.
|
||||
if (!fs::exists(table_path))
|
||||
{
|
||||
@ -93,11 +95,11 @@ bool DatabaseFilesystem::checkTableFilePath(const std::string & table_path, Cont
|
||||
if (!fs::is_regular_file(table_path))
|
||||
{
|
||||
if (throw_on_error)
|
||||
throw Exception(ErrorCodes::FILE_DOESNT_EXIST,
|
||||
"File is directory, but expected a file: {}", table_path);
|
||||
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "File is directory, but expected a file: {}", table_path);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -141,19 +143,18 @@ StoragePtr DatabaseFilesystem::getTableImpl(const String & name, ContextPtr cont
|
||||
if (!checkTableFilePath(table_path, context_, throw_on_error))
|
||||
return {};
|
||||
|
||||
String format = FormatFactory::instance().getFormatFromFileName(table_path, throw_on_error);
|
||||
auto format = FormatFactory::instance().getFormatFromFileName(table_path, throw_on_error);
|
||||
if (format.empty())
|
||||
return {};
|
||||
|
||||
/// If the file exists, create a new table using TableFunctionFile and return it.
|
||||
auto args = makeASTFunction("file", std::make_shared<ASTLiteral>(table_path), std::make_shared<ASTLiteral>(format));
|
||||
auto ast_function_ptr = makeASTFunction("file", std::make_shared<ASTLiteral>(table_path), std::make_shared<ASTLiteral>(format));
|
||||
|
||||
auto table_function = TableFunctionFactory::instance().get(args, context_);
|
||||
auto table_function = TableFunctionFactory::instance().get(ast_function_ptr, context_);
|
||||
if (!table_function)
|
||||
return nullptr;
|
||||
|
||||
/// TableFunctionFile throws exceptions, if table cannot be created.
|
||||
auto table_storage = table_function->execute(args, context_, name);
|
||||
auto table_storage = table_function->execute(ast_function_ptr, context_, name);
|
||||
if (table_storage)
|
||||
addTable(name, table_storage);
|
||||
|
||||
|
@ -5,5 +5,11 @@ implicit:
|
||||
4
|
||||
Test 2: check Filesystem database
|
||||
4
|
||||
30
|
||||
10
|
||||
4
|
||||
3
|
||||
2
|
||||
1
|
||||
Test 3: check show database with Filesystem
|
||||
test02707
|
||||
|
@ -15,6 +15,23 @@ echo '2,"def",456,"bacabaa"' >> $dir/tmp.csv
|
||||
echo '3,"story",78912,"acabaab"' >> $dir/tmp.csv
|
||||
echo '4,"history",21321321,"cabaaba"' >> $dir/tmp.csv
|
||||
|
||||
$CLICKHOUSE_LOCAL -q "insert into function file('$dir/tmp_numbers_1.csv') select * from numbers(1, 10)"
|
||||
$CLICKHOUSE_LOCAL -q "insert into function file('$dir/tmp_numbers_2.csv') select * from numbers(11, 10)"
|
||||
$CLICKHOUSE_LOCAL -q "insert into function file('$dir/tmp_numbers_30.csv') select * from numbers(21, 10)"
|
||||
|
||||
readonly nested_dir=$dir/nested
|
||||
[[ -d $nested_dir ]] && rm -rd $nested_dir
|
||||
mkdir $nested_dir
|
||||
mkdir $nested_dir/subnested
|
||||
|
||||
cp ${dir}/tmp_numbers_1.csv ${nested_dir}/nested_tmp_numbers_1.csv
|
||||
cp ${dir}/tmp_numbers_1.csv ${nested_dir}/subnested/subnested_tmp_numbers_1.csv
|
||||
|
||||
readonly other_nested_dir=$dir/other_nested
|
||||
[[ -d $other_nested_dir ]] && rm -rd $other_nested_dir
|
||||
mkdir $other_nested_dir
|
||||
cp ${dir}/tmp_numbers_1.csv ${other_nested_dir}/tmp_numbers_1.csv
|
||||
|
||||
#################
|
||||
echo "Test 1: check explicit and implicit call of the file table function"
|
||||
|
||||
@ -29,6 +46,12 @@ $CLICKHOUSE_LOCAL --multiline --multiquery -q """
|
||||
DROP DATABASE IF EXISTS test;
|
||||
CREATE DATABASE test ENGINE = Filesystem('${dir}');
|
||||
SELECT COUNT(*) FROM test.\`tmp.csv\`;
|
||||
SELECT COUNT(*) FROM test.\`tmp_numbers_*.csv\`;
|
||||
SELECT COUNT(*) FROM test.\`nested/nested_tmp_numbers_1*.csv\`;
|
||||
SELECT count(DISTINCT _path) FROM test.\`*.csv\`;
|
||||
SELECT count(DISTINCT _path) FROM test.\`**/*.csv\`;
|
||||
SELECT count(DISTINCT _path) FROM test.\`**/*.csv\` WHERE position(_path, '${nested_dir}') > 0;
|
||||
SELECT count(DISTINCT _path) FROM test.\`**/*.csv\` WHERE position(_path, '${nested_dir}') = 0;
|
||||
DROP DATABASE test;
|
||||
"""
|
||||
|
||||
|
@ -3,6 +3,14 @@ Test 1: create filesystem database and check implicit calls
|
||||
test1
|
||||
4
|
||||
4
|
||||
30
|
||||
10
|
||||
10
|
||||
4
|
||||
0
|
||||
2
|
||||
0
|
||||
OK
|
||||
4
|
||||
Test 2: check DatabaseFilesystem access rights and errors handling on server
|
||||
OK
|
||||
@ -13,3 +21,6 @@ OK
|
||||
OK
|
||||
OK
|
||||
OK
|
||||
OK
|
||||
OK
|
||||
OK
|
||||
|
@ -19,11 +19,17 @@ echo '3,"story",78912,"acabaab"' >> ${user_files_tmp_dir}/tmp.csv
|
||||
echo '4,"history",21321321,"cabaaba"' >> ${user_files_tmp_dir}/tmp.csv
|
||||
|
||||
tmp_dir=${CLICKHOUSE_TEST_UNIQUE_NAME}
|
||||
|
||||
$CLICKHOUSE_LOCAL -q "insert into function file('$user_files_tmp_dir/tmp_numbers_1.csv') select * from numbers(1, 10)"
|
||||
$CLICKHOUSE_LOCAL -q "insert into function file('$user_files_tmp_dir/tmp_numbers_2.csv') select * from numbers(11, 10)"
|
||||
$CLICKHOUSE_LOCAL -q "insert into function file('$user_files_tmp_dir/tmp_numbers_30.csv') select * from numbers(21, 10)"
|
||||
|
||||
[[ -d $tmp_dir ]] && rm -rd $tmp_dir
|
||||
mkdir $tmp_dir
|
||||
cp ${user_files_tmp_dir}/tmp.csv ${tmp_dir}/tmp.csv
|
||||
cp ${user_files_tmp_dir}/tmp.csv ${user_files_tmp_dir}/tmp/tmp.csv
|
||||
cp ${user_files_tmp_dir}/tmp.csv ${user_files_tmp_dir}/tmp.myext
|
||||
cp ${user_files_tmp_dir}/tmp_numbers_1.csv ${user_files_tmp_dir}/tmp/tmp_numbers_1.csv
|
||||
|
||||
#################
|
||||
echo "Test 1: create filesystem database and check implicit calls"
|
||||
@ -35,6 +41,15 @@ echo $?
|
||||
${CLICKHOUSE_CLIENT} --query "SHOW DATABASES" | grep "test1"
|
||||
${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`${unique_name}/tmp.csv\`;"
|
||||
${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`${unique_name}/tmp/tmp.csv\`;"
|
||||
${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`${unique_name}/tmp_numbers_*.csv\`;"
|
||||
${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`${unique_name}/tmp/*tmp_numbers_*.csv\`;"
|
||||
${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`${unique_name}/*/*tmp_numbers_*.csv\`;"
|
||||
${CLICKHOUSE_CLIENT} --query "SELECT count(DISTINCT _path) FROM test1.\`${unique_name}/*.csv\` WHERE startsWith(_path, '${user_files_tmp_dir}')";
|
||||
${CLICKHOUSE_CLIENT} --query "SELECT count(DISTINCT _path) FROM test1.\`${unique_name}/*.csv\` WHERE not startsWith(_path, '${user_files_tmp_dir}')";
|
||||
# **/* does not search in the current directory but searches recursively in nested directories.
|
||||
${CLICKHOUSE_CLIENT} --query "SELECT count(DISTINCT _path) FROM test1.\`${unique_name}/**/*.csv\` WHERE startsWith(_path, '${user_files_tmp_dir}')";
|
||||
${CLICKHOUSE_CLIENT} --query "SELECT count(DISTINCT _path) FROM test1.\`${unique_name}/**/*.csv\` WHERE not startsWith(_path, '${user_files_tmp_dir}')";
|
||||
${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`tmp_numbers_*.csv\`;" 2>&1 | tr '\n' ' ' | grep -oF "CANNOT_EXTRACT_TABLE_STRUCTURE" > /dev/null && echo "OK" || echo 'FAIL' ||:
|
||||
${CLICKHOUSE_LOCAL} -q "SELECT COUNT(*) FROM \"${tmp_dir}/tmp.csv\""
|
||||
|
||||
#################
|
||||
@ -42,6 +57,9 @@ echo "Test 2: check DatabaseFilesystem access rights and errors handling on serv
|
||||
# PATH_ACCESS_DENIED: listing files is allowed only inside user_files
|
||||
${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`../tmp.csv\`;" 2>&1 | tr '\n' ' ' | grep -oF "PATH_ACCESS_DENIED" > /dev/null && echo "OK" || echo 'FAIL' ||:
|
||||
${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`/tmp/tmp.csv\`;" 2>&1 | tr '\n' ' ' | grep -oF "PATH_ACCESS_DENIED" > /dev/null && echo "OK" || echo 'FAIL' ||:
|
||||
${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`../*/tmp_numbers_*.csv\`;" 2>&1 | tr '\n' ' ' | grep -oF "PATH_ACCESS_DENIED" > /dev/null && echo "OK" || echo 'FAIL' ||:
|
||||
${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`../tmp_numbers_*.csv\`;" 2>&1 | tr '\n' ' ' | grep -oF "PATH_ACCESS_DENIED" > /dev/null && echo "OK" || echo 'FAIL' ||:
|
||||
${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`../*.csv\`;" 2>&1 | tr '\n' ' ' | grep -oF "PATH_ACCESS_DENIED" > /dev/null && echo "OK" || echo 'FAIL' ||:
|
||||
${CLICKHOUSE_CLIENT} --multiline --multiquery --query """
|
||||
USE test1;
|
||||
SELECT COUNT(*) FROM \"../${tmp_dir}/tmp.csv\";
|
||||
|
@ -25,3 +25,37 @@ Test 3a: check literal no parsing overflow
|
||||
1
|
||||
Test 3b: check literal empty
|
||||
1
|
||||
Test 4: select using * wildcard
|
||||
30
|
||||
30
|
||||
30
|
||||
30
|
||||
30
|
||||
10
|
||||
30
|
||||
10
|
||||
Test 4b: select using ? wildcard
|
||||
20
|
||||
10
|
||||
20
|
||||
10
|
||||
20
|
||||
Test 4c: select using '{' + '}' wildcards
|
||||
20
|
||||
20
|
||||
1
|
||||
Test 4d: select using ? and * wildcards
|
||||
30
|
||||
30
|
||||
30
|
||||
1
|
||||
30
|
||||
30
|
||||
Test 4e: select using ?, * and '{' + '}' wildcards
|
||||
10
|
||||
20
|
||||
20
|
||||
20
|
||||
Test 4f: recursive search
|
||||
2
|
||||
1
|
||||
|
@ -7,6 +7,8 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
dir=${CLICKHOUSE_TEST_UNIQUE_NAME}
|
||||
[[ -d $dir ]] && rm -rd $dir
|
||||
mkdir $dir
|
||||
mkdir $dir/nested
|
||||
mkdir $dir/nested/nested
|
||||
|
||||
# Create temporary csv file for tests
|
||||
echo '"id","str","int","text"' > $dir/tmp.csv
|
||||
@ -15,6 +17,14 @@ echo '2,"def",456,"bacabaa"' >> $dir/tmp.csv
|
||||
echo '3,"story",78912,"acabaab"' >> $dir/tmp.csv
|
||||
echo '4,"history",21321321,"cabaaba"' >> $dir/tmp.csv
|
||||
|
||||
$CLICKHOUSE_LOCAL -q "insert into function file('$dir/tmp_numbers_1.jsonl') select * from numbers(1, 10)"
|
||||
$CLICKHOUSE_LOCAL -q "insert into function file('$dir/tmp_numbers_2.jsonl') select * from numbers(11, 10)"
|
||||
|
||||
$CLICKHOUSE_LOCAL -q "insert into function file('$dir/tmp_numbers_30.jsonl') select * from numbers(21, 10)"
|
||||
|
||||
$CLICKHOUSE_LOCAL -q "insert into function file('$dir/nested/nested_numbers.jsonl') select * from numbers(1)"
|
||||
$CLICKHOUSE_LOCAL -q "insert into function file('$dir/nested/nested/nested_nested_numbers.jsonl') select * from numbers(1)"
|
||||
|
||||
#################
|
||||
echo "Test 1: check double quotes"
|
||||
|
||||
@ -52,5 +62,48 @@ echo "Test 3b: check literal empty"
|
||||
|
||||
$CLICKHOUSE_LOCAL -q "SELECT * FROM ''" 2>&1 | grep -c "SYNTAX_ERROR"
|
||||
|
||||
echo "Test 4: select using * wildcard"
|
||||
# Extension is required for auto table structure detection
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/tmp_numbers_*.jsonl'"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/**.jsonl'"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/**********************.jsonl'"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/*_numbers_*.jsonl'"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/*_nu*ers_*.jsonl'"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/*_nu*ers_2.jsonl'"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/*tmp_numbers_*.jsonl'"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/*tmp_numbers_1*.jsonl'"
|
||||
|
||||
echo "Test 4b: select using ? wildcard"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/tmp_numbers_?.jsonl'"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/tmp_numbers_??.jsonl'"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/??p_numbers??.jsonl'"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/tmp_n?mbers_1.jsonl'"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/t?p_n?mbers_?.jsonl'"
|
||||
|
||||
echo "Test 4c: select using '{' + '}' wildcards"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/tmp_numbers_{1..3}.jsonl'"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/tmp_numbers_{1,2}.jsonl'"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/tmp_numbers__{1,2}.jsonl'" 2>&1 | grep -c "CANNOT_EXTRACT_TABLE_STRUCTURE"
|
||||
|
||||
echo "Test 4d: select using ? and * wildcards"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/*?.jsonl'"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/?*.jsonl'"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/?*????.jsonl'"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/?*?***_.jsonl'" 2>&1 | grep -c "CANNOT_EXTRACT_TABLE_STRUCTURE"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/?*????_*.jsonl'"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/*?*_num*e?s_*.jsonl'"
|
||||
|
||||
echo "Test 4e: select using ?, * and '{' + '}' wildcards"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/*?{1,3}.jsonl'"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/*?{1..3}.jsonl'"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/*?*_num*e?s_{1..3}.jsonl'"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/*?*_num*e?s_{1,2}.jsonl'"
|
||||
|
||||
echo "Test 4f: recursive search"
|
||||
# /**/* pattern does not look in current directory
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/**/*.jsonl'"
|
||||
$CLICKHOUSE_LOCAL -q "SELECT count(*) FROM '$dir/nested/**/*.jsonl'"
|
||||
|
||||
|
||||
# Remove temporary dir with files
|
||||
rm -rd $dir
|
||||
|
Loading…
Reference in New Issue
Block a user