mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-11 08:52:06 +00:00
59 lines
2.5 KiB
Bash
59 lines
2.5 KiB
Bash
|
#!/usr/bin/env bash

#
# Load all possible .parquet files found in submodules.
# TODO: Add more files.
#

# To regenerate the data, install the Perl JSON::XS module: sudo apt install libjson-xs-perl

# Also 5 sample files from
# wget https://github.com/Teradata/kylo/raw/master/samples/sample-data/parquet/userdata1.parquet
# ...
# wget https://github.com/Teradata/kylo/raw/master/samples/sample-data/parquet/userdata5.parquet


# set -x

# Absolute directory of this script; the shared shell configuration and the
# data_parquet fixtures are both located relative to it.
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# Quoted so that a checkout path containing whitespace is still sourced correctly.
# NOTE(review): shell_config.sh presumably defines CLICKHOUSE_CLIENT and
# CLICKHOUSE_CLIENT_BINARY used below - confirm.
# shellcheck source=/dev/null
. "$CUR_DIR"/../shell_config.sh

# Work out where the source tree (ROOT_DIR) and the build tree (BUILD_DIR) are,
# based on the directory part of the client binary path:
#   - no directory part ("."): ROOT_DIR is taken relative to this script and
#     BUILD_DIR is left untouched;
#   - otherwise: BUILD_DIR is taken relative to the binary, and ROOT_DIR falls
#     back to a path relative to the binary unless it is already set.
CB_DIR=$(dirname "$CLICKHOUSE_CLIENT_BINARY")
if [ "$CB_DIR" == "." ]; then
    ROOT_DIR=$CUR_DIR/../../../..
else
    BUILD_DIR=$CB_DIR/../..
fi
if [ -z "$ROOT_DIR" ]; then
    ROOT_DIR=$CB_DIR/../../..
fi

# Directory holding the .parquet inputs and their generated .json/.columns files.
DATA_DIR=$CUR_DIR/data_parquet

# To update:
# cp $ROOT_DIR/contrib/arrow/cpp/submodules/parquet-testing/data/*.parquet $ROOT_DIR/contrib/arrow/python/pyarrow/tests/data/parquet/*.parquet $CUR_DIR/data_parquet/

# BUG! nulls.snappy.parquet - parquet-reader shows the wrong structure. The actual structure is {"type":"struct","fields":[{"name":"b_struct","type":{"type":"struct","fields":[{"name":"b_c_int","type":"integer","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]}
# why? repeated_no_annotation.parquet

# For every .parquet file in $DATA_DIR: create test.parquet_load with the
# columns recorded in <file>.columns, try to insert the file in Parquet format,
# print up to 100 rows and drop the table again.
#
# The files are enumerated with a glob instead of parsing `ls` output, so names
# are never word-split; glob results are already sorted in the current locale.
for PARQUET_PATH in "$DATA_DIR"/*.parquet; do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "$PARQUET_PATH" ] || continue
    NAME=$(basename "$PARQUET_PATH")

    echo "=== Try load data from $NAME"

    JSON=$DATA_DIR/$NAME.json
    COLUMNS_FILE=$DATA_DIR/$NAME.columns

    # If you want to change or add a .parquet file - rm data_parquet/*.json data_parquet/*.columns
    # The .json and .columns files are only regenerated when a build directory
    # is known and the .columns file is missing or empty.
    if [ -n "$BUILD_DIR" ] && [ ! -s "$COLUMNS_FILE" ]; then
        # Reuse an existing non-empty .json; otherwise produce it with parquet-reader.
        [ -s "$JSON" ] || "$BUILD_DIR"/contrib/arrow-cmake/parquet-reader --json "$DATA_DIR/$NAME" > "$JSON"
        "$CUR_DIR"/00900_parquet_create_table_columns.pl "$JSON" > "$COLUMNS_FILE"
    fi

    # Debug only:
    # [ -n "$BUILD_DIR" ] && $BUILD_DIR/contrib/arrow-cmake/parquet-reader $DATA_DIR/$NAME > $DATA_DIR/$NAME.dump

    #COLUMNS=`$CUR_DIR/00900_parquet_create_table_columns.pl $JSON` 2>&1 || continue
    # Deliberately not named COLUMNS: Bash may overwrite COLUMNS with the
    # terminal width after an external command (checkwinsize), which would
    # corrupt the CREATE TABLE statement below.
    TABLE_COLUMNS=$(cat "$COLUMNS_FILE") || continue

    # CLICKHOUSE_CLIENT is intentionally left unquoted.
    # NOTE(review): presumably it can hold a command plus arguments - confirm in shell_config.sh.
    ${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.parquet_load"
    ${CLICKHOUSE_CLIENT} --query="CREATE TABLE test.parquet_load ($TABLE_COLUMNS) ENGINE = Memory"

    # Some files are broken, so an exception here is OK.
    # The client's stderr is merged into stdout and the first "Exception" on each line is rewritten.
    # NOTE(review): presumably so the expected error text is not flagged by the test runner - confirm.
    cat "$DATA_DIR/$NAME" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO test.parquet_load FORMAT Parquet" 2>&1 | sed 's/Exception/Ex---tion/'

    ${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_load LIMIT 100"
    ${CLICKHOUSE_CLIENT} --query="DROP TABLE test.parquet_load"
done