2019-02-19 20:51:44 +00:00
|
|
|
#!/usr/bin/env bash
|
|
|
|
|
|
|
|
#
|
|
|
|
# Load all possible .parquet files found in submodules.
|
|
|
|
# TODO: Add more files.
|
|
|
|
#
|
|
|
|
|
|
|
|
# Also 5 sample files from
|
|
|
|
# wget https://github.com/Teradata/kylo/raw/master/samples/sample-data/parquet/userdata1.parquet
|
|
|
|
# ...
|
|
|
|
# wget https://github.com/Teradata/kylo/raw/master/samples/sample-data/parquet/userdata5.parquet
|
|
|
|
|
|
|
|
|
|
|
|
# set -x
|
|
|
|
|
|
|
|
# Absolute directory that contains this script.
# CDPATH is cleared for the `cd` because, when CDPATH is set and used, `cd`
# prints the directory it entered; that extra line would end up inside the
# command substitution. `--` protects against a path starting with a dash.
CUR_DIR=$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)

# Shared test configuration. CLICKHOUSE_CLIENT_BINARY (read below) is not set
# anywhere in this script, so it is presumably defined there.
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh

# Work out ROOT_DIR from the location of the client binary:
#   - binary given without a directory part (dirname yields "."):
#     three levels above this script;
#   - otherwise, unless ROOT_DIR already has a value:
#     two levels above the binary's directory.
CB_DIR=$(dirname "$CLICKHOUSE_CLIENT_BINARY")
if [ "$CB_DIR" = "." ]; then
  ROOT_DIR=$CUR_DIR/../../..
elif [ -z "$ROOT_DIR" ]; then
  ROOT_DIR=$CB_DIR/../..
fi

# Directory with the .parquet samples and their cached .json/.columns files.
DATA_DIR=$CUR_DIR/data_parquet

# Default location of the parquet-reader tool; a PARQUET_READER value that is
# already set (e.g. from the environment) is left untouched.
if [ -n "$ROOT_DIR" ] && [ -z "$PARQUET_READER" ]; then
  PARQUET_READER="$ROOT_DIR"/contrib/arrow/cpp/build/release/parquet-reader
fi
|
|
|
|
|
2019-02-19 20:51:44 +00:00
|
|
|
# To update:
|
|
|
|
# cp $ROOT_DIR/contrib/arrow/cpp/submodules/parquet-testing/data/*.parquet $ROOT_DIR/contrib/arrow/python/pyarrow/tests/data/parquet/*.parquet $CUR_DIR/data_parquet/
|
|
|
|
|
2020-11-08 02:27:33 +00:00
|
|
|
# ClickHouse Parquet reader doesn't support such complex types, so I didn't burrow into the issue.
|
2020-11-07 13:25:57 +00:00
|
|
|
# There is a failure due to parsing nested arrays or nested maps with NULLs:
|
|
|
|
# ../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id())
|
2020-11-08 02:27:33 +00:00
|
|
|
|
2020-11-08 12:00:55 +00:00
|
|
|
# Strange behaviour for repeated_no_annotation.parquet around __builtin_expect, so this file was disabled:
|
2020-11-08 02:27:33 +00:00
|
|
|
# debug:
|
|
|
|
# ../contrib/arrow/cpp/src/arrow/array/array_nested.cc:193: Check failed: self->list_type_->value_type()->Equals(data->child_data[0]->type)
|
|
|
|
# release:
|
|
|
|
# Code: 349. DB::Ex---tion: Can not insert NULL data into non-nullable column "phoneNumbers": data for INSERT was parsed from stdin
|
2019-02-19 20:51:44 +00:00
|
|
|
|
2020-11-08 01:32:27 +00:00
|
|
|
#######################################
# Try to load every "$DATA_DIR"/*.parquet file into a scratch table
# (parquet_load) and print up to 100 rows of what was loaded.
# Files are processed in byte-wise (LC_ALL=C) order of their base names.
# Globals:
#   DATA_DIR           (read)  directory with the .parquet files and their
#                              cached <name>.json / <name>.columns companions
#   CUR_DIR            (read)  used to locate
#                              helpers/00900_parquet_create_table_columns.py
#   PARQUET_READER     (read)  optional path to the parquet-reader tool
#   CLICKHOUSE_CLIENT  (read)  client command; expanded unquoted on purpose so
#                              it may carry its own arguments
#   NAME, JSON, COLUMNS_FILE (written) kept global, as in a flat loop
# Outputs:
#   Per file: a "=== Try load data from <name>" line, then whatever the
#   INSERT (stderr included, "Exception" masked) and the SELECT print.
# Returns:
#   0 when there are no .parquet files; otherwise the status of the last
#   command executed.
#######################################
load_parquet_files() {
  local path base table_columns
  local -a unsorted=() sorted=()

  # Collect base names with a glob instead of word-splitting command output,
  # so names containing whitespace or glob characters stay intact.
  for path in "$DATA_DIR"/*.parquet; do
    # An unmatched glob stays literal; skip it (but keep dangling symlinks).
    [ -e "$path" ] || [ -L "$path" ] || continue
    unsorted+=("${path##*/}")
  done
  [ "${#unsorted[@]}" -gt 0 ] || return 0

  # Locale-independent ordering keeps the output reproducible.
  while IFS= read -r base; do
    sorted+=("$base")
  done < <(printf '%s\n' "${unsorted[@]}" | LC_ALL=C sort)

  for NAME in "${sorted[@]}"; do
    echo === Try load data from "$NAME"

    JSON=$DATA_DIR/$NAME.json
    COLUMNS_FILE=$DATA_DIR/$NAME.columns

    # To change or add a .parquet file:
    #   rm data_parquet/*.json data_parquet/*.columns
    # The JSON dump is only produced when neither cache file has content.
    if [ -n "$PARQUET_READER" ] && [ ! -s "$COLUMNS_FILE" ] && [ ! -s "$JSON" ]; then
      "$PARQUET_READER" --json "$DATA_DIR/$NAME" > "$JSON"
    fi
    # The column list is derived from the JSON dump and cached next to it.
    if [ ! -s "$COLUMNS_FILE" ]; then
      "$CUR_DIR"/helpers/00900_parquet_create_table_columns.py "$JSON" > "$COLUMNS_FILE"
    fi

    # Debug only:
    # [ -n "$PARQUET_READER" ] && "$PARQUET_READER" "$DATA_DIR/$NAME" > "$DATA_DIR/$NAME.dump"

    # Deliberately not stored in COLUMNS: bash maintains that variable itself
    # (terminal width) and may rewrite it after an external command finishes.
    table_columns=$(cat "$COLUMNS_FILE") || continue

    ${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_load"
    ${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_load ($table_columns) ENGINE = Memory"

    # Some files contain unsupported data structures, exception is ok.
    # The word "Exception" is rewritten in the output; presumably so that the
    # test harness does not flag these expected failures - confirm.
    ${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_load FORMAT Parquet" < "$DATA_DIR/$NAME" 2>&1 \
      | sed 's/Exception/Ex---tion/'

    ${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_load LIMIT 100"
    ${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_load"
  done
}

load_parquet_files
|