Merge remote-tracking branch 'blessed/master' into parallel_replicas_cte_fix

Raúl Marín 2024-02-01 12:04:46 +01:00
commit 18386f7fb9
31 changed files with 652 additions and 150 deletions

View File

@ -16,29 +16,30 @@ message(STATUS "Checking Rust toolchain for current target")
# See https://doc.rust-lang.org/nightly/rustc/platform-support.html
if(CMAKE_TOOLCHAIN_FILE MATCHES "ppc64le")
set(Rust_CARGO_TARGET "powerpc64le-unknown-linux-gnu")
elseif((CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-x86_64") AND (CMAKE_TOOLCHAIN_FILE MATCHES "musl"))
set(Rust_CARGO_TARGET "x86_64-unknown-linux-musl")
elseif(CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-x86_64")
set(Rust_CARGO_TARGET "x86_64-unknown-linux-gnu")
elseif((CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-aarch64") AND (CMAKE_TOOLCHAIN_FILE MATCHES "musl"))
set(Rust_CARGO_TARGET "aarch64-unknown-linux-musl")
elseif(CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-aarch64")
set(Rust_CARGO_TARGET "aarch64-unknown-linux-gnu")
elseif((CMAKE_TOOLCHAIN_FILE MATCHES "darwin") AND (CMAKE_TOOLCHAIN_FILE MATCHES "x86_64"))
set(Rust_CARGO_TARGET "x86_64-apple-darwin")
elseif((CMAKE_TOOLCHAIN_FILE MATCHES "darwin") AND (CMAKE_TOOLCHAIN_FILE MATCHES "aarch64"))
set(Rust_CARGO_TARGET "aarch64-apple-darwin")
elseif((CMAKE_TOOLCHAIN_FILE MATCHES "freebsd") AND (CMAKE_TOOLCHAIN_FILE MATCHES "x86_64"))
set(Rust_CARGO_TARGET "x86_64-unknown-freebsd")
elseif(CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-riscv64")
set(Rust_CARGO_TARGET "riscv64gc-unknown-linux-gnu")
else()
message(FATAL_ERROR "Unsupported rust target")
endif()
message(STATUS "Switched Rust target to ${Rust_CARGO_TARGET}")
if(DEFINED CMAKE_TOOLCHAIN_FILE)
if(CMAKE_TOOLCHAIN_FILE MATCHES "ppc64le")
set(Rust_CARGO_TARGET "powerpc64le-unknown-linux-gnu")
elseif((CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-x86_64") AND (CMAKE_TOOLCHAIN_FILE MATCHES "musl"))
set(Rust_CARGO_TARGET "x86_64-unknown-linux-musl")
elseif(CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-x86_64")
set(Rust_CARGO_TARGET "x86_64-unknown-linux-gnu")
elseif((CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-aarch64") AND (CMAKE_TOOLCHAIN_FILE MATCHES "musl"))
set(Rust_CARGO_TARGET "aarch64-unknown-linux-musl")
elseif(CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-aarch64")
set(Rust_CARGO_TARGET "aarch64-unknown-linux-gnu")
elseif((CMAKE_TOOLCHAIN_FILE MATCHES "darwin") AND (CMAKE_TOOLCHAIN_FILE MATCHES "x86_64"))
set(Rust_CARGO_TARGET "x86_64-apple-darwin")
elseif((CMAKE_TOOLCHAIN_FILE MATCHES "darwin") AND (CMAKE_TOOLCHAIN_FILE MATCHES "aarch64"))
set(Rust_CARGO_TARGET "aarch64-apple-darwin")
elseif((CMAKE_TOOLCHAIN_FILE MATCHES "freebsd") AND (CMAKE_TOOLCHAIN_FILE MATCHES "x86_64"))
set(Rust_CARGO_TARGET "x86_64-unknown-freebsd")
elseif(CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-riscv64")
set(Rust_CARGO_TARGET "riscv64gc-unknown-linux-gnu")
else()
message(FATAL_ERROR "Unsupported rust target")
endif()
message(STATUS "Switched Rust target to ${Rust_CARGO_TARGET}")
endif ()
# FindRust.cmake
list(APPEND CMAKE_MODULE_PATH "${ClickHouse_SOURCE_DIR}/contrib/corrosion/cmake")

contrib/curl vendored

@ -1 +1 @@
Subproject commit d755a5f7c009dd63a61b2c745180d8ba937cbfeb
Subproject commit 7161cb17c01dcff1dc5bf89a18437d9d729f1ecd

contrib/libxml2 vendored

@ -1 +1 @@
Subproject commit 223cb03a5d27b1b2393b266a8657443d046139d6
Subproject commit 8292f361458fcffe0bff515a385be02e9d35582c

View File

@ -21,7 +21,7 @@ extern "C" {
* your library and includes mismatch
*/
#ifndef LIBXML2_COMPILING_MSCCDEF
XMLPUBFUN void XMLCALL xmlCheckVersion(int version);
XMLPUBFUN void xmlCheckVersion(int version);
#endif /* LIBXML2_COMPILING_MSCCDEF */
/**
@ -29,28 +29,28 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version);
*
* the version string like "1.2.3"
*/
#define LIBXML_DOTTED_VERSION "2.10.3"
#define LIBXML_DOTTED_VERSION "2.12.4"
/**
* LIBXML_VERSION:
*
* the version number: 1.2.3 value is 10203
*/
#define LIBXML_VERSION 21003
#define LIBXML_VERSION 21204
/**
* LIBXML_VERSION_STRING:
*
* the version number string, 1.2.3 value is "10203"
*/
#define LIBXML_VERSION_STRING "21003"
#define LIBXML_VERSION_STRING "21204"
/**
* LIBXML_VERSION_EXTRA:
*
* extra version information, used to show a git commit description
*/
#define LIBXML_VERSION_EXTRA ""
#define LIBXML_VERSION_EXTRA "-GITv2.12.4"
/**
* LIBXML_TEST_VERSION:
@ -58,7 +58,7 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version);
* Macro to check that the libxml version in use is compatible with
* the version the software has been compiled against
*/
#define LIBXML_TEST_VERSION xmlCheckVersion(21003);
#define LIBXML_TEST_VERSION xmlCheckVersion(21204);
#ifndef VMS
#if 0
@ -270,7 +270,7 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version);
*
* Whether iconv support is available
*/
#if 0
#if 1
#define LIBXML_ICONV_ENABLED
#endif
@ -313,7 +313,7 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version);
/**
* LIBXML_DEBUG_RUNTIME:
*
* Whether the runtime debugging is configured in
* Removed
*/
#if 0
#define LIBXML_DEBUG_RUNTIME
@ -409,12 +409,7 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version);
#endif
#ifdef __GNUC__
/**
* ATTRIBUTE_UNUSED:
*
* Macro used to signal to GCC unused function parameters
*/
/** DOC_DISABLE */
#ifndef ATTRIBUTE_UNUSED
# if ((__GNUC__ > 2) || ((__GNUC__ == 2) && (__GNUC_MINOR__ >= 7)))
@ -424,12 +419,6 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version);
# endif
#endif
/**
* LIBXML_ATTR_ALLOC_SIZE:
*
* Macro used to indicate to GCC this is an allocator function
*/
#ifndef LIBXML_ATTR_ALLOC_SIZE
# if (!defined(__clang__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 3))))
# define LIBXML_ATTR_ALLOC_SIZE(x) __attribute__((alloc_size(x)))
@ -440,12 +429,6 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version);
# define LIBXML_ATTR_ALLOC_SIZE(x)
#endif
/**
* LIBXML_ATTR_FORMAT:
*
* Macro used to indicate to GCC the parameter are printf like
*/
#ifndef LIBXML_ATTR_FORMAT
# if ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)))
# define LIBXML_ATTR_FORMAT(fmt,args) __attribute__((__format__(__printf__,fmt,args)))
@ -457,44 +440,69 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version);
#endif
#ifndef XML_DEPRECATED
# ifdef IN_LIBXML
# if defined (IN_LIBXML) || (__GNUC__ * 100 + __GNUC_MINOR__ < 301)
# define XML_DEPRECATED
# else
/* Available since at least GCC 3.1 */
# else
# define XML_DEPRECATED __attribute__((deprecated))
# endif
#endif
#if defined(__clang__) || (__GNUC__ * 100 + __GNUC_MINOR__ >= 406)
#if defined(__clang__) || (__GNUC__ * 100 + __GNUC_MINOR__ >= 800)
#define XML_IGNORE_FPTR_CAST_WARNINGS \
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wpedantic\"") \
_Pragma("GCC diagnostic ignored \"-Wcast-function-type\"")
#else
#define XML_IGNORE_FPTR_CAST_WARNINGS \
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wpedantic\"")
#endif
#define XML_POP_WARNINGS \
_Pragma("GCC diagnostic pop")
#else
#define XML_IGNORE_FPTR_CAST_WARNINGS
#define XML_POP_WARNINGS
#endif
#else /* ! __GNUC__ */
/**
* ATTRIBUTE_UNUSED:
*
* Macro used to signal to GCC unused function parameters
*/
#define ATTRIBUTE_UNUSED
/**
* LIBXML_ATTR_ALLOC_SIZE:
*
* Macro used to indicate to GCC this is an allocator function
*/
#define LIBXML_ATTR_ALLOC_SIZE(x)
/**
* LIBXML_ATTR_FORMAT:
*
* Macro used to indicate to GCC the parameter are printf like
*/
#define LIBXML_ATTR_FORMAT(fmt,args)
/**
* XML_DEPRECATED:
*
* Macro used to indicate that a function, variable, type or struct member
* is deprecated.
*/
#ifndef XML_DEPRECATED
#define XML_DEPRECATED
# if defined (IN_LIBXML) || !defined (_MSC_VER)
# define XML_DEPRECATED
/* Available since Visual Studio 2005 */
# elif defined (_MSC_VER) && (_MSC_VER >= 1400)
# define XML_DEPRECATED __declspec(deprecated)
# endif
#endif
#if defined (_MSC_VER) && (_MSC_VER >= 1400)
# define XML_IGNORE_FPTR_CAST_WARNINGS __pragma(warning(push))
#else
# define XML_IGNORE_FPTR_CAST_WARNINGS
#endif
#ifndef XML_POP_WARNINGS
# if defined (_MSC_VER) && (_MSC_VER >= 1400)
# define XML_POP_WARNINGS __pragma(warning(pop))
# else
# define XML_POP_WARNINGS
# endif
#endif
#endif /* __GNUC__ */
#define XML_NO_ATTR
#ifdef LIBXML_THREAD_ENABLED
#define XML_DECLARE_GLOBAL(name, type, attrs) \
attrs XMLPUBFUN type *__##name(void);
#define XML_GLOBAL_MACRO(name) (*__##name())
#else
#define XML_DECLARE_GLOBAL(name, type, attrs) \
attrs XMLPUBVAR type name;
#endif
#ifdef __cplusplus
}
#endif /* __cplusplus */

View File

@ -293,10 +293,10 @@ if [ $failed_to_save_logs -ne 0 ]; then
# for files >64MB, we want these files to be compressed explicitly
for table in query_log zookeeper_log trace_log transactions_info_log metric_log
do
clickhouse-local "$data_path_config" --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst ||:
clickhouse-local "$data_path_config" --only-system-tables --stacktrace -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst ||:
if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
clickhouse-local --path /var/lib/clickhouse1/ --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.1.tsv.zst ||:
clickhouse-local --path /var/lib/clickhouse2/ --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.2.tsv.zst ||:
clickhouse-local --path /var/lib/clickhouse1/ --only-system-tables --stacktrace -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.1.tsv.zst ||:
clickhouse-local --path /var/lib/clickhouse2/ --only-system-tables --stacktrace -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.2.tsv.zst ||:
fi
done
fi

View File

@ -0,0 +1,342 @@
---
slug: /en/getting-started/example-datasets/noaa
sidebar_label: NOAA Global Historical Climatology Network
sidebar_position: 1
description: 2.5 billion rows of climate data for the last 120 years
---
# NOAA Global Historical Climatology Network
This dataset contains weather measurements for the last 120 years. Each row is a measurement for a point in time and station.
More precisely and according to the [origin of this data](https://github.com/awslabs/open-data-docs/tree/main/docs/noaa/noaa-ghcn):
> GHCN-Daily is a dataset that contains daily observations over global land areas. It contains station-based measurements from land-based stations worldwide, about two-thirds of which are for precipitation measurements only (Menne et al., 2012). GHCN-Daily is a composite of climate records from numerous sources that were merged together and subjected to a common suite of quality assurance reviews (Durre et al., 2010). The archive includes the following meteorological elements:
- Daily maximum temperature
- Daily minimum temperature
- Temperature at the time of observation
- Precipitation (i.e., rain, melted snow)
- Snowfall
- Snow depth
- Other elements where available
## Downloading the data
- A [pre-prepared version](#pre-prepared-data) of the data for ClickHouse, which has been cleansed, re-structured, and enriched. This data covers the years 1900 to 2022.
- [Download the original data](#original-data) and convert to the format required by ClickHouse. Users wanting to add their own columns may wish to explore this approach.
### Pre-prepared data
More specifically, rows that failed any of NOAA's quality assurance checks have been removed. The data has also been restructured from a measurement per line to a row per station id and date, i.e.
```csv
"station_id","date","tempAvg","tempMax","tempMin","precipitation","snowfall","snowDepth","percentDailySun","averageWindSpeed","maxWindSpeed","weatherType"
"AEM00041194","2022-07-30",347,0,308,0,0,0,0,0,0,0
"AEM00041194","2022-07-31",371,413,329,0,0,0,0,0,0,0
"AEM00041194","2022-08-01",384,427,357,0,0,0,0,0,0,0
"AEM00041194","2022-08-02",381,424,352,0,0,0,0,0,0,0
```
This is simpler to query and ensures the resulting table is less sparse. Finally, the data has also been enriched with latitude and longitude.
This data is available in the following S3 location. Either download the data to your local filesystem (and insert using the ClickHouse client) or insert directly into ClickHouse (see [Inserting from S3](#inserting-from-s3)).
To download:
```bash
wget https://datasets-documentation.s3.eu-west-3.amazonaws.com/noaa/noaa_enriched.parquet
```
### Original data
The following details the steps to download and transform the original data in preparation for loading into ClickHouse.
#### Download
To download the original data:
```bash
for i in {1900..2023}; do wget https://noaa-ghcn-pds.s3.amazonaws.com/csv.gz/${i}.csv.gz; done
```
#### Sampling the data
```bash
$ clickhouse-local --query "SELECT * FROM '2021.csv.gz' LIMIT 10" --format PrettyCompact
┌─c1──────────┬───────c2─┬─c3───┬──c4─┬─c5───┬─c6───┬─c7─┬───c8─┐
│ AE000041196 │ 20210101 │ TMAX │ 278 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │
│ AE000041196 │ 20210101 │ PRCP │ 0 │ D │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │
│ AE000041196 │ 20210101 │ TAVG │ 214 │ H │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │
│ AEM00041194 │ 20210101 │ TMAX │ 266 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │
│ AEM00041194 │ 20210101 │ TMIN │ 178 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │
│ AEM00041194 │ 20210101 │ PRCP │ 0 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │
│ AEM00041194 │ 20210101 │ TAVG │ 217 │ H │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │
│ AEM00041217 │ 20210101 │ TMAX │ 262 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │
│ AEM00041217 │ 20210101 │ TMIN │ 155 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │
│ AEM00041217 │ 20210101 │ TAVG │ 202 │ H │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │
└─────────────┴──────────┴──────┴─────┴──────┴──────┴────┴──────┘
```
Summarizing the [format documentation](https://github.com/awslabs/open-data-docs/tree/main/docs/noaa/noaa-ghcn) and the columns in order:
- An 11 character station identification code. This itself encodes some useful information, e.g. the first two characters identify the country (see the sketch after this list).
- YEAR/MONTH/DAY = 8 character date in YYYYMMDD format (e.g. 19860529 = May 29, 1986)
- ELEMENT = 4 character indicator of element type. Effectively the measurement type. While there are many measurements available, we select the following:
- PRCP - Precipitation (tenths of mm)
- SNOW - Snowfall (mm)
- SNWD - Snow depth (mm)
- TMAX - Maximum temperature (tenths of degrees C)
- TAVG - Average temperature (tenths of a degree C)
- TMIN - Minimum temperature (tenths of degrees C)
- PSUN - Daily percent of possible sunshine (percent)
- AWND - Average daily wind speed (tenths of meters per second)
- WSFG - Peak gust wind speed (tenths of meters per second)
- WT** = Weather Type where ** defines the weather type. The full list of weather types is in the format documentation linked above.
- DATA VALUE = 5 character data value for ELEMENT i.e. the value of the measurement.
- M-FLAG = 1 character Measurement Flag. This has 10 possible values. Some of these values indicate questionable data accuracy. We accept data where this is set to “P” - identified as missing presumed zero, as this is only relevant to the PRCP, SNOW and SNWD measurements.
- Q-FLAG is the measurement quality flag with 14 possible values. We are only interested in data with an empty value i.e. it did not fail any quality assurance checks.
- S-FLAG is the source flag for the observation. Not useful for our analysis and ignored.
- OBS-TIME = 4-character time of observation in hour-minute format (i.e. 0700 = 7:00 am). Typically not present in older data. We ignore this for our purposes.
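For example, the country prefix of a station can be extracted directly from the identifier. A minimal sketch using `clickhouse-local` against the sampled file from above (`c1` is the inferred name of the station id column):

```bash
clickhouse-local --query "SELECT substring(c1, 1, 2) AS country_code, count() AS measurements
FROM '2021.csv.gz'
GROUP BY country_code
ORDER BY measurements DESC
LIMIT 5" --format PrettyCompact
```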
A measurement per line would result in a sparse table structure in ClickHouse. We should transform to a row per time and station, with measurements as columns. First, we limit the dataset to those rows without issues i.e. where `qFlag` is equal to an empty string.
#### Clean the data
Using [ClickHouse local](https://clickhouse.com/blog/extracting-converting-querying-local-files-with-sql-clickhouse-local) we can filter rows that represent measurements of interest and pass our quality requirements:
```bash
clickhouse local --query "SELECT count()
FROM file('*.csv.gz', CSV, 'station_id String, date String, measurement String, value Int64, mFlag String, qFlag String, sFlag String, obsTime String') WHERE qFlag = '' AND (measurement IN ('PRCP', 'SNOW', 'SNWD', 'TMAX', 'TAVG', 'TMIN', 'PSUN', 'AWND', 'WSFG') OR startsWith(measurement, 'WT'))"
2679264563
```
With over 2.6 billion rows, this isn't a fast query since it involves parsing all the files. On our 8 core machine, this takes around 160 seconds.
### Pivot data
While the measurement per line structure can be used with ClickHouse, it will unnecessarily complicate future queries. Ideally, we need a row per station id and date, where each measurement type and associated value are a column i.e.
```csv
"station_id","date","tempAvg","tempMax","tempMin","precipitation","snowfall","snowDepth","percentDailySun","averageWindSpeed","maxWindSpeed","weatherType"
"AEM00041194","2022-07-30",347,0,308,0,0,0,0,0,0,0
"AEM00041194","2022-07-31",371,413,329,0,0,0,0,0,0,0
"AEM00041194","2022-08-01",384,427,357,0,0,0,0,0,0,0
"AEM00041194","2022-08-02",381,424,352,0,0,0,0,0,0,0
```
Using ClickHouse local and a simple `GROUP BY`, we can repivot our data to this structure. To limit memory overhead, we do this one file at a time.
```bash
for i in {1900..2022}
do
clickhouse-local --query "SELECT station_id,
toDate32(date) as date,
anyIf(value, measurement = 'TAVG') as tempAvg,
anyIf(value, measurement = 'TMAX') as tempMax,
anyIf(value, measurement = 'TMIN') as tempMin,
anyIf(value, measurement = 'PRCP') as precipitation,
anyIf(value, measurement = 'SNOW') as snowfall,
anyIf(value, measurement = 'SNWD') as snowDepth,
anyIf(value, measurement = 'PSUN') as percentDailySun,
anyIf(value, measurement = 'AWND') as averageWindSpeed,
anyIf(value, measurement = 'WSFG') as maxWindSpeed,
toUInt8OrZero(replaceOne(anyIf(measurement, startsWith(measurement, 'WT') AND value = 1), 'WT', '')) as weatherType
FROM file('$i.csv.gz', CSV, 'station_id String, date String, measurement String, value Int64, mFlag String, qFlag String, sFlag String, obsTime String')
WHERE qFlag = '' AND (measurement IN ('PRCP', 'SNOW', 'SNWD', 'TMAX', 'TAVG', 'TMIN', 'PSUN', 'AWND', 'WSFG') OR startsWith(measurement, 'WT'))
GROUP BY station_id, date
ORDER BY station_id, date FORMAT CSV" >> "noaa.csv";
done
```
This query produces a single 50GB file `noaa.csv`.
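As a quick sanity check (a sketch; the exact numbers depend on the years downloaded), the pivoted file can be queried in place with the same schema used in the enrichment step below:

```bash
clickhouse-local --query "SELECT count(), min(date), max(date)
FROM file('noaa.csv', CSV, 'station_id String, date Date32, tempAvg Int32, tempMax Int32, tempMin Int32, precipitation Int32, snowfall Int32, snowDepth Int32, percentDailySun Int8, averageWindSpeed Int32, maxWindSpeed Int32, weatherType UInt8')"
```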
### Enriching the data
The data has no indication of location aside from a station id, which includes a prefix country code. Ideally, each station would have a latitude and longitude associated with it. To achieve this, NOAA conveniently provides the details of each station as a separate [ghcnd-stations.txt](https://github.com/awslabs/open-data-docs/tree/main/docs/noaa/noaa-ghcn#format-of-ghcnd-stationstxt-file). This file has [several columns](https://github.com/awslabs/open-data-docs/tree/main/docs/noaa/noaa-ghcn#format-of-ghcnd-stationstxt-file), of which five are useful to our future analysis: id, latitude, longitude, elevation, and name.
```bash
wget http://noaa-ghcn-pds.s3.amazonaws.com/ghcnd-stations.txt
```
```bash
clickhouse local --query "WITH stations AS (SELECT id, lat, lon, elevation, splitByString(' GSN ',name)[1] as name FROM file('ghcnd-stations.txt', Regexp, 'id String, lat Float64, lon Float64, elevation Float32, name String'))
SELECT station_id,
date,
tempAvg,
tempMax,
tempMin,
precipitation,
snowfall,
snowDepth,
percentDailySun,
averageWindSpeed,
maxWindSpeed,
weatherType,
tuple(lon, lat) as location,
elevation,
name
FROM file('noaa.csv', CSV,
'station_id String, date Date32, tempAvg Int32, tempMax Int32, tempMin Int32, precipitation Int32, snowfall Int32, snowDepth Int32, percentDailySun Int8, averageWindSpeed Int32, maxWindSpeed Int32, weatherType UInt8') as noaa LEFT OUTER
JOIN stations ON noaa.station_id = stations.id INTO OUTFILE 'noaa_enriched.parquet' FORMAT Parquet SETTINGS format_regexp='^(.{11})\s+(\-?\d{1,2}\.\d{4})\s+(\-?\d{1,3}\.\d{1,4})\s+(\-?\d*\.\d*)\s+(.*)\s+(?:[\d]*)'"
```
This query takes a few minutes to run and produces a 6.4 GB file, `noaa_enriched.parquet`.
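Before loading, the schema of the enriched file can be inspected (a sketch; `DESCRIBE` over the `file()` function infers the Parquet schema):

```bash
clickhouse-local --query "DESCRIBE file('noaa_enriched.parquet')"
```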
## Create table
Create a MergeTree table in ClickHouse (from the ClickHouse client).
```sql
CREATE TABLE noaa
(
`station_id` LowCardinality(String),
`date` Date32,
`tempAvg` Int32 COMMENT 'Average temperature (tenths of a degrees C)',
`tempMax` Int32 COMMENT 'Maximum temperature (tenths of degrees C)',
`tempMin` Int32 COMMENT 'Minimum temperature (tenths of degrees C)',
`precipitation` UInt32 COMMENT 'Precipitation (tenths of mm)',
`snowfall` UInt32 COMMENT 'Snowfall (mm)',
`snowDepth` UInt32 COMMENT 'Snow depth (mm)',
`percentDailySun` UInt8 COMMENT 'Daily percent of possible sunshine (percent)',
`averageWindSpeed` UInt32 COMMENT 'Average daily wind speed (tenths of meters per second)',
`maxWindSpeed` UInt32 COMMENT 'Peak gust wind speed (tenths of meters per second)',
`weatherType` Enum8('Normal' = 0, 'Fog' = 1, 'Heavy Fog' = 2, 'Thunder' = 3, 'Small Hail' = 4, 'Hail' = 5, 'Glaze' = 6, 'Dust/Ash' = 7, 'Smoke/Haze' = 8, 'Blowing/Drifting Snow' = 9, 'Tornado' = 10, 'High Winds' = 11, 'Blowing Spray' = 12, 'Mist' = 13, 'Drizzle' = 14, 'Freezing Drizzle' = 15, 'Rain' = 16, 'Freezing Rain' = 17, 'Snow' = 18, 'Unknown Precipitation' = 19, 'Ground Fog' = 21, 'Freezing Fog' = 22),
`location` Point,
`elevation` Float32,
`name` LowCardinality(String)
) ENGINE = MergeTree() ORDER BY (station_id, date);
```
## Inserting into ClickHouse
### Inserting from local file
Data can be inserted from a local file as follows (from the ClickHouse client):
```sql
INSERT INTO noaa FROM INFILE '<path>/noaa_enriched.parquet'
```
where `<path>` represents the full path to the local file on disk.
See [here](https://clickhouse.com/blog/real-world-data-noaa-climate-data#load-the-data) for how to speed this load up.
### Inserting from S3
```sql
INSERT INTO noaa SELECT *
FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/noaa/noaa_enriched.parquet')
```
For how to speed this up, see our blog post on [tuning large data loads](https://clickhouse.com/blog/supercharge-your-clickhouse-data-loads-part2).
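For example, raising `max_insert_threads` parallelizes the insert pipeline (a sketch; the best value depends on the available cores and memory):

```sql
INSERT INTO noaa SELECT *
FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/noaa/noaa_enriched.parquet')
SETTINGS max_insert_threads = 16
```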
## Sample queries
### Highest temperature ever
```sql
SELECT
tempMax / 10 AS maxTemp,
location,
name,
date
FROM noaa
WHERE tempMax > 500
ORDER BY
tempMax DESC,
date ASC
LIMIT 5
┌─maxTemp─┬─location──────────┬─name───────────────────────────────────────────┬───────date─┐
│ 56.7 │ (-116.8667,36.45) │ CA GREENLAND RCH │ 1913-07-10 │
│ 56.7 │ (-115.4667,32.55) │ MEXICALI (SMN) │ 1949-08-20 │
│ 56.7 │ (-115.4667,32.55) │ MEXICALI (SMN) │ 1949-09-18 │
│ 56.7 │ (-115.4667,32.55) │ MEXICALI (SMN) │ 1952-07-17 │
│ 56.7 │ (-115.4667,32.55) │ MEXICALI (SMN) │ 1952-09-04 │
└─────────┴───────────────────┴────────────────────────────────────────────────┴────────────┘
5 rows in set. Elapsed: 0.514 sec. Processed 1.06 billion rows, 4.27 GB (2.06 billion rows/s., 8.29 GB/s.)
```
Reassuringly consistent with the [documented record](https://en.wikipedia.org/wiki/List_of_weather_records#Highest_temperatures_ever_recorded) at [Furnace Creek](https://www.google.com/maps/place/36%C2%B027'00.0%22N+116%C2%B052'00.1%22W/@36.1329666,-116.1104099,8.95z/data=!4m5!3m4!1s0x0:0xf2ed901b860f4446!8m2!3d36.45!4d-116.8667) as of 2023.
### Best ski resorts
Using a [list of ski resorts](https://gist.githubusercontent.com/gingerwizard/dd022f754fd128fdaf270e58fa052e35/raw/622e03c37460f17ef72907afe554cb1c07f91f23/ski_resort_stats.csv) in the United States and their respective locations, we join these against the top 1000 weather stations with the most snow in any month in the last 5 years. Sorting this join by [geoDistance](https://clickhouse.com/docs/en/sql-reference/functions/geo/coordinates/#geodistance) and restricting the results to those where the distance is less than 20km, we select the top result per resort and sort this by total snow. Note we also restrict stations to those above 1800m, as a broad indicator of good skiing conditions.
```sql
SELECT
resort_name,
total_snow / 1000 AS total_snow_m,
resort_location,
month_year
FROM
(
WITH resorts AS
(
SELECT
resort_name,
state,
(lon, lat) AS resort_location,
'US' AS code
FROM url('https://gist.githubusercontent.com/gingerwizard/dd022f754fd128fdaf270e58fa052e35/raw/622e03c37460f17ef72907afe554cb1c07f91f23/ski_resort_stats.csv', CSVWithNames)
)
SELECT
resort_name,
highest_snow.station_id,
geoDistance(resort_location.1, resort_location.2, station_location.1, station_location.2) / 1000 AS distance_km,
highest_snow.total_snow,
resort_location,
station_location,
month_year
FROM
(
SELECT
sum(snowfall) AS total_snow,
station_id,
any(location) AS station_location,
month_year,
substring(station_id, 1, 2) AS code
FROM noaa
WHERE (date > '2017-01-01') AND (code = 'US') AND (elevation > 1800)
GROUP BY
station_id,
toYYYYMM(date) AS month_year
ORDER BY total_snow DESC
LIMIT 1000
) AS highest_snow
INNER JOIN resorts ON highest_snow.code = resorts.code
WHERE distance_km < 20
ORDER BY
resort_name ASC,
total_snow DESC
LIMIT 1 BY
resort_name,
station_id
)
ORDER BY total_snow DESC
LIMIT 5
┌─resort_name──────────┬─total_snow_m─┬─resort_location─┬─month_year─┐
│ Sugar Bowl, CA │ 7.799 │ (-120.3,39.27) │ 201902 │
│ Donner Ski Ranch, CA │ 7.799 │ (-120.34,39.31) │ 201902 │
│ Boreal, CA │ 7.799 │ (-120.35,39.33) │ 201902 │
│ Homewood, CA │ 4.926 │ (-120.17,39.08) │ 201902 │
│ Alpine Meadows, CA │ 4.926 │ (-120.22,39.17) │ 201902 │
└──────────────────────┴──────────────┴─────────────────┴────────────┘
5 rows in set. Elapsed: 0.750 sec. Processed 689.10 million rows, 3.20 GB (918.20 million rows/s., 4.26 GB/s.)
Peak memory usage: 67.66 MiB.
```
## Credits
We would like to acknowledge the efforts of the Global Historical Climatology Network for preparing, cleansing, and distributing this data. We appreciate their efforts.
Menne, M.J., I. Durre, B. Korzeniewski, S. McNeal, K. Thomas, X. Yin, S. Anthony, R. Ray, R.S. Vose, B.E. Gleason, and T.G. Houston, 2012: Global Historical Climatology Network - Daily (GHCN-Daily), Version 3. [indicate subset used following decimal, e.g. Version 3.25]. NOAA National Centers for Environmental Information. http://doi.org/10.7289/V5D21VHZ [17/08/2020]

View File

@ -197,6 +197,29 @@ You can pass parameters to `clickhouse-client` (all parameters have a default va
Instead of `--host`, `--port`, `--user` and `--password` options, ClickHouse client also supports connection strings (see next section).
## Aliases {#cli_aliases}
- `\l` - SHOW DATABASES
- `\d` - SHOW TABLES
- `\c <DATABASE>` - USE DATABASE
- `.` - repeat the last query
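A hypothetical interactive session illustrating the aliases (the prompt and database name are illustrative):

```
:) \l          -- runs SHOW DATABASES
:) \c system   -- runs USE system
:) \d          -- runs SHOW TABLES
:) .           -- repeats the last query (SHOW TABLES)
```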
## Shortkeys {#shortkeys_aliases}
- `Alt (Option) + Shift + e` - open the editor with the current query. The editor can be set with the `EDITOR` environment variable; by default, vim is used.
- `Alt (Option) + #` - comment the current line.
- `Ctrl + r` - fuzzy history search.
:::tip
To make the meta key (Option) work correctly on macOS with iTerm2: go to Preferences -> Profiles -> Keys -> Left Option key and select Esc+.
:::
The full list of available shortkeys is in [replxx](https://github.com/AmokHuginnsson/replxx/blob/1f149bf/src/replxx_impl.cxx#L262).
## Connection string {#connection_string}
Alternatively, clickhouse-client supports connecting to a ClickHouse server using a connection string, similar to [MongoDB](https://www.mongodb.com/docs/manual/reference/connection-string/), [PostgreSQL](https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING), and [MySQL](https://dev.mysql.com/doc/refman/8.0/en/connecting-using-uri-or-key-value-pairs.html#connecting-using-uri). It has the following syntax:

View File

@ -45,11 +45,11 @@ clickhouse-benchmark [keys] < queries_file;
- `-c N`, `--concurrency=N` — Number of queries that `clickhouse-benchmark` sends simultaneously. Default value: 1.
- `-d N`, `--delay=N` — Interval in seconds between intermediate reports (to disable reports set 0). Default value: 1.
- `-h HOST`, `--host=HOST` — Server host. Default value: `localhost`. For the [comparison mode](#clickhouse-benchmark-comparison-mode) you can use multiple `-h` keys.
- `-p N`, `--port=N` — Server port. Default value: 9000. For the [comparison mode](#clickhouse-benchmark-comparison-mode) you can use multiple `-p` keys.
- `-i N`, `--iterations=N` — Total number of queries. Default value: 0 (repeat forever).
- `-r`, `--randomize` — Random order of queries execution if there is more than one input query.
- `-s`, `--secure` — Using `TLS` connection.
- `-t N`, `--timelimit=N` — Time limit in seconds. `clickhouse-benchmark` stops sending queries when the specified time limit is reached. Default value: 0 (time limit disabled).
- `--port=N` — Server port. Default value: 9000. For the [comparison mode](#clickhouse-benchmark-comparison-mode) you can use multiple `--port` keys.
- `--confidence=N` — Level of confidence for T-test. Possible values: 0 (80%), 1 (90%), 2 (95%), 3 (98%), 4 (99%), 5 (99.5%). Default value: 5. In the [comparison mode](#clickhouse-benchmark-comparison-mode) `clickhouse-benchmark` performs the [Independent two-sample Student's t-test](https://en.wikipedia.org/wiki/Student%27s_t-test#Independent_two-sample_t-test) to determine whether the two distributions aren't different with the selected level of confidence.
- `--cumulative` — Printing cumulative data instead of data per interval.
- `--database=DATABASE_NAME` — ClickHouse database name. Default value: `default`.
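For example, the keys above can be combined as follows (a sketch; `remote_host` is a placeholder for a second server to compare against):

```bash
echo "SELECT sum(number) FROM numbers(1000000)" \
  | clickhouse-benchmark -i 1000 -r -d 5 -h localhost -h remote_host --confidence=2
```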

View File

@ -2,6 +2,7 @@ use prql_compiler::sql::Dialect;
use prql_compiler::{Options, Target};
use std::ffi::{c_char, CString};
use std::slice;
use std::panic;
fn set_output(result: String, out: *mut *mut u8, out_size: *mut u64) {
assert!(!out_size.is_null());
@ -13,8 +14,7 @@ fn set_output(result: String, out: *mut *mut u8, out_size: *mut u64) {
*out_ptr = CString::new(result).unwrap().into_raw() as *mut u8;
}
#[no_mangle]
pub unsafe extern "C" fn prql_to_sql(
pub unsafe extern "C" fn prql_to_sql_impl(
query: *const u8,
size: u64,
out: *mut *mut u8,
@ -50,6 +50,23 @@ pub unsafe extern "C" fn prql_to_sql(
}
}
#[no_mangle]
pub unsafe extern "C" fn prql_to_sql(
query: *const u8,
size: u64,
out: *mut *mut u8,
out_size: *mut u64,
) -> i64 {
let ret = panic::catch_unwind(|| {
return prql_to_sql_impl(query, size, out, out_size);
});
return match ret {
// NOTE: with cxxbridge we could return a proper Result<> type.
Err(_err) => 1,
Ok(res) => res,
}
}
#[no_mangle]
pub unsafe extern "C" fn prql_free_pointer(ptr_to_free: *mut u8) {
std::mem::drop(CString::from_raw(ptr_to_free as *mut c_char));

View File

@ -1,6 +1,7 @@
use skim::prelude::*;
use term::terminfo::TermInfo;
use cxx::{CxxString, CxxVector};
use std::panic;
#[cxx::bridge]
mod ffi {
@ -36,7 +37,7 @@ impl SkimItem for Item {
}
}
fn skim(prefix: &CxxString, words: &CxxVector<CxxString>) -> Result<String, String> {
fn skim_impl(prefix: &CxxString, words: &CxxVector<CxxString>) -> Result<String, String> {
// Check if a terminal is available, to avoid a panic.
if let Err(err) = TermInfo::from_env() {
return Err(format!("{}", err));
@ -89,3 +90,22 @@ fn skim(prefix: &CxxString, words: &CxxVector<CxxString>) -> Result<String, Stri
}
return Ok(output.selected_items[0].output().to_string());
}
fn skim(prefix: &CxxString, words: &CxxVector<CxxString>) -> Result<String, String> {
let ret = panic::catch_unwind(|| {
return skim_impl(prefix, words);
});
return match ret {
Err(err) => {
let e = if let Some(s) = err.downcast_ref::<String>() {
format!("{}", s)
} else if let Some(s) = err.downcast_ref::<&str>() {
format!("{}", s)
} else {
format!("Unknown panic type: {:?}", err.type_id())
};
Err(format!("Rust panic: {:?}", e))
},
Ok(res) => res,
}
}

View File

@ -17,6 +17,7 @@ namespace DB
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int INCORRECT_DATA;
extern const int LOGICAL_ERROR;
extern const int NOT_IMPLEMENTED;
}
@ -30,12 +31,12 @@ class ApproxSampler
public:
struct Stats
{
T value; // the sampled value
Int64 g; // the minimum rank jump from the previous value's minimum rank
Int64 delta; // the maximum span of the rank
T value; // The sampled value
Int64 g; // The minimum rank jump from the previous value's minimum rank
Int64 delta; // The maximum span of the rank
Stats() = default;
Stats(T value_, Int64 g_, Int64 delta_) : value(value_), g(g_), delta(delta_) {}
Stats(T value_, Int64 g_, Int64 delta_) : value(value_), g(g_), delta(delta_) { }
};
struct QueryResult
@ -49,20 +50,20 @@ public:
ApproxSampler() = default;
explicit ApproxSampler(
double relative_error_,
size_t compress_threshold_ = default_compress_threshold,
size_t count_ = 0,
bool compressed_ = false)
: relative_error(relative_error_)
, compress_threshold(compress_threshold_)
, count(count_)
, compressed(compressed_)
ApproxSampler(const ApproxSampler & other)
: relative_error(other.relative_error)
, compress_threshold(other.compress_threshold)
, count(other.count)
, compressed(other.compressed)
, sampled(other.sampled.begin(), other.sampled.end())
, backup_sampled(other.backup_sampled.begin(), other.backup_sampled.end())
, head_sampled(other.head_sampled.begin(), other.head_sampled.end())
{
sampled.reserve(compress_threshold);
backup_sampled.reserve(compress_threshold);
}
head_sampled.reserve(default_head_size);
explicit ApproxSampler(double relative_error_)
: relative_error(relative_error_), compress_threshold(default_compress_threshold), count(0), compressed(false)
{
}
bool isCompressed() const { return compressed; }
@ -95,9 +96,9 @@ public:
Int64 current_max = std::numeric_limits<Int64>::min();
for (const auto & stats : sampled)
current_max = std::max(stats.delta + stats.g, current_max);
Int64 target_error = current_max/2;
Int64 target_error = current_max / 2;
size_t index= 0;
size_t index = 0;
auto min_rank = sampled[0].g;
for (size_t i = 0; i < size; ++i)
{
@ -118,7 +119,6 @@ public:
result[indices[i]] = res.value;
}
}
}
void compress()
@ -256,16 +256,27 @@ public:
void read(ReadBuffer & buf)
{
readBinaryLittleEndian(compress_threshold, buf);
if (compress_threshold != default_compress_threshold)
throw Exception(
ErrorCodes::INCORRECT_DATA,
"The compress threshold {} isn't the expected one {}",
compress_threshold,
default_compress_threshold);
readBinaryLittleEndian(relative_error, buf);
readBinaryLittleEndian(count, buf);
size_t sampled_len = 0;
readBinaryLittleEndian(sampled_len, buf);
if (sampled_len > compress_threshold)
throw Exception(
ErrorCodes::INCORRECT_DATA, "The number of elements {} for quantileGK exceeds {}", sampled_len, compress_threshold);
sampled.resize(sampled_len);
for (size_t i = 0; i < sampled_len; ++i)
{
auto stats = sampled[i];
auto & stats = sampled[i];
readBinaryLittleEndian(stats.value, buf);
readBinaryLittleEndian(stats.g, buf);
readBinaryLittleEndian(stats.delta, buf);
@ -291,7 +302,7 @@ private:
min_rank += curr_sample.g;
}
}
return {sampled.size()-1, 0, sampled.back().value};
return {sampled.size() - 1, 0, sampled.back().value};
}
void withHeadBufferInserted()
@ -389,12 +400,11 @@ private:
double relative_error;
size_t compress_threshold;
size_t count = 0;
size_t count;
bool compressed;
PaddedPODArray<Stats> sampled;
PaddedPODArray<Stats> backup_sampled;
PaddedPODArray<T> head_sampled;
static constexpr size_t default_compress_threshold = 10000;
@ -406,17 +416,14 @@ class QuantileGK
{
private:
using Data = ApproxSampler<Value>;
mutable Data data;
Data data;
public:
QuantileGK() = default;
explicit QuantileGK(size_t accuracy) : data(1.0 / static_cast<double>(accuracy)) { }
void add(const Value & x)
{
data.insert(x);
}
void add(const Value & x) { data.insert(x); }
template <typename Weight>
void add(const Value &, const Weight &)
@ -429,22 +436,34 @@ public:
if (!data.isCompressed())
data.compress();
data.merge(rhs.data);
if (rhs.data.isCompressed())
data.merge(rhs.data);
else
{
/// We can't modify rhs, so copy it and compress
Data rhs_data_copy(rhs.data);
rhs_data_copy.compress();
data.merge(rhs_data_copy);
}
}
void serialize(WriteBuffer & buf) const
{
/// Always compress before serialization
if (!data.isCompressed())
data.compress();
data.write(buf);
if (data.isCompressed())
data.write(buf);
else
{
/// We can't modify data in a const method, so copy it and compress
Data data_copy(data);
data_copy.compress();
data_copy.write(buf);
}
}
void deserialize(ReadBuffer & buf)
{
data.read(buf);
/// Serialized data is always compressed
data.setCompressed();
}
@ -481,7 +500,6 @@ public:
}
};
template <typename Value, bool _> using FuncQuantileGK = AggregateFunctionQuantile<Value, QuantileGK<Value>, NameQuantileGK, false, void, false, true>;
template <typename Value, bool _> using FuncQuantilesGK = AggregateFunctionQuantile<Value, QuantileGK<Value>, NameQuantilesGK, false, void, true, true>;

View File

@ -136,12 +136,12 @@ namespace
{
void assertDigest(
const KeeperStorage::Digest & first,
const KeeperStorage::Digest & second,
const KeeperStorage::Digest & expected,
const KeeperStorage::Digest & actual,
const Coordination::ZooKeeperRequest & request,
bool committing)
{
if (!KeeperStorage::checkDigest(first, second))
if (!KeeperStorage::checkDigest(expected, actual))
{
LOG_FATAL(
getLogger("KeeperStateMachine"),
@ -149,9 +149,9 @@ void assertDigest(
"{}). Keeper will terminate to avoid inconsistencies.\nExtra information about the request:\n{}",
committing ? "committing" : "preprocessing",
request.getOpNum(),
first.value,
second.value,
first.version,
expected.value,
actual.value,
expected.version,
request.toString());
std::terminate();
}

View File

@ -174,7 +174,6 @@ uint64_t calculateDigest(std::string_view path, std::string_view data, const Kee
hash.update(data);
hash.update(stat.czxid);
hash.update(stat.czxid);
hash.update(stat.mzxid);
hash.update(stat.ctime);
@ -183,7 +182,6 @@ uint64_t calculateDigest(std::string_view path, std::string_view data, const Kee
hash.update(stat.cversion);
hash.update(stat.aversion);
hash.update(stat.ephemeralOwner);
hash.update(data.length());
hash.update(stat.numChildren);
hash.update(stat.pzxid);
@ -2531,6 +2529,17 @@ void KeeperStorage::recalculateStats()
container.recalculateDataSize();
}
bool KeeperStorage::checkDigest(const Digest & first, const Digest & second)
{
if (first.version != second.version)
return true;
if (first.version == DigestVersion::NO_DIGEST)
return true;
return first.value == second.value;
}
String KeeperStorage::generateDigest(const String & userdata)
{
std::vector<String> user_password;

View File

@ -95,10 +95,11 @@ public:
{
NO_DIGEST = 0,
V1 = 1,
V2 = 2 // added system nodes that modify the digest on startup so digest from V0 is invalid
V2 = 2, // added system nodes that modify the digest on startup so digest from V0 is invalid
V3 = 3 // fixed bug with casting, removed duplicate czxid usage
};
static constexpr auto CURRENT_DIGEST_VERSION = DigestVersion::V2;
static constexpr auto CURRENT_DIGEST_VERSION = DigestVersion::V3;
struct ResponseForSession
{
@ -113,16 +114,7 @@ public:
uint64_t value{0};
};
static bool checkDigest(const Digest & first, const Digest & second)
{
if (first.version != second.version)
return true;
if (first.version == DigestVersion::NO_DIGEST)
return true;
return first.value == second.value;
}
static bool checkDigest(const Digest & first, const Digest & second);
static String generateDigest(const String & userdata);

View File

@ -6,6 +6,7 @@
#include <Formats/FormatSettings.h>
#include <IO/WriteBufferFromString.h>
#include <IO/BufferWithOwnMemory.h>
#include <IO/PeekableReadBuffer.h>
#include <IO/readFloatText.h>
#include <IO/Operators.h>
#include <base/find_symbols.h>

View File

@ -38,7 +38,6 @@
#include <IO/CompressionMethod.h>
#include <IO/ReadBuffer.h>
#include <IO/ReadBufferFromMemory.h>
#include <IO/PeekableReadBuffer.h>
#include <IO/VarInt.h>
#include <pcg_random.hpp>
@ -51,6 +50,7 @@ namespace DB
template <typename Allocator>
struct Memory;
class PeekableReadBuffer;
namespace ErrorCodes
{

View File

@ -30,6 +30,7 @@ static const std::unordered_map<String, String> quantile_fuse_name_mapping =
{"quantileTDigestWeighted", "quantilesTDigestWeighted"},
{"quantileTiming", "quantilesTiming"},
{"quantileTimingWeighted", "quantilesTimingWeighted"},
{"quantileGK", "quantilesGK"},
};
String GatherFunctionQuantileData::toFusedNameOrSelf(const String & func_name)

View File

@ -7,6 +7,7 @@
#include <Processors/Formats/RowInputFormatWithNamesAndTypes.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatSettings.h>
#include <IO/PeekableReadBuffer.h>
namespace DB

View File

@ -2,6 +2,7 @@
#include <Formats/JSONUtils.h>
#include <Formats/FormatFactory.h>
#include <Formats/EscapingRuleUtils.h>
#include <IO/PeekableReadBuffer.h>
#include <IO/ReadHelpers.h>
namespace DB

View File

@ -4,6 +4,7 @@
#include <Formats/FormatSettings.h>
#include <Processors/Formats/RowInputFormatWithNamesAndTypes.h>
#include <Processors/Formats/ISchemaReader.h>
#include <IO/PeekableReadBuffer.h>
namespace DB

View File

@ -7,6 +7,7 @@
#include <IO/ReadHelpers.h>
#include <IO/Operators.h>
#include <IO/ReadBufferFromString.h>
#include <IO/PeekableReadBuffer.h>
#include <Formats/EscapingRuleUtils.h>

View File

@ -25,6 +25,7 @@
#include <IO/WriteHelpers.h>
#include <IO/Archives/createArchiveReader.h>
#include <IO/Archives/IArchiveReader.h>
#include <IO/PeekableReadBuffer.h>
#include <Formats/FormatFactory.h>
#include <Formats/ReadSchemaUtils.h>

View File

@ -1,5 +1,6 @@
#include <algorithm>
#include <memory>
#include <stack>
#include <Core/NamesAndTypes.h>
#include <Core/TypeId.h>

View File

@ -1,11 +1,14 @@
#!/usr/bin/env python3
import atexit
import sys
import logging
import sys
from typing import Tuple
# isort: off
from github import Github
# isort: on
from commit_status_helper import (
CI_STATUS_NAME,
create_ci_report,
@ -18,12 +21,12 @@ from commit_status_helper import (
)
from env_helper import GITHUB_REPOSITORY, GITHUB_SERVER_URL
from get_robot_token import get_best_robot_token
from pr_info import FORCE_TESTS_LABEL, PRInfo
from lambda_shared_package.lambda_shared.pr import (
CATEGORY_TO_LABEL,
TRUSTED_CONTRIBUTORS,
check_pr_description,
)
from pr_info import FORCE_TESTS_LABEL, PRInfo
from report import FAILURE
TRUSTED_ORG_IDS = {
@ -146,7 +149,7 @@ def main():
)
post_commit_status(
commit,
"failure",
FAILURE,
url,
format_description(description_error),
PR_CHECK,
@ -170,6 +173,14 @@ def main():
# allow the workflow to continue
if not can_run:
post_commit_status(
commit,
FAILURE,
"",
description,
PR_CHECK,
pr_info,
)
print("::notice ::Cannot run")
sys.exit(1)

View File

@ -4,6 +4,7 @@
<server_id>1</server_id>
<create_snapshot_on_exit>1</create_snapshot_on_exit>
<digest_enabled>1</digest_enabled>
<coordination_settings>
<operation_timeout_ms>10000</operation_timeout_ms>

View File

@ -1087,9 +1087,11 @@ def test_stop_other_host_during_backup(kill):
status = node1.query(f"SELECT status FROM system.backups WHERE id='{id}'").strip()
if kill:
assert status in ["BACKUP_CREATED", "BACKUP_FAILED"]
expected_statuses = ["BACKUP_CREATED", "BACKUP_FAILED"]
else:
assert status == "BACKUP_CREATED"
expected_statuses = ["BACKUP_CREATED", "BACKUP_CANCELLED"]
assert status in expected_statuses
node2.start_clickhouse()

View File

@ -19,6 +19,20 @@ select quantilesGK(1000, 100/1000, 200/1000, 250/1000, 314/1000, 777/1000)(numbe
[99,199,249,313,776]
select quantilesGK(10000, 100/1000, 200/1000, 250/1000, 314/1000, 777/1000)(number + 1) from numbers(1000);
[100,200,250,314,777]
SELECT quantileGKMerge(100, 0.5)(x)
FROM
(
SELECT quantileGKState(100, 0.5)(number + 1) AS x
FROM numbers(49999)
);
24902
SELECT quantilesGKMerge(100, 0.5, 0.9, 0.99)(x)
FROM
(
SELECT quantilesGKState(100, 0.5, 0.9, 0.99)(number + 1) AS x
FROM numbers(49999)
);
[24902,44518,49999]
select medianGK()(number) from numbers(10) SETTINGS allow_experimental_analyzer = 0; -- { serverError BAD_ARGUMENTS }
select medianGK()(number) from numbers(10) SETTINGS allow_experimental_analyzer = 1; -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH }
select quantileGK()(number) from numbers(10) SETTINGS allow_experimental_analyzer = 0; -- { serverError BAD_ARGUMENTS }

View File

@ -15,6 +15,19 @@ select quantilesGK(100, 100/1000, 200/1000, 250/1000, 314/1000, 777/1000)(number
select quantilesGK(1000, 100/1000, 200/1000, 250/1000, 314/1000, 777/1000)(number + 1) from numbers(1000);
select quantilesGK(10000, 100/1000, 200/1000, 250/1000, 314/1000, 777/1000)(number + 1) from numbers(1000);
SELECT quantileGKMerge(100, 0.5)(x)
FROM
(
SELECT quantileGKState(100, 0.5)(number + 1) AS x
FROM numbers(49999)
);
SELECT quantilesGKMerge(100, 0.5, 0.9, 0.99)(x)
FROM
(
SELECT quantilesGKState(100, 0.5, 0.9, 0.99)(number + 1) AS x
FROM numbers(49999)
);
select medianGK()(number) from numbers(10) SETTINGS allow_experimental_analyzer = 0; -- { serverError BAD_ARGUMENTS }
select medianGK()(number) from numbers(10) SETTINGS allow_experimental_analyzer = 1; -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH }

View File

@ -1 +1 @@
2024-01-01 Hello World
1

View File

@ -1,6 +1,6 @@
CREATE table if not exists table_with_dot_column (date Date, regular_column String, `other_column.2` String) ENGINE = MergeTree() ORDER BY date;
INSERT INTO table_with_dot_column select '2020-01-01', 'Hello', 'World';
INSERT INTO table_with_dot_column select '2024-01-01', 'Hello', 'World';
CREATE TABLE IF NOT EXISTS table_with_dot_column (date Date, regular_column String, `other_column.2` String) ENGINE = MergeTree() ORDER BY date;
INSERT INTO table_with_dot_column SELECT '2020-01-01', 'Hello', 'World';
INSERT INTO table_with_dot_column SELECT toDate(now() + 48*3600), 'Hello', 'World';
CREATE ROW POLICY IF NOT EXISTS row_policy ON table_with_dot_column USING toDate(date) >= today() - 30 TO ALL;
SELECT * FROM table_with_dot_column;
SELECT count(*) FROM table_with_dot_column;
DROP TABLE table_with_dot_column;

View File

@ -1,4 +1,4 @@
personal_ws-1.1 en 2657
personal_ws-1.1 en 2724
AArch
ACLs
ALTERs
@ -12,6 +12,7 @@ ARMv
ASLR
ASOF
ASan
AWND
AWST
Actian
ActionsMenu
@ -238,6 +239,7 @@ DistributedSend
DockerHub
DoubleDelta
Doxygen
Durre
ECMA
Ecto
EdgeAngle
@ -289,6 +291,7 @@ ForEach
FreeBSD
Fuzzer
Fuzzers
GHCN
GTID
GTest
Gb
@ -444,6 +447,7 @@ Khanna
KittenHouse
Klickhouse
Kolmogorov
Korzeniewski
Kubernetes
LDAP
LGPL
@ -503,6 +507,7 @@ MaxMind
MaxPartCountForPartition
MaxPushedDDLEntryID
Mbps
McNeal
Memcheck
MemoryCode
MemoryDataAndStack
@ -512,6 +517,7 @@ MemorySanitizer
MemoryShared
MemoryTracking
MemoryVirtual
Menne
MergeJoin
MergeState
MergeTree
@ -556,6 +562,7 @@ NEWDATE
NEWDECIMAL
NFKC
NFKD
NOAA
NULLIF
NVME
NVMe
@ -576,6 +583,7 @@ NetworkSendBytes
NetworkSendDrop
NetworkSendErrors
NetworkSendPackets
Noaa
NodeJs
NuRaft
NumHexagons
@ -656,8 +664,10 @@ OrZero
OvercommitTracker
PAAMAYIM
PCRE
PRCP
PREWHERE
PROCESSLIST
PSUN
PagerDuty
ParallelFormattingOutputFormatThreads
ParallelFormattingOutputFormatThreadsActive
@ -802,6 +812,7 @@ SIMD
SLES
SLRU
SMALLINT
SNWD
SPNEGO
SQEs
SQLAlchemy
@ -874,11 +885,14 @@ SupersetDocker
SystemReplicasThreads
SystemReplicasThreadsActive
TABLUM
TAVG
TCPConnection
TCPThreads
TDigest
TINYINT
TLSv
TMAX
TMIN
TPCH
TSDB
TSVRaw
@ -980,7 +994,9 @@ VersionedCollapsingMergeTree
VideoContainer
ViewAllLink
VirtualBox
Vose
WALs
WSFG
Welch's
Werror
Wether
@ -999,6 +1015,7 @@ Xeon
YAML
YAMLRegExpTree
YYYY
YYYYMMDD
YYYYMMDDToDate
YYYYMMDDhhmmssToDateTime
Yandex
@ -1570,6 +1587,7 @@ getSetting
getSizeOfEnumType
getblockinfo
getevents
ghcnd
github
glibc
globalIn
@ -1954,6 +1972,7 @@ ngramSimHashCaseInsensitiveUTF
ngramSimHashUTF
ngrambf
ngrams
noaa
nonNegativeDerivative
noop
normalizeQuery
@ -2210,6 +2229,7 @@ reinterpretAsString
reinterpretAsUInt
reinterpretAsUUID
remoteSecure
repivot
replaceAll
replaceOne
replaceRegexpAll
@ -2706,3 +2726,6 @@ znode
znodes
zookeeperSessionUptime
zstd
iTerm
shortkeys
Shortkeys