From 1dacfc53ff97fbab6ee349c6df27b3ad2f9df1e8 Mon Sep 17 00:00:00 2001
From: Dale Mcdiarmid
Date: Fri, 12 Jan 2024 17:28:45 +0000
Subject: [PATCH 01/27] weather data

---
 .../getting-started/example-datasets/noaa.md  | 340 ++++++++++++++++++
 1 file changed, 340 insertions(+)
 create mode 100644 docs/en/getting-started/example-datasets/noaa.md

diff --git a/docs/en/getting-started/example-datasets/noaa.md b/docs/en/getting-started/example-datasets/noaa.md
new file mode 100644
index 00000000000..8d34ff8d3ee
--- /dev/null
+++ b/docs/en/getting-started/example-datasets/noaa.md
@@ -0,0 +1,340 @@
---
slug: /en/getting-started/example-datasets/noaa
sidebar_label: NOAA Global Historical Climatology Network
sidebar_position: 1
description: 2.5 billion rows of climate data for the last 120 years
---

# NOAA Global Historical Climatology Network

This dataset contains weather measurements for the last 120 years. Each row is a measurement for a point in time and a station.

More precisely and according to the [origin of this data](https://github.com/awslabs/open-data-docs/tree/main/docs/noaa/noaa-ghcn):

> GHCN-Daily is a dataset that contains daily observations over global land areas. It contains station-based measurements from land-based stations worldwide, about two thirds of which are for precipitation measurements only (Menne et al., 2012). GHCN-Daily is a composite of climate records from numerous sources that were merged together and subjected to a common suite of quality assurance reviews (Durre et al., 2010). The archive includes the following meteorological elements:

  - Daily maximum temperature
  - Daily minimum temperature
  - Temperature at the time of observation
  - Precipitation (i.e., rain, melted snow)
  - Snowfall
  - Snow depth
  - Other elements where available

## Downloading the data

Two options are available:

- A [pre-prepared version](#pre-prepared-data) of the data for ClickHouse, which has been cleansed, re-structured, and enriched. This data covers the years 1900 to 2022.
- [Download the original data](#original-data) and convert it to the format required by ClickHouse. Users wanting to add their own columns may wish to explore this approach.

### Pre-prepared data

More specifically, rows that failed any quality assurance checks by NOAA have been removed. The data has also been restructured from a measurement per line to a row per station id and date i.e.

```csv
"station_id","date","tempAvg","tempMax","tempMin","precipitation","snowfall","snowDepth","percentDailySun","averageWindSpeed","maxWindSpeed","weatherType"
"AEM00041194","2022-07-30",347,0,308,0,0,0,0,0,0,0
"AEM00041194","2022-07-31",371,413,329,0,0,0,0,0,0,0
"AEM00041194","2022-08-01",384,427,357,0,0,0,0,0,0,0
"AEM00041194","2022-08-02",381,424,352,0,0,0,0,0,0,0
```

This is simpler to query and ensures the resulting table is less sparse. Finally, the data has also been enriched with latitude and longitude.

This data is available in the following S3 location. Either download the data to your local filesystem (and insert using the ClickHouse client) or insert directly into ClickHouse (see [Inserting from S3](#inserting-from-s3)).

To download:

```bash
wget https://datasets-documentation.s3.eu-west-3.amazonaws.com/noaa/noaa_enriched.parquet
```

### Original data

The following details the steps to download and transform the original data in preparation for loading into ClickHouse.
#### Download

To download the original data:

```bash
for i in {1900..2023}; do wget https://noaa-ghcn-pds.s3.amazonaws.com/csv.gz/${i}.csv.gz; done
```

#### Sampling the data

```bash
zcat 2021.csv.gz | head
AE000041196,20210101,TMAX,278,,,S,
AE000041196,20210101,PRCP,0,D,,S,
AE000041196,20210101,TAVG,214,H,,S,
AEM00041194,20210101,TMAX,266,,,S,
AEM00041194,20210101,TMIN,178,,,S,
AEM00041194,20210101,PRCP,0,,,S,
AEM00041194,20210101,TAVG,217,H,,S,
AEM00041217,20210101,TMAX,262,,,S,
AEM00041217,20210101,TMIN,155,,,S,
AEM00041217,20210101,TAVG,202,H,,S,
```

Summarizing the [format documentation](https://github.com/awslabs/open-data-docs/tree/main/docs/noaa/noaa-ghcn):

- An 11-character station identification code. This itself encodes some useful information.
- YEAR/MONTH/DAY = 8-character date in YYYYMMDD format (e.g. 19860529 = May 29, 1986)
- ELEMENT = 4-character indicator of element type, effectively the measurement type. While there are many measurements available, we select the following:
  - PRCP - Precipitation (tenths of mm)
  - SNOW - Snowfall (mm)
  - SNWD - Snow depth (mm)
  - TMAX - Maximum temperature (tenths of degrees C)
  - TAVG - Average temperature (tenths of a degrees C)
  - TMIN - Minimum temperature (tenths of degrees C)
  - PSUN - Daily percent of possible sunshine (percent)
  - AWND - Average daily wind speed (tenths of meters per second)
  - WSFG - Peak gust wind speed (tenths of meters per second)
  - WT** = Weather Type, where ** defines the weather type. The full list of weather types is enumerated in the `weatherType` column of the table schema below.
- DATA VALUE = 5-character data value for ELEMENT, i.e. the value of the measurement.
- M-FLAG = 1-character measurement flag. This has 10 possible values. Some of these values indicate questionable data accuracy. We accept data where this is set to "P" (identified as missing, presumed zero), as this is only relevant to the PRCP, SNOW and SNWD measurements.
- Q-FLAG is the measurement quality flag with 14 possible values. We are only interested in data with an empty value, i.e. it did not fail any quality assurance checks.
- S-FLAG is the source flag for the observation. Not useful for our analysis and ignored.
- OBS-TIME = 4-character time of observation in hour-minute format (e.g. 0700 = 7:00 am). Typically not present in older data. We ignore this for our purposes.

A measurement per line would result in a sparse table structure in ClickHouse. We should transform it to a row per time and station, with measurements as columns. First, we limit the dataset to those rows without issues, i.e. where `qFlag` is equal to an empty string.

#### Clean the data

Using [ClickHouse local](https://clickhouse.com/blog/extracting-converting-querying-local-files-with-sql-clickhouse-local), we can filter to rows that represent measurements of interest and that pass our quality requirements:

```bash
clickhouse local --query "SELECT count()
FROM file('*.csv.gz', CSV, 'station_id String, date String, measurement String, value Int64, mFlag String, qFlag String, sFlag String, obsTime String') WHERE qFlag = '' AND (measurement IN ('PRCP', 'SNOW', 'SNWD', 'TMAX', 'TAVG', 'TMIN', 'PSUN', 'AWND', 'WSFG') OR startsWith(measurement, 'WT'))"

2679264563
```

With over 2.6 billion rows, this isn’t a fast query since it involves parsing all the files. On our 8-core machine, this takes around 160 seconds.
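When iterating on the filter logic, it can help to validate against a single year first rather than paying that cost on every run. A quick sketch (assuming `2021.csv.gz` from the download step is in the working directory) that also shows how the measurements are distributed:

```bash
clickhouse local --query "SELECT measurement, count() AS c
FROM file('2021.csv.gz', CSV, 'station_id String, date String, measurement String, value Int64, mFlag String, qFlag String, sFlag String, obsTime String')
WHERE qFlag = ''
GROUP BY measurement
ORDER BY c DESC
LIMIT 10"
```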
### Pivot data

While the measurement-per-line structure can be used with ClickHouse, it will unnecessarily complicate future queries. Ideally, we need a row per station id and date, where each measurement type and associated value are a column, i.e.

```csv
"station_id","date","tempAvg","tempMax","tempMin","precipitation","snowfall","snowDepth","percentDailySun","averageWindSpeed","maxWindSpeed","weatherType"
"AEM00041194","2022-07-30",347,0,308,0,0,0,0,0,0,0
"AEM00041194","2022-07-31",371,413,329,0,0,0,0,0,0,0
"AEM00041194","2022-08-01",384,427,357,0,0,0,0,0,0,0
"AEM00041194","2022-08-02",381,424,352,0,0,0,0,0,0,0
```

Using ClickHouse local and a simple `GROUP BY`, we can repivot our data to this structure. To limit memory overhead, we do this one file at a time.

```bash
for i in {1900..2022}
do
clickhouse-local --query "SELECT station_id,
    toDate32(date) as date,
    anyIf(value, measurement = 'TAVG') as tempAvg,
    anyIf(value, measurement = 'TMAX') as tempMax,
    anyIf(value, measurement = 'TMIN') as tempMin,
    anyIf(value, measurement = 'PRCP') as precipitation,
    anyIf(value, measurement = 'SNOW') as snowfall,
    anyIf(value, measurement = 'SNWD') as snowDepth,
    anyIf(value, measurement = 'PSUN') as percentDailySun,
    anyIf(value, measurement = 'AWND') as averageWindSpeed,
    anyIf(value, measurement = 'WSFG') as maxWindSpeed,
    toUInt8OrZero(replaceOne(anyIf(measurement, startsWith(measurement, 'WT') AND value = 1), 'WT', '')) as weatherType
FROM file('$i.csv.gz', CSV, 'station_id String, date String, measurement String, value Int64, mFlag String, qFlag String, sFlag String, obsTime String')
 WHERE qFlag = '' AND (measurement IN ('PRCP', 'SNOW', 'SNWD', 'TMAX', 'TAVG', 'TMIN', 'PSUN', 'AWND', 'WSFG') OR startsWith(measurement, 'WT'))
GROUP BY station_id, date
ORDER BY station_id, date FORMAT CSV" >> "noaa.csv";
done
```

This query produces a single 50 GB file, `noaa.csv`.

### Enriching the data

The data has no indication of location aside from a station id, which includes a country-code prefix. Ideally, each station would have a latitude and longitude associated with it. To achieve this, NOAA conveniently provides the details of each station in a separate file, [ghcnd-stations.txt](https://github.com/awslabs/open-data-docs/tree/main/docs/noaa/noaa-ghcn#format-of-ghcnd-stationstxt-file). This file has [several columns](https://github.com/awslabs/open-data-docs/tree/main/docs/noaa/noaa-ghcn#format-of-ghcnd-stationstxt-file), of which five are useful to our future analysis: id, latitude, longitude, elevation, and name.
```bash
wget http://noaa-ghcn-pds.s3.amazonaws.com/ghcnd-stations.txt
```

```bash
clickhouse local --query "WITH stations AS (SELECT id, lat, lon, elevation, splitByString(' GSN ',name)[1] as name FROM file('ghcnd-stations.txt', Regexp, 'id String, lat Float64, lon Float64, elevation Float32, name String'))
SELECT station_id,
       date,
       tempAvg,
       tempMax,
       tempMin,
       precipitation,
       snowfall,
       snowDepth,
       percentDailySun,
       averageWindSpeed,
       maxWindSpeed,
       weatherType,
       tuple(lon, lat) as location,
       elevation,
       name
FROM file('noaa.csv', CSV,
          'station_id String, date Date32, tempAvg Int32, tempMax Int32, tempMin Int32, precipitation Int32, snowfall Int32, snowDepth Int32, percentDailySun Int8, averageWindSpeed Int32, maxWindSpeed Int32, weatherType UInt8') as noaa LEFT OUTER
 JOIN stations ON noaa.station_id = stations.id INTO OUTFILE 'noaa_enriched.parquet' FORMAT Parquet SETTINGS format_regexp='^(.{11})\s+(\-?\d{1,2}\.\d{4})\s+(\-?\d{1,3}\.\d{1,4})\s+(\-?\d*\.\d*)\s+(.*)\s+(?:[\d]*)'"
```
This query takes a few minutes to run and produces an 6.4GB file `noaa_enriched.parquet`.

## Create table

Create a MergeTree table in ClickHouse (from the ClickHouse client).

```sql
CREATE TABLE noaa
(
   `station_id` LowCardinality(String),
   `date` Date32,
   `tempAvg` Int32 COMMENT 'Average temperature (tenths of a degree C)',
   `tempMax` Int32 COMMENT 'Maximum temperature (tenths of degrees C)',
   `tempMin` Int32 COMMENT 'Minimum temperature (tenths of degrees C)',
   `precipitation` UInt32 COMMENT 'Precipitation (tenths of mm)',
   `snowfall` UInt32 COMMENT 'Snowfall (mm)',
   `snowDepth` UInt32 COMMENT 'Snow depth (mm)',
   `percentDailySun` UInt8 COMMENT 'Daily percent of possible sunshine (percent)',
   `averageWindSpeed` UInt32 COMMENT 'Average daily wind speed (tenths of meters per second)',
   `maxWindSpeed` UInt32 COMMENT 'Peak gust wind speed (tenths of meters per second)',
   `weatherType` Enum8('Normal' = 0, 'Fog' = 1, 'Heavy Fog' = 2, 'Thunder' = 3, 'Small Hail' = 4, 'Hail' = 5, 'Glaze' = 6, 'Dust/Ash' = 7, 'Smoke/Haze' = 8, 'Blowing/Drifting Snow' = 9, 'Tornado' = 10, 'High Winds' = 11, 'Blowing Spray' = 12, 'Mist' = 13, 'Drizzle' = 14, 'Freezing Drizzle' = 15, 'Rain' = 16, 'Freezing Rain' = 17, 'Snow' = 18, 'Unknown Precipitation' = 19, 'Ground Fog' = 21, 'Freezing Fog' = 22),
   `location` Point,
   `elevation` Float32,
   `name` LowCardinality(String)
) ENGINE = MergeTree() ORDER BY (station_id, date);

```

## Inserting into ClickHouse

### Inserting from local file

Data can be inserted from local file as follows (from the ClickHouse client):

```sql
INSERT INTO noaa FROM INFILE '<path>/noaa_enriched.parquet'
```

where `<path>` represents the full path to the local file on disk.

See [here](https://clickhouse.com/blog/real-world-data-noaa-climate-data#load-the-data) for how to speed this load up.

### Inserting from S3

```sql
INSERT INTO noaa SELECT *
FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/noaa/noaa_enriched.parquet')

```
For how to speed this up, see our blog post on [tuning large data loads](https://clickhouse.com/blog/supercharge-your-clickhouse-data-loads-part2).
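As a rough illustration of the kind of tuning discussed there (the value is illustrative and depends on your hardware), the S3 insert can be parallelized by raising `max_insert_threads`:

```sql
INSERT INTO noaa SELECT *
FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/noaa/noaa_enriched.parquet')
SETTINGS max_insert_threads = 16
```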
## Sample queries

### Highest temperature ever

```sql
SELECT
    tempMax / 10 AS maxTemp,
    location,
    name,
    date
FROM noaa
WHERE tempMax > 500
ORDER BY
    tempMax DESC,
    date ASC
LIMIT 5

┌─maxTemp─┬─location──────────┬─name───────────────────────────────────────────┬───────date─┐
│ 56.7 │ (-116.8667,36.45) │ CA GREENLAND RCH │ 1913-07-10 │
│ 56.7 │ (-115.4667,32.55) │ MEXICALI (SMN) │ 1949-08-20 │
│ 56.7 │ (-115.4667,32.55) │ MEXICALI (SMN) │ 1949-09-18 │
│ 56.7 │ (-115.4667,32.55) │ MEXICALI (SMN) │ 1952-07-17 │
│ 56.7 │ (-115.4667,32.55) │ MEXICALI (SMN) │ 1952-09-04 │
└─────────┴───────────────────┴────────────────────────────────────────────────┴────────────┘

5 rows in set. Elapsed: 0.514 sec. Processed 1.06 billion rows, 4.27 GB (2.06 billion rows/s., 8.29 GB/s.)
```

Reassuringly consistent with the [documented record](https://en.wikipedia.org/wiki/List_of_weather_records#Highest_temperatures_ever_recorded) at [Furnace Creek](https://www.google.com/maps/place/36%C2%B027'00.0%22N+116%C2%B052'00.1%22W/@36.1329666,-116.1104099,8.95z/data=!4m5!3m4!1s0x0:0xf2ed901b860f4446!8m2!3d36.45!4d-116.8667) as of 2023.

### Best ski resorts

Using a [list of ski resorts](https://gist.githubusercontent.com/gingerwizard/dd022f754fd128fdaf270e58fa052e35/raw/622e03c37460f17ef72907afe554cb1c07f91f23/ski_resort_stats.csv) in the United States and their respective locations, we join these against the top 1000 weather stations with the most snow in any month in the last 5 years. Sorting this join by [geoDistance](https://clickhouse.com/docs/en/sql-reference/functions/geo/coordinates/#geodistance) and restricting the results to those where the distance is less than 20 km, we select the top result per resort and sort this by total snow. Note we also restrict to stations above 1800 m, as a broad indicator of good skiing conditions.

```sql
SELECT
   resort_name,
   total_snow / 1000 AS total_snow_m,
   resort_location,
   month_year
FROM
(
   WITH resorts AS
       (
           SELECT
               resort_name,
               state,
               (lon, lat) AS resort_location,
               'US' AS code
           FROM url('https://gist.githubusercontent.com/gingerwizard/dd022f754fd128fdaf270e58fa052e35/raw/622e03c37460f17ef72907afe554cb1c07f91f23/ski_resort_stats.csv', CSVWithNames)
       )
   SELECT
       resort_name,
       highest_snow.station_id,
       geoDistance(resort_location.1, resort_location.2, station_location.1, station_location.2) / 1000 AS distance_km,
       highest_snow.total_snow,
       resort_location,
       station_location,
       month_year
   FROM
   (
       SELECT
           sum(snowfall) AS total_snow,
           station_id,
           any(location) AS station_location,
           month_year,
           substring(station_id, 1, 2) AS code
       FROM noaa
       WHERE (date > '2017-01-01') AND (code = 'US') AND (elevation > 1800)
       GROUP BY
           station_id,
           toYYYYMM(date) AS month_year
       ORDER BY total_snow DESC
       LIMIT 1000
   ) AS highest_snow
   INNER JOIN resorts ON highest_snow.code = resorts.code
   WHERE distance_km < 20
   ORDER BY
       resort_name ASC,
       total_snow DESC
   LIMIT 1 BY
       resort_name,
       station_id
)
ORDER BY total_snow DESC
LIMIT 5

┌─resort_name──────────┬─total_snow_m─┬─resort_location─┬─month_year─┐
│ Sugar Bowl, CA │ 7.799 │ (-120.3,39.27) │ 201902 │
│ Donner Ski Ranch, CA │ 7.799 │ (-120.34,39.31) │ 201902 │
│ Boreal, CA │ 7.799 │ (-120.35,39.33) │ 201902 │
│ Homewood, CA │ 4.926 │ (-120.17,39.08) │ 201902 │
│ Alpine Meadows, CA │ 4.926 │ (-120.22,39.17) │ 201902 │
└──────────────────────┴──────────────┴─────────────────┴────────────┘

5 rows in set. Elapsed: 0.750 sec.
Processed 689.10 million rows, 3.20 GB (918.20 million rows/s., 4.26 GB/s.)
Peak memory usage: 67.66 MiB.
```

## Credits

We would like to acknowledge the efforts of the Global Historical Climatology Network for preparing, cleansing, and distributing this data. We appreciate their efforts.

Menne, M.J., I. Durre, B. Korzeniewski, S. McNeal, K. Thomas, X. Yin, S. Anthony, R. Ray, R.S. Vose, B.E. Gleason, and T.G. Houston, 2012: Global Historical Climatology Network - Daily (GHCN-Daily), Version 3. [indicate subset used following decimal, e.g. Version 3.25]. NOAA National Centers for Environmental Information. http://doi.org/10.7289/V5D21VHZ [17/08/2020]

From 5ba6def57d0e256be75b729678fc37d4c8989f7e Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Sun, 14 Jan 2024 07:29:28 +0300
Subject: [PATCH 02/27] Update noaa.md

---
 .../getting-started/example-datasets/noaa.md  | 32 ++++++++++---------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/docs/en/getting-started/example-datasets/noaa.md b/docs/en/getting-started/example-datasets/noaa.md
index 8d34ff8d3ee..bc2e9fecae1 100644
--- a/docs/en/getting-started/example-datasets/noaa.md
+++ b/docs/en/getting-started/example-datasets/noaa.md
@@ -11,7 +11,7 @@ This dataset contains weather measurements for the last 120 years. Each row is a

 More precisely and according to the [origin of this data](https://github.com/awslabs/open-data-docs/tree/main/docs/noaa/noaa-ghcn):

-> GHCN-Daily is a dataset that contains daily observations over global land areas. It contains station-based measurements from land-based stations worldwide, about two thirds of which are for precipitation measurements only (Menne et al., 2012). GHCN-Daily is a composite of climate records from numerous sources that were merged together and subjected to a common suite of quality assurance reviews (Durre et al., 2010). The archive includes the following meteorological elements:
+> GHCN-Daily is a dataset that contains daily observations over global land areas. It contains station-based measurements from land-based stations worldwide, about two-thirds of which are for precipitation measurements only (Menne et al., 2012). GHCN-Daily is a composite of climate records from numerous sources that were merged together and subjected to a common suite of quality assurance reviews (Durre et al., 2010). The archive includes the following meteorological elements:

   - Daily maximum temperature
   - Daily minimum temperature
   - Temperature at the time of observation
@@ -28,7 +28,7 @@ More precisely and according to the [origin of this data](https://github.com/aws

 ### Pre-prepared data

-More specifically, rows that failed any quality assurance checks by NOAA have been removed. The data has also been restructured from a measurement per line to a row per station id and date i.e.
+More specifically, rows that failed any quality assurance checks by NOAA have been removed. The data has also been restructured from a measurement per line to a row per station id and date, i.e.
```csv "station_id","date","tempAvg","tempMax","tempMin","precipitation","snowfall","snowDepth","percentDailySun","averageWindSpeed","maxWindSpeed","weatherType" @@ -63,17 +63,19 @@ for i in {1900..2023}; do wget https://noaa-ghcn-pds.s3.amazonaws.com/csv.gz/${i #### Sampling the data ```bash -zcat 2021.csv.gz | head -AE000041196,20210101,TMAX,278,,,S, -AE000041196,20210101,PRCP,0,D,,S, -AE000041196,20210101,TAVG,214,H,,S, -AEM00041194,20210101,TMAX,266,,,S, -AEM00041194,20210101,TMIN,178,,,S, -AEM00041194,20210101,PRCP,0,,,S, -AEM00041194,20210101,TAVG,217,H,,S, -AEM00041217,20210101,TMAX,262,,,S, -AEM00041217,20210101,TMIN,155,,,S, -AEM00041217,20210101,TAVG,202,H,,S, +$ clickhouse-local --query "SELECT * FROM '2021.csv.gz' LIMIT 10" --format PrettyCompact +┌─c1──────────┬───────c2─┬─c3───┬──c4─┬─c5───┬─c6───┬─c7─┬───c8─┐ +│ AE000041196 │ 20210101 │ TMAX │ 278 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AE000041196 │ 20210101 │ PRCP │ 0 │ D │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AE000041196 │ 20210101 │ TAVG │ 214 │ H │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AEM00041194 │ 20210101 │ TMAX │ 266 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AEM00041194 │ 20210101 │ TMIN │ 178 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AEM00041194 │ 20210101 │ PRCP │ 0 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AEM00041194 │ 20210101 │ TAVG │ 217 │ H │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AEM00041217 │ 20210101 │ TMAX │ 262 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AEM00041217 │ 20210101 │ TMIN │ 155 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AEM00041217 │ 20210101 │ TAVG │ 202 │ H │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +└─────────────┴──────────┴──────┴─────┴──────┴──────┴────┴──────┘ ``` Summarizing the [format documentation](https://github.com/awslabs/open-data-docs/tree/main/docs/noaa/noaa-ghcn): @@ -88,7 +90,7 @@ Summarizing the format documentation and the columns in order: - SNOW - Snowfall (mm) - SNWD - Snow depth (mm) - TMAX - Maximum temperature (tenths of degrees C) - - TAVG - Average temperature (tenths of a degrees C) + - TAVG - Average temperature (tenths of a degree C) - TMIN - Minimum temperature (tenths of degrees C) - PSUN - Daily percent of possible sunshine (percent) - AWND - Average daily wind speed (tenths of meters per second) @@ -215,7 +217,7 @@ CREATE TABLE noaa ### Inserting from local file -Data can be inserted from local file as follows (from the ClickHouse client): +Data can be inserted from a local file as follows (from the ClickHouse client): ```sql INSERT INTO noaa FROM INFILE '/noaa_enriched.parquet' From 1c1e1512bf92c4788ce17f38cf228d4525cdb9eb Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 01:29:38 +0300 Subject: [PATCH 03/27] Update noaa.md --- docs/en/getting-started/example-datasets/noaa.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/getting-started/example-datasets/noaa.md b/docs/en/getting-started/example-datasets/noaa.md index bc2e9fecae1..9a3ec7791b6 100644 --- a/docs/en/getting-started/example-datasets/noaa.md +++ b/docs/en/getting-started/example-datasets/noaa.md @@ -185,7 +185,7 @@ FROM file('noaa.csv', CSV, 'station_id String, date Date32, tempAvg Int32, tempMax Int32, tempMin Int32, precipitation Int32, snowfall Int32, snowDepth Int32, percentDailySun Int8, averageWindSpeed Int32, maxWindSpeed Int32, weatherType UInt8') as noaa LEFT OUTER JOIN stations ON noaa.station_id = stations.id INTO OUTFILE 'noaa_enriched.parquet' FORMAT Parquet SETTINGS format_regexp='^(.{11})\s+(\-?\d{1,2}\.\d{4})\s+(\-?\d{1,3}\.\d{1,4})\s+(\-?\d*\.\d*)\s+(.*)\s+(?:[\d]*)'" ``` -This query takes a few minutes to run and produces an 6.4GB file `noaa_enriched.parquet`. 
+This query takes a few minutes to run and produces a 6.4 GB file, `noaa_enriched.parquet`. ## Create table From 0a6331f5f756d5d6465095edac89d2a03618d773 Mon Sep 17 00:00:00 2001 From: Dale Mcdiarmid Date: Thu, 18 Jan 2024 13:18:13 +0000 Subject: [PATCH 04/27] spell-check additions --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 73b7a081797..1f6b24597da 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1,4 +1,4 @@ -personal_ws-1.1 en 2657 +personal_ws-1.1 en 2697 AArch ACLs ALTERs @@ -2016,6 +2016,7 @@ pcre performant perl persistency +personal_ws-1.1 en 2657 phpclickhouse pipelining plaintext From 8b730811efd055f1b200f277202a34258a93722e Mon Sep 17 00:00:00 2001 From: Dale McDiarmid Date: Thu, 18 Jan 2024 13:24:40 +0000 Subject: [PATCH 05/27] Update aspell-dict.txt --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 73b7a081797..1f6b24597da 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1,4 +1,4 @@ -personal_ws-1.1 en 2657 +personal_ws-1.1 en 2697 AArch ACLs ALTERs @@ -2016,6 +2016,7 @@ pcre performant perl persistency +personal_ws-1.1 en 2657 phpclickhouse pipelining plaintext From 8944e7a0b180a711d9eae4cb6a1064a2030e445a Mon Sep 17 00:00:00 2001 From: Joshua Hildred Date: Sat, 27 Jan 2024 12:20:49 -0800 Subject: [PATCH 06/27] update curl submodule to be version 8.50 to address CVE-2023-48795 --- contrib/curl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/curl b/contrib/curl index d755a5f7c00..7161cb17c01 160000 --- a/contrib/curl +++ b/contrib/curl @@ -1 +1 @@ -Subproject commit d755a5f7c009dd63a61b2c745180d8ba937cbfeb +Subproject commit 7161cb17c01dcff1dc5bf89a18437d9d729f1ecd From 2fa1aebe511147348b75a910d5f927dd19095f2d Mon Sep 17 00:00:00 2001 From: Aris Tritas Date: Sun, 28 Jan 2024 16:46:37 +0100 Subject: [PATCH 07/27] Update condition required to define Rust target toolchain in corrosion-cmake The `CMAKE_TOOLCHAIN_FILE` variable is used for cross-compilation. Currently, the build is blocked when it's unset. Keep the default Rust target toolchain variable unless the CMake toolchain variable is set. 
--- contrib/corrosion-cmake/CMakeLists.txt | 47 +++++++++++++------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/contrib/corrosion-cmake/CMakeLists.txt b/contrib/corrosion-cmake/CMakeLists.txt index 9b98ed6efb3..4f60304d74d 100644 --- a/contrib/corrosion-cmake/CMakeLists.txt +++ b/contrib/corrosion-cmake/CMakeLists.txt @@ -16,29 +16,30 @@ message(STATUS "Checking Rust toolchain for current target") # See https://doc.rust-lang.org/nightly/rustc/platform-support.html -if(CMAKE_TOOLCHAIN_FILE MATCHES "ppc64le") - set(Rust_CARGO_TARGET "powerpc64le-unknown-linux-gnu") -elseif((CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-x86_64") AND (CMAKE_TOOLCHAIN_FILE MATCHES "musl")) - set(Rust_CARGO_TARGET "x86_64-unknown-linux-musl") -elseif(CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-x86_64") - set(Rust_CARGO_TARGET "x86_64-unknown-linux-gnu") -elseif((CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-aarch64") AND (CMAKE_TOOLCHAIN_FILE MATCHES "musl")) - set(Rust_CARGO_TARGET "aarch64-unknown-linux-musl") -elseif(CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-aarch64") - set(Rust_CARGO_TARGET "aarch64-unknown-linux-gnu") -elseif((CMAKE_TOOLCHAIN_FILE MATCHES "darwin") AND (CMAKE_TOOLCHAIN_FILE MATCHES "x86_64")) - set(Rust_CARGO_TARGET "x86_64-apple-darwin") -elseif((CMAKE_TOOLCHAIN_FILE MATCHES "darwin") AND (CMAKE_TOOLCHAIN_FILE MATCHES "aarch64")) - set(Rust_CARGO_TARGET "aarch64-apple-darwin") -elseif((CMAKE_TOOLCHAIN_FILE MATCHES "freebsd") AND (CMAKE_TOOLCHAIN_FILE MATCHES "x86_64")) - set(Rust_CARGO_TARGET "x86_64-unknown-freebsd") -elseif(CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-riscv64") - set(Rust_CARGO_TARGET "riscv64gc-unknown-linux-gnu") -else() - message(FATAL_ERROR "Unsupported rust target") -endif() - -message(STATUS "Switched Rust target to ${Rust_CARGO_TARGET}") +if(DEFINED CMAKE_TOOLCHAIN_FILE) + if(CMAKE_TOOLCHAIN_FILE MATCHES "ppc64le") + set(Rust_CARGO_TARGET "powerpc64le-unknown-linux-gnu") + elseif((CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-x86_64") AND (CMAKE_TOOLCHAIN_FILE MATCHES "musl")) + set(Rust_CARGO_TARGET "x86_64-unknown-linux-musl") + elseif(CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-x86_64") + set(Rust_CARGO_TARGET "x86_64-unknown-linux-gnu") + elseif((CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-aarch64") AND (CMAKE_TOOLCHAIN_FILE MATCHES "musl")) + set(Rust_CARGO_TARGET "aarch64-unknown-linux-musl") + elseif(CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-aarch64") + set(Rust_CARGO_TARGET "aarch64-unknown-linux-gnu") + elseif((CMAKE_TOOLCHAIN_FILE MATCHES "darwin") AND (CMAKE_TOOLCHAIN_FILE MATCHES "x86_64")) + set(Rust_CARGO_TARGET "x86_64-apple-darwin") + elseif((CMAKE_TOOLCHAIN_FILE MATCHES "darwin") AND (CMAKE_TOOLCHAIN_FILE MATCHES "aarch64")) + set(Rust_CARGO_TARGET "aarch64-apple-darwin") + elseif((CMAKE_TOOLCHAIN_FILE MATCHES "freebsd") AND (CMAKE_TOOLCHAIN_FILE MATCHES "x86_64")) + set(Rust_CARGO_TARGET "x86_64-unknown-freebsd") + elseif(CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-riscv64") + set(Rust_CARGO_TARGET "riscv64gc-unknown-linux-gnu") + else() + message(FATAL_ERROR "Unsupported rust target") + endif() + message(STATUS "Switched Rust target to ${Rust_CARGO_TARGET}") +endif () # FindRust.cmake list(APPEND CMAKE_MODULE_PATH "${ClickHouse_SOURCE_DIR}/contrib/corrosion/cmake") From be6a6ca3b3eaf9208ce96f1ef1a2ef00e71572ce Mon Sep 17 00:00:00 2001 From: Aleksandr Musorin Date: Mon, 29 Jan 2024 18:31:11 +0100 Subject: [PATCH 08/27] docs: added hidden clickhouse-client shortkeys and aliases --- 
docs/en/interfaces/cli.md | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/docs/en/interfaces/cli.md b/docs/en/interfaces/cli.md
index a53844e792f..518037a2c7c 100644
--- a/docs/en/interfaces/cli.md
+++ b/docs/en/interfaces/cli.md
@@ -197,6 +197,29 @@ You can pass parameters to `clickhouse-client` (all parameters have a default va

 Instead of `--host`, `--port`, `--user` and `--password` options, ClickHouse client also supports connection strings (see next section).

+## Aliases {#cli_aliases}
+
+- `\l` - SHOW DATABASES
+- `\d` - SHOW TABLES
+- `\c <database>` - USE DATABASE
+- `.` - repeat the last query
+
+
+## Shortkeys {#shortkeys_aliases}
+
+- `Alt (Option) + Shift + e` - open an editor with the current query. The editor can be set via the `EDITOR` environment variable; by default, vim is used.
+- `Alt (Option) + #` - comment the current line.
+- `Ctrl + r` - fuzzy history search.
+
+:::tip
+To make the meta key (Option) work correctly on macOS:
+
+iTerm2: go to Preferences -> Profiles -> Keys -> Left Option key and click Esc+
+:::
+
+The full list of available shortkeys is in [replxx](https://github.com/AmokHuginnsson/replxx/blob/1f149bf/src/replxx_impl.cxx#L262).
+
+
 ## Connection string {#connection_string}

 clickhouse-client alternatively supports connecting to clickhouse server using a connection string similar to [MongoDB](https://www.mongodb.com/docs/manual/reference/connection-string/), [PostgreSQL](https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING), [MySQL](https://dev.mysql.com/doc/refman/8.0/en/connecting-using-uri-or-key-value-pairs.html#connecting-using-uri). It has the following syntax:

From be5b3722b733d0aad17216cf6eb85b4d538c53f9 Mon Sep 17 00:00:00 2001
From: Joshua Hildred
Date: Mon, 29 Jan 2024 11:27:03 -0800
Subject: [PATCH 09/27] upgrade libxml2 to 12.4 to avoid security issues

---
 contrib/libxml2 | 2 +-
 .../linux_x86_64/include/libxml/xmlversion.h | 109 ++++++++++--------
 2 files changed, 60 insertions(+), 51 deletions(-)

diff --git a/contrib/libxml2 b/contrib/libxml2
index 223cb03a5d2..8292f361458 160000
--- a/contrib/libxml2
+++ b/contrib/libxml2
@@ -1 +1 @@
-Subproject commit 223cb03a5d27b1b2393b266a8657443d046139d6
+Subproject commit 8292f361458fcffe0bff515a385be02e9d35582c
diff --git a/contrib/libxml2-cmake/linux_x86_64/include/libxml/xmlversion.h b/contrib/libxml2-cmake/linux_x86_64/include/libxml/xmlversion.h
index c2faeb47cb1..010bc2787a1 100644
--- a/contrib/libxml2-cmake/linux_x86_64/include/libxml/xmlversion.h
+++ b/contrib/libxml2-cmake/linux_x86_64/include/libxml/xmlversion.h
@@ -21,7 +21,7 @@ extern "C" {
 * your library and includes mismatch
 */
 #ifndef LIBXML2_COMPILING_MSCCDEF
-XMLPUBFUN void XMLCALL xmlCheckVersion(int version);
+XMLPUBFUN void xmlCheckVersion(int version);
 #endif /* LIBXML2_COMPILING_MSCCDEF */

 /**
@@ -29,28 +29,28 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version);
 *
 * the version string like "1.2.3"
 */
-#define LIBXML_DOTTED_VERSION "2.10.3"
+#define LIBXML_DOTTED_VERSION "2.12.2"

 /**
 * LIBXML_VERSION:
 *
 * the version number: 1.2.3 value is 10203
 */
-#define LIBXML_VERSION 21003
+#define LIBXML_VERSION 21202

 /**
 * LIBXML_VERSION_STRING:
 *
 * the version number string, 1.2.3 value is "10203"
 */
-#define LIBXML_VERSION_STRING "21003"
+#define LIBXML_VERSION_STRING "21202"

 /**
 * LIBXML_VERSION_EXTRA:
 *
 * extra version information, used to show a git commit description
 */
-#define LIBXML_VERSION_EXTRA ""
+#define LIBXML_VERSION_EXTRA "-GITv2.12.2-5-g23dd0b76"

 /**
 * LIBXML_TEST_VERSION:
@@ -58,7 +58,7 @@
XMLPUBFUN void XMLCALL xmlCheckVersion(int version); * Macro to check that the libxml version in use is compatible with * the version the software has been compiled against */ -#define LIBXML_TEST_VERSION xmlCheckVersion(21003); +#define LIBXML_TEST_VERSION xmlCheckVersion(21202); #ifndef VMS #if 0 @@ -270,7 +270,7 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version); * * Whether iconv support is available */ -#if 0 +#if 1 #define LIBXML_ICONV_ENABLED #endif @@ -313,7 +313,7 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version); /** * LIBXML_DEBUG_RUNTIME: * - * Whether the runtime debugging is configured in + * Removed */ #if 0 #define LIBXML_DEBUG_RUNTIME @@ -409,12 +409,7 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version); #endif #ifdef __GNUC__ - -/** - * ATTRIBUTE_UNUSED: - * - * Macro used to signal to GCC unused function parameters - */ +/** DOC_DISABLE */ #ifndef ATTRIBUTE_UNUSED # if ((__GNUC__ > 2) || ((__GNUC__ == 2) && (__GNUC_MINOR__ >= 7))) @@ -424,12 +419,6 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version); # endif #endif -/** - * LIBXML_ATTR_ALLOC_SIZE: - * - * Macro used to indicate to GCC this is an allocator function - */ - #ifndef LIBXML_ATTR_ALLOC_SIZE # if (!defined(__clang__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 3)))) # define LIBXML_ATTR_ALLOC_SIZE(x) __attribute__((alloc_size(x))) @@ -440,12 +429,6 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version); # define LIBXML_ATTR_ALLOC_SIZE(x) #endif -/** - * LIBXML_ATTR_FORMAT: - * - * Macro used to indicate to GCC the parameter are printf like - */ - #ifndef LIBXML_ATTR_FORMAT # if ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3))) # define LIBXML_ATTR_FORMAT(fmt,args) __attribute__((__format__(__printf__,fmt,args))) @@ -457,45 +440,71 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version); #endif #ifndef XML_DEPRECATED -# ifdef IN_LIBXML +# if defined (IN_LIBXML) || (__GNUC__ * 100 + __GNUC_MINOR__ < 301) # define XML_DEPRECATED -# else /* Available since at least GCC 3.1 */ +# else # define XML_DEPRECATED __attribute__((deprecated)) # endif #endif +#if defined(__clang__) || (__GNUC__ * 100 + __GNUC_MINOR__ >= 406) + #if defined(__clang__) || (__GNUC__ * 100 + __GNUC_MINOR__ >= 800) + #define XML_IGNORE_FPTR_CAST_WARNINGS \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wpedantic\"") \ + _Pragma("GCC diagnostic ignored \"-Wcast-function-type\"") + #else + #define XML_IGNORE_FPTR_CAST_WARNINGS \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wpedantic\"") + #endif + #define XML_POP_WARNINGS \ + _Pragma("GCC diagnostic pop") +#else + #define XML_IGNORE_FPTR_CAST_WARNINGS + #define XML_POP_WARNINGS +#endif + #else /* ! __GNUC__ */ -/** - * ATTRIBUTE_UNUSED: - * - * Macro used to signal to GCC unused function parameters - */ #define ATTRIBUTE_UNUSED -/** - * LIBXML_ATTR_ALLOC_SIZE: - * - * Macro used to indicate to GCC this is an allocator function - */ #define LIBXML_ATTR_ALLOC_SIZE(x) -/** - * LIBXML_ATTR_FORMAT: - * - * Macro used to indicate to GCC the parameter are printf like - */ #define LIBXML_ATTR_FORMAT(fmt,args) -/** - * XML_DEPRECATED: - * - * Macro used to indicate that a function, variable, type or struct member - * is deprecated. 
- */ #ifndef XML_DEPRECATED -#define XML_DEPRECATED +# if defined (IN_LIBXML) || !defined (_MSC_VER) +# define XML_DEPRECATED +/* Available since Visual Studio 2005 */ +# elif defined (_MSC_VER) && (_MSC_VER >= 1400) +# define XML_DEPRECATED __declspec(deprecated) +# endif +#endif +#if defined (_MSC_VER) && (_MSC_VER >= 1400) +# define XML_IGNORE_FPTR_CAST_WARNINGS __pragma(warning(push)) +#else +# define XML_IGNORE_FPTR_CAST_WARNINGS +#endif +#ifndef XML_POP_WARNINGS +# if defined (_MSC_VER) && (_MSC_VER >= 1400) +# define XML_POP_WARNINGS __pragma(warning(pop)) +# else +# define XML_POP_WARNINGS +# endif #endif #endif /* __GNUC__ */ +#define XML_NO_ATTR + +#ifdef LIBXML_THREAD_ENABLED + #define XML_DECLARE_GLOBAL(name, type, attrs) \ + attrs XMLPUBFUN type *__##name(void); + #define XML_GLOBAL_MACRO(name) (*__##name()) +#else + #define XML_DECLARE_GLOBAL(name, type, attrs) \ + attrs XMLPUBVAR type name; +#endif + #ifdef __cplusplus } #endif /* __cplusplus */ #endif + From be5b3722b733d0aad17216cf6eb85b4d538c53f9 Mon Sep 17 00:00:00 2001 From: Dale Mcdiarmid Date: Tue, 30 Jan 2024 08:54:41 +0000 Subject: [PATCH 10/27] fix spelling --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 1f6b24597da..77152804740 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -2016,7 +2016,6 @@ pcre performant perl persistency -personal_ws-1.1 en 2657 phpclickhouse pipelining plaintext From 7e0bea4a66dee89e00e1be60e97656085107aba2 Mon Sep 17 00:00:00 2001 From: Dale Mcdiarmid Date: Tue, 30 Jan 2024 08:55:40 +0000 Subject: [PATCH 11/27] fix spelling --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 1f6b24597da..77152804740 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -2016,7 +2016,6 @@ pcre performant perl persistency -personal_ws-1.1 en 2657 phpclickhouse pipelining plaintext From 49d2b26820e5b184b21a6f131c3f89a34e9eaa41 Mon Sep 17 00:00:00 2001 From: Aleksandr Musorin Date: Tue, 30 Jan 2024 12:11:00 +0100 Subject: [PATCH 12/27] Added new words in aspell-dict --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index f4be6ebcf09..2614a0f55bc 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -2703,3 +2703,6 @@ znode znodes zookeeperSessionUptime zstd +iTerm +shortkeys +Shortkeys From b379adde9266a1afd45ccc228d84f9a36e0af92d Mon Sep 17 00:00:00 2001 From: Joshua Hildred Date: Tue, 30 Jan 2024 07:23:13 -0800 Subject: [PATCH 13/27] Include proper version information --- .../linux_x86_64/include/libxml/xmlversion.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/contrib/libxml2-cmake/linux_x86_64/include/libxml/xmlversion.h b/contrib/libxml2-cmake/linux_x86_64/include/libxml/xmlversion.h index 010bc2787a1..d8535e91a0e 100644 --- a/contrib/libxml2-cmake/linux_x86_64/include/libxml/xmlversion.h +++ b/contrib/libxml2-cmake/linux_x86_64/include/libxml/xmlversion.h @@ -29,28 +29,28 @@ 
XMLPUBFUN void xmlCheckVersion(int version); * * the version string like "1.2.3" */ -#define LIBXML_DOTTED_VERSION "2.12.2" +#define LIBXML_DOTTED_VERSION "2.12.4" /** * LIBXML_VERSION: * * the version number: 1.2.3 value is 10203 */ -#define LIBXML_VERSION 21202 +#define LIBXML_VERSION 21204 /** * LIBXML_VERSION_STRING: * * the version number string, 1.2.3 value is "10203" */ -#define LIBXML_VERSION_STRING "21202" +#define LIBXML_VERSION_STRING "21204" /** * LIBXML_VERSION_EXTRA: * * extra version information, used to show a git commit description */ -#define LIBXML_VERSION_EXTRA "-GITv2.12.2-5-g23dd0b76" +#define LIBXML_VERSION_EXTRA "-GITv2.12.4" /** * LIBXML_TEST_VERSION: @@ -58,7 +58,7 @@ XMLPUBFUN void xmlCheckVersion(int version); * Macro to check that the libxml version in use is compatible with * the version the software has been compiled against */ -#define LIBXML_TEST_VERSION xmlCheckVersion(21202); +#define LIBXML_TEST_VERSION xmlCheckVersion(21204); #ifndef VMS #if 0 @@ -507,4 +507,3 @@ XMLPUBFUN void xmlCheckVersion(int version); } #endif /* __cplusplus */ #endif - From 010f1c6cb79b9f9a5a96b9ce32faac234c1297e9 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 30 Jan 2024 19:10:26 +0100 Subject: [PATCH 14/27] Forward declaration for PeekableReadBuffer ReadHelpers.h is very common header and is the root cause of "recompile everything". Signed-off-by: Azat Khuzhin --- src/IO/ReadHelpers.cpp | 1 + src/IO/ReadHelpers.h | 2 +- src/Processors/Formats/Impl/CSVRowInputFormat.h | 1 + src/Processors/Formats/Impl/JSONRowInputFormat.cpp | 1 + src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h | 1 + src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp | 1 + src/Storages/StorageFile.cpp | 1 + src/Storages/VirtualColumnUtils.cpp | 1 + 8 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index 253abb3fee7..bcfe5fd5230 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index 2549b40e243..49530f4787a 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -38,7 +38,6 @@ #include #include #include -#include #include #include @@ -51,6 +50,7 @@ namespace DB template struct Memory; +class PeekableReadBuffer; namespace ErrorCodes { diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.h b/src/Processors/Formats/Impl/CSVRowInputFormat.h index c4b3c8feb8c..fe4d4e3be08 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.h +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.h @@ -7,6 +7,7 @@ #include #include #include +#include namespace DB diff --git a/src/Processors/Formats/Impl/JSONRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONRowInputFormat.cpp index f78ce530ecb..23faa057715 100644 --- a/src/Processors/Formats/Impl/JSONRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONRowInputFormat.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include namespace DB diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h index 00a270e9611..32abd532a52 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp 
index 478ce41f924..2ad6a825c8f 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 8b8a151fb1d..0d9e79d1d54 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 430ed012fa8..33ff6e7104f 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include From 0f5cb76a618ba5182d9463ed88d8acea1632b89b Mon Sep 17 00:00:00 2001 From: Dale Mcdiarmid Date: Tue, 30 Jan 2024 20:37:51 +0000 Subject: [PATCH 15/27] more words --- .../aspell-ignore/en/aspell-dict.txt | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 77152804740..74e32ed31ce 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1,4 +1,4 @@ -personal_ws-1.1 en 2697 +personal_ws-1.1 en 2716 AArch ACLs ALTERs @@ -12,6 +12,7 @@ ARMv ASLR ASOF ASan +AWND AWST Actian ActionsMenu @@ -237,6 +238,7 @@ DistributedSend DockerHub DoubleDelta Doxygen +Durre ECMA Ecto EdgeAngle @@ -287,6 +289,7 @@ ForEach FreeBSD Fuzzer Fuzzers +GHCN GTID GTest Gb @@ -442,6 +445,7 @@ Khanna KittenHouse Klickhouse Kolmogorov +Korzeniewski Kubernetes LDAP LGPL @@ -501,6 +505,7 @@ MaxMind MaxPartCountForPartition MaxPushedDDLEntryID Mbps +McNeal Memcheck MemoryCode MemoryDataAndStack @@ -510,6 +515,7 @@ MemorySanitizer MemoryShared MemoryTracking MemoryVirtual +Menne MergeJoin MergeState MergeTree @@ -554,6 +560,7 @@ NEWDATE NEWDECIMAL NFKC NFKD +NOAA NULLIF NVME NVMe @@ -574,6 +581,7 @@ NetworkSendBytes NetworkSendDrop NetworkSendErrors NetworkSendPackets +Noaa NodeJs NuRaft NumHexagons @@ -654,8 +662,10 @@ OrZero OvercommitTracker PAAMAYIM PCRE +PRCP PREWHERE PROCESSLIST +PSUN PagerDuty ParallelFormattingOutputFormatThreads ParallelFormattingOutputFormatThreadsActive @@ -800,6 +810,7 @@ SIMD SLES SLRU SMALLINT +SNWD SPNEGO SQEs SQLAlchemy @@ -872,11 +883,14 @@ SupersetDocker SystemReplicasThreads SystemReplicasThreadsActive TABLUM +TAVG TCPConnection TCPThreads TDigest TINYINT TLSv +TMAX +TMIN TPCH TSDB TSVRaw @@ -978,7 +992,9 @@ VersionedCollapsingMergeTree VideoContainer ViewAllLink VirtualBox +Vose WALs +WSFG Welch's Werror Wether @@ -997,6 +1013,7 @@ Xeon YAML YAMLRegExpTree YYYY +YYYYMMDD YYYYMMDDToDate YYYYMMDDhhmmssToDateTime Yandex @@ -1566,6 +1583,7 @@ getSetting getSizeOfEnumType getblockinfo getevents +ghcnd github glibc globalIn @@ -1947,6 +1965,7 @@ ngramSimHashCaseInsensitiveUTF ngramSimHashUTF ngrambf ngrams +noaa nonNegativeDerivative noop normalizeQuery @@ -2202,6 +2221,7 @@ reinterpretAsString reinterpretAsUInt reinterpretAsUUID remoteSecure +repivot replaceAll replaceOne replaceRegexpAll From db84527dd01742329186c11c5c9be066eb652e8c Mon Sep 17 00:00:00 2001 From: Dale Mcdiarmid Date: Tue, 30 Jan 2024 20:39:37 +0000 Subject: [PATCH 16/27] more words --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt 
b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 3eb233f2995..1cb24705d19 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1,4 +1,4 @@ -personal_ws-1.1 en 2716 +personal_ws-1.1 en 2724 AArch ACLs ALTERs From a89890babe6ba06825c0a7659db2e37be72a3046 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Tue, 30 Jan 2024 19:46:23 +0800 Subject: [PATCH 17/27] fix bug of quantileGK --- .../AggregateFunctionQuantileGK.cpp | 90 +++++++++++-------- .../GatherFunctionQuantileVisitor.cpp | 1 + .../02661_quantile_approx.reference | 14 +++ .../0_stateless/02661_quantile_approx.sql | 13 +++ 4 files changed, 82 insertions(+), 36 deletions(-) diff --git a/src/AggregateFunctions/AggregateFunctionQuantileGK.cpp b/src/AggregateFunctions/AggregateFunctionQuantileGK.cpp index 2c0b3e55136..2e8ccb2e5e4 100644 --- a/src/AggregateFunctions/AggregateFunctionQuantileGK.cpp +++ b/src/AggregateFunctions/AggregateFunctionQuantileGK.cpp @@ -17,6 +17,7 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int INCORRECT_DATA; extern const int LOGICAL_ERROR; extern const int NOT_IMPLEMENTED; } @@ -30,12 +31,12 @@ class ApproxSampler public: struct Stats { - T value; // the sampled value - Int64 g; // the minimum rank jump from the previous value's minimum rank - Int64 delta; // the maximum span of the rank + T value; // The sampled value + Int64 g; // The minimum rank jump from the previous value's minimum rank + Int64 delta; // The maximum span of the rank Stats() = default; - Stats(T value_, Int64 g_, Int64 delta_) : value(value_), g(g_), delta(delta_) {} + Stats(T value_, Int64 g_, Int64 delta_) : value(value_), g(g_), delta(delta_) { } }; struct QueryResult @@ -49,20 +50,20 @@ public: ApproxSampler() = default; - explicit ApproxSampler( - double relative_error_, - size_t compress_threshold_ = default_compress_threshold, - size_t count_ = 0, - bool compressed_ = false) - : relative_error(relative_error_) - , compress_threshold(compress_threshold_) - , count(count_) - , compressed(compressed_) + ApproxSampler(const ApproxSampler & other) + : relative_error(other.relative_error) + , compress_threshold(other.compress_threshold) + , count(other.count) + , compressed(other.compressed) + , sampled(other.sampled.begin(), other.sampled.end()) + , backup_sampled(other.backup_sampled.begin(), other.backup_sampled.end()) + , head_sampled(other.head_sampled.begin(), other.head_sampled.end()) { - sampled.reserve(compress_threshold); - backup_sampled.reserve(compress_threshold); + } - head_sampled.reserve(default_head_size); + explicit ApproxSampler(double relative_error_) + : relative_error(relative_error_), compress_threshold(default_compress_threshold), count(0), compressed(false) + { } bool isCompressed() const { return compressed; } @@ -95,9 +96,9 @@ public: Int64 current_max = std::numeric_limits::min(); for (const auto & stats : sampled) current_max = std::max(stats.delta + stats.g, current_max); - Int64 target_error = current_max/2; + Int64 target_error = current_max / 2; - size_t index= 0; + size_t index = 0; auto min_rank = sampled[0].g; for (size_t i = 0; i < size; ++i) { @@ -118,7 +119,6 @@ public: result[indices[i]] = res.value; } } - } void compress() @@ -256,16 +256,27 @@ public: void read(ReadBuffer & buf) { readBinaryLittleEndian(compress_threshold, buf); + if (compress_threshold != default_compress_threshold) + throw Exception( + ErrorCodes::INCORRECT_DATA, + "The compress 
threshold {} isn't the expected one {}", + compress_threshold, + default_compress_threshold); + readBinaryLittleEndian(relative_error, buf); readBinaryLittleEndian(count, buf); size_t sampled_len = 0; readBinaryLittleEndian(sampled_len, buf); + if (sampled_len > compress_threshold) + throw Exception( + ErrorCodes::INCORRECT_DATA, "The number of elements {} for quantileGK exceeds {}", sampled_len, compress_threshold); + sampled.resize(sampled_len); for (size_t i = 0; i < sampled_len; ++i) { - auto stats = sampled[i]; + auto & stats = sampled[i]; readBinaryLittleEndian(stats.value, buf); readBinaryLittleEndian(stats.g, buf); readBinaryLittleEndian(stats.delta, buf); @@ -291,7 +302,7 @@ private: min_rank += curr_sample.g; } } - return {sampled.size()-1, 0, sampled.back().value}; + return {sampled.size() - 1, 0, sampled.back().value}; } void withHeadBufferInserted() @@ -389,12 +400,11 @@ private: double relative_error; size_t compress_threshold; - size_t count = 0; + size_t count; bool compressed; PaddedPODArray sampled; PaddedPODArray backup_sampled; - PaddedPODArray head_sampled; static constexpr size_t default_compress_threshold = 10000; @@ -406,17 +416,14 @@ class QuantileGK { private: using Data = ApproxSampler; - mutable Data data; + Data data; public: QuantileGK() = default; explicit QuantileGK(size_t accuracy) : data(1.0 / static_cast(accuracy)) { } - void add(const Value & x) - { - data.insert(x); - } + void add(const Value & x) { data.insert(x); } template void add(const Value &, const Weight &) @@ -429,22 +436,34 @@ public: if (!data.isCompressed()) data.compress(); - data.merge(rhs.data); + if (rhs.data.isCompressed()) + data.merge(rhs.data); + else + { + /// We can't modify rhs, so copy it and compress + Data rhs_data_copy(rhs.data); + rhs_data_copy.compress(); + data.merge(rhs_data_copy); + } } void serialize(WriteBuffer & buf) const { - /// Always compress before serialization - if (!data.isCompressed()) - data.compress(); - - data.write(buf); + if (data.isCompressed()) + data.write(buf); + else + { + /// We can't modify rhs, so copy it and compress + Data data_copy(data); + data_copy.compress(); + data_copy.write(buf); + } } void deserialize(ReadBuffer & buf) { data.read(buf); - + /// Serialized data is always compressed data.setCompressed(); } @@ -481,7 +500,6 @@ public: } }; - template using FuncQuantileGK = AggregateFunctionQuantile, NameQuantileGK, false, void, false, true>; template using FuncQuantilesGK = AggregateFunctionQuantile, NameQuantilesGK, false, void, true, true>; diff --git a/src/Interpreters/GatherFunctionQuantileVisitor.cpp b/src/Interpreters/GatherFunctionQuantileVisitor.cpp index 664bb9e9383..6b6dc362771 100644 --- a/src/Interpreters/GatherFunctionQuantileVisitor.cpp +++ b/src/Interpreters/GatherFunctionQuantileVisitor.cpp @@ -30,6 +30,7 @@ static const std::unordered_map quantile_fuse_name_mapping = {"quantileTDigestWeighted", "quantilesTDigestWeighted"}, {"quantileTiming", "quantilesTiming"}, {"quantileTimingWeighted", "quantilesTimingWeighted"}, + {"quantileGK", "quantilesGK"}, }; String GatherFunctionQuantileData::toFusedNameOrSelf(const String & func_name) diff --git a/tests/queries/0_stateless/02661_quantile_approx.reference b/tests/queries/0_stateless/02661_quantile_approx.reference index 8369363aa9b..0ee846a268b 100644 --- a/tests/queries/0_stateless/02661_quantile_approx.reference +++ b/tests/queries/0_stateless/02661_quantile_approx.reference @@ -19,6 +19,20 @@ select quantilesGK(1000, 100/1000, 200/1000, 250/1000, 314/1000, 777/1000)(numbe 
[99,199,249,313,776] select quantilesGK(10000, 100/1000, 200/1000, 250/1000, 314/1000, 777/1000)(number + 1) from numbers(1000); [100,200,250,314,777] +SELECT quantileGKMerge(100, 0.5)(x) +FROM +( + SELECT quantileGKState(100, 0.5)(number + 1) AS x + FROM numbers(49999) +); +24902 +SELECT quantilesGKMerge(100, 0.5, 0.9, 0.99)(x) +FROM +( + SELECT quantilesGKState(100, 0.5, 0.9, 0.99)(number + 1) AS x + FROM numbers(49999) +); +[24902,44518,49999] select medianGK()(number) from numbers(10) SETTINGS allow_experimental_analyzer = 0; -- { serverError BAD_ARGUMENTS } select medianGK()(number) from numbers(10) SETTINGS allow_experimental_analyzer = 1; -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } select quantileGK()(number) from numbers(10) SETTINGS allow_experimental_analyzer = 0; -- { serverError BAD_ARGUMENTS } diff --git a/tests/queries/0_stateless/02661_quantile_approx.sql b/tests/queries/0_stateless/02661_quantile_approx.sql index 52c2979ad44..c0004260fa1 100644 --- a/tests/queries/0_stateless/02661_quantile_approx.sql +++ b/tests/queries/0_stateless/02661_quantile_approx.sql @@ -15,6 +15,19 @@ select quantilesGK(100, 100/1000, 200/1000, 250/1000, 314/1000, 777/1000)(number select quantilesGK(1000, 100/1000, 200/1000, 250/1000, 314/1000, 777/1000)(number + 1) from numbers(1000); select quantilesGK(10000, 100/1000, 200/1000, 250/1000, 314/1000, 777/1000)(number + 1) from numbers(1000); +SELECT quantileGKMerge(100, 0.5)(x) +FROM +( + SELECT quantileGKState(100, 0.5)(number + 1) AS x + FROM numbers(49999) +); + +SELECT quantilesGKMerge(100, 0.5, 0.9, 0.99)(x) +FROM +( + SELECT quantilesGKState(100, 0.5, 0.9, 0.99)(number + 1) AS x + FROM numbers(49999) +); select medianGK()(number) from numbers(10) SETTINGS allow_experimental_analyzer = 0; -- { serverError BAD_ARGUMENTS } select medianGK()(number) from numbers(10) SETTINGS allow_experimental_analyzer = 1; -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } From 79c068571de3dc142f6f1de8bb5a69286c4120ef Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Wed, 31 Jan 2024 12:58:22 +0100 Subject: [PATCH 18/27] Fix test test_stop_other_host_during_backup. 
--- tests/integration/test_backup_restore_on_cluster/test.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_backup_restore_on_cluster/test.py b/tests/integration/test_backup_restore_on_cluster/test.py index 20f538cca58..027c9736c32 100644 --- a/tests/integration/test_backup_restore_on_cluster/test.py +++ b/tests/integration/test_backup_restore_on_cluster/test.py @@ -1087,9 +1087,11 @@ def test_stop_other_host_during_backup(kill): status = node1.query(f"SELECT status FROM system.backups WHERE id='{id}'").strip() if kill: - assert status in ["BACKUP_CREATED", "BACKUP_FAILED"] + expected_statuses = ["BACKUP_CREATED", "BACKUP_FAILED"] else: - assert status == "BACKUP_CREATED" + expected_statuses = ["BACKUP_CREATED", "BACKUP_CANCELLED"] + + assert status in expected_statuses node2.start_clickhouse() From 60c37fb9bf00f39f5f9637045f302a9efb0f296d Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Wed, 31 Jan 2024 13:44:16 +0100 Subject: [PATCH 19/27] Update run.sh --- docker/test/stateless/run.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index 05b9ec2a06f..ea76447aef2 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -293,10 +293,10 @@ if [ $failed_to_save_logs -ne 0 ]; then # for files >64MB, we want this files to be compressed explicitly for table in query_log zookeeper_log trace_log transactions_info_log metric_log do - clickhouse-local "$data_path_config" --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst ||: + clickhouse-local "$data_path_config" --only-system-tables --stacktrace -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst ||: if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then - clickhouse-local --path /var/lib/clickhouse1/ --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.1.tsv.zst ||: - clickhouse-local --path /var/lib/clickhouse2/ --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.2.tsv.zst ||: + clickhouse-local --path /var/lib/clickhouse1/ --only-system-tables --stacktrace -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.1.tsv.zst ||: + clickhouse-local --path /var/lib/clickhouse2/ --only-system-tables --stacktrace -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.2.tsv.zst ||: fi done fi From 08f91907dc6f1734b948f7da932a94ccb77d9c29 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 31 Jan 2024 15:51:52 +0000 Subject: [PATCH 20/27] Fix digest calculation --- src/Coordination/KeeperStateMachine.cpp | 12 ++++++------ src/Coordination/KeeperStorage.cpp | 13 +++++++++++-- src/Coordination/KeeperStorage.h | 16 ++++------------ tests/config/config.d/keeper_port.xml | 1 + 4 files changed, 22 insertions(+), 20 deletions(-) diff --git a/src/Coordination/KeeperStateMachine.cpp b/src/Coordination/KeeperStateMachine.cpp index 8d50f0a76b1..c82f8301eff 100644 --- a/src/Coordination/KeeperStateMachine.cpp +++ b/src/Coordination/KeeperStateMachine.cpp @@ -136,12 +136,12 @@ namespace { void assertDigest( - const KeeperStorage::Digest & first, - const 
From 08f91907dc6f1734b948f7da932a94ccb77d9c29 Mon Sep 17 00:00:00 2001
From: Antonio Andelic
Date: Wed, 31 Jan 2024 15:51:52 +0000
Subject: [PATCH 20/27] Fix digest calculation

---
 src/Coordination/KeeperStateMachine.cpp | 12 ++++++------
 src/Coordination/KeeperStorage.cpp      | 13 +++++++++++--
 src/Coordination/KeeperStorage.h        | 16 ++++------------
 tests/config/config.d/keeper_port.xml   |  1 +
 4 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/src/Coordination/KeeperStateMachine.cpp b/src/Coordination/KeeperStateMachine.cpp
index 8d50f0a76b1..c82f8301eff 100644
--- a/src/Coordination/KeeperStateMachine.cpp
+++ b/src/Coordination/KeeperStateMachine.cpp
@@ -136,12 +136,12 @@ namespace
 {
 
 void assertDigest(
-    const KeeperStorage::Digest & first,
-    const KeeperStorage::Digest & second,
+    const KeeperStorage::Digest & expected,
+    const KeeperStorage::Digest & actual,
     const Coordination::ZooKeeperRequest & request,
     bool committing)
 {
-    if (!KeeperStorage::checkDigest(first, second))
+    if (!KeeperStorage::checkDigest(expected, actual))
     {
         LOG_FATAL(
             getLogger("KeeperStateMachine"),
@@ -149,9 +149,9 @@ void assertDigest(
             "{}). Keeper will terminate to avoid inconsistencies.\nExtra information about the request:\n{}",
             committing ? "committing" : "preprocessing",
             request.getOpNum(),
-            first.value,
-            second.value,
-            first.version,
+            expected.value,
+            actual.value,
+            expected.version,
             request.toString());
         std::terminate();
     }
diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp
index 992d4ca8a95..f30cbb65182 100644
--- a/src/Coordination/KeeperStorage.cpp
+++ b/src/Coordination/KeeperStorage.cpp
@@ -174,7 +174,6 @@ uint64_t calculateDigest(std::string_view path, std::string_view data, const Kee
 
     hash.update(data);
 
-    hash.update(stat.czxid);
     hash.update(stat.czxid);
     hash.update(stat.mzxid);
     hash.update(stat.ctime);
@@ -183,7 +182,6 @@ uint64_t calculateDigest(std::string_view path, std::string_view data, const Kee
     hash.update(stat.cversion);
     hash.update(stat.aversion);
     hash.update(stat.ephemeralOwner);
-    hash.update(data.length());
     hash.update(stat.numChildren);
     hash.update(stat.pzxid);
 
@@ -2531,6 +2529,17 @@ void KeeperStorage::recalculateStats()
     container.recalculateDataSize();
 }
 
+bool KeeperStorage::checkDigest(const Digest & first, const Digest & second)
+{
+    if (first.version != second.version)
+        return true;
+
+    if (first.version == DigestVersion::NO_DIGEST)
+        return true;
+
+    return first.value == second.value;
+}
+
 String KeeperStorage::generateDigest(const String & userdata)
 {
     std::vector<String> user_password;
diff --git a/src/Coordination/KeeperStorage.h b/src/Coordination/KeeperStorage.h
index 01c1413a884..048adf3ffaa 100644
--- a/src/Coordination/KeeperStorage.h
+++ b/src/Coordination/KeeperStorage.h
@@ -95,10 +95,11 @@ public:
     {
         NO_DIGEST = 0,
         V1 = 1,
-        V2 = 2 // added system nodes that modify the digest on startup so digest from V0 is invalid
+        V2 = 2, // added system nodes that modify the digest on startup so digest from V0 is invalid
+        V3 = 3  // fixed bug with casting, removed duplicate czxid usage
     };
 
-    static constexpr auto CURRENT_DIGEST_VERSION = DigestVersion::V2;
+    static constexpr auto CURRENT_DIGEST_VERSION = DigestVersion::V3;
 
     struct ResponseForSession
     {
@@ -113,16 +114,7 @@ public:
         uint64_t value{0};
     };
 
-    static bool checkDigest(const Digest & first, const Digest & second)
-    {
-        if (first.version != second.version)
-            return true;
-
-        if (first.version == DigestVersion::NO_DIGEST)
-            return true;
-
-        return first.value == second.value;
-    }
+    static bool checkDigest(const Digest & first, const Digest & second);
 
     static String generateDigest(const String & userdata);
 
diff --git a/tests/config/config.d/keeper_port.xml b/tests/config/config.d/keeper_port.xml
index b87014d2485..3cf439a5bdf 100644
--- a/tests/config/config.d/keeper_port.xml
+++ b/tests/config/config.d/keeper_port.xml
@@ -4,6 +4,7 @@
         1
         1
+        1
         10000
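Moving `checkDigest()` out of the header does not change its semantics: digests computed under different `DigestVersion`s are deliberately treated as consistent, since they were produced by different hashing rules, and only same-version digests are actually compared. That is what lets the `V3` bump above roll out without tripping the fatal assertion during a mixed-version upgrade. A self-contained sketch of the rule (types simplified for illustration, not the real Keeper headers):

```cpp
#include <cstdint>
#include <cassert>

enum class DigestVersion : uint8_t { NO_DIGEST = 0, V1 = 1, V2 = 2, V3 = 3 };

struct Digest
{
    DigestVersion version{DigestVersion::NO_DIGEST};
    uint64_t value{0};
};

// Mirrors the logic moved into KeeperStorage.cpp above.
bool checkDigest(const Digest & first, const Digest & second)
{
    if (first.version != second.version)
        return true; // incomparable: computed under different hashing rules
    if (first.version == DigestVersion::NO_DIGEST)
        return true; // digests disabled, nothing to compare
    return first.value == second.value;
}

int main()
{
    assert(checkDigest({DigestVersion::V2, 1}, {DigestVersion::V3, 2}));  // version mismatch: skipped
    assert(checkDigest({DigestVersion::V3, 7}, {DigestVersion::V3, 7}));  // same version, equal values
    assert(!checkDigest({DigestVersion::V3, 1}, {DigestVersion::V3, 2})); // real inconsistency
}
```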
Shiryaev" Date: Wed, 31 Jan 2024 17:59:13 +0100 Subject: [PATCH 21/27] Post a failure status if can not run the CI --- tests/ci/run_check.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 108aa7d1946..a6312872c2a 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -1,11 +1,14 @@ #!/usr/bin/env python3 import atexit -import sys import logging +import sys from typing import Tuple +# isort: off from github import Github +# isort: on + from commit_status_helper import ( CI_STATUS_NAME, create_ci_report, @@ -18,12 +21,12 @@ from commit_status_helper import ( ) from env_helper import GITHUB_REPOSITORY, GITHUB_SERVER_URL from get_robot_token import get_best_robot_token -from pr_info import FORCE_TESTS_LABEL, PRInfo from lambda_shared_package.lambda_shared.pr import ( CATEGORY_TO_LABEL, TRUSTED_CONTRIBUTORS, check_pr_description, ) +from pr_info import FORCE_TESTS_LABEL, PRInfo from report import FAILURE TRUSTED_ORG_IDS = { @@ -146,7 +149,7 @@ def main(): ) post_commit_status( commit, - "failure", + FAILURE, url, format_description(description_error), PR_CHECK, @@ -170,6 +173,14 @@ def main(): # allow the workflow to continue if not can_run: + post_commit_status( + commit, + FAILURE, + "", + description, + PR_CHECK, + pr_info, + ) print("::notice ::Cannot run") sys.exit(1) From cd8e2075313208165992bf425ec2d51987be3346 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 31 Jan 2024 18:45:42 +0100 Subject: [PATCH 22/27] Fix tag --- tests/config/config.d/keeper_port.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/config/config.d/keeper_port.xml b/tests/config/config.d/keeper_port.xml index 3cf439a5bdf..b724d5dd87e 100644 --- a/tests/config/config.d/keeper_port.xml +++ b/tests/config/config.d/keeper_port.xml @@ -4,7 +4,7 @@ 1 1 - 1 + 1 10000 From 65cfbaaa4b6194937478e98c773ed6ed56a6d70f Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 31 Jan 2024 22:24:51 +0100 Subject: [PATCH 23/27] Safer Rust (catch panic with catch_unwind()) Crossing boundaries of multiple languages is tricky, but we can do at least something about this, in particular, use catch_unwind() [1] to catch possible panic!()s. 
From cd8e2075313208165992bf425ec2d51987be3346 Mon Sep 17 00:00:00 2001
From: Antonio Andelic
Date: Wed, 31 Jan 2024 18:45:42 +0100
Subject: [PATCH 22/27] Fix tag

---
 tests/config/config.d/keeper_port.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/config/config.d/keeper_port.xml b/tests/config/config.d/keeper_port.xml
index 3cf439a5bdf..b724d5dd87e 100644
--- a/tests/config/config.d/keeper_port.xml
+++ b/tests/config/config.d/keeper_port.xml
@@ -4,7 +4,7 @@
         1
         1
-        1
+        1
         10000

From 65cfbaaa4b6194937478e98c773ed6ed56a6d70f Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Wed, 31 Jan 2024 22:24:51 +0100
Subject: [PATCH 23/27] Safer Rust (catch panic with catch_unwind())

Crossing boundaries of multiple languages is tricky, but we can do at least
something about this, in particular, use catch_unwind() [1] to catch
possible panic!()s.

[1]: https://doc.rust-lang.org/std/panic/fn.catch_unwind.html

Signed-off-by: Azat Khuzhin
---
 rust/prql/src/lib.rs | 21 +++++++++++++++++++--
 rust/skim/src/lib.rs | 22 +++++++++++++++++++++-
 2 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/rust/prql/src/lib.rs b/rust/prql/src/lib.rs
index fb71d62d527..d51acfbd485 100644
--- a/rust/prql/src/lib.rs
+++ b/rust/prql/src/lib.rs
@@ -2,6 +2,7 @@ use prql_compiler::sql::Dialect;
 use prql_compiler::{Options, Target};
 use std::ffi::{c_char, CString};
 use std::slice;
+use std::panic;
 
 fn set_output(result: String, out: *mut *mut u8, out_size: *mut u64) {
     assert!(!out_size.is_null());
@@ -13,8 +14,7 @@ fn set_output(result: String, out: *mut *mut u8, out_size: *mut u64) {
     *out_ptr = CString::new(result).unwrap().into_raw() as *mut u8;
 }
 
-#[no_mangle]
-pub unsafe extern "C" fn prql_to_sql(
+pub unsafe extern "C" fn prql_to_sql_impl(
     query: *const u8,
     size: u64,
     out: *mut *mut u8,
     out_size: *mut u64,
 ) -> i64 {
@@ -50,6 +50,23 @@ pub unsafe extern "C" fn prql_to_sql(
     }
 }
 
+#[no_mangle]
+pub unsafe extern "C" fn prql_to_sql(
+    query: *const u8,
+    size: u64,
+    out: *mut *mut u8,
+    out_size: *mut u64,
+) -> i64 {
+    let ret = panic::catch_unwind(|| {
+        return prql_to_sql_impl(query, size, out, out_size);
+    });
+    return match ret {
+        // NOTE: using cxxbridge we can return proper Result<> type.
+        Err(_err) => 1,
+        Ok(res) => res,
+    }
+}
+
 #[no_mangle]
 pub unsafe extern "C" fn prql_free_pointer(ptr_to_free: *mut u8) {
     std::mem::drop(CString::from_raw(ptr_to_free as *mut c_char));
diff --git a/rust/skim/src/lib.rs b/rust/skim/src/lib.rs
index 2221ed63df4..a20b1b35033 100644
--- a/rust/skim/src/lib.rs
+++ b/rust/skim/src/lib.rs
@@ -1,6 +1,7 @@
 use skim::prelude::*;
 use term::terminfo::TermInfo;
 use cxx::{CxxString, CxxVector};
+use std::panic;
 
 #[cxx::bridge]
 mod ffi {
@@ -36,7 +37,7 @@ impl SkimItem for Item {
     }
 }
 
-fn skim(prefix: &CxxString, words: &CxxVector<CxxString>) -> Result<String, String> {
+fn skim_impl(prefix: &CxxString, words: &CxxVector<CxxString>) -> Result<String, String> {
     // Let's check is terminal available. To avoid panic.
     if let Err(err) = TermInfo::from_env() {
         return Err(format!("{}", err));
@@ -89,3 +90,22 @@ fn skim(prefix: &CxxString, words: &CxxVector<CxxString>) -> Result<String, String> {
+
+fn skim(prefix: &CxxString, words: &CxxVector<CxxString>) -> Result<String, String> {
+    let ret = panic::catch_unwind(|| {
+        return skim_impl(prefix, words);
+    });
+    return match ret {
+        Err(err) => {
+            let e = if let Some(s) = err.downcast_ref::<String>() {
+                format!("{}", s)
+            } else if let Some(s) = err.downcast_ref::<&str>() {
+                format!("{}", s)
+            } else {
+                format!("Unknown panic type: {:?}", err.type_id())
+            };
+            Err(format!("Rust panic: {:?}", e))
+        },
+        Ok(res) => res,
+    }
+}
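Standalone, the mechanism this patch relies on is simple to demonstrate: `std::panic::catch_unwind()` converts an unwinding panic into a `Result`, which is exactly what must happen before control returns across an FFI boundary, because letting a panic unwind into foreign (C++) stack frames is undefined behavior. A minimal self-contained sketch (not part of the patch):

```rust
use std::panic;

fn risky() -> i64 {
    // Stand-in for work that may panic deep inside a library.
    panic!("boom");
}

fn main() {
    // catch_unwind() stops the unwind here and hands back the panic payload.
    let ret = panic::catch_unwind(|| risky());
    match ret {
        Ok(v) => println!("ok: {v}"),
        Err(err) => {
            // Panic payloads are typically String or &str, as handled above.
            let msg = if let Some(s) = err.downcast_ref::<String>() {
                s.clone()
            } else if let Some(s) = err.downcast_ref::<&str>() {
                s.to_string()
            } else {
                "unknown panic payload".to_string()
            };
            println!("caught panic: {msg}");
        }
    }
}
```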
From ed3d5ce0a3249c884182a3461fd292a3ac516429 Mon Sep 17 00:00:00 2001
From: Sean Haynes
Date: Wed, 31 Jan 2024 21:52:52 +0000
Subject: [PATCH 24/27] Update bench docs to reflect removal of -p (port) short option

---
 docs/en/operations/utilities/clickhouse-benchmark.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en/operations/utilities/clickhouse-benchmark.md b/docs/en/operations/utilities/clickhouse-benchmark.md
index 8b7d7f85552..9261b22a538 100644
--- a/docs/en/operations/utilities/clickhouse-benchmark.md
+++ b/docs/en/operations/utilities/clickhouse-benchmark.md
@@ -45,11 +45,11 @@ clickhouse-benchmark [keys] < queries_file;
 - `-c N`, `--concurrency=N` — Number of queries that `clickhouse-benchmark` sends simultaneously. Default value: 1.
 - `-d N`, `--delay=N` — Interval in seconds between intermediate reports (to disable reports set 0). Default value: 1.
 - `-h HOST`, `--host=HOST` — Server host. Default value: `localhost`. For the [comparison mode](#clickhouse-benchmark-comparison-mode) you can use multiple `-h` keys.
-- `-p N`, `--port=N` — Server port. Default value: 9000. For the [comparison mode](#clickhouse-benchmark-comparison-mode) you can use multiple `-p` keys.
 - `-i N`, `--iterations=N` — Total number of queries. Default value: 0 (repeat forever).
 - `-r`, `--randomize` — Random order of queries execution if there is more than one input query.
 - `-s`, `--secure` — Using `TLS` connection.
 - `-t N`, `--timelimit=N` — Time limit in seconds. `clickhouse-benchmark` stops sending queries when the specified time limit is reached. Default value: 0 (time limit disabled).
+- `--port=N` — Server port. Default value: 9000. For the [comparison mode](#clickhouse-benchmark-comparison-mode) you can use multiple `-p` keys.
 - `--confidence=N` — Level of confidence for T-test. Possible values: 0 (80%), 1 (90%), 2 (95%), 3 (98%), 4 (99%), 5 (99.5%). Default value: 5. In the [comparison mode](#clickhouse-benchmark-comparison-mode) `clickhouse-benchmark` performs the [Independent two-sample Student’s t-test](https://en.wikipedia.org/wiki/Student%27s_t-test#Independent_two-sample_t-test) to determine whether the two distributions aren’t different with the selected level of confidence.
 - `--cumulative` — Printing cumulative data instead of data per interval.
 - `--database=DATABASE_NAME` — ClickHouse database name. Default value: `default`.

From 31e99ee61f6d80410653dcd73cdec213331303d8 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Thu, 1 Feb 2024 01:01:59 +0300
Subject: [PATCH 25/27] Update clickhouse-benchmark.md

---
 docs/en/operations/utilities/clickhouse-benchmark.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en/operations/utilities/clickhouse-benchmark.md b/docs/en/operations/utilities/clickhouse-benchmark.md
index 9261b22a538..6d5148ad965 100644
--- a/docs/en/operations/utilities/clickhouse-benchmark.md
+++ b/docs/en/operations/utilities/clickhouse-benchmark.md
@@ -49,7 +49,7 @@ clickhouse-benchmark [keys] < queries_file;
 - `-r`, `--randomize` — Random order of queries execution if there is more than one input query.
 - `-s`, `--secure` — Using `TLS` connection.
 - `-t N`, `--timelimit=N` — Time limit in seconds. `clickhouse-benchmark` stops sending queries when the specified time limit is reached. Default value: 0 (time limit disabled).
-- `--port=N` — Server port. Default value: 9000. For the [comparison mode](#clickhouse-benchmark-comparison-mode) you can use multiple `-p` keys.
+- `--port=N` — Server port. Default value: 9000. For the [comparison mode](#clickhouse-benchmark-comparison-mode) you can use multiple `--port` keys.
 - `--confidence=N` — Level of confidence for T-test. Possible values: 0 (80%), 1 (90%), 2 (95%), 3 (98%), 4 (99%), 5 (99.5%). Default value: 5. In the [comparison mode](#clickhouse-benchmark-comparison-mode) `clickhouse-benchmark` performs the [Independent two-sample Student’s t-test](https://en.wikipedia.org/wiki/Student%27s_t-test#Independent_two-sample_t-test) to determine whether the two distributions aren’t different with the selected level of confidence.
 - `--cumulative` — Printing cumulative data instead of data per interval.
 - `--database=DATABASE_NAME` — ClickHouse database name. Default value: `default`.
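With the short `-p` option removed, comparison mode is driven entirely by repeating the long keys. A hypothetical invocation comparing two local servers (hosts, ports, and the query are placeholders, following the `echo ... | clickhouse-benchmark` form the docs already use):

```bash
echo "SELECT count() FROM numbers(10000000)" | clickhouse-benchmark \
    --host=localhost --port=9000 \
    --host=localhost --port=9001 \
    -i 100 --confidence=5
```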
From 009c0dc136b8e74c41784011d77e85ce18ff7e8d Mon Sep 17 00:00:00 2001
From: Duc Canh Le
Date: Thu, 1 Feb 2024 05:27:24 +0000
Subject: [PATCH 26/27] fix 02720_row_policy_column_with_dots

Signed-off-by: Duc Canh Le
---
 .../0_stateless/02720_row_policy_column_with_dots.reference   | 2 +-
 .../queries/0_stateless/02720_row_policy_column_with_dots.sql | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/queries/0_stateless/02720_row_policy_column_with_dots.reference b/tests/queries/0_stateless/02720_row_policy_column_with_dots.reference
index dd2c30cc9f8..3856762b151 100644
--- a/tests/queries/0_stateless/02720_row_policy_column_with_dots.reference
+++ b/tests/queries/0_stateless/02720_row_policy_column_with_dots.reference
@@ -1 +1 @@
-2024-01-01 Hello World
+2124-01-01 Hello World
diff --git a/tests/queries/0_stateless/02720_row_policy_column_with_dots.sql b/tests/queries/0_stateless/02720_row_policy_column_with_dots.sql
index 361bd0e0ec7..732a2f1ebd3 100644
--- a/tests/queries/0_stateless/02720_row_policy_column_with_dots.sql
+++ b/tests/queries/0_stateless/02720_row_policy_column_with_dots.sql
@@ -1,6 +1,6 @@
 CREATE table if not exists table_with_dot_column (date Date, regular_column String, `other_column.2` String) ENGINE = MergeTree() ORDER BY date;
 INSERT INTO table_with_dot_column select '2020-01-01', 'Hello', 'World';
-INSERT INTO table_with_dot_column select '2024-01-01', 'Hello', 'World';
-CREATE ROW POLICY IF NOT EXISTS row_policy ON table_with_dot_column USING toDate(date) >= today() - 30 TO ALL;
+INSERT INTO table_with_dot_column select '2124-01-01', 'Hello', 'World';
+CREATE ROW POLICY IF NOT EXISTS row_policy ON table_with_dot_column USING toDate(date) >= '2123-01-01' TO ALL;
 SELECT * FROM table_with_dot_column;
 DROP TABLE table_with_dot_column;
From 500e8e505d0bb3531c6fc37587fc9121a929ed45 Mon Sep 17 00:00:00 2001
From: Duc Canh Le
Date: Thu, 1 Feb 2024 08:03:19 +0000
Subject: [PATCH 27/27] better

Signed-off-by: Duc Canh Le
---
 .../0_stateless/02720_row_policy_column_with_dots.reference   |  2 +-
 .../queries/0_stateless/02720_row_policy_column_with_dots.sql | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/queries/0_stateless/02720_row_policy_column_with_dots.reference b/tests/queries/0_stateless/02720_row_policy_column_with_dots.reference
index 3856762b151..d00491fd7e5 100644
--- a/tests/queries/0_stateless/02720_row_policy_column_with_dots.reference
+++ b/tests/queries/0_stateless/02720_row_policy_column_with_dots.reference
@@ -1 +1 @@
-2124-01-01 Hello World
+1
diff --git a/tests/queries/0_stateless/02720_row_policy_column_with_dots.sql b/tests/queries/0_stateless/02720_row_policy_column_with_dots.sql
index 732a2f1ebd3..fcb0bf62859 100644
--- a/tests/queries/0_stateless/02720_row_policy_column_with_dots.sql
+++ b/tests/queries/0_stateless/02720_row_policy_column_with_dots.sql
@@ -1,6 +1,6 @@
-CREATE table if not exists table_with_dot_column (date Date, regular_column String, `other_column.2` String) ENGINE = MergeTree() ORDER BY date;
-INSERT INTO table_with_dot_column select '2020-01-01', 'Hello', 'World';
-INSERT INTO table_with_dot_column select '2124-01-01', 'Hello', 'World';
-CREATE ROW POLICY IF NOT EXISTS row_policy ON table_with_dot_column USING toDate(date) >= '2123-01-01' TO ALL;
-SELECT * FROM table_with_dot_column;
+CREATE TABLE IF NOT EXISTS table_with_dot_column (date Date, regular_column String, `other_column.2` String) ENGINE = MergeTree() ORDER BY date;
+INSERT INTO table_with_dot_column SELECT '2020-01-01', 'Hello', 'World';
+INSERT INTO table_with_dot_column SELECT toDate(now() + 48*3600), 'Hello', 'World';
+CREATE ROW POLICY IF NOT EXISTS row_policy ON table_with_dot_column USING toDate(date) >= today() - 30 TO ALL;
+SELECT count(*) FROM table_with_dot_column;
 DROP TABLE table_with_dot_column;
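The final shape of the test sidesteps both earlier problems: the second insert is always two days in the future relative to `now()`, so the `today() - 30` policy deterministically keeps exactly that one row, and asserting on `count(*)` keeps the reference file free of any hard-coded date. The same behavior in isolation (a sketch with a hypothetical table name, not the test itself):

```sql
CREATE TABLE t (date Date, s String) ENGINE = MergeTree ORDER BY date;
INSERT INTO t VALUES ('2020-01-01', 'old');          -- filtered out by the policy
INSERT INTO t SELECT toDate(now() + 48*3600), 'new'; -- always inside the 30-day window
CREATE ROW POLICY p ON t USING toDate(date) >= today() - 30 TO ALL;
SELECT count(*) FROM t; -- returns 1
DROP ROW POLICY p ON t;
DROP TABLE t;
```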