diff --git a/README.md b/README.md
index fa8df29468e..3b5209dcbe9 100644
--- a/README.md
+++ b/README.md
@@ -34,7 +34,7 @@ curl https://clickhouse.com/ | sh
 
 Every month we get together with the community (users, contributors, customers, those interested in learning more about ClickHouse) to discuss what is coming in the latest release. If you are interested in sharing what you've built on ClickHouse, let us know.
 
-* [v24.9 Community Call](https://clickhouse.com/company/events/v24-9-community-release-call) - September 26
+* [v24.10 Community Call](https://clickhouse.com/company/events/v24-10-community-release-call) - October 31
 
 ## Upcoming Events
diff --git a/ci_v2/docker/style-test/Dockerfile b/ci_v2/docker/style-test/Dockerfile
new file mode 100644
index 00000000000..165cdc3dcb1
--- /dev/null
+++ b/ci_v2/docker/style-test/Dockerfile
@@ -0,0 +1,17 @@
+# docker build -t clickhouse/style-test .
+FROM ubuntu:22.04
+
+RUN apt-get update && env DEBIAN_FRONTEND=noninteractive apt-get install --yes \
+    aspell \
+    libxml2-utils \
+    python3-pip \
+    locales \
+    git \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/*
+
+RUN echo "en_US.UTF-8 UTF-8" > /etc/locale.gen && locale-gen en_US.UTF-8
+ENV LC_ALL=en_US.UTF-8
+
+COPY requirements.txt /
+RUN pip3 install --no-cache-dir -r requirements.txt
diff --git a/ci_v2/docker/style-test/requirements.txt b/ci_v2/docker/style-test/requirements.txt
new file mode 100644
index 00000000000..987b014d9ba
--- /dev/null
+++ b/ci_v2/docker/style-test/requirements.txt
@@ -0,0 +1,4 @@
+requests==2.32.3
+yamllint==1.26.3
+codespell==2.2.1
+https://clickhouse-builds.s3.amazonaws.com/packages/praktika-0.1-py3-none-any.whl
diff --git a/ci_v2/jobs/check_style.py b/ci_v2/jobs/check_style.py
new file mode 100644
index 00000000000..4dd3864e865
--- /dev/null
+++ b/ci_v2/jobs/check_style.py
@@ -0,0 +1,410 @@
+import math
+import multiprocessing
+import os
+import re
+import sys
+from concurrent.futures import ProcessPoolExecutor
+from pathlib import Path
+
+from praktika.result import Result
+from praktika.utils import Shell, Utils
+
+NPROC = multiprocessing.cpu_count()
+
+
+def chunk_list(data, n):
+    """Split the data list into n nearly equal-sized chunks."""
+    chunk_size = math.ceil(len(data) / n)
+    for i in range(0, len(data), chunk_size):
+        yield data[i : i + chunk_size]
+
+
+def run_check_concurrent(check_name, check_function, files, nproc=NPROC):
+    stop_watch = Utils.Stopwatch()
+
+    if not files:
+        print(f"File list is empty [{files}]")
+        raise ValueError(f"File list is empty [{files}]")
+
+    file_chunks = list(chunk_list(files, nproc))
+    results = []
+
+    # Run check_function concurrently on each chunk
+    with ProcessPoolExecutor(max_workers=nproc) as executor:
+        futures = [executor.submit(check_function, chunk) for chunk in file_chunks]
+        # Wait for the results and collect reported errors
+        for future in futures:
+            try:
+                res = future.result()
+                if res and res not in results:
+                    results.append(res)
+            except Exception as e:
+                results.append(f"Exception in {check_name}: {e}")
+
+    result = Result(
+        name=check_name,
+        status=Result.Status.SUCCESS if not results else Result.Status.FAILED,
+        start_time=stop_watch.start_time,
+        duration=stop_watch.duration,
+        info=f"errors: {results}" if results else "",
+    )
+    return result
+
+
+def run_simple_check(check_name, check_function, **kwargs):
+    stop_watch = Utils.Stopwatch()
+
+    error = check_function(**kwargs)
+
+    result = Result(
+        name=check_name,
+        status=Result.Status.SUCCESS if not error else Result.Status.FAILED,
+        start_time=stop_watch.start_time,
+        duration=stop_watch.duration,
+        info=error,
+    )
+    return result
+
+
+def run_check(check_name, check_function, files):
+    return run_check_concurrent(check_name, check_function, files, nproc=1)
+
+
+def check_duplicate_includes(file_path):
+    includes = []
+    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
+        for line in f:
+            if re.match(r"^#include ", line):
+                includes.append(line.strip())
+
+    include_counts = {line: includes.count(line) for line in includes}
+    duplicates = {line: count for line, count in include_counts.items() if count > 1}
+
+    if duplicates:
+        return f"{file_path}: {duplicates}"
+    return ""
+
+
+def check_whitespaces(file_paths):
+    for file in file_paths:
+        exit_code, out, err = Shell.get_res_stdout_stderr(
+            f'./ci_v2/jobs/scripts/check_style/double_whitespaces.pl "{file}"',
+            verbose=False,
+        )
+        if out or err:
+            return out + " err: " + err
+    return ""
+
+
+def check_yamllint(file_paths):
+    file_paths = " ".join([f"'{file}'" for file in file_paths])
+    exit_code, out, err = Shell.get_res_stdout_stderr(
+        f"yamllint --config-file=./.yamllint {file_paths}", verbose=False
+    )
+    return out or err
+
+
+def check_xmllint(file_paths):
+    if not isinstance(file_paths, list):
+        file_paths = [file_paths]
+    file_paths = " ".join([f"'{file}'" for file in file_paths])
+    exit_code, out, err = Shell.get_res_stdout_stderr(
+        f"xmllint --noout --nonet {file_paths}", verbose=False
+    )
+    return out or err
+
+
+def check_functional_test_cases(files):
+    """
+    Queries with event_date should have yesterday() not today()
+    NOTE: it is not that accurate, but at least something.
+    """
+
+    patterns = [
+        re.compile(
+            r"where.*?\bevent_date\s*(=|>=)\s*today\(\)(?!\s*-\s*1)",
+            re.IGNORECASE | re.DOTALL,
+        )
+    ]
+
+    errors = []
+    for test_case in files:
+        try:
+            with open(test_case, "r", encoding="utf-8", errors="replace") as f:
+                file_content = " ".join(
+                    f.read().splitlines()
+                )  # Combine lines into a single string
+
+            # Check if any pattern matches in the concatenated string
+            if any(pattern.search(file_content) for pattern in patterns):
+                errors.append(
+                    f"event_date should be filtered using >=yesterday() in {test_case} (to avoid flakiness)"
+                )
+
+        except Exception as e:
+            errors.append(f"Error checking {test_case}: {e}")
+
+    for test_case in files:
+        if "fail" in test_case:
+            errors.append(f"test case {test_case} includes 'fail' in its name")
+
+    return " ".join(errors)
+
+
+def check_gaps_in_tests_numbers(file_paths, gap_threshold=100):
+    test_numbers = set()
+
+    pattern = re.compile(r"(\d+)")
+
+    for file in file_paths:
+        file_name = os.path.basename(file)
+        match = pattern.search(file_name)
+        if match:
+            test_numbers.add(int(match.group(1)))
+
+    sorted_numbers = sorted(test_numbers)
+    large_gaps = []
+    for i in range(1, len(sorted_numbers)):
+        prev_num = sorted_numbers[i - 1]
+        next_num = sorted_numbers[i]
+        diff = next_num - prev_num
+        if diff >= gap_threshold:
+            large_gaps.append(f"Gap ({prev_num}, {next_num}) > {gap_threshold}")
+
+    return large_gaps
+
+
+def check_broken_links(path, exclude_paths):
+    broken_symlinks = []
+
+    for candidate in Path(path).rglob("*"):
+        if any(exclude_path in str(candidate) for exclude_path in exclude_paths):
+            continue
+        if candidate.is_symlink():
+            if not candidate.exists():
+                broken_symlinks.append(str(candidate))
+
+    if broken_symlinks:
+        for symlink in broken_symlinks:
+            print(symlink)
+        return f"Broken symlinks found: {broken_symlinks}"
+    else:
+        return ""
+
+
+def check_cpp_code():
+    res, out, err = Shell.get_res_stdout_stderr(
+        "./ci_v2/jobs/scripts/check_style/check_cpp.sh"
+    )
+    if err:
+        out += err
+    return out
+
+
+def check_repo_submodules():
+    res, out, err = Shell.get_res_stdout_stderr(
+        "./ci_v2/jobs/scripts/check_style/check_submodules.sh"
+    )
+    if err:
+        out += err
+    return out
+
+
+def check_other():
+    res, out, err = Shell.get_res_stdout_stderr(
+        "./ci_v2/jobs/scripts/check_style/checks_to_refactor.sh"
+    )
+    if err:
+        out += err
+    return out
+
+
+def check_codespell():
+    res, out, err = Shell.get_res_stdout_stderr(
+        "./ci_v2/jobs/scripts/check_style/check_typos.sh"
+    )
+    if err:
+        out += err
+    return out
+
+
+def check_aspell():
+    res, out, err = Shell.get_res_stdout_stderr(
+        "./ci_v2/jobs/scripts/check_style/check_aspell.sh"
+    )
+    if err:
+        out += err
+    return out
+
+
+def check_mypy():
+    res, out, err = Shell.get_res_stdout_stderr(
+        "./ci_v2/jobs/scripts/check_style/check-mypy"
+    )
+    if err:
+        out += err
+    return out
+
+
+def check_pylint():
+    res, out, err = Shell.get_res_stdout_stderr(
+        "./ci_v2/jobs/scripts/check_style/check-pylint"
+    )
+    if err:
+        out += err
+    return out
+
+
+def check_file_names(files):
+    files_set = set()
+    for file in files:
+        file_ = file.lower()
+        if file_ in files_set:
+            return f"Non-unique file name when lowercased: {file}"
+        files_set.add(file_)
+    return ""
+
+
+if __name__ == "__main__":
+    results = []
+    stop_watch = Utils.Stopwatch()
+
+    all_files = Utils.traverse_paths(
+        include_paths=["."],
+        exclude_paths=[
+            "./.git",
+            "./contrib",
+            "./build",
+        ],
+        not_exists_ok=True,  # ./build may exist when running locally
+    )
+
+    cpp_files = Utils.traverse_paths(
+        include_paths=["./src", "./base", "./programs", "./utils"],
+        exclude_paths=[
+            "./base/glibc-compatibility",
+            "./contrib/consistent-hashing",
+            "./base/widechar_width",
+        ],
+        file_suffixes=[".h", ".cpp"],
+    )
+
+    yaml_workflow_files = Utils.traverse_paths(
+        include_paths=["./.github"],
+        exclude_paths=[],
+        file_suffixes=[".yaml", ".yml"],
+    )
+
+    xml_files = Utils.traverse_paths(
+        include_paths=["."],
+        exclude_paths=["./.git", "./contrib/"],
+        file_suffixes=[".xml"],
+    )
+
+    functional_test_files = Utils.traverse_paths(
+        include_paths=["./tests/queries"],
+        exclude_paths=[],
+        file_suffixes=[".sql", ".sh", ".py", ".j2"],
+    )
+
+    results.append(
+        Result(
+            name="Read Files",
+            status=Result.Status.SUCCESS,
+            start_time=stop_watch.start_time,
+            duration=stop_watch.duration,
+        )
+    )
+
+    results.append(
+        run_check_concurrent(
+            check_name="Whitespace Check",
+            check_function=check_whitespaces,
+            files=cpp_files,
+        )
+    )
+    results.append(
+        run_check_concurrent(
+            check_name="YamlLint Check",
+            check_function=check_yamllint,
+            files=yaml_workflow_files,
+        )
+    )
+    results.append(
+        run_check_concurrent(
+            check_name="XmlLint Check",
+            check_function=check_xmllint,
+            files=xml_files,
+        )
+    )
+    results.append(
+        run_check_concurrent(
+            check_name="Functional Tests scripts smoke check",
+            check_function=check_functional_test_cases,
+            files=functional_test_files,
+        )
+    )
+    results.append(
+        run_check(
+            check_name="Check Tests Numbers",
+            check_function=check_gaps_in_tests_numbers,
+            files=functional_test_files,
+        )
+    )
+    results.append(
+        run_simple_check(
+            check_name="Check Broken Symlinks",
+            check_function=check_broken_links,
+            path="./",
+            exclude_paths=["contrib/", "metadata/", "programs/server/data"],
+        )
+    )
+    results.append(
+        run_simple_check(
+            check_name="Check CPP code",
+            check_function=check_cpp_code,
+        )
+    )
+    results.append(
+        run_simple_check(
+            check_name="Check 
Submodules", + check_function=check_repo_submodules, + ) + ) + results.append( + run_check( + check_name="Check File Names", + check_function=check_file_names, + files=all_files, + ) + ) + results.append( + run_simple_check( + check_name="Check Many Different Things", + check_function=check_other, + ) + ) + results.append( + run_simple_check( + check_name="Check Codespell", + check_function=check_codespell, + ) + ) + results.append( + run_simple_check( + check_name="Check Aspell", + check_function=check_aspell, + ) + ) + + res = Result.create_from(results=results, stopwatch=stop_watch).dump() + + if not res.is_ok(): + print("Style check: failed") + for result in results: + if not result.is_ok(): + print("Failed check:") + print(" | ", result) + sys.exit(1) + else: + print("Style check: ok") diff --git a/ci_v2/jobs/scripts/check_style/aspell-ignore/en/aspell-dict.txt b/ci_v2/jobs/scripts/check_style/aspell-ignore/en/aspell-dict.txt new file mode 100644 index 00000000000..8ec2e001a73 --- /dev/null +++ b/ci_v2/jobs/scripts/check_style/aspell-ignore/en/aspell-dict.txt @@ -0,0 +1,3050 @@ +personal_ws-1.1 en 2984 +AArch +ACLs +ALTERs +AMPLab +AMQP +ANNIndex +ANNIndexes +ANOVA +AORM +APIs +ARMv +ASLR +ASOF +ASan +AWND +AWST +Actian +ActionsMenu +ActiveRecord +AddressSanitizer +AggregateFunction +Aggregatefunction +AggregatingMergeTree +AggregatorThreads +AggregatorThreadsActive +Akka +AlertManager +Alexey +AnyEvent +AppleClang +Approximative +ArrayJoin +ArrowStream +AsyncInsertCacheSize +AsynchronousHeavyMetricsCalculationTimeSpent +AsynchronousHeavyMetricsUpdateInterval +AsynchronousInsert +AsynchronousInsertThreads +AsynchronousInsertThreadsActive +AsynchronousMetricsCalculationTimeSpent +AsynchronousMetricsUpdateInterval +AsynchronousReadWait +Authenticator +Authenticators +AutoFDO +AutoML +Autocompletion +AvroConfluent +AzureQueue +BIGINT +BIGSERIAL +BORO +BSON +BSONEachRow +BackgroundBufferFlushSchedulePool +BackgroundBufferFlushSchedulePoolSize +BackgroundBufferFlushSchedulePoolTask +BackgroundCommonPoolSize +BackgroundCommonPoolTask +BackgroundDistributedSchedulePool +BackgroundDistributedSchedulePoolSize +BackgroundDistributedSchedulePoolTask +BackgroundFetchesPoolSize +BackgroundFetchesPoolTask +BackgroundMergesAndMutationsPoolSize +BackgroundMergesAndMutationsPoolTask +BackgroundMessageBrokerSchedulePoolSize +BackgroundMessageBrokerSchedulePoolTask +BackgroundMovePoolSize +BackgroundMovePoolTask +BackgroundProcessingPool +BackgroundSchedulePool +BackgroundSchedulePoolSize +BackgroundSchedulePoolTask +BackupsIO +BackupsIOThreads +BackupsIOThreadsActive +BackupsThreads +BackupsThreadsActive +BestEffort +BestEffortOrNull +BestEffortOrZero +BestEffortUS +BestEffortUSOrNull +BestEffortUSOrZero +Blazingly +BlockActiveTime +BlockDiscardBytes +BlockDiscardMerges +BlockDiscardOps +BlockDiscardTime +BlockInFlightOps +BlockQueueTime +BlockReadBytes +BlockReadMerges +BlockReadOps +BlockReadTime +BlockWriteBytes +BlockWriteMerges +BlockWriteOps +BlockWriteTime +Boncz +Bool +BrokenDistributedFilesToInsert +Bugfix +BuildID +BuilderBinAarch +BuilderBinAmd +Bytebase +CCTOOLS +CDATA +CDFs +CDMA +CESU +CIDR +CIDRToRange +CKMAN +CKibana +CLOB +CLion +CMPLNT +CMake +CMakeLists +CODECS +COVID +CPUFrequencyMHz +CPUs +CSVWithNames +CSVWithNamesAndTypes +CSVs +CTEs +CacheDetachedFileSegments +CacheDictionaries +CacheDictionary +CacheDictionaryThreads +CacheDictionaryThreadsActive +CacheDictionaryUpdateQueueBatches +CacheDictionaryUpdateQueueKeys +CacheFileSegments +CamelCase +Cap'n +CapContains +CapUnion 
+CapnProto +CatBoost +CellAreaM +CellAreaRads +CellsIntersect +CentOS +CertificateHandler +Chadmin +ChannelID +Cidr +Ciphertext +CityHash +Clangd +ClickBench +ClickCat +ClickHouse +ClickHouse's +ClickHouseClient +ClickHouseMigrator +ClickHouseNIO +ClickHouseVapor +ClickVisual +ClickableSquare +CloudAvailableBadge +CloudDetails +CloudNotSupportedBadge +CloudStorage +CodeBlock +CodeLLDB +Codecs +CollapsingMergeTree +Combinators +Compat +CompiledExpressionCacheBytes +CompiledExpressionCacheCount +ComplexKeyCache +ComplexKeyDirect +ComplexKeyHashed +Composable +Config +ConnectionDetails +Const +ContextLockWait +Contrib +CountMin +Covid +Cramer's +Criteo +Crotty +Crowdsourced +Ctrl +CurrentMetrics +CustomSeparated +CustomSeparatedWithNames +CustomSeparatedWithNamesAndTypes +DBAs +DBMSs +DBeaver +DD +DDLWORKER +DDLWorker +DDLWorkerThreads +DDLWorkerThreadsActive +DECRYPT +DELETEs +DESC +DIEs +DOGEFI +Damerau +DataGrip +DataLens +DataTime +DataTypes +DatabaseCatalog +DatabaseCatalogThreads +DatabaseCatalogThreadsActive +DatabaseOnDisk +DatabaseOnDiskThreads +DatabaseOnDiskThreadsActive +DatabaseOrdinaryThreads +DatabaseOrdinaryThreadsActive +DateTime +DateTimes +DbCL +Decrypted +Deduplicate +Deduplication +DelayedInserts +DeliveryTag +DeltaLake +Denormalize +DestroyAggregatesThreads +DestroyAggregatesThreadsActive +DictCacheRequests +DiskAvailable +DiskObjectStorage +DiskObjectStorageAsyncThreads +DiskObjectStorageAsyncThreadsActive +DiskSpaceReservedForMerge +DiskTotal +DiskUnreserved +DiskUsed +DistributedFilesToInsert +DistributedSend +DockerHub +DoubleDelta +Doxygen +Dresseler +Durre +ECMA +ETag +Ecto +EdgeAngle +EdgeLengthKm +EdgeLengthM +ElasticSearch +EmbeddedRocksDB +Embeddings +Encodings +Enum +Enums +Eoan +EphemeralNode +Ethereum +ExactEdgeLengthKm +ExactEdgeLengthM +ExactEdgeLengthRads +ExecutablePool +ExtType +ExternalDistributed +FFFD +FFFFFFFF +FIPS +FOSDEM +FQDN +Failover +FarmHash +FileCluster +FileLog +FilesystemCacheBytes +FilesystemCacheElements +FilesystemCacheFiles +FilesystemCacheReadBuffers +FilesystemCacheSize +FilesystemLogsPathAvailableBytes +FilesystemLogsPathAvailableINodes +FilesystemLogsPathTotalBytes +FilesystemLogsPathTotalINodes +FilesystemLogsPathUsedBytes +FilesystemLogsPathUsedINodes +FilesystemMainPathAvailableBytes +FilesystemMainPathAvailableINodes +FilesystemMainPathTotalBytes +FilesystemMainPathTotalINodes +FilesystemMainPathUsedBytes +FilesystemMainPathUsedINodes +FixedString +FlameGraph +Flink +ForEach +FreeBSD +Fuzzer +Fuzzers +GHCN +GTID +GTest +Gb +Gbit +Gcc +GenerateRandom +GeoCoord +Geobases +Geohash +Geoid +GetBaseCell +GetDestinationIndexFromUnidirectionalEdge +GetFaces +GetIndexesFromUnidirectionalEdge +GetNeighbors +GetOriginIndexFromUnidirectionalEdge +GetPentagonIndexes +GetRes +GetResolution +GetUnidirectionalEdge +GetUnidirectionalEdgeBoundary +GetUnidirectionalEdgesFromHexagon +GitLab +GlobalThread +GlobalThreadActive +GoLand +GoogleTest +Grafana +GraphQL +GraphiteMergeTree +Greenwald +HDDs +HHMM +HMAC +HNSW +HSTS +HTTPConnection +HTTPThreads +HashedDictionary +HashedDictionaryThreads +HashedDictionaryThreadsActive +Haversine +Heredoc +HexAreaKm +HexAreaM +HexRing +Holistics +Homebrew +Homebrew's +HorizontalDivide +Hostname +HouseOps +Hudi +HyperLogLog +Hypot +IANA +IDE +IDEs +IDNA +IMDS +INFILE +INSERTed +INSERTs +IOPrefetchThreads +IOPrefetchThreadsActive +IOThreads +IOThreadsActive +IOUringInFlightEvents +IOUringPendingEvents +IOWriterThreads +IOWriterThreadsActive +IPTrie +IProcessor +IPv +Identifiant +Incrementing +IndexesAreNeighbors 
+InfluxDB +Instana +IntN +Integrations +IntelliJ +IntelliSense +InterserverConnection +InterserverThreads +IntervalDay +IntervalHour +IntervalMicrosecond +IntervalMillisecond +IntervalMilliseconds +IntervalMinute +IntervalMonth +IntervalNanosecond +IntervalQuarter +IntervalSecond +IntervalWeek +IntervalYear +IsPentagon +IsResClassIII +IsValid +JBOD +JOINed +JOINs +JSONAllPaths +JSONAllPathsWithTypes +JSONArrayLength +JSONAsObject +JSONAsString +JSONColumns +JSONColumnsWithMetadata +JSONCompact +JSONCompactColumns +JSONCompactEachRow +JSONCompactEachRowWithNames +JSONCompactEachRowWithNamesAndTypes +JSONCompactStrings +JSONCompactStringsEachRow +JSONCompactStringsEachRowWithNames +JSONCompactStringsEachRowWithNamesAndTypes +JSONCompactWithProgress +JSONDynamicPaths +JSONDynamicPathsWithTypes +JSONEachRow +JSONEachRowWithProgress +JSONExtract +JSONExtractArrayRaw +JSONExtractBool +JSONExtractFloat +JSONExtractInt +JSONExtractKeys +JSONExtractKeysAndValues +JSONExtractKeysAndValuesRaw +JSONExtractRaw +JSONExtractString +JSONExtractUInt +JSONHas +JSONLength +JSONObjectEachRow +JSONSharedDataPaths +JSONSharedDataPathsWithTypes +JSONStrings +JSONStringsEachRow +JSONStringsEachRowWithProgress +JSONType +JSONs +Jaeger +Jannis +Jaro +JavaHash +Jemalloc +Jepsen +JetBrains +Jitter +Joda +JumpConsistentHash +Jupyter +KDevelop +KafkaAssignedPartitions +KafkaBackgroundReads +KafkaConsumers +KafkaConsumersInUse +KafkaConsumersWithAssignment +KafkaLibrdkafkaThreads +KafkaProducers +KafkaWrites +Kahan +Kaser +KeeperAliveConnections +KeeperMap +KeeperOutstandingRequests +Kerberos +Khanna +Kibana +KittenHouse +Klickhouse +Kolmogorov +Konstantin +Korzeniewski +Kubernetes +LDAP +LGPL +LIMITs +LINEITEM +LLDB +LLVM's +LOCALTIME +LOCALTIMESTAMP +LONGLONG +LOONGARCH +Lemire +Levenshtein +Liao +LibFuzzer +LightHouse +LineAsString +LineString +Linf +LinfDistance +LinfNorm +LinfNormalize +LinksDeployment +Linq +LoadAverage +LocalThread +LocalThreadActive +LogQL +Logstash +LookML +LoongArch +LowCardinality +LpDistance +LpNorm +LpNormalize +Luebbe +Lyft +MACNumToString +MACStringToNum +MACStringToOUI +MEDIUMINT +MEMTABLE +MMapCacheCells +MMappedAllocBytes +MMappedAllocs +MMappedFileBytes +MMappedFiles +MSSQL +MSan +MVCC +MacBook +MacOS +MapState +MarkCacheBytes +MarkCacheFiles +MarksLoaderThreads +MarksLoaderThreadsActive +MaterializedMySQL +MaterializedPostgreSQL +MaterializedView +MaxDDLEntryID +MaxMind +MaxPartCountForPartition +MaxPushedDDLEntryID +Mbps +McNeal +Memcheck +MemoryCode +MemoryDataAndStack +MemoryResident +MemoryResidentMax +MemorySanitizer +MemoryShared +MemoryTracking +MemoryVirtual +Menne +MergeJoin +MergeState +MergeTree +MergeTreeAllRangesAnnouncementsSent +MergeTreeBackgroundExecutor +MergeTreeBackgroundExecutorThreads +MergeTreeBackgroundExecutorThreadsActive +MergeTreeDataSelectExecutor +MergeTreeDataSelectExecutorThreads +MergeTreeDataSelectExecutorThreadsActive +MergeTreePartsCleanerThreads +MergeTreePartsCleanerThreadsActive +MergeTreePartsLoaderThreads +MergeTreePartsLoaderThreadsActive +MergeTreeReadTaskRequestsSent +MergeTreeSettings +MessagePack +Metastore +MetroHash +MiB +Milli +Milovidov +MinHash +MinIO +MinMax +MindsDB +Mongo +Mongodb +Monotonicity +MsgPack +MultiLineString +MultiPolygon +Multiline +Multiqueries +Multithreading +Multiword +MurmurHash +MySQLConnection +MySQLDump +MySQLThreads +NATS +NCHAR +NDJSON +NEKUDOTAYIM +NEWDATE +NEWDECIMAL +NFKC +NFKD +NOAA +NULLIF +NVME +NVMe +NYPD +NaNs +Nagios +Nambiar +Namenode +NamesAndTypesList +Nano +Nesterov +NetworkReceive 
+NetworkReceiveBytes +NetworkReceiveDrop +NetworkReceiveErrors +NetworkReceivePackets +NetworkSend +NetworkSendBytes +NetworkSendDrop +NetworkSendErrors +NetworkSendPackets +Noaa +NodeJs +NonMonotonic +NuRaft +NumHexagons +NumPy +NumToString +NumToStringClassC +NumberOfDatabases +NumberOfDetachedByUserParts +NumberOfDetachedParts +NumberOfTables +OFNS +OLAP +OLTP +OSContextSwitches +OSGuestNiceTime +OSGuestNiceTimeCPU +OSGuestNiceTimeNormalized +OSGuestTime +OSGuestTimeCPU +OSGuestTimeNormalized +OSIOWaitMicroseconds +OSIOWaitTime +OSIOWaitTimeCPU +OSIOWaitTimeNormalized +OSIdleTime +OSIdleTimeCPU +OSIdleTimeNormalized +OSInterrupts +OSIrqTime +OSIrqTimeCPU +OSIrqTimeNormalized +OSMemoryAvailable +OSMemoryBuffers +OSMemoryCached +OSMemoryFreePlusCached +OSMemoryFreeWithoutCached +OSMemoryTotal +OSNiceTime +OSNiceTimeCPU +OSNiceTimeNormalized +OSOpenFiles +OSProcessesBlocked +OSProcessesCreated +OSProcessesRunning +OSSoftIrqTime +OSSoftIrqTimeCPU +OSSoftIrqTimeNormalized +OSStealTime +OSStealTimeCPU +OSStealTimeNormalized +OSSystemTime +OSSystemTimeCPU +OSSystemTimeNormalized +OSThreadsRunnable +OSThreadsTotal +OSUptime +OSUserTime +OSUserTimeCPU +OSUserTimeNormalized +OTLP +OUTFILE +ObjectId +Oblakov +Observability +Octonica +Ok +OnTime +OpenCelliD +OpenFileForRead +OpenFileForWrite +OpenSSL +OpenSUSE +OpenSky +OpenStack +OpenTelemetry +OrDefault +OrNull +OrZero +OvercommitTracker +PAAMAYIM +PCRE +PRCP +PREWHERE +PROCESSLIST +PROXYv +PSUN +PagerDuty +ParallelFormattingOutputFormatThreads +ParallelFormattingOutputFormatThreadsActive +ParallelParsingInputFormat +ParallelParsingInputFormatThreads +ParallelParsingInputFormatThreadsActive +Parametrized +ParquetMetadata +Parsers +PartMutation +Partitioner +PartsActive +PartsCommitted +PartsCompact +PartsDeleteOnDestroy +PartsDeleting +PartsOutdated +PartsPreActive +PartsPreCommitted +PartsTemporary +PartsWide +PeerDB +PendingAsyncInsert +Percona +PhpStorm +PlantUML +Poess +PointDistKm +PointDistM +PointDistRads +PostHistory +PostLink +PostgreSQLConnection +PostgreSQLThreads +Postgres +PostgresSQL +Precompiled +Preprocess +PrettyCompact +PrettyCompactMonoBlock +PrettyCompactNoEscapes +PrettyCompactNoEscapesMonoBlock +PrettyJSONEachRow +PrettyMonoBlock +PrettyNoEscapes +PrettyNoEscapesMonoBlock +PrettySpace +PrettySpaceMonoBlock +PrettySpaceNoEscapes +PrettySpaceNoEscapesMonoBlock +Prewhere +PrivateKeyPassphraseHandler +ProfileEvents +Profiler +Proleptic +PromHouse +PromQL +Promql +Promtail +Protobuf +ProtobufList +ProtobufSingle +ProxySQL +Punycode +PyArrow +PyCharm +QATlib +QEMU +QTCreator +Quantile +QueryCacheBytes +QueryCacheEntries +QueryCacheHits +QueryCacheMisses +QueryPreempted +QueryThread +QuickAssist +QuickSight +QuoteMeta +RBAC +RClickHouse +RHEL +ROLLUP +RWLock +RWLockActiveReaders +RWLockActiveWriters +RWLockWaitingReaders +RWLockWaitingWriters +RabbitMQ +Rabl +RangeHashed +RawBLOB +ReDoS +ReadTaskRequestsSent +ReadonlyReplica +RecipeNLG +Recompressing +Recompression +RectAdd +RectContains +RectIntersection +RectUnion +RedHat +Redash +Reddit +Refactorings +ReferenceKeyed +Refreshable +RegexpTree +RemoteRead +ReplacingMergeTree +ReplicasMaxAbsoluteDelay +ReplicasMaxInsertsInQueue +ReplicasMaxMergesInQueue +ReplicasMaxQueueSize +ReplicasMaxRelativeDelay +ReplicasSumInsertsInQueue +ReplicasSumMergesInQueue +ReplicasSumQueueSize +ReplicatedAggregatingMergeTree +ReplicatedChecks +ReplicatedCollapsingMergeTree +ReplicatedFetch +ReplicatedGraphiteMergeTree +ReplicatedMergeTree +ReplicatedReplacingMergeTree +ReplicatedSend 
+ReplicatedSummingMergeTree +ReplicatedVersionedCollapsingMergeTree +Resample +RestartReplicaThreads +RestartReplicaThreadsActive +RestoreThreads +RestoreThreadsActive +RoaringBitmap +RocksDB +Rollup +RowBinary +RowBinaryWithDefaults +RowBinaryWithNames +RowBinaryWithNamesAndTypes +Runtime +SATA +SELECTs +SERIALIZABLE +SIGTERM +SIMD +SLES +SLRU +SMALLINT +SNWD +SPNEGO +SQEs +SQLAlchemy +SQLConsoleDetail +SQLInsert +SQLSTATE +SSDCache +SSDComplexKeyCache +SSDs +SSLManager +SSRF +SSSE +SaaS +Sanjeev +Sankey +Scalable +Scatterplot +Schaefer +Schemas +Schwartzian +SeasClick +SeekTable +SelfManaged +Sematext +SendExternalTables +SendScalars +ShareAlike +SharedMergeTree +Shortkeys +SimHash +Simhash +SimpleAggregateFunction +SimpleState +SipHash +Smirnov's +Smirnov'test +Soundex +SpanKind +Spearman's +SquaredDistance +SquaredNorm +StartTLS +StartTime +StartupSystemTables +StartupSystemTablesThreads +StartupSystemTablesThreadsActive +Stateful +StorageBufferBytes +StorageBufferRows +StorageDistributed +StorageDistributedThreads +StorageDistributedThreadsActive +StorageHive +StorageHiveThreads +StorageHiveThreadsActive +StorageODBC +StorageS +StringToNum +StringToNumOrDefault +StringToNumOrNull +StripeLog +Stripelog +Strohmeier +Subcolumns +Subexpression +Submodules +Subqueries +Substrings +SummingMergeTree +SuperSet +Superset +SupersetDocker +SystemReplicasThreads +SystemReplicasThreadsActive +TABLUM +TAVG +TCPConnection +TCPThreads +TDigest +TINYINT +TLSv +TMAX +TMIN +TPCH +TSDB +TSVRaw +TSVWithNames +TSVs +TSan +TThe +TabItem +TabSeparated +TabSeparatedRaw +TabSeparatedRawWithNames +TabSeparatedRawWithNamesAndTypes +TabSeparatedWithNames +TabSeparatedWithNamesAndTypes +Tabix +TablesLoaderBackgroundThreads +TablesLoaderBackgroundThreadsActive +TablesLoaderForegroundThreads +TablesLoaderForegroundThreadsActive +TablesToDropQueueSize +TargetSpecific +Telegraf +TemplateIgnoreSpaces +TemporaryFilesForAggregation +TemporaryFilesForJoin +TemporaryFilesForSort +TemporaryFilesUnknown +Testflows +Tgz +Theil's +ThreadMonotonic +ThreadPoolFSReaderThreads +ThreadPoolFSReaderThreadsActive +ThreadPoolRemoteFSReaderThreads +ThreadPoolRemoteFSReaderThreadsActive +ThreadsActive +ThreadsInOvercommitTracker +TimeSeries +TimescaleDB's +Timeunit +TinyLog +Tkachenko +ToASCII +ToCenterChild +ToChildren +ToGeo +ToGeoBoundary +ToIPv +ToParent +ToSnowflake +ToSnowflakeID +ToString +ToUnicode +Toolset +TopK +TotalBytesOfMergeTreeTables +TotalPartsOfMergeTreeTables +TotalPrimaryKeyBytesInMemory +TotalPrimaryKeyBytesInMemoryAllocated +TotalRowsOfMergeTreeTables +TotalTemporaryFiles +Tradeoff +Transactional +Tsai +Tukey +TwoColumnList +UBSan +UDFs +UInt +UIntN +ULID +ULIDStringToDateTime +UMTS +UNDROP +UPDATEs +URIs +URL +URL's +URLDecode +URLEncode +URLHash +URLHierarchy +URLPathHierarchy +USearch +UTCTimestamp +UUIDNumToString +UUIDStringToNum +UUIDToNum +UUIDs +UUIDv +UUid +Uber +Uint +UncompressedCacheBytes +UncompressedCacheCells +UnidirectionalEdgeIsValid +UniqThetaSketch +Updatable +Uppercased +Uptime +Uptrace +UserID +Util +VARCHAR +VIEWs +Vadim +Valgrind +Vectorized +VersionBadge +VersionInteger +VersionedCollapsingMergeTree +VideoContainer +ViewAllLink +VirtualBox +Vose +WALs +WSFG +Welch's +Werror +Wether +WikiStat +WindowView +Winkler +WithCounter +WithFastCounter +WithNames +WithNamesAndTypes +WordNet +WriteBuffer +WriteBuffers +XCode +XHTML +XORs +Xeon +YAML +YAMLRegExpTree +YYYY +YYYYMMDD +YYYYMMDDToDate +YYYYMMDDhhmmssToDateTime +Yandex +Yasm +ZCurve +ZSTDQAT +Zabbix +Zipkin +ZooKeeper +ZooKeeper's 
+ZooKeeperRequest +ZooKeeperSession +ZooKeeperWatch +ZooKeepers +aarch +accurateCast +accurateCastOrDefault +accurateCastOrNull +acos +acosh +activecube +activerecord +addDate +addDays +addHours +addInterval +addMicroseconds +addMilliseconds +addMinutes +addMonths +addNanoseconds +addQuarters +addSeconds +addTupleOfIntervals +addWeeks +addYears +addr +addressToLine +addressToLineWithInlines +addressToSymbol +adviced +agg +aggThrow +aggregatefunction +aggregatingmergetree +aggregatio +aggretate +aggthrow +aiochclient +allocator +alphaTokens +amplab +analysisOfVariance +analytics +anonymize +anonymized +ansi +anyHeavy +anyIf +anyLast +anyheavy +anylast +appendTrailingCharIfAbsent +approximative +approxtopk +approxtopsum +argMax +argMin +argmax +argmin +arrayAUC +arrayAll +arrayAvg +arrayCompact +arrayConcat +arrayCount +arrayCumSum +arrayCumSumNonNegative +arrayDifference +arrayDistinct +arrayDotProduct +arrayElement +arrayElementOrNull +arrayEnumerate +arrayEnumerateDense +arrayEnumerateDenseRanked +arrayEnumerateUniq +arrayEnumerateUniqRanked +arrayExists +arrayFill +arrayFilter +arrayFirst +arrayFirstIndex +arrayFirstOrNull +arrayFlatten +arrayFold +arrayIntersect +arrayJaccardIndex +arrayJoin +arrayLast +arrayLastIndex +arrayLastOrNull +arrayMap +arrayMax +arrayMin +arrayPartialReverseSort +arrayPartialShuffle +arrayPartialSort +arrayPopBack +arrayPopFront +arrayProduct +arrayPushBack +arrayPushFront +arrayRandomSample +arrayReduce +arrayReduceInRanges +arrayResize +arrayReverse +arrayReverseFill +arrayReverseSort +arrayReverseSplit +arrayRotateLeft +arrayRotateRight +arrayShiftLeft +arrayShiftRight +arrayShingles +arrayShuffle +arraySlice +arraySort +arraySplit +arrayStringConcat +arraySum +arrayUnion +arrayUniq +arrayWithConstant +arrayZip +arrayZipUnaligned +ascii +asin +asinh +assumeNotNull +async +asynch +atan +atanh +atomicity +auth +authenticator +authenticators +autocompletion +autodetect +autodetected +autogen +autogenerate +autogenerated +autogeneration +autostart +avgWeighted +avgweighted +avro +avx +aws +azureBlobStorage +azureBlobStorageCluster +backend +backoff +backticks +backupview +balancer +basename +bcrypt +benchmarking +bfloat +bigrams +binlog +bitAnd +bitCount +bitHammingDistance +bitNot +bitOr +bitPositionsToArray +bitRotateLeft +bitRotateRight +bitShiftLeft +bitShiftRight +bitSlice +bitTest +bitTestAll +bitTestAny +bitXor +bitmapAnd +bitmapAndCardinality +bitmapAndnot +bitmapAndnotCardinality +bitmapBuild +bitmapCardinality +bitmapContains +bitmapHasAll +bitmapHasAny +bitmapMax +bitmapMin +bitmapOr +bitmapOrCardinality +bitmapSubsetInRange +bitmapSubsetLimit +bitmapToArray +bitmapTransform +bitmapXor +bitmapXorCardinality +bitmask +bitmaskToArray +bitmaskToList +bitov +blake +blockNumber +blockSerializedSize +blockSize +blockinfo +blockreader +blocksize +bool +boolean +bools +boringssl +boundingRatio +bozerkins +broadcasted +brotli +bson +bsoneachrow +buffersize +bugfix +buildId +buildable +builtins +byteHammingDistance +byteSize +byteSlice +byteSwap +bytebase +bytesToCutForIPv +cLoki +caConfig +cacheSessions +cachesize +camelCase +capn +capnproto +cardinalities +cardinality +cartesian +cassandra +casted +catboost +catboostEvaluate +categoricalInformationValue +categoricalinformationvalue +cathetus +cbindgen +cbrt +ccache +cctz +ceil +centroid +certificateFile +cetera +cfg +cgroup +cgroups +chadmin +changeDay +changeHour +changeMinute +changeMonth +changeSecond +changeYear +changelog +changelogs +charset +charsets +chconn +cheatsheet +checkouting +checksummed 
+checksumming +checksums +childern +chproxy +chunksize +cickhouse +cipherList +ciphertext +cityHash +cityhash +ckibana +ckman +clangd +cli +clickcache +clickcat +clickhouse +clickhousedb +clickhousex +clickmate +clickstream +clickvisual +clockhour +clusterAllReplicas +cmake +codebase +codec +codecs +codepoint +codepoints +collapsingmergetree +combinator +combinators +comparising +composable +compressability +concat +concatAssumeInjective +concatWithSeparator +concatWithSeparatorAssumeInjective +cond +conf +config +configs +conformant +congruential +conjuction +conjuctive +connectionId +const +contrib +convertCharset +coroutines +corrMatrix +corrStable +corrmatrix +corrstable +cosineDistance +countDigits +countEqual +countMatches +countMatchesCaseInsensitive +countSubstrings +countSubstringsCaseInsensitive +countSubstringsCaseInsensitiveUTF +covarPop +covarPopMatrix +covarPopStable +covarSamp +covarSampMatrix +covarSampStable +covarStable +covariates +covarpop +covarpopmatrix +covarpopstable +covarsamp +covarsampmatrix +covarsampstable +covid +cpp +cppkafka +cpu +cramersV +cramersVBiasCorrected +cramersv +cramersvbiascorrected +criteo +crlf +croaring +cronjob +cryptocurrencies +cryptocurrency +cryptographic +csv +csvwithnames +csvwithnamesandtypes +ctukey +curdate +currentDatabase +currentProfiles +currentRoles +currentUser +customizable +customizations +customseparated +customseparatedwithnames +customseparatedwithnamesandtypes +cutFragment +cutIPv +cutQueryString +cutQueryStringAndFragment +cutToFirstSignificantSubdomain +cutToFirstSignificantSubdomainCustom +cutToFirstSignificantSubdomainCustomRFC +cutToFirstSignificantSubdomainCustomWithWWW +cutToFirstSignificantSubdomainCustomWithWWWRFC +cutToFirstSignificantSubdomainRFC +cutToFirstSignificantSubdomainWithWWW +cutToFirstSignificantSubdomainWithWWWRFC +cutURLParameter +cutWWW +cyrus +damerauLevenshteinDistance +datacenter +datacenters +datafiles +datagrip +datalens +datanode +dataset +datasets +datasource +datatypes +dateName +dateTime +dateTimeToSnowflake +dateTimeToSnowflakeID +datetime +datetimes +dayofyear +dbal +dbeaver +dbgen +dbms +ddl +deallocated +deallocation +deallocations +debian +decodeHTMLComponent +decodeURLComponent +decodeURLFormComponent +decodeXMLComponent +decompressor +decrypt +decrypted +decrypts +deduplicate +deduplicated +deduplicating +deduplication +defaultProfiles +defaultRoles +defaultValueOfArgumentType +defaultValueOfTypeName +delim +deltaLake +deltaSum +deltaSumTimestamp +deltalake +deltasum +deltasumtimestamp +demangle +denormalize +denormalized +denormalizing +denormals +dequeued +dequeues +deserialization +deserialized +deserializing +dest +destructor +destructors +detectCharset +detectLanguage +detectLanguageMixed +detectLanguageUnknown +detectProgrammingLanguage +detectTonality +determinator +deterministically +dictGet +dictGetAll +dictGetChildren +dictGetDescendant +dictGetHierarchy +dictGetOrDefault +dictGetOrNull +dictGetUUID +dictHas +dictIsIn +disableProtocols +disjunction +disjunctions +displayName +displaySecretsInShowAndSelect +distinctDynamicTypes +distinctJSONPaths +distinctJSONPathsAndTypes +distinctdynamictypes +distinctjsonpaths +distro +divideDecimal +dmesg +doesnt +domainRFC +domainWithoutWWW +domainWithoutWWWRFC +dont +dotProduct +downsampling +dplyr +dragonbox +dropoff +dumpColumnStructure +durations +ecto +editDistance +editDistanceUTF +embeddings +emptyArray +emptyArrayDate +emptyArrayDateTime +emptyArrayFloat +emptyArrayInt +emptyArrayString +emptyArrayToSingle +emptyArrayUInt 
+enabledProfiles +enabledRoles +encodeURLComponent +encodeURLFormComponent +encodeXMLComponent +encodings +encryptions +endian +endianness +endsWith +endsWithUTF +endswith +enqueued +enum +enum's +enums +erfc +errorCodeToName +etag +evalMLMethod +exFAT +expiryMsec +exponentialMovingAverage +exponentialTimeDecayedAvg +exponentialTimeDecayedCount +exponentialTimeDecayedMax +exponentialTimeDecayedSum +exponentialmovingaverage +expr +exprN +extendedVerification +extractAll +extractAllGroups +extractAllGroupsHorizontal +extractAllGroupsVertical +extractKeyValuePairs +extractKeyValuePairsWithEscaping +extractTextFromHTML +extractURLParameter +extractURLParameterNames +extractURLParameters +failover +farmFingerprint +farmHash +fastops +fcoverage +fibonacci +fifo +fileCluster +filelog +filesystem +filesystemAvailable +filesystemCapacity +filesystemFree +filesystemUnreserved +filesystems +finalizeAggregation +fips +firstLine +firstSignficantSubdomain +firstSignificantSubdomain +firstSignificantSubdomainCustom +firstSignificantSubdomainCustomRFC +firstSignificantSubdomainRFC +fixedstring +flameGraph +flamegraph +flatbuffers +flattenTuple +flink +fluentd +fmtlib +formatDateTime +formatDateTimeInJoda +formatDateTimeInJodaSyntax +formatQuery +formatQuerySingleLine +formatReadableDecimalSize +formatReadableQuantity +formatReadableSize +formatReadableTimeDelta +formatRow +formatRowNoNewline +formated +formatschema +formatter +formatters +fqdn +frac +freezed +fromDaysSinceYearZero +fromModifiedJulianDay +fromModifiedJulianDayOrNull +fromUTCTimestamp +fromUnixTimestamp +fromUnixTimestampInJodaSyntax +fsync +func +fuzzBits +fuzzJSON +fuzzQuery +fuzzer +fuzzers +gRPC +gaugehistogram +gccMurmurHash +gcem +generateRandom +generateRandomStructure +generateSeries +generateSnowflakeID +generateULID +generateUUIDv +geoDistance +geoToH +geoToS +geobase +geobases +geocode +geohash +geohashDecode +geohashEncode +geohashesInBox +geoip +geospatial +getClientHTTPHeader +getMacro +getOSKernelVersion +getServerPort +getSetting +getSizeOfEnumType +getSubcolumn +getTypeSerializationStreams +getblockinfo +getevents +ghcnd +github +glibc +globalIn +globalNotIn +globbing +glushkovds +golang +googletest +grafana +graphitemergetree +graphouse +graphql +greatCircleAngle +greatCircleDistance +greaterOrEquals +greaterorequals +greenspace +groupArray +groupArrayInsertAt +groupArrayIntersect +groupArrayLast +groupArrayMovingAvg +groupArrayMovingSum +groupArraySample +groupArraySorted +groupBitAnd +groupBitOr +groupBitXor +groupBitmap +groupBitmapAnd +groupBitmapOr +groupBitmapXor +groupConcat +groupUniqArray +grouparray +grouparrayinsertat +grouparrayintersect +grouparraylast +grouparraymovingavg +grouparraymovingsum +grouparraysample +grouparraysorted +groupbitand +groupbitmap +groupbitmapand +groupbitmapor +groupbitmapxor +groupbitor +groupbitxor +groupconcat +groupuniqarray +grpc +grpcio +gtest +gtid +gzip +gzipped +hadoop +halfMD +halfday +hardlink +hardlinked +hardlinks +hasAll +hasAny +hasColumnInTable +hasSubsequence +hasSubsequenceCaseInsensitive +hasSubsequenceCaseInsensitiveUTF +hasSubsequenceUTF +hasSubstr +hasThreadFuzzer +hasToken +hasTokenCaseInsensitive +hasTokenCaseInsensitiveOrNull +hasTokenOrNull +hasall +hasany +hashtables +haversine +hdbc +hdfs +hdfsCluster +heredoc +heredocs +hilbertDecode +hilbertEncode +hiveHash +holistics +homebrew +hopEnd +hopStart +horgh +hostName +hostname +hostnames +houseops +hsts +html +http +https +hudi +hyperscan +hypot +hyvor +iTerm +icosahedron +icudata +idempotency +idnaDecode 
+idnaEncode +ifNotFinite +ifNull +iframe +ilike +incrementing +indexHint +indexOf +infi +inflight +infty +initcap +initcapUTF +initialQueryID +initializeAggregation +injective +innogames +inodes +instantiation +instantiations +intDiv +intDivOrZero +intExp +intHash +integrational +integrations +interserver +intervalLengthSum +invalidCertificateHandler +invariants +invertedindexes +isConstant +isDecimalOverflow +isFinite +isIPAddressInRange +isIPv +isInfinite +isNaN +isNotDistinctFrom +isNotNull +isNull +isNullable +isValidJSON +isValidUTF +isZeroOrNull +iteratively +jaccard +jaccardIndex +jaroSimilarity +jaroWinklerSimilarity +javaHash +javaHashUTF +jbod +jdbc +jemalloc +jeprof +joinGet +joinGetOrNull +json +jsonMergePatch +jsonasobject +jsonasstring +jsoncolumns +jsoncolumnsmonoblock +jsoncompact +jsoncompactcolumns +jsoncompacteachrow +jsoncompacteachrowwithnames +jsoncompacteachrowwithnamesandtypes +jsoncompactstrings +jsoncompactstringseachrow +jsoncompactstringseachrowwithnames +jsoncompactstringseachrowwithnamesandtypes +jsoneachrow +jsoneachrowwithprogress +jsonobjecteachrow +jsonstrings +jsonstringseachrow +jsonstringseachrowwithprogress +jumpConsistentHash +kRing +kafka +kafkaMurmurHash +kafkacat +keepermap +kerberized +kerberos +kernal +keyspace +keytab +kittenhouse +kolmogorovSmirnovTest +kolmogorovsmirnovtest +kolya +konsole +kostik +kostikConsistentHash +kurtPop +kurtSamp +kurtosis +kurtpop +kurtsamp +lagInFrame +laion +lang +laravel +largestTriangleThreeBuckets +latencies +ldap +leadInFrame +leftPad +leftPadUTF +leftUTF +lemmatization +lemmatize +lemmatized +lengthUTF +lessOrEquals +lessorequals +levenshtein +levenshteinDistance +levenshteinDistanceUTF +lexicographically +lgamma +libFuzzer +libc +libcatboost +libcpuid +libcxx +libcxxabi +libdivide +libfarmhash +libfuzzer +libgsasl +libhdfs +libmetrohash +libpq +libpqxx +librdkafka +libs +libunwind +libuv +libvirt +linearizability +linearizable +linearized +lineasstring +linefeeds +lineitem +lineorder +linestring +linux +llvm +loadDefaultCAFile +localhost +localread +loess +logTrace +logagent +loghouse +london +lookups +loongarch +lowCardinalityIndices +lowCardinalityKeys +lowcardinality +lowerUTF +lowercased +lttb +lzma +macOS +mailrugo +mailto +makeDate +makeDateTime +mannWhitneyUTest +mannwhitneyutest +mapAdd +mapAll +mapApply +mapConcat +mapContains +mapContainsKeyLike +mapExists +mapExtractKeyLike +mapFilter +mapFromArrays +mapKeys +mapPartialReverseSort +mapPartialSort +mapPopulateSeries +mapReverseSort +mapSort +mapSubtract +mapUpdate +mapValues +mappedfile +mariadb +matcher +materializedview +maxIntersections +maxIntersectionsPosition +maxMap +maxintersections +maxintersectionsposition +maxmap +maxmind +mdadm +meanZTest +meanztest +mebibytes +memtable +memtables +mergeTreeIndex +mergeable +mergetree +messageID +metacharacters +metasymbols +metrica +metroHash +mfedotov +mflix +minMap +minMappedArrays +minSampleSizeContinuous +minSampleSizeConversion +mindsdb +minimalistic +mininum +miniselect +minmap +minmax +mins +misconfiguration +mispredictions +mlock +mlockall +mmap +mmapped +modularization +moduli +moduloOrZero +mongoc +mongocxx +mongodb +monotonicity +monthName +mortonDecode +mortonEncode +moscow +msgpack +msgpk +multiFuzzyMatchAllIndices +multiFuzzyMatchAny +multiFuzzyMatchAnyIndex +multiIf +multiMatchAllIndices +multiMatchAny +multiMatchAnyIndex +multiSearchAllPositions +multiSearchAllPositionsCaseInsensitive +multiSearchAllPositionsCaseInsensitiveUTF +multiSearchAllPositionsUTF +multiSearchAny 
+multiSearchAnyCaseInsensitive +multiSearchAnyCaseInsensitiveUTF +multiSearchAnyUTF +multiSearchFirstIndex +multiSearchFirstIndexCaseInsensitive +multiSearchFirstIndexCaseInsensitiveUTF +multiSearchFirstIndexUTF +multiSearchFirstPosition +multiSearchFirstPositionCaseInsensitive +multiSearchFirstPositionCaseInsensitiveUTF +multiSearchFirstPositionUTF +multibyte +multidirectory +multiline +multilinestring +multiplyDecimal +multipolygon +multisearchany +multisets +multithread +multiword +munmap +murmurHash +murmurhash +musqldump +mutex +mydb +myfilter +mysql +mysqldump +mysqljs +mytable +namedatabases +namenetworks +namenode +namepassword +nameprofile +namequota +namespace +namespaces +natively +nats +ness +nestjs +netloc +newjson +ngram +ngramDistance +ngramDistanceCaseInsensitive +ngramDistanceCaseInsensitiveUTF +ngramDistanceUTF +ngramMinHash +ngramMinHashArg +ngramMinHashArgCaseInsensitive +ngramMinHashArgCaseInsensitiveUTF +ngramMinHashArgUTF +ngramMinHashCaseInsensitive +ngramMinHashCaseInsensitiveUTF +ngramMinHashUTF +ngramSearch +ngramSearchCaseInsensitive +ngramSearchCaseInsensitiveUTF +ngramSearchUTF +ngramSimHash +ngramSimHashCaseInsensitive +ngramSimHashCaseInsensitiveUTF +ngramSimHashUTF +ngrambf +ngrams +noaa +nonNegativeDerivative +noop +normalizeQuery +normalizeQueryKeepNames +normalizeUTF +normalizedQueryHash +normalizedQueryHashKeepNames +notEmpty +notEquals +notILike +notIn +notLike +notempty +notequals +notlike +notretry +nowInBlock +ntile +nullIf +nullability +nullable +nullables +num +numerics +nypd +obfuscator +observability +odbc +ok +omclickhouse +onstraints +ontime +onwards +openSSL +openSUSE +openldap +opensky +openssl +opentelemetry +outfile +overcommit +overcommitted +overfitting +overlayUTF +overparallelization +packetpool +packetsize +pageviews +parallelization +parallelize +parallelized +params +parseDateTime +parseDateTimeBestEffort +parseDateTimeBestEffortOrNull +parseDateTimeBestEffortOrZero +parseDateTimeBestEffortUS +parseDateTimeBestEffortUSOrNull +parseDateTimeBestEffortUSOrZero +parseDateTimeInJodaSyntax +parseDateTimeInJodaSyntaxOrNull +parseDateTimeInJodaSyntaxOrZero +parseDateTimeOrNull +parseDateTimeOrZero +parseReadableSize +parseReadableSizeOrNull +parseReadableSizeOrZero +parseTimeDelta +parseable +parsers +partitionID +partitionId +pathFull +pclmulqdq +pcre +performant +perl +persistency +phpclickhouse +pipelining +plaintext +plantuml +poco +pointInEllipses +pointInPolygon +polygonAreaCartesian +polygonAreaSpherical +polygonConvexHullCartesian +polygonPerimeterCartesian +polygonPerimeterSpherical +polygonsDistanceCartesian +polygonsDistanceSpherical +polygonsEqualsCartesian +polygonsIntersectionCartesian +polygonsIntersectionSpherical +polygonsSymDifferenceCartesian +polygonsSymDifferenceSpherical +polygonsUnionCartesian +polygonsUnionSpherical +polygonsWithinCartesian +polygonsWithinSpherical +popcnt +portRFC +porthttps +positionCaseInsensitive +positionCaseInsensitiveUTF +positionUTF +positiveModulo +postfix +postfixes +postgresql +pre +pread +preallocate +prebuild +prebuilt +preemptable +preferServerCiphers +prefetch +prefetchsize +preloaded +prem +prepend +prepended +prepends +preprocess +preprocessed +preprocessing +preprocessor +presentational +prestable +prettycompact +prettycompactmonoblock +prettycompactnoescapes +prettycompactnoescapesmonoblock +prettyjsoneachrow +prettymonoblock +prettynoescapes +prettynoescapesmonoblock +prettyspace +prettyspacemonoblock +prettyspacenoescapes +prettyspacenoescapesmonoblock +prewhere +printf 
+privateKeyFile +privateKeyPassphraseHandler +prlimit +procfs +profiler +proleptic +prometheus +proportionsZTest +proto +protobuf +protobuflist +protobufsingle +protocol +proxied +pseudorandom +pseudorandomize +psql +ptrs +punycodeDecode +punycodeEncode +pushdown +pwrite +py +qryn +quantile +quantileBFloat +quantileDD +quantileDeterministic +quantileExact +quantileExactExclusive +quantileExactHigh +quantileExactInclusive +quantileExactLow +quantileExactWeighted +quantileGK +quantileInterpolatedWeighted +quantileTDigest +quantileTDigestWeighted +quantileTiming +quantileTimingWeighted +quantilebfloat +quantileddsketch +quantiledeterministic +quantileexact +quantileexactweighted +quantiles +quantilesExactExclusive +quantilesExactInclusive +quantilesGK +quantilesTimingWeighted +quantiletdigest +quantiletdigestweighted +quantiletiming +quantiletimingweighted +quartile +queryID +queryString +queryStringAndFragment +rabbitmq +raduis +randBernoulli +randBinomial +randCanonical +randChiSquared +randConstant +randExponential +randFisherF +randLogNormal +randNegativeBinomial +randNormal +randPoisson +randStudentT +randUniform +randomFixedString +randomPrintableASCII +randomString +randomStringUTF +rankCorr +rapidjson +rawblob +readWKTLineString +readWKTMultiLineString +readWKTMultiPolygon +readWKTPoint +readWKTPolygon +readWKTRing +readahead +readline +readme +readonly +rebalance +rebalanced +recency +recompress +recompressed +recompressing +recompression +reconnection +recurse +redash +reddit +redis +redisstreams +refcounter +refreshable +regexpExtract +regexpQuoteMeta +regionHierarchy +regionIn +regionToArea +regionToCity +regionToContinent +regionToCountry +regionToDistrict +regionToName +regionToPopulation +regionToTopContinent +reinitialization +reinitializing +reinterpretAs +reinterpretAsDate +reinterpretAsDateTime +reinterpretAsFixedString +reinterpretAsFloat +reinterpretAsInt +reinterpretAsString +reinterpretAsUInt +reinterpretAsUUID +remoteSecure +repivot +replaceAll +replaceOne +replaceRegexpAll +replaceRegexpOne +replacingmergetree +replicatable +replicatedmergetree +replxx +repo +representable +requestor +requireTLSv +resharding +reshards +resolvers +resultset +retentions +rethrow +retransmit +retriable +reverseUTF +rewritable +rightPad +rightPadUTF +rightUTF +risc +riscv +ro +roadmap +rocksdb +rollup +roundAge +roundBankers +roundDown +roundDuration +roundToExp +routineley +rowNumberInAllBlocks +rowNumberInBlock +rowbinary +rowbinarywithdefaults +rowbinarywithnames +rowbinarywithnamesandtypes +rsync +rsyslog +runnable +runningAccumulate +runningConcurrency +runningDifference +runningDifferenceStartingWithFirstValue +runtime +russian +rustc +rustup +rw +sasl +satisfiable +scala +sccache +schemas +seekable +seektable +sequenceCount +sequenceMatch +sequenceNextNode +seriesDecomposeSTL +seriesOutliersDetectTukey +seriesPeriodDetectFFT +serverTimeZone +serverTimezone +serverUUID +sessionCacheSize +sessionIdContext +sessionTimeout +seva +shardCount +shardNum +sharded +sharding +shortcircuit +shortkeys +shoutout +showCertificate +sigmoid +simdjson +simpleJSON +simpleJSONExtractBool +simpleJSONExtractFloat +simpleJSONExtractInt +simpleJSONExtractRaw +simpleJSONExtractString +simpleJSONExtractUInt +simpleJSONHas +simpleLinearRegression +simpleaggregatefunction +simplelinearregression +simpod +singleValueOrNull +singlepart +singlevalueornull +sinh +sipHash +siphash +skewPop +skewSamp +skewness +skewpop +skewsamp +skippingerrors +sleepEachRow +snowflakeIDToDateTime +snowflakeToDateTime +socketcache 
+soundex +sparkBar +sparkbar +sparsehash +speedscope +splitBy +splitByChar +splitByNonAlpha +splitByRegexp +splitByString +splitByWhitespace +splitby +sqid +sqidDecode +sqidEncode +sql +sqlalchemy +sqlinsert +sqlite +sqrt +src +srcReplicas +sshkey +stackoverflow +stacktrace +stacktraces +startsWith +startsWithUTF +startswith +statbox +stateful +stateset +stddev +stddevPop +stddevPopStable +stddevSamp +stddevSampStable +stddevpop +stddevpopstable +stddevsamp +stddevsampstable +stderr +stdin +stdout +stochasticLinearRegression +stochasticLogisticRegression +stochastically +stochasticlinearregression +stochasticlogisticregression +storages +storig +stringJaccardIndex +stringJaccardIndexUTF +stringToH +stripelog +strtod +strtoll +strtoull +struct +structs +structureToCapnProtoSchema +structureToProtobufSchema +studentTTest +studentttest +subBitmap +subDate +subarray +subarrays +subcolumn +subcolumns +subdirectories +subdirectory +subexpression +subexpressions +subfolder +subfolders +subinterval +subintervals +subkey +submatch +submodule +submodules +subnet +subnetwork +subpattern +subpatterns +subqueries +subquery +subranges +subreddits +subseconds +subsequence +substreams +substring +substringIndex +substringIndexUTF +substringUTF +substrings +subtitiles +subtractDays +subtractHours +subtractInterval +subtractMicroseconds +subtractMilliseconds +subtractMinutes +subtractMonths +subtractNanoseconds +subtractQuarters +subtractSeconds +subtractTupleOfIntervals +subtractWeeks +subtractYears +subtree +subtrees +subtype +sudo +sumCount +sumKahan +sumMap +sumMapFiltered +sumMapFilteredWithOverflow +sumMapWithOverflow +sumWithOverflow +sumcount +sumkahan +summap +summapwithoverflow +summingmergetree +sumwithoverflow +superaggregates +supertype +supremum +symlink +symlinks +syntaxes +syscall +syscalls +sysctl +syslog +syslogd +systemd +tabix +tablum +tabseparated +tabseparatedraw +tabseparatedrawwithnames +tabseparatedrawwithnamesandtypes +tabseparatedwithnames +tabseparatedwithnamesandtypes +tanh +tcp +tcpPort +tcpnodelay +templateignorespaces +tgamma +tgz +th +theilsU +theilsu +themself +threadpool +throwIf +timeDiff +timeSeriesData +timeSeriesMetrics +timeSeriesTags +timeSlot +timeSlots +timeZone +timeZoneOf +timeZoneOffset +timezones +tinylog +tmp +toBool +toColumnTypeName +toDate +toDateOrDefault +toDateOrNull +toDateOrZero +toDateTime +toDateTimeOrDefault +toDateTimeOrNull +toDateTimeOrZero +toDayOfMonth +toDayOfWeek +toDayOfYear +toDaysSinceYearZero +toDecimal +toDecimalString +toFixedString +toFloat +toHour +toIPv +toISOWeek +toISOYear +toInt +toInterval +toIntervalDay +toIntervalHour +toIntervalMicrosecond +toIntervalMillisecond +toIntervalMinute +toIntervalMonth +toIntervalNanosecond +toIntervalQuarter +toIntervalSecond +toIntervalWeek +toIntervalYear +toJSONString +toLastDayOfMonth +toLastDayOfWeek +toLowCardinality +toMillisecond +toMinute +toModifiedJulianDay +toModifiedJulianDayOrNull +toMonday +toMonth +toNullable +toQuarter +toRelativeDayNum +toRelativeHourNum +toRelativeMinuteNum +toRelativeMonthNum +toRelativeQuarterNum +toRelativeSecondNum +toRelativeWeekNum +toRelativeYearNum +toSecond +toStartOfDay +toStartOfFifteenMinutes +toStartOfFiveMinutes +toStartOfHour +toStartOfISOYear +toStartOfInterval +toStartOfMicrosecond +toStartOfMillisecond +toStartOfMinute +toStartOfMonth +toStartOfNanosecond +toStartOfQuarter +toStartOfSecond +toStartOfTenMinutes +toStartOfWeek +toStartOfYear +toString +toStringCutToZero +toTime +toTimeZone +toType +toTypeName +toUInt +toUTCTimestamp +toUUID 
+toUUIDOrDefault +toUUIDOrNull +toUUIDOrZero +toUnixTimestamp +toValidUTF +toWeek +toYYYYMM +toYYYYMMDD +toYYYYMMDDhhmmss +toYear +toYearWeek +tokenbf +tokenization +tokenized +tokenizer +toml +toolchain +toolset +topK +topKWeighted +topLevelDomain +topLevelDomainRFC +topk +topkweighted +tpcds +tpch +transactionID +transactionLatestSnapshot +transactionOldestSnapshot +transactional +transactionally +translateUTF +translocality +trie +trimBoth +trimLeft +trimRight +trunc +tryBase +tryDecrypt +tryIdnaEncode +tryPunycodeDecode +tskv +tsv +tui +tukey +tumbleEnd +tumbleStart +tupleConcat +tupleDivide +tupleDivideByNumber +tupleElement +tupleHammingDistance +tupleIntDiv +tupleIntDivByNumber +tupleIntDivOrZero +tupleIntDivOrZeroByNumber +tupleMinus +tupleModulo +tupleModuloByNumber +tupleMultiply +tupleMultiplyByNumber +tupleNames +tupleNegate +tuplePlus +tupleToNameValuePairs +turbostat +txt +typename +ubuntu +uint +ulid +unary +unbin +uncomment +undrop +unencoded +unencrypted +unescaped +unescaping +unhex +unicode +unidimensional +unigrams +unintuitive +uniq +uniqCombined +uniqExact +uniqHLL +uniqTheta +uniqThetaIntersect +uniqThetaNot +uniqThetaSketch +uniqThetaUnion +uniqUpTo +uniqcombined +uniqexact +uniqhll +uniqtheta +uniqthetasketch +unix +unixODBC +unixodbc +unoptimized +unparsed +unpooled +unrealiable +unreplicated +unresolvable +unrounded +unshuffled +untracked +untrusted +untuple +uploader +uploaders +upperUTF +uptime +uptrace +uring +url +urlCluster +urlencoded +urls +usearch +userspace +userver +utils +uuid +uuidv +vCPU +varPop +varPopStable +varSamp +varSampStable +variadic +variantElement +variantType +varint +varpop +varpopstable +varsamp +varsampstable +vectorized +vectorscan +vendoring +verificationDepth +verificationMode +versionedcollapsingmergetree +vhost +virtualized +visibleWidth +visitParam +visitParamExtractBool +visitParamExtractFloat +visitParamExtractInt +visitParamExtractRaw +visitParamExtractString +visitParamExtractUInt +visitParamHas +vruntime +wchc +wchs +webpage +webserver +weekyear +welchTTest +welchttest +wget +which's +whitespace +whitespaces +wikistat +windowFunnel +wordShingleMinHash +wordShingleMinHashArg +wordShingleMinHashArgCaseInsensitive +wordShingleMinHashArgCaseInsensitiveUTF +wordShingleMinHashArgUTF +wordShingleMinHashCaseInsensitive +wordShingleMinHashCaseInsensitiveUTF +wordShingleMinHashUTF +wordShingleSimHash +wordShingleSimHashCaseInsensitive +wordShingleSimHashCaseInsensitiveUTF +wordShingleSimHashUTF +wordshingleMinHash +writability +wrt +wyHash +xcode +xeus +xkcd +xlarge +xml +xxHash +xz +yaml +yandex +youtube +zLib +zLinux +zabbix +zkcopy +zlib +znode +znodes +zookeeperSessionUptime +zstd diff --git a/ci_v2/jobs/scripts/check_style/check_aspell.sh b/ci_v2/jobs/scripts/check_style/check_aspell.sh new file mode 100755 index 00000000000..61726aab0f0 --- /dev/null +++ b/ci_v2/jobs/scripts/check_style/check_aspell.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash + +# force-enable double star globbing +shopt -s globstar + +# Perform spell checking on the docs + +if [[ ${1:-} == "--help" ]] || [[ ${1:-} == "-h" ]]; then + echo "Usage $0 [--help|-h] [-i [filename]]" + echo " --help|-h: print this help" + echo " -i: interactive mode. If filename is specified, check only this file, otherwise check all files" + exit 0 +fi + +ROOT_PATH="." + +CHECK_LANG=en + +ASPELL_IGNORE_PATH="${ROOT_PATH}/utils/check-style/aspell-ignore/${CHECK_LANG}" + +if [[ ${1:-} == "-i" ]]; then + if [[ ! 
-z ${2:-} ]]; then
+        FILES_TO_CHECK=${ROOT_PATH}/docs/${CHECK_LANG}/${2}
+    else
+        FILES_TO_CHECK=${ROOT_PATH}/docs/${CHECK_LANG}/**/*.md
+    fi
+    for fname in ${FILES_TO_CHECK}; do
+        echo "Checking $fname"
+        aspell --personal=aspell-dict.txt --add-sgml-skip=code --encoding=utf-8 --mode=markdown -W 3 --lang=${CHECK_LANG} --home-dir=${ASPELL_IGNORE_PATH} -c "$fname"
+    done
+    exit
+fi
+
+STATUS=0
+for fname in ${ROOT_PATH}/docs/${CHECK_LANG}/**/*.md; do
+    errors=$(cat "$fname" \
+        | aspell list \
+            -W 3 \
+            --personal=aspell-dict.txt \
+            --add-sgml-skip=code \
+            --encoding=utf-8 \
+            --mode=markdown \
+            --lang=${CHECK_LANG} \
+            --home-dir=${ASPELL_IGNORE_PATH} \
+        | sort | uniq)
+    if [ -n "$errors" ]; then
+        STATUS=1
+        echo "====== $fname ======"
+        echo "$errors"
+    fi
+done
+
+if (( STATUS != 0 )); then
+    echo "====== Errors found ======"
+    echo "To exclude some words add them to the dictionary file \"${ASPELL_IGNORE_PATH}/aspell-dict.txt\""
+    echo "You can also run ${0} -i to see the errors interactively and fix them or add them to the dictionary file"
+fi
+
+exit ${STATUS}
diff --git a/ci_v2/jobs/scripts/check_style/check_cpp.sh b/ci_v2/jobs/scripts/check_style/check_cpp.sh
new file mode 100755
index 00000000000..1611fac8c5e
--- /dev/null
+++ b/ci_v2/jobs/scripts/check_style/check_cpp.sh
@@ -0,0 +1,339 @@
+#!/usr/bin/env bash
+
+# For code formatting we have clang-format.
+#
+# But it's not sane to apply clang-format to the whole code base,
+# because it sometimes makes properly formatted files worse.
+#
+# It's only reasonable to blindly apply clang-format in cases
+# when the code is likely to be out of style.
+#
+# For this purpose we have a script that will use very primitive heuristics
+# (simple regexps) to check if the code is likely to have basic style violations,
+# and then run the formatter only on those files.
+
+LC_ALL="en_US.UTF-8"
+ROOT_PATH="."
+EXCLUDE_DIRS='build/|integration/|widechar_width/|glibc-compatibility/|poco/|memcpy/|consistent-hashing|benchmark|tests/.*.cpp|utils/keeper-bench/example.yaml'
+
+# From [1]:
+# But since array_to_string_internal() in array.c still loops over array
+# elements and concatenates them into a string, it's probably not more
+# efficient than the looping solutions proposed, but it's more readable.
+#
+# [1]: https://stackoverflow.com/a/15394738/328260
+function in_array()
+{
+    local IFS="|"
+    local value=$1 && shift
+
+    [[ "${IFS}${*}${IFS}" =~ "${IFS}${value}${IFS}" ]]
+}
+
+find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/null |
+    grep -vP $EXCLUDE_DIRS |
+    xargs grep $@ -P '((class|struct|namespace|enum|if|for|while|else|throw|switch).*|\)(\s*const)?(\s*override)?\s*)\{$|\s$|^ {1,3}[^\* ]\S|\t|^\s*(if|else if|if constexpr|else if constexpr|for|while|catch|switch)\(|\( [^\s\\]|\S \)' |
+# a curly brace not on a new line, but not for the case of C++11 init or agg. initialization | trailing whitespace | number of ws not a multiple of 4, but not in the case of comment continuation | missing whitespace after for/if/while... before opening brace | whitespaces inside braces
+    grep -v -P '(//|:\s+\*|\$\(\()| \)"'
+# single-line comment | continuation of a multiline comment | a typical piece of embedded shell code | something like ending of raw string literal
+
+# Tabs
+find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/null |
+    grep -vP $EXCLUDE_DIRS |
+    xargs grep $@ -F $'\t'
+
+# // namespace comments are unneeded
+find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/null |
+    grep -vP $EXCLUDE_DIRS |
+    xargs grep $@ -P '}\s*//+\s*namespace\s*'
+
+# Broken symlinks
+find -L $ROOT_PATH -type l 2>/dev/null | grep -v contrib && echo "^ Broken symlinks found"
+
+# Duplicated or incorrect setting declarations
+SETTINGS_FILE=$(mktemp)
+cat $ROOT_PATH/src/Core/Settings.cpp $ROOT_PATH/src/Core/FormatFactorySettingsDeclaration.h | grep "M(" | awk '{print substr($2, 0, length($2) - 1) " " substr($1, 3, length($1) - 3) " SettingsDeclaration" }' > ${SETTINGS_FILE}
+find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | xargs grep "extern const Settings" -T | awk '{print substr($5, 0, length($5) -1) " " substr($4, 9) " " substr($1, 0, length($1) - 1)}' >> ${SETTINGS_FILE}
+
+# Duplicate extern declarations for settings
+awk '{if (seen[$0]++) print $3 " -> " $1 ;}' ${SETTINGS_FILE} | while read line;
+do
+    echo "Found duplicated setting declaration in: $line"
+done
+
+# Incorrect declarations for settings
+for setting in $(awk '{print $1 " " $2}' ${SETTINGS_FILE} | sort | uniq | awk '{ print $1 }' | sort | uniq -d);
+do
+    expected=$(grep "^$setting " ${SETTINGS_FILE} | grep SettingsDeclaration | awk '{ print $2 }')
+    grep "^$setting " ${SETTINGS_FILE} | grep -v " $expected" | awk '{ print $3 " found setting " $1 " with type " $2 }' | while read line;
+    do
+        echo "In $line but it should be $expected"
+    done
+done
+
+rm ${SETTINGS_FILE}
+
+# Unused/Undefined/Duplicates ErrorCodes/ProfileEvents/CurrentMetrics
+declare -A EXTERN_TYPES
+EXTERN_TYPES[ErrorCodes]=int
+EXTERN_TYPES[ProfileEvents]=Event
+EXTERN_TYPES[CurrentMetrics]=Metric
+
+EXTERN_TYPES_EXCLUDES=(
+    ProfileEvents::global_counters
+    ProfileEvents::Event
+    ProfileEvents::Count
+    ProfileEvents::Counters
+    ProfileEvents::end
+    ProfileEvents::increment
+    ProfileEvents::incrementForLogMessage
+    ProfileEvents::getName
+    ProfileEvents::Timer
+    ProfileEvents::Type
+    ProfileEvents::TypeEnum
+    ProfileEvents::dumpToMapColumn
+    ProfileEvents::getProfileEvents
+    ProfileEvents::ThreadIdToCountersSnapshot
+    ProfileEvents::LOCAL_NAME
+    ProfileEvents::keeper_profile_events
+    ProfileEvents::CountersIncrement
+
+    CurrentMetrics::add
+    CurrentMetrics::sub
+    CurrentMetrics::get
+    CurrentMetrics::set
+    CurrentMetrics::end
+    CurrentMetrics::Increment
+    CurrentMetrics::Metric
+    CurrentMetrics::values
+    CurrentMetrics::Value
+    CurrentMetrics::keeper_metrics
+
+    ErrorCodes::ErrorCode
+    ErrorCodes::getName
+    ErrorCodes::increment
+    ErrorCodes::end
+    ErrorCodes::values
+    ErrorCodes::values[i]
+    ErrorCodes::getErrorCodeByName
+    ErrorCodes::Value
+)
+for extern_type in ${!EXTERN_TYPES[@]}; do
+    type_of_extern=${EXTERN_TYPES[$extern_type]}
+    allowed_chars='[_A-Za-z]+'
+
+    # Unused
+    # NOTE: to fix automatically, replace echo with:
+    # sed -i "/extern const $type_of_extern $val/d" $file
+    find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | {
+        # NOTE: the check is pretty dumb and distinguishes only by the type_of_extern,
+        # and this matches with zkutil::CreateMode
+        grep -v -e 
'src/Common/ZooKeeper/Types.h' -e 'src/Coordination/KeeperConstants.cpp' + } | { + grep -vP $EXCLUDE_DIRS | xargs grep -l -P "extern const $type_of_extern $allowed_chars" + } | while read file; do + grep -P "extern const $type_of_extern $allowed_chars;" $file | sed -r -e "s/^.*?extern const $type_of_extern ($allowed_chars);.*?$/\1/" | while read val; do + if ! grep -q "$extern_type::$val" $file; then + # Excludes for SOFTWARE_EVENT/HARDWARE_EVENT/CACHE_EVENT in ThreadProfileEvents.cpp + if [[ ! $extern_type::$val =~ ProfileEvents::Perf.* ]]; then + echo "$extern_type::$val is defined but not used in file $file" + fi + fi + done + done + + # Undefined + # NOTE: to fix automatically, replace echo with: + # ( grep -q -F 'namespace $extern_type' $file && \ + # sed -i -r "0,/(\s*)extern const $type_of_extern [$allowed_chars]+/s//\1extern const $type_of_extern $val;\n&/" $file || \ + # awk '{ print; if (ns == 1) { ns = 2 }; if (ns == 2) { ns = 0; print "namespace $extern_type\n{\n extern const $type_of_extern '$val';\n}" } }; /namespace DB/ { ns = 1; };' < $file > ${file}.tmp && mv ${file}.tmp $file ) + find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | { + grep -vP $EXCLUDE_DIRS | xargs grep -l -P "$extern_type::$allowed_chars" + } | while read file; do + grep -P "$extern_type::$allowed_chars" $file | grep -P -v '^\s*//' | sed -r -e "s/^.*?$extern_type::($allowed_chars).*?$/\1/" | while read val; do + if ! grep -q "extern const $type_of_extern $val" $file; then + if ! in_array "$extern_type::$val" "${EXTERN_TYPES_EXCLUDES[@]}"; then + echo "$extern_type::$val is used in file $file but not defined" + fi + fi + done + done + + # Duplicates + find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | { + grep -vP $EXCLUDE_DIRS | xargs grep -l -P "$extern_type::$allowed_chars" + } | while read file; do + grep -P "extern const $type_of_extern $allowed_chars;" $file | sort | uniq -c | grep -v -P ' +1 ' && echo "Duplicate $extern_type in file $file" + done +done + +# Three or more consecutive empty lines +find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | + grep -vP $EXCLUDE_DIRS | + while read file; do awk '/^$/ { ++i; if (i > 2) { print "More than two consecutive empty lines in file '$file'" } } /./ { i = 0 }' $file; done + +# Check that every header file has #pragma once in first line +find $ROOT_PATH/{src,programs,utils} -name '*.h' | + grep -vP $EXCLUDE_DIRS | + while read file; do [[ $(head -n1 $file) != '#pragma once' ]] && echo "File $file must have '#pragma once' in first line"; done + +# Too many exclamation marks +find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | + grep -vP $EXCLUDE_DIRS | + xargs grep -F '!!!' | grep -P '.' && echo "Too many exclamation marks (looks dirty, unconfident)." + +# Exclamation mark in a message +find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | + grep -vP $EXCLUDE_DIRS | + xargs grep -F '!",' | grep -P '.' && echo "No need for an exclamation mark (looks dirty, unconfident)." + +# Trailing whitespaces +find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | + grep -vP $EXCLUDE_DIRS | + xargs grep -n -P ' $' | grep -n -P '.' && echo "^ Trailing whitespaces." 
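+
+# Most checks in this file follow the same pattern: enumerate the sources, drop
+# the excluded directories, grep for a forbidden construct, and let any non-empty
+# match set trigger the explanatory echo. A new check is a one-liner in the same
+# style; for example (hypothetical, not a check this script actually performs):
+#
+#   find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' |
+#       grep -vP $EXCLUDE_DIRS |
+#       xargs grep -F 'usleep(' && echo "Prefer sleepForMicroseconds() to raw usleep()."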
+
+# Forbid stringstream because it's easy to use incorrectly and hard to debug possible issues
+find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' |
+    grep -vP $EXCLUDE_DIRS |
+    xargs grep -P 'std::[io]?stringstream' | grep -v "STYLE_CHECK_ALLOW_STD_STRING_STREAM" && echo "Use WriteBufferFromOwnString or ReadBufferFromString instead of std::stringstream"
+
+# Forbid std::cerr/std::cout in src (fine in programs/utils)
+std_cerr_cout_excludes=(
+    /examples/
+    /tests/
+    _fuzzer
+    # OK
+    src/Common/ProgressIndication.cpp
+    # only under #ifdef DBMS_HASH_MAP_DEBUG_RESIZES, that is used only in tests
+    src/Common/HashTable/HashTable.h
+    # SensitiveDataMasker::printStats()
+    src/Common/SensitiveDataMasker.cpp
+    # StreamStatistics::print()
+    src/Compression/LZ4_decompress_faster.cpp
+    # ContextSharedPart with subsequent std::terminate()
+    src/Interpreters/Context.cpp
+    # IProcessor::dump()
+    src/Processors/IProcessor.cpp
+    src/Client/ClientApplicationBase.cpp
+    src/Client/ClientBase.cpp
+    src/Client/LineReader.cpp
+    src/Client/QueryFuzzer.cpp
+    src/Client/Suggest.cpp
+    src/Client/ClientBase.h
+    src/Client/LineReader.h
+    src/Client/ReplxxLineReader.h
+    src/Bridge/IBridge.cpp
+    src/Daemon/BaseDaemon.cpp
+    src/Loggers/Loggers.cpp
+    src/Common/GWPAsan.cpp
+    src/Common/ProgressIndication.h
+)
+sources_with_std_cerr_cout=( $(
+    find $ROOT_PATH/{src,base} -name '*.h' -or -name '*.cpp' | \
+        grep -vP $EXCLUDE_DIRS | \
+        grep -F -v $(printf -- "-e %s " "${std_cerr_cout_excludes[@]}") | \
+        xargs grep -F --with-filename -e std::cerr -e std::cout | cut -d: -f1 | sort -u
+) )
+
+# Exclude comments
+for src in "${sources_with_std_cerr_cout[@]}"; do
+    # suppress stderr, since it may contain a warning for #pragma once in headers
+    if gcc -fpreprocessed -dD -E "$src" 2>/dev/null | grep -F -q -e std::cerr -e std::cout; then
+        echo "$src: uses std::cerr/std::cout"
+    fi
+done
+
+expect_tests=( $(find $ROOT_PATH/tests/queries -name '*.expect') )
+for test_case in "${expect_tests[@]}"; do
+    pattern="^exp_internal -f \$CLICKHOUSE_TMP/\$basename.debuglog 0$"
+    grep -q "$pattern" "$test_case" || echo "Missing '$pattern' in '$test_case'"
+
+    if grep -q "^spawn.*CLICKHOUSE_CLIENT_BINARY$" "$test_case"; then
+        pattern="^spawn.*CLICKHOUSE_CLIENT_BINARY.*--history_file$"
+        grep -q "$pattern" "$test_case" || echo "Missing '$pattern' in '$test_case'"
+    fi
+
+    # Otherwise expect_after/expect_before will not bail without stdin attached
+    # (and actually this is a hack anyway, the correct way is to use $any_spawn_id)
+    pattern="-i \$any_spawn_id timeout"
+    grep -q -- "$pattern" "$test_case" || echo "Missing '$pattern' in '$test_case'"
+    pattern="-i \$any_spawn_id eof"
+    grep -q -- "$pattern" "$test_case" || echo "Missing '$pattern' in '$test_case'"
+done
+
+# Forbid non-unique error codes
+if [[ "$(grep -Po "M\([0-9]*," $ROOT_PATH/src/Common/ErrorCodes.cpp | wc -l)" != "$(grep -Po "M\([0-9]*," $ROOT_PATH/src/Common/ErrorCodes.cpp | sort | uniq | wc -l)" ]]
+then
+    echo "ErrorCodes.cpp contains non-unique error codes"
+fi
+
+# Check that there are no system-wide libraries/headers in use.
+#
+# NOTE: it is better to override find_path/find_library in cmake, but right now
+# it is not possible, see [1] for the reference.
+#
+# [1]: git grep --recurse-submodules -e find_library -e find_path contrib
+if git grep -e find_path -e find_library -- :**CMakeLists.txt; then
+    echo "There is find_path/find_library usage. ClickHouse should use everything bundled. Consider adding one more contrib module."
+fi + +# Forbid std::filesystem::is_symlink and std::filesystem::read_symlink, because it's easy to use them incorrectly +find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | + grep -vP $EXCLUDE_DIRS | + xargs grep -P '::(is|read)_symlink' | grep -v "STYLE_CHECK_ALLOW_STD_FS_SYMLINK" && echo "Use DB::FS::isSymlink and DB::FS::readSymlink instead" + +# Forbid __builtin_unreachable(), because it's hard to debug when it becomes reachable +find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | + grep -vP $EXCLUDE_DIRS | + xargs grep -P '__builtin_unreachable' && echo "Use UNREACHABLE() from defines.h instead" + +# Forbid mt19937() and random_device() which are outdated and slow +find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | + grep -vP $EXCLUDE_DIRS | + xargs grep -P '(std::mt19937|std::mersenne_twister_engine|std::random_device)' && echo "Use pcg64_fast (from pcg_random.h) and randomSeed (from Common/randomSeed.h) instead" + +# Require checking return value of close(), +# since it can hide fd misuse and break other places. +find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | + grep -vP $EXCLUDE_DIRS | + xargs grep -e ' close(.*fd' -e ' ::close(' | grep -v = && echo "Return value of close() should be checked" + +# A small typo can lead to debug code in release builds, see https://github.com/ClickHouse/ClickHouse/pull/47647 +find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | xargs grep -l -F '#ifdef NDEBUG' | xargs -I@FILE awk '/#ifdef NDEBUG/ { inside = 1; dirty = 1 } /#endif/ { if (inside && dirty) { print "File @FILE has suspicious #ifdef NDEBUG, possibly confused with #ifndef NDEBUG" }; inside = 0 } /#else/ { dirty = 0 }' @FILE + +# If a user is doing dynamic or typeid cast with a pointer, and immediately dereferencing it, it is unsafe. +find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | xargs grep --line-number -P '(dynamic|typeid)_cast<[^>]+\*>\([^\(\)]+\)->' | grep -P '.' && echo "It's suspicious when you are doing a dynamic_cast or typeid_cast with a pointer and immediately dereferencing it. Use references instead of pointers or check a pointer to nullptr." + +# Check for bad punctuation: whitespace before comma. +find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | xargs grep -P --line-number '\w ,' | grep -v 'bad punctuation is ok here' && echo "^ There is bad punctuation: whitespace before comma. You should write it like this: 'Hello, world!'" + +# Check usage of std::regex which is too bloated and slow. +find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | xargs grep -P --line-number 'std::regex' | grep -P '.' && echo "^ Please use re2 instead of std::regex" + +# Cyrillic characters hiding inside Latin. +find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | grep -v StorageSystemContributors.generated.cpp | xargs grep -P --line-number '[a-zA-Z][а-яА-ЯёЁ]|[а-яА-ЯёЁ][a-zA-Z]' && echo "^ Cyrillic characters found in unexpected place." + +# Orphaned header files. +join -v1 <(find $ROOT_PATH/{src,programs,utils} -name '*.h' -printf '%f\n' | sort | uniq) <(find $ROOT_PATH/{src,programs,utils} -name '*.cpp' -or -name '*.c' -or -name '*.h' -or -name '*.S' | xargs grep --no-filename -o -P '[\w-]+\.h' | sort | uniq) | + grep . && echo '^ Found orphan header files.' + +# Don't allow dynamic compiler check with CMake, because we are using hermetic, reproducible, cross-compiled, static (TLDR, good) builds. 
+ls -1d $ROOT_PATH/contrib/*-cmake | xargs -I@ find @ -name 'CMakeLists.txt' -or -name '*.cmake' | xargs grep --with-filename -i -P 'check_c_compiler_flag|check_cxx_compiler_flag|check_c_source_compiles|check_cxx_source_compiles|check_include_file|check_symbol_exists|cmake_push_check_state|cmake_pop_check_state|find_package|CMAKE_REQUIRED_FLAGS|CheckIncludeFile|CheckCCompilerFlag|CheckCXXCompilerFlag|CheckCSourceCompiles|CheckCXXSourceCompiles|CheckCSymbolExists|CheckCXXSymbolExists' | grep -v Rust && echo "^ It's not allowed to have dynamic compiler checks with CMake." + +# Wrong spelling of abbreviations, e.g. SQL is right, Sql is wrong. XMLHttpRequest is very wrong. +find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | + grep -vP $EXCLUDE_DIRS | + xargs grep -P 'Sql|Html|Xml|Cpu|Tcp|Udp|Http|Db|Json|Yaml' | grep -v -P 'RabbitMQ|Azure|Aws|aws|Avro|IO/S3' && + echo "Abbreviations such as SQL, XML, HTTP, should be in all caps. For example, SQL is right, Sql is wrong. XMLHttpRequest is very wrong." + +find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | + grep -vP $EXCLUDE_DIRS | + xargs grep -F -i 'ErrorCodes::LOGICAL_ERROR, "Logical error:' && + echo "If an exception has LOGICAL_ERROR code, there is no need to include the text 'Logical error' in the exception message, because then the phrase 'Logical error' will be printed twice." + +# There shouldn't be any code snippets under GPL or LGPL +find $ROOT_PATH/{src,base,programs} -name '*.h' -or -name '*.cpp' 2>/dev/null | xargs grep -i -F 'General Public License' && echo "There shouldn't be any code snippets under GPL or LGPL" + +PATTERN="allow_"; +DIFF=$(comm -3 <(grep -o "\b$PATTERN\w*\b" $ROOT_PATH/src/Core/Settings.cpp | sort -u) <(grep -o -h "\b$PATTERN\w*\b" $ROOT_PATH/src/Databases/enableAllExperimentalSettings.cpp $ROOT_PATH/utils/check-style/experimental_settings_ignore.txt | sort -u)); +[ -n "$DIFF" ] && echo "$DIFF" && echo "^^ Detected 'allow_*' settings that might need to be included in src/Databases/enableAllExperimentalSettings.cpp" && echo "Alternatively, consider adding an exception to utils/check-style/experimental_settings_ignore.txt" diff --git a/ci_v2/jobs/scripts/check_style/check_submodules.sh b/ci_v2/jobs/scripts/check_style/check_submodules.sh new file mode 100755 index 00000000000..eeb893b9615 --- /dev/null +++ b/ci_v2/jobs/scripts/check_style/check_submodules.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +# The script checks if all submodules defined in $GIT_ROOT/.gitmodules exist in $GIT_ROOT/contrib + +set -e + +GIT_ROOT="." + +cd "$GIT_ROOT" + +# Remove keys for submodule.*.path parameters, the values are separated by \0 +# and check if the directory exists +git config --file .gitmodules --null --get-regexp path | sed -z 's|.*\n||' | \ + xargs -P100 -0 --no-run-if-empty -I{} bash -c 'if ! 
test -d '"'{}'"'; then echo Directory for submodule {} is not found; exit 1; fi' 2>&1
+
+
+# And check that the submodule is fine
+git config --file .gitmodules --null --get-regexp path | sed -z 's|.*\n||' | \
+    xargs -P100 -0 --no-run-if-empty -I{} git submodule status -q '{}' 2>&1
+
+
+# All submodules should be from https://github.com/
+git config --file "$GIT_ROOT/.gitmodules" --get-regexp 'submodule\..+\.url' | \
+while read -r line; do
+    name=${line#submodule.}; name=${name%.url*}
+    url=${line#* }
+    [[ "$url" != 'https://github.com/'* ]] && echo "All submodules should be from https://github.com/, submodule '$name' has '$url'"
+done
+
+# All submodules should be of this form: [submodule "contrib/libxyz"] (for consistency; the exact submodule name doesn't matter too much)
+# - restrict the check to top-level .gitmodules file
+git config --file "$GIT_ROOT/.gitmodules" --get-regexp 'submodule\..+\.path' | \
+while read -r line; do
+    name=${line#submodule.}; name=${name%.path*}
+    path=${line#* }
+    [ "$name" != "$path" ] && echo "Submodule name '$name' is not equal to its path '$path'"
+done
diff --git a/ci_v2/jobs/scripts/check_style/check_typos.sh b/ci_v2/jobs/scripts/check_style/check_typos.sh
new file mode 100755
index 00000000000..764101a6eac
--- /dev/null
+++ b/ci_v2/jobs/scripts/check_style/check_typos.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+
+# Check for typos in code.
+
+ROOT_PATH="."
+
+#FIXME: check the whole (or almost the whole) repo
+codespell \
+    --skip "*generated*,*gperf*,*.bin,*.mrk*,*.idx,checksums.txt,*.dat,*.pyc,*.kate-swp,*obfuscateQueries.cpp,d3-*.js,*.min.js,*.sum,${ROOT_PATH}/utils/check-style/aspell-ignore" \
+    --ignore-words "${ROOT_PATH}/utils/check-style/codespell-ignore-words.list" \
+    --exclude-file "${ROOT_PATH}/utils/check-style/codespell-ignore-lines.list" \
+    --quiet-level 2 \
+    "$ROOT_PATH"/{src,base,programs,utils} \
+    $@ | grep -P '.' \
+    && echo -e "\nFound some typos in code.\nSee the files utils/check-style/codespell* if you want to add an exception."
diff --git a/ci_v2/jobs/scripts/check_style/checks_to_refactor.sh b/ci_v2/jobs/scripts/check_style/checks_to_refactor.sh
new file mode 100755
index 00000000000..ae4aae23c12
--- /dev/null
+++ b/ci_v2/jobs/scripts/check_style/checks_to_refactor.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+
+ROOT_PATH="."
+
+# Queries to system.query_log/system.query_thread_log should have current_database = currentDatabase() condition
+# NOTE: it is not that accurate, but at least something.
+tests_with_query_log=( $(
+    find $ROOT_PATH/tests/queries -iname '*.sql' -or -iname '*.sh' -or -iname '*.py' -or -iname '*.j2' |
+        xargs grep --with-filename -e system.query_log -e system.query_thread_log | cut -d: -f1 | sort -u
+) )
+for test_case in "${tests_with_query_log[@]}"; do
+    grep -qE current_database.*currentDatabase "$test_case" || {
+        grep -qE 'current_database.*\$CLICKHOUSE_DATABASE' "$test_case"
+    } || echo "Queries to system.query_log/system.query_thread_log do not have current_database = currentDatabase() condition in $test_case"
+done
+
+grep -iE 'SYSTEM STOP MERGES;?$' -R $ROOT_PATH/tests/queries && echo "Merges cannot be disabled globally in fast/stateful/stateless tests, because it will break concurrently running queries"
+
+
+# Queries to:
+tables_with_database_column=(
+    system.tables
+    system.parts
+    system.detached_parts
+    system.parts_columns
+    system.columns
+    system.projection_parts
+    system.mutations
+)
+# should have database = currentDatabase() condition
+#
+# NOTE: it is not that accurate, but at least something.
+tests_with_database_column=( $(
+    find $ROOT_PATH/tests/queries -iname '*.sql' -or -iname '*.sh' -or -iname '*.py' -or -iname '*.j2' |
+        xargs grep --with-filename $(printf -- "-e %s " "${tables_with_database_column[@]}") |
+        grep -v -e ':--' -e ':#' |
+        cut -d: -f1 | sort -u
+) )
+for test_case in "${tests_with_database_column[@]}"; do
+    grep -qE database.*currentDatabase "$test_case" || {
+        grep -qE 'database.*\$CLICKHOUSE_DATABASE' "$test_case"
+    } || {
+        # explicit database
+        grep -qE "database[ ]*=[ ]*'" "$test_case"
+    } || {
+        echo "Queries to ${tables_with_database_column[*]} do not have database = currentDatabase()/\$CLICKHOUSE_DATABASE condition in $test_case"
+    }
+done
+
+# Queries with ReplicatedMergeTree
+# NOTE: it is not that accurate, but at least something.
+tests_with_replicated_merge_tree=( $(
+    find $ROOT_PATH/tests/queries -iname '*.sql' -or -iname '*.sh' -or -iname '*.py' -or -iname '*.j2' |
+        xargs grep --with-filename -e "Replicated.*MergeTree[ ]*(.*" | cut -d: -f1 | sort -u
+) )
+for test_case in "${tests_with_replicated_merge_tree[@]}"; do
+    case "$test_case" in
+        *.gen.*)
+            ;;
+        *.sh)
+            test_case_zk_prefix="\(\$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX\|{database}\)"
+            grep -q -e "Replicated.*MergeTree[ ]*(.*$test_case_zk_prefix" "$test_case" || echo "Replicated.*MergeTree should contain '$test_case_zk_prefix' in zookeeper path to avoid overlaps ($test_case)"
+            ;;
+        *.sql|*.sql.j2)
+            test_case_zk_prefix="\({database}\|currentDatabase()\|{uuid}\|{default_path_test}\)"
+            grep -q -e "Replicated.*MergeTree[ ]*(.*$test_case_zk_prefix" "$test_case" || echo "Replicated.*MergeTree should contain '$test_case_zk_prefix' in zookeeper path to avoid overlaps ($test_case)"
+            ;;
+        *.py)
+            # Right now there are no such tests anyway
+            echo "No ReplicatedMergeTree style check for *.py ($test_case)"
+            ;;
+    esac
+done
+
+# The stateful directory should only contain the tests that depend on the test dataset (hits or visits).
+find $ROOT_PATH/tests/queries/1_stateful -name '*.sql' -or -name '*.sh' | grep -v '00076_system_columns_bytes' | xargs -I{} bash -c 'grep -q -P "hits|visits" "{}" || echo "The test {} does not depend on the test dataset (hits or visits table) and should be located in the 0_stateless directory. You can also add an exception to the check-style script."'
+
+# Check for existence of __init__.py files
+for i in "${ROOT_PATH}"/tests/integration/test_*; do FILE="${i}/__init__.py"; [ ! -f "${FILE}" ] && echo "${FILE} should exist for every integration test"; done
+
+# Check for executable bit on non-executable files
+find $ROOT_PATH/{src,base,programs,utils,tests,docs,cmake} '(' -name '*.cpp' -or -name '*.h' -or -name '*.sql' -or -name '*.j2' -or -name '*.xml' -or -name '*.reference' -or -name '*.txt' -or -name '*.md' ')' -and -executable | grep -P '.' && echo "These files should not be executable."
+
+# Check for BOM
+find $ROOT_PATH/{src,base,programs,utils,tests,docs,cmake} -name '*.md' -or -name '*.cpp' -or -name '*.h' | xargs grep -l -F $'\xEF\xBB\xBF' | grep -P '.' && echo "Files should not have UTF-8 BOM"
+find $ROOT_PATH/{src,base,programs,utils,tests,docs,cmake} -name '*.md' -or -name '*.cpp' -or -name '*.h' | xargs grep -l -F $'\xFF\xFE' | grep -P '.' && echo "Files should not have UTF-16LE BOM"
+find $ROOT_PATH/{src,base,programs,utils,tests,docs,cmake} -name '*.md' -or -name '*.cpp' -or -name '*.h' | xargs grep -l -F $'\xFE\xFF' | grep -P '.' && echo "Files should not have UTF-16BE BOM"
&& echo "Files should not have UTF-16BE BOM" + +# Conflict markers +find $ROOT_PATH/{src,base,programs,utils,tests,docs,cmake} -name '*.md' -or -name '*.cpp' -or -name '*.h' | + xargs grep -P '^(<<<<<<<|=======|>>>>>>>)$' | grep -P '.' && echo "Conflict markers are found in files" + +# DOS/Windows newlines +find $ROOT_PATH/{base,src,programs,utils,docs} -name '*.md' -or -name '*.h' -or -name '*.cpp' -or -name '*.js' -or -name '*.py' -or -name '*.html' | xargs grep -l -P '\r$' && echo "^ Files contain DOS/Windows newlines (\r\n instead of \n)." + +# # workflows check +# act --list --directory="$ROOT_PATH" 1>/dev/null 2>&1 || act --list --directory="$ROOT_PATH" 2>&1 +# actionlint -ignore 'reusable workflow call.+' || : diff --git a/ci_v2/jobs/scripts/check_style/double_whitespaces.pl b/ci_v2/jobs/scripts/check_style/double_whitespaces.pl new file mode 100755 index 00000000000..daeddecbd27 --- /dev/null +++ b/ci_v2/jobs/scripts/check_style/double_whitespaces.pl @@ -0,0 +1,37 @@ +#!/usr/bin/perl + +use strict; + +# Find double whitespace such as "a, b, c" that looks very ugly and annoying. +# But skip double whitespaces if they are used as an alignment - by comparing to surrounding lines. + +my $ret = 0; + +foreach my $file (@ARGV) +{ + my @array; + + open (FH,'<',$file); + while () + { + push @array, $_; + } + + for (my $i = 1; $i < $#array; ++$i) + { + if ($array[$i] =~ ',( {2,3})[^ /]') + { + # https://stackoverflow.com/questions/87380/how-can-i-find-the-location-of-a-regex-match-in-perl + + if ((substr($array[$i - 1], $+[1] - 1, 2) !~ /^[ -][^ ]$/) # whitespaces are not part of alignment + && (substr($array[$i + 1], $+[1] - 1, 2) !~ /^[ -][^ ]$/) + && $array[$i] !~ /(-?\d+\w*,\s+){3,}/) # this is not a number table like { 10, -1, 2 } + { + print($file . ":" . ($i + 1) . 
$array[$i]); + $ret = 1; + } + } + } +} + +exit $ret; diff --git a/ci_v2/settings/definitions.py b/ci_v2/settings/definitions.py new file mode 100644 index 00000000000..87669cdcf25 --- /dev/null +++ b/ci_v2/settings/definitions.py @@ -0,0 +1,251 @@ +from praktika import Docker, Secret + +S3_BUCKET_NAME = "clickhouse-builds" +S3_BUCKET_HTTP_ENDPOINT = "clickhouse-builds.s3.amazonaws.com" + + +class RunnerLabels: + CI_SERVICES = "ci_services" + CI_SERVICES_EBS = "ci_services_ebs" + + +BASE_BRANCH = "master" + +SECRETS = [ + Secret.Config( + name="dockerhub_robot_password", + type=Secret.Type.AWS_SSM_VAR, + ), + Secret.Config( + name="woolenwolf_gh_app.clickhouse-app-id", + type=Secret.Type.AWS_SSM_SECRET, + ), + Secret.Config( + name="woolenwolf_gh_app.clickhouse-app-key", + type=Secret.Type.AWS_SSM_SECRET, + ), +] + +DOCKERS = [ + # Docker.Config( + # name="clickhouse/binary-builder", + # path="./docker/packager/binary-builder", + # arm64=True, + # amd64=True, + # depends_on=[], + # ), + # Docker.Config( + # name="clickhouse/cctools", + # path="./docker/packager/cctools", + # arm64=True, + # amd64=True, + # depends_on=[], + # ), + # Docker.Config( + # name="clickhouse/test-old-centos", + # path="./docker/test/compatibility/centos", + # arm64=True, + # amd64=True, + # depends_on=[], + # ), + # Docker.Config( + # name="clickhouse/test-old-ubuntu", + # path="./docker/test/compatibility/ubuntu", + # arm64=True, + # amd64=True, + # depends_on=[], + # ), + # Docker.Config( + # name="clickhouse/test-util", + # path="./docker/test/util", + # arm64=True, + # amd64=True, + # depends_on=[], + # ), + # Docker.Config( + # name="clickhouse/integration-test", + # path="./docker/test/integration/base", + # arm64=True, + # amd64=True, + # depends_on=["clickhouse/test-base"], + # ), + # Docker.Config( + # name="clickhouse/fuzzer", + # path="./docker/test/fuzzer", + # arm64=True, + # amd64=True, + # depends_on=["clickhouse/test-base"], + # ), + # Docker.Config( + # name="clickhouse/performance-comparison", + # path="./docker/test/performance-comparison", + # arm64=True, + # amd64=True, + # depends_on=[], + # ), + # Docker.Config( + # name="clickhouse/fasttest", + # path="./docker/test/fasttest", + # arm64=True, + # amd64=True, + # depends_on=["clickhouse/test-util"], + # ), + # Docker.Config( + # name="clickhouse/test-base", + # path="./docker/test/base", + # arm64=True, + # amd64=True, + # depends_on=["clickhouse/test-util"], + # ), + # Docker.Config( + # name="clickhouse/clickbench", + # path="./docker/test/clickbench", + # arm64=True, + # amd64=True, + # depends_on=["clickhouse/test-base"], + # ), + # Docker.Config( + # name="clickhouse/keeper-jepsen-test", + # path="./docker/test/keeper-jepsen", + # arm64=True, + # amd64=True, + # depends_on=["clickhouse/test-base"], + # ), + # Docker.Config( + # name="clickhouse/server-jepsen-test", + # path="./docker/test/server-jepsen", + # arm64=True, + # amd64=True, + # depends_on=["clickhouse/test-base"], + # ), + # Docker.Config( + # name="clickhouse/sqllogic-test", + # path="./docker/test/sqllogic", + # arm64=True, + # amd64=True, + # depends_on=["clickhouse/test-base"], + # ), + # Docker.Config( + # name="clickhouse/sqltest", + # path="./docker/test/sqltest", + # arm64=True, + # amd64=True, + # depends_on=["clickhouse/test-base"], + # ), + # Docker.Config( + # name="clickhouse/stateless-test", + # path="./docker/test/stateless", + # arm64=True, + # amd64=True, + # depends_on=["clickhouse/test-base"], + # ), + # Docker.Config( + # name="clickhouse/stateful-test", 
+ # path="./docker/test/stateful", + # arm64=True, + # amd64=True, + # depends_on=["clickhouse/stateless-test"], + # ), + # Docker.Config( + # name="clickhouse/stress-test", + # path="./docker/test/stress", + # arm64=True, + # amd64=True, + # depends_on=["clickhouse/stateful-test"], + # ), + # Docker.Config( + # name="clickhouse/unit-test", + # path="./docker/test/unit", + # arm64=True, + # amd64=True, + # depends_on=["clickhouse/test-base"], + # ), + # Docker.Config( + # name="clickhouse/integration-tests-runner", + # path="./docker/test/integration/runner", + # arm64=True, + # amd64=True, + # depends_on=["clickhouse/test-base"], + # ), + Docker.Config( + name="clickhouse/style-test", + path="./ci_v2/docker/style-test", + platforms=Docker.Platforms.arm_amd, + depends_on=[], + ), + # Docker.Config( + # name="clickhouse/docs-builder", + # path="./docker/docs/builder", + # arm64=True, + # amd64=True, + # depends_on=["clickhouse/test-base"], + # ), +] + +# TODO: +# "docker/test/integration/s3_proxy": { +# "name": "clickhouse/s3-proxy", +# "dependent": [] +# }, +# "docker/test/integration/resolver": { +# "name": "clickhouse/python-bottle", +# "dependent": [] +# }, +# "docker/test/integration/helper_container": { +# "name": "clickhouse/integration-helper", +# "dependent": [] +# }, +# "docker/test/integration/mysql_golang_client": { +# "name": "clickhouse/mysql-golang-client", +# "dependent": [] +# }, +# "docker/test/integration/dotnet_client": { +# "name": "clickhouse/dotnet-client", +# "dependent": [] +# }, +# "docker/test/integration/mysql_java_client": { +# "name": "clickhouse/mysql-java-client", +# "dependent": [] +# }, +# "docker/test/integration/mysql_js_client": { +# "name": "clickhouse/mysql-js-client", +# "dependent": [] +# }, +# "docker/test/integration/mysql_php_client": { +# "name": "clickhouse/mysql-php-client", +# "dependent": [] +# }, +# "docker/test/integration/postgresql_java_client": { +# "name": "clickhouse/postgresql-java-client", +# "dependent": [] +# }, +# "docker/test/integration/kerberos_kdc": { +# "only_amd64": true, +# "name": "clickhouse/kerberos-kdc", +# "dependent": [] +# }, +# "docker/test/integration/kerberized_hadoop": { +# "only_amd64": true, +# "name": "clickhouse/kerberized-hadoop", +# "dependent": [] +# }, +# "docker/test/sqlancer": { +# "name": "clickhouse/sqlancer-test", +# "dependent": [] +# }, +# "docker/test/install/deb": { +# "name": "clickhouse/install-deb-test", +# "dependent": [] +# }, +# "docker/test/install/rpm": { +# "name": "clickhouse/install-rpm-test", +# "dependent": [] +# }, +# "docker/test/integration/nginx_dav": { +# "name": "clickhouse/nginx-dav", +# "dependent": [] +# } + + +class JobNames: + STYLE_CHECK = "Style Check" diff --git a/ci_v2/settings/settings.py b/ci_v2/settings/settings.py new file mode 100644 index 00000000000..153aab93506 --- /dev/null +++ b/ci_v2/settings/settings.py @@ -0,0 +1,20 @@ +from ci_v2.settings.definitions import ( + S3_BUCKET_HTTP_ENDPOINT, + S3_BUCKET_NAME, + RunnerLabels, +) + +S3_ARTIFACT_PATH = f"{S3_BUCKET_NAME}/artifacts" +CI_CONFIG_RUNS_ON = [RunnerLabels.CI_SERVICES] +DOCKER_BUILD_RUNS_ON = [RunnerLabels.CI_SERVICES_EBS] +CACHE_S3_PATH = f"{S3_BUCKET_NAME}/ci_ch_cache" +HTML_S3_PATH = f"{S3_BUCKET_NAME}/reports" +S3_BUCKET_TO_HTTP_ENDPOINT = {S3_BUCKET_NAME: S3_BUCKET_HTTP_ENDPOINT} + +DOCKERHUB_USERNAME = "robotclickhouse" +DOCKERHUB_SECRET = "dockerhub_robot_password" + +CI_DB_DB_NAME = "default" +CI_DB_TABLE_NAME = "checks" + +INSTALL_PYTHON_REQS_FOR_NATIVE_JOBS = "" diff --git 
a/ci_v2/workflows/pull_request.py b/ci_v2/workflows/pull_request.py
new file mode 100644
index 00000000000..226455c77f2
--- /dev/null
+++ b/ci_v2/workflows/pull_request.py
@@ -0,0 +1,44 @@
+from typing import List
+
+from ci_v2.settings.definitions import (
+    BASE_BRANCH,
+    DOCKERS,
+    SECRETS,
+    JobNames,
+    RunnerLabels,
+)
+from praktika import Job, Workflow
+
+style_check_job = Job.Config(
+    name=JobNames.STYLE_CHECK,
+    runs_on=[RunnerLabels.CI_SERVICES],
+    command="python3 ./ci_v2/jobs/check_style.py",
+    run_in_docker="clickhouse/style-test",
+)
+
+workflow = Workflow.Config(
+    name="PR",
+    event=Workflow.Event.PULL_REQUEST,
+    base_branches=[BASE_BRANCH],
+    jobs=[
+        style_check_job,
+    ],
+    dockers=DOCKERS,
+    secrets=SECRETS,
+    enable_cache=True,
+    enable_report=True,
+    enable_merge_ready_status=True,
+)
+
+WORKFLOWS = [
+    workflow,
+]  # type: List[Workflow.Config]
+
+
+if __name__ == "__main__":
+    # example: local job test inside praktika environment
+    from praktika.runner import Runner
+
+    Runner.generate_dummy_environment(workflow, style_check_job)
+
+    Runner().run(workflow, style_check_job)
diff --git a/docs/en/getting-started/example-datasets/stackoverflow.md b/docs/en/getting-started/example-datasets/stackoverflow.md
index e982a3c3dfc..defe157cc52 100644
--- a/docs/en/getting-started/example-datasets/stackoverflow.md
+++ b/docs/en/getting-started/example-datasets/stackoverflow.md
@@ -7,7 +7,7 @@ description: Analyzing Stack Overflow data with ClickHouse

# Analyzing Stack Overflow data with ClickHouse

-This dataset contains every `Post`, `User`, `Vote`, `Comment`, `Badge, `PostHistory`, and `PostLink` that has occurred on Stack Overflow.
+This dataset contains every `Posts`, `Users`, `Votes`, `Comments`, `Badges`, `PostHistory`, and `PostLinks` record that has occurred on Stack Overflow.

Users can either download pre-prepared Parquet versions of the data, containing every post up to April 2024, or download the latest data in XML format and load this. Stack Overflow provide updates to this data periodically - historically every 3 months.

@@ -159,7 +159,7 @@ INSERT INTO stackoverflow.badges SELECT * FROM s3('https://datasets-documentatio
0 rows in set. Elapsed: 6.635 sec. Processed 51.29 million rows, 797.05 MB (7.73 million rows/s., 120.13 MB/s.)
```

-### `PostLinks`
+### PostLinks

```sql
CREATE TABLE stackoverflow.postlinks
@@ -178,7 +178,7 @@ INSERT INTO stackoverflow.postlinks SELECT * FROM s3('https://datasets-documenta
0 rows in set. Elapsed: 1.534 sec. Processed 6.55 million rows, 129.70 MB (4.27 million rows/s., 84.57 MB/s.)
```

-### `PostHistory`
+### PostHistory

```sql
CREATE TABLE stackoverflow.posthistory
diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md
index 93437f71dcc..d17d05165e8 100644
--- a/docs/en/sql-reference/functions/other-functions.md
+++ b/docs/en/sql-reference/functions/other-functions.md
@@ -2789,6 +2789,45 @@ Result:

- [Custom Settings](../../operations/settings/index.md#custom_settings)

+## getSettingOrDefault
+
+Returns the current value of a [custom setting](../../operations/settings/index.md#custom_settings), or the default value specified in the second argument if the custom setting is not set in the current profile.
+
+**Syntax**
+
+```sql
+getSettingOrDefault('custom_setting', default_value);
+```
+
+**Parameters**
+
+- `custom_setting` — The setting name. [String](../data-types/string.md).
+- `default_value` — The value to return if `custom_setting` is not set. May be of any data type or `Null`.
+
+**Returned value**
+
+- The setting's current value, or `default_value` if the setting is not set.
+
+**Example**
+
+```sql
+SELECT getSettingOrDefault('custom_undef1', 'my_value');
+SELECT getSettingOrDefault('custom_undef2', 100);
+SELECT getSettingOrDefault('custom_undef3', NULL);
+```
+
+Result:
+
+```
+my_value
+100
+NULL
+```
+
+**See Also**
+
+- [Custom Settings](../../operations/settings/index.md#custom_settings)
+
## isDecimalOverflow

Checks whether the [Decimal](../data-types/decimal.md) value is outside its precision or outside the specified precision.
diff --git a/docs/en/sql-reference/statements/exists.md b/docs/en/sql-reference/statements/exists.md
index 8195b34d71f..9490c7d10a5 100644
--- a/docs/en/sql-reference/statements/exists.md
+++ b/docs/en/sql-reference/statements/exists.md
@@ -7,7 +7,7 @@ sidebar_label: EXISTS

# EXISTS Statement

``` sql
-EXISTS [TEMPORARY] [TABLE|DICTIONARY] [db.]name [INTO OUTFILE filename] [FORMAT format]
+EXISTS [TEMPORARY] [TABLE|DICTIONARY|DATABASE] [db.]name [INTO OUTFILE filename] [FORMAT format]
```

Returns a single `UInt8`-type column, which contains the single value `0` if the table or database does not exist, or `1` if the table exists in the specified database.
diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp
index 9cf0e08e0ef..188dd2c019d 100644
--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@@ -36,7 +36,7 @@
#include
#include
#include
-#include <Common/getNumberOfPhysicalCPUCores.h>
+#include <Common/getNumberOfCPUCoresToUse.h>
#include
#include
#include
@@ -833,11 +833,13 @@ try

    const size_t physical_server_memory = getMemoryAmount();

-    LOG_INFO(log, "Available RAM: {}; logical cores: {}; used cores: {}.",
+    LOG_INFO(
+        log,
+        "Available RAM: {}; logical cores: {}; used cores: {}.",
        formatReadableSizeWithBinarySuffix(physical_server_memory),
        std::thread::hardware_concurrency(),
-        getNumberOfPhysicalCPUCores() // on ARM processors it can show only enabled at current moment cores
-    );
+        getNumberOfCPUCoresToUse() // on ARM processors it can show only enabled at current moment cores
+    );

#if defined(__x86_64__)
    String cpu_info;
@@ -1060,8 +1062,9 @@ try
        0, // We don't need any threads once all the parts will be deleted
        server_settings.max_parts_cleaning_thread_pool_size);

-    auto max_database_replicated_create_table_thread_pool_size = server_settings.max_database_replicated_create_table_thread_pool_size ?
-        server_settings.max_database_replicated_create_table_thread_pool_size : getNumberOfPhysicalCPUCores();
+    auto max_database_replicated_create_table_thread_pool_size = server_settings.max_database_replicated_create_table_thread_pool_size
+        ? server_settings.max_database_replicated_create_table_thread_pool_size
+        : getNumberOfCPUCoresToUse();
    getDatabaseReplicatedCreateTablesThreadPool().initialize(
        max_database_replicated_create_table_thread_pool_size,
        0, // We don't need any threads once all the tables will be created
@@ -1638,7 +1641,7 @@ try
            concurrent_threads_soft_limit = new_server_settings.concurrent_threads_soft_limit_num;
        if (new_server_settings.concurrent_threads_soft_limit_ratio_to_cores > 0)
        {
-            auto value = new_server_settings.concurrent_threads_soft_limit_ratio_to_cores * getNumberOfPhysicalCPUCores();
+            auto value = new_server_settings.concurrent_threads_soft_limit_ratio_to_cores * getNumberOfCPUCoresToUse();
            if (value > 0 && value < concurrent_threads_soft_limit)
                concurrent_threads_soft_limit = value;
        }
diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp
index b2dd41715db..155e65ff568 100644
--- a/src/Client/ClientBase.cpp
+++ b/src/Client/ClientBase.cpp
@@ -12,7 +12,7 @@
#include
#include
#include
-#include <Common/getNumberOfPhysicalCPUCores.h>
+#include <Common/getNumberOfCPUCoresToUse.h>
#include
#include
#include
@@ -1630,7 +1630,7 @@ void ClientBase::sendData(Block & sample, const ColumnsDescription & columns_des
        client_context,
        {},
        client_context->getSettingsRef()[Setting::max_block_size],
-        getNumberOfPhysicalCPUCores());
+        getNumberOfCPUCoresToUse());

    auto builder = plan.buildQueryPipeline(
        QueryPlanOptimizationSettings::fromContext(client_context),
diff --git a/src/Common/AsyncLoader.cpp b/src/Common/AsyncLoader.cpp
index d40e320e741..9e928110ec9 100644
--- a/src/Common/AsyncLoader.cpp
+++ b/src/Common/AsyncLoader.cpp
@@ -12,7 +12,7 @@
#include
#include
#include
-#include <Common/getNumberOfPhysicalCPUCores.h>
+#include <Common/getNumberOfCPUCoresToUse.h>
#include
#include

@@ -49,14 +49,14 @@ void logAboutProgress(LoggerPtr log, size_t processed, size_t total, AtomicStopw
AsyncLoader::Pool::Pool(const AsyncLoader::PoolInitializer & init)
    : name(init.name)
    , priority(init.priority)
-    , max_threads(init.max_threads > 0 ? init.max_threads : getNumberOfPhysicalCPUCores())
+    , max_threads(init.max_threads > 0 ? init.max_threads : getNumberOfCPUCoresToUse())
    , thread_pool(std::make_unique<ThreadPool>(
-          init.metric_threads,
-          init.metric_active_threads,
-          init.metric_scheduled_threads,
-          /* max_threads = */ std::numeric_limits<size_t>::max(), // Unlimited number of threads, we do worker management ourselves
-          /* max_free_threads = */ 0, // We do not require free threads
-          /* queue_size = */0)) // Unlimited queue to avoid blocking during worker spawning
+        init.metric_threads,
+        init.metric_active_threads,
+        init.metric_scheduled_threads,
+        /* max_threads = */ std::numeric_limits<size_t>::max(), // Unlimited number of threads, we do worker management ourselves
+        /* max_free_threads = */ 0, // We do not require free threads
+        /* queue_size = */ 0)) // Unlimited queue to avoid blocking during worker spawning
{}

AsyncLoader::Pool::Pool(Pool&& o) noexcept
@@ -491,7 +491,7 @@ void AsyncLoader::remove(const LoadJobSet & jobs)
void AsyncLoader::setMaxThreads(size_t pool, size_t value)
{
    if (value == 0)
-        value = getNumberOfPhysicalCPUCores();
+        value = getNumberOfCPUCoresToUse();

    std::unique_lock lock{mutex};
    auto & p = pools[pool];
    // Note that underlying `ThreadPool` always has unlimited `queue_size` and `max_threads`.
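Both the AsyncLoader hunks above and the ThreadPool hunks below keep one convention alongside the rename: a requested thread count of zero means "auto" and resolves to getNumberOfCPUCoresToUse(), computed once and cached. A minimal sketch of that convention (the helpers below are simplified stand-ins for illustration, not the actual implementation, which also weighs machine size and cgroup limits):

```cpp
#include <algorithm>
#include <thread>

// Simplified stand-in for getNumberOfCPUCoresToUse(): resolved once and cached
// in a static, like the renamed function in this PR (which additionally
// chooses between physical and logical cores and respects cgroup limits).
static unsigned numberOfCPUCoresToUse()
{
    static const unsigned cores = std::max(1u, std::thread::hardware_concurrency());
    return cores;
}

// The "0 means auto" rule used by AsyncLoader::Pool and ThreadPoolImpl:
// an explicit size is taken verbatim, zero falls back to the core count.
static size_t resolveMaxThreads(size_t requested)
{
    return requested > 0 ? requested : numberOfCPUCoresToUse();
}
```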
diff --git a/src/Common/ThreadPool.cpp b/src/Common/ThreadPool.cpp
index 8685533e2d1..92ad6c3466f 100644
--- a/src/Common/ThreadPool.cpp
+++ b/src/Common/ThreadPool.cpp
@@ -2,7 +2,7 @@
#include
#include
#include
-#include <Common/getNumberOfPhysicalCPUCores.h>
+#include <Common/getNumberOfCPUCoresToUse.h>
#include
#include

@@ -93,7 +93,7 @@ static constexpr auto DEFAULT_THREAD_NAME = "ThreadPool";

template <typename Thread>
ThreadPoolImpl<Thread>::ThreadPoolImpl(Metric metric_threads_, Metric metric_active_threads_, Metric metric_scheduled_jobs_)
-    : ThreadPoolImpl(metric_threads_, metric_active_threads_, metric_scheduled_jobs_, getNumberOfPhysicalCPUCores())
+    : ThreadPoolImpl(metric_threads_, metric_active_threads_, metric_scheduled_jobs_, getNumberOfCPUCoresToUse())
{
}

diff --git a/src/Common/getNumberOfPhysicalCPUCores.cpp b/src/Common/getNumberOfCPUCoresToUse.cpp
similarity index 97%
rename from src/Common/getNumberOfPhysicalCPUCores.cpp
rename to src/Common/getNumberOfCPUCoresToUse.cpp
index 34a1add2f0e..28e1e3598ea 100644
--- a/src/Common/getNumberOfPhysicalCPUCores.cpp
+++ b/src/Common/getNumberOfCPUCoresToUse.cpp
@@ -1,4 +1,4 @@
-#include "getNumberOfPhysicalCPUCores.h"
+#include "getNumberOfCPUCoresToUse.h"

#if defined(OS_LINUX)
#    include
@@ -165,7 +165,7 @@ catch (...)
}
#endif

-unsigned getNumberOfPhysicalCPUCoresImpl()
+unsigned getNumberOfCPUCoresToUseImpl()
{
    unsigned cores = std::thread::hardware_concurrency(); /// logical cores (with SMT/HyperThreading)

@@ -189,9 +189,9 @@ unsigned getNumberOfPhysicalCPUCoresImpl()
}


-unsigned getNumberOfPhysicalCPUCores()
+unsigned getNumberOfCPUCoresToUse()
{
    /// Calculate once.
-    static auto cores = getNumberOfPhysicalCPUCoresImpl();
+    static const unsigned cores = getNumberOfCPUCoresToUseImpl();
    return cores;
}
diff --git a/src/Common/getNumberOfCPUCoresToUse.h b/src/Common/getNumberOfCPUCoresToUse.h
new file mode 100644
index 00000000000..e1740032858
--- /dev/null
+++ b/src/Common/getNumberOfCPUCoresToUse.h
@@ -0,0 +1,6 @@
+#pragma once
+
+/// Get the number of CPU cores to use. Depending on the machine size, we choose
+/// between the number of physical and logical cores.
+/// Also, under cgroups we respect possible cgroup limits.
+unsigned getNumberOfCPUCoresToUse();
diff --git a/src/Common/getNumberOfPhysicalCPUCores.h b/src/Common/getNumberOfPhysicalCPUCores.h
deleted file mode 100644
index 9e3412fdcba..00000000000
--- a/src/Common/getNumberOfPhysicalCPUCores.h
+++ /dev/null
@@ -1,5 +0,0 @@
-#pragma once
-
-/// Get number of CPU cores without hyper-threading.
-/// The calculation respects possible cgroups limits.
-unsigned getNumberOfPhysicalCPUCores();
diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp
index f54e5b8ca9d..b005ecf5e1e 100644
--- a/src/Coordination/KeeperServer.cpp
+++ b/src/Coordination/KeeperServer.cpp
@@ -26,7 +26,7 @@
#include
#include
#include
-#include <Common/getNumberOfPhysicalCPUCores.h>
+#include <Common/getNumberOfCPUCoresToUse.h>

#if USE_SSL
#    include
@@ -444,7 +444,7 @@ void KeeperServer::launchRaftServer(const Poco::Util::AbstractConfiguration & co
    /// At least 16 threads for network communication in asio.
    /// asio is an async framework, so even with 1 thread it should be ok, but
    /// still, as a safeguard, it's better to have some redundant capacity here
-    asio_opts.thread_pool_size_ = std::max(16U, getNumberOfPhysicalCPUCores());
+    asio_opts.thread_pool_size_ = std::max(16U, getNumberOfCPUCoresToUse());

    if (state_manager->isSecure())
    {
diff --git a/src/Core/Block.cpp b/src/Core/Block.cpp
index d560cb2c105..c7e0e9b7b37 100644
--- a/src/Core/Block.cpp
+++ b/src/Core/Block.cpp
@@ -818,6 +818,23 @@ Serializations Block::getSerializations() const
    return res;
}

+Serializations Block::getSerializations(const SerializationInfoByName & hints) const
+{
+    Serializations res;
+    res.reserve(data.size());
+
+    for (const auto & column : data)
+    {
+        auto it = hints.find(column.name);
+        if (it == hints.end())
+            res.push_back(column.type->getDefaultSerialization());
+        else
+            res.push_back(column.type->getSerialization(*it->second));
+    }
+
+    return res;
+}
+
void convertToFullIfSparse(Block & block)
{
    for (auto & column : block)
diff --git a/src/Core/Block.h b/src/Core/Block.h
index d998581a50f..841fb3fb663 100644
--- a/src/Core/Block.h
+++ b/src/Core/Block.h
@@ -10,6 +10,7 @@
#include
#include
#include
+#include <DataTypes/Serializations/SerializationInfo.h>


namespace DB
@@ -99,6 +100,7 @@ public:
    NameMap getNamesToIndexesMap() const;

    Serializations getSerializations() const;
+    Serializations getSerializations(const SerializationInfoByName & hints) const;

    /// Returns number of rows from first column in block, not equal to nullptr. If no columns, returns 0.
    size_t rows() const;
diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp
index dcd1d33ff27..c7d0bcaa7b7 100644
--- a/src/Core/Settings.cpp
+++ b/src/Core/Settings.cpp
@@ -240,6 +240,7 @@ namespace ErrorCodes
    M(Bool, output_format_parallel_formatting, true, "Enable parallel formatting for some data formats.", 0) \
    M(UInt64, output_format_compression_level, 3, "Default compression level if query output is compressed. The setting is applied when `SELECT` query has `INTO OUTFILE` or when inserting to table function `file`, `url`, `hdfs`, `s3`, and `azureBlobStorage`.", 0) \
    M(UInt64, output_format_compression_zstd_window_log, 0, "Can be used when the output compression method is `zstd`. If greater than `0`, this setting explicitly sets compression window size (power of `2`) and enables a long-range mode for zstd compression.", 0) \
+    M(Bool, enable_parsing_to_custom_serialization, true, "If true, data can be parsed directly to columns with custom serialization (e.g. Sparse) according to serialization hints obtained from the table.", 0) \
    \
    M(UInt64, merge_tree_min_rows_for_concurrent_read, (20 * 8192), "If at least as many lines are read from one file, the reading can be parallelized.", 0) \
    M(UInt64, merge_tree_min_bytes_for_concurrent_read, (24 * 10 * 1024 * 1024), "If at least as many bytes are read from one file, the reading can be parallelized.", 0) \
@@ -916,6 +917,7 @@ namespace ErrorCodes
    M(UInt64, extract_key_value_pairs_max_pairs_per_row, 1000, "Max number of pairs that can be produced by the `extractKeyValuePairs` function. Used as a safeguard against consuming too much memory.", 0) ALIAS(extract_kvp_max_pairs_per_row) \
    M(Bool, restore_replace_external_engines_to_null, false, "Replace all the external table engines to Null on restore. Useful for testing purposes", 0) \
    M(Bool, restore_replace_external_table_functions_to_null, false, "Replace all table functions to Null on restore. Useful for testing purposes", 0) \
+    M(Bool, restore_replace_external_dictionary_source_to_null, false, "Replace external dictionary sources to Null on restore. Useful for testing purposes", 0) \
    M(Bool, create_if_not_exists, false, "Enable IF NOT EXISTS for CREATE statements by default", 0) \
    M(Bool, mongodb_throw_on_unsupported_query, true, "If enabled, MongoDB tables will return an error when a MongoDB query cannot be built. Otherwise, ClickHouse reads the full table and processes it locally. This option does not apply to the legacy implementation or when 'allow_experimental_analyzer=0'.", 0) \
    \
diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp
index ad40b280dc4..d1f90f378e6 100644
--- a/src/Core/SettingsChangesHistory.cpp
+++ b/src/Core/SettingsChangesHistory.cpp
@@ -67,10 +67,12 @@ static std::initializer_list
#include
#include
#include
-#include <Common/getNumberOfPhysicalCPUCores.h>
+#include <Common/getNumberOfCPUCoresToUse.h>
#include
#include
#include
@@ -262,7 +262,7 @@ void SettingFieldMaxThreads::readBinary(ReadBuffer & in)

UInt64 SettingFieldMaxThreads::getAuto()
{
-    return getNumberOfPhysicalCPUCores();
+    return getNumberOfCPUCoresToUse();
}

namespace
diff --git a/src/Core/SettingsQuirks.cpp b/src/Core/SettingsQuirks.cpp
index 47e786409b3..bfd63685363 100644
--- a/src/Core/SettingsQuirks.cpp
+++ b/src/Core/SettingsQuirks.cpp
@@ -4,7 +4,7 @@
#include
#include
#include
-#include <Common/getNumberOfPhysicalCPUCores.h>
+#include <Common/getNumberOfCPUCoresToUse.h>
#include

@@ -110,7 +110,7 @@ void doSettingsSanityCheckClamp(Settings & current_settings, LoggerPtr log)
    };

    UInt64 max_threads = get_current_value("max_threads").safeGet<UInt64>();
-    UInt64 max_threads_max_value = 256 * getNumberOfPhysicalCPUCores();
+    UInt64 max_threads_max_value = 256 * getNumberOfCPUCoresToUse();
    if (max_threads > max_threads_max_value)
    {
        if (log)
diff --git a/src/Daemon/SentryWriter.cpp b/src/Daemon/SentryWriter.cpp
index c51a1100639..154b99ad541 100644
--- a/src/Daemon/SentryWriter.cpp
+++ b/src/Daemon/SentryWriter.cpp
@@ -12,7 +12,7 @@
#include
#include
#include
-#include <Common/getNumberOfPhysicalCPUCores.h>
+#include <Common/getNumberOfCPUCoresToUse.h>
#include
#include

@@ -54,7 +54,7 @@ void setExtras(bool anonymize, const std::string & server_data_path)
    /// Sentry does not support 64-bit integers.
sentry_set_extra("total_ram", sentry_value_new_string(formatReadableSizeWithBinarySuffix(getMemoryAmountOrZero()).c_str())); - sentry_set_extra("physical_cpu_cores", sentry_value_new_int32(getNumberOfPhysicalCPUCores())); + sentry_set_extra("cpu_cores", sentry_value_new_int32(getNumberOfCPUCoresToUse())); if (!server_data_path.empty()) sentry_set_extra("disk_free_space", sentry_value_new_string(formatReadableSizeWithBinarySuffix(fs::space(server_data_path).free).c_str())); diff --git a/src/DataTypes/Serializations/SerializationInfo.cpp b/src/DataTypes/Serializations/SerializationInfo.cpp index 7d5c456af7f..df9d27d4ca2 100644 --- a/src/DataTypes/Serializations/SerializationInfo.cpp +++ b/src/DataTypes/Serializations/SerializationInfo.cpp @@ -47,6 +47,12 @@ void SerializationInfo::Data::add(const Data & other) num_defaults += other.num_defaults; } +void SerializationInfo::Data::remove(const Data & other) +{ + num_rows -= other.num_rows; + num_defaults -= other.num_defaults; +} + void SerializationInfo::Data::addDefaults(size_t length) { num_rows += length; @@ -80,6 +86,14 @@ void SerializationInfo::add(const SerializationInfo & other) kind = chooseKind(data, settings); } +void SerializationInfo::remove(const SerializationInfo & other) +{ + data.remove(other.data); + if (settings.choose_kind) + kind = chooseKind(data, settings); +} + + void SerializationInfo::addDefaults(size_t length) { data.addDefaults(length); @@ -202,13 +216,37 @@ void SerializationInfoByName::add(const Block & block) void SerializationInfoByName::add(const SerializationInfoByName & other) { for (const auto & [name, info] : other) - { - auto it = find(name); - if (it == end()) - continue; + add(name, *info); +} - it->second->add(*info); - } +void SerializationInfoByName::add(const String & name, const SerializationInfo & info) +{ + if (auto it = find(name); it != end()) + it->second->add(info); +} + +void SerializationInfoByName::remove(const SerializationInfoByName & other) +{ + for (const auto & [name, info] : other) + remove(name, *info); +} + +void SerializationInfoByName::remove(const String & name, const SerializationInfo & info) +{ + if (auto it = find(name); it != end()) + it->second->remove(info); +} + +SerializationInfoPtr SerializationInfoByName::tryGet(const String & name) const +{ + auto it = find(name); + return it == end() ? nullptr : it->second; +} + +MutableSerializationInfoPtr SerializationInfoByName::tryGet(const String & name) +{ + auto it = find(name); + return it == end() ? nullptr : it->second; } void SerializationInfoByName::replaceData(const SerializationInfoByName & other) @@ -224,6 +262,12 @@ void SerializationInfoByName::replaceData(const SerializationInfoByName & other) } } +ISerialization::Kind SerializationInfoByName::getKind(const String & column_name) const +{ + auto it = find(column_name); + return it != end() ? 
it->second->getKind() : ISerialization::Kind::DEFAULT;
+}
+
void SerializationInfoByName::writeJSON(WriteBuffer & out) const
{
    Poco::JSON::Object object;
diff --git a/src/DataTypes/Serializations/SerializationInfo.h b/src/DataTypes/Serializations/SerializationInfo.h
index 5a900a5521c..c30e50ab12c 100644
--- a/src/DataTypes/Serializations/SerializationInfo.h
+++ b/src/DataTypes/Serializations/SerializationInfo.h
@@ -39,6 +39,7 @@ public:

        void add(const IColumn & column);
        void add(const Data & other);
+        void remove(const Data & other);
        void addDefaults(size_t length);
    };

@@ -52,6 +53,7 @@ public:

    virtual void add(const IColumn & column);
    virtual void add(const SerializationInfo & other);
+    virtual void remove(const SerializationInfo & other);
    virtual void addDefaults(size_t length);
    virtual void replaceData(const SerializationInfo & other);

@@ -99,6 +101,14 @@ public:
    void add(const Block & block);
    void add(const SerializationInfoByName & other);
+    void add(const String & name, const SerializationInfo & info);
+
+    void remove(const SerializationInfoByName & other);
+    void remove(const String & name, const SerializationInfo & info);
+
+    SerializationInfoPtr tryGet(const String & name) const;
+    MutableSerializationInfoPtr tryGet(const String & name);
+    ISerialization::Kind getKind(const String & column_name) const;

    /// Takes data from @other, but keeps current serialization kinds.
    /// If column exists in @other infos, but not in current infos,
diff --git a/src/DataTypes/Serializations/SerializationInfoTuple.cpp b/src/DataTypes/Serializations/SerializationInfoTuple.cpp
index cd65b865248..b7449be3cc5 100644
--- a/src/DataTypes/Serializations/SerializationInfoTuple.cpp
+++ b/src/DataTypes/Serializations/SerializationInfoTuple.cpp
@@ -10,6 +10,7 @@ namespace ErrorCodes
{
    extern const int CORRUPTED_DATA;
    extern const int THERE_IS_NO_COLUMN;
+    extern const int NOT_IMPLEMENTED;
}

SerializationInfoTuple::SerializationInfoTuple(
@@ -68,6 +69,19 @@ void SerializationInfoTuple::add(const SerializationInfo & other)
    }
}

+void SerializationInfoTuple::remove(const SerializationInfo & other)
+{
+    if (!structureEquals(other))
+        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot remove serialization info with a different structure");
+
+    SerializationInfo::remove(other);
+    const auto & other_elems = assert_cast<const SerializationInfoTuple &>(other).elems;
+    chassert(elems.size() == other_elems.size());
+
+    for (size_t i = 0; i < elems.size(); ++i)
+        elems[i]->remove(*other_elems[i]);
+}
+
void SerializationInfoTuple::addDefaults(size_t length)
{
    SerializationInfo::addDefaults(length);
diff --git a/src/DataTypes/Serializations/SerializationInfoTuple.h b/src/DataTypes/Serializations/SerializationInfoTuple.h
index a9f3bdb6c6e..a6b9c89166f 100644
--- a/src/DataTypes/Serializations/SerializationInfoTuple.h
+++ b/src/DataTypes/Serializations/SerializationInfoTuple.h
@@ -15,6 +15,7 @@ public:

    void add(const IColumn & column) override;
    void add(const SerializationInfo & other) override;
+    void remove(const SerializationInfo & other) override;
    void addDefaults(size_t length) override;
    void replaceData(const SerializationInfo & other) override;

diff --git a/src/DataTypes/Serializations/SerializationSparse.cpp b/src/DataTypes/Serializations/SerializationSparse.cpp
index 73488d308bb..327d1f23cca 100644
--- a/src/DataTypes/Serializations/SerializationSparse.cpp
+++ b/src/DataTypes/Serializations/SerializationSparse.cpp
@@ -13,7 +13,6 @@ namespace DB

namespace ErrorCodes
{
-    extern const int NOT_IMPLEMENTED;
    extern const int LOGICAL_ERROR;
}

@@ -313,15 +312,35 @@ void SerializationSparse::deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const
+312,35 @@ void SerializationSparse::deserializeBinary(Field & field, ReadBuffer & istr, co nested->deserializeBinary(field, istr, settings); } +template +void SerializationSparse::deserialize(IColumn & column, Reader && reader) const +{ + auto & column_sparse = assert_cast(column); + auto & values = column_sparse.getValuesColumn(); + size_t old_size = column_sparse.size(); + + /// It just increments the size of column. + column_sparse.insertDefault(); + reader(column_sparse.getValuesColumn()); + + if (values.isDefaultAt(values.size() - 1)) + values.popBack(1); + else + column_sparse.getOffsetsData().push_back(old_size); +} + void SerializationSparse::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { const auto & column_sparse = assert_cast(column); nested->serializeBinary(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings); } -void SerializationSparse::deserializeBinary(IColumn &, ReadBuffer &, const FormatSettings &) const +void SerializationSparse::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'deserializeBinary' is not implemented for SerializationSparse"); + deserialize(column, [&](auto & nested_column) + { + nested->deserializeBinary(nested_column, istr, settings); + }); } void SerializationSparse::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const @@ -330,9 +349,12 @@ void SerializationSparse::serializeTextEscaped(const IColumn & column, size_t ro nested->serializeTextEscaped(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings); } -void SerializationSparse::deserializeTextEscaped(IColumn &, ReadBuffer &, const FormatSettings &) const +void SerializationSparse::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'deserializeTextEscaped' is not implemented for SerializationSparse"); + deserialize(column, [&](auto & nested_column) + { + nested->deserializeTextEscaped(nested_column, istr, settings); + }); } void SerializationSparse::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const @@ -341,9 +363,12 @@ void SerializationSparse::serializeTextQuoted(const IColumn & column, size_t row nested->serializeTextQuoted(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings); } -void SerializationSparse::deserializeTextQuoted(IColumn &, ReadBuffer &, const FormatSettings &) const +void SerializationSparse::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'deserializeTextQuoted' is not implemented for SerializationSparse"); + deserialize(column, [&](auto & nested_column) + { + nested->deserializeTextQuoted(nested_column, istr, settings); + }); } void SerializationSparse::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const @@ -352,9 +377,12 @@ void SerializationSparse::serializeTextCSV(const IColumn & column, size_t row_nu nested->serializeTextCSV(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings); } -void SerializationSparse::deserializeTextCSV(IColumn &, ReadBuffer &, const FormatSettings &) 
const +void SerializationSparse::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'deserializeTextCSV' is not implemented for SerializationSparse"); + deserialize(column, [&](auto & nested_column) + { + nested->deserializeTextCSV(nested_column, istr, settings); + }); } void SerializationSparse::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const @@ -363,9 +391,12 @@ void SerializationSparse::serializeText(const IColumn & column, size_t row_num, nested->serializeText(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings); } -void SerializationSparse::deserializeWholeText(IColumn &, ReadBuffer &, const FormatSettings &) const +void SerializationSparse::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'deserializeWholeText' is not implemented for SerializationSparse"); + deserialize(column, [&](auto & nested_column) + { + nested->deserializeWholeText(nested_column, istr, settings); + }); } void SerializationSparse::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const @@ -374,9 +405,12 @@ void SerializationSparse::serializeTextJSON(const IColumn & column, size_t row_n nested->serializeTextJSON(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings); } -void SerializationSparse::deserializeTextJSON(IColumn &, ReadBuffer &, const FormatSettings &) const +void SerializationSparse::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'deserializeTextJSON' is not implemented for SerializationSparse"); + deserialize(column, [&](auto & nested_column) + { + nested->deserializeTextJSON(nested_column, istr, settings); + }); } void SerializationSparse::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const diff --git a/src/DataTypes/Serializations/SerializationSparse.h b/src/DataTypes/Serializations/SerializationSparse.h index a55856bacf0..b31a006639b 100644 --- a/src/DataTypes/Serializations/SerializationSparse.h +++ b/src/DataTypes/Serializations/SerializationSparse.h @@ -99,6 +99,9 @@ private: ColumnPtr create(const ColumnPtr & prev) const override; }; + template + void deserialize(IColumn & column, Reader && reader) const; + SerializationPtr nested; }; diff --git a/src/Functions/getSetting.cpp b/src/Functions/getSetting.cpp index aed6b2119e4..0ec2a7b11f3 100644 --- a/src/Functions/getSetting.cpp +++ b/src/Functions/getSetting.cpp @@ -19,11 +19,18 @@ namespace ErrorCodes namespace { +enum class ErrorHandlingMode : uint8_t +{ + Exception, /// Raise exception if setting not found (getSetting()) + Default, /// Return default value if setting not found (getSettingOrDefault()) +}; + /// Get the value of a setting. +template class FunctionGetSetting : public IFunction, WithContext { public: - static constexpr auto name = "getSetting"; + static constexpr auto name = (mode == ErrorHandlingMode::Exception) ? 
"getSetting" : "getSettingOrDefault"; static FunctionPtr create(ContextPtr context_) { return std::make_shared(context_); } explicit FunctionGetSetting(ContextPtr context_) : WithContext(context_) {} @@ -31,8 +38,8 @@ public: String getName() const override { return name; } bool isDeterministic() const override { return false; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } - size_t getNumberOfArguments() const override { return 1; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0}; } + size_t getNumberOfArguments() const override { return (mode == ErrorHandlingMode::Default) ? 2 : 1 ; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1}; } DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { @@ -60,7 +67,21 @@ private: String{name}); std::string_view setting_name{column->getDataAt(0).toView()}; - return getContext()->getSettingsRef().get(setting_name); + Field setting_value; + if constexpr (mode == ErrorHandlingMode::Exception) + setting_value = getContext()->getSettingsRef().get(setting_name); + else + { + const auto * default_value_column = arguments[1].column.get(); + if (!default_value_column || !(isColumnConst(*default_value_column))) + { + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "The 2nd argument of function {} should be a constant with the default value of a setting", String{name}); + } + if (!getContext()->getSettingsRef().tryGet(setting_name, setting_value)) + setting_value = (*default_value_column)[0]; + } + return setting_value; } }; @@ -68,7 +89,35 @@ private: REGISTER_FUNCTION(GetSetting) { - factory.registerFunction(); + factory.registerFunction>(FunctionDocumentation{ + .description = R"( +Returns the current value of a custom setting. +)", + .syntax = "getSetting('custom_setting')", + .arguments = { + {"custom_setting", "The setting name. Type: String."} + }, + .returned_value = "The setting's current value.", + .examples = { + {"getSetting", "SET custom_a = 123; SELECT getSetting('custom_a');", "123"}, + }, + .categories{"Other"}}, FunctionFactory::Case::Sensitive); + factory.registerFunction>(FunctionDocumentation{ + .description = R"( +Returns the current value of a custom setting or returns the default value specified in the 2nd argument if the custom setting is not set in the current profile. +)", + .syntax = "getSettingOrDefault('custom_setting', default_value)", + .arguments = { + {"custom_setting", "The setting name. Type: String."}, + {"default_value", "Value to return if custom_setting is not set. 
Value may be of any data type or Null."}, + }, + .returned_value = "The setting's current value or the default_value if setting is not set.", + .examples = { + {"getSettingOrDefault", "SELECT getSettingOrDefault('custom_undef1', 'my_value');", "my_value"}, + {"getSettingOrDefault", "SELECT getSettingOrDefault('custom_undef1', 100);", "100"}, + {"getSettingOrDefault", "SELECT getSettingOrDefault('custom_undef1', NULL);", "NULL"}, + }, + .categories{"Other"}}, FunctionFactory::Case::Sensitive); } } diff --git a/src/Functions/materialize.cpp b/src/Functions/materialize.cpp index 5cef610b60a..e8a43dfc820 100644 --- a/src/Functions/materialize.cpp +++ b/src/Functions/materialize.cpp @@ -7,7 +7,7 @@ namespace DB REGISTER_FUNCTION(Materialize) { - factory.registerFunction(); + factory.registerFunction>(); } } diff --git a/src/Functions/materialize.h b/src/Functions/materialize.h index 571391faba7..ac4a01d875e 100644 --- a/src/Functions/materialize.h +++ b/src/Functions/materialize.h @@ -9,13 +9,14 @@ namespace DB /** materialize(x) - materialize the constant */ +template class FunctionMaterialize : public IFunction { public: static constexpr auto name = "materialize"; static FunctionPtr create(ContextPtr) { - return std::make_shared(); + return std::make_shared>(); } /// Get the function name. @@ -55,7 +56,10 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override { - return recursiveRemoveSparse(arguments[0].column->convertToFullColumnIfConst()); + auto res = arguments[0].column->convertToFullColumnIfConst(); + if constexpr (remove_sparse) + res = recursiveRemoveSparse(res); + return res; } bool hasInformationAboutMonotonicity() const override { return true; } diff --git a/src/Functions/toStartOfInterval.cpp b/src/Functions/toStartOfInterval.cpp index da89b250098..bc2a3e506f7 100644 --- a/src/Functions/toStartOfInterval.cpp +++ b/src/Functions/toStartOfInterval.cpp @@ -226,8 +226,17 @@ public: if (overload == Overload::Origin) origin_column = arguments[2]; - const size_t time_zone_arg_num = (overload == Overload::Default) ? 2 : 3; - const auto & time_zone = extractTimeZoneFromFunctionArguments(arguments, time_zone_arg_num, 0); + const DateLUTImpl * time_zone_tmp; + + if (isDateTimeOrDateTime64(time_column.type) || isDateTimeOrDateTime64(result_type)) + { + const size_t time_zone_arg_num = (overload == Overload::Default) ? 
2 : 3; + time_zone_tmp = &extractTimeZoneFromFunctionArguments(arguments, time_zone_arg_num, 0); + } + else /// As we convert date to datetime and perform calculation, we don't need to take the timezone into account, so we set it to default + time_zone_tmp = &DateLUT::instance("UTC"); + + const DateLUTImpl & time_zone = *time_zone_tmp; ColumnPtr result_column; if (isDate(result_type)) diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index 3bf79dc173f..e536ca9c0c6 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -1433,16 +1433,21 @@ bool ActionsDAG::hasNonDeterministic() const return false; } -void ActionsDAG::addMaterializingOutputActions() +void ActionsDAG::addMaterializingOutputActions(bool materialize_sparse) { for (auto & output_node : outputs) - output_node = &materializeNode(*output_node); + output_node = &materializeNode(*output_node, materialize_sparse); } -const ActionsDAG::Node & ActionsDAG::materializeNode(const Node & node) +const ActionsDAG::Node & ActionsDAG::materializeNode(const Node & node, bool materialize_sparse) { - FunctionOverloadResolverPtr func_builder_materialize - = std::make_unique(std::make_shared()); + FunctionPtr func_materialize; + if (materialize_sparse) + func_materialize = std::make_shared>(); + else + func_materialize = std::make_shared>(); + + FunctionOverloadResolverPtr func_builder_materialize = std::make_unique(std::move(func_materialize)); const auto & name = node.result_name; const auto * func = &addFunction(func_builder_materialize, {&node}, {}); @@ -1469,7 +1474,7 @@ ActionsDAG ActionsDAG::makeConvertingActions( ActionsDAG actions_dag(source); NodeRawConstPtrs projection(num_result_columns); - FunctionOverloadResolverPtr func_builder_materialize = std::make_unique(std::make_shared()); + FunctionOverloadResolverPtr func_builder_materialize = std::make_unique(std::make_shared>()); std::unordered_map> inputs; if (mode == MatchColumnsMode::Name) @@ -1596,7 +1601,7 @@ ActionsDAG ActionsDAG::makeAddingColumnActions(ColumnWithTypeAndName column) { ActionsDAG adding_column_action; FunctionOverloadResolverPtr func_builder_materialize - = std::make_unique(std::make_shared()); + = std::make_unique(std::make_shared>()); auto column_name = column.name; const auto * column_node = &adding_column_action.addColumn(std::move(column)); diff --git a/src/Interpreters/ActionsDAG.h b/src/Interpreters/ActionsDAG.h index f948e26e32e..ac92d99bfa8 100644 --- a/src/Interpreters/ActionsDAG.h +++ b/src/Interpreters/ActionsDAG.h @@ -282,14 +282,13 @@ public: /// For apply materialize() function for every output. /// Also add aliases so the result names remain unchanged. - void addMaterializingOutputActions(); + void addMaterializingOutputActions(bool materialize_sparse); /// Apply materialize() function to node. Result node has the same name. - const Node & materializeNode(const Node & node); + const Node & materializeNode(const Node & node, bool materialize_sparse = true); enum class MatchColumnsMode : uint8_t { - /// Require same number of columns in source and result. Match columns by corresponding positions, regardless to names. Position, /// Find columns in source by their names. Allow excessive columns in source. 
Name, diff --git a/src/Interpreters/BloomFilterHash.h b/src/Interpreters/BloomFilterHash.h index 8248e9e4469..49450b5932b 100644 --- a/src/Interpreters/BloomFilterHash.h +++ b/src/Interpreters/BloomFilterHash.h @@ -171,7 +171,7 @@ struct BloomFilterHash const auto * index_column = typeid_cast *>(column); if (unlikely(!index_column)) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column type was passed to the bloom filter index."); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} was passed to the bloom filter index", column->getName()); const typename ColumnVector::Container & vec_from = index_column->getData(); diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 196c43798a7..3caebeb0ea5 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include @@ -3376,10 +3376,13 @@ size_t Context::getPrefetchThreadpoolSize() const ThreadPool & Context::getBuildVectorSimilarityIndexThreadPool() const { - callOnce(shared->build_vector_similarity_index_threadpool_initialized, [&] { + callOnce( + shared->build_vector_similarity_index_threadpool_initialized, + [&] + { size_t pool_size = shared->server_settings.max_build_vector_similarity_index_thread_pool_size > 0 ? shared->server_settings.max_build_vector_similarity_index_thread_pool_size - : getNumberOfPhysicalCPUCores(); + : getNumberOfCPUCoresToUse(); shared->build_vector_similarity_index_threadpool = std::make_unique( CurrentMetrics::BuildVectorSimilarityIndexThreads, CurrentMetrics::BuildVectorSimilarityIndexThreadsActive, diff --git a/src/Interpreters/FilesystemCacheLog.cpp b/src/Interpreters/FilesystemCacheLog.cpp index 90756f1c84a..9f744a5f366 100644 --- a/src/Interpreters/FilesystemCacheLog.cpp +++ b/src/Interpreters/FilesystemCacheLog.cpp @@ -25,6 +25,8 @@ ColumnsDescription FilesystemCacheLogElement::getColumnsDescription() std::make_shared>(), }; + auto low_cardinality_string = std::make_shared(std::make_shared()); + return ColumnsDescription { {"hostname", std::make_shared(std::make_shared()), "Hostname"}, @@ -39,7 +41,7 @@ ColumnsDescription FilesystemCacheLogElement::getColumnsDescription() {"size", std::make_shared(), "Read size"}, {"read_type", std::make_shared(), "Read type: READ_FROM_CACHE, READ_FROM_FS_AND_DOWNLOADED_TO_CACHE, READ_FROM_FS_BYPASSING_CACHE"}, {"read_from_cache_attempted", std::make_shared(), "Whether reading from cache was attempted"}, - {"ProfileEvents", std::make_shared(std::make_shared(), std::make_shared()), "Profile events collected while reading this file segment"}, + {"ProfileEvents", std::make_shared(low_cardinality_string, std::make_shared()), "Profile events collected while reading this file segment"}, {"read_buffer_id", std::make_shared(), "Internal implementation read buffer id"}, }; } diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 9e3f38cc4a9..7267cd274ad 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -76,6 +76,8 @@ #include #include +#include + #include #include @@ -137,6 +139,7 @@ namespace Setting extern const SettingsUInt64 max_parser_depth; extern const SettingsBool restore_replace_external_engines_to_null; extern const SettingsBool restore_replace_external_table_functions_to_null; + extern const SettingsBool restore_replace_external_dictionary_source_to_null; } namespace ErrorCodes @@ -1155,6 +1158,22 @@ namespace 
storage.set(storage.engine, engine_ast); } + void setNullDictionarySourceIfExternal(ASTCreateQuery & create_query) + { + ASTDictionary & dict = *create_query.dictionary; + if (Poco::toLower(dict.source->name) == "clickhouse") + { + auto config = getDictionaryConfigurationFromAST(create_query, Context::getGlobalContextInstance()); + auto info = getInfoIfClickHouseDictionarySource(config, Context::getGlobalContextInstance()); + if (info && info->is_local) + return; + } + auto source_ast = std::make_shared(); + source_ast->name = "null"; + source_ast->elements = std::make_shared(); + source_ast->children.push_back(source_ast->elements); + dict.set(dict.source, source_ast); + } } void InterpreterCreateQuery::setEngine(ASTCreateQuery & create) const @@ -1181,6 +1200,9 @@ void InterpreterCreateQuery::setEngine(ASTCreateQuery & create) const return; } + if (create.is_dictionary && getContext()->getSettingsRef()[Setting::restore_replace_external_dictionary_source_to_null]) + setNullDictionarySourceIfExternal(create); + if (create.is_dictionary || create.is_ordinary_view || create.is_live_view || create.is_window_view) return; diff --git a/src/Interpreters/InterpreterInsertQuery.cpp b/src/Interpreters/InterpreterInsertQuery.cpp index baa7ef4f415..ef7d90d3ac1 100644 --- a/src/Interpreters/InterpreterInsertQuery.cpp +++ b/src/Interpreters/InterpreterInsertQuery.cpp @@ -71,6 +71,7 @@ namespace Setting extern const SettingsBool use_concurrency_control; extern const SettingsSeconds lock_acquire_timeout; extern const SettingsUInt64 parallel_distributed_insert_select; + extern const SettingsBool enable_parsing_to_custom_serialization; } namespace ErrorCodes @@ -563,11 +564,10 @@ QueryPipeline InterpreterInsertQuery::buildInsertSelectPipeline(ASTInsertQuery & return std::make_shared(in_header, actions); }); - /// We need to convert Sparse columns to full, because it's destination storage - /// may not support it or may have different settings for applying Sparse serialization. + /// We need to convert Sparse columns to full if the destination storage doesn't support them. 
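+    /// (MergeTree-family storages override supportsSparseSerialization() to return true,
+    /// so their inserts keep Sparse columns as-is; all other storages receive fully
+    /// materialized columns.)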
pipeline.addSimpleTransform([&](const Block & in_header) -> ProcessorPtr { - return std::make_shared(in_header); + return std::make_shared(in_header, !table->supportsSparseSerialization()); }); pipeline.addSimpleTransform([&](const Block & in_header) -> ProcessorPtr @@ -737,11 +737,14 @@ QueryPipeline InterpreterInsertQuery::buildInsertPipeline(ASTInsertQuery & query if (query.hasInlinedData() && !async_insert) { - /// can execute without additional data auto format = getInputFormatFromASTInsertQuery(query_ptr, true, query_sample_block, getContext(), nullptr); - for (auto && buffer : owned_buffers) + + for (auto & buffer : owned_buffers) format->addBuffer(std::move(buffer)); + if (settings[Setting::enable_parsing_to_custom_serialization]) + format->setSerializationHints(table->getSerializationHints()); + auto pipe = getSourceFromInputFormat(query_ptr, std::move(format), getContext(), nullptr); pipeline.complete(std::move(pipe)); } diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index b743095e6f6..8c9d9453d79 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include #include @@ -942,7 +942,7 @@ void InterpreterSystemQuery::restartReplicas(ContextMutablePtr system_context) if (replica_names.empty()) return; - size_t threads = std::min(static_cast(getNumberOfPhysicalCPUCores()), replica_names.size()); + size_t threads = std::min(static_cast(getNumberOfCPUCoresToUse()), replica_names.size()); LOG_DEBUG(log, "Will restart {} replicas using {} threads", replica_names.size(), threads); ThreadPool pool(CurrentMetrics::RestartReplicaThreads, CurrentMetrics::RestartReplicaThreadsActive, CurrentMetrics::RestartReplicaThreadsScheduled, threads); diff --git a/src/Interpreters/PartLog.cpp b/src/Interpreters/PartLog.cpp index db339375231..49c817586fa 100644 --- a/src/Interpreters/PartLog.cpp +++ b/src/Interpreters/PartLog.cpp @@ -90,6 +90,8 @@ ColumnsDescription PartLogElement::getColumnsDescription() } ); + auto low_cardinality_string = std::make_shared(std::make_shared()); + ColumnsWithTypeAndName columns_with_type_and_name; return ColumnsDescription @@ -142,7 +144,7 @@ ColumnsDescription PartLogElement::getColumnsDescription() {"error", std::make_shared(), "The error code of the occurred exception."}, {"exception", std::make_shared(), "Text message of the occurred error."}, - {"ProfileEvents", std::make_shared(std::make_shared(), std::make_shared()), "All the profile events captured during this operation."}, + {"ProfileEvents", std::make_shared(low_cardinality_string, std::make_shared()), "All the profile events captured during this operation."}, }; } diff --git a/src/Interpreters/QueryViewsLog.cpp b/src/Interpreters/QueryViewsLog.cpp index a5441363340..c417cf34c09 100644 --- a/src/Interpreters/QueryViewsLog.cpp +++ b/src/Interpreters/QueryViewsLog.cpp @@ -33,6 +33,8 @@ ColumnsDescription QueryViewsLogElement::getColumnsDescription() {"Live", static_cast(ViewType::LIVE)}, {"Window", static_cast(ViewType::WINDOW)}}); + auto low_cardinality_string = std::make_shared(std::make_shared()); + return ColumnsDescription { {"hostname", std::make_shared(std::make_shared()), "Hostname of the server executing the query."}, @@ -53,7 +55,7 @@ ColumnsDescription QueryViewsLogElement::getColumnsDescription() {"written_rows", std::make_shared(), "Number of written rows."}, {"written_bytes", std::make_shared(), "Number of written 
bytes."}, {"peak_memory_usage", std::make_shared(), "The maximum difference between the amount of allocated and freed memory in context of this view."}, - {"ProfileEvents", std::make_shared(std::make_shared(), std::make_shared()), "ProfileEvents that measure different metrics. The description of them could be found in the table system.events."}, + {"ProfileEvents", std::make_shared(low_cardinality_string, std::make_shared()), "ProfileEvents that measure different metrics. The description of them could be found in the table system.events."}, {"status", std::move(view_status_datatype), "Status of the view. Values: " "'QueryStart' = 1 — Successful start the view execution. Should not appear, " diff --git a/src/Interpreters/Squashing.cpp b/src/Interpreters/Squashing.cpp index c656a1a797b..8122800f882 100644 --- a/src/Interpreters/Squashing.cpp +++ b/src/Interpreters/Squashing.cpp @@ -1,8 +1,9 @@ #include #include -#include "Common/Logger.h" -#include "Common/logger_useful.h" #include +#include +#include +#include #include namespace DB @@ -116,7 +117,7 @@ Chunk Squashing::squash(std::vector && input_chunks, Chunk::ChunkInfoColl return result; } - std::vector mutable_columns = {}; + std::vector mutable_columns; size_t rows = 0; for (const Chunk & chunk : input_chunks) rows += chunk.getNumRows(); @@ -130,8 +131,11 @@ Chunk Squashing::squash(std::vector && input_chunks, Chunk::ChunkInfoColl } size_t num_columns = mutable_columns.size(); + /// Collect the list of source columns for each column. - std::vector source_columns_list(num_columns, Columns{}); + std::vector source_columns_list(num_columns); + std::vector have_same_serialization(num_columns, true); + for (size_t i = 0; i != num_columns; ++i) source_columns_list[i].reserve(input_chunks.size() - 1); @@ -139,11 +143,21 @@ Chunk Squashing::squash(std::vector && input_chunks, Chunk::ChunkInfoColl { auto columns = input_chunks[i].detachColumns(); for (size_t j = 0; j != num_columns; ++j) + { + have_same_serialization[j] &= ISerialization::getKind(*columns[j]) == ISerialization::getKind(*mutable_columns[j]); source_columns_list[j].emplace_back(std::move(columns[j])); + } } for (size_t i = 0; i != num_columns; ++i) { + if (!have_same_serialization[i]) + { + mutable_columns[i] = recursiveRemoveSparse(std::move(mutable_columns[i]))->assumeMutable(); + for (auto & column : source_columns_list[i]) + column = recursiveRemoveSparse(column); + } + /// We know all the data we will insert in advance and can make all necessary pre-allocations. mutable_columns[i]->prepareForSquashing(source_columns_list[i]); for (auto & source_column : source_columns_list[i]) diff --git a/src/Interpreters/addMissingDefaults.cpp b/src/Interpreters/addMissingDefaults.cpp index 27d79e86622..173478332f3 100644 --- a/src/Interpreters/addMissingDefaults.cpp +++ b/src/Interpreters/addMissingDefaults.cpp @@ -85,7 +85,7 @@ ActionsDAG addMissingDefaults( /// Removes unused columns and reorders result. 
actions.removeUnusedActions(required_columns.getNames(), false); - actions.addMaterializingOutputActions(); + actions.addMaterializingOutputActions(/*materialize_sparse=*/ false); return actions; } diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index e687a48cd3b..066de0c7d76 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -1259,7 +1259,6 @@ static std::tuple executeQueryImpl( { if (!interpreter->supportsTransactions()) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Transactions are not supported for this type of query ({})", ast->getID()); - } // InterpreterSelectQueryAnalyzer does not build QueryPlan in the constructor. diff --git a/src/Processors/Formats/IInputFormat.h b/src/Processors/Formats/IInputFormat.h index 713c1089d28..64b289170d2 100644 --- a/src/Processors/Formats/IInputFormat.h +++ b/src/Processors/Formats/IInputFormat.h @@ -58,6 +58,10 @@ public: /// parallel parsing before creating this parser. virtual void setRowsReadBefore(size_t /*rows*/) {} + /// Sets the serialization hints for the columns. It allows to create columns + /// in custom serializations (e.g. Sparse) for parsing and avoid extra conversion. + virtual void setSerializationHints(const SerializationInfoByName & /*hints*/) {} + void addBuffer(std::unique_ptr buffer) { owned_buffers.emplace_back(std::move(buffer)); } void setErrorsLogger(const InputFormatErrorsLoggerPtr & errors_logger_) { errors_logger = errors_logger_; } diff --git a/src/Processors/Formats/IRowInputFormat.cpp b/src/Processors/Formats/IRowInputFormat.cpp index 0b6c81923db..b8e8822e648 100644 --- a/src/Processors/Formats/IRowInputFormat.cpp +++ b/src/Processors/Formats/IRowInputFormat.cpp @@ -103,7 +103,10 @@ Chunk IRowInputFormat::read() const Block & header = getPort().getHeader(); size_t num_columns = header.columns(); - MutableColumns columns = header.cloneEmptyColumns(); + MutableColumns columns(num_columns); + + for (size_t i = 0; i < num_columns; ++i) + columns[i] = header.getByPosition(i).type->createColumn(*serializations[i]); block_missing_values.clear(); @@ -266,5 +269,10 @@ size_t IRowInputFormat::countRows(size_t) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method countRows is not implemented for input format {}", getName()); } +void IRowInputFormat::setSerializationHints(const SerializationInfoByName & hints) +{ + serializations = getPort().getHeader().getSerializations(hints); +} + } diff --git a/src/Processors/Formats/IRowInputFormat.h b/src/Processors/Formats/IRowInputFormat.h index f8796df8604..c6786f45ecb 100644 --- a/src/Processors/Formats/IRowInputFormat.h +++ b/src/Processors/Formats/IRowInputFormat.h @@ -5,6 +5,7 @@ #include #include #include +#include class Stopwatch; @@ -84,6 +85,7 @@ protected: size_t getApproxBytesReadForChunk() const override { return approx_bytes_read_for_chunk; } void setRowsReadBefore(size_t rows) override { total_rows = rows; } + void setSerializationHints(const SerializationInfoByName & hints) override; Serializations serializations; diff --git a/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp b/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp index 447adb1ed48..faf6bf81869 100644 --- a/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp @@ -92,6 +92,7 @@ void ParallelParsingInputFormat::parserThreadFunction(ThreadGroupPtr thread_grou InputFormatPtr input_format = internal_parser_creator(read_buffer); 
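+    /// The per-unit parser gets the same configuration as the parallel wrapper,
+    /// including the serialization hints forwarded below, so each chunk can be
+    /// parsed directly into columns with the hinted serialization (e.g. Sparse).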
input_format->setRowsReadBefore(unit.offset); input_format->setErrorsLogger(errors_logger); + input_format->setSerializationHints(serialization_hints); InternalParser parser(input_format); unit.chunk_ext.chunk.clear(); diff --git a/src/Processors/Formats/Impl/ParallelParsingInputFormat.h b/src/Processors/Formats/Impl/ParallelParsingInputFormat.h index b97bf5213e6..e3753385ae8 100644 --- a/src/Processors/Formats/Impl/ParallelParsingInputFormat.h +++ b/src/Processors/Formats/Impl/ParallelParsingInputFormat.h @@ -129,6 +129,11 @@ public: return last_block_missing_values; } + void setSerializationHints(const SerializationInfoByName & hints) override + { + serialization_hints = hints; + } + size_t getApproxBytesReadForChunk() const override { return last_approx_bytes_read_for_chunk; } String getName() const final { return "ParallelParsingBlockInputFormat"; } @@ -207,6 +212,7 @@ private: BlockMissingValues last_block_missing_values; size_t last_approx_bytes_read_for_chunk = 0; + SerializationInfoByName serialization_hints; /// Non-atomic because it is used in one thread. std::optional next_block_in_current_unit; diff --git a/src/Processors/Transforms/MaterializingTransform.cpp b/src/Processors/Transforms/MaterializingTransform.cpp index 9ae80e21a68..771718e5ced 100644 --- a/src/Processors/Transforms/MaterializingTransform.cpp +++ b/src/Processors/Transforms/MaterializingTransform.cpp @@ -5,8 +5,11 @@ namespace DB { -MaterializingTransform::MaterializingTransform(const Block & header) - : ISimpleTransform(header, materializeBlock(header), false) {} +MaterializingTransform::MaterializingTransform(const Block & header, bool remove_sparse_) + : ISimpleTransform(header, materializeBlock(header), false) + , remove_sparse(remove_sparse_) +{ +} void MaterializingTransform::transform(Chunk & chunk) { @@ -14,7 +17,11 @@ void MaterializingTransform::transform(Chunk & chunk) auto columns = chunk.detachColumns(); for (auto & col : columns) - col = recursiveRemoveSparse(col->convertToFullColumnIfConst()); + { + col = col->convertToFullColumnIfConst(); + if (remove_sparse) + col = recursiveRemoveSparse(col); + } chunk.setColumns(std::move(columns), num_rows); } diff --git a/src/Processors/Transforms/MaterializingTransform.h b/src/Processors/Transforms/MaterializingTransform.h index 5ecd8522426..d384083a50d 100644 --- a/src/Processors/Transforms/MaterializingTransform.h +++ b/src/Processors/Transforms/MaterializingTransform.h @@ -8,12 +8,13 @@ namespace DB class MaterializingTransform : public ISimpleTransform { public: - explicit MaterializingTransform(const Block & header); + explicit MaterializingTransform(const Block & header, bool remove_sparse_ = true); String getName() const override { return "MaterializingTransform"; } protected: void transform(Chunk & chunk) override; + bool remove_sparse; }; } diff --git a/src/Processors/Transforms/getSourceFromASTInsertQuery.cpp b/src/Processors/Transforms/getSourceFromASTInsertQuery.cpp index 4bb3b88886e..1f6474da7d0 100644 --- a/src/Processors/Transforms/getSourceFromASTInsertQuery.cpp +++ b/src/Processors/Transforms/getSourceFromASTInsertQuery.cpp @@ -66,8 +66,7 @@ InputFormatPtr getInputFormatFromASTInsertQuery( : std::make_unique(); /// Create a source from input buffer using format from query - auto source - = context->getInputFormat(ast_insert_query->format, *input_buffer, header, context->getSettingsRef()[Setting::max_insert_block_size]); + auto source = context->getInputFormat(ast_insert_query->format, *input_buffer, header, 
context->getSettingsRef()[Setting::max_insert_block_size]); source->addBuffer(std::move(input_buffer)); return source; } diff --git a/src/Storages/FileLog/StorageFileLog.cpp b/src/Storages/FileLog/StorageFileLog.cpp index 9d93563d739..38e65766b52 100644 --- a/src/Storages/FileLog/StorageFileLog.cpp +++ b/src/Storages/FileLog/StorageFileLog.cpp @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include @@ -822,17 +822,17 @@ void registerStorageFileLog(StorageFactory & factory) filelog_settings->loadFromQuery(*args.storage_def); } - auto physical_cpu_cores = getNumberOfPhysicalCPUCores(); + auto cpu_cores = getNumberOfCPUCoresToUse(); auto num_threads = filelog_settings->max_threads.value; if (!num_threads) /// Default { - num_threads = std::max(1U, physical_cpu_cores / 4); + num_threads = std::max(1U, cpu_cores / 4); filelog_settings->set("max_threads", num_threads); } - else if (num_threads > physical_cpu_cores) + else if (num_threads > cpu_cores) { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Number of threads to parse files can not be bigger than {}", physical_cpu_cores); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Number of threads to parse files can not be bigger than {}", cpu_cores); } else if (num_threads < 1) { diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 07058dfb5df..0dc48634282 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -269,6 +270,9 @@ public: /// because those are internally translated into 'ALTER UDPATE' mutations. virtual bool supportsDelete() const { return false; } + /// Returns true if storage can store columns in sparse serialization. + virtual bool supportsSparseSerialization() const { return false; } + /// Return true if the trivial count query could be optimized without reading the data at all /// in totalRows() or totalRowsByPartitionPredicate() methods or with optimized reading in read() method. /// 'storage_snapshot' may be nullptr. @@ -277,6 +281,9 @@ public: return false; } + /// Returns hints for serialization of columns according to statistics accumulated by storage. 
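+    /// MergeTree-family storages override this with the statistics accumulated over
+    /// active parts (see MergeTreeData::getSerializationHints() later in this patch).
+    /// Illustrative use only (hypothetical wiring):
+    ///
+    ///     auto hints = storage->getSerializationHints();
+    ///     if (auto hint = hints.tryGet(column_name))
+    ///         is_sparse = hint->getKind() == ISerialization::Kind::SPARSE;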
+ virtual SerializationInfoByName getSerializationHints() const { return {}; } + private: StorageID storage_id; diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp index b5eb8d07f21..0be0f12a4f1 100644 --- a/src/Storages/Kafka/StorageKafka.cpp +++ b/src/Storages/Kafka/StorageKafka.cpp @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include diff --git a/src/Storages/Kafka/StorageKafka2.cpp b/src/Storages/Kafka/StorageKafka2.cpp index 6d167b0bb32..90cd09fdb92 100644 --- a/src/Storages/Kafka/StorageKafka2.cpp +++ b/src/Storages/Kafka/StorageKafka2.cpp @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include diff --git a/src/Storages/Kafka/StorageKafkaUtils.cpp b/src/Storages/Kafka/StorageKafkaUtils.cpp index e49626c986b..7c23d1ab2f6 100644 --- a/src/Storages/Kafka/StorageKafkaUtils.cpp +++ b/src/Storages/Kafka/StorageKafkaUtils.cpp @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include @@ -168,7 +168,7 @@ void registerStorageKafka(StorageFactory & factory) #undef CHECK_KAFKA_STORAGE_ARGUMENT auto num_consumers = kafka_settings->kafka_num_consumers.value; - auto max_consumers = std::max(getNumberOfPhysicalCPUCores(), 16); + auto max_consumers = std::max(getNumberOfCPUCoresToUse(), 16); if (!args.getLocalContext()->getSettingsRef()[Setting::kafka_disable_num_consumers_limit] && num_consumers > max_consumers) { diff --git a/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp b/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp index c87f66b64f3..4f42a7e9122 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp @@ -1,5 +1,6 @@ #include #include +#include namespace DB { @@ -11,13 +12,14 @@ namespace ErrorCodes } -Block getBlockAndPermute(const Block & block, const Names & names, const IColumn::Permutation * permutation) +Block getIndexBlockAndPermute(const Block & block, const Names & names, const IColumn::Permutation * permutation) { Block result; for (size_t i = 0, size = names.size(); i < size; ++i) { - const auto & name = names[i]; - result.insert(i, block.getByName(name)); + auto src_column = block.getByName(names[i]); + src_column.column = recursiveRemoveSparse(src_column.column); + result.insert(i, src_column); /// Reorder primary key columns in advance and add them to `primary_key_columns`. 
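+        /// Index calculation works on full columns, hence the recursiveRemoveSparse()
+        /// above before the column is inserted into the index block.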
if (permutation) diff --git a/src/Storages/MergeTree/IMergeTreeDataPartWriter.h b/src/Storages/MergeTree/IMergeTreeDataPartWriter.h index 2fdb0794789..eb51a1b2922 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPartWriter.h +++ b/src/Storages/MergeTree/IMergeTreeDataPartWriter.h @@ -16,7 +16,7 @@ namespace DB struct MergeTreeSettings; using MergeTreeSettingsPtr = std::shared_ptr; -Block getBlockAndPermute(const Block & block, const Names & names, const IColumn::Permutation * permutation); +Block getIndexBlockAndPermute(const Block & block, const Names & names, const IColumn::Permutation * permutation); Block permuteBlockIfNeeded(const Block & block, const IColumn::Permutation * permutation); diff --git a/src/Storages/MergeTree/IMergeTreeReader.cpp b/src/Storages/MergeTree/IMergeTreeReader.cpp index 1f46d6b8e1b..b2f18f08f41 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.cpp +++ b/src/Storages/MergeTree/IMergeTreeReader.cpp @@ -172,7 +172,7 @@ void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns if (dag) { - dag->addMaterializingOutputActions(); + dag->addMaterializingOutputActions(/*materialize_sparse=*/ false); auto actions = std::make_shared( std::move(*dag), ExpressionActionsSettings::fromSettings(data_part_info_for_read->getContext()->getSettingsRef())); diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index 503303443bd..f0447e71539 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -1008,7 +1008,7 @@ MergeTask::VerticalMergeRuntimeContext::PreparedColumnPipeline MergeTask::Vertic indexes_to_recalc = MergeTreeIndexFactory::instance().getMany(indexes_it->second); auto indices_expression_dag = indexes_it->second.getSingleExpressionForIndices(global_ctx->metadata_snapshot->getColumns(), global_ctx->data->getContext())->getActionsDAG().clone(); - indices_expression_dag.addMaterializingOutputActions(); /// Const columns cannot be written without materialization. + indices_expression_dag.addMaterializingOutputActions(/*materialize_sparse=*/ true); /// Const columns cannot be written without materialization. auto calculate_indices_expression_step = std::make_unique( merge_column_query_plan.getCurrentDataStream(), std::move(indices_expression_dag)); @@ -1730,7 +1730,7 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() const if (!global_ctx->merging_skip_indexes.empty()) { auto indices_expression_dag = global_ctx->merging_skip_indexes.getSingleExpressionForIndices(global_ctx->metadata_snapshot->getColumns(), global_ctx->data->getContext())->getActionsDAG().clone(); - indices_expression_dag.addMaterializingOutputActions(); /// Const columns cannot be written without materialization. + indices_expression_dag.addMaterializingOutputActions(/*materialize_sparse=*/ true); /// Const columns cannot be written without materialization. 
auto calculate_indices_expression_step = std::make_unique( merge_parts_query_plan.getCurrentDataStream(), std::move(indices_expression_dag)); diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 0292a78a83d..cff381a3429 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -1904,6 +1903,7 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks, std::optionalrenameToDetached("broken-on-start"); /// detached parts must not have '_' in prefixes resetObjectColumnsFromActiveParts(part_lock); + resetSerializationHints(part_lock); calculateColumnAndSecondaryIndexSizesImpl(); PartLoadingTreeNodes unloaded_parts; @@ -6908,6 +6909,8 @@ MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(DataPartsLock } } + data.updateSerializationHints(precommitted_parts, total_covered_parts, parts_lock); + if (reduce_parts == 0) { for (const auto & part : precommitted_parts) @@ -8571,6 +8574,66 @@ void MergeTreeData::updateObjectColumns(const DataPartPtr & part, const DataPart DB::updateObjectColumns(object_columns, columns, part->getColumns()); } +template +static void updateSerializationHintsForPart(const DataPartPtr & part, const ColumnsDescription & storage_columns, SerializationInfoByName & hints, bool remove) +{ + const auto & part_columns = part->getColumnsDescription(); + for (const auto & [name, info] : part->getSerializationInfos()) + { + auto new_hint = hints.tryGet(name); + if (!new_hint) + continue; + + /// Structure may change after alter. Do not add info for such items. + /// Instead it will be updated on commit of the result part of alter. + if (part_columns.tryGetPhysical(name) != storage_columns.tryGetPhysical(name)) + continue; + + chassert(new_hint->structureEquals(*info)); + if (remove) + new_hint->remove(*info); + else + new_hint->add(*info); + } +} + +void MergeTreeData::resetSerializationHints(const DataPartsLock & /*lock*/) +{ + SerializationInfo::Settings settings = + { + .ratio_of_defaults_for_sparse = getSettings()->ratio_of_defaults_for_sparse_serialization, + .choose_kind = true, + }; + + const auto & storage_columns = getInMemoryMetadataPtr()->getColumns(); + serialization_hints = SerializationInfoByName(storage_columns.getAllPhysical(), settings); + auto range = getDataPartsStateRange(DataPartState::Active); + + for (const auto & part : range) + updateSerializationHintsForPart(part, storage_columns, serialization_hints, false); +} + +template +void MergeTreeData::updateSerializationHints(const AddedParts & added_parts, const RemovedParts & removed_parts, const DataPartsLock & /*lock*/) +{ + const auto & storage_columns = getInMemoryMetadataPtr()->getColumns(); + + for (const auto & part : added_parts) + updateSerializationHintsForPart(part, storage_columns, serialization_hints, false); + + for (const auto & part : removed_parts) + updateSerializationHintsForPart(part, storage_columns, serialization_hints, true); +} + +SerializationInfoByName MergeTreeData::getSerializationHints() const +{ + auto lock = lockParts(); + SerializationInfoByName res; + for (const auto & [name, info] : serialization_hints) + res.emplace(name, info->clone()); + return res; +} + bool MergeTreeData::supportsTrivialCountOptimization(const StorageSnapshotPtr & storage_snapshot, ContextPtr query_context) const { if (hasLightweightDeletedMask()) diff --git a/src/Storages/MergeTree/MergeTreeData.h 
b/src/Storages/MergeTree/MergeTreeData.h index 5edd24db40d..7a9730e8627 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -441,6 +441,7 @@ public: bool supportsDynamicSubcolumnsDeprecated() const override { return true; } bool supportsDynamicSubcolumns() const override { return true; } + bool supportsSparseSerialization() const override { return true; } bool supportsLightweightDelete() const override; @@ -1242,6 +1243,11 @@ protected: /// protected by @data_parts_mutex. ColumnsDescription object_columns; + /// Serialization info accumulated among all active parts. + /// It changes only when set of parts is changed and is + /// protected by @data_parts_mutex. + SerializationInfoByName serialization_hints; + MergeTreePartsMover parts_mover; /// Executors are common for both ReplicatedMergeTree and plain MergeTree @@ -1530,6 +1536,13 @@ protected: void resetObjectColumnsFromActiveParts(const DataPartsLock & lock); void updateObjectColumns(const DataPartPtr & part, const DataPartsLock & lock); + void resetSerializationHints(const DataPartsLock & lock); + + template + void updateSerializationHints(const AddedParts & added_parts, const RemovedParts & removed_parts, const DataPartsLock & lock); + + SerializationInfoByName getSerializationHints() const override; + /** A structure that explicitly represents a "merge tree" of parts * which is implicitly presented by min-max block numbers and levels of parts. * The children of node are parts which are covered by parent part. diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp index f4be7619fc8..a859172023f 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp @@ -213,11 +213,11 @@ void MergeTreeDataPartWriterCompact::writeDataBlockPrimaryIndexAndSkipIndices(co if (settings.rewrite_primary_key) { - Block primary_key_block = getBlockAndPermute(block, metadata_snapshot->getPrimaryKeyColumns(), nullptr); + Block primary_key_block = getIndexBlockAndPermute(block, metadata_snapshot->getPrimaryKeyColumns(), nullptr); calculateAndSerializePrimaryIndex(primary_key_block, granules_to_write); } - Block skip_indices_block = getBlockAndPermute(block, getSkipIndicesColumns(), nullptr); + Block skip_indices_block = getIndexBlockAndPermute(block, getSkipIndicesColumns(), nullptr); calculateAndSerializeSkipIndices(skip_indices_block, granules_to_write); } diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp index f050accd7a1..04e07a0588a 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp @@ -296,9 +296,9 @@ void MergeTreeDataPartWriterWide::write(const Block & block, const IColumn::Perm auto offset_columns = written_offset_columns ? 
*written_offset_columns : WrittenOffsetColumns{}; Block primary_key_block; if (settings.rewrite_primary_key) - primary_key_block = getBlockAndPermute(block, metadata_snapshot->getPrimaryKeyColumns(), permutation); + primary_key_block = getIndexBlockAndPermute(block, metadata_snapshot->getPrimaryKeyColumns(), permutation); - Block skip_indexes_block = getBlockAndPermute(block, getSkipIndicesColumns(), permutation); + Block skip_indexes_block = getIndexBlockAndPermute(block, getSkipIndicesColumns(), permutation); auto it = columns_list.begin(); for (size_t i = 0; i < columns_list.size(); ++i, ++it) diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index ed68200041b..130d9ca8f6a 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -577,6 +577,13 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( SerializationInfoByName infos(columns, settings); infos.add(block); + for (const auto & [column_name, _] : columns) + { + auto & column = block.getByName(column_name); + if (column.column->isSparse() && infos.getKind(column_name) != ISerialization::Kind::SPARSE) + column.column = recursiveRemoveSparse(column.column); + } + new_data_part->setColumns(columns, infos, metadata_snapshot->getMetadataVersion()); new_data_part->rows_count = block.rows(); new_data_part->existing_rows_count = block.rows(); diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp index bf9aad6545d..bc36343ac93 100644 --- a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include #include @@ -272,7 +272,7 @@ void updateImpl(const ColumnArray * column_array, const ColumnArray::Offsets & c /// Reserving space is mandatory size_t max_thread_pool_size = Context::getGlobalContextInstance()->getServerSettings().max_build_vector_similarity_index_thread_pool_size; if (max_thread_pool_size == 0) - max_thread_pool_size = getNumberOfPhysicalCPUCores(); + max_thread_pool_size = getNumberOfCPUCoresToUse(); unum::usearch::index_limits_t limits(roundUpToPowerOfTwoOrZero(index->size() + rows), max_thread_pool_size); index->reserve(limits); diff --git a/src/Storages/MergeTree/MergeTreeReadTask.h b/src/Storages/MergeTree/MergeTreeReadTask.h index e90a07e0b55..748babb5b4c 100644 --- a/src/Storages/MergeTree/MergeTreeReadTask.h +++ b/src/Storages/MergeTree/MergeTreeReadTask.h @@ -66,7 +66,7 @@ struct MergeTreeReadTaskInfo MergeTreeReadTaskColumns task_columns; /// Shared initialized size predictor. It is copied for each new task. MergeTreeBlockSizePredictorPtr shared_size_predictor; - /// TODO: comment + /// Shared constant fields for virtual columns. VirtualFields const_virtual_fields; /// The amount of data to read per task based on size of the queried columns. 
size_t min_marks_per_task = 0; diff --git a/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h index a17fd163253..6dff60aeaa9 100644 --- a/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h +++ b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h @@ -144,7 +144,7 @@ private: bool supports_subset_of_columns, ContextPtr local_context) override { - auto info = DB::prepareReadingFromFormat(requested_columns, storage_snapshot, supports_subset_of_columns); + auto info = DB::prepareReadingFromFormat(requested_columns, storage_snapshot, local_context, supports_subset_of_columns); if (!current_metadata) { Storage::updateConfiguration(local_context); diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index bc27820707c..040ce8db51d 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -247,9 +247,9 @@ ReadFromFormatInfo StorageObjectStorage::prepareReadingFromFormat( const Strings & requested_columns, const StorageSnapshotPtr & storage_snapshot, bool supports_subset_of_columns, - ContextPtr /* local_context */) + ContextPtr local_context) { - return DB::prepareReadingFromFormat(requested_columns, storage_snapshot, supports_subset_of_columns); + return DB::prepareReadingFromFormat(requested_columns, storage_snapshot, local_context, supports_subset_of_columns); } void StorageObjectStorage::read( diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 641b43e57d6..0b7106de949 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -377,6 +377,8 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade compression_method, need_only_count); + input_format->setSerializationHints(read_from_format_info.serialization_hints); + if (key_condition_) input_format->setKeyCondition(key_condition_); diff --git a/src/Storages/ObjectStorageQueue/ObjectStorageQueueTableMetadata.cpp b/src/Storages/ObjectStorageQueue/ObjectStorageQueueTableMetadata.cpp index 0f356a69556..c7e219038e7 100644 --- a/src/Storages/ObjectStorageQueue/ObjectStorageQueueTableMetadata.cpp +++ b/src/Storages/ObjectStorageQueue/ObjectStorageQueueTableMetadata.cpp @@ -6,7 +6,7 @@ #include #include #include -#include +#include namespace DB @@ -53,7 +53,7 @@ ObjectStorageQueueTableMetadata::ObjectStorageQueueTableMetadata( { processing_threads_num_changed = engine_settings.processing_threads_num.changed; if (!processing_threads_num_changed && engine_settings.processing_threads_num <= 1) - processing_threads_num = std::max(getNumberOfPhysicalCPUCores(), 16); + processing_threads_num = std::max(getNumberOfCPUCoresToUse(), 16); else processing_threads_num = engine_settings.processing_threads_num; } diff --git a/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.cpp b/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.cpp index 2fdbce15503..17cd1b5ac1f 100644 --- a/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.cpp +++ b/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.cpp @@ -298,7 +298,7 @@ void StorageObjectStorageQueue::read( } auto this_ptr = std::static_pointer_cast(shared_from_this()); - auto read_from_format_info = prepareReadingFromFormat(column_names, storage_snapshot, supportsSubsetOfColumns(local_context)); + auto 
read_from_format_info = prepareReadingFromFormat(column_names, storage_snapshot, local_context, supportsSubsetOfColumns(local_context)); auto reading = std::make_unique( column_names, @@ -459,6 +459,7 @@ bool StorageObjectStorageQueue::streamToViews() auto read_from_format_info = prepareReadingFromFormat( block_io.pipeline.getHeader().getNames(), storage_snapshot, + queue_context, supportsSubsetOfColumns(queue_context)); Pipes pipes; diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 1d846b6bb0f..46f4800b497 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -99,6 +99,7 @@ namespace Setting extern const SettingsLocalFSReadMethod storage_file_read_method; extern const SettingsBool use_cache_for_count_from_files; extern const SettingsInt64 zstd_window_log_max; + extern const SettingsBool enable_parsing_to_custom_serialization; } namespace ErrorCodes @@ -1136,7 +1137,6 @@ void StorageFile::setStorageMetadata(CommonArguments args) setInMemoryMetadata(storage_metadata); } - static std::chrono::seconds getLockTimeout(const ContextPtr & context) { const Settings & settings = context->getSettingsRef(); @@ -1209,6 +1209,7 @@ StorageFileSource::StorageFileSource( , requested_columns(info.requested_columns) , requested_virtual_columns(info.requested_virtual_columns) , block_for_format(info.format_header) + , serialization_hints(info.serialization_hints) , max_block_size(max_block_size_) , need_only_count(need_only_count_) { @@ -1439,6 +1440,8 @@ Chunk StorageFileSource::generate() storage->format_name, *read_buf, block_for_format, getContext(), max_block_size, storage->format_settings, max_parsing_threads, std::nullopt, /*is_remote_fs*/ false, CompressionMethod::None, need_only_count); + input_format->setSerializationHints(serialization_hints); + if (key_condition) input_format->setKeyCondition(key_condition); @@ -1630,7 +1633,7 @@ void StorageFile::read( auto this_ptr = std::static_pointer_cast(shared_from_this()); - auto read_from_format_info = prepareReadingFromFormat(column_names, storage_snapshot, supportsSubsetOfColumns(context)); + auto read_from_format_info = prepareReadingFromFormat(column_names, storage_snapshot, context, supportsSubsetOfColumns(context)); bool need_only_count = (query_info.optimize_trivial_count || read_from_format_info.requested_columns.empty()) && context->getSettingsRef()[Setting::optimize_count_from_files]; diff --git a/src/Storages/StorageFile.h b/src/Storages/StorageFile.h index bb969c1877c..6b21353f161 100644 --- a/src/Storages/StorageFile.h +++ b/src/Storages/StorageFile.h @@ -296,6 +296,7 @@ private: NamesAndTypesList requested_columns; NamesAndTypesList requested_virtual_columns; Block block_for_format; + SerializationInfoByName serialization_hints; UInt64 max_block_size; diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 5521a6d168a..f4d2ee67bb6 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -400,17 +400,18 @@ void StorageMergeTree::alter( DatabaseCatalog::instance().getDatabase(table_id.database_name)->alterTable(local_context, table_id, new_metadata); + { + /// Reset Object columns, because column of type + /// Object may be added or dropped by alter. 
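+        /// The serialization hints must be rebuilt here as well: ALTER may change
+        /// the set or the types of the physical columns the hints are keyed by.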
+            auto parts_lock = lockParts();
+            resetObjectColumnsFromActiveParts(parts_lock);
+            resetSerializationHints(parts_lock);
+        }
+
         if (!maybe_mutation_commands.empty())
             mutation_version = startMutation(maybe_mutation_commands, local_context);
     }
 
-    {
-        /// Reset Object columns, because column of type
-        /// Object may be added or dropped by alter.
-        auto parts_lock = lockParts();
-        resetObjectColumnsFromActiveParts(parts_lock);
-    }
-
     if (!maybe_mutation_commands.empty() && query_settings[Setting::alter_sync] > 0)
         waitForMutation(mutation_version, false);
 }
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index 15e44383527..a3d529c5fbb 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -6076,6 +6076,7 @@ bool StorageReplicatedMergeTree::executeMetadataAlter(const StorageReplicatedMer
         /// Object may be added or dropped by alter.
         auto parts_lock = lockParts();
         resetObjectColumnsFromActiveParts(parts_lock);
+        resetSerializationHints(parts_lock);
     }
 
     return true;
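What the relocation in `StorageMergeTree::alter` buys is ordering: the cached column state (Object columns and, now, serialization hints) is rebuilt under the parts lock before the mutation is kicked off, so the mutation can never observe stale hints. A minimal sketch of the pattern, with `std::mutex` standing in for the parts lock and all names invented for illustration:

```cpp
#include <map>
#include <mutex>
#include <string>

struct TableState
{
    std::mutex parts_mutex; // plays the role of lockParts()
    std::map<std::string, std::string> serialization_hints; // column -> kind

    void alter(const std::string & added_column)
    {
        {
            std::lock_guard lock(parts_mutex);
            serialization_hints.clear();                   // resetSerializationHints(parts_lock)
            serialization_hints[added_column] = "Default"; // rebuilt from the active parts
        } // lock released only after the caches are consistent
        startMutation(); // any follow-up work now sees fresh state
    }

    void startMutation() { /* no-op in this sketch */ }
};

int main() { TableState{}.alter("c1"); }
```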
     ActionsDAG materializing_actions(query_plan.getCurrentDataStream().header.getColumnsWithTypeAndName());
-    materializing_actions.addMaterializingOutputActions();
+    materializing_actions.addMaterializingOutputActions(/*materialize_sparse=*/ true);
 
     auto materializing = std::make_unique<ExpressionStep>(query_plan.getCurrentDataStream(), std::move(materializing_actions));
     materializing->setStepDescription("Materialize constants after VIEW subquery");
diff --git a/src/Storages/prepareReadingFromFormat.cpp b/src/Storages/prepareReadingFromFormat.cpp
index 406b7f379f9..b87af449dc5 100644
--- a/src/Storages/prepareReadingFromFormat.cpp
+++ b/src/Storages/prepareReadingFromFormat.cpp
@@ -1,10 +1,19 @@
 #include <Storages/prepareReadingFromFormat.h>
 #include <Formats/FormatFactory.h>
+#include <Core/Settings.h>
+#include <Interpreters/Context.h>
+#include <Interpreters/DatabaseCatalog.h>
+#include <Storages/IStorage.h>
 
 namespace DB
 {
 
-ReadFromFormatInfo prepareReadingFromFormat(const Strings & requested_columns, const StorageSnapshotPtr & storage_snapshot, bool supports_subset_of_columns)
+namespace Setting
+{
+    extern const SettingsBool enable_parsing_to_custom_serialization;
+}
+
+ReadFromFormatInfo prepareReadingFromFormat(const Strings & requested_columns, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context, bool supports_subset_of_columns)
 {
     ReadFromFormatInfo info;
     /// Collect requested virtual columns and remove them from requested columns.
@@ -72,7 +81,35 @@ ReadFromFormatInfo prepareReadingFromFormat(const Strings & requested_columns, c
     /// Create header for InputFormat with columns that will be read from the data.
     info.format_header = storage_snapshot->getSampleBlockForColumns(info.columns_description.getNamesOfPhysical());
+    info.serialization_hints = getSerializationHintsForFileLikeStorage(storage_snapshot->metadata, context);
     return info;
 }
 
+SerializationInfoByName getSerializationHintsForFileLikeStorage(const StorageMetadataPtr & metadata_snapshot, const ContextPtr & context)
+{
+    if (!context->getSettingsRef()[Setting::enable_parsing_to_custom_serialization])
+        return {};
+
+    auto insertion_table = context->getInsertionTable();
+    if (!insertion_table)
+        return {};
+
+    auto storage_ptr = DatabaseCatalog::instance().tryGetTable(insertion_table, context);
+    if (!storage_ptr)
+        return {};
+
+    const auto & our_columns = metadata_snapshot->getColumns();
+    const auto & storage_columns = storage_ptr->getInMemoryMetadataPtr()->getColumns();
+    auto storage_hints = storage_ptr->getSerializationHints();
+    SerializationInfoByName res;
+
+    for (const auto & hint : storage_hints)
+    {
+        if (our_columns.tryGetPhysical(hint.first) == storage_columns.tryGetPhysical(hint.first))
+            res.insert(hint);
+    }
+
+    return res;
+}
+
 }
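The filter at the end of `getSerializationHintsForFileLikeStorage` is the safety valve: a hint from the insertion table survives only when both sides resolve the name to the same physical column, so the parser never builds a layout for a mismatched type. A toy version of that rule (plain maps instead of `ColumnsDescription::tryGetPhysical`, and it simply requires the column to exist on both sides):

```cpp
#include <iostream>
#include <map>
#include <string>

using Columns = std::map<std::string, std::string>; // name -> type
using Hints = std::map<std::string, std::string>;   // name -> serialization kind

Hints filterHints(const Hints & storage_hints, const Columns & ours, const Columns & theirs)
{
    Hints res;
    for (const auto & [name, kind] : storage_hints)
    {
        auto a = ours.find(name);
        auto b = theirs.find(name);
        if (a != ours.end() && b != theirs.end() && a->second == b->second)
            res.emplace(name, kind); // same physical column on both sides: keep the hint
    }
    return res;
}

int main()
{
    Columns file_cols{{"id", "UInt64"}, {"v", "UInt64"}};
    Columns dest_cols{{"id", "UInt64"}, {"v", "String"}}; // type mismatch on "v"
    Hints dest_hints{{"id", "Default"}, {"v", "Sparse"}};
    auto kept = filterHints(dest_hints, file_cols, dest_cols);
    std::cout << kept.count("id") << ' ' << kept.count("v") << '\n'; // 1 0
}
```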
diff --git a/src/Storages/prepareReadingFromFormat.h b/src/Storages/prepareReadingFromFormat.h
index e4d62c29ec6..02e42056d0c 100644
--- a/src/Storages/prepareReadingFromFormat.h
+++ b/src/Storages/prepareReadingFromFormat.h
@@ -1,6 +1,8 @@
 #pragma once
 #include
 #include
+#include <DataTypes/Serializations/SerializationInfo.h>
+#include <Interpreters/Context_fwd.h>
 
 namespace DB
 {
@@ -19,8 +21,14 @@ namespace DB
         NamesAndTypesList requested_columns;
         /// The list of requested virtual columns.
         NamesAndTypesList requested_virtual_columns;
+        /// Hints for the serialization of columns.
+        /// For example can be retrieved from the destination table in INSERT SELECT query.
+        SerializationInfoByName serialization_hints;
     };
 
     /// Get all needed information for reading from data in some input format.
-    ReadFromFormatInfo prepareReadingFromFormat(const Strings & requested_columns, const StorageSnapshotPtr & storage_snapshot, bool supports_subset_of_columns);
+    ReadFromFormatInfo prepareReadingFromFormat(const Strings & requested_columns, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context, bool supports_subset_of_columns);
+
+    /// Returns the serialization hints from the insertion table (if it's set in the Context).
+    SerializationInfoByName getSerializationHintsForFileLikeStorage(const StorageMetadataPtr & metadata_snapshot, const ContextPtr & context);
 }
diff --git a/tests/integration/test_restore_external_engines/test.py b/tests/integration/test_restore_external_engines/test.py
index c28ff973b81..98ecf6d048a 100644
--- a/tests/integration/test_restore_external_engines/test.py
+++ b/tests/integration/test_restore_external_engines/test.py
@@ -120,6 +120,24 @@ SETTINGS input_format_with_names_use_header = 0"""
     )
     assert node1.query(f"SELECT id FROM {dbname}.merge_tree") == "100\n"
 
+    node1.query(
+        f"CREATE DICTIONARY {dbname}.dict1 (id INT, data String) PRIMARY KEY id "
+        f"SOURCE(MYSQL(HOST 'mysql80' PORT 3306 USER 'root' PASSWORD 'clickhouse' DB 'clickhouse' TABLE 'inference_table'))"
+        f"LAYOUT(FLAT()) LIFETIME(MIN 0 MAX 10)"
+    )
+
+    node1.query(
+        f"CREATE DICTIONARY {dbname}.dict2 (name String, value UInt32) PRIMARY KEY value "
+        f"SOURCE(CLICKHOUSE(HOST '127.0.0.2' PORT 9000 USER 'default' PASSWORD '' DB '{dbname}' TABLE 'example_s3_engine_table'))"
+        f"LAYOUT(FLAT()) LIFETIME(MIN 0 MAX 10)"
+    )
+
+    node1.query(
+        f"CREATE DICTIONARY {dbname}.dict3 (name String, value UInt32) PRIMARY KEY value "
+        f"SOURCE(CLICKHOUSE(USER 'default' PASSWORD '' DB '{dbname}' TABLE 'example_s3_engine_table'))"
+        f"LAYOUT(FLAT()) LIFETIME(MIN 0 MAX 10)"
+    )
+
 
 @pytest.fixture(scope="module")
 def start_cluster():
@@ -141,6 +159,9 @@ def test_restore_table(start_cluster):
 
     node2.query(f"BACKUP DATABASE replicated TO {backup_name}")
 
+    node2.query("DROP DICTIONARY IF EXISTS replicated.dict3 SYNC")
+    node2.query("DROP DICTIONARY IF EXISTS replicated.dict2 SYNC")
+    node2.query("DROP DICTIONARY IF EXISTS replicated.dict1 SYNC")
     node2.query("DROP TABLE replicated.example_s3_engine_table")
     node2.query("DROP TABLE replicated.mysql_schema_inference_engine")
     node2.query("DROP TABLE replicated.mysql_schema_inference_function")
@@ -188,6 +209,9 @@ def test_restore_table_null(start_cluster):
 
     node2.query(f"BACKUP DATABASE replicated2 TO {backup_name}")
 
+    node2.query("DROP DICTIONARY IF EXISTS replicated2.dict3 SYNC")
+    node2.query("DROP DICTIONARY IF EXISTS replicated2.dict2 SYNC")
+    node2.query("DROP DICTIONARY IF EXISTS replicated2.dict1 SYNC")
     node2.query("DROP TABLE replicated2.example_s3_engine_table")
     node2.query("DROP TABLE replicated2.mysql_schema_inference_engine")
     node2.query("DROP TABLE replicated2.mysql_schema_inference_function")
@@ -198,7 +222,8 @@ def test_restore_table_null(start_cluster):
     assert node3.query("EXISTS replicated2.mysql_schema_inference_function") == "0\n"
 
     node3.query(
-        f"RESTORE DATABASE replicated2 FROM {backup_name} SETTINGS allow_different_database_def=1, allow_different_table_def=1 SETTINGS restore_replace_external_engines_to_null=1, restore_replace_external_table_functions_to_null=1"
+        f"RESTORE DATABASE replicated2 FROM {backup_name} SETTINGS allow_different_database_def=1, allow_different_table_def=1 "
+        f"SETTINGS restore_replace_external_engines_to_null=1, restore_replace_external_table_functions_to_null=1, restore_replace_external_dictionary_source_to_null=1"
     )
 
     node1.query(f"SYSTEM SYNC DATABASE REPLICA replicated2")
@@ -236,4 +261,7 @@
         )
         == "MergeTree\n"
     )
+    assert "SOURCE(NULL())" in node1.query("SHOW CREATE replicated2.dict1")
+    assert "SOURCE(NULL())" in node1.query("SHOW CREATE replicated2.dict2")
+    assert "SOURCE(CLICKHOUSE(" in node1.query("SHOW CREATE replicated2.dict3")
     cleanup_nodes(nodes, "replicated2")
diff --git a/tests/performance/insert_sparse_column.xml b/tests/performance/insert_sparse_column.xml
new file mode 100644
index 00000000000..0f6cdcec332
--- /dev/null
+++ b/tests/performance/insert_sparse_column.xml
@@ -0,0 +1,17 @@
+<test>
+    <create_query>CREATE TABLE t_insert_sparse (id UInt64, c0 String, c1 String, c2 String, c3 String, c4 String, c5 String, c6 String, c7 String, c8 String, c9 String, c10 String, c11 String, c12 String, c13 String, c14 String, c15 String, c16 String, c17 String, c18 String, c19 String, c20 String, c21 String, c22 String, c23 String, c24 String, c25 String, c26 String, c27 String, c28 String, c29 String, c30 String, c31 String, c32 String, c33 String, c34 String, c35 String, c36 String, c37 String, c38 String, c39 String, c40 String, c41 String, c42 String, c43 String, c44 String, c45 String, c46 String, c47 String, c48 String, c49 String, c50 String, c51 String, c52 String, c53 String, c54 String, c55 String, c56 String, c57 String, c58 String, c59 String, c60 String, c61 String, c62 String, c63 String, c64 String, c65 String, c66 String, c67 String, c68 String, c69 String, c70 String, c71 String, c72 String, c73 String, c74 String, c75 String, c76 String, c77 String, c78 String, c79 String, c80 String, c81 String, c82 String, c83 String, c84 String, c85 String, c86 String, c87 String, c88 String, c89 String, c90 String, c91 String, c92 String, c93 String, c94 String, c95 String, c96 String, c97 String, c98 String, c99 String) ENGINE = MergeTree ORDER BY id</create_query>
+
+    <create_query>SYSTEM STOP MERGES t_insert_sparse</create_query>
+
+    <fill_query>
+        INSERT INTO FUNCTION file('test_data_sparse.json', LineAsString)
+        SELECT '{{"id": ' || number || ', "c' || number % 50 || '": "' || hex(rand()) || '"}}'
+        FROM numbers(100000) SETTINGS engine_file_truncate_on_insert = 1
+    </fill_query>
+
+    <query>INSERT INTO t_insert_sparse SELECT * FROM file('test_data_sparse.json', JSONEachRow)</query>
+
+    <query>INSERT INTO t_insert_sparse SELECT * FROM file('test_data_sparse.json', JSONEachRow)</query>
+
+    <drop_query>DROP TABLE IF EXISTS t_insert_sparse</drop_query>
+</test>
diff --git a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference
index 205e772bbbb..7c541f272c8 100644
--- a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference
+++ b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference
@@ -321,7 +321,6 @@ geohashesInBox
 getMacro
 getOSKernelVersion
 getServerPort
-getSetting
 getSizeOfEnumType
 getTypeSerializationStreams
 globalIn
diff --git a/tests/queries/0_stateless/02423_insert_stats_behaviour.sh b/tests/queries/0_stateless/02423_insert_stats_behaviour.sh
index b85ca311101..5680af7da71 100755
--- a/tests/queries/0_stateless/02423_insert_stats_behaviour.sh
+++ b/tests/queries/0_stateless/02423_insert_stats_behaviour.sh
@@ -4,9 +4,9 @@
 CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 # shellcheck source=../shell_config.sh
"$CUR_DIR"/../shell_config.sh -$CLICKHOUSE_CLIENT -q "CREATE TABLE floats (v Float64) Engine=MergeTree() ORDER BY tuple();" -$CLICKHOUSE_CLIENT -q "CREATE TABLE target_1 (v Float64) Engine=MergeTree() ORDER BY tuple();" -$CLICKHOUSE_CLIENT -q "CREATE TABLE target_2 (v Float64) Engine=MergeTree() ORDER BY tuple();" +$CLICKHOUSE_CLIENT -q "CREATE TABLE floats (v Float64) Engine=MergeTree() ORDER BY tuple() SETTINGS ratio_of_defaults_for_sparse_serialization = 1.0" +$CLICKHOUSE_CLIENT -q "CREATE TABLE target_1 (v Float64) Engine=MergeTree() ORDER BY tuple() SETTINGS ratio_of_defaults_for_sparse_serialization = 1.0;" +$CLICKHOUSE_CLIENT -q "CREATE TABLE target_2 (v Float64) Engine=MergeTree() ORDER BY tuple() SETTINGS ratio_of_defaults_for_sparse_serialization = 1.0;" $CLICKHOUSE_CLIENT -q "CREATE MATERIALIZED VIEW floats_to_target TO target_1 AS SELECT * FROM floats" $CLICKHOUSE_CLIENT -q "CREATE MATERIALIZED VIEW floats_to_target_2 TO target_2 AS SELECT * FROM floats, numbers(2) n" diff --git a/tests/queries/0_stateless/02423_insert_summary_behaviour.sh b/tests/queries/0_stateless/02423_insert_summary_behaviour.sh index b184d9ccf47..cb28724ab58 100755 --- a/tests/queries/0_stateless/02423_insert_summary_behaviour.sh +++ b/tests/queries/0_stateless/02423_insert_summary_behaviour.sh @@ -4,9 +4,9 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CUR_DIR"/../shell_config.sh -$CLICKHOUSE_CLIENT -q "CREATE TABLE floats (v Float64) Engine=MergeTree() ORDER BY tuple();" -$CLICKHOUSE_CLIENT -q "CREATE TABLE target_1 (v Float64) Engine=MergeTree() ORDER BY tuple();" -$CLICKHOUSE_CLIENT -q "CREATE TABLE target_2 (v Float64) Engine=MergeTree() ORDER BY tuple();" +$CLICKHOUSE_CLIENT -q "CREATE TABLE floats (v Float64) Engine=MergeTree() ORDER BY tuple() SETTINGS ratio_of_defaults_for_sparse_serialization = 1.0;" +$CLICKHOUSE_CLIENT -q "CREATE TABLE target_1 (v Float64) Engine=MergeTree() ORDER BY tuple() SETTINGS ratio_of_defaults_for_sparse_serialization = 1.0;" +$CLICKHOUSE_CLIENT -q "CREATE TABLE target_2 (v Float64) Engine=MergeTree() ORDER BY tuple() SETTINGS ratio_of_defaults_for_sparse_serialization = 1.0;" $CLICKHOUSE_CLIENT -q "CREATE MATERIALIZED VIEW floats_to_target TO target_1 AS SELECT * FROM floats" $CLICKHOUSE_CLIENT -q "CREATE MATERIALIZED VIEW floats_to_target_2 TO target_2 AS SELECT * FROM floats, numbers(2) n" diff --git a/tests/queries/0_stateless/03234_get_setting_or_default.reference b/tests/queries/0_stateless/03234_get_setting_or_default.reference new file mode 100644 index 00000000000..0b47065a07b --- /dev/null +++ b/tests/queries/0_stateless/03234_get_setting_or_default.reference @@ -0,0 +1,10 @@ +value_a +value_b +\N +5 +default_e +500 +\N +1 +1 +backup diff --git a/tests/queries/0_stateless/03234_get_setting_or_default.sql b/tests/queries/0_stateless/03234_get_setting_or_default.sql new file mode 100644 index 00000000000..3954e9fe8ab --- /dev/null +++ b/tests/queries/0_stateless/03234_get_setting_or_default.sql @@ -0,0 +1,24 @@ +SET custom_a = 'value_a'; +SET custom_b = 'value_b'; +SET custom_c = null; +SET custom_d = 5; + +SELECT getSettingOrDefault('custom_a', 'default_a'); +SELECT getSettingOrDefault('custom_b', 'default_b'); +SELECT getSettingOrDefault('custom_c', 'default_c'); +SELECT getSettingOrDefault('custom_d', 'default_d'); + +SELECT getSetting('custom_e'); -- { serverError UNKNOWN_SETTING } + +SELECT getSettingOrDefault('custom_e', 'default_e'); +SELECT getSettingOrDefault('custom_e', 500); +SELECT 
+SELECT getSettingOrDefault('custom_e', null);
+SELECT isNull(getSettingOrDefault('custom_e', null));
+
+SELECT getSettingOrDefault('custom_e'); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH }
+SELECT getSettingOrDefault(115, 'name should be string'); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
+
+SELECT count(*) FROM numbers(10) WHERE number = getSettingOrDefault('custom_e', 5);
+
+SET custom_e_backup = 'backup';
+SELECT getSettingOrDefault('custom_e', getSetting('custom_e_backup'));
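The test above pins down the lookup order of `getSettingOrDefault`: an existing custom setting always wins, even when its value is NULL, and the second argument is used only when the setting is unknown. A toy model of that semantics, with `std::optional` standing in for SQL NULL (not the server implementation):

```cpp
#include <iostream>
#include <map>
#include <optional>
#include <string>

using Value = std::optional<std::string>; // std::nullopt models SQL NULL

Value getSettingOrDefault(const std::map<std::string, Value> & settings,
                          const std::string & name, const Value & def)
{
    auto it = settings.find(name);
    return it == settings.end() ? def : it->second; // a defined setting wins, even if NULL
}

int main()
{
    std::map<std::string, Value> custom{{"custom_a", "value_a"}, {"custom_c", std::nullopt}};
    std::cout << *getSettingOrDefault(custom, "custom_a", Value{"default_a"}) << '\n';            // value_a
    std::cout << getSettingOrDefault(custom, "custom_c", Value{"default_c"}).has_value() << '\n'; // 0: NULL wins
    std::cout << *getSettingOrDefault(custom, "custom_e", Value{"default_e"}) << '\n';            // default_e
}
```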
"$CURDIR"/../shell_config.sh + +table_structure="id UInt64" + +for i in {1..250}; do + table_structure+=", c$i String" +done + +$CLICKHOUSE_CLIENT --query " + DROP TABLE IF EXISTS t_insert_mem; + DROP TABLE IF EXISTS t_reference; + + CREATE TABLE t_insert_mem ($table_structure) ENGINE = MergeTree ORDER BY id SETTINGS ratio_of_defaults_for_sparse_serialization = 0.9; + CREATE TABLE t_reference ($table_structure) ENGINE = Log; + + SYSTEM STOP MERGES t_insert_mem; +" + +filename="test_data_sparse_$CLICKHOUSE_DATABASE.json" + +$CLICKHOUSE_CLIENT --query " + INSERT INTO FUNCTION file('$filename', LineAsString) + SELECT format('{{ \"id\": {}, \"c{}\": \"{}\" }}', number, number % 250, hex(number * 1000000)) FROM numbers(30000) + SETTINGS engine_file_truncate_on_insert = 1; + + INSERT INTO FUNCTION s3(s3_conn, filename='$filename', format='LineAsString') + SELECT * FROM file('$filename', LineAsString) + SETTINGS s3_truncate_on_insert = 1; +" + +for _ in {1..4}; do + $CLICKHOUSE_CLIENT --query "INSERT INTO t_reference SELECT * FROM file('$filename', JSONEachRow)" +done; + +$CLICKHOUSE_CLIENT --enable_parsing_to_custom_serialization 1 --query "INSERT INTO t_insert_mem SELECT * FROM file('$filename', JSONEachRow)" +$CLICKHOUSE_CLIENT --enable_parsing_to_custom_serialization 1 --query "INSERT INTO t_insert_mem SELECT * FROM file('$filename', JSONEachRow)" +$CLICKHOUSE_CLIENT --enable_parsing_to_custom_serialization 1 --query "INSERT INTO t_insert_mem SELECT * FROM s3(s3_conn, filename='$filename', format='JSONEachRow')" +$CLICKHOUSE_CLIENT --query "SELECT * FROM file('$filename', LineAsString) FORMAT LineAsString" | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&query=INSERT+INTO+t_insert_mem+FORMAT+JSONEachRow&enable_parsing_to_custom_serialization=1" --data-binary @- + +$CLICKHOUSE_CLIENT --query " + SELECT count() FROM t_insert_mem; + SELECT sum(sipHash64(*)) FROM t_insert_mem; + SELECT sum(sipHash64(*)) FROM t_reference; + + SELECT serialization_kind, count() FROM system.parts_columns + WHERE table = 't_insert_mem' AND database = '$CLICKHOUSE_DATABASE' + GROUP BY serialization_kind ORDER BY serialization_kind; + + SYSTEM FLUSH LOGS; + + SELECT written_bytes <= 3000000 FROM system.query_log + WHERE query LIKE 'INSERT INTO t_insert_mem%' AND current_database = '$CLICKHOUSE_DATABASE' AND type = 'QueryFinish' + ORDER BY event_time_microseconds; + + DROP TABLE IF EXISTS t_insert_mem; + DROP TABLE IF EXISTS t_reference; +" diff --git a/tests/queries/0_stateless/03246_toStartOfInterval_date_timezone_bug.reference b/tests/queries/0_stateless/03246_toStartOfInterval_date_timezone_bug.reference new file mode 100644 index 00000000000..cd2f7be5edb --- /dev/null +++ b/tests/queries/0_stateless/03246_toStartOfInterval_date_timezone_bug.reference @@ -0,0 +1 @@ +2024-10-15 diff --git a/tests/queries/0_stateless/03246_toStartOfInterval_date_timezone_bug.sql b/tests/queries/0_stateless/03246_toStartOfInterval_date_timezone_bug.sql new file mode 100644 index 00000000000..0056877abb0 --- /dev/null +++ b/tests/queries/0_stateless/03246_toStartOfInterval_date_timezone_bug.sql @@ -0,0 +1,3 @@ +SET session_timezone = 'Europe/Amsterdam'; + +SELECT toStartOfInterval(CAST('2024-10-26', 'Date'), toIntervalMonth(1), CAST('2023-01-15', 'Date')); diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 40f57562e1c..2260705323b 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -750,6 +750,7 @@ 
diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt
index 40f57562e1c..2260705323b 100644
--- a/utils/check-style/aspell-ignore/en/aspell-dict.txt
+++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt
@@ -750,6 +750,7 @@ PointDistM
 PointDistRads
 PostHistory
 PostLink
+PostLinks
 PostgreSQLConnection
 PostgreSQLThreads
 Postgres
@@ -1746,6 +1747,7 @@ getMacro
 getOSKernelVersion
 getServerPort
 getSetting
+getSettingOrDefault
 getSizeOfEnumType
 getSubcolumn
 getTypeSerializationStreams
diff --git a/utils/check-style/check-style b/utils/check-style/check-style
index 1b488968782..2737be85a91 100755
--- a/utils/check-style/check-style
+++ b/utils/check-style/check-style
@@ -285,9 +285,6 @@ done
 # There shouldn't be any code snippets under GPL or LGPL
 find $ROOT_PATH/{src,base,programs} -name '*.h' -or -name '*.cpp' 2>/dev/null | xargs grep -i -F 'General Public License' && echo "There shouldn't be any code snippets under GPL or LGPL"
 
-# There shouldn't be any docker containers outside docker directory
-find $ROOT_PATH -not -path $ROOT_PATH'/tests/ci*' -not -path $ROOT_PATH'/docker*' -not -path $ROOT_PATH'/contrib*' -name Dockerfile -type f 2>/dev/null | xargs --no-run-if-empty -n1 echo "Please move Dockerfile to docker directory:"
-
 # There shouldn't be any docker compose files outside docker directory
 find $ROOT_PATH -name '*compose*.yml' -type f -not -path $ROOT_PATH'/docker' -not -path $ROOT_PATH'/tests/integration*' -not -path $ROOT_PATH'/docker*' -not -path $ROOT_PATH'/contrib*' 2>/dev/null | grep -vP $EXCLUDE_DIRS | xargs --no-run-if-empty grep -l "version:" | xargs --no-run-if-empty -n1 echo "Please move docker compose to the 'docker' or 'tests' directory:"