Better check-doc-aspell

This commit is contained in:
vdimir 2022-06-03 16:10:15 +02:00
parent 27ebad89c5
commit 226fcbdac7
No known key found for this signature in database
GPG Key ID: 6EE4CE2BEDC51862
4 changed files with 292 additions and 189 deletions

View File

@ -1,18 +1,26 @@
FixedString personal_ws-1.1 en 479
AArch
ACLs ACLs
AMQP
ASLR
ASan ASan
Actian Actian
AddressSanitizer AddressSanitizer
AppleClang AppleClang
ArrowStream ArrowStream
AvroConfluent AvroConfluent
CCTOOLS
CLion CLion
CMake
CMakeLists CMakeLists
CPUs CPUs
CSVWithNames CSVWithNames
CSVWithNamesAndTypes CSVWithNamesAndTypes
CamelCase CamelCase
CapnProto
CentOS CentOS
ClickHouse
Config
Contrib Contrib
Ctrl Ctrl
CustomSeparated CustomSeparated
@ -25,6 +33,8 @@ Doxygen
Encodings Encodings
Enum Enum
Eoan Eoan
FixedString
FreeBSD
Fuzzer Fuzzer
Fuzzers Fuzzers
GTest GTest
@ -39,17 +49,56 @@ Hostname
IPv IPv
IntN IntN
Integrations Integrations
JSONAsString
JSONColumns
JSONColumnsWithMetadata
JSONCompact
JSONCompactColumns
JSONCompactEachRow
JSONCompactEachRowWithNames
JSONCompactEachRowWithNamesAndTypes
JSONCompactStrings
JSONCompactStringsEachRow
JSONCompactStringsEachRowWithNames
JSONCompactStringsEachRowWithNamesAndTypes
JSONEachRow
JSONEachRowWithProgress
JSONStrings
JSONStringsEachRow
JSONStringsEachRowWithProgress
JSONs
Jaeger
Jemalloc
Jepsen
KDevelop
LGPL
LOCALTIME
LOCALTIMESTAMP
LibFuzzer
LineAsString
LowCardinality LowCardinality
MEMTABLE
MSan
MacOS MacOS
Memcheck Memcheck
MemorySanitizer MemorySanitizer
MergeTree MergeTree
MessagePack MessagePack
MiB MiB
MsgPack
Multiline
Multithreading
MySQLDump
NEKUDOTAYIM
NULLIF
NVME
NuRaft
Ok Ok
OpenSUSE OpenSUSE
OpenStack OpenStack
OpenTelemetry OpenTelemetry
PAAMAYIM
Parsers
Postgres Postgres
Precompiled Precompiled
PrettyCompact PrettyCompact
@ -61,17 +110,31 @@ PrettySpaceNoEscapes
Protobuf Protobuf
ProtobufSingle ProtobufSingle
QTCreator QTCreator
RBAC
RawBLOB RawBLOB
RedHat RedHat
RowBinary RowBinary
RowBinaryWithNames RowBinaryWithNames
RowBinaryWithNamesAndTypes RowBinaryWithNamesAndTypes
Runtime Runtime
SATA
SERIALIZABLE
SIMD
SMALLINT
SQLSTATE
SSSE
Schemas
Stateful Stateful
Submodules Submodules
Subqueries Subqueries
TSVRaw TSVRaw
TSan TSan
TabSeparated
TabSeparatedRaw
TabSeparatedRawWithNames
TabSeparatedRawWithNamesAndTypes
TabSeparatedWithNames
TabSeparatedWithNamesAndTypes
TargetSpecific TargetSpecific
TemplateIgnoreSpaces TemplateIgnoreSpaces
Testflows Testflows
@ -84,6 +147,7 @@ UIntN
UPDATEs UPDATEs
Uint Uint
Updatable Updatable
Util
Valgrind Valgrind
Vectorized Vectorized
VirtualBox VirtualBox
@ -92,31 +156,211 @@ Woboq
WriteBuffer WriteBuffer
WriteBuffers WriteBuffers
XCode XCode
YAML
YYYY
Zipkin Zipkin
ZooKeeper ZooKeeper
ZooKeeper's ZooKeeper's
aarch
allocator
analytics
anonymized
ansi
async
autogeneration
autostart
avro
avx
aws
backoff
backticks
benchmarking
blake
blockSize blockSize
boolean
boringssl
brotli
buildable
camelCase camelCase
capn
capnproto
cardinality
cassandra
cbindgen
ccache
cctz
cfg
changelog
checkouting
checksummed
checksumming
checksums
cityhash
cli
clickhouse
clickstream
cmake
codebase
codec
comparising
config
configs
contrib
coroutines coroutines
cpp
cppkafka cppkafka
cpu cpu
croaring
cronjob cronjob
csv
csvwithnames csvwithnames
csvwithnamesandtypes csvwithnamesandtypes
customseparated customseparated
customseparatedwithnames customseparatedwithnames
customseparatedwithnamesandtypes
cyrus
datacenter
datafiles
dataset
datasets
datetime
dbms dbms
ddl ddl
deallocation
debian
decompressor
denormals
deserialization
deserialized
destructor
destructors
dmesg
dont
dragonbox
durations
endian
enum
fastops
fcoverage
filesystem
filesystems
flatbuffers
fmtlib
formatschema
formatter
fuzzer
fuzzers
gRPC
gcem
github
glibc
googletest
grpc
grpcio
gtest
hardlinks
hdfs
heredoc
heredocs
homebrew
http
https
hyperscan
icudata
instantiation
integrational
integrations
interserver
jdbc
jemalloc
json
jsonasstring
jsoncolumns
jsoncolumnsmonoblock
jsoncompact
jsoncompactcolumns
jsoncompacteachrow
jsoncompacteachrowwithnames
jsoncompacteachrowwithnamesandtypes
jsoncompactstrings
jsoncompactstringseachrow
jsoncompactstringseachrowwithnames
jsoncompactstringseachrowwithnamesandtypes
jsoneachrow
jsoneachrowwithprogress
jsonstrings
jsonstringseachrow
jsonstringseachrowwithprogress
kafka
kafkacat
konsole
latencies
lexicographically
libFuzzer
libc
libcpuid
libcxx
libcxxabi
libdivide
libfarmhash
libfuzzer
libgsasl
libhdfs
libmetrohash
libpq
libpqxx
librdkafka
libs
libunwind
libuv
libvirt
linearizability
linearizable
lineasstring
linefeeds
linux
llvm
localhost
macOS
mariadb
miniselect
msgpack
msgpk
multiline
multithread
murmurhash
mutex
mysql
mysqldump
mysqljs
noop
nullable
num
obfuscator
odbc
ok ok
openldap
opentelemetry
overcommit
parallelization
parallelize
parallelized
parsers
pclmulqdq
performant
poco
popcnt
postfix postfix
postfixes postfixes
postgresql postgresql
pre pre
prebuild prebuild
prebuilt prebuilt
preemptable
preloaded preloaded
preprocessed preprocessed
preprocessor preprocessor
presentational
prestable prestable
prettycompact prettycompact
prettycompactmonoblock prettycompactmonoblock
@ -141,6 +385,9 @@ readonly
rebalanced rebalanced
replxx replxx
repo repo
representable
requestor
resultset
rethrow rethrow
risc risc
ro ro
@ -151,9 +398,14 @@ rowbinarywithnames
rowbinarywithnamesandtypes rowbinarywithnamesandtypes
rsync rsync
runningAccumulate runningAccumulate
runtime
russian russian
rw rw
sasl
schemas
simdjson simdjson
skippingerrors
sparsehash
sql sql
src src
stacktraces stacktraces
@ -175,12 +427,19 @@ subpatterns
subqueries subqueries
subquery subquery
subseconds subseconds
substring
subtree subtree
sudo sudo
symlink symlink
symlinks symlinks
syntaxes syntaxes
systemd systemd
tabseparated
tabseparatedraw
tabseparatedrawwithnames
tabseparatedrawwithnamesandtypes
tabseparatedwithnames
tabseparatedwithnamesandtypes
tcp tcp
templateignorespaces templateignorespaces
tgz tgz
@ -199,7 +458,7 @@ unencrypted
unixodbc unixodbc
url url
userspace userspace
usr utils
variadic variadic
varint varint
vectorized vectorized
@ -208,6 +467,8 @@ wchs
webpage webpage
webserver webserver
wget wget
whitespace
whitespaces
wrt wrt
xcode xcode
xml xml
@ -217,126 +478,3 @@ zkcopy
zlib zlib
znodes znodes
zstd zstd
datacenter
datafiles
dataset
datasets
datetime
deallocation
libmetrohash
libpq
libpqxx
librdkafka
libs
libunwind
libuv
libvirt
linearizability
linearizable
LineAsString
llvm
localhost
macOS
mutex
mysql
MySQLDump
comparising
contrib
decompressor
deserialization
deserialized
destructor
destructors
dmesg
dragonbox
durations
endian
enum
fastops
fcoverage
filesystems
flatbuffers
fmtlib
formatter
FreeBSD
fuzzer
fuzzers
gcem
github
glibc
googletest
gRPC
grpc
grpcio
gtest
hardlinks
heredoc
heredocs
http
https
NuRaft
odbc
parallelization
parallelize
parallelized
Parsers
parsers
pclmulqdq
performant
poco
popcnt
presentational
representable
requestor
resultset
runtime
Jemalloc
jemalloc
Jepsen
KDevelop
konsole
libcxx
libcxxabi
libdivide
libfarmhash
LibFuzzer
libFuzzer
libfuzzer
libgsasl
libhdfs
linefeeds
mariadb
miniselect
MSan
MsgPack
msgpack
msgpk
Multiline
multiline
multithread
Multithreading
murmurhash
mysqljs
openldap
opentelemetry
overcommit
integrational
interserver
Jaeger
jdbc
kafka
kafkacat
lexicographically
libc
libcpuid
sasl
Schemas
schemas
sparsehash
croaring
cyrus
denormals
hyperscan
icudata
instantiation
integrations

View File

@ -1,54 +0,0 @@
aarch
allocator
analytics
anonymized
ansi
async
autogeneration
autostart
avro
avx
aws
backoff
backticks
benchmarking
blake
boolean
boringssl
brotli
buildable
capn
capnproto
cardinality
cassandra
cbindgen
ccache
cctz
cfg
changelog
checkouting
checksummed
checksumming
checksums
cityhash
cli
clickhouse
clickstream
cmake
codebase
codec
configs?
cpp
csv
filesystem
json[a-z]*
latencies
noop
nullable
num
obfuscator
preemptable
substring
tabseparated[a-z]*
utils?
whitespaces?

View File

@ -1,24 +1,37 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# Perform spell checking on the docs # Perform spell checking on the docs
# Files casesensitive.txt and caseinsensitive.txt contains words to ignore (case insensitive and sensitive respectively)
# File todo.txt needs to be revised which words is actual misspellings if [[ ${1:-} == "--help" ]] || [[ ${1:-} == "-h" ]]; then
echo "Usage $0 [--help|-h] [-i]"
echo " --help|-h: print this help"
echo " -i: interactive mode"
exit 0
fi
ROOT_PATH=$(git rev-parse --show-toplevel) ROOT_PATH=$(git rev-parse --show-toplevel)
CHECK_LANG=${1:-en} CHECK_LANG=en
ASPELL_IGNORE_PATH="${ROOT_PATH}/utils/check-style/aspell-ignore/${CHECK_LANG}" ASPELL_IGNORE_PATH="${ROOT_PATH}/utils/check-style/aspell-ignore/${CHECK_LANG}"
STATUS=0 STATUS=0
for fname in ${ROOT_PATH}/docs/${CHECK_LANG}/**/*.md; do for fname in ${ROOT_PATH}/docs/${CHECK_LANG}/**/*.md; do
# vvv ---- remove anchors ---- vvv if [[ ${1:-} == "-i" ]]; then
errors=$(cat "$fname" | sed -E 's/(^#.*) \{#[a-z-]+\}$/\1/' \ echo "Checking $fname"
| aspell list --add-sgml-skip=code --encoding=utf-8 --mode=markdown -W 3 --lang=${CHECK_LANG} --home-dir=${ASPELL_IGNORE_PATH} \ aspell --personal=aspell-dict.txt --add-sgml-skip=code --encoding=utf-8 --mode=markdown -W 3 --lang=${CHECK_LANG} --home-dir=${ASPELL_IGNORE_PATH} -c "$fname"
| grep -Ewv -f "${ASPELL_IGNORE_PATH}/todo.txt" \ continue
| grep -Ewvi -f "${ASPELL_IGNORE_PATH}/caseinsensitive.txt" \ fi
| grep -Ewv -f "${ASPELL_IGNORE_PATH}/casesensitive.txt" \
| grep -Ewv "[A-Z]+" \ errors=$(cat "$fname" \
| aspell list \
-W 3 \
--personal=aspell-dict.txt \
--add-sgml-skip=code \
--encoding=utf-8 \
--mode=markdown \
--lang=${CHECK_LANG} \
--home-dir=${ASPELL_IGNORE_PATH} \
| sort | uniq) | sort | uniq)
if [ ! -z "$errors" ]; then if [ ! -z "$errors" ]; then
STATUS=1 STATUS=1
@ -27,4 +40,10 @@ for fname in ${ROOT_PATH}/docs/${CHECK_LANG}/**/*.md; do
fi fi
done done
if (( STATUS != 0 )); then
echo "====== Errors found ======"
echo "To exclude some words add them to the dictionary file \"${ASPELL_IGNORE_PATH}/aspell-dict.txt\""
echo "You can also run ${0} -i to see the errors interactively and fix them or add to the dictionary file"
fi
exit ${STATUS} exit ${STATUS}