Better check-doc-aspell

This commit is contained in:
vdimir 2022-06-03 16:10:15 +02:00
parent 27ebad89c5
commit 226fcbdac7
No known key found for this signature in database
GPG Key ID: 6EE4CE2BEDC51862
4 changed files with 292 additions and 189 deletions

View File

@ -1,18 +1,26 @@
FixedString
personal_ws-1.1 en 479
AArch
ACLs
AMQP
ASLR
ASan
Actian
AddressSanitizer
AppleClang
ArrowStream
AvroConfluent
CCTOOLS
CLion
CMake
CMakeLists
CPUs
CSVWithNames
CSVWithNamesAndTypes
CamelCase
CapnProto
CentOS
ClickHouse
Config
Contrib
Ctrl
CustomSeparated
@ -25,6 +33,8 @@ Doxygen
Encodings
Enum
Eoan
FixedString
FreeBSD
Fuzzer
Fuzzers
GTest
@ -39,17 +49,56 @@ Hostname
IPv
IntN
Integrations
JSONAsString
JSONColumns
JSONColumnsWithMetadata
JSONCompact
JSONCompactColumns
JSONCompactEachRow
JSONCompactEachRowWithNames
JSONCompactEachRowWithNamesAndTypes
JSONCompactStrings
JSONCompactStringsEachRow
JSONCompactStringsEachRowWithNames
JSONCompactStringsEachRowWithNamesAndTypes
JSONEachRow
JSONEachRowWithProgress
JSONStrings
JSONStringsEachRow
JSONStringsEachRowWithProgress
JSONs
Jaeger
Jemalloc
Jepsen
KDevelop
LGPL
LOCALTIME
LOCALTIMESTAMP
LibFuzzer
LineAsString
LowCardinality
MEMTABLE
MSan
MacOS
Memcheck
MemorySanitizer
MergeTree
MessagePack
MiB
MsgPack
Multiline
Multithreading
MySQLDump
NEKUDOTAYIM
NULLIF
NVME
NuRaft
Ok
OpenSUSE
OpenStack
OpenTelemetry
PAAMAYIM
Parsers
Postgres
Precompiled
PrettyCompact
@ -61,17 +110,31 @@ PrettySpaceNoEscapes
Protobuf
ProtobufSingle
QTCreator
RBAC
RawBLOB
RedHat
RowBinary
RowBinaryWithNames
RowBinaryWithNamesAndTypes
Runtime
SATA
SERIALIZABLE
SIMD
SMALLINT
SQLSTATE
SSSE
Schemas
Stateful
Submodules
Subqueries
TSVRaw
TSan
TabSeparated
TabSeparatedRaw
TabSeparatedRawWithNames
TabSeparatedRawWithNamesAndTypes
TabSeparatedWithNames
TabSeparatedWithNamesAndTypes
TargetSpecific
TemplateIgnoreSpaces
Testflows
@ -84,6 +147,7 @@ UIntN
UPDATEs
Uint
Updatable
Util
Valgrind
Vectorized
VirtualBox
@ -92,31 +156,211 @@ Woboq
WriteBuffer
WriteBuffers
XCode
YAML
YYYY
Zipkin
ZooKeeper
ZooKeeper's
aarch
allocator
analytics
anonymized
ansi
async
autogeneration
autostart
avro
avx
aws
backoff
backticks
benchmarking
blake
blockSize
boolean
boringssl
brotli
buildable
camelCase
capn
capnproto
cardinality
cassandra
cbindgen
ccache
cctz
cfg
changelog
checkouting
checksummed
checksumming
checksums
cityhash
cli
clickhouse
clickstream
cmake
codebase
codec
comparising
config
configs
contrib
coroutines
cpp
cppkafka
cpu
croaring
cronjob
csv
csvwithnames
csvwithnamesandtypes
customseparated
customseparatedwithnames
customseparatedwithnamesandtypes
cyrus
datacenter
datafiles
dataset
datasets
datetime
dbms
ddl
deallocation
debian
decompressor
denormals
deserialization
deserialized
destructor
destructors
dmesg
dont
dragonbox
durations
endian
enum
fastops
fcoverage
filesystem
filesystems
flatbuffers
fmtlib
formatschema
formatter
fuzzer
fuzzers
gRPC
gcem
github
glibc
googletest
grpc
grpcio
gtest
hardlinks
hdfs
heredoc
heredocs
homebrew
http
https
hyperscan
icudata
instantiation
integrational
integrations
interserver
jdbc
jemalloc
json
jsonasstring
jsoncolumns
jsoncolumnsmonoblock
jsoncompact
jsoncompactcolumns
jsoncompacteachrow
jsoncompacteachrowwithnames
jsoncompacteachrowwithnamesandtypes
jsoncompactstrings
jsoncompactstringseachrow
jsoncompactstringseachrowwithnames
jsoncompactstringseachrowwithnamesandtypes
jsoneachrow
jsoneachrowwithprogress
jsonstrings
jsonstringseachrow
jsonstringseachrowwithprogress
kafka
kafkacat
konsole
latencies
lexicographically
libFuzzer
libc
libcpuid
libcxx
libcxxabi
libdivide
libfarmhash
libfuzzer
libgsasl
libhdfs
libmetrohash
libpq
libpqxx
librdkafka
libs
libunwind
libuv
libvirt
linearizability
linearizable
lineasstring
linefeeds
linux
llvm
localhost
macOS
mariadb
miniselect
msgpack
msgpk
multiline
multithread
murmurhash
mutex
mysql
mysqldump
mysqljs
noop
nullable
num
obfuscator
odbc
ok
openldap
opentelemetry
overcommit
parallelization
parallelize
parallelized
parsers
pclmulqdq
performant
poco
popcnt
postfix
postfixes
postgresql
pre
prebuild
prebuilt
preemptable
preloaded
preprocessed
preprocessor
presentational
prestable
prettycompact
prettycompactmonoblock
@ -141,6 +385,9 @@ readonly
rebalanced
replxx
repo
representable
requestor
resultset
rethrow
risc
ro
@ -151,9 +398,14 @@ rowbinarywithnames
rowbinarywithnamesandtypes
rsync
runningAccumulate
runtime
russian
rw
sasl
schemas
simdjson
skippingerrors
sparsehash
sql
src
stacktraces
@ -175,12 +427,19 @@ subpatterns
subqueries
subquery
subseconds
substring
subtree
sudo
symlink
symlinks
syntaxes
systemd
tabseparated
tabseparatedraw
tabseparatedrawwithnames
tabseparatedrawwithnamesandtypes
tabseparatedwithnames
tabseparatedwithnamesandtypes
tcp
templateignorespaces
tgz
@ -199,7 +458,7 @@ unencrypted
unixodbc
url
userspace
usr
utils
variadic
varint
vectorized
@ -208,6 +467,8 @@ wchs
webpage
webserver
wget
whitespace
whitespaces
wrt
xcode
xml
@ -217,126 +478,3 @@ zkcopy
zlib
znodes
zstd
datacenter
datafiles
dataset
datasets
datetime
deallocation
libmetrohash
libpq
libpqxx
librdkafka
libs
libunwind
libuv
libvirt
linearizability
linearizable
LineAsString
llvm
localhost
macOS
mutex
mysql
MySQLDump
comparising
contrib
decompressor
deserialization
deserialized
destructor
destructors
dmesg
dragonbox
durations
endian
enum
fastops
fcoverage
filesystems
flatbuffers
fmtlib
formatter
FreeBSD
fuzzer
fuzzers
gcem
github
glibc
googletest
gRPC
grpc
grpcio
gtest
hardlinks
heredoc
heredocs
http
https
NuRaft
odbc
parallelization
parallelize
parallelized
Parsers
parsers
pclmulqdq
performant
poco
popcnt
presentational
representable
requestor
resultset
runtime
Jemalloc
jemalloc
Jepsen
KDevelop
konsole
libcxx
libcxxabi
libdivide
libfarmhash
LibFuzzer
libFuzzer
libfuzzer
libgsasl
libhdfs
linefeeds
mariadb
miniselect
MSan
MsgPack
msgpack
msgpk
Multiline
multiline
multithread
Multithreading
murmurhash
mysqljs
openldap
opentelemetry
overcommit
integrational
interserver
Jaeger
jdbc
kafka
kafkacat
lexicographically
libc
libcpuid
sasl
Schemas
schemas
sparsehash
croaring
cyrus
denormals
hyperscan
icudata
instantiation
integrations

View File

@ -1,54 +0,0 @@
aarch
allocator
analytics
anonymized
ansi
async
autogeneration
autostart
avro
avx
aws
backoff
backticks
benchmarking
blake
boolean
boringssl
brotli
buildable
capn
capnproto
cardinality
cassandra
cbindgen
ccache
cctz
cfg
changelog
checkouting
checksummed
checksumming
checksums
cityhash
cli
clickhouse
clickstream
cmake
codebase
codec
configs?
cpp
csv
filesystem
json[a-z]*
latencies
noop
nullable
num
obfuscator
preemptable
substring
tabseparated[a-z]*
utils?
whitespaces?

View File

@ -1,24 +1,37 @@
#!/usr/bin/env bash
# Perform spell checking on the docs
# Files casesensitive.txt and caseinsensitive.txt contain words to ignore (case sensitive and case insensitive respectively)
# File todo.txt needs to be revised to determine which words are actual misspellings
# Print the usage text and exit successfully when the first argument asks for help.
case "${1:-}" in
  --help | -h)
    printf '%s\n' \
      "Usage $0 [--help|-h] [-i]" \
      " --help|-h: print this help" \
      " -i: interactive mode"
    exit 0
    ;;
esac
ROOT_PATH=$(git rev-parse --show-toplevel)
CHECK_LANG=${1:-en}
CHECK_LANG=en
ASPELL_IGNORE_PATH="${ROOT_PATH}/utils/check-style/aspell-ignore/${CHECK_LANG}"
STATUS=0
for fname in ${ROOT_PATH}/docs/${CHECK_LANG}/**/*.md; do
# vvv ---- remove anchors ---- vvv
errors=$(cat "$fname" | sed -E 's/(^#.*) \{#[a-z-]+\}$/\1/' \
| aspell list --add-sgml-skip=code --encoding=utf-8 --mode=markdown -W 3 --lang=${CHECK_LANG} --home-dir=${ASPELL_IGNORE_PATH} \
| grep -Ewv -f "${ASPELL_IGNORE_PATH}/todo.txt" \
| grep -Ewvi -f "${ASPELL_IGNORE_PATH}/caseinsensitive.txt" \
| grep -Ewv -f "${ASPELL_IGNORE_PATH}/casesensitive.txt" \
| grep -Ewv "[A-Z]+" \
if [[ ${1:-} == "-i" ]]; then
echo "Checking $fname"
aspell --personal=aspell-dict.txt --add-sgml-skip=code --encoding=utf-8 --mode=markdown -W 3 --lang=${CHECK_LANG} --home-dir=${ASPELL_IGNORE_PATH} -c "$fname"
continue
fi
errors=$(cat "$fname" \
| aspell list \
-W 3 \
--personal=aspell-dict.txt \
--add-sgml-skip=code \
--encoding=utf-8 \
--mode=markdown \
--lang=${CHECK_LANG} \
--home-dir=${ASPELL_IGNORE_PATH} \
| sort | uniq)
if [ ! -z "$errors" ]; then
STATUS=1
@ -27,4 +40,10 @@ for fname in ${ROOT_PATH}/docs/${CHECK_LANG}/**/*.md; do
fi
done
# When any checked file produced spelling errors (STATUS is non-zero), tell the
# user where the dictionary file is and how to start interactive mode; then exit
# with the accumulated status.
if ! (( STATUS == 0 )); then
  printf '%s\n' \
    "====== Errors found ======" \
    "To exclude some words add them to the dictionary file \"${ASPELL_IGNORE_PATH}/aspell-dict.txt\"" \
    "You can also run ${0} -i to see the errors interactively and fix them or add to the dictionary file"
fi
exit ${STATUS}