The problem here is that ignorelist did not work by some reason, if I
will look at the ignored functions it should not contain any TSan
interseption code, while it does:
$ lldb-13 clickhouse
(lldb) target create "clickhouse"
disas -n rd_avg_rollover
Current executable set to '/home/azat/ch/tmp/tsan-test/clickhouse' (x86_64).
(lldb) disas -n rd_avg_rollover
clickhouse`rd_kafka_stats_emit_avg:
clickhouse[0x1cbf84a7] <+39>: leaq 0x30(%r15), %r12
clickhouse[0x1cbf84ab] <+43>: movq %r12, %rdi
clickhouse[0x1cbf84ae] <+46>: callq 0x1ccdad40 ; rdk_thread_mutex_lock at tinycthread.c:111
clickhouse[0x1cbf84b3] <+51>: leaq 0x58(%r15), %rdi
clickhouse[0x1cbf84b7] <+55>: callq 0x71b5390 ; __tsan_read4
clickhouse[0x1cbf84bc] <+60>: cmpl $0x0, 0x58(%r15)
clickhouse[0x1cbf84c1] <+65>: je 0x1cbf8595 ; <+277> [inlined] rd_avg_rollover + 238 at rdavg.h
clickhouse[0x1cbf84c7] <+71>: leaq -0xc8(%rbp), %rdi
clickhouse[0x1cbf84ce] <+78>: xorl %esi, %esi
clickhouse[0x1cbf84d0] <+80>: callq 0x1ccdac80 ; rdk_thread_mutex_init at tinycthread.c:62
clickhouse[0x1cbf84d5] <+85>: leaq 0x5c(%r15), %rdi
clickhouse[0x1cbf84d9] <+89>: callq 0x71b5390 ; __tsan_read4
(lldb) disas -n rd_avg_calc
clickhouse`rd_kafka_broker_ops_io_serve:
clickhouse[0x1cbdf086] <+1990>: leaq 0x5a4(%rbx), %rdi
clickhouse[0x1cbdf08d] <+1997>: callq 0x71b5390 ; __tsan_read4
clickhouse[0x1cbdf092] <+2002>: cmpl $0x0, 0x5a4(%rbx)
clickhouse[0x1cbdf099] <+2009>: je 0x1cbdf12b ; <+2155> [inlined] rd_kafka_broker_timeout_scan + 719 at rdkafka_broker.c
I guess the reason is that they had been inlined
So now rd_avg_calc() guarded with a mutex.
Refs: https://github.com/ClickHouse/librdkafka/pull/11
Fixes: https://github.com/ClickHouse/ClickHouse/issues/60939
Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
ClickHouse uses JEMALLOC_PROF_LIBGCC - _Unwind_Backtrace() for jemalloc,
which calls _Unwind_Backtrace() during bootstrap of jemalloc, and if,
dlsym() has allocations it will lead to deadlock:
0 __lll_lock_wait (futex=futex@entry=0x16516410 <init_lock+64>, private=0) at lowlevellock.c:52
1 0x00007ffff7f9d0a3 in __GI___pthread_mutex_lock (mutex=0x16516410 <init_lock+64>) at ../nptl/pthread_mutex_lock.c:80
2 0x000000000ba6ec23 in pthread_mutex_lock (arg=arg@entry=0x16516410 <init_lock+64>) at ./build/RelWithDebInfo/./src/Common/ThreadFuzzer.cpp:445
3 0x0000000014215fd4 in malloc_mutex_lock_final (mutex=0x165163d0 <init_lock>) at ./contrib/jemalloc/include/jemalloc/internal/mutex.h:151
4 malloc_mutex_lock_slow (mutex=0x165163d0 <init_lock>) at ./build/RelWithDebInfo/./contrib/jemalloc/src/mutex.c:90
5 0x00000000141defc6 in malloc_mutex_lock (tsdn=0x0, mutex=<optimized out>) at ./contrib/jemalloc/include/jemalloc/internal/mutex.h:217
6 malloc_init_hard () at ./build/RelWithDebInfo/./contrib/jemalloc/src/jemalloc.c:2118
7 0x00000000141d05db in malloc_init () at ./build/RelWithDebInfo/./contrib/jemalloc/src/jemalloc.c:298
8 imalloc_init_check (sopts=<optimized out>, dopts=<optimized out>) at ./build/RelWithDebInfo/./contrib/jemalloc/src/jemalloc.c:2658
9 imalloc (sopts=<optimized out>, dopts=<optimized out>) at ./build/RelWithDebInfo/./contrib/jemalloc/src/jemalloc.c:2689
10 calloc (num=num@entry=1, size=size@entry=32) at ./build/RelWithDebInfo/./contrib/jemalloc/src/jemalloc.c:2852
11 0x00007ffff7c42c05 in _dlerror_run (operate=operate@entry=0x7ffff7c42490 <dlsym_doit>, args=args@entry=0x7fffffffda40) at dlerror.c:148
12 0x00007ffff7c42525 in __dlsym (handle=<optimized out>, name=0xe7f568 "dl_iterate_phdr") at dlsym.c:70
13 0x000000001408e1a0 in (anonymous namespace)::getOriginalDLIteratePHDR () at ./build/RelWithDebInfo/./base/base/phdr_cache.cpp:44
14 dl_iterate_phdr (callback=0x16287a60 <libunwind::findUnwindSectionsByPhdr(dl_phdr_info*, unsigned long, void*)>, data=0x7fffffffdb58) at ./build/RelWithDebInfo/./base/base/phdr_cache.cpp:64
15 0x0000000016288468 in libunwind::LocalAddressSpace::findUnwindSections (this=<optimized out>, targetAddr=371765377, info=...) at ./contrib/libunwind/src/AddressSpace.hpp:605
16 libunwind::UnwindCursor<libunwind::LocalAddressSpace, libunwind::Registers_x86_64>::setInfoBasedOnIPRegister (this=this@entry=0x7fffffffde70, isReturnAddress=false) at ./contrib/libunwind/src/UnwindCursor.hpp:2553
17 0x0000000016287617 in __unw_init_local (cursor=cursor@entry=0x7fffffffde70, context=context@entry=0x7fffffffddc8) at ./build/RelWithDebInfo/./contrib/libunwind/src/libunwind.cpp:91
18 0x000000001628b094 in _Unwind_Backtrace (callback=0x14226f40 <prof_unwind_init_callback>, ref=0x0) at ./build/RelWithDebInfo/./contrib/libunwind/src/UnwindLevel1-gcc-ext.c:106
19 0x000000001421a7e0 in prof_boot2 (tsd=tsd@entry=0x7ffff7c357c0, base=<optimized out>) at ./build/RelWithDebInfo/./contrib/jemalloc/src/prof.c:693
20 0x00000000141df3e7 in malloc_init_hard () at ./build/RelWithDebInfo/./contrib/jemalloc/src/jemalloc.c:2151
21 0x00000000141d96f1 in malloc_init () at ./build/RelWithDebInfo/./contrib/jemalloc/src/jemalloc.c:298
22 nallocx (size=32, flags=0) at ./build/RelWithDebInfo/./contrib/jemalloc/src/jemalloc.c:4011
23 0x000000000b9fdd9b in _ZN6Memory23getActualAllocationSizeITpTkNSt3__17same_asISt11align_val_tEEJEQsr2DBE16OptionalArgumentIDpT_EEEmmS5_ (size=32) at ./src/Common/memory.h:197
24 _ZN6Memory11trackMemoryITpTkNSt3__17same_asISt11align_val_tEEJEQsr2DBE16OptionalArgumentIDpT_EEEmmR15AllocationTraceS5_ (size=32, trace=...) at ./src/Common/memory.h:208
25 operator new (size=size@entry=32) at ./build/RelWithDebInfo/./src/Common/new_delete.cpp:53
26 0x000000001540e0a6 in google::protobuf::internal::ShutdownData::get () at ./build/RelWithDebInfo/./contrib/google-protobuf/src/google/protobuf/message_lite.cc:679
27 google::protobuf::internal::OnShutdownRun (f=0x15407420 <google::protobuf::internal::DestroyString(void const*)>, arg=0x167a6368 <google::protobuf::internal::fixed_address_empty_string>) at ./build/RelWithDebInfo/./contrib/google-protobuf/src/google/protobuf/message_lite.cc:697
28 0x0000000015407560 in global constructors keyed to 000101 () at ./contrib/google-protobuf/src/google/protobuf/generated_message_util.h:180
29 0x000000001629101d in __libc_csu_init ()
30 0x00007ffff7dba010 in __libc_start_main (main=0x66e3b00 <main(int, char**)>, argc=4, argv=0x7fffffffe258, init=0x16290fd0 <__libc_csu_init>, fini=<optimized out>, rtld_fini=<optimized out>, stack_end=0x7fffffffe248) at ../csu/libc-start.c:264
31 0x00000000066e302e in _start ()
And this is indeed what happens for glibc prio 2.34, since only in [2]
the allocation had been removed
[2]: https://sourceware.org/git/?p=glibc.git;a=commit;h=fada9018199c21c469ff0e731ef75c6020074ac9
But, jemalloc provides also JEMALLOC_PROF_LIBUNWIND, yes it is not llvm
libunwind, but gnu one, but ClickHouse version of llvm libunwind
provides unw_backtrace(), so we can use it, and it will not be called
during jemalloc bootstrap, so deadlock should not be possible.
Funny thing that for this deadlock the workaround is to enable
background_thread, because a thread created for it will initialize TLS for
__dlerror().
Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
Occasionally, 02479_mysql_connect_to_self fails on CI [1].
[1]: https://github.com/ClickHouse/ClickHouse/issues/50911
The problem was indeed query profiler and EINTR, but not in a way you
may think.
For such failures you may see the following trace in trace_log:
contrib/openssl/crypto/bio/bss_sock.c:127::sock_read
contrib/openssl/crypto/bio/bio_meth.c:121::bread_conv
contrib/openssl/crypto/bio/bio_lib.c:285::bio_read_intern
contrib/openssl/crypto/bio/bio_lib.c:311::BIO_read
contrib/openssl/ssl/record/methods/tls_common.c:398::tls_default_read_n
contrib/openssl/ssl/record/methods/tls_common.c:575::tls_get_more_records
contrib/openssl/ssl/record/methods/tls_common.c:1122::tls_read_record
contrib/openssl/ssl/record/rec_layer_s3.c:645::ssl3_read_bytes
contrib/openssl/ssl/s3_lib.c:4527::ssl3_read_internal
contrib/openssl/ssl/s3_lib.c:4551::ssl3_read
contrib/openssl/ssl/ssl_lib.c:2343::ssl_read_internal
contrib/openssl/ssl/ssl_lib.c:2357::SSL_read
contrib/mariadb-connector-c/libmariadb/secure/openssl.c:729::ma_tls_read
contrib/mariadb-connector-c/libmariadb/ma_tls.c:90::ma_pvio_tls_read
contrib/mariadb-connector-c/libmariadb/ma_pvio.c:250::ma_pvio_read
contrib/mariadb-connector-c/libmariadb/ma_pvio.c:297::ma_pvio_cache_read
contrib/mariadb-connector-c/libmariadb/ma_net.c:373::ma_real_read
contrib/mariadb-connector-c/libmariadb/ma_net.c:427::ma_net_read
contrib/mariadb-connector-c/libmariadb/mariadb_lib.c:192::ma_net_safe_read
contrib/mariadb-connector-c/libmariadb/mariadb_lib.c:2138::mthd_my_read_query_result
contrib/mariadb-connector-c/libmariadb/mariadb_lib.c:2212::mysql_real_query
src/Common/mysqlxx/Query.cpp:56::mysqlxx::Query::executeImpl()
src/Common/mysqlxx/Query.cpp:73::mysqlxx::Query::use()
src/Processors/Sources/MySQLSource.cpp:50::DB::MySQLSource::Connection::Connection()
After which the connection will fail with:
Code: 1000. DB::Exception: Received from localhost:9000. DB::Exception: mysqlxx::ConnectionLost: Lost connection to MySQL server during query (127.0.0.1:9004). (POCO_EXCEPTION)
But, if you will take a look at ma_tls_read() you will see that it has
proper retries for SSL_ERROR_WANT_READ (and EINTR is just a special case
of it), but still, for some reason it fails.
And the reason is the units of the read/write timeout, ma_tls_read()
calls poll(read_timeout) in case of SSL_ERROR_WANT_READ, but, it
incorrectly assume that the timeout is in milliseconds, but that timeout
was in seconds, this bug had been fixed in [2], and now it works like a
charm!
[2]: https://github.com/ClickHouse/mariadb-connector-c/pull/17
I've verified it with patching openssl library:
diff --git a/crypto/bio/bss_sock.c b/crypto/bio/bss_sock.c
index 82f7be85ae..3d2f3926a0 100644
--- a/crypto/bio/bss_sock.c
+++ b/crypto/bio/bss_sock.c
@@ -124,7 +124,24 @@ static int sock_read(BIO *b, char *out, int outl)
ret = ktls_read_record(b->num, out, outl);
else
# endif
- ret = readsocket(b->num, out, outl);
+ {
+ static int i = 0;
+ static int j = 0;
+ if (!(++j % 5))
+ {
+ fprintf(stderr, "sock_read: inject EAGAIN with ret=0\n");
+ ret = 0;
+ errno = EAGAIN;
+ }
+ else if (!(++i % 3))
+ {
+ fprintf(stderr, "sock_read: inject EAGAIN with ret=-1\n");
+ ret = -1;
+ errno = EAGAIN;
+ }
+ else
+ ret = readsocket(b->num, out, outl);
+ }
BIO_clear_retry_flags(b);
if (ret <= 0) {
if (BIO_sock_should_retry(ret))
And before this patch (well, not the patch itself, but the referenced
patch in mariadb-connector-c) if you will pass read_write_timeout=1 it
will fail:
SELECT * FROM mysql('127.0.0.1:9004', system, one, 'default', '', SETTINGS connect_timeout = 100, connection_wait_timeout = 100, read_write_timeout=1)
Code: 1000. DB::Exception: Received from localhost:9000. DB::Exception: mysqlxx::ConnectionLost: Lost connection to MySQL server during query (127.0.0.1:9004). (POCO_EXCEPTION)
But after, it always works:
$ ch benchmark -c10 -q "SELECT * FROM mysql('127.0.0.1:9004', system, one, 'default', '', SETTINGS connection_pool_size=1, connect_timeout = 100, connection_wait_timeout = 100, read_write_timeout=1)"
^CStopping launch of queries. SIGINT received.
Queries executed: 478.
localhost:9000, queries: 478, QPS: 120.171, RPS: 120.171, MiB/s: 0.001, result RPS: 120.171, result MiB/s: 0.001.
0.000% 0.014 sec.
10.000% 0.058 sec.
20.000% 0.065 sec.
30.000% 0.073 sec.
40.000% 0.079 sec.
50.000% 0.087 sec.
60.000% 0.089 sec.
70.000% 0.091 sec.
80.000% 0.095 sec.
90.000% 0.100 sec.
95.000% 0.102 sec.
99.000% 0.105 sec.
99.900% 0.140 sec.
99.990% 0.140 sec.
Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>