From 8fe0e9ed35eec97ffee11856924f2a062c5e6e5e Mon Sep 17 00:00:00 2001 From: Derek Bruening <bruening@google.com> Date: Mon, 14 Dec 2020 16:25:54 -0500 Subject: [PATCH 1/2] i#1921 native sig: Deliver signals to unmanaged threads When a signal arrives in a completely unmanaged thread with no dcontext, typically because DR is detaching, we now deliver the signal if the application has a handler for it. This requires adding support for no dcontext to several parts of the frame setup code even beyond what was added in PR #4603 for temporarily-native threads. We have to save the app's handler when we detach a thread so we know where to send a native signal. Full support is complex when we're cleaning up and have no dynamic storage, so we use a single global handler per signal. We detect whether multiple handlers are in operation in this single DR instance (quite rare: only custom non-pthread clones have this behavior) and in that case we abort like before on a native signal. Adds ATOMIC_1BYTE_READ() to complement the existing atomic operations for a cleaner read of the new multi-handler flag. Delivering the frame often overlaps with DR's frame and even DR's stack usage while delivering, if the app has no sigaltstack. We add logic to detect this overlap and avoid clobbering the stack memory. Alarm signals are still dropped, since they can arrive mid-thread-init when it is even harder to deliver. Adds a new test api.detach_signal which creates 10 threads that all sit in a loop sending 4 different alternating signals (SIGSEGV, SIGBUS, SIGURG, SIGALRM) while the main thread attaches and then detaches. When run in debug build, many many signals arrive in post-detach threads, since detach takes a while to do debug cleanup, exercising the new code. Adds a new RSTAT for native signals so we can identify when this happens in release build. Exports the stat to the API and uses it to check that at least some signals were delivered natively in the new test. 
Removes the fatal error on a signal arriving with no dcontext. But, non-ignore default signal actions when no handler is present are still not fully handled, along with multi-sighand-processes as mentioned, and the fatal error remains in those cases. For default actions, since the process is going to terminate anyway, the only shortcoming of this is whether a core is generated and whether the proper process exit code is raised. Issue: #1921 --- core/arch/arch_exports.h | 34 +++ core/globals.h | 2 + core/lib/statsx.h | 1 + core/unix/signal.c | 288 ++++++++++++++++-------- core/utils.c | 7 +- core/utils.h | 5 +- suite/tests/CMakeLists.txt | 4 + suite/tests/api/detach_signal.cpp | 173 ++++++++++++++ suite/tests/api/detach_signal.templatex | 5 + suite/tests/tools.h | 10 +- 10 files changed, 431 insertions(+), 98 deletions(-) create mode 100644 suite/tests/api/detach_signal.cpp create mode 100644 suite/tests/api/detach_signal.templatex diff --git a/core/arch/arch_exports.h b/core/arch/arch_exports.h index 5dc890e38..4ed83e4c3 100644 --- a/core/arch/arch_exports.h +++ b/core/arch/arch_exports.h @@ -327,6 +327,10 @@ emit_detach_callback_final_jmp(dcontext_t *dcontext, /* note that the microsoft compiler will not enregister variables across asm * blocks that touch those registers, so don't need to worry about clobbering * eax and ebx */ +# define ATOMIC_1BYTE_READ(addr_src, addr_res) \ + do { \ + *(BYTE *)(addr_res) = *(volatile BYTE *)(addr_src); \ + } while (0) # define ATOMIC_1BYTE_WRITE(target, value, hot_patch) \ do { \ ASSERT(sizeof(value) == 1); \ @@ -482,6 +486,13 @@ atomic_add_exchange_int64(volatile int64 *var, int64 value) /* IA-32 vol 3 7.1.4: processor will internally suppress the bus lock * if target is within cache line. 
*/ +# define ATOMIC_1BYTE_READ(addr_src, addr_res) \ + do { \ + __asm__ __volatile__("movb %1, %%al; movb %%al, %0" \ + : "=m"(*(byte *)(addr_res)) \ + : "m"(*(byte *)(addr_src)) \ + : "al"); \ + } while (0) # define ATOMIC_1BYTE_WRITE(target, value, hot_patch) \ do { \ /* allow a constant to be passed in by supplying our own lvalue */ \ @@ -644,6 +655,14 @@ atomic_add_exchange_int64(volatile int64 *var, int64 value) # elif defined(DR_HOST_AARCH64) +# define ATOMIC_1BYTE_READ(addr_src, addr_res) \ + do { \ + /* We use "load-acquire" to add a barrier. */ \ + __asm__ __volatile__("ldarb w0, [%0]; strb w0, [%1]" \ + : \ + : "r"(addr_src), "r"(addr_res) \ + : "w0", "memory"); \ + } while (0) # define ATOMIC_1BYTE_WRITE(target, value, hot_patch) \ do { \ ASSERT(sizeof(value) == 1); \ @@ -829,6 +848,13 @@ atomic_dec_becomes_zero(volatile int *var) # elif defined(DR_HOST_ARM) +# define ATOMIC_1BYTE_READ(addr_src, addr_res) \ + do { \ + __asm__ __volatile__("ldrb r0, [%0]; dmb ish; strb r0, [%1]" \ + : \ + : "r"(addr_src), "r"(addr_res) \ + : "r0", "memory"); \ + } while (0) # define ATOMIC_1BYTE_WRITE(target, value, hot_patch) \ do { \ ASSERT(sizeof(value) == 1); \ @@ -1116,6 +1142,14 @@ atomic_aligned_read_int(volatile int *var) return temp; } +static inline bool +atomic_read_bool(volatile bool *var) +{ + bool temp; + ATOMIC_1BYTE_READ(var, &temp); + return temp; +} + #ifdef X64 static inline int64 atomic_aligned_read_int64(volatile int64 *var) diff --git a/core/globals.h b/core/globals.h index 4f172bca9..46a4288fb 100644 --- a/core/globals.h +++ b/core/globals.h @@ -325,6 +325,8 @@ typedef struct _dr_stats_t { uint64 peak_vmm_blocks_reach_special_heap; /** Peak number of memory blocks used for other reachable mappings. */ uint64 peak_vmm_blocks_reach_special_mmap; + /** Signals delivered to native threads. 
*/ + uint64 num_native_signals; } dr_stats_t; /** diff --git a/core/lib/statsx.h b/core/lib/statsx.h index f8be77002..7acc79bcf 100644 --- a/core/lib/statsx.h +++ b/core/lib/statsx.h @@ -98,6 +98,7 @@ STATS_DEF("SetContextThread w/o CONTEXT_CONTROL", num_app_setcontext_no_control) STATS_DEF("Re-takeovers after native", num_retakeover_after_native) #else RSTATS_DEF("Total signals delivered", num_signals) +RSTATS_DEF("Total signals delivered to native threads", num_native_signals) RSTATS_DEF("Signals dropped", num_signals_dropped) RSTATS_DEF("Signals in coarse units delayed", num_signals_coarse_delayed) #endif diff --git a/core/unix/signal.c b/core/unix/signal.c index ffd1505bc..62eee8111 100644 --- a/core/unix/signal.c +++ b/core/unix/signal.c @@ -233,6 +233,21 @@ static bool removed_sig_handler; os_cxt_ptr_t osc_empty; +/* For delivering signals that arrive in now-native threads while we're still + * detaching other threads. The thread that receives the signal has no dcontext + * anymore, so we need a global record of its handler (DR's handler is still registered + * with the kernel). + * XXX i#1921: To properly handle multiple separate signal handler thread groups + * within this single DR process domain, we would need a list of these, along with + * lists of threads within each. Since that is very rare, and requires either + * hardcoded maximums or complexities with exit-time heap, we currently do not + * support that and die if asked to deliver a native signal in such circumstances. + */ +DECLARE_CXTSWPROT_VAR(static read_write_lock_t detached_sigact_lock, + INIT_READWRITE_LOCK(detached_sigact_lock)); +static kernel_sigaction_t detached_sigact[SIGARRAY_SIZE]; +static bool multiple_handlers_present; /* Accessed using atomics, not the lock. 
*/ + /**** function prototypes ***********************************************/ /* in x86.asm */ @@ -510,6 +525,7 @@ d_r_signal_init(void) void d_r_signal_exit() { + DELETE_READWRITE_LOCK(detached_sigact_lock); IF_LINUX(signalfd_exit()); if (init_info.app_sigaction != NULL) { /* We never took over the app (e.g., standalone mode). Restore its state. */ @@ -960,6 +976,8 @@ signal_thread_inherit(dcontext_t *dcontext, void *clone_record) } else { /* copy handlers */ LOG(THREAD, LOG_ASYNCH, 2, "inheriting signal handlers from parent\n"); + if (!atomic_read_bool(&multiple_handlers_present)) + ATOMIC_1BYTE_WRITE(&multiple_handlers_present, true, false); info->app_sigaction = (kernel_sigaction_t **)handler_alloc( dcontext, SIGARRAY_SIZE * sizeof(kernel_sigaction_t *)); memset(info->app_sigaction, 0, SIGARRAY_SIZE * sizeof(kernel_sigaction_t *)); @@ -2937,10 +2955,38 @@ get_sigstack_frame_ptr(dcontext_t *dcontext, thread_sig_info_t *info, int sig, byte *sp; if (frame != NULL) { - /* signal happened while in cache, grab interrupted xsp */ + /* Signal happened natively or while in the cache: grab interrupted xsp. */ sp = (byte *)sc->SC_XSP; - LOG(THREAD, LOG_ASYNCH, 3, "get_sigstack_frame_ptr: using frame's xsp " PFX "\n", - sp); + /* Handle DR's frame already being on the app stack. For native delivery we + * could try to re-use this frame, but that doesn't work with plain vs rt. + * Instead we move below and live with the downsides of a potential stack + * overflow and confusing the app over why it's so low: but this is an app + * with no sigaltstack so it should have plenty of room. + */ + size_t frame_sz_max = sizeof(sigframe_rt_t) + REDZONE_SIZE + + IF_LINUX(IF_X86((sc->fpstate == NULL ? 0 + : signal_frame_extra_size( + true)) +)) 64 /* align + slop */; + if (sp > (byte *)frame && sp < (byte *)frame + frame_sz_max) { + sp = (byte *)frame; + /* We're probably *on* the stack right where we want to put the frame! 
*/ + byte *cur_sp; + GET_STACK_PTR(cur_sp); + /* We have to copy the frame below our own stack usage high watermark + * in the rest of the code we'll execute from execute_native_handler(). + * Kind of a mess. For now we estimate two pages which should be plenty + * and still not be egregious usage for most app stacks. + */ +#define EXECUTE_NATIVE_STACK_USAGE (4096 * 2) + if (sp > cur_sp && sp - frame_sz_max - EXECUTE_NATIVE_STACK_USAGE < cur_sp) { + sp = cur_sp - frame_sz_max - EXECUTE_NATIVE_STACK_USAGE; + } + LOG(THREAD, LOG_ASYNCH, 3, + "get_sigstack_frame_ptr: moving beyond same-stack frame to %p\n", sp); + } else { + LOG(THREAD, LOG_ASYNCH, 3, + "get_sigstack_frame_ptr: using frame's xsp " PFX "\n", sp); + } } else { /* signal happened while in DR, use stored xsp */ sp = (byte *)get_mcontext(dcontext)->xsp; @@ -3059,16 +3105,15 @@ convert_frame_to_nonrt(dcontext_t *dcontext, int sig, sigframe_rt_t *f_old, * the intra-frame absolute pointers to point to the new addresses * in f_new. * Only updates the pretcode to the stored app restorer if for_app. + * dcontext can be NULL (which is why we take in info). */ -void -fixup_rtframe_pointers(dcontext_t *dcontext, int sig, sigframe_rt_t *f_old, - sigframe_rt_t *f_new, bool for_app, size_t f_new_alloc_size) +static void +fixup_rtframe_pointers(dcontext_t *dcontext, thread_sig_info_t *info, int sig, + sigframe_rt_t *f_old, sigframe_rt_t *f_new, bool for_app, + size_t f_new_alloc_size) { - if (dcontext == NULL) - dcontext = get_thread_private_dcontext(); - ASSERT(dcontext != NULL); + ASSERT(info != NULL); #if defined(X86) && defined(LINUX) - thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field; bool has_restorer = sig_has_restorer(info, sig); # ifdef DEBUG uint level = 3; @@ -3204,6 +3249,9 @@ memcpy_rt_frame(sigframe_rt_t *frame, byte *dst, bool from_pending) return; } #endif + /* Ensure no overlap. 
*/ + ASSERT(dst >= (byte *)(frame + 1) + signal_frame_extra_size(true) || + dst + sizeof(sigframe_rt_t) + signal_frame_extra_size(true) <= (byte *)frame); memcpy(dst, frame, sizeof(sigframe_rt_t)); } @@ -3214,12 +3262,12 @@ memcpy_rt_frame(sigframe_rt_t *frame, byte *dst, bool from_pending) * If no restorer, touches up pretcode * (and if rt_frame, touches up pinfo and puc) * Also touches up fpstate pointer + * dcontext can be NULL (which is why we take in info). */ static void -copy_frame_to_stack(dcontext_t *dcontext, int sig, sigframe_rt_t *frame, byte *sp, - bool from_pending) +copy_frame_to_stack(dcontext_t *dcontext, thread_sig_info_t *info, int sig, + sigframe_rt_t *frame, byte *sp, bool from_pending) { - thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field; bool rtframe = IS_RT_FOR_APP(info, sig); uint frame_size = get_app_frame_size(info, sig); #if defined(LINUX) && defined(X86_32) @@ -3247,17 +3295,24 @@ copy_frame_to_stack(dcontext_t *dcontext, int sig, sigframe_rt_t *frame, byte *s false /* don't own initexit_lock */, false /* keep futures */); } - TRY_EXCEPT(dcontext, /* try */ - { - if (rtframe) { - ASSERT(frame_size == sizeof(*frame)); - memcpy_rt_frame(frame, sp, from_pending); - } - IF_NOT_X64( - IF_LINUX(else convert_frame_to_nonrt(dcontext, sig, frame, - (sigframe_plain_t *)sp);)); - }, - /* except */ { stack_unwritable = true; }); + ASSERT(!rtframe || frame_size == sizeof(*frame)); + if (dcontext == NULL) { + /* We have no safe-read mechanism so we do the best we can: blindly copy. 
*/ + if (rtframe) + memcpy_rt_frame(frame, sp, from_pending); + IF_NOT_X64(IF_LINUX( + else convert_frame_to_nonrt(dcontext, sig, frame, (sigframe_plain_t *)sp);)); + } else { + TRY_EXCEPT(dcontext, /* try */ + { + if (rtframe) + memcpy_rt_frame(frame, sp, from_pending); + IF_NOT_X64( + IF_LINUX(else convert_frame_to_nonrt( + dcontext, sig, frame, (sigframe_plain_t *)sp);)); + }, + /* except */ { stack_unwritable = true; }); + } if (stack_unwritable) { /* Override the no-nested check in record_pending_signal(): it's ok b/c * receive_pending_signal() calls to here at a consistent point, @@ -3292,7 +3347,7 @@ copy_frame_to_stack(dcontext_t *dcontext, int sig, sigframe_rt_t *frame, byte *s /* fix up pretcode, pinfo, puc, fpstate */ if (rtframe) { sigframe_rt_t *f_new = (sigframe_rt_t *)sp; - fixup_rtframe_pointers(dcontext, sig, frame, f_new, true /*for app*/, size); + fixup_rtframe_pointers(dcontext, info, sig, frame, f_new, true /*for app*/, size); #ifdef HAVE_SIGALTSTACK /* Make sure the frame's sigstack reflects the app stack, both for transparency * of the app examining it and for correctness if we detach mid-handler. @@ -3396,7 +3451,8 @@ copy_frame_to_pending(dcontext_t *dcontext, int sig, #ifdef MACOS /* We rely on puc to find sc to we have to fix it up */ - fixup_rtframe_pointers(dcontext, sig, frame, dst, false /*!for app*/, sizeof(*dst)); + fixup_rtframe_pointers(dcontext, info, sig, frame, dst, false /*!for app*/, + sizeof(*dst)); #endif LOG(THREAD, LOG_ASYNCH, 3, "copy_frame_to_pending from " PFX "\n", frame); @@ -4929,27 +4985,6 @@ master_signal_handler_C(byte *xsp) } #endif - /* We are dropping asynchronous signals during detach. The thread may already have - * lost its TLS (xref i#3535). A safe read may result in a crash if DR's SIGSEGV - * handler is removed before the safe read's SIGSEGV is delivered. 
- * - * Note that besides dropping potentially important signals, there is still a small - * race window if the signal gets delivered after the detach has finished, i.e. - * doing detach is false. This is an issue in particular if the app has started - * re-attaching. - * - * Signals that are not clearly asynchronous may hit corner case(s) of i#3535. - * (xref i#26). - */ - if (doing_detach && can_always_delay[sig]) { - DOLOG(1, LOG_ASYNCH, { dump_sigcontext(GLOBAL_DCONTEXT, sc); }); - SYSLOG_INTERNAL_ERROR("ERROR: master_signal_handler with unreliable dcontext " - "during detach. Signal will be dropped and we're continuing" - " (i#3535?): tid=%d, sig=%d", - get_sys_thread_id(), sig); - return; - } - dcontext_t *dcontext = get_thread_private_dcontext(); #ifdef MACOS @@ -5034,17 +5069,21 @@ master_signal_handler_C(byte *xsp) (dcontext != GLOBAL_DCONTEXT && (dcontext->signal_field == NULL || !((thread_sig_info_t *)dcontext->signal_field)->fully_initialized))) { - /* FIXME: || !intercept_asynch, or maybe !under_our_control */ - /* FIXME i#26: this could be a signal arbitrarily sent to this thread. + /* XXX i#26: this could be a signal arbitrarily sent to this thread. * We could try to route it to another thread, using a global queue * of pending signals. But what if it was targeted to this thread * via SYS_{tgkill,tkill}? Can we tell the difference, even if * we watch the kill syscalls: could come from another process? + * For now we use our native thread signal delivery mechanism. */ if (sig_is_alarm_signal(sig)) { - /* assuming an alarm during thread exit or init (xref PR 596127, - * i#359): suppressing is fine + /* Assuming an alarm during thread exit or init (xref PR 596127, + * i#359). Delivering it sometimes works, but we could be mid-init + * where it's not easy and we could crash b/c of in-between state + * (observed for alarm mid-signal_thread_init where info->app_sigaction + * is still NULL). 
*/ + return; } else if (sig == SUSPEND_SIGNAL && dcontext == NULL) { /* We sent SUSPEND_SIGNAL to a thread we don't control (no * dcontext), which means we want to take over. @@ -5054,29 +5093,18 @@ master_signal_handler_C(byte *xsp) return; ASSERT_NOT_REACHED(); /* else, shouldn't return */ } else { - /* Using global dcontext because dcontext is NULL here. */ + LOG(GLOBAL, LOG_ASYNCH, 1, + "signal with no siginfo (tid=%d, sig=%d): attempting to deliver to " + "native thread\n", + get_sys_thread_id(), sig); DOLOG(1, LOG_ASYNCH, { dump_sigcontext(GLOBAL_DCONTEXT, sc); }); - SYSLOG_INTERNAL_ERROR("ERROR: master_signal_handler with no siginfo " - "(i#26?): tid=%d, sig=%d", - get_sys_thread_id(), sig); } - /* TODO i#1921: call execute_native_handler() here (and remove SYSLOG). */ - - /* see FIXME comments above. - * workaround for now: suppressing is better than dying. + /* For can_always_delay[sig] we could just return and drop it, but we + * try to perturb the app behavior less with a native signal frame: */ - if (can_always_delay[sig]) - return; - - char signum_str[8]; - snprintf(signum_str, BUFFER_SIZE_ELEMENTS(signum_str), "%d", sig); - NULL_TERMINATE_BUFFER(signum_str); - char tid_str[16]; - snprintf(tid_str, BUFFER_SIZE_ELEMENTS(tid_str), TIDFMT, get_sys_thread_id()); - NULL_TERMINATE_BUFFER(tid_str); - REPORT_FATAL_ERROR_AND_EXIT(FAILED_TO_HANDLE_SIGNAL, 4, get_application_name(), - get_application_pid(), signum_str, tid_str); + execute_native_handler(dcontext, sig, frame); + return; } /* we may be entering dynamo from code cache! 
*/ @@ -5470,7 +5498,7 @@ execute_handler_from_cache(dcontext_t *dcontext, int sig, sigframe_rt_t *our_fra LOG(THREAD, LOG_ASYNCH, 3, "\txsp is " PFX "\n", xsp); /* copy frame to appropriate stack and convert to non-rt if necessary */ - copy_frame_to_stack(dcontext, sig, our_frame, (void *)xsp, false /*!pending*/); + copy_frame_to_stack(dcontext, info, sig, our_frame, (void *)xsp, false /*!pending*/); LOG(THREAD, LOG_ASYNCH, 3, "\tcopied frame from " PFX " to " PFX "\n", our_frame, xsp); sigcontext_t *app_sc = get_sigcontext_from_app_frame(info, sig, (void *)xsp); @@ -5701,7 +5729,7 @@ execute_handler_from_dispatch(dcontext_t *dcontext, int sig) * chance to make changes, copy the frame to the appropriate stack * location and convert to non-rt if necessary */ - copy_frame_to_stack(dcontext, sig, frame, xsp, true /*pending*/); + copy_frame_to_stack(dcontext, info, sig, frame, xsp, true /*pending*/); /* now point at the app's frame */ sc = get_sigcontext_from_app_frame(info, sig, (void *)xsp); @@ -5780,6 +5808,21 @@ execute_handler_from_dispatch(dcontext_t *dcontext, int sig) return true; } +/* Does not return. */ +static void +report_unhandleable_signal_and_exit(int sig) +{ + /* TODO i#1921: Add more info such as the PC and disasm of surrounding instrs. */ + char signum_str[8]; + snprintf(signum_str, BUFFER_SIZE_ELEMENTS(signum_str), "%d", sig); + NULL_TERMINATE_BUFFER(signum_str); + char tid_str[16]; + snprintf(tid_str, BUFFER_SIZE_ELEMENTS(tid_str), TIDFMT, get_sys_thread_id()); + NULL_TERMINATE_BUFFER(tid_str); + REPORT_FATAL_ERROR_AND_EXIT(FAILED_TO_HANDLE_SIGNAL, 4, get_application_name(), + get_application_pid(), signum_str, tid_str); +} + /* Sends a signal to a currently-native thread. dcontext can be NULL. 
*/ static void execute_native_handler(dcontext_t *dcontext, int sig, sigframe_rt_t *our_frame) @@ -5789,19 +5832,29 @@ execute_native_handler(dcontext_t *dcontext, int sig, sigframe_rt_t *our_frame) */ thread_sig_info_t synthetic = {}; kernel_sigaction_t *sigact_array[SIGARRAY_SIZE] = {}; + bool intercept_array[SIGARRAY_SIZE]; kernel_sigaction_t sigact_struct; thread_sig_info_t *info; + LOG(THREAD, LOG_ASYNCH, 2, "%s for signal %d in thread " TIDFMT "\n", __FUNCTION__, + sig, get_sys_thread_id()); if (dcontext != NULL) { info = (thread_sig_info_t *)dcontext->signal_field; } else { + if (atomic_read_bool(&multiple_handlers_present)) { + /* See i#1921 comment up top: we don't handle this. */ + report_unhandleable_signal_and_exit(sig); + ASSERT_NOT_REACHED(); + } info = &synthetic; + synthetic.we_intercept = intercept_array; synthetic.we_intercept[sig] = true; synthetic.app_sigaction = sigact_array; - sigact_array[sig] = &sigact_struct; - DEBUG_DECLARE(int rc =) - sigaction_syscall(sig, NULL, &sigact_struct); + synthetic.app_sigaction[sig] = &sigact_struct; + d_r_read_lock(&detached_sigact_lock); + memcpy(&sigact_struct, &detached_sigact[sig], sizeof(sigact_struct)); + d_r_read_unlock(&detached_sigact_lock); #ifdef HAVE_SIGALTSTACK - IF_DEBUG(rc =) + IF_DEBUG(int rc =) sigaltstack_syscall(NULL, &synthetic.app_sigstack); ASSERT(rc == 0); #endif @@ -5817,28 +5870,53 @@ execute_native_handler(dcontext_t *dcontext, int sig, sigframe_rt_t *our_frame) if (info->app_sigaction[sig] == NULL || info->app_sigaction[sig]->handler == (handler_t)SIG_DFL) { - /* TODO i#1921: We need to pass in an explicit info param with our - * synthetic version. - * Plus we need to audit this callee: not all its code will work with - * a NULL dcontext. + /* TODO i#1921: We want to call: + * execute_default_from_cache(dcontext, sig, our_frame, sc, false) + * But we need to pass in our sythentic info. + * And not all that callee's code will work with a NULL dcontext yet: it needs + * some work. 
* Plus it calls handle_free() which we need to arrange for. + * For now we bail. */ - execute_default_from_cache(dcontext, sig, our_frame, sc, false); + if (can_always_delay[sig]) { + /* We are dropping asynchronous signals during detach. The thread may + * already have lost its TLS (xref i#3535). A safe read may result in a + * crash if DR's SIGSEGV handler is removed before the safe read's + * SIGSEGV is delivered. + * + * Note that besides dropping potentially important signals, there is + * still a small race window if the signal gets delivered after the + * detach has finished, i.e. doing detach is false. This is an issue in + * particular if the app has started re-attaching. + * + * Signals that are not clearly asynchronous may hit corner case(s) of + * i#3535. (xref i#26). + */ + if (doing_detach) { + DOLOG(1, LOG_ASYNCH, { dump_sigcontext(GLOBAL_DCONTEXT, sc); }); + SYSLOG_INTERNAL_ERROR( + "Signal in native thread during detach with default action is not " + "fully supported: dropping and continuing (tid=%d, sig=%d)", + get_sys_thread_id(), sig); + } + return; /* Just drop the signal. */ + } + if (default_action[sig] == DEFAULT_IGNORE) + return; + report_unhandleable_signal_and_exit(sig); + ASSERT_NOT_REACHED(); return; } ASSERT(info->app_sigaction[sig] != NULL && info->app_sigaction[sig]->handler != (handler_t)SIG_IGN && info->app_sigaction[sig]->handler != (handler_t)SIG_DFL); - /* TODO i#1921: Can we have a LOG(THREAD_OR_GLOBAL so it's in the thread file if - * we have a dc, but global instead of dropped o/w? 
- */ - LOG(THREAD, LOG_ASYNCH, 2, "%s for signal %d\n", __FUNCTION__, sig); RSTATS_INC(num_signals); + RSTATS_INC(num_native_signals); report_app_problem(dcontext, APPFAULT_FAULT, (byte *)sc->SC_XIP, (byte *)sc->SC_FP, "\nSignal %d delivered to application handler.\n", sig); - copy_frame_to_stack(dcontext, sig, our_frame, (void *)xsp, false /*!pending*/); + copy_frame_to_stack(dcontext, info, sig, our_frame, (void *)xsp, false /*!pending*/); blocked = info->app_sigaction[sig]->mask; if (!TEST(SA_NOMASK, (info->app_sigaction[sig]->flags))) @@ -5853,7 +5931,7 @@ execute_native_handler(dcontext_t *dcontext, int sig, sigframe_rt_t *our_frame) * straight to the app handler. */ sc->SC_XSP = (ptr_uint_t)xsp; - /* Set up args to handler. */ +/* Set up args to handler. */ #ifdef X86_64 sc->SC_XDI = sig; sc->SC_XSI = (reg_t) & ((sigframe_rt_t *)xsp)->info; @@ -5871,11 +5949,18 @@ execute_native_handler(dcontext_t *dcontext, int sig, sigframe_rt_t *our_frame) #endif sc->SC_XIP = (reg_t)SIGACT_PRIMARY_HANDLER(info->app_sigaction[sig]); - if (TEST(SA_ONESHOT, info->app_sigaction[sig]->flags)) - info->app_sigaction[sig]->handler = (handler_t)SIG_DFL; - LOG(THREAD, LOG_ASYNCH, 2, "%s: set pc to handler %p with xsp=%p\n", __FUNCTION__, SIGACT_PRIMARY_HANDLER(info->app_sigaction[sig]), xsp); + + if (TEST(SA_ONESHOT, detached_sigact[sig].flags)) { + /* XXX: When we remove DR's handler from the final thread we'll ignore this + * delivery: but we can't safely access that data struct here. The best we + * can do is avoid delivering twice ourselves. 
+ */ + d_r_write_lock(&detached_sigact_lock); + memset(&detached_sigact[sig], 0, sizeof(detached_sigact[sig])); + d_r_write_unlock(&detached_sigact_lock); + } } /* The arg to SYS_kill, i.e., the signal number, should be in dcontext->sys_param0 */ @@ -7446,6 +7531,27 @@ sig_detach(dcontext_t *dcontext, sigframe_rt_t *frame, KSYNCH_TYPE *detached) LOG(THREAD, LOG_ASYNCH, 1, "%s: detaching\n", __FUNCTION__); + if (!atomic_read_bool(&multiple_handlers_present)) { + /* Save the app's handlers (we only support one handler setups) for + * execute_native_handler(). + */ + d_r_read_lock(&detached_sigact_lock); + for (int i = 0; i < SIGARRAY_SIZE; ++i) { + if (info->app_sigaction[i] != NULL && + detached_sigact[i].handler == (handler_t)0) { + d_r_read_unlock(&detached_sigact_lock); + d_r_write_lock(&detached_sigact_lock); + if (detached_sigact[i].handler == (handler_t)0) { + memcpy(&detached_sigact[i], info->app_sigaction[i], + sizeof(detached_sigact[i])); + } + d_r_write_unlock(&detached_sigact_lock); + d_r_read_lock(&detached_sigact_lock); + } + } + d_r_read_unlock(&detached_sigact_lock); + } + /* Update the mask of the signal frame so that the later sigreturn will * restore the app signal mask. */ @@ -7457,7 +7563,7 @@ sig_detach(dcontext_t *dcontext, sigframe_rt_t *frame, KSYNCH_TYPE *detached) * and that we're not clobbering any app data beyond TOS. */ xsp = get_sigstack_frame_ptr(dcontext, info, SUSPEND_SIGNAL, frame); - copy_frame_to_stack(dcontext, SUSPEND_SIGNAL, frame, xsp, false /*!pending*/); + copy_frame_to_stack(dcontext, info, SUSPEND_SIGNAL, frame, xsp, false /*!pending*/); #ifdef HAVE_SIGALTSTACK /* Make sure the frame's sigstack reflects the app stack. 
diff --git a/core/utils.c b/core/utils.c index 0fbfe16cf..93268416c 100644 --- a/core/utils.c +++ b/core/utils.c @@ -414,7 +414,9 @@ locks_not_closed() cur_lock->rank == LOCK_RANK(report_buf_lock) || cur_lock->rank == LOCK_RANK(datasec_selfprot_lock) || cur_lock->rank == LOCK_RANK(logdir_mutex) || - cur_lock->rank == LOCK_RANK(options_lock))) { + cur_lock->rank == LOCK_RANK(options_lock) || + /* This lock can be used parallel to detach cleanup. */ + cur_lock->rank == LOCK_RANK(detached_sigact_lock))) { /* i#1058: curiosities during exit re-acquire these locks. */ ignored++; } else { @@ -4626,5 +4628,8 @@ stats_get_snapshot(dr_stats_t *drstats) drstats->peak_vmm_blocks_reach_special_mmap = GLOBAL_STAT(peak_vmm_blocks_reach_special_mmap); } + if (drstats->size > offsetof(dr_stats_t, num_native_signals)) { + drstats->num_native_signals = GLOBAL_STAT(num_native_signals); + } return true; } diff --git a/core/utils.h b/core/utils.h index 10d50d982..deb0f33f4 100644 --- a/core/utils.h +++ b/core/utils.h @@ -542,6 +542,9 @@ enum { # endif # ifdef WINDOWS LOCK_RANK(alt_tls_lock), +# endif +# ifdef UNIX + LOCK_RANK(detached_sigact_lock), # endif /* ADD HERE a lock around section that may allocate memory */ @@ -1223,7 +1226,7 @@ bitmap_check_consistency(bitmap_t b, uint bitmap_size, uint expect_free); # endif /* INTERNAL */ # define THREAD \ ((dcontext == NULL) \ - ? INVALID_FILE \ + ? main_logfile \ : ((dcontext == GLOBAL_DCONTEXT) ? 
main_logfile : dcontext->logfile)) # define THREAD_GET get_thread_private_logfile() # define GLOBAL main_logfile diff --git a/suite/tests/CMakeLists.txt b/suite/tests/CMakeLists.txt index 6527a542e..76f434806 100644 --- a/suite/tests/CMakeLists.txt +++ b/suite/tests/CMakeLists.txt @@ -2848,6 +2848,10 @@ if (CLIENT_INTERFACE) set(api.detach_state_runavx 1) endif () endif () + if (UNIX) + tobuild_api(api.detach_signal api/detach_signal.cpp "" "" OFF OFF) + link_with_pthread(api.detach_signal) + endif () if (NOT WIN32) # XXX i#2611: fix for Windows if (X64) set(detach_spawn_name api.detach_spawn) diff --git a/suite/tests/api/detach_signal.cpp b/suite/tests/api/detach_signal.cpp new file mode 100644 index 000000000..05e0f5832 --- /dev/null +++ b/suite/tests/api/detach_signal.cpp @@ -0,0 +1,173 @@ +/* ********************************************************** + * Copyright (c) 2011-2020 Google, Inc. All rights reserved. + * Copyright (c) 2003-2008 VMware, Inc. All rights reserved. + * **********************************************************/ + +/* + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of VMware, Inc. nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* XXX: We undef this b/c it's easier than getting rid of from CMake with the + * global cflags config where all the other tests want this set. + */ +#undef DR_REG_ENUM_COMPATIBILITY + +#include <assert.h> +#include <stdio.h> +#include <math.h> +#include <setjmp.h> +#include <signal.h> +#include <stdint.h> +#include <atomic> +#include "configure.h" +#include "dr_api.h" +#include "tools.h" +#include "thread.h" +#include "condvar.h" + +#define VERBOSE 0 +#define NUM_THREADS 10 + +#if VERBOSE +# define VPRINT(...) print(__VA_ARGS__) +#else +# define VPRINT(...) 
/* nothing */ +#endif + +static volatile bool sideline_exit = false; +static void *sideline_continue; +static void *sideline_ready[NUM_THREADS]; + +static thread_local SIGJMP_BUF mark; +static std::atomic<int> count; + +static void +handle_signal(int signal, siginfo_t *siginfo, ucontext_t *ucxt) +{ + count++; + SIGLONGJMP(mark, count); +} + +THREAD_FUNC_RETURN_TYPE +sideline_spinner(void *arg) +{ + unsigned int idx = (unsigned int)(uintptr_t)arg; + if (dr_app_running_under_dynamorio()) + print("ERROR: thread %d should NOT be under DynamoRIO\n", idx); + VPRINT("%d signaling sideline_ready\n", idx); + signal_cond_var(sideline_ready[idx]); + + VPRINT("%d waiting for continue\n", idx); + wait_cond_var(sideline_continue); + if (!dr_app_running_under_dynamorio()) + print("ERROR: thread %d should be under DynamoRIO\n", idx); + VPRINT("%d signaling sideline_ready\n", idx); + signal_cond_var(sideline_ready[idx]); + + /* Now sit in a signal-generating loop. */ + while (!sideline_exit) { + /* We generate 4 different signals to test different types. */ + if (SIGSETJMP(mark) == 0) { + *(int *)arg = 42; /* SIGSEGV */ + } + if (SIGSETJMP(mark) == 0) { + pthread_kill(pthread_self(), SIGBUS); + } + if (SIGSETJMP(mark) == 0) { + pthread_kill(pthread_self(), SIGURG); + } + if (SIGSETJMP(mark) == 0) { + pthread_kill(pthread_self(), SIGALRM); + } + } + + return THREAD_FUNC_RETURN_ZERO; +} + +int +main(void) +{ + int i; + thread_t thread[NUM_THREADS]; /* On Linux, the tid. */ + + intercept_signal(SIGSEGV, (handler_3_t)&handle_signal, false); + intercept_signal(SIGBUS, (handler_3_t)&handle_signal, false); + intercept_signal(SIGURG, (handler_3_t)&handle_signal, false); + intercept_signal(SIGALRM, (handler_3_t)&handle_signal, false); + + sideline_continue = create_cond_var(); + for (i = 0; i < NUM_THREADS; i++) { + sideline_ready[i] = create_cond_var(); + thread[i] = create_thread(sideline_spinner, (void *)(uintptr_t)i); + } + + /* Initialize DR. 
*/ + dr_app_setup(); + + /* Wait for all the threads to be scheduled. */ + VPRINT("waiting for ready\n"); + for (i = 0; i < NUM_THREADS; i++) { + wait_cond_var(sideline_ready[i]); + reset_cond_var(sideline_ready[i]); + } + /* Now get each thread to start its signal loop. */ + dr_app_start(); + VPRINT("signaling continue\n"); + signal_cond_var(sideline_continue); + VPRINT("waiting for ready\n"); + for (i = 0; i < NUM_THREADS; i++) { + wait_cond_var(sideline_ready[i]); + reset_cond_var(sideline_ready[i]); + } + reset_cond_var(sideline_continue); + + /* Detach */ + int pre_count = count.load(std::memory_order_acquire); + print("signal count pre-detach: %d\n", pre_count); + print("detaching\n"); + /* We use the _with_stats variant to catch register errors such as i#4457. */ + dr_stats_t stats = { sizeof(dr_stats_t) }; + dr_app_stop_and_cleanup_with_stats(&stats); + int post_count = count.load(std::memory_order_acquire); + assert(post_count > pre_count); + print("signal count post-detach: %d\n", count.load(std::memory_order_acquire)); + assert(stats.basic_block_count > 0); + print("native signals delivered: %ld\n", stats.num_native_signals); + assert(stats.num_native_signals > 0); + + sideline_exit = true; + for (i = 0; i < NUM_THREADS; i++) { + join_thread(thread[i]); + } + + destroy_cond_var(sideline_continue); + for (i = 0; i < NUM_THREADS; i++) + destroy_cond_var(sideline_ready[i]); + + print("All done\n"); + return 0; +} diff --git a/suite/tests/api/detach_signal.templatex b/suite/tests/api/detach_signal.templatex new file mode 100644 index 000000000..91796a5af --- /dev/null +++ b/suite/tests/api/detach_signal.templatex @@ -0,0 +1,5 @@ +signal count pre-detach: .* +detaching +signal count post-detach: .* +native signals delivered: .* +All done diff --git a/suite/tests/tools.h b/suite/tests/tools.h index 205ccf9ff..32d14bf1e 100644 --- a/suite/tests/tools.h +++ b/suite/tests/tools.h @@ -276,7 +276,7 @@ typedef enum { #else /* FIXME: varargs for windows...for now 
since we don't care about efficiency we do this: */ -static void +static inline void VERBOSE_PRINT(const char *fmt, ...) { } @@ -444,13 +444,13 @@ size(Code_Snippet func) return ret_val; } -static int +static inline int test(void *foo, int val) { return (*(int (*)(int))foo)(val); } -static int +static inline int call(Code_Snippet func, int val) { switch (func) { @@ -515,7 +515,7 @@ copy_to_buf_cross_page(char *buf, size_t buf_len, size_t *copied_len, Code_Snipp return copy_to_buf_normal(buf, buf_len, copied_len, func); } -static char * +static inline char * copy_to_buf(char *buf, size_t buf_len, size_t *copied_len, Code_Snippet func, Copy_Mode mode) { @@ -659,7 +659,7 @@ protect_mem_check(void *start, size_t len, int prot, int expected); void * reserve_memory(int size); -static void +static inline void test_print(void *buf, int n) { print("%d\n", test(buf, n)); -- GitLab From 03fbe0e87f4fbe2562f0d20fd84438e23b0861ad Mon Sep 17 00:00:00 2001 From: Derek Bruening <bruening@google.com> Date: Wed, 23 Dec 2020 13:03:36 -0500 Subject: [PATCH 2/2] Fix atomic 1byte assert to look at the target type; add missing UNIX ifdefs for stat and lock rank --- core/arch/arch_exports.h | 8 ++++---- core/utils.c | 11 ++++++++--- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/core/arch/arch_exports.h b/core/arch/arch_exports.h index 4ed83e4c3..eb44753b4 100644 --- a/core/arch/arch_exports.h +++ b/core/arch/arch_exports.h @@ -333,7 +333,7 @@ emit_detach_callback_final_jmp(dcontext_t *dcontext, } while (0) # define ATOMIC_1BYTE_WRITE(target, value, hot_patch) \ do { \ - ASSERT(sizeof(value) == 1); \ + ASSERT(sizeof(*target) == 1); \ /* No alignment check necessary, hot_patch parameter provided for \ * consistency. 
\ */ \ @@ -497,7 +497,7 @@ atomic_add_exchange_int64(volatile int64 *var, int64 value) do { \ /* allow a constant to be passed in by supplying our own lvalue */ \ char _myval = value; \ - ASSERT(sizeof(value) == 1); \ + ASSERT(sizeof(*target) == 1); \ /* No alignment check necessary, hot_patch parameter provided for \ * consistency. \ */ \ @@ -665,7 +665,7 @@ atomic_add_exchange_int64(volatile int64 *var, int64 value) } while (0) # define ATOMIC_1BYTE_WRITE(target, value, hot_patch) \ do { \ - ASSERT(sizeof(value) == 1); \ + ASSERT(sizeof(*target) == 1); \ /* Not currently used to write code */ \ ASSERT_CURIOSITY(!hot_patch); \ __asm__ __volatile__("stlrb %w0, [%1]" \ @@ -857,7 +857,7 @@ atomic_dec_becomes_zero(volatile int *var) } while (0) # define ATOMIC_1BYTE_WRITE(target, value, hot_patch) \ do { \ - ASSERT(sizeof(value) == 1); \ + ASSERT(sizeof(*target) == 1); \ __asm__ __volatile__("dmb ish; strb %0, [%1]" \ : \ : "r"(value), "r"(target) \ diff --git a/core/utils.c b/core/utils.c index 93268416c..1f9c2d455 100644 --- a/core/utils.c +++ b/core/utils.c @@ -414,9 +414,10 @@ locks_not_closed() cur_lock->rank == LOCK_RANK(report_buf_lock) || cur_lock->rank == LOCK_RANK(datasec_selfprot_lock) || cur_lock->rank == LOCK_RANK(logdir_mutex) || - cur_lock->rank == LOCK_RANK(options_lock) || - /* This lock can be used parallel to detach cleanup. */ - cur_lock->rank == LOCK_RANK(detached_sigact_lock))) { + cur_lock->rank == + LOCK_RANK(options_lock) + /* This lock can be used parallel to detach cleanup. */ + IF_UNIX(|| cur_lock->rank == LOCK_RANK(detached_sigact_lock)))) { /* i#1058: curiosities during exit re-acquire these locks. 
*/ ignored++; } else { @@ -4629,7 +4630,11 @@ stats_get_snapshot(dr_stats_t *drstats) GLOBAL_STAT(peak_vmm_blocks_reach_special_mmap); } if (drstats->size > offsetof(dr_stats_t, num_native_signals)) { +#ifdef UNIX drstats->num_native_signals = GLOBAL_STAT(num_native_signals); +#else + drstats->num_native_signals = 0; +#endif } return true; } -- GitLab