Merging upstream version 0.7.1 (Closes: #991419).

Signed-off-by: Daniel Baumann <daniel@debian.org>
Daniel Baumann 2025-02-09 07:39:31 +01:00
parent 05c588e9d7
commit 9e09e0ef69
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
99 changed files with 6727 additions and 943 deletions


@ -11,6 +11,7 @@ OBJECTS=ck_barrier_centralized.o \
ck_barrier_dissemination.o \
ck_barrier_tournament.o \
ck_barrier_mcs.o \
ck_ec.o \
ck_epoch.o \
ck_ht.o \
ck_hp.o \
@ -24,11 +25,14 @@ libck.so: $(OBJECTS)
$(LD) $(LDFLAGS) -o $(TARGET_DIR)/libck.so $(OBJECTS)
libck.a: $(OBJECTS)
ar rcs $(TARGET_DIR)/libck.a $(OBJECTS)
$(AR) rcs $(TARGET_DIR)/libck.a $(OBJECTS)
ck_array.o: $(INCLUDE_DIR)/ck_array.h $(SDIR)/ck_array.c
$(CC) $(CFLAGS) -c -o $(TARGET_DIR)/ck_array.o $(SDIR)/ck_array.c
ck_ec.o: $(INCLUDE_DIR)/ck_ec.h $(SDIR)/ck_ec.c $(SDIR)/ck_ec_timeutil.h
$(CC) $(CFLAGS) -c -o $(TARGET_DIR)/ck_ec.o $(SDIR)/ck_ec.c
ck_epoch.o: $(INCLUDE_DIR)/ck_epoch.h $(SDIR)/ck_epoch.c $(INCLUDE_DIR)/ck_stack.h
$(CC) $(CFLAGS) -c -o $(TARGET_DIR)/ck_epoch.o $(SDIR)/ck_epoch.c


@ -35,7 +35,7 @@ struct ck_barrier_combining_queue {
struct ck_barrier_combining_group *tail;
};
CK_CC_INLINE static struct ck_barrier_combining_group *
static struct ck_barrier_combining_group *
ck_barrier_combining_queue_dequeue(struct ck_barrier_combining_queue *queue)
{
struct ck_barrier_combining_group *front = NULL;
@ -48,7 +48,7 @@ ck_barrier_combining_queue_dequeue(struct ck_barrier_combining_queue *queue)
return front;
}
CK_CC_INLINE static void
static void
ck_barrier_combining_insert(struct ck_barrier_combining_group *parent,
struct ck_barrier_combining_group *tnode,
struct ck_barrier_combining_group **child)
@ -72,7 +72,7 @@ ck_barrier_combining_insert(struct ck_barrier_combining_group *parent,
* into the barrier's tree. We use a queue to implement this
* traversal.
*/
CK_CC_INLINE static void
static void
ck_barrier_combining_queue_enqueue(struct ck_barrier_combining_queue *queue,
struct ck_barrier_combining_group *node_value)
{
@ -185,10 +185,10 @@ ck_barrier_combining_aux(struct ck_barrier_combining *barrier,
ck_pr_fence_store();
ck_pr_store_uint(&tnode->sense, ~tnode->sense);
} else {
ck_pr_fence_memory();
while (sense != ck_pr_load_uint(&tnode->sense))
ck_pr_stall();
}
ck_pr_fence_memory();
return;
}
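
The hunk above moves ck_pr_fence_memory() out of the waiter-only branch so that both the releaser (which flips the sense word) and the spinning waiters order the barrier exit against their subsequent accesses. A minimal sketch of the sense-reversal wait this protects, using only ck_pr primitives that already appear in the diff; the function and variable names are illustrative, not upstream code:

#include <stdbool.h>

#include <ck_pr.h>

/*
 * Sense-reversal exit: the last arriver publishes this round's sense,
 * everyone else spins until the shared word matches it.
 */
static void
sense_barrier_exit(unsigned int *shared_sense, unsigned int my_sense,
    bool last_arriver)
{

    if (last_arriver == true) {
        /* Publish prior writes, then release the waiters. */
        ck_pr_fence_store();
        ck_pr_store_uint(shared_sense, my_sense);
    } else {
        while (my_sense != ck_pr_load_uint(shared_sense))
            ck_pr_stall();
    }

    /*
     * As in the change above, the fence sits after the branch so the
     * releaser and the waiters alike order the barrier exit against
     * whatever they touch next.
     */
    ck_pr_fence_memory();
    return;
}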

src/ck_ec.c (new file, 425 lines)

@ -0,0 +1,425 @@
#include <ck_ec.h>
#include <ck_limits.h>
#include "ck_ec_timeutil.h"
#define DEFAULT_BUSY_LOOP_ITER 100U
/*
* The 2 ms, 8x/iter default parameters hit 1.024 seconds
* (2 ms * 8^3) after 3 iterations.
*/
#define DEFAULT_INITIAL_WAIT_NS 2000000L /* Start at 2 ms */
/* Grow the wait time 8x/iteration. */
#define DEFAULT_WAIT_SCALE_FACTOR 8
#define DEFAULT_WAIT_SHIFT_COUNT 0
struct ck_ec32_slow_path_state {
struct ck_ec32 *ec;
uint32_t flagged_word;
};
#ifdef CK_F_EC64
struct ck_ec64_slow_path_state {
struct ck_ec64 *ec;
uint64_t flagged_word;
};
#endif
/* Once we've waited for >= 1 sec, go for the full deadline. */
static const struct timespec final_wait_time = {
.tv_sec = 1
};
void
ck_ec32_wake(struct ck_ec32 *ec, const struct ck_ec_ops *ops)
{
/* Spurious wake-ups are OK. Clear the flag before futexing. */
ck_pr_and_32(&ec->counter, (1U << 31) - 1);
ops->wake32(ops, &ec->counter);
return;
}
int
ck_ec32_wait_slow(struct ck_ec32 *ec,
const struct ck_ec_ops *ops,
uint32_t old_value,
const struct timespec *deadline)
{
return ck_ec32_wait_pred_slow(ec, ops, old_value,
NULL, NULL, deadline);
}
#ifdef CK_F_EC64
void
ck_ec64_wake(struct ck_ec64 *ec, const struct ck_ec_ops *ops)
{
ck_pr_and_64(&ec->counter, ~1);
ops->wake64(ops, &ec->counter);
return;
}
int
ck_ec64_wait_slow(struct ck_ec64 *ec,
const struct ck_ec_ops *ops,
uint64_t old_value,
const struct timespec *deadline)
{
return ck_ec64_wait_pred_slow(ec, ops, old_value,
NULL, NULL, deadline);
}
#endif
int
ck_ec_deadline_impl(struct timespec *new_deadline,
const struct ck_ec_ops *ops,
const struct timespec *timeout)
{
struct timespec now;
int r;
if (timeout == NULL) {
new_deadline->tv_sec = TIME_MAX;
new_deadline->tv_nsec = NSEC_MAX;
return 0;
}
r = ops->gettime(ops, &now);
if (r != 0) {
return -1;
}
*new_deadline = timespec_add(now, *timeout);
return 0;
}
/* The rest of the file implements wait_pred_slow. */
/*
* Returns a timespec value for deadline_ptr. If deadline_ptr is NULL,
* returns a timespec far in the future.
*/
static struct timespec
canonical_deadline(const struct timespec *deadline_ptr)
{
if (deadline_ptr == NULL) {
return (struct timespec) { .tv_sec = TIME_MAX };
}
return *deadline_ptr;
}
/*
* Really slow (sleeping) path for ck_ec_wait. Drives the exponential
* backoff scheme to sleep for longer and longer periods of time,
* until either the sleep function returns true (the eventcount's
* value has changed), or the predicate returns non-0 (something else
* has changed).
*
* If deadline is ever reached, returns -1 (timeout).
*
* TODO: add some form of randomisation to the intermediate timeout
* values.
*/
static int
exponential_backoff(struct ck_ec_wait_state *wait_state,
bool (*sleep)(const void *sleep_state,
const struct ck_ec_wait_state *wait_state,
const struct timespec *partial_deadline),
const void *sleep_state,
int (*pred)(const struct ck_ec_wait_state *state,
struct timespec *deadline),
const struct timespec *deadline)
{
struct timespec begin;
struct timespec stop_backoff;
const struct ck_ec_ops *ops = wait_state->ops;
const uint32_t scale_factor = (ops->wait_scale_factor != 0)
? ops->wait_scale_factor
: DEFAULT_WAIT_SCALE_FACTOR;
const uint32_t shift_count = (ops->wait_shift_count != 0)
? ops->wait_shift_count
: DEFAULT_WAIT_SHIFT_COUNT;
uint32_t wait_ns = (ops->initial_wait_ns != 0)
? ops->initial_wait_ns
: DEFAULT_INITIAL_WAIT_NS;
bool first = true;
for (;;) {
struct timespec now;
struct timespec partial_deadline;
if (check_deadline(&now, ops, *deadline) == true) {
/* Timeout. Bail out. */
return -1;
}
if (first) {
begin = now;
wait_state->start = begin;
stop_backoff = timespec_add(begin, final_wait_time);
first = false;
}
wait_state->now = now;
if (timespec_cmp(now, stop_backoff) >= 0) {
partial_deadline = *deadline;
} else {
do {
partial_deadline =
timespec_add_ns(begin, wait_ns);
wait_ns =
wait_time_scale(wait_ns,
scale_factor,
shift_count);
} while (timespec_cmp(partial_deadline, now) <= 0);
}
if (pred != NULL) {
int r = pred(wait_state, &partial_deadline);
if (r != 0) {
return r;
}
}
/* Canonicalize deadlines in the far future to NULL. */
if (sleep(sleep_state, wait_state,
((partial_deadline.tv_sec == TIME_MAX)
? NULL : &partial_deadline)) == true) {
return 0;
}
}
}
/*
* Loops up to BUSY_LOOP_ITER times, or until ec's counter value
* (including the flag) differs from old_value.
*
* Returns the new value in ec.
*/
#define DEF_WAIT_EASY(W) \
static uint##W##_t ck_ec##W##_wait_easy(struct ck_ec##W* ec, \
const struct ck_ec_ops *ops, \
uint##W##_t expected) \
{ \
uint##W##_t current = ck_pr_load_##W(&ec->counter); \
size_t n = (ops->busy_loop_iter != 0) \
? ops->busy_loop_iter \
: DEFAULT_BUSY_LOOP_ITER; \
\
for (size_t i = 0; \
i < n && current == expected; \
i++) { \
ck_pr_stall(); \
current = ck_pr_load_##W(&ec->counter); \
} \
\
return current; \
}
DEF_WAIT_EASY(32)
#ifdef CK_F_EC64
DEF_WAIT_EASY(64)
#endif
#undef DEF_WAIT_EASY
/*
* Attempts to upgrade ec->counter from unflagged to flagged.
*
* Returns true if the event count has changed. Otherwise, ec's
* counter word is equal to flagged on return, or has been at some
* time before the return.
*/
#define DEF_UPGRADE(W) \
static bool ck_ec##W##_upgrade(struct ck_ec##W* ec, \
uint##W##_t current, \
uint##W##_t unflagged, \
uint##W##_t flagged) \
{ \
uint##W##_t old_word; \
\
if (current == flagged) { \
/* Nothing to do, no change. */ \
return false; \
} \
\
if (current != unflagged) { \
/* We have a different counter value! */ \
return true; \
} \
\
/* \
* Flag the counter value. The CAS only fails if the \
* counter is already flagged, or has a new value. \
*/ \
return (ck_pr_cas_##W##_value(&ec->counter, \
unflagged, flagged, \
&old_word) == false && \
old_word != flagged); \
}
DEF_UPGRADE(32)
#ifdef CK_F_EC64
DEF_UPGRADE(64)
#endif
#undef DEF_UPGRADE
/*
* Blocks until partial_deadline on the ck_ec. Returns true if the
* eventcount's value has changed. If partial_deadline is NULL, wait
* forever.
*/
static bool
ck_ec32_wait_slow_once(const void *vstate,
const struct ck_ec_wait_state *wait_state,
const struct timespec *partial_deadline)
{
const struct ck_ec32_slow_path_state *state = vstate;
const struct ck_ec32 *ec = state->ec;
const uint32_t flagged_word = state->flagged_word;
wait_state->ops->wait32(wait_state, &ec->counter,
flagged_word, partial_deadline);
return ck_pr_load_32(&ec->counter) != flagged_word;
}
#ifdef CK_F_EC64
static bool
ck_ec64_wait_slow_once(const void *vstate,
const struct ck_ec_wait_state *wait_state,
const struct timespec *partial_deadline)
{
const struct ck_ec64_slow_path_state *state = vstate;
const struct ck_ec64 *ec = state->ec;
const uint64_t flagged_word = state->flagged_word;
/* futex_wait will only compare the low 32 bits. Perform a
* full comparison here to maximise the chances of catching an
* ABA in the low 32 bits.
*/
if (ck_pr_load_64(&ec->counter) != flagged_word) {
return true;
}
wait_state->ops->wait64(wait_state, &ec->counter,
flagged_word, partial_deadline);
return ck_pr_load_64(&ec->counter) != flagged_word;
}
#endif
/*
* The full wait logic is a lot of code (> 1KB). Encourage the
* compiler to lay this all out linearly with LIKELY annotations on
* every early exit.
*/
#define WAIT_SLOW_BODY(W, ec, ops, pred, data, deadline_ptr, \
old_value, unflagged, flagged) \
do { \
struct ck_ec_wait_state wait_state = { \
.ops = ops, \
.data = data \
}; \
const struct ck_ec##W##_slow_path_state state = { \
.ec = ec, \
.flagged_word = flagged \
}; \
const struct timespec deadline = \
canonical_deadline(deadline_ptr); \
\
/* Detect infinite past deadlines. */ \
if (CK_CC_LIKELY(deadline.tv_sec <= 0)) { \
return -1; \
} \
\
for (;;) { \
uint##W##_t current; \
int r; \
\
current = ck_ec##W##_wait_easy(ec, ops, unflagged); \
\
/* \
* We're about to wait harder (i.e., \
* potentially with futex). Make sure the \
* counter word is flagged. \
*/ \
if (CK_CC_LIKELY( \
ck_ec##W##_upgrade(ec, current, \
unflagged, flagged) == true)) { \
ck_pr_fence_acquire(); \
return 0; \
} \
\
/* \
* By now, ec->counter == flagged_word (at \
* some point in the past). Spin some more to \
* heuristically let any in-flight SP inc/add \
* to retire. This does not affect \
* correctness, but practically eliminates \
* lost wake-ups. \
*/ \
current = ck_ec##W##_wait_easy(ec, ops, flagged); \
if (CK_CC_LIKELY(current != flagged_word)) { \
ck_pr_fence_acquire(); \
return 0; \
} \
\
r = exponential_backoff(&wait_state, \
ck_ec##W##_wait_slow_once, \
&state, \
pred, &deadline); \
if (r != 0) { \
return r; \
} \
\
if (ck_ec##W##_value(ec) != old_value) { \
ck_pr_fence_acquire(); \
return 0; \
} \
\
/* Spurious wake-up. Redo the slow path. */ \
} \
} while (0)
int
ck_ec32_wait_pred_slow(struct ck_ec32 *ec,
const struct ck_ec_ops *ops,
uint32_t old_value,
int (*pred)(const struct ck_ec_wait_state *state,
struct timespec *deadline),
void *data,
const struct timespec *deadline_ptr)
{
const uint32_t unflagged_word = old_value;
const uint32_t flagged_word = old_value | (1UL << 31);
if (CK_CC_UNLIKELY(ck_ec32_value(ec) != old_value)) {
return 0;
}
WAIT_SLOW_BODY(32, ec, ops, pred, data, deadline_ptr,
old_value, unflagged_word, flagged_word);
}
#ifdef CK_F_EC64
int
ck_ec64_wait_pred_slow(struct ck_ec64 *ec,
const struct ck_ec_ops *ops,
uint64_t old_value,
int (*pred)(const struct ck_ec_wait_state *state,
struct timespec *deadline),
void *data,
const struct timespec *deadline_ptr)
{
const uint64_t unflagged_word = old_value << 1;
const uint64_t flagged_word = unflagged_word | 1;
if (CK_CC_UNLIKELY(ck_ec64_value(ec) != old_value)) {
return 0;
}
WAIT_SLOW_BODY(64, ec, ops, pred, data, deadline_ptr,
old_value, unflagged_word, flagged_word);
}
#endif
#undef WAIT_SLOW_BODY
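
The slow-path entry points above (ck_ec32_wait_slow(), ck_ec32_wake()) leave the actual sleeping and waking to a caller-supplied struct ck_ec_ops; this file only uses ops->gettime, ops->wait32/wait64, ops->wake32/wake64 and the four tuning fields. Below is a minimal, hypothetical Linux sketch of such an ops table backed by clock_gettime() and the futex(2) system call, wired directly to the slow-path functions in this file. The exact callback prototypes beyond what the calls above imply, the futex flags, and the fast-path initialisation/increment side (which lives in ck_ec.h, not in this diff) are assumptions, not something this diff specifies.

#include <ck_ec.h>

#include <limits.h>
#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

/* Deadline source: monotonic clock, as the backoff code above expects. */
static int
my_gettime(const struct ck_ec_ops *ops, struct timespec *out)
{
    (void)ops;
    return clock_gettime(CLOCK_MONOTONIC, out);
}

/* Wake every sleeper parked on the counter word. */
static void
my_wake32(const struct ck_ec_ops *ops, const uint32_t *address)
{
    (void)ops;
    syscall(SYS_futex, address, FUTEX_WAKE_PRIVATE, INT_MAX,
        NULL, NULL, 0);
    return;
}

/*
 * Sleep until the counter word changes from `expected` or the deadline
 * passes. FUTEX_WAIT_BITSET takes an absolute CLOCK_MONOTONIC timeout,
 * which matches the absolute partial deadlines computed above; a NULL
 * deadline means wait forever.
 */
static void
my_wait32(const struct ck_ec_wait_state *state, const uint32_t *address,
    uint32_t expected, const struct timespec *deadline)
{
    (void)state;
    syscall(SYS_futex, address, FUTEX_WAIT_BITSET_PRIVATE, expected,
        deadline, NULL, FUTEX_BITSET_MATCH_ANY);
    return;
}

static const struct ck_ec_ops my_ops = {
    .gettime = my_gettime,
    .wait32 = my_wait32,
    .wake32 = my_wake32
    /* busy_loop_iter, initial_wait_ns, wait_scale_factor and
     * wait_shift_count stay 0 to select the defaults above. */
};

/* Waiter: block until the counter has moved past `old`, or time out. */
static int
wait_for_update(struct ck_ec32 *ec, uint32_t old,
    const struct timespec *deadline)
{
    return ck_ec32_wait_slow(ec, &my_ops, old, deadline);
}

/* Producer: after publishing an update, wake any sleepers. */
static void
notify(struct ck_ec32 *ec)
{
    ck_ec32_wake(ec, &my_ops);
    return;
}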

src/ck_ec_timeutil.h (new file, 150 lines)

@ -0,0 +1,150 @@
#ifndef CK_EC_TIMEUTIL_H
#define CK_EC_TIMEUTIL_H
#include <ck_cc.h>
#include <ck_ec.h>
#include <ck_limits.h>
#include <ck_stdint.h>
#include <sys/time.h>
#define TIME_MAX ((time_t)((1ULL << ((sizeof(time_t) * CHAR_BIT) - 1)) - 1))
#define NSEC_MAX ((1000L * 1000 * 1000) - 1)
/*
* Approximates (nsec * multiplier) >> shift. Clamps to UINT32_MAX on
* overflow.
*/
CK_CC_UNUSED static uint32_t
wait_time_scale(uint32_t nsec,
uint32_t multiplier,
unsigned int shift)
{
uint64_t temp = (uint64_t)nsec * multiplier;
uint64_t max = (uint64_t)UINT32_MAX << shift;
if (temp >= max) {
return UINT32_MAX;
}
return temp >> shift;
}
/*
* Returns ts + ns. ns is clamped to at most 1 second. Clamps the
* return value to TIME_MAX, NSEC_MAX on overflow.
*
*/
CK_CC_UNUSED static struct timespec timespec_add_ns(const struct timespec ts,
uint32_t ns)
{
struct timespec ret = {
.tv_sec = TIME_MAX,
.tv_nsec = NSEC_MAX
};
time_t sec;
uint32_t sum_ns;
if (ns > (uint32_t)NSEC_MAX) {
if (ts.tv_sec >= TIME_MAX) {
return ret;
}
ret.tv_sec = ts.tv_sec + 1;
ret.tv_nsec = ts.tv_nsec;
return ret;
}
sec = ts.tv_sec;
sum_ns = ns + ts.tv_nsec;
if (sum_ns > NSEC_MAX) {
if (sec >= TIME_MAX) {
return ret;
}
sec++;
sum_ns -= (NSEC_MAX + 1);
}
ret.tv_sec = sec;
ret.tv_nsec = sum_ns;
return ret;
}
/*
* Returns ts + inc. If inc is negative, it is normalized to 0.
* Clamps the return value to TIME_MAX, NSEC_MAX on overflow.
*/
CK_CC_UNUSED static struct timespec timespec_add(const struct timespec ts,
const struct timespec inc)
{
/* Initial return value is clamped to infinite future. */
struct timespec ret = {
.tv_sec = TIME_MAX,
.tv_nsec = NSEC_MAX
};
time_t sec;
unsigned long nsec;
/* Non-positive delta is a no-op. Invalid nsec is another no-op. */
if (inc.tv_sec < 0 || inc.tv_nsec < 0 || inc.tv_nsec > NSEC_MAX) {
return ts;
}
/* Detect overflow early. */
if (inc.tv_sec > TIME_MAX - ts.tv_sec) {
return ret;
}
sec = ts.tv_sec + inc.tv_sec;
/* This sum can't overflow if the inputs are valid. */
nsec = (unsigned long)ts.tv_nsec + inc.tv_nsec;
if (nsec > NSEC_MAX) {
if (sec >= TIME_MAX) {
return ret;
}
sec++;
nsec -= (NSEC_MAX + 1);
}
ret.tv_sec = sec;
ret.tv_nsec = nsec;
return ret;
}
/* Compares two timespecs. Returns -1 if x < y, 0 if x == y, and 1 if x > y. */
CK_CC_UNUSED static int timespec_cmp(const struct timespec x,
const struct timespec y)
{
if (x.tv_sec != y.tv_sec) {
return (x.tv_sec < y.tv_sec) ? -1 : 1;
}
if (x.tv_nsec != y.tv_nsec) {
return (x.tv_nsec < y.tv_nsec) ? -1 : 1;
}
return 0;
}
/*
* Overwrites now with the current CLOCK_MONOTONIC time, and returns
* true if the current time is greater than or equal to the deadline,
* or the clock is somehow broken.
*/
CK_CC_UNUSED static bool check_deadline(struct timespec *now,
const struct ck_ec_ops *ops,
const struct timespec deadline)
{
int r;
r = ops->gettime(ops, now);
if (r != 0) {
return true;
}
return timespec_cmp(*now, deadline) >= 0;
}
#endif /* !CK_EC_TIMEUTIL_H */
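
The helpers in this header are pure functions, so their saturation behaviour is easy to check in isolation. A small hypothetical test (assuming this internal header and ck's include directory are both on the include path) that walks the default 2 ms / 8x backoff schedule from ck_ec.c and exercises the nanosecond carry in timespec_add_ns():

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#include "ck_ec_timeutil.h"

int
main(void)
{
    /* 2 ms scaled by 8 per step: 2 ms, 16 ms, 128 ms, 1.024 s. */
    uint32_t wait_ns = 2000000;
    for (int i = 0; i < 3; i++)
        wait_ns = wait_time_scale(wait_ns, 8, 0);
    assert(wait_ns == 1024000000U);

    /* Scaling saturates instead of overflowing the 32-bit range. */
    assert(wait_time_scale(UINT32_MAX, 8, 0) == UINT32_MAX);

    /* Nanosecond carry: 0.9 s + 200 ms lands at 1.1 s. */
    struct timespec ts = { .tv_sec = 0, .tv_nsec = 900 * 1000 * 1000 };
    struct timespec sum = timespec_add_ns(ts, 200 * 1000 * 1000);
    assert(sum.tv_sec == 1 && sum.tv_nsec == 100 * 1000 * 1000);

    puts("ok");
    return 0;
}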


@ -127,6 +127,14 @@
*/
#define CK_EPOCH_GRACE 3U
/*
* CK_EPOCH_LENGTH must be a power of 2 (because (CK_EPOCH_LENGTH - 1) is used
* as a mask), and it must be at least 3 (see comments above).
*/
#if (CK_EPOCH_LENGTH < 3 || (CK_EPOCH_LENGTH & (CK_EPOCH_LENGTH - 1)) != 0)
#error "CK_EPOCH_LENGTH must be a power of 2 and >= 3"
#endif
enum {
CK_EPOCH_STATE_USED = 0,
CK_EPOCH_STATE_FREE = 1
@ -139,7 +147,7 @@ CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry,
#define CK_EPOCH_SENSE_MASK (CK_EPOCH_SENSE - 1)
void
bool
_ck_epoch_delref(struct ck_epoch_record *record,
struct ck_epoch_section *section)
{
@ -150,7 +158,7 @@ _ck_epoch_delref(struct ck_epoch_record *record,
current->count--;
if (current->count > 0)
return;
return false;
/*
* If the current bucket no longer has any references, then
@ -161,8 +169,7 @@ _ck_epoch_delref(struct ck_epoch_record *record,
* If no other active bucket exists, then the record will go
* inactive in order to allow for forward progress.
*/
other = &record->local.bucket[(i + 1) &
CK_EPOCH_SENSE_MASK];
other = &record->local.bucket[(i + 1) & CK_EPOCH_SENSE_MASK];
if (other->count > 0 &&
((int)(current->epoch - other->epoch) < 0)) {
/*
@ -172,7 +179,7 @@ _ck_epoch_delref(struct ck_epoch_record *record,
ck_pr_store_uint(&record->epoch, other->epoch);
}
return;
return true;
}
void
@ -230,7 +237,7 @@ ck_epoch_init(struct ck_epoch *global)
}
struct ck_epoch_record *
ck_epoch_recycle(struct ck_epoch *global)
ck_epoch_recycle(struct ck_epoch *global, void *ct)
{
struct ck_epoch_record *record;
ck_stack_entry_t *cursor;
@ -249,6 +256,12 @@ ck_epoch_recycle(struct ck_epoch *global)
CK_EPOCH_STATE_USED);
if (state == CK_EPOCH_STATE_FREE) {
ck_pr_dec_uint(&global->n_free);
ck_pr_store_ptr(&record->ct, ct);
/*
* The context pointer is ordered by a
* subsequent protected section.
*/
return record;
}
}
@ -258,7 +271,8 @@ ck_epoch_recycle(struct ck_epoch *global)
}
void
ck_epoch_register(struct ck_epoch *global, struct ck_epoch_record *record)
ck_epoch_register(struct ck_epoch *global, struct ck_epoch_record *record,
void *ct)
{
size_t i;
@ -269,6 +283,7 @@ ck_epoch_register(struct ck_epoch *global, struct ck_epoch_record *record)
record->n_dispatch = 0;
record->n_peak = 0;
record->n_pending = 0;
record->ct = ct;
memset(&record->local, 0, sizeof record->local);
for (i = 0; i < CK_EPOCH_LENGTH; i++)
@ -295,6 +310,7 @@ ck_epoch_unregister(struct ck_epoch_record *record)
for (i = 0; i < CK_EPOCH_LENGTH; i++)
ck_stack_init(&record->pending[i]);
ck_pr_store_ptr(&record->ct, NULL);
ck_pr_fence_store();
ck_pr_store_uint(&record->state, CK_EPOCH_STATE_FREE);
ck_pr_inc_uint(&global->n_free);
@ -340,31 +356,41 @@ ck_epoch_scan(struct ck_epoch *global,
return NULL;
}
static void
ck_epoch_dispatch(struct ck_epoch_record *record, unsigned int e)
static unsigned int
ck_epoch_dispatch(struct ck_epoch_record *record, unsigned int e, ck_stack_t *deferred)
{
unsigned int epoch = e & (CK_EPOCH_LENGTH - 1);
ck_stack_entry_t *head, *next, *cursor;
unsigned int n_pending, n_peak;
unsigned int i = 0;
head = CK_STACK_FIRST(&record->pending[epoch]);
ck_stack_init(&record->pending[epoch]);
head = ck_stack_batch_pop_upmc(&record->pending[epoch]);
for (cursor = head; cursor != NULL; cursor = next) {
struct ck_epoch_entry *entry =
ck_epoch_entry_container(cursor);
next = CK_STACK_NEXT(cursor);
entry->function(entry);
if (deferred != NULL)
ck_stack_push_spnc(deferred, &entry->stack_entry);
else
entry->function(entry);
i++;
}
if (record->n_pending > record->n_peak)
record->n_peak = record->n_pending;
n_peak = ck_pr_load_uint(&record->n_peak);
n_pending = ck_pr_load_uint(&record->n_pending);
record->n_dispatch += i;
record->n_pending -= i;
return;
/* We don't require accuracy around peak calculation. */
if (n_pending > n_peak)
ck_pr_store_uint(&record->n_peak, n_peak);
if (i > 0) {
ck_pr_add_uint(&record->n_dispatch, i);
ck_pr_sub_uint(&record->n_pending, i);
}
return i;
}
/*
@ -376,7 +402,18 @@ ck_epoch_reclaim(struct ck_epoch_record *record)
unsigned int epoch;
for (epoch = 0; epoch < CK_EPOCH_LENGTH; epoch++)
ck_epoch_dispatch(record, epoch);
ck_epoch_dispatch(record, epoch, NULL);
return;
}
CK_CC_FORCE_INLINE static void
epoch_block(struct ck_epoch *global, struct ck_epoch_record *cr,
ck_epoch_wait_cb_t *cb, void *ct)
{
if (cb != NULL)
cb(global, cr, ct);
return;
}
@ -385,9 +422,9 @@ ck_epoch_reclaim(struct ck_epoch_record *record)
* This function must not be called within a read section.
*/
void
ck_epoch_synchronize(struct ck_epoch_record *record)
ck_epoch_synchronize_wait(struct ck_epoch *global,
ck_epoch_wait_cb_t *cb, void *ct)
{
struct ck_epoch *global = record->global;
struct ck_epoch_record *cr;
unsigned int delta, epoch, goal, i;
bool active;
@ -424,10 +461,27 @@ ck_epoch_synchronize(struct ck_epoch_record *record)
* period.
*/
e_d = ck_pr_load_uint(&global->epoch);
if (e_d != delta) {
delta = e_d;
goto reload;
if (e_d == delta) {
epoch_block(global, cr, cb, ct);
continue;
}
/*
* If the epoch has been updated, we may have already
* met our goal.
*/
delta = e_d;
if ((goal > epoch) & (delta >= goal))
goto leave;
epoch_block(global, cr, cb, ct);
/*
* If the epoch has been updated, then a grace period
* requires that all threads are observed idle at the
* same epoch.
*/
cr = NULL;
}
/*
@ -459,20 +513,6 @@ ck_epoch_synchronize(struct ck_epoch_record *record)
* Otherwise, we have just acquired latest snapshot.
*/
delta = delta + r;
continue;
reload:
if ((goal > epoch) & (delta >= goal)) {
/*
* Right now, epoch overflow is handled as an edge
* case. If we have already observed an epoch
* generation, then we can be sure no hazardous
* references exist to objects from this generation. We
* can actually avoid an additional scan step at this
* point.
*/
break;
}
}
/*
@ -480,8 +520,16 @@ reload:
* However, if non-temporal instructions are used, full barrier
* semantics are necessary.
*/
leave:
ck_pr_fence_memory();
record->epoch = delta;
return;
}
void
ck_epoch_synchronize(struct ck_epoch_record *record)
{
ck_epoch_synchronize_wait(record->global, NULL, NULL);
return;
}
@ -494,6 +542,16 @@ ck_epoch_barrier(struct ck_epoch_record *record)
return;
}
void
ck_epoch_barrier_wait(struct ck_epoch_record *record, ck_epoch_wait_cb_t *cb,
void *ct)
{
ck_epoch_synchronize_wait(record->global, cb, ct);
ck_epoch_reclaim(record);
return;
}
/*
* It may be worth it to actually apply these deferral semantics to an epoch
* that was observed at ck_epoch_call time. The problem is that the latter
@ -505,41 +563,61 @@ ck_epoch_barrier(struct ck_epoch_record *record)
* is far from ideal too.
*/
bool
ck_epoch_poll(struct ck_epoch_record *record)
ck_epoch_poll_deferred(struct ck_epoch_record *record, ck_stack_t *deferred)
{
bool active;
unsigned int epoch;
unsigned int snapshot;
struct ck_epoch_record *cr = NULL;
struct ck_epoch *global = record->global;
unsigned int n_dispatch;
epoch = ck_pr_load_uint(&global->epoch);
/* Serialize epoch snapshots with respect to global epoch. */
ck_pr_fence_memory();
/*
* At this point, epoch is the current global epoch value.
* There may or may not be active threads which observed epoch - 1.
* (ck_epoch_scan() will tell us that). However, there should be
* no active threads which observed epoch - 2.
*
* Note that checking epoch - 2 is necessary, as race conditions can
* allow another thread to increment the global epoch before this
* thread runs.
*/
n_dispatch = ck_epoch_dispatch(record, epoch - 2, deferred);
cr = ck_epoch_scan(global, cr, epoch, &active);
if (cr != NULL) {
record->epoch = epoch;
return false;
}
if (cr != NULL)
return (n_dispatch > 0);
/* We are at a grace period if all threads are inactive. */
if (active == false) {
record->epoch = epoch;
for (epoch = 0; epoch < CK_EPOCH_LENGTH; epoch++)
ck_epoch_dispatch(record, epoch);
ck_epoch_dispatch(record, epoch, deferred);
return true;
}
/* If an active thread exists, rely on epoch observation. */
if (ck_pr_cas_uint_value(&global->epoch, epoch, epoch + 1,
&snapshot) == false) {
record->epoch = snapshot;
} else {
record->epoch = epoch + 1;
}
/*
* If an active thread exists, rely on epoch observation.
*
* All the active threads entered the epoch section during
* the current epoch. Therefore, we can now run the handlers
* for the immediately preceding epoch and attempt to
* advance the epoch if it hasn't been already.
*/
(void)ck_pr_cas_uint(&global->epoch, epoch, epoch + 1);
ck_epoch_dispatch(record, epoch + 1);
ck_epoch_dispatch(record, epoch - 1, deferred);
return true;
}
bool
ck_epoch_poll(struct ck_epoch_record *record)
{
return ck_epoch_poll_deferred(record, NULL);
}
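
ck_epoch_poll_deferred() above pushes ready entries onto a caller-owned ck_stack_t instead of running them inline, and ck_epoch_synchronize_wait()/ck_epoch_barrier_wait() accept a callback that is invoked while the grace period is blocked on another record. A minimal sketch of both patterns, assuming a record registered with the three-argument ck_epoch_register() shown in this diff and the matching declarations in this release's ck_epoch.h; the container accessor is regenerated locally with the same CK_STACK_CONTAINER macro the implementation uses, and the ck_epoch_wait_cb_t return type is assumed void, matching how epoch_block() ignores it:

#include <sched.h>

#include <ck_epoch.h>
#include <ck_stack.h>

/* Regenerate the entry <-> stack_entry accessor for use outside ck_epoch.c. */
CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry, epoch_entry_container)

/* Run the callbacks ck_epoch_poll_deferred() handed back, off the
 * poller's critical path. */
static void
drain_deferred(ck_stack_t *deferred)
{
    ck_stack_entry_t *cursor, *next;

    for (cursor = CK_STACK_FIRST(deferred); cursor != NULL; cursor = next) {
        struct ck_epoch_entry *entry = epoch_entry_container(cursor);

        next = CK_STACK_NEXT(cursor);
        entry->function(entry);
    }

    return;
}

static void
reclaim_pass(ck_epoch_record_t *record)
{
    ck_stack_t deferred;

    ck_stack_init(&deferred);
    if (ck_epoch_poll_deferred(record, &deferred) == true)
        drain_deferred(&deferred);

    return;
}

/* Wait callback matching the cb(global, cr, ct) call in epoch_block():
 * yield instead of spinning while another record holds up the grace period. */
static void
yield_cb(ck_epoch_t *global, ck_epoch_record_t *cr, void *ct)
{
    (void)global;
    (void)cr;
    (void)ct;
    sched_yield();
    return;
}

static void
blocking_reclaim(ck_epoch_record_t *record)
{
    ck_epoch_barrier_wait(record, yield_cb, NULL);
    return;
}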


@ -105,19 +105,10 @@ ck_hs_map_signal(struct ck_hs_map *map, unsigned long h)
return;
}
void
ck_hs_iterator_init(struct ck_hs_iterator *iterator)
static bool
_ck_hs_next(struct ck_hs *hs, struct ck_hs_map *map,
struct ck_hs_iterator *i, void **key)
{
iterator->cursor = NULL;
iterator->offset = 0;
return;
}
bool
ck_hs_next(struct ck_hs *hs, struct ck_hs_iterator *i, void **key)
{
struct ck_hs_map *map = hs->map;
void *value;
if (i->offset >= map->capacity)
@ -129,6 +120,8 @@ ck_hs_next(struct ck_hs *hs, struct ck_hs_iterator *i, void **key)
#ifdef CK_HS_PP
if (hs->mode & CK_HS_MODE_OBJECT)
value = CK_HS_VMA(value);
#else
(void)hs; /* Avoid unused parameter warning. */
#endif
i->offset++;
*key = value;
@ -139,6 +132,35 @@ ck_hs_next(struct ck_hs *hs, struct ck_hs_iterator *i, void **key)
return false;
}
void
ck_hs_iterator_init(struct ck_hs_iterator *iterator)
{
iterator->cursor = NULL;
iterator->offset = 0;
iterator->map = NULL;
return;
}
bool
ck_hs_next(struct ck_hs *hs, struct ck_hs_iterator *i, void **key)
{
return _ck_hs_next(hs, hs->map, i, key);
}
bool
ck_hs_next_spmc(struct ck_hs *hs, struct ck_hs_iterator *i, void **key)
{
struct ck_hs_map *m = i->map;
if (m == NULL) {
m = i->map = ck_pr_load_ptr(&hs->map);
}
return _ck_hs_next(hs, m, i, key);
}
void
ck_hs_stat(struct ck_hs *hs, struct ck_hs_stat *st)
{
@ -206,7 +228,7 @@ ck_hs_map_create(struct ck_hs *hs, unsigned long entries)
map->probe_limit = (unsigned int)limit;
map->probe_maximum = 0;
map->capacity = n_entries;
map->step = ck_internal_bsf(n_entries);
map->step = ck_cc_ffsl(n_entries);
map->mask = n_entries - 1;
map->n_entries = 0;


@ -30,9 +30,6 @@
/*
* This implementation borrows several techniques from Josh Dybnis's
* nbds library which can be found at http://code.google.com/p/nbds
*
* This release currently only includes support for 64-bit platforms.
* We can address 32-bit platforms in a future release.
*/
#include <ck_cc.h>
#include <ck_md.h>
@ -171,7 +168,7 @@ ck_ht_map_create(struct ck_ht *table, CK_HT_TYPE entries)
map->deletions = 0;
map->probe_maximum = 0;
map->capacity = n_entries;
map->step = ck_internal_bsf_64(map->capacity);
map->step = ck_cc_ffsll(map->capacity);
map->mask = map->capacity - 1;
map->n_entries = 0;
map->entries = (struct ck_ht_entry *)(((uintptr_t)&map[1] + prefix +


@ -88,7 +88,15 @@ static inline uint64_t rotl64 ( uint64_t x, int8_t r )
FORCE_INLINE static uint32_t getblock ( const uint32_t * p, int i )
{
#ifdef __s390x__
uint32_t res;
__asm__ (" lrv %0,%1\n"
: "=r" (res) : "Q" (p[i]) : "cc", "mem");
return res;
#else
return p[i];
#endif /* !__s390x__ */
}
//-----------------------------------------------------------------------------
@ -147,7 +155,9 @@ static inline void MurmurHash3_x86_32 ( const void * key, int len,
switch(len & 3)
{
case 3: k1 ^= tail[2] << 16;
/* fall through */
case 2: k1 ^= tail[1] << 8;
/* fall through */
case 1: k1 ^= tail[0];
k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
};
@ -196,11 +206,17 @@ static inline uint64_t MurmurHash64A ( const void * key, int len, uint64_t seed
switch(len & 7)
{
case 7: h ^= (uint64_t)(data2[6]) << 48;
/* fall through */
case 6: h ^= (uint64_t)(data2[5]) << 40;
/* fall through */
case 5: h ^= (uint64_t)(data2[4]) << 32;
/* fall through */
case 4: h ^= (uint64_t)(data2[3]) << 24;
/* fall through */
case 3: h ^= (uint64_t)(data2[2]) << 16;
/* fall through */
case 2: h ^= (uint64_t)(data2[1]) << 8;
/* fall through */
case 1: h ^= (uint64_t)(data2[0]);
h *= m;
};
@ -249,7 +265,9 @@ static inline uint64_t MurmurHash64B ( const void * key, int len, uint64_t seed
switch(len)
{
case 3: h2 ^= ((const unsigned char*)data)[2] << 16;
/* fall through */
case 2: h2 ^= ((const unsigned char*)data)[1] << 8;
/* fall through */
case 1: h2 ^= ((const unsigned char*)data)[0];
h2 *= m;
};
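
The s390x branch added to getblock() above exists because MurmurHash consumes its input as little-endian 32-bit words; lrv (load reversed) byte-swaps each word on that big-endian target. A hedged, portable formulation of the same idea (not what this diff does, just what the asm achieves) keys off the compiler's endianness macro instead:

#include <stdint.h>

/* Fetch the i-th 32-bit block and present it little-endian regardless
 * of host byte order (portable equivalent of the s390x asm above). */
static inline uint32_t
getblock_le(const uint32_t *p, int i)
{
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    return __builtin_bswap32(p[i]);
#else
    return p[i];
#endif
}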


@ -80,40 +80,3 @@ ck_internal_max_32(uint32_t x, uint32_t y)
return x ^ ((x ^ y) & -(x < y));
}
CK_CC_INLINE static unsigned long
ck_internal_bsf(unsigned long v)
{
#if defined(__GNUC__)
return __builtin_ffs(v);
#else
unsigned int i;
const unsigned int s = sizeof(unsigned long) * 8 - 1;
for (i = 0; i < s; i++) {
if (v & (1UL << (s - i)))
return sizeof(unsigned long) * 8 - i;
}
return 1;
#endif /* !__GNUC__ */
}
CK_CC_INLINE static uint64_t
ck_internal_bsf_64(uint64_t v)
{
#if defined(__GNUC__)
return __builtin_ffs(v);
#else
unsigned int i;
const unsigned int s = sizeof(unsigned long) * 8 - 1;
for (i = 0; i < s; i++) {
if (v & (1ULL << (63U - i)))
return i;
}
#endif /* !__GNUC__ */
return 1;
}
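
The removed ck_internal_bsf()/ck_internal_bsf_64() helpers are replaced at their call sites (ck_hs, ck_rhs, ck_ht above) by ck_cc_ffsl()/ck_cc_ffsll() from ck_cc.h, which follow the usual ffs() convention: 0 for a zero argument, otherwise the 1-based index of the least-significant set bit. For the power-of-two capacities those call sites pass, that is log2(capacity) + 1, which is what map->step stores. A small hypothetical check of that convention, assuming ck's include directory is on the include path:

#include <assert.h>

#include <ck_cc.h>

int
main(void)
{
    /* ffs semantics: 1-based index of the lowest set bit, 0 for 0. */
    assert(ck_cc_ffsl(0) == 0);
    assert(ck_cc_ffsl(1) == 1);
    assert(ck_cc_ffsl(1024) == 11);        /* log2(1024) + 1 */
    assert(ck_cc_ffsll(1ULL << 40) == 41);
    return 0;
}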


@ -366,7 +366,7 @@ ck_rhs_map_create(struct ck_rhs *hs, unsigned long entries)
map->probe_limit = (unsigned int)limit;
map->probe_maximum = 0;
map->capacity = n_entries;
map->step = ck_internal_bsf(n_entries);
map->step = ck_cc_ffsl(n_entries);
map->mask = n_entries - 1;
map->n_entries = 0;