Merging upstream version 0.7.1 (Closes: #991419).

Signed-off-by: Daniel Baumann <daniel@debian.org>
Daniel Baumann 2025-02-09 07:39:31 +01:00
parent 05c588e9d7
commit 9e09e0ef69
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
99 changed files with 6727 additions and 943 deletions

14
.cirrus.yml Normal file

@ -0,0 +1,14 @@
env:
CIRRUS_CLONE_DEPTH: 1
ARCH: amd64
task:
freebsd_instance:
matrix:
image: freebsd-12-0-release-amd64
image: freebsd-11-2-release-amd64
script:
- cc --version
- export CFLAGS="-DITERATE=400 -DPAIRS_S=100 -DITERATIONS=24"
- ./tools/ci-build.sh --cores=$(sysctl -n hw.ncpu)
- make check

29
.drone.yml Normal file

@ -0,0 +1,29 @@
kind: pipeline
name: gcc/amd64/linux
platform:
arch: amd64
steps:
- name: build
image: gcc
pull: true
commands:
- ./tools/ci-build.sh --cores=4
- make check
---
kind: pipeline
name: gcc/arm64/linux
platform:
arch: arm64
steps:
- name: build
image: gcc
pull: true
commands:
- ./tools/ci-build.sh --cores=4
- make check

71
.github/workflows/codeql-analysis.yml vendored Normal file

@ -0,0 +1,71 @@
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
name: "CodeQL"
on:
push:
branches: [master]
pull_request:
# The branches below must be a subset of the branches above
branches: [master]
schedule:
- cron: '0 11 * * 5'
jobs:
analyze:
name: Analyze
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
# Override automatic language detection by changing the below list
# Supported options are ['csharp', 'cpp', 'go', 'java', 'javascript', 'python']
language: ['cpp']
# Learn more...
# https://docs.github.com/en/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#overriding-automatic-language-detection
steps:
- name: Checkout repository
uses: actions/checkout@v2
with:
# We must fetch at least the immediate parents so that if this is
# a pull request then we can checkout the head.
fetch-depth: 2
# If this run was triggered by a pull request event, then checkout
# the head of the pull request instead of the merge commit.
- run: git checkout HEAD^2
if: ${{ github.event_name == 'pull_request' }}
# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@v1
with:
languages: ${{ matrix.language }}
# If you wish to specify custom queries, you can do so here or in a config file.
# By default, queries listed here will override any specified in a config file.
# Prefix the list here with "+" to use these queries and those in the config file.
# queries: ./path/to/local/query, your-org/your-repo/queries@main
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
# If this step fails, then you should remove it and run the build manually (see below)
- name: Autobuild
uses: github/codeql-action/autobuild@v1
# Command-line programs to run using the OS shell.
# 📚 https://git.io/JvXDl
# ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
# and modify them (or add more) to build your code if your project
# uses a compiled language
#- run: |
# make bootstrap
# make release
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v1

55
.gitignore vendored

@ -1,27 +1,25 @@
/Makefile
*.a
build/ck.build
build/ck.pc
build/regressions.build
build/ck.spec
include/ck_md.h
src/Makefile
doc/Makefile
doc/*.3
build/Makefile
build/regressions.build
doc/*.3
doc/Makefile
.DS_Store
LOG
*.log
*.html
*.gz
*.o
*.a
*.so
*.dSYM
.*.sw[op]
GPATH
GRTAGS
GTAGS
*.gz
*.html
ID
include/ck_md.h
include/freebsd/ck_md.h
*.log
LOG
/Makefile
*.o
regressions/ck_array/validate/serial
regressions/ck_backoff/validate/validate
regressions/ck_bag/validate/order
@ -37,17 +35,28 @@ regressions/ck_brlock/benchmark/throughput
regressions/ck_brlock/validate/validate
regressions/ck_bytelock/benchmark/latency
regressions/ck_bytelock/validate/validate
regressions/ck_cc/validate/ck_cc
regressions/ck_cc/validate/ck_cc_nobuiltin
regressions/ck_cohort/benchmark/ck_cohort.LATENCY
regressions/ck_cohort/benchmark/ck_cohort.THROUGHPUT
regressions/ck_cohort/validate/validate
regressions/ck_ec/benchmark/ck_ec
regressions/ck_ec/validate/ck_ec_smoke_test
regressions/ck_ec/validate/prop_test_slow_wakeup
regressions/ck_ec/validate/prop_test_timeutil_add
regressions/ck_ec/validate/prop_test_timeutil_add_ns
regressions/ck_ec/validate/prop_test_timeutil_cmp
regressions/ck_ec/validate/prop_test_timeutil_scale
regressions/ck_ec/validate/prop_test_value
regressions/ck_ec/validate/prop_test_wakeup
regressions/ck_epoch/validate/ck_epoch_call
regressions/ck_epoch/validate/ck_epoch_poll
regressions/ck_epoch/validate/ck_epoch_section
regressions/ck_epoch/validate/ck_epoch_section_2
regressions/ck_epoch/validate/torture
regressions/ck_epoch/validate/ck_epoch_synchronize
regressions/ck_epoch/validate/ck_stack
regressions/ck_epoch/validate/ck_stack_read
regressions/ck_epoch/validate/torture
regressions/ck_fifo/benchmark/latency
regressions/ck_fifo/validate/ck_fifo_mpmc
regressions/ck_fifo/validate/ck_fifo_mpmc_iterator
@ -75,11 +84,15 @@ regressions/ck_ht/validate/serial.delete
regressions/ck_pflock/benchmark/latency
regressions/ck_pflock/benchmark/throughput
regressions/ck_pflock/validate/validate
regressions/ck_pr/benchmark/ck_pr_add_64
regressions/ck_pr/benchmark/ck_pr_cas_64
regressions/ck_pr/benchmark/ck_pr_cas_64_2
regressions/ck_pr/benchmark/ck_pr_faa_64
regressions/ck_pr/benchmark/ck_pr_fas_64
regressions/ck_pr/benchmark/ck_pr_neg_64
regressions/ck_pr/benchmark/fp
regressions/ck_pr/validate/ck_pr_add
regressions/ck_pr/validate/ck_pr_add
regressions/ck_pr/validate/ck_pr_and
regressions/ck_pr/validate/ck_pr_bin
regressions/ck_pr/validate/ck_pr_btc
@ -88,10 +101,13 @@ regressions/ck_pr/validate/ck_pr_bts
regressions/ck_pr/validate/ck_pr_btx
regressions/ck_pr/validate/ck_pr_cas
regressions/ck_pr/validate/ck_pr_dec
regressions/ck_pr/validate/ck_pr_dec_zero
regressions/ck_pr/validate/ck_pr_faa
regressions/ck_pr/validate/ck_pr_fas
regressions/ck_pr/validate/ck_pr_fax
regressions/ck_pr/validate/ck_pr_fence
regressions/ck_pr/validate/ck_pr_inc
regressions/ck_pr/validate/ck_pr_inc_zero
regressions/ck_pr/validate/ck_pr_load
regressions/ck_pr/validate/ck_pr_n
regressions/ck_pr/validate/ck_pr_or
@ -106,12 +122,12 @@ regressions/ck_rhs/benchmark/parallel_bytestring
regressions/ck_rhs/benchmark/serial
regressions/ck_rhs/validate/serial
regressions/ck_ring/benchmark/latency
regressions/ck_ring/validate/ck_ring_mpmc
regressions/ck_ring/validate/ck_ring_mpmc_template
regressions/ck_ring/validate/ck_ring_spmc
regressions/ck_ring/validate/ck_ring_spmc_template
regressions/ck_ring/validate/ck_ring_spsc
regressions/ck_ring/validate/ck_ring_spsc_template
regressions/ck_ring/validate/ck_ring_mpmc
regressions/ck_ring/validate/ck_ring_mpmc_template
regressions/ck_rwcohort/benchmark/ck_neutral.LATENCY
regressions/ck_rwcohort/benchmark/ck_neutral.THROUGHPUT
regressions/ck_rwcohort/benchmark/ck_rp.LATENCY
@ -143,9 +159,9 @@ regressions/ck_spinlock/benchmark/ck_mcs.THROUGHPUT
regressions/ck_spinlock/benchmark/ck_spinlock.LATENCY
regressions/ck_spinlock/benchmark/ck_spinlock.THROUGHPUT
regressions/ck_spinlock/benchmark/ck_ticket.LATENCY
regressions/ck_spinlock/benchmark/ck_ticket.THROUGHPUT
regressions/ck_spinlock/benchmark/ck_ticket_pb.LATENCY
regressions/ck_spinlock/benchmark/ck_ticket_pb.THROUGHPUT
regressions/ck_spinlock/benchmark/ck_ticket.THROUGHPUT
regressions/ck_spinlock/benchmark/linux_spinlock.LATENCY
regressions/ck_spinlock/benchmark/linux_spinlock.THROUGHPUT
regressions/ck_spinlock/validate/ck_anderson
@ -185,3 +201,6 @@ regressions/ck_swlock/validate/validate
regressions/ck_tflock/benchmark/latency
regressions/ck_tflock/benchmark/throughput
regressions/ck_tflock/validate/validate
*.so
src/Makefile
.*.sw[op]

39
.travis.yml Normal file

@ -0,0 +1,39 @@
# sudo required as tests set cpu affinity
sudo: false
os:
- linux
- linux-ppc64le
- osx
language:
- c
compiler:
- gcc
- clang
matrix:
exclude:
- os: osx
compiler: gcc
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- gcc-8
- clang-6.0
script:
- >
if [[ $TRAVIS_OS_NAME == linux ]]; then
case "$CC" in
gcc) export CC=gcc-8 ;;
clang) export CC=clang-6.0 ;;
esac
fi
- ${CC} --version
- export CFLAGS="-DITERATE=400 -DPAIRS_S=100 -DITERATIONS=24"
- ./tools/ci-build.sh --cores=4
- make check

1
CNAME Normal file

@ -0,0 +1 @@
concurrencykit.org

76
CODE_OF_CONDUCT.md Normal file

@ -0,0 +1,76 @@
# Contributor Covenant Code of Conduct
## Our Pledge
In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to making participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, sex characteristics, gender identity and expression,
level of experience, education, socio-economic status, nationality, personal
appearance, race, religion, or sexual identity and orientation.
## Our Standards
Examples of behavior that contributes to creating a positive environment
include:
* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members
Examples of unacceptable behavior by participants include:
* The use of sexualized language or imagery and unwelcome sexual attention or
advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Our Responsibilities
Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.
Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.
## Scope
This Code of Conduct applies both within project spaces and in public spaces
when an individual is representing the project or its community. Examples of
representing a project or community include using an official project e-mail
address, posting via an official social media account, or acting as an appointed
representative at an online or offline event. Representation of a project may be
further defined and clarified by project maintainers.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at sbahra@repnop.org. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.
Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
[homepage]: https://www.contributor-covenant.org
For answers to common questions about this code of conduct, see
https://www.contributor-covenant.org/faq


@ -36,12 +36,12 @@ install-headers:
mkdir -p $(DESTDIR)/$(HEADERS) || exit
cp $(SRC_DIR)/include/*.h $(DESTDIR)/$(HEADERS) || exit
chmod 644 $(DESTDIR)/$(HEADERS)/ck_*.h || exit
mkdir -p $(DESTDIR)$(HEADERS)/gcc || exit
mkdir -p $(DESTDIR)/$(HEADERS)/gcc || exit
cp -r $(SRC_DIR)/include/gcc/* $(DESTDIR)/$(HEADERS)/gcc || exit
cp include/ck_md.h $(DESTDIR)/$(HEADERS)/ck_md.h || exit
chmod 755 $(DESTDIR)/$(HEADERS)/gcc
chmod 644 $(DESTDIR)/$(HEADERS)/gcc/ck_*.h $(DESTDIR)/$(HEADERS)/gcc/*/ck_*.h || exit
mkdir -p $(DESTDIR)$(HEADERS)/spinlock || exit
mkdir -p $(DESTDIR)/$(HEADERS)/spinlock || exit
cp -r $(SRC_DIR)/include/spinlock/* $(DESTDIR)/$(HEADERS)/spinlock || exit
chmod 755 $(DESTDIR)/$(HEADERS)/spinlock
chmod 644 $(DESTDIR)/$(HEADERS)/spinlock/*.h || exit

21
README

@ -1,21 +0,0 @@
____ _ ___ _
/ ___|___ _ __ ___ _ _ _ __ _ __ ___ _ __ ___ _ _ | |/ (_) |_
| | / _ \| '_ \ / __| | | | '__| '__/ _ \ '_ \ / __| | | | | ' /| | __|
| |__| (_) | | | | (__| |_| | | | | | __/ | | | (__| |_| | | . \| | |_
\____\___/|_| |_|\___|\__,_|_| |_| \___|_| |_|\___|\__, | |_|\_\_|\__|
|___/
Step 1.
./configure
For additional options try ./configure --help
Step 2.
In order to compile regressions (requires POSIX threads) use
"make regressions". In order to compile libck use "make all" or "make".
Step 3.
In order to install use "make install"
To uninstall use "make uninstall".
See http://concurrencykit.org/ for more information.

204
README.md Normal file

@ -0,0 +1,204 @@
### Continuous Integration
| Drone | Travis | Cirrus |
| -------- | ------ | ------- |
| [![Build Status](https://cloud.drone.io/api/badges/concurrencykit/ck/status.svg)](https://cloud.drone.io/concurrencykit/ck) | [![Build Status](https://travis-ci.org/concurrencykit/ck.svg)](https://travis-ci.org/concurrencykit/ck) | [![Build Status](https://api.cirrus-ci.com/github/concurrencykit/ck.svg?branch=master)](https://cirrus-ci.com/github/concurrencykit/ck) |
Compilers tested in the past include gcc, clang, cygwin, icc, mingw32, mingw64 and suncc across all supported architectures. All new architectures are required to pass the integration test and undergo extensive code review.
Continuous integration is currently enabled for the following targets:
* `darwin/clang/x86-64`
* `freebsd/clang/x86-64`
* `linux/gcc/arm64`
* `linux/gcc/x86-64`
* `linux/clang/x86-64`
* `linux/clang/ppc64le`
### Compile and Build
* Step 1.
`./configure`
For additional options try `./configure --help`
* Step 2.
In order to compile regressions (requires POSIX threads) use
`make regressions`. In order to compile libck use `make all` or `make`.
* Step 3.
In order to install use `make install`
To uninstall use `make uninstall`.
See http://concurrencykit.org/ for more information.
### Supported Architectures
Concurrency Kit supports any architecture using compiler built-ins as a fallback. There is usually a performance degradation associated with this.
Concurrency Kit has specialized assembly for the following architectures:
* `aarch64`
* `arm`
* `ppc`
* `ppc64`
* `s390x`
* `sparcv9+`
* `x86`
* `x86_64`
### Features
#### Concurrency Primitives
##### ck_pr
Concurrency primitives as made available by the underlying architecture, including native support for atomic operations, transactional memory, pipeline control, read-for-ownership and more.
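As a rough illustration (not part of the upstream README), the interface follows a `ck_pr_<operation>_<type>` naming convention:
```c
#include <ck_pr.h>

static unsigned int counter;

/* Atomically add one to the shared counter and publish the update. */
void
counter_increment(void)
{
        ck_pr_faa_uint(&counter, 1);
        ck_pr_fence_store();
}

/* Atomically observe the current value of the shared counter. */
unsigned int
counter_read(void)
{
        return ck_pr_load_uint(&counter);
}
```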
##### ck_backoff
A simple and efficient (minimal noise) backoff function.
##### ck_cc
Abstracted compiler builtins for use when writing efficient concurrent data structures.
#### Safe Memory Reclamation
##### ck_epoch
A scalable safe memory reclamation mechanism with support for idle threads and various optimizations that make it better than or competitive with many state-of-the-art solutions.
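A minimal usage sketch (illustrative only, not from the upstream README), assuming one `ck_epoch_record_t` per thread:
```c
#include <ck_epoch.h>

static ck_epoch_t epoch;
static ck_epoch_record_t record;

void
setup(void)
{
        ck_epoch_init(&epoch);
        ck_epoch_register(&epoch, &record, NULL);
}

void
reader(void)
{
        ck_epoch_begin(&record, NULL);
        /* Dereference epoch-protected pointers here. */
        ck_epoch_end(&record, NULL);
}

void
reclaim(void)
{
        /* Block until a grace period has elapsed; logically deleted
         * objects may then be physically freed. */
        ck_epoch_barrier(&record);
}
```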
##### ck_hp
Implements support for hazard pointers, a simple and efficient lock-free safe memory reclamation mechanism.
#### Data Structures
##### ck_array
A simple concurrently-readable pointer array structure.
##### ck_bitmap
An efficient multi-reader and multi-writer concurrent bitmap structure.
##### ck_ring
Efficient concurrent bounded FIFO data structures with various performance trade-offs. This includes specializations for single-reader, many-reader, single-writer and many-writer.
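For the single-producer/single-consumer specialization, usage looks roughly like the sketch below (not from the upstream README; the ring size must be a power of two and `RING_SIZE` is an arbitrary choice):
```c
#include <ck_ring.h>
#include <stdbool.h>
#include <stddef.h>

#define RING_SIZE 1024

static struct ck_ring ring;
static struct ck_ring_buffer buffer[RING_SIZE];

void
ring_setup(void)
{
        ck_ring_init(&ring, RING_SIZE);
}

/* Returns false if the ring is full. */
bool
ring_send(void *message)
{
        return ck_ring_enqueue_spsc(&ring, buffer, message);
}

/* Returns NULL if the ring is empty. */
void *
ring_receive(void)
{
        void *message;

        if (ck_ring_dequeue_spsc(&ring, buffer, &message) == false)
                return NULL;

        return message;
}
```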
##### ck_fifo
A reference implementation of the first published lock-free FIFO algorithm, with specialization for single-enqueuer-single-dequeuer and many-enqueuer-single-dequeuer and extensions to allow for node re-use.
##### ck_hp_fifo
A reference implementation of the above algorithm, implemented with safe memory reclamation using hazard pointers.
##### ck_hp_stack
A reference implementation of a Treiber stack with support for hazard pointers.
##### ck_stack
A reference implementation of an efficient lock-free stack, with specialized variants for a variety of memory management strategies and bounded concurrency.
##### ck_queue
A derivative of the BSD-queue interface that is friendly to concurrent readers. Coupled with a safe memory reclamation mechanism, it enables scalable read-side queues with a simple search and replace.
##### ck_hs
An extremely efficient single-writer-many-reader hash set that satisfies lock-freedom with bounded concurrency without any usage of atomic operations and allows for recycling of unused or deleted slots. This data structure is recommended for use as a general hash set if it is possible to compute values from keys. Learn more at https://engineering.backtrace.io/workload-specialization/ and http://concurrencykit.org/articles/ck_hs.html.
##### ck_ht
A specialization of the `ck_hs` algorithm allowing for disjunct key-value pairs.
##### ck_rhs
A variant of `ck_hs` that utilizes robin-hood hashing to allow for improved performance with higher load factors and high deletion rates.
#### Synchronization Primitives
##### ck_ec
An extremely efficient event counter implementation, a better alternative to condition variables.
##### ck_barrier
A plethora of execution barriers including: centralized barriers, combining barriers, dissemination barriers, MCS barriers, tournament barriers.
##### ck_brlock
A simple big-reader lock implementation: a write-biased reader-writer lock with scalable read-side locking.
##### ck_bytelock
An implementation of bytelocks, primarily for research purposes, allowing (in theory) for fast read-side acquisition without the use of atomic operations. In reality, memory barriers are required on the fast path.
##### ck_cohort
A generic lock cohorting interface that allows you to turn any lock into a scalable NUMA-friendly lock. There is a significant trade-off in fast path acquisition cost. Specializations are included for all relevant lock implementations in Concurrency Kit. Learn more by reading "Lock Cohorting: A General Technique for Designing NUMA Locks".
##### ck_elide
A generic lock elision framework that allows you to turn any lock implementation into an elision-aware implementation. This requires support for restricted transactional memory by the underlying hardware.
##### ck_pflock
Phase-fair reader-writer mutex that provides strong fairness guarantees between readers and writers. Learn more by reading "Spin-Based Reader-Writer Synchronization for Multiprocessor Real-Time Systems".
##### ck_rwcohort
A generic read-write lock cohorting interface that allows you to turn any read-write lock into a scalable NUMA-friendly lock. There is a significant trade-off in fast path acquisition cost. Specializations are included for all relevant lock implementations in Concurrency Kit. Learn more by reading "Lock Cohorting: A General Technique for Designing NUMA Locks".
##### ck_rwlock
A simple centralized write-biased read-write lock.
##### ck_sequence
A sequence counter lock, popularized by the Linux kernel, allows for very fast read and write synchronization for simple data structures where deep copy is permitted.
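A sketch of the usual read/retry loop (illustrative only, not from the upstream README; writers are assumed to be serialized by some external lock):
```c
#include <ck_sequence.h>

static ck_sequence_t sq = CK_SEQUENCE_INITIALIZER;
static struct { int x, y; } shared;

void
read_snapshot(int *x, int *y)
{
        unsigned int version;

        do {
                version = ck_sequence_read_begin(&sq);
                *x = shared.x;
                *y = shared.y;
        } while (ck_sequence_read_retry(&sq, version));
}

void
write_update(int x, int y)
{
        ck_sequence_write_begin(&sq);
        shared.x = x;
        shared.y = y;
        ck_sequence_write_end(&sq);
}
```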
##### ck_swlock
A single-writer specialized read-lock that is copy-safe, useful for data structures that must remain small, be copied and contain in-band mutexes.
##### ck_tflock
Task-fair locks are fair read-write locks, derived from "Scalable reader-writer synchronization for shared-memory multiprocessors".
##### ck_spinlock
A basic but very fast spinlock implementation.
##### ck_spinlock_anderson
Scalable and fast Anderson spinlocks. This is included for reference; it is one of the earliest scalable and fair lock implementations.
##### ck_spinlock_cas
A basic spinlock utilizing compare_and_swap.
##### ck_spinlock_dec
A basic spinlock; a C adaptation of the older optimized Linux kernel spinlock for x86. Primarily included for reference.
##### ck_spinlock_fas
A basic spinlock utilizing atomic exchange.
##### ck_spinlock_clh
An efficient implementation of the scalable CLH lock, providing many of the same performance properties of MCS with a better fast-path.
##### ck_spinlock_hclh
A NUMA-friendly CLH lock.
##### ck_spinlock_mcs
An implementation of the seminal scalable and fair MCS lock.
##### ck_spinlock_ticket
An implementation of fair centralized locks.
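All of the spinlock variants above share a similar acquire/release interface. A minimal sketch using the default `ck_spinlock` front end (not part of the upstream README):
```c
#include <ck_spinlock.h>

static ck_spinlock_t lock = CK_SPINLOCK_INITIALIZER;

void
critical_section(void)
{
        ck_spinlock_lock(&lock);
        /* Mutate shared state here. */
        ck_spinlock_unlock(&lock);
}
```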

1
_config.yml Normal file

@ -0,0 +1 @@
theme: jekyll-theme-cayman

1
build/ck.build.s390x Normal file

@ -0,0 +1 @@
CFLAGS+=-O2 -D__s390x__

0
build/ck.build.unknown Normal file

227
configure vendored

@ -34,7 +34,7 @@ WANT_PIC=yes
P_PWD=`pwd`
MAINTAINER='sbahra@repnop.org'
VERSION=${VERSION:-'0.6.0'}
VERSION=${VERSION:-'0.7.1'}
VERSION_MAJOR='0'
BUILD="$PWD/build/ck.build"
PREFIX=${PREFIX:-"/usr/local"}
@ -119,6 +119,9 @@ generate()
-e "s#@GZIP_SUFFIX@#$GZIP_SUFFIX#g" \
-e "s#@POINTER_PACK_ENABLE@#$POINTER_PACK_ENABLE#g" \
-e "s#@DISABLE_DOUBLE@#$DISABLE_DOUBLE#g" \
-e "s#@DISABLE_STATIC@#$DISABLE_STATIC#g" \
-e "s#@SSE_DISABLE@#$SSE_DISABLE#g" \
-e "s#@PPC32_LWSYNC_ENABLE@#$PPC32_LWSYNC_ENABLE#g" \
-e "s#@RTM_ENABLE@#$RTM_ENABLE#g" \
-e "s#@LSE_ENABLE@#$LSE_ENABLE#g" \
-e "s#@VMA_BITS@#$VMA_BITS_R#g" \
@ -144,6 +147,7 @@ generate_stdout()
echo " SRC_DIR = $BUILD_DIR"
echo " SYSTEM = $SYSTEM"
echo " PROFILE = $PROFILE"
echo " AR = $AR"
echo " CC = $CC"
echo " COMPILER = $COMPILER"
echo " CFLAGS = $CFLAGS"
@ -153,13 +157,16 @@ generate_stdout()
echo " LDNAME_VERSION = $LDNAME_VERSION"
echo " LDNAME_MAJOR = $LDNAME_MAJOR"
echo " LDFLAGS = $LDFLAGS"
echo " STATIC_LIB = $DISABLE_STATIC"
echo " GZIP = $GZIP"
echo " CORES = $CORES"
echo " POINTER_PACK = $POINTER_PACK_ENABLE"
echo " PPC32_LWSYNC = $PPC32_LWSYNC_ENABLE"
echo " VMA_BITS = $VMA_BITS"
echo " MEMORY_MODEL = $MM"
echo " RTM = $RTM_ENABLE"
echo " LSE = $LSE_ENABLE"
echo " SSE = $SSE_DISABLE"
echo
echo "Headers will be installed in $HEADERS"
echo "Libraries will be installed in $LIBRARY"
@ -169,7 +176,8 @@ generate_stdout()
for option; do
case "$option" in
*=?*)
value=`expr -- "$option" : '[^=]*=\(.*\)'`
optname=`echo $option|cut -c 3-`
value=`expr "$optname" : '[^=]*=\(.*\)'`
;;
*=)
value=
@ -194,18 +202,24 @@ for option; do
echo
echo "The following options will affect generated code."
echo " --enable-pointer-packing Assumes address encoding is subset of pointer range"
echo " --enable-rtm Enable restricted transactional memory (power, x86_64)"
echo " --enable-lse Enable large system extensions (arm64)"
echo " --memory-model=N Specify memory model (currently tso, pso or rmo)"
echo " --vma-bits=N Specify valid number of VMA bits"
echo " --platform=N Force the platform type, instead of relying on autodetection"
echo " --use-cc-builtins Use the compiler atomic bultin functions, instead of the CK implementation"
echo " --use-cc-builtins Use the compiler atomic builtin functions, instead of the CK implementation"
echo " --disable-double Don't generate any of the functions using the \"double\" type"
echo " --disable-static Don't compile a static version of the ck lib"
echo
echo "The following options will affect specific platform-dependent generated code."
echo " --disable-sse Do not use any SSE instructions (x86)"
echo " --enable-lse Enable large system extensions (arm64)"
echo " --enable-ppc32-lwsync Enable lwsync instruction usage (32-bit Power ISA)"
echo " --enable-rtm Enable restricted transactional memory (Power ISA, x86_64)"
echo
echo "The following options affect regression testing."
echo " --cores=N Specify number of cores available on target machine"
echo
echo "The following environment variables may be used:"
echo " AR AR archiver command"
echo " CC C compiler command"
echo " CFLAGS C compiler flags"
echo " LDFLAGS Linker flags"
@ -237,12 +251,18 @@ for option; do
--enable-pointer-packing)
POINTER_PACK_ENABLE="CK_MD_POINTER_PACK_ENABLE"
;;
--enable-ppc32-lwsync)
PPC32_LWSYNC_ENABLE="CK_MD_PPC32_LWSYNC"
;;
--enable-rtm)
RTM_ENABLE_SET="CK_MD_RTM_ENABLE"
;;
--enable-lse)
LSE_ENABLE_SET="CK_MD_LSE_ENABLE"
;;
--disable-sse)
SSE_DISABLE="CK_MD_SSE_DISABLE"
;;
--cores=*)
CORES=$value
;;
@ -276,6 +296,9 @@ for option; do
--disable-double)
DISABLE_DOUBLE="CK_PR_DISABLE_DOUBLE"
;;
--disable-static)
DISABLE_STATIC=1
;;
--platform=*)
PLATFORM=$value
;;
@ -294,7 +317,8 @@ for option; do
fi
;;
*=*)
NAME=`expr -- "$option" : '\([^=]*\)='`
optname=`echo $option|cut -c 3-`
NAME=`expr "$optname" : '\([^=]*\)='`
eval "$NAME='$value'"
export $NAME
;;
@ -309,10 +333,13 @@ done
HEADERS=${HEADERS:-"${PREFIX}/include"}
LIBRARY=${LIBRARY:-"${PREFIX}/lib"}
MANDIR=${MANDIR:-"${PREFIX}/share/man"}
GZIP=${GZIP:-"gzip -c"}
GZIP=${GZIP-"gzip -c"}
POINTER_PACK_ENABLE=${POINTER_PACK_ENABLE:-"CK_MD_POINTER_PACK_DISABLE"}
DISABLE_DOUBLE=${DISABLE_DOUBLE:-"CK_PR_ENABLE_DOUBLE"}
DISABLE_STATIC=${DISABLE_STATIC:-"0"}
PPC32_LWSYNC_ENABLE=${PPC32_LWSYNC_ENABLE:-"CK_MD_PPC32_LWSYNC_DISABLE"}
RTM_ENABLE=${RTM_ENABLE_SET:-"CK_MD_RTM_DISABLE"}
SSE_DISABLE=${SSE_DISABLE:-"CK_MD_SSE_ENABLE"}
LSE_ENABLE=${LSE_ENABLE_SET:-"CK_MD_LSE_DISABLE"}
VMA_BITS=${VMA_BITS:-"unknown"}
@ -347,14 +374,18 @@ case "$SYSTEM" in
DCORES=`sysctl -n hw.ncpu`
SYSTEM=darwin
;;
MINGW32*)
MINGW32*|MSYS_NT*)
SYSTEM=mingw32
LDFLAGS="-mthreads $LDFLAGS"
;;
CYGWIN_NT*)
SYSTEM=cygwin
LDFLAGS="-mthreads $LDFLAGS"
;;
MINGW64*)
SYSTEM=mingw64
LDFLAGS="-mthreads $LDFLAGS"
;;
CYGWIN_NT*)
SYSTEM=cygwin
LDFLAGS="-mthreads $LDFLAGS"
;;
*)
SYSTEM=
;;
@ -365,11 +396,18 @@ assert "$SYSTEM" "$SYSTEM" "unsupported"
CORES=${CORES:-${DCORES}}
printf "Detecting machine architecture..."
if test "x$PLATFORM" = "x"; then
PLATFORM=`uname -m 2> /dev/null`
case $SYSTEM in
"freebsd")
PLATFORM=`uname -p 2> /dev/null`
;;
*)
PLATFORM=`uname -m 2> /dev/null`
;;
esac
fi
case $PLATFORM in
"macppc"|"Power Macintosh"|"powerpc")
"macppc"|"Power Macintosh"|"powerpc"|"powerpcspe")
RTM_ENABLE="CK_MD_RTM_DISABLE"
LSE_ENABLE="CK_MD_LSE_DISABLE"
MM="${MM:-"CK_MD_RMO"}"
@ -457,19 +495,22 @@ case $PLATFORM in
;;
esac
;;
"ppc64"|"ppc64le")
"ppc64"|"ppc64le"|"powerpc64")
RTM_ENABLE="CK_MD_RTM_DISABLE"
LSE_ENABLE="CK_MD_LSE_DISABLE"
MM="${MM:-"CK_MD_RMO"}"
PLATFORM=ppc64
ENVIRONMENT=64
;;
arm|armv6l|armv7l)
if test "$PLATFORM" = "armv6l"; then
CFLAGS="$CFLAGS -march=armv6k";
elif test "$PLATFORM" = "armv7l"; then
CFLAGS="$CFLAGS -march=armv7-a";
fi
arm|armv6|armv6l|armv7|armv7l)
case "$PLATFORM" in
"armv6"|"armv6l")
CFLAGS="$CFLAGS -march=armv6k";
;;
"armv7"|"armv7l")
CFLAGS="$CFLAGS -march=armv7-a";
;;
esac
RTM_ENABLE="CK_MD_RTM_DISABLE"
LSE_ENABLE="CK_MD_LSE_DISABLE"
MM="${MM:-"CK_MD_RMO"}"
@ -482,11 +523,19 @@ case $PLATFORM in
PLATFORM=aarch64
ENVIRONMENT=64
;;
"s390x")
RTM_ENABLE="CK_MD_RTM_DISABLE"
LSE_ENABLE="CK_MD_LSE_DISABLE"
MM="${MM:-"CK_MD_RMO"}"
PLATFORM=s390x
ENVIRONMENT=64
;;
*)
RTM_ENABLE="CK_MD_RTM_DISABLE"
LSE_ENABLE="CK_MD_LSE_DISABLE"
PLATFORM=
PLATFORM=unknown
MM="${MM:-"CK_MD_RMO"}"
USE_CC_BUILTINS=1
;;
esac
@ -543,27 +592,65 @@ else
echo "success [$BUILD_DIR]"
fi
printf "Finding gzip tool................"
GZIP=`pathsearch "${GZIP:-gzip}"`
if test -z "$GZIP" -o ! -x "$GZIP"; then
if test -n "$GZIP"; then
printf "Finding gzip tool................"
GZIP=`pathsearch "${GZIP:-gzip}"`
GZIP="$GZIP"
if test -z "$GZIP" -o ! -x "$GZIP"; then
GZIP=`pathsearch "${GZIP:-gzip}"`
GZIP="$GZIP"
fi
if test -z "$GZIP"; then
echo "not found"
else
echo "success [$GZIP]"
GZIP="$GZIP -c"
GZIP_SUFFIX=".gz"
fi
fi
if test -z "$GZIP"; then
echo "not found"
GZIP=cat
GZIP_SUFFIX=""
else
echo "success [$GZIP]"
GZIP="$GZIP -c"
GZIP_SUFFIX=".gz"
fi
if test "$PROFILE"; then
printf "Using user-specified profile....."
if test -z "$CC"; then
echo "failed [specify compiler]"
exit $EXIT_FAILURE
fi
if test ! -f build/ck.build.$PROFILE; then
echo "failed [$PROFILE]"
exit $EXIT_FAILURE
fi
echo "success [$PROFILE]"
printf "Generating header files.........."
generate include/ck_md.h.in include/ck_md.h
generate include/freebsd/ck_md.h.in include/freebsd/ck_md.h
echo "success"
printf "Generating build files..........."
generate src/Makefile.in src/Makefile
generate doc/Makefile.in doc/Makefile
generate build/ck.build.in build/ck.build
generate build/regressions.build.in build/regressions.build
generate build/ck.pc.in build/ck.pc
generate build/ck.spec.in build/ck.spec
generate Makefile.in Makefile
echo "success"
generate_stdout
exit $EXIT_SUCCESS
fi
printf "Finding suitable compiler........"
CC=`pathsearch "${CC:-cc}"`
if test -z "$CC" -o ! -x "$CC"; then
CC=`pathsearch "${CC:-gcc}"`
if test ! -x "${CC}"; then
CC=`pathsearch "${CC:-cc}"`
if test -z "$CC" -o ! -x "$CC"; then
CC=`pathsearch "${CC:-gcc}"`
fi
fi
assert "$CC" "not found"
@ -596,7 +683,7 @@ int main(void) {
EOF
$CC -o .1 .1.c
COMPILER=`./.1`
COMPILER=`./.1 2> /dev/null`
r=$?
rm -f .1.c .1
@ -628,13 +715,24 @@ elif test "$COMPILER" = "gcc" || test "$COMPILER" = "clang" || test "$COMPILER"
if test "$WANT_PIC" = "yes"; then
LDFLAGS="$LDFLAGS -shared -fPIC"
CFLAGS="$CFLAGS -fPIC"
ALL_LIBS="libck.so libck.a"
INSTALL_LIBS="install-so install-lib"
if [ "$DISABLE_STATIC" -eq 1 ]; then
ALL_LIBS="libck.so"
INSTALL_LIBS="install-so"
else
ALL_LIBS="libck.so libck.a"
INSTALL_LIBS="install-so install-lib"
fi
else
LDFLAGS="$LDFLAGS -fno-PIC"
CFLAGS="$CFLAGS -fno-PIC"
ALL_LIBS="libck.a"
INSTALL_LIBS="install-lib"
if [ "$DISABLE_STATIC" -eq 1 ]; then
echo "Error: You have chosen to disable PIC, yet you also disabled the static lib." 1>&2
exit $EXIT_FAILURE
else
ALL_LIBS="libck.a"
INSTALL_LIBS="install-lib"
fi
fi
CFLAGS="-D_XOPEN_SOURCE=600 -D_BSD_SOURCE -D_DEFAULT_SOURCE -std=gnu99 -pedantic -Wall -W -Wundef -Wendif-labels -Wshadow -Wpointer-arith -Wcast-align -Wcast-qual -Wwrite-strings -Wstrict-prototypes -Wmissing-prototypes -Wnested-externs -Winline -Wdisabled-optimization -fstrict-aliasing -O2 -pipe -Wno-parentheses $CFLAGS"
@ -647,6 +745,17 @@ else
assert "" "unknown compiler"
fi
printf "Finding suitable archiver........"
if test ! -x "${AR}"; then
AR=`pathsearch "${AR:-ar}"`
if test -z "$AR" -o ! -x "$AR"; then
AR=`pathsearch "${AR:-ar}"`
else
echo "success [$AR]"
fi
fi
assert "$AR" "not found"
printf "Detecting VMA bits..............."
VMA="unknown"
if test "$VMA_BITS" = "unknown"; then
@ -732,42 +841,13 @@ printf "Detecting git SHA................"
get_git_sha
echo "$GIT_MSG [$GIT_SHA]"
if test "$PROFILE"; then
printf "Using user-specified profile....."
if test -z "$CC"; then
echo "failed [specify compiler]"
exit $EXIT_FAILURE
fi
if test ! -f build/ck.build.$PROFILE; then
echo "failed [$PROFILE]"
exit $EXIT_FAILURE
fi
echo "success [$PROFILE]"
printf "Generating header files.........."
generate include/ck_md.h.in include/ck_md.h
echo "success"
printf "Generating build files..........."
generate src/Makefile.in src/Makefile
generate doc/Makefile.in doc/Makefile
generate build/ck.build.in build/ck.build
generate build/regressions.build.in build/regressions.build
generate build/ck.pc.in build/ck.pc
generate build/ck.spec.in build/ck.spec
generate Makefile.in Makefile
echo "success"
generate_stdout
exit $EXIT_SUCCESS
fi
# Platform will be used as a macro.
PROFILE="${PROFILE:-$PLATFORM}"
PLATFORM="__${PLATFORM}__"
printf "Generating header files.........."
generate include/ck_md.h.in include/ck_md.h
generate include/freebsd/ck_md.h.in include/freebsd/ck_md.h
echo "success"
printf "Generating build files..........."
@ -794,3 +874,12 @@ generate Makefile.in $P_PWD/Makefile
touch src/*.c
echo "success"
generate_stdout
if test "$PROFILE" = "unknown"; then
echo
echo "WARNING: your target architecture is not a first-class citizen."
echo
echo "The test suite may not work as intended. Consider reaching out "
echo "to the mailing list about having the project add first-class "
echo "support for your architecture."
fi


@ -47,10 +47,9 @@ if deemed safe. This function is meant to be used in cases epoch
reclamation cost must be amortized over time in a manner that does
not affect caller progress.
.Sh RETURN VALUES
This function will return true if at least one function was dispatched.
This function will return false if it has determined not all threads have
observed the latest generation of epoch-protected objects. Neither value
indicates an error.
This function returns false if the following conditions are met:
no memory was reclaimed, the records are not in a grace period, and no forward
progress was made.
.Sh ERRORS
Behavior is undefined if the object pointed to by
.Fa record


@ -34,7 +34,7 @@ Concurrency Kit (libck, \-lck)
.Sh SYNOPSIS
.In ck_epoch.h
.Ft void
.Fn ck_epoch_register "ck_epoch_t *epoch" "ck_epoch_record_t *record"
.Fn ck_epoch_register "ck_epoch_t *epoch" "ck_epoch_record_t *record" "void *cl"
.Sh DESCRIPTION
The
.Fn ck_epoch_register 3
@ -49,7 +49,11 @@ object pointed to by the
argument will have lifetime managed by the underlying epoch sub-system.
The record object must not be destroyed after it is associated with a
.Fn ck_epoch_register 3
call.
call. An optional context pointer
.Fa cl
may be passed that is retrievable with the
.Fn ck_epoch_record_ct 3
function.
.Sh RETURN VALUES
This function has no return value.
.Sh SEE ALSO


@ -40,6 +40,8 @@ The
.Fn ck_epoch_synchronize 3
function will block the caller until a grace period has been
detected, according to the semantics of epoch reclamation.
It is not safe to call this function on a record that is
in an active section.
Any objects requiring safe memory reclamation which are logically
deleted are safe for physical deletion following a call to
.Fn ck_epoch_synchronize 3 .


@ -29,22 +29,31 @@
.Sh NAME
.Nm ck_pr_dec_ptr ,
.Nm ck_pr_dec_ptr_zero ,
.Nm ck_pr_dec_ptr_is_zero ,
.Nm ck_pr_dec_double ,
.Nm ck_pr_dec_double_zero ,
.Nm ck_pr_dec_double_is_zero ,
.Nm ck_pr_dec_char ,
.Nm ck_pr_dec_char_zero ,
.Nm ck_pr_dec_char_is_zero ,
.Nm ck_pr_dec_uint ,
.Nm ck_pr_dec_uint_zero ,
.Nm ck_pr_dec_uint_is_zero ,
.Nm ck_pr_dec_int ,
.Nm ck_pr_dec_int_zero ,
.Nm ck_pr_dec_int_is_zero ,
.Nm ck_pr_dec_64 ,
.Nm ck_pr_dec_64_zero ,
.Nm ck_pr_dec_64_is_zero ,
.Nm ck_pr_dec_32 ,
.Nm ck_pr_dec_32_zero ,
.Nm ck_pr_dec_32_is_zero ,
.Nm ck_pr_dec_16 ,
.Nm ck_pr_dec_16_zero ,
.Nm ck_pr_dec_16_is_zero ,
.Nm ck_pr_dec_8 ,
.Nm ck_pr_dec_8_zero
.Nm ck_pr_dec_8_zero ,
.Nm ck_pr_dec_8_is_zero
.Nd atomic decrement operations
.Sh LIBRARY
Concurrency Kit (libck, \-lck)
@ -54,38 +63,56 @@ Concurrency Kit (libck, \-lck)
.Fn ck_pr_dec_ptr "void *target"
.Ft void
.Fn ck_pr_dec_ptr_zero "void *target" "bool *z"
.Ft bool
.Fn ck_pr_dec_ptr_is_zero "void *target"
.Ft void
.Fn ck_pr_dec_double "double *target"
.Ft void
.Fn ck_pr_dec_double_zero "double *target" "bool *z"
.Ft bool
.Fn ck_pr_dec_double_is_zero "double *target"
.Ft void
.Fn ck_pr_dec_char "char *target"
.Ft void
.Fn ck_pr_dec_char_zero "char *target" "bool *z"
.Ft bool
.Fn ck_pr_dec_char_is_zero "char *target"
.Ft void
.Fn ck_pr_dec_uint "unsigned int *target"
.Ft void
.Fn ck_pr_dec_uint_zero "unsigned int *target" "bool *z"
.Ft bool
.Fn ck_pr_dec_uint_is_zero "unsigned int *target"
.Ft void
.Fn ck_pr_dec_int "int *target"
.Ft void
.Fn ck_pr_dec_int_zero "int *target" "bool *z"
.Ft bool
.Fn ck_pr_dec_int_is_zero "int *target"
.Ft void
.Fn ck_pr_dec_64 "uint64_t *target"
.Ft void
.Fn ck_pr_dec_64_zero "uint64_t *target" "bool *z"
.Ft bool
.Fn ck_pr_dec_64_is_zero "uint64_t *target"
.Ft void
.Fn ck_pr_dec_32 "uint32_t *target"
.Ft void
.Fn ck_pr_dec_32_zero "uint32_t *target" "bool *z"
.Ft bool
.Fn ck_pr_dec_32_is_zero "uint32_t *target"
.Ft void
.Fn ck_pr_dec_16 "uint16_t *target"
.Ft void
.Fn ck_pr_dec_16_zero "uint16_t *target" "bool *z"
.Ft bool
.Fn ck_pr_dec_16_is_zero "uint16_t *target"
.Ft void
.Fn ck_pr_dec_8 "uint8_t *target"
.Ft void
.Fn ck_pr_dec_8_zero "uint8_t *target" "bool *z"
.Ft bool
.Fn ck_pr_dec_8_is_zero "uint8_t *target"
.Sh DESCRIPTION
The
.Fn ck_pr_dec 3
@ -99,6 +126,8 @@ to true if the result
of the decrement operation was 0. They set the value pointed to by
.Fa z
to false otherwise.
The ck_pr_dec_is_zero family of functions returns true if the result
of the decrement operation was 0 and false otherwise.
.Sh SEE ALSO
.Xr ck_pr_fence_load 3 ,
.Xr ck_pr_fence_load_depends 3 ,


@ -29,22 +29,31 @@
.Sh NAME
.Nm ck_pr_inc_ptr ,
.Nm ck_pr_inc_ptr_zero ,
.Nm ck_pr_inc_ptr_is_zero ,
.Nm ck_pr_inc_double ,
.Nm ck_pr_inc_double_zero ,
.Nm ck_pr_inc_double_is_zero ,
.Nm ck_pr_inc_char ,
.Nm ck_pr_inc_char_zero ,
.Nm ck_pr_inc_char_is_zero ,
.Nm ck_pr_inc_uint ,
.Nm ck_pr_inc_uint_zero ,
.Nm ck_pr_inc_uint_is_zero ,
.Nm ck_pr_inc_int ,
.Nm ck_pr_inc_int_zero ,
.Nm ck_pr_inc_int_is_zero ,
.Nm ck_pr_inc_64 ,
.Nm ck_pr_inc_64_zero ,
.Nm ck_pr_inc_64_is_zero ,
.Nm ck_pr_inc_32 ,
.Nm ck_pr_inc_32_zero ,
.Nm ck_pr_inc_32_is_zero ,
.Nm ck_pr_inc_16 ,
.Nm ck_pr_inc_16_zero ,
.Nm ck_pr_inc_16_is_zero ,
.Nm ck_pr_inc_8 ,
.Nm ck_pr_inc_8_zero
.Nm ck_pr_inc_8_zero ,
.Nm ck_pr_inc_8_is_zero
.Nd atomic increment operations
.Sh LIBRARY
Concurrency Kit (libck, \-lck)
@ -54,38 +63,56 @@ Concurrency Kit (libck, \-lck)
.Fn ck_pr_inc_ptr "void *target"
.Ft void
.Fn ck_pr_inc_ptr_zero "void *target" "bool *z"
.Ft bool
.Fn ck_pr_inc_ptr_is_zero "void *target"
.Ft void
.Fn ck_pr_inc_double "double *target"
.Ft void
.Fn ck_pr_inc_double_zero "double *target" "bool *z"
.Ft bool
.Fn ck_pr_inc_double_is_zero "double *target"
.Ft void
.Fn ck_pr_inc_char "char *target"
.Ft void
.Fn ck_pr_inc_char_zero "char *target" "bool *z"
.Ft bool
.Fn ck_pr_inc_char_is_zero "char *target"
.Ft void
.Fn ck_pr_inc_uint "unsigned int *target"
.Ft void
.Fn ck_pr_inc_uint_zero "unsigned int *target" "bool *z"
.Ft bool
.Fn ck_pr_inc_uint_is_zero "unsigned int *target"
.Ft void
.Fn ck_pr_inc_int "int *target"
.Ft void
.Fn ck_pr_inc_int_zero "int *target" "bool *z"
.Ft bool
.Fn ck_pr_inc_int_is_zero "int *target"
.Ft void
.Fn ck_pr_inc_64 "uint64_t *target"
.Ft void
.Fn ck_pr_inc_64_zero "uint64_t *target" "bool *z"
.Ft bool
.Fn ck_pr_inc_64_is_zero "uint64_t *target"
.Ft void
.Fn ck_pr_inc_32 "uint32_t *target"
.Ft void
.Fn ck_pr_inc_32_zero "uint32_t *target" "bool *z"
.Ft bool
.Fn ck_pr_inc_32_is_zero "uint32_t *target"
.Ft void
.Fn ck_pr_inc_16 "uint16_t *target"
.Ft void
.Fn ck_pr_inc_16_zero "uint16_t *target" "bool *z"
.Ft bool
.Fn ck_pr_inc_16_is_zero "uint16_t *target"
.Ft void
.Fn ck_pr_inc_8 "uint8_t *target"
.Ft void
.Fn ck_pr_inc_8_zero "uint8_t *target" "bool *z"
.Ft bool
.Fn ck_pr_inc_8_is_zero "uint8_t *target"
.Sh DESCRIPTION
The
.Fn ck_pr_inc 3
@ -99,6 +126,8 @@ to true if the result of the increment operation was 0. The functions set
the value pointed to by
.Fa z
false otherwise.
The ck_pr_inc_is_zero family of functions returns true if the result
of the increment operation was 0 and false otherwise.
.Sh SEE ALSO
.Xr ck_pr_fence_load 3 ,
.Xr ck_pr_fence_load_depends 3 ,


@ -50,7 +50,7 @@ ck_backoff_eb(unsigned int *c)
for (i = 0; i < ceiling; i++)
ck_pr_barrier();
*c = ceiling <<= ceiling < CK_BACKOFF_CEILING;
*c = ceiling << (ceiling < CK_BACKOFF_CEILING);
return;
}


@ -50,6 +50,7 @@
* Container function.
* This relies on (compiler) implementation-defined behavior.
*/
#ifndef CK_CC_CONTAINER
#define CK_CC_CONTAINER(F, T, M, N) \
CK_CC_INLINE static T * \
N(F *p) \
@ -57,6 +58,7 @@
F *n = p; \
return (T *)(void *)(((char *)n) - ((size_t)&((T *)0)->M)); \
}
#endif
#define CK_CC_PAD(x) union { char pad[x]; }
@ -104,41 +106,35 @@
#define CK_CC_TYPEOF(X, DEFAULT) (DEFAULT)
#endif
#define CK_F_CC_FFS_G(L, T) \
CK_CC_INLINE static int \
ck_cc_##L(T v) \
{ \
unsigned int i; \
\
if (v == 0) \
return 0; \
\
for (i = 1; (v & 1) == 0; i++, v >>= 1); \
return i; \
}
#ifndef CK_F_CC_FFS
#define CK_F_CC_FFS
CK_CC_INLINE static int
ck_cc_ffs(unsigned int x)
{
unsigned int i;
CK_F_CC_FFS_G(ffs, unsigned int)
#endif /* CK_F_CC_FFS */
if (x == 0)
return 0;
#ifndef CK_F_CC_FFSL
#define CK_F_CC_FFSL
CK_F_CC_FFS_G(ffsl, unsigned long)
#endif /* CK_F_CC_FFSL */
for (i = 1; (x & 1) == 0; i++, x >>= 1);
#ifndef CK_F_CC_FFSLL
#define CK_F_CC_FFSLL
CK_F_CC_FFS_G(ffsll, unsigned long long)
#endif /* CK_F_CC_FFSLL */
return i;
}
#endif
#ifndef CK_F_CC_CLZ
#define CK_F_CC_CLZ
#include <ck_limits.h>
CK_CC_INLINE static int
ck_cc_clz(unsigned int x)
{
unsigned int count, i;
for (count = 0, i = sizeof(unsigned int) * CHAR_BIT; i > 0; count++) {
unsigned int bit = 1U << --i;
if (x & bit)
break;
}
return count;
}
#endif
#undef CK_F_CC_FFS_G
#ifndef CK_F_CC_CTZ
#define CK_F_CC_CTZ
@ -151,7 +147,6 @@ ck_cc_ctz(unsigned int x)
return 0;
for (i = 0; (x & 1) == 0; i++, x >>= 1);
return i;
}
#endif

945
include/ck_ec.h Normal file

@ -0,0 +1,945 @@
/*
* Copyright 2018 Paul Khuong, Google LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Overview
* ========
*
* ck_ec implements 32- and 64- bit event counts. Event counts let us
* easily integrate OS-level blocking (e.g., futexes) in lock-free
* protocols. Waiters block conditionally, if the event count's value
* is still equal to some old value.
*
* Event counts come in four variants: 32 and 64 bit (with one bit
* stolen for internal signaling, so 31 and 63 bit counters), and
* single or multiple producers (wakers). Waiters are always multiple
* consumers. The 32 bit variants are smaller, and more efficient,
* especially in single producer mode. The 64 bit variants are larger,
* but practically invulnerable to ABA.
*
* The 32 bit variant is always available. The 64 bit variant is only
* available if CK supports 64-bit atomic operations. Currently,
* specialization for single producer is only implemented for x86 and
* x86-64, on compilers that support GCC extended inline assembly;
* other platforms fall back to the multiple producer code path.
*
* A typical usage pattern is:
*
* 1. On the producer side:
*
* - Make changes to some shared data structure, without involving
* the event count at all.
* - After each change, call ck_ec_inc on the event count. The call
* acts as a write-write barrier, and wakes up any consumer blocked
* on the event count (waiting for new changes).
*
* 2. On the consumer side:
*
* - Snapshot ck_ec_value of the event count. The call acts as a
* read barrier.
* - Read and process the shared data structure.
* - Wait for new changes by calling ck_ec_wait with the snapshot value.
*
* Some data structures may opt for tighter integration with their
* event count. For example, an SPMC ring buffer or disruptor might
* use the event count's value as the write pointer. If the buffer is
* regularly full, it might also make sense to store the read pointer
* in an MP event count.
*
* This event count implementation supports tighter integration in two
* ways.
*
* Producers may opt to increment by an arbitrary value (less than
* INT32_MAX / INT64_MAX), in order to encode, e.g., byte
* offsets. Larger increment values make wraparound more likely, so
* the increments should still be relatively small.
*
* Consumers may pass a predicate to ck_ec_wait_pred. This predicate
* can make `ck_ec_wait_pred` return early, before the event count's
* value changes, and can override the deadline passed to futex_wait.
* This lets a consumer block on one event count, while optimistically
* looking at other waking conditions.
*
* API Reference
* =============
*
* When compiled as C11 or later, this header defines type-generic
* macros for ck_ec32 and ck_ec64; the reference describes this
* type-generic API.
*
* ck_ec needs additional OS primitives to determine the current time,
* to wait on an address, and to wake all threads waiting on a given
* address. These are defined with fields in a struct ck_ec_ops. Each
* ck_ec_ops may additionally define the number of spin loop
* iterations in the slow path, as well as the initial wait time in
* the internal exponential backoff, the exponential scale factor, and
* the right shift count (< 32).
*
* The ops, in addition to the single/multiple producer flag, are
* encapsulated in a struct ck_ec_mode, passed to most ck_ec
* operations.
*
* ec is a struct ck_ec32 *, or a struct ck_ec64 *.
*
* value is an uint32_t for ck_ec32, and an uint64_t for ck_ec64. It
* never exceeds INT32_MAX and INT64_MAX respectively.
*
* mode is a struct ck_ec_mode *.
*
* deadline is either NULL, or a `const struct timespec *` that will
* be treated as an absolute deadline.
*
* `void ck_ec_init(ec, value)`: initializes the event count to value.
*
* `value ck_ec_value(ec)`: returns the current value of the event
* counter. This read acts as a read (acquire) barrier.
*
* `bool ck_ec_has_waiters(ec)`: returns whether some thread has
* marked the event count as requiring an OS wakeup.
*
* `void ck_ec_inc(ec, mode)`: increments the value of the event
* counter by one. This write acts as a write barrier. Wakes up
* any waiting thread.
*
* `value ck_ec_add(ec, mode, value)`: increments the event counter by
* `value`, and returns the event counter's previous value. This
* write acts as a write barrier. Wakes up any waiting thread.
*
* `int ck_ec_deadline(struct timespec *new_deadline,
* mode,
* const struct timespec *timeout)`:
* computes a deadline `timeout` away from the current time. If
* timeout is NULL, computes a deadline in the infinite future. The
* resulting deadline is written to `new_deadline`. Returns 0 on
* success, and -1 if ops->gettime failed (without touching errno).
*
* `int ck_ec_wait(ec, mode, value, deadline)`: waits until the event
* counter's value differs from `value`, or, if `deadline` is
* provided and non-NULL, until the current time is after that
* deadline. Use a deadline with tv_sec = 0 for a non-blocking
* execution. Returns 0 if the event counter has changed, and -1 on
* timeout. This function acts as a read (acquire) barrier.
*
* `int ck_ec_wait_pred(ec, mode, value, pred, data, deadline)`: waits
* until the event counter's value differs from `value`, or until
* `pred` returns non-zero, or, if `deadline` is provided and
* non-NULL, until the current time is after that deadline. Use a
* deadline with tv_sec = 0 for a non-blocking execution. Returns 0 if
* the event counter has changed, `pred`'s return value if non-zero,
* and -1 on timeout. This function acts as a read (acquire) barrier.
*
* `pred` is always called as `pred(data, iteration_deadline, now)`,
* where `iteration_deadline` is a timespec of the deadline for this
* exponential backoff iteration, and `now` is the current time. If
* `pred` returns a non-zero value, that value is immediately returned
* to the waiter. Otherwise, `pred` is free to modify
* `iteration_deadline` (moving it further in the future is a bad
* idea).
*
* Implementation notes
* ====================
*
* The multiple producer implementation is a regular locked event
* count, with a single flag bit to denote the need to wake up waiting
* threads.
*
* The single producer specialization is heavily tied to
* [x86-TSO](https://www.cl.cam.ac.uk/~pes20/weakmemory/cacm.pdf), and
* to non-atomic read-modify-write instructions (e.g., `inc mem`);
* these non-atomic RMW let us write to the same memory locations with
* atomic and non-atomic instructions, without suffering from process
* scheduling stalls.
*
* The reason we can mix atomic and non-atomic writes to the `counter`
* word is that every non-atomic write obviates the need for the
* atomically flipped flag bit: we only use non-atomic writes to
* update the event count, and the atomic flag only informs the
* producer that we would like a futex_wake, because of the update.
* We only require the non-atomic RMW counter update to prevent
* preemption from introducing arbitrarily long worst case delays.
*
* Correctness does not rely on the usual ordering argument: in the
* absence of fences, there is no strict ordering between atomic and
* non-atomic writes. The key is instead x86-TSO's guarantee that a
* read is satisfied from the most recent buffered write in the local
* store queue if there is one, or from memory if there is no write to
* that address in the store queue.
*
* x86-TSO's constraint on reads suffices to guarantee that the
* producer will never forget about a counter update. If the last
* update is still queued, the new update will be based on the queued
* value. Otherwise, the new update will be based on the value in
* memory, which may or may not have had its flag flipped. In either
* case, the value of the counter (modulo flag) is correct.
*
* When the producer forwards the counter's value from its store
* queue, the new update might not preserve a flag flip. Any waiter
* thus has to check from time to time to determine if it wasn't
* woken up because the flag bit was silently cleared.
*
* In reality, the store queue in x86-TSO stands for in-flight
* instructions in the chip's out-of-order backend. In the vast
* majority of cases, instructions will only remain in flight for a
* few hundred or thousand cycles. That's why ck_ec_wait spins on
* the `counter` word for ~100 iterations after flipping its flag bit:
* if the counter hasn't changed after that many iterations, it is
* very likely that the producer's next counter update will observe
* the flag flip.
*
* That's still not a hard guarantee of correctness. Conservatively,
* we can expect that no instruction will remain in flight for more
* than 1 second... if only because some interrupt will have forced
* the chip to store its architectural state in memory, at which point
* an instruction is either fully retired or rolled back. Interrupts,
* particularly the pre-emption timer, are why single-producer updates
* must happen in a single non-atomic read-modify-write instruction.
* Having a single instruction as the critical section means we only
* have to consider the worst-case execution time for that
* instruction. That's easier than doing the same for a pair of
* instructions, which an unlucky pre-emption could delay for
* arbitrarily long.
*
* Thus, after a short spin loop, ck_ec_wait enters an exponential
* backoff loop, where each "sleep" is instead a futex_wait. The
* backoff is only necessary to handle rare cases where the flag flip
* was overwritten after the spin loop. Eventually, more than one
* second will have elapsed since the flag flip, and the sleep timeout
* becomes infinite: since the flag bit has been set for much longer
* than the time for which an instruction may remain in flight, the
* flag will definitely be observed at the next counter update.
*
* The 64 bit ck_ec_wait pulls another trick: futexes only handle 32
* bit ints, so we must treat the 64 bit counter's low 32 bits as an
* int in futex_wait. That's a bit dodgy, but fine in practice, given
* that the OS's futex code will always read whatever value is
* currently in memory: even if the producer thread were to wait on
* its own event count, the syscall and ring transition would empty
* the store queue (the out-of-order execution backend).
*
* Finally, what happens when the producer is migrated to another core
* or otherwise pre-empted? Migration must already incur a barrier, so
* that thread always sees its own writes, so that's safe. As for
* pre-emption, that requires storing the architectural state, which
* means every instruction must either be executed fully or not at
* all when pre-emption happens.
*/
#ifndef CK_EC_H
#define CK_EC_H
#include <ck_cc.h>
#include <ck_pr.h>
#include <ck_stdbool.h>
#include <ck_stdint.h>
#include <ck_stddef.h>
#include <sys/time.h>
/*
* If we have ck_pr_faa_64 (and, presumably, ck_pr_load_64), we
* support 63 bit counters.
*/
#ifdef CK_F_PR_FAA_64
#define CK_F_EC64
#endif /* CK_F_PR_FAA_64 */
/*
* GCC inline assembly lets us exploit non-atomic read-modify-write
* instructions on x86/x86_64 for a fast single-producer mode.
*
* If CK_F_EC_SP is not defined, CK_EC always uses the slower
* multiple producer code.
*/
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
#define CK_F_EC_SP
#endif /* GNUC && (__i386__ || __x86_64__) */
struct ck_ec_ops;
struct ck_ec_wait_state {
struct timespec start; /* Time when we entered ck_ec_wait. */
struct timespec now; /* Time now. */
const struct ck_ec_ops *ops;
void *data; /* Opaque pointer for the predicate's internal state. */
};
/*
* ck_ec_ops define system-specific functions to get the current time,
* atomically wait on an address if it still has some expected value,
* and to wake all threads waiting on an address.
*
* Each platform is expected to have few (one) opaque pointer to a
* const ops struct, and reuse it for all ck_ec_mode structs.
*/
struct ck_ec_ops {
/* Populates out with the current time. Returns non-zero on failure. */
int (*gettime)(const struct ck_ec_ops *, struct timespec *out);
/*
* Waits on address if its value is still `expected`. If
* deadline is non-NULL, stops waiting once that deadline is
* reached. May return early for any reason.
*/
void (*wait32)(const struct ck_ec_wait_state *, const uint32_t *,
uint32_t expected, const struct timespec *deadline);
/*
* Same as wait32, but for a 64 bit counter. Only used if
* CK_F_EC64 is defined.
*
* If underlying blocking primitive only supports 32 bit
* control words, it should be safe to block on the least
* significant half of the 64 bit address.
*/
void (*wait64)(const struct ck_ec_wait_state *, const uint64_t *,
uint64_t expected, const struct timespec *deadline);
/* Wakes up all threads waiting on address. */
void (*wake32)(const struct ck_ec_ops *, const uint32_t *address);
/*
* Same as wake32, but for a 64 bit counter. Only used if
* CK_F_EC64 is defined.
*
* When wait64 truncates the control word at address to `only`
* consider its least significant half, wake64 should perform
* any necessary fixup (e.g., on big endian platforms).
*/
void (*wake64)(const struct ck_ec_ops *, const uint64_t *address);
/*
* Number of iterations for the initial busy wait. 0 defaults
* to 100 (not ABI stable).
*/
uint32_t busy_loop_iter;
/*
* Delay in nanoseconds for the first iteration of the
* exponential backoff. 0 defaults to 2 ms (not ABI stable).
*/
uint32_t initial_wait_ns;
/*
* Scale factor for the exponential backoff. 0 defaults to 8x
* (not ABI stable).
*/
uint32_t wait_scale_factor;
/*
* Right shift count for the exponential backoff. The update
* after each iteration is
* wait_ns = (wait_ns * wait_scale_factor) >> wait_shift_count,
* until one second has elapsed. After that, the deadline goes
* to infinity.
*/
uint32_t wait_shift_count;
};
/*
* ck_ec_mode wraps the ops table, and informs the fast path whether
* it should attempt to specialize for single producer mode.
*
* mode structs are expected to be exposed by value, e.g.,
*
* extern const struct ck_ec_ops system_ec_ops;
*
* static const struct ck_ec_mode ec_sp = {
* .ops = &system_ec_ops,
* .single_producer = true
* };
*
* static const struct ck_ec_mode ec_mp = {
* .ops = &system_ec_ops,
* .single_producer = false
* };
*
* ck_ec_mode structs are only passed to inline functions defined in
* this header, and never escape to their slow paths, so they should
* not result in any object file size increase.
*/
struct ck_ec_mode {
const struct ck_ec_ops *ops;
/*
* If single_producer is true, the event count has a unique
* incrementer. The implementation will specialize ck_ec_inc
* and ck_ec_add if possible (if CK_F_EC_SP is defined).
*/
bool single_producer;
};
struct ck_ec32 {
/* Flag is "sign" bit, value in bits 0:30. */
uint32_t counter;
};
typedef struct ck_ec32 ck_ec32_t;
#ifdef CK_F_EC64
struct ck_ec64 {
/*
* Flag is bottom bit, value in bits 1:63. Eventcount only
* works on x86-64 (i.e., little endian), so the futex int
* lies in the first 4 (bottom) bytes.
*/
uint64_t counter;
};
typedef struct ck_ec64 ck_ec64_t;
#endif /* CK_F_EC64 */
#define CK_EC_INITIALIZER { .counter = 0 }
/*
* Initializes the event count to `value`. The value must not
* exceed INT32_MAX.
*/
static void ck_ec32_init(struct ck_ec32 *ec, uint32_t value);
#ifndef CK_F_EC64
#define ck_ec_init ck_ec32_init
#else
/*
* Initializes the event count to `value`. The value must not
* exceed INT64_MAX.
*/
static void ck_ec64_init(struct ck_ec64 *ec, uint64_t value);
#if __STDC_VERSION__ >= 201112L
#define ck_ec_init(EC, VALUE) \
(_Generic(*(EC), \
struct ck_ec32 : ck_ec32_init, \
struct ck_ec64 : ck_ec64_init)((EC), (VALUE)))
#endif /* __STDC_VERSION__ */
#endif /* CK_F_EC64 */
/*
* Returns the counter value in the event count. The value is at most
* INT32_MAX.
*/
static uint32_t ck_ec32_value(const struct ck_ec32* ec);
#ifndef CK_F_EC64
#define ck_ec_value ck_ec32_value
#else
/*
* Returns the counter value in the event count. The value is at most
* INT64_MAX.
*/
static uint64_t ck_ec64_value(const struct ck_ec64* ec);
#if __STDC_VERSION__ >= 201112L
#define ck_ec_value(EC) \
(_Generic(*(EC), \
struct ck_ec32 : ck_ec32_value, \
struct ck_ec64 : ck_ec64_value)((EC)))
#endif /* __STDC_VERSION__ */
#endif /* CK_F_EC64 */
/*
* Returns whether there may be slow pathed waiters that need an
* explicit OS wakeup for this event count.
*/
static bool ck_ec32_has_waiters(const struct ck_ec32 *ec);
#ifndef CK_F_EC64
#define ck_ec_has_waiters ck_ec32_has_waiters
#else
static bool ck_ec64_has_waiters(const struct ck_ec64 *ec);
#if __STDC_VERSION__ >= 201112L
#define ck_ec_has_waiters(EC) \
(_Generic(*(EC), \
struct ck_ec32 : ck_ec32_has_waiters, \
struct ck_ec64 : ck_ec64_has_waiters)((EC)))
#endif /* __STDC_VERSION__ */
#endif /* CK_F_EC64 */
/*
* Increments the counter value in the event count by one, and wakes
* up any waiter.
*/
static void ck_ec32_inc(struct ck_ec32 *ec, const struct ck_ec_mode *mode);
#ifndef CK_F_EC64
#define ck_ec_inc ck_ec32_inc
#else
static void ck_ec64_inc(struct ck_ec64 *ec, const struct ck_ec_mode *mode);
#if __STDC_VERSION__ >= 201112L
#define ck_ec_inc(EC, MODE) \
(_Generic(*(EC), \
struct ck_ec32 : ck_ec32_inc, \
struct ck_ec64 : ck_ec64_inc)((EC), (MODE)))
#endif /* __STDC_VERSION__ */
#endif /* CK_F_EC64 */
/*
* Increments the counter value in the event count by delta, wakes
* up any waiter, and returns the previous counter value.
*/
static uint32_t ck_ec32_add(struct ck_ec32 *ec,
const struct ck_ec_mode *mode,
uint32_t delta);
#ifndef CK_F_EC64
#define ck_ec_add ck_ec32_add
#else
static uint64_t ck_ec64_add(struct ck_ec64 *ec,
const struct ck_ec_mode *mode,
uint64_t delta);
#if __STDC_VERSION__ >= 201112L
#define ck_ec_add(EC, MODE, DELTA) \
(_Generic(*(EC), \
struct ck_ec32 : ck_ec32_add, \
struct ck_ec64 : ck_ec64_add)((EC), (MODE), (DELTA)))
#endif /* __STDC_VERSION__ */
#endif /* CK_F_EC64 */
/*
* Populates `new_deadline` with a deadline `timeout` in the future.
* Returns 0 on success, and -1 if clock_gettime failed, in which
* case errno is left as is.
*/
static int ck_ec_deadline(struct timespec *new_deadline,
const struct ck_ec_mode *mode,
const struct timespec *timeout);
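/*
 * For illustration only (not part of this header): converting a relative
 * 10 millisecond timeout into an absolute deadline that can later be
 * handed to ck_ec_wait.
 */
static int
example_ec_deadline(struct timespec *deadline, const struct ck_ec_mode *mode)
{
	const struct timespec timeout = {
		.tv_sec = 0,
		.tv_nsec = 10 * 1000 * 1000
	};

	return ck_ec_deadline(deadline, mode, &timeout);
}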
/*
* Waits until the counter value in the event count differs from
* old_value, or, if deadline is non-NULL, until CLOCK_MONOTONIC is
* past the deadline.
*
* Returns 0 on success, and -1 on timeout.
*/
static int ck_ec32_wait(struct ck_ec32 *ec,
const struct ck_ec_mode *mode,
uint32_t old_value,
const struct timespec *deadline);
#ifndef CK_F_EC64
#define ck_ec_wait ck_ec32_wait
#else
static int ck_ec64_wait(struct ck_ec64 *ec,
const struct ck_ec_mode *mode,
uint64_t old_value,
const struct timespec *deadline);
#if __STDC_VERSION__ >= 201112L
#define ck_ec_wait(EC, MODE, OLD_VALUE, DEADLINE) \
(_Generic(*(EC), \
struct ck_ec32 : ck_ec32_wait, \
struct ck_ec64 : ck_ec64_wait)((EC), (MODE), \
(OLD_VALUE), (DEADLINE)))
#endif /* __STDC_VERSION__ */
#endif /* CK_F_EC64 */
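/*
 * Illustrative usage sketch (not part of this header): the consumer
 * snapshots the counter, re-checks its condition, and only then blocks,
 * so an increment that races with the check is never missed.  The
 * work_available() and publish_work() hooks are assumed application
 * functions.
 */
extern bool work_available(void);
extern void publish_work(void);

static void
example_consumer(struct ck_ec32 *ec, const struct ck_ec_mode *mode)
{

	for (;;) {
		uint32_t snapshot = ck_ec32_value(ec);

		if (work_available() == true)
			break;

		/* Sleeps until the counter differs from the snapshot. */
		(void)ck_ec32_wait(ec, mode, snapshot, NULL);
	}

	return;
}

static void
example_producer(struct ck_ec32 *ec, const struct ck_ec_mode *mode)
{

	publish_work();
	ck_ec32_inc(ec, mode);
	return;
}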
/*
* Waits until the counter value in the event count differs from
* old_value, pred returns non-zero, or, if deadline is non-NULL,
* until CLOCK_MONOTONIC is past the deadline.
*
* Returns 0 on success, -1 on timeout, and the return value of pred
* if it returns non-zero.
*
* A NULL pred represents a function that always returns 0.
*/
static int ck_ec32_wait_pred(struct ck_ec32 *ec,
const struct ck_ec_mode *mode,
uint32_t old_value,
int (*pred)(const struct ck_ec_wait_state *,
struct timespec *deadline),
void *data,
const struct timespec *deadline);
#ifndef CK_F_EC64
#define ck_ec_wait_pred ck_ec32_wait_pred
#else
static int ck_ec64_wait_pred(struct ck_ec64 *ec,
const struct ck_ec_mode *mode,
uint64_t old_value,
int (*pred)(const struct ck_ec_wait_state *,
struct timespec *deadline),
void *data,
const struct timespec *deadline);
#if __STDC_VERSION__ >= 201112L
#define ck_ec_wait_pred(EC, MODE, OLD_VALUE, PRED, DATA, DEADLINE) \
(_Generic(*(EC), \
struct ck_ec32 : ck_ec32_wait_pred, \
struct ck_ec64 : ck_ec64_wait_pred) \
((EC), (MODE), (OLD_VALUE), (PRED), (DATA), (DEADLINE)))
#endif /* __STDC_VERSION__ */
#endif /* CK_F_EC64 */
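/*
 * Illustrative sketch (not part of this header): a wait predicate that
 * aborts the wait once an application-level cancellation flag, passed
 * through the opaque data pointer, becomes non-zero.
 */
static int
example_cancel_pred(const struct ck_ec_wait_state *state,
    struct timespec *deadline)
{
	const int *cancelled = state->data;

	(void)deadline;
	if (ck_pr_load_int(cancelled) != 0)
		return -1;	/* Non-zero return aborts ck_ec_wait_pred. */

	return 0;
}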
/*
* Inline implementation details. 32 bit first, then 64 bit
* conditionally.
*/
CK_CC_FORCE_INLINE void ck_ec32_init(struct ck_ec32 *ec, uint32_t value)
{
ec->counter = value & ~(1UL << 31);
return;
}
CK_CC_FORCE_INLINE uint32_t ck_ec32_value(const struct ck_ec32 *ec)
{
uint32_t ret = ck_pr_load_32(&ec->counter) & ~(1UL << 31);
ck_pr_fence_acquire();
return ret;
}
CK_CC_FORCE_INLINE bool ck_ec32_has_waiters(const struct ck_ec32 *ec)
{
return ck_pr_load_32(&ec->counter) & (1UL << 31);
}
/* Slow path for ck_ec{32,64}_{inc,add} */
void ck_ec32_wake(struct ck_ec32 *ec, const struct ck_ec_ops *ops);
CK_CC_FORCE_INLINE void ck_ec32_inc(struct ck_ec32 *ec,
const struct ck_ec_mode *mode)
{
#if !defined(CK_F_EC_SP)
/* Nothing to specialize if we don't have EC_SP. */
ck_ec32_add(ec, mode, 1);
return;
#else
char flagged;
#if __GNUC__ >= 6
/*
* We don't want to wake if the sign bit is 0. We do want to
* wake if the sign bit just flipped from 1 to 0. We don't
* care what happens when our increment causes the sign bit to
* flip from 0 to 1 (that happens once per 2^31 increments).
*
* This leaves us with four cases:
*
* old sign bit | new sign bit | SF | OF | ZF
* -------------------------------------------
* 0 | 0 | 0 | 0 | ?
* 0 | 1 | 1 | 0 | ?
* 1 | 1 | 1 | 0 | ?
* 1 | 0 | 0 | 0 | 1
*
* In the first case, we don't want to hit ck_ec32_wake. In
* the last two cases, we do want to call ck_ec32_wake. In the
* second case, we don't care, so we arbitrarily choose to
* call ck_ec32_wake.
*
* The "le" condition checks if SF != OF, or ZF == 1, which
* meets our requirements.
*/
#define CK_EC32_INC_ASM(PREFIX) \
__asm__ volatile(PREFIX " incl %0" \
: "+m"(ec->counter), "=@ccle"(flagged) \
:: "cc", "memory")
#else
#define CK_EC32_INC_ASM(PREFIX) \
__asm__ volatile(PREFIX " incl %0; setle %1" \
: "+m"(ec->counter), "=r"(flagged) \
:: "cc", "memory")
#endif /* __GNUC__ */
if (mode->single_producer == true) {
ck_pr_fence_store();
CK_EC32_INC_ASM("");
} else {
ck_pr_fence_store_atomic();
CK_EC32_INC_ASM("lock");
}
#undef CK_EC32_INC_ASM
if (CK_CC_UNLIKELY(flagged)) {
ck_ec32_wake(ec, mode->ops);
}
return;
#endif /* CK_F_EC_SP */
}
CK_CC_FORCE_INLINE uint32_t ck_ec32_add_epilogue(struct ck_ec32 *ec,
const struct ck_ec_mode *mode,
uint32_t old)
{
const uint32_t flag_mask = 1U << 31;
uint32_t ret;
ret = old & ~flag_mask;
/* These two only differ if the flag bit is set. */
if (CK_CC_UNLIKELY(old != ret)) {
ck_ec32_wake(ec, mode->ops);
}
return ret;
}
static CK_CC_INLINE uint32_t ck_ec32_add_mp(struct ck_ec32 *ec,
const struct ck_ec_mode *mode,
uint32_t delta)
{
uint32_t old;
ck_pr_fence_store_atomic();
old = ck_pr_faa_32(&ec->counter, delta);
return ck_ec32_add_epilogue(ec, mode, old);
}
#ifdef CK_F_EC_SP
static CK_CC_INLINE uint32_t ck_ec32_add_sp(struct ck_ec32 *ec,
const struct ck_ec_mode *mode,
uint32_t delta)
{
uint32_t old;
/*
* Correctness of this racy write depends on actually
* having an update to write. Exit here if the update
* is a no-op.
*/
if (CK_CC_UNLIKELY(delta == 0)) {
return ck_ec32_value(ec);
}
ck_pr_fence_store();
old = delta;
__asm__ volatile("xaddl %1, %0"
: "+m"(ec->counter), "+r"(old)
:: "cc", "memory");
return ck_ec32_add_epilogue(ec, mode, old);
}
#endif /* CK_F_EC_SP */
CK_CC_FORCE_INLINE uint32_t ck_ec32_add(struct ck_ec32 *ec,
const struct ck_ec_mode *mode,
uint32_t delta)
{
#ifdef CK_F_EC_SP
if (mode->single_producer == true) {
return ck_ec32_add_sp(ec, mode, delta);
}
#endif
return ck_ec32_add_mp(ec, mode, delta);
}
int ck_ec_deadline_impl(struct timespec *new_deadline,
const struct ck_ec_ops *ops,
const struct timespec *timeout);
CK_CC_FORCE_INLINE int ck_ec_deadline(struct timespec *new_deadline,
const struct ck_ec_mode *mode,
const struct timespec *timeout)
{
return ck_ec_deadline_impl(new_deadline, mode->ops, timeout);
}
int ck_ec32_wait_slow(struct ck_ec32 *ec,
const struct ck_ec_ops *ops,
uint32_t old_value,
const struct timespec *deadline);
CK_CC_FORCE_INLINE int ck_ec32_wait(struct ck_ec32 *ec,
const struct ck_ec_mode *mode,
uint32_t old_value,
const struct timespec *deadline)
{
if (ck_ec32_value(ec) != old_value) {
return 0;
}
return ck_ec32_wait_slow(ec, mode->ops, old_value, deadline);
}
int ck_ec32_wait_pred_slow(struct ck_ec32 *ec,
const struct ck_ec_ops *ops,
uint32_t old_value,
int (*pred)(const struct ck_ec_wait_state *state,
struct timespec *deadline),
void *data,
const struct timespec *deadline);
CK_CC_FORCE_INLINE int
ck_ec32_wait_pred(struct ck_ec32 *ec,
const struct ck_ec_mode *mode,
uint32_t old_value,
int (*pred)(const struct ck_ec_wait_state *state,
struct timespec *deadline),
void *data,
const struct timespec *deadline)
{
if (ck_ec32_value(ec) != old_value) {
return 0;
}
return ck_ec32_wait_pred_slow(ec, mode->ops, old_value,
pred, data, deadline);
}
#ifdef CK_F_EC64
CK_CC_FORCE_INLINE void ck_ec64_init(struct ck_ec64 *ec, uint64_t value)
{
ec->counter = value << 1;
return;
}
CK_CC_FORCE_INLINE uint64_t ck_ec64_value(const struct ck_ec64 *ec)
{
uint64_t ret = ck_pr_load_64(&ec->counter) >> 1;
ck_pr_fence_acquire();
return ret;
}
CK_CC_FORCE_INLINE bool ck_ec64_has_waiters(const struct ck_ec64 *ec)
{
return ck_pr_load_64(&ec->counter) & 1;
}
void ck_ec64_wake(struct ck_ec64 *ec, const struct ck_ec_ops *ops);
CK_CC_FORCE_INLINE void ck_ec64_inc(struct ck_ec64 *ec,
const struct ck_ec_mode *mode)
{
/* We always xadd, so there's no special optimization here. */
(void)ck_ec64_add(ec, mode, 1);
return;
}
CK_CC_FORCE_INLINE uint64_t ck_ec_add64_epilogue(struct ck_ec64 *ec,
const struct ck_ec_mode *mode,
uint64_t old)
{
uint64_t ret = old >> 1;
if (CK_CC_UNLIKELY(old & 1)) {
ck_ec64_wake(ec, mode->ops);
}
return ret;
}
static CK_CC_INLINE uint64_t ck_ec64_add_mp(struct ck_ec64 *ec,
const struct ck_ec_mode *mode,
uint64_t delta)
{
uint64_t inc = 2 * delta; /* The low bit is the flag bit. */
ck_pr_fence_store_atomic();
return ck_ec_add64_epilogue(ec, mode, ck_pr_faa_64(&ec->counter, inc));
}
#ifdef CK_F_EC_SP
/* Single-producer specialisation. */
static CK_CC_INLINE uint64_t ck_ec64_add_sp(struct ck_ec64 *ec,
const struct ck_ec_mode *mode,
uint64_t delta)
{
uint64_t old;
/*
* Correctness of this racy write depends on actually
* having an update to write. Exit here if the update
* is a no-op.
*/
if (CK_CC_UNLIKELY(delta == 0)) {
return ck_ec64_value(ec);
}
ck_pr_fence_store();
old = 2 * delta; /* The low bit is the flag bit. */
__asm__ volatile("xaddq %1, %0"
: "+m"(ec->counter), "+r"(old)
:: "cc", "memory");
return ck_ec_add64_epilogue(ec, mode, old);
}
#endif /* CK_F_EC_SP */
/*
* Dispatch on mode->single_producer in this FORCE_INLINE function:
* the end result is always small, but not all compilers have enough
* foresight to inline and get the reduction.
*/
CK_CC_FORCE_INLINE uint64_t ck_ec64_add(struct ck_ec64 *ec,
const struct ck_ec_mode *mode,
uint64_t delta)
{
#ifdef CK_F_EC_SP
if (mode->single_producer == true) {
return ck_ec64_add_sp(ec, mode, delta);
}
#endif
return ck_ec64_add_mp(ec, mode, delta);
}
int ck_ec64_wait_slow(struct ck_ec64 *ec,
const struct ck_ec_ops *ops,
uint64_t old_value,
const struct timespec *deadline);
CK_CC_FORCE_INLINE int ck_ec64_wait(struct ck_ec64 *ec,
const struct ck_ec_mode *mode,
uint64_t old_value,
const struct timespec *deadline)
{
if (ck_ec64_value(ec) != old_value) {
return 0;
}
return ck_ec64_wait_slow(ec, mode->ops, old_value, deadline);
}
int ck_ec64_wait_pred_slow(struct ck_ec64 *ec,
const struct ck_ec_ops *ops,
uint64_t old_value,
int (*pred)(const struct ck_ec_wait_state *state,
struct timespec *deadline),
void *data,
const struct timespec *deadline);
CK_CC_FORCE_INLINE int
ck_ec64_wait_pred(struct ck_ec64 *ec,
const struct ck_ec_mode *mode,
uint64_t old_value,
int (*pred)(const struct ck_ec_wait_state *state,
struct timespec *deadline),
void *data,
const struct timespec *deadline)
{
if (ck_ec64_value(ec) != old_value) {
return 0;
}
return ck_ec64_wait_pred_slow(ec, mode->ops, old_value,
pred, data, deadline);
}
#endif /* CK_F_EC64 */
#endif /* !CK_EC_H */

View file

@ -83,6 +83,7 @@ struct ck_epoch_ref {
};
struct ck_epoch_record {
ck_stack_entry_t record_next;
struct ck_epoch *global;
unsigned int state;
unsigned int epoch;
@ -92,17 +93,16 @@ struct ck_epoch_record {
} local CK_CC_CACHELINE;
unsigned int n_pending;
unsigned int n_peak;
unsigned long n_dispatch;
unsigned int n_dispatch;
void *ct;
ck_stack_t pending[CK_EPOCH_LENGTH];
ck_stack_entry_t record_next;
} CK_CC_CACHELINE;
typedef struct ck_epoch_record ck_epoch_record_t;
struct ck_epoch {
unsigned int epoch;
char pad[CK_MD_CACHELINE - sizeof(unsigned int)];
ck_stack_t records;
unsigned int n_free;
ck_stack_t records;
};
typedef struct ck_epoch ck_epoch_t;
@ -110,7 +110,14 @@ typedef struct ck_epoch ck_epoch_t;
* Internal functions.
*/
void _ck_epoch_addref(ck_epoch_record_t *, ck_epoch_section_t *);
void _ck_epoch_delref(ck_epoch_record_t *, ck_epoch_section_t *);
bool _ck_epoch_delref(ck_epoch_record_t *, ck_epoch_section_t *);
CK_CC_FORCE_INLINE static void *
ck_epoch_record_ct(const ck_epoch_record_t *record)
{
return ck_pr_load_ptr(&record->ct);
}
/*
* Marks the beginning of an epoch-protected section.
@ -160,9 +167,10 @@ ck_epoch_begin(ck_epoch_record_t *record, ck_epoch_section_t *section)
}
/*
* Marks the end of an epoch-protected section.
* Marks the end of an epoch-protected section. Returns true if no more
* sections exist for the caller.
*/
CK_CC_FORCE_INLINE static void
CK_CC_FORCE_INLINE static bool
ck_epoch_end(ck_epoch_record_t *record, ck_epoch_section_t *section)
{
@ -170,15 +178,19 @@ ck_epoch_end(ck_epoch_record_t *record, ck_epoch_section_t *section)
ck_pr_store_uint(&record->active, record->active - 1);
if (section != NULL)
_ck_epoch_delref(record, section);
return _ck_epoch_delref(record, section);
return;
return record->active == 0;
}
/*
* Defers the execution of the function pointed to by the "cb"
* argument until an epoch counter loop. This allows for a
* non-blocking deferral.
*
* We can get away without a fence here due to the monotonic nature
* of the epoch counter. Worst case, this will result in some delays
* before object destruction.
*/
CK_CC_FORCE_INLINE static void
ck_epoch_call(ck_epoch_record_t *record,
@ -195,13 +207,75 @@ ck_epoch_call(ck_epoch_record_t *record,
return;
}
/*
* Same as ck_epoch_call, but allows for records to be shared and is reentrant.
*/
CK_CC_FORCE_INLINE static void
ck_epoch_call_strict(ck_epoch_record_t *record,
ck_epoch_entry_t *entry,
ck_epoch_cb_t *function)
{
struct ck_epoch *epoch = record->global;
unsigned int e = ck_pr_load_uint(&epoch->epoch);
unsigned int offset = e & (CK_EPOCH_LENGTH - 1);
ck_pr_inc_uint(&record->n_pending);
entry->function = function;
/* Store fence is implied by push operation. */
ck_stack_push_upmc(&record->pending[offset], &entry->stack_entry);
return;
}
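/*
 * A usage sketch (not part of these headers): deferring destruction of
 * a node whose ck_epoch_entry is its first member, so the callback can
 * recover the node with a plain cast.  Assumes <stdlib.h> for free().
 */
struct example_node {
	ck_epoch_entry_t entry;		/* Must be first for the cast below. */
	void *payload;
};

static void
example_node_destroy(ck_epoch_entry_t *e)
{

	free((struct example_node *)e);
	return;
}

static void
example_node_retire(ck_epoch_record_t *record, struct example_node *node)
{

	ck_epoch_call_strict(record, &node->entry, example_node_destroy);
	return;
}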
/*
* This callback is used for synchronize_wait to allow for custom blocking
* behavior.
*/
typedef void ck_epoch_wait_cb_t(ck_epoch_t *, ck_epoch_record_t *,
void *);
/*
* Return latest epoch value. This operation provides load ordering.
*/
CK_CC_FORCE_INLINE static unsigned int
ck_epoch_value(const ck_epoch_t *ep)
{
ck_pr_fence_load();
return ck_pr_load_uint(&ep->epoch);
}
void ck_epoch_init(ck_epoch_t *);
ck_epoch_record_t *ck_epoch_recycle(ck_epoch_t *);
void ck_epoch_register(ck_epoch_t *, ck_epoch_record_t *);
/*
* Attempts to recycle an unused epoch record. If one is successfully
* allocated, the record context pointer is also updated.
*/
ck_epoch_record_t *ck_epoch_recycle(ck_epoch_t *, void *);
/*
* Registers an epoch record. An optional context pointer may be passed that
* is retrievable with ck_epoch_record_ct.
*/
void ck_epoch_register(ck_epoch_t *, ck_epoch_record_t *, void *);
/*
* Marks a record as available for re-use by a subsequent recycle operation.
* Note that the record cannot be physically destroyed.
*/
void ck_epoch_unregister(ck_epoch_record_t *);
bool ck_epoch_poll(ck_epoch_record_t *);
bool ck_epoch_poll_deferred(struct ck_epoch_record *record, ck_stack_t *deferred);
void ck_epoch_synchronize(ck_epoch_record_t *);
void ck_epoch_synchronize_wait(ck_epoch_t *, ck_epoch_wait_cb_t *, void *);
void ck_epoch_barrier(ck_epoch_record_t *);
void ck_epoch_barrier_wait(ck_epoch_record_t *, ck_epoch_wait_cb_t *, void *);
/*
* Reclaim entries associated with a record. This is safe to call only on
* the caller's record or records that are using call_strict.
*/
void ck_epoch_reclaim(ck_epoch_record_t *);
#endif /* CK_EPOCH_H */

View file

@ -115,7 +115,7 @@ CK_CC_INLINE static void
ck_fifo_spsc_deinit(struct ck_fifo_spsc *fifo, struct ck_fifo_spsc_entry **garbage)
{
*garbage = fifo->head;
*garbage = fifo->garbage;
fifo->head = fifo->tail = NULL;
return;
}

View file

@ -100,18 +100,28 @@ struct ck_hs_stat {
struct ck_hs_iterator {
void **cursor;
unsigned long offset;
struct ck_hs_map *map;
};
typedef struct ck_hs_iterator ck_hs_iterator_t;
#define CK_HS_ITERATOR_INITIALIZER { NULL, 0 }
#define CK_HS_ITERATOR_INITIALIZER { NULL, 0, NULL }
/* Convenience wrapper to table hash function. */
#define CK_HS_HASH(T, F, K) F((K), (T)->seed)
/* Computes the hash of n bytes of k for the specified hash map. */
static inline unsigned long
ck_hs_hash(const struct ck_hs *hs, const void *k)
{
return hs->hf(k, hs->seed);
}
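/*
 * Illustrative sketch (not part of this header): ck_hs_hash pairs the
 * set's configured hash function with its seed, so a lookup helper does
 * not need to replumb the seed itself.
 */
static void *
example_hs_lookup(ck_hs_t *hs, const void *key)
{
	unsigned long h = ck_hs_hash(hs, key);

	return ck_hs_get(hs, h, key);
}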
typedef void *ck_hs_apply_fn_t(void *, void *);
bool ck_hs_apply(ck_hs_t *, unsigned long, const void *, ck_hs_apply_fn_t *, void *);
void ck_hs_iterator_init(ck_hs_iterator_t *);
bool ck_hs_next(ck_hs_t *, ck_hs_iterator_t *, void **);
bool ck_hs_next_spmc(ck_hs_t *, ck_hs_iterator_t *, void **);
bool ck_hs_move(ck_hs_t *, ck_hs_t *, ck_hs_hash_cb_t *,
ck_hs_compare_cb_t *, struct ck_malloc *);
bool ck_hs_init(ck_hs_t *, unsigned int, ck_hs_hash_cb_t *,

View file

@ -47,7 +47,15 @@
#define @POINTER_PACK_ENABLE@
#endif /* @POINTER_PACK_ENABLE@ */
#ifndef @VMA_BITS@
#ifndef @SSE_DISABLE@
#define @SSE_DISABLE@
#endif /* @SSE_DISABLE@ */
#ifndef @PPC32_LWSYNC_ENABLE@
#define @PPC32_LWSYNC_ENABLE@
#endif /* @PPC32_LWSYNC_ENABLE@ */
#ifndef @VMA_BITS@
#define @VMA_BITS@ @VMA_BITS_VALUE@
#endif /* @VMA_BITS@ */

View file

@ -34,7 +34,20 @@
#include <ck_stdint.h>
#include <ck_stdbool.h>
#ifndef CK_USE_CC_BUILTINS
/*
* Default to using builtins for clang analyzer, coverity, and sparse:
* inline assembly is often too opaque for useful analysis. Override
* the defaults by defining CK_USE_CC_BUILTINS=0 or 1.
*/
#if !defined(CK_USE_CC_BUILTINS)
#if defined(__clang_analyzer__) || defined(__COVERITY__) || defined(__CHECKER__)
#define CK_USE_CC_BUILTINS 1
#else
#define CK_USE_CC_BUILTINS 0
#endif
#endif
#if !CK_USE_CC_BUILTINS
#if defined(__x86_64__)
#include "gcc/x86_64/ck_pr.h"
#elif defined(__x86__)
@ -43,6 +56,8 @@
#include "gcc/sparcv9/ck_pr.h"
#elif defined(__ppc64__)
#include "gcc/ppc64/ck_pr.h"
#elif defined(__s390x__)
#include "gcc/s390x/ck_pr.h"
#elif defined(__ppc__)
#include "gcc/ppc/ck_pr.h"
#elif defined(__arm__)
@ -613,8 +628,8 @@ CK_PR_BTX_S(bts, 16, uint16_t, |,)
}
#define CK_PR_UNARY_Z(K, S, M, T, P, C, Z) \
CK_CC_INLINE static void \
ck_pr_##K##_##S##_zero(M *target, bool *zero) \
CK_CC_INLINE static bool \
ck_pr_##K##_##S##_is_zero(M *target) \
{ \
T previous; \
C punt; \
@ -625,12 +640,21 @@ CK_PR_BTX_S(bts, 16, uint16_t, |,)
(C)(previous P 1), \
&previous) == false) \
ck_pr_stall(); \
*zero = previous == (T)Z; \
return previous == (T)Z; \
}
#define CK_PR_UNARY_Z_STUB(K, S, M) \
CK_CC_INLINE static void \
ck_pr_##K##_##S##_zero(M *target, bool *zero) \
{ \
*zero = ck_pr_##K##_##S##_is_zero(target); \
return; \
}
#define CK_PR_UNARY_S(K, X, S, M) CK_PR_UNARY(K, X, S, M, M)
#define CK_PR_UNARY_Z_S(K, S, M, P, Z) CK_PR_UNARY_Z(K, S, M, M, P, M, Z)
#define CK_PR_UNARY_Z_S(K, S, M, P, Z) \
CK_PR_UNARY_Z(K, S, M, M, P, M, Z) \
CK_PR_UNARY_Z_STUB(K, S, M)
#if defined(CK_F_PR_LOAD_CHAR) && defined(CK_F_PR_CAS_CHAR_VALUE)
@ -642,6 +666,8 @@ CK_PR_UNARY_S(inc, add, char, char)
#ifndef CK_F_PR_INC_CHAR_ZERO
#define CK_F_PR_INC_CHAR_ZERO
CK_PR_UNARY_Z_S(inc, char, char, +, -1)
#else
CK_PR_UNARY_Z_STUB(inc, char, char)
#endif /* CK_F_PR_INC_CHAR_ZERO */
#ifndef CK_F_PR_DEC_CHAR
@ -652,6 +678,8 @@ CK_PR_UNARY_S(dec, sub, char, char)
#ifndef CK_F_PR_DEC_CHAR_ZERO
#define CK_F_PR_DEC_CHAR_ZERO
CK_PR_UNARY_Z_S(dec, char, char, -, 1)
#else
CK_PR_UNARY_Z_STUB(dec, char, char)
#endif /* CK_F_PR_DEC_CHAR_ZERO */
#endif /* CK_F_PR_LOAD_CHAR && CK_F_PR_CAS_CHAR_VALUE */
@ -666,6 +694,8 @@ CK_PR_UNARY_S(inc, add, int, int)
#ifndef CK_F_PR_INC_INT_ZERO
#define CK_F_PR_INC_INT_ZERO
CK_PR_UNARY_Z_S(inc, int, int, +, -1)
#else
CK_PR_UNARY_Z_STUB(inc, int, int)
#endif /* CK_F_PR_INC_INT_ZERO */
#ifndef CK_F_PR_DEC_INT
@ -676,6 +706,8 @@ CK_PR_UNARY_S(dec, sub, int, int)
#ifndef CK_F_PR_DEC_INT_ZERO
#define CK_F_PR_DEC_INT_ZERO
CK_PR_UNARY_Z_S(dec, int, int, -, 1)
#else
CK_PR_UNARY_Z_STUB(dec, int, int)
#endif /* CK_F_PR_DEC_INT_ZERO */
#endif /* CK_F_PR_LOAD_INT && CK_F_PR_CAS_INT_VALUE */
@ -705,6 +737,8 @@ CK_PR_UNARY_S(inc, add, uint, unsigned int)
#ifndef CK_F_PR_INC_UINT_ZERO
#define CK_F_PR_INC_UINT_ZERO
CK_PR_UNARY_Z_S(inc, uint, unsigned int, +, UINT_MAX)
#else
CK_PR_UNARY_Z_STUB(inc, uint, unsigned int)
#endif /* CK_F_PR_INC_UINT_ZERO */
#ifndef CK_F_PR_DEC_UINT
@ -715,6 +749,8 @@ CK_PR_UNARY_S(dec, sub, uint, unsigned int)
#ifndef CK_F_PR_DEC_UINT_ZERO
#define CK_F_PR_DEC_UINT_ZERO
CK_PR_UNARY_Z_S(dec, uint, unsigned int, -, 1)
#else
CK_PR_UNARY_Z_STUB(dec, uint, unsigned int)
#endif /* CK_F_PR_DEC_UINT_ZERO */
#endif /* CK_F_PR_LOAD_UINT && CK_F_PR_CAS_UINT_VALUE */
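/*
 * For illustration only (not part of this header): the generated
 * *_is_zero variants return the zero test directly, which suits
 * reference counting better than the older out-parameter form.
 */
struct example_object {
	unsigned int refcnt;
	/* ... payload ... */
};

extern void example_object_destroy(struct example_object *);

static void
example_object_release(struct example_object *object)
{

	if (ck_pr_dec_uint_is_zero(&object->refcnt) == true)
		example_object_destroy(object);

	return;
}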
@ -729,6 +765,8 @@ CK_PR_UNARY(inc, add, ptr, void, uintptr_t)
#ifndef CK_F_PR_INC_PTR_ZERO
#define CK_F_PR_INC_PTR_ZERO
CK_PR_UNARY_Z(inc, ptr, void, uintptr_t, +, void *, UINT_MAX)
#else
CK_PR_UNARY_Z_STUB(inc, ptr, void)
#endif /* CK_F_PR_INC_PTR_ZERO */
#ifndef CK_F_PR_DEC_PTR
@ -739,6 +777,8 @@ CK_PR_UNARY(dec, sub, ptr, void, uintptr_t)
#ifndef CK_F_PR_DEC_PTR_ZERO
#define CK_F_PR_DEC_PTR_ZERO
CK_PR_UNARY_Z(dec, ptr, void, uintptr_t, -, void *, 1)
#else
CK_PR_UNARY_Z_STUB(dec, ptr, void)
#endif /* CK_F_PR_DEC_PTR_ZERO */
#endif /* CK_F_PR_LOAD_PTR && CK_F_PR_CAS_PTR_VALUE */
@ -753,6 +793,8 @@ CK_PR_UNARY_S(inc, add, 64, uint64_t)
#ifndef CK_F_PR_INC_64_ZERO
#define CK_F_PR_INC_64_ZERO
CK_PR_UNARY_Z_S(inc, 64, uint64_t, +, UINT64_MAX)
#else
CK_PR_UNARY_Z_STUB(inc, 64, uint64_t)
#endif /* CK_F_PR_INC_64_ZERO */
#ifndef CK_F_PR_DEC_64
@ -763,6 +805,8 @@ CK_PR_UNARY_S(dec, sub, 64, uint64_t)
#ifndef CK_F_PR_DEC_64_ZERO
#define CK_F_PR_DEC_64_ZERO
CK_PR_UNARY_Z_S(dec, 64, uint64_t, -, 1)
#else
CK_PR_UNARY_Z_STUB(dec, 64, uint64_t)
#endif /* CK_F_PR_DEC_64_ZERO */
#endif /* CK_F_PR_LOAD_64 && CK_F_PR_CAS_64_VALUE */
@ -777,6 +821,8 @@ CK_PR_UNARY_S(inc, add, 32, uint32_t)
#ifndef CK_F_PR_INC_32_ZERO
#define CK_F_PR_INC_32_ZERO
CK_PR_UNARY_Z_S(inc, 32, uint32_t, +, UINT32_MAX)
#else
CK_PR_UNARY_Z_STUB(inc, 32, uint32_t)
#endif /* CK_F_PR_INC_32_ZERO */
#ifndef CK_F_PR_DEC_32
@ -787,6 +833,8 @@ CK_PR_UNARY_S(dec, sub, 32, uint32_t)
#ifndef CK_F_PR_DEC_32_ZERO
#define CK_F_PR_DEC_32_ZERO
CK_PR_UNARY_Z_S(dec, 32, uint32_t, -, 1)
#else
CK_PR_UNARY_Z_STUB(dec, 32, uint32_t)
#endif /* CK_F_PR_DEC_32_ZERO */
#endif /* CK_F_PR_LOAD_32 && CK_F_PR_CAS_32_VALUE */
@ -801,6 +849,8 @@ CK_PR_UNARY_S(inc, add, 16, uint16_t)
#ifndef CK_F_PR_INC_16_ZERO
#define CK_F_PR_INC_16_ZERO
CK_PR_UNARY_Z_S(inc, 16, uint16_t, +, UINT16_MAX)
#else
CK_PR_UNARY_Z_STUB(inc, 16, uint16_t)
#endif /* CK_F_PR_INC_16_ZERO */
#ifndef CK_F_PR_DEC_16
@ -811,6 +861,8 @@ CK_PR_UNARY_S(dec, sub, 16, uint16_t)
#ifndef CK_F_PR_DEC_16_ZERO
#define CK_F_PR_DEC_16_ZERO
CK_PR_UNARY_Z_S(dec, 16, uint16_t, -, 1)
#else
CK_PR_UNARY_Z_STUB(dec, 16, uint16_t)
#endif /* CK_F_PR_DEC_16_ZERO */
#endif /* CK_F_PR_LOAD_16 && CK_F_PR_CAS_16_VALUE */
@ -825,6 +877,8 @@ CK_PR_UNARY_S(inc, add, 8, uint8_t)
#ifndef CK_F_PR_INC_8_ZERO
#define CK_F_PR_INC_8_ZERO
CK_PR_UNARY_Z_S(inc, 8, uint8_t, +, UINT8_MAX)
#else
CK_PR_UNARY_Z_STUB(inc, 8, uint8_t)
#endif /* CK_F_PR_INC_8_ZERO */
#ifndef CK_F_PR_DEC_8
@ -835,6 +889,8 @@ CK_PR_UNARY_S(dec, sub, 8, uint8_t)
#ifndef CK_F_PR_DEC_8_ZERO
#define CK_F_PR_DEC_8_ZERO
CK_PR_UNARY_Z_S(dec, 8, uint8_t, -, 1)
#else
CK_PR_UNARY_Z_STUB(dec, 8, uint8_t)
#endif /* CK_F_PR_DEC_8_ZERO */
#endif /* CK_F_PR_LOAD_8 && CK_F_PR_CAS_8_VALUE */

View file

@ -125,7 +125,7 @@
*/
#define CK_SLIST_HEAD(name, type) \
struct name { \
struct type *slh_first; /* first element */ \
struct type *cslh_first; /* first element */ \
}
#define CK_SLIST_HEAD_INITIALIZER(head) \
@ -133,85 +133,95 @@ struct name { \
#define CK_SLIST_ENTRY(type) \
struct { \
struct type *sle_next; /* next element */ \
struct type *csle_next; /* next element */ \
}
/*
* Singly-linked List functions.
*/
#define CK_SLIST_EMPTY(head) \
(ck_pr_load_ptr(&(head)->slh_first) == NULL)
(ck_pr_load_ptr(&(head)->cslh_first) == NULL)
#define CK_SLIST_FIRST(head) \
(ck_pr_load_ptr(&(head)->slh_first))
(ck_pr_load_ptr(&(head)->cslh_first))
#define CK_SLIST_NEXT(elm, field) \
ck_pr_load_ptr(&((elm)->field.sle_next))
ck_pr_load_ptr(&((elm)->field.csle_next))
#define CK_SLIST_FOREACH(var, head, field) \
for ((var) = CK_SLIST_FIRST((head)); \
(var) && (ck_pr_fence_load(), 1); \
(var); \
(var) = CK_SLIST_NEXT((var), field))
#define CK_SLIST_FOREACH_SAFE(var, head, field, tvar) \
for ((var) = CK_SLIST_FIRST(head); \
(var) && (ck_pr_fence_load(), (tvar) = CK_SLIST_NEXT(var, field), 1);\
#define CK_SLIST_FOREACH_SAFE(var, head, field, tvar) \
for ((var) = CK_SLIST_FIRST(head); \
(var) && ((tvar) = CK_SLIST_NEXT(var, field), 1); \
(var) = (tvar))
#define CK_SLIST_FOREACH_PREVPTR(var, varp, head, field) \
for ((varp) = &(head)->slh_first; \
((var) = ck_pr_load_ptr(varp)) != NULL && (ck_pr_fence_load(), 1); \
(varp) = &(var)->field.sle_next)
for ((varp) = &(head)->cslh_first; \
((var) = ck_pr_load_ptr(varp)) != NULL; \
(varp) = &(var)->field.csle_next)
#define CK_SLIST_INIT(head) do { \
ck_pr_store_ptr(&(head)->slh_first, NULL); \
ck_pr_store_ptr(&(head)->cslh_first, NULL); \
ck_pr_fence_store(); \
} while (0)
#define CK_SLIST_INSERT_AFTER(a, b, field) do { \
(b)->field.sle_next = (a)->field.sle_next; \
(b)->field.csle_next = (a)->field.csle_next; \
ck_pr_fence_store(); \
ck_pr_store_ptr(&(a)->field.sle_next, b); \
ck_pr_store_ptr(&(a)->field.csle_next, b); \
} while (0)
#define CK_SLIST_INSERT_HEAD(head, elm, field) do { \
(elm)->field.sle_next = (head)->slh_first; \
(elm)->field.csle_next = (head)->cslh_first; \
ck_pr_fence_store(); \
ck_pr_store_ptr(&(head)->slh_first, elm); \
ck_pr_store_ptr(&(head)->cslh_first, elm); \
} while (0)
#define CK_SLIST_INSERT_PREVPTR(prevp, slistelm, elm, field) do { \
(elm)->field.csle_next = (slistelm); \
ck_pr_fence_store(); \
ck_pr_store_ptr(prevp, elm); \
} while (0)
#define CK_SLIST_REMOVE_AFTER(elm, field) do { \
ck_pr_store_ptr(&(elm)->field.sle_next, \
(elm)->field.sle_next->field.sle_next); \
ck_pr_store_ptr(&(elm)->field.csle_next, \
(elm)->field.csle_next->field.csle_next); \
} while (0)
#define CK_SLIST_REMOVE(head, elm, type, field) do { \
if ((head)->slh_first == (elm)) { \
if ((head)->cslh_first == (elm)) { \
CK_SLIST_REMOVE_HEAD((head), field); \
} else { \
struct type *curelm = (head)->slh_first; \
while (curelm->field.sle_next != (elm)) \
curelm = curelm->field.sle_next; \
struct type *curelm = (head)->cslh_first; \
while (curelm->field.csle_next != (elm)) \
curelm = curelm->field.csle_next; \
CK_SLIST_REMOVE_AFTER(curelm, field); \
} \
} while (0)
#define CK_SLIST_REMOVE_HEAD(head, field) do { \
ck_pr_store_ptr(&(head)->slh_first, \
(head)->slh_first->field.sle_next); \
ck_pr_store_ptr(&(head)->cslh_first, \
(head)->cslh_first->field.csle_next); \
} while (0)
#define CK_SLIST_REMOVE_PREVPTR(prevp, elm, field) do { \
ck_pr_store_ptr(prevp, (elm)->field.csle_next); \
} while (0)
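/*
 * Illustrative sketch (not part of this header): the prev-pointer
 * helpers let a single writer unlink a matching element without a
 * second traversal.
 */
struct example_elem {
	int key;
	CK_SLIST_ENTRY(example_elem) link;
};
CK_SLIST_HEAD(example_elem_list, example_elem);

static void
example_elem_remove(struct example_elem_list *head, int key)
{
	struct example_elem *cursor, **prevp;

	CK_SLIST_FOREACH_PREVPTR(cursor, prevp, head, link) {
		if (cursor->key == key) {
			CK_SLIST_REMOVE_PREVPTR(prevp, cursor, link);
			break;
		}
	}

	return;
}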
#define CK_SLIST_MOVE(head1, head2, field) do { \
ck_pr_store_ptr(&(head1)->slh_first, (head2)->slh_first); \
ck_pr_store_ptr(&(head1)->cslh_first, (head2)->cslh_first); \
} while (0)
/*
* This operation is not applied atomically.
*/
#define CK_SLIST_SWAP(a, b, type) do { \
struct type *swap_first = (a)->slh_first; \
(a)->slh_first = (b)->slh_first; \
(b)->slh_first = swap_first; \
struct type *swap_first = (a)->cslh_first; \
(a)->cslh_first = (b)->cslh_first; \
(b)->cslh_first = swap_first; \
} while (0)
/*
@ -219,107 +229,107 @@ struct { \
*/
#define CK_STAILQ_HEAD(name, type) \
struct name { \
struct type *stqh_first;/* first element */ \
struct type **stqh_last;/* addr of last next element */ \
struct type *cstqh_first;/* first element */ \
struct type **cstqh_last;/* addr of last next element */ \
}
#define CK_STAILQ_HEAD_INITIALIZER(head) \
{ NULL, &(head).stqh_first }
{ NULL, &(head).cstqh_first }
#define CK_STAILQ_ENTRY(type) \
struct { \
struct type *stqe_next; /* next element */ \
struct type *cstqe_next; /* next element */ \
}
/*
* Singly-linked Tail queue functions.
*/
#define CK_STAILQ_CONCAT(head1, head2) do { \
if ((head2)->stqh_first == NULL) { \
ck_pr_store_ptr((head1)->stqh_last, (head2)->stqh_first); \
if ((head2)->cstqh_first != NULL) { \
ck_pr_store_ptr((head1)->cstqh_last, (head2)->cstqh_first); \
ck_pr_fence_store(); \
(head1)->stqh_last = (head2)->stqh_last; \
(head1)->cstqh_last = (head2)->cstqh_last; \
CK_STAILQ_INIT((head2)); \
} \
} while (0)
#define CK_STAILQ_EMPTY(head) (ck_pr_load_ptr(&(head)->stqh_first) == NULL)
#define CK_STAILQ_EMPTY(head) (ck_pr_load_ptr(&(head)->cstqh_first) == NULL)
#define CK_STAILQ_FIRST(head) (ck_pr_load_ptr(&(head)->stqh_first))
#define CK_STAILQ_FIRST(head) (ck_pr_load_ptr(&(head)->cstqh_first))
#define CK_STAILQ_FOREACH(var, head, field) \
for((var) = CK_STAILQ_FIRST((head)); \
(var) && (ck_pr_fence_load(), 1); \
(var); \
(var) = CK_STAILQ_NEXT((var), field))
#define CK_STAILQ_FOREACH_SAFE(var, head, field, tvar) \
for ((var) = CK_STAILQ_FIRST((head)); \
(var) && (ck_pr_fence_load(), (tvar) = \
(var) && ((tvar) = \
CK_STAILQ_NEXT((var), field), 1); \
(var) = (tvar))
#define CK_STAILQ_INIT(head) do { \
ck_pr_store_ptr(&(head)->stqh_first, NULL); \
ck_pr_store_ptr(&(head)->cstqh_first, NULL); \
ck_pr_fence_store(); \
(head)->stqh_last = &(head)->stqh_first; \
(head)->cstqh_last = &(head)->cstqh_first; \
} while (0)
#define CK_STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \
(elm)->field.stqe_next = (tqelm)->field.stqe_next; \
(elm)->field.cstqe_next = (tqelm)->field.cstqe_next; \
ck_pr_fence_store(); \
ck_pr_store_ptr(&(tqelm)->field.stqe_next, elm); \
if ((elm)->field.stqe_next == NULL) \
(head)->stqh_last = &(elm)->field.stqe_next; \
ck_pr_store_ptr(&(tqelm)->field.cstqe_next, elm); \
if ((elm)->field.cstqe_next == NULL) \
(head)->cstqh_last = &(elm)->field.cstqe_next; \
} while (0)
#define CK_STAILQ_INSERT_HEAD(head, elm, field) do { \
(elm)->field.stqe_next = (head)->stqh_first; \
(elm)->field.cstqe_next = (head)->cstqh_first; \
ck_pr_fence_store(); \
ck_pr_store_ptr(&(head)->stqh_first, elm); \
if ((elm)->field.stqe_next == NULL) \
(head)->stqh_last = &(elm)->field.stqe_next; \
ck_pr_store_ptr(&(head)->cstqh_first, elm); \
if ((elm)->field.cstqe_next == NULL) \
(head)->cstqh_last = &(elm)->field.cstqe_next; \
} while (0)
#define CK_STAILQ_INSERT_TAIL(head, elm, field) do { \
(elm)->field.stqe_next = NULL; \
(elm)->field.cstqe_next = NULL; \
ck_pr_fence_store(); \
ck_pr_store_ptr((head)->stqh_last, (elm)); \
(head)->stqh_last = &(elm)->field.stqe_next; \
ck_pr_store_ptr((head)->cstqh_last, (elm)); \
(head)->cstqh_last = &(elm)->field.cstqe_next; \
} while (0)
#define CK_STAILQ_NEXT(elm, field) \
(ck_pr_load_ptr(&(elm)->field.stqe_next))
(ck_pr_load_ptr(&(elm)->field.cstqe_next))
#define CK_STAILQ_REMOVE(head, elm, type, field) do { \
if ((head)->stqh_first == (elm)) { \
if ((head)->cstqh_first == (elm)) { \
CK_STAILQ_REMOVE_HEAD((head), field); \
} else { \
struct type *curelm = (head)->stqh_first; \
while (curelm->field.stqe_next != (elm)) \
curelm = curelm->field.stqe_next; \
struct type *curelm = (head)->cstqh_first; \
while (curelm->field.cstqe_next != (elm)) \
curelm = curelm->field.cstqe_next; \
CK_STAILQ_REMOVE_AFTER(head, curelm, field); \
} \
} while (0)
#define CK_STAILQ_REMOVE_AFTER(head, elm, field) do { \
ck_pr_store_ptr(&(elm)->field.stqe_next, \
(elm)->field.stqe_next->field.stqe_next); \
if ((elm)->field.stqe_next == NULL) \
(head)->stqh_last = &(elm)->field.stqe_next; \
ck_pr_store_ptr(&(elm)->field.cstqe_next, \
(elm)->field.cstqe_next->field.cstqe_next); \
if ((elm)->field.cstqe_next == NULL) \
(head)->cstqh_last = &(elm)->field.cstqe_next; \
} while (0)
#define CK_STAILQ_REMOVE_HEAD(head, field) do { \
ck_pr_store_ptr(&(head)->stqh_first, \
(head)->stqh_first->field.stqe_next); \
if ((head)->stqh_first == NULL) \
(head)->stqh_last = &(head)->stqh_first; \
ck_pr_store_ptr(&(head)->cstqh_first, \
(head)->cstqh_first->field.cstqe_next); \
if ((head)->cstqh_first == NULL) \
(head)->cstqh_last = &(head)->cstqh_first; \
} while (0)
#define CK_STAILQ_MOVE(head1, head2, field) do { \
ck_pr_store_ptr(&(head1)->stqh_first, (head2)->stqh_first); \
(head1)->stqh_last = (head2)->stqh_last; \
if ((head2)->stqh_last == &(head2)->stqh_first) \
(head1)->stqh_last = &(head1)->stqh_first; \
ck_pr_store_ptr(&(head1)->cstqh_first, (head2)->cstqh_first); \
(head1)->cstqh_last = (head2)->cstqh_last; \
if ((head2)->cstqh_last == &(head2)->cstqh_first) \
(head1)->cstqh_last = &(head1)->cstqh_first; \
} while (0)
/*
@ -327,15 +337,15 @@ struct { \
*/
#define CK_STAILQ_SWAP(head1, head2, type) do { \
struct type *swap_first = CK_STAILQ_FIRST(head1); \
struct type **swap_last = (head1)->stqh_last; \
struct type **swap_last = (head1)->cstqh_last; \
CK_STAILQ_FIRST(head1) = CK_STAILQ_FIRST(head2); \
(head1)->stqh_last = (head2)->stqh_last; \
(head1)->cstqh_last = (head2)->cstqh_last; \
CK_STAILQ_FIRST(head2) = swap_first; \
(head2)->stqh_last = swap_last; \
(head2)->cstqh_last = swap_last; \
if (CK_STAILQ_EMPTY(head1)) \
(head1)->stqh_last = &(head1)->stqh_first; \
(head1)->cstqh_last = &(head1)->cstqh_first; \
if (CK_STAILQ_EMPTY(head2)) \
(head2)->stqh_last = &(head2)->stqh_first; \
(head2)->cstqh_last = &(head2)->cstqh_first; \
} while (0)
/*
@ -343,7 +353,7 @@ struct { \
*/
#define CK_LIST_HEAD(name, type) \
struct name { \
struct type *lh_first; /* first element */ \
struct type *clh_first; /* first element */ \
}
#define CK_LIST_HEAD_INITIALIZER(head) \
@ -351,78 +361,78 @@ struct name { \
#define CK_LIST_ENTRY(type) \
struct { \
struct type *le_next; /* next element */ \
struct type **le_prev; /* address of previous next element */ \
struct type *cle_next; /* next element */ \
struct type **cle_prev; /* address of previous next element */ \
}
#define CK_LIST_FIRST(head) ck_pr_load_ptr(&(head)->lh_first)
#define CK_LIST_FIRST(head) ck_pr_load_ptr(&(head)->clh_first)
#define CK_LIST_EMPTY(head) (CK_LIST_FIRST(head) == NULL)
#define CK_LIST_NEXT(elm, field) ck_pr_load_ptr(&(elm)->field.le_next)
#define CK_LIST_NEXT(elm, field) ck_pr_load_ptr(&(elm)->field.cle_next)
#define CK_LIST_FOREACH(var, head, field) \
for ((var) = CK_LIST_FIRST((head)); \
(var) && (ck_pr_fence_load(), 1); \
(var); \
(var) = CK_LIST_NEXT((var), field))
#define CK_LIST_FOREACH_SAFE(var, head, field, tvar) \
for ((var) = CK_LIST_FIRST((head)); \
(var) && (ck_pr_fence_load(), (tvar) = CK_LIST_NEXT((var), field), 1);\
(var) && ((tvar) = CK_LIST_NEXT((var), field), 1); \
(var) = (tvar))
#define CK_LIST_INIT(head) do { \
ck_pr_store_ptr(&(head)->lh_first, NULL); \
ck_pr_store_ptr(&(head)->clh_first, NULL); \
ck_pr_fence_store(); \
} while (0)
#define CK_LIST_INSERT_AFTER(listelm, elm, field) do { \
(elm)->field.le_next = (listelm)->field.le_next; \
(elm)->field.le_prev = &(listelm)->field.le_next; \
(elm)->field.cle_next = (listelm)->field.cle_next; \
(elm)->field.cle_prev = &(listelm)->field.cle_next; \
ck_pr_fence_store(); \
if ((listelm)->field.le_next != NULL) \
(listelm)->field.le_next->field.le_prev = &(elm)->field.le_next;\
ck_pr_store_ptr(&(listelm)->field.le_next, elm); \
if ((listelm)->field.cle_next != NULL) \
(listelm)->field.cle_next->field.cle_prev = &(elm)->field.cle_next;\
ck_pr_store_ptr(&(listelm)->field.cle_next, elm); \
} while (0)
#define CK_LIST_INSERT_BEFORE(listelm, elm, field) do { \
(elm)->field.le_prev = (listelm)->field.le_prev; \
(elm)->field.le_next = (listelm); \
(elm)->field.cle_prev = (listelm)->field.cle_prev; \
(elm)->field.cle_next = (listelm); \
ck_pr_fence_store(); \
ck_pr_store_ptr((listelm)->field.le_prev, (elm)); \
(listelm)->field.le_prev = &(elm)->field.le_next; \
ck_pr_store_ptr((listelm)->field.cle_prev, (elm)); \
(listelm)->field.cle_prev = &(elm)->field.cle_next; \
} while (0)
#define CK_LIST_INSERT_HEAD(head, elm, field) do { \
(elm)->field.le_next = (head)->lh_first; \
(elm)->field.cle_next = (head)->clh_first; \
ck_pr_fence_store(); \
if ((elm)->field.le_next != NULL) \
(head)->lh_first->field.le_prev = &(elm)->field.le_next; \
ck_pr_store_ptr(&(head)->lh_first, elm); \
(elm)->field.le_prev = &(head)->lh_first; \
if ((elm)->field.cle_next != NULL) \
(head)->clh_first->field.cle_prev = &(elm)->field.cle_next; \
ck_pr_store_ptr(&(head)->clh_first, elm); \
(elm)->field.cle_prev = &(head)->clh_first; \
} while (0)
#define CK_LIST_REMOVE(elm, field) do { \
ck_pr_store_ptr((elm)->field.le_prev, (elm)->field.le_next); \
if ((elm)->field.le_next != NULL) \
(elm)->field.le_next->field.le_prev = (elm)->field.le_prev; \
ck_pr_store_ptr((elm)->field.cle_prev, (elm)->field.cle_next); \
if ((elm)->field.cle_next != NULL) \
(elm)->field.cle_next->field.cle_prev = (elm)->field.cle_prev; \
} while (0)
#define CK_LIST_MOVE(head1, head2, field) do { \
ck_pr_store_ptr(&(head1)->lh_first, (head2)->lh_first); \
if ((head1)->lh_first != NULL) \
(head1)->lh_first->field.le_prev = &(head1)->lh_first; \
ck_pr_store_ptr(&(head1)->clh_first, (head2)->clh_first); \
if ((head1)->clh_first != NULL) \
(head1)->clh_first->field.cle_prev = &(head1)->clh_first; \
} while (0)
/*
* This operation is not applied atomically.
*/
#define CK_LIST_SWAP(head1, head2, type, field) do { \
struct type *swap_tmp = (head1)->lh_first; \
(head1)->lh_first = (head2)->lh_first; \
(head2)->lh_first = swap_tmp; \
if ((swap_tmp = (head1)->lh_first) != NULL) \
swap_tmp->field.le_prev = &(head1)->lh_first; \
if ((swap_tmp = (head2)->lh_first) != NULL) \
swap_tmp->field.le_prev = &(head2)->lh_first; \
struct type *swap_tmp = (head1)->clh_first; \
(head1)->clh_first = (head2)->clh_first; \
(head2)->clh_first = swap_tmp; \
if ((swap_tmp = (head1)->clh_first) != NULL) \
swap_tmp->field.cle_prev = &(head1)->clh_first; \
if ((swap_tmp = (head2)->clh_first) != NULL) \
swap_tmp->field.cle_prev = &(head2)->clh_first; \
} while (0)
#endif /* CK_QUEUE_H */

View file

@ -66,9 +66,56 @@ ck_ring_size(const struct ck_ring *ring)
CK_CC_INLINE static unsigned int
ck_ring_capacity(const struct ck_ring *ring)
{
return ring->size;
}
/*
* This function is only safe to call when there are no concurrent operations
* on the ring. This is primarily meant for persistent ck_ring use-cases. The
* function returns true if any mutations were performed on the ring.
*/
CK_CC_INLINE static bool
ck_ring_repair(struct ck_ring *ring)
{
bool r = false;
if (ring->p_tail != ring->p_head) {
ring->p_tail = ring->p_head;
r = true;
}
return r;
}
/*
* This can be called when no concurrent updates are occurring on the ring
* structure to check for consistency. This is primarily meant to be used for
* persistent storage of the ring. If this function returns false, the ring
* is in an inconsistent state.
*/
CK_CC_INLINE static bool
ck_ring_valid(const struct ck_ring *ring)
{
unsigned int size = ring->size;
unsigned int c_head = ring->c_head;
unsigned int p_head = ring->p_head;
/* The ring must be a power of 2. */
if (size & (size - 1))
return false;
/* The consumer counter must always be smaller than the producer. */
if (c_head > p_head)
return false;
/* The producer may only be up to size slots ahead of consumer. */
if (p_head - c_head >= size)
return false;
return true;
}
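/*
 * Illustrative sketch (not part of this header): recovering a persistent
 * ring at startup, before any other thread touches it.
 */
static bool
example_ring_recover(struct ck_ring *ring)
{

	if (ck_ring_valid(ring) == false)
		return false;	/* Corrupt; the caller must reinitialize. */

	/* Reconcile the producer tail with the head if an enqueue was interrupted. */
	(void)ck_ring_repair(ring);
	return true;
}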
CK_CC_INLINE static void
ck_ring_init(struct ck_ring *ring, unsigned int size)
{
@ -84,6 +131,45 @@ ck_ring_init(struct ck_ring *ring, unsigned int size)
/*
* The _ck_ring_* namespace is internal only and must not be used externally.
*/
/*
* This function returns a pointer to the region of memory into which a
* single producer should write the next value.
*/
CK_CC_FORCE_INLINE static void *
_ck_ring_enqueue_reserve_sp(struct ck_ring *ring,
void *CK_CC_RESTRICT buffer,
unsigned int ts,
unsigned int *size)
{
const unsigned int mask = ring->mask;
unsigned int consumer, producer, delta;
consumer = ck_pr_load_uint(&ring->c_head);
producer = ring->p_tail;
delta = producer + 1;
if (size != NULL)
*size = (producer - consumer) & mask;
if (CK_CC_UNLIKELY((delta & mask) == (consumer & mask)))
return NULL;
return (char *)buffer + ts * (producer & mask);
}
/*
* This is to be called to commit and make visible a region of memory
* previously reserved with the reserve_sp function.
*/
CK_CC_FORCE_INLINE static void
_ck_ring_enqueue_commit_sp(struct ck_ring *ring)
{
ck_pr_fence_store();
ck_pr_store_uint(&ring->p_tail, ring->p_tail + 1);
return;
}
CK_CC_FORCE_INLINE static bool
_ck_ring_enqueue_sp(struct ck_ring *ring,
void *CK_CC_RESTRICT buffer,
@ -163,6 +249,65 @@ _ck_ring_dequeue_sc(struct ck_ring *ring,
return true;
}
CK_CC_FORCE_INLINE static void *
_ck_ring_enqueue_reserve_mp(struct ck_ring *ring,
void *buffer,
unsigned int ts,
unsigned int *ticket,
unsigned int *size)
{
const unsigned int mask = ring->mask;
unsigned int producer, consumer, delta;
producer = ck_pr_load_uint(&ring->p_head);
for (;;) {
ck_pr_fence_load();
consumer = ck_pr_load_uint(&ring->c_head);
delta = producer + 1;
if (CK_CC_LIKELY((producer - consumer) < mask)) {
if (ck_pr_cas_uint_value(&ring->p_head,
producer, delta, &producer) == true) {
break;
}
} else {
unsigned int new_producer;
ck_pr_fence_load();
new_producer = ck_pr_load_uint(&ring->p_head);
if (producer == new_producer) {
if (size != NULL)
*size = (producer - consumer) & mask;
return NULL;
}
producer = new_producer;
}
}
*ticket = producer;
if (size != NULL)
*size = (producer - consumer) & mask;
return (char *)buffer + ts * (producer & mask);
}
CK_CC_FORCE_INLINE static void
_ck_ring_enqueue_commit_mp(struct ck_ring *ring, unsigned int producer)
{
while (ck_pr_load_uint(&ring->p_tail) != producer)
ck_pr_stall();
ck_pr_fence_store();
ck_pr_store_uint(&ring->p_tail, producer + 1);
return;
}
CK_CC_FORCE_INLINE static bool
_ck_ring_enqueue_mp(struct ck_ring *ring,
void *buffer,
@ -176,23 +321,54 @@ _ck_ring_enqueue_mp(struct ck_ring *ring,
producer = ck_pr_load_uint(&ring->p_head);
do {
for (;;) {
/*
* The snapshot of producer must be up to date with
* respect to consumer.
* The snapshot of producer must be up to date with respect to
* consumer.
*/
ck_pr_fence_load();
consumer = ck_pr_load_uint(&ring->c_head);
delta = producer + 1;
if (CK_CC_UNLIKELY((delta & mask) == (consumer & mask))) {
r = false;
goto leave;
/*
* Only try to CAS if the producer is not clearly stale (not
* less than consumer) and the buffer is definitely not full.
*/
if (CK_CC_LIKELY((producer - consumer) < mask)) {
if (ck_pr_cas_uint_value(&ring->p_head,
producer, delta, &producer) == true) {
break;
}
} else {
unsigned int new_producer;
/*
* Slow path. Either the buffer is full or we have a
* stale snapshot of p_head. Execute a second read of
* p_head that must be ordered with respect to the snapshot of
* c_head.
*/
ck_pr_fence_load();
new_producer = ck_pr_load_uint(&ring->p_head);
/*
* Only fail if we haven't made forward progress in
* production: the buffer must have been full when we
* read new_producer (or we wrapped around UINT_MAX
* during this iteration).
*/
if (producer == new_producer) {
r = false;
goto leave;
}
/*
* p_head advanced during this iteration. Try again.
*/
producer = new_producer;
}
} while (ck_pr_cas_uint_value(&ring->p_head,
producer,
delta,
&producer) == false);
}
buffer = (char *)buffer + ts * (producer & mask);
memcpy(buffer, entry, ts);
@ -323,6 +499,33 @@ ck_ring_enqueue_spsc(struct ck_ring *ring,
&entry, sizeof(entry), NULL);
}
CK_CC_INLINE static void *
ck_ring_enqueue_reserve_spsc_size(struct ck_ring *ring,
struct ck_ring_buffer *buffer,
unsigned int *size)
{
return _ck_ring_enqueue_reserve_sp(ring, buffer, sizeof(void *),
size);
}
CK_CC_INLINE static void *
ck_ring_enqueue_reserve_spsc(struct ck_ring *ring,
struct ck_ring_buffer *buffer)
{
return _ck_ring_enqueue_reserve_sp(ring, buffer, sizeof(void *),
NULL);
}
CK_CC_INLINE static void
ck_ring_enqueue_commit_spsc(struct ck_ring *ring)
{
_ck_ring_enqueue_commit_sp(ring);
return;
}
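/*
 * Illustrative sketch (not part of this header): zero-copy enqueue of a
 * pointer with the single-producer reserve/commit interface.  Assumes
 * <string.h> for memcpy().
 */
static bool
example_ring_publish_spsc(struct ck_ring *ring, struct ck_ring_buffer *buffer,
    void *message)
{
	void *slot;

	slot = ck_ring_enqueue_reserve_spsc(ring, buffer);
	if (slot == NULL)
		return false;		/* The ring is full. */

	memcpy(slot, &message, sizeof(message));
	ck_ring_enqueue_commit_spsc(ring);
	return true;
}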
CK_CC_INLINE static bool
ck_ring_dequeue_spsc(struct ck_ring *ring,
const struct ck_ring_buffer *buffer,
@ -344,8 +547,7 @@ ck_ring_enqueue_mpmc(struct ck_ring *ring,
const void *entry)
{
return _ck_ring_enqueue_mp(ring, buffer, &entry,
sizeof(entry), NULL);
return _ck_ring_enqueue_mp(ring, buffer, &entry, sizeof(entry), NULL);
}
CK_CC_INLINE static bool
@ -355,8 +557,37 @@ ck_ring_enqueue_mpmc_size(struct ck_ring *ring,
unsigned int *size)
{
return _ck_ring_enqueue_mp_size(ring, buffer, &entry,
sizeof(entry), size);
return _ck_ring_enqueue_mp_size(ring, buffer, &entry, sizeof(entry),
size);
}
CK_CC_INLINE static void *
ck_ring_enqueue_reserve_mpmc(struct ck_ring *ring,
struct ck_ring_buffer *buffer,
unsigned int *ticket)
{
return _ck_ring_enqueue_reserve_mp(ring, buffer, sizeof(void *),
ticket, NULL);
}
CK_CC_INLINE static void *
ck_ring_enqueue_reserve_mpmc_size(struct ck_ring *ring,
struct ck_ring_buffer *buffer,
unsigned int *ticket,
unsigned int *size)
{
return _ck_ring_enqueue_reserve_mp(ring, buffer, sizeof(void *),
ticket, size);
}
CK_CC_INLINE static void
ck_ring_enqueue_commit_mpmc(struct ck_ring *ring, unsigned int ticket)
{
_ck_ring_enqueue_commit_mp(ring, ticket);
return;
}
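/*
 * Illustrative sketch (not part of this header): the multi-producer
 * reserve variant also returns a ticket, which must be passed to the
 * matching commit so that producers retire their slots in order.
 * Assumes <string.h> for memcpy().
 */
static bool
example_ring_publish_mpmc(struct ck_ring *ring, struct ck_ring_buffer *buffer,
    void *message)
{
	unsigned int ticket;
	void *slot;

	slot = ck_ring_enqueue_reserve_mpmc(ring, buffer, &ticket);
	if (slot == NULL)
		return false;		/* The ring is full. */

	memcpy(slot, &message, sizeof(message));
	ck_ring_enqueue_commit_mpmc(ring, ticket);
	return true;
}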
CK_CC_INLINE static bool
@ -384,6 +615,31 @@ ck_ring_dequeue_mpmc(struct ck_ring *ring,
* ring buffer containing pointers. Correctness is provided for any number of
* consumers with up to one concurrent producer.
*/
CK_CC_INLINE static void *
ck_ring_enqueue_reserve_spmc_size(struct ck_ring *ring,
struct ck_ring_buffer *buffer,
unsigned int *size)
{
return _ck_ring_enqueue_reserve_sp(ring, buffer, sizeof(void *), size);
}
CK_CC_INLINE static void *
ck_ring_enqueue_reserve_spmc(struct ck_ring *ring,
struct ck_ring_buffer *buffer)
{
return _ck_ring_enqueue_reserve_sp(ring, buffer, sizeof(void *), NULL);
}
CK_CC_INLINE static void
ck_ring_enqueue_commit_spmc(struct ck_ring *ring)
{
_ck_ring_enqueue_commit_sp(ring);
return;
}
CK_CC_INLINE static bool
ck_ring_enqueue_spmc_size(struct ck_ring *ring,
struct ck_ring_buffer *buffer,
@ -428,6 +684,35 @@ ck_ring_dequeue_spmc(struct ck_ring *ring,
* ring buffer containing pointers. Correctness is provided for any number of
* producers with up to one concurrent consumers.
*/
CK_CC_INLINE static void *
ck_ring_enqueue_reserve_mpsc(struct ck_ring *ring,
struct ck_ring_buffer *buffer,
unsigned int *ticket)
{
return _ck_ring_enqueue_reserve_mp(ring, buffer, sizeof(void *),
ticket, NULL);
}
CK_CC_INLINE static void *
ck_ring_enqueue_reserve_mpsc_size(struct ck_ring *ring,
struct ck_ring_buffer *buffer,
unsigned int *ticket,
unsigned int *size)
{
return _ck_ring_enqueue_reserve_mp(ring, buffer, sizeof(void *),
ticket, size);
}
CK_CC_INLINE static void
ck_ring_enqueue_commit_mpsc(struct ck_ring *ring, unsigned int ticket)
{
_ck_ring_enqueue_commit_mp(ring, ticket);
return;
}
CK_CC_INLINE static bool
ck_ring_enqueue_mpsc(struct ck_ring *ring,
struct ck_ring_buffer *buffer,
@ -463,194 +748,290 @@ ck_ring_dequeue_mpsc(struct ck_ring *ring,
* CK_RING_PROTOTYPE is used to define a type-safe interface for inlining
* values of a particular type in the ring buffer.
*/
#define CK_RING_PROTOTYPE(name, type) \
CK_CC_INLINE static bool \
ck_ring_enqueue_spsc_size_##name(struct ck_ring *a, \
struct type *b, \
struct type *c, \
unsigned int *d) \
{ \
\
return _ck_ring_enqueue_sp_size(a, b, c, \
sizeof(struct type), d); \
} \
\
CK_CC_INLINE static bool \
ck_ring_enqueue_spsc_##name(struct ck_ring *a, \
struct type *b, \
struct type *c) \
{ \
\
return _ck_ring_enqueue_sp(a, b, c, \
sizeof(struct type), NULL); \
} \
\
CK_CC_INLINE static bool \
ck_ring_dequeue_spsc_##name(struct ck_ring *a, \
struct type *b, \
struct type *c) \
{ \
\
return _ck_ring_dequeue_sc(a, b, c, \
sizeof(struct type)); \
} \
\
CK_CC_INLINE static bool \
ck_ring_enqueue_spmc_size_##name(struct ck_ring *a, \
struct type *b, \
struct type *c, \
unsigned int *d) \
{ \
\
return _ck_ring_enqueue_sp_size(a, b, c, \
sizeof(struct type), d); \
} \
\
CK_CC_INLINE static bool \
ck_ring_enqueue_spmc_##name(struct ck_ring *a, \
struct type *b, \
struct type *c) \
{ \
\
return _ck_ring_enqueue_sp(a, b, c, \
sizeof(struct type), NULL); \
} \
\
CK_CC_INLINE static bool \
ck_ring_trydequeue_spmc_##name(struct ck_ring *a, \
struct type *b, \
struct type *c) \
{ \
\
return _ck_ring_trydequeue_mc(a, \
b, c, sizeof(struct type)); \
} \
\
CK_CC_INLINE static bool \
ck_ring_dequeue_spmc_##name(struct ck_ring *a, \
struct type *b, \
struct type *c) \
{ \
\
return _ck_ring_dequeue_mc(a, b, c, \
sizeof(struct type)); \
} \
\
CK_CC_INLINE static bool \
ck_ring_enqueue_mpsc_##name(struct ck_ring *a, \
struct type *b, \
struct type *c) \
{ \
\
return _ck_ring_enqueue_mp(a, b, c, \
sizeof(struct type), NULL); \
} \
\
CK_CC_INLINE static bool \
ck_ring_enqueue_mpsc_size_##name(struct ck_ring *a, \
struct type *b, \
struct type *c, \
unsigned int *d) \
{ \
\
return _ck_ring_enqueue_mp_size(a, b, c, \
sizeof(struct type), d); \
} \
\
CK_CC_INLINE static bool \
ck_ring_dequeue_mpsc_##name(struct ck_ring *a, \
struct type *b, \
struct type *c) \
{ \
\
return _ck_ring_dequeue_sc(a, b, c, \
sizeof(struct type)); \
} \
\
CK_CC_INLINE static bool \
ck_ring_enqueue_mpmc_size_##name(struct ck_ring *a, \
struct type *b, \
struct type *c, \
unsigned int *d) \
{ \
\
return _ck_ring_enqueue_mp_size(a, b, c, \
sizeof(struct type), d); \
} \
\
CK_CC_INLINE static bool \
ck_ring_enqueue_mpmc_##name(struct ck_ring *a, \
struct type *b, \
struct type *c) \
{ \
\
return _ck_ring_enqueue_mp(a, b, c, \
sizeof(struct type), NULL); \
} \
\
CK_CC_INLINE static bool \
ck_ring_trydequeue_mpmc_##name(struct ck_ring *a, \
struct type *b, \
struct type *c) \
{ \
\
return _ck_ring_trydequeue_mc(a, \
b, c, sizeof(struct type)); \
} \
\
CK_CC_INLINE static bool \
ck_ring_dequeue_mpmc_##name(struct ck_ring *a, \
struct type *b, \
struct type *c) \
{ \
\
return _ck_ring_dequeue_mc(a, b, c, \
sizeof(struct type)); \
#define CK_RING_PROTOTYPE(name, type) \
CK_CC_INLINE static struct type * \
ck_ring_enqueue_reserve_spsc_##name(struct ck_ring *a, \
struct type *b) \
{ \
\
return _ck_ring_enqueue_reserve_sp(a, b, \
sizeof(struct type), NULL); \
} \
\
CK_CC_INLINE static struct type * \
ck_ring_enqueue_reserve_spsc_size_##name(struct ck_ring *a, \
struct type *b, \
unsigned int *c) \
{ \
\
return _ck_ring_enqueue_reserve_sp(a, b, \
sizeof(struct type), c); \
} \
\
CK_CC_INLINE static bool \
ck_ring_enqueue_spsc_size_##name(struct ck_ring *a, \
struct type *b, \
struct type *c, \
unsigned int *d) \
{ \
\
return _ck_ring_enqueue_sp_size(a, b, c, \
sizeof(struct type), d); \
} \
\
CK_CC_INLINE static bool \
ck_ring_enqueue_spsc_##name(struct ck_ring *a, \
struct type *b, \
struct type *c) \
{ \
\
return _ck_ring_enqueue_sp(a, b, c, \
sizeof(struct type), NULL); \
} \
\
CK_CC_INLINE static bool \
ck_ring_dequeue_spsc_##name(struct ck_ring *a, \
struct type *b, \
struct type *c) \
{ \
\
return _ck_ring_dequeue_sc(a, b, c, \
sizeof(struct type)); \
} \
\
CK_CC_INLINE static struct type * \
ck_ring_enqueue_reserve_spmc_##name(struct ck_ring *a, \
struct type *b) \
{ \
\
return _ck_ring_enqueue_reserve_sp(a, b, \
sizeof(struct type), NULL); \
} \
\
CK_CC_INLINE static struct type * \
ck_ring_enqueue_reserve_spmc_size_##name(struct ck_ring *a, \
struct type *b, \
unsigned int *c) \
{ \
\
return _ck_ring_enqueue_reserve_sp(a, b, \
sizeof(struct type), c); \
} \
\
CK_CC_INLINE static bool \
ck_ring_enqueue_spmc_size_##name(struct ck_ring *a, \
struct type *b, \
struct type *c, \
unsigned int *d) \
{ \
\
return _ck_ring_enqueue_sp_size(a, b, c, \
sizeof(struct type), d); \
} \
\
CK_CC_INLINE static bool \
ck_ring_enqueue_spmc_##name(struct ck_ring *a, \
struct type *b, \
struct type *c) \
{ \
\
return _ck_ring_enqueue_sp(a, b, c, \
sizeof(struct type), NULL); \
} \
\
CK_CC_INLINE static bool \
ck_ring_trydequeue_spmc_##name(struct ck_ring *a, \
struct type *b, \
struct type *c) \
{ \
\
return _ck_ring_trydequeue_mc(a, \
b, c, sizeof(struct type)); \
} \
\
CK_CC_INLINE static bool \
ck_ring_dequeue_spmc_##name(struct ck_ring *a, \
struct type *b, \
struct type *c) \
{ \
\
return _ck_ring_dequeue_mc(a, b, c, \
sizeof(struct type)); \
} \
\
CK_CC_INLINE static struct type * \
ck_ring_enqueue_reserve_mpsc_##name(struct ck_ring *a, \
struct type *b, \
unsigned int *c) \
{ \
\
return _ck_ring_enqueue_reserve_mp(a, b, \
sizeof(struct type), c, NULL); \
} \
\
CK_CC_INLINE static struct type * \
ck_ring_enqueue_reserve_mpsc_size_##name(struct ck_ring *a, \
struct type *b, \
unsigned int *c, \
unsigned int *d) \
{ \
\
return _ck_ring_enqueue_reserve_mp(a, b, \
sizeof(struct type), c, d); \
} \
\
CK_CC_INLINE static bool \
ck_ring_enqueue_mpsc_##name(struct ck_ring *a, \
struct type *b, \
struct type *c) \
{ \
\
return _ck_ring_enqueue_mp(a, b, c, \
sizeof(struct type), NULL); \
} \
\
CK_CC_INLINE static bool \
ck_ring_enqueue_mpsc_size_##name(struct ck_ring *a, \
struct type *b, \
struct type *c, \
unsigned int *d) \
{ \
\
return _ck_ring_enqueue_mp_size(a, b, c, \
sizeof(struct type), d); \
} \
\
CK_CC_INLINE static bool \
ck_ring_dequeue_mpsc_##name(struct ck_ring *a, \
struct type *b, \
struct type *c) \
{ \
\
return _ck_ring_dequeue_sc(a, b, c, \
sizeof(struct type)); \
} \
\
CK_CC_INLINE static struct type * \
ck_ring_enqueue_reserve_mpmc_##name(struct ck_ring *a, \
struct type *b, \
unsigned int *c) \
{ \
\
return _ck_ring_enqueue_reserve_mp(a, b, \
sizeof(struct type), c, NULL); \
} \
\
CK_CC_INLINE static struct type * \
ck_ring_enqueue_reserve_mpmc_size_##name(struct ck_ring *a, \
struct type *b, \
unsigned int *c, \
unsigned int *d) \
{ \
\
return _ck_ring_enqueue_reserve_mp(a, b, \
sizeof(struct type), c, d); \
} \
\
CK_CC_INLINE static bool \
ck_ring_enqueue_mpmc_size_##name(struct ck_ring *a, \
struct type *b, \
struct type *c, \
unsigned int *d) \
{ \
\
return _ck_ring_enqueue_mp_size(a, b, c, \
sizeof(struct type), d); \
} \
\
CK_CC_INLINE static bool \
ck_ring_enqueue_mpmc_##name(struct ck_ring *a, \
struct type *b, \
struct type *c) \
{ \
\
return _ck_ring_enqueue_mp(a, b, c, \
sizeof(struct type), NULL); \
} \
\
CK_CC_INLINE static bool \
ck_ring_trydequeue_mpmc_##name(struct ck_ring *a, \
struct type *b, \
struct type *c) \
{ \
\
return _ck_ring_trydequeue_mc(a, \
b, c, sizeof(struct type)); \
} \
\
CK_CC_INLINE static bool \
ck_ring_dequeue_mpmc_##name(struct ck_ring *a, \
struct type *b, \
struct type *c) \
{ \
\
return _ck_ring_dequeue_mc(a, b, c, \
sizeof(struct type)); \
}
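/*
 * A usage sketch (not part of this header): generating a type-safe
 * interface for a small record type and calling the wrapper macro it
 * defines.
 */
struct example_record {
	uint64_t id;
	uint32_t flags;
};

CK_RING_PROTOTYPE(example_record, example_record)

static bool
example_record_publish(struct ck_ring *ring, struct example_record *buffer,
    struct example_record *record)
{

	return CK_RING_ENQUEUE_SPSC(example_record, ring, buffer, record);
}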
/*
* A single producer with one concurrent consumer.
*/
#define CK_RING_ENQUEUE_SPSC(name, a, b, c) \
#define CK_RING_ENQUEUE_SPSC(name, a, b, c) \
ck_ring_enqueue_spsc_##name(a, b, c)
#define CK_RING_ENQUEUE_SPSC_SIZE(name, a, b, c, d) \
#define CK_RING_ENQUEUE_SPSC_SIZE(name, a, b, c, d) \
ck_ring_enqueue_spsc_size_##name(a, b, c, d)
#define CK_RING_DEQUEUE_SPSC(name, a, b, c) \
#define CK_RING_ENQUEUE_RESERVE_SPSC(name, a, b, c) \
ck_ring_enqueue_reserve_spsc_##name(a, b, c)
#define CK_RING_ENQUEUE_RESERVE_SPSC_SIZE(name, a, b, c, d) \
ck_ring_enqueue_reserve_spsc_size_##name(a, b, c, d)
#define CK_RING_DEQUEUE_SPSC(name, a, b, c) \
ck_ring_dequeue_spsc_##name(a, b, c)
/*
* A single producer with any number of concurrent consumers.
*/
#define CK_RING_ENQUEUE_SPMC(name, a, b, c) \
ck_ring_enqueue_spmc_##name(a, b, c)
#define CK_RING_ENQUEUE_SPMC_SIZE(name, a, b, c, d) \
ck_ring_enqueue_spmc_size_##name(a, b, c, d)
#define CK_RING_ENQUEUE_RESERVE_SPMC(name, a, b, c) \
ck_ring_enqueue_reserve_spmc_##name(a, b, c)
#define CK_RING_ENQUEUE_RESERVE_SPMC_SIZE(name, a, b, c, d) \
ck_ring_enqueue_reserve_spmc_size_##name(a, b, c, d)
#define CK_RING_TRYDEQUEUE_SPMC(name, a, b, c) \
ck_ring_trydequeue_spmc_##name(a, b, c)
#define CK_RING_DEQUEUE_SPMC(name, a, b, c) \
ck_ring_dequeue_spmc_##name(a, b, c)
/*
* Any number of concurrent producers with up to one
* concurrent consumer.
*/
#define CK_RING_ENQUEUE_MPSC(name, a, b, c) \
ck_ring_enqueue_mpsc_##name(a, b, c)
#define CK_RING_ENQUEUE_MPSC_SIZE(name, a, b, c, d) \
ck_ring_enqueue_mpsc_size_##name(a, b, c, d)
#define CK_RING_ENQUEUE_RESERVE_MPSC(name, a, b, c) \
ck_ring_enqueue_reserve_mpsc_##name(a, b, c)
#define CK_RING_ENQUEUE_RESERVE_MPSC_SIZE(name, a, b, c, d) \
ck_ring_enqueue_reserve_mpsc_size_##name(a, b, c, d)
#define CK_RING_DEQUEUE_MPSC(name, a, b, c) \
ck_ring_dequeue_mpsc_##name(a, b, c)
/*
* Any number of concurrent producers and consumers.
*/
#define CK_RING_ENQUEUE_MPMC(name, a, b, c) \
ck_ring_enqueue_mpmc_##name(a, b, c)
#define CK_RING_ENQUEUE_MPMC_SIZE(name, a, b, c, d) \
ck_ring_enqueue_mpmc_size_##name(a, b, c, d)
#define CK_RING_ENQUEUE_RESERVE_MPMC(name, a, b, c) \
ck_ring_enqueue_reserve_mpmc_##name(a, b, c)
#define CK_RING_ENQUEUE_RESERVE_MPMC_SIZE(name, a, b, c, d) \
ck_ring_enqueue_reserve_mpmc_size_##name(a, b, c, d)
#define CK_RING_TRYDEQUEUE_MPMC(name, a, b, c) \
ck_ring_trydequeue_mpmc_##name(a, b, c)
#define CK_RING_DEQUEUE_MPMC(name, a, b, c) \
ck_ring_dequeue_mpmc_##name(a, b, c)
#endif /* CK_RING_H */
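To make the typed interface generated above concrete, here is a minimal, illustrative SPSC sketch. It assumes the usual ck_ring_init()/CK_RING_PROTOTYPE usage; the struct entry type, the buffer size and the example function are made up for illustration and are not part of this change.

#include <ck_ring.h>

struct entry {
        int value;
};

/* Generates ck_ring_enqueue_spsc_entry(), ck_ring_dequeue_spsc_entry(), ... */
CK_RING_PROTOTYPE(entry, entry)

static struct ck_ring ring;
static struct entry buffer[1024];      /* ring capacity must be a power of two */

static void
ring_example(void)
{
        struct entry in = { .value = 42 }, out;

        ck_ring_init(&ring, 1024);

        /* Producer side: returns false when the ring is full. */
        (void)ck_ring_enqueue_spsc_entry(&ring, buffer, &in);

        /* Consumer side: returns true and copies the entry out on success. */
        if (ck_ring_dequeue_spsc_entry(&ring, buffer, &out) == true) {
                /* out.value == 42 */
        }
}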

133
include/freebsd/ck_md.h.in Normal file
View file

@ -0,0 +1,133 @@
/*
* Copyright 2018 Samy Al Bahra.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* This header file is meant for use of Concurrency Kit in the FreeBSD kernel.
*/
#ifndef CK_MD_H
#define CK_MD_H
#include <sys/param.h>
#ifndef _KERNEL
#error This header file is meant for the FreeBSD kernel.
#endif /* _KERNEL */
#ifndef CK_MD_CACHELINE
/*
* FreeBSD's CACHE_LINE macro is a compile-time maximum cache-line size for an
 * architecture, defined to be 128 bytes by default on x86*. Even in the
 * presence of adjacent sector prefetch, this doesn't make sense from a modeling
* perspective.
*/
#if defined(__amd64__) || defined(__i386__)
#define CK_MD_CACHELINE (64)
#else
#define CK_MD_CACHELINE (CACHE_LINE_SIZE)
#endif /* !__amd64__ && !__i386__ */
#endif /* CK_MD_CACHELINE */
#ifndef CK_MD_PAGESIZE
#define CK_MD_PAGESIZE (PAGE_SIZE)
#endif
/*
* Once FreeBSD has a mechanism to detect RTM, this can be enabled and RTM
* facilities can be called. These facilities refer to TSX.
*/
#ifndef CK_MD_RTM_DISABLE
#define CK_MD_RTM_DISABLE
#endif /* CK_MD_RTM_DISABLE */
/*
* Do not enable pointer-packing-related (VMA) optimizations in kernel-space.
*/
#ifndef CK_MD_POINTER_PACK_DISABLE
#define CK_MD_POINTER_PACK_DISABLE
#endif /* CK_MD_POINTER_PACK_DISABLE */
/*
* The following would be used for pointer-packing tricks, disabled for the
* kernel.
*/
#ifndef CK_MD_VMA_BITS_UNKNOWN
#define CK_MD_VMA_BITS_UNKNOWN
#endif /* CK_MD_VMA_BITS_UNKNOWN */
/*
* Do not enable double operations in kernel-space.
*/
#ifndef CK_PR_DISABLE_DOUBLE
#define CK_PR_DISABLE_DOUBLE
#endif /* CK_PR_DISABLE_DOUBLE */
/*
* If building for a uni-processor target, then enable the uniprocessor
* feature flag. This, among other things, will remove the lock prefix.
*/
#ifndef SMP
#define CK_MD_UMP
#endif /* SMP */
/*
* Disable the use of compiler builtin functions.
*/
#define CK_MD_CC_BUILTIN_DISABLE 1
/*
 * CK expects the following macros, which are normally defined by the build system.
*/
#if defined(__i386__) && !defined(__x86__)
#define __x86__
/*
* If x86 becomes more relevant, we may want to consider importing in
* __mbk() to avoid potential issues around false sharing.
*/
#define CK_MD_TSO
#define CK_MD_SSE_DISABLE 1
#elif defined(__amd64__)
#define CK_MD_TSO
#elif defined(__sparc64__) && !defined(__sparcv9__)
#define __sparcv9__
#define CK_MD_TSO
#elif defined(__powerpc64__) && !defined(__ppc64__)
#define __ppc64__
#elif defined(__powerpc__) && !defined(__ppc__)
#define __ppc__
#endif
/* If no memory model has been defined, assume RMO. */
#if !defined(CK_MD_RMO) && !defined(CK_MD_TSO) && !defined(CK_MD_PSO)
#define CK_MD_RMO
#endif
#define CK_VERSION "@VERSION@"
#define CK_GIT_SHA "@GIT_SHA@"
#endif /* CK_MD_H */
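As a rough illustration of why the cache-line constant above is chosen per architecture (this snippet is not part of the upstream header), per-CPU state is commonly padded out to CK_MD_CACHELINE so that neighbouring entries do not share a line:

#include <ck_md.h>

/* Illustrative only: pad a per-CPU counter to a full cache line to avoid
 * false sharing between adjacent counters in an array. */
struct padded_counter {
        unsigned long value;
        char pad[CK_MD_CACHELINE - sizeof(unsigned long)];
};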

View file

@ -92,7 +92,7 @@ CK_PR_FENCE(unlock, CK_DMB_SY)
ck_pr_md_load_##S(const M *target) \
{ \
long r = 0; \
__asm__ __volatile__(I " %w0, [%1];" \
__asm__ __volatile__(I " %w0, [%1]\n" \
: "=r" (r) \
: "r" (target) \
: "memory"); \
@ -103,7 +103,7 @@ CK_PR_FENCE(unlock, CK_DMB_SY)
ck_pr_md_load_##S(const M *target) \
{ \
long r = 0; \
__asm__ __volatile__(I " %0, [%1];" \
__asm__ __volatile__(I " %0, [%1]\n" \
: "=r" (r) \
: "r" (target) \
: "memory"); \
@ -195,10 +195,10 @@ CK_PR_STORE_S_64(double, double, "str")
T previous = 0; \
T tmp = 0; \
__asm__ __volatile__("1:" \
"ldxr" W " %" R "0, [%2];" \
"neg %" R "0, %" R "0;" \
"stxr" W " %w1, %" R "0, [%2];" \
"cbnz %w1, 1b;" \
"ldxr" W " %" R "0, [%2]\n"\
"neg %" R "0, %" R "0\n" \
"stxr" W " %w1, %" R "0, [%2]\n" \
"cbnz %w1, 1b\n" \
: "=&r" (previous), \
"=&r" (tmp) \
: "r" (target) \

View file

@ -38,17 +38,17 @@ ck_pr_cas_64_2_value(uint64_t target[2], uint64_t compare[2], uint64_t set[2], u
uint64_t tmp1, tmp2;
__asm__ __volatile__("1:"
"ldxp %0, %1, [%4];"
"mov %2, %0;"
"mov %3, %1;"
"eor %0, %0, %5;"
"eor %1, %1, %6;"
"orr %1, %0, %1;"
"mov %w0, #0;"
"cbnz %1, 2f;"
"stxp %w0, %7, %8, [%4];"
"cbnz %w0, 1b;"
"mov %w0, #1;"
"ldxp %0, %1, [%4]\n"
"mov %2, %0\n"
"mov %3, %1\n"
"eor %0, %0, %5\n"
"eor %1, %1, %6\n"
"orr %1, %0, %1\n"
"mov %w0, #0\n"
"cbnz %1, 2f\n"
"stxp %w0, %7, %8, [%4]\n"
"cbnz %w0, 1b\n"
"mov %w0, #1\n"
"2:"
: "=&r" (tmp1), "=&r" (tmp2), "=&r" (value[0]), "=&r" (value[1])
: "r" (target), "r" (compare[0]), "r" (compare[1]), "r" (set[0]), "r" (set[1])
@ -72,15 +72,15 @@ ck_pr_cas_64_2(uint64_t target[2], uint64_t compare[2], uint64_t set[2])
uint64_t tmp1, tmp2;
__asm__ __volatile__("1:"
"ldxp %0, %1, [%2];"
"eor %0, %0, %3;"
"eor %1, %1, %4;"
"orr %1, %0, %1;"
"mov %w0, #0;"
"cbnz %1, 2f;"
"stxp %w0, %5, %6, [%2];"
"cbnz %w0, 1b;"
"mov %w0, #1;"
"ldxp %0, %1, [%2]\n"
"eor %0, %0, %3\n"
"eor %1, %1, %4\n"
"orr %1, %0, %1\n"
"mov %w0, #0\n"
"cbnz %1, 2f\n"
"stxp %w0, %5, %6, [%2]\n"
"cbnz %w0, 1b\n"
"mov %w0, #1\n"
"2:"
: "=&r" (tmp1), "=&r" (tmp2)
: "r" (target), "r" (compare[0]), "r" (compare[1]), "r" (set[0]), "r" (set[1])
@ -103,12 +103,12 @@ ck_pr_cas_ptr_2(void *target, void *compare, void *set)
{ \
T previous; \
T tmp; \
__asm__ __volatile__("1:" \
"ldxr" W " %" R "0, [%2];" \
"cmp %" R "0, %" R "4;" \
"b.ne 2f;" \
"stxr" W " %w1, %" R "3, [%2];" \
"cbnz %w1, 1b;" \
__asm__ __volatile__("1:\n" \
"ldxr" W " %" R "0, [%2]\n" \
"cmp %" R "0, %" R "4\n" \
"b.ne 2f\n" \
"stxr" W " %w1, %" R "3, [%2]\n" \
"cbnz %w1, 1b\n" \
"2:" \
: "=&r" (previous), \
"=&r" (tmp) \
@ -126,11 +126,11 @@ ck_pr_cas_ptr_2(void *target, void *compare, void *set)
T tmp; \
__asm__ __volatile__( \
"1:" \
"ldxr" W " %" R "0, [%2];" \
"cmp %" R "0, %" R "4;" \
"b.ne 2f;" \
"stxr" W " %w1, %" R "3, [%2];" \
"cbnz %w1, 1b;" \
"ldxr" W " %" R "0, [%2]\n" \
"cmp %" R "0, %" R "4\n" \
"b.ne 2f\n" \
"stxr" W " %w1, %" R "3, [%2]\n" \
"cbnz %w1, 1b\n" \
"2:" \
: "=&r" (previous), \
"=&r" (tmp) \
@ -167,9 +167,9 @@ CK_PR_CAS_S(char, char, "b", "w")
T previous; \
T tmp; \
__asm__ __volatile__("1:" \
"ldxr" W " %" R "0, [%2];" \
"stxr" W " %w1, %" R "3, [%2];"\
"cbnz %w1, 1b;" \
"ldxr" W " %" R "0, [%2]\n"\
"stxr" W " %w1, %" R "3, [%2]\n"\
"cbnz %w1, 1b\n" \
: "=&r" (previous), \
"=&r" (tmp) \
: "r" (target), \
@ -198,10 +198,10 @@ CK_PR_FAS(char, char, char, "b", "w")
T previous = 0; \
T tmp = 0; \
__asm__ __volatile__("1:" \
"ldxr" W " %" R "0, [%2];" \
I ";" \
"stxr" W " %w1, %" R "0, [%2];" \
"cbnz %w1, 1b;" \
"ldxr" W " %" R "0, [%2]\n"\
I "\n" \
"stxr" W " %w1, %" R "0, [%2]\n" \
"cbnz %w1, 1b\n" \
: "=&r" (previous), \
"=&r" (tmp) \
: "r" (target) \
@ -239,10 +239,10 @@ CK_PR_UNARY_S(char, char, "b")
T previous; \
T tmp; \
__asm__ __volatile__("1:" \
"ldxr" W " %" R "0, [%2];"\
I " %" R "0, %" R "0, %" R "3;" \
"stxr" W " %w1, %" R "0, [%2];" \
"cbnz %w1, 1b;" \
"ldxr" W " %" R "0, [%2]\n"\
I " %" R "0, %" R "0, %" R "3\n" \
"stxr" W " %w1, %" R "0, [%2]\n" \
"cbnz %w1, 1b\n" \
: "=&r" (previous), \
"=&r" (tmp) \
: "r" (target), \
@ -286,10 +286,10 @@ ck_pr_faa_ptr(void *target, uintptr_t delta)
uintptr_t previous, r, tmp;
__asm__ __volatile__("1:"
"ldxr %0, [%3];"
"add %1, %4, %0;"
"stxr %w2, %1, [%3];"
"cbnz %w2, 1b;"
"ldxr %0, [%3]\n"
"add %1, %4, %0\n"
"stxr %w2, %1, [%3]\n"
"cbnz %w2, 1b\n"
: "=&r" (previous),
"=&r" (r),
"=&r" (tmp)
@ -306,9 +306,9 @@ ck_pr_faa_64(uint64_t *target, uint64_t delta)
uint64_t previous, r, tmp;
__asm__ __volatile__("1:"
"ldxr %0, [%3];"
"add %1, %4, %0;"
"stxr %w2, %1, [%3];"
"ldxr %0, [%3]\n"
"add %1, %4, %0\n"
"stxr %w2, %1, [%3]\n"
"cbnz %w2, 1b;"
: "=&r" (previous),
"=&r" (r),
@ -326,10 +326,10 @@ ck_pr_faa_64(uint64_t *target, uint64_t delta)
{ \
T previous, r, tmp; \
__asm__ __volatile__("1:" \
"ldxr" W " %w0, [%3];" \
"add %w1, %w4, %w0;" \
"stxr" W " %w2, %w1, [%3];" \
"cbnz %w2, 1b;" \
"ldxr" W " %w0, [%3]\n" \
"add %w1, %w4, %w0\n" \
"stxr" W " %w2, %w1, [%3]\n" \
"cbnz %w2, 1b\n" \
: "=&r" (previous), \
"=&r" (r), \
"=&r" (tmp) \

View file

@ -29,6 +29,7 @@
#ifndef CK_PR_AARCH64_LSE_H
#define CK_PR_AARCH64_LSE_H
#error bite
#ifndef CK_PR_H
#error Do not include this file directly, use ck_pr.h
#endif
@ -43,10 +44,10 @@ ck_pr_cas_64_2_value(uint64_t target[2], uint64_t compare[2], uint64_t set[2], u
register uint64_t x2 __asm__ ("x2") = set[0];
register uint64_t x3 __asm__ ("x3") = set[1];
__asm__ __volatile__("casp %0, %1, %4, %5, [%6];"
"eor %2, %0, %7;"
"eor %3, %1, %8;"
"orr %2, %2, %3;"
__asm__ __volatile__("casp %0, %1, %4, %5, [%6]\n"
"eor %2, %0, %7\n"
"eor %3, %1, %8\n"
"orr %2, %2, %3\n"
: "+&r" (x0), "+&r" (x1), "=&r" (tmp1), "=&r" (tmp2)
: "r" (x2), "r" (x3), "r" (target), "r" (compare[0]), "r" (compare[1])
: "memory");
@ -74,10 +75,10 @@ ck_pr_cas_64_2(uint64_t target[2], uint64_t compare[2], uint64_t set[2])
register uint64_t x2 __asm__ ("x2") = set[0];
register uint64_t x3 __asm__ ("x3") = set[1];
__asm__ __volatile__("casp %0, %1, %2, %3, [%4];"
"eor %0, %0, %5;"
"eor %1, %1, %6;"
"orr %0, %0, %1;"
__asm__ __volatile__("casp %0, %1, %2, %3, [%4]\n"
"eor %0, %0, %5\n"
"eor %1, %1, %6\n"
"orr %0, %0, %1\n"
: "+&r" (x0), "+&r" (x1)
: "r" (x2), "r" (x3), "r" (target), "r" (compare[0]), "r" (compare[1])
: "memory");
@ -99,7 +100,7 @@ ck_pr_cas_ptr_2(void *target, void *compare, void *set)
{ \
*(T *)value = compare; \
__asm__ __volatile__( \
"cas" W " %" R "0, %" R "2, [%1];" \
"cas" W " %" R "0, %" R "2, [%1]\n"\
: "+&r" (*(T *)value) \
: "r" (target), \
"r" (set) \
@ -111,7 +112,7 @@ ck_pr_cas_ptr_2(void *target, void *compare, void *set)
{ \
T previous = compare; \
__asm__ __volatile__( \
"cas" W " %" R "0, %" R "2, [%1];" \
"cas" W " %" R "0, %" R "2, [%1]\n"\
: "+&r" (previous) \
: "r" (target), \
"r" (set) \
@ -144,7 +145,7 @@ CK_PR_CAS_S(char, char, "b", "w")
{ \
T previous; \
__asm__ __volatile__( \
"swp" W " %" R "2, %" R "0, [%1];" \
"swp" W " %" R "2, %" R "0, [%1]\n"\
: "=&r" (previous) \
: "r" (target), \
"r" (v) \
@ -169,8 +170,8 @@ CK_PR_FAS(char, char, char, "b", "w")
CK_CC_INLINE static void \
ck_pr_##O##_##N(M *target) \
{ \
__asm__ __volatile__(I ";" \
"st" S W " " R "0, [%0];" \
__asm__ __volatile__(I "\n" \
"st" S W " " R "0, [%0]\n" \
: \
: "r" (target) \
: "x0", "memory"); \
@ -204,8 +205,8 @@ CK_PR_UNARY_S(char, char, "b")
CK_CC_INLINE static void \
ck_pr_##O##_##N(M *target, T delta) \
{ \
__asm__ __volatile__(I ";" \
"st" S W " %" R "0, [%1];" \
__asm__ __volatile__(I "\n" \
"st" S W " %" R "0, [%1]\n"\
: "+&r" (delta) \
: "r" (target) \
: "memory"); \
@ -247,7 +248,7 @@ ck_pr_faa_ptr(void *target, uintptr_t delta)
uintptr_t previous;
__asm__ __volatile__(
"ldadd %2, %0, [%1];"
"ldadd %2, %0, [%1]\n"
: "=r" (previous)
: "r" (target),
"r" (delta)
@ -262,7 +263,7 @@ ck_pr_faa_64(uint64_t *target, uint64_t delta)
uint64_t previous;
__asm__ __volatile__(
"ldadd %2, %0, [%1];"
"ldadd %2, %0, [%1]\n"
: "=r" (previous)
: "r" (target),
"r" (delta)
@ -277,7 +278,7 @@ ck_pr_faa_64(uint64_t *target, uint64_t delta)
{ \
T previous; \
__asm__ __volatile__( \
"ldadd" W " %w2, %w0, [%1];" \
"ldadd" W " %w2, %w0, [%1]\n" \
: "=r" (previous) \
: "r" (target), \
"r" (delta) \

View file

@ -39,6 +39,15 @@
#define CK_CC_UNUSED __attribute__((unused))
#define CK_CC_USED __attribute__((used))
#define CK_CC_IMM "i"
#define CK_CC_CONTAINER(F, T, M, N) \
CK_CC_INLINE static T * \
N(F *p) \
{ \
\
return (T *)(void *)((char *)p - __builtin_offsetof(T, M)); \
}
#if defined(__x86_64__) || defined(__x86__)
#define CK_CC_IMM_U32 "Z"
#define CK_CC_IMM_S32 "e"
@ -103,28 +112,26 @@
#define CK_CC_TYPEOF(X, DEFAULT) __typeof__(X)
/*
* Portability wrappers for bitwise ops.
* Portability wrappers for bitwise operations.
*/
#ifndef CK_MD_CC_BUILTIN_DISABLE
#define CK_F_CC_FFS
#define CK_F_CC_CLZ
#define CK_F_CC_CTZ
#define CK_F_CC_POPCOUNT
CK_CC_INLINE static int
ck_cc_ffs(unsigned int x)
{
return __builtin_ffs(x);
return __builtin_ffsl(x);
}
#define CK_F_CC_FFSL
CK_CC_INLINE static int
ck_cc_clz(unsigned int x)
ck_cc_ffsl(unsigned long x)
{
return __builtin_clz(x);
return __builtin_ffsll(x);
}
#define CK_F_CC_CTZ
CK_CC_INLINE static int
ck_cc_ctz(unsigned int x)
{
@ -132,11 +139,12 @@ ck_cc_ctz(unsigned int x)
return __builtin_ctz(x);
}
#define CK_F_CC_POPCOUNT
CK_CC_INLINE static int
ck_cc_popcount(unsigned int x)
{
return __builtin_popcount(x);
}
#endif /* CK_MD_CC_BUILTIN_DISABLE */
#endif /* CK_GCC_CC_H */
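The CK_CC_CONTAINER macro added above is a container-of helper: given a pointer to an embedded member, it returns a pointer to the enclosing object. A hypothetical use (the struct names and node_container() are invented for illustration) might look like:

#include <ck_cc.h>

struct link {
        struct link *next;
};

struct node {
        int key;
        struct link link;
};

/* Expands to: CK_CC_INLINE static struct node *node_container(struct link *p); */
CK_CC_CONTAINER(struct link, struct node, link, node_container)

/* Recover the enclosing node from a pointer to its embedded link:
 *      struct node *n = node_container(&n0->link);
 */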

View file

@ -80,7 +80,7 @@ ck_pr_md_load_ptr(const void *target)
void *r;
ck_pr_barrier();
r = CK_CC_DECONST_PTR(CK_PR_ACCESS(target));
r = CK_CC_DECONST_PTR(*(volatile void *const*)(target));
ck_pr_barrier();
return r;
@ -91,7 +91,7 @@ ck_pr_md_store_ptr(void *target, const void *v)
{
ck_pr_barrier();
CK_PR_ACCESS(target) = CK_CC_DECONST_PTR(v);
*(volatile void **)target = CK_CC_DECONST_PTR(v);
ck_pr_barrier();
return;
}

View file

@ -67,21 +67,29 @@ ck_pr_stall(void)
__asm__ __volatile__(I ::: "memory"); \
}
CK_PR_FENCE(atomic, "lwsync")
CK_PR_FENCE(atomic_store, "lwsync")
#ifdef CK_MD_PPC32_LWSYNC
#define CK_PR_LWSYNCOP "lwsync"
#else /* CK_MD_PPC32_LWSYNC_DISABLE */
#define CK_PR_LWSYNCOP "sync"
#endif
CK_PR_FENCE(atomic, CK_PR_LWSYNCOP)
CK_PR_FENCE(atomic_store, CK_PR_LWSYNCOP)
CK_PR_FENCE(atomic_load, "sync")
CK_PR_FENCE(store_atomic, "lwsync")
CK_PR_FENCE(load_atomic, "lwsync")
CK_PR_FENCE(store, "lwsync")
CK_PR_FENCE(store_atomic, CK_PR_LWSYNCOP)
CK_PR_FENCE(load_atomic, CK_PR_LWSYNCOP)
CK_PR_FENCE(store, CK_PR_LWSYNCOP)
CK_PR_FENCE(store_load, "sync")
CK_PR_FENCE(load, "lwsync")
CK_PR_FENCE(load_store, "lwsync")
CK_PR_FENCE(load, CK_PR_LWSYNCOP)
CK_PR_FENCE(load_store, CK_PR_LWSYNCOP)
CK_PR_FENCE(memory, "sync")
CK_PR_FENCE(acquire, "lwsync")
CK_PR_FENCE(release, "lwsync")
CK_PR_FENCE(acqrel, "lwsync")
CK_PR_FENCE(lock, "lwsync")
CK_PR_FENCE(unlock, "lwsync")
CK_PR_FENCE(acquire, CK_PR_LWSYNCOP)
CK_PR_FENCE(release, CK_PR_LWSYNCOP)
CK_PR_FENCE(acqrel, CK_PR_LWSYNCOP)
CK_PR_FENCE(lock, CK_PR_LWSYNCOP)
CK_PR_FENCE(unlock, CK_PR_LWSYNCOP)
#undef CK_PR_LWSYNCOP
#undef CK_PR_FENCE

View file

@ -0,0 +1,97 @@
/* DO NOT EDIT. This is auto-generated from feature.sh */
#define CK_F_PR_ADD_32
#define CK_F_PR_ADD_64
#define CK_F_PR_ADD_INT
#define CK_F_PR_ADD_PTR
#define CK_F_PR_ADD_UINT
#define CK_F_PR_AND_32
#define CK_F_PR_AND_64
#define CK_F_PR_AND_INT
#define CK_F_PR_AND_PTR
#define CK_F_PR_AND_UINT
#define CK_F_PR_CAS_32
#define CK_F_PR_CAS_32_VALUE
#define CK_F_PR_CAS_64
#define CK_F_PR_CAS_64_VALUE
#define CK_F_PR_CAS_INT
#define CK_F_PR_CAS_INT_VALUE
#define CK_F_PR_CAS_PTR
#define CK_F_PR_CAS_PTR_VALUE
#define CK_F_PR_CAS_UINT
#define CK_F_PR_CAS_UINT_VALUE
#define CK_F_PR_DEC_32
#define CK_F_PR_DEC_64
#define CK_F_PR_DEC_INT
#define CK_F_PR_DEC_PTR
#define CK_F_PR_DEC_UINT
#define CK_F_PR_FAA_32
#define CK_F_PR_FAA_64
#define CK_F_PR_FAA_INT
#define CK_F_PR_FAA_PTR
#define CK_F_PR_FAA_UINT
#define CK_F_PR_FAS_32
#define CK_F_PR_FAS_64
#define CK_F_PR_FAS_INT
#define CK_F_PR_FAS_PTR
#define CK_F_PR_FAS_UINT
#define CK_F_PR_FAS_DOUBLE
#define CK_F_PR_FENCE_LOAD
#define CK_F_PR_FENCE_LOAD_DEPENDS
#define CK_F_PR_FENCE_MEMORY
#define CK_F_PR_FENCE_STORE
#define CK_F_PR_FENCE_STRICT_LOAD
#define CK_F_PR_FENCE_STRICT_LOAD_DEPENDS
#define CK_F_PR_FENCE_STRICT_MEMORY
#define CK_F_PR_FENCE_STRICT_STORE
#define CK_F_PR_INC_32
#define CK_F_PR_INC_64
#define CK_F_PR_INC_INT
#define CK_F_PR_INC_PTR
#define CK_F_PR_INC_UINT
#define CK_F_PR_LOAD_16
#define CK_F_PR_LOAD_32
#define CK_F_PR_LOAD_64
#define CK_F_PR_LOAD_8
#define CK_F_PR_LOAD_CHAR
#define CK_F_PR_LOAD_DOUBLE
#define CK_F_PR_LOAD_INT
#define CK_F_PR_LOAD_PTR
#define CK_F_PR_LOAD_SHORT
#define CK_F_PR_LOAD_UINT
#define CK_F_PR_NEG_32
#define CK_F_PR_NEG_64
#define CK_F_PR_NEG_INT
#define CK_F_PR_NEG_PTR
#define CK_F_PR_NEG_UINT
#define CK_F_PR_NOT_32
#define CK_F_PR_NOT_64
#define CK_F_PR_NOT_INT
#define CK_F_PR_NOT_PTR
#define CK_F_PR_NOT_UINT
#define CK_F_PR_OR_32
#define CK_F_PR_OR_64
#define CK_F_PR_OR_INT
#define CK_F_PR_OR_PTR
#define CK_F_PR_OR_UINT
#define CK_F_PR_STALL
#define CK_F_PR_STORE_16
#define CK_F_PR_STORE_32
#define CK_F_PR_STORE_64
#define CK_F_PR_STORE_8
#define CK_F_PR_STORE_CHAR
#define CK_F_PR_STORE_DOUBLE
#define CK_F_PR_STORE_INT
#define CK_F_PR_STORE_PTR
#define CK_F_PR_STORE_SHORT
#define CK_F_PR_STORE_UINT
#define CK_F_PR_SUB_32
#define CK_F_PR_SUB_64
#define CK_F_PR_SUB_INT
#define CK_F_PR_SUB_PTR
#define CK_F_PR_SUB_UINT
#define CK_F_PR_XOR_32
#define CK_F_PR_XOR_64
#define CK_F_PR_XOR_INT
#define CK_F_PR_XOR_PTR
#define CK_F_PR_XOR_UINT

373
include/gcc/s390x/ck_pr.h Normal file
View file

@ -0,0 +1,373 @@
/*
* Copyright 2009-2015 Samy Al Bahra.
* Copyright 2017 Neale Ferguson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef CK_PR_S390X_H
#define CK_PR_S390X_H
#ifndef CK_PR_H
#error Do not include this file directly, use ck_pr.h
#endif
#include <ck_cc.h>
#include <ck_md.h>
/*
* The following represent supported atomic operations.
* These operations may be emulated.
*/
#include "ck_f_pr.h"
/*
* Minimum interface requirement met.
*/
#define CK_F_PR
/*
* This bounces the hardware thread from low to medium
* priority. I am unsure of the benefits of this approach
* but it is used by the Linux kernel.
*/
CK_CC_INLINE static void
ck_pr_stall(void)
{
__sync_synchronize();
return;
}
#define CK_PR_FENCE(T) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__sync_synchronize(); \
}
/*
* These are derived from:
* http://www.ibm.com/developerworks/systems/articles/powerpc.html
*/
CK_PR_FENCE(atomic)
CK_PR_FENCE(atomic_store)
CK_PR_FENCE(atomic_load)
CK_PR_FENCE(store_atomic)
CK_PR_FENCE(load_atomic)
CK_PR_FENCE(store)
CK_PR_FENCE(store_load)
CK_PR_FENCE(load)
CK_PR_FENCE(load_store)
CK_PR_FENCE(memory)
CK_PR_FENCE(acquire)
CK_PR_FENCE(release)
CK_PR_FENCE(acqrel)
CK_PR_FENCE(lock)
CK_PR_FENCE(unlock)
#undef CK_PR_FENCE
#define CK_PR_LOAD(S, M, T, C, I) \
CK_CC_INLINE static T \
ck_pr_md_load_##S(const M *target) \
{ \
T r; \
__asm__ __volatile__(I "\t%0, %1\n" \
: "=r" (r) \
: "Q" (*(const C *)target) \
: "memory"); \
return (r); \
}
CK_PR_LOAD(ptr, void, void *, uint64_t, "lg")
#define CK_PR_LOAD_S(S, T, I) CK_PR_LOAD(S, T, T, T, I)
CK_PR_LOAD_S(64, uint64_t, "lg")
CK_PR_LOAD_S(32, uint32_t, "llgf")
CK_PR_LOAD_S(16, uint16_t, "llgh")
CK_PR_LOAD_S(8, uint8_t, "llgc")
CK_PR_LOAD_S(uint, unsigned int, "llgf")
CK_PR_LOAD_S(int, int, "llgf")
CK_PR_LOAD_S(short, short, "lgh")
CK_PR_LOAD_S(char, char, "lgb")
#ifndef CK_PR_DISABLE_DOUBLE
CK_CC_INLINE static double
ck_pr_md_load_double(const double *target)
{
double r;
__asm__ __volatile__("ld %0, %1\n"
: "=f" (r)
: "Q" (*(const double *)target)
: "memory");
return (r);
}
#endif
#undef CK_PR_LOAD_S
#undef CK_PR_LOAD
#define CK_PR_STORE(S, M, T, C, I) \
CK_CC_INLINE static void \
ck_pr_md_store_##S(M *target, T v) \
{ \
__asm__ __volatile__(I "\t%1, %0\n" \
: "=Q" (*(C *)target) \
: "r" (v) \
: "memory"); \
return; \
}
CK_PR_STORE(ptr, void, const void *, uint64_t, "stg")
#define CK_PR_STORE_S(S, T, I) CK_PR_STORE(S, T, T, T, I)
CK_PR_STORE_S(64, uint64_t, "stg")
CK_PR_STORE_S(32, uint32_t, "st")
CK_PR_STORE_S(16, uint16_t, "sth")
CK_PR_STORE_S(8, uint8_t, "stc")
CK_PR_STORE_S(uint, unsigned int, "st")
CK_PR_STORE_S(int, int, "st")
CK_PR_STORE_S(short, short, "sth")
CK_PR_STORE_S(char, char, "stc")
#ifndef CK_PR_DISABLE_DOUBLE
CK_CC_INLINE static void
ck_pr_md_store_double(double *target, double v)
{
__asm__ __volatile__(" std %1, %0\n"
: "=Q" (*(double *)target)
: "f" (v)
: "0", "memory");
}
#endif
#undef CK_PR_STORE_S
#undef CK_PR_STORE
CK_CC_INLINE static bool
ck_pr_cas_64_value(uint64_t *target, uint64_t compare, uint64_t set, uint64_t *value)
{
*value = __sync_val_compare_and_swap(target,compare,set);
return (*value == compare);
}
CK_CC_INLINE static bool
ck_pr_cas_ptr_value(void *target, void *compare, void *set, void *value)
{
uintptr_t previous;
previous = __sync_val_compare_and_swap((uintptr_t *) target,
(uintptr_t) compare,
(uintptr_t) set);
*((uintptr_t *) value) = previous;
return (previous == (uintptr_t) compare);
}
CK_CC_INLINE static bool
ck_pr_cas_64(uint64_t *target, uint64_t compare, uint64_t set)
{
return(__sync_bool_compare_and_swap(target,compare,set));
}
CK_CC_INLINE static bool
ck_pr_cas_ptr(void *target, void *compare, void *set)
{
return(__sync_bool_compare_and_swap((uintptr_t *) target,
(uintptr_t) compare,
(uintptr_t) set));
}
#define CK_PR_CAS(N, T) \
CK_CC_INLINE static bool \
ck_pr_cas_##N##_value(T *target, T compare, T set, T *value) \
{ \
*value = __sync_val_compare_and_swap(target, \
compare, \
set); \
return(*value == compare); \
} \
CK_CC_INLINE static bool \
ck_pr_cas_##N(T *target, T compare, T set) \
{ \
return(__sync_bool_compare_and_swap(target, \
compare, \
set)); \
}
CK_PR_CAS(32, uint32_t)
CK_PR_CAS(uint, unsigned int)
CK_PR_CAS(int, int)
#undef CK_PR_CAS
CK_CC_INLINE static void *
ck_pr_fas_ptr(void *target, void *v)
{
return((void *)__atomic_exchange_n((uintptr_t *) target, (uintptr_t) v, __ATOMIC_ACQUIRE));
}
#define CK_PR_FAS(N, M, T) \
CK_CC_INLINE static T \
ck_pr_fas_##N(M *target, T v) \
{ \
return(__atomic_exchange_n(target, v, __ATOMIC_ACQUIRE)); \
}
CK_PR_FAS(64, uint64_t, uint64_t)
CK_PR_FAS(32, uint32_t, uint32_t)
CK_PR_FAS(int, int, int)
CK_PR_FAS(uint, unsigned int, unsigned int)
#ifndef CK_PR_DISABLE_DOUBLE
CK_CC_INLINE static double
ck_pr_fas_double(double *target, double *v)
{
double previous;
__asm__ __volatile__ (" lg 1,%2\n"
"0: lg 0,%1\n"
" csg 0,1,%1\n"
" jnz 0b\n"
" ldgr %0,0\n"
: "=f" (previous)
: "Q" (target), "Q" (v)
: "0", "1", "cc", "memory");
return (previous);
}
#endif
#undef CK_PR_FAS
/*
* Atomic store-only binary operations.
*/
#define CK_PR_BINARY(K, S, M, T) \
CK_CC_INLINE static void \
ck_pr_##K##_##S(M *target, T d) \
{ \
d = __sync_fetch_and_##K((T *)target, d); \
return; \
}
#define CK_PR_BINARY_S(K, S, T) CK_PR_BINARY(K, S, T, T)
#define CK_PR_GENERATE(K) \
CK_PR_BINARY(K, ptr, void, void *) \
CK_PR_BINARY_S(K, char, char) \
CK_PR_BINARY_S(K, int, int) \
CK_PR_BINARY_S(K, uint, unsigned int) \
CK_PR_BINARY_S(K, 64, uint64_t) \
CK_PR_BINARY_S(K, 32, uint32_t) \
CK_PR_BINARY_S(K, 16, uint16_t) \
CK_PR_BINARY_S(K, 8, uint8_t)
CK_PR_GENERATE(add)
CK_PR_GENERATE(sub)
CK_PR_GENERATE(and)
CK_PR_GENERATE(or)
CK_PR_GENERATE(xor)
#undef CK_PR_GENERATE
#undef CK_PR_BINARY_S
#undef CK_PR_BINARY
#define CK_PR_UNARY(S, M, T) \
CK_CC_INLINE static void \
ck_pr_inc_##S(M *target) \
{ \
ck_pr_add_##S(target, (T)1); \
return; \
} \
CK_CC_INLINE static void \
ck_pr_dec_##S(M *target) \
{ \
ck_pr_sub_##S(target, (T)1); \
return; \
}
#define CK_PR_UNARY_X(S, M) \
CK_CC_INLINE static void \
ck_pr_not_##S(M *target) \
{ \
M newval; \
do { \
newval = ~(*target); \
} while (!__sync_bool_compare_and_swap(target, \
*target, \
newval)); \
} \
CK_CC_INLINE static void \
ck_pr_neg_##S(M *target) \
{ \
M newval; \
do { \
newval = -(*target); \
} while (!__sync_bool_compare_and_swap(target, \
*target, \
newval)); \
}
#define CK_PR_UNARY_S(S, M) CK_PR_UNARY(S, M, M) \
CK_PR_UNARY_X(S, M)
CK_PR_UNARY(ptr, void, void *)
CK_PR_UNARY_S(char, char)
CK_PR_UNARY_S(int, int)
CK_PR_UNARY_S(uint, unsigned int)
CK_PR_UNARY_S(64, uint64_t)
CK_PR_UNARY_S(32, uint32_t)
CK_PR_UNARY_S(16, uint16_t)
CK_PR_UNARY_S(8, uint8_t)
#undef CK_PR_UNARY_S
#undef CK_PR_UNARY
CK_CC_INLINE static void *
ck_pr_faa_ptr(void *target, uintptr_t delta)
{
uintptr_t previous;
previous = __sync_fetch_and_add((uintptr_t *) target, delta);
return (void *)(previous);
}
#define CK_PR_FAA(S, T) \
CK_CC_INLINE static T \
ck_pr_faa_##S(T *target, T delta) \
{ \
T previous; \
\
previous = __sync_fetch_and_add(target, delta); \
\
return (previous); \
}
CK_PR_FAA(64, uint64_t)
CK_PR_FAA(32, uint32_t)
CK_PR_FAA(uint, unsigned int)
CK_PR_FAA(int, int)
#undef CK_PR_FAA
#endif /* CK_PR_S390X_H */

View file

@ -76,7 +76,7 @@ CK_PR_FENCE(store, "membar #StoreStore")
CK_PR_FENCE(store_load, "membar #StoreLoad")
CK_PR_FENCE(load, "membar #LoadLoad")
CK_PR_FENCE(load_store, "membar #LoadStore")
CK_PR_FENCE(memory, "membar #LoadLoad | #LoadStore | #StoreStore | #StoreLoad")
CK_PR_FENCE(memory, "membar #MemIssue")
CK_PR_FENCE(acquire, "membar #LoadLoad | #LoadStore")
CK_PR_FENCE(release, "membar #LoadStore | #StoreStore")
CK_PR_FENCE(acqrel, "membar #LoadLoad | #LoadStore | #StoreStore")
@ -136,11 +136,26 @@ CK_PR_STORE_S(int, int, "stsw")
#undef CK_PR_STORE_S
#undef CK_PR_STORE
/* Use the appropriate address space for atomics within the FreeBSD kernel. */
#if defined(__FreeBSD__) && defined(_KERNEL)
#include <sys/cdefs.h>
#include <machine/atomic.h>
#define CK_PR_INS_CAS "casa"
#define CK_PR_INS_CASX "casxa"
#define CK_PR_INS_SWAP "swapa"
#define CK_PR_ASI_ATOMIC __XSTRING(__ASI_ATOMIC)
#else
#define CK_PR_INS_CAS "cas"
#define CK_PR_INS_CASX "casx"
#define CK_PR_INS_SWAP "swap"
#define CK_PR_ASI_ATOMIC ""
#endif
CK_CC_INLINE static bool
ck_pr_cas_64_value(uint64_t *target, uint64_t compare, uint64_t set, uint64_t *value)
{
__asm__ __volatile__("casx [%1], %2, %0"
__asm__ __volatile__(CK_PR_INS_CASX " [%1] " CK_PR_ASI_ATOMIC ", %2, %0"
: "+&r" (set)
: "r" (target),
"r" (compare)
@ -154,7 +169,7 @@ CK_CC_INLINE static bool
ck_pr_cas_64(uint64_t *target, uint64_t compare, uint64_t set)
{
__asm__ __volatile__("casx [%1], %2, %0"
__asm__ __volatile__(CK_PR_INS_CASX " [%1] " CK_PR_ASI_ATOMIC ", %2, %0"
: "+&r" (set)
: "r" (target),
"r" (compare)
@ -181,7 +196,7 @@ ck_pr_cas_ptr_value(void *target, void *compare, void *set, void *previous)
CK_CC_INLINE static bool \
ck_pr_cas_##N##_value(T *target, T compare, T set, T *value) \
{ \
__asm__ __volatile__("cas [%1], %2, %0" \
__asm__ __volatile__(CK_PR_INS_CAS " [%1] " CK_PR_ASI_ATOMIC ", %2, %0" \
: "+&r" (set) \
: "r" (target), \
"r" (compare) \
@ -192,7 +207,7 @@ ck_pr_cas_ptr_value(void *target, void *compare, void *set, void *previous)
CK_CC_INLINE static bool \
ck_pr_cas_##N(T *target, T compare, T set) \
{ \
__asm__ __volatile__("cas [%1], %2, %0" \
__asm__ __volatile__(CK_PR_INS_CAS " [%1] " CK_PR_ASI_ATOMIC ", %2, %0" \
: "+&r" (set) \
: "r" (target), \
"r" (compare) \
@ -211,7 +226,7 @@ CK_PR_CAS(int, int)
ck_pr_fas_##N(T *target, T update) \
{ \
\
__asm__ __volatile__("swap [%1], %0" \
__asm__ __volatile__(CK_PR_INS_SWAP " [%1] " CK_PR_ASI_ATOMIC ", %0" \
: "+&r" (update) \
: "r" (target) \
: "memory"); \
@ -224,5 +239,10 @@ CK_PR_FAS(32, uint32_t)
#undef CK_PR_FAS
#undef CK_PR_INS_CAS
#undef CK_PR_INS_CASX
#undef CK_PR_INS_SWAP
#undef CK_PR_ASI_ATOMIC
#endif /* CK_PR_SPARCV9_H */

View file

@ -45,15 +45,9 @@
/* Minimum requirements for the CK_PR interface are met. */
#define CK_F_PR
#ifdef CK_MD_UMP
#define CK_PR_LOCK_PREFIX
#else
#define CK_PR_LOCK_PREFIX "lock "
#endif
/*
* Prevent speculative execution in busy-wait loops (P4 <=)
* or "predefined delay".
* Prevent speculative execution in busy-wait loops (P4 <=) or "predefined
* delay".
*/
CK_CC_INLINE static void
ck_pr_stall(void)
@ -62,28 +56,52 @@ ck_pr_stall(void)
return;
}
#ifdef CK_MD_UMP
#define CK_PR_LOCK_PREFIX
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__("" ::: "memory"); \
return; \
}
#else
#define CK_PR_LOCK_PREFIX "lock "
#define CK_PR_FENCE(T, I) \
CK_CC_INLINE static void \
ck_pr_fence_strict_##T(void) \
{ \
__asm__ __volatile__(I ::: "memory"); \
return; \
}
#endif /* CK_MD_UMP */
CK_PR_FENCE(atomic, "sfence")
CK_PR_FENCE(atomic_store, "sfence")
CK_PR_FENCE(atomic_load, "mfence")
CK_PR_FENCE(store_atomic, "sfence")
CK_PR_FENCE(load_atomic, "mfence")
CK_PR_FENCE(load, "lfence")
CK_PR_FENCE(load_store, "mfence")
CK_PR_FENCE(store, "sfence")
CK_PR_FENCE(store_load, "mfence")
CK_PR_FENCE(memory, "mfence")
CK_PR_FENCE(release, "mfence")
CK_PR_FENCE(acquire, "mfence")
CK_PR_FENCE(acqrel, "mfence")
CK_PR_FENCE(lock, "mfence")
CK_PR_FENCE(unlock, "mfence")
#if defined(CK_MD_SSE_DISABLE)
/* If SSE is disabled, then use atomic operations for serialization. */
#define CK_MD_X86_MFENCE "lock addl $0, (%%esp)"
#define CK_MD_X86_SFENCE CK_MD_X86_MFENCE
#define CK_MD_X86_LFENCE CK_MD_X86_MFENCE
#else
#define CK_MD_X86_SFENCE "sfence"
#define CK_MD_X86_LFENCE "lfence"
#define CK_MD_X86_MFENCE "mfence"
#endif /* !CK_MD_SSE_DISABLE */
CK_PR_FENCE(atomic, "")
CK_PR_FENCE(atomic_store, "")
CK_PR_FENCE(atomic_load, "")
CK_PR_FENCE(store_atomic, "")
CK_PR_FENCE(load_atomic, "")
CK_PR_FENCE(load, CK_MD_X86_LFENCE)
CK_PR_FENCE(load_store, CK_MD_X86_MFENCE)
CK_PR_FENCE(store, CK_MD_X86_SFENCE)
CK_PR_FENCE(store_load, CK_MD_X86_MFENCE)
CK_PR_FENCE(memory, CK_MD_X86_MFENCE)
CK_PR_FENCE(release, CK_MD_X86_MFENCE)
CK_PR_FENCE(acquire, CK_MD_X86_MFENCE)
CK_PR_FENCE(acqrel, CK_MD_X86_MFENCE)
CK_PR_FENCE(lock, CK_MD_X86_MFENCE)
CK_PR_FENCE(unlock, CK_MD_X86_MFENCE)
#undef CK_PR_FENCE
@ -215,18 +233,18 @@ CK_PR_FAA_S(8, uint8_t, "xaddb")
}
#define CK_PR_UNARY_V(K, S, T, C, I) \
CK_CC_INLINE static void \
ck_pr_##K##_##S##_zero(T *target, bool *r) \
CK_CC_INLINE static bool \
ck_pr_##K##_##S##_is_zero(T *target) \
{ \
bool ret; \
__asm__ __volatile__(CK_PR_LOCK_PREFIX I " %0; setz %1" \
: "+m" (*(C *)target), \
"=m" (*r) \
"=qm" (ret) \
: \
: "memory", "cc"); \
return; \
return ret; \
}
#define CK_PR_UNARY_S(K, S, T, I) CK_PR_UNARY(K, S, T, T, I)
#define CK_PR_GENERATE(K) \
@ -289,8 +307,38 @@ CK_PR_GENERATE(xor)
#undef CK_PR_BINARY
/*
* Atomic compare and swap.
* Atomic compare and swap, with a variant that sets *v to the old value of target.
*/
#ifdef __GCC_ASM_FLAG_OUTPUTS__
#define CK_PR_CAS(S, M, T, C, I) \
CK_CC_INLINE static bool \
ck_pr_cas_##S(M *target, T compare, T set) \
{ \
bool z; \
__asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0" \
: "+m" (*(C *)target), \
"=@ccz" (z), \
/* RAX is clobbered by cmpxchg. */ \
"+a" (compare) \
: "q" (set) \
: "memory", "cc"); \
return z; \
} \
\
CK_CC_INLINE static bool \
ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \
{ \
bool z; \
__asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0;" \
: "+m" (*(C *)target), \
"=@ccz" (z), \
"+a" (compare) \
: "q" (set) \
: "memory", "cc"); \
*(T *)v = compare; \
return z; \
}
#else
#define CK_PR_CAS(S, M, T, C, I) \
CK_CC_INLINE static bool \
ck_pr_cas_##S(M *target, T compare, T set) \
@ -303,7 +351,23 @@ CK_PR_GENERATE(xor)
"a" (compare) \
: "memory", "cc"); \
return z; \
} \
\
CK_CC_INLINE static bool \
ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \
{ \
bool z; \
__asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0;" \
"setz %1;" \
: "+m" (*(C *)target), \
"=q" (z), \
"+a" (compare) \
: "q" (set) \
: "memory", "cc"); \
*(T *)v = compare; \
return z; \
}
#endif
CK_PR_CAS(ptr, void, void *, char, "cmpxchgl")
@ -319,41 +383,6 @@ CK_PR_CAS_S(8, uint8_t, "cmpxchgb")
#undef CK_PR_CAS_S
#undef CK_PR_CAS
/*
* Compare and swap, set *v to old value of target.
*/
#define CK_PR_CAS_O(S, M, T, C, I, R) \
CK_CC_INLINE static bool \
ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \
{ \
bool z; \
__asm__ __volatile__(CK_PR_LOCK_PREFIX "cmpxchg" I " %3, %0;" \
"mov %% " R ", %2;" \
"setz %1;" \
: "+m" (*(C *)target), \
"=a" (z), \
"=m" (*(C *)v) \
: "q" (set), \
"a" (compare) \
: "memory", "cc"); \
return (bool)z; \
}
CK_PR_CAS_O(ptr, void, void *, char, "l", "eax")
#define CK_PR_CAS_O_S(S, T, I, R) \
CK_PR_CAS_O(S, T, T, T, I, R)
CK_PR_CAS_O_S(char, char, "b", "al")
CK_PR_CAS_O_S(int, int, "l", "eax")
CK_PR_CAS_O_S(uint, unsigned int, "l", "eax")
CK_PR_CAS_O_S(32, uint32_t, "l", "eax")
CK_PR_CAS_O_S(16, uint16_t, "w", "ax")
CK_PR_CAS_O_S(8, uint8_t, "b", "al")
#undef CK_PR_CAS_O_S
#undef CK_PR_CAS_O
/*
* Atomic bit test operations.
*/
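For context, the CK_PR_UNARY_V rework above replaces the old ck_pr_<op>_<type>_zero(target, bool *r) out-parameter form with ck_pr_<op>_<type>_is_zero(target), which returns the zero-flag result directly. A hedged sketch of the new style, assuming the dec/uint instantiation is generated as in mainline ck (the object type and free_object() are hypothetical):

#include <ck_pr.h>

struct object {
        unsigned int refcount;
        /* ... */
};

extern void free_object(struct object *);

static void
object_release(struct object *o)
{
        /* Atomically drop a reference; destroy the object when it hits zero. */
        if (ck_pr_dec_uint_is_zero(&o->refcount) == true)
                free_object(o);
}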

View file

@ -58,8 +58,8 @@
#endif
/*
* Prevent speculative execution in busy-wait loops (P4 <=)
* or "predefined delay".
* Prevent speculative execution in busy-wait loops (P4 <=) or "predefined
* delay".
*/
CK_CC_INLINE static void
ck_pr_stall(void)
@ -75,18 +75,39 @@ ck_pr_stall(void)
__asm__ __volatile__(I ::: "memory"); \
}
CK_PR_FENCE(atomic, "sfence")
CK_PR_FENCE(atomic_store, "sfence")
CK_PR_FENCE(atomic_load, "mfence")
CK_PR_FENCE(store_atomic, "sfence")
CK_PR_FENCE(load_atomic, "mfence")
/* Atomic operations are always serializing. */
CK_PR_FENCE(atomic, "")
CK_PR_FENCE(atomic_store, "")
CK_PR_FENCE(atomic_load, "")
CK_PR_FENCE(store_atomic, "")
CK_PR_FENCE(load_atomic, "")
/* Traditional fence interface. */
CK_PR_FENCE(load, "lfence")
CK_PR_FENCE(load_store, "mfence")
CK_PR_FENCE(store, "sfence")
CK_PR_FENCE(store_load, "mfence")
CK_PR_FENCE(memory, "mfence")
/* Below are stdatomic-style fences. */
/*
* Provides load-store and store-store ordering. However, Intel specifies that
* the WC memory model is relaxed. It is likely an sfence *is* sufficient (in
* particular, stores are not re-ordered with respect to prior loads and it is
* really just the stores that are subject to re-ordering). However, we take
* the conservative route as the manuals are too ambiguous for my taste.
*/
CK_PR_FENCE(release, "mfence")
/*
* Provides load-load and load-store ordering. The lfence instruction ensures
* all prior load operations are complete before any subsequent instructions
 * actually begin execution. However, the manual also goes on to describe
 * WC memory as a relaxed model.
*/
CK_PR_FENCE(acquire, "mfence")
CK_PR_FENCE(acqrel, "mfence")
CK_PR_FENCE(lock, "mfence")
CK_PR_FENCE(unlock, "mfence")
@ -311,18 +332,18 @@ CK_PR_FAA_S(8, uint8_t, "xaddb")
}
#define CK_PR_UNARY_V(K, S, T, C, I) \
CK_CC_INLINE static void \
ck_pr_##K##_##S##_zero(T *target, bool *r) \
CK_CC_INLINE static bool \
ck_pr_##K##_##S##_is_zero(T *target) \
{ \
bool ret; \
__asm__ __volatile__(CK_PR_LOCK_PREFIX I " %0; setz %1" \
: "+m" (*(C *)target), \
"=m" (*r) \
"=rm" (ret) \
: \
: "memory", "cc"); \
return; \
return ret; \
}
#define CK_PR_UNARY_S(K, S, T, I) CK_PR_UNARY(K, S, T, T, I)
#define CK_PR_GENERATE(K) \
@ -387,8 +408,38 @@ CK_PR_GENERATE(xor)
#undef CK_PR_BINARY
/*
* Atomic compare and swap.
* Atomic compare and swap, with a variant that sets *v to the old value of target.
*/
#ifdef __GCC_ASM_FLAG_OUTPUTS__
#define CK_PR_CAS(S, M, T, C, I) \
CK_CC_INLINE static bool \
ck_pr_cas_##S(M *target, T compare, T set) \
{ \
bool z; \
__asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0" \
: "+m" (*(C *)target), \
"=@ccz" (z), \
/* RAX is clobbered by cmpxchg. */ \
"+a" (compare) \
: "q" (set) \
: "memory", "cc"); \
return z; \
} \
\
CK_CC_INLINE static bool \
ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \
{ \
bool z; \
__asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0;" \
: "+m" (*(C *)target), \
"=@ccz" (z), \
"+a" (compare) \
: "q" (set) \
: "memory", "cc"); \
*(T *)v = compare; \
return z; \
}
#else
#define CK_PR_CAS(S, M, T, C, I) \
CK_CC_INLINE static bool \
ck_pr_cas_##S(M *target, T compare, T set) \
@ -401,7 +452,23 @@ CK_PR_GENERATE(xor)
"a" (compare) \
: "memory", "cc"); \
return z; \
} \
\
CK_CC_INLINE static bool \
ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \
{ \
bool z; \
__asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0;" \
"setz %1;" \
: "+m" (*(C *)target), \
"=q" (z), \
"+a" (compare) \
: "q" (set) \
: "memory", "cc"); \
*(T *)v = compare; \
return z; \
}
#endif
CK_PR_CAS(ptr, void, void *, char, "cmpxchgq")
@ -421,45 +488,6 @@ CK_PR_CAS_S(8, uint8_t, "cmpxchgb")
#undef CK_PR_CAS_S
#undef CK_PR_CAS
/*
* Compare and swap, set *v to old value of target.
*/
#define CK_PR_CAS_O(S, M, T, C, I, R) \
CK_CC_INLINE static bool \
ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \
{ \
bool z; \
__asm__ __volatile__(CK_PR_LOCK_PREFIX "cmpxchg" I " %3, %0;" \
"mov %% " R ", %2;" \
"setz %1;" \
: "+m" (*(C *)target), \
"=a" (z), \
"=m" (*(C *)v) \
: "q" (set), \
"a" (compare) \
: "memory", "cc"); \
return z; \
}
CK_PR_CAS_O(ptr, void, void *, char, "q", "rax")
#define CK_PR_CAS_O_S(S, T, I, R) \
CK_PR_CAS_O(S, T, T, T, I, R)
CK_PR_CAS_O_S(char, char, "b", "al")
CK_PR_CAS_O_S(int, int, "l", "eax")
CK_PR_CAS_O_S(uint, unsigned int, "l", "eax")
#ifndef CK_PR_DISABLE_DOUBLE
CK_PR_CAS_O_S(double, double, "q", "rax")
#endif
CK_PR_CAS_O_S(64, uint64_t, "q", "rax")
CK_PR_CAS_O_S(32, uint32_t, "l", "eax")
CK_PR_CAS_O_S(16, uint16_t, "w", "ax")
CK_PR_CAS_O_S(8, uint8_t, "b", "al")
#undef CK_PR_CAS_O_S
#undef CK_PR_CAS_O
/*
* Contrary to C-interface, alignment requirements are that of uint64_t[2].
*/
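Since the value-returning CAS variants are now generated alongside the plain ones (rather than via the removed CK_PR_CAS_O block), a typical retry loop looks like the following sketch; the shared counter is a made-up example, not part of the change:

#include <ck_pr.h>

static int shared;

static void
fetch_and_add_via_cas(int delta)
{
        int snapshot, update;

        do {
                snapshot = ck_pr_load_int(&shared);
                update = snapshot + delta;
                /* ck_pr_cas_int() returns true only if the swap occurred. */
        } while (ck_pr_cas_int(&shared, snapshot, update) == false);
}

/*
 * ck_pr_cas_int_value() additionally writes the value observed in memory to
 * its last argument, letting a retry loop avoid a separate reload of the
 * target.
 */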

View file

@ -111,7 +111,8 @@ ck_spinlock_dec_lock_eb(struct ck_spinlock_dec *lock)
if (r == true)
break;
ck_backoff_eb(&backoff);
while (ck_pr_load_uint(&lock->value) != 1)
ck_backoff_eb(&backoff);
}
ck_pr_fence_lock();

View file

@ -77,10 +77,11 @@ CK_CC_INLINE static void
ck_spinlock_fas_lock(struct ck_spinlock_fas *lock)
{
while (ck_pr_fas_uint(&lock->value, true) == true) {
while (ck_pr_load_uint(&lock->value) == true)
ck_pr_stall();
}
while (CK_CC_UNLIKELY(ck_pr_fas_uint(&lock->value, true) == true)) {
do {
ck_pr_stall();
} while (ck_pr_load_uint(&lock->value) == true);
}
ck_pr_fence_lock();
return;
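For context on the test-and-test-and-set loop above, basic use of the FAS spinlock is sketched below (illustrative only):

#include <ck_spinlock.h>

static ck_spinlock_fas_t lock = CK_SPINLOCK_FAS_INITIALIZER;

static void
critical_section(void)
{
        ck_spinlock_fas_lock(&lock);
        /* ... work protected by the lock ... */
        ck_spinlock_fas_unlock(&lock);
}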

View file

@ -81,6 +81,8 @@ ck_spinlock_hclh_lock(struct ck_spinlock_hclh **glob_queue,
thread->wait = true;
thread->splice = false;
thread->cluster_id = (*local_queue)->cluster_id;
/* Make sure previous->previous doesn't appear to be NULL */
thread->previous = *local_queue;
/* Serialize with respect to update of local queue. */
ck_pr_fence_store_atomic();
@ -91,13 +93,15 @@ ck_spinlock_hclh_lock(struct ck_spinlock_hclh **glob_queue,
/* Wait until previous thread from the local queue is done with lock. */
ck_pr_fence_load();
if (previous->previous != NULL &&
previous->cluster_id == thread->cluster_id) {
while (ck_pr_load_uint(&previous->wait) == true)
if (previous->previous != NULL) {
while (ck_pr_load_uint(&previous->wait) == true &&
ck_pr_load_int(&previous->cluster_id) == thread->cluster_id &&
ck_pr_load_uint(&previous->splice) == false)
ck_pr_stall();
/* We're head of the global queue, we're done */
if (ck_pr_load_uint(&previous->splice) == false)
if (ck_pr_load_int(&previous->cluster_id) == thread->cluster_id &&
ck_pr_load_uint(&previous->splice) == false)
return;
}

View file

@ -4,7 +4,9 @@ DIR=array \
bitmap \
brlock \
bytelock \
cc \
cohort \
ec \
epoch \
fifo \
hp \
@ -27,6 +29,7 @@ DIR=array \
all:
$(MAKE) -C ./ck_array/validate all
$(MAKE) -C ./ck_cc/validate all
$(MAKE) -C ./ck_cohort/validate all
$(MAKE) -C ./ck_cohort/benchmark all
$(MAKE) -C ./ck_bitmap/validate all
@ -69,9 +72,12 @@ all:
$(MAKE) -C ./ck_pflock/benchmark all
$(MAKE) -C ./ck_hp/validate all
$(MAKE) -C ./ck_hp/benchmark all
$(MAKE) -C ./ck_ec/validate all
$(MAKE) -C ./ck_ec/benchmark all
clean:
$(MAKE) -C ./ck_array/validate clean
$(MAKE) -C ./ck_cc/validate clean
$(MAKE) -C ./ck_pflock/validate clean
$(MAKE) -C ./ck_pflock/benchmark clean
$(MAKE) -C ./ck_tflock/validate clean
@ -116,6 +122,8 @@ clean:
$(MAKE) -C ./ck_pflock/benchmark clean
$(MAKE) -C ./ck_hp/validate clean
$(MAKE) -C ./ck_hp/benchmark clean
$(MAKE) -C ./ck_ec/validate clean
$(MAKE) -C ./ck_ec/benchmark clean
check: all
rc=0; \

View file

@ -159,7 +159,7 @@ test_init(bool init)
bytes = ck_bitmap_size(length);
bitmap = malloc(bytes);
memset(bitmap, random(), bytes);
memset(bitmap, common_rand(), bytes);
ck_bitmap_init(bitmap, length, init);
@ -188,7 +188,7 @@ random_init(void)
ck_bitmap_init(bitmap, length, false);
for (i = 0; i < length; i++) {
if (random() & 1) {
if (common_rand() & 1) {
ck_bitmap_set(bitmap, i);
}
}
@ -259,7 +259,7 @@ random_test(unsigned int seed)
ck_bitmap_t *x, *x_copy, *y;
unsigned int i;
srandom(seed);
common_srand(seed);
test_init(false);
test_init(true);

View file

@ -0,0 +1,17 @@
.PHONY: check clean distribution
OBJECTS=ck_cc
all: $(OBJECTS)
ck_cc: ck_cc.c ../../../include/ck_cc.h
$(CC) $(CFLAGS) -g2 -o ck_cc ck_cc.c
check: all
./ck_cc
clean:
rm -rf *~ *.o $(OBJECTS) *.dSYM *.exe
include ../../../build/regressions.build
CFLAGS+=-D_GNU_SOURCE

View file

@ -0,0 +1,37 @@
#include <ck_pr.h>
#include <limits.h>
#include <stdio.h>
#include "../../common.h"
int
main(void)
{
unsigned int x;
ck_pr_store_uint(&x, 0x10110);
if (ck_cc_ffs(0) != 0)
ck_error("ffs(0) = %d\n", ck_cc_ffs(0));
if (ck_cc_ffs(4) != 3)
ck_error("ffs(4) = %d\n", ck_cc_ffs(4));
if (ck_cc_ffs(UINT_MAX) != 1)
ck_error("ffs(UINT_MAX) = %d\n", ck_cc_ffs(UINT_MAX));
if (ck_cc_ffs(x) != 5)
ck_error("ffs(%u) = %d\n", x, ck_cc_ffs(x));
if (ck_cc_ffs(x) != ck_cc_ffsl(x) ||
ck_cc_ffsl(x) != ck_cc_ffsll(x) ||
ck_cc_ffs(x) != ck_cc_ffsll(x)) {
ck_error(" ffs = %d, ffsl = %d, ffsll = %d\n",
ck_cc_ffs(x), ck_cc_ffsl(x), ck_cc_ffsll(x));
}
if (ck_cc_ctz(x) != 4)
ck_error("ctz = %d\n", ck_cc_ctz(x));
if (ck_cc_popcount(x) != 3)
ck_error("popcount = %d\n", ck_cc_popcount(x));
return 0;
}

View file

@ -0,0 +1,18 @@
.PHONY: check clean distribution
OBJECTS=ck_ec
all: $(OBJECTS)
ck_ec: ck_ec.c ../../../include/ck_ec.h
$(CC) $(CFLAGS) ../../../src/ck_ec.c -o ck_ec ck_ec.c
check: all
./ck_ec $(CORES) 1
clean:
rm -rf *~ *.o $(OBJECTS) *.dSYM *.exe
include ../../../build/regressions.build
CFLAGS+=-D_GNU_SOURCE

View file

@ -0,0 +1,484 @@
/*
* Copyright 2018 Paul Khuong.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <assert.h>
#include <ck_cc.h>
#include <ck_ec.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include "../../common.h"
#ifndef STEPS
#define STEPS (65536 * 64)
#endif
static int gettime(const struct ck_ec_ops *, struct timespec *out);
static void wake32(const struct ck_ec_ops *, const uint32_t *);
static void wait32(const struct ck_ec_wait_state *,
const uint32_t *, uint32_t, const struct timespec *);
static void wake64(const struct ck_ec_ops *, const uint64_t *);
static void wait64(const struct ck_ec_wait_state *,
const uint64_t *, uint64_t, const struct timespec *);
static const struct ck_ec_ops test_ops = {
.gettime = gettime,
.wait32 = wait32,
.wait64 = wait64,
.wake32 = wake32,
.wake64 = wake64
};
#ifndef __linux__
static int gettime(const struct ck_ec_ops *ops, struct timespec *out)
{
(void)out;
assert(ops == &test_ops);
return -1;
}
static void wait32(const struct ck_ec_wait_state *state,
const uint32_t *address, uint32_t expected,
const struct timespec *deadline)
{
(void)address;
(void)expected;
(void)deadline;
assert(state->ops == &test_ops);
return;
}
static void wait64(const struct ck_ec_wait_state *state,
const uint64_t *address, uint64_t expected,
const struct timespec *deadline)
{
(void)address;
(void)expected;
(void)deadline;
assert(state->ops == &test_ops);
return;
}
static void wake32(const struct ck_ec_ops *ops, const uint32_t *address)
{
(void)address;
assert(ops == &test_ops);
return;
}
static void wake64(const struct ck_ec_ops *ops, const uint64_t *address)
{
(void)address;
assert(ops == &test_ops);
return;
}
#else
#include <linux/futex.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>
static int gettime(const struct ck_ec_ops *ops, struct timespec *out)
{
assert(ops == &test_ops);
return clock_gettime(CLOCK_MONOTONIC, out);
}
static void wait32(const struct ck_ec_wait_state *state,
const uint32_t *address, uint32_t expected,
const struct timespec *deadline)
{
assert(state->ops == &test_ops);
syscall(SYS_futex, address,
FUTEX_WAIT_BITSET, expected, deadline,
NULL, FUTEX_BITSET_MATCH_ANY, 0);
return;
}
static void wait64(const struct ck_ec_wait_state *state,
const uint64_t *address, uint64_t expected,
const struct timespec *deadline)
{
const void *low_half;
assert(state->ops == &test_ops);
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
low_half = address;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
low_half = (uintptr_t)address + sizeof(uint32_t);
#else
# error "__BYTE_ORDER__ must be defined."
#endif
syscall(SYS_futex, low_half,
FUTEX_WAIT_BITSET, (uint32_t)expected, deadline,
NULL, FUTEX_BITSET_MATCH_ANY, 0);
return;
}
static void wake32(const struct ck_ec_ops *ops, const uint32_t *address)
{
assert(ops == &test_ops);
syscall(SYS_futex, address,
FUTEX_WAKE, INT_MAX,
/* ignored arguments */NULL, NULL, 0);
return;
}
static void wake64(const struct ck_ec_ops *ops, const uint64_t *address)
{
const void *low_half;
assert(ops == &test_ops);
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
low_half = address;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
low_half = (uintptr_t)address + sizeof(uint32_t);
#else
# error "__BYTE_ORDER__ must be defined."
#endif
syscall(SYS_futex, low_half,
FUTEX_WAKE, INT_MAX,
/* ignored arguments */NULL, NULL, 0);
return;
}
#endif /* __linux__ */
static const struct ck_ec_mode sp = {
.ops = &test_ops,
.single_producer = true
};
static const struct ck_ec_mode mp = {
.ops = &test_ops,
.single_producer = false
};
static CK_CC_FORCE_INLINE void bench32(const struct ck_ec_mode mode)
{
ck_ec32_t ec CK_CC_CACHELINE = CK_EC_INITIALIZER;
uint64_t a;
uint64_t baseline = 1000 * 1000;
uint32_t value;
for (size_t i = 0; i < STEPS; i++) {
uint64_t s = rdtsc();
uint64_t elapsed = rdtsc() - s;
if (elapsed < baseline) {
baseline = elapsed;
}
}
/* Read value. */
a = 0;
value = 0;
for (size_t i = 0; i < STEPS / 4; i++) {
uint64_t s = rdtsc();
value ^= ck_ec32_value(&ec);
value ^= ck_ec32_value(&ec);
value ^= ck_ec32_value(&ec);
value ^= ck_ec32_value(&ec);
__asm__ volatile("" :: "r"(value));
a += rdtsc() - s - baseline;
}
printf("%s ec32_value: %" PRIu64 "\n",
(mode.single_producer ? "SP" : "MP"), a / STEPS);
/* Wait (fast path). */
a = 0;
for (size_t i = 0; i < STEPS / 4; i++) {
uint64_t s = rdtsc();
ck_ec32_wait(&ec, &mode, 1, NULL);
ck_ec32_wait(&ec, &mode, 1, NULL);
ck_ec32_wait(&ec, &mode, 1, NULL);
ck_ec32_wait(&ec, &mode, 1, NULL);
a += rdtsc() - s - baseline;
}
printf("%s ec32_wait fast: %" PRIu64 "\n",
(mode.single_producer ? "SP" : "MP"), a / STEPS);
/* trywait. */
a = 0;
for (size_t i = 0; i < STEPS / 4; i++) {
struct timespec past = { .tv_sec = 0 };
uint64_t s = rdtsc();
ck_ec32_wait(&ec, &mode, 0, &past);
ck_ec32_wait(&ec, &mode, 0, &past);
ck_ec32_wait(&ec, &mode, 0, &past);
ck_ec32_wait(&ec, &mode, 0, &past);
a += rdtsc() - s - baseline;
}
printf("%s ec32_wait timeout: %" PRIu64 "\n",
(mode.single_producer ? "SP" : "MP"), a / STEPS);
/* Inc (no waiter). */
assert(!ck_ec32_has_waiters(&ec));
a = 0;
for (size_t i = 0; i < STEPS / 4; i++) {
uint64_t s = rdtsc();
ck_ec32_inc(&ec, &mode);
ck_ec32_inc(&ec, &mode);
ck_ec32_inc(&ec, &mode);
ck_ec32_inc(&ec, &mode);
a += rdtsc() - s - baseline;
}
printf("%s ec32_inc: %" PRIu64 "\n",
(mode.single_producer ? "SP" : "MP"), a / STEPS);
/* Inc (with waiter). */
assert(!ck_ec32_has_waiters(&ec));
a = 0;
for (size_t i = 0; i < STEPS; i++) {
struct timespec past = { .tv_sec = 1 };
uint64_t s;
ck_ec32_wait(&ec, &mode, ck_ec32_value(&ec), &past);
assert(ck_ec32_has_waiters(&ec));
s = rdtsc();
ck_ec32_inc(&ec, &mode);
a += rdtsc() - s - baseline;
}
printf("%s ec32_inc slow: %" PRIu64 "\n",
(mode.single_producer ? "SP" : "MP"), a / STEPS);
/* Add (no waiter). */
assert(!ck_ec32_has_waiters(&ec));
a = 0;
for (size_t i = 0; i < STEPS / 4; i++) {
uint64_t s = rdtsc();
ck_ec32_add(&ec, &mode, i + 1);
ck_ec32_add(&ec, &mode, i + 2);
ck_ec32_add(&ec, &mode, i + 3);
ck_ec32_add(&ec, &mode, i + 4);
a += rdtsc() - s - baseline;
}
printf("%s ec32_add: %" PRIu64 "\n",
(mode.single_producer ? "SP" : "MP"), a / STEPS);
assert(!ck_ec32_has_waiters(&ec));
a = 0;
for (size_t i = 0; i < STEPS; i++) {
struct timespec past = { .tv_sec = 1 };
uint64_t s;
ck_ec32_wait(&ec, &mode, ck_ec32_value(&ec), &past);
assert(ck_ec32_has_waiters(&ec));
s = rdtsc();
ck_ec32_add(&ec, &mode, i + 1);
a += rdtsc() - s - baseline;
}
printf("%s ec32_add slow: %" PRIu64 "\n",
(mode.single_producer ? "SP" : "MP"), a / STEPS);
return;
}
#ifdef CK_F_EC64
static CK_CC_FORCE_INLINE void bench64(const struct ck_ec_mode mode)
{
ck_ec64_t ec CK_CC_CACHELINE = CK_EC_INITIALIZER;
uint64_t a;
uint64_t baseline = 1000 * 1000;
uint64_t value;
for (size_t i = 0; i < STEPS; i++) {
uint64_t s = rdtsc();
uint64_t elapsed = rdtsc() - s;
if (elapsed < baseline) {
baseline = elapsed;
}
}
/* Read value. */
a = 0;
value = 0;
for (size_t i = 0; i < STEPS / 4; i++) {
uint64_t s = rdtsc();
value ^= ck_ec64_value(&ec);
value ^= ck_ec64_value(&ec);
value ^= ck_ec64_value(&ec);
value ^= ck_ec64_value(&ec);
__asm__ volatile("" :: "r"(value));
a += rdtsc() - s - baseline;
}
printf("%s ec64_value: %" PRIu64 "\n",
(mode.single_producer ? "SP" : "MP"), a / STEPS);
/* Wait (fast path). */
a = 0;
for (size_t i = 0; i < STEPS / 4; i++) {
uint64_t s = rdtsc();
ck_ec64_wait(&ec, &mode, 1, NULL);
ck_ec64_wait(&ec, &mode, 1, NULL);
ck_ec64_wait(&ec, &mode, 1, NULL);
ck_ec64_wait(&ec, &mode, 1, NULL);
a += rdtsc() - s - baseline;
}
printf("%s ec64_wait fast: %" PRIu64 "\n",
(mode.single_producer ? "SP" : "MP"), a / STEPS);
/* trywait. */
a = 0;
for (size_t i = 0; i < STEPS / 4; i++) {
struct timespec past = { .tv_sec = 0 };
uint64_t s = rdtsc();
ck_ec64_wait(&ec, &mode, 0, &past);
ck_ec64_wait(&ec, &mode, 0, &past);
ck_ec64_wait(&ec, &mode, 0, &past);
ck_ec64_wait(&ec, &mode, 0, &past);
a += rdtsc() - s - baseline;
}
printf("%s ec64_wait timeout: %" PRIu64 "\n",
(mode.single_producer ? "SP" : "MP"), a / STEPS);
/* Inc (no waiter). */
assert(!ck_ec64_has_waiters(&ec));
a = 0;
for (size_t i = 0; i < STEPS / 4; i++) {
uint64_t s = rdtsc();
ck_ec64_inc(&ec, &mode);
ck_ec64_inc(&ec, &mode);
ck_ec64_inc(&ec, &mode);
ck_ec64_inc(&ec, &mode);
a += rdtsc() - s - baseline;
}
printf("%s ec64_inc: %" PRIu64 "\n",
(mode.single_producer ? "SP" : "MP"), a / STEPS);
/* Inc (with waiter). */
assert(!ck_ec64_has_waiters(&ec));
a = 0;
for (size_t i = 0; i < STEPS; i++) {
struct timespec past = { .tv_sec = 1 };
uint64_t s;
ck_ec64_wait(&ec, &mode, ck_ec64_value(&ec), &past);
assert(ck_ec64_has_waiters(&ec));
s = rdtsc();
ck_ec64_inc(&ec, &mode);
a += rdtsc() - s - baseline;
}
printf("%s ec64_inc slow: %" PRIu64 "\n",
(mode.single_producer ? "SP" : "MP"), a / STEPS);
/* Add (no waiter). */
assert(!ck_ec64_has_waiters(&ec));
a = 0;
for (size_t i = 0; i < STEPS / 4; i++) {
uint64_t s = rdtsc();
ck_ec64_add(&ec, &mode, i + 1);
ck_ec64_add(&ec, &mode, i + 2);
ck_ec64_add(&ec, &mode, i + 3);
ck_ec64_add(&ec, &mode, i + 4);
a += rdtsc() - s - baseline;
}
printf("%s ec64_add: %" PRIu64 "\n",
(mode.single_producer ? "SP" : "MP"), a / STEPS);
assert(!ck_ec64_has_waiters(&ec));
a = 0;
for (size_t i = 0; i < STEPS; i++) {
struct timespec past = { .tv_sec = 1 };
uint64_t s;
ck_ec64_wait(&ec, &mode, ck_ec64_value(&ec), &past);
assert(ck_ec64_has_waiters(&ec));
s = rdtsc();
ck_ec64_add(&ec, &mode, i + 1);
a += rdtsc() - s - baseline;
}
printf("%s ec64_add slow: %" PRIu64 "\n",
(mode.single_producer ? "SP" : "MP"), a / STEPS);
return;
}
#endif /* CK_F_EC64 */
int
main(void)
{
printf("SP ec32\n");
bench32(sp);
printf("\nMP ec32\n");
bench32(mp);
#ifdef CK_F_EC64
printf("\nSP ec64\n");
bench64(sp);
printf("\nMP ec64\n");
bench64(mp);
#endif /* CK_F_EC64 */
return 0;
}

View file

@ -0,0 +1,73 @@
.PHONY: check clean distribution
FUZZER ?= none
FUZZ_CFLAGS ?=
# See http://gallium.inria.fr/blog/portable-conditionals-in-makefiles/ for
# the portable conditional technique below.
none_fuzz_cflags =
libfuzzer_fuzz_cflags = -DUSE_LIBFUZZER -fsanitize=fuzzer,memory,undefined
FUZZ_CFLAGS += ${${FUZZER}_fuzz_cflags}
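# Illustration (not part of the upstream Makefile): with FUZZER=libfuzzer the
# nested reference ${${FUZZER}_fuzz_cflags} first expands ${FUZZER} to
# "libfuzzer" and then expands ${libfuzzer_fuzz_cflags}, so the libFuzzer
# flags are appended; with the default FUZZER=none it expands to the empty
# ${none_fuzz_cflags}. This avoids ifeq/ifdef, which are not portable across
# make implementations.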
OBJECTS = ck_ec_smoke_test \
prop_test_timeutil_add \
prop_test_timeutil_add_ns \
prop_test_timeutil_cmp \
prop_test_timeutil_scale \
prop_test_value \
prop_test_wakeup \
prop_test_slow_wakeup
all: $(OBJECTS)
check: all
./ck_ec_smoke_test
# the command line arguments are only consumed by libfuzzer.
./prop_test_slow_wakeup -max_total_time=60
./prop_test_timeutil_add -max_total_time=60
./prop_test_timeutil_add_ns -max_total_time=60
./prop_test_timeutil_cmp -max_total_time=60
./prop_test_timeutil_scale -max_total_time=60
./prop_test_value -max_total_time=60
./prop_test_wakeup -max_total_time=60
quickfuzz: all
./prop_test_slow_wakeup -max_total_time=5
./prop_test_timeutil_add -max_total_time=5
./prop_test_timeutil_add_ns -max_total_time=5
./prop_test_timeutil_cmp -max_total_time=5
./prop_test_timeutil_scale -max_total_time=5
./prop_test_value -max_total_time=5
./prop_test_wakeup -max_total_time=5
ck_ec_smoke_test: ../../../src/ck_ec.c ck_ec_smoke_test.c ../../../src/ck_ec_timeutil.h ../../../include/ck_ec.h
$(CC) $(CFLAGS) -std=gnu11 ../../../src/ck_ec.c -o ck_ec_smoke_test ck_ec_smoke_test.c
prop_test_slow_wakeup: ../../../src/ck_ec.c prop_test_slow_wakeup.c ../../../src/ck_ec_timeutil.h ../../../include/ck_ec.h fuzz_harness.h
$(CC) $(CFLAGS) $(FUZZ_CFLAGS) ../../../src/ck_ec.c -o prop_test_slow_wakeup prop_test_slow_wakeup.c
prop_test_timeutil_add: ../../../src/ck_ec.c prop_test_timeutil_add.c ../../../src/ck_ec_timeutil.h ../../../include/ck_ec.h fuzz_harness.h
$(CC) $(CFLAGS) $(FUZZ_CFLAGS) ../../../src/ck_ec.c -o prop_test_timeutil_add prop_test_timeutil_add.c
prop_test_timeutil_add_ns: ../../../src/ck_ec.c prop_test_timeutil_add_ns.c ../../../src/ck_ec_timeutil.h ../../../include/ck_ec.h fuzz_harness.h
$(CC) $(CFLAGS) $(FUZZ_CFLAGS) ../../../src/ck_ec.c -o prop_test_timeutil_add_ns prop_test_timeutil_add_ns.c
prop_test_timeutil_cmp: ../../../src/ck_ec.c prop_test_timeutil_cmp.c ../../../src/ck_ec_timeutil.h ../../../include/ck_ec.h fuzz_harness.h
$(CC) $(CFLAGS) $(FUZZ_CFLAGS) ../../../src/ck_ec.c -o prop_test_timeutil_cmp prop_test_timeutil_cmp.c
prop_test_timeutil_scale: ../../../src/ck_ec.c prop_test_timeutil_scale.c ../../../src/ck_ec_timeutil.h ../../../include/ck_ec.h fuzz_harness.h
$(CC) $(CFLAGS) $(FUZZ_CFLAGS) ../../../src/ck_ec.c -o prop_test_timeutil_scale prop_test_timeutil_scale.c
prop_test_value: ../../../src/ck_ec.c prop_test_value.c ../../../src/ck_ec_timeutil.h ../../../include/ck_ec.h fuzz_harness.h
$(CC) $(CFLAGS) $(FUZZ_CFLAGS) ../../../src/ck_ec.c -o prop_test_value prop_test_value.c
prop_test_wakeup: ../../../src/ck_ec.c prop_test_wakeup.c ../../../src/ck_ec_timeutil.h ../../../include/ck_ec.h fuzz_harness.h
$(CC) $(CFLAGS) $(FUZZ_CFLAGS) ../../../src/ck_ec.c -o prop_test_wakeup prop_test_wakeup.c
clean:
rm -rf *~ *.o *.dSYM *.exe $(OBJECTS)
include ../../../build/regressions.build
CFLAGS+=$(PTHREAD_CFLAGS) -D_GNU_SOURCE

@ -0,0 +1,450 @@
#include <assert.h>
#include <ck_ec.h>
#include <ck_limits.h>
#include <ck_stdbool.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/time.h>
#include <unistd.h>
#define TIME_MAX ((time_t)((1ULL << ((sizeof(time_t) * CHAR_BIT) - 1)) - 1))
#ifndef __linux__
/* Zero-initialize to mark the ops as unavailable. */
static const struct ck_ec_ops test_ops;
#else
#include <linux/futex.h>
#include <sys/syscall.h>
#include <time.h>
static int gettime(const struct ck_ec_ops *, struct timespec *out);
static void wake32(const struct ck_ec_ops *, const uint32_t *);
static void wait32(const struct ck_ec_wait_state *, const uint32_t *,
uint32_t, const struct timespec *);
static void wake64(const struct ck_ec_ops *, const uint64_t *);
static void wait64(const struct ck_ec_wait_state *, const uint64_t *,
uint64_t, const struct timespec *);
static const struct ck_ec_ops test_ops = {
.gettime = gettime,
.wait32 = wait32,
.wait64 = wait64,
.wake32 = wake32,
.wake64 = wake64
};
static int gettime(const struct ck_ec_ops *ops, struct timespec *out)
{
assert(ops == &test_ops);
return clock_gettime(CLOCK_MONOTONIC, out);
}
static void wait32(const struct ck_ec_wait_state *state,
const uint32_t *address, uint32_t expected,
const struct timespec *deadline)
{
assert(state->ops == &test_ops);
syscall(SYS_futex, address,
FUTEX_WAIT_BITSET, expected, deadline,
NULL, FUTEX_BITSET_MATCH_ANY, 0);
return;
}
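/*
 * Linux futexes operate on 32-bit words, so the 64-bit wrappers below
 * only hand the low half of the counter to FUTEX_WAIT/FUTEX_WAKE.
 */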
static void wait64(const struct ck_ec_wait_state *state,
const uint64_t *address, uint64_t expected,
const struct timespec *deadline)
{
const void *low_half;
assert(state->ops == &test_ops);
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
low_half = address;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
low_half = (uintptr_t)address + sizeof(uint32_t);
#else
# error "__BYTE_ORDER__ must be defined."
#endif
syscall(SYS_futex, low_half,
FUTEX_WAIT_BITSET, (uint32_t)expected, deadline,
NULL, FUTEX_BITSET_MATCH_ANY, 0);
return;
}
static void wake32(const struct ck_ec_ops *ops, const uint32_t *address)
{
assert(ops == &test_ops);
syscall(SYS_futex, address,
FUTEX_WAKE, INT_MAX,
/* ignored arguments */NULL, NULL, 0);
return;
}
static void wake64(const struct ck_ec_ops *ops, const uint64_t *address)
{
const void *low_half;
assert(ops == &test_ops);
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
low_half = address;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
low_half = (uintptr_t)address + sizeof(uint32_t);
#else
# error "__BYTE_ORDER__ must be defined."
#endif
syscall(SYS_futex, low_half,
FUTEX_WAKE, INT_MAX,
/* ignored arguments */NULL, NULL, 0);
return;
}
#endif /* __linux__ */
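/*
 * The two modes below pair the ops above with the single-producer and
 * multi-producer update paths; the tests exercise both where relevant.
 */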
static const struct ck_ec_mode sp = {
.ops = &test_ops,
.single_producer = true
};
static const struct ck_ec_mode mp = {
.ops = &test_ops,
.single_producer = false
};
static void test_update_counter_32(const struct ck_ec_mode *mode)
{
struct ck_ec32 ec = CK_EC_INITIALIZER;
assert(ck_ec_value(&ec) == 0);
ck_ec_inc(&ec, mode);
assert(ck_ec_value(&ec) == 1);
uint32_t old = ck_ec_add(&ec, mode, 42);
assert(old == 1);
assert(ck_ec_value(&ec) == 43);
return;
}
#ifdef CK_F_EC64
static void test_update_counter_64(const struct ck_ec_mode *mode)
{
struct ck_ec64 ec = CK_EC_INITIALIZER;
assert(ck_ec_value(&ec) == 0);
ck_ec_inc(&ec, mode);
assert(ck_ec_value(&ec) == 1);
uint64_t old = ck_ec_add(&ec, mode, 42);
assert(old == 1);
assert(ck_ec_value(&ec) == 43);
return;
}
#endif
static void test_deadline(void)
{
struct timespec deadline;
assert(ck_ec_deadline(&deadline, &sp, NULL) == 0);
assert(deadline.tv_sec == TIME_MAX);
{
const struct timespec timeout = {
.tv_sec = 1,
.tv_nsec = 1000
};
const struct timespec no_timeout = {
.tv_sec = 0
};
struct timespec now;
assert(ck_ec_deadline(&deadline, &sp, &timeout) == 0);
assert(ck_ec_deadline(&now, &sp, &no_timeout) == 0);
double now_sec = now.tv_sec + 1e-9 * now.tv_nsec;
double deadline_sec = deadline.tv_sec + 1e-9 * deadline.tv_nsec;
assert(now_sec < deadline_sec);
assert(deadline_sec <= now_sec + 1 + 1000e-9);
}
{
const struct timespec timeout = {
.tv_sec = TIME_MAX - 1,
.tv_nsec = 1000
};
assert(ck_ec_deadline(&deadline, &sp, &timeout) == 0);
assert(deadline.tv_sec == TIME_MAX);
}
return;
}
static void test_wait_32(void)
{
struct timespec deadline = { .tv_sec = 0 };
struct ck_ec32 ec;
ck_ec_init(&ec, 1);
assert(ck_ec_value(&ec) == 1);
assert(ck_ec_wait(&ec, &sp, 2, NULL) == 0);
assert(ck_ec_wait(&ec, &sp, 1, &deadline) == -1);
{
const struct timespec timeout = { .tv_nsec = 1 };
assert(ck_ec_deadline(&deadline, &sp, &timeout) == 0);
assert(ck_ec_wait(&ec, &sp, 1, &deadline) == -1);
assert(ck_ec_has_waiters(&ec));
}
return;
}
#ifdef CK_F_EC64
static void test_wait_64(void)
{
struct timespec deadline = { .tv_sec = 0 };
struct ck_ec64 ec;
ck_ec_init(&ec, 0);
assert(ck_ec_value(&ec) == 0);
assert(ck_ec_wait(&ec, &sp, 1, NULL) == 0);
assert(ck_ec_wait(&ec, &sp, 0, &deadline) == -1);
{
const struct timespec timeout = { .tv_nsec = 1 };
assert(ck_ec_deadline(&deadline, &sp, &timeout) == 0);
assert(ck_ec_wait(&ec, &sp, 0, &deadline) == -1);
assert(ck_ec_has_waiters(&ec));
}
return;
}
#endif
static int pred(const struct ck_ec_wait_state *state,
struct timespec *deadline)
{
double initial_ts = state->start.tv_sec +
1e-9 * state->start.tv_nsec;
int *count = state->data;
printf("pred wait: %f\n",
deadline->tv_sec + 1e-9 * deadline->tv_nsec - initial_ts);
if ((*count)++ < 3) {
return 0;
}
return (*count)++;
}
/*
* Check that pred's return value is correctly bubbled up,
* and that the event count is marked as having waiters.
*/
static void test_wait_pred_32(void)
{
struct ck_ec32 ec = CK_EC_INITIALIZER;
int count = 0;
assert(!ck_ec_has_waiters(&ec));
assert(ck_ec_wait_pred(&ec, &sp, 0, pred, &count, NULL) == 4);
assert(ck_ec_has_waiters(&ec));
assert(count == 5);
return;
}
#ifdef CK_F_EC64
static int pred2(const struct ck_ec_wait_state *state,
struct timespec *deadline)
{
double initial_ts = state->start.tv_sec +
1e-9 * state->start.tv_nsec;
int *count = state->data;
printf("pred2 wait: %f\n",
deadline->tv_sec + 1e-9 * deadline->tv_nsec - initial_ts);
*deadline = state->now;
deadline->tv_sec++;
(*count)++;
return 0;
}
/*
* wait_pred_64 is nearly identical to _32. Now check that deadline
* overriding works.
*/
static void test_wait_pred_64(void)
{
const struct timespec timeout = { .tv_sec = 5 };
struct timespec deadline;
struct ck_ec64 ec = CK_EC_INITIALIZER;
int count = 0;
assert(!ck_ec_has_waiters(&ec));
assert(ck_ec_deadline(&deadline, &sp, &timeout) == 0);
assert(ck_ec_wait_pred(&ec, &sp, 0, pred2, &count, &deadline) == -1);
assert(ck_ec_has_waiters(&ec));
assert(count == 5);
return;
}
#endif
static int woken = 0;
static void *test_threaded_32_waiter(void *data)
{
struct ck_ec32 *ec = data;
ck_ec_wait(ec, &sp, 0, NULL);
ck_pr_store_int(&woken, 1);
return NULL;
}
static void test_threaded_inc_32(const struct ck_ec_mode *mode)
{
struct ck_ec32 ec = CK_EC_INITIALIZER;
pthread_t waiter;
ck_pr_store_int(&woken, 0);
pthread_create(&waiter, NULL, test_threaded_32_waiter, &ec);
usleep(10000);
assert(ck_pr_load_int(&woken) == 0);
ck_ec_inc(&ec, mode);
pthread_join(waiter, NULL);
assert(ck_pr_load_int(&woken) == 1);
return;
}
static void test_threaded_add_32(const struct ck_ec_mode *mode)
{
struct ck_ec32 ec = CK_EC_INITIALIZER;
pthread_t waiter;
ck_pr_store_int(&woken, 0);
pthread_create(&waiter, NULL, test_threaded_32_waiter, &ec);
usleep(10000);
assert(ck_pr_load_int(&woken) == 0);
ck_ec_add(&ec, mode, 4);
pthread_join(waiter, NULL);
assert(ck_pr_load_int(&woken) == 1);
return;
}
#ifdef CK_F_EC64
static void *test_threaded_64_waiter(void *data)
{
struct ck_ec64 *ec = data;
ck_ec_wait(ec, &sp, 0, NULL);
ck_pr_store_int(&woken, 1);
return NULL;
}
static void test_threaded_inc_64(const struct ck_ec_mode *mode)
{
struct ck_ec64 ec = CK_EC_INITIALIZER;
pthread_t waiter;
ck_pr_store_int(&woken, 0);
pthread_create(&waiter, NULL, test_threaded_64_waiter, &ec);
usleep(10000);
assert(ck_pr_load_int(&woken) == 0);
ck_ec_inc(&ec, mode);
pthread_join(waiter, NULL);
assert(ck_pr_load_int(&woken) == 1);
return;
}
static void test_threaded_add_64(const struct ck_ec_mode *mode)
{
struct ck_ec64 ec = CK_EC_INITIALIZER;
pthread_t waiter;
ck_pr_store_int(&woken, 0);
pthread_create(&waiter, NULL, test_threaded_64_waiter, &ec);
usleep(10000);
assert(ck_pr_load_int(&woken) == 0);
ck_ec_add(&ec, mode, 4);
pthread_join(waiter, NULL);
assert(ck_pr_load_int(&woken) == 1);
return;
}
#endif
int main(int argc, char **argv)
{
(void)argc;
(void)argv;
if (test_ops.gettime == NULL ||
test_ops.wake32 == NULL ||
test_ops.wait32 == NULL) {
printf("No ck_ec ops for this platform. Trivial success.\n");
return 0;
}
test_update_counter_32(&sp);
#ifdef CK_F_EC64
test_update_counter_64(&sp);
#endif
printf("test_update_counter SP passed.\n");
test_update_counter_32(&mp);
#ifdef CK_F_EC64
test_update_counter_64(&mp);
#endif
printf("test_update_counter MP passed.\n");
test_deadline();
printf("test_deadline passed.\n");
test_wait_32();
#ifdef CK_F_EC64
test_wait_64();
#endif
printf("test_wait passed.\n");
test_wait_pred_32();
#ifdef CK_F_EC64
test_wait_pred_64();
#endif
printf("test_wait_pred passed.\n");
test_threaded_inc_32(&sp);
test_threaded_add_32(&sp);
#ifdef CK_F_EC64
test_threaded_inc_64(&sp);
test_threaded_add_64(&sp);
#endif
printf("test_threaded SP passed.\n");
test_threaded_inc_32(&mp);
test_threaded_add_32(&mp);
#ifdef CK_F_EC64
test_threaded_inc_64(&mp);
test_threaded_add_64(&mp);
#endif
printf("test_threaded MP passed.\n");
return 0;
}

@ -0,0 +1,95 @@
#ifndef FUZZ_HARNESS_H
#define FUZZ_HARNESS_H
#include <assert.h>
#include <ck_stddef.h>
#include <ck_string.h>
#include <stdio.h>
#include <unistd.h>
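/*
 * TEST(function, examples) expands to the entry point for the selected
 * harness: a libFuzzer target, an AFL persistent-mode loop, or a plain
 * main() that only replays the static examples.
 */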
#if defined(USE_LIBFUZZER)
#define TEST(function, examples) \
void LLVMFuzzerInitialize(int *argcp, char ***argvp); \
int LLVMFuzzerTestOneInput(const void *data, size_t n); \
\
void LLVMFuzzerInitialize(int *argcp, char ***argvp) \
{ \
static char size[128]; \
static char *argv[1024]; \
int argc = *argcp; \
\
assert(argc < 1023); \
\
int r = snprintf(size, sizeof(size), \
"-max_len=%zu", sizeof(examples[0])); \
assert((size_t)r < sizeof(size)); \
\
memcpy(argv, *argvp, argc * sizeof(argv[0])); \
argv[argc++] = size; \
\
*argcp = argc; \
*argvp = argv; \
\
for (size_t i = 0; \
i < sizeof(examples) / sizeof(examples[0]); \
i++) { \
assert(function(&examples[i]) == 0); \
} \
\
return; \
} \
\
int LLVMFuzzerTestOneInput(const void *data, size_t n) \
{ \
char buf[sizeof(examples[0])]; \
\
memset(buf, 0, sizeof(buf)); \
if (n < sizeof(buf)) { \
memcpy(buf, data, n); \
} else { \
memcpy(buf, data, sizeof(buf)); \
} \
\
assert(function((const void *)buf) == 0); \
return 0; \
}
#elif defined(USE_AFL)
#define TEST(function, examples) \
int main(int argc, char **argv) \
{ \
char buf[sizeof(examples[0])]; \
\
(void)argc; \
(void)argv; \
for (size_t i = 0; \
i < sizeof(examples) / sizeof(examples[0]); \
i++) { \
assert(function(&examples[i]) == 0); \
} \
\
\
while (__AFL_LOOP(10000)) { \
memset(buf, 0, sizeof(buf)); \
read(0, buf, sizeof(buf)); \
\
assert(function((const void *)buf) == 0); \
} \
\
return 0; \
}
#else
#define TEST(function, examples) \
int main(int argc, char **argv) \
{ \
(void)argc; \
(void)argv; \
\
for (size_t i = 0; \
i < sizeof(examples) / sizeof(examples[0]); \
i++) { \
assert(function(&examples[i]) == 0); \
} \
\
return 0; \
}
#endif
#endif /* !FUZZ_HARNESS_H */

@ -0,0 +1,110 @@
#include <assert.h>
#include <ck_ec.h>
#include "fuzz_harness.h"
static int gettime(const struct ck_ec_ops *, struct timespec *out);
static void wake32(const struct ck_ec_ops *, const uint32_t *);
static void wait32(const struct ck_ec_wait_state *, const uint32_t *,
uint32_t, const struct timespec *);
static void wake64(const struct ck_ec_ops *, const uint64_t *);
static void wait64(const struct ck_ec_wait_state *, const uint64_t *,
uint64_t, const struct timespec *);
static const struct ck_ec_ops test_ops = {
.gettime = gettime,
.wait32 = wait32,
.wait64 = wait64,
.wake32 = wake32,
.wake64 = wake64
};
static int gettime(const struct ck_ec_ops *ops, struct timespec *out)
{
(void)out;
assert(ops == &test_ops);
return -1;
}
static void wait32(const struct ck_ec_wait_state *wait_state,
const uint32_t *addr, uint32_t expected,
const struct timespec *deadline)
{
(void)addr;
(void)expected;
(void)deadline;
assert(wait_state->ops == &test_ops);
return;
}
static void wait64(const struct ck_ec_wait_state *wait_state,
const uint64_t *addr, uint64_t expected,
const struct timespec *deadline)
{
(void)addr;
(void)expected;
(void)deadline;
assert(wait_state->ops == &test_ops);
return;
}
static void wake32(const struct ck_ec_ops *ops, const uint32_t *addr)
{
(void)addr;
assert(ops == &test_ops);
return;
}
static void wake64(const struct ck_ec_ops *ops, const uint64_t *addr)
{
(void)addr;
assert(ops == &test_ops);
return;
}
/*
* Check that calling ck_ec{32,64}_wake always clears the waiting bit.
*/
struct example {
uint64_t value;
};
const struct example examples[] = {
{ 0 },
{ 1 },
{ 1UL << 30 },
{ 1UL << 31 },
{ INT32_MAX },
{ INT64_MAX },
{ 1ULL << 62 },
{ 1ULL << 63 },
};
static inline int test_slow_wakeup(const struct example *example)
{
{
struct ck_ec32 ec = { .counter = example->value };
ck_ec32_wake(&ec, &test_ops);
assert(!ck_ec32_has_waiters(&ec));
}
#ifdef CK_F_EC64
{
struct ck_ec64 ec = { .counter = example->value };
ck_ec64_wake(&ec, &test_ops);
assert(!ck_ec64_has_waiters(&ec));
}
#endif /* CK_F_EC64 */
return 0;
}
TEST(test_slow_wakeup, examples)

@ -0,0 +1,101 @@
#include <assert.h>
#include <ck_limits.h>
#include <ck_stdint.h>
#include "../../../src/ck_ec_timeutil.h"
#include "fuzz_harness.h"
#if ULONG_MAX > 4294967295
typedef unsigned __int128 dword_t;
#else
typedef uint64_t dword_t;
#endif
struct example {
struct timespec ts;
struct timespec inc;
};
static const struct example examples[] = {
{
{
42,
100
},
{
1,
2
}
},
{
{
42,
100
},
{
1,
NSEC_MAX
}
},
{
{
42,
NSEC_MAX
},
{
0,
NSEC_MAX
}
},
{
{
TIME_MAX - 1,
1000
},
{
2,
NSEC_MAX
}
}
};
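/*
 * Map arbitrary fuzzer input to a valid timespec: flip negative fields
 * to non-negative values and reduce tv_nsec below NSEC_MAX + 1.
 */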
static struct timespec normalize_ts(const struct timespec ts)
{
struct timespec ret = ts;
if (ret.tv_sec < 0) {
ret.tv_sec = ~ret.tv_sec;
}
if (ret.tv_nsec < 0) {
ret.tv_nsec = ~ret.tv_nsec;
}
ret.tv_nsec %= NSEC_MAX + 1;
return ret;
}
static dword_t ts_to_nanos(const struct timespec ts)
{
return (dword_t)ts.tv_sec * (NSEC_MAX + 1) + ts.tv_nsec;
}
static inline int test_timespec_add(const struct example *example)
{
const struct timespec ts = normalize_ts(example->ts);
const struct timespec inc = normalize_ts(example->inc);
const struct timespec actual = timespec_add(ts, inc);
const dword_t nanos = ts_to_nanos(ts) + ts_to_nanos(inc);
if (nanos / (NSEC_MAX + 1) > TIME_MAX) {
assert(actual.tv_sec == TIME_MAX);
assert(actual.tv_nsec == NSEC_MAX);
} else {
assert(actual.tv_sec == (time_t)(nanos / (NSEC_MAX + 1)));
assert(actual.tv_nsec == (long)(nanos % (NSEC_MAX + 1)));
}
return 0;
}
TEST(test_timespec_add, examples)

@ -0,0 +1,88 @@
#include <assert.h>
#include "../../../src/ck_ec_timeutil.h"
#include "fuzz_harness.h"
#if ULONG_MAX > 4294967295
typedef unsigned __int128 dword_t;
#else
typedef uint64_t dword_t;
#endif
struct example {
struct timespec ts;
uint32_t ns;
};
static const struct example examples[] = {
{
{
42,
100
},
1
},
{
{
42,
100
},
2 * NSEC_MAX
},
{
{
42,
NSEC_MAX
},
NSEC_MAX
},
{
{
TIME_MAX - 1,
1000
},
2 * NSEC_MAX
}
};
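/*
 * The reference computation below clamps increments larger than NSEC_MAX
 * to one full second, which is the behaviour expected of timespec_add_ns.
 */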
static inline int test_timespec_add_ns(const struct example *example)
{
struct timespec ts = {
.tv_sec = example->ts.tv_sec,
.tv_nsec = example->ts.tv_nsec
};
const uint32_t ns = example->ns;
if (ts.tv_sec < 0) {
ts.tv_sec = ~ts.tv_sec;
}
if (ts.tv_nsec < 0) {
ts.tv_nsec = ~ts.tv_nsec;
}
ts.tv_nsec %= NSEC_MAX + 1;
const struct timespec actual = timespec_add_ns(ts, ns);
dword_t nanos =
(dword_t)ts.tv_sec * (NSEC_MAX + 1) + ts.tv_nsec;
if (ns > NSEC_MAX) {
nanos += NSEC_MAX + 1;
} else {
nanos += ns;
}
if (nanos / (NSEC_MAX + 1) > TIME_MAX) {
assert(actual.tv_sec == TIME_MAX);
assert(actual.tv_nsec == NSEC_MAX);
} else {
assert(actual.tv_sec == (time_t)(nanos / (NSEC_MAX + 1)));
assert(actual.tv_nsec == (long)(nanos % (NSEC_MAX + 1)));
}
return 0;
}
TEST(test_timespec_add_ns, examples)

@ -0,0 +1,99 @@
#include <assert.h>
#include "../../../src/ck_ec_timeutil.h"
#include "fuzz_harness.h"
#if ULONG_MAX > 4294967295
typedef __int128 dsword_t;
#else
typedef int64_t dsword_t;
#endif
struct example {
struct timespec x;
struct timespec y;
};
static const struct example examples[] = {
{
{
42,
100
},
{
1,
2
}
},
{
{
42,
100
},
{
1,
NSEC_MAX
}
},
{
{
42,
NSEC_MAX
},
{
0,
NSEC_MAX
}
},
{
{
TIME_MAX - 1,
1000
},
{
2,
NSEC_MAX
}
}
};
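/*
 * timespec_cmp is expected to agree with comparison of the total
 * nanosecond counts and to be antisymmetric.
 */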
static struct timespec normalize_ts(const struct timespec ts)
{
struct timespec ret = ts;
if (ret.tv_nsec < 0) {
ret.tv_nsec = ~ret.tv_nsec;
}
ret.tv_nsec %= NSEC_MAX + 1;
return ret;
}
static dsword_t ts_to_nanos(const struct timespec ts)
{
return (dsword_t)ts.tv_sec * (NSEC_MAX + 1) + ts.tv_nsec;
}
static inline int test_timespec_cmp(const struct example *example)
{
const struct timespec x = normalize_ts(example->y);
const struct timespec y = normalize_ts(example->x);
const dsword_t x_nanos = ts_to_nanos(x);
const dsword_t y_nanos = ts_to_nanos(y);
assert(timespec_cmp(x, x) == 0);
assert(timespec_cmp(y, y) == 0);
assert(timespec_cmp(x, y) == -timespec_cmp(y, x));
if (x_nanos == y_nanos) {
assert(timespec_cmp(x, y) == 0);
} else if (x_nanos < y_nanos) {
assert(timespec_cmp(x, y) == -1);
} else {
assert(timespec_cmp(x, y) == 1);
}
return 0;
}
TEST(test_timespec_cmp, examples)

@ -0,0 +1,41 @@
#include <assert.h>
#include "../../../src/ck_ec_timeutil.h"
#include "fuzz_harness.h"
struct example {
uint32_t nsec;
uint32_t multiplier;
unsigned int shift;
};
static const struct example examples[] = {
{
UINT32_MAX,
UINT32_MAX,
1
},
{
10,
20,
0
}
};
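/*
 * wait_time_scale(nsec, multiplier, shift) must match the 64-bit
 * product shifted right, saturating at UINT32_MAX.
 */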
static inline int test_wait_time_scale(const struct example *example)
{
const uint32_t nsec = example->nsec;
const uint32_t multiplier = example->multiplier;
const unsigned int shift = example->shift % 32;
uint32_t actual = wait_time_scale(nsec, multiplier, shift);
uint64_t expected = ((uint64_t)nsec * multiplier) >> shift;
if (expected > UINT32_MAX) {
expected = UINT32_MAX;
}
assert(actual == expected);
return 0;
}
TEST(test_wait_time_scale, examples)

@ -0,0 +1,150 @@
#include <assert.h>
#include <ck_ec.h>
#include "fuzz_harness.h"
static int gettime(const struct ck_ec_ops *, struct timespec *out);
static void wake32(const struct ck_ec_ops *, const uint32_t *);
static void wait32(const struct ck_ec_wait_state *, const uint32_t *,
uint32_t, const struct timespec *);
static void wake64(const struct ck_ec_ops *, const uint64_t *);
static void wait64(const struct ck_ec_wait_state *, const uint64_t *,
uint64_t, const struct timespec *);
static const struct ck_ec_ops test_ops = {
.gettime = gettime,
.wait32 = wait32,
.wait64 = wait64,
.wake32 = wake32,
.wake64 = wake64
};
static const struct ck_ec_mode modes[] = {
{
.single_producer = true,
.ops = &test_ops
},
{
.single_producer = false,
.ops = &test_ops
},
};
static int gettime(const struct ck_ec_ops *ops, struct timespec *out)
{
(void)out;
assert(ops == &test_ops);
return -1;
}
static void wait32(const struct ck_ec_wait_state *wait_state,
const uint32_t *addr, uint32_t expected,
const struct timespec *deadline)
{
(void)addr;
(void)expected;
(void)deadline;
assert(wait_state->ops == &test_ops);
return;
}
static void wait64(const struct ck_ec_wait_state *wait_state,
const uint64_t *addr, uint64_t expected,
const struct timespec *deadline)
{
(void)addr;
(void)expected;
(void)deadline;
assert(wait_state->ops == &test_ops);
return;
}
static void wake32(const struct ck_ec_ops *ops, const uint32_t *addr)
{
(void)addr;
assert(ops == &test_ops);
return;
}
static void wake64(const struct ck_ec_ops *ops, const uint64_t *addr)
{
(void)addr;
assert(ops == &test_ops);
return;
}
/*
* Check that adding a value correctly updates the counter, and that
* incrementing after that also works.
*/
struct example {
uint64_t value[2];
};
static const struct example examples[] = {
{ { 0, 0 } },
{ { 1, 2 } },
{ { 0, INT32_MAX - 2 } },
{ { 0, INT32_MAX - 1 } },
{ { 0, INT32_MAX } },
{ { 0, INT64_MAX - 2 } },
{ { 0, INT64_MAX - 1 } },
{ { 0, INT64_MAX } },
};
static inline int test_value(const struct example *example)
{
for (size_t i = 0; i < 2; i++) {
const struct ck_ec_mode *mode = &modes[i];
const uint32_t value0 = example->value[0] & INT32_MAX;
const uint32_t value1 = example->value[1] & INT32_MAX;
struct ck_ec32 ec;
ck_ec32_init(&ec, 0);
assert(ck_ec32_value(&ec) == 0);
ck_ec32_add(&ec, mode, value0);
assert(ck_ec32_value(&ec) == value0);
ck_ec32_add(&ec, mode, value1);
assert(ck_ec32_value(&ec) ==
((value0 + value1) & INT32_MAX));
ck_ec32_inc(&ec, mode);
assert(ck_ec32_value(&ec) ==
((value0 + value1 + 1) & INT32_MAX));
}
#ifdef CK_F_EC64
for (size_t i = 0; i < 2; i++) {
const struct ck_ec_mode *mode = &modes[i];
const uint64_t value0 = example->value[0] & INT64_MAX;
const uint64_t value1 = example->value[1] & INT64_MAX;
struct ck_ec64 ec;
ck_ec64_init(&ec, 0);
assert(ck_ec64_value(&ec) == 0);
ck_ec64_add(&ec, mode, value0);
assert(ck_ec64_value(&ec) == value0);
ck_ec64_add(&ec, mode, value1);
assert(ck_ec64_value(&ec) ==
((value0 + value1) & INT64_MAX));
ck_ec64_inc(&ec, mode);
assert(ck_ec64_value(&ec) ==
((value0 + value1 + 1) & INT64_MAX));
}
#endif /* CK_F_EC64 */
return 0;
}
TEST(test_value, examples)

View file

@ -0,0 +1,193 @@
#include <assert.h>
#include <ck_ec.h>
#include <ck_stdbool.h>
#include "fuzz_harness.h"
static int gettime(const struct ck_ec_ops *, struct timespec *out);
static void wake32(const struct ck_ec_ops *, const uint32_t *);
static void wait32(const struct ck_ec_wait_state *, const uint32_t *,
uint32_t, const struct timespec *);
static void wake64(const struct ck_ec_ops *, const uint64_t *);
static void wait64(const struct ck_ec_wait_state *, const uint64_t *,
uint64_t, const struct timespec *);
static const struct ck_ec_ops test_ops = {
.gettime = gettime,
.wait32 = wait32,
.wait64 = wait64,
.wake32 = wake32,
.wake64 = wake64
};
static const struct ck_ec_mode modes[] = {
{
.single_producer = true,
.ops = &test_ops
},
{
.single_producer = false,
.ops = &test_ops
},
};
static bool woken = false;
static int gettime(const struct ck_ec_ops *ops, struct timespec *out)
{
(void)out;
assert(ops == &test_ops);
return -1;
}
static void wait32(const struct ck_ec_wait_state *state, const uint32_t *addr,
uint32_t expected, const struct timespec *deadline)
{
(void)addr;
(void)expected;
(void)deadline;
assert(state->ops == &test_ops);
return;
}
static void wait64(const struct ck_ec_wait_state *state, const uint64_t *addr,
uint64_t expected, const struct timespec *deadline)
{
(void)addr;
(void)expected;
(void)deadline;
assert(state->ops == &test_ops);
return;
}
static void wake32(const struct ck_ec_ops *ops, const uint32_t *addr)
{
(void)addr;
assert(ops == &test_ops);
woken = true;
return;
}
static void wake64(const struct ck_ec_ops *ops, const uint64_t *addr)
{
(void)addr;
assert(ops == &test_ops);
woken = true;
return;
}
/*
* Check that adding a value calls the wake function when the sign bit
* is set, and does not call it when the sign bit is unset (modulo
* wrap-around).
*/
struct example {
uint64_t initial;
uint64_t increment;
};
const struct example examples[] = {
{ INT32_MAX, 0 },
{ INT32_MAX, 1 },
{ 0 + (0U << 31), 0 },
{ 1 + (0U << 31), 0 },
{ 0 + (1U << 31), 0 },
{ 1 + (1U << 31), 0 },
{ 0 + (0U << 31), 1 },
{ 1 + (0U << 31), 1 },
{ 0 + (1U << 31), 1 },
{ 1 + (1U << 31), 1 },
{ 0 + (0U << 31), INT32_MAX },
{ 1 + (0U << 31), INT32_MAX },
{ 0 + (1U << 31), INT32_MAX },
{ 1 + (1U << 31), INT32_MAX },
{ INT64_MAX, 0 },
{ INT64_MAX, 1 },
{ 0 + (0ULL << 63), 0 },
{ 1 + (0ULL << 63), 0 },
{ 0 + (1ULL << 63), 0 },
{ 1 + (1ULL << 63), 0 },
{ 0 + (0ULL << 63), 1 },
{ 1 + (0ULL << 63), 1 },
{ 0 + (1ULL << 63), 1 },
{ 1 + (1ULL << 63), 1 },
{ 0 + (0ULL << 63), INT64_MAX },
{ 1 + (0ULL << 63), INT64_MAX },
{ 0 + (1ULL << 63), INT64_MAX },
{ 1 + (1ULL << 63), INT64_MAX },
};
static inline int test_wakeup(const struct example *example)
{
for (size_t i = 0; i < 2; i++) {
const struct ck_ec_mode *mode = &modes[i];
const uint32_t increment = example->increment & INT32_MAX;
struct ck_ec32 ec;
bool should_wake;
bool may_wake;
ec.counter = example->initial;
should_wake = increment != 0 && (ec.counter & (1U << 31));
may_wake = should_wake || (ec.counter & (1U << 31));
woken = false;
ck_ec32_add(&ec, mode, increment);
assert(!should_wake || woken);
assert(may_wake || !woken);
assert(!woken || ck_ec32_has_waiters(&ec) == false);
/* Test inc now. */
ec.counter = example->initial + increment;
should_wake = ec.counter & (1U << 31);
may_wake = should_wake || ((ec.counter + 1) & (1U << 31));
woken = false;
ck_ec32_inc(&ec, mode);
assert(!should_wake || woken);
assert(may_wake || !woken);
assert(!woken || ck_ec32_has_waiters(&ec) == false);
}
#ifdef CK_F_EC64
for (size_t i = 0; i < 2; i++) {
const struct ck_ec_mode *mode = &modes[i];
const uint64_t increment = example->increment & INT64_MAX;
struct ck_ec64 ec;
bool should_wake;
bool may_wake;
ec.counter = example->initial;
should_wake = increment != 0 && (ec.counter & 1);
may_wake = should_wake || (ec.counter & 1);
woken = false;
ck_ec64_add(&ec, mode, increment);
assert(!should_wake || woken);
assert(may_wake || !woken);
assert(!woken || ck_ec64_has_waiters(&ec) == false);
/* Test inc now. */
ec.counter = example->initial + increment;
should_wake = ec.counter & 1;
woken = false;
ck_ec64_inc(&ec, mode);
assert(should_wake == woken);
assert(!woken || ck_ec64_has_waiters(&ec) == false);
}
#endif /* CK_F_EC64 */
return 0;
}
TEST(test_wakeup, examples)

@ -37,6 +37,7 @@ static void
cb(ck_epoch_entry_t *p)
{
/* Test that we can reregister the callback. */
if (counter == 0)
ck_epoch_call(&record[1], p, cb);
@ -50,15 +51,22 @@ int
main(void)
{
ck_epoch_entry_t entry;
ck_epoch_entry_t another;
ck_epoch_register(&epoch, &record[0]);
ck_epoch_register(&epoch, &record[1]);
ck_epoch_register(&epoch, &record[0], NULL);
ck_epoch_register(&epoch, &record[1], NULL);
ck_epoch_call(&record[1], &entry, cb);
ck_epoch_barrier(&record[1]);
ck_epoch_barrier(&record[1]);
if (counter != 2)
ck_error("Expected counter value 2, read %u.\n", counter);
/* Make sure that strict works. */
ck_epoch_call_strict(&record[1], &entry, cb);
ck_epoch_call_strict(&record[1], &another, cb);
ck_epoch_barrier(&record[1]);
if (counter != 4)
ck_error("Expected counter value 4, read %u.\n", counter);
return 0;
}

@ -86,10 +86,14 @@ static void *
read_thread(void *unused CK_CC_UNUSED)
{
unsigned int j;
ck_epoch_record_t record CK_CC_CACHELINE;
ck_epoch_record_t *record CK_CC_CACHELINE;
ck_stack_entry_t *cursor, *n;
ck_epoch_register(&stack_epoch, &record);
record = malloc(sizeof *record);
if (record == NULL)
ck_error("record allocation failure");
ck_epoch_register(&stack_epoch, record, NULL);
if (aff_iterate(&a)) {
perror("ERROR: failed to affine thread");
@ -108,7 +112,7 @@ read_thread(void *unused CK_CC_UNUSED)
j = 0;
for (;;) {
ck_epoch_begin(&record, NULL);
ck_epoch_begin(record, NULL);
CK_STACK_FOREACH(&stack, cursor) {
if (cursor == NULL)
continue;
@ -116,7 +120,7 @@ read_thread(void *unused CK_CC_UNUSED)
n = CK_STACK_NEXT(cursor);
j += ck_pr_load_ptr(&n) != NULL;
}
ck_epoch_end(&record, NULL);
ck_epoch_end(record, NULL);
if (j != 0 && ck_pr_load_uint(&readers) == 0)
ck_pr_store_uint(&readers, 1);
@ -138,10 +142,13 @@ write_thread(void *unused CK_CC_UNUSED)
{
struct node **entry, *e;
unsigned int i, j, tid;
ck_epoch_record_t record;
ck_epoch_record_t *record;
ck_stack_entry_t *s;
ck_epoch_register(&stack_epoch, &record);
record = malloc(sizeof *record);
if (record == NULL)
ck_error("record allocation failure");
ck_epoch_register(&stack_epoch, record, NULL);
if (aff_iterate(&a)) {
perror("ERROR: failed to affine thread");
@ -178,23 +185,23 @@ write_thread(void *unused CK_CC_UNUSED)
}
for (i = 0; i < PAIRS_S; i++) {
ck_epoch_begin(&record, NULL);
ck_epoch_begin(record, NULL);
s = ck_stack_pop_upmc(&stack);
e = stack_container(s);
ck_epoch_end(&record, NULL);
ck_epoch_end(record, NULL);
ck_epoch_call(&record, &e->epoch_entry, destructor);
ck_epoch_poll(&record);
ck_epoch_call(record, &e->epoch_entry, destructor);
ck_epoch_poll(record);
}
}
ck_epoch_barrier(&record);
ck_epoch_barrier(record);
if (tid == 0) {
fprintf(stderr, "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b[W] Peak: %u (%2.2f%%)\n Reclamations: %lu\n\n",
record.n_peak,
(double)record.n_peak / ((double)PAIRS_S * ITERATE_S) * 100,
record.n_dispatch);
fprintf(stderr, "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b[W] Peak: %u (%2.2f%%)\n Reclamations: %u\n\n",
record->n_peak,
(double)record->n_peak / ((double)PAIRS_S * ITERATE_S) * 100,
record->n_dispatch);
}
ck_pr_inc_uint(&e_barrier);

@ -46,8 +46,8 @@ setup_test(void)
{
ck_epoch_init(&epc);
ck_epoch_register(&epc, &record);
ck_epoch_register(&epc, &record2);
ck_epoch_register(&epc, &record, NULL);
ck_epoch_register(&epc, &record2, NULL);
cleanup_calls = 0;
return;
@ -88,7 +88,8 @@ test_simple_read_section(void)
ck_epoch_begin(&record, &section);
ck_epoch_call(&record, &entry, cleanup);
assert(cleanup_calls == 0);
ck_epoch_end(&record, &section);
if (ck_epoch_end(&record, &section) == false)
ck_error("expected no more sections");
ck_epoch_barrier(&record);
assert(cleanup_calls == 1);
@ -157,7 +158,7 @@ reader_work(void *arg)
ck_epoch_section_t section;
struct obj *o;
ck_epoch_register(&epc, &local_record);
ck_epoch_register(&epc, &local_record, NULL);
o = (struct obj *)arg;

@ -64,7 +64,7 @@ read_thread(void *unused CK_CC_UNUSED)
record = malloc(sizeof *record);
assert(record != NULL);
ck_epoch_register(&epoch, record);
ck_epoch_register(&epoch, record, NULL);
if (aff_iterate(&a)) {
perror("ERROR: failed to affine thread");
@ -110,11 +110,14 @@ read_thread(void *unused CK_CC_UNUSED)
}
ck_epoch_begin(record, &section[1]);
assert(section[0].bucket != section[1].bucket);
if (section[0].bucket == section[1].bucket) {
ck_error("%u == %u\n",
section[0].bucket, section[1].bucket);
}
ck_epoch_end(record, &section[0]);
assert(ck_pr_load_uint(&record->active) > 0);
if (ck_pr_load_uint(&record->active) == 0)
ck_error("active: %u\n", record->active);
if (ck_pr_load_uint(&leave) == 1) {
ck_epoch_end(record, &section[1]);
@ -130,10 +133,14 @@ read_thread(void *unused CK_CC_UNUSED)
static void *
write_thread(void *unused CK_CC_UNUSED)
{
ck_epoch_record_t record;
ck_epoch_record_t *record;
unsigned long iterations = 0;
ck_epoch_register(&epoch, &record);
record = malloc(sizeof *record);
if (record == NULL)
ck_error("record allocation failure");
ck_epoch_register(&epoch, record, NULL);
if (aff_iterate(&a)) {
perror("ERROR: failed to affine thread");
@ -147,7 +154,7 @@ write_thread(void *unused CK_CC_UNUSED)
if (!(iterations % 1048575))
fprintf(stderr, ".");
ck_epoch_synchronize(&record);
ck_epoch_synchronize(record);
iterations++;
if (ck_pr_load_uint(&leave) == 1)

@ -86,12 +86,15 @@ static void *
read_thread(void *unused CK_CC_UNUSED)
{
unsigned int j;
ck_epoch_record_t record CK_CC_CACHELINE;
ck_epoch_record_t *record CK_CC_CACHELINE;
ck_stack_entry_t *cursor;
ck_stack_entry_t *n;
unsigned int i;
ck_epoch_register(&stack_epoch, &record);
record = malloc(sizeof *record);
if (record == NULL)
ck_error("record allocation failure");
ck_epoch_register(&stack_epoch, record, NULL);
if (aff_iterate(&a)) {
perror("ERROR: failed to affine thread");
@ -112,7 +115,7 @@ read_thread(void *unused CK_CC_UNUSED)
for (;;) {
i = 0;
ck_epoch_begin(&record, NULL);
ck_epoch_begin(record, NULL);
CK_STACK_FOREACH(&stack, cursor) {
if (cursor == NULL)
continue;
@ -123,7 +126,7 @@ read_thread(void *unused CK_CC_UNUSED)
if (i++ > 4098)
break;
}
ck_epoch_end(&record, NULL);
ck_epoch_end(record, NULL);
if (j != 0 && ck_pr_load_uint(&readers) == 0)
ck_pr_store_uint(&readers, 1);
@ -145,10 +148,13 @@ write_thread(void *unused CK_CC_UNUSED)
{
struct node **entry, *e;
unsigned int i, j, tid;
ck_epoch_record_t record;
ck_epoch_record_t *record;
ck_stack_entry_t *s;
ck_epoch_register(&stack_epoch, &record);
record = malloc(sizeof *record);
if (record == NULL)
ck_error("record allocation failure");
ck_epoch_register(&stack_epoch, record, NULL);
if (aff_iterate(&a)) {
perror("ERROR: failed to affine thread");
@ -180,17 +186,17 @@ write_thread(void *unused CK_CC_UNUSED)
ck_pr_stall();
for (i = 0; i < PAIRS_S; i++) {
ck_epoch_begin(&record, NULL);
ck_epoch_begin(record, NULL);
s = ck_stack_pop_upmc(&stack);
e = stack_container(s);
ck_epoch_end(&record, NULL);
ck_epoch_end(record, NULL);
if (i & 1) {
ck_epoch_synchronize(&record);
ck_epoch_reclaim(&record);
ck_epoch_call(&record, &e->epoch_entry, destructor);
ck_epoch_synchronize(record);
ck_epoch_reclaim(record);
ck_epoch_call(record, &e->epoch_entry, destructor);
} else {
ck_epoch_barrier(&record);
ck_epoch_barrier(record);
destructor(&e->epoch_entry);
}
@ -201,13 +207,13 @@ write_thread(void *unused CK_CC_UNUSED)
}
}
ck_epoch_synchronize(&record);
ck_epoch_synchronize(record);
if (tid == 0) {
fprintf(stderr, "[W] Peak: %u (%2.2f%%)\n Reclamations: %lu\n\n",
record.n_peak,
(double)record.n_peak / ((double)PAIRS_S * ITERATE_S) * 100,
record.n_dispatch);
fprintf(stderr, "[W] Peak: %u (%2.2f%%)\n Reclamations: %u\n\n",
record->n_peak,
(double)record->n_peak / ((double)PAIRS_S * ITERATE_S) * 100,
record->n_dispatch);
}
ck_pr_inc_uint(&e_barrier);

@ -81,7 +81,7 @@ thread(void *unused CK_CC_UNUSED)
unsigned long smr = 0;
unsigned int i;
ck_epoch_register(&stack_epoch, &record);
ck_epoch_register(&stack_epoch, &record, NULL);
if (aff_iterate(&a)) {
perror("ERROR: failed to affine thread");
@ -118,7 +118,7 @@ thread(void *unused CK_CC_UNUSED)
while (ck_pr_load_uint(&e_barrier) < n_threads);
fprintf(stderr, "Deferrals: %lu (%2.2f)\n", smr, (double)smr / PAIRS);
fprintf(stderr, "Peak: %u (%2.2f%%), %u pending\nReclamations: %lu\n\n",
fprintf(stderr, "Peak: %u (%2.2f%%), %u pending\nReclamations: %u\n\n",
record.n_peak,
(double)record.n_peak / PAIRS * 100,
record.n_pending,

@ -31,8 +31,8 @@
#include <unistd.h>
#include <ck_cc.h>
#include <ck_pr.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <ck_epoch.h>
#include <ck_stack.h>
@ -119,7 +119,7 @@ read_thread(void *unused CK_CC_UNUSED)
record = malloc(sizeof *record);
assert(record != NULL);
ck_epoch_register(&epoch, record);
ck_epoch_register(&epoch, record, NULL);
if (aff_iterate(&a)) {
perror("ERROR: failed to affine thread");
@ -147,10 +147,11 @@ write_thread(void *unused CK_CC_UNUSED)
ck_epoch_record_t *record;
unsigned long iterations = 0;
bool c = ck_pr_faa_uint(&first, 1);
uint64_t ac = 0;
record = malloc(sizeof *record);
assert(record != NULL);
ck_epoch_register(&epoch, record);
ck_epoch_register(&epoch, record, NULL);
if (aff_iterate(&a)) {
perror("ERROR: failed to affine thread");
@ -160,6 +161,12 @@ write_thread(void *unused CK_CC_UNUSED)
ck_pr_inc_uint(&barrier);
while (ck_pr_load_uint(&barrier) < n_threads);
#define CK_EPOCH_S do { \
uint64_t _s = rdtsc(); \
ck_epoch_synchronize(record); \
ac += rdtsc() - _s; \
} while (0)
do {
/*
* A thread should never observe invalid.value > valid.value.
@ -167,33 +174,34 @@ write_thread(void *unused CK_CC_UNUSED)
* invalid.value <= valid.value is valid.
*/
if (!c) ck_pr_store_uint(&valid.value, 1);
ck_epoch_synchronize(record);
CK_EPOCH_S;
if (!c) ck_pr_store_uint(&invalid.value, 1);
ck_pr_fence_store();
if (!c) ck_pr_store_uint(&valid.value, 2);
ck_epoch_synchronize(record);
CK_EPOCH_S;
if (!c) ck_pr_store_uint(&invalid.value, 2);
ck_pr_fence_store();
if (!c) ck_pr_store_uint(&valid.value, 3);
ck_epoch_synchronize(record);
CK_EPOCH_S;
if (!c) ck_pr_store_uint(&invalid.value, 3);
ck_pr_fence_store();
if (!c) ck_pr_store_uint(&valid.value, 4);
ck_epoch_synchronize(record);
CK_EPOCH_S;
if (!c) ck_pr_store_uint(&invalid.value, 4);
ck_epoch_synchronize(record);
CK_EPOCH_S;
if (!c) ck_pr_store_uint(&invalid.value, 0);
ck_epoch_synchronize(record);
CK_EPOCH_S;
iterations += 4;
iterations += 6;
} while (ck_pr_load_uint(&leave) == 0 &&
ck_pr_load_uint(&n_rd) > 0);
fprintf(stderr, "%lu iterations\n", iterations);
fprintf(stderr, "%" PRIu64 " average latency\n", ac / iterations);
return NULL;
}

View file

@ -55,6 +55,7 @@ static struct affinity a;
static int size;
static unsigned int barrier;
static unsigned int e_barrier;
static unsigned int s_barrier;
static void *
test(void *c)
@ -98,6 +99,9 @@ test(void *c)
}
}
ck_pr_inc_uint(&s_barrier);
while (ck_pr_load_uint(&s_barrier) < (unsigned int)nthr);
for (i = 0; i < ITERATIONS; i++) {
for (j = 0; j < size; j++) {
fifo_entry = malloc(sizeof(ck_hp_fifo_entry_t));

@ -6,9 +6,9 @@
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyrighs
* 1. Redistributions of source code must retain the above copyrights
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyrighs
* 2. Redistributions in binary form must reproduce the above copyrights
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*

@ -5,9 +5,9 @@
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyrighs
* 1. Redistributions of source code must retain the above copyrights
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyrighs
* 2. Redistributions in binary form must reproduce the above copyrights
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
@ -147,7 +147,7 @@ set_init(void)
#endif
ck_epoch_init(&epoch_hs);
ck_epoch_register(&epoch_hs, &epoch_wr);
ck_epoch_register(&epoch_hs, &epoch_wr, NULL);
common_srand48((long int)time(NULL));
if (ck_hs_init(&hs, mode, hs_hash, hs_compare, &my_allocator, 65536, common_lrand48()) == false) {
perror("ck_hs_init");
@ -234,7 +234,7 @@ reader(void *unused)
perror("WARNING: Failed to affine thread");
s = j = a = 0;
ck_epoch_register(&epoch_hs, &epoch_record);
ck_epoch_register(&epoch_hs, &epoch_record, NULL);
for (;;) {
j++;
ck_epoch_begin(&epoch_record, NULL);
@ -454,8 +454,8 @@ main(int argc, char *argv[])
ck_epoch_record_t epoch_temporary = epoch_wr;
ck_epoch_synchronize(&epoch_wr);
fprintf(stderr, " '- Summary: %u pending, %u peak, %lu reclamations -> "
"%u pending, %u peak, %lu reclamations\n\n",
fprintf(stderr, " '- Summary: %u pending, %u peak, %u reclamations -> "
"%u pending, %u peak, %u reclamations\n\n",
epoch_temporary.n_pending, epoch_temporary.n_peak, epoch_temporary.n_dispatch,
epoch_wr.n_pending, epoch_wr.n_peak, epoch_wr.n_dispatch);
@ -593,8 +593,8 @@ main(int argc, char *argv[])
epoch_temporary = epoch_wr;
ck_epoch_synchronize(&epoch_wr);
fprintf(stderr, " '- Summary: %u pending, %u peak, %lu reclamations -> "
"%u pending, %u peak, %lu reclamations\n\n",
fprintf(stderr, " '- Summary: %u pending, %u peak, %u reclamations -> "
"%u pending, %u peak, %u reclamations\n\n",
epoch_temporary.n_pending, epoch_temporary.n_peak, epoch_temporary.n_dispatch,
epoch_wr.n_pending, epoch_wr.n_peak, epoch_wr.n_dispatch);
return 0;

@ -5,9 +5,9 @@
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyrighs
* 1. Redistributions of source code must retain the above copyrights
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyrighs
* 2. Redistributions in binary form must reproduce the above copyrights
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*

@ -5,9 +5,9 @@
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyrighs
* 1. Redistributions of source code must retain the above copyrights
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyrighs
* 2. Redistributions in binary form must reproduce the above copyrights
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
@ -57,12 +57,28 @@ static struct ck_malloc my_allocator = {
.free = hs_free
};
static void
stub_free(void *p, size_t b, bool r)
{
(void)b;
(void)r;
fprintf(stderr, "Ignoring reclamation of %p\n", p);
return;
}
static struct ck_malloc stub_allocator = {
.malloc = hs_malloc,
.free = stub_free
};
const char *test[] = { "Samy", "Al", "Bahra", "dances", "in", "the", "wind.", "Once",
"upon", "a", "time", "his", "gypsy", "ate", "one", "itsy",
"bitsy", "spider.", "What", "goes", "up", "must",
"come", "down.", "What", "is", "down", "stays",
"down.", "A", "B", "C", "D", "E", "F", "G", "H",
"I", "J", "K", "L", "M", "N", "O", "P", "Q" };
"upon", "a", "time", "his", "gypsy", "ate", "one", "itsy",
"bitsy", "spider.", "What", "goes", "up", "must",
"come", "down.", "What", "is", "down", "stays",
"down.", "A", "B", "C", "D", "E", "F", "G", "H",
"I", "J", "K", "L", "M", "N", "O", "P", "Q" };
const char *negative = "negative";
@ -136,13 +152,21 @@ run_test(unsigned int is, unsigned int ad)
size_t i, j;
const char *blob = "#blobs";
unsigned long h;
ck_hs_iterator_t it;
if (ck_hs_init(&hs[0], CK_HS_MODE_SPMC | CK_HS_MODE_OBJECT | ad, hs_hash, hs_compare, &my_allocator, is, 6602834) == false)
ck_error("ck_hs_init\n");
for (j = 0; j < size; j++) {
for (i = 0; i < sizeof(test) / sizeof(*test); i++) {
h = test[i][0];
unsigned long h_1;
h = CK_HS_HASH(&hs[j], hs_hash, test[i]);
h_1 = ck_hs_hash(&hs[j], test[i]);
if (h != h_1)
ck_error("h != h_1 (%lu != %lu)\n", h, h_1);
if (ck_hs_get(&hs[j], h, test[i]) != NULL) {
continue;
}
@ -181,6 +205,58 @@ run_test(unsigned int is, unsigned int ad)
}
}
/* Test iteration */
if (j == 0) {
/* Avoid the blob stuff as it's not in the test array. */
ck_hs_iterator_init(&it);
void *k = NULL;
int matches = 0;
int entries = 0;
while (ck_hs_next(&hs[j], &it, &k) == true) {
entries++;
for (i = 0; i < sizeof(test) / sizeof(*test); i++) {
int x = strcmp(test[i], (char *)k);
if (x == 0) {
matches++;
break;
}
}
}
if (entries != matches) {
ck_error("Iteration must match all elements, has: %d, matched: %d [%d]", entries, matches, is);
}
/*
* Now test iteration in the face of grows (spmc).
* In order to test usage after reclamation, we
* stub the allocator.
*/
ck_hs_iterator_init(&it);
k = NULL;
matches = 0;
entries = 0;
hs[j].m = &stub_allocator;
while (ck_hs_next_spmc(&hs[j], &it, &k) == true) {
entries++;
for (i = 0; i < sizeof(test) / sizeof(*test); i++) {
int x = strcmp(test[i], (char *)k);
if (x == 0) {
matches++;
break;
}
}
if (entries == 20) {
ck_hs_grow(&hs[j], 128);
}
}
hs[j].m = &my_allocator;
if (entries != matches) {
ck_error("After growth, iteration must match all elements, has: %d, matched: %d [%d]", entries, matches, is);
}
}
/* Test grow semantics. */
ck_hs_grow(&hs[j], 128);
for (i = 0; i < sizeof(test) / sizeof(*test); i++) {

@ -132,7 +132,7 @@ table_init(void)
#endif
ck_epoch_init(&epoch_ht);
ck_epoch_register(&epoch_ht, &epoch_wr);
ck_epoch_register(&epoch_ht, &epoch_wr, NULL);
common_srand48((long int)time(NULL));
if (ck_ht_init(&ht, mode, NULL, &my_allocator, 8, common_lrand48()) == false) {
perror("ck_ht_init");
@ -221,7 +221,7 @@ reader(void *unused)
perror("WARNING: Failed to affine thread");
s = j = a = 0;
ck_epoch_register(&epoch_ht, &epoch_record);
ck_epoch_register(&epoch_ht, &epoch_record, NULL);
for (;;) {
j++;
ck_epoch_begin(&epoch_record, NULL);
@ -426,8 +426,8 @@ main(int argc, char *argv[])
ck_epoch_record_t epoch_temporary = epoch_wr;
ck_epoch_synchronize(&epoch_wr);
fprintf(stderr, " '- Summary: %u pending, %u peak, %lu reclamations -> "
"%u pending, %u peak, %lu reclamations\n\n",
fprintf(stderr, " '- Summary: %u pending, %u peak, %u reclamations -> "
"%u pending, %u peak, %u reclamations\n\n",
epoch_temporary.n_pending, epoch_temporary.n_peak, epoch_temporary.n_dispatch,
epoch_wr.n_pending, epoch_wr.n_peak, epoch_wr.n_dispatch);
@ -551,8 +551,8 @@ main(int argc, char *argv[])
epoch_temporary = epoch_wr;
ck_epoch_synchronize(&epoch_wr);
fprintf(stderr, " '- Summary: %u pending, %u peak, %lu reclamations -> "
"%u pending, %u peak, %lu reclamations\n\n",
fprintf(stderr, " '- Summary: %u pending, %u peak, %u reclamations -> "
"%u pending, %u peak, %u reclamations\n\n",
epoch_temporary.n_pending, epoch_temporary.n_peak, epoch_temporary.n_dispatch,
epoch_wr.n_pending, epoch_wr.n_peak, epoch_wr.n_dispatch);
return 0;

@ -136,7 +136,7 @@ table_init(void)
{
ck_epoch_init(&epoch_ht);
ck_epoch_register(&epoch_ht, &epoch_wr);
ck_epoch_register(&epoch_ht, &epoch_wr, NULL);
common_srand48((long int)time(NULL));
if (ck_ht_init(&ht, CK_HT_MODE_DIRECT, hash_function, &my_allocator, 8, common_lrand48()) == false) {
perror("ck_ht_init");
@ -221,7 +221,7 @@ ht_reader(void *unused)
perror("WARNING: Failed to affine thread");
s = j = a = 0;
ck_epoch_register(&epoch_ht, &epoch_record);
ck_epoch_register(&epoch_ht, &epoch_record, NULL);
for (;;) {
j++;
ck_epoch_begin(&epoch_record, NULL);
@ -412,8 +412,8 @@ main(int argc, char *argv[])
ck_epoch_record_t epoch_temporary = epoch_wr;
ck_epoch_synchronize(&epoch_wr);
fprintf(stderr, " '- Summary: %u pending, %u peak, %lu reclamations -> "
"%u pending, %u peak, %lu reclamations\n\n",
fprintf(stderr, " '- Summary: %u pending, %u peak, %u reclamations -> "
"%u pending, %u peak, %u reclamations\n\n",
epoch_temporary.n_pending, epoch_temporary.n_peak, epoch_temporary.n_dispatch,
epoch_wr.n_pending, epoch_wr.n_peak, epoch_wr.n_dispatch);
@ -537,8 +537,8 @@ main(int argc, char *argv[])
epoch_temporary = epoch_wr;
ck_epoch_synchronize(&epoch_wr);
fprintf(stderr, " '- Summary: %u pending, %u peak, %lu reclamations -> "
"%u pending, %u peak, %lu reclamations\n\n",
fprintf(stderr, " '- Summary: %u pending, %u peak, %u reclamations -> "
"%u pending, %u peak, %u reclamations\n\n",
epoch_temporary.n_pending, epoch_temporary.n_peak, epoch_temporary.n_dispatch,
epoch_wr.n_pending, epoch_wr.n_peak, epoch_wr.n_dispatch);
return 0;

@ -1,6 +1,8 @@
.PHONY: clean
all: ck_pr_cas_64 ck_pr_fas_64 ck_pr_cas_64_2 ck_pr_add_64 ck_pr_faa_64 ck_pr_neg_64 fp
OBJECTS=ck_pr_cas_64 ck_pr_fas_64 ck_pr_cas_64_2 ck_pr_add_64 ck_pr_faa_64 ck_pr_neg_64 fp
all: $(OBJECTS)
fp: fp.c
$(CC) $(CFLAGS) -o fp fp.c
@ -24,8 +26,7 @@ ck_pr_neg_64: ck_pr_neg_64.c
$(CC) $(CFLAGS) -o ck_pr_neg_64 ck_pr_neg_64.c -lm
clean:
rm -rf ck_pr_cas_64 ck_pr_fas_64 ck_pr_cas_64_2 ck_pr_add_64 \
ck_pr_faa_64 ck_pr_neg_64 *.dSYM *.exe
rm -rf *.dSYM *.exe *.o $(OBJECTS)
include ../../../build/regressions.build
CFLAGS+=$(PTHREAD_CFLAGS) -D_GNU_SOURCE

@ -4,7 +4,7 @@ OBJECTS=ck_pr_cas ck_pr_faa ck_pr_inc ck_pr_dec ck_pr_bts \
ck_pr_btr ck_pr_btc ck_pr_load ck_pr_store \
ck_pr_and ck_pr_or ck_pr_xor ck_pr_add ck_pr_sub \
ck_pr_fas ck_pr_bin ck_pr_btx ck_pr_fax ck_pr_n \
ck_pr_unary
ck_pr_unary ck_pr_fence ck_pr_dec_zero ck_pr_inc_zero
all: $(OBJECTS)
@ -20,12 +20,21 @@ ck_pr_cas: ck_pr_cas.c
ck_pr_inc: ck_pr_inc.c
$(CC) $(CFLAGS) -o ck_pr_inc ck_pr_inc.c
ck_pr_inc_zero: ck_pr_inc_zero.c
$(CC) $(CFLAGS) -o ck_pr_inc_zero ck_pr_inc_zero.c
ck_pr_dec: ck_pr_dec.c
$(CC) $(CFLAGS) -o ck_pr_dec ck_pr_dec.c
ck_pr_dec_zero: ck_pr_dec_zero.c
$(CC) $(CFLAGS) -o ck_pr_dec_zero ck_pr_dec_zero.c
ck_pr_faa: ck_pr_faa.c
$(CC) $(CFLAGS) -o ck_pr_faa ck_pr_faa.c
ck_pr_fence: ck_pr_fence.c
$(CC) $(CFLAGS) -o ck_pr_fence ck_pr_fence.c
ck_pr_btc: ck_pr_btc.c
$(CC) $(CFLAGS) -o ck_pr_btc ck_pr_btc.c

View file

@ -0,0 +1,105 @@
#include <inttypes.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <ck_pr.h>
#define EXPECT(ACTUAL, IS_ZERO, TYPE, INITIAL) do { \
TYPE expected = (TYPE)((TYPE)INITIAL - (TYPE)1); \
if ((ACTUAL) != expected) { \
printf("FAIL [ %" PRIx64" != %" PRIx64" ]\n", \
(uint64_t)(ACTUAL), \
(uint64_t)expected); \
exit(EXIT_FAILURE); \
} \
\
if ((IS_ZERO) != ((ACTUAL) == 0)) { \
printf("FAIL [ %s != %s ]\n", \
((IS_ZERO) ? "true" : "false"), \
(((ACTUAL) == 0) ? "true" : "false")); \
exit(EXIT_FAILURE); \
} \
} while (0)
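/*
 * EXPECT checks that the stored value dropped by exactly one and that
 * the reported zero flag matches the resulting value.
 */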
#define TEST_ZERO(TYPE, SUFFIX) do { \
TYPE datum; \
bool is_zero; \
\
datum = 0; \
ck_pr_dec_##SUFFIX##_zero(&datum, &is_zero); \
EXPECT(datum, is_zero, TYPE, 0); \
\
datum = (TYPE)-1; \
ck_pr_dec_##SUFFIX##_zero(&datum, &is_zero); \
EXPECT(datum, is_zero, TYPE, -1); \
\
datum = (TYPE)1; \
ck_pr_dec_##SUFFIX##_zero(&datum, &is_zero); \
EXPECT(datum, is_zero, TYPE, 1); \
\
datum = (TYPE)2; \
ck_pr_dec_##SUFFIX##_zero(&datum, &is_zero); \
EXPECT(datum, is_zero, TYPE, 2); \
} while (0)
#define TEST_IS_ZERO(TYPE, SUFFIX) do { \
TYPE datum; \
bool is_zero; \
\
datum = 0; \
is_zero = ck_pr_dec_##SUFFIX##_is_zero(&datum); \
EXPECT(datum, is_zero, TYPE, 0); \
\
datum = (TYPE)-1; \
is_zero = ck_pr_dec_##SUFFIX##_is_zero(&datum); \
EXPECT(datum, is_zero, TYPE, -1); \
\
datum = (TYPE)1; \
is_zero = ck_pr_dec_##SUFFIX##_is_zero(&datum); \
EXPECT(datum, is_zero, TYPE, 1); \
\
datum = (TYPE)2; \
is_zero = ck_pr_dec_##SUFFIX##_is_zero(&datum); \
EXPECT(datum, is_zero, TYPE, 2); \
} while (0)
#define TEST(TYPE, SUFFIX) do { \
TEST_ZERO(TYPE, SUFFIX); \
TEST_IS_ZERO(TYPE, SUFFIX); \
} while (0)
int
main(void)
{
#ifdef CK_F_PR_DEC_64_ZERO
TEST(uint64_t, 64);
#endif
#ifdef CK_F_PR_DEC_32_ZERO
TEST(uint32_t, 32);
#endif
#ifdef CK_F_PR_DEC_16_ZERO
TEST(uint16_t, 16);
#endif
#ifdef CK_F_PR_DEC_8_ZERO
TEST(uint8_t, 8);
#endif
#ifdef CK_F_PR_DEC_UINT_ZERO
TEST(unsigned int, uint);
#endif
#ifdef CK_F_PR_DEC_INT_ZERO
TEST(int, int);
#endif
#ifdef CK_F_PR_DEC_CHAR_ZERO
TEST(char, char);
#endif
return (0);
}

@ -0,0 +1,80 @@
/*
* Copyright 2009-2018 Samy Al Bahra.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <ck_pr.h>
#include "../../common.h"
int
main(void)
{
int r = 0;
/* Below serves as a marker. */
ck_pr_sub_int(&r, 31337);
/*
* This is a simple test to help ensure all fences compile or crash
* on target. Below are generated according to the underlying memory
* model's ordering.
*/
ck_pr_fence_atomic();
ck_pr_fence_atomic_store();
ck_pr_fence_atomic_load();
ck_pr_fence_store_atomic();
ck_pr_fence_load_atomic();
ck_pr_fence_load();
ck_pr_fence_load_store();
ck_pr_fence_store();
ck_pr_fence_store_load();
ck_pr_fence_memory();
ck_pr_fence_release();
ck_pr_fence_acquire();
ck_pr_fence_acqrel();
ck_pr_fence_lock();
ck_pr_fence_unlock();
/* Below serves as a marker. */
ck_pr_sub_int(&r, 31337);
	/* The following are generated assuming RMO. */
ck_pr_fence_strict_atomic();
ck_pr_fence_strict_atomic_store();
ck_pr_fence_strict_atomic_load();
ck_pr_fence_strict_store_atomic();
ck_pr_fence_strict_load_atomic();
ck_pr_fence_strict_load();
ck_pr_fence_strict_load_store();
ck_pr_fence_strict_store();
ck_pr_fence_strict_store_load();
ck_pr_fence_strict_memory();
ck_pr_fence_strict_release();
ck_pr_fence_strict_acquire();
ck_pr_fence_strict_acqrel();
ck_pr_fence_strict_lock();
ck_pr_fence_strict_unlock();
return 0;
}

@ -0,0 +1,105 @@
#include <inttypes.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <ck_pr.h>
#define EXPECT(ACTUAL, IS_ZERO, TYPE, INITIAL) do { \
TYPE expected = (TYPE)((TYPE)INITIAL + (TYPE)1); \
if ((ACTUAL) != expected) { \
printf("FAIL [ %" PRIx64" != %" PRIx64" ]\n", \
(uint64_t)(ACTUAL), \
(uint64_t)expected); \
exit(EXIT_FAILURE); \
} \
\
if ((IS_ZERO) != ((ACTUAL) == 0)) { \
printf("FAIL [ %s != %s ]\n", \
((IS_ZERO) ? "true" : "false"), \
(((ACTUAL) == 0) ? "true" : "false")); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define TEST_ZERO(TYPE, SUFFIX) do { \
TYPE datum; \
bool is_zero; \
\
datum = 0; \
ck_pr_inc_##SUFFIX##_zero(&datum, &is_zero); \
EXPECT(datum, is_zero, TYPE, 0); \
\
datum = (TYPE)-1; \
ck_pr_inc_##SUFFIX##_zero(&datum, &is_zero); \
EXPECT(datum, is_zero, TYPE, -1); \
\
datum = (TYPE)1; \
ck_pr_inc_##SUFFIX##_zero(&datum, &is_zero); \
EXPECT(datum, is_zero, TYPE, 1); \
\
datum = (TYPE)2; \
ck_pr_inc_##SUFFIX##_zero(&datum, &is_zero); \
EXPECT(datum, is_zero, TYPE, 2); \
} while (0)
#define TEST_IS_ZERO(TYPE, SUFFIX) do { \
TYPE datum; \
bool is_zero; \
\
datum = 0; \
is_zero = ck_pr_inc_##SUFFIX##_is_zero(&datum); \
EXPECT(datum, is_zero, TYPE, 0); \
\
datum = (TYPE)-1; \
is_zero = ck_pr_inc_##SUFFIX##_is_zero(&datum); \
EXPECT(datum, is_zero, TYPE, -1); \
\
datum = (TYPE)1; \
is_zero = ck_pr_inc_##SUFFIX##_is_zero(&datum); \
EXPECT(datum, is_zero, TYPE, 1); \
\
datum = (TYPE)2; \
is_zero = ck_pr_inc_##SUFFIX##_is_zero(&datum); \
EXPECT(datum, is_zero, TYPE, 2); \
} while (0)
#define TEST(TYPE, SUFFIX) do { \
TEST_ZERO(TYPE, SUFFIX); \
TEST_IS_ZERO(TYPE, SUFFIX); \
} while (0)
int
main(void)
{
#ifdef CK_F_PR_INC_64_ZERO
TEST(uint64_t, 64);
#endif
#ifdef CK_F_PR_INC_32_ZERO
TEST(uint32_t, 32);
#endif
#ifdef CK_F_PR_INC_16_ZERO
TEST(uint16_t, 16);
#endif
#ifdef CK_F_PR_INC_8_ZERO
TEST(uint8_t, 8);
#endif
#ifdef CK_F_PR_INC_UINT_ZERO
TEST(unsigned int, uint);
#endif
#ifdef CK_F_PR_INC_INT_ZERO
TEST(int, int);
#endif
#ifdef CK_F_PR_INC_CHAR_ZERO
TEST(char, char);
#endif
return (0);
}
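
As a usage illustration for the *_zero variants exercised above (available only when the corresponding CK_F_PR_INC_*_ZERO macro is defined), here is a hedged sketch of detecting the exact increment that wraps a counter back to zero; the generation counter and message are invented for the example.

#include <ck_pr.h>
#include <stdbool.h>
#include <stdio.h>

static unsigned int generation = (unsigned int)-1;    /* one step before wrap */

static void
bump_generation(void)
{
        bool wrapped;

        /* Atomic increment; wrapped is set iff the new value is zero. */
        ck_pr_inc_uint_zero(&generation, &wrapped);
        if (wrapped == true)
                printf("generation counter wrapped to zero\n");
}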

View file

@ -118,6 +118,7 @@ rg_width(int m)
int
main(void)
{
void *ptr = (void *)(intptr_t)-1;
common_srand((unsigned int)getpid());
@ -143,6 +144,11 @@ main(void)
ck_pr_load_64_2(&b, &a);
printf("%" PRIx64 ":%" PRIx64 "\n", a[0], a[1]);
#endif
printf("ck_pr_load_ptr: ");
if (ck_pr_load_ptr(&ptr) != (void *)(intptr_t)(-1))
printf("Failed : %p != %p\n", ck_pr_load_ptr(&ptr), (void *)(intptr_t)(-1));
else
printf("SUCCESS\n");
return (0);
}

View file

@ -119,6 +119,8 @@ rg_width(int m)
int
main(void)
{
void *ptr;
#if defined(CK_F_PR_STORE_DOUBLE) && defined(CK_F_PR_LOAD_DOUBLE)
double d;
@ -145,6 +147,12 @@ main(void)
#ifdef CK_F_PR_STORE_8
CK_PR_STORE_B(8);
#endif
printf("ck_pr_store_ptr: ");
ck_pr_store_ptr(&ptr, (void *)(intptr_t)-1);
if (ptr != (void *)(intptr_t)(-1))
printf("Failed : %p != %p\n", ptr, (void *)(intptr_t)-1);
else
printf("SUCCESS\n");
return (0);
}

View file

@ -5,9 +5,9 @@
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyrighs
* 1. Redistributions of source code must retain the above copyrights
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyrighs
* 2. Redistributions in binary form must reproduce the above copyrights
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
@ -144,7 +144,7 @@ set_init(void)
ck_epoch_init(&epoch_hs);
ck_epoch_register(&epoch_hs, &epoch_wr);
ck_epoch_register(&epoch_hs, &epoch_wr, NULL);
common_srand48((long int)time(NULL));
if (ck_rhs_init(&hs, mode, hs_hash, hs_compare, &my_allocator, 65536, common_lrand48()) == false) {
perror("ck_rhs_init");
@ -231,7 +231,7 @@ reader(void *unused)
perror("WARNING: Failed to affine thread");
s = j = a = 0;
ck_epoch_register(&epoch_hs, &epoch_record);
ck_epoch_register(&epoch_hs, &epoch_record, NULL);
for (;;) {
j++;
ck_epoch_begin(&epoch_record, NULL);
@ -451,8 +451,8 @@ main(int argc, char *argv[])
ck_epoch_record_t epoch_temporary = epoch_wr;
ck_epoch_synchronize(&epoch_wr);
fprintf(stderr, " '- Summary: %u pending, %u peak, %lu reclamations -> "
"%u pending, %u peak, %lu reclamations\n\n",
fprintf(stderr, " '- Summary: %u pending, %u peak, %u reclamations -> "
"%u pending, %u peak, %u reclamations\n\n",
epoch_temporary.n_pending, epoch_temporary.n_peak, epoch_temporary.n_dispatch,
epoch_wr.n_pending, epoch_wr.n_peak, epoch_wr.n_dispatch);
@ -590,8 +590,8 @@ main(int argc, char *argv[])
epoch_temporary = epoch_wr;
ck_epoch_synchronize(&epoch_wr);
fprintf(stderr, " '- Summary: %u pending, %u peak, %lu reclamations -> "
"%u pending, %u peak, %lu reclamations\n\n",
fprintf(stderr, " '- Summary: %u pending, %u peak, %u reclamations -> "
"%u pending, %u peak, %u reclamations\n\n",
epoch_temporary.n_pending, epoch_temporary.n_peak, epoch_temporary.n_dispatch,
epoch_wr.n_pending, epoch_wr.n_peak, epoch_wr.n_dispatch);
return 0;

View file

@ -5,9 +5,9 @@
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyrighs
* 1. Redistributions of source code must retain the above copyrights
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyrighs
* 2. Redistributions in binary form must reproduce the above copyrights
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*

View file

@ -5,9 +5,9 @@
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyrighs
* 1. Redistributions of source code must retain the above copyrights
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyrighs
* 2. Redistributions in binary form must reproduce the above copyrights
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*

View file

@ -2,7 +2,7 @@
OBJECTS=ck_ring_spsc ck_ring_spmc ck_ring_spmc_template ck_ring_mpmc \
ck_ring_mpmc_template
SIZE=16384
SIZE=2048
all: $(OBJECTS)

View file

@ -122,7 +122,7 @@ main(int argc, char *argv[])
ck_error("Usage: ck_sequence <number of threads> <affinity delta>\n");
}
n_threads = atoi(argv[1]);
n_threads = atoi(argv[1]) - 1;
if (n_threads <= 0) {
ck_error("ERROR: Number of threads must be greater than 0\n");
}
@ -163,6 +163,8 @@ main(int argc, char *argv[])
counter++;
if (ck_pr_load_uint(&barrier) == 0)
break;
ck_pr_stall();
}
printf("%u updates made.\n", counter);
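
For context, the loop above belongs to the ck_sequence regression; a hedged sketch of the reader/writer idiom it validates, assuming the read_begin/read_retry/write_begin/write_end interface from ck_sequence.h (the two-element counter is invented for the example):

#include <ck_sequence.h>

static ck_sequence_t seqlock = CK_SEQUENCE_INITIALIZER;
static unsigned int counter[2];

/* Writer: updates both halves under the sequence lock. */
static void
update(unsigned int value)
{
        ck_sequence_write_begin(&seqlock);
        counter[0] = value;
        counter[1] = value;
        ck_sequence_write_end(&seqlock);
}

/* Reader: retries until it observes a consistent snapshot. */
static unsigned int
snapshot(void)
{
        unsigned int a, b, version;

        do {
                version = ck_sequence_read_begin(&seqlock);
                a = counter[0];
                b = counter[1];
        } while (ck_sequence_read_retry(&seqlock, version));

        return a + b;
}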

View file

@ -1,9 +1,16 @@
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#if CORES < 2
#undef CORES
#define CORES 2
#endif
#define LOCK_NAME "ck_clh"
#define LOCK_DEFINE static ck_spinlock_hclh_t CK_CC_CACHELINE *glob_lock; \
static ck_spinlock_hclh_t CK_CC_CACHELINE *local_lock[CORES / 2]
#define LOCK_STATE ck_spinlock_hclh_t *na = malloc(MAX(sizeof(ck_spinlock_hclh_t), 64))
#define LOCK ck_spinlock_hclh_lock(&glob_lock, &local_lock[(core % CORES) / 2], na)
#define LOCK ck_spinlock_hclh_lock(&glob_lock, &local_lock[core % (CORES / 2)], na)
#define UNLOCK ck_spinlock_hclh_unlock(&na)
#define LOCK_INIT do { \
int _i; \

View file

@ -267,13 +267,11 @@ struct affinity {
#define AFFINITY_INITIALIZER {0, 0}
#ifdef __linux__
#ifndef gettid
static pid_t
gettid(void)
common_gettid(void)
{
return syscall(__NR_gettid);
}
#endif /* gettid */
CK_CC_UNUSED static int
aff_iterate(struct affinity *acb)
@ -285,7 +283,10 @@ aff_iterate(struct affinity *acb)
CPU_ZERO(&s);
CPU_SET(c % CORES, &s);
return sched_setaffinity(gettid(), sizeof(s), &s);
if (sched_setaffinity(common_gettid(), sizeof(s), &s) != 0)
perror("WARNING: Could not affine thread");
return 0;
}
CK_CC_UNUSED static int
@ -297,7 +298,10 @@ aff_iterate_core(struct affinity *acb, unsigned int *core)
CPU_ZERO(&s);
CPU_SET((*core) % CORES, &s);
return sched_setaffinity(gettid(), sizeof(s), &s);
if (sched_setaffinity(common_gettid(), sizeof(s), &s) != 0)
perror("WARNING: Could not affine thread");
return 0;
}
#elif defined(__MACH__)
CK_CC_UNUSED static int

View file

@ -11,6 +11,7 @@ OBJECTS=ck_barrier_centralized.o \
ck_barrier_dissemination.o \
ck_barrier_tournament.o \
ck_barrier_mcs.o \
ck_ec.o \
ck_epoch.o \
ck_ht.o \
ck_hp.o \
@ -24,11 +25,14 @@ libck.so: $(OBJECTS)
$(LD) $(LDFLAGS) -o $(TARGET_DIR)/libck.so $(OBJECTS)
libck.a: $(OBJECTS)
ar rcs $(TARGET_DIR)/libck.a $(OBJECTS)
$(AR) rcs $(TARGET_DIR)/libck.a $(OBJECTS)
ck_array.o: $(INCLUDE_DIR)/ck_array.h $(SDIR)/ck_array.c
$(CC) $(CFLAGS) -c -o $(TARGET_DIR)/ck_array.o $(SDIR)/ck_array.c
ck_ec.o: $(INCLUDE_DIR)/ck_ec.h $(SDIR)/ck_ec.c $(SDIR)/ck_ec_timeutil.h
$(CC) $(CFLAGS) -c -o $(TARGET_DIR)/ck_ec.o $(SDIR)/ck_ec.c
ck_epoch.o: $(INCLUDE_DIR)/ck_epoch.h $(SDIR)/ck_epoch.c $(INCLUDE_DIR)/ck_stack.h
$(CC) $(CFLAGS) -c -o $(TARGET_DIR)/ck_epoch.o $(SDIR)/ck_epoch.c

View file

@ -35,7 +35,7 @@ struct ck_barrier_combining_queue {
struct ck_barrier_combining_group *tail;
};
CK_CC_INLINE static struct ck_barrier_combining_group *
static struct ck_barrier_combining_group *
ck_barrier_combining_queue_dequeue(struct ck_barrier_combining_queue *queue)
{
struct ck_barrier_combining_group *front = NULL;
@ -48,7 +48,7 @@ ck_barrier_combining_queue_dequeue(struct ck_barrier_combining_queue *queue)
return front;
}
CK_CC_INLINE static void
static void
ck_barrier_combining_insert(struct ck_barrier_combining_group *parent,
struct ck_barrier_combining_group *tnode,
struct ck_barrier_combining_group **child)
@ -72,7 +72,7 @@ ck_barrier_combining_insert(struct ck_barrier_combining_group *parent,
* into the barrier's tree. We use a queue to implement this
* traversal.
*/
CK_CC_INLINE static void
static void
ck_barrier_combining_queue_enqueue(struct ck_barrier_combining_queue *queue,
struct ck_barrier_combining_group *node_value)
{
@ -185,10 +185,10 @@ ck_barrier_combining_aux(struct ck_barrier_combining *barrier,
ck_pr_fence_store();
ck_pr_store_uint(&tnode->sense, ~tnode->sense);
} else {
ck_pr_fence_memory();
while (sense != ck_pr_load_uint(&tnode->sense))
ck_pr_stall();
}
ck_pr_fence_memory();
return;
}

425
src/ck_ec.c Normal file
View file

@ -0,0 +1,425 @@
#include <ck_ec.h>
#include <ck_limits.h>
#include "ck_ec_timeutil.h"
#define DEFAULT_BUSY_LOOP_ITER 100U
/*
 * The 2ms, 8x/iter default parameters hit 1.024 seconds after 3
* iterations.
*/
#define DEFAULT_INITIAL_WAIT_NS 2000000L /* Start at 2 ms */
/* Grow the wait time 8x/iteration. */
#define DEFAULT_WAIT_SCALE_FACTOR 8
#define DEFAULT_WAIT_SHIFT_COUNT 0
struct ck_ec32_slow_path_state {
struct ck_ec32 *ec;
uint32_t flagged_word;
};
#ifdef CK_F_EC64
struct ck_ec64_slow_path_state {
struct ck_ec64 *ec;
uint64_t flagged_word;
};
#endif
/* Once we've waited for >= 1 sec, go for the full deadline. */
static const struct timespec final_wait_time = {
.tv_sec = 1
};
void
ck_ec32_wake(struct ck_ec32 *ec, const struct ck_ec_ops *ops)
{
/* Spurious wake-ups are OK. Clear the flag before futexing. */
ck_pr_and_32(&ec->counter, (1U << 31) - 1);
ops->wake32(ops, &ec->counter);
return;
}
int
ck_ec32_wait_slow(struct ck_ec32 *ec,
const struct ck_ec_ops *ops,
uint32_t old_value,
const struct timespec *deadline)
{
return ck_ec32_wait_pred_slow(ec, ops, old_value,
NULL, NULL, deadline);
}
#ifdef CK_F_EC64
void
ck_ec64_wake(struct ck_ec64 *ec, const struct ck_ec_ops *ops)
{
ck_pr_and_64(&ec->counter, ~1);
ops->wake64(ops, &ec->counter);
return;
}
int
ck_ec64_wait_slow(struct ck_ec64 *ec,
const struct ck_ec_ops *ops,
uint64_t old_value,
const struct timespec *deadline)
{
return ck_ec64_wait_pred_slow(ec, ops, old_value,
NULL, NULL, deadline);
}
#endif
int
ck_ec_deadline_impl(struct timespec *new_deadline,
const struct ck_ec_ops *ops,
const struct timespec *timeout)
{
struct timespec now;
int r;
if (timeout == NULL) {
new_deadline->tv_sec = TIME_MAX;
new_deadline->tv_nsec = NSEC_MAX;
return 0;
}
r = ops->gettime(ops, &now);
if (r != 0) {
return -1;
}
*new_deadline = timespec_add(now, *timeout);
return 0;
}
/* The rest of the file implements wait_pred_slow. */
/*
* Returns a timespec value for deadline_ptr. If deadline_ptr is NULL,
* returns a timespec far in the future.
*/
static struct timespec
canonical_deadline(const struct timespec *deadline_ptr)
{
if (deadline_ptr == NULL) {
return (struct timespec) { .tv_sec = TIME_MAX };
}
return *deadline_ptr;
}
/*
* Really slow (sleeping) path for ck_ec_wait. Drives the exponential
* backoff scheme to sleep for longer and longer periods of time,
* until either the sleep function returns true (the eventcount's
* value has changed), or the predicate returns non-0 (something else
* has changed).
*
* If deadline is ever reached, returns -1 (timeout).
*
* TODO: add some form of randomisation to the intermediate timeout
* values.
*/
static int
exponential_backoff(struct ck_ec_wait_state *wait_state,
bool (*sleep)(const void *sleep_state,
const struct ck_ec_wait_state *wait_state,
const struct timespec *partial_deadline),
const void *sleep_state,
int (*pred)(const struct ck_ec_wait_state *state,
struct timespec *deadline),
const struct timespec *deadline)
{
struct timespec begin;
struct timespec stop_backoff;
const struct ck_ec_ops *ops = wait_state->ops;
const uint32_t scale_factor = (ops->wait_scale_factor != 0)
? ops->wait_scale_factor
: DEFAULT_WAIT_SCALE_FACTOR;
const uint32_t shift_count = (ops->wait_shift_count != 0)
? ops->wait_shift_count
: DEFAULT_WAIT_SHIFT_COUNT;
uint32_t wait_ns = (ops->initial_wait_ns != 0)
? ops->initial_wait_ns
: DEFAULT_INITIAL_WAIT_NS;
bool first = true;
for (;;) {
struct timespec now;
struct timespec partial_deadline;
if (check_deadline(&now, ops, *deadline) == true) {
/* Timeout. Bail out. */
return -1;
}
if (first) {
begin = now;
wait_state->start = begin;
stop_backoff = timespec_add(begin, final_wait_time);
first = false;
}
wait_state->now = now;
if (timespec_cmp(now, stop_backoff) >= 0) {
partial_deadline = *deadline;
} else {
do {
partial_deadline =
timespec_add_ns(begin, wait_ns);
wait_ns =
wait_time_scale(wait_ns,
scale_factor,
shift_count);
} while (timespec_cmp(partial_deadline, now) <= 0);
}
if (pred != NULL) {
int r = pred(wait_state, &partial_deadline);
if (r != 0) {
return r;
}
}
/* Canonicalize deadlines in the far future to NULL. */
if (sleep(sleep_state, wait_state,
((partial_deadline.tv_sec == TIME_MAX)
? NULL : &partial_deadline)) == true) {
return 0;
}
}
}
/*
* Loops up to BUSY_LOOP_ITER times, or until ec's counter value
* (including the flag) differs from old_value.
*
* Returns the new value in ec.
*/
#define DEF_WAIT_EASY(W) \
static uint##W##_t ck_ec##W##_wait_easy(struct ck_ec##W* ec, \
const struct ck_ec_ops *ops, \
uint##W##_t expected) \
{ \
uint##W##_t current = ck_pr_load_##W(&ec->counter); \
size_t n = (ops->busy_loop_iter != 0) \
? ops->busy_loop_iter \
: DEFAULT_BUSY_LOOP_ITER; \
\
for (size_t i = 0; \
i < n && current == expected; \
i++) { \
ck_pr_stall(); \
current = ck_pr_load_##W(&ec->counter); \
} \
\
return current; \
}
DEF_WAIT_EASY(32)
#ifdef CK_F_EC64
DEF_WAIT_EASY(64)
#endif
#undef DEF_WAIT_EASY
/*
* Attempts to upgrade ec->counter from unflagged to flagged.
*
* Returns true if the event count has changed. Otherwise, ec's
* counter word is equal to flagged on return, or has been at some
* time before the return.
*/
#define DEF_UPGRADE(W) \
static bool ck_ec##W##_upgrade(struct ck_ec##W* ec, \
uint##W##_t current, \
uint##W##_t unflagged, \
uint##W##_t flagged) \
{ \
uint##W##_t old_word; \
\
if (current == flagged) { \
/* Nothing to do, no change. */ \
return false; \
} \
\
if (current != unflagged) { \
/* We have a different counter value! */ \
return true; \
} \
\
/* \
* Flag the counter value. The CAS only fails if the \
* counter is already flagged, or has a new value. \
*/ \
return (ck_pr_cas_##W##_value(&ec->counter, \
unflagged, flagged, \
&old_word) == false && \
old_word != flagged); \
}
DEF_UPGRADE(32)
#ifdef CK_F_EC64
DEF_UPGRADE(64)
#endif
#undef DEF_UPGRADE
/*
* Blocks until partial_deadline on the ck_ec. Returns true if the
* eventcount's value has changed. If partial_deadline is NULL, wait
* forever.
*/
static bool
ck_ec32_wait_slow_once(const void *vstate,
const struct ck_ec_wait_state *wait_state,
const struct timespec *partial_deadline)
{
const struct ck_ec32_slow_path_state *state = vstate;
const struct ck_ec32 *ec = state->ec;
const uint32_t flagged_word = state->flagged_word;
wait_state->ops->wait32(wait_state, &ec->counter,
flagged_word, partial_deadline);
return ck_pr_load_32(&ec->counter) != flagged_word;
}
#ifdef CK_F_EC64
static bool
ck_ec64_wait_slow_once(const void *vstate,
const struct ck_ec_wait_state *wait_state,
const struct timespec *partial_deadline)
{
const struct ck_ec64_slow_path_state *state = vstate;
const struct ck_ec64 *ec = state->ec;
const uint64_t flagged_word = state->flagged_word;
/* futex_wait will only compare the low 32 bits. Perform a
 * full comparison here to maximise the chances of catching an
* ABA in the low 32 bits.
*/
if (ck_pr_load_64(&ec->counter) != flagged_word) {
return true;
}
wait_state->ops->wait64(wait_state, &ec->counter,
flagged_word, partial_deadline);
return ck_pr_load_64(&ec->counter) != flagged_word;
}
#endif
/*
* The full wait logic is a lot of code (> 1KB). Encourage the
* compiler to lay this all out linearly with LIKELY annotations on
* every early exit.
*/
#define WAIT_SLOW_BODY(W, ec, ops, pred, data, deadline_ptr, \
old_value, unflagged, flagged) \
do { \
struct ck_ec_wait_state wait_state = { \
.ops = ops, \
.data = data \
}; \
const struct ck_ec##W##_slow_path_state state = { \
.ec = ec, \
.flagged_word = flagged \
}; \
const struct timespec deadline = \
canonical_deadline(deadline_ptr); \
\
/* Detect infinite past deadlines. */ \
if (CK_CC_LIKELY(deadline.tv_sec <= 0)) { \
return -1; \
} \
\
for (;;) { \
uint##W##_t current; \
int r; \
\
current = ck_ec##W##_wait_easy(ec, ops, unflagged); \
\
/* \
* We're about to wait harder (i.e., \
* potentially with futex). Make sure the \
* counter word is flagged. \
*/ \
if (CK_CC_LIKELY( \
ck_ec##W##_upgrade(ec, current, \
unflagged, flagged) == true)) { \
ck_pr_fence_acquire(); \
return 0; \
} \
\
/* \
* By now, ec->counter == flagged_word (at \
* some point in the past). Spin some more to \
* heuristically let any in-flight SP inc/add \
* to retire. This does not affect \
* correctness, but practically eliminates \
* lost wake-ups. \
*/ \
current = ck_ec##W##_wait_easy(ec, ops, flagged); \
if (CK_CC_LIKELY(current != flagged_word)) { \
ck_pr_fence_acquire(); \
return 0; \
} \
\
r = exponential_backoff(&wait_state, \
ck_ec##W##_wait_slow_once, \
&state, \
pred, &deadline); \
if (r != 0) { \
return r; \
} \
\
if (ck_ec##W##_value(ec) != old_value) { \
ck_pr_fence_acquire(); \
return 0; \
} \
\
/* Spurious wake-up. Redo the slow path. */ \
} \
} while (0)
int
ck_ec32_wait_pred_slow(struct ck_ec32 *ec,
const struct ck_ec_ops *ops,
uint32_t old_value,
int (*pred)(const struct ck_ec_wait_state *state,
struct timespec *deadline),
void *data,
const struct timespec *deadline_ptr)
{
const uint32_t unflagged_word = old_value;
const uint32_t flagged_word = old_value | (1UL << 31);
if (CK_CC_UNLIKELY(ck_ec32_value(ec) != old_value)) {
return 0;
}
WAIT_SLOW_BODY(32, ec, ops, pred, data, deadline_ptr,
old_value, unflagged_word, flagged_word);
}
#ifdef CK_F_EC64
int
ck_ec64_wait_pred_slow(struct ck_ec64 *ec,
const struct ck_ec_ops *ops,
uint64_t old_value,
int (*pred)(const struct ck_ec_wait_state *state,
struct timespec *deadline),
void *data,
const struct timespec *deadline_ptr)
{
const uint64_t unflagged_word = old_value << 1;
const uint64_t flagged_word = unflagged_word | 1;
if (CK_CC_UNLIKELY(ck_ec64_value(ec) != old_value)) {
return 0;
}
WAIT_SLOW_BODY(64, ec, ops, pred, data, deadline_ptr,
old_value, unflagged_word, flagged_word);
}
#endif
#undef WAIT_SLOW_BODY
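
A standalone sketch (not from this commit) of the counter encodings the slow path above relies on: 32-bit eventcounts keep the value in the low 31 bits and flag waiters in bit 31, while 64-bit eventcounts shift the value left by one and flag waiters in bit 0, mirroring the flagged_word computations in ck_ec32_wait_pred_slow() and ck_ec64_wait_pred_slow().

#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
        /* 32-bit: the value lives in the low 31 bits; bit 31 means "waiters". */
        uint32_t old32 = 42;
        uint32_t unflagged32 = old32;
        uint32_t flagged32 = old32 | (1UL << 31);

        /* 64-bit: the value is shifted left by one; bit 0 means "waiters". */
        uint64_t old64 = 42;
        uint64_t unflagged64 = old64 << 1;
        uint64_t flagged64 = unflagged64 | 1;

        printf("32-bit: unflagged %#" PRIx32 ", flagged %#" PRIx32 "\n",
            unflagged32, flagged32);
        printf("64-bit: unflagged %#" PRIx64 ", flagged %#" PRIx64 "\n",
            unflagged64, flagged64);
        return 0;
}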

150
src/ck_ec_timeutil.h Normal file
View file

@ -0,0 +1,150 @@
#ifndef CK_EC_TIMEUTIL_H
#define CK_EC_TIMEUTIL_H
#include <ck_cc.h>
#include <ck_ec.h>
#include <ck_limits.h>
#include <ck_stdint.h>
#include <sys/time.h>
#define TIME_MAX ((time_t)((1ULL << ((sizeof(time_t) * CHAR_BIT) - 1)) - 1))
#define NSEC_MAX ((1000L * 1000 * 1000) - 1)
/*
* Approximates (nsec * multiplier) >> shift. Clamps to UINT32_MAX on
* overflow.
*/
CK_CC_UNUSED static uint32_t
wait_time_scale(uint32_t nsec,
uint32_t multiplier,
unsigned int shift)
{
uint64_t temp = (uint64_t)nsec * multiplier;
uint64_t max = (uint64_t)UINT32_MAX << shift;
if (temp >= max) {
return UINT32_MAX;
}
return temp >> shift;
}
/*
* Returns ts + ns. ns is clamped to at most 1 second. Clamps the
* return value to TIME_MAX, NSEC_MAX on overflow.
*
*/
CK_CC_UNUSED static struct timespec timespec_add_ns(const struct timespec ts,
uint32_t ns)
{
struct timespec ret = {
.tv_sec = TIME_MAX,
.tv_nsec = NSEC_MAX
};
time_t sec;
uint32_t sum_ns;
if (ns > (uint32_t)NSEC_MAX) {
if (ts.tv_sec >= TIME_MAX) {
return ret;
}
ret.tv_sec = ts.tv_sec + 1;
ret.tv_nsec = ts.tv_nsec;
return ret;
}
sec = ts.tv_sec;
sum_ns = ns + ts.tv_nsec;
if (sum_ns > NSEC_MAX) {
if (sec >= TIME_MAX) {
return ret;
}
sec++;
sum_ns -= (NSEC_MAX + 1);
}
ret.tv_sec = sec;
ret.tv_nsec = sum_ns;
return ret;
}
/*
* Returns ts + inc. If inc is negative, it is normalized to 0.
* Clamps the return value to TIME_MAX, NSEC_MAX on overflow.
*/
CK_CC_UNUSED static struct timespec timespec_add(const struct timespec ts,
const struct timespec inc)
{
/* Initial return value is clamped to infinite future. */
struct timespec ret = {
.tv_sec = TIME_MAX,
.tv_nsec = NSEC_MAX
};
time_t sec;
unsigned long nsec;
/* Non-positive delta is a no-op. Invalid nsec is another no-op. */
if (inc.tv_sec < 0 || inc.tv_nsec < 0 || inc.tv_nsec > NSEC_MAX) {
return ts;
}
/* Detect overflow early. */
if (inc.tv_sec > TIME_MAX - ts.tv_sec) {
return ret;
}
sec = ts.tv_sec + inc.tv_sec;
/* This sum can't overflow if the inputs are valid. */
nsec = (unsigned long)ts.tv_nsec + inc.tv_nsec;
if (nsec > NSEC_MAX) {
if (sec >= TIME_MAX) {
return ret;
}
sec++;
nsec -= (NSEC_MAX + 1);
}
ret.tv_sec = sec;
ret.tv_nsec = nsec;
return ret;
}
/* Compares two timespecs. Returns -1 if x < y, 0 if x == y, and 1 if x > y. */
CK_CC_UNUSED static int timespec_cmp(const struct timespec x,
const struct timespec y)
{
if (x.tv_sec != y.tv_sec) {
return (x.tv_sec < y.tv_sec) ? -1 : 1;
}
if (x.tv_nsec != y.tv_nsec) {
return (x.tv_nsec < y.tv_nsec) ? -1 : 1;
}
return 0;
}
/*
* Overwrites now with the current CLOCK_MONOTONIC time, and returns
* true if the current time is greater than or equal to the deadline,
* or the clock is somehow broken.
*/
CK_CC_UNUSED static bool check_deadline(struct timespec *now,
const struct ck_ec_ops *ops,
const struct timespec deadline)
{
int r;
r = ops->gettime(ops, now);
if (r != 0) {
return true;
}
return timespec_cmp(*now, deadline) >= 0;
}
#endif /* !CK_EC_TIMEUTIL_H */
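
To make the default backoff parameters concrete, a standalone sketch that re-implements the scaling arithmetic locally (so it does not depend on this private header) and prints the sleep schedule of 2 ms, 16 ms, 128 ms and 1024 ms; once a full second has elapsed, the wait loop switches to the caller's deadline.

#include <stdint.h>
#include <stdio.h>

/* Mirrors wait_time_scale(): (nsec * multiplier) >> shift, clamped. */
static uint32_t
scale(uint32_t nsec, uint32_t multiplier, unsigned int shift)
{
        uint64_t temp = (uint64_t)nsec * multiplier;
        uint64_t max = (uint64_t)UINT32_MAX << shift;

        return (temp >= max) ? UINT32_MAX : (uint32_t)(temp >> shift);
}

int
main(void)
{
        uint32_t wait_ns = 2000000;     /* DEFAULT_INITIAL_WAIT_NS: 2 ms */

        /* With the 8x scale factor, the fourth sleep is already 1.024 s. */
        for (int i = 0; i < 4; i++) {
                printf("sleep %d: %u ns\n", i, wait_ns);
                wait_ns = scale(wait_ns, 8, 0);
        }

        return 0;
}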

View file

@ -127,6 +127,14 @@
*/
#define CK_EPOCH_GRACE 3U
/*
* CK_EPOCH_LENGTH must be a power-of-2 (because (CK_EPOCH_LENGTH - 1) is used
* as a mask, and it must be at least 3 (see comments above).
*/
#if (CK_EPOCH_LENGTH < 3 || (CK_EPOCH_LENGTH & (CK_EPOCH_LENGTH - 1)) != 0)
#error "CK_EPOCH_LENGTH must be a power of 2 and >= 3"
#endif
enum {
CK_EPOCH_STATE_USED = 0,
CK_EPOCH_STATE_FREE = 1
@ -139,7 +147,7 @@ CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry,
#define CK_EPOCH_SENSE_MASK (CK_EPOCH_SENSE - 1)
void
bool
_ck_epoch_delref(struct ck_epoch_record *record,
struct ck_epoch_section *section)
{
@ -150,7 +158,7 @@ _ck_epoch_delref(struct ck_epoch_record *record,
current->count--;
if (current->count > 0)
return;
return false;
/*
* If the current bucket no longer has any references, then
@ -161,8 +169,7 @@ _ck_epoch_delref(struct ck_epoch_record *record,
* If no other active bucket exists, then the record will go
* inactive in order to allow for forward progress.
*/
other = &record->local.bucket[(i + 1) &
CK_EPOCH_SENSE_MASK];
other = &record->local.bucket[(i + 1) & CK_EPOCH_SENSE_MASK];
if (other->count > 0 &&
((int)(current->epoch - other->epoch) < 0)) {
/*
@ -172,7 +179,7 @@ _ck_epoch_delref(struct ck_epoch_record *record,
ck_pr_store_uint(&record->epoch, other->epoch);
}
return;
return true;
}
void
@ -230,7 +237,7 @@ ck_epoch_init(struct ck_epoch *global)
}
struct ck_epoch_record *
ck_epoch_recycle(struct ck_epoch *global)
ck_epoch_recycle(struct ck_epoch *global, void *ct)
{
struct ck_epoch_record *record;
ck_stack_entry_t *cursor;
@ -249,6 +256,12 @@ ck_epoch_recycle(struct ck_epoch *global)
CK_EPOCH_STATE_USED);
if (state == CK_EPOCH_STATE_FREE) {
ck_pr_dec_uint(&global->n_free);
ck_pr_store_ptr(&record->ct, ct);
/*
* The context pointer is ordered by a
* subsequent protected section.
*/
return record;
}
}
@ -258,7 +271,8 @@ ck_epoch_recycle(struct ck_epoch *global)
}
void
ck_epoch_register(struct ck_epoch *global, struct ck_epoch_record *record)
ck_epoch_register(struct ck_epoch *global, struct ck_epoch_record *record,
void *ct)
{
size_t i;
@ -269,6 +283,7 @@ ck_epoch_register(struct ck_epoch *global, struct ck_epoch_record *record)
record->n_dispatch = 0;
record->n_peak = 0;
record->n_pending = 0;
record->ct = ct;
memset(&record->local, 0, sizeof record->local);
for (i = 0; i < CK_EPOCH_LENGTH; i++)
@ -295,6 +310,7 @@ ck_epoch_unregister(struct ck_epoch_record *record)
for (i = 0; i < CK_EPOCH_LENGTH; i++)
ck_stack_init(&record->pending[i]);
ck_pr_store_ptr(&record->ct, NULL);
ck_pr_fence_store();
ck_pr_store_uint(&record->state, CK_EPOCH_STATE_FREE);
ck_pr_inc_uint(&global->n_free);
@ -340,31 +356,41 @@ ck_epoch_scan(struct ck_epoch *global,
return NULL;
}
static void
ck_epoch_dispatch(struct ck_epoch_record *record, unsigned int e)
static unsigned int
ck_epoch_dispatch(struct ck_epoch_record *record, unsigned int e, ck_stack_t *deferred)
{
unsigned int epoch = e & (CK_EPOCH_LENGTH - 1);
ck_stack_entry_t *head, *next, *cursor;
unsigned int n_pending, n_peak;
unsigned int i = 0;
head = CK_STACK_FIRST(&record->pending[epoch]);
ck_stack_init(&record->pending[epoch]);
head = ck_stack_batch_pop_upmc(&record->pending[epoch]);
for (cursor = head; cursor != NULL; cursor = next) {
struct ck_epoch_entry *entry =
ck_epoch_entry_container(cursor);
next = CK_STACK_NEXT(cursor);
entry->function(entry);
if (deferred != NULL)
ck_stack_push_spnc(deferred, &entry->stack_entry);
else
entry->function(entry);
i++;
}
if (record->n_pending > record->n_peak)
record->n_peak = record->n_pending;
n_peak = ck_pr_load_uint(&record->n_peak);
n_pending = ck_pr_load_uint(&record->n_pending);
record->n_dispatch += i;
record->n_pending -= i;
return;
/* We don't require accuracy around peak calculation. */
if (n_pending > n_peak)
ck_pr_store_uint(&record->n_peak, n_peak);
if (i > 0) {
ck_pr_add_uint(&record->n_dispatch, i);
ck_pr_sub_uint(&record->n_pending, i);
}
return i;
}
/*
@ -376,7 +402,18 @@ ck_epoch_reclaim(struct ck_epoch_record *record)
unsigned int epoch;
for (epoch = 0; epoch < CK_EPOCH_LENGTH; epoch++)
ck_epoch_dispatch(record, epoch);
ck_epoch_dispatch(record, epoch, NULL);
return;
}
CK_CC_FORCE_INLINE static void
epoch_block(struct ck_epoch *global, struct ck_epoch_record *cr,
ck_epoch_wait_cb_t *cb, void *ct)
{
if (cb != NULL)
cb(global, cr, ct);
return;
}
@ -385,9 +422,9 @@ ck_epoch_reclaim(struct ck_epoch_record *record)
* This function must not be called with-in read section.
*/
void
ck_epoch_synchronize(struct ck_epoch_record *record)
ck_epoch_synchronize_wait(struct ck_epoch *global,
ck_epoch_wait_cb_t *cb, void *ct)
{
struct ck_epoch *global = record->global;
struct ck_epoch_record *cr;
unsigned int delta, epoch, goal, i;
bool active;
@ -424,10 +461,27 @@ ck_epoch_synchronize(struct ck_epoch_record *record)
* period.
*/
e_d = ck_pr_load_uint(&global->epoch);
if (e_d != delta) {
delta = e_d;
goto reload;
if (e_d == delta) {
epoch_block(global, cr, cb, ct);
continue;
}
/*
* If the epoch has been updated, we may have already
* met our goal.
*/
delta = e_d;
if ((goal > epoch) & (delta >= goal))
goto leave;
epoch_block(global, cr, cb, ct);
/*
* If the epoch has been updated, then a grace period
* requires that all threads are observed idle at the
* same epoch.
*/
cr = NULL;
}
/*
@ -459,20 +513,6 @@ ck_epoch_synchronize(struct ck_epoch_record *record)
* Otherwise, we have just acquired latest snapshot.
*/
delta = delta + r;
continue;
reload:
if ((goal > epoch) & (delta >= goal)) {
/*
* Right now, epoch overflow is handled as an edge
* case. If we have already observed an epoch
* generation, then we can be sure no hazardous
* references exist to objects from this generation. We
* can actually avoid an addtional scan step at this
* point.
*/
break;
}
}
/*
@ -480,8 +520,16 @@ reload:
* However, if non-temporal instructions are used, full barrier
* semantics are necessary.
*/
leave:
ck_pr_fence_memory();
record->epoch = delta;
return;
}
void
ck_epoch_synchronize(struct ck_epoch_record *record)
{
ck_epoch_synchronize_wait(record->global, NULL, NULL);
return;
}
@ -494,6 +542,16 @@ ck_epoch_barrier(struct ck_epoch_record *record)
return;
}
void
ck_epoch_barrier_wait(struct ck_epoch_record *record, ck_epoch_wait_cb_t *cb,
void *ct)
{
ck_epoch_synchronize_wait(record->global, cb, ct);
ck_epoch_reclaim(record);
return;
}
/*
* It may be worth it to actually apply these deferral semantics to an epoch
* that was observed at ck_epoch_call time. The problem is that the latter
@ -505,41 +563,61 @@ ck_epoch_barrier(struct ck_epoch_record *record)
* is far from ideal too.
*/
bool
ck_epoch_poll(struct ck_epoch_record *record)
ck_epoch_poll_deferred(struct ck_epoch_record *record, ck_stack_t *deferred)
{
bool active;
unsigned int epoch;
unsigned int snapshot;
struct ck_epoch_record *cr = NULL;
struct ck_epoch *global = record->global;
unsigned int n_dispatch;
epoch = ck_pr_load_uint(&global->epoch);
/* Serialize epoch snapshots with respect to global epoch. */
ck_pr_fence_memory();
/*
* At this point, epoch is the current global epoch value.
* There may or may not be active threads which observed epoch - 1.
* (ck_epoch_scan() will tell us that). However, there should be
* no active threads which observed epoch - 2.
*
* Note that checking epoch - 2 is necessary, as race conditions can
* allow another thread to increment the global epoch before this
* thread runs.
*/
n_dispatch = ck_epoch_dispatch(record, epoch - 2, deferred);
cr = ck_epoch_scan(global, cr, epoch, &active);
if (cr != NULL) {
record->epoch = epoch;
return false;
}
if (cr != NULL)
return (n_dispatch > 0);
/* We are at a grace period if all threads are inactive. */
if (active == false) {
record->epoch = epoch;
for (epoch = 0; epoch < CK_EPOCH_LENGTH; epoch++)
ck_epoch_dispatch(record, epoch);
ck_epoch_dispatch(record, epoch, deferred);
return true;
}
/* If an active thread exists, rely on epoch observation. */
if (ck_pr_cas_uint_value(&global->epoch, epoch, epoch + 1,
&snapshot) == false) {
record->epoch = snapshot;
} else {
record->epoch = epoch + 1;
}
/*
* If an active thread exists, rely on epoch observation.
*
* All the active threads entered the epoch section during
* the current epoch. Therefore, we can now run the handlers
* for the immediately preceding epoch and attempt to
* advance the epoch if it hasn't been already.
*/
(void)ck_pr_cas_uint(&global->epoch, epoch, epoch + 1);
ck_epoch_dispatch(record, epoch + 1);
ck_epoch_dispatch(record, epoch - 1, deferred);
return true;
}
bool
ck_epoch_poll(struct ck_epoch_record *record)
{
return ck_epoch_poll_deferred(record, NULL);
}
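
To tie the new three-argument ck_epoch_register() and the reclamation entry points together, a hedged usage sketch assuming the begin/end/call/barrier interface declared in ck_epoch.h; the node type, its layout, and node_destroy() are invented for the example.

#include <ck_epoch.h>
#include <stdlib.h>

struct node {
        ck_epoch_entry_t epoch_entry;   /* first member, so a cast recovers the node */
        int value;
};

static ck_epoch_t epoch;
static ck_epoch_record_t record;

/* Deferred destructor: runs once no reader can still hold a reference. */
static void
node_destroy(ck_epoch_entry_t *e)
{
        free((struct node *)(void *)e);
}

static void
example(void)
{
        ck_epoch_section_t section;
        struct node *n = malloc(sizeof *n);

        ck_epoch_init(&epoch);
        ck_epoch_register(&epoch, &record, NULL);       /* NULL context pointer */

        /* Read-side critical section. */
        ck_epoch_begin(&record, &section);
        /* ... dereference shared nodes here ... */
        ck_epoch_end(&record, &section);

        /* Defer reclamation of an unlinked node, then drive the dispatcher. */
        ck_epoch_call(&record, &n->epoch_entry, node_destroy);
        ck_epoch_barrier(&record);      /* or ck_epoch_poll(&record) if blocking is unwanted */
}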

View file

@ -105,19 +105,10 @@ ck_hs_map_signal(struct ck_hs_map *map, unsigned long h)
return;
}
void
ck_hs_iterator_init(struct ck_hs_iterator *iterator)
static bool
_ck_hs_next(struct ck_hs *hs, struct ck_hs_map *map,
struct ck_hs_iterator *i, void **key)
{
iterator->cursor = NULL;
iterator->offset = 0;
return;
}
bool
ck_hs_next(struct ck_hs *hs, struct ck_hs_iterator *i, void **key)
{
struct ck_hs_map *map = hs->map;
void *value;
if (i->offset >= map->capacity)
@ -129,6 +120,8 @@ ck_hs_next(struct ck_hs *hs, struct ck_hs_iterator *i, void **key)
#ifdef CK_HS_PP
if (hs->mode & CK_HS_MODE_OBJECT)
value = CK_HS_VMA(value);
#else
(void)hs; /* Avoid unused parameter warning. */
#endif
i->offset++;
*key = value;
@ -139,6 +132,35 @@ ck_hs_next(struct ck_hs *hs, struct ck_hs_iterator *i, void **key)
return false;
}
void
ck_hs_iterator_init(struct ck_hs_iterator *iterator)
{
iterator->cursor = NULL;
iterator->offset = 0;
iterator->map = NULL;
return;
}
bool
ck_hs_next(struct ck_hs *hs, struct ck_hs_iterator *i, void **key)
{
return _ck_hs_next(hs, hs->map, i, key);
}
bool
ck_hs_next_spmc(struct ck_hs *hs, struct ck_hs_iterator *i, void **key)
{
struct ck_hs_map *m = i->map;
if (m == NULL) {
m = i->map = ck_pr_load_ptr(&hs->map);
}
return _ck_hs_next(hs, m, i, key);
}
void
ck_hs_stat(struct ck_hs *hs, struct ck_hs_stat *st)
{
@ -206,7 +228,7 @@ ck_hs_map_create(struct ck_hs *hs, unsigned long entries)
map->probe_limit = (unsigned int)limit;
map->probe_maximum = 0;
map->capacity = n_entries;
map->step = ck_internal_bsf(n_entries);
map->step = ck_cc_ffsl(n_entries);
map->mask = n_entries - 1;
map->n_entries = 0;
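
A hedged sketch of how the new ck_hs_next_spmc() above might be driven by a concurrent reader; the iterator pins a snapshot of hs->map on first use, so iteration stays on one map version even while a single writer grows the set. The dump_keys() helper and its printing are invented for the example.

#include <ck_hs.h>
#include <stdio.h>

/* Walk every key visible in the pinned map snapshot. */
static void
dump_keys(struct ck_hs *hs)
{
        ck_hs_iterator_t iterator;
        void *key;

        ck_hs_iterator_init(&iterator);
        while (ck_hs_next_spmc(hs, &iterator, &key) == true)
                printf("key: %p\n", key);
}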

View file

@ -30,9 +30,6 @@
/*
* This implementation borrows several techniques from Josh Dybnis's
* nbds library which can be found at http://code.google.com/p/nbds
*
* This release currently only includes support for 64-bit platforms.
* We can address 32-bit platforms in a future release.
*/
#include <ck_cc.h>
#include <ck_md.h>
@ -171,7 +168,7 @@ ck_ht_map_create(struct ck_ht *table, CK_HT_TYPE entries)
map->deletions = 0;
map->probe_maximum = 0;
map->capacity = n_entries;
map->step = ck_internal_bsf_64(map->capacity);
map->step = ck_cc_ffsll(map->capacity);
map->mask = map->capacity - 1;
map->n_entries = 0;
map->entries = (struct ck_ht_entry *)(((uintptr_t)&map[1] + prefix +

View file

@ -88,7 +88,15 @@ static inline uint64_t rotl64 ( uint64_t x, int8_t r )
FORCE_INLINE static uint32_t getblock ( const uint32_t * p, int i )
{
#ifdef __s390x__
uint32_t res;
__asm__ (" lrv %0,%1\n"
: "=r" (res) : "Q" (p[i]) : "cc", "mem");
return res;
#else
return p[i];
#endif /* !__s390x__ */
}
//-----------------------------------------------------------------------------
@ -147,7 +155,9 @@ static inline void MurmurHash3_x86_32 ( const void * key, int len,
switch(len & 3)
{
case 3: k1 ^= tail[2] << 16;
/* fall through */
case 2: k1 ^= tail[1] << 8;
/* fall through */
case 1: k1 ^= tail[0];
k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
};
@ -196,11 +206,17 @@ static inline uint64_t MurmurHash64A ( const void * key, int len, uint64_t seed
switch(len & 7)
{
case 7: h ^= (uint64_t)(data2[6]) << 48;
/* fall through */
case 6: h ^= (uint64_t)(data2[5]) << 40;
/* fall through */
case 5: h ^= (uint64_t)(data2[4]) << 32;
/* fall through */
case 4: h ^= (uint64_t)(data2[3]) << 24;
/* fall through */
case 3: h ^= (uint64_t)(data2[2]) << 16;
/* fall through */
case 2: h ^= (uint64_t)(data2[1]) << 8;
/* fall through */
case 1: h ^= (uint64_t)(data2[0]);
h *= m;
};
@ -249,7 +265,9 @@ static inline uint64_t MurmurHash64B ( const void * key, int len, uint64_t seed
switch(len)
{
case 3: h2 ^= ((const unsigned char*)data)[2] << 16;
/* fall through */
case 2: h2 ^= ((const unsigned char*)data)[1] << 8;
/* fall through */
case 1: h2 ^= ((const unsigned char*)data)[0];
h2 *= m;
};

View file

@ -80,40 +80,3 @@ ck_internal_max_32(uint32_t x, uint32_t y)
return x ^ ((x ^ y) & -(x < y));
}
CK_CC_INLINE static unsigned long
ck_internal_bsf(unsigned long v)
{
#if defined(__GNUC__)
return __builtin_ffs(v);
#else
unsigned int i;
const unsigned int s = sizeof(unsigned long) * 8 - 1;
for (i = 0; i < s; i++) {
if (v & (1UL << (s - i)))
return sizeof(unsigned long) * 8 - i;
}
return 1;
#endif /* !__GNUC__ */
}
CK_CC_INLINE static uint64_t
ck_internal_bsf_64(uint64_t v)
{
#if defined(__GNUC__)
return __builtin_ffs(v);
#else
unsigned int i;
const unsigned int s = sizeof(unsigned long) * 8 - 1;
for (i = 0; i < s; i++) {
if (v & (1ULL << (63U - i)))
return i;
}
#endif /* !__GNUC__ */
return 1;
}

View file

@ -366,7 +366,7 @@ ck_rhs_map_create(struct ck_rhs *hs, unsigned long entries)
map->probe_limit = (unsigned int)limit;
map->probe_maximum = 0;
map->capacity = n_entries;
map->step = ck_internal_bsf(n_entries);
map->step = ck_cc_ffsl(n_entries);
map->mask = n_entries - 1;
map->n_entries = 0;

15
tools/ci-build.sh Executable file
View file

@ -0,0 +1,15 @@
#!/bin/sh
#
# Skeleton for continuous integration testing.
##############################################################################
set -x
export CFLAGS="-DITERATE=400 -DPAIRS_S=100 -DITERATIONS=24 -DSTEPS=10000"
./configure $@
if [ `uname -s` = "FreeBSD" ]; then
make -j $(sysctl -n hw.ncpu)
else
make -j
fi