From c8de762782141e9068a372be879d45fd366ee3d9 Mon Sep 17 00:00:00 2001 From: girlbossceo Date: Sun, 12 Nov 2023 16:28:12 -0500 Subject: [PATCH] initial commit of hardened_malloc-sys Signed-off-by: girlbossceo --- .gitignore | 3 + .gitmodules | 3 + CODE_OF_CONDUCT.md | 134 + Cargo.toml | 30 + LICENCE | 202 ++ LICENCE_GRAPHENEOS | 19 + README.md | 10 + build.rs | 61 + hardened_malloc_sources.txt | 16 + src/hardened_malloc/.clang-tidy | 2 + src/hardened_malloc/.github/dependabot.yml | 7 + .../.github/workflows/build-and-test.yml | 53 + src/hardened_malloc/.gitignore | 2 + src/hardened_malloc/Android.bp | 83 + src/hardened_malloc/CREDITS | 283 ++ .../KERNEL_FEATURE_WISHLIST.md | 35 + src/hardened_malloc/LICENSE | 19 + src/hardened_malloc/Makefile | 148 + src/hardened_malloc/README.md | 1037 ++++++ src/hardened_malloc/androidtest/Android.bp | 25 + .../androidtest/AndroidTest.xml | 13 + .../androidtest/memtag/Android.bp | 16 + .../androidtest/memtag/memtag_test.cc | 297 ++ .../src/grapheneos/hmalloc/MemtagTest.java | 95 + src/hardened_malloc/arm_mte.h | 91 + src/hardened_malloc/calculate_waste.py | 81 + src/hardened_malloc/chacha.c | 177 + src/hardened_malloc/chacha.h | 17 + src/hardened_malloc/config/default.mk | 23 + src/hardened_malloc/config/light.mk | 23 + src/hardened_malloc/h_malloc.c | 2190 ++++++++++++ src/hardened_malloc/include/h_malloc.h | 129 + src/hardened_malloc/memory.c | 120 + src/hardened_malloc/memory.h | 29 + src/hardened_malloc/memtag.h | 49 + src/hardened_malloc/mutex.h | 28 + src/hardened_malloc/new.cc | 153 + src/hardened_malloc/pages.c | 88 + src/hardened_malloc/pages.h | 32 + src/hardened_malloc/preload.sh | 6 + src/hardened_malloc/random.c | 128 + src/hardened_malloc/random.h | 25 + src/hardened_malloc/test/.gitignore | 44 + src/hardened_malloc/test/Makefile | 76 + src/hardened_malloc/test/__init__.py | 0 .../test/delete_type_size_mismatch.cc | 14 + src/hardened_malloc/test/double_free_large.c | 13 + .../test/double_free_large_delayed.c | 18 + src/hardened_malloc/test/double_free_small.c | 13 + .../test/double_free_small_delayed.c | 18 + .../test/impossibly_large_malloc.c | 8 + .../test/invalid_free_protected.c | 15 + .../test/invalid_free_small_region.c | 13 + .../test/invalid_free_small_region_far.c | 13 + .../test/invalid_free_unprotected.c | 15 + .../test/invalid_malloc_object_size_small.c | 15 + ...alid_malloc_object_size_small_quarantine.c | 15 + .../test/invalid_malloc_usable_size_small.c | 13 + ...alid_malloc_usable_size_small_quarantine.c | 13 + src/hardened_malloc/test/large_array_growth.c | 18 + src/hardened_malloc/test/mallinfo.c | 44 + src/hardened_malloc/test/mallinfo2.c | 44 + src/hardened_malloc/test/malloc_info.c | 35 + src/hardened_malloc/test/malloc_object_size.c | 12 + .../test/malloc_object_size_offset.c | 12 + src/hardened_malloc/test/offset.c | 50 + .../test/overflow_large_1_byte.c | 15 + .../test/overflow_large_8_byte.c | 15 + .../test/overflow_small_1_byte.c | 15 + .../test/overflow_small_8_byte.c | 16 + .../test/read_after_free_large.c | 21 + .../test/read_after_free_small.c | 21 + src/hardened_malloc/test/read_zero_size.c | 13 + src/hardened_malloc/test/realloc_init.c | 33 + src/hardened_malloc/test/string_overflow.c | 20 + src/hardened_malloc/test/test_smc.py | 242 ++ src/hardened_malloc/test/test_util.h | 10 + .../test/unaligned_free_large.c | 12 + .../test/unaligned_free_small.c | 12 + .../test/unaligned_malloc_usable_size_small.c | 12 + src/hardened_malloc/test/uninitialized_free.c | 8 + .../test/uninitialized_malloc_usable_size.c | 8 + 
.../test/uninitialized_read_large.c | 14 + .../test/uninitialized_read_small.c | 14 + .../test/uninitialized_realloc.c | 11 + .../test/write_after_free_large.c | 13 + .../test/write_after_free_large_reuse.c | 16 + .../test/write_after_free_small.c | 19 + .../test/write_after_free_small_reuse.c | 21 + src/hardened_malloc/test/write_zero_size.c | 12 + src/hardened_malloc/third_party/libdivide.h | 3126 +++++++++++++++++ src/hardened_malloc/util.c | 41 + src/hardened_malloc/util.h | 88 + src/lib.rs | 78 + 94 files changed, 10439 insertions(+) create mode 100644 .gitignore create mode 100644 .gitmodules create mode 100644 CODE_OF_CONDUCT.md create mode 100644 Cargo.toml create mode 100644 LICENCE create mode 100644 LICENCE_GRAPHENEOS create mode 100644 README.md create mode 100644 build.rs create mode 100644 hardened_malloc_sources.txt create mode 100644 src/hardened_malloc/.clang-tidy create mode 100644 src/hardened_malloc/.github/dependabot.yml create mode 100644 src/hardened_malloc/.github/workflows/build-and-test.yml create mode 100644 src/hardened_malloc/.gitignore create mode 100644 src/hardened_malloc/Android.bp create mode 100644 src/hardened_malloc/CREDITS create mode 100644 src/hardened_malloc/KERNEL_FEATURE_WISHLIST.md create mode 100644 src/hardened_malloc/LICENSE create mode 100644 src/hardened_malloc/Makefile create mode 100644 src/hardened_malloc/README.md create mode 100644 src/hardened_malloc/androidtest/Android.bp create mode 100644 src/hardened_malloc/androidtest/AndroidTest.xml create mode 100644 src/hardened_malloc/androidtest/memtag/Android.bp create mode 100644 src/hardened_malloc/androidtest/memtag/memtag_test.cc create mode 100644 src/hardened_malloc/androidtest/src/grapheneos/hmalloc/MemtagTest.java create mode 100644 src/hardened_malloc/arm_mte.h create mode 100755 src/hardened_malloc/calculate_waste.py create mode 100644 src/hardened_malloc/chacha.c create mode 100644 src/hardened_malloc/chacha.h create mode 100644 src/hardened_malloc/config/default.mk create mode 100644 src/hardened_malloc/config/light.mk create mode 100644 src/hardened_malloc/h_malloc.c create mode 100644 src/hardened_malloc/include/h_malloc.h create mode 100644 src/hardened_malloc/memory.c create mode 100644 src/hardened_malloc/memory.h create mode 100644 src/hardened_malloc/memtag.h create mode 100644 src/hardened_malloc/mutex.h create mode 100644 src/hardened_malloc/new.cc create mode 100644 src/hardened_malloc/pages.c create mode 100644 src/hardened_malloc/pages.h create mode 100755 src/hardened_malloc/preload.sh create mode 100644 src/hardened_malloc/random.c create mode 100644 src/hardened_malloc/random.h create mode 100644 src/hardened_malloc/test/.gitignore create mode 100644 src/hardened_malloc/test/Makefile create mode 100644 src/hardened_malloc/test/__init__.py create mode 100644 src/hardened_malloc/test/delete_type_size_mismatch.cc create mode 100644 src/hardened_malloc/test/double_free_large.c create mode 100644 src/hardened_malloc/test/double_free_large_delayed.c create mode 100644 src/hardened_malloc/test/double_free_small.c create mode 100644 src/hardened_malloc/test/double_free_small_delayed.c create mode 100644 src/hardened_malloc/test/impossibly_large_malloc.c create mode 100644 src/hardened_malloc/test/invalid_free_protected.c create mode 100644 src/hardened_malloc/test/invalid_free_small_region.c create mode 100644 src/hardened_malloc/test/invalid_free_small_region_far.c create mode 100644 src/hardened_malloc/test/invalid_free_unprotected.c create mode 100644 
src/hardened_malloc/test/invalid_malloc_object_size_small.c create mode 100644 src/hardened_malloc/test/invalid_malloc_object_size_small_quarantine.c create mode 100644 src/hardened_malloc/test/invalid_malloc_usable_size_small.c create mode 100644 src/hardened_malloc/test/invalid_malloc_usable_size_small_quarantine.c create mode 100644 src/hardened_malloc/test/large_array_growth.c create mode 100644 src/hardened_malloc/test/mallinfo.c create mode 100644 src/hardened_malloc/test/mallinfo2.c create mode 100644 src/hardened_malloc/test/malloc_info.c create mode 100644 src/hardened_malloc/test/malloc_object_size.c create mode 100644 src/hardened_malloc/test/malloc_object_size_offset.c create mode 100644 src/hardened_malloc/test/offset.c create mode 100644 src/hardened_malloc/test/overflow_large_1_byte.c create mode 100644 src/hardened_malloc/test/overflow_large_8_byte.c create mode 100644 src/hardened_malloc/test/overflow_small_1_byte.c create mode 100644 src/hardened_malloc/test/overflow_small_8_byte.c create mode 100644 src/hardened_malloc/test/read_after_free_large.c create mode 100644 src/hardened_malloc/test/read_after_free_small.c create mode 100644 src/hardened_malloc/test/read_zero_size.c create mode 100644 src/hardened_malloc/test/realloc_init.c create mode 100644 src/hardened_malloc/test/string_overflow.c create mode 100644 src/hardened_malloc/test/test_smc.py create mode 100644 src/hardened_malloc/test/test_util.h create mode 100644 src/hardened_malloc/test/unaligned_free_large.c create mode 100644 src/hardened_malloc/test/unaligned_free_small.c create mode 100644 src/hardened_malloc/test/unaligned_malloc_usable_size_small.c create mode 100644 src/hardened_malloc/test/uninitialized_free.c create mode 100644 src/hardened_malloc/test/uninitialized_malloc_usable_size.c create mode 100644 src/hardened_malloc/test/uninitialized_read_large.c create mode 100644 src/hardened_malloc/test/uninitialized_read_small.c create mode 100644 src/hardened_malloc/test/uninitialized_realloc.c create mode 100644 src/hardened_malloc/test/write_after_free_large.c create mode 100644 src/hardened_malloc/test/write_after_free_large_reuse.c create mode 100644 src/hardened_malloc/test/write_after_free_small.c create mode 100644 src/hardened_malloc/test/write_after_free_small_reuse.c create mode 100644 src/hardened_malloc/test/write_zero_size.c create mode 100644 src/hardened_malloc/third_party/libdivide.h create mode 100644 src/hardened_malloc/util.c create mode 100644 src/hardened_malloc/util.h create mode 100644 src/lib.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ffea379 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/target +.DS_Store +Cargo.lock \ No newline at end of file diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..63dadb8 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "hardened_malloc"] + path = "src/hardened_malloc" + url = https://github.com/GrapheneOS/hardened_malloc.git diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..c55cb31 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,134 @@ + +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, 
race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at Email +[strawberry@pupbrain.dev] or via Matrix [@strawberry:puppygock.gay] +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. 
Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.1, available at +[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder][Mozilla CoC]. + +For answers to common questions about this code of conduct, see the FAQ at +[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at +[https://www.contributor-covenant.org/translations][translations]. + +[homepage]: https://www.contributor-covenant.org +[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html +[Mozilla CoC]: https://github.com/mozilla/diversity +[FAQ]: https://www.contributor-covenant.org/faq +[translations]: https://www.contributor-covenant.org/translations + diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..5156ab2 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "hardened_malloc-sys" +build = "build.rs" +description = "hardened_malloc rust wrapper (sys crate)" +authors = ["strawberry "] +version = "0.1.0" +edition = "2021" +license = "Apache-2.0 and MIT" +repository = "https://github.com/girlbossceo/hardened_malloc-sys" +categories = ["api-bindings", "memory-management"] +keywords = ["hardened_malloc", "malloc", "hardened memory allocator", "security"] +readme = "README.md" +exclude = [ + "/src/hardened_malloc/test", + "/src/hardened_malloc/androidtest", + "/src/hardened_malloc/out", + "/src/hardened_malloc/out-light", +] + +[features] +default = ["light"] +light = [] +# "standard" feature is "default.mk" config in hardened_malloc +standard = [] + +[dependencies] +libc = "0.2" + +[build-dependencies] +cc = "1.0" \ No newline at end of file diff --git a/LICENCE b/LICENCE new file mode 100644 index 0000000..2d9a3a5 --- /dev/null +++ b/LICENCE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [2023] [June] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/LICENCE_GRAPHENEOS b/LICENCE_GRAPHENEOS
new file mode 100644
index 0000000..3b9e2c0
--- /dev/null
+++ b/LICENCE_GRAPHENEOS
@@ -0,0 +1,19 @@
+Copyright © 2018-2023 GrapheneOS
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8905fa7
--- /dev/null
+++ b/README.md
@@ -0,0 +1,10 @@
+# hardened_malloc-sys
+
+the low-level `-sys` crate: a Rust wrapper around GrapheneOS's hardened_malloc
+
+### TODO:
+- [ ] test if this even works
+- [ ] add support for explicit make config args on top of choosing variant
+- [ ] make build script better overall
+- [ ] support C preprocessor macro definitions
+- [ ] add support for hardened_malloc's tests and our own tests
\ No newline at end of file
diff --git a/build.rs b/build.rs
new file mode 100644
index 0000000..b0988f7
--- /dev/null
+++ b/build.rs
@@ -0,0 +1,61 @@
+use std::{env, process::Command, path::Path};
+
+/// If submodules were not synced, sync them to actually build hardened_malloc
+fn update_submodules() {
+    let program = "git";
+    let dir = "../";
+    let args = ["submodule", "update", "--init", "--recursive"];
+    println!(
+        "Running command: \"{} {}\" in directory: {}",
+        program,
+        args.join(" "),
+        dir
+    );
+    let ret = Command::new(program).current_dir(dir).args(args).status();
+
+    match ret.map(|status| (status.success(), status.code())) {
+        Ok((true, _)) => (),
+        Ok((false, Some(c))) => panic!("Command failed with error code {}", c),
+        Ok((false, None)) => panic!("Command exited with no error code, possibly killed by system"),
+        Err(e) => panic!("Command failed with error: {}", e),
+    }
+}
+
+fn main() {
+    if !Path::new("src/hardened_malloc/Makefile").exists() {
+        update_submodules();
+    }
+    let variant: &str;
+
+    if cfg!(feature = "light") {
+        variant = "light";
+    } else {
+        variant = "default";
+    }
+
+    //TODO: handle support for explicit make flags like N_ARENA=1 and such
+
+    let mut make_command = Command::new("make");
+    let make_output = make_command
+        .current_dir("src/hardened_malloc/")
+        .env("V", "1") // always verbose mode for cargo
+        .arg(format!("VARIANT={}", variant)) // pass as a make argument: an environment variable cannot override the Makefile's `VARIANT := default` assignment
+        .output()
+        .unwrap_or_else(|error| {
+            panic!("Failed to run 'make': {}", error);
+        });
+    if !make_output.status.success() {
+        panic!(
+            "building hardened_malloc failed:\n{:?}\n{}\n{}",
+            make_command,
+            String::from_utf8_lossy(&make_output.stdout),
+            String::from_utf8_lossy(&make_output.stderr)
+        );
+    }
+
+    //println!("cargo:rustc-link-search=native=src/hardened_malloc");
+
+    //println!("cargo:rerun-if-changed=build.rs");
+
println!("cargo:rerun-if-changed=src/hardened_malloc/"); + //println!("cargo:out_dir={}", env::var("OUT_DIR").unwrap()); +} \ No newline at end of file diff --git a/hardened_malloc_sources.txt b/hardened_malloc_sources.txt new file mode 100644 index 0000000..0688911 --- /dev/null +++ b/hardened_malloc_sources.txt @@ -0,0 +1,16 @@ +src/hardened_malloc/chacha.c +src/hardened_malloc/h_malloc.c +src/hardened_malloc/memory.c +src/hardened_malloc/pages.c +src/hardened_malloc/random.c +src/hardened_malloc/util.c +src/hardened_malloc/arm_mte.h +src/hardened_malloc/chacha.h +src/hardened_malloc/memory.h +src/hardened_malloc/memtag.h +src/hardened_malloc/mutex.h +src/hardened_malloc/pages.h +src/hardened_malloc/random.h +src/hardened_malloc/util.h +src/hardened_malloc/new.cc +src/hardened_malloc/third_party/libdivide.h \ No newline at end of file diff --git a/src/hardened_malloc/.clang-tidy b/src/hardened_malloc/.clang-tidy new file mode 100644 index 0000000..ea78ba3 --- /dev/null +++ b/src/hardened_malloc/.clang-tidy @@ -0,0 +1,2 @@ +Checks: 'bugprone-*,-bugprone-easily-swappable-parameters,-bugprone-macro-parentheses,-bugprone-too-small-loop-variable,cert-*,-cert-err33-c,clang-analyzer-*,-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,-clang-diagnostic-constant-logical-operand,readability-*,-readability-function-cognitive-complexity,-readability-identifier-length,-readability-inconsistent-declaration-parameter-name,-readability-magic-numbers,-readability-named-parameter,llvm-include-order,misc-*' +WarningsAsErrors: '*' diff --git a/src/hardened_malloc/.github/dependabot.yml b/src/hardened_malloc/.github/dependabot.yml new file mode 100644 index 0000000..5e1954b --- /dev/null +++ b/src/hardened_malloc/.github/dependabot.yml @@ -0,0 +1,7 @@ +version: 2 +updates: + - package-ecosystem: github-actions + directory: "/" + schedule: + interval: daily + target-branch: main diff --git a/src/hardened_malloc/.github/workflows/build-and-test.yml b/src/hardened_malloc/.github/workflows/build-and-test.yml new file mode 100644 index 0000000..82496af --- /dev/null +++ b/src/hardened_malloc/.github/workflows/build-and-test.yml @@ -0,0 +1,53 @@ +name: Build and run tests + +on: + push: + pull_request: + schedule: + - cron: '0 2 * * *' + +jobs: + build-ubuntu-gcc: + runs-on: ubuntu-latest + strategy: + matrix: + version: [12] + steps: + - uses: actions/checkout@v4 + - name: Setting up gcc version + run: | + sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-${{ matrix.version }} 100 + sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-${{ matrix.version }} 100 + - name: Build + run: make test + build-ubuntu-clang: + runs-on: ubuntu-latest + strategy: + matrix: + version: [14, 15] + steps: + - uses: actions/checkout@v4 + - name: Setting up clang version + run: | + sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-${{ matrix.version }} 100 + sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-${{ matrix.version }} 100 + - name: Build + run: CC=clang CXX=clang++ make test + build-musl: + runs-on: ubuntu-latest + container: + image: alpine:latest + steps: + - uses: actions/checkout@v4 + - name: Install dependencies + run: apk update && apk add build-base python3 + - name: Build + run: make test + build-ubuntu-gcc-aarch64: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install dependencies + run: sudo apt-get update && sudo apt-get install -y --no-install-recommends gcc-aarch64-linux-gnu 
g++-aarch64-linux-gnu libgcc-s1-arm64-cross cpp-aarch64-linux-gnu + - name: Build + run: CC=aarch64-linux-gnu-gcc CXX=aarch64-linux-gnu-gcc++ make CONFIG_NATIVE=false diff --git a/src/hardened_malloc/.gitignore b/src/hardened_malloc/.gitignore new file mode 100644 index 0000000..e5cdb39 --- /dev/null +++ b/src/hardened_malloc/.gitignore @@ -0,0 +1,2 @@ +out/ +out-light/ diff --git a/src/hardened_malloc/Android.bp b/src/hardened_malloc/Android.bp new file mode 100644 index 0000000..0db6a04 --- /dev/null +++ b/src/hardened_malloc/Android.bp @@ -0,0 +1,83 @@ +common_cflags = [ + "-pipe", + "-O3", + //"-flto", + "-fPIC", + "-fvisibility=hidden", + //"-fno-plt", + "-Wall", + "-Wextra", + "-Wcast-align", + "-Wcast-qual", + "-Wwrite-strings", + "-Werror", + "-DH_MALLOC_PREFIX", + "-DZERO_ON_FREE=true", + "-DWRITE_AFTER_FREE_CHECK=true", + "-DSLOT_RANDOMIZE=true", + "-DSLAB_CANARY=true", + "-DSLAB_QUARANTINE_RANDOM_LENGTH=1", + "-DSLAB_QUARANTINE_QUEUE_LENGTH=1", + "-DCONFIG_EXTENDED_SIZE_CLASSES=true", + "-DCONFIG_LARGE_SIZE_CLASSES=true", + "-DGUARD_SLABS_INTERVAL=1", + "-DGUARD_SIZE_DIVISOR=2", + "-DREGION_QUARANTINE_RANDOM_LENGTH=256", + "-DREGION_QUARANTINE_QUEUE_LENGTH=1024", + "-DREGION_QUARANTINE_SKIP_THRESHOLD=33554432", // 32MiB + "-DFREE_SLABS_QUARANTINE_RANDOM_LENGTH=32", + "-DCONFIG_CLASS_REGION_SIZE=34359738368", // 32GiB + "-DN_ARENA=1", + "-DCONFIG_STATS=true", + "-DCONFIG_SELF_INIT=false", +] + +cc_defaults { + name: "hardened_malloc_defaults", + defaults: ["linux_bionic_supported"], + cflags: common_cflags, + conlyflags: ["-std=c17", "-Wmissing-prototypes"], + stl: "none", +} + +lib_src_files = [ + "chacha.c", + "h_malloc.c", + "memory.c", + "pages.c", + "random.c", + "util.c", +] + +cc_library { + name: "libhardened_malloc", + ramdisk_available: true, + vendor_ramdisk_available: true, + recovery_available: true, + defaults: ["hardened_malloc_defaults"], + srcs: lib_src_files, + export_include_dirs: ["include"], + static_libs: ["libasync_safe"], + target: { + android: { + shared: { + enabled: false, + }, + system_shared_libs: [], + }, + linux_bionic: { + system_shared_libs: [], + }, + }, + product_variables: { + debuggable: { + cflags: ["-DLABEL_MEMORY"], + }, + device_has_arm_mte: { + cflags: ["-DHAS_ARM_MTE", "-march=armv9-a+memtag"] + }, + }, + apex_available: [ + "com.android.runtime", + ], +} diff --git a/src/hardened_malloc/CREDITS b/src/hardened_malloc/CREDITS new file mode 100644 index 0000000..31b6875 --- /dev/null +++ b/src/hardened_malloc/CREDITS @@ -0,0 +1,283 @@ +chacha.c is a simple conversion of chacha-merged.c to a keystream-only implementation: + + chacha-merged.c version 20080118 + D. J. Bernstein + Public domain. + +h_malloc.c open-addressed hash table (regions_grow, regions_insert, regions_find, regions_delete): + + Copyright (c) 2008, 2010, 2011, 2016 Otto Moerbeek + Copyright (c) 2012 Matthew Dempsky + Copyright (c) 2008 Damien Miller + Copyright (c) 2000 Poul-Henning Kamp + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +libdivide: + + Copyright (C) 2010 - 2019 ridiculous_fish, + Copyright (C) 2016 - 2019 Kim Walisch, + + Boost Software License - Version 1.0 - August 17th, 2003 + + Permission is hereby granted, free of charge, to any person or organization + obtaining a copy of the software and accompanying documentation covered by + this license (the "Software") to use, reproduce, display, distribute, + execute, and transmit the Software, and to prepare derivative works of the + Software, and to permit third-parties to whom the Software is furnished to + do so, all subject to the following: + + The copyright notices in the Software and this entire statement, including + the above license grant, this restriction and the following disclaimer, + must be included in all copies of the Software, in whole or in part, and + all derivative works of the Software, unless such copies or derivative + works are solely in the form of machine-executable object code generated by + a source language processor. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT + SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE + FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, + ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. + +random.c get_random_{type}_uniform functions are based on Fast Random Integer +Generation in an Interval by Daniel Lemire + +arm_mte.h arm_mte_tag_and_clear_mem function contents were copied from storeTags function in scudo: + + ============================================================================== + The LLVM Project is under the Apache License v2.0 with LLVM Exceptions: + ============================================================================== + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + + ---- LLVM Exceptions to the Apache 2.0 License ---- + + As an exception, if, as a result of your compiling your source code, portions + of this Software are embedded into an Object form of such source code, you + may redistribute such embedded portions in such Object form without complying + with the conditions of Sections 4(a), 4(b) and 4(d) of the License. + + In addition, if you combine or link compiled forms of this Software with + software that is licensed under the GPLv2 ("Combined Software") and if a + court of competent jurisdiction determines that the patent provision (Section + 3), the indemnity provision (Section 9) or other Section of the License + conflicts with the conditions of the GPLv2, you may retroactively and + prospectively choose to deem waived or otherwise exclude such Section(s) of + the License, but only in their entirety and only with respect to the Combined + Software. 
+ + ============================================================================== diff --git a/src/hardened_malloc/KERNEL_FEATURE_WISHLIST.md b/src/hardened_malloc/KERNEL_FEATURE_WISHLIST.md new file mode 100644 index 0000000..c3a474d --- /dev/null +++ b/src/hardened_malloc/KERNEL_FEATURE_WISHLIST.md @@ -0,0 +1,35 @@ +Very important and should be an easy sell: + +* improved robustness for high vma count on high memory machines +* much higher `vm.max_map_count` by default +* work on improving performance and resource usage with high vma count +* add a way to disable the brk heap and have mmap grow upwards like it did in + the past (preserving the same high base entropy) + +Somewhat important and an easy sell: + +* alternative to `RLIMIT_AS` for accountable mappings only + * memory control groups are sometimes a better option but there are still + users of `RLIMIT_AS` that are problematic for mitigations or simply fast + garbage collector implementations, etc. mapping lots of `PROT_NONE` memory +* mremap flag to disable unmapping the source mapping + * also needed by jemalloc for different reasons + * not needed if the kernel gets first class support for arbitrarily sized + guard pages and a virtual memory quarantine feature + * `MREMAP_DONTUNMAP` is now available but doesn't support expanding the + mapping which may be an issue due to VMA merging being unreliable + +Fairly infeasible to land but could reduce overhead and extend coverage of +security features to other code directly using mmap: + +* first class support for arbitrarily sized guard pages for mmap and mremap to + eliminate half of the resulting VMAs and reduce 2 system calls to 1 + * not usable if it doesn't support mremap (shrink, grow, grow via move) + * not usable if the guard page size is static + * should support changing guard size for mremap growth via move + * must be possible to set it up from the process +* virtual memory quarantine + * must be possible to set it up from the process +* first-class support for aligned mappings with mmap and ideally mremap + * not usable unless guard page support is provided and of course it has to + work with this too diff --git a/src/hardened_malloc/LICENSE b/src/hardened_malloc/LICENSE new file mode 100644 index 0000000..5311a0f --- /dev/null +++ b/src/hardened_malloc/LICENSE @@ -0,0 +1,19 @@ +Copyright © 2018-2023 GrapheneOS + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
diff --git a/src/hardened_malloc/Makefile b/src/hardened_malloc/Makefile new file mode 100644 index 0000000..574a088 --- /dev/null +++ b/src/hardened_malloc/Makefile @@ -0,0 +1,148 @@ +VARIANT := default + +ifneq ($(VARIANT),) + CONFIG_FILE := config/$(VARIANT).mk + include config/$(VARIANT).mk +endif + +ifeq ($(VARIANT),default) + SUFFIX := +else + SUFFIX := -$(VARIANT) +endif + +OUT := out$(SUFFIX) + +define safe_flag +$(shell $(CC) $(if $(filter clang%,$(CC)),-Werror=unknown-warning-option) -E $1 - /dev/null 2>&1 && echo $1 || echo $2) +endef + +CPPFLAGS := $(CPPFLAGS) -D_GNU_SOURCE -I include +SHARED_FLAGS := -pipe -O3 -flto -fPIC -fvisibility=hidden -fno-plt \ + $(call safe_flag,-fstack-clash-protection) $(call safe_flag,-fcf-protection) -fstack-protector-strong \ + -Wall -Wextra $(call safe_flag,-Wcast-align=strict,-Wcast-align) -Wcast-qual -Wwrite-strings \ + -Wundef + +ifeq ($(CONFIG_WERROR),true) + SHARED_FLAGS += -Werror +endif + +ifeq ($(CONFIG_NATIVE),true) + SHARED_FLAGS += -march=native +endif + +ifeq ($(CONFIG_UBSAN),true) + SHARED_FLAGS += -fsanitize=undefined -fno-sanitize-recover=undefined +endif + +CFLAGS := $(CFLAGS) -std=c17 $(SHARED_FLAGS) -Wmissing-prototypes -Wstrict-prototypes +CXXFLAGS := $(CXXFLAGS) -std=c++17 -fsized-deallocation $(SHARED_FLAGS) +LDFLAGS := $(LDFLAGS) -Wl,-O1,--as-needed,-z,defs,-z,relro,-z,now,-z,nodlopen,-z,text + +SOURCES := chacha.c h_malloc.c memory.c pages.c random.c util.c +OBJECTS := $(SOURCES:.c=.o) + +ifeq ($(CONFIG_CXX_ALLOCATOR),true) + # make sure LTO is compatible in case CC and CXX don't match (such as clang and g++) + CXX := $(CC) + LDLIBS += -lstdc++ + + SOURCES += new.cc + OBJECTS += new.o +endif + +OBJECTS := $(addprefix $(OUT)/,$(OBJECTS)) + +ifeq (,$(filter $(CONFIG_SEAL_METADATA),true false)) + $(error CONFIG_SEAL_METADATA must be true or false) +endif + +ifeq (,$(filter $(CONFIG_ZERO_ON_FREE),true false)) + $(error CONFIG_ZERO_ON_FREE must be true or false) +endif + +ifeq (,$(filter $(CONFIG_WRITE_AFTER_FREE_CHECK),true false)) + $(error CONFIG_WRITE_AFTER_FREE_CHECK must be true or false) +endif + +ifeq (,$(filter $(CONFIG_SLOT_RANDOMIZE),true false)) + $(error CONFIG_SLOT_RANDOMIZE must be true or false) +endif + +ifeq (,$(filter $(CONFIG_SLAB_CANARY),true false)) + $(error CONFIG_SLAB_CANARY must be true or false) +endif + +ifeq (,$(filter $(CONFIG_EXTENDED_SIZE_CLASSES),true false)) + $(error CONFIG_EXTENDED_SIZE_CLASSES must be true or false) +endif + +ifeq (,$(filter $(CONFIG_LARGE_SIZE_CLASSES),true false)) + $(error CONFIG_LARGE_SIZE_CLASSES must be true or false) +endif + +ifeq (,$(filter $(CONFIG_STATS),true false)) + $(error CONFIG_STATS must be true or false) +endif + +ifeq (,$(filter $(CONFIG_SELF_INIT),true false)) + $(error CONFIG_SELF_INIT must be true or false) +endif + +CPPFLAGS += \ + -DCONFIG_SEAL_METADATA=$(CONFIG_SEAL_METADATA) \ + -DZERO_ON_FREE=$(CONFIG_ZERO_ON_FREE) \ + -DWRITE_AFTER_FREE_CHECK=$(CONFIG_WRITE_AFTER_FREE_CHECK) \ + -DSLOT_RANDOMIZE=$(CONFIG_SLOT_RANDOMIZE) \ + -DSLAB_CANARY=$(CONFIG_SLAB_CANARY) \ + -DSLAB_QUARANTINE_RANDOM_LENGTH=$(CONFIG_SLAB_QUARANTINE_RANDOM_LENGTH) \ + -DSLAB_QUARANTINE_QUEUE_LENGTH=$(CONFIG_SLAB_QUARANTINE_QUEUE_LENGTH) \ + -DCONFIG_EXTENDED_SIZE_CLASSES=$(CONFIG_EXTENDED_SIZE_CLASSES) \ + -DCONFIG_LARGE_SIZE_CLASSES=$(CONFIG_LARGE_SIZE_CLASSES) \ + -DGUARD_SLABS_INTERVAL=$(CONFIG_GUARD_SLABS_INTERVAL) \ + -DGUARD_SIZE_DIVISOR=$(CONFIG_GUARD_SIZE_DIVISOR) \ + -DREGION_QUARANTINE_RANDOM_LENGTH=$(CONFIG_REGION_QUARANTINE_RANDOM_LENGTH) \ + 
-DREGION_QUARANTINE_QUEUE_LENGTH=$(CONFIG_REGION_QUARANTINE_QUEUE_LENGTH) \ + -DREGION_QUARANTINE_SKIP_THRESHOLD=$(CONFIG_REGION_QUARANTINE_SKIP_THRESHOLD) \ + -DFREE_SLABS_QUARANTINE_RANDOM_LENGTH=$(CONFIG_FREE_SLABS_QUARANTINE_RANDOM_LENGTH) \ + -DCONFIG_CLASS_REGION_SIZE=$(CONFIG_CLASS_REGION_SIZE) \ + -DN_ARENA=$(CONFIG_N_ARENA) \ + -DCONFIG_STATS=$(CONFIG_STATS) \ + -DCONFIG_SELF_INIT=$(CONFIG_SELF_INIT) + +$(OUT)/libhardened_malloc$(SUFFIX).so: $(OBJECTS) | $(OUT) + $(CC) $(CFLAGS) $(LDFLAGS) -shared $^ $(LDLIBS) -o $@ + +$(OUT): + mkdir -p $(OUT) + +$(OUT)/chacha.o: chacha.c chacha.h util.h $(CONFIG_FILE) | $(OUT) + $(COMPILE.c) $(OUTPUT_OPTION) $< +$(OUT)/h_malloc.o: h_malloc.c include/h_malloc.h mutex.h memory.h pages.h random.h util.h $(CONFIG_FILE) | $(OUT) + $(COMPILE.c) $(OUTPUT_OPTION) $< +$(OUT)/memory.o: memory.c memory.h util.h $(CONFIG_FILE) | $(OUT) + $(COMPILE.c) $(OUTPUT_OPTION) $< +$(OUT)/new.o: new.cc include/h_malloc.h util.h $(CONFIG_FILE) | $(OUT) + $(COMPILE.cc) $(OUTPUT_OPTION) $< +$(OUT)/pages.o: pages.c pages.h memory.h util.h $(CONFIG_FILE) | $(OUT) + $(COMPILE.c) $(OUTPUT_OPTION) $< +$(OUT)/random.o: random.c random.h chacha.h util.h $(CONFIG_FILE) | $(OUT) + $(COMPILE.c) $(OUTPUT_OPTION) $< +$(OUT)/util.o: util.c util.h $(CONFIG_FILE) | $(OUT) + $(COMPILE.c) $(OUTPUT_OPTION) $< + +check: tidy + +tidy: + clang-tidy --extra-arg=-std=c17 $(filter %.c,$(SOURCES)) -- $(CPPFLAGS) + clang-tidy --extra-arg=-std=c++17 $(filter %.cc,$(SOURCES)) -- $(CPPFLAGS) + +clean: + rm -f $(OUT)/libhardened_malloc.so $(OBJECTS) + $(MAKE) -C test/ clean + +test: $(OUT)/libhardened_malloc$(SUFFIX).so + $(MAKE) -C test/ + python3 -m unittest discover --start-directory test/ + +.PHONY: check clean tidy test diff --git a/src/hardened_malloc/README.md b/src/hardened_malloc/README.md new file mode 100644 index 0000000..8962037 --- /dev/null +++ b/src/hardened_malloc/README.md @@ -0,0 +1,1037 @@ +# Hardened malloc + +* [Introduction](#introduction) +* [Dependencies](#dependencies) +* [Testing](#testing) + * [Individual Applications](#individual-applications) + * [Automated Test Framework](#automated-test-framework) +* [Compatibility](#compatibility) +* [OS integration](#os-integration) + * [Android-based operating systems](#android-based-operating-systems) + * [Traditional Linux-based operating systems](#traditional-linux-based-operating-systems) +* [Configuration](#configuration) +* [Core design](#core-design) +* [Security properties](#security-properties) +* [Randomness](#randomness) +* [Size classes](#size-classes) +* [Scalability](#scalability) + * [Small (slab) allocations](#small-slab-allocations) + * [Thread caching (or lack thereof)](#thread-caching-or-lack-thereof) + * [Large allocations](#large-allocations) +* [Memory tagging](#memory-tagging) +* [API extensions](#api-extensions) +* [Stats](#stats) +* [System calls](#system-calls) + +## Introduction + +This is a security-focused general purpose memory allocator providing the +malloc API along with various extensions. It provides substantial hardening +against heap corruption vulnerabilities. The security-focused design also leads +to much less metadata overhead and memory waste from fragmentation than a more +traditional allocator design. It aims to provide decent overall performance +with a focus on long-term performance and memory usage rather than allocator +micro-benchmarks. 
It offers scalability via a configurable number of entirely +independent arenas, with the internal locking within arenas further divided +up per size class. + +This project currently supports Bionic (Android), musl and glibc. It may +support other non-Linux operating systems in the future. For Android, there's +custom integration and other hardening features which is also planned for musl +in the future. The glibc support will be limited to replacing the malloc +implementation because musl is a much more robust and cleaner base to build on +and can cover the same use cases. + +This allocator is intended as a successor to a previous implementation based on +extending OpenBSD malloc with various additional security features. It's still +heavily based on the OpenBSD malloc design, albeit not on the existing code +other than reusing the hash table implementation. The main differences in the +design are that it's solely focused on hardening rather than finding bugs, uses +finer-grained size classes along with slab sizes going beyond 4k to reduce +internal fragmentation, doesn't rely on the kernel having fine-grained mmap +randomization and only targets 64-bit to make aggressive use of the large +address space. There are lots of smaller differences in the implementation +approach. It incorporates the previous extensions made to OpenBSD malloc +including adding padding to allocations for canaries (distinct from the current +OpenBSD malloc canaries), write-after-free detection tied to the existing +clearing on free, queues alongside the existing randomized arrays for +quarantining allocations and proper double-free detection for quarantined +allocations. The per-size-class memory regions with their own random bases were +loosely inspired by the size and type-based partitioning in PartitionAlloc. The +planned changes to OpenBSD malloc ended up being too extensive and invasive so +this project was started as a fresh implementation better able to accomplish +the goals. For 32-bit, a port of OpenBSD malloc with small extensions can be +used instead as this allocator fundamentally doesn't support that environment. + +## Dependencies + +Debian stable (currently Debian 12) determines the most ancient set of +supported dependencies: + +* glibc 2.36 +* Linux 6.1 +* Clang 14.0.6 or GCC 12.2.0 + +For Android, the Linux GKI 5.10, 5.15 and 6.1 branches are supported. + +However, using more recent releases is highly recommended. Older versions of +the dependencies may be compatible at the moment but are not tested and will +explicitly not be supported. + +For external malloc replacement with musl, musl 1.1.20 is required. However, +there will be custom integration offering better performance in the future +along with other hardening for the C standard library implementation. + +For Android, only the current generation, actively developed maintenance branch of the Android +Open Source Project will be supported, which currently means `android13-qpr2-release`. + +## Testing + +### Individual Applications + +The `preload.sh` script can be used for testing with dynamically linked +executables using glibc or musl: + + ./preload.sh krita --new-image RGBA,U8,500,500 + +It can be necessary to substantially increase the `vm.max_map_count` sysctl to +accommodate the large number of mappings caused by guard slabs and large +allocation guard regions. 
The number of mappings can also be drastically +reduced via a significant increase to `CONFIG_GUARD_SLABS_INTERVAL` but the +feature has a low performance and memory usage cost so that isn't recommended. + +It can offer slightly better performance when integrated into the C standard +library and there are other opportunities for similar hardening within C +standard library and dynamic linker implementations. For example, a library +region can be implemented to offer similar isolation for dynamic libraries as +this allocator offers across different size classes. The intention is that this +will be offered as part of hardened variants of the Bionic and musl C standard +libraries. + +### Automated Test Framework + +A collection of simple, automated tests are provided and can be run with the +make command as follows: + + make test + +## Compatibility + +OpenSSH 8.1 or higher is required to allow the mprotect `PROT_READ|PROT_WRITE` +system calls in the seccomp-bpf filter rather than killing the process. + +## OS integration + +### Android-based operating systems + +On GrapheneOS, hardened\_malloc is integrated into the standard C library as +the standard malloc implementation. Other Android-based operating systems can +reuse [the integration +code](https://github.com/GrapheneOS/platform_bionic/commit/20160b81611d6f2acd9ab59241bebeac7cf1d71c) +to provide it. If desired, jemalloc can be left as a runtime configuration +option by only conditionally using hardened\_malloc to give users the choice +between performance and security. However, this reduces security for threat +models where persistent state is untrusted, i.e. verified boot and attestation +(see the [attestation sister project](https://attestation.app/about)). + +Make sure to raise `vm.max_map_count` substantially too to accommodate the very +large number of guard pages created by hardened\_malloc. This can be done in +`init.rc` (`system/core/rootdir/init.rc`) near the other virtual memory +configuration: + + write /proc/sys/vm/max_map_count 1048576 + +This is unnecessary if you set `CONFIG_GUARD_SLABS_INTERVAL` to a very large +value in the build configuration. + +### Traditional Linux-based operating systems + +On traditional Linux-based operating systems, hardened\_malloc can either be +integrated into the libc implementation as a replacement for the standard +malloc implementation or loaded as a dynamic library. Rather than rebuilding +each executable to be linked against it, it can be added as a preloaded +library to `/etc/ld.so.preload`. For example, with `libhardened_malloc.so` +installed to `/usr/local/lib/libhardened_malloc.so`, add that full path as a +line to the `/etc/ld.so.preload` configuration file: + + /usr/local/lib/libhardened_malloc.so + +The format of this configuration file is a whitespace-separated list, so it's +good practice to put each library on a separate line. + +Using the `LD_PRELOAD` environment variable to load it on a case-by-case basis +will not work when `AT_SECURE` is set such as with setuid binaries. It's also +generally not a recommended approach for production usage. The recommendation +is to enable it globally and make exceptions for performance critical cases by +running the application in a container / namespace without it enabled. + +Make sure to raise `vm.max_map_count` substantially too to accommodate the very +large number of guard pages created by hardened\_malloc. 
As an example, in +`/etc/sysctl.d/hardened_malloc.conf`: + + vm.max_map_count = 1048576 + +This is unnecessary if you set `CONFIG_GUARD_SLABS_INTERVAL` to a very large +value in the build configuration. + +On arm64, make sure your kernel is configured to use 4k pages since we haven't +yet added support for 16k and 64k pages. The kernel also has to be configured +to use 4 level page tables for the full 48 bit address space instead of only +having a 39 bit address space for the default hardened\_malloc configuration. +It's possible to reduce the class region size substantially to make a 39 bit +address space workable but the defaults won't work. + +## Configuration + +You can set some configuration options at compile-time via arguments to the +make command as follows: + + make CONFIG_EXAMPLE=false + +Configuration options are provided when there are significant compromises +between portability, performance, memory usage or security. The core design +choices are not configurable and the allocator remains very security-focused +even with all the optional features disabled. + +The configuration system supports a configuration template system with two +standard presets: the default configuration (`config/default.mk`) and a light +configuration (`config/light.mk`). Packagers are strongly encouraged to ship +both the standard `default` and `light` configuration. You can choose the +configuration to build using `make VARIANT=light` where `make VARIANT=default` +is the same as `make`. Non-default configuration templates will build a library +with the suffix `-variant` such as `libhardened_malloc-light.so` and will use +an `out-variant` directory instead of `out` for the build. + +The `default` configuration template has all normal optional security features +enabled (just not the niche `CONFIG_SEAL_METADATA`) and is quite aggressive in +terms of sacrificing performance and memory usage for security. The `light` +configuration template disables the slab quarantines, write after free check, +slot randomization and raises the guard slab interval from 1 to 8 but leaves +zero-on-free and slab canaries enabled. The `light` configuration has solid +performance and memory usage while still being far more secure than mainstream +allocators with much better security properties. Disabling zero-on-free would +gain more performance but doesn't make much difference for small allocations +without also disabling slab canaries. Slab canaries slightly raise memory use +and slightly slow down performance but are quite important to mitigate small +overflows and C string overflows. Disabling slab canaries is not recommended +in most cases since it would no longer be a strict upgrade over traditional +allocators with headers on allocations and basic consistency checks for them. + +For reduced memory usage at the expense of performance (this will also reduce +the size of the empty slab caches and quarantines, saving a lot of memory, +since those are currently based on the size of the largest size class): + + make \ + N_ARENA=1 \ + CONFIG_EXTENDED_SIZE_CLASSES=false + +The following boolean configuration options are available: + +* `CONFIG_WERROR`: `true` (default) or `false` to control whether compiler + warnings are treated as errors. This is highly recommended, but it can be + disabled to avoid patching the Makefile if a compiler version not tested by + the project is being used and has warnings. Investigating these warnings is + still recommended and the intention is to always be free of any warnings. 
+* `CONFIG_NATIVE`: `true` (default) or `false` to control whether the code is + optimized for the detected CPU on the host. If this is disabled, setting up a + custom `-march` higher than the baseline architecture is highly recommended + due to substantial performance benefits for this code. +* `CONFIG_CXX_ALLOCATOR`: `true` (default) or `false` to control whether the + C++ allocator is replaced for slightly improved performance and detection of + mismatched sizes for sized deallocation (often type confusion bugs). This + will result in linking against the C++ standard library. +* `CONFIG_ZERO_ON_FREE`: `true` (default) or `false` to control whether small + allocations are zeroed on free, to mitigate use-after-free and uninitialized + use vulnerabilities along with purging lots of potentially sensitive data + from the process as soon as possible. This has a performance cost scaling to + the size of the allocation, which is usually acceptable. This is not relevant + to large allocations because the pages are given back to the kernel. +* `CONFIG_WRITE_AFTER_FREE_CHECK`: `true` (default) or `false` to control + sanity checking that new small allocations contain zeroed memory. This can + detect writes caused by a write-after-free vulnerability and mixes well with + the features for making memory reuse randomized / delayed. This has a + performance cost scaling to the size of the allocation, which is usually + acceptable. This is not relevant to large allocations because they're always + a fresh memory mapping from the kernel. +* `CONFIG_SLOT_RANDOMIZE`: `true` (default) or `false` to randomize selection + of free slots within slabs. This has a measurable performance cost and isn't + one of the important security features, but the cost has been deemed more + than acceptable to be enabled by default. +* `CONFIG_SLAB_CANARY`: `true` (default) or `false` to enable support for + adding 8 byte canaries to the end of memory allocations. The primary purpose + of the canaries is to render small fixed size buffer overflows harmless by + absorbing them. The first byte of the canary is always zero, containing + overflows caused by a missing C string NUL terminator. The other 7 bytes are + a per-slab random value. On free, integrity of the canary is checked to + detect attacks like linear overflows or other forms of heap corruption caused + by imprecise exploit primitives. However, checking on free will often be too + late to prevent exploitation so it's not the main purpose of the canaries. +* `CONFIG_SEAL_METADATA`: `true` or `false` (default) to control whether Memory + Protection Keys are used to disable access to all writable allocator state + outside of the memory allocator code. It's currently disabled by default due + to a significant performance cost for this use case on current generation + hardware, which may become drastically lower in the future. Whether or not + this feature is enabled, the metadata is all contained within an isolated + memory region with high entropy random guard regions around it. + +The following integer configuration options are available: + +* `CONFIG_SLAB_QUARANTINE_RANDOM_LENGTH`: `1` (default) to control the number + of slots in the random array used to randomize reuse for small memory + allocations. 
This sets the length for the largest size class (either 16kiB + or 128kiB based on `CONFIG_EXTENDED_SIZE_CLASSES`) and the quarantine length + for smaller size classes is scaled to match the total memory of the + quarantined allocations (1 becomes 1024 for 16 byte allocations with 16kiB + as the largest size class, or 8192 with 128kiB as the largest). +* `CONFIG_SLAB_QUARANTINE_QUEUE_LENGTH`: `1` (default) to control the number of + slots in the queue used to delay reuse for small memory allocations. This + sets the length for the largest size class (either 16kiB or 128kiB based on + `CONFIG_EXTENDED_SIZE_CLASSES`) and the quarantine length for smaller size + classes is scaled to match the total memory of the quarantined allocations (1 + becomes 1024 for 16 byte allocations with 16kiB as the largest size class, or + 8192 with 128kiB as the largest). +* `CONFIG_GUARD_SLABS_INTERVAL`: `1` (default) to control the number of slabs + before a slab is skipped and left as an unused memory protected guard slab. + The default of `1` leaves a guard slab between every slab. This feature does + not have a *direct* performance cost, but it makes the address space usage + sparser which can indirectly hurt performance. The kernel also needs to track + a lot more memory mappings, which uses a bit of extra memory and slows down + memory mapping and memory protection changes in the process. The kernel uses + O(log n) algorithms for this and system calls are already fairly slow anyway, + so having many extra mappings doesn't usually add up to a significant cost. +* `CONFIG_GUARD_SIZE_DIVISOR`: `2` (default) to control the maximum size of the + guard regions placed on both sides of large memory allocations, relative to + the usable size of the memory allocation. +* `CONFIG_REGION_QUARANTINE_RANDOM_LENGTH`: `256` (default) to control the + number of slots in the random array used to randomize region reuse for large + memory allocations. +* `CONFIG_REGION_QUARANTINE_QUEUE_LENGTH`: `1024` (default) to control the + number of slots in the queue used to delay region reuse for large memory + allocations. +* `CONFIG_REGION_QUARANTINE_SKIP_THRESHOLD`: `33554432` (default) to control + the size threshold where large allocations will not be quarantined. +* `CONFIG_FREE_SLABS_QUARANTINE_RANDOM_LENGTH`: `32` (default) to control the + number of slots in the random array used to randomize free slab reuse. +* `CONFIG_CLASS_REGION_SIZE`: `34359738368` (default) to control the size of + the size class regions. +* `CONFIG_N_ARENA`: `4` (default) to control the number of arenas +* `CONFIG_STATS`: `false` (default) to control whether stats on allocation / + deallocation count and active allocations are tracked. See the [section on + stats](#stats) for more details. +* `CONFIG_EXTENDED_SIZE_CLASSES`: `true` (default) to control whether small + size class go up to 128kiB instead of the minimum requirement for avoiding + memory waste of 16kiB. The option to extend it even further will be offered + in the future when better support for larger slab allocations is added. See + the [section on size classes](#size-classes) below for details. +* `CONFIG_LARGE_SIZE_CLASSES`: `true` (default) to control whether large + allocations use the slab allocation size class scheme instead of page size + granularity. See the [section on size classes](#size-classes) below for + details. 
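As a purely illustrative combination (not a recommendation), the `light`
template can be mixed with command-line overrides, since variables passed on
the make command line take precedence over the values pulled in from
`config/light.mk`. This would build `out-light/libhardened_malloc-light.so`
with two arenas and without treating warnings as errors:

    make VARIANT=light CONFIG_N_ARENA=2 CONFIG_WERROR=false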
+ +There will be more control over enabled features in the future along with +control over fairly arbitrarily chosen values like the size of empty slab +caches (making them smaller improves security and reduces memory usage while +larger caches can substantially improves performance). + +## Core design + +The core design of the allocator is very simple / minimalist. The allocator is +exclusive to 64-bit platforms in order to take full advantage of the abundant +address space without being constrained by needing to keep the design +compatible with 32-bit. + +The mutable allocator state is entirely located within a dedicated metadata +region, and the allocator is designed around this approach for both small +(slab) allocations and large allocations. This provides reliable, deterministic +protections against invalid free including double frees, and protects metadata +from attackers. Traditional allocator exploitation techniques do not work with +the hardened\_malloc implementation. + +Small allocations are always located in a large memory region reserved for slab +allocations. On free, it can be determined that an allocation is one of the +small size classes from the address range. If arenas are enabled, the arena is +also determined from the address range as each arena has a dedicated sub-region +in the slab allocation region. Arenas provide totally independent slab +allocators with their own allocator state and no coordination between them. +Once the base region is determined (simply the slab allocation region as a +whole without any arenas enabled), the size class is determined from the +address range too, since it's divided up into a sub-region for each size class. +There's a top level slab allocation region, divided up into arenas, with each +of those divided up into size class regions. The size class regions each have a +random base within a large guard region. Once the size class is determined, the +slab size is known, and the index of the slab is calculated and used to obtain +the slab metadata for the slab from the slab metadata array. Finally, the index +of the slot within the slab provides the index of the bit tracking the slot in +the bitmap. Every slab allocation slot has a dedicated bit in a bitmap tracking +whether it's free, along with a separate bitmap for tracking allocations in the +quarantine. The slab metadata entries in the array have intrusive lists +threaded through them to track partial slabs (partially filled, and these are +the first choice for allocation), empty slabs (limited amount of cached free +memory) and free slabs (purged / memory protected). + +Large allocations are tracked via a global hash table mapping their address to +their size and random guard size. They're simply memory mappings and get mapped +on allocation and then unmapped on free. Large allocations are the only dynamic +memory mappings made by the allocator, since the address space for allocator +state (including both small / large allocation metadata) and slab allocations +is statically reserved. + +This allocator is aimed at production usage, not aiding with finding and fixing +memory corruption bugs for software development. It does find many latent bugs +but won't include features like the option of generating and storing stack +traces for each allocation to include the allocation site in related error +messages. The design choices are based around minimizing overhead and +maximizing security which often leads to different decisions than a tool +attempting to find bugs. 
For example, it uses zero-based sanitization on free +and doesn't minimize slack space from size class rounding between the end of an +allocation and the canary / guard region. Zero-based filling has the least +chance of uncovering latent bugs, but also the best chance of mitigating +vulnerabilities. The canary feature is primarily meant to act as padding +absorbing small overflows to render them harmless, so slack space is helpful +rather than harmful despite not detecting the corruption on free. The canary +needs detection on free in order to have any hope of stopping other kinds of +issues like a sequential overflow, which is why it's included. It's assumed +that an attacker can figure out the allocator is in use so the focus is +explicitly not on detecting bugs that are impossible to exploit with it in use +like an 8 byte overflow. The design choices would be different if performance +was a bit less important and if a core goal was finding latent bugs. + +## Security properties + +* Fully out-of-line metadata/state with protection from corruption + * Address space for allocator state is entirely reserved during + initialization and never reused for allocations or anything else + * State within global variables is entirely read-only after initialization + with pointers to the isolated allocator state so leaking the address of + the library doesn't leak the address of writable state + * Allocator state is located within a dedicated region with high entropy + randomly sized guard regions around it + * Protection via Memory Protection Keys (MPK) on x86\_64 (disabled by + default due to low benefit-cost ratio on top of baseline protections) + * [future] Protection via MTE on ARMv8.5+ +* Deterministic detection of any invalid free (unallocated, unaligned, etc.) 
+ * Validation of the size passed for C++14 sized deallocation by `delete` + even for code compiled with earlier standards (detects type confusion if + the size is different) and by various containers using the allocator API + directly +* Isolated memory region for slab allocations + * Top-level isolated regions for each arena + * Divided up into isolated inner regions for each size class + * High entropy random base for each size class region + * No deterministic / low entropy offsets between allocations with + different size classes + * Metadata is completely outside the slab allocation region + * No references to metadata within the slab allocation region + * No deterministic / low entropy offsets to metadata + * Entire slab region starts out non-readable and non-writable + * Slabs beyond the cache limit are purged and become non-readable and + non-writable memory again + * Placed into a queue for reuse in FIFO order to maximize the time + spent memory protected + * Randomized array is used to add a random delay for reuse +* Fine-grained randomization within memory regions + * Randomly sized guard regions for large allocations + * Random slot selection within slabs + * Randomized delayed free for small and large allocations along with slabs + themselves + * [in-progress] Randomized choice of slabs + * [in-progress] Randomized allocation of slabs +* Slab allocations are zeroed on free +* Detection of write-after-free for slab allocations by verifying zero filling + is intact at allocation time +* Delayed free via a combination of FIFO and randomization for slab allocations +* Large allocations are purged and memory protected on free with the memory + mapping kept reserved in a quarantine to detect use-after-free + * The quarantine is primarily based on a FIFO ring buffer, with the oldest + mapping in the quarantine being unmapped to make room for the most + recently freed mapping + * Another layer of the quarantine swaps with a random slot in an array to + randomize the number of large deallocations required to push mappings out + of the quarantine +* Memory in fresh allocations is consistently zeroed due to it either being + fresh pages or zeroed on free after previous usage +* Random canaries placed after each slab allocation to *absorb* + and then later detect overflows/underflows + * High entropy per-slab random values + * Leading byte is zeroed to contain C string overflows +* Possible slab locations are skipped and remain memory protected, leaving slab + size class regions interspersed with guard pages +* Zero size allocations are a dedicated size class with the entire region + remaining non-readable and non-writable +* Extension for retrieving the size of allocations with fallback to a sentinel + for pointers not managed by the allocator [in-progress, full implementation + needs to be ported from the previous OpenBSD malloc-based allocator] + * Can also return accurate values for pointers *within* small allocations + * The same applies to pointers within the first page of large allocations, + otherwise it currently has to return a sentinel +* No alignment tricks interfering with ASLR like jemalloc, PartitionAlloc, etc. +* No usage of the legacy brk heap +* Aggressive sanity checks + * Errors other than ENOMEM from mmap, munmap, mprotect and mremap treated + as fatal, which can help to detect memory management gone wrong elsewhere + in the process. 
+* Memory tagging for slab allocations via MTE on ARMv8.5+ + * random memory tags as the baseline, providing probabilistic protection + against various forms of memory corruption + * dedicated tag for free slots, set on free, for deterministic protection + against accessing freed memory + * guarantee distinct tags for adjacent memory allocations by incrementing + past matching values for deterministic detection of linear overflows + * [future] store previous random tag and increment it to get the next tag + for that slot to provide deterministic use-after-free detection through + multiple cycles of memory reuse + +## Randomness + +The current implementation of random number generation for randomization-based +mitigations is based on generating a keystream from a stream cipher (ChaCha8) +in small chunks. Separate CSPRNGs are used for each small size class in each +arena, large allocations and initialization in order to fit into the +fine-grained locking model without needing to waste memory per thread by +having the CSPRNG state in Thread Local Storage. Similarly, it's protected via +the same approach taken for the rest of the metadata. The stream cipher is +regularly reseeded from the OS to provide backtracking and prediction +resistance with a negligible cost. The reseed interval simply needs to be +adjusted to the point that it stops registering as having any significant +performance impact. The performance impact on recent Linux kernels is +primarily from the high cost of system calls and locking since the +implementation is quite efficient (ChaCha20), especially for just generating +the key and nonce for another stream cipher (ChaCha8). + +ChaCha8 is a great fit because it's extremely fast across platforms without +relying on hardware support or complex platform-specific code. The security +margins of ChaCha20 would be completely overkill for the use case. Using +ChaCha8 avoids needing to resort to a non-cryptographically secure PRNG or +something without a lot of scrutiny. The current implementation is simply the +reference implementation of ChaCha8 converted into a pure keystream by ripping +out the XOR of the message into the keystream. + +The random range generation functions are a highly optimized implementation +too. Traditional uniform random number generation within a range is very high +overhead and can easily dwarf the cost of an efficient CSPRNG. + +## Size classes + +The zero byte size class is a special case of the smallest regular size class. +It's allocated in a dedicated region like other size classes but with the slabs +never being made readable and writable so the only memory usage is for the slab +metadata. + +The choice of size classes for slab allocation is the same as jemalloc, which +is a careful balance between minimizing internal and external fragmentation. If +there are more size classes, more memory is wasted on free slots available only +to allocation requests of those sizes (external fragmentation). If there are +fewer size classes, the spacing between them is larger and more memory is +wasted due to rounding up to the size classes (internal fragmentation). There +are 4 special size classes for the smallest sizes (16, 32, 48, 64) that are +simply spaced out by the minimum spacing (16). Afterwards, there are four size +classes for every power of two spacing which results in bounding the internal +fragmentation below 20% for each size class. This also means there are 4 size +classes for each doubling in size. 
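To make the spacing rule concrete, here is a small standalone C sketch
(illustrative only, not code from `h_malloc.c`) that rounds a requested size up
to its size class following the scheme described above: four special classes
spaced by the 16 byte minimum, then four classes per power-of-two doubling.

```c
#include <stddef.h>
#include <stdio.h>

// Illustrative only: round a request up to its slab allocation size class
// under the spacing scheme described above.
static size_t size_class(size_t size) {
    if (size == 0) {
        return 0; // the zero-byte size class is a special case
    }
    if (size <= 64) {
        // the 4 special classes (16, 32, 48, 64) use the minimum spacing of 16
        return (size + 15) & ~(size_t)15;
    }
    // find the enclosing power-of-two group; the spacing is an eighth of the
    // group's upper bound, which yields 4 size classes per doubling in size
    size_t bound = 128;
    while (bound < size) {
        bound *= 2;
    }
    size_t spacing = bound / 8;
    return (size + spacing - 1) & ~(spacing - 1);
}

int main(void) {
    // 17 -> 32, 129 -> 160 (the ~19.4% worst case), 4097 -> 5120
    printf("%zu %zu %zu\n", size_class(17), size_class(129), size_class(4097));
    return 0;
}
```

Under this rule the rounding waste stays just under 20% for every class above
the special ones, matching the worst case internal fragmentation column in the
tables below.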
+ +The slot counts tied to the size classes are specific to this allocator rather +than being taken from jemalloc. Slabs are always a span of pages so the slot +count needs to be tuned to minimize waste due to rounding to the page size. For +now, this allocator is set up only for 4096 byte pages as a small page size is +desirable for finer-grained memory protection and randomization. It could be +ported to larger page sizes in the future. The current slot counts are only a +preliminary set of values. + +| size class | worst case internal fragmentation | slab slots | slab size | internal fragmentation for slabs | +| - | - | - | - | - | +| 16 | 93.75% | 256 | 4096 | 0.0% | +| 32 | 46.88% | 128 | 4096 | 0.0% | +| 48 | 31.25% | 85 | 4096 | 0.390625% | +| 64 | 23.44% | 64 | 4096 | 0.0% | +| 80 | 18.75% | 51 | 4096 | 0.390625% | +| 96 | 15.62% | 42 | 4096 | 1.5625% | +| 112 | 13.39% | 36 | 4096 | 1.5625% | +| 128 | 11.72% | 64 | 8192 | 0.0% | +| 160 | 19.38% | 51 | 8192 | 0.390625% | +| 192 | 16.15% | 64 | 12288 | 0.0% | +| 224 | 13.84% | 54 | 12288 | 1.5625% | +| 256 | 12.11% | 64 | 16384 | 0.0% | +| 320 | 19.69% | 64 | 20480 | 0.0% | +| 384 | 16.41% | 64 | 24576 | 0.0% | +| 448 | 14.06% | 64 | 28672 | 0.0% | +| 512 | 12.3% | 64 | 32768 | 0.0% | +| 640 | 19.84% | 64 | 40960 | 0.0% | +| 768 | 16.54% | 64 | 49152 | 0.0% | +| 896 | 14.17% | 64 | 57344 | 0.0% | +| 1024 | 12.4% | 64 | 65536 | 0.0% | +| 1280 | 19.92% | 16 | 20480 | 0.0% | +| 1536 | 16.6% | 16 | 24576 | 0.0% | +| 1792 | 14.23% | 16 | 28672 | 0.0% | +| 2048 | 12.45% | 16 | 32768 | 0.0% | +| 2560 | 19.96% | 8 | 20480 | 0.0% | +| 3072 | 16.63% | 8 | 24576 | 0.0% | +| 3584 | 14.26% | 8 | 28672 | 0.0% | +| 4096 | 12.48% | 8 | 32768 | 0.0% | +| 5120 | 19.98% | 8 | 40960 | 0.0% | +| 6144 | 16.65% | 8 | 49152 | 0.0% | +| 7168 | 14.27% | 8 | 57344 | 0.0% | +| 8192 | 12.49% | 8 | 65536 | 0.0% | +| 10240 | 19.99% | 6 | 61440 | 0.0% | +| 12288 | 16.66% | 5 | 61440 | 0.0% | +| 14336 | 14.28% | 4 | 57344 | 0.0% | +| 16384 | 12.49% | 4 | 65536 | 0.0% | + +The slab allocation size classes end at 16384 since that's the final size for +2048 byte spacing and the next spacing class matches the page size of 4096 +bytes on the target platforms. This is the minimum set of small size classes +required to avoid substantial waste from rounding. + +The `CONFIG_EXTENDED_SIZE_CLASSES` option extends the size classes up to +131072, with a final spacing class of 16384. This offers improved performance +compared to the minimum set of size classes. The security story is complicated, +since the slab allocation has both advantages like size class isolation +completely avoiding reuse of any of the address space for any other size +classes or other data. It also has disadvantages like caching a small number of +empty slabs and deterministic guard sizes. The cache will be configurable in +the future, making it possible to disable slab caching for the largest slab +allocation sizes, to force unmapping them immediately and putting them in the +slab quarantine, which eliminates most of the security disadvantage at the +expense of also giving up most of the performance advantage, but while +retaining the isolation. 
+ +| size class | worst case internal fragmentation | slab slots | slab size | internal fragmentation for slabs | +| - | - | - | - | - | +| 20480 | 20.0% | 1 | 20480 | 0.0% | +| 24576 | 16.66% | 1 | 24576 | 0.0% | +| 28672 | 14.28% | 1 | 28672 | 0.0% | +| 32768 | 12.5% | 1 | 32768 | 0.0% | +| 40960 | 20.0% | 1 | 40960 | 0.0% | +| 49152 | 16.66% | 1 | 49152 | 0.0% | +| 57344 | 14.28% | 1 | 57344 | 0.0% | +| 65536 | 12.5% | 1 | 65536 | 0.0% | +| 81920 | 20.0% | 1 | 81920 | 0.0% | +| 98304 | 16.67% | 1 | 98304 | 0.0% | +| 114688 | 14.28% | 1 | 114688 | 0.0% | +| 131072 | 12.5% | 1 | 131072 | 0.0% | + +The `CONFIG_LARGE_SIZE_CLASSES` option controls whether large allocations use +the same size class scheme providing 4 size classes for every doubling of size. +It increases virtual memory consumption but drastically improves performance +where realloc is used without proper growth factors, which is fairly common and +destroys performance in some commonly used programs. If large size classes are +disabled, the granularity is instead the page size, which is currently always +4096 bytes on supported platforms. + +## Scalability + +### Small (slab) allocations + +As a baseline form of fine-grained locking, the slab allocator has entirely +separate allocators for each size class. Each size class has a dedicated lock, +CSPRNG and other state. + +The slab allocator's scalability primarily comes from dividing up the slab +allocation region into independent arenas assigned to threads. The arenas are +just entirely separate slab allocators with their own sub-regions for each size +class. Using 4 arenas reserves a region 4 times as large and the relevant slab +allocator metadata is determined based on address, as part of the same approach +to finding the per-size-class metadata. The part that's still open to different +design choices is how arenas are assigned to threads. One approach is +statically assigning arenas via round-robin like the standard jemalloc +implementation, or statically assigning to a random arena which is essentially +the current implementation. Another option is dynamic load balancing via a +heuristic like `sched_getcpu` for per-CPU arenas, which would offer better +performance than randomly choosing an arena each time while being more +predictable for an attacker. There are actually some security benefits from +this assignment being completely static, since it isolates threads from each +other. Static assignment can also reduce memory usage since threads may have +varying usage of size classes. + +When there's substantial allocation or deallocation pressure, the allocator +does end up calling into the kernel to purge / protect unused slabs by +replacing them with fresh `PROT_NONE` regions along with unprotecting slabs +when partially filled and cached empty slabs are depleted. There will be +configuration over the amount of cached empty slabs, but it's not entirely a +performance vs. memory trade-off since memory protecting unused slabs is a nice +opportunistic boost to security. However, it's not really part of the core +security model or features so it's quite reasonable to use much larger empty +slab caches when the memory usage is acceptable. It would also be reasonable to +attempt to use heuristics for dynamically tuning the size, but there's not a +great one size fits all approach so it isn't currently part of this allocator +implementation. 
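To give a concrete picture of the address-based metadata lookup described
above and in the core design section, here is a hypothetical C sketch. The
flat arena and size class ordering, the names and the size class count are
assumptions made for illustration rather than the actual layout in
`h_malloc.c`; only the 32 GiB class region size corresponds to the documented
`CONFIG_CLASS_REGION_SIZE` default.

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

// Assumed layout for the sketch: one reserved slab region divided into arenas,
// each arena divided into fixed-size per-size-class regions.
#define N_ARENA 4u
#define N_SIZE_CLASSES 49u                 // assumed count, for illustration
#define CLASS_REGION_SIZE (1ULL << 35)     // 32 GiB, the documented default

struct slab_location {
    unsigned arena;
    unsigned size_class;
    uint64_t class_region_offset; // slab index = offset / slab size
};

// base is the start of the reserved slab allocation region chosen at startup
static bool locate_slab_allocation(uintptr_t base, const void *p,
                                   struct slab_location *out) {
    uint64_t offset = (uintptr_t)p - base;
    uint64_t arena_size = (uint64_t)N_SIZE_CLASSES * CLASS_REGION_SIZE;
    if (offset >= (uint64_t)N_ARENA * arena_size) {
        return false; // outside the slab region, so treated as a large allocation
    }
    out->arena = (unsigned)(offset / arena_size);
    out->size_class = (unsigned)((offset % arena_size) / CLASS_REGION_SIZE);
    out->class_region_offset = offset % CLASS_REGION_SIZE;
    return true;
}

int main(void) {
    // toy base address purely to exercise the arithmetic
    uintptr_t base = (uintptr_t)1 << 44;
    struct slab_location loc;
    void *p = (void *)(base + 3 * CLASS_REGION_SIZE + 4096);
    if (locate_slab_allocation(base, p, &loc)) {
        printf("arena %u, size class %u\n", loc.arena, loc.size_class);
    }
    return 0;
}
```

From the size class, the slab size is known, so the slab index and then the
slot bit within the out-of-line bitmap follow from simple division, which is
the walk described in the core design section.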
+ +#### Thread caching (or lack thereof) + +Thread caches are a commonly implemented optimization in modern allocators but +aren't very suitable for a hardened allocator even when implemented via arrays +like jemalloc rather than free lists. They would prevent the allocator from +having perfect knowledge about which memory is free in a way that's both race +free and works with fully out-of-line metadata. It would also interfere with +the quality of fine-grained randomization even with randomization support in +the thread caches. The caches would also end up with much weaker protection +than the dedicated metadata region. Potentially worst of all, it's inherently +incompatible with the important quarantine feature. + +The primary benefit from a thread cache is performing batches of allocations +and batches of deallocations to amortize the cost of the synchronization used +by locking. The issue is not contention but rather the cost of synchronization +itself. Performing operations in large batches isn't necessarily a good thing +in terms of reducing contention to improve scalability. Large thread caches +like TCMalloc are a legacy design choice and aren't a good approach for a +modern allocator. In jemalloc, thread caches are fairly small and have a form +of garbage collection to clear them out when they aren't being heavily used. +Since this is a hardened allocator with a bunch of small costs for the security +features, the synchronization is already a smaller percentage of the overall +time compared to a much leaner performance-oriented allocator. These benefits +could be obtained via allocation queues and deallocation queues which would +avoid bypassing the quarantine and wouldn't have as much of an impact on +randomization. However, deallocation queues would also interfere with having +global knowledge about what is free. An allocation queue alone wouldn't have +many drawbacks, but it isn't currently planned even as an optional feature +since it probably wouldn't be enabled by default and isn't worth the added +complexity. + +The secondary benefit of thread caches is being able to avoid the underlying +allocator implementation entirely for some allocations and deallocations when +they're mixed together rather than many allocations being done together or many +frees being done together. The value of this depends a lot on the application +and it's entirely unsuitable / incompatible with a hardened allocator since it +bypasses all of the underlying security and would destroy much of the security +value. + +### Large allocations + +The expectation is that the allocator does not need to perform well for large +allocations, especially in terms of scalability. When the performance for large +allocations isn't good enough, the approach will be to enable more slab +allocation size classes. Doubling the maximum size of slab allocations only +requires adding 4 size classes while keeping internal waste bounded below 20%. + +Large allocations are implemented as a wrapper on top of the kernel memory +mapping API. The addresses and sizes are tracked in a global data structure +with a global lock. The current implementation is a hash table and could easily +use fine-grained locking, but it would have little benefit since most of the +locking is in the kernel. Most of the contention will be on the `mmap_sem` lock +for the process in the kernel. Ideally, it could simply map memory when +allocating and unmap memory when freeing. 
However, this is a hardened allocator +and the security features require extra system calls due to lack of direct +support for this kind of hardening in the kernel. Randomly sized guard regions +are placed around each allocation which requires mapping a `PROT_NONE` region +including the guard regions and then unprotecting the usable area between them. +The quarantine implementation requires clobbering the mapping with a fresh +`PROT_NONE` mapping using `MAP_FIXED` on free to hold onto the region while +it's in the quarantine, until it's eventually unmapped when it's pushed out of +the quarantine. This means there are 2x as many system calls for allocating and +freeing as there would be if the kernel supported these features directly. + +## Memory tagging + +**Memory tagging has been implemented and this section is currently +out-of-date.** + +Integrating extensive support for ARMv8.5 memory tagging is planned and this +section will be expanded to cover the details on the chosen design. The approach +for slab allocations is currently covered, but it can also be used for the +allocator metadata region and large allocations. + +Memory allocations are already always multiples of naturally aligned 16 byte +units, so memory tags are a natural fit into a malloc implementation due to the +16 byte alignment requirement. The only extra memory consumption will come from +the hardware supported storage for the tag values (4 bits per 16 bytes). + +The baseline policy will be to generate random tags for each slab allocation +slot on first use. The highest value will be reserved for marking freed memory +allocations to detect any accesses to freed memory so it won't be part of the +generated range. Adjacent slots will be guaranteed to have distinct memory tags +in order to guarantee that linear overflows are detected. There are a few ways +of implementing this and it will end up depending on the performance costs of +different approaches. If there's an efficient way to fetch the adjacent tag +values without wasting extra memory, it will be possible to check for them and +skip them either by generating a new random value in a loop or incrementing +past them since the tiny bit of bias wouldn't matter. Another approach would be +alternating odd and even tag values but that would substantially reduce the +overall randomness of the tags and there's very little entropy from the start. + +Once a slab allocation has been freed, the tag will be set to the reserved +value for free memory and the previous tag value will be stored inside the +allocation itself. The next time the slot is allocated, the chosen tag value +will be the previous value incremented by one to provide use-after-free +detection between generations of allocations. The stored tag will be wiped +before retagging the memory, to avoid leaking it and as part of preserving the +security property of newly allocated memory being zeroed due to zero-on-free. +It will eventually wrap all the way around, but this ends up providing a strong +guarantee for many allocation cycles due to the combination of 4 bit tags with +the FIFO quarantine feature providing delayed free. It also benefits from +random slot allocation and the randomized portion of delayed free, which result +in a further delay along with preventing a deterministic bypass by forcing a +reuse after a certain number of allocation cycles. Similarly to the initial tag +generation, tag values for adjacent allocations will be skipped by incrementing +past them. 
+ +For example, consider this slab of allocations that are not yet used with 15 +representing the tag for free memory. For the sake of simplicity, there will be +no quarantine or other slabs for this example: + + | 15 | 15 | 15 | 15 | 15 | 15 | + +Three slots are randomly chosen for allocations, with random tags assigned (2, +7, 14) since these slots haven't ever been used and don't have saved values: + + | 15 | 2 | 15 | 7 | 14 | 15 | + +The 2nd allocation slot is freed, and is set back to the tag for free memory +(15), but with the previous tag value stored in the freed space: + + | 15 | 15 | 15 | 7 | 14 | 15 | + +The first slot is allocated for the first time, receiving the random value 3: + + | 3 | 15 | 15 | 7 | 14 | 15 | + +The 2nd slot is randomly chosen again, so the previous tag (2) is retrieved and +incremented to 3 as part of the use-after-free mitigation. An adjacent +allocation already uses the tag 3, so the tag is further incremented to 4 (it +would be incremented to 5 if one of the adjacent tags was 4): + + | 3 | 4 | 15 | 7 | 14 | 15 | + +The last slot is randomly chosen for the next allocation, and is assigned the +random value 14. However, it's placed next to an allocation with the tag 14 so +the tag is incremented and wraps around to 0: + + | 3 | 4 | 15 | 7 | 14 | 0 | + +## API extensions + +The `void free_sized(void *ptr, size_t expected_size)` function exposes the +sized deallocation sanity checks for C. A performance-oriented allocator could +use the same API as an optimization to avoid a potential cache miss from +reading the size from metadata. + +The `size_t malloc_object_size(void *ptr)` function returns an *upper bound* on +the accessible size of the relevant object (if any) by querying the malloc +implementation. It's similar to the `__builtin_object_size` intrinsic used by +`_FORTIFY_SOURCE` but via dynamically querying the malloc implementation rather +than determining constant sizes at compile-time. The current implementation is +just a naive placeholder returning much looser upper bounds than the intended +implementation. It's a valid implementation of the API already, but it will +become fully accurate once it's finished. This function is **not** currently +safe to call from signal handlers, but another API will be provided to make +that possible with a compile-time configuration option to avoid the necessary +overhead if the functionality isn't being used (in a way that doesn't change +break API compatibility based on the configuration). + +The `size_t malloc_object_size_fast(void *ptr)` is comparable, but avoids +expensive operations like locking or even atomics. It provides significantly +less useful results falling back to higher upper bounds, but is very fast. In +this implementation, it retrieves an upper bound on the size for small memory +allocations based on calculating the size class region. This function is safe +to use from signal handlers already. + +## Stats + +If stats are enabled, hardened\_malloc keeps tracks allocator statistics in +order to provide implementations of `mallinfo` and `malloc_info`. + +On Android, `mallinfo` is used for [mallinfo-based garbage collection +triggering](https://developer.android.com/preview/features#mallinfo) so +hardened\_malloc enables `CONFIG_STATS` by default. The `malloc_info` +implementation on Android is the standard one in Bionic, with the information +provided to Bionic via Android's internal extended `mallinfo` API with support +for arenas and size class bins. 
This means the `malloc_info` output is fully +compatible, including still having `jemalloc-1` as the version of the data +format to retain compatibility with existing tooling. + +On non-Android Linux, `mallinfo` has zeroed fields even with `CONFIG_STATS` +enabled because glibc `mallinfo` is inherently broken. It defines the fields as +`int` instead of `size_t`, resulting in undefined signed overflows. It also +misuses the fields and provides a strange, idiosyncratic set of values rather +than following the SVID/XPG `mallinfo` definition. The `malloc_info` function +is still provided, with a similar format as what Android uses, with tweaks for +hardened\_malloc and the version set to `hardened_malloc-1`. The data format +may be changed in the future. + +As an example, consider the following program from the hardened\_malloc tests: + +```c +#include + +#include + +__attribute__((optimize(0))) +void leak_memory(void) { + (void)malloc(1024 * 1024 * 1024); + (void)malloc(16); + (void)malloc(32); + (void)malloc(4096); +} + +void *do_work(void *p) { + leak_memory(); + return NULL; +} + +int main(void) { + pthread_t thread[4]; + for (int i = 0; i < 4; i++) { + pthread_create(&thread[i], NULL, do_work, NULL); + } + for (int i = 0; i < 4; i++) { + pthread_join(thread[i], NULL); + } + + malloc_info(0, stdout); +} +``` + +This produces the following output when piped through `xmllint --format -`: + +```xml + + + + + 1 + 0 + 4096 + 32 + + + 1 + 0 + 4096 + 48 + + + 4 + 0 + 20480 + 1280 + + + 2 + 0 + 40960 + 10240 + + + 1 + 0 + 81920 + 81920 + + + + + 1 + 0 + 4096 + 32 + + + 1 + 0 + 4096 + 48 + + + 1 + 0 + 40960 + 5120 + + + + + 1 + 0 + 4096 + 32 + + + 1 + 0 + 4096 + 48 + + + 1 + 0 + 40960 + 5120 + + + + + 1 + 0 + 4096 + 32 + + + 1 + 0 + 4096 + 48 + + + 1 + 0 + 40960 + 5120 + + + + 4294967296 + + +``` + +The heap entries correspond to the arenas. Unlike jemalloc, hardened\_malloc +doesn't handle large allocations within the arenas, so it presents those in the +`malloc_info` statistics as a separate arena dedicated to large allocations. +For example, with 4 arenas enabled, there will be a 5th arena in the statistics +for the large allocations. + +The `nmalloc` / `ndalloc` fields are 64-bit integers tracking allocation and +deallocation count. These are defined as wrapping on overflow, per the jemalloc +implementation. + +See the [section on size classes](#size-classes) to map the size class bin +number to the corresponding size class. The bin index begins at 0, mapping to +the 0 byte size class, followed by 1 for the 16 bytes, 2 for 32 bytes, etc. and +large allocations are treated as one group. + +When stats aren't enabled, the `malloc_info` output will be an empty `malloc` +element. + +## System calls + +This is intended to aid with creating system call whitelists via seccomp-bpf +and will change over time. 
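As a rough illustration of how the lists below might be consumed, a sandboxing
application could allow the core set with libseccomp along these lines. This
is a hypothetical, trimmed sketch and not something shipped by this project; a
real filter would also have to cover the rest of the application's system
calls.

```c
#include <errno.h>
#include <stddef.h>
#include <seccomp.h> // link with -lseccomp

// Allow only the system calls hardened_malloc uses in all build
// configurations and return EPERM for everything else.
static int install_allocator_allowlist(void) {
    scmp_filter_ctx ctx = seccomp_init(SCMP_ACT_ERRNO(EPERM));
    if (ctx == NULL) {
        return -1;
    }
    const int calls[] = {
        SCMP_SYS(futex),    SCMP_SYS(getrandom), SCMP_SYS(mmap),
        SCMP_SYS(mprotect), SCMP_SYS(mremap),    SCMP_SYS(munmap),
        SCMP_SYS(madvise),  SCMP_SYS(write),
    };
    for (size_t i = 0; i < sizeof(calls) / sizeof(calls[0]); i++) {
        if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, calls[i], 0) < 0) {
            seccomp_release(ctx);
            return -1;
        }
    }
    int ret = seccomp_load(ctx);
    seccomp_release(ctx);
    return ret;
}
```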
+ +System calls used by all build configurations: + +* `futex(uaddr, FUTEX_WAIT_PRIVATE, val, NULL)` (via `pthread_mutex_lock`) +* `futex(uaddr, FUTEX_WAKE_PRIVATE, val)` (via `pthread_mutex_unlock`) +* `getrandom(buf, buflen, 0)` (to seed and regularly reseed the CSPRNG) +* `mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0)` +* `mmap(ptr, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0)` +* `mprotect(ptr, size, PROT_READ)` +* `mprotect(ptr, size, PROT_READ|PROT_WRITE)` +* `mremap(old, old_size, new_size, 0)` +* `mremap(old, old_size, new_size, MREMAP_MAYMOVE|MREMAP_FIXED, new)` +* `munmap` +* `write(STDERR_FILENO, buf, len)` (before aborting due to memory corruption) +* `madvise(ptr, size, MADV_DONTNEED)` + +The main distinction from a typical malloc implementation is the use of +getrandom. A common compatibility issue is that existing system call whitelists +often omit getrandom partly due to older code using the legacy `/dev/urandom` +interface along with the overall lack of security features in mainstream libc +implementations. + +Additional system calls when `CONFIG_SEAL_METADATA=true` is set: + +* `pkey_alloc` +* `pkey_mprotect` instead of `mprotect` with an additional `pkey` parameter, + but otherwise the same (regular `mprotect` is never called) + +Additional system calls for Android builds with `LABEL_MEMORY`: + +* `prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ptr, size, name)` diff --git a/src/hardened_malloc/androidtest/Android.bp b/src/hardened_malloc/androidtest/Android.bp new file mode 100644 index 0000000..ae0aa49 --- /dev/null +++ b/src/hardened_malloc/androidtest/Android.bp @@ -0,0 +1,25 @@ +java_test_host { + name: "HMallocTest", + srcs: [ + "src/**/*.java", + ], + + libs: [ + "tradefed", + "compatibility-tradefed", + "compatibility-host-util", + ], + + static_libs: [ + "cts-host-utils", + "frameworks-base-hostutils", + ], + + test_suites: [ + "general-tests", + ], + + data_device_bins_64: [ + "memtag_test", + ], +} diff --git a/src/hardened_malloc/androidtest/AndroidTest.xml b/src/hardened_malloc/androidtest/AndroidTest.xml new file mode 100644 index 0000000..333f1dd --- /dev/null +++ b/src/hardened_malloc/androidtest/AndroidTest.xml @@ -0,0 +1,13 @@ + + + + + + + + + + diff --git a/src/hardened_malloc/androidtest/memtag/Android.bp b/src/hardened_malloc/androidtest/memtag/Android.bp new file mode 100644 index 0000000..14ab691 --- /dev/null +++ b/src/hardened_malloc/androidtest/memtag/Android.bp @@ -0,0 +1,16 @@ +cc_test { + name: "memtag_test", + srcs: ["memtag_test.cc"], + cflags: [ + "-Wall", + "-Werror", + "-Wextra", + "-O0", + ], + + compile_multilib: "64", + + sanitize: { + memtag_heap: true, + }, +} diff --git a/src/hardened_malloc/androidtest/memtag/memtag_test.cc b/src/hardened_malloc/androidtest/memtag/memtag_test.cc new file mode 100644 index 0000000..ca491d8 --- /dev/null +++ b/src/hardened_malloc/androidtest/memtag/memtag_test.cc @@ -0,0 +1,297 @@ +// needed to uncondionally enable assertions +#undef NDEBUG +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +using namespace std; + +using u8 = uint8_t; +using uptr = uintptr_t; +using u64 = uint64_t; + +const size_t DEFAULT_ALLOC_SIZE = 8; +const size_t CANARY_SIZE = 8; + +void do_context_switch() { + utsname s; + uname(&s); +} + +u8 get_pointer_tag(void *ptr) { + return (((uptr) ptr) >> 56) & 0xf; +} + +void *untag_pointer(void *ptr) { + const uintptr_t mask = UINTPTR_MAX >> 8; + return (void *) ((uintptr_t) ptr & 
mask); +} + +// This test checks that slab slot allocation uses tag that is distint from tags of its neighbors +// and from the tag of the previous allocation that used the same slot +void tag_distinctness() { + // 0 and 15 are reserved + const int min_tag = 1; + const int max_tag = 14; + + struct SizeClass { + int size; + int slot_cnt; + }; + + // values from size_classes[] and size_class_slots[] in h_malloc.c + SizeClass size_classes[] = { + { .size = 16, .slot_cnt = 256, }, + { .size = 32, .slot_cnt = 128, }, + // this size class is used by allocations that are made by the addr_tag_map, which breaks + // tag distinctess checks + // { .size = 48, .slot_cnt = 85, }, + { .size = 64, .slot_cnt = 64, }, + { .size = 80, .slot_cnt = 51, }, + { .size = 96, .slot_cnt = 42, }, + { .size = 112, .slot_cnt = 36, }, + { .size = 128, .slot_cnt = 64, }, + { .size = 160, .slot_cnt = 51, }, + { .size = 192, .slot_cnt = 64, }, + { .size = 224, .slot_cnt = 54, }, + { .size = 10240, .slot_cnt = 6, }, + { .size = 20480, .slot_cnt = 1, }, + }; + + int tag_usage[max_tag + 1]; + + for (size_t sc_idx = 0; sc_idx < sizeof(size_classes) / sizeof(SizeClass); ++sc_idx) { + SizeClass &sc = size_classes[sc_idx]; + + const size_t full_alloc_size = sc.size; + const size_t alloc_size = full_alloc_size - CANARY_SIZE; + + // "tdc" is short for "tag distinctness check" + int left_neighbor_tdc_cnt = 0; + int right_neighbor_tdc_cnt = 0; + int prev_alloc_tdc_cnt = 0; + + int iter_cnt = 600; + + unordered_map addr_tag_map; + addr_tag_map.reserve(iter_cnt * sc.slot_cnt); + + u64 seen_tags = 0; + + for (int iter = 0; iter < iter_cnt; ++iter) { + uptr allocations[256]; // 256 is max slot count + + for (int i = 0; i < sc.slot_cnt; ++i) { + u8 *p = (u8 *) malloc(alloc_size); + assert(p); + uptr addr = (uptr) untag_pointer(p); + u8 tag = get_pointer_tag(p); + + assert(tag >= min_tag && tag <= max_tag); + seen_tags |= 1 << tag; + ++tag_usage[tag]; + + // check most recent tags of left and right neighbors + + auto left = addr_tag_map.find(addr - full_alloc_size); + if (left != addr_tag_map.end()) { + assert(left->second != tag); + ++left_neighbor_tdc_cnt; + } + + auto right = addr_tag_map.find(addr + full_alloc_size); + if (right != addr_tag_map.end()) { + assert(right->second != tag); + ++right_neighbor_tdc_cnt; + } + + // check previous tag of this slot + auto prev = addr_tag_map.find(addr); + if (prev != addr_tag_map.end()) { + assert(prev->second != tag); + ++prev_alloc_tdc_cnt; + addr_tag_map.erase(addr); + } + + addr_tag_map.emplace(addr, tag); + + for (size_t j = 0; j < alloc_size; ++j) { + // check that slot is zeroed + assert(p[j] == 0); + // check that slot is readable and writable + p[j]++; + } + + allocations[i] = addr; + } + + // free some of allocations to allow their slots to be reused + for (int i = sc.slot_cnt - 1; i >= 0; i -= 2) { + free((void *) allocations[i]); + } + } + + // check that all of the tags were used, except reserved ones + assert(seen_tags == (0xffff & ~(1 << 0 | 1 << 15))); + + printf("size_class\t%i\t" "tdc_left %i\t" "tdc_right %i\t" "tdc_prev_alloc %i\n", + sc.size, left_neighbor_tdc_cnt, right_neighbor_tdc_cnt, prev_alloc_tdc_cnt); + + // make sure tag distinctess checks were actually performed + int min_tdc_cnt = sc.slot_cnt * iter_cnt / 5; + + assert(prev_alloc_tdc_cnt > min_tdc_cnt); + + if (sc.slot_cnt > 1) { + assert(left_neighbor_tdc_cnt > min_tdc_cnt); + assert(right_neighbor_tdc_cnt > min_tdc_cnt); + } + + // async tag check failures are reported on context switch + do_context_switch(); + 
} + + printf("\nTag use counters:\n"); + + int min = INT_MAX; + int max = 0; + double geomean = 0.0; + for (int i = min_tag; i <= max_tag; ++i) { + int v = tag_usage[i]; + geomean += log(v); + min = std::min(min, v); + max = std::max(max, v); + printf("%i\t%i\n", i, tag_usage[i]); + } + int tag_cnt = 1 + max_tag - min_tag; + geomean = exp(geomean / tag_cnt); + + double max_deviation = std::max((double) max - geomean, geomean - min); + + printf("geomean: %.2f, max deviation from geomean: %.2f%%\n", geomean, (100.0 * max_deviation) / geomean); +} + +u8* alloc_default() { + const size_t full_alloc_size = DEFAULT_ALLOC_SIZE + CANARY_SIZE; + set addrs; + + // make sure allocation has both left and right neighbors, otherwise overflow/underflow tests + // will fail when allocation is at the end/beginning of slab + for (;;) { + u8 *p = (u8 *) malloc(DEFAULT_ALLOC_SIZE); + assert(p); + + uptr addr = (uptr) untag_pointer(p); + uptr left = addr - full_alloc_size; + if (addrs.find(left) != addrs.end()) { + uptr right = addr + full_alloc_size; + if (addrs.find(right) != addrs.end()) { + return p; + } + } + + addrs.emplace(addr); + } +} + +volatile u8 u8_var; + +void read_after_free() { + u8 *p = alloc_default(); + free(p); + volatile u8 v = p[0]; + (void) v; +} + +void write_after_free() { + u8 *p = alloc_default(); + free(p); + p[0] = 1; +} + +void underflow_read() { + u8 *p = alloc_default(); + volatile u8 v = p[-1]; + (void) v; +} + +void underflow_write() { + u8 *p = alloc_default(); + p[-1] = 1; +} + +void overflow_read() { + u8 *p = alloc_default(); + volatile u8 v = p[DEFAULT_ALLOC_SIZE + CANARY_SIZE]; + (void) v; +} + +void overflow_write() { + u8 *p = alloc_default(); + p[DEFAULT_ALLOC_SIZE + CANARY_SIZE] = 1; +} + +void untagged_read() { + u8 *p = alloc_default(); + p = (u8 *) untag_pointer(p); + volatile u8 v = p[0]; + (void) v; +} + +void untagged_write() { + u8 *p = alloc_default(); + p = (u8 *) untag_pointer(p); + p[0] = 1; +} + +map> tests = { +#define TEST(s) { #s, s } + TEST(tag_distinctness), + TEST(read_after_free), + TEST(write_after_free), + TEST(overflow_read), + TEST(overflow_write), + TEST(underflow_read), + TEST(underflow_write), + TEST(untagged_read), + TEST(untagged_write), +#undef TEST +}; + +void segv_handler(int, siginfo_t *si, void *) { + fprintf(stderr, "SEGV_CODE %i", si->si_code); + exit(139); // standard exit code for SIGSEGV +} + +int main(int argc, char **argv) { + setbuf(stdout, NULL); + assert(argc == 2); + + auto test_name = string(argv[1]); + auto test_fn = tests[test_name]; + assert(test_fn != nullptr); + + assert(mallopt(M_BIONIC_SET_HEAP_TAGGING_LEVEL, M_HEAP_TAGGING_LEVEL_ASYNC) == 1); + + struct sigaction sa = { + .sa_sigaction = segv_handler, + .sa_flags = SA_SIGINFO, + }; + + assert(sigaction(SIGSEGV, &sa, nullptr) == 0); + + test_fn(); + do_context_switch(); + + return 0; +} diff --git a/src/hardened_malloc/androidtest/src/grapheneos/hmalloc/MemtagTest.java b/src/hardened_malloc/androidtest/src/grapheneos/hmalloc/MemtagTest.java new file mode 100644 index 0000000..8cb7a45 --- /dev/null +++ b/src/hardened_malloc/androidtest/src/grapheneos/hmalloc/MemtagTest.java @@ -0,0 +1,95 @@ +package grapheneos.hmalloc; + +import com.android.tradefed.device.DeviceNotAvailableException; +import com.android.tradefed.testtype.DeviceJUnit4ClassRunner; +import com.android.tradefed.testtype.junit4.BaseHostJUnit4Test; + +import org.junit.Test; +import org.junit.runner.RunWith; + +import java.io.IOException; +import java.util.ArrayList; + +import static 
org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +@RunWith(DeviceJUnit4ClassRunner.class) +public class MemtagTest extends BaseHostJUnit4Test { + + private static final String TEST_BINARY = "/data/local/tmp/memtag_test"; + + enum Result { + SUCCESS(0, ""), + // it's expected that the device is configured to use asymm MTE tag checking mode + ASYNC_MTE_ERROR(139, "SEGV_CODE 8"), + SYNC_MTE_ERROR(139, "SEGV_CODE 9"), + ; + + public final int exitCode; + public final String stderr; + + Result(int exitCode, String stderr) { + this.exitCode = exitCode; + this.stderr = stderr; + } + } + + private static final int SEGV_EXIT_CODE = 139; + + private void runTest(String name, Result expectedResult) throws DeviceNotAvailableException { + var args = new ArrayList(); + args.add(TEST_BINARY); + args.add(name); + String cmdLine = String.join(" ", args); + + var result = getDevice().executeShellV2Command(cmdLine); + + assertEquals("process exit code", expectedResult.exitCode, result.getExitCode().intValue()); + assertEquals("stderr", expectedResult.stderr, result.getStderr()); + } + + @Test + public void tag_distinctness() throws DeviceNotAvailableException { + runTest("tag_distinctness", Result.SUCCESS); + } + + @Test + public void read_after_free() throws DeviceNotAvailableException { + runTest("read_after_free", Result.SYNC_MTE_ERROR); + } + + @Test + public void write_after_free() throws DeviceNotAvailableException { + runTest("write_after_free", Result.ASYNC_MTE_ERROR); + } + + @Test + public void underflow_read() throws DeviceNotAvailableException { + runTest("underflow_read", Result.SYNC_MTE_ERROR); + } + + @Test + public void underflow_write() throws DeviceNotAvailableException { + runTest("underflow_write", Result.ASYNC_MTE_ERROR); + } + + @Test + public void overflow_read() throws DeviceNotAvailableException { + runTest("overflow_read", Result.SYNC_MTE_ERROR); + } + + @Test + public void overflow_write() throws DeviceNotAvailableException { + runTest("overflow_write", Result.ASYNC_MTE_ERROR); + } + + @Test + public void untagged_read() throws DeviceNotAvailableException { + runTest("untagged_read", Result.SYNC_MTE_ERROR); + } + + @Test + public void untagged_write() throws DeviceNotAvailableException { + runTest("untagged_write", Result.ASYNC_MTE_ERROR); + } +} diff --git a/src/hardened_malloc/arm_mte.h b/src/hardened_malloc/arm_mte.h new file mode 100644 index 0000000..ea3445e --- /dev/null +++ b/src/hardened_malloc/arm_mte.h @@ -0,0 +1,91 @@ +#ifndef ARM_MTE_H +#define ARM_MTE_H + +#include +#include + +// Returns a tagged pointer. +// See https://developer.arm.com/documentation/ddi0602/2023-09/Base-Instructions/IRG--Insert-Random-Tag- +static inline void *arm_mte_create_random_tag(void *p, u64 exclusion_mask) { + return __arm_mte_create_random_tag(p, exclusion_mask); +} + +// Tag the memory region with the tag specified in tag bits of tagged_ptr. Memory region itself is +// zeroed. +// tagged_ptr has to be aligned by 16, and len has to be a multiple of 16 (tag granule size). +// +// Arm's software optimization guide says: +// "it is recommended to use STZGM (or DCZGVA) to set tag if data is not a concern." (STZGM and +// DCGZVA are zeroing variants of tagging instructions). 
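For context on the exclusion_mask parameter of the IRG wrapper above: bit i of the mask prevents tag i from being chosen. Below is a hedged sketch of a caller that excludes the two reserved tags (0 and 15) mentioned in the test code earlier; it assumes an AArch64 toolchain with MTE support (e.g. -march=armv8.5-a+memtag), and the helper name is hypothetical, not from this tree:

    /* Hypothetical helper, for illustration only. */
    #include <arm_acle.h>
    #include <stdint.h>

    static void *tag_excluding_reserved(void *p) {
        /* bit i set => tag i is excluded from IRG's random choice */
        uint64_t exclusion_mask = (1u << 0) | (1u << 15);
        return __arm_mte_create_random_tag(p, exclusion_mask);
    }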
+// +// Contents of this function were copied from scudo: +// https://android.googlesource.com/platform/external/scudo/+/refs/tags/android-14.0.0_r1/standalone/memtag.h#167 +// +// scudo is licensed under the Apache License v2.0 with LLVM Exceptions, which is compatible with +// the hardened_malloc's MIT license +static inline void arm_mte_tag_and_clear_mem(void *tagged_ptr, size_t len) { + uintptr_t Begin = (uintptr_t) tagged_ptr; + uintptr_t End = Begin + len; + uintptr_t LineSize, Next, Tmp; + __asm__ __volatile__( + ".arch_extension memtag \n\t" + + // Compute the cache line size in bytes (DCZID_EL0 stores it as the log2 + // of the number of 4-byte words) and bail out to the slow path if DCZID_EL0 + // indicates that the DC instructions are unavailable. + "DCZID .req %[Tmp] \n\t" + "mrs DCZID, dczid_el0 \n\t" + "tbnz DCZID, #4, 3f \n\t" + "and DCZID, DCZID, #15 \n\t" + "mov %[LineSize], #4 \n\t" + "lsl %[LineSize], %[LineSize], DCZID \n\t" + ".unreq DCZID \n\t" + + // Our main loop doesn't handle the case where we don't need to perform any + // DC GZVA operations. If the size of our tagged region is less than + // twice the cache line size, bail out to the slow path since it's not + // guaranteed that we'll be able to do a DC GZVA. + "Size .req %[Tmp] \n\t" + "sub Size, %[End], %[Cur] \n\t" + "cmp Size, %[LineSize], lsl #1 \n\t" + "b.lt 3f \n\t" + ".unreq Size \n\t" + + "LineMask .req %[Tmp] \n\t" + "sub LineMask, %[LineSize], #1 \n\t" + + // STZG until the start of the next cache line. + "orr %[Next], %[Cur], LineMask \n\t" + + "1:\n\t" + "stzg %[Cur], [%[Cur]], #16 \n\t" + "cmp %[Cur], %[Next] \n\t" + "b.lt 1b \n\t" + + // DC GZVA cache lines until we have no more full cache lines. + "bic %[Next], %[End], LineMask \n\t" + ".unreq LineMask \n\t" + + "2: \n\t" + "dc gzva, %[Cur] \n\t" + "add %[Cur], %[Cur], %[LineSize] \n\t" + "cmp %[Cur], %[Next] \n\t" + "b.lt 2b \n\t" + + // STZG until the end of the tagged region. This loop is also used to handle + // slow path cases. 
+ + "3: \n\t" + "cmp %[Cur], %[End] \n\t" + "b.ge 4f \n\t" + "stzg %[Cur], [%[Cur]], #16 \n\t" + "b 3b \n\t" + + "4: \n\t" + + : [Cur] "+&r"(Begin), [LineSize] "=&r"(LineSize), [Next] "=&r"(Next), [Tmp] "=&r"(Tmp) + : [End] "r"(End) + : "memory" + ); +} +#endif diff --git a/src/hardened_malloc/calculate_waste.py b/src/hardened_malloc/calculate_waste.py new file mode 100755 index 0000000..ca26d9a --- /dev/null +++ b/src/hardened_malloc/calculate_waste.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 + +from sys import argv + +size_classes = [ + 16, 32, 48, 64, 80, 96, 112, 128, + 160, 192, 224, 256, + 320, 384, 448, 512, + 640, 768, 896, 1024, + 1280, 1536, 1792, 2048, + 2560, 3072, 3584, 4096, + 5120, 6144, 7168, 8192, + 10240, 12288, 14336, 16384, + 20480, 24576, 28672, 32768, + 40960, 49152, 57344, 65536, + 81920, 98304, 114688, 131072, +] + +size_class_slots = [ + 256, 128, 85, 64, 51, 42, 36, 64, + 51, 64, 54, 64, + 64, 64, 64, 64, + 64, 64, 64, 64, + 16, 16, 16, 16, + 8, 8, 8, 8, + 8, 8, 8, 8, + 6, 5, 4, 4, + 2, 2, 2, 2, + 1, 1, 1, 1, + 1, 1, 1, 1, +] + +fragmentation = [100 - 1 / 16 * 100] + +for i in range(len(size_classes) - 1): + size_class = size_classes[i + 1] + worst_case = size_classes[i] + 1 + used = worst_case / size_class + fragmentation.append(100 - used * 100); + +def page_align(size): + return (size + 4095) & ~4095 + +print("| ", end="") +print("size class", "worst case internal fragmentation", "slab slots", "slab size", "internal fragmentation for slabs", sep=" | ", end=" |\n") +print("| ", end='') +print("-", "-", "-", "-", "-", sep=" | ", end=" |\n") +for size, slots, fragmentation in zip(size_classes, size_class_slots, fragmentation): + used = size * slots + real = page_align(used) + print("| ", end='') + print(size, f"{fragmentation:.4}%", slots, real, str(100 - used / real * 100) + "%", sep=" | ", end=" |\n") + +if len(argv) < 2: + exit() + +max_bits = 256 +max_page_span = 16 + +print() + +print("maximum bitmap size is {}-bit".format(max_bits)) +print("maximum page span size is {} ({})".format(max_page_span, max_page_span * 4096)) + +for size_class in size_classes: + choices = [] + for bits in range(1, max_bits + 1): + used = size_class * bits + real = page_align(used) + if real > 65536: + continue + pages = real / 4096 + efficiency = used / real * 100 + choices.append((bits, used, real, pages, efficiency)) + + choices.sort(key=lambda x: x[4], reverse=True) + + print() + print("size_class:", size_class) + for choice in choices[:10]: + print(choice) diff --git a/src/hardened_malloc/chacha.c b/src/hardened_malloc/chacha.c new file mode 100644 index 0000000..541a7ac --- /dev/null +++ b/src/hardened_malloc/chacha.c @@ -0,0 +1,177 @@ +// Based on chacha-merged.c version 20080118 +// D. J. Bernstein +// Public domain. 
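Referring back to calculate_waste.py above: the worst-case internal fragmentation figure for a size class is driven by the smallest request that no longer fits the previous class, i.e. previous_class + 1 bytes rounded up to the current class. A standalone rendition of that arithmetic for a single class (editor's sketch; the full table is produced by the Python script):

    #include <stdio.h>

    int main(void) {
        /* example: the 192-byte class serves requests of 161..192 bytes */
        double size_class = 192.0;
        double worst_case_request = 160.0 + 1.0;   /* previous class is 160 */
        double used_fraction = worst_case_request / size_class;
        /* 100 - (161 / 192) * 100 = ~16.15%, matching the script's formula */
        printf("worst-case internal fragmentation: %.2f%%\n",
               100.0 - used_fraction * 100.0);
        return 0;
    }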
+ +#include "chacha.h" + +// ChaCha8 +static const unsigned rounds = 8; + +#define U8C(v) (v##U) +#define U32C(v) (v##U) + +#define U8V(v) ((u8)(v) & U8C(0xFF)) +#define U32V(v) ((u32)(v) & U32C(0xFFFFFFFF)) + +#define ROTL32(v, n) \ + (U32V((v) << (n)) | ((v) >> (32 - (n)))) + +#define U8TO32_LITTLE(p) \ + (((u32)((p)[0])) | \ + ((u32)((p)[1]) << 8) | \ + ((u32)((p)[2]) << 16) | \ + ((u32)((p)[3]) << 24)) + +#define U32TO8_LITTLE(p, v) \ + do { \ + (p)[0] = U8V((v)); \ + (p)[1] = U8V((v) >> 8); \ + (p)[2] = U8V((v) >> 16); \ + (p)[3] = U8V((v) >> 24); \ + } while (0) + +#define ROTATE(v, c) (ROTL32(v, c)) +#define XOR(v, w) ((v) ^ (w)) +#define PLUS(v, w) (U32V((v) + (w))) +#define PLUSONE(v) (PLUS((v), 1)) + +#define QUARTERROUND(a, b, c, d) \ + a = PLUS(a, b); d = ROTATE(XOR(d, a), 16); \ + c = PLUS(c, d); b = ROTATE(XOR(b, c), 12); \ + a = PLUS(a, b); d = ROTATE(XOR(d, a), 8); \ + c = PLUS(c, d); b = ROTATE(XOR(b, c), 7); + +static const char sigma[16] = "expand 32-byte k"; + +void chacha_keysetup(chacha_ctx *x, const u8 *k) { + x->input[0] = U8TO32_LITTLE(sigma + 0); + x->input[1] = U8TO32_LITTLE(sigma + 4); + x->input[2] = U8TO32_LITTLE(sigma + 8); + x->input[3] = U8TO32_LITTLE(sigma + 12); + x->input[4] = U8TO32_LITTLE(k + 0); + x->input[5] = U8TO32_LITTLE(k + 4); + x->input[6] = U8TO32_LITTLE(k + 8); + x->input[7] = U8TO32_LITTLE(k + 12); + x->input[8] = U8TO32_LITTLE(k + 16); + x->input[9] = U8TO32_LITTLE(k + 20); + x->input[10] = U8TO32_LITTLE(k + 24); + x->input[11] = U8TO32_LITTLE(k + 28); +} + +void chacha_ivsetup(chacha_ctx *x, const u8 *iv) { + x->input[12] = 0; + x->input[13] = 0; + x->input[14] = U8TO32_LITTLE(iv + 0); + x->input[15] = U8TO32_LITTLE(iv + 4); +} + +void chacha_keystream_bytes(chacha_ctx *x, u8 *c, u32 bytes) { + if (!bytes) { + return; + } + + u8 *ctarget; + u8 tmp[64]; + + u32 j0 = x->input[0]; + u32 j1 = x->input[1]; + u32 j2 = x->input[2]; + u32 j3 = x->input[3]; + u32 j4 = x->input[4]; + u32 j5 = x->input[5]; + u32 j6 = x->input[6]; + u32 j7 = x->input[7]; + u32 j8 = x->input[8]; + u32 j9 = x->input[9]; + u32 j10 = x->input[10]; + u32 j11 = x->input[11]; + u32 j12 = x->input[12]; + u32 j13 = x->input[13]; + u32 j14 = x->input[14]; + u32 j15 = x->input[15]; + + for (;;) { + if (bytes < 64) { + ctarget = c; + c = tmp; + } + u32 x0 = j0; + u32 x1 = j1; + u32 x2 = j2; + u32 x3 = j3; + u32 x4 = j4; + u32 x5 = j5; + u32 x6 = j6; + u32 x7 = j7; + u32 x8 = j8; + u32 x9 = j9; + u32 x10 = j10; + u32 x11 = j11; + u32 x12 = j12; + u32 x13 = j13; + u32 x14 = j14; + u32 x15 = j15; + for (unsigned i = rounds; i > 0; i -= 2) { + QUARTERROUND(x0, x4, x8, x12) + QUARTERROUND(x1, x5, x9, x13) + QUARTERROUND(x2, x6, x10, x14) + QUARTERROUND(x3, x7, x11, x15) + QUARTERROUND(x0, x5, x10, x15) + QUARTERROUND(x1, x6, x11, x12) + QUARTERROUND(x2, x7, x8, x13) + QUARTERROUND(x3, x4, x9, x14) + } + x0 = PLUS(x0, j0); + x1 = PLUS(x1, j1); + x2 = PLUS(x2, j2); + x3 = PLUS(x3, j3); + x4 = PLUS(x4, j4); + x5 = PLUS(x5, j5); + x6 = PLUS(x6, j6); + x7 = PLUS(x7, j7); + x8 = PLUS(x8, j8); + x9 = PLUS(x9, j9); + x10 = PLUS(x10, j10); + x11 = PLUS(x11, j11); + x12 = PLUS(x12, j12); + x13 = PLUS(x13, j13); + x14 = PLUS(x14, j14); + x15 = PLUS(x15, j15); + + j12 = PLUSONE(j12); + if (!j12) { + j13 = PLUSONE(j13); + // stopping at 2^70 bytes per nonce is user's responsibility + } + + U32TO8_LITTLE(c + 0, x0); + U32TO8_LITTLE(c + 4, x1); + U32TO8_LITTLE(c + 8, x2); + U32TO8_LITTLE(c + 12, x3); + U32TO8_LITTLE(c + 16, x4); + U32TO8_LITTLE(c + 20, x5); + U32TO8_LITTLE(c + 24, x6); + 
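Taken together, keysetup, ivsetup and keystream_bytes form a small ChaCha8 keystream API (declared in chacha.h further below). A hedged usage sketch follows; it assumes the snippet is compiled and linked alongside chacha.c and this tree's util.h typedefs, and the all-zero key/IV are purely illustrative:

    /* Editor's usage sketch, not code from this tree. */
    #include <stdio.h>
    #include "chacha.h"

    int main(void) {
        static const u8 key[CHACHA_KEY_SIZE] = {0};   /* fixed key only for illustration */
        static const u8 iv[CHACHA_IV_SIZE] = {0};
        u8 keystream[64];

        chacha_ctx ctx;
        chacha_keysetup(&ctx, key);
        chacha_ivsetup(&ctx, iv);
        chacha_keystream_bytes(&ctx, keystream, sizeof(keystream));

        printf("first keystream byte: 0x%02x\n", keystream[0]);
        return 0;
    }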
U32TO8_LITTLE(c + 28, x7); + U32TO8_LITTLE(c + 32, x8); + U32TO8_LITTLE(c + 36, x9); + U32TO8_LITTLE(c + 40, x10); + U32TO8_LITTLE(c + 44, x11); + U32TO8_LITTLE(c + 48, x12); + U32TO8_LITTLE(c + 52, x13); + U32TO8_LITTLE(c + 56, x14); + U32TO8_LITTLE(c + 60, x15); + + if (bytes <= 64) { + if (bytes < 64) { + for (unsigned i = 0; i < bytes; ++i) { + ctarget[i] = c[i]; + } + } + x->input[12] = j12; + x->input[13] = j13; + return; + } + bytes -= 64; + c += 64; + } +} diff --git a/src/hardened_malloc/chacha.h b/src/hardened_malloc/chacha.h new file mode 100644 index 0000000..81d070f --- /dev/null +++ b/src/hardened_malloc/chacha.h @@ -0,0 +1,17 @@ +#ifndef CHACHA_H +#define CHACHA_H + +#include "util.h" + +#define CHACHA_KEY_SIZE 32 +#define CHACHA_IV_SIZE 8 + +typedef struct { + u32 input[16]; +} chacha_ctx; + +void chacha_keysetup(chacha_ctx *x, const u8 *k); +void chacha_ivsetup(chacha_ctx *x, const u8 *iv); +void chacha_keystream_bytes(chacha_ctx *x, u8 *c, u32 bytes); + +#endif diff --git a/src/hardened_malloc/config/default.mk b/src/hardened_malloc/config/default.mk new file mode 100644 index 0000000..71b1cc4 --- /dev/null +++ b/src/hardened_malloc/config/default.mk @@ -0,0 +1,23 @@ +CONFIG_WERROR := true +CONFIG_NATIVE := true +CONFIG_CXX_ALLOCATOR := true +CONFIG_UBSAN := false +CONFIG_SEAL_METADATA := false +CONFIG_ZERO_ON_FREE := true +CONFIG_WRITE_AFTER_FREE_CHECK := true +CONFIG_SLOT_RANDOMIZE := true +CONFIG_SLAB_CANARY := true +CONFIG_SLAB_QUARANTINE_RANDOM_LENGTH := 1 +CONFIG_SLAB_QUARANTINE_QUEUE_LENGTH := 1 +CONFIG_EXTENDED_SIZE_CLASSES := true +CONFIG_LARGE_SIZE_CLASSES := true +CONFIG_GUARD_SLABS_INTERVAL := 1 +CONFIG_GUARD_SIZE_DIVISOR := 2 +CONFIG_REGION_QUARANTINE_RANDOM_LENGTH := 256 +CONFIG_REGION_QUARANTINE_QUEUE_LENGTH := 1024 +CONFIG_REGION_QUARANTINE_SKIP_THRESHOLD := 33554432 # 32MiB +CONFIG_FREE_SLABS_QUARANTINE_RANDOM_LENGTH := 32 +CONFIG_CLASS_REGION_SIZE := 34359738368 # 32GiB +CONFIG_N_ARENA := 4 +CONFIG_STATS := false +CONFIG_SELF_INIT := true diff --git a/src/hardened_malloc/config/light.mk b/src/hardened_malloc/config/light.mk new file mode 100644 index 0000000..88a0e1f --- /dev/null +++ b/src/hardened_malloc/config/light.mk @@ -0,0 +1,23 @@ +CONFIG_WERROR := true +CONFIG_NATIVE := true +CONFIG_CXX_ALLOCATOR := true +CONFIG_UBSAN := false +CONFIG_SEAL_METADATA := false +CONFIG_ZERO_ON_FREE := true +CONFIG_WRITE_AFTER_FREE_CHECK := false +CONFIG_SLOT_RANDOMIZE := false +CONFIG_SLAB_CANARY := true +CONFIG_SLAB_QUARANTINE_RANDOM_LENGTH := 0 +CONFIG_SLAB_QUARANTINE_QUEUE_LENGTH := 0 +CONFIG_EXTENDED_SIZE_CLASSES := true +CONFIG_LARGE_SIZE_CLASSES := true +CONFIG_GUARD_SLABS_INTERVAL := 8 +CONFIG_GUARD_SIZE_DIVISOR := 2 +CONFIG_REGION_QUARANTINE_RANDOM_LENGTH := 256 +CONFIG_REGION_QUARANTINE_QUEUE_LENGTH := 1024 +CONFIG_REGION_QUARANTINE_SKIP_THRESHOLD := 33554432 # 32MiB +CONFIG_FREE_SLABS_QUARANTINE_RANDOM_LENGTH := 32 +CONFIG_CLASS_REGION_SIZE := 34359738368 # 32GiB +CONFIG_N_ARENA := 4 +CONFIG_STATS := false +CONFIG_SELF_INIT := true diff --git a/src/hardened_malloc/h_malloc.c b/src/hardened_malloc/h_malloc.c new file mode 100644 index 0000000..ffcf0e4 --- /dev/null +++ b/src/hardened_malloc/h_malloc.c @@ -0,0 +1,2190 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "third_party/libdivide.h" + +#include "h_malloc.h" +#include "memory.h" +#include "memtag.h" +#include "mutex.h" +#include "pages.h" +#include "random.h" +#include "util.h" + +#ifdef USE_PKEY +#include +#endif + +#define 
SLAB_QUARANTINE (SLAB_QUARANTINE_RANDOM_LENGTH > 0 || SLAB_QUARANTINE_QUEUE_LENGTH > 0) +#define REGION_QUARANTINE (REGION_QUARANTINE_RANDOM_LENGTH > 0 || REGION_QUARANTINE_QUEUE_LENGTH > 0) +#define MREMAP_MOVE_THRESHOLD ((size_t)32 * 1024 * 1024) + +static_assert(sizeof(void *) == 8, "64-bit only"); + +static_assert(!WRITE_AFTER_FREE_CHECK || ZERO_ON_FREE, "WRITE_AFTER_FREE_CHECK depends on ZERO_ON_FREE"); + +static_assert(SLAB_QUARANTINE_RANDOM_LENGTH >= 0 && SLAB_QUARANTINE_RANDOM_LENGTH <= 65536, + "invalid slab quarantine random length"); +static_assert(SLAB_QUARANTINE_QUEUE_LENGTH >= 0 && SLAB_QUARANTINE_QUEUE_LENGTH <= 65536, + "invalid slab quarantine queue length"); +static_assert(REGION_QUARANTINE_RANDOM_LENGTH >= 0 && REGION_QUARANTINE_RANDOM_LENGTH <= 65536, + "invalid region quarantine random length"); +static_assert(REGION_QUARANTINE_QUEUE_LENGTH >= 0 && REGION_QUARANTINE_QUEUE_LENGTH <= 65536, + "invalid region quarantine queue length"); +static_assert(FREE_SLABS_QUARANTINE_RANDOM_LENGTH >= 0 && FREE_SLABS_QUARANTINE_RANDOM_LENGTH <= 65536, + "invalid free slabs quarantine random length"); + +static_assert(GUARD_SLABS_INTERVAL >= 1, "invalid guard slabs interval (minimum 1)"); +static_assert(GUARD_SIZE_DIVISOR >= 1, "invalid guard size divisor (minimum 1)"); +static_assert(CONFIG_CLASS_REGION_SIZE >= 1048576, "invalid class region size (minimum 1048576)"); +static_assert(CONFIG_CLASS_REGION_SIZE <= 1099511627776, "invalid class region size (maximum 1099511627776)"); +static_assert(REGION_QUARANTINE_SKIP_THRESHOLD >= 0, + "invalid region quarantine skip threshold (minimum 0)"); +static_assert(MREMAP_MOVE_THRESHOLD >= REGION_QUARANTINE_SKIP_THRESHOLD, + "mremap move threshold must be above region quarantine limit"); + +// either sizeof(u64) or 0 +static const size_t canary_size = SLAB_CANARY ? 
sizeof(u64) : 0; + +static_assert(N_ARENA >= 1, "must have at least 1 arena"); +static_assert(N_ARENA <= 256, "maximum number of arenas is currently 256"); +#define CACHELINE_SIZE 64 + +#if N_ARENA > 1 +__attribute__((tls_model("initial-exec"))) +static _Thread_local unsigned thread_arena = N_ARENA; +static atomic_uint thread_arena_counter = 0; +#else +static const unsigned thread_arena = 0; +#endif + +static union { + struct { + void *slab_region_start; + void *_Atomic slab_region_end; + struct size_class *size_class_metadata[N_ARENA]; + struct region_allocator *region_allocator; + struct region_metadata *regions[2]; +#ifdef USE_PKEY + int metadata_pkey; +#endif +#ifdef MEMTAG + bool is_memtag_disabled; +#endif + }; + char padding[PAGE_SIZE]; +} ro __attribute__((aligned(PAGE_SIZE))); + +static inline void *get_slab_region_end(void) { + return atomic_load_explicit(&ro.slab_region_end, memory_order_acquire); +} + +#ifdef MEMTAG +static inline bool is_memtag_enabled(void) { + return !ro.is_memtag_disabled; +} +#endif + +#define SLAB_METADATA_COUNT + +struct slab_metadata { + u64 bitmap[4]; + struct slab_metadata *next; + struct slab_metadata *prev; +#if SLAB_CANARY + u64 canary_value; +#endif +#ifdef SLAB_METADATA_COUNT + u16 count; +#endif +#if SLAB_QUARANTINE + u64 quarantine_bitmap[4]; +#endif +#ifdef HAS_ARM_MTE + // arm_mte_tags is used as a u4 array (MTE tags are 4-bit wide) + // + // Its size is calculated by the following formula: + // (MAX_SLAB_SLOT_COUNT + 2) / 2 + // MAX_SLAB_SLOT_COUNT is currently 256, 2 extra slots are needed for branchless handling of + // edge slots in tag_and_clear_slab_slot() + // + // It's intentionally placed at the end of struct to improve locality: for most size classes, + // slot count is far lower than MAX_SLAB_SLOT_COUNT. 
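On the arm_mte_tags sizing described above: two 4-bit tags pack into each byte, and the two extra sentinel entries give (256 + 2) / 2 = 129 bytes. A standalone sketch of nibble-array accessors follows (editor's sketch with hypothetical u4_get/u4_set names; the real u4_arr_get/u4_arr_set helpers live in util.h and may order the nibbles differently):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    /* hypothetical nibble accessors, for illustration only */
    static uint8_t u4_get(const uint8_t *arr, size_t i) {
        uint8_t b = arr[i / 2];
        return (i & 1) ? (uint8_t)(b >> 4) : (uint8_t)(b & 0xf);
    }

    static void u4_set(uint8_t *arr, size_t i, uint8_t v) {
        uint8_t b = arr[i / 2];
        arr[i / 2] = (i & 1) ? (uint8_t)((b & 0x0f) | (uint8_t)(v << 4))
                             : (uint8_t)((b & 0xf0) | (v & 0xf));
    }

    int main(void) {
        /* 256 slots plus 2 sentinel entries -> (256 + 2) / 2 = 129 bytes */
        uint8_t tags[(256 + 2) / 2] = {0};
        u4_set(tags, 1, 0xA);
        assert(u4_get(tags, 1) == 0xA);
        assert(u4_get(tags, 0) == 0 && u4_get(tags, 2) == 0);
        return 0;
    }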
+ u8 arm_mte_tags[129]; +#endif +}; + +static const size_t min_align = 16; +#define MIN_SLAB_SIZE_CLASS_SHIFT 4 + +#if !CONFIG_EXTENDED_SIZE_CLASSES +static const size_t max_slab_size_class = 16384; +#define MAX_SLAB_SIZE_CLASS_SHIFT 14 +// limit on the number of cached empty slabs before attempting purging instead +static const size_t max_empty_slabs_total = max_slab_size_class * 4; +#else +static const size_t max_slab_size_class = 131072; +#define MAX_SLAB_SIZE_CLASS_SHIFT 17 +// limit on the number of cached empty slabs before attempting purging instead +static const size_t max_empty_slabs_total = max_slab_size_class; +#endif + +#if SLAB_QUARANTINE && CONFIG_EXTENDED_SIZE_CLASSES +static const size_t min_extended_size_class = 20480; +#endif + +static const u32 size_classes[] = { + /* 0 */ 0, + /* 16 */ 16, 32, 48, 64, 80, 96, 112, 128, + /* 32 */ 160, 192, 224, 256, + /* 64 */ 320, 384, 448, 512, + /* 128 */ 640, 768, 896, 1024, + /* 256 */ 1280, 1536, 1792, 2048, + /* 512 */ 2560, 3072, 3584, 4096, + /* 1024 */ 5120, 6144, 7168, 8192, + /* 2048 */ 10240, 12288, 14336, 16384, +#if CONFIG_EXTENDED_SIZE_CLASSES + /* 4096 */ 20480, 24576, 28672, 32768, + /* 8192 */ 40960, 49152, 57344, 65536, + /* 16384 */ 81920, 98304, 114688, 131072, +#endif +}; + +static const u16 size_class_slots[] = { + /* 0 */ 256, + /* 16 */ 256, 128, 85, 64, 51, 42, 36, 64, + /* 32 */ 51, 64, 54, 64, + /* 64 */ 64, 64, 64, 64, + /* 128 */ 64, 64, 64, 64, + /* 256 */ 16, 16, 16, 16, + /* 512 */ 8, 8, 8, 8, + /* 1024 */ 8, 8, 8, 8, + /* 2048 */ 6, 5, 4, 4, +#if CONFIG_EXTENDED_SIZE_CLASSES + /* 4096 */ 1, 1, 1, 1, + /* 8192 */ 1, 1, 1, 1, + /* 16384 */ 1, 1, 1, 1, +#endif +}; + +static size_t get_slots(unsigned class) { + return size_class_slots[class]; +} + +static const char *const size_class_labels[] = { + /* 0 */ "malloc 0", + /* 16 */ "malloc 16", "malloc 32", "malloc 48", "malloc 64", + /* 16 */ "malloc 80", "malloc 96", "malloc 112", "malloc 128", + /* 32 */ "malloc 160", "malloc 192", "malloc 224", "malloc 256", + /* 64 */ "malloc 320", "malloc 384", "malloc 448", "malloc 512", + /* 128 */ "malloc 640", "malloc 768", "malloc 896", "malloc 1024", + /* 256 */ "malloc 1280", "malloc 1536", "malloc 1792", "malloc 2048", + /* 512 */ "malloc 2560", "malloc 3072", "malloc 3584", "malloc 4096", + /* 1024 */ "malloc 5120", "malloc 6144", "malloc 7168", "malloc 8192", + /* 2048 */ "malloc 10240", "malloc 12288", "malloc 14336", "malloc 16384", +#if CONFIG_EXTENDED_SIZE_CLASSES + /* 4096 */ "malloc 20480", "malloc 24576", "malloc 28672", "malloc 32768", + /* 8192 */ "malloc 40960", "malloc 49152", "malloc 57344", "malloc 65536", + /* 16384 */ "malloc 81920", "malloc 98304", "malloc 114688", "malloc 131072", +#endif +}; + +static void label_slab(void *slab, size_t slab_size, unsigned class) { + memory_set_name(slab, slab_size, size_class_labels[class]); +} + +#define N_SIZE_CLASSES (sizeof(size_classes) / sizeof(size_classes[0])) + +struct size_info { + size_t size; + size_t class; +}; + +static inline struct size_info get_size_info(size_t size) { + if (unlikely(size == 0)) { + return (struct size_info){0, 0}; + } + // size <= 64 is needed for correctness and raising it to size <= 128 is an optimization + if (size <= 128) { + return (struct size_info){align(size, 16), ((size - 1) >> 4) + 1}; + } + + static const size_t initial_spacing_multiplier = 5; + static const size_t special_small_sizes = 5; // 0, 16, 32, 48, 64 + + size_t spacing_class_shift = log2u64(size - 1) - 2; + size_t spacing_class = 1ULL << 
spacing_class_shift; + size_t real_size = align(size, spacing_class); + size_t spacing_class_index = (real_size >> spacing_class_shift) - initial_spacing_multiplier; + size_t index = (spacing_class_shift - 4) * 4 + special_small_sizes + spacing_class_index; + return (struct size_info){real_size, index}; +} + +// alignment must be a power of 2 <= PAGE_SIZE since slabs are only page aligned +static inline struct size_info get_size_info_align(size_t size, size_t alignment) { + for (unsigned class = 1; class < N_SIZE_CLASSES; class++) { + size_t real_size = size_classes[class]; + if (size <= real_size && !(real_size & (alignment - 1))) { + return (struct size_info){real_size, class}; + } + } + fatal_error("invalid size for slabs"); +} + +static size_t get_slab_size(size_t slots, size_t size) { + return page_align(slots * size); +} + +struct __attribute__((aligned(CACHELINE_SIZE))) size_class { + struct mutex lock; + + void *class_region_start; + struct slab_metadata *slab_info; + struct libdivide_u32_t size_divisor; + struct libdivide_u64_t slab_size_divisor; + +#if SLAB_QUARANTINE_RANDOM_LENGTH > 0 + void *quarantine_random[SLAB_QUARANTINE_RANDOM_LENGTH << (MAX_SLAB_SIZE_CLASS_SHIFT - MIN_SLAB_SIZE_CLASS_SHIFT)]; +#endif + +#if SLAB_QUARANTINE_QUEUE_LENGTH > 0 + void *quarantine_queue[SLAB_QUARANTINE_QUEUE_LENGTH << (MAX_SLAB_SIZE_CLASS_SHIFT - MIN_SLAB_SIZE_CLASS_SHIFT)]; + size_t quarantine_queue_index; +#endif + + // slabs with at least one allocated slot and at least one free slot + // + // LIFO doubly-linked list + struct slab_metadata *partial_slabs; + + // slabs without allocated slots that are cached for near-term usage + // + // LIFO singly-linked list + struct slab_metadata *empty_slabs; + size_t empty_slabs_total; // length * slab_size + + // slabs without allocated slots that are purged and memory protected + // + // FIFO singly-linked list + struct slab_metadata *free_slabs_head; + struct slab_metadata *free_slabs_tail; + struct slab_metadata *free_slabs_quarantine[FREE_SLABS_QUARANTINE_RANDOM_LENGTH]; + +#if CONFIG_STATS + u64 nmalloc; // may wrap (per jemalloc API) + u64 ndalloc; // may wrap (per jemalloc API) + size_t allocated; + size_t slab_allocated; +#endif + + struct random_state rng; + size_t metadata_allocated; + size_t metadata_count; + size_t metadata_count_unguarded; +}; + +#define CLASS_REGION_SIZE (size_t)CONFIG_CLASS_REGION_SIZE +#define REAL_CLASS_REGION_SIZE (CLASS_REGION_SIZE * 2) +#define ARENA_SIZE (REAL_CLASS_REGION_SIZE * N_SIZE_CLASSES) +static const size_t slab_region_size = ARENA_SIZE * N_ARENA; +static_assert(PAGE_SIZE == 4096, "bitmap handling will need adjustment for other page sizes"); + +static void *get_slab(const struct size_class *c, size_t slab_size, const struct slab_metadata *metadata) { + size_t index = metadata - c->slab_info; + return (char *)c->class_region_start + (index * slab_size); +} + +#define MAX_METADATA_MAX (CLASS_REGION_SIZE / PAGE_SIZE) + +static size_t get_metadata_max(size_t slab_size) { + return CLASS_REGION_SIZE / slab_size; +} + +static struct slab_metadata *alloc_metadata(struct size_class *c, size_t slab_size, bool non_zero_size) { + if (unlikely(c->metadata_count >= c->metadata_allocated)) { + size_t metadata_max = get_metadata_max(slab_size); + if (unlikely(c->metadata_count >= metadata_max)) { + errno = ENOMEM; + return NULL; + } + size_t allocate = max(c->metadata_allocated * 2, PAGE_SIZE / sizeof(struct slab_metadata)); + if (allocate > metadata_max) { + allocate = metadata_max; + } + if 
(unlikely(memory_protect_rw_metadata(c->slab_info, allocate * sizeof(struct slab_metadata)))) { + return NULL; + } + c->metadata_allocated = allocate; + } + + struct slab_metadata *metadata = c->slab_info + c->metadata_count; + void *slab = get_slab(c, slab_size, metadata); + if (non_zero_size && memory_protect_rw(slab, slab_size)) { + return NULL; + } + c->metadata_count++; + c->metadata_count_unguarded++; + if (c->metadata_count_unguarded >= GUARD_SLABS_INTERVAL) { + c->metadata_count++; + c->metadata_count_unguarded = 0; + } + return metadata; +} + +static void set_used_slot(struct slab_metadata *metadata, size_t index) { + size_t bucket = index / U64_WIDTH; + metadata->bitmap[bucket] |= 1UL << (index - bucket * U64_WIDTH); +#ifdef SLAB_METADATA_COUNT + metadata->count++; +#endif +} + +static void clear_used_slot(struct slab_metadata *metadata, size_t index) { + size_t bucket = index / U64_WIDTH; + metadata->bitmap[bucket] &= ~(1UL << (index - bucket * U64_WIDTH)); +#ifdef SLAB_METADATA_COUNT + metadata->count--; +#endif +} + +static bool is_used_slot(const struct slab_metadata *metadata, size_t index) { + size_t bucket = index / U64_WIDTH; + return (metadata->bitmap[bucket] >> (index - bucket * U64_WIDTH)) & 1UL; +} + +#if SLAB_QUARANTINE +static void set_quarantine_slot(struct slab_metadata *metadata, size_t index) { + size_t bucket = index / U64_WIDTH; + metadata->quarantine_bitmap[bucket] |= 1UL << (index - bucket * U64_WIDTH); +} + +static void clear_quarantine_slot(struct slab_metadata *metadata, size_t index) { + size_t bucket = index / U64_WIDTH; + metadata->quarantine_bitmap[bucket] &= ~(1UL << (index - bucket * U64_WIDTH)); +} + +static bool is_quarantine_slot(const struct slab_metadata *metadata, size_t index) { + size_t bucket = index / U64_WIDTH; + return (metadata->quarantine_bitmap[bucket] >> (index - bucket * U64_WIDTH)) & 1UL; +} +#endif + +static u64 get_mask(size_t slots) { + return slots < U64_WIDTH ? ~0UL << slots : 0; +} + +static size_t get_free_slot(struct random_state *rng, size_t slots, const struct slab_metadata *metadata) { + if (SLOT_RANDOMIZE) { + // randomize start location for linear search (uniform random choice is too slow) + size_t random_index = get_random_u16_uniform(rng, slots); + size_t first_bitmap = random_index / U64_WIDTH; + u64 random_split = ~(~0UL << (random_index - first_bitmap * U64_WIDTH)); + + size_t i = first_bitmap; + u64 masked = metadata->bitmap[i]; + masked |= random_split; + for (;;) { + if (i == slots / U64_WIDTH) { + masked |= get_mask(slots - i * U64_WIDTH); + } + + if (masked != ~0UL) { + return ffz64(masked) - 1 + i * U64_WIDTH; + } + + i = i == (slots - 1) / U64_WIDTH ? 
0 : i + 1; + masked = metadata->bitmap[i]; + } + } else { + for (size_t i = 0; i <= (slots - 1) / U64_WIDTH; i++) { + u64 masked = metadata->bitmap[i]; + if (i == (slots - 1) / U64_WIDTH) { + masked |= get_mask(slots - i * U64_WIDTH); + } + + if (masked != ~0UL) { + return ffz64(masked) - 1 + i * U64_WIDTH; + } + } + } + + fatal_error("no zero bits"); +} + +static bool has_free_slots(size_t slots, const struct slab_metadata *metadata) { +#ifdef SLAB_METADATA_COUNT + return metadata->count < slots; +#else + if (slots <= U64_WIDTH) { + u64 masked = metadata->bitmap[0] | get_mask(slots); + return masked != ~0UL; + } + if (slots <= U64_WIDTH * 2) { + u64 masked = metadata->bitmap[1] | get_mask(slots - U64_WIDTH); + return metadata->bitmap[0] != ~0UL || masked != ~0UL; + } + if (slots <= U64_WIDTH * 3) { + u64 masked = metadata->bitmap[2] | get_mask(slots - U64_WIDTH * 2); + return metadata->bitmap[0] != ~0UL || metadata->bitmap[1] != ~0UL || masked != ~0UL; + } + u64 masked = metadata->bitmap[3] | get_mask(slots - U64_WIDTH * 3); + return metadata->bitmap[0] != ~0UL || metadata->bitmap[1] != ~0UL || metadata->bitmap[2] != ~0UL || masked != ~0UL; +#endif +} + +static bool is_free_slab(const struct slab_metadata *metadata) { +#ifdef SLAB_METADATA_COUNT + return !metadata->count; +#else + return !metadata->bitmap[0] && !metadata->bitmap[1] && !metadata->bitmap[2] && + !metadata->bitmap[3]; +#endif +} + +static struct slab_metadata *get_metadata(const struct size_class *c, const void *p) { + size_t offset = (const char *)p - (const char *)c->class_region_start; + size_t index = libdivide_u64_do(offset, &c->slab_size_divisor); + // still caught without this check either as a read access violation or "double free" + if (unlikely(index >= c->metadata_allocated)) { + fatal_error("invalid free within a slab yet to be used"); + } + return c->slab_info + index; +} + +static void *slot_pointer(size_t size, void *slab, size_t slot) { + return (char *)slab + slot * size; +} + +static void write_after_free_check(const char *p, size_t size) { + if (!WRITE_AFTER_FREE_CHECK) { + return; + } + +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + return; + } +#endif + + for (size_t i = 0; i < size; i += sizeof(u64)) { + if (unlikely(*(const u64 *)(const void *)(p + i))) { + fatal_error("detected write after free"); + } + } +} + +static void set_slab_canary_value(UNUSED struct slab_metadata *metadata, UNUSED struct random_state *rng) { +#if SLAB_CANARY + static const u64 canary_mask = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ? + 0xffffffffffffff00UL : + 0x00ffffffffffffffUL; + + metadata->canary_value = get_random_u64(rng) & canary_mask; +#ifdef HAS_ARM_MTE + if (unlikely(metadata->canary_value == 0)) { + // 0 is reserved to support disabling MTE at runtime (this is required on Android). + // When MTE is enabled, writing and reading of canaries is disabled, i.e. canary remains zeroed. + // After MTE is disabled, canaries that are set to 0 are ignored, since they wouldn't match + // slab's metadata->canary_value. 
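On the canary_mask used above: it zeroes whichever canary byte ends up at the lowest address (the low byte on little-endian, the high byte on big-endian), so an out-of-bounds C string operation overflowing into the canary runs into a zero byte immediately. A standalone illustration (editor's sketch):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
        /* same little-endian mask as above, applied to an arbitrary value */
        uint64_t canary = 0x1122334455667788ULL & 0xffffffffffffff00ULL;
        unsigned char bytes[sizeof(canary)];
        memcpy(bytes, &canary, sizeof(bytes));
        /* on a little-endian target the byte at the lowest address is the masked-off low byte */
        printf("first canary byte in memory: 0x%02x\n", bytes[0]);   /* prints 0x00 */
        return 0;
    }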
+ metadata->canary_value = 0x100; // 0x100 was chosen as the smallest acceptable value + } +#endif +#endif +} + +static void set_canary(UNUSED const struct slab_metadata *metadata, UNUSED void *p, UNUSED size_t size) { +#if SLAB_CANARY +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + return; + } +#endif + + memcpy((char *)p + size - canary_size, &metadata->canary_value, canary_size); +#endif +} + +static void check_canary(UNUSED const struct slab_metadata *metadata, UNUSED const void *p, UNUSED size_t size) { +#if SLAB_CANARY +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + return; + } +#endif + + u64 canary_value; + memcpy(&canary_value, (const char *)p + size - canary_size, canary_size); + +#ifdef HAS_ARM_MTE + if (unlikely(canary_value == 0)) { + return; + } +#endif + + if (unlikely(canary_value != metadata->canary_value)) { + fatal_error("canary corrupted"); + } +#endif +} + +static inline void stats_small_allocate(UNUSED struct size_class *c, UNUSED size_t size) { +#if CONFIG_STATS + c->allocated += size; + c->nmalloc++; +#endif +} + +static inline void stats_small_deallocate(UNUSED struct size_class *c, UNUSED size_t size) { +#if CONFIG_STATS + c->allocated -= size; + c->ndalloc++; +#endif +} + +static inline void stats_slab_allocate(UNUSED struct size_class *c, UNUSED size_t slab_size) { +#if CONFIG_STATS + c->slab_allocated += slab_size; +#endif +} + +static inline void stats_slab_deallocate(UNUSED struct size_class *c, UNUSED size_t slab_size) { +#if CONFIG_STATS + c->slab_allocated -= slab_size; +#endif +} + +#ifdef HAS_ARM_MTE +static void *tag_and_clear_slab_slot(struct slab_metadata *metadata, void *slot_ptr, size_t slot_idx, size_t slot_size) { + // arm_mte_tags is an array of 4-bit unsigned integers stored as u8 array (MTE tags are 4-bit wide) + // + // It stores the most recent tag for each slab slot, or 0 if the slot was never used. + // Slab indices in arm_mte_tags array are shifted to the right by 1, and size of this array + // is (MAX_SLAB_SLOT_COUNT + 2). This means that first and last values of arm_mte_tags array + // are always 0, which allows to handle edge slots in a branchless way when tag exclusion mask + // is constructed. + u8 *slot_tags = metadata->arm_mte_tags; + + // Tag exclusion mask. 0 tag is always excluded to detect accesses to slab memory via untagged + // pointers. Moreover, 0 tag is excluded in bionic via PR_MTE_TAG_MASK prctl + u64 tem = (1 << 0) | (1 << RESERVED_TAG); + + // current or previous tag of left neighbor or 0 if there's no left neighbor or if it was never used + tem |= (1 << u4_arr_get(slot_tags, slot_idx)); + // previous tag of this slot or 0 if it was never used + tem |= (1 << u4_arr_get(slot_tags, slot_idx + 1)); + // current or previous tag of right neighbor or 0 if there's no right neighbor or if it was never used + tem |= (1 << u4_arr_get(slot_tags, slot_idx + 2)); + + void *tagged_ptr = arm_mte_create_random_tag(slot_ptr, tem); + // slot addresses and sizes are always aligned by 16 + arm_mte_tag_and_clear_mem(tagged_ptr, slot_size); + + // store new tag of this slot + u4_arr_set(slot_tags, slot_idx + 1, get_pointer_tag(tagged_ptr)); + + return tagged_ptr; +} +#endif + +static inline void *allocate_small(unsigned arena, size_t requested_size) { + struct size_info info = get_size_info(requested_size); + size_t size = likely(info.size) ? 
info.size : 16; + + struct size_class *c = &ro.size_class_metadata[arena][info.class]; + size_t slots = get_slots(info.class); + size_t slab_size = get_slab_size(slots, size); + + mutex_lock(&c->lock); + + if (c->partial_slabs == NULL) { + if (c->empty_slabs != NULL) { + struct slab_metadata *metadata = c->empty_slabs; + c->empty_slabs = c->empty_slabs->next; + c->empty_slabs_total -= slab_size; + + metadata->next = NULL; + metadata->prev = NULL; + + c->partial_slabs = slots > 1 ? metadata : NULL; + + void *slab = get_slab(c, slab_size, metadata); + size_t slot = get_free_slot(&c->rng, slots, metadata); + set_used_slot(metadata, slot); + void *p = slot_pointer(size, slab, slot); + if (requested_size) { + write_after_free_check(p, size - canary_size); + set_canary(metadata, p, size); +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + p = tag_and_clear_slab_slot(metadata, p, slot, size); + } +#endif + } + stats_small_allocate(c, size); + + mutex_unlock(&c->lock); + return p; + } + + if (c->free_slabs_head != NULL) { + struct slab_metadata *metadata = c->free_slabs_head; + set_slab_canary_value(metadata, &c->rng); + + void *slab = get_slab(c, slab_size, metadata); + if (requested_size && memory_protect_rw(slab, slab_size)) { + mutex_unlock(&c->lock); + return NULL; + } + + c->free_slabs_head = c->free_slabs_head->next; + if (c->free_slabs_head == NULL) { + c->free_slabs_tail = NULL; + } + + metadata->next = NULL; + metadata->prev = NULL; + + c->partial_slabs = slots > 1 ? metadata : NULL; + + size_t slot = get_free_slot(&c->rng, slots, metadata); + set_used_slot(metadata, slot); + void *p = slot_pointer(size, slab, slot); + if (requested_size) { + set_canary(metadata, p, size); +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + p = tag_and_clear_slab_slot(metadata, p, slot, size); + } +#endif + } + stats_slab_allocate(c, slab_size); + stats_small_allocate(c, size); + + mutex_unlock(&c->lock); + return p; + } + + struct slab_metadata *metadata = alloc_metadata(c, slab_size, requested_size); + if (unlikely(metadata == NULL)) { + mutex_unlock(&c->lock); + return NULL; + } + set_slab_canary_value(metadata, &c->rng); + + c->partial_slabs = slots > 1 ? 
metadata : NULL; + void *slab = get_slab(c, slab_size, metadata); + size_t slot = get_free_slot(&c->rng, slots, metadata); + set_used_slot(metadata, slot); + void *p = slot_pointer(size, slab, slot); + if (requested_size) { + set_canary(metadata, p, size); +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + p = tag_and_clear_slab_slot(metadata, p, slot, size); + } +#endif + } + stats_slab_allocate(c, slab_size); + stats_small_allocate(c, size); + + mutex_unlock(&c->lock); + return p; + } + + struct slab_metadata *metadata = c->partial_slabs; + size_t slot = get_free_slot(&c->rng, slots, metadata); + set_used_slot(metadata, slot); + + if (!has_free_slots(slots, metadata)) { + c->partial_slabs = c->partial_slabs->next; + if (c->partial_slabs) { + c->partial_slabs->prev = NULL; + } + } + + void *slab = get_slab(c, slab_size, metadata); + void *p = slot_pointer(size, slab, slot); + if (requested_size) { + write_after_free_check(p, size - canary_size); + set_canary(metadata, p, size); +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + p = tag_and_clear_slab_slot(metadata, p, slot, size); + } +#endif + } + stats_small_allocate(c, size); + + mutex_unlock(&c->lock); + return p; +} + +struct slab_size_class_info { + unsigned arena; + size_t class; +}; + +static struct slab_size_class_info slab_size_class(const void *p) { + size_t offset = (const char *)p - (const char *)ro.slab_region_start; + unsigned arena = 0; + if (N_ARENA > 1) { + arena = offset / ARENA_SIZE; + offset -= arena * ARENA_SIZE; + } + return (struct slab_size_class_info){arena, offset / REAL_CLASS_REGION_SIZE}; +} + +static size_t slab_usable_size(const void *p) { + return size_classes[slab_size_class(p).class]; +} + +static void enqueue_free_slab(struct size_class *c, struct slab_metadata *metadata) { + metadata->next = NULL; + + static_assert(FREE_SLABS_QUARANTINE_RANDOM_LENGTH < (u16)-1, "free slabs quarantine too large"); + size_t index = get_random_u16_uniform(&c->rng, FREE_SLABS_QUARANTINE_RANDOM_LENGTH); + struct slab_metadata *substitute = c->free_slabs_quarantine[index]; + c->free_slabs_quarantine[index] = metadata; + + if (substitute == NULL) { + return; + } + + if (c->free_slabs_tail != NULL) { + c->free_slabs_tail->next = substitute; + } else { + c->free_slabs_head = substitute; + } + c->free_slabs_tail = substitute; +} + +// preserves errno +static inline void deallocate_small(void *p, const size_t *expected_size) { + struct slab_size_class_info size_class_info = slab_size_class(p); + size_t class = size_class_info.class; + + struct size_class *c = &ro.size_class_metadata[size_class_info.arena][class]; + size_t size = size_classes[class]; + if (expected_size && unlikely(size != *expected_size)) { + fatal_error("sized deallocation mismatch (small)"); + } + bool is_zero_size = size == 0; + if (unlikely(is_zero_size)) { + size = 16; + } + size_t slots = get_slots(class); + size_t slab_size = get_slab_size(slots, size); + + mutex_lock(&c->lock); + + stats_small_deallocate(c, size); + + struct slab_metadata *metadata = get_metadata(c, p); + void *slab = get_slab(c, slab_size, metadata); + size_t slot = libdivide_u32_do((char *)p - (char *)slab, &c->size_divisor); + + if (unlikely(slot_pointer(size, slab, slot) != p)) { + fatal_error("invalid unaligned free"); + } + + if (unlikely(!is_used_slot(metadata, slot))) { + fatal_error("double free"); + } + + if (likely(!is_zero_size)) { + check_canary(metadata, p, size); + + bool skip_zero = false; +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + 
arm_mte_tag_and_clear_mem(set_pointer_tag(p, RESERVED_TAG), size); + // metadata->arm_mte_tags is intentionally not updated, see tag_and_clear_slab_slot() + skip_zero = true; + } +#endif + + if (ZERO_ON_FREE && !skip_zero) { + memset(p, 0, size - canary_size); + } + } + +#if SLAB_QUARANTINE + if (unlikely(is_quarantine_slot(metadata, slot))) { + fatal_error("double free (quarantine)"); + } + + set_quarantine_slot(metadata, slot); + + size_t quarantine_shift = clz64(size) - (63 - MAX_SLAB_SIZE_CLASS_SHIFT); + +#if SLAB_QUARANTINE_RANDOM_LENGTH > 0 + size_t slab_quarantine_random_length = SLAB_QUARANTINE_RANDOM_LENGTH << quarantine_shift; + + size_t random_index = get_random_u16_uniform(&c->rng, slab_quarantine_random_length); + void *random_substitute = c->quarantine_random[random_index]; + c->quarantine_random[random_index] = p; + + if (random_substitute == NULL) { + mutex_unlock(&c->lock); + return; + } + + p = random_substitute; +#endif + +#if SLAB_QUARANTINE_QUEUE_LENGTH > 0 + size_t slab_quarantine_queue_length = SLAB_QUARANTINE_QUEUE_LENGTH << quarantine_shift; + + void *queue_substitute = c->quarantine_queue[c->quarantine_queue_index]; + c->quarantine_queue[c->quarantine_queue_index] = p; + c->quarantine_queue_index = (c->quarantine_queue_index + 1) % slab_quarantine_queue_length; + + if (queue_substitute == NULL) { + mutex_unlock(&c->lock); + return; + } + + p = queue_substitute; +#endif + + metadata = get_metadata(c, p); + slab = get_slab(c, slab_size, metadata); + slot = libdivide_u32_do((char *)p - (char *)slab, &c->size_divisor); + + clear_quarantine_slot(metadata, slot); +#endif + + // triggered even for slots == 1 and then undone below + if (!has_free_slots(slots, metadata)) { + metadata->next = c->partial_slabs; + metadata->prev = NULL; + + if (c->partial_slabs) { + c->partial_slabs->prev = metadata; + } + c->partial_slabs = metadata; + } + + clear_used_slot(metadata, slot); + + if (is_free_slab(metadata)) { + if (metadata->prev) { + metadata->prev->next = metadata->next; + } else { + c->partial_slabs = metadata->next; + } + if (metadata->next) { + metadata->next->prev = metadata->prev; + } + + metadata->prev = NULL; + + if (c->empty_slabs_total + slab_size > max_empty_slabs_total) { + int saved_errno = errno; + if (!memory_map_fixed(slab, slab_size)) { + label_slab(slab, slab_size, class); + stats_slab_deallocate(c, slab_size); + enqueue_free_slab(c, metadata); + mutex_unlock(&c->lock); + return; + } + memory_purge(slab, slab_size); + errno = saved_errno; + // handle out-of-memory by putting it into the empty slabs list + } + + metadata->next = c->empty_slabs; + c->empty_slabs = metadata; + c->empty_slabs_total += slab_size; + } + + mutex_unlock(&c->lock); +} + +struct region_metadata { + void *p; + size_t size; + size_t guard_size; +}; + +struct quarantine_info { + void *p; + size_t size; +}; + +#define INITIAL_REGION_TABLE_SIZE 128 +#define MAX_REGION_TABLE_SIZE (CLASS_REGION_SIZE / PAGE_SIZE / sizeof(struct region_metadata)) + +struct region_allocator { + struct mutex lock; + struct region_metadata *regions; + size_t total; + size_t free; +#if CONFIG_STATS + size_t allocated; +#endif +#if REGION_QUARANTINE_RANDOM_LENGTH + struct quarantine_info quarantine_random[REGION_QUARANTINE_RANDOM_LENGTH]; +#endif +#if REGION_QUARANTINE_QUEUE_LENGTH + struct quarantine_info quarantine_queue[REGION_QUARANTINE_QUEUE_LENGTH]; + size_t quarantine_queue_index; +#endif + struct random_state rng; +}; + +static inline void stats_large_allocate(UNUSED struct region_allocator *ra, UNUSED 
size_t size) { +#if CONFIG_STATS + ra->allocated += size; +#endif +} + +static inline void stats_large_deallocate(UNUSED struct region_allocator *ra, UNUSED size_t size) { +#if CONFIG_STATS + ra->allocated -= size; +#endif +} + +struct __attribute__((aligned(PAGE_SIZE))) slab_info_mapping { + struct slab_metadata slab_info[MAX_METADATA_MAX]; +}; + +struct __attribute__((aligned(PAGE_SIZE))) allocator_state { + struct size_class size_class_metadata[N_ARENA][N_SIZE_CLASSES]; + struct region_allocator region_allocator; + // padding until next page boundary for mprotect + struct region_metadata regions_a[MAX_REGION_TABLE_SIZE] __attribute__((aligned(PAGE_SIZE))); + // padding until next page boundary for mprotect + struct region_metadata regions_b[MAX_REGION_TABLE_SIZE] __attribute__((aligned(PAGE_SIZE))); + // padding until next page boundary for mprotect + struct slab_info_mapping slab_info_mapping[N_ARENA][N_SIZE_CLASSES]; + // padding until next page boundary for mprotect +}; + +static void regions_quarantine_deallocate_pages(void *p, size_t size, size_t guard_size) { + if (!REGION_QUARANTINE || size >= REGION_QUARANTINE_SKIP_THRESHOLD) { + deallocate_pages(p, size, guard_size); + return; + } + + if (unlikely(memory_map_fixed(p, size))) { + memory_purge(p, size); + } else { + memory_set_name(p, size, "malloc large quarantine"); + } + + struct quarantine_info target = + (struct quarantine_info){(char *)p - guard_size, size + guard_size * 2}; + + struct region_allocator *ra = ro.region_allocator; + + mutex_lock(&ra->lock); + +#if REGION_QUARANTINE_RANDOM_LENGTH + size_t index = get_random_u64_uniform(&ra->rng, REGION_QUARANTINE_RANDOM_LENGTH); + struct quarantine_info random_substitute = ra->quarantine_random[index]; + ra->quarantine_random[index] = target; + if (random_substitute.p == NULL) { + mutex_unlock(&ra->lock); + return; + } + target = random_substitute; +#endif + +#if REGION_QUARANTINE_QUEUE_LENGTH + struct quarantine_info queue_substitute = ra->quarantine_queue[ra->quarantine_queue_index]; + ra->quarantine_queue[ra->quarantine_queue_index] = target; + ra->quarantine_queue_index = (ra->quarantine_queue_index + 1) % REGION_QUARANTINE_QUEUE_LENGTH; + target = queue_substitute; +#endif + + mutex_unlock(&ra->lock); + + if (target.p != NULL) { + memory_unmap(target.p, target.size); + } +} + +static int regions_grow(void) { + struct region_allocator *ra = ro.region_allocator; + + if (ra->total > SIZE_MAX / sizeof(struct region_metadata) / 2) { + return 1; + } + + size_t newtotal = ra->total * 2; + size_t newsize = newtotal * sizeof(struct region_metadata); + size_t mask = newtotal - 1; + + if (newtotal > MAX_REGION_TABLE_SIZE) { + return 1; + } + + struct region_metadata *p = ra->regions == ro.regions[0] ? 
+ ro.regions[1] : ro.regions[0]; + + if (memory_protect_rw_metadata(p, newsize)) { + return 1; + } + + for (size_t i = 0; i < ra->total; i++) { + const void *q = ra->regions[i].p; + if (q != NULL) { + size_t index = hash_page(q) & mask; + while (p[index].p != NULL) { + index = (index - 1) & mask; + } + p[index] = ra->regions[i]; + } + } + + memory_map_fixed(ra->regions, ra->total * sizeof(struct region_metadata)); + memory_set_name(ra->regions, ra->total * sizeof(struct region_metadata), "malloc allocator_state"); + ra->free = ra->free + ra->total; + ra->total = newtotal; + ra->regions = p; + return 0; +} + +static int regions_insert(void *p, size_t size, size_t guard_size) { + struct region_allocator *ra = ro.region_allocator; + + if (ra->free * 4 < ra->total) { + if (regions_grow()) { + return 1; + } + } + + size_t mask = ra->total - 1; + size_t index = hash_page(p) & mask; + void *q = ra->regions[index].p; + while (q != NULL) { + index = (index - 1) & mask; + q = ra->regions[index].p; + } + ra->regions[index].p = p; + ra->regions[index].size = size; + ra->regions[index].guard_size = guard_size; + ra->free--; + return 0; +} + +static struct region_metadata *regions_find(const void *p) { + const struct region_allocator *ra = ro.region_allocator; + + size_t mask = ra->total - 1; + size_t index = hash_page(p) & mask; + void *r = ra->regions[index].p; + while (r != p && r != NULL) { + index = (index - 1) & mask; + r = ra->regions[index].p; + } + return (r == p && r != NULL) ? &ra->regions[index] : NULL; +} + +static void regions_delete(const struct region_metadata *region) { + struct region_allocator *ra = ro.region_allocator; + + size_t mask = ra->total - 1; + + ra->free++; + + size_t i = region - ra->regions; + for (;;) { + ra->regions[i].p = NULL; + ra->regions[i].size = 0; + size_t j = i; + for (;;) { + i = (i - 1) & mask; + if (ra->regions[i].p == NULL) { + return; + } + size_t r = hash_page(ra->regions[i].p) & mask; + if ((i <= r && r < j) || (r < j && j < i) || (j < i && i <= r)) { + continue; + } + ra->regions[j] = ra->regions[i]; + break; + } + } +} + +int get_metadata_key(void) { +#ifdef USE_PKEY + return ro.metadata_pkey; +#else + return -1; +#endif +} + +static inline void thread_set_metadata_access(UNUSED unsigned access) { +#ifdef USE_PKEY + if (ro.metadata_pkey == -1) { + return; + } + pkey_set(ro.metadata_pkey, access); +#endif +} + +static inline void thread_unseal_metadata(void) { + thread_set_metadata_access(0); +} + +static inline void thread_seal_metadata(void) { +#ifdef USE_PKEY + thread_set_metadata_access(PKEY_DISABLE_ACCESS); +#endif +} + +static void full_lock(void) { + thread_unseal_metadata(); + mutex_lock(&ro.region_allocator->lock); + for (unsigned arena = 0; arena < N_ARENA; arena++) { + for (unsigned class = 0; class < N_SIZE_CLASSES; class++) { + mutex_lock(&ro.size_class_metadata[arena][class].lock); + } + } + thread_seal_metadata(); +} + +static void full_unlock(void) { + thread_unseal_metadata(); + mutex_unlock(&ro.region_allocator->lock); + for (unsigned arena = 0; arena < N_ARENA; arena++) { + for (unsigned class = 0; class < N_SIZE_CLASSES; class++) { + mutex_unlock(&ro.size_class_metadata[arena][class].lock); + } + } + thread_seal_metadata(); +} + +static void post_fork_child(void) { + thread_unseal_metadata(); + + mutex_init(&ro.region_allocator->lock); + random_state_init(&ro.region_allocator->rng); + for (unsigned arena = 0; arena < N_ARENA; arena++) { + for (unsigned class = 0; class < N_SIZE_CLASSES; class++) { + struct size_class *c = 
&ro.size_class_metadata[arena][class]; + mutex_init(&c->lock); + random_state_init(&c->rng); + } + } + thread_seal_metadata(); +} + +static inline bool is_init(void) { + return get_slab_region_end() != NULL; +} + +static inline void enforce_init(void) { + if (unlikely(!is_init())) { + fatal_error("invalid uninitialized allocator usage"); + } +} + +static struct mutex init_lock = MUTEX_INITIALIZER; + +COLD static void init_slow_path(void) { + + mutex_lock(&init_lock); + + if (unlikely(is_init())) { + mutex_unlock(&init_lock); + return; + } + +#ifdef USE_PKEY + ro.metadata_pkey = pkey_alloc(0, 0); +#endif + + if (unlikely(sysconf(_SC_PAGESIZE) != PAGE_SIZE)) { + fatal_error("runtime page size does not match compile-time page size which is not supported"); + } + + struct random_state *rng = allocate_pages(sizeof(struct random_state), PAGE_SIZE, true, "malloc init rng"); + if (unlikely(rng == NULL)) { + fatal_error("failed to allocate init rng"); + } + random_state_init(rng); + + size_t metadata_guard_size = + (get_random_u64_uniform(rng, REAL_CLASS_REGION_SIZE / PAGE_SIZE) + 1) * PAGE_SIZE; + + struct allocator_state *allocator_state = + allocate_pages(sizeof(struct allocator_state), metadata_guard_size, false, "malloc allocator_state"); + if (unlikely(allocator_state == NULL)) { + fatal_error("failed to reserve allocator state"); + } + if (unlikely(memory_protect_rw_metadata(allocator_state, offsetof(struct allocator_state, regions_a)))) { + fatal_error("failed to unprotect allocator state"); + } + + ro.region_allocator = &allocator_state->region_allocator; + struct region_allocator *ra = ro.region_allocator; + + mutex_init(&ra->lock); + random_state_init_from_random_state(&ra->rng, rng); + ro.regions[0] = allocator_state->regions_a; + ro.regions[1] = allocator_state->regions_b; + ra->regions = ro.regions[0]; + ra->total = INITIAL_REGION_TABLE_SIZE; + ra->free = INITIAL_REGION_TABLE_SIZE; + if (unlikely(memory_protect_rw_metadata(ra->regions, ra->total * sizeof(struct region_metadata)))) { + fatal_error("failed to unprotect memory for regions table"); + } +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + ro.slab_region_start = memory_map_mte(slab_region_size); + } else { + ro.slab_region_start = memory_map(slab_region_size); + } +#else + ro.slab_region_start = memory_map(slab_region_size); +#endif + if (unlikely(ro.slab_region_start == NULL)) { + fatal_error("failed to allocate slab region"); + } + void *slab_region_end = (char *)ro.slab_region_start + slab_region_size; + memory_set_name(ro.slab_region_start, slab_region_size, "malloc slab region gap"); + + for (unsigned arena = 0; arena < N_ARENA; arena++) { + ro.size_class_metadata[arena] = allocator_state->size_class_metadata[arena]; + for (unsigned class = 0; class < N_SIZE_CLASSES; class++) { + struct size_class *c = &ro.size_class_metadata[arena][class]; + + mutex_init(&c->lock); + random_state_init_from_random_state(&c->rng, rng); + + size_t bound = (REAL_CLASS_REGION_SIZE - CLASS_REGION_SIZE) / PAGE_SIZE - 1; + size_t gap = (get_random_u64_uniform(rng, bound) + 1) * PAGE_SIZE; + c->class_region_start = (char *)ro.slab_region_start + ARENA_SIZE * arena + REAL_CLASS_REGION_SIZE * class + gap; + label_slab(c->class_region_start, CLASS_REGION_SIZE, class); + + size_t size = size_classes[class]; + if (size == 0) { + size = 16; + } + c->size_divisor = libdivide_u32_gen(size); + size_t slab_size = get_slab_size(get_slots(class), size); + c->slab_size_divisor = libdivide_u64_gen(slab_size); + c->slab_info = 
allocator_state->slab_info_mapping[arena][class].slab_info; + } + } + + deallocate_pages(rng, sizeof(struct random_state), PAGE_SIZE); + + atomic_store_explicit(&ro.slab_region_end, slab_region_end, memory_order_release); + + if (unlikely(memory_protect_ro(&ro, sizeof(ro)))) { + fatal_error("failed to protect allocator data"); + } + memory_set_name(&ro, sizeof(ro), "malloc read-only after init"); + + mutex_unlock(&init_lock); + + // may allocate, so wait until the allocator is initialized to avoid deadlocking + if (unlikely(pthread_atfork(full_lock, full_unlock, post_fork_child))) { + fatal_error("pthread_atfork failed"); + } +} + +static inline unsigned init(void) { + unsigned arena = thread_arena; +#if N_ARENA > 1 + if (likely(arena < N_ARENA)) { + return arena; + } + thread_arena = arena = thread_arena_counter++ % N_ARENA; +#endif + if (unlikely(!is_init())) { + init_slow_path(); + } + return arena; +} + +#if CONFIG_SELF_INIT +// trigger early initialization to set up pthread_atfork and protect state as soon as possible +COLD __attribute__((constructor(101))) static void trigger_early_init(void) { + // avoid calling init directly to skip it if this isn't the malloc implementation + h_free(h_malloc(16)); +} +#endif + +// Returns 0 on overflow. +static size_t get_large_size_class(size_t size) { + if (CONFIG_LARGE_SIZE_CLASSES) { + // Continue small size class growth pattern of power of 2 spacing classes: + // + // 4 KiB [20 KiB, 24 KiB, 28 KiB, 32 KiB] + // 8 KiB [40 KiB, 48 KiB, 54 KiB, 64 KiB] + // 16 KiB [80 KiB, 96 KiB, 112 KiB, 128 KiB] + // 32 KiB [160 KiB, 192 KiB, 224 KiB, 256 KiB] + // 512 KiB [2560 KiB, 3 MiB, 3584 KiB, 4 MiB] + // 1 MiB [5 MiB, 6 MiB, 7 MiB, 8 MiB] + // etc. + return get_size_info(max(size, (size_t)PAGE_SIZE)).size; + } + return page_align(size); +} + +static size_t get_guard_size(struct random_state *state, size_t size) { + return (get_random_u64_uniform(state, size / PAGE_SIZE / GUARD_SIZE_DIVISOR) + 1) * PAGE_SIZE; +} + +static void *allocate_large(size_t size) { + size = get_large_size_class(size); + if (unlikely(!size)) { + errno = ENOMEM; + return NULL; + } + + struct region_allocator *ra = ro.region_allocator; + + mutex_lock(&ra->lock); + size_t guard_size = get_guard_size(&ra->rng, size); + mutex_unlock(&ra->lock); + + void *p = allocate_pages(size, guard_size, true, "malloc large"); + if (p == NULL) { + return NULL; + } + + mutex_lock(&ra->lock); + if (unlikely(regions_insert(p, size, guard_size))) { + mutex_unlock(&ra->lock); + deallocate_pages(p, size, guard_size); + return NULL; + } + stats_large_allocate(ra, size); + mutex_unlock(&ra->lock); + + return p; +} + +static inline void *allocate(unsigned arena, size_t size) { + return size <= max_slab_size_class ? 
allocate_small(arena, size) : allocate_large(size); +} + +static void deallocate_large(void *p, const size_t *expected_size) { + enforce_init(); + thread_unseal_metadata(); + + struct region_allocator *ra = ro.region_allocator; + + mutex_lock(&ra->lock); + const struct region_metadata *region = regions_find(p); + if (unlikely(region == NULL)) { + fatal_error("invalid free"); + } + size_t size = region->size; + if (expected_size && unlikely(size != get_large_size_class(*expected_size))) { + fatal_error("sized deallocation mismatch (large)"); + } + size_t guard_size = region->guard_size; + regions_delete(region); + stats_large_deallocate(ra, size); + mutex_unlock(&ra->lock); + + regions_quarantine_deallocate_pages(p, size, guard_size); +} + +static int allocate_aligned(unsigned arena, void **memptr, size_t alignment, size_t size, size_t min_alignment) { + if ((alignment - 1) & alignment || alignment < min_alignment) { + return EINVAL; + } + + if (alignment <= PAGE_SIZE) { + if (size <= max_slab_size_class && alignment > min_align) { + size = get_size_info_align(size, alignment).size; + } + + void *p = allocate(arena, size); + if (unlikely(p == NULL)) { + return ENOMEM; + } + *memptr = p; + return 0; + } + + size = get_large_size_class(size); + if (unlikely(!size)) { + return ENOMEM; + } + + struct region_allocator *ra = ro.region_allocator; + + mutex_lock(&ra->lock); + size_t guard_size = get_guard_size(&ra->rng, size); + mutex_unlock(&ra->lock); + + void *p = allocate_pages_aligned(size, alignment, guard_size, "malloc large"); + if (unlikely(p == NULL)) { + return ENOMEM; + } + + mutex_lock(&ra->lock); + if (unlikely(regions_insert(p, size, guard_size))) { + mutex_unlock(&ra->lock); + deallocate_pages(p, size, guard_size); + return ENOMEM; + } + mutex_unlock(&ra->lock); + + *memptr = p; + return 0; +} + +static size_t adjust_size_for_canary(size_t size) { + if (size > 0 && size <= max_slab_size_class) { + return size + canary_size; + } + return size; +} + +static int alloc_aligned(void **memptr, size_t alignment, size_t size, size_t min_alignment) { + unsigned arena = init(); + thread_unseal_metadata(); + size = adjust_size_for_canary(size); + int ret = allocate_aligned(arena, memptr, alignment, size, min_alignment); + thread_seal_metadata(); + return ret; +} + +static void *alloc_aligned_simple(size_t alignment, size_t size) { + void *ptr; + int ret = alloc_aligned(&ptr, alignment, size, 1); + if (unlikely(ret)) { + errno = ret; + return NULL; + } + return ptr; +} + +static inline void *alloc(size_t size) { + unsigned arena = init(); + thread_unseal_metadata(); + void *p = allocate(arena, size); + thread_seal_metadata(); + return p; +} + +EXPORT void *h_malloc(size_t size) { + size = adjust_size_for_canary(size); + return alloc(size); +} + +EXPORT void *h_calloc(size_t nmemb, size_t size) { + size_t total_size; + if (unlikely(__builtin_mul_overflow(nmemb, size, &total_size))) { + errno = ENOMEM; + return NULL; + } + total_size = adjust_size_for_canary(total_size); + void *p = alloc(total_size); + if (!ZERO_ON_FREE && likely(p != NULL) && total_size && total_size <= max_slab_size_class) { + memset(p, 0, total_size - canary_size); + } +#ifdef HAS_ARM_MTE + // use an assert instead of adding a conditional to memset() above (freed memory is always + // zeroed when MTE is enabled) + static_assert(ZERO_ON_FREE, "disabling ZERO_ON_FREE reduces performance when ARM MTE is enabled"); +#endif + return p; +} + +EXPORT void *h_realloc(void *old, size_t size) { + size = adjust_size_for_canary(size); + 
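
Aside on h_calloc above: it rejects nmemb * size products that overflow size_t before any allocation happens, using the GCC/Clang __builtin_mul_overflow builtin. A minimal sketch of that check; checked_calloc is a hypothetical stand-in, not the allocator's function:

    /* illustration only; assumes GCC or Clang for the overflow builtin */
    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static void *checked_calloc(size_t nmemb, size_t size) {
        size_t total;
        if (__builtin_mul_overflow(nmemb, size, &total)) {
            errno = ENOMEM;
            return NULL;
        }
        void *p = malloc(total);
        if (p != NULL) {
            memset(p, 0, total);
        }
        return p;
    }

    int main(void) {
        /* (size_t)-1 / 2 elements of 4 bytes each cannot be represented in size_t */
        void *p = checked_calloc((size_t)-1 / 2, 4);
        printf("%s\n", p == NULL ? "overflow rejected" : "allocated");
        free(p);
        return 0;
    }
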
if (old == NULL) { + return alloc(size); + } + + if (size > max_slab_size_class) { + size = get_large_size_class(size); + if (unlikely(!size)) { + errno = ENOMEM; + return NULL; + } + } + + void *old_orig = old; + old = untag_pointer(old); + + size_t old_size; + if (old < get_slab_region_end() && old >= ro.slab_region_start) { + old_size = slab_usable_size(old); + if (size <= max_slab_size_class && get_size_info(size).size == old_size) { + return old_orig; + } + thread_unseal_metadata(); + } else { + enforce_init(); + thread_unseal_metadata(); + + struct region_allocator *ra = ro.region_allocator; + + mutex_lock(&ra->lock); + const struct region_metadata *region = regions_find(old); + if (unlikely(region == NULL)) { + fatal_error("invalid realloc"); + } + old_size = region->size; + size_t old_guard_size = region->guard_size; + if (old_size == size) { + mutex_unlock(&ra->lock); + thread_seal_metadata(); + return old; + } + mutex_unlock(&ra->lock); + + if (size > max_slab_size_class) { + // in-place shrink + if (size < old_size) { + void *new_end = (char *)old + size; + if (memory_map_fixed(new_end, old_guard_size)) { + thread_seal_metadata(); + return NULL; + } + memory_set_name(new_end, old_guard_size, "malloc large"); + void *new_guard_end = (char *)new_end + old_guard_size; + regions_quarantine_deallocate_pages(new_guard_end, old_size - size, 0); + + mutex_lock(&ra->lock); + struct region_metadata *region = regions_find(old); + if (unlikely(region == NULL)) { + fatal_error("invalid realloc"); + } + region->size = size; + stats_large_deallocate(ra, old_size - size); + mutex_unlock(&ra->lock); + + thread_seal_metadata(); + return old; + } + +#ifdef HAVE_COMPATIBLE_MREMAP + static const bool vma_merging_reliable = false; + if (vma_merging_reliable) { + // in-place growth + void *guard_end = (char *)old + old_size + old_guard_size; + size_t extra = size - old_size; + if (!memory_remap((char *)old + old_size, old_guard_size, old_guard_size + extra)) { + if (memory_protect_rw((char *)old + old_size, extra)) { + memory_unmap(guard_end, extra); + } else { + mutex_lock(&ra->lock); + struct region_metadata *region = regions_find(old); + if (region == NULL) { + fatal_error("invalid realloc"); + } + region->size = size; + stats_large_allocate(ra, extra); + mutex_unlock(&ra->lock); + + thread_seal_metadata(); + return old; + } + } + } + + size_t copy_size = min(size, old_size); + if (copy_size >= MREMAP_MOVE_THRESHOLD) { + void *new = allocate_large(size); + if (new == NULL) { + thread_seal_metadata(); + return NULL; + } + + mutex_lock(&ra->lock); + struct region_metadata *region = regions_find(old); + if (unlikely(region == NULL)) { + fatal_error("invalid realloc"); + } + regions_delete(region); + stats_large_deallocate(ra, old_size); + mutex_unlock(&ra->lock); + + if (memory_remap_fixed(old, old_size, new, size)) { + memcpy(new, old, copy_size); + deallocate_pages(old, old_size, old_guard_size); + } else { + memory_unmap((char *)old - old_guard_size, old_guard_size); + memory_unmap((char *)old + page_align(old_size), old_guard_size); + } + thread_seal_metadata(); + return new; + } +#endif + } + } + + void *new = allocate(init(), size); + if (new == NULL) { + thread_seal_metadata(); + return NULL; + } + size_t copy_size = min(size, old_size); + if (copy_size > 0 && copy_size <= max_slab_size_class) { + copy_size -= canary_size; + } + memcpy(new, old_orig, copy_size); + if (old_size <= max_slab_size_class) { + deallocate_small(old, NULL); + } else { + deallocate_large(old, NULL); + } + 
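
Aside on the realloc path above: when none of the in-place options apply (same size class, in-place shrink, or the optional mremap growth/move), h_realloc falls back to the classic allocate/copy/free shape, with the copy length reduced by the slab canary for small allocations. The sketch below shows only that generic fallback shape, assuming a plain malloc/free backend rather than the allocator's slab and region paths:

    /* illustration only; not the allocator's code */
    #include <stdlib.h>
    #include <string.h>

    static void *realloc_by_copy(void *old, size_t old_size, size_t new_size) {
        void *new = malloc(new_size);
        if (new == NULL) {
            return NULL; /* the old allocation stays valid on failure */
        }
        size_t copy_size = old_size < new_size ? old_size : new_size;
        memcpy(new, old, copy_size);
        free(old);
        return new;
    }

    int main(void) {
        char *p = malloc(16);
        if (p == NULL) {
            return 1;
        }
        memcpy(p, "hello", 6);
        p = realloc_by_copy(p, 16, 64);
        if (p == NULL) {
            return 1;
        }
        free(p);
        return 0;
    }
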
thread_seal_metadata(); + return new; +} + +EXPORT int h_posix_memalign(void **memptr, size_t alignment, size_t size) { + return alloc_aligned(memptr, alignment, size, sizeof(void *)); +} + +EXPORT void *h_aligned_alloc(size_t alignment, size_t size) { + return alloc_aligned_simple(alignment, size); +} + +EXPORT void *h_memalign(size_t alignment, size_t size) ALIAS(h_aligned_alloc); + +#ifndef __ANDROID__ +EXPORT void *h_valloc(size_t size) { + return alloc_aligned_simple(PAGE_SIZE, size); +} + +EXPORT void *h_pvalloc(size_t size) { + size = page_align(size); + if (unlikely(!size)) { + errno = ENOMEM; + return NULL; + } + return alloc_aligned_simple(PAGE_SIZE, size); +} +#endif + +// preserves errno +EXPORT void h_free(void *p) { + if (p == NULL) { + return; + } + + p = untag_pointer(p); + + if (p < get_slab_region_end() && p >= ro.slab_region_start) { + thread_unseal_metadata(); + deallocate_small(p, NULL); + thread_seal_metadata(); + return; + } + + int saved_errno = errno; + deallocate_large(p, NULL); + errno = saved_errno; + + thread_seal_metadata(); +} + +#ifdef __GLIBC__ +EXPORT void h_cfree(void *ptr) ALIAS(h_free); +#endif + +EXPORT void h_free_sized(void *p, size_t expected_size) { + if (p == NULL) { + return; + } + + p = untag_pointer(p); + + expected_size = adjust_size_for_canary(expected_size); + + if (p < get_slab_region_end() && p >= ro.slab_region_start) { + thread_unseal_metadata(); + expected_size = get_size_info(expected_size).size; + deallocate_small(p, &expected_size); + thread_seal_metadata(); + return; + } + + deallocate_large(p, &expected_size); + + thread_seal_metadata(); +} + +static inline void memory_corruption_check_small(const void *p) { + struct slab_size_class_info size_class_info = slab_size_class(p); + size_t class = size_class_info.class; + struct size_class *c = &ro.size_class_metadata[size_class_info.arena][class]; + size_t size = size_classes[class]; + bool is_zero_size = size == 0; + if (unlikely(is_zero_size)) { + size = 16; + } + size_t slab_size = get_slab_size(get_slots(class), size); + + mutex_lock(&c->lock); + + const struct slab_metadata *metadata = get_metadata(c, p); + void *slab = get_slab(c, slab_size, metadata); + size_t slot = libdivide_u32_do((const char *)p - (const char *)slab, &c->size_divisor); + + if (unlikely(slot_pointer(size, slab, slot) != p)) { + fatal_error("invalid unaligned malloc_usable_size"); + } + + if (unlikely(!is_used_slot(metadata, slot))) { + fatal_error("invalid malloc_usable_size"); + } + + if (likely(!is_zero_size)) { + check_canary(metadata, p, size); + } + +#if SLAB_QUARANTINE + if (unlikely(is_quarantine_slot(metadata, slot))) { + fatal_error("invalid malloc_usable_size (quarantine)"); + } +#endif + + mutex_unlock(&c->lock); +} + +EXPORT size_t h_malloc_usable_size(H_MALLOC_USABLE_SIZE_CONST void *arg) { + if (arg == NULL) { + return 0; + } + + const void *p = untag_const_pointer(arg); + + if (p < get_slab_region_end() && p >= ro.slab_region_start) { + thread_unseal_metadata(); + memory_corruption_check_small(p); + thread_seal_metadata(); + + size_t size = slab_usable_size(p); + return size ? 
size - canary_size : 0; + } + + enforce_init(); + thread_unseal_metadata(); + + struct region_allocator *ra = ro.region_allocator; + mutex_lock(&ra->lock); + const struct region_metadata *region = regions_find(p); + if (unlikely(region == NULL)) { + fatal_error("invalid malloc_usable_size"); + } + size_t size = region->size; + mutex_unlock(&ra->lock); + + thread_seal_metadata(); + return size; +} + +EXPORT size_t h_malloc_object_size(const void *p) { + if (p == NULL) { + return 0; + } + + const void *slab_region_end = get_slab_region_end(); + if (p < slab_region_end && p >= ro.slab_region_start) { + thread_unseal_metadata(); + + struct slab_size_class_info size_class_info = slab_size_class(p); + size_t class = size_class_info.class; + size_t size_class = size_classes[class]; + struct size_class *c = &ro.size_class_metadata[size_class_info.arena][class]; + + mutex_lock(&c->lock); + + const struct slab_metadata *metadata = get_metadata(c, p); + size_t slab_size = get_slab_size(get_slots(class), size_class); + void *slab = get_slab(c, slab_size, metadata); + size_t slot = libdivide_u32_do((const char *)p - (const char *)slab, &c->size_divisor); + + if (unlikely(!is_used_slot(metadata, slot))) { + fatal_error("invalid malloc_object_size"); + } + +#if SLAB_QUARANTINE + if (unlikely(is_quarantine_slot(metadata, slot))) { + fatal_error("invalid malloc_object_size (quarantine)"); + } +#endif + + void *start = slot_pointer(size_class, slab, slot); + size_t offset = (const char *)p - (const char *)start; + + mutex_unlock(&c->lock); + thread_seal_metadata(); + + size_t size = slab_usable_size(p); + return size ? size - canary_size - offset : 0; + } + + if (unlikely(slab_region_end == NULL)) { + return SIZE_MAX; + } + + thread_unseal_metadata(); + + struct region_allocator *ra = ro.region_allocator; + mutex_lock(&ra->lock); + const struct region_metadata *region = regions_find(p); + size_t size = region == NULL ? SIZE_MAX : region->size; + mutex_unlock(&ra->lock); + + thread_seal_metadata(); + return size; +} + +EXPORT size_t h_malloc_object_size_fast(const void *p) { + if (p == NULL) { + return 0; + } + + const void *slab_region_end = get_slab_region_end(); + if (p < slab_region_end && p >= ro.slab_region_start) { + size_t size = slab_usable_size(p); + return size ? 
size - canary_size : 0; + } + + if (unlikely(slab_region_end == NULL)) { + return 0; + } + + return SIZE_MAX; +} + +EXPORT int h_mallopt(UNUSED int param, UNUSED int value) { +#ifdef __ANDROID__ + if (param == M_PURGE) { + h_malloc_trim(0); + return 1; + } +#endif + return 0; +} + +EXPORT int h_malloc_trim(UNUSED size_t pad) { + if (unlikely(!is_init())) { + return 0; + } + + thread_unseal_metadata(); + + bool is_trimmed = false; + + for (unsigned arena = 0; arena < N_ARENA; arena++) { + // skip zero byte size class since there's nothing to change + for (unsigned class = 1; class < N_SIZE_CLASSES; class++) { + struct size_class *c = &ro.size_class_metadata[arena][class]; + size_t size = size_classes[class]; + size_t slab_size = get_slab_size(get_slots(class), size); + + mutex_lock(&c->lock); + + struct slab_metadata *iterator = c->empty_slabs; + while (iterator) { + void *slab = get_slab(c, slab_size, iterator); + if (memory_map_fixed(slab, slab_size)) { + break; + } + label_slab(slab, slab_size, class); + stats_slab_deallocate(c, slab_size); + + struct slab_metadata *trimmed = iterator; + iterator = iterator->next; + c->empty_slabs_total -= slab_size; + + enqueue_free_slab(c, trimmed); + + is_trimmed = true; + } + c->empty_slabs = iterator; + +#if SLAB_QUARANTINE && CONFIG_EXTENDED_SIZE_CLASSES + if (size >= min_extended_size_class) { + size_t quarantine_shift = clz64(size) - (63 - MAX_SLAB_SIZE_CLASS_SHIFT); + +#if SLAB_QUARANTINE_RANDOM_LENGTH > 0 + size_t slab_quarantine_random_length = SLAB_QUARANTINE_RANDOM_LENGTH << quarantine_shift; + for (size_t i = 0; i < slab_quarantine_random_length; i++) { + void *p = c->quarantine_random[i]; + if (p != NULL) { + memory_purge(p, size); + } + } +#endif + +#if SLAB_QUARANTINE_QUEUE_LENGTH > 0 + size_t slab_quarantine_queue_length = SLAB_QUARANTINE_QUEUE_LENGTH << quarantine_shift; + for (size_t i = 0; i < slab_quarantine_queue_length; i++) { + void *p = c->quarantine_queue[i]; + if (p != NULL) { + memory_purge(p, size); + } + } +#endif + } +#endif + + mutex_unlock(&c->lock); + } + } + + thread_seal_metadata(); + + return is_trimmed; +} + +EXPORT void h_malloc_stats(void) {} + +#if defined(__GLIBC__) || defined(__ANDROID__) +// glibc mallinfo is broken and replaced with mallinfo2 +#if defined(__GLIBC__) +EXPORT struct mallinfo h_mallinfo(void) { + return (struct mallinfo){0}; +} + +EXPORT struct mallinfo2 h_mallinfo2(void) { + struct mallinfo2 info = {0}; +#else +EXPORT struct mallinfo h_mallinfo(void) { + struct mallinfo info = {0}; +#endif + +#if CONFIG_STATS + if (unlikely(!is_init())) { + return info; + } + + thread_unseal_metadata(); + + struct region_allocator *ra = ro.region_allocator; + mutex_lock(&ra->lock); + info.hblkhd += ra->allocated; + info.uordblks += ra->allocated; + mutex_unlock(&ra->lock); + + for (unsigned arena = 0; arena < N_ARENA; arena++) { + for (unsigned class = 0; class < N_SIZE_CLASSES; class++) { + struct size_class *c = &ro.size_class_metadata[arena][class]; + + mutex_lock(&c->lock); + info.hblkhd += c->slab_allocated; + info.uordblks += c->allocated; + mutex_unlock(&c->lock); + } + } + + info.fordblks = info.hblkhd - info.uordblks; + info.usmblks = info.hblkhd; + + thread_seal_metadata(); +#endif + + return info; +} +#endif + +#ifndef __ANDROID__ +EXPORT int h_malloc_info(int options, FILE *fp) { + if (options) { + errno = EINVAL; + return -1; + } + + fputs("", fp); + +#if CONFIG_STATS + if (likely(is_init())) { + thread_unseal_metadata(); + + for (unsigned arena = 0; arena < N_ARENA; arena++) { + fprintf(fp, "", 
arena); + + for (unsigned class = 0; class < N_SIZE_CLASSES; class++) { + struct size_class *c = &ro.size_class_metadata[arena][class]; + + mutex_lock(&c->lock); + u64 nmalloc = c->nmalloc; + u64 ndalloc = c->ndalloc; + size_t slab_allocated = c->slab_allocated; + size_t allocated = c->allocated; + mutex_unlock(&c->lock); + + if (nmalloc || ndalloc || slab_allocated || allocated) { + fprintf(fp, "" + "%" PRIu64 "" + "%" PRIu64 "" + "%zu" + "%zu" + "", class, size_classes[class], nmalloc, ndalloc, slab_allocated, + allocated); + } + } + + fputs("", fp); + } + + struct region_allocator *ra = ro.region_allocator; + mutex_lock(&ra->lock); + size_t region_allocated = ra->allocated; + mutex_unlock(&ra->lock); + + fprintf(fp, "" + "%zu" + "", N_ARENA, region_allocated); + + thread_seal_metadata(); + } +#endif + + fputs("", fp); + + return 0; +} +#endif + +#ifdef __ANDROID__ +EXPORT size_t h_mallinfo_narenas(void) { + // Consider region allocator to be an arena with index N_ARENA. + return N_ARENA + 1; +} + +EXPORT size_t h_mallinfo_nbins(void) { + return N_SIZE_CLASSES; +} + +// This internal Android API uses mallinfo in a non-standard way to implement malloc_info: +// +// hblkhd: total mapped memory as usual +// ordblks: large allocations +// uordblks: huge allocations +// fsmblks: small allocations +// (other fields are unused) +EXPORT struct mallinfo h_mallinfo_arena_info(UNUSED size_t arena) { + struct mallinfo info = {0}; + +#if CONFIG_STATS + if (unlikely(!is_init())) { + return info; + } + + thread_unseal_metadata(); + + if (arena < N_ARENA) { + for (unsigned class = 0; class < N_SIZE_CLASSES; class++) { + struct size_class *c = &ro.size_class_metadata[arena][class]; + + mutex_lock(&c->lock); + info.hblkhd += c->slab_allocated; + info.fsmblks += c->allocated; + mutex_unlock(&c->lock); + } + } else if (arena == N_ARENA) { + struct region_allocator *ra = ro.region_allocator; + mutex_lock(&ra->lock); + info.hblkhd = ra->allocated; + // our large allocations are roughly comparable to jemalloc huge allocations + info.uordblks = ra->allocated; + mutex_unlock(&ra->lock); + } + + thread_seal_metadata(); +#endif + + return info; +} + +// This internal Android API uses mallinfo in a non-standard way to implement malloc_info: +// +// ordblks: total allocated space +// uordblks: nmalloc +// fordblks: ndalloc +// (other fields are unused) +EXPORT struct mallinfo h_mallinfo_bin_info(UNUSED size_t arena, UNUSED size_t bin) { + struct mallinfo info = {0}; + +#if CONFIG_STATS + if (unlikely(!is_init())) { + return info; + } + + if (arena < N_ARENA && bin < N_SIZE_CLASSES) { + thread_seal_metadata(); + + struct size_class *c = &ro.size_class_metadata[arena][bin]; + + mutex_lock(&c->lock); + info.ordblks = c->allocated; + info.uordblks = c->nmalloc; + info.fordblks = c->ndalloc; + mutex_unlock(&c->lock); + + thread_unseal_metadata(); + } +#endif + + return info; +} + +COLD EXPORT int h_malloc_iterate(UNUSED uintptr_t base, UNUSED size_t size, + UNUSED void (*callback)(uintptr_t ptr, size_t size, void *arg), + UNUSED void *arg) { + fatal_error("not implemented"); +} + +COLD EXPORT void h_malloc_disable(void) { + init(); + full_lock(); +} + +COLD EXPORT void h_malloc_enable(void) { + enforce_init(); + full_unlock(); +} +#endif + +#ifdef __GLIBC__ +COLD EXPORT void *h_malloc_get_state(void) { + errno = ENOSYS; + return NULL; +} + +COLD EXPORT int h_malloc_set_state(UNUSED void *state) { + return -2; +} +#endif + +#ifdef __ANDROID__ +COLD EXPORT void h_malloc_disable_memory_tagging(void) { +#ifdef HAS_ARM_MTE + 
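
Aside on the function body that follows: like the end of init_slow_path() earlier, it relies on keeping the global allocator state mapped read-only and only briefly making it writable for an update. The Linux-only sketch below shows the underlying mprotect pattern; the config struct and its padding are invented for the example, and the real code goes through memory_protect_ro/memory_protect_rw (with a pkey when available) instead of raw mprotect calls:

    /* illustration only; assumes Linux, GCC/Clang attributes and 4 KiB pages */
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    #define EXAMPLE_PAGE_SIZE 4096

    /* page-aligned and padded to a full page so mprotect() does not change the
     * protection of unrelated data sharing the page */
    static struct {
        int initialized;
        char name[32];
        char padding[EXAMPLE_PAGE_SIZE - sizeof(int) - 32];
    } config __attribute__((aligned(EXAMPLE_PAGE_SIZE)));

    static void seal(void) {
        if (mprotect(&config, sizeof(config), PROT_READ)) {
            perror("mprotect");
        }
    }

    static void unseal(void) {
        if (mprotect(&config, sizeof(config), PROT_READ | PROT_WRITE)) {
            perror("mprotect");
        }
    }

    int main(void) {
        config.initialized = 1;
        strcpy(config.name, "example");
        seal();                       /* reads keep working while sealed */
        printf("%s\n", config.name);
        unseal();                     /* writes need the pages writable again */
        config.initialized = 2;
        seal();
        return 0;
    }
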
mutex_lock(&init_lock); + if (!ro.is_memtag_disabled) { + if (is_init()) { + if (unlikely(memory_protect_rw(&ro, sizeof(ro)))) { + fatal_error("failed to unprotect allocator data"); + } + ro.is_memtag_disabled = true; + if (unlikely(memory_protect_ro(&ro, sizeof(ro)))) { + fatal_error("failed to protect allocator data"); + } + } else { + // bionic calls this function very early in some cases + ro.is_memtag_disabled = true; + } + } + mutex_unlock(&init_lock); +#endif +} +#endif diff --git a/src/hardened_malloc/include/h_malloc.h b/src/hardened_malloc/include/h_malloc.h new file mode 100644 index 0000000..0eee395 --- /dev/null +++ b/src/hardened_malloc/include/h_malloc.h @@ -0,0 +1,129 @@ +#ifndef ALLOCATOR_H +#define ALLOCATOR_H + +#include + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef H_MALLOC_PREFIX +#define h_malloc malloc +#define h_calloc calloc +#define h_realloc realloc +#define h_aligned_alloc aligned_alloc +#define h_free free + +#define h_posix_memalign posix_memalign + +#define h_malloc_usable_size malloc_usable_size +#define h_mallopt mallopt +#define h_malloc_trim malloc_trim +#define h_malloc_stats malloc_stats +#define h_mallinfo mallinfo +#define h_mallinfo2 mallinfo2 +#define h_malloc_info malloc_info + +#define h_memalign memalign +#define h_valloc valloc +#define h_pvalloc pvalloc +#define h_cfree cfree +#define h_malloc_get_state malloc_get_state +#define h_malloc_set_state malloc_set_state + +#define h_mallinfo_narenas mallinfo_narenas +#define h_mallinfo_nbins mallinfo_nbins +#define h_mallinfo_arena_info mallinfo_arena_info +#define h_mallinfo_bin_info mallinfo_bin_info + +#define h_malloc_iterate malloc_iterate +#define h_malloc_disable malloc_disable +#define h_malloc_enable malloc_enable + +#define h_malloc_object_size malloc_object_size +#define h_malloc_object_size_fast malloc_object_size_fast +#define h_free_sized free_sized +#endif + +// C standard +__attribute__((malloc)) __attribute__((alloc_size(1))) void *h_malloc(size_t size); +__attribute__((malloc)) __attribute__((alloc_size(1, 2))) void *h_calloc(size_t nmemb, size_t size); +__attribute__((alloc_size(2))) void *h_realloc(void *ptr, size_t size); +__attribute__((malloc)) __attribute__((alloc_size(2))) __attribute__((alloc_align(1))) +void *h_aligned_alloc(size_t alignment, size_t size); +void h_free(void *ptr); + +// POSIX +int h_posix_memalign(void **memptr, size_t alignment, size_t size); + +#ifdef __ANDROID__ +#define H_MALLOC_USABLE_SIZE_CONST const +#else +#define H_MALLOC_USABLE_SIZE_CONST +#endif + +// glibc extensions +size_t h_malloc_usable_size(H_MALLOC_USABLE_SIZE_CONST void *ptr); +int h_mallopt(int param, int value); +int h_malloc_trim(size_t pad); +void h_malloc_stats(void); +#if defined(__GLIBC__) || defined(__ANDROID__) +struct mallinfo h_mallinfo(void); +#endif +#ifndef __ANDROID__ +int h_malloc_info(int options, FILE *fp); +#endif + +// obsolete glibc extensions +__attribute__((malloc)) __attribute__((alloc_size(2))) __attribute__((alloc_align(1))) +void *h_memalign(size_t alignment, size_t size); +#ifndef __ANDROID__ +__attribute__((malloc)) __attribute__((alloc_size(1))) void *h_valloc(size_t size); +__attribute__((malloc)) void *h_pvalloc(size_t size); +#endif +#ifdef __GLIBC__ +void h_cfree(void *ptr) __THROW; +void *h_malloc_get_state(void); +int h_malloc_set_state(void *state); +#endif + +// Android extensions +#ifdef __ANDROID__ +size_t h_mallinfo_narenas(void); +size_t h_mallinfo_nbins(void); +struct mallinfo h_mallinfo_arena_info(size_t arena); +struct 
mallinfo h_mallinfo_bin_info(size_t arena, size_t bin); +int h_malloc_iterate(uintptr_t base, size_t size, void (*callback)(uintptr_t ptr, size_t size, void *arg), + void *arg); +void h_malloc_disable(void); +void h_malloc_enable(void); +void h_malloc_disable_memory_tagging(void); +#endif + +// hardened_malloc extensions + +// return an upper bound on object size for any pointer based on malloc metadata +size_t h_malloc_object_size(const void *ptr); + +// similar to malloc_object_size, but avoiding locking so the results are much more limited +size_t h_malloc_object_size_fast(const void *ptr); + +// The free function with an extra parameter for passing the size requested at +// allocation time. +// +// This offers the same functionality as C++14 sized deallocation and can be +// used to implement it. +// +// A performance-oriented allocator would use this as a performance +// enhancement with undefined behavior on a mismatch. Instead, this hardened +// allocator implementation uses it to improve security by checking that the +// passed size matches the allocated size. +void h_free_sized(void *ptr, size_t expected_size); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/hardened_malloc/memory.c b/src/hardened_malloc/memory.c new file mode 100644 index 0000000..5434060 --- /dev/null +++ b/src/hardened_malloc/memory.c @@ -0,0 +1,120 @@ +#include + +#include + +#ifdef LABEL_MEMORY +#include +#endif + +#ifndef PR_SET_VMA +#define PR_SET_VMA 0x53564d41 +#endif + +#ifndef PR_SET_VMA_ANON_NAME +#define PR_SET_VMA_ANON_NAME 0 +#endif + +#include "memory.h" +#include "util.h" + +void *memory_map(size_t size) { + void *p = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + if (unlikely(p == MAP_FAILED)) { + if (errno != ENOMEM) { + fatal_error("non-ENOMEM mmap failure"); + } + return NULL; + } + return p; +} + +#ifdef HAS_ARM_MTE +// Note that PROT_MTE can't be cleared via mprotect +void *memory_map_mte(size_t size) { + void *p = mmap(NULL, size, PROT_MTE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + if (unlikely(p == MAP_FAILED)) { + if (errno != ENOMEM) { + fatal_error("non-ENOMEM MTE mmap failure"); + } + return NULL; + } + return p; +} +#endif + +bool memory_map_fixed(void *ptr, size_t size) { + void *p = mmap(ptr, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0); + bool ret = p == MAP_FAILED; + if (unlikely(ret) && errno != ENOMEM) { + fatal_error("non-ENOMEM MAP_FIXED mmap failure"); + } + return ret; +} + +bool memory_unmap(void *ptr, size_t size) { + bool ret = munmap(ptr, size); + if (unlikely(ret) && errno != ENOMEM) { + fatal_error("non-ENOMEM munmap failure"); + } + return ret; +} + +static bool memory_protect_prot(void *ptr, size_t size, int prot, UNUSED int pkey) { +#ifdef USE_PKEY + bool ret = pkey_mprotect(ptr, size, prot, pkey); +#else + bool ret = mprotect(ptr, size, prot); +#endif + if (unlikely(ret) && errno != ENOMEM) { + fatal_error("non-ENOMEM mprotect failure"); + } + return ret; +} + +bool memory_protect_ro(void *ptr, size_t size) { + return memory_protect_prot(ptr, size, PROT_READ, -1); +} + +bool memory_protect_rw(void *ptr, size_t size) { + return memory_protect_prot(ptr, size, PROT_READ|PROT_WRITE, -1); +} + +bool memory_protect_rw_metadata(void *ptr, size_t size) { + return memory_protect_prot(ptr, size, PROT_READ|PROT_WRITE, get_metadata_key()); +} + +#ifdef HAVE_COMPATIBLE_MREMAP +bool memory_remap(void *old, size_t old_size, size_t new_size) { + void *ptr = mremap(old, old_size, new_size, 0); + bool ret = ptr == MAP_FAILED; + if 
(unlikely(ret) && errno != ENOMEM) { + fatal_error("non-ENOMEM mremap failure"); + } + return ret; +} + +bool memory_remap_fixed(void *old, size_t old_size, void *new, size_t new_size) { + void *ptr = mremap(old, old_size, new_size, MREMAP_MAYMOVE|MREMAP_FIXED, new); + bool ret = ptr == MAP_FAILED; + if (unlikely(ret) && errno != ENOMEM) { + fatal_error("non-ENOMEM MREMAP_FIXED mremap failure"); + } + return ret; +} +#endif + +bool memory_purge(void *ptr, size_t size) { + int ret = madvise(ptr, size, MADV_DONTNEED); + if (unlikely(ret) && errno != ENOMEM) { + fatal_error("non-ENOMEM MADV_DONTNEED madvise failure"); + } + return ret; +} + +bool memory_set_name(UNUSED void *ptr, UNUSED size_t size, UNUSED const char *name) { +#ifdef LABEL_MEMORY + return prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ptr, size, name); +#else + return false; +#endif +} diff --git a/src/hardened_malloc/memory.h b/src/hardened_malloc/memory.h new file mode 100644 index 0000000..6e4cd4d --- /dev/null +++ b/src/hardened_malloc/memory.h @@ -0,0 +1,29 @@ +#ifndef MEMORY_H +#define MEMORY_H + +#include +#include + +#ifdef __linux__ +#define HAVE_COMPATIBLE_MREMAP +#endif + +int get_metadata_key(void); + +void *memory_map(size_t size); +#ifdef HAS_ARM_MTE +void *memory_map_mte(size_t size); +#endif +bool memory_map_fixed(void *ptr, size_t size); +bool memory_unmap(void *ptr, size_t size); +bool memory_protect_ro(void *ptr, size_t size); +bool memory_protect_rw(void *ptr, size_t size); +bool memory_protect_rw_metadata(void *ptr, size_t size); +#ifdef HAVE_COMPATIBLE_MREMAP +bool memory_remap(void *old, size_t old_size, size_t new_size); +bool memory_remap_fixed(void *old, size_t old_size, void *new, size_t new_size); +#endif +bool memory_purge(void *ptr, size_t size); +bool memory_set_name(void *ptr, size_t size, const char *name); + +#endif diff --git a/src/hardened_malloc/memtag.h b/src/hardened_malloc/memtag.h new file mode 100644 index 0000000..0ba4cbc --- /dev/null +++ b/src/hardened_malloc/memtag.h @@ -0,0 +1,49 @@ +#ifndef MEMTAG_H +#define MEMTAG_H + +#include "util.h" + +#ifdef HAS_ARM_MTE +#include "arm_mte.h" +#define MEMTAG 1 +#define RESERVED_TAG 15 +#define TAG_WIDTH 4 +#endif + +static inline void *untag_pointer(void *ptr) { +#ifdef HAS_ARM_MTE + const uintptr_t mask = UINTPTR_MAX >> 8; + return (void *) ((uintptr_t) ptr & mask); +#else + return ptr; +#endif +} + +static inline const void *untag_const_pointer(const void *ptr) { +#ifdef HAS_ARM_MTE + const uintptr_t mask = UINTPTR_MAX >> 8; + return (const void *) ((uintptr_t) ptr & mask); +#else + return ptr; +#endif +} + +static inline void *set_pointer_tag(void *ptr, u8 tag) { +#ifdef HAS_ARM_MTE + return (void *) (((uintptr_t) tag << 56) | (uintptr_t) untag_pointer(ptr)); +#else + (void) tag; + return ptr; +#endif +} + +static inline u8 get_pointer_tag(void *ptr) { +#ifdef HAS_ARM_MTE + return (((uintptr_t) ptr) >> 56) & 0xf; +#else + (void) ptr; + return 0; +#endif +} + +#endif diff --git a/src/hardened_malloc/mutex.h b/src/hardened_malloc/mutex.h new file mode 100644 index 0000000..b8f77f9 --- /dev/null +++ b/src/hardened_malloc/mutex.h @@ -0,0 +1,28 @@ +#ifndef MUTEX_H +#define MUTEX_H + +#include + +#include "util.h" + +struct mutex { + pthread_mutex_t lock; +}; + +#define MUTEX_INITIALIZER (struct mutex){PTHREAD_MUTEX_INITIALIZER} + +static inline void mutex_init(struct mutex *m) { + if (unlikely(pthread_mutex_init(&m->lock, NULL))) { + fatal_error("mutex initialization failed"); + } +} + +static inline void mutex_lock(struct mutex *m) { + 
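
Aside on the pointer-tag helpers in memtag.h above: the tag occupies the top byte of the address (four bits used) and is masked off before the pointer is dereferenced or compared. The sketch below reproduces only the bit manipulation, assuming a 64-bit platform, and does not enable real MTE, so the tag here is purely decorative:

    /* illustration only; assumes 64-bit pointers, no actual MTE involved */
    #include <stdint.h>
    #include <stdio.h>

    static void *untag(void *ptr) {
        return (void *)((uintptr_t)ptr & (UINTPTR_MAX >> 8)); /* clear top byte */
    }

    static void *set_tag(void *ptr, uint8_t tag) {
        return (void *)(((uintptr_t)tag << 56) | (uintptr_t)untag(ptr));
    }

    static uint8_t get_tag(void *ptr) {
        return (uint8_t)(((uintptr_t)ptr >> 56) & 0xf);
    }

    int main(void) {
        int x = 42;
        void *tagged = set_tag(&x, 7);
        printf("tag: %u\n", (unsigned)get_tag(tagged));
        /* dereferencing a tagged pointer is only valid with TBI/MTE; elsewhere
         * the tag must be stripped first */
        printf("value: %d\n", *(int *)untag(tagged));
        return 0;
    }
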
pthread_mutex_lock(&m->lock); +} + +static inline void mutex_unlock(struct mutex *m) { + pthread_mutex_unlock(&m->lock); +} + +#endif diff --git a/src/hardened_malloc/new.cc b/src/hardened_malloc/new.cc new file mode 100644 index 0000000..165e19e --- /dev/null +++ b/src/hardened_malloc/new.cc @@ -0,0 +1,153 @@ +// needed with libstdc++ but not libc++ +#if __has_include() +#include +#endif + +#include + +#include "h_malloc.h" +#include "util.h" + +COLD static void *handle_out_of_memory(size_t size, bool nothrow) { + void *ptr = nullptr; + + do { + std::new_handler handler = std::get_new_handler(); + if (handler == nullptr) { + break; + } + + try { + handler(); + } catch (const std::bad_alloc &) { + break; + } + + ptr = h_malloc(size); + } while (ptr == nullptr); + + if (ptr == nullptr && !nothrow) { + std::__throw_bad_alloc(); + } + return ptr; +} + +static inline void *new_impl(size_t size, bool nothrow) { + void *ptr = h_malloc(size); + if (likely(ptr != nullptr)) { + return ptr; + } + return handle_out_of_memory(size, nothrow); +} + +EXPORT void *operator new(size_t size) { + return new_impl(size, false); +} + +EXPORT void *operator new[](size_t size) { + return new_impl(size, false); +} + +EXPORT void *operator new(size_t size, const std::nothrow_t &) noexcept { + return new_impl(size, true); +} + +EXPORT void *operator new[](size_t size, const std::nothrow_t &) noexcept { + return new_impl(size, true); +} + +EXPORT void operator delete(void *ptr) noexcept { + h_free(ptr); +} + +EXPORT void operator delete[](void *ptr) noexcept { + h_free(ptr); +} + +EXPORT void operator delete(void *ptr, const std::nothrow_t &) noexcept { + h_free(ptr); +} + +EXPORT void operator delete[](void *ptr, const std::nothrow_t &) noexcept { + h_free(ptr); +} + +EXPORT void operator delete(void *ptr, size_t size) noexcept { + h_free_sized(ptr, size); +} + +EXPORT void operator delete[](void *ptr, size_t size) noexcept { + h_free_sized(ptr, size); +} + +COLD static void *handle_out_of_memory(size_t size, size_t alignment, bool nothrow) { + void *ptr = nullptr; + + do { + std::new_handler handler = std::get_new_handler(); + if (handler == nullptr) { + break; + } + + try { + handler(); + } catch (const std::bad_alloc &) { + break; + } + + ptr = h_aligned_alloc(alignment, size); + } while (ptr == nullptr); + + if (ptr == nullptr && !nothrow) { + std::__throw_bad_alloc(); + } + return ptr; +} + +static inline void *new_impl(size_t size, size_t alignment, bool nothrow) { + void *ptr = h_aligned_alloc(alignment, size); + if (likely(ptr != nullptr)) { + return ptr; + } + return handle_out_of_memory(size, alignment, nothrow); +} + +EXPORT void *operator new(size_t size, std::align_val_t alignment) { + return new_impl(size, static_cast(alignment), false); +} + +EXPORT void *operator new[](size_t size, std::align_val_t alignment) { + return new_impl(size, static_cast(alignment), false); +} + +EXPORT void *operator new(size_t size, std::align_val_t alignment, const std::nothrow_t &) noexcept { + return new_impl(size, static_cast(alignment), true); +} + +EXPORT void *operator new[](size_t size, std::align_val_t alignment, const std::nothrow_t &) noexcept { + return new_impl(size, static_cast(alignment), true); +} + +EXPORT void operator delete(void *ptr, std::align_val_t) noexcept { + h_free(ptr); +} + +EXPORT void operator delete[](void *ptr, std::align_val_t) noexcept { + h_free(ptr); +} + +EXPORT void operator delete(void *ptr, std::align_val_t, const std::nothrow_t &) noexcept { + h_free(ptr); +} + +EXPORT void 
operator delete[](void *ptr, std::align_val_t, const std::nothrow_t &) noexcept { + h_free(ptr); +} + +EXPORT void operator delete(void *ptr, size_t size, std::align_val_t) noexcept { + h_free_sized(ptr, size); +} + +EXPORT void operator delete[](void *ptr, size_t size, std::align_val_t) noexcept { + h_free_sized(ptr, size); +} diff --git a/src/hardened_malloc/pages.c b/src/hardened_malloc/pages.c new file mode 100644 index 0000000..27558de --- /dev/null +++ b/src/hardened_malloc/pages.c @@ -0,0 +1,88 @@ +#include + +#include "memory.h" +#include "pages.h" +#include "util.h" + +static bool add_guards(size_t size, size_t guard_size, size_t *total_size) { + return __builtin_add_overflow(size, guard_size, total_size) || + __builtin_add_overflow(*total_size, guard_size, total_size); +} + +void *allocate_pages(size_t usable_size, size_t guard_size, bool unprotect, const char *name) { + size_t real_size; + if (unlikely(add_guards(usable_size, guard_size, &real_size))) { + errno = ENOMEM; + return NULL; + } + void *real = memory_map(real_size); + if (unlikely(real == NULL)) { + return NULL; + } + memory_set_name(real, real_size, name); + void *usable = (char *)real + guard_size; + if (unprotect && unlikely(memory_protect_rw(usable, usable_size))) { + memory_unmap(real, real_size); + return NULL; + } + return usable; +} + +void *allocate_pages_aligned(size_t usable_size, size_t alignment, size_t guard_size, const char *name) { + usable_size = page_align(usable_size); + if (unlikely(!usable_size)) { + errno = ENOMEM; + return NULL; + } + + size_t alloc_size; + if (unlikely(__builtin_add_overflow(usable_size, alignment - PAGE_SIZE, &alloc_size))) { + errno = ENOMEM; + return NULL; + } + + size_t real_alloc_size; + if (unlikely(add_guards(alloc_size, guard_size, &real_alloc_size))) { + errno = ENOMEM; + return NULL; + } + + void *real = memory_map(real_alloc_size); + if (unlikely(real == NULL)) { + return NULL; + } + memory_set_name(real, real_alloc_size, name); + + void *usable = (char *)real + guard_size; + + size_t lead_size = align((uintptr_t)usable, alignment) - (uintptr_t)usable; + size_t trail_size = alloc_size - lead_size - usable_size; + void *base = (char *)usable + lead_size; + + if (unlikely(memory_protect_rw(base, usable_size))) { + memory_unmap(real, real_alloc_size); + return NULL; + } + + if (lead_size) { + if (unlikely(memory_unmap(real, lead_size))) { + memory_unmap(real, real_alloc_size); + return NULL; + } + } + + if (trail_size) { + if (unlikely(memory_unmap((char *)base + usable_size + guard_size, trail_size))) { + memory_unmap(real, real_alloc_size); + return NULL; + } + } + + return base; +} + +void deallocate_pages(void *usable, size_t usable_size, size_t guard_size) { + if (unlikely(memory_unmap((char *)usable - guard_size, usable_size + guard_size * 2))) { + memory_purge(usable, usable_size); + } +} diff --git a/src/hardened_malloc/pages.h b/src/hardened_malloc/pages.h new file mode 100644 index 0000000..8795ddc --- /dev/null +++ b/src/hardened_malloc/pages.h @@ -0,0 +1,32 @@ +#ifndef PAGES_H +#define PAGES_H + +#include +#include +#include + +#include "util.h" + +#define PAGE_SHIFT 12 +#ifndef PAGE_SIZE +#define PAGE_SIZE ((size_t)1 << PAGE_SHIFT) +#endif + +void *allocate_pages(size_t usable_size, size_t guard_size, bool unprotect, const char *name); +void *allocate_pages_aligned(size_t usable_size, size_t alignment, size_t guard_size, const char *name); +void deallocate_pages(void *usable, size_t usable_size, size_t guard_size); + +static inline size_t page_align(size_t 
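
Aside on allocate_pages() above: every usable mapping is surrounded by PROT_NONE guard regions, so linear overflows and underflows fault instead of silently corrupting adjacent data. A reduced Linux-only sketch of that layout; the overflow checks, random guard sizing and mapping names used by the real code are omitted:

    /* illustration only; assumes Linux/POSIX mmap */
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static void *map_with_guards(size_t usable_size, size_t guard_size) {
        size_t real_size = usable_size + 2 * guard_size; /* no overflow check here */
        char *real = mmap(NULL, real_size, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
        if (real == MAP_FAILED) {
            return NULL;
        }
        char *usable = real + guard_size;
        if (mprotect(usable, usable_size, PROT_READ | PROT_WRITE)) {
            munmap(real, real_size);
            return NULL;
        }
        return usable;
    }

    static void unmap_with_guards(void *usable, size_t usable_size, size_t guard_size) {
        munmap((char *)usable - guard_size, usable_size + 2 * guard_size);
    }

    int main(void) {
        size_t page = (size_t)sysconf(_SC_PAGESIZE);
        char *p = map_with_guards(4 * page, page);
        if (p == NULL) {
            return 1;
        }
        memset(p, 'a', 4 * page);   /* in-bounds writes are fine */
        /* p[4 * page] = 'x'; */    /* would fault: lands in the trailing guard */
        unmap_with_guards(p, 4 * page, page);
        return 0;
    }
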
size) { + return align(size, PAGE_SIZE); +} + +static inline size_t hash_page(const void *p) { + uintptr_t u = (uintptr_t)p >> PAGE_SHIFT; + size_t sum = u; + sum = (sum << 7) - sum + (u >> 16); + sum = (sum << 7) - sum + (u >> 32); + sum = (sum << 7) - sum + (u >> 48); + return sum; +} + +#endif diff --git a/src/hardened_malloc/preload.sh b/src/hardened_malloc/preload.sh new file mode 100755 index 0000000..ee6abb6 --- /dev/null +++ b/src/hardened_malloc/preload.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +[[ $LD_PRELOAD ]] && LD_PRELOAD+=" " +export LD_PRELOAD+="$dir/libhardened_malloc.so" +exec "$@" diff --git a/src/hardened_malloc/random.c b/src/hardened_malloc/random.c new file mode 100644 index 0000000..8883531 --- /dev/null +++ b/src/hardened_malloc/random.c @@ -0,0 +1,128 @@ +#include +#include + +#include "chacha.h" +#include "random.h" +#include "util.h" + +#include + +static void get_random_seed(void *buf, size_t size) { + while (size) { + ssize_t r; + + do { + r = getrandom(buf, size, 0); + } while (r == -1 && errno == EINTR); + + if (r <= 0) { + fatal_error("getrandom failed"); + } + + buf = (char *)buf + r; + size -= r; + } +} + +void random_state_init(struct random_state *state) { + u8 rnd[CHACHA_KEY_SIZE + CHACHA_IV_SIZE]; + get_random_seed(rnd, sizeof(rnd)); + chacha_keysetup(&state->ctx, rnd); + chacha_ivsetup(&state->ctx, rnd + CHACHA_KEY_SIZE); + state->index = RANDOM_CACHE_SIZE; + state->reseed = 0; +} + +void random_state_init_from_random_state(struct random_state *state, struct random_state *source) { + u8 rnd[CHACHA_KEY_SIZE + CHACHA_IV_SIZE]; + get_random_bytes(source, rnd, sizeof(rnd)); + chacha_keysetup(&state->ctx, rnd); + chacha_ivsetup(&state->ctx, rnd + CHACHA_KEY_SIZE); + state->index = RANDOM_CACHE_SIZE; + state->reseed = 0; +} + +static void refill(struct random_state *state) { + if (state->reseed >= RANDOM_RESEED_SIZE) { + random_state_init(state); + } + chacha_keystream_bytes(&state->ctx, state->cache, RANDOM_CACHE_SIZE); + state->index = 0; + state->reseed += RANDOM_CACHE_SIZE; +} + +void get_random_bytes(struct random_state *state, void *buf, size_t size) { + // avoid needless copying to and from the cache as an optimization + if (size > RANDOM_CACHE_SIZE / 2) { + chacha_keystream_bytes(&state->ctx, buf, size); + return; + } + + while (size) { + if (state->index == RANDOM_CACHE_SIZE) { + refill(state); + } + + size_t remaining = RANDOM_CACHE_SIZE - state->index; + size_t copy_size = min(size, remaining); + memcpy(buf, state->cache + state->index, copy_size); + state->index += copy_size; + + buf = (char *)buf + copy_size; + size -= copy_size; + } +} + +u16 get_random_u16(struct random_state *state) { + u16 value; + unsigned remaining = RANDOM_CACHE_SIZE - state->index; + if (remaining < sizeof(value)) { + refill(state); + } + memcpy(&value, state->cache + state->index, sizeof(value)); + state->index += sizeof(value); + return value; +} + +// See Fast Random Integer Generation in an Interval by Daniel Lemire +u16 get_random_u16_uniform(struct random_state *state, u16 bound) { + u32 random = get_random_u16(state); + u32 multiresult = random * bound; + u16 leftover = multiresult; + if (leftover < bound) { + u16 threshold = -bound % bound; + while (leftover < threshold) { + random = get_random_u16(state); + multiresult = random * bound; + leftover = (u16)multiresult; + } + } + return multiresult >> 16; +} + +u64 get_random_u64(struct random_state *state) { + u64 value; + unsigned remaining = RANDOM_CACHE_SIZE - 
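
Aside on get_random_u16_uniform and get_random_u64_uniform above: they use Lemire's multiply-and-shift method to map a uniform word to [0, bound) without the bias of a plain modulo, rejecting only the small range of values below 2^w mod bound. The sketch below shows the 16-bit variant with a throwaway xorshift source standing in for the allocator's ChaCha-based stream; bound must be nonzero:

    /* illustration only; the source PRNG here is for the demo, not the allocator's */
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t rng_state = 0x12345678u;

    static uint16_t random_u16(void) {
        /* xorshift32; any uniform 16-bit source works */
        rng_state ^= rng_state << 13;
        rng_state ^= rng_state >> 17;
        rng_state ^= rng_state << 5;
        return (uint16_t)rng_state;
    }

    static uint16_t random_u16_uniform(uint16_t bound) {
        uint32_t random = random_u16();
        uint32_t multiresult = random * bound;
        uint16_t leftover = (uint16_t)multiresult;
        if (leftover < bound) {
            /* reject values below 2^16 mod bound to remove the bias */
            uint16_t threshold = (uint16_t)(65536u % bound);
            while (leftover < threshold) {
                random = random_u16();
                multiresult = random * bound;
                leftover = (uint16_t)multiresult;
            }
        }
        return (uint16_t)(multiresult >> 16);
    }

    int main(void) {
        unsigned counts[3] = {0};
        for (int i = 0; i < 30000; i++) {
            counts[random_u16_uniform(3)]++;
        }
        printf("%u %u %u\n", counts[0], counts[1], counts[2]);
        return 0;
    }
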
state->index; + if (remaining < sizeof(value)) { + refill(state); + } + memcpy(&value, state->cache + state->index, sizeof(value)); + state->index += sizeof(value); + return value; +} + +// See Fast Random Integer Generation in an Interval by Daniel Lemire +u64 get_random_u64_uniform(struct random_state *state, u64 bound) { + u128 random = get_random_u64(state); + u128 multiresult = random * bound; + u64 leftover = multiresult; + if (leftover < bound) { + u64 threshold = -bound % bound; + while (leftover < threshold) { + random = get_random_u64(state); + multiresult = random * bound; + leftover = multiresult; + } + } + return multiresult >> 64; +} diff --git a/src/hardened_malloc/random.h b/src/hardened_malloc/random.h new file mode 100644 index 0000000..14703bb --- /dev/null +++ b/src/hardened_malloc/random.h @@ -0,0 +1,25 @@ +#ifndef RANDOM_H +#define RANDOM_H + +#include "chacha.h" +#include "util.h" + +#define RANDOM_CACHE_SIZE 256U +#define RANDOM_RESEED_SIZE (256U * 1024) + +struct random_state { + unsigned index; + unsigned reseed; + chacha_ctx ctx; + u8 cache[RANDOM_CACHE_SIZE]; +}; + +void random_state_init(struct random_state *state); +void random_state_init_from_random_state(struct random_state *state, struct random_state *source); +void get_random_bytes(struct random_state *state, void *buf, size_t size); +u16 get_random_u16(struct random_state *state); +u16 get_random_u16_uniform(struct random_state *state, u16 bound); +u64 get_random_u64(struct random_state *state); +u64 get_random_u64_uniform(struct random_state *state, u64 bound); + +#endif diff --git a/src/hardened_malloc/test/.gitignore b/src/hardened_malloc/test/.gitignore new file mode 100644 index 0000000..d37a6a7 --- /dev/null +++ b/src/hardened_malloc/test/.gitignore @@ -0,0 +1,44 @@ +large_array_growth +mallinfo +mallinfo2 +malloc_info +offset +delete_type_size_mismatch +double_free_large +double_free_large_delayed +double_free_small +double_free_small_delayed +invalid_free_protected +invalid_free_small_region +invalid_free_small_region_far +invalid_free_unprotected +read_after_free_large +read_after_free_small +read_zero_size +string_overflow +unaligned_free_large +unaligned_free_small +uninitialized_free +uninitialized_malloc_usable_size +uninitialized_realloc +write_after_free_large +write_after_free_large_reuse +write_after_free_small +write_after_free_small_reuse +write_zero_size +unaligned_malloc_usable_size_small +invalid_malloc_usable_size_small +invalid_malloc_usable_size_small_quarantine +malloc_object_size +malloc_object_size_offset +invalid_malloc_object_size_small +invalid_malloc_object_size_small_quarantine +impossibly_large_malloc +overflow_large_1_byte +overflow_large_8_byte +overflow_small_1_byte +overflow_small_8_byte +uninitialized_read_large +uninitialized_read_small +realloc_init +__pycache__/ diff --git a/src/hardened_malloc/test/Makefile b/src/hardened_malloc/test/Makefile new file mode 100644 index 0000000..0eb3921 --- /dev/null +++ b/src/hardened_malloc/test/Makefile @@ -0,0 +1,76 @@ +CONFIG_SLAB_CANARY := true +CONFIG_EXTENDED_SIZE_CLASSES := true + +ifneq ($(VARIANT),) + $(error testing non-default variants not yet supported) +endif + +ifeq (,$(filter $(CONFIG_SLAB_CANARY),true false)) + $(error CONFIG_SLAB_CANARY must be true or false) +endif + +dir=$(dir $(realpath $(firstword $(MAKEFILE_LIST)))) + +CPPFLAGS := \ + -D_GNU_SOURCE \ + -DSLAB_CANARY=$(CONFIG_SLAB_CANARY) \ + -DCONFIG_EXTENDED_SIZE_CLASSES=$(CONFIG_EXTENDED_SIZE_CLASSES) + +SHARED_FLAGS := -O3 + +CFLAGS := -std=c17 
$(SHARED_FLAGS) -Wmissing-prototypes +CXXFLAGS := -std=c++17 -fsized-deallocation $(SHARED_FLAGS) +LDFLAGS := -Wl,-L$(dir)../out,-R,$(dir)../out + +LDLIBS := -lpthread -lhardened_malloc + +EXECUTABLES := \ + offset \ + mallinfo \ + mallinfo2 \ + malloc_info \ + large_array_growth \ + double_free_large \ + double_free_large_delayed \ + double_free_small \ + double_free_small_delayed \ + unaligned_free_large \ + unaligned_free_small \ + read_after_free_large \ + read_after_free_small \ + write_after_free_large \ + write_after_free_large_reuse \ + write_after_free_small \ + write_after_free_small_reuse \ + read_zero_size \ + write_zero_size \ + invalid_free_protected \ + invalid_free_unprotected \ + invalid_free_small_region \ + invalid_free_small_region_far \ + uninitialized_read_small \ + uninitialized_read_large \ + uninitialized_free \ + uninitialized_realloc \ + uninitialized_malloc_usable_size \ + overflow_large_1_byte \ + overflow_large_8_byte \ + overflow_small_1_byte \ + overflow_small_8_byte \ + string_overflow \ + delete_type_size_mismatch \ + unaligned_malloc_usable_size_small \ + invalid_malloc_usable_size_small \ + invalid_malloc_usable_size_small_quarantine \ + malloc_object_size \ + malloc_object_size_offset \ + invalid_malloc_object_size_small \ + invalid_malloc_object_size_small_quarantine \ + impossibly_large_malloc \ + realloc_init + +all: $(EXECUTABLES) + +clean: + rm -f $(EXECUTABLES) + rm -fr ./__pycache__ diff --git a/src/hardened_malloc/test/__init__.py b/src/hardened_malloc/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/hardened_malloc/test/delete_type_size_mismatch.cc b/src/hardened_malloc/test/delete_type_size_mismatch.cc new file mode 100644 index 0000000..92bb374 --- /dev/null +++ b/src/hardened_malloc/test/delete_type_size_mismatch.cc @@ -0,0 +1,14 @@ +#include + +#include "test_util.h" + +struct foo { + uint64_t a, b, c, d; +}; + +OPTNONE int main(void) { + void *p = new char; + struct foo *c = (struct foo *)p; + delete c; + return 0; +} diff --git a/src/hardened_malloc/test/double_free_large.c b/src/hardened_malloc/test/double_free_large.c new file mode 100644 index 0000000..ee740e1 --- /dev/null +++ b/src/hardened_malloc/test/double_free_large.c @@ -0,0 +1,13 @@ +#include + +#include "test_util.h" + +OPTNONE int main(void) { + void *p = malloc(256 * 1024); + if (!p) { + return 1; + } + free(p); + free(p); + return 0; +} diff --git a/src/hardened_malloc/test/double_free_large_delayed.c b/src/hardened_malloc/test/double_free_large_delayed.c new file mode 100644 index 0000000..232a812 --- /dev/null +++ b/src/hardened_malloc/test/double_free_large_delayed.c @@ -0,0 +1,18 @@ +#include + +#include "test_util.h" + +OPTNONE int main(void) { + void *p = malloc(256 * 1024); + if (!p) { + return 1; + } + void *q = malloc(256 * 1024); + if (!q) { + return 1; + } + free(p); + free(q); + free(p); + return 0; +} diff --git a/src/hardened_malloc/test/double_free_small.c b/src/hardened_malloc/test/double_free_small.c new file mode 100644 index 0000000..94ab0ba --- /dev/null +++ b/src/hardened_malloc/test/double_free_small.c @@ -0,0 +1,13 @@ +#include + +#include "test_util.h" + +OPTNONE int main(void) { + void *p = malloc(16); + if (!p) { + return 1; + } + free(p); + free(p); + return 0; +} diff --git a/src/hardened_malloc/test/double_free_small_delayed.c b/src/hardened_malloc/test/double_free_small_delayed.c new file mode 100644 index 0000000..5a9a34e --- /dev/null +++ b/src/hardened_malloc/test/double_free_small_delayed.c @@ -0,0 +1,18 @@ 
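
Aside on the double-free tests above: each program simply performs the invalid free and relies on the allocator to terminate the process, with the suite's Python driver (test_smc.py, listed earlier in this patch) checking the resulting exit status. The standalone harness below is not part of the suite; it only illustrates how such a crash can be observed from a parent process, assuming the active allocator aborts when it detects the double free (checking for a signal termination, rather than a specific exit code, is an assumption of this sketch):

    /* illustration only; not one of the suite's tests */
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void) {
        pid_t pid = fork();
        if (pid == -1) {
            return 1;
        }
        if (pid == 0) {
            void *p = malloc(16);
            if (p == NULL) {
                _exit(1);
            }
            free(p);
            free(p);   /* should be detected and terminate the child */
            _exit(0);  /* reached only if the double free went unnoticed */
        }
        int status;
        if (waitpid(pid, &status, 0) == -1) {
            return 1;
        }
        if (WIFSIGNALED(status)) {
            printf("child terminated by signal %d\n", WTERMSIG(status));
            return 0;
        }
        printf("double free was not detected\n");
        return 1;
    }
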
+#include + +#include "test_util.h" + +OPTNONE int main(void) { + void *p = malloc(16); + if (!p) { + return 1; + } + void *q = malloc(16); + if (!q) { + return 1; + } + free(p); + free(q); + free(p); + return 0; +} diff --git a/src/hardened_malloc/test/impossibly_large_malloc.c b/src/hardened_malloc/test/impossibly_large_malloc.c new file mode 100644 index 0000000..63cdc0c --- /dev/null +++ b/src/hardened_malloc/test/impossibly_large_malloc.c @@ -0,0 +1,8 @@ +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(-8); + return !(p == NULL); +} diff --git a/src/hardened_malloc/test/invalid_free_protected.c b/src/hardened_malloc/test/invalid_free_protected.c new file mode 100644 index 0000000..0364baa --- /dev/null +++ b/src/hardened_malloc/test/invalid_free_protected.c @@ -0,0 +1,15 @@ +#include + +#include + +#include "test_util.h" + +OPTNONE int main(void) { + free(malloc(16)); + char *p = mmap(NULL, 4096 * 16, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + if (p == MAP_FAILED) { + return 1; + } + free(p + 4096 * 8); + return 0; +} diff --git a/src/hardened_malloc/test/invalid_free_small_region.c b/src/hardened_malloc/test/invalid_free_small_region.c new file mode 100644 index 0000000..81cfbf2 --- /dev/null +++ b/src/hardened_malloc/test/invalid_free_small_region.c @@ -0,0 +1,13 @@ +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(16); + if (!p) { + return 1; + } + char *q = p + 4096 * 4; + free(q); + return 0; +} diff --git a/src/hardened_malloc/test/invalid_free_small_region_far.c b/src/hardened_malloc/test/invalid_free_small_region_far.c new file mode 100644 index 0000000..c35c1ba --- /dev/null +++ b/src/hardened_malloc/test/invalid_free_small_region_far.c @@ -0,0 +1,13 @@ +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(16); + if (!p) { + return 1; + } + char *q = p + 1024 * 1024 * 1024; + free(q); + return 0; +} diff --git a/src/hardened_malloc/test/invalid_free_unprotected.c b/src/hardened_malloc/test/invalid_free_unprotected.c new file mode 100644 index 0000000..26254ab --- /dev/null +++ b/src/hardened_malloc/test/invalid_free_unprotected.c @@ -0,0 +1,15 @@ +#include + +#include + +#include "test_util.h" + +OPTNONE int main(void) { + free(malloc(16)); + char *p = mmap(NULL, 4096 * 16, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + if (p == MAP_FAILED) { + return 1; + } + free(p + 4096 * 8); + return 0; +} diff --git a/src/hardened_malloc/test/invalid_malloc_object_size_small.c b/src/hardened_malloc/test/invalid_malloc_object_size_small.c new file mode 100644 index 0000000..33cc78f --- /dev/null +++ b/src/hardened_malloc/test/invalid_malloc_object_size_small.c @@ -0,0 +1,15 @@ +#include + +#include "test_util.h" + +size_t malloc_object_size(void *ptr); + +OPTNONE int main(void) { + char *p = malloc(16); + if (!p) { + return 1; + } + char *q = p + 4096 * 4; + malloc_object_size(q); + return 0; +} diff --git a/src/hardened_malloc/test/invalid_malloc_object_size_small_quarantine.c b/src/hardened_malloc/test/invalid_malloc_object_size_small_quarantine.c new file mode 100644 index 0000000..1a26bc0 --- /dev/null +++ b/src/hardened_malloc/test/invalid_malloc_object_size_small_quarantine.c @@ -0,0 +1,15 @@ +#include + +#include "test_util.h" + +size_t malloc_object_size(void *ptr); + +OPTNONE int main(void) { + void *p = malloc(16); + if (!p) { + return 1; + } + free(p); + malloc_object_size(p); + return 0; +} diff --git a/src/hardened_malloc/test/invalid_malloc_usable_size_small.c 
b/src/hardened_malloc/test/invalid_malloc_usable_size_small.c new file mode 100644 index 0000000..440aa6b --- /dev/null +++ b/src/hardened_malloc/test/invalid_malloc_usable_size_small.c @@ -0,0 +1,13 @@ +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(16); + if (!p) { + return 1; + } + char *q = p + 4096 * 4; + malloc_usable_size(q); + return 0; +} diff --git a/src/hardened_malloc/test/invalid_malloc_usable_size_small_quarantine.c b/src/hardened_malloc/test/invalid_malloc_usable_size_small_quarantine.c new file mode 100644 index 0000000..926acd7 --- /dev/null +++ b/src/hardened_malloc/test/invalid_malloc_usable_size_small_quarantine.c @@ -0,0 +1,13 @@ +#include + +#include "test_util.h" + +OPTNONE int main(void) { + void *p = malloc(16); + if (!p) { + return 1; + } + free(p); + malloc_usable_size(p); + return 0; +} diff --git a/src/hardened_malloc/test/large_array_growth.c b/src/hardened_malloc/test/large_array_growth.c new file mode 100644 index 0000000..09f89c5 --- /dev/null +++ b/src/hardened_malloc/test/large_array_growth.c @@ -0,0 +1,18 @@ +#include +#include + +#include "test_util.h" + +OPTNONE int main(void) { + void *p = NULL; + size_t size = 256 * 1024; + + for (unsigned i = 0; i < 20; i++) { + p = realloc(p, size); + if (!p) { + return 1; + } + memset(p, 'a', size); + size = size * 3 / 2; + } +} diff --git a/src/hardened_malloc/test/mallinfo.c b/src/hardened_malloc/test/mallinfo.c new file mode 100644 index 0000000..6008040 --- /dev/null +++ b/src/hardened_malloc/test/mallinfo.c @@ -0,0 +1,44 @@ +#include +#include + +#if defined(__GLIBC__) || defined(__ANDROID__) +#include +#endif + +#include "test_util.h" + +static void print_mallinfo(void) { +#if defined(__GLIBC__) || defined(__ANDROID__) + struct mallinfo info = mallinfo(); + printf("mallinfo:\n"); + printf("arena: %zu\n", (size_t)info.arena); + printf("ordblks: %zu\n", (size_t)info.ordblks); + printf("smblks: %zu\n", (size_t)info.smblks); + printf("hblks: %zu\n", (size_t)info.hblks); + printf("hblkhd: %zu\n", (size_t)info.hblkhd); + printf("usmblks: %zu\n", (size_t)info.usmblks); + printf("fsmblks: %zu\n", (size_t)info.fsmblks); + printf("uordblks: %zu\n", (size_t)info.uordblks); + printf("fordblks: %zu\n", (size_t)info.fordblks); + printf("keepcost: %zu\n", (size_t)info.keepcost); +#endif +} + +OPTNONE int main(void) { + void *a[4]; + + a[0] = malloc(1024 * 1024 * 1024); + a[1] = malloc(16); + a[2] = malloc(32); + a[3] = malloc(64); + + print_mallinfo(); + + free(a[0]); + free(a[1]); + free(a[2]); + free(a[3]); + + printf("\n"); + print_mallinfo(); +} diff --git a/src/hardened_malloc/test/mallinfo2.c b/src/hardened_malloc/test/mallinfo2.c new file mode 100644 index 0000000..2f4cd33 --- /dev/null +++ b/src/hardened_malloc/test/mallinfo2.c @@ -0,0 +1,44 @@ +#include +#include + +#if defined(__GLIBC__) +#include +#endif + +#include "test_util.h" + +static void print_mallinfo2(void) { +#if defined(__GLIBC__) + struct mallinfo2 info = mallinfo2(); + printf("mallinfo2:\n"); + printf("arena: %zu\n", (size_t)info.arena); + printf("ordblks: %zu\n", (size_t)info.ordblks); + printf("smblks: %zu\n", (size_t)info.smblks); + printf("hblks: %zu\n", (size_t)info.hblks); + printf("hblkhd: %zu\n", (size_t)info.hblkhd); + printf("usmblks: %zu\n", (size_t)info.usmblks); + printf("fsmblks: %zu\n", (size_t)info.fsmblks); + printf("uordblks: %zu\n", (size_t)info.uordblks); + printf("fordblks: %zu\n", (size_t)info.fordblks); + printf("keepcost: %zu\n", (size_t)info.keepcost); +#endif +} + +OPTNONE int main(void) { 
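    /* allocate one 1 GiB block and three small blocks, print the mallinfo2
     * counters, then free everything and print them again */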
+ void *a[4]; + + a[0] = malloc(1024 * 1024 * 1024); + a[1] = malloc(16); + a[2] = malloc(32); + a[3] = malloc(64); + + print_mallinfo2(); + + free(a[0]); + free(a[1]); + free(a[2]); + free(a[3]); + + printf("\n"); + print_mallinfo2(); +} diff --git a/src/hardened_malloc/test/malloc_info.c b/src/hardened_malloc/test/malloc_info.c new file mode 100644 index 0000000..50b256f --- /dev/null +++ b/src/hardened_malloc/test/malloc_info.c @@ -0,0 +1,35 @@ +#include +#include + +#if defined(__GLIBC__) || defined(__ANDROID__) +#include +#endif + +#include "test_util.h" +#include "../util.h" + +OPTNONE static void leak_memory(void) { + (void)!malloc(1024 * 1024 * 1024); + (void)!malloc(16); + (void)!malloc(32); + (void)!malloc(4096); +} + +static void *do_work(UNUSED void *p) { + leak_memory(); + return NULL; +} + +int main(void) { + pthread_t thread[4]; + for (int i = 0; i < 4; i++) { + pthread_create(&thread[i], NULL, do_work, NULL); + } + for (int i = 0; i < 4; i++) { + pthread_join(thread[i], NULL); + } + +#if defined(__GLIBC__) || defined(__ANDROID__) + malloc_info(0, stdout); +#endif +} diff --git a/src/hardened_malloc/test/malloc_object_size.c b/src/hardened_malloc/test/malloc_object_size.c new file mode 100644 index 0000000..5ab9280 --- /dev/null +++ b/src/hardened_malloc/test/malloc_object_size.c @@ -0,0 +1,12 @@ +#include +#include + +#include "test_util.h" + +size_t malloc_object_size(void *ptr); + +OPTNONE int main(void) { + char *p = malloc(16); + size_t size = malloc_object_size(p); + return size != (SLAB_CANARY ? 24 : 32); +} diff --git a/src/hardened_malloc/test/malloc_object_size_offset.c b/src/hardened_malloc/test/malloc_object_size_offset.c new file mode 100644 index 0000000..d605906 --- /dev/null +++ b/src/hardened_malloc/test/malloc_object_size_offset.c @@ -0,0 +1,12 @@ +#include +#include + +#include "test_util.h" + +size_t malloc_object_size(void *ptr); + +OPTNONE int main(void) { + char *p = malloc(16); + size_t size = malloc_object_size(p + 5); + return size != (SLAB_CANARY ? 19 : 27); +} diff --git a/src/hardened_malloc/test/offset.c b/src/hardened_malloc/test/offset.c new file mode 100644 index 0000000..af14f5c --- /dev/null +++ b/src/hardened_malloc/test/offset.c @@ -0,0 +1,50 @@ +#include +#include +#include +#include + +static size_t size_classes[] = { + /* large */ 4 * 1024 * 1024, + /* 0 */ 0, + /* 16 */ 16, 32, 48, 64, 80, 96, 112, 128, + /* 32 */ 160, 192, 224, 256, + /* 64 */ 320, 384, 448, 512, + /* 128 */ 640, 768, 896, 1024, + /* 256 */ 1280, 1536, 1792, 2048, + /* 512 */ 2560, 3072, 3584, 4096, + /* 1024 */ 5120, 6144, 7168, 8192, + /* 2048 */ 10240, 12288, 14336, 16384, +#if CONFIG_EXTENDED_SIZE_CLASSES + /* 4096 */ 20480, 24576, 28672, 32768, + /* 8192 */ 40960, 49152, 57344, 65536, + /* 16384 */ 81920, 98304, 114688, 131072, +#endif +}; + +#define N_SIZE_CLASSES (sizeof(size_classes) / sizeof(size_classes[0])) + +static const size_t canary_size = SLAB_CANARY ? 
sizeof(uint64_t) : 0; + +int main(void) { + for (unsigned i = 2; i < N_SIZE_CLASSES; i++) { + size_classes[i] -= canary_size; + } + + void *p[N_SIZE_CLASSES]; + for (unsigned i = 0; i < N_SIZE_CLASSES; i++) { + size_t size = size_classes[i]; + p[i] = malloc(size); + if (!p[i]) { + return 1; + } + void *q = malloc(size); + if (!q) { + return 1; + } + if (i != 0) { + printf("%zu to %zu: %zd\n", size_classes[i - 1], size, p[i] - p[i - 1]); + } + printf("%zu to %zu: %zd\n", size, size, q - p[i]); + } + return 0; +} diff --git a/src/hardened_malloc/test/overflow_large_1_byte.c b/src/hardened_malloc/test/overflow_large_1_byte.c new file mode 100644 index 0000000..a74bbfd --- /dev/null +++ b/src/hardened_malloc/test/overflow_large_1_byte.c @@ -0,0 +1,15 @@ +#include +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(256 * 1024); + if (!p) { + return 1; + } + size_t size = malloc_usable_size(p); + *(p + size) = 0; + free(p); + return 0; +} diff --git a/src/hardened_malloc/test/overflow_large_8_byte.c b/src/hardened_malloc/test/overflow_large_8_byte.c new file mode 100644 index 0000000..4c7d15c --- /dev/null +++ b/src/hardened_malloc/test/overflow_large_8_byte.c @@ -0,0 +1,15 @@ +#include +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(256 * 1024); + if (!p) { + return 1; + } + size_t size = malloc_usable_size(p); + *(p + size + 7) = 0; + free(p); + return 0; +} diff --git a/src/hardened_malloc/test/overflow_small_1_byte.c b/src/hardened_malloc/test/overflow_small_1_byte.c new file mode 100644 index 0000000..f4f60e1 --- /dev/null +++ b/src/hardened_malloc/test/overflow_small_1_byte.c @@ -0,0 +1,15 @@ +#include +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(8); + if (!p) { + return 1; + } + size_t size = malloc_usable_size(p); + *(p + size) = 1; + free(p); + return 0; +} diff --git a/src/hardened_malloc/test/overflow_small_8_byte.c b/src/hardened_malloc/test/overflow_small_8_byte.c new file mode 100644 index 0000000..4256d54 --- /dev/null +++ b/src/hardened_malloc/test/overflow_small_8_byte.c @@ -0,0 +1,16 @@ +#include +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(8); + if (!p) { + return 1; + } + size_t size = malloc_usable_size(p); + // XOR is used to avoid the test having a 1/256 chance to fail + *(p + size + 7) ^= 1; + free(p); + return 0; +} diff --git a/src/hardened_malloc/test/read_after_free_large.c b/src/hardened_malloc/test/read_after_free_large.c new file mode 100644 index 0000000..f5fa18c --- /dev/null +++ b/src/hardened_malloc/test/read_after_free_large.c @@ -0,0 +1,21 @@ +#include +#include +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(256 * 1024); + if (!p) { + return 1; + } + memset(p, 'a', 16); + free(p); + for (size_t i = 0; i < 256 * 1024; i++) { + printf("%x\n", p[i]); + if (p[i] != '\0') { + return 1; + } + } + return 0; +} diff --git a/src/hardened_malloc/test/read_after_free_small.c b/src/hardened_malloc/test/read_after_free_small.c new file mode 100644 index 0000000..2a969ab --- /dev/null +++ b/src/hardened_malloc/test/read_after_free_small.c @@ -0,0 +1,21 @@ +#include +#include +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(16); + if (!p) { + return 1; + } + memset(p, 'a', 16); + free(p); + for (size_t i = 0; i < 16; i++) { + printf("%x\n", p[i]); + if (p[i] != '\0') { + return 1; + } + } + return 0; +} diff --git a/src/hardened_malloc/test/read_zero_size.c 
b/src/hardened_malloc/test/read_zero_size.c new file mode 100644 index 0000000..53838f2 --- /dev/null +++ b/src/hardened_malloc/test/read_zero_size.c @@ -0,0 +1,13 @@ +#include +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(0); + if (!p) { + return 1; + } + printf("%c\n", *p); + return 0; +} diff --git a/src/hardened_malloc/test/realloc_init.c b/src/hardened_malloc/test/realloc_init.c new file mode 100644 index 0000000..01ec573 --- /dev/null +++ b/src/hardened_malloc/test/realloc_init.c @@ -0,0 +1,33 @@ +#include +#include + +static void *thread_func(void *arg) { + arg = realloc(arg, 1024); + if (!arg) { + exit(EXIT_FAILURE); + } + + free(arg); + + return NULL; +} + +int main(void) { + void *mem = realloc(NULL, 12); + if (!mem) { + return EXIT_FAILURE; + } + + pthread_t thread; + int r = pthread_create(&thread, NULL, thread_func, mem); + if (r != 0) { + return EXIT_FAILURE; + } + + r = pthread_join(thread, NULL); + if (r != 0) { + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} diff --git a/src/hardened_malloc/test/string_overflow.c b/src/hardened_malloc/test/string_overflow.c new file mode 100644 index 0000000..c2dda6d --- /dev/null +++ b/src/hardened_malloc/test/string_overflow.c @@ -0,0 +1,20 @@ +#include +#include +#include + +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(16); + if (!p) { + return 1; + } + + size_t size = malloc_usable_size(p); + memset(p, 'a', size); + printf("overflow by %zu bytes\n", strlen(p) - size); + + return 0; +} diff --git a/src/hardened_malloc/test/test_smc.py b/src/hardened_malloc/test/test_smc.py new file mode 100644 index 0000000..170278e --- /dev/null +++ b/src/hardened_malloc/test/test_smc.py @@ -0,0 +1,242 @@ +import os +import subprocess +import unittest + + +class TestSimpleMemoryCorruption(unittest.TestCase): + + @classmethod + def setUpClass(self): + self.dir = os.path.dirname(os.path.realpath(__file__)) + + def run_test(self, test_name): + sub = subprocess.Popen(self.dir + "/" + test_name, + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = sub.communicate() + return stdout, stderr, sub.returncode + + def test_delete_type_size_mismatch(self): + _stdout, stderr, returncode = self.run_test( + "delete_type_size_mismatch") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode( + "utf-8"), "fatal allocator error: sized deallocation mismatch (small)\n") + + def test_double_free_large_delayed(self): + _stdout, stderr, returncode = self.run_test( + "double_free_large_delayed") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: invalid free\n") + + def test_double_free_large(self): + _stdout, stderr, returncode = self.run_test("double_free_large") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: invalid free\n") + + def test_double_free_small_delayed(self): + _stdout, stderr, returncode = self.run_test( + "double_free_small_delayed") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: double free (quarantine)\n") + + def test_double_free_small(self): + _stdout, stderr, returncode = self.run_test("double_free_small") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: double free (quarantine)\n") + + def test_overflow_large_1_byte(self): + _stdout, _stderr, returncode = self.run_test( + "overflow_large_1_byte") + self.assertEqual(returncode, 
-11) + + def test_overflow_large_8_byte(self): + _stdout, _stderr, returncode = self.run_test( + "overflow_large_8_byte") + self.assertEqual(returncode, -11) + + def test_overflow_small_1_byte(self): + _stdout, stderr, returncode = self.run_test( + "overflow_small_1_byte") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: canary corrupted\n") + + def test_overflow_small_8_byte(self): + _stdout, stderr, returncode = self.run_test( + "overflow_small_8_byte") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: canary corrupted\n") + + def test_invalid_free_protected(self): + _stdout, stderr, returncode = self.run_test("invalid_free_protected") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: invalid free\n") + + def test_invalid_free_small_region_far(self): + _stdout, stderr, returncode = self.run_test( + "invalid_free_small_region_far") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode( + "utf-8"), "fatal allocator error: invalid free within a slab yet to be used\n") + + def test_invalid_free_small_region(self): + _stdout, stderr, returncode = self.run_test( + "invalid_free_small_region") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: double free\n") + + def test_invalid_free_unprotected(self): + _stdout, stderr, returncode = self.run_test("invalid_free_unprotected") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: invalid free\n") + + def test_invalid_malloc_usable_size_small_quarantene(self): + _stdout, stderr, returncode = self.run_test( + "invalid_malloc_usable_size_small_quarantine") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode( + "utf-8"), "fatal allocator error: invalid malloc_usable_size (quarantine)\n") + + def test_invalid_malloc_usable_size_small(self): + _stdout, stderr, returncode = self.run_test( + "invalid_malloc_usable_size_small") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode( + "utf-8"), "fatal allocator error: invalid malloc_usable_size\n") + + def test_read_after_free_large(self): + _stdout, _stderr, returncode = self.run_test("read_after_free_large") + self.assertEqual(returncode, -11) + + def test_read_after_free_small(self): + stdout, _stderr, returncode = self.run_test("read_after_free_small") + self.assertEqual(returncode, 0) + self.assertEqual(stdout.decode("utf-8"), + "0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n") + + def test_read_zero_size(self): + _stdout, _stderr, returncode = self.run_test("read_zero_size") + self.assertEqual(returncode, -11) + + def test_string_overflow(self): + stdout, _stderr, returncode = self.run_test("string_overflow") + self.assertEqual(returncode, 0) + self.assertEqual(stdout.decode("utf-8"), "overflow by 0 bytes\n") + + def test_unaligned_free_large(self): + _stdout, stderr, returncode = self.run_test("unaligned_free_large") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: invalid free\n") + + def test_unaligned_free_small(self): + _stdout, stderr, returncode = self.run_test("unaligned_free_small") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: invalid unaligned free\n") + + def test_unaligned_malloc_usable_size_small(self): + _stdout, stderr, returncode = self.run_test( + 
"unaligned_malloc_usable_size_small") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: invalid unaligned malloc_usable_size\n") + + def test_uninitialized_free(self): + _stdout, stderr, returncode = self.run_test("uninitialized_free") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: invalid free\n") + + def test_uninitialized_malloc_usable_size(self): + _stdout, stderr, returncode = self.run_test( + "uninitialized_malloc_usable_size") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: invalid malloc_usable_size\n") + + def test_uninitialized_realloc(self): + _stdout, stderr, returncode = self.run_test("uninitialized_realloc") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: invalid realloc\n") + + def test_write_after_free_large_reuse(self): + _stdout, _stderr, returncode = self.run_test( + "write_after_free_large_reuse") + self.assertEqual(returncode, -11) + + def test_write_after_free_large(self): + _stdout, _stderr, returncode = self.run_test("write_after_free_large") + self.assertEqual(returncode, -11) + + def test_write_after_free_small_reuse(self): + _stdout, stderr, returncode = self.run_test( + "write_after_free_small_reuse") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: detected write after free\n") + + def test_write_after_free_small(self): + _stdout, stderr, returncode = self.run_test("write_after_free_small") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: detected write after free\n") + + def test_write_zero_size(self): + _stdout, _stderr, returncode = self.run_test("write_zero_size") + self.assertEqual(returncode, -11) + + def test_malloc_object_size(self): + _stdout, _stderr, returncode = self.run_test("malloc_object_size") + self.assertEqual(returncode, 0) + + def test_malloc_object_size_offset(self): + _stdout, _stderr, returncode = self.run_test( + "malloc_object_size_offset") + self.assertEqual(returncode, 0) + + def test_invalid_malloc_object_size_small(self): + _stdout, stderr, returncode = self.run_test( + "invalid_malloc_object_size_small") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode( + "utf-8"), "fatal allocator error: invalid malloc_object_size\n") + + def test_invalid_malloc_object_size_small_quarantine(self): + _stdout, stderr, returncode = self.run_test( + "invalid_malloc_object_size_small_quarantine") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode( + "utf-8"), "fatal allocator error: invalid malloc_object_size (quarantine)\n") + + def test_impossibly_large_malloc(self): + _stdout, stderr, returncode = self.run_test( + "impossibly_large_malloc") + self.assertEqual(returncode, 0) + + def test_uninitialized_read_small(self): + _stdout, stderr, returncode = self.run_test( + "uninitialized_read_small") + self.assertEqual(returncode, 0) + + def test_uninitialized_read_large(self): + _stdout, stderr, returncode = self.run_test( + "uninitialized_read_large") + self.assertEqual(returncode, 0) + + def test_realloc_init(self): + _stdout, _stderr, returncode = self.run_test( + "realloc_init") + self.assertEqual(returncode, 0) + +if __name__ == '__main__': + unittest.main() diff --git a/src/hardened_malloc/test/test_util.h b/src/hardened_malloc/test/test_util.h new file mode 100644 index 0000000..d2d78a6 
--- /dev/null
+++ b/src/hardened_malloc/test/test_util.h
@@ -0,0 +1,10 @@
+#ifndef TEST_UTIL_H
+#define TEST_UTIL_H
+
+#ifdef __clang__
+#define OPTNONE __attribute__((optnone))
+#else
+#define OPTNONE __attribute__((optimize(0)))
+#endif
+
+#endif
diff --git a/src/hardened_malloc/test/unaligned_free_large.c b/src/hardened_malloc/test/unaligned_free_large.c
new file mode 100644
index 0000000..7c42347
--- /dev/null
+++ b/src/hardened_malloc/test/unaligned_free_large.c
@@ -0,0 +1,12 @@
+#include <stdlib.h>
+
+#include "test_util.h"
+
+OPTNONE int main(void) {
+    char *p = malloc(256 * 1024);
+    if (!p) {
+        return 1;
+    }
+    free(p + 1);
+    return 0;
+}
diff --git a/src/hardened_malloc/test/unaligned_free_small.c b/src/hardened_malloc/test/unaligned_free_small.c
new file mode 100644
index 0000000..25ca757
--- /dev/null
+++ b/src/hardened_malloc/test/unaligned_free_small.c
@@ -0,0 +1,12 @@
+#include <stdlib.h>
+
+#include "test_util.h"
+
+OPTNONE int main(void) {
+    char *p = malloc(16);
+    if (!p) {
+        return 1;
+    }
+    free(p + 1);
+    return 0;
+}
diff --git a/src/hardened_malloc/test/unaligned_malloc_usable_size_small.c b/src/hardened_malloc/test/unaligned_malloc_usable_size_small.c
new file mode 100644
index 0000000..c897c0d
--- /dev/null
+++ b/src/hardened_malloc/test/unaligned_malloc_usable_size_small.c
@@ -0,0 +1,12 @@
+#include <malloc.h>
+
+#include "test_util.h"
+
+OPTNONE int main(void) {
+    char *p = malloc(16);
+    if (!p) {
+        return 1;
+    }
+    malloc_usable_size(p + 1);
+    return 0;
+}
diff --git a/src/hardened_malloc/test/uninitialized_free.c b/src/hardened_malloc/test/uninitialized_free.c
new file mode 100644
index 0000000..1ba3fcf
--- /dev/null
+++ b/src/hardened_malloc/test/uninitialized_free.c
@@ -0,0 +1,8 @@
+#include <stdlib.h>
+
+#include "test_util.h"
+
+OPTNONE int main(void) {
+    free((void *)1);
+    return 0;
+}
diff --git a/src/hardened_malloc/test/uninitialized_malloc_usable_size.c b/src/hardened_malloc/test/uninitialized_malloc_usable_size.c
new file mode 100644
index 0000000..f2abfd1
--- /dev/null
+++ b/src/hardened_malloc/test/uninitialized_malloc_usable_size.c
@@ -0,0 +1,8 @@
+#include <malloc.h>
+
+#include "test_util.h"
+
+OPTNONE int main(void) {
+    malloc_usable_size((void *)1);
+    return 0;
+}
diff --git a/src/hardened_malloc/test/uninitialized_read_large.c b/src/hardened_malloc/test/uninitialized_read_large.c
new file mode 100644
index 0000000..03400ad
--- /dev/null
+++ b/src/hardened_malloc/test/uninitialized_read_large.c
@@ -0,0 +1,14 @@
+#include <stdlib.h>
+
+#include "test_util.h"
+
+OPTNONE int main(void) {
+    char *p = malloc(256 * 1024);
+    for (unsigned i = 0; i < 256 * 1024; i++) {
+        if (p[i] != 0) {
+            return 1;
+        }
+    }
+    free(p);
+    return 0;
+}
diff --git a/src/hardened_malloc/test/uninitialized_read_small.c b/src/hardened_malloc/test/uninitialized_read_small.c
new file mode 100644
index 0000000..92bdf10
--- /dev/null
+++ b/src/hardened_malloc/test/uninitialized_read_small.c
@@ -0,0 +1,14 @@
+#include <stdlib.h>
+
+#include "test_util.h"
+
+OPTNONE int main(void) {
+    char *p = malloc(8);
+    for (unsigned i = 0; i < 8; i++) {
+        if (p[i] != 0) {
+            return 1;
+        }
+    }
+    free(p);
+    return 0;
+}
diff --git a/src/hardened_malloc/test/uninitialized_realloc.c b/src/hardened_malloc/test/uninitialized_realloc.c
new file mode 100644
index 0000000..ef173f6
--- /dev/null
+++ b/src/hardened_malloc/test/uninitialized_realloc.c
@@ -0,0 +1,11 @@
+#include <stdlib.h>
+
+#include "test_util.h"
+
+OPTNONE int main(void) {
+    void *p = realloc((void *)1, 16);
+    if (!p) {
+        return 1;
+    }
+    return 0;
+}
diff --git a/src/hardened_malloc/test/write_after_free_large.c
b/src/hardened_malloc/test/write_after_free_large.c new file mode 100644 index 0000000..9561b9f --- /dev/null +++ b/src/hardened_malloc/test/write_after_free_large.c @@ -0,0 +1,13 @@ +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(256 * 1024); + if (!p) { + return 1; + } + free(p); + p[64 * 1024 + 1] = 'a'; + return 0; +} diff --git a/src/hardened_malloc/test/write_after_free_large_reuse.c b/src/hardened_malloc/test/write_after_free_large_reuse.c new file mode 100644 index 0000000..e802035 --- /dev/null +++ b/src/hardened_malloc/test/write_after_free_large_reuse.c @@ -0,0 +1,16 @@ +#include +#include + +#include "test_util.h" +#include "../util.h" + +OPTNONE int main(void) { + char *p = malloc(256 * 1024); + if (!p) { + return 1; + } + free(p); + UNUSED char *q = malloc(256 * 1024); + p[64 * 1024 + 1] = 'a'; + return 0; +} diff --git a/src/hardened_malloc/test/write_after_free_small.c b/src/hardened_malloc/test/write_after_free_small.c new file mode 100644 index 0000000..7850cd6 --- /dev/null +++ b/src/hardened_malloc/test/write_after_free_small.c @@ -0,0 +1,19 @@ +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(128); + if (!p) { + return 1; + } + free(p); + + p[65] = 'a'; + + // trigger reuse of the allocation + for (size_t i = 0; i < 100000; i++) { + free(malloc(128)); + } + return 0; +} diff --git a/src/hardened_malloc/test/write_after_free_small_reuse.c b/src/hardened_malloc/test/write_after_free_small_reuse.c new file mode 100644 index 0000000..3318a91 --- /dev/null +++ b/src/hardened_malloc/test/write_after_free_small_reuse.c @@ -0,0 +1,21 @@ +#include + +#include "test_util.h" +#include "../util.h" + +OPTNONE int main(void) { + char *p = malloc(128); + if (!p) { + return 1; + } + free(p); + UNUSED char *q = malloc(128); + + p[65] = 'a'; + + // trigger reuse of the allocation + for (size_t i = 0; i < 100000; i++) { + free(malloc(128)); + } + return 0; +} diff --git a/src/hardened_malloc/test/write_zero_size.c b/src/hardened_malloc/test/write_zero_size.c new file mode 100644 index 0000000..49d26ea --- /dev/null +++ b/src/hardened_malloc/test/write_zero_size.c @@ -0,0 +1,12 @@ +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(0); + if (!p) { + return 1; + } + *p = 5; + return 0; +} diff --git a/src/hardened_malloc/third_party/libdivide.h b/src/hardened_malloc/third_party/libdivide.h new file mode 100644 index 0000000..e9a31d1 --- /dev/null +++ b/src/hardened_malloc/third_party/libdivide.h @@ -0,0 +1,3126 @@ +// libdivide.h - Optimized integer division +// https://libdivide.com +// +// Copyright (C) 2010 - 2021 ridiculous_fish, +// Copyright (C) 2016 - 2021 Kim Walisch, +// +// libdivide is dual-licensed under the Boost or zlib licenses. +// You may use libdivide under the terms of either of these. +// See LICENSE.txt for more details. 
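+//
+// Usage sketch (illustrative, not from the upstream header): a divider is
+// generated once per divisor and then reused for many divisions, e.g.
+//
+//     struct libdivide_u32_t d = libdivide_u32_gen(3);  // precompute magic + shift
+//     uint32_t q = libdivide_u32_do(numer, &d);         // q == numer / 3
+//
+// The allocator sources in this patch build such u32/u64 dividers per size
+// class so that slab slot calculations avoid hardware division.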
+ +#ifndef LIBDIVIDE_H +#define LIBDIVIDE_H + +#define LIBDIVIDE_VERSION "5.0" +#define LIBDIVIDE_VERSION_MAJOR 5 +#define LIBDIVIDE_VERSION_MINOR 0 + +#include +#if !defined(__AVR__) +#include +#include +#endif + +#if defined(LIBDIVIDE_SSE2) +#include +#endif +#if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512) +#include +#endif +#if defined(LIBDIVIDE_NEON) +#include +#endif + +#if defined(_MSC_VER) +#include +#pragma warning(push) +// disable warning C4146: unary minus operator applied +// to unsigned type, result still unsigned +#pragma warning(disable : 4146) +// disable warning C4204: nonstandard extension used : non-constant aggregate +// initializer +// +// It's valid C99 +#pragma warning(disable : 4204) +#define LIBDIVIDE_VC +#endif + +#if !defined(__has_builtin) +#define __has_builtin(x) 0 +#endif + +#if defined(__SIZEOF_INT128__) +#define HAS_INT128_T +// clang-cl on Windows does not yet support 128-bit division +#if !(defined(__clang__) && defined(LIBDIVIDE_VC)) +#define HAS_INT128_DIV +#endif +#endif + +#if defined(__x86_64__) || defined(_M_X64) +#define LIBDIVIDE_X86_64 +#endif + +#if defined(__i386__) +#define LIBDIVIDE_i386 +#endif + +#if defined(__GNUC__) || defined(__clang__) +#define LIBDIVIDE_GCC_STYLE_ASM +#endif + +#if defined(__cplusplus) || defined(LIBDIVIDE_VC) +#define LIBDIVIDE_FUNCTION __FUNCTION__ +#else +#define LIBDIVIDE_FUNCTION __func__ +#endif + +// Set up forced inlining if possible. +// We need both the attribute and keyword to avoid "might not be inlineable" warnings. +#ifdef __has_attribute +#if __has_attribute(always_inline) +#define LIBDIVIDE_INLINE __attribute__((always_inline)) inline +#endif +#endif +#ifndef LIBDIVIDE_INLINE +#define LIBDIVIDE_INLINE inline +#endif + +#if defined(__AVR__) +#define LIBDIVIDE_ERROR(msg) +#else +#define LIBDIVIDE_ERROR(msg) \ + do { \ + fprintf(stderr, "libdivide.h:%d: %s(): Error: %s\n", __LINE__, LIBDIVIDE_FUNCTION, msg); \ + abort(); \ + } while (0) +#endif + +#if defined(LIBDIVIDE_ASSERTIONS_ON) && !defined(__AVR__) +#define LIBDIVIDE_ASSERT(x) \ + do { \ + if (!(x)) { \ + fprintf(stderr, "libdivide.h:%d: %s(): Assertion failed: %s\n", __LINE__, \ + LIBDIVIDE_FUNCTION, #x); \ + abort(); \ + } \ + } while (0) +#else +#define LIBDIVIDE_ASSERT(x) +#endif + +#ifdef __cplusplus +namespace libdivide { +#endif + +// pack divider structs to prevent compilers from padding. +// This reduces memory usage by up to 43% when using a large +// array of libdivide dividers and improves performance +// by up to 10% because of reduced memory bandwidth. 
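+// With this packing the divider structs below occupy 3 bytes (u16/s16),
+// 5 bytes (u32/s32) and 9 bytes (u64/s64) instead of being rounded up to the
+// alignment of their magic field.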
+#pragma pack(push, 1) + +struct libdivide_u16_t { + uint16_t magic; + uint8_t more; +}; + +struct libdivide_s16_t { + int16_t magic; + uint8_t more; +}; + +struct libdivide_u32_t { + uint32_t magic; + uint8_t more; +}; + +struct libdivide_s32_t { + int32_t magic; + uint8_t more; +}; + +struct libdivide_u64_t { + uint64_t magic; + uint8_t more; +}; + +struct libdivide_s64_t { + int64_t magic; + uint8_t more; +}; + +struct libdivide_u16_branchfree_t { + uint16_t magic; + uint8_t more; +}; + +struct libdivide_s16_branchfree_t { + int16_t magic; + uint8_t more; +}; + +struct libdivide_u32_branchfree_t { + uint32_t magic; + uint8_t more; +}; + +struct libdivide_s32_branchfree_t { + int32_t magic; + uint8_t more; +}; + +struct libdivide_u64_branchfree_t { + uint64_t magic; + uint8_t more; +}; + +struct libdivide_s64_branchfree_t { + int64_t magic; + uint8_t more; +}; + +#pragma pack(pop) + +// Explanation of the "more" field: +// +// * Bits 0-5 is the shift value (for shift path or mult path). +// * Bit 6 is the add indicator for mult path. +// * Bit 7 is set if the divisor is negative. We use bit 7 as the negative +// divisor indicator so that we can efficiently use sign extension to +// create a bitmask with all bits set to 1 (if the divisor is negative) +// or 0 (if the divisor is positive). +// +// u32: [0-4] shift value +// [5] ignored +// [6] add indicator +// magic number of 0 indicates shift path +// +// s32: [0-4] shift value +// [5] ignored +// [6] add indicator +// [7] indicates negative divisor +// magic number of 0 indicates shift path +// +// u64: [0-5] shift value +// [6] add indicator +// magic number of 0 indicates shift path +// +// s64: [0-5] shift value +// [6] add indicator +// [7] indicates negative divisor +// magic number of 0 indicates shift path +// +// In s32 and s64 branchfree modes, the magic number is negated according to +// whether the divisor is negated. In branchfree strategy, it is not negated. 
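+//
+// Decoding sketch (illustrative, mirroring libdivide_u32_do further below):
+//
+//     if (magic == 0) {
+//         q = numer >> more;                         // shift path
+//     } else {
+//         uint32_t t = libdivide_mullhi_u32(magic, numer);
+//         if (more & LIBDIVIDE_ADD_MARKER) {         // "add" path for a 33-bit magic
+//             q = (((numer - t) >> 1) + t) >> (more & LIBDIVIDE_32_SHIFT_MASK);
+//         } else {
+//             q = t >> more;
+//         }
+//     }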
+ +enum { + LIBDIVIDE_16_SHIFT_MASK = 0x1F, + LIBDIVIDE_32_SHIFT_MASK = 0x1F, + LIBDIVIDE_64_SHIFT_MASK = 0x3F, + LIBDIVIDE_ADD_MARKER = 0x40, + LIBDIVIDE_NEGATIVE_DIVISOR = 0x80 +}; + +static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_s16_gen(int16_t d); +static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_u16_gen(uint16_t d); +static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_s32_gen(int32_t d); +static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_u32_gen(uint32_t d); +static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_s64_gen(int64_t d); +static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_u64_gen(uint64_t d); + +static LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d); +static LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d); +static LIBDIVIDE_INLINE struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d); +static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d); +static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d); +static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d); + +static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw( + int16_t numer, int16_t magic, uint8_t more); +static LIBDIVIDE_INLINE int16_t libdivide_s16_do( + int16_t numer, const struct libdivide_s16_t* denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw( + uint16_t numer, uint16_t magic, uint8_t more); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_do( + uint16_t numer, const struct libdivide_u16_t* denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_do( + int32_t numer, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE uint32_t libdivide_u32_do( + uint32_t numer, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE int64_t libdivide_s64_do( + int64_t numer, const struct libdivide_s64_t *denom); +static LIBDIVIDE_INLINE uint64_t libdivide_u64_do( + uint64_t numer, const struct libdivide_u64_t *denom); + +static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do( + int16_t numer, const struct libdivide_s16_branchfree_t* denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do( + uint16_t numer, const struct libdivide_u16_branchfree_t* denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do( + int32_t numer, const struct libdivide_s32_branchfree_t *denom); +static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do( + uint32_t numer, const struct libdivide_u32_branchfree_t *denom); +static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do( + int64_t numer, const struct libdivide_s64_branchfree_t *denom); +static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do( + uint64_t numer, const struct libdivide_u64_branchfree_t *denom); + +static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t* denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t* denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom); +static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom); + +static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover( + const struct 
libdivide_s16_branchfree_t* denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover( + const struct libdivide_u16_branchfree_t* denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover( + const struct libdivide_s32_branchfree_t *denom); +static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover( + const struct libdivide_u32_branchfree_t *denom); +static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_recover( + const struct libdivide_s64_branchfree_t *denom); +static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover( + const struct libdivide_u64_branchfree_t *denom); + +//////// Internal Utility Functions + +static LIBDIVIDE_INLINE uint16_t libdivide_mullhi_u16(uint16_t x, uint16_t y) { + uint32_t xl = x, yl = y; + uint32_t rl = xl * yl; + return (uint16_t)(rl >> 16); +} + +static LIBDIVIDE_INLINE int16_t libdivide_mullhi_s16(int16_t x, int16_t y) { + int32_t xl = x, yl = y; + int32_t rl = xl * yl; + // needs to be arithmetic shift + return (int16_t)(rl >> 16); +} + +static LIBDIVIDE_INLINE uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) { + uint64_t xl = x, yl = y; + uint64_t rl = xl * yl; + return (uint32_t)(rl >> 32); +} + +static LIBDIVIDE_INLINE int32_t libdivide_mullhi_s32(int32_t x, int32_t y) { + int64_t xl = x, yl = y; + int64_t rl = xl * yl; + // needs to be arithmetic shift + return (int32_t)(rl >> 32); +} + +static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) { +#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64) + return __umulh(x, y); +#elif defined(HAS_INT128_T) + __uint128_t xl = x, yl = y; + __uint128_t rl = xl * yl; + return (uint64_t)(rl >> 64); +#else + // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64) + uint32_t mask = 0xFFFFFFFF; + uint32_t x0 = (uint32_t)(x & mask); + uint32_t x1 = (uint32_t)(x >> 32); + uint32_t y0 = (uint32_t)(y & mask); + uint32_t y1 = (uint32_t)(y >> 32); + uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0); + uint64_t x0y1 = x0 * (uint64_t)y1; + uint64_t x1y0 = x1 * (uint64_t)y0; + uint64_t x1y1 = x1 * (uint64_t)y1; + uint64_t temp = x1y0 + x0y0_hi; + uint64_t temp_lo = temp & mask; + uint64_t temp_hi = temp >> 32; + + return x1y1 + temp_hi + ((temp_lo + x0y1) >> 32); +#endif +} + +static LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) { +#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64) + return __mulh(x, y); +#elif defined(HAS_INT128_T) + __int128_t xl = x, yl = y; + __int128_t rl = xl * yl; + return (int64_t)(rl >> 64); +#else + // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64) + uint32_t mask = 0xFFFFFFFF; + uint32_t x0 = (uint32_t)(x & mask); + uint32_t y0 = (uint32_t)(y & mask); + int32_t x1 = (int32_t)(x >> 32); + int32_t y1 = (int32_t)(y >> 32); + uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0); + int64_t t = x1 * (int64_t)y0 + x0y0_hi; + int64_t w1 = x0 * (int64_t)y1 + (t & mask); + + return x1 * (int64_t)y1 + (t >> 32) + (w1 >> 32); +#endif +} + +static LIBDIVIDE_INLINE int16_t libdivide_count_leading_zeros16(uint16_t val) { +#if defined(__AVR__) + // Fast way to count leading zeros + // On the AVR 8-bit architecture __builtin_clz() works on a int16_t. 
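+    // (int is only 16 bits wide on AVR, so no offset correction is needed here,
+    // unlike the "- 16" adjustment used for 32-bit int targets below)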
+ return __builtin_clz(val); +#elif defined(__GNUC__) || __has_builtin(__builtin_clz) + // Fast way to count leading zeros + return __builtin_clz(val) - 16; +#elif defined(LIBDIVIDE_VC) + unsigned long result; + if (_BitScanReverse(&result, (unsigned long)val)) { + return (int16_t)(15 - result); + } + return 0; +#else + if (val == 0) return 16; + int16_t result = 4; + uint16_t hi = 0xFU << 12; + while ((val & hi) == 0) { + hi >>= 4; + result += 4; + } + while (val & hi) { + result -= 1; + hi <<= 1; + } + return result; +#endif +} + +static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros32(uint32_t val) { +#if defined(__AVR__) + // Fast way to count leading zeros + return __builtin_clzl(val); +#elif defined(__GNUC__) || __has_builtin(__builtin_clz) + // Fast way to count leading zeros + return __builtin_clz(val); +#elif defined(LIBDIVIDE_VC) + unsigned long result; + if (_BitScanReverse(&result, val)) { + return 31 - result; + } + return 0; +#else + if (val == 0) return 32; + int32_t result = 8; + uint32_t hi = 0xFFU << 24; + while ((val & hi) == 0) { + hi >>= 8; + result += 8; + } + while (val & hi) { + result -= 1; + hi <<= 1; + } + return result; +#endif +} + +static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros64(uint64_t val) { +#if defined(__GNUC__) || __has_builtin(__builtin_clzll) + // Fast way to count leading zeros + return __builtin_clzll(val); +#elif defined(LIBDIVIDE_VC) && defined(_WIN64) + unsigned long result; + if (_BitScanReverse64(&result, val)) { + return 63 - result; + } + return 0; +#else + uint32_t hi = val >> 32; + uint32_t lo = val & 0xFFFFFFFF; + if (hi != 0) return libdivide_count_leading_zeros32(hi); + return 32 + libdivide_count_leading_zeros32(lo); +#endif +} + +// libdivide_32_div_16_to_16: divides a 32-bit uint {u1, u0} by a 16-bit +// uint {v}. The result must fit in 16 bits. +// Returns the quotient directly and the remainder in *r +static LIBDIVIDE_INLINE uint16_t libdivide_32_div_16_to_16( + uint16_t u1, uint16_t u0, uint16_t v, uint16_t* r) { + uint32_t n = ((uint32_t)u1 << 16) | u0; + uint16_t result = (uint16_t)(n / v); + *r = (uint16_t)(n - result * (uint32_t)v); + return result; +} + +// libdivide_64_div_32_to_32: divides a 64-bit uint {u1, u0} by a 32-bit +// uint {v}. The result must fit in 32 bits. +// Returns the quotient directly and the remainder in *r +static LIBDIVIDE_INLINE uint32_t libdivide_64_div_32_to_32( + uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) { +#if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && defined(LIBDIVIDE_GCC_STYLE_ASM) + uint32_t result; + __asm__("divl %[v]" : "=a"(result), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1)); + return result; +#else + uint64_t n = ((uint64_t)u1 << 32) | u0; + uint32_t result = (uint32_t)(n / v); + *r = (uint32_t)(n - result * (uint64_t)v); + return result; +#endif +} + +// libdivide_128_div_64_to_64: divides a 128-bit uint {numhi, numlo} by a 64-bit uint {den}. The +// result must fit in 64 bits. Returns the quotient directly and the remainder in *r +static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64( + uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) { + // N.B. resist the temptation to use __uint128_t here. + // In LLVM compiler-rt, it performs a 128/128 -> 128 division which is many times slower than + // necessary. In gcc it's better but still slower than the divlu implementation, perhaps because + // it's not LIBDIVIDE_INLINEd. 
+#if defined(LIBDIVIDE_X86_64) && defined(LIBDIVIDE_GCC_STYLE_ASM) + uint64_t result; + __asm__("divq %[v]" : "=a"(result), "=d"(*r) : [v] "r"(den), "a"(numlo), "d"(numhi)); + return result; +#else + // We work in base 2**32. + // A uint32 holds a single digit. A uint64 holds two digits. + // Our numerator is conceptually [num3, num2, num1, num0]. + // Our denominator is [den1, den0]. + const uint64_t b = ((uint64_t)1 << 32); + + // The high and low digits of our computed quotient. + uint32_t q1; + uint32_t q0; + + // The normalization shift factor. + int shift; + + // The high and low digits of our denominator (after normalizing). + // Also the low 2 digits of our numerator (after normalizing). + uint32_t den1; + uint32_t den0; + uint32_t num1; + uint32_t num0; + + // A partial remainder. + uint64_t rem; + + // The estimated quotient, and its corresponding remainder (unrelated to true remainder). + uint64_t qhat; + uint64_t rhat; + + // Variables used to correct the estimated quotient. + uint64_t c1; + uint64_t c2; + + // Check for overflow and divide by 0. + if (numhi >= den) { + if (r != NULL) *r = ~0ull; + return ~0ull; + } + + // Determine the normalization factor. We multiply den by this, so that its leading digit is at + // least half b. In binary this means just shifting left by the number of leading zeros, so that + // there's a 1 in the MSB. + // We also shift numer by the same amount. This cannot overflow because numhi < den. + // The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting + // by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is + // 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. The sequence below avoids it. + shift = libdivide_count_leading_zeros64(den); + den <<= shift; + numhi <<= shift; + numhi |= (numlo >> (-shift & 63)) & (-(int64_t)shift >> 63); + numlo <<= shift; + + // Extract the low digits of the numerator and both digits of the denominator. + num1 = (uint32_t)(numlo >> 32); + num0 = (uint32_t)(numlo & 0xFFFFFFFFu); + den1 = (uint32_t)(den >> 32); + den0 = (uint32_t)(den & 0xFFFFFFFFu); + + // We wish to compute q1 = [n3 n2 n1] / [d1 d0]. + // Estimate q1 as [n3 n2] / [d1], and then correct it. + // Note while qhat may be 2 digits, q1 is always 1 digit. + qhat = numhi / den1; + rhat = numhi % den1; + c1 = qhat * den0; + c2 = rhat * b + num1; + if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1; + q1 = (uint32_t)qhat; + + // Compute the true (partial) remainder. + rem = numhi * b + num1 - q1 * den; + + // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0]. + // Estimate q0 as [rem1 rem0] / [d1] and correct it. + qhat = rem / den1; + rhat = rem % den1; + c1 = qhat * den0; + c2 = rhat * b + num0; + if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1; + q0 = (uint32_t)qhat; + + // Return remainder if requested. + if (r != NULL) *r = (rem * b + num0 - q0 * den) >> shift; + return ((uint64_t)q1 << 32) | q0; +#endif +} + +// Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0) +static LIBDIVIDE_INLINE void libdivide_u128_shift( + uint64_t *u1, uint64_t *u0, int32_t signed_shift) { + if (signed_shift > 0) { + uint32_t shift = signed_shift; + *u1 <<= shift; + *u1 |= *u0 >> (64 - shift); + *u0 <<= shift; + } else if (signed_shift < 0) { + uint32_t shift = -signed_shift; + *u0 >>= shift; + *u0 |= *u1 << (64 - shift); + *u1 >>= shift; + } +} + +// Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder. 
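+// Returns the quotient, which must fit in 64 bits, and writes the full 128-bit
+// remainder through *r_hi / *r_lo. The recover routines below only call this
+// with a 65-bit divisor (2^64 + magic), so the quotient always fits.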
+static LIBDIVIDE_INLINE uint64_t libdivide_128_div_128_to_64( + uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) { +#if defined(HAS_INT128_T) && defined(HAS_INT128_DIV) + __uint128_t ufull = u_hi; + __uint128_t vfull = v_hi; + ufull = (ufull << 64) | u_lo; + vfull = (vfull << 64) | v_lo; + uint64_t res = (uint64_t)(ufull / vfull); + __uint128_t remainder = ufull - (vfull * res); + *r_lo = (uint64_t)remainder; + *r_hi = (uint64_t)(remainder >> 64); + return res; +#else + // Adapted from "Unsigned Doubleword Division" in Hacker's Delight + // We want to compute u / v + typedef struct { + uint64_t hi; + uint64_t lo; + } u128_t; + u128_t u = {u_hi, u_lo}; + u128_t v = {v_hi, v_lo}; + + if (v.hi == 0) { + // divisor v is a 64 bit value, so we just need one 128/64 division + // Note that we are simpler than Hacker's Delight here, because we know + // the quotient fits in 64 bits whereas Hacker's Delight demands a full + // 128 bit quotient + *r_hi = 0; + return libdivide_128_div_64_to_64(u.hi, u.lo, v.lo, r_lo); + } + // Here v >= 2**64 + // We know that v.hi != 0, so count leading zeros is OK + // We have 0 <= n <= 63 + uint32_t n = libdivide_count_leading_zeros64(v.hi); + + // Normalize the divisor so its MSB is 1 + u128_t v1t = v; + libdivide_u128_shift(&v1t.hi, &v1t.lo, n); + uint64_t v1 = v1t.hi; // i.e. v1 = v1t >> 64 + + // To ensure no overflow + u128_t u1 = u; + libdivide_u128_shift(&u1.hi, &u1.lo, -1); + + // Get quotient from divide unsigned insn. + uint64_t rem_ignored; + uint64_t q1 = libdivide_128_div_64_to_64(u1.hi, u1.lo, v1, &rem_ignored); + + // Undo normalization and division of u by 2. + u128_t q0 = {0, q1}; + libdivide_u128_shift(&q0.hi, &q0.lo, n); + libdivide_u128_shift(&q0.hi, &q0.lo, -63); + + // Make q0 correct or too small by 1 + // Equivalent to `if (q0 != 0) q0 = q0 - 1;` + if (q0.hi != 0 || q0.lo != 0) { + q0.hi -= (q0.lo == 0); // borrow + q0.lo -= 1; + } + + // Now q0 is correct. + // Compute q0 * v as q0v + // = (q0.hi << 64 + q0.lo) * (v.hi << 64 + v.lo) + // = (q0.hi * v.hi << 128) + (q0.hi * v.lo << 64) + + // (q0.lo * v.hi << 64) + q0.lo * v.lo) + // Each term is 128 bit + // High half of full product (upper 128 bits!) are dropped + u128_t q0v = {0, 0}; + q0v.hi = q0.hi * v.lo + q0.lo * v.hi + libdivide_mullhi_u64(q0.lo, v.lo); + q0v.lo = q0.lo * v.lo; + + // Compute u - q0v as u_q0v + // This is the remainder + u128_t u_q0v = u; + u_q0v.hi -= q0v.hi + (u.lo < q0v.lo); // second term is borrow + u_q0v.lo -= q0v.lo; + + // Check if u_q0v >= v + // This checks if our remainder is larger than the divisor + if ((u_q0v.hi > v.hi) || (u_q0v.hi == v.hi && u_q0v.lo >= v.lo)) { + // Increment q0 + q0.lo += 1; + q0.hi += (q0.lo == 0); // carry + + // Subtract v from remainder + u_q0v.hi -= v.hi + (u_q0v.lo < v.lo); + u_q0v.lo -= v.lo; + } + + *r_hi = u_q0v.hi; + *r_lo = u_q0v.lo; + + LIBDIVIDE_ASSERT(q0.hi == 0); + return q0.lo; +#endif +} + +////////// UINT16 + +static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen( + uint16_t d, int branchfree) { + if (d == 0) { + LIBDIVIDE_ERROR("divider must be != 0"); + } + + struct libdivide_u16_t result; + uint8_t floor_log_2_d = (uint8_t)(15 - libdivide_count_leading_zeros16(d)); + + // Power of 2 + if ((d & (d - 1)) == 0) { + // We need to subtract 1 from the shift value in case of an unsigned + // branchfree divider because there is a hardcoded right shift by 1 + // in its division algorithm. 
Because of this we also need to add back + // 1 in its recovery algorithm. + result.magic = 0; + result.more = (uint8_t)(floor_log_2_d - (branchfree != 0)); + } + else { + uint8_t more; + uint16_t rem, proposed_m; + proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << floor_log_2_d, 0, d, &rem); + + LIBDIVIDE_ASSERT(rem > 0 && rem < d); + const uint16_t e = d - rem; + + // This power works if e < 2**floor_log_2_d. + if (!branchfree && (e < ((uint16_t)1 << floor_log_2_d))) { + // This power works + more = floor_log_2_d; + } + else { + // We have to use the general 17-bit algorithm. We need to compute + // (2**power) / d. However, we already have (2**(power-1))/d and + // its remainder. By doubling both, and then correcting the + // remainder, we can compute the larger division. + // don't care about overflow here - in fact, we expect it + proposed_m += proposed_m; + const uint16_t twice_rem = rem + rem; + if (twice_rem >= d || twice_rem < rem) proposed_m += 1; + more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + } + result.magic = 1 + proposed_m; + result.more = more; + // result.more's shift should in general be ceil_log_2_d. But if we + // used the smaller power, we subtract one from the shift because we're + // using the smaller power. If we're using the larger power, we + // subtract one from the shift because it's taken care of by the add + // indicator. So floor_log_2_d happens to be correct in both cases. + } + return result; +} + +struct libdivide_u16_t libdivide_u16_gen(uint16_t d) { + return libdivide_internal_u16_gen(d, 0); +} + +struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) { + if (d == 1) { + LIBDIVIDE_ERROR("branchfree divider must be != 1"); + } + struct libdivide_u16_t tmp = libdivide_internal_u16_gen(d, 1); + struct libdivide_u16_branchfree_t ret = { + tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK) }; + return ret; +} + +// The original libdivide_u16_do takes a const pointer. However, this cannot be used +// with a compile time constant libdivide_u16_t: it will generate a warning about +// taking the address of a temporary. Hence this overload. +uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) { + if (!magic) { + return numer >> more; + } + else { + uint16_t q = libdivide_mullhi_u16(magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + uint16_t t = ((numer - q) >> 1) + q; + return t >> (more & LIBDIVIDE_16_SHIFT_MASK); + } + else { + // All upper bits are 0, + // don't need to mask them off. + return q >> more; + } + } +} + +uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t* denom) { + return libdivide_u16_do_raw(numer, denom->magic, denom->more); +} + +uint16_t libdivide_u16_branchfree_do( + uint16_t numer, const struct libdivide_u16_branchfree_t* denom) { + uint16_t q = libdivide_mullhi_u16(denom->magic, numer); + uint16_t t = ((numer - q) >> 1) + q; + return t >> denom->more; +} + +uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + + if (!denom->magic) { + return (uint16_t)1 << shift; + } else if (!(more & LIBDIVIDE_ADD_MARKER)) { + // We compute q = n/d = n*m / 2^(16 + shift) + // Therefore we have d = 2^(16 + shift) / m + // We need to ceil it. 
+ // We know d is not a power of 2, so m is not a power of 2, + // so we can just add 1 to the floor + uint16_t hi_dividend = (uint16_t)1 << shift; + uint16_t rem_ignored; + return 1 + libdivide_32_div_16_to_16(hi_dividend, 0, denom->magic, &rem_ignored); + } else { + // Here we wish to compute d = 2^(16+shift+1)/(m+2^16). + // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now + // Also note that shift may be as high as 15, so shift + 1 will + // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and + // then double the quotient and remainder. + uint32_t half_n = (uint32_t)1 << (16 + shift); + uint32_t d = ( (uint32_t)1 << 16) | denom->magic; + // Note that the quotient is guaranteed <= 16 bits, but the remainder + // may need 17! + uint16_t half_q = (uint16_t)(half_n / d); + uint32_t rem = half_n % d; + // We computed 2^(16+shift)/(m+2^16) + // Need to double it, and then add 1 to the quotient if doubling th + // remainder would increase the quotient. + // Note that rem<<1 cannot overflow, since rem < d and d is 17 bits + uint16_t full_q = half_q + half_q + ((rem << 1) >= d); + + // We rounded down in gen (hence +1) + return full_q + 1; + } +} + +uint16_t libdivide_u16_branchfree_recover(const struct libdivide_u16_branchfree_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + + if (!denom->magic) { + return (uint16_t)1 << (shift + 1); + } else { + // Here we wish to compute d = 2^(16+shift+1)/(m+2^16). + // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now + // Also note that shift may be as high as 15, so shift + 1 will + // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and + // then double the quotient and remainder. + uint32_t half_n = (uint32_t)1 << (16 + shift); + uint32_t d = ((uint32_t)1 << 16) | denom->magic; + // Note that the quotient is guaranteed <= 16 bits, but the remainder + // may need 17! + uint16_t half_q = (uint16_t)(half_n / d); + uint32_t rem = half_n % d; + // We computed 2^(16+shift)/(m+2^16) + // Need to double it, and then add 1 to the quotient if doubling th + // remainder would increase the quotient. + // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits + uint16_t full_q = half_q + half_q + ((rem << 1) >= d); + + // We rounded down in gen (hence +1) + return full_q + 1; + } +} + +////////// UINT32 + +static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_internal_u32_gen( + uint32_t d, int branchfree) { + if (d == 0) { + LIBDIVIDE_ERROR("divider must be != 0"); + } + + struct libdivide_u32_t result; + uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(d); + + // Power of 2 + if ((d & (d - 1)) == 0) { + // We need to subtract 1 from the shift value in case of an unsigned + // branchfree divider because there is a hardcoded right shift by 1 + // in its division algorithm. Because of this we also need to add back + // 1 in its recovery algorithm. + result.magic = 0; + result.more = (uint8_t)(floor_log_2_d - (branchfree != 0)); + } else { + uint8_t more; + uint32_t rem, proposed_m; + proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << floor_log_2_d, 0, d, &rem); + + LIBDIVIDE_ASSERT(rem > 0 && rem < d); + const uint32_t e = d - rem; + + // This power works if e < 2**floor_log_2_d. + if (!branchfree && (e < ((uint32_t)1 << floor_log_2_d))) { + // This power works + more = (uint8_t)floor_log_2_d; + } else { + // We have to use the general 33-bit algorithm. We need to compute + // (2**power) / d. 
However, we already have (2**(power-1))/d and + // its remainder. By doubling both, and then correcting the + // remainder, we can compute the larger division. + // don't care about overflow here - in fact, we expect it + proposed_m += proposed_m; + const uint32_t twice_rem = rem + rem; + if (twice_rem >= d || twice_rem < rem) proposed_m += 1; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); + } + result.magic = 1 + proposed_m; + result.more = more; + // result.more's shift should in general be ceil_log_2_d. But if we + // used the smaller power, we subtract one from the shift because we're + // using the smaller power. If we're using the larger power, we + // subtract one from the shift because it's taken care of by the add + // indicator. So floor_log_2_d happens to be correct in both cases. + } + return result; +} + +struct libdivide_u32_t libdivide_u32_gen(uint32_t d) { + return libdivide_internal_u32_gen(d, 0); +} + +struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) { + if (d == 1) { + LIBDIVIDE_ERROR("branchfree divider must be != 1"); + } + struct libdivide_u32_t tmp = libdivide_internal_u32_gen(d, 1); + struct libdivide_u32_branchfree_t ret = { + tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_32_SHIFT_MASK)}; + return ret; +} + +uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return numer >> more; + } else { + uint32_t q = libdivide_mullhi_u32(denom->magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + uint32_t t = ((numer - q) >> 1) + q; + return t >> (more & LIBDIVIDE_32_SHIFT_MASK); + } else { + // All upper bits are 0, + // don't need to mask them off. + return q >> more; + } + } +} + +uint32_t libdivide_u32_branchfree_do( + uint32_t numer, const struct libdivide_u32_branchfree_t *denom) { + uint32_t q = libdivide_mullhi_u32(denom->magic, numer); + uint32_t t = ((numer - q) >> 1) + q; + return t >> denom->more; +} + +uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + + if (!denom->magic) { + return (uint32_t)1 << shift; + } else if (!(more & LIBDIVIDE_ADD_MARKER)) { + // We compute q = n/d = n*m / 2^(32 + shift) + // Therefore we have d = 2^(32 + shift) / m + // We need to ceil it. + // We know d is not a power of 2, so m is not a power of 2, + // so we can just add 1 to the floor + uint32_t hi_dividend = (uint32_t)1 << shift; + uint32_t rem_ignored; + return 1 + libdivide_64_div_32_to_32(hi_dividend, 0, denom->magic, &rem_ignored); + } else { + // Here we wish to compute d = 2^(32+shift+1)/(m+2^32). + // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now + // Also note that shift may be as high as 31, so shift + 1 will + // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and + // then double the quotient and remainder. + uint64_t half_n = (uint64_t)1 << (32 + shift); + uint64_t d = ((uint64_t)1 << 32) | denom->magic; + // Note that the quotient is guaranteed <= 32 bits, but the remainder + // may need 33! + uint32_t half_q = (uint32_t)(half_n / d); + uint64_t rem = half_n % d; + // We computed 2^(32+shift)/(m+2^32) + // Need to double it, and then add 1 to the quotient if doubling th + // remainder would increase the quotient. 
+ // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits + uint32_t full_q = half_q + half_q + ((rem << 1) >= d); + + // We rounded down in gen (hence +1) + return full_q + 1; + } +} + +uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + + if (!denom->magic) { + return (uint32_t)1 << (shift + 1); + } else { + // Here we wish to compute d = 2^(32+shift+1)/(m+2^32). + // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now + // Also note that shift may be as high as 31, so shift + 1 will + // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and + // then double the quotient and remainder. + uint64_t half_n = (uint64_t)1 << (32 + shift); + uint64_t d = ((uint64_t)1 << 32) | denom->magic; + // Note that the quotient is guaranteed <= 32 bits, but the remainder + // may need 33! + uint32_t half_q = (uint32_t)(half_n / d); + uint64_t rem = half_n % d; + // We computed 2^(32+shift)/(m+2^32) + // Need to double it, and then add 1 to the quotient if doubling th + // remainder would increase the quotient. + // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits + uint32_t full_q = half_q + half_q + ((rem << 1) >= d); + + // We rounded down in gen (hence +1) + return full_q + 1; + } +} + +/////////// UINT64 + +static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_internal_u64_gen( + uint64_t d, int branchfree) { + if (d == 0) { + LIBDIVIDE_ERROR("divider must be != 0"); + } + + struct libdivide_u64_t result; + uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(d); + + // Power of 2 + if ((d & (d - 1)) == 0) { + // We need to subtract 1 from the shift value in case of an unsigned + // branchfree divider because there is a hardcoded right shift by 1 + // in its division algorithm. Because of this we also need to add back + // 1 in its recovery algorithm. + result.magic = 0; + result.more = (uint8_t)(floor_log_2_d - (branchfree != 0)); + } else { + uint64_t proposed_m, rem; + uint8_t more; + // (1 << (64 + floor_log_2_d)) / d + proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << floor_log_2_d, 0, d, &rem); + + LIBDIVIDE_ASSERT(rem > 0 && rem < d); + const uint64_t e = d - rem; + + // This power works if e < 2**floor_log_2_d. + if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) { + // This power works + more = (uint8_t)floor_log_2_d; + } else { + // We have to use the general 65-bit algorithm. We need to compute + // (2**power) / d. However, we already have (2**(power-1))/d and + // its remainder. By doubling both, and then correcting the + // remainder, we can compute the larger division. + // don't care about overflow here - in fact, we expect it + proposed_m += proposed_m; + const uint64_t twice_rem = rem + rem; + if (twice_rem >= d || twice_rem < rem) proposed_m += 1; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); + } + result.magic = 1 + proposed_m; + result.more = more; + // result.more's shift should in general be ceil_log_2_d. But if we + // used the smaller power, we subtract one from the shift because we're + // using the smaller power. If we're using the larger power, we + // subtract one from the shift because it's taken care of by the add + // indicator. So floor_log_2_d happens to be correct in both cases, + // which is why we do it outside of the if statement. 
+ } + return result; +} + +struct libdivide_u64_t libdivide_u64_gen(uint64_t d) { + return libdivide_internal_u64_gen(d, 0); +} + +struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) { + if (d == 1) { + LIBDIVIDE_ERROR("branchfree divider must be != 1"); + } + struct libdivide_u64_t tmp = libdivide_internal_u64_gen(d, 1); + struct libdivide_u64_branchfree_t ret = { + tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_64_SHIFT_MASK)}; + return ret; +} + +uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return numer >> more; + } else { + uint64_t q = libdivide_mullhi_u64(denom->magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + uint64_t t = ((numer - q) >> 1) + q; + return t >> (more & LIBDIVIDE_64_SHIFT_MASK); + } else { + // All upper bits are 0, + // don't need to mask them off. + return q >> more; + } + } +} + +uint64_t libdivide_u64_branchfree_do( + uint64_t numer, const struct libdivide_u64_branchfree_t *denom) { + uint64_t q = libdivide_mullhi_u64(denom->magic, numer); + uint64_t t = ((numer - q) >> 1) + q; + return t >> denom->more; +} + +uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + + if (!denom->magic) { + return (uint64_t)1 << shift; + } else if (!(more & LIBDIVIDE_ADD_MARKER)) { + // We compute q = n/d = n*m / 2^(64 + shift) + // Therefore we have d = 2^(64 + shift) / m + // We need to ceil it. + // We know d is not a power of 2, so m is not a power of 2, + // so we can just add 1 to the floor + uint64_t hi_dividend = (uint64_t)1 << shift; + uint64_t rem_ignored; + return 1 + libdivide_128_div_64_to_64(hi_dividend, 0, denom->magic, &rem_ignored); + } else { + // Here we wish to compute d = 2^(64+shift+1)/(m+2^64). + // Notice (m + 2^64) is a 65 bit number. This gets hairy. See + // libdivide_u32_recover for more on what we do here. + // TODO: do something better than 128 bit math + + // Full n is a (potentially) 129 bit value + // half_n is a 128 bit value + // Compute the hi half of half_n. Low half is 0. + uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0; + // d is a 65 bit value. The high bit is always set to 1. + const uint64_t d_hi = 1, d_lo = denom->magic; + // Note that the quotient is guaranteed <= 64 bits, + // but the remainder may need 65! + uint64_t r_hi, r_lo; + uint64_t half_q = + libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); + // We computed 2^(64+shift)/(m+2^64) + // Double the remainder ('dr') and check if that is larger than d + // Note that d is a 65 bit value, so r1 is small and so r1 + r1 + // cannot overflow + uint64_t dr_lo = r_lo + r_lo; + uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry + int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo); + uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0); + return full_q + 1; + } +} + +uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + + if (!denom->magic) { + return (uint64_t)1 << (shift + 1); + } else { + // Here we wish to compute d = 2^(64+shift+1)/(m+2^64). + // Notice (m + 2^64) is a 65 bit number. This gets hairy. See + // libdivide_u32_recover for more on what we do here. 
+ // TODO: do something better than 128 bit math + + // Full n is a (potentially) 129 bit value + // half_n is a 128 bit value + // Compute the hi half of half_n. Low half is 0. + uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0; + // d is a 65 bit value. The high bit is always set to 1. + const uint64_t d_hi = 1, d_lo = denom->magic; + // Note that the quotient is guaranteed <= 64 bits, + // but the remainder may need 65! + uint64_t r_hi, r_lo; + uint64_t half_q = + libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); + // We computed 2^(64+shift)/(m+2^64) + // Double the remainder ('dr') and check if that is larger than d + // Note that d is a 65 bit value, so r1 is small and so r1 + r1 + // cannot overflow + uint64_t dr_lo = r_lo + r_lo; + uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry + int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo); + uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0); + return full_q + 1; + } +} + +/////////// SINT16 + +static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_internal_s16_gen( + int16_t d, int branchfree) { + if (d == 0) { + LIBDIVIDE_ERROR("divider must be != 0"); + } + + struct libdivide_s16_t result; + + // If d is a power of 2, or negative a power of 2, we have to use a shift. + // This is especially important because the magic algorithm fails for -1. + // To check if d is a power of 2 or its inverse, it suffices to check + // whether its absolute value has exactly one bit set. This works even for + // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set + // and is a power of 2. + uint16_t ud = (uint16_t)d; + uint16_t absD = (d < 0) ? -ud : ud; + uint16_t floor_log_2_d = 15 - libdivide_count_leading_zeros16(absD); + // check if exactly one bit is set, + // don't care if absD is 0 since that's divide by zero + if ((absD & (absD - 1)) == 0) { + // Branchfree and normal paths are exactly the same + result.magic = 0; + result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0)); + } else { + LIBDIVIDE_ASSERT(floor_log_2_d >= 1); + + uint8_t more; + // the dividend here is 2**(floor_log_2_d + 31), so the low 16 bit word + // is 0 and the high word is floor_log_2_d - 1 + uint16_t rem, proposed_m; + proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << (floor_log_2_d - 1), 0, absD, &rem); + const uint16_t e = absD - rem; + + // We are going to start with a power of floor_log_2_d - 1. + // This works if works if e < 2**floor_log_2_d. + if (!branchfree && e < ((uint16_t)1 << floor_log_2_d)) { + // This power works + more = (uint8_t)(floor_log_2_d - 1); + } else { + // We need to go one higher. This should not make proposed_m + // overflow, but it will make it negative when interpreted as an + // int16_t. + proposed_m += proposed_m; + const uint16_t twice_rem = rem + rem; + if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); + } + + proposed_m += 1; + int16_t magic = (int16_t)proposed_m; + + // Mark if we are negative. Note we only negate the magic number in the + // branchfull case. 
+ if (d < 0) { + more |= LIBDIVIDE_NEGATIVE_DIVISOR; + if (!branchfree) { + magic = -magic; + } + } + + result.more = more; + result.magic = magic; + } + return result; +} + +struct libdivide_s16_t libdivide_s16_gen(int16_t d) { + return libdivide_internal_s16_gen(d, 0); +} + +struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d) { + struct libdivide_s16_t tmp = libdivide_internal_s16_gen(d, 1); + struct libdivide_s16_branchfree_t result = {tmp.magic, tmp.more}; + return result; +} + +// The original libdivide_s16_do takes a const pointer. However, this cannot be used +// with a compile time constant libdivide_s16_t: it will generate a warning about +// taking the address of a temporary. Hence this overload. +int16_t libdivide_s16_do_raw(int16_t numer, int16_t magic, uint8_t more) { + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + + if (!magic) { + uint16_t sign = (int8_t)more >> 7; + uint16_t mask = ((uint16_t)1 << shift) - 1; + uint16_t uq = numer + ((numer >> 15) & mask); + int16_t q = (int16_t)uq; + q >>= shift; + q = (q ^ sign) - sign; + return q; + } else { + uint16_t uq = (uint16_t)libdivide_mullhi_s16(magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift and then sign extend + int16_t sign = (int8_t)more >> 7; + // q += (more < 0 ? -numer : numer) + // cast required to avoid UB + uq += ((uint16_t)numer ^ sign) - sign; + } + int16_t q = (int16_t)uq; + q >>= shift; + q += (q < 0); + return q; + } +} + +int16_t libdivide_s16_do(int16_t numer, const struct libdivide_s16_t *denom) { + return libdivide_s16_do_raw(numer, denom->magic, denom->more); +} + +int16_t libdivide_s16_branchfree_do(int16_t numer, const struct libdivide_s16_branchfree_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + // must be arithmetic shift and then sign extend + int16_t sign = (int8_t)more >> 7; + int16_t magic = denom->magic; + int16_t q = libdivide_mullhi_s16(magic, numer); + q += numer; + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is a power of + // 2, or (2**shift) if it is not a power of 2 + uint16_t is_power_of_2 = (magic == 0); + uint16_t q_sign = (uint16_t)(q >> 15); + q += q_sign & (((uint16_t)1 << shift) - is_power_of_2); + + // Now arithmetic right shift + q >>= shift; + // Negate if needed + q = (q ^ sign) - sign; + + return q; +} + +int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + if (!denom->magic) { + uint16_t absD = (uint16_t)1 << shift; + if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { + absD = -absD; + } + return (int16_t)absD; + } else { + // Unsigned math is much easier + // We negate the magic number only in the branchfull case, and we don't + // know which case we're in. However we have enough information to + // determine the correct sign of the magic number. The divisor was + // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set, + // the magic number's sign is opposite that of the divisor. + // We want to compute the positive magic number. + int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR); + int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0; + + // Handle the power of 2 case (including branchfree) + if (denom->magic == 0) { + int16_t result = (uint16_t)1 << shift; + return negative_divisor ? -result : result; + } + + uint16_t d = (uint16_t)(magic_was_negated ? 
-denom->magic : denom->magic); + uint32_t n = (uint32_t)1 << (16 + shift); // this shift cannot exceed 30 + uint16_t q = (uint16_t)(n / d); + int16_t result = (int16_t)q; + result += 1; + return negative_divisor ? -result : result; + } +} + +int16_t libdivide_s16_branchfree_recover(const struct libdivide_s16_branchfree_t *denom) { + return libdivide_s16_recover((const struct libdivide_s16_t *)denom); +} + +/////////// SINT32 + +static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_internal_s32_gen( + int32_t d, int branchfree) { + if (d == 0) { + LIBDIVIDE_ERROR("divider must be != 0"); + } + + struct libdivide_s32_t result; + + // If d is a power of 2, or negative a power of 2, we have to use a shift. + // This is especially important because the magic algorithm fails for -1. + // To check if d is a power of 2 or its inverse, it suffices to check + // whether its absolute value has exactly one bit set. This works even for + // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set + // and is a power of 2. + uint32_t ud = (uint32_t)d; + uint32_t absD = (d < 0) ? -ud : ud; + uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(absD); + // check if exactly one bit is set, + // don't care if absD is 0 since that's divide by zero + if ((absD & (absD - 1)) == 0) { + // Branchfree and normal paths are exactly the same + result.magic = 0; + result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0)); + } else { + LIBDIVIDE_ASSERT(floor_log_2_d >= 1); + + uint8_t more; + // the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word + // is 0 and the high word is floor_log_2_d - 1 + uint32_t rem, proposed_m; + proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << (floor_log_2_d - 1), 0, absD, &rem); + const uint32_t e = absD - rem; + + // We are going to start with a power of floor_log_2_d - 1. + // This works if works if e < 2**floor_log_2_d. + if (!branchfree && e < ((uint32_t)1 << floor_log_2_d)) { + // This power works + more = (uint8_t)(floor_log_2_d - 1); + } else { + // We need to go one higher. This should not make proposed_m + // overflow, but it will make it negative when interpreted as an + // int32_t. + proposed_m += proposed_m; + const uint32_t twice_rem = rem + rem; + if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); + } + + proposed_m += 1; + int32_t magic = (int32_t)proposed_m; + + // Mark if we are negative. Note we only negate the magic number in the + // branchfull case. 
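+ // (The branchfree variant keeps the magic positive because + // libdivide_s32_branchfree_do applies the sign at the very end via + // (q ^ sign) - sign, while the branching path has no such final negation and + // so folds the sign into the magic itself.)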
+ if (d < 0) { + more |= LIBDIVIDE_NEGATIVE_DIVISOR; + if (!branchfree) { + magic = -magic; + } + } + + result.more = more; + result.magic = magic; + } + return result; +} + +struct libdivide_s32_t libdivide_s32_gen(int32_t d) { + return libdivide_internal_s32_gen(d, 0); +} + +struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d) { + struct libdivide_s32_t tmp = libdivide_internal_s32_gen(d, 1); + struct libdivide_s32_branchfree_t result = {tmp.magic, tmp.more}; + return result; +} + +int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + + if (!denom->magic) { + uint32_t sign = (int8_t)more >> 7; + uint32_t mask = ((uint32_t)1 << shift) - 1; + uint32_t uq = numer + ((numer >> 31) & mask); + int32_t q = (int32_t)uq; + q >>= shift; + q = (q ^ sign) - sign; + return q; + } else { + uint32_t uq = (uint32_t)libdivide_mullhi_s32(denom->magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift and then sign extend + int32_t sign = (int8_t)more >> 7; + // q += (more < 0 ? -numer : numer) + // cast required to avoid UB + uq += ((uint32_t)numer ^ sign) - sign; + } + int32_t q = (int32_t)uq; + q >>= shift; + q += (q < 0); + return q; + } +} + +int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + // must be arithmetic shift and then sign extend + int32_t sign = (int8_t)more >> 7; + int32_t magic = denom->magic; + int32_t q = libdivide_mullhi_s32(magic, numer); + q += numer; + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is a power of + // 2, or (2**shift) if it is not a power of 2 + uint32_t is_power_of_2 = (magic == 0); + uint32_t q_sign = (uint32_t)(q >> 31); + q += q_sign & (((uint32_t)1 << shift) - is_power_of_2); + + // Now arithmetic right shift + q >>= shift; + // Negate if needed + q = (q ^ sign) - sign; + + return q; +} + +int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + if (!denom->magic) { + uint32_t absD = (uint32_t)1 << shift; + if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { + absD = -absD; + } + return (int32_t)absD; + } else { + // Unsigned math is much easier + // We negate the magic number only in the branchfull case, and we don't + // know which case we're in. However we have enough information to + // determine the correct sign of the magic number. The divisor was + // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set, + // the magic number's sign is opposite that of the divisor. + // We want to compute the positive magic number. + int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR); + int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0; + + // Handle the power of 2 case (including branchfree) + if (denom->magic == 0) { + int32_t result = (uint32_t)1 << shift; + return negative_divisor ? -result : result; + } + + uint32_t d = (uint32_t)(magic_was_negated ? -denom->magic : denom->magic); + uint64_t n = (uint64_t)1 << (32 + shift); // this shift cannot exceed 30 + uint32_t q = (uint32_t)(n / d); + int32_t result = (int32_t)q; + result += 1; + return negative_divisor ? 
-result : result; + } +} + +int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t *denom) { + return libdivide_s32_recover((const struct libdivide_s32_t *)denom); +} + +///////////// SINT64 + +static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_internal_s64_gen( + int64_t d, int branchfree) { + if (d == 0) { + LIBDIVIDE_ERROR("divider must be != 0"); + } + + struct libdivide_s64_t result; + + // If d is a power of 2, or negative a power of 2, we have to use a shift. + // This is especially important because the magic algorithm fails for -1. + // To check if d is a power of 2 or its inverse, it suffices to check + // whether its absolute value has exactly one bit set. This works even for + // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set + // and is a power of 2. + uint64_t ud = (uint64_t)d; + uint64_t absD = (d < 0) ? -ud : ud; + uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(absD); + // check if exactly one bit is set, + // don't care if absD is 0 since that's divide by zero + if ((absD & (absD - 1)) == 0) { + // Branchfree and non-branchfree cases are the same + result.magic = 0; + result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0)); + } else { + // the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word + // is 0 and the high word is floor_log_2_d - 1 + uint8_t more; + uint64_t rem, proposed_m; + proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << (floor_log_2_d - 1), 0, absD, &rem); + const uint64_t e = absD - rem; + + // We are going to start with a power of floor_log_2_d - 1. + // This works if e < 2**floor_log_2_d. + if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) { + // This power works + more = (uint8_t)(floor_log_2_d - 1); + } else { + // We need to go one higher. This should not make proposed_m + // overflow, but it will make it negative when interpreted as an + // int64_t. + proposed_m += proposed_m; + const uint64_t twice_rem = rem + rem; + if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; + // note that we only set the LIBDIVIDE_NEGATIVE_DIVISOR bit if we + // also set ADD_MARKER; this is an annoying optimization that + // enables algorithm #4 to avoid the mask.
However we always set it + // in the branchfree case + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); + } + proposed_m += 1; + int64_t magic = (int64_t)proposed_m; + + // Mark if we are negative + if (d < 0) { + more |= LIBDIVIDE_NEGATIVE_DIVISOR; + if (!branchfree) { + magic = -magic; + } + } + + result.more = more; + result.magic = magic; + } + return result; +} + +struct libdivide_s64_t libdivide_s64_gen(int64_t d) { + return libdivide_internal_s64_gen(d, 0); +} + +struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d) { + struct libdivide_s64_t tmp = libdivide_internal_s64_gen(d, 1); + struct libdivide_s64_branchfree_t ret = {tmp.magic, tmp.more}; + return ret; +} + +int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + + if (!denom->magic) { // shift path + uint64_t mask = ((uint64_t)1 << shift) - 1; + uint64_t uq = numer + ((numer >> 63) & mask); + int64_t q = (int64_t)uq; + q >>= shift; + // must be arithmetic shift and then sign-extend + int64_t sign = (int8_t)more >> 7; + q = (q ^ sign) - sign; + return q; + } else { + uint64_t uq = (uint64_t)libdivide_mullhi_s64(denom->magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift and then sign extend + int64_t sign = (int8_t)more >> 7; + // q += (more < 0 ? -numer : numer) + // cast required to avoid UB + uq += ((uint64_t)numer ^ sign) - sign; + } + int64_t q = (int64_t)uq; + q >>= shift; + q += (q < 0); + return q; + } +} + +int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + // must be arithmetic shift and then sign extend + int64_t sign = (int8_t)more >> 7; + int64_t magic = denom->magic; + int64_t q = libdivide_mullhi_s64(magic, numer); + q += numer; + + // If q is non-negative, we have nothing to do. + // If q is negative, we want to add either (2**shift)-1 if d is a power of + // 2, or (2**shift) if it is not a power of 2. + uint64_t is_power_of_2 = (magic == 0); + uint64_t q_sign = (uint64_t)(q >> 63); + q += q_sign & (((uint64_t)1 << shift) - is_power_of_2); + + // Arithmetic right shift + q >>= shift; + // Negate if needed + q = (q ^ sign) - sign; + + return q; +} + +int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + if (denom->magic == 0) { // shift path + uint64_t absD = (uint64_t)1 << shift; + if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { + absD = -absD; + } + return (int64_t)absD; + } else { + // Unsigned math is much easier + int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR); + int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0; + + uint64_t d = (uint64_t)(magic_was_negated ? -denom->magic : denom->magic); + uint64_t n_hi = (uint64_t)1 << shift, n_lo = 0; + uint64_t rem_ignored; + uint64_t q = libdivide_128_div_64_to_64(n_hi, n_lo, d, &rem_ignored); + int64_t result = (int64_t)(q + 1); + if (negative_divisor) { + result = -result; + } + return result; + } +} + +int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom) { + return libdivide_s64_recover((const struct libdivide_s64_t *)denom); +} + +// Simplest possible vector type division: treat the vector type as an array +// of underlying native type. 
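+// For example, SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16) expands to a +// scalar loop over the 8 lanes of `numers`, dividing each lane with the +// matching scalar routine (libdivide_u16_do) and collecting the quotients into +// a uint16x8_t result.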
+#define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo) \ + const size_t count = sizeof(VecT) / sizeof(IntT); \ + VecT result; \ + IntT *pSource = (IntT *)&numers; \ + IntT *pTarget = (IntT *)&result; \ + for (size_t loop=0; loop64. + + // Get low and high words. x0 contains low 32 bits, x1 is high 32 bits. + uint64x2_t y = vdupq_n_u64(sy); + uint32x2_t x0 = vmovn_u64(x); + uint32x2_t y0 = vmovn_u64(y); + uint32x2_t x1 = vshrn_n_u64(x, 32); + uint32x2_t y1 = vshrn_n_u64(y, 32); + + // Compute x0*y0. + uint64x2_t x0y0 = vmull_u32(x0, y0); + uint64x2_t x0y0_hi = vshrq_n_u64(x0y0, 32); + + // Compute other intermediate products. + uint64x2_t temp = vmlal_u32(x0y0_hi, x1, y0); // temp = x0y0_hi + x1*y0; + // We want to split temp into its low 32 bits and high 32 bits, both + // in the low half of 64 bit registers. + // Use shifts to avoid needing a reg for the mask. + uint64x2_t temp_lo = vshrq_n_u64(vshlq_n_u64(temp, 32), 32); // temp_lo = temp & 0xFFFFFFFF; + uint64x2_t temp_hi = vshrq_n_u64(temp, 32); // temp_hi = temp >> 32; + + temp_lo = vmlal_u32(temp_lo, x0, y1); // temp_lo += x0*y0 + temp_lo = vshrq_n_u64(temp_lo, 32); // temp_lo >>= 32 + temp_hi = vmlal_u32(temp_hi, x1, y1); // temp_hi += x1*y1 + uint64x2_t result = vaddq_u64(temp_hi, temp_lo); + return result; +} + +static LIBDIVIDE_INLINE int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64_t sy) { + int64x2_t p = vreinterpretq_s64_u64( + libdivide_mullhi_u64_vec128(vreinterpretq_u64_s64(x), (uint64_t)(sy))); + int64x2_t y = vdupq_n_s64(sy); + int64x2_t t1 = vandq_s64(libdivide_s64_signbits(x), y); + int64x2_t t2 = vandq_s64(libdivide_s64_signbits(y), x); + p = vsubq_s64(p, t1); + p = vsubq_s64(p, t2); + return p; +} + +////////// UINT16 + +uint16x8_t libdivide_u16_do_vec128(uint16x8_t numers, const struct libdivide_u16_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16) +} + +uint16x8_t libdivide_u16_branchfree_do_vec128(uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16_branchfree) +} + +////////// UINT32 + +uint32x4_t libdivide_u32_do_vec128(uint32x4_t numers, const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return libdivide_u32_neon_srl(numers, more); + } else { + uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + // Note we can use halving-subtract to avoid the shift. 
+ uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q); + return libdivide_u32_neon_srl(t, shift); + } else { + return libdivide_u32_neon_srl(q, more); + } + } +} + +uint32x4_t libdivide_u32_branchfree_do_vec128( + uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom) { + uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic); + uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q); + return libdivide_u32_neon_srl(t, denom->more); +} + +////////// UINT64 + +uint64x2_t libdivide_u64_do_vec128(uint64x2_t numers, const struct libdivide_u64_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return libdivide_u64_neon_srl(numers, more); + } else { + uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + // No 64-bit halving subtracts in NEON :( + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q); + return libdivide_u64_neon_srl(t, shift); + } else { + return libdivide_u64_neon_srl(q, more); + } + } +} + +uint64x2_t libdivide_u64_branchfree_do_vec128( + uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom) { + uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic); + uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q); + return libdivide_u64_neon_srl(t, denom->more); +} + +////////// SINT16 + +int16x8_t libdivide_s16_do_vec128(int16x8_t numers, const struct libdivide_s16_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16) +} + +int16x8_t libdivide_s16_branchfree_do_vec128(int16x8_t numers, const struct libdivide_s16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16_branchfree) +} + +////////// SINT32 + +int32x4_t libdivide_s32_do_vec128(int32x4_t numers, const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = ((uint32_t)1 << shift) - 1; + int32x4_t roundToZeroTweak = vdupq_n_s32((int)mask); + // q = numer + ((numer >> 31) & roundToZeroTweak); + int32x4_t q = vaddq_s32(numers, vandq_s32(vshrq_n_s32(numers, 31), roundToZeroTweak)); + q = libdivide_s32_neon_sra(q, shift); + int32x4_t sign = vdupq_n_s32((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = vsubq_s32(veorq_s32(q, sign), sign); + return q; + } else { + int32x4_t q = libdivide_mullhi_s32_vec128(numers, denom->magic); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + int32x4_t sign = vdupq_n_s32((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = vaddq_s32(q, vsubq_s32(veorq_s32(numers, sign), sign)); + } + // q >>= shift + q = libdivide_s32_neon_sra(q, more & LIBDIVIDE_32_SHIFT_MASK); + q = vaddq_s32( + q, vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(q), 31))); // q += (q < 0) + return q; + } +} + +int32x4_t libdivide_s32_branchfree_do_vec128( + int32x4_t numers, const struct libdivide_s32_branchfree_t *denom) { + int32_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + // must be arithmetic shift + int32x4_t sign = vdupq_n_s32((int8_t)more >> 7); + int32x4_t q = libdivide_mullhi_s32_vec128(numers, magic); + q = vaddq_s32(q, numers); // q += numers + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is 
not a power of 2 + uint32_t is_power_of_2 = (magic == 0); + int32x4_t q_sign = vshrq_n_s32(q, 31); // q_sign = q >> 31 + int32x4_t mask = vdupq_n_s32(((uint32_t)1 << shift) - is_power_of_2); + q = vaddq_s32(q, vandq_s32(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s32_neon_sra(q, shift); // q >>= shift + q = vsubq_s32(veorq_s32(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +////////// SINT64 + +int64x2_t libdivide_s64_do_vec128(int64x2_t numers, const struct libdivide_s64_t *denom) { + uint8_t more = denom->more; + int64_t magic = denom->magic; + if (magic == 0) { // shift path + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + uint64_t mask = ((uint64_t)1 << shift) - 1; + int64x2_t roundToZeroTweak = vdupq_n_s64(mask); // TODO: no need to sign extend + // q = numer + ((numer >> 63) & roundToZeroTweak); + int64x2_t q = + vaddq_s64(numers, vandq_s64(libdivide_s64_signbits(numers), roundToZeroTweak)); + q = libdivide_s64_neon_sra(q, shift); + // q = (q ^ sign) - sign; + int64x2_t sign = vreinterpretq_s64_s8(vdupq_n_s8((int8_t)more >> 7)); + q = vsubq_s64(veorq_s64(q, sign), sign); + return q; + } else { + int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + int64x2_t sign = vdupq_n_s64((int8_t)more >> 7); // TODO: no need to widen + // q += ((numer ^ sign) - sign); + q = vaddq_s64(q, vsubq_s64(veorq_s64(numers, sign), sign)); + } + // q >>= denom->mult_path.shift + q = libdivide_s64_neon_sra(q, more & LIBDIVIDE_64_SHIFT_MASK); + q = vaddq_s64( + q, vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(q), 63))); // q += (q < 0) + return q; + } +} + +int64x2_t libdivide_s64_branchfree_do_vec128( + int64x2_t numers, const struct libdivide_s64_branchfree_t *denom) { + int64_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + // must be arithmetic shift + int64x2_t sign = vdupq_n_s64((int8_t)more >> 7); // TODO: avoid sign extend + + // libdivide_mullhi_s64(numers, magic); + int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic); + q = vaddq_s64(q, numers); // q += numers + + // If q is non-negative, we have nothing to do. + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2. 
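+ // e.g. with shift = 2 and q = -7, adding (2**2) - 1 = 3 before the arithmetic + // shift gives (-7 + 3) >> 2 = -1 (truncation toward zero) rather than + // -7 >> 2 = -2 (flooring).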
+ uint32_t is_power_of_2 = (magic == 0); + int64x2_t q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 + int64x2_t mask = vdupq_n_s64(((uint64_t)1 << shift) - is_power_of_2); + q = vaddq_s64(q, vandq_s64(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s64_neon_sra(q, shift); // q >>= shift + q = vsubq_s64(veorq_s64(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +#endif + +#if defined(LIBDIVIDE_AVX512) + +static LIBDIVIDE_INLINE __m512i libdivide_u16_do_vec512( + __m512i numers, const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s16_do_vec512( + __m512i numers, const struct libdivide_s16_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_u32_do_vec512( + __m512i numers, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s32_do_vec512( + __m512i numers, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_u64_do_vec512( + __m512i numers, const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s64_do_vec512( + __m512i numers, const struct libdivide_s64_t *denom); + +static LIBDIVIDE_INLINE __m512i libdivide_u16_branchfree_do_vec512( + __m512i numers, const struct libdivide_u16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s16_branchfree_do_vec512( + __m512i numers, const struct libdivide_s16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_u32_branchfree_do_vec512( + __m512i numers, const struct libdivide_u32_branchfree_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s32_branchfree_do_vec512( + __m512i numers, const struct libdivide_s32_branchfree_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_u64_branchfree_do_vec512( + __m512i numers, const struct libdivide_u64_branchfree_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s64_branchfree_do_vec512( + __m512i numers, const struct libdivide_s64_branchfree_t *denom); + +//////// Internal Utility Functions + +static LIBDIVIDE_INLINE __m512i libdivide_s64_signbits_vec512(__m512i v) { + ; + return _mm512_srai_epi64(v, 63); +} + +static LIBDIVIDE_INLINE __m512i libdivide_s64_shift_right_vec512(__m512i v, int amt) { + return _mm512_srai_epi64(v, amt); +} + +// Here, b is assumed to contain one 32-bit value repeated. +static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u32_vec512(__m512i a, __m512i b) { + __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epu32(a, b), 32); + __m512i a1X3X = _mm512_srli_epi64(a, 32); + __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0); + __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epu32(a1X3X, b), mask); + return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3); +} + +// b is one 32-bit value repeated. +static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s32_vec512(__m512i a, __m512i b) { + __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epi32(a, b), 32); + __m512i a1X3X = _mm512_srli_epi64(a, 32); + __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0); + __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epi32(a1X3X, b), mask); + return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3); +} + +// Here, y is assumed to contain one 64-bit value repeated. +static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u64_vec512(__m512i x, __m512i y) { + // see m128i variant for comments. 
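+ // With x = x1*2**32 + x0 and y = y1*2**32 + y0, the high half is + // x1*y1 + ((x1*y0 + x0*y1 + (x0*y0 >> 32)) >> 32); the temp_lo/temp_hi split + // below evaluates that inner sum without losing the carry past 64 bits.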
+ __m512i x0y0 = _mm512_mul_epu32(x, y); + __m512i x0y0_hi = _mm512_srli_epi64(x0y0, 32); + + __m512i x1 = _mm512_shuffle_epi32(x, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1)); + __m512i y1 = _mm512_shuffle_epi32(y, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1)); + + __m512i x0y1 = _mm512_mul_epu32(x, y1); + __m512i x1y0 = _mm512_mul_epu32(x1, y); + __m512i x1y1 = _mm512_mul_epu32(x1, y1); + + __m512i mask = _mm512_set1_epi64(0xFFFFFFFF); + __m512i temp = _mm512_add_epi64(x1y0, x0y0_hi); + __m512i temp_lo = _mm512_and_si512(temp, mask); + __m512i temp_hi = _mm512_srli_epi64(temp, 32); + + temp_lo = _mm512_srli_epi64(_mm512_add_epi64(temp_lo, x0y1), 32); + temp_hi = _mm512_add_epi64(x1y1, temp_hi); + return _mm512_add_epi64(temp_lo, temp_hi); +} + +// y is one 64-bit value repeated. +static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s64_vec512(__m512i x, __m512i y) { + __m512i p = libdivide_mullhi_u64_vec512(x, y); + __m512i t1 = _mm512_and_si512(libdivide_s64_signbits_vec512(x), y); + __m512i t2 = _mm512_and_si512(libdivide_s64_signbits_vec512(y), x); + p = _mm512_sub_epi64(p, t1); + p = _mm512_sub_epi64(p, t2); + return p; +} + +////////// UINT16 + +__m512i libdivide_u16_do_vec512(__m512i numers, const struct libdivide_u16_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16) +} + +__m512i libdivide_u16_branchfree_do_vec512(__m512i numers, const struct libdivide_u16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16_branchfree) +} + +////////// UINT32 + +__m512i libdivide_u32_do_vec512(__m512i numers, const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return _mm512_srli_epi32(numers, more); + } else { + __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q); + return _mm512_srli_epi32(t, shift); + } else { + return _mm512_srli_epi32(q, more); + } + } +} + +__m512i libdivide_u32_branchfree_do_vec512( + __m512i numers, const struct libdivide_u32_branchfree_t *denom) { + __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic)); + __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q); + return _mm512_srli_epi32(t, denom->more); +} + +////////// UINT64 + +__m512i libdivide_u64_do_vec512(__m512i numers, const struct libdivide_u64_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return _mm512_srli_epi64(numers, more); + } else { + __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q); + return _mm512_srli_epi64(t, shift); + } else { + return _mm512_srli_epi64(q, more); + } + } +} + +__m512i libdivide_u64_branchfree_do_vec512( + __m512i numers, const struct libdivide_u64_branchfree_t *denom) { + __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic)); + __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q); + return _mm512_srli_epi64(t, denom->more); +} + +////////// SINT16 + +__m512i libdivide_s16_do_vec512(__m512i numers, const struct libdivide_s16_t *denom) { + 
SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16) +} + +__m512i libdivide_s16_branchfree_do_vec512(__m512i numers, const struct libdivide_s16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16_branchfree) +} + +////////// SINT32 + +__m512i libdivide_s32_do_vec512(__m512i numers, const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = ((uint32_t)1 << shift) - 1; + __m512i roundToZeroTweak = _mm512_set1_epi32(mask); + // q = numer + ((numer >> 31) & roundToZeroTweak); + __m512i q = _mm512_add_epi32( + numers, _mm512_and_si512(_mm512_srai_epi32(numers, 31), roundToZeroTweak)); + q = _mm512_srai_epi32(q, shift); + __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign); + return q; + } else { + __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm512_add_epi32(q, _mm512_sub_epi32(_mm512_xor_si512(numers, sign), sign)); + } + // q >>= shift + q = _mm512_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); + q = _mm512_add_epi32(q, _mm512_srli_epi32(q, 31)); // q += (q < 0) + return q; + } +} + +__m512i libdivide_s32_branchfree_do_vec512( + __m512i numers, const struct libdivide_s32_branchfree_t *denom) { + int32_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + // must be arithmetic shift + __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); + __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(magic)); + q = _mm512_add_epi32(q, numers); // q += numers + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2 + uint32_t is_power_of_2 = (magic == 0); + __m512i q_sign = _mm512_srai_epi32(q, 31); // q_sign = q >> 31 + __m512i mask = _mm512_set1_epi32(((uint32_t)1 << shift) - is_power_of_2); + q = _mm512_add_epi32(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm512_srai_epi32(q, shift); // q >>= shift + q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +////////// SINT64 + +__m512i libdivide_s64_do_vec512(__m512i numers, const struct libdivide_s64_t *denom) { + uint8_t more = denom->more; + int64_t magic = denom->magic; + if (magic == 0) { // shift path + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + uint64_t mask = ((uint64_t)1 << shift) - 1; + __m512i roundToZeroTweak = _mm512_set1_epi64(mask); + // q = numer + ((numer >> 63) & roundToZeroTweak); + __m512i q = _mm512_add_epi64( + numers, _mm512_and_si512(libdivide_s64_signbits_vec512(numers), roundToZeroTweak)); + q = libdivide_s64_shift_right_vec512(q, shift); + __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); + return q; + } else { + __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm512_add_epi64(q, _mm512_sub_epi64(_mm512_xor_si512(numers, sign), sign)); + } + // q >>= denom->mult_path.shift + q = 
libdivide_s64_shift_right_vec512(q, more & LIBDIVIDE_64_SHIFT_MASK); + q = _mm512_add_epi64(q, _mm512_srli_epi64(q, 63)); // q += (q < 0) + return q; + } +} + +__m512i libdivide_s64_branchfree_do_vec512( + __m512i numers, const struct libdivide_s64_branchfree_t *denom) { + int64_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + // must be arithmetic shift + __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); + + // libdivide_mullhi_s64(numers, magic); + __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic)); + q = _mm512_add_epi64(q, numers); // q += numers + + // If q is non-negative, we have nothing to do. + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2. + uint32_t is_power_of_2 = (magic == 0); + __m512i q_sign = libdivide_s64_signbits_vec512(q); // q_sign = q >> 63 + __m512i mask = _mm512_set1_epi64(((uint64_t)1 << shift) - is_power_of_2); + q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s64_shift_right_vec512(q, shift); // q >>= shift + q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +#endif + +#if defined(LIBDIVIDE_AVX2) + +static LIBDIVIDE_INLINE __m256i libdivide_u16_do_vec256( + __m256i numers, const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s16_do_vec256( + __m256i numers, const struct libdivide_s16_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_u32_do_vec256( + __m256i numers, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s32_do_vec256( + __m256i numers, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_u64_do_vec256( + __m256i numers, const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s64_do_vec256( + __m256i numers, const struct libdivide_s64_t *denom); + +static LIBDIVIDE_INLINE __m256i libdivide_u16_branchfree_do_vec256( + __m256i numers, const struct libdivide_u16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s16_branchfree_do_vec256( + __m256i numers, const struct libdivide_s16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_u32_branchfree_do_vec256( + __m256i numers, const struct libdivide_u32_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s32_branchfree_do_vec256( + __m256i numers, const struct libdivide_s32_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_u64_branchfree_do_vec256( + __m256i numers, const struct libdivide_u64_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s64_branchfree_do_vec256( + __m256i numers, const struct libdivide_s64_branchfree_t *denom); + +//////// Internal Utility Functions + +// Implementation of _mm256_srai_epi64(v, 63) (from AVX512). +static LIBDIVIDE_INLINE __m256i libdivide_s64_signbits_vec256(__m256i v) { + __m256i hiBitsDuped = _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); + __m256i signBits = _mm256_srai_epi32(hiBitsDuped, 31); + return signBits; +} + +// Implementation of _mm256_srai_epi64 (from AVX512). +static LIBDIVIDE_INLINE __m256i libdivide_s64_shift_right_vec256(__m256i v, int amt) { + const int b = 64 - amt; + __m256i m = _mm256_set1_epi64x((uint64_t)1 << (b - 1)); + __m256i x = _mm256_srli_epi64(v, amt); + __m256i result = _mm256_sub_epi64(_mm256_xor_si256(x, m), m); + return result; +} + +// Here, b is assumed to contain one 32-bit value repeated. 
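+// _mm256_mul_epu32 multiplies only the even 32-bit lanes, producing 64-bit +// products: the even lanes' high halves come from (a*b) >> 32 and the odd +// lanes' from (a >> 32)*b masked into the upper words, so ORing the two gives +// the high half of a[i]*b in every 32-bit lane.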
+static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u32_vec256(__m256i a, __m256i b) { + __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epu32(a, b), 32); + __m256i a1X3X = _mm256_srli_epi64(a, 32); + __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0); + __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epu32(a1X3X, b), mask); + return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3); +} + +// b is one 32-bit value repeated. +static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s32_vec256(__m256i a, __m256i b) { + __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epi32(a, b), 32); + __m256i a1X3X = _mm256_srli_epi64(a, 32); + __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0); + __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epi32(a1X3X, b), mask); + return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3); +} + +// Here, y is assumed to contain one 64-bit value repeated. +static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u64_vec256(__m256i x, __m256i y) { + // see m128i variant for comments. + __m256i x0y0 = _mm256_mul_epu32(x, y); + __m256i x0y0_hi = _mm256_srli_epi64(x0y0, 32); + + __m256i x1 = _mm256_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1)); + __m256i y1 = _mm256_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1)); + + __m256i x0y1 = _mm256_mul_epu32(x, y1); + __m256i x1y0 = _mm256_mul_epu32(x1, y); + __m256i x1y1 = _mm256_mul_epu32(x1, y1); + + __m256i mask = _mm256_set1_epi64x(0xFFFFFFFF); + __m256i temp = _mm256_add_epi64(x1y0, x0y0_hi); + __m256i temp_lo = _mm256_and_si256(temp, mask); + __m256i temp_hi = _mm256_srli_epi64(temp, 32); + + temp_lo = _mm256_srli_epi64(_mm256_add_epi64(temp_lo, x0y1), 32); + temp_hi = _mm256_add_epi64(x1y1, temp_hi); + return _mm256_add_epi64(temp_lo, temp_hi); +} + +// y is one 64-bit value repeated. 
+static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y) { + __m256i p = libdivide_mullhi_u64_vec256(x, y); + __m256i t1 = _mm256_and_si256(libdivide_s64_signbits_vec256(x), y); + __m256i t2 = _mm256_and_si256(libdivide_s64_signbits_vec256(y), x); + p = _mm256_sub_epi64(p, t1); + p = _mm256_sub_epi64(p, t2); + return p; +} + +////////// UINT16 + +__m256i libdivide_u16_do_vec256(__m256i numers, const struct libdivide_u16_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, __m256i, u16) +} + +__m256i libdivide_u16_branchfree_do_vec256(__m256i numers, const struct libdivide_u16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, __m256i, u16_branchfree) +} + +////////// UINT32 + +__m256i libdivide_u32_do_vec256(__m256i numers, const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return _mm256_srli_epi32(numers, more); + } else { + __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q); + return _mm256_srli_epi32(t, shift); + } else { + return _mm256_srli_epi32(q, more); + } + } +} + +__m256i libdivide_u32_branchfree_do_vec256( + __m256i numers, const struct libdivide_u32_branchfree_t *denom) { + __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic)); + __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q); + return _mm256_srli_epi32(t, denom->more); +} + +////////// UINT64 + +__m256i libdivide_u64_do_vec256(__m256i numers, const struct libdivide_u64_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return _mm256_srli_epi64(numers, more); + } else { + __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q); + return _mm256_srli_epi64(t, shift); + } else { + return _mm256_srli_epi64(q, more); + } + } +} + +__m256i libdivide_u64_branchfree_do_vec256( + __m256i numers, const struct libdivide_u64_branchfree_t *denom) { + __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic)); + __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q); + return _mm256_srli_epi64(t, denom->more); +} + +////////// SINT16 + +__m256i libdivide_s16_do_vec256(__m256i numers, const struct libdivide_s16_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, __m256i, s16) +} + +__m256i libdivide_s16_branchfree_do_vec256(__m256i numers, const struct libdivide_s16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, __m256i, s16_branchfree) +} + +////////// SINT32 + +__m256i libdivide_s32_do_vec256(__m256i numers, const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = ((uint32_t)1 << shift) - 1; + __m256i roundToZeroTweak = _mm256_set1_epi32(mask); + // q = numer + ((numer >> 31) & roundToZeroTweak); + __m256i q = _mm256_add_epi32( + numers, _mm256_and_si256(_mm256_srai_epi32(numers, 31), roundToZeroTweak)); + q = _mm256_srai_epi32(q, shift); + __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); + // 
q = (q ^ sign) - sign; + q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); + return q; + } else { + __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm256_add_epi32(q, _mm256_sub_epi32(_mm256_xor_si256(numers, sign), sign)); + } + // q >>= shift + q = _mm256_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); + q = _mm256_add_epi32(q, _mm256_srli_epi32(q, 31)); // q += (q < 0) + return q; + } +} + +__m256i libdivide_s32_branchfree_do_vec256( + __m256i numers, const struct libdivide_s32_branchfree_t *denom) { + int32_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + // must be arithmetic shift + __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); + __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(magic)); + q = _mm256_add_epi32(q, numers); // q += numers + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2 + uint32_t is_power_of_2 = (magic == 0); + __m256i q_sign = _mm256_srai_epi32(q, 31); // q_sign = q >> 31 + __m256i mask = _mm256_set1_epi32(((uint32_t)1 << shift) - is_power_of_2); + q = _mm256_add_epi32(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm256_srai_epi32(q, shift); // q >>= shift + q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +////////// SINT64 + +__m256i libdivide_s64_do_vec256(__m256i numers, const struct libdivide_s64_t *denom) { + uint8_t more = denom->more; + int64_t magic = denom->magic; + if (magic == 0) { // shift path + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + uint64_t mask = ((uint64_t)1 << shift) - 1; + __m256i roundToZeroTweak = _mm256_set1_epi64x(mask); + // q = numer + ((numer >> 63) & roundToZeroTweak); + __m256i q = _mm256_add_epi64( + numers, _mm256_and_si256(libdivide_s64_signbits_vec256(numers), roundToZeroTweak)); + q = libdivide_s64_shift_right_vec256(q, shift); + __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); + return q; + } else { + __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm256_add_epi64(q, _mm256_sub_epi64(_mm256_xor_si256(numers, sign), sign)); + } + // q >>= denom->mult_path.shift + q = libdivide_s64_shift_right_vec256(q, more & LIBDIVIDE_64_SHIFT_MASK); + q = _mm256_add_epi64(q, _mm256_srli_epi64(q, 63)); // q += (q < 0) + return q; + } +} + +__m256i libdivide_s64_branchfree_do_vec256( + __m256i numers, const struct libdivide_s64_branchfree_t *denom) { + int64_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + // must be arithmetic shift + __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); + + // libdivide_mullhi_s64(numers, magic); + __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic)); + q = _mm256_add_epi64(q, numers); // q += numers + + // If q is non-negative, we have nothing to do. + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2. 
+ uint32_t is_power_of_2 = (magic == 0); + __m256i q_sign = libdivide_s64_signbits_vec256(q); // q_sign = q >> 63 + __m256i mask = _mm256_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2); + q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s64_shift_right_vec256(q, shift); // q >>= shift + q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +#endif + +#if defined(LIBDIVIDE_SSE2) + +static LIBDIVIDE_INLINE __m128i libdivide_u16_do_vec128( + __m128i numers, const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s16_do_vec128( + __m128i numers, const struct libdivide_s16_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u32_do_vec128( + __m128i numers, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s32_do_vec128( + __m128i numers, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u64_do_vec128( + __m128i numers, const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s64_do_vec128( + __m128i numers, const struct libdivide_s64_t *denom); + +static LIBDIVIDE_INLINE __m128i libdivide_u16_branchfree_do_vec128( + __m128i numers, const struct libdivide_u16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s16_branchfree_do_vec128( + __m128i numers, const struct libdivide_s16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u32_branchfree_do_vec128( + __m128i numers, const struct libdivide_u32_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s32_branchfree_do_vec128( + __m128i numers, const struct libdivide_s32_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u64_branchfree_do_vec128( + __m128i numers, const struct libdivide_u64_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s64_branchfree_do_vec128( + __m128i numers, const struct libdivide_s64_branchfree_t *denom); + +//////// Internal Utility Functions + +// Implementation of _mm_srai_epi64(v, 63) (from AVX512). +static LIBDIVIDE_INLINE __m128i libdivide_s64_signbits_vec128(__m128i v) { + __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); + __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31); + return signBits; +} + +// Implementation of _mm_srai_epi64 (from AVX512). +static LIBDIVIDE_INLINE __m128i libdivide_s64_shift_right_vec128(__m128i v, int amt) { + const int b = 64 - amt; + __m128i m = _mm_set1_epi64x((uint64_t)1 << (b - 1)); + __m128i x = _mm_srli_epi64(v, amt); + __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m); + return result; +} + +// Here, b is assumed to contain one 32-bit value repeated. +static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u32_vec128(__m128i a, __m128i b) { + __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32); + __m128i a1X3X = _mm_srli_epi64(a, 32); + __m128i mask = _mm_set_epi32(-1, 0, -1, 0); + __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epu32(a1X3X, b), mask); + return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3); +} + +// SSE2 does not have a signed multiplication instruction, but we can convert +// unsigned to signed pretty efficiently. Again, b is just a 32 bit value +// repeated four times. 
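+// Concretely, for two's-complement inputs +// mullhi_s32(a, b) == mullhi_u32(a, b) - ((a >> 31) & b) - ((b >> 31) & a) +// (arithmetic shifts), which is the srai/and/sub sequence below.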
+static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s32_vec128(__m128i a, __m128i b) { + __m128i p = libdivide_mullhi_u32_vec128(a, b); + // t1 = (a >> 31) & y, arithmetic shift + __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b); + __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a); + p = _mm_sub_epi32(p, t1); + p = _mm_sub_epi32(p, t2); + return p; +} + +// Here, y is assumed to contain one 64-bit value repeated. +static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u64_vec128(__m128i x, __m128i y) { + // full 128 bits product is: + // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64) + // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64. + + // Compute x0*y0. + // Note x1, y1 are ignored by mul_epu32. + __m128i x0y0 = _mm_mul_epu32(x, y); + __m128i x0y0_hi = _mm_srli_epi64(x0y0, 32); + + // Get x1, y1 in the low bits. + // We could shuffle or right shift. Shuffles are preferred as they preserve + // the source register for the next computation. + __m128i x1 = _mm_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1)); + __m128i y1 = _mm_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1)); + + // No need to mask off top 32 bits for mul_epu32. + __m128i x0y1 = _mm_mul_epu32(x, y1); + __m128i x1y0 = _mm_mul_epu32(x1, y); + __m128i x1y1 = _mm_mul_epu32(x1, y1); + + // Mask here selects low bits only. + __m128i mask = _mm_set1_epi64x(0xFFFFFFFF); + __m128i temp = _mm_add_epi64(x1y0, x0y0_hi); + __m128i temp_lo = _mm_and_si128(temp, mask); + __m128i temp_hi = _mm_srli_epi64(temp, 32); + + temp_lo = _mm_srli_epi64(_mm_add_epi64(temp_lo, x0y1), 32); + temp_hi = _mm_add_epi64(x1y1, temp_hi); + return _mm_add_epi64(temp_lo, temp_hi); +} + +// y is one 64-bit value repeated. +static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y) { + __m128i p = libdivide_mullhi_u64_vec128(x, y); + __m128i t1 = _mm_and_si128(libdivide_s64_signbits_vec128(x), y); + __m128i t2 = _mm_and_si128(libdivide_s64_signbits_vec128(y), x); + p = _mm_sub_epi64(p, t1); + p = _mm_sub_epi64(p, t2); + return p; +} + +////////// UINT16 + +__m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, __m128i, u16) +} + +__m128i libdivide_u16_branchfree_do_vec128(__m128i numers, const struct libdivide_u16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, __m128i, u16_branchfree) +} + +////////// UINT32 + +__m128i libdivide_u32_do_vec128(__m128i numers, const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return _mm_srli_epi32(numers, more); + } else { + __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); + return _mm_srli_epi32(t, shift); + } else { + return _mm_srli_epi32(q, more); + } + } +} + +__m128i libdivide_u32_branchfree_do_vec128( + __m128i numers, const struct libdivide_u32_branchfree_t *denom) { + __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic)); + __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); + return _mm_srli_epi32(t, denom->more); +} + +////////// UINT64 + +__m128i libdivide_u64_do_vec128(__m128i numers, const struct libdivide_u64_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return _mm_srli_epi64(numers, more); + } else { + __m128i q =
libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); + return _mm_srli_epi64(t, shift); + } else { + return _mm_srli_epi64(q, more); + } + } +} + +__m128i libdivide_u64_branchfree_do_vec128( + __m128i numers, const struct libdivide_u64_branchfree_t *denom) { + __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic)); + __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); + return _mm_srli_epi64(t, denom->more); +} + +////////// SINT16 + +__m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, __m128i, s16) +} + +__m128i libdivide_s16_branchfree_do_vec128(__m128i numers, const struct libdivide_s16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, __m128i, s16_branchfree) +} + +////////// SINT32 + +__m128i libdivide_s32_do_vec128(__m128i numers, const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = ((uint32_t)1 << shift) - 1; + __m128i roundToZeroTweak = _mm_set1_epi32(mask); + // q = numer + ((numer >> 31) & roundToZeroTweak); + __m128i q = + _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); + q = _mm_srai_epi32(q, shift); + __m128i sign = _mm_set1_epi32((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); + return q; + } else { + __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m128i sign = _mm_set1_epi32((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign)); + } + // q >>= shift + q = _mm_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); + q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0) + return q; + } +} + +__m128i libdivide_s32_branchfree_do_vec128( + __m128i numers, const struct libdivide_s32_branchfree_t *denom) { + int32_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + // must be arithmetic shift + __m128i sign = _mm_set1_epi32((int8_t)more >> 7); + __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(magic)); + q = _mm_add_epi32(q, numers); // q += numers + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2 + uint32_t is_power_of_2 = (magic == 0); + __m128i q_sign = _mm_srai_epi32(q, 31); // q_sign = q >> 31 + __m128i mask = _mm_set1_epi32(((uint32_t)1 << shift) - is_power_of_2); + q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm_srai_epi32(q, shift); // q >>= shift + q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +////////// SINT64 + +__m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *denom) { + uint8_t more = denom->more; + int64_t magic = denom->magic; + if (magic == 0) { // shift path + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + uint64_t mask = ((uint64_t)1 << shift) - 1; + __m128i roundToZeroTweak = _mm_set1_epi64x(mask); + // q = numer + ((numer >> 
63) & roundToZeroTweak); + __m128i q = + _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak)); + q = libdivide_s64_shift_right_vec128(q, shift); + __m128i sign = _mm_set1_epi32((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); + return q; + } else { + __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m128i sign = _mm_set1_epi32((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign)); + } + // q >>= denom->mult_path.shift + q = libdivide_s64_shift_right_vec128(q, more & LIBDIVIDE_64_SHIFT_MASK); + q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0) + return q; + } +} + +__m128i libdivide_s64_branchfree_do_vec128( + __m128i numers, const struct libdivide_s64_branchfree_t *denom) { + int64_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + // must be arithmetic shift + __m128i sign = _mm_set1_epi32((int8_t)more >> 7); + + // libdivide_mullhi_s64(numers, magic); + __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic)); + q = _mm_add_epi64(q, numers); // q += numers + + // If q is non-negative, we have nothing to do. + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2. + uint32_t is_power_of_2 = (magic == 0); + __m128i q_sign = libdivide_s64_signbits_vec128(q); // q_sign = q >> 63 + __m128i mask = _mm_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2); + q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s64_shift_right_vec128(q, shift); // q >>= shift + q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +#endif + +/////////// C++ stuff + +#ifdef __cplusplus + +namespace libdivide { + +enum Branching { + BRANCHFULL, // use branching algorithms + BRANCHFREE // use branchfree algorithms +}; + +#if defined(LIBDIVIDE_NEON) +// Helper to deduce NEON vector type for integral type. +template <typename T> +struct NeonVecFor {}; + +template <> +struct NeonVecFor<uint16_t> { + typedef uint16x8_t type; +}; + +template <> +struct NeonVecFor<int16_t> { + typedef int16x8_t type; +}; + +template <> +struct NeonVecFor<uint32_t> { + typedef uint32x4_t type; +}; + +template <> +struct NeonVecFor<int32_t> { + typedef int32x4_t type; +}; + +template <> +struct NeonVecFor<uint64_t> { + typedef uint64x2_t type; +}; + +template <> +struct NeonVecFor<int64_t> { + typedef int64x2_t type; +}; +#endif + +// Versions of our algorithms for SIMD.
+#if defined(LIBDIVIDE_NEON) +#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) \ + LIBDIVIDE_INLINE typename NeonVecFor<INT_TYPE>::type divide( \ + typename NeonVecFor<INT_TYPE>::type n) const { \ + return libdivide_##ALGO##_do_vec128(n, &denom); \ + } +#else +#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) +#endif +#if defined(LIBDIVIDE_SSE2) +#define LIBDIVIDE_DIVIDE_SSE2(ALGO) \ + LIBDIVIDE_INLINE __m128i divide(__m128i n) const { \ + return libdivide_##ALGO##_do_vec128(n, &denom); \ + } +#else +#define LIBDIVIDE_DIVIDE_SSE2(ALGO) +#endif + +#if defined(LIBDIVIDE_AVX2) +#define LIBDIVIDE_DIVIDE_AVX2(ALGO) \ + LIBDIVIDE_INLINE __m256i divide(__m256i n) const { \ + return libdivide_##ALGO##_do_vec256(n, &denom); \ + } +#else +#define LIBDIVIDE_DIVIDE_AVX2(ALGO) +#endif + +#if defined(LIBDIVIDE_AVX512) +#define LIBDIVIDE_DIVIDE_AVX512(ALGO) \ + LIBDIVIDE_INLINE __m512i divide(__m512i n) const { \ + return libdivide_##ALGO##_do_vec512(n, &denom); \ + } +#else +#define LIBDIVIDE_DIVIDE_AVX512(ALGO) +#endif + +// The DISPATCHER_GEN() macro generates C++ methods (for the given integer +// and algorithm types) that redirect to libdivide's C API. +#define DISPATCHER_GEN(T, ALGO) \ + libdivide_##ALGO##_t denom; \ + LIBDIVIDE_INLINE dispatcher() {} \ + LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {} \ + LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \ + LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \ + LIBDIVIDE_DIVIDE_NEON(ALGO, T) \ + LIBDIVIDE_DIVIDE_SSE2(ALGO) \ + LIBDIVIDE_DIVIDE_AVX2(ALGO) \ + LIBDIVIDE_DIVIDE_AVX512(ALGO) + +// The dispatcher selects a specific division algorithm for a given +// type and ALGO using partial template specialization. +template <typename T, Branching ALGO> +struct dispatcher {}; + +template <> +struct dispatcher<int16_t, BRANCHFULL> { + DISPATCHER_GEN(int16_t, s16) +}; +template <> +struct dispatcher<int16_t, BRANCHFREE> { + DISPATCHER_GEN(int16_t, s16_branchfree) +}; +template <> +struct dispatcher<uint16_t, BRANCHFULL> { + DISPATCHER_GEN(uint16_t, u16) +}; +template <> +struct dispatcher<uint16_t, BRANCHFREE> { + DISPATCHER_GEN(uint16_t, u16_branchfree) +}; +template <> +struct dispatcher<int32_t, BRANCHFULL> { + DISPATCHER_GEN(int32_t, s32) +}; +template <> +struct dispatcher<int32_t, BRANCHFREE> { + DISPATCHER_GEN(int32_t, s32_branchfree) +}; +template <> +struct dispatcher<uint32_t, BRANCHFULL> { + DISPATCHER_GEN(uint32_t, u32) +}; +template <> +struct dispatcher<uint32_t, BRANCHFREE> { + DISPATCHER_GEN(uint32_t, u32_branchfree) +}; +template <> +struct dispatcher<int64_t, BRANCHFULL> { + DISPATCHER_GEN(int64_t, s64) +}; +template <> +struct dispatcher<int64_t, BRANCHFREE> { + DISPATCHER_GEN(int64_t, s64_branchfree) +}; +template <> +struct dispatcher<uint64_t, BRANCHFULL> { + DISPATCHER_GEN(uint64_t, u64) +}; +template <> +struct dispatcher<uint64_t, BRANCHFREE> { + DISPATCHER_GEN(uint64_t, u64_branchfree) +}; + +// This is the main divider class for use by the user (C++ API). +// The actual division algorithm is selected using the dispatcher struct +// based on the integer and algorithm template parameters. +template <typename T, Branching ALGO = BRANCHFULL> +class divider { + private: + typedef dispatcher<T, ALGO> dispatcher_t; + + public: + // We leave the default constructor empty so that creating + // an array of dividers and then initializing them + // later doesn't slow us down. + divider() {} + + // Constructor that takes the divisor as a parameter + LIBDIVIDE_INLINE divider(T d) : div(d) {} + + // Divides n by the divisor + LIBDIVIDE_INLINE T divide(T n) const { return div.divide(n); } + + // Recovers the divisor, returns the value that was + // used to initialize this divider object.
+ T recover() const { return div.recover(); } + + bool operator==(const divider &other) const { + return div.denom.magic == other.denom.magic && div.denom.more == other.denom.more; + } + + bool operator!=(const divider &other) const { return !(*this == other); } + + // Vector variants treat the input as packed integer values with the same type as the divider + // (e.g. s32, u32, s64, u64) and divides each of them by the divider, returning the packed + // quotients. +#if defined(LIBDIVIDE_SSE2) + LIBDIVIDE_INLINE __m128i divide(__m128i n) const { return div.divide(n); } +#endif +#if defined(LIBDIVIDE_AVX2) + LIBDIVIDE_INLINE __m256i divide(__m256i n) const { return div.divide(n); } +#endif +#if defined(LIBDIVIDE_AVX512) + LIBDIVIDE_INLINE __m512i divide(__m512i n) const { return div.divide(n); } +#endif +#if defined(LIBDIVIDE_NEON) + LIBDIVIDE_INLINE typename NeonVecFor<T>::type divide(typename NeonVecFor<T>::type n) const { + return div.divide(n); + } +#endif + + private: + // Storage for the actual divisor + dispatcher_t div; +}; + +// Overload of operator / for scalar division +template <typename T, Branching ALGO> +LIBDIVIDE_INLINE T operator/(T n, const divider<T, ALGO> &div) { + return div.divide(n); +} + +// Overload of operator /= for scalar division +template <typename T, Branching ALGO> +LIBDIVIDE_INLINE T &operator/=(T &n, const divider<T, ALGO> &div) { + n = div.divide(n); + return n; +} + +// Overloads for vector types. +#if defined(LIBDIVIDE_SSE2) +template <typename T, Branching ALGO> +LIBDIVIDE_INLINE __m128i operator/(__m128i n, const divider<T, ALGO> &div) { + return div.divide(n); +} + +template <typename T, Branching ALGO> +LIBDIVIDE_INLINE __m128i operator/=(__m128i &n, const divider<T, ALGO> &div) { + n = div.divide(n); + return n; +} +#endif +#if defined(LIBDIVIDE_AVX2) +template <typename T, Branching ALGO> +LIBDIVIDE_INLINE __m256i operator/(__m256i n, const divider<T, ALGO> &div) { + return div.divide(n); +} + +template <typename T, Branching ALGO> +LIBDIVIDE_INLINE __m256i operator/=(__m256i &n, const divider<T, ALGO> &div) { + n = div.divide(n); + return n; +} +#endif +#if defined(LIBDIVIDE_AVX512) +template <typename T, Branching ALGO> +LIBDIVIDE_INLINE __m512i operator/(__m512i n, const divider<T, ALGO> &div) { + return div.divide(n); +} + +template <typename T, Branching ALGO> +LIBDIVIDE_INLINE __m512i operator/=(__m512i &n, const divider<T, ALGO> &div) { + n = div.divide(n); + return n; +} +#endif + +#if defined(LIBDIVIDE_NEON) +template <typename T, Branching ALGO> +LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/(typename NeonVecFor<T>::type n, const divider<T, ALGO> &div) { + return div.divide(n); +} + +template <typename T, Branching ALGO> +LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/=(typename NeonVecFor<T>::type &n, const divider<T, ALGO> &div) { + n = div.divide(n); + return n; +} +#endif + +#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900) +// libdivide::branchfree_divider +template <typename T> +using branchfree_divider = divider<T, BRANCHFREE>; +#endif + +} // namespace libdivide + +#endif // __cplusplus + +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +#endif // LIBDIVIDE_H diff --git a/src/hardened_malloc/util.c b/src/hardened_malloc/util.c new file mode 100644 index 0000000..a3d6f0c --- /dev/null +++ b/src/hardened_malloc/util.c @@ -0,0 +1,41 @@ +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +#include <unistd.h> + +#ifdef __ANDROID__ +#include <async_safe/log.h> +#endif + +#include "util.h" + +#ifndef __ANDROID__ +static int write_full(int fd, const char *buf, size_t length) { + do { + ssize_t bytes_written = write(fd, buf, length); + if (bytes_written == -1) { + if (errno == EINTR) { + continue; + } + return -1; + } + buf += bytes_written; + length -= bytes_written; + } while (length); + + return 0; +} +#endif + +COLD noreturn void fatal_error(const char *s) { +#ifdef __ANDROID__ + async_safe_fatal("hardened_malloc: fatal allocator error: %s", s); +#else + const
char *prefix = "fatal allocator error: "; + (void)(write_full(STDERR_FILENO, prefix, strlen(prefix)) != -1 && + write_full(STDERR_FILENO, s, strlen(s)) != -1 && + write_full(STDERR_FILENO, "\n", 1)); + abort(); +#endif +} diff --git a/src/hardened_malloc/util.h b/src/hardened_malloc/util.h new file mode 100644 index 0000000..fc22c23 --- /dev/null +++ b/src/hardened_malloc/util.h @@ -0,0 +1,88 @@ +#ifndef UTIL_H +#define UTIL_H + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +// C11 noreturn doesn't work in C++ +#define noreturn __attribute__((noreturn)) + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#define min(x, y) ({ \ + __typeof__(x) _x = (x); \ + __typeof__(y) _y = (y); \ + (void) (&_x == &_y); \ + _x < _y ? _x : _y; }) + +#define max(x, y) ({ \ + __typeof__(x) _x = (x); \ + __typeof__(y) _y = (y); \ + (void) (&_x == &_y); \ + _x > _y ? _x : _y; }) + +#define COLD __attribute__((cold)) +#define UNUSED __attribute__((unused)) +#define EXPORT __attribute__((visibility("default"))) + +#define STRINGIFY(s) #s +#define ALIAS(f) __attribute__((alias(STRINGIFY(f)))) + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +typedef unsigned __int128 u128; + +#define U64_WIDTH 64 + +static inline int ffz64(u64 x) { + return __builtin_ffsll(~x); +} + +// parameter must not be 0 +static inline int clz64(u64 x) { + return __builtin_clzll(x); +} + +// parameter must not be 0 +static inline u64 log2u64(u64 x) { + return U64_WIDTH - clz64(x) - 1; +} + +static inline size_t align(size_t size, size_t align) { + size_t mask = align - 1; + return (size + mask) & ~mask; +} + +// u4_arr_{set,get} are helper functions for using u8 array as an array of unsigned 4-bit values. + +// val is treated as a 4-bit value +static inline void u4_arr_set(u8 *arr, size_t idx, u8 val) { + size_t off = idx >> 1; + size_t shift = (idx & 1) << 2; + u8 mask = (u8) (0xf0 >> shift); + arr[off] = (arr[off] & mask) | (val << shift); +} + +static inline u8 u4_arr_get(const u8 *arr, size_t idx) { + size_t off = idx >> 1; + size_t shift = (idx & 1) << 2; + return (u8) ((arr[off] >> shift) & 0xf); +} + +COLD noreturn void fatal_error(const char *s); + +#if CONFIG_SEAL_METADATA + +#ifdef __GLIBC__ +#define USE_PKEY +#else +#error "CONFIG_SEAL_METADATA requires Memory Protection Key support" +#endif + +#endif // CONFIG_SEAL_METADATA + +#endif diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..b204ba2 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,78 @@ +#![no_std] + +use core::ffi::{c_void, c_int}; + +extern crate libc; + +extern "C" { + /* + TODO: implement this + + #ifdef __ANDROID__ +#define H_MALLOC_USABLE_SIZE_CONST const +#else +#define H_MALLOC_USABLE_SIZE_CONST +#endif + + for: + // glibc extensions +size_t h_malloc_usable_size(H_MALLOC_USABLE_SIZE_CONST void *ptr); + */ + + /* C standard */ + + pub fn h_malloc(size: usize) -> *mut c_void; + pub fn h_calloc(nmemb: usize, size: usize) -> *mut c_void; + pub fn h_realloc(ptr: *mut c_void, size: usize) -> *mut c_void; + pub fn h_aligned_alloc(alignment: usize, size: usize) -> *mut c_void; + pub fn h_free(ptr: *mut c_void); + + /* POSIX */ + + pub fn h_posix_memalign(memptr: *mut *mut c_void, alignment: usize, size: usize) -> c_int; + + /* glibc extensions */ + + pub fn h_malloc_usable_size(ptr: *const c_void) -> usize; + pub fn h_mallopt(param: c_int, value: c_int) -> c_int; + pub fn h_malloc_trim(pad: usize) -> c_int; + pub fn h_malloc_stats(); + + /* obsolete glibc
extensions */ + + pub fn h_memalign(alignment: usize, size: usize) -> *mut c_void; + pub fn h_pvalloc(size: usize) -> *mut c_void; + pub fn h_cfree(ptr: *mut c_void); + pub fn h_malloc_get_state() -> *mut c_void; + pub fn h_malloc_set_state(ptr: *mut c_void) -> c_int; + + /*TODO: implement this see the top: + #if defined(__GLIBC__) || defined(__ANDROID__) +struct mallinfo h_mallinfo(void); +#endif +#ifndef __ANDROID__ +int h_malloc_info(int options, FILE *fp); +#endif + */ + + /* hardened_malloc extensions */ + + /// return an upper bound on object size for any pointer based on malloc metadata + pub fn h_malloc_object_size(ptr: *const c_void) -> usize; + + /// similar to malloc_object_size, but avoids locking so the results are much more limited + pub fn h_malloc_object_size_fast(ptr: *const c_void) -> usize; + + + /// The free function with an extra parameter for passing the size requested at + /// allocation time. + /// + /// This offers the same functionality as C++14 sized deallocation and can be + /// used to implement it. + /// + /// A performance-oriented allocator would use this as a performance + /// enhancement with undefined behavior on a mismatch. Instead, this hardened + /// allocator implementation uses it to improve security by checking that the + /// passed size matches the allocated size. + pub fn h_free_sized(ptr: *mut c_void, expected_size: usize); +}
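
For reviewers, the sketch below shows one way a downstream crate could consume these raw bindings by plugging them into Rust's GlobalAlloc interface. It is illustrative only and not part of this patch: the crate name hardened_malloc_sys (derived from the package name), the HardenedAlloc type, and the alignment handling are assumptions; the only items taken from src/lib.rs above are h_posix_memalign and h_free_sized.

    use core::alloc::{GlobalAlloc, Layout};
    use core::ffi::c_void;
    use core::mem::size_of;
    use core::ptr;

    // Assumed crate name for the bindings declared in src/lib.rs above.
    use hardened_malloc_sys::{h_free_sized, h_posix_memalign};

    pub struct HardenedAlloc;

    unsafe impl GlobalAlloc for HardenedAlloc {
        unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
            // posix_memalign needs the alignment to be a power of two (guaranteed
            // by Layout) and at least the size of a pointer, so round it up.
            let align = layout.align().max(size_of::<*mut c_void>());
            let mut out: *mut c_void = ptr::null_mut();
            if h_posix_memalign(&mut out, align, layout.size()) != 0 {
                return ptr::null_mut();
            }
            out.cast()
        }

        unsafe fn dealloc(&self, p: *mut u8, layout: Layout) {
            // Passing the layout size lets hardened_malloc check it against the
            // size it recorded for this allocation (see h_free_sized above).
            h_free_sized(p.cast(), layout.size());
        }
    }

    #[global_allocator]
    static GLOBAL: HardenedAlloc = HardenedAlloc;

Routing every allocation through h_posix_memalign keeps the sketch short, since it accepts any power-of-two alignment without the size-multiple requirement of aligned_alloc; a real wrapper would likely fast-path small alignments through h_malloc and also implement realloc and alloc_zeroed on top of h_realloc and h_calloc.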