From c8de762782141e9068a372be879d45fd366ee3d9 Mon Sep 17 00:00:00 2001 From: girlbossceo Date: Sun, 12 Nov 2023 16:28:12 -0500 Subject: [PATCH] initial commit of hardened_malloc-sys Signed-off-by: girlbossceo --- .gitignore | 3 + .gitmodules | 3 + CODE_OF_CONDUCT.md | 134 + Cargo.toml | 30 + LICENCE | 202 ++ LICENCE_GRAPHENEOS | 19 + README.md | 10 + build.rs | 61 + hardened_malloc_sources.txt | 16 + src/hardened_malloc/.clang-tidy | 2 + src/hardened_malloc/.github/dependabot.yml | 7 + .../.github/workflows/build-and-test.yml | 53 + src/hardened_malloc/.gitignore | 2 + src/hardened_malloc/Android.bp | 83 + src/hardened_malloc/CREDITS | 283 ++ .../KERNEL_FEATURE_WISHLIST.md | 35 + src/hardened_malloc/LICENSE | 19 + src/hardened_malloc/Makefile | 148 + src/hardened_malloc/README.md | 1037 ++++++ src/hardened_malloc/androidtest/Android.bp | 25 + .../androidtest/AndroidTest.xml | 13 + .../androidtest/memtag/Android.bp | 16 + .../androidtest/memtag/memtag_test.cc | 297 ++ .../src/grapheneos/hmalloc/MemtagTest.java | 95 + src/hardened_malloc/arm_mte.h | 91 + src/hardened_malloc/calculate_waste.py | 81 + src/hardened_malloc/chacha.c | 177 + src/hardened_malloc/chacha.h | 17 + src/hardened_malloc/config/default.mk | 23 + src/hardened_malloc/config/light.mk | 23 + src/hardened_malloc/h_malloc.c | 2190 ++++++++++++ src/hardened_malloc/include/h_malloc.h | 129 + src/hardened_malloc/memory.c | 120 + src/hardened_malloc/memory.h | 29 + src/hardened_malloc/memtag.h | 49 + src/hardened_malloc/mutex.h | 28 + src/hardened_malloc/new.cc | 153 + src/hardened_malloc/pages.c | 88 + src/hardened_malloc/pages.h | 32 + src/hardened_malloc/preload.sh | 6 + src/hardened_malloc/random.c | 128 + src/hardened_malloc/random.h | 25 + src/hardened_malloc/test/.gitignore | 44 + src/hardened_malloc/test/Makefile | 76 + src/hardened_malloc/test/__init__.py | 0 .../test/delete_type_size_mismatch.cc | 14 + src/hardened_malloc/test/double_free_large.c | 13 + .../test/double_free_large_delayed.c | 18 + src/hardened_malloc/test/double_free_small.c | 13 + .../test/double_free_small_delayed.c | 18 + .../test/impossibly_large_malloc.c | 8 + .../test/invalid_free_protected.c | 15 + .../test/invalid_free_small_region.c | 13 + .../test/invalid_free_small_region_far.c | 13 + .../test/invalid_free_unprotected.c | 15 + .../test/invalid_malloc_object_size_small.c | 15 + ...alid_malloc_object_size_small_quarantine.c | 15 + .../test/invalid_malloc_usable_size_small.c | 13 + ...alid_malloc_usable_size_small_quarantine.c | 13 + src/hardened_malloc/test/large_array_growth.c | 18 + src/hardened_malloc/test/mallinfo.c | 44 + src/hardened_malloc/test/mallinfo2.c | 44 + src/hardened_malloc/test/malloc_info.c | 35 + src/hardened_malloc/test/malloc_object_size.c | 12 + .../test/malloc_object_size_offset.c | 12 + src/hardened_malloc/test/offset.c | 50 + .../test/overflow_large_1_byte.c | 15 + .../test/overflow_large_8_byte.c | 15 + .../test/overflow_small_1_byte.c | 15 + .../test/overflow_small_8_byte.c | 16 + .../test/read_after_free_large.c | 21 + .../test/read_after_free_small.c | 21 + src/hardened_malloc/test/read_zero_size.c | 13 + src/hardened_malloc/test/realloc_init.c | 33 + src/hardened_malloc/test/string_overflow.c | 20 + src/hardened_malloc/test/test_smc.py | 242 ++ src/hardened_malloc/test/test_util.h | 10 + .../test/unaligned_free_large.c | 12 + .../test/unaligned_free_small.c | 12 + .../test/unaligned_malloc_usable_size_small.c | 12 + src/hardened_malloc/test/uninitialized_free.c | 8 + .../test/uninitialized_malloc_usable_size.c | 8 + 
.../test/uninitialized_read_large.c | 14 + .../test/uninitialized_read_small.c | 14 + .../test/uninitialized_realloc.c | 11 + .../test/write_after_free_large.c | 13 + .../test/write_after_free_large_reuse.c | 16 + .../test/write_after_free_small.c | 19 + .../test/write_after_free_small_reuse.c | 21 + src/hardened_malloc/test/write_zero_size.c | 12 + src/hardened_malloc/third_party/libdivide.h | 3126 +++++++++++++++++ src/hardened_malloc/util.c | 41 + src/hardened_malloc/util.h | 88 + src/lib.rs | 78 + 94 files changed, 10439 insertions(+) create mode 100644 .gitignore create mode 100644 .gitmodules create mode 100644 CODE_OF_CONDUCT.md create mode 100644 Cargo.toml create mode 100644 LICENCE create mode 100644 LICENCE_GRAPHENEOS create mode 100644 README.md create mode 100644 build.rs create mode 100644 hardened_malloc_sources.txt create mode 100644 src/hardened_malloc/.clang-tidy create mode 100644 src/hardened_malloc/.github/dependabot.yml create mode 100644 src/hardened_malloc/.github/workflows/build-and-test.yml create mode 100644 src/hardened_malloc/.gitignore create mode 100644 src/hardened_malloc/Android.bp create mode 100644 src/hardened_malloc/CREDITS create mode 100644 src/hardened_malloc/KERNEL_FEATURE_WISHLIST.md create mode 100644 src/hardened_malloc/LICENSE create mode 100644 src/hardened_malloc/Makefile create mode 100644 src/hardened_malloc/README.md create mode 100644 src/hardened_malloc/androidtest/Android.bp create mode 100644 src/hardened_malloc/androidtest/AndroidTest.xml create mode 100644 src/hardened_malloc/androidtest/memtag/Android.bp create mode 100644 src/hardened_malloc/androidtest/memtag/memtag_test.cc create mode 100644 src/hardened_malloc/androidtest/src/grapheneos/hmalloc/MemtagTest.java create mode 100644 src/hardened_malloc/arm_mte.h create mode 100755 src/hardened_malloc/calculate_waste.py create mode 100644 src/hardened_malloc/chacha.c create mode 100644 src/hardened_malloc/chacha.h create mode 100644 src/hardened_malloc/config/default.mk create mode 100644 src/hardened_malloc/config/light.mk create mode 100644 src/hardened_malloc/h_malloc.c create mode 100644 src/hardened_malloc/include/h_malloc.h create mode 100644 src/hardened_malloc/memory.c create mode 100644 src/hardened_malloc/memory.h create mode 100644 src/hardened_malloc/memtag.h create mode 100644 src/hardened_malloc/mutex.h create mode 100644 src/hardened_malloc/new.cc create mode 100644 src/hardened_malloc/pages.c create mode 100644 src/hardened_malloc/pages.h create mode 100755 src/hardened_malloc/preload.sh create mode 100644 src/hardened_malloc/random.c create mode 100644 src/hardened_malloc/random.h create mode 100644 src/hardened_malloc/test/.gitignore create mode 100644 src/hardened_malloc/test/Makefile create mode 100644 src/hardened_malloc/test/__init__.py create mode 100644 src/hardened_malloc/test/delete_type_size_mismatch.cc create mode 100644 src/hardened_malloc/test/double_free_large.c create mode 100644 src/hardened_malloc/test/double_free_large_delayed.c create mode 100644 src/hardened_malloc/test/double_free_small.c create mode 100644 src/hardened_malloc/test/double_free_small_delayed.c create mode 100644 src/hardened_malloc/test/impossibly_large_malloc.c create mode 100644 src/hardened_malloc/test/invalid_free_protected.c create mode 100644 src/hardened_malloc/test/invalid_free_small_region.c create mode 100644 src/hardened_malloc/test/invalid_free_small_region_far.c create mode 100644 src/hardened_malloc/test/invalid_free_unprotected.c create mode 100644 
src/hardened_malloc/test/invalid_malloc_object_size_small.c create mode 100644 src/hardened_malloc/test/invalid_malloc_object_size_small_quarantine.c create mode 100644 src/hardened_malloc/test/invalid_malloc_usable_size_small.c create mode 100644 src/hardened_malloc/test/invalid_malloc_usable_size_small_quarantine.c create mode 100644 src/hardened_malloc/test/large_array_growth.c create mode 100644 src/hardened_malloc/test/mallinfo.c create mode 100644 src/hardened_malloc/test/mallinfo2.c create mode 100644 src/hardened_malloc/test/malloc_info.c create mode 100644 src/hardened_malloc/test/malloc_object_size.c create mode 100644 src/hardened_malloc/test/malloc_object_size_offset.c create mode 100644 src/hardened_malloc/test/offset.c create mode 100644 src/hardened_malloc/test/overflow_large_1_byte.c create mode 100644 src/hardened_malloc/test/overflow_large_8_byte.c create mode 100644 src/hardened_malloc/test/overflow_small_1_byte.c create mode 100644 src/hardened_malloc/test/overflow_small_8_byte.c create mode 100644 src/hardened_malloc/test/read_after_free_large.c create mode 100644 src/hardened_malloc/test/read_after_free_small.c create mode 100644 src/hardened_malloc/test/read_zero_size.c create mode 100644 src/hardened_malloc/test/realloc_init.c create mode 100644 src/hardened_malloc/test/string_overflow.c create mode 100644 src/hardened_malloc/test/test_smc.py create mode 100644 src/hardened_malloc/test/test_util.h create mode 100644 src/hardened_malloc/test/unaligned_free_large.c create mode 100644 src/hardened_malloc/test/unaligned_free_small.c create mode 100644 src/hardened_malloc/test/unaligned_malloc_usable_size_small.c create mode 100644 src/hardened_malloc/test/uninitialized_free.c create mode 100644 src/hardened_malloc/test/uninitialized_malloc_usable_size.c create mode 100644 src/hardened_malloc/test/uninitialized_read_large.c create mode 100644 src/hardened_malloc/test/uninitialized_read_small.c create mode 100644 src/hardened_malloc/test/uninitialized_realloc.c create mode 100644 src/hardened_malloc/test/write_after_free_large.c create mode 100644 src/hardened_malloc/test/write_after_free_large_reuse.c create mode 100644 src/hardened_malloc/test/write_after_free_small.c create mode 100644 src/hardened_malloc/test/write_after_free_small_reuse.c create mode 100644 src/hardened_malloc/test/write_zero_size.c create mode 100644 src/hardened_malloc/third_party/libdivide.h create mode 100644 src/hardened_malloc/util.c create mode 100644 src/hardened_malloc/util.h create mode 100644 src/lib.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ffea379 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/target +.DS_Store +Cargo.lock \ No newline at end of file diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..63dadb8 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "hardened_malloc"] + path = "src/hardened_malloc" + url = https://github.com/GrapheneOS/hardened_malloc.git diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..c55cb31 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,134 @@ + +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, 
race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at Email +[strawberry@pupbrain.dev] or via Matrix [@strawberry:puppygock.gay] +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. 
Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.1, available at +[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder][Mozilla CoC]. + +For answers to common questions about this code of conduct, see the FAQ at +[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at +[https://www.contributor-covenant.org/translations][translations]. + +[homepage]: https://www.contributor-covenant.org +[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html +[Mozilla CoC]: https://github.com/mozilla/diversity +[FAQ]: https://www.contributor-covenant.org/faq +[translations]: https://www.contributor-covenant.org/translations + diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..5156ab2 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "hardened_malloc-sys" +build = "build.rs" +description = "hardened_malloc rust wrapper (sys crate)" +authors = ["strawberry "] +version = "0.1.0" +edition = "2021" +license = "Apache-2.0 and MIT" +repository = "https://github.com/girlbossceo/hardened_malloc-sys" +categories = ["api-bindings", "memory-management"] +keywords = ["hardened_malloc", "malloc", "hardened memory allocator", "security"] +readme = "README.md" +exclude = [ + "/src/hardened_malloc/test", + "/src/hardened_malloc/androidtest", + "/src/hardened_malloc/out", + "/src/hardened_malloc/out-light", +] + +[features] +default = ["light"] +light = [] +# "standard" feature is "default.mk" config in hardened_malloc +standard = [] + +[dependencies] +libc = "0.2" + +[build-dependencies] +cc = "1.0" \ No newline at end of file diff --git a/LICENCE b/LICENCE new file mode 100644 index 0000000..2d9a3a5 --- /dev/null +++ b/LICENCE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [2023] [June] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/LICENCE_GRAPHENEOS b/LICENCE_GRAPHENEOS
new file mode 100644
index 0000000..3b9e2c0
--- /dev/null
+++ b/LICENCE_GRAPHENEOS
@@ -0,0 +1,19 @@
+Copyright © 2018-2023 GrapheneOS
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8905fa7
--- /dev/null
+++ b/README.md
@@ -0,0 +1,10 @@
+# hardened_malloc-sys
+
+the low-level `-sys` crate: a Rust wrapper around GrapheneOS's hardened_malloc
+
+### TODO:
+- [ ] test if this even works
+- [ ] add support for explicit make config args on top of choosing variant
+- [ ] make build script better overall
+- [ ] support C preprocessor macro definitions
+- [ ] add support for hardened_malloc's tests and our own tests
\ No newline at end of file
diff --git a/build.rs b/build.rs
new file mode 100644
index 0000000..b0988f7
--- /dev/null
+++ b/build.rs
@@ -0,0 +1,61 @@
+use std::{env, process::Command, path::Path};
+
+/// If submodules were not synced, sync them to actually build hardened_malloc
+fn update_submodules() {
+    let program = "git";
+    let dir = "../";
+    let args = ["submodule", "update", "--init", "--recursive"];
+    println!(
+        "Running command: \"{} {}\" in directory: {}",
+        program,
+        args.join(" "),
+        dir
+    );
+    let ret = Command::new(program).current_dir(dir).args(args).status();
+
+    match ret.map(|status| (status.success(), status.code())) {
+        Ok((true, _)) => (),
+        Ok((false, Some(c))) => panic!("Command failed with error code {}", c),
+        Ok((false, None)) => panic!("Command exited with no error code, possibly killed by system"),
+        Err(e) => panic!("Command failed with error: {}", e),
+    }
+}
+
+fn main() {
+    if !Path::new("src/hardened_malloc/Makefile").exists() {
+        update_submodules();
+    }
+    let variant: &str;
+
+    if cfg!(feature = "light") {
+        variant = "light";
+    } else {
+        variant = "default";
+    }
+
+    //TODO: handle support for explicit make flags like N_ARENA=1 and such
+
+    let mut make_command = Command::new("make");
+    let make_output = make_command
+        .current_dir("src/hardened_malloc/")
+        .env("V", "1") // always verbose mode for cargo
+        .arg(format!("VARIANT={}", variant)) // pass as a make argument: an environment variable cannot override the Makefile's `VARIANT := default` assignment
+        .output()
+        .unwrap_or_else(|error| {
+            panic!("Failed to run 'make': {}", error);
+        });
+    if !make_output.status.success() {
+        panic!(
+            "building hardened_malloc failed:\n{:?}\n{}\n{}",
+            make_command,
+            String::from_utf8_lossy(&make_output.stdout),
+            String::from_utf8_lossy(&make_output.stderr)
+        );
+    }
+
+    //println!("cargo:rustc-link-search=native=src/hardened_malloc");
+
+    //println!("cargo:rerun-if-changed=build.rs");
+
println!("cargo:rerun-if-changed=src/hardened_malloc/"); + //println!("cargo:out_dir={}", env::var("OUT_DIR").unwrap()); +} \ No newline at end of file diff --git a/hardened_malloc_sources.txt b/hardened_malloc_sources.txt new file mode 100644 index 0000000..0688911 --- /dev/null +++ b/hardened_malloc_sources.txt @@ -0,0 +1,16 @@ +src/hardened_malloc/chacha.c +src/hardened_malloc/h_malloc.c +src/hardened_malloc/memory.c +src/hardened_malloc/pages.c +src/hardened_malloc/random.c +src/hardened_malloc/util.c +src/hardened_malloc/arm_mte.h +src/hardened_malloc/chacha.h +src/hardened_malloc/memory.h +src/hardened_malloc/memtag.h +src/hardened_malloc/mutex.h +src/hardened_malloc/pages.h +src/hardened_malloc/random.h +src/hardened_malloc/util.h +src/hardened_malloc/new.cc +src/hardened_malloc/third_party/libdivide.h \ No newline at end of file diff --git a/src/hardened_malloc/.clang-tidy b/src/hardened_malloc/.clang-tidy new file mode 100644 index 0000000..ea78ba3 --- /dev/null +++ b/src/hardened_malloc/.clang-tidy @@ -0,0 +1,2 @@ +Checks: 'bugprone-*,-bugprone-easily-swappable-parameters,-bugprone-macro-parentheses,-bugprone-too-small-loop-variable,cert-*,-cert-err33-c,clang-analyzer-*,-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,-clang-diagnostic-constant-logical-operand,readability-*,-readability-function-cognitive-complexity,-readability-identifier-length,-readability-inconsistent-declaration-parameter-name,-readability-magic-numbers,-readability-named-parameter,llvm-include-order,misc-*' +WarningsAsErrors: '*' diff --git a/src/hardened_malloc/.github/dependabot.yml b/src/hardened_malloc/.github/dependabot.yml new file mode 100644 index 0000000..5e1954b --- /dev/null +++ b/src/hardened_malloc/.github/dependabot.yml @@ -0,0 +1,7 @@ +version: 2 +updates: + - package-ecosystem: github-actions + directory: "/" + schedule: + interval: daily + target-branch: main diff --git a/src/hardened_malloc/.github/workflows/build-and-test.yml b/src/hardened_malloc/.github/workflows/build-and-test.yml new file mode 100644 index 0000000..82496af --- /dev/null +++ b/src/hardened_malloc/.github/workflows/build-and-test.yml @@ -0,0 +1,53 @@ +name: Build and run tests + +on: + push: + pull_request: + schedule: + - cron: '0 2 * * *' + +jobs: + build-ubuntu-gcc: + runs-on: ubuntu-latest + strategy: + matrix: + version: [12] + steps: + - uses: actions/checkout@v4 + - name: Setting up gcc version + run: | + sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-${{ matrix.version }} 100 + sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-${{ matrix.version }} 100 + - name: Build + run: make test + build-ubuntu-clang: + runs-on: ubuntu-latest + strategy: + matrix: + version: [14, 15] + steps: + - uses: actions/checkout@v4 + - name: Setting up clang version + run: | + sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-${{ matrix.version }} 100 + sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-${{ matrix.version }} 100 + - name: Build + run: CC=clang CXX=clang++ make test + build-musl: + runs-on: ubuntu-latest + container: + image: alpine:latest + steps: + - uses: actions/checkout@v4 + - name: Install dependencies + run: apk update && apk add build-base python3 + - name: Build + run: make test + build-ubuntu-gcc-aarch64: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install dependencies + run: sudo apt-get update && sudo apt-get install -y --no-install-recommends gcc-aarch64-linux-gnu 
g++-aarch64-linux-gnu libgcc-s1-arm64-cross cpp-aarch64-linux-gnu + - name: Build + run: CC=aarch64-linux-gnu-gcc CXX=aarch64-linux-gnu-gcc++ make CONFIG_NATIVE=false diff --git a/src/hardened_malloc/.gitignore b/src/hardened_malloc/.gitignore new file mode 100644 index 0000000..e5cdb39 --- /dev/null +++ b/src/hardened_malloc/.gitignore @@ -0,0 +1,2 @@ +out/ +out-light/ diff --git a/src/hardened_malloc/Android.bp b/src/hardened_malloc/Android.bp new file mode 100644 index 0000000..0db6a04 --- /dev/null +++ b/src/hardened_malloc/Android.bp @@ -0,0 +1,83 @@ +common_cflags = [ + "-pipe", + "-O3", + //"-flto", + "-fPIC", + "-fvisibility=hidden", + //"-fno-plt", + "-Wall", + "-Wextra", + "-Wcast-align", + "-Wcast-qual", + "-Wwrite-strings", + "-Werror", + "-DH_MALLOC_PREFIX", + "-DZERO_ON_FREE=true", + "-DWRITE_AFTER_FREE_CHECK=true", + "-DSLOT_RANDOMIZE=true", + "-DSLAB_CANARY=true", + "-DSLAB_QUARANTINE_RANDOM_LENGTH=1", + "-DSLAB_QUARANTINE_QUEUE_LENGTH=1", + "-DCONFIG_EXTENDED_SIZE_CLASSES=true", + "-DCONFIG_LARGE_SIZE_CLASSES=true", + "-DGUARD_SLABS_INTERVAL=1", + "-DGUARD_SIZE_DIVISOR=2", + "-DREGION_QUARANTINE_RANDOM_LENGTH=256", + "-DREGION_QUARANTINE_QUEUE_LENGTH=1024", + "-DREGION_QUARANTINE_SKIP_THRESHOLD=33554432", // 32MiB + "-DFREE_SLABS_QUARANTINE_RANDOM_LENGTH=32", + "-DCONFIG_CLASS_REGION_SIZE=34359738368", // 32GiB + "-DN_ARENA=1", + "-DCONFIG_STATS=true", + "-DCONFIG_SELF_INIT=false", +] + +cc_defaults { + name: "hardened_malloc_defaults", + defaults: ["linux_bionic_supported"], + cflags: common_cflags, + conlyflags: ["-std=c17", "-Wmissing-prototypes"], + stl: "none", +} + +lib_src_files = [ + "chacha.c", + "h_malloc.c", + "memory.c", + "pages.c", + "random.c", + "util.c", +] + +cc_library { + name: "libhardened_malloc", + ramdisk_available: true, + vendor_ramdisk_available: true, + recovery_available: true, + defaults: ["hardened_malloc_defaults"], + srcs: lib_src_files, + export_include_dirs: ["include"], + static_libs: ["libasync_safe"], + target: { + android: { + shared: { + enabled: false, + }, + system_shared_libs: [], + }, + linux_bionic: { + system_shared_libs: [], + }, + }, + product_variables: { + debuggable: { + cflags: ["-DLABEL_MEMORY"], + }, + device_has_arm_mte: { + cflags: ["-DHAS_ARM_MTE", "-march=armv9-a+memtag"] + }, + }, + apex_available: [ + "com.android.runtime", + ], +} diff --git a/src/hardened_malloc/CREDITS b/src/hardened_malloc/CREDITS new file mode 100644 index 0000000..31b6875 --- /dev/null +++ b/src/hardened_malloc/CREDITS @@ -0,0 +1,283 @@ +chacha.c is a simple conversion of chacha-merged.c to a keystream-only implementation: + + chacha-merged.c version 20080118 + D. J. Bernstein + Public domain. + +h_malloc.c open-addressed hash table (regions_grow, regions_insert, regions_find, regions_delete): + + Copyright (c) 2008, 2010, 2011, 2016 Otto Moerbeek + Copyright (c) 2012 Matthew Dempsky + Copyright (c) 2008 Damien Miller + Copyright (c) 2000 Poul-Henning Kamp + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +libdivide: + + Copyright (C) 2010 - 2019 ridiculous_fish, + Copyright (C) 2016 - 2019 Kim Walisch, + + Boost Software License - Version 1.0 - August 17th, 2003 + + Permission is hereby granted, free of charge, to any person or organization + obtaining a copy of the software and accompanying documentation covered by + this license (the "Software") to use, reproduce, display, distribute, + execute, and transmit the Software, and to prepare derivative works of the + Software, and to permit third-parties to whom the Software is furnished to + do so, all subject to the following: + + The copyright notices in the Software and this entire statement, including + the above license grant, this restriction and the following disclaimer, + must be included in all copies of the Software, in whole or in part, and + all derivative works of the Software, unless such copies or derivative + works are solely in the form of machine-executable object code generated by + a source language processor. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT + SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE + FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, + ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. + +random.c get_random_{type}_uniform functions are based on Fast Random Integer +Generation in an Interval by Daniel Lemire + +arm_mte.h arm_mte_tag_and_clear_mem function contents were copied from storeTags function in scudo: + + ============================================================================== + The LLVM Project is under the Apache License v2.0 with LLVM Exceptions: + ============================================================================== + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + + ---- LLVM Exceptions to the Apache 2.0 License ---- + + As an exception, if, as a result of your compiling your source code, portions + of this Software are embedded into an Object form of such source code, you + may redistribute such embedded portions in such Object form without complying + with the conditions of Sections 4(a), 4(b) and 4(d) of the License. + + In addition, if you combine or link compiled forms of this Software with + software that is licensed under the GPLv2 ("Combined Software") and if a + court of competent jurisdiction determines that the patent provision (Section + 3), the indemnity provision (Section 9) or other Section of the License + conflicts with the conditions of the GPLv2, you may retroactively and + prospectively choose to deem waived or otherwise exclude such Section(s) of + the License, but only in their entirety and only with respect to the Combined + Software. 
+ + ============================================================================== diff --git a/src/hardened_malloc/KERNEL_FEATURE_WISHLIST.md b/src/hardened_malloc/KERNEL_FEATURE_WISHLIST.md new file mode 100644 index 0000000..c3a474d --- /dev/null +++ b/src/hardened_malloc/KERNEL_FEATURE_WISHLIST.md @@ -0,0 +1,35 @@ +Very important and should be an easy sell: + +* improved robustness for high vma count on high memory machines +* much higher `vm.max_map_count` by default +* work on improving performance and resource usage with high vma count +* add a way to disable the brk heap and have mmap grow upwards like it did in + the past (preserving the same high base entropy) + +Somewhat important and an easy sell: + +* alternative to `RLIMIT_AS` for accountable mappings only + * memory control groups are sometimes a better option but there are still + users of `RLIMIT_AS` that are problematic for mitigations or simply fast + garbage collector implementations, etc. mapping lots of `PROT_NONE` memory +* mremap flag to disable unmapping the source mapping + * also needed by jemalloc for different reasons + * not needed if the kernel gets first class support for arbitrarily sized + guard pages and a virtual memory quarantine feature + * `MREMAP_DONTUNMAP` is now available but doesn't support expanding the + mapping which may be an issue due to VMA merging being unreliable + +Fairly infeasible to land but could reduce overhead and extend coverage of +security features to other code directly using mmap: + +* first class support for arbitrarily sized guard pages for mmap and mremap to + eliminate half of the resulting VMAs and reduce 2 system calls to 1 + * not usable if it doesn't support mremap (shrink, grow, grow via move) + * not usable if the guard page size is static + * should support changing guard size for mremap growth via move + * must be possible to set it up from the process +* virtual memory quarantine + * must be possible to set it up from the process +* first-class support for aligned mappings with mmap and ideally mremap + * not usable unless guard page support is provided and of course it has to + work with this too diff --git a/src/hardened_malloc/LICENSE b/src/hardened_malloc/LICENSE new file mode 100644 index 0000000..5311a0f --- /dev/null +++ b/src/hardened_malloc/LICENSE @@ -0,0 +1,19 @@ +Copyright © 2018-2023 GrapheneOS + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
diff --git a/src/hardened_malloc/Makefile b/src/hardened_malloc/Makefile new file mode 100644 index 0000000..574a088 --- /dev/null +++ b/src/hardened_malloc/Makefile @@ -0,0 +1,148 @@ +VARIANT := default + +ifneq ($(VARIANT),) + CONFIG_FILE := config/$(VARIANT).mk + include config/$(VARIANT).mk +endif + +ifeq ($(VARIANT),default) + SUFFIX := +else + SUFFIX := -$(VARIANT) +endif + +OUT := out$(SUFFIX) + +define safe_flag +$(shell $(CC) $(if $(filter clang%,$(CC)),-Werror=unknown-warning-option) -E $1 - /dev/null 2>&1 && echo $1 || echo $2) +endef + +CPPFLAGS := $(CPPFLAGS) -D_GNU_SOURCE -I include +SHARED_FLAGS := -pipe -O3 -flto -fPIC -fvisibility=hidden -fno-plt \ + $(call safe_flag,-fstack-clash-protection) $(call safe_flag,-fcf-protection) -fstack-protector-strong \ + -Wall -Wextra $(call safe_flag,-Wcast-align=strict,-Wcast-align) -Wcast-qual -Wwrite-strings \ + -Wundef + +ifeq ($(CONFIG_WERROR),true) + SHARED_FLAGS += -Werror +endif + +ifeq ($(CONFIG_NATIVE),true) + SHARED_FLAGS += -march=native +endif + +ifeq ($(CONFIG_UBSAN),true) + SHARED_FLAGS += -fsanitize=undefined -fno-sanitize-recover=undefined +endif + +CFLAGS := $(CFLAGS) -std=c17 $(SHARED_FLAGS) -Wmissing-prototypes -Wstrict-prototypes +CXXFLAGS := $(CXXFLAGS) -std=c++17 -fsized-deallocation $(SHARED_FLAGS) +LDFLAGS := $(LDFLAGS) -Wl,-O1,--as-needed,-z,defs,-z,relro,-z,now,-z,nodlopen,-z,text + +SOURCES := chacha.c h_malloc.c memory.c pages.c random.c util.c +OBJECTS := $(SOURCES:.c=.o) + +ifeq ($(CONFIG_CXX_ALLOCATOR),true) + # make sure LTO is compatible in case CC and CXX don't match (such as clang and g++) + CXX := $(CC) + LDLIBS += -lstdc++ + + SOURCES += new.cc + OBJECTS += new.o +endif + +OBJECTS := $(addprefix $(OUT)/,$(OBJECTS)) + +ifeq (,$(filter $(CONFIG_SEAL_METADATA),true false)) + $(error CONFIG_SEAL_METADATA must be true or false) +endif + +ifeq (,$(filter $(CONFIG_ZERO_ON_FREE),true false)) + $(error CONFIG_ZERO_ON_FREE must be true or false) +endif + +ifeq (,$(filter $(CONFIG_WRITE_AFTER_FREE_CHECK),true false)) + $(error CONFIG_WRITE_AFTER_FREE_CHECK must be true or false) +endif + +ifeq (,$(filter $(CONFIG_SLOT_RANDOMIZE),true false)) + $(error CONFIG_SLOT_RANDOMIZE must be true or false) +endif + +ifeq (,$(filter $(CONFIG_SLAB_CANARY),true false)) + $(error CONFIG_SLAB_CANARY must be true or false) +endif + +ifeq (,$(filter $(CONFIG_EXTENDED_SIZE_CLASSES),true false)) + $(error CONFIG_EXTENDED_SIZE_CLASSES must be true or false) +endif + +ifeq (,$(filter $(CONFIG_LARGE_SIZE_CLASSES),true false)) + $(error CONFIG_LARGE_SIZE_CLASSES must be true or false) +endif + +ifeq (,$(filter $(CONFIG_STATS),true false)) + $(error CONFIG_STATS must be true or false) +endif + +ifeq (,$(filter $(CONFIG_SELF_INIT),true false)) + $(error CONFIG_SELF_INIT must be true or false) +endif + +CPPFLAGS += \ + -DCONFIG_SEAL_METADATA=$(CONFIG_SEAL_METADATA) \ + -DZERO_ON_FREE=$(CONFIG_ZERO_ON_FREE) \ + -DWRITE_AFTER_FREE_CHECK=$(CONFIG_WRITE_AFTER_FREE_CHECK) \ + -DSLOT_RANDOMIZE=$(CONFIG_SLOT_RANDOMIZE) \ + -DSLAB_CANARY=$(CONFIG_SLAB_CANARY) \ + -DSLAB_QUARANTINE_RANDOM_LENGTH=$(CONFIG_SLAB_QUARANTINE_RANDOM_LENGTH) \ + -DSLAB_QUARANTINE_QUEUE_LENGTH=$(CONFIG_SLAB_QUARANTINE_QUEUE_LENGTH) \ + -DCONFIG_EXTENDED_SIZE_CLASSES=$(CONFIG_EXTENDED_SIZE_CLASSES) \ + -DCONFIG_LARGE_SIZE_CLASSES=$(CONFIG_LARGE_SIZE_CLASSES) \ + -DGUARD_SLABS_INTERVAL=$(CONFIG_GUARD_SLABS_INTERVAL) \ + -DGUARD_SIZE_DIVISOR=$(CONFIG_GUARD_SIZE_DIVISOR) \ + -DREGION_QUARANTINE_RANDOM_LENGTH=$(CONFIG_REGION_QUARANTINE_RANDOM_LENGTH) \ + 
-DREGION_QUARANTINE_QUEUE_LENGTH=$(CONFIG_REGION_QUARANTINE_QUEUE_LENGTH) \ + -DREGION_QUARANTINE_SKIP_THRESHOLD=$(CONFIG_REGION_QUARANTINE_SKIP_THRESHOLD) \ + -DFREE_SLABS_QUARANTINE_RANDOM_LENGTH=$(CONFIG_FREE_SLABS_QUARANTINE_RANDOM_LENGTH) \ + -DCONFIG_CLASS_REGION_SIZE=$(CONFIG_CLASS_REGION_SIZE) \ + -DN_ARENA=$(CONFIG_N_ARENA) \ + -DCONFIG_STATS=$(CONFIG_STATS) \ + -DCONFIG_SELF_INIT=$(CONFIG_SELF_INIT) + +$(OUT)/libhardened_malloc$(SUFFIX).so: $(OBJECTS) | $(OUT) + $(CC) $(CFLAGS) $(LDFLAGS) -shared $^ $(LDLIBS) -o $@ + +$(OUT): + mkdir -p $(OUT) + +$(OUT)/chacha.o: chacha.c chacha.h util.h $(CONFIG_FILE) | $(OUT) + $(COMPILE.c) $(OUTPUT_OPTION) $< +$(OUT)/h_malloc.o: h_malloc.c include/h_malloc.h mutex.h memory.h pages.h random.h util.h $(CONFIG_FILE) | $(OUT) + $(COMPILE.c) $(OUTPUT_OPTION) $< +$(OUT)/memory.o: memory.c memory.h util.h $(CONFIG_FILE) | $(OUT) + $(COMPILE.c) $(OUTPUT_OPTION) $< +$(OUT)/new.o: new.cc include/h_malloc.h util.h $(CONFIG_FILE) | $(OUT) + $(COMPILE.cc) $(OUTPUT_OPTION) $< +$(OUT)/pages.o: pages.c pages.h memory.h util.h $(CONFIG_FILE) | $(OUT) + $(COMPILE.c) $(OUTPUT_OPTION) $< +$(OUT)/random.o: random.c random.h chacha.h util.h $(CONFIG_FILE) | $(OUT) + $(COMPILE.c) $(OUTPUT_OPTION) $< +$(OUT)/util.o: util.c util.h $(CONFIG_FILE) | $(OUT) + $(COMPILE.c) $(OUTPUT_OPTION) $< + +check: tidy + +tidy: + clang-tidy --extra-arg=-std=c17 $(filter %.c,$(SOURCES)) -- $(CPPFLAGS) + clang-tidy --extra-arg=-std=c++17 $(filter %.cc,$(SOURCES)) -- $(CPPFLAGS) + +clean: + rm -f $(OUT)/libhardened_malloc.so $(OBJECTS) + $(MAKE) -C test/ clean + +test: $(OUT)/libhardened_malloc$(SUFFIX).so + $(MAKE) -C test/ + python3 -m unittest discover --start-directory test/ + +.PHONY: check clean tidy test diff --git a/src/hardened_malloc/README.md b/src/hardened_malloc/README.md new file mode 100644 index 0000000..8962037 --- /dev/null +++ b/src/hardened_malloc/README.md @@ -0,0 +1,1037 @@ +# Hardened malloc + +* [Introduction](#introduction) +* [Dependencies](#dependencies) +* [Testing](#testing) + * [Individual Applications](#individual-applications) + * [Automated Test Framework](#automated-test-framework) +* [Compatibility](#compatibility) +* [OS integration](#os-integration) + * [Android-based operating systems](#android-based-operating-systems) + * [Traditional Linux-based operating systems](#traditional-linux-based-operating-systems) +* [Configuration](#configuration) +* [Core design](#core-design) +* [Security properties](#security-properties) +* [Randomness](#randomness) +* [Size classes](#size-classes) +* [Scalability](#scalability) + * [Small (slab) allocations](#small-slab-allocations) + * [Thread caching (or lack thereof)](#thread-caching-or-lack-thereof) + * [Large allocations](#large-allocations) +* [Memory tagging](#memory-tagging) +* [API extensions](#api-extensions) +* [Stats](#stats) +* [System calls](#system-calls) + +## Introduction + +This is a security-focused general purpose memory allocator providing the +malloc API along with various extensions. It provides substantial hardening +against heap corruption vulnerabilities. The security-focused design also leads +to much less metadata overhead and memory waste from fragmentation than a more +traditional allocator design. It aims to provide decent overall performance +with a focus on long-term performance and memory usage rather than allocator +micro-benchmarks. 
It offers scalability via a configurable number of entirely +independent arenas, with the internal locking within arenas further divided +up per size class. + +This project currently supports Bionic (Android), musl and glibc. It may +support other non-Linux operating systems in the future. For Android, there's +custom integration and other hardening features which is also planned for musl +in the future. The glibc support will be limited to replacing the malloc +implementation because musl is a much more robust and cleaner base to build on +and can cover the same use cases. + +This allocator is intended as a successor to a previous implementation based on +extending OpenBSD malloc with various additional security features. It's still +heavily based on the OpenBSD malloc design, albeit not on the existing code +other than reusing the hash table implementation. The main differences in the +design are that it's solely focused on hardening rather than finding bugs, uses +finer-grained size classes along with slab sizes going beyond 4k to reduce +internal fragmentation, doesn't rely on the kernel having fine-grained mmap +randomization and only targets 64-bit to make aggressive use of the large +address space. There are lots of smaller differences in the implementation +approach. It incorporates the previous extensions made to OpenBSD malloc +including adding padding to allocations for canaries (distinct from the current +OpenBSD malloc canaries), write-after-free detection tied to the existing +clearing on free, queues alongside the existing randomized arrays for +quarantining allocations and proper double-free detection for quarantined +allocations. The per-size-class memory regions with their own random bases were +loosely inspired by the size and type-based partitioning in PartitionAlloc. The +planned changes to OpenBSD malloc ended up being too extensive and invasive so +this project was started as a fresh implementation better able to accomplish +the goals. For 32-bit, a port of OpenBSD malloc with small extensions can be +used instead as this allocator fundamentally doesn't support that environment. + +## Dependencies + +Debian stable (currently Debian 12) determines the most ancient set of +supported dependencies: + +* glibc 2.36 +* Linux 6.1 +* Clang 14.0.6 or GCC 12.2.0 + +For Android, the Linux GKI 5.10, 5.15 and 6.1 branches are supported. + +However, using more recent releases is highly recommended. Older versions of +the dependencies may be compatible at the moment but are not tested and will +explicitly not be supported. + +For external malloc replacement with musl, musl 1.1.20 is required. However, +there will be custom integration offering better performance in the future +along with other hardening for the C standard library implementation. + +For Android, only the current generation, actively developed maintenance branch of the Android +Open Source Project will be supported, which currently means `android13-qpr2-release`. + +## Testing + +### Individual Applications + +The `preload.sh` script can be used for testing with dynamically linked +executables using glibc or musl: + + ./preload.sh krita --new-image RGBA,U8,500,500 + +It can be necessary to substantially increase the `vm.max_map_count` sysctl to +accommodate the large number of mappings caused by guard slabs and large +allocation guard regions. 
The number of mappings can also be drastically +reduced via a significant increase to `CONFIG_GUARD_SLABS_INTERVAL` but the +feature has a low performance and memory usage cost so that isn't recommended. + +It can offer slightly better performance when integrated into the C standard +library and there are other opportunities for similar hardening within C +standard library and dynamic linker implementations. For example, a library +region can be implemented to offer similar isolation for dynamic libraries as +this allocator offers across different size classes. The intention is that this +will be offered as part of hardened variants of the Bionic and musl C standard +libraries. + +### Automated Test Framework + +A collection of simple, automated tests are provided and can be run with the +make command as follows: + + make test + +## Compatibility + +OpenSSH 8.1 or higher is required to allow the mprotect `PROT_READ|PROT_WRITE` +system calls in the seccomp-bpf filter rather than killing the process. + +## OS integration + +### Android-based operating systems + +On GrapheneOS, hardened\_malloc is integrated into the standard C library as +the standard malloc implementation. Other Android-based operating systems can +reuse [the integration +code](https://github.com/GrapheneOS/platform_bionic/commit/20160b81611d6f2acd9ab59241bebeac7cf1d71c) +to provide it. If desired, jemalloc can be left as a runtime configuration +option by only conditionally using hardened\_malloc to give users the choice +between performance and security. However, this reduces security for threat +models where persistent state is untrusted, i.e. verified boot and attestation +(see the [attestation sister project](https://attestation.app/about)). + +Make sure to raise `vm.max_map_count` substantially too to accommodate the very +large number of guard pages created by hardened\_malloc. This can be done in +`init.rc` (`system/core/rootdir/init.rc`) near the other virtual memory +configuration: + + write /proc/sys/vm/max_map_count 1048576 + +This is unnecessary if you set `CONFIG_GUARD_SLABS_INTERVAL` to a very large +value in the build configuration. + +### Traditional Linux-based operating systems + +On traditional Linux-based operating systems, hardened\_malloc can either be +integrated into the libc implementation as a replacement for the standard +malloc implementation or loaded as a dynamic library. Rather than rebuilding +each executable to be linked against it, it can be added as a preloaded +library to `/etc/ld.so.preload`. For example, with `libhardened_malloc.so` +installed to `/usr/local/lib/libhardened_malloc.so`, add that full path as a +line to the `/etc/ld.so.preload` configuration file: + + /usr/local/lib/libhardened_malloc.so + +The format of this configuration file is a whitespace-separated list, so it's +good practice to put each library on a separate line. + +Using the `LD_PRELOAD` environment variable to load it on a case-by-case basis +will not work when `AT_SECURE` is set such as with setuid binaries. It's also +generally not a recommended approach for production usage. The recommendation +is to enable it globally and make exceptions for performance critical cases by +running the application in a container / namespace without it enabled. + +Make sure to raise `vm.max_map_count` substantially too to accommodate the very +large number of guard pages created by hardened\_malloc. 
As an example, in +`/etc/sysctl.d/hardened_malloc.conf`: + + vm.max_map_count = 1048576 + +This is unnecessary if you set `CONFIG_GUARD_SLABS_INTERVAL` to a very large +value in the build configuration. + +On arm64, make sure your kernel is configured to use 4k pages since we haven't +yet added support for 16k and 64k pages. The kernel also has to be configured +to use 4 level page tables for the full 48 bit address space instead of only +having a 39 bit address space for the default hardened\_malloc configuration. +It's possible to reduce the class region size substantially to make a 39 bit +address space workable but the defaults won't work. + +## Configuration + +You can set some configuration options at compile-time via arguments to the +make command as follows: + + make CONFIG_EXAMPLE=false + +Configuration options are provided when there are significant compromises +between portability, performance, memory usage or security. The core design +choices are not configurable and the allocator remains very security-focused +even with all the optional features disabled. + +The configuration system supports a configuration template system with two +standard presets: the default configuration (`config/default.mk`) and a light +configuration (`config/light.mk`). Packagers are strongly encouraged to ship +both the standard `default` and `light` configuration. You can choose the +configuration to build using `make VARIANT=light` where `make VARIANT=default` +is the same as `make`. Non-default configuration templates will build a library +with the suffix `-variant` such as `libhardened_malloc-light.so` and will use +an `out-variant` directory instead of `out` for the build. + +The `default` configuration template has all normal optional security features +enabled (just not the niche `CONFIG_SEAL_METADATA`) and is quite aggressive in +terms of sacrificing performance and memory usage for security. The `light` +configuration template disables the slab quarantines, write after free check, +slot randomization and raises the guard slab interval from 1 to 8 but leaves +zero-on-free and slab canaries enabled. The `light` configuration has solid +performance and memory usage while still being far more secure than mainstream +allocators with much better security properties. Disabling zero-on-free would +gain more performance but doesn't make much difference for small allocations +without also disabling slab canaries. Slab canaries slightly raise memory use +and slightly slow down performance but are quite important to mitigate small +overflows and C string overflows. Disabling slab canaries is not recommended +in most cases since it would no longer be a strict upgrade over traditional +allocators with headers on allocations and basic consistency checks for them. + +For reduced memory usage at the expense of performance (this will also reduce +the size of the empty slab caches and quarantines, saving a lot of memory, +since those are currently based on the size of the largest size class): + + make \ + N_ARENA=1 \ + CONFIG_EXTENDED_SIZE_CLASSES=false + +The following boolean configuration options are available: + +* `CONFIG_WERROR`: `true` (default) or `false` to control whether compiler + warnings are treated as errors. This is highly recommended, but it can be + disabled to avoid patching the Makefile if a compiler version not tested by + the project is being used and has warnings. Investigating these warnings is + still recommended and the intention is to always be free of any warnings. 
+* `CONFIG_NATIVE`: `true` (default) or `false` to control whether the code is + optimized for the detected CPU on the host. If this is disabled, setting up a + custom `-march` higher than the baseline architecture is highly recommended + due to substantial performance benefits for this code. +* `CONFIG_CXX_ALLOCATOR`: `true` (default) or `false` to control whether the + C++ allocator is replaced for slightly improved performance and detection of + mismatched sizes for sized deallocation (often type confusion bugs). This + will result in linking against the C++ standard library. +* `CONFIG_ZERO_ON_FREE`: `true` (default) or `false` to control whether small + allocations are zeroed on free, to mitigate use-after-free and uninitialized + use vulnerabilities along with purging lots of potentially sensitive data + from the process as soon as possible. This has a performance cost scaling to + the size of the allocation, which is usually acceptable. This is not relevant + to large allocations because the pages are given back to the kernel. +* `CONFIG_WRITE_AFTER_FREE_CHECK`: `true` (default) or `false` to control + sanity checking that new small allocations contain zeroed memory. This can + detect writes caused by a write-after-free vulnerability and mixes well with + the features for making memory reuse randomized / delayed. This has a + performance cost scaling to the size of the allocation, which is usually + acceptable. This is not relevant to large allocations because they're always + a fresh memory mapping from the kernel. +* `CONFIG_SLOT_RANDOMIZE`: `true` (default) or `false` to randomize selection + of free slots within slabs. This has a measurable performance cost and isn't + one of the important security features, but the cost has been deemed more + than acceptable to be enabled by default. +* `CONFIG_SLAB_CANARY`: `true` (default) or `false` to enable support for + adding 8 byte canaries to the end of memory allocations. The primary purpose + of the canaries is to render small fixed size buffer overflows harmless by + absorbing them. The first byte of the canary is always zero, containing + overflows caused by a missing C string NUL terminator. The other 7 bytes are + a per-slab random value. On free, integrity of the canary is checked to + detect attacks like linear overflows or other forms of heap corruption caused + by imprecise exploit primitives. However, checking on free will often be too + late to prevent exploitation so it's not the main purpose of the canaries. +* `CONFIG_SEAL_METADATA`: `true` or `false` (default) to control whether Memory + Protection Keys are used to disable access to all writable allocator state + outside of the memory allocator code. It's currently disabled by default due + to a significant performance cost for this use case on current generation + hardware, which may become drastically lower in the future. Whether or not + this feature is enabled, the metadata is all contained within an isolated + memory region with high entropy random guard regions around it. + +The following integer configuration options are available: + +* `CONFIG_SLAB_QUARANTINE_RANDOM_LENGTH`: `1` (default) to control the number + of slots in the random array used to randomize reuse for small memory + allocations. 
This sets the length for the largest size class (either 16kiB + or 128kiB based on `CONFIG_EXTENDED_SIZE_CLASSES`) and the quarantine length + for smaller size classes is scaled to match the total memory of the + quarantined allocations (1 becomes 1024 for 16 byte allocations with 16kiB + as the largest size class, or 8192 with 128kiB as the largest). +* `CONFIG_SLAB_QUARANTINE_QUEUE_LENGTH`: `1` (default) to control the number of + slots in the queue used to delay reuse for small memory allocations. This + sets the length for the largest size class (either 16kiB or 128kiB based on + `CONFIG_EXTENDED_SIZE_CLASSES`) and the quarantine length for smaller size + classes is scaled to match the total memory of the quarantined allocations (1 + becomes 1024 for 16 byte allocations with 16kiB as the largest size class, or + 8192 with 128kiB as the largest). +* `CONFIG_GUARD_SLABS_INTERVAL`: `1` (default) to control the number of slabs + before a slab is skipped and left as an unused memory protected guard slab. + The default of `1` leaves a guard slab between every slab. This feature does + not have a *direct* performance cost, but it makes the address space usage + sparser which can indirectly hurt performance. The kernel also needs to track + a lot more memory mappings, which uses a bit of extra memory and slows down + memory mapping and memory protection changes in the process. The kernel uses + O(log n) algorithms for this and system calls are already fairly slow anyway, + so having many extra mappings doesn't usually add up to a significant cost. +* `CONFIG_GUARD_SIZE_DIVISOR`: `2` (default) to control the maximum size of the + guard regions placed on both sides of large memory allocations, relative to + the usable size of the memory allocation. +* `CONFIG_REGION_QUARANTINE_RANDOM_LENGTH`: `256` (default) to control the + number of slots in the random array used to randomize region reuse for large + memory allocations. +* `CONFIG_REGION_QUARANTINE_QUEUE_LENGTH`: `1024` (default) to control the + number of slots in the queue used to delay region reuse for large memory + allocations. +* `CONFIG_REGION_QUARANTINE_SKIP_THRESHOLD`: `33554432` (default) to control + the size threshold where large allocations will not be quarantined. +* `CONFIG_FREE_SLABS_QUARANTINE_RANDOM_LENGTH`: `32` (default) to control the + number of slots in the random array used to randomize free slab reuse. +* `CONFIG_CLASS_REGION_SIZE`: `34359738368` (default) to control the size of + the size class regions. +* `CONFIG_N_ARENA`: `4` (default) to control the number of arenas +* `CONFIG_STATS`: `false` (default) to control whether stats on allocation / + deallocation count and active allocations are tracked. See the [section on + stats](#stats) for more details. +* `CONFIG_EXTENDED_SIZE_CLASSES`: `true` (default) to control whether small + size class go up to 128kiB instead of the minimum requirement for avoiding + memory waste of 16kiB. The option to extend it even further will be offered + in the future when better support for larger slab allocations is added. See + the [section on size classes](#size-classes) below for details. +* `CONFIG_LARGE_SIZE_CLASSES`: `true` (default) to control whether large + allocations use the slab allocation size class scheme instead of page size + granularity. See the [section on size classes](#size-classes) below for + details. 
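As a purely illustrative combination (not a recommendation), the `light`
template can be mixed with command-line overrides, since variables passed on
the make command line take precedence over the values pulled in from
`config/light.mk`. This would build `out-light/libhardened_malloc-light.so`
with two arenas and without treating warnings as errors:

    make VARIANT=light CONFIG_N_ARENA=2 CONFIG_WERROR=false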
+ +There will be more control over enabled features in the future along with +control over fairly arbitrarily chosen values like the size of empty slab +caches (making them smaller improves security and reduces memory usage while +larger caches can substantially improves performance). + +## Core design + +The core design of the allocator is very simple / minimalist. The allocator is +exclusive to 64-bit platforms in order to take full advantage of the abundant +address space without being constrained by needing to keep the design +compatible with 32-bit. + +The mutable allocator state is entirely located within a dedicated metadata +region, and the allocator is designed around this approach for both small +(slab) allocations and large allocations. This provides reliable, deterministic +protections against invalid free including double frees, and protects metadata +from attackers. Traditional allocator exploitation techniques do not work with +the hardened\_malloc implementation. + +Small allocations are always located in a large memory region reserved for slab +allocations. On free, it can be determined that an allocation is one of the +small size classes from the address range. If arenas are enabled, the arena is +also determined from the address range as each arena has a dedicated sub-region +in the slab allocation region. Arenas provide totally independent slab +allocators with their own allocator state and no coordination between them. +Once the base region is determined (simply the slab allocation region as a +whole without any arenas enabled), the size class is determined from the +address range too, since it's divided up into a sub-region for each size class. +There's a top level slab allocation region, divided up into arenas, with each +of those divided up into size class regions. The size class regions each have a +random base within a large guard region. Once the size class is determined, the +slab size is known, and the index of the slab is calculated and used to obtain +the slab metadata for the slab from the slab metadata array. Finally, the index +of the slot within the slab provides the index of the bit tracking the slot in +the bitmap. Every slab allocation slot has a dedicated bit in a bitmap tracking +whether it's free, along with a separate bitmap for tracking allocations in the +quarantine. The slab metadata entries in the array have intrusive lists +threaded through them to track partial slabs (partially filled, and these are +the first choice for allocation), empty slabs (limited amount of cached free +memory) and free slabs (purged / memory protected). + +Large allocations are tracked via a global hash table mapping their address to +their size and random guard size. They're simply memory mappings and get mapped +on allocation and then unmapped on free. Large allocations are the only dynamic +memory mappings made by the allocator, since the address space for allocator +state (including both small / large allocation metadata) and slab allocations +is statically reserved. + +This allocator is aimed at production usage, not aiding with finding and fixing +memory corruption bugs for software development. It does find many latent bugs +but won't include features like the option of generating and storing stack +traces for each allocation to include the allocation site in related error +messages. The design choices are based around minimizing overhead and +maximizing security which often leads to different decisions than a tool +attempting to find bugs. 
For example, it uses zero-based sanitization on free +and doesn't minimize slack space from size class rounding between the end of an +allocation and the canary / guard region. Zero-based filling has the least +chance of uncovering latent bugs, but also the best chance of mitigating +vulnerabilities. The canary feature is primarily meant to act as padding +absorbing small overflows to render them harmless, so slack space is helpful +rather than harmful despite not detecting the corruption on free. The canary +needs detection on free in order to have any hope of stopping other kinds of +issues like a sequential overflow, which is why it's included. It's assumed +that an attacker can figure out the allocator is in use so the focus is +explicitly not on detecting bugs that are impossible to exploit with it in use +like an 8 byte overflow. The design choices would be different if performance +was a bit less important and if a core goal was finding latent bugs. + +## Security properties + +* Fully out-of-line metadata/state with protection from corruption + * Address space for allocator state is entirely reserved during + initialization and never reused for allocations or anything else + * State within global variables is entirely read-only after initialization + with pointers to the isolated allocator state so leaking the address of + the library doesn't leak the address of writable state + * Allocator state is located within a dedicated region with high entropy + randomly sized guard regions around it + * Protection via Memory Protection Keys (MPK) on x86\_64 (disabled by + default due to low benefit-cost ratio on top of baseline protections) + * [future] Protection via MTE on ARMv8.5+ +* Deterministic detection of any invalid free (unallocated, unaligned, etc.) 
+ * Validation of the size passed for C++14 sized deallocation by `delete` + even for code compiled with earlier standards (detects type confusion if + the size is different) and by various containers using the allocator API + directly +* Isolated memory region for slab allocations + * Top-level isolated regions for each arena + * Divided up into isolated inner regions for each size class + * High entropy random base for each size class region + * No deterministic / low entropy offsets between allocations with + different size classes + * Metadata is completely outside the slab allocation region + * No references to metadata within the slab allocation region + * No deterministic / low entropy offsets to metadata + * Entire slab region starts out non-readable and non-writable + * Slabs beyond the cache limit are purged and become non-readable and + non-writable memory again + * Placed into a queue for reuse in FIFO order to maximize the time + spent memory protected + * Randomized array is used to add a random delay for reuse +* Fine-grained randomization within memory regions + * Randomly sized guard regions for large allocations + * Random slot selection within slabs + * Randomized delayed free for small and large allocations along with slabs + themselves + * [in-progress] Randomized choice of slabs + * [in-progress] Randomized allocation of slabs +* Slab allocations are zeroed on free +* Detection of write-after-free for slab allocations by verifying zero filling + is intact at allocation time +* Delayed free via a combination of FIFO and randomization for slab allocations +* Large allocations are purged and memory protected on free with the memory + mapping kept reserved in a quarantine to detect use-after-free + * The quarantine is primarily based on a FIFO ring buffer, with the oldest + mapping in the quarantine being unmapped to make room for the most + recently freed mapping + * Another layer of the quarantine swaps with a random slot in an array to + randomize the number of large deallocations required to push mappings out + of the quarantine +* Memory in fresh allocations is consistently zeroed due to it either being + fresh pages or zeroed on free after previous usage +* Random canaries placed after each slab allocation to *absorb* + and then later detect overflows/underflows + * High entropy per-slab random values + * Leading byte is zeroed to contain C string overflows +* Possible slab locations are skipped and remain memory protected, leaving slab + size class regions interspersed with guard pages +* Zero size allocations are a dedicated size class with the entire region + remaining non-readable and non-writable +* Extension for retrieving the size of allocations with fallback to a sentinel + for pointers not managed by the allocator [in-progress, full implementation + needs to be ported from the previous OpenBSD malloc-based allocator] + * Can also return accurate values for pointers *within* small allocations + * The same applies to pointers within the first page of large allocations, + otherwise it currently has to return a sentinel +* No alignment tricks interfering with ASLR like jemalloc, PartitionAlloc, etc. +* No usage of the legacy brk heap +* Aggressive sanity checks + * Errors other than ENOMEM from mmap, munmap, mprotect and mremap treated + as fatal, which can help to detect memory management gone wrong elsewhere + in the process. 
+* Memory tagging for slab allocations via MTE on ARMv8.5+ + * random memory tags as the baseline, providing probabilistic protection + against various forms of memory corruption + * dedicated tag for free slots, set on free, for deterministic protection + against accessing freed memory + * guarantee distinct tags for adjacent memory allocations by incrementing + past matching values for deterministic detection of linear overflows + * [future] store previous random tag and increment it to get the next tag + for that slot to provide deterministic use-after-free detection through + multiple cycles of memory reuse + +## Randomness + +The current implementation of random number generation for randomization-based +mitigations is based on generating a keystream from a stream cipher (ChaCha8) +in small chunks. Separate CSPRNGs are used for each small size class in each +arena, large allocations and initialization in order to fit into the +fine-grained locking model without needing to waste memory per thread by +having the CSPRNG state in Thread Local Storage. Similarly, it's protected via +the same approach taken for the rest of the metadata. The stream cipher is +regularly reseeded from the OS to provide backtracking and prediction +resistance with a negligible cost. The reseed interval simply needs to be +adjusted to the point that it stops registering as having any significant +performance impact. The performance impact on recent Linux kernels is +primarily from the high cost of system calls and locking since the +implementation is quite efficient (ChaCha20), especially for just generating +the key and nonce for another stream cipher (ChaCha8). + +ChaCha8 is a great fit because it's extremely fast across platforms without +relying on hardware support or complex platform-specific code. The security +margins of ChaCha20 would be completely overkill for the use case. Using +ChaCha8 avoids needing to resort to a non-cryptographically secure PRNG or +something without a lot of scrutiny. The current implementation is simply the +reference implementation of ChaCha8 converted into a pure keystream by ripping +out the XOR of the message into the keystream. + +The random range generation functions are a highly optimized implementation +too. Traditional uniform random number generation within a range is very high +overhead and can easily dwarf the cost of an efficient CSPRNG. + +## Size classes + +The zero byte size class is a special case of the smallest regular size class. +It's allocated in a dedicated region like other size classes but with the slabs +never being made readable and writable so the only memory usage is for the slab +metadata. + +The choice of size classes for slab allocation is the same as jemalloc, which +is a careful balance between minimizing internal and external fragmentation. If +there are more size classes, more memory is wasted on free slots available only +to allocation requests of those sizes (external fragmentation). If there are +fewer size classes, the spacing between them is larger and more memory is +wasted due to rounding up to the size classes (internal fragmentation). There +are 4 special size classes for the smallest sizes (16, 32, 48, 64) that are +simply spaced out by the minimum spacing (16). Afterwards, there are four size +classes for every power of two spacing which results in bounding the internal +fragmentation below 20% for each size class. This also means there are 4 size +classes for each doubling in size. 
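To make the spacing rule concrete, here is a small standalone C sketch
(illustrative only, not code from `h_malloc.c`) that rounds a requested size up
to its size class following the scheme described above: four special classes
spaced by the 16 byte minimum, then four classes per power-of-two doubling.

```c
#include <stddef.h>
#include <stdio.h>

// Illustrative only: round a request up to its slab allocation size class
// under the spacing scheme described above.
static size_t size_class(size_t size) {
    if (size == 0) {
        return 0; // the zero-byte size class is a special case
    }
    if (size <= 64) {
        // the 4 special classes (16, 32, 48, 64) use the minimum spacing of 16
        return (size + 15) & ~(size_t)15;
    }
    // find the enclosing power-of-two group; the spacing is an eighth of the
    // group's upper bound, which yields 4 size classes per doubling in size
    size_t bound = 128;
    while (bound < size) {
        bound *= 2;
    }
    size_t spacing = bound / 8;
    return (size + spacing - 1) & ~(spacing - 1);
}

int main(void) {
    // 17 -> 32, 129 -> 160 (the ~19.4% worst case), 4097 -> 5120
    printf("%zu %zu %zu\n", size_class(17), size_class(129), size_class(4097));
    return 0;
}
```

Under this rule the rounding waste stays just under 20% for every class above
the special ones, matching the worst case internal fragmentation column in the
tables below.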
+ +The slot counts tied to the size classes are specific to this allocator rather +than being taken from jemalloc. Slabs are always a span of pages so the slot +count needs to be tuned to minimize waste due to rounding to the page size. For +now, this allocator is set up only for 4096 byte pages as a small page size is +desirable for finer-grained memory protection and randomization. It could be +ported to larger page sizes in the future. The current slot counts are only a +preliminary set of values. + +| size class | worst case internal fragmentation | slab slots | slab size | internal fragmentation for slabs | +| - | - | - | - | - | +| 16 | 93.75% | 256 | 4096 | 0.0% | +| 32 | 46.88% | 128 | 4096 | 0.0% | +| 48 | 31.25% | 85 | 4096 | 0.390625% | +| 64 | 23.44% | 64 | 4096 | 0.0% | +| 80 | 18.75% | 51 | 4096 | 0.390625% | +| 96 | 15.62% | 42 | 4096 | 1.5625% | +| 112 | 13.39% | 36 | 4096 | 1.5625% | +| 128 | 11.72% | 64 | 8192 | 0.0% | +| 160 | 19.38% | 51 | 8192 | 0.390625% | +| 192 | 16.15% | 64 | 12288 | 0.0% | +| 224 | 13.84% | 54 | 12288 | 1.5625% | +| 256 | 12.11% | 64 | 16384 | 0.0% | +| 320 | 19.69% | 64 | 20480 | 0.0% | +| 384 | 16.41% | 64 | 24576 | 0.0% | +| 448 | 14.06% | 64 | 28672 | 0.0% | +| 512 | 12.3% | 64 | 32768 | 0.0% | +| 640 | 19.84% | 64 | 40960 | 0.0% | +| 768 | 16.54% | 64 | 49152 | 0.0% | +| 896 | 14.17% | 64 | 57344 | 0.0% | +| 1024 | 12.4% | 64 | 65536 | 0.0% | +| 1280 | 19.92% | 16 | 20480 | 0.0% | +| 1536 | 16.6% | 16 | 24576 | 0.0% | +| 1792 | 14.23% | 16 | 28672 | 0.0% | +| 2048 | 12.45% | 16 | 32768 | 0.0% | +| 2560 | 19.96% | 8 | 20480 | 0.0% | +| 3072 | 16.63% | 8 | 24576 | 0.0% | +| 3584 | 14.26% | 8 | 28672 | 0.0% | +| 4096 | 12.48% | 8 | 32768 | 0.0% | +| 5120 | 19.98% | 8 | 40960 | 0.0% | +| 6144 | 16.65% | 8 | 49152 | 0.0% | +| 7168 | 14.27% | 8 | 57344 | 0.0% | +| 8192 | 12.49% | 8 | 65536 | 0.0% | +| 10240 | 19.99% | 6 | 61440 | 0.0% | +| 12288 | 16.66% | 5 | 61440 | 0.0% | +| 14336 | 14.28% | 4 | 57344 | 0.0% | +| 16384 | 12.49% | 4 | 65536 | 0.0% | + +The slab allocation size classes end at 16384 since that's the final size for +2048 byte spacing and the next spacing class matches the page size of 4096 +bytes on the target platforms. This is the minimum set of small size classes +required to avoid substantial waste from rounding. + +The `CONFIG_EXTENDED_SIZE_CLASSES` option extends the size classes up to +131072, with a final spacing class of 16384. This offers improved performance +compared to the minimum set of size classes. The security story is complicated, +since the slab allocation has both advantages like size class isolation +completely avoiding reuse of any of the address space for any other size +classes or other data. It also has disadvantages like caching a small number of +empty slabs and deterministic guard sizes. The cache will be configurable in +the future, making it possible to disable slab caching for the largest slab +allocation sizes, to force unmapping them immediately and putting them in the +slab quarantine, which eliminates most of the security disadvantage at the +expense of also giving up most of the performance advantage, but while +retaining the isolation. 
+ +| size class | worst case internal fragmentation | slab slots | slab size | internal fragmentation for slabs | +| - | - | - | - | - | +| 20480 | 20.0% | 1 | 20480 | 0.0% | +| 24576 | 16.66% | 1 | 24576 | 0.0% | +| 28672 | 14.28% | 1 | 28672 | 0.0% | +| 32768 | 12.5% | 1 | 32768 | 0.0% | +| 40960 | 20.0% | 1 | 40960 | 0.0% | +| 49152 | 16.66% | 1 | 49152 | 0.0% | +| 57344 | 14.28% | 1 | 57344 | 0.0% | +| 65536 | 12.5% | 1 | 65536 | 0.0% | +| 81920 | 20.0% | 1 | 81920 | 0.0% | +| 98304 | 16.67% | 1 | 98304 | 0.0% | +| 114688 | 14.28% | 1 | 114688 | 0.0% | +| 131072 | 12.5% | 1 | 131072 | 0.0% | + +The `CONFIG_LARGE_SIZE_CLASSES` option controls whether large allocations use +the same size class scheme providing 4 size classes for every doubling of size. +It increases virtual memory consumption but drastically improves performance +where realloc is used without proper growth factors, which is fairly common and +destroys performance in some commonly used programs. If large size classes are +disabled, the granularity is instead the page size, which is currently always +4096 bytes on supported platforms. + +## Scalability + +### Small (slab) allocations + +As a baseline form of fine-grained locking, the slab allocator has entirely +separate allocators for each size class. Each size class has a dedicated lock, +CSPRNG and other state. + +The slab allocator's scalability primarily comes from dividing up the slab +allocation region into independent arenas assigned to threads. The arenas are +just entirely separate slab allocators with their own sub-regions for each size +class. Using 4 arenas reserves a region 4 times as large and the relevant slab +allocator metadata is determined based on address, as part of the same approach +to finding the per-size-class metadata. The part that's still open to different +design choices is how arenas are assigned to threads. One approach is +statically assigning arenas via round-robin like the standard jemalloc +implementation, or statically assigning to a random arena which is essentially +the current implementation. Another option is dynamic load balancing via a +heuristic like `sched_getcpu` for per-CPU arenas, which would offer better +performance than randomly choosing an arena each time while being more +predictable for an attacker. There are actually some security benefits from +this assignment being completely static, since it isolates threads from each +other. Static assignment can also reduce memory usage since threads may have +varying usage of size classes. + +When there's substantial allocation or deallocation pressure, the allocator +does end up calling into the kernel to purge / protect unused slabs by +replacing them with fresh `PROT_NONE` regions along with unprotecting slabs +when partially filled and cached empty slabs are depleted. There will be +configuration over the amount of cached empty slabs, but it's not entirely a +performance vs. memory trade-off since memory protecting unused slabs is a nice +opportunistic boost to security. However, it's not really part of the core +security model or features so it's quite reasonable to use much larger empty +slab caches when the memory usage is acceptable. It would also be reasonable to +attempt to use heuristics for dynamically tuning the size, but there's not a +great one size fits all approach so it isn't currently part of this allocator +implementation. 
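To give a concrete picture of the address-based metadata lookup described
above and in the core design section, here is a hypothetical C sketch. The
flat arena and size class ordering, the names and the size class count are
assumptions made for illustration rather than the actual layout in
`h_malloc.c`; only the 32 GiB class region size corresponds to the documented
`CONFIG_CLASS_REGION_SIZE` default.

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

// Assumed layout for the sketch: one reserved slab region divided into arenas,
// each arena divided into fixed-size per-size-class regions.
#define N_ARENA 4u
#define N_SIZE_CLASSES 49u                 // assumed count, for illustration
#define CLASS_REGION_SIZE (1ULL << 35)     // 32 GiB, the documented default

struct slab_location {
    unsigned arena;
    unsigned size_class;
    uint64_t class_region_offset; // slab index = offset / slab size
};

// base is the start of the reserved slab allocation region chosen at startup
static bool locate_slab_allocation(uintptr_t base, const void *p,
                                   struct slab_location *out) {
    uint64_t offset = (uintptr_t)p - base;
    uint64_t arena_size = (uint64_t)N_SIZE_CLASSES * CLASS_REGION_SIZE;
    if (offset >= (uint64_t)N_ARENA * arena_size) {
        return false; // outside the slab region, so treated as a large allocation
    }
    out->arena = (unsigned)(offset / arena_size);
    out->size_class = (unsigned)((offset % arena_size) / CLASS_REGION_SIZE);
    out->class_region_offset = offset % CLASS_REGION_SIZE;
    return true;
}

int main(void) {
    // toy base address purely to exercise the arithmetic
    uintptr_t base = (uintptr_t)1 << 44;
    struct slab_location loc;
    void *p = (void *)(base + 3 * CLASS_REGION_SIZE + 4096);
    if (locate_slab_allocation(base, p, &loc)) {
        printf("arena %u, size class %u\n", loc.arena, loc.size_class);
    }
    return 0;
}
```

From the size class, the slab size is known, so the slab index and then the
slot bit within the out-of-line bitmap follow from simple division, which is
the walk described in the core design section.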
+ +#### Thread caching (or lack thereof) + +Thread caches are a commonly implemented optimization in modern allocators but +aren't very suitable for a hardened allocator even when implemented via arrays +like jemalloc rather than free lists. They would prevent the allocator from +having perfect knowledge about which memory is free in a way that's both race +free and works with fully out-of-line metadata. It would also interfere with +the quality of fine-grained randomization even with randomization support in +the thread caches. The caches would also end up with much weaker protection +than the dedicated metadata region. Potentially worst of all, it's inherently +incompatible with the important quarantine feature. + +The primary benefit from a thread cache is performing batches of allocations +and batches of deallocations to amortize the cost of the synchronization used +by locking. The issue is not contention but rather the cost of synchronization +itself. Performing operations in large batches isn't necessarily a good thing +in terms of reducing contention to improve scalability. Large thread caches +like TCMalloc are a legacy design choice and aren't a good approach for a +modern allocator. In jemalloc, thread caches are fairly small and have a form +of garbage collection to clear them out when they aren't being heavily used. +Since this is a hardened allocator with a bunch of small costs for the security +features, the synchronization is already a smaller percentage of the overall +time compared to a much leaner performance-oriented allocator. These benefits +could be obtained via allocation queues and deallocation queues which would +avoid bypassing the quarantine and wouldn't have as much of an impact on +randomization. However, deallocation queues would also interfere with having +global knowledge about what is free. An allocation queue alone wouldn't have +many drawbacks, but it isn't currently planned even as an optional feature +since it probably wouldn't be enabled by default and isn't worth the added +complexity. + +The secondary benefit of thread caches is being able to avoid the underlying +allocator implementation entirely for some allocations and deallocations when +they're mixed together rather than many allocations being done together or many +frees being done together. The value of this depends a lot on the application +and it's entirely unsuitable / incompatible with a hardened allocator since it +bypasses all of the underlying security and would destroy much of the security +value. + +### Large allocations + +The expectation is that the allocator does not need to perform well for large +allocations, especially in terms of scalability. When the performance for large +allocations isn't good enough, the approach will be to enable more slab +allocation size classes. Doubling the maximum size of slab allocations only +requires adding 4 size classes while keeping internal waste bounded below 20%. + +Large allocations are implemented as a wrapper on top of the kernel memory +mapping API. The addresses and sizes are tracked in a global data structure +with a global lock. The current implementation is a hash table and could easily +use fine-grained locking, but it would have little benefit since most of the +locking is in the kernel. Most of the contention will be on the `mmap_sem` lock +for the process in the kernel. Ideally, it could simply map memory when +allocating and unmap memory when freeing. 
However, this is a hardened allocator +and the security features require extra system calls due to lack of direct +support for this kind of hardening in the kernel. Randomly sized guard regions +are placed around each allocation which requires mapping a `PROT_NONE` region +including the guard regions and then unprotecting the usable area between them. +The quarantine implementation requires clobbering the mapping with a fresh +`PROT_NONE` mapping using `MAP_FIXED` on free to hold onto the region while +it's in the quarantine, until it's eventually unmapped when it's pushed out of +the quarantine. This means there are 2x as many system calls for allocating and +freeing as there would be if the kernel supported these features directly. + +## Memory tagging + +**Memory tagging has been implemented and this section is currently +out-of-date.** + +Integrating extensive support for ARMv8.5 memory tagging is planned and this +section will be expanded to cover the details on the chosen design. The approach +for slab allocations is currently covered, but it can also be used for the +allocator metadata region and large allocations. + +Memory allocations are already always multiples of naturally aligned 16 byte +units, so memory tags are a natural fit into a malloc implementation due to the +16 byte alignment requirement. The only extra memory consumption will come from +the hardware supported storage for the tag values (4 bits per 16 bytes). + +The baseline policy will be to generate random tags for each slab allocation +slot on first use. The highest value will be reserved for marking freed memory +allocations to detect any accesses to freed memory so it won't be part of the +generated range. Adjacent slots will be guaranteed to have distinct memory tags +in order to guarantee that linear overflows are detected. There are a few ways +of implementing this and it will end up depending on the performance costs of +different approaches. If there's an efficient way to fetch the adjacent tag +values without wasting extra memory, it will be possible to check for them and +skip them either by generating a new random value in a loop or incrementing +past them since the tiny bit of bias wouldn't matter. Another approach would be +alternating odd and even tag values but that would substantially reduce the +overall randomness of the tags and there's very little entropy from the start. + +Once a slab allocation has been freed, the tag will be set to the reserved +value for free memory and the previous tag value will be stored inside the +allocation itself. The next time the slot is allocated, the chosen tag value +will be the previous value incremented by one to provide use-after-free +detection between generations of allocations. The stored tag will be wiped +before retagging the memory, to avoid leaking it and as part of preserving the +security property of newly allocated memory being zeroed due to zero-on-free. +It will eventually wrap all the way around, but this ends up providing a strong +guarantee for many allocation cycles due to the combination of 4 bit tags with +the FIFO quarantine feature providing delayed free. It also benefits from +random slot allocation and the randomized portion of delayed free, which result +in a further delay along with preventing a deterministic bypass by forcing a +reuse after a certain number of allocation cycles. Similarly to the initial tag +generation, tag values for adjacent allocations will be skipped by incrementing +past them. 
+ +For example, consider this slab of allocations that are not yet used with 15 +representing the tag for free memory. For the sake of simplicity, there will be +no quarantine or other slabs for this example: + + | 15 | 15 | 15 | 15 | 15 | 15 | + +Three slots are randomly chosen for allocations, with random tags assigned (2, +7, 14) since these slots haven't ever been used and don't have saved values: + + | 15 | 2 | 15 | 7 | 14 | 15 | + +The 2nd allocation slot is freed, and is set back to the tag for free memory +(15), but with the previous tag value stored in the freed space: + + | 15 | 15 | 15 | 7 | 14 | 15 | + +The first slot is allocated for the first time, receiving the random value 3: + + | 3 | 15 | 15 | 7 | 14 | 15 | + +The 2nd slot is randomly chosen again, so the previous tag (2) is retrieved and +incremented to 3 as part of the use-after-free mitigation. An adjacent +allocation already uses the tag 3, so the tag is further incremented to 4 (it +would be incremented to 5 if one of the adjacent tags was 4): + + | 3 | 4 | 15 | 7 | 14 | 15 | + +The last slot is randomly chosen for the next allocation, and is assigned the +random value 14. However, it's placed next to an allocation with the tag 14 so +the tag is incremented and wraps around to 0: + + | 3 | 4 | 15 | 7 | 14 | 0 | + +## API extensions + +The `void free_sized(void *ptr, size_t expected_size)` function exposes the +sized deallocation sanity checks for C. A performance-oriented allocator could +use the same API as an optimization to avoid a potential cache miss from +reading the size from metadata. + +The `size_t malloc_object_size(void *ptr)` function returns an *upper bound* on +the accessible size of the relevant object (if any) by querying the malloc +implementation. It's similar to the `__builtin_object_size` intrinsic used by +`_FORTIFY_SOURCE` but via dynamically querying the malloc implementation rather +than determining constant sizes at compile-time. The current implementation is +just a naive placeholder returning much looser upper bounds than the intended +implementation. It's a valid implementation of the API already, but it will +become fully accurate once it's finished. This function is **not** currently +safe to call from signal handlers, but another API will be provided to make +that possible with a compile-time configuration option to avoid the necessary +overhead if the functionality isn't being used (in a way that doesn't change +break API compatibility based on the configuration). + +The `size_t malloc_object_size_fast(void *ptr)` is comparable, but avoids +expensive operations like locking or even atomics. It provides significantly +less useful results falling back to higher upper bounds, but is very fast. In +this implementation, it retrieves an upper bound on the size for small memory +allocations based on calculating the size class region. This function is safe +to use from signal handlers already. + +## Stats + +If stats are enabled, hardened\_malloc keeps tracks allocator statistics in +order to provide implementations of `mallinfo` and `malloc_info`. + +On Android, `mallinfo` is used for [mallinfo-based garbage collection +triggering](https://developer.android.com/preview/features#mallinfo) so +hardened\_malloc enables `CONFIG_STATS` by default. The `malloc_info` +implementation on Android is the standard one in Bionic, with the information +provided to Bionic via Android's internal extended `mallinfo` API with support +for arenas and size class bins. 
This means the `malloc_info` output is fully +compatible, including still having `jemalloc-1` as the version of the data +format to retain compatibility with existing tooling. + +On non-Android Linux, `mallinfo` has zeroed fields even with `CONFIG_STATS` +enabled because glibc `mallinfo` is inherently broken. It defines the fields as +`int` instead of `size_t`, resulting in undefined signed overflows. It also +misuses the fields and provides a strange, idiosyncratic set of values rather +than following the SVID/XPG `mallinfo` definition. The `malloc_info` function +is still provided, with a similar format as what Android uses, with tweaks for +hardened\_malloc and the version set to `hardened_malloc-1`. The data format +may be changed in the future. + +As an example, consider the following program from the hardened\_malloc tests: + +```c +#include + +#include + +__attribute__((optimize(0))) +void leak_memory(void) { + (void)malloc(1024 * 1024 * 1024); + (void)malloc(16); + (void)malloc(32); + (void)malloc(4096); +} + +void *do_work(void *p) { + leak_memory(); + return NULL; +} + +int main(void) { + pthread_t thread[4]; + for (int i = 0; i < 4; i++) { + pthread_create(&thread[i], NULL, do_work, NULL); + } + for (int i = 0; i < 4; i++) { + pthread_join(thread[i], NULL); + } + + malloc_info(0, stdout); +} +``` + +This produces the following output when piped through `xmllint --format -`: + +```xml + + + + + 1 + 0 + 4096 + 32 + + + 1 + 0 + 4096 + 48 + + + 4 + 0 + 20480 + 1280 + + + 2 + 0 + 40960 + 10240 + + + 1 + 0 + 81920 + 81920 + + + + + 1 + 0 + 4096 + 32 + + + 1 + 0 + 4096 + 48 + + + 1 + 0 + 40960 + 5120 + + + + + 1 + 0 + 4096 + 32 + + + 1 + 0 + 4096 + 48 + + + 1 + 0 + 40960 + 5120 + + + + + 1 + 0 + 4096 + 32 + + + 1 + 0 + 4096 + 48 + + + 1 + 0 + 40960 + 5120 + + + + 4294967296 + + +``` + +The heap entries correspond to the arenas. Unlike jemalloc, hardened\_malloc +doesn't handle large allocations within the arenas, so it presents those in the +`malloc_info` statistics as a separate arena dedicated to large allocations. +For example, with 4 arenas enabled, there will be a 5th arena in the statistics +for the large allocations. + +The `nmalloc` / `ndalloc` fields are 64-bit integers tracking allocation and +deallocation count. These are defined as wrapping on overflow, per the jemalloc +implementation. + +See the [section on size classes](#size-classes) to map the size class bin +number to the corresponding size class. The bin index begins at 0, mapping to +the 0 byte size class, followed by 1 for the 16 bytes, 2 for 32 bytes, etc. and +large allocations are treated as one group. + +When stats aren't enabled, the `malloc_info` output will be an empty `malloc` +element. + +## System calls + +This is intended to aid with creating system call whitelists via seccomp-bpf +and will change over time. 
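As a rough illustration of how the lists below might be consumed, a sandboxing
application could allow the core set with libseccomp along these lines. This
is a hypothetical, trimmed sketch and not something shipped by this project; a
real filter would also have to cover the rest of the application's system
calls.

```c
#include <errno.h>
#include <stddef.h>
#include <seccomp.h> // link with -lseccomp

// Allow only the system calls hardened_malloc uses in all build
// configurations and return EPERM for everything else.
static int install_allocator_allowlist(void) {
    scmp_filter_ctx ctx = seccomp_init(SCMP_ACT_ERRNO(EPERM));
    if (ctx == NULL) {
        return -1;
    }
    const int calls[] = {
        SCMP_SYS(futex),    SCMP_SYS(getrandom), SCMP_SYS(mmap),
        SCMP_SYS(mprotect), SCMP_SYS(mremap),    SCMP_SYS(munmap),
        SCMP_SYS(madvise),  SCMP_SYS(write),
    };
    for (size_t i = 0; i < sizeof(calls) / sizeof(calls[0]); i++) {
        if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, calls[i], 0) < 0) {
            seccomp_release(ctx);
            return -1;
        }
    }
    int ret = seccomp_load(ctx);
    seccomp_release(ctx);
    return ret;
}
```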
+ +System calls used by all build configurations: + +* `futex(uaddr, FUTEX_WAIT_PRIVATE, val, NULL)` (via `pthread_mutex_lock`) +* `futex(uaddr, FUTEX_WAKE_PRIVATE, val)` (via `pthread_mutex_unlock`) +* `getrandom(buf, buflen, 0)` (to seed and regularly reseed the CSPRNG) +* `mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0)` +* `mmap(ptr, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0)` +* `mprotect(ptr, size, PROT_READ)` +* `mprotect(ptr, size, PROT_READ|PROT_WRITE)` +* `mremap(old, old_size, new_size, 0)` +* `mremap(old, old_size, new_size, MREMAP_MAYMOVE|MREMAP_FIXED, new)` +* `munmap` +* `write(STDERR_FILENO, buf, len)` (before aborting due to memory corruption) +* `madvise(ptr, size, MADV_DONTNEED)` + +The main distinction from a typical malloc implementation is the use of +getrandom. A common compatibility issue is that existing system call whitelists +often omit getrandom partly due to older code using the legacy `/dev/urandom` +interface along with the overall lack of security features in mainstream libc +implementations. + +Additional system calls when `CONFIG_SEAL_METADATA=true` is set: + +* `pkey_alloc` +* `pkey_mprotect` instead of `mprotect` with an additional `pkey` parameter, + but otherwise the same (regular `mprotect` is never called) + +Additional system calls for Android builds with `LABEL_MEMORY`: + +* `prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ptr, size, name)` diff --git a/src/hardened_malloc/androidtest/Android.bp b/src/hardened_malloc/androidtest/Android.bp new file mode 100644 index 0000000..ae0aa49 --- /dev/null +++ b/src/hardened_malloc/androidtest/Android.bp @@ -0,0 +1,25 @@ +java_test_host { + name: "HMallocTest", + srcs: [ + "src/**/*.java", + ], + + libs: [ + "tradefed", + "compatibility-tradefed", + "compatibility-host-util", + ], + + static_libs: [ + "cts-host-utils", + "frameworks-base-hostutils", + ], + + test_suites: [ + "general-tests", + ], + + data_device_bins_64: [ + "memtag_test", + ], +} diff --git a/src/hardened_malloc/androidtest/AndroidTest.xml b/src/hardened_malloc/androidtest/AndroidTest.xml new file mode 100644 index 0000000..333f1dd --- /dev/null +++ b/src/hardened_malloc/androidtest/AndroidTest.xml @@ -0,0 +1,13 @@ + + + + + + + + + + diff --git a/src/hardened_malloc/androidtest/memtag/Android.bp b/src/hardened_malloc/androidtest/memtag/Android.bp new file mode 100644 index 0000000..14ab691 --- /dev/null +++ b/src/hardened_malloc/androidtest/memtag/Android.bp @@ -0,0 +1,16 @@ +cc_test { + name: "memtag_test", + srcs: ["memtag_test.cc"], + cflags: [ + "-Wall", + "-Werror", + "-Wextra", + "-O0", + ], + + compile_multilib: "64", + + sanitize: { + memtag_heap: true, + }, +} diff --git a/src/hardened_malloc/androidtest/memtag/memtag_test.cc b/src/hardened_malloc/androidtest/memtag/memtag_test.cc new file mode 100644 index 0000000..ca491d8 --- /dev/null +++ b/src/hardened_malloc/androidtest/memtag/memtag_test.cc @@ -0,0 +1,297 @@ +// needed to uncondionally enable assertions +#undef NDEBUG +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +using namespace std; + +using u8 = uint8_t; +using uptr = uintptr_t; +using u64 = uint64_t; + +const size_t DEFAULT_ALLOC_SIZE = 8; +const size_t CANARY_SIZE = 8; + +void do_context_switch() { + utsname s; + uname(&s); +} + +u8 get_pointer_tag(void *ptr) { + return (((uptr) ptr) >> 56) & 0xf; +} + +void *untag_pointer(void *ptr) { + const uintptr_t mask = UINTPTR_MAX >> 8; + return (void *) ((uintptr_t) ptr & 
mask); +} + +// This test checks that slab slot allocation uses tag that is distint from tags of its neighbors +// and from the tag of the previous allocation that used the same slot +void tag_distinctness() { + // 0 and 15 are reserved + const int min_tag = 1; + const int max_tag = 14; + + struct SizeClass { + int size; + int slot_cnt; + }; + + // values from size_classes[] and size_class_slots[] in h_malloc.c + SizeClass size_classes[] = { + { .size = 16, .slot_cnt = 256, }, + { .size = 32, .slot_cnt = 128, }, + // this size class is used by allocations that are made by the addr_tag_map, which breaks + // tag distinctess checks + // { .size = 48, .slot_cnt = 85, }, + { .size = 64, .slot_cnt = 64, }, + { .size = 80, .slot_cnt = 51, }, + { .size = 96, .slot_cnt = 42, }, + { .size = 112, .slot_cnt = 36, }, + { .size = 128, .slot_cnt = 64, }, + { .size = 160, .slot_cnt = 51, }, + { .size = 192, .slot_cnt = 64, }, + { .size = 224, .slot_cnt = 54, }, + { .size = 10240, .slot_cnt = 6, }, + { .size = 20480, .slot_cnt = 1, }, + }; + + int tag_usage[max_tag + 1]; + + for (size_t sc_idx = 0; sc_idx < sizeof(size_classes) / sizeof(SizeClass); ++sc_idx) { + SizeClass &sc = size_classes[sc_idx]; + + const size_t full_alloc_size = sc.size; + const size_t alloc_size = full_alloc_size - CANARY_SIZE; + + // "tdc" is short for "tag distinctness check" + int left_neighbor_tdc_cnt = 0; + int right_neighbor_tdc_cnt = 0; + int prev_alloc_tdc_cnt = 0; + + int iter_cnt = 600; + + unordered_map addr_tag_map; + addr_tag_map.reserve(iter_cnt * sc.slot_cnt); + + u64 seen_tags = 0; + + for (int iter = 0; iter < iter_cnt; ++iter) { + uptr allocations[256]; // 256 is max slot count + + for (int i = 0; i < sc.slot_cnt; ++i) { + u8 *p = (u8 *) malloc(alloc_size); + assert(p); + uptr addr = (uptr) untag_pointer(p); + u8 tag = get_pointer_tag(p); + + assert(tag >= min_tag && tag <= max_tag); + seen_tags |= 1 << tag; + ++tag_usage[tag]; + + // check most recent tags of left and right neighbors + + auto left = addr_tag_map.find(addr - full_alloc_size); + if (left != addr_tag_map.end()) { + assert(left->second != tag); + ++left_neighbor_tdc_cnt; + } + + auto right = addr_tag_map.find(addr + full_alloc_size); + if (right != addr_tag_map.end()) { + assert(right->second != tag); + ++right_neighbor_tdc_cnt; + } + + // check previous tag of this slot + auto prev = addr_tag_map.find(addr); + if (prev != addr_tag_map.end()) { + assert(prev->second != tag); + ++prev_alloc_tdc_cnt; + addr_tag_map.erase(addr); + } + + addr_tag_map.emplace(addr, tag); + + for (size_t j = 0; j < alloc_size; ++j) { + // check that slot is zeroed + assert(p[j] == 0); + // check that slot is readable and writable + p[j]++; + } + + allocations[i] = addr; + } + + // free some of allocations to allow their slots to be reused + for (int i = sc.slot_cnt - 1; i >= 0; i -= 2) { + free((void *) allocations[i]); + } + } + + // check that all of the tags were used, except reserved ones + assert(seen_tags == (0xffff & ~(1 << 0 | 1 << 15))); + + printf("size_class\t%i\t" "tdc_left %i\t" "tdc_right %i\t" "tdc_prev_alloc %i\n", + sc.size, left_neighbor_tdc_cnt, right_neighbor_tdc_cnt, prev_alloc_tdc_cnt); + + // make sure tag distinctess checks were actually performed + int min_tdc_cnt = sc.slot_cnt * iter_cnt / 5; + + assert(prev_alloc_tdc_cnt > min_tdc_cnt); + + if (sc.slot_cnt > 1) { + assert(left_neighbor_tdc_cnt > min_tdc_cnt); + assert(right_neighbor_tdc_cnt > min_tdc_cnt); + } + + // async tag check failures are reported on context switch + do_context_switch(); + 
} + + printf("\nTag use counters:\n"); + + int min = INT_MAX; + int max = 0; + double geomean = 0.0; + for (int i = min_tag; i <= max_tag; ++i) { + int v = tag_usage[i]; + geomean += log(v); + min = std::min(min, v); + max = std::max(max, v); + printf("%i\t%i\n", i, tag_usage[i]); + } + int tag_cnt = 1 + max_tag - min_tag; + geomean = exp(geomean / tag_cnt); + + double max_deviation = std::max((double) max - geomean, geomean - min); + + printf("geomean: %.2f, max deviation from geomean: %.2f%%\n", geomean, (100.0 * max_deviation) / geomean); +} + +u8* alloc_default() { + const size_t full_alloc_size = DEFAULT_ALLOC_SIZE + CANARY_SIZE; + set addrs; + + // make sure allocation has both left and right neighbors, otherwise overflow/underflow tests + // will fail when allocation is at the end/beginning of slab + for (;;) { + u8 *p = (u8 *) malloc(DEFAULT_ALLOC_SIZE); + assert(p); + + uptr addr = (uptr) untag_pointer(p); + uptr left = addr - full_alloc_size; + if (addrs.find(left) != addrs.end()) { + uptr right = addr + full_alloc_size; + if (addrs.find(right) != addrs.end()) { + return p; + } + } + + addrs.emplace(addr); + } +} + +volatile u8 u8_var; + +void read_after_free() { + u8 *p = alloc_default(); + free(p); + volatile u8 v = p[0]; + (void) v; +} + +void write_after_free() { + u8 *p = alloc_default(); + free(p); + p[0] = 1; +} + +void underflow_read() { + u8 *p = alloc_default(); + volatile u8 v = p[-1]; + (void) v; +} + +void underflow_write() { + u8 *p = alloc_default(); + p[-1] = 1; +} + +void overflow_read() { + u8 *p = alloc_default(); + volatile u8 v = p[DEFAULT_ALLOC_SIZE + CANARY_SIZE]; + (void) v; +} + +void overflow_write() { + u8 *p = alloc_default(); + p[DEFAULT_ALLOC_SIZE + CANARY_SIZE] = 1; +} + +void untagged_read() { + u8 *p = alloc_default(); + p = (u8 *) untag_pointer(p); + volatile u8 v = p[0]; + (void) v; +} + +void untagged_write() { + u8 *p = alloc_default(); + p = (u8 *) untag_pointer(p); + p[0] = 1; +} + +map> tests = { +#define TEST(s) { #s, s } + TEST(tag_distinctness), + TEST(read_after_free), + TEST(write_after_free), + TEST(overflow_read), + TEST(overflow_write), + TEST(underflow_read), + TEST(underflow_write), + TEST(untagged_read), + TEST(untagged_write), +#undef TEST +}; + +void segv_handler(int, siginfo_t *si, void *) { + fprintf(stderr, "SEGV_CODE %i", si->si_code); + exit(139); // standard exit code for SIGSEGV +} + +int main(int argc, char **argv) { + setbuf(stdout, NULL); + assert(argc == 2); + + auto test_name = string(argv[1]); + auto test_fn = tests[test_name]; + assert(test_fn != nullptr); + + assert(mallopt(M_BIONIC_SET_HEAP_TAGGING_LEVEL, M_HEAP_TAGGING_LEVEL_ASYNC) == 1); + + struct sigaction sa = { + .sa_sigaction = segv_handler, + .sa_flags = SA_SIGINFO, + }; + + assert(sigaction(SIGSEGV, &sa, nullptr) == 0); + + test_fn(); + do_context_switch(); + + return 0; +} diff --git a/src/hardened_malloc/androidtest/src/grapheneos/hmalloc/MemtagTest.java b/src/hardened_malloc/androidtest/src/grapheneos/hmalloc/MemtagTest.java new file mode 100644 index 0000000..8cb7a45 --- /dev/null +++ b/src/hardened_malloc/androidtest/src/grapheneos/hmalloc/MemtagTest.java @@ -0,0 +1,95 @@ +package grapheneos.hmalloc; + +import com.android.tradefed.device.DeviceNotAvailableException; +import com.android.tradefed.testtype.DeviceJUnit4ClassRunner; +import com.android.tradefed.testtype.junit4.BaseHostJUnit4Test; + +import org.junit.Test; +import org.junit.runner.RunWith; + +import java.io.IOException; +import java.util.ArrayList; + +import static 
org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +@RunWith(DeviceJUnit4ClassRunner.class) +public class MemtagTest extends BaseHostJUnit4Test { + + private static final String TEST_BINARY = "/data/local/tmp/memtag_test"; + + enum Result { + SUCCESS(0, ""), + // it's expected that the device is configured to use asymm MTE tag checking mode + ASYNC_MTE_ERROR(139, "SEGV_CODE 8"), + SYNC_MTE_ERROR(139, "SEGV_CODE 9"), + ; + + public final int exitCode; + public final String stderr; + + Result(int exitCode, String stderr) { + this.exitCode = exitCode; + this.stderr = stderr; + } + } + + private static final int SEGV_EXIT_CODE = 139; + + private void runTest(String name, Result expectedResult) throws DeviceNotAvailableException { + var args = new ArrayList(); + args.add(TEST_BINARY); + args.add(name); + String cmdLine = String.join(" ", args); + + var result = getDevice().executeShellV2Command(cmdLine); + + assertEquals("process exit code", expectedResult.exitCode, result.getExitCode().intValue()); + assertEquals("stderr", expectedResult.stderr, result.getStderr()); + } + + @Test + public void tag_distinctness() throws DeviceNotAvailableException { + runTest("tag_distinctness", Result.SUCCESS); + } + + @Test + public void read_after_free() throws DeviceNotAvailableException { + runTest("read_after_free", Result.SYNC_MTE_ERROR); + } + + @Test + public void write_after_free() throws DeviceNotAvailableException { + runTest("write_after_free", Result.ASYNC_MTE_ERROR); + } + + @Test + public void underflow_read() throws DeviceNotAvailableException { + runTest("underflow_read", Result.SYNC_MTE_ERROR); + } + + @Test + public void underflow_write() throws DeviceNotAvailableException { + runTest("underflow_write", Result.ASYNC_MTE_ERROR); + } + + @Test + public void overflow_read() throws DeviceNotAvailableException { + runTest("overflow_read", Result.SYNC_MTE_ERROR); + } + + @Test + public void overflow_write() throws DeviceNotAvailableException { + runTest("overflow_write", Result.ASYNC_MTE_ERROR); + } + + @Test + public void untagged_read() throws DeviceNotAvailableException { + runTest("untagged_read", Result.SYNC_MTE_ERROR); + } + + @Test + public void untagged_write() throws DeviceNotAvailableException { + runTest("untagged_write", Result.ASYNC_MTE_ERROR); + } +} diff --git a/src/hardened_malloc/arm_mte.h b/src/hardened_malloc/arm_mte.h new file mode 100644 index 0000000..ea3445e --- /dev/null +++ b/src/hardened_malloc/arm_mte.h @@ -0,0 +1,91 @@ +#ifndef ARM_MTE_H +#define ARM_MTE_H + +#include +#include + +// Returns a tagged pointer. +// See https://developer.arm.com/documentation/ddi0602/2023-09/Base-Instructions/IRG--Insert-Random-Tag- +static inline void *arm_mte_create_random_tag(void *p, u64 exclusion_mask) { + return __arm_mte_create_random_tag(p, exclusion_mask); +} + +// Tag the memory region with the tag specified in tag bits of tagged_ptr. Memory region itself is +// zeroed. +// tagged_ptr has to be aligned by 16, and len has to be a multiple of 16 (tag granule size). +// +// Arm's software optimization guide says: +// "it is recommended to use STZGM (or DCZGVA) to set tag if data is not a concern." (STZGM and +// DCGZVA are zeroing variants of tagging instructions). 
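For context on the exclusion_mask parameter of the IRG wrapper above: bit i of the mask prevents tag i from being chosen. Below is a hedged sketch of a caller that excludes the two reserved tags (0 and 15) mentioned in the test code earlier; it assumes an AArch64 toolchain with MTE support (e.g. -march=armv8.5-a+memtag), and the helper name is hypothetical, not from this tree:

    /* Hypothetical helper, for illustration only. */
    #include <arm_acle.h>
    #include <stdint.h>

    static void *tag_excluding_reserved(void *p) {
        /* bit i set => tag i is excluded from IRG's random choice */
        uint64_t exclusion_mask = (1u << 0) | (1u << 15);
        return __arm_mte_create_random_tag(p, exclusion_mask);
    }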
+// +// Contents of this function were copied from scudo: +// https://android.googlesource.com/platform/external/scudo/+/refs/tags/android-14.0.0_r1/standalone/memtag.h#167 +// +// scudo is licensed under the Apache License v2.0 with LLVM Exceptions, which is compatible with +// the hardened_malloc's MIT license +static inline void arm_mte_tag_and_clear_mem(void *tagged_ptr, size_t len) { + uintptr_t Begin = (uintptr_t) tagged_ptr; + uintptr_t End = Begin + len; + uintptr_t LineSize, Next, Tmp; + __asm__ __volatile__( + ".arch_extension memtag \n\t" + + // Compute the cache line size in bytes (DCZID_EL0 stores it as the log2 + // of the number of 4-byte words) and bail out to the slow path if DCZID_EL0 + // indicates that the DC instructions are unavailable. + "DCZID .req %[Tmp] \n\t" + "mrs DCZID, dczid_el0 \n\t" + "tbnz DCZID, #4, 3f \n\t" + "and DCZID, DCZID, #15 \n\t" + "mov %[LineSize], #4 \n\t" + "lsl %[LineSize], %[LineSize], DCZID \n\t" + ".unreq DCZID \n\t" + + // Our main loop doesn't handle the case where we don't need to perform any + // DC GZVA operations. If the size of our tagged region is less than + // twice the cache line size, bail out to the slow path since it's not + // guaranteed that we'll be able to do a DC GZVA. + "Size .req %[Tmp] \n\t" + "sub Size, %[End], %[Cur] \n\t" + "cmp Size, %[LineSize], lsl #1 \n\t" + "b.lt 3f \n\t" + ".unreq Size \n\t" + + "LineMask .req %[Tmp] \n\t" + "sub LineMask, %[LineSize], #1 \n\t" + + // STZG until the start of the next cache line. + "orr %[Next], %[Cur], LineMask \n\t" + + "1:\n\t" + "stzg %[Cur], [%[Cur]], #16 \n\t" + "cmp %[Cur], %[Next] \n\t" + "b.lt 1b \n\t" + + // DC GZVA cache lines until we have no more full cache lines. + "bic %[Next], %[End], LineMask \n\t" + ".unreq LineMask \n\t" + + "2: \n\t" + "dc gzva, %[Cur] \n\t" + "add %[Cur], %[Cur], %[LineSize] \n\t" + "cmp %[Cur], %[Next] \n\t" + "b.lt 2b \n\t" + + // STZG until the end of the tagged region. This loop is also used to handle + // slow path cases. 
+ + "3: \n\t" + "cmp %[Cur], %[End] \n\t" + "b.ge 4f \n\t" + "stzg %[Cur], [%[Cur]], #16 \n\t" + "b 3b \n\t" + + "4: \n\t" + + : [Cur] "+&r"(Begin), [LineSize] "=&r"(LineSize), [Next] "=&r"(Next), [Tmp] "=&r"(Tmp) + : [End] "r"(End) + : "memory" + ); +} +#endif diff --git a/src/hardened_malloc/calculate_waste.py b/src/hardened_malloc/calculate_waste.py new file mode 100755 index 0000000..ca26d9a --- /dev/null +++ b/src/hardened_malloc/calculate_waste.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 + +from sys import argv + +size_classes = [ + 16, 32, 48, 64, 80, 96, 112, 128, + 160, 192, 224, 256, + 320, 384, 448, 512, + 640, 768, 896, 1024, + 1280, 1536, 1792, 2048, + 2560, 3072, 3584, 4096, + 5120, 6144, 7168, 8192, + 10240, 12288, 14336, 16384, + 20480, 24576, 28672, 32768, + 40960, 49152, 57344, 65536, + 81920, 98304, 114688, 131072, +] + +size_class_slots = [ + 256, 128, 85, 64, 51, 42, 36, 64, + 51, 64, 54, 64, + 64, 64, 64, 64, + 64, 64, 64, 64, + 16, 16, 16, 16, + 8, 8, 8, 8, + 8, 8, 8, 8, + 6, 5, 4, 4, + 2, 2, 2, 2, + 1, 1, 1, 1, + 1, 1, 1, 1, +] + +fragmentation = [100 - 1 / 16 * 100] + +for i in range(len(size_classes) - 1): + size_class = size_classes[i + 1] + worst_case = size_classes[i] + 1 + used = worst_case / size_class + fragmentation.append(100 - used * 100); + +def page_align(size): + return (size + 4095) & ~4095 + +print("| ", end="") +print("size class", "worst case internal fragmentation", "slab slots", "slab size", "internal fragmentation for slabs", sep=" | ", end=" |\n") +print("| ", end='') +print("-", "-", "-", "-", "-", sep=" | ", end=" |\n") +for size, slots, fragmentation in zip(size_classes, size_class_slots, fragmentation): + used = size * slots + real = page_align(used) + print("| ", end='') + print(size, f"{fragmentation:.4}%", slots, real, str(100 - used / real * 100) + "%", sep=" | ", end=" |\n") + +if len(argv) < 2: + exit() + +max_bits = 256 +max_page_span = 16 + +print() + +print("maximum bitmap size is {}-bit".format(max_bits)) +print("maximum page span size is {} ({})".format(max_page_span, max_page_span * 4096)) + +for size_class in size_classes: + choices = [] + for bits in range(1, max_bits + 1): + used = size_class * bits + real = page_align(used) + if real > 65536: + continue + pages = real / 4096 + efficiency = used / real * 100 + choices.append((bits, used, real, pages, efficiency)) + + choices.sort(key=lambda x: x[4], reverse=True) + + print() + print("size_class:", size_class) + for choice in choices[:10]: + print(choice) diff --git a/src/hardened_malloc/chacha.c b/src/hardened_malloc/chacha.c new file mode 100644 index 0000000..541a7ac --- /dev/null +++ b/src/hardened_malloc/chacha.c @@ -0,0 +1,177 @@ +// Based on chacha-merged.c version 20080118 +// D. J. Bernstein +// Public domain. 
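Referring back to calculate_waste.py above: the worst-case internal fragmentation figure for a size class is driven by the smallest request that no longer fits the previous class, i.e. previous_class + 1 bytes rounded up to the current class. A standalone rendition of that arithmetic for a single class (editor's sketch; the full table is produced by the Python script):

    #include <stdio.h>

    int main(void) {
        /* example: the 192-byte class serves requests of 161..192 bytes */
        double size_class = 192.0;
        double worst_case_request = 160.0 + 1.0;   /* previous class is 160 */
        double used_fraction = worst_case_request / size_class;
        /* 100 - (161 / 192) * 100 = ~16.15%, matching the script's formula */
        printf("worst-case internal fragmentation: %.2f%%\n",
               100.0 - used_fraction * 100.0);
        return 0;
    }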
+ +#include "chacha.h" + +// ChaCha8 +static const unsigned rounds = 8; + +#define U8C(v) (v##U) +#define U32C(v) (v##U) + +#define U8V(v) ((u8)(v) & U8C(0xFF)) +#define U32V(v) ((u32)(v) & U32C(0xFFFFFFFF)) + +#define ROTL32(v, n) \ + (U32V((v) << (n)) | ((v) >> (32 - (n)))) + +#define U8TO32_LITTLE(p) \ + (((u32)((p)[0])) | \ + ((u32)((p)[1]) << 8) | \ + ((u32)((p)[2]) << 16) | \ + ((u32)((p)[3]) << 24)) + +#define U32TO8_LITTLE(p, v) \ + do { \ + (p)[0] = U8V((v)); \ + (p)[1] = U8V((v) >> 8); \ + (p)[2] = U8V((v) >> 16); \ + (p)[3] = U8V((v) >> 24); \ + } while (0) + +#define ROTATE(v, c) (ROTL32(v, c)) +#define XOR(v, w) ((v) ^ (w)) +#define PLUS(v, w) (U32V((v) + (w))) +#define PLUSONE(v) (PLUS((v), 1)) + +#define QUARTERROUND(a, b, c, d) \ + a = PLUS(a, b); d = ROTATE(XOR(d, a), 16); \ + c = PLUS(c, d); b = ROTATE(XOR(b, c), 12); \ + a = PLUS(a, b); d = ROTATE(XOR(d, a), 8); \ + c = PLUS(c, d); b = ROTATE(XOR(b, c), 7); + +static const char sigma[16] = "expand 32-byte k"; + +void chacha_keysetup(chacha_ctx *x, const u8 *k) { + x->input[0] = U8TO32_LITTLE(sigma + 0); + x->input[1] = U8TO32_LITTLE(sigma + 4); + x->input[2] = U8TO32_LITTLE(sigma + 8); + x->input[3] = U8TO32_LITTLE(sigma + 12); + x->input[4] = U8TO32_LITTLE(k + 0); + x->input[5] = U8TO32_LITTLE(k + 4); + x->input[6] = U8TO32_LITTLE(k + 8); + x->input[7] = U8TO32_LITTLE(k + 12); + x->input[8] = U8TO32_LITTLE(k + 16); + x->input[9] = U8TO32_LITTLE(k + 20); + x->input[10] = U8TO32_LITTLE(k + 24); + x->input[11] = U8TO32_LITTLE(k + 28); +} + +void chacha_ivsetup(chacha_ctx *x, const u8 *iv) { + x->input[12] = 0; + x->input[13] = 0; + x->input[14] = U8TO32_LITTLE(iv + 0); + x->input[15] = U8TO32_LITTLE(iv + 4); +} + +void chacha_keystream_bytes(chacha_ctx *x, u8 *c, u32 bytes) { + if (!bytes) { + return; + } + + u8 *ctarget; + u8 tmp[64]; + + u32 j0 = x->input[0]; + u32 j1 = x->input[1]; + u32 j2 = x->input[2]; + u32 j3 = x->input[3]; + u32 j4 = x->input[4]; + u32 j5 = x->input[5]; + u32 j6 = x->input[6]; + u32 j7 = x->input[7]; + u32 j8 = x->input[8]; + u32 j9 = x->input[9]; + u32 j10 = x->input[10]; + u32 j11 = x->input[11]; + u32 j12 = x->input[12]; + u32 j13 = x->input[13]; + u32 j14 = x->input[14]; + u32 j15 = x->input[15]; + + for (;;) { + if (bytes < 64) { + ctarget = c; + c = tmp; + } + u32 x0 = j0; + u32 x1 = j1; + u32 x2 = j2; + u32 x3 = j3; + u32 x4 = j4; + u32 x5 = j5; + u32 x6 = j6; + u32 x7 = j7; + u32 x8 = j8; + u32 x9 = j9; + u32 x10 = j10; + u32 x11 = j11; + u32 x12 = j12; + u32 x13 = j13; + u32 x14 = j14; + u32 x15 = j15; + for (unsigned i = rounds; i > 0; i -= 2) { + QUARTERROUND(x0, x4, x8, x12) + QUARTERROUND(x1, x5, x9, x13) + QUARTERROUND(x2, x6, x10, x14) + QUARTERROUND(x3, x7, x11, x15) + QUARTERROUND(x0, x5, x10, x15) + QUARTERROUND(x1, x6, x11, x12) + QUARTERROUND(x2, x7, x8, x13) + QUARTERROUND(x3, x4, x9, x14) + } + x0 = PLUS(x0, j0); + x1 = PLUS(x1, j1); + x2 = PLUS(x2, j2); + x3 = PLUS(x3, j3); + x4 = PLUS(x4, j4); + x5 = PLUS(x5, j5); + x6 = PLUS(x6, j6); + x7 = PLUS(x7, j7); + x8 = PLUS(x8, j8); + x9 = PLUS(x9, j9); + x10 = PLUS(x10, j10); + x11 = PLUS(x11, j11); + x12 = PLUS(x12, j12); + x13 = PLUS(x13, j13); + x14 = PLUS(x14, j14); + x15 = PLUS(x15, j15); + + j12 = PLUSONE(j12); + if (!j12) { + j13 = PLUSONE(j13); + // stopping at 2^70 bytes per nonce is user's responsibility + } + + U32TO8_LITTLE(c + 0, x0); + U32TO8_LITTLE(c + 4, x1); + U32TO8_LITTLE(c + 8, x2); + U32TO8_LITTLE(c + 12, x3); + U32TO8_LITTLE(c + 16, x4); + U32TO8_LITTLE(c + 20, x5); + U32TO8_LITTLE(c + 24, x6); + 
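Taken together, keysetup, ivsetup and keystream_bytes form a small ChaCha8 keystream API (declared in chacha.h further below). A hedged usage sketch follows; it assumes the snippet is compiled and linked alongside chacha.c and this tree's util.h typedefs, and the all-zero key/IV are purely illustrative:

    /* Editor's usage sketch, not code from this tree. */
    #include <stdio.h>
    #include "chacha.h"

    int main(void) {
        static const u8 key[CHACHA_KEY_SIZE] = {0};   /* fixed key only for illustration */
        static const u8 iv[CHACHA_IV_SIZE] = {0};
        u8 keystream[64];

        chacha_ctx ctx;
        chacha_keysetup(&ctx, key);
        chacha_ivsetup(&ctx, iv);
        chacha_keystream_bytes(&ctx, keystream, sizeof(keystream));

        printf("first keystream byte: 0x%02x\n", keystream[0]);
        return 0;
    }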
U32TO8_LITTLE(c + 28, x7); + U32TO8_LITTLE(c + 32, x8); + U32TO8_LITTLE(c + 36, x9); + U32TO8_LITTLE(c + 40, x10); + U32TO8_LITTLE(c + 44, x11); + U32TO8_LITTLE(c + 48, x12); + U32TO8_LITTLE(c + 52, x13); + U32TO8_LITTLE(c + 56, x14); + U32TO8_LITTLE(c + 60, x15); + + if (bytes <= 64) { + if (bytes < 64) { + for (unsigned i = 0; i < bytes; ++i) { + ctarget[i] = c[i]; + } + } + x->input[12] = j12; + x->input[13] = j13; + return; + } + bytes -= 64; + c += 64; + } +} diff --git a/src/hardened_malloc/chacha.h b/src/hardened_malloc/chacha.h new file mode 100644 index 0000000..81d070f --- /dev/null +++ b/src/hardened_malloc/chacha.h @@ -0,0 +1,17 @@ +#ifndef CHACHA_H +#define CHACHA_H + +#include "util.h" + +#define CHACHA_KEY_SIZE 32 +#define CHACHA_IV_SIZE 8 + +typedef struct { + u32 input[16]; +} chacha_ctx; + +void chacha_keysetup(chacha_ctx *x, const u8 *k); +void chacha_ivsetup(chacha_ctx *x, const u8 *iv); +void chacha_keystream_bytes(chacha_ctx *x, u8 *c, u32 bytes); + +#endif diff --git a/src/hardened_malloc/config/default.mk b/src/hardened_malloc/config/default.mk new file mode 100644 index 0000000..71b1cc4 --- /dev/null +++ b/src/hardened_malloc/config/default.mk @@ -0,0 +1,23 @@ +CONFIG_WERROR := true +CONFIG_NATIVE := true +CONFIG_CXX_ALLOCATOR := true +CONFIG_UBSAN := false +CONFIG_SEAL_METADATA := false +CONFIG_ZERO_ON_FREE := true +CONFIG_WRITE_AFTER_FREE_CHECK := true +CONFIG_SLOT_RANDOMIZE := true +CONFIG_SLAB_CANARY := true +CONFIG_SLAB_QUARANTINE_RANDOM_LENGTH := 1 +CONFIG_SLAB_QUARANTINE_QUEUE_LENGTH := 1 +CONFIG_EXTENDED_SIZE_CLASSES := true +CONFIG_LARGE_SIZE_CLASSES := true +CONFIG_GUARD_SLABS_INTERVAL := 1 +CONFIG_GUARD_SIZE_DIVISOR := 2 +CONFIG_REGION_QUARANTINE_RANDOM_LENGTH := 256 +CONFIG_REGION_QUARANTINE_QUEUE_LENGTH := 1024 +CONFIG_REGION_QUARANTINE_SKIP_THRESHOLD := 33554432 # 32MiB +CONFIG_FREE_SLABS_QUARANTINE_RANDOM_LENGTH := 32 +CONFIG_CLASS_REGION_SIZE := 34359738368 # 32GiB +CONFIG_N_ARENA := 4 +CONFIG_STATS := false +CONFIG_SELF_INIT := true diff --git a/src/hardened_malloc/config/light.mk b/src/hardened_malloc/config/light.mk new file mode 100644 index 0000000..88a0e1f --- /dev/null +++ b/src/hardened_malloc/config/light.mk @@ -0,0 +1,23 @@ +CONFIG_WERROR := true +CONFIG_NATIVE := true +CONFIG_CXX_ALLOCATOR := true +CONFIG_UBSAN := false +CONFIG_SEAL_METADATA := false +CONFIG_ZERO_ON_FREE := true +CONFIG_WRITE_AFTER_FREE_CHECK := false +CONFIG_SLOT_RANDOMIZE := false +CONFIG_SLAB_CANARY := true +CONFIG_SLAB_QUARANTINE_RANDOM_LENGTH := 0 +CONFIG_SLAB_QUARANTINE_QUEUE_LENGTH := 0 +CONFIG_EXTENDED_SIZE_CLASSES := true +CONFIG_LARGE_SIZE_CLASSES := true +CONFIG_GUARD_SLABS_INTERVAL := 8 +CONFIG_GUARD_SIZE_DIVISOR := 2 +CONFIG_REGION_QUARANTINE_RANDOM_LENGTH := 256 +CONFIG_REGION_QUARANTINE_QUEUE_LENGTH := 1024 +CONFIG_REGION_QUARANTINE_SKIP_THRESHOLD := 33554432 # 32MiB +CONFIG_FREE_SLABS_QUARANTINE_RANDOM_LENGTH := 32 +CONFIG_CLASS_REGION_SIZE := 34359738368 # 32GiB +CONFIG_N_ARENA := 4 +CONFIG_STATS := false +CONFIG_SELF_INIT := true diff --git a/src/hardened_malloc/h_malloc.c b/src/hardened_malloc/h_malloc.c new file mode 100644 index 0000000..ffcf0e4 --- /dev/null +++ b/src/hardened_malloc/h_malloc.c @@ -0,0 +1,2190 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "third_party/libdivide.h" + +#include "h_malloc.h" +#include "memory.h" +#include "memtag.h" +#include "mutex.h" +#include "pages.h" +#include "random.h" +#include "util.h" + +#ifdef USE_PKEY +#include +#endif + +#define 
SLAB_QUARANTINE (SLAB_QUARANTINE_RANDOM_LENGTH > 0 || SLAB_QUARANTINE_QUEUE_LENGTH > 0) +#define REGION_QUARANTINE (REGION_QUARANTINE_RANDOM_LENGTH > 0 || REGION_QUARANTINE_QUEUE_LENGTH > 0) +#define MREMAP_MOVE_THRESHOLD ((size_t)32 * 1024 * 1024) + +static_assert(sizeof(void *) == 8, "64-bit only"); + +static_assert(!WRITE_AFTER_FREE_CHECK || ZERO_ON_FREE, "WRITE_AFTER_FREE_CHECK depends on ZERO_ON_FREE"); + +static_assert(SLAB_QUARANTINE_RANDOM_LENGTH >= 0 && SLAB_QUARANTINE_RANDOM_LENGTH <= 65536, + "invalid slab quarantine random length"); +static_assert(SLAB_QUARANTINE_QUEUE_LENGTH >= 0 && SLAB_QUARANTINE_QUEUE_LENGTH <= 65536, + "invalid slab quarantine queue length"); +static_assert(REGION_QUARANTINE_RANDOM_LENGTH >= 0 && REGION_QUARANTINE_RANDOM_LENGTH <= 65536, + "invalid region quarantine random length"); +static_assert(REGION_QUARANTINE_QUEUE_LENGTH >= 0 && REGION_QUARANTINE_QUEUE_LENGTH <= 65536, + "invalid region quarantine queue length"); +static_assert(FREE_SLABS_QUARANTINE_RANDOM_LENGTH >= 0 && FREE_SLABS_QUARANTINE_RANDOM_LENGTH <= 65536, + "invalid free slabs quarantine random length"); + +static_assert(GUARD_SLABS_INTERVAL >= 1, "invalid guard slabs interval (minimum 1)"); +static_assert(GUARD_SIZE_DIVISOR >= 1, "invalid guard size divisor (minimum 1)"); +static_assert(CONFIG_CLASS_REGION_SIZE >= 1048576, "invalid class region size (minimum 1048576)"); +static_assert(CONFIG_CLASS_REGION_SIZE <= 1099511627776, "invalid class region size (maximum 1099511627776)"); +static_assert(REGION_QUARANTINE_SKIP_THRESHOLD >= 0, + "invalid region quarantine skip threshold (minimum 0)"); +static_assert(MREMAP_MOVE_THRESHOLD >= REGION_QUARANTINE_SKIP_THRESHOLD, + "mremap move threshold must be above region quarantine limit"); + +// either sizeof(u64) or 0 +static const size_t canary_size = SLAB_CANARY ? 
sizeof(u64) : 0; + +static_assert(N_ARENA >= 1, "must have at least 1 arena"); +static_assert(N_ARENA <= 256, "maximum number of arenas is currently 256"); +#define CACHELINE_SIZE 64 + +#if N_ARENA > 1 +__attribute__((tls_model("initial-exec"))) +static _Thread_local unsigned thread_arena = N_ARENA; +static atomic_uint thread_arena_counter = 0; +#else +static const unsigned thread_arena = 0; +#endif + +static union { + struct { + void *slab_region_start; + void *_Atomic slab_region_end; + struct size_class *size_class_metadata[N_ARENA]; + struct region_allocator *region_allocator; + struct region_metadata *regions[2]; +#ifdef USE_PKEY + int metadata_pkey; +#endif +#ifdef MEMTAG + bool is_memtag_disabled; +#endif + }; + char padding[PAGE_SIZE]; +} ro __attribute__((aligned(PAGE_SIZE))); + +static inline void *get_slab_region_end(void) { + return atomic_load_explicit(&ro.slab_region_end, memory_order_acquire); +} + +#ifdef MEMTAG +static inline bool is_memtag_enabled(void) { + return !ro.is_memtag_disabled; +} +#endif + +#define SLAB_METADATA_COUNT + +struct slab_metadata { + u64 bitmap[4]; + struct slab_metadata *next; + struct slab_metadata *prev; +#if SLAB_CANARY + u64 canary_value; +#endif +#ifdef SLAB_METADATA_COUNT + u16 count; +#endif +#if SLAB_QUARANTINE + u64 quarantine_bitmap[4]; +#endif +#ifdef HAS_ARM_MTE + // arm_mte_tags is used as a u4 array (MTE tags are 4-bit wide) + // + // Its size is calculated by the following formula: + // (MAX_SLAB_SLOT_COUNT + 2) / 2 + // MAX_SLAB_SLOT_COUNT is currently 256, 2 extra slots are needed for branchless handling of + // edge slots in tag_and_clear_slab_slot() + // + // It's intentionally placed at the end of struct to improve locality: for most size classes, + // slot count is far lower than MAX_SLAB_SLOT_COUNT. 
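On the arm_mte_tags sizing described above: two 4-bit tags pack into each byte, and the two extra sentinel entries give (256 + 2) / 2 = 129 bytes. A standalone sketch of nibble-array accessors follows (editor's sketch with hypothetical u4_get/u4_set names; the real u4_arr_get/u4_arr_set helpers live in util.h and may order the nibbles differently):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    /* hypothetical nibble accessors, for illustration only */
    static uint8_t u4_get(const uint8_t *arr, size_t i) {
        uint8_t b = arr[i / 2];
        return (i & 1) ? (uint8_t)(b >> 4) : (uint8_t)(b & 0xf);
    }

    static void u4_set(uint8_t *arr, size_t i, uint8_t v) {
        uint8_t b = arr[i / 2];
        arr[i / 2] = (i & 1) ? (uint8_t)((b & 0x0f) | (uint8_t)(v << 4))
                             : (uint8_t)((b & 0xf0) | (v & 0xf));
    }

    int main(void) {
        /* 256 slots plus 2 sentinel entries -> (256 + 2) / 2 = 129 bytes */
        uint8_t tags[(256 + 2) / 2] = {0};
        u4_set(tags, 1, 0xA);
        assert(u4_get(tags, 1) == 0xA);
        assert(u4_get(tags, 0) == 0 && u4_get(tags, 2) == 0);
        return 0;
    }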
+ u8 arm_mte_tags[129]; +#endif +}; + +static const size_t min_align = 16; +#define MIN_SLAB_SIZE_CLASS_SHIFT 4 + +#if !CONFIG_EXTENDED_SIZE_CLASSES +static const size_t max_slab_size_class = 16384; +#define MAX_SLAB_SIZE_CLASS_SHIFT 14 +// limit on the number of cached empty slabs before attempting purging instead +static const size_t max_empty_slabs_total = max_slab_size_class * 4; +#else +static const size_t max_slab_size_class = 131072; +#define MAX_SLAB_SIZE_CLASS_SHIFT 17 +// limit on the number of cached empty slabs before attempting purging instead +static const size_t max_empty_slabs_total = max_slab_size_class; +#endif + +#if SLAB_QUARANTINE && CONFIG_EXTENDED_SIZE_CLASSES +static const size_t min_extended_size_class = 20480; +#endif + +static const u32 size_classes[] = { + /* 0 */ 0, + /* 16 */ 16, 32, 48, 64, 80, 96, 112, 128, + /* 32 */ 160, 192, 224, 256, + /* 64 */ 320, 384, 448, 512, + /* 128 */ 640, 768, 896, 1024, + /* 256 */ 1280, 1536, 1792, 2048, + /* 512 */ 2560, 3072, 3584, 4096, + /* 1024 */ 5120, 6144, 7168, 8192, + /* 2048 */ 10240, 12288, 14336, 16384, +#if CONFIG_EXTENDED_SIZE_CLASSES + /* 4096 */ 20480, 24576, 28672, 32768, + /* 8192 */ 40960, 49152, 57344, 65536, + /* 16384 */ 81920, 98304, 114688, 131072, +#endif +}; + +static const u16 size_class_slots[] = { + /* 0 */ 256, + /* 16 */ 256, 128, 85, 64, 51, 42, 36, 64, + /* 32 */ 51, 64, 54, 64, + /* 64 */ 64, 64, 64, 64, + /* 128 */ 64, 64, 64, 64, + /* 256 */ 16, 16, 16, 16, + /* 512 */ 8, 8, 8, 8, + /* 1024 */ 8, 8, 8, 8, + /* 2048 */ 6, 5, 4, 4, +#if CONFIG_EXTENDED_SIZE_CLASSES + /* 4096 */ 1, 1, 1, 1, + /* 8192 */ 1, 1, 1, 1, + /* 16384 */ 1, 1, 1, 1, +#endif +}; + +static size_t get_slots(unsigned class) { + return size_class_slots[class]; +} + +static const char *const size_class_labels[] = { + /* 0 */ "malloc 0", + /* 16 */ "malloc 16", "malloc 32", "malloc 48", "malloc 64", + /* 16 */ "malloc 80", "malloc 96", "malloc 112", "malloc 128", + /* 32 */ "malloc 160", "malloc 192", "malloc 224", "malloc 256", + /* 64 */ "malloc 320", "malloc 384", "malloc 448", "malloc 512", + /* 128 */ "malloc 640", "malloc 768", "malloc 896", "malloc 1024", + /* 256 */ "malloc 1280", "malloc 1536", "malloc 1792", "malloc 2048", + /* 512 */ "malloc 2560", "malloc 3072", "malloc 3584", "malloc 4096", + /* 1024 */ "malloc 5120", "malloc 6144", "malloc 7168", "malloc 8192", + /* 2048 */ "malloc 10240", "malloc 12288", "malloc 14336", "malloc 16384", +#if CONFIG_EXTENDED_SIZE_CLASSES + /* 4096 */ "malloc 20480", "malloc 24576", "malloc 28672", "malloc 32768", + /* 8192 */ "malloc 40960", "malloc 49152", "malloc 57344", "malloc 65536", + /* 16384 */ "malloc 81920", "malloc 98304", "malloc 114688", "malloc 131072", +#endif +}; + +static void label_slab(void *slab, size_t slab_size, unsigned class) { + memory_set_name(slab, slab_size, size_class_labels[class]); +} + +#define N_SIZE_CLASSES (sizeof(size_classes) / sizeof(size_classes[0])) + +struct size_info { + size_t size; + size_t class; +}; + +static inline struct size_info get_size_info(size_t size) { + if (unlikely(size == 0)) { + return (struct size_info){0, 0}; + } + // size <= 64 is needed for correctness and raising it to size <= 128 is an optimization + if (size <= 128) { + return (struct size_info){align(size, 16), ((size - 1) >> 4) + 1}; + } + + static const size_t initial_spacing_multiplier = 5; + static const size_t special_small_sizes = 5; // 0, 16, 32, 48, 64 + + size_t spacing_class_shift = log2u64(size - 1) - 2; + size_t spacing_class = 1ULL << 
spacing_class_shift; + size_t real_size = align(size, spacing_class); + size_t spacing_class_index = (real_size >> spacing_class_shift) - initial_spacing_multiplier; + size_t index = (spacing_class_shift - 4) * 4 + special_small_sizes + spacing_class_index; + return (struct size_info){real_size, index}; +} + +// alignment must be a power of 2 <= PAGE_SIZE since slabs are only page aligned +static inline struct size_info get_size_info_align(size_t size, size_t alignment) { + for (unsigned class = 1; class < N_SIZE_CLASSES; class++) { + size_t real_size = size_classes[class]; + if (size <= real_size && !(real_size & (alignment - 1))) { + return (struct size_info){real_size, class}; + } + } + fatal_error("invalid size for slabs"); +} + +static size_t get_slab_size(size_t slots, size_t size) { + return page_align(slots * size); +} + +struct __attribute__((aligned(CACHELINE_SIZE))) size_class { + struct mutex lock; + + void *class_region_start; + struct slab_metadata *slab_info; + struct libdivide_u32_t size_divisor; + struct libdivide_u64_t slab_size_divisor; + +#if SLAB_QUARANTINE_RANDOM_LENGTH > 0 + void *quarantine_random[SLAB_QUARANTINE_RANDOM_LENGTH << (MAX_SLAB_SIZE_CLASS_SHIFT - MIN_SLAB_SIZE_CLASS_SHIFT)]; +#endif + +#if SLAB_QUARANTINE_QUEUE_LENGTH > 0 + void *quarantine_queue[SLAB_QUARANTINE_QUEUE_LENGTH << (MAX_SLAB_SIZE_CLASS_SHIFT - MIN_SLAB_SIZE_CLASS_SHIFT)]; + size_t quarantine_queue_index; +#endif + + // slabs with at least one allocated slot and at least one free slot + // + // LIFO doubly-linked list + struct slab_metadata *partial_slabs; + + // slabs without allocated slots that are cached for near-term usage + // + // LIFO singly-linked list + struct slab_metadata *empty_slabs; + size_t empty_slabs_total; // length * slab_size + + // slabs without allocated slots that are purged and memory protected + // + // FIFO singly-linked list + struct slab_metadata *free_slabs_head; + struct slab_metadata *free_slabs_tail; + struct slab_metadata *free_slabs_quarantine[FREE_SLABS_QUARANTINE_RANDOM_LENGTH]; + +#if CONFIG_STATS + u64 nmalloc; // may wrap (per jemalloc API) + u64 ndalloc; // may wrap (per jemalloc API) + size_t allocated; + size_t slab_allocated; +#endif + + struct random_state rng; + size_t metadata_allocated; + size_t metadata_count; + size_t metadata_count_unguarded; +}; + +#define CLASS_REGION_SIZE (size_t)CONFIG_CLASS_REGION_SIZE +#define REAL_CLASS_REGION_SIZE (CLASS_REGION_SIZE * 2) +#define ARENA_SIZE (REAL_CLASS_REGION_SIZE * N_SIZE_CLASSES) +static const size_t slab_region_size = ARENA_SIZE * N_ARENA; +static_assert(PAGE_SIZE == 4096, "bitmap handling will need adjustment for other page sizes"); + +static void *get_slab(const struct size_class *c, size_t slab_size, const struct slab_metadata *metadata) { + size_t index = metadata - c->slab_info; + return (char *)c->class_region_start + (index * slab_size); +} + +#define MAX_METADATA_MAX (CLASS_REGION_SIZE / PAGE_SIZE) + +static size_t get_metadata_max(size_t slab_size) { + return CLASS_REGION_SIZE / slab_size; +} + +static struct slab_metadata *alloc_metadata(struct size_class *c, size_t slab_size, bool non_zero_size) { + if (unlikely(c->metadata_count >= c->metadata_allocated)) { + size_t metadata_max = get_metadata_max(slab_size); + if (unlikely(c->metadata_count >= metadata_max)) { + errno = ENOMEM; + return NULL; + } + size_t allocate = max(c->metadata_allocated * 2, PAGE_SIZE / sizeof(struct slab_metadata)); + if (allocate > metadata_max) { + allocate = metadata_max; + } + if 
(unlikely(memory_protect_rw_metadata(c->slab_info, allocate * sizeof(struct slab_metadata)))) { + return NULL; + } + c->metadata_allocated = allocate; + } + + struct slab_metadata *metadata = c->slab_info + c->metadata_count; + void *slab = get_slab(c, slab_size, metadata); + if (non_zero_size && memory_protect_rw(slab, slab_size)) { + return NULL; + } + c->metadata_count++; + c->metadata_count_unguarded++; + if (c->metadata_count_unguarded >= GUARD_SLABS_INTERVAL) { + c->metadata_count++; + c->metadata_count_unguarded = 0; + } + return metadata; +} + +static void set_used_slot(struct slab_metadata *metadata, size_t index) { + size_t bucket = index / U64_WIDTH; + metadata->bitmap[bucket] |= 1UL << (index - bucket * U64_WIDTH); +#ifdef SLAB_METADATA_COUNT + metadata->count++; +#endif +} + +static void clear_used_slot(struct slab_metadata *metadata, size_t index) { + size_t bucket = index / U64_WIDTH; + metadata->bitmap[bucket] &= ~(1UL << (index - bucket * U64_WIDTH)); +#ifdef SLAB_METADATA_COUNT + metadata->count--; +#endif +} + +static bool is_used_slot(const struct slab_metadata *metadata, size_t index) { + size_t bucket = index / U64_WIDTH; + return (metadata->bitmap[bucket] >> (index - bucket * U64_WIDTH)) & 1UL; +} + +#if SLAB_QUARANTINE +static void set_quarantine_slot(struct slab_metadata *metadata, size_t index) { + size_t bucket = index / U64_WIDTH; + metadata->quarantine_bitmap[bucket] |= 1UL << (index - bucket * U64_WIDTH); +} + +static void clear_quarantine_slot(struct slab_metadata *metadata, size_t index) { + size_t bucket = index / U64_WIDTH; + metadata->quarantine_bitmap[bucket] &= ~(1UL << (index - bucket * U64_WIDTH)); +} + +static bool is_quarantine_slot(const struct slab_metadata *metadata, size_t index) { + size_t bucket = index / U64_WIDTH; + return (metadata->quarantine_bitmap[bucket] >> (index - bucket * U64_WIDTH)) & 1UL; +} +#endif + +static u64 get_mask(size_t slots) { + return slots < U64_WIDTH ? ~0UL << slots : 0; +} + +static size_t get_free_slot(struct random_state *rng, size_t slots, const struct slab_metadata *metadata) { + if (SLOT_RANDOMIZE) { + // randomize start location for linear search (uniform random choice is too slow) + size_t random_index = get_random_u16_uniform(rng, slots); + size_t first_bitmap = random_index / U64_WIDTH; + u64 random_split = ~(~0UL << (random_index - first_bitmap * U64_WIDTH)); + + size_t i = first_bitmap; + u64 masked = metadata->bitmap[i]; + masked |= random_split; + for (;;) { + if (i == slots / U64_WIDTH) { + masked |= get_mask(slots - i * U64_WIDTH); + } + + if (masked != ~0UL) { + return ffz64(masked) - 1 + i * U64_WIDTH; + } + + i = i == (slots - 1) / U64_WIDTH ? 
0 : i + 1; + masked = metadata->bitmap[i]; + } + } else { + for (size_t i = 0; i <= (slots - 1) / U64_WIDTH; i++) { + u64 masked = metadata->bitmap[i]; + if (i == (slots - 1) / U64_WIDTH) { + masked |= get_mask(slots - i * U64_WIDTH); + } + + if (masked != ~0UL) { + return ffz64(masked) - 1 + i * U64_WIDTH; + } + } + } + + fatal_error("no zero bits"); +} + +static bool has_free_slots(size_t slots, const struct slab_metadata *metadata) { +#ifdef SLAB_METADATA_COUNT + return metadata->count < slots; +#else + if (slots <= U64_WIDTH) { + u64 masked = metadata->bitmap[0] | get_mask(slots); + return masked != ~0UL; + } + if (slots <= U64_WIDTH * 2) { + u64 masked = metadata->bitmap[1] | get_mask(slots - U64_WIDTH); + return metadata->bitmap[0] != ~0UL || masked != ~0UL; + } + if (slots <= U64_WIDTH * 3) { + u64 masked = metadata->bitmap[2] | get_mask(slots - U64_WIDTH * 2); + return metadata->bitmap[0] != ~0UL || metadata->bitmap[1] != ~0UL || masked != ~0UL; + } + u64 masked = metadata->bitmap[3] | get_mask(slots - U64_WIDTH * 3); + return metadata->bitmap[0] != ~0UL || metadata->bitmap[1] != ~0UL || metadata->bitmap[2] != ~0UL || masked != ~0UL; +#endif +} + +static bool is_free_slab(const struct slab_metadata *metadata) { +#ifdef SLAB_METADATA_COUNT + return !metadata->count; +#else + return !metadata->bitmap[0] && !metadata->bitmap[1] && !metadata->bitmap[2] && + !metadata->bitmap[3]; +#endif +} + +static struct slab_metadata *get_metadata(const struct size_class *c, const void *p) { + size_t offset = (const char *)p - (const char *)c->class_region_start; + size_t index = libdivide_u64_do(offset, &c->slab_size_divisor); + // still caught without this check either as a read access violation or "double free" + if (unlikely(index >= c->metadata_allocated)) { + fatal_error("invalid free within a slab yet to be used"); + } + return c->slab_info + index; +} + +static void *slot_pointer(size_t size, void *slab, size_t slot) { + return (char *)slab + slot * size; +} + +static void write_after_free_check(const char *p, size_t size) { + if (!WRITE_AFTER_FREE_CHECK) { + return; + } + +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + return; + } +#endif + + for (size_t i = 0; i < size; i += sizeof(u64)) { + if (unlikely(*(const u64 *)(const void *)(p + i))) { + fatal_error("detected write after free"); + } + } +} + +static void set_slab_canary_value(UNUSED struct slab_metadata *metadata, UNUSED struct random_state *rng) { +#if SLAB_CANARY + static const u64 canary_mask = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ? + 0xffffffffffffff00UL : + 0x00ffffffffffffffUL; + + metadata->canary_value = get_random_u64(rng) & canary_mask; +#ifdef HAS_ARM_MTE + if (unlikely(metadata->canary_value == 0)) { + // 0 is reserved to support disabling MTE at runtime (this is required on Android). + // When MTE is enabled, writing and reading of canaries is disabled, i.e. canary remains zeroed. + // After MTE is disabled, canaries that are set to 0 are ignored, since they wouldn't match + // slab's metadata->canary_value. 
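On the canary_mask used above: it zeroes whichever canary byte ends up at the lowest address (the low byte on little-endian, the high byte on big-endian), so an out-of-bounds C string operation overflowing into the canary runs into a zero byte immediately. A standalone illustration (editor's sketch):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
        /* same little-endian mask as above, applied to an arbitrary value */
        uint64_t canary = 0x1122334455667788ULL & 0xffffffffffffff00ULL;
        unsigned char bytes[sizeof(canary)];
        memcpy(bytes, &canary, sizeof(bytes));
        /* on a little-endian target the byte at the lowest address is the masked-off low byte */
        printf("first canary byte in memory: 0x%02x\n", bytes[0]);   /* prints 0x00 */
        return 0;
    }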
+ metadata->canary_value = 0x100; // 0x100 was chosen as the smallest acceptable value + } +#endif +#endif +} + +static void set_canary(UNUSED const struct slab_metadata *metadata, UNUSED void *p, UNUSED size_t size) { +#if SLAB_CANARY +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + return; + } +#endif + + memcpy((char *)p + size - canary_size, &metadata->canary_value, canary_size); +#endif +} + +static void check_canary(UNUSED const struct slab_metadata *metadata, UNUSED const void *p, UNUSED size_t size) { +#if SLAB_CANARY +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + return; + } +#endif + + u64 canary_value; + memcpy(&canary_value, (const char *)p + size - canary_size, canary_size); + +#ifdef HAS_ARM_MTE + if (unlikely(canary_value == 0)) { + return; + } +#endif + + if (unlikely(canary_value != metadata->canary_value)) { + fatal_error("canary corrupted"); + } +#endif +} + +static inline void stats_small_allocate(UNUSED struct size_class *c, UNUSED size_t size) { +#if CONFIG_STATS + c->allocated += size; + c->nmalloc++; +#endif +} + +static inline void stats_small_deallocate(UNUSED struct size_class *c, UNUSED size_t size) { +#if CONFIG_STATS + c->allocated -= size; + c->ndalloc++; +#endif +} + +static inline void stats_slab_allocate(UNUSED struct size_class *c, UNUSED size_t slab_size) { +#if CONFIG_STATS + c->slab_allocated += slab_size; +#endif +} + +static inline void stats_slab_deallocate(UNUSED struct size_class *c, UNUSED size_t slab_size) { +#if CONFIG_STATS + c->slab_allocated -= slab_size; +#endif +} + +#ifdef HAS_ARM_MTE +static void *tag_and_clear_slab_slot(struct slab_metadata *metadata, void *slot_ptr, size_t slot_idx, size_t slot_size) { + // arm_mte_tags is an array of 4-bit unsigned integers stored as u8 array (MTE tags are 4-bit wide) + // + // It stores the most recent tag for each slab slot, or 0 if the slot was never used. + // Slab indices in arm_mte_tags array are shifted to the right by 1, and size of this array + // is (MAX_SLAB_SLOT_COUNT + 2). This means that first and last values of arm_mte_tags array + // are always 0, which allows to handle edge slots in a branchless way when tag exclusion mask + // is constructed. + u8 *slot_tags = metadata->arm_mte_tags; + + // Tag exclusion mask. 0 tag is always excluded to detect accesses to slab memory via untagged + // pointers. Moreover, 0 tag is excluded in bionic via PR_MTE_TAG_MASK prctl + u64 tem = (1 << 0) | (1 << RESERVED_TAG); + + // current or previous tag of left neighbor or 0 if there's no left neighbor or if it was never used + tem |= (1 << u4_arr_get(slot_tags, slot_idx)); + // previous tag of this slot or 0 if it was never used + tem |= (1 << u4_arr_get(slot_tags, slot_idx + 1)); + // current or previous tag of right neighbor or 0 if there's no right neighbor or if it was never used + tem |= (1 << u4_arr_get(slot_tags, slot_idx + 2)); + + void *tagged_ptr = arm_mte_create_random_tag(slot_ptr, tem); + // slot addresses and sizes are always aligned by 16 + arm_mte_tag_and_clear_mem(tagged_ptr, slot_size); + + // store new tag of this slot + u4_arr_set(slot_tags, slot_idx + 1, get_pointer_tag(tagged_ptr)); + + return tagged_ptr; +} +#endif + +static inline void *allocate_small(unsigned arena, size_t requested_size) { + struct size_info info = get_size_info(requested_size); + size_t size = likely(info.size) ? 
info.size : 16; + + struct size_class *c = &ro.size_class_metadata[arena][info.class]; + size_t slots = get_slots(info.class); + size_t slab_size = get_slab_size(slots, size); + + mutex_lock(&c->lock); + + if (c->partial_slabs == NULL) { + if (c->empty_slabs != NULL) { + struct slab_metadata *metadata = c->empty_slabs; + c->empty_slabs = c->empty_slabs->next; + c->empty_slabs_total -= slab_size; + + metadata->next = NULL; + metadata->prev = NULL; + + c->partial_slabs = slots > 1 ? metadata : NULL; + + void *slab = get_slab(c, slab_size, metadata); + size_t slot = get_free_slot(&c->rng, slots, metadata); + set_used_slot(metadata, slot); + void *p = slot_pointer(size, slab, slot); + if (requested_size) { + write_after_free_check(p, size - canary_size); + set_canary(metadata, p, size); +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + p = tag_and_clear_slab_slot(metadata, p, slot, size); + } +#endif + } + stats_small_allocate(c, size); + + mutex_unlock(&c->lock); + return p; + } + + if (c->free_slabs_head != NULL) { + struct slab_metadata *metadata = c->free_slabs_head; + set_slab_canary_value(metadata, &c->rng); + + void *slab = get_slab(c, slab_size, metadata); + if (requested_size && memory_protect_rw(slab, slab_size)) { + mutex_unlock(&c->lock); + return NULL; + } + + c->free_slabs_head = c->free_slabs_head->next; + if (c->free_slabs_head == NULL) { + c->free_slabs_tail = NULL; + } + + metadata->next = NULL; + metadata->prev = NULL; + + c->partial_slabs = slots > 1 ? metadata : NULL; + + size_t slot = get_free_slot(&c->rng, slots, metadata); + set_used_slot(metadata, slot); + void *p = slot_pointer(size, slab, slot); + if (requested_size) { + set_canary(metadata, p, size); +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + p = tag_and_clear_slab_slot(metadata, p, slot, size); + } +#endif + } + stats_slab_allocate(c, slab_size); + stats_small_allocate(c, size); + + mutex_unlock(&c->lock); + return p; + } + + struct slab_metadata *metadata = alloc_metadata(c, slab_size, requested_size); + if (unlikely(metadata == NULL)) { + mutex_unlock(&c->lock); + return NULL; + } + set_slab_canary_value(metadata, &c->rng); + + c->partial_slabs = slots > 1 ? 
metadata : NULL; + void *slab = get_slab(c, slab_size, metadata); + size_t slot = get_free_slot(&c->rng, slots, metadata); + set_used_slot(metadata, slot); + void *p = slot_pointer(size, slab, slot); + if (requested_size) { + set_canary(metadata, p, size); +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + p = tag_and_clear_slab_slot(metadata, p, slot, size); + } +#endif + } + stats_slab_allocate(c, slab_size); + stats_small_allocate(c, size); + + mutex_unlock(&c->lock); + return p; + } + + struct slab_metadata *metadata = c->partial_slabs; + size_t slot = get_free_slot(&c->rng, slots, metadata); + set_used_slot(metadata, slot); + + if (!has_free_slots(slots, metadata)) { + c->partial_slabs = c->partial_slabs->next; + if (c->partial_slabs) { + c->partial_slabs->prev = NULL; + } + } + + void *slab = get_slab(c, slab_size, metadata); + void *p = slot_pointer(size, slab, slot); + if (requested_size) { + write_after_free_check(p, size - canary_size); + set_canary(metadata, p, size); +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + p = tag_and_clear_slab_slot(metadata, p, slot, size); + } +#endif + } + stats_small_allocate(c, size); + + mutex_unlock(&c->lock); + return p; +} + +struct slab_size_class_info { + unsigned arena; + size_t class; +}; + +static struct slab_size_class_info slab_size_class(const void *p) { + size_t offset = (const char *)p - (const char *)ro.slab_region_start; + unsigned arena = 0; + if (N_ARENA > 1) { + arena = offset / ARENA_SIZE; + offset -= arena * ARENA_SIZE; + } + return (struct slab_size_class_info){arena, offset / REAL_CLASS_REGION_SIZE}; +} + +static size_t slab_usable_size(const void *p) { + return size_classes[slab_size_class(p).class]; +} + +static void enqueue_free_slab(struct size_class *c, struct slab_metadata *metadata) { + metadata->next = NULL; + + static_assert(FREE_SLABS_QUARANTINE_RANDOM_LENGTH < (u16)-1, "free slabs quarantine too large"); + size_t index = get_random_u16_uniform(&c->rng, FREE_SLABS_QUARANTINE_RANDOM_LENGTH); + struct slab_metadata *substitute = c->free_slabs_quarantine[index]; + c->free_slabs_quarantine[index] = metadata; + + if (substitute == NULL) { + return; + } + + if (c->free_slabs_tail != NULL) { + c->free_slabs_tail->next = substitute; + } else { + c->free_slabs_head = substitute; + } + c->free_slabs_tail = substitute; +} + +// preserves errno +static inline void deallocate_small(void *p, const size_t *expected_size) { + struct slab_size_class_info size_class_info = slab_size_class(p); + size_t class = size_class_info.class; + + struct size_class *c = &ro.size_class_metadata[size_class_info.arena][class]; + size_t size = size_classes[class]; + if (expected_size && unlikely(size != *expected_size)) { + fatal_error("sized deallocation mismatch (small)"); + } + bool is_zero_size = size == 0; + if (unlikely(is_zero_size)) { + size = 16; + } + size_t slots = get_slots(class); + size_t slab_size = get_slab_size(slots, size); + + mutex_lock(&c->lock); + + stats_small_deallocate(c, size); + + struct slab_metadata *metadata = get_metadata(c, p); + void *slab = get_slab(c, slab_size, metadata); + size_t slot = libdivide_u32_do((char *)p - (char *)slab, &c->size_divisor); + + if (unlikely(slot_pointer(size, slab, slot) != p)) { + fatal_error("invalid unaligned free"); + } + + if (unlikely(!is_used_slot(metadata, slot))) { + fatal_error("double free"); + } + + if (likely(!is_zero_size)) { + check_canary(metadata, p, size); + + bool skip_zero = false; +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + 
arm_mte_tag_and_clear_mem(set_pointer_tag(p, RESERVED_TAG), size); + // metadata->arm_mte_tags is intentionally not updated, see tag_and_clear_slab_slot() + skip_zero = true; + } +#endif + + if (ZERO_ON_FREE && !skip_zero) { + memset(p, 0, size - canary_size); + } + } + +#if SLAB_QUARANTINE + if (unlikely(is_quarantine_slot(metadata, slot))) { + fatal_error("double free (quarantine)"); + } + + set_quarantine_slot(metadata, slot); + + size_t quarantine_shift = clz64(size) - (63 - MAX_SLAB_SIZE_CLASS_SHIFT); + +#if SLAB_QUARANTINE_RANDOM_LENGTH > 0 + size_t slab_quarantine_random_length = SLAB_QUARANTINE_RANDOM_LENGTH << quarantine_shift; + + size_t random_index = get_random_u16_uniform(&c->rng, slab_quarantine_random_length); + void *random_substitute = c->quarantine_random[random_index]; + c->quarantine_random[random_index] = p; + + if (random_substitute == NULL) { + mutex_unlock(&c->lock); + return; + } + + p = random_substitute; +#endif + +#if SLAB_QUARANTINE_QUEUE_LENGTH > 0 + size_t slab_quarantine_queue_length = SLAB_QUARANTINE_QUEUE_LENGTH << quarantine_shift; + + void *queue_substitute = c->quarantine_queue[c->quarantine_queue_index]; + c->quarantine_queue[c->quarantine_queue_index] = p; + c->quarantine_queue_index = (c->quarantine_queue_index + 1) % slab_quarantine_queue_length; + + if (queue_substitute == NULL) { + mutex_unlock(&c->lock); + return; + } + + p = queue_substitute; +#endif + + metadata = get_metadata(c, p); + slab = get_slab(c, slab_size, metadata); + slot = libdivide_u32_do((char *)p - (char *)slab, &c->size_divisor); + + clear_quarantine_slot(metadata, slot); +#endif + + // triggered even for slots == 1 and then undone below + if (!has_free_slots(slots, metadata)) { + metadata->next = c->partial_slabs; + metadata->prev = NULL; + + if (c->partial_slabs) { + c->partial_slabs->prev = metadata; + } + c->partial_slabs = metadata; + } + + clear_used_slot(metadata, slot); + + if (is_free_slab(metadata)) { + if (metadata->prev) { + metadata->prev->next = metadata->next; + } else { + c->partial_slabs = metadata->next; + } + if (metadata->next) { + metadata->next->prev = metadata->prev; + } + + metadata->prev = NULL; + + if (c->empty_slabs_total + slab_size > max_empty_slabs_total) { + int saved_errno = errno; + if (!memory_map_fixed(slab, slab_size)) { + label_slab(slab, slab_size, class); + stats_slab_deallocate(c, slab_size); + enqueue_free_slab(c, metadata); + mutex_unlock(&c->lock); + return; + } + memory_purge(slab, slab_size); + errno = saved_errno; + // handle out-of-memory by putting it into the empty slabs list + } + + metadata->next = c->empty_slabs; + c->empty_slabs = metadata; + c->empty_slabs_total += slab_size; + } + + mutex_unlock(&c->lock); +} + +struct region_metadata { + void *p; + size_t size; + size_t guard_size; +}; + +struct quarantine_info { + void *p; + size_t size; +}; + +#define INITIAL_REGION_TABLE_SIZE 128 +#define MAX_REGION_TABLE_SIZE (CLASS_REGION_SIZE / PAGE_SIZE / sizeof(struct region_metadata)) + +struct region_allocator { + struct mutex lock; + struct region_metadata *regions; + size_t total; + size_t free; +#if CONFIG_STATS + size_t allocated; +#endif +#if REGION_QUARANTINE_RANDOM_LENGTH + struct quarantine_info quarantine_random[REGION_QUARANTINE_RANDOM_LENGTH]; +#endif +#if REGION_QUARANTINE_QUEUE_LENGTH + struct quarantine_info quarantine_queue[REGION_QUARANTINE_QUEUE_LENGTH]; + size_t quarantine_queue_index; +#endif + struct random_state rng; +}; + +static inline void stats_large_allocate(UNUSED struct region_allocator *ra, UNUSED 
size_t size) { +#if CONFIG_STATS + ra->allocated += size; +#endif +} + +static inline void stats_large_deallocate(UNUSED struct region_allocator *ra, UNUSED size_t size) { +#if CONFIG_STATS + ra->allocated -= size; +#endif +} + +struct __attribute__((aligned(PAGE_SIZE))) slab_info_mapping { + struct slab_metadata slab_info[MAX_METADATA_MAX]; +}; + +struct __attribute__((aligned(PAGE_SIZE))) allocator_state { + struct size_class size_class_metadata[N_ARENA][N_SIZE_CLASSES]; + struct region_allocator region_allocator; + // padding until next page boundary for mprotect + struct region_metadata regions_a[MAX_REGION_TABLE_SIZE] __attribute__((aligned(PAGE_SIZE))); + // padding until next page boundary for mprotect + struct region_metadata regions_b[MAX_REGION_TABLE_SIZE] __attribute__((aligned(PAGE_SIZE))); + // padding until next page boundary for mprotect + struct slab_info_mapping slab_info_mapping[N_ARENA][N_SIZE_CLASSES]; + // padding until next page boundary for mprotect +}; + +static void regions_quarantine_deallocate_pages(void *p, size_t size, size_t guard_size) { + if (!REGION_QUARANTINE || size >= REGION_QUARANTINE_SKIP_THRESHOLD) { + deallocate_pages(p, size, guard_size); + return; + } + + if (unlikely(memory_map_fixed(p, size))) { + memory_purge(p, size); + } else { + memory_set_name(p, size, "malloc large quarantine"); + } + + struct quarantine_info target = + (struct quarantine_info){(char *)p - guard_size, size + guard_size * 2}; + + struct region_allocator *ra = ro.region_allocator; + + mutex_lock(&ra->lock); + +#if REGION_QUARANTINE_RANDOM_LENGTH + size_t index = get_random_u64_uniform(&ra->rng, REGION_QUARANTINE_RANDOM_LENGTH); + struct quarantine_info random_substitute = ra->quarantine_random[index]; + ra->quarantine_random[index] = target; + if (random_substitute.p == NULL) { + mutex_unlock(&ra->lock); + return; + } + target = random_substitute; +#endif + +#if REGION_QUARANTINE_QUEUE_LENGTH + struct quarantine_info queue_substitute = ra->quarantine_queue[ra->quarantine_queue_index]; + ra->quarantine_queue[ra->quarantine_queue_index] = target; + ra->quarantine_queue_index = (ra->quarantine_queue_index + 1) % REGION_QUARANTINE_QUEUE_LENGTH; + target = queue_substitute; +#endif + + mutex_unlock(&ra->lock); + + if (target.p != NULL) { + memory_unmap(target.p, target.size); + } +} + +static int regions_grow(void) { + struct region_allocator *ra = ro.region_allocator; + + if (ra->total > SIZE_MAX / sizeof(struct region_metadata) / 2) { + return 1; + } + + size_t newtotal = ra->total * 2; + size_t newsize = newtotal * sizeof(struct region_metadata); + size_t mask = newtotal - 1; + + if (newtotal > MAX_REGION_TABLE_SIZE) { + return 1; + } + + struct region_metadata *p = ra->regions == ro.regions[0] ? 
+ ro.regions[1] : ro.regions[0]; + + if (memory_protect_rw_metadata(p, newsize)) { + return 1; + } + + for (size_t i = 0; i < ra->total; i++) { + const void *q = ra->regions[i].p; + if (q != NULL) { + size_t index = hash_page(q) & mask; + while (p[index].p != NULL) { + index = (index - 1) & mask; + } + p[index] = ra->regions[i]; + } + } + + memory_map_fixed(ra->regions, ra->total * sizeof(struct region_metadata)); + memory_set_name(ra->regions, ra->total * sizeof(struct region_metadata), "malloc allocator_state"); + ra->free = ra->free + ra->total; + ra->total = newtotal; + ra->regions = p; + return 0; +} + +static int regions_insert(void *p, size_t size, size_t guard_size) { + struct region_allocator *ra = ro.region_allocator; + + if (ra->free * 4 < ra->total) { + if (regions_grow()) { + return 1; + } + } + + size_t mask = ra->total - 1; + size_t index = hash_page(p) & mask; + void *q = ra->regions[index].p; + while (q != NULL) { + index = (index - 1) & mask; + q = ra->regions[index].p; + } + ra->regions[index].p = p; + ra->regions[index].size = size; + ra->regions[index].guard_size = guard_size; + ra->free--; + return 0; +} + +static struct region_metadata *regions_find(const void *p) { + const struct region_allocator *ra = ro.region_allocator; + + size_t mask = ra->total - 1; + size_t index = hash_page(p) & mask; + void *r = ra->regions[index].p; + while (r != p && r != NULL) { + index = (index - 1) & mask; + r = ra->regions[index].p; + } + return (r == p && r != NULL) ? &ra->regions[index] : NULL; +} + +static void regions_delete(const struct region_metadata *region) { + struct region_allocator *ra = ro.region_allocator; + + size_t mask = ra->total - 1; + + ra->free++; + + size_t i = region - ra->regions; + for (;;) { + ra->regions[i].p = NULL; + ra->regions[i].size = 0; + size_t j = i; + for (;;) { + i = (i - 1) & mask; + if (ra->regions[i].p == NULL) { + return; + } + size_t r = hash_page(ra->regions[i].p) & mask; + if ((i <= r && r < j) || (r < j && j < i) || (j < i && i <= r)) { + continue; + } + ra->regions[j] = ra->regions[i]; + break; + } + } +} + +int get_metadata_key(void) { +#ifdef USE_PKEY + return ro.metadata_pkey; +#else + return -1; +#endif +} + +static inline void thread_set_metadata_access(UNUSED unsigned access) { +#ifdef USE_PKEY + if (ro.metadata_pkey == -1) { + return; + } + pkey_set(ro.metadata_pkey, access); +#endif +} + +static inline void thread_unseal_metadata(void) { + thread_set_metadata_access(0); +} + +static inline void thread_seal_metadata(void) { +#ifdef USE_PKEY + thread_set_metadata_access(PKEY_DISABLE_ACCESS); +#endif +} + +static void full_lock(void) { + thread_unseal_metadata(); + mutex_lock(&ro.region_allocator->lock); + for (unsigned arena = 0; arena < N_ARENA; arena++) { + for (unsigned class = 0; class < N_SIZE_CLASSES; class++) { + mutex_lock(&ro.size_class_metadata[arena][class].lock); + } + } + thread_seal_metadata(); +} + +static void full_unlock(void) { + thread_unseal_metadata(); + mutex_unlock(&ro.region_allocator->lock); + for (unsigned arena = 0; arena < N_ARENA; arena++) { + for (unsigned class = 0; class < N_SIZE_CLASSES; class++) { + mutex_unlock(&ro.size_class_metadata[arena][class].lock); + } + } + thread_seal_metadata(); +} + +static void post_fork_child(void) { + thread_unseal_metadata(); + + mutex_init(&ro.region_allocator->lock); + random_state_init(&ro.region_allocator->rng); + for (unsigned arena = 0; arena < N_ARENA; arena++) { + for (unsigned class = 0; class < N_SIZE_CLASSES; class++) { + struct size_class *c = 
&ro.size_class_metadata[arena][class]; + mutex_init(&c->lock); + random_state_init(&c->rng); + } + } + thread_seal_metadata(); +} + +static inline bool is_init(void) { + return get_slab_region_end() != NULL; +} + +static inline void enforce_init(void) { + if (unlikely(!is_init())) { + fatal_error("invalid uninitialized allocator usage"); + } +} + +static struct mutex init_lock = MUTEX_INITIALIZER; + +COLD static void init_slow_path(void) { + + mutex_lock(&init_lock); + + if (unlikely(is_init())) { + mutex_unlock(&init_lock); + return; + } + +#ifdef USE_PKEY + ro.metadata_pkey = pkey_alloc(0, 0); +#endif + + if (unlikely(sysconf(_SC_PAGESIZE) != PAGE_SIZE)) { + fatal_error("runtime page size does not match compile-time page size which is not supported"); + } + + struct random_state *rng = allocate_pages(sizeof(struct random_state), PAGE_SIZE, true, "malloc init rng"); + if (unlikely(rng == NULL)) { + fatal_error("failed to allocate init rng"); + } + random_state_init(rng); + + size_t metadata_guard_size = + (get_random_u64_uniform(rng, REAL_CLASS_REGION_SIZE / PAGE_SIZE) + 1) * PAGE_SIZE; + + struct allocator_state *allocator_state = + allocate_pages(sizeof(struct allocator_state), metadata_guard_size, false, "malloc allocator_state"); + if (unlikely(allocator_state == NULL)) { + fatal_error("failed to reserve allocator state"); + } + if (unlikely(memory_protect_rw_metadata(allocator_state, offsetof(struct allocator_state, regions_a)))) { + fatal_error("failed to unprotect allocator state"); + } + + ro.region_allocator = &allocator_state->region_allocator; + struct region_allocator *ra = ro.region_allocator; + + mutex_init(&ra->lock); + random_state_init_from_random_state(&ra->rng, rng); + ro.regions[0] = allocator_state->regions_a; + ro.regions[1] = allocator_state->regions_b; + ra->regions = ro.regions[0]; + ra->total = INITIAL_REGION_TABLE_SIZE; + ra->free = INITIAL_REGION_TABLE_SIZE; + if (unlikely(memory_protect_rw_metadata(ra->regions, ra->total * sizeof(struct region_metadata)))) { + fatal_error("failed to unprotect memory for regions table"); + } +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + ro.slab_region_start = memory_map_mte(slab_region_size); + } else { + ro.slab_region_start = memory_map(slab_region_size); + } +#else + ro.slab_region_start = memory_map(slab_region_size); +#endif + if (unlikely(ro.slab_region_start == NULL)) { + fatal_error("failed to allocate slab region"); + } + void *slab_region_end = (char *)ro.slab_region_start + slab_region_size; + memory_set_name(ro.slab_region_start, slab_region_size, "malloc slab region gap"); + + for (unsigned arena = 0; arena < N_ARENA; arena++) { + ro.size_class_metadata[arena] = allocator_state->size_class_metadata[arena]; + for (unsigned class = 0; class < N_SIZE_CLASSES; class++) { + struct size_class *c = &ro.size_class_metadata[arena][class]; + + mutex_init(&c->lock); + random_state_init_from_random_state(&c->rng, rng); + + size_t bound = (REAL_CLASS_REGION_SIZE - CLASS_REGION_SIZE) / PAGE_SIZE - 1; + size_t gap = (get_random_u64_uniform(rng, bound) + 1) * PAGE_SIZE; + c->class_region_start = (char *)ro.slab_region_start + ARENA_SIZE * arena + REAL_CLASS_REGION_SIZE * class + gap; + label_slab(c->class_region_start, CLASS_REGION_SIZE, class); + + size_t size = size_classes[class]; + if (size == 0) { + size = 16; + } + c->size_divisor = libdivide_u32_gen(size); + size_t slab_size = get_slab_size(get_slots(class), size); + c->slab_size_divisor = libdivide_u64_gen(slab_size); + c->slab_info = 
allocator_state->slab_info_mapping[arena][class].slab_info; + } + } + + deallocate_pages(rng, sizeof(struct random_state), PAGE_SIZE); + + atomic_store_explicit(&ro.slab_region_end, slab_region_end, memory_order_release); + + if (unlikely(memory_protect_ro(&ro, sizeof(ro)))) { + fatal_error("failed to protect allocator data"); + } + memory_set_name(&ro, sizeof(ro), "malloc read-only after init"); + + mutex_unlock(&init_lock); + + // may allocate, so wait until the allocator is initialized to avoid deadlocking + if (unlikely(pthread_atfork(full_lock, full_unlock, post_fork_child))) { + fatal_error("pthread_atfork failed"); + } +} + +static inline unsigned init(void) { + unsigned arena = thread_arena; +#if N_ARENA > 1 + if (likely(arena < N_ARENA)) { + return arena; + } + thread_arena = arena = thread_arena_counter++ % N_ARENA; +#endif + if (unlikely(!is_init())) { + init_slow_path(); + } + return arena; +} + +#if CONFIG_SELF_INIT +// trigger early initialization to set up pthread_atfork and protect state as soon as possible +COLD __attribute__((constructor(101))) static void trigger_early_init(void) { + // avoid calling init directly to skip it if this isn't the malloc implementation + h_free(h_malloc(16)); +} +#endif + +// Returns 0 on overflow. +static size_t get_large_size_class(size_t size) { + if (CONFIG_LARGE_SIZE_CLASSES) { + // Continue small size class growth pattern of power of 2 spacing classes: + // + // 4 KiB [20 KiB, 24 KiB, 28 KiB, 32 KiB] + // 8 KiB [40 KiB, 48 KiB, 54 KiB, 64 KiB] + // 16 KiB [80 KiB, 96 KiB, 112 KiB, 128 KiB] + // 32 KiB [160 KiB, 192 KiB, 224 KiB, 256 KiB] + // 512 KiB [2560 KiB, 3 MiB, 3584 KiB, 4 MiB] + // 1 MiB [5 MiB, 6 MiB, 7 MiB, 8 MiB] + // etc. + return get_size_info(max(size, (size_t)PAGE_SIZE)).size; + } + return page_align(size); +} + +static size_t get_guard_size(struct random_state *state, size_t size) { + return (get_random_u64_uniform(state, size / PAGE_SIZE / GUARD_SIZE_DIVISOR) + 1) * PAGE_SIZE; +} + +static void *allocate_large(size_t size) { + size = get_large_size_class(size); + if (unlikely(!size)) { + errno = ENOMEM; + return NULL; + } + + struct region_allocator *ra = ro.region_allocator; + + mutex_lock(&ra->lock); + size_t guard_size = get_guard_size(&ra->rng, size); + mutex_unlock(&ra->lock); + + void *p = allocate_pages(size, guard_size, true, "malloc large"); + if (p == NULL) { + return NULL; + } + + mutex_lock(&ra->lock); + if (unlikely(regions_insert(p, size, guard_size))) { + mutex_unlock(&ra->lock); + deallocate_pages(p, size, guard_size); + return NULL; + } + stats_large_allocate(ra, size); + mutex_unlock(&ra->lock); + + return p; +} + +static inline void *allocate(unsigned arena, size_t size) { + return size <= max_slab_size_class ? 
allocate_small(arena, size) : allocate_large(size); +} + +static void deallocate_large(void *p, const size_t *expected_size) { + enforce_init(); + thread_unseal_metadata(); + + struct region_allocator *ra = ro.region_allocator; + + mutex_lock(&ra->lock); + const struct region_metadata *region = regions_find(p); + if (unlikely(region == NULL)) { + fatal_error("invalid free"); + } + size_t size = region->size; + if (expected_size && unlikely(size != get_large_size_class(*expected_size))) { + fatal_error("sized deallocation mismatch (large)"); + } + size_t guard_size = region->guard_size; + regions_delete(region); + stats_large_deallocate(ra, size); + mutex_unlock(&ra->lock); + + regions_quarantine_deallocate_pages(p, size, guard_size); +} + +static int allocate_aligned(unsigned arena, void **memptr, size_t alignment, size_t size, size_t min_alignment) { + if ((alignment - 1) & alignment || alignment < min_alignment) { + return EINVAL; + } + + if (alignment <= PAGE_SIZE) { + if (size <= max_slab_size_class && alignment > min_align) { + size = get_size_info_align(size, alignment).size; + } + + void *p = allocate(arena, size); + if (unlikely(p == NULL)) { + return ENOMEM; + } + *memptr = p; + return 0; + } + + size = get_large_size_class(size); + if (unlikely(!size)) { + return ENOMEM; + } + + struct region_allocator *ra = ro.region_allocator; + + mutex_lock(&ra->lock); + size_t guard_size = get_guard_size(&ra->rng, size); + mutex_unlock(&ra->lock); + + void *p = allocate_pages_aligned(size, alignment, guard_size, "malloc large"); + if (unlikely(p == NULL)) { + return ENOMEM; + } + + mutex_lock(&ra->lock); + if (unlikely(regions_insert(p, size, guard_size))) { + mutex_unlock(&ra->lock); + deallocate_pages(p, size, guard_size); + return ENOMEM; + } + mutex_unlock(&ra->lock); + + *memptr = p; + return 0; +} + +static size_t adjust_size_for_canary(size_t size) { + if (size > 0 && size <= max_slab_size_class) { + return size + canary_size; + } + return size; +} + +static int alloc_aligned(void **memptr, size_t alignment, size_t size, size_t min_alignment) { + unsigned arena = init(); + thread_unseal_metadata(); + size = adjust_size_for_canary(size); + int ret = allocate_aligned(arena, memptr, alignment, size, min_alignment); + thread_seal_metadata(); + return ret; +} + +static void *alloc_aligned_simple(size_t alignment, size_t size) { + void *ptr; + int ret = alloc_aligned(&ptr, alignment, size, 1); + if (unlikely(ret)) { + errno = ret; + return NULL; + } + return ptr; +} + +static inline void *alloc(size_t size) { + unsigned arena = init(); + thread_unseal_metadata(); + void *p = allocate(arena, size); + thread_seal_metadata(); + return p; +} + +EXPORT void *h_malloc(size_t size) { + size = adjust_size_for_canary(size); + return alloc(size); +} + +EXPORT void *h_calloc(size_t nmemb, size_t size) { + size_t total_size; + if (unlikely(__builtin_mul_overflow(nmemb, size, &total_size))) { + errno = ENOMEM; + return NULL; + } + total_size = adjust_size_for_canary(total_size); + void *p = alloc(total_size); + if (!ZERO_ON_FREE && likely(p != NULL) && total_size && total_size <= max_slab_size_class) { + memset(p, 0, total_size - canary_size); + } +#ifdef HAS_ARM_MTE + // use an assert instead of adding a conditional to memset() above (freed memory is always + // zeroed when MTE is enabled) + static_assert(ZERO_ON_FREE, "disabling ZERO_ON_FREE reduces performance when ARM MTE is enabled"); +#endif + return p; +} + +EXPORT void *h_realloc(void *old, size_t size) { + size = adjust_size_for_canary(size); + 
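
Aside on h_calloc above: it rejects nmemb * size products that overflow size_t before any allocation happens, using the GCC/Clang __builtin_mul_overflow builtin. A minimal sketch of that check; checked_calloc is a hypothetical stand-in, not the allocator's function:

    /* illustration only; assumes GCC or Clang for the overflow builtin */
    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static void *checked_calloc(size_t nmemb, size_t size) {
        size_t total;
        if (__builtin_mul_overflow(nmemb, size, &total)) {
            errno = ENOMEM;
            return NULL;
        }
        void *p = malloc(total);
        if (p != NULL) {
            memset(p, 0, total);
        }
        return p;
    }

    int main(void) {
        /* (size_t)-1 / 2 elements of 4 bytes each cannot be represented in size_t */
        void *p = checked_calloc((size_t)-1 / 2, 4);
        printf("%s\n", p == NULL ? "overflow rejected" : "allocated");
        free(p);
        return 0;
    }
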
if (old == NULL) { + return alloc(size); + } + + if (size > max_slab_size_class) { + size = get_large_size_class(size); + if (unlikely(!size)) { + errno = ENOMEM; + return NULL; + } + } + + void *old_orig = old; + old = untag_pointer(old); + + size_t old_size; + if (old < get_slab_region_end() && old >= ro.slab_region_start) { + old_size = slab_usable_size(old); + if (size <= max_slab_size_class && get_size_info(size).size == old_size) { + return old_orig; + } + thread_unseal_metadata(); + } else { + enforce_init(); + thread_unseal_metadata(); + + struct region_allocator *ra = ro.region_allocator; + + mutex_lock(&ra->lock); + const struct region_metadata *region = regions_find(old); + if (unlikely(region == NULL)) { + fatal_error("invalid realloc"); + } + old_size = region->size; + size_t old_guard_size = region->guard_size; + if (old_size == size) { + mutex_unlock(&ra->lock); + thread_seal_metadata(); + return old; + } + mutex_unlock(&ra->lock); + + if (size > max_slab_size_class) { + // in-place shrink + if (size < old_size) { + void *new_end = (char *)old + size; + if (memory_map_fixed(new_end, old_guard_size)) { + thread_seal_metadata(); + return NULL; + } + memory_set_name(new_end, old_guard_size, "malloc large"); + void *new_guard_end = (char *)new_end + old_guard_size; + regions_quarantine_deallocate_pages(new_guard_end, old_size - size, 0); + + mutex_lock(&ra->lock); + struct region_metadata *region = regions_find(old); + if (unlikely(region == NULL)) { + fatal_error("invalid realloc"); + } + region->size = size; + stats_large_deallocate(ra, old_size - size); + mutex_unlock(&ra->lock); + + thread_seal_metadata(); + return old; + } + +#ifdef HAVE_COMPATIBLE_MREMAP + static const bool vma_merging_reliable = false; + if (vma_merging_reliable) { + // in-place growth + void *guard_end = (char *)old + old_size + old_guard_size; + size_t extra = size - old_size; + if (!memory_remap((char *)old + old_size, old_guard_size, old_guard_size + extra)) { + if (memory_protect_rw((char *)old + old_size, extra)) { + memory_unmap(guard_end, extra); + } else { + mutex_lock(&ra->lock); + struct region_metadata *region = regions_find(old); + if (region == NULL) { + fatal_error("invalid realloc"); + } + region->size = size; + stats_large_allocate(ra, extra); + mutex_unlock(&ra->lock); + + thread_seal_metadata(); + return old; + } + } + } + + size_t copy_size = min(size, old_size); + if (copy_size >= MREMAP_MOVE_THRESHOLD) { + void *new = allocate_large(size); + if (new == NULL) { + thread_seal_metadata(); + return NULL; + } + + mutex_lock(&ra->lock); + struct region_metadata *region = regions_find(old); + if (unlikely(region == NULL)) { + fatal_error("invalid realloc"); + } + regions_delete(region); + stats_large_deallocate(ra, old_size); + mutex_unlock(&ra->lock); + + if (memory_remap_fixed(old, old_size, new, size)) { + memcpy(new, old, copy_size); + deallocate_pages(old, old_size, old_guard_size); + } else { + memory_unmap((char *)old - old_guard_size, old_guard_size); + memory_unmap((char *)old + page_align(old_size), old_guard_size); + } + thread_seal_metadata(); + return new; + } +#endif + } + } + + void *new = allocate(init(), size); + if (new == NULL) { + thread_seal_metadata(); + return NULL; + } + size_t copy_size = min(size, old_size); + if (copy_size > 0 && copy_size <= max_slab_size_class) { + copy_size -= canary_size; + } + memcpy(new, old_orig, copy_size); + if (old_size <= max_slab_size_class) { + deallocate_small(old, NULL); + } else { + deallocate_large(old, NULL); + } + 
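
Aside on the realloc path above: when none of the in-place options apply (same size class, in-place shrink, or the optional mremap growth/move), h_realloc falls back to the classic allocate/copy/free shape, with the copy length reduced by the slab canary for small allocations. The sketch below shows only that generic fallback shape, assuming a plain malloc/free backend rather than the allocator's slab and region paths:

    /* illustration only; not the allocator's code */
    #include <stdlib.h>
    #include <string.h>

    static void *realloc_by_copy(void *old, size_t old_size, size_t new_size) {
        void *new = malloc(new_size);
        if (new == NULL) {
            return NULL; /* the old allocation stays valid on failure */
        }
        size_t copy_size = old_size < new_size ? old_size : new_size;
        memcpy(new, old, copy_size);
        free(old);
        return new;
    }

    int main(void) {
        char *p = malloc(16);
        if (p == NULL) {
            return 1;
        }
        memcpy(p, "hello", 6);
        p = realloc_by_copy(p, 16, 64);
        if (p == NULL) {
            return 1;
        }
        free(p);
        return 0;
    }
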
thread_seal_metadata(); + return new; +} + +EXPORT int h_posix_memalign(void **memptr, size_t alignment, size_t size) { + return alloc_aligned(memptr, alignment, size, sizeof(void *)); +} + +EXPORT void *h_aligned_alloc(size_t alignment, size_t size) { + return alloc_aligned_simple(alignment, size); +} + +EXPORT void *h_memalign(size_t alignment, size_t size) ALIAS(h_aligned_alloc); + +#ifndef __ANDROID__ +EXPORT void *h_valloc(size_t size) { + return alloc_aligned_simple(PAGE_SIZE, size); +} + +EXPORT void *h_pvalloc(size_t size) { + size = page_align(size); + if (unlikely(!size)) { + errno = ENOMEM; + return NULL; + } + return alloc_aligned_simple(PAGE_SIZE, size); +} +#endif + +// preserves errno +EXPORT void h_free(void *p) { + if (p == NULL) { + return; + } + + p = untag_pointer(p); + + if (p < get_slab_region_end() && p >= ro.slab_region_start) { + thread_unseal_metadata(); + deallocate_small(p, NULL); + thread_seal_metadata(); + return; + } + + int saved_errno = errno; + deallocate_large(p, NULL); + errno = saved_errno; + + thread_seal_metadata(); +} + +#ifdef __GLIBC__ +EXPORT void h_cfree(void *ptr) ALIAS(h_free); +#endif + +EXPORT void h_free_sized(void *p, size_t expected_size) { + if (p == NULL) { + return; + } + + p = untag_pointer(p); + + expected_size = adjust_size_for_canary(expected_size); + + if (p < get_slab_region_end() && p >= ro.slab_region_start) { + thread_unseal_metadata(); + expected_size = get_size_info(expected_size).size; + deallocate_small(p, &expected_size); + thread_seal_metadata(); + return; + } + + deallocate_large(p, &expected_size); + + thread_seal_metadata(); +} + +static inline void memory_corruption_check_small(const void *p) { + struct slab_size_class_info size_class_info = slab_size_class(p); + size_t class = size_class_info.class; + struct size_class *c = &ro.size_class_metadata[size_class_info.arena][class]; + size_t size = size_classes[class]; + bool is_zero_size = size == 0; + if (unlikely(is_zero_size)) { + size = 16; + } + size_t slab_size = get_slab_size(get_slots(class), size); + + mutex_lock(&c->lock); + + const struct slab_metadata *metadata = get_metadata(c, p); + void *slab = get_slab(c, slab_size, metadata); + size_t slot = libdivide_u32_do((const char *)p - (const char *)slab, &c->size_divisor); + + if (unlikely(slot_pointer(size, slab, slot) != p)) { + fatal_error("invalid unaligned malloc_usable_size"); + } + + if (unlikely(!is_used_slot(metadata, slot))) { + fatal_error("invalid malloc_usable_size"); + } + + if (likely(!is_zero_size)) { + check_canary(metadata, p, size); + } + +#if SLAB_QUARANTINE + if (unlikely(is_quarantine_slot(metadata, slot))) { + fatal_error("invalid malloc_usable_size (quarantine)"); + } +#endif + + mutex_unlock(&c->lock); +} + +EXPORT size_t h_malloc_usable_size(H_MALLOC_USABLE_SIZE_CONST void *arg) { + if (arg == NULL) { + return 0; + } + + const void *p = untag_const_pointer(arg); + + if (p < get_slab_region_end() && p >= ro.slab_region_start) { + thread_unseal_metadata(); + memory_corruption_check_small(p); + thread_seal_metadata(); + + size_t size = slab_usable_size(p); + return size ? 
size - canary_size : 0; + } + + enforce_init(); + thread_unseal_metadata(); + + struct region_allocator *ra = ro.region_allocator; + mutex_lock(&ra->lock); + const struct region_metadata *region = regions_find(p); + if (unlikely(region == NULL)) { + fatal_error("invalid malloc_usable_size"); + } + size_t size = region->size; + mutex_unlock(&ra->lock); + + thread_seal_metadata(); + return size; +} + +EXPORT size_t h_malloc_object_size(const void *p) { + if (p == NULL) { + return 0; + } + + const void *slab_region_end = get_slab_region_end(); + if (p < slab_region_end && p >= ro.slab_region_start) { + thread_unseal_metadata(); + + struct slab_size_class_info size_class_info = slab_size_class(p); + size_t class = size_class_info.class; + size_t size_class = size_classes[class]; + struct size_class *c = &ro.size_class_metadata[size_class_info.arena][class]; + + mutex_lock(&c->lock); + + const struct slab_metadata *metadata = get_metadata(c, p); + size_t slab_size = get_slab_size(get_slots(class), size_class); + void *slab = get_slab(c, slab_size, metadata); + size_t slot = libdivide_u32_do((const char *)p - (const char *)slab, &c->size_divisor); + + if (unlikely(!is_used_slot(metadata, slot))) { + fatal_error("invalid malloc_object_size"); + } + +#if SLAB_QUARANTINE + if (unlikely(is_quarantine_slot(metadata, slot))) { + fatal_error("invalid malloc_object_size (quarantine)"); + } +#endif + + void *start = slot_pointer(size_class, slab, slot); + size_t offset = (const char *)p - (const char *)start; + + mutex_unlock(&c->lock); + thread_seal_metadata(); + + size_t size = slab_usable_size(p); + return size ? size - canary_size - offset : 0; + } + + if (unlikely(slab_region_end == NULL)) { + return SIZE_MAX; + } + + thread_unseal_metadata(); + + struct region_allocator *ra = ro.region_allocator; + mutex_lock(&ra->lock); + const struct region_metadata *region = regions_find(p); + size_t size = region == NULL ? SIZE_MAX : region->size; + mutex_unlock(&ra->lock); + + thread_seal_metadata(); + return size; +} + +EXPORT size_t h_malloc_object_size_fast(const void *p) { + if (p == NULL) { + return 0; + } + + const void *slab_region_end = get_slab_region_end(); + if (p < slab_region_end && p >= ro.slab_region_start) { + size_t size = slab_usable_size(p); + return size ? 
size - canary_size : 0; + } + + if (unlikely(slab_region_end == NULL)) { + return 0; + } + + return SIZE_MAX; +} + +EXPORT int h_mallopt(UNUSED int param, UNUSED int value) { +#ifdef __ANDROID__ + if (param == M_PURGE) { + h_malloc_trim(0); + return 1; + } +#endif + return 0; +} + +EXPORT int h_malloc_trim(UNUSED size_t pad) { + if (unlikely(!is_init())) { + return 0; + } + + thread_unseal_metadata(); + + bool is_trimmed = false; + + for (unsigned arena = 0; arena < N_ARENA; arena++) { + // skip zero byte size class since there's nothing to change + for (unsigned class = 1; class < N_SIZE_CLASSES; class++) { + struct size_class *c = &ro.size_class_metadata[arena][class]; + size_t size = size_classes[class]; + size_t slab_size = get_slab_size(get_slots(class), size); + + mutex_lock(&c->lock); + + struct slab_metadata *iterator = c->empty_slabs; + while (iterator) { + void *slab = get_slab(c, slab_size, iterator); + if (memory_map_fixed(slab, slab_size)) { + break; + } + label_slab(slab, slab_size, class); + stats_slab_deallocate(c, slab_size); + + struct slab_metadata *trimmed = iterator; + iterator = iterator->next; + c->empty_slabs_total -= slab_size; + + enqueue_free_slab(c, trimmed); + + is_trimmed = true; + } + c->empty_slabs = iterator; + +#if SLAB_QUARANTINE && CONFIG_EXTENDED_SIZE_CLASSES + if (size >= min_extended_size_class) { + size_t quarantine_shift = clz64(size) - (63 - MAX_SLAB_SIZE_CLASS_SHIFT); + +#if SLAB_QUARANTINE_RANDOM_LENGTH > 0 + size_t slab_quarantine_random_length = SLAB_QUARANTINE_RANDOM_LENGTH << quarantine_shift; + for (size_t i = 0; i < slab_quarantine_random_length; i++) { + void *p = c->quarantine_random[i]; + if (p != NULL) { + memory_purge(p, size); + } + } +#endif + +#if SLAB_QUARANTINE_QUEUE_LENGTH > 0 + size_t slab_quarantine_queue_length = SLAB_QUARANTINE_QUEUE_LENGTH << quarantine_shift; + for (size_t i = 0; i < slab_quarantine_queue_length; i++) { + void *p = c->quarantine_queue[i]; + if (p != NULL) { + memory_purge(p, size); + } + } +#endif + } +#endif + + mutex_unlock(&c->lock); + } + } + + thread_seal_metadata(); + + return is_trimmed; +} + +EXPORT void h_malloc_stats(void) {} + +#if defined(__GLIBC__) || defined(__ANDROID__) +// glibc mallinfo is broken and replaced with mallinfo2 +#if defined(__GLIBC__) +EXPORT struct mallinfo h_mallinfo(void) { + return (struct mallinfo){0}; +} + +EXPORT struct mallinfo2 h_mallinfo2(void) { + struct mallinfo2 info = {0}; +#else +EXPORT struct mallinfo h_mallinfo(void) { + struct mallinfo info = {0}; +#endif + +#if CONFIG_STATS + if (unlikely(!is_init())) { + return info; + } + + thread_unseal_metadata(); + + struct region_allocator *ra = ro.region_allocator; + mutex_lock(&ra->lock); + info.hblkhd += ra->allocated; + info.uordblks += ra->allocated; + mutex_unlock(&ra->lock); + + for (unsigned arena = 0; arena < N_ARENA; arena++) { + for (unsigned class = 0; class < N_SIZE_CLASSES; class++) { + struct size_class *c = &ro.size_class_metadata[arena][class]; + + mutex_lock(&c->lock); + info.hblkhd += c->slab_allocated; + info.uordblks += c->allocated; + mutex_unlock(&c->lock); + } + } + + info.fordblks = info.hblkhd - info.uordblks; + info.usmblks = info.hblkhd; + + thread_seal_metadata(); +#endif + + return info; +} +#endif + +#ifndef __ANDROID__ +EXPORT int h_malloc_info(int options, FILE *fp) { + if (options) { + errno = EINVAL; + return -1; + } + + fputs("", fp); + +#if CONFIG_STATS + if (likely(is_init())) { + thread_unseal_metadata(); + + for (unsigned arena = 0; arena < N_ARENA; arena++) { + fprintf(fp, "", 
arena); + + for (unsigned class = 0; class < N_SIZE_CLASSES; class++) { + struct size_class *c = &ro.size_class_metadata[arena][class]; + + mutex_lock(&c->lock); + u64 nmalloc = c->nmalloc; + u64 ndalloc = c->ndalloc; + size_t slab_allocated = c->slab_allocated; + size_t allocated = c->allocated; + mutex_unlock(&c->lock); + + if (nmalloc || ndalloc || slab_allocated || allocated) { + fprintf(fp, "" + "%" PRIu64 "" + "%" PRIu64 "" + "%zu" + "%zu" + "", class, size_classes[class], nmalloc, ndalloc, slab_allocated, + allocated); + } + } + + fputs("", fp); + } + + struct region_allocator *ra = ro.region_allocator; + mutex_lock(&ra->lock); + size_t region_allocated = ra->allocated; + mutex_unlock(&ra->lock); + + fprintf(fp, "" + "%zu" + "", N_ARENA, region_allocated); + + thread_seal_metadata(); + } +#endif + + fputs("", fp); + + return 0; +} +#endif + +#ifdef __ANDROID__ +EXPORT size_t h_mallinfo_narenas(void) { + // Consider region allocator to be an arena with index N_ARENA. + return N_ARENA + 1; +} + +EXPORT size_t h_mallinfo_nbins(void) { + return N_SIZE_CLASSES; +} + +// This internal Android API uses mallinfo in a non-standard way to implement malloc_info: +// +// hblkhd: total mapped memory as usual +// ordblks: large allocations +// uordblks: huge allocations +// fsmblks: small allocations +// (other fields are unused) +EXPORT struct mallinfo h_mallinfo_arena_info(UNUSED size_t arena) { + struct mallinfo info = {0}; + +#if CONFIG_STATS + if (unlikely(!is_init())) { + return info; + } + + thread_unseal_metadata(); + + if (arena < N_ARENA) { + for (unsigned class = 0; class < N_SIZE_CLASSES; class++) { + struct size_class *c = &ro.size_class_metadata[arena][class]; + + mutex_lock(&c->lock); + info.hblkhd += c->slab_allocated; + info.fsmblks += c->allocated; + mutex_unlock(&c->lock); + } + } else if (arena == N_ARENA) { + struct region_allocator *ra = ro.region_allocator; + mutex_lock(&ra->lock); + info.hblkhd = ra->allocated; + // our large allocations are roughly comparable to jemalloc huge allocations + info.uordblks = ra->allocated; + mutex_unlock(&ra->lock); + } + + thread_seal_metadata(); +#endif + + return info; +} + +// This internal Android API uses mallinfo in a non-standard way to implement malloc_info: +// +// ordblks: total allocated space +// uordblks: nmalloc +// fordblks: ndalloc +// (other fields are unused) +EXPORT struct mallinfo h_mallinfo_bin_info(UNUSED size_t arena, UNUSED size_t bin) { + struct mallinfo info = {0}; + +#if CONFIG_STATS + if (unlikely(!is_init())) { + return info; + } + + if (arena < N_ARENA && bin < N_SIZE_CLASSES) { + thread_seal_metadata(); + + struct size_class *c = &ro.size_class_metadata[arena][bin]; + + mutex_lock(&c->lock); + info.ordblks = c->allocated; + info.uordblks = c->nmalloc; + info.fordblks = c->ndalloc; + mutex_unlock(&c->lock); + + thread_unseal_metadata(); + } +#endif + + return info; +} + +COLD EXPORT int h_malloc_iterate(UNUSED uintptr_t base, UNUSED size_t size, + UNUSED void (*callback)(uintptr_t ptr, size_t size, void *arg), + UNUSED void *arg) { + fatal_error("not implemented"); +} + +COLD EXPORT void h_malloc_disable(void) { + init(); + full_lock(); +} + +COLD EXPORT void h_malloc_enable(void) { + enforce_init(); + full_unlock(); +} +#endif + +#ifdef __GLIBC__ +COLD EXPORT void *h_malloc_get_state(void) { + errno = ENOSYS; + return NULL; +} + +COLD EXPORT int h_malloc_set_state(UNUSED void *state) { + return -2; +} +#endif + +#ifdef __ANDROID__ +COLD EXPORT void h_malloc_disable_memory_tagging(void) { +#ifdef HAS_ARM_MTE + 
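
Aside on the function body that follows: like the end of init_slow_path() earlier, it relies on keeping the global allocator state mapped read-only and only briefly making it writable for an update. The Linux-only sketch below shows the underlying mprotect pattern; the config struct and its padding are invented for the example, and the real code goes through memory_protect_ro/memory_protect_rw (with a pkey when available) instead of raw mprotect calls:

    /* illustration only; assumes Linux, GCC/Clang attributes and 4 KiB pages */
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    #define EXAMPLE_PAGE_SIZE 4096

    /* page-aligned and padded to a full page so mprotect() does not change the
     * protection of unrelated data sharing the page */
    static struct {
        int initialized;
        char name[32];
        char padding[EXAMPLE_PAGE_SIZE - sizeof(int) - 32];
    } config __attribute__((aligned(EXAMPLE_PAGE_SIZE)));

    static void seal(void) {
        if (mprotect(&config, sizeof(config), PROT_READ)) {
            perror("mprotect");
        }
    }

    static void unseal(void) {
        if (mprotect(&config, sizeof(config), PROT_READ | PROT_WRITE)) {
            perror("mprotect");
        }
    }

    int main(void) {
        config.initialized = 1;
        strcpy(config.name, "example");
        seal();                       /* reads keep working while sealed */
        printf("%s\n", config.name);
        unseal();                     /* writes need the pages writable again */
        config.initialized = 2;
        seal();
        return 0;
    }
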
mutex_lock(&init_lock); + if (!ro.is_memtag_disabled) { + if (is_init()) { + if (unlikely(memory_protect_rw(&ro, sizeof(ro)))) { + fatal_error("failed to unprotect allocator data"); + } + ro.is_memtag_disabled = true; + if (unlikely(memory_protect_ro(&ro, sizeof(ro)))) { + fatal_error("failed to protect allocator data"); + } + } else { + // bionic calls this function very early in some cases + ro.is_memtag_disabled = true; + } + } + mutex_unlock(&init_lock); +#endif +} +#endif diff --git a/src/hardened_malloc/include/h_malloc.h b/src/hardened_malloc/include/h_malloc.h new file mode 100644 index 0000000..0eee395 --- /dev/null +++ b/src/hardened_malloc/include/h_malloc.h @@ -0,0 +1,129 @@ +#ifndef ALLOCATOR_H +#define ALLOCATOR_H + +#include + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef H_MALLOC_PREFIX +#define h_malloc malloc +#define h_calloc calloc +#define h_realloc realloc +#define h_aligned_alloc aligned_alloc +#define h_free free + +#define h_posix_memalign posix_memalign + +#define h_malloc_usable_size malloc_usable_size +#define h_mallopt mallopt +#define h_malloc_trim malloc_trim +#define h_malloc_stats malloc_stats +#define h_mallinfo mallinfo +#define h_mallinfo2 mallinfo2 +#define h_malloc_info malloc_info + +#define h_memalign memalign +#define h_valloc valloc +#define h_pvalloc pvalloc +#define h_cfree cfree +#define h_malloc_get_state malloc_get_state +#define h_malloc_set_state malloc_set_state + +#define h_mallinfo_narenas mallinfo_narenas +#define h_mallinfo_nbins mallinfo_nbins +#define h_mallinfo_arena_info mallinfo_arena_info +#define h_mallinfo_bin_info mallinfo_bin_info + +#define h_malloc_iterate malloc_iterate +#define h_malloc_disable malloc_disable +#define h_malloc_enable malloc_enable + +#define h_malloc_object_size malloc_object_size +#define h_malloc_object_size_fast malloc_object_size_fast +#define h_free_sized free_sized +#endif + +// C standard +__attribute__((malloc)) __attribute__((alloc_size(1))) void *h_malloc(size_t size); +__attribute__((malloc)) __attribute__((alloc_size(1, 2))) void *h_calloc(size_t nmemb, size_t size); +__attribute__((alloc_size(2))) void *h_realloc(void *ptr, size_t size); +__attribute__((malloc)) __attribute__((alloc_size(2))) __attribute__((alloc_align(1))) +void *h_aligned_alloc(size_t alignment, size_t size); +void h_free(void *ptr); + +// POSIX +int h_posix_memalign(void **memptr, size_t alignment, size_t size); + +#ifdef __ANDROID__ +#define H_MALLOC_USABLE_SIZE_CONST const +#else +#define H_MALLOC_USABLE_SIZE_CONST +#endif + +// glibc extensions +size_t h_malloc_usable_size(H_MALLOC_USABLE_SIZE_CONST void *ptr); +int h_mallopt(int param, int value); +int h_malloc_trim(size_t pad); +void h_malloc_stats(void); +#if defined(__GLIBC__) || defined(__ANDROID__) +struct mallinfo h_mallinfo(void); +#endif +#ifndef __ANDROID__ +int h_malloc_info(int options, FILE *fp); +#endif + +// obsolete glibc extensions +__attribute__((malloc)) __attribute__((alloc_size(2))) __attribute__((alloc_align(1))) +void *h_memalign(size_t alignment, size_t size); +#ifndef __ANDROID__ +__attribute__((malloc)) __attribute__((alloc_size(1))) void *h_valloc(size_t size); +__attribute__((malloc)) void *h_pvalloc(size_t size); +#endif +#ifdef __GLIBC__ +void h_cfree(void *ptr) __THROW; +void *h_malloc_get_state(void); +int h_malloc_set_state(void *state); +#endif + +// Android extensions +#ifdef __ANDROID__ +size_t h_mallinfo_narenas(void); +size_t h_mallinfo_nbins(void); +struct mallinfo h_mallinfo_arena_info(size_t arena); +struct 
mallinfo h_mallinfo_bin_info(size_t arena, size_t bin); +int h_malloc_iterate(uintptr_t base, size_t size, void (*callback)(uintptr_t ptr, size_t size, void *arg), + void *arg); +void h_malloc_disable(void); +void h_malloc_enable(void); +void h_malloc_disable_memory_tagging(void); +#endif + +// hardened_malloc extensions + +// return an upper bound on object size for any pointer based on malloc metadata +size_t h_malloc_object_size(const void *ptr); + +// similar to malloc_object_size, but avoiding locking so the results are much more limited +size_t h_malloc_object_size_fast(const void *ptr); + +// The free function with an extra parameter for passing the size requested at +// allocation time. +// +// This offers the same functionality as C++14 sized deallocation and can be +// used to implement it. +// +// A performance-oriented allocator would use this as a performance +// enhancement with undefined behavior on a mismatch. Instead, this hardened +// allocator implementation uses it to improve security by checking that the +// passed size matches the allocated size. +void h_free_sized(void *ptr, size_t expected_size); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/hardened_malloc/memory.c b/src/hardened_malloc/memory.c new file mode 100644 index 0000000..5434060 --- /dev/null +++ b/src/hardened_malloc/memory.c @@ -0,0 +1,120 @@ +#include + +#include + +#ifdef LABEL_MEMORY +#include +#endif + +#ifndef PR_SET_VMA +#define PR_SET_VMA 0x53564d41 +#endif + +#ifndef PR_SET_VMA_ANON_NAME +#define PR_SET_VMA_ANON_NAME 0 +#endif + +#include "memory.h" +#include "util.h" + +void *memory_map(size_t size) { + void *p = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + if (unlikely(p == MAP_FAILED)) { + if (errno != ENOMEM) { + fatal_error("non-ENOMEM mmap failure"); + } + return NULL; + } + return p; +} + +#ifdef HAS_ARM_MTE +// Note that PROT_MTE can't be cleared via mprotect +void *memory_map_mte(size_t size) { + void *p = mmap(NULL, size, PROT_MTE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + if (unlikely(p == MAP_FAILED)) { + if (errno != ENOMEM) { + fatal_error("non-ENOMEM MTE mmap failure"); + } + return NULL; + } + return p; +} +#endif + +bool memory_map_fixed(void *ptr, size_t size) { + void *p = mmap(ptr, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0); + bool ret = p == MAP_FAILED; + if (unlikely(ret) && errno != ENOMEM) { + fatal_error("non-ENOMEM MAP_FIXED mmap failure"); + } + return ret; +} + +bool memory_unmap(void *ptr, size_t size) { + bool ret = munmap(ptr, size); + if (unlikely(ret) && errno != ENOMEM) { + fatal_error("non-ENOMEM munmap failure"); + } + return ret; +} + +static bool memory_protect_prot(void *ptr, size_t size, int prot, UNUSED int pkey) { +#ifdef USE_PKEY + bool ret = pkey_mprotect(ptr, size, prot, pkey); +#else + bool ret = mprotect(ptr, size, prot); +#endif + if (unlikely(ret) && errno != ENOMEM) { + fatal_error("non-ENOMEM mprotect failure"); + } + return ret; +} + +bool memory_protect_ro(void *ptr, size_t size) { + return memory_protect_prot(ptr, size, PROT_READ, -1); +} + +bool memory_protect_rw(void *ptr, size_t size) { + return memory_protect_prot(ptr, size, PROT_READ|PROT_WRITE, -1); +} + +bool memory_protect_rw_metadata(void *ptr, size_t size) { + return memory_protect_prot(ptr, size, PROT_READ|PROT_WRITE, get_metadata_key()); +} + +#ifdef HAVE_COMPATIBLE_MREMAP +bool memory_remap(void *old, size_t old_size, size_t new_size) { + void *ptr = mremap(old, old_size, new_size, 0); + bool ret = ptr == MAP_FAILED; + if 
(unlikely(ret) && errno != ENOMEM) { + fatal_error("non-ENOMEM mremap failure"); + } + return ret; +} + +bool memory_remap_fixed(void *old, size_t old_size, void *new, size_t new_size) { + void *ptr = mremap(old, old_size, new_size, MREMAP_MAYMOVE|MREMAP_FIXED, new); + bool ret = ptr == MAP_FAILED; + if (unlikely(ret) && errno != ENOMEM) { + fatal_error("non-ENOMEM MREMAP_FIXED mremap failure"); + } + return ret; +} +#endif + +bool memory_purge(void *ptr, size_t size) { + int ret = madvise(ptr, size, MADV_DONTNEED); + if (unlikely(ret) && errno != ENOMEM) { + fatal_error("non-ENOMEM MADV_DONTNEED madvise failure"); + } + return ret; +} + +bool memory_set_name(UNUSED void *ptr, UNUSED size_t size, UNUSED const char *name) { +#ifdef LABEL_MEMORY + return prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ptr, size, name); +#else + return false; +#endif +} diff --git a/src/hardened_malloc/memory.h b/src/hardened_malloc/memory.h new file mode 100644 index 0000000..6e4cd4d --- /dev/null +++ b/src/hardened_malloc/memory.h @@ -0,0 +1,29 @@ +#ifndef MEMORY_H +#define MEMORY_H + +#include +#include + +#ifdef __linux__ +#define HAVE_COMPATIBLE_MREMAP +#endif + +int get_metadata_key(void); + +void *memory_map(size_t size); +#ifdef HAS_ARM_MTE +void *memory_map_mte(size_t size); +#endif +bool memory_map_fixed(void *ptr, size_t size); +bool memory_unmap(void *ptr, size_t size); +bool memory_protect_ro(void *ptr, size_t size); +bool memory_protect_rw(void *ptr, size_t size); +bool memory_protect_rw_metadata(void *ptr, size_t size); +#ifdef HAVE_COMPATIBLE_MREMAP +bool memory_remap(void *old, size_t old_size, size_t new_size); +bool memory_remap_fixed(void *old, size_t old_size, void *new, size_t new_size); +#endif +bool memory_purge(void *ptr, size_t size); +bool memory_set_name(void *ptr, size_t size, const char *name); + +#endif diff --git a/src/hardened_malloc/memtag.h b/src/hardened_malloc/memtag.h new file mode 100644 index 0000000..0ba4cbc --- /dev/null +++ b/src/hardened_malloc/memtag.h @@ -0,0 +1,49 @@ +#ifndef MEMTAG_H +#define MEMTAG_H + +#include "util.h" + +#ifdef HAS_ARM_MTE +#include "arm_mte.h" +#define MEMTAG 1 +#define RESERVED_TAG 15 +#define TAG_WIDTH 4 +#endif + +static inline void *untag_pointer(void *ptr) { +#ifdef HAS_ARM_MTE + const uintptr_t mask = UINTPTR_MAX >> 8; + return (void *) ((uintptr_t) ptr & mask); +#else + return ptr; +#endif +} + +static inline const void *untag_const_pointer(const void *ptr) { +#ifdef HAS_ARM_MTE + const uintptr_t mask = UINTPTR_MAX >> 8; + return (const void *) ((uintptr_t) ptr & mask); +#else + return ptr; +#endif +} + +static inline void *set_pointer_tag(void *ptr, u8 tag) { +#ifdef HAS_ARM_MTE + return (void *) (((uintptr_t) tag << 56) | (uintptr_t) untag_pointer(ptr)); +#else + (void) tag; + return ptr; +#endif +} + +static inline u8 get_pointer_tag(void *ptr) { +#ifdef HAS_ARM_MTE + return (((uintptr_t) ptr) >> 56) & 0xf; +#else + (void) ptr; + return 0; +#endif +} + +#endif diff --git a/src/hardened_malloc/mutex.h b/src/hardened_malloc/mutex.h new file mode 100644 index 0000000..b8f77f9 --- /dev/null +++ b/src/hardened_malloc/mutex.h @@ -0,0 +1,28 @@ +#ifndef MUTEX_H +#define MUTEX_H + +#include + +#include "util.h" + +struct mutex { + pthread_mutex_t lock; +}; + +#define MUTEX_INITIALIZER (struct mutex){PTHREAD_MUTEX_INITIALIZER} + +static inline void mutex_init(struct mutex *m) { + if (unlikely(pthread_mutex_init(&m->lock, NULL))) { + fatal_error("mutex initialization failed"); + } +} + +static inline void mutex_lock(struct mutex *m) { + 
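
Aside on the pointer-tag helpers in memtag.h above: the tag occupies the top byte of the address (four bits used) and is masked off before the pointer is dereferenced or compared. The sketch below reproduces only the bit manipulation, assuming a 64-bit platform, and does not enable real MTE, so the tag here is purely decorative:

    /* illustration only; assumes 64-bit pointers, no actual MTE involved */
    #include <stdint.h>
    #include <stdio.h>

    static void *untag(void *ptr) {
        return (void *)((uintptr_t)ptr & (UINTPTR_MAX >> 8)); /* clear top byte */
    }

    static void *set_tag(void *ptr, uint8_t tag) {
        return (void *)(((uintptr_t)tag << 56) | (uintptr_t)untag(ptr));
    }

    static uint8_t get_tag(void *ptr) {
        return (uint8_t)(((uintptr_t)ptr >> 56) & 0xf);
    }

    int main(void) {
        int x = 42;
        void *tagged = set_tag(&x, 7);
        printf("tag: %u\n", (unsigned)get_tag(tagged));
        /* dereferencing a tagged pointer is only valid with TBI/MTE; elsewhere
         * the tag must be stripped first */
        printf("value: %d\n", *(int *)untag(tagged));
        return 0;
    }
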
pthread_mutex_lock(&m->lock); +} + +static inline void mutex_unlock(struct mutex *m) { + pthread_mutex_unlock(&m->lock); +} + +#endif diff --git a/src/hardened_malloc/new.cc b/src/hardened_malloc/new.cc new file mode 100644 index 0000000..165e19e --- /dev/null +++ b/src/hardened_malloc/new.cc @@ -0,0 +1,153 @@ +// needed with libstdc++ but not libc++ +#if __has_include() +#include +#endif + +#include + +#include "h_malloc.h" +#include "util.h" + +COLD static void *handle_out_of_memory(size_t size, bool nothrow) { + void *ptr = nullptr; + + do { + std::new_handler handler = std::get_new_handler(); + if (handler == nullptr) { + break; + } + + try { + handler(); + } catch (const std::bad_alloc &) { + break; + } + + ptr = h_malloc(size); + } while (ptr == nullptr); + + if (ptr == nullptr && !nothrow) { + std::__throw_bad_alloc(); + } + return ptr; +} + +static inline void *new_impl(size_t size, bool nothrow) { + void *ptr = h_malloc(size); + if (likely(ptr != nullptr)) { + return ptr; + } + return handle_out_of_memory(size, nothrow); +} + +EXPORT void *operator new(size_t size) { + return new_impl(size, false); +} + +EXPORT void *operator new[](size_t size) { + return new_impl(size, false); +} + +EXPORT void *operator new(size_t size, const std::nothrow_t &) noexcept { + return new_impl(size, true); +} + +EXPORT void *operator new[](size_t size, const std::nothrow_t &) noexcept { + return new_impl(size, true); +} + +EXPORT void operator delete(void *ptr) noexcept { + h_free(ptr); +} + +EXPORT void operator delete[](void *ptr) noexcept { + h_free(ptr); +} + +EXPORT void operator delete(void *ptr, const std::nothrow_t &) noexcept { + h_free(ptr); +} + +EXPORT void operator delete[](void *ptr, const std::nothrow_t &) noexcept { + h_free(ptr); +} + +EXPORT void operator delete(void *ptr, size_t size) noexcept { + h_free_sized(ptr, size); +} + +EXPORT void operator delete[](void *ptr, size_t size) noexcept { + h_free_sized(ptr, size); +} + +COLD static void *handle_out_of_memory(size_t size, size_t alignment, bool nothrow) { + void *ptr = nullptr; + + do { + std::new_handler handler = std::get_new_handler(); + if (handler == nullptr) { + break; + } + + try { + handler(); + } catch (const std::bad_alloc &) { + break; + } + + ptr = h_aligned_alloc(alignment, size); + } while (ptr == nullptr); + + if (ptr == nullptr && !nothrow) { + std::__throw_bad_alloc(); + } + return ptr; +} + +static inline void *new_impl(size_t size, size_t alignment, bool nothrow) { + void *ptr = h_aligned_alloc(alignment, size); + if (likely(ptr != nullptr)) { + return ptr; + } + return handle_out_of_memory(size, alignment, nothrow); +} + +EXPORT void *operator new(size_t size, std::align_val_t alignment) { + return new_impl(size, static_cast(alignment), false); +} + +EXPORT void *operator new[](size_t size, std::align_val_t alignment) { + return new_impl(size, static_cast(alignment), false); +} + +EXPORT void *operator new(size_t size, std::align_val_t alignment, const std::nothrow_t &) noexcept { + return new_impl(size, static_cast(alignment), true); +} + +EXPORT void *operator new[](size_t size, std::align_val_t alignment, const std::nothrow_t &) noexcept { + return new_impl(size, static_cast(alignment), true); +} + +EXPORT void operator delete(void *ptr, std::align_val_t) noexcept { + h_free(ptr); +} + +EXPORT void operator delete[](void *ptr, std::align_val_t) noexcept { + h_free(ptr); +} + +EXPORT void operator delete(void *ptr, std::align_val_t, const std::nothrow_t &) noexcept { + h_free(ptr); +} + +EXPORT void 
operator delete[](void *ptr, std::align_val_t, const std::nothrow_t &) noexcept { + h_free(ptr); +} + +EXPORT void operator delete(void *ptr, size_t size, std::align_val_t) noexcept { + h_free_sized(ptr, size); +} + +EXPORT void operator delete[](void *ptr, size_t size, std::align_val_t) noexcept { + h_free_sized(ptr, size); +} diff --git a/src/hardened_malloc/pages.c b/src/hardened_malloc/pages.c new file mode 100644 index 0000000..27558de --- /dev/null +++ b/src/hardened_malloc/pages.c @@ -0,0 +1,88 @@ +#include + +#include "memory.h" +#include "pages.h" +#include "util.h" + +static bool add_guards(size_t size, size_t guard_size, size_t *total_size) { + return __builtin_add_overflow(size, guard_size, total_size) || + __builtin_add_overflow(*total_size, guard_size, total_size); +} + +void *allocate_pages(size_t usable_size, size_t guard_size, bool unprotect, const char *name) { + size_t real_size; + if (unlikely(add_guards(usable_size, guard_size, &real_size))) { + errno = ENOMEM; + return NULL; + } + void *real = memory_map(real_size); + if (unlikely(real == NULL)) { + return NULL; + } + memory_set_name(real, real_size, name); + void *usable = (char *)real + guard_size; + if (unprotect && unlikely(memory_protect_rw(usable, usable_size))) { + memory_unmap(real, real_size); + return NULL; + } + return usable; +} + +void *allocate_pages_aligned(size_t usable_size, size_t alignment, size_t guard_size, const char *name) { + usable_size = page_align(usable_size); + if (unlikely(!usable_size)) { + errno = ENOMEM; + return NULL; + } + + size_t alloc_size; + if (unlikely(__builtin_add_overflow(usable_size, alignment - PAGE_SIZE, &alloc_size))) { + errno = ENOMEM; + return NULL; + } + + size_t real_alloc_size; + if (unlikely(add_guards(alloc_size, guard_size, &real_alloc_size))) { + errno = ENOMEM; + return NULL; + } + + void *real = memory_map(real_alloc_size); + if (unlikely(real == NULL)) { + return NULL; + } + memory_set_name(real, real_alloc_size, name); + + void *usable = (char *)real + guard_size; + + size_t lead_size = align((uintptr_t)usable, alignment) - (uintptr_t)usable; + size_t trail_size = alloc_size - lead_size - usable_size; + void *base = (char *)usable + lead_size; + + if (unlikely(memory_protect_rw(base, usable_size))) { + memory_unmap(real, real_alloc_size); + return NULL; + } + + if (lead_size) { + if (unlikely(memory_unmap(real, lead_size))) { + memory_unmap(real, real_alloc_size); + return NULL; + } + } + + if (trail_size) { + if (unlikely(memory_unmap((char *)base + usable_size + guard_size, trail_size))) { + memory_unmap(real, real_alloc_size); + return NULL; + } + } + + return base; +} + +void deallocate_pages(void *usable, size_t usable_size, size_t guard_size) { + if (unlikely(memory_unmap((char *)usable - guard_size, usable_size + guard_size * 2))) { + memory_purge(usable, usable_size); + } +} diff --git a/src/hardened_malloc/pages.h b/src/hardened_malloc/pages.h new file mode 100644 index 0000000..8795ddc --- /dev/null +++ b/src/hardened_malloc/pages.h @@ -0,0 +1,32 @@ +#ifndef PAGES_H +#define PAGES_H + +#include +#include +#include + +#include "util.h" + +#define PAGE_SHIFT 12 +#ifndef PAGE_SIZE +#define PAGE_SIZE ((size_t)1 << PAGE_SHIFT) +#endif + +void *allocate_pages(size_t usable_size, size_t guard_size, bool unprotect, const char *name); +void *allocate_pages_aligned(size_t usable_size, size_t alignment, size_t guard_size, const char *name); +void deallocate_pages(void *usable, size_t usable_size, size_t guard_size); + +static inline size_t page_align(size_t 
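
Aside on allocate_pages() above: every usable mapping is surrounded by PROT_NONE guard regions, so linear overflows and underflows fault instead of silently corrupting adjacent data. A reduced Linux-only sketch of that layout; the overflow checks, random guard sizing and mapping names used by the real code are omitted:

    /* illustration only; assumes Linux/POSIX mmap */
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static void *map_with_guards(size_t usable_size, size_t guard_size) {
        size_t real_size = usable_size + 2 * guard_size; /* no overflow check here */
        char *real = mmap(NULL, real_size, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
        if (real == MAP_FAILED) {
            return NULL;
        }
        char *usable = real + guard_size;
        if (mprotect(usable, usable_size, PROT_READ | PROT_WRITE)) {
            munmap(real, real_size);
            return NULL;
        }
        return usable;
    }

    static void unmap_with_guards(void *usable, size_t usable_size, size_t guard_size) {
        munmap((char *)usable - guard_size, usable_size + 2 * guard_size);
    }

    int main(void) {
        size_t page = (size_t)sysconf(_SC_PAGESIZE);
        char *p = map_with_guards(4 * page, page);
        if (p == NULL) {
            return 1;
        }
        memset(p, 'a', 4 * page);   /* in-bounds writes are fine */
        /* p[4 * page] = 'x'; */    /* would fault: lands in the trailing guard */
        unmap_with_guards(p, 4 * page, page);
        return 0;
    }
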
size) { + return align(size, PAGE_SIZE); +} + +static inline size_t hash_page(const void *p) { + uintptr_t u = (uintptr_t)p >> PAGE_SHIFT; + size_t sum = u; + sum = (sum << 7) - sum + (u >> 16); + sum = (sum << 7) - sum + (u >> 32); + sum = (sum << 7) - sum + (u >> 48); + return sum; +} + +#endif diff --git a/src/hardened_malloc/preload.sh b/src/hardened_malloc/preload.sh new file mode 100755 index 0000000..ee6abb6 --- /dev/null +++ b/src/hardened_malloc/preload.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +[[ $LD_PRELOAD ]] && LD_PRELOAD+=" " +export LD_PRELOAD+="$dir/libhardened_malloc.so" +exec "$@" diff --git a/src/hardened_malloc/random.c b/src/hardened_malloc/random.c new file mode 100644 index 0000000..8883531 --- /dev/null +++ b/src/hardened_malloc/random.c @@ -0,0 +1,128 @@ +#include +#include + +#include "chacha.h" +#include "random.h" +#include "util.h" + +#include + +static void get_random_seed(void *buf, size_t size) { + while (size) { + ssize_t r; + + do { + r = getrandom(buf, size, 0); + } while (r == -1 && errno == EINTR); + + if (r <= 0) { + fatal_error("getrandom failed"); + } + + buf = (char *)buf + r; + size -= r; + } +} + +void random_state_init(struct random_state *state) { + u8 rnd[CHACHA_KEY_SIZE + CHACHA_IV_SIZE]; + get_random_seed(rnd, sizeof(rnd)); + chacha_keysetup(&state->ctx, rnd); + chacha_ivsetup(&state->ctx, rnd + CHACHA_KEY_SIZE); + state->index = RANDOM_CACHE_SIZE; + state->reseed = 0; +} + +void random_state_init_from_random_state(struct random_state *state, struct random_state *source) { + u8 rnd[CHACHA_KEY_SIZE + CHACHA_IV_SIZE]; + get_random_bytes(source, rnd, sizeof(rnd)); + chacha_keysetup(&state->ctx, rnd); + chacha_ivsetup(&state->ctx, rnd + CHACHA_KEY_SIZE); + state->index = RANDOM_CACHE_SIZE; + state->reseed = 0; +} + +static void refill(struct random_state *state) { + if (state->reseed >= RANDOM_RESEED_SIZE) { + random_state_init(state); + } + chacha_keystream_bytes(&state->ctx, state->cache, RANDOM_CACHE_SIZE); + state->index = 0; + state->reseed += RANDOM_CACHE_SIZE; +} + +void get_random_bytes(struct random_state *state, void *buf, size_t size) { + // avoid needless copying to and from the cache as an optimization + if (size > RANDOM_CACHE_SIZE / 2) { + chacha_keystream_bytes(&state->ctx, buf, size); + return; + } + + while (size) { + if (state->index == RANDOM_CACHE_SIZE) { + refill(state); + } + + size_t remaining = RANDOM_CACHE_SIZE - state->index; + size_t copy_size = min(size, remaining); + memcpy(buf, state->cache + state->index, copy_size); + state->index += copy_size; + + buf = (char *)buf + copy_size; + size -= copy_size; + } +} + +u16 get_random_u16(struct random_state *state) { + u16 value; + unsigned remaining = RANDOM_CACHE_SIZE - state->index; + if (remaining < sizeof(value)) { + refill(state); + } + memcpy(&value, state->cache + state->index, sizeof(value)); + state->index += sizeof(value); + return value; +} + +// See Fast Random Integer Generation in an Interval by Daniel Lemire +u16 get_random_u16_uniform(struct random_state *state, u16 bound) { + u32 random = get_random_u16(state); + u32 multiresult = random * bound; + u16 leftover = multiresult; + if (leftover < bound) { + u16 threshold = -bound % bound; + while (leftover < threshold) { + random = get_random_u16(state); + multiresult = random * bound; + leftover = (u16)multiresult; + } + } + return multiresult >> 16; +} + +u64 get_random_u64(struct random_state *state) { + u64 value; + unsigned remaining = RANDOM_CACHE_SIZE - 
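
Aside on get_random_u16_uniform and get_random_u64_uniform above: they use Lemire's multiply-and-shift method to map a uniform word to [0, bound) without the bias of a plain modulo, rejecting only the small range of values below 2^w mod bound. The sketch below shows the 16-bit variant with a throwaway xorshift source standing in for the allocator's ChaCha-based stream; bound must be nonzero:

    /* illustration only; the source PRNG here is for the demo, not the allocator's */
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t rng_state = 0x12345678u;

    static uint16_t random_u16(void) {
        /* xorshift32; any uniform 16-bit source works */
        rng_state ^= rng_state << 13;
        rng_state ^= rng_state >> 17;
        rng_state ^= rng_state << 5;
        return (uint16_t)rng_state;
    }

    static uint16_t random_u16_uniform(uint16_t bound) {
        uint32_t random = random_u16();
        uint32_t multiresult = random * bound;
        uint16_t leftover = (uint16_t)multiresult;
        if (leftover < bound) {
            /* reject values below 2^16 mod bound to remove the bias */
            uint16_t threshold = (uint16_t)(65536u % bound);
            while (leftover < threshold) {
                random = random_u16();
                multiresult = random * bound;
                leftover = (uint16_t)multiresult;
            }
        }
        return (uint16_t)(multiresult >> 16);
    }

    int main(void) {
        unsigned counts[3] = {0};
        for (int i = 0; i < 30000; i++) {
            counts[random_u16_uniform(3)]++;
        }
        printf("%u %u %u\n", counts[0], counts[1], counts[2]);
        return 0;
    }
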
state->index; + if (remaining < sizeof(value)) { + refill(state); + } + memcpy(&value, state->cache + state->index, sizeof(value)); + state->index += sizeof(value); + return value; +} + +// See Fast Random Integer Generation in an Interval by Daniel Lemire +u64 get_random_u64_uniform(struct random_state *state, u64 bound) { + u128 random = get_random_u64(state); + u128 multiresult = random * bound; + u64 leftover = multiresult; + if (leftover < bound) { + u64 threshold = -bound % bound; + while (leftover < threshold) { + random = get_random_u64(state); + multiresult = random * bound; + leftover = multiresult; + } + } + return multiresult >> 64; +} diff --git a/src/hardened_malloc/random.h b/src/hardened_malloc/random.h new file mode 100644 index 0000000..14703bb --- /dev/null +++ b/src/hardened_malloc/random.h @@ -0,0 +1,25 @@ +#ifndef RANDOM_H +#define RANDOM_H + +#include "chacha.h" +#include "util.h" + +#define RANDOM_CACHE_SIZE 256U +#define RANDOM_RESEED_SIZE (256U * 1024) + +struct random_state { + unsigned index; + unsigned reseed; + chacha_ctx ctx; + u8 cache[RANDOM_CACHE_SIZE]; +}; + +void random_state_init(struct random_state *state); +void random_state_init_from_random_state(struct random_state *state, struct random_state *source); +void get_random_bytes(struct random_state *state, void *buf, size_t size); +u16 get_random_u16(struct random_state *state); +u16 get_random_u16_uniform(struct random_state *state, u16 bound); +u64 get_random_u64(struct random_state *state); +u64 get_random_u64_uniform(struct random_state *state, u64 bound); + +#endif diff --git a/src/hardened_malloc/test/.gitignore b/src/hardened_malloc/test/.gitignore new file mode 100644 index 0000000..d37a6a7 --- /dev/null +++ b/src/hardened_malloc/test/.gitignore @@ -0,0 +1,44 @@ +large_array_growth +mallinfo +mallinfo2 +malloc_info +offset +delete_type_size_mismatch +double_free_large +double_free_large_delayed +double_free_small +double_free_small_delayed +invalid_free_protected +invalid_free_small_region +invalid_free_small_region_far +invalid_free_unprotected +read_after_free_large +read_after_free_small +read_zero_size +string_overflow +unaligned_free_large +unaligned_free_small +uninitialized_free +uninitialized_malloc_usable_size +uninitialized_realloc +write_after_free_large +write_after_free_large_reuse +write_after_free_small +write_after_free_small_reuse +write_zero_size +unaligned_malloc_usable_size_small +invalid_malloc_usable_size_small +invalid_malloc_usable_size_small_quarantine +malloc_object_size +malloc_object_size_offset +invalid_malloc_object_size_small +invalid_malloc_object_size_small_quarantine +impossibly_large_malloc +overflow_large_1_byte +overflow_large_8_byte +overflow_small_1_byte +overflow_small_8_byte +uninitialized_read_large +uninitialized_read_small +realloc_init +__pycache__/ diff --git a/src/hardened_malloc/test/Makefile b/src/hardened_malloc/test/Makefile new file mode 100644 index 0000000..0eb3921 --- /dev/null +++ b/src/hardened_malloc/test/Makefile @@ -0,0 +1,76 @@ +CONFIG_SLAB_CANARY := true +CONFIG_EXTENDED_SIZE_CLASSES := true + +ifneq ($(VARIANT),) + $(error testing non-default variants not yet supported) +endif + +ifeq (,$(filter $(CONFIG_SLAB_CANARY),true false)) + $(error CONFIG_SLAB_CANARY must be true or false) +endif + +dir=$(dir $(realpath $(firstword $(MAKEFILE_LIST)))) + +CPPFLAGS := \ + -D_GNU_SOURCE \ + -DSLAB_CANARY=$(CONFIG_SLAB_CANARY) \ + -DCONFIG_EXTENDED_SIZE_CLASSES=$(CONFIG_EXTENDED_SIZE_CLASSES) + +SHARED_FLAGS := -O3 + +CFLAGS := -std=c17 
$(SHARED_FLAGS) -Wmissing-prototypes +CXXFLAGS := -std=c++17 -fsized-deallocation $(SHARED_FLAGS) +LDFLAGS := -Wl,-L$(dir)../out,-R,$(dir)../out + +LDLIBS := -lpthread -lhardened_malloc + +EXECUTABLES := \ + offset \ + mallinfo \ + mallinfo2 \ + malloc_info \ + large_array_growth \ + double_free_large \ + double_free_large_delayed \ + double_free_small \ + double_free_small_delayed \ + unaligned_free_large \ + unaligned_free_small \ + read_after_free_large \ + read_after_free_small \ + write_after_free_large \ + write_after_free_large_reuse \ + write_after_free_small \ + write_after_free_small_reuse \ + read_zero_size \ + write_zero_size \ + invalid_free_protected \ + invalid_free_unprotected \ + invalid_free_small_region \ + invalid_free_small_region_far \ + uninitialized_read_small \ + uninitialized_read_large \ + uninitialized_free \ + uninitialized_realloc \ + uninitialized_malloc_usable_size \ + overflow_large_1_byte \ + overflow_large_8_byte \ + overflow_small_1_byte \ + overflow_small_8_byte \ + string_overflow \ + delete_type_size_mismatch \ + unaligned_malloc_usable_size_small \ + invalid_malloc_usable_size_small \ + invalid_malloc_usable_size_small_quarantine \ + malloc_object_size \ + malloc_object_size_offset \ + invalid_malloc_object_size_small \ + invalid_malloc_object_size_small_quarantine \ + impossibly_large_malloc \ + realloc_init + +all: $(EXECUTABLES) + +clean: + rm -f $(EXECUTABLES) + rm -fr ./__pycache__ diff --git a/src/hardened_malloc/test/__init__.py b/src/hardened_malloc/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/hardened_malloc/test/delete_type_size_mismatch.cc b/src/hardened_malloc/test/delete_type_size_mismatch.cc new file mode 100644 index 0000000..92bb374 --- /dev/null +++ b/src/hardened_malloc/test/delete_type_size_mismatch.cc @@ -0,0 +1,14 @@ +#include + +#include "test_util.h" + +struct foo { + uint64_t a, b, c, d; +}; + +OPTNONE int main(void) { + void *p = new char; + struct foo *c = (struct foo *)p; + delete c; + return 0; +} diff --git a/src/hardened_malloc/test/double_free_large.c b/src/hardened_malloc/test/double_free_large.c new file mode 100644 index 0000000..ee740e1 --- /dev/null +++ b/src/hardened_malloc/test/double_free_large.c @@ -0,0 +1,13 @@ +#include + +#include "test_util.h" + +OPTNONE int main(void) { + void *p = malloc(256 * 1024); + if (!p) { + return 1; + } + free(p); + free(p); + return 0; +} diff --git a/src/hardened_malloc/test/double_free_large_delayed.c b/src/hardened_malloc/test/double_free_large_delayed.c new file mode 100644 index 0000000..232a812 --- /dev/null +++ b/src/hardened_malloc/test/double_free_large_delayed.c @@ -0,0 +1,18 @@ +#include + +#include "test_util.h" + +OPTNONE int main(void) { + void *p = malloc(256 * 1024); + if (!p) { + return 1; + } + void *q = malloc(256 * 1024); + if (!q) { + return 1; + } + free(p); + free(q); + free(p); + return 0; +} diff --git a/src/hardened_malloc/test/double_free_small.c b/src/hardened_malloc/test/double_free_small.c new file mode 100644 index 0000000..94ab0ba --- /dev/null +++ b/src/hardened_malloc/test/double_free_small.c @@ -0,0 +1,13 @@ +#include + +#include "test_util.h" + +OPTNONE int main(void) { + void *p = malloc(16); + if (!p) { + return 1; + } + free(p); + free(p); + return 0; +} diff --git a/src/hardened_malloc/test/double_free_small_delayed.c b/src/hardened_malloc/test/double_free_small_delayed.c new file mode 100644 index 0000000..5a9a34e --- /dev/null +++ b/src/hardened_malloc/test/double_free_small_delayed.c @@ -0,0 +1,18 @@ 
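
Aside on the double-free tests above: each program simply performs the invalid free and relies on the allocator to terminate the process, with the suite's Python driver (test_smc.py, listed earlier in this patch) checking the resulting exit status. The standalone harness below is not part of the suite; it only illustrates how such a crash can be observed from a parent process, assuming the active allocator aborts when it detects the double free (checking for a signal termination, rather than a specific exit code, is an assumption of this sketch):

    /* illustration only; not one of the suite's tests */
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void) {
        pid_t pid = fork();
        if (pid == -1) {
            return 1;
        }
        if (pid == 0) {
            void *p = malloc(16);
            if (p == NULL) {
                _exit(1);
            }
            free(p);
            free(p);   /* should be detected and terminate the child */
            _exit(0);  /* reached only if the double free went unnoticed */
        }
        int status;
        if (waitpid(pid, &status, 0) == -1) {
            return 1;
        }
        if (WIFSIGNALED(status)) {
            printf("child terminated by signal %d\n", WTERMSIG(status));
            return 0;
        }
        printf("double free was not detected\n");
        return 1;
    }
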
+#include + +#include "test_util.h" + +OPTNONE int main(void) { + void *p = malloc(16); + if (!p) { + return 1; + } + void *q = malloc(16); + if (!q) { + return 1; + } + free(p); + free(q); + free(p); + return 0; +} diff --git a/src/hardened_malloc/test/impossibly_large_malloc.c b/src/hardened_malloc/test/impossibly_large_malloc.c new file mode 100644 index 0000000..63cdc0c --- /dev/null +++ b/src/hardened_malloc/test/impossibly_large_malloc.c @@ -0,0 +1,8 @@ +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(-8); + return !(p == NULL); +} diff --git a/src/hardened_malloc/test/invalid_free_protected.c b/src/hardened_malloc/test/invalid_free_protected.c new file mode 100644 index 0000000..0364baa --- /dev/null +++ b/src/hardened_malloc/test/invalid_free_protected.c @@ -0,0 +1,15 @@ +#include + +#include + +#include "test_util.h" + +OPTNONE int main(void) { + free(malloc(16)); + char *p = mmap(NULL, 4096 * 16, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + if (p == MAP_FAILED) { + return 1; + } + free(p + 4096 * 8); + return 0; +} diff --git a/src/hardened_malloc/test/invalid_free_small_region.c b/src/hardened_malloc/test/invalid_free_small_region.c new file mode 100644 index 0000000..81cfbf2 --- /dev/null +++ b/src/hardened_malloc/test/invalid_free_small_region.c @@ -0,0 +1,13 @@ +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(16); + if (!p) { + return 1; + } + char *q = p + 4096 * 4; + free(q); + return 0; +} diff --git a/src/hardened_malloc/test/invalid_free_small_region_far.c b/src/hardened_malloc/test/invalid_free_small_region_far.c new file mode 100644 index 0000000..c35c1ba --- /dev/null +++ b/src/hardened_malloc/test/invalid_free_small_region_far.c @@ -0,0 +1,13 @@ +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(16); + if (!p) { + return 1; + } + char *q = p + 1024 * 1024 * 1024; + free(q); + return 0; +} diff --git a/src/hardened_malloc/test/invalid_free_unprotected.c b/src/hardened_malloc/test/invalid_free_unprotected.c new file mode 100644 index 0000000..26254ab --- /dev/null +++ b/src/hardened_malloc/test/invalid_free_unprotected.c @@ -0,0 +1,15 @@ +#include + +#include + +#include "test_util.h" + +OPTNONE int main(void) { + free(malloc(16)); + char *p = mmap(NULL, 4096 * 16, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + if (p == MAP_FAILED) { + return 1; + } + free(p + 4096 * 8); + return 0; +} diff --git a/src/hardened_malloc/test/invalid_malloc_object_size_small.c b/src/hardened_malloc/test/invalid_malloc_object_size_small.c new file mode 100644 index 0000000..33cc78f --- /dev/null +++ b/src/hardened_malloc/test/invalid_malloc_object_size_small.c @@ -0,0 +1,15 @@ +#include + +#include "test_util.h" + +size_t malloc_object_size(void *ptr); + +OPTNONE int main(void) { + char *p = malloc(16); + if (!p) { + return 1; + } + char *q = p + 4096 * 4; + malloc_object_size(q); + return 0; +} diff --git a/src/hardened_malloc/test/invalid_malloc_object_size_small_quarantine.c b/src/hardened_malloc/test/invalid_malloc_object_size_small_quarantine.c new file mode 100644 index 0000000..1a26bc0 --- /dev/null +++ b/src/hardened_malloc/test/invalid_malloc_object_size_small_quarantine.c @@ -0,0 +1,15 @@ +#include + +#include "test_util.h" + +size_t malloc_object_size(void *ptr); + +OPTNONE int main(void) { + void *p = malloc(16); + if (!p) { + return 1; + } + free(p); + malloc_object_size(p); + return 0; +} diff --git a/src/hardened_malloc/test/invalid_malloc_usable_size_small.c 
b/src/hardened_malloc/test/invalid_malloc_usable_size_small.c new file mode 100644 index 0000000..440aa6b --- /dev/null +++ b/src/hardened_malloc/test/invalid_malloc_usable_size_small.c @@ -0,0 +1,13 @@ +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(16); + if (!p) { + return 1; + } + char *q = p + 4096 * 4; + malloc_usable_size(q); + return 0; +} diff --git a/src/hardened_malloc/test/invalid_malloc_usable_size_small_quarantine.c b/src/hardened_malloc/test/invalid_malloc_usable_size_small_quarantine.c new file mode 100644 index 0000000..926acd7 --- /dev/null +++ b/src/hardened_malloc/test/invalid_malloc_usable_size_small_quarantine.c @@ -0,0 +1,13 @@ +#include + +#include "test_util.h" + +OPTNONE int main(void) { + void *p = malloc(16); + if (!p) { + return 1; + } + free(p); + malloc_usable_size(p); + return 0; +} diff --git a/src/hardened_malloc/test/large_array_growth.c b/src/hardened_malloc/test/large_array_growth.c new file mode 100644 index 0000000..09f89c5 --- /dev/null +++ b/src/hardened_malloc/test/large_array_growth.c @@ -0,0 +1,18 @@ +#include +#include + +#include "test_util.h" + +OPTNONE int main(void) { + void *p = NULL; + size_t size = 256 * 1024; + + for (unsigned i = 0; i < 20; i++) { + p = realloc(p, size); + if (!p) { + return 1; + } + memset(p, 'a', size); + size = size * 3 / 2; + } +} diff --git a/src/hardened_malloc/test/mallinfo.c b/src/hardened_malloc/test/mallinfo.c new file mode 100644 index 0000000..6008040 --- /dev/null +++ b/src/hardened_malloc/test/mallinfo.c @@ -0,0 +1,44 @@ +#include +#include + +#if defined(__GLIBC__) || defined(__ANDROID__) +#include +#endif + +#include "test_util.h" + +static void print_mallinfo(void) { +#if defined(__GLIBC__) || defined(__ANDROID__) + struct mallinfo info = mallinfo(); + printf("mallinfo:\n"); + printf("arena: %zu\n", (size_t)info.arena); + printf("ordblks: %zu\n", (size_t)info.ordblks); + printf("smblks: %zu\n", (size_t)info.smblks); + printf("hblks: %zu\n", (size_t)info.hblks); + printf("hblkhd: %zu\n", (size_t)info.hblkhd); + printf("usmblks: %zu\n", (size_t)info.usmblks); + printf("fsmblks: %zu\n", (size_t)info.fsmblks); + printf("uordblks: %zu\n", (size_t)info.uordblks); + printf("fordblks: %zu\n", (size_t)info.fordblks); + printf("keepcost: %zu\n", (size_t)info.keepcost); +#endif +} + +OPTNONE int main(void) { + void *a[4]; + + a[0] = malloc(1024 * 1024 * 1024); + a[1] = malloc(16); + a[2] = malloc(32); + a[3] = malloc(64); + + print_mallinfo(); + + free(a[0]); + free(a[1]); + free(a[2]); + free(a[3]); + + printf("\n"); + print_mallinfo(); +} diff --git a/src/hardened_malloc/test/mallinfo2.c b/src/hardened_malloc/test/mallinfo2.c new file mode 100644 index 0000000..2f4cd33 --- /dev/null +++ b/src/hardened_malloc/test/mallinfo2.c @@ -0,0 +1,44 @@ +#include +#include + +#if defined(__GLIBC__) +#include +#endif + +#include "test_util.h" + +static void print_mallinfo2(void) { +#if defined(__GLIBC__) + struct mallinfo2 info = mallinfo2(); + printf("mallinfo2:\n"); + printf("arena: %zu\n", (size_t)info.arena); + printf("ordblks: %zu\n", (size_t)info.ordblks); + printf("smblks: %zu\n", (size_t)info.smblks); + printf("hblks: %zu\n", (size_t)info.hblks); + printf("hblkhd: %zu\n", (size_t)info.hblkhd); + printf("usmblks: %zu\n", (size_t)info.usmblks); + printf("fsmblks: %zu\n", (size_t)info.fsmblks); + printf("uordblks: %zu\n", (size_t)info.uordblks); + printf("fordblks: %zu\n", (size_t)info.fordblks); + printf("keepcost: %zu\n", (size_t)info.keepcost); +#endif +} + +OPTNONE int main(void) { 
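    /* allocate one 1 GiB block and three small blocks, print the mallinfo2
     * counters, then free everything and print them again */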
+ void *a[4]; + + a[0] = malloc(1024 * 1024 * 1024); + a[1] = malloc(16); + a[2] = malloc(32); + a[3] = malloc(64); + + print_mallinfo2(); + + free(a[0]); + free(a[1]); + free(a[2]); + free(a[3]); + + printf("\n"); + print_mallinfo2(); +} diff --git a/src/hardened_malloc/test/malloc_info.c b/src/hardened_malloc/test/malloc_info.c new file mode 100644 index 0000000..50b256f --- /dev/null +++ b/src/hardened_malloc/test/malloc_info.c @@ -0,0 +1,35 @@ +#include +#include + +#if defined(__GLIBC__) || defined(__ANDROID__) +#include +#endif + +#include "test_util.h" +#include "../util.h" + +OPTNONE static void leak_memory(void) { + (void)!malloc(1024 * 1024 * 1024); + (void)!malloc(16); + (void)!malloc(32); + (void)!malloc(4096); +} + +static void *do_work(UNUSED void *p) { + leak_memory(); + return NULL; +} + +int main(void) { + pthread_t thread[4]; + for (int i = 0; i < 4; i++) { + pthread_create(&thread[i], NULL, do_work, NULL); + } + for (int i = 0; i < 4; i++) { + pthread_join(thread[i], NULL); + } + +#if defined(__GLIBC__) || defined(__ANDROID__) + malloc_info(0, stdout); +#endif +} diff --git a/src/hardened_malloc/test/malloc_object_size.c b/src/hardened_malloc/test/malloc_object_size.c new file mode 100644 index 0000000..5ab9280 --- /dev/null +++ b/src/hardened_malloc/test/malloc_object_size.c @@ -0,0 +1,12 @@ +#include +#include + +#include "test_util.h" + +size_t malloc_object_size(void *ptr); + +OPTNONE int main(void) { + char *p = malloc(16); + size_t size = malloc_object_size(p); + return size != (SLAB_CANARY ? 24 : 32); +} diff --git a/src/hardened_malloc/test/malloc_object_size_offset.c b/src/hardened_malloc/test/malloc_object_size_offset.c new file mode 100644 index 0000000..d605906 --- /dev/null +++ b/src/hardened_malloc/test/malloc_object_size_offset.c @@ -0,0 +1,12 @@ +#include +#include + +#include "test_util.h" + +size_t malloc_object_size(void *ptr); + +OPTNONE int main(void) { + char *p = malloc(16); + size_t size = malloc_object_size(p + 5); + return size != (SLAB_CANARY ? 19 : 27); +} diff --git a/src/hardened_malloc/test/offset.c b/src/hardened_malloc/test/offset.c new file mode 100644 index 0000000..af14f5c --- /dev/null +++ b/src/hardened_malloc/test/offset.c @@ -0,0 +1,50 @@ +#include +#include +#include +#include + +static size_t size_classes[] = { + /* large */ 4 * 1024 * 1024, + /* 0 */ 0, + /* 16 */ 16, 32, 48, 64, 80, 96, 112, 128, + /* 32 */ 160, 192, 224, 256, + /* 64 */ 320, 384, 448, 512, + /* 128 */ 640, 768, 896, 1024, + /* 256 */ 1280, 1536, 1792, 2048, + /* 512 */ 2560, 3072, 3584, 4096, + /* 1024 */ 5120, 6144, 7168, 8192, + /* 2048 */ 10240, 12288, 14336, 16384, +#if CONFIG_EXTENDED_SIZE_CLASSES + /* 4096 */ 20480, 24576, 28672, 32768, + /* 8192 */ 40960, 49152, 57344, 65536, + /* 16384 */ 81920, 98304, 114688, 131072, +#endif +}; + +#define N_SIZE_CLASSES (sizeof(size_classes) / sizeof(size_classes[0])) + +static const size_t canary_size = SLAB_CANARY ? 
sizeof(uint64_t) : 0; + +int main(void) { + for (unsigned i = 2; i < N_SIZE_CLASSES; i++) { + size_classes[i] -= canary_size; + } + + void *p[N_SIZE_CLASSES]; + for (unsigned i = 0; i < N_SIZE_CLASSES; i++) { + size_t size = size_classes[i]; + p[i] = malloc(size); + if (!p[i]) { + return 1; + } + void *q = malloc(size); + if (!q) { + return 1; + } + if (i != 0) { + printf("%zu to %zu: %zd\n", size_classes[i - 1], size, p[i] - p[i - 1]); + } + printf("%zu to %zu: %zd\n", size, size, q - p[i]); + } + return 0; +} diff --git a/src/hardened_malloc/test/overflow_large_1_byte.c b/src/hardened_malloc/test/overflow_large_1_byte.c new file mode 100644 index 0000000..a74bbfd --- /dev/null +++ b/src/hardened_malloc/test/overflow_large_1_byte.c @@ -0,0 +1,15 @@ +#include +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(256 * 1024); + if (!p) { + return 1; + } + size_t size = malloc_usable_size(p); + *(p + size) = 0; + free(p); + return 0; +} diff --git a/src/hardened_malloc/test/overflow_large_8_byte.c b/src/hardened_malloc/test/overflow_large_8_byte.c new file mode 100644 index 0000000..4c7d15c --- /dev/null +++ b/src/hardened_malloc/test/overflow_large_8_byte.c @@ -0,0 +1,15 @@ +#include +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(256 * 1024); + if (!p) { + return 1; + } + size_t size = malloc_usable_size(p); + *(p + size + 7) = 0; + free(p); + return 0; +} diff --git a/src/hardened_malloc/test/overflow_small_1_byte.c b/src/hardened_malloc/test/overflow_small_1_byte.c new file mode 100644 index 0000000..f4f60e1 --- /dev/null +++ b/src/hardened_malloc/test/overflow_small_1_byte.c @@ -0,0 +1,15 @@ +#include +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(8); + if (!p) { + return 1; + } + size_t size = malloc_usable_size(p); + *(p + size) = 1; + free(p); + return 0; +} diff --git a/src/hardened_malloc/test/overflow_small_8_byte.c b/src/hardened_malloc/test/overflow_small_8_byte.c new file mode 100644 index 0000000..4256d54 --- /dev/null +++ b/src/hardened_malloc/test/overflow_small_8_byte.c @@ -0,0 +1,16 @@ +#include +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(8); + if (!p) { + return 1; + } + size_t size = malloc_usable_size(p); + // XOR is used to avoid the test having a 1/256 chance to fail + *(p + size + 7) ^= 1; + free(p); + return 0; +} diff --git a/src/hardened_malloc/test/read_after_free_large.c b/src/hardened_malloc/test/read_after_free_large.c new file mode 100644 index 0000000..f5fa18c --- /dev/null +++ b/src/hardened_malloc/test/read_after_free_large.c @@ -0,0 +1,21 @@ +#include +#include +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(256 * 1024); + if (!p) { + return 1; + } + memset(p, 'a', 16); + free(p); + for (size_t i = 0; i < 256 * 1024; i++) { + printf("%x\n", p[i]); + if (p[i] != '\0') { + return 1; + } + } + return 0; +} diff --git a/src/hardened_malloc/test/read_after_free_small.c b/src/hardened_malloc/test/read_after_free_small.c new file mode 100644 index 0000000..2a969ab --- /dev/null +++ b/src/hardened_malloc/test/read_after_free_small.c @@ -0,0 +1,21 @@ +#include +#include +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(16); + if (!p) { + return 1; + } + memset(p, 'a', 16); + free(p); + for (size_t i = 0; i < 16; i++) { + printf("%x\n", p[i]); + if (p[i] != '\0') { + return 1; + } + } + return 0; +} diff --git a/src/hardened_malloc/test/read_zero_size.c 
b/src/hardened_malloc/test/read_zero_size.c new file mode 100644 index 0000000..53838f2 --- /dev/null +++ b/src/hardened_malloc/test/read_zero_size.c @@ -0,0 +1,13 @@ +#include +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(0); + if (!p) { + return 1; + } + printf("%c\n", *p); + return 0; +} diff --git a/src/hardened_malloc/test/realloc_init.c b/src/hardened_malloc/test/realloc_init.c new file mode 100644 index 0000000..01ec573 --- /dev/null +++ b/src/hardened_malloc/test/realloc_init.c @@ -0,0 +1,33 @@ +#include +#include + +static void *thread_func(void *arg) { + arg = realloc(arg, 1024); + if (!arg) { + exit(EXIT_FAILURE); + } + + free(arg); + + return NULL; +} + +int main(void) { + void *mem = realloc(NULL, 12); + if (!mem) { + return EXIT_FAILURE; + } + + pthread_t thread; + int r = pthread_create(&thread, NULL, thread_func, mem); + if (r != 0) { + return EXIT_FAILURE; + } + + r = pthread_join(thread, NULL); + if (r != 0) { + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} diff --git a/src/hardened_malloc/test/string_overflow.c b/src/hardened_malloc/test/string_overflow.c new file mode 100644 index 0000000..c2dda6d --- /dev/null +++ b/src/hardened_malloc/test/string_overflow.c @@ -0,0 +1,20 @@ +#include +#include +#include + +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(16); + if (!p) { + return 1; + } + + size_t size = malloc_usable_size(p); + memset(p, 'a', size); + printf("overflow by %zu bytes\n", strlen(p) - size); + + return 0; +} diff --git a/src/hardened_malloc/test/test_smc.py b/src/hardened_malloc/test/test_smc.py new file mode 100644 index 0000000..170278e --- /dev/null +++ b/src/hardened_malloc/test/test_smc.py @@ -0,0 +1,242 @@ +import os +import subprocess +import unittest + + +class TestSimpleMemoryCorruption(unittest.TestCase): + + @classmethod + def setUpClass(self): + self.dir = os.path.dirname(os.path.realpath(__file__)) + + def run_test(self, test_name): + sub = subprocess.Popen(self.dir + "/" + test_name, + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = sub.communicate() + return stdout, stderr, sub.returncode + + def test_delete_type_size_mismatch(self): + _stdout, stderr, returncode = self.run_test( + "delete_type_size_mismatch") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode( + "utf-8"), "fatal allocator error: sized deallocation mismatch (small)\n") + + def test_double_free_large_delayed(self): + _stdout, stderr, returncode = self.run_test( + "double_free_large_delayed") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: invalid free\n") + + def test_double_free_large(self): + _stdout, stderr, returncode = self.run_test("double_free_large") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: invalid free\n") + + def test_double_free_small_delayed(self): + _stdout, stderr, returncode = self.run_test( + "double_free_small_delayed") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: double free (quarantine)\n") + + def test_double_free_small(self): + _stdout, stderr, returncode = self.run_test("double_free_small") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: double free (quarantine)\n") + + def test_overflow_large_1_byte(self): + _stdout, _stderr, returncode = self.run_test( + "overflow_large_1_byte") + self.assertEqual(returncode, 
-11) + + def test_overflow_large_8_byte(self): + _stdout, _stderr, returncode = self.run_test( + "overflow_large_8_byte") + self.assertEqual(returncode, -11) + + def test_overflow_small_1_byte(self): + _stdout, stderr, returncode = self.run_test( + "overflow_small_1_byte") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: canary corrupted\n") + + def test_overflow_small_8_byte(self): + _stdout, stderr, returncode = self.run_test( + "overflow_small_8_byte") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: canary corrupted\n") + + def test_invalid_free_protected(self): + _stdout, stderr, returncode = self.run_test("invalid_free_protected") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: invalid free\n") + + def test_invalid_free_small_region_far(self): + _stdout, stderr, returncode = self.run_test( + "invalid_free_small_region_far") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode( + "utf-8"), "fatal allocator error: invalid free within a slab yet to be used\n") + + def test_invalid_free_small_region(self): + _stdout, stderr, returncode = self.run_test( + "invalid_free_small_region") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: double free\n") + + def test_invalid_free_unprotected(self): + _stdout, stderr, returncode = self.run_test("invalid_free_unprotected") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: invalid free\n") + + def test_invalid_malloc_usable_size_small_quarantene(self): + _stdout, stderr, returncode = self.run_test( + "invalid_malloc_usable_size_small_quarantine") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode( + "utf-8"), "fatal allocator error: invalid malloc_usable_size (quarantine)\n") + + def test_invalid_malloc_usable_size_small(self): + _stdout, stderr, returncode = self.run_test( + "invalid_malloc_usable_size_small") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode( + "utf-8"), "fatal allocator error: invalid malloc_usable_size\n") + + def test_read_after_free_large(self): + _stdout, _stderr, returncode = self.run_test("read_after_free_large") + self.assertEqual(returncode, -11) + + def test_read_after_free_small(self): + stdout, _stderr, returncode = self.run_test("read_after_free_small") + self.assertEqual(returncode, 0) + self.assertEqual(stdout.decode("utf-8"), + "0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n") + + def test_read_zero_size(self): + _stdout, _stderr, returncode = self.run_test("read_zero_size") + self.assertEqual(returncode, -11) + + def test_string_overflow(self): + stdout, _stderr, returncode = self.run_test("string_overflow") + self.assertEqual(returncode, 0) + self.assertEqual(stdout.decode("utf-8"), "overflow by 0 bytes\n") + + def test_unaligned_free_large(self): + _stdout, stderr, returncode = self.run_test("unaligned_free_large") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: invalid free\n") + + def test_unaligned_free_small(self): + _stdout, stderr, returncode = self.run_test("unaligned_free_small") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: invalid unaligned free\n") + + def test_unaligned_malloc_usable_size_small(self): + _stdout, stderr, returncode = self.run_test( + 
"unaligned_malloc_usable_size_small") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: invalid unaligned malloc_usable_size\n") + + def test_uninitialized_free(self): + _stdout, stderr, returncode = self.run_test("uninitialized_free") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: invalid free\n") + + def test_uninitialized_malloc_usable_size(self): + _stdout, stderr, returncode = self.run_test( + "uninitialized_malloc_usable_size") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: invalid malloc_usable_size\n") + + def test_uninitialized_realloc(self): + _stdout, stderr, returncode = self.run_test("uninitialized_realloc") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: invalid realloc\n") + + def test_write_after_free_large_reuse(self): + _stdout, _stderr, returncode = self.run_test( + "write_after_free_large_reuse") + self.assertEqual(returncode, -11) + + def test_write_after_free_large(self): + _stdout, _stderr, returncode = self.run_test("write_after_free_large") + self.assertEqual(returncode, -11) + + def test_write_after_free_small_reuse(self): + _stdout, stderr, returncode = self.run_test( + "write_after_free_small_reuse") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: detected write after free\n") + + def test_write_after_free_small(self): + _stdout, stderr, returncode = self.run_test("write_after_free_small") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode("utf-8"), + "fatal allocator error: detected write after free\n") + + def test_write_zero_size(self): + _stdout, _stderr, returncode = self.run_test("write_zero_size") + self.assertEqual(returncode, -11) + + def test_malloc_object_size(self): + _stdout, _stderr, returncode = self.run_test("malloc_object_size") + self.assertEqual(returncode, 0) + + def test_malloc_object_size_offset(self): + _stdout, _stderr, returncode = self.run_test( + "malloc_object_size_offset") + self.assertEqual(returncode, 0) + + def test_invalid_malloc_object_size_small(self): + _stdout, stderr, returncode = self.run_test( + "invalid_malloc_object_size_small") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode( + "utf-8"), "fatal allocator error: invalid malloc_object_size\n") + + def test_invalid_malloc_object_size_small_quarantine(self): + _stdout, stderr, returncode = self.run_test( + "invalid_malloc_object_size_small_quarantine") + self.assertEqual(returncode, -6) + self.assertEqual(stderr.decode( + "utf-8"), "fatal allocator error: invalid malloc_object_size (quarantine)\n") + + def test_impossibly_large_malloc(self): + _stdout, stderr, returncode = self.run_test( + "impossibly_large_malloc") + self.assertEqual(returncode, 0) + + def test_uninitialized_read_small(self): + _stdout, stderr, returncode = self.run_test( + "uninitialized_read_small") + self.assertEqual(returncode, 0) + + def test_uninitialized_read_large(self): + _stdout, stderr, returncode = self.run_test( + "uninitialized_read_large") + self.assertEqual(returncode, 0) + + def test_realloc_init(self): + _stdout, _stderr, returncode = self.run_test( + "realloc_init") + self.assertEqual(returncode, 0) + +if __name__ == '__main__': + unittest.main() diff --git a/src/hardened_malloc/test/test_util.h b/src/hardened_malloc/test/test_util.h new file mode 100644 index 0000000..d2d78a6 
--- /dev/null
+++ b/src/hardened_malloc/test/test_util.h
@@ -0,0 +1,10 @@
+#ifndef TEST_UTIL_H
+#define TEST_UTIL_H
+
+#ifdef __clang__
+#define OPTNONE __attribute__((optnone))
+#else
+#define OPTNONE __attribute__((optimize(0)))
+#endif
+
+#endif
diff --git a/src/hardened_malloc/test/unaligned_free_large.c b/src/hardened_malloc/test/unaligned_free_large.c
new file mode 100644
index 0000000..7c42347
--- /dev/null
+++ b/src/hardened_malloc/test/unaligned_free_large.c
@@ -0,0 +1,12 @@
+#include <stdlib.h>
+
+#include "test_util.h"
+
+OPTNONE int main(void) {
+    char *p = malloc(256 * 1024);
+    if (!p) {
+        return 1;
+    }
+    free(p + 1);
+    return 0;
+}
diff --git a/src/hardened_malloc/test/unaligned_free_small.c b/src/hardened_malloc/test/unaligned_free_small.c
new file mode 100644
index 0000000..25ca757
--- /dev/null
+++ b/src/hardened_malloc/test/unaligned_free_small.c
@@ -0,0 +1,12 @@
+#include <stdlib.h>
+
+#include "test_util.h"
+
+OPTNONE int main(void) {
+    char *p = malloc(16);
+    if (!p) {
+        return 1;
+    }
+    free(p + 1);
+    return 0;
+}
diff --git a/src/hardened_malloc/test/unaligned_malloc_usable_size_small.c b/src/hardened_malloc/test/unaligned_malloc_usable_size_small.c
new file mode 100644
index 0000000..c897c0d
--- /dev/null
+++ b/src/hardened_malloc/test/unaligned_malloc_usable_size_small.c
@@ -0,0 +1,12 @@
+#include <malloc.h>
+
+#include "test_util.h"
+
+OPTNONE int main(void) {
+    char *p = malloc(16);
+    if (!p) {
+        return 1;
+    }
+    malloc_usable_size(p + 1);
+    return 0;
+}
diff --git a/src/hardened_malloc/test/uninitialized_free.c b/src/hardened_malloc/test/uninitialized_free.c
new file mode 100644
index 0000000..1ba3fcf
--- /dev/null
+++ b/src/hardened_malloc/test/uninitialized_free.c
@@ -0,0 +1,8 @@
+#include <stdlib.h>
+
+#include "test_util.h"
+
+OPTNONE int main(void) {
+    free((void *)1);
+    return 0;
+}
diff --git a/src/hardened_malloc/test/uninitialized_malloc_usable_size.c b/src/hardened_malloc/test/uninitialized_malloc_usable_size.c
new file mode 100644
index 0000000..f2abfd1
--- /dev/null
+++ b/src/hardened_malloc/test/uninitialized_malloc_usable_size.c
@@ -0,0 +1,8 @@
+#include <malloc.h>
+
+#include "test_util.h"
+
+OPTNONE int main(void) {
+    malloc_usable_size((void *)1);
+    return 0;
+}
diff --git a/src/hardened_malloc/test/uninitialized_read_large.c b/src/hardened_malloc/test/uninitialized_read_large.c
new file mode 100644
index 0000000..03400ad
--- /dev/null
+++ b/src/hardened_malloc/test/uninitialized_read_large.c
@@ -0,0 +1,14 @@
+#include <stdlib.h>
+
+#include "test_util.h"
+
+OPTNONE int main(void) {
+    char *p = malloc(256 * 1024);
+    for (unsigned i = 0; i < 256 * 1024; i++) {
+        if (p[i] != 0) {
+            return 1;
+        }
+    }
+    free(p);
+    return 0;
+}
diff --git a/src/hardened_malloc/test/uninitialized_read_small.c b/src/hardened_malloc/test/uninitialized_read_small.c
new file mode 100644
index 0000000..92bdf10
--- /dev/null
+++ b/src/hardened_malloc/test/uninitialized_read_small.c
@@ -0,0 +1,14 @@
+#include <stdlib.h>
+
+#include "test_util.h"
+
+OPTNONE int main(void) {
+    char *p = malloc(8);
+    for (unsigned i = 0; i < 8; i++) {
+        if (p[i] != 0) {
+            return 1;
+        }
+    }
+    free(p);
+    return 0;
+}
diff --git a/src/hardened_malloc/test/uninitialized_realloc.c b/src/hardened_malloc/test/uninitialized_realloc.c
new file mode 100644
index 0000000..ef173f6
--- /dev/null
+++ b/src/hardened_malloc/test/uninitialized_realloc.c
@@ -0,0 +1,11 @@
+#include <stdlib.h>
+
+#include "test_util.h"
+
+OPTNONE int main(void) {
+    void *p = realloc((void *)1, 16);
+    if (!p) {
+        return 1;
+    }
+    return 0;
+}
diff --git a/src/hardened_malloc/test/write_after_free_large.c
b/src/hardened_malloc/test/write_after_free_large.c new file mode 100644 index 0000000..9561b9f --- /dev/null +++ b/src/hardened_malloc/test/write_after_free_large.c @@ -0,0 +1,13 @@ +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(256 * 1024); + if (!p) { + return 1; + } + free(p); + p[64 * 1024 + 1] = 'a'; + return 0; +} diff --git a/src/hardened_malloc/test/write_after_free_large_reuse.c b/src/hardened_malloc/test/write_after_free_large_reuse.c new file mode 100644 index 0000000..e802035 --- /dev/null +++ b/src/hardened_malloc/test/write_after_free_large_reuse.c @@ -0,0 +1,16 @@ +#include +#include + +#include "test_util.h" +#include "../util.h" + +OPTNONE int main(void) { + char *p = malloc(256 * 1024); + if (!p) { + return 1; + } + free(p); + UNUSED char *q = malloc(256 * 1024); + p[64 * 1024 + 1] = 'a'; + return 0; +} diff --git a/src/hardened_malloc/test/write_after_free_small.c b/src/hardened_malloc/test/write_after_free_small.c new file mode 100644 index 0000000..7850cd6 --- /dev/null +++ b/src/hardened_malloc/test/write_after_free_small.c @@ -0,0 +1,19 @@ +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(128); + if (!p) { + return 1; + } + free(p); + + p[65] = 'a'; + + // trigger reuse of the allocation + for (size_t i = 0; i < 100000; i++) { + free(malloc(128)); + } + return 0; +} diff --git a/src/hardened_malloc/test/write_after_free_small_reuse.c b/src/hardened_malloc/test/write_after_free_small_reuse.c new file mode 100644 index 0000000..3318a91 --- /dev/null +++ b/src/hardened_malloc/test/write_after_free_small_reuse.c @@ -0,0 +1,21 @@ +#include + +#include "test_util.h" +#include "../util.h" + +OPTNONE int main(void) { + char *p = malloc(128); + if (!p) { + return 1; + } + free(p); + UNUSED char *q = malloc(128); + + p[65] = 'a'; + + // trigger reuse of the allocation + for (size_t i = 0; i < 100000; i++) { + free(malloc(128)); + } + return 0; +} diff --git a/src/hardened_malloc/test/write_zero_size.c b/src/hardened_malloc/test/write_zero_size.c new file mode 100644 index 0000000..49d26ea --- /dev/null +++ b/src/hardened_malloc/test/write_zero_size.c @@ -0,0 +1,12 @@ +#include + +#include "test_util.h" + +OPTNONE int main(void) { + char *p = malloc(0); + if (!p) { + return 1; + } + *p = 5; + return 0; +} diff --git a/src/hardened_malloc/third_party/libdivide.h b/src/hardened_malloc/third_party/libdivide.h new file mode 100644 index 0000000..e9a31d1 --- /dev/null +++ b/src/hardened_malloc/third_party/libdivide.h @@ -0,0 +1,3126 @@ +// libdivide.h - Optimized integer division +// https://libdivide.com +// +// Copyright (C) 2010 - 2021 ridiculous_fish, +// Copyright (C) 2016 - 2021 Kim Walisch, +// +// libdivide is dual-licensed under the Boost or zlib licenses. +// You may use libdivide under the terms of either of these. +// See LICENSE.txt for more details. 
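+//
+// Usage sketch (illustrative, not from the upstream header): a divider is
+// generated once per divisor and then reused for many divisions, e.g.
+//
+//     struct libdivide_u32_t d = libdivide_u32_gen(3);  // precompute magic + shift
+//     uint32_t q = libdivide_u32_do(numer, &d);         // q == numer / 3
+//
+// The allocator sources in this patch build such u32/u64 dividers per size
+// class so that slab slot calculations avoid hardware division.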
+ +#ifndef LIBDIVIDE_H +#define LIBDIVIDE_H + +#define LIBDIVIDE_VERSION "5.0" +#define LIBDIVIDE_VERSION_MAJOR 5 +#define LIBDIVIDE_VERSION_MINOR 0 + +#include +#if !defined(__AVR__) +#include +#include +#endif + +#if defined(LIBDIVIDE_SSE2) +#include +#endif +#if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512) +#include +#endif +#if defined(LIBDIVIDE_NEON) +#include +#endif + +#if defined(_MSC_VER) +#include +#pragma warning(push) +// disable warning C4146: unary minus operator applied +// to unsigned type, result still unsigned +#pragma warning(disable : 4146) +// disable warning C4204: nonstandard extension used : non-constant aggregate +// initializer +// +// It's valid C99 +#pragma warning(disable : 4204) +#define LIBDIVIDE_VC +#endif + +#if !defined(__has_builtin) +#define __has_builtin(x) 0 +#endif + +#if defined(__SIZEOF_INT128__) +#define HAS_INT128_T +// clang-cl on Windows does not yet support 128-bit division +#if !(defined(__clang__) && defined(LIBDIVIDE_VC)) +#define HAS_INT128_DIV +#endif +#endif + +#if defined(__x86_64__) || defined(_M_X64) +#define LIBDIVIDE_X86_64 +#endif + +#if defined(__i386__) +#define LIBDIVIDE_i386 +#endif + +#if defined(__GNUC__) || defined(__clang__) +#define LIBDIVIDE_GCC_STYLE_ASM +#endif + +#if defined(__cplusplus) || defined(LIBDIVIDE_VC) +#define LIBDIVIDE_FUNCTION __FUNCTION__ +#else +#define LIBDIVIDE_FUNCTION __func__ +#endif + +// Set up forced inlining if possible. +// We need both the attribute and keyword to avoid "might not be inlineable" warnings. +#ifdef __has_attribute +#if __has_attribute(always_inline) +#define LIBDIVIDE_INLINE __attribute__((always_inline)) inline +#endif +#endif +#ifndef LIBDIVIDE_INLINE +#define LIBDIVIDE_INLINE inline +#endif + +#if defined(__AVR__) +#define LIBDIVIDE_ERROR(msg) +#else +#define LIBDIVIDE_ERROR(msg) \ + do { \ + fprintf(stderr, "libdivide.h:%d: %s(): Error: %s\n", __LINE__, LIBDIVIDE_FUNCTION, msg); \ + abort(); \ + } while (0) +#endif + +#if defined(LIBDIVIDE_ASSERTIONS_ON) && !defined(__AVR__) +#define LIBDIVIDE_ASSERT(x) \ + do { \ + if (!(x)) { \ + fprintf(stderr, "libdivide.h:%d: %s(): Assertion failed: %s\n", __LINE__, \ + LIBDIVIDE_FUNCTION, #x); \ + abort(); \ + } \ + } while (0) +#else +#define LIBDIVIDE_ASSERT(x) +#endif + +#ifdef __cplusplus +namespace libdivide { +#endif + +// pack divider structs to prevent compilers from padding. +// This reduces memory usage by up to 43% when using a large +// array of libdivide dividers and improves performance +// by up to 10% because of reduced memory bandwidth. 
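+// With this packing the divider structs below occupy 3 bytes (u16/s16),
+// 5 bytes (u32/s32) and 9 bytes (u64/s64) instead of being rounded up to the
+// alignment of their magic field.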
+#pragma pack(push, 1) + +struct libdivide_u16_t { + uint16_t magic; + uint8_t more; +}; + +struct libdivide_s16_t { + int16_t magic; + uint8_t more; +}; + +struct libdivide_u32_t { + uint32_t magic; + uint8_t more; +}; + +struct libdivide_s32_t { + int32_t magic; + uint8_t more; +}; + +struct libdivide_u64_t { + uint64_t magic; + uint8_t more; +}; + +struct libdivide_s64_t { + int64_t magic; + uint8_t more; +}; + +struct libdivide_u16_branchfree_t { + uint16_t magic; + uint8_t more; +}; + +struct libdivide_s16_branchfree_t { + int16_t magic; + uint8_t more; +}; + +struct libdivide_u32_branchfree_t { + uint32_t magic; + uint8_t more; +}; + +struct libdivide_s32_branchfree_t { + int32_t magic; + uint8_t more; +}; + +struct libdivide_u64_branchfree_t { + uint64_t magic; + uint8_t more; +}; + +struct libdivide_s64_branchfree_t { + int64_t magic; + uint8_t more; +}; + +#pragma pack(pop) + +// Explanation of the "more" field: +// +// * Bits 0-5 is the shift value (for shift path or mult path). +// * Bit 6 is the add indicator for mult path. +// * Bit 7 is set if the divisor is negative. We use bit 7 as the negative +// divisor indicator so that we can efficiently use sign extension to +// create a bitmask with all bits set to 1 (if the divisor is negative) +// or 0 (if the divisor is positive). +// +// u32: [0-4] shift value +// [5] ignored +// [6] add indicator +// magic number of 0 indicates shift path +// +// s32: [0-4] shift value +// [5] ignored +// [6] add indicator +// [7] indicates negative divisor +// magic number of 0 indicates shift path +// +// u64: [0-5] shift value +// [6] add indicator +// magic number of 0 indicates shift path +// +// s64: [0-5] shift value +// [6] add indicator +// [7] indicates negative divisor +// magic number of 0 indicates shift path +// +// In s32 and s64 branchfree modes, the magic number is negated according to +// whether the divisor is negated. In branchfree strategy, it is not negated. 
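+//
+// Decoding sketch (illustrative, mirroring libdivide_u32_do further below):
+//
+//     if (magic == 0) {
+//         q = numer >> more;                         // shift path
+//     } else {
+//         uint32_t t = libdivide_mullhi_u32(magic, numer);
+//         if (more & LIBDIVIDE_ADD_MARKER) {         // "add" path for a 33-bit magic
+//             q = (((numer - t) >> 1) + t) >> (more & LIBDIVIDE_32_SHIFT_MASK);
+//         } else {
+//             q = t >> more;
+//         }
+//     }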
+ +enum { + LIBDIVIDE_16_SHIFT_MASK = 0x1F, + LIBDIVIDE_32_SHIFT_MASK = 0x1F, + LIBDIVIDE_64_SHIFT_MASK = 0x3F, + LIBDIVIDE_ADD_MARKER = 0x40, + LIBDIVIDE_NEGATIVE_DIVISOR = 0x80 +}; + +static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_s16_gen(int16_t d); +static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_u16_gen(uint16_t d); +static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_s32_gen(int32_t d); +static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_u32_gen(uint32_t d); +static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_s64_gen(int64_t d); +static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_u64_gen(uint64_t d); + +static LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d); +static LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d); +static LIBDIVIDE_INLINE struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d); +static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d); +static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d); +static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d); + +static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw( + int16_t numer, int16_t magic, uint8_t more); +static LIBDIVIDE_INLINE int16_t libdivide_s16_do( + int16_t numer, const struct libdivide_s16_t* denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw( + uint16_t numer, uint16_t magic, uint8_t more); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_do( + uint16_t numer, const struct libdivide_u16_t* denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_do( + int32_t numer, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE uint32_t libdivide_u32_do( + uint32_t numer, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE int64_t libdivide_s64_do( + int64_t numer, const struct libdivide_s64_t *denom); +static LIBDIVIDE_INLINE uint64_t libdivide_u64_do( + uint64_t numer, const struct libdivide_u64_t *denom); + +static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do( + int16_t numer, const struct libdivide_s16_branchfree_t* denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do( + uint16_t numer, const struct libdivide_u16_branchfree_t* denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do( + int32_t numer, const struct libdivide_s32_branchfree_t *denom); +static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do( + uint32_t numer, const struct libdivide_u32_branchfree_t *denom); +static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do( + int64_t numer, const struct libdivide_s64_branchfree_t *denom); +static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do( + uint64_t numer, const struct libdivide_u64_branchfree_t *denom); + +static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t* denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t* denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom); +static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom); + +static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover( + const struct 
libdivide_s16_branchfree_t* denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover( + const struct libdivide_u16_branchfree_t* denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover( + const struct libdivide_s32_branchfree_t *denom); +static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover( + const struct libdivide_u32_branchfree_t *denom); +static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_recover( + const struct libdivide_s64_branchfree_t *denom); +static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover( + const struct libdivide_u64_branchfree_t *denom); + +//////// Internal Utility Functions + +static LIBDIVIDE_INLINE uint16_t libdivide_mullhi_u16(uint16_t x, uint16_t y) { + uint32_t xl = x, yl = y; + uint32_t rl = xl * yl; + return (uint16_t)(rl >> 16); +} + +static LIBDIVIDE_INLINE int16_t libdivide_mullhi_s16(int16_t x, int16_t y) { + int32_t xl = x, yl = y; + int32_t rl = xl * yl; + // needs to be arithmetic shift + return (int16_t)(rl >> 16); +} + +static LIBDIVIDE_INLINE uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) { + uint64_t xl = x, yl = y; + uint64_t rl = xl * yl; + return (uint32_t)(rl >> 32); +} + +static LIBDIVIDE_INLINE int32_t libdivide_mullhi_s32(int32_t x, int32_t y) { + int64_t xl = x, yl = y; + int64_t rl = xl * yl; + // needs to be arithmetic shift + return (int32_t)(rl >> 32); +} + +static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) { +#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64) + return __umulh(x, y); +#elif defined(HAS_INT128_T) + __uint128_t xl = x, yl = y; + __uint128_t rl = xl * yl; + return (uint64_t)(rl >> 64); +#else + // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64) + uint32_t mask = 0xFFFFFFFF; + uint32_t x0 = (uint32_t)(x & mask); + uint32_t x1 = (uint32_t)(x >> 32); + uint32_t y0 = (uint32_t)(y & mask); + uint32_t y1 = (uint32_t)(y >> 32); + uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0); + uint64_t x0y1 = x0 * (uint64_t)y1; + uint64_t x1y0 = x1 * (uint64_t)y0; + uint64_t x1y1 = x1 * (uint64_t)y1; + uint64_t temp = x1y0 + x0y0_hi; + uint64_t temp_lo = temp & mask; + uint64_t temp_hi = temp >> 32; + + return x1y1 + temp_hi + ((temp_lo + x0y1) >> 32); +#endif +} + +static LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) { +#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64) + return __mulh(x, y); +#elif defined(HAS_INT128_T) + __int128_t xl = x, yl = y; + __int128_t rl = xl * yl; + return (int64_t)(rl >> 64); +#else + // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64) + uint32_t mask = 0xFFFFFFFF; + uint32_t x0 = (uint32_t)(x & mask); + uint32_t y0 = (uint32_t)(y & mask); + int32_t x1 = (int32_t)(x >> 32); + int32_t y1 = (int32_t)(y >> 32); + uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0); + int64_t t = x1 * (int64_t)y0 + x0y0_hi; + int64_t w1 = x0 * (int64_t)y1 + (t & mask); + + return x1 * (int64_t)y1 + (t >> 32) + (w1 >> 32); +#endif +} + +static LIBDIVIDE_INLINE int16_t libdivide_count_leading_zeros16(uint16_t val) { +#if defined(__AVR__) + // Fast way to count leading zeros + // On the AVR 8-bit architecture __builtin_clz() works on a int16_t. 
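+    // (int is only 16 bits wide on AVR, so no offset correction is needed here,
+    // unlike the "- 16" adjustment used for 32-bit int targets below)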
+ return __builtin_clz(val); +#elif defined(__GNUC__) || __has_builtin(__builtin_clz) + // Fast way to count leading zeros + return __builtin_clz(val) - 16; +#elif defined(LIBDIVIDE_VC) + unsigned long result; + if (_BitScanReverse(&result, (unsigned long)val)) { + return (int16_t)(15 - result); + } + return 0; +#else + if (val == 0) return 16; + int16_t result = 4; + uint16_t hi = 0xFU << 12; + while ((val & hi) == 0) { + hi >>= 4; + result += 4; + } + while (val & hi) { + result -= 1; + hi <<= 1; + } + return result; +#endif +} + +static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros32(uint32_t val) { +#if defined(__AVR__) + // Fast way to count leading zeros + return __builtin_clzl(val); +#elif defined(__GNUC__) || __has_builtin(__builtin_clz) + // Fast way to count leading zeros + return __builtin_clz(val); +#elif defined(LIBDIVIDE_VC) + unsigned long result; + if (_BitScanReverse(&result, val)) { + return 31 - result; + } + return 0; +#else + if (val == 0) return 32; + int32_t result = 8; + uint32_t hi = 0xFFU << 24; + while ((val & hi) == 0) { + hi >>= 8; + result += 8; + } + while (val & hi) { + result -= 1; + hi <<= 1; + } + return result; +#endif +} + +static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros64(uint64_t val) { +#if defined(__GNUC__) || __has_builtin(__builtin_clzll) + // Fast way to count leading zeros + return __builtin_clzll(val); +#elif defined(LIBDIVIDE_VC) && defined(_WIN64) + unsigned long result; + if (_BitScanReverse64(&result, val)) { + return 63 - result; + } + return 0; +#else + uint32_t hi = val >> 32; + uint32_t lo = val & 0xFFFFFFFF; + if (hi != 0) return libdivide_count_leading_zeros32(hi); + return 32 + libdivide_count_leading_zeros32(lo); +#endif +} + +// libdivide_32_div_16_to_16: divides a 32-bit uint {u1, u0} by a 16-bit +// uint {v}. The result must fit in 16 bits. +// Returns the quotient directly and the remainder in *r +static LIBDIVIDE_INLINE uint16_t libdivide_32_div_16_to_16( + uint16_t u1, uint16_t u0, uint16_t v, uint16_t* r) { + uint32_t n = ((uint32_t)u1 << 16) | u0; + uint16_t result = (uint16_t)(n / v); + *r = (uint16_t)(n - result * (uint32_t)v); + return result; +} + +// libdivide_64_div_32_to_32: divides a 64-bit uint {u1, u0} by a 32-bit +// uint {v}. The result must fit in 32 bits. +// Returns the quotient directly and the remainder in *r +static LIBDIVIDE_INLINE uint32_t libdivide_64_div_32_to_32( + uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) { +#if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && defined(LIBDIVIDE_GCC_STYLE_ASM) + uint32_t result; + __asm__("divl %[v]" : "=a"(result), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1)); + return result; +#else + uint64_t n = ((uint64_t)u1 << 32) | u0; + uint32_t result = (uint32_t)(n / v); + *r = (uint32_t)(n - result * (uint64_t)v); + return result; +#endif +} + +// libdivide_128_div_64_to_64: divides a 128-bit uint {numhi, numlo} by a 64-bit uint {den}. The +// result must fit in 64 bits. Returns the quotient directly and the remainder in *r +static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64( + uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) { + // N.B. resist the temptation to use __uint128_t here. + // In LLVM compiler-rt, it performs a 128/128 -> 128 division which is many times slower than + // necessary. In gcc it's better but still slower than the divlu implementation, perhaps because + // it's not LIBDIVIDE_INLINEd. 
+#if defined(LIBDIVIDE_X86_64) && defined(LIBDIVIDE_GCC_STYLE_ASM) + uint64_t result; + __asm__("divq %[v]" : "=a"(result), "=d"(*r) : [v] "r"(den), "a"(numlo), "d"(numhi)); + return result; +#else + // We work in base 2**32. + // A uint32 holds a single digit. A uint64 holds two digits. + // Our numerator is conceptually [num3, num2, num1, num0]. + // Our denominator is [den1, den0]. + const uint64_t b = ((uint64_t)1 << 32); + + // The high and low digits of our computed quotient. + uint32_t q1; + uint32_t q0; + + // The normalization shift factor. + int shift; + + // The high and low digits of our denominator (after normalizing). + // Also the low 2 digits of our numerator (after normalizing). + uint32_t den1; + uint32_t den0; + uint32_t num1; + uint32_t num0; + + // A partial remainder. + uint64_t rem; + + // The estimated quotient, and its corresponding remainder (unrelated to true remainder). + uint64_t qhat; + uint64_t rhat; + + // Variables used to correct the estimated quotient. + uint64_t c1; + uint64_t c2; + + // Check for overflow and divide by 0. + if (numhi >= den) { + if (r != NULL) *r = ~0ull; + return ~0ull; + } + + // Determine the normalization factor. We multiply den by this, so that its leading digit is at + // least half b. In binary this means just shifting left by the number of leading zeros, so that + // there's a 1 in the MSB. + // We also shift numer by the same amount. This cannot overflow because numhi < den. + // The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting + // by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is + // 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. The sequence below avoids it. + shift = libdivide_count_leading_zeros64(den); + den <<= shift; + numhi <<= shift; + numhi |= (numlo >> (-shift & 63)) & (-(int64_t)shift >> 63); + numlo <<= shift; + + // Extract the low digits of the numerator and both digits of the denominator. + num1 = (uint32_t)(numlo >> 32); + num0 = (uint32_t)(numlo & 0xFFFFFFFFu); + den1 = (uint32_t)(den >> 32); + den0 = (uint32_t)(den & 0xFFFFFFFFu); + + // We wish to compute q1 = [n3 n2 n1] / [d1 d0]. + // Estimate q1 as [n3 n2] / [d1], and then correct it. + // Note while qhat may be 2 digits, q1 is always 1 digit. + qhat = numhi / den1; + rhat = numhi % den1; + c1 = qhat * den0; + c2 = rhat * b + num1; + if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1; + q1 = (uint32_t)qhat; + + // Compute the true (partial) remainder. + rem = numhi * b + num1 - q1 * den; + + // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0]. + // Estimate q0 as [rem1 rem0] / [d1] and correct it. + qhat = rem / den1; + rhat = rem % den1; + c1 = qhat * den0; + c2 = rhat * b + num0; + if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1; + q0 = (uint32_t)qhat; + + // Return remainder if requested. + if (r != NULL) *r = (rem * b + num0 - q0 * den) >> shift; + return ((uint64_t)q1 << 32) | q0; +#endif +} + +// Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0) +static LIBDIVIDE_INLINE void libdivide_u128_shift( + uint64_t *u1, uint64_t *u0, int32_t signed_shift) { + if (signed_shift > 0) { + uint32_t shift = signed_shift; + *u1 <<= shift; + *u1 |= *u0 >> (64 - shift); + *u0 <<= shift; + } else if (signed_shift < 0) { + uint32_t shift = -signed_shift; + *u0 >>= shift; + *u0 |= *u1 << (64 - shift); + *u1 >>= shift; + } +} + +// Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder. 
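+// Returns the quotient, which must fit in 64 bits, and writes the full 128-bit
+// remainder through *r_hi / *r_lo. The recover routines below only call this
+// with a 65-bit divisor (2^64 + magic), so the quotient always fits.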
+static LIBDIVIDE_INLINE uint64_t libdivide_128_div_128_to_64( + uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) { +#if defined(HAS_INT128_T) && defined(HAS_INT128_DIV) + __uint128_t ufull = u_hi; + __uint128_t vfull = v_hi; + ufull = (ufull << 64) | u_lo; + vfull = (vfull << 64) | v_lo; + uint64_t res = (uint64_t)(ufull / vfull); + __uint128_t remainder = ufull - (vfull * res); + *r_lo = (uint64_t)remainder; + *r_hi = (uint64_t)(remainder >> 64); + return res; +#else + // Adapted from "Unsigned Doubleword Division" in Hacker's Delight + // We want to compute u / v + typedef struct { + uint64_t hi; + uint64_t lo; + } u128_t; + u128_t u = {u_hi, u_lo}; + u128_t v = {v_hi, v_lo}; + + if (v.hi == 0) { + // divisor v is a 64 bit value, so we just need one 128/64 division + // Note that we are simpler than Hacker's Delight here, because we know + // the quotient fits in 64 bits whereas Hacker's Delight demands a full + // 128 bit quotient + *r_hi = 0; + return libdivide_128_div_64_to_64(u.hi, u.lo, v.lo, r_lo); + } + // Here v >= 2**64 + // We know that v.hi != 0, so count leading zeros is OK + // We have 0 <= n <= 63 + uint32_t n = libdivide_count_leading_zeros64(v.hi); + + // Normalize the divisor so its MSB is 1 + u128_t v1t = v; + libdivide_u128_shift(&v1t.hi, &v1t.lo, n); + uint64_t v1 = v1t.hi; // i.e. v1 = v1t >> 64 + + // To ensure no overflow + u128_t u1 = u; + libdivide_u128_shift(&u1.hi, &u1.lo, -1); + + // Get quotient from divide unsigned insn. + uint64_t rem_ignored; + uint64_t q1 = libdivide_128_div_64_to_64(u1.hi, u1.lo, v1, &rem_ignored); + + // Undo normalization and division of u by 2. + u128_t q0 = {0, q1}; + libdivide_u128_shift(&q0.hi, &q0.lo, n); + libdivide_u128_shift(&q0.hi, &q0.lo, -63); + + // Make q0 correct or too small by 1 + // Equivalent to `if (q0 != 0) q0 = q0 - 1;` + if (q0.hi != 0 || q0.lo != 0) { + q0.hi -= (q0.lo == 0); // borrow + q0.lo -= 1; + } + + // Now q0 is correct. + // Compute q0 * v as q0v + // = (q0.hi << 64 + q0.lo) * (v.hi << 64 + v.lo) + // = (q0.hi * v.hi << 128) + (q0.hi * v.lo << 64) + + // (q0.lo * v.hi << 64) + q0.lo * v.lo) + // Each term is 128 bit + // High half of full product (upper 128 bits!) are dropped + u128_t q0v = {0, 0}; + q0v.hi = q0.hi * v.lo + q0.lo * v.hi + libdivide_mullhi_u64(q0.lo, v.lo); + q0v.lo = q0.lo * v.lo; + + // Compute u - q0v as u_q0v + // This is the remainder + u128_t u_q0v = u; + u_q0v.hi -= q0v.hi + (u.lo < q0v.lo); // second term is borrow + u_q0v.lo -= q0v.lo; + + // Check if u_q0v >= v + // This checks if our remainder is larger than the divisor + if ((u_q0v.hi > v.hi) || (u_q0v.hi == v.hi && u_q0v.lo >= v.lo)) { + // Increment q0 + q0.lo += 1; + q0.hi += (q0.lo == 0); // carry + + // Subtract v from remainder + u_q0v.hi -= v.hi + (u_q0v.lo < v.lo); + u_q0v.lo -= v.lo; + } + + *r_hi = u_q0v.hi; + *r_lo = u_q0v.lo; + + LIBDIVIDE_ASSERT(q0.hi == 0); + return q0.lo; +#endif +} + +////////// UINT16 + +static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen( + uint16_t d, int branchfree) { + if (d == 0) { + LIBDIVIDE_ERROR("divider must be != 0"); + } + + struct libdivide_u16_t result; + uint8_t floor_log_2_d = (uint8_t)(15 - libdivide_count_leading_zeros16(d)); + + // Power of 2 + if ((d & (d - 1)) == 0) { + // We need to subtract 1 from the shift value in case of an unsigned + // branchfree divider because there is a hardcoded right shift by 1 + // in its division algorithm. 
Because of this we also need to add back + // 1 in its recovery algorithm. + result.magic = 0; + result.more = (uint8_t)(floor_log_2_d - (branchfree != 0)); + } + else { + uint8_t more; + uint16_t rem, proposed_m; + proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << floor_log_2_d, 0, d, &rem); + + LIBDIVIDE_ASSERT(rem > 0 && rem < d); + const uint16_t e = d - rem; + + // This power works if e < 2**floor_log_2_d. + if (!branchfree && (e < ((uint16_t)1 << floor_log_2_d))) { + // This power works + more = floor_log_2_d; + } + else { + // We have to use the general 17-bit algorithm. We need to compute + // (2**power) / d. However, we already have (2**(power-1))/d and + // its remainder. By doubling both, and then correcting the + // remainder, we can compute the larger division. + // don't care about overflow here - in fact, we expect it + proposed_m += proposed_m; + const uint16_t twice_rem = rem + rem; + if (twice_rem >= d || twice_rem < rem) proposed_m += 1; + more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + } + result.magic = 1 + proposed_m; + result.more = more; + // result.more's shift should in general be ceil_log_2_d. But if we + // used the smaller power, we subtract one from the shift because we're + // using the smaller power. If we're using the larger power, we + // subtract one from the shift because it's taken care of by the add + // indicator. So floor_log_2_d happens to be correct in both cases. + } + return result; +} + +struct libdivide_u16_t libdivide_u16_gen(uint16_t d) { + return libdivide_internal_u16_gen(d, 0); +} + +struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) { + if (d == 1) { + LIBDIVIDE_ERROR("branchfree divider must be != 1"); + } + struct libdivide_u16_t tmp = libdivide_internal_u16_gen(d, 1); + struct libdivide_u16_branchfree_t ret = { + tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK) }; + return ret; +} + +// The original libdivide_u16_do takes a const pointer. However, this cannot be used +// with a compile time constant libdivide_u16_t: it will generate a warning about +// taking the address of a temporary. Hence this overload. +uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) { + if (!magic) { + return numer >> more; + } + else { + uint16_t q = libdivide_mullhi_u16(magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + uint16_t t = ((numer - q) >> 1) + q; + return t >> (more & LIBDIVIDE_16_SHIFT_MASK); + } + else { + // All upper bits are 0, + // don't need to mask them off. + return q >> more; + } + } +} + +uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t* denom) { + return libdivide_u16_do_raw(numer, denom->magic, denom->more); +} + +uint16_t libdivide_u16_branchfree_do( + uint16_t numer, const struct libdivide_u16_branchfree_t* denom) { + uint16_t q = libdivide_mullhi_u16(denom->magic, numer); + uint16_t t = ((numer - q) >> 1) + q; + return t >> denom->more; +} + +uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + + if (!denom->magic) { + return (uint16_t)1 << shift; + } else if (!(more & LIBDIVIDE_ADD_MARKER)) { + // We compute q = n/d = n*m / 2^(16 + shift) + // Therefore we have d = 2^(16 + shift) / m + // We need to ceil it. 
+ // We know d is not a power of 2, so m is not a power of 2, + // so we can just add 1 to the floor + uint16_t hi_dividend = (uint16_t)1 << shift; + uint16_t rem_ignored; + return 1 + libdivide_32_div_16_to_16(hi_dividend, 0, denom->magic, &rem_ignored); + } else { + // Here we wish to compute d = 2^(16+shift+1)/(m+2^16). + // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now + // Also note that shift may be as high as 15, so shift + 1 will + // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and + // then double the quotient and remainder. + uint32_t half_n = (uint32_t)1 << (16 + shift); + uint32_t d = ( (uint32_t)1 << 16) | denom->magic; + // Note that the quotient is guaranteed <= 16 bits, but the remainder + // may need 17! + uint16_t half_q = (uint16_t)(half_n / d); + uint32_t rem = half_n % d; + // We computed 2^(16+shift)/(m+2^16) + // Need to double it, and then add 1 to the quotient if doubling th + // remainder would increase the quotient. + // Note that rem<<1 cannot overflow, since rem < d and d is 17 bits + uint16_t full_q = half_q + half_q + ((rem << 1) >= d); + + // We rounded down in gen (hence +1) + return full_q + 1; + } +} + +uint16_t libdivide_u16_branchfree_recover(const struct libdivide_u16_branchfree_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + + if (!denom->magic) { + return (uint16_t)1 << (shift + 1); + } else { + // Here we wish to compute d = 2^(16+shift+1)/(m+2^16). + // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now + // Also note that shift may be as high as 15, so shift + 1 will + // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and + // then double the quotient and remainder. + uint32_t half_n = (uint32_t)1 << (16 + shift); + uint32_t d = ((uint32_t)1 << 16) | denom->magic; + // Note that the quotient is guaranteed <= 16 bits, but the remainder + // may need 17! + uint16_t half_q = (uint16_t)(half_n / d); + uint32_t rem = half_n % d; + // We computed 2^(16+shift)/(m+2^16) + // Need to double it, and then add 1 to the quotient if doubling th + // remainder would increase the quotient. + // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits + uint16_t full_q = half_q + half_q + ((rem << 1) >= d); + + // We rounded down in gen (hence +1) + return full_q + 1; + } +} + +////////// UINT32 + +static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_internal_u32_gen( + uint32_t d, int branchfree) { + if (d == 0) { + LIBDIVIDE_ERROR("divider must be != 0"); + } + + struct libdivide_u32_t result; + uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(d); + + // Power of 2 + if ((d & (d - 1)) == 0) { + // We need to subtract 1 from the shift value in case of an unsigned + // branchfree divider because there is a hardcoded right shift by 1 + // in its division algorithm. Because of this we also need to add back + // 1 in its recovery algorithm. + result.magic = 0; + result.more = (uint8_t)(floor_log_2_d - (branchfree != 0)); + } else { + uint8_t more; + uint32_t rem, proposed_m; + proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << floor_log_2_d, 0, d, &rem); + + LIBDIVIDE_ASSERT(rem > 0 && rem < d); + const uint32_t e = d - rem; + + // This power works if e < 2**floor_log_2_d. + if (!branchfree && (e < ((uint32_t)1 << floor_log_2_d))) { + // This power works + more = (uint8_t)floor_log_2_d; + } else { + // We have to use the general 33-bit algorithm. We need to compute + // (2**power) / d. 
However, we already have (2**(power-1))/d and + // its remainder. By doubling both, and then correcting the + // remainder, we can compute the larger division. + // don't care about overflow here - in fact, we expect it + proposed_m += proposed_m; + const uint32_t twice_rem = rem + rem; + if (twice_rem >= d || twice_rem < rem) proposed_m += 1; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); + } + result.magic = 1 + proposed_m; + result.more = more; + // result.more's shift should in general be ceil_log_2_d. But if we + // used the smaller power, we subtract one from the shift because we're + // using the smaller power. If we're using the larger power, we + // subtract one from the shift because it's taken care of by the add + // indicator. So floor_log_2_d happens to be correct in both cases. + } + return result; +} + +struct libdivide_u32_t libdivide_u32_gen(uint32_t d) { + return libdivide_internal_u32_gen(d, 0); +} + +struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) { + if (d == 1) { + LIBDIVIDE_ERROR("branchfree divider must be != 1"); + } + struct libdivide_u32_t tmp = libdivide_internal_u32_gen(d, 1); + struct libdivide_u32_branchfree_t ret = { + tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_32_SHIFT_MASK)}; + return ret; +} + +uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return numer >> more; + } else { + uint32_t q = libdivide_mullhi_u32(denom->magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + uint32_t t = ((numer - q) >> 1) + q; + return t >> (more & LIBDIVIDE_32_SHIFT_MASK); + } else { + // All upper bits are 0, + // don't need to mask them off. + return q >> more; + } + } +} + +uint32_t libdivide_u32_branchfree_do( + uint32_t numer, const struct libdivide_u32_branchfree_t *denom) { + uint32_t q = libdivide_mullhi_u32(denom->magic, numer); + uint32_t t = ((numer - q) >> 1) + q; + return t >> denom->more; +} + +uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + + if (!denom->magic) { + return (uint32_t)1 << shift; + } else if (!(more & LIBDIVIDE_ADD_MARKER)) { + // We compute q = n/d = n*m / 2^(32 + shift) + // Therefore we have d = 2^(32 + shift) / m + // We need to ceil it. + // We know d is not a power of 2, so m is not a power of 2, + // so we can just add 1 to the floor + uint32_t hi_dividend = (uint32_t)1 << shift; + uint32_t rem_ignored; + return 1 + libdivide_64_div_32_to_32(hi_dividend, 0, denom->magic, &rem_ignored); + } else { + // Here we wish to compute d = 2^(32+shift+1)/(m+2^32). + // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now + // Also note that shift may be as high as 31, so shift + 1 will + // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and + // then double the quotient and remainder. + uint64_t half_n = (uint64_t)1 << (32 + shift); + uint64_t d = ((uint64_t)1 << 32) | denom->magic; + // Note that the quotient is guaranteed <= 32 bits, but the remainder + // may need 33! + uint32_t half_q = (uint32_t)(half_n / d); + uint64_t rem = half_n % d; + // We computed 2^(32+shift)/(m+2^32) + // Need to double it, and then add 1 to the quotient if doubling th + // remainder would increase the quotient. 
+ // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits + uint32_t full_q = half_q + half_q + ((rem << 1) >= d); + + // We rounded down in gen (hence +1) + return full_q + 1; + } +} + +uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + + if (!denom->magic) { + return (uint32_t)1 << (shift + 1); + } else { + // Here we wish to compute d = 2^(32+shift+1)/(m+2^32). + // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now + // Also note that shift may be as high as 31, so shift + 1 will + // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and + // then double the quotient and remainder. + uint64_t half_n = (uint64_t)1 << (32 + shift); + uint64_t d = ((uint64_t)1 << 32) | denom->magic; + // Note that the quotient is guaranteed <= 32 bits, but the remainder + // may need 33! + uint32_t half_q = (uint32_t)(half_n / d); + uint64_t rem = half_n % d; + // We computed 2^(32+shift)/(m+2^32) + // Need to double it, and then add 1 to the quotient if doubling th + // remainder would increase the quotient. + // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits + uint32_t full_q = half_q + half_q + ((rem << 1) >= d); + + // We rounded down in gen (hence +1) + return full_q + 1; + } +} + +/////////// UINT64 + +static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_internal_u64_gen( + uint64_t d, int branchfree) { + if (d == 0) { + LIBDIVIDE_ERROR("divider must be != 0"); + } + + struct libdivide_u64_t result; + uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(d); + + // Power of 2 + if ((d & (d - 1)) == 0) { + // We need to subtract 1 from the shift value in case of an unsigned + // branchfree divider because there is a hardcoded right shift by 1 + // in its division algorithm. Because of this we also need to add back + // 1 in its recovery algorithm. + result.magic = 0; + result.more = (uint8_t)(floor_log_2_d - (branchfree != 0)); + } else { + uint64_t proposed_m, rem; + uint8_t more; + // (1 << (64 + floor_log_2_d)) / d + proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << floor_log_2_d, 0, d, &rem); + + LIBDIVIDE_ASSERT(rem > 0 && rem < d); + const uint64_t e = d - rem; + + // This power works if e < 2**floor_log_2_d. + if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) { + // This power works + more = (uint8_t)floor_log_2_d; + } else { + // We have to use the general 65-bit algorithm. We need to compute + // (2**power) / d. However, we already have (2**(power-1))/d and + // its remainder. By doubling both, and then correcting the + // remainder, we can compute the larger division. + // don't care about overflow here - in fact, we expect it + proposed_m += proposed_m; + const uint64_t twice_rem = rem + rem; + if (twice_rem >= d || twice_rem < rem) proposed_m += 1; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); + } + result.magic = 1 + proposed_m; + result.more = more; + // result.more's shift should in general be ceil_log_2_d. But if we + // used the smaller power, we subtract one from the shift because we're + // using the smaller power. If we're using the larger power, we + // subtract one from the shift because it's taken care of by the add + // indicator. So floor_log_2_d happens to be correct in both cases, + // which is why we do it outside of the if statement. 
+ } + return result; +} + +struct libdivide_u64_t libdivide_u64_gen(uint64_t d) { + return libdivide_internal_u64_gen(d, 0); +} + +struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) { + if (d == 1) { + LIBDIVIDE_ERROR("branchfree divider must be != 1"); + } + struct libdivide_u64_t tmp = libdivide_internal_u64_gen(d, 1); + struct libdivide_u64_branchfree_t ret = { + tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_64_SHIFT_MASK)}; + return ret; +} + +uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return numer >> more; + } else { + uint64_t q = libdivide_mullhi_u64(denom->magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + uint64_t t = ((numer - q) >> 1) + q; + return t >> (more & LIBDIVIDE_64_SHIFT_MASK); + } else { + // All upper bits are 0, + // don't need to mask them off. + return q >> more; + } + } +} + +uint64_t libdivide_u64_branchfree_do( + uint64_t numer, const struct libdivide_u64_branchfree_t *denom) { + uint64_t q = libdivide_mullhi_u64(denom->magic, numer); + uint64_t t = ((numer - q) >> 1) + q; + return t >> denom->more; +} + +uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + + if (!denom->magic) { + return (uint64_t)1 << shift; + } else if (!(more & LIBDIVIDE_ADD_MARKER)) { + // We compute q = n/d = n*m / 2^(64 + shift) + // Therefore we have d = 2^(64 + shift) / m + // We need to ceil it. + // We know d is not a power of 2, so m is not a power of 2, + // so we can just add 1 to the floor + uint64_t hi_dividend = (uint64_t)1 << shift; + uint64_t rem_ignored; + return 1 + libdivide_128_div_64_to_64(hi_dividend, 0, denom->magic, &rem_ignored); + } else { + // Here we wish to compute d = 2^(64+shift+1)/(m+2^64). + // Notice (m + 2^64) is a 65 bit number. This gets hairy. See + // libdivide_u32_recover for more on what we do here. + // TODO: do something better than 128 bit math + + // Full n is a (potentially) 129 bit value + // half_n is a 128 bit value + // Compute the hi half of half_n. Low half is 0. + uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0; + // d is a 65 bit value. The high bit is always set to 1. + const uint64_t d_hi = 1, d_lo = denom->magic; + // Note that the quotient is guaranteed <= 64 bits, + // but the remainder may need 65! + uint64_t r_hi, r_lo; + uint64_t half_q = + libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); + // We computed 2^(64+shift)/(m+2^64) + // Double the remainder ('dr') and check if that is larger than d + // Note that d is a 65 bit value, so r1 is small and so r1 + r1 + // cannot overflow + uint64_t dr_lo = r_lo + r_lo; + uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry + int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo); + uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0); + return full_q + 1; + } +} + +uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + + if (!denom->magic) { + return (uint64_t)1 << (shift + 1); + } else { + // Here we wish to compute d = 2^(64+shift+1)/(m+2^64). + // Notice (m + 2^64) is a 65 bit number. This gets hairy. See + // libdivide_u32_recover for more on what we do here. 
+ // TODO: do something better than 128 bit math + + // Full n is a (potentially) 129 bit value + // half_n is a 128 bit value + // Compute the hi half of half_n. Low half is 0. + uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0; + // d is a 65 bit value. The high bit is always set to 1. + const uint64_t d_hi = 1, d_lo = denom->magic; + // Note that the quotient is guaranteed <= 64 bits, + // but the remainder may need 65! + uint64_t r_hi, r_lo; + uint64_t half_q = + libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); + // We computed 2^(64+shift)/(m+2^64) + // Double the remainder ('dr') and check if that is larger than d + // Note that d is a 65 bit value, so r1 is small and so r1 + r1 + // cannot overflow + uint64_t dr_lo = r_lo + r_lo; + uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry + int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo); + uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0); + return full_q + 1; + } +} + +/////////// SINT16 + +static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_internal_s16_gen( + int16_t d, int branchfree) { + if (d == 0) { + LIBDIVIDE_ERROR("divider must be != 0"); + } + + struct libdivide_s16_t result; + + // If d is a power of 2, or negative a power of 2, we have to use a shift. + // This is especially important because the magic algorithm fails for -1. + // To check if d is a power of 2 or its inverse, it suffices to check + // whether its absolute value has exactly one bit set. This works even for + // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set + // and is a power of 2. + uint16_t ud = (uint16_t)d; + uint16_t absD = (d < 0) ? -ud : ud; + uint16_t floor_log_2_d = 15 - libdivide_count_leading_zeros16(absD); + // check if exactly one bit is set, + // don't care if absD is 0 since that's divide by zero + if ((absD & (absD - 1)) == 0) { + // Branchfree and normal paths are exactly the same + result.magic = 0; + result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0)); + } else { + LIBDIVIDE_ASSERT(floor_log_2_d >= 1); + + uint8_t more; + // the dividend here is 2**(floor_log_2_d + 31), so the low 16 bit word + // is 0 and the high word is floor_log_2_d - 1 + uint16_t rem, proposed_m; + proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << (floor_log_2_d - 1), 0, absD, &rem); + const uint16_t e = absD - rem; + + // We are going to start with a power of floor_log_2_d - 1. + // This works if works if e < 2**floor_log_2_d. + if (!branchfree && e < ((uint16_t)1 << floor_log_2_d)) { + // This power works + more = (uint8_t)(floor_log_2_d - 1); + } else { + // We need to go one higher. This should not make proposed_m + // overflow, but it will make it negative when interpreted as an + // int16_t. + proposed_m += proposed_m; + const uint16_t twice_rem = rem + rem; + if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); + } + + proposed_m += 1; + int16_t magic = (int16_t)proposed_m; + + // Mark if we are negative. Note we only negate the magic number in the + // branchfull case. 
+ if (d < 0) { + more |= LIBDIVIDE_NEGATIVE_DIVISOR; + if (!branchfree) { + magic = -magic; + } + } + + result.more = more; + result.magic = magic; + } + return result; +} + +struct libdivide_s16_t libdivide_s16_gen(int16_t d) { + return libdivide_internal_s16_gen(d, 0); +} + +struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d) { + struct libdivide_s16_t tmp = libdivide_internal_s16_gen(d, 1); + struct libdivide_s16_branchfree_t result = {tmp.magic, tmp.more}; + return result; +} + +// The original libdivide_s16_do takes a const pointer. However, this cannot be used +// with a compile time constant libdivide_s16_t: it will generate a warning about +// taking the address of a temporary. Hence this overload. +int16_t libdivide_s16_do_raw(int16_t numer, int16_t magic, uint8_t more) { + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + + if (!magic) { + uint16_t sign = (int8_t)more >> 7; + uint16_t mask = ((uint16_t)1 << shift) - 1; + uint16_t uq = numer + ((numer >> 15) & mask); + int16_t q = (int16_t)uq; + q >>= shift; + q = (q ^ sign) - sign; + return q; + } else { + uint16_t uq = (uint16_t)libdivide_mullhi_s16(magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift and then sign extend + int16_t sign = (int8_t)more >> 7; + // q += (more < 0 ? -numer : numer) + // cast required to avoid UB + uq += ((uint16_t)numer ^ sign) - sign; + } + int16_t q = (int16_t)uq; + q >>= shift; + q += (q < 0); + return q; + } +} + +int16_t libdivide_s16_do(int16_t numer, const struct libdivide_s16_t *denom) { + return libdivide_s16_do_raw(numer, denom->magic, denom->more); +} + +int16_t libdivide_s16_branchfree_do(int16_t numer, const struct libdivide_s16_branchfree_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + // must be arithmetic shift and then sign extend + int16_t sign = (int8_t)more >> 7; + int16_t magic = denom->magic; + int16_t q = libdivide_mullhi_s16(magic, numer); + q += numer; + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is a power of + // 2, or (2**shift) if it is not a power of 2 + uint16_t is_power_of_2 = (magic == 0); + uint16_t q_sign = (uint16_t)(q >> 15); + q += q_sign & (((uint16_t)1 << shift) - is_power_of_2); + + // Now arithmetic right shift + q >>= shift; + // Negate if needed + q = (q ^ sign) - sign; + + return q; +} + +int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + if (!denom->magic) { + uint16_t absD = (uint16_t)1 << shift; + if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { + absD = -absD; + } + return (int16_t)absD; + } else { + // Unsigned math is much easier + // We negate the magic number only in the branchfull case, and we don't + // know which case we're in. However we have enough information to + // determine the correct sign of the magic number. The divisor was + // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set, + // the magic number's sign is opposite that of the divisor. + // We want to compute the positive magic number. + int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR); + int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0; + + // Handle the power of 2 case (including branchfree) + if (denom->magic == 0) { + int16_t result = (uint16_t)1 << shift; + return negative_divisor ? -result : result; + } + + uint16_t d = (uint16_t)(magic_was_negated ? 
-denom->magic : denom->magic); + uint32_t n = (uint32_t)1 << (16 + shift); // this shift cannot exceed 30 + uint16_t q = (uint16_t)(n / d); + int16_t result = (int16_t)q; + result += 1; + return negative_divisor ? -result : result; + } +} + +int16_t libdivide_s16_branchfree_recover(const struct libdivide_s16_branchfree_t *denom) { + return libdivide_s16_recover((const struct libdivide_s16_t *)denom); +} + +/////////// SINT32 + +static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_internal_s32_gen( + int32_t d, int branchfree) { + if (d == 0) { + LIBDIVIDE_ERROR("divider must be != 0"); + } + + struct libdivide_s32_t result; + + // If d is a power of 2, or negative a power of 2, we have to use a shift. + // This is especially important because the magic algorithm fails for -1. + // To check if d is a power of 2 or its inverse, it suffices to check + // whether its absolute value has exactly one bit set. This works even for + // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set + // and is a power of 2. + uint32_t ud = (uint32_t)d; + uint32_t absD = (d < 0) ? -ud : ud; + uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(absD); + // check if exactly one bit is set, + // don't care if absD is 0 since that's divide by zero + if ((absD & (absD - 1)) == 0) { + // Branchfree and normal paths are exactly the same + result.magic = 0; + result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0)); + } else { + LIBDIVIDE_ASSERT(floor_log_2_d >= 1); + + uint8_t more; + // the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word + // is 0 and the high word is floor_log_2_d - 1 + uint32_t rem, proposed_m; + proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << (floor_log_2_d - 1), 0, absD, &rem); + const uint32_t e = absD - rem; + + // We are going to start with a power of floor_log_2_d - 1. + // This works if works if e < 2**floor_log_2_d. + if (!branchfree && e < ((uint32_t)1 << floor_log_2_d)) { + // This power works + more = (uint8_t)(floor_log_2_d - 1); + } else { + // We need to go one higher. This should not make proposed_m + // overflow, but it will make it negative when interpreted as an + // int32_t. + proposed_m += proposed_m; + const uint32_t twice_rem = rem + rem; + if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); + } + + proposed_m += 1; + int32_t magic = (int32_t)proposed_m; + + // Mark if we are negative. Note we only negate the magic number in the + // branchfull case. 
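+ // (The branchfree variant keeps the magic positive because + // libdivide_s32_branchfree_do applies the sign at the very end via + // (q ^ sign) - sign, while the branching path has no such final negation and + // so folds the sign into the magic itself.)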
+ if (d < 0) { + more |= LIBDIVIDE_NEGATIVE_DIVISOR; + if (!branchfree) { + magic = -magic; + } + } + + result.more = more; + result.magic = magic; + } + return result; +} + +struct libdivide_s32_t libdivide_s32_gen(int32_t d) { + return libdivide_internal_s32_gen(d, 0); +} + +struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d) { + struct libdivide_s32_t tmp = libdivide_internal_s32_gen(d, 1); + struct libdivide_s32_branchfree_t result = {tmp.magic, tmp.more}; + return result; +} + +int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + + if (!denom->magic) { + uint32_t sign = (int8_t)more >> 7; + uint32_t mask = ((uint32_t)1 << shift) - 1; + uint32_t uq = numer + ((numer >> 31) & mask); + int32_t q = (int32_t)uq; + q >>= shift; + q = (q ^ sign) - sign; + return q; + } else { + uint32_t uq = (uint32_t)libdivide_mullhi_s32(denom->magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift and then sign extend + int32_t sign = (int8_t)more >> 7; + // q += (more < 0 ? -numer : numer) + // cast required to avoid UB + uq += ((uint32_t)numer ^ sign) - sign; + } + int32_t q = (int32_t)uq; + q >>= shift; + q += (q < 0); + return q; + } +} + +int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + // must be arithmetic shift and then sign extend + int32_t sign = (int8_t)more >> 7; + int32_t magic = denom->magic; + int32_t q = libdivide_mullhi_s32(magic, numer); + q += numer; + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is a power of + // 2, or (2**shift) if it is not a power of 2 + uint32_t is_power_of_2 = (magic == 0); + uint32_t q_sign = (uint32_t)(q >> 31); + q += q_sign & (((uint32_t)1 << shift) - is_power_of_2); + + // Now arithmetic right shift + q >>= shift; + // Negate if needed + q = (q ^ sign) - sign; + + return q; +} + +int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + if (!denom->magic) { + uint32_t absD = (uint32_t)1 << shift; + if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { + absD = -absD; + } + return (int32_t)absD; + } else { + // Unsigned math is much easier + // We negate the magic number only in the branchfull case, and we don't + // know which case we're in. However we have enough information to + // determine the correct sign of the magic number. The divisor was + // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set, + // the magic number's sign is opposite that of the divisor. + // We want to compute the positive magic number. + int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR); + int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0; + + // Handle the power of 2 case (including branchfree) + if (denom->magic == 0) { + int32_t result = (uint32_t)1 << shift; + return negative_divisor ? -result : result; + } + + uint32_t d = (uint32_t)(magic_was_negated ? -denom->magic : denom->magic); + uint64_t n = (uint64_t)1 << (32 + shift); // this shift cannot exceed 30 + uint32_t q = (uint32_t)(n / d); + int32_t result = (int32_t)q; + result += 1; + return negative_divisor ? 
-result : result; + } +} + +int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t *denom) { + return libdivide_s32_recover((const struct libdivide_s32_t *)denom); +} + +///////////// SINT64 + +static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_internal_s64_gen( + int64_t d, int branchfree) { + if (d == 0) { + LIBDIVIDE_ERROR("divider must be != 0"); + } + + struct libdivide_s64_t result; + + // If d is a power of 2, or negative a power of 2, we have to use a shift. + // This is especially important because the magic algorithm fails for -1. + // To check if d is a power of 2 or its inverse, it suffices to check + // whether its absolute value has exactly one bit set. This works even for + // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set + // and is a power of 2. + uint64_t ud = (uint64_t)d; + uint64_t absD = (d < 0) ? -ud : ud; + uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(absD); + // check if exactly one bit is set, + // don't care if absD is 0 since that's divide by zero + if ((absD & (absD - 1)) == 0) { + // Branchfree and non-branchfree cases are the same + result.magic = 0; + result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0)); + } else { + // the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word + // is 0 and the high word is floor_log_2_d - 1 + uint8_t more; + uint64_t rem, proposed_m; + proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << (floor_log_2_d - 1), 0, absD, &rem); + const uint64_t e = absD - rem; + + // We are going to start with a power of floor_log_2_d - 1. + // This works if e < 2**floor_log_2_d. + if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) { + // This power works + more = (uint8_t)(floor_log_2_d - 1); + } else { + // We need to go one higher. This should not make proposed_m + // overflow, but it will make it negative when interpreted as an + // int64_t. + proposed_m += proposed_m; + const uint64_t twice_rem = rem + rem; + if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; + // note that we only set the LIBDIVIDE_NEGATIVE_DIVISOR bit if we + // also set ADD_MARKER; this is an annoying optimization that + // enables algorithm #4 to avoid the mask.
However we always set it + // in the branchfree case + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); + } + proposed_m += 1; + int64_t magic = (int64_t)proposed_m; + + // Mark if we are negative + if (d < 0) { + more |= LIBDIVIDE_NEGATIVE_DIVISOR; + if (!branchfree) { + magic = -magic; + } + } + + result.more = more; + result.magic = magic; + } + return result; +} + +struct libdivide_s64_t libdivide_s64_gen(int64_t d) { + return libdivide_internal_s64_gen(d, 0); +} + +struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d) { + struct libdivide_s64_t tmp = libdivide_internal_s64_gen(d, 1); + struct libdivide_s64_branchfree_t ret = {tmp.magic, tmp.more}; + return ret; +} + +int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + + if (!denom->magic) { // shift path + uint64_t mask = ((uint64_t)1 << shift) - 1; + uint64_t uq = numer + ((numer >> 63) & mask); + int64_t q = (int64_t)uq; + q >>= shift; + // must be arithmetic shift and then sign-extend + int64_t sign = (int8_t)more >> 7; + q = (q ^ sign) - sign; + return q; + } else { + uint64_t uq = (uint64_t)libdivide_mullhi_s64(denom->magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift and then sign extend + int64_t sign = (int8_t)more >> 7; + // q += (more < 0 ? -numer : numer) + // cast required to avoid UB + uq += ((uint64_t)numer ^ sign) - sign; + } + int64_t q = (int64_t)uq; + q >>= shift; + q += (q < 0); + return q; + } +} + +int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + // must be arithmetic shift and then sign extend + int64_t sign = (int8_t)more >> 7; + int64_t magic = denom->magic; + int64_t q = libdivide_mullhi_s64(magic, numer); + q += numer; + + // If q is non-negative, we have nothing to do. + // If q is negative, we want to add either (2**shift)-1 if d is a power of + // 2, or (2**shift) if it is not a power of 2. + uint64_t is_power_of_2 = (magic == 0); + uint64_t q_sign = (uint64_t)(q >> 63); + q += q_sign & (((uint64_t)1 << shift) - is_power_of_2); + + // Arithmetic right shift + q >>= shift; + // Negate if needed + q = (q ^ sign) - sign; + + return q; +} + +int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + if (denom->magic == 0) { // shift path + uint64_t absD = (uint64_t)1 << shift; + if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { + absD = -absD; + } + return (int64_t)absD; + } else { + // Unsigned math is much easier + int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR); + int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0; + + uint64_t d = (uint64_t)(magic_was_negated ? -denom->magic : denom->magic); + uint64_t n_hi = (uint64_t)1 << shift, n_lo = 0; + uint64_t rem_ignored; + uint64_t q = libdivide_128_div_64_to_64(n_hi, n_lo, d, &rem_ignored); + int64_t result = (int64_t)(q + 1); + if (negative_divisor) { + result = -result; + } + return result; + } +} + +int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom) { + return libdivide_s64_recover((const struct libdivide_s64_t *)denom); +} + +// Simplest possible vector type division: treat the vector type as an array +// of underlying native type. 
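+// For example, SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16) expands to a +// scalar loop over the 8 lanes of `numers`, dividing each lane with the +// matching scalar routine (libdivide_u16_do) and collecting the quotients into +// a uint16x8_t result.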
+#define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo) \ + const size_t count = sizeof(VecT) / sizeof(IntT); \ + VecT result; \ + IntT *pSource = (IntT *)&numers; \ + IntT *pTarget = (IntT *)&result; \ + for (size_t loop=0; loop64. + + // Get low and high words. x0 contains low 32 bits, x1 is high 32 bits. + uint64x2_t y = vdupq_n_u64(sy); + uint32x2_t x0 = vmovn_u64(x); + uint32x2_t y0 = vmovn_u64(y); + uint32x2_t x1 = vshrn_n_u64(x, 32); + uint32x2_t y1 = vshrn_n_u64(y, 32); + + // Compute x0*y0. + uint64x2_t x0y0 = vmull_u32(x0, y0); + uint64x2_t x0y0_hi = vshrq_n_u64(x0y0, 32); + + // Compute other intermediate products. + uint64x2_t temp = vmlal_u32(x0y0_hi, x1, y0); // temp = x0y0_hi + x1*y0; + // We want to split temp into its low 32 bits and high 32 bits, both + // in the low half of 64 bit registers. + // Use shifts to avoid needing a reg for the mask. + uint64x2_t temp_lo = vshrq_n_u64(vshlq_n_u64(temp, 32), 32); // temp_lo = temp & 0xFFFFFFFF; + uint64x2_t temp_hi = vshrq_n_u64(temp, 32); // temp_hi = temp >> 32; + + temp_lo = vmlal_u32(temp_lo, x0, y1); // temp_lo += x0*y0 + temp_lo = vshrq_n_u64(temp_lo, 32); // temp_lo >>= 32 + temp_hi = vmlal_u32(temp_hi, x1, y1); // temp_hi += x1*y1 + uint64x2_t result = vaddq_u64(temp_hi, temp_lo); + return result; +} + +static LIBDIVIDE_INLINE int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64_t sy) { + int64x2_t p = vreinterpretq_s64_u64( + libdivide_mullhi_u64_vec128(vreinterpretq_u64_s64(x), (uint64_t)(sy))); + int64x2_t y = vdupq_n_s64(sy); + int64x2_t t1 = vandq_s64(libdivide_s64_signbits(x), y); + int64x2_t t2 = vandq_s64(libdivide_s64_signbits(y), x); + p = vsubq_s64(p, t1); + p = vsubq_s64(p, t2); + return p; +} + +////////// UINT16 + +uint16x8_t libdivide_u16_do_vec128(uint16x8_t numers, const struct libdivide_u16_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16) +} + +uint16x8_t libdivide_u16_branchfree_do_vec128(uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16_branchfree) +} + +////////// UINT32 + +uint32x4_t libdivide_u32_do_vec128(uint32x4_t numers, const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return libdivide_u32_neon_srl(numers, more); + } else { + uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + // Note we can use halving-subtract to avoid the shift. 
+ uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q); + return libdivide_u32_neon_srl(t, shift); + } else { + return libdivide_u32_neon_srl(q, more); + } + } +} + +uint32x4_t libdivide_u32_branchfree_do_vec128( + uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom) { + uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic); + uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q); + return libdivide_u32_neon_srl(t, denom->more); +} + +////////// UINT64 + +uint64x2_t libdivide_u64_do_vec128(uint64x2_t numers, const struct libdivide_u64_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return libdivide_u64_neon_srl(numers, more); + } else { + uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + // No 64-bit halving subtracts in NEON :( + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q); + return libdivide_u64_neon_srl(t, shift); + } else { + return libdivide_u64_neon_srl(q, more); + } + } +} + +uint64x2_t libdivide_u64_branchfree_do_vec128( + uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom) { + uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic); + uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q); + return libdivide_u64_neon_srl(t, denom->more); +} + +////////// SINT16 + +int16x8_t libdivide_s16_do_vec128(int16x8_t numers, const struct libdivide_s16_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16) +} + +int16x8_t libdivide_s16_branchfree_do_vec128(int16x8_t numers, const struct libdivide_s16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16_branchfree) +} + +////////// SINT32 + +int32x4_t libdivide_s32_do_vec128(int32x4_t numers, const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = ((uint32_t)1 << shift) - 1; + int32x4_t roundToZeroTweak = vdupq_n_s32((int)mask); + // q = numer + ((numer >> 31) & roundToZeroTweak); + int32x4_t q = vaddq_s32(numers, vandq_s32(vshrq_n_s32(numers, 31), roundToZeroTweak)); + q = libdivide_s32_neon_sra(q, shift); + int32x4_t sign = vdupq_n_s32((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = vsubq_s32(veorq_s32(q, sign), sign); + return q; + } else { + int32x4_t q = libdivide_mullhi_s32_vec128(numers, denom->magic); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + int32x4_t sign = vdupq_n_s32((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = vaddq_s32(q, vsubq_s32(veorq_s32(numers, sign), sign)); + } + // q >>= shift + q = libdivide_s32_neon_sra(q, more & LIBDIVIDE_32_SHIFT_MASK); + q = vaddq_s32( + q, vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(q), 31))); // q += (q < 0) + return q; + } +} + +int32x4_t libdivide_s32_branchfree_do_vec128( + int32x4_t numers, const struct libdivide_s32_branchfree_t *denom) { + int32_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + // must be arithmetic shift + int32x4_t sign = vdupq_n_s32((int8_t)more >> 7); + int32x4_t q = libdivide_mullhi_s32_vec128(numers, magic); + q = vaddq_s32(q, numers); // q += numers + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is 
not a power of 2 + uint32_t is_power_of_2 = (magic == 0); + int32x4_t q_sign = vshrq_n_s32(q, 31); // q_sign = q >> 31 + int32x4_t mask = vdupq_n_s32(((uint32_t)1 << shift) - is_power_of_2); + q = vaddq_s32(q, vandq_s32(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s32_neon_sra(q, shift); // q >>= shift + q = vsubq_s32(veorq_s32(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +////////// SINT64 + +int64x2_t libdivide_s64_do_vec128(int64x2_t numers, const struct libdivide_s64_t *denom) { + uint8_t more = denom->more; + int64_t magic = denom->magic; + if (magic == 0) { // shift path + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + uint64_t mask = ((uint64_t)1 << shift) - 1; + int64x2_t roundToZeroTweak = vdupq_n_s64(mask); // TODO: no need to sign extend + // q = numer + ((numer >> 63) & roundToZeroTweak); + int64x2_t q = + vaddq_s64(numers, vandq_s64(libdivide_s64_signbits(numers), roundToZeroTweak)); + q = libdivide_s64_neon_sra(q, shift); + // q = (q ^ sign) - sign; + int64x2_t sign = vreinterpretq_s64_s8(vdupq_n_s8((int8_t)more >> 7)); + q = vsubq_s64(veorq_s64(q, sign), sign); + return q; + } else { + int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + int64x2_t sign = vdupq_n_s64((int8_t)more >> 7); // TODO: no need to widen + // q += ((numer ^ sign) - sign); + q = vaddq_s64(q, vsubq_s64(veorq_s64(numers, sign), sign)); + } + // q >>= denom->mult_path.shift + q = libdivide_s64_neon_sra(q, more & LIBDIVIDE_64_SHIFT_MASK); + q = vaddq_s64( + q, vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(q), 63))); // q += (q < 0) + return q; + } +} + +int64x2_t libdivide_s64_branchfree_do_vec128( + int64x2_t numers, const struct libdivide_s64_branchfree_t *denom) { + int64_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + // must be arithmetic shift + int64x2_t sign = vdupq_n_s64((int8_t)more >> 7); // TODO: avoid sign extend + + // libdivide_mullhi_s64(numers, magic); + int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic); + q = vaddq_s64(q, numers); // q += numers + + // If q is non-negative, we have nothing to do. + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2. 
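+ // e.g. with shift = 2 and q = -7, adding (2**2) - 1 = 3 before the arithmetic + // shift gives (-7 + 3) >> 2 = -1 (truncation toward zero) rather than + // -7 >> 2 = -2 (flooring).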
+ uint32_t is_power_of_2 = (magic == 0); + int64x2_t q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 + int64x2_t mask = vdupq_n_s64(((uint64_t)1 << shift) - is_power_of_2); + q = vaddq_s64(q, vandq_s64(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s64_neon_sra(q, shift); // q >>= shift + q = vsubq_s64(veorq_s64(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +#endif + +#if defined(LIBDIVIDE_AVX512) + +static LIBDIVIDE_INLINE __m512i libdivide_u16_do_vec512( + __m512i numers, const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s16_do_vec512( + __m512i numers, const struct libdivide_s16_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_u32_do_vec512( + __m512i numers, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s32_do_vec512( + __m512i numers, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_u64_do_vec512( + __m512i numers, const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s64_do_vec512( + __m512i numers, const struct libdivide_s64_t *denom); + +static LIBDIVIDE_INLINE __m512i libdivide_u16_branchfree_do_vec512( + __m512i numers, const struct libdivide_u16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s16_branchfree_do_vec512( + __m512i numers, const struct libdivide_s16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_u32_branchfree_do_vec512( + __m512i numers, const struct libdivide_u32_branchfree_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s32_branchfree_do_vec512( + __m512i numers, const struct libdivide_s32_branchfree_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_u64_branchfree_do_vec512( + __m512i numers, const struct libdivide_u64_branchfree_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s64_branchfree_do_vec512( + __m512i numers, const struct libdivide_s64_branchfree_t *denom); + +//////// Internal Utility Functions + +static LIBDIVIDE_INLINE __m512i libdivide_s64_signbits_vec512(__m512i v) { + ; + return _mm512_srai_epi64(v, 63); +} + +static LIBDIVIDE_INLINE __m512i libdivide_s64_shift_right_vec512(__m512i v, int amt) { + return _mm512_srai_epi64(v, amt); +} + +// Here, b is assumed to contain one 32-bit value repeated. +static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u32_vec512(__m512i a, __m512i b) { + __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epu32(a, b), 32); + __m512i a1X3X = _mm512_srli_epi64(a, 32); + __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0); + __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epu32(a1X3X, b), mask); + return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3); +} + +// b is one 32-bit value repeated. +static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s32_vec512(__m512i a, __m512i b) { + __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epi32(a, b), 32); + __m512i a1X3X = _mm512_srli_epi64(a, 32); + __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0); + __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epi32(a1X3X, b), mask); + return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3); +} + +// Here, y is assumed to contain one 64-bit value repeated. +static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u64_vec512(__m512i x, __m512i y) { + // see m128i variant for comments. 
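+ // With x = x1*2**32 + x0 and y = y1*2**32 + y0, the high half is + // x1*y1 + ((x1*y0 + x0*y1 + (x0*y0 >> 32)) >> 32); the temp_lo/temp_hi split + // below evaluates that inner sum without losing the carry past 64 bits.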
+ __m512i x0y0 = _mm512_mul_epu32(x, y); + __m512i x0y0_hi = _mm512_srli_epi64(x0y0, 32); + + __m512i x1 = _mm512_shuffle_epi32(x, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1)); + __m512i y1 = _mm512_shuffle_epi32(y, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1)); + + __m512i x0y1 = _mm512_mul_epu32(x, y1); + __m512i x1y0 = _mm512_mul_epu32(x1, y); + __m512i x1y1 = _mm512_mul_epu32(x1, y1); + + __m512i mask = _mm512_set1_epi64(0xFFFFFFFF); + __m512i temp = _mm512_add_epi64(x1y0, x0y0_hi); + __m512i temp_lo = _mm512_and_si512(temp, mask); + __m512i temp_hi = _mm512_srli_epi64(temp, 32); + + temp_lo = _mm512_srli_epi64(_mm512_add_epi64(temp_lo, x0y1), 32); + temp_hi = _mm512_add_epi64(x1y1, temp_hi); + return _mm512_add_epi64(temp_lo, temp_hi); +} + +// y is one 64-bit value repeated. +static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s64_vec512(__m512i x, __m512i y) { + __m512i p = libdivide_mullhi_u64_vec512(x, y); + __m512i t1 = _mm512_and_si512(libdivide_s64_signbits_vec512(x), y); + __m512i t2 = _mm512_and_si512(libdivide_s64_signbits_vec512(y), x); + p = _mm512_sub_epi64(p, t1); + p = _mm512_sub_epi64(p, t2); + return p; +} + +////////// UINT16 + +__m512i libdivide_u16_do_vec512(__m512i numers, const struct libdivide_u16_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16) +} + +__m512i libdivide_u16_branchfree_do_vec512(__m512i numers, const struct libdivide_u16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16_branchfree) +} + +////////// UINT32 + +__m512i libdivide_u32_do_vec512(__m512i numers, const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return _mm512_srli_epi32(numers, more); + } else { + __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q); + return _mm512_srli_epi32(t, shift); + } else { + return _mm512_srli_epi32(q, more); + } + } +} + +__m512i libdivide_u32_branchfree_do_vec512( + __m512i numers, const struct libdivide_u32_branchfree_t *denom) { + __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic)); + __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q); + return _mm512_srli_epi32(t, denom->more); +} + +////////// UINT64 + +__m512i libdivide_u64_do_vec512(__m512i numers, const struct libdivide_u64_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return _mm512_srli_epi64(numers, more); + } else { + __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q); + return _mm512_srli_epi64(t, shift); + } else { + return _mm512_srli_epi64(q, more); + } + } +} + +__m512i libdivide_u64_branchfree_do_vec512( + __m512i numers, const struct libdivide_u64_branchfree_t *denom) { + __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic)); + __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q); + return _mm512_srli_epi64(t, denom->more); +} + +////////// SINT16 + +__m512i libdivide_s16_do_vec512(__m512i numers, const struct libdivide_s16_t *denom) { + 
SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16) +} + +__m512i libdivide_s16_branchfree_do_vec512(__m512i numers, const struct libdivide_s16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16_branchfree) +} + +////////// SINT32 + +__m512i libdivide_s32_do_vec512(__m512i numers, const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = ((uint32_t)1 << shift) - 1; + __m512i roundToZeroTweak = _mm512_set1_epi32(mask); + // q = numer + ((numer >> 31) & roundToZeroTweak); + __m512i q = _mm512_add_epi32( + numers, _mm512_and_si512(_mm512_srai_epi32(numers, 31), roundToZeroTweak)); + q = _mm512_srai_epi32(q, shift); + __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign); + return q; + } else { + __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm512_add_epi32(q, _mm512_sub_epi32(_mm512_xor_si512(numers, sign), sign)); + } + // q >>= shift + q = _mm512_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); + q = _mm512_add_epi32(q, _mm512_srli_epi32(q, 31)); // q += (q < 0) + return q; + } +} + +__m512i libdivide_s32_branchfree_do_vec512( + __m512i numers, const struct libdivide_s32_branchfree_t *denom) { + int32_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + // must be arithmetic shift + __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); + __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(magic)); + q = _mm512_add_epi32(q, numers); // q += numers + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2 + uint32_t is_power_of_2 = (magic == 0); + __m512i q_sign = _mm512_srai_epi32(q, 31); // q_sign = q >> 31 + __m512i mask = _mm512_set1_epi32(((uint32_t)1 << shift) - is_power_of_2); + q = _mm512_add_epi32(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm512_srai_epi32(q, shift); // q >>= shift + q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +////////// SINT64 + +__m512i libdivide_s64_do_vec512(__m512i numers, const struct libdivide_s64_t *denom) { + uint8_t more = denom->more; + int64_t magic = denom->magic; + if (magic == 0) { // shift path + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + uint64_t mask = ((uint64_t)1 << shift) - 1; + __m512i roundToZeroTweak = _mm512_set1_epi64(mask); + // q = numer + ((numer >> 63) & roundToZeroTweak); + __m512i q = _mm512_add_epi64( + numers, _mm512_and_si512(libdivide_s64_signbits_vec512(numers), roundToZeroTweak)); + q = libdivide_s64_shift_right_vec512(q, shift); + __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); + return q; + } else { + __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm512_add_epi64(q, _mm512_sub_epi64(_mm512_xor_si512(numers, sign), sign)); + } + // q >>= denom->mult_path.shift + q = 
libdivide_s64_shift_right_vec512(q, more & LIBDIVIDE_64_SHIFT_MASK); + q = _mm512_add_epi64(q, _mm512_srli_epi64(q, 63)); // q += (q < 0) + return q; + } +} + +__m512i libdivide_s64_branchfree_do_vec512( + __m512i numers, const struct libdivide_s64_branchfree_t *denom) { + int64_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + // must be arithmetic shift + __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); + + // libdivide_mullhi_s64(numers, magic); + __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic)); + q = _mm512_add_epi64(q, numers); // q += numers + + // If q is non-negative, we have nothing to do. + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2. + uint32_t is_power_of_2 = (magic == 0); + __m512i q_sign = libdivide_s64_signbits_vec512(q); // q_sign = q >> 63 + __m512i mask = _mm512_set1_epi64(((uint64_t)1 << shift) - is_power_of_2); + q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s64_shift_right_vec512(q, shift); // q >>= shift + q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +#endif + +#if defined(LIBDIVIDE_AVX2) + +static LIBDIVIDE_INLINE __m256i libdivide_u16_do_vec256( + __m256i numers, const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s16_do_vec256( + __m256i numers, const struct libdivide_s16_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_u32_do_vec256( + __m256i numers, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s32_do_vec256( + __m256i numers, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_u64_do_vec256( + __m256i numers, const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s64_do_vec256( + __m256i numers, const struct libdivide_s64_t *denom); + +static LIBDIVIDE_INLINE __m256i libdivide_u16_branchfree_do_vec256( + __m256i numers, const struct libdivide_u16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s16_branchfree_do_vec256( + __m256i numers, const struct libdivide_s16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_u32_branchfree_do_vec256( + __m256i numers, const struct libdivide_u32_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s32_branchfree_do_vec256( + __m256i numers, const struct libdivide_s32_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_u64_branchfree_do_vec256( + __m256i numers, const struct libdivide_u64_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s64_branchfree_do_vec256( + __m256i numers, const struct libdivide_s64_branchfree_t *denom); + +//////// Internal Utility Functions + +// Implementation of _mm256_srai_epi64(v, 63) (from AVX512). +static LIBDIVIDE_INLINE __m256i libdivide_s64_signbits_vec256(__m256i v) { + __m256i hiBitsDuped = _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); + __m256i signBits = _mm256_srai_epi32(hiBitsDuped, 31); + return signBits; +} + +// Implementation of _mm256_srai_epi64 (from AVX512). +static LIBDIVIDE_INLINE __m256i libdivide_s64_shift_right_vec256(__m256i v, int amt) { + const int b = 64 - amt; + __m256i m = _mm256_set1_epi64x((uint64_t)1 << (b - 1)); + __m256i x = _mm256_srli_epi64(v, amt); + __m256i result = _mm256_sub_epi64(_mm256_xor_si256(x, m), m); + return result; +} + +// Here, b is assumed to contain one 32-bit value repeated. 
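+// _mm256_mul_epu32 multiplies only the even 32-bit lanes, producing 64-bit +// products: the even lanes' high halves come from (a*b) >> 32 and the odd +// lanes' from (a >> 32)*b masked into the upper words, so ORing the two gives +// the high half of a[i]*b in every 32-bit lane.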
+static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u32_vec256(__m256i a, __m256i b) { + __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epu32(a, b), 32); + __m256i a1X3X = _mm256_srli_epi64(a, 32); + __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0); + __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epu32(a1X3X, b), mask); + return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3); +} + +// b is one 32-bit value repeated. +static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s32_vec256(__m256i a, __m256i b) { + __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epi32(a, b), 32); + __m256i a1X3X = _mm256_srli_epi64(a, 32); + __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0); + __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epi32(a1X3X, b), mask); + return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3); +} + +// Here, y is assumed to contain one 64-bit value repeated. +static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u64_vec256(__m256i x, __m256i y) { + // see m128i variant for comments. + __m256i x0y0 = _mm256_mul_epu32(x, y); + __m256i x0y0_hi = _mm256_srli_epi64(x0y0, 32); + + __m256i x1 = _mm256_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1)); + __m256i y1 = _mm256_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1)); + + __m256i x0y1 = _mm256_mul_epu32(x, y1); + __m256i x1y0 = _mm256_mul_epu32(x1, y); + __m256i x1y1 = _mm256_mul_epu32(x1, y1); + + __m256i mask = _mm256_set1_epi64x(0xFFFFFFFF); + __m256i temp = _mm256_add_epi64(x1y0, x0y0_hi); + __m256i temp_lo = _mm256_and_si256(temp, mask); + __m256i temp_hi = _mm256_srli_epi64(temp, 32); + + temp_lo = _mm256_srli_epi64(_mm256_add_epi64(temp_lo, x0y1), 32); + temp_hi = _mm256_add_epi64(x1y1, temp_hi); + return _mm256_add_epi64(temp_lo, temp_hi); +} + +// y is one 64-bit value repeated. 
+static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y) { + __m256i p = libdivide_mullhi_u64_vec256(x, y); + __m256i t1 = _mm256_and_si256(libdivide_s64_signbits_vec256(x), y); + __m256i t2 = _mm256_and_si256(libdivide_s64_signbits_vec256(y), x); + p = _mm256_sub_epi64(p, t1); + p = _mm256_sub_epi64(p, t2); + return p; +} + +////////// UINT16 + +__m256i libdivide_u16_do_vec256(__m256i numers, const struct libdivide_u16_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, __m256i, u16) +} + +__m256i libdivide_u16_branchfree_do_vec256(__m256i numers, const struct libdivide_u16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, __m256i, u16_branchfree) +} + +////////// UINT32 + +__m256i libdivide_u32_do_vec256(__m256i numers, const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return _mm256_srli_epi32(numers, more); + } else { + __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q); + return _mm256_srli_epi32(t, shift); + } else { + return _mm256_srli_epi32(q, more); + } + } +} + +__m256i libdivide_u32_branchfree_do_vec256( + __m256i numers, const struct libdivide_u32_branchfree_t *denom) { + __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic)); + __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q); + return _mm256_srli_epi32(t, denom->more); +} + +////////// UINT64 + +__m256i libdivide_u64_do_vec256(__m256i numers, const struct libdivide_u64_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return _mm256_srli_epi64(numers, more); + } else { + __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q); + return _mm256_srli_epi64(t, shift); + } else { + return _mm256_srli_epi64(q, more); + } + } +} + +__m256i libdivide_u64_branchfree_do_vec256( + __m256i numers, const struct libdivide_u64_branchfree_t *denom) { + __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic)); + __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q); + return _mm256_srli_epi64(t, denom->more); +} + +////////// SINT16 + +__m256i libdivide_s16_do_vec256(__m256i numers, const struct libdivide_s16_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, __m256i, s16) +} + +__m256i libdivide_s16_branchfree_do_vec256(__m256i numers, const struct libdivide_s16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, __m256i, s16_branchfree) +} + +////////// SINT32 + +__m256i libdivide_s32_do_vec256(__m256i numers, const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = ((uint32_t)1 << shift) - 1; + __m256i roundToZeroTweak = _mm256_set1_epi32(mask); + // q = numer + ((numer >> 31) & roundToZeroTweak); + __m256i q = _mm256_add_epi32( + numers, _mm256_and_si256(_mm256_srai_epi32(numers, 31), roundToZeroTweak)); + q = _mm256_srai_epi32(q, shift); + __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); + // 
q = (q ^ sign) - sign; + q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); + return q; + } else { + __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm256_add_epi32(q, _mm256_sub_epi32(_mm256_xor_si256(numers, sign), sign)); + } + // q >>= shift + q = _mm256_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); + q = _mm256_add_epi32(q, _mm256_srli_epi32(q, 31)); // q += (q < 0) + return q; + } +} + +__m256i libdivide_s32_branchfree_do_vec256( + __m256i numers, const struct libdivide_s32_branchfree_t *denom) { + int32_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + // must be arithmetic shift + __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); + __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(magic)); + q = _mm256_add_epi32(q, numers); // q += numers + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2 + uint32_t is_power_of_2 = (magic == 0); + __m256i q_sign = _mm256_srai_epi32(q, 31); // q_sign = q >> 31 + __m256i mask = _mm256_set1_epi32(((uint32_t)1 << shift) - is_power_of_2); + q = _mm256_add_epi32(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm256_srai_epi32(q, shift); // q >>= shift + q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +////////// SINT64 + +__m256i libdivide_s64_do_vec256(__m256i numers, const struct libdivide_s64_t *denom) { + uint8_t more = denom->more; + int64_t magic = denom->magic; + if (magic == 0) { // shift path + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + uint64_t mask = ((uint64_t)1 << shift) - 1; + __m256i roundToZeroTweak = _mm256_set1_epi64x(mask); + // q = numer + ((numer >> 63) & roundToZeroTweak); + __m256i q = _mm256_add_epi64( + numers, _mm256_and_si256(libdivide_s64_signbits_vec256(numers), roundToZeroTweak)); + q = libdivide_s64_shift_right_vec256(q, shift); + __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); + return q; + } else { + __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm256_add_epi64(q, _mm256_sub_epi64(_mm256_xor_si256(numers, sign), sign)); + } + // q >>= denom->mult_path.shift + q = libdivide_s64_shift_right_vec256(q, more & LIBDIVIDE_64_SHIFT_MASK); + q = _mm256_add_epi64(q, _mm256_srli_epi64(q, 63)); // q += (q < 0) + return q; + } +} + +__m256i libdivide_s64_branchfree_do_vec256( + __m256i numers, const struct libdivide_s64_branchfree_t *denom) { + int64_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + // must be arithmetic shift + __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); + + // libdivide_mullhi_s64(numers, magic); + __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic)); + q = _mm256_add_epi64(q, numers); // q += numers + + // If q is non-negative, we have nothing to do. + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2. 
+ uint32_t is_power_of_2 = (magic == 0); + __m256i q_sign = libdivide_s64_signbits_vec256(q); // q_sign = q >> 63 + __m256i mask = _mm256_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2); + q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s64_shift_right_vec256(q, shift); // q >>= shift + q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +#endif + +#if defined(LIBDIVIDE_SSE2) + +static LIBDIVIDE_INLINE __m128i libdivide_u16_do_vec128( + __m128i numers, const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s16_do_vec128( + __m128i numers, const struct libdivide_s16_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u32_do_vec128( + __m128i numers, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s32_do_vec128( + __m128i numers, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u64_do_vec128( + __m128i numers, const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s64_do_vec128( + __m128i numers, const struct libdivide_s64_t *denom); + +static LIBDIVIDE_INLINE __m128i libdivide_u16_branchfree_do_vec128( + __m128i numers, const struct libdivide_u16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s16_branchfree_do_vec128( + __m128i numers, const struct libdivide_s16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u32_branchfree_do_vec128( + __m128i numers, const struct libdivide_u32_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s32_branchfree_do_vec128( + __m128i numers, const struct libdivide_s32_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u64_branchfree_do_vec128( + __m128i numers, const struct libdivide_u64_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s64_branchfree_do_vec128( + __m128i numers, const struct libdivide_s64_branchfree_t *denom); + +//////// Internal Utility Functions + +// Implementation of _mm_srai_epi64(v, 63) (from AVX512). +static LIBDIVIDE_INLINE __m128i libdivide_s64_signbits_vec128(__m128i v) { + __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); + __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31); + return signBits; +} + +// Implementation of _mm_srai_epi64 (from AVX512). +static LIBDIVIDE_INLINE __m128i libdivide_s64_shift_right_vec128(__m128i v, int amt) { + const int b = 64 - amt; + __m128i m = _mm_set1_epi64x((uint64_t)1 << (b - 1)); + __m128i x = _mm_srli_epi64(v, amt); + __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m); + return result; +} + +// Here, b is assumed to contain one 32-bit value repeated. +static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u32_vec128(__m128i a, __m128i b) { + __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32); + __m128i a1X3X = _mm_srli_epi64(a, 32); + __m128i mask = _mm_set_epi32(-1, 0, -1, 0); + __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epu32(a1X3X, b), mask); + return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3); +} + +// SSE2 does not have a signed multiplication instruction, but we can convert +// unsigned to signed pretty efficiently. Again, b is just a 32 bit value +// repeated four times. 
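+// Concretely, for two's-complement inputs +// mullhi_s32(a, b) == mullhi_u32(a, b) - ((a >> 31) & b) - ((b >> 31) & a) +// (arithmetic shifts), which is the srai/and/sub sequence below.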
+static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s32_vec128(__m128i a, __m128i b) { + __m128i p = libdivide_mullhi_u32_vec128(a, b); + // t1 = (a >> 31) & y, arithmetic shift + __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b); + __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a); + p = _mm_sub_epi32(p, t1); + p = _mm_sub_epi32(p, t2); + return p; +} + +// Here, y is assumed to contain one 64-bit value repeated. +static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u64_vec128(__m128i x, __m128i y) { + // full 128 bits product is: + // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64) + // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64. + + // Compute x0*y0. + // Note x1, y1 are ignored by mul_epu32. + __m128i x0y0 = _mm_mul_epu32(x, y); + __m128i x0y0_hi = _mm_srli_epi64(x0y0, 32); + + // Get x1, y1 in the low bits. + // We could shuffle or right shift. Shuffles are preferred as they preserve + // the source register for the next computation. + __m128i x1 = _mm_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1)); + __m128i y1 = _mm_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1)); + + // No need to mask off top 32 bits for mul_epu32. + __m128i x0y1 = _mm_mul_epu32(x, y1); + __m128i x1y0 = _mm_mul_epu32(x1, y); + __m128i x1y1 = _mm_mul_epu32(x1, y1); + + // Mask here selects low bits only. + __m128i mask = _mm_set1_epi64x(0xFFFFFFFF); + __m128i temp = _mm_add_epi64(x1y0, x0y0_hi); + __m128i temp_lo = _mm_and_si128(temp, mask); + __m128i temp_hi = _mm_srli_epi64(temp, 32); + + temp_lo = _mm_srli_epi64(_mm_add_epi64(temp_lo, x0y1), 32); + temp_hi = _mm_add_epi64(x1y1, temp_hi); + return _mm_add_epi64(temp_lo, temp_hi); +} + +// y is one 64-bit value repeated. +static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y) { + __m128i p = libdivide_mullhi_u64_vec128(x, y); + __m128i t1 = _mm_and_si128(libdivide_s64_signbits_vec128(x), y); + __m128i t2 = _mm_and_si128(libdivide_s64_signbits_vec128(y), x); + p = _mm_sub_epi64(p, t1); + p = _mm_sub_epi64(p, t2); + return p; +} + +////////// UINT16 + +__m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, __m128i, u16) +} + +__m128i libdivide_u16_branchfree_do_vec128(__m128i numers, const struct libdivide_u16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, __m128i, u16_branchfree) +} + +////////// UINT32 + +__m128i libdivide_u32_do_vec128(__m128i numers, const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return _mm_srli_epi32(numers, more); + } else { + __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); + return _mm_srli_epi32(t, shift); + } else { + return _mm_srli_epi32(q, more); + } + } +} + +__m128i libdivide_u32_branchfree_do_vec128( + __m128i numers, const struct libdivide_u32_branchfree_t *denom) { + __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic)); + __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); + return _mm_srli_epi32(t, denom->more); +} + +////////// UINT64 + +__m128i libdivide_u64_do_vec128(__m128i numers, const struct libdivide_u64_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return _mm_srli_epi64(numers, more); + } else { + __m128i q =
libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); + return _mm_srli_epi64(t, shift); + } else { + return _mm_srli_epi64(q, more); + } + } +} + +__m128i libdivide_u64_branchfree_do_vec128( + __m128i numers, const struct libdivide_u64_branchfree_t *denom) { + __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic)); + __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); + return _mm_srli_epi64(t, denom->more); +} + +////////// SINT16 + +__m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, __m128i, s16) +} + +__m128i libdivide_s16_branchfree_do_vec128(__m128i numers, const struct libdivide_s16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, __m128i, s16_branchfree) +} + +////////// SINT32 + +__m128i libdivide_s32_do_vec128(__m128i numers, const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = ((uint32_t)1 << shift) - 1; + __m128i roundToZeroTweak = _mm_set1_epi32(mask); + // q = numer + ((numer >> 31) & roundToZeroTweak); + __m128i q = + _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); + q = _mm_srai_epi32(q, shift); + __m128i sign = _mm_set1_epi32((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); + return q; + } else { + __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m128i sign = _mm_set1_epi32((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign)); + } + // q >>= shift + q = _mm_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); + q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0) + return q; + } +} + +__m128i libdivide_s32_branchfree_do_vec128( + __m128i numers, const struct libdivide_s32_branchfree_t *denom) { + int32_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + // must be arithmetic shift + __m128i sign = _mm_set1_epi32((int8_t)more >> 7); + __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(magic)); + q = _mm_add_epi32(q, numers); // q += numers + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2 + uint32_t is_power_of_2 = (magic == 0); + __m128i q_sign = _mm_srai_epi32(q, 31); // q_sign = q >> 31 + __m128i mask = _mm_set1_epi32(((uint32_t)1 << shift) - is_power_of_2); + q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm_srai_epi32(q, shift); // q >>= shift + q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +////////// SINT64 + +__m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *denom) { + uint8_t more = denom->more; + int64_t magic = denom->magic; + if (magic == 0) { // shift path + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + uint64_t mask = ((uint64_t)1 << shift) - 1; + __m128i roundToZeroTweak = _mm_set1_epi64x(mask); + // q = numer + ((numer >> 
63) & roundToZeroTweak); + __m128i q = + _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak)); + q = libdivide_s64_shift_right_vec128(q, shift); + __m128i sign = _mm_set1_epi32((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); + return q; + } else { + __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m128i sign = _mm_set1_epi32((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign)); + } + // q >>= denom->mult_path.shift + q = libdivide_s64_shift_right_vec128(q, more & LIBDIVIDE_64_SHIFT_MASK); + q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0) + return q; + } +} + +__m128i libdivide_s64_branchfree_do_vec128( + __m128i numers, const struct libdivide_s64_branchfree_t *denom) { + int64_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + // must be arithmetic shift + __m128i sign = _mm_set1_epi32((int8_t)more >> 7); + + // libdivide_mullhi_s64(numers, magic); + __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic)); + q = _mm_add_epi64(q, numers); // q += numers + + // If q is non-negative, we have nothing to do. + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2. + uint32_t is_power_of_2 = (magic == 0); + __m128i q_sign = libdivide_s64_signbits_vec128(q); // q_sign = q >> 63 + __m128i mask = _mm_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2); + q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s64_shift_right_vec128(q, shift); // q >>= shift + q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +#endif + +/////////// C++ stuff + +#ifdef __cplusplus + +namespace libdivide { + +enum Branching { + BRANCHFULL, // use branching algorithms + BRANCHFREE // use branchfree algorithms +}; + +#if defined(LIBDIVIDE_NEON) +// Helper to deduce NEON vector type for integral type. +template <typename T> +struct NeonVecFor {}; + +template <> +struct NeonVecFor<uint16_t> { + typedef uint16x8_t type; +}; + +template <> +struct NeonVecFor<int16_t> { + typedef int16x8_t type; +}; + +template <> +struct NeonVecFor<uint32_t> { + typedef uint32x4_t type; +}; + +template <> +struct NeonVecFor<int32_t> { + typedef int32x4_t type; +}; + +template <> +struct NeonVecFor<uint64_t> { + typedef uint64x2_t type; +}; + +template <> +struct NeonVecFor<int64_t> { + typedef int64x2_t type; +}; +#endif + +// Versions of our algorithms for SIMD.
+#if defined(LIBDIVIDE_NEON) +#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) \ + LIBDIVIDE_INLINE typename NeonVecFor<INT_TYPE>::type divide( \ + typename NeonVecFor<INT_TYPE>::type n) const { \ + return libdivide_##ALGO##_do_vec128(n, &denom); \ + } +#else +#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) +#endif +#if defined(LIBDIVIDE_SSE2) +#define LIBDIVIDE_DIVIDE_SSE2(ALGO) \ + LIBDIVIDE_INLINE __m128i divide(__m128i n) const { \ + return libdivide_##ALGO##_do_vec128(n, &denom); \ + } +#else +#define LIBDIVIDE_DIVIDE_SSE2(ALGO) +#endif + +#if defined(LIBDIVIDE_AVX2) +#define LIBDIVIDE_DIVIDE_AVX2(ALGO) \ + LIBDIVIDE_INLINE __m256i divide(__m256i n) const { \ + return libdivide_##ALGO##_do_vec256(n, &denom); \ + } +#else +#define LIBDIVIDE_DIVIDE_AVX2(ALGO) +#endif + +#if defined(LIBDIVIDE_AVX512) +#define LIBDIVIDE_DIVIDE_AVX512(ALGO) \ + LIBDIVIDE_INLINE __m512i divide(__m512i n) const { \ + return libdivide_##ALGO##_do_vec512(n, &denom); \ + } +#else +#define LIBDIVIDE_DIVIDE_AVX512(ALGO) +#endif + +// The DISPATCHER_GEN() macro generates C++ methods (for the given integer +// and algorithm types) that redirect to libdivide's C API. +#define DISPATCHER_GEN(T, ALGO) \ + libdivide_##ALGO##_t denom; \ + LIBDIVIDE_INLINE dispatcher() {} \ + LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {} \ + LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \ + LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \ + LIBDIVIDE_DIVIDE_NEON(ALGO, T) \ + LIBDIVIDE_DIVIDE_SSE2(ALGO) \ + LIBDIVIDE_DIVIDE_AVX2(ALGO) \ + LIBDIVIDE_DIVIDE_AVX512(ALGO) + +// The dispatcher selects a specific division algorithm for a given +// type and ALGO using partial template specialization. +template <typename T, Branching ALGO> +struct dispatcher {}; + +template <> +struct dispatcher<int16_t, BRANCHFULL> { + DISPATCHER_GEN(int16_t, s16) +}; +template <> +struct dispatcher<int16_t, BRANCHFREE> { + DISPATCHER_GEN(int16_t, s16_branchfree) +}; +template <> +struct dispatcher<uint16_t, BRANCHFULL> { + DISPATCHER_GEN(uint16_t, u16) +}; +template <> +struct dispatcher<uint16_t, BRANCHFREE> { + DISPATCHER_GEN(uint16_t, u16_branchfree) +}; +template <> +struct dispatcher<int32_t, BRANCHFULL> { + DISPATCHER_GEN(int32_t, s32) +}; +template <> +struct dispatcher<int32_t, BRANCHFREE> { + DISPATCHER_GEN(int32_t, s32_branchfree) +}; +template <> +struct dispatcher<uint32_t, BRANCHFULL> { + DISPATCHER_GEN(uint32_t, u32) +}; +template <> +struct dispatcher<uint32_t, BRANCHFREE> { + DISPATCHER_GEN(uint32_t, u32_branchfree) +}; +template <> +struct dispatcher<int64_t, BRANCHFULL> { + DISPATCHER_GEN(int64_t, s64) +}; +template <> +struct dispatcher<int64_t, BRANCHFREE> { + DISPATCHER_GEN(int64_t, s64_branchfree) +}; +template <> +struct dispatcher<uint64_t, BRANCHFULL> { + DISPATCHER_GEN(uint64_t, u64) +}; +template <> +struct dispatcher<uint64_t, BRANCHFREE> { + DISPATCHER_GEN(uint64_t, u64_branchfree) +}; + +// This is the main divider class for use by the user (C++ API). +// The actual division algorithm is selected using the dispatcher struct +// based on the integer and algorithm template parameters. +template <typename T, Branching ALGO = BRANCHFULL> +class divider { + private: + typedef dispatcher<T, ALGO> dispatcher_t; + + public: + // We leave the default constructor empty so that creating + // an array of dividers and then initializing them + // later doesn't slow us down. + divider() {} + + // Constructor that takes the divisor as a parameter + LIBDIVIDE_INLINE divider(T d) : div(d) {} + + // Divides n by the divisor + LIBDIVIDE_INLINE T divide(T n) const { return div.divide(n); } + + // Recovers the divisor, returns the value that was + // used to initialize this divider object.
+ T recover() const { return div.recover(); } + + bool operator==(const divider &other) const { + return div.denom.magic == other.denom.magic && div.denom.more == other.denom.more; + } + + bool operator!=(const divider &other) const { return !(*this == other); } + + // Vector variants treat the input as packed integer values with the same type as the divider + // (e.g. s32, u32, s64, u64) and divides each of them by the divider, returning the packed + // quotients. +#if defined(LIBDIVIDE_SSE2) + LIBDIVIDE_INLINE __m128i divide(__m128i n) const { return div.divide(n); } +#endif +#if defined(LIBDIVIDE_AVX2) + LIBDIVIDE_INLINE __m256i divide(__m256i n) const { return div.divide(n); } +#endif +#if defined(LIBDIVIDE_AVX512) + LIBDIVIDE_INLINE __m512i divide(__m512i n) const { return div.divide(n); } +#endif +#if defined(LIBDIVIDE_NEON) + LIBDIVIDE_INLINE typename NeonVecFor<T>::type divide(typename NeonVecFor<T>::type n) const { + return div.divide(n); + } +#endif + + private: + // Storage for the actual divisor + dispatcher_t div; +}; + +// Overload of operator / for scalar division +template <typename T, Branching ALGO> +LIBDIVIDE_INLINE T operator/(T n, const divider<T, ALGO> &div) { + return div.divide(n); +} + +// Overload of operator /= for scalar division +template <typename T, Branching ALGO> +LIBDIVIDE_INLINE T &operator/=(T &n, const divider<T, ALGO> &div) { + n = div.divide(n); + return n; +} + +// Overloads for vector types. +#if defined(LIBDIVIDE_SSE2) +template <typename T, Branching ALGO> +LIBDIVIDE_INLINE __m128i operator/(__m128i n, const divider<T, ALGO> &div) { + return div.divide(n); +} + +template <typename T, Branching ALGO> +LIBDIVIDE_INLINE __m128i operator/=(__m128i &n, const divider<T, ALGO> &div) { + n = div.divide(n); + return n; +} +#endif +#if defined(LIBDIVIDE_AVX2) +template <typename T, Branching ALGO> +LIBDIVIDE_INLINE __m256i operator/(__m256i n, const divider<T, ALGO> &div) { + return div.divide(n); +} + +template <typename T, Branching ALGO> +LIBDIVIDE_INLINE __m256i operator/=(__m256i &n, const divider<T, ALGO> &div) { + n = div.divide(n); + return n; +} +#endif +#if defined(LIBDIVIDE_AVX512) +template <typename T, Branching ALGO> +LIBDIVIDE_INLINE __m512i operator/(__m512i n, const divider<T, ALGO> &div) { + return div.divide(n); +} + +template <typename T, Branching ALGO> +LIBDIVIDE_INLINE __m512i operator/=(__m512i &n, const divider<T, ALGO> &div) { + n = div.divide(n); + return n; +} +#endif + +#if defined(LIBDIVIDE_NEON) +template <typename T, Branching ALGO> +LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/(typename NeonVecFor<T>::type n, const divider<T, ALGO> &div) { + return div.divide(n); +} + +template <typename T, Branching ALGO> +LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/=(typename NeonVecFor<T>::type &n, const divider<T, ALGO> &div) { + n = div.divide(n); + return n; +} +#endif + +#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900) +// libdivide::branchfree_divider +template <typename T> +using branchfree_divider = divider<T, BRANCHFREE>; +#endif + +} // namespace libdivide + +#endif // __cplusplus + +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +#endif // LIBDIVIDE_H diff --git a/src/hardened_malloc/util.c b/src/hardened_malloc/util.c new file mode 100644 index 0000000..a3d6f0c --- /dev/null +++ b/src/hardened_malloc/util.c @@ -0,0 +1,41 @@ +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +#include <unistd.h> + +#ifdef __ANDROID__ +#include <async_safe/log.h> +#endif + +#include "util.h" + +#ifndef __ANDROID__ +static int write_full(int fd, const char *buf, size_t length) { + do { + ssize_t bytes_written = write(fd, buf, length); + if (bytes_written == -1) { + if (errno == EINTR) { + continue; + } + return -1; + } + buf += bytes_written; + length -= bytes_written; + } while (length); + + return 0; +} +#endif + +COLD noreturn void fatal_error(const char *s) { +#ifdef __ANDROID__ + async_safe_fatal("hardened_malloc: fatal allocator error: %s", s); +#else + const
char *prefix = "fatal allocator error: "; + (void)(write_full(STDERR_FILENO, prefix, strlen(prefix)) != -1 && + write_full(STDERR_FILENO, s, strlen(s)) != -1 && + write_full(STDERR_FILENO, "\n", 1)); + abort(); +#endif +} diff --git a/src/hardened_malloc/util.h b/src/hardened_malloc/util.h new file mode 100644 index 0000000..fc22c23 --- /dev/null +++ b/src/hardened_malloc/util.h @@ -0,0 +1,88 @@ +#ifndef UTIL_H +#define UTIL_H + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +// C11 noreturn doesn't work in C++ +#define noreturn __attribute__((noreturn)) + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#define min(x, y) ({ \ + __typeof__(x) _x = (x); \ + __typeof__(y) _y = (y); \ + (void) (&_x == &_y); \ + _x < _y ? _x : _y; }) + +#define max(x, y) ({ \ + __typeof__(x) _x = (x); \ + __typeof__(y) _y = (y); \ + (void) (&_x == &_y); \ + _x > _y ? _x : _y; }) + +#define COLD __attribute__((cold)) +#define UNUSED __attribute__((unused)) +#define EXPORT __attribute__((visibility("default"))) + +#define STRINGIFY(s) #s +#define ALIAS(f) __attribute__((alias(STRINGIFY(f)))) + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +typedef unsigned __int128 u128; + +#define U64_WIDTH 64 + +static inline int ffz64(u64 x) { + return __builtin_ffsll(~x); +} + +// parameter must not be 0 +static inline int clz64(u64 x) { + return __builtin_clzll(x); +} + +// parameter must not be 0 +static inline u64 log2u64(u64 x) { + return U64_WIDTH - clz64(x) - 1; +} + +static inline size_t align(size_t size, size_t align) { + size_t mask = align - 1; + return (size + mask) & ~mask; +} + +// u4_arr_{set,get} are helper functions for using u8 array as an array of unsigned 4-bit values. + +// val is treated as a 4-bit value +static inline void u4_arr_set(u8 *arr, size_t idx, u8 val) { + size_t off = idx >> 1; + size_t shift = (idx & 1) << 2; + u8 mask = (u8) (0xf0 >> shift); + arr[off] = (arr[off] & mask) | (val << shift); +} + +static inline u8 u4_arr_get(const u8 *arr, size_t idx) { + size_t off = idx >> 1; + size_t shift = (idx & 1) << 2; + return (u8) ((arr[off] >> shift) & 0xf); +} + +COLD noreturn void fatal_error(const char *s); + +#if CONFIG_SEAL_METADATA + +#ifdef __GLIBC__ +#define USE_PKEY +#else +#error "CONFIG_SEAL_METADATA requires Memory Protection Key support" +#endif + +#endif // CONFIG_SEAL_METADATA + +#endif diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..b204ba2 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,78 @@ +#![no_std] + +use core::ffi::{c_void, c_int}; + +extern crate libc; + +extern "C" { + /* + TODO: implement this + + #ifdef __ANDROID__ +#define H_MALLOC_USABLE_SIZE_CONST const +#else +#define H_MALLOC_USABLE_SIZE_CONST +#endif + + for: + // glibc extensions +size_t h_malloc_usable_size(H_MALLOC_USABLE_SIZE_CONST void *ptr); + */ + + /* C standard */ + + pub fn h_malloc(size: usize) -> *mut c_void; + pub fn h_calloc(nmemb: usize, size: usize) -> *mut c_void; + pub fn h_realloc(ptr: *mut c_void, size: usize) -> *mut c_void; + pub fn h_aligned_alloc(alignment: usize, size: usize) -> *mut c_void; + pub fn h_free(ptr: *mut c_void); + + /* POSIX */ + + pub fn h_posix_memalign(memptr: *mut *mut c_void, alignment: usize, size: usize) -> c_int; + + /* glibc extensions */ + + pub fn h_malloc_usable_size(ptr: *const c_void) -> usize; + pub fn h_mallopt(param: c_int, value: c_int) -> c_int; + pub fn h_malloc_trim(pad: usize) -> c_int; + pub fn h_malloc_stats(); + + /* obsolete glibc
extensions */ + + pub fn h_memalign(alignment: usize, size: usize) -> *mut c_void; + pub fn h_pvalloc(size: usize) -> *mut c_void; + pub fn h_cfree(ptr: *mut c_void); + pub fn h_malloc_get_state() -> *mut c_void; + pub fn h_malloc_set_state(ptr: *mut c_void) -> c_int; + + /*TODO: implement this see the top: + #if defined(__GLIBC__) || defined(__ANDROID__) +struct mallinfo h_mallinfo(void); +#endif +#ifndef __ANDROID__ +int h_malloc_info(int options, FILE *fp); +#endif + */ + + /* hardened_malloc extensions */ + + /// return an upper bound on object size for any pointer based on malloc metadata + pub fn h_malloc_object_size(ptr: *const c_void) -> usize; + + /// similar to malloc_object_size, but avoids locking so the results are much more limited + pub fn h_malloc_object_size_fast(ptr: *const c_void) -> usize; + + + /// The free function with an extra parameter for passing the size requested at + /// allocation time. + /// + /// This offers the same functionality as C++14 sized deallocation and can be + /// used to implement it. + /// + /// A performance-oriented allocator would use this as a performance + /// enhancement with undefined behavior on a mismatch. Instead, this hardened + /// allocator implementation uses it to improve security by checking that the + /// passed size matches the allocated size. + pub fn h_free_sized(ptr: *mut c_void, expected_size: usize); +}
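
For reviewers, the sketch below shows one way a downstream crate could consume these raw bindings by plugging them into Rust's GlobalAlloc interface. It is illustrative only and not part of this patch: the crate name hardened_malloc_sys (derived from the package name), the HardenedAlloc type, and the alignment handling are assumptions; the only items taken from src/lib.rs above are h_posix_memalign and h_free_sized.

    use core::alloc::{GlobalAlloc, Layout};
    use core::ffi::c_void;
    use core::mem::size_of;
    use core::ptr;

    // Assumed crate name for the bindings declared in src/lib.rs above.
    use hardened_malloc_sys::{h_free_sized, h_posix_memalign};

    pub struct HardenedAlloc;

    unsafe impl GlobalAlloc for HardenedAlloc {
        unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
            // posix_memalign needs the alignment to be a power of two (guaranteed
            // by Layout) and at least the size of a pointer, so round it up.
            let align = layout.align().max(size_of::<*mut c_void>());
            let mut out: *mut c_void = ptr::null_mut();
            if h_posix_memalign(&mut out, align, layout.size()) != 0 {
                return ptr::null_mut();
            }
            out.cast()
        }

        unsafe fn dealloc(&self, p: *mut u8, layout: Layout) {
            // Passing the layout size lets hardened_malloc check it against the
            // size it recorded for this allocation (see h_free_sized above).
            h_free_sized(p.cast(), layout.size());
        }
    }

    #[global_allocator]
    static GLOBAL: HardenedAlloc = HardenedAlloc;

Routing every allocation through h_posix_memalign keeps the sketch short, since it accepts any power-of-two alignment without the size-multiple requirement of aligned_alloc; a real wrapper would likely fast-path small alignments through h_malloc and also implement realloc and alloc_zeroed on top of h_realloc and h_calloc.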