hash.cpp

100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
#include "crucible/core/hash.hpp"

#include <bit>
#include <climits>
#include <cstddef>
#include <cstdint>
#include <functional>

#include "crucible/core/random.hpp"

namespace crucible
{
	namespace
	{
		// Type-erased signature used for the active hash function: it receives a pointer to a byte
		// buffer plus the buffer's length in bytes, and returns a 64-bit hash
		using hash_function = std::function<std::uint64_t(std::byte const *, std::size_t)>;

		// Runs the 64-bit FNV-1a mixing loop over `size` bytes starting at `data`, beginning from the
		// caller-supplied `basis` instead of the standard offset basis.
		//
		// `data` is only dereferenced when `size` is non-zero, so a null pointer is fine for an empty
		// input (in which case `basis` is returned unchanged).
		[[nodiscard]] constexpr auto fnv_hash_with_basis(std::uint64_t const basis, std::byte const *data, std::size_t const size) -> std::uint64_t
		{
			// This algorithm assumes 8-bit bytes
			static_assert(sizeof(std::byte) == sizeof(char));
			static_assert(CHAR_BIT == 8);

			// The 64-bit FNV prime
			constexpr std::uint64_t fnv_prime { 0x100000001b3 };

			auto hash { basis };

			for (std::size_t i { 0 }; i < size; ++i) {
				// FNV-1a order: XOR the octet into the low byte first...
				hash ^= std::to_integer<std::uint64_t>(data[i]);

				// ...then multiply by the prime.
				//
				// Safe: overflow is possible here, but the algorithm specifically asks for the result of the
				// multiply to be truncated to 64 bits (which is how unsigned overflow works anyway)
				hash *= fnv_prime;
			}

			return hash;
		}

		// Fowler-Noll-Vo 1a algorithm by Glenn Fowler, Landon Curt Noll, and Kiem-Phong Vo
		// See: https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
		//
		// Returns the standard 64-bit FNV-1a hash of the `size` bytes starting at `data`.
		[[nodiscard]] constexpr auto fnv_hash(std::byte const *data, std::size_t const size) -> std::uint64_t
		{
			// The standard 64-bit offset basis. By definition it is the FNV-0 hash (multiply first, then
			// XOR, starting from zero) of the 32-octet signature `chongo <Landon Curt Noll> /\../\`.
			//
			// It cannot be derived by feeding that signature through `fnv_hash_with_basis()`, because
			// FNV-1a XORs before it multiplies and therefore produces a different value; we spell out the
			// published constant instead so that our hashes match every other FNV-1a implementation.
			constexpr std::uint64_t offset_basis { 0xcbf29ce484222325 };

			return fnv_hash_with_basis(offset_basis, data, size);
		}

		// Keyed SipHash-C-D function object (SipHash-2-4 by default).
		//
		// `C` is the number of SipRounds applied per 8-byte message word and `D` is the number of
		// SipRounds applied during finalization. The 128-bit key is supplied to the constructor as two
		// 64-bit halves.
		template<std::size_t C = 2, std::size_t D = 4>
		class sip_hash
		{
		public:
			// `k0` and `k1` are the low and high 64-bit halves of the secret key
			sip_hash(std::uint64_t k0, std::uint64_t k1) :
				m_k0 { k0 },
				m_k1 { k1 }
			{}

			// SipHash algorithm by Jean-Philippe Aumasson and Daniel J. Bernstein
			// See: https://cr.yp.to/siphash/siphash-20120918.pdf
			//
			// Returns the 64-bit SipHash-C-D tag of the `size` bytes starting at `data`. `data` is only
			// dereferenced when `size` is non-zero, so a null pointer is fine for an empty message.
			[[nodiscard]] auto operator()(std::byte const *data, std::size_t const size) const -> std::uint64_t
			{
				constexpr std::size_t word_size { sizeof(std::uint64_t) };

				// Initialization
				std::uint64_t v0 { m_k0 ^ 0x736f6d6570736575 };
				std::uint64_t v1 { m_k1 ^ 0x646f72616e646f6d };
				std::uint64_t v2 { m_k0 ^ 0x6c7967656e657261 };
				std::uint64_t v3 { m_k1 ^ 0x7465646279746573 };

				// One SipRound. The statement order matters: `v2` must absorb `v3` *before* it absorbs `v1`
				// and gets rotated, exactly as in the paper and the reference implementation.
				auto sip_round {
					[&v0, &v1, &v2, &v3]() {
						v0 += v1;
						v1 = rotate_left(v1, 13);
						v1 ^= v0;
						v0 = rotate_left(v0, 32);

						v2 += v3;
						v3 = rotate_left(v3, 16);
						v3 ^= v2;

						v0 += v3;
						v3 = rotate_left(v3, 21);
						v3 ^= v0;

						v2 += v1;
						v1 = rotate_left(v1, 17);
						v1 ^= v2;
						v2 = rotate_left(v2, 32);
					}
				};

				// Absorbs a single 64-bit message word into the state
				auto compress {
					[&v0, &v3, &sip_round](std::uint64_t const word) {
						v3 ^= word;

						for (std::size_t c { 0 }; c < C; ++c) {
							sip_round();
						}

						v0 ^= word;
					}
				};

				// Compression: consume every *complete* 8-byte word. Comparing the remaining length (rather
				// than `i + word_size`) cannot overflow, and `>=` makes sure that a message whose length is
				// a multiple of eight leaves an empty tail instead of a full word.
				std::size_t i { 0 };

				while (size - i >= word_size) {
					compress(load_little_endian(data + i, word_size));
					i += word_size;
				}

				// The final word holds the 0-7 leftover bytes in its low bytes and the message length
				// (modulo 256) in its most significant byte
				auto const length_byte { static_cast<std::uint64_t>(size & 0xff) << 56 };
				compress(load_little_endian(data + i, size - i) | length_byte);

				// Finalization
				v2 ^= 0xff;

				for (std::size_t d { 0 }; d < D; ++d) {
					sip_round();
				}

				return v0 ^ v1 ^ v2 ^ v3;
			}

		private:
			// Rotates `value` left by `count` bits; `count` must be in the range [1, 63]
			[[nodiscard]] static constexpr auto rotate_left(std::uint64_t const value, unsigned const count) -> std::uint64_t
			{
				return (value << count) | (value >> (64 - count));
			}

			// Assembles up to eight bytes into a 64-bit word, least significant byte first. Building the
			// word arithmetically gives the same result regardless of the host's endianness, and never
			// touches `bytes` when `count` is zero.
			[[nodiscard]] static constexpr auto load_little_endian(std::byte const *bytes, std::size_t const count) -> std::uint64_t
			{
				std::uint64_t word { 0 };

				for (std::size_t b { 0 }; b < count; ++b) {
					word |= std::to_integer<std::uint64_t>(bytes[b]) << (8 * b);
				}

				return word;
			}

			std::uint64_t m_k0 { 0 };

			std::uint64_t m_k1 { 0 };
		};
	}

	auto hash(void const *pointer) -> std::uint64_t
	{
		// Safe: we are allowed to convert any pointer to `std::uintptr_t`, as long as we either don't
		// change it or don't try to convert it back to a pointer
		auto const address = reinterpret_cast<std::uintptr_t>(pointer);

		// Address models std::integral, which we already know how to hash
		return hash(address);
	}

	// State owned by `hash_manager_handle`: the hash function currently in use
	struct hash_manager_data
	{
		// Starts out as FNV
		hash_function hash = fnv_hash;
	};

	// Takes ownership of the manager's state
	hash_manager_handle::hash_manager_handle(std::unique_ptr<hash_manager_data> data) :
		m_data(std::move(data))
	{}

	// Attempts to switch the active hash function to a SipHash instance keyed with two random 64-bit
	// secrets. If either secret cannot be read, the current hash function is left in place.
	auto hash_manager_handle::initialize() -> void
	{
		// Initializing the hash manager a second time is an error: doing so would invalidate every
		// existing hash table, and we have no way of telling them
		CRUCIBLE_ASSERT_NOT(m_initialized);
		m_initialized = true;

		auto first_secret { read_random_integer<std::uint64_t>() };

		if (first_secret.is_failure()) {
			return;
		}

		auto second_secret { read_random_integer<std::uint64_t>() };

		if (second_secret.is_failure()) {
			return;
		}

		// SipHash is only selected once both random secrets have been read successfully from the
		// host's entropy pool; otherwise we keep the FNV fallback.
		//
		// Which hash function is chosen, and any state such as these keys, may differ between systems,
		// between programs, and even between runs. The only guarantee is that hashes stay consistent
		// for the lifetime of the current process (i.e., once a hash function has been chosen it is
		// never swapped out).
		m_data->hash = sip_hash { first_secret.success(), second_secret.success() };
	}

	auto hash_manager_handle::hash_text(char const *text, std::size_t const size) const -> std::uint64_t
	{
		// Safe: we can alias anything through a `std::byte` pointer
		auto const *data { reinterpret_cast<std::byte const *>(text) };

		// Make sure we can use the same `size`
		static_assert(sizeof(std::byte) == sizeof(char));

		return m_data->hash(data, size);
	}

	// Hashes `size` bytes starting at `data` with the active hash function
	auto hash_manager_handle::hash_data(std::byte const *data, std::size_t const size) const -> std::uint64_t
	{
		auto const &active_hash { m_data->hash };

		return active_hash(data, size);
	}

	// Returns the process-wide hash manager. It is a function-local static, so it is constructed the
	// first time this function is called and every call returns a reference to that same object.
	auto hash_manager() -> hash_manager_handle &
	{
		static hash_manager_handle instance(std::make_unique<hash_manager_data>());

		return instance;
	}
}