// Echo Writes Code
//
// test_hash.cpp

#include "crucible/core.hpp"
#include "crucible/test.hpp"

#include <cstdint>

namespace test_hash
{
	// Sanity check for crucible::hash on short strings: hashes a fixed word list, drops every hash
	// into a simulated bucket array, and asserts that fewer than half of the occupied buckets end
	// up holding more than one word.
	CRUCIBLE_TEST_SCENARIO(hash_short_strings)
	{
		// Fixed input set: 256 random dictionary words
		auto const words {
			crucible::make_heap_buffer<crucible::string_view>({
				"physoclist", "Gordyaean", "Panionian", "infirm", "gigerium", "gymnosporous", "discredit",
				"angulated", "onus", "pharmacopsychology", "brutalize", "variolization", "unmindfulness",
				"woodwall", "cryingly", "phytolithological", "convenience", "tregadyne", "arecoline", "reunite",
				"horsefoot", "spatted", "nannoplankton", "Noam", "Tabellariaceae", "spaework", "align", "tell",
				"preworthy", "photopography", "scleranth", "ungainly", "cumene", "nosotrophy", "barb",
				"semiduplex", "Lapageria", "hyperimmunity", "tritely", "oversusceptible", "trilabe",
				"sufficientness", "valethood", "Malebranchism", "Paumari", "preliterally", "slobbery",
				"refringence", "lam", "preindulge", "agitative", "cheirography", "Mephistopheleanly", "ladkin",
				"turnhall", "sunburntness", "metaplasm", "Madonna", "pollable", "mootman", "cheddaring",
				"pontic", "hypsoisotherm", "libretto", "granulation", "sarsenet", "megaerg",
				"adrenocorticotropic", "Soohong", "gris", "Eucommia", "wrathfulness", "woodruff",
				"corrigibleness", "enthraldom", "logeum", "unreeving", "archdetective", "Maskins",
				"pancreatalgia", "miseffect", "Norseland", "ureteropyosis", "Coniothyrium", "pteropaedes",
				"Ctenophora", "enterocele", "was", "superambitious", "bridgebuilding", "triglochin", "glider",
				"exaltative", "drivescrew", "hypophloeous", "deboistness", "declensionally", "constringe",
				"uncontrovertibly", "testaceology", "Kate", "phrenosinic", "Venezuelan", "syllogistics",
				"megacerine", "speech", "unordinariness", "fooling", "reannexation", "blackfire", "sloosh",
				"tracheolar", "callidness", "zokor", "telemeteorographic", "basial", "pteridophilism",
				"wristfall", "unproportionedness", "amidoketone", "thermantic", "Kshatriyahood", "Sertularia",
				"neurataxia", "Calite", "thaumoscopic", "pruning", "unalleviably", "kingless",
				"irreconcilability", "Claude", "Sphaerocarpus", "parasitophobia", "nebbuck", "kinnikinnick",
				"dockization", "Abrahamite", "repurchase", "Homerian", "charlatanically", "polderboy",
				"saurischian", "cardiorrhaphy", "apoplectoid", "mankin", "allotee", "aminate", "notal",
				"Hypnaceae", "identism", "understring", "Anaxagorean", "titty", "salvability",
				"hydrophthalmus", "esponton", "brassily", "Cocanucos", "slugging", "palet", "spontaneous",
				"cation", "superagency", "challie", "Mirach", "cinematographic", "alexipyretic",
				"corticosterone", "semihyperbolical", "mewer", "chivalresque", "nights", "subequally", "knotty",
				"nonvariable", "ambrosine", "Sphenopteris", "Adai", "meteorite", "tomjohn", "Yokuts",
				"Franklin", "unjudgelike", "violoncellist", "abolitionist", "festerment", "Novemberish",
				"unicostate", "noncirculation", "claught", "scattered", "curatial", "sweetberry", "unbragged",
				"Llanberisslate", "birational", "thurible", "thetically", "cipolin", "prestress", "tailpipe",
				"thiuram", "participable", "redolence", "lobstering", "feveret", "microsublimation",
				"choreutic", "tagrag", "platitudinization", "solipedous", "Alopiidae", "Hildebrandist",
				"silaginoid", "bore", "festucine", "thymocyte", "blacker", "gib", "hydrargyrism",
				"unapplicableness", "unintelligibly", "Mister", "rentaller", "undermine", "Emesidae",
				"coxcombhood", "squidge", "postponement", "magnify", "stapelia", "diathermotherapy", "chutney",
				"stupefactive", "holler", "vociferous", "visie", "scrounger", "homoerotic", "synchronized",
				"puddinghouse", "acuclosure", "zoophytical", "unprovide", "noncustomary", "twitch",
				"nonauthentication", "frow", "klop", "unfeigningly", "encampment", "preregistration",
				"mimeographic", "delphacid", "kilter", "remarry"
			})
		};

		// Run every word through crucible::hash and collect the results as 64-bit values
		auto const digests {
			crucible::iterate(words)
				.map([](crucible::string_view const word) {
						return crucible::hash(word);
				}).materialize<crucible::heap_buffer<std::uint64_t>>()
		};

		// Use as many buckets as there are keys, which simulates a really bad load factor.
		// NOTE(review): the tallying below presumes make_heap_buffer(count) yields `count` counters
		// that all start at zero -- confirm against crucible/core.hpp.
		auto bucket_counts { crucible::make_heap_buffer<std::size_t>(words.size()) };
		auto const bucket_total { bucket_counts.size() };

		// Stand-in for a hash table: rather than storing items, only tally how many of them would
		// land in each bucket when an item is placed at hash(item) modulo (number of buckets)
		for (std::size_t word_index { 0 }; word_index != digests.size(); ++word_index) {
			auto const slot { digests[word_index] % bucket_total };

#if 0 // Debugging information
			CRUCIBLE_PRINT("hash({}) = {} ({} mod {})", words[word_index], digests[word_index], slot, bucket_total);
#endif

			bucket_counts[slot] += 1;

#if 0 // Debugging information
			if (bucket_counts[slot] > 1) {
				CRUCIBLE_PRINT("collision at {}", words[word_index]);
			}
#endif

		}

		// Deliberately loose criterion: with as many buckets as keys, fewer than half of the
		// populated buckets (item count > 0) may contain a collision (item count > 1).
		//
		// Stricter statistical checks exist (the chi-squared test is one worth looking into), but
		// the aim here is only to confirm that the API works and spreads hashes more or less evenly
		// (i.e., hash tables won't utterly break), not to prove anything in particular about the
		// hash function itself.

		// Fold step: adds one to the running total for every bucket holding at least one item
		auto const tally_populated {
			[](double const running_total, std::size_t const item_count) {
				if (item_count > 0) {
					return running_total + 1.0;
				}
				return running_total + 0.0;
			}
		};

		// Fold step: adds one to the running total for every bucket holding two or more items
		auto const tally_collided {
			[](double const running_total, std::size_t const item_count) {
				if (item_count > 1) {
					return running_total + 1.0;
				}
				return running_total + 0.0;
			}
		};

		double const populated_bucket_count {
			crucible::iterate(bucket_counts).fold(tally_populated, 0.0)
		};

		double const collision_bucket_count {
			crucible::iterate(bucket_counts).fold(tally_collided, 0.0)
		};

#if 0 // Debugging information
		double const load { digests.size() / static_cast<double>(bucket_total) };
		double const ratio { collision_bucket_count / populated_bucket_count };

		CRUCIBLE_PRINT("buckets: {} collisions: {} populated: {} ratio: {} load: {}", bucket_total, collision_bucket_count, populated_bucket_count, ratio, load);
#endif

		CRUCIBLE_ASSERT_LT(collision_bucket_count / populated_bucket_count, 0.5);
	}
}

// NOTE(review): presumably expands to the test program's entry point, which runs the scenarios
// declared in this file -- confirm against crucible/test.hpp.
CRUCIBLE_TEST_MAIN