Theory Universal_Hash_Families.Carter_Wegman_Hash_Family
section ‹Carter-Wegman Hash Family\label{sec:carter_wegman}›
theory Carter_Wegman_Hash_Family
imports
Interpolation_Polynomials_HOL_Algebra.Interpolation_Polynomial_Cardinalities
Universal_Hash_Families_More_Independent_Families
begin
text ‹The Carter-Wegman hash family is a generic method to obtain
$k$-universal hash families for arbitrary $k$. (There are faster solutions, such as tabulation
hashing, which are limited to a specific $k$. See for example \<^cite>‹"thorup2010"›.)
The construction was described by Wegman and Carter~\<^cite>‹"wegman1981"›, it is a hash
family between the elements of a finite field and works by choosing randomly a polynomial
over the field with degree less than $k$. The hash function is the evaluation of a such a
polynomial.
Using the property that the fraction of polynomials interpolating a given set of $s \leq k$
points is @{term "1/(card (carrier R)^s)"}, which is shown in
\<^cite>‹"Interpolation_Polynomials_HOL_Algebra-AFP"›, it is possible to obtain both that
the hash functions are $k$-wise independent and uniformly distributed.
In the following two locales are introduced, the main reason for both is to make the statements
of the theorems and proofs more concise. The first locale @{term "poly_hash_family"} fixes a finite
ring $R$ and the probability space of the polynomials of degree less than $k$. Because the ring is
not a field, the family is not yet $k$-universal, but it is still possible to state a few results such
as the fact that the range of the hash function is a subset of the carrier of the ring.
The second locale @{term "carter_wegman_hash_family"} is an extension of the former with the
assumption that $R$ is a field with which the $k$-universality follows.
The reason for using two separate locales is to support use cases, where the ring is only probably
a field. For example if it is the set of integers modulo an approximate prime, in such a situation a
subset of the properties of an algorithm using approximate primes would need to be verified
even if $R$ is only a ring.›
definition (in ring) "hash x ω = eval ω x"
locale poly_hash_family = ring +
fixes k :: nat
assumes finite_carrier[simp]: "finite (carrier R)"
assumes k_ge_0: "k > 0"
begin
definition space where "space = bounded_degree_polynomials R k"
definition M where "M = measure_pmf (pmf_of_set space)"
lemma finite_space[simp]:"finite space"
unfolding space_def using fin_degree_bounded finite_carrier by simp
lemma non_empty_bounded_degree_polynomials[simp]:"space ≠ {}"
unfolding space_def using non_empty_bounded_degree_polynomials by simp
text ‹This is to add @{thm [source] carrier_not_empty} to the simp set in the context of
@{locale "poly_hash_family"}:›
lemma non_empty_carrier[simp]: "carrier R ≠ {}"
by (simp add:carrier_not_empty)
sublocale prob_space "M"
by (simp add:M_def prob_space_measure_pmf)
lemma hash_range[simp]:
assumes "ω ∈ space"
assumes "x ∈ carrier R"
shows "hash x ω ∈ carrier R"
using assms unfolding hash_def space_def bounded_degree_polynomials_def
by (simp, metis eval_in_carrier polynomial_incl univ_poly_carrier)
lemma hash_range_2:
assumes "ω ∈ space"
shows "(λx. hash x ω) ` carrier R ⊆ carrier R"
using hash_range assms by auto
lemma integrable_M[simp]:
fixes f :: "'a list ⇒ 'c::{banach, second_countable_topology}"
shows "integrable M f"
unfolding M_def
by (rule integrable_measure_pmf_finite, simp)
end
locale carter_wegman_hash_family = poly_hash_family +
assumes field_R: "field R"
begin
sublocale field
using field_R by simp
abbreviation "field_size ≡ card (carrier R)"
lemma poly_cards:
assumes "K ⊆ carrier R"
assumes "card K ≤ k"
assumes "y ` K ⊆ (carrier R)"
shows
"card {ω ∈ space. (∀k ∈ K. eval ω k = y k)} = field_size^(k-card K)"
unfolding space_def
using interpolating_polynomials_card[where n="k-card K" and K="K"] assms
using finite_carrier finite_subset by fastforce
lemma poly_cards_single:
assumes "x ∈ carrier R"
assumes "y ∈ carrier R"
shows "card {ω ∈ space. eval ω x = y} = field_size^(k-1)"
using poly_cards[where K="{x}" and y="λ_. y", simplified] assms k_ge_0 by simp
lemma hash_prob:
assumes "K ⊆ carrier R"
assumes "card K ≤ k"
assumes "y ` K ⊆ carrier R"
shows
"prob {ω. (∀x ∈ K. hash x ω = y x)} = 1/(real field_size)^card K"
proof -
have "𝟬 ∈ carrier R" by simp
hence a:"field_size > 0"
using finite_carrier card_gt_0_iff by blast
have b:"real (card {ω ∈ space. ∀x∈K. eval ω x = y x}) / real (card space) =
1 / real field_size ^ card K"
using a assms(2)
apply (simp add: frac_eq_eq poly_cards[OF assms(1,2,3)] power_add[symmetric])
by (simp add:space_def bounded_degree_polynomials_card)
show ?thesis
unfolding M_def
by (simp add:hash_def measure_pmf_of_set Int_def b)
qed
lemma prob_single:
assumes "x ∈ carrier R" "y ∈ carrier R"
shows "prob {ω. hash x ω = y} = 1/(real field_size)"
using hash_prob[where K="{x}"] assms finite_carrier k_ge_0 by simp
lemma prob_range:
assumes [simp]:"x ∈ carrier R"
shows "prob {ω. hash x ω ∈ A} = card (A ∩ carrier R) / field_size"
proof -
have "prob {ω. hash x ω ∈ A} = prob (⋃a ∈ A ∩ carrier R. {ω. hash x ω = a})"
by (rule measure_pmf_eq, auto simp:M_def)
also have "... = (∑ a ∈ (A ∩ carrier R). prob {ω. hash x ω = a})"
by (rule measure_finite_Union, auto simp:M_def disjoint_family_on_def)
also have "... = (∑ a ∈ (A ∩ carrier R). 1/(real field_size))"
by (rule sum.cong, auto simp:prob_single)
also have "... = card (A ∩ carrier R) / field_size"
by simp
finally show ?thesis by simp
qed
lemma indep:
assumes "J ⊆ carrier R"
assumes "card J ≤ k"
shows "indep_vars (λ_. discrete) hash J"
proof -
have "𝟬 ∈ carrier R" by simp
hence card_R_ge_0:"field_size > 0"
using card_gt_0_iff finite_carrier by blast
have fin_J: "finite J"
using finite_carrier assms(1) finite_subset by blast
show ?thesis
proof (rule indep_vars_pmf[OF M_def])
fix a
fix J'
assume a: "J' ⊆ J" "finite J'"
have card_J': "card J' ≤ k"
by (metis card_mono order_trans a(1) assms(2) fin_J)
have J'_in_carr: "J' ⊆ carrier R" by (metis order_trans a(1) assms(1))
show "prob {ω. ∀x∈J'. hash x ω = a x} = (∏x∈J'. prob {ω. hash x ω = a x})"
proof (cases "a ` J' ⊆ carrier R")
case True
have a_carr: "⋀x. x ∈ J' ⟹ a x ∈ carrier R" using True by force
have "prob {ω. ∀x∈J'. hash x ω = a x} =
real (card {ω ∈ space. ∀x∈J'. eval ω x = a x}) / real (card space)"
by (simp add:M_def measure_pmf_of_set Int_def hash_def)
also have "... = real (field_size ^ (k - card J')) / real (card space)"
using True by (simp add: poly_cards[OF J'_in_carr card_J'])
also have
"... = real field_size ^ (k - card J') / real field_size ^ k"
by (simp add:space_def bounded_degree_polynomials_card)
also have
"... = real field_size ^ ((k - 1) * card J') / real field_size ^ (k * card J')"
using card_J' by (simp add:power_add[symmetric] power_mult[symmetric]
diff_mult_distrib frac_eq_eq add.commute)
also have
"... = (real field_size ^ (k - 1)) ^ card J' / (real field_size ^ k) ^ card J'"
by (simp add:power_add power_mult)
also have
"... = (∏x∈J'. real (card {ω ∈ space. eval ω x = a x}) / real (card space))"
using a_carr poly_cards_single[OF subsetD[OF J'_in_carr]]
by (simp add:space_def bounded_degree_polynomials_card power_divide)
also have "... = (∏x∈J'. prob {ω. hash x ω = a x})"
by (simp add:measure_pmf_of_set M_def Int_def hash_def)
finally show ?thesis by simp
next
case False
then obtain j where j_def: "j ∈ J'" "a j ∉ carrier R" by blast
have "{ω ∈ space. hash j ω = a j} ⊆ {ω ∈ space. hash j ω ∉ carrier R}"
by (rule subsetI, simp add:j_def)
also have "... ⊆ {}" using j_def(1) J'_in_carr hash_range by blast
finally have b:"{ω ∈ space. hash j ω = a j} = {}" by simp
hence "real (card ({ω ∈ space. hash j ω = a j})) = 0" by simp
hence "(∏x∈J'. real (card {ω ∈ space. hash x ω = a x})) = 0"
using a(2) prod_zero[OF a(2)] j_def(1) by auto
moreover have
"{ω ∈ space. ∀x∈J'. hash x ω = a x} ⊆ {ω ∈ space. hash j ω = a j}"
using j_def by blast
hence "{ω ∈ space. ∀x∈J'. hash x ω = a x} = {}" using b by blast
ultimately show ?thesis
by (simp add:measure_pmf_of_set M_def Int_def prod_dividef)
qed
qed
qed
lemma k_wise_indep:
"k_wise_indep_vars k (λ_. discrete) hash (carrier R)"
unfolding k_wise_indep_vars_def using indep by simp
lemma inj_if_degree_1:
assumes "ω ∈ space"
assumes "degree ω = 1"
shows "inj_on (λx. hash x ω) (carrier R)"
using assms eval_inj_if_degree_1
by (simp add:M_def space_def bounded_degree_polynomials_def hash_def)
lemma uniform:
assumes "i ∈ carrier R"
shows "uniform_on (hash i) (carrier R)"
proof -
have a:
"⋀a. prob {ω. hash i ω ∈ {a}} = indicat_real (carrier R) a / real field_size"
by (subst prob_range[OF assms], simp add:indicator_def)
show ?thesis
by (rule uniform_onI, use a M_def in auto)
qed
text ‹This the main result of this section - the Carter-Wegman hash family is $k$-universal.›
theorem k_universal:
"k_universal k hash (carrier R) (carrier R)"
using uniform k_wise_indep by (simp add:k_universal_def)
end
lemma poly_hash_familyI:
assumes "ring R"
assumes "finite (carrier R)"
assumes "0 < k"
shows "poly_hash_family R k"
using assms
by (simp add:poly_hash_family_def poly_hash_family_axioms_def)
lemma carter_wegman_hash_familyI:
assumes "field F"
assumes "finite (carrier F)"
assumes "0 < k"
shows "carter_wegman_hash_family F k"
using assms field.is_ring[OF assms(1)] poly_hash_familyI
by (simp add:carter_wegman_hash_family_def carter_wegman_hash_family_axioms_def)
lemma hash_k_wise_indep:
assumes "field F ∧ finite (carrier F)"
assumes "1 ≤ n"
shows
"prob_space.k_wise_indep_vars (pmf_of_set (bounded_degree_polynomials F n)) n
(λ_. pmf_of_set (carrier F)) (ring.hash F) (carrier F)"
proof -
interpret carter_wegman_hash_family "F" "n"
using assms carter_wegman_hash_familyI by force
have "k_wise_indep_vars n (λ_. pmf_of_set (carrier F)) hash (carrier F)"
by (rule k_wise_indep_vars_compose[OF k_wise_indep], simp)
thus ?thesis
by (simp add:M_def space_def)
qed
lemma hash_prob_single:
assumes "field F ∧ finite (carrier F)"
assumes "x ∈ carrier F"
assumes "1 ≤ n"
assumes "y ∈ carrier F"
shows
"𝒫(ω in pmf_of_set (bounded_degree_polynomials F n). ring.hash F x ω = y)
= 1/(real (card (carrier F)))"
proof -
interpret carter_wegman_hash_family "F" "n"
using assms carter_wegman_hash_familyI by force
show ?thesis
using prob_single[OF assms(2,4)] by (simp add:M_def space_def)
qed
end