# Theory Universal_Hash_Families.Carter_Wegman_Hash_Family

section ‹Carter-Wegman Hash Family\label{sec:carter_wegman}›

theory Carter_Wegman_Hash_Family
imports
Interpolation_Polynomials_HOL_Algebra.Interpolation_Polynomial_Cardinalities
Universal_Hash_Families_More_Independent_Families
begin

text ‹The Carter-Wegman hash family is a generic method to obtain
$k$-universal hash families for arbitrary $k$. (There are faster solutions, such as tabulation
hashing, which are limited to a specific $k$. See for example \<^cite>‹"thorup2010"›.)

The construction was described by Wegman and Carter~\<^cite>‹"wegman1981"›, it is a hash
family between the elements of a finite field and works by choosing randomly a polynomial
over the field with degree less than $k$. The hash function is the evaluation of a such a
polynomial.

Using the property that the fraction of polynomials interpolating a given set of $s \leq k$
points is @{term "1/(card (carrier R)^s)"}, which is shown in
\<^cite>‹"Interpolation_Polynomials_HOL_Algebra-AFP"›, it is possible to obtain both that
the hash functions are $k$-wise independent and uniformly distributed.

In the following two locales are introduced, the main reason for both is to make the statements
of the theorems and proofs more concise. The first locale @{term "poly_hash_family"} fixes a finite
ring $R$ and the probability space of the polynomials of degree less than $k$. Because the ring is
not a field, the family is not yet $k$-universal, but it is still possible to state a few results such
as the fact that the range of the hash function is a subset of the carrier of the ring.

The second locale @{term "carter_wegman_hash_family"} is an extension of the former with the
assumption that $R$ is a field with which the $k$-universality follows.

The reason for using two separate locales is to support use cases, where the ring is only probably
a field. For example if it is the set of integers modulo an approximate prime, in such a situation a
subset of the properties of an algorithm using approximate primes would need to be verified
even if $R$ is only a ring.›

definition (in ring) "hash x ω = eval ω x"

locale poly_hash_family = ring +
fixes k :: nat
assumes finite_carrier[simp]: "finite (carrier R)"
assumes k_ge_0: "k > 0"
begin

definition space where "space = bounded_degree_polynomials R k"
definition M where "M = measure_pmf (pmf_of_set space)"

lemma finite_space[simp]:"finite space"
unfolding space_def using fin_degree_bounded finite_carrier by simp

lemma non_empty_bounded_degree_polynomials[simp]:"space ≠ {}"
unfolding space_def using non_empty_bounded_degree_polynomials by simp

text ‹This is to add @{thm [source] carrier_not_empty} to the simp set in the context of
@{locale "poly_hash_family"}:›

lemma non_empty_carrier[simp]: "carrier R ≠ {}"

sublocale prob_space "M"

lemma hash_range[simp]:
assumes "ω ∈ space"
assumes "x ∈ carrier R"
shows "hash x ω ∈ carrier R"
using assms unfolding hash_def space_def bounded_degree_polynomials_def
by (simp, metis eval_in_carrier polynomial_incl univ_poly_carrier)

lemma  hash_range_2:
assumes "ω ∈ space"
shows "(λx. hash x ω)  carrier R ⊆ carrier R"
using hash_range assms by auto

lemma integrable_M[simp]:
fixes f :: "'a list ⇒ 'c::{banach, second_countable_topology}"
shows "integrable M f"
unfolding M_def
by (rule integrable_measure_pmf_finite, simp)

end

locale carter_wegman_hash_family = poly_hash_family +
assumes field_R: "field R"
begin
sublocale field
using field_R by simp

abbreviation "field_size ≡ card (carrier R)"

lemma poly_cards:
assumes "K ⊆ carrier R"
assumes "card K ≤ k"
assumes "y  K ⊆ (carrier R)"
shows
"card {ω ∈ space. (∀k ∈ K. eval ω k = y k)} = field_size^(k-card K)"
unfolding space_def
using interpolating_polynomials_card[where n="k-card K" and K="K"] assms
using finite_carrier finite_subset by fastforce

lemma poly_cards_single:
assumes "x ∈ carrier R"
assumes "y ∈ carrier R"
shows "card {ω ∈ space. eval ω x = y} = field_size^(k-1)"
using poly_cards[where K="{x}" and y="λ_. y", simplified] assms k_ge_0 by simp

lemma hash_prob:
assumes "K ⊆ carrier R"
assumes "card K ≤ k"
assumes "y  K ⊆ carrier R"
shows
"prob {ω. (∀x ∈ K. hash x ω = y x)} = 1/(real field_size)^card K"
proof -
have "𝟬 ∈ carrier R" by simp

hence a:"field_size > 0"
using finite_carrier card_gt_0_iff by blast

have b:"real (card {ω ∈ space. ∀x∈K. eval ω x = y x}) / real (card space) =
1 / real field_size ^ card K"
using a assms(2)

show ?thesis
unfolding M_def
by (simp add:hash_def measure_pmf_of_set Int_def b)
qed

lemma prob_single:
assumes "x ∈ carrier R" "y ∈ carrier R"
shows "prob {ω. hash x ω = y} = 1/(real field_size)"
using hash_prob[where K="{x}"] assms finite_carrier k_ge_0 by simp

lemma prob_range:
assumes [simp]:"x ∈ carrier R"
shows "prob {ω. hash x ω ∈ A} = card (A ∩ carrier R) / field_size"
proof -
have "prob {ω. hash x ω ∈ A} = prob (⋃a ∈ A ∩ carrier R. {ω. hash x ω = a})"
by (rule measure_pmf_eq, auto simp:M_def)
also have "... = (∑ a ∈ (A ∩ carrier R). prob {ω. hash x ω = a})"
by (rule measure_finite_Union, auto simp:M_def disjoint_family_on_def)
also have "... = (∑ a ∈ (A ∩ carrier R). 1/(real field_size))"
by (rule sum.cong, auto simp:prob_single)
also have "... = card (A ∩ carrier R) / field_size"
by simp
finally show ?thesis by simp
qed

lemma indep:
assumes "J ⊆ carrier R"
assumes "card J ≤ k"
shows "indep_vars (λ_. discrete) hash J"
proof -
have "𝟬 ∈ carrier R" by simp
hence card_R_ge_0:"field_size > 0"
using card_gt_0_iff finite_carrier by blast

have fin_J: "finite J"
using finite_carrier assms(1) finite_subset by blast

show ?thesis
proof (rule indep_vars_pmf[OF M_def])
fix a
fix J'
assume a: "J' ⊆ J" "finite J'"
have card_J': "card J' ≤ k"
by (metis card_mono order_trans a(1) assms(2) fin_J)
have J'_in_carr: "J' ⊆ carrier R" by (metis order_trans a(1) assms(1))

show "prob {ω. ∀x∈J'. hash x ω = a x} = (∏x∈J'. prob  {ω. hash x ω = a x})"
proof (cases "a  J' ⊆ carrier R")
case True
have a_carr: "⋀x. x ∈ J' ⟹ a x ∈ carrier R"  using True by force
have "prob {ω. ∀x∈J'. hash x ω = a x} =
real (card {ω ∈ space. ∀x∈J'. eval ω x = a x}) / real (card space)"
by (simp add:M_def measure_pmf_of_set Int_def hash_def)
also have "... = real (field_size ^ (k - card J')) / real (card space)"
using True by (simp add: poly_cards[OF J'_in_carr card_J'])
also have
"... = real field_size ^ (k - card J') / real field_size ^ k"
also have
"... = real field_size ^ ((k - 1) * card J') / real field_size ^ (k * card J')"
also have
"... = (real field_size ^ (k - 1)) ^ card J' / (real field_size ^ k) ^ card J'"
also have
"... =  (∏x∈J'. real (card {ω ∈ space. eval ω x = a x}) / real (card space))"
using a_carr poly_cards_single[OF subsetD[OF J'_in_carr]]
also have "... = (∏x∈J'. prob {ω. hash x ω = a x})"
by (simp add:measure_pmf_of_set M_def Int_def hash_def)
finally show ?thesis by simp
next
case False
then obtain j where j_def: "j ∈ J'" "a j ∉ carrier R" by blast
have "{ω ∈ space. hash j ω = a j} ⊆ {ω ∈ space. hash j ω ∉ carrier R}"
also have "... ⊆ {}" using j_def(1) J'_in_carr hash_range by blast
finally have b:"{ω ∈ space. hash j ω = a j} = {}" by simp
hence "real (card ({ω ∈ space. hash j ω = a j})) = 0" by simp
hence "(∏x∈J'. real (card {ω ∈ space. hash x ω = a x})) = 0"
using a(2) prod_zero[OF a(2)] j_def(1) by auto
moreover have
"{ω ∈ space. ∀x∈J'. hash x ω = a x} ⊆ {ω ∈ space. hash j ω = a j}"
using j_def by blast
hence "{ω ∈ space. ∀x∈J'. hash x ω = a x} = {}" using b by blast
ultimately show ?thesis
by (simp add:measure_pmf_of_set M_def Int_def prod_dividef)
qed
qed
qed

lemma k_wise_indep:
"k_wise_indep_vars k (λ_. discrete) hash (carrier R)"
unfolding k_wise_indep_vars_def using indep by simp

lemma inj_if_degree_1:
assumes "ω ∈ space"
assumes "degree ω = 1"
shows "inj_on (λx. hash x ω) (carrier R)"
using assms eval_inj_if_degree_1
by (simp add:M_def space_def bounded_degree_polynomials_def hash_def)

lemma uniform:
assumes "i ∈ carrier R"
shows "uniform_on (hash i) (carrier R)"
proof -
have a:
"⋀a. prob {ω. hash i ω ∈ {a}} = indicat_real (carrier R) a / real field_size"
by (subst prob_range[OF assms], simp add:indicator_def)
show ?thesis
by (rule uniform_onI, use a M_def in auto)
qed

text ‹This the main result of this section - the Carter-Wegman hash family is $k$-universal.›

theorem k_universal:
"k_universal k hash (carrier R) (carrier R)"
using uniform k_wise_indep by (simp add:k_universal_def)

end

lemma poly_hash_familyI:
assumes "ring R"
assumes "finite (carrier R)"
assumes "0 < k"
shows "poly_hash_family R k"
using assms

lemma carter_wegman_hash_familyI:
assumes "field F"
assumes "finite (carrier F)"
assumes "0 < k"
shows "carter_wegman_hash_family F k"
using assms field.is_ring[OF assms(1)] poly_hash_familyI

lemma hash_k_wise_indep:
assumes "field F ∧ finite (carrier F)"
assumes "1 ≤ n"
shows
"prob_space.k_wise_indep_vars (pmf_of_set (bounded_degree_polynomials F n)) n
(λ_. pmf_of_set (carrier F)) (ring.hash F) (carrier F)"
proof -
interpret carter_wegman_hash_family "F" "n"
using assms carter_wegman_hash_familyI by force
have "k_wise_indep_vars n (λ_. pmf_of_set (carrier F)) hash (carrier F)"
by (rule k_wise_indep_vars_compose[OF k_wise_indep], simp)
thus ?thesis
qed

lemma hash_prob_single:
assumes "field F ∧ finite (carrier F)"
assumes "x ∈ carrier F"
assumes "1 ≤ n"
assumes "y ∈ carrier F"
shows
"𝒫(ω in pmf_of_set (bounded_degree_polynomials F n). ring.hash F x ω = y)
= 1/(real (card (carrier F)))"
proof -
interpret carter_wegman_hash_family "F" "n"
using assms carter_wegman_hash_familyI by force
show ?thesis
using prob_single[OF assms(2,4)] by (simp add:M_def space_def)
qed

end