section ‹Carter-Wegman Hash Family\label{sec:carter_wegman}› theory Carter_Wegman_Hash_Family imports Interpolation_Polynomials_HOL_Algebra.Interpolation_Polynomial_Cardinalities Universal_Hash_Families_More_Independent_Families begin text ‹The Carter-Wegman hash family is a generic method to obtain $k$-universal hash families for arbitrary $k$. (There are faster solutions, such as tabulation hashing, which are limited to a specific $k$. See for example \<^cite>‹"thorup2010"›.) The construction was described by Wegman and Carter~\<^cite>‹"wegman1981"›, it is a hash family between the elements of a finite field and works by choosing randomly a polynomial over the field with degree less than $k$. The hash function is the evaluation of a such a polynomial. Using the property that the fraction of polynomials interpolating a given set of $s \leq k$ points is @{term "1/(card (carrier R)^s)"}, which is shown in \<^cite>‹"Interpolation_Polynomials_HOL_Algebra-AFP"›, it is possible to obtain both that the hash functions are $k$-wise independent and uniformly distributed. In the following two locales are introduced, the main reason for both is to make the statements of the theorems and proofs more concise. The first locale @{term "poly_hash_family"} fixes a finite ring $R$ and the probability space of the polynomials of degree less than $k$. Because the ring is not a field, the family is not yet $k$-universal, but it is still possible to state a few results such as the fact that the range of the hash function is a subset of the carrier of the ring. The second locale @{term "carter_wegman_hash_family"} is an extension of the former with the assumption that $R$ is a field with which the $k$-universality follows. The reason for using two separate locales is to support use cases, where the ring is only probably a field. For example if it is the set of integers modulo an approximate prime, in such a situation a subset of the properties of an algorithm using approximate primes would need to be verified even if $R$ is only a ring.› definition (in ring) "hash x ω = eval ω x" locale poly_hash_family = ring + fixes k :: nat assumes finite_carrier[simp]: "finite (carrier R)" assumes k_ge_0: "k > 0" begin definition space where "space = bounded_degree_polynomials R k" definition M where "M = measure_pmf (pmf_of_set space)" lemma finite_space[simp]:"finite space" unfolding space_def using fin_degree_bounded finite_carrier by simp lemma non_empty_bounded_degree_polynomials[simp]:"space ≠ {}" unfolding space_def using non_empty_bounded_degree_polynomials by simp text ‹This is to add @{thm [source] carrier_not_empty} to the simp set in the context of @{locale "poly_hash_family"}:› lemma non_empty_carrier[simp]: "carrier R ≠ {}" by (simp add:carrier_not_empty) sublocale prob_space "M" by (simp add:M_def prob_space_measure_pmf) lemma hash_range[simp]: assumes "ω ∈ space" assumes "x ∈ carrier R" shows "hash x ω ∈ carrier R" using assms unfolding hash_def space_def bounded_degree_polynomials_def by (simp, metis eval_in_carrier polynomial_incl univ_poly_carrier) lemma hash_range_2: assumes "ω ∈ space" shows "(λx. hash x ω) ` carrier R ⊆ carrier R" using hash_range assms by auto lemma integrable_M[simp]: fixes f :: "'a list ⇒ 'c::{banach, second_countable_topology}" shows "integrable M f" unfolding M_def by (rule integrable_measure_pmf_finite, simp) end locale carter_wegman_hash_family = poly_hash_family + assumes field_R: "field R" begin sublocale field using field_R by simp abbreviation "field_size ≡ card (carrier R)" lemma poly_cards: assumes "K ⊆ carrier R" assumes "card K ≤ k" assumes "y ` K ⊆ (carrier R)" shows "card {ω ∈ space. (∀k ∈ K. eval ω k = y k)} = field_size^(k-card K)" unfolding space_def using interpolating_polynomials_card[where n="k-card K" and K="K"] assms using finite_carrier finite_subset by fastforce lemma poly_cards_single: assumes "x ∈ carrier R" assumes "y ∈ carrier R" shows "card {ω ∈ space. eval ω x = y} = field_size^(k-1)" using poly_cards[where K="{x}" and y="λ_. y", simplified] assms k_ge_0 by simp lemma hash_prob: assumes "K ⊆ carrier R" assumes "card K ≤ k" assumes "y ` K ⊆ carrier R" shows "prob {ω. (∀x ∈ K. hash x ω = y x)} = 1/(real field_size)^card K" proof - have "𝟬 ∈ carrier R" by simp hence a:"field_size > 0" using finite_carrier card_gt_0_iff by blast have b:"real (card {ω ∈ space. ∀x∈K. eval ω x = y x}) / real (card space) = 1 / real field_size ^ card K" using a assms(2) apply (simp add: frac_eq_eq poly_cards[OF assms(1,2,3)] power_add[symmetric]) by (simp add:space_def bounded_degree_polynomials_card) show ?thesis unfolding M_def by (simp add:hash_def measure_pmf_of_set Int_def b) qed lemma prob_single: assumes "x ∈ carrier R" "y ∈ carrier R" shows "prob {ω. hash x ω = y} = 1/(real field_size)" using hash_prob[where K="{x}"] assms finite_carrier k_ge_0 by simp lemma prob_range: assumes [simp]:"x ∈ carrier R" shows "prob {ω. hash x ω ∈ A} = card (A ∩ carrier R) / field_size" proof - have "prob {ω. hash x ω ∈ A} = prob (⋃a ∈ A ∩ carrier R. {ω. hash x ω = a})" by (rule measure_pmf_eq, auto simp:M_def) also have "... = (∑ a ∈ (A ∩ carrier R). prob {ω. hash x ω = a})" by (rule measure_finite_Union, auto simp:M_def disjoint_family_on_def) also have "... = (∑ a ∈ (A ∩ carrier R). 1/(real field_size))" by (rule sum.cong, auto simp:prob_single) also have "... = card (A ∩ carrier R) / field_size" by simp finally show ?thesis by simp qed lemma indep: assumes "J ⊆ carrier R" assumes "card J ≤ k" shows "indep_vars (λ_. discrete) hash J" proof - have "𝟬 ∈ carrier R" by simp hence card_R_ge_0:"field_size > 0" using card_gt_0_iff finite_carrier by blast have fin_J: "finite J" using finite_carrier assms(1) finite_subset by blast show ?thesis proof (rule indep_vars_pmf[OF M_def]) fix a fix J' assume a: "J' ⊆ J" "finite J'" have card_J': "card J' ≤ k" by (metis card_mono order_trans a(1) assms(2) fin_J) have J'_in_carr: "J' ⊆ carrier R" by (metis order_trans a(1) assms(1)) show "prob {ω. ∀x∈J'. hash x ω = a x} = (∏x∈J'. prob {ω. hash x ω = a x})" proof (cases "a ` J' ⊆ carrier R") case True have a_carr: "⋀x. x ∈ J' ⟹ a x ∈ carrier R" using True by force have "prob {ω. ∀x∈J'. hash x ω = a x} = real (card {ω ∈ space. ∀x∈J'. eval ω x = a x}) / real (card space)" by (simp add:M_def measure_pmf_of_set Int_def hash_def) also have "... = real (field_size ^ (k - card J')) / real (card space)" using True by (simp add: poly_cards[OF J'_in_carr card_J']) also have "... = real field_size ^ (k - card J') / real field_size ^ k" by (simp add:space_def bounded_degree_polynomials_card) also have "... = real field_size ^ ((k - 1) * card J') / real field_size ^ (k * card J')" using card_J' by (simp add:power_add[symmetric] power_mult[symmetric] diff_mult_distrib frac_eq_eq add.commute) also have "... = (real field_size ^ (k - 1)) ^ card J' / (real field_size ^ k) ^ card J'" by (simp add:power_add power_mult) also have "... = (∏x∈J'. real (card {ω ∈ space. eval ω x = a x}) / real (card space))" using a_carr poly_cards_single[OF subsetD[OF J'_in_carr]] by (simp add:space_def bounded_degree_polynomials_card power_divide) also have "... = (∏x∈J'. prob {ω. hash x ω = a x})" by (simp add:measure_pmf_of_set M_def Int_def hash_def) finally show ?thesis by simp next case False then obtain j where j_def: "j ∈ J'" "a j ∉ carrier R" by blast have "{ω ∈ space. hash j ω = a j} ⊆ {ω ∈ space. hash j ω ∉ carrier R}" by (rule subsetI, simp add:j_def) also have "... ⊆ {}" using j_def(1) J'_in_carr hash_range by blast finally have b:"{ω ∈ space. hash j ω = a j} = {}" by simp hence "real (card ({ω ∈ space. hash j ω = a j})) = 0" by simp hence "(∏x∈J'. real (card {ω ∈ space. hash x ω = a x})) = 0" using a(2) prod_zero[OF a(2)] j_def(1) by auto moreover have "{ω ∈ space. ∀x∈J'. hash x ω = a x} ⊆ {ω ∈ space. hash j ω = a j}" using j_def by blast hence "{ω ∈ space. ∀x∈J'. hash x ω = a x} = {}" using b by blast ultimately show ?thesis by (simp add:measure_pmf_of_set M_def Int_def prod_dividef) qed qed qed lemma k_wise_indep: "k_wise_indep_vars k (λ_. discrete) hash (carrier R)" unfolding k_wise_indep_vars_def using indep by simp lemma inj_if_degree_1: assumes "ω ∈ space" assumes "degree ω = 1" shows "inj_on (λx. hash x ω) (carrier R)" using assms eval_inj_if_degree_1 by (simp add:M_def space_def bounded_degree_polynomials_def hash_def) lemma uniform: assumes "i ∈ carrier R" shows "uniform_on (hash i) (carrier R)" proof - have a: "⋀a. prob {ω. hash i ω ∈ {a}} = indicat_real (carrier R) a / real field_size" by (subst prob_range[OF assms], simp add:indicator_def) show ?thesis by (rule uniform_onI, use a M_def in auto) qed text ‹This the main result of this section - the Carter-Wegman hash family is $k$-universal.› theorem k_universal: "k_universal k hash (carrier R) (carrier R)" using uniform k_wise_indep by (simp add:k_universal_def) end lemma poly_hash_familyI: assumes "ring R" assumes "finite (carrier R)" assumes "0 < k" shows "poly_hash_family R k" using assms by (simp add:poly_hash_family_def poly_hash_family_axioms_def) lemma carter_wegman_hash_familyI: assumes "field F" assumes "finite (carrier F)" assumes "0 < k" shows "carter_wegman_hash_family F k" using assms field.is_ring[OF assms(1)] poly_hash_familyI by (simp add:carter_wegman_hash_family_def carter_wegman_hash_family_axioms_def) lemma hash_k_wise_indep: assumes "field F ∧ finite (carrier F)" assumes "1 ≤ n" shows "prob_space.k_wise_indep_vars (pmf_of_set (bounded_degree_polynomials F n)) n (λ_. pmf_of_set (carrier F)) (ring.hash F) (carrier F)" proof - interpret carter_wegman_hash_family "F" "n" using assms carter_wegman_hash_familyI by force have "k_wise_indep_vars n (λ_. pmf_of_set (carrier F)) hash (carrier F)" by (rule k_wise_indep_vars_compose[OF k_wise_indep], simp) thus ?thesis by (simp add:M_def space_def) qed lemma hash_prob_single: assumes "field F ∧ finite (carrier F)" assumes "x ∈ carrier F" assumes "1 ≤ n" assumes "y ∈ carrier F" shows "𝒫(ω in pmf_of_set (bounded_degree_polynomials F n). ring.hash F x ω = y) = 1/(real (card (carrier F)))" proof - interpret carter_wegman_hash_family "F" "n" using assms carter_wegman_hash_familyI by force show ?thesis using prob_single[OF assms(2,4)] by (simp add:M_def space_def) qed end