(* Title: HOL/Probability/Information.thy Author: Johannes Hölzl, TU München Author: Armin Heller, TU München *) section ‹Information theory› theory Information imports Independent_Family begin subsection "Information theory" locale information_space = prob_space + fixes b :: real assumes b_gt_1: "1 < b" context information_space begin text ‹Introduce some simplification rules for logarithm of base \<^term>‹b›.› lemma log_neg_const: assumes "x ≤ 0" shows "log b x = log b 0" proof - { fix u :: real have "x ≤ 0" by fact also have "0 < exp u" using exp_gt_zero . finally have "exp u ≠ x" by auto } then show "log b x = log b 0" by (simp add: log_def ln_real_def) qed lemma log_mult_eq: "log b (A * B) = (if 0 < A * B then log b ¦A¦ + log b ¦B¦ else log b 0)" using log_mult[of "¦A¦" "¦B¦"] b_gt_1 log_neg_const[of "A * B"] by (auto simp: zero_less_mult_iff mult_le_0_iff) lemma log_inverse_eq: "log b (inverse B) = (if 0 < B then - log b B else log b 0)" using ln_inverse log_def log_neg_const by force lemma log_divide_eq: "log b (A / B) = (if 0 < A * B then (log b ¦A¦) - log b ¦B¦ else log b 0)" unfolding divide_inverse log_mult_eq log_inverse_eq abs_inverse by (auto simp: zero_less_mult_iff mult_le_0_iff) lemmas log_simps = log_mult_eq log_inverse_eq log_divide_eq end subsection "Kullback$-$Leibler divergence" text ‹The Kullback$-$Leibler divergence is also known as relative entropy or Kullback$-$Leibler distance.› definition "entropy_density b M N = log b ∘ enn2real ∘ RN_deriv M N" definition "KL_divergence b M N = integral⇧^{L}N (entropy_density b M N)" lemma measurable_entropy_density[measurable]: "entropy_density b M N ∈ borel_measurable M" unfolding entropy_density_def by auto lemma (in sigma_finite_measure) KL_density: fixes f :: "'a ⇒ real" assumes "1 < b" assumes f[measurable]: "f ∈ borel_measurable M" and nn: "AE x in M. 0 ≤ f x" shows "KL_divergence b M (density M f) = (∫x. f x * log b (f x) ∂M)" unfolding KL_divergence_def proof (subst integral_real_density) show [measurable]: "entropy_density b M (density M (λx. ennreal (f x))) ∈ borel_measurable M" using f by (auto simp: comp_def entropy_density_def) have "density M (RN_deriv M (density M f)) = density M f" using f nn by (intro density_RN_deriv_density) auto then have eq: "AE x in M. RN_deriv M (density M f) x = f x" using f nn by (intro density_unique) auto have "AE x in M. f x * entropy_density b M (density M (λx. ennreal (f x))) x = f x * log b (f x)" using eq nn by (auto simp: entropy_density_def) then show "(∫x. f x * entropy_density b M (density M (λx. ennreal (f x))) x ∂M) = (∫x. f x * log b (f x) ∂M)" by (intro integral_cong_AE) measurable qed fact+ lemma (in sigma_finite_measure) KL_density_density: fixes f g :: "'a ⇒ real" assumes "1 < b" assumes f: "f ∈ borel_measurable M" "AE x in M. 0 ≤ f x" assumes g: "g ∈ borel_measurable M" "AE x in M. 0 ≤ g x" assumes ac: "AE x in M. f x = 0 ⟶ g x = 0" shows "KL_divergence b (density M f) (density M g) = (∫x. g x * log b (g x / f x) ∂M)" proof - interpret Mf: sigma_finite_measure "density M f" using f by (subst sigma_finite_iff_density_finite) auto have "KL_divergence b (density M f) (density M g) = KL_divergence b (density M f) (density (density M f) (λx. g x / f x))" using f g ac by (subst density_density_divide) simp_all also have "… = (∫x. (g x / f x) * log b (g x / f x) ∂density M f)" using f g ‹1 < b› by (intro Mf.KL_density) (auto simp: AE_density) also have "… = (∫x. g x * log b (g x / f x) ∂M)" using ac f g ‹1 < b› by (subst integral_density) (auto intro!: integral_cong_AE) finally show ?thesis . qed lemma (in information_space) KL_gt_0: fixes D :: "'a ⇒ real" assumes "prob_space (density M D)" assumes D: "D ∈ borel_measurable M" "AE x in M. 0 ≤ D x" assumes int: "integrable M (λx. D x * log b (D x))" assumes A: "density M D ≠ M" shows "0 < KL_divergence b M (density M D)" proof - interpret N: prob_space "density M D" by fact obtain A where "A ∈ sets M" "emeasure (density M D) A ≠ emeasure M A" using measure_eqI[of "density M D" M] ‹density M D ≠ M› by auto let ?D_set = "{x∈space M. D x ≠ 0}" have [simp, intro]: "?D_set ∈ sets M" using D by auto have D_neg: "(∫⇧^{+}x. ennreal (- D x) ∂M) = 0" using D by (subst nn_integral_0_iff_AE) (auto simp: ennreal_neg) have "(∫⇧^{+}x. ennreal (D x) ∂M) = emeasure (density M D) (space M)" using D by (simp add: emeasure_density cong: nn_integral_cong) then have D_pos: "(∫⇧^{+}x. ennreal (D x) ∂M) = 1" using N.emeasure_space_1 by simp have "integrable M D" using D D_pos D_neg unfolding real_integrable_def real_lebesgue_integral_def by simp_all then have "integral⇧^{L}M D = 1" using D D_pos D_neg by (simp add: real_lebesgue_integral_def) have "0 ≤ 1 - measure M ?D_set" using prob_le_1 by (auto simp: field_simps) also have "… = (∫ x. D x - indicator ?D_set x ∂M)" using ‹integrable M D› ‹integral⇧^{L}M D = 1› by (simp add: emeasure_eq_measure) also have "… < (∫ x. D x * (ln b * log b (D x)) ∂M)" proof (rule integral_less_AE) show "integrable M (λx. D x - indicator ?D_set x)" using ‹integrable M D› by (auto simp: less_top[symmetric]) next from integrable_mult_left(1)[OF int, of "ln b"] show "integrable M (λx. D x * (ln b * log b (D x)))" by (simp add: ac_simps) next show "emeasure M {x∈space M. D x ≠ 1 ∧ D x ≠ 0} ≠ 0" proof assume eq_0: "emeasure M {x∈space M. D x ≠ 1 ∧ D x ≠ 0} = 0" then have disj: "AE x in M. D x = 1 ∨ D x = 0" using D(1) by (auto intro!: AE_I[OF subset_refl] sets.sets_Collect) have "emeasure M {x∈space M. D x = 1} = (∫⇧^{+}x. indicator {x∈space M. D x = 1} x ∂M)" using D(1) by auto also have "… = (∫⇧^{+}x. ennreal (D x) ∂M)" using disj by (auto intro!: nn_integral_cong_AE simp: indicator_def one_ennreal_def) finally have "AE x in M. D x = 1" using D D_pos by (intro AE_I_eq_1) auto then have "(∫⇧^{+}x. indicator A x∂M) = (∫⇧^{+}x. ennreal (D x) * indicator A x∂M)" by (intro nn_integral_cong_AE) (auto simp: one_ennreal_def[symmetric]) also have "… = density M D A" using ‹A ∈ sets M› D by (simp add: emeasure_density) finally show False using ‹A ∈ sets M› ‹emeasure (density M D) A ≠ emeasure M A› by simp qed show "{x∈space M. D x ≠ 1 ∧ D x ≠ 0} ∈ sets M" using D(1) by (auto intro: sets.sets_Collect_conj) show "AE t in M. t ∈ {x∈space M. D x ≠ 1 ∧ D x ≠ 0} ⟶ D t - indicator ?D_set t ≠ D t * (ln b * log b (D t))" using D(2) proof (eventually_elim, safe) fix t assume Dt: "t ∈ space M" "D t ≠ 1" "D t ≠ 0" "0 ≤ D t" and eq: "D t - indicator ?D_set t = D t * (ln b * log b (D t))" have "D t - 1 = D t - indicator ?D_set t" using Dt by simp also note eq also have "D t * (ln b * log b (D t)) = - D t * ln (1 / D t)" using b_gt_1 ‹D t ≠ 0› ‹0 ≤ D t› by (simp add: log_def ln_div less_le) finally have "ln (1 / D t) = 1 / D t - 1" using ‹D t ≠ 0› by (auto simp: field_simps) from ln_eq_minus_one[OF _ this] ‹D t ≠ 0› ‹0 ≤ D t› ‹D t ≠ 1› show False by auto qed show "AE t in M. D t - indicator ?D_set t ≤ D t * (ln b * log b (D t))" using D(2) AE_space proof eventually_elim fix t assume "t ∈ space M" "0 ≤ D t" show "D t - indicator ?D_set t ≤ D t * (ln b * log b (D t))" proof cases assume asm: "D t ≠ 0" then have "0 < D t" using ‹0 ≤ D t› by auto then have "0 < 1 / D t" by auto have "D t - indicator ?D_set t ≤ - D t * (1 / D t - 1)" using asm ‹t ∈ space M› by (simp add: field_simps) also have "- D t * (1 / D t - 1) ≤ - D t * ln (1 / D t)" using ln_le_minus_one ‹0 < 1 / D t› by (intro mult_left_mono_neg) auto also have "… = D t * (ln b * log b (D t))" using ‹0 < D t› b_gt_1 by (simp_all add: log_def ln_div) finally show ?thesis by simp qed simp qed qed also have "… = (∫ x. ln b * (D x * log b (D x)) ∂M)" by (simp add: ac_simps) also have "… = ln b * (∫ x. D x * log b (D x) ∂M)" using int by simp finally show ?thesis using b_gt_1 D by (subst KL_density) (auto simp: zero_less_mult_iff) qed lemma (in sigma_finite_measure) KL_same_eq_0: "KL_divergence b M M = 0" proof - have "AE x in M. 1 = RN_deriv M M x" proof (rule RN_deriv_unique) show "density M (λx. 1) = M" by (simp add: density_1) qed auto then have "AE x in M. log b (enn2real (RN_deriv M M x)) = 0" by (elim AE_mp) simp from integral_cong_AE[OF _ _ this] have "integral⇧^{L}M (entropy_density b M M) = 0" by (simp add: entropy_density_def comp_def) then show "KL_divergence b M M = 0" unfolding KL_divergence_def by auto qed lemma (in information_space) KL_eq_0_iff_eq: fixes D :: "'a ⇒ real" assumes "prob_space (density M D)" assumes D: "D ∈ borel_measurable M" "AE x in M. 0 ≤ D x" assumes int: "integrable M (λx. D x * log b (D x))" shows "KL_divergence b M (density M D) = 0 ⟷ density M D = M" using KL_same_eq_0[of b] KL_gt_0[OF assms] by (auto simp: less_le) lemma (in information_space) KL_eq_0_iff_eq_ac: fixes D :: "'a ⇒ real" assumes "prob_space N" assumes ac: "absolutely_continuous M N" "sets N = sets M" assumes int: "integrable N (entropy_density b M N)" shows "KL_divergence b M N = 0 ⟷ N = M" proof - interpret N: prob_space N by fact have "finite_measure N" by unfold_locales from real_RN_deriv[OF this ac] obtain D where D: "random_variable borel D" "AE x in M. RN_deriv M N x = ennreal (D x)" "AE x in N. 0 < D x" "⋀x. 0 ≤ D x" by this auto have "N = density M (RN_deriv M N)" using ac by (rule density_RN_deriv[symmetric]) also have "… = density M D" using D by (auto intro!: density_cong) finally have N: "N = density M D" . from absolutely_continuous_AE[OF ac(2,1) D(2)] D b_gt_1 ac measurable_entropy_density have "integrable N (λx. log b (D x))" by (intro integrable_cong_AE[THEN iffD2, OF _ _ _ int]) (auto simp: N entropy_density_def) with D b_gt_1 have "integrable M (λx. D x * log b (D x))" by (subst integrable_real_density[symmetric]) (auto simp: N[symmetric] comp_def) with ‹prob_space N› D show ?thesis unfolding N by (intro KL_eq_0_iff_eq) auto qed lemma (in information_space) KL_nonneg: assumes "prob_space (density M D)" assumes D: "D ∈ borel_measurable M" "AE x in M. 0 ≤ D x" assumes int: "integrable M (λx. D x * log b (D x))" shows "0 ≤ KL_divergence b M (density M D)" using KL_gt_0[OF assms] by (cases "density M D = M") (auto simp: KL_same_eq_0) lemma (in sigma_finite_measure) KL_density_density_nonneg: fixes f g :: "'a ⇒ real" assumes "1 < b" assumes f: "f ∈ borel_measurable M" "AE x in M. 0 ≤ f x" "prob_space (density M f)" assumes g: "g ∈ borel_measurable M" "AE x in M. 0 ≤ g x" "prob_space (density M g)" assumes ac: "AE x in M. f x = 0 ⟶ g x = 0" assumes int: "integrable M (λx. g x * log b (g x / f x))" shows "0 ≤ KL_divergence b (density M f) (density M g)" proof - interpret Mf: prob_space "density M f" by fact interpret Mf: information_space "density M f" b by standard fact have eq: "density (density M f) (λx. g x / f x) = density M g" (is "?DD = _") using f g ac by (subst density_density_divide) simp_all have "0 ≤ KL_divergence b (density M f) (density (density M f) (λx. g x / f x))" proof (rule Mf.KL_nonneg) show "prob_space ?DD" unfolding eq by fact from f g show "(λx. g x / f x) ∈ borel_measurable (density M f)" by auto show "AE x in density M f. 0 ≤ g x / f x" using f g by (auto simp: AE_density) show "integrable (density M f) (λx. g x / f x * log b (g x / f x))" using ‹1 < b› f g ac by (subst integrable_density) (auto intro!: integrable_cong_AE[THEN iffD2, OF _ _ _ int] measurable_If) qed also have "… = KL_divergence b (density M f) (density M g)" using f g ac by (subst density_density_divide) simp_all finally show ?thesis . qed subsection ‹Finite Entropy› definition (in information_space) finite_entropy :: "'b measure ⇒ ('a ⇒ 'b) ⇒ ('b ⇒ real) ⇒ bool" where "finite_entropy S X f ⟷ distributed M S X f ∧ integrable S (λx. f x * log b (f x)) ∧ (∀x∈space S. 0 ≤ f x)" lemma (in information_space) finite_entropy_simple_function: assumes X: "simple_function M X" shows "finite_entropy (count_space (X`space M)) X (λa. measure M {x ∈ space M. X x = a})" unfolding finite_entropy_def proof safe have [simp]: "finite (X ` space M)" using X by (auto simp: simple_function_def) then show "integrable (count_space (X ` space M)) (λx. prob {xa ∈ space M. X xa = x} * log b (prob {xa ∈ space M. X xa = x}))" by (rule integrable_count_space) have d: "distributed M (count_space (X ` space M)) X (λx. ennreal (if x ∈ X`space M then prob {xa ∈ space M. X xa = x} else 0))" by (rule distributed_simple_function_superset[OF X]) (auto intro!: arg_cong[where f=prob]) show "distributed M (count_space (X ` space M)) X (λx. ennreal (prob {xa ∈ space M. X xa = x}))" by (rule distributed_cong_density[THEN iffD1, OF _ _ _ d]) auto qed (rule measure_nonneg) lemma ac_fst: assumes "sigma_finite_measure T" shows "absolutely_continuous S (distr (S ⨂⇩_{M}T) S fst)" proof - interpret sigma_finite_measure T by fact { fix A assume A: "A ∈ sets S" "emeasure S A = 0" then have "fst -` A ∩ space (S ⨂⇩_{M}T) = A × space T" by (auto simp: space_pair_measure dest!: sets.sets_into_space) with A have "emeasure (S ⨂⇩_{M}T) (fst -` A ∩ space (S ⨂⇩_{M}T)) = 0" by (simp add: emeasure_pair_measure_Times) } then show ?thesis unfolding absolutely_continuous_def by (metis emeasure_distr measurable_fst null_setsD1 null_setsD2 null_setsI sets_distr subsetI) qed lemma ac_snd: assumes "sigma_finite_measure T" shows "absolutely_continuous T (distr (S ⨂⇩_{M}T) T snd)" proof - interpret sigma_finite_measure T by fact { fix A assume A: "A ∈ sets T" "emeasure T A = 0" then have "snd -` A ∩ space (S ⨂⇩_{M}T) = space S × A" by (auto simp: space_pair_measure dest!: sets.sets_into_space) with A have "emeasure (S ⨂⇩_{M}T) (snd -` A ∩ space (S ⨂⇩_{M}T)) = 0" by (simp add: emeasure_pair_measure_Times) } then show ?thesis unfolding absolutely_continuous_def by (metis emeasure_distr measurable_snd null_setsD1 null_setsD2 null_setsI sets_distr subsetI) qed lemma (in information_space) finite_entropy_integrable: "finite_entropy S X Px ⟹ integrable S (λx. Px x * log b (Px x))" unfolding finite_entropy_def by auto lemma (in information_space) finite_entropy_distributed: "finite_entropy S X Px ⟹ distributed M S X Px" unfolding finite_entropy_def by auto lemma (in information_space) finite_entropy_nn: "finite_entropy S X Px ⟹ x ∈ space S ⟹ 0 ≤ Px x" by (auto simp: finite_entropy_def) lemma (in information_space) finite_entropy_measurable: "finite_entropy S X Px ⟹ Px ∈ S →⇩_{M}borel" using distributed_real_measurable[of S Px M X] finite_entropy_nn[of S X Px] finite_entropy_distributed[of S X Px] by auto lemma (in information_space) subdensity_finite_entropy: fixes g :: "'b ⇒ real" and f :: "'c ⇒ real" assumes T: "T ∈ measurable P Q" assumes f: "finite_entropy P X f" assumes g: "finite_entropy Q Y g" assumes Y: "Y = T ∘ X" shows "AE x in P. g (T x) = 0 ⟶ f x = 0" using subdensity[OF T, of M X "λx. ennreal (f x)" Y "λx. ennreal (g x)"] finite_entropy_distributed[OF f] finite_entropy_distributed[OF g] finite_entropy_nn[OF f] finite_entropy_nn[OF g] assms by auto lemma (in information_space) finite_entropy_integrable_transform: "finite_entropy S X Px ⟹ distributed M T Y Py ⟹ (⋀x. x ∈ space T ⟹ 0 ≤ Py x) ⟹ X = (λx. f (Y x)) ⟹ f ∈ measurable T S ⟹ integrable T (λx. Py x * log b (Px (f x)))" using distributed_transform_integrable[of M T Y Py S X Px f "λx. log b (Px x)"] using distributed_real_measurable[of S Px M X] by (auto simp: finite_entropy_def) subsection ‹Mutual Information› definition (in prob_space) "mutual_information b S T X Y = KL_divergence b (distr M S X ⨂⇩_{M}distr M T Y) (distr M (S ⨂⇩_{M}T) (λx. (X x, Y x)))" lemma (in information_space) mutual_information_indep_vars: fixes S T X Y defines "P ≡ distr M S X ⨂⇩_{M}distr M T Y" defines "Q ≡ distr M (S ⨂⇩_{M}T) (λx. (X x, Y x))" shows "indep_var S X T Y ⟷ (random_variable S X ∧ random_variable T Y ∧ absolutely_continuous P Q ∧ integrable Q (entropy_density b P Q) ∧ mutual_information b S T X Y = 0)" unfolding indep_var_distribution_eq proof safe assume rv[measurable]: "random_variable S X" "random_variable T Y" interpret X: prob_space "distr M S X" by (rule prob_space_distr) fact interpret Y: prob_space "distr M T Y" by (rule prob_space_distr) fact interpret XY: pair_prob_space "distr M S X" "distr M T Y" by standard interpret P: information_space P b unfolding P_def by standard (rule b_gt_1) interpret Q: prob_space Q unfolding Q_def by (rule prob_space_distr) simp { assume "distr M S X ⨂⇩_{M}distr M T Y = distr M (S ⨂⇩_{M}T) (λx. (X x, Y x))" then have [simp]: "Q = P" unfolding Q_def P_def by simp show ac: "absolutely_continuous P Q" by (simp add: absolutely_continuous_def) then have ed: "entropy_density b P Q ∈ borel_measurable P" by simp have "AE x in P. 1 = RN_deriv P Q x" proof (rule P.RN_deriv_unique) show "density P (λx. 1) = Q" unfolding ‹Q = P› by (intro measure_eqI) (auto simp: emeasure_density) qed auto then have ae_0: "AE x in P. entropy_density b P Q x = 0" by (auto simp: entropy_density_def) then have "integrable P (entropy_density b P Q) ⟷ integrable Q (λx. 0::real)" using ed unfolding ‹Q = P› by (intro integrable_cong_AE) auto then show "integrable Q (entropy_density b P Q)" by simp from ae_0 have "mutual_information b S T X Y = (∫x. 0 ∂P)" unfolding mutual_information_def KL_divergence_def P_def[symmetric] Q_def[symmetric] ‹Q = P› by (intro integral_cong_AE) auto then show "mutual_information b S T X Y = 0" by simp } { assume ac: "absolutely_continuous P Q" assume int: "integrable Q (entropy_density b P Q)" assume I_eq_0: "mutual_information b S T X Y = 0" have eq: "Q = P" proof (rule P.KL_eq_0_iff_eq_ac[THEN iffD1]) show "prob_space Q" by unfold_locales show "absolutely_continuous P Q" by fact show "integrable Q (entropy_density b P Q)" by fact show "sets Q = sets P" by (simp add: P_def Q_def sets_pair_measure) show "KL_divergence b P Q = 0" using I_eq_0 unfolding mutual_information_def by (simp add: P_def Q_def) qed then show "distr M S X ⨂⇩_{M}distr M T Y = distr M (S ⨂⇩_{M}T) (λx. (X x, Y x))" unfolding P_def Q_def .. } qed abbreviation (in information_space) mutual_information_Pow ("ℐ'(_ ; _')") where "ℐ(X ; Y) ≡ mutual_information b (count_space (X`space M)) (count_space (Y`space M)) X Y" lemma (in information_space) fixes Pxy :: "'b × 'c ⇒ real" and Px :: "'b ⇒ real" and Py :: "'c ⇒ real" assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T" assumes Fx: "finite_entropy S X Px" and Fy: "finite_entropy T Y Py" assumes Fxy: "finite_entropy (S ⨂⇩_{M}T) (λx. (X x, Y x)) Pxy" defines "f ≡ λx. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x)))" shows mutual_information_distr': "mutual_information b S T X Y = integral⇧^{L}(S ⨂⇩_{M}T) f" (is "?M = ?R") and mutual_information_nonneg': "0 ≤ mutual_information b S T X Y" proof - have Px: "distributed M S X Px" and Px_nn: "⋀x. x ∈ space S ⟹ 0 ≤ Px x" using Fx by (auto simp: finite_entropy_def) have Py: "distributed M T Y Py" and Py_nn: "⋀x. x ∈ space T ⟹ 0 ≤ Py x" using Fy by (auto simp: finite_entropy_def) have Pxy: "distributed M (S ⨂⇩_{M}T) (λx. (X x, Y x)) Pxy" and Pxy_nn: "⋀x. x ∈ space (S ⨂⇩_{M}T) ⟹ 0 ≤ Pxy x" "⋀x y. x ∈ space S ⟹ y ∈ space T ⟹ 0 ≤ Pxy (x, y)" using Fxy by (auto simp: finite_entropy_def space_pair_measure) have [measurable]: "Px ∈ S →⇩_{M}borel" using Px Px_nn by (intro distributed_real_measurable) have [measurable]: "Py ∈ T →⇩_{M}borel" using Py Py_nn by (intro distributed_real_measurable) have measurable_Pxy[measurable]: "Pxy ∈ (S ⨂⇩_{M}T) →⇩_{M}borel" using Pxy Pxy_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure) have X[measurable]: "random_variable S X" using Px by auto have Y[measurable]: "random_variable T Y" using Py by auto interpret S: sigma_finite_measure S by fact interpret T: sigma_finite_measure T by fact interpret ST: pair_sigma_finite S T .. interpret X: prob_space "distr M S X" using X by (rule prob_space_distr) interpret Y: prob_space "distr M T Y" using Y by (rule prob_space_distr) interpret XY: pair_prob_space "distr M S X" "distr M T Y" .. let ?P = "S ⨂⇩_{M}T" let ?D = "distr M ?P (λx. (X x, Y x))" { fix A assume "A ∈ sets S" with X[THEN measurable_space] Y[THEN measurable_space] have "emeasure (distr M S X) A = emeasure ?D (A × space T)" by (auto simp: emeasure_distr intro!: arg_cong[where f="emeasure M"]) } note marginal_eq1 = this { fix A assume "A ∈ sets T" with X[THEN measurable_space] Y[THEN measurable_space] have "emeasure (distr M T Y) A = emeasure ?D (space S × A)" by (auto simp: emeasure_distr intro!: arg_cong[where f="emeasure M"]) } note marginal_eq2 = this have distr_eq: "distr M S X ⨂⇩_{M}distr M T Y = density ?P (λx. ennreal (Px (fst x) * Py (snd x)))" unfolding Px(1)[THEN distributed_distr_eq_density] Py(1)[THEN distributed_distr_eq_density] proof (subst pair_measure_density) show "(λx. ennreal (Px x)) ∈ borel_measurable S" "(λy. ennreal (Py y)) ∈ borel_measurable T" using Px Py by (auto simp: distributed_def) show "sigma_finite_measure (density T Py)" unfolding Py(1)[THEN distributed_distr_eq_density, symmetric] .. show "density (S ⨂⇩_{M}T) (λ(x, y). ennreal (Px x) * ennreal (Py y)) = density (S ⨂⇩_{M}T) (λx. ennreal (Px (fst x) * Py (snd x)))" using Px_nn Py_nn by (auto intro!: density_cong simp: distributed_def ennreal_mult space_pair_measure) qed fact have M: "?M = KL_divergence b (density ?P (λx. ennreal (Px (fst x) * Py (snd x)))) (density ?P (λx. ennreal (Pxy x)))" unfolding mutual_information_def distr_eq Pxy(1)[THEN distributed_distr_eq_density] .. from Px Py have f: "(λx. Px (fst x) * Py (snd x)) ∈ borel_measurable ?P" by (intro borel_measurable_times) (auto intro: distributed_real_measurable measurable_fst'' measurable_snd'') have PxPy_nonneg: "AE x in ?P. 0 ≤ Px (fst x) * Py (snd x)" using Px_nn Py_nn by (auto simp: space_pair_measure) have A: "(AE x in ?P. Px (fst x) = 0 ⟶ Pxy x = 0)" by (rule subdensity_real[OF measurable_fst Pxy Px]) (insert Px_nn Pxy_nn, auto simp: space_pair_measure) moreover have B: "(AE x in ?P. Py (snd x) = 0 ⟶ Pxy x = 0)" by (rule subdensity_real[OF measurable_snd Pxy Py]) (insert Py_nn Pxy_nn, auto simp: space_pair_measure) ultimately have ac: "AE x in ?P. Px (fst x) * Py (snd x) = 0 ⟶ Pxy x = 0" by auto show "?M = ?R" unfolding M f_def using Pxy_nn Px_nn Py_nn by (intro ST.KL_density_density b_gt_1 f PxPy_nonneg ac) (auto simp: space_pair_measure) have X: "X = fst ∘ (λx. (X x, Y x))" and Y: "Y = snd ∘ (λx. (X x, Y x))" by auto have "integrable (S ⨂⇩_{M}T) (λx. Pxy x * log b (Pxy x) - Pxy x * log b (Px (fst x)) - Pxy x * log b (Py (snd x)))" using finite_entropy_integrable[OF Fxy] using finite_entropy_integrable_transform[OF Fx Pxy, of fst] using finite_entropy_integrable_transform[OF Fy Pxy, of snd] by (simp add: Pxy_nn) moreover have "f ∈ borel_measurable (S ⨂⇩_{M}T)" unfolding f_def using Px Py Pxy by (auto intro: distributed_real_measurable measurable_fst'' measurable_snd'' intro!: borel_measurable_times borel_measurable_log borel_measurable_divide) ultimately have int: "integrable (S ⨂⇩_{M}T) f" apply (rule integrable_cong_AE_imp) using A B by (auto simp: f_def log_divide_eq log_mult_eq field_simps space_pair_measure Px_nn Py_nn Pxy_nn less_le) show "0 ≤ ?M" unfolding M proof (intro ST.KL_density_density_nonneg) show "prob_space (density (S ⨂⇩_{M}T) (λx. ennreal (Pxy x))) " unfolding distributed_distr_eq_density[OF Pxy, symmetric] using distributed_measurable[OF Pxy] by (rule prob_space_distr) show "prob_space (density (S ⨂⇩_{M}T) (λx. ennreal (Px (fst x) * Py (snd x))))" unfolding distr_eq[symmetric] by unfold_locales show "integrable (S ⨂⇩_{M}T) (λx. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))))" using int unfolding f_def . qed (insert ac, auto simp: b_gt_1 Px_nn Py_nn Pxy_nn space_pair_measure) qed lemma (in information_space) fixes Pxy :: "'b × 'c ⇒ real" and Px :: "'b ⇒ real" and Py :: "'c ⇒ real" assumes "sigma_finite_measure S" "sigma_finite_measure T" assumes Px: "distributed M S X Px" and Px_nn: "⋀x. x ∈ space S ⟹ 0 ≤ Px x" and Py: "distributed M T Y Py" and Py_nn: "⋀y. y ∈ space T ⟹ 0 ≤ Py y" and Pxy: "distributed M (S ⨂⇩_{M}T) (λx. (X x, Y x)) Pxy" and Pxy_nn: "⋀x y. x ∈ space S ⟹ y ∈ space T ⟹ 0 ≤ Pxy (x, y)" defines "f ≡ λx. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x)))" shows mutual_information_distr: "mutual_information b S T X Y = integral⇧^{L}(S ⨂⇩_{M}T) f" (is "?M = ?R") and mutual_information_nonneg: "integrable (S ⨂⇩_{M}T) f ⟹ 0 ≤ mutual_information b S T X Y" proof - have X[measurable]: "random_variable S X" using Px by (auto simp: distributed_def) have Y[measurable]: "random_variable T Y" using Py by (auto simp: distributed_def) have [measurable]: "Px ∈ S →⇩_{M}borel" using Px Px_nn by (intro distributed_real_measurable) have [measurable]: "Py ∈ T →⇩_{M}borel" using Py Py_nn by (intro distributed_real_measurable) have measurable_Pxy[measurable]: "Pxy ∈ (S ⨂⇩_{M}T) →⇩_{M}borel" using Pxy Pxy_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure) interpret S: sigma_finite_measure S by fact interpret T: sigma_finite_measure T by fact interpret ST: pair_sigma_finite S T .. interpret X: prob_space "distr M S X" using X by (rule prob_space_distr) interpret Y: prob_space "distr M T Y" using Y by (rule prob_space_distr) interpret XY: pair_prob_space "distr M S X" "distr M T Y" .. let ?P = "S ⨂⇩_{M}T" let ?D = "distr M ?P (λx. (X x, Y x))" { fix A assume "A ∈ sets S" with X[THEN measurable_space] Y[THEN measurable_space] have "emeasure (distr M S X) A = emeasure ?D (A × space T)" by (auto simp: emeasure_distr intro!: arg_cong[where f="emeasure M"]) } note marginal_eq1 = this { fix A assume "A ∈ sets T" with X[THEN measurable_space] Y[THEN measurable_space] have "emeasure (distr M T Y) A = emeasure ?D (space S × A)" by (auto simp: emeasure_distr intro!: arg_cong[where f="emeasure M"]) } note marginal_eq2 = this have distr_eq: "distr M S X ⨂⇩_{M}distr M T Y = density ?P (λx. ennreal (Px (fst x) * Py (snd x)))" unfolding Px(1)[THEN distributed_distr_eq_density] Py(1)[THEN distributed_distr_eq_density] proof (subst pair_measure_density) show "(λx. ennreal (Px x)) ∈ borel_measurable S" "(λy. ennreal (Py y)) ∈ borel_measurable T" using Px Py by (auto simp: distributed_def) show "sigma_finite_measure (density T Py)" unfolding Py(1)[THEN distributed_distr_eq_density, symmetric] .. show "density (S ⨂⇩_{M}T) (λ(x, y). ennreal (Px x) * ennreal (Py y)) = density (S ⨂⇩_{M}T) (λx. ennreal (Px (fst x) * Py (snd x)))" using Px_nn Py_nn by (auto intro!: density_cong simp: distributed_def ennreal_mult space_pair_measure) qed fact have M: "?M = KL_divergence b (density ?P (λx. ennreal (Px (fst x) * Py (snd x)))) (density ?P (λx. ennreal (Pxy x)))" unfolding mutual_information_def distr_eq Pxy(1)[THEN distributed_distr_eq_density] .. from Px Py have f: "(λx. Px (fst x) * Py (snd x)) ∈ borel_measurable ?P" by (intro borel_measurable_times) (auto intro: distributed_real_measurable measurable_fst'' measurable_snd'') have PxPy_nonneg: "AE x in ?P. 0 ≤ Px (fst x) * Py (snd x)" using Px_nn Py_nn by (auto simp: space_pair_measure) have "(AE x in ?P. Px (fst x) = 0 ⟶ Pxy x = 0)" by (rule subdensity_real[OF measurable_fst Pxy Px]) (insert Px_nn Pxy_nn, auto simp: space_pair_measure) moreover have "(AE x in ?P. Py (snd x) = 0 ⟶ Pxy x = 0)" by (rule subdensity_real[OF measurable_snd Pxy Py]) (insert Py_nn Pxy_nn, auto simp: space_pair_measure) ultimately have ac: "AE x in ?P. Px (fst x) * Py (snd x) = 0 ⟶ Pxy x = 0" by auto show "?M = ?R" unfolding M f_def using b_gt_1 f PxPy_nonneg ac Pxy_nn by (intro ST.KL_density_density) (auto simp: space_pair_measure) assume int: "integrable (S ⨂⇩_{M}T) f" show "0 ≤ ?M" unfolding M proof (intro ST.KL_density_density_nonneg) show "prob_space (density (S ⨂⇩_{M}T) (λx. ennreal (Pxy x))) " unfolding distributed_distr_eq_density[OF Pxy, symmetric] using distributed_measurable[OF Pxy] by (rule prob_space_distr) show "prob_space (density (S ⨂⇩_{M}T) (λx. ennreal (Px (fst x) * Py (snd x))))" unfolding distr_eq[symmetric] by unfold_locales show "integrable (S ⨂⇩_{M}T) (λx. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))))" using int unfolding f_def . qed (insert ac, auto simp: b_gt_1 Px_nn Py_nn Pxy_nn space_pair_measure) qed lemma (in information_space) fixes Pxy :: "'b × 'c ⇒ real" and Px :: "'b ⇒ real" and Py :: "'c ⇒ real" assumes "sigma_finite_measure S" "sigma_finite_measure T" assumes Px[measurable]: "distributed M S X Px" and Px_nn: "⋀x. x ∈ space S ⟹ 0 ≤ Px x" and Py[measurable]: "distributed M T Y Py" and Py_nn: "⋀x. x ∈ space T ⟹ 0 ≤ Py x" and Pxy[measurable]: "distributed M (S ⨂⇩_{M}T) (λx. (X x, Y x)) Pxy" and Pxy_nn: "⋀x. x ∈ space (S ⨂⇩_{M}T) ⟹ 0 ≤ Pxy x" assumes ae: "AE x in S. AE y in T. Pxy (x, y) = Px x * Py y" shows mutual_information_eq_0: "mutual_information b S T X Y = 0" proof - interpret S: sigma_finite_measure S by fact interpret T: sigma_finite_measure T by fact interpret ST: pair_sigma_finite S T .. note distributed_real_measurable[OF Px_nn Px, measurable] distributed_real_measurable[OF Py_nn Py, measurable] distributed_real_measurable[OF Pxy_nn Pxy, measurable] have "AE x in S ⨂⇩_{M}T. Px (fst x) = 0 ⟶ Pxy x = 0" by (rule subdensity_real[OF measurable_fst Pxy Px]) (auto simp: Px_nn Pxy_nn space_pair_measure) moreover have "AE x in S ⨂⇩_{M}T. Py (snd x) = 0 ⟶ Pxy x = 0" by (rule subdensity_real[OF measurable_snd Pxy Py]) (auto simp: Py_nn Pxy_nn space_pair_measure) moreover have "AE x in S ⨂⇩_{M}T. Pxy x = Px (fst x) * Py (snd x)" by (intro ST.AE_pair_measure) (auto simp: ae intro!: measurable_snd'' measurable_fst'') ultimately have "AE x in S ⨂⇩_{M}T. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))) = 0" by eventually_elim simp then have "(∫x. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))) ∂(S ⨂⇩_{M}T)) = (∫x. 0 ∂(S ⨂⇩_{M}T))" by (intro integral_cong_AE) auto then show ?thesis by (subst mutual_information_distr[OF assms(1-8)]) (auto simp add: space_pair_measure) qed lemma (in information_space) mutual_information_simple_distributed: assumes X: "simple_distributed M X Px" and Y: "simple_distributed M Y Py" assumes XY: "simple_distributed M (λx. (X x, Y x)) Pxy" shows "ℐ(X ; Y) = (∑(x, y)∈(λx. (X x, Y x))`space M. Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y)))" proof (subst mutual_information_distr[OF _ _ simple_distributed[OF X] _ simple_distributed[OF Y] _ simple_distributed_joint[OF XY]]) note fin = simple_distributed_joint_finite[OF XY, simp] show "sigma_finite_measure (count_space (X ` space M))" by (simp add: sigma_finite_measure_count_space_finite) show "sigma_finite_measure (count_space (Y ` space M))" by (simp add: sigma_finite_measure_count_space_finite) let ?Pxy = "λx. (if x ∈ (λx. (X x, Y x)) ` space M then Pxy x else 0)" let ?f = "λx. ?Pxy x * log b (?Pxy x / (Px (fst x) * Py (snd x)))" have "⋀x. ?f x = (if x ∈ (λx. (X x, Y x)) ` space M then Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))) else 0)" by auto with fin show "(∫ x. ?f x ∂(count_space (X ` space M) ⨂⇩_{M}count_space (Y ` space M))) = (∑(x, y)∈(λx. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y)))" by (auto simp add: pair_measure_count_space lebesgue_integral_count_space_finite sum.If_cases split_beta' intro!: sum.cong) qed (insert X Y XY, auto simp: simple_distributed_def) lemma (in information_space) fixes Pxy :: "'b × 'c ⇒ real" and Px :: "'b ⇒ real" and Py :: "'c ⇒ real" assumes Px: "simple_distributed M X Px" and Py: "simple_distributed M Y Py" assumes Pxy: "simple_distributed M (λx. (X x, Y x)) Pxy" assumes ae: "∀x∈space M. Pxy (X x, Y x) = Px (X x) * Py (Y x)" shows mutual_information_eq_0_simple: "ℐ(X ; Y) = 0" proof (subst mutual_information_simple_distributed[OF Px Py Pxy]) have "(∑(x, y)∈(λx. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y))) = (∑(x, y)∈(λx. (X x, Y x)) ` space M. 0)" by (intro sum.cong) (auto simp: ae) then show "(∑(x, y)∈(λx. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y))) = 0" by simp qed subsection ‹Entropy› definition (in prob_space) entropy :: "real ⇒ 'b measure ⇒ ('a ⇒ 'b) ⇒ real" where "entropy b S X = - KL_divergence b S (distr M S X)" abbreviation (in information_space) entropy_Pow ("ℋ'(_')") where "ℋ(X) ≡ entropy b (count_space (X`space M)) X" lemma (in prob_space) distributed_RN_deriv: assumes X: "distributed M S X Px" shows "AE x in S. RN_deriv S (density S Px) x = Px x" proof - have "distributed M S X (RN_deriv S (density S Px))" by (metis RN_derivI assms borel_measurable_RN_deriv distributed_def) then show ?thesis using assms distributed_unique by blast qed lemma (in information_space) fixes X :: "'a ⇒ 'b" assumes X[measurable]: "distributed M MX X f" and nn: "⋀x. x ∈ space MX ⟹ 0 ≤ f x" shows entropy_distr: "entropy b MX X = - (∫x. f x * log b (f x) ∂MX)" (is ?eq) proof - note D = distributed_measurable[OF X] distributed_borel_measurable[OF X] note ae = distributed_RN_deriv[OF X] note distributed_real_measurable[OF nn X, measurable] have ae_eq: "AE x in distr M MX X. log b (enn2real (RN_deriv MX (distr M MX X) x)) = log b (f x)" unfolding distributed_distr_eq_density[OF X] using D ae by (auto simp: AE_density) have int_eq: "(∫ x. f x * log b (f x) ∂MX) = (∫ x. log b (f x) ∂distr M MX X)" unfolding distributed_distr_eq_density[OF X] using D by (subst integral_density) (auto simp: nn) show ?eq unfolding entropy_def KL_divergence_def entropy_density_def comp_def int_eq neg_equal_iff_equal using ae_eq by (intro integral_cong_AE) (auto simp: nn) qed lemma (in information_space) entropy_le: fixes Px :: "'b ⇒ real" and MX :: "'b measure" assumes X[measurable]: "distributed M MX X Px" and Px_nn[simp]: "⋀x. x ∈ space MX ⟹ 0 ≤ Px x" and fin: "emeasure MX {x ∈ space MX. Px x ≠ 0} ≠ top" and int: "integrable MX (λx. - Px x * log b (Px x))" shows "entropy b MX X ≤ log b (measure MX {x ∈ space MX. Px x ≠ 0})" proof - note Px = distributed_borel_measurable[OF X] interpret X: prob_space "distr M MX X" using distributed_measurable[OF X] by (rule prob_space_distr) have " - log b (measure MX {x ∈ space MX. Px x ≠ 0}) = - log b (∫ x. indicator {x ∈ space MX. Px x ≠ 0} x ∂MX)" using Px Px_nn fin by (auto simp: measure_def) also have "- log b (∫ x. indicator {x ∈ space MX. Px x ≠ 0} x ∂MX) = - log b (∫ x. 1 / Px x ∂distr M MX X)" proof - have "integral⇧^{L}MX (indicator {x ∈ space MX. Px x ≠ 0}) = LINT x|MX. Px x *⇩_{R}(1 / Px x)" by (rule Bochner_Integration.integral_cong) auto also have "... = LINT x|density MX (λx. ennreal (Px x)). 1 / Px x" by (rule integral_density [symmetric]) (use Px Px_nn in auto) finally show ?thesis unfolding distributed_distr_eq_density[OF X] by simp qed also have "… ≤ (∫ x. - log b (1 / Px x) ∂distr M MX X)" proof (rule X.jensens_inequality[of "λx. 1 / Px x" "{0<..}" 0 1 "λx. - log b x"]) show "AE x in distr M MX X. 1 / Px x ∈ {0<..}" unfolding distributed_distr_eq_density[OF X] using Px by (auto simp: AE_density) have [simp]: "⋀x. x ∈ space MX ⟹ ennreal (if Px x = 0 then 0 else 1) = indicator {x ∈ space MX. Px x ≠ 0} x" by (auto simp: one_ennreal_def) have "(∫⇧^{+}x. ennreal (- (if Px x = 0 then 0 else 1)) ∂MX) = (∫⇧^{+}x. 0 ∂MX)" by (intro nn_integral_cong) (auto simp: ennreal_neg) then show "integrable (distr M MX X) (λx. 1 / Px x)" unfolding distributed_distr_eq_density[OF X] using Px by (auto simp: nn_integral_density real_integrable_def fin ennreal_neg ennreal_mult[symmetric] cong: nn_integral_cong) have "integrable MX (λx. Px x * log b (1 / Px x)) = integrable MX (λx. - Px x * log b (Px x))" using Px by (intro integrable_cong_AE) (auto simp: log_divide_eq less_le) then show "integrable (distr M MX X) (λx. - log b (1 / Px x))" unfolding distributed_distr_eq_density[OF X] using Px int by (subst integrable_real_density) auto qed (auto simp: minus_log_convex[OF b_gt_1]) also have "… = (∫ x. log b (Px x) ∂distr M MX X)" unfolding distributed_distr_eq_density[OF X] using Px by (intro integral_cong_AE) (auto simp: AE_density log_divide_eq) also have "… = - entropy b MX X" unfolding distributed_distr_eq_density[OF X] using Px by (subst entropy_distr[OF X]) (auto simp: integral_density) finally show ?thesis by simp qed lemma (in information_space) entropy_le_space: fixes Px :: "'b ⇒ real" and MX :: "'b measure" assumes X: "distributed M MX X Px" and Px_nn[simp]: "⋀x. x ∈ space MX ⟹ 0 ≤ Px x" and fin: "finite_measure MX" and int: "integrable MX (λx. - Px x * log b (Px x))" shows "entropy b MX X ≤ log b (measure MX (space MX))" proof - note Px = distributed_borel_measurable[OF X] interpret finite_measure MX by fact have "entropy b MX X ≤ log b (measure MX {x ∈ space MX. Px x ≠ 0})" using int X by (intro entropy_le) auto also have "… ≤ log b (measure MX (space MX))" using Px distributed_imp_emeasure_nonzero[OF X] by (intro Transcendental.log_mono) (auto intro!: finite_measure_mono b_gt_1 less_le[THEN iffD2] simp: emeasure_eq_measure cong: conj_cong) finally show ?thesis . qed lemma (in information_space) entropy_uniform: assumes X: "distributed M MX X (λx. indicator A x / measure MX A)" (is "distributed _ _ _ ?f") shows "entropy b MX X = log b (measure MX A)" proof (subst entropy_distr[OF X]) have [simp]: "emeasure MX A ≠ ∞" using uniform_distributed_params[OF X] by (auto simp add: measure_def) have eq: "(∫ x. indicator A x / measure MX A * log b (indicator A x / measure MX A) ∂MX) = (∫ x. (- log b (measure MX A) / measure MX A) * indicator A x ∂MX)" using uniform_distributed_params[OF X] by (intro Bochner_Integration.integral_cong) (auto split: split_indicator simp: log_divide_eq zero_less_measure_iff) show "- (∫ x. indicator A x / measure MX A * log b (indicator A x / measure MX A) ∂MX) = log b (measure MX A)" unfolding eq using uniform_distributed_params[OF X] by (subst Bochner_Integration.integral_mult_right) (auto simp: measure_def less_top[symmetric] intro!: integrable_real_indicator) qed simp lemma (in information_space) entropy_simple_distributed: "simple_distributed M X f ⟹ ℋ(X) = - (∑x∈X`space M. f x * log b (f x))" by (subst entropy_distr[OF simple_distributed]) (auto simp add: lebesgue_integral_count_space_finite) lemma (in information_space) entropy_le_card_not_0: assumes X: "simple_distributed M X f" shows "ℋ(X) ≤ log b (card (X ` space M ∩ {x. f x ≠ 0}))" proof - let ?X = "count_space (X`space M)" have "ℋ(X) ≤ log b (measure ?X {x ∈ space ?X. f x ≠ 0})" by (rule entropy_le[OF simple_distributed[OF X]]) (insert X, auto simp: simple_distributed_finite[OF X] subset_eq integrable_count_space emeasure_count_space) also have "measure ?X {x ∈ space ?X. f x ≠ 0} = card (X ` space M ∩ {x. f x ≠ 0})" by (simp_all add: simple_distributed_finite[OF X] subset_eq emeasure_count_space measure_def Int_def) finally show ?thesis . qed lemma (in information_space) entropy_le_card: assumes X: "simple_distributed M X f" shows "ℋ(X) ≤ log b (real (card (X ` space M)))" proof - let ?X = "count_space (X`space M)" have "ℋ(X) ≤ log b (measure ?X (space ?X))" by (rule entropy_le_space[OF simple_distributed[OF X]]) (insert X, auto simp: simple_distributed_finite[OF X] subset_eq integrable_count_space emeasure_count_space finite_measure_count_space) also have "measure ?X (space ?X) = card (X ` space M)" by (simp_all add: simple_distributed_finite[OF X] subset_eq emeasure_count_space measure_def) finally show ?thesis . qed subsection ‹Conditional Mutual Information› definition (in prob_space) "conditional_mutual_information b MX MY MZ X Y Z ≡ mutual_information b MX (MY ⨂⇩_{M}MZ) X (λx. (Y x, Z x)) - mutual_information b MX MZ X Z" abbreviation (in information_space) conditional_mutual_information_Pow ("ℐ'( _ ; _ | _ ')") where "ℐ(X ; Y | Z) ≡ conditional_mutual_information b (count_space (X ` space M)) (count_space (Y ` space M)) (count_space (Z ` space M)) X Y Z" lemma (in information_space) assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T" and P: "sigma_finite_measure P" assumes Px[measurable]: "distributed M S X Px" and Px_nn[simp]: "⋀x. x ∈ space S ⟹ 0 ≤ Px x" assumes Pz[measurable]: "distributed M P Z Pz" and Pz_nn[simp]: "⋀z. z ∈ space P ⟹ 0 ≤ Pz z" assumes Pyz[measurable]: "distributed M (T ⨂⇩_{M}P) (λx. (Y x, Z x)) Pyz" and Pyz_nn[simp]: "⋀y z. y ∈ space T ⟹ z ∈ space P ⟹ 0 ≤ Pyz (y, z)" assumes Pxz[measurable]: "distributed M (S ⨂⇩_{M}P) (λx. (X x, Z x)) Pxz" and Pxz_nn[simp]: "⋀x z. x ∈ space S ⟹ z ∈ space P ⟹ 0 ≤ Pxz (x, z)" assumes Pxyz[measurable]: "distributed M (S ⨂⇩_{M}T ⨂⇩_{M}P) (λx. (X x, Y x, Z x)) Pxyz" and Pxyz_nn[simp]: "⋀x y z. x ∈ space S ⟹ y ∈ space T ⟹ z ∈ space P ⟹ 0 ≤ Pxyz (x, y, z)" assumes I1: "integrable (S ⨂⇩_{M}T ⨂⇩_{M}P) (λ(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Px x * Pyz (y, z))))" assumes I2: "integrable (S ⨂⇩_{M}T ⨂⇩_{M}P) (λ(x, y, z). Pxyz (x, y, z) * log b (Pxz (x, z) / (Px x * Pz z)))" shows conditional_mutual_information_generic_eq: "conditional_mutual_information b S T P X Y Z = (∫(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))) ∂(S ⨂⇩_{M}T ⨂⇩_{M}P))" (is "?eq") and conditional_mutual_information_generic_nonneg: "0 ≤ conditional_mutual_information b S T P X Y Z" (is "?nonneg") proof - have [measurable]: "Px ∈ S →⇩_{M}borel" using Px Px_nn by (intro distributed_real_measurable) have [measurable]: "Pz ∈ P →⇩_{M}borel" using Pz Pz_nn by (intro distributed_real_measurable) have measurable_Pyz[measurable]: "Pyz ∈ (T ⨂⇩_{M}P) →⇩_{M}borel" using Pyz Pyz_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure) have measurable_Pxz[measurable]: "Pxz ∈ (S ⨂⇩_{M}P) →⇩_{M}borel" using Pxz Pxz_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure) have measurable_Pxyz[measurable]: "Pxyz ∈ (S ⨂⇩_{M}T ⨂⇩_{M}P) →⇩_{M}borel" using Pxyz Pxyz_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure) interpret S: sigma_finite_measure S by fact interpret T: sigma_finite_measure T by fact interpret P: sigma_finite_measure P by fact interpret TP: pair_sigma_finite T P .. interpret SP: pair_sigma_finite S P .. interpret ST: pair_sigma_finite S T .. interpret SPT: pair_sigma_finite "S ⨂⇩_{M}P" T .. interpret STP: pair_sigma_finite S "T ⨂⇩_{M}P" .. interpret TPS: pair_sigma_finite "T ⨂⇩_{M}P" S .. have TP: "sigma_finite_measure (T ⨂⇩_{M}P)" .. have SP: "sigma_finite_measure (S ⨂⇩_{M}P)" .. have YZ: "random_variable (T ⨂⇩_{M}P) (λx. (Y x, Z x))" using Pyz by (simp add: distributed_measurable) from Pxz Pxyz have distr_eq: "distr M (S ⨂⇩_{M}P) (λx. (X x, Z x)) = distr (distr M (S ⨂⇩_{M}T ⨂⇩_{M}P) (λx. (X x, Y x, Z x))) (S ⨂⇩_{M}P) (λ(x, y, z). (x, z))" by (simp add: comp_def distr_distr) have "mutual_information b S P X Z = (∫x. Pxz x * log b (Pxz x / (Px (fst x) * Pz (snd x))) ∂(S ⨂⇩_{M}P))" by (rule mutual_information_distr[OF S P Px Px_nn Pz Pz_nn Pxz Pxz_nn]) also have "… = (∫(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) ∂(S ⨂⇩_{M}T ⨂⇩_{M}P))" using b_gt_1 Pxz Px Pz by (subst distributed_transform_integral[OF Pxyz _ Pxz _, where T="λ(x, y, z). (x, z)"]) (auto simp: split_beta' space_pair_measure) finally have mi_eq: "mutual_information b S P X Z = (∫(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) ∂(S ⨂⇩_{M}T ⨂⇩_{M}P))" . have ae1: "AE x in S ⨂⇩_{M}T ⨂⇩_{M}P. Px (fst x) = 0 ⟶ Pxyz x = 0" by (intro subdensity_real[of fst, OF _ Pxyz Px]) (auto simp: space_pair_measure) moreover have ae2: "AE x in S ⨂⇩_{M}T ⨂⇩_{M}P. Pz (snd (snd x)) = 0 ⟶ Pxyz x = 0" by (intro subdensity_real[of "λx. snd (snd x)", OF _ Pxyz Pz]) (auto simp: space_pair_measure) moreover have ae3: "AE x in S ⨂⇩_{M}T ⨂⇩_{M}P. Pxz (fst x, snd (snd x)) = 0 ⟶ Pxyz x = 0" by (intro subdensity_real[of "λx. (fst x, snd (snd x))", OF _ Pxyz Pxz]) (auto simp: space_pair_measure) moreover have ae4: "AE x in S ⨂⇩_{M}T ⨂⇩_{M}P. Pyz (snd x) = 0 ⟶ Pxyz x = 0" by (intro subdensity_real[of snd, OF _ Pxyz Pyz]) (auto simp: space_pair_measure) ultimately have ae: "AE x in S ⨂⇩_{M}T ⨂⇩_{M}P. Pxyz x * log b (Pxyz x / (Px (fst x) * Pyz (snd x))) - Pxyz x * log b (Pxz (fst x, snd (snd x)) / (Px (fst x) * Pz (snd (snd x)))) = Pxyz x * log b (Pxyz x * Pz (snd (snd x)) / (Pxz (fst x, snd (snd x)) * Pyz (snd x))) " using AE_space proof eventually_elim case (elim x) show ?case proof cases assume "Pxyz x ≠ 0" with elim have "0 < Px (fst x)" "0 < Pz (snd (snd x))" "0 < Pxz (fst x, snd (snd x))" "0 < Pyz (snd x)" "0 < Pxyz x" by (auto simp: space_pair_measure less_le) then show ?thesis using b_gt_1 by (simp add: log_simps less_imp_le field_simps) qed simp qed with I1 I2 show ?eq unfolding conditional_mutual_information_def apply (subst mi_eq) apply (subst mutual_information_distr[OF S TP Px Px_nn Pyz _ Pxyz]) apply (auto simp: space_pair_measure) apply (subst Bochner_Integration.integral_diff[symmetric]) apply (auto intro!: integral_cong_AE simp: split_beta' simp del: Bochner_Integration.integral_diff) done let ?P = "density (S ⨂⇩_{M}T ⨂⇩_{M}P) Pxyz" interpret P: prob_space ?P unfolding distributed_distr_eq_density[OF Pxyz, symmetric] by (rule prob_space_distr) simp let ?Q = "density (T ⨂⇩_{M}P) Pyz" interpret Q: prob_space ?Q unfolding distributed_distr_eq_density[OF Pyz, symmetric] by (rule prob_space_distr) simp let ?f = "λ(x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) / Pxyz (x, y, z)" from subdensity_real[of snd, OF _ Pyz Pz _ AE_I2 AE_I2] have aeX1: "AE x in T ⨂⇩_{M}P. Pz (snd x) = 0 ⟶ Pyz x = 0" by (auto simp: comp_def space_pair_measure) have aeX2: "AE x in T ⨂⇩_{M}P. 0 ≤ Pz (snd x)" using Pz by (intro TP.AE_pair_measure) (auto simp: comp_def) have aeX3: "AE y in T ⨂⇩_{M}P. (∫⇧^{+}x. ennreal (Pxz (x, snd y)) ∂S) = ennreal (Pz (snd y))" using Pz distributed_marginal_eq_joint2[OF P S Pz Pxz] by (intro TP.AE_pair_measure) auto have "(∫⇧^{+}x. ?f x ∂?P) ≤ (∫⇧^{+}(x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) ∂(S ⨂⇩_{M}T ⨂⇩_{M}P))" by (subst nn_integral_density) (auto intro!: nn_integral_mono simp: space_pair_measure ennreal_mult[symmetric]) also have "… = (∫⇧^{+}(y, z). (∫⇧^{+}x. ennreal (Pxz (x, z)) * ennreal (Pyz (y, z) / Pz z) ∂S) ∂(T ⨂⇩_{M}P))" by (subst STP.nn_integral_snd[symmetric]) (auto simp add: split_beta' ennreal_mult[symmetric] space_pair_measure intro!: nn_integral_cong) also have "… = (∫⇧^{+}x. ennreal (Pyz x) * 1 ∂T ⨂⇩_{M}P)" proof - have D: "(∫⇧^{+}x. ennreal (Pxz (x, b)) * ennreal (Pyz (a, b) / Pz b) ∂S) = ennreal (Pyz (a, b))" if "Pz b = 0 ⟶ Pyz (a, b) = 0" "a ∈ space T ∧ b ∈ space P" "(∫⇧^{+}x. ennreal (Pxz (x, b)) ∂S) = ennreal (Pz b)" for a b using that by (subst nn_integral_multc) (auto split: prod.split simp: ennreal_mult[symmetric]) show ?thesis apply (rule nn_integral_cong_AE) using aeX1 aeX2 aeX3 by (force simp add: space_pair_measure D) qed also have "… = 1" using Q.emeasure_space_1 distributed_distr_eq_density[OF Pyz] by (subst nn_integral_density[symmetric]) auto finally have le1: "(∫⇧^{+}x. ?f x ∂?P) ≤ 1" . also have "… < ∞" by simp finally have fin: "(∫⇧^{+}x. ?f x ∂?P) ≠ ∞" by simp have pos: "(∫⇧^{+}x. ?f x ∂?P) ≠ 0" apply (subst nn_integral_density) apply (simp_all add: split_beta') proof let ?g = "λx. ennreal (Pxyz x) * (Pxz (fst x, snd (snd x)) * Pyz (snd x) / (Pz (snd (snd x)) * Pxyz x))" assume "(∫⇧^{+}x. ?g x ∂(S ⨂⇩_{M}T ⨂⇩_{M}P)) = 0" then have "AE x in S ⨂⇩_{M}T ⨂⇩_{M}P. ?g x = 0" by (intro nn_integral_0_iff_AE[THEN iffD1]) auto then have "AE x in S ⨂⇩_{M}T ⨂⇩_{M}P. Pxyz x = 0" using ae1 ae2 ae3 ae4 by (auto split: if_split_asm simp: mult_le_0_iff divide_le_0_iff space_pair_measure) then have "(∫⇧^{+}x. ennreal (Pxyz x) ∂S ⨂⇩_{M}T ⨂⇩_{M}P) = 0" by (subst nn_integral_cong_AE[of _ "λx. 0"]) auto with P.emeasure_space_1 show False by (subst (asm) emeasure_density) (auto cong: nn_integral_cong) qed have neg: "(∫⇧^{+}x. - ?f x ∂?P) = 0" by (subst nn_integral_0_iff_AE) (auto simp: space_pair_measure ennreal_neg) have I3: "integrable (S ⨂⇩_{M}T ⨂⇩_{M}P) (λ(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))" apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ Bochner_Integration.integrable_diff[OF I1 I2]]) using ae apply (auto simp: split_beta') done have "- log b 1 ≤ - log b (integral⇧^{L}?P ?f)" proof (intro le_imp_neg_le Transcendental.log_mono[OF b_gt_1]) have If: "integrable ?P ?f" unfolding real_integrable_def proof (intro conjI) from neg show "(∫⇧^{+}x. - ?f x ∂?P) ≠ ∞" by simp from fin show "(∫⇧^{+}x. ?f x ∂?P) ≠ ∞" by simp qed simp then have "(∫⇧^{+}x. ?f x ∂?P) = (∫x. ?f x ∂?P)" apply (rule nn_integral_eq_integral) apply (auto simp: space_pair_measure ennreal_neg) done with pos le1 show "0 < (∫x. ?f x ∂?P)" "(∫x. ?f x ∂?P) ≤ 1" by (simp_all add: one_ennreal.rep_eq zero_less_iff_neq_zero[symmetric]) qed also have "- log b (integral⇧^{L}?P ?f) ≤ (∫ x. - log b (?f x) ∂?P)" proof (rule P.jensens_inequality[where a=0 and b=1 and I="{0<..}"]) show "AE x in ?P. ?f x ∈ {0<..}" unfolding AE_density[OF distributed_borel_measurable[OF Pxyz]] using ae1 ae2 ae3 ae4 by (auto simp: space_pair_measure less_le) show "integrable ?P ?f" unfolding real_integrable_def using fin neg by (auto simp: split_beta') have "integrable (S ⨂⇩_{M}T ⨂⇩_{M}P) (λx. Pxyz x * - log b (?f x))" apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ I3]) using ae1 ae2 ae3 ae4 apply (auto simp: log_divide_eq log_mult_eq zero_le_mult_iff zero_less_mult_iff zero_less_divide_iff field_simps less_le space_pair_measure) done then show "integrable ?P (λx. - log b (?f x))" by (subst integrable_real_density) (auto simp: space_pair_measure) qed (auto simp: b_gt_1 minus_log_convex) also have "… = conditional_mutual_information b S T P X Y Z" unfolding ‹?eq› apply (subst integral_real_density) apply simp apply (force simp: space_pair_measure) apply simp apply (intro integral_cong_AE) using ae1 ae2 ae3 ae4 apply (auto simp: