Theory Prefix_Free_Code_Combinators

section ‹Introduction\label{sec:intro}›

theory Prefix_Free_Code_Combinators
  imports
    "HOL-Library.Extended_Real"
    "HOL-Library.Float"
    "HOL-Library.FuncSet"
    "HOL-Library.List_Lexorder"
    "HOL-Library.Log_Nat"
    "HOL-Library.Sublist"
begin

text ‹The encoders are represented as partial prefix-free functions. The advantage of
prefix free codes is that they can be easily combined by concatenation. The approach
of using prefix free codes (on the byte-level) for the representation of complex data
structures is common in many industry encoding libraries (cf. \<^cite>‹"rfc8949"›).

The reason for representing encoders using partial functions, stems from some use-cases
where the objects to be encoded may be in a much smaller sets, as their type may suggest.
For example a natural number may be known to have a given range, or a function may be
encodable because it has a finite domain.

Note: Prefix-free codes can also be automatically derived using Huffmans' algorithm, which was
formalized by Blanchette~\<^cite>‹"Huffman-AFP"›. This is especially useful if it is possible to transmit
a dictionary before the data. On the other hand these standard codes are useful, when the above is
impractical and/or the distribution of the input is unknown or expected to be close to the one's
implied by standard codes.

The following section contains general definitions and results, followed by
Section~\ref{sec:dep_enc} to \ref{sec:float_enc} where encoders for primitive types
and combinators are defined. Each construct is accompanied by lemmas verifying that they
form prefix free codes as well as bounds on the bit count to encode the data.
Section~\ref{sec:examples} concludes with a few examples.›

section ‹Encodings›

fun opt_prefix where
  "opt_prefix (Some x) (Some y) = prefix x y" |
  "opt_prefix _ _ = False"

definition "opt_comp x y = (opt_prefix x y ∨ opt_prefix y x)"

fun opt_append :: "bool list option ⇒ bool list option ⇒ bool list option"
  where
    "opt_append (Some x) (Some y) = Some (x@y)" |
    "opt_append _ _ = None"

lemma opt_comp_sym: "opt_comp x y = opt_comp y x"
  by (simp add:opt_comp_def, blast)

lemma opt_comp_append:
  assumes "opt_comp (opt_append x y) z"
  shows "opt_comp x z"
proof -
  obtain x' y' z' where a: "x = Some x'" "y = Some y'" "z = Some z'"
    using assms
    by (cases x, case_tac[!] y, case_tac[!] z, auto simp: opt_comp_def)
  have "prefix (x'@y') z' ∨ prefix z' (x'@y')"
    using a assms by (simp add:opt_comp_def)
  hence "prefix x' z' ∨ prefix z' x'"
    using prefix_same_cases append_prefixD by blast
  thus ?thesis
    using a by (simp add:opt_comp_def)
qed

lemma opt_comp_append_2:
  assumes "opt_comp x (opt_append y z)"
  shows "opt_comp x y"
  using opt_comp_append opt_comp_sym assms by blast

lemma opt_comp_append_3:
  assumes "opt_comp (opt_append x y) (opt_append x z)"
  shows "opt_comp y z"
  using assms
  by (cases x, case_tac[!] y, case_tac[!] z, auto simp: opt_comp_def)

type_synonym 'a encoding = "'a ⇀ bool list"

definition is_encoding :: "'a encoding ⇒ bool"
  where "is_encoding f = (∀x y. opt_prefix (f x) (f y) ⟶ x = y)"

text ‹An encoding function is represented as partial functions into lists of booleans, where
each list element represents a bit. Such a function is defined to be an encoding, if it is
prefix-free on its domain. This is similar to the formalization by Hibon and
Paulson~\<^cite>‹"Source_Coding_Theorem-AFP"› except for the use of partial functions for the
practical reasons described in Section~\ref{sec:intro}.›

lemma is_encodingI:
  assumes "⋀x x' y y'. e x = Some x' ⟹ e y = Some y' ⟹
    prefix x' y' ⟹ x = y"
  shows "is_encoding e"
proof -
  have "opt_prefix (e x) (e y) ⟹ x = y" for x y
    using assms by (cases "e x", case_tac[!] "e y", auto)
  thus ?thesis by (simp add:is_encoding_def)
qed

lemma is_encodingI_2:
  assumes "⋀x y . opt_comp (e x) (e y) ⟹ x = y"
  shows "is_encoding e"
  using assms by (simp add:opt_comp_def is_encoding_def)

lemma encoding_triv: "is_encoding Map.empty"
  by (rule is_encodingI_2, simp add:opt_comp_def)

lemma is_encodingD:
  assumes "is_encoding e"
  assumes "opt_comp (e x) (e y)"
  shows "x = y"
  using assms by (auto simp add:opt_comp_def is_encoding_def)

lemma encoding_imp_inj:
  assumes "is_encoding f"
  shows "inj_on f (dom f)"
  using assms
  by (intro inj_onI, simp add:is_encoding_def, force)

fun bit_count :: "bool list option ⇒ ereal" where
  "bit_count None = ∞" |
  "bit_count (Some x) = ereal (length x)"

lemma bit_count_finite_imp_dom:
  "bit_count (f x) < ∞ ⟹ x ∈ dom f"
  by (cases "f x", auto)

lemma bit_count_append:
  "bit_count (opt_append x y) = bit_count x + bit_count y"
  by (cases x, case_tac[!] "y", simp_all)

section ‹(Dependent) Products\label{sec:dep_enc}›

definition encode_dependent_prod ::
  "'a encoding ⇒ ('a ⇒ 'b encoding) ⇒ ('a × 'b) encoding"
  (infixr ‹⋈⇩_e› 65)
  where
    "encode_dependent_prod e f x =
      opt_append (e (fst x)) (f (fst x) (snd x))"

lemma dependent_encoding:
  assumes "is_encoding e1"
  assumes "⋀x. x ∈ dom e1 ⟹ is_encoding (e2 x)"
  shows "is_encoding (e1 ⋈⇩_e e2)"
proof (rule is_encodingI_2)
  fix x y
  assume a:"opt_comp ((e1 ⋈⇩_e e2) x) ((e1 ⋈⇩_e e2) y)"
  have d:"opt_comp (e1 (fst x)) (e1 (fst y))"
    using a unfolding encode_dependent_prod_def
    by (metis opt_comp_append opt_comp_append_2)
  hence b:"fst x = fst y"
    using is_encodingD[OF assms(1)] by simp
  hence "opt_comp (e2 (fst x) (snd x)) (e2 (fst x) (snd y))"
    using a unfolding encode_dependent_prod_def by (metis opt_comp_append_3)
  moreover have "fst x ∈ dom e1" using d b
    by (cases "e1 (fst x)", simp_all add:opt_comp_def dom_def)
  ultimately have c:"snd x = snd y"
    using is_encodingD[OF assms(2)] by simp
  show "x = y"
    using b c by (simp add: prod_eq_iff)
qed

lemma dependent_bit_count:
  "bit_count ((e⇩₁ ⋈⇩_e e⇩₂) (x⇩₁,x⇩₂)) =
    bit_count (e⇩₁ x⇩₁) + bit_count (e⇩₂ x⇩₁ x⇩₂)"
  by (simp add: encode_dependent_prod_def bit_count_append)

lemma dependent_bit_count_2:
  "bit_count ((e⇩₁ ⋈⇩_e e⇩₂) x) =
    bit_count (e⇩₁ (fst x)) + bit_count (e⇩₂ (fst x) (snd x))"
  by (simp add: encode_dependent_prod_def bit_count_append)

text ‹This abbreviation is for non-dependent products.›

abbreviation encode_prod ::
  "'a encoding ⇒ 'b encoding ⇒ ('a × 'b) encoding"
  (infixr ‹×⇩_e› 65)
  where
    "encode_prod e1 e2 ≡ e1 ⋈⇩_e (λ_. e2)"

section ‹Composition›

lemma encoding_compose:
  assumes "is_encoding f"
  assumes "inj_on g {x. p x}"
  shows "is_encoding (λx. if p x then f (g x) else None)"
  using assms by (simp add:comp_def is_encoding_def inj_onD)

lemma encoding_compose_2:
  assumes "is_encoding f"
  assumes "inj g"
  shows "is_encoding (λx. f (g x))"
  using assms by (simp add:comp_def is_encoding_def inj_onD)

section ‹Natural Numbers›

fun encode_bounded_nat :: "nat ⇒ nat ⇒ bool list" where
  "encode_bounded_nat (Suc l) n =
    (let r = n ≥ (2^l) in r#encode_bounded_nat l (n-of_bool r*2^l))" |
  "encode_bounded_nat 0 _ = []"

lemma encode_bounded_nat_prefix_free:
  fixes u v l :: nat
  assumes "u < 2^l"
  assumes "v < 2^l"
  assumes "prefix (encode_bounded_nat l u) (encode_bounded_nat l v)"
  shows "u = v"
  using assms
proof (induction l arbitrary: u v)
  case 0
  then show ?case by simp
next
  case (Suc l)
  have "prefix (encode_bounded_nat l (u - of_bool (u ≥ 2^l)*2^l))
    (encode_bounded_nat l (v - of_bool (v ≥ 2^l)*2^l))"
    and a:"(u ≥ 2^l) = (v ≥ 2^l)"
    using Suc(4) by (simp_all add: Let_def split: split_of_bool_asm)
  moreover have "u - of_bool (u ≥ 2^l)*2^l < 2^l"
    using Suc(2) by (cases "u < 2^l", auto simp add:of_bool_def)
  moreover have "v - of_bool (v ≥ 2^l)*2^l < 2^l"
    using Suc(3) by (cases "v < 2^l", auto simp add:of_bool_def)
  ultimately have
    "u - of_bool (u ≥ 2^l)*2^l = v - of_bool (v ≥ 2^l)*2^l"
    by (intro Suc(1), simp_all)
  thus "u = v" using a by (simp split: split_of_bool_asm)
qed

definition Nb⇩_e :: "nat ⇒ nat encoding"
  where "Nb⇩_e l n = (
    if n < l
      then Some (encode_bounded_nat (floorlog 2 (l-1)) n)
      else None)"

text ‹@{term "Nb⇩_e l"} is  encoding for natural numbers strictly smaller than
  @{term "l"} using a fixed length encoding.›

lemma bounded_nat_bit_count:
  "bit_count (Nb⇩_e l y) = (if y < l then floorlog 2 (l-1) else ∞)"
proof -
  have a:"length (encode_bounded_nat h m) = h" for h m
    by (induction h arbitrary: m, simp, simp add:Let_def)
  show ?thesis
    using a by (simp add:Nb⇩_e_def)
qed

lemma bounded_nat_bit_count_2:
  assumes "y < l"
  shows "bit_count (Nb⇩_e l y) = floorlog 2 (l-1)"
  using assms bounded_nat_bit_count by simp

lemma "dom (Nb⇩_e l) = {..<l}"
  by (simp add:Nb⇩_e_def dom_def lessThan_def)

lemma bounded_nat_encoding: "is_encoding (Nb⇩_e l)"
proof -
  have "x < l ⟹ x < 2 ^ floorlog 2 (l-1)" for x :: nat
    by (intro floorlog_leD floorlog_mono, auto)
  thus ?thesis
    using encode_bounded_nat_prefix_free
    by (intro is_encodingI, simp add:Nb⇩_e_def split:if_splits, blast)
qed

fun encode_unary_nat :: "nat ⇒ bool list" where
  "encode_unary_nat (Suc l) = False#(encode_unary_nat l)" |
  "encode_unary_nat 0 = [True]"

lemma encode_unary_nat_prefix_free:
  fixes u v :: nat
  assumes "prefix (encode_unary_nat u) (encode_unary_nat v)"
  shows "u = v"
  using assms
proof (induction u arbitrary: v)
  case 0
  then show ?case by (cases v, simp_all)
next
  case (Suc u)
  then show ?case by (cases v, simp_all)
qed

definition Nu⇩_e :: "nat encoding"
  where "Nu⇩_e n = Some (encode_unary_nat n)"

text ‹@{term "Nu⇩_e"} is encoding for natural numbers using unary encoding. It is
inefficient except for special cases, where the probability of large numbers decreases
exponentially with its magnitude.›

lemma unary_nat_bit_count:
  "bit_count (Nu⇩_e n) = Suc n"
  unfolding Nu⇩_e_def by (induction n, auto)

lemma unary_encoding: "is_encoding Nu⇩_e"
    using encode_unary_nat_prefix_free
    by (intro is_encodingI, simp add:Nu⇩_e_def)

text ‹Encoding for positive numbers using Elias-Gamma code.›

definition Ng⇩_e :: "nat encoding" where
  "Ng⇩_e n =
    (if n > 0
      then (Nu⇩_e ⋈⇩_e (λr. Nb⇩_e (2^r)))
        (let r = floorlog 2 n - 1 in (r, n - 2^r))
      else None)"

text ‹@{term "Ng⇩_e"} is an encoding for positive numbers using Elias-Gamma encoding\<^cite>‹"elias1975"›.›

lemma elias_gamma_bit_count:
  "bit_count (Ng⇩_e n) = (if n > 0 then 2 * ⌊log 2 n⌋ + 1 else (∞::ereal))"
proof (cases "n > 0")
  case True
  define r where "r = floorlog 2 n - Suc 0"
  have "floorlog 2 n ≠ 0"
    using True
    by (simp add:floorlog_eq_zero_iff)
  hence a:"floorlog 2 n > 0" by simp

  have "n < 2^(floorlog 2 n)"
    using True floorlog_bounds by simp
  also have "... = 2^(r+1)"
    using a by (simp add:r_def)
  finally have "n < 2^(r+1)" by simp
  hence b:"n - 2^r < 2^r" by simp
  have "floorlog 2 (2 ^ r - Suc 0) ≤ r"
    by (rule floorlog_leI, auto)
  moreover have "r ≤ floorlog 2 (2 ^ r - Suc 0)"
    by (cases r, simp, auto intro: floorlog_geI)
  ultimately have c:"floorlog 2 (2 ^ r - Suc 0) = r"
    using order_antisym by blast

  have "bit_count (Ng⇩_e n) = bit_count (Nu⇩_e r) +
    bit_count (Nb⇩_e (2 ^ r) (n - 2 ^ r))"
    using True by (simp add:Ng⇩_e_def r_def[symmetric] dependent_bit_count)
  also have "... = ereal (r + 1) + ereal (r)"
    using b c
    by (simp add: unary_nat_bit_count bounded_nat_bit_count)
  also have "... = 2 * r + 1" by simp
  also have "... = 2 * ⌊log 2 n⌋ + 1"
    using True by (simp add:floorlog_def r_def)
  finally show ?thesis using True by simp
next
  case False
  then show ?thesis by (simp add:Ng⇩_e_def)
qed

lemma elias_gamma_encoding: "is_encoding Ng⇩_e"
proof -
  have a: "inj_on (λx. let r = floorlog 2 x - 1 in (r, x - 2 ^ r))
    {n. 0 < n}"
  proof (rule inj_onI)
    fix x y :: nat
    assume "x ∈ {n. 0 < n}"
    hence x_pos: "0 < x" by simp
    assume "y ∈ {n. 0 < n}"
    hence y_pos: "0 < y" by simp
    define r where "r = floorlog 2 x - Suc 0"
    assume b:"(let r = floorlog 2 x - 1 in (r, x - 2 ^ r)) =
      (let r = floorlog 2 y - 1 in (r, y - 2 ^ r))"
    hence c:"r = floorlog 2 y - Suc 0"
      by (simp_all add:Let_def r_def)
    have "x - 2^r = y - 2^r" using b
      by (simp add:Let_def r_def[symmetric] c[symmetric]  prod_eq_iff)
    moreover have "x ≥ 2^r"
      using r_def x_pos floorlog_bounds by simp
    moreover have "y ≥ 2^r"
      using c floorlog_bounds y_pos by simp
    ultimately show "x = y" using eq_diff_iff by blast
  qed

  have "is_encoding (λn. Ng⇩_e n)"
    unfolding Ng⇩_e_def using a
    by (intro encoding_compose[where f="Nu⇩_e ⋈⇩_e (λr. Nb⇩_e (2^r))"]
        dependent_encoding unary_encoding bounded_nat_encoding) auto
  thus ?thesis by simp
qed

definition N⇩_e :: "nat encoding" where "N⇩_e x = Ng⇩_e (x+1)"

text ‹@{term "N⇩_e"} is an encoding for all natural numbers using exponential Golomb
encoding~\<^cite>‹"teuhola1978"›. Exponential Golomb codes are also used in video compression
applications~\<^cite>‹"richardson2010"›.›

lemma exp_golomb_encoding: "is_encoding N⇩_e"
proof -
  have "is_encoding (λn. N⇩_e n)"
    unfolding N⇩_e_def
    by (intro encoding_compose_2[where g="(λn. n + 1)"] elias_gamma_encoding, auto)
  thus ?thesis by simp
qed

lemma exp_golomb_bit_count_exact:
  "bit_count (N⇩_e n) = 2 * ⌊log 2 (n+1)⌋ + 1"
  by (simp add:N⇩_e_def elias_gamma_bit_count)

lemma exp_golomb_bit_count:
  "bit_count (N⇩_e n) ≤ (2 * log 2 (real n+1) + 1)"
  by (simp add:exp_golomb_bit_count_exact add.commute)

lemma exp_golomb_bit_count_est:
  assumes "n ≤ m "
  shows "bit_count (N⇩_e n) ≤ (2 * log 2 (real m+1) + 1)"
proof -
  have "bit_count (N⇩_e n) ≤ (2 * log 2 (real n+1) + 1)"
    using exp_golomb_bit_count by simp
  also have "... ≤ (2 * log 2 (real m+1) + 1)"
    using assms by simp
  finally show ?thesis by simp
qed

section ‹Integers›

definition I⇩_e :: "int encoding" where
  "I⇩_e x = N⇩_e (nat (if x ≤0 then (-2 * x) else (2*x-1)))"

text ‹@{term "I⇩_e"} is an encoding for integers using exponential Golomb codes by embedding
the integers into the natural numbers, specifically the positive numbers are embedded into
the odd-numbers and the negative numbers are embedded into the even numbers. The embedding
has the benefit, that the bit count for an integer only depends on its absolute value.›

lemma int_encoding: "is_encoding I⇩_e"
proof -
  have "inj (λx. nat (if x ≤ 0 then - 2 * x else 2 * x - 1))"
    by (rule inj_onI, auto simp add:eq_nat_nat_iff, presburger)
  thus ?thesis
    unfolding I⇩_e_def
    by (intro exp_golomb_encoding encoding_compose_2[where f="N⇩_e"])
      auto
qed

lemma int_bit_count: "bit_count (I⇩_e n) = 2 * ⌊log 2 (2*¦n¦+1)⌋ +1"
proof -
  have a:"m > 0 ⟹
    ⌊log (real 2) (real (2 * m))⌋ = ⌊log (real 2) (real (2 * m + 1))⌋"
    for m :: nat by (rule  floor_log_eq_if, auto)
  have "n > 0 ⟹
    ⌊log 2 (2 * real_of_int n)⌋ = ⌊log 2 (2 * real_of_int n + 1)⌋"
    using  a[where m="nat n"] by (simp add:add.commute)
  thus ?thesis
    by (simp add:I⇩_e_def exp_golomb_bit_count_exact floorlog_def)
qed

lemma int_bit_count_1:
  assumes "abs n > 0"
  shows "bit_count (I⇩_e n) = 2 * ⌊log 2 ¦n¦⌋ +3"
proof -
  have a:"m > 0 ⟹
    ⌊log (real 2) (real (2 * m))⌋ = ⌊log (real 2) (real (2 * m + 1))⌋"
    for m :: nat by (rule  floor_log_eq_if, auto)
  have "n < 0 ⟹
    ⌊log 2 (-2 * real_of_int n)⌋ = ⌊log 2 (1-2 * real_of_int n)⌋"
    using a[where m="nat (-n)"] by (simp add:add.commute)
  hence "bit_count (I⇩_e n) = 2 * ⌊log 2 (2*real_of_int ¦n¦)⌋ +1"
    using assms
    by (simp add:I⇩_e_def exp_golomb_bit_count_exact floorlog_def)
  also have "... = 2 * ⌊log 2 ¦n¦⌋ + 3"
    using assms by (subst log_mult, auto)
  finally show ?thesis by simp
qed

lemma int_bit_count_est_1:
  assumes "¦n¦ ≤ r"
  shows "bit_count (I⇩_e n) ≤ 2 * log 2 (r+1) +3"
proof (cases "abs n > 0")
  case True
  have "real_of_int ⌊log 2 ¦real_of_int n¦⌋ ≤ log 2  ¦real_of_int n¦"
    using of_int_floor_le by blast
  also have "... ≤ log 2 (real_of_int r+1)"
    using True assms by force
  finally have
    "real_of_int ⌊log 2 ¦real_of_int n¦⌋ ≤ log 2 (real_of_int r + 1)"
    by simp
  then show ?thesis
    using True assms by (simp add:int_bit_count_1)
next
  case False
  have "r ≥ 0" using assms by simp
  moreover have "n = 0" using False by simp
  ultimately show ?thesis by (simp add:I⇩_e_def exp_golomb_bit_count_exact)
qed

lemma int_bit_count_est:
  assumes "¦n¦ ≤ r"
  shows "bit_count (I⇩_e n) ≤ 2 * log 2 (2*r+1) +1"
proof -
  have "bit_count (I⇩_e n) ≤ 2 * log 2 (2*¦n¦+1) +1"
    by (simp add:int_bit_count)
  also have "... ≤ 2 * log 2 (2* r + 1) + 1"
    using assms by simp
  finally show ?thesis by simp
qed

section ‹Lists›

definition Lf⇩_e where
  "Lf⇩_e e n xs =
    (if length xs = n
      then fold (λx y. opt_append y (e x)) xs (Some [])
      else None)"

text ‹@{term "Lf⇩_e e n"} is an encoding for lists of length @{term"n"},
where the elements are encoding using the encoder @{term "e"}.›

lemma fixed_list_encoding:
  assumes "is_encoding e"
  shows "is_encoding (Lf⇩_e e n)"
proof (induction n)
  case 0
  then show ?case
    by (rule is_encodingI_2, simp_all add:Lf⇩_e_def opt_comp_def split:if_splits)
next
  case (Suc n)
  show ?case
  proof (rule is_encodingI_2)
    fix x y
    assume a:"opt_comp (Lf⇩_e e (Suc n) x) (Lf⇩_e e (Suc n) y)"
    have b:"length x = Suc n" using a
      by (cases "length x = Suc n", simp_all add:Lf⇩_e_def opt_comp_def)
    then obtain x1 x2 where x_def: "x = x1@[x2]" "length x1 = n"
      by (metis length_append_singleton lessI nat.inject order.refl
          take_all take_hd_drop)
    have c:"length y = Suc n" using a
      by (cases "length y = Suc n", simp_all add:Lf⇩_e_def opt_comp_def)
    then obtain y1 y2 where y_def: "y = y1@[y2]" "length y1 = n"
      by (metis length_append_singleton lessI nat.inject order.refl
          take_all take_hd_drop)
    have d: "opt_comp (opt_append (Lf⇩_e e n x1) (e x2))
      (opt_append (Lf⇩_e e n y1) (e y2)) "
      using a b c by (simp add:Lf⇩_e_def x_def y_def)
    hence "opt_comp (Lf⇩_e e n x1)  (Lf⇩_e e n y1)"
      using opt_comp_append opt_comp_append_2 by blast
    hence e:"x1 = y1"
      using is_encodingD[OF Suc] by blast
    hence "opt_comp (e x2) (e y2)"
      using opt_comp_append_3 d by simp
    hence "x2 = y2"
      using is_encodingD[OF assms] by blast
    thus "x = y" using e x_def y_def by simp
  qed
qed

lemma fixed_list_bit_count:
  "bit_count (Lf⇩_e e n xs) =
    (if length xs = n then (∑x ← xs. (bit_count (e x))) else ∞)"
proof (induction n arbitrary: xs)
  case 0
  then show ?case by (simp add:Lf⇩_e_def)
next
  case (Suc n)
  show ?case
  proof (cases "length xs = Suc n")
    case True
    then obtain x1 x2 where x_def: "xs = x1@[x2]" "length x1 = n"
      by (metis length_append_singleton lessI nat.inject order.refl
          take_all take_hd_drop)
    have "bit_count (Lf⇩_e e n x1) = (∑x←x1. bit_count (e x))"
      using x_def(2) Suc by simp
    then show ?thesis by (simp add:Lf⇩_e_def x_def bit_count_append)
  next
    case False
    then show ?thesis by (simp add:Lf⇩_e_def)
  qed
qed

definition L⇩_e
  where "L⇩_e e xs = (Nu⇩_e ⋈⇩_e (λn. Lf⇩_e e n)) (length xs, xs)"

text ‹@{term "L⇩_e e"} is an encoding for arbitrary length lists, where the elements are encoding using the
encoder @{term "e"}.›

lemma list_encoding:
  assumes "is_encoding e"
  shows "is_encoding (L⇩_e e)"
proof -
  have "inj (λxs. (length xs, xs))"
    by (simp add: inj_on_def)

  hence "is_encoding (λxs. L⇩_e e xs)"
    using assms unfolding L⇩_e_def
    by (intro encoding_compose_2[where g=" (λx. (length x, x))"]
        dependent_encoding unary_encoding fixed_list_encoding) auto
  thus ?thesis by simp
qed

lemma sum_list_triv_ereal:
  fixes a :: ereal
  shows "sum_list (map (λ_. a) xs) = length xs * a"
  apply (cases a, simp add:sum_list_triv)
  by (induction xs, simp, simp)+

lemma list_bit_count:
  "bit_count (L⇩_e e xs) = (∑x ← xs. bit_count (e x) + 1) + 1"
proof -
  have "bit_count (L⇩_e e xs) =
    ereal (1 + real (length xs)) + (∑x←xs. bit_count (e x))"
    by (simp add: L⇩_e_def dependent_bit_count fixed_list_bit_count unary_nat_bit_count)
  also have "... = (∑x←xs. bit_count (e x)) +  (∑x ← xs. 1) + 1"
    by (simp add:ac_simps group_cancel.add1 sum_list_triv_ereal)
  also have "... = (∑x ← xs. bit_count (e x) + 1) + 1"
    by (simp add:sum_list_addf)
  finally show ?thesis by simp
qed

section ‹Functions›

definition encode_fun :: "'a list ⇒ 'b encoding ⇒ ('a ⇒ 'b) encoding"
  (infixr ‹→⇩_e› 65)  where
  "encode_fun xs e f =
    (if f ∈ extensional (set xs)
      then (Lf⇩_e e (length xs) (map f xs))
      else None)"

text ‹@{term "xs →⇩_e e"} is an encoding for functions whose domain is @{term "set xs"},
where the values are encoding using the encoder @{term "e"}.›

lemma fun_encoding:
  assumes "is_encoding e"
  shows "is_encoding (xs →⇩_e e)"
proof -
  have a:"inj_on (λx. map x xs) {x. x ∈ extensional (set xs)}"
    by (rule inj_onI) (simp add: extensionalityI)
  have "is_encoding (λx. (xs →⇩_e e) x)"
    unfolding encode_fun_def
    by (intro encoding_compose[where f="Lf⇩_e e (length xs)"]
        fixed_list_encoding assms a)
  thus ?thesis by simp
qed

lemma fun_bit_count:
  "bit_count ((xs →⇩_e e) f) =
    (if f ∈ extensional (set xs) then (∑x ← xs. bit_count (e (f x))) else ∞)"
  by (simp add:encode_fun_def fixed_list_bit_count comp_def)

lemma fun_bit_count_est:
  assumes "f ∈ extensional (set xs)"
  assumes "⋀x. x ∈ set xs ⟹ bit_count (e (f x)) ≤ a"
  shows "bit_count  ((xs →⇩_e e) f)  ≤ ereal (real (length xs)) * a"
proof -
  have "bit_count  ((xs →⇩_e e) f) = (∑x ← xs. bit_count (e (f x)))"
    using assms(1) by (simp add:fun_bit_count)
  also have "... ≤ (∑x ← xs. a)"
    by (intro sum_list_mono assms(2), simp)
  also have "... =  ereal (real (length xs)) * a"
    by (simp add:sum_list_triv_ereal)
  finally show ?thesis by simp
qed

section ‹Finite Sets›

definition S⇩_e :: "'a encoding ⇒ 'a set encoding" where
  "S⇩_e e S =
    (if finite S ∧ S ⊆ dom e
      then (L⇩_e e (linorder.sorted_key_list_of_set (≤) (the ∘ e) S))
      else None)"

text ‹@{term "S⇩_e e"} is an encoding for finite sets whose elements are encoded using the
  encoder @{term "e"}.›

lemma set_encoding:
  assumes "is_encoding e"
  shows "is_encoding (S⇩_e e)"
proof -
  have a:"inj_on (the ∘ e) (dom e)"
    using inj_on_def
    by (intro comp_inj_on encoding_imp_inj assms, fastforce)

  interpret folding_insort_key "(≤)" "(<)" "(dom e)" "(the ∘ e)"
    using a by (unfold_locales) auto
  have "is_encoding (λS. S⇩_e e S)"
    unfolding S⇩_e_def using sorted_key_list_of_set_inject
    by (intro encoding_compose[where f="L⇩_e e"] list_encoding assms(1) inj_onI, simp)
  thus ?thesis by simp
qed

lemma set_bit_count:
  assumes "is_encoding e"
  shows "bit_count (S⇩_e e S) = (if finite S then (∑x ∈ S. bit_count (e x)+1)+1 else ∞)"
proof (cases "finite S")
  case f:True
  have "bit_count (S⇩_e e S) = (∑x∈S. bit_count (e x)+1)+1"
  proof (cases "S ⊆ dom e")
    case True

    have a:"inj_on (the ∘ e) (dom e)"
      using inj_on_def by (intro comp_inj_on encoding_imp_inj[OF assms], fastforce)

    interpret folding_insort_key "(≤)" "(<)" "(dom e)" "(the ∘ e)"
      using a by (unfold_locales) auto

    have b:"distinct (linorder.sorted_key_list_of_set (≤) (the ∘ e) S)"
      (is "distinct ?l") using distinct_sorted_key_list_of_set True
        distinct_if_distinct_map by auto

    have "bit_count (S⇩_e e S) = (∑x←?l. bit_count (e x) + 1) + 1"
      using f True by (simp add:S⇩_e_def list_bit_count)
    also have "... = (∑x∈S. bit_count (e x)+1)+1"
      by (simp add: sum_list_distinct_conv_sum_set[OF b]
          set_sorted_key_list_of_set[OF True f])
    finally show ?thesis by simp
  next
    case False
    hence "∃i∈S. e i = None" by force
    hence "∃i∈S. bit_count (e i) = ∞" by force
    hence "(∑x∈S. bit_count (e x) + 1) = ∞"
      by (simp add:sum_Pinfty f)
    then show ?thesis using False by (simp add:S⇩_e_def)
  qed
  thus ?thesis using f by simp
next
  case False
  then show ?thesis by (simp add:S⇩_e_def)
qed

lemma sum_triv_ereal:
  fixes a :: ereal
  assumes "finite S"
  shows "(∑_ ∈ S. a) = card S * a"
proof (cases a)
  case (real r)
  then show ?thesis by simp
next
  case PInf
  show ?thesis using assms PInf
    by (induction S rule:finite_induct, auto)
next
  case MInf
  show ?thesis using assms MInf
    by (induction S rule:finite_induct, auto)
qed

lemma set_bit_count_est:
  assumes "is_encoding f"
  assumes "finite S"
  assumes "card S ≤ m"
  assumes "0 ≤ a"
  assumes "⋀x. x ∈ S ⟹ bit_count (f x) ≤ a"
  shows "bit_count (S⇩_e f S) ≤ ereal (real m) * (a+1) + 1"
proof -
  have "bit_count (S⇩_e f S) = (∑x∈S. bit_count (f x) + 1) + 1"
    using assms by (simp add:set_bit_count)
  also have "... ≤ (∑x∈S. a + 1) + 1"
    using assms by (intro sum_mono add_mono) auto
  also have "... = ereal (real (card S)) * (a + 1) + 1"
    by (simp add:sum_triv_ereal[OF assms(2)])
  also have "... ≤ ereal (real m) * (a+1) + 1"
    using assms(3,4) by (intro add_mono ereal_mult_right_mono) auto
  finally show ?thesis by simp
qed

section ‹Floating point numbers\label{sec:float_enc}›
definition F⇩_e where "F⇩_e f = (I⇩_e ×⇩_e I⇩_e) (mantissa f,exponent f)"

lemma float_encoding:
  "is_encoding F⇩_e"
proof -
  have "inj (λx. (mantissa x, exponent x))" (is "inj ?g")
  proof (rule injI)
    fix x y
    assume "(mantissa x, exponent x) = (mantissa y, exponent y)"
    hence "real_of_float x = real_of_float y"
      by (simp add:mantissa_exponent)
    thus "x = y"
      by (metis real_of_float_inverse)
  qed
  thus "is_encoding (λf. F⇩_e f)"
    unfolding F⇩_e_def
    by (intro encoding_compose_2[where g="?g"]
        dependent_encoding int_encoding) auto
qed

lemma suc_n_le_2_pow_n:
  fixes n :: nat
  shows "n + 1 ≤ 2 ^ n"
  by (induction n, simp, simp)

lemma float_bit_count_1:
  "bit_count (F⇩_e f) ≤ 6 + 2 * (log 2 (¦mantissa f¦ + 1) +
    log 2 (¦exponent f¦ + 1))" (is "?lhs ≤ ?rhs")
proof -
  have "?lhs = bit_count (I⇩_e (mantissa f)) +
    bit_count (I⇩_e (exponent f))"
    by (simp add:F⇩_e_def dependent_bit_count)
  also have "... ≤
    ereal (2 * log 2 (real_of_int (¦mantissa f¦ + 1)) + 3) +
    ereal (2 * log 2 (real_of_int (¦exponent f¦ + 1)) + 3)"
    by (intro int_bit_count_est_1 add_mono) auto
  also have "... = ?rhs"
    by simp
  finally show ?thesis by simp
qed

text ‹The following establishes an estimate for the bit count of a floating
point number in non-normalized representation:›

lemma float_bit_count_2:
  fixes m :: int
  fixes e :: int
  defines "f ≡ float_of (m * 2 powr e)"
  shows "bit_count (F⇩_e f) ≤
    6 + 2 * (log 2 (¦m¦ + 2) + log 2 (¦e¦ + 1))"
proof -
  have b:" (r + 1) * int i ≤ r * (2 ^ i - 1) + 1"
    if b_assms: "r ≥ 1" for r :: int and i :: nat
  proof (cases "i > 0")
    case True
    have "(r + 1) * int i = r * i + 2 * int ((i-1)+1) - i"
      using True by (simp add:algebra_simps)
    also have "... ≤ r * i + int (2^1) * int (2^(i-1)) - i"
      using b_assms
      by (intro add_mono diff_mono mult_mono of_nat_mono suc_n_le_2_pow_n)
        simp_all
    also have "... = r * i + 2^i - i"
      using True
      by (subst of_nat_mult[symmetric], subst power_add[symmetric])
        simp
    also have "... = r *i + 1 * (2 ^ i - int i - 1) + 1"  by simp
    also have "... ≤ r *i + r * (2 ^ i - int i - 1) + 1"
      using b_assms
      by (intro add_mono mult_mono, simp_all)
    also have "... = r * (2 ^ i - 1) + 1"
      by (simp add:algebra_simps)
    finally show ?thesis by simp
  next
    case False
    hence "i = 0" by simp
    then show ?thesis by simp
  qed

  have a:"log 2 (¦mantissa f¦ + 1) + log 2 (¦exponent f¦ + 1) ≤
    log 2 (¦m¦+2) + log 2 (¦e¦+1)"
  proof (cases "f=0")
    case True then show ?thesis by simp
  next
    case False
    moreover have "f = Float m e"
      by (simp add:f_def Float.abs_eq)
    ultimately obtain i :: nat
      where m_def: "m = mantissa f * 2 ^ i"
        and e_def: "e = exponent f - i"
      using denormalize_shift by blast

    have mantissa_ge_1: "1 ≤ ¦mantissa f¦"
      using False mantissa_noteq_0 by fastforce

    have "(¦mantissa f¦ + 1) * (¦exponent f¦ + 1) =
      (¦mantissa f¦ + 1) * (¦e+i¦+1)"
      by (simp add:e_def)
    also have "...  ≤ (¦mantissa f¦ + 1) * ((¦e¦+¦i¦)+1)"
      by (intro mult_mono add_mono, simp_all)
    also have "... = (¦mantissa f¦ + 1) * ((¦e¦+1)+i)"
      by simp
    also have "... = (¦mantissa f¦ + 1) * (¦e¦+1) + (¦mantissa f¦+1)*i"
      by (simp add:algebra_simps)
    also have "... ≤ (¦mantissa f¦ + 1) * (¦e¦+1) + (¦mantissa f¦ * (2^i-1)+1)"
      by (intro add_mono b mantissa_ge_1, simp)
    also have "... = (¦mantissa f¦ + 1) * (¦e¦+1) + (¦mantissa f¦ * (2^i-1)+1)*(1)"
      by simp
    also have
      "... ≤ (¦mantissa f¦ + 1) * (¦e¦+1) + (¦mantissa f¦*  (2^i-1)+1)*(¦e¦+1)"
      by (intro add_mono mult_left_mono, simp_all)
    also have "... = ((¦mantissa f¦ + 1)+(¦mantissa f¦*  (2^i-1)+1))*(¦e¦+1)"
      by (simp add:algebra_simps)
    also have "... = (¦mantissa f¦*2^i+2)*(¦e¦+1)"
      by (simp add:algebra_simps)
    also have "... = (¦m¦+2)*(¦e¦+1)"
      by (simp add:m_def abs_mult)
    finally have "(¦mantissa f¦ + 1) * (¦exponent f¦ + 1) ≤ (¦m¦+2)*(¦e¦+1)"
      by simp

    hence "(¦real_of_int (mantissa f)¦ + 1) * (¦of_int (exponent f)¦ + 1) ≤
      (¦of_int m¦+2)*(¦of_int e¦+1)"
      by (simp flip:of_int_abs) (metis (mono_tags, opaque_lifting) numeral_One
          of_int_add of_int_le_iff of_int_mult of_int_numeral)
    then show ?thesis by (simp flip: log_mult_pos)
  qed
  have "bit_count (F⇩_e f) ≤
    6 + 2 * (log 2 (¦mantissa f¦ + 1) + log 2 (¦exponent f¦ + 1))"
    using float_bit_count_1 by simp
  also have "... ≤ 6 + 2 * (log 2 (¦m¦ + 2) + log 2 (¦e¦ + 1))"
    using a by simp
  finally show ?thesis by simp
qed

lemma float_bit_count_zero:
  "bit_count (F⇩_e (float_of 0)) = 2"
  by (simp add:F⇩_e_def dependent_bit_count int_bit_count
      zero_float.abs_eq[symmetric])



end