Theory MDP_reward

(* Author: Maximilian Schäffeler *)

theory MDP_reward
  imports
    Bounded_Functions
    MDP_reward_Util
    Blinfun_Util
    MDP_disc
begin

section ‹Markov Decision Processes with Rewards›

locale MDP_reward = discrete_MDP A K
  for
    A and 
    K :: "'s ::countable × 'a ::countable  's pmf" +
  fixes
    r :: "('s × 'a)  real" and
    l :: real
  assumes
    zero_le_disc [simp]: "0  l" and
    r_bounded: "bounded (range r)"
begin

text ‹
This extension to the basic MDPs is formalized with another locale.
It assumes the existence of a reward function @{term r} which takes a state-action pair to a real 
number. We assume that the function is bounded @{prop r_bounded}.

Furthermore, we fix a discounting factor @{term l}, where @{term "0  l  l < 1"}.
›

subsection ‹Util›
subsubsection ‹Basic Properties of rewards›
lemma r_bfun: "r  bfun"
  using r_bounded
  by auto

lemma r_bounded': "bounded (r ` X)"
  by (auto intro: r_bounded bounded_subset)

definition "rM = (sa. ¦r sa¦)"

lemma abs_r_le_rM: "¦r sa¦  rM"
  using bounded_norm_le_SUP_norm r_bounded rM_def by fastforce

lemma abs_rM_eq_rM [simp]: "¦rM¦ = rM"
  using abs_r_le_rM by fastforce

lemma rM_nonneg: "0  rM"
  using abs_rM_eq_rM by linarith

lemma measurable_r_nth [measurable]: "(λt. r (t !! i))  borel_measurable S"
  by measurable

lemma integrable_r_nth [simp]: "integrable (𝒯 p s) (λt. r (t !! i))"
  by (fastforce simp: bounded_iff intro: abs_r_le_rM)

lemma expectation_abs_r_le: "measure_pmf.expectation d (λa. ¦r (s, a)¦)  rM"
  using abs_r_le_rM
  by (fastforce intro!: measure_pmf.integral_le_const measure_pmf.integrable_const_bound)

lemma abs_exp_r_le: "¦measure_pmf.expectation d r¦  rM"
  using abs_r_le_rM
  by (fastforce intro!: measure_pmf.integral_le_const order.trans[OF integral_abs_bound] measure_pmf.integrable_const_bound)

subsubsection ‹Infinite disounted sums›
lemma abs_disc_eq[simp]: "¦l ^ i * x¦ = l ^ i * ¦x¦"
  by (auto simp: abs_mult)

lemma norm_l_pow_eq[simp]: "norm (l^t *R F) = l^t * norm F"
  by auto

subsection ‹Total Reward for Single Traces›

abbreviation "ν_trace_fin t N  i < N. l ^ i * r (t !! i)"
abbreviation "ν_trace t  i. l ^ i * r (t !! i)"

lemma abs_ν_trace_fin_le: "¦ν_trace_fin t N¦  (i < N. l^i * rM)"
  by (auto intro!: sum_mono order.trans[OF sum_abs] mult_left_mono abs_r_le_rM)

lemma measurable_suminf_reward[measurable]: "ν_trace  borel_measurable S"
  by measurable

lemma integrable_ν_trace_fin: "integrable (𝒯 p s) (λt. ν_trace_fin t N)"
  by (fastforce simp: bounded_iff intro: abs_ν_trace_fin_le)


context 
  fixes p :: "('s, 'a) pol"
begin

subsection ‹Expected Finite-Horizon Discounted Reward›
definition "ν_fin n s = t. ν_trace_fin t n 𝒯 p s"

lemma abs_ν_fin_le: "¦ν_fin N s¦  (i<N. l^i * rM)"
  unfolding ν_fin_def
  using abs_ν_trace_fin_le
  by (fastforce intro!: prob_space.integral_le_const order_trans[OF integral_abs_bound])

lemma ν_fin_bfun: "(λs. ν_fin N s)  bfun"
  by (auto intro!: abs_ν_fin_le)

lift_definition νb_fin :: "nat  's b real" is ν_fin
  using ν_fin_bfun .

lemma ν_fin_Suc[simp]: "ν_fin (Suc n) s = ν_fin n s + l ^ n * t.  r (t !! n) 𝒯 p s"
  by (simp add: ν_fin_def)

lemma ν_fin_zero[simp]: "ν_fin 0 s = 0"
  by (simp add: ν_fin_def)

lemma ν_fin_eq_Pn: "ν_fin n s = (i<n. l^i * measure_pmf.expectation (Pn' p s i) r)"
  by (induction n) (auto simp: Pn'_eq_𝒯 integral_distr)
end

subsection ‹Expected Total Discounted Reward›

definition "ν p s = lim (λn. ν_fin p n s)"

lemmas ν_eq_lim = ν_def

lemma ν_eq_Pn: "ν p s = (i. l^i * measure_pmf.expectation (Pn' p s i) r)"
  by (simp add: ν_fin_eq_Pn ν_eq_lim suminf_eq_lim)


subsection ‹Reward of a Decision Rule›
context 
  fixes d :: "('s, 'a) dec"
begin
abbreviation "r_dec s  a. r (s, a) d s"

lemma abs_r_dec_le: "¦r_dec s¦  rM"
  using expectation_abs_r_le integral_abs_bound order_trans by fast

lemma r_dec_eq_r_K0: "r_dec s = measure_pmf.expectation (K0' d s) r"
  by (simp add: K0'_def)

lemma r_dec_bfun: "r_dec  bfun"
  using abs_r_dec_le by (auto intro!: bfun_normI)

lift_definition r_decb :: "'s b real" is "r_dec"
  using r_dec_bfun .

declare r_decb.rep_eq[simp] bfun.Bfun_inverse[simp]

lemma norm_r_dec_le: "norm r_decb  rM"
  by (simp add: abs_r_dec_le norm_bound)
end

lemma r_dec_det [simp]: "r_dec (mk_dec_det d) s = r (s, d s)"
  unfolding mk_dec_det_def by auto

subsection ‹Transition Probability Matrix for MDPs›

context
  fixes p :: "nat  ('s, 'a) dec"
begin
definition "𝒫X n = push_exp (λs. Xn' (mk_markovian p) s n)"

lemma 𝒫X_0[simp]: "𝒫X 0 = id"
  by (simp add: 𝒫X_def)

lemma 𝒫X_bounded_linear[simp]: "bounded_linear (𝒫X t)"
  unfolding 𝒫X_def by simp

lemma norm_𝒫X [simp]: "onorm (𝒫X t) = 1"
  unfolding 𝒫X_def by simp

lemma norm_𝒫X_apply[simp]: "norm (𝒫X n x)  norm x"
  using onorm[OF 𝒫X_bounded_linear] by simp

lemma 𝒫X_bound_r: "norm (𝒫X t (r_decb (p t)))  rM"
  using norm_𝒫X_apply norm_r_dec_le order.trans by blast

lemma 𝒫X_bounded_r: "bounded (range (λt. (𝒫X t (r_decb (p t)))))"
  using 𝒫X_bound_r by (auto intro!: boundedI)

end

lemma ν_fin_elem: "ν_fin (mk_markovian p) n s = (i<n. l^i * 𝒫X p i (r_decb (p i)) s)"
  unfolding 𝒫X_def ν_fin_eq_Pn Pn'_markovian_eq_Xn'_bind measure_pmf_bind
  using measure_pmf_in_subprob_algebra abs_r_le_rM
  by (subst integral_bind) (auto simp: r_dec_eq_r_K0)

lemma νb_fin_eq_𝒫X: "νb_fin (mk_markovian p) n = (i<n. l^i *R 𝒫X p i (r_decb (p i)))"
  by (auto simp: ν_fin_elem sum_apply_bfun νb_fin.rep_eq)

lemma ν_fin_eq_𝒫X: "ν_fin (mk_markovian p) n = (i<n. l^i *R 𝒫X p i (r_decb (p i)))"
  by (metis νb_fin.rep_eq νb_fin_eq_𝒫X)


text @{term "𝒫1 d v"} defines for each state the expected value of @{term v} 
after taking a single step in the MDP according to the decision rule @{term d}.  
›

context
  fixes d :: "('s, 'a) dec"
begin
lift_definition 𝒫1 :: "('s b real) L ('s b real)" is "push_exp (K_st d)"
  using push_exp_bounded_linear .

lemma 𝒫1_bfun_one [simp]:"𝒫1 1 = 1"
  by (auto simp: 𝒫1.rep_eq)

lemma 𝒫1_pow_bfun_one [simp]: "(𝒫1^^t) 1 = 1"
  by (induction t) auto

lemma 𝒫1_pow: "blinfun_apply (𝒫1 ^^ n) = blinfun_apply 𝒫1 ^^ n"
  by (induction n) auto

lemma norm_𝒫1 [simp]: "norm 𝒫1 = 1"
  by (simp add: norm_blinfun.rep_eq 𝒫1.rep_eq)
end

lemma 𝒫X_Suc: "𝒫X p (Suc n) v = 𝒫1 (p 0) ((𝒫X (λn. p (Suc n)) n) v)"
  unfolding 𝒫X_def 𝒫1.rep_eq
  by (fastforce intro!: abs_le_norm_bfun integral_bind[where K = "count_space UNIV"]
      simp: measure_pmf_in_subprob_algebra measure_pmf_bind Suc_Xn'_markovian)

lemma 𝒫X_Suc': "𝒫X p (Suc n) v = 𝒫X p n (𝒫1 (p n) v)"
proof (induction n arbitrary: p)
  case 0
  thus ?case
    by (simp add: 𝒫X_Suc)
next
  case (Suc n)
  thus ?case 
    by (metis 𝒫X_Suc)
qed

lemma 𝒫X_const: "𝒫X (λ_. d) n = 𝒫1 d ^^ n"
  by (induction n) (auto simp add: 𝒫1_pow 𝒫X_Suc)

lemma 𝒫X_sconst: "𝒫X (λ_. p) n = 𝒫1 p ^^n"
  using 𝒫X_const.

lemma norm_P_n[simp]: "onorm (𝒫1 d ^^ n) = 1"
  using norm_𝒫X[of "λ_. d"] by (auto simp: 𝒫X_sconst)

lemma norm_𝒫1_pow [simp]: "norm (𝒫1 d ^^ t) = 1"
  by (simp add: norm_blinfun.rep_eq)

lemma 𝒫X_Suc_n_elem: "𝒫X p n (𝒫1 (p n) v) = 𝒫X p (Suc n) v"
  using 𝒫X_Suc' 𝒫1.rep_eq by auto

lemma 𝒫1_eq_𝒫X_one: "blinfun_apply (𝒫1 (p 0)) = 𝒫X p 1"
  by (auto simp: 𝒫X_Suc' 𝒫1.rep_eq)


lemma 𝒫1_pos: "0  u  0  𝒫1 d u"
  by (auto simp: 𝒫1.rep_eq less_eq_bfun_def)

lemma 𝒫1_nonneg: "nonneg_blinfun (𝒫1 d)"
  by (simp add: 𝒫1_pos nonneg_blinfun_def)

lemma 𝒫1_n_pos: "0  u  0  (𝒫1 d ^^ n) u"
  by (induction n) (auto simp: 𝒫1.rep_eq less_eq_bfun_def)

lemma 𝒫1_n_nonneg: "nonneg_blinfun (𝒫1 d ^^ n)"
  by (simp add: 𝒫1_n_pos nonneg_blinfun_def)

lemma 𝒫1_n_disc_pos: "0  u  0  (l^n *R 𝒫1 d ^^n) u"
  by (auto simp: 𝒫1_n_pos scaleR_nonneg_nonneg blinfun.scaleR_left)

lemma 𝒫1_sum_pos: "0  u  0  (tn. l^t *R (𝒫1 d ^^ t)) u"
  using 𝒫1_n_pos 𝒫1_pos
  by (induction n) (auto simp: blinfun.add_left blinfun.scaleR_left scaleR_nonneg_nonneg)

lemma 𝒫1_sum_ge: 
  assumes "0  u" 
  shows "u  (tn. l^t *R 𝒫1 d ^^t) u"
  using 𝒫1_n_disc_pos[OF assms, of "Suc _"]
  by (induction n) (auto intro: add_increasing2 simp add: blinfun.add_left)


subsection ‹The Bellman Operator›
definition "L d v  r_decb d + l *R 𝒫1 d v"

lemma norm_L_le: "norm (L d v)  rM + l * norm v"
  using norm_blinfun[of "𝒫1 d"] norm_𝒫1 norm_r_dec_le
  by (auto intro!: norm_add_rule_thm mult_left_mono simp: L_def)

lemma abs_L_le: "¦L d v s¦  rM + l * norm v"
  using order.trans[OF norm_le_norm_bfun norm_L_le] by auto

subsubsection ‹Bellman Operator for Single Actions›
abbreviation "La a v s  r (s, a) + l * measure_pmf.expectation (K (s,a)) v"

lemma La_le:
  fixes v :: "'s b real"
  shows "¦La a v s¦  rM + l * norm v"
  using abs_r_le_rM
  by (fastforce intro: order_trans[OF abs_triangle_ineq] order_trans[OF integral_abs_bound]  
      add_mono mult_mono measure_pmf.integral_le_const abs_le_norm_bfun 
      simp: abs_mult)

lemma La_bounded:
  "bounded (range (λa. La a (apply_bfun v) s))"
  using La_le by (auto intro!: boundedI)

lemma La_int: 
  fixes d :: "'a pmf" and v :: "'s b real"
  shows "(a. La a v s d) = (a. r (s, a) d) + l * a. s'. v s' K (s, a) d"
proof (subst Bochner_Integration.integral_add)
  show "integrable d (λa. r (s, a))"
    using abs_r_le_rM by (fastforce intro!: bounded_integrable simp: bounded_iff)
  show "integrable d (λa. l * s'. v s' K (s, a))"
    by (intro bounded_integrable) 
      (auto intro!: mult_mono order_trans[OF integral_abs_bound] boundedI[of _ "l * norm v"]
        measure_pmf.integral_le_const simp: abs_le_norm_bfun abs_mult)
qed auto

lemma L_eq_La: "L d v s = measure_pmf.expectation (d s) (λa. La a v s)"
  unfolding La_int L_def K_st_def 𝒫1.rep_eq
  by (auto simp: measure_pmf_bind integral_measure_pmf_bind[where B = "norm v"] abs_le_norm_bfun)

lemma L_eq_La_det: "L (mk_dec_det d) v s = La (d s) v s"
  by (auto simp: L_eq_La mk_dec_det_def)

lemma La_eq_L: "measure_pmf.expectation p (λa. La a (apply_bfun v) s) = 
  L (λt. if t = s then p else return_pmf (SOME a. a  A t)) v s"
  unfolding L_eq_La by auto

lemma L_le: "L d v s  rM + l * norm v"
  unfolding L_def
  using norm_𝒫1 norm_blinfun[of "(𝒫1 d)"] abs_r_dec_le
  by (fastforce intro: order_trans[OF le_norm_bfun] add_mono mult_left_mono dest: abs_le_D1)

lemma La_le': "La a (apply_bfun v) s  rM + l * norm v"
  using La_le abs_le_D1 by blast


subsection ‹Optimality Equations›

definition " (v :: 's b real) s = (d  DR. L d v s)"

lemma ℒ_bfun: " v  bfun"
  unfolding ℒ_def using abs_L_le ex_dec by (fastforce intro!: cSup_abs_le bfun_normI)

lift_definition b :: "('s b real)  's b real" is 
  using ℒ_bfun .

lemma L_bounded[simp, intro]: "bounded (range (λp. L p v s))"
  using abs_L_le by (auto intro!: boundedI)

lemma L_bounded'[simp, intro]: "bounded ((λp. L p v s) ` X)"
  by (auto intro: bounded_subset)

lemma L_bdd_above[simp, intro]: "bdd_above ((λp. L p v s) ` X)"
  by (auto intro: bounded_imp_bdd_above)

lemma L_le_ℒb: "is_dec d  L d v  b v"
  by (fastforce simp: b.rep_eq ℒ_def intro!: cSUP_upper)

subsubsection ‹Equivalences involving @{const b}

lemma SUP_step_MR_eq:
  " v s = (pa  {pa. set_pmf pa  A s}. (a. La a v s measure_pmf pa))"
  unfolding ℒ_def
proof (intro antisym)
  show "(dDR. L d v s)  (pa  {pa. set_pmf pa  A s}. a. La a v s measure_pmf pa)"
  proof (rule cSUP_mono)
    show "DR  {}"
      using DR_ne .
  next show "bdd_above ((λpa. a. La a v s measure_pmf pa) ` {pa. set_pmf pa  A s})"
      using La_bounded La_le
      by (auto intro!: order_trans[OF integral_abs_bound] 
          bounded_imp_bdd_above boundedI[where B = "rM + l * norm v"] 
          measure_pmf.integral_le_const bounded_integrable)
  next show "m{pa. set_pmf pa  A s}. L n v s  a. La a v s measure_pmf m" if "n  DR" for n
      using that
      by (fastforce simp: L_eq_La La_int is_dec_def)
  qed
next
  have aux: "{pa. set_pmf pa  A s}  {}"
    using DR_ne is_dec_def by auto
  show "(pa{pa. set_pmf pa  A s}. a. La a v s measure_pmf pa)  (dDR. L d v s)"
  proof (intro cSUP_least[OF aux] cSUP_upper2)
    fix n 
    assume h: "n  {pa. set_pmf pa  A s}"
    let ?p = "(λs'. if s = s' then n else SOME a. set_pmf a  A s')"
    have aux: "a. set_pmf a  A sa" for sa
      using ex_dec is_dec_def by blast
    show "?p  DR"
      unfolding is_dec_def using h someI_ex[OF aux] by auto
    thus "(a. La a v s n)  L ?p v s"
      by (auto simp: L_eq_La)
    show "bdd_above ((λd. L d v s) ` DR)"
      by (fastforce intro!: bounded_imp_bdd_above simp: bounded_def)
  next
  qed
qed

lemma b_eq_SUP_La: "b v s = (p  {p. set_pmf p  A s}. a. La a v s measure_pmf p)"
  using SUP_step_MR_eq b.rep_eq by presburger

lemma SUP_step_det_eq: "(d  DD. L (mk_dec_det d) v s) = (a  A s. La a v s)"
proof (intro antisym cSUP_mono)
  show "bdd_above ((λa. La a v s) ` A s)"
    using La_bounded by (fastforce intro!: bounded_imp_bdd_above simp: bounded_def)
  show "bdd_above ((λd. L (mk_dec_det d) v s) ` DD)"
    by (auto intro!: bounded_imp_bdd_above boundedI abs_L_le)
  show "mA s. L (mk_dec_det n) v s  La m v s" if "n  DD" for n
    using that is_dec_det_def by (auto simp: L_eq_La_det intro: bexI[of _ "n s"])
  show "mDD. La n v s  L (mk_dec_det m) v s" if "n  A s" for n
    using that A_ne
    by (fastforce simp: L_eq_La_det is_dec_det_def some_in_eq
        intro!: bexI[of _ "λs'. if s = s' then _ else SOME a. a  A s'"])
qed (auto simp: A_ne)

lemma integrable_La: "integrable (measure_pmf x) (λa. La a (apply_bfun v) s)"
proof (intro Bochner_Integration.integrable_add integrable_mult_right)
  show "integrable (measure_pmf x) (λx. r (s, x))"
    using abs_r_le_rM 
    by (auto intro: measure_pmf.integrable_const_bound[of _ "rM"])
next
  show "integrable (measure_pmf x) (λx. measure_pmf.expectation (K (s, x)) v)"
    by (auto intro!: bounded_integrable boundedI order.trans[OF integral_abs_bound] 
        measure_pmf.integral_le_const abs_le_norm_bfun)
qed

lemma SUP_La_eq_det:
  fixes v :: "'s b real"
  shows "(p{p. set_pmf p  A s}. a. La a v s measure_pmf p) = (aA s. La a v s)"
proof (intro antisym)
  show "(pa{pa. set_pmf pa  A s}. measure_pmf.expectation pa (λa. La a v s))
     (aA s. La a v s)"
    using ex_dec is_dec_def integrable_La A_ne La_bounded
    by (fastforce intro: bounded_range_subset intro!: cSUP_least lemma_4_3_1)
  show "(aA s. La a v s)  (p{p. set_pmf p  A s}. a. La a v s measure_pmf p)"
    unfolding SUP_step_MR_eq[symmetric] SUP_step_det_eq[symmetric] ℒ_def
    using ex_dec_det by (fastforce intro!: cSUP_mono)
qed

lemma ℒ_eq_SUP_det: " v s = (d  DD. L (mk_dec_det d) v s)"
  using SUP_step_MR_eq SUP_step_det_eq SUP_La_eq_det by auto

lemma b_eq_SUP_det: "b v s = (d  DD. L (mk_dec_det d) v s)"
  using ℒ_eq_SUP_det unfolding b.rep_eq by auto


subsection ‹Monotonicity›

lemma 𝒫X_mono[intro]: "a  b  𝒫X p n a  𝒫X p n b"
  by (fastforce simp: 𝒫X_def intro: integral_mono)

lemma 𝒫1_mono[intro]: "a  b  𝒫1 p a  𝒫1 p b"
  using 𝒫1_nonneg by auto

lemma L_mono[intro]: "u  v  L d u  L d v"
  unfolding L_def by (auto intro: scaleR_left_mono)

lemma b_mono[intro]: "u  v  b u  b v"
  using  ex_dec L_mono[of u v] 
  by (fastforce intro!: cSUP_mono simp: b.rep_eq ℒ_def)

lemma step_mono:
  assumes "b v  v" "d  DR"
  shows "L d v  v"
  using assms L_le_ℒb order.trans by blast

lemma step_mono_elem_det:
  assumes "v  b v" "e > 0"
  shows "dDD. v  L (mk_dec_det d) v + e *R 1"
proof -
  have "v s  (aA s. La a v s)" for s
    using SUP_step_det_eq b_eq_SUP_det assms(1) by fastforce
  hence "aA s. v s - e < La a v s" for s
    using A_ne La_le'
    by (subst less_cSUP_iff[symmetric]) (fastforce simp: assms add_strict_increasing algebra_simps intro!: bdd_above.I2)+
  hence aux: "aA s. v s  La a v s + e" for s
    by (auto simp: diff_less_eq intro: less_imp_le)
  then obtain d where "is_dec_det d" "v s  L (mk_dec_det d) v s + e" for s
    by (metis L_eq_La_det is_dec_det_def)
  thus ?thesis
    by fastforce
qed

lemma step_mono_elem:
  assumes "v  b v" "e > 0"
  shows "dDR. v  L d v + e *R 1"
  using assms step_mono_elem_det by blast

lemma 𝒫X_L_le:
  assumes "b v  v" "p  ΠMR"
  shows "𝒫X p n (L (p n) v)  𝒫X p n v"
  using assms step_mono by auto

end

locale MDP_reward_disc = MDP_reward A K r l
  for
    A and 
    K :: "'s ::countable × 'a ::countable  's pmf" and
    r l +
  assumes
    disc_lt_one [simp]: "l < 1"
begin

definition "is_opt_act v s = is_arg_max (λa. La a v s) (λa. a  A s)"
abbreviation "opt_acts v s  {a. is_opt_act v s a}"

lemma summable_disc [intro, simp]: "summable (λi. l ^ i * x)"
  by (simp add: mult.commute)

lemma summable_r_disc[intro, simp]:
  "summable (λi. ¦l ^ i * r (sa i)¦)"
  "summable (λi. l ^ i * ¦r (sa i)¦)"
  "summable (λi. l ^ i * r (sa i))"
proof -
  show "summable (λi. ¦l ^ i * r (sa i)¦)"
    using abs_r_le_rM
    by (fastforce intro!: mult_left_mono summable_comparison_test'[OF summable_disc])
  thus "summable (λi. l ^ i * r (sa i))" "summable (λi. l ^ i * ¦r (sa i)¦)"
    by (auto intro: summable_rabs_cancel)
qed

lemma summable_norm_disc_I[intro]:
  assumes "summable (λt. (l^t * norm F))"
  shows "summable (λt. norm (l^t *R F))"
  using assms by auto

lemma summable_norm_disc_I'[intro]:
  assumes "summable (λt. (l^t * norm (F t)))"
  shows "summable (λt. norm (l^t *R F t))"
  using assms by auto

lemma summable_discI [intro]:
  assumes "bounded (range F)"
  shows "summable (λt. l^t * norm (F t))"
proof -
  obtain b where "norm (F x)  b" for x
    using assms by (auto simp: bounded_iff)
  thus ?thesis
    using Abel_lemma[of l 1 F b] by (auto simp: mult.commute)
qed

lemma summable_disc_reward [intro]:
  assumes "bounded (range (F :: nat  'b :: banach))"
  shows "summable (λt. l^t *R (F t))"
  using assms by (auto intro: summable_norm_cancel)

lemma summable_norm_bfun_disc: "summable (λt. l^t * norm (apply_bfun f t))"
  using norm_le_norm_bfun
  by (auto simp: mult.commute[of "l^_"] intro!: Abel_lemma[of _ 1 _ "norm f"])

lemma summable_bfun_disc [simp]: "summable (λt. l^t * (apply_bfun f t))"
proof -
  have "norm (l^t * apply_bfun f t) = l^t * norm (apply_bfun f t)" for t
    by (auto simp: abs_mult)
  hence "summable (λt. norm (l^t * (apply_bfun f t)))"
    by (auto simp only: abs_mult)
  thus ?thesis
    by (auto intro: summable_norm_cancel)
qed

lemma norm_bfun_disc_le: "norm f  B  (x. l^x * norm (apply_bfun f x))  (x. l^x * B)"
  by (fastforce intro!: suminf_le mult_left_mono norm_le_norm_bfun intro: order.trans)

lemma norm_bfun_disc_le': "norm f  B  (x. l^x * (apply_bfun f x))  (x. l^x * B)"
  by (auto simp: mult_left_mono intro!: suminf_le order.trans[OF _ norm_bfun_disc_le])

lemma sum_disc_lim_l: "(x. l^x * B) = B /(1-l)"
  by (simp add: suminf_mult2[symmetric] summable_geometric suminf_geometric[of l])

lemma sum_disc_bound: "(x. l^x * apply_bfun f x)  (norm f) /(1-l)"
  using norm_bfun_disc_le' sum_disc_lim  by auto

lemma sum_disc_bound':
  fixes f :: "nat  'b b real"
  assumes h: "n. norm (f n)  B"
  shows "norm (x. l^x *R f x)  B /(1-l)"
proof -
  have "norm (x. l^x *R f x)   (x. norm (l^x *R f x))"
    using h
    by (fastforce intro!: boundedI summable_norm)
  also have "  (x. l^x * B)"
    using h
    by (auto intro!: suminf_le boundedI simp: mult_mono')
  also have " = B /(1-l)"
    by (simp add: sum_disc_lim)
  finally show "norm (x. l^x *R f x)  B /(1-l)" .
qed


lemma abs_ν_trace_le: "¦ν_trace t¦  (i. l ^ i * rM)"
  by (auto intro!: abs_r_le_rM mult_left_mono order_trans[OF summable_rabs] suminf_le)

lemma integrable_ν_trace: "integrable (𝒯 p s) ν_trace"
  by (fastforce simp: bounded_iff intro: abs_ν_trace_le)

context 
  fixes p :: "('s, 'a) pol"
begin

lemma ν_eq_ν_trace: "ν p s = t. ν_trace t 𝒯 p s"
proof -
  have "(λn. ν_fin p n s)  t. ν_trace t 𝒯 p s"
    unfolding ν_fin_def
  proof(intro integral_dominated_convergence)
    show "AE x in 𝒯 p s. ν_trace_fin x  ν_trace x"
      using summable_LIMSEQ by blast
  next
    have "(i<N. l ^ i * rM)  (N. l ^ N * rM)" for N
      by (auto intro: sum_le_suminf simp: rM_nonneg)
    thus "AE x in 𝒯 p s. norm (ν_trace_fin x N)  (N. l ^ N * rM)" for N
      using order_trans[OF abs_ν_trace_fin_le] by fastforce
  qed auto
  thus ?thesis
    using ν_eq_lim limI by fastforce
qed

lemma abs_ν_le: "¦ν p s¦  (i. l^i * rM)"
  unfolding ν_eq_Pn
  using abs_exp_r_le
  by (fastforce intro!: order.trans[OF summable_rabs] suminf_le summable_comparison_test'[OF summable_disc] mult_left_mono)

lemma ν_le: "ν p s  (i. l^i * rM)"
  by (auto intro: abs_ν_le abs_le_D1)

(* 6.1.2 in Puterman *)
lemma ν_bfun: "ν p  bfun"
  by (auto intro!: abs_ν_le)

lift_definition νb :: "'s b real" is "ν p"
  using ν_bfun by blast

lemma norm_ν_le: "norm νb  rM / (1-l)"
  using abs_ν_le sum_disc_lim
  by (auto simp: νb.rep_eq norm_bfun_def' intro: cSUP_least)
end

lemma ν_as_markovian: "ν (mk_markovian (as_markovian p (return_pmf s))) s = ν p s"
  by (auto simp: ν_eq_Pn Pn_as_markovian_eq Pn'_def)

lemma νb_as_markovian: "νb (mk_markovian (as_markovian p (return_pmf s))) s = νb p s"
  using ν_as_markovian by (auto simp: νb.rep_eq)

subsection ‹Optimal Reward›

definition "ν_MD s  p  ΠMD. ν (mk_markovian_det p) s"
definition "ν_opt s  p  ΠHR. ν p s"

lemma ν_opt_bfun: "ν_opt  bfun"
  using abs_ν_le policies_ne 
  by (fastforce simp: ν_opt_def intro!: order_trans[OF cSup_abs_le] bfun_normI)

lift_definition νb_opt :: "'s b real" is ν_opt
  using ν_opt_bfun .

lemma νb_opt_eq: "νb_opt s = (p  ΠHR. νb p s)"
  using νb.rep_eq νb_opt.rep_eq ν_opt_def by presburger

lemma ν_le_ν_opt [intro]:
  assumes "is_policy p"
  shows "ν p s  ν_opt s"
  unfolding ν_opt_def using abs_ν_le assms
  by (force intro: cSUP_upper intro!: bounded_imp_bdd_above boundedI)

lemma νb_le_opt [intro]: "p  ΠHR  νb p  νb_opt"
  using ν_le by (fastforce simp: νb.rep_eq νb_opt.rep_eq)

lemma νb_le_opt_MD [intro]: "p  ΠMD  νb (mk_markovian_det p)  νb_opt"
  by (auto simp: mk_markovian_det_def is_dec_det_def is_dec_def is_policy_def)

lemma νb_le_opt_DD [intro]: "is_dec_det d  νb (mk_stationary_det d)  νb_opt"
  by (auto simp add: is_policy_def mk_markovian_def)

lemma νb_le_opt_DR [intro]: "is_dec d  νb (mk_stationary d)  νb_opt"
  by (auto simp add: is_policy_def mk_markovian_def)

lemma νb_opt_eq_MR: "νb_opt s = (p  ΠMR. νb (mk_markovian p) s)"
proof (rule antisym)
  show "νb_opt s  (pΠMR. νb (mk_markovian p) s)"
    unfolding νb_opt_eq
  proof (rule cSUP_mono)
    show "ΠHR  {}"
      using policies_ne by simp
    show "bdd_above ((λp. νb (mk_markovian p) s) ` ΠMR)"
      by (auto intro!: boundedI bounded_imp_bdd_above abs_ν_le simp: νb.rep_eq) 
    show "n  ΠHR  mΠMR. νb n s  νb (mk_markovian m) s" for n
      using is_ΠMR_as_markovian by (subst νb_as_markovian[symmetric]) fastforce     
  qed
  show "(pΠMR. νb (mk_markovian p) s)  νb_opt s"
    using ΠMR_ne ΠMR_imp_policies 
    by (auto intro!: cSUP_mono bounded_imp_bdd_above boundedI abs_ν_le simp: νb_opt_eq  νb.rep_eq)
qed

lemma summable_norm_disc_reward'[simp]: "summable (λt. l^t * norm (𝒫X p t (r_decb (p t))))"
  using 𝒫X_bounded_r by auto

lemma summable_disc_reward_𝒫X [simp]: "summable (λt. l^t *R 𝒫X p t (r_decb (p t)))"
  using summable_disc_reward 𝒫X_bounded_r by blast

lemma disc_reward_tendsto:
  "(λn. t<n. l^t *R 𝒫X p t (r_decb (p t)))  (t. l^t *R 𝒫X p t (r_decb (p t)))"
  by (simp add: summable_LIMSEQ)

lemma ν_eq_𝒫X: "ν (mk_markovian p) = (i. l^i *R 𝒫X p i (r_decb (p i)))"
proof -
  have "ν (mk_markovian p) s = (i. l^i * 𝒫X p i (r_decb (p i)) s)" for s
    unfolding νb.rep_eq 𝒫X_def ν_eq_Pn Pn'_markovian_eq_Xn'_bind measure_pmf_bind
    using measure_pmf_in_subprob_algebra abs_r_le_rM
    by (subst integral_bind) (auto simp: r_dec_eq_r_K0)
  thus ?thesis
    by (auto simp: suminf_apply_bfun)
qed

lemma νb_eq_𝒫X: "νb (mk_markovian p) = (i. l^i *R 𝒫X p i (r_decb (p i)))"
  by (auto simp: ν_eq_𝒫X νb.rep_eq)

lemma νb_fin_tendsto_νb: "(νb_fin (mk_markovian p))  νb (mk_markovian p)"
  using disc_reward_tendsto νb_eq_𝒫X νb_fin_eq_𝒫X
  by presburger

lemma norm_𝒫1_l_less: "norm (l *R 𝒫1 d) < 1"
  by auto
lemma disc_𝒫1_tendsto: "(λn. (tn. l^t *R 𝒫1 d ^^t))  (t. l^t *R 𝒫1 d ^^t)"
  by (fastforce simp: bounded_iff intro: summable_LIMSEQ')

lemma disc_𝒫1_lim: "lim (λn. (tn. l^t *R 𝒫1 d ^^ t)) = (t. l^t *R 𝒫1 d ^^t)"
  using limI disc_𝒫1_tendsto
  by blast

lemma convergent_disc_𝒫1: "convergent (λn. (tn. l^t *R 𝒫1 d ^^t))"
  using convergentI disc_𝒫1_tendsto 
  by blast

lemma 𝒫1_suminf_ge: 
  assumes "0  u" shows "u  (t. l^t *R 𝒫1 d ^^t) u"
proof -
  have aux: "x. (λn. (tn. l^t *R 𝒫1 d ^^t) u x)  (t. l^t *R 𝒫1 d ^^t) u x"
    using bfun_tendsto_apply_bfun disc_𝒫1_lim lim_blinfun_apply[OF convergent_disc_𝒫1] 
    by fastforce
  have "n. u  (tn. l^t *R 𝒫1 d ^^t) u"
    using 𝒫1_sum_ge[OF assms] by auto
  thus ?thesis
    by (auto intro!: LIMSEQ_le_const[OF aux])
qed

lemma 𝒫1_suminf_pos: 
  assumes "0  u" 
  shows "0  (t. l^t *R 𝒫1 d ^^t) u"
  using 𝒫1_suminf_ge[of u] assms order.trans by auto

lemma lemma_6_1_2_b:
  assumes "v  u"
  shows "(t. l^t *R 𝒫1 d ^^t) v  (t. l^t *R 𝒫1 d ^^t) u"
proof -
  have "0  (n. l ^ n *R 𝒫1 d ^^ n) (u - v)"
    using 𝒫1_suminf_pos assms by simp
  thus ?thesis
    by (simp add: blinfun.diff_right)
qed

lemma ν_stationary: "νb (mk_stationary d) = (t. l^t *R (𝒫1 d ^^ t)) (r_decb d)"
proof -
  have "νb (mk_stationary d) = (t. (l ^ t *R (𝒫1 d ^^ t)) (r_decb d))"
    by (simp add: νb_eq_𝒫X scaleR_blinfun.rep_eq 𝒫X_sconst)
  also have "...  = (t. (l ^ t *R (𝒫1 d ^^ t))) (r_decb d)"
    by (subst bounded_linear.suminf[where f = "λx. blinfun_apply x (r_decb d)"]) 
      (auto intro!: bounded_linear.suminf boundedI)
  finally show ?thesis .
qed

lemma ν_stationary_inv: "νb (mk_stationary d) = invL (id_blinfun - l *R 𝒫1 d) (r_decb d)"
  by (auto simp: ν_stationary invL_inf_sum blincomp_scaleR_right)


text ‹The value of a markovian policy can be expressed in terms of @{const L}.›

lemma ν_step: "νb (mk_markovian p) = L (p 0) (νb (mk_markovian (λn. p (Suc n))))"
proof -
  have s: "summable (λt. l^t *R (𝒫X p (Suc t) (r_decb (p (Suc t)))))"
    using 𝒫X_bound_r by (auto intro!: boundedI[of _ rM])
  have 
    "νb (mk_markovian p) = r_decb (p 0) + (t. l ^ (Suc t) *R 𝒫X p (Suc t) (r_decb (p (Suc t))))"
    by (subst suminf_split_head) (auto simp: νb_eq_𝒫X)
  also have 
    " = r_decb (p 0) + l *R (t. 𝒫1 (p 0) (l^t *R 𝒫X (λn. p (Suc n)) t (r_decb (p (Suc t)))))"
    using suminf_scaleR_right[OF s] by (auto simp: 𝒫X_Suc blinfun.scaleR_right)
  also have 
    " = L (p 0) (νb (mk_markovian (λn. p (Suc n))))"
    using blinfun.bounded_linear_right bounded_linear.suminf[of "blinfun_apply (𝒫1 (p 0))"]
    by (fastforce simp add: νb_eq_𝒫X L_def)
  finally show ?thesis .
qed

lemma L_ν_fix: "νb (mk_stationary d) = L d (νb (mk_stationary d))"
  using ν_step .

lemma L_fix_ν:
  assumes "L p v = v"
  shows "v = νb (mk_stationary p)"
proof -
  have "r_decb p = (id_blinfun - l *R 𝒫1 p) v"
    using assms by (auto simp: eq_diff_eq L_def blinfun.diff_left blinfun.scaleR_left)
  hence "v = (t. (l *R 𝒫1 p)^^t) (r_decb p)"
    using inv_norm_le'(2)[OF norm_𝒫1_l_less] by auto
  thus "v = νb (mk_stationary p)"
    by (auto simp: ν_stationary blincomp_scaleR_right)
qed

lemma L_ν_fix_iff: "L d v = v  v = νb (mk_stationary d)"
  using L_fix_ν L_ν_fix by auto

subsection ‹Properties of Solutions of the Optimality Equations›

abbreviation "𝒫d p n v  l^n *R 𝒫X p n v"

lemma 𝒫d_lim: "(λn. (𝒫d p n v))  0"
proof -
  have "(λn. l^n * norm v)  0"
    by (auto intro!: tendsto_eq_intros)
  moreover have "norm (𝒫d p n v)  l^n * norm v" for p n
    by (simp add: mult_mono')
  ultimately have "(λn. norm (𝒫d p n v))  0" for p
    by (auto simp: Lim_transform_bound[where g = "λn. (l^n * norm v)"])
  thus "(λn. (𝒫d p n v))  0" for p
    using tendsto_norm_zero_cancel by fast
qed



(* 6.2.2 a) in Puterman *)

lemma ℒ_dec_ge_opt:
  assumes "b v  v"
  shows "νb_opt  v"
proof -
  have "νb (mk_markovian p)  v" if "p  ΠMR" for p
  proof -
    let ?p = "mk_markovian p"
    have aux: "νb_fin ?p n + l^n *R 𝒫X p n v  v" for n
    proof (induction n)
      case (Suc n)
      have "𝒫X p n (r_decb (p n)) + l *R (𝒫X p (Suc n) v)  𝒫X p n v"
        using 𝒫X_L_le assms that by (simp add: 𝒫X_Suc_n_elem L_def linear_simps)
      hence "νb_fin ?p (n + 1) + l^(n + 1) *R (𝒫X p (n + 1) v)  νb_fin ?p n + l^n *R (𝒫X p n v)"
        by (auto simp del: scaleR_scaleR intro: scaleR_left_mono simp: νb_fin_eq_𝒫X 
            mult.commute[of l] scaleR_add_right[symmetric] scaleR_scaleR[symmetric])
      also have "  v"
        using Suc.IH by (auto simp: νb_fin_eq_𝒫X)
      finally show ?case
        by auto
    qed (auto simp: νb_fin_eq_𝒫X)
    have 1: "(λn. (νb_fin ?p n + 𝒫d p n v) s)  νb ?p s" for s
      using bfun_tendsto_apply_bfun Limits.tendsto_add[OF νb_fin_tendsto_νb 𝒫d_lim] by fastforce
    have "νb ?p s  v s" for s
      using that aux assms by (fastforce intro!: lim_mono[OF _ 1, of  _ _ "λn. v s"])
    thus ?thesis
      using that by blast
  qed
  thus ?thesis
    using policies_ne by (fastforce simp: is_policy_def νb_opt_eq_MR intro!: cSUP_least)
qed

lemma ℒ_inc_le_opt:
  assumes "v  b v"
  shows "v  νb_opt"
proof -
  have le_elem: "v s  νb_opt s + (e/(1-l))" if "e > 0" for s e
  proof -
    obtain d where "d  DR" and hd: "v  L d v + e *R 1"
      using assms step_mono_elem e > 0 by blast
    let ?Pinf = "(i. l^i *R 𝒫1 d^^i)"
    have "v  r_decb d + l *R (𝒫1 d) v + e *R 1"
      using hd L_def by fastforce
    hence "(id_blinfun - l *R 𝒫1 d) v  r_decb d + e *R 1"
      by (auto simp: blinfun.diff_left blinfun.scaleR_left algebra_simps)
    hence "?Pinf ((id_blinfun - l *R 𝒫1 d) v)  ?Pinf (r_decb d + e *R 1)"
      using lemma_6_1_2_b 𝒫1_def hd by auto
    hence "v  ?Pinf (r_decb d + e *R 1)"
      using inv_norm_le'(2)[of "l *R 𝒫1 d"] by (auto simp: blincomp_scaleR_right)
    also have " = νb (mk_stationary d) + e *R ?Pinf 1"
      by (simp add: ν_stationary blinfun.add_right blinfun.scaleR_right)
    also have " = νb (mk_stationary d) + e *R (i. (l^i *R ((𝒫1 d^^i))) 1)"
      using convergent_disc_𝒫1 
      by (auto simp: summable_iff_convergent' bounded_linear.suminf[of "λx. blinfun_apply x 1"])
    also have " = νb (mk_stationary d) + e *R (i. (l^i *R 1))"
      by (auto simp: scaleR_blinfun.rep_eq)
    also have "  (νb (mk_stationary d) + (e / (1-l)) *R  1)"
      by (auto simp: bounded_linear.suminf[symmetric, where f = "λx. x *R 1"] 
          suminf_geometric bounded_linear_scaleR_left summable_geometric)
    finally have "v s  (νb (mk_stationary d) + (e/(1-l)) *R  1) s"
      by auto
    thus "v s  νb_opt s + (e/(1-l))"
      using d  DR νb_le_opt
      by (auto simp: is_policy_def mk_markovian_def less_eq_bfun_def intro: order_trans)
  qed
  have "v s  νb_opt s + e" if "e > 0" for s e
  proof -
    have "e * (1 - l) > 0"
      by (simp add: 0 < e)
    thus "v s  νb_opt s + e"
      using disc_lt_one that le_elem by (fastforce split: if_splits)
  qed
  thus ?thesis
    by (fastforce intro: field_le_epsilon)
qed    
lemma ℒ_fix_imp_opt:
  assumes "v = b v"
  shows "v = νb_opt"
  using assms dual_order.antisym[OF ℒ_dec_ge_opt ℒ_inc_le_opt] by auto

lemma bounded_P: "bounded (𝒫1 ` X)"
  by (auto simp: bounded_iff)

subsection ‹Solutions to the Optimality Equation›
subsubsection @{const b} and @{const L} are Contraction Mappings›
declare bounded_apply_blinfun[intro] bounded_apply_bfun'[intro]

lemma contraction_ℒ: "dist (b v) (b u)  l * dist v u"
proof -
  have "dist (b v s) (b u s)  l * dist v u" if "b u s  b v s" for s v u
  proof -
    have "dist (b v s) (b u s)  (d  DR. L d v s - L d u s)"
      using ex_dec that by (fastforce intro!: le_SUP_diff' simp: dist_real_def b.rep_eq ℒ_def)
    also have " = (d  DR. l * (𝒫1 d (v - u) s))"
      by (auto simp: L_def right_diff_distrib blinfun.diff_right)
    also have " = l * (d  DR. 𝒫1 d (v - u) s)"
      using DR_ne bounded_P by (fastforce intro: bounded_SUP_mul)
    also have "  l * norm (d  DR. 𝒫1 d (v - u) s)"
      by (simp add: mult_left_mono)
    also have "  l * (d  DR. norm ((𝒫1 d (v - u)) s))"
    proof -
      have "bounded ((λx. norm ((𝒫1 x (v - u)) s)) ` DR)"
        using bounded_apply_bfun' bounded_P bounded_apply_blinfun bounded_norm_comp by metis
      thus ?thesis
        using DR_ne ex_dec bounded_norm_comp by (fastforce intro!: mult_left_mono)
    qed
    also have "  l * (p  DR. norm (𝒫1 p ((v - u))))"
      using DR_ne abs_le_norm_bfun bounded_P
      by (fastforce simp: bounded_norm_comp intro!: bounded_imp_bdd_above mult_left_mono cSUP_mono)
    also have "  l * (p  DR. norm ((v - u)))"
      using norm_push_exp_le_norm DR_ne
      by (fastforce simp: 𝒫1.rep_eq intro!: mult_left_mono cSUP_mono)
    also have " = l * dist v u"
      by (auto simp: dist_norm)
    finally show ?thesis .
  qed
  hence "b u s  b v s  dist (b v s) (b u s)  l * dist v u" 
    "b v s  b u s  dist (b v s) (b u s)  l * dist v u" for u v s
    by (fastforce simp: dist_commute)+
  thus ?thesis
    using linear[of "b u _"] by (fastforce intro: dist_bound)
qed

lemma is_contraction_ℒ: "is_contraction b"
  using contraction_ℒ zero_le_disc disc_lt_one unfolding is_contraction_def by blast

lemma contraction_L: "dist (L p v) (L p u)  l * dist v u"
proof -
  have aux: "L p v s - L p u s  l * dist v u" if lea: "L p v s  L p u s" for v s u
  proof -
    have "L p v s - L p u s = (l *R  (𝒫1 p v - 𝒫1 p u)) s"
      by (simp add: L_def scale_right_diff_distrib)
    also have "  l * norm (𝒫1 p (v - u) s)"
      by (auto simp: blinfun.diff_right intro!: mult_left_mono)
    also have "  l * norm (𝒫1 p (v - u))"
      using abs_le_norm_bfun by (auto intro!: mult_left_mono)
    also have "  l * dist v u"
      by (simp add: 𝒫1.rep_eq mult_left_mono norm_push_exp_le_norm dist_norm)
    finally show ?thesis
      by auto
  qed
  have "dist (L p v s) (L p u s)  l * dist v u" for v s u
    using aux[of v _ u] aux[of u _ v]
    by (cases "L p v s  L p u s") (auto simp: dist_real_def dist_commute)
  thus "dist (L p v) (L p u)  l * dist v u"
    by (simp add: dist_bound)
qed

lemma is_contraction_L: "is_contraction (L p)"
  unfolding is_contraction_def using contraction_L disc_lt_one zero_le_disc by blast

subsubsection ‹Existence of a Fixpoint of @{const b}
lemma b_conv:
  "∃!v. b v = v" "(λn. (b ^^ n) v)  (THE v. b v = v)"
  using banach'[OF is_contraction_ℒ] by auto

lemma b_fix_iff_opt [simp]: "b v = v  v = νb_opt"
  using banach'(1) is_contraction_ℒ ℒ_fix_imp_opt by metis

lemma νb_opt_fix: "νb_opt = (THE v. b v = v)"
  by auto

lemma b_opt [simp]: "b νb_opt = νb_opt"
  by auto

lemma b_lim: "(λn. (b ^^ n) v)  νb_opt"
  using b_conv(2) νb_opt_fix by presburger

lemma thm_6_2_6: "νb p = νb_opt  b (νb p) = νb p"
  by force

lemma thm_6_2_6': "ν p = ν_opt  b (νb p) = νb p"
  using thm_6_2_6 νb.rep_eq νb_opt.rep_eq by fastforce

subsection ‹Existence of Optimal Policies›

definition "ν_improving v d  (s. is_arg_max (λd. (L d v) s) (λd. d  DR) d)"

lemma ν_improving_iff: "ν_improving v d  d  DR  (d'  DR. s. L d' v s  L d v s)"
  by (auto simp: ν_improving_def is_arg_max_linorder)

lemma ν_improving_D_MR[dest]: "ν_improving v d  d  DR"
  by (auto simp add: ν_improving_iff)

lemma ν_improving_ge: "ν_improving v d  d'  DR  L d' v s  L d v s"
  by (auto simp: ν_improving_iff)

lemma ν_improving_imp_ℒb: "ν_improving v d  b v = L d v"
  by (fastforce intro!: cSup_eq_maximum simp: ν_improving_iff b.rep_eq ℒ_def)

lemma b_imp_ν_improving: 
  assumes "d  DR" "b v = L d v"
  shows "ν_improving v d"
  using assms L_le_ℒb by (auto simp: ν_improving_iff assms(2)[symmetric])

lemma ν_improving_alt:
  assumes "d  DR"
  shows "ν_improving v d  b v = L d v"
  using b_imp_ν_improving ν_improving_imp_ℒb assms by blast

definition "ν_conserving d = ν_improving (νb_opt) d"

lemma ν_conserving_iff: "ν_conserving d  d  DR  (d'  DR. s. L d' νb_opt s  L d νb_opt s)"
  by (auto simp: ν_conserving_def ν_improving_iff)

lemma ν_conserving_ge: "ν_conserving d  d'  DR  L d' νb_opt s  L d νb_opt s"
  by (auto simp: ν_conserving_iff intro: ν_improving_ge)

lemma ν_conserving_imp_ℒb [simp]: "ν_conserving d  L d νb_opt = νb_opt"
  using ν_improving_imp_ℒb by (fastforce simp: ν_conserving_def)

lemma b_imp_ν_conserving:
  assumes "d  DR" "b νb_opt = L d νb_opt"
  shows "ν_conserving d"
  using b_imp_ν_improving assms by (auto simp: ν_conserving_def)

lemma ν_conserving_alt: 
  assumes "d  DR"
  shows "ν_conserving d  b νb_opt = L d νb_opt"
  unfolding ν_conserving_def using ν_improving_alt assms by auto

lemma ν_conserving_alt':
  assumes "d  DR"
  shows "ν_conserving d  L d νb_opt = νb_opt"
  using assms ν_conserving_alt by auto

subsubsection ‹Conserving Decision Rules are Optimal›

theorem ex_improving_imp_conserving:
  assumes "v. d. ν_improving v (mk_dec_det d)"
  shows "d. ν_conserving (mk_dec_det d)"
  by (simp add: assms ν_conserving_def)

theorem conserving_imp_opt[simp]:
  assumes "ν_conserving (mk_dec_det d)"
  shows "νb (mk_stationary_det d) = νb_opt"
  using L_ν_fix_iff ν_conserving_imp_ℒb[OF assms] by simp

lemma conserving_imp_opt':
  assumes "d. ν_conserving (mk_dec_det d)"
  shows "d  DD. (νb (mk_stationary_det d)) = νb_opt"
  using assms by (fastforce simp: ν_conserving_def)

theorem improving_att_imp_det_opt:
  assumes "v. d. ν_improving v (mk_dec_det d)"
  shows "νb_opt s = (d  DD. νb (mk_stationary_det d) s)"
proof -
  obtain d where d: "ν_conserving (mk_dec_det d)"
    using assms ex_improving_imp_conserving by auto
  hence "d  DD"
    using ν_conserving_iff is_dec_mk_dec_det_iff by blast
  thus ?thesis
    using ΠMR_imp_policies νb_le_opt
    by (fastforce intro!: cSup_eq_maximum[where z = "νb_opt s", symmetric]
        simp: conserving_imp_opt[OF d] image_iff)
qed


lemma b_sup_att_dec:
  assumes "d  DR" "b v = L d v"
  shows "d'  DD. b v = L (mk_dec_det d') v"
proof -
  have "a A s. L d v s = La a v s" for s
    unfolding L_eq_La
    using assms is_dec_def La_bounded A_ne b.rep_eq ℒ_def
    by (intro lemma_4_3_1') 
      (auto intro: bounded_range_subset simp: assms(2)[symmetric] L_eq_La[symmetric] SUP_step_MR_eq)
  then obtain d' where d: "d' s  A s" "L d v s = La (d' s) v s" for s
    by metis
  thus ?thesis
    using assms d
    by (fastforce simp: is_dec_det_def mk_dec_det_def L_eq_La)
qed

lemma b_sup_att_dec':
  assumes "d  DR" "b v = L d v"
  shows "d'  DD. ν_improving v (mk_dec_det d')"
  using b_sup_att_dec ν_improving_alt assms by force

subsubsection ‹Deterministic Decision Rules are Optimal›

lemma opt_imp_opt_dec_det:
  assumes "p  ΠHR" "νb p = νb_opt" 
  shows "d  DD. νb (mk_stationary_det d) = νb_opt"
proof -
  have aux: "L (as_markovian p (return_pmf s) 0) νb_opt s = νb_opt s" for s
  proof -
    let ?ps = "as_markovian p (return_pmf s)"
    have markovian_suc_le: "νb (mk_markovian (λn. as_markovian p (return_pmf s) (Suc n)))  νb_opt"
      using is_ΠMR_as_markovian assms by (auto simp: is_policy_def mk_markovian_def)
    have aux_le: "x f g. f  g  apply_bfun f x  apply_bfun g x"
      unfolding less_eq_bfun_def by auto
    have "νb_opt s = νb (mk_markovian ?ps) s"
      using assms νb_as_markovian by metis
    also have " = L (?ps 0) (νb (mk_markovian (λn. ?ps (Suc n)))) s"
      using ν_step by blast
    also have "  L (?ps 0) (νb_opt) s"
      unfolding L_def using markovian_suc_le 𝒫1_mono by (auto intro!: mult_left_mono)
    finally have "νb_opt s  L (?ps 0) (νb_opt) s" .
    have "as_markovian p (return_pmf s) 0  DR"
      using is_ΠMR_as_markovian assms by fast
    have "L (?ps 0) νb_opt  νb_opt"
      using ?ps 0  DR L_le_ℒb[of "?ps 0" "νb_opt"] by simp
    thus "L (?ps 0) νb_opt s = νb_opt s"
      using νb_opt s  (L (?ps 0) νb_opt) s by (auto intro!: antisym)
  qed
  have "L (p []) v s = L (as_markovian p (return_pmf s) 0) v s" for v s
    by (auto simp: L_def 𝒫1.rep_eq K_st_def)
  hence "L (p []) νb_opt = νb_opt"
    using aux by auto
  hence "d  DD. L (mk_dec_det d) νb_opt = νb_opt"
    using b_sup_att_dec assms(1) b_opt is_policy_def mem_Collect_eq by metis
  thus ?thesis
    using conserving_imp_opt' ν_conserving_alt' by blast
qed

subsubsection ‹Optimal Decision Rules for Finite Action Spaces›

(* 6.2.10 *)
lemma ex_opt_act: 
assumes "s. finite (A s)"
shows "a  A s. La a (v :: _ b _) s = b v s"
      unfolding b.rep_eq ℒ_eq_SUP_det SUP_step_det_eq
      using arg_max_on_in[OF assms A_ne]
      by (auto simp: cSup_eq_Sup_fin Sup_fin_Max assms A_ne finite_arg_max_eq_Max[symmetric])

lemma ex_opt_dec_det:
assumes "s. finite (A s)"
shows "d  DD. L (mk_dec_det d) (v :: _ b _) = b v"
  unfolding is_dec_det_def mk_dec_det_def
  using ex_opt_act[OF assms]  someI_ex
  apply (auto intro!: exI[of _ λs. SOME a. a  A s  La a v s = b v s] bfun_eqI)
   apply (smt (verit, best) someI_ex)
  apply (subst L_eq_La)
  apply (subst expectation_return_pmf)
  by (smt (verit, best) someI_ex)

lemma thm_6_2_10:
  assumes "s. finite (A s)"
  shows "d  DD. νb_opt = νb (mk_stationary_det d)"
  using assms conserving_imp_opt' b_opt L_ν_fix_iff ex_opt_dec_det 
  by metis

subsubsection ‹Existence of Epsilon-Optimal Policies›

lemma ex_det_eps:
  assumes "0 < e"
  shows "d  DD. b v  L (mk_dec_det d) v + e *R 1"
proof -
  have "a  A s. b v s  La a v s + e" for s
  proof -
    have "bdd_above ((λa. La a v s) ` A s)"
      using La_le by (auto intro!: boundedI bounded_imp_bdd_above)
    hence "a  A s. b v s - e < La a v s"
      unfolding b.rep_eq ℒ_eq_SUP_det SUP_step_det_eq
      by (auto simp: less_cSUP_iff[OF A_ne, symmetric] 0 < e)
    thus "a  A s. b v s  La a v s + e"
      by force
  qed
  thus ?thesis
    unfolding mk_dec_det_def is_dec_det_def
    by (auto simp: L_def 𝒫1.rep_eq bind_return_pmf K_st_def less_eq_bfun_def) metis
qed

lemma thm_6_2_11:
  assumes "eps > 0"
  shows "d  DD. νb_opt  νb (mk_stationary_det d) + eps *R 1"
proof -
  have "(1-l) * eps > 0"
    by (simp add: assms)
  then obtain d where "d  DD" and d: "b νb_opt  L (mk_dec_det d) νb_opt + ((1-l)*eps) *R 1"
    using ex_det_eps[of _ νb_opt] by auto
  let ?d = "mk_dec_det d"
  let ?lK = "l *R 𝒫1 ?d"
  let ?lK_opt = "l *R 𝒫1 ?d νb_opt"
  have "νb_opt   r_decb ?d + ?lK_opt + ((1-l)*eps) *R 1"
    using L_def ℒ_fix_imp_opt d by simp
  hence "νb_opt - ?lK_opt - ((1-l)*eps) *R 1  r_decb ?d"
    by (simp add: cancel_ab_semigroup_add_class.diff_right_commute diff_le_eq)
  hence "(i. ?lK ^^ i) (νb_opt - ?lK_opt - ((1-l)*eps) *R 1)  νb (mk_stationary ?d)"
    using lemma_6_1_2_b suminf_cong by (simp add: blincomp_scaleR_right ν_stationary)
  hence "((i. ?lK ^^ i) oL (id_blinfun - ?lK)) νb_opt - (i. ?lK ^^ i) (((1-l)*eps) *R 1) 
     (νb (mk_stationary ?d))"
    by (simp add: blinfun.diff_right blinfun.diff_left blinfun.scaleR_left)
  hence le: "νb_opt - (i. ?lK ^^ i) (((1-l)*eps) *R 1)  νb (mk_stationary ?d)"
    by (auto simp: inv_norm_le')
  have s: "summable (λi. (l *R 𝒫1 ?d)^^i)"
    using convergent_disc_𝒫1 summable_iff_convergent'
    by (simp add: blincomp_scaleR_right summable_iff_convergent')
  have "(i. ?lK ^^ i) (((1-l)*eps) *R 1) = eps *R 1"
  proof -
    have "(i. ?lK ^^ i) (((1-l)*eps) *R 1) = ((1-l)*eps) *R (i. ?lK^^i) 1"
      using blinfun.scaleR_right by blast
    also have " = ((1-l)*eps) *R (i. (?lK^^i) 1) "
      using s by (auto simp: bounded_linear.suminf[of "λx. blinfun_apply x 1"])
    also have " = ((1-l)*eps) *R (i. (l ^ i)) *R 1"
      by (auto simp: blinfun.scaleR_left blincomp_scaleR_right bounded_linear_scaleR_left 
          bounded_linear.suminf[of "λx. x *R 1"])
    also have " = ((1-l)*eps) *R (1 / (1-l)) *R 1"
      by (simp add: suminf_geometric)
    also have " = eps *R 1"
      using disc_lt_one 0 < (1 - l) * eps by auto
    finally show ?thesis .
  qed
  thus ?thesis
    using d  DD diff_le_eq le
    by auto
qed

lemma ex_det_dist_eps:
  assumes "0 < (e :: real)"
  shows "d  DD. dist (b v) (L (mk_dec_det d) v)  e"
proof -
  obtain d where "d  DD" "L (mk_dec_det d) v  (b v)" 
    and h2: "b v  L (mk_dec_det d) v + e *R 1"
    using assms ex_det_eps L_le_ℒb by blast
  hence "0  b v -  L (mk_dec_det d) v"
    by simp
  moreover have "b v - L (mk_dec_det d) v  e *R 1"
    using h2 by (simp add: add.commute diff_le_eq)
  ultimately have "s. ¦(b v) s -  L (mk_dec_det d) v s¦  e"
    unfolding less_eq_bfun_def by auto
  hence "dist (b v) (L (mk_dec_det d) v)  e"
    unfolding dist_bfun.rep_eq by (auto intro!: cSUP_least simp: dist_real_def)
  thus ?thesis
    using d  DD 
    by auto
qed

lemma less_imp_ex_add_le: "(x :: real) < y  eps>0. x + eps  y"
  by (meson field_le_epsilon less_le_not_le nle_le)

lemma νb_opt_le_det: "νb_opt s  (d  DD. νb (mk_stationary_det d) s)"
proof (subst le_cSUP_iff, safe)
  fix y
  assume "y < νb_opt s"
  then obtain eps where 1: "y  νb_opt s - eps" and "eps > 0"
    using less_imp_ex_add_le by force
  hence "eps / 2 > 0" by auto
  obtain d where "d  DD" and "νb_opt s  νb (mk_stationary_det d) s + eps / 2"
    using thm_6_2_11[OF eps / 2 > 0] by fastforce
  hence "y < νb (mk_stationary_det d) s"
    using eps > 0 by (auto simp: diff_less_eq intro: le_less_trans[OF 1])
  thus "iDD. y < νb (mk_stationary_det i) s"
    using d  DD by blast
next
  show "DD = {}  False"
    using D_det_ne by blast
  show "bdd_above ((λd. νb (mk_stationary_det d) s) ` DD)"
    by (auto intro!: bounded_imp_bdd_above boundedI abs_ν_le simp: νb.rep_eq)
qed

lemma νb_opt_eq_det: "νb_opt s = (d  DD. νb (mk_stationary_det d) s)"
  using νb_le_opt_DD D_det_ne
  by (fastforce intro!: antisym[OF νb_opt_le_det] cSUP_least)

(* unused, delete? *)
lemma lemma_6_3_1_a:
  assumes "v0  bfun"
  shows "uniform_limit UNIV (λn. ((λv.  (Bfun v)) ^^ n) v0) ν_opt sequentially"
proof -
  have ℒ_Bfun_eq: "v0  bfun  ((λv.  (Bfun v))^^n) v0 = (b ^^n) (Bfun v0)" for n
    by (induction n) (auto simp: b.rep_eq apply_bfun_inverse)
  have "uniform_limit UNIV (λn. (b ^^ n) (Bfun v0)) νb_opt sequentially"
    by (intro tendsto_bfun_uniform_limit[OF b_lim])
  hence "uniform_limit UNIV (λn. (b ^^ n) (Bfun v0)) ν_opt sequentially"
    by (simp add: ν_opt_bfun νb_opt.rep_eq)
  thus ?thesis
    by (auto simp: assms ℒ_Bfun_eq)
qed

lemma dist_Suc_tendsto_zero:
  assumes "(λn. f n)  (y::_::real_normed_vector)"
  shows "(λn. dist (f n) (f (Suc n)))  0"
  using assms tendsto_diff tendsto_norm LIMSEQ_Suc by (fastforce simp: dist_norm)

lemma dist_ℒb_tendsto: "(λn. dist ((b^^n) v) ((b^^(Suc n)) v))  0"
  using b_lim by (fast intro!: dist_Suc_tendsto_zero)

definition "max_L_ex s v  has_arg_max (λa. La a v s) (A s)"

lemma νb_fin_zero[simp]: "νb_fin p 0 = 0"
  by (auto simp: νb_fin.rep_eq)

lemma νb_fin_Suc[simp]: 
  "νb_fin (mk_stationary d) (Suc n) = νb_fin (mk_stationary d) n + ((l *R 𝒫1 d)^^ n) (r_decb d)"
  by (auto simp: 𝒫X_sconst νb_fin.rep_eq ν_fin_eq_𝒫X blincomp_scaleR_right blinfun.scaleR_left)

lemma νb_fin_eq: "νb_fin (mk_stationary d) n = (i < n. ((l *R 𝒫1 d)^^ i)) (r_decb d)"
  by (induction n) (auto simp add: plus_blinfun.rep_eq)

lemma L_iter: "(L d ^^ m) v = νb_fin (mk_stationary d) m + ((l *R 𝒫1 d)^^ m) v"
proof (induction m arbitrary: v)
  case (Suc m)
  have "(L d ^^ Suc m) v = (L d ^^ m) (L d v)"
    by (simp add: funpow_Suc_right del: funpow.simps)
  also have " = νb_fin (mk_stationary d) m + ((l *R 𝒫1 d) ^^ m) (L d v)"
    using Suc by simp
  also have " = νb_fin (mk_stationary d) (Suc m) + ((l *R 𝒫1 d) ^^ Suc m) v"
    unfolding L_def 
    by (auto simp: 𝒫1_pow blinfun.bilinear_simps blincomp_scaleR_right funpow_swap1) 
  finally show ?case .
qed simp

lemma bounded_stationary_νb_fin: "bounded ((λx. (νb_fin (mk_stationary x) N) s) ` X)"
  using νb_fin.rep_eq abs_ν_fin_le by (auto intro!: boundedI)

lemma bounded_disc_𝒫1: "bounded ((λx. (((l *R 𝒫1 x) ^^ m) v) s) ` X)"
  by (auto simp: 𝒫X_const[symmetric] blinfun.bilinear_simps blincomp_scaleR_right 
      intro!: boundedI[of _  "l ^ m * norm v"] mult_left_mono order.trans[OF abs_le_norm_bfun])

lemma bounded_disc_𝒫1': "bounded ((λx. ((𝒫1 x ^^ m) v) s) ` X)"
  by (auto simp: 𝒫X_const[symmetric] intro!: boundedI[of _  "norm v"] order.trans[OF abs_le_norm_bfun])

lemma L_iter_le_ℒb: "is_dec d  (L d ^^ n) v  (b ^^ n) v"
  using order_trans[OF L_mono L_le_ℒb] by (induction n) auto

end

subsection ‹More Restrictive MDP Locales›
locale MDP_fin_acts = discrete_MDP +
  assumes "s. finite (A s)"

locale MDP_att_ℒ = MDP_reward_disc A K r l
  for
    A and 
    K :: "'s ::countable × 'a ::countable  's pmf" and
    r and l +
  assumes Sup_att: "max_L_ex (s :: 's) v"
begin
theorem b_eq_argmax_La:
  fixes v :: "'s b real"
  assumes "is_arg_max (λa. La a v s) (λa. a  A s) a"
  shows "b v s = La a v s"
  using La_le assms A_ne b.rep_eq ℒ_eq_SUP_det SUP_step_det_eq
  by (auto intro!: cSUP_upper2 antisym cSUP_least simp: is_arg_max_linorder)

lemma La_le_arg_max: "a  A s  La a v s  La (arg_max_on (λa. La a v s) (A s)) v s"
  using Sup_att app_arg_max_ge[OF Sup_att[unfolded max_L_ex_def]]
  by (simp add: arg_max_on_def)

lemma arg_max_on_in: "has_arg_max f Q  arg_max_on f Q  Q"
  using has_arg_max_arg_max by (auto simp: arg_max_on_def)

lemma b_eq_La_max: "b v s = La (arg_max_on (λa. La a v s) (A s)) v s"
  using app_arg_max_eq_SUP[symmetric] Sup_att max_L_ex_def 
  by (auto simp: b_eq_SUP_det SUP_step_det_eq)

lemma ex_opt_det: "d  DD. b v = L (mk_dec_det d) v"
proof -
  define d where "d = (λs. arg_max_on (λa. La a v s) (A s))"
  have "b v s = L (mk_dec_det d) v s" for s
    by (auto simp: d_def b_eq_La_max L_eq_La_det)
  moreover have "d  DD"
    using Sup_att arg_max_on_in by (auto simp: d_def is_dec_det_def max_L_ex_def)
  ultimately show ?thesis
    by auto
qed

lemma ex_improving_det: "d  DD. ν_improving v (mk_dec_det d)"
  using ν_improving_alt ex_opt_det by auto
end

locale MDP_act = discrete_MDP A K for A :: "'s::countable  'a::countable set" and K +
  fixes arb_act ::  "'a set  'a"
  assumes arb_act_in[simp]: "X  {}  arb_act X  X" 

locale MDP_act_disc = MDP_act A K + MDP_att_ℒ A K r l
  for A :: "'s::countable  'a::countable set" and K r l
begin


lemma is_opt_act_some: "is_opt_act v s (arb_act (opt_acts v s))"
  using arb_act_in[of "{a. is_arg_max (λa. La a v s) (λa. a  A s) a}"] Sup_att has_arg_max_def
  unfolding max_L_ex_def is_opt_act_def by auto

lemma some_opt_acts_in_A: "arb_act (opt_acts v s)  A s"
  using is_opt_act_some unfolding is_opt_act_def is_arg_max_def by auto

lemma ν_improving_opt_acts: "ν_improving v0 (mk_dec_det (λs. arb_act (opt_acts (apply_bfun v0) s)))"
  using is_opt_act_def is_opt_act_some some_opt_acts_in_A
  by (subst ν_improving_alt) (fastforce simp: L_eq_La_det b_eq_argmax_La is_dec_det_def)+

end

locale MDP_finite_type = MDP_reward_disc A K r l
  for A and K :: "'s :: finite × 'a :: finite  's pmf" and r l

end