Theory Regular-Sets.Derivatives

```isabelle
section "Derivatives of regular expressions"

(* Author: Christian Urban *)

theory Derivatives
imports Regular_Exp
begin

text‹This theory is based on work by Brzozowski \<^cite>‹"Brzozowski64"› and Antimirov \<^cite>‹"Antimirov95"›.›

subsection ‹Brzozowski's derivatives of regular expressions›

(* Brzozowski derivative of a regular expression with respect to a single
   character c, by recursion over the expression.  In the Times case the
   extra summand "deriv c r2" appears only when r1 is nullable. *)
fun deriv :: "'a ⇒ 'a rexp ⇒ 'a rexp" where
  "deriv c Zero = Zero"
| "deriv c One = Zero"
| "deriv c (Atom c') = (if c = c' then One else Zero)"
| "deriv c (Plus r1 r2) = Plus (deriv c r1) (deriv c r2)"
| "deriv c (Times r1 r2) =
     (if nullable r1
      then Plus (Times (deriv c r1) r2) (deriv c r2)
      else Times (deriv c r1) r2)"
| "deriv c (Star r) = Times (deriv c r) (Star r)"

(* Derivative with respect to a word: applies deriv for each character of
   the word, from left to right.  The empty word leaves r unchanged. *)
fun derivs :: "'a list ⇒ 'a rexp ⇒ 'a rexp" where
  "derivs [] r = r"
| "derivs (c # s) r = derivs s (deriv c r)"

(* Taking a derivative never introduces new atoms. *)
lemma atoms_deriv_subset:
  shows "atoms (deriv x r) ⊆ atoms r"
  by (induction r) auto

(* Word-level version of atoms_deriv_subset; r is generalised because the
   recursion of derivs changes the expression at every step. *)
lemma atoms_derivs_subset:
  shows "atoms (derivs w r) ⊆ atoms r"
  by (induction w arbitrary: r)
     (auto dest: atoms_deriv_subset[THEN subsetD])

(* Semantic correctness of deriv: its language is Deriv c applied to the
   language of r. *)
lemma lang_deriv:
  shows "lang (deriv c r) = Deriv c (lang r)"
  by (induct r) (simp_all add: nullable_iff)

(* Semantic correctness of derivs, obtained from lang_deriv by induction
   on the word s. *)
lemma lang_derivs:
  shows "lang (derivs s r) = Derivs s (lang r)"
  by (induct s arbitrary: r) (simp_all add: lang_deriv)

text ‹A regular expression matcher:›

(* matcher r s holds iff the derivative of r with respect to the whole
   word s is nullable. *)
definition matcher :: "'a rexp ⇒ 'a list ⇒ bool"
  where "matcher r s ⟷ nullable (derivs s r)"

(* The matcher decides membership in the language of r. *)
lemma matcher_correctness:
  shows "matcher r s ⟷ s ∈ lang r"
  by (induct s arbitrary: r)
     (simp_all add: nullable_iff lang_deriv matcher_def Deriv_def)

subsection ‹Antimirov's partial derivatives›

(* Timess rs r: every expression r' in the set rs is turned into
   Times r' r; the result is the set of all these products. *)
abbreviation
  "Timess rs r ≡ ⋃r' ∈ rs. {Times r' r}"

(* Timess is an image under the map that right-multiplies by r. *)
lemma Timess_eq_image:
  shows "Timess rs r = (λr'. Times r' r) ` rs"
  by auto

(* Antimirov partial derivative with respect to one character c.  Unlike
   deriv it returns a set of expressions: Plus becomes set union, and
   Zero / One yield the empty set. *)
primrec pderiv :: "'a ⇒ 'a rexp ⇒ 'a rexp set" where
  "pderiv c Zero = {}"
| "pderiv c One = {}"
| "pderiv c (Atom c') = (if c = c' then {One} else {})"
| "pderiv c (Plus r1 r2) = (pderiv c r1) ∪ (pderiv c r2)"
| "pderiv c (Times r1 r2) =
     (if nullable r1
      then Timess (pderiv c r1) r2 ∪ pderiv c r2
      else Timess (pderiv c r1) r2)"
| "pderiv c (Star r) = Timess (pderiv c r) (Star r)"

(* Partial derivatives with respect to a word: the empty word yields the
   singleton {r}; for c # s the partial derivatives by c are taken first
   and pderivs s is then applied to each of them. *)
primrec pderivs :: "'a list ⇒ 'a rexp ⇒ ('a rexp) set" where
  "pderivs [] r = {r}"
| "pderivs (c # s) r = ⋃ (pderivs s ` pderiv c r)"

(* pderiv lifted to a set of expressions: union of pderiv c over rs. *)
abbreviation pderiv_set :: "'a ⇒ 'a rexp set ⇒ 'a rexp set"
  where "pderiv_set c rs ≡ ⋃ (pderiv c ` rs)"

(* pderivs lifted to a set of expressions: union of pderivs s over rs. *)
abbreviation pderivs_set :: "'a list ⇒ 'a rexp set ⇒ 'a rexp set"
  where "pderivs_set s rs ≡ ⋃ (pderivs s ` rs)"

(* Partial derivatives by a concatenated word s1 @ s2 are obtained by
   first deriving by s1 and then deriving every result by s2. *)
lemma pderivs_append:
  shows "pderivs (s1 @ s2) r = ⋃ (pderivs s2 ` pderivs s1 r)"
  by (induct s1 arbitrary: r) simp_all

(* Special case of pderivs_append for a word extended by one character at
   the end: derive by s first, then take pderiv c of every result. *)
lemma pderivs_snoc:
  shows "pderivs (s @ [c]) r = pderiv_set c (pderivs s r)"
  by (simp add: pderivs_append)

(* Closed forms of pderivs for Zero, One and Plus, registered as simp
   rules.  Each distinguishes the empty word (singleton result) from a
   non-empty word. *)
lemma pderivs_simps [simp]:
  shows "pderivs s Zero = (if s = [] then {Zero} else {})"
    and "pderivs s One = (if s = [] then {One} else {})"
    and "pderivs s (Plus r1 r2) =
           (if s = [] then {Plus r1 r2} else (pderivs s r1) ∪ (pderivs s r2))"
  by (induct s) simp_all

(* For an atom, every partial derivative by any word is either the atom
   itself or One. *)
lemma pderivs_Atom:
  "pderivs s (Atom c) ⊆ {Atom c, One}"
  by (induct s) simp_all

subsection ‹Relating left-quotients and partial derivatives›

(* Deriv c of the language of r equals the union of the languages of the
   partial derivatives of r by c. *)
lemma Deriv_pderiv:
  "Deriv c (lang r) = ⋃ (lang ` pderiv c r)"
  by (induct r) (auto simp add: nullable_iff conc_UNION_distrib)

(* Word-level version of Deriv_pderiv: Derivs s of the language of r is
   the union of the languages of the partial derivatives pderivs s r.
   Induction on s with r generalised; the Cons case is a calculation, the
   Nil case is discharged by the terminal method of qed. *)
lemma Derivs_pderivs:
  shows "Derivs s (lang r) = ⋃ (lang ` pderivs s r)"
proof (induct s arbitrary: r)
  case (Cons c s)
  have ih: "⋀r. Derivs s (lang r) = ⋃ (lang ` pderivs s r)" by fact
  have "Derivs (c # s) (lang r) = Derivs s (Deriv c (lang r))" by simp
  also have "… = Derivs s (⋃ (lang ` pderiv c r))" by (simp add: Deriv_pderiv)
  (* push Derivs s inside the union *)
  also have "… = Derivss s (lang ` (pderiv c r))"
    by (auto simp add: Derivs_def)
  also have "… = ⋃ (lang ` (pderivs_set s (pderiv c r)))"
    using ih by auto
  also have "… = ⋃ (lang ` (pderivs (c # s) r))" by simp
  finally show "Derivs (c # s) (lang r) = ⋃ (lang ` pderivs (c # s) r)" .
qed (simp add: Derivs_def)

subsection ‹Relating derivatives and partial derivatives›

(* The partial derivatives by c together denote the same language as the
   Brzozowski derivative by c. *)
lemma deriv_pderiv:
  "⋃ (lang ` (pderiv c r)) = lang (deriv c r)"
  unfolding lang_deriv Deriv_pderiv
  by simp

(* Word-level version of deriv_pderiv. *)
lemma derivs_pderivs:
  "⋃ (lang ` (pderivs s r)) = lang (derivs s r)"
  unfolding lang_derivs Derivs_pderivs
  by simp

subsection ‹Finiteness property of partial derivatives›

(* All partial derivatives of r with respect to any word drawn from the
   language A. *)
definition pderivs_lang :: "'a lang ⇒ 'a rexp ⇒ 'a rexp set"
  where "pderivs_lang A r ≡ ⋃x ∈ A. pderivs x r"

(* Introduction rule: to bound pderivs_lang A r by C it suffices to bound
   pderivs s r by C for every word s in A. *)
lemma pderivs_lang_subsetI:
  assumes bound: "⋀s. s ∈ A ⟹ pderivs s r ⊆ C"
  shows "pderivs_lang A r ⊆ C"
  using bound
  unfolding pderivs_lang_def
  by (rule UN_least)

(* pderivs_lang distributes over union in its language argument. *)
lemma pderivs_lang_union:
  shows "pderivs_lang (A ∪ B) r = (pderivs_lang A r ∪ pderivs_lang B r)"
  by (simp add: pderivs_lang_def)

(* pderivs_lang is monotone in its language argument. *)
lemma pderivs_lang_subset:
  shows "A ⊆ B ⟹ pderivs_lang A r ⊆ pderivs_lang B r"
  by (auto simp add: pderivs_lang_def)

(* The set of all non-empty words: everything except the empty list. *)
definition
  "UNIV1 ≡ UNIV - {[]}"

(* Zero has no partial derivatives with respect to non-empty words. *)
lemma pderivs_lang_Zero [simp]:
  "pderivs_lang UNIV1 Zero = {}"
  unfolding UNIV1_def pderivs_lang_def
  by auto

(* One has no partial derivatives with respect to non-empty words. *)
lemma pderivs_lang_One [simp]:
  "pderivs_lang UNIV1 One = {}"
  unfolding UNIV1_def pderivs_lang_def
  by (auto split: if_splits)

(* With respect to non-empty words, the only partial derivative of an
   atom is One.  The script is order-sensitive: it depends on the goals
   left by auto and on a system-generated variable name. *)
lemma pderivs_lang_Atom [simp]:
shows "pderivs_lang UNIV1 (Atom c) = {One}"
unfolding UNIV1_def pderivs_lang_def
apply(auto)
(* bring in the bound pderivs s (Atom c) ⊆ {Atom c, One} from pderivs_Atom *)
apply(frule rev_subsetD)
apply(rule pderivs_Atom)
apply(simp)
(* NOTE(review): xa is a generated name, presumably the word bound by the
   union in pderivs_lang_def; renaming bound variables upstream may break
   this step -- confirm before refactoring *)
apply(case_tac xa)
apply(auto split: if_splits)
done

(* For non-empty words the partial derivatives of Plus are exactly those
   of its two summands. *)
lemma pderivs_lang_Plus [simp]:
  "pderivs_lang UNIV1 (Plus r1 r2) = pderivs_lang UNIV1 r1 ∪ pderivs_lang UNIV1 r2"
  unfolding UNIV1_def pderivs_lang_def
  by auto

text ‹Non-empty suffixes of a string (needed for the cases of @{const Times} and @{const Star} below)›

(* PSuf s: all non-empty words v such that s = u @ v for some prefix u. *)
definition
  "PSuf s ≡ {v. v ≠ [] ∧ (∃u. u @ v = s)}"

(* Non-empty suffixes of s @ [c]: every non-empty suffix of s extended by
   c, plus the one-character word [c] itself. *)
lemma PSuf_snoc:
  "PSuf (s @ [c]) = (PSuf s) @@ {[c]} ∪ {[c]}"
  unfolding PSuf_def conc_def
  by (auto simp add: append_eq_append_conv2 append_eq_Cons_conv)

(* Re-indexing a union over PSuf s @@ {[c]} as a union over PSuf s, with
   the character c appended inside the body. *)
lemma PSuf_Union:
  shows "(⋃v ∈ PSuf s @@ {[c]}. f v) = (⋃v ∈ PSuf s. f (v @ [c]))"
  by (auto simp add: conc_def)

(* Partial derivatives over the suffixes of s extended by c are obtained
   by applying pderiv c to the partial derivatives over the suffixes of s.
   Follows from PSuf_Union (re-indexing) and pderivs_snoc. *)
lemma pderivs_lang_snoc:
  shows "pderivs_lang (PSuf s @@ {[c]}) r = (pderiv_set c (pderivs_lang (PSuf s) r))"
  unfolding pderivs_lang_def
  by (simp add: PSuf_Union pderivs_snoc)

(* Upper bound for the partial derivatives of a product: each one is
   either Times r' r2 for a partial derivative r' of r1 by the same word,
   or a partial derivative of r2 by a non-empty suffix of the word.
   Proved by reverse induction on s (a character is appended at the end);
   the snoc case is a chain of (in)equalities, the empty word is solved by
   the terminal simp. *)
lemma pderivs_Times:
  shows "pderivs s (Times r1 r2) ⊆ Timess (pderivs s r1) r2 ∪ (pderivs_lang (PSuf s) r2)"
proof (induct s rule: rev_induct)
  case (snoc c s)
  have ih: "pderivs s (Times r1 r2) ⊆ Timess (pderivs s r1) r2 ∪ (pderivs_lang (PSuf s) r2)"
    by fact
  have "pderivs (s @ [c]) (Times r1 r2) = pderiv_set c (pderivs s (Times r1 r2))"
    by (simp add: pderivs_snoc)
  also have "… ⊆ pderiv_set c (Timess (pderivs s r1) r2 ∪ (pderivs_lang (PSuf s) r2))"
    using ih by fastforce
  also have "… = pderiv_set c (Timess (pderivs s r1) r2) ∪ pderiv_set c (pderivs_lang (PSuf s) r2)"
    by (simp)
  also have "… = pderiv_set c (Timess (pderivs s r1) r2) ∪ pderivs_lang (PSuf s @@ {[c]}) r2"
    by (simp add: pderivs_lang_snoc)
  (* weaken by adding pderiv c r2, which the nullable branch of pderiv on
     Times can produce *)
  also
  have "… ⊆ pderiv_set c (Timess (pderivs s r1) r2) ∪ pderiv c r2 ∪ pderivs_lang (PSuf s @@ {[c]}) r2"
    by auto
  also
  have "… ⊆ Timess (pderiv_set c (pderivs s r1)) r2 ∪ pderiv c r2 ∪ pderivs_lang (PSuf s @@ {[c]}) r2"
    by (auto simp add: if_splits)
  also have "… = Timess (pderivs (s @ [c]) r1) r2 ∪ pderiv c r2 ∪ pderivs_lang (PSuf s @@ {[c]}) r2"
    by (simp add: pderivs_snoc)
  also have "… ⊆ Timess (pderivs (s @ [c]) r1) r2 ∪ pderivs_lang (PSuf (s @ [c])) r2"
    unfolding pderivs_lang_def by (auto simp add: PSuf_snoc)
  finally show ?case .
qed (simp)

(* Partial derivatives over the non-empty suffixes of a non-empty word are
   among the partial derivatives over all non-empty words. *)
lemma pderivs_lang_Times_aux1:
  assumes nonempty: "s ∈ UNIV1"
  shows "pderivs_lang (PSuf s) r ⊆ pderivs_lang UNIV1 r"
  using nonempty
  unfolding UNIV1_def PSuf_def pderivs_lang_def
  by auto

(* Timess over the partial derivatives by one non-empty word is contained
   in Timess over the partial derivatives by all non-empty words. *)
lemma pderivs_lang_Times_aux2:
  assumes nonempty: "s ∈ UNIV1"
  shows "Timess (pderivs s r1) r2 ⊆ Timess (pderivs_lang UNIV1 r1) r2"
  using nonempty
  unfolding pderivs_lang_def
  by auto

(* Lifts the per-word bound pderivs_Times to the set UNIV1 of all
   non-empty words. *)
lemma pderivs_lang_Times:
shows "pderivs_lang UNIV1 (Times r1 r2) ⊆ Timess (pderivs_lang UNIV1 r1) r2 ∪ pderivs_lang UNIV1 r2"
(* reduce to a statement about each individual word in UNIV1 *)
apply(rule pderivs_lang_subsetI)
(* bound pderivs s (Times r1 r2) by pderivs_Times; the second half of the
   transitivity step remains to be shown *)
apply(rule subset_trans)
apply(rule pderivs_Times)
(* the two auxiliary lemmas are chained in as facts for the automation *)
using pderivs_lang_Times_aux1 pderivs_lang_Times_aux2
apply auto
apply blast
done

(* Upper bound for the partial derivatives of Star r by a non-empty word:
   each has the form Times r' (Star r), where r' is a partial derivative
   of r by a non-empty suffix of the word.  Reverse induction on s; the
   snoc case splits on whether the remaining prefix s is empty. *)
lemma pderivs_Star:
  assumes a: "s ≠ []"
  shows "pderivs s (Star r) ⊆ Timess (pderivs_lang (PSuf s) r) (Star r)"
using a
proof (induct s rule: rev_induct)
  case (snoc c s)
  have ih: "s ≠ [] ⟹ pderivs s (Star r) ⊆ Timess (pderivs_lang (PSuf s) r) (Star r)" by fact
  { assume asm: "s ≠ []"
    have "pderivs (s @ [c]) (Star r) = pderiv_set c (pderivs s (Star r))" by (simp add: pderivs_snoc)
    also have "… ⊆ pderiv_set c (Timess (pderivs_lang (PSuf s) r) (Star r))"
      using ih[OF asm] by fast
    also have "… ⊆ Timess (pderiv_set c (pderivs_lang (PSuf s) r)) (Star r) ∪ pderiv c (Star r)"
      by (auto split: if_splits)
    (* rewrite with the snoc lemmas first, then let auto finish the
       remaining set inclusion *)
    also have "… ⊆ Timess (pderivs_lang (PSuf (s @ [c])) r) (Star r) ∪ (Timess (pderiv c r) (Star r))"
      by (simp only: PSuf_snoc pderivs_lang_snoc pderivs_lang_union)
         (auto simp add: pderivs_lang_def)
    also have "… = Timess (pderivs_lang (PSuf (s @ [c])) r) (Star r)"
      by (auto simp add: PSuf_snoc PSuf_Union pderivs_snoc pderivs_lang_def)
    finally have ?case .
  }
  moreover
  { assume asm: "s = []"
    then have ?case by (auto simp add: pderivs_lang_def pderivs_snoc PSuf_def)
  }
  ultimately show ?case by blast
qed (simp)

(* Lifts the per-word bound pderivs_Star to the set UNIV1 of all non-empty
   words. *)
lemma pderivs_lang_Star:
  shows "pderivs_lang UNIV1 (Star r) ⊆ Timess (pderivs_lang UNIV1 r) (Star r)"
(* reduce to a statement about each individual word in UNIV1 *)
apply(rule pderivs_lang_subsetI)
apply(rule subset_trans)
apply(rule pderivs_Star)
(* side condition of pderivs_Star: the word is non-empty *)
apply(simp add: UNIV1_def)
(* remaining inclusion: suffix-indexed derivatives are among those indexed
   by all non-empty words *)
apply(simp add: UNIV1_def PSuf_def)
apply(auto simp add: pderivs_lang_def)
done

(* Timess preserves finiteness. *)
lemma finite_Timess [simp]:
  assumes fin: "finite A"
  shows "finite (Timess A r)"
  using fin
  by auto

(* Finitely many partial derivatives with respect to non-empty words.
   Induction on r: Zero, One, Atom and Plus are covered by the simp rules
   for pderivs_lang; Times and Star use finite_subset instantiated with
   the bounds pderivs_lang_Times and pderivs_lang_Star. *)
lemma finite_pderivs_lang_UNIV1:
  shows "finite (pderivs_lang UNIV1 r)"
apply(induct r)
apply(simp_all add:
  finite_subset[OF pderivs_lang_Times]
  finite_subset[OF pderivs_lang_Star])
done

(* Splits the partial derivatives over all words into the contribution of
   the empty word and that of the non-empty words. *)
lemma pderivs_lang_UNIV:
  "pderivs_lang UNIV r = pderivs [] r ∪ pderivs_lang UNIV1 r"
  unfolding UNIV1_def pderivs_lang_def
  by blast

(* Finitely many partial derivatives with respect to all words: after the
   split pderivs_lang_UNIV, only finite_pderivs_lang_UNIV1 is needed. *)
lemma finite_pderivs_lang_UNIV:
  shows "finite (pderivs_lang UNIV r)"
  unfolding pderivs_lang_UNIV
  by (simp add: finite_pderivs_lang_UNIV1)

(* Finiteness for an arbitrary language A, by monotonicity from the UNIV
   case. *)
lemma finite_pderivs_lang:
  "finite (pderivs_lang A r)"
  by (metis finite_pderivs_lang_UNIV pderivs_lang_subset
            rev_finite_subset subset_UNIV)

text‹The following relationship between the alphabetic width of regular expressions
(called ‹awidth› below) and the number of partial derivatives was proved
by Antimirov~\<^cite>‹"Antimirov95"› and formalized by Max Haslbeck.›

(* Alphabetic width: the number of Atom occurrences in an expression.
   Zero and One contribute 0; Star does not change the count. *)
fun awidth :: "'a rexp ⇒ nat" where
  "awidth Zero = 0"
| "awidth One = 0"
| "awidth (Atom a) = 1"
| "awidth (Plus r1 r2) = awidth r1 + awidth r2"
| "awidth (Times r1 r2) = awidth r1 + awidth r2"
| "awidth (Star r1) = awidth r1"

(* Timess is an image of a finite set, so it cannot increase cardinality. *)
lemma card_Timess_pderivs_lang_le:
  shows "card (Timess (pderivs_lang A r) s) ≤ card (pderivs_lang A r)"
  using finite_pderivs_lang
  unfolding Timess_eq_image
  by (rule card_image_le)

(* The number of partial derivatives with respect to non-empty words is
   bounded by the alphabetic width.  Induction on r; Plus, Times and Star
   are calculations, the remaining cases are solved by the terminal auto. *)
lemma card_pderivs_lang_UNIV1_le_awidth: "card (pderivs_lang UNIV1 r) ≤ awidth r"
proof (induction r)
  case (Plus r1 r2)
  have "card (pderivs_lang UNIV1 (Plus r1 r2)) = card (pderivs_lang UNIV1 r1 ∪ pderivs_lang UNIV1 r2)" by simp
  also have "… ≤ card (pderivs_lang UNIV1 r1) + card (pderivs_lang UNIV1 r2)"
    by (simp add: card_Un_le)
  also have "… ≤ awidth (Plus r1 r2)" using Plus.IH by simp
  finally show ?case .
next
  case (Times r1 r2)
  have "card (pderivs_lang UNIV1 (Times r1 r2)) ≤ card (Timess (pderivs_lang UNIV1 r1) r2 ∪ pderivs_lang UNIV1 r2)"
    by (simp add: card_mono finite_pderivs_lang pderivs_lang_Times)
  also have "… ≤ card (Timess (pderivs_lang UNIV1 r1) r2) + card (pderivs_lang UNIV1 r2)"
    by (simp add: card_Un_le)
  (* Timess does not increase cardinality *)
  also have "… ≤ card (pderivs_lang UNIV1 r1) + card (pderivs_lang UNIV1 r2)"
    by (simp add: card_Timess_pderivs_lang_le)
  also have "… ≤ awidth (Times r1 r2)" using Times.IH by simp
  finally show ?case .
next
  case (Star r)
  have "card (pderivs_lang UNIV1 (Star r)) ≤ card (Timess (pderivs_lang UNIV1 r) (Star r))"
    by (simp add: card_mono finite_pderivs_lang pderivs_lang_Star)
  also have "… ≤ card (pderivs_lang UNIV1 r)" by (rule card_Timess_pderivs_lang_le)
  also have "… ≤ awidth (Star r)" by (simp add: Star.IH)
  finally show ?case .
qed (auto)

text‹Antimirov's Theorem 3.4:›
(* Over all words there is at most one more partial derivative than over
   the non-empty words, namely r itself (the empty-word case).  Combined
   with the UNIV1 bound this gives awidth r + 1. *)
theorem card_pderivs_lang_UNIV_le_awidth:
  shows "card (pderivs_lang UNIV r) ≤ awidth r + 1"
proof -
  have "card (insert r (pderivs_lang UNIV1 r)) ≤ Suc (card (pderivs_lang UNIV1 r))"
    by (auto simp: card_insert_if[OF finite_pderivs_lang_UNIV1])
  also have "… ≤ Suc (awidth r)"
    by (simp add: card_pderivs_lang_UNIV1_le_awidth)
  finally show ?thesis
    by (simp add: pderivs_lang_UNIV)
qed

text‹Antimirov's Corollary 3.5:›
(* The same bound for an arbitrary language A: pderivs_lang A r is a
   subset of the finite set pderivs_lang UNIV r, so card_mono applies and
   the UNIV bound carries over by transitivity. *)
corollary card_pderivs_lang_le_awidth:
  shows "card (pderivs_lang A r) ≤ awidth r + 1"
  by (rule order_trans[OF
        card_mono[OF finite_pderivs_lang_UNIV pderivs_lang_subset[OF subset_UNIV]]
        card_pderivs_lang_UNIV_le_awidth])

end
```