# Theory KMP

(*<*)
theory KMP
imports
Theory_Of_Lists
begin

hide_const abs

(*>*)
section‹ Knuth-Morris-Pratt matching according to Bird \label{sec:KMP} ›

subsection‹ Step 1: Specification \label{sec:KMP:specification} ›

text‹

We begin with the specification of string matching given by \<^citet>‹‹Chapter~16› in
"Bird:PearlsofFAD:2010"›. (References to Bird'' in the following are to this text.) Note that
we assume @{const ‹eq›} has some nice properties (see \S\ref{sec:equality}) and
use strict lists.

›

fixrec endswith :: "[:'a::Eq_def:] → [:'a:] → tr" where
[simp del]: "endswith⋅pat = selem⋅pat oo stails"

fixrec matches :: "[:'a::Eq_def:] → [:'a:] → [:Integer:]" where
[simp del]: "matches⋅pat = smap⋅slength oo sfilter⋅(endswith⋅pat) oo sinits"

text‹

Bird describes @{term "matches⋅pat⋅xs"} as returning a list of integers ‹p› such that ‹pat› is a
suffix of @{term "stake⋅p⋅xs"}.''

The following examples illustrate this behaviour:

›

lemma "matches⋅[::]⋅[::] = [:0:]"
unfolding matches.unfold endswith.unfold by simp

lemma "matches⋅[::]⋅[:10::Integer, 20, 30:] = [:0, 1, 2, 3:]"
unfolding matches.unfold endswith.unfold by simp

lemma "matches⋅[:1::Integer,2,3,1,2:]⋅[:1,2,1,2,3,1,2,3,1,2:] = [:7, 10:]"
unfolding matches.unfold endswith.unfold
by (simp add: sfilter_scons_let del: sfilter_strict_scons sfilter.simps)

lemma endswith_strict[simp]:
"endswith⋅⊥ = ⊥"
"endswith⋅pat⋅⊥ = ⊥"

lemma matches_strict[simp]:
"matches⋅⊥ = ⊥"
"matches⋅pat⋅⊥ = ⊥"
by (fixrec_simp; clarsimp simp: cfun_eq_iff)+

text‹

Bird's strategy for deriving KMP from this specification is encoded in the following lemmas:
if we can rewrite @{const ‹endswith›} as a composition of a predicate with a
@{const ‹sfoldl›}, then we can rewrite @{const ‹matches›} into a @{const ‹sscanl›}.

›

lemma fork_sfoldl:
shows "sfoldl⋅f1⋅z1 && sfoldl⋅f2⋅z2 = sfoldl⋅(Λ (a, b) z. (f1⋅a⋅z, f2⋅b⋅z))⋅(z1, z2)" (is "?lhs = ?rhs")
proof(rule cfun_eqI)
fix xs show "?lhs⋅xs = ?rhs⋅xs"
by (induct xs arbitrary: z1 z2) simp_all
qed

lemma smap_sfilter_split_cfcomp: ―‹ Bird (16.4) ›
assumes "f⋅⊥ = ⊥"
assumes "p⋅⊥ = ⊥"
shows "smap⋅f oo sfilter⋅(p oo g) = smap⋅cfst oo sfilter⋅(p oo csnd) oo smap⋅(f && g)" (is "?lhs = ?rhs")
proof(rule cfun_eqI)
fix xs show "?lhs⋅xs = ?rhs⋅xs"
using assms by (induct xs) (simp_all add: If2_def[symmetric] split: If2_splits)
qed

lemma Bird_strategy:
assumes endswith: "endswith⋅pat = p oo sfoldl⋅op⋅z"
assumes step: "step = (Λ (n, x) y. (n + 1, op⋅x⋅y))"
assumes "p⋅⊥ = ⊥" ―‹ We can reasonably expect the predicate to be strict ›
shows "matches⋅pat = smap⋅cfst oo sfilter⋅(p oo csnd) oo sscanl⋅step⋅(0, z)"
apply (simp add: matches.simps assoc_oo endswith)
apply (subst smap_sfilter_split_cfcomp, fastforce, fact)
apply (subst slength_sfoldl)
apply (subst fork_sfoldl)
apply (subst sinits_sscanl)
apply (fold step)
apply (rule refl)
done

text‹

Bird proceeds by reworking @{const ‹endswith›} into the form required by @{thm [source] "Bird_strategy"}.
This is eased by an alternative definition of @{const ‹endswith›}.

›

lemma sfilter_supto:
assumes "0 ≤ d"
shows "sfilter⋅(Λ x. le⋅(MkI⋅n - x)⋅(MkI⋅d))⋅(supto⋅(MkI⋅m)⋅(MkI⋅n))
= supto⋅(MkI⋅(if m ≤ n - d then n - d else m))⋅(MkI⋅n)" (is "?sfilterp⋅?suptomn = _")
proof(cases "m ≤ n - d")
case True
moreover
from True assms have "?sfilterp⋅?suptomn = ?sfilterp⋅(supto⋅(MkI⋅m)⋅(MkI⋅(n - d - 1)) :@ supto⋅(MkI⋅(n - d))⋅(MkI⋅n))"
using supto_split1 by auto
moreover from True assms have "?sfilterp⋅(supto⋅(MkI⋅m)⋅(MkI⋅(n - d - 1))) = [::]" by auto
ultimately show ?thesis by (clarsimp intro!: trans[OF sfilter_cong[OF refl] sfilter_const_TT])
next
case False then show ?thesis
by (clarsimp intro!: trans[OF sfilter_cong[OF refl] sfilter_const_TT])
qed

lemma endswith_eq_sdrop: "endswith⋅pat⋅xs = eq⋅pat⋅(sdrop⋅(slength⋅xs - slength⋅pat)⋅xs)"
proof(cases "pat = ⊥" "xs = ⊥" rule: bool.exhaust[case_product bool.exhaust])
case False_False then show ?thesis
by (cases "endswith⋅pat⋅xs";
simp only: endswith.simps cfcomp2 stails_sdrop';
force simp: zero_Integer_def one_Integer_def elim: slengthE)
qed simp_all

lemma endswith_def2:  ―‹ Bird p127 ›
shows "endswith⋅pat⋅xs = eq⋅pat⋅(shead⋅(sfilter⋅(Λ x. prefix⋅x⋅pat)⋅(stails⋅xs)))" (is "?lhs = ?rhs")
proof(cases "pat = ⊥" "xs = ⊥" rule: bool.exhaust[case_product bool.exhaust])
case False_False
from False_False obtain patl xsl where *: "slength⋅xs = MkI⋅xsl" "0 ≤ xsl" "slength⋅pat = MkI⋅patl" "0 ≤ patl"
by (meson Integer.exhaust slength_bottom_iff slength_ge_0)
let ?patl_xsl = "if patl ≤ xsl then xsl - patl else 0"
have "?rhs = eq⋅pat⋅(shead⋅(sfilter⋅(Λ x. le⋅(slength⋅x)⋅(slength⋅pat) andalso prefix⋅x⋅pat)⋅(stails⋅xs)))"
by (subst prefix_slength_strengthen) simp
also have "… = eq⋅pat⋅(shead⋅(sfilter⋅(Λ x. prefix⋅x⋅pat)⋅(sfilter⋅(Λ x. le⋅(slength⋅x)⋅(slength⋅pat))⋅(stails⋅xs))))"
also have "… = eq⋅pat⋅(shead⋅(smap⋅(Λ k. sdrop⋅k⋅xs)⋅(sfilter⋅(Λ k. prefix⋅(sdrop⋅k⋅xs)⋅pat)⋅(sfilter⋅(Λ k. le⋅(slength⋅(sdrop⋅k⋅xs))⋅(MkI⋅patl))⋅(supto⋅(MkI⋅0)⋅(MkI⋅xsl))))))"
using ‹slength⋅xs = MkI⋅xsl› ‹slength⋅pat = MkI⋅patl›
by (simp add: stails_sdrop' sfilter_smap' cfcomp1 zero_Integer_def)
also have "… = eq⋅pat⋅(shead⋅(smap⋅(Λ k. sdrop⋅k⋅xs)⋅(sfilter⋅(Λ k. prefix⋅(sdrop⋅k⋅xs)⋅pat)⋅(sfilter⋅(Λ k. le⋅(MkI⋅xsl - k)⋅(MkI⋅patl))⋅(supto⋅(MkI⋅0)⋅(MkI⋅xsl))))))"
using ‹slength⋅xs = MkI⋅xsl›
by (subst (2) sfilter_cong[where p'="Λ x. le⋅(MkI⋅xsl - x)⋅(MkI⋅patl)"]; fastforce simp: zero_Integer_def)
also have "… = If prefix⋅(sdrop⋅(MkI⋅?patl_xsl)⋅xs)⋅pat
then eq⋅pat⋅(sdrop⋅(MkI⋅?patl_xsl)⋅xs)
else eq⋅pat⋅(shead⋅(smap⋅(Λ k. sdrop⋅k⋅xs)⋅(sfilter⋅(Λ x. prefix⋅(sdrop⋅x⋅xs)⋅pat)⋅(supto⋅(MkI⋅(?patl_xsl + 1))⋅(MkI⋅xsl)))))"
using False_False ‹0 ≤ xsl› ‹0 ≤ patl›
by (subst sfilter_supto) (auto simp: If_distr one_Integer_def)
also have "… = ?lhs" (is "If ?c then _ else ?else = _")
proof(cases ?c)
case TT with ‹slength⋅xs = MkI⋅xsl› ‹slength⋅pat = MkI⋅patl›
show ?thesis by (simp add: endswith_eq_sdrop sdrop_neg zero_Integer_def)
next
case FF ―‹ Recursive case: the lists generated by ‹supto› are too short ›
have "?else = shead⋅(smap⋅(Λ x. eq⋅pat⋅(sdrop⋅x⋅xs))⋅(sfilter⋅(Λ x. prefix⋅(sdrop⋅x⋅xs)⋅pat)⋅(supto⋅(MkI⋅(?patl_xsl + 1))⋅(MkI⋅xsl))))"
using FF by (subst shead_smap_distr[where f="eq⋅pat", symmetric]) (auto simp: cfcomp1)
also have "… = shead⋅(smap⋅(Λ x. seq⋅x⋅FF)⋅(sfilter⋅(Λ x. prefix⋅(sdrop⋅x⋅xs)⋅pat)⋅(supto⋅(MkI⋅(?patl_xsl + 1))⋅(MkI⋅xsl))))"
using False_False * by (subst smap_cong[OF refl, where f'="Λ x. seq⋅x⋅FF"]) (auto simp: zero_Integer_def split: if_splits)
also from * FF have "… = ?lhs"
apply (auto 0 0 simp: shead_smap_distr seq_conv_if endswith_eq_sdrop zero_Integer_def dest!: prefix_FF_not_snilD)
apply (metis (full_types) le_MkI_MkI linorder_not_less order_refl prefix_FF_not_snilD sdrop_all zless_imp_add1_zle)
using FF apply fastforce
done
finally show ?thesis using FF by simp
finally show ?thesis ..
qed simp_all

text‹

Bird then generalizes @{term ‹sfilter⋅(Λ x. prefix⋅x⋅pat) oo stails›} to @{term ‹split›},
where ‹split⋅pat⋅xs› splits ‹pat› into two lists ‹us› and ‹vs› so that
@{prop ‹us :@ vs = pat›} and ‹us› is the longest suffix of ‹xs› that is a prefix of ‹pat›.''

›

fixrec split :: "[:'a::Eq_def:] → [:'a:] → [:'a:] × [:'a:]" where ―‹ Bird p128 ›
[simp del]: "split⋅pat⋅xs = If prefix⋅xs⋅pat then (xs, sdrop⋅(slength⋅xs)⋅pat) else split⋅pat⋅(stail⋅xs)"

lemma split_strict[simp]:
shows "split⋅⊥ = ⊥"
and "split⋅pat⋅⊥ = ⊥"
by fixrec_simp+

lemma split_bottom_iff[simp]: "(split⋅pat⋅xs = ⊥) ⟷ (pat = ⊥ ∨ xs = ⊥)"
by (cases "pat = ⊥"; clarsimp) (induct xs; subst split.unfold; simp)

lemma split_snil[simp]:
assumes "pat ≠ ⊥"
shows "split⋅pat⋅[::] = ([::], pat)"
using assms by fixrec_simp

lemma split_pattern: ―‹ Bird p128, observation ›
assumes "xs ≠ ⊥"
assumes "split⋅pat⋅xs = (us, vs)"
shows "us :@ vs = pat"
using assms
proof(cases "pat = ⊥", simp, induct xs arbitrary: us vs)
case snil then show ?case by (subst (asm) split.unfold) simp
next
case (scons x xs) then show ?case
by (subst (asm) (3) split.unfold)
(fastforce dest: prefix_sdrop_slength simp: If2_def[symmetric] split: If2_splits)
qed simp

lemma endswith_split: ―‹ Bird p128, after defining ‹split› ›
shows "endswith⋅pat = snull oo csnd oo split⋅pat"
proof(rule cfun_eqI)
fix xs show "endswith⋅pat⋅xs = (snull oo csnd oo split⋅pat)⋅xs"
proof(cases "pat = ⊥", simp, induct xs)
case (scons x xs) then show ?case
unfolding endswith_def2
by (subst split.unfold)
(fastforce dest: prefix_sdrop_prefix_eq simp: If2_def[symmetric] If_distr snull_eq_snil split: If2_splits)
qed

lemma split_length_lt:
assumes "pat ≠ ⊥"
assumes "xs ≠ ⊥"
shows "lt⋅(slength⋅(prod.fst (split⋅pat⋅xs)))⋅(slength⋅xs + 1) = TT"
using assms
proof(induct xs)
case (scons x xs) then show ?case
by (subst split.unfold)
(auto simp: If2_def[symmetric] one_Integer_def split: If2_splits elim!: slengthE elim: lt_trans)
qed simp_all

text‹

The predicate ‹p› required by @{thm [source] "Bird_strategy"} is therefore ‹snull oo csnd›. It
remains to find ‹op› and ‹z› such that:

▪ @{term ‹split⋅pat⋅[::] = z›}
▪ @{term ‹split⋅pat⋅(xs :@ [:x:]) = op⋅(split⋅pat⋅xs)⋅x›}

›
text‹

so that @{term ‹split = sfoldl⋅op⋅z›}.

We obtain @{term ‹z = ([::], pat)›} directly from the definition of @{term ‹split›}.

Bird derives ‹op› on the basis of this crucial observation:

›

lemma split_snoc: ―‹ Bird p128 ›
shows "split⋅pat⋅(xs :@ [:x:]) = split⋅pat⋅(cfst⋅(split⋅pat⋅xs) :@ [:x:])"
proof(cases "pat = ⊥", simp, cases "x = ⊥", simp, induct xs)
case (scons x xs) then show ?case
apply -
apply (subst (1 3) split.unfold)
apply (clarsimp simp: If2_def[symmetric] split: If2_splits; intro conjI impI)
apply (subst split.unfold; fastforce)
apply (subst split.unfold; fastforce)
done
qed simp_all

fixrec ―‹ Bird p129 ›
op :: "[:'a::Eq_def:] → [:'a:] × [:'a:] → 'a → [:'a:] × [:'a:]"
where
[simp del]:
"op⋅pat⋅(us, vs)⋅x =
(     If prefix⋅[:x:]⋅vs then (us :@ [:x:], stail⋅vs)
else If snull⋅us then ([::], pat)
else op⋅pat⋅(split⋅pat⋅(stail⋅us))⋅x )"

lemma op_strict[simp]:
"op⋅pat⋅⊥ = ⊥"
"op⋅pat⋅(us, ⊥) = ⊥"
"op⋅pat⋅usvs⋅⊥ = ⊥"
by fixrec_simp+

text‹

Bird demonstrates that @{const ‹op›} is partially correct wrt @{const ‹split›}, i.e.,
@{prop "op⋅pat⋅(split⋅pat⋅xs)⋅x ⊑ split⋅pat⋅(xs :@ [:x:])"}. For total correctness we
essentially prove that @{const ‹op›} terminates on well-defined arguments with an inductive argument.

›

lemma op_induct[case_names step]:
fixes usvs:: "[:'a:] × 'b"
assumes step: "⋀usvs. (⋀usvs'. lt⋅(slength⋅(cfst⋅usvs'))⋅(slength⋅(cfst⋅usvs)) = TT ⟹ P usvs') ⟹ P usvs"
shows "P usvs"
proof(induct usvs rule: wf_induct)
let ?r = "{ (usvs', usvs) |(usvs :: [:'a:] × 'b) (usvs' :: [:'a:] × 'b). lt⋅(slength⋅(cfst⋅usvs'))⋅(slength⋅(cfst⋅usvs)) = TT }"
show "wf ?r"
proof(rule wf_subset[OF wf_inv_image[where f="λ(x, _). slength⋅x", OF wf_subset[OF wf_Integer_ge_less_than[where d=0]]]])
let ?rslen = "{ (slength⋅us', slength⋅us) |(us :: [:'a:]) (us' :: [:'a:]). lt⋅(slength⋅us')⋅(slength⋅us) = TT }"
show "?rslen ⊆ Integer_ge_less_than 0"
apply (clarsimp simp: Integer_ge_less_than_def zero_Integer_def)
apply (metis Integer.exhaust dist_eq_tr(4) dist_eq_tr(6) lt_Integer_bottom_iff lt_MkI_MkI slength_ge_0)
done
show "?r ⊆ inv_image ?rslen (λ(x, _). slength⋅x)" by (auto 0 3)
qed
fix usvs
assume "∀usvs'. (usvs', usvs) ∈ ?r ⟶ P usvs'"
then show "P usvs"
by - (rule step; clarsimp; metis eq_fst_iff)
qed

lemma op_induct'[case_names step]:
assumes step: "⋀us. (⋀us'. lt⋅(slength⋅us')⋅(slength⋅us) = TT ⟹ P us') ⟹ P us"
shows "P us"
by (rule op_induct[where P="P ∘ prod.fst" and usvs="(us, vs)" for vs::unit, simplified])
(fastforce intro: step)

lemma split_snoc_op:
"split⋅pat⋅(xs :@ [:x:]) = op⋅pat⋅(split⋅pat⋅xs)⋅x"
proof(induct "split⋅pat⋅xs" arbitrary: xs rule: op_induct)
case (step xs) show ?case
proof(cases "pat = ⊥" "xs = ⊥" "x = ⊥" rule: bool.exhaust[case_product bool.exhaust bool.exhaust])
case False_False_False
obtain us vs where *: "split⋅pat⋅xs = (us, vs)" by fastforce
from False_False_False * have **: "prefix⋅(us :@ [:x:])⋅pat = prefix⋅[:x:]⋅vs"
using split_pattern same_prefix_prefix sappend_bottom_iff by blast
from False_False_False * **
have ***: "sdrop⋅(slength⋅(us :@ [:x:]))⋅pat = stail⋅vs" if "prefix⋅(us :@ [:x:])⋅pat = TT"
using sdrop_sappend_same[where xs="us :@ [:x:]"] that
by (cases vs; clarsimp) (fastforce dest!: split_pattern)
from False_False_False * ** *** show ?thesis
apply -
apply (subst split_snoc)
apply (subst split.unfold)
apply (subst op.unfold)
apply (fastforce simp: If2_def[symmetric] snull_FF_conv split: If2_splits intro: step split_length_lt)
done
qed simp_all
qed

lemma split_sfoldl_op:
assumes "pat ≠ ⊥"
shows "sfoldl⋅(op⋅pat)⋅([::], pat) = split⋅pat" (is "?lhs = ?rhs")
proof -
have "?lhs⋅xs = ?rhs⋅xs" for xs
using assms by (induct xs rule: srev_induct) (simp_all add: split_snoc_op)
then show ?thesis by (simp add: cfun_eq_iff)
qed

lemma matches_op:
shows "matches⋅pat = smap⋅cfst oo sfilter⋅(snull oo csnd oo csnd)
oo sscanl⋅(Λ (n, usvs) x. (n + 1, op⋅pat⋅usvs⋅x))⋅(0, ([::], pat))" (is "?lhs = ?rhs")
proof(cases "pat = ⊥")
case True
then have "?lhs⋅xs = ?rhs⋅xs" for xs by (cases xs; clarsimp)
then show ?thesis by (simp add: cfun_eq_iff)
next
case False then show ?thesis
apply (subst (2) oo_assoc)
apply (rule Bird_strategy)
apply (simp_all add: endswith_split split_sfoldl_op oo_assoc)
done
qed

text‹

Using @{thm [source] "split_sfoldl_op"} we can rewrite @{const ‹op›} into a more perspicuous form
that exhibits how KMP handles the failure of the text to continue matching the pattern:

›

fixrec
op' :: "[:'a::Eq_def:] → [:'a:] × [:'a:] → 'a → [:'a:] × [:'a:]"
where
[simp del]:
"op'⋅pat⋅(us, vs)⋅x =
(     If prefix⋅[:x:]⋅vs then (us :@ [:x:], stail⋅vs) ― ‹ continue matching ›
else If snull⋅us then ([::], pat) ― ‹ fail at the start of the pattern: discard ‹x› ›
else sfoldl⋅(op'⋅pat)⋅([::], pat)⋅(stail⋅us :@ [:x:]) ― ‹ fail later: discard ‹shead⋅us› and determine where to restart ›
)"

text‹

Intuitively if ‹x› continues the pattern match then we
extend the @{const ‹split›} of ‹pat›
recorded in ‹us› and ‹vs›.  Otherwise we
need to find a prefix of ‹pat› to continue matching
with. If we have yet to make any progress (i.e., ‹us =
‹z›) and discard ‹x›.  Otherwise, because a
match cannot begin with @{term ‹us :@ [:x:]›}, we @{const
‹split›} ‹pat› (aka ‹z›) by
iterating @{const ‹op'›} over @{term
‹stail⋅us :@ [:x:]›}.  The remainder of the
development is about memoising this last computation.

This is not yet the full KMP algorithm as it lacks what we call the
K' optimisation, which we add in \S\ref{sec:KMP:data_refinement}.
Note that a termination proof for @{const "op'"} in HOL is tricky due
to its use of higher-order nested recursion via @{const
‹sfoldl›}.

›

lemma op'_strict[simp]:
"op'⋅pat⋅⊥ = ⊥"
"op'⋅pat⋅(us, ⊥) = ⊥"
"op'⋅pat⋅usvs⋅⊥ = ⊥"
by fixrec_simp+

lemma sfoldl_op'_strict[simp]:
"op'⋅pat⋅(sfoldl⋅(op'⋅pat)⋅(us, ⊥)⋅xs)⋅x = ⊥"
by (induct xs arbitrary: x rule: srev_induct) simp_all

lemma op'_op:
shows "op'⋅pat⋅usvs⋅x = op⋅pat⋅usvs⋅x"
proof(cases "pat = ⊥" "x = ⊥" rule: bool.exhaust[case_product bool.exhaust])
case True_False then show ?thesis
apply (subst op'.unfold)
apply (subst op.unfold)
apply simp
done
next
case False_False then show ?thesis
proof(induct usvs arbitrary: x rule: op_induct)
case (step usvs x)
have *: "sfoldl⋅(op'⋅pat)⋅([::], pat)⋅xs = sfoldl⋅(op⋅pat)⋅([::], pat)⋅xs"
if "lt⋅(slength⋅xs)⋅(slength⋅(cfst⋅usvs)) = TT" for xs
using that
proof(induct xs rule: srev_induct)
case (ssnoc x' xs')
from ssnoc(1,2,4) have "lt⋅(slength⋅xs')⋅(slength⋅(cfst⋅usvs)) = TT"
using lt_slength_0(2) lt_trans by auto
moreover
from step(2) ssnoc(1,2,4) have "lt⋅(slength⋅(cfst⋅(split⋅pat⋅xs')))⋅(slength⋅(cfst⋅usvs)) = TT"
using lt_trans split_length_lt by (auto 10 0)
ultimately show ?case by (simp add: ssnoc.hyps split_sfoldl_op split_snoc_op step)
qed simp_all
from step.prems show ?case
apply (subst op'.unfold)
apply (subst op.unfold)
apply (clarsimp simp: If2_def[symmetric] snull_FF_conv split_sfoldl_op[symmetric] * split: If2_splits)
apply (clarsimp simp: split_sfoldl_op step split_length_lt)
done
qed
qed simp_all

subsection‹ Step 2: Data refinement and the K' optimisation \label{sec:KMP:data_refinement} ›

text‹

Bird memoises the restart computation in @{const ‹op'›} in two steps.  The first reifies
the control structure of @{const ‹op'›} into a non-wellfounded tree, which we discuss here. The
second increases the sharing in this tree; see \S\ref{sec:KMP:increase_sharing}.

Briefly, we cache the @{term ‹sfoldl⋅(op'⋅pat)⋅([::], pat)⋅(stail⋅us :@ [:x:])›}
computation in @{const ‹op'›} by finding a representation'' type @{typ "'t"}
for the abstract'' type @{typ ‹[:'a::Eq_def:] × [:'a:]›}, a
pair of functions @{term ‹rep :: [:'a::Eq_def:] × [:'a:] → 't›},
@{term ‹abs :: 't → [:'a::Eq_def:] × [:'a:]›} where @{prop ‹abs oo rep = ID›}, and then
finding a derived form of @{const ‹op'›} that works on @{typ "'t"} rather
than @{typ "[:'a::Eq_def:] × [:'a:]"}. We also take the opportunity to add the K' optimisation in the form of the @{term ‹next›}
function.

As such steps are essentially @{emph ‹deus ex machina›}, we try to provide some intuition
after showing the new definitions.

›

domain 'a tree ―‹ Bird p130 ›
= Null
| Node (label :: 'a) (lazy left :: "'a tree") (lazy right :: "'a tree") ―‹ Strict in the label @{typ "'a"} ›

(*<*)

lemma tree_injects'[simp]: ―‹ An unconditional form of @{thm [source] tree.injects}. ›
"(Node⋅a⋅l⋅r = Node⋅a'⋅l'⋅r') ⟷ (a = a' ∧ (a ≠ ⊥ ⟶ l = l' ∧ r = r'))"
by (cases "a = ⊥"; clarsimp)

lemma match_Null_match_Node_tree_case: "match_Null⋅t⋅k1 +++ match_Node⋅t⋅k2 = tree_case⋅k1⋅k2⋅t"
by (cases t) simp_all

lemma match_Node_mplus_match_Node: "match_Node⋅x⋅k1 +++ match_Node⋅x⋅k2 = match_Node⋅x⋅(Λ v l r. k1⋅v⋅l⋅r +++ k2⋅v⋅l⋅r)"
by (cases x; clarsimp)

lemma tree_case_distr:
"f⋅⊥ = ⊥ ⟹ f⋅(tree_case⋅g⋅h⋅t) = tree_case⋅(f⋅g)⋅(Λ x l r. f⋅(h⋅x⋅l⋅r))⋅t"
"(tree_case⋅g'⋅h'⋅t)⋅z = tree_case⋅(g'⋅z)⋅(Λ x l r. h'⋅x⋅l⋅r⋅z)⋅t"
by (case_tac [!] t) simp_all

lemma tree_case_cong:
assumes "t = t'"
assumes "t' = Null ⟹ n = n'"
assumes "⋀v l r. ⟦t' = Node⋅v⋅l⋅r; v ≠ ⊥⟧ ⟹ c v l r = c' v l r"
assumes "cont (λ(x, y, z). c x y z)"
assumes "cont (λ(x, y, z). c' x y z)"
shows "tree_case⋅n⋅(Λ v l r. c v l r)⋅t = tree_case⋅n'⋅(Λ v l r. c' v l r)⋅t'"
using assms by (cases t; cases t'; clarsimp simp: prod_cont_iff)

lemma tree_take_smaller:
assumes "tree_take i⋅t = tree_take i⋅u"
assumes "j ≤ i"
shows "tree_take j⋅t = tree_take j⋅u"
using assms by (metis min.orderE tree.take_take)

fixrec tree_map' :: "('a → 'b) → 'a tree → 'b tree" where
"tree_map'⋅f⋅Null = Null"
| "a ≠ ⊥ ⟹ tree_map'⋅f⋅(Node⋅a⋅l⋅r) = Node⋅(f⋅a)⋅(tree_map'⋅f⋅l)⋅(tree_map'⋅f⋅r)"

lemma tree_map'_strict[simp]: "tree_map'⋅f⋅⊥ = ⊥"
by fixrec_simp

lemma tree_map'_ID': "tree_map'⋅ID⋅xs = xs"
by (induct xs) simp_all

lemma tree_map'_ID[simp]: "tree_map'⋅ID = ID"
by (clarsimp simp: cfun_eq_iff tree_map'_ID')

lemma tree_map'_strict_scons[simp]:
assumes "f⋅⊥ = ⊥"
shows "tree_map'⋅f⋅(Node⋅a⋅l⋅r) = Node⋅(f⋅a)⋅(tree_map'⋅f⋅l)⋅(tree_map'⋅f⋅r)"
using assms by (cases "a = ⊥"; clarsimp)

lemma tree_map'_comp'[simp]:
assumes "f⋅⊥ = ⊥"
shows "tree_map'⋅f⋅(tree_map'⋅g⋅t) = tree_map'⋅(f oo g)⋅t"
using assms by (induct t) simp_all

lemma tree_map'_comp[simp]:
assumes "f⋅⊥ = ⊥"
shows "tree_map'⋅f oo tree_map'⋅g = tree_map'⋅(f oo g)"
using assms by (clarsimp simp: cfun_eq_iff)

lemma tree_unique: ―‹ Adapted from @{cite [cite_macro=citet] "Matthews:1999"} for \emph{contractive functions}. ›
fixes x :: "'a tree"
assumes xfx: "x = f⋅x"
assumes f: "⋀i t u. tree_take i⋅t = tree_take i⋅u
⟹ tree_take (Suc i)⋅(f⋅t) = tree_take (Suc i)⋅(f⋅u)"
shows "x = fix⋅f"
proof(rule tree.take_lemma)
fix i show "tree_take i⋅x = tree_take i⋅(fix⋅f)"
proof(induct i)
case (Suc i) from xfx f[OF Suc, folded fix_eq] show ?case by simp
qed simp
qed

(*>*)

fixrec "next" :: "[:'a::Eq_def:] → ([:'a:] × [:'a:]) tree → ([:'a:] × [:'a:]) tree" where
"next⋅[::]⋅t = t"
| "⟦x ≠ ⊥; xs ≠ ⊥⟧ ⟹
next⋅(x :# xs)⋅Null = Null"
| "⟦x ≠ ⊥; xs ≠ ⊥⟧ ⟹
next⋅(x :# xs)⋅(Node⋅(us, [::])⋅l⋅r) = Node⋅(us, [::])⋅l⋅r"
| "⟦v ≠ ⊥; vs ≠ ⊥; x ≠ ⊥; xs ≠ ⊥⟧ ⟹
next⋅(x :# xs)⋅(Node⋅(us, v :# vs)⋅l⋅r) = If eq⋅x⋅v then l else Node⋅(us, v :# vs)⋅l⋅r"

fixrec ―‹ Bird p131 an even simpler form'', with the K' optimisation ›
root2  :: "[:'a::Eq_def:] → ([:'a:] × [:'a:]) tree"
and op2    :: "[:'a:] → ([:'a:] × [:'a:]) tree → 'a → ([:'a:] × [:'a:]) tree"
and rep2   :: "[:'a:] → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree"
and left2  :: "[:'a:] → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree"
and right2 :: "[:'a:] → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree"
where
[simp del]:
"root2⋅pat = rep2⋅pat⋅([::], pat)"
| "op2⋅pat⋅Null⋅x = root2⋅pat"
| "usvs ≠ ⊥ ⟹
op2⋅pat⋅(Node⋅usvs⋅l⋅r)⋅x = If prefix⋅[:x:]⋅(csnd⋅usvs) then r else op2⋅pat⋅l⋅x"
| [simp del]:
"rep2⋅pat⋅usvs = Node⋅usvs⋅(left2⋅pat⋅usvs)⋅(right2⋅pat⋅usvs)"
| "left2⋅pat⋅([::], vs) = next⋅vs⋅Null"
| "⟦u ≠ ⊥; us ≠ ⊥⟧ ⟹
left2⋅pat⋅(u :# us, vs) = next⋅vs⋅(sfoldl⋅(op2⋅pat)⋅(root2⋅pat)⋅us)" ―‹ Note the use of @{term ‹op2›} and @{const ‹next›}. ›
| "right2⋅pat⋅(us, [::]) = Null" ―‹ Unreachable ›
| "⟦v ≠ ⊥; vs ≠ ⊥⟧ ⟹
right2⋅pat⋅(us, v :# vs) = rep2⋅pat⋅(us :@ [:v:], vs)"

fixrec abs2 :: "([:'a:] × [:'a:]) tree → [:'a:] × [:'a:]" where
"usvs ≠ ⊥ ⟹ abs2⋅(Node⋅usvs⋅l⋅r) = usvs"

fixrec matches2 :: "[:'a::Eq_def:] → [:'a:] → [:Integer:]" where
[simp del]: "matches2⋅pat = smap⋅cfst oo sfilter⋅(snull oo csnd oo abs2 oo csnd)
oo sscanl⋅(Λ (n, x) y. (n + 1, op2⋅pat⋅x⋅y))⋅(0, root2⋅pat)"

text‹

\begin{figure}
\centering
\begin{tikzpicture}[
shorten >=1pt,
node distance=1.5cm,
on grid,
auto,
initial text=,
thick,
accepting/.style = {rectangle,minimum size=0.3cm}
]
\node[state,accepting] (q_0i)                     {};
\node[state,initial]   (q_0)  [right=of q_0i]     {$q_0$};
\node[state]           (q_1)  [right=of q_0]      {$q_1$};
\node[state]           (q_2)  [right=of q_1]      {$q_2$};
\node[state]           (q_3)  [right=of q_2]      {$q_3$};
\node[state]           (q_4)  [right=of q_3]      {$q_4$};
\node[state,double]    (q_5)  [right=of q_4]      {$q_5$};
\node[state,accepting] (q_5r) [right=of q_5]      {};

\path[->] (q_0) edge [bend left] node [above]       {0} (q_1)
(q_1) edge [bend left] node [above]       {1} (q_2)
(q_2) edge [bend left] node [above]       {0} (q_3)
(q_3) edge [bend left] node [above]       {0} (q_4)
(q_4) edge [bend left] node [above]       {1} (q_5)
(q_5) edge [bend left] node [above]       {*} (q_5r);

\path[->] (q_0) edge [bend right]               (q_0i)
(q_1) edge [bend left]                (q_0)
(q_2) edge [bend left]                (q_0)  % MP
(q_2) edge [bend left,color=red]      (q_0i) % K opt
(q_3) edge [bend left]                (q_1)
(q_4) edge [bend left]                (q_1)  % MP
(q_4) edge [bend left,color=red]      (q_0)  % K opt
(q_5) edge [bend left]                (q_2);
\end{tikzpicture}
\caption{An example from \<^citet>‹‹\S2.1› in "CrochemoreRytter:2002"›. The MP tree for the
pattern $01001$ is drawn in black: right transitions are labelled
with a symbol, whereas left transitions are unlabelled. The two
K'-optimised left transitions are shown in red. The boxes denote
@{const ‹Null›}. The root node is $q_0$.}
\label{fig:example_tree}
\end{figure}

This tree can be interpreted as a sort of automaton\footnote{\<^citet>‹‹\S3.1› in "Bird:2012"› suggests it can
be thought of as a doubly-linked list, following \<^citet>‹"TakeichiAkama:1990"›.)}, where @{const
‹op2›} goes @{const ‹right›} if the pattern
continues with the next element of the text, and @{const
‹left›} otherwise, to determine how much of a prefix of
the pattern could still be in play.  Figure~\ref{fig:example_tree}
visualises such an automaton for the pattern $01001$, used by \<^citet>‹‹\S2.1› in "CrochemoreRytter:2002"› to
illustrate the difference between Morris-Pratt (MP) and
Knuth-Morris-Pratt (KMP) preprocessing as we discuss below.  Note that
these are not the classical Mealy machines that correspond to regular
expressions, where all outgoing transitions are labelled with symbols.

The following lemma shows how our sample automaton is encoded as a non-wellfounded tree.

›

lemma concrete_tree_KMP:
shows "root2⋅[:0::Integer, 1, 0, 0, 1:]
= (μ q0. Node⋅([::], [:0, 1, 0, 0, 1:])
⋅Null
⋅(μ q1. Node⋅([:0:], [:1, 0, 0, 1:])
⋅q0
⋅(μ q2. Node⋅([:0,1:], [:0, 0, 1:])
⋅Null ―‹ K optimisation: MP ‹q0› ›
⋅(Node⋅([:0,1,0:], [:0, 1:])
⋅q1
⋅(Node⋅([:0,1,0,0:], [:1:])
⋅q0 ―‹ K optimisation: MP ‹q1› ›
⋅(Node⋅([:0,1,0,0,1:], [::])⋅q2⋅Null))))))"
(is "?lhs = fix⋅?F")
proof(rule tree_unique) ✐‹tag invisible›
note rep2.simps[simp]
show "?lhs = ?F⋅?lhs"
apply (subst root2.unfold; simp)
apply (rule tree_unique; simp)
apply (intro conjI)
apply (subst (1) root2.unfold; simp)
apply (subst (1) root2.unfold; fastforce)
apply (rule tree_unique; simp)
apply (intro conjI)
apply (subst (1) root2.unfold; simp)
apply (subst (1) root2.unfold; simp)
apply (subst (1) root2.unfold; simp)
apply (subst (1) root2.unfold; fastforce)
apply (subst (1) root2.unfold; simp)
apply (subst (1) root2.unfold; simp)
apply (subst (1) root2.unfold; simp)
apply (subst (1 2) root2.unfold; fastforce)
apply (subst (1) root2.unfold; simp)
apply (subst (1) root2.unfold; simp)
apply (subst (1) root2.unfold; fastforce)
apply (rename_tac i t u; case_tac i; clarsimp)
apply (rename_tac t u i; case_tac i; clarsimp)
apply (rename_tac t u i; case_tac i; clarsimp)
apply (meson Suc_n_not_le_n linear tree_take_smaller)
apply (rule parallel_fix_ind; simp)
apply (rename_tac i t u x y; case_tac i; clarsimp)
apply (rename_tac i; case_tac i; clarsimp; intro conjI)
apply (meson Suc_n_not_le_n linear tree_take_smaller)
apply (rename_tac i; case_tac i; clarsimp)
apply (rename_tac i; case_tac i; clarsimp)
apply (meson Suc_n_not_le_n linear tree_take_smaller)
done
fix i :: nat
fix t u :: "([:Integer:] × [:Integer:]) tree"
assume "tree_take i⋅t = tree_take i⋅u"
then show "tree_take (Suc i)⋅(?F⋅t) = tree_take (Suc i)⋅(?F⋅u)"
apply simp
apply (rule parallel_fix_ind; simp)
apply (case_tac i; clarsimp; intro conjI)
apply (meson Suc_n_not_le_n linear tree_take_smaller)
apply (rule parallel_fix_ind; simp)
apply (rename_tac j t0 t1; case_tac j; clarsimp)
apply (rename_tac j; case_tac j; clarsimp; intro conjI)
apply (meson Suc_n_not_le_n linear tree_take_smaller)
apply (rename_tac j; case_tac j; clarsimp; intro conjI)
apply (meson Suc_n_not_le_n linear tree_take_smaller)
apply (rename_tac j; case_tac j; clarsimp)
apply (meson Suc_n_not_le_n linear tree_take_smaller)
done
qed

text‹

The sharing that we expect from a lazy (call-by-need) evaluator is here implied by the use of
nested fixed points.

The KMP preprocessor is expressed by the @{const ‹left2›} function, where @{const ‹op2›} is used
to match the pattern against itself; the use of @{const ‹op2›} in @{const ‹matches2›} (the driver'')
is responsible for matching the (preprocessed) pattern against the text. This formally cashes in
an observation by \<^citet>‹‹\S5› in "vanderWoude:1989"›, that these two algorithms
are essentially the same, which has eluded other presentations\footnote{For instance, contrast
our shared use of @{const ‹op2›} with the separated \texttt{match}
and \texttt{rematch} functions of \<^citet>‹‹Figure~1› in "AgerDanvyRohde:2006"›.}.

Bird uses @{const ‹Null›} on a left path to signal to the driver that it should discard the
current element of the text and restart matching from the beginning of the pattern (i.e,
@{const ‹root2›}). This is a step towards the removal of @{term ‹us›} in \S\ref{sec:KMP:step8}.

Note that the @{const ‹Null›} at the end of the rightmost path is unreachable: the rightmost
@{const ‹Node›} has @{term "vs = [::]"} and therefore @{const ‹op2›} always takes the left branch.

The K' optimisation is perhaps best understood by example. Consider
the automaton in Figure~\ref{fig:example_tree}, and a text beginning
with \texttt{011}. Using the MP (black) transitions we take the path
$\rightarrow q_0 \stackrel{{\mathtt{0}}}{\rightarrow} q_1 \stackrel{\mathtt{1}}{\rightarrow} \overbrace{q_2 \rightarrow q_0 \rightarrow \Box}$. Now, due to the failure of the comparison of the
current element of the text (\texttt{1}) at $q_2$, we can predict that
the (identical) comparison at node $q_0$ will fail as well, and
therefore have $q_2$ left-branch directly to $\Box$. This saves a
comparison in the driver at the cost of another in the preprocessor
(in @{const ‹next›}). These optimisations are the red
arrows in the diagram, and can in general save an arbitrary number of
driver comparisons; consider the pattern $\mathtt{1}^n$ for instance.

More formally, @{const ‹next›} ensures that the heads of
the suffixes of the pattern (@{term ‹vs›}) on consecutive
labels on left paths are distinct; see below for a proof of this fact
in our setting, and \<^citet>‹‹\S3.3.4› in
"Gusfield:1997"› for a classical account. Unlike Bird's suggestion
(p134), our @{const ‹next›} function is not recursive.

We note in passing that while MP only allows ‹Null› on
the left of the root node, ‹Null› can be on the left of
any KMP node except for the rightmost
(i.e., the one that signals a complete pattern match) where no optimisation is possible.

We proceed with the formalities of the data refinement.

›

schematic_goal root2_op2_rep2_left2_right2_def: ― ‹ Obtain the definition of these functions as a single fixed point ›
"( root2  :: [:'a::Eq_def:] → ([:'a:] × [:'a:]) tree
, op2    :: [:'a:] → ([:'a:] × [:'a:]) tree → 'a → ([:'a:] × [:'a:]) tree
, rep2   :: [:'a:] → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree
, left2  :: [:'a:] → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree
, right2 :: [:'a:] → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree )
= fix⋅?F"
unfolding op2_def root2_def rep2_def left2_def right2_def by simp

lemma abs2_strict[simp]:
"abs2⋅⊥ = ⊥"
"abs2⋅Null = ⊥"
by fixrec_simp+

lemma next_strict[simp]:
"next⋅⊥ = ⊥"
"next⋅xs⋅⊥ = ⊥"
"next⋅(x :# xs)⋅(Node⋅(us, ⊥)⋅l⋅r) = ⊥"
apply fixrec_simp
apply (cases xs; fixrec_simp; simp)
apply (cases "x = ⊥"; cases "xs = ⊥"; cases "us = ⊥"; fixrec_simp)
done

lemma next_Null[simp]:
assumes "xs ≠ ⊥"
shows "next⋅xs⋅Null = Null"
using assms by (cases xs) simp_all

lemma next_snil[simp]:
assumes "xs ≠ ⊥"
shows "next⋅xs⋅(Node⋅(us, [::])⋅l⋅r) = Node⋅(us, [::])⋅l⋅r"
using assms by (cases xs) simp_all

lemma op2_rep2_left2_right2_strict[simp]:
"op2⋅pat⋅⊥ = ⊥"
"op2⋅pat⋅(Node⋅(us, ⊥)⋅l⋅r) = ⊥"
"op2⋅pat⋅(Node⋅usvs⋅l⋅r)⋅⊥ = ⊥"
"rep2⋅pat⋅⊥ = ⊥"
"left2⋅pat⋅(⊥, vs) = ⊥"
"left2⋅pat⋅(us, ⊥) = ⊥"
"right2⋅pat⋅(us, ⊥) = ⊥"
apply fixrec_simp
apply (cases "us = ⊥"; fixrec_simp; simp)
apply (cases "usvs = ⊥"; fixrec_simp; simp)
apply fixrec_simp
apply fixrec_simp
apply (cases us; fixrec_simp)
apply fixrec_simp
done

lemma snd_abs_root2_bottom[simp]: "prod.snd (abs2⋅(root2⋅⊥)) = ⊥"

lemma abs_rep2_ID'[simp]: "abs2⋅(rep2⋅pat⋅usvs) = usvs"
by (cases "usvs = ⊥"; subst rep2.unfold; clarsimp)

lemma abs_rep2_ID: "abs2 oo rep2⋅pat = ID"
by (clarsimp simp: cfun_eq_iff)

lemma rep2_snoc_right2: ―‹ Bird p131 ›
assumes "prefix⋅[:x:]⋅vs = TT"
shows "rep2⋅pat⋅(us :@ [:x:], stail⋅vs) = right2⋅pat⋅(us, vs)"
using assms by (cases "x = ⊥"; cases vs; clarsimp)

lemma not_prefix_op2_next:
assumes "prefix⋅[:x:]⋅xs = FF"
shows "op2⋅pat⋅(next⋅xs⋅(rep2⋅pat⋅usvs))⋅x = op2⋅pat⋅(rep2⋅pat⋅usvs)⋅x"
proof -
obtain us vs where "usvs = (us, vs)" by force
with assms show ?thesis
by (cases xs; cases us; clarsimp; cases vs;
clarsimp simp: rep2.simps prefix_singleton_FF If2_def[symmetric] split: If2_splits)
qed

text‹

Bird's appeal to ‹foldl_fusion› (p130) is too weak to
justify this data refinement as his condition (iii) requires the
worker functions to coincide on all representation values. Concretely

\begin{center}
@{prop "rep2⋅pat⋅(op⋅pat⋅(abs2⋅t)⋅x) = op2⋅pat⋅t⋅x"} ―‹Bird (17.2)›
\end{center}

where ‹t› is an arbitrary tree. This does not hold for junk representations
such as:

\begin{center}
@{term ‹t = Node⋅(pat, [::])⋅Null⋅Null›}
\end{center}

Using worker/wrapper fusion \<^citep>‹"GillHutton:2009" and "Gammie:2011"› specialised to @{const
‹sscanl›} (@{thm [source] "sscanl_ww_fusion"}) we only
need to establish this identity for valid representations, i.e., when
‹t› lies under the image of ‹rep2›. In
pictures, we show that this diagram commutes:

\begin{center}
\begin{tikzcd}[column sep=8em]
usvs \arrow[r, "‹Λ usvs. op⋅pat⋅usvs⋅x›"] \arrow[d, "‹rep2⋅pat›"] & usvs' \arrow[d, "‹rep2⋅pat›"] \\
t \arrow[r, "‹Λ usvs. op2⋅pat⋅usvs⋅x ›"]                         & t'
\end{tikzcd}
\end{center}

Clearly this result self-composes: after an initial @{term
‹rep2⋅pat›} step, we can repeatedly simulate
@{const ‹op›} steps with @{const ‹op2›} steps.

›

lemma op_op2_refinement:
assumes "pat ≠ ⊥"
shows "rep2⋅pat⋅(op⋅pat⋅usvs⋅x) = op2⋅pat⋅(rep2⋅pat⋅usvs)⋅x"
proof(cases "x = ⊥" "usvs = ⊥" rule: bool.exhaust[case_product bool.exhaust])
case False_False
then have "x ≠ ⊥" "usvs ≠ ⊥" by simp_all
then show ?thesis
proof(induct usvs arbitrary: x rule: op_induct)
case (step usvs)
obtain us vs where usvs: "usvs = (us, vs)" by fastforce
have *: "sfoldl⋅(op2⋅pat)⋅(root2⋅pat)⋅xs = rep2⋅pat⋅(split⋅pat⋅xs)" if "lt⋅(slength⋅xs)⋅(slength⋅us) = TT" for xs
using that
proof(induct xs rule: srev_induct)
case (ssnoc x xs)
from ssnoc(1,2,4) have IH: "sfoldl⋅(op2⋅pat)⋅(root2⋅pat)⋅xs = rep2⋅pat⋅(split⋅pat⋅xs)"
by - (rule ssnoc; auto intro: lt_trans dest: lt_slength_0)
obtain us' vs' where us'vs': "split⋅pat⋅xs = (us', vs')" by fastforce
from ‹pat ≠ ⊥› ssnoc(1,2,4) usvs show ?case
apply (clarsimp simp: split_sfoldl_op[symmetric] IH)
apply (rule step(1)[simplified abs_rep2_ID', simplified, symmetric])
using lt_trans split_length_lt split_sfoldl_op apply fastforce+
done
qed (fastforce simp: ‹pat ≠ ⊥› root2.unfold)+
have **: "If snull⋅us then rep2⋅pat⋅([::], pat) else rep2⋅pat⋅(op⋅pat⋅(split⋅pat⋅(stail⋅us))⋅x)
= op2⋅pat⋅(left2⋅pat⋅(us, vs))⋅x" if "prefix⋅[:x:]⋅vs = FF"
proof(cases us)
case snil with that show ?thesis
by simp (metis next_Null op2.simps(1) prefix.simps(1) prefix_FF_not_snilD root2.simps)
next
case (scons u' us')
from ‹pat ≠ ⊥› scons have "lt⋅(slength⋅(cfst⋅(split⋅pat⋅us')))⋅(slength⋅us) = TT"
using split_length_lt by fastforce
from ‹pat ≠ ⊥› ‹x ≠ ⊥› usvs that scons this show ?thesis
by (clarsimp simp: * step(1)[simplified abs_rep2_ID'] not_prefix_op2_next)
qed simp
from ‹usvs ≠ ⊥› usvs show ?case
apply (subst (2) rep2.unfold)
apply (subst op2.unfold)
apply (subst op.unfold)
apply (clarsimp simp: If_distr rep2_snoc_right2 ** cong: If_cong)
done
qed

text‹

Therefore the result of this data refinement is extensionally equal to
the specification:

›

lemma data_refinement:
shows "matches = matches2"
proof(intro cfun_eqI)
fix pat xs :: "[:'a:]" show "matches⋅pat⋅xs = matches2⋅pat⋅xs"
proof(cases "pat = ⊥")
case True then show ?thesis by (cases xs; clarsimp simp: matches2.simps)
next
case False then show ?thesis
unfolding matches2.simps
apply (subst matches_op) ―‹ Continue with previous derivation. ›
apply (subst sscanl_ww_fusion[where wrap="ID ** abs2" and unwrap="ID ** rep2⋅pat" and f'="Λ (n, x) y. (n + 1, op2⋅pat⋅x⋅y)"])
apply (simp add: oo_assoc sfilter_smap root2.unfold)
done
qed
qed

text‹

This computation can be thought of as a pair coroutines with a
producer (@{const ‹root2›}/@{const ‹rep2›})
/ consumer (@{const ‹op2›}) structure. It turns out that
laziness is not essential (see \S\ref{sec:implementations}), though it
does depend on being able to traverse incompletely defined trees.

The key difficulty in defining this computation in HOL using present
technology is that @{const ‹op2›} is neither terminating
nor @{emph ‹friendly›} in the terminology of \<^citet>‹"BlanchetteEtAl:2017"›.

While this representation works for automata with this sort of
structure, it is unclear how general it is; in particular it may not
work so well if @{const ‹left›} branches can go forward
as well as back. See also the commentary in \<^citet>‹"HinzeJeuring:2001"›, who observe that sharing is easily lost, and so
it is probably only useful in closed'' settings like the present
one, unless the language is extended in unusual ways \<^citep>‹"JeanninEtAl:2017"›.

\label{thm:k_property}

We conclude by proving that @{const ‹rep2›} produces
trees that have the K' property, viz that labels on consecutive nodes
on a left path do not start with the same symbol. This also
establishes the productivity of @{const ‹root2›}. The
pattern of proof used here -- induction nested in coinduction --
recurs in \S\ref{sec:KMP:increase_sharing}.

›

coinductive K :: "([:'a::Eq:] × [:'a:]) tree ⇒ bool" where
"K Null"
| "⟦ usvs ≠ ⊥; K l; K r;
⋀v vs. csnd⋅usvs = v :# vs ⟹ l = Null ∨ (∃v' vs'. csnd⋅(label⋅l) = v' :# vs' ∧ eq⋅v⋅v' = FF)
⟧ ⟹ K (Node⋅usvs⋅l⋅r)"

declare K.intros[intro!, simp]

lemma sfoldl_op2_root2_rep2_split:
assumes "pat ≠ ⊥"
shows "sfoldl⋅(op2⋅pat)⋅(root2⋅pat)⋅xs = rep2⋅pat⋅(split⋅pat⋅xs)"
proof(induct xs rule: srev_induct)
case (ssnoc x xs) with ‹pat ≠ ⊥› ssnoc show ?case by (clarsimp simp: split_sfoldl_op[symmetric] op_op2_refinement)
qed (simp_all add: ‹pat ≠ ⊥› root2.unfold)

lemma K_rep2:
assumes "pat ≠ ⊥"
assumes "us :@ vs = pat"
shows "K (rep2⋅pat⋅(us, vs))"
using assms
proof(coinduction arbitrary: us vs)
case (K us vs) then show ?case
proof(induct us arbitrary: vs rule: op_induct')
case (step us)
from step.prems have "us ≠ ⊥" "vs ≠ ⊥" by auto
show ?case
proof(cases us)
case bottom with ‹us ≠ ⊥› show ?thesis by simp
next
case snil with step.prems show ?thesis by (cases vs; force simp: rep2.simps)
next
case (scons u' us')
from ‹pat ≠ ⊥› scons ‹us ≠ ⊥› ‹vs ≠ ⊥›
obtain usl vsl where splitl: "split⋅pat⋅us' = (usl, vsl)" "usl ≠ ⊥" "vsl ≠ ⊥" "usl :@ vsl = pat"
by (metis (no_types, opaque_lifting) Rep_cfun_strict1 prod.collapse sappend_strict sappend_strict2 split_pattern)
from scons obtain l r where r: "rep2⋅pat⋅(us, vs) = Node⋅(us, vs)⋅l⋅r" by (simp add: rep2.simps)
moreover
have "(∃us vs. l = rep2⋅pat⋅(us, vs) ∧ us :@ vs = pat) ∨ K l"
proof(cases vs)
case snil with scons splitl r show ?thesis
by (clarsimp simp: rep2.simps sfoldl_op2_root2_rep2_split)
next
case scons
with ‹pat ≠ ⊥› ‹us = u' :# us'› ‹u' ≠ ⊥› ‹us' ≠ ⊥› ‹vs ≠ ⊥› r splitl show ?thesis
apply (clarsimp simp: rep2.simps sfoldl_op2_root2_rep2_split)
apply (cases vsl; cases usl; clarsimp simp: If2_def[symmetric] sfoldl_op2_root2_rep2_split split: If2_splits)
apply (rename_tac ul' usl')
apply (cut_tac us'="prod.fst (split⋅pat⋅usl')" and vs="prod.snd (split⋅pat⋅usl')" in step(1); clarsimp simp: split_pattern)
apply (metis fst_conv lt_trans slength.simps(2) split_length_lt step.prems(1))
apply (erule disjE; clarsimp simp: sfoldl_op2_root2_rep2_split)
apply (rename_tac b l r)
apply (case_tac b; clarsimp simp: rep2.simps)
apply (auto simp: If2_def[symmetric] rep2.simps dest: split_pattern[rotated] split: If2_splits)
done
qed (simp add: ‹vs ≠ ⊥›)
moreover
from ‹us :@ vs = pat› ‹us ≠ ⊥› ‹vs ≠ ⊥› r
have "(∃usr vsr. r = rep2⋅pat⋅(usr, vsr) ∧ usr :@ vsr = pat) ∨ K r"
by (cases vs; clarsimp simp: rep2.simps)
moreover
have "l = Null ∨ (∃v' vs'. csnd⋅(label⋅l) = v' :# vs' ∧ eq⋅v⋅v' = FF)" if "vs = v :# vs'" for v vs'
proof(cases vsl)
case snil with ‹us :@ vs = pat› ‹us = u' :# us'› splitl show ?thesis
using split_length_lt[where pat=pat and xs=us']
by (force elim: slengthE simp: one_Integer_def split: if_splits)
next
case scons
from splitl have "lt⋅(slength⋅usl)⋅(slength⋅us' + 1) = TT"
by (metis fst_conv fst_strict split_bottom_iff split_length_lt)
with scons ‹pat ≠ ⊥› ‹us = u' :# us'› ‹u' ≠ ⊥› ‹us' ≠ ⊥› ‹vs ≠ ⊥› r splitl ‹vs = v :# vs'› show ?thesis
using step(1)[OF _ ‹pat ≠ ⊥›, where us'="prod.fst (split⋅pat⋅us')" and vs="prod.snd (split⋅pat⋅us')"]
by (clarsimp simp: rep2.simps sfoldl_op2_root2_rep2_split If2_def[symmetric] split: If2_splits)
qed (simp add: ‹vsl ≠ ⊥›)
moreover note ‹pat ≠ ⊥› ‹us ≠ ⊥› ‹vs ≠ ⊥›
ultimately show ?thesis by auto
qed
qed
qed

theorem K_root2:
assumes "pat ≠ ⊥"
shows "K (root2⋅pat)"
using assms unfolding root2.unfold by (simp add: K_rep2)

text‹

The remaining steps are as follows:

▪ 3. introduce an accumulating parameter (‹grep›).
▪ 4. inline ‹rep› and simplify.
▪ 5. simplify to Bird's simpler forms.''
▪ 6. memoise ‹left›.
▪ 7. simplify, unfold ‹prefix›.
▪ 9. factor out ‹pat›.

›

subsection‹ Step 3: Introduce an accumulating parameter (grep) ›

text‹

Next we prepare for the second memoization step (\S\ref{sec:KMP:increase_sharing})
by introducing an accumulating parameter to @{const ‹rep2›} that supplies the value of the left
subtree.

We retain @{const ‹rep2›} as a wrapper for now, and inline @{const ‹right2›} to speed up
simplification.

›

fixrec ―‹ Bird p131 / p132 ›
root3  :: "[:'a::Eq_def:] → ([:'a:] × [:'a:]) tree"
and op3    :: "[:'a:] → ([:'a:] × [:'a:]) tree → 'a → ([:'a:] × [:'a:]) tree"
and rep3   :: "[:'a:] → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree"
and grep3  :: "[:'a:] → ([:'a:] × [:'a:]) tree → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree"
where
[simp del]:
"root3⋅pat = rep3⋅pat⋅([::], pat)"
| "op3⋅pat⋅Null⋅x = root3⋅pat"
| "usvs ≠ ⊥ ⟹
op3⋅pat⋅(Node⋅usvs⋅l⋅r)⋅x = If prefix⋅[:x:]⋅(csnd⋅usvs) then r else op3⋅pat⋅l⋅x"
| [simp del]: ―‹ Inline @{const ‹left2›}, factor out @{const ‹next›}. ›
"rep3⋅pat⋅usvs = grep3⋅pat⋅(case cfst⋅usvs of [::] ⇒ Null | u :# us ⇒ sfoldl⋅(op3⋅pat)⋅(root3⋅pat)⋅us)⋅usvs"
| [simp del]:  ―‹ @{const ‹rep2›} with @{const ‹left2›} abstracted, @{const ‹right2›} inlined. ›
"grep3⋅pat⋅l⋅usvs = Node⋅usvs⋅(next⋅(csnd⋅usvs)⋅l)⋅(case csnd⋅usvs of
[::] ⇒ Null
| v :# vs ⇒ rep3⋅pat⋅(cfst⋅usvs :@ [:v:], vs))"

schematic_goal root3_op3_rep3_grep3_def:
"( root3  :: [:'a::Eq_def:] → ([:'a:] × [:'a:]) tree
, op3    :: [:'a:] → ([:'a:] × [:'a:]) tree → 'a → ([:'a:] × [:'a:]) tree
, rep3   :: [:'a:] → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree
, grep3  :: [:'a:] → ([:'a:] × [:'a:]) tree → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree )
= fix⋅?F"
unfolding root3_def op3_def rep3_def grep3_def by simp

lemma r3_2:
"(Λ (root, op, rep, grep). (root, op, rep))⋅
( root3  :: [:'a::Eq_def:] → ([:'a:] × [:'a:]) tree
, op3    :: [:'a:] → ([:'a:] × [:'a:]) tree → 'a → ([:'a:] × [:'a:]) tree
, rep3   :: [:'a:] → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree
, grep3  :: [:'a:] → ([:'a:] × [:'a:]) tree → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree )
= (Λ (root, op, rep, left, right). (root, op, rep))⋅
( root2  :: [:'a::Eq_def:] → ([:'a:] × [:'a:]) tree
, op2    :: [:'a:] → ([:'a:] × [:'a:]) tree → 'a → ([:'a:] × [:'a:]) tree
, rep2   :: [:'a::Eq_def:] → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree
, left2  :: [:'a::Eq_def:] → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree
, right2 :: [:'a::Eq_def:] → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree )"
unfolding root2_op2_rep2_left2_right2_def root3_op3_rep3_grep3_def
apply (simp add: match_snil_match_scons_slist_case match_Null_match_Node_tree_case slist_case_distr tree_case_distr)
apply (simp add: fix_cprod fix_const) ―‹ Very slow. Sensitive to tuple order due to the asymmetry of ‹fix_cprod›. ›
done

subsection‹ Step 4: Inline rep ›

text‹

We further simplify by inlining @{const ‹rep3›} into @{const ‹root3›} and @{const ‹grep3›}.

›

fixrec
root4  :: "[:'a::Eq_def:] → ([:'a:] × [:'a:]) tree"
and op4    :: "[:'a:] → ([:'a:] × [:'a:]) tree → 'a → ([:'a:] × [:'a:]) tree"
and grep4  :: "[:'a:] → ([:'a:] × [:'a:]) tree → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree"
where
[simp del]:
"root4⋅pat = grep4⋅pat⋅Null⋅([::], pat)"
| "op4⋅pat⋅Null⋅x = root4⋅pat"
| "usvs ≠ ⊥ ⟹
op4⋅pat⋅(Node⋅usvs⋅l⋅r)⋅x = If prefix⋅[:x:]⋅(csnd⋅usvs) then r else op4⋅pat⋅l⋅x"
| [simp del]:
"grep4⋅pat⋅l⋅usvs = Node⋅usvs⋅(next⋅(csnd⋅usvs)⋅l)⋅(case csnd⋅usvs of
[::] ⇒ Null
| v :# vs ⇒ grep4⋅pat⋅(case cfst⋅usvs :@ [:v:] of
[::] ⇒ Null ―‹ unreachable ›
| u :# us ⇒ sfoldl⋅(op4⋅pat)⋅(root4⋅pat)⋅us)⋅(cfst⋅usvs :@ [:v:], vs))"

schematic_goal root4_op4_grep4_def:
"( root4  :: [:'a::Eq_def:] → ([:'a:] × [:'a:]) tree
, op4    :: [:'a:] → ([:'a:] × [:'a:]) tree → 'a → ([:'a:] × [:'a:]) tree
, grep4  :: [:'a:] → ([:'a:] × [:'a:]) tree → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree )
= fix⋅?F"
unfolding root4_def op4_def grep4_def by simp

lemma fix_syn4_permute:
assumes "cont (λ(X1, X2, X3, X4). F1 X1 X2 X3 X4)"
assumes "cont (λ(X1, X2, X3, X4). F2 X1 X2 X3 X4)"
assumes "cont (λ(X1, X2, X3, X4). F3 X1 X2 X3 X4)"
assumes "cont (λ(X1, X2, X3, X4). F4 X1 X2 X3 X4)"
shows "fix_syn (λ(X1, X2, X3, X4). (F1 X1 X2 X3 X4, F2 X1 X2 X3 X4, F3 X1 X2 X3 X4, F4 X1 X2 X3 X4))
= (λ(x1, x2, x4, x3). (x1, x2, x3, x4))
(fix_syn (λ(X1, X2, X4, X3). (F1 X1 X2 X3 X4, F2 X1 X2 X3 X4, F4 X1 X2 X3 X4, F3 X1 X2 X3 X4)))"
by (induct rule: parallel_fix_ind) (use assms in ‹auto simp: prod_cont_iff›)

lemma r4_3:
"( root4  :: [:'a::Eq_def:] → ([:'a:] × [:'a:]) tree
, op4    :: [:'a:] → ([:'a:] × [:'a:]) tree → 'a → ([:'a:] × [:'a:]) tree
, grep4  :: [:'a:] → ([:'a:] × [:'a:]) tree → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree )
= (Λ (root, op, rep, grep). (root, op, grep))⋅
( root3  :: [:'a::Eq_def:] → ([:'a:] × [:'a:]) tree
, op3    :: [:'a:] → ([:'a:] × [:'a:]) tree → 'a → ([:'a:] × [:'a:]) tree
, rep3   :: [:'a:] → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree
, grep3  :: [:'a:] → ([:'a:] × [:'a:]) tree → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree )"
unfolding root3_op3_rep3_grep3_def root4_op4_grep4_def
apply (clarsimp simp: slist_case_distr match_Null_match_Node_tree_case tree_case_distr eta_cfun)
apply (subst fix_syn4_permute; clarsimp simp: fix_cprod fix_const) ―‹ Slow ›
done

subsection‹ Step 5: Simplify to Bird's simpler forms'' ›

text‹

The remainder of @{const ‹left2›} in @{const ‹grep4›} can be simplified by transforming the
@{text "case"} scrutinee from @{term "cfst⋅usvs :@ [:v:]"} into @{term "cfst⋅usvs"}.

›

fixrec
root5  :: "[:'a::Eq_def:] → ([:'a:] × [:'a:]) tree"
and op5    :: "[:'a::Eq_def:] → ([:'a:] × [:'a:]) tree → 'a → ([:'a:] × [:'a:]) tree"
and grep5  :: "[:'a:] → ([:'a:] × [:'a:]) tree → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree"
where
[simp del]:
"root5⋅pat = grep5⋅pat⋅Null⋅([::], pat)"
| "op5⋅pat⋅Null⋅x = root5⋅pat"
| "usvs ≠ ⊥ ⟹
op5⋅pat⋅(Node⋅usvs⋅l⋅r)⋅x = If prefix⋅[:x:]⋅(csnd⋅usvs) then r else op5⋅pat⋅l⋅x"
| [simp del]:
"grep5⋅pat⋅l⋅usvs = Node⋅usvs⋅(next⋅(csnd⋅usvs)⋅l)⋅(case csnd⋅usvs of
[::] ⇒ Null
| v :# vs ⇒ grep5⋅pat⋅(case cfst⋅usvs of ― ‹ was @{term ‹cfst⋅usvs :@ [:v:]›} ›
[::] ⇒ root5⋅pat
| u :# us ⇒ sfoldl⋅(op5⋅pat)⋅(root5⋅pat)⋅(us :@ [:v:]))⋅(cfst⋅usvs :@ [:v:], vs))"

schematic_goal root5_op5_grep5_def:
"( root5  :: [:'a::Eq_def:] → ([:'a:] × [:'a:]) tree
, op5    :: [:'a:] → ([:'a:] × [:'a:]) tree → 'a → ([:'a:] × [:'a:]) tree
, grep5  :: [:'a:] → ([:'a:] × [:'a:]) tree → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree )
= fix⋅?F"
unfolding root5_def op5_def grep5_def by simp

lemma op5_grep5_strict[simp]:
"op5⋅pat⋅⊥ = ⊥"
"op5⋅pat⋅(Node⋅(us, ⊥)⋅l⋅r) = ⊥"
"op5⋅pat⋅(Node⋅usvs⋅l⋅r)⋅⊥ = ⊥"
"grep5⋅pat⋅l⋅⊥ = ⊥"
apply fixrec_simp
apply (cases "us = ⊥"; fixrec_simp; simp)
apply (cases "usvs = ⊥"; fixrec_simp; simp)
apply fixrec_simp
done

lemma r5_4:
"( root5  :: [:'a::Eq_def:] → ([:'a:] × [:'a:]) tree
, op5    :: [:'a::Eq_def:] → ([:'a:] × [:'a:]) tree → 'a → ([:'a:] × [:'a:]) tree
, grep5  :: [:'a:] → ([:'a:] × [:'a:]) tree → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree )
= ( root4  :: [:'a::Eq_def:] → ([:'a:] × [:'a:]) tree
, op4    :: [:'a::Eq_def:] → ([:'a:] × [:'a:]) tree → 'a → ([:'a:] × [:'a:]) tree
, grep4  :: [:'a:] → ([:'a:] × [:'a:]) tree → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree )"
unfolding root4_op4_grep4_def root5_op5_grep5_def
by (clarsimp simp: slist_case_distr slist_case_snoc stail_sappend cong: slist_case_cong)

subsection‹ Step 6: Memoize left \label{sec:KMP:increase_sharing} ›

text‹

The last substantial step is to memoise the computation of the left subtrees by tying the knot.

Note this makes the computation of ‹us› in the tree redundant; we remove it in \S\ref{sec:KMP:step8}.

›

fixrec ―‹ Bird p132 ›
root6  :: "[:'a::Eq_def:] → ([:'a:] × [:'a:]) tree"
and op6    :: "[:'a::Eq_def:] → ([:'a:] × [:'a:]) tree → 'a → ([:'a:] × [:'a:]) tree"
and grep6  :: "[:'a:] → ([:'a:] × [:'a:]) tree → [:'a:] × [:'a:] → ([:'a:] × [:'a:]) tree"
where
[simp del]:
"root6⋅pat = grep6⋅pat⋅Null⋅([::], pat)"
| "op6⋅pat⋅Null⋅x = root6⋅pat"
| "usvs ≠ ⊥ ⟹
op6⋅pat⋅(Node⋅usvs⋅l⋅r)⋅x = If prefix⋅[:x:]⋅(csnd⋅usvs) then r else op6⋅pat⋅l⋅x"
| [simp del]:
"grep6⋅pat⋅l⋅usvs = Node⋅usvs⋅(next⋅(csnd⋅usvs)⋅l)⋅(case csnd⋅usvs of
[::] ⇒ Null
| v :# vs ⇒ grep6⋅pat⋅(op6⋅pat⋅l⋅v)⋅(cfst⋅usvs :@ [:v:], vs))"

schematic_goal root6_op6_grep6_def:
"( root6  :: [:'a::Eq_def:] → ([:'a:] × [:'a:]) tree
, op6    :: [:'a:] → ([:'a:] × [:'a:]) tree → 'a → ([:'a:] × [:'a:]) tree
, grep6  :: [:'a:] → ([:'a:] × [:'a:]) tree → [:'a:] × [:'a:] → ([:'a:] × [:'a`