Now I turned to Haskell for a change. I do think it helped, to learn more both about the problem and about Haskell. I hope to backport this exercise to the ML world, in the meanwhile here is the gist:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# OPTIONS -XGADTs #-} | |
-- This is an excercise in constructing a combinator-based regular | |
-- expression matcher for self-education. | |
-- | |
-- Special thanks to Roman Cheplyaka (@shebang) for pointing out to me | |
-- the need for state reduction to eliminate exponential complexity on | |
-- certain benchmarks. | |
module Regex (Regex, SM, token, compile, run, char, string) where | |
-- Regular expressions are parameterized by the token and return | |
-- types. For convenience I use GADTs to capture different ways regex | |
-- shapes as constructors, and compile them later. | |
-- | |
-- There is a very curious situation with the Kleene star. I included | |
-- it initially, and indeed it seems that in ML it would be necessary. | |
-- However Haskell admits a definition: | |
-- | |
-- star :: Regex t r -> Regex t [r] | |
-- star x = s where s = Choice (Zip (:) x s) (Empty []) | |
-- | |
-- It is a lot less efficient than a custom case would be, but it | |
-- works as expected. `Control.Applicative` contains `many` that is | |
-- the generalized version of `star`. | |
import Control.Applicative | |
import Control.Monad.State | |
import Data.Char | |
import Data.List | |
import qualified Data.Set as S | |
data Regex t r where | |
Empty :: r -> Regex t r | |
Fail :: Regex t r | |
Map :: (a -> b) -> Regex t a -> Regex t b | |
Zip :: (a -> b -> r) -> Regex t a -> Regex t b -> Regex t r | |
Choice :: Regex t r -> Regex t r -> Regex t r | |
Token :: (t -> Maybe r) -> Regex t r | |
instance Functor (Regex t) where | |
fmap = Map | |
instance Applicative (Regex t) where | |
pure = Empty | |
(<*>) = Zip ($) | |
instance Alternative (Regex t) where | |
empty = Fail | |
(<|>) = Choice | |
-- In order to recognize state sharing I need to compare | |
-- states. Physical equality is not available in Haskell, so instead I | |
-- mark them with fresh identifiers. | |
newtype Id = Id Integer deriving (Eq, Ord, Show) | |
data Gen = Gen Integer Integer | |
type Fresh a = State Gen a | |
initial :: Gen | |
initial = Gen 1 0 | |
gen :: Gen -> (Id, Gen) | |
gen (Gen a b) = (Id b, Gen a (a + b)) | |
split :: Gen -> (Gen, Gen) | |
split (Gen a b) = (Gen (2 * a) b, Gen (2 * a) (a + b)) | |
fresh :: Fresh Id | |
fresh = do (id, next) <- gets gen | |
modify (const next) | |
return id | |
-- State trees may be infinite and therefore I need to sometimes split | |
-- ID generation. | |
par :: Fresh a -> Fresh b -> Fresh (a, b) | |
par a b = do (g0, g1) <- gets split | |
let (g2, g3) = split g0 | |
(x, _) = runState a g2 | |
(y, _) = runState b g3 | |
modify (const g1) | |
return (x, y) | |
-- State machines represent compiled regular expressions. The machine | |
-- is either in an final state (accept or reject), or is waiting for | |
-- the next token. Continuations are provided for no token (end of | |
-- stream) and every possible next token. Waiting states are marked, | |
-- so that forked states can be reduced by eliminating duplicates. | |
data SM t r where | |
Accept :: r -> SM t r | |
Reject :: SM t r | |
Expect :: Id -> SM t r -> (t -> SM t r) -> SM t r | |
Fork :: SM t r -> SM t r -> SM t r | |
instance Functor (SM t) where | |
fmap f (Accept x) = Accept (f x) | |
fmap f Reject = Reject | |
fmap f (Expect i z e) = Expect i (fmap f z) (fmap f . e) | |
fmap f (Fork a b) = Fork (fmap f a) (fmap f b) | |
-- Identical state reduction happens here. Backup values implement | |
-- greedy matching (see machine runner below). | |
cut :: Maybe r -> SM t r -> (Maybe r, SM t r) | |
cut backup sm = (backup' `mplus` backup, sm') where | |
(sm', (backup', _)) = runState (mk sm) (Nothing, S.empty) | |
jn a@(Accept _) _ = return a | |
jn a b@(Accept x) = do let up (Nothing, s) = (Just x, s) | |
up x = x | |
modify up | |
return a | |
jn Reject x = return x | |
jn x Reject = return x | |
jn x y = return $ Fork x y | |
mk (Fork a b) = do ax <- mk a | |
bx <- mk b | |
jn ax bx | |
mk sm@(Expect id _ _) = do let f (_, s) = S.member id s | |
let up (x, s) = (x, S.insert id s) | |
seen <- gets f | |
if seen | |
then return Reject | |
else do modify up | |
return sm | |
mk sm = return sm | |
-- Compilation uses CPS with accept and reject continuations. | |
compile :: Regex t r -> SM t r | |
compile rx = fst $ runState (c (Accept id) Reject rx) initial | |
c :: SM t (a -> r) -> SM t r -> Regex t a -> Fresh (SM t r) | |
c y n (Empty x) = return $ fmap ($x) y | |
c y n Fail = return $ n | |
c y n (Map f a) = c (fmap (\r x -> r (f x)) y) n a | |
c y n (Zip f a b) = do let k r x y = r (f y x) | |
bm <- c (fmap k y) (fmap (\x _ -> x) n) b | |
c bm n a | |
c y n (Choice a b) = do (am, bm) <- c y n a `par` c y n b | |
return $ Fork am bm | |
c y n (Token f) = do i <- fresh | |
let e t = p t $ f t | |
p t (Just x) = fmap ($x) y | |
p t Nothing = push t n | |
return $ Expect i n e where | |
push :: t -> SM t r -> SM t r | |
push t (Expect _ _ f) = f t | |
push t (Fork a b) = Fork (push t a) (push t b) | |
push _ x = x | |
advance :: SM t r -> SM t r | |
advance (Expect _ z _) = z | |
advance (Fork a b) = Fork (advance a) (advance b) | |
advance x = x | |
-- Finally, the machine runner function. | |
run :: SM t r -> [t] -> Maybe r | |
run sm ts = r Nothing sm ts where | |
r _ (Accept x) _ = Just x | |
r b Reject _ = b | |
r b sm (t:ts) = r b' (push t sm') ts where (b', sm') = cut b sm | |
r b sm [] = r b' (advance sm') [] where (b', sm') = cut b sm | |
string :: Eq t => [t] -> Regex t [t] | |
string [] = Empty [] | |
string (c:cs) = Zip (:) (char c) (string cs) | |
token :: (t -> Maybe r) -> Regex t r | |
token = Token | |
char :: Eq t => t -> Regex t t | |
char c = Token (\x -> if x == c then Just c else Nothing) | |