Thursday, July 14, 2011

Combinatory Regular Expressions in Haskell

Combinatory parsing fascinates me. It is an area that has open problems, but also a lot of solved problems that make great exercises. A while ago I tried to define a combinatory regular expression matcher in F#, and was puzzled at how complex it turned out to be. As usual, marginal understanding was to blame.

Now I turned to Haskell for a change. I do think it helped, to learn more both about the problem and about Haskell. I hope to backport this exercise to the ML world, in the meanwhile here is the gist:

{-# OPTIONS -XGADTs #-}
-- This is an excercise in constructing a combinator-based regular
-- expression matcher for self-education.
--
-- Special thanks to Roman Cheplyaka (@shebang) for pointing out to me
-- the need for state reduction to eliminate exponential complexity on
-- certain benchmarks.
module Regex (Regex, SM, token, compile, run, char, string) where
-- Regular expressions are parameterized by the token and return
-- types. For convenience I use GADTs to capture different ways regex
-- shapes as constructors, and compile them later.
--
-- There is a very curious situation with the Kleene star. I included
-- it initially, and indeed it seems that in ML it would be necessary.
-- However Haskell admits a definition:
--
-- star :: Regex t r -> Regex t [r]
-- star x = s where s = Choice (Zip (:) x s) (Empty [])
--
-- It is a lot less efficient than a custom case would be, but it
-- works as expected. `Control.Applicative` contains `many` that is
-- the generalized version of `star`.
import Control.Applicative
import Control.Monad.State
import Data.Char
import Data.List
import qualified Data.Set as S
data Regex t r where
Empty :: r -> Regex t r
Fail :: Regex t r
Map :: (a -> b) -> Regex t a -> Regex t b
Zip :: (a -> b -> r) -> Regex t a -> Regex t b -> Regex t r
Choice :: Regex t r -> Regex t r -> Regex t r
Token :: (t -> Maybe r) -> Regex t r
instance Functor (Regex t) where
fmap = Map
instance Applicative (Regex t) where
pure = Empty
(<*>) = Zip ($)
instance Alternative (Regex t) where
empty = Fail
(<|>) = Choice
-- In order to recognize state sharing I need to compare
-- states. Physical equality is not available in Haskell, so instead I
-- mark them with fresh identifiers.
newtype Id = Id Integer deriving (Eq, Ord, Show)
data Gen = Gen Integer Integer
type Fresh a = State Gen a
initial :: Gen
initial = Gen 1 0
gen :: Gen -> (Id, Gen)
gen (Gen a b) = (Id b, Gen a (a + b))
split :: Gen -> (Gen, Gen)
split (Gen a b) = (Gen (2 * a) b, Gen (2 * a) (a + b))
fresh :: Fresh Id
fresh = do (id, next) <- gets gen
modify (const next)
return id
-- State trees may be infinite and therefore I need to sometimes split
-- ID generation.
par :: Fresh a -> Fresh b -> Fresh (a, b)
par a b = do (g0, g1) <- gets split
let (g2, g3) = split g0
(x, _) = runState a g2
(y, _) = runState b g3
modify (const g1)
return (x, y)
-- State machines represent compiled regular expressions. The machine
-- is either in an final state (accept or reject), or is waiting for
-- the next token. Continuations are provided for no token (end of
-- stream) and every possible next token. Waiting states are marked,
-- so that forked states can be reduced by eliminating duplicates.
data SM t r where
Accept :: r -> SM t r
Reject :: SM t r
Expect :: Id -> SM t r -> (t -> SM t r) -> SM t r
Fork :: SM t r -> SM t r -> SM t r
instance Functor (SM t) where
fmap f (Accept x) = Accept (f x)
fmap f Reject = Reject
fmap f (Expect i z e) = Expect i (fmap f z) (fmap f . e)
fmap f (Fork a b) = Fork (fmap f a) (fmap f b)
-- Identical state reduction happens here. Backup values implement
-- greedy matching (see machine runner below).
cut :: Maybe r -> SM t r -> (Maybe r, SM t r)
cut backup sm = (backup' `mplus` backup, sm') where
(sm', (backup', _)) = runState (mk sm) (Nothing, S.empty)
jn a@(Accept _) _ = return a
jn a b@(Accept x) = do let up (Nothing, s) = (Just x, s)
up x = x
modify up
return a
jn Reject x = return x
jn x Reject = return x
jn x y = return $ Fork x y
mk (Fork a b) = do ax <- mk a
bx <- mk b
jn ax bx
mk sm@(Expect id _ _) = do let f (_, s) = S.member id s
let up (x, s) = (x, S.insert id s)
seen <- gets f
if seen
then return Reject
else do modify up
return sm
mk sm = return sm
-- Compilation uses CPS with accept and reject continuations.
compile :: Regex t r -> SM t r
compile rx = fst $ runState (c (Accept id) Reject rx) initial
c :: SM t (a -> r) -> SM t r -> Regex t a -> Fresh (SM t r)
c y n (Empty x) = return $ fmap ($x) y
c y n Fail = return $ n
c y n (Map f a) = c (fmap (\r x -> r (f x)) y) n a
c y n (Zip f a b) = do let k r x y = r (f y x)
bm <- c (fmap k y) (fmap (\x _ -> x) n) b
c bm n a
c y n (Choice a b) = do (am, bm) <- c y n a `par` c y n b
return $ Fork am bm
c y n (Token f) = do i <- fresh
let e t = p t $ f t
p t (Just x) = fmap ($x) y
p t Nothing = push t n
return $ Expect i n e where
push :: t -> SM t r -> SM t r
push t (Expect _ _ f) = f t
push t (Fork a b) = Fork (push t a) (push t b)
push _ x = x
advance :: SM t r -> SM t r
advance (Expect _ z _) = z
advance (Fork a b) = Fork (advance a) (advance b)
advance x = x
-- Finally, the machine runner function.
run :: SM t r -> [t] -> Maybe r
run sm ts = r Nothing sm ts where
r _ (Accept x) _ = Just x
r b Reject _ = b
r b sm (t:ts) = r b' (push t sm') ts where (b', sm') = cut b sm
r b sm [] = r b' (advance sm') [] where (b', sm') = cut b sm
string :: Eq t => [t] -> Regex t [t]
string [] = Empty []
string (c:cs) = Zip (:) (char c) (string cs)
token :: (t -> Maybe r) -> Regex t r
token = Token
char :: Eq t => t -> Regex t t
char c = Token (\x -> if x == c then Just c else Nothing)
view raw Regex.hs hosted with ❤ by GitHub