I would like to parse a basic indented language using Megaparsec. Originally I was using Parsec which I managed to get working correctly with indentation but now I'm having quite some trouble.
I've been following a tutorial here and here's the code I have to parse a language ignoring indentation.
module Parser where
import Data.Functor ((<$>), (<$))
import Control.Applicative (Applicative(..))
import qualified Control.Monad as M
import Control.Monad (void)
import Data.Functor.Identity
import Data.Text (Text)
import qualified Data.Text as Text
import Data.Void
import Text.Megaparsec
import Text.Megaparsec.Char
import Text.Megaparsec.Perm
import Text.Megaparsec.Expr
import qualified Text.Megaparsec.Char.Lexer as L
import Text.Pretty.Simple
import Data.Either.Unwrap
--import Lexer
import Syntax
type Parser = Parsec Void String
lineComment :: Parser ()
lineComment = L.skipLineComment "#"
scn :: Parser ()
scn = L.space space1 lineComment empty
sc :: Parser () -- ‘sc’ stands for “space consumer”
sc = L.space (void $ takeWhile1P Nothing f) lineComment empty
where
f x = x == ' ' || x == '\t'
lexeme :: Parser a -> Parser a
lexeme = L.lexeme sc
symbol :: String -> Parser String
symbol = L.symbol sc
integer :: Parser Integer
integer = lexeme L.decimal
semi :: Parser String
semi = symbol ";"
rword :: String -> Parser ()
rword w = lexeme (string w *> notFollowedBy alphaNumChar)
rws :: [String] -- list of reserved words
rws = ["if","then","else","while","do","skip","true","false","not","and","or"]
identifier :: Parser String
identifier = (lexeme . try) (p >>= check)
where
p = (:) <$> letterChar <*> many alphaNumChar
check x = if x `elem` rws
then fail $ "keyword " ++ show x ++ " cannot be an identifier"
else return x
parens :: Parser a -> Parser a
parens = between (symbol "(") (symbol ")")
whileParser :: Parser Stmt
whileParser = between sc eof stmt
stmt :: Parser Stmt
stmt = f <$> sepBy1 stmt' semi
where
-- if there's only one stmt return it without using ‘Seq’
f l = if length l == 1 then head l else Seq l
stmt' :: Parser Stmt
stmt' = ifStmt
<|> whileStmt
<|> skipStmt
<|> assignStmt
<|> parens stmt
ifStmt :: Parser Stmt
ifStmt = do
rword "if"
cond <- bExpr
rword "then"
stmt1 <- stmt
rword "else"
stmt2 <- stmt
return (If cond stmt1 stmt2)
whileStmt :: Parser Stmt
whileStmt = do
rword "while"
cond <- bExpr
rword "do"
stmt1 <- stmt
return (While cond stmt1)
assignStmt :: Parser Stmt
assignStmt = do
var <- identifier
void (symbol ":=")
expr <- aExpr
return (Assign var expr)
skipStmt :: Parser Stmt
skipStmt = Skip <$ rword "skip"
aExpr :: Parser AExpr
aExpr = makeExprParser aTerm aOperators
bExpr :: Parser BExpr
bExpr = makeExprParser bTerm bOperators
aOperators :: [[Operator Parser AExpr]]
aOperators =
[ [Prefix (Neg <$ symbol "-") ]
, [ InfixL (ABinary Multiply <$ symbol "*")
, InfixL (ABinary Divide <$ symbol "/") ]
, [ InfixL (ABinary Add <$ symbol "+")
, InfixL (ABinary Subtract <$ symbol "-") ]
]
bOperators :: [[Operator Parser BExpr]]
bOperators =
[ [Prefix (Not <$ rword "not") ]
, [InfixL (BBinary And <$ rword "and")
, InfixL (BBinary Or <$ rword "or") ]
]
aTerm :: Parser AExpr
aTerm = parens aExpr
<|> Var <$> identifier
<|> IntConst <$> integer
bTerm :: Parser BExpr
bTerm = parens bExpr
<|> (BoolConst True <$ rword "true")
<|> (BoolConst False <$ rword "false")
<|> rExpr
rExpr :: Parser BExpr
rExpr = do
a1 <- aExpr
op <- relation
a2 <- aExpr
return (RBinary op a1 a2)
relation :: Parser RBinOp
relation = (symbol ">" *> pure Greater)
<|> (symbol "<" *> pure Less)
parsePrint :: String -> IO()
parsePrint s = do
parseTest stmt' s
Running this parses correctly.
parsePrint $ unlines
[ "while (true) do if(false) then x := 5 else y := 20"
]
This is the code for parsing indentation from the second tutorial here.
{-# LANGUAGE TupleSections #-}
module Main where
import Control.Applicative (empty)
import Control.Monad (void)
import Data.Void
import Data.Char (isAlphaNum)
import Text.Megaparsec
import Text.Megaparsec.Char
import qualified Text.Megaparsec.Char.Lexer as L
type Parser = Parsec Void String
lineComment :: Parser ()
lineComment = L.skipLineComment "#"
scn :: Parser ()
scn = L.space space1 lineComment empty
sc :: Parser ()
sc = L.space (void $ takeWhile1P Nothing f) lineComment empty
where
f x = x == ' ' || x == '\t'
lexeme :: Parser a -> Parser a
lexeme = L.lexeme sc
pItem :: Parser String
pItem = lexeme (takeWhile1P Nothing f) <?> "list item"
where
f x = isAlphaNum x || x == '-'
pComplexItem :: Parser (String, [String])
pComplexItem = L.indentBlock scn p
where
p = do
header <- pItem
return (L.IndentMany Nothing (return . (header, )) pLineFold)
pLineFold :: Parser String
pLineFold = L.lineFold scn $ \sc' ->
let ps = takeWhile1P Nothing f `sepBy1` try sc'
f x = isAlphaNum x || x == '-'
in unwords <$> ps <* sc
pItemList :: Parser (String, [(String, [String])])
pItemList = L.nonIndented scn (L.indentBlock scn p)
where
p = do
header <- pItem
return (L.IndentSome Nothing (return . (header, )) pComplexItem)
parser :: Parser (String, [(String, [String])])
parser = pItemList <* eof
main :: IO ()
main = return ()
I would like as an example for this to parse correctly.
parsePrint $ unlines
[ "while (true) do"
, " if(false) then x := 5 else y := 20"
]
How could I parse indentation correctly? Also are there any other places with tutorials/documentation on using Megaparsec?
After spending a lot of time on this over the last couple of weeks I managed to work it out. It was a matter of moving from using strings to using my own "Expr" data type.
For anybody else who would like to start writing an indented language this code could be a good start!
Parser
{-# LANGUAGE TupleSections #-}
module IndentTest where
import Control.Applicative (empty)
import Control.Monad (void)
import Data.Void
import Data.Char (isAlphaNum)
import Text.Megaparsec
import Text.Megaparsec.Char
import qualified Text.Megaparsec.Char.Lexer as L
import Text.Megaparsec.Expr
import Block
type Parser = Parsec Void String
-- Tokens
lineComment :: Parser ()
lineComment = L.skipLineComment "#"
scn :: Parser ()
scn = L.space space1 lineComment empty
sc :: Parser ()
sc = L.space (void $ takeWhile1P Nothing f) lineComment empty
where
f x = x == ' ' || x == '\t'
symbol :: String -> Parser String
symbol = L.symbol sc
rword :: String -> Parser ()
rword w = lexeme (string w *> notFollowedBy alphaNumChar)
rws :: [String] -- list of reserved words
rws = ["module", "println", "import", "let", "if","then","else","while","do","skip","true","false","not","and","or"]
word :: Parser String
word = (lexeme . try) (p >>= check)
where
p = (:) <$> alphaNumChar <*> many alphaNumChar
check x = if x `elem` rws
then fail $ "keyword " ++ show x ++ " cannot be an word"
else return x
lexeme :: Parser a -> Parser a
lexeme = L.lexeme sc
integer :: Parser Integer
integer = lexeme L.decimal
parens :: Parser a -> Parser a
parens = between (symbol "(") (symbol ")")
aTerm :: Parser AExpr
aTerm = parens aExpr
<|> Var <$> identifier
<|> IntConst <$> integer
aOperators :: [[Operator Parser AExpr]]
aOperators =
[ [Prefix (Neg <$ symbol "-") ]
, [ InfixL (ABinary Multiply <$ symbol "*")
, InfixL (ABinary Divide <$ symbol "/") ]
, [ InfixL (ABinary Add <$ symbol "+")
, InfixL (ABinary Subtract <$ symbol "-") ]
]
aExpr :: Parser AExpr
aExpr = makeExprParser aTerm aOperators
assignArith :: Parser Expr
assignArith = do
var <- identifier
symbol ":"
vType <- valType
symbol "="
e <- aExpr
return $ AssignArith vType var e
bTerm :: Parser BExpr
bTerm = parens bExpr
<|> (BoolConst True <$ rword "true")
<|> (BoolConst False <$ rword "false")
<|> rExpr
bOperators :: [[Operator Parser BExpr]]
bOperators =
[ [Prefix (Not <$ rword "not") ]
, [InfixL (BBinary And <$ rword "and")
, InfixL (BBinary Or <$ rword "or") ]
]
bExpr :: Parser BExpr
bExpr = makeExprParser bTerm bOperators
rExpr :: Parser BExpr
rExpr = do
a1 <- aExpr
op <- relation
a2 <- aExpr
return (RBinary op a1 a2)
relation :: Parser RBinOp
relation = (symbol ">" *> pure Greater)
<|> (symbol "<" *> pure Less)
identifier :: Parser String
identifier = (lexeme . try) (p >>= check)
where
p = (:) <$> letterChar <*> many alphaNumChar
check x = if x `elem` rws
then fail $ "keyword " ++ show x ++ " cannot be an identifier"
else return x
stringLiteral :: Parser Expr
stringLiteral = do
value <- char '"' >> manyTill L.charLiteral (char '"')
symbol ";"
return $ StringLiteral value
assignString :: Parser Expr
assignString = do
var <- identifier
symbol ":"
vType <- valType
symbol "="
e <- stringLiteral
return (AssignString vType var e)
arrayDef :: Parser Expr
arrayDef = do
name <- identifier
symbol ":"
symbol "["
arrType <- word
symbol "]"
symbol "="
return $ ArrayDef arrType name
arrayValues :: Parser Expr
arrayValues = do
symbol "["
values <- many identifier
symbol "]"
return $ ArrayValues values
arrayAssign :: Parser Expr
arrayAssign = do
def <- arrayDef
values <- arrayValues
return $ ArrayAssignment def values
arrayElementSelect :: Parser Expr
arrayElementSelect = do
symbol "!!"
elementNum <- word
return $ ArrayElementSelect elementNum
moduleParser :: Parser Expr
moduleParser = L.nonIndented scn (L.indentBlock scn p)
where
p = do
rword "module"
name <- identifier
return (L.IndentSome Nothing (return . (Module name)) expr')
valType :: Parser Expr
valType = do
value <- identifier
return $ Type value
argumentType :: Parser Expr
argumentType = do
value <- identifier
return $ ArgumentType value
returnType :: Parser Expr
returnType = do
value <- identifier
return $ ReturnType value
argument :: Parser Expr
argument = do
value <- identifier
return $ Argument value
-- Function parser
functionParser :: Parser Expr
functionParser = L.indentBlock scn p
where
p = do
name <- identifier
symbol ":"
argTypes <- some argumentType
symbol "->"
rType <- IndentTest.returnType
nameDup <- L.lineFold scn $ \sp' ->
(identifier) `sepBy1` try sp' <* scn
args <- many argument
symbol "="
if(name == "main") then
return (L.IndentMany Nothing (return . (MainFunction name argTypes args rType)) expr')
else
return (L.IndentMany Nothing (return . (Function name argTypes args rType)) expr')
functionCallParser :: Parser Expr
functionCallParser = do
name <- identifier
args <- parens $ many argument
return $ FunctionCall name args
printParser :: Parser Expr
printParser = do
rword "println"
bodyArr <- identifier
symbol ";"
return $ Print bodyArr
valueToken :: Parser String
valueToken = lexeme (takeWhile1P Nothing f) <?> "list item"
where
f x = isAlphaNum x || x == '-'
ifStmt :: Parser Expr
ifStmt = L.indentBlock scn p
where
p = do
rword "if"
cond <- bExpr
return (L.IndentMany Nothing (return . (If cond)) expr')
elseStmt :: Parser Expr
elseStmt = L.indentBlock scn p
where
p = do
rword "else"
return (L.IndentMany Nothing (return . (Else)) expr')
whereStmt :: Parser Expr
whereStmt = do
rword "where"
symbol "{"
exprs <- many expr
symbol "}"
return $ (Where exprs)
expr :: Parser Expr
expr = f <$> sepBy1 expr' (symbol ";")
where
-- if there's only one expr return it without using ‘Seq’
f l = if length l == 1 then head l else Seq l
expr' :: Parser Expr
expr' = try moduleParser
<|> try functionParser
<|> try ifStmt
<|> try elseStmt
<|> try arrayAssign
<|> arrayElementSelect
<|> try assignArith
<|> try functionCallParser
<|> try assignString
<|> try printParser
<|> try whereStmt
<|> try stringLiteral
parser :: Parser Expr
parser = expr'
parseFromFile file = runParser expr file <$> readFile file
parseString input =
case parse expr' "" input of
Left e -> show e
Right x -> show x
parsePrint :: String -> IO()
parsePrint s = parseTest' parser s
Block/Expr - The AST consists of this
module Block where
import Data.List
import Text.Show.Functions
import Data.Char
import Data.Maybe
-- Boolean expressions
data BExpr
= BoolConst Bool
| Not BExpr
| BBinary BBinOp BExpr BExpr
| RBinary RBinOp AExpr AExpr
instance Show BExpr where
show (BoolConst b) = lowerString $ show b
show (Not n) = show n
show (BBinary bbinop bExpr1 bExpr2) = show bExpr1 ++ " " ++ show bbinop ++ " " ++ show bExpr2
show (RBinary rbinop aExpr1 aExpr2) = show aExpr1 ++ " " ++ show rbinop ++ " " ++ show aExpr2
-- Boolean ops
data BBinOp
= And
| Or
instance Show BBinOp where
show (And) = "&&"
show (Or) = "||"
-- R binary ops
data RBinOp
= Greater
| Less
instance Show RBinOp where
show (Greater) = ">"
show (Less) = "<"
-- Arithmetic expressions
data AExpr
= Var String
| IntConst Integer
| Neg AExpr
| ABinary ABinOp AExpr AExpr
| Parenthesis AExpr
instance Show AExpr where
show (Var v) = v
show (IntConst i) = show i
show (Neg aExpr) = "-" ++ show aExpr
show (ABinary aBinOp aExpr1 aExpr2) = show aExpr1 ++ " " ++ show aBinOp ++ " " ++ show aExpr2
show (Parenthesis aExpr) = "(" ++ show aExpr ++ ")"
-- Arithmetic ops
data ABinOp
= OpeningParenthesis
| ClosingParenthesis
| Add
| Subtract
| Multiply
| Divide
instance Show ABinOp where
show (Add) = "+"
show (Subtract) = "-"
show (Multiply) = "*"
show (Divide) = "/"
show (OpeningParenthesis) = "("
show (ClosingParenthesis) = ")"
-- Statements
data Expr
= Seq [Expr]
| Module String [Expr]
| Import String String
| MainFunction {name ::String, argTypes:: [Expr], args::[Expr], returnType::Expr, body::[Expr]}
| Function String [Expr] [Expr] Expr [Expr]
| FunctionCall String [Expr]
| Type String
| ValueType String
| Argument String
| ArgumentType String
| ReturnType String
| AssignArith Expr String AExpr
| AssignString Expr String Expr
| If BExpr [Expr]
| Else [Expr]
| While BExpr [Expr]
| Print String
| Return Expr
| ArrayValues [String]
| ArrayDef String String
| ArrayAssignment Expr Expr
| ArrayElementSelect String
| Lambda String String
| Where [Expr]
| StringLiteral String
| Skip
instance Show Expr where
show (Module name bodyArray) =
-- Get the main function tree
"public class " ++ name ++ "{\n" ++
"public static void main(String[] args){\n" ++
name ++ " " ++ lowerString name ++ "= new " ++ name ++ "();\n" ++
intercalate "\n" (map (\mStatement -> if(isFunctionCall mStatement) then (lowerString name ++ "." ++ show mStatement) else show mStatement) (body ((filter (isMainFunction) bodyArray)!!0))) ++
"}\n" ++
getFunctionString bodyArray ++
"}\n"
show (Import directory moduleName) = "import " ++ directory ++ moduleName
show (Function name argTypes args returnType body) = "public " ++ show returnType ++ " " ++ name ++ "("++ intercalate ", " (zipWith (\x y -> x ++ " " ++ y) (map show argTypes) (map show args)) ++"){\n" ++ intercalate "\n" (map show body) ++ "}"
show (MainFunction name argTypes args returnType body) =
intercalate "\n " $ map show body
show (FunctionCall name exprs) = name ++ "(" ++ (intercalate ", " (map show exprs)) ++ ");"
show (Type b) = b
show (Argument b) = b
show (ArgumentType b) = b
show (ReturnType b) = b
show (AssignArith vType name value) = "" ++ show vType ++ " " ++ name ++ "=" ++ show value ++ ";"
show (AssignString vType name value) = "" ++ show vType ++ " " ++ name ++ "=" ++ show value ++ ";"
show (If condition statement) = "if(" ++ show condition ++ "){\n" ++ intercalate "\n" (map show statement) ++ "}"
show (Else statement) = " else {\n" ++ intercalate "\n" (map show statement) ++ "}"
show (While condition statement) = "while(" ++ show condition ++ "){\n" ++ intercalate "\n" (map show statement) ++ "}"
show (Skip) = "[skip]"
show (Seq s) = "[seq]"
show (Return expr) = "return " ++ show expr ++ ";"
show (Print exprs) = "System.out.println(" ++ exprs ++ ");" --"System.out.println(" ++ intercalate " " (map show exprs) ++ ");"
show (ArrayDef arrType name) = arrType ++ "[] " ++ name ++ "="
show (ArrayValues exprs) = "{" ++ intercalate ", " exprs ++ "};"
show (ArrayAssignment arr values) = show arr ++ show values
show (ArrayElementSelect i) = "[" ++ i ++ "];"
show (Lambda valName collectionName) = ""
show (Where exprs) = intercalate "\n" (map show exprs)
show (StringLiteral value) = "\"" ++ value ++ "\""
show (_) = "<unknown>"
lowerString str = [ toLower loweredString | loweredString <- str]
extractMain :: Expr -> Maybe String
extractMain (MainFunction m _ _ _ _) = Just m
extractMain _ = Nothing
extractFunctionCall :: Expr -> Maybe String
extractFunctionCall (FunctionCall m _) = Just m
extractFunctionCall _ = Nothing
isMainFunction :: Expr -> Bool
isMainFunction e = isJust $ extractMain e
isFunctionCall :: Expr -> Bool
isFunctionCall e = isJust $ extractFunctionCall e
{--
getInnerMainFunctionString :: [Expr] -> String -> String
getInnerMainFunctionString e instanceName = do
if(isMainFunction (e!!0)) then
show (e!!0)
else
getInnerMainFunctionString (drop 1 e) instanceName
--}
getFunctionString :: [Expr] -> String
getFunctionString e = do
if(isMainFunction (e!!0)) then
""
else
"" ++ show (e!!0) ++ getFunctionString (drop 1 e)
Code Example
module IndentationTest
testFunction : int -> void
testFunction x =
if(x < 50)
println x;
nextX :int = x + 1 * 2 - 3 / 2 + 5
testFunction (nextX)
else
last :int = 1000
println last;
main : String -> IO
main args =
x :int = 3
y :int = 10
z :int = 15
arrTest:[int] = [x y z]
println arrTest;
testFunction (x)
stringTest :String = "Helloworld";
This will successfully parse the example code. Just pass it into the parsePrint function.
Related
I am trying to parse a bool expression in Haskell. This line is giving me an error: BoolExpr <$> parseBoolOp <*> (n : ns). This is the error:
• Couldn't match type ‘[]’ with ‘Parser’
Expected type: Parser [Expr]
Actual type: [Expr]
-- define the expression types
data Expr
= BoolExpr BoolOp [Expr]
deriving (Show, Eq)
-- define the type for bool value
data Value
= BoolVal Bool
deriving (Show, Eq)
-- many x = Parser.some x <|> pure []
-- some x = (:) <$> x <*> Parser.many x
kstar :: Alternative f => f a -> f [a]
kstar x = kplus x <|> pure []
kplus :: Alternative f => f a -> f [a]
kplus x = (:) <$> x <*> kstar x
symbol :: String -> Parser String
symbol xs = token (string xs)
-- a bool expression is the operator followed by one or more expressions that we have to parse
-- TODO: add bool expressions
boolExpr :: Parser Expr
boolExpr = do
n <- parseExpr
ns <- kstar (symbol "," >> parseExpr)
BoolExpr <$> parseBoolOp <*> (n : ns)
-- an atom is a literalExpr, which can be an actual literal or some other things
parseAtom :: Parser Expr
parseAtom =
do
literalExpr
-- the main parsing function which alternates between all the options you have
parseExpr :: Parser Expr
parseExpr =
do
parseAtom
<|> parseParens boolExpr
<|> parseParens parseExpr
-- implement parsing bool operations, these are 'and' and 'or'
parseBoolOp :: Parser BoolOp
parseBoolOp =
do symbol "and" >> return And
<|> do symbol "or" >> return Or
The boolExpr is expecting a Parser [Expr] but I am returning only an [Expr]. Is there a way to fix this or do it in another way? When I try pure (n:ns), evalStr "(and true (and false true) true)" returns Left (ParseError "'a' didn't match expected character") instead of Right (BoolVal False)
The expression (n : ns) is a list. Therefore the compiler thinks that the applicative operators <*> and <$> should be used in the context [], while you want Parser instead.
I would guess you need pure (n : ns) instead.
I'm trying to write a parser to parse a loop in the following form:
(:= x 0)
Do ((< x 10))->
(:= x (+ x 1))
print(x)
Od
What's occurring however is that my parser can only work for a loop whose body contains only one statement. To parse more than one statement, the body above would have to be written in the following way:
(:= x (+ x 1))(:= x 20)
I have tried to use delimiters such as semi-colons to try and force the parser to allow for loop body parsing to be taken line by line the above behaviour persists such that it would have to be written like: (:= x (+ x 1));(:= x 20) instead of on separate lines.
Please find my parsers below:
parsersHStatement :: Parser HStatement
parsersHStatement = try (parsePrint) <|> try (parseDo) <|> try (parseEval)
parseLoopBody :: Parser [HStatement]
parseLoopBody = many1 $ parsersHStatement
parseDo :: Parser HStatement
parseDo = do
spaces
_ <- string "Do"
spaces
_ <- string "("
p <- try (parseExpr) <|> try (parseBool)
_ <- string ")->"
spaces
q <- parseLoopBody <* spaces
spaces
_ <- string "Od"
return $ Do p q
parseEval :: Parser HStatement
parseEval = liftM Eval $ parsersHVal
parsersHVal :: Parser HVal
parsersHVal = try (parseAssign) <|> try (parsePrimitiveValue) <|> try (parseExpr)
parsePrint :: Parser HStatement
parsePrint = string "print(" *> parsersHVal <* string ")" >>= (return . Print)
parseExpr :: Parser HVal
parseExpr = do
char '('
spaces
op <- try (parseOperation)
spaces
x <- try (sepBy (parseExpr <|> parseVarOrInt) spaces)
spaces
char ')'
return $ Expr op x
parseBool :: Parser HVal
parseBool = classifyBool <$> ( (string "True") <|> (string "False") )
where
classifyBool "True" = Bool True
classifyBool "False" = Bool False
Within parseLoopBody, I tried 'feeding' spaces (many1 $ spaces *> ...) but nothing would parse then.
The following is the ADT:
data HVal
= Integer Integer
| Var String
| Bool Bool
| List [HVal]
| Expr Operation [HVal]
| Assign HVal HVal
deriving (Show, Eq, Read)
data HStatement
= Eval HVal -- Bridge between HVal and HStatement
| Print HVal
| Do HVal [HStatement]
deriving (Show, Eq, Read)
parseDo was altered to the following :
parseDo :: Parser HStatement
parseDo = do
string "Do"
spaces
string "("
p <- try (parseExpr) <|> try (parseBool)
string ")->"
spaces
q <- many1 $ parsersHStatement
spaces
string "Od"
return $ Do p q
This allows for the parsing of two statements but the second statement breaks the loop.
After a lot of fiddling around, it seemed that the error lay responsible on the function parseEval. This was changed to:
parseEval :: Parser HStatement
parseEval = do
x <- try (parseAssign) <|> try (parseExpr)
spaces
return $ Eval x
Furthermore my parseDo function was changed to:
parseDo :: Parser HStatement
parseDo = do
string "Do"
spaces
string "("
p <- try (parseExpr) <|> try (parseBool)
string ")->"
spaces
q <- many1 $ parsersHStatement
spaces
string "Od"
return $ Do p q
I'm trying to implement car, cdr, and cons functionality into a toy language I'm writing however when I try to execute my car function through main, I get the following error:
./parser "car [1 2 3]"
parser: parser.hs:(48,27)-(55,45): Non-exhaustive patterns in case
The function on lines 48-55 is the following:
parseOp :: Parser HVal
parseOp = (many1 letter <|> string "+" <|> string "-" <|> string "*" <|> string "/" <|> string "%" <|> string "&&" <|> string "||") >>=
(\x -> return $ case x of
"&&" -> Op And
"||" -> Op Or
"+" -> Op Add
"-" -> Op Sub
"*" -> Op Mult
"/" -> Op Div
"%" -> Op Mod)
I'm really unsure why the error message points to this function because it has nothing to do with the list functionality. The car function is working however because I was able to successfully execute it through GHCI. I know my problem is due to parsing but I don't see where it is. The following are the functions that relate to lists. I can't see from them how they are influenced by parseOp.
data HVal = Number Integer
| String String
| Boolean Bool
| List [HVal]
| Op Op
| Expr Op HVal HVal
| Car [HVal]
deriving (Read)
car :: [HVal] -> HVal
car xs = head xs
parseListFunctions :: Parser HVal
parseListFunctions = do
_ <- string "car "
_ <- char '['
x <- parseList
_ <- char ']'
return $ Car [x]
parseExpr :: Parser HVal
parseExpr = parseNumber
<|> parseOp
<|> parseBool
<|> parseListFunctions
<|> do
_ <- char '['
x <- parseList
_ <- char ']'
return x
<|> do
_ <- char '('
x <- parseExpression
_ <- char ')'
return x
eval :: HVal -> HVal
eval val#(Number _) = val
eval val#(String _) = val
eval val#(Boolean _) = val
eval val#(List _) = val -- Look at list eval NOT WORKING
eval val#(Op _) = val
eval (Expr op x y) = eval $ evalExpr (eval x) op (eval y)
eval (Car xs) = eval $ car xs
The removal of many1 letter in parseOp transfers the same error to the following function parseBool:
parseBool :: Parser HVal
parseBool = many1 letter >>= (\x -> return $ case x of
"True" -> Boolean True
"False" -> Boolean False)
You write
parseExpr = ... <|> parseOp <|> ... <|> parseListFunctions <|> ...
and so
car ...
is passed to parseOp first, then parseListFunctions. The parseOp parser succeeds in the
many1 letter
branch, and so in the \x -> return $ case x of ..., x is bound to "car". Because parseOp succeeds (and returns an error value with an embedded, not-yet-evaluated inexhaustive case error!), parseListFunctions is never tried.
You will need to modify your grammar to reduce the ambiguity in it, so that these conflicts where multiple branches may match do not arise.
I have the following grammar for expressions that I'm trying to represent as a Haskell ADT:
Expr = SimpleExpr [OPrelation SimpleExpr]
SimpleExpr = [OPunary] Term {OPadd Term}
Term = Factor {OPmult Factor}
where:
{} means 0 or more
[] means 0 or 1
OPmult, OPadd, OPrelation, OPunary are classes of operators
Note that this grammar does get precedence right.
Here's something I tried:
data Expr = Expr SimpleExpr (Maybe OPrelation) (Maybe SimpleExpr)
data SimpleExpr = SimpleExpr (Maybe OPunary) Term [OPadd] [Term]
data Term = Term Factor [OPmult] [Factor]
which in hindsight I think is awful, especially the [OPadd] [Term] and [OPmult] [Factor] parts. Because, for example, in the parse tree for 1+2+3 it would put [+, +] in one branch and [2, 3] in another, meaning they're decoupled.
What would be a good representation that'll play nice later in the next stages of compilation?
Decomposing { } and [ ] into more data types seems like an overkill
Using lists seems not quite right as it would no longer be a tree (Just a node that's a list)
Maybe for { }. A good idea ?
And finally, I'm assuming after parsing I'll have to pass over the Parse Tree and reduce it to an AST? or should the whole grammar be modified to be less complex? or maybe it's abstract enough?
The AST does not need to be that close to the grammar. The grammar is structured into multiple levels to encode precedence and uses repetition to avoid left-recursion while still being able to correctly handle left-associative operators. The AST does not need to worry about such things.
Instead I'd define the AST like this:
data Expr = BinaryOperation BinaryOperator Expr Expr
| UnaryOperation UnaryOperator Expr
| Literal LiteralValue
| Variable Id
data BinaryOperator = Add | Sub | Mul | Div
data UnaryOperator = Not | Negate
Here's an additional answer that might help you. I don't want to spoil your fun, so here's a very simple example grammar:
-- Expr = Term ['+' Term]
-- Term = Factor ['*' Factor]
-- Factor = number | '(' Expr ')'
-- number = one or more digits
Using a CST
As one approach, we can represent this grammar as a concrete syntax tree (CST):
data Expr = TermE Term | PlusE Term Term deriving (Show)
data Term = FactorT Factor | TimesT Factor Factor deriving (Show)
data Factor = NumberF Int | ParenF Expr deriving (Show)
A Parsec-based parser to turn the concrete syntax into a CST might look like this:
expr :: Parser Expr
expr = do
t1 <- term
(PlusE t1 <$ symbol "+" <*> term)
<|> pure (TermE t1)
term :: Parser Term
term = do
f1 <- factor
(TimesT f1 <$ symbol "*" <*> factor)
<|> pure (FactorT f1)
factor :: Parser Factor
factor = NumberF . read <$> lexeme (many1 (satisfy isDigit))
<|> ParenF <$> between (symbol "(") (symbol ")") expr
with helper functions for whitespace processing:
lexeme :: Parser a -> Parser a
lexeme p = p <* spaces
symbol :: String -> Parser String
symbol = lexeme . string
and main entry point:
parseExpr :: String -> Expr
parseExpr pgm = case parse (spaces *> expr) "(string)" pgm of
Right e -> e
Left err -> error $ show err
after which we can run:
> parseExpr "1+1*(3+4)"
PlusE (FactorT (Number 1)) (TimesT (Number 1) (ParenF (PlusE
(FactorT (Number 3)) (FactorT (Number 4)))))
>
To convert this into the following AST:
data AExpr -- Abstract Expression
= NumberA Int
| PlusA AExpr AExpr
| TimesA AExpr AExpr
we could write:
aexpr :: Expr -> AExpr
aexpr (TermE t) = aterm t
aexpr (PlusE t1 t2) = PlusA (aterm t1) (aterm t2)
aterm :: Term -> AExpr
aterm (FactorT f) = afactor f
aterm (TimesT f1 f2) = TimesA (afactor f1) (afactor f2)
afactor :: Factor -> AExpr
afactor (NumberF n) = NumberA n
afactor (ParenF e) = aexpr e
To interpret the AST, we could use:
interp :: AExpr -> Int
interp (NumberA n) = n
interp (PlusA e1 e2) = interp e1 + interp e2
interp (TimesA e1 e2) = interp e1 * interp e2
and then write:
calc :: String -> Int
calc = interp . aexpr . parseExpr
after which we have a crude little calculator:
> calc "1 + 2 * (6 + 3)"
19
>
Skipping the CST
As an alternative approach, we could replace the parser with one that parses directly into an AST of type AExpr:
expr :: Parser AExpr
expr = do
t1 <- term
(PlusA t1 <$ symbol "+" <*> term)
<|> pure t1
term :: Parser AExpr
term = do
f1 <- factor
(TimesA f1 <$ symbol "*" <*> factor)
<|> pure f1
factor :: Parser AExpr
factor = NumberA . read <$> lexeme (many1 (satisfy isDigit))
<|> between (symbol "(") (symbol ")") expr
You can see how little the structure of these parsers changes. All that's disappeared is the distinction between expressions, terms, and factors at the type level, and constructors like TermE, FactorT, and ParenF whose only purpose is to allow embedding of these types within each other.
In more complex scenarios, the CST and AST parsers might exhibit bigger differences. (For example, in a grammar that allowed 1 + 2 + 3, this might be represented as a single constructor data Expr = ... | PlusE [Term] | ... in the CST but with a nested series of binary PlusA constructors in the same AExpr AST type as above.)
After redefining parseExpr to return an AExpr and dropping the aexpr step from calc, everything else stays the same, and we still have:
> calc "1 + 2 * (6 + 3)"
19
>
Programs for Reference
Here's the full program using an intermediate CST:
-- Calc1.hs, using a CST
{-# OPTIONS_GHC -Wall #-}
module Calc1 where
import Data.Char
import Text.Parsec
import Text.Parsec.String
data Expr = TermE Term | PlusE Term Term deriving (Show)
data Term = FactorT Factor | TimesT Factor Factor deriving (Show)
data Factor = NumberF Int | ParenF Expr deriving (Show)
lexeme :: Parser a -> Parser a
lexeme p = p <* spaces
symbol :: String -> Parser String
symbol = lexeme . string
expr :: Parser Expr
expr = do
t1 <- term
(PlusE t1 <$ symbol "+" <*> term)
<|> pure (TermE t1)
term :: Parser Term
term = do
f1 <- factor
(TimesT f1 <$ symbol "*" <*> factor)
<|> pure (FactorT f1)
factor :: Parser Factor
factor = NumberF . read <$> lexeme (many1 (satisfy isDigit))
<|> ParenF <$> between (symbol "(") (symbol ")") expr
parseExpr :: String -> Expr
parseExpr pgm = case parse (spaces *> expr) "(string)" pgm of
Right e -> e
Left err -> error $ show err
data AExpr -- Abstract Expression
= NumberA Int
| PlusA AExpr AExpr
| TimesA AExpr AExpr
aexpr :: Expr -> AExpr
aexpr (TermE t) = aterm t
aexpr (PlusE t1 t2) = PlusA (aterm t1) (aterm t2)
aterm :: Term -> AExpr
aterm (FactorT f) = afactor f
aterm (TimesT f1 f2) = TimesA (afactor f1) (afactor f2)
afactor :: Factor -> AExpr
afactor (NumberF n) = NumberA n
afactor (ParenF e) = aexpr e
interp :: AExpr -> Int
interp (NumberA n) = n
interp (PlusA e1 e2) = interp e1 + interp e2
interp (TimesA e1 e2) = interp e1 * interp e2
calc :: String -> Int
calc = interp . aexpr . parseExpr
and here's the full program for the more traditional solution that skips an explicit CST representation:
-- Calc2.hs, with direct parsing to AST
{-# OPTIONS_GHC -Wall #-}
module Calc where
import Data.Char
import Text.Parsec
import Text.Parsec.String
lexeme :: Parser a -> Parser a
lexeme p = p <* spaces
symbol :: String -> Parser String
symbol = lexeme . string
expr :: Parser AExpr
expr = do
t1 <- term
(PlusA t1 <$ symbol "+" <*> term)
<|> pure t1
term :: Parser AExpr
term = do
f1 <- factor
(TimesA f1 <$ symbol "*" <*> factor)
<|> pure f1
factor :: Parser AExpr
factor = NumberA . read <$> lexeme (many1 (satisfy isDigit))
<|> between (symbol "(") (symbol ")") expr
parseExpr :: String -> AExpr
parseExpr pgm = case parse (spaces *> expr) "(string)" pgm of
Right e -> e
Left err -> error $ show err
data AExpr -- Abstract Expression
= NumberA Int
| PlusA AExpr AExpr
| TimesA AExpr AExpr
interp :: AExpr -> Int
interp (NumberA n) = n
interp (PlusA e1 e2) = interp e1 + interp e2
interp (TimesA e1 e2) = interp e1 * interp e2
calc :: String -> Int
calc = interp . parseExpr
Okay so Buhr's answer is quite nice. Here's how I did though (no CST) inspired by sepp2k's response:
The AST:
data OP = OPplus | OPminus | OPstar | OPdiv
| OPidiv | OPmod | OPand | OPeq | OPneq
| OPless | OPgreater | OPle | OPge
| OPin | OPor
data Expr =
Relation Expr OP Expr -- > < == >= etc..
| Unary OP Expr -- + -
| Mult Expr OP Expr -- * / div mod and
| Add Expr OP Expr -- + - or
| FactorInt Int | FactorReal Double
| FactorStr String
| FactorTrue | FactorFalse
| FactorNil
| FactorDesig Designator -- identifiers
| FactorNot Expr
| FactorFuncCall FuncCall deriving (Show)
The parsers:
parseExpr :: Parser Expr
parseExpr = (try $ Relation <$>
parseSimpleExpr <*> parseOPrelation <*> parseSimpleExpr)
<|> parseSimpleExpr
parseSimpleExpr :: Parser Expr
parseSimpleExpr = (try simpleAdd)
<|> (try $ Unary <$> parseOPunary <*> simpleAdd)
<|> (try $ Unary <$> parseOPunary <*> parseSimpleExpr)
<|> parseTerm
where simpleAdd = Add <$> parseTerm <*> parseOPadd <*> parseSimpleExpr
parseTerm :: Parser Expr
parseTerm = (try $ Mult <$>
parseFactor <*> parseOPmult <*> parseTerm)
<|> parseFactor
parseFactor :: Parser Expr
parseFactor =
(parseKWnot >> FactorNot <$> parseFactor)
<|> (exactTok "true" >> return FactorTrue)
<|> (exactTok "false" >> return FactorFalse)
<|> (parseNumber)
<|> (FactorStr <$> parseString)
<|> (betweenCharTok '(' ')' parseExpr)
<|> (FactorDesig <$> parseDesignator)
<|> (FactorFuncCall <$> parseFuncCall)
I didn't include basic parsers like parseOPadd as those are what you'd expect and are easy to build.
I still parsed according to the grammar but tweaked it slightly to match my AST.
You could check out the full source which is a compiler for Pascal here.
I need to parse this syntax for function declaration
foo x = 1
Func "foo" (Ident "x") = 1
foo (x = 1) = 1
Func "foo" (Label "x" 1) = 1
foo x = y = 1
Func "foo" (Ident "x") = (Label "y" 1)
I have written this parser
module SimpleParser where
import Text.Parsec.String (Parser)
import Text.Parsec.Language (emptyDef)
import Text.Parsec
import qualified Text.Parsec.Token as Tok
import Text.Parsec.Char
import Prelude
lexer :: Tok.TokenParser ()
lexer = Tok.makeTokenParser style
where
style = emptyDef {
Tok.identLetter = alphaNum
}
parens :: Parser a -> Parser a
parens = Tok.parens lexer
commaSep :: Parser a -> Parser [a]
commaSep = Tok.commaSep1 lexer
commaSep1 :: Parser a -> Parser [a]
commaSep1 = Tok.commaSep1 lexer
identifier :: Parser String
identifier = Tok.identifier lexer
reservedOp :: String -> Parser ()
reservedOp = Tok.reservedOp lexer
data Expr = IntLit Int | Ident String | Label String Expr | Func String Expr Expr | ExprList [Expr] deriving (Eq, Ord, Show)
integer :: Parser Integer
integer = Tok.integer lexer
litInt :: Parser Expr
litInt = do
n <- integer
return $ IntLit (fromInteger n)
ident :: Parser Expr
ident = Ident <$> identifier
paramLabelItem = litInt <|> paramLabel
paramLabel :: Parser Expr
paramLabel = do
lbl <- try (identifier <* reservedOp "=")
body <- paramLabelItem
return $ Label lbl body
paramItem :: Parser Expr
paramItem = parens paramRecord <|> litInt <|> try paramLabel <|> ident
paramRecord :: Parser Expr
paramRecord = ExprList <$> commaSep1 paramItem
func :: Parser Expr
func = do
name <- identifier
params <- paramRecord
reservedOp "="
body <- paramRecord
return $ (Func name params body)
parseExpr :: String -> Either ParseError Expr
parseExpr s = parse func "" s
I can parse foo (x) = 1 but I cannot parse foo x = 1
parseExpr "foo x = 1"
Left (line 1, column 10):
unexpected end of input
expecting digit, "," or "="
I understand it tries to parse this code like Func "foo" (Label "x" 1) and fails. But after fail why it cannot try to parse it like Func "foo" (Ident "x") = 1
Is there any way to do it?
Also I have tried to swap ident and paramLabel
paramItem = parens paramRecord <|> litInt <|> try paramLabel <|> ident
paramItem = parens paramRecord <|> litInt <|> try ident <|> paramLabel
In this case I can parse foo x = 1 but I cannot parse foo (x = 1) = 2
parseExpr "foo (x = 1) = 2"
Left (line 1, column 8):
unexpected "="
expecting "," or ")"
Here is how I understand how Parsec backtracking works (and doesn't work):
In:
(try a <|> try b <|> c)
if a fails, b will be tried, and if b subsequently fails c will be tried.
However, in:
(try a <|> try b <|> c) >> d
if a succeeds but d fails, Parsec does not go back and try b. Once a succeeded Parsec considers the whole choice as parsed and it moves on to d. It will never go back to try b or c.
This doesn't work either for the same reason:
(try (try a <|> try b <|> c)) >> d
Once a or b succeeds, the whole choice succeeds, and therefor the outer try succeeds. Parsing then moves on to d.
A solution is to distribute d to within the choice:
try (a >> d) <|> try (b >> d) <|> (c >> d)
Now if a succeeds but d fails, b >> d will be tried.