Parsing complex operands in boolean expressions in Python 2.7 - parsing

I am trying to modify the example code in pyparsing to handle operands that are key value pairs, like:
(Region:US and Region:EU) or (Region:Asia)
This is a boolean expression with three operands - Region:US, Region:EU and Region:Asia. If they were simple operands like x, y and z, I'd be good to go. I don't need to do any special processing on them to break up the key-value pairs. I need to treat the operand in its entirety as though it might have just been x, and need to assign truth values to it and evaluate the full expression.
How might I modify the following code to handle this:
#
# simpleBool.py
#
# Example of defining a boolean logic parser using
# the operatorGrammar helper method in pyparsing.
#
# In this example, parse actions associated with each
# operator expression will "compile" the expression
# into BoolXXX class instances, which can then
# later be evaluated for their boolean value.
#
# Copyright 2006, by Paul McGuire
# Updated 2013-Sep-14 - improved Python 2/3 cross-compatibility
#
from pyparsing import infixNotation, opAssoc, Keyword, Word, alphas
# define classes to be built at parse time, as each matching
# expression type is parsed
class BoolOperand(object):
    """Leaf of the boolean expression tree: one operand token.

    Used as a pyparsing parse action; only ``t[0]`` (the matched text) is
    read. The text is kept as ``label`` and its evaluated result as ``value``.
    """

    def __init__(self, t):
        # SECURITY NOTE: eval() runs the matched text as a Python expression.
        # That is tolerable for this demo (operands are names of module-level
        # variables), but it must not be fed untrusted input.
        # No extra local names are introduced here on purpose: eval() sees
        # this method's locals, so adding one would change name resolution.
        self.value = eval(t[0])
        self.label = t[0]

    def __str__(self):
        return self.label

    def __bool__(self):
        return self.value

    # Python 2 spells the truth hook __nonzero__; share one implementation.
    __nonzero__ = __bool__
    __repr__ = __str__
class BoolBinOp(object):
    """Base node for an n-ary boolean operator.

    Subclasses provide ``reprsymbol`` (the text shown between operands) and
    ``evalop`` (a callable that reduces an iterable of booleans to one).
    """

    def __init__(self, t):
        # t[0] alternates operand, operator, operand, ...; keep the operands.
        self.args = t[0][::2]

    def __bool__(self):
        truth_values = (bool(operand) for operand in self.args)
        return self.evalop(truth_values)

    def __str__(self):
        separator = " %s " % self.reprsymbol
        body = separator.join(str(operand) for operand in self.args)
        return "(" + body + ")"

    __repr__ = __str__
    __nonzero__ = __bool__
class BoolAnd(BoolBinOp):
    """Conjunction node: rendered with '&' and reduced with the builtin all()."""
    reprsymbol = '&'
    evalop = all
class BoolOr(BoolBinOp):
    """Disjunction node: rendered with '|' and reduced with the builtin any()."""
    reprsymbol = '|'
    evalop = any
class BoolNot(object):
    """Negation node wrapping a single operand.

    Used as a parse action; ``t[0]`` is the ``[operator, operand]`` group and
    only the operand (index 1) is stored.
    """

    def __init__(self, t):
        group = t[0]
        self.arg = group[1]

    def __str__(self):
        return "".join(("~", str(self.arg)))

    def __bool__(self):
        return not bool(self.arg)

    __nonzero__ = __bool__
    __repr__ = __str__
TRUE = Keyword("True")
FALSE = Keyword("False")
boolOperand = TRUE | FALSE | Word(alphas,max=1)
boolOperand.setParseAction(BoolOperand)
# define expression, based on expression operand and
# list of operations in precedence order
boolExpr = infixNotation( boolOperand,
[
("not", 1, opAssoc.RIGHT, BoolNot),
("and", 2, opAssoc.LEFT, BoolAnd),
("or", 2, opAssoc.LEFT, BoolOr),
])
if __name__ == "__main__":
p = True
q = False
r = True
tests = [("p", True),
("q", False),
("p and q", False),
("p and not q", True),
("not not p", True),
("not(p and q)", True),
("q or not p and r", False),
("q or not p or not r", False),
("q or not (p and r)", False),
("p or q or r", True),
("p or q or r and False", True),
("(p or q or r) and False", False),
]
print("p =", p)
print("q =", q)
print("r =", r)
print()
for t,expected in tests:
res = boolExpr.parseString(t)[0]
success = "PASS" if bool(res) == expected else "FAIL"
print (t,'\n', res, '=', bool(res),'\n', success, '\n')
Instead of p, q, r, I'd like to use "Region:US", "Region:EU" and "Region:Asia." Any ideas?
EDIT: Using Paul McGuire's suggestion, I tried writing the following code which breaks on parsing:
#
# simpleBool.py
#
# Example of defining a boolean logic parser using
# the operatorGrammar helper method in pyparsing.
#
# In this example, parse actions associated with each
# operator expression will "compile" the expression
# into BoolXXX class instances, which can then
# later be evaluated for their boolean value.
#
# Copyright 2006, by Paul McGuire
# Updated 2013-Sep-14 - improved Python 2/3 cross-compatibility
#
from pyparsing import infixNotation, opAssoc, Keyword, Word, alphas
# define classes to be built at parse time, as each matching
# expression type is parsed
class BoolOperand(object):
    """Leaf of the boolean expression tree: one operand token.

    Used as a pyparsing parse action; only ``t[0]`` (the matched text) is
    read. The literal keywords ``True`` and ``False`` evaluate to themselves;
    any other operand is looked up in the module-level ``validValues``
    mapping.

    Raises:
        KeyError: if the operand is neither a literal nor a key of
            ``validValues``.
    """

    # The grammar also routes the True/False keywords through this parse
    # action. They are not keys of validValues, so without this table an
    # expression such as "x and False" would fail with KeyError.
    _LITERALS = {"True": True, "False": False}

    def __init__(self, t):
        self.label = t[0]
        if self.label in self._LITERALS:
            self.value = self._LITERALS[self.label]
        else:
            self.value = validValues[self.label]

    def __bool__(self):
        return self.value

    def __str__(self):
        return self.label

    __repr__ = __str__
    # Python 2 spells the truth hook __nonzero__.
    __nonzero__ = __bool__
class BoolBinOp(object):
    """Base node for an n-ary boolean operator.

    Subclasses provide ``reprsymbol`` (the text shown between operands) and
    ``evalop`` (a callable that reduces an iterable of booleans to one).
    """

    def __init__(self, t):
        # t[0] alternates operand, operator, operand, ...; keep the operands.
        self.args = t[0][::2]

    def __bool__(self):
        truth_values = (bool(operand) for operand in self.args)
        return self.evalop(truth_values)

    def __str__(self):
        separator = " %s " % self.reprsymbol
        body = separator.join(str(operand) for operand in self.args)
        return "(" + body + ")"

    __repr__ = __str__
    __nonzero__ = __bool__
class BoolAnd(BoolBinOp):
    """Conjunction node: rendered with '&' and reduced with the builtin all()."""
    reprsymbol = '&'
    evalop = all
class BoolOr(BoolBinOp):
    """Disjunction node: rendered with '|' and reduced with the builtin any()."""
    reprsymbol = '|'
    evalop = any
class BoolNot(object):
    """Negation node wrapping a single operand.

    Used as a parse action; ``t[0]`` is the ``[operator, operand]`` group and
    only the operand (index 1) is stored.
    """

    def __init__(self, t):
        group = t[0]
        self.arg = group[1]

    def __str__(self):
        return "".join(("~", str(self.arg)))

    def __bool__(self):
        return not bool(self.arg)

    __nonzero__ = __bool__
    __repr__ = __str__
TRUE = Keyword("True")
FALSE = Keyword("False")
boolOperand = TRUE | FALSE | Word(alphas+":",max=1)
boolOperand.setParseAction(BoolOperand)
# define expression, based on expression operand and
# list of operations in precedence order
boolExpr = infixNotation( boolOperand,
[
("not", 1, opAssoc.RIGHT, BoolNot),
("and", 2, opAssoc.LEFT, BoolAnd),
("or", 2, opAssoc.LEFT, BoolOr),
])
if __name__ == "__main__":
validValues = {
"Region:US": False,
"Region:EU": True,
"Type:Global Assets>24": True
}
tests = [("Region:US", True),
("Region:EU", False),
("Region:US and Region:EU", False),
("Region:US and not Region:EU", True),
("not not Region:US", True),
("not(Region:US and Region:EU)", True),
("Region:EU or not Region:US and Type:Global Assets>24", False),
("Region:EU or not Region:US or not Type:Global Assets>24", False),
("Region:EU or not (Region:US and Type:Global Assets>24)", False),
("Region:US or Region:EU or Type:Global Assets>24", True),
("Region:US or Region:EU or Type:Global Assets>24 and False", True),
("(Region:US or Region:EU or Type:Global Assets>24) and False", False),
]
print("Region:US =", validValues["Region:US"])
print("Region:EU =", validValues["Region:EU"])
print("Type:Global Assets>24 =", validValues["Type:Global Assets>24"])
print()
for t,expected in tests:
res = boolExpr.parseString(t)[0]
success = "PASS" if bool(res) == expected else "FAIL"
print (t,'\n', res, '=', bool(res),'\n', success, '\n')
Thanks to Paul McGuire's help, here is the solution:
boolOperand = TRUE | FALSE | Combine(Word(alphas)+":"+quotedString) | Word(alphas+":<>")
This does the parsing as I wanted it.

There are two parts to making this change: changing the parser, and then changing the post-parsing behavior to accommodate these new values.
To parse operands that are not just simple 1-character names, change this line in the parser:
boolOperand = TRUE | FALSE | Word(alphas,max=1)
The simplest (but not the strictest) fix would be to just change it to:
boolOperand = TRUE | FALSE | Word(alphas+":")
But this would accept, in addition to your valid values of "Region:US" or "TimeZone:UTC", presumably invalid values like "XouEWRL:sdlkfj", ":sldjf:ljsdf:sdljf", and even ":::::::". If you want to tighten up the parser, you could enforce the key entry to:
valid_key = oneOf("Region Country City State ZIP")
valid_value = Word(alphas+"_")
valid_kv = Combine(valid_key + ":" + valid_value)
boolOperand = TRUE | FALSE | valid_kv
That should take care of the parser.
Second, you will need to change how this entry is evaluated after the parsing is done. In my example, I was emphasizing the parsing part, not the evaluating part, so I left this to simply call the eval() builtin. In your case, you will probably need to initialize a dict of valid values for each acceptable key-value pair, and then change the code in BoolOperand to do a dict lookup instead of calling eval. (This has the added benefit of not calling eval() with user-entered data, which has all kinds of potential for security problems.)

Related

What is wrong with this code? I am trying to write an infix-to-postfix conversion program in Python

infix to postfix conversion using python
I am using deque for creating stack
from collections import deque
class Infix_to_prefix:
    """Convert an infix expression to postfix with the shunting-yard algorithm.

    NOTE: despite its name, this class produces *postfix* (reverse Polish)
    output; the name is kept so existing callers keep working.

    Operands are single ASCII letters, operators are ``+ - * / ^`` and
    parentheses group sub-expressions. Any other character (spaces, digits)
    is ignored.
    """

    # Binding strength of each binary operator (higher binds tighter).
    _PRECEDENCE = {'+': 1, '-': 1, '*': 2, '/': 2, '^': 3}
    # Operators that group right-to-left: a^b^c means a^(b^c).
    _RIGHT_ASSOCIATIVE = frozenset('^')

    def __init__(self):
        self.container = deque()  # operator stack
        self.output = deque()     # postfix tokens, in emission order

    def push(self, val):
        """Push ``val`` onto the operator stack."""
        self.container.append(val)

    def pop(self):
        """Remove and return the top of the operator stack.

        Raises:
            IndexError: if the stack is empty.
        """
        # The popped value must be returned: Conversion() appends it to the
        # output and compares it against '('.
        return self.container.pop()

    def peek(self):
        """Return the top of the operator stack without removing it."""
        return self.container[-1]

    def is_empty(self):
        """Return True when the operator stack holds no items."""
        return not self.container

    def is_operator(self, ch):
        """Return True when ``ch`` is one of the supported operators."""
        return ch in self._PRECEDENCE

    def is_operand(self, ch):
        """Return True when ``ch`` is an ASCII letter, otherwise False."""
        return 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'

    def ranking(self, top, ch):
        """Return True when ``ch`` may be pushed directly on top of ``top``.

        That holds when ``ch`` binds tighter than ``top``, or binds equally
        and is right-associative. Otherwise ``top`` must be emitted first.
        """
        if ch in self._RIGHT_ASSOCIATIVE:
            return self._PRECEDENCE[top] <= self._PRECEDENCE[ch]
        return self._PRECEDENCE[top] < self._PRECEDENCE[ch]

    def Conversion(self, exp):
        """Convert ``exp`` to postfix, print the result and return it.

        Raises:
            ValueError: if the parentheses in ``exp`` are unbalanced.
        """
        # Start from a clean slate so one instance can be reused.
        self.container.clear()
        self.output.clear()

        for ch in exp:
            if self.is_operand(ch):
                self.output.append(ch)
            elif self.is_operator(ch):
                # Emit operators that must be applied before ``ch``; stop at
                # an empty stack, at '(' or at a weaker operator. The push
                # happens exactly once, after the loop, so it always ends.
                while (not self.is_empty()
                       and self.peek() != '('
                       and not self.ranking(self.peek(), ch)):
                    self.output.append(self.pop())
                self.push(ch)
            elif ch == '(':
                self.push(ch)
            elif ch == ')':
                while not self.is_empty() and self.peek() != '(':
                    self.output.append(self.pop())
                if self.is_empty():
                    raise ValueError("unbalanced ')' in expression")
                self.pop()  # discard the matching '('

        while not self.is_empty():
            top = self.pop()
            if top == '(':
                raise ValueError("unbalanced '(' in expression")
            self.output.append(top)

        result = "".join(self.output)
        print(result)
        return result
# Demo: convert a sample infix expression with the class defined above.
exp = "a+b*(c^d-e)^(f+g*h)-i"
s = Infix_to_prefix()
s.Conversion(exp)
When I run this code, it just keeps running and never finishes.
Is there another method or another program for infix-to-postfix conversion?
Please point out what is wrong in this code.
Please ignore from this line
Also, please tell me why Stack Overflow keeps telling me to add more detail every time I try to ask a question.

How do I use PyTorch's "translation with a seq2seq" tutorial with my own inputs?

I am following the guide here
Currently this is the model:
SOS_token = 0
EOS_token = 1
class Lang:
    """Vocabulary for one language.

    Keeps a word -> index map, the inverse index -> word map, and a per-word
    occurrence count. Indices 0 and 1 are reserved for the "SOS" and "EOS"
    markers, so real words start at index 2.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        """Register every space-separated token of ``sentence``."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count ``word``, assigning it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        next_index = self.n_words
        self.word2index[word] = next_index
        self.index2word[next_index] = word
        self.word2count[word] = 1
        self.n_words = next_index + 1
def unicodeToAscii(s):
    """Return ``s`` with combining marks removed.

    The string is decomposed to NFD form and every character whose Unicode
    category is 'Mn' (nonspacing mark) is dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    """Lowercase and trim ``s``, strip accents, and isolate punctuation.

    A space is inserted before each of ``.``, ``!`` and ``?``; every run of
    characters other than ASCII letters and those three marks is then
    collapsed into a single space.
    """
    text = unicodeToAscii(s.lower().strip())
    substitutions = (
        (r"([.!?])", r" \1"),
        (r"[^a-zA-Z.!?]+", r" "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
def readLangs(lang1, lang2, reverse=False):
    """Load sentence pairs from ``Scribe/<lang1>-<lang2>.txt``.

    Each line of the file is split on tabs and every field is normalized
    with ``normalizeString``.

    Args:
        lang1: Name of the language in the first column of the file.
        lang2: Name of the language in the second column of the file.
        reverse: When true, each pair is reversed and the roles of the two
            languages are swapped.

    Returns:
        A tuple ``(input_lang, output_lang, pairs)`` holding two empty
        ``Lang`` vocabularies and the list of normalized pairs.
    """
    print("Reading lines...")

    # Read the file and split into lines; the context manager closes the
    # handle even if reading fails.
    path = 'Scribe/%s-%s.txt' % (lang1, lang2)
    with open(path, encoding='utf-8') as handle:
        lines = handle.read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    # Return the list built above; the name `pair` was never defined here.
    return input_lang, output_lang, pairs
MAX_LENGTH = 5000
eng_prefixes = (
"i am ", "i m ",
"he is", "he s ",
"she is", "she s ",
"you are", "you re ",
"we are", "we re ",
"they are", "they re "
)
def filterPair(p):
    """Decide whether the sentence pair ``p`` should be kept.

    A pair is kept when both sentences have fewer than ``MAX_LENGTH``
    space-separated tokens and the second sentence starts with one of
    ``eng_prefixes``.
    """
    # The checks run in the same order as a chained `and`, so later items of
    # ``p`` are only touched once the earlier checks have passed.
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    if len(p[1].split(' ')) >= MAX_LENGTH:
        return False
    return p[1].startswith(eng_prefixes)
def filterPairs(pairs):
    """Return a list of the pairs accepted by ``filterPair``, in order."""
    return list(filter(filterPair, pairs))
def prepareData(lang1, lang2, reverse=False):
    """Read, filter and index the sentence pairs for a language pair.

    Loads the pairs with ``readLangs``, keeps those accepted by
    ``filterPairs``, feeds each side into its vocabulary, and prints progress
    along the way.

    Returns:
        A tuple ``(input_lang, output_lang, pairs)`` with the populated
        vocabularies and the filtered pairs.
    """
    input_lang, output_lang, loaded = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(loaded))

    pairs = filterPairs(loaded)
    print("Trimmed to %s sentence pairs" % len(pairs))

    print("Counting words...")
    for entry in pairs:
        input_lang.addSentence(entry[0])
        output_lang.addSentence(entry[1])

    print("Counted words:")
    for vocabulary in (input_lang, output_lang):
        print(vocabulary.name, vocabulary.n_words)
    return input_lang, output_lang, pairs
The difference between what I'm trying to do and the guide is that I'm trying to insert my input languages as list of strings instead of reading them from a file:
pairs=['string one goes like this', 'string two goes like this']
input_lang = Lang(pairs[0][0])
output_lang = Lang(pairs[1][1])
But it seems that when I try to count the number of words in my string with input_lang.n_words, I always get 2.
Is there something I'm missing in calling the class Lang?
Update:
I ran
language = Lang('english')
for sentence in pairs: language.addSentence(sentence)
print (language.n_words)
and that gave me the number of words in pairs
Though, that doesn't give me input_lang and output_lang like the guide did:
for pair in pairs:
input_lang.addSentence(pair[0])
output_lang.addSentence(pair[1])
So first of all you are initialising the Lang object with calls to pairs[0][0] and pairs[1][1] which is the same as Lang('s') and Lang('t')
The Lang object is supposed to store information about a language, so I would expect that you only need to initialise it once with Lang('english') and then add the sentences from your dataset to it with the Lang.addSentence function.
Right now you aren't loading your dataset into the Lang object at all so when you want to know language.n_words it is just the initial value it gets when the object is created self.n_words = 2 # Count SOS and EOS
None of what you are doing in your question makes any sense, but I think what you want is the following:
language = Lang('english')
for sentence in pairs: language.addSentence(sentence)
print (language.n_words)

Replace until all occurrences are removed

I have the following strings:
",||||||||||||||"
",|||||a|||||,|"
I would like to achieve that all occurrences of ",|" are replaced with ",,"
The output should be the following:
",,,,,,,,,,,,,,,"
",,,,,,a|||||,,"
When I run .gsub(',|', ',,') on the strings, I do not get the desired output:
",,|||||||||||||"
",,||||a|||||,,"
That's because it does not run gsub several times.
Is there a similar method that runs recursively.
Regular expression matches cannot overlap. Since the matches are what gets replaced, you can't do it that way. Here are two workarounds:
str = ",|||||a|||||,|"
while str.gsub!(/,\|/, ',,'); end
str = ",|||||a|||||,|"
str.gsub!(/,(\|+)/) { "," * ($1.length + 1) }
smoke_weed_every_day = lambda do |piper|
commatosed = piper.gsub(',|', ',,')
commatosed == piper ? piper : smoke_weed_every_day.(commatosed)
end
smoke_weed_every_day.(",||||||||||||||") # => ",,,,,,,,,,,,,,,"
smoke_weed_every_day.(",|||||a|||||,|") # => ",,,,,,a|||||,,"
From an old library of mine. This method iterates until the block output is equal to its input :
# Feeds a value through the given block over and over until the block
# returns something equal (==) to what it was handed, then returns it.
# If the starting value is nil the block is never called.
def loop_until_convergence(x)
  previous = nil
  until previous == x
    previous = x
    x = yield(previous)
  end
  x
end
puts loop_until_convergence(',||||||||||||||') { |s| s.gsub(',|', ',,') }
# ",,,,,,,,,,,,,,,"
puts loop_until_convergence(',|||||a|||||,|') { |s| s.gsub(',|', ',,') }
# ",,,,,,a|||||,,"
As a bonus, you can calculate a square root in very few iterations :
# Approximates the square root of n by iterating the averaging step
# x -> 0.5 * (x + n / x) from 1 until the value stops changing.
def root(n)
  loop_until_convergence(1) do |estimate|
    0.5 * (estimate + n / estimate)
  end
end
p root(2)
# 1.414213562373095
p root(3)
# 1.7320508075688772
As with @Amandan's second solution, there is no need to iterate until no further changes are made.
COMMA = ','
PIPE = '|'

# Returns a copy of str in which every pipe belonging to a run of pipes
# and commas that starts with a comma is turned into a comma. Any other
# character matched by /./ ends the run and is kept unchanged.
def replace_pipes_after_comma(str)
  in_run = false
  str.gsub(/./) do |char|
    if char == COMMA
      in_run = true
      COMMA
    elsif char == PIPE
      in_run ? COMMA : PIPE
    else
      in_run = false
      char
    end
  end
end
replace_pipes_after_comma ",||||||||||||||"
#=> ",,,,,,,,,,,,,,,"
replace_pipes_after_comma ",|||||a|||||,|"
#=> ",,,,,,a|||||,,"

How to separate brackets in ruby?

I've been using the following code for the problem. I'm making a program to change the IUPAC name into structure, so i want to analyse the string entered by the user.In IUPAC name there are brackets as well. I want to extract the compound name as per the brackets. The way I have shown in the end.
I want to modify the way such that the output comes out to be like this and to be stored in an array :
As ["(4'-cyanobiphenyl-4-yl)","5-[(4'-cyanobiphenyl-4-yl)oxy]",
"({5-[(4'-cyanobiphenyl-4-yl)oxy]pentyl}" .... and so on ]
And the code for splitting which i wrote is:
Reg_bracket=/([^(){}\[\]]*)([(){}\[\]])/
attr_reader :obrk, :cbrk
def count_level_br
#xbrk=0
#cbrk=0
if #temp1
#obrk+=1 if #temp1[1]=="(" || #temp1[1]=="[" ||#temp1[1]=="{"
#obrk-=1 if #temp1[1]==")" || #temp1[1]=="]" ||#temp1[1]=="}"
end
puts #obrk.to_s
end
def split_at_bracket(str=nil) #to split the brackets according to Regex
if str a=str
else a=self
end
a=~Reg_bracket
if $& #temp1=[$1,$2,$']
end
#temp1||=[a,"",""]
end
def find_block
#obrk=0 , r=""
#temp1||=["",""]
split_at_bracket
r<<#temp1[0]<<#temp1[1]
count_level_br
while #obrk!=0
split_at_bracket(#temp1[2])
r<<#temp1[0]<<#temp1[1]
count_level_br
puts r.to_s
if #obrk==0
puts "Level 0 has reached"
#puts "Close brackets are #{#cbrk}"
return r
end
end #end
end
end #class end'
I've used the regex to match the brackets. When it finds a bracket, it returns the text before the match, the bracket itself, and the text after the match, and it keeps doing this until it reaches the end.
The output which I m getting right now is this.
1
2
1-[(
3
1-[({
4
1-[({5-[
5
1-[({5-[(
4
1-[({5-[(4'-cyanobiphenyl-4-yl)
3
1-[({5-[(4'-cyanobiphenyl-4-yl)oxy]
2
1-[({5-[(4'-cyanobiphenyl-4-yl)oxy]pentyl}
1
1-[({5-[(4'-cyanobiphenyl-4-yl)oxy]pentyl}oxy)
0
1-[({5-[(4'-cyanobiphenyl-4-yl)oxy]pentyl}oxy)carbonyl]
Level 0 has reached
testing ends'
I have written a simple program to match the string using three different regular expressions. The first one will help separate out the parenthesis, the second will separate out the square brackets and the third will give the curly braces. Here is the following code. I hope you will be able to use it in your program effectively.
# Reads one line from standard input and collects bracketed substrings of
# it into the array `a`, one pass per bracket type, then prints them.
reg1 = /(\([a-z0-9\'\-\[\]\{\}]+.+\))/ # for parenthesis
reg2 = /(\[[a-z0-9\'\-\(\)\{\}]+.+\])/ # for square brackets
reg3 = /(\{[a-z0-9\'\-\(\)\[\]]+.+\})/ # for curly braces
a = Array.new
s = gets.chomp

# Pass 1: parentheses.
x = reg1.match(s)
a << x.to_s
# Drop the last and the first character of the match before searching again.
str = x.to_s.chop.reverse.chop.reverse
while x != nil do
  x = reg1.match(str)
  # When nothing matches, x is nil and an empty string is appended before
  # the loop condition ends the pass.
  a << x.to_s
  # Drop only the last character so the next search yields a shorter match.
  str = x.to_s.chop
end

# Pass 2: square brackets (same procedure).
x = reg2.match(s)
a << x.to_s
str = x.to_s.chop.reverse.chop.reverse
while x != nil do
  x = reg2.match(str)
  a << x.to_s
  str = x.to_s.chop
end

# Pass 3: curly braces (same procedure).
x = reg3.match(s)
a << x.to_s
str = x.to_s.chop.reverse.chop.reverse
while x != nil do
  x = reg3.match(str)
  a << x.to_s
  str = x.to_s.chop
end
puts a
The output is as follows:
ruby reg_yo.rb
4,4'{-1-[({5-[(4'-cyanobiphenyl-4-yl)oxy]pentyl}oxy)carbonyl]-2-[(4'-cyanobiphe‌​nyl-4-yl)oxy]ethylene}dihexanoic acid # input string
({5-[(4'-cyanobiphenyl-4-yl)oxy]pentyl}oxy)carbonyl]-2-[(4'-cyanobiphe‌​nyl-4-yl)
(4'-cyanobiphenyl-4-yl)oxy]pentyl}oxy)
(4'-cyanobiphenyl-4-yl)
[({5-[(4'-cyanobiphenyl-4-yl)oxy]pentyl}oxy)carbonyl]-2-[(4'-cyanobiphe‌​nyl-4-yl)oxy]
[(4'-cyanobiphenyl-4-yl)oxy]pentyl}oxy)carbonyl]
[(4'-cyanobiphenyl-4-yl)oxy]
{-1-[({5-[(4'-cyanobiphenyl-4-yl)oxy]pentyl}oxy)carbonyl]-2-[(4'-cyanobiphe‌​nyl-4-yl)oxy]ethylene}
{5-[(4'-cyanobiphenyl-4-yl)oxy]pentyl}
Update : I have modified the code so as to search for recursive patterns.

Use Scala parser combinator to parse CSV files

I'm trying to write a CSV parser using Scala parser combinators. The grammar is based on RFC4180. I came up with the following code. It almost works, but I cannot get it to correctly separate different records. What did I miss?
// First attempt at an RFC 4180 CSV parser built on RegexParsers.
// NOTE(review): skipWhitespace is not overridden here, so the default
// whitespace skipping of RegexParsers applies; per the discussion around
// this listing that default also consumes "\r" and "\n", which keeps CRLF
// from ever separating records.
object CSV extends RegexParsers {
  def COMMA = ","
  def DQUOTE = "\""
  // Two consecutive double quotes stand for one literal double quote.
  def DQUOTE2 = "\"\"" ^^ { case _ => "\"" }
  def CR = "\r"
  def LF = "\n"
  def CRLF = "\r\n"
  // A single character that is not a quote, comma, CR or LF.
  def TXT = "[^\",\r\n]".r
  // One record, then zero or more CRLF-prefixed records, then an optional
  // trailing CRLF.
  def file: Parser[List[List[String]]] = ((record~((CRLF~>record)*))<~(CRLF?)) ^^ {
    case r~rs => r::rs
  }
  // One field followed by zero or more comma-prefixed fields.
  def record: Parser[List[String]] = (field~((COMMA~>field)*)) ^^ {
    case f~fs => f::fs
  }
  def field: Parser[String] = escaped|nonescaped
  // Quoted field: the surrounding quotes are dropped, the pieces are joined.
  def escaped: Parser[String] = (DQUOTE~>((TXT|COMMA|CR|LF|DQUOTE2)*)<~DQUOTE) ^^ { case ls => ls.mkString("")}
  // Unquoted field: zero or more TXT characters, joined (may be empty).
  def nonescaped: Parser[String] = (TXT*) ^^ { case ls => ls.mkString("") }
  // Parses the whole input; anything other than Success yields an empty list.
  def parse(s: String) = parseAll(file, s) match {
    case Success(res, _) => res
    case _ => List[List[String]]()
  }
}
println(CSV.parse(""" "foo", "bar", 123""" + "\r\n" +
"hello, world, 456" + "\r\n" +
""" spam, 789, egg"""))
// Output: List(List(foo, bar, 123hello, world, 456spam, 789, egg))
// Expected: List(List(foo, bar, 123), List(hello, world, 456), List(spam, 789, egg))
Update: problem solved
By default, RegexParsers skips whitespace (spaces, tabs, carriage returns, and line feeds) using the regular expression [\s]+. This is why the parser above cannot separate records: the line breaks are consumed before CRLF can match. We need to disable skipWhitespace mode. Replacing the whiteSpace definition with just [ \t] does not solve the problem, because it would skip all spaces within fields (thus "foo bar" in the CSV becomes "foobar"), which is undesired. The updated source of the parser is thus:
import scala.util.parsing.combinator._
// A CSV parser based on RFC4180
// https://www.rfc-editor.org/rfc/rfc4180
// CSV parser with automatic whitespace skipping turned off, so spaces and
// line breaks reach the grammar rules below.
object CSV extends RegexParsers {
  override val skipWhitespace = false // meaningful spaces in CSV
  def COMMA = ","
  def DQUOTE = "\""
  def DQUOTE2 = "\"\"" ^^ { case _ => "\"" } // combine 2 dquotes into 1
  // A record separator is either "\r\n" or a bare "\n".
  def CRLF = "\r\n" | "\n"
  // A single character that is not a quote, comma, CR or LF.
  def TXT = "[^\",\r\n]".r
  // One or more spaces or tabs.
  def SPACES = "[ \t]+".r
  // Records separated by CRLF, with an optional trailing CRLF.
  def file: Parser[List[List[String]]] = repsep(record, CRLF) <~ (CRLF?)
  // Fields separated by commas.
  def record: Parser[List[String]] = repsep(field, COMMA)
  def field: Parser[String] = escaped|nonescaped
  // Quoted field: optional spaces/tabs around the quotes are discarded, the
  // quotes are dropped, and the inner pieces (which may include commas,
  // line breaks and doubled quotes) are joined.
  def escaped: Parser[String] = {
    ((SPACES?)~>DQUOTE~>((TXT|COMMA|CRLF|DQUOTE2)*)<~DQUOTE<~(SPACES?)) ^^ {
      case ls => ls.mkString("")
    }
  }
  // Unquoted field: zero or more TXT characters, joined (may be empty).
  def nonescaped: Parser[String] = (TXT*) ^^ { case ls => ls.mkString("") }
  // Parses the whole input; any result other than Success is rethrown as an
  // Exception carrying that result's string form.
  def parse(s: String) = parseAll(file, s) match {
    case Success(res, _) => res
    case e => throw new Exception(e.toString)
  }
}
What you missed is whitespace. I threw in a couple bonus improvements.
import scala.util.parsing.combinator._
// CSV parser that narrows the skipped whitespace to spaces and tabs, so
// "\r" and "\n" stay visible to the CRLF rule.
object CSV extends RegexParsers {
  // Only a space or a tab counts as skippable whitespace.
  override protected val whiteSpace = """[ \t]""".r
  def COMMA = ","
  def DQUOTE = "\""
  // Two consecutive double quotes stand for one literal double quote.
  def DQUOTE2 = "\"\"" ^^ { case _ => "\"" }
  def CR = "\r"
  def LF = "\n"
  def CRLF = "\r\n"
  // A single character that is not a quote, comma, CR or LF.
  def TXT = "[^\",\r\n]".r
  // Records separated by CRLF, with an optional trailing CRLF.
  def file: Parser[List[List[String]]] = repsep(record, CRLF) <~ opt(CRLF)
  // At least one field, separated by commas.
  def record: Parser[List[String]] = rep1sep(field, COMMA)
  def field: Parser[String] = (escaped|nonescaped)
  // Quoted field: the surrounding quotes are dropped, the pieces are joined.
  def escaped: Parser[String] = (DQUOTE~>((TXT|COMMA|CR|LF|DQUOTE2)*)<~DQUOTE) ^^ { case ls => ls.mkString("")}
  // Unquoted field: zero or more TXT characters, joined (may be empty).
  def nonescaped: Parser[String] = (TXT*) ^^ { case ls => ls.mkString("") }
  // Parses the whole input; anything other than Success yields an empty list.
  def parse(s: String) = parseAll(file, s) match {
    case Success(res, _) => res
    case _ => List[List[String]]()
  }
}
Since the Scala Parser Combinators library was moved out of the Scala standard library as of 2.11, there is no good reason not to use the much more performant Parboiled2 library.
Here is a version of the CSV parser in Parboiled2's DSL:
/* based on comments in https://github.com/sirthias/parboiled2/issues/61 */
import org.parboiled2._
// CSV grammar written with the Parboiled2 rule DSL; the field delimiter is
// supplied by the caller.
case class Parboiled2CsvParser(input: ParserInput, delimeter: String) extends Parser {
  def DQUOTE = '"'
  // The delimiter, captured so it can appear inside a quoted field.
  def DELIMITER_TOKEN = rule(capture(delimeter))
  // Two consecutive double quotes push a single double quote.
  def DQUOTE2 = rule("\"\"" ~ push("\""))
  // Line break that is captured (used inside quoted fields).
  def CRLF = rule(capture("\r\n" | "\n"))
  // Line break that is matched without being captured (row separator).
  def NON_CAPTURING_CRLF = rule("\r\n" | "\n")
  // Characters that end plain text: the delimiter, CR, LF and the quote.
  val delims = s"$delimeter\r\n" + DQUOTE
  // Any single character that is not one of `delims`.
  def TXT = rule(capture(!anyOf(delims) ~ ANY))
  val WHITESPACE = CharPredicate(" \t")
  def SPACES: Rule0 = rule(oneOrMore(WHITESPACE))
  // Quoted field, optionally surrounded by spaces/tabs; captured pieces are
  // joined into one string.
  def escaped = rule(optional(SPACES) ~
    DQUOTE ~ (zeroOrMore(DELIMITER_TOKEN | TXT | CRLF | DQUOTE2) ~ DQUOTE ~
    optional(SPACES)) ~> (_.mkString("")))
  // Unquoted field: text characters and stray quotes, joined into one string.
  def nonEscaped = rule(zeroOrMore(TXT | capture(DQUOTE)) ~> (_.mkString("")))
  def field = rule(escaped | nonEscaped)
  // One or more fields separated by the delimiter.
  def row: Rule1[Seq[String]] = rule(oneOrMore(field).separatedBy(delimeter))
  // Zero or more rows separated by line breaks.
  def file = rule(zeroOrMore(row).separatedBy(NON_CAPTURING_CRLF))
  // NOTE(review): `Try` is not imported in this listing; presumably
  // scala.util.Try must be in scope. TODO confirm.
  def parsed() : Try[Seq[Seq[String]]] = file.run()
}
The default whitespace for RegexParsers parsers is \s+, which includes new lines. So CR, LF and CRLF never get a chance to be processed, as it is automatically skipped by the parser.

Resources